author    Allan Sandfeld Jensen <allan.jensen@theqtcompany.com>  2015-06-18 14:10:49 +0200
committer Oswald Buddenhagen <oswald.buddenhagen@theqtcompany.com>  2015-06-18 13:53:24 +0000
commit    813fbf95af77a531c57a8c497345ad2c61d475b3 (patch)
tree      821b2c8de8365f21b6c9ba17a236fb3006a1d506 /chromium/third_party/libvpx/source/libvpx
parent    af6588f8d723931a298c995fa97259bb7f7deb55 (diff)
BASELINE: Update chromium to 44.0.2403.47
Change-Id: Ie056fedba95cf5e5c76b30c4b2c80fca4764aa2f
Reviewed-by: Oswald Buddenhagen <oswald.buddenhagen@theqtcompany.com>
Diffstat (limited to 'chromium/third_party/libvpx/source/libvpx')
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/.mailmap | 20
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/AUTHORS | 29
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/CHANGELOG | 23
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/PATENTS | 2
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/README | 15
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/build/arm-msvs/obj_int_extract.bat | 18
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/build/make/Android.mk | 69
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/build/make/Makefile | 30
-rw-r--r-- [-rwxr-xr-x] chromium/third_party/libvpx/source/libvpx/build/make/configure.sh | 1918
-rwxr-xr-x chromium/third_party/libvpx/source/libvpx/build/make/gen_msvs_proj.sh | 42
-rwxr-xr-x chromium/third_party/libvpx/source/libvpx/build/make/gen_msvs_vcxproj.sh | 33
-rwxr-xr-x chromium/third_party/libvpx/source/libvpx/build/make/iosbuild.sh | 34
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/build/make/obj_int_extract.c | 857
-rwxr-xr-x chromium/third_party/libvpx/source/libvpx/build/make/rtcd.pl | 13
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/build/x86-msvs/obj_int_extract.bat | 15
-rwxr-xr-x chromium/third_party/libvpx/source/libvpx/configure | 49
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/examples.mk | 1
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/examples/decode_to_md5.c | 6
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/examples/decode_with_drops.c | 4
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/examples/postproc.c | 4
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/examples/resize_util.c | 25
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/examples/set_maps.c | 4
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/examples/simple_decoder.c | 4
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/examples/simple_encoder.c | 4
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/examples/twopass_encoder.c | 4
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/examples/vp8_multi_resolution_encoder.c | 954
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/examples/vp8cx_set_ref.c | 4
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/examples/vp9_lossless_encoder.c | 4
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/examples/vp9_spatial_svc_encoder.c | 21
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/examples/vpx_temporal_svc_encoder.c | 91
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/libs.doxy_template | 14
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/libs.mk | 75
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/mainpage.dox | 22
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/solution.mk | 3
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/third_party/libyuv/README.libvpx | 4
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/third_party/libyuv/include/libyuv/compare.h | 5
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/third_party/libyuv/include/libyuv/convert.h | 11
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/third_party/libyuv/include/libyuv/convert_argb.h | 26
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/third_party/libyuv/include/libyuv/convert_from.h | 3
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/third_party/libyuv/include/libyuv/convert_from_argb.h | 15
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/third_party/libyuv/include/libyuv/format_conversion.h | 168
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/third_party/libyuv/include/libyuv/row.h | 598
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/third_party/libyuv/include/libyuv/scale.h | 1
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/third_party/libyuv/include/libyuv/scale_row.h | 23
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/third_party/libyuv/include/libyuv/version.h | 2
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/third_party/libyuv/include/libyuv/video_common.h | 4
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/compare.cc | 52
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/compare_neon.cc | 44
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/compare_neon64.cc | 63
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/compare_posix.cc | 16
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/compare_win.cc | 16
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/convert.cc | 429
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/convert_argb.cc | 351
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/convert_from.cc | 209
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/convert_from_argb.cc | 405
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/convert_to_argb.cc | 59
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/convert_to_i420.cc | 52
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/cpu_id.cc | 24
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/format_conversion.cc | 554
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/mjpeg_validate.cc | 76
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/planar_functions.cc | 437
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/rotate.cc | 180
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/rotate_argb.cc | 64
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/rotate_neon.cc | 5
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/rotate_neon64.cc | 543
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/row_any.cc | 537
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/row_common.cc | 559
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/row_mips.cc | 91
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/row_neon.cc | 441
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/row_neon64.cc | 2645
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/row_posix.cc | 3950
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/row_win.cc | 3519
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/scale.cc | 191
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/scale_argb.cc | 118
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/scale_common.cc | 48
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/scale_neon.cc | 5
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/scale_neon64.cc | 764
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/scale_posix.cc | 421
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/scale_win.cc | 288
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/video_common.cc | 2
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/third_party/x86inc/README.libvpx | 1
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/third_party/x86inc/x86inc.asm | 17
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/usage.dox | 19
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/usage_cx.dox | 2
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/common/alloccommon.c | 8
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/common/arm/armv6/dequant_idct_v6.asm | 2
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/common/arm/filter_arm.c | 6
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/sad_neon.c | 184
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/vp8_subpixelvariance_neon.c | 35
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/common/blockd.h | 4
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/common/common.h | 8
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/common/copy_c.c | 32
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/common/debugmodes.c | 2
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/common/dequantize.c | 2
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/common/entropy.c | 3
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/common/entropymode.c | 8
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/common/extend.c | 10
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/common/idct_blk.c | 6
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/common/loopfilter.c | 14
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/common/mfqe.c | 39
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/common/mips/dspr2/dequantize_dspr2.c | 2
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/common/onyx.h | 1
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/common/postproc.c | 9
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/common/ppc/copy_altivec.asm | 47
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/common/ppc/filter_altivec.asm | 1013
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/common/ppc/filter_bilinear_altivec.asm | 677
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/common/ppc/idctllm_altivec.asm | 189
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/common/ppc/loopfilter_altivec.c | 135
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/common/ppc/loopfilter_filters_altivec.asm | 1253
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/common/ppc/platform_altivec.asm | 59
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/common/ppc/recon_altivec.asm | 175
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/common/ppc/sad_altivec.asm | 277
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/common/ppc/systemdependent.c | 165
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/common/ppc/variance_altivec.asm | 375
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/common/ppc/variance_subpixel_altivec.asm | 865
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/common/reconinter.c | 57
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/common/reconintra.c | 22
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/common/rtcd.c | 6
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/common/rtcd_defs.pl | 111
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/common/sad_c.c | 302
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/common/setupintrarecon.c | 12
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/common/variance.h | 37
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/common/x86/copy_sse2.asm | 93
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/common/x86/copy_sse3.asm | 146
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/common/x86/idct_blk_mmx.c | 16
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/common/x86/sad_sse2.asm | 410
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/common/x86/sad_sse3.asm | 960
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/common/x86/sad_sse4.asm | 353
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/common/x86/vp8_asm_stubs.c | 22
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/decoder/decodeframe.c | 65
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/decoder/decodemv.c | 2
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/decoder/detokenize.c | 4
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/decoder/error_concealment.c | 2
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/decoder/onyxd_if.c | 6
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/decoder/threading.c | 54
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/encoder/arm/armv5te/boolhuff_armv5te.asm | 310
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/encoder/arm/armv5te/vp8_packtokens_armv5.asm | 317
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/encoder/arm/armv5te/vp8_packtokens_mbrow_armv5.asm | 352
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/encoder/arm/armv5te/vp8_packtokens_partitions_armv5.asm | 471
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/encoder/arm/armv6/vp8_fast_quantize_b_armv6.asm | 225
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/encoder/arm/armv6/vp8_subtract_armv6.asm | 272
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/encoder/arm/boolhuff_arm.c | 41
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/encoder/arm/neon/fastquantizeb_neon.asm | 258
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/encoder/arm/neon/fastquantizeb_neon.c | 89
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/encoder/arm/quantize_arm.c | 64
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/encoder/bitstream.c | 16
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/encoder/bitstream.h | 31
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/encoder/block.h | 5
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/encoder/denoising.c | 45
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/encoder/denoising.h | 4
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/encoder/encodeframe.c | 26
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/encoder/encodemb.c | 12
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/encoder/ethreading.c | 36
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/encoder/firstpass.c | 34
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/encoder/mcomp.c | 60
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/encoder/onyx_if.c | 483
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/encoder/onyx_int.h | 12
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/encoder/pickinter.c | 343
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/encoder/picklpf.c | 9
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/encoder/ppc/csystemdependent.c | 160
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/encoder/ppc/encodemb_altivec.asm | 153
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/encoder/ppc/fdct_altivec.asm | 205
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/encoder/ppc/rdopt_altivec.asm | 51
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/encoder/quantize.c | 45
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/encoder/quantize.h | 3
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/encoder/ratectrl.c | 16
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/encoder/rdopt.c | 80
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/encoder/segmentation.c | 2
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/encoder/temporal_filter.c | 14
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/encoder/tokenize.c | 10
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/encoder/vp8_asm_enc_offsets.c | 93
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/encoder/x86/denoising_sse2.c | 8
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/encoder/x86/quantize_sse2.c | 8
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/vp8_common.mk | 11
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/vp8_cx_iface.c | 131
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/vp8_dx_iface.c | 33
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/vp8cx.mk | 4
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp8/vp8cx_arm.mk | 15
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_convolve8_avg_neon.c | 390
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_convolve8_avg_neon_asm.asm (renamed from chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_convolve8_avg_neon.asm) | 16
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_convolve8_neon.c | 357
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_convolve8_neon_asm.asm (renamed from chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_convolve8_neon.asm) | 16
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_convolve_avg_neon.c | 145
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_convolve_avg_neon_asm.asm (renamed from chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_avg_neon.asm) | 0
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_convolve_neon.c | 4
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_copy_neon.c | 92
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_copy_neon_asm.asm (renamed from chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_copy_neon.asm) | 0
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_dc_only_idct_add_neon.asm | 69
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct16x16_1_add_neon.c | 59
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct16x16_1_add_neon_asm.asm (renamed from chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct16x16_1_add_neon.asm) | 0
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct16x16_add_neon.c | 1332
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct16x16_add_neon_asm.asm (renamed from chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct16x16_add_neon.asm) | 0
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct16x16_neon.c | 14
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct32x32_1_add_neon.c | 163
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct32x32_1_add_neon_asm.asm (renamed from chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct32x32_1_add_neon.asm) | 0
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct32x32_add_neon.c | 750
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct32x32_add_neon_asm.asm (renamed from chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct32x32_add_neon.asm) | 0
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct4x4_1_add_neon.c | 48
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct4x4_1_add_neon_asm.asm (renamed from chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct4x4_1_add_neon.asm) | 0
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct4x4_add_neon.c | 151
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct4x4_add_neon_asm.asm (renamed from chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct4x4_add_neon.asm) | 0
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct8x8_1_add_neon.c | 62
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct8x8_1_add_neon_asm.asm (renamed from chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct8x8_1_add_neon.asm) | 0
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct8x8_add_neon.c | 547
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct8x8_add_neon_asm.asm (renamed from chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct8x8_add_neon.asm) | 0
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.asm | 237
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.c | 248
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_iht8x8_add_neon.asm | 698
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_iht8x8_add_neon.c | 624
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_loopfilter_16_neon.c | 188
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_loopfilter_16_neon_asm.asm (renamed from chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_loopfilter_16_neon.asm) | 0
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_loopfilter_4_neon.c | 274
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_loopfilter_4_neon_asm.asm | 277
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_loopfilter_8_neon.c | 453
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_loopfilter_8_neon_asm.asm (renamed from chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_loopfilter_neon.asm) | 257
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_loopfilter_neon.c | 58
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_reconintra_neon.c | 473
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_reconintra_neon_asm.asm (renamed from chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_reconintra_neon.asm) | 0
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_convolve8_avg_dspr2.c | 2
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_convolve8_dspr2.c | 2
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_convolve8_horiz_msa.c | 1045
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_convolve8_msa.c | 880
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_convolve8_vert_msa.c | 856
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_convolve_avg_msa.c | 335
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_convolve_copy_msa.c | 300
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_convolve_msa.h | 157
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_idct16x16_msa.c | 948
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_idct32x32_msa.c | 1077
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_macros_msa.h | 867
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_alloccommon.c | 182
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_alloccommon.h | 10
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_blockd.c | 48
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_blockd.h | 33
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_common.h | 8
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_convolve.c | 8
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_debugmodes.c | 42
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_entropy.c | 28
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_entropy.h | 12
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_entropymode.c | 89
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_entropymode.h | 4
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_entropymv.c | 41
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_enums.h | 11
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_filter.h | 11
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_frame_buffers.c | 2
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_idct.c | 570
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_idct.h | 72
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_loopfilter.c | 627
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_loopfilter.h | 33
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_mfqe.c | 394
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_mfqe.h | 31
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_mvref_common.c | 118
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_mvref_common.h | 25
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_onyxc_int.h | 175
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_postproc.c | 141
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_postproc.h | 8
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_ppflags.h | 3
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_pred_common.c | 64
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_pred_common.h | 28
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_prob.c | 18
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_prob.h | 25
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_reconinter.c | 379
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_reconinter.h | 37
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_reconintra.c | 210
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_rtcd.c | 5
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_rtcd_defs.pl | 870
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_systemdependent.h | 2
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_thread.h | 10
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_thread_common.c | 436
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_thread_common.h | 57
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_tile_common.c | 34
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_asm_stubs.c | 8
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_high_intrapred_sse2.asm | 2
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_high_loopfilter_intrin_sse2.c | 255
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c | 1820
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.h | 33
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_idct_intrin_ssse3.c | 762
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_intrapred_sse2.asm | 310
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_loopfilter_intrin_avx2.c | 111
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_loopfilter_intrin_sse2.c | 34
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_loopfilter_mmx.asm | 15
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_mfqe_sse2.asm | 287
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c | 108
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_subpixel_8t_ssse3.asm | 52
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_decodeframe.c | 789
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_decodeframe.h | 3
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_decodemv.c | 219
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_decodemv.h | 4
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_decoder.c | 234
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_decoder.h | 45
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_detokenize.c | 45
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_detokenize.h | 6
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_dthread.c | 364
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_dthread.h | 74
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_read_bit_buffer.c | 16
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_reader.h | 9
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/arm/neon/vp9_avg_neon.c | 49
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/arm/neon/vp9_dct_neon.c | 18
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c | 7
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/arm/neon/vp9_variance_neon.c | 155
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_complexity.c | 117
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_complexity.h | 9
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c | 505
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.h | 59
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_variance.c | 92
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_variance.h | 5
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_avg.c | 177
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_bitstream.c | 255
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_bitstream.h | 4
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_block.h | 13
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_blockiness.c | 138
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_context_tree.c | 42
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_context_tree.h | 11
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_dct.c | 360
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_dct.h | 61
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_denoiser.c | 154
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_denoiser.h | 1
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodeframe.c | 2491
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodeframe.h | 9
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodemb.c | 179
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodemb.h | 10
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodemv.c | 15
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodemv.h | 5
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encoder.c | 2130
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encoder.h | 196
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ethread.c | 179
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ethread.h | 25
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_extend.c | 20
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_fastssim.c | 465
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_firstpass.c | 606
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_firstpass.h | 14
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_lookahead.c | 56
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_lookahead.h | 6
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_mbgraph.c | 40
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_mcomp.c | 444
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_mcomp.h | 13
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_picklpf.c | 30
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_pickmode.c | 1628
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_pickmode.h | 15
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_psnrhvs.c | 223
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_quantize.c | 184
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_quantize.h | 8
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ratectrl.c | 347
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ratectrl.h | 54
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_rd.c | 142
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_rd.h | 30
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_rdopt.c | 1086
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_rdopt.h | 17
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_resize.c | 4
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_segmentation.c | 38
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_skin_detection.c | 104
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_skin_detection.h | 35
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_speed_features.c | 314
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_speed_features.h | 54
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ssim.c | 247
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ssim.h | 52
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_subexp.c | 57
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_subexp.h | 3
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_svc_layercontext.c | 35
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_temporal_filter.c | 48
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_tokenize.c | 627
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_tokenize.h | 76
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_variance.c | 20
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_avg_intrin_sse2.c | 381
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct32x32_avx2.c | 13
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct32x32_sse2.c | 912
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct_impl_sse2.c | 1022
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct_mmx.asm | 31
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct_sse2.c | 1375
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct_sse2.h | 464
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct_ssse3.c | 469
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct_ssse3_x86_64.asm | 73
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_denoiser_sse2.c | 331
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_error_sse2.asm | 46
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_highbd_block_error_intrin_sse2.c | 71
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_highbd_quantize_intrin_sse2.c | 180
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_highbd_subpel_variance.asm | 1055
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_highbd_variance_impl_sse2.asm | 313
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_highbd_variance_sse2.c | 580
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c | 201
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm | 93
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_sad_ssse3.asm | 370
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_subpel_variance.asm | 4
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_variance_sse2.c | 28
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/vp9_common.mk | 90
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/vp9_cx_iface.c | 246
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/vp9_dx_iface.c | 678
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/vp9_iface_common.h | 2
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/vp9cx.mk | 35
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vp9/vp9dx.mk | 4
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx/exports_enc | 1
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx/internal/vpx_codec_internal.h | 10
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx/src/svc_encodeframe.c | 73
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx/svc_context.h | 39
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx/vp8cx.h | 413
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx/vp8dx.h | 40
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx/vpx_codec.h | 2
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx/vpx_encoder.h | 46
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx/vpx_frame_buffer.h | 5
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx/vpx_image.h | 15
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx/vpx_integer.h | 2
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/sad4d_neon.c | 226
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/sad_media.asm (renamed from chromium/third_party/libvpx/source/libvpx/vp8/common/arm/armv6/vp8_sad16x16_armv6.asm) | 5
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/sad_neon.c (renamed from chromium/third_party/libvpx/source/libvpx/vp9/encoder/arm/neon/vp9_sad_neon.c) | 112
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx_dsp/sad.c (renamed from chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_sad.c) | 105
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp.mk | 40
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp_rtcd.c | 17
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl | 395
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_sad4d_sse2.asm | 289
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_sad_sse2.asm | 365
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/sad4d_avx2.c (renamed from chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_sad4d_intrin_avx2.c) | 8
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/sad4d_sse2.asm (renamed from chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_sad4d_sse2.asm) | 6
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/sad_avx2.c (renamed from chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_sad_intrin_avx2.c) | 8
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/sad_mmx.asm (renamed from chromium/third_party/libvpx/source/libvpx/vp8/common/x86/sad_mmx.asm) | 30
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/sad_sse2.asm (renamed from chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_sad_sse2.asm) | 12
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/sad_sse3.asm (renamed from chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_sad_sse3.asm) | 34
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/sad_sse4.asm (renamed from chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_sad_sse4.asm) | 30
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/sad_ssse3.asm (renamed from chromium/third_party/libvpx/source/libvpx/vp8/common/x86/sad_ssse3.asm) | 180
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx_mem/include/vpx_mem_intrnl.h | 64
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx_mem/include/vpx_mem_tracker.h | 179
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx_mem/memory_manager/hmm_alloc.c | 58
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx_mem/memory_manager/hmm_base.c | 405
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx_mem/memory_manager/hmm_dflt_abort.c | 53
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx_mem/memory_manager/hmm_grow.c | 49
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx_mem/memory_manager/hmm_largest.c | 57
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx_mem/memory_manager/hmm_resize.c | 114
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx_mem/memory_manager/hmm_shrink.c | 103
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx_mem/memory_manager/hmm_true.c | 31
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx_mem/memory_manager/include/cavl_if.h | 228
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx_mem/memory_manager/include/cavl_impl.h | 1152
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx_mem/memory_manager/include/heapmm.h | 155
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx_mem/memory_manager/include/hmm_cnfg.h | 120
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx_mem/memory_manager/include/hmm_intrnl.h | 159
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx_mem/vpx_mem.c | 586
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx_mem/vpx_mem.h | 131
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx_mem/vpx_mem.mk | 18
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx_mem/vpx_mem_tracker.c | 740
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx_ports/arm_cpudetect.c | 29
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx_ports/asm_offsets.h | 31
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx_ports/mem.h | 11
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx_ports/vpx_once.h | 2
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx_ports/vpx_ports.mk | 1
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx_ports/x86.h | 55
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx_ports/x86_abi_support.asm | 4
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx_scale/generic/gen_scalers.c | 2
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx_scale/generic/vpx_scale.c | 18
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx_scale/generic/yv12config.c | 56
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx_scale/generic/yv12extend.c | 40
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx_scale/mips/dspr2/yv12extend_dspr2.c | 4
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx_scale/vpx_scale.mk | 8
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx_scale/vpx_scale_asm_offsets.c | 40
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx_scale/vpx_scale_rtcd.c | 4
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx_scale/win32/scaleopt.c | 525
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpx_scale/yv12config.h | 9
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpxdec.c | 54
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpxenc.c | 58
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/vpxstats.c | 3
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/webmdec.cc | 10
-rw-r--r-- chromium/third_party/libvpx/source/libvpx/webmdec.h | 2
458 files changed, 49796 insertions, 42047 deletions
diff --git a/chromium/third_party/libvpx/source/libvpx/.mailmap b/chromium/third_party/libvpx/source/libvpx/.mailmap
index fb82a24e345..0bfda120f98 100644
--- a/chromium/third_party/libvpx/source/libvpx/.mailmap
+++ b/chromium/third_party/libvpx/source/libvpx/.mailmap
@@ -1,18 +1,26 @@
Adrian Grange <agrange@google.com>
+Alex Converse <aconverse@google.com> <alex.converse@gmail.com>
Alexis Ballier <aballier@gentoo.org> <alexis.ballier@gmail.com>
+Alpha Lam <hclam@google.com> <hclam@chromium.org>
+Deb Mukherjee <debargha@google.com>
+Erik Niemeyer <erik.a.niemeyer@intel.com> <erik.a.niemeyer@gmail.com>
+Guillaume Martres <gmartres@google.com> <smarter3@gmail.com>
Hangyu Kuang <hkuang@google.com>
Jim Bankoski <jimbankoski@google.com>
-John Koleszar <jkoleszar@google.com>
Johann Koenig <johannkoenig@google.com>
Johann Koenig <johannkoenig@google.com> <johann.koenig@duck.com>
-Johann Koenig <johannkoenig@google.com> <johannkoenig@dhcp-172-19-7-52.mtv.corp.google.com>
+John Koleszar <jkoleszar@google.com>
+Joshua Litt <joshualitt@google.com> <joshualitt@chromium.org>
+Marco Paniconi <marpan@google.com>
+Marco Paniconi <marpan@google.com> <marpan@chromium.org>
Pascal Massimino <pascal.massimino@gmail.com>
+Paul Wilkins <paulwilkins@google.com>
+Ralph Giles <giles@xiph.org> <giles@entropywave.com>
+Ralph Giles <giles@xiph.org> <giles@mozilla.com>
Sami Pietilä <samipietila@google.com>
+Tamar Levy <tamar.levy@intel.com>
+Tamar Levy <tamar.levy@intel.com> <levytamar82@gmail.com>
Tero Rintaluoma <teror@google.com> <tero.rintaluoma@on2.com>
Timothy B. Terriberry <tterribe@xiph.org> Tim Terriberry <tterriberry@mozilla.com>
Tom Finegan <tomfinegan@google.com>
-Ralph Giles <giles@xiph.org> <giles@entropywave.com>
-Ralph Giles <giles@xiph.org> <giles@mozilla.com>
-Alpha Lam <hclam@google.com> <hclam@chromium.org>
-Deb Mukherjee <debargha@google.com>
Yaowu Xu <yaowu@google.com> <yaowu@xuyaowu.com>
diff --git a/chromium/third_party/libvpx/source/libvpx/AUTHORS b/chromium/third_party/libvpx/source/libvpx/AUTHORS
index a9aa4810634..2f63d7c5afb 100644
--- a/chromium/third_party/libvpx/source/libvpx/AUTHORS
+++ b/chromium/third_party/libvpx/source/libvpx/AUTHORS
@@ -3,10 +3,11 @@
Aaron Watry <awatry@gmail.com>
Abo Talib Mahfoodh <ab.mahfoodh@gmail.com>
+Adam Xu <adam@xuyaowu.com>
Adrian Grange <agrange@google.com>
Ahmad Sharif <asharif@google.com>
Alexander Voronov <avoronov@graphics.cs.msu.ru>
-Alex Converse <alex.converse@gmail.com>
+Alex Converse <aconverse@google.com>
Alexis Ballier <aballier@gentoo.org>
Alok Ahuja <waveletcoeff@gmail.com>
Alpha Lam <hclam@google.com>
@@ -14,44 +15,58 @@ A.Mahfoodh <ab.mahfoodh@gmail.com>
Ami Fischman <fischman@chromium.org>
Andoni Morales Alastruey <ylatuya@gmail.com>
Andres Mejia <mcitadel@gmail.com>
+Andrew Russell <anrussell@google.com>
Aron Rosenberg <arosenberg@logitech.com>
Attila Nagy <attilanagy@google.com>
changjun.yang <changjun.yang@intel.com>
+Charles 'Buck' Krasic <ckrasic@google.com>
chm <chm@rock-chips.com>
Christian Duvivier <cduvivier@google.com>
Daniel Kang <ddkang@google.com>
Deb Mukherjee <debargha@google.com>
+Dim Temp <dimtemp0@gmail.com>
Dmitry Kovalev <dkovalev@google.com>
Dragan Mrdjan <dmrdjan@mips.com>
-Erik Niemeyer <erik.a.niemeyer@gmail.com>
+Ehsan Akhgari <ehsan.akhgari@gmail.com>
+Erik Niemeyer <erik.a.niemeyer@intel.com>
Fabio Pedretti <fabio.ped@libero.it>
Frank Galligan <fgalligan@google.com>
Fredrik Söderquist <fs@opera.com>
Fritz Koenig <frkoenig@google.com>
Gaute Strokkenes <gaute.strokkenes@broadcom.com>
Giuseppe Scrivano <gscrivano@gnu.org>
+Gordana Cmiljanovic <gordana.cmiljanovic@imgtec.com>
Guillaume Martres <gmartres@google.com>
Guillermo Ballester Valor <gbvalor@gmail.com>
Hangyu Kuang <hkuang@google.com>
+Hanno Böck <hanno@hboeck.de>
Henrik Lundin <hlundin@google.com>
Hui Su <huisu@google.com>
Ivan Maltz <ivanmaltz@google.com>
+Jacek Caban <cjacek@gmail.com>
+JackyChen <jackychen@google.com>
James Berry <jamesberry@google.com>
+James Yu <james.yu@linaro.org>
James Zern <jzern@google.com>
+Jan Gerber <j@mailb.org>
Jan Kratochvil <jan.kratochvil@redhat.com>
Janne Salonen <jsalonen@google.com>
Jeff Faust <jfaust@google.com>
Jeff Muizelaar <jmuizelaar@mozilla.com>
Jeff Petkau <jpet@chromium.org>
+Jia Jia <jia.jia@linaro.org>
Jim Bankoski <jimbankoski@google.com>
Jingning Han <jingning@google.com>
+Joey Parrish <joeyparrish@google.com>
Johann Koenig <johannkoenig@google.com>
John Koleszar <jkoleszar@google.com>
+John Stark <jhnstrk@gmail.com>
Joshua Bleecher Snyder <josh@treelinelabs.com>
Joshua Litt <joshualitt@google.com>
Justin Clift <justin@salasaga.org>
Justin Lebar <justin.lebar@gmail.com>
KO Myung-Hun <komh@chollian.net>
+Lawrence Velázquez <larryv@macports.org>
Lou Quillio <louquillio@google.com>
Luca Barbato <lu_zero@gentoo.org>
Makoto Kato <makoto.kt@gmail.com>
@@ -65,6 +80,7 @@ Michael Kohler <michaelkohler@live.com>
Mike Frysinger <vapier@chromium.org>
Mike Hommey <mhommey@mozilla.com>
Mikhal Shemer <mikhal@google.com>
+Minghai Shang <minghai@google.com>
Morton Jonuschat <yabawock@gmail.com>
Parag Salasakar <img.mips1@gmail.com>
Pascal Massimino <pascal.massimino@gmail.com>
@@ -72,6 +88,8 @@ Patrik Westin <patrik.westin@gmail.com>
Paul Wilkins <paulwilkins@google.com>
Pavol Rusnak <stick@gk2.sk>
Paweł Hajdan <phajdan@google.com>
+Pengchong Jin <pengchong@google.com>
+Peter de Rivaz <peter.derivaz@gmail.com>
Philip Jägenstedt <philipj@opera.com>
Priit Laes <plaes@plaes.org>
Rafael Ávila de Espíndola <rafael.espindola@gmail.com>
@@ -79,22 +97,29 @@ Rafaël Carré <funman@videolan.org>
Ralph Giles <giles@xiph.org>
Rob Bradford <rob@linux.intel.com>
Ronald S. Bultje <rbultje@google.com>
+Rui Ueyama <ruiu@google.com>
Sami Pietilä <samipietila@google.com>
Scott Graham <scottmg@chromium.org>
Scott LaVarnway <slavarnway@google.com>
+Sean McGovern <gseanmcg@gmail.com>
+Sergey Ulanov <sergeyu@chromium.org>
Shimon Doodkin <helpmepro1@gmail.com>
Stefan Holmer <holmer@google.com>
Suman Sunkara <sunkaras@google.com>
Taekhyun Kim <takim@nvidia.com>
Takanori MATSUURA <t.matsuu@gmail.com>
Tamar Levy <tamar.levy@intel.com>
+Tao Bai <michaelbai@chromium.org>
Tero Rintaluoma <teror@google.com>
Thijs Vermeir <thijsvermeir@gmail.com>
+Tim Kopp <tkopp@google.com>
Timothy B. Terriberry <tterribe@xiph.org>
Tom Finegan <tomfinegan@google.com>
Vignesh Venkatasubramanian <vigneshv@google.com>
Yaowu Xu <yaowu@google.com>
+Yongzhe Wang <yongzhe@google.com>
Yunqing Wang <yunqingwang@google.com>
+Zoe Liu <zoeliu@google.com>
Google Inc.
The Mozilla Foundation
The Xiph.Org Foundation
diff --git a/chromium/third_party/libvpx/source/libvpx/CHANGELOG b/chromium/third_party/libvpx/source/libvpx/CHANGELOG
index 97c9a7bd325..a318784150a 100644
--- a/chromium/third_party/libvpx/source/libvpx/CHANGELOG
+++ b/chromium/third_party/libvpx/source/libvpx/CHANGELOG
@@ -1,3 +1,26 @@
+2015-04-03 v1.4.0 "Indian Runner Duck"
+ This release includes significant improvements to the VP9 codec.
+
+ - Upgrading:
+ This release is ABI incompatible with 1.3.0. It drops the compatibility
+ layer, requiring VPX_IMG_FMT_* instead of IMG_FMT_*, and adds several codec
+ controls for VP9.
+
+ - Enhancements:
+ Faster VP9 encoding and decoding
+ Multithreaded VP9 decoding (tile and frame-based)
+ Multithreaded VP9 encoding - on by default
+ YUV 4:2:2 and 4:4:4 support in VP9
+ 10 and 12bit support in VP9
+ 64bit ARM support by replacing ARM assembly with intrinsics
+
+ - Bug Fixes:
+ Fixes a VP9 bitstream issue in Profile 1. This only affected non-YUV 4:2:0
+ files.
+
+ - Known Issues:
+ Frame Parallel decoding fails for segmented and non-420 files.
+
2013-11-15 v1.3.0 "Forest"
This release introduces the VP9 codec in a backward-compatible way.
All existing users of VP8 can continue to use the library without
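
The 1.4.0 notes above cover two API-visible changes: the dropped compatibility layer (IMG_FMT_* becomes VPX_IMG_FMT_*) and multithreaded, frame-based VP9 decoding. A minimal C sketch of a caller written against this baseline, using only the public vpx_image.h/vpx_decoder.h API; the helper name, frame size, and thread count below are illustrative assumptions, not code from this commit:

/* Hedged sketch: exercises the renamed image-format constants and the
 * frame-threaded VP9 decoder init mentioned in the CHANGELOG above.
 * Error handling is abbreviated. */
#include "vpx/vpx_decoder.h"
#include "vpx/vp8dx.h"       /* vpx_codec_vp9_dx() */
#include "vpx/vpx_image.h"

static int open_vp9_decoder(vpx_codec_ctx_t *codec) {
  vpx_image_t img;
  vpx_codec_dec_cfg_t cfg = {0};

  /* 1.4.0 drops the compatibility layer: VPX_IMG_FMT_I420, not IMG_FMT_I420. */
  if (!vpx_img_alloc(&img, VPX_IMG_FMT_I420, 640, 480, 16))
    return -1;
  vpx_img_free(&img);

  cfg.threads = 4;  /* tile/frame worker threads */
  /* Frame-parallel decoding is requested with an init-time flag; per the
   * known issues above it fails for segmented and non-4:2:0 streams. */
  if (vpx_codec_dec_init(codec, vpx_codec_vp9_dx(), &cfg,
                         VPX_CODEC_USE_FRAME_THREADING) != VPX_CODEC_OK)
    return -1;
  return 0;
}

On failure, vpx_codec_error(codec) returns a printable description of the error.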
diff --git a/chromium/third_party/libvpx/source/libvpx/PATENTS b/chromium/third_party/libvpx/source/libvpx/PATENTS
index 79d17d7d6a9..caedf607e95 100644
--- a/chromium/third_party/libvpx/source/libvpx/PATENTS
+++ b/chromium/third_party/libvpx/source/libvpx/PATENTS
@@ -17,7 +17,7 @@ or agree to the institution of patent litigation or any other patent
enforcement activity against any entity (including a cross-claim or
counterclaim in a lawsuit) alleging that any of these implementations of WebM
or any code incorporated within any of these implementations of WebM
-constitutes direct or contributory patent infringement, or inducement of
+constitute direct or contributory patent infringement, or inducement of
patent infringement, then any patent rights granted to you under this License
for these implementations of WebM shall terminate as of the date such
litigation is filed.
diff --git a/chromium/third_party/libvpx/source/libvpx/README b/chromium/third_party/libvpx/source/libvpx/README
index 6f864d8591a..fcd1c2e18cf 100644
--- a/chromium/third_party/libvpx/source/libvpx/README
+++ b/chromium/third_party/libvpx/source/libvpx/README
@@ -1,4 +1,4 @@
-README - 30 May 2014
+README - 23 March 2015
Welcome to the WebM VP8/VP9 Codec SDK!
@@ -47,10 +47,6 @@ COMPILING THE APPLICATIONS/LIBRARIES:
--help output of the configure script. As of this writing, the list of
available targets is:
- armv5te-android-gcc
- armv5te-linux-rvct
- armv5te-linux-gcc
- armv5te-none-rvct
armv6-darwin-gcc
armv6-linux-rvct
armv6-linux-gcc
@@ -66,12 +62,6 @@ COMPILING THE APPLICATIONS/LIBRARIES:
armv7s-darwin-gcc
mips32-linux-gcc
mips64-linux-gcc
- ppc32-darwin8-gcc
- ppc32-darwin9-gcc
- ppc32-linux-gcc
- ppc64-darwin8-gcc
- ppc64-darwin9-gcc
- ppc64-linux-gcc
sparc-solaris-gcc
x86-android-gcc
x86-darwin8-gcc
@@ -82,6 +72,7 @@ COMPILING THE APPLICATIONS/LIBRARIES:
x86-darwin11-gcc
x86-darwin12-gcc
x86-darwin13-gcc
+ x86-darwin14-gcc
x86-iphonesimulator-gcc
x86-linux-gcc
x86-linux-icc
@@ -99,6 +90,7 @@ COMPILING THE APPLICATIONS/LIBRARIES:
x86_64-darwin11-gcc
x86_64-darwin12-gcc
x86_64-darwin13-gcc
+ x86_64-darwin14-gcc
x86_64-iphonesimulator-gcc
x86_64-linux-gcc
x86_64-linux-icc
@@ -115,6 +107,7 @@ COMPILING THE APPLICATIONS/LIBRARIES:
universal-darwin11-gcc
universal-darwin12-gcc
universal-darwin13-gcc
+ universal-darwin14-gcc
generic-gnu
The generic-gnu target, in conjunction with the CROSS environment variable,
diff --git a/chromium/third_party/libvpx/source/libvpx/build/arm-msvs/obj_int_extract.bat b/chromium/third_party/libvpx/source/libvpx/build/arm-msvs/obj_int_extract.bat
deleted file mode 100644
index c0987bcf7bf..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/build/arm-msvs/obj_int_extract.bat
+++ /dev/null
@@ -1,18 +0,0 @@
-REM Copyright (c) 2013 The WebM project authors. All Rights Reserved.
-REM
-REM Use of this source code is governed by a BSD-style license
-REM that can be found in the LICENSE file in the root of the source
-REM tree. An additional intellectual property rights grant can be found
-REM in the file PATENTS. All contributing project authors may
-REM be found in the AUTHORS file in the root of the source tree.
-echo on
-
-REM Arguments:
-REM %1 - Relative path to the directory containing the vp8 and vpx_scale
-REM source directories.
-REM %2 - Path to obj_int_extract.exe.
-cl /I. /I%1 /nologo /c /DWINAPI_FAMILY=WINAPI_FAMILY_PHONE_APP "%~1/vp8/encoder/vp8_asm_enc_offsets.c"
-%2\obj_int_extract.exe rvds "vp8_asm_enc_offsets.obj" > "vp8_asm_enc_offsets.asm"
-
-cl /I. /I%1 /nologo /c /DWINAPI_FAMILY=WINAPI_FAMILY_PHONE_APP "%~1/vpx_scale/vpx_scale_asm_offsets.c"
-%2\obj_int_extract.exe rvds "vpx_scale_asm_offsets.obj" > "vpx_scale_asm_offsets.asm"
diff --git a/chromium/third_party/libvpx/source/libvpx/build/make/Android.mk b/chromium/third_party/libvpx/source/libvpx/build/make/Android.mk
index 816334e040d..0add523f99d 100644
--- a/chromium/third_party/libvpx/source/libvpx/build/make/Android.mk
+++ b/chromium/third_party/libvpx/source/libvpx/build/make/Android.mk
@@ -43,7 +43,7 @@
# will remove any NEON dependency.
# To change to building armeabi, run ./libvpx/configure again, but with
-# --target=arm5te-android-gcc and modify the Application.mk file to
+# --target=armv6-android-gcc and modify the Application.mk file to
# set APP_ABI := armeabi
#
# Running ndk-build will build libvpx and include it in your project.
@@ -60,7 +60,7 @@ ifeq ($(TARGET_ARCH_ABI),armeabi-v7a)
include $(CONFIG_DIR)libs-armv7-android-gcc.mk
LOCAL_ARM_MODE := arm
else ifeq ($(TARGET_ARCH_ABI),armeabi)
- include $(CONFIG_DIR)libs-armv5te-android-gcc.mk
+ include $(CONFIG_DIR)libs-armv6-android-gcc.mk
LOCAL_ARM_MODE := arm
else ifeq ($(TARGET_ARCH_ABI),arm64-v8a)
include $(CONFIG_DIR)libs-armv8-android-gcc.mk
@@ -91,51 +91,8 @@ LOCAL_CFLAGS := -O3
# like x86inc.asm and x86_abi_support.asm
LOCAL_ASMFLAGS := -I$(LIBVPX_PATH)
-# -----------------------------------------------------------------------------
-# Template : asm_offsets_template
-# Arguments : 1: assembly offsets file to be created
-# 2: c file to base assembly offsets on
-# Returns : None
-# Usage : $(eval $(call asm_offsets_template,<asmfile>, <srcfile>
-# Rationale : Create offsets at compile time using for structures that are
-# defined in c, but used in assembly functions.
-# -----------------------------------------------------------------------------
-define asm_offsets_template
-
-_SRC:=$(2)
-_OBJ:=$(ASM_CNV_PATH)/$$(notdir $(2)).S
-
-_FLAGS = $$($$(my)CFLAGS) \
- $$(call get-src-file-target-cflags,$(2)) \
- $$(call host-c-includes,$$(LOCAL_C_INCLUDES) $$(CONFIG_DIR)) \
- $$(LOCAL_CFLAGS) \
- $$(NDK_APP_CFLAGS) \
- $$(call host-c-includes,$$($(my)C_INCLUDES)) \
- -DINLINE_ASM \
- -S \
-
-_TEXT = "Compile $$(call get-src-file-text,$(2))"
-_CC = $$(TARGET_CC)
-
-$$(eval $$(call ev-build-file))
-
-$(1) : $$(_OBJ) $(2)
- @mkdir -p $$(dir $$@)
- @grep $(OFFSET_PATTERN) $$< | tr -d '\#' | $(CONFIG_DIR)$(ASM_CONVERSION) > $$@
-endef
-
-# Use ads2gas script to convert from RVCT format to GAS format. This
-# puts the processed file under $(ASM_CNV_PATH). Local clean rule
-# to handle removing these
-ifeq ($(CONFIG_VP8_ENCODER), yes)
- ASM_CNV_OFFSETS_DEPEND += $(ASM_CNV_PATH)/vp8_asm_enc_offsets.asm
-endif
-ifeq ($(HAVE_NEON_ASM), yes)
- ASM_CNV_OFFSETS_DEPEND += $(ASM_CNV_PATH)/vpx_scale_asm_offsets.asm
-endif
-
.PRECIOUS: %.asm.s
-$(ASM_CNV_PATH)/libvpx/%.asm.s: $(LIBVPX_PATH)/%.asm $(ASM_CNV_OFFSETS_DEPEND)
+$(ASM_CNV_PATH)/libvpx/%.asm.s: $(LIBVPX_PATH)/%.asm
@mkdir -p $(dir $@)
@$(CONFIG_DIR)$(ASM_CONVERSION) <$< > $@
@@ -201,8 +158,6 @@ LOCAL_CFLAGS += \
LOCAL_MODULE := libvpx
-LOCAL_LDLIBS := -llog
-
ifeq ($(CONFIG_RUNTIME_CPU_DETECT),yes)
LOCAL_STATIC_LIBRARIES := cpufeatures
endif
@@ -215,6 +170,7 @@ ifeq ($(CONFIG_VP9), yes)
$(foreach file, $(LOCAL_SRC_FILES), $(LOCAL_PATH)/$(file)): vp9_rtcd.h
endif
$(foreach file, $(LOCAL_SRC_FILES), $(LOCAL_PATH)/$(file)): vpx_scale_rtcd.h
+$(foreach file, $(LOCAL_SRC_FILES), $(LOCAL_PATH)/$(file)): vpx_dsp_rtcd.h
ifeq ($(TARGET_ARCH_ABI),x86)
$(foreach file, $(LOCAL_SRC_FILES), $(LOCAL_PATH)/$(file)): vpx_config.asm
@@ -224,22 +180,13 @@ endif
clean:
@echo "Clean: ads2gas files [$(TARGET_ARCH_ABI)]"
@$(RM) $(CODEC_SRCS_ASM_ADS2GAS) $(CODEC_SRCS_ASM_NEON_ADS2GAS)
- @$(RM) $(patsubst %.asm, %.*, $(ASM_CNV_OFFSETS_DEPEND))
@$(RM) -r $(ASM_CNV_PATH)
@$(RM) $(CLEAN-OBJS)
-include $(BUILD_SHARED_LIBRARY)
-
-ifeq ($(HAVE_NEON), yes)
- $(eval $(call asm_offsets_template,\
- $(ASM_CNV_PATH)/vpx_scale_asm_offsets.asm, \
- $(LIBVPX_PATH)/vpx_scale/vpx_scale_asm_offsets.c))
-endif
-
-ifeq ($(CONFIG_VP8_ENCODER), yes)
- $(eval $(call asm_offsets_template,\
- $(ASM_CNV_PATH)/vp8_asm_enc_offsets.asm, \
- $(LIBVPX_PATH)/vp8/encoder/vp8_asm_enc_offsets.c))
+ifeq ($(ENABLE_SHARED),1)
+ include $(BUILD_SHARED_LIBRARY)
+else
+ include $(BUILD_STATIC_LIBRARY)
endif
ifeq ($(CONFIG_RUNTIME_CPU_DETECT),yes)
diff --git a/chromium/third_party/libvpx/source/libvpx/build/make/Makefile b/chromium/third_party/libvpx/source/libvpx/build/make/Makefile
index ed90397f0ff..fc7749a5519 100644
--- a/chromium/third_party/libvpx/source/libvpx/build/make/Makefile
+++ b/chromium/third_party/libvpx/source/libvpx/build/make/Makefile
@@ -146,6 +146,7 @@ $(BUILD_PFX)%.c.d: %.c
$(BUILD_PFX)%.c.o: %.c
$(if $(quiet),@echo " [CC] $@")
+ $(qexec)$(if $(CONFIG_DEPENDENCY_TRACKING),,mkdir -p $(dir $@))
$(qexec)$(CC) $(INTERNAL_CFLAGS) $(CFLAGS) -c -o $@ $<
$(BUILD_PFX)%.cc.d: %.cc
@@ -155,6 +156,7 @@ $(BUILD_PFX)%.cc.d: %.cc
$(BUILD_PFX)%.cc.o: %.cc
$(if $(quiet),@echo " [CXX] $@")
+ $(qexec)$(if $(CONFIG_DEPENDENCY_TRACKING),,mkdir -p $(dir $@))
$(qexec)$(CXX) $(INTERNAL_CFLAGS) $(CXXFLAGS) -c -o $@ $<
$(BUILD_PFX)%.cpp.d: %.cpp
@@ -164,6 +166,7 @@ $(BUILD_PFX)%.cpp.d: %.cpp
$(BUILD_PFX)%.cpp.o: %.cpp
$(if $(quiet),@echo " [CXX] $@")
+ $(qexec)$(if $(CONFIG_DEPENDENCY_TRACKING),,mkdir -p $(dir $@))
$(qexec)$(CXX) $(INTERNAL_CFLAGS) $(CXXFLAGS) -c -o $@ $<
$(BUILD_PFX)%.asm.d: %.asm
@@ -174,6 +177,7 @@ $(BUILD_PFX)%.asm.d: %.asm
$(BUILD_PFX)%.asm.o: %.asm
$(if $(quiet),@echo " [AS] $@")
+ $(qexec)$(if $(CONFIG_DEPENDENCY_TRACKING),,mkdir -p $(dir $@))
$(qexec)$(AS) $(ASFLAGS) -o $@ $<
$(BUILD_PFX)%.s.d: %.s
@@ -184,12 +188,14 @@ $(BUILD_PFX)%.s.d: %.s
$(BUILD_PFX)%.s.o: %.s
$(if $(quiet),@echo " [AS] $@")
+ $(qexec)$(if $(CONFIG_DEPENDENCY_TRACKING),,mkdir -p $(dir $@))
$(qexec)$(AS) $(ASFLAGS) -o $@ $<
.PRECIOUS: %.c.S
%.c.S: CFLAGS += -DINLINE_ASM
$(BUILD_PFX)%.c.S: %.c
$(if $(quiet),@echo " [GEN] $@")
+ $(qexec)$(if $(CONFIG_DEPENDENCY_TRACKING),,mkdir -p $(dir $@))
$(qexec)$(CC) -S $(CFLAGS) -o $@ $<
.PRECIOUS: %.asm.s
@@ -217,14 +223,6 @@ else
endif
#
-# Rule to extract assembly constants from C sources
-#
-obj_int_extract: build/make/obj_int_extract.c
- $(if $(quiet),@echo " [HOSTCC] $@")
- $(qexec)$(HOSTCC) -I. -I$(SRC_PATH_BARE) -o $@ $<
-CLEAN-OBJS += obj_int_extract
-
-#
# Utility functions
#
pairmap=$(if $(strip $(2)),\
@@ -340,9 +338,11 @@ endif
skip_deps := $(filter %clean,$(MAKECMDGOALS))
skip_deps += $(findstring testdata,$(MAKECMDGOALS))
ifeq ($(strip $(skip_deps)),)
- # Older versions of make don't like -include directives with no arguments
- ifneq ($(filter %.d,$(OBJS-yes:.o=.d)),)
- -include $(filter %.d,$(OBJS-yes:.o=.d))
+ ifeq ($(CONFIG_DEPENDENCY_TRACKING),yes)
+ # Older versions of make don't like -include directives with no arguments
+ ifneq ($(filter %.d,$(OBJS-yes:.o=.d)),)
+ -include $(filter %.d,$(OBJS-yes:.o=.d))
+ endif
endif
endif
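Taken together with the new mkdir calls above, the effect of CONFIG_DEPENDENCY_TRACKING is: when tracking is disabled, the .d files are never demanded (nothing includes them), so each object rule now creates its own output directory. A plausible one-shot build using the toggle from the help text:

    ./configure --disable-dependency-tracking
    make    # faster cold build; later header edits won't trigger rebuilds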
@@ -383,8 +383,8 @@ LIBS=$(call enabled,LIBS)
.libs: $(LIBS)
@touch $@
$(foreach lib,$(filter %_g.a,$(LIBS)),$(eval $(call archive_template,$(lib))))
-$(foreach lib,$(filter %so.$(VERSION_MAJOR).$(VERSION_MINOR).$(VERSION_PATCH),$(LIBS)),$(eval $(call so_template,$(lib))))
-$(foreach lib,$(filter %$(VERSION_MAJOR).dylib,$(LIBS)),$(eval $(call dl_template,$(lib))))
+$(foreach lib,$(filter %so.$(SO_VERSION_MAJOR).$(SO_VERSION_MINOR).$(SO_VERSION_PATCH),$(LIBS)),$(eval $(call so_template,$(lib))))
+$(foreach lib,$(filter %$(SO_VERSION_MAJOR).dylib,$(LIBS)),$(eval $(call dl_template,$(lib))))
INSTALL-LIBS=$(call cond_enabled,CONFIG_INSTALL_LIBS,INSTALL-LIBS)
ifeq ($(MAKECMDGOALS),dist)
@@ -424,11 +424,7 @@ ifneq ($(call enabled,DIST-SRCS),)
DIST-SRCS-$(CONFIG_MSVS) += build/make/gen_msvs_sln.sh
DIST-SRCS-$(CONFIG_MSVS) += build/make/gen_msvs_vcxproj.sh
DIST-SRCS-$(CONFIG_MSVS) += build/make/msvs_common.sh
- DIST-SRCS-$(CONFIG_MSVS) += build/x86-msvs/obj_int_extract.bat
- DIST-SRCS-$(CONFIG_MSVS) += build/arm-msvs/obj_int_extract.bat
DIST-SRCS-$(CONFIG_RVCT) += build/make/armlink_adapter.sh
- # Include obj_int_extract if we use offsets from *_asm_*_offsets
- DIST-SRCS-$(ARCH_ARM)$(ARCH_X86)$(ARCH_X86_64) += build/make/obj_int_extract.c
DIST-SRCS-$(ARCH_ARM) += build/make/ads2gas.pl
DIST-SRCS-$(ARCH_ARM) += build/make/ads2gas_apple.pl
DIST-SRCS-$(ARCH_ARM) += build/make/ads2armasm_ms.pl
diff --git a/chromium/third_party/libvpx/source/libvpx/build/make/configure.sh b/chromium/third_party/libvpx/source/libvpx/build/make/configure.sh
index 56e9f4406e8..68cc8bb4a5d 100755..100644
--- a/chromium/third_party/libvpx/source/libvpx/build/make/configure.sh
+++ b/chromium/third_party/libvpx/source/libvpx/build/make/configure.sh
@@ -14,62 +14,56 @@
# Logging / Output Functions
#
die_unknown(){
- echo "Unknown option \"$1\"."
- echo "See $0 --help for available options."
- clean_temp_files
- exit 1
+ echo "Unknown option \"$1\"."
+ echo "See $0 --help for available options."
+ clean_temp_files
+ exit 1
}
-
die() {
- echo "$@"
- echo
- echo "Configuration failed. This could reflect a misconfiguration of your"
- echo "toolchains, improper options selected, or another problem. If you"
- echo "don't see any useful error messages above, the next step is to look"
- echo "at the configure error log file ($logfile) to determine what"
- echo "configure was trying to do when it died."
- clean_temp_files
- exit 1
+ echo "$@"
+ echo
+ echo "Configuration failed. This could reflect a misconfiguration of your"
+ echo "toolchains, improper options selected, or another problem. If you"
+ echo "don't see any useful error messages above, the next step is to look"
+ echo "at the configure error log file ($logfile) to determine what"
+ echo "configure was trying to do when it died."
+ clean_temp_files
+ exit 1
}
-
log(){
- echo "$@" >>$logfile
+ echo "$@" >>$logfile
}
-
log_file(){
- log BEGIN $1
- cat -n $1 >>$logfile
- log END $1
+ log BEGIN $1
+ cat -n $1 >>$logfile
+ log END $1
}
-
log_echo() {
- echo "$@"
- log "$@"
+ echo "$@"
+ log "$@"
}
-
fwrite () {
- outfile=$1
- shift
- echo "$@" >> ${outfile}
+ outfile=$1
+ shift
+ echo "$@" >> ${outfile}
}
-
show_help_pre(){
- for opt in ${CMDLINE_SELECT}; do
- opt2=`echo $opt | sed -e 's;_;-;g'`
- if enabled $opt; then
- eval "toggle_${opt}=\"--disable-${opt2}\""
- else
- eval "toggle_${opt}=\"--enable-${opt2} \""
- fi
- done
+ for opt in ${CMDLINE_SELECT}; do
+ opt2=`echo $opt | sed -e 's;_;-;g'`
+ if enabled $opt; then
+ eval "toggle_${opt}=\"--disable-${opt2}\""
+ else
+ eval "toggle_${opt}=\"--enable-${opt2} \""
+ fi
+ done
- cat <<EOF
+ cat <<EOF
Usage: configure [options]
Options:
@@ -89,6 +83,8 @@ Build options:
${toggle_gprof} enable/disable gprof profiling instrumentation
${toggle_gcov} enable/disable gcov coverage instrumentation
${toggle_thumb} enable/disable building arm assembly in thumb mode
+ ${toggle_dependency_tracking}
+ disable to speed up one-time build
Install options:
${toggle_install_docs} control whether docs are installed
@@ -100,9 +96,8 @@ Install options:
EOF
}
-
show_help_post(){
- cat <<EOF
+ cat <<EOF
NOTES:
@@ -119,150 +114,137 @@ EOF
exit 1
}
-
show_targets() {
- while [ -n "$*" ]; do
- if [ "${1%%-*}" = "${2%%-*}" ]; then
- if [ "${2%%-*}" = "${3%%-*}" ]; then
- printf " %-24s %-24s %-24s\n" "$1" "$2" "$3"
- shift; shift; shift
- else
- printf " %-24s %-24s\n" "$1" "$2"
- shift; shift
- fi
- else
- printf " %-24s\n" "$1"
- shift
- fi
- done
+ while [ -n "$*" ]; do
+ if [ "${1%%-*}" = "${2%%-*}" ]; then
+ if [ "${2%%-*}" = "${3%%-*}" ]; then
+ printf " %-24s %-24s %-24s\n" "$1" "$2" "$3"
+ shift; shift; shift
+ else
+ printf " %-24s %-24s\n" "$1" "$2"
+ shift; shift
+ fi
+ else
+ printf " %-24s\n" "$1"
+ shift
+ fi
+ done
}
-
show_help() {
- show_help_pre
- show_help_post
+ show_help_pre
+ show_help_post
}
#
# List Processing Functions
#
set_all(){
- value=$1
- shift
- for var in $*; do
- eval $var=$value
- done
+ value=$1
+ shift
+ for var in $*; do
+ eval $var=$value
+ done
}
-
is_in(){
- value=$1
- shift
- for var in $*; do
- [ $var = $value ] && return 0
- done
- return 1
+ value=$1
+ shift
+ for var in $*; do
+ [ $var = $value ] && return 0
+ done
+ return 1
}
-
add_cflags() {
- CFLAGS="${CFLAGS} $@"
- CXXFLAGS="${CXXFLAGS} $@"
+ CFLAGS="${CFLAGS} $@"
+ CXXFLAGS="${CXXFLAGS} $@"
}
-
add_cflags_only() {
- CFLAGS="${CFLAGS} $@"
+ CFLAGS="${CFLAGS} $@"
}
-
add_cxxflags_only() {
- CXXFLAGS="${CXXFLAGS} $@"
+ CXXFLAGS="${CXXFLAGS} $@"
}
-
add_ldflags() {
- LDFLAGS="${LDFLAGS} $@"
+ LDFLAGS="${LDFLAGS} $@"
}
-
add_asflags() {
- ASFLAGS="${ASFLAGS} $@"
+ ASFLAGS="${ASFLAGS} $@"
}
-
add_extralibs() {
- extralibs="${extralibs} $@"
+ extralibs="${extralibs} $@"
}
#
# Boolean Manipulation Functions
#
enable_feature(){
- set_all yes $*
+ set_all yes $*
}
disable_feature(){
- set_all no $*
+ set_all no $*
}
enabled(){
- eval test "x\$$1" = "xyes"
+ eval test "x\$$1" = "xyes"
}
disabled(){
- eval test "x\$$1" = "xno"
+ eval test "x\$$1" = "xno"
}
-
soft_enable() {
- for var in $*; do
- if ! disabled $var; then
- log_echo " enabling $var"
- enable_feature $var
- fi
- done
+ for var in $*; do
+ if ! disabled $var; then
+ enabled $var || log_echo " enabling $var"
+ enable_feature $var
+ fi
+ done
}
soft_disable() {
- for var in $*; do
- if ! enabled $var; then
- log_echo " disabling $var"
- disable_feature $var
- fi
- done
+ for var in $*; do
+ if ! enabled $var; then
+ disabled $var || log_echo " disabling $var"
+ disable_feature $var
+ fi
+ done
}
-
#
# Text Processing Functions
#
toupper(){
- echo "$@" | tr abcdefghijklmnopqrstuvwxyz ABCDEFGHIJKLMNOPQRSTUVWXYZ
+ echo "$@" | tr abcdefghijklmnopqrstuvwxyz ABCDEFGHIJKLMNOPQRSTUVWXYZ
}
-
tolower(){
- echo "$@" | tr ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz
+ echo "$@" | tr ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz
}
-
#
# Temporary File Functions
#
source_path=${0%/*}
enable_feature source_path_used
if [ -z "$source_path" ] || [ "$source_path" = "." ]; then
- source_path="`pwd`"
- disable_feature source_path_used
+ source_path="`pwd`"
+ disable_feature source_path_used
fi
if test ! -z "$TMPDIR" ; then
- TMPDIRx="${TMPDIR}"
+ TMPDIRx="${TMPDIR}"
elif test ! -z "$TEMPDIR" ; then
- TMPDIRx="${TEMPDIR}"
+ TMPDIRx="${TEMPDIR}"
else
- TMPDIRx="/tmp"
+ TMPDIRx="/tmp"
fi
RAND=$(awk 'BEGIN { srand(); printf "%d\n",(rand() * 32768)}')
TMP_H="${TMPDIRx}/vpx-conf-$$-${RAND}.h"
@@ -273,76 +255,77 @@ TMP_X="${TMPDIRx}/vpx-conf-$$-${RAND}.x"
TMP_ASM="${TMPDIRx}/vpx-conf-$$-${RAND}.asm"
clean_temp_files() {
- rm -f ${TMP_C} ${TMP_CC} ${TMP_H} ${TMP_O} ${TMP_X} ${TMP_ASM}
- enabled gcov && rm -f ${TMP_C%.c}.gcno ${TMP_CC%.cc}.gcno
+ rm -f ${TMP_C} ${TMP_CC} ${TMP_H} ${TMP_O} ${TMP_X} ${TMP_ASM}
+ enabled gcov && rm -f ${TMP_C%.c}.gcno ${TMP_CC%.cc}.gcno
}
#
# Toolchain Check Functions
#
check_cmd() {
- enabled external_build && return
- log "$@"
- "$@" >>${logfile} 2>&1
+ enabled external_build && return
+ log "$@"
+ "$@" >>${logfile} 2>&1
}
check_cc() {
- log check_cc "$@"
- cat >${TMP_C}
- log_file ${TMP_C}
- check_cmd ${CC} ${CFLAGS} "$@" -c -o ${TMP_O} ${TMP_C}
+ log check_cc "$@"
+ cat >${TMP_C}
+ log_file ${TMP_C}
+ check_cmd ${CC} ${CFLAGS} "$@" -c -o ${TMP_O} ${TMP_C}
}
check_cxx() {
- log check_cxx "$@"
- cat >${TMP_CC}
- log_file ${TMP_CC}
- check_cmd ${CXX} ${CXXFLAGS} "$@" -c -o ${TMP_O} ${TMP_CC}
+ log check_cxx "$@"
+ cat >${TMP_CC}
+ log_file ${TMP_CC}
+ check_cmd ${CXX} ${CXXFLAGS} "$@" -c -o ${TMP_O} ${TMP_CC}
}
check_cpp() {
- log check_cpp "$@"
- cat > ${TMP_C}
- log_file ${TMP_C}
- check_cmd ${CC} ${CFLAGS} "$@" -E -o ${TMP_O} ${TMP_C}
+ log check_cpp "$@"
+ cat > ${TMP_C}
+ log_file ${TMP_C}
+ check_cmd ${CC} ${CFLAGS} "$@" -E -o ${TMP_O} ${TMP_C}
}
check_ld() {
- log check_ld "$@"
- check_cc $@ \
- && check_cmd ${LD} ${LDFLAGS} "$@" -o ${TMP_X} ${TMP_O} ${extralibs}
+ log check_ld "$@"
+ check_cc $@ \
+ && check_cmd ${LD} ${LDFLAGS} "$@" -o ${TMP_X} ${TMP_O} ${extralibs}
}
check_header(){
- log check_header "$@"
- header=$1
- shift
- var=`echo $header | sed 's/[^A-Za-z0-9_]/_/g'`
- disable_feature $var
- check_cpp "$@" <<EOF && enable_feature $var
+ log check_header "$@"
+ header=$1
+ shift
+ var=`echo $header | sed 's/[^A-Za-z0-9_]/_/g'`
+ disable_feature $var
+ check_cpp "$@" <<EOF && enable_feature $var
#include "$header"
int x;
EOF
}
-
check_cflags() {
- log check_cflags "$@"
- check_cc -Werror "$@" <<EOF
+ log check_cflags "$@"
+ check_cc -Werror "$@" <<EOF
int x;
EOF
}
check_cxxflags() {
- log check_cxxflags "$@"
+ log check_cxxflags "$@"
- # Catch CFLAGS that trigger CXX warnings
- case "$CXX" in
- *c++-analyzer|*clang++|*g++*) check_cxx -Werror "$@" <<EOF
+ # Catch CFLAGS that trigger CXX warnings
+ case "$CXX" in
+ *c++-analyzer|*clang++|*g++*)
+ check_cxx -Werror "$@" <<EOF
int x;
EOF
;;
- *) check_cxx -Werror "$@" <<EOF
+ *)
+ check_cxx -Werror "$@" <<EOF
int x;
EOF
;;
@@ -350,82 +333,82 @@ EOF
}
check_add_cflags() {
- check_cxxflags "$@" && add_cxxflags_only "$@"
- check_cflags "$@" && add_cflags_only "$@"
+ check_cxxflags "$@" && add_cxxflags_only "$@"
+ check_cflags "$@" && add_cflags_only "$@"
}
check_add_asflags() {
- log add_asflags "$@"
- add_asflags "$@"
+ log add_asflags "$@"
+ add_asflags "$@"
}
check_add_ldflags() {
- log add_ldflags "$@"
- add_ldflags "$@"
+ log add_ldflags "$@"
+ add_ldflags "$@"
}
check_asm_align() {
- log check_asm_align "$@"
- cat >${TMP_ASM} <<EOF
+ log check_asm_align "$@"
+ cat >${TMP_ASM} <<EOF
section .rodata
align 16
EOF
- log_file ${TMP_ASM}
- check_cmd ${AS} ${ASFLAGS} -o ${TMP_O} ${TMP_ASM}
- readelf -WS ${TMP_O} >${TMP_X}
- log_file ${TMP_X}
- if ! grep -q '\.rodata .* 16$' ${TMP_X}; then
- die "${AS} ${ASFLAGS} does not support section alignment (nasm <=2.08?)"
- fi
+ log_file ${TMP_ASM}
+ check_cmd ${AS} ${ASFLAGS} -o ${TMP_O} ${TMP_ASM}
+ readelf -WS ${TMP_O} >${TMP_X}
+ log_file ${TMP_X}
+ if ! grep -q '\.rodata .* 16$' ${TMP_X}; then
+ die "${AS} ${ASFLAGS} does not support section alignment (nasm <=2.08?)"
+ fi
}
# tests for -m$1 toggling the feature given in $2. If $2 is empty $1 is used.
check_gcc_machine_option() {
- opt="$1"
- feature="$2"
- [ -n "$feature" ] || feature="$opt"
-
- if enabled gcc && ! disabled "$feature" && ! check_cflags "-m$opt"; then
- RTCD_OPTIONS="${RTCD_OPTIONS}--disable-$feature "
- else
- soft_enable "$feature"
- fi
+ opt="$1"
+ feature="$2"
+ [ -n "$feature" ] || feature="$opt"
+
+ if enabled gcc && ! disabled "$feature" && ! check_cflags "-m$opt"; then
+ RTCD_OPTIONS="${RTCD_OPTIONS}--disable-$feature "
+ else
+ soft_enable "$feature"
+ fi
}
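Each probe thus either records the feature as disabled in RTCD_OPTIONS or soft-enables it. Illustrative calls (the real ones appear later in this file):

    check_gcc_machine_option mmx          # probes -mmmx, toggles "mmx"
    check_gcc_machine_option sse4 sse4_1  # probes -msse4, toggles "sse4_1"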
write_common_config_banner() {
- print_webm_license config.mk "##" ""
- echo '# This file automatically generated by configure. Do not edit!' >> config.mk
- echo "TOOLCHAIN := ${toolchain}" >> config.mk
+ print_webm_license config.mk "##" ""
+ echo '# This file automatically generated by configure. Do not edit!' >> config.mk
+ echo "TOOLCHAIN := ${toolchain}" >> config.mk
- case ${toolchain} in
- *-linux-rvct)
- echo "ALT_LIBC := ${alt_libc}" >> config.mk
- ;;
- esac
+ case ${toolchain} in
+ *-linux-rvct)
+ echo "ALT_LIBC := ${alt_libc}" >> config.mk
+ ;;
+ esac
}
write_common_config_targets() {
- for t in ${all_targets}; do
- if enabled ${t}; then
- if enabled universal || enabled child; then
- fwrite config.mk "ALL_TARGETS += ${t}-${toolchain}"
- else
- fwrite config.mk "ALL_TARGETS += ${t}"
- fi
- fi
+ for t in ${all_targets}; do
+ if enabled ${t}; then
+ if enabled universal || enabled child; then
+ fwrite config.mk "ALL_TARGETS += ${t}-${toolchain}"
+ else
+ fwrite config.mk "ALL_TARGETS += ${t}"
+ fi
+ fi
true;
- done
-true
+ done
+ true
}
write_common_target_config_mk() {
- saved_CC="${CC}"
- saved_CXX="${CXX}"
- enabled ccache && CC="ccache ${CC}"
- enabled ccache && CXX="ccache ${CXX}"
- print_webm_license $1 "##" ""
+ saved_CC="${CC}"
+ saved_CXX="${CXX}"
+ enabled ccache && CC="ccache ${CC}"
+ enabled ccache && CXX="ccache ${CXX}"
+ print_webm_license $1 "##" ""
- cat >> $1 << EOF
+ cat >> $1 << EOF
# This file automatically generated by configure. Do not edit!
SRC_PATH="$source_path"
SRC_PATH_BARE=$source_path
@@ -455,83 +438,87 @@ VCPROJ_SFX = ${VCPROJ_SFX}
RTCD_OPTIONS = ${RTCD_OPTIONS}
EOF
- if enabled rvct; then cat >> $1 << EOF
+ if enabled rvct; then cat >> $1 << EOF
fmt_deps = sed -e 's;^__image.axf;\${@:.d=.o} \$@;' #hide
EOF
- else cat >> $1 << EOF
+ else cat >> $1 << EOF
fmt_deps = sed -e 's;^\([a-zA-Z0-9_]*\)\.o;\${@:.d=.o} \$@;'
EOF
- fi
+ fi
- print_config_mk ARCH "${1}" ${ARCH_LIST}
- print_config_mk HAVE "${1}" ${HAVE_LIST}
- print_config_mk CONFIG "${1}" ${CONFIG_LIST}
- print_config_mk HAVE "${1}" gnu_strip
+ print_config_mk ARCH "${1}" ${ARCH_LIST}
+ print_config_mk HAVE "${1}" ${HAVE_LIST}
+ print_config_mk CONFIG "${1}" ${CONFIG_LIST}
+ print_config_mk HAVE "${1}" gnu_strip
- enabled msvs && echo "CONFIG_VS_VERSION=${vs_version}" >> "${1}"
+ enabled msvs && echo "CONFIG_VS_VERSION=${vs_version}" >> "${1}"
- CC="${saved_CC}"
- CXX="${saved_CXX}"
+ CC="${saved_CC}"
+ CXX="${saved_CXX}"
}
-
write_common_target_config_h() {
- print_webm_license ${TMP_H} "/*" " */"
- cat >> ${TMP_H} << EOF
+ print_webm_license ${TMP_H} "/*" " */"
+ cat >> ${TMP_H} << EOF
/* This file automatically generated by configure. Do not edit! */
#ifndef VPX_CONFIG_H
#define VPX_CONFIG_H
#define RESTRICT ${RESTRICT}
#define INLINE ${INLINE}
EOF
- print_config_h ARCH "${TMP_H}" ${ARCH_LIST}
- print_config_h HAVE "${TMP_H}" ${HAVE_LIST}
- print_config_h CONFIG "${TMP_H}" ${CONFIG_LIST}
- print_config_vars_h "${TMP_H}" ${VAR_LIST}
- echo "#endif /* VPX_CONFIG_H */" >> ${TMP_H}
- mkdir -p `dirname "$1"`
- cmp "$1" ${TMP_H} >/dev/null 2>&1 || mv ${TMP_H} "$1"
+ print_config_h ARCH "${TMP_H}" ${ARCH_LIST}
+ print_config_h HAVE "${TMP_H}" ${HAVE_LIST}
+ print_config_h CONFIG "${TMP_H}" ${CONFIG_LIST}
+ print_config_vars_h "${TMP_H}" ${VAR_LIST}
+ echo "#endif /* VPX_CONFIG_H */" >> ${TMP_H}
+ mkdir -p `dirname "$1"`
+ cmp "$1" ${TMP_H} >/dev/null 2>&1 || mv ${TMP_H} "$1"
}
process_common_cmdline() {
- for opt in "$@"; do
- optval="${opt#*=}"
- case "$opt" in
- --child) enable_feature child
+ for opt in "$@"; do
+ optval="${opt#*=}"
+ case "$opt" in
+ --child)
+ enable_feature child
;;
- --log*)
+ --log*)
logging="$optval"
if ! disabled logging ; then
- enabled logging || logfile="$logging"
+ enabled logging || logfile="$logging"
else
- logfile=/dev/null
+ logfile=/dev/null
fi
;;
- --target=*) toolchain="${toolchain:-${optval}}"
- ;;
- --force-target=*) toolchain="${toolchain:-${optval}}"; enable_feature force_toolchain
+ --target=*)
+ toolchain="${toolchain:-${optval}}"
;;
- --cpu)
+ --force-target=*)
+ toolchain="${toolchain:-${optval}}"
+ enable_feature force_toolchain
;;
- --cpu=*) tune_cpu="$optval"
+ --cpu=*)
+ tune_cpu="$optval"
;;
- --extra-cflags=*)
+ --extra-cflags=*)
extra_cflags="${optval}"
;;
- --enable-?*|--disable-?*)
+ --enable-?*|--disable-?*)
eval `echo "$opt" | sed 's/--/action=/;s/-/ option=/;s/-/_/g'`
if echo "${ARCH_EXT_LIST}" | grep "^ *$option\$" >/dev/null; then
- [ $action = "disable" ] && RTCD_OPTIONS="${RTCD_OPTIONS}--disable-${option} "
+ [ $action = "disable" ] && RTCD_OPTIONS="${RTCD_OPTIONS}--disable-${option} "
elif [ $action = "disable" ] && ! disabled $option ; then
echo "${CMDLINE_SELECT}" | grep "^ *$option\$" >/dev/null ||
die_unknown $opt
+ log_echo " disabling $option"
elif [ $action = "enable" ] && ! enabled $option ; then
echo "${CMDLINE_SELECT}" | grep "^ *$option\$" >/dev/null ||
die_unknown $opt
+ log_echo " enabling $option"
fi
${action}_feature $option
;;
- --require-?*)
+ --require-?*)
eval `echo "$opt" | sed 's/--/action=/;s/-/ option=/;s/-/_/g'`
if echo "${ARCH_EXT_LIST}" none | grep "^ *$option\$" >/dev/null; then
RTCD_OPTIONS="${RTCD_OPTIONS}${opt} "
@@ -539,22 +526,22 @@ process_common_cmdline() {
die_unknown $opt
fi
;;
- --force-enable-?*|--force-disable-?*)
+ --force-enable-?*|--force-disable-?*)
eval `echo "$opt" | sed 's/--force-/action=/;s/-/ option=/;s/-/_/g'`
${action}_feature $option
;;
- --libc=*)
+ --libc=*)
[ -d "${optval}" ] || die "Not a directory: ${optval}"
disable_feature builtin_libc
alt_libc="${optval}"
;;
- --as=*)
+ --as=*)
[ "${optval}" = yasm ] || [ "${optval}" = nasm ] \
- || [ "${optval}" = auto ] \
- || die "Must be yasm, nasm or auto: ${optval}"
+ || [ "${optval}" = auto ] \
+ || die "Must be yasm, nasm or auto: ${optval}"
alt_as="${optval}"
;;
- --size-limit=*)
+ --size-limit=*)
w="${optval%%x*}"
h="${optval##*x}"
VAR_LIST="DECODE_WIDTH_LIMIT ${w} DECODE_HEIGHT_LIMIT ${h}"
@@ -563,825 +550,832 @@ process_common_cmdline() {
|| die "Invalid size-limit: too big."
enable_feature size_limit
;;
- --prefix=*)
+ --prefix=*)
prefix="${optval}"
;;
- --libdir=*)
+ --libdir=*)
libdir="${optval}"
;;
- --sdk-path=*)
+ --sdk-path=*)
[ -d "${optval}" ] || die "Not a directory: ${optval}"
sdk_path="${optval}"
;;
- --libc|--as|--prefix|--libdir|--sdk-path)
+ --libc|--as|--prefix|--libdir|--sdk-path)
die "Option ${opt} requires argument"
;;
- --help|-h) show_help
+ --help|-h)
+ show_help
;;
- *) die_unknown $opt
+ *)
+ die_unknown $opt
;;
- esac
- done
+ esac
+ done
}
process_cmdline() {
- for opt do
- optval="${opt#*=}"
- case "$opt" in
- *) process_common_cmdline $opt
+ for opt do
+ optval="${opt#*=}"
+ case "$opt" in
+ *)
+ process_common_cmdline $opt
;;
- esac
- done
+ esac
+ done
}
-
post_process_common_cmdline() {
- prefix="${prefix:-/usr/local}"
- prefix="${prefix%/}"
- libdir="${libdir:-${prefix}/lib}"
- libdir="${libdir%/}"
- if [ "${libdir#${prefix}}" = "${libdir}" ]; then
- die "Libdir ${libdir} must be a subdirectory of ${prefix}"
- fi
+ prefix="${prefix:-/usr/local}"
+ prefix="${prefix%/}"
+ libdir="${libdir:-${prefix}/lib}"
+ libdir="${libdir%/}"
+ if [ "${libdir#${prefix}}" = "${libdir}" ]; then
+ die "Libdir ${libdir} must be a subdirectory of ${prefix}"
+ fi
}
-
post_process_cmdline() {
- true;
+ true;
}
setup_gnu_toolchain() {
- CC=${CC:-${CROSS}gcc}
- CXX=${CXX:-${CROSS}g++}
- AR=${AR:-${CROSS}ar}
- LD=${LD:-${CROSS}${link_with_cc:-ld}}
- AS=${AS:-${CROSS}as}
- STRIP=${STRIP:-${CROSS}strip}
- NM=${NM:-${CROSS}nm}
- AS_SFX=.s
- EXE_SFX=
+ CC=${CC:-${CROSS}gcc}
+ CXX=${CXX:-${CROSS}g++}
+ AR=${AR:-${CROSS}ar}
+ LD=${LD:-${CROSS}${link_with_cc:-ld}}
+ AS=${AS:-${CROSS}as}
+ STRIP=${STRIP:-${CROSS}strip}
+ NM=${NM:-${CROSS}nm}
+ AS_SFX=.s
+ EXE_SFX=
+}
+
+# Reliably find the newest available Darwin SDKs. (Older versions of
+# xcrun don't support --show-sdk-path.)
+show_darwin_sdk_path() {
+ xcrun --sdk $1 --show-sdk-path 2>/dev/null ||
+ xcodebuild -sdk $1 -version Path 2>/dev/null
}
process_common_toolchain() {
- if [ -z "$toolchain" ]; then
- gcctarget="${CHOST:-$(gcc -dumpmachine 2> /dev/null)}"
-
- # detect tgt_isa
- case "$gcctarget" in
- armv6*)
- tgt_isa=armv6
- ;;
- armv7*-hardfloat*)
- tgt_isa=armv7
- float_abi=hard
- ;;
- armv7*)
- tgt_isa=armv7
- float_abi=softfp
- ;;
- armv5te*)
- tgt_isa=armv5te
- ;;
- *x86_64*|*amd64*)
- tgt_isa=x86_64
- ;;
- *i[3456]86*)
- tgt_isa=x86
- ;;
- *powerpc64*)
- tgt_isa=ppc64
- ;;
- *powerpc*)
- tgt_isa=ppc32
- ;;
- *sparc*)
- tgt_isa=sparc
- ;;
- esac
+ if [ -z "$toolchain" ]; then
+ gcctarget="${CHOST:-$(gcc -dumpmachine 2> /dev/null)}"
- # detect tgt_os
- case "$gcctarget" in
- *darwin8*)
- tgt_isa=universal
- tgt_os=darwin8
- ;;
- *darwin9*)
- tgt_isa=universal
- tgt_os=darwin9
- ;;
- *darwin10*)
- tgt_isa=x86_64
- tgt_os=darwin10
- ;;
- *darwin11*)
- tgt_isa=x86_64
- tgt_os=darwin11
- ;;
- *darwin12*)
- tgt_isa=x86_64
- tgt_os=darwin12
- ;;
- *darwin13*)
- tgt_isa=x86_64
- tgt_os=darwin13
- ;;
- x86_64*mingw32*)
- tgt_os=win64
- ;;
- *mingw32*|*cygwin*)
- [ -z "$tgt_isa" ] && tgt_isa=x86
- tgt_os=win32
- ;;
- *linux*|*bsd*)
- tgt_os=linux
- ;;
- *solaris2.10)
- tgt_os=solaris
- ;;
- *os2*)
- tgt_os=os2
- ;;
- esac
+ # detect tgt_isa
+ case "$gcctarget" in
+ armv6*)
+ tgt_isa=armv6
+ ;;
+ armv7*-hardfloat*)
+ tgt_isa=armv7
+ float_abi=hard
+ ;;
+ armv7*)
+ tgt_isa=armv7
+ float_abi=softfp
+ ;;
+ *x86_64*|*amd64*)
+ tgt_isa=x86_64
+ ;;
+ *i[3456]86*)
+ tgt_isa=x86
+ ;;
+ *sparc*)
+ tgt_isa=sparc
+ ;;
+ esac
- if [ -n "$tgt_isa" ] && [ -n "$tgt_os" ]; then
- toolchain=${tgt_isa}-${tgt_os}-gcc
- fi
- fi
+ # detect tgt_os
+ case "$gcctarget" in
+ *darwin8*)
+ tgt_isa=universal
+ tgt_os=darwin8
+ ;;
+ *darwin9*)
+ tgt_isa=universal
+ tgt_os=darwin9
+ ;;
+ *darwin10*)
+ tgt_isa=x86_64
+ tgt_os=darwin10
+ ;;
+ *darwin11*)
+ tgt_isa=x86_64
+ tgt_os=darwin11
+ ;;
+ *darwin12*)
+ tgt_isa=x86_64
+ tgt_os=darwin12
+ ;;
+ *darwin13*)
+ tgt_isa=x86_64
+ tgt_os=darwin13
+ ;;
+ *darwin14*)
+ tgt_isa=x86_64
+ tgt_os=darwin14
+ ;;
+ x86_64*mingw32*)
+ tgt_os=win64
+ ;;
+ *mingw32*|*cygwin*)
+ [ -z "$tgt_isa" ] && tgt_isa=x86
+ tgt_os=win32
+ ;;
+ *linux*|*bsd*)
+ tgt_os=linux
+ ;;
+ *solaris2.10)
+ tgt_os=solaris
+ ;;
+ *os2*)
+ tgt_os=os2
+ ;;
+ esac
- toolchain=${toolchain:-generic-gnu}
+ if [ -n "$tgt_isa" ] && [ -n "$tgt_os" ]; then
+ toolchain=${tgt_isa}-${tgt_os}-gcc
+ fi
+ fi
- is_in ${toolchain} ${all_platforms} || enabled force_toolchain \
- || die "Unrecognized toolchain '${toolchain}'"
+ toolchain=${toolchain:-generic-gnu}
- enabled child || log_echo "Configuring for target '${toolchain}'"
+ is_in ${toolchain} ${all_platforms} || enabled force_toolchain \
+ || die "Unrecognized toolchain '${toolchain}'"
- #
- # Set up toolchain variables
- #
- tgt_isa=$(echo ${toolchain} | awk 'BEGIN{FS="-"}{print $1}')
- tgt_os=$(echo ${toolchain} | awk 'BEGIN{FS="-"}{print $2}')
- tgt_cc=$(echo ${toolchain} | awk 'BEGIN{FS="-"}{print $3}')
+ enabled child || log_echo "Configuring for target '${toolchain}'"
- # Mark the specific ISA requested as enabled
- soft_enable ${tgt_isa}
- enable_feature ${tgt_os}
- enable_feature ${tgt_cc}
+ #
+ # Set up toolchain variables
+ #
+ tgt_isa=$(echo ${toolchain} | awk 'BEGIN{FS="-"}{print $1}')
+ tgt_os=$(echo ${toolchain} | awk 'BEGIN{FS="-"}{print $2}')
+ tgt_cc=$(echo ${toolchain} | awk 'BEGIN{FS="-"}{print $3}')
- # Enable the architecture family
- case ${tgt_isa} in
- arm*) enable_feature arm;;
- mips*) enable_feature mips;;
- esac
+ # Mark the specific ISA requested as enabled
+ soft_enable ${tgt_isa}
+ enable_feature ${tgt_os}
+ enable_feature ${tgt_cc}
- # PIC is probably what we want when building shared libs
- enabled shared && soft_enable pic
+ # Enable the architecture family
+ case ${tgt_isa} in
+ arm*)
+ enable_feature arm
+ ;;
+ mips*)
+ enable_feature mips
+ ;;
+ esac
- # Minimum iOS version for all target platforms (darwin and iphonesimulator).
- IOS_VERSION_MIN="6.0"
+ # PIC is probably what we want when building shared libs
+ enabled shared && soft_enable pic
- # Handle darwin variants. Newer SDKs allow targeting older
- # platforms, so find the newest SDK available.
- case ${toolchain} in
- *-darwin*)
- if [ -z "${DEVELOPER_DIR}" ]; then
- DEVELOPER_DIR=`xcode-select -print-path 2> /dev/null`
- [ $? -ne 0 ] && OSX_SKIP_DIR_CHECK=1
- fi
- if [ -z "${OSX_SKIP_DIR_CHECK}" ]; then
- OSX_SDK_ROOTS="${DEVELOPER_DIR}/SDKs"
- OSX_SDK_VERSIONS="MacOSX10.4u.sdk MacOSX10.5.sdk MacOSX10.6.sdk"
- OSX_SDK_VERSIONS="${OSX_SDK_VERSIONS} MacOSX10.7.sdk"
- for v in ${OSX_SDK_VERSIONS}; do
- if [ -d "${OSX_SDK_ROOTS}/${v}" ]; then
- osx_sdk_dir="${OSX_SDK_ROOTS}/${v}"
- fi
- done
- fi
- ;;
- esac
+ # Minimum iOS version for all target platforms (darwin and iphonesimulator).
+ IOS_VERSION_MIN="6.0"
- if [ -d "${osx_sdk_dir}" ]; then
+ # Handle darwin variants. Newer SDKs allow targeting older
+ # platforms, so use the newest one available.
+ case ${toolchain} in
+ *-darwin*)
+ osx_sdk_dir="$(show_darwin_sdk_path macosx)"
+ if [ -d "${osx_sdk_dir}" ]; then
add_cflags "-isysroot ${osx_sdk_dir}"
add_ldflags "-isysroot ${osx_sdk_dir}"
- fi
+ fi
+ ;;
+ esac
- case ${toolchain} in
- *-darwin8-*)
- add_cflags "-mmacosx-version-min=10.4"
- add_ldflags "-mmacosx-version-min=10.4"
- ;;
- *-darwin9-*)
- add_cflags "-mmacosx-version-min=10.5"
- add_ldflags "-mmacosx-version-min=10.5"
- ;;
- *-darwin10-*)
- add_cflags "-mmacosx-version-min=10.6"
- add_ldflags "-mmacosx-version-min=10.6"
- ;;
- *-darwin11-*)
- add_cflags "-mmacosx-version-min=10.7"
- add_ldflags "-mmacosx-version-min=10.7"
- ;;
- *-darwin12-*)
- add_cflags "-mmacosx-version-min=10.8"
- add_ldflags "-mmacosx-version-min=10.8"
- ;;
- *-darwin13-*)
- add_cflags "-mmacosx-version-min=10.9"
- add_ldflags "-mmacosx-version-min=10.9"
- ;;
- *-iphonesimulator-*)
- add_cflags "-miphoneos-version-min=${IOS_VERSION_MIN}"
- add_ldflags "-miphoneos-version-min=${IOS_VERSION_MIN}"
- osx_sdk_dir="$(xcrun --sdk iphonesimulator --show-sdk-path)"
- add_cflags "-isysroot ${osx_sdk_dir}"
- add_ldflags "-isysroot ${osx_sdk_dir}"
- ;;
- esac
+ case ${toolchain} in
+ *-darwin8-*)
+ add_cflags "-mmacosx-version-min=10.4"
+ add_ldflags "-mmacosx-version-min=10.4"
+ ;;
+ *-darwin9-*)
+ add_cflags "-mmacosx-version-min=10.5"
+ add_ldflags "-mmacosx-version-min=10.5"
+ ;;
+ *-darwin10-*)
+ add_cflags "-mmacosx-version-min=10.6"
+ add_ldflags "-mmacosx-version-min=10.6"
+ ;;
+ *-darwin11-*)
+ add_cflags "-mmacosx-version-min=10.7"
+ add_ldflags "-mmacosx-version-min=10.7"
+ ;;
+ *-darwin12-*)
+ add_cflags "-mmacosx-version-min=10.8"
+ add_ldflags "-mmacosx-version-min=10.8"
+ ;;
+ *-darwin13-*)
+ add_cflags "-mmacosx-version-min=10.9"
+ add_ldflags "-mmacosx-version-min=10.9"
+ ;;
+ *-darwin14-*)
+ add_cflags "-mmacosx-version-min=10.10"
+ add_ldflags "-mmacosx-version-min=10.10"
+ ;;
+ *-iphonesimulator-*)
+ add_cflags "-miphoneos-version-min=${IOS_VERSION_MIN}"
+ add_ldflags "-miphoneos-version-min=${IOS_VERSION_MIN}"
+ iossim_sdk_dir="$(show_darwin_sdk_path iphonesimulator)"
+ if [ -d "${iossim_sdk_dir}" ]; then
+ add_cflags "-isysroot ${iossim_sdk_dir}"
+ add_ldflags "-isysroot ${iossim_sdk_dir}"
+ fi
+ ;;
+ esac
- # Handle Solaris variants. Solaris 10 needs -lposix4
- case ${toolchain} in
- sparc-solaris-*)
- add_extralibs -lposix4
- disable_feature fast_unaligned
- ;;
- *-solaris-*)
- add_extralibs -lposix4
- ;;
- esac
+ # Handle Solaris variants. Solaris 10 needs -lposix4
+ case ${toolchain} in
+ sparc-solaris-*)
+ add_extralibs -lposix4
+ ;;
+ *-solaris-*)
+ add_extralibs -lposix4
+ ;;
+ esac
- # Process ARM architecture variants
- case ${toolchain} in
+ # Process ARM architecture variants
+ case ${toolchain} in
arm*)
- # on arm, isa versions are supersets
- case ${tgt_isa} in
+ # on arm, isa versions are supersets
+ case ${tgt_isa} in
arm64|armv8)
- soft_enable neon
- ;;
+ soft_enable neon
+ ;;
armv7|armv7s)
- soft_enable neon
- soft_enable neon_asm
- soft_enable media
- soft_enable edsp
- soft_enable fast_unaligned
- ;;
+ soft_enable neon
+ # Only enable neon_asm when neon is also enabled.
+ enabled neon && soft_enable neon_asm
+ # If someone tries to force it through, die.
+ if disabled neon && enabled neon_asm; then
+ die "Disabling neon while keeping neon-asm is not supported"
+ fi
+ soft_enable media
+ ;;
armv6)
- soft_enable media
- soft_enable edsp
- soft_enable fast_unaligned
- ;;
- armv5te)
- soft_enable edsp
- disable_feature fast_unaligned
- ;;
- esac
+ soft_enable media
+ ;;
+ esac
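With this coupling, an explicit mismatch now fails at configure time instead of producing a broken build. For example (the target name here is illustrative; option spellings follow the usual underscore-to-hyphen mapping):

    ./configure --target=armv7-linux-gcc --disable-neon --enable-neon-asm
    # dies: "Disabling neon while keeping neon-asm is not supported"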
- asm_conversion_cmd="cat"
+ asm_conversion_cmd="cat"
- case ${tgt_cc} in
+ case ${tgt_cc} in
gcc)
- CROSS=${CROSS:-arm-none-linux-gnueabi-}
- link_with_cc=gcc
- setup_gnu_toolchain
- arch_int=${tgt_isa##armv}
- arch_int=${arch_int%%te}
- check_add_asflags --defsym ARCHITECTURE=${arch_int}
- tune_cflags="-mtune="
- if [ ${tgt_isa} = "armv7" ] || [ ${tgt_isa} = "armv7s" ]; then
- if [ -z "${float_abi}" ]; then
- check_cpp <<EOF && float_abi=hard || float_abi=softfp
+ CROSS=${CROSS:-arm-none-linux-gnueabi-}
+ link_with_cc=gcc
+ setup_gnu_toolchain
+ arch_int=${tgt_isa##armv}
+ arch_int=${arch_int%%te}
+ check_add_asflags --defsym ARCHITECTURE=${arch_int}
+ tune_cflags="-mtune="
+ if [ ${tgt_isa} = "armv7" ] || [ ${tgt_isa} = "armv7s" ]; then
+ if [ -z "${float_abi}" ]; then
+ check_cpp <<EOF && float_abi=hard || float_abi=softfp
#ifndef __ARM_PCS_VFP
#error "not hardfp"
#endif
EOF
- fi
- check_add_cflags -march=armv7-a -mfloat-abi=${float_abi}
- check_add_asflags -march=armv7-a -mfloat-abi=${float_abi}
-
- if enabled neon || enabled neon_asm
- then
- check_add_cflags -mfpu=neon #-ftree-vectorize
- check_add_asflags -mfpu=neon
- fi
-
- if [ -z "${tune_cpu}" ]; then
- tune_cpu=cortex-a8
- fi
- else
- check_add_cflags -march=${tgt_isa}
- check_add_asflags -march=${tgt_isa}
fi
+ check_add_cflags -march=armv7-a -mfloat-abi=${float_abi}
+ check_add_asflags -march=armv7-a -mfloat-abi=${float_abi}
- enabled debug && add_asflags -g
- asm_conversion_cmd="${source_path}/build/make/ads2gas.pl"
- if enabled thumb; then
- asm_conversion_cmd="$asm_conversion_cmd -thumb"
- check_add_cflags -mthumb
- check_add_asflags -mthumb -mimplicit-it=always
+ if enabled neon || enabled neon_asm; then
+ check_add_cflags -mfpu=neon #-ftree-vectorize
+ check_add_asflags -mfpu=neon
fi
- ;;
+ else
+ check_add_cflags -march=${tgt_isa}
+ check_add_asflags -march=${tgt_isa}
+ fi
+
+ enabled debug && add_asflags -g
+ asm_conversion_cmd="${source_path}/build/make/ads2gas.pl"
+ if enabled thumb; then
+ asm_conversion_cmd="$asm_conversion_cmd -thumb"
+ check_add_cflags -mthumb
+ check_add_asflags -mthumb -mimplicit-it=always
+ fi
+ ;;
vs*)
- asm_conversion_cmd="${source_path}/build/make/ads2armasm_ms.pl"
- AS_SFX=.s
- msvs_arch_dir=arm-msvs
- disable_feature multithread
- disable_feature unit_tests
- vs_version=${tgt_cc##vs}
- if [ $vs_version -ge 12 ]; then
- # MSVC 2013 doesn't allow doing plain .exe projects for ARM,
- # only "AppContainerApplication" which requires an AppxManifest.
- # Therefore disable the examples, just build the library.
- disable_feature examples
- fi
- ;;
+ asm_conversion_cmd="${source_path}/build/make/ads2armasm_ms.pl"
+ AS_SFX=.s
+ msvs_arch_dir=arm-msvs
+ disable_feature multithread
+ disable_feature unit_tests
+ vs_version=${tgt_cc##vs}
+ if [ $vs_version -ge 12 ]; then
+ # MSVC 2013 doesn't allow doing plain .exe projects for ARM,
+ # only "AppContainerApplication" which requires an AppxManifest.
+ # Therefore disable the examples, just build the library.
+ disable_feature examples
+ fi
+ ;;
rvct)
- CC=armcc
- AR=armar
- AS=armasm
- LD="${source_path}/build/make/armlink_adapter.sh"
- STRIP=arm-none-linux-gnueabi-strip
- NM=arm-none-linux-gnueabi-nm
- tune_cflags="--cpu="
- tune_asflags="--cpu="
- if [ -z "${tune_cpu}" ]; then
- if [ ${tgt_isa} = "armv7" ]; then
- if enabled neon || enabled neon_asm
- then
- check_add_cflags --fpu=softvfp+vfpv3
- check_add_asflags --fpu=softvfp+vfpv3
- fi
- check_add_cflags --cpu=Cortex-A8
- check_add_asflags --cpu=Cortex-A8
- else
- check_add_cflags --cpu=${tgt_isa##armv}
- check_add_asflags --cpu=${tgt_isa##armv}
- fi
+ CC=armcc
+ AR=armar
+ AS=armasm
+ LD="${source_path}/build/make/armlink_adapter.sh"
+ STRIP=arm-none-linux-gnueabi-strip
+ NM=arm-none-linux-gnueabi-nm
+ tune_cflags="--cpu="
+ tune_asflags="--cpu="
+ if [ -z "${tune_cpu}" ]; then
+ if [ ${tgt_isa} = "armv7" ]; then
+ if enabled neon || enabled neon_asm
+ then
+ check_add_cflags --fpu=softvfp+vfpv3
+ check_add_asflags --fpu=softvfp+vfpv3
+ fi
+ check_add_cflags --cpu=Cortex-A8
+ check_add_asflags --cpu=Cortex-A8
+ else
+ check_add_cflags --cpu=${tgt_isa##armv}
+ check_add_asflags --cpu=${tgt_isa##armv}
fi
- arch_int=${tgt_isa##armv}
- arch_int=${arch_int%%te}
- check_add_asflags --pd "\"ARCHITECTURE SETA ${arch_int}\""
- enabled debug && add_asflags -g
- add_cflags --gnu
- add_cflags --enum_is_int
- add_cflags --wchar32
- ;;
- esac
+ fi
+ arch_int=${tgt_isa##armv}
+ arch_int=${arch_int%%te}
+ check_add_asflags --pd "\"ARCHITECTURE SETA ${arch_int}\""
+ enabled debug && add_asflags -g
+ add_cflags --gnu
+ add_cflags --enum_is_int
+ add_cflags --wchar32
+ ;;
+ esac
- case ${tgt_os} in
+ case ${tgt_os} in
none*)
- disable_feature multithread
- disable_feature os_support
- ;;
+ disable_feature multithread
+ disable_feature os_support
+ ;;
android*)
- SDK_PATH=${sdk_path}
- COMPILER_LOCATION=`find "${SDK_PATH}" \
- -name "arm-linux-androideabi-gcc*" -print -quit`
- TOOLCHAIN_PATH=${COMPILER_LOCATION%/*}/arm-linux-androideabi-
- CC=${TOOLCHAIN_PATH}gcc
- CXX=${TOOLCHAIN_PATH}g++
- AR=${TOOLCHAIN_PATH}ar
- LD=${TOOLCHAIN_PATH}gcc
- AS=${TOOLCHAIN_PATH}as
- STRIP=${TOOLCHAIN_PATH}strip
- NM=${TOOLCHAIN_PATH}nm
-
- if [ -z "${alt_libc}" ]; then
- alt_libc=`find "${SDK_PATH}" -name arch-arm -print | \
- awk '{n = split($0,a,"/"); \
- split(a[n-1],b,"-"); \
- print $0 " " b[2]}' | \
- sort -g -k 2 | \
- awk '{ print $1 }' | tail -1`
- fi
-
- add_cflags "--sysroot=${alt_libc}"
- add_ldflags "--sysroot=${alt_libc}"
-
- # linker flag that routes around a CPU bug in some
- # Cortex-A8 implementations (NDK Dev Guide)
- add_ldflags "-Wl,--fix-cortex-a8"
-
- enable_feature pic
- soft_enable realtime_only
- if [ ${tgt_isa} = "armv7" ]; then
- soft_enable runtime_cpu_detect
- fi
- if enabled runtime_cpu_detect; then
- add_cflags "-I${SDK_PATH}/sources/android/cpufeatures"
- fi
+ SDK_PATH=${sdk_path}
+ COMPILER_LOCATION=`find "${SDK_PATH}" \
+ -name "arm-linux-androideabi-gcc*" -print -quit`
+ TOOLCHAIN_PATH=${COMPILER_LOCATION%/*}/arm-linux-androideabi-
+ CC=${TOOLCHAIN_PATH}gcc
+ CXX=${TOOLCHAIN_PATH}g++
+ AR=${TOOLCHAIN_PATH}ar
+ LD=${TOOLCHAIN_PATH}gcc
+ AS=${TOOLCHAIN_PATH}as
+ STRIP=${TOOLCHAIN_PATH}strip
+ NM=${TOOLCHAIN_PATH}nm
+
+ if [ -z "${alt_libc}" ]; then
+ alt_libc=`find "${SDK_PATH}" -name arch-arm -print | \
+ awk '{n = split($0,a,"/"); \
+ split(a[n-1],b,"-"); \
+ print $0 " " b[2]}' | \
+ sort -g -k 2 | \
+ awk '{ print $1 }' | tail -1`
+ fi
+
+ add_cflags "--sysroot=${alt_libc}"
+ add_ldflags "--sysroot=${alt_libc}"
+
+ # linker flag that routes around a CPU bug in some
+ # Cortex-A8 implementations (NDK Dev Guide)
+ add_ldflags "-Wl,--fix-cortex-a8"
+
+ enable_feature pic
+ soft_enable realtime_only
+ if [ ${tgt_isa} = "armv7" ]; then
+ soft_enable runtime_cpu_detect
+ fi
+ if enabled runtime_cpu_detect; then
+ add_cflags "-I${SDK_PATH}/sources/android/cpufeatures"
+ fi
;;
darwin*)
- XCRUN_FIND="xcrun --sdk iphoneos -find"
- CXX="$(${XCRUN_FIND} clang++)"
- CC="$(${XCRUN_FIND} clang)"
- AR="$(${XCRUN_FIND} ar)"
- AS="$(${XCRUN_FIND} as)"
- STRIP="$(${XCRUN_FIND} strip)"
- NM="$(${XCRUN_FIND} nm)"
- RANLIB="$(${XCRUN_FIND} ranlib)"
- AS_SFX=.s
-
- # Special handling of ld for armv6 because libclang_rt.ios.a does
- # not contain armv6 support in Apple's clang package:
- # Apple LLVM version 5.1 (clang-503.0.40) (based on LLVM 3.4svn).
- # TODO(tomfinegan): Remove this. Our minimum iOS version (6.0)
- # renders support for armv6 unnecessary because the 3GS and up
- # support neon.
- if [ "${tgt_isa}" = "armv6" ]; then
- LD="$(${XCRUN_FIND} ld)"
- else
- LD="${CXX:-$(${XCRUN_FIND} ld)}"
- fi
-
- # ASFLAGS is written here instead of using check_add_asflags
- # because we need to overwrite all of ASFLAGS and purge the
- # options that were put in above
- ASFLAGS="-arch ${tgt_isa} -g"
-
- alt_libc="$(xcrun --sdk iphoneos --show-sdk-path)"
- add_cflags -arch ${tgt_isa} -isysroot ${alt_libc}
- add_ldflags -arch ${tgt_isa}
-
- if [ "${LD}" = "${CXX}" ]; then
- add_ldflags -miphoneos-version-min="${IOS_VERSION_MIN}"
- else
- add_ldflags -ios_version_min "${IOS_VERSION_MIN}"
- fi
-
- for d in lib usr/lib usr/lib/system; do
- try_dir="${alt_libc}/${d}"
- [ -d "${try_dir}" ] && add_ldflags -L"${try_dir}"
- done
-
- asm_conversion_cmd="${source_path}/build/make/ads2gas_apple.pl"
- ;;
+ XCRUN_FIND="xcrun --sdk iphoneos --find"
+ CXX="$(${XCRUN_FIND} clang++)"
+ CC="$(${XCRUN_FIND} clang)"
+ AR="$(${XCRUN_FIND} ar)"
+ AS="$(${XCRUN_FIND} as)"
+ STRIP="$(${XCRUN_FIND} strip)"
+ NM="$(${XCRUN_FIND} nm)"
+ RANLIB="$(${XCRUN_FIND} ranlib)"
+ AS_SFX=.s
+
+ # Special handling of ld for armv6 because libclang_rt.ios.a does
+ # not contain armv6 support in Apple's clang package:
+ # Apple LLVM version 5.1 (clang-503.0.40) (based on LLVM 3.4svn).
+ # TODO(tomfinegan): Remove this. Our minimum iOS version (6.0)
+ # renders support for armv6 unnecessary because the 3GS and up
+ # support neon.
+ if [ "${tgt_isa}" = "armv6" ]; then
+ LD="$(${XCRUN_FIND} ld)"
+ else
+ LD="${CXX:-$(${XCRUN_FIND} ld)}"
+ fi
+
+ # ASFLAGS is written here instead of using check_add_asflags
+ # because we need to overwrite all of ASFLAGS and purge the
+ # options that were put in above
+ ASFLAGS="-arch ${tgt_isa} -g"
+
+ add_cflags -arch ${tgt_isa}
+ add_ldflags -arch ${tgt_isa}
+
+ alt_libc="$(show_darwin_sdk_path iphoneos)"
+ if [ -d "${alt_libc}" ]; then
+ add_cflags -isysroot ${alt_libc}
+ fi
+
+ if [ "${LD}" = "${CXX}" ]; then
+ add_ldflags -miphoneos-version-min="${IOS_VERSION_MIN}"
+ else
+ add_ldflags -ios_version_min "${IOS_VERSION_MIN}"
+ fi
+
+ for d in lib usr/lib usr/lib/system; do
+ try_dir="${alt_libc}/${d}"
+ [ -d "${try_dir}" ] && add_ldflags -L"${try_dir}"
+ done
+
+ asm_conversion_cmd="${source_path}/build/make/ads2gas_apple.pl"
+ ;;
linux*)
- enable_feature linux
- if enabled rvct; then
- # Check if we have CodeSourcery GCC in PATH. Needed for
- # libraries
- hash arm-none-linux-gnueabi-gcc 2>&- || \
- die "Couldn't find CodeSourcery GCC from PATH"
-
- # Use armcc as a linker to enable translation of
- # some gcc specific options such as -lm and -lpthread.
- LD="armcc --translate_gcc"
-
- # create configuration file (uses path to CodeSourcery GCC)
- armcc --arm_linux_configure --arm_linux_config_file=arm_linux.cfg
-
- add_cflags --arm_linux_paths --arm_linux_config_file=arm_linux.cfg
- add_asflags --no_hide_all --apcs=/interwork
- add_ldflags --arm_linux_paths --arm_linux_config_file=arm_linux.cfg
- enabled pic && add_cflags --apcs=/fpic
- enabled pic && add_asflags --apcs=/fpic
- enabled shared && add_cflags --shared
- fi
- ;;
-
- esac
- ;;
+ enable_feature linux
+ if enabled rvct; then
+ # Check if we have CodeSourcery GCC in PATH. Needed for
+ # libraries
+ hash arm-none-linux-gnueabi-gcc 2>&- || \
+ die "Couldn't find CodeSourcery GCC from PATH"
+
+ # Use armcc as a linker to enable translation of
+ # some gcc specific options such as -lm and -lpthread.
+ LD="armcc --translate_gcc"
+
+ # create configuration file (uses path to CodeSourcery GCC)
+ armcc --arm_linux_configure --arm_linux_config_file=arm_linux.cfg
+
+ add_cflags --arm_linux_paths --arm_linux_config_file=arm_linux.cfg
+ add_asflags --no_hide_all --apcs=/interwork
+ add_ldflags --arm_linux_paths --arm_linux_config_file=arm_linux.cfg
+ enabled pic && add_cflags --apcs=/fpic
+ enabled pic && add_asflags --apcs=/fpic
+ enabled shared && add_cflags --shared
+ fi
+ ;;
+ esac
+ ;;
mips*)
- link_with_cc=gcc
- setup_gnu_toolchain
- tune_cflags="-mtune="
- if enabled dspr2; then
- check_add_cflags -mips32r2 -mdspr2
- disable_feature fast_unaligned
- fi
- check_add_cflags -march=${tgt_isa}
- check_add_asflags -march=${tgt_isa}
- check_add_asflags -KPIC
- ;;
- ppc*)
- enable_feature ppc
- bits=${tgt_isa##ppc}
- link_with_cc=gcc
- setup_gnu_toolchain
- add_asflags -force_cpusubtype_ALL -I"\$(dir \$<)darwin"
- soft_enable altivec
- enabled altivec && add_cflags -maltivec
-
- case "$tgt_os" in
- linux*)
- add_asflags -maltivec -mregnames -I"\$(dir \$<)linux"
- ;;
- darwin*)
- darwin_arch="-arch ppc"
- enabled ppc64 && darwin_arch="${darwin_arch}64"
- add_cflags ${darwin_arch} -m${bits} -fasm-blocks
- add_asflags ${darwin_arch} -force_cpusubtype_ALL -I"\$(dir \$<)darwin"
- add_ldflags ${darwin_arch} -m${bits}
- enabled altivec && add_cflags -faltivec
- ;;
- esac
- ;;
- x86*)
- case ${tgt_os} in
- win*)
- enabled gcc && add_cflags -fno-common
- ;;
- solaris*)
- CC=${CC:-${CROSS}gcc}
- CXX=${CXX:-${CROSS}g++}
- LD=${LD:-${CROSS}gcc}
- CROSS=${CROSS:-g}
- ;;
- os2)
- AS=${AS:-nasm}
- ;;
- esac
-
- AS="${alt_as:-${AS:-auto}}"
- case ${tgt_cc} in
- icc*)
- CC=${CC:-icc}
- LD=${LD:-icc}
- setup_gnu_toolchain
- add_cflags -use-msasm # remove -use-msasm too?
- # add -no-intel-extensions to suppress warning #10237
- # refer to http://software.intel.com/en-us/forums/topic/280199
- add_ldflags -i-static -no-intel-extensions
- enabled x86_64 && add_cflags -ipo -static -O3 -no-prec-div
- enabled x86_64 && AR=xiar
- case ${tune_cpu} in
- atom*)
- tune_cflags="-x"
- tune_cpu="SSE3_ATOM"
- ;;
- *)
- tune_cflags="-march="
- ;;
- esac
+ link_with_cc=gcc
+ setup_gnu_toolchain
+ tune_cflags="-mtune="
+ if enabled dspr2; then
+ check_add_cflags -mips32r2 -mdspr2
+ fi
+
+ if enabled runtime_cpu_detect; then
+ disable_feature runtime_cpu_detect
+ fi
+
+ if [ -n "${tune_cpu}" ]; then
+ case ${tune_cpu} in
+ p5600)
+ check_add_cflags -mips32r5 -funroll-loops -mload-store-pairs
+ check_add_cflags -msched-weight -mhard-float -mfp64
+ check_add_asflags -mips32r5 -mhard-float -mfp64
+ check_add_ldflags -mfp64
;;
- gcc*)
- link_with_cc=gcc
- tune_cflags="-march="
- setup_gnu_toolchain
- #for 32 bit x86 builds, -O3 did not turn on this flag
- enabled optimizations && disabled gprof && check_add_cflags -fomit-frame-pointer
- ;;
- vs*)
- # When building with Microsoft Visual Studio the assembler is
- # invoked directly. Checking at configure time is unnecessary.
- # Skip the check by setting AS arbitrarily
- AS=msvs
- msvs_arch_dir=x86-msvs
- vc_version=${tgt_cc##vs}
- case $vc_version in
- 7|8|9|10)
- echo "${tgt_cc} does not support avx/avx2, disabling....."
- RTCD_OPTIONS="${RTCD_OPTIONS}--disable-avx --disable-avx2 "
- soft_disable avx
- soft_disable avx2
- ;;
- esac
+ i6400)
+ check_add_cflags -mips64r6 -mabi=64 -funroll-loops -msched-weight
+ check_add_cflags -mload-store-pairs -mhard-float -mfp64
+ check_add_asflags -mips64r6 -mabi=64 -mhard-float -mfp64
+ check_add_ldflags -mips64r6 -mabi=64 -mfp64
;;
esac
- bits=32
- enabled x86_64 && bits=64
- check_cpp <<EOF && bits=x32
-#ifndef __ILP32__
+ if enabled msa; then
+ add_cflags -mmsa
+ add_asflags -mmsa
+ add_ldflags -mmsa
+ fi
+ fi
+
+ check_add_cflags -march=${tgt_isa}
+ check_add_asflags -march=${tgt_isa}
+ check_add_asflags -KPIC
+ ;;
+ x86*)
+ case ${tgt_os} in
+ win*)
+ enabled gcc && add_cflags -fno-common
+ ;;
+ solaris*)
+ CC=${CC:-${CROSS}gcc}
+ CXX=${CXX:-${CROSS}g++}
+ LD=${LD:-${CROSS}gcc}
+ CROSS=${CROSS:-g}
+ ;;
+ os2)
+ AS=${AS:-nasm}
+ ;;
+ esac
+
+ AS="${alt_as:-${AS:-auto}}"
+ case ${tgt_cc} in
+ icc*)
+ CC=${CC:-icc}
+ LD=${LD:-icc}
+ setup_gnu_toolchain
+ add_cflags -use-msasm # remove -use-msasm too?
+ # add -no-intel-extensions to suppress warning #10237
+ # refer to http://software.intel.com/en-us/forums/topic/280199
+ add_ldflags -i-static -no-intel-extensions
+ enabled x86_64 && add_cflags -ipo -static -O3 -no-prec-div
+ enabled x86_64 && AR=xiar
+ case ${tune_cpu} in
+ atom*)
+ tune_cflags="-x"
+ tune_cpu="SSE3_ATOM"
+ ;;
+ *)
+ tune_cflags="-march="
+ ;;
+ esac
+ ;;
+ gcc*)
+ link_with_cc=gcc
+ tune_cflags="-march="
+ setup_gnu_toolchain
+ #for 32 bit x86 builds, -O3 did not turn on this flag
+ enabled optimizations && disabled gprof && check_add_cflags -fomit-frame-pointer
+ ;;
+ vs*)
+ # When building with Microsoft Visual Studio the assembler is
+ # invoked directly. Checking at configure time is unnecessary.
+ # Skip the check by setting AS arbitrarily
+ AS=msvs
+ msvs_arch_dir=x86-msvs
+ vc_version=${tgt_cc##vs}
+ case $vc_version in
+ 7|8|9|10)
+ echo "${tgt_cc} does not support avx/avx2, disabling....."
+ RTCD_OPTIONS="${RTCD_OPTIONS}--disable-avx --disable-avx2 "
+ soft_disable avx
+ soft_disable avx2
+ ;;
+ esac
+ ;;
+ esac
+
+ bits=32
+ enabled x86_64 && bits=64
+ check_cpp <<EOF && bits=x32
+#if !defined(__ILP32__) || !defined(__x86_64__)
#error "not x32"
#endif
EOF
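The added __x86_64__ term matters because some compilers define __ILP32__ for ordinary 32-bit builds as well; only the x32 ABI defines both macros:

    # gcc -mx32: __ILP32__ and __x86_64__ set -> probe passes, bits=x32
    # gcc -m32:  __x86_64__ absent            -> probe fails, bits stays 32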
- case ${tgt_cc} in
- gcc*)
- add_cflags -m${bits}
- add_ldflags -m${bits}
- ;;
- esac
-
- soft_enable runtime_cpu_detect
- # We can't use 'check_cflags' until the compiler is configured and CC is
- # populated.
- check_gcc_machine_option mmx
- check_gcc_machine_option sse
- check_gcc_machine_option sse2
- check_gcc_machine_option sse3
- check_gcc_machine_option ssse3
- check_gcc_machine_option sse4 sse4_1
- check_gcc_machine_option avx
- check_gcc_machine_option avx2
-
- case "${AS}" in
- auto|"")
- which nasm >/dev/null 2>&1 && AS=nasm
- which yasm >/dev/null 2>&1 && AS=yasm
- [ "${AS}" = auto ] || [ -z "${AS}" ] \
- && die "Neither yasm nor nasm have been found"
- ;;
- esac
- log_echo " using $AS"
- [ "${AS##*/}" = nasm ] && add_asflags -Ox
- AS_SFX=.asm
- case ${tgt_os} in
- win32)
- add_asflags -f win32
- enabled debug && add_asflags -g cv8
- EXE_SFX=.exe
- ;;
- win64)
- add_asflags -f x64
- enabled debug && add_asflags -g cv8
- EXE_SFX=.exe
- ;;
- linux*|solaris*|android*)
- add_asflags -f elf${bits}
- enabled debug && [ "${AS}" = yasm ] && add_asflags -g dwarf2
- enabled debug && [ "${AS}" = nasm ] && add_asflags -g
- [ "${AS##*/}" = nasm ] && check_asm_align
- ;;
- darwin*)
- add_asflags -f macho${bits}
- enabled x86 && darwin_arch="-arch i386" || darwin_arch="-arch x86_64"
- add_cflags ${darwin_arch}
- add_ldflags ${darwin_arch}
- # -mdynamic-no-pic is still a bit of voodoo -- it was required at
- # one time, but does not seem to be now, and it breaks some of the
- # code that still relies on inline assembly.
- # enabled icc && ! enabled pic && add_cflags -fno-pic -mdynamic-no-pic
- enabled icc && ! enabled pic && add_cflags -fno-pic
- ;;
- iphonesimulator)
- add_asflags -f macho${bits}
- enabled x86 && sim_arch="-arch i386" || sim_arch="-arch x86_64"
- add_cflags ${sim_arch}
- add_ldflags ${sim_arch}
- ;;
- os2)
- add_asflags -f aout
- enabled debug && add_asflags -g
- EXE_SFX=.exe
- ;;
- *) log "Warning: Unknown os $tgt_os while setting up $AS flags"
- ;;
- esac
- ;;
+ case ${tgt_cc} in
+ gcc*)
+ add_cflags -m${bits}
+ add_ldflags -m${bits}
+ ;;
+ esac
+
+ soft_enable runtime_cpu_detect
+ # We can't use 'check_cflags' until the compiler is configured and CC is
+ # populated.
+ check_gcc_machine_option mmx
+ check_gcc_machine_option sse
+ check_gcc_machine_option sse2
+ check_gcc_machine_option sse3
+ check_gcc_machine_option ssse3
+ check_gcc_machine_option sse4 sse4_1
+ check_gcc_machine_option avx
+ check_gcc_machine_option avx2
+
+ case "${AS}" in
+ auto|"")
+ which nasm >/dev/null 2>&1 && AS=nasm
+ which yasm >/dev/null 2>&1 && AS=yasm
+ if [ "${AS}" = nasm ] ; then
+ # Apple ships version 0.98 of nasm through at least Xcode 6. Revisit
+ # this check if they start shipping a compatible version.
+ apple=`nasm -v | grep "Apple"`
+ [ -n "${apple}" ] \
+ && echo "Unsupported version of nasm: ${apple}" \
+ && AS=""
+ fi
+ [ "${AS}" = auto ] || [ -z "${AS}" ] \
+ && die "Neither yasm nor nasm have been found"
+ ;;
+ esac
+ log_echo " using $AS"
+ [ "${AS##*/}" = nasm ] && add_asflags -Ox
+ AS_SFX=.asm
+ case ${tgt_os} in
+ win32)
+ add_asflags -f win32
+ enabled debug && add_asflags -g cv8
+ EXE_SFX=.exe
+ ;;
+ win64)
+ add_asflags -f x64
+ enabled debug && add_asflags -g cv8
+ EXE_SFX=.exe
+ ;;
+ linux*|solaris*|android*)
+ add_asflags -f elf${bits}
+ enabled debug && [ "${AS}" = yasm ] && add_asflags -g dwarf2
+ enabled debug && [ "${AS}" = nasm ] && add_asflags -g
+ [ "${AS##*/}" = nasm ] && check_asm_align
+ ;;
+ darwin*)
+ add_asflags -f macho${bits}
+ enabled x86 && darwin_arch="-arch i386" || darwin_arch="-arch x86_64"
+ add_cflags ${darwin_arch}
+ add_ldflags ${darwin_arch}
+ # -mdynamic-no-pic is still a bit of voodoo -- it was required at
+ # one time, but does not seem to be now, and it breaks some of the
+ # code that still relies on inline assembly.
+ # enabled icc && ! enabled pic && add_cflags -fno-pic -mdynamic-no-pic
+ enabled icc && ! enabled pic && add_cflags -fno-pic
+ ;;
+ iphonesimulator)
+ add_asflags -f macho${bits}
+ enabled x86 && sim_arch="-arch i386" || sim_arch="-arch x86_64"
+ add_cflags ${sim_arch}
+ add_ldflags ${sim_arch}
+ ;;
+ os2)
+ add_asflags -f aout
+ enabled debug && add_asflags -g
+ EXE_SFX=.exe
+ ;;
+ *)
+ log "Warning: Unknown os $tgt_os while setting up $AS flags"
+ ;;
+ esac
+ ;;
universal*|*-gcc|generic-gnu)
- link_with_cc=gcc
- enable_feature gcc
- setup_gnu_toolchain
- ;;
- esac
+ link_with_cc=gcc
+ enable_feature gcc
+ setup_gnu_toolchain
+ ;;
+ esac
- # Try to enable CPU specific tuning
- if [ -n "${tune_cpu}" ]; then
- if [ -n "${tune_cflags}" ]; then
- check_add_cflags ${tune_cflags}${tune_cpu} || \
- die "Requested CPU '${tune_cpu}' not supported by compiler"
- fi
+ # Try to enable CPU specific tuning
+ if [ -n "${tune_cpu}" ]; then
+ if [ -n "${tune_cflags}" ]; then
+ check_add_cflags ${tune_cflags}${tune_cpu} || \
+ die "Requested CPU '${tune_cpu}' not supported by compiler"
+ fi
if [ -n "${tune_asflags}" ]; then
- check_add_asflags ${tune_asflags}${tune_cpu} || \
- die "Requested CPU '${tune_cpu}' not supported by assembler"
- fi
+ check_add_asflags ${tune_asflags}${tune_cpu} || \
+ die "Requested CPU '${tune_cpu}' not supported by assembler"
+ fi
if [ -z "${tune_cflags}${tune_asflags}" ]; then
- log_echo "Warning: CPU tuning not supported by this toolchain"
- fi
+ log_echo "Warning: CPU tuning not supported by this toolchain"
fi
-
- if enabled debug; then
- check_add_cflags -g && check_add_ldflags -g
+ fi
+
+ if enabled debug; then
+ check_add_cflags -g && check_add_ldflags -g
+ else
+ check_add_cflags -DNDEBUG
+ fi
+
+ enabled gprof && check_add_cflags -pg && check_add_ldflags -pg
+ enabled gcov &&
+ check_add_cflags -fprofile-arcs -ftest-coverage &&
+ check_add_ldflags -fprofile-arcs -ftest-coverage
+
+ if enabled optimizations; then
+ if enabled rvct; then
+ enabled small && check_add_cflags -Ospace || check_add_cflags -Otime
else
- check_add_cflags -DNDEBUG
- fi
-
- enabled gprof && check_add_cflags -pg && check_add_ldflags -pg
- enabled gcov &&
- check_add_cflags -fprofile-arcs -ftest-coverage &&
- check_add_ldflags -fprofile-arcs -ftest-coverage
-
- if enabled optimizations; then
- if enabled rvct; then
- enabled small && check_add_cflags -Ospace || check_add_cflags -Otime
- else
- enabled small && check_add_cflags -O2 || check_add_cflags -O3
- fi
+ enabled small && check_add_cflags -O2 || check_add_cflags -O3
fi
+ fi
- tgt_os_no_version=$(echo "${tgt_os}" | tr -d "[0-9]")
- # Default use_x86inc to yes when we are 64 bit, non-pic, or on any
- # non-Darwin target.
- if [ "${tgt_isa}" = "x86_64" ] || [ "${pic}" != "yes" ] || \
- [ "${tgt_os_no_version}" != "darwin" ]; then
- soft_enable use_x86inc
- fi
+ if [ "${tgt_isa}" = "x86_64" ] || [ "${tgt_isa}" = "x86" ]; then
+ soft_enable use_x86inc
+ fi
- # Position Independent Code (PIC) support, for building relocatable
- # shared objects
- enabled gcc && enabled pic && check_add_cflags -fPIC
+ # Position Independent Code (PIC) support, for building relocatable
+ # shared objects
+ enabled gcc && enabled pic && check_add_cflags -fPIC
- # Work around longjmp interception on glibc >= 2.11, to improve binary
- # compatibility. See http://code.google.com/p/webm/issues/detail?id=166
- enabled linux && check_add_cflags -U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=0
+ # Work around longjmp interception on glibc >= 2.11, to improve binary
+ # compatibility. See http://code.google.com/p/webm/issues/detail?id=166
+ enabled linux && check_add_cflags -U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=0
- # Check for strip utility variant
- ${STRIP} -V 2>/dev/null | grep GNU >/dev/null && enable_feature gnu_strip
+ # Check for strip utility variant
+ ${STRIP} -V 2>/dev/null | grep GNU >/dev/null && enable_feature gnu_strip
- # Try to determine target endianness
- check_cc <<EOF
- unsigned int e = 'O'<<24 | '2'<<16 | 'B'<<8 | 'E';
+ # Try to determine target endianness
+ check_cc <<EOF
+unsigned int e = 'O'<<24 | '2'<<16 | 'B'<<8 | 'E';
EOF
[ -f "${TMP_O}" ] && od -A n -t x1 "${TMP_O}" | tr -d '\n' |
grep '4f *32 *42 *45' >/dev/null 2>&1 && enable_feature big_endian
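A worked reading of that probe: 'O'<<24 | '2'<<16 | 'B'<<8 | 'E' is 0x4f324245, so a big-endian target lays the initializer out as the byte run "4f 32 42 45", which the grep matches; a little-endian target stores "45 42 32 4f" and big_endian stays disabled.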
# Try to find which inline keywords are supported
check_cc <<EOF && INLINE="inline"
- static inline function() {}
+static inline function() {}
EOF
- check_cc <<EOF && INLINE="__inline__ __attribute__((always_inline))"
- static __attribute__((always_inline)) function() {}
-EOF
-
- # Almost every platform uses pthreads.
- if enabled multithread; then
- case ${toolchain} in
- *-win*-vs*);;
- *-android-gcc);;
- *) check_header pthread.h && add_extralibs -lpthread
- esac
- fi
- # only for MIPS platforms
+ # Almost every platform uses pthreads.
+ if enabled multithread; then
case ${toolchain} in
- mips*)
- if enabled dspr2; then
- if enabled big_endian; then
- echo "dspr2 optimizations are available only for little endian platforms"
- disable_feature dspr2
- fi
- fi
+ *-win*-vs*)
+ ;;
+ *-android-gcc)
+ ;;
+ *)
+ check_header pthread.h && add_extralibs -lpthread
;;
esac
+ fi
- # glibc needs these
- if enabled linux; then
- add_cflags -D_LARGEFILE_SOURCE
- add_cflags -D_FILE_OFFSET_BITS=64
- fi
-
- # append any user defined extra cflags
- if [ -n "${extra_cflags}" ] ; then
- check_add_cflags ${extra_cflags} || \
- die "Requested extra CFLAGS '${extra_cflags}' not supported by compiler"
- fi
+ # only for MIPS platforms
+ case ${toolchain} in
+ mips*)
+ if enabled big_endian; then
+ if enabled dspr2; then
+ echo "dspr2 optimizations are available only for little endian platforms"
+ disable_feature dspr2
+ fi
+ if enabled msa; then
+ echo "msa optimizations are available only for little endian platforms"
+ disable_feature msa
+ fi
+ fi
+ ;;
+ esac
+
+ # glibc needs these
+ if enabled linux; then
+ add_cflags -D_LARGEFILE_SOURCE
+ add_cflags -D_FILE_OFFSET_BITS=64
+ fi
+
+ # append any user defined extra cflags
+ if [ -n "${extra_cflags}" ] ; then
+ check_add_cflags ${extra_cflags} || \
+ die "Requested extra CFLAGS '${extra_cflags}' not supported by compiler"
+ fi
}
process_toolchain() {
- process_common_toolchain
+ process_common_toolchain
}
print_config_mk() {
- saved_prefix="${prefix}"
- prefix=$1
- makefile=$2
- shift 2
- for cfg; do
- if enabled $cfg; then
- upname="`toupper $cfg`"
- echo "${prefix}_${upname}=yes" >> $makefile
- fi
- done
- prefix="${saved_prefix}"
+ saved_prefix="${prefix}"
+ prefix=$1
+ makefile=$2
+ shift 2
+ for cfg; do
+ if enabled $cfg; then
+ upname="`toupper $cfg`"
+ echo "${prefix}_${upname}=yes" >> $makefile
+ fi
+ done
+ prefix="${saved_prefix}"
}
print_config_h() {
- saved_prefix="${prefix}"
- prefix=$1
- header=$2
- shift 2
- for cfg; do
- upname="`toupper $cfg`"
- if enabled $cfg; then
- echo "#define ${prefix}_${upname} 1" >> $header
- else
- echo "#define ${prefix}_${upname} 0" >> $header
- fi
- done
- prefix="${saved_prefix}"
+ saved_prefix="${prefix}"
+ prefix=$1
+ header=$2
+ shift 2
+ for cfg; do
+ upname="`toupper $cfg`"
+ if enabled $cfg; then
+ echo "#define ${prefix}_${upname} 1" >> $header
+ else
+ echo "#define ${prefix}_${upname} 0" >> $header
+ fi
+ done
+ prefix="${saved_prefix}"
}
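To make the generated output concrete: a hypothetical invocation such as "print_config_h HAVE config.h mmx sse2" (flag names invented for illustration), with mmx enabled and sse2 disabled, appends:

    /* Fragment appended to config.h by the hypothetical invocation above: */
    #define HAVE_MMX 1
    #define HAVE_SSE2 0

print_config_mk, by contrast, emits only the enabled flags, as makefile assignments of the form HAVE_MMX=yes.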
print_config_vars_h() {
- header=$1
- shift
- while [ $# -gt 0 ]; do
- upname="`toupper $1`"
- echo "#define ${upname} $2" >> $header
- shift 2
- done
+ header=$1
+ shift
+ while [ $# -gt 0 ]; do
+ upname="`toupper $1`"
+ echo "#define ${upname} $2" >> $header
+ shift 2
+ done
}
print_webm_license() {
- saved_prefix="${prefix}"
- destination=$1
- prefix="$2"
- suffix="$3"
- shift 3
- cat <<EOF > ${destination}
+ saved_prefix="${prefix}"
+ destination=$1
+ prefix="$2"
+ suffix="$3"
+ shift 3
+ cat <<EOF > ${destination}
${prefix} Copyright (c) 2011 The WebM project authors. All Rights Reserved.${suffix}
${prefix} ${suffix}
${prefix} Use of this source code is governed by a BSD-style license${suffix}
@@ -1390,43 +1384,43 @@ ${prefix} tree. An additional intellectual property rights grant can be found${s
${prefix} in the file PATENTS. All contributing project authors may${suffix}
${prefix} be found in the AUTHORS file in the root of the source tree.${suffix}
EOF
- prefix="${saved_prefix}"
+ prefix="${saved_prefix}"
}
process_targets() {
- true;
+ true;
}
process_detect() {
- true;
+ true;
}
enable_feature logging
logfile="config.log"
self=$0
process() {
- cmdline_args="$@"
- process_cmdline "$@"
- if enabled child; then
- echo "# ${self} $@" >> ${logfile}
- else
- echo "# ${self} $@" > ${logfile}
- fi
- post_process_common_cmdline
- post_process_cmdline
- process_toolchain
- process_detect
- process_targets
-
- OOT_INSTALLS="${OOT_INSTALLS}"
- if enabled source_path_used; then
- # Prepare the PWD for building.
- for f in ${OOT_INSTALLS}; do
- install -D "${source_path}/$f" "$f"
- done
- fi
- cp "${source_path}/build/make/Makefile" .
-
- clean_temp_files
- true
+ cmdline_args="$@"
+ process_cmdline "$@"
+ if enabled child; then
+ echo "# ${self} $@" >> ${logfile}
+ else
+ echo "# ${self} $@" > ${logfile}
+ fi
+ post_process_common_cmdline
+ post_process_cmdline
+ process_toolchain
+ process_detect
+ process_targets
+
+ OOT_INSTALLS="${OOT_INSTALLS}"
+ if enabled source_path_used; then
+ # Prepare the PWD for building.
+ for f in ${OOT_INSTALLS}; do
+ install -D "${source_path}/$f" "$f"
+ done
+ fi
+ cp "${source_path}/build/make/Makefile" .
+
+ clean_temp_files
+ true
}
diff --git a/chromium/third_party/libvpx/source/libvpx/build/make/gen_msvs_proj.sh b/chromium/third_party/libvpx/source/libvpx/build/make/gen_msvs_proj.sh
index 79072259349..dcce78255d4 100755
--- a/chromium/third_party/libvpx/source/libvpx/build/make/gen_msvs_proj.sh
+++ b/chromium/third_party/libvpx/source/libvpx/build/make/gen_msvs_proj.sh
@@ -295,23 +295,8 @@ generate_vcproj() {
case "$target" in
x86*)
case "$name" in
- obj_int_extract)
- tag Tool \
- Name="VCCLCompilerTool" \
- Optimization="0" \
- AdditionalIncludeDirectories="$incs" \
- PreprocessorDefinitions="WIN32;DEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;_CRT_SECURE_NO_DEPRECATE" \
- RuntimeLibrary="$debug_runtime" \
- WarningLevel="3" \
- DebugInformationFormat="1" \
- $warn_64bit \
- ;;
vpx)
tag Tool \
- Name="VCPreBuildEventTool" \
- CommandLine="call obj_int_extract.bat &quot;$src_path_bare&quot; $plat_no_ws\\\$(ConfigurationName)" \
-
- tag Tool \
Name="VCCLCompilerTool" \
Optimization="0" \
AdditionalIncludeDirectories="$incs" \
@@ -347,11 +332,6 @@ generate_vcproj() {
case "$target" in
x86*)
case "$name" in
- obj_int_extract)
- tag Tool \
- Name="VCLinkerTool" \
- GenerateDebugInformation="true" \
- ;;
*)
tag Tool \
Name="VCLinkerTool" \
@@ -400,25 +380,8 @@ generate_vcproj() {
case "$target" in
x86*)
case "$name" in
- obj_int_extract)
- tag Tool \
- Name="VCCLCompilerTool" \
- Optimization="2" \
- FavorSizeorSpeed="1" \
- AdditionalIncludeDirectories="$incs" \
- PreprocessorDefinitions="WIN32;NDEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;_CRT_SECURE_NO_DEPRECATE" \
- RuntimeLibrary="$release_runtime" \
- UsePrecompiledHeader="0" \
- WarningLevel="3" \
- DebugInformationFormat="0" \
- $warn_64bit \
- ;;
vpx)
tag Tool \
- Name="VCPreBuildEventTool" \
- CommandLine="call obj_int_extract.bat &quot;$src_path_bare&quot; $plat_no_ws\\\$(ConfigurationName)" \
-
- tag Tool \
Name="VCCLCompilerTool" \
Optimization="2" \
FavorSizeorSpeed="1" \
@@ -456,11 +419,6 @@ generate_vcproj() {
case "$target" in
x86*)
case "$name" in
- obj_int_extract)
- tag Tool \
- Name="VCLinkerTool" \
- GenerateDebugInformation="true" \
- ;;
*)
tag Tool \
Name="VCLinkerTool" \
diff --git a/chromium/third_party/libvpx/source/libvpx/build/make/gen_msvs_vcxproj.sh b/chromium/third_party/libvpx/source/libvpx/build/make/gen_msvs_vcxproj.sh
index 56b9a3b50b4..643ebd634be 100755
--- a/chromium/third_party/libvpx/source/libvpx/build/make/gen_msvs_vcxproj.sh
+++ b/chromium/third_party/libvpx/source/libvpx/build/make/gen_msvs_vcxproj.sh
@@ -262,15 +262,9 @@ case "$target" in
asm_Release_cmdline="yasm -Xvc -f win32 ${yasmincs} &quot;%(FullPath)&quot;"
;;
arm*)
+ platforms[0]="ARM"
asm_Debug_cmdline="armasm -nologo &quot;%(FullPath)&quot;"
asm_Release_cmdline="armasm -nologo &quot;%(FullPath)&quot;"
- if [ "$name" = "obj_int_extract" ]; then
- # We don't want to build this tool for the target architecture,
- # but for an architecture we can run locally during the build.
- platforms[0]="Win32"
- else
- platforms[0]="ARM"
- fi
;;
*) die "Unsupported target $target!"
;;
@@ -400,23 +394,13 @@ generate_vcxproj() {
if [ "$hostplat" == "ARM" ]; then
hostplat=Win32
fi
- open_tag PreBuildEvent
- tag_content Command "call obj_int_extract.bat &quot;$src_path_bare&quot; $hostplat\\\$(Configuration)"
- close_tag PreBuildEvent
fi
open_tag ClCompile
if [ "$config" = "Debug" ]; then
opt=Disabled
runtime=$debug_runtime
curlibs=$debug_libs
- case "$name" in
- obj_int_extract)
- debug=DEBUG
- ;;
- *)
- debug=_DEBUG
- ;;
- esac
+ debug=_DEBUG
else
opt=MaxSpeed
runtime=$release_runtime
@@ -424,14 +408,7 @@ generate_vcxproj() {
tag_content FavorSizeOrSpeed Speed
debug=NDEBUG
fi
- case "$name" in
- obj_int_extract)
- extradefines=";_CONSOLE"
- ;;
- *)
- extradefines=";$defines"
- ;;
- esac
+ extradefines=";$defines"
tag_content Optimization $opt
tag_content AdditionalIncludeDirectories "$incs;%(AdditionalIncludeDirectories)"
tag_content PreprocessorDefinitions "WIN32;$debug;_CRT_SECURE_NO_WARNINGS;_CRT_SECURE_NO_DEPRECATE$extradefines;%(PreprocessorDefinitions)"
@@ -451,10 +428,6 @@ generate_vcxproj() {
case "$proj_kind" in
exe)
open_tag Link
- if [ "$name" != "obj_int_extract" ]; then
- tag_content AdditionalDependencies "$curlibs;%(AdditionalDependencies)"
- tag_content AdditionalLibraryDirectories "$libdirs;%(AdditionalLibraryDirectories)"
- fi
tag_content GenerateDebugInformation true
# Console is the default normally, but if
# AppContainerApplication is set, we need to override it.
diff --git a/chromium/third_party/libvpx/source/libvpx/build/make/iosbuild.sh b/chromium/third_party/libvpx/source/libvpx/build/make/iosbuild.sh
index fb91b87894b..89fa681864a 100755
--- a/chromium/third_party/libvpx/source/libvpx/build/make/iosbuild.sh
+++ b/chromium/third_party/libvpx/source/libvpx/build/make/iosbuild.sh
@@ -18,15 +18,19 @@ set -e
devnull='> /dev/null 2>&1'
BUILD_ROOT="_iosbuild"
+CONFIGURE_ARGS="--disable-docs
+ --disable-examples
+ --disable-libyuv
+ --disable-unit-tests"
DIST_DIR="_dist"
FRAMEWORK_DIR="VPX.framework"
HEADER_DIR="${FRAMEWORK_DIR}/Headers/vpx"
MAKE_JOBS=1
-LIBVPX_SOURCE_DIR=$(dirname "$0" | sed -e s,/build/make,,)
+SCRIPT_DIR=$(dirname "$0")
+LIBVPX_SOURCE_DIR=$(cd ${SCRIPT_DIR}/../..; pwd)
LIPO=$(xcrun -sdk iphoneos${SDK} -find lipo)
ORIG_PWD="$(pwd)"
TARGETS="arm64-darwin-gcc
- armv6-darwin-gcc
armv7-darwin-gcc
armv7s-darwin-gcc
x86-iphonesimulator-gcc
@@ -42,8 +46,8 @@ build_target() {
mkdir "${target}"
cd "${target}"
- eval "../../${LIBVPX_SOURCE_DIR}/configure" --target="${target}" \
- --disable-docs ${EXTRA_CONFIGURE_ARGS} ${devnull}
+ eval "${LIBVPX_SOURCE_DIR}/configure" --target="${target}" \
+ ${CONFIGURE_ARGS} ${EXTRA_CONFIGURE_ARGS} ${devnull}
export DIST_DIR
eval make -j ${MAKE_JOBS} dist ${devnull}
cd "${old_pwd}"
@@ -58,9 +62,6 @@ target_to_preproc_symbol() {
arm64-*)
echo "__aarch64__"
;;
- armv6-*)
- echo "__ARM_ARCH_6__"
- ;;
armv7-*)
echo "__ARM_ARCH_7A__"
;;
@@ -176,8 +177,13 @@ build_framework() {
# Trap function. Cleans up the subtree used to build all targets contained in
# $TARGETS.
cleanup() {
+  local res=$?
cd "${ORIG_PWD}"
+ if [ $res -ne 0 ]; then
+ elog "build exited with error ($res)"
+ fi
+
if [ "${PRESERVE_BUILD_OUTPUT}" != "yes" ]; then
rm -rf "${BUILD_ROOT}"
fi
@@ -187,14 +193,21 @@ iosbuild_usage() {
cat << EOF
Usage: ${0##*/} [arguments]
--help: Display this message and exit.
+ --extra-configure-args <args>: Extra args to pass when configuring libvpx.
--jobs: Number of make jobs.
--preserve-build-output: Do not delete the build directory.
--show-build-output: Show output from each library build.
+ --targets <targets>: Override default target list. Defaults:
+ ${TARGETS}
--verbose: Output information about the environment and each stage of the
build.
EOF
}
+elog() {
+ echo "${0##*/} failed because: $@" 1>&2
+}
+
vlog() {
if [ "${VERBOSE}" = "yes" ]; then
echo "$@"
@@ -224,6 +237,10 @@ while [ -n "$1" ]; do
--show-build-output)
devnull=
;;
+ --targets)
+ TARGETS="$2"
+ shift
+ ;;
--verbose)
VERBOSE=yes
;;
@@ -239,6 +256,7 @@ if [ "${VERBOSE}" = "yes" ]; then
cat << EOF
BUILD_ROOT=${BUILD_ROOT}
DIST_DIR=${DIST_DIR}
+ CONFIGURE_ARGS=${CONFIGURE_ARGS}
EXTRA_CONFIGURE_ARGS=${EXTRA_CONFIGURE_ARGS}
FRAMEWORK_DIR=${FRAMEWORK_DIR}
HEADER_DIR=${HEADER_DIR}
@@ -252,3 +270,5 @@ EOF
fi
build_framework "${TARGETS}"
+echo "Successfully built '${FRAMEWORK_DIR}' for:"
+echo " ${TARGETS}"
diff --git a/chromium/third_party/libvpx/source/libvpx/build/make/obj_int_extract.c b/chromium/third_party/libvpx/source/libvpx/build/make/obj_int_extract.c
deleted file mode 100644
index 2e50f387fa5..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/build/make/obj_int_extract.c
+++ /dev/null
@@ -1,857 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include <stdarg.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include "vpx_config.h"
-#include "vpx/vpx_integer.h"
-
-typedef enum {
- OUTPUT_FMT_PLAIN,
- OUTPUT_FMT_RVDS,
- OUTPUT_FMT_GAS,
- OUTPUT_FMT_C_HEADER,
-} output_fmt_t;
-
-int log_msg(const char *fmt, ...) {
- int res;
- va_list ap;
- va_start(ap, fmt);
- res = vfprintf(stderr, fmt, ap);
- va_end(ap);
- return res;
-}
-
-#if defined(__GNUC__) && __GNUC__
-
-#if defined(FORCE_PARSE_ELF)
-
-#if defined(__MACH__)
-#undef __MACH__
-#endif
-
-#if !defined(__ELF__)
-#define __ELF__
-#endif
-#endif
-
-#if defined(__MACH__)
-
-#include <mach-o/loader.h>
-#include <mach-o/nlist.h>
-
-int print_macho_equ(output_fmt_t mode, uint8_t* name, int val) {
- switch (mode) {
- case OUTPUT_FMT_RVDS:
- printf("%-40s EQU %5d\n", name, val);
- return 0;
- case OUTPUT_FMT_GAS:
- printf(".set %-40s, %5d\n", name, val);
- return 0;
- case OUTPUT_FMT_C_HEADER:
- printf("#define %-40s %5d\n", name, val);
- return 0;
- default:
- log_msg("Unsupported mode: %d", mode);
- return 1;
- }
-}
-
-int parse_macho(uint8_t *base_buf, size_t sz, output_fmt_t mode) {
- int i, j;
- struct mach_header header;
- uint8_t *buf = base_buf;
- int base_data_section = 0;
- int bits = 0;
-
- /* We can read in mach_header for 32 and 64 bit architectures
- * because it's identical to mach_header_64 except for the last
- * element (uint32_t reserved), which we don't use. Then, when
- * we know which architecture we're looking at, increment buf
- * appropriately.
- */
- memcpy(&header, buf, sizeof(struct mach_header));
-
- if (header.magic == MH_MAGIC) {
- if (header.cputype == CPU_TYPE_ARM
- || header.cputype == CPU_TYPE_X86) {
- bits = 32;
- buf += sizeof(struct mach_header);
- } else {
- log_msg("Bad cputype for object file. Currently only tested for CPU_TYPE_[ARM|X86].\n");
- goto bail;
- }
- } else if (header.magic == MH_MAGIC_64) {
- if (header.cputype == CPU_TYPE_X86_64) {
- bits = 64;
- buf += sizeof(struct mach_header_64);
- } else {
- log_msg("Bad cputype for object file. Currently only tested for CPU_TYPE_X86_64.\n");
- goto bail;
- }
- } else {
- log_msg("Bad magic number for object file. 0x%x or 0x%x expected, 0x%x found.\n",
- MH_MAGIC, MH_MAGIC_64, header.magic);
- goto bail;
- }
-
- if (header.filetype != MH_OBJECT) {
- log_msg("Bad filetype for object file. Currently only tested for MH_OBJECT.\n");
- goto bail;
- }
-
- for (i = 0; i < header.ncmds; i++) {
- struct load_command lc;
-
- memcpy(&lc, buf, sizeof(struct load_command));
-
- if (lc.cmd == LC_SEGMENT) {
- uint8_t *seg_buf = buf;
- struct section s;
- struct segment_command seg_c;
-
- memcpy(&seg_c, seg_buf, sizeof(struct segment_command));
- seg_buf += sizeof(struct segment_command);
-
-      /* Although each section is given its own offset, nlist.n_value
- * references the offset of the first section. This isn't
- * apparent without debug information because the offset of the
- * data section is the same as the first section. However, with
- * debug sections mixed in, the offset of the debug section
- * increases but n_value still references the first section.
- */
- if (seg_c.nsects < 1) {
- log_msg("Not enough sections\n");
- goto bail;
- }
-
- memcpy(&s, seg_buf, sizeof(struct section));
- base_data_section = s.offset;
- } else if (lc.cmd == LC_SEGMENT_64) {
- uint8_t *seg_buf = buf;
- struct section_64 s;
- struct segment_command_64 seg_c;
-
- memcpy(&seg_c, seg_buf, sizeof(struct segment_command_64));
- seg_buf += sizeof(struct segment_command_64);
-
-      /* Explanation in LC_SEGMENT */
- if (seg_c.nsects < 1) {
- log_msg("Not enough sections\n");
- goto bail;
- }
-
- memcpy(&s, seg_buf, sizeof(struct section_64));
- base_data_section = s.offset;
- } else if (lc.cmd == LC_SYMTAB) {
- if (base_data_section != 0) {
- struct symtab_command sc;
- uint8_t *sym_buf = base_buf;
- uint8_t *str_buf = base_buf;
-
- memcpy(&sc, buf, sizeof(struct symtab_command));
-
- if (sc.cmdsize != sizeof(struct symtab_command)) {
- log_msg("Can't find symbol table!\n");
- goto bail;
- }
-
- sym_buf += sc.symoff;
- str_buf += sc.stroff;
-
- for (j = 0; j < sc.nsyms; j++) {
-          /* Location of string is calculated each time from the
- * start of the string buffer. On darwin the symbols
- * are prefixed by "_", so we bump the pointer by 1.
- * The target value is defined as an int in *_asm_*_offsets.c,
- * which is 4 bytes on all targets we currently use.
- */
- if (bits == 32) {
- struct nlist nl;
- int val;
-
- memcpy(&nl, sym_buf, sizeof(struct nlist));
- sym_buf += sizeof(struct nlist);
-
- memcpy(&val, base_buf + base_data_section + nl.n_value,
- sizeof(val));
- print_macho_equ(mode, str_buf + nl.n_un.n_strx + 1, val);
- } else { /* if (bits == 64) */
- struct nlist_64 nl;
- int val;
-
- memcpy(&nl, sym_buf, sizeof(struct nlist_64));
- sym_buf += sizeof(struct nlist_64);
-
- memcpy(&val, base_buf + base_data_section + nl.n_value,
- sizeof(val));
- print_macho_equ(mode, str_buf + nl.n_un.n_strx + 1, val);
- }
- }
- }
- }
-
- buf += lc.cmdsize;
- }
-
- return 0;
-bail:
- return 1;
-
-}
-
-#elif defined(__ELF__)
-#include "elf.h"
-
-#define COPY_STRUCT(dst, buf, ofst, sz) do {\
- if(ofst + sizeof((*(dst))) > sz) goto bail;\
- memcpy(dst, buf+ofst, sizeof((*(dst))));\
- } while(0)
-
-#define ENDIAN_ASSIGN(val, memb) do {\
- if(!elf->le_data) {log_msg("Big Endian data not supported yet!\n");goto bail;}\
- (val) = (memb);\
- } while(0)
-
-#define ENDIAN_ASSIGN_IN_PLACE(memb) do {\
- ENDIAN_ASSIGN(memb, memb);\
- } while(0)
-
-typedef struct {
- uint8_t *buf; /* Buffer containing ELF data */
- size_t sz; /* Buffer size */
- int le_data; /* Data is little-endian */
- unsigned char e_ident[EI_NIDENT]; /* Magic number and other info */
- int bits; /* 32 or 64 */
- Elf32_Ehdr hdr32;
- Elf64_Ehdr hdr64;
-} elf_obj_t;
-
-int parse_elf_header(elf_obj_t *elf) {
- int res;
- /* Verify ELF Magic numbers */
- COPY_STRUCT(&elf->e_ident, elf->buf, 0, elf->sz);
- res = elf->e_ident[EI_MAG0] == ELFMAG0;
- res &= elf->e_ident[EI_MAG1] == ELFMAG1;
- res &= elf->e_ident[EI_MAG2] == ELFMAG2;
- res &= elf->e_ident[EI_MAG3] == ELFMAG3;
- res &= elf->e_ident[EI_CLASS] == ELFCLASS32
- || elf->e_ident[EI_CLASS] == ELFCLASS64;
- res &= elf->e_ident[EI_DATA] == ELFDATA2LSB;
-
- if (!res) goto bail;
-
- elf->le_data = elf->e_ident[EI_DATA] == ELFDATA2LSB;
-
- /* Read in relevant values */
- if (elf->e_ident[EI_CLASS] == ELFCLASS32) {
- elf->bits = 32;
- COPY_STRUCT(&elf->hdr32, elf->buf, 0, elf->sz);
-
- ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_type);
- ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_machine);
- ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_version);
- ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_entry);
- ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_phoff);
- ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_shoff);
- ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_flags);
- ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_ehsize);
- ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_phentsize);
- ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_phnum);
- ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_shentsize);
- ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_shnum);
- ENDIAN_ASSIGN_IN_PLACE(elf->hdr32.e_shstrndx);
- } else { /* if (elf->e_ident[EI_CLASS] == ELFCLASS64) */
- elf->bits = 64;
- COPY_STRUCT(&elf->hdr64, elf->buf, 0, elf->sz);
-
- ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_type);
- ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_machine);
- ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_version);
- ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_entry);
- ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_phoff);
- ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_shoff);
- ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_flags);
- ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_ehsize);
- ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_phentsize);
- ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_phnum);
- ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_shentsize);
- ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_shnum);
- ENDIAN_ASSIGN_IN_PLACE(elf->hdr64.e_shstrndx);
- }
-
- return 0;
-bail:
- log_msg("Failed to parse ELF file header");
- return 1;
-}
-
-int parse_elf_section(elf_obj_t *elf, int idx, Elf32_Shdr *hdr32, Elf64_Shdr *hdr64) {
- if (hdr32) {
- if (idx >= elf->hdr32.e_shnum)
- goto bail;
-
- COPY_STRUCT(hdr32, elf->buf, elf->hdr32.e_shoff + idx * elf->hdr32.e_shentsize,
- elf->sz);
- ENDIAN_ASSIGN_IN_PLACE(hdr32->sh_name);
- ENDIAN_ASSIGN_IN_PLACE(hdr32->sh_type);
- ENDIAN_ASSIGN_IN_PLACE(hdr32->sh_flags);
- ENDIAN_ASSIGN_IN_PLACE(hdr32->sh_addr);
- ENDIAN_ASSIGN_IN_PLACE(hdr32->sh_offset);
- ENDIAN_ASSIGN_IN_PLACE(hdr32->sh_size);
- ENDIAN_ASSIGN_IN_PLACE(hdr32->sh_link);
- ENDIAN_ASSIGN_IN_PLACE(hdr32->sh_info);
- ENDIAN_ASSIGN_IN_PLACE(hdr32->sh_addralign);
- ENDIAN_ASSIGN_IN_PLACE(hdr32->sh_entsize);
- } else { /* if (hdr64) */
- if (idx >= elf->hdr64.e_shnum)
- goto bail;
-
- COPY_STRUCT(hdr64, elf->buf, elf->hdr64.e_shoff + idx * elf->hdr64.e_shentsize,
- elf->sz);
- ENDIAN_ASSIGN_IN_PLACE(hdr64->sh_name);
- ENDIAN_ASSIGN_IN_PLACE(hdr64->sh_type);
- ENDIAN_ASSIGN_IN_PLACE(hdr64->sh_flags);
- ENDIAN_ASSIGN_IN_PLACE(hdr64->sh_addr);
- ENDIAN_ASSIGN_IN_PLACE(hdr64->sh_offset);
- ENDIAN_ASSIGN_IN_PLACE(hdr64->sh_size);
- ENDIAN_ASSIGN_IN_PLACE(hdr64->sh_link);
- ENDIAN_ASSIGN_IN_PLACE(hdr64->sh_info);
- ENDIAN_ASSIGN_IN_PLACE(hdr64->sh_addralign);
- ENDIAN_ASSIGN_IN_PLACE(hdr64->sh_entsize);
- }
-
- return 0;
-bail:
- return 1;
-}
-
-const char *parse_elf_string_table(elf_obj_t *elf, int s_idx, int idx) {
- if (elf->bits == 32) {
- Elf32_Shdr shdr;
-
- if (parse_elf_section(elf, s_idx, &shdr, NULL)) {
- log_msg("Failed to parse ELF string table: section %d, index %d\n",
- s_idx, idx);
- return "";
- }
-
- return (char *)(elf->buf + shdr.sh_offset + idx);
- } else { /* if (elf->bits == 64) */
- Elf64_Shdr shdr;
-
- if (parse_elf_section(elf, s_idx, NULL, &shdr)) {
- log_msg("Failed to parse ELF string table: section %d, index %d\n",
- s_idx, idx);
- return "";
- }
-
- return (char *)(elf->buf + shdr.sh_offset + idx);
- }
-}
-
-int parse_elf_symbol(elf_obj_t *elf, unsigned int ofst, Elf32_Sym *sym32, Elf64_Sym *sym64) {
- if (sym32) {
- COPY_STRUCT(sym32, elf->buf, ofst, elf->sz);
- ENDIAN_ASSIGN_IN_PLACE(sym32->st_name);
- ENDIAN_ASSIGN_IN_PLACE(sym32->st_value);
- ENDIAN_ASSIGN_IN_PLACE(sym32->st_size);
- ENDIAN_ASSIGN_IN_PLACE(sym32->st_info);
- ENDIAN_ASSIGN_IN_PLACE(sym32->st_other);
- ENDIAN_ASSIGN_IN_PLACE(sym32->st_shndx);
- } else { /* if (sym64) */
- COPY_STRUCT(sym64, elf->buf, ofst, elf->sz);
- ENDIAN_ASSIGN_IN_PLACE(sym64->st_name);
- ENDIAN_ASSIGN_IN_PLACE(sym64->st_value);
- ENDIAN_ASSIGN_IN_PLACE(sym64->st_size);
- ENDIAN_ASSIGN_IN_PLACE(sym64->st_info);
- ENDIAN_ASSIGN_IN_PLACE(sym64->st_other);
- ENDIAN_ASSIGN_IN_PLACE(sym64->st_shndx);
- }
- return 0;
-bail:
- return 1;
-}
-
-int parse_elf(uint8_t *buf, size_t sz, output_fmt_t mode) {
- elf_obj_t elf;
- unsigned int ofst;
- int i;
- Elf32_Off strtab_off32;
- Elf64_Off strtab_off64; /* save String Table offset for later use */
-
- memset(&elf, 0, sizeof(elf));
- elf.buf = buf;
- elf.sz = sz;
-
- /* Parse Header */
- if (parse_elf_header(&elf))
- goto bail;
-
- if (elf.bits == 32) {
- Elf32_Shdr shdr;
- for (i = 0; i < elf.hdr32.e_shnum; i++) {
- parse_elf_section(&elf, i, &shdr, NULL);
-
- if (shdr.sh_type == SHT_STRTAB) {
- char strtsb_name[128];
-
- strcpy(strtsb_name, (char *)(elf.buf + shdr.sh_offset + shdr.sh_name));
-
- if (!(strcmp(strtsb_name, ".shstrtab"))) {
- /* log_msg("found section: %s\n", strtsb_name); */
- strtab_off32 = shdr.sh_offset;
- break;
- }
- }
- }
- } else { /* if (elf.bits == 64) */
- Elf64_Shdr shdr;
- for (i = 0; i < elf.hdr64.e_shnum; i++) {
- parse_elf_section(&elf, i, NULL, &shdr);
-
- if (shdr.sh_type == SHT_STRTAB) {
- char strtsb_name[128];
-
- strcpy(strtsb_name, (char *)(elf.buf + shdr.sh_offset + shdr.sh_name));
-
- if (!(strcmp(strtsb_name, ".shstrtab"))) {
- /* log_msg("found section: %s\n", strtsb_name); */
- strtab_off64 = shdr.sh_offset;
- break;
- }
- }
- }
- }
-
- /* Parse all Symbol Tables */
- if (elf.bits == 32) {
- Elf32_Shdr shdr;
- for (i = 0; i < elf.hdr32.e_shnum; i++) {
- parse_elf_section(&elf, i, &shdr, NULL);
-
- if (shdr.sh_type == SHT_SYMTAB) {
- for (ofst = shdr.sh_offset;
- ofst < shdr.sh_offset + shdr.sh_size;
- ofst += shdr.sh_entsize) {
- Elf32_Sym sym;
-
- parse_elf_symbol(&elf, ofst, &sym, NULL);
-
- /* For all OBJECTS (data objects), extract the value from the
- * proper data segment.
- */
- /* if (ELF32_ST_TYPE(sym.st_info) == STT_OBJECT && sym.st_name)
- log_msg("found data object %s\n",
- parse_elf_string_table(&elf,
- shdr.sh_link,
- sym.st_name));
- */
-
- if (ELF32_ST_TYPE(sym.st_info) == STT_OBJECT
- && sym.st_size == 4) {
- Elf32_Shdr dhdr;
- int val = 0;
- char section_name[128];
-
- parse_elf_section(&elf, sym.st_shndx, &dhdr, NULL);
-
-            /* For explanation - refer to _MSC_VER version of code */
- strcpy(section_name, (char *)(elf.buf + strtab_off32 + dhdr.sh_name));
- /* log_msg("Section_name: %s, Section_type: %d\n", section_name, dhdr.sh_type); */
-
- if (strcmp(section_name, ".bss")) {
- if (sizeof(val) != sym.st_size) {
- /* The target value is declared as an int in
- * *_asm_*_offsets.c, which is 4 bytes on all
- * targets we currently use. Complain loudly if
- * this is not true.
- */
- log_msg("Symbol size is wrong\n");
- goto bail;
- }
-
- memcpy(&val,
- elf.buf + dhdr.sh_offset + sym.st_value,
- sym.st_size);
- }
-
- if (!elf.le_data) {
- log_msg("Big Endian data not supported yet!\n");
- goto bail;
- }
-
- switch (mode) {
- case OUTPUT_FMT_RVDS:
- printf("%-40s EQU %5d\n",
- parse_elf_string_table(&elf,
- shdr.sh_link,
- sym.st_name),
- val);
- break;
- case OUTPUT_FMT_GAS:
- printf(".equ %-40s, %5d\n",
- parse_elf_string_table(&elf,
- shdr.sh_link,
- sym.st_name),
- val);
- break;
- case OUTPUT_FMT_C_HEADER:
- printf("#define %-40s %5d\n",
- parse_elf_string_table(&elf,
- shdr.sh_link,
- sym.st_name),
- val);
- break;
- default:
- printf("%s = %d\n",
- parse_elf_string_table(&elf,
- shdr.sh_link,
- sym.st_name),
- val);
- }
- }
- }
- }
- }
- } else { /* if (elf.bits == 64) */
- Elf64_Shdr shdr;
- for (i = 0; i < elf.hdr64.e_shnum; i++) {
- parse_elf_section(&elf, i, NULL, &shdr);
-
- if (shdr.sh_type == SHT_SYMTAB) {
- for (ofst = shdr.sh_offset;
- ofst < shdr.sh_offset + shdr.sh_size;
- ofst += shdr.sh_entsize) {
- Elf64_Sym sym;
-
- parse_elf_symbol(&elf, ofst, NULL, &sym);
-
- /* For all OBJECTS (data objects), extract the value from the
- * proper data segment.
- */
- /* if (ELF64_ST_TYPE(sym.st_info) == STT_OBJECT && sym.st_name)
- log_msg("found data object %s\n",
- parse_elf_string_table(&elf,
- shdr.sh_link,
- sym.st_name));
- */
-
- if (ELF64_ST_TYPE(sym.st_info) == STT_OBJECT
- && sym.st_size == 4) {
- Elf64_Shdr dhdr;
- int val = 0;
- char section_name[128];
-
- parse_elf_section(&elf, sym.st_shndx, NULL, &dhdr);
-
-            /* For explanation - refer to _MSC_VER version of code */
- strcpy(section_name, (char *)(elf.buf + strtab_off64 + dhdr.sh_name));
- /* log_msg("Section_name: %s, Section_type: %d\n", section_name, dhdr.sh_type); */
-
- if ((strcmp(section_name, ".bss"))) {
- if (sizeof(val) != sym.st_size) {
- /* The target value is declared as an int in
- * *_asm_*_offsets.c, which is 4 bytes on all
- * targets we currently use. Complain loudly if
- * this is not true.
- */
- log_msg("Symbol size is wrong\n");
- goto bail;
- }
-
- memcpy(&val,
- elf.buf + dhdr.sh_offset + sym.st_value,
- sym.st_size);
- }
-
- if (!elf.le_data) {
- log_msg("Big Endian data not supported yet!\n");
- goto bail;
- }
-
- switch (mode) {
- case OUTPUT_FMT_RVDS:
- printf("%-40s EQU %5d\n",
- parse_elf_string_table(&elf,
- shdr.sh_link,
- sym.st_name),
- val);
- break;
- case OUTPUT_FMT_GAS:
- printf(".equ %-40s, %5d\n",
- parse_elf_string_table(&elf,
- shdr.sh_link,
- sym.st_name),
- val);
- break;
- default:
- printf("%s = %d\n",
- parse_elf_string_table(&elf,
- shdr.sh_link,
- sym.st_name),
- val);
- }
- }
- }
- }
- }
- }
-
- if (mode == OUTPUT_FMT_RVDS)
- printf(" END\n");
-
- return 0;
-bail:
- log_msg("Parse error: File does not appear to be valid ELF32 or ELF64\n");
- return 1;
-}
-
-#endif
-#endif /* defined(__GNUC__) && __GNUC__ */
-
-
-#if defined(_MSC_VER) || defined(__MINGW32__) || defined(__CYGWIN__)
-/* See "Microsoft Portable Executable and Common Object File Format Specification"
- for reference.
-*/
-#define get_le32(x) ((*(x)) | (*(x+1)) << 8 |(*(x+2)) << 16 | (*(x+3)) << 24 )
-#define get_le16(x) ((*(x)) | (*(x+1)) << 8)
-
-int parse_coff(uint8_t *buf, size_t sz) {
- unsigned int nsections, symtab_ptr, symtab_sz, strtab_ptr;
- unsigned int sectionrawdata_ptr;
- unsigned int i;
- uint8_t *ptr;
- uint32_t symoffset;
-
- char **sectionlist; // this array holds all section names in their correct order.
- // it is used to check if the symbol is in .bss or .rdata section.
-
- nsections = get_le16(buf + 2);
- symtab_ptr = get_le32(buf + 8);
- symtab_sz = get_le32(buf + 12);
- strtab_ptr = symtab_ptr + symtab_sz * 18;
-
- if (nsections > 96) {
- log_msg("Too many sections\n");
- return 1;
- }
-
- sectionlist = malloc(nsections * sizeof(sectionlist));
-
- if (sectionlist == NULL) {
- log_msg("Allocating first level of section list failed\n");
- return 1;
- }
-
- // log_msg("COFF: Found %u symbols in %u sections.\n", symtab_sz, nsections);
-
- /*
- The size of optional header is always zero for an obj file. So, the section header
- follows the file header immediately.
- */
-
- ptr = buf + 20; // section header
-
- for (i = 0; i < nsections; i++) {
- char sectionname[9] = {0, 0, 0, 0, 0, 0, 0, 0, 0};
- strncpy(sectionname, ptr, 8);
- // log_msg("COFF: Parsing section %s\n",sectionname);
-
- sectionlist[i] = malloc(strlen(sectionname) + 1);
-
- if (sectionlist[i] == NULL) {
- log_msg("Allocating storage for %s failed\n", sectionname);
- goto bail;
- }
- strcpy(sectionlist[i], sectionname);
-
- // check if it's .rdata and is not a COMDAT section.
- if (!strcmp(sectionname, ".rdata") &&
- (get_le32(ptr + 36) & 0x1000) == 0) {
- sectionrawdata_ptr = get_le32(ptr + 20);
- }
-
- ptr += 40;
- }
-
- // log_msg("COFF: Symbol table at offset %u\n", symtab_ptr);
-  // log_msg("COFF: raw data pointer offset for section .rdata is %u\n", sectionrawdata_ptr);
-
- /* The compiler puts the data with non-zero offset in .rdata section, but puts the data with
-     zero offset in .bss section. So, if the data is in the .bss section, set offset=0.
- Note from Wiki: In an object module compiled from C, the bss section contains
- the local variables (but not functions) that were declared with the static keyword,
- except for those with non-zero initial values. (In C, static variables are initialized
- to zero by default.) It also contains the non-local (both extern and static) variables
- that are also initialized to zero (either explicitly or by default).
- */
- // move to symbol table
- /* COFF symbol table:
- offset field
- 0 Name(*)
- 8 Value
- 12 SectionNumber
- 14 Type
- 16 StorageClass
- 17 NumberOfAuxSymbols
- */
- ptr = buf + symtab_ptr;
-
- for (i = 0; i < symtab_sz; i++) {
- int16_t section = get_le16(ptr + 12); // section number
-
- if (section > 0 && ptr[16] == 2) {
- // if(section > 0 && ptr[16] == 3 && get_le32(ptr+8)) {
-
- if (get_le32(ptr)) {
- char name[9] = {0, 0, 0, 0, 0, 0, 0, 0, 0};
- strncpy(name, ptr, 8);
- // log_msg("COFF: Parsing symbol %s\n",name);
- /* The 64bit Windows compiler doesn't prefix with an _.
- * Check what's there, and bump if necessary
- */
- if (name[0] == '_')
- printf("%-40s EQU ", name + 1);
- else
- printf("%-40s EQU ", name);
- } else {
- // log_msg("COFF: Parsing symbol %s\n",
- // buf + strtab_ptr + get_le32(ptr+4));
- if ((buf + strtab_ptr + get_le32(ptr + 4))[0] == '_')
- printf("%-40s EQU ",
- buf + strtab_ptr + get_le32(ptr + 4) + 1);
- else
- printf("%-40s EQU ", buf + strtab_ptr + get_le32(ptr + 4));
- }
-
- if (!(strcmp(sectionlist[section - 1], ".bss"))) {
- symoffset = 0;
- } else {
- symoffset = get_le32(buf + sectionrawdata_ptr + get_le32(ptr + 8));
- }
-
- // log_msg(" Section: %d\n",section);
- // log_msg(" Class: %d\n",ptr[16]);
- // log_msg(" Address: %u\n",get_le32(ptr+8));
- // log_msg(" Offset: %u\n", symoffset);
-
- printf("%5d\n", symoffset);
- }
-
- ptr += 18;
- }
-
- printf(" END\n");
-
- for (i = 0; i < nsections; i++) {
- free(sectionlist[i]);
- }
-
- free(sectionlist);
-
- return 0;
-bail:
-
- for (i = 0; i < nsections; i++) {
- free(sectionlist[i]);
- }
-
- free(sectionlist);
-
- return 1;
-}
-#endif /* defined(_MSC_VER) || defined(__MINGW32__) || defined(__CYGWIN__) */
-
-int main(int argc, char **argv) {
- output_fmt_t mode = OUTPUT_FMT_PLAIN;
- const char *f;
- uint8_t *file_buf;
- int res;
- FILE *fp;
- long int file_size;
-
- if (argc < 2 || argc > 3) {
- fprintf(stderr, "Usage: %s [output format] <obj file>\n\n", argv[0]);
- fprintf(stderr, " <obj file>\tobject file to parse\n");
- fprintf(stderr, "Output Formats:\n");
- fprintf(stderr, " gas - compatible with GNU assembler\n");
- fprintf(stderr, " rvds - compatible with armasm\n");
- fprintf(stderr, " cheader - c/c++ header file\n");
- goto bail;
- }
-
- f = argv[2];
-
- if (!strcmp(argv[1], "rvds"))
- mode = OUTPUT_FMT_RVDS;
- else if (!strcmp(argv[1], "gas"))
- mode = OUTPUT_FMT_GAS;
- else if (!strcmp(argv[1], "cheader"))
- mode = OUTPUT_FMT_C_HEADER;
- else
- f = argv[1];
-
- fp = fopen(f, "rb");
-
- if (!fp) {
- perror("Unable to open file");
- goto bail;
- }
-
- if (fseek(fp, 0, SEEK_END)) {
- perror("stat");
- goto bail;
- }
-
- file_size = ftell(fp);
- file_buf = malloc(file_size);
-
- if (!file_buf) {
- perror("malloc");
- goto bail;
- }
-
- rewind(fp);
-
- if (fread(file_buf, sizeof(char), file_size, fp) != file_size) {
- perror("read");
- goto bail;
- }
-
- if (fclose(fp)) {
- perror("close");
- goto bail;
- }
-
-#if defined(__GNUC__) && __GNUC__
-#if defined(__MACH__)
- res = parse_macho(file_buf, file_size, mode);
-#elif defined(__ELF__)
- res = parse_elf(file_buf, file_size, mode);
-#endif
-#endif
-#if defined(_MSC_VER) || defined(__MINGW32__) || defined(__CYGWIN__)
- res = parse_coff(file_buf, file_size);
-#endif
-
- free(file_buf);
-
- if (!res)
- return EXIT_SUCCESS;
-
-bail:
- return EXIT_FAILURE;
-}
diff --git a/chromium/third_party/libvpx/source/libvpx/build/make/rtcd.pl b/chromium/third_party/libvpx/source/libvpx/build/make/rtcd.pl
index 0872414cbb5..6753ee776a2 100755
--- a/chromium/third_party/libvpx/source/libvpx/build/make/rtcd.pl
+++ b/chromium/third_party/libvpx/source/libvpx/build/make/rtcd.pl
@@ -376,17 +376,18 @@ if ($opts{arch} eq 'x86') {
@ALL_ARCHS = filter("$opts{arch}", qw/dspr2/);
last;
}
+ if (/HAVE_MSA=yes/) {
+ @ALL_ARCHS = filter("$opts{arch}", qw/msa/);
+ last;
+ }
}
close CONFIG_FILE;
mips;
-} elsif ($opts{arch} eq 'armv5te') {
- @ALL_ARCHS = filter(qw/edsp/);
- arm;
} elsif ($opts{arch} eq 'armv6') {
- @ALL_ARCHS = filter(qw/edsp media/);
+ @ALL_ARCHS = filter(qw/media/);
arm;
-} elsif ($opts{arch} eq 'armv7') {
- @ALL_ARCHS = filter(qw/edsp media neon_asm neon/);
+} elsif ($opts{arch} =~ /armv7\w?/) {
+ @ALL_ARCHS = filter(qw/media neon_asm neon/);
@REQUIRES = filter(keys %required ? keys %required : qw/media/);
&require(@REQUIRES);
arm;
diff --git a/chromium/third_party/libvpx/source/libvpx/build/x86-msvs/obj_int_extract.bat b/chromium/third_party/libvpx/source/libvpx/build/x86-msvs/obj_int_extract.bat
deleted file mode 100644
index dfa3b908390..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/build/x86-msvs/obj_int_extract.bat
+++ /dev/null
@@ -1,15 +0,0 @@
-REM Copyright (c) 2011 The WebM project authors. All Rights Reserved.
-REM
-REM Use of this source code is governed by a BSD-style license
-REM that can be found in the LICENSE file in the root of the source
-REM tree. An additional intellectual property rights grant can be found
-REM in the file PATENTS. All contributing project authors may
-REM be found in the AUTHORS file in the root of the source tree.
-echo on
-
-REM Arguments:
-REM %1 - Relative path to the directory containing the vp8 source directory.
-REM %2 - Path to obj_int_extract.exe.
-cl /I. /I%1 /nologo /c "%~1/vp8/encoder/vp8_asm_enc_offsets.c"
-%2\obj_int_extract.exe rvds "vp8_asm_enc_offsets.obj" > "vp8_asm_enc_offsets.asm"
-
diff --git a/chromium/third_party/libvpx/source/libvpx/configure b/chromium/third_party/libvpx/source/libvpx/configure
index 3ed976c83bd..98542855a3e 100755
--- a/chromium/third_party/libvpx/source/libvpx/configure
+++ b/chromium/third_party/libvpx/source/libvpx/configure
@@ -26,19 +26,18 @@ Advanced options:
${toggle_unit_tests} unit tests
${toggle_decode_perf_tests} build decoder perf tests with unit tests
${toggle_encode_perf_tests} build encoder perf tests with unit tests
+ --cpu=CPU tune for the specified CPU (ARM: cortex-a8, X86: sse3)
--libc=PATH path to alternate libc
--size-limit=WxH max size to allow in the decoder
--as={yasm|nasm|auto} use specified assembler [auto, yasm preferred]
--sdk-path=PATH path to root of sdk (android builds only)
- ${toggle_fast_unaligned} don't use unaligned accesses, even when
- supported by hardware [auto]
${toggle_codec_srcs} in/exclude codec library source code
${toggle_debug_libs} in/exclude debug version of libraries
${toggle_static_msvcrt} use static MSVCRT (VS builds only)
+ ${toggle_vp9_highbitdepth} use VP9 high bit depth (10/12) profiles
${toggle_vp8} VP8 codec support
${toggle_vp9} VP9 codec support
${toggle_internal_stats} output of encoder internal stats for debug, if supported (encoders)
- ${toggle_mem_tracker} track memory usage
${toggle_postproc} postprocessing
${toggle_vp9_postproc} vp9 specific postprocessing
${toggle_multithread} multithreaded encoding and decoding
@@ -56,6 +55,8 @@ Advanced options:
${toggle_postproc_visualizer} macro block / block level visualizers
${toggle_multi_res_encoding} enable multiple-resolution encoding
${toggle_temporal_denoising} enable temporal denoising and disable the spatial denoiser
+ ${toggle_vp9_temporal_denoising}
+ enable vp9 temporal denoising
${toggle_webm_io} enable input from and output to WebM container
${toggle_libyuv} enable libyuv
@@ -93,10 +94,6 @@ EOF
# all_platforms is a list of all supported target platforms. Maintain
# alphabetically by architecture, generic-gnu last.
-all_platforms="${all_platforms} armv5te-android-gcc"
-all_platforms="${all_platforms} armv5te-linux-rvct"
-all_platforms="${all_platforms} armv5te-linux-gcc"
-all_platforms="${all_platforms} armv5te-none-rvct"
all_platforms="${all_platforms} armv6-darwin-gcc"
all_platforms="${all_platforms} armv6-linux-rvct"
all_platforms="${all_platforms} armv6-linux-gcc"
@@ -112,12 +109,6 @@ all_platforms="${all_platforms} armv7-win32-vs12"
all_platforms="${all_platforms} armv7s-darwin-gcc"
all_platforms="${all_platforms} mips32-linux-gcc"
all_platforms="${all_platforms} mips64-linux-gcc"
-all_platforms="${all_platforms} ppc32-darwin8-gcc"
-all_platforms="${all_platforms} ppc32-darwin9-gcc"
-all_platforms="${all_platforms} ppc32-linux-gcc"
-all_platforms="${all_platforms} ppc64-darwin8-gcc"
-all_platforms="${all_platforms} ppc64-darwin9-gcc"
-all_platforms="${all_platforms} ppc64-linux-gcc"
all_platforms="${all_platforms} sparc-solaris-gcc"
all_platforms="${all_platforms} x86-android-gcc"
all_platforms="${all_platforms} x86-darwin8-gcc"
@@ -128,6 +119,7 @@ all_platforms="${all_platforms} x86-darwin10-gcc"
all_platforms="${all_platforms} x86-darwin11-gcc"
all_platforms="${all_platforms} x86-darwin12-gcc"
all_platforms="${all_platforms} x86-darwin13-gcc"
+all_platforms="${all_platforms} x86-darwin14-gcc"
all_platforms="${all_platforms} x86-iphonesimulator-gcc"
all_platforms="${all_platforms} x86-linux-gcc"
all_platforms="${all_platforms} x86-linux-icc"
@@ -145,6 +137,7 @@ all_platforms="${all_platforms} x86_64-darwin10-gcc"
all_platforms="${all_platforms} x86_64-darwin11-gcc"
all_platforms="${all_platforms} x86_64-darwin12-gcc"
all_platforms="${all_platforms} x86_64-darwin13-gcc"
+all_platforms="${all_platforms} x86_64-darwin14-gcc"
all_platforms="${all_platforms} x86_64-iphonesimulator-gcc"
all_platforms="${all_platforms} x86_64-linux-gcc"
all_platforms="${all_platforms} x86_64-linux-icc"
@@ -161,6 +154,7 @@ all_platforms="${all_platforms} universal-darwin10-gcc"
all_platforms="${all_platforms} universal-darwin11-gcc"
all_platforms="${all_platforms} universal-darwin12-gcc"
all_platforms="${all_platforms} universal-darwin13-gcc"
+all_platforms="${all_platforms} universal-darwin14-gcc"
all_platforms="${all_platforms} generic-gnu"
# all_targets is a list of all targets that can be configured
@@ -206,7 +200,7 @@ enable_feature install_libs
enable_feature static
enable_feature optimizations
-enable_feature fast_unaligned #allow unaligned accesses, if supported by hw
+enable_feature dependency_tracking
enable_feature spatial_resampling
enable_feature multithread
enable_feature os_support
@@ -243,8 +237,6 @@ ARCH_LIST="
mips
x86
x86_64
- ppc32
- ppc64
"
ARCH_EXT_LIST="
edsp
@@ -254,7 +246,7 @@ ARCH_EXT_LIST="
mips32
dspr2
-
+ msa
mips64
mmx
@@ -265,8 +257,6 @@ ARCH_EXT_LIST="
sse4_1
avx
avx2
-
- altivec
"
HAVE_LIST="
${ARCH_EXT_LIST}
@@ -279,11 +269,11 @@ HAVE_LIST="
"
EXPERIMENT_LIST="
spatial_svc
- vp9_temporal_denoising
fp_mb_stats
emulate_hardware
"
CONFIG_LIST="
+ dependency_tracking
external_build
install_docs
install_bins
@@ -301,10 +291,6 @@ CONFIG_LIST="
codec_srcs
debug_libs
- fast_unaligned
- mem_manager
- mem_tracker
- mem_checks
dequant_tokens
dc_recon
@@ -334,6 +320,7 @@ CONFIG_LIST="
encode_perf_tests
multi_res_encoding
temporal_denoising
+ vp9_temporal_denoising
coefficient_range_checking
vp9_highbitdepth
experimental
@@ -341,6 +328,7 @@ CONFIG_LIST="
${EXPERIMENT_LIST}
"
CMDLINE_SELECT="
+ dependency_tracking
external_build
extra_warnings
werror
@@ -364,7 +352,6 @@ CMDLINE_SELECT="
libc
as
size_limit
- fast_unaligned
codec_srcs
debug_libs
@@ -377,7 +364,6 @@ CMDLINE_SELECT="
${CODECS}
${CODEC_FAMILIES}
static_msvcrt
- mem_tracker
spatial_resampling
realtime_only
onthefly_bitpacking
@@ -393,6 +379,7 @@ CMDLINE_SELECT="
encode_perf_tests
multi_res_encoding
temporal_denoising
+ vp9_temporal_denoising
coefficient_range_checking
vp9_highbitdepth
experimental
@@ -451,8 +438,6 @@ process_targets() {
enabled child || write_common_config_banner
enabled universal || write_common_target_config_h ${BUILD_PFX}vpx_config.h
- # TODO: add host tools target (obj_int_extract, etc)
-
# For fat binaries, call configure recursively to configure for each
# binary architecture to be included.
if enabled universal; then
@@ -616,12 +601,6 @@ process_toolchain() {
universal-darwin*)
darwin_ver=${tgt_os##darwin}
- # Snow Leopard (10.6/darwin10) dropped support for PPC
- # Include PPC support for all prior versions
- if [ $darwin_ver -lt 10 ]; then
- fat_bin_archs="$fat_bin_archs ppc32-${tgt_os}-gcc"
- fi
-
# Tiger (10.4/darwin8) brought support for x86
if [ $darwin_ver -ge 8 ]; then
fat_bin_archs="$fat_bin_archs x86-${tgt_os}-${tgt_cc}"
@@ -722,7 +701,7 @@ process_toolchain() {
esac
# Other toolchain specific defaults
- case $toolchain in x86*|ppc*|universal*) soft_enable postproc;; esac
+ case $toolchain in x86*|universal*) soft_enable postproc;; esac
if enabled postproc_visualizer; then
enabled postproc || die "postproc_visualizer requires postproc to be enabled"
diff --git a/chromium/third_party/libvpx/source/libvpx/examples.mk b/chromium/third_party/libvpx/source/libvpx/examples.mk
index fd67a44dfe9..4ff1de4eeae 100644
--- a/chromium/third_party/libvpx/source/libvpx/examples.mk
+++ b/chromium/third_party/libvpx/source/libvpx/examples.mk
@@ -338,6 +338,7 @@ $(foreach proj,$(call enabled,PROJECTS),\
#
%.dox: %.c
@echo " [DOXY] $@"
+ @mkdir -p $(dir $@)
@echo "/*!\page example_$(@F:.dox=) $(@F:.dox=)" > $@
@echo " \includelineno $(<F)" >> $@
@echo "*/" >> $@
diff --git a/chromium/third_party/libvpx/source/libvpx/examples/decode_to_md5.c b/chromium/third_party/libvpx/source/libvpx/examples/decode_to_md5.c
index fbc0f4a6f19..a3843bed336 100644
--- a/chromium/third_party/libvpx/source/libvpx/examples/decode_to_md5.c
+++ b/chromium/third_party/libvpx/source/libvpx/examples/decode_to_md5.c
@@ -36,9 +36,9 @@
#include "vpx/vp8dx.h"
#include "vpx/vpx_decoder.h"
-#include "./md5_utils.h"
-#include "./tools_common.h"
-#include "./video_reader.h"
+#include "../md5_utils.h"
+#include "../tools_common.h"
+#include "../video_reader.h"
#include "./vpx_config.h"
static void get_image_md5(const vpx_image_t *img, unsigned char digest[16]) {
diff --git a/chromium/third_party/libvpx/source/libvpx/examples/decode_with_drops.c b/chromium/third_party/libvpx/source/libvpx/examples/decode_with_drops.c
index 9423e38ffac..36f7d80e127 100644
--- a/chromium/third_party/libvpx/source/libvpx/examples/decode_with_drops.c
+++ b/chromium/third_party/libvpx/source/libvpx/examples/decode_with_drops.c
@@ -59,8 +59,8 @@
#include "vpx/vp8dx.h"
#include "vpx/vpx_decoder.h"
-#include "./tools_common.h"
-#include "./video_reader.h"
+#include "../tools_common.h"
+#include "../video_reader.h"
#include "./vpx_config.h"
static const char *exec_name;
diff --git a/chromium/third_party/libvpx/source/libvpx/examples/postproc.c b/chromium/third_party/libvpx/source/libvpx/examples/postproc.c
index c74347c4c71..e34426a6194 100644
--- a/chromium/third_party/libvpx/source/libvpx/examples/postproc.c
+++ b/chromium/third_party/libvpx/source/libvpx/examples/postproc.c
@@ -46,8 +46,8 @@
#include "vpx/vp8dx.h"
#include "vpx/vpx_decoder.h"
-#include "./tools_common.h"
-#include "./video_reader.h"
+#include "../tools_common.h"
+#include "../video_reader.h"
#include "./vpx_config.h"
static const char *exec_name;
diff --git a/chromium/third_party/libvpx/source/libvpx/examples/resize_util.c b/chromium/third_party/libvpx/source/libvpx/examples/resize_util.c
index b068f552405..f8c35255fa2 100644
--- a/chromium/third_party/libvpx/source/libvpx/examples/resize_util.c
+++ b/chromium/third_party/libvpx/source/libvpx/examples/resize_util.c
@@ -15,15 +15,22 @@
#include <stdlib.h>
#include <string.h>
-#include "./vp9/encoder/vp9_resize.h"
+#include "../vp9/encoder/vp9_resize.h"
-static void usage(char *progname) {
+static const char *exec_name = NULL;
+
+static void usage() {
printf("Usage:\n");
printf("%s <input_yuv> <width>x<height> <target_width>x<target_height> ",
- progname);
+ exec_name);
printf("<output_yuv> [<frames>]\n");
}
+void usage_exit() {
+ usage();
+ exit(EXIT_FAILURE);
+}
+
static int parse_dim(char *v, int *width, int *height) {
char *x = strchr(v, 'x');
if (x == NULL)
@@ -47,9 +54,11 @@ int main(int argc, char *argv[]) {
int f, frames;
int width, height, target_width, target_height;
+ exec_name = argv[0];
+
if (argc < 5) {
printf("Incorrect parameters:\n");
- usage(argv[0]);
+ usage();
return 1;
}
@@ -57,25 +66,25 @@ int main(int argc, char *argv[]) {
fout = argv[4];
if (!parse_dim(argv[2], &width, &height)) {
printf("Incorrect parameters: %s\n", argv[2]);
- usage(argv[0]);
+ usage();
return 1;
}
if (!parse_dim(argv[3], &target_width, &target_height)) {
printf("Incorrect parameters: %s\n", argv[3]);
- usage(argv[0]);
+ usage();
return 1;
}
fpin = fopen(fin, "rb");
if (fpin == NULL) {
printf("Can't open file %s to read\n", fin);
- usage(argv[0]);
+ usage();
return 1;
}
fpout = fopen(fout, "wb");
if (fpout == NULL) {
printf("Can't open file %s to write\n", fout);
- usage(argv[0]);
+ usage();
return 1;
}
if (argc >= 6)
diff --git a/chromium/third_party/libvpx/source/libvpx/examples/set_maps.c b/chromium/third_party/libvpx/source/libvpx/examples/set_maps.c
index 851adc42ea1..5555baac22e 100644
--- a/chromium/third_party/libvpx/source/libvpx/examples/set_maps.c
+++ b/chromium/third_party/libvpx/source/libvpx/examples/set_maps.c
@@ -50,8 +50,8 @@
#include "vpx/vp8cx.h"
#include "vpx/vpx_encoder.h"
-#include "./tools_common.h"
-#include "./video_writer.h"
+#include "../tools_common.h"
+#include "../video_writer.h"
static const char *exec_name;
diff --git a/chromium/third_party/libvpx/source/libvpx/examples/simple_decoder.c b/chromium/third_party/libvpx/source/libvpx/examples/simple_decoder.c
index c58b014f76a..08a21668542 100644
--- a/chromium/third_party/libvpx/source/libvpx/examples/simple_decoder.c
+++ b/chromium/third_party/libvpx/source/libvpx/examples/simple_decoder.c
@@ -82,8 +82,8 @@
#include "vpx/vpx_decoder.h"
-#include "./tools_common.h"
-#include "./video_reader.h"
+#include "../tools_common.h"
+#include "../video_reader.h"
#include "./vpx_config.h"
static const char *exec_name;
diff --git a/chromium/third_party/libvpx/source/libvpx/examples/simple_encoder.c b/chromium/third_party/libvpx/source/libvpx/examples/simple_encoder.c
index f20c246daf2..e805c258747 100644
--- a/chromium/third_party/libvpx/source/libvpx/examples/simple_encoder.c
+++ b/chromium/third_party/libvpx/source/libvpx/examples/simple_encoder.c
@@ -101,8 +101,8 @@
#include "vpx/vpx_encoder.h"
-#include "./tools_common.h"
-#include "./video_writer.h"
+#include "../tools_common.h"
+#include "../video_writer.h"
static const char *exec_name;
diff --git a/chromium/third_party/libvpx/source/libvpx/examples/twopass_encoder.c b/chromium/third_party/libvpx/source/libvpx/examples/twopass_encoder.c
index 653ae948224..0ec83ddccdf 100644
--- a/chromium/third_party/libvpx/source/libvpx/examples/twopass_encoder.c
+++ b/chromium/third_party/libvpx/source/libvpx/examples/twopass_encoder.c
@@ -53,8 +53,8 @@
#include "vpx/vpx_encoder.h"
-#include "./tools_common.h"
-#include "./video_writer.h"
+#include "../tools_common.h"
+#include "../video_writer.h"
static const char *exec_name;
diff --git a/chromium/third_party/libvpx/source/libvpx/examples/vp8_multi_resolution_encoder.c b/chromium/third_party/libvpx/source/libvpx/examples/vp8_multi_resolution_encoder.c
index 9f50dc7cf52..e623567b8fe 100644
--- a/chromium/third_party/libvpx/source/libvpx/examples/vp8_multi_resolution_encoder.c
+++ b/chromium/third_party/libvpx/source/libvpx/examples/vp8_multi_resolution_encoder.c
@@ -8,292 +8,730 @@
* be found in the AUTHORS file in the root of the source tree.
*/
+/*
+ * This is an example demonstrating multi-resolution encoding in VP8.
+ * High-resolution input video is down-sampled to lower-resolutions. The
+ * encoder then encodes the video and outputs multiple bitstreams with
+ * different resolutions.
+ *
+ * This test also allows for setting temporal layers for each spatial layer.
+ * A different number of temporal layers may be used per spatial stream.
+ * Currently up to 3 temporal layers per spatial stream (encoder) are supported
+ * in this test.
+ */
-// This is an example demonstrating multi-resolution encoding in VP8.
-// High-resolution input video is down-sampled to lower-resolutions. The
-// encoder then encodes the video and outputs multiple bitstreams with
-// different resolutions.
-//
-// Configure with --enable-multi-res-encoding flag to enable this example.
+#include "./vpx_config.h"
#include <stdio.h>
#include <stdlib.h>
+#include <stdarg.h>
#include <string.h>
+#include <math.h>
+#include <assert.h>
+#include <sys/time.h>
+#if USE_POSIX_MMAP
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <fcntl.h>
+#include <unistd.h>
+#endif
+#include "vpx_ports/vpx_timer.h"
+#define VPX_CODEC_DISABLE_COMPAT 1
+#include "vpx/vpx_encoder.h"
+#include "vpx/vp8cx.h"
+#include "vpx_ports/mem_ops.h"
+#include "./tools_common.h"
+#define interface (vpx_codec_vp8_cx())
+#define fourcc 0x30385056
+
+void usage_exit() {
+ exit(EXIT_FAILURE);
+}
+
+/*
+ * The input video frame is downsampled several times to generate a multi-level
+ * hierarchical structure. NUM_ENCODERS is defined as the number of encoding
+ * levels required. For example, if the size of input video is 1280x720,
+ * NUM_ENCODERS is 3, and down-sampling factor is 2, the encoder outputs 3
+ * bitstreams with resolution of 1280x720(level 0), 640x360(level 1), and
+ * 320x180(level 2) respectively.
+ */
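A quick check of the arithmetic in the comment above, as a sketch assuming a fixed down-sampling factor of 2 per level (the example itself drives this from a table of rational down-sampling factors):

    #include <stdio.h>

    int main(void) {
      int w = 1280, h = 720, level;
      for (level = 0; level < 3; ++level) {
        /* Prints 1280x720, 640x360, 320x180 for levels 0..2. */
        printf("level %d: %dx%d\n", level, w, h);
        w /= 2;
        h /= 2;
      }
      return 0;
    }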
+
+/* Number of encoders (spatial resolutions) used in this test. */
+#define NUM_ENCODERS 3
+
+/* Maximum number of temporal layers allowed for this test. */
+#define MAX_NUM_TEMPORAL_LAYERS 3
+/* This example uses the scaler function in libyuv. */
#include "third_party/libyuv/include/libyuv/basic_types.h"
#include "third_party/libyuv/include/libyuv/scale.h"
#include "third_party/libyuv/include/libyuv/cpu_id.h"
-#include "vpx/vpx_encoder.h"
-#include "vpx/vp8cx.h"
+int (*read_frame_p)(FILE *f, vpx_image_t *img);
-#include "./tools_common.h"
-#include "./video_writer.h"
+static int read_frame(FILE *f, vpx_image_t *img) {
+ size_t nbytes, to_read;
+ int res = 1;
-// The input video frame is downsampled several times to generate a
-// multi-level hierarchical structure. kNumEncoders is defined as the number
-// of encoding levels required. For example, if the size of input video is
-// 1280x720, kNumEncoders is 3, and down-sampling factor is 2, the encoder
-// outputs 3 bitstreams with resolution of 1280x720(level 0),
-// 640x360(level 1), and 320x180(level 2) respectively.
-#define kNumEncoders 3
+ to_read = img->w*img->h*3/2;
+ nbytes = fread(img->planes[0], 1, to_read, f);
+ if(nbytes != to_read) {
+ res = 0;
+ if(nbytes > 0)
+ printf("Warning: Read partial frame. Check your width & height!\n");
+ }
+ return res;
+}
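The to_read computation above assumes 4:2:0 chroma subsampling, where the two chroma planes are each a quarter of the luma size. A worked check of the arithmetic:

    /* I420/YV12 (4:2:0) frame size: w*h luma bytes plus two chroma
     * planes of (w/2)*(h/2) bytes each:
     *   w*h + 2 * (w*h / 4) = w*h * 3/2
     * e.g. 1280 * 720 * 3 / 2 = 1382400 bytes per frame. */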
-static const char *exec_name;
+static int read_frame_by_row(FILE *f, vpx_image_t *img) {
+ size_t nbytes, to_read;
+ int res = 1;
+ int plane;
-void usage_exit() {
- fprintf(stderr,
- "Usage: %s <width> <height> <infile> <outfile(s)> <output psnr?>\n",
- exec_name);
- exit(EXIT_FAILURE);
+ for (plane = 0; plane < 3; plane++)
+ {
+ unsigned char *ptr;
+ int w = (plane ? (1 + img->d_w) / 2 : img->d_w);
+ int h = (plane ? (1 + img->d_h) / 2 : img->d_h);
+ int r;
+
+ /* Determine the correct plane based on the image format. The for-loop
+ * always counts in Y,U,V order, but this may not match the order of
+ * the data on disk.
+ */
+ switch (plane)
+ {
+ case 1:
+ ptr = img->planes[img->fmt==VPX_IMG_FMT_YV12? VPX_PLANE_V : VPX_PLANE_U];
+ break;
+ case 2:
+ ptr = img->planes[img->fmt==VPX_IMG_FMT_YV12?VPX_PLANE_U : VPX_PLANE_V];
+ break;
+ default:
+ ptr = img->planes[plane];
+ }
+
+ for (r = 0; r < h; r++)
+ {
+ to_read = w;
+
+ nbytes = fread(ptr, 1, to_read, f);
+ if(nbytes != to_read) {
+ res = 0;
+ if(nbytes > 0)
+ printf("Warning: Read partial frame. Check your width & height!\n");
+ break;
+ }
+
+ ptr += img->stride[plane];
+ }
+ if (!res)
+ break;
+ }
+
+ return res;
}
-int main(int argc, char *argv[]) {
- int frame_cnt = 0;
- FILE *infile = NULL;
- VpxVideoWriter *writers[kNumEncoders];
- vpx_codec_ctx_t codec[kNumEncoders];
- vpx_codec_enc_cfg_t cfg[kNumEncoders];
- vpx_image_t raw[kNumEncoders];
- const VpxInterface *const encoder = get_vpx_encoder_by_name("vp8");
- // Currently, only realtime mode is supported in multi-resolution encoding.
- const int arg_deadline = VPX_DL_REALTIME;
- int i;
- int width = 0;
- int height = 0;
- int frame_avail = 0;
- int got_data = 0;
-
- // Set show_psnr to 1/0 to show/not show PSNR. Choose show_psnr=0 if you
- // don't need to know PSNR, which will skip PSNR calculation and save
- // encoding time.
- int show_psnr = 0;
- uint64_t psnr_sse_total[kNumEncoders] = {0};
- uint64_t psnr_samples_total[kNumEncoders] = {0};
- double psnr_totals[kNumEncoders][4] = {{0, 0}};
- int psnr_count[kNumEncoders] = {0};
-
- // Set the required target bitrates for each resolution level.
- // If target bitrate for highest-resolution level is set to 0,
- // (i.e. target_bitrate[0]=0), we skip encoding at that level.
- unsigned int target_bitrate[kNumEncoders] = {1000, 500, 100};
-
- // Enter the frame rate of the input video.
- const int framerate = 30;
- // Set down-sampling factor for each resolution level.
- // dsf[0] controls down sampling from level 0 to level 1;
- // dsf[1] controls down sampling from level 1 to level 2;
- // dsf[2] is not used.
- vpx_rational_t dsf[kNumEncoders] = {{2, 1}, {2, 1}, {1, 1}};
-
- exec_name = argv[0];
-
- if (!encoder)
- die("Unsupported codec.");
-
- // exe_name, input width, input height, input file,
- // output file 1, output file 2, output file 3, psnr on/off
- if (argc != (5 + kNumEncoders))
- die("Invalid number of input options.");
-
- printf("Using %s\n", vpx_codec_iface_name(encoder->codec_interface()));
-
- width = strtol(argv[1], NULL, 0);
- height = strtol(argv[2], NULL, 0);
-
- if (width < 16 || width % 2 || height < 16 || height % 2)
- die("Invalid resolution: %ldx%ld", width, height);
-
- // Open input video file for encoding
- if (!(infile = fopen(argv[3], "rb")))
- die("Failed to open %s for reading", argv[3]);
-
- show_psnr = strtol(argv[kNumEncoders + 4], NULL, 0);
-
- // Populate default encoder configuration
- for (i = 0; i < kNumEncoders; ++i) {
- vpx_codec_err_t res =
- vpx_codec_enc_config_default(encoder->codec_interface(), &cfg[i], 0);
- if (res != VPX_CODEC_OK) {
- printf("Failed to get config: %s\n", vpx_codec_err_to_string(res));
- return EXIT_FAILURE;
+static void write_ivf_file_header(FILE *outfile,
+ const vpx_codec_enc_cfg_t *cfg,
+ int frame_cnt) {
+ char header[32];
+
+ if(cfg->g_pass != VPX_RC_ONE_PASS && cfg->g_pass != VPX_RC_LAST_PASS)
+ return;
+ header[0] = 'D';
+ header[1] = 'K';
+ header[2] = 'I';
+ header[3] = 'F';
+ mem_put_le16(header+4, 0); /* version */
+ mem_put_le16(header+6, 32); /* headersize */
+ mem_put_le32(header+8, fourcc); /* fourcc */
+ mem_put_le16(header+12, cfg->g_w); /* width */
+ mem_put_le16(header+14, cfg->g_h); /* height */
+ mem_put_le32(header+16, cfg->g_timebase.den); /* rate */
+ mem_put_le32(header+20, cfg->g_timebase.num); /* scale */
+ mem_put_le32(header+24, frame_cnt); /* length */
+ mem_put_le32(header+28, 0); /* unused */
+
+ (void) fwrite(header, 1, 32, outfile);
+}
+
+static void write_ivf_frame_header(FILE *outfile,
+ const vpx_codec_cx_pkt_t *pkt)
+{
+ char header[12];
+ vpx_codec_pts_t pts;
+
+ if(pkt->kind != VPX_CODEC_CX_FRAME_PKT)
+ return;
+
+ pts = pkt->data.frame.pts;
+ mem_put_le32(header, pkt->data.frame.sz);
+ mem_put_le32(header+4, pts&0xFFFFFFFF);
+ mem_put_le32(header+8, pts >> 32);
+
+ (void) fwrite(header, 1, 12, outfile);
+}
+
+/* Temporal scaling parameters */
+/* This sets all the temporal layer parameters given |num_temporal_layers|,
+ * including the target bit allocation across temporal layers. Bit allocation
+ * parameters will be passed in as user parameters in another version.
+ */
+static void set_temporal_layer_pattern(int num_temporal_layers,
+ vpx_codec_enc_cfg_t *cfg,
+ int bitrate,
+ int *layer_flags)
+{
+ assert(num_temporal_layers <= MAX_NUM_TEMPORAL_LAYERS);
+ switch (num_temporal_layers)
+ {
+ case 1:
+ {
+ /* 1-layer */
+ cfg->ts_number_layers = 1;
+ cfg->ts_periodicity = 1;
+ cfg->ts_rate_decimator[0] = 1;
+ cfg->ts_layer_id[0] = 0;
+ cfg->ts_target_bitrate[0] = bitrate;
+
+ // Update L only.
+ layer_flags[0] = VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF;
+ break;
+ }
+
+ case 2:
+ {
+ /* 2-layers, with sync point at first frame of layer 1. */
+ cfg->ts_number_layers = 2;
+ cfg->ts_periodicity = 2;
+ cfg->ts_rate_decimator[0] = 2;
+ cfg->ts_rate_decimator[1] = 1;
+ cfg->ts_layer_id[0] = 0;
+ cfg->ts_layer_id[1] = 1;
+ // Use 60/40 bit allocation as example.
+ cfg->ts_target_bitrate[0] = 0.6f * bitrate;
+ cfg->ts_target_bitrate[1] = bitrate;
+
+ /* 0=L, 1=GF */
+ // ARF is used as predictor for all frames, and is only updated on
+ // key frame. Sync point every 8 frames.
+
+ // Layer 0: predict from L and ARF, update L and G.
+ layer_flags[0] = VP8_EFLAG_NO_REF_GF |
+ VP8_EFLAG_NO_UPD_ARF;
+
+ // Layer 1: sync point: predict from L and ARF, and update G.
+ layer_flags[1] = VP8_EFLAG_NO_REF_GF |
+ VP8_EFLAG_NO_UPD_LAST |
+ VP8_EFLAG_NO_UPD_ARF;
+
+ // Layer 0, predict from L and ARF, update L.
+ layer_flags[2] = VP8_EFLAG_NO_REF_GF |
+ VP8_EFLAG_NO_UPD_GF |
+ VP8_EFLAG_NO_UPD_ARF;
+
+ // Layer 1: predict from L, G and ARF, and update G.
+ layer_flags[3] = VP8_EFLAG_NO_UPD_ARF |
+ VP8_EFLAG_NO_UPD_LAST |
+ VP8_EFLAG_NO_UPD_ENTROPY;
+
+ // Layer 0
+ layer_flags[4] = layer_flags[2];
+
+ // Layer 1
+ layer_flags[5] = layer_flags[3];
+
+ // Layer 0
+ layer_flags[6] = layer_flags[4];
+
+ // Layer 1
+ layer_flags[7] = layer_flags[5];
+ break;
}
- }
-
- // Update the default configuration according to needs of the application.
- // Highest-resolution encoder settings
- cfg[0].g_w = width;
- cfg[0].g_h = height;
- cfg[0].g_threads = 1;
- cfg[0].rc_dropframe_thresh = 30;
- cfg[0].rc_end_usage = VPX_CBR;
- cfg[0].rc_resize_allowed = 0;
- cfg[0].rc_min_quantizer = 4;
- cfg[0].rc_max_quantizer = 56;
- cfg[0].rc_undershoot_pct = 98;
- cfg[0].rc_overshoot_pct = 100;
- cfg[0].rc_buf_initial_sz = 500;
- cfg[0].rc_buf_optimal_sz = 600;
- cfg[0].rc_buf_sz = 1000;
- cfg[0].g_error_resilient = 1;
- cfg[0].g_lag_in_frames = 0;
- cfg[0].kf_mode = VPX_KF_AUTO; // VPX_KF_DISABLED
- cfg[0].kf_min_dist = 3000;
- cfg[0].kf_max_dist = 3000;
- cfg[0].rc_target_bitrate = target_bitrate[0];
- cfg[0].g_timebase.num = 1;
- cfg[0].g_timebase.den = framerate;
-
- // Other-resolution encoder settings
- for (i = 1; i < kNumEncoders; ++i) {
- cfg[i] = cfg[0];
- cfg[i].g_threads = 1;
- cfg[i].rc_target_bitrate = target_bitrate[i];
-
- // Note: Width & height of other-resolution encoders are calculated
- // from the highest-resolution encoder's size and the corresponding
- // down_sampling_factor.
+
+ case 3:
+ default:
{
- unsigned int iw = cfg[i - 1].g_w * dsf[i - 1].den + dsf[i - 1].num - 1;
- unsigned int ih = cfg[i - 1].g_h * dsf[i - 1].den + dsf[i - 1].num - 1;
- cfg[i].g_w = iw / dsf[i - 1].num;
- cfg[i].g_h = ih / dsf[i - 1].num;
+ // 3-layers structure where ARF is used as predictor for all frames,
+ // and is only updated on key frame.
+ // Sync points for layer 1 and 2 every 8 frames.
+ cfg->ts_number_layers = 3;
+ cfg->ts_periodicity = 4;
+ cfg->ts_rate_decimator[0] = 4;
+ cfg->ts_rate_decimator[1] = 2;
+ cfg->ts_rate_decimator[2] = 1;
+ cfg->ts_layer_id[0] = 0;
+ cfg->ts_layer_id[1] = 2;
+ cfg->ts_layer_id[2] = 1;
+ cfg->ts_layer_id[3] = 2;
+ // Use 40/20/40 bit allocation as example.
+ cfg->ts_target_bitrate[0] = 0.4f * bitrate;
+ cfg->ts_target_bitrate[1] = 0.6f * bitrate;
+ cfg->ts_target_bitrate[2] = bitrate;
+
+ /* 0=L, 1=GF, 2=ARF */
+
+ // Layer 0: predict from L and ARF; update L and G.
+ layer_flags[0] = VP8_EFLAG_NO_UPD_ARF |
+ VP8_EFLAG_NO_REF_GF;
+
+ // Layer 2: sync point: predict from L and ARF; update none.
+ layer_flags[1] = VP8_EFLAG_NO_REF_GF |
+ VP8_EFLAG_NO_UPD_GF |
+ VP8_EFLAG_NO_UPD_ARF |
+ VP8_EFLAG_NO_UPD_LAST |
+ VP8_EFLAG_NO_UPD_ENTROPY;
+
+ // Layer 1: sync point: predict from L and ARF; update G.
+ layer_flags[2] = VP8_EFLAG_NO_REF_GF |
+ VP8_EFLAG_NO_UPD_ARF |
+ VP8_EFLAG_NO_UPD_LAST;
+
+ // Layer 2: predict from L, G, ARF; update none.
+ layer_flags[3] = VP8_EFLAG_NO_UPD_GF |
+ VP8_EFLAG_NO_UPD_ARF |
+ VP8_EFLAG_NO_UPD_LAST |
+ VP8_EFLAG_NO_UPD_ENTROPY;
+
+ // Layer 0: predict from L and ARF; update L.
+ layer_flags[4] = VP8_EFLAG_NO_UPD_GF |
+ VP8_EFLAG_NO_UPD_ARF |
+ VP8_EFLAG_NO_REF_GF;
+
+ // Layer 2: predict from L, G, ARF; update none.
+ layer_flags[5] = layer_flags[3];
+
+ // Layer 1: predict from L, G, ARF; update G.
+ layer_flags[6] = VP8_EFLAG_NO_UPD_ARF |
+ VP8_EFLAG_NO_UPD_LAST;
+
+ // Layer 2: predict from L, G, ARF; update none.
+ layer_flags[7] = layer_flags[3];
+ break;
+ }
}
+}
+
+/* The periodicity of the pattern given the number of temporal layers. */
+static int periodicity_to_num_layers[MAX_NUM_TEMPORAL_LAYERS] = {1, 8, 8};
+
+int main(int argc, char **argv)
+{
+ FILE *infile, *outfile[NUM_ENCODERS];
+ FILE *downsampled_input[NUM_ENCODERS - 1];
+ char filename[50];
+ vpx_codec_ctx_t codec[NUM_ENCODERS];
+ vpx_codec_enc_cfg_t cfg[NUM_ENCODERS];
+ int frame_cnt = 0;
+ vpx_image_t raw[NUM_ENCODERS];
+ vpx_codec_err_t res[NUM_ENCODERS];
+
+ int i;
+ long width;
+ long height;
+ int length_frame;
+ int frame_avail;
+ int got_data;
+ int flags = 0;
+ int layer_id = 0;
+
+ int layer_flags[VPX_TS_MAX_PERIODICITY * NUM_ENCODERS]
+ = {0};
+ int flag_periodicity;
+
+ /*Currently, only realtime mode is supported in multi-resolution encoding.*/
+ int arg_deadline = VPX_DL_REALTIME;
+
+ /* Set show_psnr to 1/0 to show/not show PSNR. Choose show_psnr=0 if you
+ don't need to know PSNR, which will skip PSNR calculation and save
+ encoding time. */
+ int show_psnr = 0;
+ int key_frame_insert = 0;
+ uint64_t psnr_sse_total[NUM_ENCODERS] = {0};
+ uint64_t psnr_samples_total[NUM_ENCODERS] = {0};
+ double psnr_totals[NUM_ENCODERS][4] = {{0,0}};
+ int psnr_count[NUM_ENCODERS] = {0};
+
+ double cx_time = 0;
+ struct timeval tv1, tv2, difftv;
+
+ /* Set the required target bitrates for each resolution level.
+ * If the target bitrate for the highest-resolution level is set to 0
+ * (i.e. target_bitrate[0] = 0), encoding at that level is skipped.
+ */
+ unsigned int target_bitrate[NUM_ENCODERS]={1000, 500, 100};
+
+ /* Enter the frame rate of the input video */
+ int framerate = 30;
+
+ /* Set down-sampling factor for each resolution level.
+ dsf[0] controls down sampling from level 0 to level 1;
+ dsf[1] controls down sampling from level 1 to level 2;
+ dsf[2] is not used. */
+ vpx_rational_t dsf[NUM_ENCODERS] = {{2, 1}, {2, 1}, {1, 1}};
+
+ /* Set the number of temporal layers for each encoder/resolution level,
+ * starting from the highest resolution down to the lowest. */
+ unsigned int num_temporal_layers[NUM_ENCODERS] = {3, 3, 3};
+
+ if(argc != (7 + 3 * NUM_ENCODERS))
+ die("Usage: %s <width> <height> <frame_rate> <infile> <outfile(s)> "
+ "<rate_encoder(s)> <temporal_layer(s)> <key_frame_insert> <output psnr?> \n",
+ argv[0]);
+
+ printf("Using %s\n",vpx_codec_iface_name(interface));
+
+ width = strtol(argv[1], NULL, 0);
+ height = strtol(argv[2], NULL, 0);
+ framerate = strtol(argv[3], NULL, 0);
+
+ if(width < 16 || width % 2 || height < 16 || height % 2)
+ die("Invalid resolution: %ldx%ld", width, height);
+
+ /* Open input video file for encoding */
+ if(!(infile = fopen(argv[4], "rb")))
+ die("Failed to open %s for reading", argv[4]);
+
+ /* Open output file for each encoder to output bitstreams */
+ for (i=0; i< NUM_ENCODERS; i++)
+ {
+ if(!target_bitrate[i])
+ {
+ outfile[i] = NULL;
+ continue;
+ }
- // Make width & height to be multiplier of 2.
- if ((cfg[i].g_w) % 2)
- cfg[i].g_w++;
-
- if ((cfg[i].g_h) % 2)
- cfg[i].g_h++;
- }
-
- // Open output file for each encoder to output bitstreams
- for (i = 0; i < kNumEncoders; ++i) {
- VpxVideoInfo info = {
- encoder->fourcc,
- cfg[i].g_w,
- cfg[i].g_h,
- {cfg[i].g_timebase.num, cfg[i].g_timebase.den}
- };
-
- if (!(writers[i] = vpx_video_writer_open(argv[i+4], kContainerIVF, &info)))
- die("Failed to open %s for writing", argv[i+4]);
- }
-
- // Allocate image for each encoder
- for (i = 0; i < kNumEncoders; ++i)
- if (!vpx_img_alloc(&raw[i], VPX_IMG_FMT_I420, cfg[i].g_w, cfg[i].g_h, 32))
- die("Failed to allocate image", cfg[i].g_w, cfg[i].g_h);
-
- // Initialize multi-encoder
- if (vpx_codec_enc_init_multi(&codec[0], encoder->codec_interface(), &cfg[0],
- kNumEncoders,
- show_psnr ? VPX_CODEC_USE_PSNR : 0, &dsf[0]))
- die_codec(&codec[0], "Failed to initialize encoder");
-
- // The extra encoding configuration parameters can be set as follows.
- for (i = 0; i < kNumEncoders; i++) {
- // Set encoding speed
- if (vpx_codec_control(&codec[i], VP8E_SET_CPUUSED, -6))
- die_codec(&codec[i], "Failed to set cpu_used");
-
- // Set static threshold.
- if (vpx_codec_control(&codec[i], VP8E_SET_STATIC_THRESHOLD, 1))
- die_codec(&codec[i], "Failed to set static threshold");
-
- // Set NOISE_SENSITIVITY to do TEMPORAL_DENOISING
- // Enable denoising for the highest-resolution encoder.
- if (vpx_codec_control(&codec[0], VP8E_SET_NOISE_SENSITIVITY, i == 0))
- die_codec(&codec[0], "Failed to set noise_sensitivity");
- }
-
- frame_avail = 1;
- got_data = 0;
-
- while (frame_avail || got_data) {
- vpx_codec_iter_t iter[kNumEncoders] = {NULL};
- const vpx_codec_cx_pkt_t *pkt[kNumEncoders];
-
- frame_avail = vpx_img_read(&raw[0], infile);
-
- if (frame_avail) {
- for (i = 1; i < kNumEncoders; ++i) {
- vpx_image_t *const prev = &raw[i - 1];
-
- // Scale the image down a number of times by downsampling factor
- // FilterMode 1 or 2 give better psnr than FilterMode 0.
- I420Scale(prev->planes[VPX_PLANE_Y], prev->stride[VPX_PLANE_Y],
- prev->planes[VPX_PLANE_U], prev->stride[VPX_PLANE_U],
- prev->planes[VPX_PLANE_V], prev->stride[VPX_PLANE_V],
- prev->d_w, prev->d_h,
- raw[i].planes[VPX_PLANE_Y], raw[i].stride[VPX_PLANE_Y],
- raw[i].planes[VPX_PLANE_U], raw[i].stride[VPX_PLANE_U],
- raw[i].planes[VPX_PLANE_V], raw[i].stride[VPX_PLANE_V],
- raw[i].d_w, raw[i].d_h, 1);
- }
+ if(!(outfile[i] = fopen(argv[i+5], "wb")))
+ die("Failed to open %s for writing", argv[i+4]);
}
- // Encode frame.
- if (vpx_codec_encode(&codec[0], frame_avail? &raw[0] : NULL,
- frame_cnt, 1, 0, arg_deadline)) {
- die_codec(&codec[0], "Failed to encode frame");
+ // Bitrates per spatial layer: overwrite default rates above.
+ for (i=0; i< NUM_ENCODERS; i++)
+ {
+ target_bitrate[i] = strtol(argv[NUM_ENCODERS + 5 + i], NULL, 0);
}
- for (i = kNumEncoders - 1; i >= 0; i--) {
- got_data = 0;
-
- while ((pkt[i] = vpx_codec_get_cx_data(&codec[i], &iter[i]))) {
- got_data = 1;
- switch (pkt[i]->kind) {
- case VPX_CODEC_CX_FRAME_PKT:
- vpx_video_writer_write_frame(writers[i], pkt[i]->data.frame.buf,
- pkt[i]->data.frame.sz, frame_cnt - 1);
- break;
- case VPX_CODEC_PSNR_PKT:
- if (show_psnr) {
- int j;
- psnr_sse_total[i] += pkt[i]->data.psnr.sse[0];
- psnr_samples_total[i] += pkt[i]->data.psnr.samples[0];
- for (j = 0; j < 4; j++)
- psnr_totals[i][j] += pkt[i]->data.psnr.psnr[j];
- psnr_count[i]++;
- }
- break;
- default:
- break;
+ // Temporal layers per spatial layers: overwrite default settings above.
+ for (i=0; i< NUM_ENCODERS; i++)
+ {
+ num_temporal_layers[i] = strtol(argv[2 * NUM_ENCODERS + 5 + i], NULL, 0);
+ if (num_temporal_layers[i] < 1 || num_temporal_layers[i] > 3)
+ die("Invalid temporal layers: %d, Must be 1, 2, or 3. \n",
+ num_temporal_layers);
+ }
+
+ /* Open file to write out each spatially downsampled input stream. */
+ for (i=0; i< NUM_ENCODERS - 1; i++)
+ {
+ // The highest resolution is encoder 0.
+ if (sprintf(filename,"ds%d.yuv",NUM_ENCODERS - i) < 0)
+ {
+ return EXIT_FAILURE;
+ }
+ downsampled_input[i] = fopen(filename,"wb");
+ }
+
+ key_frame_insert = strtol(argv[3 * NUM_ENCODERS + 5], NULL, 0);
+
+ show_psnr = strtol(argv[3 * NUM_ENCODERS + 6], NULL, 0);
+
+
+ /* Populate default encoder configuration */
+ for (i=0; i< NUM_ENCODERS; i++)
+ {
+ res[i] = vpx_codec_enc_config_default(interface, &cfg[i], 0);
+ if(res[i]) {
+ printf("Failed to get config: %s\n", vpx_codec_err_to_string(res[i]));
+ return EXIT_FAILURE;
}
- printf(pkt[i]->kind == VPX_CODEC_CX_FRAME_PKT &&
- (pkt[i]->data.frame.flags & VPX_FRAME_IS_KEY)? "K":".");
- fflush(stdout);
- }
}
- frame_cnt++;
- }
- printf("\n");
-
- fclose(infile);
-
- printf("Processed %d frames.\n", frame_cnt - 1);
- for (i = 0; i < kNumEncoders; ++i) {
- // Calculate PSNR and print it out
- if (show_psnr && psnr_count[i] > 0) {
- int j;
- double ovpsnr = sse_to_psnr(psnr_samples_total[i], 255.0,
- psnr_sse_total[i]);
-
- fprintf(stderr, "\n ENC%d PSNR (Overall/Avg/Y/U/V)", i);
- fprintf(stderr, " %.3lf", ovpsnr);
- for (j = 0; j < 4; j++)
- fprintf(stderr, " %.3lf", psnr_totals[i][j]/psnr_count[i]);
+
+ /*
+ * Update the default configuration according to needs of the application.
+ */
+ /* Highest-resolution encoder settings */
+ cfg[0].g_w = width;
+ cfg[0].g_h = height;
+ cfg[0].rc_dropframe_thresh = 0;
+ cfg[0].rc_end_usage = VPX_CBR;
+ cfg[0].rc_resize_allowed = 0;
+ cfg[0].rc_min_quantizer = 2;
+ cfg[0].rc_max_quantizer = 56;
+ cfg[0].rc_undershoot_pct = 100;
+ cfg[0].rc_overshoot_pct = 15;
+ cfg[0].rc_buf_initial_sz = 500;
+ cfg[0].rc_buf_optimal_sz = 600;
+ cfg[0].rc_buf_sz = 1000;
+ cfg[0].g_error_resilient = 1; /* Enable error resilient mode */
+ cfg[0].g_lag_in_frames = 0;
+
+ /* Keyframe placement: kf_min_dist == kf_max_dist gives a fixed keyframe interval. */
+ /* Note: These 3 settings are copied to all levels. But, except the lowest
+ * resolution level, all other levels are set to VPX_KF_DISABLED internally.
+ */
+ cfg[0].kf_mode = VPX_KF_AUTO;
+ cfg[0].kf_min_dist = 3000;
+ cfg[0].kf_max_dist = 3000;
+
+ cfg[0].rc_target_bitrate = target_bitrate[0]; /* Set target bitrate */
+ cfg[0].g_timebase.num = 1; /* Set fps */
+ cfg[0].g_timebase.den = framerate;
+
+ /* Other-resolution encoder settings */
+ for (i=1; i< NUM_ENCODERS; i++)
+ {
+ memcpy(&cfg[i], &cfg[0], sizeof(vpx_codec_enc_cfg_t));
+
+ cfg[i].rc_target_bitrate = target_bitrate[i];
+
+ /* Note: Width & height of other-resolution encoders are calculated
+ * from the highest-resolution encoder's size and the corresponding
+ * down_sampling_factor.
+ */
+ {
+ unsigned int iw = cfg[i-1].g_w*dsf[i-1].den + dsf[i-1].num - 1;
+ unsigned int ih = cfg[i-1].g_h*dsf[i-1].den + dsf[i-1].num - 1;
+ cfg[i].g_w = iw/dsf[i-1].num;
+ cfg[i].g_h = ih/dsf[i-1].num;
+ }
+
+ /* Round width & height up to multiples of 2. */
+ // TODO: Add support for odd frame sizes.
+ if (cfg[i].g_w % 2) cfg[i].g_w++;
+ if (cfg[i].g_h % 2) cfg[i].g_h++;
+ }
+
+
+ // Set the number of threads per encode/spatial layer.
+ // (1, 1, 1) means no encoder threading.
+ cfg[0].g_threads = 2;
+ cfg[1].g_threads = 1;
+ cfg[2].g_threads = 1;
+
+ /* Allocate image for each encoder */
+ for (i=0; i< NUM_ENCODERS; i++)
+ if(!vpx_img_alloc(&raw[i], VPX_IMG_FMT_I420, cfg[i].g_w, cfg[i].g_h, 32))
+ die("Failed to allocate image", cfg[i].g_w, cfg[i].g_h);
+
+ if (raw[0].stride[VPX_PLANE_Y] == raw[0].d_w)
+ read_frame_p = read_frame;
+ else
+ read_frame_p = read_frame_by_row;
+
+ for (i=0; i< NUM_ENCODERS; i++)
+ if(outfile[i])
+ write_ivf_file_header(outfile[i], &cfg[i], 0);
+
+ /* Temporal layers settings */
+ for ( i=0; i<NUM_ENCODERS; i++)
+ {
+ set_temporal_layer_pattern(num_temporal_layers[i],
+ &cfg[i],
+ cfg[i].rc_target_bitrate,
+ &layer_flags[i * VPX_TS_MAX_PERIODICITY]);
+ }
+
+ /* Initialize multi-encoder */
+ if(vpx_codec_enc_init_multi(&codec[0], interface, &cfg[0], NUM_ENCODERS,
+ (show_psnr ? VPX_CODEC_USE_PSNR : 0), &dsf[0]))
+ die_codec(&codec[0], "Failed to initialize encoder");
+
+ /* The extra encoding configuration parameters can be set as follows. */
+ /* Set encoding speed */
+ for ( i=0; i<NUM_ENCODERS; i++)
+ {
+ int speed = -6;
+ /* Lower speed for the lowest resolution. */
+ if (i == NUM_ENCODERS - 1) speed = -4;
+ if(vpx_codec_control(&codec[i], VP8E_SET_CPUUSED, speed))
+ die_codec(&codec[i], "Failed to set cpu_used");
}
- if (vpx_codec_destroy(&codec[i]))
- die_codec(&codec[i], "Failed to destroy codec");
+ /* Set static threshold = 1 for all encoders */
+ for ( i=0; i<NUM_ENCODERS; i++)
+ {
+ if(vpx_codec_control(&codec[i], VP8E_SET_STATIC_THRESHOLD, 1))
+ die_codec(&codec[i], "Failed to set static threshold");
+ }
- vpx_img_free(&raw[i]);
- vpx_video_writer_close(writers[i]);
- }
- printf("\n");
+ /* Set NOISE_SENSITIVITY to do TEMPORAL_DENOISING */
+ /* Enable denoising for the highest-resolution encoder. */
+ if(vpx_codec_control(&codec[0], VP8E_SET_NOISE_SENSITIVITY, 1))
+ die_codec(&codec[0], "Failed to set noise_sensitivity");
+ for ( i=1; i< NUM_ENCODERS; i++)
+ {
+ if(vpx_codec_control(&codec[i], VP8E_SET_NOISE_SENSITIVITY, 0))
+ die_codec(&codec[i], "Failed to set noise_sensitivity");
+ }
+
+ /* Set the number of token partitions */
+ for ( i=0; i<NUM_ENCODERS; i++)
+ {
+ if(vpx_codec_control(&codec[i], VP8E_SET_TOKEN_PARTITIONS, 1))
+ die_codec(&codec[i], "Failed to set static threshold");
+ }
+
+ /* Set the max intra target bitrate */
+ for ( i=0; i<NUM_ENCODERS; i++)
+ {
+ unsigned int max_intra_size_pct =
+ (int)(((double)cfg[0].rc_buf_optimal_sz * 0.5) * framerate / 10);
+ if(vpx_codec_control(&codec[i], VP8E_SET_MAX_INTRA_BITRATE_PCT,
+ max_intra_size_pct))
+ die_codec(&codec[i], "Failed to set max intra bitrate pct");
+ }
+
+ frame_avail = 1;
+ got_data = 0;
+
+ while(frame_avail || got_data)
+ {
+ vpx_codec_iter_t iter[NUM_ENCODERS]={NULL};
+ const vpx_codec_cx_pkt_t *pkt[NUM_ENCODERS];
+
+ flags = 0;
+ frame_avail = read_frame_p(infile, &raw[0]);
+
+ if(frame_avail)
+ {
+ for ( i=1; i<NUM_ENCODERS; i++)
+ {
+ /* Scale the image down by the down-sampling factor. */
+ /* FilterMode 1 or 2 gives better PSNR than FilterMode 0. */
+ I420Scale(raw[i-1].planes[VPX_PLANE_Y], raw[i-1].stride[VPX_PLANE_Y],
+ raw[i-1].planes[VPX_PLANE_U], raw[i-1].stride[VPX_PLANE_U],
+ raw[i-1].planes[VPX_PLANE_V], raw[i-1].stride[VPX_PLANE_V],
+ raw[i-1].d_w, raw[i-1].d_h,
+ raw[i].planes[VPX_PLANE_Y], raw[i].stride[VPX_PLANE_Y],
+ raw[i].planes[VPX_PLANE_U], raw[i].stride[VPX_PLANE_U],
+ raw[i].planes[VPX_PLANE_V], raw[i].stride[VPX_PLANE_V],
+ raw[i].d_w, raw[i].d_h, 1);
+ /* Write out down-sampled input. */
+ length_frame = cfg[i].g_w * cfg[i].g_h *3/2;
+ if (fwrite(raw[i].planes[0], 1, length_frame,
+ downsampled_input[NUM_ENCODERS - i - 1]) !=
+ length_frame)
+ {
+ return EXIT_FAILURE;
+ }
+ }
+ }
+
+ /* Set the flags (reference and update) for all the encoders.*/
+ for ( i=0; i<NUM_ENCODERS; i++)
+ {
+ layer_id = cfg[i].ts_layer_id[frame_cnt % cfg[i].ts_periodicity];
+ flags = 0;
+ flag_periodicity = periodicity_to_num_layers
+ [num_temporal_layers[i] - 1];
+ flags = layer_flags[i * VPX_TS_MAX_PERIODICITY +
+ frame_cnt % flag_periodicity];
+ // Key frame flag for first frame.
+ if (frame_cnt == 0)
+ {
+ flags |= VPX_EFLAG_FORCE_KF;
+ }
+ if (frame_cnt > 0 && frame_cnt == key_frame_insert)
+ {
+ flags = VPX_EFLAG_FORCE_KF;
+ }
+
+ vpx_codec_control(&codec[i], VP8E_SET_FRAME_FLAGS, flags);
+ vpx_codec_control(&codec[i], VP8E_SET_TEMPORAL_LAYER_ID, layer_id);
+ }
+
+ gettimeofday(&tv1, NULL);
+ /* Encode each frame at all resolution levels. */
+ /* Note the flags must be set to 0 in the encode call if they are set
+ for each frame with the vpx_codec_control(), as done above. */
+ if(vpx_codec_encode(&codec[0], frame_avail? &raw[0] : NULL,
+ frame_cnt, 1, 0, arg_deadline))
+ {
+ die_codec(&codec[0], "Failed to encode frame");
+ }
+ gettimeofday(&tv2, NULL);
+ timersub(&tv2, &tv1, &difftv);
+ cx_time += (double)(difftv.tv_sec * 1000000 + difftv.tv_usec);
+ for (i=NUM_ENCODERS-1; i>=0 ; i--)
+ {
+ got_data = 0;
+ while( (pkt[i] = vpx_codec_get_cx_data(&codec[i], &iter[i])) )
+ {
+ got_data = 1;
+ switch(pkt[i]->kind) {
+ case VPX_CODEC_CX_FRAME_PKT:
+ write_ivf_frame_header(outfile[i], pkt[i]);
+ (void) fwrite(pkt[i]->data.frame.buf, 1,
+ pkt[i]->data.frame.sz, outfile[i]);
+ break;
+ case VPX_CODEC_PSNR_PKT:
+ if (show_psnr)
+ {
+ int j;
+
+ psnr_sse_total[i] += pkt[i]->data.psnr.sse[0];
+ psnr_samples_total[i] += pkt[i]->data.psnr.samples[0];
+ for (j = 0; j < 4; j++)
+ {
+ psnr_totals[i][j] += pkt[i]->data.psnr.psnr[j];
+ }
+ psnr_count[i]++;
+ }
+
+ break;
+ default:
+ break;
+ }
+ printf(pkt[i]->kind == VPX_CODEC_CX_FRAME_PKT
+ && (pkt[i]->data.frame.flags & VPX_FRAME_IS_KEY)? "K":"");
+ fflush(stdout);
+ }
+ }
+ frame_cnt++;
+ }
+ printf("\n");
+ printf("FPS for encoding %d %f %f \n", frame_cnt, (float)cx_time / 1000000,
+ 1000000 * (double)frame_cnt / (double)cx_time);
+
+ fclose(infile);
+
+ printf("Processed %ld frames.\n",(long int)frame_cnt-1);
+ for (i=0; i< NUM_ENCODERS; i++)
+ {
+ /* Calculate PSNR and print it out */
+ if ( (show_psnr) && (psnr_count[i]>0) )
+ {
+ int j;
+ double ovpsnr = sse_to_psnr(psnr_samples_total[i], 255.0,
+ psnr_sse_total[i]);
+
+ fprintf(stderr, "\n ENC%d PSNR (Overall/Avg/Y/U/V)", i);
+
+ fprintf(stderr, " %.3lf", ovpsnr);
+ for (j = 0; j < 4; j++)
+ {
+ fprintf(stderr, " %.3lf", psnr_totals[i][j]/psnr_count[i]);
+ }
+ }
+
+ if(vpx_codec_destroy(&codec[i]))
+ die_codec(&codec[i], "Failed to destroy codec");
+
+ vpx_img_free(&raw[i]);
+
+ if(!outfile[i])
+ continue;
+
+ /* Try to rewrite the file header with the actual frame count */
+ if(!fseek(outfile[i], 0, SEEK_SET))
+ write_ivf_file_header(outfile[i], &cfg[i], frame_cnt-1);
+ fclose(outfile[i]);
+ }
+ printf("\n");
- return EXIT_SUCCESS;
+ return EXIT_SUCCESS;
}
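The down-scaled encoder dimensions in the hunk above come from a ceiling division by the down-sampling factor, followed by rounding up to an even size. A minimal standalone sketch of that arithmetic follows; the `rational` type and `scale_dim` helper are illustrative, not libvpx API:

#include <stdio.h>

typedef struct { int num, den; } rational;

/* ceil(prev * den / num), then round up to a multiple of 2,
 * mirroring the cfg[i].g_w / g_h computation above. */
static unsigned int scale_dim(unsigned int prev, rational dsf) {
  unsigned int d = (prev * dsf.den + dsf.num - 1) / dsf.num;
  return d + (d % 2);
}

int main(void) {
  rational dsf[3] = {{2, 1}, {2, 1}, {1, 1}};
  unsigned int w = 1280, h = 720;
  int i;
  printf("level 0: %ux%u\n", w, h);
  for (i = 1; i < 3; i++) {
    w = scale_dim(w, dsf[i - 1]);
    h = scale_dim(h, dsf[i - 1]);
    printf("level %d: %ux%u\n", i, w, h);
  }
  return 0;
}

For a 1280x720 input with the default factors this prints 640x360 for level 1 and 320x180 for level 2, matching the resolutions the example's comments describe.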
diff --git a/chromium/third_party/libvpx/source/libvpx/examples/vp8cx_set_ref.c b/chromium/third_party/libvpx/source/libvpx/examples/vp8cx_set_ref.c
index 5e29d808319..a2982821a42 100644
--- a/chromium/third_party/libvpx/source/libvpx/examples/vp8cx_set_ref.c
+++ b/chromium/third_party/libvpx/source/libvpx/examples/vp8cx_set_ref.c
@@ -53,8 +53,8 @@
#include "vpx/vp8cx.h"
#include "vpx/vpx_encoder.h"
-#include "./tools_common.h"
-#include "./video_writer.h"
+#include "../tools_common.h"
+#include "../video_writer.h"
static const char *exec_name;
diff --git a/chromium/third_party/libvpx/source/libvpx/examples/vp9_lossless_encoder.c b/chromium/third_party/libvpx/source/libvpx/examples/vp9_lossless_encoder.c
index 3fcda0cd4bb..54275770d5f 100644
--- a/chromium/third_party/libvpx/source/libvpx/examples/vp9_lossless_encoder.c
+++ b/chromium/third_party/libvpx/source/libvpx/examples/vp9_lossless_encoder.c
@@ -15,8 +15,8 @@
#include "vpx/vpx_encoder.h"
#include "vpx/vp8cx.h"
-#include "./tools_common.h"
-#include "./video_writer.h"
+#include "../tools_common.h"
+#include "../video_writer.h"
static const char *exec_name;
diff --git a/chromium/third_party/libvpx/source/libvpx/examples/vp9_spatial_svc_encoder.c b/chromium/third_party/libvpx/source/libvpx/examples/vp9_spatial_svc_encoder.c
index 53ede94d843..f4deb693b2f 100644
--- a/chromium/third_party/libvpx/source/libvpx/examples/vp9_spatial_svc_encoder.c
+++ b/chromium/third_party/libvpx/source/libvpx/examples/vp9_spatial_svc_encoder.c
@@ -19,14 +19,14 @@
#include <string.h>
#include <time.h>
-#include "./args.h"
-#include "./tools_common.h"
-#include "./video_writer.h"
+#include "../args.h"
+#include "../tools_common.h"
+#include "../video_writer.h"
#include "vpx/svc_context.h"
#include "vpx/vp8cx.h"
#include "vpx/vpx_encoder.h"
-#include "./vpxstats.h"
+#include "../vpxstats.h"
static const arg_def_t skip_frames_arg =
ARG_DEF("s", "skip-frames", 1, "input frames to skip");
@@ -60,6 +60,11 @@ static const arg_def_t min_bitrate_arg =
ARG_DEF(NULL, "min-bitrate", 1, "Minimum bitrate");
static const arg_def_t max_bitrate_arg =
ARG_DEF(NULL, "max-bitrate", 1, "Maximum bitrate");
+static const arg_def_t lag_in_frame_arg =
+ ARG_DEF(NULL, "lag-in-frames", 1, "Number of frame to input before "
+ "generating any outputs");
+static const arg_def_t rc_end_usage_arg =
+ ARG_DEF(NULL, "rc-end-usage", 1, "0 - 3: VBR, CBR, CQ, Q");
#if CONFIG_VP9_HIGHBITDEPTH
static const struct arg_enum_list bitdepth_enum[] = {
@@ -80,11 +85,11 @@ static const arg_def_t *svc_args[] = {
&timebase_arg, &bitrate_arg, &skip_frames_arg, &spatial_layers_arg,
&kf_dist_arg, &scale_factors_arg, &passes_arg, &pass_arg,
&fpf_name_arg, &min_q_arg, &max_q_arg, &min_bitrate_arg,
- &max_bitrate_arg, &temporal_layers_arg,
+ &max_bitrate_arg, &temporal_layers_arg, &lag_in_frame_arg,
#if CONFIG_VP9_HIGHBITDEPTH
&bitdepth_arg,
#endif
- NULL
+ &rc_end_usage_arg, NULL
};
static const uint32_t default_frames_to_skip = 0;
@@ -207,6 +212,10 @@ static void parse_command_line(int argc, const char **argv_,
min_bitrate = arg_parse_uint(&arg);
} else if (arg_match(&arg, &max_bitrate_arg, argi)) {
max_bitrate = arg_parse_uint(&arg);
+ } else if (arg_match(&arg, &lag_in_frame_arg, argi)) {
+ enc_cfg->g_lag_in_frames = arg_parse_uint(&arg);
+ } else if (arg_match(&arg, &rc_end_usage_arg, argi)) {
+ enc_cfg->rc_end_usage = arg_parse_uint(&arg);
#if CONFIG_VP9_HIGHBITDEPTH
} else if (arg_match(&arg, &bitdepth_arg, argi)) {
enc_cfg->g_bit_depth = arg_parse_enum_or_int(&arg);
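The new --rc-end-usage flag above assigns a raw integer to enc_cfg->rc_end_usage; in libvpx's vpx_encoder.h the vpx_rc_mode enum orders the modes VPX_VBR, VPX_CBR, VPX_CQ, VPX_Q, which is what the "0 - 3" help string refers to. A hedged sketch of a friendlier parser (the helper name is hypothetical, not part of this patch):

#include <string.h>
#include "vpx/vpx_encoder.h"

/* Map a command-line token to vpx_rc_mode; returns -1 on unknown input.
 * Accepts either the numeric form used by --rc-end-usage or a name. */
static int parse_rc_end_usage(const char *s) {
  if (!strcmp(s, "0") || !strcmp(s, "vbr")) return VPX_VBR;
  if (!strcmp(s, "1") || !strcmp(s, "cbr")) return VPX_CBR;
  if (!strcmp(s, "2") || !strcmp(s, "cq"))  return VPX_CQ;
  if (!strcmp(s, "3") || !strcmp(s, "q"))   return VPX_Q;
  return -1;
}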
diff --git a/chromium/third_party/libvpx/source/libvpx/examples/vpx_temporal_svc_encoder.c b/chromium/third_party/libvpx/source/libvpx/examples/vpx_temporal_svc_encoder.c
index ecae2fe6393..349875997b5 100644
--- a/chromium/third_party/libvpx/source/libvpx/examples/vpx_temporal_svc_encoder.c
+++ b/chromium/third_party/libvpx/source/libvpx/examples/vpx_temporal_svc_encoder.c
@@ -19,12 +19,12 @@
#include <string.h>
#include "./vpx_config.h"
-#include "vpx_ports/vpx_timer.h"
+#include "../vpx_ports/vpx_timer.h"
#include "vpx/vp8cx.h"
#include "vpx/vpx_encoder.h"
-#include "./tools_common.h"
-#include "./video_writer.h"
+#include "../tools_common.h"
+#include "../video_writer.h"
static const char *exec_name;
@@ -61,6 +61,15 @@ struct RateControlMetrics {
double layer_avg_rate_mismatch[VPX_TS_MAX_LAYERS];
// Actual encoding bitrate per layer (cumulative).
double layer_encoding_bitrate[VPX_TS_MAX_LAYERS];
+ // Average of the short-time encoder actual bitrate.
+ // TODO(marpan): Should we add these short-time stats for each layer?
+ double avg_st_encoding_bitrate;
+ // Variance of the short-time encoder actual bitrate.
+ double variance_st_encoding_bitrate;
+ // Window (number of frames) for computing short-time encoding bitrate.
+ int window_size;
+ // Number of window measurements.
+ int window_count;
};
// Note: these rate control metrics assume only 1 key frame in the
@@ -92,6 +101,10 @@ static void set_rate_control_metrics(struct RateControlMetrics *rc,
rc->layer_avg_frame_size[i] = 0.0;
rc->layer_avg_rate_mismatch[i] = 0.0;
}
+ rc->window_count = 0;
+ rc->window_size = 15;
+ rc->avg_st_encoding_bitrate = 0.0;
+ rc->variance_st_encoding_bitrate = 0.0;
}
static void printout_rate_control_summary(struct RateControlMetrics *rc,
@@ -99,6 +112,7 @@ static void printout_rate_control_summary(struct RateControlMetrics *rc,
int frame_cnt) {
unsigned int i = 0;
int tot_num_frames = 0;
+ double perc_fluctuation = 0.0;
printf("Total number of processed frames: %d\n\n", frame_cnt -1);
printf("Rate control layer stats for %d layer(s):\n\n",
cfg->ts_number_layers);
@@ -125,6 +139,17 @@ static void printout_rate_control_summary(struct RateControlMetrics *rc,
100.0 * num_dropped / rc->layer_input_frames[i]);
printf("\n");
}
+ rc->avg_st_encoding_bitrate = rc->avg_st_encoding_bitrate / rc->window_count;
+ rc->variance_st_encoding_bitrate =
+ rc->variance_st_encoding_bitrate / rc->window_count -
+ (rc->avg_st_encoding_bitrate * rc->avg_st_encoding_bitrate);
+ perc_fluctuation = 100.0 * sqrt(rc->variance_st_encoding_bitrate) /
+ rc->avg_st_encoding_bitrate;
+ printf("Short-time stats, for window of %d frames: \n",rc->window_size);
+ printf("Average, rms-variance, and percent-fluct: %f %f %f \n",
+ rc->avg_st_encoding_bitrate,
+ sqrt(rc->variance_st_encoding_bitrate),
+ perc_fluctuation);
if ((frame_cnt - 1) != tot_num_frames)
die("Error: Number of input frames not equal to output! \n");
}
@@ -456,7 +481,11 @@ int main(int argc, char **argv) {
int layering_mode = 0;
int layer_flags[VPX_TS_MAX_PERIODICITY] = {0};
int flag_periodicity = 1;
+#if VPX_ENCODER_ABI_VERSION > (4 + VPX_CODEC_ABI_VERSION)
vpx_svc_layer_id_t layer_id = {0, 0};
+#else
+ vpx_svc_layer_id_t layer_id = {0};
+#endif
const VpxInterface *encoder = NULL;
FILE *infile = NULL;
struct RateControlMetrics rc;
@@ -469,6 +498,9 @@ int main(int argc, char **argv) {
#else
const int min_args = min_args_base;
#endif // CONFIG_VP9_HIGHBITDEPTH
+ double sum_bitrate = 0.0;
+ double sum_bitrate2 = 0.0;
+ double framerate = 30.0;
exec_name = argv[0];
// Check usage and arguments.
@@ -574,12 +606,17 @@ int main(int argc, char **argv) {
cfg.rc_resize_allowed = 0;
cfg.rc_min_quantizer = 2;
cfg.rc_max_quantizer = 56;
+ if (strncmp(encoder->name, "vp9", 3) == 0)
+ cfg.rc_max_quantizer = 52;
cfg.rc_undershoot_pct = 50;
cfg.rc_overshoot_pct = 50;
cfg.rc_buf_initial_sz = 500;
cfg.rc_buf_optimal_sz = 600;
cfg.rc_buf_sz = 1000;
+ // Use 1 thread as default.
+ cfg.g_threads = 1;
+
// Enable error resilient mode.
cfg.g_error_resilient = 1;
cfg.g_lag_in_frames = 0;
@@ -604,6 +641,7 @@ int main(int argc, char **argv) {
die("Failed to open %s for reading", argv[1]);
}
+ framerate = cfg.g_timebase.den / cfg.g_timebase.num;
// Open an output file for each stream.
for (i = 0; i < cfg.ts_number_layers; ++i) {
char file_name[PATH_MAX];
@@ -636,23 +674,28 @@ int main(int argc, char **argv) {
if (strncmp(encoder->name, "vp8", 3) == 0) {
vpx_codec_control(&codec, VP8E_SET_CPUUSED, -speed);
- vpx_codec_control(&codec, VP8E_SET_NOISE_SENSITIVITY, kDenoiserOnYOnly);
+ vpx_codec_control(&codec, VP8E_SET_NOISE_SENSITIVITY, kDenoiserOff);
+ vpx_codec_control(&codec, VP8E_SET_STATIC_THRESHOLD, 0);
} else if (strncmp(encoder->name, "vp9", 3) == 0) {
vpx_codec_control(&codec, VP8E_SET_CPUUSED, speed);
vpx_codec_control(&codec, VP9E_SET_AQ_MODE, 3);
vpx_codec_control(&codec, VP9E_SET_FRAME_PERIODIC_BOOST, 0);
vpx_codec_control(&codec, VP9E_SET_NOISE_SENSITIVITY, 0);
- if (vpx_codec_control(&codec, VP9E_SET_SVC, 1)) {
+ vpx_codec_control(&codec, VP8E_SET_STATIC_THRESHOLD, 0);
+ vpx_codec_control(&codec, VP9E_SET_TILE_COLUMNS, (cfg.g_threads >> 1));
+ if (vpx_codec_control(&codec, VP9E_SET_SVC, layering_mode > 0 ? 1: 0)) {
die_codec(&codec, "Failed to set SVC");
}
}
- vpx_codec_control(&codec, VP8E_SET_STATIC_THRESHOLD, 1);
+ if (strncmp(encoder->name, "vp8", 3) == 0) {
+ vpx_codec_control(&codec, VP8E_SET_SCREEN_CONTENT_MODE, 0);
+ }
vpx_codec_control(&codec, VP8E_SET_TOKEN_PARTITIONS, 1);
// This controls the maximum target size of the key frame.
// For generating smaller key frames, use a smaller max_intra_size_pct
// value, like 100 or 200.
{
- const int max_intra_size_pct = 200;
+ const int max_intra_size_pct = 900;
vpx_codec_control(&codec, VP8E_SET_MAX_INTRA_BITRATE_PCT,
max_intra_size_pct);
}
@@ -662,14 +705,21 @@ int main(int argc, char **argv) {
struct vpx_usec_timer timer;
vpx_codec_iter_t iter = NULL;
const vpx_codec_cx_pkt_t *pkt;
+#if VPX_ENCODER_ABI_VERSION > (4 + VPX_CODEC_ABI_VERSION)
// Update the temporal layer_id. No spatial layers in this test.
layer_id.spatial_layer_id = 0;
+#endif
layer_id.temporal_layer_id =
cfg.ts_layer_id[frame_cnt % cfg.ts_periodicity];
if (strncmp(encoder->name, "vp9", 3) == 0) {
vpx_codec_control(&codec, VP9E_SET_SVC_LAYER_ID, &layer_id);
+ } else if (strncmp(encoder->name, "vp8", 3) == 0) {
+ vpx_codec_control(&codec, VP8E_SET_TEMPORAL_LAYER_ID,
+ layer_id.temporal_layer_id);
}
flags = layer_flags[frame_cnt % flag_periodicity];
+ if (layering_mode == 0)
+ flags = 0;
frame_avail = vpx_img_read(&raw, infile);
if (frame_avail)
++rc.layer_input_frames[layer_id.temporal_layer_id];
@@ -705,6 +755,33 @@ int main(int argc, char **argv) {
++rc.layer_enc_frames[i];
}
}
+ // Update the short-time encoding bitrate states, for a moving window
+ // of size rc->window_size, shifted by rc->window_size / 2.
+ // Ignore first window segment, due to key frame.
+ if (frame_cnt > rc.window_size) {
+ sum_bitrate += 0.001 * 8.0 * pkt->data.frame.sz * framerate;
+ if (frame_cnt % rc.window_size == 0) {
+ rc.window_count += 1;
+ rc.avg_st_encoding_bitrate += sum_bitrate / rc.window_size;
+ rc.variance_st_encoding_bitrate +=
+ (sum_bitrate / rc.window_size) *
+ (sum_bitrate / rc.window_size);
+ sum_bitrate = 0.0;
+ }
+ }
+ // Second shifted window.
+ if (frame_cnt > rc.window_size + rc.window_size / 2) {
+ sum_bitrate2 += 0.001 * 8.0 * pkt->data.frame.sz * framerate;
+ if (frame_cnt > 2 * rc.window_size &&
+ frame_cnt % rc.window_size == 0) {
+ rc.window_count += 1;
+ rc.avg_st_encoding_bitrate += sum_bitrate2 / rc.window_size;
+ rc.variance_st_encoding_bitrate +=
+ (sum_bitrate2 / rc.window_size) *
+ (sum_bitrate2 / rc.window_size);
+ sum_bitrate2 = 0.0;
+ }
+ }
break;
default:
break;
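The short-time bitrate statistics accumulated above reduce, in printout_rate_control_summary, to the standard identity var(x) = E[x^2] - E[x]^2 applied to the per-window average bitrates. A self-contained sketch of the same computation, assuming the per-window averages have already been collected (function name is illustrative):

#include <math.h>
#include <stdio.h>

/* Given per-window average bitrates (kbps), print the mean, rms-variance,
 * and percent fluctuation the same way the summary above does. */
static void report_windowed_bitrate(const double *win_avg, int window_count) {
  double sum = 0.0, sum_sq = 0.0, mean, var;
  int i;
  for (i = 0; i < window_count; i++) {
    sum += win_avg[i];
    sum_sq += win_avg[i] * win_avg[i];
  }
  mean = sum / window_count;
  var = sum_sq / window_count - mean * mean; /* E[x^2] - E[x]^2 */
  printf("Average, rms-variance, and percent-fluct: %f %f %f\n",
         mean, sqrt(var), 100.0 * sqrt(var) / mean);
}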
diff --git a/chromium/third_party/libvpx/source/libvpx/libs.doxy_template b/chromium/third_party/libvpx/source/libvpx/libs.doxy_template
index 02e290242b4..5a8f847280e 100644
--- a/chromium/third_party/libvpx/source/libvpx/libs.doxy_template
+++ b/chromium/third_party/libvpx/source/libvpx/libs.doxy_template
@@ -36,7 +36,7 @@ DOXYFILE_ENCODING = UTF-8
# The PROJECT_NAME tag is a single word (or a sequence of words surrounded
# by quotes) that should identify the project.
-PROJECT_NAME = "WebM VP8 Codec SDK"
+PROJECT_NAME = "WebM Codec SDK"
# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute)
# base path where the generated documentation will be put.
@@ -415,12 +415,6 @@ MAX_INITIALIZER_LINES = 30
SHOW_USED_FILES = YES
-# If the sources in your project are distributed over multiple directories
-# then setting the SHOW_DIRECTORIES tag to YES will show the directory hierarchy
-# in the documentation. The default is NO.
-
-SHOW_DIRECTORIES = NO
-
# The FILE_VERSION_FILTER tag can be used to specify a program or script that
# doxygen should invoke to get the current version for each file (typically from the
# version control system). Doxygen will invoke the program by executing (via
@@ -715,12 +709,6 @@ HTML_FOOTER =
HTML_STYLESHEET =
-# If the HTML_ALIGN_MEMBERS tag is set to YES, the members of classes,
-# files or namespaces will be aligned in HTML using tables. If set to
-# NO a bullet list will be used.
-
-HTML_ALIGN_MEMBERS = YES
-
# If the GENERATE_HTMLHELP tag is set to YES, additional index files
# will be generated that can be used as input for tools like the
# Microsoft HTML help workshop to generate a compressed HTML help file (.chm)
diff --git a/chromium/third_party/libvpx/source/libvpx/libs.mk b/chromium/third_party/libvpx/source/libvpx/libs.mk
index f9f2d80702f..6eee0039c29 100644
--- a/chromium/third_party/libvpx/source/libvpx/libs.mk
+++ b/chromium/third_party/libvpx/source/libvpx/libs.mk
@@ -18,32 +18,6 @@ else
endif
#
-# Calculate platform- and compiler-specific offsets for hand coded assembly
-#
-ifeq ($(filter icc gcc,$(TGT_CC)), $(TGT_CC))
-OFFSET_PATTERN:='^[a-zA-Z0-9_]* EQU'
-define asm_offsets_template
-$$(BUILD_PFX)$(1): $$(BUILD_PFX)$(2).S
- @echo " [CREATE] $$@"
- $$(qexec)LC_ALL=C grep $$(OFFSET_PATTERN) $$< | tr -d '$$$$\#' $$(ADS2GAS) > $$@
-$$(BUILD_PFX)$(2).S: $(2)
-CLEAN-OBJS += $$(BUILD_PFX)$(1) $(2).S
-endef
-else
- ifeq ($(filter rvct,$(TGT_CC)), $(TGT_CC))
-define asm_offsets_template
-$$(BUILD_PFX)$(1): obj_int_extract
-$$(BUILD_PFX)$(1): $$(BUILD_PFX)$(2).o
- @echo " [CREATE] $$@"
- $$(qexec)./obj_int_extract rvds $$< $$(ADS2GAS) > $$@
-OBJS-yes += $$(BUILD_PFX)$(2).o
-CLEAN-OBJS += $$(BUILD_PFX)$(1)
-$$(filter %$$(ASM).o,$$(OBJS-yes)): $$(BUILD_PFX)$(1)
-endef
-endif # rvct
-endif # !gcc
-
-#
# Rule to generate runtime cpu detection files
#
define rtcd_h_template
@@ -80,6 +54,9 @@ CODEC_SRCS-yes += $(addprefix vpx_scale/,$(call enabled,SCALE_SRCS))
include $(SRC_PATH_BARE)/vpx_ports/vpx_ports.mk
CODEC_SRCS-yes += $(addprefix vpx_ports/,$(call enabled,PORTS_SRCS))
+include $(SRC_PATH_BARE)/vpx_dsp/vpx_dsp.mk
+CODEC_SRCS-yes += $(addprefix vpx_dsp/,$(call enabled,DSP_SRCS))
+
ifneq ($(CONFIG_VP8_ENCODER)$(CONFIG_VP8_DECODER),)
VP8_PREFIX=vp8/
include $(SRC_PATH_BARE)/$(VP8_PREFIX)vp8_common.mk
@@ -205,33 +182,13 @@ INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(call enabled,CODEC_EXPORTS)
# based build systems.
libvpx_srcs.txt:
@echo " [CREATE] $@"
- @echo $(CODEC_SRCS) | xargs -n1 echo | sort -u > $@
+ @echo $(CODEC_SRCS) | xargs -n1 echo | LC_ALL=C sort -u > $@
CLEAN-OBJS += libvpx_srcs.txt
ifeq ($(CONFIG_EXTERNAL_BUILD),yes)
ifeq ($(CONFIG_MSVS),yes)
-obj_int_extract.bat: $(SRC_PATH_BARE)/build/$(MSVS_ARCH_DIR)/obj_int_extract.bat
- @cp $^ $@
-
-obj_int_extract.$(VCPROJ_SFX): obj_int_extract.bat
-obj_int_extract.$(VCPROJ_SFX): $(SRC_PATH_BARE)/build/make/obj_int_extract.c
- @echo " [CREATE] $@"
- $(qexec)$(GEN_VCPROJ) \
- --exe \
- --target=$(TOOLCHAIN) \
- --name=obj_int_extract \
- --ver=$(CONFIG_VS_VERSION) \
- --proj-guid=E1360C65-D375-4335-8057-7ED99CC3F9B2 \
- --src-path-bare="$(SRC_PATH_BARE)" \
- $(if $(CONFIG_STATIC_MSVCRT),--static-crt) \
- --out=$@ $^ \
- -I. \
- -I"$(SRC_PATH_BARE)" \
-
-PROJECTS-$(BUILD_LIBVPX) += obj_int_extract.$(VCPROJ_SFX)
-
vpx.def: $(call enabled,CODEC_EXPORTS)
@echo " [CREATE] $@"
$(qexec)$(SRC_PATH_BARE)/build/make/gen_msvs_def.sh\
@@ -246,7 +203,7 @@ ASM_INCLUDES := \
vpx_config.asm \
vpx_ports/x86_abi_support.asm \
-vpx.$(VCPROJ_SFX): $(CODEC_SRCS) vpx.def obj_int_extract.$(VCPROJ_SFX)
+vpx.$(VCPROJ_SFX): $(CODEC_SRCS) vpx.def
@echo " [CREATE] $@"
$(qexec)$(GEN_VCPROJ) \
$(if $(CONFIG_SHARED),--dll,--lib) \
@@ -276,25 +233,27 @@ $(BUILD_PFX)libvpx_g.a: $(LIBVPX_OBJS)
BUILD_LIBVPX_SO := $(if $(BUILD_LIBVPX),$(CONFIG_SHARED))
+SO_VERSION_MAJOR := 2
+SO_VERSION_MINOR := 0
+SO_VERSION_PATCH := 0
ifeq ($(filter darwin%,$(TGT_OS)),$(TGT_OS))
-LIBVPX_SO := libvpx.$(VERSION_MAJOR).dylib
+LIBVPX_SO := libvpx.$(SO_VERSION_MAJOR).dylib
EXPORT_FILE := libvpx.syms
LIBVPX_SO_SYMLINKS := $(addprefix $(LIBSUBDIR)/, \
libvpx.dylib )
else
-LIBVPX_SO := libvpx.so.$(VERSION_MAJOR).$(VERSION_MINOR).$(VERSION_PATCH)
+LIBVPX_SO := libvpx.so.$(SO_VERSION_MAJOR).$(SO_VERSION_MINOR).$(SO_VERSION_PATCH)
EXPORT_FILE := libvpx.ver
-SYM_LINK := libvpx.so
LIBVPX_SO_SYMLINKS := $(addprefix $(LIBSUBDIR)/, \
- libvpx.so libvpx.so.$(VERSION_MAJOR) \
- libvpx.so.$(VERSION_MAJOR).$(VERSION_MINOR))
+ libvpx.so libvpx.so.$(SO_VERSION_MAJOR) \
+ libvpx.so.$(SO_VERSION_MAJOR).$(SO_VERSION_MINOR))
endif
LIBS-$(BUILD_LIBVPX_SO) += $(BUILD_PFX)$(LIBVPX_SO)\
$(notdir $(LIBVPX_SO_SYMLINKS))
$(BUILD_PFX)$(LIBVPX_SO): $(LIBVPX_OBJS) $(EXPORT_FILE)
$(BUILD_PFX)$(LIBVPX_SO): extralibs += -lm
-$(BUILD_PFX)$(LIBVPX_SO): SONAME = libvpx.so.$(VERSION_MAJOR)
+$(BUILD_PFX)$(LIBVPX_SO): SONAME = libvpx.so.$(SO_VERSION_MAJOR)
$(BUILD_PFX)$(LIBVPX_SO): EXPORTS_FILE = $(EXPORT_FILE)
libvpx.ver: $(call enabled,CODEC_EXPORTS)
@@ -377,7 +336,7 @@ CLEAN-OBJS += $(BUILD_PFX)vpx_config.asm
endif
#
-# Add assembler dependencies for configuration and offsets
+# Add assembler dependencies for configuration.
#
$(filter %.s.o,$(OBJS-yes)): $(BUILD_PFX)vpx_config.asm
$(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)vpx_config.asm
@@ -402,7 +361,7 @@ libvpx_test_data_url=http://downloads.webmproject.org/test_data/libvpx/$(1)
libvpx_test_srcs.txt:
@echo " [CREATE] $@"
- @echo $(LIBVPX_TEST_SRCS) | xargs -n1 echo | sort -u > $@
+ @echo $(LIBVPX_TEST_SRCS) | xargs -n1 echo | LC_ALL=C sort -u > $@
CLEAN-OBJS += libvpx_test_srcs.txt
$(LIBVPX_TEST_DATA): $(SRC_PATH_BARE)/test/test-data.sha1
@@ -535,7 +494,11 @@ libs.doxy: $(CODEC_DOC_SRCS)
@echo "ENABLED_SECTIONS += $(sort $(CODEC_DOC_SECTIONS))" >> $@
## Generate rtcd.h for all objects
+ifeq ($(CONFIG_DEPENDENCY_TRACKING),yes)
$(OBJS-yes:.o=.d): $(RTCD)
+else
+$(OBJS-yes): $(RTCD)
+endif
## Update the global src list
SRCS += $(CODEC_SRCS) $(LIBVPX_TEST_SRCS) $(GTEST_SRCS)
diff --git a/chromium/third_party/libvpx/source/libvpx/mainpage.dox b/chromium/third_party/libvpx/source/libvpx/mainpage.dox
index e2ec280027e..ec202fa4fb5 100644
--- a/chromium/third_party/libvpx/source/libvpx/mainpage.dox
+++ b/chromium/third_party/libvpx/source/libvpx/mainpage.dox
@@ -1,4 +1,4 @@
-/*!\mainpage WebM VP8 Codec SDK
+/*!\mainpage WebM Codec SDK
\section main_contents Page Contents
- \ref main_intro
@@ -6,11 +6,11 @@
- \ref main_support
\section main_intro Introduction
- Welcome to the WebM VP8 Codec SDK. This SDK allows you to integrate your
- applications with the VP8 video codec, a high quality, royalty free, open
- source codec deployed on millions of computers and devices worldwide.
+ Welcome to the WebM Codec SDK. This SDK allows you to integrate your
+ applications with the VP8 and VP9 video codecs, high quality, royalty free,
+ open source codecs deployed on billions of computers and devices worldwide.
- This distribution of the WebM VP8 Codec SDK includes the following support:
+ This distribution of the WebM Codec SDK includes the following support:
\if vp8_encoder
- \ref vp8_encoder
@@ -28,12 +28,12 @@
- Read the \ref samples "sample code" for examples of how to interact with the
codec.
- \ref codec reference
- \if encoder
- - \ref encoder reference
- \endif
- \if decoder
- - \ref decoder reference
- \endif
+ \if encoder
+ - \ref encoder reference
+ \endif
+ \if decoder
+ - \ref decoder reference
+ \endif
\section main_support Support Options & FAQ
The WebM project is an open source project supported by its community. For
diff --git a/chromium/third_party/libvpx/source/libvpx/solution.mk b/chromium/third_party/libvpx/source/libvpx/solution.mk
index 2c8d29a2a1e..145adc0ddad 100644
--- a/chromium/third_party/libvpx/source/libvpx/solution.mk
+++ b/chromium/third_party/libvpx/source/libvpx/solution.mk
@@ -9,7 +9,7 @@
##
# libvpx reverse dependencies (targets that depend on libvpx)
-VPX_NONDEPS=$(addsuffix .$(VCPROJ_SFX),vpx gtest obj_int_extract)
+VPX_NONDEPS=$(addsuffix .$(VCPROJ_SFX),vpx gtest)
VPX_RDEPS=$(foreach vcp,\
$(filter-out $(VPX_NONDEPS),$^), --dep=$(vcp:.$(VCPROJ_SFX)=):vpx)
@@ -17,7 +17,6 @@ vpx.sln: $(wildcard *.$(VCPROJ_SFX))
@echo " [CREATE] $@"
$(SRC_PATH_BARE)/build/make/gen_msvs_sln.sh \
$(if $(filter vpx.$(VCPROJ_SFX),$^),$(VPX_RDEPS)) \
- --dep=vpx:obj_int_extract \
--dep=test_libvpx:gtest \
--ver=$(CONFIG_VS_VERSION)\
--out=$@ $^
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/README.libvpx b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/README.libvpx
index 3869d25bc40..6a5b60c063b 100644
--- a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/README.libvpx
+++ b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/README.libvpx
@@ -1,6 +1,6 @@
Name: libyuv
URL: http://code.google.com/p/libyuv/
-Version: 1060
+Version: 1305
License: BSD
License File: LICENSE
@@ -13,4 +13,4 @@ which down-samples the original input video (f.g. 1280x720) a number of times
in order to encode multiple resolution bit streams.
Local Modifications:
-cherry-pick 'Issue 24479004: Fix building with MSVC for arm'
+cherry pick r1311 'disable nv12 avx2 for vs9/10 that dont support avx2 instructions.'
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/include/libyuv/compare.h b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/include/libyuv/compare.h
index 5dfac7c86aa..08b2bb2ecf4 100644
--- a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/include/libyuv/compare.h
+++ b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/include/libyuv/compare.h
@@ -22,6 +22,11 @@ extern "C" {
LIBYUV_API
uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed);
+// Scan an opaque argb image and return fourcc based on alpha offset.
+// Returns FOURCC_ARGB, FOURCC_BGRA, or 0 if unknown.
+LIBYUV_API
+uint32 ARGBDetect(const uint8* argb, int stride_argb, int width, int height);
+
// Sum Square Error - used to compute Mean Square Error or PSNR.
LIBYUV_API
uint64 ComputeSumSquareError(const uint8* src_a,
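ComputeSumSquareError above pairs naturally with the usual 8-bit PSNR formula, PSNR = 10 * log10(255^2 * count / SSE). A minimal sketch of that conversion; sse_to_psnr here is a local helper for illustration, not part of the libyuv API:

#include <math.h>

/* Convert a sum of squared errors over `count` 8-bit samples to PSNR in dB.
 * Returns a conventional cap for identical buffers (sse == 0). */
static double sse_to_psnr(double count, double sse) {
  if (sse == 0.0) return 99.0;
  return 10.0 * log10(255.0 * 255.0 * count / sse);
}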
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/include/libyuv/convert.h b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/include/libyuv/convert.h
index 1bd45c837f1..97936adb27d 100644
--- a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/include/libyuv/convert.h
+++ b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/include/libyuv/convert.h
@@ -113,15 +113,6 @@ int M420ToI420(const uint8* src_m420, int src_stride_m420,
uint8* dst_v, int dst_stride_v,
int width, int height);
-// Convert Q420 to I420.
-LIBYUV_API
-int Q420ToI420(const uint8* src_y, int src_stride_y,
- const uint8* src_yuy2, int src_stride_yuy2,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height);
-
// ARGB little endian (bgra in memory) to I420.
LIBYUV_API
int ARGBToI420(const uint8* src_frame, int src_stride_frame,
@@ -211,8 +202,6 @@ int MJPGSize(const uint8* sample, size_t sample_size,
int* width, int* height);
#endif
-// Note Bayer formats (BGGR) To I420 are in format_conversion.h
-
// Convert camera sample to I420 with cropping, rotation and vertical flip.
// "src_size" is needed to parse MJPG.
// "dst_stride_y" number of bytes in a row of the dst_y plane.
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/include/libyuv/convert_argb.h b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/include/libyuv/convert_argb.h
index a18014ca2c8..a0de89efd99 100644
--- a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/include/libyuv/convert_argb.h
+++ b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/include/libyuv/convert_argb.h
@@ -18,7 +18,6 @@
#include "libyuv/rotate.h"
// TODO(fbarchard): This set of functions should exactly match convert.h
-// Add missing Q420.
// TODO(fbarchard): Add tests. Create random content of right size and convert
// with C vs Opt and or to I420 and compare.
// TODO(fbarchard): Some of these functions lack parameter setting.
@@ -104,13 +103,6 @@ int M420ToARGB(const uint8* src_m420, int src_stride_m420,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
-// TODO(fbarchard): Convert Q420 to ARGB.
-// LIBYUV_API
-// int Q420ToARGB(const uint8* src_y, int src_stride_y,
-// const uint8* src_yuy2, int src_stride_yuy2,
-// uint8* dst_argb, int dst_stride_argb,
-// int width, int height);
-
// Convert YUY2 to ARGB.
LIBYUV_API
int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2,
@@ -123,6 +115,22 @@ int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
+// Convert J420 to ARGB.
+LIBYUV_API
+int J420ToARGB(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Convert J422 to ARGB.
+LIBYUV_API
+int J422ToARGB(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
// BGRA little endian (argb in memory) to ARGB.
LIBYUV_API
int BGRAToARGB(const uint8* src_frame, int src_stride_frame,
@@ -184,8 +192,6 @@ int MJPGToARGB(const uint8* sample, size_t sample_size,
int dst_width, int dst_height);
#endif
-// Note Bayer formats (BGGR) to ARGB are in format_conversion.h.
-
// Convert camera sample to ARGB with cropping, rotation and vertical flip.
// "src_size" is needed to parse MJPG.
// "dst_stride_argb" number of bytes in a row of the dst_argb plane.
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/include/libyuv/convert_from.h b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/include/libyuv/convert_from.h
index b1cf57f7dc0..d6c0e3d8703 100644
--- a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/include/libyuv/convert_from.h
+++ b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/include/libyuv/convert_from.h
@@ -57,7 +57,6 @@ int I400Copy(const uint8* src_y, int src_stride_y,
int width, int height);
// TODO(fbarchard): I420ToM420
-// TODO(fbarchard): I420ToQ420
LIBYUV_API
int I420ToNV12(const uint8* src_y, int src_stride_y,
@@ -152,8 +151,6 @@ int I420ToARGB4444(const uint8* src_y, int src_stride_y,
uint8* dst_frame, int dst_stride_frame,
int width, int height);
-// Note Bayer formats (BGGR) To I420 are in format_conversion.h.
-
// Convert I420 to specified format.
// "dst_sample_stride" is bytes in a row for the destination. Pass 0 if the
// buffer has contiguous rows. Can be negative. A multiple of 16 is optimal.
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/include/libyuv/convert_from_argb.h b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/include/libyuv/convert_from_argb.h
index 90f43af04c3..c592fc2353e 100644
--- a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/include/libyuv/convert_from_argb.h
+++ b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/include/libyuv/convert_from_argb.h
@@ -61,6 +61,13 @@ int ARGBToRGB565(const uint8* src_argb, int src_stride_argb,
uint8* dst_rgb565, int dst_stride_rgb565,
int width, int height);
+// Convert ARGB To RGB565 with 8x8 dither matrix (64 bytes).
+// Values in dither matrix from 0 to 255. 128 is best for no dither.
+LIBYUV_API
+int ARGBToRGB565Dither(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_rgb565, int dst_stride_rgb565,
+ const uint8* dither8x8, int width, int height);
+
// Convert ARGB To ARGB1555.
LIBYUV_API
int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb,
@@ -105,6 +112,14 @@ int ARGBToJ420(const uint8* src_argb, int src_stride_argb,
uint8* dst_v, int dst_stride_v,
int width, int height);
+// Convert ARGB to J422.
+LIBYUV_API
+int ARGBToJ422(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_yj, int dst_stride_yj,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
// Convert ARGB To I411.
LIBYUV_API
int ARGBToI411(const uint8* src_argb, int src_stride_argb,
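Per the comment on ARGBToRGB565Dither above, a dither value of 128 is neutral, and any 8x8 byte matrix can be supplied. A hedged sketch that passes a neutral matrix through the signature declared above (the wrapper name and packed strides are illustrative):

#include <string.h>
#include "libyuv/convert_from_argb.h"

/* Convert ARGB to RGB565 with a neutral dither matrix (all 128 = no dither).
 * Substitute an ordered-dither pattern in dither8x8 to reduce banding. */
static int argb_to_rgb565_neutral(const uint8* argb, int width, int height,
                                  uint8* rgb565) {
  uint8 dither8x8[64];
  memset(dither8x8, 128, sizeof(dither8x8)); /* 128 == no dithering */
  return ARGBToRGB565Dither(argb, width * 4,
                            rgb565, width * 2,
                            dither8x8, width, height);
}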
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/include/libyuv/format_conversion.h b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/include/libyuv/format_conversion.h
deleted file mode 100644
index b18bf053438..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/include/libyuv/format_conversion.h
+++ /dev/null
@@ -1,168 +0,0 @@
-/*
- * Copyright 2011 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef INCLUDE_LIBYUV_FORMATCONVERSION_H_ // NOLINT
-#define INCLUDE_LIBYUV_FORMATCONVERSION_H_
-
-#include "libyuv/basic_types.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// Convert Bayer RGB formats to I420.
-LIBYUV_API
-int BayerBGGRToI420(const uint8* src_bayer, int src_stride_bayer,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height);
-
-LIBYUV_API
-int BayerGBRGToI420(const uint8* src_bayer, int src_stride_bayer,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height);
-
-LIBYUV_API
-int BayerGRBGToI420(const uint8* src_bayer, int src_stride_bayer,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height);
-
-LIBYUV_API
-int BayerRGGBToI420(const uint8* src_bayer, int src_stride_bayer,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height);
-
-// Temporary API mapper.
-#define BayerRGBToI420(b, bs, f, y, ys, u, us, v, vs, w, h) \
- BayerToI420(b, bs, y, ys, u, us, v, vs, w, h, f)
-
-LIBYUV_API
-int BayerToI420(const uint8* src_bayer, int src_stride_bayer,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height,
- uint32 src_fourcc_bayer);
-
-// Convert I420 to Bayer RGB formats.
-LIBYUV_API
-int I420ToBayerBGGR(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_frame, int dst_stride_frame,
- int width, int height);
-
-LIBYUV_API
-int I420ToBayerGBRG(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_frame, int dst_stride_frame,
- int width, int height);
-
-LIBYUV_API
-int I420ToBayerGRBG(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_frame, int dst_stride_frame,
- int width, int height);
-
-LIBYUV_API
-int I420ToBayerRGGB(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_frame, int dst_stride_frame,
- int width, int height);
-
-// Temporary API mapper.
-#define I420ToBayerRGB(y, ys, u, us, v, vs, b, bs, f, w, h) \
- I420ToBayer(y, ys, u, us, v, vs, b, bs, w, h, f)
-
-LIBYUV_API
-int I420ToBayer(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_frame, int dst_stride_frame,
- int width, int height,
- uint32 dst_fourcc_bayer);
-
-// Convert Bayer RGB formats to ARGB.
-LIBYUV_API
-int BayerBGGRToARGB(const uint8* src_bayer, int src_stride_bayer,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height);
-
-LIBYUV_API
-int BayerGBRGToARGB(const uint8* src_bayer, int src_stride_bayer,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height);
-
-LIBYUV_API
-int BayerGRBGToARGB(const uint8* src_bayer, int src_stride_bayer,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height);
-
-LIBYUV_API
-int BayerRGGBToARGB(const uint8* src_bayer, int src_stride_bayer,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height);
-
-// Temporary API mapper.
-#define BayerRGBToARGB(b, bs, f, a, as, w, h) BayerToARGB(b, bs, a, as, w, h, f)
-
-LIBYUV_API
-int BayerToARGB(const uint8* src_bayer, int src_stride_bayer,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height,
- uint32 src_fourcc_bayer);
-
-// Converts ARGB to Bayer RGB formats.
-LIBYUV_API
-int ARGBToBayerBGGR(const uint8* src_argb, int src_stride_argb,
- uint8* dst_bayer, int dst_stride_bayer,
- int width, int height);
-
-LIBYUV_API
-int ARGBToBayerGBRG(const uint8* src_argb, int src_stride_argb,
- uint8* dst_bayer, int dst_stride_bayer,
- int width, int height);
-
-LIBYUV_API
-int ARGBToBayerGRBG(const uint8* src_argb, int src_stride_argb,
- uint8* dst_bayer, int dst_stride_bayer,
- int width, int height);
-
-LIBYUV_API
-int ARGBToBayerRGGB(const uint8* src_argb, int src_stride_argb,
- uint8* dst_bayer, int dst_stride_bayer,
- int width, int height);
-
-// Temporary API mapper.
-#define ARGBToBayerRGB(a, as, b, bs, f, w, h) ARGBToBayer(b, bs, a, as, w, h, f)
-
-LIBYUV_API
-int ARGBToBayer(const uint8* src_argb, int src_stride_argb,
- uint8* dst_bayer, int dst_stride_bayer,
- int width, int height,
- uint32 dst_fourcc_bayer);
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
-
-#endif // INCLUDE_LIBYUV_FORMATCONVERSION_H_ NOLINT
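
The hunk above deletes libyuv's entire public Bayer API (format_conversion.h with the Bayer<->I420/ARGB entry points), so callers must now demosaic before handing frames to libyuv. A minimal nearest-neighbour sketch of a caller-side replacement for GRBG input follows; everything in it is an assumption for illustration. It assumes even dimensions and emits a half-resolution frame (one ARGB pixel per 2x2 Bayer cell), whereas a real replacement would interpolate to full resolution:

    #include <stdint.h>

    /* One ARGB pixel per 2x2 Bayer cell (GRBG layout: G R on the even row,
     * B G on the odd row). Output is (width/2) x (height/2) pixels. */
    static void DemosaicGRBGToARGB(const uint8_t* bayer, int bayer_stride,
                                   uint8_t* argb, int argb_stride,
                                   int width, int height) {
      for (int y = 0; y < height; y += 2) {
        const uint8_t* row0 = bayer + y * bayer_stride;
        const uint8_t* row1 = row0 + bayer_stride;
        uint8_t* out = argb + (y / 2) * argb_stride;
        for (int x = 0; x < width; x += 2) {
          uint8_t g = row0[x];      /* green, top-left sample */
          uint8_t r = row0[x + 1];  /* red, top-right sample */
          uint8_t b = row1[x];      /* blue, bottom-left sample */
          out[0] = b;  /* libyuv "ARGB" byte order in memory: B, G, R, A */
          out[1] = g;
          out[2] = r;
          out[3] = 255;
          out += 4;
        }
      }
    }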
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/include/libyuv/row.h b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/include/libyuv/row.h
index 4b3c870f91c..80e844bae3c 100644
--- a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/include/libyuv/row.h
+++ b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/include/libyuv/row.h
@@ -15,10 +15,6 @@
#include "libyuv/basic_types.h"
-#if defined(__native_client__)
-#include "ppapi/c/pp_macros.h" // For PPAPI_RELEASE
-#endif
-
#ifdef __cplusplus
namespace libyuv {
extern "C" {
@@ -43,6 +39,7 @@ extern "C" {
#if defined(__pnacl__) || defined(__CLR_VER) || defined(COVERAGE_ENABLED) || \
defined(TARGET_IPHONE_SIMULATOR) || \
+ (defined(__i386__) && !defined(__SSE2__)) || \
(defined(_MSC_VER) && defined(__clang__))
#define LIBYUV_DISABLE_X86
#endif
@@ -51,48 +48,16 @@ extern "C" {
#define LIBYUV_SSSE3_ONLY
#endif
-// Enable for NaCL pepper 33 for bundle and AVX2 support.
-#if defined(__native_client__) && PPAPI_RELEASE >= 33
-#define NEW_BINUTILS
-#endif
-#if defined(__native_client__) && defined(__arm__) && PPAPI_RELEASE < 37
+// clang >= 3.5.0 required for Arm64.
+#if defined(__clang__) && defined(__aarch64__) && !defined(LIBYUV_DISABLE_NEON)
+#if (__clang_major__ < 3) || (__clang_major__ == 3 && (__clang_minor__ < 5))
#define LIBYUV_DISABLE_NEON
-#endif
+#endif // clang < 3.5
+#endif // __clang__
// The following are available on all x86 platforms:
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
-// Effects:
-#define HAS_ARGBADDROW_SSE2
-#define HAS_ARGBAFFINEROW_SSE2
-#define HAS_ARGBATTENUATEROW_SSSE3
-#define HAS_ARGBBLENDROW_SSSE3
-#define HAS_ARGBCOLORMATRIXROW_SSSE3
-#define HAS_ARGBCOLORTABLEROW_X86
-#define HAS_ARGBCOPYALPHAROW_SSE2
-#define HAS_ARGBCOPYYTOALPHAROW_SSE2
-#define HAS_ARGBGRAYROW_SSSE3
-#define HAS_ARGBLUMACOLORTABLEROW_SSSE3
-#define HAS_ARGBMIRRORROW_SSSE3
-#define HAS_ARGBMULTIPLYROW_SSE2
-#define HAS_ARGBPOLYNOMIALROW_SSE2
-#define HAS_ARGBQUANTIZEROW_SSE2
-#define HAS_ARGBSEPIAROW_SSSE3
-#define HAS_ARGBSHADEROW_SSE2
-#define HAS_ARGBSUBTRACTROW_SSE2
-#define HAS_ARGBTOUVROW_SSSE3
-#define HAS_ARGBUNATTENUATEROW_SSE2
-#define HAS_COMPUTECUMULATIVESUMROW_SSE2
-#define HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
-#define HAS_INTERPOLATEROW_SSE2
-#define HAS_INTERPOLATEROW_SSSE3
-#define HAS_RGBCOLORTABLEROW_X86
-#define HAS_SOBELROW_SSE2
-#define HAS_SOBELTOPLANEROW_SSE2
-#define HAS_SOBELXROW_SSE2
-#define HAS_SOBELXYROW_SSE2
-#define HAS_SOBELYROW_SSE2
-
// Conversions:
#define HAS_ABGRTOUVROW_SSSE3
#define HAS_ABGRTOYROW_SSSE3
@@ -103,24 +68,21 @@ extern "C" {
#define HAS_ARGBTOARGB1555ROW_SSE2
#define HAS_ARGBTOARGB4444ROW_SSE2
#define HAS_ARGBTOBAYERGGROW_SSE2
-#define HAS_ARGBTOBAYERROW_SSSE3
#define HAS_ARGBTORAWROW_SSSE3
#define HAS_ARGBTORGB24ROW_SSSE3
#define HAS_ARGBTORGB565ROW_SSE2
#define HAS_ARGBTOUV422ROW_SSSE3
#define HAS_ARGBTOUV444ROW_SSSE3
#define HAS_ARGBTOUVJROW_SSSE3
+#define HAS_ARGBTOUVROW_SSSE3
#define HAS_ARGBTOYJROW_SSSE3
#define HAS_ARGBTOYROW_SSSE3
#define HAS_BGRATOUVROW_SSSE3
#define HAS_BGRATOYROW_SSSE3
#define HAS_COPYROW_ERMS
#define HAS_COPYROW_SSE2
-#define HAS_COPYROW_X86
-#define HAS_HALFROW_SSE2
#define HAS_I400TOARGBROW_SSE2
#define HAS_I411TOARGBROW_SSSE3
-#define HAS_I422TOARGB1555ROW_SSSE3
#define HAS_I422TOABGRROW_SSSE3
#define HAS_I422TOARGB1555ROW_SSSE3
#define HAS_I422TOARGB4444ROW_SSSE3
@@ -133,6 +95,7 @@ extern "C" {
#define HAS_I422TOUYVYROW_SSE2
#define HAS_I422TOYUY2ROW_SSE2
#define HAS_I444TOARGBROW_SSSE3
+// #define HAS_J422TOARGBROW_SSSE3
#define HAS_MERGEUVROW_SSE2
#define HAS_MIRRORROW_SSE2
#define HAS_MIRRORROW_SSSE3
@@ -150,6 +113,8 @@ extern "C" {
#define HAS_RGBATOUVROW_SSSE3
#define HAS_RGBATOYROW_SSSE3
#define HAS_SETROW_X86
+#define HAS_SETROW_ERMS
+#define HAS_ARGBSETROW_X86
#define HAS_SPLITUVROW_SSE2
#define HAS_UYVYTOARGBROW_SSSE3
#define HAS_UYVYTOUV422ROW_SSE2
@@ -160,6 +125,36 @@ extern "C" {
#define HAS_YUY2TOUV422ROW_SSE2
#define HAS_YUY2TOUVROW_SSE2
#define HAS_YUY2TOYROW_SSE2
+
+// Effects:
+#define HAS_ARGBADDROW_SSE2
+#define HAS_ARGBAFFINEROW_SSE2
+#define HAS_ARGBATTENUATEROW_SSSE3
+#define HAS_ARGBBLENDROW_SSSE3
+#define HAS_ARGBCOLORMATRIXROW_SSSE3
+#define HAS_ARGBCOLORTABLEROW_X86
+#define HAS_ARGBCOPYALPHAROW_SSE2
+#define HAS_ARGBCOPYYTOALPHAROW_SSE2
+#define HAS_ARGBGRAYROW_SSSE3
+#define HAS_ARGBLUMACOLORTABLEROW_SSSE3
+#define HAS_ARGBMIRRORROW_SSE2
+#define HAS_ARGBMULTIPLYROW_SSE2
+#define HAS_ARGBPOLYNOMIALROW_SSE2
+#define HAS_ARGBQUANTIZEROW_SSE2
+#define HAS_ARGBSEPIAROW_SSSE3
+#define HAS_ARGBSHADEROW_SSE2
+#define HAS_ARGBSUBTRACTROW_SSE2
+#define HAS_ARGBUNATTENUATEROW_SSE2
+#define HAS_COMPUTECUMULATIVESUMROW_SSE2
+#define HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
+#define HAS_INTERPOLATEROW_SSE2
+#define HAS_INTERPOLATEROW_SSSE3
+#define HAS_RGBCOLORTABLEROW_X86
+#define HAS_SOBELROW_SSE2
+#define HAS_SOBELTOPLANEROW_SSE2
+#define HAS_SOBELXROW_SSE2
+#define HAS_SOBELXYROW_SSE2
+#define HAS_SOBELYROW_SSE2
#endif
// The following are available on x64 Visual C:
@@ -186,26 +181,39 @@ extern "C" {
#define VISUALC_HAS_AVX2 1
#endif // VisualStudio >= 2012
+// The following are available, but require VS2012. Port to GCC.
+#if !defined(LIBYUV_DISABLE_X86) && defined(VISUALC_HAS_AVX2)
+// TODO(fbarchard): fix AVX2 versions of YUV conversion. bug=393
+#define HAS_I422TOABGRROW_AVX2
+#define HAS_I422TOARGBROW_AVX2
+#define HAS_I422TOBGRAROW_AVX2
+#define HAS_I422TORGBAROW_AVX2
+#define HAS_NV12TOARGBROW_AVX2
+#define HAS_NV21TOARGBROW_AVX2
+#define HAS_ARGBTORGB565ROW_AVX2
+#define HAS_ARGBTOARGB1555ROW_AVX2
+#define HAS_ARGBTOARGB4444ROW_AVX2
+#define HAS_NV12TORGB565ROW_AVX2
+#define HAS_NV21TORGB565ROW_AVX2
+#define HAS_I422TORGB565ROW_AVX2
+#define HAS_I422TOARGB1555ROW_AVX2
+#define HAS_I422TOARGB4444ROW_AVX2
+#endif
+
// The following are available on all x86 platforms, but
// require VS2012, clang 3.4 or gcc 4.7.
// The code supports NaCL but requires a new compiler and validator.
#if !defined(LIBYUV_DISABLE_X86) && (defined(VISUALC_HAS_AVX2) || \
defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2))
-// Effects:
-#define HAS_ARGBPOLYNOMIALROW_AVX2
-#define HAS_ARGBSHUFFLEROW_AVX2
#define HAS_ARGBCOPYALPHAROW_AVX2
#define HAS_ARGBCOPYYTOALPHAROW_AVX2
-#endif
-
-// The following are require VS2012.
-// TODO(fbarchard): Port to gcc.
-#if !defined(LIBYUV_DISABLE_X86) && defined(VISUALC_HAS_AVX2)
+#define HAS_ARGBMIRRORROW_AVX2
+#define HAS_ARGBPOLYNOMIALROW_AVX2
+#define HAS_ARGBSHUFFLEROW_AVX2
#define HAS_ARGBTOUVROW_AVX2
#define HAS_ARGBTOYJROW_AVX2
#define HAS_ARGBTOYROW_AVX2
-#define HAS_HALFROW_AVX2
-#define HAS_I422TOARGBROW_AVX2
+#define HAS_COPYROW_AVX
#define HAS_INTERPOLATEROW_AVX2
#define HAS_MERGEUVROW_AVX2
#define HAS_MIRRORROW_AVX2
@@ -213,18 +221,25 @@ extern "C" {
#define HAS_UYVYTOUV422ROW_AVX2
#define HAS_UYVYTOUVROW_AVX2
#define HAS_UYVYTOYROW_AVX2
+#define HAS_YTOARGBROW_AVX2
#define HAS_YUY2TOUV422ROW_AVX2
#define HAS_YUY2TOUVROW_AVX2
#define HAS_YUY2TOYROW_AVX2
+// The following require HAS_I422TOARGBROW_AVX2
+#if defined(HAS_I422TOARGBROW_AVX2)
+#define HAS_YUY2TOARGBROW_AVX2
+#define HAS_UYVYTOARGBROW_AVX2
+#endif
+
// Effects:
#define HAS_ARGBADDROW_AVX2
#define HAS_ARGBATTENUATEROW_AVX2
-#define HAS_ARGBMIRRORROW_AVX2
#define HAS_ARGBMULTIPLYROW_AVX2
#define HAS_ARGBSUBTRACTROW_AVX2
#define HAS_ARGBUNATTENUATEROW_AVX2
-#endif // defined(VISUALC_HAS_AVX2)
+#endif
+
// The following are Yasm x86 only:
// TODO(fbarchard): Port AVX2 to inline.
@@ -245,106 +260,14 @@ extern "C" {
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) && \
!defined(LIBYUV_SSSE3_ONLY)
-#define HAS_ARGBBLENDROW_SSE2
#define HAS_ARGBATTENUATEROW_SSE2
+#define HAS_ARGBBLENDROW_SSE2
#define HAS_MIRRORROW_SSE2
#endif
-// The following are available on arm64 platforms:
-#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
-// #define HAS_I444TOARGBROW_NEON
-// #define HAS_I422TOARGBROW_NEON
-// #define HAS_I411TOARGBROW_NEON
-// #define HAS_I422TOBGRAROW_NEON
-// #define HAS_I422TOABGRROW_NEON
-// #define HAS_I422TORGBAROW_NEON
-// #define HAS_I422TORGB24ROW_NEON
-// #define HAS_I422TORAWROW_NEON
-// #define HAS_I422TORGB565ROW_NEON
-// #define HAS_I422TOARGB1555ROW_NEON
-// #define HAS_I422TOARGB4444ROW_NEON
-// #define HAS_YTOARGBROW_NEON
-// #define HAS_I400TOARGBROW_NEON
-// #define HAS_NV12TOARGBROW_NEON
-// #define HAS_NV21TOARGBROW_NEON
-// #define HAS_NV12TORGB565ROW_NEON
-// #define HAS_NV21TORGB565ROW_NEON
-// #define HAS_YUY2TOARGBROW_NEON
-// #define HAS_UYVYTOARGBROW_NEON
-#define HAS_SPLITUVROW_NEON
-#define HAS_MERGEUVROW_NEON
-#define HAS_COPYROW_NEON
-#define HAS_SETROW_NEON
-#define HAS_ARGBSETROWS_NEON
-#define HAS_MIRRORROW_NEON
-#define HAS_MIRRORUVROW_NEON
-#define HAS_ARGBMIRRORROW_NEON
-#define HAS_RGB24TOARGBROW_NEON
-#define HAS_RAWTOARGBROW_NEON
-// #define HAS_RGB565TOARGBROW_NEON
-// #define HAS_ARGB1555TOARGBROW_NEON
-// #define HAS_ARGB4444TOARGBROW_NEON
-#define HAS_ARGBTORGB24ROW_NEON
-#define HAS_ARGBTORAWROW_NEON
-#define HAS_YUY2TOYROW_NEON
-#define HAS_UYVYTOYROW_NEON
-#define HAS_YUY2TOUV422ROW_NEON
-#define HAS_UYVYTOUV422ROW_NEON
-#define HAS_YUY2TOUVROW_NEON
-#define HAS_UYVYTOUVROW_NEON
-#define HAS_HALFROW_NEON
-#define HAS_ARGBTOBAYERROW_NEON
-#define HAS_ARGBTOBAYERGGROW_NEON
-#define HAS_ARGBSHUFFLEROW_NEON
-#define HAS_I422TOYUY2ROW_NEON
-#define HAS_I422TOUYVYROW_NEON
-// #define HAS_ARGBTORGB565ROW_NEON
-// #define HAS_ARGBTOARGB1555ROW_NEON
-// #define HAS_ARGBTOARGB4444ROW_NEON
-#define HAS_ARGBTOYROW_NEON
-#define HAS_ARGBTOYJROW_NEON
-// #define HAS_ARGBTOUV444ROW_NEON
-// #define HAS_ARGBTOUV422ROW_NEON
-// #define HAS_ARGBTOUV411ROW_NEON
-// #define HAS_ARGBTOUVROW_NEON
-// #define HAS_ARGBTOUVJROW_NEON
-// #define HAS_BGRATOUVROW_NEON
-// #define HAS_ABGRTOUVROW_NEON
-// #define HAS_RGBATOUVROW_NEON
-// #define HAS_RGB24TOUVROW_NEON
-// #define HAS_RAWTOUVROW_NEON
-// #define HAS_RGB565TOUVROW_NEON
-// #define HAS_ARGB1555TOUVROW_NEON
-// #define HAS_ARGB4444TOUVROW_NEON
-// #define HAS_RGB565TOYROW_NEON
-// #define HAS_ARGB1555TOYROW_NEON
-// #define HAS_ARGB4444TOYROW_NEON
-// #define HAS_BGRATOYROW_NEON
-// #define HAS_ABGRTOYROW_NEON
-// #define HAS_RGBATOYROW_NEON
-// #define HAS_RGB24TOYROW_NEON
-// #define HAS_RAWTOYROW_NEON
-// #define HAS_INTERPOLATEROW_NEON
-// #define HAS_ARGBBLENDROW_NEON
-// #define HAS_ARGBATTENUATEROW_NEON
-// #define HAS_ARGBQUANTIZEROW_NEON
-// #define HAS_ARGBSHADEROW_NEON
-// #define HAS_ARGBGRAYROW_NEON
-// #define HAS_ARGBSEPIAROW_NEON
-// #define HAS_ARGBCOLORMATRIXROW_NEON
-#define HAS_ARGBMULTIPLYROW_NEON
-#define HAS_ARGBADDROW_NEON
-#define HAS_ARGBSUBTRACTROW_NEON
-#define HAS_SOBELROW_NEON
-#define HAS_SOBELTOPLANEROW_NEON
-#define HAS_SOBELXYROW_NEON
-#define HAS_SOBELXROW_NEON
-#define HAS_SOBELYROW_NEON
-#endif
-
// The following are available on Neon platforms:
#if !defined(LIBYUV_DISABLE_NEON) && \
- (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
+ (defined(__aarch64__) || defined(__ARM_NEON__) || defined(LIBYUV_NEON))
#define HAS_ABGRTOUVROW_NEON
#define HAS_ABGRTOYROW_NEON
#define HAS_ARGB1555TOARGBROW_NEON
@@ -355,7 +278,6 @@ extern "C" {
#define HAS_ARGB4444TOYROW_NEON
#define HAS_ARGBTOARGB1555ROW_NEON
#define HAS_ARGBTOARGB4444ROW_NEON
-#define HAS_ARGBTOBAYERROW_NEON
#define HAS_ARGBTOBAYERGGROW_NEON
#define HAS_ARGBTORAWROW_NEON
#define HAS_ARGBTORGB24ROW_NEON
@@ -363,14 +285,13 @@ extern "C" {
#define HAS_ARGBTOUV411ROW_NEON
#define HAS_ARGBTOUV422ROW_NEON
#define HAS_ARGBTOUV444ROW_NEON
-#define HAS_ARGBTOUVROW_NEON
#define HAS_ARGBTOUVJROW_NEON
-#define HAS_ARGBTOYROW_NEON
+#define HAS_ARGBTOUVROW_NEON
#define HAS_ARGBTOYJROW_NEON
+#define HAS_ARGBTOYROW_NEON
#define HAS_BGRATOUVROW_NEON
#define HAS_BGRATOYROW_NEON
#define HAS_COPYROW_NEON
-#define HAS_HALFROW_NEON
#define HAS_I400TOARGBROW_NEON
#define HAS_I411TOARGBROW_NEON
#define HAS_I422TOABGRROW_NEON
@@ -404,6 +325,7 @@ extern "C" {
#define HAS_RGBATOUVROW_NEON
#define HAS_RGBATOYROW_NEON
#define HAS_SETROW_NEON
+#define HAS_ARGBSETROW_NEON
#define HAS_SPLITUVROW_NEON
#define HAS_UYVYTOARGBROW_NEON
#define HAS_UYVYTOUV422ROW_NEON
@@ -426,25 +348,25 @@ extern "C" {
#define HAS_ARGBSEPIAROW_NEON
#define HAS_ARGBSHADEROW_NEON
#define HAS_ARGBSUBTRACTROW_NEON
+#define HAS_INTERPOLATEROW_NEON
#define HAS_SOBELROW_NEON
#define HAS_SOBELTOPLANEROW_NEON
-#define HAS_SOBELXYROW_NEON
#define HAS_SOBELXROW_NEON
+#define HAS_SOBELXYROW_NEON
#define HAS_SOBELYROW_NEON
-#define HAS_INTERPOLATEROW_NEON
-// TODO(fbarchard): Investigate neon unittest failure.
-// #define HAS_ARGBCOLORMATRIXROW_NEON
+#define HAS_ARGBCOLORMATRIXROW_NEON
+#define HAS_ARGBSHUFFLEROW_NEON
#endif
// The following are available on Mips platforms:
#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips__) && \
- (_MIPS_SIM == _MIPS_SIM_ABI32)
+ (_MIPS_SIM == _MIPS_SIM_ABI32) && (__mips_isa_rev < 6)
#define HAS_COPYROW_MIPS
#if defined(__mips_dsp) && (__mips_dsp_rev >= 2)
#define HAS_I422TOABGRROW_MIPS_DSPR2
#define HAS_I422TOARGBROW_MIPS_DSPR2
#define HAS_I422TOBGRAROW_MIPS_DSPR2
-#define HAS_INTERPOLATEROWS_MIPS_DSPR2
+#define HAS_INTERPOLATEROW_MIPS_DSPR2
#define HAS_MIRRORROW_MIPS_DSPR2
#define HAS_MIRRORUVROW_MIPS_DSPR2
#define HAS_SPLITUVROW_MIPS_DSPR2
@@ -453,6 +375,7 @@ extern "C" {
#if defined(_MSC_VER) && !defined(__CLR_VER)
#define SIMD_ALIGNED(var) __declspec(align(16)) var
+#define SIMD_ALIGNED32(var) __declspec(align(64)) var
typedef __declspec(align(16)) int16 vec16[8];
typedef __declspec(align(16)) int32 vec32[4];
typedef __declspec(align(16)) int8 vec8[16];
@@ -469,20 +392,34 @@ typedef __declspec(align(32)) uint8 ulvec8[32];
#elif defined(__GNUC__)
// Caveat GCC 4.2 to 4.7 have a known issue using vectors with const.
#define SIMD_ALIGNED(var) var __attribute__((aligned(16)))
+#define SIMD_ALIGNED32(var) var __attribute__((aligned(64)))
typedef int16 __attribute__((vector_size(16))) vec16;
typedef int32 __attribute__((vector_size(16))) vec32;
typedef int8 __attribute__((vector_size(16))) vec8;
typedef uint16 __attribute__((vector_size(16))) uvec16;
typedef uint32 __attribute__((vector_size(16))) uvec32;
typedef uint8 __attribute__((vector_size(16))) uvec8;
+typedef int16 __attribute__((vector_size(32))) lvec16;
+typedef int32 __attribute__((vector_size(32))) lvec32;
+typedef int8 __attribute__((vector_size(32))) lvec8;
+typedef uint16 __attribute__((vector_size(32))) ulvec16;
+typedef uint32 __attribute__((vector_size(32))) ulvec32;
+typedef uint8 __attribute__((vector_size(32))) ulvec8;
#else
#define SIMD_ALIGNED(var) var
+#define SIMD_ALIGNED32(var) var
typedef int16 vec16[8];
typedef int32 vec32[4];
typedef int8 vec8[16];
typedef uint16 uvec16[8];
typedef uint32 uvec32[4];
typedef uint8 uvec8[16];
+typedef int16 lvec16[16];
+typedef int32 lvec32[8];
+typedef int8 lvec8[32];
+typedef uint16 ulvec16[16];
+typedef uint32 ulvec32[8];
+typedef uint8 ulvec8[32];
#endif
#if defined(__APPLE__) || defined(__x86_64__) || defined(__llvm__)
@@ -492,24 +429,16 @@ typedef uint8 uvec8[16];
#endif
// NaCL macros for GCC x86 and x64.
-
-// TODO(nfullagar): When pepper_33 toolchain is distributed, default to
-// NEW_BINUTILS and remove all BUNDLEALIGN occurances.
#if defined(__native_client__)
#define LABELALIGN ".p2align 5\n"
#else
-#define LABELALIGN ".p2align 2\n"
+#define LABELALIGN
#endif
#if defined(__native_client__) && defined(__x86_64__)
-#if defined(NEW_BINUTILS)
+// r14 is used for MEMOP macros.
+#define NACL_R14 "r14",
#define BUNDLELOCK ".bundle_lock\n"
#define BUNDLEUNLOCK ".bundle_unlock\n"
-#define BUNDLEALIGN "\n"
-#else
-#define BUNDLELOCK "\n"
-#define BUNDLEUNLOCK "\n"
-#define BUNDLEALIGN ".p2align 5\n"
-#endif
#define MEMACCESS(base) "%%nacl:(%%r15,%q" #base ")"
#define MEMACCESS2(offset, base) "%%nacl:" #offset "(%%r15,%q" #base ")"
#define MEMLEA(offset, base) #offset "(%q" #base ")"
@@ -534,8 +463,19 @@ typedef uint8 uvec8[16];
"lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \
#opcode " (%%r15,%%r14),%" #arg "\n" \
BUNDLEUNLOCK
+#define VMEMOPREG(opcode, offset, base, index, scale, reg1, reg2) \
+ BUNDLELOCK \
+ "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \
+ #opcode " (%%r15,%%r14),%%" #reg1 ",%%" #reg2 "\n" \
+ BUNDLEUNLOCK
+#define VEXTOPMEM(op, sel, reg, offset, base, index, scale) \
+ BUNDLELOCK \
+ "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \
+ #op " $" #sel ",%%" #reg ",(%%r15,%%r14)\n" \
+ BUNDLEUNLOCK
#else // defined(__native_client__) && defined(__x86_64__)
-#define BUNDLEALIGN "\n"
+#define NACL_R14
+#define BUNDLEALIGN
#define MEMACCESS(base) "(%" #base ")"
#define MEMACCESS2(offset, base) #offset "(%" #base ")"
#define MEMLEA(offset, base) #offset "(%" #base ")"
@@ -551,14 +491,19 @@ typedef uint8 uvec8[16];
#opcode " %%" #reg ","#offset "(%" #base ",%" #index "," #scale ")\n"
#define MEMOPARG(opcode, offset, base, index, scale, arg) \
#opcode " " #offset "(%" #base ",%" #index "," #scale "),%" #arg "\n"
+#define VMEMOPREG(opcode, offset, base, index, scale, reg1, reg2) \
+ #opcode " " #offset "(%" #base ",%" #index "," #scale "),%%" #reg1 ",%%" \
+ #reg2 "\n"
+#define VEXTOPMEM(op, sel, reg, offset, base, index, scale) \
+ #op " $" #sel ",%%" #reg ","#offset "(%" #base ",%" #index "," #scale ")\n"
#endif // defined(__native_client__) && defined(__x86_64__)
#if defined(__arm__) || defined(__aarch64__)
#undef MEMACCESS
#if defined(__native_client__)
-#define MEMACCESS(base) ".p2align 3\nbic %" #base ", #0xc0000000\n"
+#define MEMACCESS(base) ".p2align 3\nbic %" #base ", #0xc0000000\n"
#else
-#define MEMACCESS(base) "\n"
+#define MEMACCESS(base)
#endif
#endif
@@ -651,13 +596,6 @@ void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix);
void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix);
void RGB24ToYRow_SSSE3(const uint8* src_rgb24, uint8* dst_y, int pix);
void RAWToYRow_SSSE3(const uint8* src_raw, uint8* dst_y, int pix);
-void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
-void ARGBToYJRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
-void BGRAToYRow_Unaligned_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix);
-void ABGRToYRow_Unaligned_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix);
-void RGBAToYRow_Unaligned_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix);
-void RGB24ToYRow_Unaligned_SSSE3(const uint8* src_rgb24, uint8* dst_y, int pix);
-void RAWToYRow_Unaligned_SSSE3(const uint8* src_raw, uint8* dst_y, int pix);
void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix);
void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix);
void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
@@ -736,16 +674,6 @@ void ABGRToUVRow_SSSE3(const uint8* src_abgr, int src_stride_abgr,
uint8* dst_u, uint8* dst_v, int width);
void RGBAToUVRow_SSSE3(const uint8* src_rgba, int src_stride_rgba,
uint8* dst_u, uint8* dst_v, int width);
-void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width);
-void ARGBToUVJRow_Unaligned_SSSE3(const uint8* src_argb, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width);
-void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_bgra, int src_stride_bgra,
- uint8* dst_u, uint8* dst_v, int width);
-void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_abgr, int src_stride_abgr,
- uint8* dst_u, uint8* dst_v, int width);
-void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_rgba, int src_stride_rgba,
- uint8* dst_u, uint8* dst_v, int width);
void ARGBToUVRow_Any_SSSE3(const uint8* src_argb, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
void ARGBToUVJRow_Any_SSSE3(const uint8* src_argb, int src_stride_argb,
@@ -807,15 +735,11 @@ void ARGB4444ToUVRow_C(const uint8* src_argb4444, int src_stride_argb4444,
void ARGBToUV444Row_SSSE3(const uint8* src_argb,
uint8* dst_u, uint8* dst_v, int width);
-void ARGBToUV444Row_Unaligned_SSSE3(const uint8* src_argb,
- uint8* dst_u, uint8* dst_v, int width);
void ARGBToUV444Row_Any_SSSE3(const uint8* src_argb,
uint8* dst_u, uint8* dst_v, int width);
void ARGBToUV422Row_SSSE3(const uint8* src_argb,
uint8* dst_u, uint8* dst_v, int width);
-void ARGBToUV422Row_Unaligned_SSSE3(const uint8* src_argb,
- uint8* dst_u, uint8* dst_v, int width);
void ARGBToUV422Row_Any_SSSE3(const uint8* src_argb,
uint8* dst_u, uint8* dst_v, int width);
@@ -825,6 +749,8 @@ void ARGBToUV422Row_C(const uint8* src_argb,
uint8* dst_u, uint8* dst_v, int width);
void ARGBToUV411Row_C(const uint8* src_argb,
uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUVJ422Row_C(const uint8* src_argb,
+ uint8* dst_u, uint8* dst_v, int width);
void MirrorRow_AVX2(const uint8* src, uint8* dst, int width);
void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width);
@@ -832,6 +758,10 @@ void MirrorRow_SSE2(const uint8* src, uint8* dst, int width);
void MirrorRow_NEON(const uint8* src, uint8* dst, int width);
void MirrorRow_MIPS_DSPR2(const uint8* src, uint8* dst, int width);
void MirrorRow_C(const uint8* src, uint8* dst, int width);
+void MirrorRow_Any_AVX2(const uint8* src, uint8* dst, int width);
+void MirrorRow_Any_SSSE3(const uint8* src, uint8* dst, int width);
+void MirrorRow_Any_SSE2(const uint8* src, uint8* dst, int width);
+void MirrorRow_Any_NEON(const uint8* src, uint8* dst, int width);
void MirrorUVRow_SSSE3(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
int width);
@@ -843,9 +773,12 @@ void MirrorUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
int width);
void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width);
-void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width);
+void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width);
void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width);
void ARGBMirrorRow_C(const uint8* src, uint8* dst, int width);
+void ARGBMirrorRow_Any_AVX2(const uint8* src, uint8* dst, int width);
+void ARGBMirrorRow_Any_SSE2(const uint8* src, uint8* dst, int width);
+void ARGBMirrorRow_Any_NEON(const uint8* src, uint8* dst, int width);
void SplitUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix);
void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix);
@@ -853,10 +786,6 @@ void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix);
void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix);
void SplitUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
int pix);
-void SplitUVRow_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
- int pix);
-void SplitUVRow_Unaligned_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u,
- uint8* dst_v, int pix);
void SplitUVRow_Any_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
int pix);
void SplitUVRow_Any_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
@@ -874,8 +803,6 @@ void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
int width);
void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
int width);
-void MergeUVRow_Unaligned_SSE2(const uint8* src_u, const uint8* src_v,
- uint8* dst_uv, int width);
void MergeUVRow_Any_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
int width);
void MergeUVRow_Any_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
@@ -884,11 +811,14 @@ void MergeUVRow_Any_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
int width);
void CopyRow_SSE2(const uint8* src, uint8* dst, int count);
+void CopyRow_AVX(const uint8* src, uint8* dst, int count);
void CopyRow_ERMS(const uint8* src, uint8* dst, int count);
-void CopyRow_X86(const uint8* src, uint8* dst, int count);
void CopyRow_NEON(const uint8* src, uint8* dst, int count);
void CopyRow_MIPS(const uint8* src, uint8* dst, int count);
void CopyRow_C(const uint8* src, uint8* dst, int count);
+void CopyRow_Any_SSE2(const uint8* src, uint8* dst, int count);
+void CopyRow_Any_AVX(const uint8* src, uint8* dst, int count);
+void CopyRow_Any_NEON(const uint8* src, uint8* dst, int count);
void CopyRow_16_C(const uint16* src, uint16* dst, int count);
@@ -900,15 +830,17 @@ void ARGBCopyYToAlphaRow_C(const uint8* src_y, uint8* dst_argb, int width);
void ARGBCopyYToAlphaRow_SSE2(const uint8* src_y, uint8* dst_argb, int width);
void ARGBCopyYToAlphaRow_AVX2(const uint8* src_y, uint8* dst_argb, int width);
-void SetRow_X86(uint8* dst, uint32 v32, int count);
-void ARGBSetRows_X86(uint8* dst, uint32 v32, int width,
- int dst_stride, int height);
-void SetRow_NEON(uint8* dst, uint32 v32, int count);
-void ARGBSetRows_NEON(uint8* dst, uint32 v32, int width,
- int dst_stride, int height);
-void SetRow_C(uint8* dst, uint32 v32, int count);
-void ARGBSetRows_C(uint8* dst, uint32 v32, int width, int dst_stride,
- int height);
+void SetRow_C(uint8* dst, uint8 v8, int count);
+void SetRow_X86(uint8* dst, uint8 v8, int count);
+void SetRow_ERMS(uint8* dst, uint8 v8, int count);
+void SetRow_NEON(uint8* dst, uint8 v8, int count);
+void SetRow_Any_X86(uint8* dst, uint8 v8, int count);
+void SetRow_Any_NEON(uint8* dst, uint8 v8, int count);
+
+void ARGBSetRow_C(uint8* dst_argb, uint32 v32, int count);
+void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count);
+void ARGBSetRow_NEON(uint8* dst_argb, uint32 v32, int count);
+void ARGBSetRow_Any_NEON(uint8* dst_argb, uint32 v32, int count);
// ARGBShufflers for BGRAToARGB etc.
void ARGBShuffleRow_C(const uint8* src_argb, uint8* dst_argb,
@@ -921,8 +853,6 @@ void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
const uint8* shuffler, int pix);
void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
const uint8* shuffler, int pix);
-void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb,
- const uint8* shuffler, int pix);
void ARGBShuffleRow_Any_SSE2(const uint8* src_argb, uint8* dst_argb,
const uint8* shuffler, int pix);
void ARGBShuffleRow_Any_SSSE3(const uint8* src_argb, uint8* dst_argb,
@@ -975,6 +905,10 @@ void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix);
+
void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
@@ -988,8 +922,10 @@ void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRGB565DitherRow_C(const uint8* src_argb, uint8* dst_rgb,
+ const uint8* dither8x8, int pix);
+
void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix);
-void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb, int pix);
void I400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int pix);
void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix);
void I400ToARGBRow_Any_SSE2(const uint8* src_y, uint8* dst_argb, int pix);
@@ -1032,6 +968,11 @@ void YUY2ToARGBRow_C(const uint8* src_yuy2,
void UYVYToARGBRow_C(const uint8* src_uyvy,
uint8* dst_argb,
int width);
+void J422ToARGBRow_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
void I422ToBGRARow_C(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
@@ -1072,14 +1013,26 @@ void I422ToRGB565Row_C(const uint8* src_y,
const uint8* src_v,
uint8* dst_rgb565,
int width);
-void YToARGBRow_C(const uint8* src_y,
- uint8* dst_argb,
- int width);
void I422ToARGBRow_AVX2(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb,
int width);
+void I422ToBGRARow_AVX2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I422ToRGBARow_AVX2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I422ToABGRRow_AVX2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
void I444ToARGBRow_SSSE3(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
@@ -1103,6 +1056,14 @@ void NV21ToARGBRow_SSSE3(const uint8* src_y,
const uint8* src_vu,
uint8* dst_argb,
int width);
+void NV12ToARGBRow_AVX2(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_argb,
+ int width);
+void NV21ToARGBRow_AVX2(const uint8* src_y,
+ const uint8* src_vu,
+ uint8* dst_argb,
+ int width);
void NV12ToRGB565Row_SSSE3(const uint8* src_y,
const uint8* src_uv,
uint8* dst_argb,
@@ -1111,12 +1072,31 @@ void NV21ToRGB565Row_SSSE3(const uint8* src_y,
const uint8* src_vu,
uint8* dst_argb,
int width);
+void NV12ToRGB565Row_AVX2(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_argb,
+ int width);
+void NV21ToRGB565Row_AVX2(const uint8* src_y,
+ const uint8* src_vu,
+ uint8* dst_argb,
+ int width);
void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2,
uint8* dst_argb,
int width);
void UYVYToARGBRow_SSSE3(const uint8* src_uyvy,
uint8* dst_argb,
int width);
+void YUY2ToARGBRow_AVX2(const uint8* src_yuy2,
+ uint8* dst_argb,
+ int width);
+void UYVYToARGBRow_AVX2(const uint8* src_uyvy,
+ uint8* dst_argb,
+ int width);
+void J422ToARGBRow_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
void I422ToBGRARow_SSSE3(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
@@ -1137,17 +1117,31 @@ void I422ToARGB4444Row_SSSE3(const uint8* src_y,
const uint8* src_v,
uint8* dst_argb,
int width);
+void I422ToARGB4444Row_AVX2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
void I422ToARGB1555Row_SSSE3(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb,
int width);
+void I422ToARGB1555Row_AVX2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
void I422ToRGB565Row_SSSE3(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb,
int width);
-// RGB24/RAW are unaligned.
+void I422ToRGB565Row_AVX2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
void I422ToRGB24Row_SSSE3(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
@@ -1158,56 +1152,26 @@ void I422ToRAWRow_SSSE3(const uint8* src_y,
const uint8* src_v,
uint8* dst_raw,
int width);
-
-void I444ToARGBRow_Unaligned_SSSE3(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
- int width);
-void I422ToARGBRow_Unaligned_SSSE3(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
- int width);
-void I411ToARGBRow_Unaligned_SSSE3(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_argb,
- int width);
-void NV12ToARGBRow_Unaligned_SSSE3(const uint8* src_y,
- const uint8* src_uv,
- uint8* dst_argb,
- int width);
-void NV21ToARGBRow_Unaligned_SSSE3(const uint8* src_y,
- const uint8* src_vu,
- uint8* dst_argb,
- int width);
-void YUY2ToARGBRow_Unaligned_SSSE3(const uint8* src_yuy2,
- uint8* dst_argb,
- int width);
-void UYVYToARGBRow_Unaligned_SSSE3(const uint8* src_uyvy,
- uint8* dst_argb,
- int width);
-void I422ToBGRARow_Unaligned_SSSE3(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_bgra,
- int width);
-void I422ToABGRRow_Unaligned_SSSE3(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_abgr,
- int width);
-void I422ToRGBARow_Unaligned_SSSE3(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- uint8* dst_rgba,
- int width);
void I422ToARGBRow_Any_AVX2(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb,
int width);
+void I422ToBGRARow_Any_AVX2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I422ToRGBARow_Any_AVX2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I422ToABGRRow_Any_AVX2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
void I444ToARGBRow_Any_SSSE3(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
@@ -1231,6 +1195,14 @@ void NV21ToARGBRow_Any_SSSE3(const uint8* src_y,
const uint8* src_vu,
uint8* dst_argb,
int width);
+void NV12ToARGBRow_Any_AVX2(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_argb,
+ int width);
+void NV21ToARGBRow_Any_AVX2(const uint8* src_y,
+ const uint8* src_vu,
+ uint8* dst_argb,
+ int width);
void NV12ToRGB565Row_Any_SSSE3(const uint8* src_y,
const uint8* src_uv,
uint8* dst_argb,
@@ -1239,12 +1211,26 @@ void NV21ToRGB565Row_Any_SSSE3(const uint8* src_y,
const uint8* src_vu,
uint8* dst_argb,
int width);
+void NV12ToRGB565Row_Any_AVX2(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_argb,
+ int width);
+void NV21ToRGB565Row_Any_AVX2(const uint8* src_y,
+ const uint8* src_vu,
+ uint8* dst_argb,
+ int width);
void YUY2ToARGBRow_Any_SSSE3(const uint8* src_yuy2,
uint8* dst_argb,
int width);
void UYVYToARGBRow_Any_SSSE3(const uint8* src_uyvy,
uint8* dst_argb,
int width);
+void YUY2ToARGBRow_Any_AVX2(const uint8* src_yuy2,
+ uint8* dst_argb,
+ int width);
+void UYVYToARGBRow_Any_AVX2(const uint8* src_uyvy,
+ uint8* dst_argb,
+ int width);
void I422ToBGRARow_Any_SSSE3(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
@@ -1265,17 +1251,31 @@ void I422ToARGB4444Row_Any_SSSE3(const uint8* src_y,
const uint8* src_v,
uint8* dst_rgba,
int width);
+void I422ToARGB4444Row_Any_AVX2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgba,
+ int width);
void I422ToARGB1555Row_Any_SSSE3(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_rgba,
int width);
+void I422ToARGB1555Row_Any_AVX2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgba,
+ int width);
void I422ToRGB565Row_Any_SSSE3(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_rgba,
int width);
-// RGB24/RAW are unaligned.
+void I422ToRGB565Row_Any_AVX2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgba,
+ int width);
void I422ToRGB24Row_Any_SSSE3(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
@@ -1286,15 +1286,25 @@ void I422ToRAWRow_Any_SSSE3(const uint8* src_y,
const uint8* src_v,
uint8* dst_argb,
int width);
+
+void YToARGBRow_C(const uint8* src_y,
+ uint8* dst_argb,
+ int width);
void YToARGBRow_SSE2(const uint8* src_y,
uint8* dst_argb,
int width);
+void YToARGBRow_AVX2(const uint8* src_y,
+ uint8* dst_argb,
+ int width);
void YToARGBRow_NEON(const uint8* src_y,
uint8* dst_argb,
int width);
void YToARGBRow_Any_SSE2(const uint8* src_y,
uint8* dst_argb,
int width);
+void YToARGBRow_Any_AVX2(const uint8* src_y,
+ uint8* dst_argb,
+ int width);
void YToARGBRow_Any_NEON(const uint8* src_y,
uint8* dst_argb,
int width);
@@ -1365,6 +1375,10 @@ void ARGBToRGB565Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToARGB1555Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToARGB4444Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRGB565Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToARGB1555Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToARGB4444Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix);
+
void ARGBToRGB24Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToRAWRow_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
void ARGBToRGB565Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
@@ -1489,12 +1503,6 @@ void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
uint8* dst_u, uint8* dst_v, int pix);
void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
uint8* dst_u, uint8* dst_v, int pix);
-void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
- uint8* dst_y, int pix);
-void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, int stride_yuy2,
- uint8* dst_u, uint8* dst_v, int pix);
-void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2,
- uint8* dst_u, uint8* dst_v, int pix);
void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix);
void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
uint8* dst_u, uint8* dst_v, int pix);
@@ -1530,12 +1538,6 @@ void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
uint8* dst_u, uint8* dst_v, int pix);
void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
uint8* dst_u, uint8* dst_v, int pix);
-void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
- uint8* dst_y, int pix);
-void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
- uint8* dst_u, uint8* dst_v, int pix);
-void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy,
- uint8* dst_u, uint8* dst_v, int pix);
void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int pix);
void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
uint8* dst_u, uint8* dst_v, int pix);
@@ -1568,28 +1570,6 @@ void UYVYToUVRow_Any_NEON(const uint8* src_uyvy, int stride_uyvy,
void UYVYToUV422Row_Any_NEON(const uint8* src_uyvy,
uint8* dst_u, uint8* dst_v, int pix);
-void HalfRow_C(const uint8* src_uv, int src_uv_stride,
- uint8* dst_uv, int pix);
-void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
- uint8* dst_uv, int pix);
-void HalfRow_AVX2(const uint8* src_uv, int src_uv_stride,
- uint8* dst_uv, int pix);
-void HalfRow_NEON(const uint8* src_uv, int src_uv_stride,
- uint8* dst_uv, int pix);
-
-void HalfRow_16_C(const uint16* src_uv, int src_uv_stride,
- uint16* dst_uv, int pix);
-
-void ARGBToBayerRow_C(const uint8* src_argb, uint8* dst_bayer,
- uint32 selector, int pix);
-void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
- uint32 selector, int pix);
-void ARGBToBayerRow_NEON(const uint8* src_argb, uint8* dst_bayer,
- uint32 selector, int pix);
-void ARGBToBayerRow_Any_SSSE3(const uint8* src_argb, uint8* dst_bayer,
- uint32 selector, int pix);
-void ARGBToBayerRow_Any_NEON(const uint8* src_argb, uint8* dst_bayer,
- uint32 selector, int pix);
void ARGBToBayerGGRow_C(const uint8* src_argb, uint8* dst_bayer,
uint32 /* selector */, int pix);
void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer,
@@ -1736,15 +1716,9 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
void InterpolateRow_NEON(uint8* dst_ptr, const uint8* src_ptr,
ptrdiff_t src_stride_ptr, int width,
int source_y_fraction);
-void InterpolateRows_MIPS_DSPR2(uint8* dst_ptr, const uint8* src_ptr,
- ptrdiff_t src_stride_ptr, int width,
- int source_y_fraction);
-void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr,
- ptrdiff_t src_stride_ptr, int width,
- int source_y_fraction);
-void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
- ptrdiff_t src_stride_ptr, int width,
- int source_y_fraction);
+void InterpolateRow_MIPS_DSPR2(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride_ptr, int width,
+ int source_y_fraction);
void InterpolateRow_Any_NEON(uint8* dst_ptr, const uint8* src_ptr,
ptrdiff_t src_stride_ptr, int width,
int source_y_fraction);
@@ -1757,9 +1731,9 @@ void InterpolateRow_Any_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
void InterpolateRow_Any_AVX2(uint8* dst_ptr, const uint8* src_ptr,
ptrdiff_t src_stride_ptr, int width,
int source_y_fraction);
-void InterpolateRows_Any_MIPS_DSPR2(uint8* dst_ptr, const uint8* src_ptr,
- ptrdiff_t src_stride_ptr, int width,
- int source_y_fraction);
+void InterpolateRow_Any_MIPS_DSPR2(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride_ptr, int width,
+ int source_y_fraction);
void InterpolateRow_16_C(uint16* dst_ptr, const uint16* src_ptr,
ptrdiff_t src_stride_ptr,
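
Among the row.h changes above, the old SetRow_*(dst, uint32 v32, count) / ARGBSetRows_* pair is replaced by a byte-filling SetRow_* plus a single-row ARGBSetRow_* for 32-bit pixels. A sketch of the two C reference behaviours implied by the new prototypes; the bodies here are assumptions (the real ones live in row_common.cc):

    #include <stdint.h>
    #include <string.h>

    /* SetRow now has memset semantics: fill 'count' bytes with v8. */
    static void SetRow_C_sketch(uint8_t* dst, uint8_t v8, int count) {
      memset(dst, v8, count);
    }

    /* ARGBSetRow fills 'count' 32-bit pixels with the pattern v32; callers
     * that previously used ARGBSetRows_* now loop over rows themselves.
     * Assumes 4-byte alignment, as ARGB buffers are. */
    static void ARGBSetRow_C_sketch(uint8_t* dst_argb, uint32_t v32,
                                    int count) {
      uint32_t* d = (uint32_t*)dst_argb;
      for (int x = 0; x < count; ++x) {
        d[x] = v32;
      }
    }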
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/include/libyuv/scale.h b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/include/libyuv/scale.h
index a3bc07e0fd6..102158d1ab2 100644
--- a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/include/libyuv/scale.h
+++ b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/include/libyuv/scale.h
@@ -34,6 +34,7 @@ void ScalePlane(const uint8* src, int src_stride,
int dst_width, int dst_height,
enum FilterMode filtering);
+LIBYUV_API
void ScalePlane_16(const uint16* src, int src_stride,
int src_width, int src_height,
uint16* dst, int dst_stride,
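
The scale.h change simply exports ScalePlane_16 with LIBYUV_API so the symbol is callable across the shared-library boundary. A simplified sketch of what that annotation typically expands to; the actual definition in libyuv/basic_types.h is gated on LIBYUV_BUILDING_SHARED_LIBRARY and related macros:

    #ifdef _WIN32
    #define LIBYUV_API __declspec(dllexport)   /* building the DLL */
    #else
    #define LIBYUV_API __attribute__((visibility("default")))
    #endif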
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/include/libyuv/scale_row.h b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/include/libyuv/scale_row.h
index 3c495424f1a..27aa04b2202 100644
--- a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/include/libyuv/scale_row.h
+++ b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/include/libyuv/scale_row.h
@@ -44,21 +44,13 @@ extern "C" {
// The following are available on Neon platforms:
#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
- (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
+ (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
#define HAS_SCALEROWDOWN2_NEON
#define HAS_SCALEROWDOWN4_NEON
#define HAS_SCALEROWDOWN34_NEON
#define HAS_SCALEROWDOWN38_NEON
#define HAS_SCALEARGBROWDOWNEVEN_NEON
#define HAS_SCALEARGBROWDOWN2_NEON
-#elif !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
- (defined(__aarch64__) || defined(LIBYUV_NEON))
-/* #define HAS_SCALEROWDOWN2_NEON */
-/* #define HAS_SCALEROWDOWN4_NEON */
-/* #define HAS_SCALEROWDOWN34_NEON */
-/* #define HAS_SCALEROWDOWN38_NEON */
-/* #define HAS_SCALEARGBROWDOWNEVEN_NEON */
-/* #define HAS_SCALEARGBROWDOWN2_NEON */
#endif
// The following are available on Mips platforms:
@@ -208,15 +200,6 @@ void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
-void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width);
-void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width);
-void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width);
void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
@@ -267,10 +250,10 @@ void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
int dst_width, int x, int dx);
// Row functions.
-void ScaleARGBRowDownEven_NEON(const uint8* src_argb, int src_stride,
+void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride,
int src_stepx,
uint8* dst_argb, int dst_width);
-void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, int src_stride,
+void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
int src_stepx,
uint8* dst_argb, int dst_width);
void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
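
scale_row.h also switches the ScaleARGBRowDownEven* NEON prototypes from int to ptrdiff_t strides, matching the other row functions. Strides are ptrdiff_t because libyuv expresses vertical flips as negative strides; a minimal self-contained sketch of that idiom (assumed, mirroring the library's negative-height convention):

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    static void CopyPlaneFlip(const uint8_t* src, ptrdiff_t src_stride,
                              uint8_t* dst, ptrdiff_t dst_stride,
                              int width, int height) {
      if (height < 0) {                    /* negative height requests a flip */
        height = -height;
        src += (height - 1) * src_stride;  /* start at the last source row */
        src_stride = -src_stride;          /* and walk upwards */
      }
      for (int y = 0; y < height; ++y) {
        memcpy(dst, src, (size_t)width);
        src += src_stride;
        dst += dst_stride;
      }
    }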
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/include/libyuv/version.h b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/include/libyuv/version.h
index 73a7f1b019b..9236a7fa1f5 100644
--- a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/include/libyuv/version.h
+++ b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/include/libyuv/version.h
@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 1059
+#define LIBYUV_VERSION 1305
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/include/libyuv/video_common.h b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/include/libyuv/video_common.h
index 91acc2ffcf9..cb6582f24dc 100644
--- a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/include/libyuv/video_common.h
+++ b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/include/libyuv/video_common.h
@@ -62,7 +62,7 @@ enum FourCC {
// 2 Secondary YUV formats: row biplanar.
FOURCC_M420 = FOURCC('M', '4', '2', '0'),
- FOURCC_Q420 = FOURCC('Q', '4', '2', '0'),
+ FOURCC_Q420 = FOURCC('Q', '4', '2', '0'), // deprecated.
// 9 Primary RGB formats: 4 32 bpp, 2 24 bpp, 3 16 bpp.
FOURCC_ARGB = FOURCC('A', 'R', 'G', 'B'),
@@ -75,7 +75,7 @@ enum FourCC {
FOURCC_RGBO = FOURCC('R', 'G', 'B', 'O'), // argb1555 LE.
FOURCC_R444 = FOURCC('R', '4', '4', '4'), // argb4444 LE.
- // 4 Secondary RGB formats: 4 Bayer Patterns.
+ // 4 Secondary RGB formats: 4 Bayer Patterns. deprecated.
FOURCC_RGGB = FOURCC('R', 'G', 'G', 'B'),
FOURCC_BGGR = FOURCC('B', 'G', 'G', 'R'),
FOURCC_GRBG = FOURCC('G', 'R', 'B', 'G'),
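
For reference when reading the deprecated codes above: FOURCC packs four ASCII bytes little-endian into a uint32. A standalone sketch mirroring libyuv's macro:

    #include <stdint.h>
    #include <stdio.h>

    #define FOURCC(a, b, c, d)                                          \
      (((uint32_t)(a)) | ((uint32_t)(b) << 8) | ((uint32_t)(c) << 16) | \
       ((uint32_t)(d) << 24))

    int main(void) {
      /* prints 0x30323451: 'Q' is the low byte, '0' the high byte */
      printf("FOURCC_Q420 = 0x%08x\n", (unsigned)FOURCC('Q', '4', '2', '0'));
      return 0;
    }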
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/compare.cc b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/compare.cc
index dc715e0199c..f84a08ee6c3 100644
--- a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/compare.cc
+++ b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/compare.cc
@@ -19,6 +19,7 @@
#include "libyuv/basic_types.h"
#include "libyuv/cpu_id.h"
#include "libyuv/row.h"
+#include "libyuv/video_common.h"
#ifdef __cplusplus
namespace libyuv {
@@ -78,6 +79,54 @@ uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed) {
return seed;
}
+static uint32 ARGBDetectRow_C(const uint8* argb, int width) {
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ if (argb[0] != 255) { // First byte is not Alpha of 255, so not ARGB.
+ return FOURCC_BGRA;
+ }
+ if (argb[3] != 255) { // 4th byte is not Alpha of 255, so not BGRA.
+ return FOURCC_ARGB;
+ }
+ if (argb[4] != 255) { // Second pixel first byte is not Alpha of 255.
+ return FOURCC_BGRA;
+ }
+ if (argb[7] != 255) { // Second pixel 4th byte is not Alpha of 255.
+ return FOURCC_ARGB;
+ }
+ argb += 8;
+ }
+ if (width & 1) {
+ if (argb[0] != 255) { // First byte is not Alpha of 255, so not ARGB.
+ return FOURCC_BGRA;
+ }
+ if (argb[3] != 255) { // 4th byte is not Alpha of 255, so not BGRA.
+ return FOURCC_ARGB;
+ }
+ }
+ return 0;
+}
+
+// Scan an opaque argb image and return fourcc based on alpha offset.
+// Returns FOURCC_ARGB, FOURCC_BGRA, or 0 if unknown.
+LIBYUV_API
+uint32 ARGBDetect(const uint8* argb, int stride_argb, int width, int height) {
+ uint32 fourcc = 0;
+ int h;
+
+ // Coalesce rows.
+ if (stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ stride_argb = 0;
+ }
+ for (h = 0; h < height && fourcc == 0; ++h) {
+ fourcc = ARGBDetectRow_C(argb, width);
+ argb += stride_argb;
+ }
+ return fourcc;
+}
+
uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count);
#if !defined(LIBYUV_DISABLE_NEON) && \
(defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
@@ -114,8 +163,7 @@ uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b,
}
#endif
#if defined(HAS_SUMSQUAREERROR_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) &&
- IS_ALIGNED(src_a, 16) && IS_ALIGNED(src_b, 16)) {
+ if (TestCpuFlag(kCpuHasSSE2)) {
// Note only used for multiples of 16 so count is not checked.
SumSquareError = SumSquareError_SSE2;
}
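
A hedged usage sketch for the ARGBDetect helper added above. The frame pointer and geometry are caller-supplied, and the include assumes ARGBDetect is declared in libyuv/compare.h, as in upstream libyuv:

    #include "libyuv/compare.h"
    #include "libyuv/video_common.h"

    // Returns a human-readable guess of the byte order of an opaque 32-bit
    // RGB frame. 0 from ARGBDetect means every candidate alpha byte was 255,
    // so both orders remain plausible.
    static const char* GuessPixelOrder(const uint8* frame, int stride,
                                       int width, int height) {
      uint32 fourcc = libyuv::ARGBDetect(frame, stride, width, height);
      if (fourcc == libyuv::FOURCC_ARGB) return "ARGB";
      if (fourcc == libyuv::FOURCC_BGRA) return "BGRA";
      return "ambiguous (fully opaque either way)";
    }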
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/compare_neon.cc b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/compare_neon.cc
index 55052c0eecb..ef006ec41cd 100644
--- a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/compare_neon.cc
+++ b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/compare_neon.cc
@@ -16,7 +16,8 @@ namespace libyuv {
extern "C" {
#endif
-#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
+ !defined(__aarch64__)
uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
volatile uint32 sse;
@@ -56,46 +57,7 @@ uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
return sse;
}
-#elif !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
-
-uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
- volatile uint32 sse;
- asm volatile (
- "eor v16.16b, v16.16b, v16.16b \n"
- "eor v18.16b, v18.16b, v18.16b \n"
- "eor v17.16b, v17.16b, v17.16b \n"
- "eor v19.16b, v19.16b, v19.16b \n"
-
- ".p2align 2 \n"
- "1: \n"
- MEMACCESS(0)
- "ld1 {v0.16b}, [%0], #16 \n"
- MEMACCESS(1)
- "ld1 {v1.16b}, [%1], #16 \n"
- "subs %2, %2, #16 \n"
- "usubl v2.8h, v0.8b, v1.8b \n"
- "usubl2 v3.8h, v0.16b, v1.16b \n"
- "smlal v16.4s, v2.4h, v2.4h \n"
- "smlal v17.4s, v3.4h, v3.4h \n"
- "smlal2 v18.4s, v2.8h, v2.8h \n"
- "smlal2 v19.4s, v3.8h, v3.8h \n"
- "bgt 1b \n"
-
- "add v16.4s, v16.4s, v17.4s \n"
- "add v18.4s, v18.4s, v19.4s \n"
- "add v19.4s, v16.4s, v18.4s \n"
- "addv s0, v19.4s \n"
- "fmov %w3, s0 \n"
- : "+r"(src_a),
- "+r"(src_b),
- "+r"(count),
- "=r"(sse)
- :
- : "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
- return sse;
-}
-
-#endif // __ARM_NEON__
+#endif // defined(__ARM_NEON__) && !defined(__aarch64__)
#ifdef __cplusplus
} // extern "C"
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/compare_neon64.cc b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/compare_neon64.cc
new file mode 100644
index 00000000000..cc078f84cd8
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/compare_neon64.cc
@@ -0,0 +1,63 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/basic_types.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+
+uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
+ volatile uint32 sse;
+ asm volatile (
+ "eor v16.16b, v16.16b, v16.16b \n"
+ "eor v18.16b, v18.16b, v18.16b \n"
+ "eor v17.16b, v17.16b, v17.16b \n"
+ "eor v19.16b, v19.16b, v19.16b \n"
+
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "ld1 {v0.16b}, [%0], #16 \n"
+ MEMACCESS(1)
+ "ld1 {v1.16b}, [%1], #16 \n"
+ "subs %2, %2, #16 \n"
+ "usubl v2.8h, v0.8b, v1.8b \n"
+ "usubl2 v3.8h, v0.16b, v1.16b \n"
+ "smlal v16.4s, v2.4h, v2.4h \n"
+ "smlal v17.4s, v3.4h, v3.4h \n"
+ "smlal2 v18.4s, v2.8h, v2.8h \n"
+ "smlal2 v19.4s, v3.8h, v3.8h \n"
+ "b.gt 1b \n"
+
+ "add v16.4s, v16.4s, v17.4s \n"
+ "add v18.4s, v18.4s, v19.4s \n"
+ "add v19.4s, v16.4s, v18.4s \n"
+ "addv s0, v19.4s \n"
+ "fmov %w3, s0 \n"
+ : "+r"(src_a),
+ "+r"(src_b),
+ "+r"(count),
+ "=r"(sse)
+ :
+ : "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
+ return sse;
+}
+
+#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/compare_posix.cc b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/compare_posix.cc
index ac361190e88..247cb33bbaa 100644
--- a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/compare_posix.cc
+++ b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/compare_posix.cc
@@ -25,11 +25,10 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
"pxor %%xmm5,%%xmm5 \n"
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm1 \n"
"lea " MEMLEA(0x10, 0) ",%0 \n"
- "movdqa " MEMACCESS(1) ",%%xmm2 \n"
+ "movdqu " MEMACCESS(1) ",%%xmm2 \n"
"lea " MEMLEA(0x10, 1) ",%1 \n"
- "sub $0x10,%2 \n"
"movdqa %%xmm1,%%xmm3 \n"
"psubusb %%xmm2,%%xmm1 \n"
"psubusb %%xmm3,%%xmm2 \n"
@@ -41,6 +40,7 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
"pmaddwd %%xmm2,%%xmm2 \n"
"paddd %%xmm1,%%xmm0 \n"
"paddd %%xmm2,%%xmm0 \n"
+ "sub $0x10,%2 \n"
"jg 1b \n"
"pshufd $0xee,%%xmm0,%%xmm1 \n"
@@ -53,11 +53,7 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
"+r"(src_b), // %1
"+r"(count), // %2
"=g"(sse) // %3
- :
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-#endif
+ :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
); // NOLINT
return sse;
}
@@ -124,13 +120,13 @@ uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
"pmulld %%xmm5,%%xmm1 \n"
"paddd %%xmm4,%%xmm3 \n"
"paddd %%xmm2,%%xmm1 \n"
- "sub $0x10,%1 \n"
"paddd %%xmm3,%%xmm1 \n"
"pshufd $0xe,%%xmm1,%%xmm2 \n"
"paddd %%xmm2,%%xmm1 \n"
"pshufd $0x1,%%xmm1,%%xmm2 \n"
"paddd %%xmm2,%%xmm1 \n"
"paddd %%xmm1,%%xmm0 \n"
+ "sub $0x10,%1 \n"
"jg 1b \n"
"movd %%xmm0,%3 \n"
: "+r"(src), // %0
@@ -143,9 +139,7 @@ uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
"m"(kHashMul2), // %7
"m"(kHashMul3) // %8
: "memory", "cc"
-#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-#endif
); // NOLINT
return hash;
}
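
Two patterns recur in this hunk and in the compare_win.cc hunk below: movdqa becomes movdqu so the kernel no longer needs 16-byte-aligned inputs (which is what lets compare.cc drop its IS_ALIGNED dispatch test above), and the loop counter's sub is moved to sit directly before jg, so the branch consumes the flags the sub just set, a pairing recent x86 cores can macro-fuse. A minimal standalone illustration of that counter placement (GCC/Clang inline asm, x86-64 only, count > 0 as in the row functions' do/while loops; everything here is an assumption for illustration):

    #include <stdint.h>

    static uint64_t SumBytes(const uint8_t* p, int count) {
      uint64_t sum = 0;
      asm volatile(
        "1:                        \n"
        "movzbq (%0),%%rax         \n"  /* load one byte, zero-extended */
        "add    %%rax,%1           \n"  /* accumulate */
        "lea    1(%0),%0           \n"  /* advance; lea leaves flags alone */
        "sub    $1,%2              \n"  /* decrement count; sets the flags */
        "jg     1b                 \n"  /* consumed (and fused) right here */
        : "+r"(p), "+r"(sum), "+r"(count)
        :
        : "memory", "cc", "rax");
      return sum;
    }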
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/compare_win.cc b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/compare_win.cc
index 99831651f5f..e99009a21df 100644
--- a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/compare_win.cc
+++ b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/compare_win.cc
@@ -27,13 +27,11 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
pxor xmm0, xmm0
pxor xmm5, xmm5
- align 4
wloop:
- movdqa xmm1, [eax]
+ movdqu xmm1, [eax]
lea eax, [eax + 16]
- movdqa xmm2, [edx]
+ movdqu xmm2, [edx]
lea edx, [edx + 16]
- sub ecx, 16
movdqa xmm3, xmm1 // abs trick
psubusb xmm1, xmm2
psubusb xmm2, xmm3
@@ -45,6 +43,7 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
pmaddwd xmm2, xmm2
paddd xmm0, xmm1
paddd xmm0, xmm2
+ sub ecx, 16
jg wloop
pshufd xmm1, xmm0, 0xee
@@ -70,12 +69,10 @@ uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) {
vpxor ymm5, ymm5, ymm5 // constant 0 for unpck
sub edx, eax
- align 4
wloop:
vmovdqu ymm1, [eax]
vmovdqu ymm2, [eax + edx]
lea eax, [eax + 32]
- sub ecx, 32
vpsubusb ymm3, ymm1, ymm2 // abs difference trick
vpsubusb ymm2, ymm2, ymm1
vpor ymm1, ymm2, ymm3
@@ -85,6 +82,7 @@ uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) {
vpmaddwd ymm1, ymm1, ymm1
vpaddd ymm0, ymm0, ymm1
vpaddd ymm0, ymm0, ymm2
+ sub ecx, 32
jg wloop
vpshufd ymm1, ymm0, 0xee // 3, 2 + 1, 0 both lanes.
@@ -145,7 +143,6 @@ uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
pxor xmm7, xmm7 // constant 0 for unpck
movdqa xmm6, kHash16x33
- align 4
wloop:
movdqu xmm1, [eax] // src[0-15]
lea eax, [eax + 16]
@@ -170,7 +167,6 @@ uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
pmulld(0xcd) // pmulld xmm1, xmm5
paddd xmm3, xmm4 // add 16 results
paddd xmm1, xmm2
- sub ecx, 16
paddd xmm1, xmm3
pshufd xmm2, xmm1, 0x0e // upper 2 dwords
@@ -178,6 +174,7 @@ uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
pshufd xmm2, xmm1, 0x01
paddd xmm1, xmm2
paddd xmm0, xmm1
+ sub ecx, 16
jg wloop
movd eax, xmm0 // return hash
@@ -195,7 +192,6 @@ uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) {
movd xmm0, [esp + 12] // seed
movdqa xmm6, kHash16x33
- align 4
wloop:
vpmovzxbd xmm3, dword ptr [eax] // src[0-3]
pmulld xmm0, xmm6 // hash *= 33 ^ 16
@@ -209,13 +205,13 @@ uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) {
pmulld xmm1, kHashMul3
paddd xmm3, xmm4 // add 16 results
paddd xmm1, xmm2
- sub ecx, 16
paddd xmm1, xmm3
pshufd xmm2, xmm1, 0x0e // upper 2 dwords
paddd xmm1, xmm2
pshufd xmm2, xmm1, 0x01
paddd xmm1, xmm2
paddd xmm0, xmm1
+ sub ecx, 16
jg wloop
movd eax, xmm0 // return hash
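
The compare_win.cc hunks additionally drop the "align 4" directives and switch "movdqa" to "movdqu": the kernels no longer assume 16-byte-aligned source rows. A hypothetical SSE2-intrinsics sketch of the aligned-vs-unaligned load distinction (standard <emmintrin.h> intrinsics; load_row is illustrative):

#include <emmintrin.h>

__m128i load_row(const unsigned char* p) {
  // movdqa (_mm_load_si128) faults unless p is 16-byte aligned;
  // movdqu (_mm_loadu_si128) accepts any address, and on recent x86
  // cores costs the same when the address happens to be aligned.
  return _mm_loadu_si128((const __m128i*)(p));
}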
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/convert.cc b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/convert.cc
index a8e294f47fc..41696c18f87 100644
--- a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/convert.cc
+++ b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/convert.cc
@@ -188,17 +188,14 @@ static void CopyPlane2(const uint8* src, int src_stride_0, int src_stride_1,
int width, int height) {
int y;
void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
-#if defined(HAS_COPYROW_X86)
- if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) {
- CopyRow = CopyRow_X86;
+#if defined(HAS_COPYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
}
#endif
-#if defined(HAS_COPYROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) &&
- IS_ALIGNED(src, 16) &&
- IS_ALIGNED(src_stride_0, 16) && IS_ALIGNED(src_stride_1, 16) &&
- IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
- CopyRow = CopyRow_SSE2;
+#if defined(HAS_COPYROW_AVX)
+ if (TestCpuFlag(kCpuHasAVX)) {
+ CopyRow = IS_ALIGNED(width, 64) ? CopyRow_AVX : CopyRow_Any_AVX;
}
#endif
#if defined(HAS_COPYROW_ERMS)
@@ -207,8 +204,8 @@ static void CopyPlane2(const uint8* src, int src_stride_0, int src_stride_1,
}
#endif
#if defined(HAS_COPYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) {
- CopyRow = CopyRow_NEON;
+ if (TestCpuFlag(kCpuHasNEON)) {
+ CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
}
#endif
#if defined(HAS_COPYROW_MIPS)
@@ -283,20 +280,15 @@ static int X420ToI420(const uint8* src_y,
src_stride_uv = dst_stride_u = dst_stride_v = 0;
}
#if defined(HAS_SPLITUVROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && halfwidth >= 16) {
+ if (TestCpuFlag(kCpuHasSSE2)) {
SplitUVRow = SplitUVRow_Any_SSE2;
if (IS_ALIGNED(halfwidth, 16)) {
- SplitUVRow = SplitUVRow_Unaligned_SSE2;
- if (IS_ALIGNED(src_uv, 16) && IS_ALIGNED(src_stride_uv, 16) &&
- IS_ALIGNED(dst_u, 16) && IS_ALIGNED(dst_stride_u, 16) &&
- IS_ALIGNED(dst_v, 16) && IS_ALIGNED(dst_stride_v, 16)) {
- SplitUVRow = SplitUVRow_SSE2;
- }
+ SplitUVRow = SplitUVRow_SSE2;
}
}
#endif
#if defined(HAS_SPLITUVROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && halfwidth >= 32) {
+ if (TestCpuFlag(kCpuHasAVX2)) {
SplitUVRow = SplitUVRow_Any_AVX2;
if (IS_ALIGNED(halfwidth, 32)) {
SplitUVRow = SplitUVRow_AVX2;
@@ -304,7 +296,7 @@ static int X420ToI420(const uint8* src_y,
}
#endif
#if defined(HAS_SPLITUVROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && halfwidth >= 16) {
+ if (TestCpuFlag(kCpuHasNEON)) {
SplitUVRow = SplitUVRow_Any_NEON;
if (IS_ALIGNED(halfwidth, 16)) {
SplitUVRow = SplitUVRow_NEON;
@@ -312,15 +304,13 @@ static int X420ToI420(const uint8* src_y,
}
#endif
#if defined(HAS_SPLITUVROW_MIPS_DSPR2)
- if (TestCpuFlag(kCpuHasMIPS_DSPR2) && halfwidth >= 16) {
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
+ IS_ALIGNED(src_uv, 4) && IS_ALIGNED(src_stride_uv, 4) &&
+ IS_ALIGNED(dst_u, 4) && IS_ALIGNED(dst_stride_u, 4) &&
+ IS_ALIGNED(dst_v, 4) && IS_ALIGNED(dst_stride_v, 4)) {
SplitUVRow = SplitUVRow_Any_MIPS_DSPR2;
if (IS_ALIGNED(halfwidth, 16)) {
- SplitUVRow = SplitUVRow_Unaligned_MIPS_DSPR2;
- if (IS_ALIGNED(src_uv, 4) && IS_ALIGNED(src_stride_uv, 4) &&
- IS_ALIGNED(dst_u, 4) && IS_ALIGNED(dst_stride_u, 4) &&
- IS_ALIGNED(dst_v, 4) && IS_ALIGNED(dst_stride_v, 4)) {
- SplitUVRow = SplitUVRow_MIPS_DSPR2;
- }
+ SplitUVRow = SplitUVRow_MIPS_DSPR2;
}
}
#endif
@@ -391,125 +381,6 @@ int M420ToI420(const uint8* src_m420, int src_stride_m420,
width, height);
}
-// Convert Q420 to I420.
-// Format is rows of YY/YUYV
-LIBYUV_API
-int Q420ToI420(const uint8* src_y, int src_stride_y,
- const uint8* src_yuy2, int src_stride_yuy2,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height) {
- int y;
- int halfheight;
- void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
- void (*YUY2ToUV422Row)(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
- int pix) = YUY2ToUV422Row_C;
- void (*YUY2ToYRow)(const uint8* src_yuy2, uint8* dst_y, int pix) =
- YUY2ToYRow_C;
- if (!src_y || !src_yuy2 ||
- !dst_y || !dst_u || !dst_v ||
- width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- halfheight = (height + 1) >> 1;
- dst_y = dst_y + (height - 1) * dst_stride_y;
- dst_u = dst_u + (halfheight - 1) * dst_stride_u;
- dst_v = dst_v + (halfheight - 1) * dst_stride_v;
- dst_stride_y = -dst_stride_y;
- dst_stride_u = -dst_stride_u;
- dst_stride_v = -dst_stride_v;
- }
- // CopyRow for rows of just Y in Q420 copied to Y plane of I420.
-#if defined(HAS_COPYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) {
- CopyRow = CopyRow_NEON;
- }
-#endif
-#if defined(HAS_COPYROW_X86)
- if (IS_ALIGNED(width, 4)) {
- CopyRow = CopyRow_X86;
- }
-#endif
-#if defined(HAS_COPYROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) &&
- IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
- IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- CopyRow = CopyRow_SSE2;
- }
-#endif
-#if defined(HAS_COPYROW_ERMS)
- if (TestCpuFlag(kCpuHasERMS)) {
- CopyRow = CopyRow_ERMS;
- }
-#endif
-#if defined(HAS_COPYROW_MIPS)
- if (TestCpuFlag(kCpuHasMIPS)) {
- CopyRow = CopyRow_MIPS;
- }
-#endif
-
-#if defined(HAS_YUY2TOYROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
- YUY2ToUV422Row = YUY2ToUV422Row_Any_SSE2;
- YUY2ToYRow = YUY2ToYRow_Any_SSE2;
- if (IS_ALIGNED(width, 16)) {
- YUY2ToUV422Row = YUY2ToUV422Row_Unaligned_SSE2;
- YUY2ToYRow = YUY2ToYRow_Unaligned_SSE2;
- if (IS_ALIGNED(src_yuy2, 16) && IS_ALIGNED(src_stride_yuy2, 16)) {
- YUY2ToUV422Row = YUY2ToUV422Row_SSE2;
- if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- YUY2ToYRow = YUY2ToYRow_SSE2;
- }
- }
- }
- }
-#endif
-#if defined(HAS_YUY2TOYROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
- YUY2ToUV422Row = YUY2ToUV422Row_Any_AVX2;
- YUY2ToYRow = YUY2ToYRow_Any_AVX2;
- if (IS_ALIGNED(width, 32)) {
- YUY2ToUV422Row = YUY2ToUV422Row_AVX2;
- YUY2ToYRow = YUY2ToYRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_YUY2TOYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
- YUY2ToYRow = YUY2ToYRow_Any_NEON;
- if (width >= 16) {
- YUY2ToUV422Row = YUY2ToUV422Row_Any_NEON;
- }
- if (IS_ALIGNED(width, 16)) {
- YUY2ToYRow = YUY2ToYRow_NEON;
- YUY2ToUV422Row = YUY2ToUV422Row_NEON;
- }
- }
-#endif
-
- for (y = 0; y < height - 1; y += 2) {
- CopyRow(src_y, dst_y, width);
- src_y += src_stride_y;
- dst_y += dst_stride_y;
-
- YUY2ToUV422Row(src_yuy2, dst_u, dst_v, width);
- YUY2ToYRow(src_yuy2, dst_y, width);
- src_yuy2 += src_stride_yuy2;
- dst_y += dst_stride_y;
- dst_u += dst_stride_u;
- dst_v += dst_stride_v;
- }
- if (height & 1) {
- CopyRow(src_y, dst_y, width);
- YUY2ToUV422Row(src_yuy2, dst_u, dst_v, width);
- }
- return 0;
-}
-
// Convert YUY2 to I420.
LIBYUV_API
int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2,
@@ -529,23 +400,17 @@ int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2,
src_stride_yuy2 = -src_stride_yuy2;
}
#if defined(HAS_YUY2TOYROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
+ if (TestCpuFlag(kCpuHasSSE2)) {
YUY2ToUVRow = YUY2ToUVRow_Any_SSE2;
YUY2ToYRow = YUY2ToYRow_Any_SSE2;
if (IS_ALIGNED(width, 16)) {
- YUY2ToUVRow = YUY2ToUVRow_Unaligned_SSE2;
- YUY2ToYRow = YUY2ToYRow_Unaligned_SSE2;
- if (IS_ALIGNED(src_yuy2, 16) && IS_ALIGNED(src_stride_yuy2, 16)) {
- YUY2ToUVRow = YUY2ToUVRow_SSE2;
- if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- YUY2ToYRow = YUY2ToYRow_SSE2;
- }
- }
+ YUY2ToUVRow = YUY2ToUVRow_SSE2;
+ YUY2ToYRow = YUY2ToYRow_SSE2;
}
}
#endif
#if defined(HAS_YUY2TOYROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
+ if (TestCpuFlag(kCpuHasAVX2)) {
YUY2ToUVRow = YUY2ToUVRow_Any_AVX2;
YUY2ToYRow = YUY2ToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
@@ -555,11 +420,9 @@ int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2,
}
#endif
#if defined(HAS_YUY2TOYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ if (TestCpuFlag(kCpuHasNEON)) {
YUY2ToYRow = YUY2ToYRow_Any_NEON;
- if (width >= 16) {
- YUY2ToUVRow = YUY2ToUVRow_Any_NEON;
- }
+ YUY2ToUVRow = YUY2ToUVRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
YUY2ToYRow = YUY2ToYRow_NEON;
YUY2ToUVRow = YUY2ToUVRow_NEON;
@@ -602,23 +465,17 @@ int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy,
src_stride_uyvy = -src_stride_uyvy;
}
#if defined(HAS_UYVYTOYROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
+ if (TestCpuFlag(kCpuHasSSE2)) {
UYVYToUVRow = UYVYToUVRow_Any_SSE2;
UYVYToYRow = UYVYToYRow_Any_SSE2;
if (IS_ALIGNED(width, 16)) {
- UYVYToUVRow = UYVYToUVRow_Unaligned_SSE2;
- UYVYToYRow = UYVYToYRow_Unaligned_SSE2;
- if (IS_ALIGNED(src_uyvy, 16) && IS_ALIGNED(src_stride_uyvy, 16)) {
- UYVYToUVRow = UYVYToUVRow_SSE2;
- if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- UYVYToYRow = UYVYToYRow_SSE2;
- }
- }
+ UYVYToUVRow = UYVYToUVRow_SSE2;
+ UYVYToYRow = UYVYToYRow_SSE2;
}
}
#endif
#if defined(HAS_UYVYTOYROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
+ if (TestCpuFlag(kCpuHasAVX2)) {
UYVYToUVRow = UYVYToUVRow_Any_AVX2;
UYVYToYRow = UYVYToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
@@ -628,11 +485,9 @@ int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy,
}
#endif
#if defined(HAS_UYVYTOYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ if (TestCpuFlag(kCpuHasNEON)) {
UYVYToYRow = UYVYToYRow_Any_NEON;
- if (width >= 16) {
- UYVYToUVRow = UYVYToUVRow_Any_NEON;
- }
+ UYVYToUVRow = UYVYToUVRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
UYVYToYRow = UYVYToYRow_NEON;
UYVYToUVRow = UYVYToUVRow_NEON;
@@ -680,23 +535,17 @@ int ARGBToI420(const uint8* src_argb, int src_stride_argb,
src_stride_argb = -src_stride_argb;
}
#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_Unaligned_SSSE3;
- ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
- if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
- ARGBToUVRow = ARGBToUVRow_SSSE3;
- if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- ARGBToYRow = ARGBToYRow_SSSE3;
- }
- }
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ ARGBToYRow = ARGBToYRow_SSSE3;
}
}
#endif
#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
+ if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToUVRow = ARGBToUVRow_Any_AVX2;
ARGBToYRow = ARGBToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
@@ -706,7 +555,7 @@ int ARGBToI420(const uint8* src_argb, int src_stride_argb,
}
#endif
#if defined(HAS_ARGBTOYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ if (TestCpuFlag(kCpuHasNEON)) {
ARGBToYRow = ARGBToYRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
ARGBToYRow = ARGBToYRow_NEON;
@@ -714,7 +563,7 @@ int ARGBToI420(const uint8* src_argb, int src_stride_argb,
}
#endif
#if defined(HAS_ARGBTOUVROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
+ if (TestCpuFlag(kCpuHasNEON)) {
ARGBToUVRow = ARGBToUVRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_NEON;
@@ -761,34 +610,31 @@ int BGRAToI420(const uint8* src_bgra, int src_stride_bgra,
src_bgra = src_bgra + (height - 1) * src_stride_bgra;
src_stride_bgra = -src_stride_bgra;
}
-#if defined(HAS_BGRATOYROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+#if defined(HAS_BGRATOYROW_SSSE3) && defined(HAS_BGRATOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
BGRAToUVRow = BGRAToUVRow_Any_SSSE3;
BGRAToYRow = BGRAToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- BGRAToUVRow = BGRAToUVRow_Unaligned_SSSE3;
- BGRAToYRow = BGRAToYRow_Unaligned_SSSE3;
- if (IS_ALIGNED(src_bgra, 16) && IS_ALIGNED(src_stride_bgra, 16)) {
- BGRAToUVRow = BGRAToUVRow_SSSE3;
- if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- BGRAToYRow = BGRAToYRow_SSSE3;
- }
- }
+ BGRAToUVRow = BGRAToUVRow_SSSE3;
+ BGRAToYRow = BGRAToYRow_SSSE3;
}
}
-#elif defined(HAS_BGRATOYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+#endif
+#if defined(HAS_BGRATOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
BGRAToYRow = BGRAToYRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
BGRAToYRow = BGRAToYRow_NEON;
}
- if (width >= 16) {
+ }
+#endif
+#if defined(HAS_BGRATOUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
BGRAToUVRow = BGRAToUVRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
BGRAToUVRow = BGRAToUVRow_NEON;
}
}
- }
#endif
for (y = 0; y < height - 1; y += 2) {
@@ -830,32 +676,29 @@ int ABGRToI420(const uint8* src_abgr, int src_stride_abgr,
src_abgr = src_abgr + (height - 1) * src_stride_abgr;
src_stride_abgr = -src_stride_abgr;
}
-#if defined(HAS_ABGRTOYROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+#if defined(HAS_ABGRTOYROW_SSSE3) && defined(HAS_ABGRTOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
ABGRToUVRow = ABGRToUVRow_Any_SSSE3;
ABGRToYRow = ABGRToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- ABGRToUVRow = ABGRToUVRow_Unaligned_SSSE3;
- ABGRToYRow = ABGRToYRow_Unaligned_SSSE3;
- if (IS_ALIGNED(src_abgr, 16) && IS_ALIGNED(src_stride_abgr, 16)) {
- ABGRToUVRow = ABGRToUVRow_SSSE3;
- if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- ABGRToYRow = ABGRToYRow_SSSE3;
- }
- }
+ ABGRToUVRow = ABGRToUVRow_SSSE3;
+ ABGRToYRow = ABGRToYRow_SSSE3;
}
}
-#elif defined(HAS_ABGRTOYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+#endif
+#if defined(HAS_ABGRTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
ABGRToYRow = ABGRToYRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
ABGRToYRow = ABGRToYRow_NEON;
}
- if (width >= 16) {
- ABGRToUVRow = ABGRToUVRow_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- ABGRToUVRow = ABGRToUVRow_NEON;
- }
+ }
+#endif
+#if defined(HAS_ABGRTOUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ABGRToUVRow = ABGRToUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ABGRToUVRow = ABGRToUVRow_NEON;
}
}
#endif
@@ -899,32 +742,29 @@ int RGBAToI420(const uint8* src_rgba, int src_stride_rgba,
src_rgba = src_rgba + (height - 1) * src_stride_rgba;
src_stride_rgba = -src_stride_rgba;
}
-#if defined(HAS_RGBATOYROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+#if defined(HAS_RGBATOYROW_SSSE3) && defined(HAS_RGBATOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
RGBAToUVRow = RGBAToUVRow_Any_SSSE3;
RGBAToYRow = RGBAToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- RGBAToUVRow = RGBAToUVRow_Unaligned_SSSE3;
- RGBAToYRow = RGBAToYRow_Unaligned_SSSE3;
- if (IS_ALIGNED(src_rgba, 16) && IS_ALIGNED(src_stride_rgba, 16)) {
- RGBAToUVRow = RGBAToUVRow_SSSE3;
- if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- RGBAToYRow = RGBAToYRow_SSSE3;
- }
- }
+ RGBAToUVRow = RGBAToUVRow_SSSE3;
+ RGBAToYRow = RGBAToYRow_SSSE3;
}
}
-#elif defined(HAS_RGBATOYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+#endif
+#if defined(HAS_RGBATOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
RGBAToYRow = RGBAToYRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
RGBAToYRow = RGBAToYRow_NEON;
}
- if (width >= 16) {
- RGBAToUVRow = RGBAToUVRow_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- RGBAToUVRow = RGBAToUVRow_NEON;
- }
+ }
+#endif
+#if defined(HAS_RGBATOUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ RGBAToUVRow = RGBAToUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ RGBAToUVRow = RGBAToUVRow_NEON;
}
}
#endif
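
A recurring restructure in the hunks above turns "#elif"-chained CPU paths into independent "#if"/"#endif" blocks, so each optional fast path installs itself locally and adding a new one (e.g. a separate UV row path) no longer means rewriting a chain. A hypothetical, self-contained sketch of that shape (row_* and pick_row are stand-ins; the macros are the usual compiler-defined feature tests):

#include <stdio.h>

typedef void (*RowFn)(void);
static void row_c(void) { puts("C"); }
#if defined(__SSSE3__)
static void row_ssse3(void) { puts("SSSE3"); }
#endif
#if defined(__ARM_NEON)
static void row_neon(void) { puts("NEON"); }
#endif

RowFn pick_row(void) {
  RowFn fn = row_c;
#if defined(__SSSE3__)
  fn = row_ssse3;  // independent block: compiled only on x86 targets
#endif
#if defined(__ARM_NEON)
  fn = row_neon;   // independent block: compiled only on ARM targets
#endif
  return fn;
}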
@@ -978,22 +818,23 @@ int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24,
}
#if defined(HAS_RGB24TOYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ if (TestCpuFlag(kCpuHasNEON)) {
RGB24ToYRow = RGB24ToYRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
RGB24ToYRow = RGB24ToYRow_NEON;
}
- if (width >= 16) {
- RGB24ToUVRow = RGB24ToUVRow_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- RGB24ToUVRow = RGB24ToUVRow_NEON;
- }
+ }
+#endif
+#if defined(HAS_RGB24TOUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ RGB24ToUVRow = RGB24ToUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ RGB24ToUVRow = RGB24ToUVRow_NEON;
}
}
-#else // HAS_RGB24TOYROW_NEON
-
+#endif
#if defined(HAS_RGB24TOARGBROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
RGB24ToARGBRow = RGB24ToARGBRow_SSSE3;
@@ -1001,7 +842,7 @@ int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24,
}
#endif
#if defined(HAS_ARGBTOUVROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_SSSE3;
@@ -1009,17 +850,13 @@ int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24,
}
#endif
#if defined(HAS_ARGBTOUVROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
- if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- ARGBToYRow = ARGBToYRow_SSSE3;
- }
+ ARGBToYRow = ARGBToYRow_SSSE3;
}
}
#endif // HAS_ARGBTOUVROW_SSSE3
-#endif // HAS_RGB24TOYROW_NEON
{
#if !defined(HAS_RGB24TOYROW_NEON)
@@ -1095,22 +932,23 @@ int RAWToI420(const uint8* src_raw, int src_stride_raw,
}
#if defined(HAS_RAWTOYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ if (TestCpuFlag(kCpuHasNEON)) {
RAWToYRow = RAWToYRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
RAWToYRow = RAWToYRow_NEON;
}
- if (width >= 16) {
- RAWToUVRow = RAWToUVRow_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- RAWToUVRow = RAWToUVRow_NEON;
- }
+ }
+#endif
+#if defined(HAS_RAWTOUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ RAWToUVRow = RAWToUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ RAWToUVRow = RAWToUVRow_NEON;
}
}
-#else // HAS_RAWTOYROW_NEON
-
+#endif
#if defined(HAS_RAWTOARGBROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
RAWToARGBRow = RAWToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
RAWToARGBRow = RAWToARGBRow_SSSE3;
@@ -1118,7 +956,7 @@ int RAWToI420(const uint8* src_raw, int src_stride_raw,
}
#endif
#if defined(HAS_ARGBTOUVROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_SSSE3;
@@ -1126,17 +964,13 @@ int RAWToI420(const uint8* src_raw, int src_stride_raw,
}
#endif
#if defined(HAS_ARGBTOUVROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
- if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- ARGBToYRow = ARGBToYRow_SSSE3;
- }
+ ARGBToYRow = ARGBToYRow_SSSE3;
}
}
#endif // HAS_ARGBTOUVROW_SSSE3
-#endif // HAS_RAWTOYROW_NEON
{
// Allocate 2 rows of ARGB.
@@ -1210,22 +1044,20 @@ int RGB565ToI420(const uint8* src_rgb565, int src_stride_rgb565,
}
#if defined(HAS_RGB565TOYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ if (TestCpuFlag(kCpuHasNEON)) {
RGB565ToYRow = RGB565ToYRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
RGB565ToYRow = RGB565ToYRow_NEON;
}
- if (width >= 16) {
- RGB565ToUVRow = RGB565ToUVRow_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- RGB565ToUVRow = RGB565ToUVRow_NEON;
- }
+ RGB565ToUVRow = RGB565ToUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ RGB565ToUVRow = RGB565ToUVRow_NEON;
}
}
#else // HAS_RGB565TOYROW_NEON
#if defined(HAS_RGB565TOARGBROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && width >= 8) {
+ if (TestCpuFlag(kCpuHasSSE2)) {
RGB565ToARGBRow = RGB565ToARGBRow_Any_SSE2;
if (IS_ALIGNED(width, 8)) {
RGB565ToARGBRow = RGB565ToARGBRow_SSE2;
@@ -1233,7 +1065,7 @@ int RGB565ToI420(const uint8* src_rgb565, int src_stride_rgb565,
}
#endif
#if defined(HAS_ARGBTOUVROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_SSSE3;
@@ -1241,13 +1073,10 @@ int RGB565ToI420(const uint8* src_rgb565, int src_stride_rgb565,
}
#endif
#if defined(HAS_ARGBTOUVROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
- if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- ARGBToYRow = ARGBToYRow_SSSE3;
- }
+ ARGBToYRow = ARGBToYRow_SSSE3;
}
}
#endif // HAS_ARGBTOUVROW_SSSE3
@@ -1327,22 +1156,20 @@ int ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555,
}
#if defined(HAS_ARGB1555TOYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ if (TestCpuFlag(kCpuHasNEON)) {
ARGB1555ToYRow = ARGB1555ToYRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
ARGB1555ToYRow = ARGB1555ToYRow_NEON;
}
- if (width >= 16) {
- ARGB1555ToUVRow = ARGB1555ToUVRow_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- ARGB1555ToUVRow = ARGB1555ToUVRow_NEON;
- }
+ ARGB1555ToUVRow = ARGB1555ToUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ARGB1555ToUVRow = ARGB1555ToUVRow_NEON;
}
}
#else // HAS_ARGB1555TOYROW_NEON
#if defined(HAS_ARGB1555TOARGBROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && width >= 8) {
+ if (TestCpuFlag(kCpuHasSSE2)) {
ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_SSE2;
if (IS_ALIGNED(width, 8)) {
ARGB1555ToARGBRow = ARGB1555ToARGBRow_SSE2;
@@ -1350,7 +1177,7 @@ int ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555,
}
#endif
#if defined(HAS_ARGBTOUVROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_SSSE3;
@@ -1358,13 +1185,10 @@ int ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555,
}
#endif
#if defined(HAS_ARGBTOUVROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
- if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- ARGBToYRow = ARGBToYRow_SSSE3;
- }
+ ARGBToYRow = ARGBToYRow_SSSE3;
}
}
#endif // HAS_ARGBTOUVROW_SSSE3
@@ -1445,22 +1269,20 @@ int ARGB4444ToI420(const uint8* src_argb4444, int src_stride_argb4444,
}
#if defined(HAS_ARGB4444TOYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ if (TestCpuFlag(kCpuHasNEON)) {
ARGB4444ToYRow = ARGB4444ToYRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
ARGB4444ToYRow = ARGB4444ToYRow_NEON;
}
- if (width >= 16) {
- ARGB4444ToUVRow = ARGB4444ToUVRow_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- ARGB4444ToUVRow = ARGB4444ToUVRow_NEON;
- }
+ ARGB4444ToUVRow = ARGB4444ToUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ARGB4444ToUVRow = ARGB4444ToUVRow_NEON;
}
}
#else // HAS_ARGB4444TOYROW_NEON
#if defined(HAS_ARGB4444TOARGBROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && width >= 8) {
+ if (TestCpuFlag(kCpuHasSSE2)) {
ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_SSE2;
if (IS_ALIGNED(width, 8)) {
ARGB4444ToARGBRow = ARGB4444ToARGBRow_SSE2;
@@ -1468,7 +1290,7 @@ int ARGB4444ToI420(const uint8* src_argb4444, int src_stride_argb4444,
}
#endif
#if defined(HAS_ARGBTOUVROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_SSSE3;
@@ -1476,13 +1298,10 @@ int ARGB4444ToI420(const uint8* src_argb4444, int src_stride_argb4444,
}
#endif
#if defined(HAS_ARGBTOUVROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
- if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- ARGBToYRow = ARGBToYRow_SSSE3;
- }
+ ARGBToYRow = ARGBToYRow_SSSE3;
}
}
#endif // HAS_ARGBTOUVROW_SSSE3
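
The convert.cc hunks above all apply one dispatch simplification: the "width >= N" guards and the _Unaligned_ middle tier disappear, because the _Any_ row variants already handle every width (SIMD body plus a scalar tail) and the fully SIMD variant is selected only when the width is an exact multiple. A hypothetical, self-contained sketch of the resulting two-tier pattern (the *_sk functions are stand-ins, not libyuv symbols):

typedef unsigned char uint8;

static void CopyRow_C_sk(const uint8* s, uint8* d, int w) { while (w-- > 0) *d++ = *s++; }
static void CopyRow_SSE2_sk(const uint8* s, uint8* d, int w) { while (w-- > 0) *d++ = *s++; }
static void CopyRow_Any_SSE2_sk(const uint8* s, uint8* d, int w) { while (w-- > 0) *d++ = *s++; }

void CopyPlaneSketch(const uint8* src, uint8* dst, int width, int cpu_has_sse2) {
  void (*CopyRow)(const uint8*, uint8*, int) = CopyRow_C_sk;
  if (cpu_has_sse2) {
    // The _Any_ variant finishes the tail in scalar code, so no
    // 'width >= 32' guard is needed; an exact multiple of the vector
    // width still picks the full-SIMD variant.
    CopyRow = (width % 32 == 0) ? CopyRow_SSE2_sk : CopyRow_Any_SSE2_sk;
  }
  CopyRow(src, dst, width);
}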
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/convert_argb.cc b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/convert_argb.cc
index ac0bc3d156f..66f7660793a 100644
--- a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/convert_argb.cc
+++ b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/convert_argb.cc
@@ -11,7 +11,6 @@
#include "libyuv/convert_argb.h"
#include "libyuv/cpu_id.h"
-#include "libyuv/format_conversion.h"
#ifdef HAVE_JPEG
#include "libyuv/mjpeg_decoder.h"
#endif
@@ -79,17 +78,15 @@ int I444ToARGB(const uint8* src_y, int src_stride_y,
src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0;
}
#if defined(HAS_I444TOARGBROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
I444ToARGBRow = I444ToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
- I444ToARGBRow = I444ToARGBRow_Unaligned_SSSE3;
- if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- I444ToARGBRow = I444ToARGBRow_SSSE3;
- }
+ I444ToARGBRow = I444ToARGBRow_SSSE3;
}
}
-#elif defined(HAS_I444TOARGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+#endif
+#if defined(HAS_I444TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
I444ToARGBRow = I444ToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
I444ToARGBRow = I444ToARGBRow_NEON;
@@ -141,18 +138,15 @@ int I422ToARGB(const uint8* src_y, int src_stride_y,
src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0;
}
#if defined(HAS_I422TOARGBROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
- I422ToARGBRow = I422ToARGBRow_Unaligned_SSSE3;
- if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- I422ToARGBRow = I422ToARGBRow_SSSE3;
- }
+ I422ToARGBRow = I422ToARGBRow_SSSE3;
}
}
#endif
#if defined(HAS_I422TOARGBROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && width >= 16) {
+ if (TestCpuFlag(kCpuHasAVX2)) {
I422ToARGBRow = I422ToARGBRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
I422ToARGBRow = I422ToARGBRow_AVX2;
@@ -160,7 +154,7 @@ int I422ToARGB(const uint8* src_y, int src_stride_y,
}
#endif
#if defined(HAS_I422TOARGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ if (TestCpuFlag(kCpuHasNEON)) {
I422ToARGBRow = I422ToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
I422ToARGBRow = I422ToARGBRow_NEON;
@@ -221,17 +215,15 @@ int I411ToARGB(const uint8* src_y, int src_stride_y,
src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0;
}
#if defined(HAS_I411TOARGBROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
I411ToARGBRow = I411ToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
- I411ToARGBRow = I411ToARGBRow_Unaligned_SSSE3;
- if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- I411ToARGBRow = I411ToARGBRow_SSSE3;
- }
+ I411ToARGBRow = I411ToARGBRow_SSSE3;
}
}
-#elif defined(HAS_I411TOARGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+#endif
+#if defined(HAS_I411TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
I411ToARGBRow = I411ToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
I411ToARGBRow = I411ToARGBRow_NEON;
@@ -276,15 +268,23 @@ int I400ToARGB_Reference(const uint8* src_y, int src_stride_y,
src_stride_y = dst_stride_argb = 0;
}
#if defined(HAS_YTOARGBROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && width >= 8 &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ if (TestCpuFlag(kCpuHasSSE2)) {
YToARGBRow = YToARGBRow_Any_SSE2;
if (IS_ALIGNED(width, 8)) {
YToARGBRow = YToARGBRow_SSE2;
}
}
-#elif defined(HAS_YTOARGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+#endif
+#if defined(HAS_YTOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ YToARGBRow = YToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ YToARGBRow = YToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_YTOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
YToARGBRow = YToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
YToARGBRow = YToARGBRow_NEON;
@@ -326,17 +326,15 @@ int I400ToARGB(const uint8* src_y, int src_stride_y,
src_stride_y = dst_stride_argb = 0;
}
#if defined(HAS_I400TOARGBROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && width >= 8) {
+ if (TestCpuFlag(kCpuHasSSE2)) {
I400ToARGBRow = I400ToARGBRow_Any_SSE2;
if (IS_ALIGNED(width, 8)) {
- I400ToARGBRow = I400ToARGBRow_Unaligned_SSE2;
- if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- I400ToARGBRow = I400ToARGBRow_SSE2;
- }
+ I400ToARGBRow = I400ToARGBRow_SSE2;
}
}
-#elif defined(HAS_I400TOARGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+#endif
+#if defined(HAS_I400TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
I400ToARGBRow = I400ToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
I400ToARGBRow = I400ToARGBRow_NEON;
@@ -447,15 +445,15 @@ int RGB24ToARGB(const uint8* src_rgb24, int src_stride_rgb24,
src_stride_rgb24 = dst_stride_argb = 0;
}
#if defined(HAS_RGB24TOARGBROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16 &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
RGB24ToARGBRow = RGB24ToARGBRow_SSSE3;
}
}
-#elif defined(HAS_RGB24TOARGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+#endif
+#if defined(HAS_RGB24TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
RGB24ToARGBRow = RGB24ToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
RGB24ToARGBRow = RGB24ToARGBRow_NEON;
@@ -497,15 +495,15 @@ int RAWToARGB(const uint8* src_raw, int src_stride_raw,
src_stride_raw = dst_stride_argb = 0;
}
#if defined(HAS_RAWTOARGBROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16 &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
RAWToARGBRow = RAWToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
RAWToARGBRow = RAWToARGBRow_SSSE3;
}
}
-#elif defined(HAS_RAWTOARGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+#endif
+#if defined(HAS_RAWTOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
RAWToARGBRow = RAWToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
RAWToARGBRow = RAWToARGBRow_NEON;
@@ -547,15 +545,15 @@ int RGB565ToARGB(const uint8* src_rgb565, int src_stride_rgb565,
src_stride_rgb565 = dst_stride_argb = 0;
}
#if defined(HAS_RGB565TOARGBROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && width >= 8 &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ if (TestCpuFlag(kCpuHasSSE2)) {
RGB565ToARGBRow = RGB565ToARGBRow_Any_SSE2;
if (IS_ALIGNED(width, 8)) {
RGB565ToARGBRow = RGB565ToARGBRow_SSE2;
}
}
-#elif defined(HAS_RGB565TOARGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+#endif
+#if defined(HAS_RGB565TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
RGB565ToARGBRow = RGB565ToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
RGB565ToARGBRow = RGB565ToARGBRow_NEON;
@@ -597,15 +595,15 @@ int ARGB1555ToARGB(const uint8* src_argb1555, int src_stride_argb1555,
src_stride_argb1555 = dst_stride_argb = 0;
}
#if defined(HAS_ARGB1555TOARGBROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && width >= 8 &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ if (TestCpuFlag(kCpuHasSSE2)) {
ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_SSE2;
if (IS_ALIGNED(width, 8)) {
ARGB1555ToARGBRow = ARGB1555ToARGBRow_SSE2;
}
}
-#elif defined(HAS_ARGB1555TOARGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+#endif
+#if defined(HAS_ARGB1555TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
ARGB1555ToARGBRow = ARGB1555ToARGBRow_NEON;
@@ -647,15 +645,15 @@ int ARGB4444ToARGB(const uint8* src_argb4444, int src_stride_argb4444,
src_stride_argb4444 = dst_stride_argb = 0;
}
#if defined(HAS_ARGB4444TOARGBROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && width >= 8 &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ if (TestCpuFlag(kCpuHasSSE2)) {
ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_SSE2;
if (IS_ALIGNED(width, 8)) {
ARGB4444ToARGBRow = ARGB4444ToARGBRow_SSE2;
}
}
-#elif defined(HAS_ARGB4444TOARGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+#endif
+#if defined(HAS_ARGB4444TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
ARGB4444ToARGBRow = ARGB4444ToARGBRow_NEON;
@@ -693,17 +691,23 @@ int NV12ToARGB(const uint8* src_y, int src_stride_y,
dst_stride_argb = -dst_stride_argb;
}
#if defined(HAS_NV12TOARGBROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
NV12ToARGBRow = NV12ToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
- NV12ToARGBRow = NV12ToARGBRow_Unaligned_SSSE3;
- if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- NV12ToARGBRow = NV12ToARGBRow_SSSE3;
- }
+ NV12ToARGBRow = NV12ToARGBRow_SSSE3;
}
}
-#elif defined(HAS_NV12TOARGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+#endif
+#if defined(HAS_NV12TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ NV12ToARGBRow = NV12ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ NV12ToARGBRow = NV12ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_NV12TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
NV12ToARGBRow = NV12ToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
NV12ToARGBRow = NV12ToARGBRow_NEON;
@@ -744,18 +748,23 @@ int NV21ToARGB(const uint8* src_y, int src_stride_y,
dst_stride_argb = -dst_stride_argb;
}
#if defined(HAS_NV21TOARGBROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
NV21ToARGBRow = NV21ToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
- NV21ToARGBRow = NV21ToARGBRow_Unaligned_SSSE3;
- if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- NV21ToARGBRow = NV21ToARGBRow_SSSE3;
- }
+ NV21ToARGBRow = NV21ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_NV21TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ NV21ToARGBRow = NV21ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ NV21ToARGBRow = NV21ToARGBRow_AVX2;
}
}
#endif
#if defined(HAS_NV21TOARGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ if (TestCpuFlag(kCpuHasNEON)) {
NV21ToARGBRow = NV21ToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
NV21ToARGBRow = NV21ToARGBRow_NEON;
@@ -795,17 +804,23 @@ int M420ToARGB(const uint8* src_m420, int src_stride_m420,
dst_stride_argb = -dst_stride_argb;
}
#if defined(HAS_NV12TOARGBROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
NV12ToARGBRow = NV12ToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
- NV12ToARGBRow = NV12ToARGBRow_Unaligned_SSSE3;
- if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- NV12ToARGBRow = NV12ToARGBRow_SSSE3;
- }
+ NV12ToARGBRow = NV12ToARGBRow_SSSE3;
}
}
-#elif defined(HAS_NV12TOARGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+#endif
+#if defined(HAS_NV12TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ NV12ToARGBRow = NV12ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ NV12ToARGBRow = NV12ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_NV12TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
NV12ToARGBRow = NV12ToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
NV12ToARGBRow = NV12ToARGBRow_NEON;
@@ -852,19 +867,23 @@ int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2,
src_stride_yuy2 = dst_stride_argb = 0;
}
#if defined(HAS_YUY2TOARGBROW_SSSE3)
- // Posix is 16, Windows is 8.
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
YUY2ToARGBRow = YUY2ToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- YUY2ToARGBRow = YUY2ToARGBRow_Unaligned_SSSE3;
- if (IS_ALIGNED(src_yuy2, 16) && IS_ALIGNED(src_stride_yuy2, 16) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- YUY2ToARGBRow = YUY2ToARGBRow_SSSE3;
- }
+ YUY2ToARGBRow = YUY2ToARGBRow_SSSE3;
}
}
-#elif defined(HAS_YUY2TOARGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+#endif
+#if defined(HAS_YUY2TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ YUY2ToARGBRow = YUY2ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ YUY2ToARGBRow = YUY2ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_YUY2TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
YUY2ToARGBRow = YUY2ToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
YUY2ToARGBRow = YUY2ToARGBRow_NEON;
@@ -905,19 +924,23 @@ int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy,
src_stride_uyvy = dst_stride_argb = 0;
}
#if defined(HAS_UYVYTOARGBROW_SSSE3)
- // Posix is 16, Windows is 8.
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
UYVYToARGBRow = UYVYToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- UYVYToARGBRow = UYVYToARGBRow_Unaligned_SSSE3;
- if (IS_ALIGNED(src_uyvy, 16) && IS_ALIGNED(src_stride_uyvy, 16) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- UYVYToARGBRow = UYVYToARGBRow_SSSE3;
- }
+ UYVYToARGBRow = UYVYToARGBRow_SSSE3;
}
}
-#elif defined(HAS_UYVYTOARGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+#endif
+#if defined(HAS_UYVYTOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ UYVYToARGBRow = UYVYToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ UYVYToARGBRow = UYVYToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_UYVYTOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
UYVYToARGBRow = UYVYToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
UYVYToARGBRow = UYVYToARGBRow_NEON;
@@ -932,6 +955,152 @@ int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy,
return 0;
}
+// Convert J420 to ARGB.
+LIBYUV_API
+int J420ToARGB(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ int y;
+ void (*J422ToARGBRow)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) = J422ToARGBRow_C;
+ if (!src_y || !src_u || !src_v || !dst_argb ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+#if defined(HAS_J422TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ J422ToARGBRow = J422ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ J422ToARGBRow = J422ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_J422TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ J422ToARGBRow = J422ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ J422ToARGBRow = J422ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_J422TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ J422ToARGBRow = J422ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ J422ToARGBRow = J422ToARGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_J422TOARGBROW_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
+ IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
+ IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
+ IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
+ IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
+ J422ToARGBRow = J422ToARGBRow_MIPS_DSPR2;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ J422ToARGBRow(src_y, src_u, src_v, dst_argb, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
+// Convert J422 to ARGB.
+LIBYUV_API
+int J422ToARGB(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ int y;
+ void (*J422ToARGBRow)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) = J422ToARGBRow_C;
+ if (!src_y || !src_u || !src_v ||
+ !dst_argb ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_y == width &&
+ src_stride_u * 2 == width &&
+ src_stride_v * 2 == width &&
+ dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0;
+ }
+#if defined(HAS_J422TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ J422ToARGBRow = J422ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ J422ToARGBRow = J422ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_J422TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ J422ToARGBRow = J422ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ J422ToARGBRow = J422ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_J422TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ J422ToARGBRow = J422ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ J422ToARGBRow = J422ToARGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_J422TOARGBROW_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
+ IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
+ IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
+ IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
+ IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
+ J422ToARGBRow = J422ToARGBRow_MIPS_DSPR2;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ J422ToARGBRow(src_y, src_u, src_v, dst_argb, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ return 0;
+}
+
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
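
convert_argb.cc gains J420ToARGB/J422ToARGB for full-range (JPEG) YUV, alongside new AVX2 rows for several converters. The only structural difference between the two new functions is chroma stepping: 4:2:0 advances the U/V rows after every odd luma row, 4:2:2 after every row. A hypothetical sketch of that stepping (RowFn and convert_planes are placeholders, not libyuv API):

#include <stdint.h>

typedef void (*RowFn)(const uint8_t* y, const uint8_t* u, const uint8_t* v,
                      uint8_t* rgb, int width);

static void convert_planes(RowFn row, int is_420,
                           const uint8_t* y, int ys,
                           const uint8_t* u, int us,
                           const uint8_t* v, int vs,
                           uint8_t* dst, int ds,
                           int width, int height) {
  for (int r = 0; r < height; ++r) {
    row(y, u, v, dst, width);
    y += ys;
    dst += ds;
    if (!is_420 || (r & 1)) {  // 4:2:0: step chroma after odd rows only,
      u += us;                 // so luma rows 0 and 1 share chroma row 0
      v += vs;
    }
  }
}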
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/convert_from.cc b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/convert_from.cc
index c1a2f62f020..b743cde264b 100644
--- a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/convert_from.cc
+++ b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/convert_from.cc
@@ -13,7 +13,6 @@
#include "libyuv/basic_types.h"
#include "libyuv/convert.h" // For I420Copy
#include "libyuv/cpu_id.h"
-#include "libyuv/format_conversion.h"
#include "libyuv/planar_functions.h"
#include "libyuv/rotate.h"
#include "libyuv/scale.h" // For ScalePlane()
@@ -174,14 +173,15 @@ int I422ToYUY2(const uint8* src_y, int src_stride_y,
src_stride_y = src_stride_u = src_stride_v = dst_stride_yuy2 = 0;
}
#if defined(HAS_I422TOYUY2ROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
+ if (TestCpuFlag(kCpuHasSSE2)) {
I422ToYUY2Row = I422ToYUY2Row_Any_SSE2;
if (IS_ALIGNED(width, 16)) {
I422ToYUY2Row = I422ToYUY2Row_SSE2;
}
}
-#elif defined(HAS_I422TOYUY2ROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
+#endif
+#if defined(HAS_I422TOYUY2ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
I422ToYUY2Row = I422ToYUY2Row_Any_NEON;
if (IS_ALIGNED(width, 16)) {
I422ToYUY2Row = I422ToYUY2Row_NEON;
@@ -220,14 +220,15 @@ int I420ToYUY2(const uint8* src_y, int src_stride_y,
dst_stride_yuy2 = -dst_stride_yuy2;
}
#if defined(HAS_I422TOYUY2ROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
+ if (TestCpuFlag(kCpuHasSSE2)) {
I422ToYUY2Row = I422ToYUY2Row_Any_SSE2;
if (IS_ALIGNED(width, 16)) {
I422ToYUY2Row = I422ToYUY2Row_SSE2;
}
}
-#elif defined(HAS_I422TOYUY2ROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
+#endif
+#if defined(HAS_I422TOYUY2ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
I422ToYUY2Row = I422ToYUY2Row_Any_NEON;
if (IS_ALIGNED(width, 16)) {
I422ToYUY2Row = I422ToYUY2Row_NEON;
@@ -280,14 +281,15 @@ int I422ToUYVY(const uint8* src_y, int src_stride_y,
src_stride_y = src_stride_u = src_stride_v = dst_stride_uyvy = 0;
}
#if defined(HAS_I422TOUYVYROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
+ if (TestCpuFlag(kCpuHasSSE2)) {
I422ToUYVYRow = I422ToUYVYRow_Any_SSE2;
if (IS_ALIGNED(width, 16)) {
I422ToUYVYRow = I422ToUYVYRow_SSE2;
}
}
-#elif defined(HAS_I422TOUYVYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
+#endif
+#if defined(HAS_I422TOUYVYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
I422ToUYVYRow = I422ToUYVYRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
I422ToUYVYRow = I422ToUYVYRow_NEON;
@@ -326,14 +328,15 @@ int I420ToUYVY(const uint8* src_y, int src_stride_y,
dst_stride_uyvy = -dst_stride_uyvy;
}
#if defined(HAS_I422TOUYVYROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
+ if (TestCpuFlag(kCpuHasSSE2)) {
I422ToUYVYRow = I422ToUYVYRow_Any_SSE2;
if (IS_ALIGNED(width, 16)) {
I422ToUYVYRow = I422ToUYVYRow_SSE2;
}
}
-#elif defined(HAS_I422TOUYVYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
+#endif
+#if defined(HAS_I422TOUYVYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
I422ToUYVYRow = I422ToUYVYRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
I422ToUYVYRow = I422ToUYVYRow_NEON;
@@ -397,20 +400,15 @@ int I420ToNV12(const uint8* src_y, int src_stride_y,
src_stride_u = src_stride_v = dst_stride_uv = 0;
}
#if defined(HAS_MERGEUVROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && halfwidth >= 16) {
+ if (TestCpuFlag(kCpuHasSSE2)) {
MergeUVRow_ = MergeUVRow_Any_SSE2;
if (IS_ALIGNED(halfwidth, 16)) {
- MergeUVRow_ = MergeUVRow_Unaligned_SSE2;
- if (IS_ALIGNED(src_u, 16) && IS_ALIGNED(src_stride_u, 16) &&
- IS_ALIGNED(src_v, 16) && IS_ALIGNED(src_stride_v, 16) &&
- IS_ALIGNED(dst_uv, 16) && IS_ALIGNED(dst_stride_uv, 16)) {
- MergeUVRow_ = MergeUVRow_SSE2;
- }
+ MergeUVRow_ = MergeUVRow_SSE2;
}
}
#endif
#if defined(HAS_MERGEUVROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && halfwidth >= 32) {
+ if (TestCpuFlag(kCpuHasAVX2)) {
MergeUVRow_ = MergeUVRow_Any_AVX2;
if (IS_ALIGNED(halfwidth, 32)) {
MergeUVRow_ = MergeUVRow_AVX2;
@@ -418,7 +416,7 @@ int I420ToNV12(const uint8* src_y, int src_stride_y,
}
#endif
#if defined(HAS_MERGEUVROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && halfwidth >= 16) {
+ if (TestCpuFlag(kCpuHasNEON)) {
MergeUVRow_ = MergeUVRow_Any_NEON;
if (IS_ALIGNED(halfwidth, 16)) {
MergeUVRow_ = MergeUVRow_NEON;
@@ -476,18 +474,15 @@ int I420ToARGB(const uint8* src_y, int src_stride_y,
dst_stride_argb = -dst_stride_argb;
}
#if defined(HAS_I422TOARGBROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
- I422ToARGBRow = I422ToARGBRow_Unaligned_SSSE3;
- if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- I422ToARGBRow = I422ToARGBRow_SSSE3;
- }
+ I422ToARGBRow = I422ToARGBRow_SSSE3;
}
}
#endif
#if defined(HAS_I422TOARGBROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && width >= 16) {
+ if (TestCpuFlag(kCpuHasAVX2)) {
I422ToARGBRow = I422ToARGBRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
I422ToARGBRow = I422ToARGBRow_AVX2;
@@ -495,7 +490,7 @@ int I420ToARGB(const uint8* src_y, int src_stride_y,
}
#endif
#if defined(HAS_I422TOARGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ if (TestCpuFlag(kCpuHasNEON)) {
I422ToARGBRow = I422ToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
I422ToARGBRow = I422ToARGBRow_NEON;
@@ -548,23 +543,30 @@ int I420ToBGRA(const uint8* src_y, int src_stride_y,
dst_stride_bgra = -dst_stride_bgra;
}
#if defined(HAS_I422TOBGRAROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
I422ToBGRARow = I422ToBGRARow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
- I422ToBGRARow = I422ToBGRARow_Unaligned_SSSE3;
- if (IS_ALIGNED(dst_bgra, 16) && IS_ALIGNED(dst_stride_bgra, 16)) {
- I422ToBGRARow = I422ToBGRARow_SSSE3;
- }
+ I422ToBGRARow = I422ToBGRARow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TOBGRAROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToBGRARow = I422ToBGRARow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToBGRARow = I422ToBGRARow_AVX2;
}
}
-#elif defined(HAS_I422TOBGRAROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+#endif
+#if defined(HAS_I422TOBGRAROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
I422ToBGRARow = I422ToBGRARow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
I422ToBGRARow = I422ToBGRARow_NEON;
}
}
-#elif defined(HAS_I422TOBGRAROW_MIPS_DSPR2)
+#endif
+#if defined(HAS_I422TOBGRAROW_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
@@ -610,17 +612,23 @@ int I420ToABGR(const uint8* src_y, int src_stride_y,
dst_stride_abgr = -dst_stride_abgr;
}
#if defined(HAS_I422TOABGRROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
I422ToABGRRow = I422ToABGRRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
- I422ToABGRRow = I422ToABGRRow_Unaligned_SSSE3;
- if (IS_ALIGNED(dst_abgr, 16) && IS_ALIGNED(dst_stride_abgr, 16)) {
- I422ToABGRRow = I422ToABGRRow_SSSE3;
- }
+ I422ToABGRRow = I422ToABGRRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TOABGRROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToABGRRow = I422ToABGRRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToABGRRow = I422ToABGRRow_AVX2;
}
}
-#elif defined(HAS_I422TOABGRROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+#endif
+#if defined(HAS_I422TOABGRROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
I422ToABGRRow = I422ToABGRRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
I422ToABGRRow = I422ToABGRRow_NEON;
@@ -664,17 +672,23 @@ int I420ToRGBA(const uint8* src_y, int src_stride_y,
dst_stride_rgba = -dst_stride_rgba;
}
#if defined(HAS_I422TORGBAROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
I422ToRGBARow = I422ToRGBARow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
- I422ToRGBARow = I422ToRGBARow_Unaligned_SSSE3;
- if (IS_ALIGNED(dst_rgba, 16) && IS_ALIGNED(dst_stride_rgba, 16)) {
- I422ToRGBARow = I422ToRGBARow_SSSE3;
- }
+ I422ToRGBARow = I422ToRGBARow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGBAROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToRGBARow = I422ToRGBARow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToRGBARow = I422ToRGBARow_AVX2;
}
}
-#elif defined(HAS_I422TORGBAROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+#endif
+#if defined(HAS_I422TORGBAROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
I422ToRGBARow = I422ToRGBARow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
I422ToRGBARow = I422ToRGBARow_NEON;
@@ -718,14 +732,15 @@ int I420ToRGB24(const uint8* src_y, int src_stride_y,
dst_stride_rgb24 = -dst_stride_rgb24;
}
#if defined(HAS_I422TORGB24ROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
I422ToRGB24Row = I422ToRGB24Row_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
I422ToRGB24Row = I422ToRGB24Row_SSSE3;
}
}
-#elif defined(HAS_I422TORGB24ROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+#endif
+#if defined(HAS_I422TORGB24ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
I422ToRGB24Row = I422ToRGB24Row_Any_NEON;
if (IS_ALIGNED(width, 8)) {
I422ToRGB24Row = I422ToRGB24Row_NEON;
@@ -769,14 +784,15 @@ int I420ToRAW(const uint8* src_y, int src_stride_y,
dst_stride_raw = -dst_stride_raw;
}
#if defined(HAS_I422TORAWROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
I422ToRAWRow = I422ToRAWRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
I422ToRAWRow = I422ToRAWRow_SSSE3;
}
}
-#elif defined(HAS_I422TORAWROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+#endif
+#if defined(HAS_I422TORAWROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
I422ToRAWRow = I422ToRAWRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
I422ToRAWRow = I422ToRAWRow_NEON;
@@ -820,14 +836,23 @@ int I420ToARGB1555(const uint8* src_y, int src_stride_y,
dst_stride_argb1555 = -dst_stride_argb1555;
}
#if defined(HAS_I422TOARGB1555ROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
I422ToARGB1555Row = I422ToARGB1555Row_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
I422ToARGB1555Row = I422ToARGB1555Row_SSSE3;
}
}
-#elif defined(HAS_I422TOARGB1555ROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+#endif
+#if defined(HAS_I422TOARGB1555ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToARGB1555Row = I422ToARGB1555Row_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToARGB1555Row = I422ToARGB1555Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGB1555ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
I422ToARGB1555Row = I422ToARGB1555Row_Any_NEON;
if (IS_ALIGNED(width, 8)) {
I422ToARGB1555Row = I422ToARGB1555Row_NEON;
@@ -872,14 +897,23 @@ int I420ToARGB4444(const uint8* src_y, int src_stride_y,
dst_stride_argb4444 = -dst_stride_argb4444;
}
#if defined(HAS_I422TOARGB4444ROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
I422ToARGB4444Row = I422ToARGB4444Row_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
I422ToARGB4444Row = I422ToARGB4444Row_SSSE3;
}
}
-#elif defined(HAS_I422TOARGB4444ROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+#endif
+#if defined(HAS_I422TOARGB4444ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToARGB4444Row = I422ToARGB4444Row_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToARGB4444Row = I422ToARGB4444Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGB4444ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
I422ToARGB4444Row = I422ToARGB4444Row_Any_NEON;
if (IS_ALIGNED(width, 8)) {
I422ToARGB4444Row = I422ToARGB4444Row_NEON;
@@ -923,14 +957,23 @@ int I420ToRGB565(const uint8* src_y, int src_stride_y,
dst_stride_rgb565 = -dst_stride_rgb565;
}
#if defined(HAS_I422TORGB565ROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
I422ToRGB565Row = I422ToRGB565Row_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
I422ToRGB565Row = I422ToRGB565Row_SSSE3;
}
}
-#elif defined(HAS_I422TORGB565ROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+#endif
+#if defined(HAS_I422TORGB565ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToRGB565Row = I422ToRGB565Row_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToRGB565Row = I422ToRGB565Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGB565ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
I422ToRGB565Row = I422ToRGB565Row_Any_NEON;
if (IS_ALIGNED(width, 8)) {
I422ToRGB565Row = I422ToRGB565Row_NEON;
@@ -1054,38 +1097,6 @@ int ConvertFromI420(const uint8* y, int y_stride,
dst_sample_stride ? dst_sample_stride : width * 4,
width, height);
break;
- case FOURCC_BGGR:
- r = I420ToBayerBGGR(y, y_stride,
- u, u_stride,
- v, v_stride,
- dst_sample,
- dst_sample_stride ? dst_sample_stride : width,
- width, height);
- break;
- case FOURCC_GBRG:
- r = I420ToBayerGBRG(y, y_stride,
- u, u_stride,
- v, v_stride,
- dst_sample,
- dst_sample_stride ? dst_sample_stride : width,
- width, height);
- break;
- case FOURCC_GRBG:
- r = I420ToBayerGRBG(y, y_stride,
- u, u_stride,
- v, v_stride,
- dst_sample,
- dst_sample_stride ? dst_sample_stride : width,
- width, height);
- break;
- case FOURCC_RGGB:
- r = I420ToBayerRGGB(y, y_stride,
- u, u_stride,
- v, v_stride,
- dst_sample,
- dst_sample_stride ? dst_sample_stride : width,
- width, height);
- break;
case FOURCC_I400:
r = I400Copy(y, y_stride,
dst_sample,
@@ -1116,7 +1127,7 @@ int ConvertFromI420(const uint8* y, int y_stride,
width, height);
break;
}
- // TODO(fbarchard): Add M420 and Q420.
+ // TODO(fbarchard): Add M420.
// Triplanar formats
// TODO(fbarchard): halfstride instead of halfwidth
case FOURCC_I420:
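
convert_from.cc drops the Bayer (FOURCC_BGGR/GBRG/GRBG/RGGB) output paths along with the format_conversion.h include, and its context lines show the row-coalescing guard ("src_stride_... = 0") that several converters rely on. A hypothetical self-contained sketch of that trick (coalesce and its parameters are illustrative):

void coalesce(int* width, int* height, int* stride_y, int* stride_argb) {
  // When each plane is stored contiguously (stride == row length in bytes),
  // the whole image can be processed as one very wide row; the strides are
  // then never used to advance a pointer, so zeroing them is safe.
  if (*stride_y == *width && *stride_argb == *width * 4) {
    *width *= *height;
    *height = 1;
    *stride_y = 0;
    *stride_argb = 0;
  }
}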
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/convert_from_argb.cc b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/convert_from_argb.cc
index de461ddb046..dc2186a6a08 100644
--- a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/convert_from_argb.cc
+++ b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/convert_from_argb.cc
@@ -12,7 +12,6 @@
#include "libyuv/basic_types.h"
#include "libyuv/cpu_id.h"
-#include "libyuv/format_conversion.h"
#include "libyuv/planar_functions.h"
#include "libyuv/row.h"
@@ -51,17 +50,15 @@ int ARGBToI444(const uint8* src_argb, int src_stride_argb,
src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0;
}
#if defined(HAS_ARGBTOUV444ROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUV444Row = ARGBToUV444Row_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- ARGBToUV444Row = ARGBToUV444Row_Unaligned_SSSE3;
- if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
- ARGBToUV444Row = ARGBToUV444Row_SSSE3;
- }
+ ARGBToUV444Row = ARGBToUV444Row_SSSE3;
}
}
-#elif defined(HAS_ARGBTOUV444ROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+#endif
+#if defined(HAS_ARGBTOUV444ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
ARGBToUV444Row = ARGBToUV444Row_Any_NEON;
if (IS_ALIGNED(width, 8)) {
ARGBToUV444Row = ARGBToUV444Row_NEON;
@@ -69,19 +66,16 @@ int ARGBToI444(const uint8* src_argb, int src_stride_argb,
}
#endif
#if defined(HAS_ARGBTOYROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
- if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
- IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- ARGBToYRow = ARGBToYRow_SSSE3;
- }
+ ARGBToYRow = ARGBToYRow_SSSE3;
}
}
-#elif defined(HAS_ARGBTOYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+#endif
+#if defined(HAS_ARGBTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
ARGBToYRow = ARGBToYRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
ARGBToYRow = ARGBToYRow_NEON;
@@ -130,17 +124,15 @@ int ARGBToI422(const uint8* src_argb, int src_stride_argb,
src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0;
}
#if defined(HAS_ARGBTOUV422ROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUV422Row = ARGBToUV422Row_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- ARGBToUV422Row = ARGBToUV422Row_Unaligned_SSSE3;
- if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
- ARGBToUV422Row = ARGBToUV422Row_SSSE3;
- }
+ ARGBToUV422Row = ARGBToUV422Row_SSSE3;
}
}
-#elif defined(HAS_ARGBTOUV422ROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
+#endif
+#if defined(HAS_ARGBTOUV422ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
ARGBToUV422Row = ARGBToUV422Row_Any_NEON;
if (IS_ALIGNED(width, 16)) {
ARGBToUV422Row = ARGBToUV422Row_NEON;
@@ -149,18 +141,15 @@ int ARGBToI422(const uint8* src_argb, int src_stride_argb,
#endif
#if defined(HAS_ARGBTOYROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
- if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
- IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- ARGBToYRow = ARGBToYRow_SSSE3;
- }
+ ARGBToYRow = ARGBToYRow_SSSE3;
}
}
-#elif defined(HAS_ARGBTOYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+#endif
+#if defined(HAS_ARGBTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
ARGBToYRow = ARGBToYRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
ARGBToYRow = ARGBToYRow_NEON;
@@ -209,19 +198,15 @@ int ARGBToI411(const uint8* src_argb, int src_stride_argb,
src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0;
}
#if defined(HAS_ARGBTOYROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
- if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
- IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- ARGBToYRow = ARGBToYRow_SSSE3;
- }
+ ARGBToYRow = ARGBToYRow_SSSE3;
}
}
#endif
#if defined(HAS_ARGBTOYROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
+ if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToYRow = ARGBToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ARGBToYRow = ARGBToYRow_AVX2;
@@ -229,7 +214,7 @@ int ARGBToI411(const uint8* src_argb, int src_stride_argb,
}
#endif
#if defined(HAS_ARGBTOYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ if (TestCpuFlag(kCpuHasNEON)) {
ARGBToYRow = ARGBToYRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
ARGBToYRow = ARGBToYRow_NEON;
@@ -237,7 +222,7 @@ int ARGBToI411(const uint8* src_argb, int src_stride_argb,
}
#endif
#if defined(HAS_ARGBTOUV411ROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 32) {
+ if (TestCpuFlag(kCpuHasNEON)) {
ARGBToUV411Row = ARGBToUV411Row_Any_NEON;
if (IS_ALIGNED(width, 32)) {
ARGBToUV411Row = ARGBToUV411Row_NEON;
@@ -281,22 +266,17 @@ int ARGBToNV12(const uint8* src_argb, int src_stride_argb,
src_stride_argb = -src_stride_argb;
}
#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_Unaligned_SSSE3;
- ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
- if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
- ARGBToUVRow = ARGBToUVRow_SSSE3;
- if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- ARGBToYRow = ARGBToYRow_SSSE3;
- }
- }
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ ARGBToYRow = ARGBToYRow_SSSE3;
}
}
-#elif defined(HAS_ARGBTOYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+#endif
+#if defined(HAS_ARGBTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
ARGBToYRow = ARGBToYRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
ARGBToYRow = ARGBToYRow_NEON;
@@ -304,7 +284,7 @@ int ARGBToNV12(const uint8* src_argb, int src_stride_argb,
}
#endif
#if defined(HAS_ARGBTOUVROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
+ if (TestCpuFlag(kCpuHasNEON)) {
ARGBToUVRow = ARGBToUVRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_NEON;
@@ -312,18 +292,15 @@ int ARGBToNV12(const uint8* src_argb, int src_stride_argb,
}
#endif
#if defined(HAS_MERGEUVROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && halfwidth >= 16) {
+ if (TestCpuFlag(kCpuHasSSE2)) {
MergeUVRow_ = MergeUVRow_Any_SSE2;
if (IS_ALIGNED(halfwidth, 16)) {
- MergeUVRow_ = MergeUVRow_Unaligned_SSE2;
- if (IS_ALIGNED(dst_uv, 16) && IS_ALIGNED(dst_stride_uv, 16)) {
- MergeUVRow_ = MergeUVRow_SSE2;
- }
+ MergeUVRow_ = MergeUVRow_SSE2;
}
}
#endif
#if defined(HAS_MERGEUVROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && halfwidth >= 32) {
+ if (TestCpuFlag(kCpuHasAVX2)) {
MergeUVRow_ = MergeUVRow_Any_AVX2;
if (IS_ALIGNED(halfwidth, 32)) {
MergeUVRow_ = MergeUVRow_AVX2;
@@ -331,7 +308,7 @@ int ARGBToNV12(const uint8* src_argb, int src_stride_argb,
}
#endif
#if defined(HAS_MERGEUVROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && halfwidth >= 16) {
+ if (TestCpuFlag(kCpuHasNEON)) {
MergeUVRow_ = MergeUVRow_Any_NEON;
if (IS_ALIGNED(halfwidth, 16)) {
MergeUVRow_ = MergeUVRow_NEON;
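
ARGBToNV12 fills the biplanar output in two passes per row pair: ARGBToUVRow produces planar U and V, then MergeUVRow_ interleaves them into the NV12 UV plane. The behaviour the SSE2/AVX2/NEON merge rows accelerate is, conceptually (a sketch, not the libyuv source):

    #include <stdint.h>

    // Interleave one row of planar U and V into NV12-style UV pairs.
    static void MergeUVRowSketch(const uint8_t* src_u, const uint8_t* src_v,
                                 uint8_t* dst_uv, int width) {
      int x;
      for (x = 0; x < width; ++x) {
        dst_uv[2 * x + 0] = src_u[x];  // U first for NV12...
        dst_uv[2 * x + 1] = src_v[x];  // ...then V; NV21 swaps the planes
      }
    }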
@@ -388,22 +365,17 @@ int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
src_stride_argb = -src_stride_argb;
}
#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_Unaligned_SSSE3;
- ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
- if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
- ARGBToUVRow = ARGBToUVRow_SSSE3;
- if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- ARGBToYRow = ARGBToYRow_SSSE3;
- }
- }
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ ARGBToYRow = ARGBToYRow_SSSE3;
}
}
-#elif defined(HAS_ARGBTOYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+#endif
+#if defined(HAS_ARGBTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
ARGBToYRow = ARGBToYRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
ARGBToYRow = ARGBToYRow_NEON;
@@ -411,7 +383,7 @@ int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
}
#endif
#if defined(HAS_ARGBTOUVROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
+ if (TestCpuFlag(kCpuHasNEON)) {
ARGBToUVRow = ARGBToUVRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_NEON;
@@ -419,18 +391,15 @@ int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
}
#endif
#if defined(HAS_MERGEUVROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && halfwidth >= 16) {
+ if (TestCpuFlag(kCpuHasSSE2)) {
MergeUVRow_ = MergeUVRow_Any_SSE2;
if (IS_ALIGNED(halfwidth, 16)) {
- MergeUVRow_ = MergeUVRow_Unaligned_SSE2;
- if (IS_ALIGNED(dst_uv, 16) && IS_ALIGNED(dst_stride_uv, 16)) {
- MergeUVRow_ = MergeUVRow_SSE2;
- }
+ MergeUVRow_ = MergeUVRow_SSE2;
}
}
#endif
#if defined(HAS_MERGEUVROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && halfwidth >= 32) {
+ if (TestCpuFlag(kCpuHasAVX2)) {
MergeUVRow_ = MergeUVRow_Any_AVX2;
if (IS_ALIGNED(halfwidth, 32)) {
MergeUVRow_ = MergeUVRow_AVX2;
@@ -438,7 +407,7 @@ int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
}
#endif
#if defined(HAS_MERGEUVROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && halfwidth >= 16) {
+ if (TestCpuFlag(kCpuHasNEON)) {
MergeUVRow_ = MergeUVRow_Any_NEON;
if (IS_ALIGNED(halfwidth, 16)) {
MergeUVRow_ = MergeUVRow_NEON;
@@ -500,17 +469,15 @@ int ARGBToYUY2(const uint8* src_argb, int src_stride_argb,
src_stride_argb = dst_stride_yuy2 = 0;
}
#if defined(HAS_ARGBTOUV422ROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUV422Row = ARGBToUV422Row_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- ARGBToUV422Row = ARGBToUV422Row_Unaligned_SSSE3;
- if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
- ARGBToUV422Row = ARGBToUV422Row_SSSE3;
- }
+ ARGBToUV422Row = ARGBToUV422Row_SSSE3;
}
}
-#elif defined(HAS_ARGBTOUV422ROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
+#endif
+#if defined(HAS_ARGBTOUV422ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
ARGBToUV422Row = ARGBToUV422Row_Any_NEON;
if (IS_ALIGNED(width, 16)) {
ARGBToUV422Row = ARGBToUV422Row_NEON;
@@ -518,17 +485,15 @@ int ARGBToYUY2(const uint8* src_argb, int src_stride_argb,
}
#endif
#if defined(HAS_ARGBTOYROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
- if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
- ARGBToYRow = ARGBToYRow_SSSE3;
- }
+ ARGBToYRow = ARGBToYRow_SSSE3;
}
}
-#elif defined(HAS_ARGBTOYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+#endif
+#if defined(HAS_ARGBTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
ARGBToYRow = ARGBToYRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
ARGBToYRow = ARGBToYRow_NEON;
@@ -537,14 +502,15 @@ int ARGBToYUY2(const uint8* src_argb, int src_stride_argb,
#endif
#if defined(HAS_I422TOYUY2ROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
+ if (TestCpuFlag(kCpuHasSSE2)) {
I422ToYUY2Row = I422ToYUY2Row_Any_SSE2;
if (IS_ALIGNED(width, 16)) {
I422ToYUY2Row = I422ToYUY2Row_SSE2;
}
}
-#elif defined(HAS_I422TOYUY2ROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
+#endif
+#if defined(HAS_I422TOYUY2ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
I422ToYUY2Row = I422ToYUY2Row_Any_NEON;
if (IS_ALIGNED(width, 16)) {
I422ToYUY2Row = I422ToYUY2Row_NEON;
@@ -602,17 +568,15 @@ int ARGBToUYVY(const uint8* src_argb, int src_stride_argb,
src_stride_argb = dst_stride_uyvy = 0;
}
#if defined(HAS_ARGBTOUV422ROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUV422Row = ARGBToUV422Row_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- ARGBToUV422Row = ARGBToUV422Row_Unaligned_SSSE3;
- if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
- ARGBToUV422Row = ARGBToUV422Row_SSSE3;
- }
+ ARGBToUV422Row = ARGBToUV422Row_SSSE3;
}
}
-#elif defined(HAS_ARGBTOUV422ROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
+#endif
+#if defined(HAS_ARGBTOUV422ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
ARGBToUV422Row = ARGBToUV422Row_Any_NEON;
if (IS_ALIGNED(width, 16)) {
ARGBToUV422Row = ARGBToUV422Row_NEON;
@@ -620,17 +584,15 @@ int ARGBToUYVY(const uint8* src_argb, int src_stride_argb,
}
#endif
#if defined(HAS_ARGBTOYROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
- if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
- ARGBToYRow = ARGBToYRow_SSSE3;
- }
+ ARGBToYRow = ARGBToYRow_SSSE3;
}
}
-#elif defined(HAS_ARGBTOYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+#endif
+#if defined(HAS_ARGBTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
ARGBToYRow = ARGBToYRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
ARGBToYRow = ARGBToYRow_NEON;
@@ -639,14 +601,15 @@ int ARGBToUYVY(const uint8* src_argb, int src_stride_argb,
#endif
#if defined(HAS_I422TOUYVYROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
+ if (TestCpuFlag(kCpuHasSSE2)) {
I422ToUYVYRow = I422ToUYVYRow_Any_SSE2;
if (IS_ALIGNED(width, 16)) {
I422ToUYVYRow = I422ToUYVYRow_SSE2;
}
}
-#elif defined(HAS_I422TOUYVYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
+#endif
+#if defined(HAS_I422TOUYVYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
I422ToUYVYRow = I422ToUYVYRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
I422ToUYVYRow = I422ToUYVYRow_NEON;
@@ -697,19 +660,15 @@ int ARGBToI400(const uint8* src_argb, int src_stride_argb,
src_stride_argb = dst_stride_y = 0;
}
#if defined(HAS_ARGBTOYROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
- if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
- IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- ARGBToYRow = ARGBToYRow_SSSE3;
- }
+ ARGBToYRow = ARGBToYRow_SSSE3;
}
}
#endif
#if defined(HAS_ARGBTOYROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
+ if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToYRow = ARGBToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ARGBToYRow = ARGBToYRow_AVX2;
@@ -717,7 +676,7 @@ int ARGBToI400(const uint8* src_argb, int src_stride_argb,
}
#endif
#if defined(HAS_ARGBTOYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ if (TestCpuFlag(kCpuHasNEON)) {
ARGBToYRow = ARGBToYRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
ARGBToYRow = ARGBToYRow_NEON;
@@ -773,14 +732,15 @@ int ARGBToRGB24(const uint8* src_argb, int src_stride_argb,
src_stride_argb = dst_stride_rgb24 = 0;
}
#if defined(HAS_ARGBTORGB24ROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToRGB24Row = ARGBToRGB24Row_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToRGB24Row = ARGBToRGB24Row_SSSE3;
}
}
-#elif defined(HAS_ARGBTORGB24ROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+#endif
+#if defined(HAS_ARGBTORGB24ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
ARGBToRGB24Row = ARGBToRGB24Row_Any_NEON;
if (IS_ALIGNED(width, 8)) {
ARGBToRGB24Row = ARGBToRGB24Row_NEON;
@@ -820,14 +780,15 @@ int ARGBToRAW(const uint8* src_argb, int src_stride_argb,
src_stride_argb = dst_stride_raw = 0;
}
#if defined(HAS_ARGBTORAWROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToRAWRow = ARGBToRAWRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToRAWRow = ARGBToRAWRow_SSSE3;
}
}
-#elif defined(HAS_ARGBTORAWROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+#endif
+#if defined(HAS_ARGBTORAWROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
ARGBToRAWRow = ARGBToRAWRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
ARGBToRAWRow = ARGBToRAWRow_NEON;
@@ -843,6 +804,46 @@ int ARGBToRAW(const uint8* src_argb, int src_stride_argb,
return 0;
}
+static const uint8 kDither8x8[64] = {
+ 0, 128, 32, 160, 8, 136, 40, 168,
+ 192, 64, 224, 96, 200, 72, 232, 104,
+ 48, 176, 16, 144, 56, 184, 24, 152,
+ 240, 112, 208, 80, 248, 120, 216, 88,
+ 12, 140, 44, 172, 4, 132, 36, 164,
+ 204, 76, 236, 108, 196, 68, 228, 100,
+ 60, 188, 28, 156, 52, 180, 20, 148,
+ 252, 124, 220, 92, 244, 116, 212, 84,
+};
+
+// Convert ARGB To RGB565 with 8x8 dither matrix (64 bytes).
+LIBYUV_API
+int ARGBToRGB565Dither(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_rgb565, int dst_stride_rgb565,
+ const uint8* dither8x8, int width, int height) {
+ int y;
+ void (*ARGBToRGB565DitherRow)(const uint8* src_argb, uint8* dst_rgb,
+ const uint8* dither8x8, int pix) = ARGBToRGB565DitherRow_C;
+ if (!src_argb || !dst_rgb565 || width <= 0 || height == 0) {
+ return -1;
+ }
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ if (!dither8x8) {
+    dither8x8 = kDither8x8;
+  }
+ for (y = 0; y < height; ++y) {
+ ARGBToRGB565DitherRow(src_argb, dst_rgb565,
+ dither8x8 + ((y & 7) << 3), width);
+ src_argb += src_stride_argb;
+ dst_rgb565 += dst_stride_rgb565;
+ }
+ return 0;
+}
+
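The new ARGBToRGB565Dither applies ordered dithering before truncating 8-bit channels to 5:6:5: row y adds the eight thresholds at dither8x8 + ((y & 7) << 3), and a NULL table falls back to the built-in kDither8x8 above. A caller sketch (the prototype is restated with standard types; buffer sizes are illustrative):

    #include <stddef.h>
    #include <stdint.h>

    // Restated prototype of the function added above (uint8 == uint8_t).
    int ARGBToRGB565Dither(const uint8_t* src_argb, int src_stride_argb,
                           uint8_t* dst_rgb565, int dst_stride_rgb565,
                           const uint8_t* dither8x8, int width, int height);

    enum { kW = 64, kH = 48 };
    static uint8_t argb[kW * kH * 4];    // 4 bytes/pixel, BGRA byte order
    static uint8_t rgb565[kW * kH * 2];  // 2 bytes/pixel

    int ConvertWithDither(void) {
      // NULL selects the built-in 8x8 ordered-dither matrix.
      return ARGBToRGB565Dither(argb, kW * 4,
                                rgb565, kW * 2,
                                NULL, kW, kH);
    }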
// Convert ARGB To RGB565.
LIBYUV_API
int ARGBToRGB565(const uint8* src_argb, int src_stride_argb,
@@ -867,15 +868,23 @@ int ARGBToRGB565(const uint8* src_argb, int src_stride_argb,
src_stride_argb = dst_stride_rgb565 = 0;
}
#if defined(HAS_ARGBTORGB565ROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && width >= 4 &&
- IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
+ if (TestCpuFlag(kCpuHasSSE2)) {
ARGBToRGB565Row = ARGBToRGB565Row_Any_SSE2;
if (IS_ALIGNED(width, 4)) {
ARGBToRGB565Row = ARGBToRGB565Row_SSE2;
}
}
-#elif defined(HAS_ARGBTORGB565ROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+#endif
+#if defined(HAS_ARGBTORGB565ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToRGB565Row = ARGBToRGB565Row_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToRGB565Row = ARGBToRGB565Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTORGB565ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
ARGBToRGB565Row = ARGBToRGB565Row_Any_NEON;
if (IS_ALIGNED(width, 8)) {
ARGBToRGB565Row = ARGBToRGB565Row_NEON;
@@ -915,15 +924,23 @@ int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb,
src_stride_argb = dst_stride_argb1555 = 0;
}
#if defined(HAS_ARGBTOARGB1555ROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && width >= 4 &&
- IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
+ if (TestCpuFlag(kCpuHasSSE2)) {
ARGBToARGB1555Row = ARGBToARGB1555Row_Any_SSE2;
if (IS_ALIGNED(width, 4)) {
ARGBToARGB1555Row = ARGBToARGB1555Row_SSE2;
}
}
-#elif defined(HAS_ARGBTOARGB1555ROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+#endif
+#if defined(HAS_ARGBTOARGB1555ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToARGB1555Row = ARGBToARGB1555Row_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToARGB1555Row = ARGBToARGB1555Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOARGB1555ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
ARGBToARGB1555Row = ARGBToARGB1555Row_Any_NEON;
if (IS_ALIGNED(width, 8)) {
ARGBToARGB1555Row = ARGBToARGB1555Row_NEON;
@@ -963,15 +980,23 @@ int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb,
src_stride_argb = dst_stride_argb4444 = 0;
}
#if defined(HAS_ARGBTOARGB4444ROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && width >= 4 &&
- IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
+ if (TestCpuFlag(kCpuHasSSE2)) {
ARGBToARGB4444Row = ARGBToARGB4444Row_Any_SSE2;
if (IS_ALIGNED(width, 4)) {
ARGBToARGB4444Row = ARGBToARGB4444Row_SSE2;
}
}
-#elif defined(HAS_ARGBTOARGB4444ROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+#endif
+#if defined(HAS_ARGBTOARGB4444ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBToARGB4444Row = ARGBToARGB4444Row_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToARGB4444Row = ARGBToARGB4444Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOARGB4444ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
ARGBToARGB4444Row = ARGBToARGB4444Row_Any_NEON;
if (IS_ALIGNED(width, 8)) {
ARGBToARGB4444Row = ARGBToARGB4444Row_NEON;
@@ -1011,23 +1036,17 @@ int ARGBToJ420(const uint8* src_argb, int src_stride_argb,
src_stride_argb = -src_stride_argb;
}
#if defined(HAS_ARGBTOYJROW_SSSE3) && defined(HAS_ARGBTOUVJROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3;
ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- ARGBToUVJRow = ARGBToUVJRow_Unaligned_SSSE3;
- ARGBToYJRow = ARGBToYJRow_Unaligned_SSSE3;
- if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
- ARGBToUVJRow = ARGBToUVJRow_SSSE3;
- if (IS_ALIGNED(dst_yj, 16) && IS_ALIGNED(dst_stride_yj, 16)) {
- ARGBToYJRow = ARGBToYJRow_SSSE3;
- }
- }
+ ARGBToUVJRow = ARGBToUVJRow_SSSE3;
+ ARGBToYJRow = ARGBToYJRow_SSSE3;
}
}
#endif
#if defined(HAS_ARGBTOYJROW_AVX2) && defined(HAS_ARGBTOUVJROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
+ if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToYJRow = ARGBToYJRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ARGBToYJRow = ARGBToYJRow_AVX2;
@@ -1035,7 +1054,7 @@ int ARGBToJ420(const uint8* src_argb, int src_stride_argb,
}
#endif
#if defined(HAS_ARGBTOYJROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ if (TestCpuFlag(kCpuHasNEON)) {
ARGBToYJRow = ARGBToYJRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
ARGBToYJRow = ARGBToYJRow_NEON;
@@ -1043,7 +1062,7 @@ int ARGBToJ420(const uint8* src_argb, int src_stride_argb,
}
#endif
#if defined(HAS_ARGBTOUVJROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
+ if (TestCpuFlag(kCpuHasNEON)) {
ARGBToUVJRow = ARGBToUVJRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
ARGBToUVJRow = ARGBToUVJRow_NEON;
@@ -1067,6 +1086,80 @@ int ARGBToJ420(const uint8* src_argb, int src_stride_argb,
return 0;
}
+// ARGB little endian (bgra in memory) to J422
+LIBYUV_API
+int ARGBToJ422(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ int y;
+ void (*ARGBToUVJ422Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+ int pix) = ARGBToUVJ422Row_C;
+ void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+ ARGBToYJRow_C;
+ if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
+ return -1;
+ }
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_argb == width * 4 &&
+ dst_stride_y == width &&
+ dst_stride_u * 2 == width &&
+ dst_stride_v * 2 == width) {
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0;
+ }
+#if defined(HAS_ARGBTOUVJ422ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToUVJ422Row = ARGBToUVJ422Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVJ422Row = ARGBToUVJ422Row_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVJ422ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToUVJ422Row = ARGBToUVJ422Row_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVJ422Row = ARGBToUVJ422Row_NEON;
+ }
+ }
+#endif
+
+#if defined(HAS_ARGBTOYJROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYJRow = ARGBToYJRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYJROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBToYJRow = ARGBToYJRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYJRow = ARGBToYJRow_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ ARGBToUVJ422Row(src_argb, dst_u, dst_v, width);
+ ARGBToYJRow(src_argb, dst_y, width);
+ src_argb += src_stride_argb;
+ dst_y += dst_stride_y;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ return 0;
+}
+
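ARGBToJ422 produces full-range ("J", JPEG-style) YUV with 4:2:2 subsampling: one U and one V sample per 2x1 pixel pair, so the chroma planes are half width but full height. A caller sketch showing the plane sizes (the prototype is restated with standard types; ConvertFrame is a hypothetical helper):

    #include <stdint.h>
    #include <stdlib.h>

    int ARGBToJ422(const uint8_t* src_argb, int src_stride_argb,
                   uint8_t* dst_y, int dst_stride_y,
                   uint8_t* dst_u, int dst_stride_u,
                   uint8_t* dst_v, int dst_stride_v,
                   int width, int height);

    int ConvertFrame(const uint8_t* argb, int width, int height) {
      int halfwidth = (width + 1) / 2;
      uint8_t* y = (uint8_t*)malloc((size_t)width * height);
      uint8_t* u = (uint8_t*)malloc((size_t)halfwidth * height);  // half width,
      uint8_t* v = (uint8_t*)malloc((size_t)halfwidth * height);  // full height
      int r = ARGBToJ422(argb, width * 4, y, width,
                         u, halfwidth, v, halfwidth, width, height);
      free(y); free(u); free(v);
      return r;
    }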
// Convert ARGB to J400.
LIBYUV_API
int ARGBToJ400(const uint8* src_argb, int src_stride_argb,
@@ -1091,19 +1184,15 @@ int ARGBToJ400(const uint8* src_argb, int src_stride_argb,
src_stride_argb = dst_stride_yj = 0;
}
#if defined(HAS_ARGBTOYJROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
- ARGBToYJRow = ARGBToYJRow_Unaligned_SSSE3;
- if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
- IS_ALIGNED(dst_yj, 16) && IS_ALIGNED(dst_stride_yj, 16)) {
- ARGBToYJRow = ARGBToYJRow_SSSE3;
- }
+ ARGBToYJRow = ARGBToYJRow_SSSE3;
}
}
#endif
#if defined(HAS_ARGBTOYJROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
+ if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToYJRow = ARGBToYJRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ARGBToYJRow = ARGBToYJRow_AVX2;
@@ -1111,7 +1200,7 @@ int ARGBToJ400(const uint8* src_argb, int src_stride_argb,
}
#endif
#if defined(HAS_ARGBTOYJROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ if (TestCpuFlag(kCpuHasNEON)) {
ARGBToYJRow = ARGBToYJRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
ARGBToYJRow = ARGBToYJRow_NEON;
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/convert_to_argb.cc b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/convert_to_argb.cc
index 1b228a7b4d9..af829fbd32b 100644
--- a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/convert_to_argb.cc
+++ b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/convert_to_argb.cc
@@ -11,7 +11,6 @@
#include "libyuv/convert_argb.h"
#include "libyuv/cpu_id.h"
-#include "libyuv/format_conversion.h"
#ifdef HAVE_JPEG
#include "libyuv/mjpeg_decoder.h"
#endif
@@ -144,36 +143,6 @@ int ConvertToARGB(const uint8* sample, size_t sample_size,
crop_argb, argb_stride,
crop_width, inv_crop_height);
break;
- // TODO(fbarchard): Support cropping Bayer by odd numbers
- // by adjusting fourcc.
- case FOURCC_BGGR:
- src = sample + (src_width * crop_y + crop_x);
- r = BayerBGGRToARGB(src, src_width,
- crop_argb, argb_stride,
- crop_width, inv_crop_height);
- break;
-
- case FOURCC_GBRG:
- src = sample + (src_width * crop_y + crop_x);
- r = BayerGBRGToARGB(src, src_width,
- crop_argb, argb_stride,
- crop_width, inv_crop_height);
- break;
-
- case FOURCC_GRBG:
- src = sample + (src_width * crop_y + crop_x);
- r = BayerGRBGToARGB(src, src_width,
- crop_argb, argb_stride,
- crop_width, inv_crop_height);
- break;
-
- case FOURCC_RGGB:
- src = sample + (src_width * crop_y + crop_x);
- r = BayerRGGBToARGB(src, src_width,
- crop_argb, argb_stride,
- crop_width, inv_crop_height);
- break;
-
case FOURCC_I400:
src = sample + src_width * crop_y + crop_x;
r = I400ToARGB(src, src_width,
@@ -205,15 +174,6 @@ int ConvertToARGB(const uint8* sample, size_t sample_size,
crop_argb, argb_stride,
crop_width, inv_crop_height);
break;
-// case FOURCC_Q420:
-// src = sample + (src_width + aligned_src_width * 2) * crop_y + crop_x;
-// src_uv = sample + (src_width + aligned_src_width * 2) * crop_y +
-// src_width + crop_x * 2;
-// r = Q420ToARGB(src, src_width * 3,
-// src_uv, src_width * 3,
-// crop_argb, argb_stride,
-// crop_width, inv_crop_height);
-// break;
// Triplanar formats
case FOURCC_I420:
case FOURCC_YU12:
@@ -241,6 +201,25 @@ int ConvertToARGB(const uint8* sample, size_t sample_size,
crop_width, inv_crop_height);
break;
}
+
+ case FOURCC_J420: {
+ const uint8* src_y = sample + (src_width * crop_y + crop_x);
+ const uint8* src_u;
+ const uint8* src_v;
+ int halfwidth = (src_width + 1) / 2;
+ int halfheight = (abs_src_height + 1) / 2;
+ src_u = sample + src_width * abs_src_height +
+ (halfwidth * crop_y + crop_x) / 2;
+ src_v = sample + src_width * abs_src_height +
+ halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
+ r = J420ToARGB(src_y, src_width,
+ src_u, halfwidth,
+ src_v, halfwidth,
+ crop_argb, argb_stride,
+ crop_width, inv_crop_height);
+ break;
+ }
+
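The FOURCC_J420 case locates the three planes inside one contiguous buffer: a full-size Y plane followed by quarter-size U and V planes, with the crop offsets halved for chroma. With zero crop the arithmetic reduces to the plain plane sizes; a worked example (values are illustrative):

    #include <stddef.h>

    static size_t J420UOffset(int src_width, int abs_src_height) {
      return (size_t)src_width * abs_src_height;  // U starts where Y ends
    }
    static size_t J420VOffset(int src_width, int abs_src_height) {
      int halfwidth  = (src_width + 1) / 2;
      int halfheight = (abs_src_height + 1) / 2;
      return (size_t)src_width * abs_src_height +
             (size_t)halfwidth * halfheight;      // V starts where U ends
    }
    // For 640x480: U at 307200, V at 307200 + 320 * 240 == 384000.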
case FOURCC_I422:
case FOURCC_YV16: {
const uint8* src_y = sample + src_width * crop_y + crop_x;
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/convert_to_i420.cc b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/convert_to_i420.cc
index 7b194fff721..5e75369b55a 100644
--- a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/convert_to_i420.cc
+++ b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/convert_to_i420.cc
@@ -12,7 +12,6 @@
#include "libyuv/convert.h"
-#include "libyuv/format_conversion.h"
#include "libyuv/video_common.h"
#ifdef __cplusplus
@@ -173,40 +172,6 @@ int ConvertToI420(const uint8* sample,
v, v_stride,
crop_width, inv_crop_height);
break;
- // TODO(fbarchard): Support cropping Bayer by odd numbers
- // by adjusting fourcc.
- case FOURCC_BGGR:
- src = sample + (src_width * crop_y + crop_x);
- r = BayerBGGRToI420(src, src_width,
- y, y_stride,
- u, u_stride,
- v, v_stride,
- crop_width, inv_crop_height);
- break;
- case FOURCC_GBRG:
- src = sample + (src_width * crop_y + crop_x);
- r = BayerGBRGToI420(src, src_width,
- y, y_stride,
- u, u_stride,
- v, v_stride,
- crop_width, inv_crop_height);
- break;
- case FOURCC_GRBG:
- src = sample + (src_width * crop_y + crop_x);
- r = BayerGRBGToI420(src, src_width,
- y, y_stride,
- u, u_stride,
- v, v_stride,
- crop_width, inv_crop_height);
- break;
- case FOURCC_RGGB:
- src = sample + (src_width * crop_y + crop_x);
- r = BayerRGGBToI420(src, src_width,
- y, y_stride,
- u, u_stride,
- v, v_stride,
- crop_width, inv_crop_height);
- break;
case FOURCC_I400:
src = sample + src_width * crop_y + crop_x;
r = I400ToI420(src, src_width,
@@ -218,7 +183,8 @@ int ConvertToI420(const uint8* sample,
// Biplanar formats
case FOURCC_NV12:
src = sample + (src_width * crop_y + crop_x);
- src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x;
+ src_uv = sample + (src_width * src_height) +
+ ((crop_y / 2) * aligned_src_width) + ((crop_x / 2) * 2);
r = NV12ToI420Rotate(src, src_width,
src_uv, aligned_src_width,
y, y_stride,
@@ -228,7 +194,8 @@ int ConvertToI420(const uint8* sample,
break;
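
The corrected src_uv expression (used for both NV12 above and NV21 below) addresses the biplanar layout directly: the interleaved UV plane starts after src_width * src_height luma bytes, each UV row is aligned_src_width bytes and covers two source rows, and two source columns map to one two-byte UV pair. The old form folded crop_y into the luma stride and applied crop_x unhalved. A worked example (numbers are illustrative):

    #include <stddef.h>

    static size_t Nv12UvOffset(int src_width, int src_height,
                               int aligned_src_width, int crop_x, int crop_y) {
      return (size_t)src_width * src_height            // skip the Y plane
           + (size_t)(crop_y / 2) * aligned_src_width  // crop_y/2 UV rows down
           + (size_t)(crop_x / 2) * 2;                 // crop_x/2 UV pairs across
    }
    // Nv12UvOffset(640, 480, 640, 100, 50) == 307200 + 16000 + 100 == 323300.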
case FOURCC_NV21:
src = sample + (src_width * crop_y + crop_x);
- src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x;
+ src_uv = sample + (src_width * src_height) +
+ ((crop_y / 2) * aligned_src_width) + ((crop_x / 2) * 2);
// Call NV12 but with u and v parameters swapped.
r = NV12ToI420Rotate(src, src_width,
src_uv, aligned_src_width,
@@ -245,17 +212,6 @@ int ConvertToI420(const uint8* sample,
v, v_stride,
crop_width, inv_crop_height);
break;
- case FOURCC_Q420:
- src = sample + (src_width + aligned_src_width * 2) * crop_y + crop_x;
- src_uv = sample + (src_width + aligned_src_width * 2) * crop_y +
- src_width + crop_x * 2;
- r = Q420ToI420(src, src_width * 3,
- src_uv, src_width * 3,
- y, y_stride,
- u, u_stride,
- v, v_stride,
- crop_width, inv_crop_height);
- break;
// Triplanar formats
case FOURCC_I420:
case FOURCC_YU12:
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/cpu_id.cc b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/cpu_id.cc
index 8f8a403ee3e..1efa2652581 100644
--- a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/cpu_id.cc
+++ b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/cpu_id.cc
@@ -52,7 +52,8 @@ void CpuId(uint32 info_eax, uint32 info_ecx, uint32* cpu_info) {
#if defined(_MSC_VER) && !defined(__clang__)
#if (_MSC_FULL_VER >= 160040219)
__cpuidex((int*)(cpu_info), info_eax, info_ecx);
-#elif defined(_M_IX86)
+#endif
+#if defined(_M_IX86)
__asm {
mov eax, info_eax
mov ecx, info_ecx
@@ -98,13 +99,15 @@ int TestOsSaveYmm() {
uint32 xcr0 = 0u;
#if defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219)
xcr0 = (uint32)(_xgetbv(0)); // VS2010 SP1 required.
-#elif defined(_M_IX86) && defined(_MSC_VER)
+#endif
+#if defined(_M_IX86) && defined(_MSC_VER)
__asm {
xor ecx, ecx // xcr 0
_asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0 // For VS2010 and earlier.
mov xcr0, eax
}
-#elif defined(__i386__) || defined(__x86_64__)
+#endif
+#if defined(__i386__) || defined(__x86_64__)
asm(".byte 0x0f, 0x01, 0xd0" : "=a" (xcr0) : "c" (0) : "%edx");
#endif // defined(_MSC_VER)
return((xcr0 & 6) == 6); // Is ymm saved?
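
TestOsSaveYmm reads XCR0 with xgetbv (opcode 0f 01 d0, ecx = 0): bit 1 set means the OS saves XMM state and bit 2 means it saves YMM state, and AVX is only safe when both are set, hence the & 6 mask. The same check in GCC/Clang constraint syntax (a sketch under that toolchain assumption, not libyuv code):

    #include <stdint.h>

    static int OsSavesYmm(void) {
      uint32_t eax, edx;
      // xgetbv: reads the extended control register selected by ecx into edx:eax.
      __asm__(".byte 0x0f, 0x01, 0xd0" : "=a"(eax), "=d"(edx) : "c"(0));
      return (eax & 6) == 6;  // bit 1: XMM state saved; bit 2: YMM state saved
    }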
@@ -135,6 +138,12 @@ int ArmCpuCaps(const char* cpuinfo_name) {
fclose(f);
return kCpuHasNEON;
}
+ // aarch64 uses asimd for Neon.
+ p = strstr(cpuinfo_line, " asimd");
+ if (p && (p[6] == ' ' || p[6] == '\n')) {
+ fclose(f);
+ return kCpuHasNEON;
+ }
}
}
fclose(f);
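
On aarch64 Linux the kernel reports NEON as "asimd" (Advanced SIMD) in /proc/cpuinfo rather than "neon", which is why ArmCpuCaps now accepts either token; the trailing-character check keeps a longer token that merely starts with "asimd" from matching. A self-contained check against a sample Features line (line contents vary by kernel and CPU):

    #include <stdio.h>
    #include <string.h>

    int main(void) {
      const char line[] = "Features : fp asimd evtstrm aes pmull sha1 sha2 crc32\n";
      const char* p = strstr(line, " asimd");
      printf("%d\n", p && (p[6] == ' ' || p[6] == '\n'));  // prints 1
      return 0;
    }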
@@ -240,7 +249,8 @@ int InitCpuFlags(void) {
if (TestEnv("LIBYUV_DISABLE_FMA3")) {
cpu_info_ &= ~kCpuHasFMA3;
}
-#elif defined(__mips__) && defined(__linux__)
+#endif
+#if defined(__mips__) && defined(__linux__)
// Linux mips parse text file for dsp detect.
cpu_info_ = MipsCpuCaps("dsp"); // set kCpuHasMIPS_DSP.
#if defined(__mips_dspr2)
@@ -257,7 +267,8 @@ int InitCpuFlags(void) {
if (getenv("LIBYUV_DISABLE_MIPS_DSPR2")) {
cpu_info_ &= ~kCpuHasMIPS_DSPR2;
}
-#elif defined(__arm__) || defined(__aarch64__)
+#endif
+#if defined(__arm__) || defined(__aarch64__)
 // gcc -mfpu=neon defines __ARM_NEON__.
 // __ARM_NEON__ generates code that requires Neon. NaCl also requires Neon.
 // On Linux, /proc/cpuinfo can be checked; without it, Neon is assumed.
@@ -266,7 +277,8 @@ int InitCpuFlags(void) {
 // On aarch64 (arm64), the /proc/cpuinfo feature list is incomplete, e.g. it
 // has no neon flag.
 // So for aarch64, Neon support is hard-coded here.
-#elif defined(__aarch64__)
+#endif
+#if defined(__aarch64__)
cpu_info_ = kCpuHasNEON;
#else
// Linux arm parse text file for neon detect.
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/format_conversion.cc b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/format_conversion.cc
deleted file mode 100644
index 3c173715320..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/format_conversion.cc
+++ /dev/null
@@ -1,554 +0,0 @@
-/*
- * Copyright 2011 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/format_conversion.h"
-
-#include "libyuv/basic_types.h"
-#include "libyuv/cpu_id.h"
-#include "libyuv/video_common.h"
-#include "libyuv/row.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// generate a selector mask useful for pshufb
-static uint32 GenerateSelector(int select0, int select1) {
- return (uint32)(select0) |
- (uint32)((select1 + 4) << 8) |
- (uint32)((select0 + 8) << 16) |
- (uint32)((select1 + 12) << 24);
-}
-
-static int MakeSelectors(const int blue_index,
- const int green_index,
- const int red_index,
- uint32 dst_fourcc_bayer,
- uint32* index_map) {
- // Now build a lookup table containing the indices for the four pixels in each
- // 2x2 Bayer grid.
- switch (dst_fourcc_bayer) {
- case FOURCC_BGGR:
- index_map[0] = GenerateSelector(blue_index, green_index);
- index_map[1] = GenerateSelector(green_index, red_index);
- break;
- case FOURCC_GBRG:
- index_map[0] = GenerateSelector(green_index, blue_index);
- index_map[1] = GenerateSelector(red_index, green_index);
- break;
- case FOURCC_RGGB:
- index_map[0] = GenerateSelector(red_index, green_index);
- index_map[1] = GenerateSelector(green_index, blue_index);
- break;
- case FOURCC_GRBG:
- index_map[0] = GenerateSelector(green_index, red_index);
- index_map[1] = GenerateSelector(blue_index, green_index);
- break;
- default:
- return -1; // Bad FourCC
- }
- return 0;
-}
-
-// Converts 32 bit ARGB to Bayer RGB formats.
-LIBYUV_API
-int ARGBToBayer(const uint8* src_argb, int src_stride_argb,
- uint8* dst_bayer, int dst_stride_bayer,
- int width, int height,
- uint32 dst_fourcc_bayer) {
- int y;
- const int blue_index = 0; // Offsets for ARGB format
- const int green_index = 1;
- const int red_index = 2;
- uint32 index_map[2];
- void (*ARGBToBayerRow)(const uint8* src_argb, uint8* dst_bayer,
- uint32 selector, int pix) = ARGBToBayerRow_C;
- if (height < 0) {
- height = -height;
- src_argb = src_argb + (height - 1) * src_stride_argb;
- src_stride_argb = -src_stride_argb;
- }
-#if defined(HAS_ARGBTOBAYERROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 8 &&
- IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
- ARGBToBayerRow = ARGBToBayerRow_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- ARGBToBayerRow = ARGBToBayerRow_SSSE3;
- }
- }
-#elif defined(HAS_ARGBTOBAYERROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
- ARGBToBayerRow = ARGBToBayerRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- ARGBToBayerRow = ARGBToBayerRow_NEON;
- }
- }
-#endif
- if (MakeSelectors(blue_index, green_index, red_index,
- dst_fourcc_bayer, index_map)) {
- return -1; // Bad FourCC
- }
-
- for (y = 0; y < height; ++y) {
- ARGBToBayerRow(src_argb, dst_bayer, index_map[y & 1], width);
- src_argb += src_stride_argb;
- dst_bayer += dst_stride_bayer;
- }
- return 0;
-}
-
-#define AVG(a, b) (((a) + (b)) >> 1)
-
-static void BayerRowBG(const uint8* src_bayer0, int src_stride_bayer,
- uint8* dst_argb, int pix) {
- const uint8* src_bayer1 = src_bayer0 + src_stride_bayer;
- uint8 g = src_bayer0[1];
- uint8 r = src_bayer1[1];
- int x;
- for (x = 0; x < pix - 2; x += 2) {
- dst_argb[0] = src_bayer0[0];
- dst_argb[1] = AVG(g, src_bayer0[1]);
- dst_argb[2] = AVG(r, src_bayer1[1]);
- dst_argb[3] = 255U;
- dst_argb[4] = AVG(src_bayer0[0], src_bayer0[2]);
- dst_argb[5] = src_bayer0[1];
- dst_argb[6] = src_bayer1[1];
- dst_argb[7] = 255U;
- g = src_bayer0[1];
- r = src_bayer1[1];
- src_bayer0 += 2;
- src_bayer1 += 2;
- dst_argb += 8;
- }
- dst_argb[0] = src_bayer0[0];
- dst_argb[1] = AVG(g, src_bayer0[1]);
- dst_argb[2] = AVG(r, src_bayer1[1]);
- dst_argb[3] = 255U;
- if (!(pix & 1)) {
- dst_argb[4] = src_bayer0[0];
- dst_argb[5] = src_bayer0[1];
- dst_argb[6] = src_bayer1[1];
- dst_argb[7] = 255U;
- }
-}
-
-static void BayerRowRG(const uint8* src_bayer0, int src_stride_bayer,
- uint8* dst_argb, int pix) {
- const uint8* src_bayer1 = src_bayer0 + src_stride_bayer;
- uint8 g = src_bayer0[1];
- uint8 b = src_bayer1[1];
- int x;
- for (x = 0; x < pix - 2; x += 2) {
- dst_argb[0] = AVG(b, src_bayer1[1]);
- dst_argb[1] = AVG(g, src_bayer0[1]);
- dst_argb[2] = src_bayer0[0];
- dst_argb[3] = 255U;
- dst_argb[4] = src_bayer1[1];
- dst_argb[5] = src_bayer0[1];
- dst_argb[6] = AVG(src_bayer0[0], src_bayer0[2]);
- dst_argb[7] = 255U;
- g = src_bayer0[1];
- b = src_bayer1[1];
- src_bayer0 += 2;
- src_bayer1 += 2;
- dst_argb += 8;
- }
- dst_argb[0] = AVG(b, src_bayer1[1]);
- dst_argb[1] = AVG(g, src_bayer0[1]);
- dst_argb[2] = src_bayer0[0];
- dst_argb[3] = 255U;
- if (!(pix & 1)) {
- dst_argb[4] = src_bayer1[1];
- dst_argb[5] = src_bayer0[1];
- dst_argb[6] = src_bayer0[0];
- dst_argb[7] = 255U;
- }
-}
-
-static void BayerRowGB(const uint8* src_bayer0, int src_stride_bayer,
- uint8* dst_argb, int pix) {
- const uint8* src_bayer1 = src_bayer0 + src_stride_bayer;
- uint8 b = src_bayer0[1];
- int x;
- for (x = 0; x < pix - 2; x += 2) {
- dst_argb[0] = AVG(b, src_bayer0[1]);
- dst_argb[1] = src_bayer0[0];
- dst_argb[2] = src_bayer1[0];
- dst_argb[3] = 255U;
- dst_argb[4] = src_bayer0[1];
- dst_argb[5] = AVG(src_bayer0[0], src_bayer0[2]);
- dst_argb[6] = AVG(src_bayer1[0], src_bayer1[2]);
- dst_argb[7] = 255U;
- b = src_bayer0[1];
- src_bayer0 += 2;
- src_bayer1 += 2;
- dst_argb += 8;
- }
- dst_argb[0] = AVG(b, src_bayer0[1]);
- dst_argb[1] = src_bayer0[0];
- dst_argb[2] = src_bayer1[0];
- dst_argb[3] = 255U;
- if (!(pix & 1)) {
- dst_argb[4] = src_bayer0[1];
- dst_argb[5] = src_bayer0[0];
- dst_argb[6] = src_bayer1[0];
- dst_argb[7] = 255U;
- }
-}
-
-static void BayerRowGR(const uint8* src_bayer0, int src_stride_bayer,
- uint8* dst_argb, int pix) {
- const uint8* src_bayer1 = src_bayer0 + src_stride_bayer;
- uint8 r = src_bayer0[1];
- int x;
- for (x = 0; x < pix - 2; x += 2) {
- dst_argb[0] = src_bayer1[0];
- dst_argb[1] = src_bayer0[0];
- dst_argb[2] = AVG(r, src_bayer0[1]);
- dst_argb[3] = 255U;
- dst_argb[4] = AVG(src_bayer1[0], src_bayer1[2]);
- dst_argb[5] = AVG(src_bayer0[0], src_bayer0[2]);
- dst_argb[6] = src_bayer0[1];
- dst_argb[7] = 255U;
- r = src_bayer0[1];
- src_bayer0 += 2;
- src_bayer1 += 2;
- dst_argb += 8;
- }
- dst_argb[0] = src_bayer1[0];
- dst_argb[1] = src_bayer0[0];
- dst_argb[2] = AVG(r, src_bayer0[1]);
- dst_argb[3] = 255U;
- if (!(pix & 1)) {
- dst_argb[4] = src_bayer1[0];
- dst_argb[5] = src_bayer0[0];
- dst_argb[6] = src_bayer0[1];
- dst_argb[7] = 255U;
- }
-}
-
-// Converts any Bayer RGB format to ARGB.
-LIBYUV_API
-int BayerToARGB(const uint8* src_bayer, int src_stride_bayer,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height,
- uint32 src_fourcc_bayer) {
- int y;
- void (*BayerRow0)(const uint8* src_bayer, int src_stride_bayer,
- uint8* dst_argb, int pix);
- void (*BayerRow1)(const uint8* src_bayer, int src_stride_bayer,
- uint8* dst_argb, int pix);
- if (height < 0) {
- height = -height;
- dst_argb = dst_argb + (height - 1) * dst_stride_argb;
- dst_stride_argb = -dst_stride_argb;
- }
- switch (src_fourcc_bayer) {
- case FOURCC_BGGR:
- BayerRow0 = BayerRowBG;
- BayerRow1 = BayerRowGR;
- break;
- case FOURCC_GBRG:
- BayerRow0 = BayerRowGB;
- BayerRow1 = BayerRowRG;
- break;
- case FOURCC_GRBG:
- BayerRow0 = BayerRowGR;
- BayerRow1 = BayerRowBG;
- break;
- case FOURCC_RGGB:
- BayerRow0 = BayerRowRG;
- BayerRow1 = BayerRowGB;
- break;
- default:
- return -1; // Bad FourCC
- }
-
- for (y = 0; y < height - 1; y += 2) {
- BayerRow0(src_bayer, src_stride_bayer, dst_argb, width);
- BayerRow1(src_bayer + src_stride_bayer, -src_stride_bayer,
- dst_argb + dst_stride_argb, width);
- src_bayer += src_stride_bayer * 2;
- dst_argb += dst_stride_argb * 2;
- }
- if (height & 1) {
- BayerRow0(src_bayer, src_stride_bayer, dst_argb, width);
- }
- return 0;
-}
-
-// Converts any Bayer RGB format to ARGB.
-LIBYUV_API
-int BayerToI420(const uint8* src_bayer, int src_stride_bayer,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height,
- uint32 src_fourcc_bayer) {
- void (*BayerRow0)(const uint8* src_bayer, int src_stride_bayer,
- uint8* dst_argb, int pix);
- void (*BayerRow1)(const uint8* src_bayer, int src_stride_bayer,
- uint8* dst_argb, int pix);
-
- void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
- void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
- ARGBToYRow_C;
- // Negative height means invert the image.
- if (height < 0) {
- int halfheight;
- height = -height;
- halfheight = (height + 1) >> 1;
- dst_y = dst_y + (height - 1) * dst_stride_y;
- dst_u = dst_u + (halfheight - 1) * dst_stride_u;
- dst_v = dst_v + (halfheight - 1) * dst_stride_v;
- dst_stride_y = -dst_stride_y;
- dst_stride_u = -dst_stride_u;
- dst_stride_v = -dst_stride_v;
- }
-#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
- ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
- ARGBToYRow = ARGBToYRow_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
- ARGBToUVRow = ARGBToUVRow_SSSE3;
- if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- ARGBToYRow = ARGBToYRow_SSSE3;
- }
- }
- }
-#elif defined(HAS_ARGBTOYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
- ARGBToYRow = ARGBToYRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- ARGBToYRow = ARGBToYRow_NEON;
- }
- }
-#endif
-#if defined(HAS_ARGBTOUVROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
- ARGBToUVRow = ARGBToUVRow_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
- ARGBToUVRow = ARGBToUVRow_NEON;
- }
- }
-#endif
-
- switch (src_fourcc_bayer) {
- case FOURCC_BGGR:
- BayerRow0 = BayerRowBG;
- BayerRow1 = BayerRowGR;
- break;
- case FOURCC_GBRG:
- BayerRow0 = BayerRowGB;
- BayerRow1 = BayerRowRG;
- break;
- case FOURCC_GRBG:
- BayerRow0 = BayerRowGR;
- BayerRow1 = BayerRowBG;
- break;
- case FOURCC_RGGB:
- BayerRow0 = BayerRowRG;
- BayerRow1 = BayerRowGB;
- break;
- default:
- return -1; // Bad FourCC
- }
-
- {
- // Allocate 2 rows of ARGB.
- const int kRowSize = (width * 4 + 15) & ~15;
- align_buffer_64(row, kRowSize * 2);
- int y;
- for (y = 0; y < height - 1; y += 2) {
- BayerRow0(src_bayer, src_stride_bayer, row, width);
- BayerRow1(src_bayer + src_stride_bayer, -src_stride_bayer,
- row + kRowSize, width);
- ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
- ARGBToYRow(row, dst_y, width);
- ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
- src_bayer += src_stride_bayer * 2;
- dst_y += dst_stride_y * 2;
- dst_u += dst_stride_u;
- dst_v += dst_stride_v;
- }
- if (height & 1) {
- BayerRow0(src_bayer, src_stride_bayer, row, width);
- ARGBToUVRow(row, 0, dst_u, dst_v, width);
- ARGBToYRow(row, dst_y, width);
- }
- free_aligned_buffer_64(row);
- }
- return 0;
-}
-
-// Convert I420 to Bayer.
-LIBYUV_API
-int I420ToBayer(const uint8* src_y, int src_stride_y,
- const uint8* src_u, int src_stride_u,
- const uint8* src_v, int src_stride_v,
- uint8* dst_bayer, int dst_stride_bayer,
- int width, int height,
- uint32 dst_fourcc_bayer) {
- void (*I422ToARGBRow)(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* rgb_buf,
- int width) = I422ToARGBRow_C;
- void (*ARGBToBayerRow)(const uint8* src_argb, uint8* dst_bayer,
- uint32 selector, int pix) = ARGBToBayerRow_C;
- const int blue_index = 0; // Offsets for ARGB format
- const int green_index = 1;
- const int red_index = 2;
- uint32 index_map[2];
- // Negative height means invert the image.
- if (height < 0) {
- int halfheight;
- height = -height;
- halfheight = (height + 1) >> 1;
- src_y = src_y + (height - 1) * src_stride_y;
- src_u = src_u + (halfheight - 1) * src_stride_u;
- src_v = src_v + (halfheight - 1) * src_stride_v;
- src_stride_y = -src_stride_y;
- src_stride_u = -src_stride_u;
- src_stride_v = -src_stride_v;
- }
-#if defined(HAS_I422TOARGBROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
- I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- I422ToARGBRow = I422ToARGBRow_SSSE3;
- }
- }
-#endif
-#if defined(HAS_I422TOARGBROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && width >= 16) {
- I422ToARGBRow = I422ToARGBRow_Any_AVX2;
- if (IS_ALIGNED(width, 16)) {
- I422ToARGBRow = I422ToARGBRow_AVX2;
- }
- }
-#endif
-#if defined(HAS_I422TOARGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
- I422ToARGBRow = I422ToARGBRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- I422ToARGBRow = I422ToARGBRow_NEON;
- }
- }
-#endif
-#if defined(HAS_I422TOARGBROW_MIPS_DSPR2)
- if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
- IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
- IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
- IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2)) {
- I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2;
- }
-#endif
-
-#if defined(HAS_ARGBTOBAYERROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
- ARGBToBayerRow = ARGBToBayerRow_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- ARGBToBayerRow = ARGBToBayerRow_SSSE3;
- }
- }
-#elif defined(HAS_ARGBTOBAYERROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
- ARGBToBayerRow = ARGBToBayerRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- ARGBToBayerRow = ARGBToBayerRow_NEON;
- }
- }
-#endif
-
- if (MakeSelectors(blue_index, green_index, red_index,
- dst_fourcc_bayer, index_map)) {
- return -1; // Bad FourCC
- }
- {
- // Allocate a row of ARGB.
- align_buffer_64(row, width * 4);
- int y;
- for (y = 0; y < height; ++y) {
- I422ToARGBRow(src_y, src_u, src_v, row, width);
- ARGBToBayerRow(row, dst_bayer, index_map[y & 1], width);
- dst_bayer += dst_stride_bayer;
- src_y += src_stride_y;
- if (y & 1) {
- src_u += src_stride_u;
- src_v += src_stride_v;
- }
- }
- free_aligned_buffer_64(row);
- }
- return 0;
-}
-
-#define MAKEBAYERFOURCC(BAYER) \
-LIBYUV_API \
-int Bayer##BAYER##ToI420(const uint8* src_bayer, int src_stride_bayer, \
- uint8* dst_y, int dst_stride_y, \
- uint8* dst_u, int dst_stride_u, \
- uint8* dst_v, int dst_stride_v, \
- int width, int height) { \
- return BayerToI420(src_bayer, src_stride_bayer, \
- dst_y, dst_stride_y, \
- dst_u, dst_stride_u, \
- dst_v, dst_stride_v, \
- width, height, \
- FOURCC_##BAYER); \
-} \
- \
-LIBYUV_API \
-int I420ToBayer##BAYER(const uint8* src_y, int src_stride_y, \
- const uint8* src_u, int src_stride_u, \
- const uint8* src_v, int src_stride_v, \
- uint8* dst_bayer, int dst_stride_bayer, \
- int width, int height) { \
- return I420ToBayer(src_y, src_stride_y, \
- src_u, src_stride_u, \
- src_v, src_stride_v, \
- dst_bayer, dst_stride_bayer, \
- width, height, \
- FOURCC_##BAYER); \
-} \
- \
-LIBYUV_API \
-int ARGBToBayer##BAYER(const uint8* src_argb, int src_stride_argb, \
- uint8* dst_bayer, int dst_stride_bayer, \
- int width, int height) { \
- return ARGBToBayer(src_argb, src_stride_argb, \
- dst_bayer, dst_stride_bayer, \
- width, height, \
- FOURCC_##BAYER); \
-} \
- \
-LIBYUV_API \
-int Bayer##BAYER##ToARGB(const uint8* src_bayer, int src_stride_bayer, \
- uint8* dst_argb, int dst_stride_argb, \
- int width, int height) { \
- return BayerToARGB(src_bayer, src_stride_bayer, \
- dst_argb, dst_stride_argb, \
- width, height, \
- FOURCC_##BAYER); \
-}
-
-MAKEBAYERFOURCC(BGGR)
-MAKEBAYERFOURCC(GBRG)
-MAKEBAYERFOURCC(GRBG)
-MAKEBAYERFOURCC(RGGB)
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/mjpeg_validate.cc b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/mjpeg_validate.cc
index 23d22d099bb..40ce2f787a1 100644
--- a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/mjpeg_validate.cc
+++ b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/mjpeg_validate.cc
@@ -10,15 +10,66 @@
#include "libyuv/mjpeg_decoder.h"
+#include <string.h> // For memchr.
+
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
+// Enable this to try the scasb implementation.
+// #define ENABLE_SCASB 1
+
+#ifdef ENABLE_SCASB
+
+// Works on any count (multiple of 1).
+__declspec(naked) __declspec(align(16))
+const uint8* ScanRow_ERMS(const uint8* src, uint32 val, int count) {
+ __asm {
+ mov edx, edi
+ mov edi, [esp + 4] // src
+ mov eax, [esp + 8] // val
+ mov ecx, [esp + 12] // count
+ repne scasb
+ jne sr99
+ mov eax, edi
+ sub eax, 1
+ mov edi, edx
+ ret
+
+ sr99:
+ mov eax, 0
+ mov edi, edx
+ ret
+ }
+}
+#endif
+
+// Helper function to scan for EOI marker.
+static LIBYUV_BOOL ScanEOI(const uint8* sample, size_t sample_size) {
+ const uint8* end = sample + sample_size - 1;
+ const uint8* it = sample;
+ for (;;) {
+#ifdef ENABLE_SCASB
+ it = ScanRow_ERMS(it, 0xff, end - it);
+#else
+ it = static_cast<const uint8*>(memchr(it, 0xff, end - it));
+#endif
+ if (it == NULL) {
+ break;
+ }
+ if (it[1] == 0xd9) {
+ return LIBYUV_TRUE; // Success: Valid jpeg.
+ }
+ ++it; // Skip over current 0xff.
+ }
+ // ERROR: Invalid jpeg end code not found. Size sample_size
+ return LIBYUV_FALSE;
+}
+
// Helper function to validate the jpeg appears intact.
-// TODO(fbarchard): Optimize case where SOI is found but EOI is not.
LIBYUV_BOOL ValidateJpeg(const uint8* sample, size_t sample_size) {
- size_t i;
+ const size_t kBackSearchSize = 1024;
if (sample_size < 64) {
// ERROR: Invalid jpeg size: sample_size
return LIBYUV_FALSE;
@@ -27,17 +78,20 @@ LIBYUV_BOOL ValidateJpeg(const uint8* sample, size_t sample_size) {
// ERROR: Invalid jpeg initial start code
return LIBYUV_FALSE;
}
- for (i = sample_size - 2; i > 1;) {
- if (sample[i] != 0xd9) {
- if (sample[i] == 0xff && sample[i + 1] == 0xd9) { // End Of Image
- return LIBYUV_TRUE; // Success: Valid jpeg.
- }
- --i;
+ // Step over SOI marker.
+ sample += 2;
+ sample_size -= 2;
+
+  // Look for the End Of Image (EOI) marker in the last kilobyte of the buffer.
+ if (sample_size > kBackSearchSize) {
+ if (ScanEOI(sample + sample_size - kBackSearchSize, kBackSearchSize)) {
+ return LIBYUV_TRUE; // Success: Valid jpeg.
}
- --i;
+ // Reduce search size for forward search.
+ sample_size = sample_size - kBackSearchSize + 1;
}
- // ERROR: Invalid jpeg end code not found. Size sample_size
- return LIBYUV_FALSE;
+  return ScanEOI(sample, sample_size);
}
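
The rewritten ValidateJpeg checks the start code, then scans only the final kilobyte for the ff d9 EOI marker, falling back to a forward scan of the remainder only when that fails, so intact frames validate after touching about 1 KB instead of walking the buffer backwards byte by byte. A minimal sketch of the markers involved (illustrative; the real validator also tolerates trailing padding after EOI):

    #include <stddef.h>
    #include <stdint.h>

    // SOI (ff d8) must open the buffer; EOI (ff d9) must appear near its end.
    static int HasJpegMarkers(const uint8_t* buf, size_t n) {
      if (n < 64) return 0;                            // same minimum size
      if (buf[0] != 0xff || buf[1] != 0xd8) return 0;  // no SOI
      for (size_t i = n - 1; i >= 1; --i) {            // backward EOI scan
        if (buf[i - 1] == 0xff && buf[i] == 0xd9) return 1;
      }
      return 0;
    }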
#ifdef __cplusplus
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/planar_functions.cc b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/planar_functions.cc
index 3857008cae3..75ef775dde8 100644
--- a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/planar_functions.cc
+++ b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/planar_functions.cc
@@ -41,16 +41,14 @@ void CopyPlane(const uint8* src_y, int src_stride_y,
if (src_y == dst_y && src_stride_y == dst_stride_y) {
return;
}
-#if defined(HAS_COPYROW_X86)
- if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) {
- CopyRow = CopyRow_X86;
+#if defined(HAS_COPYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
}
#endif
-#if defined(HAS_COPYROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) &&
- IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
- IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- CopyRow = CopyRow_SSE2;
+#if defined(HAS_COPYROW_AVX)
+ if (TestCpuFlag(kCpuHasAVX)) {
+ CopyRow = IS_ALIGNED(width, 64) ? CopyRow_AVX : CopyRow_Any_AVX;
}
#endif
#if defined(HAS_COPYROW_ERMS)
@@ -59,8 +57,8 @@ void CopyPlane(const uint8* src_y, int src_stride_y,
}
#endif
#if defined(HAS_COPYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) {
- CopyRow = CopyRow_NEON;
+ if (TestCpuFlag(kCpuHasNEON)) {
+ CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
}
#endif
#if defined(HAS_COPYROW_MIPS)
@@ -90,15 +88,8 @@ void CopyPlane_16(const uint16* src_y, int src_stride_y,
height = 1;
src_stride_y = dst_stride_y = 0;
}
-#if defined(HAS_COPYROW_16_X86)
- if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) {
- CopyRow = CopyRow_16_X86;
- }
-#endif
#if defined(HAS_COPYROW_16_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) &&
- IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
- IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32)) {
CopyRow = CopyRow_16_SSE2;
}
#endif
@@ -239,25 +230,43 @@ void MirrorPlane(const uint8* src_y, int src_stride_y,
src_stride_y = -src_stride_y;
}
#if defined(HAS_MIRRORROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) {
- MirrorRow = MirrorRow_NEON;
+ if (TestCpuFlag(kCpuHasNEON)) {
+ MirrorRow = MirrorRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ MirrorRow = MirrorRow_NEON;
+ }
}
#endif
#if defined(HAS_MIRRORROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16)) {
- MirrorRow = MirrorRow_SSE2;
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ MirrorRow = MirrorRow_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ MirrorRow = MirrorRow_SSE2;
+ }
}
#endif
#if defined(HAS_MIRRORROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16) &&
- IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
- IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- MirrorRow = MirrorRow_SSSE3;
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ MirrorRow = MirrorRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ MirrorRow = MirrorRow_SSSE3;
+ }
}
#endif
#if defined(HAS_MIRRORROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 32)) {
- MirrorRow = MirrorRow_AVX2;
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ MirrorRow = MirrorRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ MirrorRow = MirrorRow_AVX2;
+ }
+ }
+#endif
+// TODO(fbarchard): Make the MIPS mirror path handle unaligned memory.
+#if defined(HAS_MIRRORROW_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
+ IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
+ IS_ALIGNED(dst_y, 4) && IS_ALIGNED(dst_stride_y, 4)) {
+ MirrorRow = MirrorRow_MIPS_DSPR2;
}
#endif
@@ -298,23 +307,17 @@ int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2,
src_stride_yuy2 = dst_stride_y = dst_stride_u = dst_stride_v = 0;
}
#if defined(HAS_YUY2TOYROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
+ if (TestCpuFlag(kCpuHasSSE2)) {
YUY2ToUV422Row = YUY2ToUV422Row_Any_SSE2;
YUY2ToYRow = YUY2ToYRow_Any_SSE2;
if (IS_ALIGNED(width, 16)) {
- YUY2ToUV422Row = YUY2ToUV422Row_Unaligned_SSE2;
- YUY2ToYRow = YUY2ToYRow_Unaligned_SSE2;
- if (IS_ALIGNED(src_yuy2, 16) && IS_ALIGNED(src_stride_yuy2, 16)) {
- YUY2ToUV422Row = YUY2ToUV422Row_SSE2;
- if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- YUY2ToYRow = YUY2ToYRow_SSE2;
- }
- }
+ YUY2ToUV422Row = YUY2ToUV422Row_SSE2;
+ YUY2ToYRow = YUY2ToYRow_SSE2;
}
}
#endif
#if defined(HAS_YUY2TOYROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
+ if (TestCpuFlag(kCpuHasAVX2)) {
YUY2ToUV422Row = YUY2ToUV422Row_Any_AVX2;
YUY2ToYRow = YUY2ToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
@@ -324,7 +327,7 @@ int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2,
}
#endif
#if defined(HAS_YUY2TOYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ if (TestCpuFlag(kCpuHasNEON)) {
YUY2ToYRow = YUY2ToYRow_Any_NEON;
if (width >= 16) {
YUY2ToUV422Row = YUY2ToUV422Row_Any_NEON;
@@ -376,23 +379,17 @@ int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy,
src_stride_uyvy = dst_stride_y = dst_stride_u = dst_stride_v = 0;
}
#if defined(HAS_UYVYTOYROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
+ if (TestCpuFlag(kCpuHasSSE2)) {
UYVYToUV422Row = UYVYToUV422Row_Any_SSE2;
UYVYToYRow = UYVYToYRow_Any_SSE2;
if (IS_ALIGNED(width, 16)) {
- UYVYToUV422Row = UYVYToUV422Row_Unaligned_SSE2;
- UYVYToYRow = UYVYToYRow_Unaligned_SSE2;
- if (IS_ALIGNED(src_uyvy, 16) && IS_ALIGNED(src_stride_uyvy, 16)) {
- UYVYToUV422Row = UYVYToUV422Row_SSE2;
- if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- UYVYToYRow = UYVYToYRow_SSE2;
- }
- }
+ UYVYToUV422Row = UYVYToUV422Row_SSE2;
+ UYVYToYRow = UYVYToYRow_SSE2;
}
}
#endif
#if defined(HAS_UYVYTOYROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
+ if (TestCpuFlag(kCpuHasAVX2)) {
UYVYToUV422Row = UYVYToUV422Row_Any_AVX2;
UYVYToYRow = UYVYToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
@@ -402,7 +399,7 @@ int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy,
}
#endif
#if defined(HAS_UYVYTOYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ if (TestCpuFlag(kCpuHasNEON)) {
UYVYToYRow = UYVYToYRow_Any_NEON;
if (width >= 16) {
UYVYToUV422Row = UYVYToUV422Row_Any_NEON;
@@ -497,22 +494,28 @@ int ARGBMirror(const uint8* src_argb, int src_stride_argb,
src_argb = src_argb + (height - 1) * src_stride_argb;
src_stride_argb = -src_stride_argb;
}
-
-#if defined(HAS_ARGBMIRRORROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4) &&
- IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- ARGBMirrorRow = ARGBMirrorRow_SSSE3;
+#if defined(HAS_ARGBMIRRORROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBMirrorRow = ARGBMirrorRow_Any_NEON;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBMirrorRow = ARGBMirrorRow_NEON;
+ }
}
#endif
-#if defined(HAS_ARGBMIRRORROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 8)) {
- ARGBMirrorRow = ARGBMirrorRow_AVX2;
+#if defined(HAS_ARGBMIRRORROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ARGBMirrorRow = ARGBMirrorRow_Any_SSE2;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBMirrorRow = ARGBMirrorRow_SSE2;
+ }
}
#endif
-#if defined(HAS_ARGBMIRRORROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 4)) {
- ARGBMirrorRow = ARGBMirrorRow_NEON;
+#if defined(HAS_ARGBMIRRORROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBMirrorRow = ARGBMirrorRow_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBMirrorRow = ARGBMirrorRow_AVX2;
+ }
}
#endif
@@ -614,7 +617,7 @@ int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0,
src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0;
}
#if defined(HAS_ARGBMULTIPLYROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && width >= 4) {
+ if (TestCpuFlag(kCpuHasSSE2)) {
ARGBMultiplyRow = ARGBMultiplyRow_Any_SSE2;
if (IS_ALIGNED(width, 4)) {
ARGBMultiplyRow = ARGBMultiplyRow_SSE2;
@@ -622,7 +625,7 @@ int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0,
}
#endif
#if defined(HAS_ARGBMULTIPLYROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && width >= 8) {
+ if (TestCpuFlag(kCpuHasAVX2)) {
ARGBMultiplyRow = ARGBMultiplyRow_Any_AVX2;
if (IS_ALIGNED(width, 8)) {
ARGBMultiplyRow = ARGBMultiplyRow_AVX2;
@@ -630,7 +633,7 @@ int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0,
}
#endif
#if defined(HAS_ARGBMULTIPLYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ if (TestCpuFlag(kCpuHasNEON)) {
ARGBMultiplyRow = ARGBMultiplyRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
ARGBMultiplyRow = ARGBMultiplyRow_NEON;
@@ -680,7 +683,7 @@ int ARGBAdd(const uint8* src_argb0, int src_stride_argb0,
}
#endif
#if defined(HAS_ARGBADDROW_SSE2) && !defined(_MSC_VER)
- if (TestCpuFlag(kCpuHasSSE2) && width >= 4) {
+ if (TestCpuFlag(kCpuHasSSE2)) {
ARGBAddRow = ARGBAddRow_Any_SSE2;
if (IS_ALIGNED(width, 4)) {
ARGBAddRow = ARGBAddRow_SSE2;
@@ -688,7 +691,7 @@ int ARGBAdd(const uint8* src_argb0, int src_stride_argb0,
}
#endif
#if defined(HAS_ARGBADDROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && width >= 8) {
+ if (TestCpuFlag(kCpuHasAVX2)) {
ARGBAddRow = ARGBAddRow_Any_AVX2;
if (IS_ALIGNED(width, 8)) {
ARGBAddRow = ARGBAddRow_AVX2;
@@ -696,7 +699,7 @@ int ARGBAdd(const uint8* src_argb0, int src_stride_argb0,
}
#endif
#if defined(HAS_ARGBADDROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ if (TestCpuFlag(kCpuHasNEON)) {
ARGBAddRow = ARGBAddRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
ARGBAddRow = ARGBAddRow_NEON;
@@ -741,7 +744,7 @@ int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0,
src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0;
}
#if defined(HAS_ARGBSUBTRACTROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && width >= 4) {
+ if (TestCpuFlag(kCpuHasSSE2)) {
ARGBSubtractRow = ARGBSubtractRow_Any_SSE2;
if (IS_ALIGNED(width, 4)) {
ARGBSubtractRow = ARGBSubtractRow_SSE2;
@@ -749,7 +752,7 @@ int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0,
}
#endif
#if defined(HAS_ARGBSUBTRACTROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && width >= 8) {
+ if (TestCpuFlag(kCpuHasAVX2)) {
ARGBSubtractRow = ARGBSubtractRow_Any_AVX2;
if (IS_ALIGNED(width, 8)) {
ARGBSubtractRow = ARGBSubtractRow_AVX2;
@@ -757,7 +760,7 @@ int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0,
}
#endif
#if defined(HAS_ARGBSUBTRACTROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ if (TestCpuFlag(kCpuHasNEON)) {
ARGBSubtractRow = ARGBSubtractRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
ARGBSubtractRow = ARGBSubtractRow_NEON;
@@ -808,24 +811,31 @@ int I422ToBGRA(const uint8* src_y, int src_stride_y,
height = 1;
src_stride_y = src_stride_u = src_stride_v = dst_stride_bgra = 0;
}
-#if defined(HAS_I422TOBGRAROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- I422ToBGRARow = I422ToBGRARow_Any_NEON;
+#if defined(HAS_I422TOBGRAROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ I422ToBGRARow = I422ToBGRARow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToBGRARow = I422ToBGRARow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TOBGRAROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToBGRARow = I422ToBGRARow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
- I422ToBGRARow = I422ToBGRARow_NEON;
+ I422ToBGRARow = I422ToBGRARow_AVX2;
}
}
-#elif defined(HAS_I422TOBGRAROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
- I422ToBGRARow = I422ToBGRARow_Any_SSSE3;
+#endif
+#if defined(HAS_I422TOBGRAROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToBGRARow = I422ToBGRARow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
- I422ToBGRARow = I422ToBGRARow_Unaligned_SSSE3;
- if (IS_ALIGNED(dst_bgra, 16) && IS_ALIGNED(dst_stride_bgra, 16)) {
- I422ToBGRARow = I422ToBGRARow_SSSE3;
- }
+ I422ToBGRARow = I422ToBGRARow_NEON;
}
}
-#elif defined(HAS_I422TOBGRAROW_MIPS_DSPR2)
+#endif
+#if defined(HAS_I422TOBGRAROW_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
@@ -879,20 +889,26 @@ int I422ToABGR(const uint8* src_y, int src_stride_y,
src_stride_y = src_stride_u = src_stride_v = dst_stride_abgr = 0;
}
#if defined(HAS_I422TOABGRROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
I422ToABGRRow = I422ToABGRRow_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
+ if (IS_ALIGNED(width, 8)) {
I422ToABGRRow = I422ToABGRRow_NEON;
}
}
-#elif defined(HAS_I422TOABGRROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+#endif
+#if defined(HAS_I422TOABGRROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
I422ToABGRRow = I422ToABGRRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
- I422ToABGRRow = I422ToABGRRow_Unaligned_SSSE3;
- if (IS_ALIGNED(dst_abgr, 16) && IS_ALIGNED(dst_stride_abgr, 16)) {
- I422ToABGRRow = I422ToABGRRow_SSSE3;
- }
+ I422ToABGRRow = I422ToABGRRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TOABGRROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToABGRRow = I422ToABGRRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToABGRRow = I422ToABGRRow_AVX2;
}
}
#endif
@@ -941,20 +957,26 @@ int I422ToRGBA(const uint8* src_y, int src_stride_y,
src_stride_y = src_stride_u = src_stride_v = dst_stride_rgba = 0;
}
#if defined(HAS_I422TORGBAROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
I422ToRGBARow = I422ToRGBARow_Any_NEON;
- if (IS_ALIGNED(width, 16)) {
+ if (IS_ALIGNED(width, 8)) {
I422ToRGBARow = I422ToRGBARow_NEON;
}
}
-#elif defined(HAS_I422TORGBAROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+#endif
+#if defined(HAS_I422TORGBAROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
I422ToRGBARow = I422ToRGBARow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
- I422ToRGBARow = I422ToRGBARow_Unaligned_SSSE3;
- if (IS_ALIGNED(dst_rgba, 16) && IS_ALIGNED(dst_stride_rgba, 16)) {
- I422ToRGBARow = I422ToRGBARow_SSSE3;
- }
+ I422ToRGBARow = I422ToRGBARow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TORGBAROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ I422ToRGBARow = I422ToRGBARow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToRGBARow = I422ToRGBARow_AVX2;
}
}
#endif
@@ -991,14 +1013,23 @@ int NV12ToRGB565(const uint8* src_y, int src_stride_y,
dst_stride_rgb565 = -dst_stride_rgb565;
}
#if defined(HAS_NV12TORGB565ROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
NV12ToRGB565Row = NV12ToRGB565Row_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
NV12ToRGB565Row = NV12ToRGB565Row_SSSE3;
}
}
-#elif defined(HAS_NV12TORGB565ROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+#endif
+#if defined(HAS_NV12TORGB565ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ NV12ToRGB565Row = NV12ToRGB565Row_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ NV12ToRGB565Row = NV12ToRGB565Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_NV12TORGB565ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
NV12ToRGB565Row = NV12ToRGB565Row_Any_NEON;
if (IS_ALIGNED(width, 8)) {
NV12ToRGB565Row = NV12ToRGB565Row_NEON;
@@ -1039,14 +1070,23 @@ int NV21ToRGB565(const uint8* src_y, int src_stride_y,
dst_stride_rgb565 = -dst_stride_rgb565;
}
#if defined(HAS_NV21TORGB565ROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
NV21ToRGB565Row = NV21ToRGB565Row_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
NV21ToRGB565Row = NV21ToRGB565Row_SSSE3;
}
}
-#elif defined(HAS_NV21TORGB565ROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+#endif
+#if defined(HAS_NV21TORGB565ROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ NV21ToRGB565Row = NV21ToRGB565Row_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ NV21ToRGB565Row = NV21ToRGB565Row_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_NV21TORGB565ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
NV21ToRGB565Row = NV21ToRGB565Row_Any_NEON;
if (IS_ALIGNED(width, 8)) {
NV21ToRGB565Row = NV21ToRGB565Row_NEON;
@@ -1070,8 +1110,12 @@ void SetPlane(uint8* dst_y, int dst_stride_y,
int width, int height,
uint32 value) {
int y;
- uint32 v32 = value | (value << 8) | (value << 16) | (value << 24);
- void (*SetRow)(uint8* dst, uint32 value, int pix) = SetRow_C;
+ void (*SetRow)(uint8* dst, uint8 value, int pix) = SetRow_C;
+ if (height < 0) {
+ height = -height;
+ dst_y = dst_y + (height - 1) * dst_stride_y;
+ dst_stride_y = -dst_stride_y;
+ }
// Coalesce rows.
if (dst_stride_y == width) {
width *= height;
@@ -1079,21 +1123,30 @@ void SetPlane(uint8* dst_y, int dst_stride_y,
dst_stride_y = 0;
}
#if defined(HAS_SETROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) &&
- IS_ALIGNED(width, 16) &&
- IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
- SetRow = SetRow_NEON;
+ if (TestCpuFlag(kCpuHasNEON)) {
+ SetRow = SetRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ SetRow = SetRow_NEON;
+ }
}
#endif
#if defined(HAS_SETROW_X86)
- if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) {
- SetRow = SetRow_X86;
+ if (TestCpuFlag(kCpuHasX86)) {
+ SetRow = SetRow_Any_X86;
+ if (IS_ALIGNED(width, 4)) {
+ SetRow = SetRow_X86;
+ }
+ }
+#endif
+#if defined(HAS_SETROW_ERMS)
+ if (TestCpuFlag(kCpuHasERMS)) {
+ SetRow = SetRow_ERMS;
}
#endif
// Set plane
for (y = 0; y < height; ++y) {
- SetRow(dst_y, v32, width);
+ SetRow(dst_y, value, width);
dst_y += dst_stride_y;
}
}
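
SetPlane above also shows the "Coalesce rows" trick used throughout these functions: when the stride equals the width, the rows sit back to back in memory, so the 2-D fill collapses into one pass over a single long row. A self-contained sketch (FillPlane is a hypothetical name):

#include <stdint.h>
typedef uint8_t uint8;

static void FillPlane(uint8* dst, int dst_stride,
                      int width, int height, uint8 value) {
  if (dst_stride == width) {  // plane is contiguous: coalesce the rows
    width *= height;
    height = 1;
    dst_stride = 0;           // the single coalesced row never advances
  }
  for (int y = 0; y < height; ++y) {
    for (int i = 0; i < width; ++i) dst[i] = value;  // SetRow_C equivalent
    dst += dst_stride;
  }
}
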
@@ -1112,7 +1165,7 @@ int I420Rect(uint8* dst_y, int dst_stride_y,
uint8* start_u = dst_u + (y / 2) * dst_stride_u + (x / 2);
uint8* start_v = dst_v + (y / 2) * dst_stride_v + (x / 2);
if (!dst_y || !dst_u || !dst_v ||
- width <= 0 || height <= 0 ||
+ width <= 0 || height == 0 ||
x < 0 || y < 0 ||
value_y < 0 || value_y > 255 ||
value_u < 0 || value_u > 255 ||
@@ -1132,11 +1185,18 @@ int ARGBRect(uint8* dst_argb, int dst_stride_argb,
int dst_x, int dst_y,
int width, int height,
uint32 value) {
+ int y;
+ void (*ARGBSetRow)(uint8* dst_argb, uint32 value, int pix) = ARGBSetRow_C;
if (!dst_argb ||
- width <= 0 || height <= 0 ||
+ width <= 0 || height == 0 ||
dst_x < 0 || dst_y < 0) {
return -1;
}
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
dst_argb += dst_y * dst_stride_argb + dst_x * 4;
// Coalesce rows.
if (dst_stride_argb == width * 4) {
@@ -1144,20 +1204,26 @@ int ARGBRect(uint8* dst_argb, int dst_stride_argb,
height = 1;
dst_stride_argb = 0;
}
-#if defined(HAS_SETROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- ARGBSetRows_NEON(dst_argb, value, width, dst_stride_argb, height);
- return 0;
+
+#if defined(HAS_ARGBSETROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBSetRow = ARGBSetRow_Any_NEON;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBSetRow = ARGBSetRow_NEON;
+ }
}
#endif
-#if defined(HAS_SETROW_X86)
+#if defined(HAS_ARGBSETROW_X86)
if (TestCpuFlag(kCpuHasX86)) {
- ARGBSetRows_X86(dst_argb, value, width, dst_stride_argb, height);
- return 0;
+ ARGBSetRow = ARGBSetRow_X86;
}
#endif
- ARGBSetRows_C(dst_argb, value, width, dst_stride_argb, height);
+
+ // Set plane
+ for (y = 0; y < height; ++y) {
+ ARGBSetRow(dst_argb, value, width);
+ dst_argb += dst_stride_argb;
+ }
return 0;
}
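
The height < 0 handling added to SetPlane and ARGBRect follows libyuv's convention that a negative height denotes a bottom-up image: normalize once by pointing at the last row and negating the stride, then iterate top-down as usual. A sketch, with a hypothetical helper name:

#include <stdint.h>
typedef uint8_t uint8;

static void NormalizeOrientation(uint8** dst, int* dst_stride, int* height) {
  if (*height < 0) {
    *height = -*height;
    *dst += (*height - 1) * *dst_stride;  // start at the last row
    *dst_stride = -*dst_stride;           // and step upward through memory
  }
}
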
@@ -1197,9 +1263,7 @@ int ARGBAttenuate(const uint8* src_argb, int src_stride_argb,
src_stride_argb = dst_stride_argb = 0;
}
#if defined(HAS_ARGBATTENUATEROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && width >= 4 &&
- IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ if (TestCpuFlag(kCpuHasSSE2)) {
ARGBAttenuateRow = ARGBAttenuateRow_Any_SSE2;
if (IS_ALIGNED(width, 4)) {
ARGBAttenuateRow = ARGBAttenuateRow_SSE2;
@@ -1207,7 +1271,7 @@ int ARGBAttenuate(const uint8* src_argb, int src_stride_argb,
}
#endif
#if defined(HAS_ARGBATTENUATEROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 4) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3;
if (IS_ALIGNED(width, 4)) {
ARGBAttenuateRow = ARGBAttenuateRow_SSSE3;
@@ -1215,7 +1279,7 @@ int ARGBAttenuate(const uint8* src_argb, int src_stride_argb,
}
#endif
#if defined(HAS_ARGBATTENUATEROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && width >= 8) {
+ if (TestCpuFlag(kCpuHasAVX2)) {
ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2;
if (IS_ALIGNED(width, 8)) {
ARGBAttenuateRow = ARGBAttenuateRow_AVX2;
@@ -1223,7 +1287,7 @@ int ARGBAttenuate(const uint8* src_argb, int src_stride_argb,
}
#endif
#if defined(HAS_ARGBATTENUATEROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ if (TestCpuFlag(kCpuHasNEON)) {
ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
ARGBAttenuateRow = ARGBAttenuateRow_NEON;
@@ -1263,7 +1327,7 @@ int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb,
src_stride_argb = dst_stride_argb = 0;
}
#if defined(HAS_ARGBUNATTENUATEROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && width >= 4) {
+ if (TestCpuFlag(kCpuHasSSE2)) {
ARGBUnattenuateRow = ARGBUnattenuateRow_Any_SSE2;
if (IS_ALIGNED(width, 4)) {
ARGBUnattenuateRow = ARGBUnattenuateRow_SSE2;
@@ -1271,7 +1335,7 @@ int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb,
}
#endif
#if defined(HAS_ARGBUNATTENUATEROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && width >= 8) {
+ if (TestCpuFlag(kCpuHasAVX2)) {
ARGBUnattenuateRow = ARGBUnattenuateRow_Any_AVX2;
if (IS_ALIGNED(width, 8)) {
ARGBUnattenuateRow = ARGBUnattenuateRow_AVX2;
@@ -1312,12 +1376,11 @@ int ARGBGrayTo(const uint8* src_argb, int src_stride_argb,
src_stride_argb = dst_stride_argb = 0;
}
#if defined(HAS_ARGBGRAYROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8) &&
- IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) {
ARGBGrayRow = ARGBGrayRow_SSSE3;
}
-#elif defined(HAS_ARGBGRAYROW_NEON)
+#endif
+#if defined(HAS_ARGBGRAYROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
ARGBGrayRow = ARGBGrayRow_NEON;
}
@@ -1350,11 +1413,11 @@ int ARGBGray(uint8* dst_argb, int dst_stride_argb,
dst_stride_argb = 0;
}
#if defined(HAS_ARGBGRAYROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) {
ARGBGrayRow = ARGBGrayRow_SSSE3;
}
-#elif defined(HAS_ARGBGRAYROW_NEON)
+#endif
+#if defined(HAS_ARGBGRAYROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
ARGBGrayRow = ARGBGrayRow_NEON;
}
@@ -1383,11 +1446,11 @@ int ARGBSepia(uint8* dst_argb, int dst_stride_argb,
dst_stride_argb = 0;
}
#if defined(HAS_ARGBSEPIAROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) {
ARGBSepiaRow = ARGBSepiaRow_SSSE3;
}
-#elif defined(HAS_ARGBSEPIAROW_NEON)
+#endif
+#if defined(HAS_ARGBSEPIAROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
ARGBSepiaRow = ARGBSepiaRow_NEON;
}
@@ -1425,11 +1488,11 @@ int ARGBColorMatrix(const uint8* src_argb, int src_stride_argb,
src_stride_argb = dst_stride_argb = 0;
}
#if defined(HAS_ARGBCOLORMATRIXROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) {
ARGBColorMatrixRow = ARGBColorMatrixRow_SSSE3;
}
-#elif defined(HAS_ARGBCOLORMATRIXROW_NEON)
+#endif
+#if defined(HAS_ARGBCOLORMATRIXROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
ARGBColorMatrixRow = ARGBColorMatrixRow_NEON;
}
@@ -1568,11 +1631,11 @@ int ARGBQuantize(uint8* dst_argb, int dst_stride_argb,
dst_stride_argb = 0;
}
#if defined(HAS_ARGBQUANTIZEROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4)) {
ARGBQuantizeRow = ARGBQuantizeRow_SSE2;
}
-#elif defined(HAS_ARGBQUANTIZEROW_NEON)
+#endif
+#if defined(HAS_ARGBQUANTIZEROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
ARGBQuantizeRow = ARGBQuantizeRow_NEON;
}
@@ -1743,12 +1806,11 @@ int ARGBShade(const uint8* src_argb, int src_stride_argb,
src_stride_argb = dst_stride_argb = 0;
}
#if defined(HAS_ARGBSHADEROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4) &&
- IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4)) {
ARGBShadeRow = ARGBShadeRow_SSE2;
}
-#elif defined(HAS_ARGBSHADEROW_NEON)
+#endif
+#if defined(HAS_ARGBSHADEROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
ARGBShadeRow = ARGBShadeRow_NEON;
}
@@ -1790,33 +1852,23 @@ int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0,
src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0;
}
#if defined(HAS_INTERPOLATEROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && width >= 4) {
+ if (TestCpuFlag(kCpuHasSSE2)) {
InterpolateRow = InterpolateRow_Any_SSE2;
if (IS_ALIGNED(width, 4)) {
- InterpolateRow = InterpolateRow_Unaligned_SSE2;
- if (IS_ALIGNED(src_argb0, 16) && IS_ALIGNED(src_stride_argb0, 16) &&
- IS_ALIGNED(src_argb1, 16) && IS_ALIGNED(src_stride_argb1, 16) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- InterpolateRow = InterpolateRow_SSE2;
- }
+ InterpolateRow = InterpolateRow_SSE2;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 4) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
InterpolateRow = InterpolateRow_Any_SSSE3;
if (IS_ALIGNED(width, 4)) {
- InterpolateRow = InterpolateRow_Unaligned_SSSE3;
- if (IS_ALIGNED(src_argb0, 16) && IS_ALIGNED(src_stride_argb0, 16) &&
- IS_ALIGNED(src_argb1, 16) && IS_ALIGNED(src_stride_argb1, 16) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- InterpolateRow = InterpolateRow_SSSE3;
- }
+ InterpolateRow = InterpolateRow_SSSE3;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && width >= 8) {
+ if (TestCpuFlag(kCpuHasAVX2)) {
InterpolateRow = InterpolateRow_Any_AVX2;
if (IS_ALIGNED(width, 8)) {
InterpolateRow = InterpolateRow_AVX2;
@@ -1824,19 +1876,19 @@ int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0,
}
#endif
#if defined(HAS_INTERPOLATEROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 4) {
+ if (TestCpuFlag(kCpuHasNEON)) {
InterpolateRow = InterpolateRow_Any_NEON;
if (IS_ALIGNED(width, 4)) {
InterpolateRow = InterpolateRow_NEON;
}
}
#endif
-#if defined(HAS_INTERPOLATEROWS_MIPS_DSPR2)
- if (TestCpuFlag(kCpuHasMIPS_DSPR2) && width >= 1 &&
+#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
IS_ALIGNED(src_argb0, 4) && IS_ALIGNED(src_stride_argb0, 4) &&
IS_ALIGNED(src_argb1, 4) && IS_ALIGNED(src_stride_argb1, 4) &&
IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
- ScaleARGBFilterRows = InterpolateRow_MIPS_DSPR2;
+ InterpolateRow = InterpolateRow_MIPS_DSPR2;
}
#endif
@@ -1876,7 +1928,7 @@ int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra,
src_stride_bgra = dst_stride_argb = 0;
}
#if defined(HAS_ARGBSHUFFLEROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && width >= 4) {
+ if (TestCpuFlag(kCpuHasSSE2)) {
ARGBShuffleRow = ARGBShuffleRow_Any_SSE2;
if (IS_ALIGNED(width, 4)) {
ARGBShuffleRow = ARGBShuffleRow_SSE2;
@@ -1884,19 +1936,15 @@ int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra,
}
#endif
#if defined(HAS_ARGBSHUFFLEROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBShuffleRow = ARGBShuffleRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
- ARGBShuffleRow = ARGBShuffleRow_Unaligned_SSSE3;
- if (IS_ALIGNED(src_bgra, 16) && IS_ALIGNED(src_stride_bgra, 16) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- ARGBShuffleRow = ARGBShuffleRow_SSSE3;
- }
+ ARGBShuffleRow = ARGBShuffleRow_SSSE3;
}
}
#endif
#if defined(HAS_ARGBSHUFFLEROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && width >= 16) {
+ if (TestCpuFlag(kCpuHasAVX2)) {
ARGBShuffleRow = ARGBShuffleRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
ARGBShuffleRow = ARGBShuffleRow_AVX2;
@@ -1904,7 +1952,7 @@ int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra,
}
#endif
#if defined(HAS_ARGBSHUFFLEROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 4) {
+ if (TestCpuFlag(kCpuHasNEON)) {
ARGBShuffleRow = ARGBShuffleRow_Any_NEON;
if (IS_ALIGNED(width, 4)) {
ARGBShuffleRow = ARGBShuffleRow_NEON;
@@ -1947,8 +1995,7 @@ static int ARGBSobelize(const uint8* src_argb, int src_stride_argb,
}
 // ARGBToBayer is used to select the G channel from ARGB.
#if defined(HAS_ARGBTOBAYERGGROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && width >= 8 &&
- IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
+ if (TestCpuFlag(kCpuHasSSE2)) {
ARGBToBayerRow = ARGBToBayerGGRow_Any_SSE2;
if (IS_ALIGNED(width, 8)) {
ARGBToBayerRow = ARGBToBayerGGRow_SSE2;
@@ -1956,8 +2003,7 @@ static int ARGBSobelize(const uint8* src_argb, int src_stride_argb,
}
#endif
#if defined(HAS_ARGBTOBAYERROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 8 &&
- IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToBayerRow = ARGBToBayerRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
ARGBToBayerRow = ARGBToBayerRow_SSSE3;
@@ -1965,7 +2011,7 @@ static int ARGBSobelize(const uint8* src_argb, int src_stride_argb,
}
#endif
#if defined(HAS_ARGBTOBAYERGGROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ if (TestCpuFlag(kCpuHasNEON)) {
ARGBToBayerRow = ARGBToBayerGGRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
ARGBToBayerRow = ARGBToBayerGGRow_NEON;
@@ -2048,8 +2094,7 @@ int ARGBSobel(const uint8* src_argb, int src_stride_argb,
void (*SobelRow)(const uint8* src_sobelx, const uint8* src_sobely,
uint8* dst_argb, int width) = SobelRow_C;
#if defined(HAS_SOBELROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16)) {
SobelRow = SobelRow_SSE2;
}
#endif
@@ -2070,8 +2115,7 @@ int ARGBSobelToPlane(const uint8* src_argb, int src_stride_argb,
void (*SobelToPlaneRow)(const uint8* src_sobelx, const uint8* src_sobely,
uint8* dst_, int width) = SobelToPlaneRow_C;
#if defined(HAS_SOBELTOPLANEROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) &&
- IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16)) {
SobelToPlaneRow = SobelToPlaneRow_SSE2;
}
#endif
@@ -2093,8 +2137,7 @@ int ARGBSobelXY(const uint8* src_argb, int src_stride_argb,
void (*SobelXYRow)(const uint8* src_sobelx, const uint8* src_sobely,
uint8* dst_argb, int width) = SobelXYRow_C;
#if defined(HAS_SOBELXYROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16)) {
SobelXYRow = SobelXYRow_SSE2;
}
#endif
@@ -2218,10 +2261,7 @@ int ARGBCopyAlpha(const uint8* src_argb, int src_stride_argb,
src_stride_argb = dst_stride_argb = 0;
}
#if defined(HAS_ARGBCOPYALPHAROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) &&
- IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16) &&
- IS_ALIGNED(width, 8)) {
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 8)) {
ARGBCopyAlphaRow = ARGBCopyAlphaRow_SSE2;
}
#endif
@@ -2264,10 +2304,7 @@ int ARGBCopyYToAlpha(const uint8* src_y, int src_stride_y,
src_stride_y = dst_stride_argb = 0;
}
#if defined(HAS_ARGBCOPYYTOALPHAROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) &&
- IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16) &&
- IS_ALIGNED(width, 8)) {
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 8)) {
ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_SSE2;
}
#endif
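
A second recurring change in this file is turning #elif chains over the SSE2/SSSE3/AVX2/NEON/MIPS candidates into independent #if blocks. With #elif only the first candidate compiled in could ever be selected; with separate blocks every supported kernel is built, and because a later assignment overwrites an earlier one, the textual order of the blocks encodes runtime priority. Schematically (all names illustrative):

#define HAS_ROW_SSE2
#define HAS_ROW_AVX2

typedef void (*RowFn)(int width);
static void Row_C(int) {}
static void Row_SSE2(int) {}
static void Row_AVX2(int) {}
static bool HasSSE2() { return true; }
static bool HasAVX2() { return true; }

static RowFn ChooseRow() {
  RowFn row = Row_C;
#if defined(HAS_ROW_SSE2)
  if (HasSSE2()) row = Row_SSE2;
#endif
#if defined(HAS_ROW_AVX2)
  if (HasAVX2()) row = Row_AVX2;  // wins over SSE2 when both checks pass
#endif
  return row;
}
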
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/rotate.cc b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/rotate.cc
index 2ef3228cb80..5acaccfd89d 100644
--- a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/rotate.cc
+++ b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/rotate.cc
@@ -42,11 +42,7 @@ extern "C" {
#endif
#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
- (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
-#define HAS_MIRRORROW_NEON
-void MirrorRow_NEON(const uint8* src, uint8* dst, int width);
-#define HAS_MIRRORROW_UV_NEON
-void MirrorUVRow_NEON(const uint8* src, uint8* dst_a, uint8* dst_b, int width);
+ (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
#define HAS_TRANSPOSE_WX8_NEON
void TransposeWx8_NEON(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width);
@@ -55,7 +51,7 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
int width);
-#endif // defined(__ARM_NEON__)
+#endif
#if !defined(LIBYUV_DISABLE_MIPS) && !defined(__native_client__) && \
defined(__mips__) && \
@@ -194,31 +190,31 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
convertloop:
// Read in the data from the source pointer.
// First round of bit swap.
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + edi]
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + edi]
lea eax, [eax + 2 * edi]
movdqa xmm7, xmm0 // use xmm7 as temp register.
punpcklbw xmm0, xmm1
punpckhbw xmm7, xmm1
movdqa xmm1, xmm7
- movdqa xmm2, [eax]
- movdqa xmm3, [eax + edi]
+ movdqu xmm2, [eax]
+ movdqu xmm3, [eax + edi]
lea eax, [eax + 2 * edi]
movdqa xmm7, xmm2
punpcklbw xmm2, xmm3
punpckhbw xmm7, xmm3
movdqa xmm3, xmm7
- movdqa xmm4, [eax]
- movdqa xmm5, [eax + edi]
+ movdqu xmm4, [eax]
+ movdqu xmm5, [eax + edi]
lea eax, [eax + 2 * edi]
movdqa xmm7, xmm4
punpcklbw xmm4, xmm5
punpckhbw xmm7, xmm5
movdqa xmm5, xmm7
- movdqa xmm6, [eax]
- movdqa xmm7, [eax + edi]
+ movdqu xmm6, [eax]
+ movdqu xmm7, [eax + edi]
lea eax, [eax + 2 * edi]
- movdqa [esp], xmm5 // backup xmm5
+ movdqu [esp], xmm5 // backup xmm5
neg edi
movdqa xmm5, xmm6 // use xmm5 as temp register.
punpcklbw xmm6, xmm7
@@ -239,8 +235,8 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
punpcklwd xmm4, xmm6
punpckhwd xmm5, xmm6
movdqa xmm6, xmm5
- movdqa xmm5, [esp] // restore xmm5
- movdqa [esp], xmm6 // backup xmm6
+ movdqu xmm5, [esp] // restore xmm5
+ movdqu [esp], xmm6 // backup xmm6
movdqa xmm6, xmm5 // use xmm6 as temp register.
punpcklwd xmm5, xmm7
punpckhwd xmm6, xmm7
@@ -251,7 +247,7 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
punpckldq xmm0, xmm4
punpckhdq xmm6, xmm4
movdqa xmm4, xmm6
- movdqa xmm6, [esp] // restore xmm6
+ movdqu xmm6, [esp] // restore xmm6
movlpd qword ptr [edx], xmm0
movhpd qword ptr [ebx], xmm0
movlpd qword ptr [edx + esi], xmm4
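
The assembly hunks above are the other half of dropping the alignment requirements: every movdqa (aligned 128-bit move, which faults on a non-16-byte-aligned address) that touches memory becomes movdqu (unaligned); only the register-to-register movdqa forms survive, since registers have no alignment. The intrinsics equivalent of the two instructions, for reference:

#include <emmintrin.h>  // SSE2 intrinsics

// movdqa equivalent: faults if p is not 16-byte aligned.
static __m128i LoadAligned(const void* p) {
  return _mm_load_si128(static_cast<const __m128i*>(p));
}

// movdqu equivalent: accepts any address; on most modern x86 an aligned
// address costs about the same through this path, which is why the
// alignment checks can be removed outright.
static __m128i LoadUnaligned(const void* p) {
  return _mm_loadu_si128(static_cast<const __m128i*>(p));
}
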
@@ -296,7 +292,8 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
ret
}
}
-#elif !defined(LIBYUV_DISABLE_X86) && \
+#endif
+#if !defined(LIBYUV_DISABLE_X86) && \
(defined(__i386__) || (defined(__x86_64__) && !defined(__native_client__)))
#define HAS_TRANSPOSE_WX8_SSSE3
static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
@@ -379,10 +376,8 @@ static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
"+r"(width) // %2
: "r"((intptr_t)(src_stride)), // %3
"r"((intptr_t)(dst_stride)) // %4
- : "memory", "cc"
- #if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
- #endif
+ : "memory", "cc",
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
);
}
@@ -411,31 +406,31 @@ void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
"mov 0x2c(%ecx),%ecx \n"
"1: \n"
- "movdqa (%eax),%xmm0 \n"
- "movdqa (%eax,%edi,1),%xmm1 \n"
+ "movdqu (%eax),%xmm0 \n"
+ "movdqu (%eax,%edi,1),%xmm1 \n"
"lea (%eax,%edi,2),%eax \n"
"movdqa %xmm0,%xmm7 \n"
"punpcklbw %xmm1,%xmm0 \n"
"punpckhbw %xmm1,%xmm7 \n"
"movdqa %xmm7,%xmm1 \n"
- "movdqa (%eax),%xmm2 \n"
- "movdqa (%eax,%edi,1),%xmm3 \n"
+ "movdqu (%eax),%xmm2 \n"
+ "movdqu (%eax,%edi,1),%xmm3 \n"
"lea (%eax,%edi,2),%eax \n"
"movdqa %xmm2,%xmm7 \n"
"punpcklbw %xmm3,%xmm2 \n"
"punpckhbw %xmm3,%xmm7 \n"
"movdqa %xmm7,%xmm3 \n"
- "movdqa (%eax),%xmm4 \n"
- "movdqa (%eax,%edi,1),%xmm5 \n"
+ "movdqu (%eax),%xmm4 \n"
+ "movdqu (%eax,%edi,1),%xmm5 \n"
"lea (%eax,%edi,2),%eax \n"
"movdqa %xmm4,%xmm7 \n"
"punpcklbw %xmm5,%xmm4 \n"
"punpckhbw %xmm5,%xmm7 \n"
"movdqa %xmm7,%xmm5 \n"
- "movdqa (%eax),%xmm6 \n"
- "movdqa (%eax,%edi,1),%xmm7 \n"
+ "movdqu (%eax),%xmm6 \n"
+ "movdqu (%eax,%edi,1),%xmm7 \n"
"lea (%eax,%edi,2),%eax \n"
- "movdqa %xmm5,(%esp) \n"
+ "movdqu %xmm5,(%esp) \n"
"neg %edi \n"
"movdqa %xmm6,%xmm5 \n"
"punpcklbw %xmm7,%xmm6 \n"
@@ -455,8 +450,8 @@ void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
"punpcklwd %xmm6,%xmm4 \n"
"punpckhwd %xmm6,%xmm5 \n"
"movdqa %xmm5,%xmm6 \n"
- "movdqa (%esp),%xmm5 \n"
- "movdqa %xmm6,(%esp) \n"
+ "movdqu (%esp),%xmm5 \n"
+ "movdqu %xmm6,(%esp) \n"
"movdqa %xmm5,%xmm6 \n"
"punpcklwd %xmm7,%xmm5 \n"
"punpckhwd %xmm7,%xmm6 \n"
@@ -465,7 +460,7 @@ void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
"punpckldq %xmm4,%xmm0 \n"
"punpckhdq %xmm4,%xmm6 \n"
"movdqa %xmm6,%xmm4 \n"
- "movdqa (%esp),%xmm6 \n"
+ "movdqu (%esp),%xmm6 \n"
"movlpd %xmm0,(%edx) \n"
"movhpd %xmm0,(%ebx) \n"
"movlpd %xmm4,(%edx,%esi,1) \n"
@@ -514,7 +509,8 @@ void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
"ret \n"
#endif
);
-#elif !defined(LIBYUV_DISABLE_X86) && !defined(__native_client__) && \
+#endif
+#if !defined(LIBYUV_DISABLE_X86) && !defined(__native_client__) && \
defined(__x86_64__)
// 64 bit version has enough registers to do 16x8 to 8x16 at a time.
#define HAS_TRANSPOSE_WX8_FAST_SSSE3
@@ -525,38 +521,38 @@ static void TransposeWx8_FAST_SSSE3(const uint8* src, int src_stride,
// First round of bit swap.
".p2align 2 \n"
"1: \n"
- "movdqa (%0),%%xmm0 \n"
- "movdqa (%0,%3),%%xmm1 \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu (%0,%3),%%xmm1 \n"
"lea (%0,%3,2),%0 \n"
"movdqa %%xmm0,%%xmm8 \n"
"punpcklbw %%xmm1,%%xmm0 \n"
"punpckhbw %%xmm1,%%xmm8 \n"
- "movdqa (%0),%%xmm2 \n"
+ "movdqu (%0),%%xmm2 \n"
"movdqa %%xmm0,%%xmm1 \n"
"movdqa %%xmm8,%%xmm9 \n"
"palignr $0x8,%%xmm1,%%xmm1 \n"
"palignr $0x8,%%xmm9,%%xmm9 \n"
- "movdqa (%0,%3),%%xmm3 \n"
+ "movdqu (%0,%3),%%xmm3 \n"
"lea (%0,%3,2),%0 \n"
"movdqa %%xmm2,%%xmm10 \n"
"punpcklbw %%xmm3,%%xmm2 \n"
"punpckhbw %%xmm3,%%xmm10 \n"
"movdqa %%xmm2,%%xmm3 \n"
"movdqa %%xmm10,%%xmm11 \n"
- "movdqa (%0),%%xmm4 \n"
+ "movdqu (%0),%%xmm4 \n"
"palignr $0x8,%%xmm3,%%xmm3 \n"
"palignr $0x8,%%xmm11,%%xmm11 \n"
- "movdqa (%0,%3),%%xmm5 \n"
+ "movdqu (%0,%3),%%xmm5 \n"
"lea (%0,%3,2),%0 \n"
"movdqa %%xmm4,%%xmm12 \n"
"punpcklbw %%xmm5,%%xmm4 \n"
"punpckhbw %%xmm5,%%xmm12 \n"
"movdqa %%xmm4,%%xmm5 \n"
"movdqa %%xmm12,%%xmm13 \n"
- "movdqa (%0),%%xmm6 \n"
+ "movdqu (%0),%%xmm6 \n"
"palignr $0x8,%%xmm5,%%xmm5 \n"
"palignr $0x8,%%xmm13,%%xmm13 \n"
- "movdqa (%0,%3),%%xmm7 \n"
+ "movdqu (%0,%3),%%xmm7 \n"
"lea (%0,%3,2),%0 \n"
"movdqa %%xmm6,%%xmm14 \n"
"punpcklbw %%xmm7,%%xmm6 \n"
@@ -666,29 +662,29 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
// First round of bit swap.
".p2align 2 \n"
"1: \n"
- "movdqa (%0),%%xmm0 \n"
- "movdqa (%0,%4),%%xmm1 \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu (%0,%4),%%xmm1 \n"
"lea (%0,%4,2),%0 \n"
"movdqa %%xmm0,%%xmm8 \n"
"punpcklbw %%xmm1,%%xmm0 \n"
"punpckhbw %%xmm1,%%xmm8 \n"
"movdqa %%xmm8,%%xmm1 \n"
- "movdqa (%0),%%xmm2 \n"
- "movdqa (%0,%4),%%xmm3 \n"
+ "movdqu (%0),%%xmm2 \n"
+ "movdqu (%0,%4),%%xmm3 \n"
"lea (%0,%4,2),%0 \n"
"movdqa %%xmm2,%%xmm8 \n"
"punpcklbw %%xmm3,%%xmm2 \n"
"punpckhbw %%xmm3,%%xmm8 \n"
"movdqa %%xmm8,%%xmm3 \n"
- "movdqa (%0),%%xmm4 \n"
- "movdqa (%0,%4),%%xmm5 \n"
+ "movdqu (%0),%%xmm4 \n"
+ "movdqu (%0,%4),%%xmm5 \n"
"lea (%0,%4,2),%0 \n"
"movdqa %%xmm4,%%xmm8 \n"
"punpcklbw %%xmm5,%%xmm4 \n"
"punpckhbw %%xmm5,%%xmm8 \n"
"movdqa %%xmm8,%%xmm5 \n"
- "movdqa (%0),%%xmm6 \n"
- "movdqa (%0,%4),%%xmm7 \n"
+ "movdqu (%0),%%xmm6 \n"
+ "movdqu (%0,%4),%%xmm7 \n"
"lea (%0,%4,2),%0 \n"
"movdqa %%xmm6,%%xmm8 \n"
"punpcklbw %%xmm7,%%xmm6 \n"
@@ -818,9 +814,7 @@ void TransposePlane(const uint8* src, int src_stride,
}
#endif
#if defined(HAS_TRANSPOSE_WX8_FAST_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) &&
- IS_ALIGNED(width, 16) &&
- IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) {
+ if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) {
TransposeWx8 = TransposeWx8_FAST_SSSE3;
}
#endif
@@ -883,29 +877,38 @@ void RotatePlane180(const uint8* src, int src_stride,
void (*MirrorRow)(const uint8* src, uint8* dst, int width) = MirrorRow_C;
void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
#if defined(HAS_MIRRORROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) {
- MirrorRow = MirrorRow_NEON;
+ if (TestCpuFlag(kCpuHasNEON)) {
+ MirrorRow = MirrorRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ MirrorRow = MirrorRow_NEON;
+ }
}
#endif
#if defined(HAS_MIRRORROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) &&
- IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
- IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
- MirrorRow = MirrorRow_SSE2;
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ MirrorRow = MirrorRow_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ MirrorRow = MirrorRow_SSE2;
+ }
}
#endif
#if defined(HAS_MIRRORROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16) &&
- IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
- IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
- MirrorRow = MirrorRow_SSSE3;
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ MirrorRow = MirrorRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ MirrorRow = MirrorRow_SSSE3;
+ }
}
#endif
#if defined(HAS_MIRRORROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 32)) {
- MirrorRow = MirrorRow_AVX2;
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ MirrorRow = MirrorRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ MirrorRow = MirrorRow_AVX2;
+ }
}
#endif
+// TODO(fbarchard): Make MirrorRow on MIPS handle unaligned memory.
#if defined(HAS_MIRRORROW_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4) &&
@@ -913,21 +916,14 @@ void RotatePlane180(const uint8* src, int src_stride,
MirrorRow = MirrorRow_MIPS_DSPR2;
}
#endif
-#if defined(HAS_COPYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) {
- CopyRow = CopyRow_NEON;
- }
-#endif
-#if defined(HAS_COPYROW_X86)
- if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) {
- CopyRow = CopyRow_X86;
+#if defined(HAS_COPYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
}
#endif
-#if defined(HAS_COPYROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) &&
- IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
- IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
- CopyRow = CopyRow_SSE2;
+#if defined(HAS_COPYROW_AVX)
+ if (TestCpuFlag(kCpuHasAVX)) {
+ CopyRow = IS_ALIGNED(width, 64) ? CopyRow_AVX : CopyRow_Any_AVX;
}
#endif
#if defined(HAS_COPYROW_ERMS)
@@ -935,6 +931,11 @@ void RotatePlane180(const uint8* src, int src_stride,
CopyRow = CopyRow_ERMS;
}
#endif
+#if defined(HAS_COPYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
+ }
+#endif
#if defined(HAS_COPYROW_MIPS)
if (TestCpuFlag(kCpuHasMIPS)) {
CopyRow = CopyRow_MIPS;
@@ -1010,13 +1011,13 @@ void TransposeUV(const uint8* src, int src_stride,
if (TestCpuFlag(kCpuHasNEON)) {
TransposeUVWx8 = TransposeUVWx8_NEON;
}
-#elif defined(HAS_TRANSPOSE_UVWX8_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) &&
- IS_ALIGNED(width, 8) &&
- IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) {
+#endif
+#if defined(HAS_TRANSPOSE_UVWX8_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 8)) {
TransposeUVWx8 = TransposeUVWx8_SSE2;
}
-#elif defined(HAS_TRANSPOSE_UVWx8_MIPS_DSPR2)
+#endif
+#if defined(HAS_TRANSPOSE_UVWx8_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 2) &&
IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
TransposeUVWx8 = TransposeUVWx8_MIPS_DSPR2;
@@ -1084,12 +1085,13 @@ void RotateUV180(const uint8* src, int src_stride,
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
MirrorRowUV = MirrorUVRow_NEON;
}
-#elif defined(HAS_MIRRORROW_UV_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16) &&
- IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) {
+#endif
+#if defined(HAS_MIRRORROW_UV_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) {
MirrorRowUV = MirrorUVRow_SSSE3;
}
-#elif defined(HAS_MIRRORUVROW_MIPS_DSPR2)
+#endif
+#if defined(HAS_MIRRORUVROW_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
MirrorRowUV = MirrorUVRow_MIPS_DSPR2;
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/rotate_argb.cc b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/rotate_argb.cc
index ab0f9ce0707..b9673db15d3 100644
--- a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/rotate_argb.cc
+++ b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/rotate_argb.cc
@@ -31,7 +31,7 @@ void ScaleARGBRowDownEven_SSE2(const uint8* src_ptr, int src_stride,
uint8* dst_ptr, int dst_width);
#endif
#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
- (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
+ (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
#define HAS_SCALEARGBROWDOWNEVEN_NEON
void ScaleARGBRowDownEven_NEON(const uint8* src_ptr, int src_stride,
int src_stepx,
@@ -50,13 +50,12 @@ static void ARGBTranspose(const uint8* src, int src_stride,
void (*ScaleARGBRowDownEven)(const uint8* src_ptr, int src_stride,
int src_step, uint8* dst_ptr, int dst_width) = ScaleARGBRowDownEven_C;
#if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(height, 4) && // Width of dest.
- IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(height, 4)) { // Width of dest.
ScaleARGBRowDownEven = ScaleARGBRowDownEven_SSE2;
}
-#elif defined(HAS_SCALEARGBROWDOWNEVEN_NEON)
- if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(height, 4) && // Width of dest.
- IS_ALIGNED(src, 4)) {
+#endif
+#if defined(HAS_SCALEARGBROWDOWNEVEN_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(height, 4)) { // Width of dest.
ScaleARGBRowDownEven = ScaleARGBRowDownEven_NEON;
}
#endif
@@ -102,38 +101,38 @@ void ARGBRotate180(const uint8* src, int src_stride,
void (*ARGBMirrorRow)(const uint8* src, uint8* dst, int width) =
ARGBMirrorRow_C;
void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
-#if defined(HAS_ARGBMIRRORROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4) &&
- IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
- IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
- ARGBMirrorRow = ARGBMirrorRow_SSSE3;
- }
-#endif
-#if defined(HAS_ARGBMIRRORROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 8)) {
- ARGBMirrorRow = ARGBMirrorRow_AVX2;
- }
-#endif
#if defined(HAS_ARGBMIRRORROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 4)) {
- ARGBMirrorRow = ARGBMirrorRow_NEON;
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBMirrorRow = ARGBMirrorRow_Any_NEON;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBMirrorRow = ARGBMirrorRow_NEON;
+ }
}
#endif
-#if defined(HAS_COPYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width * 4, 32)) {
- CopyRow = CopyRow_NEON;
+#if defined(HAS_ARGBMIRRORROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ARGBMirrorRow = ARGBMirrorRow_Any_SSE2;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBMirrorRow = ARGBMirrorRow_SSE2;
+ }
}
#endif
-#if defined(HAS_COPYROW_X86)
- if (TestCpuFlag(kCpuHasX86)) {
- CopyRow = CopyRow_X86;
+#if defined(HAS_ARGBMIRRORROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ ARGBMirrorRow = ARGBMirrorRow_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBMirrorRow = ARGBMirrorRow_AVX2;
+ }
}
#endif
#if defined(HAS_COPYROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width * 4, 32) &&
- IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
- IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
- CopyRow = CopyRow_SSE2;
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ CopyRow = IS_ALIGNED(width * 4, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
+ }
+#endif
+#if defined(HAS_COPYROW_AVX)
+ if (TestCpuFlag(kCpuHasAVX)) {
+ CopyRow = IS_ALIGNED(width * 4, 64) ? CopyRow_AVX : CopyRow_Any_AVX;
}
#endif
#if defined(HAS_COPYROW_ERMS)
@@ -141,6 +140,11 @@ void ARGBRotate180(const uint8* src, int src_stride,
CopyRow = CopyRow_ERMS;
}
#endif
+#if defined(HAS_COPYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ CopyRow = IS_ALIGNED(width * 4, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
+ }
+#endif
#if defined(HAS_COPYROW_MIPS)
if (TestCpuFlag(kCpuHasMIPS)) {
CopyRow = CopyRow_MIPS;
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/rotate_neon.cc b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/rotate_neon.cc
index d354e11faa6..a23a40fee34 100644
--- a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/rotate_neon.cc
+++ b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/rotate_neon.cc
@@ -17,7 +17,8 @@ namespace libyuv {
extern "C" {
#endif
-#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
+ !defined(__aarch64__)
static uvec8 kVTbl4x4Transpose =
{ 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };
@@ -525,7 +526,7 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride,
"q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
);
}
-#endif
+#endif // defined(__ARM_NEON__) && !defined(__aarch64__)
#ifdef __cplusplus
} // extern "C"
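
rotate_neon.cc now excludes itself under __aarch64__: ARMv8-A AArch64 NEON uses different register names and instruction spellings, so the 64-bit implementations move to the new rotate_neon64.cc below. The resulting guard pattern, paraphrased from the two files:

#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
    !defined(__aarch64__)
// rotate_neon.cc: 32-bit ARM implementations using q registers.
#endif

#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
// rotate_neon64.cc: ARMv8 A64 implementations using v registers.
#endif
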
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/rotate_neon64.cc b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/rotate_neon64.cc
new file mode 100644
index 00000000000..92358af7ff6
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/rotate_neon64.cc
@@ -0,0 +1,543 @@
+/*
+ * Copyright 2014 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC NEON on ARMv8 64-bit (aarch64).
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+
+static uvec8 kVTbl4x4Transpose =
+ { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };
+
+void TransposeWx8_NEON(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride,
+ int width) {
+ const uint8* src_temp = NULL;
+ int64 width64 = (int64) width; // Work around clang 3.4 warning.
+ asm volatile (
+    // Loops are on blocks of 8. The loop will stop when the
+    // counter gets to or below 0. Starting the counter at w-8
+    // allows for this.
+ "sub %3, %3, #8 \n"
+
+ // handle 8x8 blocks. this should be the majority of the plane
+ "1: \n"
+ "mov %0, %1 \n"
+
+ MEMACCESS(0)
+ "ld1 {v0.8b}, [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v1.8b}, [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v2.8b}, [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v3.8b}, [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v4.8b}, [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v5.8b}, [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v6.8b}, [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v7.8b}, [%0] \n"
+
+ "trn2 v16.8b, v0.8b, v1.8b \n"
+ "trn1 v17.8b, v0.8b, v1.8b \n"
+ "trn2 v18.8b, v2.8b, v3.8b \n"
+ "trn1 v19.8b, v2.8b, v3.8b \n"
+ "trn2 v20.8b, v4.8b, v5.8b \n"
+ "trn1 v21.8b, v4.8b, v5.8b \n"
+ "trn2 v22.8b, v6.8b, v7.8b \n"
+ "trn1 v23.8b, v6.8b, v7.8b \n"
+
+ "trn2 v3.4h, v17.4h, v19.4h \n"
+ "trn1 v1.4h, v17.4h, v19.4h \n"
+ "trn2 v2.4h, v16.4h, v18.4h \n"
+ "trn1 v0.4h, v16.4h, v18.4h \n"
+ "trn2 v7.4h, v21.4h, v23.4h \n"
+ "trn1 v5.4h, v21.4h, v23.4h \n"
+ "trn2 v6.4h, v20.4h, v22.4h \n"
+ "trn1 v4.4h, v20.4h, v22.4h \n"
+
+ "trn2 v21.2s, v1.2s, v5.2s \n"
+ "trn1 v17.2s, v1.2s, v5.2s \n"
+ "trn2 v20.2s, v0.2s, v4.2s \n"
+ "trn1 v16.2s, v0.2s, v4.2s \n"
+ "trn2 v23.2s, v3.2s, v7.2s \n"
+ "trn1 v19.2s, v3.2s, v7.2s \n"
+ "trn2 v22.2s, v2.2s, v6.2s \n"
+ "trn1 v18.2s, v2.2s, v6.2s \n"
+
+ "mov %0, %2 \n"
+
+ MEMACCESS(0)
+ "st1 {v17.8b}, [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v16.8b}, [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v19.8b}, [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v18.8b}, [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v21.8b}, [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v20.8b}, [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v23.8b}, [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v22.8b}, [%0] \n"
+
+ "add %1, %1, #8 \n" // src += 8
+ "add %2, %2, %6, lsl #3 \n" // dst += 8 * dst_stride
+ "subs %3, %3, #8 \n" // w -= 8
+ "b.ge 1b \n"
+
+ // add 8 back to counter. if the result is 0 there are
+ // no residuals.
+ "adds %3, %3, #8 \n"
+ "b.eq 4f \n"
+
+ // some residual, so between 1 and 7 lines left to transpose
+ "cmp %3, #2 \n"
+ "b.lt 3f \n"
+
+ "cmp %3, #4 \n"
+ "b.lt 2f \n"
+
+ // 4x8 block
+ "mov %0, %1 \n"
+ MEMACCESS(0)
+ "ld1 {v0.s}[0], [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v0.s}[1], [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v0.s}[2], [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v0.s}[3], [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v1.s}[0], [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v1.s}[1], [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v1.s}[2], [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v1.s}[3], [%0] \n"
+
+ "mov %0, %2 \n"
+
+ MEMACCESS(4)
+ "ld1 {v2.16b}, [%4] \n"
+
+ "tbl v3.16b, {v0.16b}, v2.16b \n"
+ "tbl v0.16b, {v1.16b}, v2.16b \n"
+
+ // TODO(frkoenig): Rework shuffle above to
+ // write out with 4 instead of 8 writes.
+ MEMACCESS(0)
+ "st1 {v3.s}[0], [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v3.s}[1], [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v3.s}[2], [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v3.s}[3], [%0] \n"
+
+ "add %0, %2, #4 \n"
+ MEMACCESS(0)
+ "st1 {v0.s}[0], [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v0.s}[1], [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v0.s}[2], [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v0.s}[3], [%0] \n"
+
+ "add %1, %1, #4 \n" // src += 4
+ "add %2, %2, %6, lsl #2 \n" // dst += 4 * dst_stride
+ "subs %3, %3, #4 \n" // w -= 4
+ "b.eq 4f \n"
+
+ // some residual, check to see if it includes a 2x8 block,
+ // or less
+ "cmp %3, #2 \n"
+ "b.lt 3f \n"
+
+ // 2x8 block
+ "2: \n"
+ "mov %0, %1 \n"
+ MEMACCESS(0)
+ "ld1 {v0.h}[0], [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v1.h}[0], [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v0.h}[1], [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v1.h}[1], [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v0.h}[2], [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v1.h}[2], [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v0.h}[3], [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v1.h}[3], [%0] \n"
+
+ "trn2 v2.8b, v0.8b, v1.8b \n"
+ "trn1 v3.8b, v0.8b, v1.8b \n"
+
+ "mov %0, %2 \n"
+
+ MEMACCESS(0)
+ "st1 {v3.8b}, [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v2.8b}, [%0] \n"
+
+ "add %1, %1, #2 \n" // src += 2
+ "add %2, %2, %6, lsl #1 \n" // dst += 2 * dst_stride
+ "subs %3, %3, #2 \n" // w -= 2
+ "b.eq 4f \n"
+
+ // 1x8 block
+ "3: \n"
+ MEMACCESS(1)
+ "ld1 {v0.b}[0], [%1], %5 \n"
+ MEMACCESS(1)
+ "ld1 {v0.b}[1], [%1], %5 \n"
+ MEMACCESS(1)
+ "ld1 {v0.b}[2], [%1], %5 \n"
+ MEMACCESS(1)
+ "ld1 {v0.b}[3], [%1], %5 \n"
+ MEMACCESS(1)
+ "ld1 {v0.b}[4], [%1], %5 \n"
+ MEMACCESS(1)
+ "ld1 {v0.b}[5], [%1], %5 \n"
+ MEMACCESS(1)
+ "ld1 {v0.b}[6], [%1], %5 \n"
+ MEMACCESS(1)
+ "ld1 {v0.b}[7], [%1] \n"
+
+ MEMACCESS(2)
+ "st1 {v0.8b}, [%2] \n"
+
+ "4: \n"
+
+ : "+r"(src_temp), // %0
+ "+r"(src), // %1
+ "+r"(dst), // %2
+ "+r"(width64) // %3
+ : "r"(&kVTbl4x4Transpose), // %4
+ "r"(static_cast<ptrdiff_t>(src_stride)), // %5
+ "r"(static_cast<ptrdiff_t>(dst_stride)) // %6
+ : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
+ "v17", "v18", "v19", "v20", "v21", "v22", "v23"
+ );
+}
+
+static uint8 kVTbl4x4TransposeDi[32] =
+ { 0, 16, 32, 48, 2, 18, 34, 50, 4, 20, 36, 52, 6, 22, 38, 54,
+ 1, 17, 33, 49, 3, 19, 35, 51, 5, 21, 37, 53, 7, 23, 39, 55};
+
+void TransposeUVWx8_NEON(const uint8* src, int src_stride,
+ uint8* dst_a, int dst_stride_a,
+ uint8* dst_b, int dst_stride_b,
+ int width) {
+ const uint8* src_temp = NULL;
+ int64 width64 = (int64) width; // Work around clang 3.4 warning.
+ asm volatile (
+    // Loops are on blocks of 8. The loop will stop when the
+    // counter gets to or below 0. Starting the counter at w-8
+    // allows for this.
+ "sub %4, %4, #8 \n"
+
+ // handle 8x8 blocks. this should be the majority of the plane
+ "1: \n"
+ "mov %0, %1 \n"
+
+ MEMACCESS(0)
+ "ld1 {v0.16b}, [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v1.16b}, [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v2.16b}, [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v3.16b}, [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v4.16b}, [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v5.16b}, [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v6.16b}, [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v7.16b}, [%0] \n"
+
+ "trn1 v16.16b, v0.16b, v1.16b \n"
+ "trn2 v17.16b, v0.16b, v1.16b \n"
+ "trn1 v18.16b, v2.16b, v3.16b \n"
+ "trn2 v19.16b, v2.16b, v3.16b \n"
+ "trn1 v20.16b, v4.16b, v5.16b \n"
+ "trn2 v21.16b, v4.16b, v5.16b \n"
+ "trn1 v22.16b, v6.16b, v7.16b \n"
+ "trn2 v23.16b, v6.16b, v7.16b \n"
+
+ "trn1 v0.8h, v16.8h, v18.8h \n"
+ "trn2 v1.8h, v16.8h, v18.8h \n"
+ "trn1 v2.8h, v20.8h, v22.8h \n"
+ "trn2 v3.8h, v20.8h, v22.8h \n"
+ "trn1 v4.8h, v17.8h, v19.8h \n"
+ "trn2 v5.8h, v17.8h, v19.8h \n"
+ "trn1 v6.8h, v21.8h, v23.8h \n"
+ "trn2 v7.8h, v21.8h, v23.8h \n"
+
+ "trn1 v16.4s, v0.4s, v2.4s \n"
+ "trn2 v17.4s, v0.4s, v2.4s \n"
+ "trn1 v18.4s, v1.4s, v3.4s \n"
+ "trn2 v19.4s, v1.4s, v3.4s \n"
+ "trn1 v20.4s, v4.4s, v6.4s \n"
+ "trn2 v21.4s, v4.4s, v6.4s \n"
+ "trn1 v22.4s, v5.4s, v7.4s \n"
+ "trn2 v23.4s, v5.4s, v7.4s \n"
+
+ "mov %0, %2 \n"
+
+ MEMACCESS(0)
+ "st1 {v16.d}[0], [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v18.d}[0], [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v17.d}[0], [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v19.d}[0], [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v16.d}[1], [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v18.d}[1], [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v17.d}[1], [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v19.d}[1], [%0] \n"
+
+ "mov %0, %3 \n"
+
+ MEMACCESS(0)
+ "st1 {v20.d}[0], [%0], %7 \n"
+ MEMACCESS(0)
+ "st1 {v22.d}[0], [%0], %7 \n"
+ MEMACCESS(0)
+ "st1 {v21.d}[0], [%0], %7 \n"
+ MEMACCESS(0)
+ "st1 {v23.d}[0], [%0], %7 \n"
+ MEMACCESS(0)
+ "st1 {v20.d}[1], [%0], %7 \n"
+ MEMACCESS(0)
+ "st1 {v22.d}[1], [%0], %7 \n"
+ MEMACCESS(0)
+ "st1 {v21.d}[1], [%0], %7 \n"
+ MEMACCESS(0)
+ "st1 {v23.d}[1], [%0] \n"
+
+ "add %1, %1, #16 \n" // src += 8*2
+ "add %2, %2, %6, lsl #3 \n" // dst_a += 8 * dst_stride_a
+ "add %3, %3, %7, lsl #3 \n" // dst_b += 8 * dst_stride_b
+ "subs %4, %4, #8 \n" // w -= 8
+ "b.ge 1b \n"
+
+ // add 8 back to counter. if the result is 0 there are
+ // no residuals.
+ "adds %4, %4, #8 \n"
+ "b.eq 4f \n"
+
+ // some residual, so between 1 and 7 lines left to transpose
+ "cmp %4, #2 \n"
+ "b.lt 3f \n"
+
+ "cmp %4, #4 \n"
+ "b.lt 2f \n"
+
+ // TODO(frkoenig): Clean this up
+ // 4x8 block
+ "mov %0, %1 \n"
+ MEMACCESS(0)
+ "ld1 {v0.8b}, [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v1.8b}, [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v2.8b}, [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v3.8b}, [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v4.8b}, [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v5.8b}, [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v6.8b}, [%0], %5 \n"
+ MEMACCESS(0)
+ "ld1 {v7.8b}, [%0] \n"
+
+ MEMACCESS(8)
+ "ld1 {v30.16b}, [%8], #16 \n"
+ "ld1 {v31.16b}, [%8] \n"
+
+ "tbl v16.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b \n"
+ "tbl v17.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v31.16b \n"
+ "tbl v18.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v30.16b \n"
+ "tbl v19.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v31.16b \n"
+
+ "mov %0, %2 \n"
+
+ MEMACCESS(0)
+ "st1 {v16.s}[0], [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v16.s}[1], [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v16.s}[2], [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v16.s}[3], [%0], %6 \n"
+
+ "add %0, %2, #4 \n"
+ MEMACCESS(0)
+ "st1 {v18.s}[0], [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v18.s}[1], [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v18.s}[2], [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v18.s}[3], [%0] \n"
+
+ "mov %0, %3 \n"
+
+ MEMACCESS(0)
+ "st1 {v17.s}[0], [%0], %7 \n"
+ MEMACCESS(0)
+ "st1 {v17.s}[1], [%0], %7 \n"
+ MEMACCESS(0)
+ "st1 {v17.s}[2], [%0], %7 \n"
+ MEMACCESS(0)
+ "st1 {v17.s}[3], [%0], %7 \n"
+
+ "add %0, %3, #4 \n"
+ MEMACCESS(0)
+ "st1 {v19.s}[0], [%0], %7 \n"
+ MEMACCESS(0)
+ "st1 {v19.s}[1], [%0], %7 \n"
+ MEMACCESS(0)
+ "st1 {v19.s}[2], [%0], %7 \n"
+ MEMACCESS(0)
+ "st1 {v19.s}[3], [%0] \n"
+
+ "add %1, %1, #8 \n" // src += 4 * 2
+ "add %2, %2, %6, lsl #2 \n" // dst_a += 4 * dst_stride_a
+ "add %3, %3, %7, lsl #2 \n" // dst_b += 4 * dst_stride_b
+ "subs %4, %4, #4 \n" // w -= 4
+ "b.eq 4f \n"
+
+ // some residual, check to see if it includes a 2x8 block,
+ // or less
+ "cmp %4, #2 \n"
+ "b.lt 3f \n"
+
+ // 2x8 block
+ "2: \n"
+ "mov %0, %1 \n"
+ MEMACCESS(0)
+ "ld2 {v0.h, v1.h}[0], [%0], %5 \n"
+ MEMACCESS(0)
+ "ld2 {v2.h, v3.h}[0], [%0], %5 \n"
+ MEMACCESS(0)
+ "ld2 {v0.h, v1.h}[1], [%0], %5 \n"
+ MEMACCESS(0)
+ "ld2 {v2.h, v3.h}[1], [%0], %5 \n"
+ MEMACCESS(0)
+ "ld2 {v0.h, v1.h}[2], [%0], %5 \n"
+ MEMACCESS(0)
+ "ld2 {v2.h, v3.h}[2], [%0], %5 \n"
+ MEMACCESS(0)
+ "ld2 {v0.h, v1.h}[3], [%0], %5 \n"
+ MEMACCESS(0)
+ "ld2 {v2.h, v3.h}[3], [%0] \n"
+
+ "trn1 v4.8b, v0.8b, v2.8b \n"
+ "trn2 v5.8b, v0.8b, v2.8b \n"
+ "trn1 v6.8b, v1.8b, v3.8b \n"
+ "trn2 v7.8b, v1.8b, v3.8b \n"
+
+ "mov %0, %2 \n"
+
+ MEMACCESS(0)
+ "st1 {v4.d}[0], [%0], %6 \n"
+ MEMACCESS(0)
+ "st1 {v6.d}[0], [%0] \n"
+
+ "mov %0, %3 \n"
+
+ MEMACCESS(0)
+ "st1 {v5.d}[0], [%0], %7 \n"
+ MEMACCESS(0)
+ "st1 {v7.d}[0], [%0] \n"
+
+ "add %1, %1, #4 \n" // src += 2 * 2
+ "add %2, %2, %6, lsl #1 \n" // dst_a += 2 * dst_stride_a
+ "add %3, %3, %7, lsl #1 \n" // dst_b += 2 * dst_stride_b
+ "subs %4, %4, #2 \n" // w -= 2
+ "b.eq 4f \n"
+
+ // 1x8 block
+ "3: \n"
+ MEMACCESS(1)
+ "ld2 {v0.b, v1.b}[0], [%1], %5 \n"
+ MEMACCESS(1)
+ "ld2 {v0.b, v1.b}[1], [%1], %5 \n"
+ MEMACCESS(1)
+ "ld2 {v0.b, v1.b}[2], [%1], %5 \n"
+ MEMACCESS(1)
+ "ld2 {v0.b, v1.b}[3], [%1], %5 \n"
+ MEMACCESS(1)
+ "ld2 {v0.b, v1.b}[4], [%1], %5 \n"
+ MEMACCESS(1)
+ "ld2 {v0.b, v1.b}[5], [%1], %5 \n"
+ MEMACCESS(1)
+ "ld2 {v0.b, v1.b}[6], [%1], %5 \n"
+ MEMACCESS(1)
+ "ld2 {v0.b, v1.b}[7], [%1] \n"
+
+ MEMACCESS(2)
+ "st1 {v0.d}[0], [%2] \n"
+ MEMACCESS(3)
+ "st1 {v1.d}[0], [%3] \n"
+
+ "4: \n"
+
+ : "+r"(src_temp), // %0
+ "+r"(src), // %1
+ "+r"(dst_a), // %2
+ "+r"(dst_b), // %3
+ "+r"(width64) // %4
+ : "r"(static_cast<ptrdiff_t>(src_stride)), // %5
+ "r"(static_cast<ptrdiff_t>(dst_stride_a)), // %6
+ "r"(static_cast<ptrdiff_t>(dst_stride_b)), // %7
+ "r"(&kVTbl4x4TransposeDi) // %8
+ : "memory", "cc",
+ "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+ "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
+ "v30", "v31"
+ );
+}
+#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/row_any.cc b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/row_any.cc
index ce8b3dad119..19340b3b74c 100644
--- a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/row_any.cc
+++ b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/row_any.cc
@@ -17,17 +17,14 @@ namespace libyuv {
extern "C" {
#endif
-// TODO(fbarchard): Consider 'any' functions handling any quantity of pixels.
-// TODO(fbarchard): Consider 'any' functions handling odd alignment.
// YUV to RGB does multiple of 8 with SIMD and remainder with C.
#define YANY(NAMEANY, I420TORGB_SIMD, I420TORGB_C, UV_SHIFT, BPP, MASK) \
- void NAMEANY(const uint8* y_buf, \
- const uint8* u_buf, \
- const uint8* v_buf, \
- uint8* rgb_buf, \
- int width) { \
+ void NAMEANY(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, \
+ uint8* rgb_buf, int width) { \
int n = width & ~MASK; \
- I420TORGB_SIMD(y_buf, u_buf, v_buf, rgb_buf, n); \
+ if (n > 0) { \
+ I420TORGB_SIMD(y_buf, u_buf, v_buf, rgb_buf, n); \
+ } \
I420TORGB_C(y_buf + n, \
u_buf + (n >> UV_SHIFT), \
v_buf + (n >> UV_SHIFT), \
@@ -35,36 +32,59 @@ extern "C" {
}
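
// Editor's illustration, not part of the patch: with the SSSE3 arguments
// used just below, YANY expands to a wrapper equivalent to the following
// sketch (the _Sketch name is hypothetical; the row functions are the real
// ones declared in row.h):
void I422ToARGBRow_Any_SSSE3_Sketch(const uint8* y_buf,
                                    const uint8* u_buf,
                                    const uint8* v_buf,
                                    uint8* rgb_buf, int width) {
  int n = width & ~7;  // largest multiple of 8 <= width
  if (n > 0) {         // SIMD only when there is at least one full group
    I422ToARGBRow_SSSE3(y_buf, u_buf, v_buf, rgb_buf, n);
  }
  // Remainder (<= 7 pixels) in C; UV advances at half rate (UV_SHIFT == 1)
  // and ARGB output advances 4 bytes per pixel (BPP == 4).
  I422ToARGBRow_C(y_buf + n, u_buf + (n >> 1), v_buf + (n >> 1),
                  rgb_buf + n * 4, width & 7);
}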
#ifdef HAS_I422TOARGBROW_SSSE3
-YANY(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_Unaligned_SSSE3, I422ToARGBRow_C,
+YANY(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_SSSE3, I422ToARGBRow_C,
1, 4, 7)
-#endif // HAS_I422TOARGBROW_SSSE3
+#endif
#ifdef HAS_I444TOARGBROW_SSSE3
-YANY(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_Unaligned_SSSE3, I444ToARGBRow_C,
+YANY(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_SSSE3, I444ToARGBRow_C,
0, 4, 7)
-YANY(I411ToARGBRow_Any_SSSE3, I411ToARGBRow_Unaligned_SSSE3, I411ToARGBRow_C,
+YANY(I411ToARGBRow_Any_SSSE3, I411ToARGBRow_SSSE3, I411ToARGBRow_C,
2, 4, 7)
-YANY(I422ToBGRARow_Any_SSSE3, I422ToBGRARow_Unaligned_SSSE3, I422ToBGRARow_C,
+YANY(I422ToBGRARow_Any_SSSE3, I422ToBGRARow_SSSE3, I422ToBGRARow_C,
1, 4, 7)
-YANY(I422ToABGRRow_Any_SSSE3, I422ToABGRRow_Unaligned_SSSE3, I422ToABGRRow_C,
+YANY(I422ToABGRRow_Any_SSSE3, I422ToABGRRow_SSSE3, I422ToABGRRow_C,
1, 4, 7)
-YANY(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_Unaligned_SSSE3, I422ToRGBARow_C,
+YANY(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_SSSE3, I422ToRGBARow_C,
1, 4, 7)
-// I422ToRGB565Row_SSSE3 is unaligned.
YANY(I422ToARGB4444Row_Any_SSSE3, I422ToARGB4444Row_SSSE3, I422ToARGB4444Row_C,
1, 2, 7)
YANY(I422ToARGB1555Row_Any_SSSE3, I422ToARGB1555Row_SSSE3, I422ToARGB1555Row_C,
1, 2, 7)
YANY(I422ToRGB565Row_Any_SSSE3, I422ToRGB565Row_SSSE3, I422ToRGB565Row_C,
1, 2, 7)
-// I422ToRGB24Row_SSSE3 is unaligned.
YANY(I422ToRGB24Row_Any_SSSE3, I422ToRGB24Row_SSSE3, I422ToRGB24Row_C, 1, 3, 7)
YANY(I422ToRAWRow_Any_SSSE3, I422ToRAWRow_SSSE3, I422ToRAWRow_C, 1, 3, 7)
YANY(I422ToYUY2Row_Any_SSE2, I422ToYUY2Row_SSE2, I422ToYUY2Row_C, 1, 2, 15)
YANY(I422ToUYVYRow_Any_SSE2, I422ToUYVYRow_SSE2, I422ToUYVYRow_C, 1, 2, 15)
#endif // HAS_I444TOARGBROW_SSSE3
+#ifdef HAS_J422TOARGBROW_SSSE3
+YANY(J422ToARGBRow_Any_SSSE3, J422ToARGBRow_SSSE3, J422ToARGBRow_C,
+ 1, 4, 7)
+#endif
#ifdef HAS_I422TOARGBROW_AVX2
YANY(I422ToARGBRow_Any_AVX2, I422ToARGBRow_AVX2, I422ToARGBRow_C, 1, 4, 15)
-#endif // HAS_I422TOARGBROW_AVX2
+#endif
+#ifdef HAS_I422TOBGRAROW_AVX2
+YANY(I422ToBGRARow_Any_AVX2, I422ToBGRARow_AVX2, I422ToBGRARow_C, 1, 4, 15)
+#endif
+#ifdef HAS_I422TORGBAROW_AVX2
+YANY(I422ToRGBARow_Any_AVX2, I422ToRGBARow_AVX2, I422ToRGBARow_C, 1, 4, 15)
+#endif
+#ifdef HAS_I422TOABGRROW_AVX2
+YANY(I422ToABGRRow_Any_AVX2, I422ToABGRRow_AVX2, I422ToABGRRow_C, 1, 4, 15)
+#endif
+#ifdef HAS_I422TOARGB4444ROW_AVX2
+YANY(I422ToARGB4444Row_Any_AVX2, I422ToARGB4444Row_AVX2, I422ToARGB4444Row_C,
+ 1, 2, 7)
+#endif
+#ifdef HAS_I422TOARGB1555ROW_AVX2
+YANY(I422ToARGB1555Row_Any_AVX2, I422ToARGB1555Row_AVX2, I422ToARGB1555Row_C,
+ 1, 2, 7)
+#endif
+#ifdef HAS_I422TORGB565ROW_AVX2
+YANY(I422ToRGB565Row_Any_AVX2, I422ToRGB565Row_AVX2, I422ToRGB565Row_C,
+ 1, 2, 7)
+#endif
#ifdef HAS_I422TOARGBROW_NEON
YANY(I444ToARGBRow_Any_NEON, I444ToARGBRow_NEON, I444ToARGBRow_C, 0, 4, 7)
YANY(I422ToARGBRow_Any_NEON, I422ToARGBRow_NEON, I422ToARGBRow_C, 1, 4, 7)
@@ -79,214 +99,240 @@ YANY(I422ToARGB4444Row_Any_NEON, I422ToARGB4444Row_NEON, I422ToARGB4444Row_C,
YANY(I422ToARGB1555Row_Any_NEON, I422ToARGB1555Row_NEON, I422ToARGB1555Row_C,
1, 2, 7)
YANY(I422ToRGB565Row_Any_NEON, I422ToRGB565Row_NEON, I422ToRGB565Row_C, 1, 2, 7)
-#endif // HAS_I422TOARGBROW_NEON
+#endif
#ifdef HAS_I422TOYUY2ROW_NEON
YANY(I422ToYUY2Row_Any_NEON, I422ToYUY2Row_NEON, I422ToYUY2Row_C, 1, 2, 15)
-#endif // HAS_I422TOYUY2ROW_NEON
+#endif
#ifdef HAS_I422TOUYVYROW_NEON
YANY(I422ToUYVYRow_Any_NEON, I422ToUYVYRow_NEON, I422ToUYVYRow_C, 1, 2, 15)
-#endif // HAS_I422TOUYVYROW_NEON
+#endif
#undef YANY
// Wrappers to handle odd width
-#define NV2NY(NAMEANY, NV12TORGB_SIMD, NV12TORGB_C, UV_SHIFT, BPP) \
- void NAMEANY(const uint8* y_buf, \
- const uint8* uv_buf, \
- uint8* rgb_buf, \
- int width) { \
- int n = width & ~7; \
- NV12TORGB_SIMD(y_buf, uv_buf, rgb_buf, n); \
+#define NV2NY(NAMEANY, NV12TORGB_SIMD, NV12TORGB_C, UV_SHIFT, BPP, MASK) \
+ void NAMEANY(const uint8* y_buf, const uint8* uv_buf, \
+ uint8* rgb_buf, int width) { \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ NV12TORGB_SIMD(y_buf, uv_buf, rgb_buf, n); \
+ } \
NV12TORGB_C(y_buf + n, \
uv_buf + (n >> UV_SHIFT), \
- rgb_buf + n * BPP, width & 7); \
+ rgb_buf + n * BPP, width & MASK); \
}
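// Editor's note: the n > 0 guard means a width below MASK + 1 skips the
// SIMD call entirely. e.g. width == 29 with MASK == 7 gives
// n = 29 & ~7 = 24 pixels on the SIMD path and width & 7 = 5 pixels in C.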
#ifdef HAS_NV12TOARGBROW_SSSE3
-NV2NY(NV12ToARGBRow_Any_SSSE3, NV12ToARGBRow_Unaligned_SSSE3, NV12ToARGBRow_C,
- 0, 4)
-NV2NY(NV21ToARGBRow_Any_SSSE3, NV21ToARGBRow_Unaligned_SSSE3, NV21ToARGBRow_C,
- 0, 4)
-#endif // HAS_NV12TOARGBROW_SSSE3
+NV2NY(NV12ToARGBRow_Any_SSSE3, NV12ToARGBRow_SSSE3, NV12ToARGBRow_C, 0, 4, 7)
+NV2NY(NV21ToARGBRow_Any_SSSE3, NV21ToARGBRow_SSSE3, NV21ToARGBRow_C, 0, 4, 7)
+#endif
+#ifdef HAS_NV12TOARGBROW_AVX2
+NV2NY(NV12ToARGBRow_Any_AVX2, NV12ToARGBRow_AVX2, NV12ToARGBRow_C, 0, 4, 15)
+NV2NY(NV21ToARGBRow_Any_AVX2, NV21ToARGBRow_AVX2, NV21ToARGBRow_C, 0, 4, 15)
+#endif
#ifdef HAS_NV12TOARGBROW_NEON
-NV2NY(NV12ToARGBRow_Any_NEON, NV12ToARGBRow_NEON, NV12ToARGBRow_C, 0, 4)
-NV2NY(NV21ToARGBRow_Any_NEON, NV21ToARGBRow_NEON, NV21ToARGBRow_C, 0, 4)
-#endif // HAS_NV12TOARGBROW_NEON
+NV2NY(NV12ToARGBRow_Any_NEON, NV12ToARGBRow_NEON, NV12ToARGBRow_C, 0, 4, 7)
+NV2NY(NV21ToARGBRow_Any_NEON, NV21ToARGBRow_NEON, NV21ToARGBRow_C, 0, 4, 7)
+#endif
#ifdef HAS_NV12TORGB565ROW_SSSE3
NV2NY(NV12ToRGB565Row_Any_SSSE3, NV12ToRGB565Row_SSSE3, NV12ToRGB565Row_C,
- 0, 2)
+ 0, 2, 7)
NV2NY(NV21ToRGB565Row_Any_SSSE3, NV21ToRGB565Row_SSSE3, NV21ToRGB565Row_C,
- 0, 2)
-#endif // HAS_NV12TORGB565ROW_SSSE3
+ 0, 2, 7)
+#endif
+#ifdef HAS_NV12TORGB565ROW_AVX2
+NV2NY(NV12ToRGB565Row_Any_AVX2, NV12ToRGB565Row_AVX2, NV12ToRGB565Row_C,
+ 0, 2, 15)
+NV2NY(NV21ToRGB565Row_Any_AVX2, NV21ToRGB565Row_AVX2, NV21ToRGB565Row_C,
+ 0, 2, 15)
+#endif
#ifdef HAS_NV12TORGB565ROW_NEON
-NV2NY(NV12ToRGB565Row_Any_NEON, NV12ToRGB565Row_NEON, NV12ToRGB565Row_C, 0, 2)
-NV2NY(NV21ToRGB565Row_Any_NEON, NV21ToRGB565Row_NEON, NV21ToRGB565Row_C, 0, 2)
-#endif // HAS_NV12TORGB565ROW_NEON
+NV2NY(NV12ToRGB565Row_Any_NEON, NV12ToRGB565Row_NEON, NV12ToRGB565Row_C,
+ 0, 2, 7)
+NV2NY(NV21ToRGB565Row_Any_NEON, NV21ToRGB565Row_NEON, NV21ToRGB565Row_C,
+ 0, 2, 7)
+#endif
#undef NV2NY
-#define RGBANY(NAMEANY, ARGBTORGB_SIMD, ARGBTORGB_C, MASK, SBPP, BPP) \
- void NAMEANY(const uint8* src, \
- uint8* dst, \
- int width) { \
+#define RGBANY(NAMEANY, ARGBTORGB_SIMD, ARGBTORGB_C, SBPP, BPP, MASK) \
+ void NAMEANY(const uint8* src, uint8* dst, int width) { \
int n = width & ~MASK; \
- ARGBTORGB_SIMD(src, dst, n); \
+ if (n > 0) { \
+ ARGBTORGB_SIMD(src, dst, n); \
+ } \
ARGBTORGB_C(src + n * SBPP, dst + n * BPP, width & MASK); \
}
#if defined(HAS_ARGBTORGB24ROW_SSSE3)
RGBANY(ARGBToRGB24Row_Any_SSSE3, ARGBToRGB24Row_SSSE3, ARGBToRGB24Row_C,
- 15, 4, 3)
+ 4, 3, 15)
RGBANY(ARGBToRAWRow_Any_SSSE3, ARGBToRAWRow_SSSE3, ARGBToRAWRow_C,
- 15, 4, 3)
+ 4, 3, 15)
RGBANY(ARGBToRGB565Row_Any_SSE2, ARGBToRGB565Row_SSE2, ARGBToRGB565Row_C,
- 3, 4, 2)
+ 4, 2, 3)
RGBANY(ARGBToARGB1555Row_Any_SSE2, ARGBToARGB1555Row_SSE2, ARGBToARGB1555Row_C,
- 3, 4, 2)
+ 4, 2, 3)
RGBANY(ARGBToARGB4444Row_Any_SSE2, ARGBToARGB4444Row_SSE2, ARGBToARGB4444Row_C,
- 3, 4, 2)
+ 4, 2, 3)
+#endif
+#if defined(HAS_ARGBTOARGB4444ROW_AVX2)
+RGBANY(ARGBToRGB565Row_Any_AVX2, ARGBToRGB565Row_AVX2, ARGBToRGB565Row_C,
+ 4, 2, 7)
+RGBANY(ARGBToARGB1555Row_Any_AVX2, ARGBToARGB1555Row_AVX2, ARGBToARGB1555Row_C,
+ 4, 2, 7)
+RGBANY(ARGBToARGB4444Row_Any_AVX2, ARGBToARGB4444Row_AVX2, ARGBToARGB4444Row_C,
+ 4, 2, 7)
#endif
+
#if defined(HAS_I400TOARGBROW_SSE2)
-RGBANY(I400ToARGBRow_Any_SSE2, I400ToARGBRow_Unaligned_SSE2, I400ToARGBRow_C,
- 7, 1, 4)
+RGBANY(I400ToARGBRow_Any_SSE2, I400ToARGBRow_SSE2, I400ToARGBRow_C, 1, 4, 7)
#endif
#if defined(HAS_YTOARGBROW_SSE2)
-RGBANY(YToARGBRow_Any_SSE2, YToARGBRow_SSE2, YToARGBRow_C,
- 7, 1, 4)
-RGBANY(YUY2ToARGBRow_Any_SSSE3, YUY2ToARGBRow_Unaligned_SSSE3, YUY2ToARGBRow_C,
- 15, 2, 4)
-RGBANY(UYVYToARGBRow_Any_SSSE3, UYVYToARGBRow_Unaligned_SSSE3, UYVYToARGBRow_C,
- 15, 2, 4)
-// These require alignment on ARGB, so C is used for remainder.
+RGBANY(YToARGBRow_Any_SSE2, YToARGBRow_SSE2, YToARGBRow_C, 1, 4, 7)
+#endif
+#if defined(HAS_YTOARGBROW_AVX2)
+RGBANY(YToARGBRow_Any_AVX2, YToARGBRow_AVX2, YToARGBRow_C, 1, 4, 15)
+#endif
+#if defined(HAS_YUY2TOARGBROW_SSSE3)
+RGBANY(YUY2ToARGBRow_Any_SSSE3, YUY2ToARGBRow_SSSE3, YUY2ToARGBRow_C, 2, 4, 15)
+RGBANY(UYVYToARGBRow_Any_SSSE3, UYVYToARGBRow_SSSE3, UYVYToARGBRow_C, 2, 4, 15)
RGBANY(RGB24ToARGBRow_Any_SSSE3, RGB24ToARGBRow_SSSE3, RGB24ToARGBRow_C,
- 15, 3, 4)
-RGBANY(RAWToARGBRow_Any_SSSE3, RAWToARGBRow_SSSE3, RAWToARGBRow_C,
- 15, 3, 4)
+ 3, 4, 15)
+RGBANY(RAWToARGBRow_Any_SSSE3, RAWToARGBRow_SSSE3, RAWToARGBRow_C, 3, 4, 15)
RGBANY(RGB565ToARGBRow_Any_SSE2, RGB565ToARGBRow_SSE2, RGB565ToARGBRow_C,
- 7, 2, 4)
+ 2, 4, 7)
RGBANY(ARGB1555ToARGBRow_Any_SSE2, ARGB1555ToARGBRow_SSE2, ARGB1555ToARGBRow_C,
- 7, 2, 4)
+ 2, 4, 7)
RGBANY(ARGB4444ToARGBRow_Any_SSE2, ARGB4444ToARGBRow_SSE2, ARGB4444ToARGBRow_C,
- 7, 2, 4)
+ 2, 4, 7)
+#endif
+#if defined(HAS_YUY2TOARGBROW_AVX2)
+RGBANY(YUY2ToARGBRow_Any_AVX2, YUY2ToARGBRow_AVX2, YUY2ToARGBRow_C, 2, 4, 31)
+RGBANY(UYVYToARGBRow_Any_AVX2, UYVYToARGBRow_AVX2, UYVYToARGBRow_C, 2, 4, 31)
#endif
#if defined(HAS_ARGBTORGB24ROW_NEON)
-RGBANY(ARGBToRGB24Row_Any_NEON, ARGBToRGB24Row_NEON, ARGBToRGB24Row_C, 7, 4, 3)
-RGBANY(ARGBToRAWRow_Any_NEON, ARGBToRAWRow_NEON, ARGBToRAWRow_C, 7, 4, 3)
+RGBANY(ARGBToRGB24Row_Any_NEON, ARGBToRGB24Row_NEON, ARGBToRGB24Row_C, 4, 3, 7)
+RGBANY(ARGBToRAWRow_Any_NEON, ARGBToRAWRow_NEON, ARGBToRAWRow_C, 4, 3, 7)
RGBANY(ARGBToRGB565Row_Any_NEON, ARGBToRGB565Row_NEON, ARGBToRGB565Row_C,
- 7, 4, 2)
+ 4, 2, 7)
RGBANY(ARGBToARGB1555Row_Any_NEON, ARGBToARGB1555Row_NEON, ARGBToARGB1555Row_C,
- 7, 4, 2)
+ 4, 2, 7)
RGBANY(ARGBToARGB4444Row_Any_NEON, ARGBToARGB4444Row_NEON, ARGBToARGB4444Row_C,
- 7, 4, 2)
-RGBANY(I400ToARGBRow_Any_NEON, I400ToARGBRow_NEON, I400ToARGBRow_C,
- 7, 1, 4)
-RGBANY(YToARGBRow_Any_NEON, YToARGBRow_NEON, YToARGBRow_C,
- 7, 1, 4)
-RGBANY(YUY2ToARGBRow_Any_NEON, YUY2ToARGBRow_NEON, YUY2ToARGBRow_C,
- 7, 2, 4)
-RGBANY(UYVYToARGBRow_Any_NEON, UYVYToARGBRow_NEON, UYVYToARGBRow_C,
- 7, 2, 4)
+ 4, 2, 7)
+RGBANY(I400ToARGBRow_Any_NEON, I400ToARGBRow_NEON, I400ToARGBRow_C, 1, 4, 7)
+RGBANY(YToARGBRow_Any_NEON, YToARGBRow_NEON, YToARGBRow_C, 1, 4, 7)
+RGBANY(YUY2ToARGBRow_Any_NEON, YUY2ToARGBRow_NEON, YUY2ToARGBRow_C, 2, 4, 7)
+RGBANY(UYVYToARGBRow_Any_NEON, UYVYToARGBRow_NEON, UYVYToARGBRow_C, 2, 4, 7)
#endif
#undef RGBANY
// ARGB to Bayer does multiple of 4 pixels, SSSE3 aligned src, unaligned dst.
-#define BAYERANY(NAMEANY, ARGBTORGB_SIMD, ARGBTORGB_C, MASK, SBPP, BPP) \
- void NAMEANY(const uint8* src, \
- uint8* dst, uint32 selector, \
- int width) { \
+#define BAYERANY(NAMEANY, ARGBTORGB_SIMD, ARGBTORGB_C, SBPP, BPP, MASK) \
+ void NAMEANY(const uint8* src, uint8* dst, uint32 selector, int width) { \
int n = width & ~MASK; \
- ARGBTORGB_SIMD(src, dst, selector, n); \
+ if (n > 0) { \
+ ARGBTORGB_SIMD(src, dst, selector, n); \
+ } \
ARGBTORGB_C(src + n * SBPP, dst + n * BPP, selector, width & MASK); \
}
-#if defined(HAS_ARGBTOBAYERROW_SSSE3)
-BAYERANY(ARGBToBayerRow_Any_SSSE3, ARGBToBayerRow_SSSE3, ARGBToBayerRow_C,
- 7, 4, 1)
-#endif
-#if defined(HAS_ARGBTOBAYERROW_NEON)
-BAYERANY(ARGBToBayerRow_Any_NEON, ARGBToBayerRow_NEON, ARGBToBayerRow_C,
- 7, 4, 1)
-#endif
#if defined(HAS_ARGBTOBAYERGGROW_SSE2)
BAYERANY(ARGBToBayerGGRow_Any_SSE2, ARGBToBayerGGRow_SSE2, ARGBToBayerGGRow_C,
- 7, 4, 1)
+ 4, 1, 7)
#endif
#if defined(HAS_ARGBTOBAYERGGROW_NEON)
BAYERANY(ARGBToBayerGGRow_Any_NEON, ARGBToBayerGGRow_NEON, ARGBToBayerGGRow_C,
- 7, 4, 1)
+ 4, 1, 7)
#endif
#undef BAYERANY
-// RGB/YUV to Y does multiple of 16 with SIMD and last 16 with SIMD.
-#define YANY(NAMEANY, ARGBTOY_SIMD, SBPP, BPP, NUM) \
+#define YANY(NAMEANY, ARGBTOY_SIMD, ARGBTOY_C, SBPP, BPP, MASK) \
void NAMEANY(const uint8* src_argb, uint8* dst_y, int width) { \
- ARGBTOY_SIMD(src_argb, dst_y, width - NUM); \
- ARGBTOY_SIMD(src_argb + (width - NUM) * SBPP, \
- dst_y + (width - NUM) * BPP, NUM); \
+ int n = width & ~MASK; \
+ if (n > 0) { \
+ ARGBTOY_SIMD(src_argb, dst_y, n); \
+ } \
+ ARGBTOY_C(src_argb + n * SBPP, \
+ dst_y + n * BPP, width & MASK); \
}
-
#ifdef HAS_ARGBTOYROW_AVX2
-YANY(ARGBToYRow_Any_AVX2, ARGBToYRow_AVX2, 4, 1, 32)
-YANY(ARGBToYJRow_Any_AVX2, ARGBToYJRow_AVX2, 4, 1, 32)
-YANY(YUY2ToYRow_Any_AVX2, YUY2ToYRow_AVX2, 2, 1, 32)
-YANY(UYVYToYRow_Any_AVX2, UYVYToYRow_AVX2, 2, 1, 32)
+YANY(ARGBToYRow_Any_AVX2, ARGBToYRow_AVX2, ARGBToYRow_C, 4, 1, 31)
+#endif
+#ifdef HAS_ARGBTOYJROW_AVX2
+YANY(ARGBToYJRow_Any_AVX2, ARGBToYJRow_AVX2, ARGBToYJRow_C, 4, 1, 31)
+#endif
+#ifdef HAS_UYVYTOYROW_AVX2
+YANY(UYVYToYRow_Any_AVX2, UYVYToYRow_AVX2, UYVYToYRow_C, 2, 1, 31)
+#endif
+#ifdef HAS_YUY2TOYROW_AVX2
+YANY(YUY2ToYRow_Any_AVX2, YUY2ToYRow_AVX2, YUY2ToYRow_C, 2, 1, 31)
#endif
#ifdef HAS_ARGBTOYROW_SSSE3
-YANY(ARGBToYRow_Any_SSSE3, ARGBToYRow_Unaligned_SSSE3, 4, 1, 16)
+YANY(ARGBToYRow_Any_SSSE3, ARGBToYRow_SSSE3, ARGBToYRow_C, 4, 1, 15)
#endif
#ifdef HAS_BGRATOYROW_SSSE3
-YANY(BGRAToYRow_Any_SSSE3, BGRAToYRow_Unaligned_SSSE3, 4, 1, 16)
-YANY(ABGRToYRow_Any_SSSE3, ABGRToYRow_Unaligned_SSSE3, 4, 1, 16)
-YANY(RGBAToYRow_Any_SSSE3, RGBAToYRow_Unaligned_SSSE3, 4, 1, 16)
-YANY(YUY2ToYRow_Any_SSE2, YUY2ToYRow_Unaligned_SSE2, 2, 1, 16)
-YANY(UYVYToYRow_Any_SSE2, UYVYToYRow_Unaligned_SSE2, 2, 1, 16)
+YANY(BGRAToYRow_Any_SSSE3, BGRAToYRow_SSSE3, BGRAToYRow_C, 4, 1, 15)
+YANY(ABGRToYRow_Any_SSSE3, ABGRToYRow_SSSE3, ABGRToYRow_C, 4, 1, 15)
+YANY(RGBAToYRow_Any_SSSE3, RGBAToYRow_SSSE3, RGBAToYRow_C, 4, 1, 15)
+YANY(YUY2ToYRow_Any_SSE2, YUY2ToYRow_SSE2, YUY2ToYRow_C, 2, 1, 15)
+YANY(UYVYToYRow_Any_SSE2, UYVYToYRow_SSE2, UYVYToYRow_C, 2, 1, 15)
#endif
#ifdef HAS_ARGBTOYJROW_SSSE3
-YANY(ARGBToYJRow_Any_SSSE3, ARGBToYJRow_Unaligned_SSSE3, 4, 1, 16)
+YANY(ARGBToYJRow_Any_SSSE3, ARGBToYJRow_SSSE3, ARGBToYJRow_C, 4, 1, 15)
#endif
#ifdef HAS_ARGBTOYROW_NEON
-YANY(ARGBToYRow_Any_NEON, ARGBToYRow_NEON, 4, 1, 8)
-YANY(ARGBToYJRow_Any_NEON, ARGBToYJRow_NEON, 4, 1, 8)
-YANY(BGRAToYRow_Any_NEON, BGRAToYRow_NEON, 4, 1, 8)
-YANY(ABGRToYRow_Any_NEON, ABGRToYRow_NEON, 4, 1, 8)
-YANY(RGBAToYRow_Any_NEON, RGBAToYRow_NEON, 4, 1, 8)
-YANY(RGB24ToYRow_Any_NEON, RGB24ToYRow_NEON, 3, 1, 8)
-YANY(RAWToYRow_Any_NEON, RAWToYRow_NEON, 3, 1, 8)
-YANY(RGB565ToYRow_Any_NEON, RGB565ToYRow_NEON, 2, 1, 8)
-YANY(ARGB1555ToYRow_Any_NEON, ARGB1555ToYRow_NEON, 2, 1, 8)
-YANY(ARGB4444ToYRow_Any_NEON, ARGB4444ToYRow_NEON, 2, 1, 8)
+YANY(ARGBToYRow_Any_NEON, ARGBToYRow_NEON, ARGBToYRow_C, 4, 1, 7)
+#endif
+#ifdef HAS_ARGBTOYJROW_NEON
+YANY(ARGBToYJRow_Any_NEON, ARGBToYJRow_NEON, ARGBToYJRow_C, 4, 1, 7)
+#endif
+#ifdef HAS_BGRATOYROW_NEON
+YANY(BGRAToYRow_Any_NEON, BGRAToYRow_NEON, BGRAToYRow_C, 4, 1, 7)
+#endif
+#ifdef HAS_ABGRTOYROW_NEON
+YANY(ABGRToYRow_Any_NEON, ABGRToYRow_NEON, ABGRToYRow_C, 4, 1, 7)
+#endif
+#ifdef HAS_RGBATOYROW_NEON
+YANY(RGBAToYRow_Any_NEON, RGBAToYRow_NEON, RGBAToYRow_C, 4, 1, 7)
+#endif
+#ifdef HAS_RGB24TOYROW_NEON
+YANY(RGB24ToYRow_Any_NEON, RGB24ToYRow_NEON, RGB24ToYRow_C, 3, 1, 7)
+#endif
+#ifdef HAS_RAWTOYROW_NEON
+YANY(RAWToYRow_Any_NEON, RAWToYRow_NEON, RAWToYRow_C, 3, 1, 7)
+#endif
+#ifdef HAS_RGB565TOYROW_NEON
+YANY(RGB565ToYRow_Any_NEON, RGB565ToYRow_NEON, RGB565ToYRow_C, 2, 1, 7)
+#endif
+#ifdef HAS_ARGB1555TOYROW_NEON
+YANY(ARGB1555ToYRow_Any_NEON, ARGB1555ToYRow_NEON, ARGB1555ToYRow_C, 2, 1, 7)
+#endif
+#ifdef HAS_ARGB4444TOYROW_NEON
+YANY(ARGB4444ToYRow_Any_NEON, ARGB4444ToYRow_NEON, ARGB4444ToYRow_C, 2, 1, 7)
#endif
#ifdef HAS_YUY2TOYROW_NEON
-YANY(YUY2ToYRow_Any_NEON, YUY2ToYRow_NEON, 2, 1, 16)
+YANY(YUY2ToYRow_Any_NEON, YUY2ToYRow_NEON, YUY2ToYRow_C, 2, 1, 15)
#endif
#ifdef HAS_UYVYTOYROW_NEON
-YANY(UYVYToYRow_Any_NEON, UYVYToYRow_NEON, 2, 1, 16)
+YANY(UYVYToYRow_Any_NEON, UYVYToYRow_NEON, UYVYToYRow_C, 2, 1, 15)
#endif
#ifdef HAS_RGB24TOARGBROW_NEON
-YANY(RGB24ToARGBRow_Any_NEON, RGB24ToARGBRow_NEON, 3, 4, 8)
+YANY(RGB24ToARGBRow_Any_NEON, RGB24ToARGBRow_NEON, RGB24ToARGBRow_C, 3, 4, 7)
#endif
#ifdef HAS_RAWTOARGBROW_NEON
-YANY(RAWToARGBRow_Any_NEON, RAWToARGBRow_NEON, 3, 4, 8)
+YANY(RAWToARGBRow_Any_NEON, RAWToARGBRow_NEON, RAWToARGBRow_C, 3, 4, 7)
#endif
#ifdef HAS_RGB565TOARGBROW_NEON
-YANY(RGB565ToARGBRow_Any_NEON, RGB565ToARGBRow_NEON, 2, 4, 8)
+YANY(RGB565ToARGBRow_Any_NEON, RGB565ToARGBRow_NEON, RGB565ToARGBRow_C, 2, 4, 7)
#endif
#ifdef HAS_ARGB1555TOARGBROW_NEON
-YANY(ARGB1555ToARGBRow_Any_NEON, ARGB1555ToARGBRow_NEON, 2, 4, 8)
+YANY(ARGB1555ToARGBRow_Any_NEON, ARGB1555ToARGBRow_NEON, ARGB1555ToARGBRow_C,
+ 2, 4, 7)
#endif
#ifdef HAS_ARGB4444TOARGBROW_NEON
-YANY(ARGB4444ToARGBRow_Any_NEON, ARGB4444ToARGBRow_NEON, 2, 4, 8)
+YANY(ARGB4444ToARGBRow_Any_NEON, ARGB4444ToARGBRow_NEON, ARGB4444ToARGBRow_C,
+ 2, 4, 7)
#endif
-#undef YANY
-
-#define YANY(NAMEANY, ARGBTOY_SIMD, ARGBTOY_C, SBPP, BPP, MASK) \
- void NAMEANY(const uint8* src_argb, uint8* dst_y, int width) { \
- int n = width & ~MASK; \
- ARGBTOY_SIMD(src_argb, dst_y, n); \
- ARGBTOY_C(src_argb + n * SBPP, \
- dst_y + n * BPP, width & MASK); \
- }
-
-// Attenuate is destructive so last16 method can not be used due to overlap.
#ifdef HAS_ARGBATTENUATEROW_SSSE3
YANY(ARGBAttenuateRow_Any_SSSE3, ARGBAttenuateRow_SSSE3, ARGBAttenuateRow_C,
4, 4, 3)
@@ -318,7 +364,9 @@ YANY(ARGBAttenuateRow_Any_NEON, ARGBAttenuateRow_NEON, ARGBAttenuateRow_C,
void NAMEANY(const uint8* src_argb, int src_stride_argb, \
uint8* dst_u, uint8* dst_v, int width) { \
int n = width & ~MASK; \
- ANYTOUV_SIMD(src_argb, src_stride_argb, dst_u, dst_v, n); \
+ if (n > 0) { \
+ ANYTOUV_SIMD(src_argb, src_stride_argb, dst_u, dst_v, n); \
+ } \
ANYTOUV_C(src_argb + n * BPP, src_stride_argb, \
dst_u + (n >> 1), \
dst_v + (n >> 1), \
@@ -327,29 +375,50 @@ YANY(ARGBAttenuateRow_Any_NEON, ARGBAttenuateRow_NEON, ARGBAttenuateRow_C,
#ifdef HAS_ARGBTOUVROW_AVX2
UVANY(ARGBToUVRow_Any_AVX2, ARGBToUVRow_AVX2, ARGBToUVRow_C, 4, 31)
+#endif
+#ifdef HAS_ARGBTOUVROW_SSSE3
+UVANY(ARGBToUVRow_Any_SSSE3, ARGBToUVRow_SSSE3, ARGBToUVRow_C, 4, 15)
+UVANY(ARGBToUVJRow_Any_SSSE3, ARGBToUVJRow_SSSE3, ARGBToUVJRow_C, 4, 15)
+UVANY(BGRAToUVRow_Any_SSSE3, BGRAToUVRow_SSSE3, BGRAToUVRow_C, 4, 15)
+UVANY(ABGRToUVRow_Any_SSSE3, ABGRToUVRow_SSSE3, ABGRToUVRow_C, 4, 15)
+UVANY(RGBAToUVRow_Any_SSSE3, RGBAToUVRow_SSSE3, RGBAToUVRow_C, 4, 15)
+#endif
+#ifdef HAS_YUY2TOUVROW_AVX2
UVANY(YUY2ToUVRow_Any_AVX2, YUY2ToUVRow_AVX2, YUY2ToUVRow_C, 2, 31)
UVANY(UYVYToUVRow_Any_AVX2, UYVYToUVRow_AVX2, UYVYToUVRow_C, 2, 31)
#endif
-#ifdef HAS_ARGBTOUVROW_SSSE3
-UVANY(ARGBToUVRow_Any_SSSE3, ARGBToUVRow_Unaligned_SSSE3, ARGBToUVRow_C, 4, 15)
-UVANY(ARGBToUVJRow_Any_SSSE3, ARGBToUVJRow_Unaligned_SSSE3, ARGBToUVJRow_C,
- 4, 15)
-UVANY(BGRAToUVRow_Any_SSSE3, BGRAToUVRow_Unaligned_SSSE3, BGRAToUVRow_C, 4, 15)
-UVANY(ABGRToUVRow_Any_SSSE3, ABGRToUVRow_Unaligned_SSSE3, ABGRToUVRow_C, 4, 15)
-UVANY(RGBAToUVRow_Any_SSSE3, RGBAToUVRow_Unaligned_SSSE3, RGBAToUVRow_C, 4, 15)
-UVANY(YUY2ToUVRow_Any_SSE2, YUY2ToUVRow_Unaligned_SSE2, YUY2ToUVRow_C, 2, 15)
-UVANY(UYVYToUVRow_Any_SSE2, UYVYToUVRow_Unaligned_SSE2, UYVYToUVRow_C, 2, 15)
+#ifdef HAS_YUY2TOUVROW_SSE2
+UVANY(YUY2ToUVRow_Any_SSE2, YUY2ToUVRow_SSE2, YUY2ToUVRow_C, 2, 15)
+UVANY(UYVYToUVRow_Any_SSE2, UYVYToUVRow_SSE2, UYVYToUVRow_C, 2, 15)
#endif
#ifdef HAS_ARGBTOUVROW_NEON
UVANY(ARGBToUVRow_Any_NEON, ARGBToUVRow_NEON, ARGBToUVRow_C, 4, 15)
+#endif
+#ifdef HAS_ARGBTOUVJROW_NEON
UVANY(ARGBToUVJRow_Any_NEON, ARGBToUVJRow_NEON, ARGBToUVJRow_C, 4, 15)
+#endif
+#ifdef HAS_BGRATOUVROW_NEON
UVANY(BGRAToUVRow_Any_NEON, BGRAToUVRow_NEON, BGRAToUVRow_C, 4, 15)
+#endif
+#ifdef HAS_ABGRTOUVROW_NEON
UVANY(ABGRToUVRow_Any_NEON, ABGRToUVRow_NEON, ABGRToUVRow_C, 4, 15)
+#endif
+#ifdef HAS_RGBATOUVROW_NEON
UVANY(RGBAToUVRow_Any_NEON, RGBAToUVRow_NEON, RGBAToUVRow_C, 4, 15)
+#endif
+#ifdef HAS_RGB24TOUVROW_NEON
UVANY(RGB24ToUVRow_Any_NEON, RGB24ToUVRow_NEON, RGB24ToUVRow_C, 3, 15)
+#endif
+#ifdef HAS_RAWTOUVROW_NEON
UVANY(RAWToUVRow_Any_NEON, RAWToUVRow_NEON, RAWToUVRow_C, 3, 15)
+#endif
+#ifdef HAS_RGB565TOUVROW_NEON
UVANY(RGB565ToUVRow_Any_NEON, RGB565ToUVRow_NEON, RGB565ToUVRow_C, 2, 15)
+#endif
+#ifdef HAS_ARGB1555TOUVROW_NEON
UVANY(ARGB1555ToUVRow_Any_NEON, ARGB1555ToUVRow_NEON, ARGB1555ToUVRow_C, 2, 15)
+#endif
+#ifdef HAS_ARGB4444TOUVROW_NEON
UVANY(ARGB4444ToUVRow_Any_NEON, ARGB4444ToUVRow_NEON, ARGB4444ToUVRow_C, 2, 15)
#endif
#ifdef HAS_YUY2TOUVROW_NEON
@@ -360,11 +429,12 @@ UVANY(UYVYToUVRow_Any_NEON, UYVYToUVRow_NEON, UYVYToUVRow_C, 2, 15)
#endif
#undef UVANY
-#define UV422ANY(NAMEANY, ANYTOUV_SIMD, ANYTOUV_C, BPP, MASK, SHIFT) \
- void NAMEANY(const uint8* src_uv, \
- uint8* dst_u, uint8* dst_v, int width) { \
+#define UV422ANY(NAMEANY, ANYTOUV_SIMD, ANYTOUV_C, BPP, SHIFT, MASK) \
+ void NAMEANY(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) { \
int n = width & ~MASK; \
- ANYTOUV_SIMD(src_uv, dst_u, dst_v, n); \
+ if (n > 0) { \
+ ANYTOUV_SIMD(src_uv, dst_u, dst_v, n); \
+ } \
ANYTOUV_C(src_uv + n * BPP, \
dst_u + (n >> SHIFT), \
dst_v + (n >> SHIFT), \
@@ -372,42 +442,45 @@ UVANY(UYVYToUVRow_Any_NEON, UYVYToUVRow_NEON, UYVYToUVRow_C, 2, 15)
}
#ifdef HAS_ARGBTOUV444ROW_SSSE3
-UV422ANY(ARGBToUV444Row_Any_SSSE3, ARGBToUV444Row_Unaligned_SSSE3,
- ARGBToUV444Row_C, 4, 15, 0)
+UV422ANY(ARGBToUV444Row_Any_SSSE3, ARGBToUV444Row_SSSE3,
+ ARGBToUV444Row_C, 4, 0, 15)
#endif
#ifdef HAS_YUY2TOUV422ROW_AVX2
UV422ANY(YUY2ToUV422Row_Any_AVX2, YUY2ToUV422Row_AVX2,
- YUY2ToUV422Row_C, 2, 31, 1)
+ YUY2ToUV422Row_C, 2, 1, 31)
UV422ANY(UYVYToUV422Row_Any_AVX2, UYVYToUV422Row_AVX2,
- UYVYToUV422Row_C, 2, 31, 1)
+ UYVYToUV422Row_C, 2, 1, 31)
#endif
-#ifdef HAS_ARGBTOUVROW_SSSE3
-UV422ANY(ARGBToUV422Row_Any_SSSE3, ARGBToUV422Row_Unaligned_SSSE3,
- ARGBToUV422Row_C, 4, 15, 1)
-UV422ANY(YUY2ToUV422Row_Any_SSE2, YUY2ToUV422Row_Unaligned_SSE2,
- YUY2ToUV422Row_C, 2, 15, 1)
-UV422ANY(UYVYToUV422Row_Any_SSE2, UYVYToUV422Row_Unaligned_SSE2,
- UYVYToUV422Row_C, 2, 15, 1)
+#ifdef HAS_ARGBTOUV422ROW_SSSE3
+UV422ANY(ARGBToUV422Row_Any_SSSE3, ARGBToUV422Row_SSSE3,
+ ARGBToUV422Row_C, 4, 1, 15)
+#endif
+#ifdef HAS_YUY2TOUV422ROW_SSE2
+UV422ANY(YUY2ToUV422Row_Any_SSE2, YUY2ToUV422Row_SSE2,
+ YUY2ToUV422Row_C, 2, 1, 15)
+UV422ANY(UYVYToUV422Row_Any_SSE2, UYVYToUV422Row_SSE2,
+ UYVYToUV422Row_C, 2, 1, 15)
#endif
#ifdef HAS_YUY2TOUV422ROW_NEON
UV422ANY(ARGBToUV444Row_Any_NEON, ARGBToUV444Row_NEON,
- ARGBToUV444Row_C, 4, 7, 0)
+ ARGBToUV444Row_C, 4, 0, 7)
UV422ANY(ARGBToUV422Row_Any_NEON, ARGBToUV422Row_NEON,
- ARGBToUV422Row_C, 4, 15, 1)
+ ARGBToUV422Row_C, 4, 1, 15)
UV422ANY(ARGBToUV411Row_Any_NEON, ARGBToUV411Row_NEON,
- ARGBToUV411Row_C, 4, 31, 2)
+ ARGBToUV411Row_C, 4, 2, 31)
UV422ANY(YUY2ToUV422Row_Any_NEON, YUY2ToUV422Row_NEON,
- YUY2ToUV422Row_C, 2, 15, 1)
+ YUY2ToUV422Row_C, 2, 1, 15)
UV422ANY(UYVYToUV422Row_Any_NEON, UYVYToUV422Row_NEON,
- UYVYToUV422Row_C, 2, 15, 1)
+ UYVYToUV422Row_C, 2, 1, 15)
#endif
#undef UV422ANY
#define SPLITUVROWANY(NAMEANY, ANYTOUV_SIMD, ANYTOUV_C, MASK) \
- void NAMEANY(const uint8* src_uv, \
- uint8* dst_u, uint8* dst_v, int width) { \
+ void NAMEANY(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) { \
int n = width & ~MASK; \
- ANYTOUV_SIMD(src_uv, dst_u, dst_v, n); \
+ if (n > 0) { \
+ ANYTOUV_SIMD(src_uv, dst_u, dst_v, n); \
+ } \
ANYTOUV_C(src_uv + n * 2, \
dst_u + n, \
dst_v + n, \
@@ -415,7 +488,7 @@ UV422ANY(UYVYToUV422Row_Any_NEON, UYVYToUV422Row_NEON,
}
#ifdef HAS_SPLITUVROW_SSE2
-SPLITUVROWANY(SplitUVRow_Any_SSE2, SplitUVRow_Unaligned_SSE2, SplitUVRow_C, 15)
+SPLITUVROWANY(SplitUVRow_Any_SSE2, SplitUVRow_SSE2, SplitUVRow_C, 15)
#endif
#ifdef HAS_SPLITUVROW_AVX2
SPLITUVROWANY(SplitUVRow_Any_AVX2, SplitUVRow_AVX2, SplitUVRow_C, 31)
@@ -424,7 +497,7 @@ SPLITUVROWANY(SplitUVRow_Any_AVX2, SplitUVRow_AVX2, SplitUVRow_C, 31)
SPLITUVROWANY(SplitUVRow_Any_NEON, SplitUVRow_NEON, SplitUVRow_C, 15)
#endif
#ifdef HAS_SPLITUVROW_MIPS_DSPR2
-SPLITUVROWANY(SplitUVRow_Any_MIPS_DSPR2, SplitUVRow_Unaligned_MIPS_DSPR2,
+SPLITUVROWANY(SplitUVRow_Any_MIPS_DSPR2, SplitUVRow_MIPS_DSPR2,
SplitUVRow_C, 15)
#endif
#undef SPLITUVROWANY
@@ -433,7 +506,9 @@ SPLITUVROWANY(SplitUVRow_Any_MIPS_DSPR2, SplitUVRow_Unaligned_MIPS_DSPR2,
void NAMEANY(const uint8* src_u, const uint8* src_v, \
uint8* dst_uv, int width) { \
int n = width & ~MASK; \
- ANYTOUV_SIMD(src_u, src_v, dst_uv, n); \
+ if (n > 0) { \
+ ANYTOUV_SIMD(src_u, src_v, dst_uv, n); \
+ } \
ANYTOUV_C(src_u + n, \
src_v + n, \
dst_uv + n * 2, \
@@ -441,7 +516,7 @@ SPLITUVROWANY(SplitUVRow_Any_MIPS_DSPR2, SplitUVRow_Unaligned_MIPS_DSPR2,
}
#ifdef HAS_MERGEUVROW_SSE2
-MERGEUVROW_ANY(MergeUVRow_Any_SSE2, MergeUVRow_Unaligned_SSE2, MergeUVRow_C, 15)
+MERGEUVROW_ANY(MergeUVRow_Any_SSE2, MergeUVRow_SSE2, MergeUVRow_C, 15)
#endif
#ifdef HAS_MERGEUVROW_AVX2
MERGEUVROW_ANY(MergeUVRow_Any_AVX2, MergeUVRow_AVX2, MergeUVRow_C, 31)
@@ -455,7 +530,9 @@ MERGEUVROW_ANY(MergeUVRow_Any_NEON, MergeUVRow_NEON, MergeUVRow_C, 15)
void NAMEANY(const uint8* src_argb0, const uint8* src_argb1, \
uint8* dst_argb, int width) { \
int n = width & ~MASK; \
- ARGBMATH_SIMD(src_argb0, src_argb1, dst_argb, n); \
+ if (n > 0) { \
+ ARGBMATH_SIMD(src_argb0, src_argb1, dst_argb, n); \
+ } \
ARGBMATH_C(src_argb0 + n * 4, \
src_argb1 + n * 4, \
dst_argb + n * 4, \
@@ -502,7 +579,9 @@ MATHROW_ANY(ARGBSubtractRow_Any_NEON, ARGBSubtractRow_NEON, ARGBSubtractRow_C,
void NAMEANY(const uint8* src_argb, uint8* dst_argb, \
const uint8* shuffler, int width) { \
int n = width & ~MASK; \
- ARGBTOY_SIMD(src_argb, dst_argb, shuffler, n); \
+ if (n > 0) { \
+ ARGBTOY_SIMD(src_argb, dst_argb, shuffler, n); \
+ } \
ARGBTOY_C(src_argb + n * SBPP, \
dst_argb + n * BPP, shuffler, width & MASK); \
}
@@ -512,7 +591,7 @@ YANY(ARGBShuffleRow_Any_SSE2, ARGBShuffleRow_SSE2,
ARGBShuffleRow_C, 4, 4, 3)
#endif
#ifdef HAS_ARGBSHUFFLEROW_SSSE3
-YANY(ARGBShuffleRow_Any_SSSE3, ARGBShuffleRow_Unaligned_SSSE3,
+YANY(ARGBShuffleRow_Any_SSSE3, ARGBShuffleRow_SSSE3,
ARGBShuffleRow_C, 4, 4, 7)
#endif
#ifdef HAS_ARGBSHUFFLEROW_AVX2
@@ -531,35 +610,107 @@ YANY(ARGBShuffleRow_Any_NEON, ARGBShuffleRow_NEON,
ptrdiff_t src_stride_ptr, int width, \
int source_y_fraction) { \
int n = width & ~MASK; \
- TERP_SIMD(dst_ptr, src_ptr, src_stride_ptr, \
- n, source_y_fraction); \
+ if (n > 0) { \
+ TERP_SIMD(dst_ptr, src_ptr, src_stride_ptr, n, source_y_fraction); \
+ } \
TERP_C(dst_ptr + n * BPP, \
src_ptr + n * SBPP, src_stride_ptr, \
width & MASK, source_y_fraction); \
}
#ifdef HAS_INTERPOLATEROW_AVX2
-NANY(InterpolateRow_Any_AVX2, InterpolateRow_AVX2,
- InterpolateRow_C, 1, 1, 32)
+NANY(InterpolateRow_Any_AVX2, InterpolateRow_AVX2, InterpolateRow_C, 1, 1, 31)
#endif
#ifdef HAS_INTERPOLATEROW_SSSE3
-NANY(InterpolateRow_Any_SSSE3, InterpolateRow_Unaligned_SSSE3,
- InterpolateRow_C, 1, 1, 15)
+NANY(InterpolateRow_Any_SSSE3, InterpolateRow_SSSE3, InterpolateRow_C, 1, 1, 15)
#endif
#ifdef HAS_INTERPOLATEROW_SSE2
-NANY(InterpolateRow_Any_SSE2, InterpolateRow_Unaligned_SSE2,
- InterpolateRow_C, 1, 1, 15)
+NANY(InterpolateRow_Any_SSE2, InterpolateRow_SSE2, InterpolateRow_C, 1, 1, 15)
#endif
#ifdef HAS_INTERPOLATEROW_NEON
-NANY(InterpolateRow_Any_NEON, InterpolateRow_NEON,
- InterpolateRow_C, 1, 1, 15)
+NANY(InterpolateRow_Any_NEON, InterpolateRow_NEON, InterpolateRow_C, 1, 1, 15)
#endif
#ifdef HAS_INTERPOLATEROW_MIPS_DSPR2
-NANY(InterpolateRow_Any_MIPS_DSPR2, InterpolateRow_MIPS_DSPR2,
- InterpolateRow_C, 1, 1, 3)
+NANY(InterpolateRow_Any_MIPS_DSPR2, InterpolateRow_MIPS_DSPR2, InterpolateRow_C,
+ 1, 1, 3)
#endif
#undef NANY
+#define MANY(NAMEANY, MIRROR_SIMD, MIRROR_C, BPP, MASK) \
+ void NAMEANY(const uint8* src_y, uint8* dst_y, int width) { \
+ int n = width & ~MASK; \
+ int r = width & MASK; \
+ if (n > 0) { \
+ MIRROR_SIMD(src_y, dst_y + r * BPP, n); \
+ } \
+ MIRROR_C(src_y + n * BPP, dst_y, r); \
+ }
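+// Editor's note on the mirror split above: the SIMD kernel writes its n
+// reversed pixels to the *end* of the destination row (dst_y + r * BPP)
+// and the C fallback writes the remaining r to the front, so dst[i] ==
+// src[width - 1 - i] holds across the seam. e.g. width 20, MASK 15:
+// n = 16, r = 4; src[0..15] land reversed in dst[4..19] and src[16..19]
+// land reversed in dst[0..3].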
+
+#ifdef HAS_MIRRORROW_AVX2
+MANY(MirrorRow_Any_AVX2, MirrorRow_AVX2, MirrorRow_C, 1, 31)
+#endif
+#ifdef HAS_MIRRORROW_SSSE3
+MANY(MirrorRow_Any_SSSE3, MirrorRow_SSSE3, MirrorRow_C, 1, 15)
+#endif
+#ifdef HAS_MIRRORROW_SSE2
+MANY(MirrorRow_Any_SSE2, MirrorRow_SSE2, MirrorRow_C, 1, 15)
+#endif
+#ifdef HAS_MIRRORROW_NEON
+MANY(MirrorRow_Any_NEON, MirrorRow_NEON, MirrorRow_C, 1, 15)
+#endif
+#ifdef HAS_ARGBMIRRORROW_AVX2
+MANY(ARGBMirrorRow_Any_AVX2, ARGBMirrorRow_AVX2, ARGBMirrorRow_C, 4, 7)
+#endif
+#ifdef HAS_ARGBMIRRORROW_SSE2
+MANY(ARGBMirrorRow_Any_SSE2, ARGBMirrorRow_SSE2, ARGBMirrorRow_C, 4, 3)
+#endif
+#ifdef HAS_ARGBMIRRORROW_NEON
+MANY(ARGBMirrorRow_Any_NEON, ARGBMirrorRow_NEON, ARGBMirrorRow_C, 4, 3)
+#endif
+#undef MANY
+
+#define MANY(NAMEANY, COPY_SIMD, COPY_C, BPP, MASK) \
+ void NAMEANY(const uint8* src_y, uint8* dst_y, int width) { \
+ int n = width & ~MASK; \
+ int r = width & MASK; \
+ if (n > 0) { \
+ COPY_SIMD(src_y, dst_y, n); \
+ } \
+ COPY_C(src_y + n * BPP, dst_y + n * BPP, r); \
+ }
+
+#ifdef HAS_COPYROW_AVX
+MANY(CopyRow_Any_AVX, CopyRow_AVX, CopyRow_C, 1, 63)
+#endif
+#ifdef HAS_COPYROW_SSE2
+MANY(CopyRow_Any_SSE2, CopyRow_SSE2, CopyRow_C, 1, 31)
+#endif
+#ifdef HAS_COPYROW_NEON
+MANY(CopyRow_Any_NEON, CopyRow_NEON, CopyRow_C, 1, 31)
+#endif
+#undef MANY
+
+#define SETANY(NAMEANY, SET_SIMD, SET_C, T, BPP, MASK) \
+ void NAMEANY(uint8* dst_y, T v8, int width) { \
+ int n = width & ~MASK; \
+ int r = width & MASK; \
+ if (n > 0) { \
+ SET_SIMD(dst_y, v8, n); \
+ } \
+ SET_C(dst_y + n * BPP, v8, r); \
+ }
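+// Editor's note: SETANY is the same split/remainder pattern specialised for
+// fill routines; T is the element type (uint8 for SetRow, uint32 for
+// ARGBSetRow). e.g. ARGBSetRow_Any_NEON with width 10 and MASK 3 fills
+// 8 pixels with NEON and the last 2 with ARGBSetRow_C.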
+
+#ifdef HAS_SETROW_X86
+SETANY(SetRow_Any_X86, SetRow_X86, SetRow_ERMS, uint8, 1, 3)
+#endif
+#ifdef HAS_SETROW_NEON
+SETANY(SetRow_Any_NEON, SetRow_NEON, SetRow_C, uint8, 1, 15)
+#endif
+#ifdef HAS_ARGBSETROW_NEON
+SETANY(ARGBSetRow_Any_NEON, ARGBSetRow_NEON, ARGBSetRow_C, uint32, 4, 3)
+#endif
+#undef SETANY
+
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/row_common.cc b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/row_common.cc
index fa2b752a2ae..e0e2bf4261d 100644
--- a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/row_common.cc
+++ b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/row_common.cc
@@ -199,6 +199,32 @@ void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
}
}
+void ARGBToRGB565DitherRow_C(const uint8* src_argb, uint8* dst_rgb,
+ const uint8* dither8x8, int width) {
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ int dither0 = dither8x8[x & 7] - 128;
+ int dither1 = dither8x8[(x & 7) + 1] - 128;
+ uint8 b0 = Clamp(src_argb[0] + dither0) >> 3;
+ uint8 g0 = Clamp(src_argb[1] + dither0) >> 2;
+ uint8 r0 = Clamp(src_argb[2] + dither0) >> 3;
+ uint8 b1 = Clamp(src_argb[4] + dither1) >> 3;
+ uint8 g1 = Clamp(src_argb[5] + dither1) >> 2;
+ uint8 r1 = Clamp(src_argb[6] + dither1) >> 3;
+ WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 11) |
+ (b1 << 16) | (g1 << 21) | (r1 << 27));
+ dst_rgb += 4;
+ src_argb += 8;
+ }
+ if (width & 1) {
+ int dither0 = dither8x8[(width - 1) & 7] - 128;
+ uint8 b0 = Clamp(src_argb[0] + dither0) >> 3;
+ uint8 g0 = Clamp(src_argb[1] + dither0) >> 2;
+ uint8 r0 = Clamp(src_argb[2] + dither0) >> 3;
+ *(uint16*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 11);
+ }
+}
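+// Editor's note on the packing above: after dithering, each pixel keeps
+// 5/6/5 bits of B/G/R and two pixels are packed into one little-endian
+// 32-bit word: bits 0-4 b0, 5-10 g0, 11-15 r0, 16-20 b1, 21-26 g1,
+// 27-31 r1. e.g. Clamp(0xFF) >> 3 == 0x1F fills a 5-bit field exactly.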
+
void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
int x;
for (x = 0; x < width - 1; x += 2) {
@@ -385,6 +411,28 @@ void NAME ## ToUVJRow_C(const uint8* src_rgb0, int src_stride_rgb, \
MAKEROWYJ(ARGB, 2, 1, 0, 4)
#undef MAKEROWYJ
+void ARGBToUVJ422Row_C(const uint8* src_argb,
+ uint8* dst_u, uint8* dst_v, int width) {
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ uint8 ab = (src_argb[0] + src_argb[4]) >> 1;
+ uint8 ag = (src_argb[1] + src_argb[5]) >> 1;
+ uint8 ar = (src_argb[2] + src_argb[6]) >> 1;
+ dst_u[0] = RGBToUJ(ar, ag, ab);
+ dst_v[0] = RGBToVJ(ar, ag, ab);
+ src_argb += 8;
+ dst_u += 1;
+ dst_v += 1;
+ }
+ if (width & 1) {
+ uint8 ab = src_argb[0];
+ uint8 ag = src_argb[1];
+ uint8 ar = src_argb[2];
+ dst_u[0] = RGBToUJ(ar, ag, ab);
+ dst_v[0] = RGBToVJ(ar, ag, ab);
+ }
+}
+
void RGB565ToYRow_C(const uint8* src_rgb565, uint8* dst_y, int width) {
int x;
for (x = 0; x < width; ++x) {
@@ -938,33 +986,52 @@ void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width) {
}
}
-// C reference code that mimics the YUV assembly.
-
-#define YG 74 /* (int8)(1.164 * 64 + 0.5) */
-
-#define UB 127 /* min(63,(int8)(2.018 * 64)) */
-#define UG -25 /* (int8)(-0.391 * 64 - 0.5) */
-#define UR 0
+// YUV to RGB conversion constants.
+// Y contribution to R,G,B. Scale and bias.
+// TODO(fbarchard): Consider moving constants into a common header.
+#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
+#define YGB 1160 /* 1.164 * 64 * 16 - adjusted for even error distribution */
-#define VB 0
-#define VG -52 /* (int8)(-0.813 * 64 - 0.5) */
-#define VR 102 /* (int8)(1.596 * 64 + 0.5) */
+// U and V contributions to R,G,B.
+#define UB -128 /* -min(128, round(2.018 * 64)) */
+#define UG 25 /* -round(-0.391 * 64) */
+#define VG 52 /* -round(-0.813 * 64) */
+#define VR -102 /* -round(1.596 * 64) */
-// Bias
-#define BB UB * 128 + VB * 128
-#define BG UG * 128 + VG * 128
-#define BR UR * 128 + VR * 128
+// Bias values to subtract 16 from Y and 128 from U and V.
+#define BB (UB * 128 - YGB)
+#define BG (UG * 128 + VG * 128 - YGB)
+#define BR (VR * 128 - YGB)
+// C reference code that mimics the YUV assembly.
static __inline void YuvPixel(uint8 y, uint8 u, uint8 v,
uint8* b, uint8* g, uint8* r) {
- int32 y1 = ((int32)(y) - 16) * YG;
- *b = Clamp((int32)((u * UB + v * VB) - (BB) + y1) >> 6);
- *g = Clamp((int32)((u * UG + v * VG) - (BG) + y1) >> 6);
- *r = Clamp((int32)((u * UR + v * VR) - (BR) + y1) >> 6);
+ uint32 y1 = (uint32)(y * 0x0101 * YG) >> 16;
+ *b = Clamp((int32)(BB - ( u * UB) + y1) >> 6);
+ *g = Clamp((int32)(BG - (v * VG + u * UG) + y1) >> 6);
+ *r = Clamp((int32)(BR - (v * VR ) + y1) >> 6);
}
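+// Editor's worked example for the fixed-point math above: video-range
+// black, YuvPixel(16, 128, 128), gives y1 = (16 * 0x0101 * YG) >> 16
+// = 1191 and B = Clamp((BB - 128 * UB + y1) >> 6)
+// = Clamp((-17544 + 16384 + 1191) >> 6) = Clamp(31 >> 6) = 0; G and R
+// reduce to the same 31 >> 6 = 0. White, YuvPixel(235, 128, 128), gives
+// 16346 >> 6 = 255 on all three channels.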
+// C reference code that mimics the YUV assembly.
+static __inline void YPixel(uint8 y, uint8* b, uint8* g, uint8* r) {
+ uint32 y1 = (uint32)(y * 0x0101 * YG) >> 16;
+ *b = Clamp((int32)(y1 - YGB) >> 6);
+ *g = Clamp((int32)(y1 - YGB) >> 6);
+ *r = Clamp((int32)(y1 - YGB) >> 6);
+}
+
+#undef YG
+#undef YGB
+#undef UB
+#undef UG
+#undef VG
+#undef VR
+#undef BB
+#undef BG
+#undef BR
+
#if !defined(LIBYUV_DISABLE_NEON) && \
- (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
+ (defined(__ARM_NEON__) || defined(__aarch64__) || defined(LIBYUV_NEON))
// C code that mimics the assembly.
// TODO(fbarchard): Remove subsampling from Neon.
void I444ToARGBRow_C(const uint8* src_y,
@@ -1008,6 +1075,7 @@ void I444ToARGBRow_C(const uint8* src_y,
}
}
#endif
+
// Also used for 420
void I422ToARGBRow_C(const uint8* src_y,
const uint8* src_u,
@@ -1034,6 +1102,59 @@ void I422ToARGBRow_C(const uint8* src_y,
}
}
+// C reference code that mimics the YUV assembly.
+// * R = Y + 1.40200 * Cr
+// * G = Y - 0.34414 * Cb - 0.71414 * Cr
+// * B = Y + 1.77200 * Cb
+
+#define YGJ 64 /* (int8)round(1.000 * 64) */
+
+#define UBJ 113 /* (int8)round(1.772 * 64) */
+#define UGJ -22 /* (int8)round(-0.34414 * 64) */
+#define URJ 0
+
+#define VBJ 0
+#define VGJ -46 /* (int8)round(-0.71414 * 64) */
+#define VRJ 90 /* (int8)round(1.402 * 64) */
+
+// Bias
+#define BBJ (UBJ * 128 + VBJ * 128)
+#define BGJ (UGJ * 128 + VGJ * 128)
+#define BRJ (URJ * 128 + VRJ * 128)
+
+static __inline void YuvJPixel(uint8 y, uint8 u, uint8 v,
+ uint8* b, uint8* g, uint8* r) {
+ uint32 y1 = (uint32)(y * YGJ);
+ *b = Clamp((int32)(u * UBJ + v * VBJ + y1 - BBJ) >> 6);
+ *g = Clamp((int32)(u * UGJ + v * VGJ + y1 - BGJ) >> 6);
+ *r = Clamp((int32)(u * URJ + v * VRJ + y1 - BRJ) >> 6);
+}
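+// Editor's worked example: JPEG YUV is full range, so no 16..235 scaling
+// is applied. YuvJPixel(255, 128, 128) gives, on B,
+// Clamp((128 * UBJ + 255 * YGJ - BBJ) >> 6) = Clamp(16320 >> 6) = 255,
+// and likewise 255 on G and R; YuvJPixel(0, 128, 128) gives black.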
+
+void J422ToARGBRow_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* rgb_buf,
+ int width) {
+ int x;
+ for (x = 0; x < width - 1; x += 2) {
+ YuvJPixel(src_y[0], src_u[0], src_v[0],
+ rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+ rgb_buf[3] = 255;
+ YuvJPixel(src_y[1], src_u[0], src_v[0],
+ rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
+ rgb_buf[7] = 255;
+ src_y += 2;
+ src_u += 1;
+ src_v += 1;
+ rgb_buf += 8; // Advance 2 pixels.
+ }
+ if (width & 1) {
+ YuvJPixel(src_y[0], src_u[0], src_v[0],
+ rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+ rgb_buf[3] = 255;
+ }
+}
+
void I422ToRGB24Row_C(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
@@ -1470,18 +1591,15 @@ void I422ToRGBARow_C(const uint8* src_y,
void YToARGBRow_C(const uint8* src_y, uint8* rgb_buf, int width) {
int x;
for (x = 0; x < width - 1; x += 2) {
- YuvPixel(src_y[0], 128, 128,
- rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+ YPixel(src_y[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
rgb_buf[3] = 255;
- YuvPixel(src_y[1], 128, 128,
- rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
+ YPixel(src_y[1], rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
rgb_buf[7] = 255;
src_y += 2;
rgb_buf += 8; // Advance 2 pixels.
}
if (width & 1) {
- YuvPixel(src_y[0], 128, 128,
- rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+ YPixel(src_y[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
rgb_buf[3] = 255;
}
}
@@ -1569,28 +1687,15 @@ void CopyRow_16_C(const uint16* src, uint16* dst, int count) {
memcpy(dst, src, count * 2);
}
-void SetRow_C(uint8* dst, uint32 v8, int count) {
-#ifdef _MSC_VER
- // VC will generate rep stosb.
- int x;
- for (x = 0; x < count; ++x) {
- dst[x] = v8;
- }
-#else
- memset(dst, v8, count);
-#endif
+void SetRow_C(uint8* dst, uint8 v8, int width) {
+ memset(dst, v8, width);
}
-void ARGBSetRows_C(uint8* dst, uint32 v32, int width,
- int dst_stride, int height) {
- int y;
- for (y = 0; y < height; ++y) {
- uint32* d = (uint32*)(dst);
- int x;
- for (x = 0; x < width; ++x) {
- d[x] = v32;
- }
- dst += dst_stride;
+void ARGBSetRow_C(uint8* dst_argb, uint32 v32, int width) {
+ uint32* d = (uint32*)(dst_argb);
+ int x;
+ for (x = 0; x < width; ++x) {
+ d[x] = v32;
}
}
@@ -1885,17 +1990,17 @@ void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride,
}
}
-// Blend 2 rows into 1 for conversions such as I422ToI420.
-void HalfRow_C(const uint8* src_uv, int src_uv_stride,
- uint8* dst_uv, int pix) {
+// Blend 2 rows into 1.
+static void HalfRow_C(const uint8* src_uv, int src_uv_stride,
+ uint8* dst_uv, int pix) {
int x;
for (x = 0; x < pix; ++x) {
dst_uv[x] = (src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1;
}
}
-void HalfRow_16_C(const uint16* src_uv, int src_uv_stride,
- uint16* dst_uv, int pix) {
+static void HalfRow_16_C(const uint16* src_uv, int src_uv_stride,
+ uint16* dst_uv, int pix) {
int x;
for (x = 0; x < pix; ++x) {
dst_uv[x] = (src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1;
@@ -1957,24 +2062,6 @@ void InterpolateRow_16_C(uint16* dst_ptr, const uint16* src_ptr,
}
}
-// Select 2 channels from ARGB on alternating pixels. e.g. BGBGBGBG
-void ARGBToBayerRow_C(const uint8* src_argb,
- uint8* dst_bayer, uint32 selector, int pix) {
- int index0 = selector & 0xff;
- int index1 = (selector >> 8) & 0xff;
- // Copy a row of Bayer.
- int x;
- for (x = 0; x < pix - 1; x += 2) {
- dst_bayer[0] = src_argb[index0];
- dst_bayer[1] = src_argb[index1];
- src_argb += 8;
- dst_bayer += 2;
- }
- if (pix & 1) {
- dst_bayer[0] = src_argb[index0];
- }
-}
-
// Select G channel from ARGB. e.g. GGGGGGGG
void ARGBToBayerGGRow_C(const uint8* src_argb,
uint8* dst_bayer, uint32 selector, int pix) {
@@ -2061,122 +2148,272 @@ void I422ToUYVYRow_C(const uint8* src_y,
}
}
-#if !defined(LIBYUV_DISABLE_X86) && defined(HAS_I422TOARGBROW_SSSE3)
+// Maximum temporary width for wrappers to process at a time, in pixels.
+#define MAXTWIDTH 2048
+
+#if !defined(_MSC_VER) && defined(HAS_I422TORGB565ROW_SSSE3)
// row_win.cc has an asm version, but GCC uses a 2-step wrapper.
-#if !defined(_MSC_VER) && (defined(__x86_64__) || defined(__i386__))
void I422ToRGB565Row_SSSE3(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
- uint8* rgb_buf,
+ uint8* dst_rgb565,
int width) {
- // Allocate a row of ARGB.
- align_buffer_64(row, width * 4);
- I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, width);
- ARGBToRGB565Row_SSE2(row, rgb_buf, width);
- free_aligned_buffer_64(row);
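+ // Row buffer for intermediate ARGB pixels.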
+ SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
+ while (width > 0) {
+ int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+ I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, twidth);
+ ARGBToRGB565Row_SSE2(row, dst_rgb565, twidth);
+ src_y += twidth;
+ src_u += twidth / 2;
+ src_v += twidth / 2;
+ dst_rgb565 += twidth * 2;
+ width -= twidth;
+ }
}
-#endif // !defined(_MSC_VER) && (defined(__x86_64__) || defined(__i386__))
+#endif
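+// Editor's note: these two-step wrappers now convert through a fixed
+// stack buffer of at most MAXTWIDTH (2048) ARGB pixels instead of a heap
+// row sized to the full width; e.g. width 5000 is processed as chunks of
+// 2048, 2048 and 904 pixels, with Y advancing by twidth and the half-rate
+// U and V planes by twidth / 2 per iteration.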
-#if defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)
+#if defined(HAS_I422TOARGB1555ROW_SSSE3)
void I422ToARGB1555Row_SSSE3(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
- uint8* rgb_buf,
+ uint8* dst_argb1555,
int width) {
- // Allocate a row of ARGB.
- align_buffer_64(row, width * 4);
- I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, width);
- ARGBToARGB1555Row_SSE2(row, rgb_buf, width);
- free_aligned_buffer_64(row);
+ // Row buffer for intermediate ARGB pixels.
+ SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
+ while (width > 0) {
+ int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+ I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, twidth);
+ ARGBToARGB1555Row_SSE2(row, dst_argb1555, twidth);
+ src_y += twidth;
+ src_u += twidth / 2;
+ src_v += twidth / 2;
+ dst_argb1555 += twidth * 2;
+ width -= twidth;
+ }
}
+#endif
+#if defined(HAS_I422TOARGB4444ROW_SSSE3)
void I422ToARGB4444Row_SSSE3(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
- uint8* rgb_buf,
+ uint8* dst_argb4444,
int width) {
- // Allocate a row of ARGB.
- align_buffer_64(row, width * 4);
- I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, width);
- ARGBToARGB4444Row_SSE2(row, rgb_buf, width);
- free_aligned_buffer_64(row);
+ // Row buffer for intermediate ARGB pixels.
+ SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
+ while (width > 0) {
+ int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+ I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, twidth);
+ ARGBToARGB4444Row_SSE2(row, dst_argb4444, twidth);
+ src_y += twidth;
+ src_u += twidth / 2;
+ src_v += twidth / 2;
+ dst_argb4444 += twidth * 2;
+ width -= twidth;
+ }
}
+#endif
-void NV12ToRGB565Row_SSSE3(const uint8* src_y,
- const uint8* src_uv,
- uint8* dst_rgb565,
- int width) {
- // Allocate a row of ARGB.
- align_buffer_64(row, width * 4);
- NV12ToARGBRow_SSSE3(src_y, src_uv, row, width);
- ARGBToRGB565Row_SSE2(row, dst_rgb565, width);
- free_aligned_buffer_64(row);
+#if defined(HAS_NV12TORGB565ROW_SSSE3)
+void NV12ToRGB565Row_SSSE3(const uint8* src_y, const uint8* src_uv,
+ uint8* dst_rgb565, int width) {
+ // Row buffer for intermediate ARGB pixels.
+ SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
+ while (width > 0) {
+ int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+ NV12ToARGBRow_SSSE3(src_y, src_uv, row, twidth);
+ ARGBToRGB565Row_SSE2(row, dst_rgb565, twidth);
+ src_y += twidth;
+ src_uv += twidth;
+ dst_rgb565 += twidth * 2;
+ width -= twidth;
+ }
}
+#endif
-void NV21ToRGB565Row_SSSE3(const uint8* src_y,
- const uint8* src_vu,
- uint8* dst_rgb565,
- int width) {
- // Allocate a row of ARGB.
- align_buffer_64(row, width * 4);
- NV21ToARGBRow_SSSE3(src_y, src_vu, row, width);
- ARGBToRGB565Row_SSE2(row, dst_rgb565, width);
- free_aligned_buffer_64(row);
+#if defined(HAS_NV21TORGB565ROW_SSSE3)
+void NV21ToRGB565Row_SSSE3(const uint8* src_y, const uint8* src_vu,
+ uint8* dst_rgb565, int width) {
+ // Row buffer for intermediate ARGB pixels.
+ SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
+ while (width > 0) {
+ int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+ NV21ToARGBRow_SSSE3(src_y, src_vu, row, twidth);
+ ARGBToRGB565Row_SSE2(row, dst_rgb565, twidth);
+ src_y += twidth;
+ src_vu += twidth;
+ dst_rgb565 += twidth * 2;
+ width -= twidth;
+ }
}
+#endif
-void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2,
- uint8* dst_argb,
- int width) {
- // Allocate a rows of yuv.
- align_buffer_64(row_y, ((width + 63) & ~63) * 2);
- uint8* row_u = row_y + ((width + 63) & ~63);
- uint8* row_v = row_u + ((width + 63) & ~63) / 2;
- YUY2ToUV422Row_SSE2(src_yuy2, row_u, row_v, width);
- YUY2ToYRow_SSE2(src_yuy2, row_y, width);
- I422ToARGBRow_SSSE3(row_y, row_u, row_v, dst_argb, width);
- free_aligned_buffer_64(row_y);
-}
-
-void YUY2ToARGBRow_Unaligned_SSSE3(const uint8* src_yuy2,
- uint8* dst_argb,
- int width) {
- // Allocate a rows of yuv.
- align_buffer_64(row_y, ((width + 63) & ~63) * 2);
- uint8* row_u = row_y + ((width + 63) & ~63);
- uint8* row_v = row_u + ((width + 63) & ~63) / 2;
- YUY2ToUV422Row_Unaligned_SSE2(src_yuy2, row_u, row_v, width);
- YUY2ToYRow_Unaligned_SSE2(src_yuy2, row_y, width);
- I422ToARGBRow_Unaligned_SSSE3(row_y, row_u, row_v, dst_argb, width);
- free_aligned_buffer_64(row_y);
-}
-
-void UYVYToARGBRow_SSSE3(const uint8* src_uyvy,
- uint8* dst_argb,
- int width) {
- // Allocate a rows of yuv.
- align_buffer_64(row_y, ((width + 63) & ~63) * 2);
- uint8* row_u = row_y + ((width + 63) & ~63);
- uint8* row_v = row_u + ((width + 63) & ~63) / 2;
- UYVYToUV422Row_SSE2(src_uyvy, row_u, row_v, width);
- UYVYToYRow_SSE2(src_uyvy, row_y, width);
- I422ToARGBRow_SSSE3(row_y, row_u, row_v, dst_argb, width);
- free_aligned_buffer_64(row_y);
-}
-
-void UYVYToARGBRow_Unaligned_SSSE3(const uint8* src_uyvy,
- uint8* dst_argb,
- int width) {
- // Allocate a rows of yuv.
- align_buffer_64(row_y, ((width + 63) & ~63) * 2);
- uint8* row_u = row_y + ((width + 63) & ~63);
- uint8* row_v = row_u + ((width + 63) & ~63) / 2;
- UYVYToUV422Row_Unaligned_SSE2(src_uyvy, row_u, row_v, width);
- UYVYToYRow_Unaligned_SSE2(src_uyvy, row_y, width);
- I422ToARGBRow_Unaligned_SSSE3(row_y, row_u, row_v, dst_argb, width);
- free_aligned_buffer_64(row_y);
-}
-
-#endif // defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)
+#if defined(HAS_YUY2TOARGBROW_SSSE3)
+void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2, uint8* dst_argb, int width) {
+ // Row buffers for intermediate YUV pixels.
+ SIMD_ALIGNED(uint8 row_y[MAXTWIDTH]);
+ SIMD_ALIGNED(uint8 row_u[MAXTWIDTH / 2]);
+ SIMD_ALIGNED(uint8 row_v[MAXTWIDTH / 2]);
+ while (width > 0) {
+ int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+ YUY2ToUV422Row_SSE2(src_yuy2, row_u, row_v, twidth);
+ YUY2ToYRow_SSE2(src_yuy2, row_y, twidth);
+ I422ToARGBRow_SSSE3(row_y, row_u, row_v, dst_argb, twidth);
+ src_yuy2 += twidth * 2;
+ dst_argb += twidth * 4;
+ width -= twidth;
+ }
+}
+#endif
+
+#if defined(HAS_UYVYTOARGBROW_SSSE3)
+void UYVYToARGBRow_SSSE3(const uint8* src_uyvy, uint8* dst_argb, int width) {
+ // Row buffers for intermediate YUV pixels.
+ SIMD_ALIGNED(uint8 row_y[MAXTWIDTH]);
+ SIMD_ALIGNED(uint8 row_u[MAXTWIDTH / 2]);
+ SIMD_ALIGNED(uint8 row_v[MAXTWIDTH / 2]);
+ while (width > 0) {
+ int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+ UYVYToUV422Row_SSE2(src_uyvy, row_u, row_v, twidth);
+ UYVYToYRow_SSE2(src_uyvy, row_y, twidth);
+ I422ToARGBRow_SSSE3(row_y, row_u, row_v, dst_argb, twidth);
+ src_uyvy += twidth * 2;
+ dst_argb += twidth * 4;
+ width -= twidth;
+ }
+}
+#endif // HAS_UYVYTOARGBROW_SSSE3
+
+#if defined(HAS_I422TORGB565ROW_AVX2)
+void I422ToRGB565Row_AVX2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgb565,
+ int width) {
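+ // Row buffer for intermediate ARGB pixels.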
+ SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]);
+ while (width > 0) {
+ int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+ I422ToARGBRow_AVX2(src_y, src_u, src_v, row, twidth);
+ ARGBToRGB565Row_AVX2(row, dst_rgb565, twidth);
+ src_y += twidth;
+ src_u += twidth / 2;
+ src_v += twidth / 2;
+ dst_rgb565 += twidth * 2;
+ width -= twidth;
+ }
+}
+#endif
+
+#if defined(HAS_I422TOARGB1555ROW_AVX2)
+void I422ToARGB1555Row_AVX2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb1555,
+ int width) {
+ // Row buffer for intermediate ARGB pixels.
+ SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]);
+ while (width > 0) {
+ int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+ I422ToARGBRow_AVX2(src_y, src_u, src_v, row, twidth);
+ ARGBToARGB1555Row_AVX2(row, dst_argb1555, twidth);
+ src_y += twidth;
+ src_u += twidth / 2;
+ src_v += twidth / 2;
+ dst_argb1555 += twidth * 2;
+ width -= twidth;
+ }
+}
+#endif
+
+#if defined(HAS_I422TOARGB4444ROW_AVX2)
+void I422ToARGB4444Row_AVX2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb4444,
+ int width) {
+ // Row buffer for intermediate ARGB pixels.
+ SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]);
+ while (width > 0) {
+ int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+ I422ToARGBRow_AVX2(src_y, src_u, src_v, row, twidth);
+ ARGBToARGB4444Row_AVX2(row, dst_argb4444, twidth);
+ src_y += twidth;
+ src_u += twidth / 2;
+ src_v += twidth / 2;
+ dst_argb4444 += twidth * 2;
+ width -= twidth;
+ }
+}
+#endif
+
+#if defined(HAS_NV12TORGB565ROW_AVX2)
+void NV12ToRGB565Row_AVX2(const uint8* src_y, const uint8* src_uv,
+ uint8* dst_rgb565, int width) {
+ // Row buffer for intermediate ARGB pixels.
+ SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]);
+ while (width > 0) {
+ int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+ NV12ToARGBRow_AVX2(src_y, src_uv, row, twidth);
+ ARGBToRGB565Row_AVX2(row, dst_rgb565, twidth);
+ src_y += twidth;
+ src_uv += twidth;
+ dst_rgb565 += twidth * 2;
+ width -= twidth;
+ }
+}
+#endif
+
+#if defined(HAS_NV21TORGB565ROW_AVX2)
+void NV21ToRGB565Row_AVX2(const uint8* src_y, const uint8* src_vu,
+ uint8* dst_rgb565, int width) {
+ // Row buffer for intermediate ARGB pixels.
+ SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]);
+ while (width > 0) {
+ int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+ NV21ToARGBRow_AVX2(src_y, src_vu, row, twidth);
+ ARGBToRGB565Row_AVX2(row, dst_rgb565, twidth);
+ src_y += twidth;
+ src_vu += twidth;
+ dst_rgb565 += twidth * 2;
+ width -= twidth;
+ }
+}
+#endif
+
+#if defined(HAS_YUY2TOARGBROW_AVX2)
+void YUY2ToARGBRow_AVX2(const uint8* src_yuy2, uint8* dst_argb, int width) {
+ // Row buffers for intermediate YUV pixels.
+ SIMD_ALIGNED32(uint8 row_y[MAXTWIDTH]);
+ SIMD_ALIGNED32(uint8 row_u[MAXTWIDTH / 2]);
+ SIMD_ALIGNED32(uint8 row_v[MAXTWIDTH / 2]);
+ while (width > 0) {
+ int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+ YUY2ToUV422Row_AVX2(src_yuy2, row_u, row_v, twidth);
+ YUY2ToYRow_AVX2(src_yuy2, row_y, twidth);
+ I422ToARGBRow_AVX2(row_y, row_u, row_v, dst_argb, twidth);
+ src_yuy2 += twidth * 2;
+ dst_argb += twidth * 4;
+ width -= twidth;
+ }
+}
+#endif
+
+#if defined(HAS_UYVYTOARGBROW_AVX2)
+void UYVYToARGBRow_AVX2(const uint8* src_uyvy, uint8* dst_argb, int width) {
+ // Row buffers for intermediate YUV pixels.
+ SIMD_ALIGNED32(uint8 row_y[MAXTWIDTH]);
+ SIMD_ALIGNED32(uint8 row_u[MAXTWIDTH / 2]);
+ SIMD_ALIGNED32(uint8 row_v[MAXTWIDTH / 2]);
+ while (width > 0) {
+ int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+ UYVYToUV422Row_AVX2(src_uyvy, row_u, row_v, twidth);
+ UYVYToYRow_AVX2(src_uyvy, row_y, twidth);
+ I422ToARGBRow_AVX2(row_y, row_u, row_v, dst_argb, twidth);
+ src_uyvy += twidth * 2;
+ dst_argb += twidth * 4;
+ width -= twidth;
+ }
+}
#endif // HAS_UYVYTOARGBROW_AVX2
void ARGBPolynomialRow_C(const uint8* src_argb,
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/row_mips.cc b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/row_mips.cc
index ae9370c1b02..cfc9ffe0368 100644
--- a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/row_mips.cc
+++ b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/row_mips.cc
@@ -378,7 +378,7 @@ void CopyRow_MIPS(const uint8* src, uint8* dst, int count) {
// MIPS DSPR2 functions
#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips_dsp) && \
(__mips_dsp_rev >= 2) && \
- (_MIPS_SIM == _MIPS_SIM_ABI32)
+ (_MIPS_SIM == _MIPS_SIM_ABI32) && (__mips_isa_rev < 6)
void SplitUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
int width) {
@@ -447,89 +447,6 @@ void SplitUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
);
}
-void SplitUVRow_Unaligned_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u,
- uint8* dst_v, int width) {
- __asm__ __volatile__ (
- ".set push \n"
- ".set noreorder \n"
- "srl $t4, %[width], 4 \n" // multiplies of 16
- "blez $t4, 2f \n"
- " andi %[width], %[width], 0xf \n" // residual
-
- ".p2align 2 \n"
- "1: \n"
- "addiu $t4, $t4, -1 \n"
- "lwr $t0, 0(%[src_uv]) \n"
- "lwl $t0, 3(%[src_uv]) \n" // V1 | U1 | V0 | U0
- "lwr $t1, 4(%[src_uv]) \n"
- "lwl $t1, 7(%[src_uv]) \n" // V3 | U3 | V2 | U2
- "lwr $t2, 8(%[src_uv]) \n"
- "lwl $t2, 11(%[src_uv]) \n" // V5 | U5 | V4 | U4
- "lwr $t3, 12(%[src_uv]) \n"
- "lwl $t3, 15(%[src_uv]) \n" // V7 | U7 | V6 | U6
- "lwr $t5, 16(%[src_uv]) \n"
- "lwl $t5, 19(%[src_uv]) \n" // V9 | U9 | V8 | U8
- "lwr $t6, 20(%[src_uv]) \n"
- "lwl $t6, 23(%[src_uv]) \n" // V11 | U11 | V10 | U10
- "lwr $t7, 24(%[src_uv]) \n"
- "lwl $t7, 27(%[src_uv]) \n" // V13 | U13 | V12 | U12
- "lwr $t8, 28(%[src_uv]) \n"
- "lwl $t8, 31(%[src_uv]) \n" // V15 | U15 | V14 | U14
- "precrq.qb.ph $t9, $t1, $t0 \n" // V3 | V2 | V1 | V0
- "precr.qb.ph $t0, $t1, $t0 \n" // U3 | U2 | U1 | U0
- "precrq.qb.ph $t1, $t3, $t2 \n" // V7 | V6 | V5 | V4
- "precr.qb.ph $t2, $t3, $t2 \n" // U7 | U6 | U5 | U4
- "precrq.qb.ph $t3, $t6, $t5 \n" // V11 | V10 | V9 | V8
- "precr.qb.ph $t5, $t6, $t5 \n" // U11 | U10 | U9 | U8
- "precrq.qb.ph $t6, $t8, $t7 \n" // V15 | V14 | V13 | V12
- "precr.qb.ph $t7, $t8, $t7 \n" // U15 | U14 | U13 | U12
- "addiu %[src_uv], %[src_uv], 32 \n"
- "swr $t9, 0(%[dst_v]) \n"
- "swl $t9, 3(%[dst_v]) \n"
- "swr $t0, 0(%[dst_u]) \n"
- "swl $t0, 3(%[dst_u]) \n"
- "swr $t1, 4(%[dst_v]) \n"
- "swl $t1, 7(%[dst_v]) \n"
- "swr $t2, 4(%[dst_u]) \n"
- "swl $t2, 7(%[dst_u]) \n"
- "swr $t3, 8(%[dst_v]) \n"
- "swl $t3, 11(%[dst_v]) \n"
- "swr $t5, 8(%[dst_u]) \n"
- "swl $t5, 11(%[dst_u]) \n"
- "swr $t6, 12(%[dst_v]) \n"
- "swl $t6, 15(%[dst_v]) \n"
- "swr $t7, 12(%[dst_u]) \n"
- "swl $t7, 15(%[dst_u]) \n"
- "addiu %[dst_u], %[dst_u], 16 \n"
- "bgtz $t4, 1b \n"
- " addiu %[dst_v], %[dst_v], 16 \n"
-
- "beqz %[width], 3f \n"
- " nop \n"
-
- "2: \n"
- "lbu $t0, 0(%[src_uv]) \n"
- "lbu $t1, 1(%[src_uv]) \n"
- "addiu %[src_uv], %[src_uv], 2 \n"
- "addiu %[width], %[width], -1 \n"
- "sb $t0, 0(%[dst_u]) \n"
- "sb $t1, 0(%[dst_v]) \n"
- "addiu %[dst_u], %[dst_u], 1 \n"
- "bgtz %[width], 2b \n"
- " addiu %[dst_v], %[dst_v], 1 \n"
-
- "3: \n"
- ".set pop \n"
- : [src_uv] "+r" (src_uv),
- [width] "+r" (width),
- [dst_u] "+r" (dst_u),
- [dst_v] "+r" (dst_v)
- :
- : "t0", "t1", "t2", "t3",
- "t4", "t5", "t6", "t7", "t8", "t9"
- );
-}
-
void MirrorRow_MIPS_DSPR2(const uint8* src, uint8* dst, int width) {
__asm__ __volatile__ (
".set push \n"
@@ -927,9 +844,9 @@ void I422ToBGRARow_MIPS_DSPR2(const uint8* y_buf,
}
// Bilinear filter 8x2 -> 8x1
-void InterpolateRows_MIPS_DSPR2(uint8* dst_ptr, const uint8* src_ptr,
- ptrdiff_t src_stride, int dst_width,
- int source_y_fraction) {
+void InterpolateRow_MIPS_DSPR2(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) {
int y0_fraction = 256 - source_y_fraction;
const uint8* src_ptr1 = src_ptr + src_stride;
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/row_neon.cc b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/row_neon.cc
index 1392cf5fcc3..8badc5a9b94 100644
--- a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/row_neon.cc
+++ b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/row_neon.cc
@@ -16,7 +16,8 @@ extern "C" {
#endif
// This module is for GCC Neon
-#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
+ !defined(__aarch64__)
// Read 8 Y, 4 U and 4 V from 422
#define READYUV422 \
@@ -92,36 +93,73 @@ extern "C" {
"vuzp.u8 d2, d3 \n" \
"vtrn.u32 d2, d3 \n"
+#define YUV422TORGB_SETUP_REG \
+ "vld1.8 {d24}, [%[kUVToRB]] \n" \
+ "vld1.8 {d25}, [%[kUVToG]] \n" \
+ "vld1.16 {d26[], d27[]}, [%[kUVBiasBGR]]! \n" \
+ "vld1.16 {d8[], d9[]}, [%[kUVBiasBGR]]! \n" \
+ "vld1.16 {d28[], d29[]}, [%[kUVBiasBGR]] \n" \
+ "vld1.32 {d30[], d31[]}, [%[kYToRgb]] \n"
+
#define YUV422TORGB \
- "veor.u8 d2, d26 \n"/*subtract 128 from u and v*/\
- "vmull.s8 q8, d2, d24 \n"/* u/v B/R component */\
- "vmull.s8 q9, d2, d25 \n"/* u/v G component */\
- "vmov.u8 d1, #0 \n"/* split odd/even y apart */\
- "vtrn.u8 d0, d1 \n" \
- "vsub.s16 q0, q0, q15 \n"/* offset y */\
- "vmul.s16 q0, q0, q14 \n" \
+ "vmull.u8 q8, d2, d24 \n" /* u/v B/R component */\
+ "vmull.u8 q9, d2, d25 \n" /* u/v G component */\
+ "vmovl.u8 q0, d0 \n" /* Y */\
+ "vmovl.s16 q10, d1 \n" \
+ "vmovl.s16 q0, d0 \n" \
+ "vmul.s32 q10, q10, q15 \n" \
+ "vmul.s32 q0, q0, q15 \n" \
+ "vqshrun.s32 d0, q0, #16 \n" \
+ "vqshrun.s32 d1, q10, #16 \n" /* Y */\
"vadd.s16 d18, d19 \n" \
- "vqadd.s16 d20, d0, d16 \n" /* B */ \
- "vqadd.s16 d21, d1, d16 \n" \
- "vqadd.s16 d22, d0, d17 \n" /* R */ \
- "vqadd.s16 d23, d1, d17 \n" \
- "vqadd.s16 d16, d0, d18 \n" /* G */ \
- "vqadd.s16 d17, d1, d18 \n" \
- "vqshrun.s16 d0, q10, #6 \n" /* B */ \
- "vqshrun.s16 d1, q11, #6 \n" /* G */ \
- "vqshrun.s16 d2, q8, #6 \n" /* R */ \
- "vmovl.u8 q10, d0 \n"/* set up for reinterleave*/\
- "vmovl.u8 q11, d1 \n" \
- "vmovl.u8 q8, d2 \n" \
- "vtrn.u8 d20, d21 \n" \
- "vtrn.u8 d22, d23 \n" \
- "vtrn.u8 d16, d17 \n" \
- "vmov.u8 d21, d16 \n"
-
-static vec8 kUVToRB = { 127, 127, 127, 127, 102, 102, 102, 102,
- 0, 0, 0, 0, 0, 0, 0, 0 };
-static vec8 kUVToG = { -25, -25, -25, -25, -52, -52, -52, -52,
- 0, 0, 0, 0, 0, 0, 0, 0 };
+ "vshll.u16 q1, d16, #16 \n" /* Replicate u * UB */\
+ "vshll.u16 q10, d17, #16 \n" /* Replicate v * VR */\
+ "vshll.u16 q3, d18, #16 \n" /* Replicate (v*VG + u*UG)*/\
+ "vaddw.u16 q1, q1, d16 \n" \
+ "vaddw.u16 q10, q10, d17 \n" \
+ "vaddw.u16 q3, q3, d18 \n" \
+ "vqadd.s16 q8, q0, q13 \n" /* B */ \
+ "vqadd.s16 q9, q0, q14 \n" /* R */ \
+ "vqadd.s16 q0, q0, q4 \n" /* G */ \
+ "vqadd.s16 q8, q8, q1 \n" /* B */ \
+ "vqadd.s16 q9, q9, q10 \n" /* R */ \
+ "vqsub.s16 q0, q0, q3 \n" /* G */ \
+ "vqshrun.s16 d20, q8, #6 \n" /* B */ \
+ "vqshrun.s16 d22, q9, #6 \n" /* R */ \
+ "vqshrun.s16 d21, q0, #6 \n" /* G */
+
+// YUV to RGB conversion constants.
+// Y contribution to R,G,B. Scale and bias.
+#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
+#define YGB 1160 /* 1.164 * 64 * 16 - adjusted for even error distribution */
+
+// U and V contributions to R,G,B.
+#define UB -128 /* -min(128, round(2.018 * 64)) */
+#define UG 25 /* -round(-0.391 * 64) */
+#define VG 52 /* -round(-0.813 * 64) */
+#define VR -102 /* -round(1.596 * 64) */
+
+// Bias values to subtract 16 from Y and 128 from U and V.
+#define BB (UB * 128 - YGB)
+#define BG (UG * 128 + VG * 128 - YGB)
+#define BR (VR * 128 - YGB)
+
+static uvec8 kUVToRB = { 128, 128, 128, 128, 102, 102, 102, 102,
+ 0, 0, 0, 0, 0, 0, 0, 0 };
+static uvec8 kUVToG = { 25, 25, 25, 25, 52, 52, 52, 52,
+ 0, 0, 0, 0, 0, 0, 0, 0 };
+static vec16 kUVBiasBGR = { BB, BG, BR, 0, 0, 0, 0, 0 };
+static vec32 kYToRgb = { 0x0101 * YG, 0, 0, 0 };
+
+#undef YG
+#undef YGB
+#undef UB
+#undef UG
+#undef VG
+#undef VR
+#undef BB
+#undef BG
+#undef BR
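These constants implement a 6-bit fixed-point BT.601 conversion. A scalar sketch of the per-pixel math the NEON macro vectorizes, with the constants folded back in (Clamp8 and YuvPixel_Sketch are illustrative names; treat this as an annotation, not libyuv's reference code):

static uint8 Clamp8(int v) {
  return (uint8)(v < 0 ? 0 : (v > 255 ? 255 : v));
}
static void YuvPixel_Sketch(uint8 y, uint8 u, uint8 v,
                            uint8* b, uint8* g, uint8* r) {
  // Replicate Y to 16 bits, then scale: y1 is roughly 1.164 * 64 * Y.
  int y1 = (int)(((uint32)y * 0x0101 * 18997u) >> 16);            /* YG */
  *b = Clamp8((y1 + 128 * u - (128 * 128 + 1160)) >> 6);          /* UB, BB */
  *g = Clamp8((y1 - (25 * u + 52 * v) + (77 * 128 - 1160)) >> 6); /* UG, VG, BG */
  *r = Clamp8((y1 + 102 * v - (102 * 128 + 1160)) >> 6);          /* VR, BR */
}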
void I444ToARGBRow_NEON(const uint8* src_y,
const uint8* src_u,
@@ -129,13 +167,7 @@ void I444ToARGBRow_NEON(const uint8* src_y,
uint8* dst_argb,
int width) {
asm volatile (
- MEMACCESS(5)
- "vld1.8 {d24}, [%5] \n"
- MEMACCESS(6)
- "vld1.8 {d25}, [%6] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
+ YUV422TORGB_SETUP_REG
".p2align 2 \n"
"1: \n"
READYUV444
@@ -150,8 +182,10 @@ void I444ToARGBRow_NEON(const uint8* src_y,
"+r"(src_v), // %2
"+r"(dst_argb), // %3
"+r"(width) // %4
- : "r"(&kUVToRB), // %5
- "r"(&kUVToG) // %6
+ : [kUVToRB]"r"(&kUVToRB), // %5
+ [kUVToG]"r"(&kUVToG), // %6
+ [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
@@ -163,13 +197,7 @@ void I422ToARGBRow_NEON(const uint8* src_y,
uint8* dst_argb,
int width) {
asm volatile (
- MEMACCESS(5)
- "vld1.8 {d24}, [%5] \n"
- MEMACCESS(6)
- "vld1.8 {d25}, [%6] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
+ YUV422TORGB_SETUP_REG
".p2align 2 \n"
"1: \n"
READYUV422
@@ -184,8 +212,10 @@ void I422ToARGBRow_NEON(const uint8* src_y,
"+r"(src_v), // %2
"+r"(dst_argb), // %3
"+r"(width) // %4
- : "r"(&kUVToRB), // %5
- "r"(&kUVToG) // %6
+ : [kUVToRB]"r"(&kUVToRB), // %5
+ [kUVToG]"r"(&kUVToG), // %6
+ [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
@@ -197,13 +227,7 @@ void I411ToARGBRow_NEON(const uint8* src_y,
uint8* dst_argb,
int width) {
asm volatile (
- MEMACCESS(5)
- "vld1.8 {d24}, [%5] \n"
- MEMACCESS(6)
- "vld1.8 {d25}, [%6] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
+ YUV422TORGB_SETUP_REG
".p2align 2 \n"
"1: \n"
READYUV411
@@ -218,8 +242,10 @@ void I411ToARGBRow_NEON(const uint8* src_y,
"+r"(src_v), // %2
"+r"(dst_argb), // %3
"+r"(width) // %4
- : "r"(&kUVToRB), // %5
- "r"(&kUVToG) // %6
+ : [kUVToRB]"r"(&kUVToRB), // %5
+ [kUVToG]"r"(&kUVToG), // %6
+ [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
@@ -231,13 +257,7 @@ void I422ToBGRARow_NEON(const uint8* src_y,
uint8* dst_bgra,
int width) {
asm volatile (
- MEMACCESS(5)
- "vld1.8 {d24}, [%5] \n"
- MEMACCESS(6)
- "vld1.8 {d25}, [%6] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
+ YUV422TORGB_SETUP_REG
".p2align 2 \n"
"1: \n"
READYUV422
@@ -253,8 +273,10 @@ void I422ToBGRARow_NEON(const uint8* src_y,
"+r"(src_v), // %2
"+r"(dst_bgra), // %3
"+r"(width) // %4
- : "r"(&kUVToRB), // %5
- "r"(&kUVToG) // %6
+ : [kUVToRB]"r"(&kUVToRB), // %5
+ [kUVToG]"r"(&kUVToG), // %6
+ [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
@@ -266,13 +288,7 @@ void I422ToABGRRow_NEON(const uint8* src_y,
uint8* dst_abgr,
int width) {
asm volatile (
- MEMACCESS(5)
- "vld1.8 {d24}, [%5] \n"
- MEMACCESS(6)
- "vld1.8 {d25}, [%6] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
+ YUV422TORGB_SETUP_REG
".p2align 2 \n"
"1: \n"
READYUV422
@@ -288,8 +304,10 @@ void I422ToABGRRow_NEON(const uint8* src_y,
"+r"(src_v), // %2
"+r"(dst_abgr), // %3
"+r"(width) // %4
- : "r"(&kUVToRB), // %5
- "r"(&kUVToG) // %6
+ : [kUVToRB]"r"(&kUVToRB), // %5
+ [kUVToG]"r"(&kUVToG), // %6
+ [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
@@ -301,13 +319,7 @@ void I422ToRGBARow_NEON(const uint8* src_y,
uint8* dst_rgba,
int width) {
asm volatile (
- MEMACCESS(5)
- "vld1.8 {d24}, [%5] \n"
- MEMACCESS(6)
- "vld1.8 {d25}, [%6] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
+ YUV422TORGB_SETUP_REG
".p2align 2 \n"
"1: \n"
READYUV422
@@ -322,8 +334,10 @@ void I422ToRGBARow_NEON(const uint8* src_y,
"+r"(src_v), // %2
"+r"(dst_rgba), // %3
"+r"(width) // %4
- : "r"(&kUVToRB), // %5
- "r"(&kUVToG) // %6
+ : [kUVToRB]"r"(&kUVToRB), // %5
+ [kUVToG]"r"(&kUVToG), // %6
+ [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
@@ -335,13 +349,7 @@ void I422ToRGB24Row_NEON(const uint8* src_y,
uint8* dst_rgb24,
int width) {
asm volatile (
- MEMACCESS(5)
- "vld1.8 {d24}, [%5] \n"
- MEMACCESS(6)
- "vld1.8 {d25}, [%6] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
+ YUV422TORGB_SETUP_REG
".p2align 2 \n"
"1: \n"
READYUV422
@@ -355,8 +363,10 @@ void I422ToRGB24Row_NEON(const uint8* src_y,
"+r"(src_v), // %2
"+r"(dst_rgb24), // %3
"+r"(width) // %4
- : "r"(&kUVToRB), // %5
- "r"(&kUVToG) // %6
+ : [kUVToRB]"r"(&kUVToRB), // %5
+ [kUVToG]"r"(&kUVToG), // %6
+ [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
@@ -368,13 +378,7 @@ void I422ToRAWRow_NEON(const uint8* src_y,
uint8* dst_raw,
int width) {
asm volatile (
- MEMACCESS(5)
- "vld1.8 {d24}, [%5] \n"
- MEMACCESS(6)
- "vld1.8 {d25}, [%6] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
+ YUV422TORGB_SETUP_REG
".p2align 2 \n"
"1: \n"
READYUV422
@@ -389,8 +393,10 @@ void I422ToRAWRow_NEON(const uint8* src_y,
"+r"(src_v), // %2
"+r"(dst_raw), // %3
"+r"(width) // %4
- : "r"(&kUVToRB), // %5
- "r"(&kUVToG) // %6
+ : [kUVToRB]"r"(&kUVToRB), // %5
+ [kUVToG]"r"(&kUVToG), // %6
+ [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
@@ -414,13 +420,7 @@ void I422ToRGB565Row_NEON(const uint8* src_y,
uint8* dst_rgb565,
int width) {
asm volatile (
- MEMACCESS(5)
- "vld1.8 {d24}, [%5] \n"
- MEMACCESS(6)
- "vld1.8 {d25}, [%6] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
+ YUV422TORGB_SETUP_REG
".p2align 2 \n"
"1: \n"
READYUV422
@@ -435,8 +435,10 @@ void I422ToRGB565Row_NEON(const uint8* src_y,
"+r"(src_v), // %2
"+r"(dst_rgb565), // %3
"+r"(width) // %4
- : "r"(&kUVToRB), // %5
- "r"(&kUVToG) // %6
+ : [kUVToRB]"r"(&kUVToRB), // %5
+ [kUVToG]"r"(&kUVToG), // %6
+ [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
@@ -463,13 +465,7 @@ void I422ToARGB1555Row_NEON(const uint8* src_y,
uint8* dst_argb1555,
int width) {
asm volatile (
- MEMACCESS(5)
- "vld1.8 {d24}, [%5] \n"
- MEMACCESS(6)
- "vld1.8 {d25}, [%6] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
+ YUV422TORGB_SETUP_REG
".p2align 2 \n"
"1: \n"
READYUV422
@@ -485,8 +481,10 @@ void I422ToARGB1555Row_NEON(const uint8* src_y,
"+r"(src_v), // %2
"+r"(dst_argb1555), // %3
"+r"(width) // %4
- : "r"(&kUVToRB), // %5
- "r"(&kUVToG) // %6
+ : [kUVToRB]"r"(&kUVToRB), // %5
+ [kUVToG]"r"(&kUVToG), // %6
+ [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
@@ -507,13 +505,7 @@ void I422ToARGB4444Row_NEON(const uint8* src_y,
uint8* dst_argb4444,
int width) {
asm volatile (
- MEMACCESS(5)
- "vld1.8 {d24}, [%5] \n"
- MEMACCESS(6)
- "vld1.8 {d25}, [%6] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
+ YUV422TORGB_SETUP_REG
"vmov.u8 d4, #0x0f \n" // bits to clear with vbic.
".p2align 2 \n"
"1: \n"
@@ -530,8 +522,10 @@ void I422ToARGB4444Row_NEON(const uint8* src_y,
"+r"(src_v), // %2
"+r"(dst_argb4444), // %3
"+r"(width) // %4
- : "r"(&kUVToRB), // %5
- "r"(&kUVToG) // %6
+ : [kUVToRB]"r"(&kUVToRB), // %5
+ [kUVToG]"r"(&kUVToG), // %6
+ [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
@@ -541,13 +535,7 @@ void YToARGBRow_NEON(const uint8* src_y,
uint8* dst_argb,
int width) {
asm volatile (
- MEMACCESS(3)
- "vld1.8 {d24}, [%3] \n"
- MEMACCESS(4)
- "vld1.8 {d25}, [%4] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
+ YUV422TORGB_SETUP_REG
".p2align 2 \n"
"1: \n"
READYUV400
@@ -560,8 +548,10 @@ void YToARGBRow_NEON(const uint8* src_y,
: "+r"(src_y), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
- : "r"(&kUVToRB), // %3
- "r"(&kUVToG) // %4
+ : [kUVToRB]"r"(&kUVToRB), // %3
+ [kUVToG]"r"(&kUVToG), // %4
+ [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
@@ -595,13 +585,7 @@ void NV12ToARGBRow_NEON(const uint8* src_y,
uint8* dst_argb,
int width) {
asm volatile (
- MEMACCESS(4)
- "vld1.8 {d24}, [%4] \n"
- MEMACCESS(5)
- "vld1.8 {d25}, [%5] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
+ YUV422TORGB_SETUP_REG
".p2align 2 \n"
"1: \n"
READNV12
@@ -615,8 +599,10 @@ void NV12ToARGBRow_NEON(const uint8* src_y,
"+r"(src_uv), // %1
"+r"(dst_argb), // %2
"+r"(width) // %3
- : "r"(&kUVToRB), // %4
- "r"(&kUVToG) // %5
+ : [kUVToRB]"r"(&kUVToRB), // %4
+ [kUVToG]"r"(&kUVToG), // %5
+ [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
@@ -627,13 +613,7 @@ void NV21ToARGBRow_NEON(const uint8* src_y,
uint8* dst_argb,
int width) {
asm volatile (
- MEMACCESS(4)
- "vld1.8 {d24}, [%4] \n"
- MEMACCESS(5)
- "vld1.8 {d25}, [%5] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
+ YUV422TORGB_SETUP_REG
".p2align 2 \n"
"1: \n"
READNV21
@@ -647,8 +627,10 @@ void NV21ToARGBRow_NEON(const uint8* src_y,
"+r"(src_uv), // %1
"+r"(dst_argb), // %2
"+r"(width) // %3
- : "r"(&kUVToRB), // %4
- "r"(&kUVToG) // %5
+ : [kUVToRB]"r"(&kUVToRB), // %4
+ [kUVToG]"r"(&kUVToG), // %5
+ [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
@@ -659,13 +641,7 @@ void NV12ToRGB565Row_NEON(const uint8* src_y,
uint8* dst_rgb565,
int width) {
asm volatile (
- MEMACCESS(4)
- "vld1.8 {d24}, [%4] \n"
- MEMACCESS(5)
- "vld1.8 {d25}, [%5] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
+ YUV422TORGB_SETUP_REG
".p2align 2 \n"
"1: \n"
READNV12
@@ -679,8 +655,10 @@ void NV12ToRGB565Row_NEON(const uint8* src_y,
"+r"(src_uv), // %1
"+r"(dst_rgb565), // %2
"+r"(width) // %3
- : "r"(&kUVToRB), // %4
- "r"(&kUVToG) // %5
+ : [kUVToRB]"r"(&kUVToRB), // %4
+ [kUVToG]"r"(&kUVToG), // %5
+ [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
@@ -691,13 +669,7 @@ void NV21ToRGB565Row_NEON(const uint8* src_y,
uint8* dst_rgb565,
int width) {
asm volatile (
- MEMACCESS(4)
- "vld1.8 {d24}, [%4] \n"
- MEMACCESS(5)
- "vld1.8 {d25}, [%5] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
+ YUV422TORGB_SETUP_REG
".p2align 2 \n"
"1: \n"
READNV21
@@ -711,8 +683,10 @@ void NV21ToRGB565Row_NEON(const uint8* src_y,
"+r"(src_uv), // %1
"+r"(dst_rgb565), // %2
"+r"(width) // %3
- : "r"(&kUVToRB), // %4
- "r"(&kUVToG) // %5
+ : [kUVToRB]"r"(&kUVToRB), // %4
+ [kUVToG]"r"(&kUVToG), // %5
+ [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
@@ -722,13 +696,7 @@ void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
uint8* dst_argb,
int width) {
asm volatile (
- MEMACCESS(3)
- "vld1.8 {d24}, [%3] \n"
- MEMACCESS(4)
- "vld1.8 {d25}, [%4] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
+ YUV422TORGB_SETUP_REG
".p2align 2 \n"
"1: \n"
READYUY2
@@ -741,8 +709,10 @@ void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
: "+r"(src_yuy2), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
- : "r"(&kUVToRB), // %3
- "r"(&kUVToG) // %4
+ : [kUVToRB]"r"(&kUVToRB), // %3
+ [kUVToG]"r"(&kUVToG), // %4
+ [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
@@ -752,13 +722,7 @@ void UYVYToARGBRow_NEON(const uint8* src_uyvy,
uint8* dst_argb,
int width) {
asm volatile (
- MEMACCESS(3)
- "vld1.8 {d24}, [%3] \n"
- MEMACCESS(4)
- "vld1.8 {d25}, [%4] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
+ YUV422TORGB_SETUP_REG
".p2align 2 \n"
"1: \n"
READUYVY
@@ -771,8 +735,10 @@ void UYVYToARGBRow_NEON(const uint8* src_uyvy,
: "+r"(src_uyvy), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
- : "r"(&kUVToRB), // %3
- "r"(&kUVToG) // %4
+ : [kUVToRB]"r"(&kUVToRB), // %3
+ [kUVToG]"r"(&kUVToG), // %4
+ [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "q0", "q1", "q2", "q3",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
@@ -844,30 +810,36 @@ void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
);
}
-// SetRow8 writes 'count' bytes using a 32 bit value repeated.
-void SetRow_NEON(uint8* dst, uint32 v32, int count) {
+// SetRow writes 'count' bytes using an 8 bit value repeated.
+void SetRow_NEON(uint8* dst, uint8 v8, int count) {
asm volatile (
- "vdup.u32 q0, %2 \n" // duplicate 4 ints
- "1: \n"
+ "vdup.8 q0, %2 \n" // duplicate 16 bytes
+ "1: \n"
"subs %1, %1, #16 \n" // 16 bytes per loop
MEMACCESS(0)
"vst1.8 {q0}, [%0]! \n" // store
"bgt 1b \n"
: "+r"(dst), // %0
"+r"(count) // %1
- : "r"(v32) // %2
+ : "r"(v8) // %2
: "cc", "memory", "q0"
);
}
-// TODO(fbarchard): Make fully assembler
-// SetRow32 writes 'count' words using a 32 bit value repeated.
-void ARGBSetRows_NEON(uint8* dst, uint32 v32, int width,
- int dst_stride, int height) {
- for (int y = 0; y < height; ++y) {
- SetRow_NEON(dst, v32, width << 2);
- dst += dst_stride;
- }
+// ARGBSetRow writes 'count' pixels using a 32 bit value repeated.
+void ARGBSetRow_NEON(uint8* dst, uint32 v32, int count) {
+ asm volatile (
+ "vdup.u32 q0, %2 \n" // duplicate 4 ints
+ "1: \n"
+ "subs %1, %1, #4 \n" // 4 pixels per loop
+ MEMACCESS(0)
+ "vst1.8 {q0}, [%0]! \n" // store
+ "bgt 1b \n"
+ : "+r"(dst), // %0
+ "+r"(count) // %1
+ : "r"(v32) // %2
+ : "cc", "memory", "q0"
+ );
}
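The old combined helper becomes two separate contracts: a plain byte fill and a per-pixel 32-bit fill. Scalar models (sketch names only; the NEON loops handle 16 bytes or 4 pixels per iteration, so callers are expected to pass suitably rounded counts):

static void SetRow_Sketch(uint8* dst, uint8 v8, int count) {
  int x;
  for (x = 0; x < count; ++x) dst[x] = v8;  // memset-like byte fill
}
static void ARGBSetRow_Sketch(uint8* dst, uint32 v32, int count) {
  int x;
  for (x = 0; x < count; ++x) {
    ((uint32*)dst)[x] = v32;                // one ARGB pixel per store
  }
}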
void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
@@ -1273,53 +1245,6 @@ void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
);
}
-void HalfRow_NEON(const uint8* src_uv, int src_uv_stride,
- uint8* dst_uv, int pix) {
- asm volatile (
- // change the stride to row 2 pointer
- "add %1, %0 \n"
- "1: \n"
- MEMACCESS(0)
- "vld1.8 {q0}, [%0]! \n" // load row 1 16 pixels.
- "subs %3, %3, #16 \n" // 16 processed per loop
- MEMACCESS(1)
- "vld1.8 {q1}, [%1]! \n" // load row 2 16 pixels.
- "vrhadd.u8 q0, q1 \n" // average row 1 and 2
- MEMACCESS(2)
- "vst1.8 {q0}, [%2]! \n"
- "bgt 1b \n"
- : "+r"(src_uv), // %0
- "+r"(src_uv_stride), // %1
- "+r"(dst_uv), // %2
- "+r"(pix) // %3
- :
- : "cc", "memory", "q0", "q1" // Clobber List
- );
-}
-
-// Select 2 channels from ARGB on alternating pixels. e.g. BGBGBGBG
-void ARGBToBayerRow_NEON(const uint8* src_argb, uint8* dst_bayer,
- uint32 selector, int pix) {
- asm volatile (
- "vmov.u32 d6[0], %3 \n" // selector
- "1: \n"
- MEMACCESS(0)
- "vld1.8 {q0, q1}, [%0]! \n" // load row 8 pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop
- "vtbl.8 d4, {d0, d1}, d6 \n" // look up 4 pixels
- "vtbl.8 d5, {d2, d3}, d6 \n" // look up 4 pixels
- "vtrn.u32 d4, d5 \n" // combine 8 pixels
- MEMACCESS(1)
- "vst1.8 {d4}, [%1]! \n" // store 8.
- "bgt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_bayer), // %1
- "+r"(pix) // %2
- : "r"(selector) // %3
- : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List
- );
-}
-
// Select G channels from ARGB. e.g. GGGGGGGG
void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer,
uint32 /*selector*/, int pix) {
@@ -2832,7 +2757,7 @@ void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
"vmovl.u8 q8, d16 \n" // b (0 .. 255) 16 bit
"vmovl.u8 q9, d18 \n" // g
"vmovl.u8 q10, d20 \n" // r
- "vmovl.u8 q15, d22 \n" // a
+ "vmovl.u8 q11, d22 \n" // a
"vmul.s16 q12, q8, d0[0] \n" // B = B * Matrix B
"vmul.s16 q13, q8, d1[0] \n" // G = B * Matrix G
"vmul.s16 q14, q8, d2[0] \n" // R = B * Matrix R
@@ -2853,10 +2778,10 @@ void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
"vqadd.s16 q13, q13, q5 \n" // Accumulate G
"vqadd.s16 q14, q14, q6 \n" // Accumulate R
"vqadd.s16 q15, q15, q7 \n" // Accumulate A
- "vmul.s16 q4, q15, d0[3] \n" // B += A * Matrix B
- "vmul.s16 q5, q15, d1[3] \n" // G += A * Matrix G
- "vmul.s16 q6, q15, d2[3] \n" // R += A * Matrix R
- "vmul.s16 q7, q15, d3[3] \n" // A += A * Matrix A
+ "vmul.s16 q4, q11, d0[3] \n" // B += A * Matrix B
+ "vmul.s16 q5, q11, d1[3] \n" // G += A * Matrix G
+ "vmul.s16 q6, q11, d2[3] \n" // R += A * Matrix R
+ "vmul.s16 q7, q11, d3[3] \n" // A += A * Matrix A
"vqadd.s16 q12, q12, q4 \n" // Accumulate B
"vqadd.s16 q13, q13, q5 \n" // Accumulate G
"vqadd.s16 q14, q14, q6 \n" // Accumulate R
@@ -2872,7 +2797,7 @@ void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
"+r"(dst_argb), // %1
"+r"(width) // %2
: "r"(matrix_argb) // %3
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9",
+ : "cc", "memory", "q0", "q1", "q2", "q4", "q5", "q6", "q7", "q8", "q9",
"q10", "q11", "q12", "q13", "q14", "q15"
);
}
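The fix above keeps alpha in q11 so the q15 accumulator no longer clobbers it. Per pixel, the row applies a signed 4x4 matrix with 6 fraction bits; a scalar sketch with the lane layout inferred from the d-register indices above (illustrative, not libyuv's C reference):

static void ARGBColorMatrixPixel_Sketch(const uint8 src[4], uint8 dst[4],
                                        const int8 m[16]) {
  int b = src[0], g = src[1], r = src[2], a = src[3];
  int i;
  for (i = 0; i < 4; ++i) {  // i indexes the output channel: B, G, R, A
    int v = (b * m[4 * i + 0] + g * m[4 * i + 1] +
             r * m[4 * i + 2] + a * m[4 * i + 3]) >> 6;
    dst[i] = (uint8)(v < 0 ? 0 : (v > 255 ? 255 : v));
  }
}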
@@ -3140,7 +3065,7 @@ void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
: "cc", "memory", "q0", "q1" // Clobber List
);
}
-#endif // __ARM_NEON__
+#endif // defined(__ARM_NEON__) && !defined(__aarch64__)
#ifdef __cplusplus
} // extern "C"
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/row_neon64.cc b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/row_neon64.cc
index 21111cf60f3..ddccd5d98b7 100644
--- a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/row_neon64.cc
+++ b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/row_neon64.cc
@@ -1,5 +1,5 @@
/*
- * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ * Copyright 2014 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
@@ -15,113 +15,157 @@ namespace libyuv {
extern "C" {
#endif
-// This module is for GCC Neon
+// This module is for GCC Neon armv8 64 bit.
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
// Read 8 Y, 4 U and 4 V from 422
#define READYUV422 \
MEMACCESS(0) \
- "vld1.8 {d0}, [%0]! \n" \
+ "ld1 {v0.8b}, [%0], #8 \n" \
MEMACCESS(1) \
- "vld1.32 {d2[0]}, [%1]! \n" \
+ "ld1 {v1.s}[0], [%1], #4 \n" \
MEMACCESS(2) \
- "vld1.32 {d2[1]}, [%2]! \n"
+ "ld1 {v1.s}[1], [%2], #4 \n"
// Read 8 Y, 2 U and 2 V from 422
#define READYUV411 \
MEMACCESS(0) \
- "vld1.8 {d0}, [%0]! \n" \
+ "ld1 {v0.8b}, [%0], #8 \n" \
MEMACCESS(1) \
- "vld1.16 {d2[0]}, [%1]! \n" \
+ "ld1 {v2.h}[0], [%1], #2 \n" \
MEMACCESS(2) \
- "vld1.16 {d2[1]}, [%2]! \n" \
- "vmov.u8 d3, d2 \n" \
- "vzip.u8 d2, d3 \n"
+ "ld1 {v2.h}[1], [%2], #2 \n" \
+ "zip1 v1.8b, v2.8b, v2.8b \n"
// Read 8 Y, 8 U and 8 V from 444
#define READYUV444 \
MEMACCESS(0) \
- "vld1.8 {d0}, [%0]! \n" \
+ "ld1 {v0.8b}, [%0], #8 \n" \
MEMACCESS(1) \
- "vld1.8 {d2}, [%1]! \n" \
+ "ld1 {v1.d}[0], [%1], #8 \n" \
MEMACCESS(2) \
- "vld1.8 {d3}, [%2]! \n" \
- "vpaddl.u8 q1, q1 \n" \
- "vrshrn.u16 d2, q1, #1 \n"
+ "ld1 {v1.d}[1], [%2], #8 \n" \
+ "uaddlp v1.8h, v1.16b \n" \
+ "rshrn v1.8b, v1.8h, #1 \n"
// Read 8 Y, and set 4 U and 4 V to 128
#define READYUV400 \
MEMACCESS(0) \
- "vld1.8 {d0}, [%0]! \n" \
- "vmov.u8 d2, #128 \n"
+ "ld1 {v0.8b}, [%0], #8 \n" \
+ "movi v1.8b , #128 \n"
// Read 8 Y and 4 UV from NV12
#define READNV12 \
MEMACCESS(0) \
- "vld1.8 {d0}, [%0]! \n" \
+ "ld1 {v0.8b}, [%0], #8 \n" \
MEMACCESS(1) \
- "vld1.8 {d2}, [%1]! \n" \
- "vmov.u8 d3, d2 \n"/* split odd/even uv apart */\
- "vuzp.u8 d2, d3 \n" \
- "vtrn.u32 d2, d3 \n"
+ "ld1 {v2.8b}, [%1], #8 \n" \
+ "uzp1 v1.8b, v2.8b, v2.8b \n" \
+ "uzp2 v3.8b, v2.8b, v2.8b \n" \
+ "ins v1.s[1], v3.s[0] \n"
// Read 8 Y and 4 VU from NV21
#define READNV21 \
MEMACCESS(0) \
- "vld1.8 {d0}, [%0]! \n" \
+ "ld1 {v0.8b}, [%0], #8 \n" \
MEMACCESS(1) \
- "vld1.8 {d2}, [%1]! \n" \
- "vmov.u8 d3, d2 \n"/* split odd/even uv apart */\
- "vuzp.u8 d3, d2 \n" \
- "vtrn.u32 d2, d3 \n"
+ "ld1 {v2.8b}, [%1], #8 \n" \
+ "uzp1 v3.8b, v2.8b, v2.8b \n" \
+ "uzp2 v1.8b, v2.8b, v2.8b \n" \
+ "ins v1.s[1], v3.s[0] \n"
// Read 8 YUY2
#define READYUY2 \
MEMACCESS(0) \
- "vld2.8 {d0, d2}, [%0]! \n" \
- "vmov.u8 d3, d2 \n" \
- "vuzp.u8 d2, d3 \n" \
- "vtrn.u32 d2, d3 \n"
+ "ld2 {v0.8b, v1.8b}, [%0], #16 \n" \
+ "uzp2 v3.8b, v1.8b, v1.8b \n" \
+ "uzp1 v1.8b, v1.8b, v1.8b \n" \
+ "ins v1.s[1], v3.s[0] \n"
// Read 8 UYVY
#define READUYVY \
MEMACCESS(0) \
- "vld2.8 {d2, d3}, [%0]! \n" \
- "vmov.u8 d0, d3 \n" \
- "vmov.u8 d3, d2 \n" \
- "vuzp.u8 d2, d3 \n" \
- "vtrn.u32 d2, d3 \n"
-
-#define YUV422TORGB \
- "veor.u8 d2, d26 \n"/*subtract 128 from u and v*/\
- "vmull.s8 q8, d2, d24 \n"/* u/v B/R component */\
- "vmull.s8 q9, d2, d25 \n"/* u/v G component */\
- "vmov.u8 d1, #0 \n"/* split odd/even y apart */\
- "vtrn.u8 d0, d1 \n" \
- "vsub.s16 q0, q0, q15 \n"/* offset y */\
- "vmul.s16 q0, q0, q14 \n" \
- "vadd.s16 d18, d19 \n" \
- "vqadd.s16 d20, d0, d16 \n" /* B */ \
- "vqadd.s16 d21, d1, d16 \n" \
- "vqadd.s16 d22, d0, d17 \n" /* R */ \
- "vqadd.s16 d23, d1, d17 \n" \
- "vqadd.s16 d16, d0, d18 \n" /* G */ \
- "vqadd.s16 d17, d1, d18 \n" \
- "vqshrun.s16 d0, q10, #6 \n" /* B */ \
- "vqshrun.s16 d1, q11, #6 \n" /* G */ \
- "vqshrun.s16 d2, q8, #6 \n" /* R */ \
- "vmovl.u8 q10, d0 \n"/* set up for reinterleave*/\
- "vmovl.u8 q11, d1 \n" \
- "vmovl.u8 q8, d2 \n" \
- "vtrn.u8 d20, d21 \n" \
- "vtrn.u8 d22, d23 \n" \
- "vtrn.u8 d16, d17 \n" \
- "vmov.u8 d21, d16 \n"
-
-static vec8 kUVToRB = { 127, 127, 127, 127, 102, 102, 102, 102,
- 0, 0, 0, 0, 0, 0, 0, 0 };
-static vec8 kUVToG = { -25, -25, -25, -25, -52, -52, -52, -52,
- 0, 0, 0, 0, 0, 0, 0, 0 };
+ "ld2 {v2.8b, v3.8b}, [%0], #16 \n" \
+ "orr v0.8b, v3.8b, v3.8b \n" \
+ "uzp1 v1.8b, v2.8b, v2.8b \n" \
+ "uzp2 v3.8b, v2.8b, v2.8b \n" \
+ "ins v1.s[1], v3.s[0] \n"
+
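A note on the input normalization: every READ* macro above leaves v0 holding 8 Y samples and v1 holding 4 U bytes followed by 4 V bytes, regardless of source layout.

// Byte order reminders for the packed sources handled above:
//   YUY2: Y0 U0 Y1 V0 Y2 U1 Y3 V1 ...
//   UYVY: U0 Y0 V0 Y1 U1 Y2 V1 Y3 ...
//   NV12: U0 V0 U1 V1 ...   NV21: V0 U0 V1 U1 ...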
+#define YUV422TORGB_SETUP_REG \
+ "ld1r {v24.8h}, [%[kUVBiasBGR]], #2 \n" \
+ "ld1r {v25.8h}, [%[kUVBiasBGR]], #2 \n" \
+ "ld1r {v26.8h}, [%[kUVBiasBGR]] \n" \
+ "ld1r {v31.4s}, [%[kYToRgb]] \n" \
+ "movi v27.8h, #128 \n" \
+ "movi v28.8h, #102 \n" \
+ "movi v29.8h, #25 \n" \
+ "movi v30.8h, #52 \n"
+
+#define YUV422TORGB(vR, vG, vB) \
+ "uxtl v0.8h, v0.8b \n" /* Extract Y */ \
+ "shll v2.8h, v1.8b, #8 \n" /* Replicate UV */ \
+ "ushll2 v3.4s, v0.8h, #0 \n" /* Y */ \
+ "ushll v0.4s, v0.4h, #0 \n" \
+ "mul v3.4s, v3.4s, v31.4s \n" \
+ "mul v0.4s, v0.4s, v31.4s \n" \
+ "sqshrun v0.4h, v0.4s, #16 \n" \
+ "sqshrun2 v0.8h, v3.4s, #16 \n" /* Y */ \
+ "uaddw v1.8h, v2.8h, v1.8b \n" /* Replicate UV */ \
+ "mov v2.d[0], v1.d[1] \n" /* Extract V */ \
+ "uxtl v2.8h, v2.8b \n" \
+ "uxtl v1.8h, v1.8b \n" /* Extract U */ \
+ "mul v3.8h, v1.8h, v27.8h \n" \
+ "mul v5.8h, v1.8h, v29.8h \n" \
+ "mul v6.8h, v2.8h, v30.8h \n" \
+ "mul v7.8h, v2.8h, v28.8h \n" \
+ "sqadd v6.8h, v6.8h, v5.8h \n" \
+ "sqadd " #vB ".8h, v24.8h, v0.8h \n" /* B */ \
+ "sqadd " #vG ".8h, v25.8h, v0.8h \n" /* G */ \
+ "sqadd " #vR ".8h, v26.8h, v0.8h \n" /* R */ \
+ "sqadd " #vB ".8h, " #vB ".8h, v3.8h \n" /* B */ \
+ "sqsub " #vG ".8h, " #vG ".8h, v6.8h \n" /* G */ \
+ "sqadd " #vR ".8h, " #vR ".8h, v7.8h \n" /* R */ \
+ "sqshrun " #vB ".8b, " #vB ".8h, #6 \n" /* B */ \
+ "sqshrun " #vG ".8b, " #vG ".8h, #6 \n" /* G */ \
+ "sqshrun " #vR ".8b, " #vR ".8h, #6 \n" /* R */ \
+
+// YUV to RGB conversion constants.
+// Y contribution to R,G,B. Scale and bias.
+#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
+#define YGB 1160 /* 1.164 * 64 * 16 - adjusted for even error distribution */
+
+// U and V contributions to R,G,B.
+#define UB -128 /* -min(128, round(2.018 * 64)) */
+#define UG 25 /* -round(-0.391 * 64) */
+#define VG 52 /* -round(-0.813 * 64) */
+#define VR -102 /* -round(1.596 * 64) */
+
+// Bias values to subtract 16 from Y and 128 from U and V.
+#define BB (UB * 128 - YGB)
+#define BG (UG * 128 + VG * 128 - YGB)
+#define BR (VR * 128 - YGB)
+
+static vec16 kUVBiasBGR = { BB, BG, BR, 0, 0, 0, 0, 0 };
+static vec32 kYToRgb = { 0x0101 * YG, 0, 0, 0 };
+
+#undef YG
+#undef YGB
+#undef UB
+#undef UG
+#undef VG
+#undef VR
+#undef BB
+#undef BG
+#undef BR
+
+#define RGBTOUV_SETUP_REG \
+ "movi v20.8h, #56, lsl #0 \n" /* UB/VR coefficient (0.875) / 2 */ \
+ "movi v21.8h, #37, lsl #0 \n" /* UG coefficient (-0.5781) / 2 */ \
+ "movi v22.8h, #19, lsl #0 \n" /* UR coefficient (-0.2969) / 2 */ \
+ "movi v23.8h, #9, lsl #0 \n" /* VB coefficient (-0.1406) / 2 */ \
+ "movi v24.8h, #47, lsl #0 \n" /* VG coefficient (-0.7344) / 2 */ \
+ "movi v25.16b, #0x80 \n" /* 128.5 (0x8080 in 16-bit) */
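The halved weights above match the usual libyuv RGB->UV coefficients divided by two, the other factor coming from the 2x2 subsampling average. An unaveraged single-pixel sketch using the full weights (RgbToUv_Sketch is an illustrative name):

// 0x8080 supplies the +128 chroma bias plus 0.5 of rounding before >> 8.
static void RgbToUv_Sketch(int r, int g, int b, uint8* u, uint8* v) {
  *u = (uint8)((112 * b - 74 * g - 38 * r + 0x8080) >> 8);
  *v = (uint8)((112 * r - 94 * g - 18 * b + 0x8080) >> 8);
}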
+
#ifdef HAS_I444TOARGBROW_NEON
void I444ToARGBRow_NEON(const uint8* src_y,
@@ -130,31 +174,24 @@ void I444ToARGBRow_NEON(const uint8* src_y,
uint8* dst_argb,
int width) {
asm volatile (
- MEMACCESS(5)
- "vld1.8 {d24}, [%5] \n"
- MEMACCESS(6)
- "vld1.8 {d25}, [%6] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
- ".p2align 2 \n"
+ YUV422TORGB_SETUP_REG
"1: \n"
READYUV444
- YUV422TORGB
+ YUV422TORGB(v22, v21, v20)
"subs %4, %4, #8 \n"
- "vmov.u8 d23, #255 \n"
+ "movi v23.8b, #255 \n" /* A */
MEMACCESS(3)
- "vst4.8 {d20, d21, d22, d23}, [%3]! \n"
- "bgt 1b \n"
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
"+r"(dst_argb), // %3
"+r"(width) // %4
- : "r"(&kUVToRB), // %5
- "r"(&kUVToG) // %6
- : "cc", "memory", "q0", "q1", "q2", "q3",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ : [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
);
}
#endif // HAS_I444TOARGBROW_NEON
@@ -166,31 +203,24 @@ void I422ToARGBRow_NEON(const uint8* src_y,
uint8* dst_argb,
int width) {
asm volatile (
- MEMACCESS(5)
- "vld1.8 {d24}, [%5] \n"
- MEMACCESS(6)
- "vld1.8 {d25}, [%6] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
- ".p2align 2 \n"
+ YUV422TORGB_SETUP_REG
"1: \n"
READYUV422
- YUV422TORGB
+ YUV422TORGB(v22, v21, v20)
"subs %4, %4, #8 \n"
- "vmov.u8 d23, #255 \n"
+ "movi v23.8b, #255 \n" /* A */
MEMACCESS(3)
- "vst4.8 {d20, d21, d22, d23}, [%3]! \n"
- "bgt 1b \n"
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
"+r"(dst_argb), // %3
"+r"(width) // %4
- : "r"(&kUVToRB), // %5
- "r"(&kUVToG) // %6
- : "cc", "memory", "q0", "q1", "q2", "q3",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ : [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
);
}
#endif // HAS_I422TOARGBROW_NEON
@@ -202,31 +232,24 @@ void I411ToARGBRow_NEON(const uint8* src_y,
uint8* dst_argb,
int width) {
asm volatile (
- MEMACCESS(5)
- "vld1.8 {d24}, [%5] \n"
- MEMACCESS(6)
- "vld1.8 {d25}, [%6] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
- ".p2align 2 \n"
+ YUV422TORGB_SETUP_REG
"1: \n"
READYUV411
- YUV422TORGB
+ YUV422TORGB(v22, v21, v20)
"subs %4, %4, #8 \n"
- "vmov.u8 d23, #255 \n"
+ "movi v23.8b, #255 \n" /* A */
MEMACCESS(3)
- "vst4.8 {d20, d21, d22, d23}, [%3]! \n"
- "bgt 1b \n"
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
"+r"(dst_argb), // %3
"+r"(width) // %4
- : "r"(&kUVToRB), // %5
- "r"(&kUVToG) // %6
- : "cc", "memory", "q0", "q1", "q2", "q3",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ : [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
);
}
#endif // HAS_I411TOARGBROW_NEON
@@ -238,32 +261,24 @@ void I422ToBGRARow_NEON(const uint8* src_y,
uint8* dst_bgra,
int width) {
asm volatile (
- MEMACCESS(5)
- "vld1.8 {d24}, [%5] \n"
- MEMACCESS(6)
- "vld1.8 {d25}, [%6] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
- ".p2align 2 \n"
+ YUV422TORGB_SETUP_REG
"1: \n"
READYUV422
- YUV422TORGB
+ YUV422TORGB(v21, v22, v23)
"subs %4, %4, #8 \n"
- "vswp.u8 d20, d22 \n"
- "vmov.u8 d19, #255 \n"
+ "movi v20.8b, #255 \n" /* A */
MEMACCESS(3)
- "vst4.8 {d19, d20, d21, d22}, [%3]! \n"
- "bgt 1b \n"
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
"+r"(dst_bgra), // %3
"+r"(width) // %4
- : "r"(&kUVToRB), // %5
- "r"(&kUVToG) // %6
- : "cc", "memory", "q0", "q1", "q2", "q3",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ : [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
);
}
#endif // HAS_I422TOBGRAROW_NEON
@@ -275,32 +290,24 @@ void I422ToABGRRow_NEON(const uint8* src_y,
uint8* dst_abgr,
int width) {
asm volatile (
- MEMACCESS(5)
- "vld1.8 {d24}, [%5] \n"
- MEMACCESS(6)
- "vld1.8 {d25}, [%6] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
- ".p2align 2 \n"
+ YUV422TORGB_SETUP_REG
"1: \n"
READYUV422
- YUV422TORGB
+ YUV422TORGB(v20, v21, v22)
"subs %4, %4, #8 \n"
- "vswp.u8 d20, d22 \n"
- "vmov.u8 d23, #255 \n"
+ "movi v23.8b, #255 \n" /* A */
MEMACCESS(3)
- "vst4.8 {d20, d21, d22, d23}, [%3]! \n"
- "bgt 1b \n"
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
"+r"(dst_abgr), // %3
"+r"(width) // %4
- : "r"(&kUVToRB), // %5
- "r"(&kUVToG) // %6
- : "cc", "memory", "q0", "q1", "q2", "q3",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ : [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
);
}
#endif // HAS_I422TOABGRROW_NEON
@@ -312,31 +319,24 @@ void I422ToRGBARow_NEON(const uint8* src_y,
uint8* dst_rgba,
int width) {
asm volatile (
- MEMACCESS(5)
- "vld1.8 {d24}, [%5] \n"
- MEMACCESS(6)
- "vld1.8 {d25}, [%6] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
- ".p2align 2 \n"
+ YUV422TORGB_SETUP_REG
"1: \n"
READYUV422
- YUV422TORGB
+ YUV422TORGB(v23, v22, v21)
"subs %4, %4, #8 \n"
- "vmov.u8 d19, #255 \n"
+ "movi v20.8b, #255 \n" /* A */
MEMACCESS(3)
- "vst4.8 {d19, d20, d21, d22}, [%3]! \n"
- "bgt 1b \n"
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
"+r"(dst_rgba), // %3
"+r"(width) // %4
- : "r"(&kUVToRB), // %5
- "r"(&kUVToG) // %6
- : "cc", "memory", "q0", "q1", "q2", "q3",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ : [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
);
}
#endif // HAS_I422TORGBAROW_NEON
@@ -348,30 +348,23 @@ void I422ToRGB24Row_NEON(const uint8* src_y,
uint8* dst_rgb24,
int width) {
asm volatile (
- MEMACCESS(5)
- "vld1.8 {d24}, [%5] \n"
- MEMACCESS(6)
- "vld1.8 {d25}, [%6] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
- ".p2align 2 \n"
+ YUV422TORGB_SETUP_REG
"1: \n"
READYUV422
- YUV422TORGB
+ YUV422TORGB(v22, v21, v20)
"subs %4, %4, #8 \n"
MEMACCESS(3)
- "vst3.8 {d20, d21, d22}, [%3]! \n"
- "bgt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(dst_rgb24), // %3
- "+r"(width) // %4
- : "r"(&kUVToRB), // %5
- "r"(&kUVToG) // %6
- : "cc", "memory", "q0", "q1", "q2", "q3",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ "st3 {v20.8b,v21.8b,v22.8b}, [%3], #24 \n"
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_rgb24), // %3
+ "+r"(width) // %4
+ : [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
);
}
#endif // HAS_I422TORGB24ROW_NEON
@@ -383,46 +376,33 @@ void I422ToRAWRow_NEON(const uint8* src_y,
uint8* dst_raw,
int width) {
asm volatile (
- MEMACCESS(5)
- "vld1.8 {d24}, [%5] \n"
- MEMACCESS(6)
- "vld1.8 {d25}, [%6] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
- ".p2align 2 \n"
+ YUV422TORGB_SETUP_REG
"1: \n"
READYUV422
- YUV422TORGB
+ YUV422TORGB(v20, v21, v22)
"subs %4, %4, #8 \n"
- "vswp.u8 d20, d22 \n"
MEMACCESS(3)
- "vst3.8 {d20, d21, d22}, [%3]! \n"
- "bgt 1b \n"
- : "+r"(src_y), // %0
- "+r"(src_u), // %1
- "+r"(src_v), // %2
- "+r"(dst_raw), // %3
- "+r"(width) // %4
- : "r"(&kUVToRB), // %5
- "r"(&kUVToG) // %6
- : "cc", "memory", "q0", "q1", "q2", "q3",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ "st3 {v20.8b,v21.8b,v22.8b}, [%3], #24 \n"
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_raw), // %3
+ "+r"(width) // %4
+ : [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
);
}
#endif // HAS_I422TORAWROW_NEON
#define ARGBTORGB565 \
- "vshr.u8 d20, d20, #3 \n" /* B */ \
- "vshr.u8 d21, d21, #2 \n" /* G */ \
- "vshr.u8 d22, d22, #3 \n" /* R */ \
- "vmovl.u8 q8, d20 \n" /* B */ \
- "vmovl.u8 q9, d21 \n" /* G */ \
- "vmovl.u8 q10, d22 \n" /* R */ \
- "vshl.u16 q9, q9, #5 \n" /* G */ \
- "vshl.u16 q10, q10, #11 \n" /* R */ \
- "vorr q0, q8, q9 \n" /* BG */ \
- "vorr q0, q0, q10 \n" /* BGR */
+ "shll v0.8h, v22.8b, #8 \n" /* R */ \
+ "shll v20.8h, v20.8b, #8 \n" /* B */ \
+ "shll v21.8h, v21.8b, #8 \n" /* G */ \
+ "sri v0.8h, v21.8h, #5 \n" /* RG */ \
+ "sri v0.8h, v20.8h, #11 \n" /* RGB */
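The shift-right-and-insert sequence above packs 8:8:8 down to 5:6:5; the scalar equivalent for one pixel (sketch):

static uint16 PackRGB565_Sketch(uint8 b, uint8 g, uint8 r) {
  return (uint16)(((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3));
}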
#ifdef HAS_I422TORGB565ROW_NEON
void I422ToRGB565Row_NEON(const uint8* src_y,
@@ -431,49 +411,36 @@ void I422ToRGB565Row_NEON(const uint8* src_y,
uint8* dst_rgb565,
int width) {
asm volatile (
- MEMACCESS(5)
- "vld1.8 {d24}, [%5] \n"
- MEMACCESS(6)
- "vld1.8 {d25}, [%6] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
- ".p2align 2 \n"
+ YUV422TORGB_SETUP_REG
"1: \n"
READYUV422
- YUV422TORGB
+ YUV422TORGB(v22, v21, v20)
"subs %4, %4, #8 \n"
ARGBTORGB565
MEMACCESS(3)
- "vst1.8 {q0}, [%3]! \n" // store 8 pixels RGB565.
- "bgt 1b \n"
+ "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels RGB565.
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
"+r"(dst_rgb565), // %3
"+r"(width) // %4
- : "r"(&kUVToRB), // %5
- "r"(&kUVToG) // %6
- : "cc", "memory", "q0", "q1", "q2", "q3",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ : [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
);
}
#endif // HAS_I422TORGB565ROW_NEON
#define ARGBTOARGB1555 \
- "vshr.u8 q10, q10, #3 \n" /* B */ \
- "vshr.u8 d22, d22, #3 \n" /* R */ \
- "vshr.u8 d23, d23, #7 \n" /* A */ \
- "vmovl.u8 q8, d20 \n" /* B */ \
- "vmovl.u8 q9, d21 \n" /* G */ \
- "vmovl.u8 q10, d22 \n" /* R */ \
- "vmovl.u8 q11, d23 \n" /* A */ \
- "vshl.u16 q9, q9, #5 \n" /* G */ \
- "vshl.u16 q10, q10, #10 \n" /* R */ \
- "vshl.u16 q11, q11, #15 \n" /* A */ \
- "vorr q0, q8, q9 \n" /* BG */ \
- "vorr q1, q10, q11 \n" /* RA */ \
- "vorr q0, q0, q1 \n" /* BGRA */
+ "shll v0.8h, v23.8b, #8 \n" /* A */ \
+ "shll v22.8h, v22.8b, #8 \n" /* R */ \
+ "shll v20.8h, v20.8b, #8 \n" /* B */ \
+ "shll v21.8h, v21.8b, #8 \n" /* G */ \
+ "sri v0.8h, v22.8h, #1 \n" /* AR */ \
+ "sri v0.8h, v21.8h, #6 \n" /* ARG */ \
+ "sri v0.8h, v20.8h, #11 \n" /* ARGB */
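Same idea for 1:5:5:5; one pixel in scalar form (sketch):

static uint16 PackARGB1555_Sketch(uint8 b, uint8 g, uint8 r, uint8 a) {
  return (uint16)(((a >> 7) << 15) | ((r >> 3) << 10) |
                  ((g >> 3) << 5) | (b >> 3));
}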
#ifdef HAS_I422TOARGB1555ROW_NEON
void I422ToARGB1555Row_NEON(const uint8* src_y,
@@ -482,44 +449,38 @@ void I422ToARGB1555Row_NEON(const uint8* src_y,
uint8* dst_argb1555,
int width) {
asm volatile (
- MEMACCESS(5)
- "vld1.8 {d24}, [%5] \n"
- MEMACCESS(6)
- "vld1.8 {d25}, [%6] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
- ".p2align 2 \n"
+ YUV422TORGB_SETUP_REG
"1: \n"
READYUV422
- YUV422TORGB
+ YUV422TORGB(v22, v21, v20)
"subs %4, %4, #8 \n"
- "vmov.u8 d23, #255 \n"
+ "movi v23.8b, #255 \n"
ARGBTOARGB1555
MEMACCESS(3)
- "vst1.8 {q0}, [%3]! \n" // store 8 pixels ARGB1555.
- "bgt 1b \n"
+    "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels ARGB1555.
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
"+r"(dst_argb1555), // %3
"+r"(width) // %4
- : "r"(&kUVToRB), // %5
- "r"(&kUVToG) // %6
- : "cc", "memory", "q0", "q1", "q2", "q3",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ : [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
);
}
#endif // HAS_I422TOARGB1555ROW_NEON
#define ARGBTOARGB4444 \
- "vshr.u8 d20, d20, #4 \n" /* B */ \
- "vbic.32 d21, d21, d4 \n" /* G */ \
- "vshr.u8 d22, d22, #4 \n" /* R */ \
- "vbic.32 d23, d23, d4 \n" /* A */ \
- "vorr d0, d20, d21 \n" /* BG */ \
- "vorr d1, d22, d23 \n" /* RA */ \
- "vzip.u8 d0, d1 \n" /* BGRA */
+ /* Input v20.8b<=B, v21.8b<=G, v22.8b<=R, v23.8b<=A, v4.8b<=0x0f */ \
+ "ushr v20.8b, v20.8b, #4 \n" /* B */ \
+ "bic v21.8b, v21.8b, v4.8b \n" /* G */ \
+ "ushr v22.8b, v22.8b, #4 \n" /* R */ \
+ "bic v23.8b, v23.8b, v4.8b \n" /* A */ \
+ "orr v0.8b, v20.8b, v21.8b \n" /* BG */ \
+ "orr v1.8b, v22.8b, v23.8b \n" /* RA */ \
+ "zip1 v0.16b, v0.16b, v1.16b \n" /* BGRA */
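And for 4:4:4:4, where the bic/ushr pair keeps one nibble per channel (sketch):

static uint16 PackARGB4444_Sketch(uint8 b, uint8 g, uint8 r, uint8 a) {
  return (uint16)(((a >> 4) << 12) | ((r >> 4) << 8) |
                  ((g >> 4) << 4) | (b >> 4));
}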
#ifdef HAS_I422TOARGB4444ROW_NEON
void I422ToARGB4444Row_NEON(const uint8* src_y,
@@ -528,33 +489,26 @@ void I422ToARGB4444Row_NEON(const uint8* src_y,
uint8* dst_argb4444,
int width) {
asm volatile (
- MEMACCESS(5)
- "vld1.8 {d24}, [%5] \n"
- MEMACCESS(6)
- "vld1.8 {d25}, [%6] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
- "vmov.u8 d4, #0x0f \n" // bits to clear with vbic.
- ".p2align 2 \n"
+ YUV422TORGB_SETUP_REG
+ "movi v4.16b, #0x0f \n" // bits to clear with vbic.
"1: \n"
READYUV422
- YUV422TORGB
+ YUV422TORGB(v22, v21, v20)
"subs %4, %4, #8 \n"
- "vmov.u8 d23, #255 \n"
+ "movi v23.8b, #255 \n"
ARGBTOARGB4444
MEMACCESS(3)
- "vst1.8 {q0}, [%3]! \n" // store 8 pixels ARGB4444.
- "bgt 1b \n"
+ "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels ARGB4444.
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
"+r"(dst_argb4444), // %3
"+r"(width) // %4
- : "r"(&kUVToRB), // %5
- "r"(&kUVToG) // %6
- : "cc", "memory", "q0", "q1", "q2", "q3",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ : [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
);
}
#endif // HAS_I422TOARGB4444ROW_NEON
@@ -564,29 +518,22 @@ void YToARGBRow_NEON(const uint8* src_y,
uint8* dst_argb,
int width) {
asm volatile (
- MEMACCESS(3)
- "vld1.8 {d24}, [%3] \n"
- MEMACCESS(4)
- "vld1.8 {d25}, [%4] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
- ".p2align 2 \n"
+ YUV422TORGB_SETUP_REG
"1: \n"
READYUV400
- YUV422TORGB
+ YUV422TORGB(v22, v21, v20)
"subs %2, %2, #8 \n"
- "vmov.u8 d23, #255 \n"
+ "movi v23.8b, #255 \n"
MEMACCESS(1)
- "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
- "bgt 1b \n"
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
- : "r"(&kUVToRB), // %3
- "r"(&kUVToG) // %4
- : "cc", "memory", "q0", "q1", "q2", "q3",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ : [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
);
}
#endif // HAS_YTOARGBROW_NEON
@@ -596,22 +543,21 @@ void I400ToARGBRow_NEON(const uint8* src_y,
uint8* dst_argb,
int width) {
asm volatile (
- ".p2align 2 \n"
- "vmov.u8 d23, #255 \n"
+ "movi v23.8b, #255 \n"
"1: \n"
MEMACCESS(0)
- "vld1.8 {d20}, [%0]! \n"
- "vmov d21, d20 \n"
- "vmov d22, d20 \n"
+ "ld1 {v20.8b}, [%0], #8 \n"
+ "orr v21.8b, v20.8b, v20.8b \n"
+ "orr v22.8b, v20.8b, v20.8b \n"
"subs %2, %2, #8 \n"
MEMACCESS(1)
- "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
- "bgt 1b \n"
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
:
- : "cc", "memory", "d20", "d21", "d22", "d23"
+ : "cc", "memory", "v20", "v21", "v22", "v23"
);
}
#endif // HAS_I400TOARGBROW_NEON
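For reference, the orr-as-mov idiom above just replicates the gray value into all three color channels; scalar model (sketch name only):

static void I400ToARGB_Sketch(const uint8* src_y, uint8* dst_argb, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_argb[4 * x + 0] = src_y[x];  // B
    dst_argb[4 * x + 1] = src_y[x];  // G
    dst_argb[4 * x + 2] = src_y[x];  // R
    dst_argb[4 * x + 3] = 255;       // A
  }
}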
@@ -622,30 +568,23 @@ void NV12ToARGBRow_NEON(const uint8* src_y,
uint8* dst_argb,
int width) {
asm volatile (
- MEMACCESS(4)
- "vld1.8 {d24}, [%4] \n"
- MEMACCESS(5)
- "vld1.8 {d25}, [%5] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
- ".p2align 2 \n"
+ YUV422TORGB_SETUP_REG
"1: \n"
READNV12
- YUV422TORGB
+ YUV422TORGB(v22, v21, v20)
"subs %3, %3, #8 \n"
- "vmov.u8 d23, #255 \n"
+ "movi v23.8b, #255 \n"
MEMACCESS(2)
- "vst4.8 {d20, d21, d22, d23}, [%2]! \n"
- "bgt 1b \n"
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n"
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_uv), // %1
"+r"(dst_argb), // %2
"+r"(width) // %3
- : "r"(&kUVToRB), // %4
- "r"(&kUVToG) // %5
- : "cc", "memory", "q0", "q1", "q2", "q3",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ : [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
);
}
#endif // HAS_NV12TOARGBROW_NEON
@@ -656,30 +595,23 @@ void NV21ToARGBRow_NEON(const uint8* src_y,
uint8* dst_argb,
int width) {
asm volatile (
- MEMACCESS(4)
- "vld1.8 {d24}, [%4] \n"
- MEMACCESS(5)
- "vld1.8 {d25}, [%5] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
- ".p2align 2 \n"
+ YUV422TORGB_SETUP_REG
"1: \n"
READNV21
- YUV422TORGB
+ YUV422TORGB(v22, v21, v20)
"subs %3, %3, #8 \n"
- "vmov.u8 d23, #255 \n"
+ "movi v23.8b, #255 \n"
MEMACCESS(2)
- "vst4.8 {d20, d21, d22, d23}, [%2]! \n"
- "bgt 1b \n"
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n"
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_uv), // %1
"+r"(dst_argb), // %2
"+r"(width) // %3
- : "r"(&kUVToRB), // %4
- "r"(&kUVToG) // %5
- : "cc", "memory", "q0", "q1", "q2", "q3",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ : [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
);
}
#endif // HAS_NV21TOARGBROW_NEON
@@ -690,30 +622,23 @@ void NV12ToRGB565Row_NEON(const uint8* src_y,
uint8* dst_rgb565,
int width) {
asm volatile (
- MEMACCESS(4)
- "vld1.8 {d24}, [%4] \n"
- MEMACCESS(5)
- "vld1.8 {d25}, [%5] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
- ".p2align 2 \n"
+ YUV422TORGB_SETUP_REG
"1: \n"
READNV12
- YUV422TORGB
+ YUV422TORGB(v22, v21, v20)
"subs %3, %3, #8 \n"
ARGBTORGB565
MEMACCESS(2)
- "vst1.8 {q0}, [%2]! \n" // store 8 pixels RGB565.
- "bgt 1b \n"
+    "st1 {v0.8h}, [%2], #16 \n" // store 8 pixels RGB565.
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_uv), // %1
"+r"(dst_rgb565), // %2
"+r"(width) // %3
- : "r"(&kUVToRB), // %4
- "r"(&kUVToG) // %5
- : "cc", "memory", "q0", "q1", "q2", "q3",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ : [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
);
}
#endif // HAS_NV12TORGB565ROW_NEON
@@ -724,30 +649,23 @@ void NV21ToRGB565Row_NEON(const uint8* src_y,
uint8* dst_rgb565,
int width) {
asm volatile (
- MEMACCESS(4)
- "vld1.8 {d24}, [%4] \n"
- MEMACCESS(5)
- "vld1.8 {d25}, [%5] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
- ".p2align 2 \n"
+ YUV422TORGB_SETUP_REG
"1: \n"
READNV21
- YUV422TORGB
+ YUV422TORGB(v22, v21, v20)
"subs %3, %3, #8 \n"
ARGBTORGB565
MEMACCESS(2)
- "vst1.8 {q0}, [%2]! \n" // store 8 pixels RGB565.
- "bgt 1b \n"
+    "st1 {v0.8h}, [%2], #16 \n" // store 8 pixels RGB565.
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_uv), // %1
"+r"(dst_rgb565), // %2
"+r"(width) // %3
- : "r"(&kUVToRB), // %4
- "r"(&kUVToG) // %5
- : "cc", "memory", "q0", "q1", "q2", "q3",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ : [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
);
}
#endif // HAS_NV21TORGB565ROW_NEON
@@ -757,29 +675,22 @@ void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
uint8* dst_argb,
int width) {
asm volatile (
- MEMACCESS(3)
- "vld1.8 {d24}, [%3] \n"
- MEMACCESS(4)
- "vld1.8 {d25}, [%4] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
- ".p2align 2 \n"
+ YUV422TORGB_SETUP_REG
"1: \n"
READYUY2
- YUV422TORGB
+ YUV422TORGB(v22, v21, v20)
"subs %2, %2, #8 \n"
- "vmov.u8 d23, #255 \n"
+ "movi v23.8b, #255 \n"
MEMACCESS(1)
- "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
- "bgt 1b \n"
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
+ "b.gt 1b \n"
: "+r"(src_yuy2), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
- : "r"(&kUVToRB), // %3
- "r"(&kUVToG) // %4
- : "cc", "memory", "q0", "q1", "q2", "q3",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ : [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
);
}
#endif // HAS_YUY2TOARGBROW_NEON
@@ -789,29 +700,22 @@ void UYVYToARGBRow_NEON(const uint8* src_uyvy,
uint8* dst_argb,
int width) {
asm volatile (
- MEMACCESS(3)
- "vld1.8 {d24}, [%3] \n"
- MEMACCESS(4)
- "vld1.8 {d25}, [%4] \n"
- "vmov.u8 d26, #128 \n"
- "vmov.u16 q14, #74 \n"
- "vmov.u16 q15, #16 \n"
- ".p2align 2 \n"
+ YUV422TORGB_SETUP_REG
"1: \n"
READUYVY
- YUV422TORGB
+ YUV422TORGB(v22, v21, v20)
"subs %2, %2, #8 \n"
- "vmov.u8 d23, #255 \n"
+ "movi v23.8b, #255 \n"
MEMACCESS(1)
- "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
- "bgt 1b \n"
+    "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
+ "b.gt 1b \n"
: "+r"(src_uyvy), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
- : "r"(&kUVToRB), // %3
- "r"(&kUVToG) // %4
- : "cc", "memory", "q0", "q1", "q2", "q3",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ : [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
);
}
#endif // HAS_UYVYTOARGBROW_NEON
@@ -821,16 +725,15 @@ void UYVYToARGBRow_NEON(const uint8* src_uyvy,
void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
int width) {
asm volatile (
- ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "ld2 {v0.16b, v1.16b}, [%0], #32 \n" // load 16 pairs of UV
+ "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pairs of UV
"subs %3, %3, #16 \n" // 16 processed per loop
MEMACCESS(1)
"st1 {v0.16b}, [%1], #16 \n" // store U
MEMACCESS(2)
"st1 {v1.16b}, [%2], #16 \n" // store V
- "bgt 1b \n"
+ "b.gt 1b \n"
: "+r"(src_uv), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
@@ -846,7 +749,6 @@ void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
int width) {
asm volatile (
- ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"ld1 {v0.16b}, [%0], #16 \n" // load U
@@ -854,8 +756,8 @@ void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
"ld1 {v1.16b}, [%1], #16 \n" // load V
"subs %3, %3, #16 \n" // 16 processed per loop
MEMACCESS(2)
- "st2 {v0.16b, v1.16b}, [%2], #32 \n" // store 16 pairs of UV
- "bgt 1b \n"
+ "st2 {v0.16b,v1.16b}, [%2], #32 \n" // store 16 pairs of UV
+ "b.gt 1b \n"
:
"+r"(src_u), // %0
"+r"(src_v), // %1
@@ -871,14 +773,13 @@ void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
#ifdef HAS_COPYROW_NEON
void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
asm volatile (
- ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "ld1 {v0.8b-v3.8b}, [%0], #32 \n" // load 32
+ "ld1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 32
"subs %2, %2, #32 \n" // 32 processed per loop
MEMACCESS(1)
- "st1 {v0.8b-v3.8b}, [%1], #32 \n" // store 32
- "bgt 1b \n"
+ "st1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 32
+ "b.gt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(count) // %2 // Output registers
@@ -888,35 +789,36 @@ void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
}
#endif // HAS_COPYROW_NEON
-// SetRow8 writes 'count' bytes using a 32 bit value repeated.
-#ifdef HAS_SETROW_NEON
-void SetRow_NEON(uint8* dst, uint32 v32, int count) {
+// SetRow writes 'count' bytes using an 8 bit value repeated.
+void SetRow_NEON(uint8* dst, uint8 v8, int count) {
asm volatile (
- "dup v0.4s, %w2 \n" // duplicate 4 ints
- "1: \n"
+ "dup v0.16b, %w2 \n" // duplicate 16 bytes
+ "1: \n"
"subs %1, %1, #16 \n" // 16 bytes per loop
MEMACCESS(0)
"st1 {v0.16b}, [%0], #16 \n" // store
- "bgt 1b \n"
+ "b.gt 1b \n"
: "+r"(dst), // %0
"+r"(count) // %1
- : "r"(v32) // %2
+ : "r"(v8) // %2
: "cc", "memory", "v0"
);
}
-#endif // HAS_SETROW_NEON
-// TODO(fbarchard): Make fully assembler
-// SetRow32 writes 'count' words using a 32 bit value repeated.
-#ifdef HAS_ARGBSETROWS_NEON
-void ARGBSetRows_NEON(uint8* dst, uint32 v32, int width,
- int dst_stride, int height) {
- for (int y = 0; y < height; ++y) {
- SetRow_NEON(dst, v32, width << 2);
- dst += dst_stride;
- }
+void ARGBSetRow_NEON(uint8* dst, uint32 v32, int count) {
+ asm volatile (
+ "dup v0.4s, %w2 \n" // duplicate 4 ints
+ "1: \n"
+    "subs %1, %1, #4 \n" // 4 pixels per loop
+ MEMACCESS(0)
+ "st1 {v0.16b}, [%0], #16 \n" // store
+ "b.gt 1b \n"
+ : "+r"(dst), // %0
+ "+r"(count) // %1
+ : "r"(v32) // %2
+ : "cc", "memory", "v0"
+ );
}
-#endif // HAS_ARGBSETROWS_NEON
#ifdef HAS_MIRRORROW_NEON
void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
@@ -925,7 +827,6 @@ void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
"add %0, %0, %2 \n"
"sub %0, %0, #16 \n"
- ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"ld1 {v0.16b}, [%0], %3 \n" // src -= 16
@@ -935,7 +836,7 @@ void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
"st1 {v0.D}[1], [%1], #8 \n" // dst += 16
MEMACCESS(1)
"st1 {v0.D}[0], [%1], #8 \n"
- "bgt 1b \n"
+ "b.gt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -953,7 +854,6 @@ void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
"add %0, %0, %3, lsl #1 \n"
"sub %0, %0, #16 \n"
- ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"ld2 {v0.8b, v1.8b}, [%0], %4 \n" // src -= 16
@@ -961,10 +861,10 @@ void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
"rev64 v0.8b, v0.8b \n"
"rev64 v1.8b, v1.8b \n"
MEMACCESS(1)
- "st1 {v0.8b}, [%1], #8 \n" // dst += 8
+ "st1 {v0.8b}, [%1], #8 \n" // dst += 8
MEMACCESS(2)
- "st1 {v1.8b}, [%2], #8 \n"
- "bgt 1b \n"
+ "st1 {v1.8b}, [%2], #8 \n"
+ "b.gt 1b \n"
: "+r"(src_uv), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
@@ -982,7 +882,6 @@ void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
"add %0, %0, %2, lsl #2 \n"
"sub %0, %0, #16 \n"
- ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"ld1 {v0.16b}, [%0], %3 \n" // src -= 16
@@ -992,7 +891,7 @@ void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
"st1 {v0.D}[1], [%1], #8 \n" // dst += 16
MEMACCESS(1)
"st1 {v0.D}[0], [%1], #8 \n"
- "bgt 1b \n"
+ "b.gt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -1006,14 +905,13 @@ void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) {
asm volatile (
"movi v4.8b, #255 \n" // Alpha
- ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "ld3 {v1.8b-v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24.
+ "ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24.
"subs %2, %2, #8 \n" // 8 processed per loop.
MEMACCESS(1)
- "st4 {v1.8b-v4.8b}, [%1], #32 \n" // store 8 pixels of ARGB.
- "bgt 1b \n"
+ "st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB pixels
+ "b.gt 1b \n"
: "+r"(src_rgb24), // %0
"+r"(dst_argb), // %1
"+r"(pix) // %2
@@ -1027,16 +925,15 @@ void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) {
void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) {
asm volatile (
"movi v5.8b, #255 \n" // Alpha
- ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "ld3 {v0.8b-v2.8b}, [%0], #24 \n" // read r g b
+ "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b
"subs %2, %2, #8 \n" // 8 processed per loop.
- "mov v3.8b, v1.8b \n" // move g
- "mov v4.8b, v0.8b \n" // move r
+ "orr v3.8b, v1.8b, v1.8b \n" // move g
+ "orr v4.8b, v0.8b, v0.8b \n" // move r
MEMACCESS(1)
- "st4 {v2.8b-v5.8b}, [%1], #32 \n" // store b g r a
- "bgt 1b \n"
+ "st4 {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n" // store b g r a
+ "b.gt 1b \n"
: "+r"(src_raw), // %0
"+r"(dst_argb), // %1
"+r"(pix) // %2
@@ -1047,118 +944,127 @@ void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) {
#endif // HAS_RAWTOARGBROW_NEON
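+// Note on the mov -> orr rewrites in this file: for vectors, "mov Vd, Vn"
+// is an alias of "orr Vd, Vn, Vn", so the change is behavior-preserving;
+// it is presumably spelled out for assemblers that do not accept the alias.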
#define RGB565TOARGB \
- "vshrn.u16 d6, q0, #5 \n" /* G xxGGGGGG */ \
- "vuzp.u8 d0, d1 \n" /* d0 xxxBBBBB RRRRRxxx */ \
- "vshl.u8 d6, d6, #2 \n" /* G GGGGGG00 upper 6 */ \
- "vshr.u8 d1, d1, #3 \n" /* R 000RRRRR lower 5 */ \
- "vshl.u8 q0, q0, #3 \n" /* B,R BBBBB000 upper 5 */ \
- "vshr.u8 q2, q0, #5 \n" /* B,R 00000BBB lower 3 */ \
- "vorr.u8 d0, d0, d4 \n" /* B */ \
- "vshr.u8 d4, d6, #6 \n" /* G 000000GG lower 2 */ \
- "vorr.u8 d2, d1, d5 \n" /* R */ \
- "vorr.u8 d1, d4, d6 \n" /* G */
+ "shrn v6.8b, v0.8h, #5 \n" /* G xxGGGGGG */ \
+ "shl v6.8b, v6.8b, #2 \n" /* G GGGGGG00 upper 6 */ \
+ "ushr v4.8b, v6.8b, #6 \n" /* G 000000GG lower 2 */ \
+ "orr v1.8b, v4.8b, v6.8b \n" /* G */ \
+ "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \
+ "ushr v0.8h, v0.8h, #11 \n" /* R 000RRRRR */ \
+ "xtn2 v2.16b,v0.8h \n" /* R in upper part */ \
+ "shl v2.16b, v2.16b, #3 \n" /* R,B BBBBB000 upper 5 */ \
+ "ushr v0.16b, v2.16b, #5 \n" /* R,B 00000BBB lower 3 */ \
+ "orr v0.16b, v0.16b, v2.16b \n" /* R,B */ \
+ "dup v2.2D, v0.D[1] \n" /* R */
#ifdef HAS_RGB565TOARGBROW_NEON
void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int pix) {
asm volatile (
- "vmov.u8 d3, #255 \n" // Alpha
- ".p2align 2 \n"
+ "movi v3.8b, #255 \n" // Alpha
"1: \n"
MEMACCESS(0)
- "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
RGB565TOARGB
MEMACCESS(1)
- "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
- "bgt 1b \n"
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels
+ "b.gt 1b \n"
: "+r"(src_rgb565), // %0
"+r"(dst_argb), // %1
"+r"(pix) // %2
:
- : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6" // Clobber List
);
}
#endif // HAS_RGB565TOARGBROW_NEON
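+// For reference, RGB565TOARGB expands 5/6/5 bits to 8 by shifting each field
+// to the top of the byte and replicating its high bits into the low bits.
+// Scalar sketch per pixel (illustrative only):
+//   uint8 b = (rgb565 & 0x1f) << 3;         b |= b >> 5;  // BBBBB000 | BBB
+//   uint8 g = ((rgb565 >> 5) & 0x3f) << 2;  g |= g >> 6;  // GGGGGG00 | GG
+//   uint8 r = (rgb565 >> 11) << 3;          r |= r >> 5;  // RRRRR000 | RRR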
#define ARGB1555TOARGB \
- "vshrn.u16 d7, q0, #8 \n" /* A Arrrrrxx */ \
- "vshr.u8 d6, d7, #2 \n" /* R xxxRRRRR */ \
- "vshrn.u16 d5, q0, #5 \n" /* G xxxGGGGG */ \
- "vmovn.u16 d4, q0 \n" /* B xxxBBBBB */ \
- "vshr.u8 d7, d7, #7 \n" /* A 0000000A */ \
- "vneg.s8 d7, d7 \n" /* A AAAAAAAA upper 8 */ \
- "vshl.u8 d6, d6, #3 \n" /* R RRRRR000 upper 5 */ \
- "vshr.u8 q1, q3, #5 \n" /* R,A 00000RRR lower 3 */ \
- "vshl.u8 q0, q2, #3 \n" /* B,G BBBBB000 upper 5 */ \
- "vshr.u8 q2, q0, #5 \n" /* B,G 00000BBB lower 3 */ \
- "vorr.u8 q1, q1, q3 \n" /* R,A */ \
- "vorr.u8 q0, q0, q2 \n" /* B,G */ \
+ "ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \
+ "shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5 */ \
+ "xtn v3.8b, v2.8h \n" /* RRRRR000 AAAAAAAA */ \
+ \
+ "sshr v2.8h, v0.8h, #15 \n" /* A AAAAAAAA */ \
+ "xtn2 v3.16b, v2.8h \n" \
+ \
+ "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \
+ "shrn2 v2.16b,v0.8h, #5 \n" /* G xxxGGGGG */ \
+ \
+ "ushr v1.16b, v3.16b, #5 \n" /* R,A 00000RRR lower 3 */ \
+ "shl v0.16b, v2.16b, #3 \n" /* B,G BBBBB000 upper 5 */ \
+ "ushr v2.16b, v0.16b, #5 \n" /* B,G 00000BBB lower 3 */ \
+ \
+ "orr v0.16b, v0.16b, v2.16b \n" /* B,G */ \
+ "orr v2.16b, v1.16b, v3.16b \n" /* R,A */ \
+ "dup v1.2D, v0.D[1] \n" \
+ "dup v3.2D, v2.D[1] \n"
// RGB555TOARGB is the same as ARGB1555TOARGB but ignores alpha.
#define RGB555TOARGB \
- "vshrn.u16 d6, q0, #5 \n" /* G xxxGGGGG */ \
- "vuzp.u8 d0, d1 \n" /* d0 xxxBBBBB xRRRRRxx */ \
- "vshl.u8 d6, d6, #3 \n" /* G GGGGG000 upper 5 */ \
- "vshr.u8 d1, d1, #2 \n" /* R 00xRRRRR lower 5 */ \
- "vshl.u8 q0, q0, #3 \n" /* B,R BBBBB000 upper 5 */ \
- "vshr.u8 q2, q0, #5 \n" /* B,R 00000BBB lower 3 */ \
- "vorr.u8 d0, d0, d4 \n" /* B */ \
- "vshr.u8 d4, d6, #5 \n" /* G 00000GGG lower 3 */ \
- "vorr.u8 d2, d1, d5 \n" /* R */ \
- "vorr.u8 d1, d4, d6 \n" /* G */
+ "ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \
+ "shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5 */ \
+ "xtn v3.8b, v2.8h \n" /* RRRRR000 */ \
+ \
+ "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \
+ "shrn2 v2.16b,v0.8h, #5 \n" /* G xxxGGGGG */ \
+ \
+ "ushr v1.16b, v3.16b, #5 \n" /* R 00000RRR lower 3 */ \
+ "shl v0.16b, v2.16b, #3 \n" /* B,G BBBBB000 upper 5 */ \
+ "ushr v2.16b, v0.16b, #5 \n" /* B,G 00000BBB lower 3 */ \
+ \
+ "orr v0.16b, v0.16b, v2.16b \n" /* B,G */ \
+ "orr v2.16b, v1.16b, v3.16b \n" /* R */ \
+ "dup v1.2D, v0.D[1] \n" /* G */ \
#ifdef HAS_ARGB1555TOARGBROW_NEON
void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb,
int pix) {
asm volatile (
- "vmov.u8 d3, #255 \n" // Alpha
- ".p2align 2 \n"
+ "movi v3.8b, #255 \n" // Alpha
"1: \n"
MEMACCESS(0)
- "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
ARGB1555TOARGB
MEMACCESS(1)
- "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
- "bgt 1b \n"
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels
+ "b.gt 1b \n"
: "+r"(src_argb1555), // %0
"+r"(dst_argb), // %1
"+r"(pix) // %2
:
- : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List
+ : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
);
}
#endif // HAS_ARGB1555TOARGBROW_NEON
#define ARGB4444TOARGB \
- "vuzp.u8 d0, d1 \n" /* d0 BG, d1 RA */ \
- "vshl.u8 q2, q0, #4 \n" /* B,R BBBB0000 */ \
- "vshr.u8 q1, q0, #4 \n" /* G,A 0000GGGG */ \
- "vshr.u8 q0, q2, #4 \n" /* B,R 0000BBBB */ \
- "vorr.u8 q0, q0, q2 \n" /* B,R BBBBBBBB */ \
- "vshl.u8 q2, q1, #4 \n" /* G,A GGGG0000 */ \
- "vorr.u8 q1, q1, q2 \n" /* G,A GGGGGGGG */ \
- "vswp.u8 d1, d2 \n" /* B,R,G,A -> B,G,R,A */
+ "shrn v1.8b, v0.8h, #8 \n" /* v1(l) AR */ \
+ "xtn2 v1.16b, v0.8h \n" /* v1(h) GB */ \
+ "shl v2.16b, v1.16b, #4 \n" /* B,R BBBB0000 */ \
+ "ushr v3.16b, v1.16b, #4 \n" /* G,A 0000GGGG */ \
+ "ushr v0.16b, v2.16b, #4 \n" /* B,R 0000BBBB */ \
+ "shl v1.16b, v3.16b, #4 \n" /* G,A GGGG0000 */ \
+ "orr v2.16b, v0.16b, v2.16b \n" /* B,R BBBBBBBB */ \
+ "orr v3.16b, v1.16b, v3.16b \n" /* G,A GGGGGGGG */ \
+ "dup v0.2D, v2.D[1] \n" \
+ "dup v1.2D, v3.D[1] \n"
#ifdef HAS_ARGB4444TOARGBROW_NEON
void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb,
int pix) {
asm volatile (
- "vmov.u8 d3, #255 \n" // Alpha
- ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
ARGB4444TOARGB
MEMACCESS(1)
- "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
- "bgt 1b \n"
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels
+ "b.gt 1b \n"
: "+r"(src_argb4444), // %0
"+r"(dst_argb), // %1
"+r"(pix) // %2
:
- : "cc", "memory", "q0", "q1", "q2" // Clobber List
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List
);
}
#endif // HAS_ARGB4444TOARGBROW_NEON
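+// For reference, ARGB4444TOARGB widens each 4-bit channel x to 8 bits by
+// nibble replication, (x << 4) | x, mapping 0x0 -> 0x00 and 0xf -> 0xff
+// exactly. Scalar sketch (illustrative only):
+//   uint8 Expand4(uint8 x) { return (uint8)((x << 4) | x); }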
@@ -1166,14 +1072,13 @@ void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb,
#ifdef HAS_ARGBTORGB24ROW_NEON
void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) {
asm volatile (
- ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "ld4 {v1.8b-v4.8b}, [%0], #32 \n" // load 8 pixels of ARGB.
+ "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load 8 ARGB pixels
"subs %2, %2, #8 \n" // 8 processed per loop.
MEMACCESS(1)
- "st3 {v1.8b-v3.8b}, [%1], #24 \n" // store 8 pixels of RGB24.
- "bgt 1b \n"
+ "st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of RGB24.
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_rgb24), // %1
"+r"(pix) // %2
@@ -1186,16 +1091,15 @@ void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) {
#ifdef HAS_ARGBTORAWROW_NEON
void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) {
asm volatile (
- ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "ld4 {v1.8b-v4.8b}, [%0], #32 \n" // load b g r a
+ "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a
"subs %2, %2, #8 \n" // 8 processed per loop.
- "mov v4.8b, v2.8b \n" // mov g
- "mov v5.8b, v1.8b \n" // mov b
+ "orr v4.8b, v2.8b, v2.8b \n" // mov g
+ "orr v5.8b, v1.8b, v1.8b \n" // mov b
MEMACCESS(1)
- "st3 {v3.8b-v5.8b}, [%1], #24 \n" // store r g b
- "bgt 1b \n"
+ "st3 {v3.8b,v4.8b,v5.8b}, [%1], #24 \n" // store r g b
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_raw), // %1
"+r"(pix) // %2
@@ -1208,14 +1112,13 @@ void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) {
#ifdef HAS_YUY2TOYROW_NEON
void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) {
asm volatile (
- ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "ld2 {v0.16b, v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2.
+ "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2.
"subs %2, %2, #16 \n" // 16 processed per loop.
MEMACCESS(1)
"st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y.
- "bgt 1b \n"
+ "b.gt 1b \n"
: "+r"(src_yuy2), // %0
"+r"(dst_y), // %1
"+r"(pix) // %2
@@ -1228,14 +1131,13 @@ void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) {
#ifdef HAS_UYVYTOYROW_NEON
void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) {
asm volatile (
- ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "ld2 {v0.16b, v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY.
+ "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY.
"subs %2, %2, #16 \n" // 16 processed per loop.
MEMACCESS(1)
"st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y.
- "bgt 1b \n"
+ "b.gt 1b \n"
: "+r"(src_uyvy), // %0
"+r"(dst_y), // %1
"+r"(pix) // %2
@@ -1249,16 +1151,15 @@ void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) {
void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
int pix) {
asm volatile (
- ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 16 pixels of YUY2.
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2 pixels
"subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
MEMACCESS(1)
"st1 {v1.8b}, [%1], #8 \n" // store 8 U.
MEMACCESS(2)
"st1 {v3.8b}, [%2], #8 \n" // store 8 V.
- "bgt 1b \n"
+ "b.gt 1b \n"
: "+r"(src_yuy2), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
@@ -1273,16 +1174,15 @@ void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
int pix) {
asm volatile (
- ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 16 pixels of UYVY.
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY pixels
"subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
MEMACCESS(1)
"st1 {v0.8b}, [%1], #8 \n" // store 8 U.
MEMACCESS(2)
"st1 {v2.8b}, [%2], #8 \n" // store 8 V.
- "bgt 1b \n"
+ "b.gt 1b \n"
: "+r"(src_uyvy), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
@@ -1296,29 +1196,29 @@ void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
#ifdef HAS_YUY2TOUVROW_NEON
void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
uint8* dst_u, uint8* dst_v, int pix) {
+ const uint8* src_yuy2b = src_yuy2 + stride_yuy2;
asm volatile (
- "add %x1, %x0, %w1, sxtw \n" // stride + src_yuy2
- ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 16 pixels of YUY2.
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels
"subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
MEMACCESS(1)
- "ld4 {v4.8b-v7.8b}, [%1], #32 \n" // load next row YUY2.
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row
"urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U
"urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V
MEMACCESS(2)
"st1 {v1.8b}, [%2], #8 \n" // store 8 U.
MEMACCESS(3)
"st1 {v3.8b}, [%3], #8 \n" // store 8 V.
- "bgt 1b \n"
+ "b.gt 1b \n"
: "+r"(src_yuy2), // %0
- "+r"(stride_yuy2), // %1
+ "+r"(src_yuy2b), // %1
"+r"(dst_u), // %2
"+r"(dst_v), // %3
"+r"(pix) // %4
:
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" // Clobber List
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4",
+ "v5", "v6", "v7" // Clobber List
);
}
#endif // HAS_YUY2TOUVROW_NEON
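+// Note: the two-row UV path averages chroma with urhadd, an unsigned
+// rounding halving add. Scalar equivalent for one sample (illustrative):
+//   dst_u[i] = (uint8)((row0_u[i] + row1_u[i] + 1) >> 1);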
@@ -1326,84 +1226,33 @@ void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
#ifdef HAS_UYVYTOUVROW_NEON
void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
uint8* dst_u, uint8* dst_v, int pix) {
+ const uint8* src_uyvyb = src_uyvy + stride_uyvy;
asm volatile (
- "add %x1, %x0, %w1, sxtw \n" // stride + src_uyvy
- ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 16 pixels of UYVY.
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels
"subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
MEMACCESS(1)
- "ld4 {v4.8b-v7.8b}, [%1], #32 \n" // load next row UYVY.
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row
"urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U
"urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V
MEMACCESS(2)
"st1 {v0.8b}, [%2], #8 \n" // store 8 U.
MEMACCESS(3)
"st1 {v2.8b}, [%3], #8 \n" // store 8 V.
- "bgt 1b \n"
+ "b.gt 1b \n"
: "+r"(src_uyvy), // %0
- "+r"(stride_uyvy), // %1
+ "+r"(src_uyvyb), // %1
"+r"(dst_u), // %2
"+r"(dst_v), // %3
"+r"(pix) // %4
:
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" // Clobber List
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4",
+ "v5", "v6", "v7" // Clobber List
);
}
#endif // HAS_UYVYTOUVROW_NEON
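+// Note: computing the second-row pointer (src_yuy2b / src_uyvyb) in C and
+// passing it as a separate operand replaces the old in-asm
+// "add %x1, %x0, %w1, sxtw", presumably to avoid mixing 32- and 64-bit
+// operand modifiers inside the inline asm.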
-#ifdef HAS_HALFROW_NEON
-void HalfRow_NEON(const uint8* src_uv, int src_uv_stride,
- uint8* dst_uv, int pix) {
- asm volatile (
- // change the stride to row 2 pointer
- "add %x1, %x0, %w1, sxtw \n"
- "1: \n"
- MEMACCESS(0)
- "ld1 {v0.16b}, [%0], #16 \n" // load row 1 16 pixels.
- "subs %3, %3, #16 \n" // 16 processed per loop
- MEMACCESS(1)
- "ld1 {v1.16b}, [%1], #16 \n" // load row 2 16 pixels.
- "urhadd v0.16b, v0.16b, v1.16b \n" // average row 1 and 2
- MEMACCESS(2)
- "st1 {v0.16b}, [%2], #16 \n"
- "bgt 1b \n"
- : "+r"(src_uv), // %0
- "+r"(src_uv_stride), // %1
- "+r"(dst_uv), // %2
- "+r"(pix) // %3
- :
- : "cc", "memory", "v0", "v1" // Clobber List
- );
-}
-#endif // HAS_HALFROW_NEON
-
-// Select 2 channels from ARGB on alternating pixels. e.g. BGBGBGBG
-#ifdef HAS_ARGBTOBAYERROW_NEON
-void ARGBToBayerRow_NEON(const uint8* src_argb, uint8* dst_bayer,
- uint32 selector, int pix) {
- asm volatile (
- "mov v2.s[0], %w3 \n" // selector
- "1: \n"
- MEMACCESS(0)
- "ld1 {v0.16b, v1.16b}, [%0], 32 \n" // load row 8 pixels.
- "subs %2, %2, #8 \n" // 8 processed per loop
- "tbl v4.8b, {v0.16b}, v2.8b \n" // look up 4 pixels
- "tbl v5.8b, {v1.16b}, v2.8b \n" // look up 4 pixels
- "trn1 v4.4s, v4.4s, v5.4s \n" // combine 8 pixels
- MEMACCESS(1)
- "st1 {v4.8b}, [%1], #8 \n" // store 8.
- "bgt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_bayer), // %1
- "+r"(pix) // %2
- : "r"(selector) // %3
- : "cc", "memory", "v0", "v1", "v2", "v4", "v5" // Clobber List
- );
-}
-#endif // HAS_ARGBTOBAYERROW_NEON
-
// Select G channels from ARGB. e.g. GGGGGGGG
#ifdef HAS_ARGBTOBAYERGGROW_NEON
void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer,
@@ -1411,11 +1260,11 @@ void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer,
asm volatile (
"1: \n"
MEMACCESS(0)
- "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load row 8 pixels.
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load row 8 pixels
"subs %2, %2, #8 \n" // 8 processed per loop
MEMACCESS(1)
"st1 {v1.8b}, [%1], #8 \n" // store 8 G's.
- "bgt 1b \n"
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_bayer), // %1
"+r"(pix) // %2
@@ -1439,7 +1288,7 @@ void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
"tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels
MEMACCESS(1)
"st1 {v1.16b}, [%1], #16 \n" // store 4.
- "bgt 1b \n"
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(pix) // %2
@@ -1455,19 +1304,18 @@ void I422ToYUY2Row_NEON(const uint8* src_y,
const uint8* src_v,
uint8* dst_yuy2, int width) {
asm volatile (
- ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys
- "mov v2.8b, v1.8b \n"
+ "orr v2.8b, v1.8b, v1.8b \n"
MEMACCESS(1)
"ld1 {v1.8b}, [%1], #8 \n" // load 8 Us
MEMACCESS(2)
"ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs
"subs %4, %4, #16 \n" // 16 pixels
MEMACCESS(3)
- "st4 {v0.8b-v3.8b}, [%3], #32 \n" // Store 8 YUY2/16 pixels.
- "bgt 1b \n"
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels.
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -1485,19 +1333,18 @@ void I422ToUYVYRow_NEON(const uint8* src_y,
const uint8* src_v,
uint8* dst_uyvy, int width) {
asm volatile (
- ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "ld2 {v1.8b, v2.8b}, [%0], #16 \n" // load 16 Ys
- "mov v3.8b, v2.8b \n"
+ "ld2 {v1.8b,v2.8b}, [%0], #16 \n" // load 16 Ys
+ "orr v3.8b, v2.8b, v2.8b \n"
MEMACCESS(1)
"ld1 {v0.8b}, [%1], #8 \n" // load 8 Us
MEMACCESS(2)
"ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs
"subs %4, %4, #16 \n" // 16 pixels
MEMACCESS(3)
- "st4 {v0.8b-v3.8b}, [%3], #32 \n" // Store 8 UYVY/16 pixels.
- "bgt 1b \n"
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels.
+ "b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@@ -1512,20 +1359,19 @@ void I422ToUYVYRow_NEON(const uint8* src_y,
#ifdef HAS_ARGBTORGB565ROW_NEON
void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) {
asm volatile (
- ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
+ "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
"subs %2, %2, #8 \n" // 8 processed per loop.
ARGBTORGB565
MEMACCESS(1)
- "vst1.8 {q0}, [%1]! \n" // store 8 pixels RGB565.
- "bgt 1b \n"
+ "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels RGB565.
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_rgb565), // %1
"+r"(pix) // %2
:
- : "cc", "memory", "q0", "q8", "q9", "q10", "q11"
+ : "cc", "memory", "v0", "v20", "v21", "v22", "v23"
);
}
#endif // HAS_ARGBTORGB565ROW_NEON
@@ -1534,20 +1380,19 @@ void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) {
void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555,
int pix) {
asm volatile (
- ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
+ "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
"subs %2, %2, #8 \n" // 8 processed per loop.
ARGBTOARGB1555
MEMACCESS(1)
- "vst1.8 {q0}, [%1]! \n" // store 8 pixels ARGB1555.
- "bgt 1b \n"
+ "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels ARGB1555.
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb1555), // %1
"+r"(pix) // %2
:
- : "cc", "memory", "q0", "q8", "q9", "q10", "q11"
+ : "cc", "memory", "v0", "v20", "v21", "v22", "v23"
);
}
#endif // HAS_ARGBTOARGB1555ROW_NEON
@@ -1556,21 +1401,20 @@ void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555,
void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444,
int pix) {
asm volatile (
- "vmov.u8 d4, #0x0f \n" // bits to clear with vbic.
- ".p2align 2 \n"
+ "movi v4.16b, #0x0f \n" // bits to clear with vbic.
"1: \n"
MEMACCESS(0)
- "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
+ "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
"subs %2, %2, #8 \n" // 8 processed per loop.
ARGBTOARGB4444
MEMACCESS(1)
- "vst1.8 {q0}, [%1]! \n" // store 8 pixels ARGB4444.
- "bgt 1b \n"
+ "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels ARGB4444.
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb4444), // %1
"+r"(pix) // %2
:
- : "cc", "memory", "q0", "q8", "q9", "q10", "q11"
+ : "cc", "memory", "v0", "v1", "v4", "v20", "v21", "v22", "v23"
);
}
#endif // HAS_ARGBTOARGB4444ROW_NEON
@@ -1582,10 +1426,9 @@ void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
"movi v5.8b, #65 \n" // G * 0.5078 coefficient
"movi v6.8b, #33 \n" // R * 0.2578 coefficient
"movi v7.8b, #16 \n" // Add 16 constant
- ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
"umull v3.8h, v0.8b, v4.8b \n" // B
"umlal v3.8h, v1.8b, v5.8b \n" // G
@@ -1594,7 +1437,7 @@ void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
"uqadd v0.8b, v0.8b, v7.8b \n"
MEMACCESS(1)
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
- "bgt 1b \n"
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
"+r"(pix) // %2
@@ -1610,10 +1453,9 @@ void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
"movi v4.8b, #15 \n" // B * 0.11400 coefficient
"movi v5.8b, #75 \n" // G * 0.58700 coefficient
"movi v6.8b, #38 \n" // R * 0.29900 coefficient
- ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
"umull v3.8h, v0.8b, v4.8b \n" // B
"umlal v3.8h, v1.8b, v5.8b \n" // G
@@ -1621,7 +1463,7 @@ void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
"sqrshrun v0.8b, v3.8h, #7 \n" // 15 bit to 8 bit Y
MEMACCESS(1)
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
- "bgt 1b \n"
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
"+r"(pix) // %2
@@ -1636,41 +1478,41 @@ void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
int pix) {
asm volatile (
- "vmov.u8 d24, #112 \n" // UB / VR 0.875 coefficient
- "vmov.u8 d25, #74 \n" // UG -0.5781 coefficient
- "vmov.u8 d26, #38 \n" // UR -0.2969 coefficient
- "vmov.u8 d27, #18 \n" // VB -0.1406 coefficient
- "vmov.u8 d28, #94 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- ".p2align 2 \n"
+ "movi v24.8b, #112 \n" // UB / VR 0.875 coefficient
+ "movi v25.8b, #74 \n" // UG -0.5781 coefficient
+ "movi v26.8b, #38 \n" // UR -0.2969 coefficient
+ "movi v27.8b, #18 \n" // VB -0.1406 coefficient
+ "movi v28.8b, #94 \n" // VG -0.7344 coefficient
+ "movi v29.16b,#0x80 \n" // 128.5
"1: \n"
MEMACCESS(0)
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
"subs %3, %3, #8 \n" // 8 processed per loop.
- "vmull.u8 q2, d0, d24 \n" // B
- "vmlsl.u8 q2, d1, d25 \n" // G
- "vmlsl.u8 q2, d2, d26 \n" // R
- "vadd.u16 q2, q2, q15 \n" // +128 -> unsigned
+ "umull v4.8h, v0.8b, v24.8b \n" // B
+ "umlsl v4.8h, v1.8b, v25.8b \n" // G
+ "umlsl v4.8h, v2.8b, v26.8b \n" // R
+ "add v4.8h, v4.8h, v29.8h \n" // +128 -> unsigned
- "vmull.u8 q3, d2, d24 \n" // R
- "vmlsl.u8 q3, d1, d28 \n" // G
- "vmlsl.u8 q3, d0, d27 \n" // B
- "vadd.u16 q3, q3, q15 \n" // +128 -> unsigned
+ "umull v3.8h, v2.8b, v24.8b \n" // R
+ "umlsl v3.8h, v1.8b, v28.8b \n" // G
+ "umlsl v3.8h, v0.8b, v27.8b \n" // B
+ "add v3.8h, v3.8h, v29.8h \n" // +128 -> unsigned
- "vqshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit U
- "vqshrn.u16 d1, q3, #8 \n" // 16 bit to 8 bit V
+ "uqshrn v0.8b, v4.8h, #8 \n" // 16 bit to 8 bit U
+ "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V
MEMACCESS(1)
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels U.
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U.
MEMACCESS(2)
- "vst1.8 {d1}, [%2]! \n" // store 8 pixels V.
- "bgt 1b \n"
+ "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+r"(pix) // %3
:
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q12", "q13", "q14", "q15"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4",
+ "v24", "v25", "v26", "v27", "v28", "v29"
);
}
#endif // HAS_ARGBTOUV444ROW_NEON
@@ -1680,49 +1522,41 @@ void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
int pix) {
asm volatile (
- "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
- "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
- "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
- "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
- "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- ".p2align 2 \n"
+ RGBTOUV_SETUP_REG
"1: \n"
MEMACCESS(0)
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
- MEMACCESS(0)
- "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
- "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
- "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
- "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
+ "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
+ "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
"subs %3, %3, #16 \n" // 16 processed per loop.
- "vmul.s16 q8, q0, q10 \n" // B
- "vmls.s16 q8, q1, q11 \n" // G
- "vmls.s16 q8, q2, q12 \n" // R
- "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned
+ "mul v3.8h, v0.8h, v20.8h \n" // B
+ "mls v3.8h, v1.8h, v21.8h \n" // G
+ "mls v3.8h, v2.8h, v22.8h \n" // R
+ "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned
- "vmul.s16 q9, q2, q10 \n" // R
- "vmls.s16 q9, q1, q14 \n" // G
- "vmls.s16 q9, q0, q13 \n" // B
- "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
+ "mul v4.8h, v2.8h, v20.8h \n" // R
+ "mls v4.8h, v1.8h, v24.8h \n" // G
+ "mls v4.8h, v0.8h, v23.8h \n" // B
+ "add v4.8h, v4.8h, v25.8h \n" // +128 -> unsigned
- "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
- "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
+ "uqshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit U
+ "uqshrn v1.8b, v4.8h, #8 \n" // 16 bit to 8 bit V
MEMACCESS(1)
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels U.
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U.
MEMACCESS(2)
- "vst1.8 {d1}, [%2]! \n" // store 8 pixels V.
- "bgt 1b \n"
+ "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+r"(pix) // %3
:
- : "cc", "memory", "q0", "q1", "q2", "q3",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+ "v20", "v21", "v22", "v23", "v24", "v25"
);
}
#endif // HAS_ARGBTOUV422ROW_NEON
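+// Sampling recap: ARGBToUV444 emits one U,V per pixel using the full 8-bit
+// coefficients; ARGBToUV422 averages horizontal pairs (16 pixels -> 8 U,V)
+// and ARGBToUV411 below averages groups of four (32 pixels -> 8 U,V), both
+// using the half-scaled 16-bit coefficients from RGBTOUV_SETUP_REG.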
@@ -1732,128 +1566,108 @@ void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
int pix) {
asm volatile (
- "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
- "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
- "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
- "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
- "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- ".p2align 2 \n"
+ RGBTOUV_SETUP_REG
"1: \n"
MEMACCESS(0)
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
- MEMACCESS(0)
- "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
- "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
- "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
- "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
+ "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
+ "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
MEMACCESS(0)
- "vld4.8 {d8, d10, d12, d14}, [%0]! \n" // load 8 more ARGB pixels.
- MEMACCESS(0)
- "vld4.8 {d9, d11, d13, d15}, [%0]! \n" // load last 8 ARGB pixels.
- "vpaddl.u8 q4, q4 \n" // B 16 bytes -> 8 shorts.
- "vpaddl.u8 q5, q5 \n" // G 16 bytes -> 8 shorts.
- "vpaddl.u8 q6, q6 \n" // R 16 bytes -> 8 shorts.
+ "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%0], #64 \n" // load next 16.
+ "uaddlp v4.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
+ "uaddlp v5.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v6.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
- "vpadd.u16 d0, d0, d1 \n" // B 16 shorts -> 8 shorts.
- "vpadd.u16 d1, d8, d9 \n" // B
- "vpadd.u16 d2, d2, d3 \n" // G 16 shorts -> 8 shorts.
- "vpadd.u16 d3, d10, d11 \n" // G
- "vpadd.u16 d4, d4, d5 \n" // R 16 shorts -> 8 shorts.
- "vpadd.u16 d5, d12, d13 \n" // R
+ "addp v0.8h, v0.8h, v4.8h \n" // B 16 shorts -> 8 shorts.
+ "addp v1.8h, v1.8h, v5.8h \n" // G 16 shorts -> 8 shorts.
+ "addp v2.8h, v2.8h, v6.8h \n" // R 16 shorts -> 8 shorts.
- "vrshr.u16 q0, q0, #1 \n" // 2x average
- "vrshr.u16 q1, q1, #1 \n"
- "vrshr.u16 q2, q2, #1 \n"
+ "urshr v0.8h, v0.8h, #1 \n" // 2x average
+ "urshr v1.8h, v1.8h, #1 \n"
+ "urshr v2.8h, v2.8h, #1 \n"
"subs %3, %3, #32 \n" // 32 processed per loop.
- "vmul.s16 q8, q0, q10 \n" // B
- "vmls.s16 q8, q1, q11 \n" // G
- "vmls.s16 q8, q2, q12 \n" // R
- "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned
- "vmul.s16 q9, q2, q10 \n" // R
- "vmls.s16 q9, q1, q14 \n" // G
- "vmls.s16 q9, q0, q13 \n" // B
- "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
- "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
- "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
- MEMACCESS(1)
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels U.
+ "mul v3.8h, v0.8h, v20.8h \n" // B
+ "mls v3.8h, v1.8h, v21.8h \n" // G
+ "mls v3.8h, v2.8h, v22.8h \n" // R
+ "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned
+ "mul v4.8h, v2.8h, v20.8h \n" // R
+ "mls v4.8h, v1.8h, v24.8h \n" // G
+ "mls v4.8h, v0.8h, v23.8h \n" // B
+ "add v4.8h, v4.8h, v25.8h \n" // +128 -> unsigned
+ "uqshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit U
+ "uqshrn v1.8b, v4.8h, #8 \n" // 16 bit to 8 bit V
+ MEMACCESS(1)
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U.
MEMACCESS(2)
- "vst1.8 {d1}, [%2]! \n" // store 8 pixels V.
- "bgt 1b \n"
+ "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+r"(pix) // %3
:
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+ "v20", "v21", "v22", "v23", "v24", "v25"
);
}
#endif // HAS_ARGBTOUV411ROW_NEON
// 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16.
#define RGBTOUV(QB, QG, QR) \
- "vmul.s16 q8, " #QB ", q10 \n" /* B */ \
- "vmls.s16 q8, " #QG ", q11 \n" /* G */ \
- "vmls.s16 q8, " #QR ", q12 \n" /* R */ \
- "vadd.u16 q8, q8, q15 \n" /* +128 -> unsigned */ \
- "vmul.s16 q9, " #QR ", q10 \n" /* R */ \
- "vmls.s16 q9, " #QG ", q14 \n" /* G */ \
- "vmls.s16 q9, " #QB ", q13 \n" /* B */ \
- "vadd.u16 q9, q9, q15 \n" /* +128 -> unsigned */ \
- "vqshrn.u16 d0, q8, #8 \n" /* 16 bit to 8 bit U */ \
- "vqshrn.u16 d1, q9, #8 \n" /* 16 bit to 8 bit V */
+ "mul v3.8h, " #QB ",v20.8h \n" /* B */ \
+ "mul v4.8h, " #QR ",v20.8h \n" /* R */ \
+ "mls v3.8h, " #QG ",v21.8h \n" /* G */ \
+ "mls v4.8h, " #QG ",v24.8h \n" /* G */ \
+ "mls v3.8h, " #QR ",v22.8h \n" /* R */ \
+ "mls v4.8h, " #QB ",v23.8h \n" /* B */ \
+ "add v3.8h, v3.8h, v25.8h \n" /* +128 -> unsigned */ \
+ "add v4.8h, v4.8h, v25.8h \n" /* +128 -> unsigned */ \
+ "uqshrn v0.8b, v3.8h, #8 \n" /* 16 bit to 8 bit U */ \
+ "uqshrn v1.8b, v4.8h, #8 \n" /* 16 bit to 8 bit V */
// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
+// TODO(fbarchard): consider ptrdiff_t for all strides.
+
#ifdef HAS_ARGBTOUVROW_NEON
void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int pix) {
+ const uint8* src_argb_1 = src_argb + src_stride_argb;
asm volatile (
- "add %1, %0, %1 \n" // src_stride + src_argb
- "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
- "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
- "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
- "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
- "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- ".p2align 2 \n"
+ RGBTOUV_SETUP_REG
"1: \n"
MEMACCESS(0)
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
- MEMACCESS(0)
- "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
- "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
- "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
- "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
- MEMACCESS(1)
- "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels.
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
+ "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
+ "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
+
MEMACCESS(1)
- "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels.
- "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
- "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
- "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts.
+ "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
+ "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
+ "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
+ "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
- "vrshr.u16 q0, q0, #1 \n" // 2x average
- "vrshr.u16 q1, q1, #1 \n"
- "vrshr.u16 q2, q2, #1 \n"
+ "urshr v0.8h, v0.8h, #1 \n" // 2x average
+ "urshr v1.8h, v1.8h, #1 \n"
+ "urshr v2.8h, v2.8h, #1 \n"
"subs %4, %4, #16 \n" // 32 processed per loop.
- RGBTOUV(q0, q1, q2)
+ RGBTOUV(v0.8h, v1.8h, v2.8h)
MEMACCESS(2)
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
MEMACCESS(3)
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
- "+r"(src_stride_argb), // %1
+ "+r"(src_argb_1), // %1
"+r"(dst_u), // %2
"+r"(dst_v), // %3
"+r"(pix) // %4
:
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+ "v20", "v21", "v22", "v23", "v24", "v25"
);
}
#endif // HAS_ARGBTOUVROW_NEON
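+// For reference, the 2x2 box filter feeding RGBTOUV above: uaddlp sums each
+// horizontal pair, uadalp accumulates the pair sums of the second row, and
+// urshr #1 halves with rounding. Scalar sketch for one channel
+// (illustrative):
+//   b2x = (B[y][x] + B[y][x+1] + B[y+1][x] + B[y+1][x+1] + 1) >> 1;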
@@ -1862,50 +1676,45 @@ void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
#ifdef HAS_ARGBTOUVJROW_NEON
void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int pix) {
+ const uint8* src_argb_1 = src_argb + src_stride_argb;
asm volatile (
- "add %1, %0, %1 \n" // src_stride + src_argb
- "vmov.s16 q10, #127 / 2 \n" // UB / VR 0.500 coefficient
- "vmov.s16 q11, #84 / 2 \n" // UG -0.33126 coefficient
- "vmov.s16 q12, #43 / 2 \n" // UR -0.16874 coefficient
- "vmov.s16 q13, #20 / 2 \n" // VB -0.08131 coefficient
- "vmov.s16 q14, #107 / 2 \n" // VG -0.41869 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- ".p2align 2 \n"
+ "movi v20.8h, #63, lsl #0 \n" // UB/VR coeff (0.500) / 2
+ "movi v21.8h, #42, lsl #0 \n" // UG coeff (-0.33126) / 2
+ "movi v22.8h, #21, lsl #0 \n" // UR coeff (-0.16874) / 2
+ "movi v23.8h, #10, lsl #0 \n" // VB coeff (-0.08131) / 2
+ "movi v24.8h, #53, lsl #0 \n" // VG coeff (-0.41869) / 2
+ "movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit)
"1: \n"
MEMACCESS(0)
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
- MEMACCESS(0)
- "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
- "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
- "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
- "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
+ "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
+ "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
MEMACCESS(1)
- "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels.
- MEMACCESS(1)
- "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels.
- "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
- "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
- "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts.
+ "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
+ "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
+ "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
+ "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
- "vrshr.u16 q0, q0, #1 \n" // 2x average
- "vrshr.u16 q1, q1, #1 \n"
- "vrshr.u16 q2, q2, #1 \n"
+ "urshr v0.8h, v0.8h, #1 \n" // 2x average
+ "urshr v1.8h, v1.8h, #1 \n"
+ "urshr v2.8h, v2.8h, #1 \n"
"subs %4, %4, #16 \n" // 32 processed per loop.
- RGBTOUV(q0, q1, q2)
+ RGBTOUV(v0.8h, v1.8h, v2.8h)
MEMACCESS(2)
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
MEMACCESS(3)
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
- "+r"(src_stride_argb), // %1
+ "+r"(src_argb_1), // %1
"+r"(dst_u), // %2
"+r"(dst_v), // %3
"+r"(pix) // %4
:
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+ "v20", "v21", "v22", "v23", "v24", "v25"
);
}
#endif // HAS_ARGBTOUVJROW_NEON
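+// Note: the "J" variant uses full-range JPEG (BT.601) chroma constants, per
+// the comments above (0.500, -0.33126, -0.16874, -0.08131, -0.41869),
+// instead of RGBTOUV_SETUP_REG; the loop structure matches ARGBToUVRow_NEON.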
@@ -1913,50 +1722,40 @@ void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb,
#ifdef HAS_BGRATOUVROW_NEON
void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra,
uint8* dst_u, uint8* dst_v, int pix) {
+ const uint8* src_bgra_1 = src_bgra + src_stride_bgra;
asm volatile (
- "add %1, %0, %1 \n" // src_stride + src_bgra
- "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
- "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
- "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
- "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
- "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- ".p2align 2 \n"
+ RGBTOUV_SETUP_REG
"1: \n"
MEMACCESS(0)
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 BGRA pixels.
- MEMACCESS(0)
- "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 BGRA pixels.
- "vpaddl.u8 q3, q3 \n" // B 16 bytes -> 8 shorts.
- "vpaddl.u8 q2, q2 \n" // G 16 bytes -> 8 shorts.
- "vpaddl.u8 q1, q1 \n" // R 16 bytes -> 8 shorts.
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
+ "uaddlp v0.8h, v3.16b \n" // B 16 bytes -> 8 shorts.
+ "uaddlp v3.8h, v2.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v1.16b \n" // R 16 bytes -> 8 shorts.
MEMACCESS(1)
- "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more BGRA pixels.
- MEMACCESS(1)
- "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 BGRA pixels.
- "vpadal.u8 q3, q7 \n" // B 16 bytes -> 8 shorts.
- "vpadal.u8 q2, q6 \n" // G 16 bytes -> 8 shorts.
- "vpadal.u8 q1, q5 \n" // R 16 bytes -> 8 shorts.
+ "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more
+ "uadalp v0.8h, v7.16b \n" // B 16 bytes -> 8 shorts.
+ "uadalp v3.8h, v6.16b \n" // G 16 bytes -> 8 shorts.
+ "uadalp v2.8h, v5.16b \n" // R 16 bytes -> 8 shorts.
- "vrshr.u16 q1, q1, #1 \n" // 2x average
- "vrshr.u16 q2, q2, #1 \n"
- "vrshr.u16 q3, q3, #1 \n"
+ "urshr v0.8h, v0.8h, #1 \n" // 2x average
+ "urshr v1.8h, v3.8h, #1 \n"
+ "urshr v2.8h, v2.8h, #1 \n"
"subs %4, %4, #16 \n" // 32 processed per loop.
- RGBTOUV(q3, q2, q1)
+ RGBTOUV(v0.8h, v1.8h, v2.8h)
MEMACCESS(2)
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
MEMACCESS(3)
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
: "+r"(src_bgra), // %0
- "+r"(src_stride_bgra), // %1
+ "+r"(src_bgra_1), // %1
"+r"(dst_u), // %2
"+r"(dst_v), // %3
"+r"(pix) // %4
:
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+ "v20", "v21", "v22", "v23", "v24", "v25"
);
}
#endif // HAS_BGRATOUVROW_NEON
@@ -1964,50 +1763,40 @@ void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra,
#ifdef HAS_ABGRTOUVROW_NEON
void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr,
uint8* dst_u, uint8* dst_v, int pix) {
+ const uint8* src_abgr_1 = src_abgr + src_stride_abgr;
asm volatile (
- "add %1, %0, %1 \n" // src_stride + src_abgr
- "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
- "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
- "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
- "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
- "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- ".p2align 2 \n"
+ RGBTOUV_SETUP_REG
"1: \n"
MEMACCESS(0)
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ABGR pixels.
- MEMACCESS(0)
- "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ABGR pixels.
- "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts.
- "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
- "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts.
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
+ "uaddlp v3.8h, v2.16b \n" // B 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v1.8h, v0.16b \n" // R 16 bytes -> 8 shorts.
MEMACCESS(1)
- "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ABGR pixels.
- MEMACCESS(1)
- "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ABGR pixels.
- "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts.
- "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
- "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts.
+ "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more.
+ "uadalp v3.8h, v6.16b \n" // B 16 bytes -> 8 shorts.
+ "uadalp v2.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
+ "uadalp v1.8h, v4.16b \n" // R 16 bytes -> 8 shorts.
- "vrshr.u16 q0, q0, #1 \n" // 2x average
- "vrshr.u16 q1, q1, #1 \n"
- "vrshr.u16 q2, q2, #1 \n"
+ "urshr v0.8h, v3.8h, #1 \n" // 2x average
+ "urshr v2.8h, v2.8h, #1 \n"
+ "urshr v1.8h, v1.8h, #1 \n"
"subs %4, %4, #16 \n" // 32 processed per loop.
- RGBTOUV(q2, q1, q0)
+ RGBTOUV(v0.8h, v2.8h, v1.8h)
MEMACCESS(2)
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
MEMACCESS(3)
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
: "+r"(src_abgr), // %0
- "+r"(src_stride_abgr), // %1
+ "+r"(src_abgr_1), // %1
"+r"(dst_u), // %2
"+r"(dst_v), // %3
"+r"(pix) // %4
:
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+ "v20", "v21", "v22", "v23", "v24", "v25"
);
}
#endif // HAS_ABGRTOUVROW_NEON
@@ -2015,50 +1804,40 @@ void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr,
#ifdef HAS_RGBATOUVROW_NEON
void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba,
uint8* dst_u, uint8* dst_v, int pix) {
+ const uint8* src_rgba_1 = src_rgba + src_stride_rgba;
asm volatile (
- "add %1, %0, %1 \n" // src_stride + src_rgba
- "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
- "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
- "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
- "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
- "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- ".p2align 2 \n"
+ RGBTOUV_SETUP_REG
"1: \n"
MEMACCESS(0)
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 RGBA pixels.
- MEMACCESS(0)
- "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 RGBA pixels.
- "vpaddl.u8 q0, q1 \n" // B 16 bytes -> 8 shorts.
- "vpaddl.u8 q1, q2 \n" // G 16 bytes -> 8 shorts.
- "vpaddl.u8 q2, q3 \n" // R 16 bytes -> 8 shorts.
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
+ "uaddlp v0.8h, v1.16b \n" // B 16 bytes -> 8 shorts.
+ "uaddlp v1.8h, v2.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v3.16b \n" // R 16 bytes -> 8 shorts.
MEMACCESS(1)
- "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more RGBA pixels.
- MEMACCESS(1)
- "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 RGBA pixels.
- "vpadal.u8 q0, q5 \n" // B 16 bytes -> 8 shorts.
- "vpadal.u8 q1, q6 \n" // G 16 bytes -> 8 shorts.
- "vpadal.u8 q2, q7 \n" // R 16 bytes -> 8 shorts.
+ "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more.
+ "uadalp v0.8h, v5.16b \n" // B 16 bytes -> 8 shorts.
+ "uadalp v1.8h, v6.16b \n" // G 16 bytes -> 8 shorts.
+ "uadalp v2.8h, v7.16b \n" // R 16 bytes -> 8 shorts.
- "vrshr.u16 q0, q0, #1 \n" // 2x average
- "vrshr.u16 q1, q1, #1 \n"
- "vrshr.u16 q2, q2, #1 \n"
+ "urshr v0.8h, v0.8h, #1 \n" // 2x average
+ "urshr v1.8h, v1.8h, #1 \n"
+ "urshr v2.8h, v2.8h, #1 \n"
"subs %4, %4, #16 \n" // 32 processed per loop.
- RGBTOUV(q0, q1, q2)
+ RGBTOUV(v0.8h, v1.8h, v2.8h)
MEMACCESS(2)
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
MEMACCESS(3)
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
: "+r"(src_rgba), // %0
- "+r"(src_stride_rgba), // %1
+ "+r"(src_rgba_1), // %1
"+r"(dst_u), // %2
"+r"(dst_v), // %3
"+r"(pix) // %4
:
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+ "v20", "v21", "v22", "v23", "v24", "v25"
);
}
#endif // HAS_RGBATOUVROW_NEON
@@ -2066,50 +1845,40 @@ void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba,
#ifdef HAS_RGB24TOUVROW_NEON
void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24,
uint8* dst_u, uint8* dst_v, int pix) {
+ const uint8* src_rgb24_1 = src_rgb24 + src_stride_rgb24;
asm volatile (
- "add %1, %0, %1 \n" // src_stride + src_rgb24
- "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
- "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
- "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
- "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
- "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- ".p2align 2 \n"
+ RGBTOUV_SETUP_REG
"1: \n"
MEMACCESS(0)
- "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB24 pixels.
- MEMACCESS(0)
- "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RGB24 pixels.
- "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
- "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
- "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
- MEMACCESS(1)
- "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RGB24 pixels.
+ "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 pixels.
+ "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
+ "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
MEMACCESS(1)
- "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RGB24 pixels.
- "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
- "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
- "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts.
+ "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 16 more.
+ "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
+ "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
+ "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
- "vrshr.u16 q0, q0, #1 \n" // 2x average
- "vrshr.u16 q1, q1, #1 \n"
- "vrshr.u16 q2, q2, #1 \n"
+ "urshr v0.8h, v0.8h, #1 \n" // 2x average
+ "urshr v1.8h, v1.8h, #1 \n"
+ "urshr v2.8h, v2.8h, #1 \n"
"subs %4, %4, #16 \n" // 32 processed per loop.
- RGBTOUV(q0, q1, q2)
+ RGBTOUV(v0.8h, v1.8h, v2.8h)
MEMACCESS(2)
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
MEMACCESS(3)
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
: "+r"(src_rgb24), // %0
- "+r"(src_stride_rgb24), // %1
+ "+r"(src_rgb24_1), // %1
"+r"(dst_u), // %2
"+r"(dst_v), // %3
"+r"(pix) // %4
:
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+ "v20", "v21", "v22", "v23", "v24", "v25"
);
}
#endif // HAS_RGB24TOUVROW_NEON
@@ -2117,50 +1886,40 @@ void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24,
#ifdef HAS_RAWTOUVROW_NEON
void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw,
uint8* dst_u, uint8* dst_v, int pix) {
+ const uint8* src_raw_1 = src_raw + src_stride_raw;
asm volatile (
- "add %1, %0, %1 \n" // src_stride + src_raw
- "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
- "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
- "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
- "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
- "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- ".p2align 2 \n"
+ RGBTOUV_SETUP_REG
"1: \n"
MEMACCESS(0)
- "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RAW pixels.
- MEMACCESS(0)
- "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RAW pixels.
- "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts.
- "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
- "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts.
- MEMACCESS(1)
- "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RAW pixels.
+ "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 8 RAW pixels.
+ "uaddlp v2.8h, v2.16b \n" // B 16 bytes -> 8 shorts.
+ "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v0.8h, v0.16b \n" // R 16 bytes -> 8 shorts.
MEMACCESS(1)
- "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RAW pixels.
- "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts.
- "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
- "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts.
+ "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 8 more RAW pixels
+ "uadalp v2.8h, v6.16b \n" // B 16 bytes -> 8 shorts.
+ "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
+ "uadalp v0.8h, v4.16b \n" // R 16 bytes -> 8 shorts.
- "vrshr.u16 q0, q0, #1 \n" // 2x average
- "vrshr.u16 q1, q1, #1 \n"
- "vrshr.u16 q2, q2, #1 \n"
+ "urshr v2.8h, v2.8h, #1 \n" // 2x average
+ "urshr v1.8h, v1.8h, #1 \n"
+ "urshr v0.8h, v0.8h, #1 \n"
"subs %4, %4, #16 \n" // 32 processed per loop.
- RGBTOUV(q2, q1, q0)
+ RGBTOUV(v2.8h, v1.8h, v0.8h)
MEMACCESS(2)
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
MEMACCESS(3)
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
: "+r"(src_raw), // %0
- "+r"(src_stride_raw), // %1
+ "+r"(src_raw_1), // %1
"+r"(dst_u), // %2
"+r"(dst_v), // %3
"+r"(pix) // %4
:
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+ "v20", "v21", "v22", "v23", "v24", "v25"
);
}
#endif // HAS_RAWTOUVROW_NEON
@@ -2169,70 +1928,74 @@ void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw,
#ifdef HAS_RGB565TOUVROW_NEON
void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565,
uint8* dst_u, uint8* dst_v, int pix) {
+ const uint8* src_rgb565_1 = src_rgb565 + src_stride_rgb565;
asm volatile (
- "add %1, %0, %1 \n" // src_stride + src_argb
- "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
- "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
- "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
- "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
- "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- ".p2align 2 \n"
+ "movi v22.8h, #56, lsl #0 \n" // UB / VR coeff (0.875) / 2
+ "movi v23.8h, #37, lsl #0 \n" // UG coeff (-0.5781) / 2
+ "movi v24.8h, #19, lsl #0 \n" // UR coeff (-0.2969) / 2
+ "movi v25.8h, #9 , lsl #0 \n" // VB coeff (-0.1406) / 2
+ "movi v26.8h, #47, lsl #0 \n" // VG coeff (-0.7344) / 2
+ "movi v27.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit)
"1: \n"
MEMACCESS(0)
- "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
RGB565TOARGB
- "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
- "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
- "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uaddlp v18.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uaddlp v20.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
MEMACCESS(0)
- "vld1.8 {q0}, [%0]! \n" // next 8 RGB565 pixels.
+ "ld1 {v0.16b}, [%0], #16 \n" // next 8 RGB565 pixels.
RGB565TOARGB
- "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
- "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
- "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+ "uaddlp v17.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uaddlp v19.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uaddlp v21.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
MEMACCESS(1)
- "vld1.8 {q0}, [%1]! \n" // load 8 RGB565 pixels.
+ "ld1 {v0.16b}, [%1], #16 \n" // load 8 RGB565 pixels.
RGB565TOARGB
- "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
- "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
- "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uadalp v18.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uadalp v20.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
MEMACCESS(1)
- "vld1.8 {q0}, [%1]! \n" // next 8 RGB565 pixels.
+ "ld1 {v0.16b}, [%1], #16 \n" // next 8 RGB565 pixels.
RGB565TOARGB
- "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
- "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
- "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+ "uadalp v17.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uadalp v19.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uadalp v21.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
- "vrshr.u16 q4, q4, #1 \n" // 2x average
- "vrshr.u16 q5, q5, #1 \n"
- "vrshr.u16 q6, q6, #1 \n"
+ "ins v16.D[1], v17.D[0] \n"
+ "ins v18.D[1], v19.D[0] \n"
+ "ins v20.D[1], v21.D[0] \n"
+
+ "urshr v4.8h, v16.8h, #1 \n" // 2x average
+ "urshr v5.8h, v18.8h, #1 \n"
+ "urshr v6.8h, v20.8h, #1 \n"
"subs %4, %4, #16 \n" // 16 processed per loop.
- "vmul.s16 q8, q4, q10 \n" // B
- "vmls.s16 q8, q5, q11 \n" // G
- "vmls.s16 q8, q6, q12 \n" // R
- "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned
- "vmul.s16 q9, q6, q10 \n" // R
- "vmls.s16 q9, q5, q14 \n" // G
- "vmls.s16 q9, q4, q13 \n" // B
- "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
- "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
- "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
+ "mul v16.8h, v4.8h, v22.8h \n" // B
+ "mls v16.8h, v5.8h, v23.8h \n" // G
+ "mls v16.8h, v6.8h, v24.8h \n" // R
+ "add v16.8h, v16.8h, v27.8h \n" // +128 -> unsigned
+ "mul v17.8h, v6.8h, v22.8h \n" // R
+ "mls v17.8h, v5.8h, v26.8h \n" // G
+ "mls v17.8h, v4.8h, v25.8h \n" // B
+ "add v17.8h, v17.8h, v27.8h \n" // +128 -> unsigned
+ "uqshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit U
+ "uqshrn v1.8b, v17.8h, #8 \n" // 16 bit to 8 bit V
MEMACCESS(2)
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
MEMACCESS(3)
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
: "+r"(src_rgb565), // %0
- "+r"(src_stride_rgb565), // %1
+ "+r"(src_rgb565_1), // %1
"+r"(dst_u), // %2
"+r"(dst_v), // %3
"+r"(pix) // %4
:
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+ "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24",
+ "v25", "v26", "v27"
);
}
#endif // HAS_RGB565TOUVROW_NEON
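+// Note: the 16-bit-source UV paths unpack 8 pixels at a time into 4h partial
+// sums, then "ins Vd.D[1], Vn.D[0]" splices two 4h halves into one 8h vector
+// before the rounding average and the U/V multiply.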
@@ -2241,70 +2004,69 @@ void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565,
#ifdef HAS_ARGB1555TOUVROW_NEON
void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555,
uint8* dst_u, uint8* dst_v, int pix) {
+ const uint8* src_argb1555_1 = src_argb1555 + src_stride_argb1555;
asm volatile (
- "add %1, %0, %1 \n" // src_stride + src_argb
- "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
- "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
- "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
- "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
- "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- ".p2align 2 \n"
+ RGBTOUV_SETUP_REG
"1: \n"
MEMACCESS(0)
- "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
RGB555TOARGB
- "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
- "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
- "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
MEMACCESS(0)
- "vld1.8 {q0}, [%0]! \n" // next 8 ARGB1555 pixels.
+ "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB1555 pixels.
RGB555TOARGB
- "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
- "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
- "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+ "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
MEMACCESS(1)
- "vld1.8 {q0}, [%1]! \n" // load 8 ARGB1555 pixels.
+ "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB1555 pixels.
RGB555TOARGB
- "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
- "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
- "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
MEMACCESS(1)
- "vld1.8 {q0}, [%1]! \n" // next 8 ARGB1555 pixels.
+ "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB1555 pixels.
RGB555TOARGB
- "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
- "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
- "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+ "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
+
+ "ins v16.D[1], v26.D[0] \n"
+ "ins v17.D[1], v27.D[0] \n"
+ "ins v18.D[1], v28.D[0] \n"
- "vrshr.u16 q4, q4, #1 \n" // 2x average
- "vrshr.u16 q5, q5, #1 \n"
- "vrshr.u16 q6, q6, #1 \n"
+ "urshr v4.8h, v16.8h, #1 \n" // 2x average
+ "urshr v5.8h, v17.8h, #1 \n"
+ "urshr v6.8h, v18.8h, #1 \n"
"subs %4, %4, #16 \n" // 16 processed per loop.
- "vmul.s16 q8, q4, q10 \n" // B
- "vmls.s16 q8, q5, q11 \n" // G
- "vmls.s16 q8, q6, q12 \n" // R
- "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned
- "vmul.s16 q9, q6, q10 \n" // R
- "vmls.s16 q9, q5, q14 \n" // G
- "vmls.s16 q9, q4, q13 \n" // B
- "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
- "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
- "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
+ "mul v2.8h, v4.8h, v20.8h \n" // B
+ "mls v2.8h, v5.8h, v21.8h \n" // G
+ "mls v2.8h, v6.8h, v22.8h \n" // R
+ "add v2.8h, v2.8h, v25.8h \n" // +128 -> unsigned
+ "mul v3.8h, v6.8h, v20.8h \n" // R
+ "mls v3.8h, v5.8h, v24.8h \n" // G
+ "mls v3.8h, v4.8h, v23.8h \n" // B
+ "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned
+ "uqshrn v0.8b, v2.8h, #8 \n" // 16 bit to 8 bit U
+ "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V
MEMACCESS(2)
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
MEMACCESS(3)
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
: "+r"(src_argb1555), // %0
- "+r"(src_stride_argb1555), // %1
+ "+r"(src_argb1555_1), // %1
"+r"(dst_u), // %2
"+r"(dst_v), // %3
"+r"(pix) // %4
:
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
+ "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
+ "v26", "v27", "v28"
);
}
#endif // HAS_ARGB1555TOUVROW_NEON
@@ -2313,70 +2075,70 @@ void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555,
#ifdef HAS_ARGB4444TOUVROW_NEON
void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444,
uint8* dst_u, uint8* dst_v, int pix) {
+ const uint8* src_argb4444_1 = src_argb4444 + src_stride_argb4444;
asm volatile (
- "add %1, %0, %1 \n" // src_stride + src_argb
- "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
- "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
- "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
- "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
- "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
- "vmov.u16 q15, #0x8080 \n" // 128.5
- ".p2align 2 \n"
+ RGBTOUV_SETUP_REG
"1: \n"
MEMACCESS(0)
- "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
ARGB4444TOARGB
- "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
- "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
- "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
MEMACCESS(0)
- "vld1.8 {q0}, [%0]! \n" // next 8 ARGB4444 pixels.
+ "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB4444 pixels.
ARGB4444TOARGB
- "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
- "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
- "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+ "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
MEMACCESS(1)
- "vld1.8 {q0}, [%1]! \n" // load 8 ARGB4444 pixels.
+ "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB4444 pixels.
ARGB4444TOARGB
- "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
- "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
- "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
MEMACCESS(1)
- "vld1.8 {q0}, [%1]! \n" // next 8 ARGB4444 pixels.
+ "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB4444 pixels.
ARGB4444TOARGB
- "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
- "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
- "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+ "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
+ "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
+ "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
- "vrshr.u16 q4, q4, #1 \n" // 2x average
- "vrshr.u16 q5, q5, #1 \n"
- "vrshr.u16 q6, q6, #1 \n"
+ "ins v16.D[1], v26.D[0] \n"
+ "ins v17.D[1], v27.D[0] \n"
+ "ins v18.D[1], v28.D[0] \n"
+
+ "urshr v4.8h, v16.8h, #1 \n" // 2x average
+ "urshr v5.8h, v17.8h, #1 \n"
+ "urshr v6.8h, v18.8h, #1 \n"
"subs %4, %4, #16 \n" // 16 processed per loop.
- "vmul.s16 q8, q4, q10 \n" // B
- "vmls.s16 q8, q5, q11 \n" // G
- "vmls.s16 q8, q6, q12 \n" // R
- "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned
- "vmul.s16 q9, q6, q10 \n" // R
- "vmls.s16 q9, q5, q14 \n" // G
- "vmls.s16 q9, q4, q13 \n" // B
- "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
- "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
- "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
+ "mul v2.8h, v4.8h, v20.8h \n" // B
+ "mls v2.8h, v5.8h, v21.8h \n" // G
+ "mls v2.8h, v6.8h, v22.8h \n" // R
+ "add v2.8h, v2.8h, v25.8h \n" // +128 -> unsigned
+ "mul v3.8h, v6.8h, v20.8h \n" // R
+ "mls v3.8h, v5.8h, v24.8h \n" // G
+ "mls v3.8h, v4.8h, v23.8h \n" // B
+ "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned
+ "uqshrn v0.8b, v2.8h, #8 \n" // 16 bit to 8 bit U
+ "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V
MEMACCESS(2)
- "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
MEMACCESS(3)
- "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
- "bgt 1b \n"
+ "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
: "+r"(src_argb4444), // %0
- "+r"(src_stride_argb4444), // %1
+ "+r"(src_argb4444_1), // %1
"+r"(dst_u), // %2
"+r"(dst_v), // %3
"+r"(pix) // %4
:
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
- "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
+ "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
+ "v26", "v27", "v28"
+
);
}
#endif // HAS_ARGB4444TOUVROW_NEON
@@ -2384,29 +2146,29 @@ void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444,
#ifdef HAS_RGB565TOYROW_NEON
void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix) {
asm volatile (
- "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient
- "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient
- "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient
- "vmov.u8 d27, #16 \n" // Add 16 constant
- ".p2align 2 \n"
+ "movi v24.8b, #13 \n" // B * 0.1016 coefficient
+ "movi v25.8b, #65 \n" // G * 0.5078 coefficient
+ "movi v26.8b, #33 \n" // R * 0.2578 coefficient
+ "movi v27.8b, #16 \n" // Add 16 constant
"1: \n"
MEMACCESS(0)
- "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
RGB565TOARGB
- "vmull.u8 q2, d0, d24 \n" // B
- "vmlal.u8 q2, d1, d25 \n" // G
- "vmlal.u8 q2, d2, d26 \n" // R
- "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y
- "vqadd.u8 d0, d27 \n"
- MEMACCESS(1)
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
+ "umull v3.8h, v0.8b, v24.8b \n" // B
+ "umlal v3.8h, v1.8b, v25.8b \n" // G
+ "umlal v3.8h, v2.8b, v26.8b \n" // R
+ "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y
+ "uqadd v0.8b, v0.8b, v27.8b \n"
+ MEMACCESS(1)
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
: "+r"(src_rgb565), // %0
"+r"(dst_y), // %1
"+r"(pix) // %2
:
- : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6",
+ "v24", "v25", "v26", "v27"
);
}
#endif // HAS_RGB565TOYROW_NEON
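
For reference, a scalar sketch of the Y math above, with the coefficients and rounding taken from the code comments (the helper name is hypothetical). The same 13/65/33 weights recur in every ToY row that follows; only the unpack step and channel order change:

// Scalar sketch: expand RGB565 to full-range 8-bit, then
// Y = ((13*B + 65*G + 33*R + 64) >> 7) + 16; sqrshrun #7 supplies the
// +64 rounding term and uqadd saturates the +16.
static unsigned char RGB565ToY_sketch(unsigned short p) {
  int b = (p & 0x1f) << 3, g = ((p >> 5) & 0x3f) << 2, r = (p >> 11) << 3;
  b |= b >> 5; g |= g >> 6; r |= r >> 5;  /* replicate top bits */
  int y = ((13 * b + 65 * g + 33 * r + 64) >> 7) + 16;
  return (unsigned char)(y > 255 ? 255 : y);
}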
@@ -2414,29 +2176,28 @@ void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix) {
#ifdef HAS_ARGB1555TOYROW_NEON
void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix) {
asm volatile (
- "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient
- "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient
- "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient
- "vmov.u8 d27, #16 \n" // Add 16 constant
- ".p2align 2 \n"
+ "movi v4.8b, #13 \n" // B * 0.1016 coefficient
+ "movi v5.8b, #65 \n" // G * 0.5078 coefficient
+ "movi v6.8b, #33 \n" // R * 0.2578 coefficient
+ "movi v7.8b, #16 \n" // Add 16 constant
"1: \n"
MEMACCESS(0)
- "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
ARGB1555TOARGB
- "vmull.u8 q2, d0, d24 \n" // B
- "vmlal.u8 q2, d1, d25 \n" // G
- "vmlal.u8 q2, d2, d26 \n" // R
- "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y
- "vqadd.u8 d0, d27 \n"
- MEMACCESS(1)
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
+ "umull v3.8h, v0.8b, v4.8b \n" // B
+ "umlal v3.8h, v1.8b, v5.8b \n" // G
+ "umlal v3.8h, v2.8b, v6.8b \n" // R
+ "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y
+ "uqadd v0.8b, v0.8b, v7.8b \n"
+ MEMACCESS(1)
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
: "+r"(src_argb1555), // %0
"+r"(dst_y), // %1
"+r"(pix) // %2
:
- : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
);
}
#endif // HAS_ARGB1555TOYROW_NEON
@@ -2444,29 +2205,28 @@ void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix) {
#ifdef HAS_ARGB4444TOYROW_NEON
void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix) {
asm volatile (
- "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient
- "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient
- "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient
- "vmov.u8 d27, #16 \n" // Add 16 constant
- ".p2align 2 \n"
+ "movi v24.8b, #13 \n" // B * 0.1016 coefficient
+ "movi v25.8b, #65 \n" // G * 0.5078 coefficient
+ "movi v26.8b, #33 \n" // R * 0.2578 coefficient
+ "movi v27.8b, #16 \n" // Add 16 constant
"1: \n"
MEMACCESS(0)
- "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
+ "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
ARGB4444TOARGB
- "vmull.u8 q2, d0, d24 \n" // B
- "vmlal.u8 q2, d1, d25 \n" // G
- "vmlal.u8 q2, d2, d26 \n" // R
- "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y
- "vqadd.u8 d0, d27 \n"
- MEMACCESS(1)
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
+ "umull v3.8h, v0.8b, v24.8b \n" // B
+ "umlal v3.8h, v1.8b, v25.8b \n" // G
+ "umlal v3.8h, v2.8b, v26.8b \n" // R
+ "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y
+ "uqadd v0.8b, v0.8b, v27.8b \n"
+ MEMACCESS(1)
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
: "+r"(src_argb4444), // %0
"+r"(dst_y), // %1
"+r"(pix) // %2
:
- : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v24", "v25", "v26", "v27"
);
}
#endif // HAS_ARGB4444TOYROW_NEON
@@ -2474,28 +2234,27 @@ void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix) {
#ifdef HAS_BGRATOYROW_NEON
void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix) {
asm volatile (
- "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient
- "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
- "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient
- "vmov.u8 d7, #16 \n" // Add 16 constant
- ".p2align 2 \n"
+ "movi v4.8b, #33 \n" // R * 0.2578 coefficient
+ "movi v5.8b, #65 \n" // G * 0.5078 coefficient
+ "movi v6.8b, #13 \n" // B * 0.1016 coefficient
+ "movi v7.8b, #16 \n" // Add 16 constant
"1: \n"
MEMACCESS(0)
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of BGRA.
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
- "vmull.u8 q8, d1, d4 \n" // R
- "vmlal.u8 q8, d2, d5 \n" // G
- "vmlal.u8 q8, d3, d6 \n" // B
- "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
- "vqadd.u8 d0, d7 \n"
- MEMACCESS(1)
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
+ "umull v16.8h, v1.8b, v4.8b \n" // R
+ "umlal v16.8h, v2.8b, v5.8b \n" // G
+ "umlal v16.8h, v3.8b, v6.8b \n" // B
+ "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
+ "uqadd v0.8b, v0.8b, v7.8b \n"
+ MEMACCESS(1)
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
: "+r"(src_bgra), // %0
"+r"(dst_y), // %1
"+r"(pix) // %2
:
- : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
);
}
#endif // HAS_BGRATOYROW_NEON
@@ -2503,28 +2262,27 @@ void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix) {
#ifdef HAS_ABGRTOYROW_NEON
void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix) {
asm volatile (
- "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient
- "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
- "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient
- "vmov.u8 d7, #16 \n" // Add 16 constant
- ".p2align 2 \n"
+ "movi v4.8b, #33 \n" // R * 0.2578 coefficient
+ "movi v5.8b, #65 \n" // G * 0.5078 coefficient
+ "movi v6.8b, #13 \n" // B * 0.1016 coefficient
+ "movi v7.8b, #16 \n" // Add 16 constant
"1: \n"
MEMACCESS(0)
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ABGR.
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
- "vmull.u8 q8, d0, d4 \n" // R
- "vmlal.u8 q8, d1, d5 \n" // G
- "vmlal.u8 q8, d2, d6 \n" // B
- "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
- "vqadd.u8 d0, d7 \n"
- MEMACCESS(1)
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
+ "umull v16.8h, v0.8b, v4.8b \n" // R
+ "umlal v16.8h, v1.8b, v5.8b \n" // G
+ "umlal v16.8h, v2.8b, v6.8b \n" // B
+ "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
+ "uqadd v0.8b, v0.8b, v7.8b \n"
+ MEMACCESS(1)
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
: "+r"(src_abgr), // %0
- "+r"(dst_y), // %1
+ "+r"(dst_y), // %1
"+r"(pix) // %2
:
- : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
);
}
#endif // HAS_ABGRTOYROW_NEON
@@ -2532,28 +2290,27 @@ void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix) {
#ifdef HAS_RGBATOYROW_NEON
void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix) {
asm volatile (
- "vmov.u8 d4, #13 \n" // B * 0.1016 coefficient
- "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
- "vmov.u8 d6, #33 \n" // R * 0.2578 coefficient
- "vmov.u8 d7, #16 \n" // Add 16 constant
- ".p2align 2 \n"
+ "movi v4.8b, #13 \n" // B * 0.1016 coefficient
+ "movi v5.8b, #65 \n" // G * 0.5078 coefficient
+ "movi v6.8b, #33 \n" // R * 0.2578 coefficient
+ "movi v7.8b, #16 \n" // Add 16 constant
"1: \n"
MEMACCESS(0)
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of RGBA.
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
- "vmull.u8 q8, d1, d4 \n" // B
- "vmlal.u8 q8, d2, d5 \n" // G
- "vmlal.u8 q8, d3, d6 \n" // R
- "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
- "vqadd.u8 d0, d7 \n"
- MEMACCESS(1)
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
+ "umull v16.8h, v1.8b, v4.8b \n" // B
+ "umlal v16.8h, v2.8b, v5.8b \n" // G
+ "umlal v16.8h, v3.8b, v6.8b \n" // R
+ "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
+ "uqadd v0.8b, v0.8b, v7.8b \n"
+ MEMACCESS(1)
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
: "+r"(src_rgba), // %0
- "+r"(dst_y), // %1
+ "+r"(dst_y), // %1
"+r"(pix) // %2
:
- : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
);
}
#endif // HAS_RGBATOYROW_NEON
@@ -2561,28 +2318,27 @@ void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix) {
#ifdef HAS_RGB24TOYROW_NEON
void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix) {
asm volatile (
- "vmov.u8 d4, #13 \n" // B * 0.1016 coefficient
- "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
- "vmov.u8 d6, #33 \n" // R * 0.2578 coefficient
- "vmov.u8 d7, #16 \n" // Add 16 constant
- ".p2align 2 \n"
+ "movi v4.8b, #13 \n" // B * 0.1016 coefficient
+ "movi v5.8b, #65 \n" // G * 0.5078 coefficient
+ "movi v6.8b, #33 \n" // R * 0.2578 coefficient
+ "movi v7.8b, #16 \n" // Add 16 constant
"1: \n"
MEMACCESS(0)
- "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RGB24.
+ "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
- "vmull.u8 q8, d0, d4 \n" // B
- "vmlal.u8 q8, d1, d5 \n" // G
- "vmlal.u8 q8, d2, d6 \n" // R
- "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
- "vqadd.u8 d0, d7 \n"
- MEMACCESS(1)
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
+ "umull v16.8h, v0.8b, v4.8b \n" // B
+ "umlal v16.8h, v1.8b, v5.8b \n" // G
+ "umlal v16.8h, v2.8b, v6.8b \n" // R
+ "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
+ "uqadd v0.8b, v0.8b, v7.8b \n"
+ MEMACCESS(1)
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
: "+r"(src_rgb24), // %0
- "+r"(dst_y), // %1
- "+r"(pix) // %2
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
:
- : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
);
}
#endif // HAS_RGB24TOYROW_NEON
@@ -2590,28 +2346,27 @@ void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix) {
#ifdef HAS_RAWTOYROW_NEON
void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) {
asm volatile (
- "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient
- "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
- "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient
- "vmov.u8 d7, #16 \n" // Add 16 constant
- ".p2align 2 \n"
+ "movi v4.8b, #33 \n" // R * 0.2578 coefficient
+ "movi v5.8b, #65 \n" // G * 0.5078 coefficient
+ "movi v6.8b, #13 \n" // B * 0.1016 coefficient
+ "movi v7.8b, #16 \n" // Add 16 constant
"1: \n"
MEMACCESS(0)
- "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RAW.
+ "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
- "vmull.u8 q8, d0, d4 \n" // B
- "vmlal.u8 q8, d1, d5 \n" // G
- "vmlal.u8 q8, d2, d6 \n" // R
- "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
- "vqadd.u8 d0, d7 \n"
- MEMACCESS(1)
- "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
- "bgt 1b \n"
+ "umull v16.8h, v0.8b, v4.8b \n" // B
+ "umlal v16.8h, v1.8b, v5.8b \n" // G
+ "umlal v16.8h, v2.8b, v6.8b \n" // R
+ "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
+ "uqadd v0.8b, v0.8b, v7.8b \n"
+ MEMACCESS(1)
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
+ "b.gt 1b \n"
: "+r"(src_raw), // %0
- "+r"(dst_y), // %1
- "+r"(pix) // %2
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
:
- : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
);
}
#endif // HAS_RAWTOYROW_NEON
@@ -2621,96 +2376,98 @@ void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) {
void InterpolateRow_NEON(uint8* dst_ptr,
const uint8* src_ptr, ptrdiff_t src_stride,
int dst_width, int source_y_fraction) {
+ int y1_fraction = source_y_fraction;
+ int y0_fraction = 256 - y1_fraction;
+ const uint8* src_ptr1 = src_ptr + src_stride;
asm volatile (
"cmp %4, #0 \n"
- "beq 100f \n"
- "add %2, %1 \n"
+ "b.eq 100f \n"
"cmp %4, #64 \n"
- "beq 75f \n"
+ "b.eq 75f \n"
"cmp %4, #128 \n"
- "beq 50f \n"
+ "b.eq 50f \n"
"cmp %4, #192 \n"
- "beq 25f \n"
+ "b.eq 25f \n"
- "vdup.8 d5, %4 \n"
- "rsb %4, #256 \n"
- "vdup.8 d4, %4 \n"
+ "dup v5.16b, %w4 \n"
+ "dup v4.16b, %w5 \n"
// General purpose row blend.
"1: \n"
MEMACCESS(1)
- "vld1.8 {q0}, [%1]! \n"
+ "ld1 {v0.16b}, [%1], #16 \n"
MEMACCESS(2)
- "vld1.8 {q1}, [%2]! \n"
+ "ld1 {v1.16b}, [%2], #16 \n"
"subs %3, %3, #16 \n"
- "vmull.u8 q13, d0, d4 \n"
- "vmull.u8 q14, d1, d4 \n"
- "vmlal.u8 q13, d2, d5 \n"
- "vmlal.u8 q14, d3, d5 \n"
- "vrshrn.u16 d0, q13, #8 \n"
- "vrshrn.u16 d1, q14, #8 \n"
- MEMACCESS(0)
- "vst1.8 {q0}, [%0]! \n"
- "bgt 1b \n"
+ "umull v2.8h, v0.8b, v4.8b \n"
+ "umull2 v3.8h, v0.16b, v4.16b \n"
+ "umlal v2.8h, v1.8b, v5.8b \n"
+ "umlal2 v3.8h, v1.16b, v5.16b \n"
+ "rshrn v0.8b, v2.8h, #8 \n"
+ "rshrn2 v0.16b, v3.8h, #8 \n"
+ MEMACCESS(0)
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 1b \n"
"b 99f \n"
// Blend 25 / 75.
"25: \n"
MEMACCESS(1)
- "vld1.8 {q0}, [%1]! \n"
+ "ld1 {v0.16b}, [%1], #16 \n"
MEMACCESS(2)
- "vld1.8 {q1}, [%2]! \n"
+ "ld1 {v1.16b}, [%2], #16 \n"
"subs %3, %3, #16 \n"
- "vrhadd.u8 q0, q1 \n"
- "vrhadd.u8 q0, q1 \n"
+ "urhadd v0.16b, v0.16b, v1.16b \n"
+ "urhadd v0.16b, v0.16b, v1.16b \n"
MEMACCESS(0)
- "vst1.8 {q0}, [%0]! \n"
- "bgt 25b \n"
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 25b \n"
"b 99f \n"
// Blend 50 / 50.
"50: \n"
MEMACCESS(1)
- "vld1.8 {q0}, [%1]! \n"
+ "ld1 {v0.16b}, [%1], #16 \n"
MEMACCESS(2)
- "vld1.8 {q1}, [%2]! \n"
+ "ld1 {v1.16b}, [%2], #16 \n"
"subs %3, %3, #16 \n"
- "vrhadd.u8 q0, q1 \n"
+ "urhadd v0.16b, v0.16b, v1.16b \n"
MEMACCESS(0)
- "vst1.8 {q0}, [%0]! \n"
- "bgt 50b \n"
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 50b \n"
"b 99f \n"
// Blend 75 / 25.
"75: \n"
MEMACCESS(1)
- "vld1.8 {q1}, [%1]! \n"
+ "ld1 {v1.16b}, [%1], #16 \n"
MEMACCESS(2)
- "vld1.8 {q0}, [%2]! \n"
+ "ld1 {v0.16b}, [%2], #16 \n"
"subs %3, %3, #16 \n"
- "vrhadd.u8 q0, q1 \n"
- "vrhadd.u8 q0, q1 \n"
+ "urhadd v0.16b, v0.16b, v1.16b \n"
+ "urhadd v0.16b, v0.16b, v1.16b \n"
MEMACCESS(0)
- "vst1.8 {q0}, [%0]! \n"
- "bgt 75b \n"
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 75b \n"
"b 99f \n"
// Blend 100 / 0 - Copy row unchanged.
"100: \n"
MEMACCESS(1)
- "vld1.8 {q0}, [%1]! \n"
+ "ld1 {v0.16b}, [%1], #16 \n"
"subs %3, %3, #16 \n"
MEMACCESS(0)
- "vst1.8 {q0}, [%0]! \n"
- "bgt 100b \n"
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 100b \n"
"99: \n"
: "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1
- "+r"(src_stride), // %2
+ "+r"(src_ptr1), // %2
"+r"(dst_width), // %3
- "+r"(source_y_fraction) // %4
+ "+r"(y1_fraction), // %4
+ "+r"(y0_fraction) // %5
:
- : "cc", "memory", "q0", "q1", "d4", "d5", "q13", "q14"
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5"
);
}
#endif // HAS_INTERPOLATEROW_NEON
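
A scalar sketch of the general blend path (the 0/25/50/75/100% fractions are short-circuited above to plain copies and rounding halving adds); the function name is illustrative:

// Scalar sketch for 0 < f < 256:
// out = (row0*(256 - f) + row1*f + 128) >> 8, matching the umull/umlal
// pairs and the rounding narrow rshrn #8.
static void InterpolateRow_sketch(unsigned char* dst, const unsigned char* s0,
                                  const unsigned char* s1, int width, int f) {
  for (int i = 0; i < width; ++i) {
    dst[i] = (unsigned char)((s0[i] * (256 - f) + s1[i] * f + 128) >> 8);
  }
}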
@@ -2720,55 +2477,59 @@ void InterpolateRow_NEON(uint8* dst_ptr,
void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width) {
asm volatile (
- "subs %3, #8 \n"
- "blt 89f \n"
+ "subs %3, %3, #8 \n"
+ "b.lt 89f \n"
// Blend 8 pixels.
"8: \n"
MEMACCESS(0)
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB0.
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB0 pixels
MEMACCESS(1)
- "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 pixels of ARGB1.
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 ARGB1 pixels
"subs %3, %3, #8 \n" // 8 processed per loop.
- "vmull.u8 q10, d4, d3 \n" // db * a
- "vmull.u8 q11, d5, d3 \n" // dg * a
- "vmull.u8 q12, d6, d3 \n" // dr * a
- "vqrshrn.u16 d20, q10, #8 \n" // db >>= 8
- "vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8
- "vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8
- "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256
- "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256
- "vqadd.u8 q0, q0, q2 \n" // + sbg
- "vqadd.u8 d2, d2, d6 \n" // + sr
- "vmov.u8 d3, #255 \n" // a = 255
+ "umull v16.8h, v4.8b, v3.8b \n" // db * a
+ "umull v17.8h, v5.8b, v3.8b \n" // dg * a
+ "umull v18.8h, v6.8b, v3.8b \n" // dr * a
+ "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8
+ "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8
+ "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8
+ "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256)
+ "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256)
+ "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256)
+ "uqadd v0.8b, v0.8b, v4.8b \n" // + sb
+ "uqadd v1.8b, v1.8b, v5.8b \n" // + sg
+ "uqadd v2.8b, v2.8b, v6.8b \n" // + sr
+ "movi v3.8b, #255 \n" // a = 255
MEMACCESS(2)
- "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 pixels of ARGB.
- "bge 8b \n"
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels
+ "b.ge 8b \n"
"89: \n"
- "adds %3, #8-1 \n"
- "blt 99f \n"
+ "adds %3, %3, #8-1 \n"
+ "b.lt 99f \n"
// Blend 1 pixels.
"1: \n"
MEMACCESS(0)
- "vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [%0]! \n" // load 1 pixel ARGB0.
+ "ld4 {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n" // load 1 pixel ARGB0.
MEMACCESS(1)
- "vld4.8 {d4[0],d5[0],d6[0],d7[0]}, [%1]! \n" // load 1 pixel ARGB1.
+ "ld4 {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n" // load 1 pixel ARGB1.
"subs %3, %3, #1 \n" // 1 processed per loop.
- "vmull.u8 q10, d4, d3 \n" // db * a
- "vmull.u8 q11, d5, d3 \n" // dg * a
- "vmull.u8 q12, d6, d3 \n" // dr * a
- "vqrshrn.u16 d20, q10, #8 \n" // db >>= 8
- "vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8
- "vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8
- "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256
- "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256
- "vqadd.u8 q0, q0, q2 \n" // + sbg
- "vqadd.u8 d2, d2, d6 \n" // + sr
- "vmov.u8 d3, #255 \n" // a = 255
+ "umull v16.8h, v4.8b, v3.8b \n" // db * a
+ "umull v17.8h, v5.8b, v3.8b \n" // dg * a
+ "umull v18.8h, v6.8b, v3.8b \n" // dr * a
+ "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8
+ "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8
+ "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8
+ "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256)
+ "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256)
+ "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256)
+ "uqadd v0.8b, v0.8b, v4.8b \n" // + sb
+ "uqadd v1.8b, v1.8b, v5.8b \n" // + sg
+ "uqadd v2.8b, v2.8b, v6.8b \n" // + sr
+ "movi v3.8b, #255 \n" // a = 255
MEMACCESS(2)
- "vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [%2]! \n" // store 1 pixel.
- "bge 1b \n"
+ "st4 {v0.b,v1.b,v2.b,v3.b}[0], [%2], #4 \n" // store 1 pixel.
+ "b.ge 1b \n"
"99: \n"
@@ -2777,7 +2538,8 @@ void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
"+r"(dst_argb), // %2
"+r"(width) // %3
:
- : "cc", "memory", "q0", "q1", "q2", "q3", "q10", "q11", "q12"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+ "v16", "v17", "v18"
);
}
#endif // HAS_ARGBBLENDROW_NEON
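
Per channel, the blend above is a source-over composite with 8-bit alpha. A hedged scalar sketch (helper name assumed):

// out = sat(s + (d - sat((d*a + 128) >> 8))), with the output alpha
// forced to 255 afterward. uqrshrn #8 is the rounding shift; uqsub and
// uqadd saturate at 0 and 255 respectively.
static unsigned char BlendChannel_sketch(int s, int d, int a) {
  int da = (d * a + 128) >> 8;
  int out = s + (d > da ? d - da : 0);
  return (unsigned char)(out > 255 ? 255 : out);
}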
@@ -2789,22 +2551,22 @@ void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
// Attenuate 8 pixels.
"1: \n"
MEMACCESS(0)
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB.
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels
"subs %2, %2, #8 \n" // 8 processed per loop.
- "vmull.u8 q10, d0, d3 \n" // b * a
- "vmull.u8 q11, d1, d3 \n" // g * a
- "vmull.u8 q12, d2, d3 \n" // r * a
- "vqrshrn.u16 d0, q10, #8 \n" // b >>= 8
- "vqrshrn.u16 d1, q11, #8 \n" // g >>= 8
- "vqrshrn.u16 d2, q12, #8 \n" // r >>= 8
- MEMACCESS(1)
- "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
- "bgt 1b \n"
+ "umull v4.8h, v0.8b, v3.8b \n" // b * a
+ "umull v5.8h, v1.8b, v3.8b \n" // g * a
+ "umull v6.8h, v2.8b, v3.8b \n" // r * a
+ "uqrshrn v0.8b, v4.8h, #8 \n" // b >>= 8
+ "uqrshrn v1.8b, v5.8h, #8 \n" // g >>= 8
+ "uqrshrn v2.8b, v6.8h, #8 \n" // r >>= 8
+ MEMACCESS(1)
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
:
- : "cc", "memory", "q0", "q1", "q10", "q11", "q12"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"
);
}
#endif // HAS_ARGBATTENUATEROW_NEON
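
The attenuate row premultiplies each color channel by alpha; a one-channel scalar sketch (illustrative name):

// c' = (c * a + 128) >> 8, i.e. roughly c * a / 256 with rounding.
// uqrshrn #8 saturates, though the product never exceeds 255 here.
static unsigned char Attenuate_sketch(int c, int a) {
  int v = (c * a + 128) >> 8;
  return (unsigned char)(v > 255 ? 255 : v);
}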
@@ -2815,41 +2577,40 @@ void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size,
int interval_offset, int width) {
asm volatile (
- "vdup.u16 q8, %2 \n"
- "vshr.u16 q8, q8, #1 \n" // scale >>= 1
- "vdup.u16 q9, %3 \n" // interval multiply.
- "vdup.u16 q10, %4 \n" // interval add
+ "dup v4.8h, %w2 \n"
+ "ushr v4.8h, v4.8h, #1 \n" // scale >>= 1
+ "dup v5.8h, %w3 \n" // interval multiply.
+ "dup v6.8h, %w4 \n" // interval add
// 8 pixel loop.
- ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "vld4.8 {d0, d2, d4, d6}, [%0] \n" // load 8 pixels of ARGB.
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 pixels of ARGB.
"subs %1, %1, #8 \n" // 8 processed per loop.
- "vmovl.u8 q0, d0 \n" // b (0 .. 255)
- "vmovl.u8 q1, d2 \n"
- "vmovl.u8 q2, d4 \n"
- "vqdmulh.s16 q0, q0, q8 \n" // b * scale
- "vqdmulh.s16 q1, q1, q8 \n" // g
- "vqdmulh.s16 q2, q2, q8 \n" // r
- "vmul.u16 q0, q0, q9 \n" // b * interval_size
- "vmul.u16 q1, q1, q9 \n" // g
- "vmul.u16 q2, q2, q9 \n" // r
- "vadd.u16 q0, q0, q10 \n" // b + interval_offset
- "vadd.u16 q1, q1, q10 \n" // g
- "vadd.u16 q2, q2, q10 \n" // r
- "vqmovn.u16 d0, q0 \n"
- "vqmovn.u16 d2, q1 \n"
- "vqmovn.u16 d4, q2 \n"
- MEMACCESS(0)
- "vst4.8 {d0, d2, d4, d6}, [%0]! \n" // store 8 pixels of ARGB.
- "bgt 1b \n"
+ "uxtl v0.8h, v0.8b \n" // b (0 .. 255)
+ "uxtl v1.8h, v1.8b \n"
+ "uxtl v2.8h, v2.8b \n"
+ "sqdmulh v0.8h, v0.8h, v4.8h \n" // b * scale
+ "sqdmulh v1.8h, v1.8h, v4.8h \n" // g
+ "sqdmulh v2.8h, v2.8h, v4.8h \n" // r
+ "mul v0.8h, v0.8h, v5.8h \n" // b * interval_size
+ "mul v1.8h, v1.8h, v5.8h \n" // g
+ "mul v2.8h, v2.8h, v5.8h \n" // r
+ "add v0.8h, v0.8h, v6.8h \n" // b + interval_offset
+ "add v1.8h, v1.8h, v6.8h \n" // g
+ "add v2.8h, v2.8h, v6.8h \n" // r
+ "uqxtn v0.8b, v0.8h \n"
+ "uqxtn v1.8b, v1.8h \n"
+ "uqxtn v2.8b, v2.8h \n"
+ MEMACCESS(0)
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 ARGB pixels
+ "b.gt 1b \n"
: "+r"(dst_argb), // %0
"+r"(width) // %1
: "r"(scale), // %2
"r"(interval_size), // %3
"r"(interval_offset) // %4
- : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"
);
}
#endif // HAS_ARGBQUANTIZEROW_NEON
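
Quantize snaps each color channel onto an interval grid; sqdmulh with the pre-halved scale is effectively a (c * scale) >> 16 multiply. A scalar sketch under that reading (name assumed):

// v = ((c * scale) >> 16) * interval_size + interval_offset, clamped as
// uqxtn would; alpha (v3) is stored back untouched.
static unsigned char Quantize_sketch(int c, int scale, int size, int offset) {
  int v = ((c * scale) >> 16) * size + offset;
  return (unsigned char)(v > 255 ? 255 : v);
}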
@@ -2861,36 +2622,35 @@ void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size,
void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width,
uint32 value) {
asm volatile (
- "vdup.u32 q0, %3 \n" // duplicate scale value.
- "vzip.u8 d0, d1 \n" // d0 aarrggbb.
- "vshr.u16 q0, q0, #1 \n" // scale / 2.
+ "dup v0.4s, %w3 \n" // duplicate scale value.
+ "zip1 v0.8b, v0.8b, v0.8b \n" // v0.8b aarrggbb.
+ "ushr v0.8h, v0.8h, #1 \n" // scale / 2.
// 8 pixel loop.
- ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "vld4.8 {d20, d22, d24, d26}, [%0]! \n" // load 8 pixels of ARGB.
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n" // load 8 ARGB pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
- "vmovl.u8 q10, d20 \n" // b (0 .. 255)
- "vmovl.u8 q11, d22 \n"
- "vmovl.u8 q12, d24 \n"
- "vmovl.u8 q13, d26 \n"
- "vqrdmulh.s16 q10, q10, d0[0] \n" // b * scale * 2
- "vqrdmulh.s16 q11, q11, d0[1] \n" // g
- "vqrdmulh.s16 q12, q12, d0[2] \n" // r
- "vqrdmulh.s16 q13, q13, d0[3] \n" // a
- "vqmovn.u16 d20, q10 \n"
- "vqmovn.u16 d22, q11 \n"
- "vqmovn.u16 d24, q12 \n"
- "vqmovn.u16 d26, q13 \n"
- MEMACCESS(1)
- "vst4.8 {d20, d22, d24, d26}, [%1]! \n" // store 8 pixels of ARGB.
- "bgt 1b \n"
+ "uxtl v4.8h, v4.8b \n" // b (0 .. 255)
+ "uxtl v5.8h, v5.8b \n"
+ "uxtl v6.8h, v6.8b \n"
+ "uxtl v7.8h, v7.8b \n"
+ "sqrdmulh v4.8h, v4.8h, v0.h[0] \n" // b * scale * 2
+ "sqrdmulh v5.8h, v5.8h, v0.h[1] \n" // g
+ "sqrdmulh v6.8h, v6.8h, v0.h[2] \n" // r
+ "sqrdmulh v7.8h, v7.8h, v0.h[3] \n" // a
+ "uqxtn v4.8b, v4.8h \n"
+ "uqxtn v5.8b, v5.8h \n"
+ "uqxtn v6.8b, v6.8h \n"
+ "uqxtn v7.8b, v7.8h \n"
+ MEMACCESS(1)
+ "st4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // store 8 ARGB pixels
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
: "r"(value) // %3
- : "cc", "memory", "q0", "q10", "q11", "q12", "q13"
+ : "cc", "memory", "v0", "v4", "v5", "v6", "v7"
);
}
#endif // HAS_ARGBSHADEROW_NEON
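
Shade scales all four channels by the bytes of value; the zip1/ushr/sqrdmulh sequence works out to roughly c * v / 255 per channel, alpha included. A scalar approximation (hypothetical helper):

// zip1 builds v*0x101 in each 16-bit lane, ushr #1 halves it, and
// sqrdmulh(c, x) computes (2*c*x + 0x8000) >> 16 -- about c*v/255.
static unsigned char Shade_sketch(int c, int v) {
  int out = (c * v * 0x101 + 0x8000) >> 16;
  return (unsigned char)(out > 255 ? 255 : out);
}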
@@ -2901,28 +2661,27 @@ void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width,
#ifdef HAS_ARGBGRAYROW_NEON
void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
asm volatile (
- "vmov.u8 d24, #15 \n" // B * 0.11400 coefficient
- "vmov.u8 d25, #75 \n" // G * 0.58700 coefficient
- "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient
- ".p2align 2 \n"
+ "movi v24.8b, #15 \n" // B * 0.11400 coefficient
+ "movi v25.8b, #75 \n" // G * 0.58700 coefficient
+ "movi v26.8b, #38 \n" // R * 0.29900 coefficient
"1: \n"
MEMACCESS(0)
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
- "vmull.u8 q2, d0, d24 \n" // B
- "vmlal.u8 q2, d1, d25 \n" // G
- "vmlal.u8 q2, d2, d26 \n" // R
- "vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit B
- "vmov d1, d0 \n" // G
- "vmov d2, d0 \n" // R
- MEMACCESS(1)
- "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 ARGB pixels.
- "bgt 1b \n"
+ "umull v4.8h, v0.8b, v24.8b \n" // B
+ "umlal v4.8h, v1.8b, v25.8b \n" // G
+ "umlal v4.8h, v2.8b, v26.8b \n" // R
+ "sqrshrun v0.8b, v4.8h, #7 \n" // 15 bit to 8 bit B
+ "orr v1.8b, v0.8b, v0.8b \n" // G
+ "orr v2.8b, v0.8b, v0.8b \n" // R
+ MEMACCESS(1)
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 pixels.
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
:
- : "cc", "memory", "q0", "q1", "q2", "q12", "q13"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26"
);
}
#endif // HAS_ARGBGRAYROW_NEON
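
The gray row uses JPEG-range luma weights (15, 75, 38, which sum to 128) and writes the result to B, G and R alike, leaving alpha untouched. A scalar sketch:

// gray = (15*b + 75*g + 38*r + 64) >> 7; because the weights sum to 128
// the result stays in 0..255 and the sqrshrun saturation never fires.
static unsigned char Gray_sketch(int b, int g, int r) {
  return (unsigned char)((15 * b + 75 * g + 38 * r + 64) >> 7);
}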
@@ -2935,40 +2694,39 @@ void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
#ifdef HAS_ARGBSEPIAROW_NEON
void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
asm volatile (
- "vmov.u8 d20, #17 \n" // BB coefficient
- "vmov.u8 d21, #68 \n" // BG coefficient
- "vmov.u8 d22, #35 \n" // BR coefficient
- "vmov.u8 d24, #22 \n" // GB coefficient
- "vmov.u8 d25, #88 \n" // GG coefficient
- "vmov.u8 d26, #45 \n" // GR coefficient
- "vmov.u8 d28, #24 \n" // BB coefficient
- "vmov.u8 d29, #98 \n" // BG coefficient
- "vmov.u8 d30, #50 \n" // BR coefficient
- ".p2align 2 \n"
+ "movi v20.8b, #17 \n" // BB coefficient
+ "movi v21.8b, #68 \n" // BG coefficient
+ "movi v22.8b, #35 \n" // BR coefficient
+ "movi v24.8b, #22 \n" // GB coefficient
+ "movi v25.8b, #88 \n" // GG coefficient
+ "movi v26.8b, #45 \n" // GR coefficient
+    "movi       v28.8b, #24                     \n"  // RB coefficient
+    "movi       v29.8b, #98                     \n"  // RG coefficient
+    "movi       v30.8b, #50                     \n"  // RR coefficient
"1: \n"
MEMACCESS(0)
- "vld4.8 {d0, d1, d2, d3}, [%0] \n" // load 8 ARGB pixels.
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB pixels.
"subs %1, %1, #8 \n" // 8 processed per loop.
- "vmull.u8 q2, d0, d20 \n" // B to Sepia B
- "vmlal.u8 q2, d1, d21 \n" // G
- "vmlal.u8 q2, d2, d22 \n" // R
- "vmull.u8 q3, d0, d24 \n" // B to Sepia G
- "vmlal.u8 q3, d1, d25 \n" // G
- "vmlal.u8 q3, d2, d26 \n" // R
- "vmull.u8 q8, d0, d28 \n" // B to Sepia R
- "vmlal.u8 q8, d1, d29 \n" // G
- "vmlal.u8 q8, d2, d30 \n" // R
- "vqshrn.u16 d0, q2, #7 \n" // 16 bit to 8 bit B
- "vqshrn.u16 d1, q3, #7 \n" // 16 bit to 8 bit G
- "vqshrn.u16 d2, q8, #7 \n" // 16 bit to 8 bit R
- MEMACCESS(0)
- "vst4.8 {d0, d1, d2, d3}, [%0]! \n" // store 8 ARGB pixels.
- "bgt 1b \n"
+ "umull v4.8h, v0.8b, v20.8b \n" // B to Sepia B
+ "umlal v4.8h, v1.8b, v21.8b \n" // G
+ "umlal v4.8h, v2.8b, v22.8b \n" // R
+ "umull v5.8h, v0.8b, v24.8b \n" // B to Sepia G
+ "umlal v5.8h, v1.8b, v25.8b \n" // G
+ "umlal v5.8h, v2.8b, v26.8b \n" // R
+ "umull v6.8h, v0.8b, v28.8b \n" // B to Sepia R
+ "umlal v6.8h, v1.8b, v29.8b \n" // G
+ "umlal v6.8h, v2.8b, v30.8b \n" // R
+ "uqshrn v0.8b, v4.8h, #7 \n" // 16 bit to 8 bit B
+ "uqshrn v1.8b, v5.8h, #7 \n" // 16 bit to 8 bit G
+ "uqshrn v2.8b, v6.8h, #7 \n" // 16 bit to 8 bit R
+ MEMACCESS(0)
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 pixels.
+ "b.gt 1b \n"
: "+r"(dst_argb), // %0
"+r"(width) // %1
:
- : "cc", "memory", "q0", "q1", "q2", "q3",
- "q10", "q11", "q12", "q13", "q14", "q15"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+ "v20", "v21", "v22", "v24", "v25", "v26", "v28", "v29", "v30"
);
}
#endif // HAS_ARGBSEPIAROW_NEON
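
Sepia is a 3x3 matrix in 7-bit fixed point applied in place, with unsigned saturation and no rounding term (uqshrn, not uqrshrn). A scalar sketch on one BGRA pixel (names assumed):

static void Sepia_sketch(unsigned char* p) {  /* p = B,G,R,A */
  int b = p[0], g = p[1], r = p[2];
  int nb = (17 * b + 68 * g + 35 * r) >> 7;   /* G and R row weights  */
  int ng = (22 * b + 88 * g + 45 * r) >> 7;   /* sum past 128, so the */
  int nr = (24 * b + 98 * g + 50 * r) >> 7;   /* clamps below matter  */
  p[0] = (unsigned char)(nb > 255 ? 255 : nb);
  p[1] = (unsigned char)(ng > 255 ? 255 : ng);
  p[2] = (unsigned char)(nr > 255 ? 255 : nr);
}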
@@ -2981,60 +2739,59 @@ void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
const int8* matrix_argb, int width) {
asm volatile (
MEMACCESS(3)
- "vld1.8 {q2}, [%3] \n" // load 3 ARGB vectors.
- "vmovl.s8 q0, d4 \n" // B,G coefficients s16.
- "vmovl.s8 q1, d5 \n" // R,A coefficients s16.
+    "ld1        {v2.16b}, [%3]                  \n"  // load 16 matrix coefficients.
+ "sxtl v0.8h, v2.8b \n" // B,G coefficients s16.
+ "sxtl2 v1.8h, v2.16b \n" // R,A coefficients s16.
- ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "vld4.8 {d16, d18, d20, d22}, [%0]! \n" // load 8 ARGB pixels.
+ "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
- "vmovl.u8 q8, d16 \n" // b (0 .. 255) 16 bit
- "vmovl.u8 q9, d18 \n" // g
- "vmovl.u8 q10, d20 \n" // r
- "vmovl.u8 q15, d22 \n" // a
- "vmul.s16 q12, q8, d0[0] \n" // B = B * Matrix B
- "vmul.s16 q13, q8, d1[0] \n" // G = B * Matrix G
- "vmul.s16 q14, q8, d2[0] \n" // R = B * Matrix R
- "vmul.s16 q15, q8, d3[0] \n" // A = B * Matrix A
- "vmul.s16 q4, q9, d0[1] \n" // B += G * Matrix B
- "vmul.s16 q5, q9, d1[1] \n" // G += G * Matrix G
- "vmul.s16 q6, q9, d2[1] \n" // R += G * Matrix R
- "vmul.s16 q7, q9, d3[1] \n" // A += G * Matrix A
- "vqadd.s16 q12, q12, q4 \n" // Accumulate B
- "vqadd.s16 q13, q13, q5 \n" // Accumulate G
- "vqadd.s16 q14, q14, q6 \n" // Accumulate R
- "vqadd.s16 q15, q15, q7 \n" // Accumulate A
- "vmul.s16 q4, q10, d0[2] \n" // B += R * Matrix B
- "vmul.s16 q5, q10, d1[2] \n" // G += R * Matrix G
- "vmul.s16 q6, q10, d2[2] \n" // R += R * Matrix R
- "vmul.s16 q7, q10, d3[2] \n" // A += R * Matrix A
- "vqadd.s16 q12, q12, q4 \n" // Accumulate B
- "vqadd.s16 q13, q13, q5 \n" // Accumulate G
- "vqadd.s16 q14, q14, q6 \n" // Accumulate R
- "vqadd.s16 q15, q15, q7 \n" // Accumulate A
- "vmul.s16 q4, q15, d0[3] \n" // B += A * Matrix B
- "vmul.s16 q5, q15, d1[3] \n" // G += A * Matrix G
- "vmul.s16 q6, q15, d2[3] \n" // R += A * Matrix R
- "vmul.s16 q7, q15, d3[3] \n" // A += A * Matrix A
- "vqadd.s16 q12, q12, q4 \n" // Accumulate B
- "vqadd.s16 q13, q13, q5 \n" // Accumulate G
- "vqadd.s16 q14, q14, q6 \n" // Accumulate R
- "vqadd.s16 q15, q15, q7 \n" // Accumulate A
- "vqshrun.s16 d16, q12, #6 \n" // 16 bit to 8 bit B
- "vqshrun.s16 d18, q13, #6 \n" // 16 bit to 8 bit G
- "vqshrun.s16 d20, q14, #6 \n" // 16 bit to 8 bit R
- "vqshrun.s16 d22, q15, #6 \n" // 16 bit to 8 bit A
- MEMACCESS(1)
- "vst4.8 {d16, d18, d20, d22}, [%1]! \n" // store 8 ARGB pixels.
- "bgt 1b \n"
+ "uxtl v16.8h, v16.8b \n" // b (0 .. 255) 16 bit
+ "uxtl v17.8h, v17.8b \n" // g
+ "uxtl v18.8h, v18.8b \n" // r
+ "uxtl v19.8h, v19.8b \n" // a
+ "mul v22.8h, v16.8h, v0.h[0] \n" // B = B * Matrix B
+ "mul v23.8h, v16.8h, v0.h[4] \n" // G = B * Matrix G
+ "mul v24.8h, v16.8h, v1.h[0] \n" // R = B * Matrix R
+ "mul v25.8h, v16.8h, v1.h[4] \n" // A = B * Matrix A
+ "mul v4.8h, v17.8h, v0.h[1] \n" // B += G * Matrix B
+ "mul v5.8h, v17.8h, v0.h[5] \n" // G += G * Matrix G
+ "mul v6.8h, v17.8h, v1.h[1] \n" // R += G * Matrix R
+ "mul v7.8h, v17.8h, v1.h[5] \n" // A += G * Matrix A
+ "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B
+ "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G
+ "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R
+ "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A
+ "mul v4.8h, v18.8h, v0.h[2] \n" // B += R * Matrix B
+ "mul v5.8h, v18.8h, v0.h[6] \n" // G += R * Matrix G
+ "mul v6.8h, v18.8h, v1.h[2] \n" // R += R * Matrix R
+ "mul v7.8h, v18.8h, v1.h[6] \n" // A += R * Matrix A
+ "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B
+ "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G
+ "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R
+ "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A
+ "mul v4.8h, v19.8h, v0.h[3] \n" // B += A * Matrix B
+ "mul v5.8h, v19.8h, v0.h[7] \n" // G += A * Matrix G
+ "mul v6.8h, v19.8h, v1.h[3] \n" // R += A * Matrix R
+ "mul v7.8h, v19.8h, v1.h[7] \n" // A += A * Matrix A
+ "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B
+ "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G
+ "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R
+ "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A
+ "sqshrun v16.8b, v22.8h, #6 \n" // 16 bit to 8 bit B
+ "sqshrun v17.8b, v23.8h, #6 \n" // 16 bit to 8 bit G
+ "sqshrun v18.8b, v24.8h, #6 \n" // 16 bit to 8 bit R
+ "sqshrun v19.8b, v25.8h, #6 \n" // 16 bit to 8 bit A
+ MEMACCESS(1)
+ "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n" // store 8 pixels.
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
: "r"(matrix_argb) // %3
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9",
- "q10", "q11", "q12", "q13", "q14", "q15"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17",
+ "v18", "v19", "v22", "v23", "v24", "v25"
);
}
#endif // HAS_ARGBCOLORMATRIXROW_NEON
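
ColorMatrix generalizes the sepia case: matrix_argb holds 16 signed bytes in 6-bit fixed point, rows ordered B, G, R, A, each row carrying the weights for the input (b, g, r, a). A scalar sketch of one output channel (helper name assumed):

// out = sat((row[0]*b + row[1]*g + row[2]*r + row[3]*a) >> 6), matching
// sqshrun #6; the NEON version accumulates with sqadd, so intermediates
// saturate at s16 range as well.
static unsigned char MatrixChannel_sketch(const signed char* row,
                                          int b, int g, int r, int a) {
  int v = (row[0] * b + row[1] * g + row[2] * r + row[3] * a) >> 6;
  return (unsigned char)(v < 0 ? 0 : v > 255 ? 255 : v);
}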
@@ -3046,12 +2803,11 @@ void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width) {
asm volatile (
// 8 pixel loop.
- ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
MEMACCESS(1)
- "ld4 {v4.8b-v7.8b}, [%1], #32 \n" // load 8 more ARGB pixels.
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels.
"subs %3, %3, #8 \n" // 8 processed per loop.
"umull v0.8h, v0.8b, v4.8b \n" // multiply B
"umull v1.8h, v1.8b, v5.8b \n" // multiply G
@@ -3062,8 +2818,8 @@ void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
"rshrn v2.8b, v2.8h, #8 \n" // 16 bit to 8 bit R
"rshrn v3.8b, v3.8h, #8 \n" // 16 bit to 8 bit A
MEMACCESS(2)
- "st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 ARGB pixels.
- "bgt 1b \n"
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels
+ "b.gt 1b \n"
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
@@ -3081,20 +2837,19 @@ void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width) {
asm volatile (
// 8 pixel loop.
- ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
MEMACCESS(1)
- "ld4 {v4.8b-v7.8b}, [%1], #32 \n" // load 8 more ARGB pixels.
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels.
"subs %3, %3, #8 \n" // 8 processed per loop.
"uqadd v0.8b, v0.8b, v4.8b \n"
"uqadd v1.8b, v1.8b, v5.8b \n"
"uqadd v2.8b, v2.8b, v6.8b \n"
"uqadd v3.8b, v3.8b, v7.8b \n"
MEMACCESS(2)
- "st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 ARGB pixels.
- "bgt 1b \n"
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels
+ "b.gt 1b \n"
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
@@ -3112,20 +2867,19 @@ void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width) {
asm volatile (
// 8 pixel loop.
- ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
MEMACCESS(1)
- "ld4 {v4.8b-v7.8b}, [%1], #32 \n" // load 8 more ARGB pixels.
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels.
"subs %3, %3, #8 \n" // 8 processed per loop.
"uqsub v0.8b, v0.8b, v4.8b \n"
"uqsub v1.8b, v1.8b, v5.8b \n"
"uqsub v2.8b, v2.8b, v6.8b \n"
"uqsub v3.8b, v3.8b, v7.8b \n"
MEMACCESS(2)
- "st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 ARGB pixels.
- "bgt 1b \n"
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels
+ "b.gt 1b \n"
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
@@ -3148,7 +2902,6 @@ void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
asm volatile (
"movi v3.8b, #255 \n" // alpha
// 8 pixel loop.
- ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"ld1 {v0.8b}, [%0], #8 \n" // load 8 sobelx.
@@ -3156,11 +2909,11 @@ void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
"ld1 {v1.8b}, [%1], #8 \n" // load 8 sobely.
"subs %3, %3, #8 \n" // 8 processed per loop.
"uqadd v0.8b, v0.8b, v1.8b \n" // add
- "mov v1.8b, v0.8b \n"
- "mov v2.8b, v0.8b \n"
+ "orr v1.8b, v0.8b, v0.8b \n"
+ "orr v2.8b, v0.8b, v0.8b \n"
MEMACCESS(2)
- "st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 ARGB pixels.
- "bgt 1b \n"
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels
+ "b.gt 1b \n"
: "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1
"+r"(dst_argb), // %2
@@ -3177,7 +2930,6 @@ void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
uint8* dst_y, int width) {
asm volatile (
// 16 pixel loop.
- ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx.
@@ -3187,7 +2939,7 @@ void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
"uqadd v0.16b, v0.16b, v1.16b \n" // add
MEMACCESS(2)
"st1 {v0.16b}, [%2], #16 \n" // store 16 pixels.
- "bgt 1b \n"
+ "b.gt 1b \n"
: "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1
"+r"(dst_y), // %2
@@ -3209,7 +2961,6 @@ void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
asm volatile (
"movi v3.8b, #255 \n" // alpha
// 8 pixel loop.
- ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"ld1 {v2.8b}, [%0], #8 \n" // load 8 sobelx.
@@ -3218,8 +2969,8 @@ void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
"subs %3, %3, #8 \n" // 8 processed per loop.
"uqadd v1.8b, v0.8b, v2.8b \n" // add
MEMACCESS(2)
- "st4 {v0.8b-v3.8b}, [%2], #32 \n" // store 8 ARGB pixels.
- "bgt 1b \n"
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels
+ "b.gt 1b \n"
: "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1
"+r"(dst_argb), // %2
@@ -3238,7 +2989,6 @@ void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
const uint8* src_y2, uint8* dst_sobelx, int width) {
asm volatile (
- ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"ld1 {v0.8b}, [%0],%5 \n" // top
@@ -3263,7 +3013,7 @@ void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
"uqxtn v0.8b, v0.8h \n"
MEMACCESS(3)
"st1 {v0.8b}, [%3], #8 \n" // store 8 sobelx
- "bgt 1b \n"
+ "b.gt 1b \n"
: "+r"(src_y0), // %0
"+r"(src_y1), // %1
"+r"(src_y2), // %2
@@ -3284,7 +3034,6 @@ void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
uint8* dst_sobely, int width) {
asm volatile (
- ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
"ld1 {v0.8b}, [%0],%4 \n" // left
@@ -3309,7 +3058,7 @@ void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
"uqxtn v0.8b, v0.8h \n"
MEMACCESS(2)
"st1 {v0.8b}, [%2], #8 \n" // store 8 sobely
- "bgt 1b \n"
+ "b.gt 1b \n"
: "+r"(src_y0), // %0
"+r"(src_y1), // %1
"+r"(dst_sobely), // %2
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/row_posix.cc b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/row_posix.cc
index 106fda56891..1a6f7dc4dd0 100644
--- a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/row_posix.cc
+++ b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/row_posix.cc
@@ -1,3 +1,4 @@
+// VERSION 2
/*
* Copyright 2011 The LibYuv Project Authors. All rights reserved.
*
@@ -92,6 +93,7 @@ static uvec8 kAddY16 = {
16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
};
+// 7 bit fixed point 0.5.
static vec16 kAddYJ64 = {
64, 64, 64, 64, 64, 64, 64, 64
};
@@ -221,7 +223,7 @@ void TestRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
"1: \n"
"movq " MEMACCESS(0) ",%%xmm0 \n"
"lea " MEMLEA(0x8,0) ",%0 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x20,1) ",%1 \n"
"sub $0x8,%2 \n"
"jg 1b \n"
@@ -229,10 +231,7 @@ void TestRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
"+r"(dst_argb), // %1
"+r"(pix) // %2
:
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm5"
-#endif
+ : "memory", "cc", "xmm0", "xmm1", "xmm5"
);
}
#endif // TESTING
@@ -252,37 +251,6 @@ void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
"punpckhwd %%xmm1,%%xmm1 \n"
"por %%xmm5,%%xmm0 \n"
"por %%xmm5,%%xmm1 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
- "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n"
- "lea " MEMLEA(0x20,1) ",%1 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
- : "+r"(src_y), // %0
- "+r"(dst_argb), // %1
- "+r"(pix) // %2
- :
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm5"
-#endif
- );
-}
-
-void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb,
- int pix) {
- asm volatile (
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pslld $0x18,%%xmm5 \n"
- LABELALIGN
- "1: \n"
- "movq " MEMACCESS(0) ",%%xmm0 \n"
- "lea " MEMLEA(0x8,0) ",%0 \n"
- "punpcklbw %%xmm0,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklwd %%xmm0,%%xmm0 \n"
- "punpckhwd %%xmm1,%%xmm1 \n"
- "por %%xmm5,%%xmm0 \n"
- "por %%xmm5,%%xmm1 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
"lea " MEMLEA(0x20,1) ",%1 \n"
@@ -291,11 +259,7 @@ void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb,
: "+r"(src_y), // %0
"+r"(dst_argb), // %1
"+r"(pix) // %2
- :
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm5"
-#endif
+ :: "memory", "cc", "xmm0", "xmm1", "xmm5"
);
}
#endif // HAS_I400TOARGBROW_SSE2
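
The x86 changes in this file largely follow one pattern: aligned movdqa loads and stores become unaligned movdqu, which is what lets the separate *_Unaligned_* variants (such as I400ToARGBRow_Unaligned_SSE2 above) be deleted outright. In intrinsics terms, roughly:

#include <emmintrin.h>
// movdqa (_mm_store_si128) faults if dst is not 16-byte aligned;
// movdqu (_mm_storeu_si128) accepts any address, and on modern cores the
// aligned-address case costs about the same, so one code path serves both.
static void StoreRow_sketch(unsigned char* dst, __m128i row) {
  _mm_storeu_si128((__m128i*)dst, row);  /* was _mm_store_si128 */
}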
@@ -318,27 +282,24 @@ void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
"por %%xmm5,%%xmm2 \n"
"palignr $0xc,%%xmm0,%%xmm1 \n"
"pshufb %%xmm4,%%xmm0 \n"
- "movdqa %%xmm2," MEMACCESS2(0x20,1) " \n"
+ "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n"
"por %%xmm5,%%xmm0 \n"
"pshufb %%xmm4,%%xmm1 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
"por %%xmm5,%%xmm1 \n"
"palignr $0x4,%%xmm3,%%xmm3 \n"
"pshufb %%xmm4,%%xmm3 \n"
- "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n"
+ "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
"por %%xmm5,%%xmm3 \n"
- "sub $0x10,%2 \n"
- "movdqa %%xmm3," MEMACCESS2(0x30,1) " \n"
+ "movdqu %%xmm3," MEMACCESS2(0x30,1) " \n"
"lea " MEMLEA(0x40,1) ",%1 \n"
+ "sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src_rgb24), // %0
"+r"(dst_argb), // %1
"+r"(pix) // %2
: "m"(kShuffleMaskRGB24ToARGB) // %3
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
+  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
);
}
@@ -359,27 +320,24 @@ void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
"por %%xmm5,%%xmm2 \n"
"palignr $0xc,%%xmm0,%%xmm1 \n"
"pshufb %%xmm4,%%xmm0 \n"
- "movdqa %%xmm2," MEMACCESS2(0x20,1) " \n"
+ "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n"
"por %%xmm5,%%xmm0 \n"
"pshufb %%xmm4,%%xmm1 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
"por %%xmm5,%%xmm1 \n"
"palignr $0x4,%%xmm3,%%xmm3 \n"
"pshufb %%xmm4,%%xmm3 \n"
- "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n"
+ "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
"por %%xmm5,%%xmm3 \n"
- "sub $0x10,%2 \n"
- "movdqa %%xmm3," MEMACCESS2(0x30,1) " \n"
+ "movdqu %%xmm3," MEMACCESS2(0x30,1) " \n"
"lea " MEMLEA(0x40,1) ",%1 \n"
+ "sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src_raw), // %0
"+r"(dst_argb), // %1
"+r"(pix) // %2
: "m"(kShuffleMaskRAWToARGB) // %3
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
);
}
@@ -417,9 +375,8 @@ void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
"movdqa %%xmm1,%%xmm2 \n"
"punpcklbw %%xmm0,%%xmm1 \n"
"punpckhbw %%xmm0,%%xmm2 \n"
- BUNDLEALIGN
- MEMOPMEM(movdqa,xmm1,0x00,1,0,2) // movdqa %%xmm1,(%1,%0,2)
- MEMOPMEM(movdqa,xmm2,0x10,1,0,2) // movdqa %%xmm2,0x10(%1,%0,2)
+ MEMOPMEM(movdqu,xmm1,0x00,1,0,2) // movdqu %%xmm1,(%1,%0,2)
+ MEMOPMEM(movdqu,xmm2,0x10,1,0,2) // movdqu %%xmm2,0x10(%1,%0,2)
"lea " MEMLEA(0x10,0) ",%0 \n"
"sub $0x8,%2 \n"
"jg 1b \n"
@@ -427,13 +384,8 @@ void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
"+r"(dst), // %1
"+r"(pix) // %2
:
- : "memory", "cc", "eax"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-#endif
+ : "memory", "cc", "eax", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
);
}
@@ -474,9 +426,8 @@ void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
"movdqa %%xmm1,%%xmm2 \n"
"punpcklbw %%xmm0,%%xmm1 \n"
"punpckhbw %%xmm0,%%xmm2 \n"
- BUNDLEALIGN
- MEMOPMEM(movdqa,xmm1,0x00,1,0,2) // movdqa %%xmm1,(%1,%0,2)
- MEMOPMEM(movdqa,xmm2,0x10,1,0,2) // movdqa %%xmm2,0x10(%1,%0,2)
+ MEMOPMEM(movdqu,xmm1,0x00,1,0,2) // movdqu %%xmm1,(%1,%0,2)
+ MEMOPMEM(movdqu,xmm2,0x10,1,0,2) // movdqu %%xmm2,0x10(%1,%0,2)
"lea " MEMLEA(0x10,0) ",%0 \n"
"sub $0x8,%2 \n"
"jg 1b \n"
@@ -484,13 +435,8 @@ void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
"+r"(dst), // %1
"+r"(pix) // %2
:
- : "memory", "cc", "eax"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-#endif
+ : "memory", "cc", "eax", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
);
}
@@ -518,9 +464,8 @@ void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
"movdqa %%xmm0,%%xmm1 \n"
"punpcklbw %%xmm2,%%xmm0 \n"
"punpckhbw %%xmm2,%%xmm1 \n"
- BUNDLEALIGN
- MEMOPMEM(movdqa,xmm0,0x00,1,0,2) // movdqa %%xmm0,(%1,%0,2)
- MEMOPMEM(movdqa,xmm1,0x10,1,0,2) // movdqa %%xmm1,0x10(%1,%0,2)
+ MEMOPMEM(movdqu,xmm0,0x00,1,0,2) // movdqu %%xmm0,(%1,%0,2)
+ MEMOPMEM(movdqu,xmm1,0x10,1,0,2) // movdqu %%xmm1,0x10(%1,%0,2)
"lea " MEMLEA(0x10,0) ",%0 \n"
"sub $0x8,%2 \n"
"jg 1b \n"
@@ -528,13 +473,8 @@ void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
"+r"(dst), // %1
"+r"(pix) // %2
:
- : "memory", "cc", "eax"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
+ : "memory", "cc", "eax", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
);
}
@@ -572,10 +512,7 @@ void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) {
"+r"(dst), // %1
"+r"(pix) // %2
: "m"(kShuffleMaskARGBToRGB24) // %3
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
-#endif
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
);
}
@@ -613,10 +550,7 @@ void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int pix) {
"+r"(dst), // %1
"+r"(pix) // %2
: "m"(kShuffleMaskARGBToRAW) // %3
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
-#endif
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
);
}
@@ -631,7 +565,7 @@ void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int pix) {
"pslld $0xb,%%xmm5 \n"
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqa %%xmm0,%%xmm1 \n"
"movdqa %%xmm0,%%xmm2 \n"
"pslld $0x8,%%xmm0 \n"
@@ -652,11 +586,7 @@ void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int pix) {
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(pix) // %2
- :
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
+ :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
);
}
@@ -672,7 +602,7 @@ void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int pix) {
"pslld $0xf,%%xmm7 \n"
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqa %%xmm0,%%xmm1 \n"
"movdqa %%xmm0,%%xmm2 \n"
"movdqa %%xmm0,%%xmm3 \n"
@@ -690,17 +620,14 @@ void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int pix) {
"packssdw %%xmm0,%%xmm0 \n"
"lea " MEMLEA(0x10,0) ",%0 \n"
"movq %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMACCESS2(0x8,1) ",%1 \n"
+ "lea " MEMLEA(0x8,1) ",%1 \n"
"sub $0x4,%2 \n"
"jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(pix) // %2
- :
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-#endif
+ :: "memory", "cc",
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
);
}
@@ -712,7 +639,7 @@ void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) {
"psrlw $0x8,%%xmm3 \n"
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqa %%xmm0,%%xmm1 \n"
"pand %%xmm3,%%xmm0 \n"
"pand %%xmm4,%%xmm1 \n"
@@ -728,57 +655,17 @@ void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) {
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(pix) // %2
- :
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
-#endif
+ :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
);
}
#endif // HAS_RGB24TOARGBROW_SSSE3
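
// Scalar sketch of the packing done by the ARGBTo{RGB565,ARGB1555,ARGB4444}
// rows above, shown for the 565 case: keep the top 5/6/5 bits of B/G/R and
// pack them into one 16-bit value (hypothetical helper for illustration):
static inline unsigned short ARGBPixelToRGB565Sketch(unsigned char b,
                                                     unsigned char g,
                                                     unsigned char r) {
  return (unsigned short)((b >> 3) | ((unsigned)(g >> 2) << 5) |
                          ((unsigned)(r >> 3) << 11));
}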
#ifdef HAS_ARGBTOYROW_SSSE3
+// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
asm volatile (
- "movdqa %4,%%xmm5 \n"
"movdqa %3,%%xmm4 \n"
- LABELALIGN
- "1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm3 \n"
- "lea " MEMLEA(0x40,0) ",%0 \n"
- "phaddw %%xmm1,%%xmm0 \n"
- "phaddw %%xmm3,%%xmm2 \n"
- "psrlw $0x7,%%xmm0 \n"
- "psrlw $0x7,%%xmm2 \n"
- "packuswb %%xmm2,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "sub $0x10,%2 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "jg 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_y), // %1
- "+r"(pix) // %2
- : "m"(kARGBToY), // %3
- "m"(kAddY16) // %4
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
-
-void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
- asm volatile (
"movdqa %4,%%xmm5 \n"
- "movdqa %3,%%xmm4 \n"
LABELALIGN
"1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
@@ -796,34 +683,33 @@ void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
"psrlw $0x7,%%xmm2 \n"
"packuswb %%xmm2,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n"
- "sub $0x10,%2 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
"+r"(pix) // %2
: "m"(kARGBToY), // %3
"m"(kAddY16) // %4
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
);
}
#endif // HAS_ARGBTOYROW_SSSE3
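
// Scalar sketch of one Y output from the row above, assuming libyuv's
// kARGBToY weights {13, 65, 33} for B,G,R (scaled by 128) and the kAddY16
// bias: pmaddubsw forms 13*B+65*G and 33*R+0*A per pixel, phaddw sums the
// pair, psrlw truncates by 7 bits and paddb adds 16.
static inline unsigned char ARGBPixelToYSketch(unsigned char b,
                                               unsigned char g,
                                               unsigned char r) {
  return (unsigned char)(((13 * b + 65 * g + 33 * r) >> 7) + 16);
}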
#ifdef HAS_ARGBTOYJROW_SSSE3
+// Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
+// Same as ARGBToYRow, but with different coefficients, no +16 bias, and
+// rounding added before the shift.
void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
asm volatile (
"movdqa %3,%%xmm4 \n"
"movdqa %4,%%xmm5 \n"
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
"pmaddubsw %%xmm4,%%xmm0 \n"
"pmaddubsw %%xmm4,%%xmm1 \n"
"pmaddubsw %%xmm4,%%xmm2 \n"
@@ -836,158 +722,131 @@ void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
"psrlw $0x7,%%xmm0 \n"
"psrlw $0x7,%%xmm2 \n"
"packuswb %%xmm2,%%xmm0 \n"
- "sub $0x10,%2 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
"+r"(pix) // %2
: "m"(kARGBToYJ), // %3
"m"(kAddYJ64) // %4
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
);
}
+#endif // HAS_ARGBTOYJROW_SSSE3
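
// Scalar sketch of one YJ (full-range JPEG) output, assuming kARGBToYJ
// weights {15, 75, 38} over 128 and the kAddYJ64 rounding term added before
// the 7-bit shift; note there is no +16 bias here.
static inline unsigned char ARGBPixelToYJSketch(unsigned char b,
                                                unsigned char g,
                                                unsigned char r) {
  return (unsigned char)((15 * b + 75 * g + 38 * r + 64) >> 7);
}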
+
+#ifdef HAS_ARGBTOYROW_AVX2
+// vpermd index vector to restore pixel order after the in-lane
+// vphaddw + vpackuswb.
+static const lvec32 kPermdARGBToY_AVX = {
+ 0, 4, 1, 5, 2, 6, 3, 7
+};
-void ARGBToYJRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
+void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
asm volatile (
- "movdqa %3,%%xmm4 \n"
- "movdqa %4,%%xmm5 \n"
+ "vbroadcastf128 %3,%%ymm4 \n"
+ "vbroadcastf128 %4,%%ymm5 \n"
+ "vmovdqu %5,%%ymm6 \n"
LABELALIGN
"1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm3 \n"
- "lea " MEMLEA(0x40,0) ",%0 \n"
- "phaddw %%xmm1,%%xmm0 \n"
- "phaddw %%xmm3,%%xmm2 \n"
- "paddw %%xmm5,%%xmm0 \n"
- "paddw %%xmm5,%%xmm2 \n"
- "psrlw $0x7,%%xmm0 \n"
- "psrlw $0x7,%%xmm2 \n"
- "packuswb %%xmm2,%%xmm0 \n"
- "sub $0x10,%2 \n"
- "movdqu %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x10,1) ",%1 \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
+ "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
+ "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n"
+ "vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n"
+ "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
+ "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
+ "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
+ "lea " MEMLEA(0x80,0) ",%0 \n"
+ "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" // mutates.
+ "vphaddw %%ymm3,%%ymm2,%%ymm2 \n"
+ "vpsrlw $0x7,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x7,%%ymm2,%%ymm2 \n"
+ "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates.
+ "vpermd %%ymm0,%%ymm6,%%ymm0 \n" // unmutate.
+ "vpaddb %%ymm5,%%ymm0,%%ymm0 \n" // add 16 for Y
+ "vmovdqu %%ymm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "sub $0x20,%2 \n"
"jg 1b \n"
+ "vzeroupper \n"
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
"+r"(pix) // %2
- : "m"(kARGBToYJ), // %3
- "m"(kAddYJ64) // %4
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
+ : "m"(kARGBToY), // %3
+ "m"(kAddY16), // %4
+ "m"(kPermdARGBToY_AVX) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
);
}
-#endif // HAS_ARGBTOYJROW_SSSE3
+#endif // HAS_ARGBTOYROW_AVX2
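
// The AVX2 horizontal ops above work within 128-bit lanes, so after the two
// vphaddw and the vpackuswb the 32 Y bytes land in 4-pixel groups ordered
// 0,2,4,6,1,3,5,7.  vpermd with kPermdARGBToY_AVX moves source dword
// perm[i] into destination dword i, restoring linear order; a dword-level
// sketch:
static inline void VpermdSketch(const unsigned int src[8],
                                unsigned int dst[8]) {
  static const int kPerm[8] = { 0, 4, 1, 5, 2, 6, 3, 7 };
  int i;
  for (i = 0; i < 8; ++i) {
    dst[i] = src[kPerm[i]];
  }
}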
-#ifdef HAS_ARGBTOUVROW_SSSE3
-// TODO(fbarchard): pass xmm constants to single block of assembly.
-// fpic on GCC 4.2 for OSX runs out of GPR registers. "m" effectively takes
-// 3 registers - ebx, ebp and eax. "m" can be passed with 3 normal registers,
-// or 4 if stack frame is disabled. Doing 2 assembly blocks is a work around
-// and considered unsafe.
-void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) {
- asm volatile (
- "movdqa %0,%%xmm4 \n"
- "movdqa %1,%%xmm3 \n"
- "movdqa %2,%%xmm5 \n"
- :
- : "m"(kARGBToU), // %0
- "m"(kARGBToV), // %1
- "m"(kAddUV128) // %2
- );
+#ifdef HAS_ARGBTOYJROW_AVX2
+// Convert 32 ARGB pixels (128 bytes) to 32 YJ values.
+void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
asm volatile (
- "sub %1,%2 \n"
+ "vbroadcastf128 %3,%%ymm4 \n"
+ "vbroadcastf128 %4,%%ymm5 \n"
+ "vmovdqu %5,%%ymm6 \n"
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n"
- BUNDLEALIGN
- MEMOPREG(pavgb,0x00,0,4,1,xmm0) // pavgb (%0,%4,1),%%xmm0
- MEMOPREG(pavgb,0x10,0,4,1,xmm1) // pavgb 0x10(%0,%4,1),%%xmm1
- MEMOPREG(pavgb,0x20,0,4,1,xmm2) // pavgb 0x20(%0,%4,1),%%xmm2
- MEMOPREG(pavgb,0x30,0,4,1,xmm6) // pavgb 0x30(%0,%4,1),%%xmm6
- "lea " MEMLEA(0x40,0) ",%0 \n"
- "movdqa %%xmm0,%%xmm7 \n"
- "shufps $0x88,%%xmm1,%%xmm0 \n"
- "shufps $0xdd,%%xmm1,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqa %%xmm2,%%xmm7 \n"
- "shufps $0x88,%%xmm6,%%xmm2 \n"
- "shufps $0xdd,%%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm2,%%xmm6 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm3,%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm6 \n"
- "phaddw %%xmm2,%%xmm0 \n"
- "phaddw %%xmm6,%%xmm1 \n"
- "psraw $0x8,%%xmm0 \n"
- "psraw $0x8,%%xmm1 \n"
- "packsswb %%xmm1,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "sub $0x10,%3 \n"
- "movlps %%xmm0," MEMACCESS(1) " \n"
- BUNDLEALIGN
- MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
- "lea " MEMLEA(0x8,1) ",%1 \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
+ "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
+ "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n"
+ "vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n"
+ "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
+ "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
+ "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
+ "lea " MEMLEA(0x80,0) ",%0 \n"
+ "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" // mutates.
+ "vphaddw %%ymm3,%%ymm2,%%ymm2 \n"
+ "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" // Add .5 for rounding.
+ "vpaddw %%ymm5,%%ymm2,%%ymm2 \n"
+ "vpsrlw $0x7,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x7,%%ymm2,%%ymm2 \n"
+ "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates.
+ "vpermd %%ymm0,%%ymm6,%%ymm0 \n" // unmutate.
+ "vmovdqu %%ymm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "sub $0x20,%2 \n"
"jg 1b \n"
- : "+r"(src_argb0), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+rm"(width) // %3
- : "r"((intptr_t)(src_stride_argb)) // %4
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
-#endif
+ "vzeroupper \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ : "m"(kARGBToYJ), // %3
+ "m"(kAddYJ64), // %4
+ "m"(kPermdARGBToY_AVX) // %5
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
);
}
+#endif // HAS_ARGBTOYJROW_AVX2
-// TODO(fbarchard): Share code with ARGBToUVRow_SSSE3.
-void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) {
- asm volatile (
- "movdqa %0,%%xmm4 \n"
- "movdqa %1,%%xmm3 \n"
- "movdqa %2,%%xmm5 \n"
- :
- : "m"(kARGBToUJ), // %0
- "m"(kARGBToVJ), // %1
- "m"(kAddUVJ128) // %2
- );
+#ifdef HAS_ARGBTOUVROW_SSSE3
+void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) {
asm volatile (
+ "movdqa %5,%%xmm3 \n"
+ "movdqa %6,%%xmm4 \n"
+ "movdqa %7,%%xmm5 \n"
"sub %1,%2 \n"
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n"
- BUNDLEALIGN
- MEMOPREG(pavgb,0x00,0,4,1,xmm0) // pavgb (%0,%4,1),%%xmm0
- MEMOPREG(pavgb,0x10,0,4,1,xmm1) // pavgb 0x10(%0,%4,1),%%xmm1
- MEMOPREG(pavgb,0x20,0,4,1,xmm2) // pavgb 0x20(%0,%4,1),%%xmm2
- MEMOPREG(pavgb,0x30,0,4,1,xmm6) // pavgb 0x30(%0,%4,1),%%xmm6
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
+ "pavgb %%xmm7,%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
+ MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
+ "pavgb %%xmm7,%%xmm6 \n"
+
"lea " MEMLEA(0x40,0) ",%0 \n"
"movdqa %%xmm0,%%xmm7 \n"
"shufps $0x88,%%xmm1,%%xmm0 \n"
@@ -1005,130 +864,118 @@ void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
"pmaddubsw %%xmm3,%%xmm6 \n"
"phaddw %%xmm2,%%xmm0 \n"
"phaddw %%xmm6,%%xmm1 \n"
- "paddw %%xmm5,%%xmm0 \n"
- "paddw %%xmm5,%%xmm1 \n"
"psraw $0x8,%%xmm0 \n"
"psraw $0x8,%%xmm1 \n"
"packsswb %%xmm1,%%xmm0 \n"
- "sub $0x10,%3 \n"
+ "paddb %%xmm5,%%xmm0 \n"
"movlps %%xmm0," MEMACCESS(1) " \n"
- BUNDLEALIGN
- MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
+ MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
"lea " MEMLEA(0x8,1) ",%1 \n"
+ "sub $0x10,%3 \n"
"jg 1b \n"
: "+r"(src_argb0), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+rm"(width) // %3
- : "r"((intptr_t)(src_stride_argb)) // %4
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
-#endif
+ : "r"((intptr_t)(src_stride_argb)), // %4
+ "m"(kARGBToV), // %5
+ "m"(kARGBToU), // %6
+ "m"(kAddUV128) // %7
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
);
}
+#endif // HAS_ARGBTOUVROW_SSSE3
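
// Scalar sketch of one U/V output pair, assuming libyuv's BT.601 rows
// kARGBToU = {112, -74, -38} and kARGBToV = {-18, -94, 112} for B,G,R
// (scaled by 256).  The loop above first averages each 2x2 pixel block (two
// rows via pavgb against the stride, then horizontal pairs via shufps +
// pavgb) and re-centers the result on 128; saturation is omitted here.
static inline void ARGBBlockToUVSketch(int b, int g, int r,  /* 2x2 means */
                                       unsigned char* u, unsigned char* v) {
  *u = (unsigned char)(((112 * b - 74 * g - 38 * r) >> 8) + 128);
  *v = (unsigned char)(((-18 * b - 94 * g + 112 * r) >> 8) + 128);
}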
-void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) {
- asm volatile (
- "movdqa %0,%%xmm4 \n"
- "movdqa %1,%%xmm3 \n"
- "movdqa %2,%%xmm5 \n"
- :
- : "m"(kARGBToU), // %0
- "m"(kARGBToV), // %1
- "m"(kAddUV128) // %2
- );
+#ifdef HAS_ARGBTOUVROW_AVX2
+// vpshufb mask to restore pixel order after the in-lane vphaddw +
+// vpacksswb, viewed as 16-bit U,V pairs.
+static const lvec8 kShufARGBToUV_AVX = {
+ 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
+ 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
+};
+void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) {
asm volatile (
+ "vbroadcastf128 %5,%%ymm5 \n"
+ "vbroadcastf128 %6,%%ymm6 \n"
+ "vbroadcastf128 %7,%%ymm7 \n"
"sub %1,%2 \n"
LABELALIGN
"1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
- BUNDLEALIGN
- MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
- "pavgb %%xmm7,%%xmm0 \n"
- MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
- "pavgb %%xmm7,%%xmm1 \n"
- MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
- "pavgb %%xmm7,%%xmm2 \n"
- MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
- "pavgb %%xmm7,%%xmm6 \n"
- "lea " MEMLEA(0x40,0) ",%0 \n"
- "movdqa %%xmm0,%%xmm7 \n"
- "shufps $0x88,%%xmm1,%%xmm0 \n"
- "shufps $0xdd,%%xmm1,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqa %%xmm2,%%xmm7 \n"
- "shufps $0x88,%%xmm6,%%xmm2 \n"
- "shufps $0xdd,%%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm2,%%xmm6 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm3,%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm6 \n"
- "phaddw %%xmm2,%%xmm0 \n"
- "phaddw %%xmm6,%%xmm1 \n"
- "psraw $0x8,%%xmm0 \n"
- "psraw $0x8,%%xmm1 \n"
- "packsswb %%xmm1,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "sub $0x10,%3 \n"
- "movlps %%xmm0," MEMACCESS(1) " \n"
- BUNDLEALIGN
- MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
- "lea " MEMLEA(0x8,1) ",%1 \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
+ "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
+ "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n"
+ "vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n"
+ VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0
+ VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
+ VMEMOPREG(vpavgb,0x40,0,4,1,ymm2,ymm2)
+ VMEMOPREG(vpavgb,0x60,0,4,1,ymm3,ymm3)
+ "lea " MEMLEA(0x80,0) ",%0 \n"
+ "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n"
+ "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n"
+ "vpavgb %%ymm4,%%ymm0,%%ymm0 \n"
+ "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n"
+ "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"
+ "vpavgb %%ymm4,%%ymm2,%%ymm2 \n"
+
+ "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n"
+ "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n"
+ "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n"
+ "vphaddw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vphaddw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpsraw $0x8,%%ymm1,%%ymm1 \n"
+ "vpsraw $0x8,%%ymm0,%%ymm0 \n"
+ "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpshufb %8,%%ymm0,%%ymm0 \n"
+ "vpaddb %%ymm5,%%ymm0,%%ymm0 \n"
+
+ "vextractf128 $0x0,%%ymm0," MEMACCESS(1) " \n"
+ VEXTOPMEM(vextractf128,1,ymm0,0x0,1,2,1) // vextractf128 $1,%%ymm0,(%1,%2,1)
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x20,%3 \n"
"jg 1b \n"
+ "vzeroupper \n"
: "+r"(src_argb0), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+rm"(width) // %3
- : "r"((intptr_t)(src_stride_argb)) // %4
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
-#endif
+ : "r"((intptr_t)(src_stride_argb)), // %4
+ "m"(kAddUV128), // %5
+ "m"(kARGBToV), // %6
+ "m"(kARGBToU), // %7
+ "m"(kShufARGBToUV_AVX) // %8
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
);
}
+#endif // HAS_ARGBTOUVROW_AVX2
-void ARGBToUVJRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) {
- asm volatile (
- "movdqa %0,%%xmm4 \n"
- "movdqa %1,%%xmm3 \n"
- "movdqa %2,%%xmm5 \n"
- :
- : "m"(kARGBToUJ), // %0
- "m"(kARGBToVJ), // %1
- "m"(kAddUVJ128) // %2
- );
+#ifdef HAS_ARGBTOUVJROW_SSSE3
+// TODO(fbarchard): Share code with ARGBToUVRow_SSSE3.
+void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) {
asm volatile (
+ "movdqa %5,%%xmm3 \n"
+ "movdqa %6,%%xmm4 \n"
+ "movdqa %7,%%xmm5 \n"
"sub %1,%2 \n"
LABELALIGN
"1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
- BUNDLEALIGN
- MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
+ MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
"pavgb %%xmm7,%%xmm0 \n"
- MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
"pavgb %%xmm7,%%xmm1 \n"
- MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
"pavgb %%xmm7,%%xmm2 \n"
- MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
+ "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
+ MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
"pavgb %%xmm7,%%xmm6 \n"
+
"lea " MEMLEA(0x40,0) ",%0 \n"
"movdqa %%xmm0,%%xmm7 \n"
"shufps $0x88,%%xmm1,%%xmm0 \n"
@@ -1151,104 +998,32 @@ void ARGBToUVJRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
"psraw $0x8,%%xmm0 \n"
"psraw $0x8,%%xmm1 \n"
"packsswb %%xmm1,%%xmm0 \n"
- "sub $0x10,%3 \n"
"movlps %%xmm0," MEMACCESS(1) " \n"
- BUNDLEALIGN
MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
"lea " MEMLEA(0x8,1) ",%1 \n"
+ "sub $0x10,%3 \n"
"jg 1b \n"
: "+r"(src_argb0), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+rm"(width) // %3
- : "r"((intptr_t)(src_stride_argb))
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
-#endif
+ : "r"((intptr_t)(src_stride_argb)), // %4
+ "m"(kARGBToVJ), // %5
+ "m"(kARGBToUJ), // %6
+ "m"(kAddUVJ128) // %7
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
);
}
+#endif // HAS_ARGBTOUVJROW_SSSE3
+#ifdef HAS_ARGBTOUV444ROW_SSSE3
void ARGBToUV444Row_SSSE3(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
int width) {
asm volatile (
- "movdqa %0,%%xmm4 \n"
- "movdqa %1,%%xmm3 \n"
- "movdqa %2,%%xmm5 \n"
- :
- : "m"(kARGBToU), // %0
- "m"(kARGBToV), // %1
- "m"(kAddUV128) // %2
- );
- asm volatile (
- "sub %1,%2 \n"
- LABELALIGN
- "1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm6 \n"
- "phaddw %%xmm1,%%xmm0 \n"
- "phaddw %%xmm6,%%xmm2 \n"
- "psraw $0x8,%%xmm0 \n"
- "psraw $0x8,%%xmm2 \n"
- "packsswb %%xmm2,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "sub $0x10,%3 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n"
- "pmaddubsw %%xmm3,%%xmm0 \n"
- "pmaddubsw %%xmm3,%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm2 \n"
- "pmaddubsw %%xmm3,%%xmm6 \n"
- "phaddw %%xmm1,%%xmm0 \n"
- "phaddw %%xmm6,%%xmm2 \n"
- "psraw $0x8,%%xmm0 \n"
- "psraw $0x8,%%xmm2 \n"
- "packsswb %%xmm2,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "lea " MEMLEA(0x40,0) ",%0 \n"
- BUNDLEALIGN
- MEMOPMEM(movdqa,xmm0,0x00,1,2,1) // movdqa %%xmm0,(%1,%2,1)
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "jg 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+rm"(width) // %3
- :
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm6"
-#endif
- );
-}
-
-void ARGBToUV444Row_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_u,
- uint8* dst_v, int width) {
- asm volatile (
- "movdqa %0,%%xmm4 \n"
- "movdqa %1,%%xmm3 \n"
- "movdqa %2,%%xmm5 \n"
- :
- : "m"(kARGBToU), // %0
- "m"(kARGBToV), // %1
- "m"(kAddUV128) // %2
- );
- asm volatile (
+ "movdqa %4,%%xmm3 \n"
+ "movdqa %5,%%xmm4 \n"
+ "movdqa %6,%%xmm5 \n"
"sub %1,%2 \n"
LABELALIGN
"1: \n"
@@ -1266,7 +1041,6 @@ void ARGBToUV444Row_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_u,
"psraw $0x8,%%xmm2 \n"
"packsswb %%xmm2,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n"
- "sub $0x10,%3 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
@@ -1283,98 +1057,30 @@ void ARGBToUV444Row_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_u,
"packsswb %%xmm2,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n"
"lea " MEMLEA(0x40,0) ",%0 \n"
- BUNDLEALIGN
MEMOPMEM(movdqu,xmm0,0x00,1,2,1) // movdqu %%xmm0,(%1,%2,1)
"lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%3 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+rm"(width) // %3
- :
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm6"
-#endif
+ : "m"(kARGBToV), // %4
+ "m"(kARGBToU), // %5
+ "m"(kAddUV128) // %6
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm6"
);
}
+#endif // HAS_ARGBTOUV444ROW_SSSE3
+#ifdef HAS_ARGBTOUV422ROW_SSSE3
void ARGBToUV422Row_SSSE3(const uint8* src_argb0,
uint8* dst_u, uint8* dst_v, int width) {
asm volatile (
- "movdqa %0,%%xmm4 \n"
- "movdqa %1,%%xmm3 \n"
- "movdqa %2,%%xmm5 \n"
- :
- : "m"(kARGBToU), // %0
- "m"(kARGBToV), // %1
- "m"(kAddUV128) // %2
- );
- asm volatile (
- "sub %1,%2 \n"
- LABELALIGN
- "1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n"
- "lea " MEMLEA(0x40,0) ",%0 \n"
- "movdqa %%xmm0,%%xmm7 \n"
- "shufps $0x88,%%xmm1,%%xmm0 \n"
- "shufps $0xdd,%%xmm1,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqa %%xmm2,%%xmm7 \n"
- "shufps $0x88,%%xmm6,%%xmm2 \n"
- "shufps $0xdd,%%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm2,%%xmm6 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm3,%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm6 \n"
- "phaddw %%xmm2,%%xmm0 \n"
- "phaddw %%xmm6,%%xmm1 \n"
- "psraw $0x8,%%xmm0 \n"
- "psraw $0x8,%%xmm1 \n"
- "packsswb %%xmm1,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "sub $0x10,%3 \n"
- "movlps %%xmm0," MEMACCESS(1) " \n"
- BUNDLEALIGN
- MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
- "lea " MEMLEA(0x8,1) ",%1 \n"
- "jg 1b \n"
- : "+r"(src_argb0), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+rm"(width) // %3
- :
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
-#endif
- );
-}
-
-void ARGBToUV422Row_Unaligned_SSSE3(const uint8* src_argb0,
- uint8* dst_u, uint8* dst_v, int width) {
- asm volatile (
- "movdqa %0,%%xmm4 \n"
- "movdqa %1,%%xmm3 \n"
- "movdqa %2,%%xmm5 \n"
- :
- : "m"(kARGBToU), // %0
- "m"(kARGBToV), // %1
- "m"(kAddUV128) // %2
- );
- asm volatile (
+ "movdqa %4,%%xmm3 \n"
+ "movdqa %5,%%xmm4 \n"
+ "movdqa %6,%%xmm5 \n"
"sub %1,%2 \n"
LABELALIGN
"1: \n"
@@ -1403,26 +1109,23 @@ void ARGBToUV422Row_Unaligned_SSSE3(const uint8* src_argb0,
"psraw $0x8,%%xmm1 \n"
"packsswb %%xmm1,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n"
- "sub $0x10,%3 \n"
"movlps %%xmm0," MEMACCESS(1) " \n"
- BUNDLEALIGN
MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
"lea " MEMLEA(0x8,1) ",%1 \n"
+ "sub $0x10,%3 \n"
"jg 1b \n"
: "+r"(src_argb0), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+rm"(width) // %3
- :
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
-#endif
+ : "m"(kARGBToV), // %4
+ "m"(kARGBToU), // %5
+ "m"(kAddUV128) // %6
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
);
}
+#endif // HAS_ARGBTOUV422ROW_SSSE3
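
// ARGBToUV422 above differs from ARGBToUVRow only in the sampling: chroma is
// averaged over horizontal pixel pairs (shufps + pavgb) with no second-row
// pavgb, since 4:2:2 keeps full vertical chroma resolution.  pavgb's
// round-half-up average, for reference:
static inline int PavgbSketch(int a, int b) {
  return (a + b + 1) >> 1;
}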
void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
asm volatile (
@@ -1430,43 +1133,6 @@ void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
"movdqa %3,%%xmm4 \n"
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm3 \n"
- "lea " MEMLEA(0x40,0) ",%0 \n"
- "phaddw %%xmm1,%%xmm0 \n"
- "phaddw %%xmm3,%%xmm2 \n"
- "psrlw $0x7,%%xmm0 \n"
- "psrlw $0x7,%%xmm2 \n"
- "packuswb %%xmm2,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "sub $0x10,%2 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "jg 1b \n"
- : "+r"(src_bgra), // %0
- "+r"(dst_y), // %1
- "+r"(pix) // %2
- : "m"(kBGRAToY), // %3
- "m"(kAddY16) // %4
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
-
-void BGRAToYRow_Unaligned_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
- asm volatile (
- "movdqa %4,%%xmm5 \n"
- "movdqa %3,%%xmm4 \n"
- LABELALIGN
- "1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
@@ -1482,116 +1148,41 @@ void BGRAToYRow_Unaligned_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
"psrlw $0x7,%%xmm2 \n"
"packuswb %%xmm2,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n"
- "sub $0x10,%2 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src_bgra), // %0
"+r"(dst_y), // %1
"+r"(pix) // %2
: "m"(kBGRAToY), // %3
"m"(kAddY16) // %4
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
);
}
void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
uint8* dst_u, uint8* dst_v, int width) {
asm volatile (
- "movdqa %0,%%xmm4 \n"
- "movdqa %1,%%xmm3 \n"
- "movdqa %2,%%xmm5 \n"
- :
- : "m"(kBGRAToU), // %0
- "m"(kBGRAToV), // %1
- "m"(kAddUV128) // %2
- );
- asm volatile (
- "sub %1,%2 \n"
- LABELALIGN
- "1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n"
- BUNDLEALIGN
- MEMOPREG(pavgb,0x00,0,4,1,xmm0) // pavgb (%0,%4,1),%%xmm0
- MEMOPREG(pavgb,0x10,0,4,1,xmm1) // pavgb 0x10(%0,%4,1),%%xmm1
- MEMOPREG(pavgb,0x20,0,4,1,xmm2) // pavgb 0x20(%0,%4,1),%%xmm2
- MEMOPREG(pavgb,0x30,0,4,1,xmm6) // pavgb 0x30(%0,%4,1),%%xmm6
- "lea " MEMLEA(0x40,0) ",%0 \n"
- "movdqa %%xmm0,%%xmm7 \n"
- "shufps $0x88,%%xmm1,%%xmm0 \n"
- "shufps $0xdd,%%xmm1,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqa %%xmm2,%%xmm7 \n"
- "shufps $0x88,%%xmm6,%%xmm2 \n"
- "shufps $0xdd,%%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm2,%%xmm6 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm3,%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm6 \n"
- "phaddw %%xmm2,%%xmm0 \n"
- "phaddw %%xmm6,%%xmm1 \n"
- "psraw $0x8,%%xmm0 \n"
- "psraw $0x8,%%xmm1 \n"
- "packsswb %%xmm1,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "sub $0x10,%3 \n"
- "movlps %%xmm0," MEMACCESS(1) " \n"
- BUNDLEALIGN
- MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
- "lea " MEMLEA(0x8,1) ",%1 \n"
- "jg 1b \n"
- : "+r"(src_bgra0), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+rm"(width) // %3
- : "r"((intptr_t)(src_stride_bgra)) // %4
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
-#endif
- );
-}
-
-void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
- uint8* dst_u, uint8* dst_v, int width) {
- asm volatile (
- "movdqa %0,%%xmm4 \n"
- "movdqa %1,%%xmm3 \n"
- "movdqa %2,%%xmm5 \n"
- :
- : "m"(kBGRAToU), // %0
- "m"(kBGRAToV), // %1
- "m"(kAddUV128) // %2
- );
- asm volatile (
+ "movdqa %5,%%xmm3 \n"
+ "movdqa %6,%%xmm4 \n"
+ "movdqa %7,%%xmm5 \n"
"sub %1,%2 \n"
LABELALIGN
"1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
- BUNDLEALIGN
- MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
+ MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
"pavgb %%xmm7,%%xmm0 \n"
- MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
"pavgb %%xmm7,%%xmm1 \n"
- MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
"pavgb %%xmm7,%%xmm2 \n"
- MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
+ "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
+ MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
"pavgb %%xmm7,%%xmm6 \n"
+
"lea " MEMLEA(0x40,0) ",%0 \n"
"movdqa %%xmm0,%%xmm7 \n"
"shufps $0x88,%%xmm1,%%xmm0 \n"
@@ -1613,24 +1204,21 @@ void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
"psraw $0x8,%%xmm1 \n"
"packsswb %%xmm1,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n"
- "sub $0x10,%3 \n"
"movlps %%xmm0," MEMACCESS(1) " \n"
- BUNDLEALIGN
MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
"lea " MEMLEA(0x8,1) ",%1 \n"
+ "sub $0x10,%3 \n"
"jg 1b \n"
: "+r"(src_bgra0), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+rm"(width) // %3
- : "r"((intptr_t)(src_stride_bgra)) // %4
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
-#endif
+ : "r"((intptr_t)(src_stride_bgra)), // %4
+ "m"(kBGRAToV), // %5
+ "m"(kBGRAToU), // %6
+ "m"(kAddUV128) // %7
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
);
}
@@ -1640,43 +1228,6 @@ void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
"movdqa %3,%%xmm4 \n"
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm3 \n"
- "lea " MEMLEA(0x40,0) ",%0 \n"
- "phaddw %%xmm1,%%xmm0 \n"
- "phaddw %%xmm3,%%xmm2 \n"
- "psrlw $0x7,%%xmm0 \n"
- "psrlw $0x7,%%xmm2 \n"
- "packuswb %%xmm2,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "sub $0x10,%2 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "jg 1b \n"
- : "+r"(src_abgr), // %0
- "+r"(dst_y), // %1
- "+r"(pix) // %2
- : "m"(kABGRToY), // %3
- "m"(kAddY16) // %4
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
-
-void ABGRToYRow_Unaligned_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
- asm volatile (
- "movdqa %4,%%xmm5 \n"
- "movdqa %3,%%xmm4 \n"
- LABELALIGN
- "1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
@@ -1692,19 +1243,16 @@ void ABGRToYRow_Unaligned_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
"psrlw $0x7,%%xmm2 \n"
"packuswb %%xmm2,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n"
- "sub $0x10,%2 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src_abgr), // %0
"+r"(dst_y), // %1
"+r"(pix) // %2
: "m"(kABGRToY), // %3
"m"(kAddY16) // %4
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
);
}
@@ -1714,43 +1262,6 @@ void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) {
"movdqa %3,%%xmm4 \n"
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm4,%%xmm3 \n"
- "lea " MEMLEA(0x40,0) ",%0 \n"
- "phaddw %%xmm1,%%xmm0 \n"
- "phaddw %%xmm3,%%xmm2 \n"
- "psrlw $0x7,%%xmm0 \n"
- "psrlw $0x7,%%xmm2 \n"
- "packuswb %%xmm2,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "sub $0x10,%2 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "jg 1b \n"
- : "+r"(src_rgba), // %0
- "+r"(dst_y), // %1
- "+r"(pix) // %2
- : "m"(kRGBAToY), // %3
- "m"(kAddY16) // %4
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
-
-void RGBAToYRow_Unaligned_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) {
- asm volatile (
- "movdqa %4,%%xmm5 \n"
- "movdqa %3,%%xmm4 \n"
- LABELALIGN
- "1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
@@ -1766,116 +1277,41 @@ void RGBAToYRow_Unaligned_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) {
"psrlw $0x7,%%xmm2 \n"
"packuswb %%xmm2,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n"
- "sub $0x10,%2 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src_rgba), // %0
"+r"(dst_y), // %1
"+r"(pix) // %2
: "m"(kRGBAToY), // %3
"m"(kAddY16) // %4
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
);
}
void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
uint8* dst_u, uint8* dst_v, int width) {
asm volatile (
- "movdqa %0,%%xmm4 \n"
- "movdqa %1,%%xmm3 \n"
- "movdqa %2,%%xmm5 \n"
- :
- : "m"(kABGRToU), // %0
- "m"(kABGRToV), // %1
- "m"(kAddUV128) // %2
- );
- asm volatile (
- "sub %1,%2 \n"
- LABELALIGN
- "1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n"
- BUNDLEALIGN
- MEMOPREG(pavgb,0x00,0,4,1,xmm0) // pavgb (%0,%4,1),%%xmm0
- MEMOPREG(pavgb,0x10,0,4,1,xmm1) // pavgb 0x10(%0,%4,1),%%xmm1
- MEMOPREG(pavgb,0x20,0,4,1,xmm2) // pavgb 0x20(%0,%4,1),%%xmm2
- MEMOPREG(pavgb,0x30,0,4,1,xmm6) // pavgb 0x30(%0,%4,1),%%xmm6
- "lea " MEMLEA(0x40,0) ",%0 \n"
- "movdqa %%xmm0,%%xmm7 \n"
- "shufps $0x88,%%xmm1,%%xmm0 \n"
- "shufps $0xdd,%%xmm1,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqa %%xmm2,%%xmm7 \n"
- "shufps $0x88,%%xmm6,%%xmm2 \n"
- "shufps $0xdd,%%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm2,%%xmm6 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm3,%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm6 \n"
- "phaddw %%xmm2,%%xmm0 \n"
- "phaddw %%xmm6,%%xmm1 \n"
- "psraw $0x8,%%xmm0 \n"
- "psraw $0x8,%%xmm1 \n"
- "packsswb %%xmm1,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "sub $0x10,%3 \n"
- "movlps %%xmm0," MEMACCESS(1) " \n"
- BUNDLEALIGN
- MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
- "lea " MEMLEA(0x8,1) ",%1 \n"
- "jg 1b \n"
- : "+r"(src_abgr0), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+rm"(width) // %3
- : "r"((intptr_t)(src_stride_abgr)) // %4
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
-#endif
- );
-}
-
-void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
- uint8* dst_u, uint8* dst_v, int width) {
- asm volatile (
- "movdqa %0,%%xmm4 \n"
- "movdqa %1,%%xmm3 \n"
- "movdqa %2,%%xmm5 \n"
- :
- : "m"(kABGRToU), // %0
- "m"(kABGRToV), // %1
- "m"(kAddUV128) // %2
- );
- asm volatile (
+ "movdqa %5,%%xmm3 \n"
+ "movdqa %6,%%xmm4 \n"
+ "movdqa %7,%%xmm5 \n"
"sub %1,%2 \n"
LABELALIGN
"1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
- BUNDLEALIGN
- MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
+ MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
"pavgb %%xmm7,%%xmm0 \n"
- MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
"pavgb %%xmm7,%%xmm1 \n"
- MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
"pavgb %%xmm7,%%xmm2 \n"
- MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
+ "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
+ MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
"pavgb %%xmm7,%%xmm6 \n"
+
"lea " MEMLEA(0x40,0) ",%0 \n"
"movdqa %%xmm0,%%xmm7 \n"
"shufps $0x88,%%xmm1,%%xmm0 \n"
@@ -1897,121 +1333,46 @@ void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
"psraw $0x8,%%xmm1 \n"
"packsswb %%xmm1,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n"
- "sub $0x10,%3 \n"
"movlps %%xmm0," MEMACCESS(1) " \n"
- BUNDLEALIGN
MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
"lea " MEMLEA(0x8,1) ",%1 \n"
+ "sub $0x10,%3 \n"
"jg 1b \n"
: "+r"(src_abgr0), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+rm"(width) // %3
- : "r"((intptr_t)(src_stride_abgr)) // %4
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
-#endif
+ : "r"((intptr_t)(src_stride_abgr)), // %4
+ "m"(kABGRToV), // %5
+ "m"(kABGRToU), // %6
+ "m"(kAddUV128) // %7
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
);
}
void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
uint8* dst_u, uint8* dst_v, int width) {
asm volatile (
- "movdqa %0,%%xmm4 \n"
- "movdqa %1,%%xmm3 \n"
- "movdqa %2,%%xmm5 \n"
- :
- : "m"(kRGBAToU), // %0
- "m"(kRGBAToV), // %1
- "m"(kAddUV128) // %2
- );
- asm volatile (
- "sub %1,%2 \n"
- LABELALIGN
- "1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n"
- BUNDLEALIGN
- MEMOPREG(pavgb,0x00,0,4,1,xmm0) // pavgb (%0,%4,1),%%xmm0
- MEMOPREG(pavgb,0x10,0,4,1,xmm1) // pavgb 0x10(%0,%4,1),%%xmm1
- MEMOPREG(pavgb,0x20,0,4,1,xmm2) // pavgb 0x20(%0,%4,1),%%xmm2
- MEMOPREG(pavgb,0x30,0,4,1,xmm6) // pavgb 0x30(%0,%4,1),%%xmm6
- "lea " MEMLEA(0x40,0) ",%0 \n"
- "movdqa %%xmm0,%%xmm7 \n"
- "shufps $0x88,%%xmm1,%%xmm0 \n"
- "shufps $0xdd,%%xmm1,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm0 \n"
- "movdqa %%xmm2,%%xmm7 \n"
- "shufps $0x88,%%xmm6,%%xmm2 \n"
- "shufps $0xdd,%%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm2,%%xmm6 \n"
- "pmaddubsw %%xmm4,%%xmm0 \n"
- "pmaddubsw %%xmm4,%%xmm2 \n"
- "pmaddubsw %%xmm3,%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm6 \n"
- "phaddw %%xmm2,%%xmm0 \n"
- "phaddw %%xmm6,%%xmm1 \n"
- "psraw $0x8,%%xmm0 \n"
- "psraw $0x8,%%xmm1 \n"
- "packsswb %%xmm1,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
- "sub $0x10,%3 \n"
- "movlps %%xmm0," MEMACCESS(1) " \n"
- BUNDLEALIGN
- MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
- "lea " MEMLEA(0x8,1) ",%1 \n"
- "jg 1b \n"
- : "+r"(src_rgba0), // %0
- "+r"(dst_u), // %1
- "+r"(dst_v), // %2
- "+rm"(width) // %3
- : "r"((intptr_t)(src_stride_rgba))
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
-#endif
- );
-}
-
-void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
- uint8* dst_u, uint8* dst_v, int width) {
- asm volatile (
- "movdqa %0,%%xmm4 \n"
- "movdqa %1,%%xmm3 \n"
- "movdqa %2,%%xmm5 \n"
- :
- : "m"(kRGBAToU), // %0
- "m"(kRGBAToV), // %1
- "m"(kAddUV128) // %2
- );
- asm volatile (
+ "movdqa %5,%%xmm3 \n"
+ "movdqa %6,%%xmm4 \n"
+ "movdqa %7,%%xmm5 \n"
"sub %1,%2 \n"
LABELALIGN
"1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
- BUNDLEALIGN
- MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
+ MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
"pavgb %%xmm7,%%xmm0 \n"
- MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
"pavgb %%xmm7,%%xmm1 \n"
- MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
"pavgb %%xmm7,%%xmm2 \n"
- MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
+ "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
+ MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
"pavgb %%xmm7,%%xmm6 \n"
+
"lea " MEMLEA(0x40,0) ",%0 \n"
"movdqa %%xmm0,%%xmm7 \n"
"shufps $0x88,%%xmm1,%%xmm0 \n"
@@ -2033,75 +1394,83 @@ void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
"psraw $0x8,%%xmm1 \n"
"packsswb %%xmm1,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n"
- "sub $0x10,%3 \n"
"movlps %%xmm0," MEMACCESS(1) " \n"
- BUNDLEALIGN
MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
"lea " MEMLEA(0x8,1) ",%1 \n"
+ "sub $0x10,%3 \n"
"jg 1b \n"
: "+r"(src_rgba0), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+rm"(width) // %3
- : "r"((intptr_t)(src_stride_rgba)) // %4
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
-#endif
+ : "r"((intptr_t)(src_stride_rgba)), // %4
+ "m"(kRGBAToV), // %5
+ "m"(kRGBAToU), // %6
+ "m"(kAddUV128) // %7
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
);
}
-#endif // HAS_ARGBTOUVROW_SSSE3
-#ifdef HAS_I422TOARGBROW_SSSE3
-#define UB 127 /* min(63,(int8)(2.018 * 64)) */
-#define UG -25 /* (int8)(-0.391 * 64 - 0.5) */
-#define UR 0
-
-#define VB 0
-#define VG -52 /* (int8)(-0.813 * 64 - 0.5) */
-#define VR 102 /* (int8)(1.596 * 64 + 0.5) */
-
-// Bias
-#define BB UB * 128 + VB * 128
-#define BG UG * 128 + VG * 128
-#define BR UR * 128 + VR * 128
-
-#define YG 74 /* (int8)(1.164 * 64 + 0.5) */
-
-struct {
- vec8 kUVToB; // 0
- vec8 kUVToG; // 16
- vec8 kUVToR; // 32
- vec16 kUVBiasB; // 48
- vec16 kUVBiasG; // 64
- vec16 kUVBiasR; // 80
- vec16 kYSub16; // 96
- vec16 kYToRgb; // 112
- vec8 kVUToB; // 128
- vec8 kVUToG; // 144
- vec8 kVUToR; // 160
-} static SIMD_ALIGNED(kYuvConstants) = {
- { UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB },
- { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
- { UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR },
- { BB, BB, BB, BB, BB, BB, BB, BB },
- { BG, BG, BG, BG, BG, BG, BG, BG },
- { BR, BR, BR, BR, BR, BR, BR, BR },
- { 16, 16, 16, 16, 16, 16, 16, 16 },
- { YG, YG, YG, YG, YG, YG, YG, YG },
- { VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB },
- { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
- { VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR }
+#if defined(HAS_I422TOARGBROW_SSSE3) || defined(HAS_I422TOARGBROW_AVX2)
+
+// YUV to RGB conversion constants.
+// Y contribution to R,G,B. Scale and bias.
+#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
+#define YGB 1160 /* 1.164 * 64 * 16 - adjusted for even error distribution */
+
+// U and V contributions to R,G,B.
+#define UB -128 /* -min(128, round(2.018 * 64)) */
+#define UG 25 /* -round(-0.391 * 64) */
+#define VG 52 /* -round(-0.813 * 64) */
+#define VR -102 /* -round(1.596 * 64) */
+
+// Bias values to subtract 16 from Y and 128 from U and V.
+#define BB (UB * 128 - YGB)
+#define BG (UG * 128 + VG * 128 - YGB)
+#define BR (VR * 128 - YGB)
+
+struct YuvConstants {
+ lvec8 kUVToB; // 0
+ lvec8 kUVToG; // 32
+ lvec8 kUVToR; // 64
+ lvec16 kUVBiasB; // 96
+ lvec16 kUVBiasG; // 128
+ lvec16 kUVBiasR; // 160
+ lvec16 kYToRgb; // 192
+};
+
+// BT601 constants for YUV to RGB.
+static YuvConstants SIMD_ALIGNED(kYuvConstants) = {
+ { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
+ UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 },
+ { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
+ UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
+ { 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,
+ 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR },
+ { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
+ { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
+ { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
+ { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
};
+// BT601 constants for NV21 where chroma plane is VU instead of UV.
+static YuvConstants SIMD_ALIGNED(kYvuConstants) = {
+ { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,
+ 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB },
+ { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
+ VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
+ { VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
+ VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0 },
+ { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
+ { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
+ { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
+ { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
+};
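
// Scalar sketch of the fixed-point math in YUVTORGB below, written with the
// macro constants defined above.  Y is widened by pairing each byte with
// itself (y * 0x0101) and scaled with an unsigned high-half multiply
// (pmulhuw by YG); psraw/packuswb clamp the result, modeled here by a
// hypothetical Clamp0To255() helper.
static inline unsigned char Clamp0To255(int v) {
  return (unsigned char)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

static inline void YuvPixelSketch(unsigned char y, unsigned char u,
                                  unsigned char v, unsigned char* b,
                                  unsigned char* g, unsigned char* r) {
  int y1 = (int)(((unsigned int)(y) * 0x0101 * YG) >> 16);
  *b = Clamp0To255((BB - (u * UB) + y1) >> 6);
  *g = Clamp0To255((BG - (u * UG + v * VG) + y1) >> 6);
  *r = Clamp0To255((BR - (v * VR) + y1) >> 6);
}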
// Read 8 UV from 444
#define READYUV444 \
"movq " MEMACCESS([u_buf]) ",%%xmm0 \n" \
- BUNDLEALIGN \
MEMOPREG(movq, 0x00, [u_buf], [v_buf], 1, xmm1) \
"lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \
"punpcklbw %%xmm1,%%xmm0 \n"
@@ -2109,7 +1478,6 @@ struct {
// Read 4 UV from 422, upsample to 8 UV
#define READYUV422 \
"movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \
- BUNDLEALIGN \
MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \
"lea " MEMLEA(0x4, [u_buf]) ",%[u_buf] \n" \
"punpcklbw %%xmm1,%%xmm0 \n" \
@@ -2118,7 +1486,6 @@ struct {
// Read 2 UV from 411, upsample to 8 UV
#define READYUV411 \
"movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \
- BUNDLEALIGN \
MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \
"lea " MEMLEA(0x2, [u_buf]) ",%[u_buf] \n" \
"punpcklbw %%xmm1,%%xmm0 \n" \
@@ -2132,20 +1499,23 @@ struct {
"punpcklwd %%xmm0,%%xmm0 \n"
// Convert 8 pixels: 8 UV and 8 Y
-#define YUVTORGB \
+#define YUVTORGB(YuvConstants) \
"movdqa %%xmm0,%%xmm1 \n" \
"movdqa %%xmm0,%%xmm2 \n" \
- "pmaddubsw " MEMACCESS([kYuvConstants]) ",%%xmm0 \n" \
- "pmaddubsw " MEMACCESS2(16, [kYuvConstants]) ",%%xmm1 \n" \
- "pmaddubsw " MEMACCESS2(32, [kYuvConstants]) ",%%xmm2 \n" \
- "psubw " MEMACCESS2(48, [kYuvConstants]) ",%%xmm0 \n" \
- "psubw " MEMACCESS2(64, [kYuvConstants]) ",%%xmm1 \n" \
- "psubw " MEMACCESS2(80, [kYuvConstants]) ",%%xmm2 \n" \
+ "movdqa %%xmm0,%%xmm3 \n" \
+ "movdqa " MEMACCESS2(96, [YuvConstants]) ",%%xmm0 \n" \
+ "pmaddubsw " MEMACCESS([YuvConstants]) ",%%xmm1 \n" \
+ "psubw %%xmm1,%%xmm0 \n" \
+ "movdqa " MEMACCESS2(128, [YuvConstants]) ",%%xmm1 \n" \
+ "pmaddubsw " MEMACCESS2(32, [YuvConstants]) ",%%xmm2 \n" \
+ "psubw %%xmm2,%%xmm1 \n" \
+ "movdqa " MEMACCESS2(160, [YuvConstants]) ",%%xmm2 \n" \
+ "pmaddubsw " MEMACCESS2(64, [YuvConstants]) ",%%xmm3 \n" \
+ "psubw %%xmm3,%%xmm2 \n" \
"movq " MEMACCESS([y_buf]) ",%%xmm3 \n" \
"lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" \
- "punpcklbw %%xmm4,%%xmm3 \n" \
- "psubsw " MEMACCESS2(96, [kYuvConstants]) ",%%xmm3 \n" \
- "pmullw " MEMACCESS2(112, [kYuvConstants]) ",%%xmm3 \n" \
+ "punpcklbw %%xmm3,%%xmm3 \n" \
+ "pmulhuw " MEMACCESS2(192, [YuvConstants]) ",%%xmm3 \n" \
"paddsw %%xmm3,%%xmm0 \n" \
"paddsw %%xmm3,%%xmm1 \n" \
"paddsw %%xmm3,%%xmm2 \n" \
@@ -2156,30 +1526,51 @@ struct {
"packuswb %%xmm1,%%xmm1 \n" \
"packuswb %%xmm2,%%xmm2 \n"
-// Convert 8 pixels: 8 VU and 8 Y
-#define YVUTORGB \
- "movdqa %%xmm0,%%xmm1 \n" \
- "movdqa %%xmm0,%%xmm2 \n" \
- "pmaddubsw " MEMACCESS2(128, [kYuvConstants]) ",%%xmm0 \n" \
- "pmaddubsw " MEMACCESS2(144, [kYuvConstants]) ",%%xmm1 \n" \
- "pmaddubsw " MEMACCESS2(160, [kYuvConstants]) ",%%xmm2 \n" \
- "psubw " MEMACCESS2(48, [kYuvConstants]) ",%%xmm0 \n" \
- "psubw " MEMACCESS2(64, [kYuvConstants]) ",%%xmm1 \n" \
- "psubw " MEMACCESS2(80, [kYuvConstants]) ",%%xmm2 \n" \
- "movq " MEMACCESS([y_buf]) ",%%xmm3 \n" \
- "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" \
- "punpcklbw %%xmm4,%%xmm3 \n" \
- "psubsw " MEMACCESS2(96, [kYuvConstants]) ",%%xmm3 \n" \
- "pmullw " MEMACCESS2(112, [kYuvConstants]) ",%%xmm3 \n" \
- "paddsw %%xmm3,%%xmm0 \n" \
- "paddsw %%xmm3,%%xmm1 \n" \
- "paddsw %%xmm3,%%xmm2 \n" \
- "psraw $0x6,%%xmm0 \n" \
- "psraw $0x6,%%xmm1 \n" \
- "psraw $0x6,%%xmm2 \n" \
- "packuswb %%xmm0,%%xmm0 \n" \
- "packuswb %%xmm1,%%xmm1 \n" \
- "packuswb %%xmm2,%%xmm2 \n"
+// Store 8 ARGB values. Assumes XMM5 holds 0xff bytes (opaque alpha), set by
+// the caller.
+#define STOREARGB \
+ "punpcklbw %%xmm1,%%xmm0 \n" \
+ "punpcklbw %%xmm5,%%xmm2 \n" \
+ "movdqa %%xmm0,%%xmm1 \n" \
+ "punpcklwd %%xmm2,%%xmm0 \n" \
+ "punpckhwd %%xmm2,%%xmm1 \n" \
+ "movdqu %%xmm0," MEMACCESS([dst_argb]) " \n" \
+ "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) " \n" \
+ "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
+
+// Store 8 BGRA values. Sets XMM5 to 0xff bytes for the alpha channel.
+#define STOREBGRA \
+ "pcmpeqb %%xmm5,%%xmm5 \n" \
+ "punpcklbw %%xmm0,%%xmm1 \n" \
+ "punpcklbw %%xmm2,%%xmm5 \n" \
+ "movdqa %%xmm5,%%xmm0 \n" \
+ "punpcklwd %%xmm1,%%xmm5 \n" \
+ "punpckhwd %%xmm1,%%xmm0 \n" \
+ "movdqu %%xmm5," MEMACCESS([dst_bgra]) " \n" \
+ "movdqu %%xmm0," MEMACCESS2(0x10,[dst_bgra]) " \n" \
+ "lea " MEMLEA(0x20,[dst_bgra]) ",%[dst_bgra] \n"
+
+// Store 8 ABGR values. Assumes XMM5 holds 0xff bytes (opaque alpha), set by
+// the caller.
+#define STOREABGR \
+ "punpcklbw %%xmm1,%%xmm2 \n" \
+ "punpcklbw %%xmm5,%%xmm0 \n" \
+ "movdqa %%xmm2,%%xmm1 \n" \
+ "punpcklwd %%xmm0,%%xmm2 \n" \
+ "punpckhwd %%xmm0,%%xmm1 \n" \
+ "movdqu %%xmm2," MEMACCESS([dst_abgr]) " \n" \
+ "movdqu %%xmm1," MEMACCESS2(0x10,[dst_abgr]) " \n" \
+ "lea " MEMLEA(0x20,[dst_abgr]) ",%[dst_abgr] \n"
+
+// Store 8 RGBA values. Assumes XMM5 is zero.
+#define STORERGBA \
+ "pcmpeqb %%xmm5,%%xmm5 \n" \
+ "punpcklbw %%xmm2,%%xmm1 \n" \
+ "punpcklbw %%xmm0,%%xmm5 \n" \
+ "movdqa %%xmm5,%%xmm0 \n" \
+ "punpcklwd %%xmm1,%%xmm5 \n" \
+ "punpckhwd %%xmm1,%%xmm0 \n" \
+ "movdqu %%xmm5," MEMACCESS([dst_rgba]) " \n" \
+ "movdqu %%xmm0," MEMACCESS2(0x10,[dst_rgba]) " \n" \
+ "lea " MEMLEA(0x20,[dst_rgba]) ",%[dst_rgba] \n"
void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
@@ -2189,19 +1580,11 @@ void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
asm volatile (
"sub %[u_buf],%[v_buf] \n"
"pcmpeqb %%xmm5,%%xmm5 \n"
- "pxor %%xmm4,%%xmm4 \n"
LABELALIGN
"1: \n"
READYUV444
- YUVTORGB
- "punpcklbw %%xmm1,%%xmm0 \n"
- "punpcklbw %%xmm5,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklwd %%xmm2,%%xmm0 \n"
- "punpckhwd %%xmm2,%%xmm1 \n"
- "movdqa %%xmm0," MEMACCESS([dst_argb]) " \n"
- "movdqa %%xmm1," MEMACCESS2(0x10,[dst_argb]) " \n"
- "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
+ YUVTORGB(kYuvConstants)
+ STOREARGB
"sub $0x8,%[width] \n"
"jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
@@ -2210,41 +1593,25 @@ void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
[dst_argb]"+r"(dst_argb), // %[dst_argb]
[width]"+rm"(width) // %[width]
: [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
);
}
+// TODO(fbarchard): Consider putting masks into constants.
void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* dst_rgb24,
int width) {
-// fpic 32 bit gcc 4.2 on OSX runs out of GPR regs.
-#if defined(__i386__)
- asm volatile (
- "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
- "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n"
- :: [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
- [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24));
-#endif
-
asm volatile (
-#if !defined(__i386__)
"movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
"movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n"
-#endif
"sub %[u_buf],%[v_buf] \n"
- "pxor %%xmm4,%%xmm4 \n"
LABELALIGN
"1: \n"
READYUV422
- YUVTORGB
+ YUVTORGB(kYuvConstants)
"punpcklbw %%xmm1,%%xmm0 \n"
"punpcklbw %%xmm2,%%xmm2 \n"
"movdqa %%xmm0,%%xmm1 \n"
@@ -2256,25 +1623,23 @@ void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf,
"movq %%xmm0," MEMACCESS([dst_rgb24]) "\n"
"movdqu %%xmm1," MEMACCESS2(0x8,[dst_rgb24]) "\n"
"lea " MEMLEA(0x18,[dst_rgb24]) ",%[dst_rgb24] \n"
- "sub $0x8,%[width] \n"
+ "subl $0x8,%[width] \n"
"jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
[dst_rgb24]"+r"(dst_rgb24), // %[dst_rgb24]
+// TODO(fbarchard): Make width a register for 32 bit.
+#if defined(__i386__) && defined(__pic__)
+ [width]"+m"(width) // %[width]
+#else
[width]"+rm"(width) // %[width]
- : [kYuvConstants]"r"(&kYuvConstants.kUVToB)
-#if !defined(__i386__)
- , [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
- [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24)
-#endif
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
#endif
+ : [kYuvConstants]"r"(&kYuvConstants.kUVToB),
+ [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
+ [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24)
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm5", "xmm6"
);
}
@@ -2283,26 +1648,14 @@ void OMITFP I422ToRAWRow_SSSE3(const uint8* y_buf,
const uint8* v_buf,
uint8* dst_raw,
int width) {
-// fpic 32 bit gcc 4.2 on OSX runs out of GPR regs.
-#if defined(__i386__)
- asm volatile (
- "movdqa %[kShuffleMaskARGBToRAW_0],%%xmm5 \n"
- "movdqa %[kShuffleMaskARGBToRAW],%%xmm6 \n"
- :: [kShuffleMaskARGBToRAW_0]"m"(kShuffleMaskARGBToRAW_0),
- [kShuffleMaskARGBToRAW]"m"(kShuffleMaskARGBToRAW));
-#endif
-
asm volatile (
-#if !defined(__i386__)
"movdqa %[kShuffleMaskARGBToRAW_0],%%xmm5 \n"
"movdqa %[kShuffleMaskARGBToRAW],%%xmm6 \n"
-#endif
"sub %[u_buf],%[v_buf] \n"
- "pxor %%xmm4,%%xmm4 \n"
LABELALIGN
"1: \n"
READYUV422
- YUVTORGB
+ YUVTORGB(kYuvConstants)
"punpcklbw %%xmm1,%%xmm0 \n"
"punpcklbw %%xmm2,%%xmm2 \n"
"movdqa %%xmm0,%%xmm1 \n"
@@ -2314,25 +1667,23 @@ void OMITFP I422ToRAWRow_SSSE3(const uint8* y_buf,
"movq %%xmm0," MEMACCESS([dst_raw]) " \n"
"movdqu %%xmm1," MEMACCESS2(0x8,[dst_raw]) "\n"
"lea " MEMLEA(0x18,[dst_raw]) ",%[dst_raw] \n"
- "sub $0x8,%[width] \n"
+ "subl $0x8,%[width] \n"
"jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
[dst_raw]"+r"(dst_raw), // %[dst_raw]
+// TODO(fbarchard): Make width a register for 32 bit.
+#if defined(__i386__) && defined(__pic__)
+ [width]"+m"(width) // %[width]
+#else
[width]"+rm"(width) // %[width]
- : [kYuvConstants]"r"(&kYuvConstants.kUVToB)
-#if !defined(__i386__)
- , [kShuffleMaskARGBToRAW_0]"m"(kShuffleMaskARGBToRAW_0),
- [kShuffleMaskARGBToRAW]"m"(kShuffleMaskARGBToRAW)
-#endif
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
#endif
+ : [kYuvConstants]"r"(&kYuvConstants.kUVToB),
+ [kShuffleMaskARGBToRAW_0]"m"(kShuffleMaskARGBToRAW_0),
+ [kShuffleMaskARGBToRAW]"m"(kShuffleMaskARGBToRAW)
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm5", "xmm6"
);
}
@@ -2344,19 +1695,11 @@ void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
asm volatile (
"sub %[u_buf],%[v_buf] \n"
"pcmpeqb %%xmm5,%%xmm5 \n"
- "pxor %%xmm4,%%xmm4 \n"
LABELALIGN
"1: \n"
READYUV422
- YUVTORGB
- "punpcklbw %%xmm1,%%xmm0 \n"
- "punpcklbw %%xmm5,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklwd %%xmm2,%%xmm0 \n"
- "punpckhwd %%xmm2,%%xmm1 \n"
- "movdqa %%xmm0," MEMACCESS([dst_argb]) "\n"
- "movdqa %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
- "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
+ YUVTORGB(kYuvConstants)
+ STOREARGB
"sub $0x8,%[width] \n"
"jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
@@ -2365,13 +1708,8 @@ void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
[dst_argb]"+r"(dst_argb), // %[dst_argb]
[width]"+rm"(width) // %[width]
: [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
);
}
@@ -2383,19 +1721,11 @@ void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf,
asm volatile (
"sub %[u_buf],%[v_buf] \n"
"pcmpeqb %%xmm5,%%xmm5 \n"
- "pxor %%xmm4,%%xmm4 \n"
LABELALIGN
"1: \n"
READYUV411
- YUVTORGB
- "punpcklbw %%xmm1,%%xmm0 \n"
- "punpcklbw %%xmm5,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklwd %%xmm2,%%xmm0 \n"
- "punpckhwd %%xmm2,%%xmm1 \n"
- "movdqa %%xmm0," MEMACCESS([dst_argb]) "\n"
- "movdqa %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
- "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
+ YUVTORGB(kYuvConstants)
+ STOREARGB
"sub $0x8,%[width] \n"
"jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
@@ -2404,13 +1734,8 @@ void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf,
[dst_argb]"+r"(dst_argb), // %[dst_argb]
[width]"+rm"(width) // %[width]
: [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
);
}
@@ -2420,19 +1745,11 @@ void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf,
int width) {
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
- "pxor %%xmm4,%%xmm4 \n"
LABELALIGN
"1: \n"
READNV12
- YUVTORGB
- "punpcklbw %%xmm1,%%xmm0 \n"
- "punpcklbw %%xmm5,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklwd %%xmm2,%%xmm0 \n"
- "punpckhwd %%xmm2,%%xmm1 \n"
- "movdqa %%xmm0," MEMACCESS([dst_argb]) "\n"
- "movdqa %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
- "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
+ YUVTORGB(kYuvConstants)
+ STOREARGB
"sub $0x8,%[width] \n"
"jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
@@ -2440,11 +1757,8 @@ void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf,
[dst_argb]"+r"(dst_argb), // %[dst_argb]
[width]"+rm"(width) // %[width]
: [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
- : "memory", "cc"
// Does not use r14.
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
);
}
@@ -2454,216 +1768,20 @@ void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf,
int width) {
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
- "pxor %%xmm4,%%xmm4 \n"
- LABELALIGN
- "1: \n"
- READNV12
- YVUTORGB
- "punpcklbw %%xmm1,%%xmm0 \n"
- "punpcklbw %%xmm5,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklwd %%xmm2,%%xmm0 \n"
- "punpckhwd %%xmm2,%%xmm1 \n"
- "movdqa %%xmm0," MEMACCESS([dst_argb]) "\n"
- "movdqa %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
- "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
- "sub $0x8,%[width] \n"
- "jg 1b \n"
- : [y_buf]"+r"(y_buf), // %[y_buf]
- [uv_buf]"+r"(uv_buf), // %[uv_buf]
- [dst_argb]"+r"(dst_argb), // %[dst_argb]
- [width]"+rm"(width) // %[width]
- : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
- : "memory", "cc"
- // Does not use r14.
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
-
-void OMITFP I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_argb,
- int width) {
- asm volatile (
- "sub %[u_buf],%[v_buf] \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pxor %%xmm4,%%xmm4 \n"
- LABELALIGN
- "1: \n"
- READYUV444
- YUVTORGB
- "punpcklbw %%xmm1,%%xmm0 \n"
- "punpcklbw %%xmm5,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklwd %%xmm2,%%xmm0 \n"
- "punpckhwd %%xmm2,%%xmm1 \n"
- "movdqu %%xmm0," MEMACCESS([dst_argb]) "\n"
- "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
- "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
- "sub $0x8,%[width] \n"
- "jg 1b \n"
- : [y_buf]"+r"(y_buf), // %[y_buf]
- [u_buf]"+r"(u_buf), // %[u_buf]
- [v_buf]"+r"(v_buf), // %[v_buf]
- [dst_argb]"+r"(dst_argb), // %[dst_argb]
- [width]"+rm"(width) // %[width]
- : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
-
-void OMITFP I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_argb,
- int width) {
- asm volatile (
- "sub %[u_buf],%[v_buf] \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pxor %%xmm4,%%xmm4 \n"
- LABELALIGN
- "1: \n"
- READYUV422
- YUVTORGB
- "punpcklbw %%xmm1,%%xmm0 \n"
- "punpcklbw %%xmm5,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklwd %%xmm2,%%xmm0 \n"
- "punpckhwd %%xmm2,%%xmm1 \n"
- "movdqu %%xmm0," MEMACCESS([dst_argb]) "\n"
- "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
- "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
- "sub $0x8,%[width] \n"
- "jg 1b \n"
- : [y_buf]"+r"(y_buf), // %[y_buf]
- [u_buf]"+r"(u_buf), // %[u_buf]
- [v_buf]"+r"(v_buf), // %[v_buf]
- [dst_argb]"+r"(dst_argb), // %[dst_argb]
- [width]"+rm"(width) // %[width]
- : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
-
-void OMITFP I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_argb,
- int width) {
- asm volatile (
- "sub %[u_buf],%[v_buf] \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pxor %%xmm4,%%xmm4 \n"
- LABELALIGN
- "1: \n"
- READYUV411
- YUVTORGB
- "punpcklbw %%xmm1,%%xmm0 \n"
- "punpcklbw %%xmm5,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklwd %%xmm2,%%xmm0 \n"
- "punpckhwd %%xmm2,%%xmm1 \n"
- "movdqu %%xmm0," MEMACCESS([dst_argb]) "\n"
- "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
- "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
- "sub $0x8,%[width] \n"
- "jg 1b \n"
- : [y_buf]"+r"(y_buf), // %[y_buf]
- [u_buf]"+r"(u_buf), // %[u_buf]
- [v_buf]"+r"(v_buf), // %[v_buf]
- [dst_argb]"+r"(dst_argb), // %[dst_argb]
- [width]"+rm"(width) // %[width]
- : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
-
-void OMITFP NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
- const uint8* uv_buf,
- uint8* dst_argb,
- int width) {
- asm volatile (
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pxor %%xmm4,%%xmm4 \n"
LABELALIGN
"1: \n"
READNV12
- YUVTORGB
- "punpcklbw %%xmm1,%%xmm0 \n"
- "punpcklbw %%xmm5,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklwd %%xmm2,%%xmm0 \n"
- "punpckhwd %%xmm2,%%xmm1 \n"
- "movdqu %%xmm0," MEMACCESS([dst_argb]) "\n"
- "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
- "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
+ YUVTORGB(kYuvConstants)
+ STOREARGB
"sub $0x8,%[width] \n"
"jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[uv_buf]"+r"(uv_buf), // %[uv_buf]
[dst_argb]"+r"(dst_argb), // %[dst_argb]
[width]"+rm"(width) // %[width]
- : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
- : "memory", "cc"
- // Does not use r14.
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
-
-void OMITFP NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
- const uint8* uv_buf,
- uint8* dst_argb,
- int width) {
- asm volatile (
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pxor %%xmm4,%%xmm4 \n"
- LABELALIGN
- "1: \n"
- READNV12
- YVUTORGB
- "punpcklbw %%xmm1,%%xmm0 \n"
- "punpcklbw %%xmm5,%%xmm2 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklwd %%xmm2,%%xmm0 \n"
- "punpckhwd %%xmm2,%%xmm1 \n"
- "movdqu %%xmm0," MEMACCESS([dst_argb]) "\n"
- "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
- "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
- "sub $0x8,%[width] \n"
- "jg 1b \n"
- : [y_buf]"+r"(y_buf), // %[y_buf]
- [uv_buf]"+r"(uv_buf), // %[uv_buf]
- [dst_argb]"+r"(dst_argb), // %[dst_argb]
- [width]"+rm"(width) // %[width]
- : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
- : "memory", "cc"
+ : [kYuvConstants]"r"(&kYvuConstants.kUVToB) // %[kYuvConstants]
// Does not use r14.
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
);
}
@@ -2675,20 +1793,11 @@ void OMITFP I422ToBGRARow_SSSE3(const uint8* y_buf,
asm volatile (
"sub %[u_buf],%[v_buf] \n"
"pcmpeqb %%xmm5,%%xmm5 \n"
- "pxor %%xmm4,%%xmm4 \n"
LABELALIGN
"1: \n"
READYUV422
- YUVTORGB
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "punpcklbw %%xmm0,%%xmm1 \n"
- "punpcklbw %%xmm2,%%xmm5 \n"
- "movdqa %%xmm5,%%xmm0 \n"
- "punpcklwd %%xmm1,%%xmm5 \n"
- "punpckhwd %%xmm1,%%xmm0 \n"
- "movdqa %%xmm5," MEMACCESS([dst_bgra]) "\n"
- "movdqa %%xmm0," MEMACCESS2(0x10,[dst_bgra]) "\n"
- "lea " MEMLEA(0x20,[dst_bgra]) ",%[dst_bgra] \n"
+ YUVTORGB(kYuvConstants)
+ STOREBGRA
"sub $0x8,%[width] \n"
"jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
@@ -2697,13 +1806,8 @@ void OMITFP I422ToBGRARow_SSSE3(const uint8* y_buf,
[dst_bgra]"+r"(dst_bgra), // %[dst_bgra]
[width]"+rm"(width) // %[width]
: [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
);
}
@@ -2715,19 +1819,11 @@ void OMITFP I422ToABGRRow_SSSE3(const uint8* y_buf,
asm volatile (
"sub %[u_buf],%[v_buf] \n"
"pcmpeqb %%xmm5,%%xmm5 \n"
- "pxor %%xmm4,%%xmm4 \n"
LABELALIGN
"1: \n"
READYUV422
- YUVTORGB
- "punpcklbw %%xmm1,%%xmm2 \n"
- "punpcklbw %%xmm5,%%xmm0 \n"
- "movdqa %%xmm2,%%xmm1 \n"
- "punpcklwd %%xmm0,%%xmm2 \n"
- "punpckhwd %%xmm0,%%xmm1 \n"
- "movdqa %%xmm2," MEMACCESS([dst_abgr]) "\n"
- "movdqa %%xmm1," MEMACCESS2(0x10,[dst_abgr]) "\n"
- "lea " MEMLEA(0x20,[dst_abgr]) ",%[dst_abgr] \n"
+ YUVTORGB(kYuvConstants)
+ STOREABGR
"sub $0x8,%[width] \n"
"jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
@@ -2736,13 +1832,8 @@ void OMITFP I422ToABGRRow_SSSE3(const uint8* y_buf,
[dst_abgr]"+r"(dst_abgr), // %[dst_abgr]
[width]"+rm"(width) // %[width]
: [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
);
}
@@ -2754,20 +1845,11 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf,
asm volatile (
"sub %[u_buf],%[v_buf] \n"
"pcmpeqb %%xmm5,%%xmm5 \n"
- "pxor %%xmm4,%%xmm4 \n"
LABELALIGN
"1: \n"
READYUV422
- YUVTORGB
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "punpcklbw %%xmm2,%%xmm1 \n"
- "punpcklbw %%xmm0,%%xmm5 \n"
- "movdqa %%xmm5,%%xmm0 \n"
- "punpcklwd %%xmm1,%%xmm5 \n"
- "punpckhwd %%xmm1,%%xmm0 \n"
- "movdqa %%xmm5," MEMACCESS([dst_rgba]) "\n"
- "movdqa %%xmm0," MEMACCESS2(0x10,[dst_rgba]) "\n"
- "lea " MEMLEA(0x20,[dst_rgba]) ",%[dst_rgba] \n"
+ YUVTORGB(kYuvConstants)
+ STORERGBA
"sub $0x8,%[width] \n"
"jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
@@ -2776,159 +1858,233 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf,
[dst_rgba]"+r"(dst_rgba), // %[dst_rgba]
[width]"+rm"(width) // %[width]
: [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
);
}
-void OMITFP I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_bgra,
- int width) {
+#endif // HAS_I422TOARGBROW_SSSE3
+
+// Read 8 UV from 422, upsample to 16 UV.
+#define READYUV422_AVX2 \
+ "vmovq " MEMACCESS([u_buf]) ",%%xmm0 \n" \
+ MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1) \
+ "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \
+ "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
+ "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n"
+
+// Convert 16 pixels: 16 UV and 16 Y.
+#define YUVTORGB_AVX2(YuvConstants) \
+ "vpmaddubsw " MEMACCESS2(64, [YuvConstants]) ",%%ymm0,%%ymm2 \n" \
+ "vpmaddubsw " MEMACCESS2(32, [YuvConstants]) ",%%ymm0,%%ymm1 \n" \
+ "vpmaddubsw " MEMACCESS([YuvConstants]) ",%%ymm0,%%ymm0 \n" \
+ "vmovdqu " MEMACCESS2(160, [YuvConstants]) ",%%ymm3 \n" \
+ "vpsubw %%ymm2,%%ymm3,%%ymm2 \n" \
+ "vmovdqu " MEMACCESS2(128, [YuvConstants]) ",%%ymm2 \n" \
+ "vpsubw %%ymm1,%%ymm2,%%ymm1 \n" \
+ "vmovdqu " MEMACCESS2(96, [YuvConstants]) ",%%ymm1 \n" \
+ "vpsubw %%ymm0,%%ymm1,%%ymm0 \n" \
+ "vmovdqu " MEMACCESS([y_buf]) ",%%xmm3 \n" \
+ "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" \
+ "vpermq $0xd8,%%ymm3,%%ymm3 \n" \
+ "vpunpcklbw %%ymm3,%%ymm3,%%ymm3 \n" \
+ "vpmulhuw " MEMACCESS2(192, [YuvConstants]) ",%%ymm3,%%ymm3 \n" \
+ "vpaddsw %%ymm3,%%ymm0,%%ymm0 \n" \
+ "vpaddsw %%ymm3,%%ymm1,%%ymm1 \n" \
+ "vpaddsw %%ymm3,%%ymm2,%%ymm2 \n" \
+ "vpsraw $0x6,%%ymm0,%%ymm0 \n" \
+ "vpsraw $0x6,%%ymm1,%%ymm1 \n" \
+ "vpsraw $0x6,%%ymm2,%%ymm2 \n" \
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \
+ "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \
+ "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n"
+
+#if defined(HAS_I422TOBGRAROW_AVX2)
+// 16 pixels
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 BGRA (64 bytes).
+void OMITFP I422ToBGRARow_AVX2(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_bgra,
+ int width) {
asm volatile (
"sub %[u_buf],%[v_buf] \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pxor %%xmm4,%%xmm4 \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN
"1: \n"
- READYUV422
- YUVTORGB
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "punpcklbw %%xmm0,%%xmm1 \n"
- "punpcklbw %%xmm2,%%xmm5 \n"
- "movdqa %%xmm5,%%xmm0 \n"
- "punpcklwd %%xmm1,%%xmm5 \n"
- "punpckhwd %%xmm1,%%xmm0 \n"
- "movdqu %%xmm5," MEMACCESS([dst_bgra]) "\n"
- "movdqu %%xmm0," MEMACCESS2(0x10,[dst_bgra]) "\n"
- "lea " MEMLEA(0x20,[dst_bgra]) ",%[dst_bgra] \n"
- "sub $0x8,%[width] \n"
+ READYUV422_AVX2
+ YUVTORGB_AVX2(kYuvConstants)
+
+ // Step 3: Weave into BGRA
+ "vpunpcklbw %%ymm0,%%ymm1,%%ymm1 \n" // GB
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n"
+ "vpunpcklbw %%ymm2,%%ymm5,%%ymm2 \n" // AR
+ "vpermq $0xd8,%%ymm2,%%ymm2 \n"
+ "vpunpcklwd %%ymm1,%%ymm2,%%ymm0 \n" // ARGB first 8 pixels
+ "vpunpckhwd %%ymm1,%%ymm2,%%ymm2 \n" // ARGB next 8 pixels
+
+ "vmovdqu %%ymm0," MEMACCESS([dst_bgra]) "\n"
+ "vmovdqu %%ymm2," MEMACCESS2(0x20,[dst_bgra]) "\n"
+ "lea " MEMLEA(0x40,[dst_bgra]) ",%[dst_bgra] \n"
+ "sub $0x10,%[width] \n"
"jg 1b \n"
+ "vzeroupper \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
[dst_bgra]"+r"(dst_bgra), // %[dst_bgra]
[width]"+rm"(width) // %[width]
- : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
+ : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
);
}
+#endif // HAS_I422TOBGRAROW_AVX2
-void OMITFP I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_abgr,
- int width) {
+#if defined(HAS_I422TOARGBROW_AVX2)
+// 16 pixels
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
+void OMITFP I422ToARGBRow_AVX2(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_argb,
+ int width) {
asm volatile (
"sub %[u_buf],%[v_buf] \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pxor %%xmm4,%%xmm4 \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN
"1: \n"
- READYUV422
- YUVTORGB
- "punpcklbw %%xmm1,%%xmm2 \n"
- "punpcklbw %%xmm5,%%xmm0 \n"
- "movdqa %%xmm2,%%xmm1 \n"
- "punpcklwd %%xmm0,%%xmm2 \n"
- "punpckhwd %%xmm0,%%xmm1 \n"
- "movdqu %%xmm2," MEMACCESS([dst_abgr]) "\n"
- "movdqu %%xmm1," MEMACCESS2(0x10,[dst_abgr]) "\n"
- "lea " MEMLEA(0x20,[dst_abgr]) ",%[dst_abgr] \n"
- "sub $0x8,%[width] \n"
+ READYUV422_AVX2
+ YUVTORGB_AVX2(kYuvConstants)
+
+ // Step 3: Weave into ARGB
+ "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" // BG
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpunpcklbw %%ymm5,%%ymm2,%%ymm2 \n" // RA
+ "vpermq $0xd8,%%ymm2,%%ymm2 \n"
+ "vpunpcklwd %%ymm2,%%ymm0,%%ymm1 \n" // BGRA first 8 pixels
+ "vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" // BGRA next 8 pixels
+
+ "vmovdqu %%ymm1," MEMACCESS([dst_argb]) "\n"
+ "vmovdqu %%ymm0," MEMACCESS2(0x20,[dst_argb]) "\n"
+ "lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n"
+ "sub $0x10,%[width] \n"
"jg 1b \n"
+ "vzeroupper \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
- [dst_abgr]"+r"(dst_abgr), // %[dst_abgr]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
[width]"+rm"(width) // %[width]
- : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
+ : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
);
}
+#endif // HAS_I422TOARGBROW_AVX2
-void OMITFP I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_rgba,
- int width) {
+#if defined(HAS_I422TOABGRROW_AVX2)
+// 16 pixels
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ABGR (64 bytes).
+void OMITFP I422ToABGRRow_AVX2(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_argb,
+ int width) {
asm volatile (
"sub %[u_buf],%[v_buf] \n"
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "pxor %%xmm4,%%xmm4 \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN
"1: \n"
- READYUV422
- YUVTORGB
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "punpcklbw %%xmm2,%%xmm1 \n"
- "punpcklbw %%xmm0,%%xmm5 \n"
- "movdqa %%xmm5,%%xmm0 \n"
- "punpcklwd %%xmm1,%%xmm5 \n"
- "punpckhwd %%xmm1,%%xmm0 \n"
- "movdqu %%xmm5," MEMACCESS([dst_rgba]) "\n"
- "movdqu %%xmm0," MEMACCESS2(0x10,[dst_rgba]) "\n"
- "lea " MEMLEA(0x20,[dst_rgba]) ",%[dst_rgba] \n"
- "sub $0x8,%[width] \n"
+ READYUV422_AVX2
+ YUVTORGB_AVX2(kYuvConstants)
+
+ // Step 3: Weave into ABGR
+ "vpunpcklbw %%ymm1,%%ymm2,%%ymm1 \n" // RG
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n"
+ "vpunpcklbw %%ymm5,%%ymm0,%%ymm2 \n" // BA
+ "vpermq $0xd8,%%ymm2,%%ymm2 \n"
+ "vpunpcklwd %%ymm2,%%ymm1,%%ymm0 \n" // RGBA first 8 pixels
+ "vpunpckhwd %%ymm2,%%ymm1,%%ymm1 \n" // RGBA next 8 pixels
+ "vmovdqu %%ymm0," MEMACCESS([dst_argb]) "\n"
+ "vmovdqu %%ymm1," MEMACCESS2(0x20,[dst_argb]) "\n"
+ "lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n"
+ "sub $0x10,%[width] \n"
"jg 1b \n"
+ "vzeroupper \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
- [dst_rgba]"+r"(dst_rgba), // %[dst_rgba]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
[width]"+rm"(width) // %[width]
- : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
+ : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
);
}
+#endif // HAS_I422TOABGRROW_AVX2
-#endif // HAS_I422TOARGBROW_SSSE3
+#if defined(HAS_I422TORGBAROW_AVX2)
+// 16 pixels
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
+void OMITFP I422ToRGBARow_AVX2(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_argb,
+ int width) {
+ asm volatile (
+ "sub %[u_buf],%[v_buf] \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ LABELALIGN
+ "1: \n"
+ READYUV422_AVX2
+ YUVTORGB_AVX2(kYuvConstants)
+
+ // Step 3: Weave into RGBA
+ "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n"
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n"
+ "vpunpcklbw %%ymm0,%%ymm5,%%ymm2 \n"
+ "vpermq $0xd8,%%ymm2,%%ymm2 \n"
+ "vpunpcklwd %%ymm1,%%ymm2,%%ymm0 \n"
+ "vpunpckhwd %%ymm1,%%ymm2,%%ymm1 \n"
+ "vmovdqu %%ymm0," MEMACCESS([dst_argb]) "\n"
+ "vmovdqu %%ymm1," MEMACCESS2(0x20,[dst_argb]) "\n"
+ "lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n"
+ "sub $0x10,%[width] \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+ );
+}
+#endif // HAS_I422TORGBAROW_AVX2
#ifdef HAS_YTOARGBROW_SSE2
-void YToARGBRow_SSE2(const uint8* y_buf,
- uint8* dst_argb,
- int width) {
+void YToARGBRow_SSE2(const uint8* y_buf, uint8* dst_argb, int width) {
asm volatile (
- "pxor %%xmm5,%%xmm5 \n"
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "pslld $0x18,%%xmm4 \n"
- "mov $0x00100010,%%eax \n"
- "movd %%eax,%%xmm3 \n"
- "pshufd $0x0,%%xmm3,%%xmm3 \n"
- "mov $0x004a004a,%%eax \n"
+ "mov $0x4a354a35,%%eax \n" // 4a35 = 18997 = 1.164
"movd %%eax,%%xmm2 \n"
"pshufd $0x0,%%xmm2,%%xmm2 \n"
+ "mov $0x04880488,%%eax \n" // 0488 = 1160 = 1.164 * 16
+ "movd %%eax,%%xmm3 \n"
+ "pshufd $0x0,%%xmm3,%%xmm3 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "pslld $0x18,%%xmm4 \n"
LABELALIGN
"1: \n"
// Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
"movq " MEMACCESS(0) ",%%xmm0 \n"
"lea " MEMLEA(0x8,0) ",%0 \n"
- "punpcklbw %%xmm5,%%xmm0 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "pmulhuw %%xmm2,%%xmm0 \n"
"psubusw %%xmm3,%%xmm0 \n"
- "pmullw %%xmm2,%%xmm0 \n"
"psrlw $6, %%xmm0 \n"
"packuswb %%xmm0,%%xmm0 \n"
@@ -2939,8 +2095,8 @@ void YToARGBRow_SSE2(const uint8* y_buf,
"punpckhwd %%xmm1,%%xmm1 \n"
"por %%xmm4,%%xmm0 \n"
"por %%xmm4,%%xmm1 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
- "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
"lea " MEMLEA(0x20,1) ",%1 \n"
"sub $0x8,%2 \n"
@@ -2950,13 +2106,58 @@ void YToARGBRow_SSE2(const uint8* y_buf,
"+rm"(width) // %2
:
: "memory", "cc", "eax"
-#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
-#endif
);
}
#endif // HAS_YTOARGBROW_SSE2
+#ifdef HAS_YTOARGBROW_AVX2
+// 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
+// note: vpunpcklbw mutates and vpackuswb unmutates.
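+// (AVX2 unpack operates per 128-bit lane, so intermediate word order is
+// permuted; the matching pack applies the inverse permutation, restoring it.)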
+void YToARGBRow_AVX2(const uint8* y_buf, uint8* dst_argb, int width) {
+ asm volatile (
+ "mov $0x4a354a35,%%eax \n" // 0488 = 1160 = 1.164 * 16
+ "vmovd %%eax,%%xmm2 \n"
+ "vbroadcastss %%xmm2,%%ymm2 \n"
+ "mov $0x4880488,%%eax \n" // 4a35 = 18997 = 1.164
+ "vmovd %%eax,%%xmm3 \n"
+ "vbroadcastss %%xmm3,%%ymm3 \n"
+ "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
+ "vpslld $0x18,%%ymm4,%%ymm4 \n"
+
+ LABELALIGN
+ "1: \n"
+ // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164
+ "vmovdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpsubusw %%ymm3,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x6,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpunpcklbw %%ymm0,%%ymm0,%%ymm1 \n"
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n"
+ "vpunpcklwd %%ymm1,%%ymm1,%%ymm0 \n"
+ "vpunpckhwd %%ymm1,%%ymm1,%%ymm1 \n"
+ "vpor %%ymm4,%%ymm0,%%ymm0 \n"
+ "vpor %%ymm4,%%ymm1,%%ymm1 \n"
+ "vmovdqu %%ymm0," MEMACCESS(1) " \n"
+ "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n"
+ "lea " MEMLEA(0x40,1) ",%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(y_buf), // %0
+ "+r"(dst_argb), // %1
+ "+rm"(width) // %2
+ :
+ : "memory", "cc", "eax"
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
+ );
+}
+#endif // HAS_YTOARGBROW_AVX2
+
#ifdef HAS_MIRRORROW_SSSE3
// Shuffle table for reversing the bytes.
static uvec8 kShuffleMirror = {
@@ -2967,38 +2168,56 @@ void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
intptr_t temp_width = (intptr_t)(width);
asm volatile (
"movdqa %3,%%xmm5 \n"
- "lea " MEMLEA(-0x10,0) ",%0 \n"
LABELALIGN
"1: \n"
- MEMOPREG(movdqa,0x00,0,2,1,xmm0) // movdqa (%0,%2),%%xmm0
+ MEMOPREG(movdqu,-0x10,0,2,1,xmm0) // movdqu -0x10(%0,%2),%%xmm0
"pshufb %%xmm5,%%xmm0 \n"
- "sub $0x10,%2 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(temp_width) // %2
: "m"(kShuffleMirror) // %3
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm5"
-#endif
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm5"
);
}
#endif // HAS_MIRRORROW_SSSE3
+#ifdef HAS_MIRRORROW_AVX2
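+// Reverses 32 bytes per iteration: vpshufb mirrors the bytes within each
+// 128-bit lane, then vpermq $0x4e swaps the two lanes to finish the reversal.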
+void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
+ intptr_t temp_width = (intptr_t)(width);
+ asm volatile (
+ "vbroadcastf128 %3,%%ymm5 \n"
+ LABELALIGN
+ "1: \n"
+ MEMOPREG(vmovdqu,-0x20,0,2,1,ymm0) // vmovdqu -0x20(%0,%2),%%ymm0
+ "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpermq $0x4e,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(temp_width) // %2
+ : "m"(kShuffleMirror) // %3
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm5"
+ );
+}
+#endif // HAS_MIRRORROW_AVX2
+
#ifdef HAS_MIRRORROW_SSE2
void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
intptr_t temp_width = (intptr_t)(width);
asm volatile (
- "lea " MEMLEA(-0x10,0) ",%0 \n"
LABELALIGN
"1: \n"
- MEMOPREG(movdqu,0x00,0,2,1,xmm0) // movdqu (%0,%2),%%xmm0
+ MEMOPREG(movdqu,-0x10,0,2,1,xmm0) // movdqu -0x10(%0,%2),%%xmm0
"movdqa %%xmm0,%%xmm1 \n"
"psllw $0x8,%%xmm0 \n"
"psrlw $0x8,%%xmm1 \n"
@@ -3006,21 +2225,16 @@ void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
"pshuflw $0x1b,%%xmm0,%%xmm0 \n"
"pshufhw $0x1b,%%xmm0,%%xmm0 \n"
"pshufd $0x4e,%%xmm0,%%xmm0 \n"
- "sub $0x10,%2 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1)",%1 \n"
+ "sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(temp_width) // %2
:
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1"
-#endif
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1"
);
}
#endif // HAS_MIRRORROW_SSE2
@@ -3035,108 +2249,119 @@ void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
intptr_t temp_width = (intptr_t)(width);
asm volatile (
"movdqa %4,%%xmm1 \n"
- "lea " MEMLEA4(-0x10,0,3,2) ",%0 \n"
+ "lea " MEMLEA4(-0x10,0,3,2) ",%0 \n"
"sub %1,%2 \n"
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "lea " MEMLEA(-0x10,0) ",%0 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "lea " MEMLEA(-0x10,0) ",%0 \n"
"pshufb %%xmm1,%%xmm0 \n"
- "sub $8,%3 \n"
"movlpd %%xmm0," MEMACCESS(1) " \n"
- BUNDLEALIGN
MEMOPMEM(movhpd,xmm0,0x00,1,2,1) // movhpd %%xmm0,(%1,%2)
"lea " MEMLEA(0x8,1) ",%1 \n"
+ "sub $8,%3 \n"
"jg 1b \n"
: "+r"(src), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+r"(temp_width) // %3
: "m"(kShuffleMirrorUV) // %4
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1"
-#endif
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1"
);
}
#endif // HAS_MIRRORROW_UV_SSSE3
-#ifdef HAS_ARGBMIRRORROW_SSSE3
-// Shuffle table for reversing the bytes.
-static uvec8 kARGBShuffleMirror = {
- 12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u, 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u
-};
+#ifdef HAS_ARGBMIRRORROW_SSE2
-void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
+void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
intptr_t temp_width = (intptr_t)(width);
asm volatile (
"lea " MEMLEA4(-0x10,0,2,4) ",%0 \n"
- "movdqa %3,%%xmm5 \n"
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "pshufb %%xmm5,%%xmm0 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "pshufd $0x1b,%%xmm0,%%xmm0 \n"
"lea " MEMLEA(-0x10,0) ",%0 \n"
- "sub $0x4,%2 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x4,%2 \n"
"jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(temp_width) // %2
- : "m"(kARGBShuffleMirror) // %3
+ :
: "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm5"
-#endif
+ , "xmm0"
);
}
-#endif // HAS_ARGBMIRRORROW_SSSE3
+#endif // HAS_ARGBMIRRORROW_SSE2
-#ifdef HAS_SPLITUVROW_SSE2
-void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
+#ifdef HAS_ARGBMIRRORROW_AVX2
+// Permute table for reversing the 32-bit pixels (dword indices for vpermd).
+static const ulvec32 kARGBShuffleMirror_AVX2 = {
+ 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
+};
+void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
+ intptr_t temp_width = (intptr_t)(width);
asm volatile (
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $0x8,%%xmm5 \n"
+ "vmovdqu %3,%%ymm5 \n"
+ LABELALIGN
+ "1: \n"
+ VMEMOPREG(vpermd,-0x20,0,2,4,ymm5,ymm0) // vpermd -0x20(%0,%2,4),ymm5,ymm0
+ "vmovdqu %%ymm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(temp_width) // %2
+ : "m"(kARGBShuffleMirror_AVX2) // %3
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm5"
+ );
+}
+#endif // HAS_ARGBMIRRORROW_AVX2
+
+#ifdef HAS_SPLITUVROW_AVX2
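+// Deinterleaves UV: the 0x00ff mask in ymm5 keeps the even (U) bytes and a
+// right shift by 8 exposes the odd (V) bytes; vpackuswb plus vpermq $0xd8
+// put both planes back in linear order.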
+void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
+ asm volatile (
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
"sub %1,%2 \n"
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "movdqa %%xmm1,%%xmm3 \n"
- "pand %%xmm5,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "psrlw $0x8,%%xmm2 \n"
- "psrlw $0x8,%%xmm3 \n"
- "packuswb %%xmm3,%%xmm2 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
- MEMOPMEM(movdqa,xmm2,0x00,1,2,1) // movdqa %%xmm2,(%1,%2)
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "sub $0x10,%3 \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
+ "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm2 \n"
+ "vpsrlw $0x8,%%ymm1,%%ymm3 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm3,%%ymm2,%%ymm2 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm2,%%ymm2 \n"
+ "vmovdqu %%ymm0," MEMACCESS(1) " \n"
+ MEMOPMEM(vmovdqu,ymm2,0x00,1,2,1) // vmovdqu %%ymm2,(%1,%2)
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "sub $0x20,%3 \n"
"jg 1b \n"
+ "vzeroupper \n"
: "+r"(src_uv), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+r"(pix) // %3
:
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-#endif
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
);
}
+#endif // HAS_SPLITUVROW_AVX2
-void SplitUVRow_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
- int pix) {
+#ifdef HAS_SPLITUVROW_SSE2
+void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
@@ -3164,52 +2389,46 @@ void SplitUVRow_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
"+r"(dst_v), // %2
"+r"(pix) // %3
:
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-#endif
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
);
}
#endif // HAS_SPLITUVROW_SSE2
-#ifdef HAS_MERGEUVROW_SSE2
-void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+#ifdef HAS_MERGEUVROW_AVX2
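+// Interleaves U and V: vpunpcklbw/vpunpckhbw pair the bytes within each lane,
+// and the four vextractf128 stores write the lanes back out in linear order.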
+void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
int width) {
asm volatile (
"sub %0,%1 \n"
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- MEMOPREG(movdqa,0x00,0,1,1,xmm1) // movdqa (%0,%1,1),%%xmm1
- "lea " MEMLEA(0x10,0) ",%0 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "punpcklbw %%xmm1,%%xmm0 \n"
- "punpckhbw %%xmm1,%%xmm2 \n"
- "movdqa %%xmm0," MEMACCESS(2) " \n"
- "movdqa %%xmm2," MEMACCESS2(0x10,2) " \n"
- "lea " MEMLEA(0x20,2) ",%2 \n"
- "sub $0x10,%3 \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
+ MEMOPREG(vmovdqu,0x00,0,1,1,ymm1) // vmovdqu (%0,%1,1),%%ymm1
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "vpunpcklbw %%ymm1,%%ymm0,%%ymm2 \n"
+ "vpunpckhbw %%ymm1,%%ymm0,%%ymm0 \n"
+ "vextractf128 $0x0,%%ymm2," MEMACCESS(2) " \n"
+ "vextractf128 $0x0,%%ymm0," MEMACCESS2(0x10,2) "\n"
+ "vextractf128 $0x1,%%ymm2," MEMACCESS2(0x20,2) "\n"
+ "vextractf128 $0x1,%%ymm0," MEMACCESS2(0x30,2) "\n"
+ "lea " MEMLEA(0x40,2) ",%2 \n"
+ "sub $0x20,%3 \n"
"jg 1b \n"
+ "vzeroupper \n"
: "+r"(src_u), // %0
"+r"(src_v), // %1
"+r"(dst_uv), // %2
"+r"(width) // %3
:
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2"
-#endif
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2"
);
}
+#endif // HAS_MERGEUVROW_AVX2
-void MergeUVRow_Unaligned_SSE2(const uint8* src_u, const uint8* src_v,
- uint8* dst_uv, int width) {
+#ifdef HAS_MERGEUVROW_SSE2
+void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+ int width) {
asm volatile (
"sub %0,%1 \n"
LABELALIGN
@@ -3230,13 +2449,8 @@ void MergeUVRow_Unaligned_SSE2(const uint8* src_u, const uint8* src_v,
"+r"(dst_uv), // %2
"+r"(width) // %3
:
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2"
-#endif
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2"
);
}
#endif // HAS_MERGEUVROW_SSE2
@@ -3246,11 +2460,11 @@ void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
asm volatile (
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
- "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
"lea " MEMLEA(0x20,1) ",%1 \n"
"sub $0x20,%2 \n"
"jg 1b \n"
@@ -3259,30 +2473,36 @@ void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
"+r"(count) // %2
:
: "memory", "cc"
-#if defined(__SSE2__)
, "xmm0", "xmm1"
-#endif
);
}
#endif // HAS_COPYROW_SSE2
-#ifdef HAS_COPYROW_X86
-void CopyRow_X86(const uint8* src, uint8* dst, int width) {
- size_t width_tmp = (size_t)(width);
+#ifdef HAS_COPYROW_AVX
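+// Copies 64 bytes per iteration using two unaligned 32-byte ymm loads/stores.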
+void CopyRow_AVX(const uint8* src, uint8* dst, int count) {
asm volatile (
- "shr $0x2,%2 \n"
- "rep movsl " MEMMOVESTRING(0,1) " \n"
- : "+S"(src), // %0
- "+D"(dst), // %1
- "+c"(width_tmp) // %2
+ LABELALIGN
+ "1: \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
+ "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "vmovdqu %%ymm0," MEMACCESS(1) " \n"
+ "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n"
+ "lea " MEMLEA(0x40,1) ",%1 \n"
+ "sub $0x40,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(count) // %2
:
: "memory", "cc"
+ , "xmm0", "xmm1"
);
}
-#endif // HAS_COPYROW_X86
+#endif // HAS_COPYROW_AVX
#ifdef HAS_COPYROW_ERMS
-// Unaligned Multiple of 1.
+// Multiple of 1.
void CopyRow_ERMS(const uint8* src, uint8* dst, int width) {
size_t width_tmp = (size_t)(width);
asm volatile (
@@ -3306,19 +2526,19 @@ void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
"psrld $0x8,%%xmm1 \n"
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm2 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm3 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm3 \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
- "movdqa " MEMACCESS(1) ",%%xmm4 \n"
- "movdqa " MEMACCESS2(0x10,1) ",%%xmm5 \n"
+ "movdqu " MEMACCESS(1) ",%%xmm4 \n"
+ "movdqu " MEMACCESS2(0x10,1) ",%%xmm5 \n"
"pand %%xmm0,%%xmm2 \n"
"pand %%xmm0,%%xmm3 \n"
"pand %%xmm1,%%xmm4 \n"
"pand %%xmm1,%%xmm5 \n"
"por %%xmm4,%%xmm2 \n"
"por %%xmm5,%%xmm3 \n"
- "movdqa %%xmm2," MEMACCESS(1) " \n"
- "movdqa %%xmm3," MEMACCESS2(0x10,1) " \n"
+ "movdqu %%xmm2," MEMACCESS(1) " \n"
+ "movdqu %%xmm3," MEMACCESS2(0x10,1) " \n"
"lea " MEMLEA(0x20,1) ",%1 \n"
"sub $0x8,%2 \n"
"jg 1b \n"
@@ -3327,9 +2547,7 @@ void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
"+r"(width) // %2
:
: "memory", "cc"
-#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
);
}
#endif // HAS_ARGBCOPYALPHAROW_SSE2
@@ -3358,9 +2576,7 @@ void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
"+r"(width) // %2
:
: "memory", "cc"
-#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2"
-#endif
);
}
#endif // HAS_ARGBCOPYALPHAROW_AVX2
@@ -3380,16 +2596,16 @@ void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
"punpcklbw %%xmm2,%%xmm2 \n"
"punpckhwd %%xmm2,%%xmm3 \n"
"punpcklwd %%xmm2,%%xmm2 \n"
- "movdqa " MEMACCESS(1) ",%%xmm4 \n"
- "movdqa " MEMACCESS2(0x10,1) ",%%xmm5 \n"
+ "movdqu " MEMACCESS(1) ",%%xmm4 \n"
+ "movdqu " MEMACCESS2(0x10,1) ",%%xmm5 \n"
"pand %%xmm0,%%xmm2 \n"
"pand %%xmm0,%%xmm3 \n"
"pand %%xmm1,%%xmm4 \n"
"pand %%xmm1,%%xmm5 \n"
"por %%xmm4,%%xmm2 \n"
"por %%xmm5,%%xmm3 \n"
- "movdqa %%xmm2," MEMACCESS(1) " \n"
- "movdqa %%xmm3," MEMACCESS2(0x10,1) " \n"
+ "movdqu %%xmm2," MEMACCESS(1) " \n"
+ "movdqu %%xmm3," MEMACCESS2(0x10,1) " \n"
"lea " MEMLEA(0x20,1) ",%1 \n"
"sub $0x8,%2 \n"
"jg 1b \n"
@@ -3398,9 +2614,7 @@ void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
"+r"(width) // %2
:
: "memory", "cc"
-#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
);
}
#endif // HAS_ARGBCOPYYTOALPHAROW_SSE2
@@ -3431,18 +2645,16 @@ void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
"+r"(width) // %2
:
: "memory", "cc"
-#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2"
-#endif
);
}
#endif // HAS_ARGBCOPYYTOALPHAROW_AVX2
#ifdef HAS_SETROW_X86
-void SetRow_X86(uint8* dst, uint32 v32, int width) {
- size_t width_tmp = (size_t)(width);
+void SetRow_X86(uint8* dst, uint8 v8, int width) {
+ size_t width_tmp = (size_t)(width >> 2);
+ const uint32 v32 = v8 * 0x01010101; // Duplicate byte to all bytes.
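+ // e.g. v8 = 0x5a gives v32 = 0x5a5a5a5a, so each rep stosl stores 4 copies.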
asm volatile (
- "shr $0x2,%1 \n"
"rep stosl " MEMSTORESTRING(eax,0) " \n"
: "+D"(dst), // %0
"+c"(width_tmp) // %1
@@ -3450,19 +2662,24 @@ void SetRow_X86(uint8* dst, uint32 v32, int width) {
: "memory", "cc");
}
-void ARGBSetRows_X86(uint8* dst, uint32 v32, int width,
- int dst_stride, int height) {
- for (int y = 0; y < height; ++y) {
- size_t width_tmp = (size_t)(width);
- uint32* d = (uint32*)(dst);
- asm volatile (
- "rep stosl " MEMSTORESTRING(eax,0) " \n"
- : "+D"(d), // %0
- "+c"(width_tmp) // %1
- : "a"(v32) // %2
- : "memory", "cc");
- dst += dst_stride;
- }
+void SetRow_ERMS(uint8* dst, uint8 v8, int width) {
+ size_t width_tmp = (size_t)(width);
+ asm volatile (
+ "rep stosb " MEMSTORESTRING(al,0) " \n"
+ : "+D"(dst), // %0
+ "+c"(width_tmp) // %1
+ : "a"(v8) // %2
+ : "memory", "cc");
+}
+
+void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int width) {
+ size_t width_tmp = (size_t)(width);
+ asm volatile (
+ "rep stosl " MEMSTORESTRING(eax,0) " \n"
+ : "+D"(dst_argb), // %0
+ "+c"(width_tmp) // %1
+ : "a"(v32) // %2
+ : "memory", "cc");
}
#endif // HAS_SETROW_X86
@@ -3473,13 +2690,13 @@ void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) {
"psrlw $0x8,%%xmm5 \n"
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
"pand %%xmm5,%%xmm0 \n"
"pand %%xmm5,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x10,%2 \n"
"jg 1b \n"
@@ -3488,9 +2705,7 @@ void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) {
"+r"(pix) // %2
:
: "memory", "cc"
-#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm5"
-#endif
);
}
@@ -3502,11 +2717,10 @@ void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
"sub %1,%2 \n"
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- BUNDLEALIGN
- MEMOPREG(movdqa,0x00,0,4,1,xmm2) // movdqa (%0,%4,1),%%xmm2
- MEMOPREG(movdqa,0x10,0,4,1,xmm3) // movdqa 0x10(%0,%4,1),%%xmm3
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2
+ MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3
"lea " MEMLEA(0x20,0) ",%0 \n"
"pavgb %%xmm2,%%xmm0 \n"
"pavgb %%xmm3,%%xmm1 \n"
@@ -3519,7 +2733,6 @@ void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
"psrlw $0x8,%%xmm1 \n"
"packuswb %%xmm1,%%xmm1 \n"
"movq %%xmm0," MEMACCESS(1) " \n"
- BUNDLEALIGN
MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
"lea " MEMLEA(0x8,1) ",%1 \n"
"sub $0x10,%3 \n"
@@ -3529,13 +2742,8 @@ void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
"+r"(dst_v), // %2
"+r"(pix) // %3
: "r"((intptr_t)(stride_yuy2)) // %4
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-#endif
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
);
}
@@ -3547,8 +2755,8 @@ void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
"sub %1,%2 \n"
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
"psrlw $0x8,%%xmm0 \n"
"psrlw $0x8,%%xmm1 \n"
@@ -3559,7 +2767,6 @@ void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
"psrlw $0x8,%%xmm1 \n"
"packuswb %%xmm1,%%xmm1 \n"
"movq %%xmm0," MEMACCESS(1) " \n"
- BUNDLEALIGN
MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
"lea " MEMLEA(0x8,1) ",%1 \n"
"sub $0x10,%3 \n"
@@ -3569,47 +2776,36 @@ void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
"+r"(dst_v), // %2
"+r"(pix) // %3
:
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm5"
-#endif
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm5"
);
}
-void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
- uint8* dst_y, int pix) {
+void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) {
asm volatile (
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $0x8,%%xmm5 \n"
LABELALIGN
"1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
- "pand %%xmm5,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
- "sub $0x10,%2 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%2 \n"
"jg 1b \n"
- : "+r"(src_yuy2), // %0
+ : "+r"(src_uyvy), // %0
"+r"(dst_y), // %1
"+r"(pix) // %2
:
: "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm5"
-#endif
+ , "xmm0", "xmm1"
);
}
-void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2,
- int stride_yuy2,
- uint8* dst_u, uint8* dst_v, int pix) {
+void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix) {
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
@@ -3618,14 +2814,13 @@ void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2,
"1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- BUNDLEALIGN
MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2
MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3
"lea " MEMLEA(0x20,0) ",%0 \n"
"pavgb %%xmm2,%%xmm0 \n"
"pavgb %%xmm3,%%xmm1 \n"
- "psrlw $0x8,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
"movdqa %%xmm0,%%xmm1 \n"
"pand %%xmm5,%%xmm0 \n"
@@ -3633,28 +2828,22 @@ void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2,
"psrlw $0x8,%%xmm1 \n"
"packuswb %%xmm1,%%xmm1 \n"
"movq %%xmm0," MEMACCESS(1) " \n"
- BUNDLEALIGN
MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
"lea " MEMLEA(0x8,1) ",%1 \n"
"sub $0x10,%3 \n"
"jg 1b \n"
- : "+r"(src_yuy2), // %0
+ : "+r"(src_uyvy), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+r"(pix) // %3
- : "r"((intptr_t)(stride_yuy2)) // %4
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-#endif
+ : "r"((intptr_t)(stride_uyvy)) // %4
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
);
}
-void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2,
- uint8* dst_u, uint8* dst_v, int pix) {
+void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix) {
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
@@ -3664,8 +2853,8 @@ void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2,
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
- "psrlw $0x8,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
"movdqa %%xmm0,%%xmm1 \n"
"pand %%xmm5,%%xmm0 \n"
@@ -3673,247 +2862,226 @@ void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2,
"psrlw $0x8,%%xmm1 \n"
"packuswb %%xmm1,%%xmm1 \n"
"movq %%xmm0," MEMACCESS(1) " \n"
- BUNDLEALIGN
MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
"lea " MEMLEA(0x8,1) ",%1 \n"
"sub $0x10,%3 \n"
"jg 1b \n"
- : "+r"(src_yuy2), // %0
+ : "+r"(src_uyvy), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+r"(pix) // %3
:
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm5"
-#endif
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm5"
);
}
+#endif // HAS_YUY2TOYROW_SSE2
-void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) {
+#ifdef HAS_YUY2TOYROW_AVX2
+void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int pix) {
asm volatile (
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "psrlw $0x8,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "sub $0x10,%2 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x10,1) ",%1 \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
+ "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "sub $0x20,%2 \n"
"jg 1b \n"
- : "+r"(src_uyvy), // %0
+ "vzeroupper \n"
+ : "+r"(src_yuy2), // %0
"+r"(dst_y), // %1
"+r"(pix) // %2
:
: "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1"
-#endif
+ , "xmm0", "xmm1", "xmm5"
);
}
-void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
+void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
uint8* dst_u, uint8* dst_v, int pix) {
asm volatile (
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $0x8,%%xmm5 \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
"sub %1,%2 \n"
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- BUNDLEALIGN
- MEMOPREG(movdqa,0x00,0,4,1,xmm2) // movdqa (%0,%4,1),%%xmm2
- MEMOPREG(movdqa,0x10,0,4,1,xmm3) // movdqa 0x10(%0,%4,1),%%xmm3
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "pavgb %%xmm2,%%xmm0 \n"
- "pavgb %%xmm3,%%xmm1 \n"
- "pand %%xmm5,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "pand %%xmm5,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm1 \n"
- "movq %%xmm0," MEMACCESS(1) " \n"
- BUNDLEALIGN
- MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
- "lea " MEMLEA(0x8,1) ",%1 \n"
- "sub $0x10,%3 \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
+ "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
+ VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0
+ VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
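+ // The two vpavgb above average this row with the row one stride below,
+ // i.e. the vertical 2:1 chroma subsample for 4:2:0 output.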
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm1 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
+ VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x20,%3 \n"
"jg 1b \n"
- : "+r"(src_uyvy), // %0
+ "vzeroupper \n"
+ : "+r"(src_yuy2), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+r"(pix) // %3
- : "r"((intptr_t)(stride_uyvy)) // %4
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-#endif
+ : "r"((intptr_t)(stride_yuy2)) // %4
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm5"
);
}
-void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
+void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
uint8* dst_u, uint8* dst_v, int pix) {
asm volatile (
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $0x8,%%xmm5 \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
"sub %1,%2 \n"
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "pand %%xmm5,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "pand %%xmm5,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm1 \n"
- "movq %%xmm0," MEMACCESS(1) " \n"
- BUNDLEALIGN
- MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
- "lea " MEMLEA(0x8,1) ",%1 \n"
- "sub $0x10,%3 \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
+ "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm1 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
+ VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x20,%3 \n"
"jg 1b \n"
- : "+r"(src_uyvy), // %0
+ "vzeroupper \n"
+ : "+r"(src_yuy2), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+r"(pix) // %3
:
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm5"
-#endif
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm5"
);
}
-void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
- uint8* dst_y, int pix) {
+void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int pix) {
asm volatile (
LABELALIGN
"1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "psrlw $0x8,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "sub $0x10,%2 \n"
- "movdqu %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x10,1) ",%1 \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
+ "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "sub $0x20,%2 \n"
"jg 1b \n"
+ "vzeroupper \n"
: "+r"(src_uyvy), // %0
"+r"(dst_y), // %1
"+r"(pix) // %2
:
: "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1"
-#endif
+ , "xmm0", "xmm1", "xmm5"
);
}
-
-void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
- uint8* dst_u, uint8* dst_v, int pix) {
+void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix) {
asm volatile (
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $0x8,%%xmm5 \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
"sub %1,%2 \n"
+
LABELALIGN
"1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- BUNDLEALIGN
- MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2
- MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "pavgb %%xmm2,%%xmm0 \n"
- "pavgb %%xmm3,%%xmm1 \n"
- "pand %%xmm5,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "pand %%xmm5,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm1 \n"
- "movq %%xmm0," MEMACCESS(1) " \n"
- BUNDLEALIGN
- MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
- "lea " MEMLEA(0x8,1) ",%1 \n"
- "sub $0x10,%3 \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
+ "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
+ VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0
+ VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm1 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
+ VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x20,%3 \n"
"jg 1b \n"
+ "vzeroupper \n"
: "+r"(src_uyvy), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+r"(pix) // %3
: "r"((intptr_t)(stride_uyvy)) // %4
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-#endif
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm5"
);
}
-void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy,
- uint8* dst_u, uint8* dst_v, int pix) {
+void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix) {
asm volatile (
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $0x8,%%xmm5 \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
"sub %1,%2 \n"
LABELALIGN
"1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "pand %%xmm5,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "pand %%xmm5,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm1 \n"
- "movq %%xmm0," MEMACCESS(1) " \n"
- BUNDLEALIGN
- MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
- "lea " MEMLEA(0x8,1) ",%1 \n"
- "sub $0x10,%3 \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
+ "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpand %%ymm5,%%ymm0,%%ymm1 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm1,%%ymm1 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
+ VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x20,%3 \n"
"jg 1b \n"
+ "vzeroupper \n"
: "+r"(src_uyvy), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+r"(pix) // %3
:
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm5"
-#endif
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm5"
);
}
-#endif // HAS_YUY2TOYROW_SSE2
+#endif // HAS_YUY2TOYROW_AVX2
#ifdef HAS_ARGBBLENDROW_SSE2
// Blend 4 pixels at a time, with 1 pixel loops for the row edges.
@@ -3956,9 +3124,9 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
"paddusb %%xmm2,%%xmm0 \n"
"pand %%xmm5,%%xmm1 \n"
"paddusb %%xmm1,%%xmm0 \n"
- "sub $0x1,%3 \n"
"movd %%xmm0," MEMACCESS(2) " \n"
"lea " MEMLEA(0x4,2) ",%2 \n"
+ "sub $0x1,%3 \n"
"jge 10b \n"
"19: \n"
@@ -3988,9 +3156,9 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
"paddusb %%xmm2,%%xmm0 \n"
"pand %%xmm5,%%xmm1 \n"
"paddusb %%xmm1,%%xmm0 \n"
- "sub $0x4,%3 \n"
- "movdqa %%xmm0," MEMACCESS(2) " \n"
+ "movdqu %%xmm0," MEMACCESS(2) " \n"
"lea " MEMLEA(0x10,2) ",%2 \n"
+ "sub $0x4,%3 \n"
"jge 41b \n"
"49: \n"
@@ -4019,9 +3187,9 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
"paddusb %%xmm2,%%xmm0 \n"
"pand %%xmm5,%%xmm1 \n"
"paddusb %%xmm1,%%xmm0 \n"
- "sub $0x1,%3 \n"
"movd %%xmm0," MEMACCESS(2) " \n"
"lea " MEMLEA(0x4,2) ",%2 \n"
+ "sub $0x1,%3 \n"
"jge 91b \n"
"99: \n"
: "+r"(src_argb0), // %0
@@ -4030,9 +3198,7 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
"+r"(width) // %3
:
: "memory", "cc"
-#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-#endif
);
}
#endif // HAS_ARGBBLENDROW_SSE2
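// For reference, a plain-C sketch of the "over" blend these kernels compute
// (hypothetical name, in the spirit of libyuv's row_common.cc fallbacks;
// uint8/uint32 are libyuv's basic_types.h typedefs). src_argb0 is assumed to
// carry premultiplied color: dst = sat(src + bg * (256 - src_alpha) >> 8),
// with the result alpha forced opaque, matching the por/paddusb sequence.
static void ARGBBlendRow_C_Sketch(const uint8* src_argb0,
                                  const uint8* src_argb1,
                                  uint8* dst_argb, int width) {
  int i, j;
  for (i = 0; i < width; ++i) {
    uint32 a = src_argb0[3];     // foreground alpha
    uint32 scale = 256 - a;      // background weight
    for (j = 0; j < 3; ++j) {    // B, G, R
      uint32 v = src_argb0[j] + ((src_argb1[j] * scale) >> 8);
      dst_argb[j] = (uint8)(v > 255 ? 255 : v);  // saturate like paddusb
    }
    dst_argb[3] = 255;           // kernels force alpha opaque
    src_argb0 += 4;
    src_argb1 += 4;
    dst_argb += 4;
  }
}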
@@ -4091,49 +3257,18 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
"paddusb %%xmm2,%%xmm0 \n"
"pand %%xmm5,%%xmm1 \n"
"paddusb %%xmm1,%%xmm0 \n"
- "sub $0x1,%3 \n"
"movd %%xmm0," MEMACCESS(2) " \n"
"lea " MEMLEA(0x4,2) ",%2 \n"
+ "sub $0x1,%3 \n"
"jge 10b \n"
"19: \n"
"add $1-4,%3 \n"
"jl 49f \n"
- "test $0xf,%0 \n"
- "jne 41f \n"
- "test $0xf,%1 \n"
- "jne 41f \n"
// 4 pixel loop.
LABELALIGN
"40: \n"
- "movdqa " MEMACCESS(0) ",%%xmm3 \n"
- "lea " MEMLEA(0x10,0) ",%0 \n"
- "movdqa %%xmm3,%%xmm0 \n"
- "pxor %%xmm4,%%xmm3 \n"
- "movdqa " MEMACCESS(1) ",%%xmm2 \n"
- "pshufb %4,%%xmm3 \n"
- "pand %%xmm6,%%xmm2 \n"
- "paddw %%xmm7,%%xmm3 \n"
- "pmullw %%xmm3,%%xmm2 \n"
- "movdqa " MEMACCESS(1) ",%%xmm1 \n"
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "psrlw $0x8,%%xmm1 \n"
- "por %%xmm4,%%xmm0 \n"
- "pmullw %%xmm3,%%xmm1 \n"
- "psrlw $0x8,%%xmm2 \n"
- "paddusb %%xmm2,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "paddusb %%xmm1,%%xmm0 \n"
- "sub $0x4,%3 \n"
- "movdqa %%xmm0," MEMACCESS(2) " \n"
- "lea " MEMLEA(0x10,2) ",%2 \n"
- "jge 40b \n"
- "jmp 49f \n"
-
- // 4 pixel unaligned loop.
- LABELALIGN
- "41: \n"
"movdqu " MEMACCESS(0) ",%%xmm3 \n"
"lea " MEMLEA(0x10,0) ",%0 \n"
"movdqa %%xmm3,%%xmm0 \n"
@@ -4152,10 +3287,10 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
"paddusb %%xmm2,%%xmm0 \n"
"pand %%xmm5,%%xmm1 \n"
"paddusb %%xmm1,%%xmm0 \n"
- "sub $0x4,%3 \n"
- "movdqa %%xmm0," MEMACCESS(2) " \n"
+ "movdqu %%xmm0," MEMACCESS(2) " \n"
"lea " MEMLEA(0x10,2) ",%2 \n"
- "jge 41b \n"
+ "sub $0x4,%3 \n"
+ "jge 40b \n"
"49: \n"
"add $0x3,%3 \n"
@@ -4181,9 +3316,9 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
"paddusb %%xmm2,%%xmm0 \n"
"pand %%xmm5,%%xmm1 \n"
"paddusb %%xmm1,%%xmm0 \n"
- "sub $0x1,%3 \n"
"movd %%xmm0," MEMACCESS(2) " \n"
"lea " MEMLEA(0x4,2) ",%2 \n"
+ "sub $0x1,%3 \n"
"jge 91b \n"
"99: \n"
: "+r"(src_argb0), // %0
@@ -4192,16 +3327,13 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
"+r"(width) // %3
: "m"(kShuffleAlpha) // %4
: "memory", "cc"
-#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-#endif
);
}
#endif // HAS_ARGBBLENDROW_SSSE3
#ifdef HAS_ARGBATTENUATEROW_SSE2
// Attenuate 4 pixels at a time.
-// aligned to 16 bytes
void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
asm volatile (
"pcmpeqb %%xmm4,%%xmm4 \n"
@@ -4212,17 +3344,17 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
// 4 pixel loop.
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
"punpcklbw %%xmm0,%%xmm0 \n"
"pshufhw $0xff,%%xmm0,%%xmm2 \n"
"pshuflw $0xff,%%xmm2,%%xmm2 \n"
"pmulhuw %%xmm2,%%xmm0 \n"
- "movdqa " MEMACCESS(0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm1 \n"
"punpckhbw %%xmm1,%%xmm1 \n"
"pshufhw $0xff,%%xmm1,%%xmm2 \n"
"pshuflw $0xff,%%xmm2,%%xmm2 \n"
"pmulhuw %%xmm2,%%xmm1 \n"
- "movdqa " MEMACCESS(0) ",%%xmm2 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm2 \n"
"lea " MEMLEA(0x10,0) ",%0 \n"
"psrlw $0x8,%%xmm0 \n"
"pand %%xmm4,%%xmm2 \n"
@@ -4230,18 +3362,16 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
"packuswb %%xmm1,%%xmm0 \n"
"pand %%xmm5,%%xmm0 \n"
"por %%xmm2,%%xmm0 \n"
- "sub $0x4,%2 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x4,%2 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
:
: "memory", "cc"
-#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
);
}
#endif // HAS_ARGBATTENUATEROW_SSE2
@@ -4249,14 +3379,13 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
#ifdef HAS_ARGBATTENUATEROW_SSSE3
// Shuffle table duplicating alpha
static uvec8 kShuffleAlpha0 = {
- 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
+ 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u
};
static uvec8 kShuffleAlpha1 = {
11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
- 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
+ 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u
};
// Attenuate 4 pixels at a time.
-// aligned to 16 bytes
void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
asm volatile (
"pcmpeqb %%xmm3,%%xmm3 \n"
@@ -4284,9 +3413,9 @@ void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
"psrlw $0x8,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
"por %%xmm2,%%xmm0 \n"
- "sub $0x4,%2 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x4,%2 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
@@ -4294,16 +3423,56 @@ void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
: "m"(kShuffleAlpha0), // %3
"m"(kShuffleAlpha1) // %4
: "memory", "cc"
-#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
);
}
#endif // HAS_ARGBATTENUATEROW_SSSE3
+#ifdef HAS_ARGBATTENUATEROW_AVX2
+// Shuffle table duplicating alpha.
+static const uvec8 kShuffleAlpha_AVX2 = {
+ 6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u
+};
+// Attenuate 8 pixels at a time.
+void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
+ asm volatile (
+ "vbroadcastf128 %3,%%ymm4 \n"
+ "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+ "vpslld $0x18,%%ymm5,%%ymm5 \n"
+ "sub %0,%1 \n"
+
+ // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm6 \n"
+ "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n"
+ "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n"
+ "vpshufb %%ymm4,%%ymm0,%%ymm2 \n"
+ "vpshufb %%ymm4,%%ymm1,%%ymm3 \n"
+ "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vpand %%ymm5,%%ymm6,%%ymm6 \n"
+ "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpor %%ymm6,%%ymm0,%%ymm0 \n"
+ MEMOPMEM(vmovdqu,ymm0,0x00,0,1,1) // vmovdqu %%ymm0,(%0,%1)
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "m"(kShuffleAlpha_AVX2) // %3
+ : "memory", "cc"
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+ );
+}
+#endif // HAS_ARGBATTENUATEROW_AVX2
+
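// For reference, a plain-C sketch of the attenuate kernels (hypothetical
// name, in the spirit of libyuv's row_common.cc fallbacks; uint8/uint32 are
// libyuv's basic_types.h typedefs). Duplicating a byte into both halves of a
// 16-bit lane multiplies it by 0x101, so the SIMD math above is
// ((v * 0x101) * (a * 0x101)) >> 24, a close approximation of v * a / 255.
static void ARGBAttenuateRow_C_Sketch(const uint8* src_argb, uint8* dst_argb,
                                      int width) {
  int i;
  for (i = 0; i < width; ++i) {
    uint32 a = src_argb[3];
    dst_argb[0] = (uint8)(((src_argb[0] * 0x101) * (a * 0x101)) >> 24);  // B
    dst_argb[1] = (uint8)(((src_argb[1] * 0x101) * (a * 0x101)) >> 24);  // G
    dst_argb[2] = (uint8)(((src_argb[2] * 0x101) * (a * 0x101)) >> 24);  // R
    dst_argb[3] = (uint8)a;  // alpha is carried through unchanged
    src_argb += 4;
    dst_argb += 4;
  }
}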
#ifdef HAS_ARGBUNATTENUATEROW_SSE2
// Unattenuate 4 pixels at a time.
-// aligned to 16 bytes
void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
int width) {
uintptr_t alpha = 0;
@@ -4324,7 +3493,6 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
"movdqu " MEMACCESS(0) ",%%xmm1 \n"
"movzb " MEMACCESS2(0x0b,0) ",%3 \n"
"punpckhbw %%xmm1,%%xmm1 \n"
- BUNDLEALIGN
MEMOPREG(movd,0x00,4,3,4,xmm2) // movd 0x0(%4,%3,4),%%xmm2
"movzb " MEMACCESS2(0x0f,0) ",%3 \n"
MEMOPREG(movd,0x00,4,3,4,xmm3) // movd 0x0(%4,%3,4),%%xmm3
@@ -4334,26 +3502,90 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
"pmulhuw %%xmm2,%%xmm1 \n"
"lea " MEMLEA(0x10,0) ",%0 \n"
"packuswb %%xmm1,%%xmm0 \n"
- "sub $0x4,%2 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x4,%2 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width), // %2
"+r"(alpha) // %3
: "r"(fixed_invtbl8) // %4
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
);
}
#endif // HAS_ARGBUNATTENUATEROW_SSE2
+#ifdef HAS_ARGBUNATTENUATEROW_AVX2
+// Shuffle table duplicating alpha.
+static const uvec8 kUnattenShuffleAlpha_AVX2 = {
+ 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u
+};
+// Unattenuate 8 pixels at a time.
+void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
+ int width) {
+ uintptr_t alpha = 0;
+ asm volatile (
+ "sub %0,%1 \n"
+ "vbroadcastf128 %5,%%ymm5 \n"
+
+ // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ // replace VPGATHER
+ "movzb " MEMACCESS2(0x03,0) ",%3 \n"
+ MEMOPREG(vmovd,0x00,4,3,4,xmm0) // vmovd 0x0(%4,%3,4),%%xmm0
+ "movzb " MEMACCESS2(0x07,0) ",%3 \n"
+ MEMOPREG(vmovd,0x00,4,3,4,xmm1) // vmovd 0x0(%4,%3,4),%%xmm1
+ "movzb " MEMACCESS2(0x0b,0) ",%3 \n"
+ "vpunpckldq %%xmm1,%%xmm0,%%xmm6 \n"
+ MEMOPREG(vmovd,0x00,4,3,4,xmm2) // vmovd 0x0(%4,%3,4),%%xmm2
+ "movzb " MEMACCESS2(0x0f,0) ",%3 \n"
+ MEMOPREG(vmovd,0x00,4,3,4,xmm3) // vmovd 0x0(%4,%3,4),%%xmm3
+ "movzb " MEMACCESS2(0x13,0) ",%3 \n"
+ "vpunpckldq %%xmm3,%%xmm2,%%xmm7 \n"
+ MEMOPREG(vmovd,0x00,4,3,4,xmm0) // vmovd 0x0(%4,%3,4),%%xmm0
+ "movzb " MEMACCESS2(0x17,0) ",%3 \n"
+ MEMOPREG(vmovd,0x00,4,3,4,xmm1) // vmovd 0x0(%4,%3,4),%%xmm1
+ "movzb " MEMACCESS2(0x1b,0) ",%3 \n"
+ "vpunpckldq %%xmm1,%%xmm0,%%xmm0 \n"
+ MEMOPREG(vmovd,0x00,4,3,4,xmm2) // vmovd 0x0(%4,%3,4),%%xmm2
+ "movzb " MEMACCESS2(0x1f,0) ",%3 \n"
+ MEMOPREG(vmovd,0x00,4,3,4,xmm3) // vmovd 0x0(%4,%3,4),%%xmm3
+ "vpunpckldq %%xmm3,%%xmm2,%%xmm2 \n"
+ "vpunpcklqdq %%xmm7,%%xmm6,%%xmm3 \n"
+ "vpunpcklqdq %%xmm2,%%xmm0,%%xmm0 \n"
+ "vinserti128 $0x1,%%xmm0,%%ymm3,%%ymm3 \n"
+ // end of VPGATHER
+
+ "vmovdqu " MEMACCESS(0) ",%%ymm6 \n"
+ "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n"
+ "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n"
+ "vpunpcklwd %%ymm3,%%ymm3,%%ymm2 \n"
+ "vpunpckhwd %%ymm3,%%ymm3,%%ymm3 \n"
+ "vpshufb %%ymm5,%%ymm2,%%ymm2 \n"
+ "vpshufb %%ymm5,%%ymm3,%%ymm3 \n"
+ "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ MEMOPMEM(vmovdqu,ymm0,0x00,0,1,1) // vmovdqu %%ymm0,(%0,%1)
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width), // %2
+ "+r"(alpha) // %3
+ : "r"(fixed_invtbl8), // %4
+ "m"(kUnattenShuffleAlpha_AVX2) // %5
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ );
+}
+#endif // HAS_ARGBUNATTENUATEROW_AVX2
+
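// The inverse, again as a plain-C sketch (hypothetical name): undo the
// premultiplication with dst = min(255, v * 255 / a). The SIMD kernels get
// the same effect without a per-pixel divide by looking up a fixed-point
// reciprocal of the alpha in the fixed_invtbl8 table referenced above.
static void ARGBUnattenuateRow_C_Sketch(const uint8* src_argb,
                                        uint8* dst_argb, int width) {
  int i, j;
  for (i = 0; i < width; ++i) {
    uint32 a = src_argb[3];
    for (j = 0; j < 3; ++j) {  // B, G, R
      uint32 v = a ? (src_argb[j] * 255 / a) : src_argb[j];
      dst_argb[j] = (uint8)(v > 255 ? 255 : v);
    }
    dst_argb[3] = (uint8)a;
    src_argb += 4;
    dst_argb += 4;
  }
}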
#ifdef HAS_ARGBGRAYROW_SSSE3
// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels
void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
@@ -4364,16 +3596,16 @@ void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
// 8 pixel loop.
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"pmaddubsw %%xmm4,%%xmm0 \n"
"pmaddubsw %%xmm4,%%xmm1 \n"
"phaddw %%xmm1,%%xmm0 \n"
"paddw %%xmm5,%%xmm0 \n"
"psrlw $0x7,%%xmm0 \n"
"packuswb %%xmm0,%%xmm0 \n"
- "movdqa " MEMACCESS(0) ",%%xmm2 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm3 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm3 \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
"psrld $0x18,%%xmm2 \n"
"psrld $0x18,%%xmm3 \n"
@@ -4385,10 +3617,10 @@ void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
"movdqa %%xmm0,%%xmm1 \n"
"punpcklwd %%xmm3,%%xmm0 \n"
"punpckhwd %%xmm3,%%xmm1 \n"
- "sub $0x8,%2 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
- "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
"lea " MEMLEA(0x20,1) ",%1 \n"
+ "sub $0x8,%2 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
@@ -4396,9 +3628,7 @@ void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
: "m"(kARGBToYJ), // %3
"m"(kAddYJ64) // %4
: "memory", "cc"
-#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
);
}
#endif // HAS_ARGBGRAYROW_SSSE3
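// A plain-C sketch of the gray kernel (hypothetical name). The kARGBToYJ and
// kAddYJ64 tables it maddubs/rounds with live earlier in this file; assuming
// the usual full-range luma weights B=15, G=75, R=38 (which sum to 128), the
// per-pixel operation is:
static void ARGBGrayRow_C_Sketch(const uint8* src_argb, uint8* dst_argb,
                                 int width) {
  int i;
  for (i = 0; i < width; ++i) {
    uint8 y = (uint8)((src_argb[0] * 15 + src_argb[1] * 75 +
                       src_argb[2] * 38 + 64) >> 7);
    dst_argb[0] = dst_argb[1] = dst_argb[2] = y;  // replicate luma into B,G,R
    dst_argb[3] = src_argb[3];                    // alpha passes through
    src_argb += 4;
    dst_argb += 4;
  }
}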
@@ -4430,30 +3660,30 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
// 8 pixel loop.
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm6 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm6 \n"
"pmaddubsw %%xmm2,%%xmm0 \n"
"pmaddubsw %%xmm2,%%xmm6 \n"
"phaddw %%xmm6,%%xmm0 \n"
"psrlw $0x7,%%xmm0 \n"
"packuswb %%xmm0,%%xmm0 \n"
- "movdqa " MEMACCESS(0) ",%%xmm5 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm5 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"pmaddubsw %%xmm3,%%xmm5 \n"
"pmaddubsw %%xmm3,%%xmm1 \n"
"phaddw %%xmm1,%%xmm5 \n"
"psrlw $0x7,%%xmm5 \n"
"packuswb %%xmm5,%%xmm5 \n"
"punpcklbw %%xmm5,%%xmm0 \n"
- "movdqa " MEMACCESS(0) ",%%xmm5 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm5 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"pmaddubsw %%xmm4,%%xmm5 \n"
"pmaddubsw %%xmm4,%%xmm1 \n"
"phaddw %%xmm1,%%xmm5 \n"
"psrlw $0x7,%%xmm5 \n"
"packuswb %%xmm5,%%xmm5 \n"
- "movdqa " MEMACCESS(0) ",%%xmm6 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm6 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"psrld $0x18,%%xmm6 \n"
"psrld $0x18,%%xmm1 \n"
"packuswb %%xmm1,%%xmm6 \n"
@@ -4462,10 +3692,10 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
"movdqa %%xmm0,%%xmm1 \n"
"punpcklwd %%xmm5,%%xmm0 \n"
"punpckhwd %%xmm5,%%xmm1 \n"
- "sub $0x8,%1 \n"
- "movdqa %%xmm0," MEMACCESS(0) " \n"
- "movdqa %%xmm1," MEMACCESS2(0x10,0) " \n"
+ "movdqu %%xmm0," MEMACCESS(0) " \n"
+ "movdqu %%xmm1," MEMACCESS2(0x10,0) " \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
+ "sub $0x8,%1 \n"
"jg 1b \n"
: "+r"(dst_argb), // %0
"+r"(width) // %1
@@ -4473,9 +3703,7 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
"m"(kARGBToSepiaG), // %3
"m"(kARGBToSepiaR) // %4
: "memory", "cc"
-#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
-#endif
);
}
#endif // HAS_ARGBSEPIAROW_SSSE3
@@ -4495,12 +3723,12 @@ void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
// 8 pixel loop.
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm7 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm7 \n"
"pmaddubsw %%xmm2,%%xmm0 \n"
"pmaddubsw %%xmm2,%%xmm7 \n"
- "movdqa " MEMACCESS(0) ",%%xmm6 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm6 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"pmaddubsw %%xmm3,%%xmm6 \n"
"pmaddubsw %%xmm3,%%xmm1 \n"
"phaddsw %%xmm7,%%xmm0 \n"
@@ -4510,13 +3738,13 @@ void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
"packuswb %%xmm0,%%xmm0 \n"
"packuswb %%xmm6,%%xmm6 \n"
"punpcklbw %%xmm6,%%xmm0 \n"
- "movdqa " MEMACCESS(0) ",%%xmm1 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm7 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm7 \n"
"pmaddubsw %%xmm4,%%xmm1 \n"
"pmaddubsw %%xmm4,%%xmm7 \n"
"phaddsw %%xmm7,%%xmm1 \n"
- "movdqa " MEMACCESS(0) ",%%xmm6 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm7 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm6 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm7 \n"
"pmaddubsw %%xmm5,%%xmm6 \n"
"pmaddubsw %%xmm5,%%xmm7 \n"
"phaddsw %%xmm7,%%xmm6 \n"
@@ -4528,27 +3756,24 @@ void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
"movdqa %%xmm0,%%xmm6 \n"
"punpcklwd %%xmm1,%%xmm0 \n"
"punpckhwd %%xmm1,%%xmm6 \n"
- "sub $0x8,%2 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
- "movdqa %%xmm6," MEMACCESS2(0x10,1) " \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "movdqu %%xmm6," MEMACCESS2(0x10,1) " \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
"lea " MEMLEA(0x20,1) ",%1 \n"
+ "sub $0x8,%2 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
: "r"(matrix_argb) // %3
: "memory", "cc"
-#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-#endif
);
}
#endif // HAS_ARGBCOLORMATRIXROW_SSSE3
#ifdef HAS_ARGBQUANTIZEROW_SSE2
// Quantize 4 ARGB pixels (16 bytes).
-// aligned to 16 bytes
void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
int interval_offset, int width) {
asm volatile (
@@ -4568,23 +3793,23 @@ void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
// 4 pixel loop.
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
"punpcklbw %%xmm5,%%xmm0 \n"
"pmulhuw %%xmm2,%%xmm0 \n"
- "movdqa " MEMACCESS(0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm1 \n"
"punpckhbw %%xmm5,%%xmm1 \n"
"pmulhuw %%xmm2,%%xmm1 \n"
"pmullw %%xmm3,%%xmm0 \n"
- "movdqa " MEMACCESS(0) ",%%xmm7 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm7 \n"
"pmullw %%xmm3,%%xmm1 \n"
"pand %%xmm6,%%xmm7 \n"
"paddw %%xmm4,%%xmm0 \n"
"paddw %%xmm4,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
"por %%xmm7,%%xmm0 \n"
- "sub $0x4,%1 \n"
- "movdqa %%xmm0," MEMACCESS(0) " \n"
+ "movdqu %%xmm0," MEMACCESS(0) " \n"
"lea " MEMLEA(0x10,0) ",%0 \n"
+ "sub $0x4,%1 \n"
"jg 1b \n"
: "+r"(dst_argb), // %0
"+r"(width) // %1
@@ -4592,16 +3817,13 @@ void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
"r"(interval_size), // %3
"r"(interval_offset) // %4
: "memory", "cc"
-#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-#endif
);
}
#endif // HAS_ARGBQUANTIZEROW_SSE2
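// A plain-C sketch of the quantize kernel (hypothetical name): posterize each
// color channel in place into evenly spaced bands, leaving alpha untouched.
// scale is a caller-supplied 16.16 factor, typically about
// 65536 / interval_size, which is what the pmulhuw above consumes.
static void ARGBQuantizeRow_C_Sketch(uint8* dst_argb, int scale,
                                     int interval_size, int interval_offset,
                                     int width) {
  int i, j;
  for (i = 0; i < width; ++i) {
    for (j = 0; j < 3; ++j) {  // B, G, R; dst_argb[3] (alpha) is preserved
      dst_argb[j] = (uint8)((dst_argb[j] * scale >> 16) * interval_size +
                            interval_offset);
    }
    dst_argb += 4;
  }
}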
#ifdef HAS_ARGBSHADEROW_SSE2
// Shade 4 pixels at a time by specified value.
-// Aligned to 16 bytes.
void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
uint32 value) {
asm volatile (
@@ -4612,7 +3834,7 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
// 4 pixel loop.
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
"lea " MEMLEA(0x10,0) ",%0 \n"
"movdqa %%xmm0,%%xmm1 \n"
"punpcklbw %%xmm0,%%xmm0 \n"
@@ -4622,18 +3844,16 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
"psrlw $0x8,%%xmm0 \n"
"psrlw $0x8,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
- "sub $0x4,%2 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x4,%2 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
: "r"(value) // %3
: "memory", "cc"
-#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2"
-#endif
);
}
#endif // HAS_ARGBSHADEROW_SSE2
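// A plain-C sketch of the shade kernel (hypothetical name): each channel is
// scaled by the matching channel of the packed ARGB 'value'. As in the
// attenuate kernel, byte duplication is a multiply by 0x101, so the stored
// result is ((v * 0x101) * (s * 0x101)) >> 24, roughly v * s / 255.
static void ARGBShadeRow_C_Sketch(const uint8* src_argb, uint8* dst_argb,
                                  int width, uint32 value) {
  int i, j;
  for (i = 0; i < width; ++i) {
    for (j = 0; j < 4; ++j) {  // B, G, R, A
      uint32 s = (value >> (j * 8)) & 0xff;
      dst_argb[j] = (uint8)(((src_argb[j] * 0x101) * (s * 0x101)) >> 24);
    }
    src_argb += 4;
    dst_argb += 4;
  }
}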
@@ -4643,7 +3863,7 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width) {
asm volatile (
- "pxor %%xmm5,%%xmm5 \n"
+ "pxor %%xmm5,%%xmm5 \n"
// 4 pixel loop.
LABELALIGN
@@ -4661,9 +3881,9 @@ void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
"pmulhuw %%xmm2,%%xmm0 \n"
"pmulhuw %%xmm3,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
- "sub $0x4,%3 \n"
"movdqu %%xmm0," MEMACCESS(2) " \n"
"lea " MEMLEA(0x10,2) ",%2 \n"
+ "sub $0x4,%3 \n"
"jg 1b \n"
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
@@ -4671,13 +3891,50 @@ void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
"+r"(width) // %3
:
: "memory", "cc"
-#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-#endif
);
}
#endif // HAS_ARGBMULTIPLYROW_SSE2
+#ifdef HAS_ARGBMULTIPLYROW_AVX2
+// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
+void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ asm volatile (
+ "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
+
+  // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm1 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "vmovdqu " MEMACCESS(1) ",%%ymm3 \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "vpunpcklbw %%ymm1,%%ymm1,%%ymm0 \n"
+ "vpunpckhbw %%ymm1,%%ymm1,%%ymm1 \n"
+ "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n"
+ "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n"
+ "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vmovdqu %%ymm0," MEMACCESS(2) " \n"
+ "lea " MEMLEA(0x20,2) ",%2 \n"
+ "sub $0x8,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc"
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+ );
+}
+#endif // HAS_ARGBMULTIPLYROW_AVX2
+
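// A plain-C sketch of what the multiply kernels compute (hypothetical name).
// The unpack-with-self above turns each src0 byte into a 16-bit v * 0x101,
// so the stored result is (v * 0x101 * w) >> 16, a close approximation of
// v * w / 255 (255 * 255 maps to 254).
static void ARGBMultiplyRow_C_Sketch(const uint8* src_argb0,
                                     const uint8* src_argb1,
                                     uint8* dst_argb, int width) {
  int i;
  for (i = 0; i < width * 4; ++i) {  // ARGB is 4 bytes per pixel
    dst_argb[i] = (uint8)((src_argb0[i] * 0x101 * src_argb1[i]) >> 16);
  }
}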
#ifdef HAS_ARGBADDROW_SSE2
// Add 2 rows of ARGB pixels together, 4 pixels at a time.
void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
@@ -4691,9 +3948,9 @@ void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
"movdqu " MEMACCESS(1) ",%%xmm1 \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
"paddusb %%xmm1,%%xmm0 \n"
- "sub $0x4,%3 \n"
"movdqu %%xmm0," MEMACCESS(2) " \n"
"lea " MEMLEA(0x10,2) ",%2 \n"
+ "sub $0x4,%3 \n"
"jg 1b \n"
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
@@ -4701,13 +3958,39 @@ void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
"+r"(width) // %3
:
: "memory", "cc"
-#if defined(__SSE2__)
, "xmm0", "xmm1"
-#endif
);
}
#endif // HAS_ARGBADDROW_SSE2
+#ifdef HAS_ARGBADDROW_AVX2
+// Add 2 rows of ARGB pixels together, 8 pixels at a time.
+void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ asm volatile (
+  // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "vpaddusb " MEMACCESS(1) ",%%ymm0,%%ymm0 \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "vmovdqu %%ymm0," MEMACCESS(2) " \n"
+ "lea " MEMLEA(0x20,2) ",%2 \n"
+ "sub $0x8,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc"
+ , "xmm0"
+ );
+}
+#endif // HAS_ARGBADDROW_AVX2
+
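// A plain-C sketch of the saturating add done by paddusb/vpaddusb above
// (hypothetical name):
static void ARGBAddRow_C_Sketch(const uint8* src_argb0,
                                const uint8* src_argb1,
                                uint8* dst_argb, int width) {
  int i;
  for (i = 0; i < width * 4; ++i) {
    int sum = src_argb0[i] + src_argb1[i];
    dst_argb[i] = (uint8)(sum > 255 ? 255 : sum);  // clamp high, like paddusb
  }
}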
#ifdef HAS_ARGBSUBTRACTROW_SSE2
// Subtract 2 rows of ARGB pixels, 4 pixels at a time.
void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
@@ -4721,9 +4004,9 @@ void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
"movdqu " MEMACCESS(1) ",%%xmm1 \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
"psubusb %%xmm1,%%xmm0 \n"
- "sub $0x4,%3 \n"
"movdqu %%xmm0," MEMACCESS(2) " \n"
"lea " MEMLEA(0x10,2) ",%2 \n"
+ "sub $0x4,%3 \n"
"jg 1b \n"
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
@@ -4731,13 +4014,39 @@ void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
"+r"(width) // %3
:
: "memory", "cc"
-#if defined(__SSE2__)
, "xmm0", "xmm1"
-#endif
);
}
#endif // HAS_ARGBSUBTRACTROW_SSE2
+#ifdef HAS_ARGBSUBTRACTROW_AVX2
+// Subtract 2 rows of ARGB pixels, 8 pixels at a time.
+void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ asm volatile (
+  // 8 pixel loop.
+ LABELALIGN
+ "1: \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "vpsubusb " MEMACCESS(1) ",%%ymm0,%%ymm0 \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "vmovdqu %%ymm0," MEMACCESS(2) " \n"
+ "lea " MEMLEA(0x20,2) ",%2 \n"
+ "sub $0x8,%3 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc"
+ , "xmm0"
+ );
+}
+#endif // HAS_ARGBSUBTRACTROW_AVX2
+
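// And the matching saturating subtract (psubusb/vpsubusb), as a plain-C
// sketch with a hypothetical name:
static void ARGBSubtractRow_C_Sketch(const uint8* src_argb0,
                                     const uint8* src_argb1,
                                     uint8* dst_argb, int width) {
  int i;
  for (i = 0; i < width * 4; ++i) {
    int diff = src_argb0[i] - src_argb1[i];
    dst_argb[i] = (uint8)(diff < 0 ? 0 : diff);  // clamp low, like psubusb
  }
}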
#ifdef HAS_SOBELXROW_SSE2
// SobelX as a matrix is
// -1 0 1
@@ -4759,13 +4068,11 @@ void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
"punpcklbw %%xmm5,%%xmm0 \n"
"punpcklbw %%xmm5,%%xmm1 \n"
"psubw %%xmm1,%%xmm0 \n"
- BUNDLEALIGN
MEMOPREG(movq,0x00,0,1,1,xmm1) // movq (%0,%1,1),%%xmm1
MEMOPREG(movq,0x02,0,1,1,xmm2) // movq 0x2(%0,%1,1),%%xmm2
"punpcklbw %%xmm5,%%xmm1 \n"
"punpcklbw %%xmm5,%%xmm2 \n"
"psubw %%xmm2,%%xmm1 \n"
- BUNDLEALIGN
MEMOPREG(movq,0x00,0,2,1,xmm2) // movq (%0,%2,1),%%xmm2
MEMOPREG(movq,0x02,0,2,1,xmm3) // movq 0x2(%0,%2,1),%%xmm3
"punpcklbw %%xmm5,%%xmm2 \n"
@@ -4778,10 +4085,9 @@ void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
"psubw %%xmm0,%%xmm1 \n"
"pmaxsw %%xmm1,%%xmm0 \n"
"packuswb %%xmm0,%%xmm0 \n"
- "sub $0x8,%4 \n"
- BUNDLEALIGN
MEMOPMEM(movq,xmm0,0x00,0,3,1) // movq %%xmm0,(%0,%3,1)
"lea " MEMLEA(0x8,0) ",%0 \n"
+ "sub $0x8,%4 \n"
"jg 1b \n"
: "+r"(src_y0), // %0
"+r"(src_y1), // %1
@@ -4789,13 +4095,8 @@ void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
"+r"(dst_sobelx), // %3
"+r"(width) // %4
:
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-#endif
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
);
}
#endif // HAS_SOBELXROW_SSE2
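// A plain-C sketch of the SobelX kernel (hypothetical name): the 3x3
// horizontal-gradient matrix (-1 0 1 / -2 0 2 / -1 0 1) applied across the
// three input rows, stored as a saturated absolute value. Like the SIMD
// version it reads two bytes past 'width', which callers allow for.
static void SobelXRow_C_Sketch(const uint8* src_y0, const uint8* src_y1,
                               const uint8* src_y2, uint8* dst_sobelx,
                               int width) {
  int i;
  for (i = 0; i < width; ++i) {
    int a = src_y0[i] - src_y0[i + 2];
    int b = src_y1[i] - src_y1[i + 2];
    int c = src_y2[i] - src_y2[i + 2];
    int sobel = a + b + b + c;       // column weights 1, 2, 1
    if (sobel < 0) sobel = -sobel;   // pmaxsw(x, -x) is abs()
    dst_sobelx[i] = (uint8)(sobel > 255 ? 255 : sobel);
  }
}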
@@ -4820,13 +4121,11 @@ void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
"punpcklbw %%xmm5,%%xmm0 \n"
"punpcklbw %%xmm5,%%xmm1 \n"
"psubw %%xmm1,%%xmm0 \n"
- BUNDLEALIGN
"movq " MEMACCESS2(0x1,0) ",%%xmm1 \n"
MEMOPREG(movq,0x01,0,1,1,xmm2) // movq 0x1(%0,%1,1),%%xmm2
"punpcklbw %%xmm5,%%xmm1 \n"
"punpcklbw %%xmm5,%%xmm2 \n"
"psubw %%xmm2,%%xmm1 \n"
- BUNDLEALIGN
"movq " MEMACCESS2(0x2,0) ",%%xmm2 \n"
MEMOPREG(movq,0x02,0,1,1,xmm3) // movq 0x2(%0,%1,1),%%xmm3
"punpcklbw %%xmm5,%%xmm2 \n"
@@ -4839,23 +4138,17 @@ void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
"psubw %%xmm0,%%xmm1 \n"
"pmaxsw %%xmm1,%%xmm0 \n"
"packuswb %%xmm0,%%xmm0 \n"
- "sub $0x8,%3 \n"
- BUNDLEALIGN
MEMOPMEM(movq,xmm0,0x00,0,2,1) // movq %%xmm0,(%0,%2,1)
"lea " MEMLEA(0x8,0) ",%0 \n"
+ "sub $0x8,%3 \n"
"jg 1b \n"
: "+r"(src_y0), // %0
"+r"(src_y1), // %1
"+r"(dst_sobely), // %2
"+r"(width) // %3
:
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-#endif
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
);
}
#endif // HAS_SOBELYROW_SSE2
@@ -4876,8 +4169,8 @@ void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
// 8 pixel loop.
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- MEMOPREG(movdqa,0x00,0,1,1,xmm1) // movdqa (%0,%1,1),%%xmm1
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1
"lea " MEMLEA(0x10,0) ",%0 \n"
"paddusb %%xmm1,%%xmm0 \n"
"movdqa %%xmm0,%%xmm2 \n"
@@ -4893,25 +4186,20 @@ void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
"punpckhwd %%xmm0,%%xmm0 \n"
"por %%xmm5,%%xmm3 \n"
"por %%xmm5,%%xmm0 \n"
- "sub $0x10,%3 \n"
- "movdqa %%xmm1," MEMACCESS(2) " \n"
- "movdqa %%xmm2," MEMACCESS2(0x10,2) " \n"
- "movdqa %%xmm3," MEMACCESS2(0x20,2) " \n"
- "movdqa %%xmm0," MEMACCESS2(0x30,2) " \n"
+ "movdqu %%xmm1," MEMACCESS(2) " \n"
+ "movdqu %%xmm2," MEMACCESS2(0x10,2) " \n"
+ "movdqu %%xmm3," MEMACCESS2(0x20,2) " \n"
+ "movdqu %%xmm0," MEMACCESS2(0x30,2) " \n"
"lea " MEMLEA(0x40,2) ",%2 \n"
+ "sub $0x10,%3 \n"
"jg 1b \n"
: "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1
"+r"(dst_argb), // %2
"+r"(width) // %3
:
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-#endif
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
);
}
#endif // HAS_SOBELROW_SSE2
@@ -4928,26 +4216,21 @@ void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
// 8 pixel loop.
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- MEMOPREG(movdqa,0x00,0,1,1,xmm1) // movdqa (%0,%1,1),%%xmm1
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1
"lea " MEMLEA(0x10,0) ",%0 \n"
"paddusb %%xmm1,%%xmm0 \n"
- "sub $0x10,%3 \n"
- "movdqa %%xmm0," MEMACCESS(2) " \n"
+ "movdqu %%xmm0," MEMACCESS(2) " \n"
"lea " MEMLEA(0x10,2) ",%2 \n"
+ "sub $0x10,%3 \n"
"jg 1b \n"
: "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1
"+r"(dst_y), // %2
"+r"(width) // %3
:
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1"
-#endif
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1"
);
}
#endif // HAS_SOBELTOPLANEROW_SSE2
@@ -4967,8 +4250,8 @@ void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
// 8 pixel loop.
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- MEMOPREG(movdqa,0x00,0,1,1,xmm1) // movdqa (%0,%1,1),%%xmm1
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1
"lea " MEMLEA(0x10,0) ",%0 \n"
"movdqa %%xmm0,%%xmm2 \n"
"paddusb %%xmm1,%%xmm2 \n"
@@ -4984,25 +4267,20 @@ void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
"movdqa %%xmm1,%%xmm7 \n"
"punpcklwd %%xmm0,%%xmm7 \n"
"punpckhwd %%xmm0,%%xmm1 \n"
- "sub $0x10,%3 \n"
- "movdqa %%xmm6," MEMACCESS(2) " \n"
- "movdqa %%xmm4," MEMACCESS2(0x10,2) " \n"
- "movdqa %%xmm7," MEMACCESS2(0x20,2) " \n"
- "movdqa %%xmm1," MEMACCESS2(0x30,2) " \n"
+ "movdqu %%xmm6," MEMACCESS(2) " \n"
+ "movdqu %%xmm4," MEMACCESS2(0x10,2) " \n"
+ "movdqu %%xmm7," MEMACCESS2(0x20,2) " \n"
+ "movdqu %%xmm1," MEMACCESS2(0x30,2) " \n"
"lea " MEMLEA(0x40,2) ",%2 \n"
+ "sub $0x10,%3 \n"
"jg 1b \n"
: "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1
"+r"(dst_argb), // %2
"+r"(width) // %3
:
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-#endif
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
);
}
#endif // HAS_SOBELXYROW_SSE2
@@ -5035,22 +4313,22 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
"punpcklwd %%xmm1,%%xmm4 \n"
"punpckhwd %%xmm1,%%xmm5 \n"
"paddd %%xmm2,%%xmm0 \n"
- "movdqa " MEMACCESS(2) ",%%xmm2 \n"
+ "movdqu " MEMACCESS(2) ",%%xmm2 \n"
"paddd %%xmm0,%%xmm2 \n"
"paddd %%xmm3,%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,2) ",%%xmm3 \n"
+ "movdqu " MEMACCESS2(0x10,2) ",%%xmm3 \n"
"paddd %%xmm0,%%xmm3 \n"
"paddd %%xmm4,%%xmm0 \n"
- "movdqa " MEMACCESS2(0x20,2) ",%%xmm4 \n"
+ "movdqu " MEMACCESS2(0x20,2) ",%%xmm4 \n"
"paddd %%xmm0,%%xmm4 \n"
"paddd %%xmm5,%%xmm0 \n"
- "movdqa " MEMACCESS2(0x30,2) ",%%xmm5 \n"
+ "movdqu " MEMACCESS2(0x30,2) ",%%xmm5 \n"
"lea " MEMLEA(0x40,2) ",%2 \n"
"paddd %%xmm0,%%xmm5 \n"
- "movdqa %%xmm2," MEMACCESS(1) " \n"
- "movdqa %%xmm3," MEMACCESS2(0x10,1) " \n"
- "movdqa %%xmm4," MEMACCESS2(0x20,1) " \n"
- "movdqa %%xmm5," MEMACCESS2(0x30,1) " \n"
+ "movdqu %%xmm2," MEMACCESS(1) " \n"
+ "movdqu %%xmm3," MEMACCESS2(0x10,1) " \n"
+ "movdqu %%xmm4," MEMACCESS2(0x20,1) " \n"
+ "movdqu %%xmm5," MEMACCESS2(0x30,1) " \n"
"lea " MEMLEA(0x40,1) ",%1 \n"
"sub $0x4,%3 \n"
"jge 40b \n"
@@ -5082,9 +4360,7 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
"+r"(width) // %3
:
: "memory", "cc"
-#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
);
}
#endif // HAS_COMPUTECUMULATIVESUMROW_SSE2
@@ -5115,11 +4391,10 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
  // 4 pixel small loop
LABELALIGN
"4: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n"
- BUNDLEALIGN
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0
MEMOPREG(psubd,0x10,0,4,4,xmm1) // psubd 0x10(%0,%4,4),%%xmm1
MEMOPREG(psubd,0x20,0,4,4,xmm2) // psubd 0x20(%0,%4,4),%%xmm2
@@ -5129,7 +4404,6 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
"psubd " MEMACCESS2(0x10,1) ",%%xmm1 \n"
"psubd " MEMACCESS2(0x20,1) ",%%xmm2 \n"
"psubd " MEMACCESS2(0x30,1) ",%%xmm3 \n"
- BUNDLEALIGN
MEMOPREG(paddd,0x00,1,4,4,xmm0) // paddd 0x00(%1,%4,4),%%xmm0
MEMOPREG(paddd,0x10,1,4,4,xmm1) // paddd 0x10(%1,%4,4),%%xmm1
MEMOPREG(paddd,0x20,1,4,4,xmm2) // paddd 0x20(%1,%4,4),%%xmm2
@@ -5149,11 +4423,10 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
  // 4 pixel loop
LABELALIGN
"40: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
- "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n"
- BUNDLEALIGN
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0
MEMOPREG(psubd,0x10,0,4,4,xmm1) // psubd 0x10(%0,%4,4),%%xmm1
MEMOPREG(psubd,0x20,0,4,4,xmm2) // psubd 0x20(%0,%4,4),%%xmm2
@@ -5163,7 +4436,6 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
"psubd " MEMACCESS2(0x10,1) ",%%xmm1 \n"
"psubd " MEMACCESS2(0x20,1) ",%%xmm2 \n"
"psubd " MEMACCESS2(0x30,1) ",%%xmm3 \n"
- BUNDLEALIGN
MEMOPREG(paddd,0x00,1,4,4,xmm0) // paddd 0x00(%1,%4,4),%%xmm0
MEMOPREG(paddd,0x10,1,4,4,xmm1) // paddd 0x10(%1,%4,4),%%xmm1
MEMOPREG(paddd,0x20,1,4,4,xmm2) // paddd 0x20(%1,%4,4),%%xmm2
@@ -5196,11 +4468,10 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
  // 1 pixel loop
LABELALIGN
"10: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0
"lea " MEMLEA(0x10,0) ",%0 \n"
"psubd " MEMACCESS(1) ",%%xmm0 \n"
- BUNDLEALIGN
MEMOPREG(paddd,0x00,1,4,4,xmm0) // paddd 0x00(%1,%4,4),%%xmm0
"lea " MEMLEA(0x10,1) ",%1 \n"
"cvtdq2ps %%xmm0,%%xmm0 \n"
@@ -5219,13 +4490,8 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
"+rm"(count) // %3
: "r"((intptr_t)(width)), // %4
"rm"(area) // %5
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
-#endif
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
);
}
#endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
@@ -5268,7 +4534,6 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
"pshufd $0x39,%%xmm0,%%xmm0 \n"
"movd %%xmm0,%k5 \n"
"pshufd $0x39,%%xmm0,%%xmm0 \n"
- BUNDLEALIGN
MEMOPREG(movd,0x00,0,1,1,xmm1) // movd (%0,%1,1),%%xmm1
MEMOPREG(movd,0x00,0,5,1,xmm6) // movd (%0,%5,1),%%xmm6
"punpckldq %%xmm6,%%xmm1 \n"
@@ -5277,14 +4542,13 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
"movd %%xmm0,%k1 \n"
"pshufd $0x39,%%xmm0,%%xmm0 \n"
"movd %%xmm0,%k5 \n"
- BUNDLEALIGN
MEMOPREG(movd,0x00,0,1,1,xmm0) // movd (%0,%1,1),%%xmm0
MEMOPREG(movd,0x00,0,5,1,xmm6) // movd (%0,%5,1),%%xmm6
"punpckldq %%xmm6,%%xmm0 \n"
"addps %%xmm4,%%xmm3 \n"
- "sub $0x4,%4 \n"
"movq %%xmm0," MEMACCESS2(0x08,2) " \n"
"lea " MEMLEA(0x10,2) ",%2 \n"
+ "sub $0x4,%4 \n"
"jge 40b \n"
"49: \n"
@@ -5299,11 +4563,10 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
"pmaddwd %%xmm5,%%xmm0 \n"
"addps %%xmm7,%%xmm2 \n"
"movd %%xmm0,%k1 \n"
- BUNDLEALIGN
MEMOPREG(movd,0x00,0,1,1,xmm0) // movd (%0,%1,1),%%xmm0
- "sub $0x1,%4 \n"
"movd %%xmm0," MEMACCESS(2) " \n"
"lea " MEMLEA(0x04,2) ",%2 \n"
+ "sub $0x1,%4 \n"
"jge 10b \n"
"19: \n"
: "+r"(src_argb), // %0
@@ -5313,13 +4576,8 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
"+rm"(width), // %4
"+r"(temp) // %5
:
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-#endif
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
);
}
#endif // HAS_ARGBAFFINEROW_SSE2
@@ -5352,8 +4610,8 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
// General purpose row blend.
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(1) ",%%xmm0 \n"
- MEMOPREG(movdqa,0x00,1,4,1,xmm2)
+ "movdqu " MEMACCESS(1) ",%%xmm0 \n"
+ MEMOPREG(movdqu,0x00,1,4,1,xmm2)
"movdqa %%xmm0,%%xmm1 \n"
"punpcklbw %%xmm2,%%xmm0 \n"
"punpckhbw %%xmm2,%%xmm1 \n"
@@ -5362,61 +4620,57 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
"psrlw $0x7,%%xmm0 \n"
"psrlw $0x7,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
- "sub $0x10,%2 \n"
- BUNDLEALIGN
- MEMOPMEM(movdqa,xmm0,0x00,1,0,1)
+ MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
"lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%2 \n"
"jg 1b \n"
"jmp 99f \n"
// Blend 25 / 75.
LABELALIGN
"25: \n"
- "movdqa " MEMACCESS(1) ",%%xmm0 \n"
- MEMOPREG(movdqa,0x00,1,4,1,xmm1)
+ "movdqu " MEMACCESS(1) ",%%xmm0 \n"
+ MEMOPREG(movdqu,0x00,1,4,1,xmm1)
"pavgb %%xmm1,%%xmm0 \n"
"pavgb %%xmm1,%%xmm0 \n"
- "sub $0x10,%2 \n"
- BUNDLEALIGN
- MEMOPMEM(movdqa,xmm0,0x00,1,0,1)
+ MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
"lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%2 \n"
"jg 25b \n"
"jmp 99f \n"
// Blend 50 / 50.
LABELALIGN
"50: \n"
- "movdqa " MEMACCESS(1) ",%%xmm0 \n"
- MEMOPREG(movdqa,0x00,1,4,1,xmm1)
+ "movdqu " MEMACCESS(1) ",%%xmm0 \n"
+ MEMOPREG(movdqu,0x00,1,4,1,xmm1)
"pavgb %%xmm1,%%xmm0 \n"
- "sub $0x10,%2 \n"
- BUNDLEALIGN
- MEMOPMEM(movdqa,xmm0,0x00,1,0,1)
+ MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
"lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%2 \n"
"jg 50b \n"
"jmp 99f \n"
// Blend 75 / 25.
LABELALIGN
"75: \n"
- "movdqa " MEMACCESS(1) ",%%xmm1 \n"
- MEMOPREG(movdqa,0x00,1,4,1,xmm0)
+ "movdqu " MEMACCESS(1) ",%%xmm1 \n"
+ MEMOPREG(movdqu,0x00,1,4,1,xmm0)
"pavgb %%xmm1,%%xmm0 \n"
"pavgb %%xmm1,%%xmm0 \n"
- "sub $0x10,%2 \n"
- BUNDLEALIGN
- MEMOPMEM(movdqa,xmm0,0x00,1,0,1)
+ MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
"lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%2 \n"
"jg 75b \n"
"jmp 99f \n"
// Blend 100 / 0 - Copy row unchanged.
LABELALIGN
"100: \n"
- "movdqa " MEMACCESS(1) ",%%xmm0 \n"
- "sub $0x10,%2 \n"
- MEMOPMEM(movdqa,xmm0,0x00,1,0,1)
+ "movdqu " MEMACCESS(1) ",%%xmm0 \n"
+ MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
"lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%2 \n"
"jg 100b \n"
"99: \n"
@@ -5425,147 +4679,22 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
"+r"(dst_width), // %2
"+r"(source_y_fraction) // %3
: "r"((intptr_t)(src_stride)) // %4
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm5"
-#endif
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm5"
);
}
#endif // HAS_INTERPOLATEROW_SSSE3
-#ifdef HAS_INTERPOLATEROW_SSE2
-// Bilinear filter 16x2 -> 16x1
-void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
+#ifdef HAS_INTERPOLATEROW_AVX2
+// Bilinear filter 32x2 -> 32x1
+void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
ptrdiff_t src_stride, int dst_width,
int source_y_fraction) {
asm volatile (
- "sub %1,%0 \n"
"shr %3 \n"
"cmp $0x0,%3 \n"
"je 100f \n"
- "cmp $0x20,%3 \n"
- "je 75f \n"
- "cmp $0x40,%3 \n"
- "je 50f \n"
- "cmp $0x60,%3 \n"
- "je 25f \n"
-
- "movd %3,%%xmm0 \n"
- "neg %3 \n"
- "add $0x80,%3 \n"
- "movd %3,%%xmm5 \n"
- "punpcklbw %%xmm0,%%xmm5 \n"
- "punpcklwd %%xmm5,%%xmm5 \n"
- "pshufd $0x0,%%xmm5,%%xmm5 \n"
- "pxor %%xmm4,%%xmm4 \n"
-
- // General purpose row blend.
- LABELALIGN
- "1: \n"
- "movdqa " MEMACCESS(1) ",%%xmm0 \n"
- MEMOPREG(movdqa,0x00,1,4,1,xmm2) // movdqa (%1,%4,1),%%xmm2
- "movdqa %%xmm0,%%xmm1 \n"
- "movdqa %%xmm2,%%xmm3 \n"
- "punpcklbw %%xmm4,%%xmm2 \n"
- "punpckhbw %%xmm4,%%xmm3 \n"
- "punpcklbw %%xmm4,%%xmm0 \n"
- "punpckhbw %%xmm4,%%xmm1 \n"
- "psubw %%xmm0,%%xmm2 \n"
- "psubw %%xmm1,%%xmm3 \n"
- "paddw %%xmm2,%%xmm2 \n"
- "paddw %%xmm3,%%xmm3 \n"
- "pmulhw %%xmm5,%%xmm2 \n"
- "pmulhw %%xmm5,%%xmm3 \n"
- "paddw %%xmm2,%%xmm0 \n"
- "paddw %%xmm3,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "sub $0x10,%2 \n"
- BUNDLEALIGN
- MEMOPMEM(movdqa,xmm0,0x00,1,0,1) // movdqa %%xmm0,(%1,%0,1)
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "jg 1b \n"
- "jmp 99f \n"
-
- // Blend 25 / 75.
- LABELALIGN
- "25: \n"
- "movdqa " MEMACCESS(1) ",%%xmm0 \n"
- MEMOPREG(movdqa,0x00,1,4,1,xmm1) // movdqa (%1,%4,1),%%xmm1
- "pavgb %%xmm1,%%xmm0 \n"
- "pavgb %%xmm1,%%xmm0 \n"
- "sub $0x10,%2 \n"
- BUNDLEALIGN
- MEMOPMEM(movdqa,xmm0,0x00,1,0,1) // movdqa %%xmm0,(%1,%0,1)
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "jg 25b \n"
- "jmp 99f \n"
-
- // Blend 50 / 50.
- LABELALIGN
- "50: \n"
- "movdqa " MEMACCESS(1) ",%%xmm0 \n"
- MEMOPREG(movdqa,0x00,1,4,1,xmm1) // movdqa (%1,%4,1),%%xmm1
- "pavgb %%xmm1,%%xmm0 \n"
- "sub $0x10,%2 \n"
- BUNDLEALIGN
- MEMOPMEM(movdqa,xmm0,0x00,1,0,1) // movdqa %%xmm0,(%1,%0,1)
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "jg 50b \n"
- "jmp 99f \n"
-
- // Blend 75 / 25.
- LABELALIGN
- "75: \n"
- "movdqa " MEMACCESS(1) ",%%xmm1 \n"
- MEMOPREG(movdqa,0x00,1,4,1,xmm0) // movdqa (%1,%4,1),%%xmm0
- "pavgb %%xmm1,%%xmm0 \n"
- "pavgb %%xmm1,%%xmm0 \n"
- "sub $0x10,%2 \n"
- BUNDLEALIGN
- MEMOPMEM(movdqa,xmm0,0x00,1,0,1) // movdqa %%xmm0,(%1,%0,1)
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "jg 75b \n"
- "jmp 99f \n"
-
- // Blend 100 / 0 - Copy row unchanged.
- LABELALIGN
- "100: \n"
- "movdqa " MEMACCESS(1) ",%%xmm0 \n"
- "sub $0x10,%2 \n"
- MEMOPMEM(movdqa,xmm0,0x00,1,0,1) // movdqa %%xmm0,(%1,%0,1)
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "jg 100b \n"
-
- "99: \n"
- : "+r"(dst_ptr), // %0
- "+r"(src_ptr), // %1
- "+r"(dst_width), // %2
- "+r"(source_y_fraction) // %3
- : "r"((intptr_t)(src_stride)) // %4
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
-#endif // HAS_INTERPOLATEROW_SSE2
-
-#ifdef HAS_INTERPOLATEROW_SSSE3
-// Bilinear filter 16x2 -> 16x1
-void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
- ptrdiff_t src_stride, int dst_width,
- int source_y_fraction) {
- asm volatile (
"sub %1,%0 \n"
- "shr %3 \n"
- "cmp $0x0,%3 \n"
- "je 100f \n"
"cmp $0x20,%3 \n"
"je 75f \n"
"cmp $0x40,%3 \n"
@@ -5573,106 +4702,95 @@ void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
"cmp $0x60,%3 \n"
"je 25f \n"
- "movd %3,%%xmm0 \n"
- "neg %3 \n"
- "add $0x80,%3 \n"
- "movd %3,%%xmm5 \n"
- "punpcklbw %%xmm0,%%xmm5 \n"
- "punpcklwd %%xmm5,%%xmm5 \n"
- "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ "vmovd %3,%%xmm0 \n"
+ "neg %3 \n"
+ "add $0x80,%3 \n"
+ "vmovd %3,%%xmm5 \n"
+ "vpunpcklbw %%xmm0,%%xmm5,%%xmm5 \n"
+ "vpunpcklwd %%xmm5,%%xmm5,%%xmm5 \n"
+ "vpxor %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpermd %%ymm5,%%ymm0,%%ymm5 \n"
// General purpose row blend.
LABELALIGN
"1: \n"
- "movdqu " MEMACCESS(1) ",%%xmm0 \n"
- MEMOPREG(movdqu,0x00,1,4,1,xmm2)
- "movdqu %%xmm0,%%xmm1 \n"
- "punpcklbw %%xmm2,%%xmm0 \n"
- "punpckhbw %%xmm2,%%xmm1 \n"
- "pmaddubsw %%xmm5,%%xmm0 \n"
- "pmaddubsw %%xmm5,%%xmm1 \n"
- "psrlw $0x7,%%xmm0 \n"
- "psrlw $0x7,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "sub $0x10,%2 \n"
- BUNDLEALIGN
- MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
- "lea " MEMLEA(0x10,1) ",%1 \n"
+ "vmovdqu " MEMACCESS(1) ",%%ymm0 \n"
+ MEMOPREG(vmovdqu,0x00,1,4,1,ymm2)
+ "vpunpckhbw %%ymm2,%%ymm0,%%ymm1 \n"
+ "vpunpcklbw %%ymm2,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpmaddubsw %%ymm5,%%ymm1,%%ymm1 \n"
+ "vpsrlw $0x7,%%ymm0,%%ymm0 \n"
+ "vpsrlw $0x7,%%ymm1,%%ymm1 \n"
+ "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+ MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1)
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "sub $0x20,%2 \n"
"jg 1b \n"
"jmp 99f \n"
// Blend 25 / 75.
LABELALIGN
"25: \n"
- "movdqu " MEMACCESS(1) ",%%xmm0 \n"
- MEMOPREG(movdqu,0x00,1,4,1,xmm1)
- "pavgb %%xmm1,%%xmm0 \n"
- "pavgb %%xmm1,%%xmm0 \n"
- "sub $0x10,%2 \n"
- BUNDLEALIGN
- MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
- "lea " MEMLEA(0x10,1) ",%1 \n"
+ "vmovdqu " MEMACCESS(1) ",%%ymm0 \n"
+ MEMOPREG(vmovdqu,0x00,1,4,1,ymm1)
+ "vpavgb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpavgb %%ymm1,%%ymm0,%%ymm0 \n"
+ MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1)
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "sub $0x20,%2 \n"
"jg 25b \n"
"jmp 99f \n"
// Blend 50 / 50.
LABELALIGN
"50: \n"
- "movdqu " MEMACCESS(1) ",%%xmm0 \n"
- MEMOPREG(movdqu,0x00,1,4,1,xmm1)
- "pavgb %%xmm1,%%xmm0 \n"
- "sub $0x10,%2 \n"
- BUNDLEALIGN
- MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
- "lea " MEMLEA(0x10,1) ",%1 \n"
+ "vmovdqu " MEMACCESS(1) ",%%ymm0 \n"
+ VMEMOPREG(vpavgb,0x00,1,4,1,ymm0,ymm0) // vpavgb (%1,%4,1),%%ymm0,%%ymm0
+ MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1)
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "sub $0x20,%2 \n"
"jg 50b \n"
"jmp 99f \n"
// Blend 75 / 25.
LABELALIGN
"75: \n"
- "movdqu " MEMACCESS(1) ",%%xmm1 \n"
- MEMOPREG(movdqu,0x00,1,4,1,xmm0)
- "pavgb %%xmm1,%%xmm0 \n"
- "pavgb %%xmm1,%%xmm0 \n"
- "sub $0x10,%2 \n"
- BUNDLEALIGN
- MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
- "lea " MEMLEA(0x10,1) ",%1 \n"
+ "vmovdqu " MEMACCESS(1) ",%%ymm1 \n"
+ MEMOPREG(vmovdqu,0x00,1,4,1,ymm0)
+ "vpavgb %%ymm1,%%ymm0,%%ymm0 \n"
+ "vpavgb %%ymm1,%%ymm0,%%ymm0 \n"
+ MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1)
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "sub $0x20,%2 \n"
"jg 75b \n"
"jmp 99f \n"
// Blend 100 / 0 - Copy row unchanged.
LABELALIGN
"100: \n"
- "movdqu " MEMACCESS(1) ",%%xmm0 \n"
- "sub $0x10,%2 \n"
- MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "jg 100b \n"
+ "rep movsb " MEMMOVESTRING(1,0) " \n"
+ "jmp 999f \n"
"99: \n"
- : "+r"(dst_ptr), // %0
- "+r"(src_ptr), // %1
- "+r"(dst_width), // %2
+ "vzeroupper \n"
+ "999: \n"
+ : "+D"(dst_ptr), // %0
+ "+S"(src_ptr), // %1
+ "+c"(dst_width), // %2
"+r"(source_y_fraction) // %3
: "r"((intptr_t)(src_stride)) // %4
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm5"
-#endif
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm5"
);
}
-#endif // HAS_INTERPOLATEROW_SSSE3
+#endif // HAS_INTERPOLATEROW_AVX2
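// A plain-C sketch of the row interpolator (hypothetical name): blend a row
// with the one src_stride below it, weighted by source_y_fraction (0..256).
// The SIMD kernels halve the fraction into 1.7 fixed point and special-case
// the 0%, 25%, 50%, 75% and 100% blends; the general path is:
static void InterpolateRow_C_Sketch(uint8* dst_ptr, const uint8* src_ptr,
                                    ptrdiff_t src_stride, int width,
                                    int source_y_fraction) {
  int y1_fraction = source_y_fraction >> 1;  // 0..128
  int y0_fraction = 128 - y1_fraction;
  const uint8* src_ptr1 = src_ptr + src_stride;
  int i;
  for (i = 0; i < width; ++i) {
    dst_ptr[i] = (uint8)((src_ptr[i] * y0_fraction +
                          src_ptr1[i] * y1_fraction) >> 7);
  }
}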
#ifdef HAS_INTERPOLATEROW_SSE2
// Bilinear filter 16x2 -> 16x1
-void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr,
- ptrdiff_t src_stride, int dst_width,
- int source_y_fraction) {
+void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) {
asm volatile (
"sub %1,%0 \n"
"shr %3 \n"
@@ -5699,8 +4817,8 @@ void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr,
"1: \n"
"movdqu " MEMACCESS(1) ",%%xmm0 \n"
MEMOPREG(movdqu,0x00,1,4,1,xmm2) // movdqu (%1,%4,1),%%xmm2
- "movdqu %%xmm0,%%xmm1 \n"
- "movdqu %%xmm2,%%xmm3 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
"punpcklbw %%xmm4,%%xmm2 \n"
"punpckhbw %%xmm4,%%xmm3 \n"
"punpcklbw %%xmm4,%%xmm0 \n"
@@ -5714,10 +4832,9 @@ void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr,
"paddw %%xmm2,%%xmm0 \n"
"paddw %%xmm3,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
- "sub $0x10,%2 \n"
- BUNDLEALIGN
MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1)
"lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%2 \n"
"jg 1b \n"
"jmp 99f \n"
@@ -5728,10 +4845,9 @@ void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr,
MEMOPREG(movdqu,0x00,1,4,1,xmm1) // movdqu (%1,%4,1),%%xmm1
"pavgb %%xmm1,%%xmm0 \n"
"pavgb %%xmm1,%%xmm0 \n"
- "sub $0x10,%2 \n"
- BUNDLEALIGN
MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1)
"lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%2 \n"
"jg 25b \n"
"jmp 99f \n"
@@ -5741,10 +4857,9 @@ void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr,
"movdqu " MEMACCESS(1) ",%%xmm0 \n"
MEMOPREG(movdqu,0x00,1,4,1,xmm1) // movdqu (%1,%4,1),%%xmm1
"pavgb %%xmm1,%%xmm0 \n"
- "sub $0x10,%2 \n"
- BUNDLEALIGN
MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1)
"lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%2 \n"
"jg 50b \n"
"jmp 99f \n"
@@ -5755,10 +4870,9 @@ void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr,
MEMOPREG(movdqu,0x00,1,4,1,xmm0) // movdqu (%1,%4,1),%%xmm0
"pavgb %%xmm1,%%xmm0 \n"
"pavgb %%xmm1,%%xmm0 \n"
- "sub $0x10,%2 \n"
- BUNDLEALIGN
MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1)
"lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%2 \n"
"jg 75b \n"
"jmp 99f \n"
@@ -5766,9 +4880,9 @@ void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr,
LABELALIGN
"100: \n"
"movdqu " MEMACCESS(1) ",%%xmm0 \n"
- "sub $0x10,%2 \n"
MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1)
"lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%2 \n"
"jg 100b \n"
"99: \n"
@@ -5777,73 +4891,12 @@ void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr,
"+r"(dst_width), // %2
"+r"(source_y_fraction) // %3
: "r"((intptr_t)(src_stride)) // %4
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
);
}
#endif // HAS_INTERPOLATEROW_SSE2
-#ifdef HAS_HALFROW_SSE2
-void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
- uint8* dst_uv, int pix) {
- asm volatile (
- "sub %0,%1 \n"
- LABELALIGN
- "1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- MEMOPREG(pavgb,0x00,0,3,1,xmm0) // pavgb (%0,%3),%%xmm0
- "sub $0x10,%2 \n"
- MEMOPMEM(movdqa,xmm0,0x00,0,1,1) // movdqa %%xmm0,(%0,%1)
- "lea " MEMLEA(0x10,0) ",%0 \n"
- "jg 1b \n"
- : "+r"(src_uv), // %0
- "+r"(dst_uv), // %1
- "+r"(pix) // %2
- : "r"((intptr_t)(src_uv_stride)) // %3
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0"
-#endif
- );
-}
-#endif // HAS_HALFROW_SSE2
-
-#ifdef HAS_ARGBTOBAYERROW_SSSE3
-void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
- uint32 selector, int pix) {
- asm volatile (
- // NaCL caveat - assumes movd is from GPR
- "movd %3,%%xmm5 \n"
- "pshufd $0x0,%%xmm5,%%xmm5 \n"
- LABELALIGN
- "1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "pshufb %%xmm5,%%xmm0 \n"
- "pshufb %%xmm5,%%xmm1 \n"
- "punpckldq %%xmm1,%%xmm0 \n"
- "sub $0x8,%2 \n"
- "movq %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x8,1) ",%1 \n"
- "jg 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_bayer), // %1
- "+r"(pix) // %2
- : "g"(selector) // %3
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm5"
-#endif
- );
-}
-#endif // HAS_ARGBTOBAYERROW_SSSE3
-
#ifdef HAS_ARGBTOBAYERGGROW_SSE2
void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer,
uint32 selector, int pix) {
@@ -5852,8 +4905,8 @@ void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer,
"psrld $0x18,%%xmm5 \n"
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
"psrld $0x8,%%xmm0 \n"
"psrld $0x8,%%xmm1 \n"
@@ -5861,18 +4914,16 @@ void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer,
"pand %%xmm5,%%xmm1 \n"
"packssdw %%xmm1,%%xmm0 \n"
"packuswb %%xmm1,%%xmm0 \n"
- "sub $0x8,%2 \n"
"movq %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x8,1) ",%1 \n"
+ "sub $0x8,%2 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_bayer), // %1
"+r"(pix) // %2
:
: "memory", "cc"
-#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm5"
-#endif
);
}
#endif // HAS_ARGBTOBAYERGGROW_SSE2
@@ -5882,34 +4933,7 @@ void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer,
void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
const uint8* shuffler, int pix) {
asm volatile (
- "movdqa " MEMACCESS(3) ",%%xmm5 \n"
- LABELALIGN
- "1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "pshufb %%xmm5,%%xmm0 \n"
- "pshufb %%xmm5,%%xmm1 \n"
- "sub $0x8,%2 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
- "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n"
- "lea " MEMLEA(0x20,1) ",%1 \n"
- "jg 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(pix) // %2
- : "r"(shuffler) // %3
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm5"
-#endif
- );
-}
-
-void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb,
- const uint8* shuffler, int pix) {
- asm volatile (
- "movdqa " MEMACCESS(3) ",%%xmm5 \n"
+ "movdqu " MEMACCESS(3) ",%%xmm5 \n"
LABELALIGN
"1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
@@ -5917,19 +4941,17 @@ void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb,
"lea " MEMLEA(0x20,0) ",%0 \n"
"pshufb %%xmm5,%%xmm0 \n"
"pshufb %%xmm5,%%xmm1 \n"
- "sub $0x8,%2 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
"lea " MEMLEA(0x20,1) ",%1 \n"
+ "sub $0x8,%2 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(pix) // %2
: "r"(shuffler) // %3
: "memory", "cc"
-#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm5"
-#endif
);
}
#endif // HAS_ARGBSHUFFLEROW_SSSE3
@@ -5947,19 +4969,18 @@ void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
"lea " MEMLEA(0x40,0) ",%0 \n"
"vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
"vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
- "sub $0x10,%2 \n"
"vmovdqu %%ymm0," MEMACCESS(1) " \n"
"vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n"
"lea " MEMLEA(0x40,1) ",%1 \n"
+ "sub $0x10,%2 \n"
"jg 1b \n"
+ "vzeroupper \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(pix) // %2
: "r"(shuffler) // %3
: "memory", "cc"
-#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm5"
-#endif
);
}
#endif // HAS_ARGBSHUFFLEROW_AVX2
@@ -5989,7 +5010,6 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
"movzb " MEMACCESS2(0x1,4) ",%2 \n"
MEMOPARG(movzb,0x00,0,2,1,2) " \n" // movzb (%0,%2,1),%2
"mov %b2," MEMACCESS2(0x1,1) " \n"
- BUNDLEALIGN
"movzb " MEMACCESS2(0x2,4) ",%2 \n"
MEMOPARG(movzb,0x00,0,2,1,2) " \n" // movzb (%0,%2,1),%2
"mov %b2," MEMACCESS2(0x2,1) " \n"
@@ -6014,9 +5034,9 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
"pshufhw $0x1b,%%xmm1,%%xmm1 \n"
"pshuflw $0x1b,%%xmm1,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
- "sub $0x4,%3 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x4,%3 \n"
"jg 123b \n"
"jmp 99f \n"
@@ -6032,9 +5052,9 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
"pshufhw $0x39,%%xmm1,%%xmm1 \n"
"pshuflw $0x39,%%xmm1,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
- "sub $0x4,%3 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x4,%3 \n"
"jg 321b \n"
"jmp 99f \n"
@@ -6050,9 +5070,9 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
"pshufhw $0x93,%%xmm1,%%xmm1 \n"
"pshuflw $0x93,%%xmm1,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
- "sub $0x4,%3 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x4,%3 \n"
"jg 2103b \n"
"jmp 99f \n"
@@ -6068,9 +5088,9 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
"pshufhw $0xc6,%%xmm1,%%xmm1 \n"
"pshuflw $0xc6,%%xmm1,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
- "sub $0x4,%3 \n"
"movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x4,%3 \n"
"jg 3012b \n"
"99: \n"
@@ -6079,13 +5099,8 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
"+d"(pixel_temp), // %2
"+r"(pix) // %3
: "r"(shuffler) // %4
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm5"
-#endif
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm5"
);
}
#endif // HAS_ARGBSHUFFLEROW_SSE2
@@ -6119,13 +5134,8 @@ void I422ToYUY2Row_SSE2(const uint8* src_y,
"+r"(dst_frame), // %3
"+rm"(width) // %4
:
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3"
-#endif
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3"
);
}
#endif // HAS_I422TOYUY2ROW_SSE2
@@ -6159,13 +5169,8 @@ void I422ToUYVYRow_SSE2(const uint8* src_y,
"+r"(dst_frame), // %3
"+rm"(width) // %4
:
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3"
-#endif
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3"
);
}
#endif // HAS_I422TOUYVYROW_SSE2
@@ -6212,18 +5217,16 @@ void ARGBPolynomialRow_SSE2(const uint8* src_argb,
"cvttps2dq %%xmm4,%%xmm4 \n"
"packuswb %%xmm4,%%xmm0 \n"
"packuswb %%xmm0,%%xmm0 \n"
- "sub $0x2,%2 \n"
"movq %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x8,1) ",%1 \n"
+ "sub $0x2,%2 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
: "r"(poly) // %3
: "memory", "cc"
-#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
-#endif
);
}
#endif // HAS_ARGBPOLYNOMIALROW_SSE2
@@ -6253,20 +5256,17 @@ void ARGBPolynomialRow_AVX2(const uint8* src_argb,
"vpackusdw %%ymm0,%%ymm0,%%ymm0 \n"
"vpermq $0xd8,%%ymm0,%%ymm0 \n"
"vpackuswb %%xmm0,%%xmm0,%%xmm0 \n"
- "sub $0x2,%2 \n"
"vmovq %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x8,1) ",%1 \n"
+ "sub $0x2,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
: "r"(poly) // %3
- : "memory", "cc"
-#if defined(__SSE2__)
-// TODO(fbarchard): declare ymm usage when applicable.
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-#endif
+ : "memory", "cc",
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
);
}
#endif // HAS_ARGBPOLYNOMIALROW_AVX2
@@ -6376,7 +5376,6 @@ void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
"movzb " MEMACCESS2(0x4,2) ",%0 \n"
MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
"mov %b0," MEMACCESS2(0x4,3) " \n"
- BUNDLEALIGN
"movzb " MEMACCESS2(0x5,2) ",%0 \n"
MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
"mov %b0," MEMACCESS2(0x5,3) " \n"
@@ -6416,9 +5415,9 @@ void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
"mov %b0," MEMACCESS2(0xe,3) " \n"
"movzb " MEMACCESS2(0xf,2) ",%0 \n"
"mov %b0," MEMACCESS2(0xf,3) " \n"
- "sub $0x4,%4 \n"
"lea " MEMLEA(0x10,2) ",%2 \n"
"lea " MEMLEA(0x10,3) ",%3 \n"
+ "sub $0x4,%4 \n"
"jg 1b \n"
: "+d"(pixel_temp), // %0
"+a"(table_temp), // %1
@@ -6427,10 +5426,7 @@ void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
"+rm"(width) // %4
: "r"(luma), // %5
"rm"(lumacoeff) // %6
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm3", "xmm4", "xmm5"
-#endif
+ : "memory", "cc", "xmm0", "xmm3", "xmm4", "xmm5"
);
}
#endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/row_win.cc b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/row_win.cc
index d79c353960b..6e9d04c0e4e 100644
--- a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/row_win.cc
+++ b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/row_win.cc
@@ -24,55 +24,63 @@ extern "C" {
#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \
(defined(_M_IX86) || defined(_M_X64))
-#define YG 74 /* (int8)(1.164 * 64 + 0.5) */
-
-#define UB 127 /* min(127,(int8)(2.018 * 64)) */
-#define UG -25 /* (int8)(-0.391 * 64 - 0.5) */
-#define UR 0
-
-#define VB 0
-#define VG -52 /* (int8)(-0.813 * 64 - 0.5) */
-#define VR 102 /* (int8)(1.596 * 64 + 0.5) */
-
-// Bias
-#define BB UB * 128 + VB * 128
-#define BG UG * 128 + VG * 128
-#define BR UR * 128 + VR * 128
-
-static const vec8 kUVToB = {
- UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB
-};
-
-static const vec8 kUVToR = {
- UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR
-};
-
-static const vec8 kUVToG = {
- UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG
+// YUV to RGB conversion constants.
+// Y contribution to R,G,B. Scale and bias.
+#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
+#define YGB 1160 /* 1.164 * 64 * 16 - adjusted for even error distribution */
+
+// U and V contributions to R,G,B.
+#define UB -128 /* -min(128, round(2.018 * 64)) */
+#define UG 25 /* -round(-0.391 * 64) */
+#define VG 52 /* -round(-0.813 * 64) */
+#define VR -102 /* -round(1.596 * 64) */
+
+// Bias values to subtract 16 from Y and 128 from U and V.
+#define BB (UB * 128 - YGB)
+#define BG (UG * 128 + VG * 128 - YGB)
+#define BR (VR * 128 - YGB)
+
+struct YuvConstants {
+ lvec8 kUVToB; // 0
+ lvec8 kUVToG; // 32
+ lvec8 kUVToR; // 64
+ lvec16 kUVBiasB; // 96
+ lvec16 kUVBiasG; // 128
+ lvec16 kUVBiasR; // 160
+ lvec16 kYToRgb; // 192
};
-static const vec8 kVUToB = {
- VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB,
+// BT601 constants for YUV to RGB.
+static YuvConstants SIMD_ALIGNED(kYuvConstants) = {
+ { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
+ UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 },
+ { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
+ UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
+ { 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,
+ 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR },
+ { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
+ { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
+ { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
+ { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
};
-static const vec8 kVUToR = {
- VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR,
+// BT601 constants for NV21 where chroma plane is VU instead of UV.
+static YuvConstants SIMD_ALIGNED(kYvuConstants) = {
+ { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,
+ 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB },
+ { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
+ VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
+ { VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
+ VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0 },
+ { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
+ { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
+ { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
+ { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
};
-static const vec8 kVUToG = {
- VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
-};
-
-static const vec16 kYToRgb = { YG, YG, YG, YG, YG, YG, YG, YG };
-static const vec16 kYSub16 = { 16, 16, 16, 16, 16, 16, 16, 16 };
-static const vec16 kUVBiasB = { BB, BB, BB, BB, BB, BB, BB, BB };
-static const vec16 kUVBiasG = { BG, BG, BG, BG, BG, BG, BG, BG };
-static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR };
-
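For orientation, a minimal scalar sketch of the fixed-point math these new constants implement; the helper names are illustrative and not part of the patch, and it assumes the YG/YGB/UB/UG/VG/VR and BB/BG/BR macros defined above.

    static unsigned char Clamp255(int v) {
      return (unsigned char)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }

    // One pixel of the new path: the UV products are subtracted *from* the
    // bias (matching the sub(bias, product) order in the SIMD code below),
    // and Y is scaled by duplicating its byte (y * 257) and keeping the high
    // 16 bits of the YG multiply, which folds the old "subtract 16" into YGB.
    static void YuvPixel(int y, int u, int v,
                         unsigned char* b, unsigned char* g, unsigned char* r) {
      int y1 = (int)(((unsigned)(y * 0x0101) * YG) >> 16);
      *b = Clamp255((BB - (u * UB) + y1) >> 6);
      *g = Clamp255((BG - (u * UG + v * VG) + y1) >> 6);
      *r = Clamp255((BR - (v * VR) + y1) >> 6);
    }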
// 64 bit
#if defined(_M_X64)
-// Aligned destination version.
__declspec(align(16))
void I422ToARGBRow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
@@ -81,7 +89,6 @@ void I422ToARGBRow_SSSE3(const uint8* y_buf,
int width) {
__m128i xmm0, xmm1, xmm2, xmm3;
const __m128i xmm5 = _mm_set1_epi8(-1);
- const __m128i xmm4 = _mm_setzero_si128();
const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf;
while (width > 0) {
@@ -89,18 +96,17 @@ void I422ToARGBRow_SSSE3(const uint8* y_buf,
xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset));
xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);
xmm0 = _mm_unpacklo_epi16(xmm0, xmm0);
- xmm1 = _mm_load_si128(&xmm0);
- xmm2 = _mm_load_si128(&xmm0);
- xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)kUVToB);
- xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)kUVToG);
- xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)kUVToR);
- xmm0 = _mm_sub_epi16(xmm0, *(__m128i*)kUVBiasB);
- xmm1 = _mm_sub_epi16(xmm1, *(__m128i*)kUVBiasG);
- xmm2 = _mm_sub_epi16(xmm2, *(__m128i*)kUVBiasR);
+ xmm1 = _mm_loadu_si128(&xmm0);
+ xmm2 = _mm_loadu_si128(&xmm0);
+ xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)kYuvConstants.kUVToB);
+ xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)kYuvConstants.kUVToG);
+ xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)kYuvConstants.kUVToR);
+ xmm0 = _mm_sub_epi16(*(__m128i*)kYuvConstants.kUVBiasB, xmm0);
+ xmm1 = _mm_sub_epi16(*(__m128i*)kYuvConstants.kUVBiasG, xmm1);
+ xmm2 = _mm_sub_epi16(*(__m128i*)kYuvConstants.kUVBiasR, xmm2);
xmm3 = _mm_loadl_epi64((__m128i*)y_buf);
- xmm3 = _mm_unpacklo_epi8(xmm3, xmm4);
- xmm3 = _mm_subs_epi16(xmm3, *(__m128i*)kYSub16);
- xmm3 = _mm_mullo_epi16(xmm3, *(__m128i*)kYToRgb);
+ xmm3 = _mm_unpacklo_epi8(xmm3, xmm3);
+ xmm3 = _mm_mulhi_epu16(xmm3, *(__m128i*)kYuvConstants.kYToRgb);
xmm0 = _mm_adds_epi16(xmm0, xmm3);
xmm1 = _mm_adds_epi16(xmm1, xmm3);
xmm2 = _mm_adds_epi16(xmm2, xmm3);
@@ -112,60 +118,7 @@ void I422ToARGBRow_SSSE3(const uint8* y_buf,
xmm2 = _mm_packus_epi16(xmm2, xmm2);
xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);
xmm2 = _mm_unpacklo_epi8(xmm2, xmm5);
- xmm1 = _mm_load_si128(&xmm0);
- xmm0 = _mm_unpacklo_epi16(xmm0, xmm2);
- xmm1 = _mm_unpackhi_epi16(xmm1, xmm2);
-
- _mm_store_si128((__m128i *)dst_argb, xmm0);
- _mm_store_si128((__m128i *)(dst_argb + 16), xmm1);
-
- y_buf += 8;
- u_buf += 4;
- dst_argb += 32;
- width -= 8;
- }
-}
-
-// Unaligned destination version.
-void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_argb,
- int width) {
- __m128i xmm0, xmm1, xmm2, xmm3;
- const __m128i xmm5 = _mm_set1_epi8(-1);
- const __m128i xmm4 = _mm_setzero_si128();
- const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf;
-
- while (width > 0) {
- xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf);
- xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset));
- xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);
- xmm0 = _mm_unpacklo_epi16(xmm0, xmm0);
- xmm1 = _mm_load_si128(&xmm0);
- xmm2 = _mm_load_si128(&xmm0);
- xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)kUVToB);
- xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)kUVToG);
- xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)kUVToR);
- xmm0 = _mm_sub_epi16(xmm0, *(__m128i*)kUVBiasB);
- xmm1 = _mm_sub_epi16(xmm1, *(__m128i*)kUVBiasG);
- xmm2 = _mm_sub_epi16(xmm2, *(__m128i*)kUVBiasR);
- xmm3 = _mm_loadl_epi64((__m128i*)y_buf);
- xmm3 = _mm_unpacklo_epi8(xmm3, xmm4);
- xmm3 = _mm_subs_epi16(xmm3, *(__m128i*)kYSub16);
- xmm3 = _mm_mullo_epi16(xmm3, *(__m128i*)kYToRgb);
- xmm0 = _mm_adds_epi16(xmm0, xmm3);
- xmm1 = _mm_adds_epi16(xmm1, xmm3);
- xmm2 = _mm_adds_epi16(xmm2, xmm3);
- xmm0 = _mm_srai_epi16(xmm0, 6);
- xmm1 = _mm_srai_epi16(xmm1, 6);
- xmm2 = _mm_srai_epi16(xmm2, 6);
- xmm0 = _mm_packus_epi16(xmm0, xmm0);
- xmm1 = _mm_packus_epi16(xmm1, xmm1);
- xmm2 = _mm_packus_epi16(xmm2, xmm2);
- xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);
- xmm2 = _mm_unpacklo_epi8(xmm2, xmm5);
- xmm1 = _mm_load_si128(&xmm0);
+ xmm1 = _mm_loadu_si128(&xmm0);
xmm0 = _mm_unpacklo_epi16(xmm0, xmm2);
xmm1 = _mm_unpackhi_epi16(xmm1, xmm2);
@@ -178,6 +131,7 @@ void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
width -= 8;
}
}
+
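The switch from subtract-16 plus mullo to unpack-with-self plus mulhi is worth a note: duplicating the Y byte yields y * 257, so one unsigned mulhi by YG scales Y while the 16-offset moves into the YGB bias. A hypothetical sanity check, not part of the patch:

    #include <assert.h>

    static void CheckYScale(void) {
      for (int y = 0; y < 256; ++y) {
        unsigned yy = (unsigned)((y << 8) | y);  // _mm_unpacklo_epi8(y, y), one lane
        unsigned y1 = (yy * 18997u) >> 16;       // _mm_mulhi_epu16(yy, kYToRgb)
        int ref = (149 * y) / 2;                 // 74.5 * y, i.e. 1.164 * 64 * y
        assert(yy == y * 257u);
        assert(ref - (int)y1 >= 0 && ref - (int)y1 <= 1);
      }
    }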
// 32 bit
#else // defined(_M_X64)
@@ -209,15 +163,10 @@ static const vec8 kARGBToVJ = {
-20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0
};
-// vpermd for vphaddw + vpackuswb vpermd.
-static const lvec32 kPermdARGBToY_AVX = {
- 0, 4, 1, 5, 2, 6, 3, 7
-};
-
// vpshufb for vphaddw + vpackuswb packed to shorts.
static const lvec8 kShufARGBToUV_AVX = {
0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
- 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
+ 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
};
// Constants for BGRA.
@@ -263,6 +212,7 @@ static const uvec8 kAddY16 = {
16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
};
+// 0.5 in 7-bit fixed point, added for rounding.
static const vec16 kAddYJ64 = {
64, 64, 64, 64, 64, 64, 64, 64
};
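The +64 matters for the YJ (full-range) path below: it is 0.5 in the 7-bit fixed-point sum, so the final shift rounds to nearest instead of truncating. A scalar sketch, with the 15/75/38 weights assumed from libyuv's kARGBToYJ table rather than quoted from this hunk:

    // yj = round((15*B + 75*G + 38*R) / 128); 15 + 75 + 38 == 128, so no bias.
    static unsigned char RGBToYJ(unsigned char r, unsigned char g, unsigned char b) {
      return (unsigned char)((15 * b + 75 * g + 38 * r + 64) >> 7);
    }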
@@ -316,36 +266,6 @@ void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
pcmpeqb xmm5, xmm5 // generate mask 0xff000000
pslld xmm5, 24
- align 4
- convertloop:
- movq xmm0, qword ptr [eax]
- lea eax, [eax + 8]
- punpcklbw xmm0, xmm0
- movdqa xmm1, xmm0
- punpcklwd xmm0, xmm0
- punpckhwd xmm1, xmm1
- por xmm0, xmm5
- por xmm1, xmm5
- movdqa [edx], xmm0
- movdqa [edx + 16], xmm1
- lea edx, [edx + 32]
- sub ecx, 8
- jg convertloop
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb,
- int pix) {
- __asm {
- mov eax, [esp + 4] // src_y
- mov edx, [esp + 8] // dst_argb
- mov ecx, [esp + 12] // pix
- pcmpeqb xmm5, xmm5 // generate mask 0xff000000
- pslld xmm5, 24
-
- align 4
convertloop:
movq xmm0, qword ptr [eax]
lea eax, [eax + 8]
@@ -374,7 +294,6 @@ void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
pslld xmm5, 24
movdqa xmm4, kShuffleMaskRGB24ToARGB
- align 4
convertloop:
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
@@ -386,18 +305,18 @@ void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
por xmm2, xmm5
palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]}
pshufb xmm0, xmm4
- movdqa [edx + 32], xmm2
+ movdqu [edx + 32], xmm2
por xmm0, xmm5
pshufb xmm1, xmm4
- movdqa [edx], xmm0
+ movdqu [edx], xmm0
por xmm1, xmm5
palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]}
pshufb xmm3, xmm4
- movdqa [edx + 16], xmm1
+ movdqu [edx + 16], xmm1
por xmm3, xmm5
- sub ecx, 16
- movdqa [edx + 48], xmm3
+ movdqu [edx + 48], xmm3
lea edx, [edx + 64]
+ sub ecx, 16
jg convertloop
ret
}
@@ -414,7 +333,6 @@ void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
pslld xmm5, 24
movdqa xmm4, kShuffleMaskRAWToARGB
- align 4
convertloop:
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
@@ -426,18 +344,18 @@ void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
por xmm2, xmm5
palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]}
pshufb xmm0, xmm4
- movdqa [edx + 32], xmm2
+ movdqu [edx + 32], xmm2
por xmm0, xmm5
pshufb xmm1, xmm4
- movdqa [edx], xmm0
+ movdqu [edx], xmm0
por xmm1, xmm5
palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]}
pshufb xmm3, xmm4
- movdqa [edx + 16], xmm1
+ movdqu [edx + 16], xmm1
por xmm3, xmm5
- sub ecx, 16
- movdqa [edx + 48], xmm3
+ movdqu [edx + 48], xmm3
lea edx, [edx + 64]
+ sub ecx, 16
jg convertloop
ret
}
@@ -474,7 +392,6 @@ void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb,
sub edx, eax
sub edx, eax
- align 4
convertloop:
movdqu xmm0, [eax] // fetch 8 pixels of bgr565
movdqa xmm1, xmm0
@@ -491,8 +408,8 @@ void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb,
movdqa xmm2, xmm1
punpcklbw xmm1, xmm0
punpckhbw xmm2, xmm0
- movdqa [eax * 2 + edx], xmm1 // store 4 pixels of ARGB
- movdqa [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB
+ movdqu [eax * 2 + edx], xmm1 // store 4 pixels of ARGB
+ movdqu [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB
lea eax, [eax + 16]
sub ecx, 8
jg convertloop
@@ -524,7 +441,6 @@ void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
sub edx, eax
sub edx, eax
- align 4
convertloop:
movdqu xmm0, [eax] // fetch 8 pixels of 1555
movdqa xmm1, xmm0
@@ -545,8 +461,8 @@ void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
movdqa xmm2, xmm1
punpcklbw xmm1, xmm0
punpckhbw xmm2, xmm0
- movdqa [eax * 2 + edx], xmm1 // store 4 pixels of ARGB
- movdqa [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB
+ movdqu [eax * 2 + edx], xmm1 // store 4 pixels of ARGB
+ movdqu [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB
lea eax, [eax + 16]
sub ecx, 8
jg convertloop
@@ -570,7 +486,6 @@ void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb,
sub edx, eax
sub edx, eax
- align 4
convertloop:
movdqu xmm0, [eax] // fetch 8 pixels of bgra4444
movdqa xmm2, xmm0
@@ -585,8 +500,8 @@ void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb,
movdqa xmm1, xmm0
punpcklbw xmm0, xmm2
punpckhbw xmm1, xmm2
- movdqa [eax * 2 + edx], xmm0 // store 4 pixels of ARGB
- movdqa [eax * 2 + edx + 16], xmm1 // store next 4 pixels of ARGB
+ movdqu [eax * 2 + edx], xmm0 // store 4 pixels of ARGB
+ movdqu [eax * 2 + edx + 16], xmm1 // store next 4 pixels of ARGB
lea eax, [eax + 16]
sub ecx, 8
jg convertloop
@@ -602,7 +517,6 @@ void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
mov ecx, [esp + 12] // pix
movdqa xmm6, kShuffleMaskARGBToRGB24
- align 4
convertloop:
movdqu xmm0, [eax] // fetch 16 pixels of argb
movdqu xmm1, [eax + 16]
@@ -641,7 +555,6 @@ void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
mov ecx, [esp + 12] // pix
movdqa xmm6, kShuffleMaskARGBToRAW
- align 4
convertloop:
movdqu xmm0, [eax] // fetch 16 pixels of argb
movdqu xmm1, [eax + 16]
@@ -686,9 +599,8 @@ void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
pcmpeqb xmm5, xmm5 // generate mask 0xfffff800
pslld xmm5, 11
- align 4
convertloop:
- movdqa xmm0, [eax] // fetch 4 pixels of argb
+ movdqu xmm0, [eax] // fetch 4 pixels of argb
movdqa xmm1, xmm0 // B
movdqa xmm2, xmm0 // G
pslld xmm0, 8 // R
@@ -726,9 +638,8 @@ void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
pcmpeqb xmm7, xmm7 // generate mask 0xffff8000
pslld xmm7, 15
- align 4
convertloop:
- movdqa xmm0, [eax] // fetch 4 pixels of argb
+ movdqu xmm0, [eax] // fetch 4 pixels of argb
movdqa xmm1, xmm0 // B
movdqa xmm2, xmm0 // G
movdqa xmm3, xmm0 // R
@@ -764,14 +675,13 @@ void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
movdqa xmm3, xmm4 // generate mask 0x00f000f0
psrlw xmm3, 8
- align 4
convertloop:
- movdqa xmm0, [eax] // fetch 4 pixels of argb
+ movdqu xmm0, [eax] // fetch 4 pixels of argb
movdqa xmm1, xmm0
pand xmm0, xmm3 // low nibble
pand xmm1, xmm4 // high nibble
- psrl xmm0, 4
- psrl xmm1, 8
+ psrld xmm0, 4
+ psrld xmm1, 8
por xmm0, xmm1
packuswb xmm0, xmm0
lea eax, [eax + 16]
@@ -783,6 +693,116 @@ void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
}
}
+#ifdef HAS_ARGBTORGB565ROW_AVX2
+__declspec(naked) __declspec(align(16))
+void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) {
+ __asm {
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_rgb
+ mov ecx, [esp + 12] // pix
+ vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0x0000001f
+ vpsrld ymm3, ymm3, 27
+ vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x000007e0
+ vpsrld ymm4, ymm4, 26
+ vpslld ymm4, ymm4, 5
+ vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xfffff800
+ vpslld ymm5, ymm5, 11
+
+ convertloop:
+ vmovdqu ymm0, [eax] // fetch 8 pixels of argb
+ vpsrld ymm2, ymm0, 5 // G
+ vpsrld ymm1, ymm0, 3 // B
+ vpslld ymm0, ymm0, 8 // R
+ vpand ymm2, ymm2, ymm4 // G
+ vpand ymm1, ymm1, ymm3 // B
+ vpsrad ymm0, ymm0, 16 // R
+ vpand ymm0, ymm0, ymm5 // R
+ vpor ymm1, ymm1, ymm2 // BG
+ vpor ymm0, ymm0, ymm1 // BGR
+ vpackssdw ymm0, ymm0, ymm0
+ vpermq ymm0, ymm0, 0xd8
+ lea eax, [eax + 32]
+ vmovdqu [edx], xmm0 // store 8 pixels of RGB565
+ lea edx, [edx + 16]
+ sub ecx, 8
+ jg convertloop
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_ARGBTORGB565ROW_AVX2
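The shift/mask sequence above reads more easily in scalar form; a per-pixel model (hypothetical helper, assuming libyuv's ARGB byte order B,G,R,A, i.e. 0xAARRGGBB as a little-endian word):

    // 5 bits of B in bits 0-4, 6 bits of G in bits 5-10, 5 bits of R in bits 11-15.
    static unsigned short PackRGB565(unsigned int argb) {
      unsigned int b = (argb >> 3) & 0x001f;  // top 5 bits of B
      unsigned int g = (argb >> 5) & 0x07e0;  // top 6 bits of G
      unsigned int r = (argb >> 8) & 0xf800;  // top 5 bits of R
      return (unsigned short)(r | g | b);
    }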
+
+#ifdef HAS_ARGBTOARGB1555ROW_AVX2
+__declspec(naked) __declspec(align(16))
+void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) {
+ __asm {
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_rgb
+ mov ecx, [esp + 12] // pix
+ vpcmpeqb ymm4, ymm4, ymm4
+ vpsrld ymm4, ymm4, 27 // generate mask 0x0000001f
+ vpslld ymm5, ymm4, 5 // generate mask 0x000003e0
+ vpslld ymm6, ymm4, 10 // generate mask 0x00007c00
+ vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xffff8000
+ vpslld ymm7, ymm7, 15
+
+ convertloop:
+ vmovdqu ymm0, [eax] // fetch 8 pixels of argb
+ vpsrld ymm3, ymm0, 9 // R
+ vpsrld ymm2, ymm0, 6 // G
+ vpsrld ymm1, ymm0, 3 // B
+ vpsrad ymm0, ymm0, 16 // A
+ vpand ymm3, ymm3, ymm6 // R
+ vpand ymm2, ymm2, ymm5 // G
+ vpand ymm1, ymm1, ymm4 // B
+ vpand ymm0, ymm0, ymm7 // A
+ vpor ymm0, ymm0, ymm1 // BA
+ vpor ymm2, ymm2, ymm3 // GR
+ vpor ymm0, ymm0, ymm2 // BGRA
+ vpackssdw ymm0, ymm0, ymm0
+ vpermq ymm0, ymm0, 0xd8
+ lea eax, [eax + 32]
+ vmovdqu [edx], xmm0 // store 8 pixels of ARGB1555
+ lea edx, [edx + 16]
+ sub ecx, 8
+ jg convertloop
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_ARGBTOARGB1555ROW_AVX2
+
+#ifdef HAS_ARGBTOARGB4444ROW_AVX2
+__declspec(naked) __declspec(align(16))
+void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) {
+ __asm {
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_rgb
+ mov ecx, [esp + 12] // pix
+ vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xf000f000
+ vpsllw ymm4, ymm4, 12
+ vpsrlw ymm3, ymm4, 8 // generate mask 0x00f000f0
+
+ convertloop:
+ vmovdqu ymm0, [eax] // fetch 8 pixels of argb
+ vpand ymm1, ymm0, ymm4 // high nibble
+ vpand ymm0, ymm0, ymm3 // low nibble
+ vpsrld ymm1, ymm1, 8
+ vpsrld ymm0, ymm0, 4
+ vpor ymm0, ymm0, ymm1
+ vpackuswb ymm0, ymm0, ymm0
+ vpermq ymm0, ymm0, 0xd8
+ lea eax, [eax + 32]
+ vmovdqu [edx], xmm0 // store 8 pixels of ARGB4444
+ lea edx, [edx + 16]
+ sub ecx, 8
+ jg convertloop
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_ARGBTOARGB4444ROW_AVX2
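Similarly for the nibble masks in the ARGB4444 routine; a scalar model (helper name illustrative):

    // Keep the high nibble of each channel byte, then pair them into two
    // bytes: low byte = G4|B4, high byte = A4|R4.
    static unsigned short PackARGB4444(unsigned int argb) {
      unsigned int lo = (argb & 0x00f000f0) >> 4;  // B and R high nibbles
      unsigned int hi = (argb & 0xf000f000) >> 8;  // G and A high nibbles
      unsigned int m = lo | hi;
      return (unsigned short)((m & 0x00ff) | ((m >> 8) & 0xff00));
    }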
+
// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
__declspec(naked) __declspec(align(16))
void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
@@ -790,15 +810,14 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
mov eax, [esp + 4] /* src_argb */
mov edx, [esp + 8] /* dst_y */
mov ecx, [esp + 12] /* pix */
- movdqa xmm5, kAddY16
movdqa xmm4, kARGBToY
+ movdqa xmm5, kAddY16
- align 4
convertloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- movdqa xmm2, [eax + 32]
- movdqa xmm3, [eax + 48]
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
pmaddubsw xmm0, xmm4
pmaddubsw xmm1, xmm4
pmaddubsw xmm2, xmm4
@@ -810,15 +829,16 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
psrlw xmm2, 7
packuswb xmm0, xmm2
paddb xmm0, xmm5
- sub ecx, 16
- movdqa [edx], xmm0
+ movdqu [edx], xmm0
lea edx, [edx + 16]
+ sub ecx, 16
jg convertloop
ret
}
}
-// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
+// Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
+// Same as ARGBToYRow but with different coefficients; no +16 bias is added, and the sum is rounded.
__declspec(naked) __declspec(align(16))
void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
__asm {
@@ -828,12 +848,11 @@ void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
movdqa xmm4, kARGBToYJ
movdqa xmm5, kAddYJ64
- align 4
convertloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- movdqa xmm2, [eax + 32]
- movdqa xmm3, [eax + 48]
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
pmaddubsw xmm0, xmm4
pmaddubsw xmm1, xmm4
pmaddubsw xmm2, xmm4
@@ -846,15 +865,20 @@ void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
psrlw xmm0, 7
psrlw xmm2, 7
packuswb xmm0, xmm2
- sub ecx, 16
- movdqa [edx], xmm0
+ movdqu [edx], xmm0
lea edx, [edx + 16]
+ sub ecx, 16
jg convertloop
ret
}
}
#ifdef HAS_ARGBTOYROW_AVX2
+// vpermd to restore pixel order after the vphaddw + vpackuswb lane mutation.
+static const lvec32 kPermdARGBToY_AVX = {
+ 0, 4, 1, 5, 2, 6, 3, 7
+};
+
// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
__declspec(naked) __declspec(align(32))
void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
@@ -864,9 +888,8 @@ void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
mov ecx, [esp + 12] /* pix */
vbroadcastf128 ymm4, kARGBToY
vbroadcastf128 ymm5, kAddY16
- vmovdqa ymm6, kPermdARGBToY_AVX
+ vmovdqu ymm6, kPermdARGBToY_AVX
- align 4
convertloop:
vmovdqu ymm0, [eax]
vmovdqu ymm1, [eax + 32]
@@ -883,10 +906,10 @@ void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
vpsrlw ymm2, ymm2, 7
vpackuswb ymm0, ymm0, ymm2 // mutates.
vpermd ymm0, ymm6, ymm0 // For vphaddw + vpackuswb mutation.
- vpaddb ymm0, ymm0, ymm5
- sub ecx, 32
+ vpaddb ymm0, ymm0, ymm5 // add 16 for Y
vmovdqu [edx], ymm0
lea edx, [edx + 32]
+ sub ecx, 32
jg convertloop
vzeroupper
ret
@@ -904,9 +927,8 @@ void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
mov ecx, [esp + 12] /* pix */
vbroadcastf128 ymm4, kARGBToYJ
vbroadcastf128 ymm5, kAddYJ64
- vmovdqa ymm6, kPermdARGBToY_AVX
+ vmovdqu ymm6, kPermdARGBToY_AVX
- align 4
convertloop:
vmovdqu ymm0, [eax]
vmovdqu ymm1, [eax + 32]
@@ -925,9 +947,9 @@ void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
vpsrlw ymm2, ymm2, 7
vpackuswb ymm0, ymm0, ymm2 // mutates.
vpermd ymm0, ymm6, ymm0 // For vphaddw + vpackuswb mutation.
- sub ecx, 32
vmovdqu [edx], ymm0
lea edx, [edx + 32]
+ sub ecx, 32
jg convertloop
vzeroupper
@@ -937,118 +959,14 @@ void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
#endif // HAS_ARGBTOYJROW_AVX2
__declspec(naked) __declspec(align(16))
-void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
- __asm {
- mov eax, [esp + 4] /* src_argb */
- mov edx, [esp + 8] /* dst_y */
- mov ecx, [esp + 12] /* pix */
- movdqa xmm5, kAddY16
- movdqa xmm4, kARGBToY
-
- align 4
- convertloop:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- movdqu xmm2, [eax + 32]
- movdqu xmm3, [eax + 48]
- pmaddubsw xmm0, xmm4
- pmaddubsw xmm1, xmm4
- pmaddubsw xmm2, xmm4
- pmaddubsw xmm3, xmm4
- lea eax, [eax + 64]
- phaddw xmm0, xmm1
- phaddw xmm2, xmm3
- psrlw xmm0, 7
- psrlw xmm2, 7
- packuswb xmm0, xmm2
- paddb xmm0, xmm5
- sub ecx, 16
- movdqu [edx], xmm0
- lea edx, [edx + 16]
- jg convertloop
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void ARGBToYJRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
- __asm {
- mov eax, [esp + 4] /* src_argb */
- mov edx, [esp + 8] /* dst_y */
- mov ecx, [esp + 12] /* pix */
- movdqa xmm4, kARGBToYJ
- movdqa xmm5, kAddYJ64
-
- align 4
- convertloop:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- movdqu xmm2, [eax + 32]
- movdqu xmm3, [eax + 48]
- pmaddubsw xmm0, xmm4
- pmaddubsw xmm1, xmm4
- pmaddubsw xmm2, xmm4
- pmaddubsw xmm3, xmm4
- lea eax, [eax + 64]
- phaddw xmm0, xmm1
- phaddw xmm2, xmm3
- paddw xmm0, xmm5
- paddw xmm2, xmm5
- psrlw xmm0, 7
- psrlw xmm2, 7
- packuswb xmm0, xmm2
- sub ecx, 16
- movdqu [edx], xmm0
- lea edx, [edx + 16]
- jg convertloop
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
__asm {
mov eax, [esp + 4] /* src_argb */
mov edx, [esp + 8] /* dst_y */
mov ecx, [esp + 12] /* pix */
- movdqa xmm5, kAddY16
movdqa xmm4, kBGRAToY
-
- align 4
- convertloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- movdqa xmm2, [eax + 32]
- movdqa xmm3, [eax + 48]
- pmaddubsw xmm0, xmm4
- pmaddubsw xmm1, xmm4
- pmaddubsw xmm2, xmm4
- pmaddubsw xmm3, xmm4
- lea eax, [eax + 64]
- phaddw xmm0, xmm1
- phaddw xmm2, xmm3
- psrlw xmm0, 7
- psrlw xmm2, 7
- packuswb xmm0, xmm2
- paddb xmm0, xmm5
- sub ecx, 16
- movdqa [edx], xmm0
- lea edx, [edx + 16]
- jg convertloop
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void BGRAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
- __asm {
- mov eax, [esp + 4] /* src_argb */
- mov edx, [esp + 8] /* dst_y */
- mov ecx, [esp + 12] /* pix */
movdqa xmm5, kAddY16
- movdqa xmm4, kBGRAToY
- align 4
convertloop:
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
@@ -1065,9 +983,9 @@ void BGRAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
psrlw xmm2, 7
packuswb xmm0, xmm2
paddb xmm0, xmm5
- sub ecx, 16
movdqu [edx], xmm0
lea edx, [edx + 16]
+ sub ecx, 16
jg convertloop
ret
}
@@ -1079,44 +997,9 @@ void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
mov eax, [esp + 4] /* src_argb */
mov edx, [esp + 8] /* dst_y */
mov ecx, [esp + 12] /* pix */
- movdqa xmm5, kAddY16
movdqa xmm4, kABGRToY
-
- align 4
- convertloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- movdqa xmm2, [eax + 32]
- movdqa xmm3, [eax + 48]
- pmaddubsw xmm0, xmm4
- pmaddubsw xmm1, xmm4
- pmaddubsw xmm2, xmm4
- pmaddubsw xmm3, xmm4
- lea eax, [eax + 64]
- phaddw xmm0, xmm1
- phaddw xmm2, xmm3
- psrlw xmm0, 7
- psrlw xmm2, 7
- packuswb xmm0, xmm2
- paddb xmm0, xmm5
- sub ecx, 16
- movdqa [edx], xmm0
- lea edx, [edx + 16]
- jg convertloop
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void ABGRToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
- __asm {
- mov eax, [esp + 4] /* src_argb */
- mov edx, [esp + 8] /* dst_y */
- mov ecx, [esp + 12] /* pix */
movdqa xmm5, kAddY16
- movdqa xmm4, kABGRToY
- align 4
convertloop:
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
@@ -1133,9 +1016,9 @@ void ABGRToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
psrlw xmm2, 7
packuswb xmm0, xmm2
paddb xmm0, xmm5
- sub ecx, 16
movdqu [edx], xmm0
lea edx, [edx + 16]
+ sub ecx, 16
jg convertloop
ret
}
@@ -1147,44 +1030,9 @@ void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
mov eax, [esp + 4] /* src_argb */
mov edx, [esp + 8] /* dst_y */
mov ecx, [esp + 12] /* pix */
- movdqa xmm5, kAddY16
movdqa xmm4, kRGBAToY
-
- align 4
- convertloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- movdqa xmm2, [eax + 32]
- movdqa xmm3, [eax + 48]
- pmaddubsw xmm0, xmm4
- pmaddubsw xmm1, xmm4
- pmaddubsw xmm2, xmm4
- pmaddubsw xmm3, xmm4
- lea eax, [eax + 64]
- phaddw xmm0, xmm1
- phaddw xmm2, xmm3
- psrlw xmm0, 7
- psrlw xmm2, 7
- packuswb xmm0, xmm2
- paddb xmm0, xmm5
- sub ecx, 16
- movdqa [edx], xmm0
- lea edx, [edx + 16]
- jg convertloop
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void RGBAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
- __asm {
- mov eax, [esp + 4] /* src_argb */
- mov edx, [esp + 8] /* dst_y */
- mov ecx, [esp + 12] /* pix */
movdqa xmm5, kAddY16
- movdqa xmm4, kRGBAToY
- align 4
convertloop:
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
@@ -1201,9 +1049,9 @@ void RGBAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
psrlw xmm2, 7
packuswb xmm0, xmm2
paddb xmm0, xmm5
- sub ecx, 16
movdqu [edx], xmm0
lea edx, [edx + 16]
+ sub ecx, 16
jg convertloop
ret
}
@@ -1220,22 +1068,26 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // pix
- movdqa xmm7, kARGBToU
- movdqa xmm6, kARGBToV
movdqa xmm5, kAddUV128
+ movdqa xmm6, kARGBToV
+ movdqa xmm7, kARGBToU
sub edi, edx // stride from u to v
- align 4
convertloop:
/* step 1 - subsample 16x2 argb pixels to 8x1 */
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- movdqa xmm2, [eax + 32]
- movdqa xmm3, [eax + 48]
- pavgb xmm0, [eax + esi]
- pavgb xmm1, [eax + esi + 16]
- pavgb xmm2, [eax + esi + 32]
- pavgb xmm3, [eax + esi + 48]
+ movdqu xmm0, [eax]
+ movdqu xmm4, [eax + esi]
+ pavgb xmm0, xmm4
+ movdqu xmm1, [eax + 16]
+ movdqu xmm4, [eax + esi + 16]
+ pavgb xmm1, xmm4
+ movdqu xmm2, [eax + 32]
+ movdqu xmm4, [eax + esi + 32]
+ pavgb xmm2, xmm4
+ movdqu xmm3, [eax + 48]
+ movdqu xmm4, [eax + esi + 48]
+ pavgb xmm3, xmm4
+
lea eax, [eax + 64]
movdqa xmm4, xmm0
shufps xmm0, xmm1, 0x88
@@ -1263,10 +1115,10 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
paddb xmm0, xmm5 // -> unsigned
// step 3 - store 8 U and 8 V values
- sub ecx, 16
movlps qword ptr [edx], xmm0 // U
movhps qword ptr [edx + edi], xmm0 // V
lea edx, [edx + 8]
+ sub ecx, 16
jg convertloop
pop edi
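Step 1 above is a rounded 2x2 box filter: pavgb against the next row, then shufps 0x88/0xdd plus pavgb across pixel pairs. A scalar model of steps 1-2, with the 112/74/38 and 18/94/112 coefficients assumed from the kARGBToU/kARGBToV tables rather than quoted from this hunk:

    static int Avg(int a, int b) { return (a + b + 1) >> 1; }  // pavgb semantics

    // p0/p1 point at two vertically adjacent pairs of ARGB pixels (B,G,R,A bytes).
    static void ARGBToUV2x2(const unsigned char* p0, const unsigned char* p1,
                            unsigned char* u, unsigned char* v) {
      int b = Avg(Avg(p0[0], p1[0]), Avg(p0[4], p1[4]));
      int g = Avg(Avg(p0[1], p1[1]), Avg(p0[5], p1[5]));
      int r = Avg(Avg(p0[2], p1[2]), Avg(p0[6], p1[6]));
      *u = (unsigned char)(((112 * b - 74 * g - 38 * r) >> 8) + 128);  // psraw 8, +128
      *v = (unsigned char)(((-18 * b - 94 * g + 112 * r) >> 8) + 128);
    }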
@@ -1286,22 +1138,26 @@ void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // pix
- movdqa xmm7, kARGBToUJ
- movdqa xmm6, kARGBToVJ
movdqa xmm5, kAddUVJ128
+ movdqa xmm6, kARGBToVJ
+ movdqa xmm7, kARGBToUJ
sub edi, edx // stride from u to v
- align 4
convertloop:
/* step 1 - subsample 16x2 argb pixels to 8x1 */
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- movdqa xmm2, [eax + 32]
- movdqa xmm3, [eax + 48]
- pavgb xmm0, [eax + esi]
- pavgb xmm1, [eax + esi + 16]
- pavgb xmm2, [eax + esi + 32]
- pavgb xmm3, [eax + esi + 48]
+ movdqu xmm0, [eax]
+ movdqu xmm4, [eax + esi]
+ pavgb xmm0, xmm4
+ movdqu xmm1, [eax + 16]
+ movdqu xmm4, [eax + esi + 16]
+ pavgb xmm1, xmm4
+ movdqu xmm2, [eax + 32]
+ movdqu xmm4, [eax + esi + 32]
+ pavgb xmm2, xmm4
+ movdqu xmm3, [eax + 48]
+ movdqu xmm4, [eax + esi + 48]
+ pavgb xmm3, xmm4
+
lea eax, [eax + 64]
movdqa xmm4, xmm0
shufps xmm0, xmm1, 0x88
@@ -1330,10 +1186,10 @@ void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
packsswb xmm0, xmm1
// step 3 - store 8 U and 8 V values
- sub ecx, 16
movlps qword ptr [edx], xmm0 // U
movhps qword ptr [edx + edi], xmm0 // V
lea edx, [edx + 8]
+ sub ecx, 16
jg convertloop
pop edi
@@ -1359,7 +1215,6 @@ void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
vbroadcastf128 ymm7, kARGBToU
sub edi, edx // stride from u to v
- align 4
convertloop:
/* step 1 - subsample 32x2 argb pixels to 16x1 */
vmovdqu ymm0, [eax]
@@ -1395,10 +1250,10 @@ void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
vpaddb ymm0, ymm0, ymm5 // -> unsigned
// step 3 - store 16 U and 16 V values
- sub ecx, 32
vextractf128 [edx], ymm0, 0 // U
vextractf128 [edx + edi], ymm0, 1 // V
lea edx, [edx + 16]
+ sub ecx, 32
jg convertloop
pop edi
@@ -1410,147 +1265,6 @@ void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
#endif // HAS_ARGBTOUVROW_AVX2
__declspec(naked) __declspec(align(16))
-void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // src_argb
- mov esi, [esp + 8 + 8] // src_stride_argb
- mov edx, [esp + 8 + 12] // dst_u
- mov edi, [esp + 8 + 16] // dst_v
- mov ecx, [esp + 8 + 20] // pix
- movdqa xmm7, kARGBToU
- movdqa xmm6, kARGBToV
- movdqa xmm5, kAddUV128
- sub edi, edx // stride from u to v
-
- align 4
- convertloop:
- /* step 1 - subsample 16x2 argb pixels to 8x1 */
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- movdqu xmm2, [eax + 32]
- movdqu xmm3, [eax + 48]
- movdqu xmm4, [eax + esi]
- pavgb xmm0, xmm4
- movdqu xmm4, [eax + esi + 16]
- pavgb xmm1, xmm4
- movdqu xmm4, [eax + esi + 32]
- pavgb xmm2, xmm4
- movdqu xmm4, [eax + esi + 48]
- pavgb xmm3, xmm4
- lea eax, [eax + 64]
- movdqa xmm4, xmm0
- shufps xmm0, xmm1, 0x88
- shufps xmm4, xmm1, 0xdd
- pavgb xmm0, xmm4
- movdqa xmm4, xmm2
- shufps xmm2, xmm3, 0x88
- shufps xmm4, xmm3, 0xdd
- pavgb xmm2, xmm4
-
- // step 2 - convert to U and V
- // from here down is very similar to Y code except
- // instead of 16 different pixels, its 8 pixels of U and 8 of V
- movdqa xmm1, xmm0
- movdqa xmm3, xmm2
- pmaddubsw xmm0, xmm7 // U
- pmaddubsw xmm2, xmm7
- pmaddubsw xmm1, xmm6 // V
- pmaddubsw xmm3, xmm6
- phaddw xmm0, xmm2
- phaddw xmm1, xmm3
- psraw xmm0, 8
- psraw xmm1, 8
- packsswb xmm0, xmm1
- paddb xmm0, xmm5 // -> unsigned
-
- // step 3 - store 8 U and 8 V values
- sub ecx, 16
- movlps qword ptr [edx], xmm0 // U
- movhps qword ptr [edx + edi], xmm0 // V
- lea edx, [edx + 8]
- jg convertloop
-
- pop edi
- pop esi
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void ARGBToUVJRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // src_argb
- mov esi, [esp + 8 + 8] // src_stride_argb
- mov edx, [esp + 8 + 12] // dst_u
- mov edi, [esp + 8 + 16] // dst_v
- mov ecx, [esp + 8 + 20] // pix
- movdqa xmm7, kARGBToUJ
- movdqa xmm6, kARGBToVJ
- movdqa xmm5, kAddUVJ128
- sub edi, edx // stride from u to v
-
- align 4
- convertloop:
- /* step 1 - subsample 16x2 argb pixels to 8x1 */
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- movdqu xmm2, [eax + 32]
- movdqu xmm3, [eax + 48]
- movdqu xmm4, [eax + esi]
- pavgb xmm0, xmm4
- movdqu xmm4, [eax + esi + 16]
- pavgb xmm1, xmm4
- movdqu xmm4, [eax + esi + 32]
- pavgb xmm2, xmm4
- movdqu xmm4, [eax + esi + 48]
- pavgb xmm3, xmm4
- lea eax, [eax + 64]
- movdqa xmm4, xmm0
- shufps xmm0, xmm1, 0x88
- shufps xmm4, xmm1, 0xdd
- pavgb xmm0, xmm4
- movdqa xmm4, xmm2
- shufps xmm2, xmm3, 0x88
- shufps xmm4, xmm3, 0xdd
- pavgb xmm2, xmm4
-
- // step 2 - convert to U and V
- // from here down is very similar to Y code except
- // instead of 16 different pixels, its 8 pixels of U and 8 of V
- movdqa xmm1, xmm0
- movdqa xmm3, xmm2
- pmaddubsw xmm0, xmm7 // U
- pmaddubsw xmm2, xmm7
- pmaddubsw xmm1, xmm6 // V
- pmaddubsw xmm3, xmm6
- phaddw xmm0, xmm2
- phaddw xmm1, xmm3
- paddw xmm0, xmm5 // +.5 rounding -> unsigned
- paddw xmm1, xmm5
- psraw xmm0, 8
- psraw xmm1, 8
- packsswb xmm0, xmm1
-
- // step 3 - store 8 U and 8 V values
- sub ecx, 16
- movlps qword ptr [edx], xmm0 // U
- movhps qword ptr [edx + edi], xmm0 // V
- lea edx, [edx + 8]
- jg convertloop
-
- pop edi
- pop esi
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
void ARGBToUV444Row_SSSE3(const uint8* src_argb0,
uint8* dst_u, uint8* dst_v, int width) {
__asm {
@@ -1559,70 +1273,11 @@ void ARGBToUV444Row_SSSE3(const uint8* src_argb0,
mov edx, [esp + 4 + 8] // dst_u
mov edi, [esp + 4 + 12] // dst_v
mov ecx, [esp + 4 + 16] // pix
- movdqa xmm7, kARGBToU
- movdqa xmm6, kARGBToV
movdqa xmm5, kAddUV128
- sub edi, edx // stride from u to v
-
- align 4
- convertloop:
- /* convert to U and V */
- movdqa xmm0, [eax] // U
- movdqa xmm1, [eax + 16]
- movdqa xmm2, [eax + 32]
- movdqa xmm3, [eax + 48]
- pmaddubsw xmm0, xmm7
- pmaddubsw xmm1, xmm7
- pmaddubsw xmm2, xmm7
- pmaddubsw xmm3, xmm7
- phaddw xmm0, xmm1
- phaddw xmm2, xmm3
- psraw xmm0, 8
- psraw xmm2, 8
- packsswb xmm0, xmm2
- paddb xmm0, xmm5
- sub ecx, 16
- movdqa [edx], xmm0
-
- movdqa xmm0, [eax] // V
- movdqa xmm1, [eax + 16]
- movdqa xmm2, [eax + 32]
- movdqa xmm3, [eax + 48]
- pmaddubsw xmm0, xmm6
- pmaddubsw xmm1, xmm6
- pmaddubsw xmm2, xmm6
- pmaddubsw xmm3, xmm6
- phaddw xmm0, xmm1
- phaddw xmm2, xmm3
- psraw xmm0, 8
- psraw xmm2, 8
- packsswb xmm0, xmm2
- paddb xmm0, xmm5
- lea eax, [eax + 64]
- movdqa [edx + edi], xmm0
- lea edx, [edx + 16]
- jg convertloop
-
- pop edi
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void ARGBToUV444Row_Unaligned_SSSE3(const uint8* src_argb0,
- uint8* dst_u, uint8* dst_v, int width) {
- __asm {
- push edi
- mov eax, [esp + 4 + 4] // src_argb
- mov edx, [esp + 4 + 8] // dst_u
- mov edi, [esp + 4 + 12] // dst_v
- mov ecx, [esp + 4 + 16] // pix
- movdqa xmm7, kARGBToU
movdqa xmm6, kARGBToV
- movdqa xmm5, kAddUV128
+ movdqa xmm7, kARGBToU
sub edi, edx // stride from u to v
- align 4
convertloop:
/* convert to U and V */
movdqu xmm0, [eax] // U
@@ -1639,7 +1294,6 @@ void ARGBToUV444Row_Unaligned_SSSE3(const uint8* src_argb0,
psraw xmm2, 8
packsswb xmm0, xmm2
paddb xmm0, xmm5
- sub ecx, 16
movdqu [edx], xmm0
movdqu xmm0, [eax] // V
@@ -1659,6 +1313,7 @@ void ARGBToUV444Row_Unaligned_SSSE3(const uint8* src_argb0,
lea eax, [eax + 64]
movdqu [edx + edi], xmm0
lea edx, [edx + 16]
+ sub ecx, 16
jg convertloop
pop edi
@@ -1675,18 +1330,17 @@ void ARGBToUV422Row_SSSE3(const uint8* src_argb0,
mov edx, [esp + 4 + 8] // dst_u
mov edi, [esp + 4 + 12] // dst_v
mov ecx, [esp + 4 + 16] // pix
- movdqa xmm7, kARGBToU
- movdqa xmm6, kARGBToV
movdqa xmm5, kAddUV128
+ movdqa xmm6, kARGBToV
+ movdqa xmm7, kARGBToU
sub edi, edx // stride from u to v
- align 4
convertloop:
/* step 1 - subsample 16x2 argb pixels to 8x1 */
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- movdqa xmm2, [eax + 32]
- movdqa xmm3, [eax + 48]
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
lea eax, [eax + 64]
movdqa xmm4, xmm0
shufps xmm0, xmm1, 0x88
@@ -1714,10 +1368,10 @@ void ARGBToUV422Row_SSSE3(const uint8* src_argb0,
paddb xmm0, xmm5 // -> unsigned
// step 3 - store 8 U and 8 V values
- sub ecx, 16
movlps qword ptr [edx], xmm0 // U
movhps qword ptr [edx + edi], xmm0 // V
lea edx, [edx + 8]
+ sub ecx, 16
jg convertloop
pop edi
@@ -1726,26 +1380,36 @@ void ARGBToUV422Row_SSSE3(const uint8* src_argb0,
}
__declspec(naked) __declspec(align(16))
-void ARGBToUV422Row_Unaligned_SSSE3(const uint8* src_argb0,
- uint8* dst_u, uint8* dst_v, int width) {
+void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) {
__asm {
+ push esi
push edi
- mov eax, [esp + 4 + 4] // src_argb
- mov edx, [esp + 4 + 8] // dst_u
- mov edi, [esp + 4 + 12] // dst_v
- mov ecx, [esp + 4 + 16] // pix
- movdqa xmm7, kARGBToU
- movdqa xmm6, kARGBToV
+ mov eax, [esp + 8 + 4] // src_argb
+ mov esi, [esp + 8 + 8] // src_stride_argb
+ mov edx, [esp + 8 + 12] // dst_u
+ mov edi, [esp + 8 + 16] // dst_v
+ mov ecx, [esp + 8 + 20] // pix
movdqa xmm5, kAddUV128
+ movdqa xmm6, kBGRAToV
+ movdqa xmm7, kBGRAToU
sub edi, edx // stride from u to v
- align 4
convertloop:
/* step 1 - subsample 16x2 argb pixels to 8x1 */
movdqu xmm0, [eax]
+ movdqu xmm4, [eax + esi]
+ pavgb xmm0, xmm4
movdqu xmm1, [eax + 16]
+ movdqu xmm4, [eax + esi + 16]
+ pavgb xmm1, xmm4
movdqu xmm2, [eax + 32]
+ movdqu xmm4, [eax + esi + 32]
+ pavgb xmm2, xmm4
movdqu xmm3, [eax + 48]
+ movdqu xmm4, [eax + esi + 48]
+ pavgb xmm3, xmm4
+
lea eax, [eax + 64]
movdqa xmm4, xmm0
shufps xmm0, xmm1, 0x88
@@ -1773,19 +1437,20 @@ void ARGBToUV422Row_Unaligned_SSSE3(const uint8* src_argb0,
paddb xmm0, xmm5 // -> unsigned
// step 3 - store 8 U and 8 V values
- sub ecx, 16
movlps qword ptr [edx], xmm0 // U
movhps qword ptr [edx + edi], xmm0 // V
lea edx, [edx + 8]
+ sub ecx, 16
jg convertloop
pop edi
+ pop esi
ret
}
}
__declspec(naked) __declspec(align(16))
-void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
+void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) {
__asm {
push esi
@@ -1795,22 +1460,26 @@ void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // pix
- movdqa xmm7, kBGRAToU
- movdqa xmm6, kBGRAToV
movdqa xmm5, kAddUV128
+ movdqa xmm6, kABGRToV
+ movdqa xmm7, kABGRToU
sub edi, edx // stride from u to v
- align 4
convertloop:
/* step 1 - subsample 16x2 argb pixels to 8x1 */
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- movdqa xmm2, [eax + 32]
- movdqa xmm3, [eax + 48]
- pavgb xmm0, [eax + esi]
- pavgb xmm1, [eax + esi + 16]
- pavgb xmm2, [eax + esi + 32]
- pavgb xmm3, [eax + esi + 48]
+ movdqu xmm0, [eax]
+ movdqu xmm4, [eax + esi]
+ pavgb xmm0, xmm4
+ movdqu xmm1, [eax + 16]
+ movdqu xmm4, [eax + esi + 16]
+ pavgb xmm1, xmm4
+ movdqu xmm2, [eax + 32]
+ movdqu xmm4, [eax + esi + 32]
+ pavgb xmm2, xmm4
+ movdqu xmm3, [eax + 48]
+ movdqu xmm4, [eax + esi + 48]
+ pavgb xmm3, xmm4
+
lea eax, [eax + 64]
movdqa xmm4, xmm0
shufps xmm0, xmm1, 0x88
@@ -1838,10 +1507,10 @@ void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
paddb xmm0, xmm5 // -> unsigned
// step 3 - store 8 U and 8 V values
- sub ecx, 16
movlps qword ptr [edx], xmm0 // U
movhps qword ptr [edx + edi], xmm0 // V
lea edx, [edx + 8]
+ sub ecx, 16
jg convertloop
pop edi
@@ -1851,8 +1520,8 @@ void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
}
__declspec(naked) __declspec(align(16))
-void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) {
+void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) {
__asm {
push esi
push edi
@@ -1861,26 +1530,26 @@ void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // pix
- movdqa xmm7, kBGRAToU
- movdqa xmm6, kBGRAToV
movdqa xmm5, kAddUV128
+ movdqa xmm6, kRGBAToV
+ movdqa xmm7, kRGBAToU
sub edi, edx // stride from u to v
- align 4
convertloop:
/* step 1 - subsample 16x2 argb pixels to 8x1 */
movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- movdqu xmm2, [eax + 32]
- movdqu xmm3, [eax + 48]
movdqu xmm4, [eax + esi]
pavgb xmm0, xmm4
+ movdqu xmm1, [eax + 16]
movdqu xmm4, [eax + esi + 16]
pavgb xmm1, xmm4
+ movdqu xmm2, [eax + 32]
movdqu xmm4, [eax + esi + 32]
pavgb xmm2, xmm4
+ movdqu xmm3, [eax + 48]
movdqu xmm4, [eax + esi + 48]
pavgb xmm3, xmm4
+
lea eax, [eax + 64]
movdqa xmm4, xmm0
shufps xmm0, xmm1, 0x88
@@ -1908,10 +1577,10 @@ void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
paddb xmm0, xmm5 // -> unsigned
// step 3 - store 8 U and 8 V values
- sub ecx, 16
movlps qword ptr [edx], xmm0 // U
movhps qword ptr [edx + edi], xmm0 // V
lea edx, [edx + 8]
+ sub ecx, 16
jg convertloop
pop edi
@@ -1919,314 +1588,263 @@ void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
ret
}
}
+#endif // HAS_ARGBTOYROW_SSSE3
+// Read 8 UV from 422, upsample to 16 UV.
+#define READYUV422_AVX2 __asm { \
+ __asm vmovq xmm0, qword ptr [esi] /* U */ /* NOLINT */ \
+ __asm vmovq xmm1, qword ptr [esi + edi] /* V */ /* NOLINT */ \
+ __asm lea esi, [esi + 8] \
+ __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \
+ __asm vpermq ymm0, ymm0, 0xd8 \
+ __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \
+ }
+
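READYUV422_AVX2 is a nearest-neighbor chroma upsample: the 8 (U,V) pairs are interleaved and each pair duplicated to cover two output pixels. A scalar equivalent (sketch):

    // 8 U and 8 V bytes -> 16 interleaved UV pairs (32 bytes).
    static void Upsample422UV(const unsigned char* u_buf, const unsigned char* v_buf,
                              unsigned char uv[32]) {
      for (int i = 0; i < 16; ++i) {   // 16 output pixels
        uv[2 * i + 0] = u_buf[i / 2];  // each U covers two pixels
        uv[2 * i + 1] = v_buf[i / 2];  // each V covers two pixels
      }
    }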
+// Read 8 UV from NV12, upsample to 16 UV.
+#define READNV12_AVX2 __asm { \
+ __asm vmovdqu xmm0, [esi] /* UV */ \
+ __asm lea esi, [esi + 16] \
+ __asm vpermq ymm0, ymm0, 0xd8 \
+ __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \
+ }
+
+// Convert 16 pixels: 16 UV and 16 Y.
+#define YUVTORGB_AVX2(YuvConstants) __asm { \
+ /* Step 1: Find 8 UV contributions to 16 R,G,B values */ \
+ __asm vpmaddubsw ymm2, ymm0, YuvConstants.kUVToR /* scale R UV */ \
+ __asm vpmaddubsw ymm1, ymm0, YuvConstants.kUVToG /* scale G UV */ \
+ __asm vpmaddubsw ymm0, ymm0, YuvConstants.kUVToB /* scale B UV */ \
+ __asm vmovdqu ymm3, YuvConstants.kUVBiasR \
+ __asm vpsubw ymm2, ymm3, ymm2 \
+ __asm vmovdqu ymm3, YuvConstants.kUVBiasG \
+ __asm vpsubw ymm1, ymm3, ymm1 \
+ __asm vmovdqu ymm3, YuvConstants.kUVBiasB \
+ __asm vpsubw ymm0, ymm3, ymm0 \
+ /* Step 2: Find Y contribution to 16 R,G,B values */ \
+ __asm vmovdqu xmm3, [eax] /* NOLINT */ \
+ __asm lea eax, [eax + 16] \
+ __asm vpermq ymm3, ymm3, 0xd8 \
+ __asm vpunpcklbw ymm3, ymm3, ymm3 \
+ __asm vpmulhuw ymm3, ymm3, YuvConstants.kYToRgb \
+ __asm vpaddsw ymm0, ymm0, ymm3 /* B += Y */ \
+ __asm vpaddsw ymm1, ymm1, ymm3 /* G += Y */ \
+ __asm vpaddsw ymm2, ymm2, ymm3 /* R += Y */ \
+ __asm vpsraw ymm0, ymm0, 6 \
+ __asm vpsraw ymm1, ymm1, 6 \
+ __asm vpsraw ymm2, ymm2, 6 \
+ __asm vpackuswb ymm0, ymm0, ymm0 /* B */ \
+ __asm vpackuswb ymm1, ymm1, ymm1 /* G */ \
+ __asm vpackuswb ymm2, ymm2, ymm2 /* R */ \
+ }
+
+// Store 16 ARGB values.
+#define STOREARGB_AVX2 __asm { \
+ /* Step 3: Weave into ARGB */ \
+ __asm vpunpcklbw ymm0, ymm0, ymm1 /* BG */ \
+ __asm vpermq ymm0, ymm0, 0xd8 \
+ __asm vpunpcklbw ymm2, ymm2, ymm5 /* RA */ \
+ __asm vpermq ymm2, ymm2, 0xd8 \
+ __asm vpunpcklwd ymm1, ymm0, ymm2 /* BGRA first 8 pixels */ \
+ __asm vpunpckhwd ymm0, ymm0, ymm2 /* BGRA next 8 pixels */ \
+ __asm vmovdqu [edx], ymm1 \
+ __asm vmovdqu [edx + 32], ymm0 \
+ __asm lea edx, [edx + 64] \
+ }
+
+#ifdef HAS_I422TOARGBROW_AVX2
+// 16 pixels
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
__declspec(naked) __declspec(align(16))
-void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) {
+void I422ToARGBRow_AVX2(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_argb,
+ int width) {
__asm {
push esi
push edi
- mov eax, [esp + 8 + 4] // src_argb
- mov esi, [esp + 8 + 8] // src_stride_argb
- mov edx, [esp + 8 + 12] // dst_u
- mov edi, [esp + 8 + 16] // dst_v
- mov ecx, [esp + 8 + 20] // pix
- movdqa xmm7, kABGRToU
- movdqa xmm6, kABGRToV
- movdqa xmm5, kAddUV128
- sub edi, edx // stride from u to v
+ mov eax, [esp + 8 + 4] // Y
+ mov esi, [esp + 8 + 8] // U
+ mov edi, [esp + 8 + 12] // V
+ mov edx, [esp + 8 + 16] // argb
+ mov ecx, [esp + 8 + 20] // width
+ sub edi, esi
+ vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
- align 4
convertloop:
- /* step 1 - subsample 16x2 argb pixels to 8x1 */
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- movdqa xmm2, [eax + 32]
- movdqa xmm3, [eax + 48]
- pavgb xmm0, [eax + esi]
- pavgb xmm1, [eax + esi + 16]
- pavgb xmm2, [eax + esi + 32]
- pavgb xmm3, [eax + esi + 48]
- lea eax, [eax + 64]
- movdqa xmm4, xmm0
- shufps xmm0, xmm1, 0x88
- shufps xmm4, xmm1, 0xdd
- pavgb xmm0, xmm4
- movdqa xmm4, xmm2
- shufps xmm2, xmm3, 0x88
- shufps xmm4, xmm3, 0xdd
- pavgb xmm2, xmm4
-
- // step 2 - convert to U and V
- // from here down is very similar to Y code except
- // instead of 16 different pixels, its 8 pixels of U and 8 of V
- movdqa xmm1, xmm0
- movdqa xmm3, xmm2
- pmaddubsw xmm0, xmm7 // U
- pmaddubsw xmm2, xmm7
- pmaddubsw xmm1, xmm6 // V
- pmaddubsw xmm3, xmm6
- phaddw xmm0, xmm2
- phaddw xmm1, xmm3
- psraw xmm0, 8
- psraw xmm1, 8
- packsswb xmm0, xmm1
- paddb xmm0, xmm5 // -> unsigned
+ READYUV422_AVX2
+ YUVTORGB_AVX2(kYuvConstants)
+ STOREARGB_AVX2
- // step 3 - store 8 U and 8 V values
sub ecx, 16
- movlps qword ptr [edx], xmm0 // U
- movhps qword ptr [edx + edi], xmm0 // V
- lea edx, [edx + 8]
jg convertloop
pop edi
pop esi
+ vzeroupper
ret
}
}
+#endif // HAS_I422TOARGBROW_AVX2
+#ifdef HAS_NV12TOARGBROW_AVX2
+// 16 pixels.
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
__declspec(naked) __declspec(align(16))
-void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) {
+void NV12ToARGBRow_AVX2(const uint8* y_buf,
+ const uint8* uv_buf,
+ uint8* dst_argb,
+ int width) {
__asm {
push esi
- push edi
- mov eax, [esp + 8 + 4] // src_argb
- mov esi, [esp + 8 + 8] // src_stride_argb
- mov edx, [esp + 8 + 12] // dst_u
- mov edi, [esp + 8 + 16] // dst_v
- mov ecx, [esp + 8 + 20] // pix
- movdqa xmm7, kABGRToU
- movdqa xmm6, kABGRToV
- movdqa xmm5, kAddUV128
- sub edi, edx // stride from u to v
+ mov eax, [esp + 4 + 4] // Y
+ mov esi, [esp + 4 + 8] // UV
+ mov edx, [esp + 4 + 12] // argb
+ mov ecx, [esp + 4 + 16] // width
+ vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
- align 4
convertloop:
- /* step 1 - subsample 16x2 argb pixels to 8x1 */
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- movdqu xmm2, [eax + 32]
- movdqu xmm3, [eax + 48]
- movdqu xmm4, [eax + esi]
- pavgb xmm0, xmm4
- movdqu xmm4, [eax + esi + 16]
- pavgb xmm1, xmm4
- movdqu xmm4, [eax + esi + 32]
- pavgb xmm2, xmm4
- movdqu xmm4, [eax + esi + 48]
- pavgb xmm3, xmm4
- lea eax, [eax + 64]
- movdqa xmm4, xmm0
- shufps xmm0, xmm1, 0x88
- shufps xmm4, xmm1, 0xdd
- pavgb xmm0, xmm4
- movdqa xmm4, xmm2
- shufps xmm2, xmm3, 0x88
- shufps xmm4, xmm3, 0xdd
- pavgb xmm2, xmm4
+ READNV12_AVX2
+ YUVTORGB_AVX2(kYuvConstants)
+ STOREARGB_AVX2
- // step 2 - convert to U and V
- // from here down is very similar to Y code except
- // instead of 16 different pixels, its 8 pixels of U and 8 of V
- movdqa xmm1, xmm0
- movdqa xmm3, xmm2
- pmaddubsw xmm0, xmm7 // U
- pmaddubsw xmm2, xmm7
- pmaddubsw xmm1, xmm6 // V
- pmaddubsw xmm3, xmm6
- phaddw xmm0, xmm2
- phaddw xmm1, xmm3
- psraw xmm0, 8
- psraw xmm1, 8
- packsswb xmm0, xmm1
- paddb xmm0, xmm5 // -> unsigned
+ sub ecx, 16
+ jg convertloop
+
+ pop esi
+ ret
+ }
+}
+#endif // HAS_NV12TOARGBROW_AVX2
+
+#ifdef HAS_NV21TOARGBROW_AVX2
+// 16 pixels.
+// 8 VU values upsampled to 16 VU, mixed with 16 Y producing 16 ARGB (64 bytes).
+__declspec(naked) __declspec(align(16))
+void NV21ToARGBRow_AVX2(const uint8* y_buf,
+ const uint8* uv_buf,
+ uint8* dst_argb,
+ int width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // Y
+ mov esi, [esp + 4 + 8] // UV
+ mov edx, [esp + 4 + 12] // argb
+ mov ecx, [esp + 4 + 16] // width
+ vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
+
+ convertloop:
+ READNV12_AVX2
+ YUVTORGB_AVX2(kYvuConstants)
+ STOREARGB_AVX2
- // step 3 - store 8 U and 8 V values
sub ecx, 16
- movlps qword ptr [edx], xmm0 // U
- movhps qword ptr [edx + edi], xmm0 // V
- lea edx, [edx + 8]
jg convertloop
- pop edi
pop esi
ret
}
}
+#endif // HAS_NV21TOARGBROW_AVX2
+#ifdef HAS_I422TOBGRAROW_AVX2
+// 16 pixels
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 BGRA (64 bytes).
+// TODO(fbarchard): Use macros to reduce duplicate code. See SSSE3.
__declspec(naked) __declspec(align(16))
-void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) {
+void I422ToBGRARow_AVX2(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_argb,
+ int width) {
__asm {
push esi
push edi
- mov eax, [esp + 8 + 4] // src_argb
- mov esi, [esp + 8 + 8] // src_stride_argb
- mov edx, [esp + 8 + 12] // dst_u
- mov edi, [esp + 8 + 16] // dst_v
- mov ecx, [esp + 8 + 20] // pix
- movdqa xmm7, kRGBAToU
- movdqa xmm6, kRGBAToV
- movdqa xmm5, kAddUV128
- sub edi, edx // stride from u to v
+ mov eax, [esp + 8 + 4] // Y
+ mov esi, [esp + 8 + 8] // U
+ mov edi, [esp + 8 + 12] // V
+ mov edx, [esp + 8 + 16] // argb
+ mov ecx, [esp + 8 + 20] // width
+ sub edi, esi
+ vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
- align 4
convertloop:
- /* step 1 - subsample 16x2 argb pixels to 8x1 */
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- movdqa xmm2, [eax + 32]
- movdqa xmm3, [eax + 48]
- pavgb xmm0, [eax + esi]
- pavgb xmm1, [eax + esi + 16]
- pavgb xmm2, [eax + esi + 32]
- pavgb xmm3, [eax + esi + 48]
- lea eax, [eax + 64]
- movdqa xmm4, xmm0
- shufps xmm0, xmm1, 0x88
- shufps xmm4, xmm1, 0xdd
- pavgb xmm0, xmm4
- movdqa xmm4, xmm2
- shufps xmm2, xmm3, 0x88
- shufps xmm4, xmm3, 0xdd
- pavgb xmm2, xmm4
-
- // step 2 - convert to U and V
- // from here down is very similar to Y code except
- // instead of 16 different pixels, its 8 pixels of U and 8 of V
- movdqa xmm1, xmm0
- movdqa xmm3, xmm2
- pmaddubsw xmm0, xmm7 // U
- pmaddubsw xmm2, xmm7
- pmaddubsw xmm1, xmm6 // V
- pmaddubsw xmm3, xmm6
- phaddw xmm0, xmm2
- phaddw xmm1, xmm3
- psraw xmm0, 8
- psraw xmm1, 8
- packsswb xmm0, xmm1
- paddb xmm0, xmm5 // -> unsigned
+ READYUV422_AVX2
+ YUVTORGB_AVX2(kYuvConstants)
- // step 3 - store 8 U and 8 V values
+ // Step 3: Weave into BGRA
+ vpunpcklbw ymm1, ymm1, ymm0 // GB
+ vpermq ymm1, ymm1, 0xd8
+ vpunpcklbw ymm2, ymm5, ymm2 // AR
+ vpermq ymm2, ymm2, 0xd8
+ vpunpcklwd ymm0, ymm2, ymm1 // ARGB first 8 pixels
+ vpunpckhwd ymm2, ymm2, ymm1 // ARGB next 8 pixels
+ vmovdqu [edx], ymm0
+ vmovdqu [edx + 32], ymm2
+ lea edx, [edx + 64]
sub ecx, 16
- movlps qword ptr [edx], xmm0 // U
- movhps qword ptr [edx + edi], xmm0 // V
- lea edx, [edx + 8]
jg convertloop
pop edi
pop esi
+ vzeroupper
ret
}
}
+#endif // HAS_I422TOBGRAROW_AVX2
+#ifdef HAS_I422TORGBAROW_AVX2
+// 16 pixels
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
+// TODO(fbarchard): Use macros to reduce duplicate code. See SSSE3.
__declspec(naked) __declspec(align(16))
-void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) {
+void I422ToRGBARow_AVX2(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_argb,
+ int width) {
__asm {
push esi
push edi
- mov eax, [esp + 8 + 4] // src_argb
- mov esi, [esp + 8 + 8] // src_stride_argb
- mov edx, [esp + 8 + 12] // dst_u
- mov edi, [esp + 8 + 16] // dst_v
- mov ecx, [esp + 8 + 20] // pix
- movdqa xmm7, kRGBAToU
- movdqa xmm6, kRGBAToV
- movdqa xmm5, kAddUV128
- sub edi, edx // stride from u to v
+ mov eax, [esp + 8 + 4] // Y
+ mov esi, [esp + 8 + 8] // U
+ mov edi, [esp + 8 + 12] // V
+ mov edx, [esp + 8 + 16] // argb
+ mov ecx, [esp + 8 + 20] // width
+ sub edi, esi
+ vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
- align 4
convertloop:
- /* step 1 - subsample 16x2 argb pixels to 8x1 */
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- movdqu xmm2, [eax + 32]
- movdqu xmm3, [eax + 48]
- movdqu xmm4, [eax + esi]
- pavgb xmm0, xmm4
- movdqu xmm4, [eax + esi + 16]
- pavgb xmm1, xmm4
- movdqu xmm4, [eax + esi + 32]
- pavgb xmm2, xmm4
- movdqu xmm4, [eax + esi + 48]
- pavgb xmm3, xmm4
- lea eax, [eax + 64]
- movdqa xmm4, xmm0
- shufps xmm0, xmm1, 0x88
- shufps xmm4, xmm1, 0xdd
- pavgb xmm0, xmm4
- movdqa xmm4, xmm2
- shufps xmm2, xmm3, 0x88
- shufps xmm4, xmm3, 0xdd
- pavgb xmm2, xmm4
+ READYUV422_AVX2
+ YUVTORGB_AVX2(kYuvConstants)
- // step 2 - convert to U and V
- // from here down is very similar to Y code except
- // instead of 16 different pixels, its 8 pixels of U and 8 of V
- movdqa xmm1, xmm0
- movdqa xmm3, xmm2
- pmaddubsw xmm0, xmm7 // U
- pmaddubsw xmm2, xmm7
- pmaddubsw xmm1, xmm6 // V
- pmaddubsw xmm3, xmm6
- phaddw xmm0, xmm2
- phaddw xmm1, xmm3
- psraw xmm0, 8
- psraw xmm1, 8
- packsswb xmm0, xmm1
- paddb xmm0, xmm5 // -> unsigned
-
- // step 3 - store 8 U and 8 V values
+ // Step 3: Weave into RGBA
+ vpunpcklbw ymm1, ymm1, ymm2 // GR
+ vpermq ymm1, ymm1, 0xd8
+ vpunpcklbw ymm2, ymm5, ymm0 // AB
+ vpermq ymm2, ymm2, 0xd8
+ vpunpcklwd ymm0, ymm2, ymm1 // ABGR first 8 pixels
+ vpunpckhwd ymm1, ymm2, ymm1 // ABGR next 8 pixels
+ vmovdqu [edx], ymm0
+ vmovdqu [edx + 32], ymm1
+ lea edx, [edx + 64]
sub ecx, 16
- movlps qword ptr [edx], xmm0 // U
- movhps qword ptr [edx + edi], xmm0 // V
- lea edx, [edx + 8]
jg convertloop
pop edi
pop esi
+ vzeroupper
ret
}
}
-#endif // HAS_ARGBTOYROW_SSSE3
-
-#ifdef HAS_I422TOARGBROW_AVX2
-
-static const lvec8 kUVToB_AVX = {
- UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB,
- UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB
-};
-static const lvec8 kUVToR_AVX = {
- UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR,
- UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR
-};
-static const lvec8 kUVToG_AVX = {
- UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
- UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG
-};
-static const lvec16 kYToRgb_AVX = {
- YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG
-};
-static const lvec16 kYSub16_AVX = {
- 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
-};
-static const lvec16 kUVBiasB_AVX = {
- BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB
-};
-static const lvec16 kUVBiasG_AVX = {
- BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG
-};
-static const lvec16 kUVBiasR_AVX = {
- BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR
-};
+#endif // HAS_I422TORGBAROW_AVX2
+#ifdef HAS_I422TOABGRROW_AVX2
// 16 pixels
-// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ABGR (64 bytes).
+// TODO(fbarchard): Use macros to reduce duplicate code. See SSSE3.
__declspec(naked) __declspec(align(16))
-void I422ToARGBRow_AVX2(const uint8* y_buf,
+void I422ToABGRRow_AVX2(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* dst_argb,
@@ -2241,63 +1859,33 @@ void I422ToARGBRow_AVX2(const uint8* y_buf,
mov ecx, [esp + 8 + 20] // width
sub edi, esi
vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
- vpxor ymm4, ymm4, ymm4
- align 4
convertloop:
- vmovq xmm0, qword ptr [esi] // U
- vmovq xmm1, qword ptr [esi + edi] // V
- lea esi, [esi + 8]
- vpunpcklbw ymm0, ymm0, ymm1 // UV
- vpermq ymm0, ymm0, 0xd8
- vpunpcklwd ymm0, ymm0, ymm0 // UVUV
- vpmaddubsw ymm2, ymm0, kUVToB_AVX // scale B UV
- vpmaddubsw ymm1, ymm0, kUVToG_AVX // scale G UV
- vpmaddubsw ymm0, ymm0, kUVToR_AVX // scale R UV
- vpsubw ymm2, ymm2, kUVBiasB_AVX // unbias back to signed
- vpsubw ymm1, ymm1, kUVBiasG_AVX
- vpsubw ymm0, ymm0, kUVBiasR_AVX
-
- // Step 2: Find Y contribution to 16 R,G,B values
- vmovdqu xmm3, [eax] // NOLINT
- lea eax, [eax + 16]
- vpermq ymm3, ymm3, 0xd8
- vpunpcklbw ymm3, ymm3, ymm4
- vpsubsw ymm3, ymm3, kYSub16_AVX
- vpmullw ymm3, ymm3, kYToRgb_AVX
- vpaddsw ymm2, ymm2, ymm3 // B += Y
- vpaddsw ymm1, ymm1, ymm3 // G += Y
- vpaddsw ymm0, ymm0, ymm3 // R += Y
- vpsraw ymm2, ymm2, 6
- vpsraw ymm1, ymm1, 6
- vpsraw ymm0, ymm0, 6
- vpackuswb ymm2, ymm2, ymm2 // B
- vpackuswb ymm1, ymm1, ymm1 // G
- vpackuswb ymm0, ymm0, ymm0 // R
-
- // Step 3: Weave into ARGB
- vpunpcklbw ymm2, ymm2, ymm1 // BG
+ READYUV422_AVX2
+ YUVTORGB_AVX2(kYuvConstants)
+
+ // Step 3: Weave into ABGR
+ vpunpcklbw ymm1, ymm2, ymm1 // RG
+ vpermq ymm1, ymm1, 0xd8
+ vpunpcklbw ymm2, ymm0, ymm5 // BA
vpermq ymm2, ymm2, 0xd8
- vpunpcklbw ymm0, ymm0, ymm5 // RA
- vpermq ymm0, ymm0, 0xd8
- vpunpcklwd ymm1, ymm2, ymm0 // BGRA first 8 pixels
- vpunpckhwd ymm2, ymm2, ymm0 // BGRA next 8 pixels
- vmovdqu [edx], ymm1
- vmovdqu [edx + 32], ymm2
+ vpunpcklwd ymm0, ymm1, ymm2 // RGBA first 8 pixels
+ vpunpckhwd ymm1, ymm1, ymm2 // RGBA next 8 pixels
+ vmovdqu [edx], ymm0
+ vmovdqu [edx + 32], ymm1
lea edx, [edx + 64]
sub ecx, 16
jg convertloop
- vzeroupper
pop edi
pop esi
+ vzeroupper
ret
}
}
-#endif // HAS_I422TOARGBROW_AVX2
-
-#ifdef HAS_I422TOARGBROW_SSSE3
+#endif // HAS_I422TOABGRROW_AVX2
+#if defined(HAS_I422TOARGBROW_SSSE3)
// TODO(fbarchard): Read that does half size on Y and treats 420 as 444.
// Read 8 UV from 444.
@@ -2337,22 +1925,25 @@ void I422ToARGBRow_AVX2(const uint8* y_buf,
}
// Convert 8 pixels: 8 UV and 8 Y.
-#define YUVTORGB __asm { \
+#define YUVTORGB(YuvConstants) __asm { \
/* Step 1: Find 4 UV contributions to 8 R,G,B values */ \
__asm movdqa xmm1, xmm0 \
__asm movdqa xmm2, xmm0 \
- __asm pmaddubsw xmm0, kUVToB /* scale B UV */ \
- __asm pmaddubsw xmm1, kUVToG /* scale G UV */ \
- __asm pmaddubsw xmm2, kUVToR /* scale R UV */ \
- __asm psubw xmm0, kUVBiasB /* unbias back to signed */ \
- __asm psubw xmm1, kUVBiasG \
- __asm psubw xmm2, kUVBiasR \
+ __asm movdqa xmm3, xmm0 \
+ __asm movdqa xmm0, YuvConstants.kUVBiasB /* unbias back to signed */ \
+ __asm pmaddubsw xmm1, YuvConstants.kUVToB /* scale B UV */ \
+ __asm psubw xmm0, xmm1 \
+ __asm movdqa xmm1, YuvConstants.kUVBiasG \
+ __asm pmaddubsw xmm2, YuvConstants.kUVToG /* scale G UV */ \
+ __asm psubw xmm1, xmm2 \
+ __asm movdqa xmm2, YuvConstants.kUVBiasR \
+ __asm pmaddubsw xmm3, YuvConstants.kUVToR /* scale R UV */ \
+ __asm psubw xmm2, xmm3 \
/* Step 2: Find Y contribution to 8 R,G,B values */ \
__asm movq xmm3, qword ptr [eax] /* NOLINT */ \
__asm lea eax, [eax + 8] \
- __asm punpcklbw xmm3, xmm4 \
- __asm psubsw xmm3, kYSub16 \
- __asm pmullw xmm3, kYToRgb \
+ __asm punpcklbw xmm3, xmm3 \
+ __asm pmulhuw xmm3, YuvConstants.kYToRgb \
__asm paddsw xmm0, xmm3 /* B += Y */ \
__asm paddsw xmm1, xmm3 /* G += Y */ \
__asm paddsw xmm2, xmm3 /* R += Y */ \
@@ -2364,35 +1955,131 @@ void I422ToARGBRow_AVX2(const uint8* y_buf,
__asm packuswb xmm2, xmm2 /* R */ \
}
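// A scalar sketch of the per-pixel math READYUV422 + YUVTORGB implement,
// assuming the usual BT.601 studio-swing coefficients scaled by 64 (hence
// the final >> 6). The Y constants 18997 and 1160 match the YToARGBRow
// comments below; the UV coefficients are illustrative -- the shipping
// kYuvConstants tables fold them and the biases into pmaddubsw/psubw
// operands (and clamp 129 to 128 to fit pmaddubsw's signed range).
#include <stdint.h>

static uint8_t clamp255(int v) { return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v; }

static void yuv_to_bgr(uint8_t y, uint8_t u, uint8_t v,
                       uint8_t* b, uint8_t* g, uint8_t* r) {
  // punpcklbw xmm3, xmm3 widens y to y*257; pmulhuw keeps the high 16 bits.
  int y1 = (int)(((uint32_t)(y * 0x0101) * 18997) >> 16) - 1160;  // ~74.5*(y-16)
  *b = clamp255((y1 + 129 * (u - 128)) >> 6);                  // 2.018 * 64
  *g = clamp255((y1 - 25 * (u - 128) - 52 * (v - 128)) >> 6);  // 0.391, 0.813
  *r = clamp255((y1 + 102 * (v - 128)) >> 6);                  // 1.596 * 64
}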
-// Convert 8 pixels: 8 VU and 8 Y.
-#define YVUTORGB __asm { \
- /* Step 1: Find 4 UV contributions to 8 R,G,B values */ \
+// Store 8 ARGB values.
+#define STOREARGB __asm { \
+ /* Step 3: Weave into ARGB */ \
+ __asm punpcklbw xmm0, xmm1 /* BG */ \
+ __asm punpcklbw xmm2, xmm5 /* RA */ \
__asm movdqa xmm1, xmm0 \
- __asm movdqa xmm2, xmm0 \
- __asm pmaddubsw xmm0, kVUToB /* scale B UV */ \
- __asm pmaddubsw xmm1, kVUToG /* scale G UV */ \
- __asm pmaddubsw xmm2, kVUToR /* scale R UV */ \
- __asm psubw xmm0, kUVBiasB /* unbias back to signed */ \
- __asm psubw xmm1, kUVBiasG \
- __asm psubw xmm2, kUVBiasR \
- /* Step 2: Find Y contribution to 8 R,G,B values */ \
- __asm movq xmm3, qword ptr [eax] /* NOLINT */ \
- __asm lea eax, [eax + 8] \
- __asm punpcklbw xmm3, xmm4 \
- __asm psubsw xmm3, kYSub16 \
- __asm pmullw xmm3, kYToRgb \
- __asm paddsw xmm0, xmm3 /* B += Y */ \
- __asm paddsw xmm1, xmm3 /* G += Y */ \
- __asm paddsw xmm2, xmm3 /* R += Y */ \
- __asm psraw xmm0, 6 \
- __asm psraw xmm1, 6 \
- __asm psraw xmm2, 6 \
- __asm packuswb xmm0, xmm0 /* B */ \
- __asm packuswb xmm1, xmm1 /* G */ \
- __asm packuswb xmm2, xmm2 /* R */ \
- }
-
-// 8 pixels, dest aligned 16.
+ __asm punpcklwd xmm0, xmm2 /* BGRA first 4 pixels */ \
+ __asm punpckhwd xmm1, xmm2 /* BGRA next 4 pixels */ \
+ __asm movdqu [edx], xmm0 \
+ __asm movdqu [edx + 16], xmm1 \
+ __asm lea edx, [edx + 32] \
+ }
+
+// Store 8 BGRA values.
+#define STOREBGRA __asm { \
+ /* Step 3: Weave into BGRA */ \
+ __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \
+ __asm punpcklbw xmm1, xmm0 /* GB */ \
+ __asm punpcklbw xmm5, xmm2 /* AR */ \
+ __asm movdqa xmm0, xmm5 \
+ __asm punpcklwd xmm5, xmm1 /* BGRA first 4 pixels */ \
+ __asm punpckhwd xmm0, xmm1 /* BGRA next 4 pixels */ \
+ __asm movdqu [edx], xmm5 \
+ __asm movdqu [edx + 16], xmm0 \
+ __asm lea edx, [edx + 32] \
+ }
+
+// Store 8 ABGR values.
+#define STOREABGR __asm { \
+ /* Step 3: Weave into ABGR */ \
+ __asm punpcklbw xmm2, xmm1 /* RG */ \
+ __asm punpcklbw xmm0, xmm5 /* BA */ \
+ __asm movdqa xmm1, xmm2 \
+ __asm punpcklwd xmm2, xmm0 /* RGBA first 4 pixels */ \
+ __asm punpckhwd xmm1, xmm0 /* RGBA next 4 pixels */ \
+ __asm movdqu [edx], xmm2 \
+ __asm movdqu [edx + 16], xmm1 \
+ __asm lea edx, [edx + 32] \
+ }
+
+// Store 8 RGBA values.
+#define STORERGBA __asm { \
+ /* Step 3: Weave into RGBA */ \
+ __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \
+ __asm punpcklbw xmm1, xmm2 /* GR */ \
+ __asm punpcklbw xmm5, xmm0 /* AB */ \
+ __asm movdqa xmm0, xmm5 \
+ __asm punpcklwd xmm5, xmm1 /* RGBA first 4 pixels */ \
+ __asm punpckhwd xmm0, xmm1 /* RGBA next 4 pixels */ \
+ __asm movdqu [edx], xmm5 \
+ __asm movdqu [edx + 16], xmm0 \
+ __asm lea edx, [edx + 32] \
+ }
+
+// Store 8 RGB24 values.
+#define STORERGB24 __asm { \
+ /* Step 3: Weave into RRGB */ \
+ __asm punpcklbw xmm0, xmm1 /* BG */ \
+ __asm punpcklbw xmm2, xmm2 /* RR */ \
+ __asm movdqa xmm1, xmm0 \
+ __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \
+ __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ \
+ /* Step 4: RRGB -> RGB24 */ \
+ __asm pshufb xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \
+ __asm pshufb xmm1, xmm6 /* Pack first 12 bytes. */ \
+    __asm palignr  xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 from xmm1 */ \
+ __asm movq qword ptr [edx], xmm0 /* First 8 bytes */ \
+ __asm movdqu [edx + 8], xmm1 /* Last 16 bytes */ \
+ __asm lea edx, [edx + 24] \
+ }
+
+// Store 8 RAW values.
+#define STORERAW __asm { \
+ /* Step 3: Weave into RRGB */ \
+ __asm punpcklbw xmm0, xmm1 /* BG */ \
+ __asm punpcklbw xmm2, xmm2 /* RR */ \
+ __asm movdqa xmm1, xmm0 \
+ __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \
+ __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ \
+ /* Step 4: RRGB -> RAW */ \
+ __asm pshufb xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \
+ __asm pshufb xmm1, xmm6 /* Pack first 12 bytes. */ \
+    __asm palignr  xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 from xmm1 */ \
+ __asm movq qword ptr [edx], xmm0 /* First 8 bytes */ \
+ __asm movdqu [edx + 8], xmm1 /* Last 16 bytes */ \
+ __asm lea edx, [edx + 24] \
+ }
+
+// Store 8 RGB565 values.
+#define STORERGB565 __asm { \
+ /* Step 3: Weave into RRGB */ \
+ __asm punpcklbw xmm0, xmm1 /* BG */ \
+ __asm punpcklbw xmm2, xmm2 /* RR */ \
+ __asm movdqa xmm1, xmm0 \
+ __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \
+ __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ \
+ /* Step 4: RRGB -> RGB565 */ \
+ __asm movdqa xmm3, xmm0 /* B first 4 pixels of argb */ \
+ __asm movdqa xmm2, xmm0 /* G */ \
+ __asm pslld xmm0, 8 /* R */ \
+ __asm psrld xmm3, 3 /* B */ \
+ __asm psrld xmm2, 5 /* G */ \
+ __asm psrad xmm0, 16 /* R */ \
+ __asm pand xmm3, xmm5 /* B */ \
+ __asm pand xmm2, xmm6 /* G */ \
+ __asm pand xmm0, xmm7 /* R */ \
+ __asm por xmm3, xmm2 /* BG */ \
+ __asm por xmm0, xmm3 /* BGR */ \
+ __asm movdqa xmm3, xmm1 /* B next 4 pixels of argb */ \
+ __asm movdqa xmm2, xmm1 /* G */ \
+ __asm pslld xmm1, 8 /* R */ \
+ __asm psrld xmm3, 3 /* B */ \
+ __asm psrld xmm2, 5 /* G */ \
+ __asm psrad xmm1, 16 /* R */ \
+ __asm pand xmm3, xmm5 /* B */ \
+ __asm pand xmm2, xmm6 /* G */ \
+ __asm pand xmm1, xmm7 /* R */ \
+ __asm por xmm3, xmm2 /* BG */ \
+ __asm por xmm1, xmm3 /* BGR */ \
+ __asm packssdw xmm0, xmm1 \
+ __asm movdqu [edx], xmm0 /* store 8 pixels of RGB565 */ \
+ __asm lea edx, [edx + 16] \
+ }
+
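// Scalar equivalent of the STORERGB565 packing above, for reference: the
// three masks generated in the callers (0x0000001f, 0x000007e0, 0xfffff800)
// keep the top 5, 6 and 5 bits of B, G and R. A sketch, not the vector path.
#include <stdint.h>

static uint16_t argb_to_rgb565(uint32_t argb) {
  uint32_t b = (argb >> 0) & 0xff;
  uint32_t g = (argb >> 8) & 0xff;
  uint32_t r = (argb >> 16) & 0xff;
  return (uint16_t)((b >> 3) | ((g >> 2) << 5) | ((r >> 3) << 11));
}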
+// 8 pixels.
// 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
__declspec(naked) __declspec(align(16))
void I444ToARGBRow_SSSE3(const uint8* y_buf,
@@ -2410,22 +2097,12 @@ void I444ToARGBRow_SSSE3(const uint8* y_buf,
mov ecx, [esp + 8 + 20] // width
sub edi, esi
pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
- pxor xmm4, xmm4
- align 4
convertloop:
READYUV444
- YUVTORGB
+ YUVTORGB(kYuvConstants)
+ STOREARGB
- // Step 3: Weave into ARGB
- punpcklbw xmm0, xmm1 // BG
- punpcklbw xmm2, xmm5 // RA
- movdqa xmm1, xmm0
- punpcklwd xmm0, xmm2 // BGRA first 4 pixels
- punpckhwd xmm1, xmm2 // BGRA next 4 pixels
- movdqa [edx], xmm0
- movdqa [edx + 16], xmm1
- lea edx, [edx + 32]
sub ecx, 8
jg convertloop
@@ -2435,8 +2112,8 @@ void I444ToARGBRow_SSSE3(const uint8* y_buf,
}
}
-// 8 pixels, dest aligned 16.
-// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
+// 8 pixels.
+// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 (24 bytes).
__declspec(naked) __declspec(align(16))
void I422ToRGB24Row_SSSE3(const uint8* y_buf,
const uint8* u_buf,
@@ -2452,27 +2129,14 @@ void I422ToRGB24Row_SSSE3(const uint8* y_buf,
mov edx, [esp + 8 + 16] // rgb24
mov ecx, [esp + 8 + 20] // width
sub edi, esi
- pxor xmm4, xmm4
movdqa xmm5, kShuffleMaskARGBToRGB24_0
movdqa xmm6, kShuffleMaskARGBToRGB24
- align 4
convertloop:
READYUV422
- YUVTORGB
+ YUVTORGB(kYuvConstants)
+ STORERGB24
- // Step 3: Weave into RRGB
- punpcklbw xmm0, xmm1 // BG
- punpcklbw xmm2, xmm2 // RR
- movdqa xmm1, xmm0
- punpcklwd xmm0, xmm2 // BGRR first 4 pixels
- punpckhwd xmm1, xmm2 // BGRR next 4 pixels
- pshufb xmm0, xmm5 // Pack into first 8 and last 4 bytes.
- pshufb xmm1, xmm6 // Pack into first 12 bytes.
- palignr xmm1, xmm0, 12 // last 4 bytes of xmm0 + 12 from xmm1
- movq qword ptr [edx], xmm0 // First 8 bytes
- movdqu [edx + 8], xmm1 // Last 16 bytes. = 24 bytes, 8 RGB pixels.
- lea edx, [edx + 24]
sub ecx, 8
jg convertloop
@@ -2482,8 +2146,8 @@ void I422ToRGB24Row_SSSE3(const uint8* y_buf,
}
}
-// 8 pixels, dest aligned 16.
-// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
+// 8 pixels.
+// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RAW (24 bytes).
__declspec(naked) __declspec(align(16))
void I422ToRAWRow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
@@ -2499,27 +2163,14 @@ void I422ToRAWRow_SSSE3(const uint8* y_buf,
mov edx, [esp + 8 + 16] // raw
mov ecx, [esp + 8 + 20] // width
sub edi, esi
- pxor xmm4, xmm4
movdqa xmm5, kShuffleMaskARGBToRAW_0
movdqa xmm6, kShuffleMaskARGBToRAW
- align 4
convertloop:
READYUV422
- YUVTORGB
+ YUVTORGB(kYuvConstants)
+ STORERAW
- // Step 3: Weave into RRGB
- punpcklbw xmm0, xmm1 // BG
- punpcklbw xmm2, xmm2 // RR
- movdqa xmm1, xmm0
- punpcklwd xmm0, xmm2 // BGRR first 4 pixels
- punpckhwd xmm1, xmm2 // BGRR next 4 pixels
- pshufb xmm0, xmm5 // Pack into first 8 and last 4 bytes.
- pshufb xmm1, xmm6 // Pack into first 12 bytes.
- palignr xmm1, xmm0, 12 // last 4 bytes of xmm0 + 12 from xmm1
- movq qword ptr [edx], xmm0 // First 8 bytes
- movdqu [edx + 8], xmm1 // Last 16 bytes. = 24 bytes, 8 RGB pixels.
- lea edx, [edx + 24]
sub ecx, 8
jg convertloop
@@ -2529,8 +2180,8 @@ void I422ToRAWRow_SSSE3(const uint8* y_buf,
}
}
-// 8 pixels, dest unaligned.
-// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
+// 8 pixels
+// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB565 (16 bytes).
__declspec(naked) __declspec(align(16))
void I422ToRGB565Row_SSSE3(const uint8* y_buf,
const uint8* u_buf,
@@ -2546,7 +2197,6 @@ void I422ToRGB565Row_SSSE3(const uint8* y_buf,
mov edx, [esp + 8 + 16] // rgb565
mov ecx, [esp + 8 + 20] // width
sub edi, esi
- pxor xmm4, xmm4
pcmpeqb xmm5, xmm5 // generate mask 0x0000001f
psrld xmm5, 27
pcmpeqb xmm6, xmm6 // generate mask 0x000007e0
@@ -2555,45 +2205,12 @@ void I422ToRGB565Row_SSSE3(const uint8* y_buf,
pcmpeqb xmm7, xmm7 // generate mask 0xfffff800
pslld xmm7, 11
- align 4
convertloop:
READYUV422
- YUVTORGB
+ YUVTORGB(kYuvConstants)
+ STORERGB565
- // Step 3: Weave into RRGB
- punpcklbw xmm0, xmm1 // BG
- punpcklbw xmm2, xmm2 // RR
- movdqa xmm1, xmm0
- punpcklwd xmm0, xmm2 // BGRR first 4 pixels
- punpckhwd xmm1, xmm2 // BGRR next 4 pixels
-
- // Step 3b: RRGB -> RGB565
- movdqa xmm3, xmm0 // B first 4 pixels of argb
- movdqa xmm2, xmm0 // G
- pslld xmm0, 8 // R
- psrld xmm3, 3 // B
- psrld xmm2, 5 // G
- psrad xmm0, 16 // R
- pand xmm3, xmm5 // B
- pand xmm2, xmm6 // G
- pand xmm0, xmm7 // R
- por xmm3, xmm2 // BG
- por xmm0, xmm3 // BGR
- movdqa xmm3, xmm1 // B next 4 pixels of argb
- movdqa xmm2, xmm1 // G
- pslld xmm1, 8 // R
- psrld xmm3, 3 // B
- psrld xmm2, 5 // G
- psrad xmm1, 16 // R
- pand xmm3, xmm5 // B
- pand xmm2, xmm6 // G
- pand xmm1, xmm7 // R
- por xmm3, xmm2 // BG
- por xmm1, xmm3 // BGR
- packssdw xmm0, xmm1
sub ecx, 8
- movdqu [edx], xmm0 // store 8 pixels of RGB565
- lea edx, [edx + 16]
jg convertloop
pop edi
@@ -2602,7 +2219,7 @@ void I422ToRGB565Row_SSSE3(const uint8* y_buf,
}
}
-// 8 pixels, dest aligned 16.
+// 8 pixels.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
__declspec(naked) __declspec(align(16))
void I422ToARGBRow_SSSE3(const uint8* y_buf,
@@ -2620,22 +2237,12 @@ void I422ToARGBRow_SSSE3(const uint8* y_buf,
mov ecx, [esp + 8 + 20] // width
sub edi, esi
pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
- pxor xmm4, xmm4
- align 4
convertloop:
READYUV422
- YUVTORGB
+ YUVTORGB(kYuvConstants)
+ STOREARGB
- // Step 3: Weave into ARGB
- punpcklbw xmm0, xmm1 // BG
- punpcklbw xmm2, xmm5 // RA
- movdqa xmm1, xmm0
- punpcklwd xmm0, xmm2 // BGRA first 4 pixels
- punpckhwd xmm1, xmm2 // BGRA next 4 pixels
- movdqa [edx], xmm0
- movdqa [edx + 16], xmm1
- lea edx, [edx + 32]
sub ecx, 8
jg convertloop
@@ -2645,7 +2252,7 @@ void I422ToARGBRow_SSSE3(const uint8* y_buf,
}
}
-// 8 pixels, dest aligned 16.
+// 8 pixels.
// 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
// Similar to I420 but duplicate UV once more.
__declspec(naked) __declspec(align(16))
@@ -2664,23 +2271,13 @@ void I411ToARGBRow_SSSE3(const uint8* y_buf,
mov edx, [esp + 12 + 16] // argb
mov ecx, [esp + 12 + 20] // width
sub edi, esi
- pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
- pxor xmm4, xmm4
+ pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
- align 4
convertloop:
READYUV411 // modifies EBX
- YUVTORGB
+ YUVTORGB(kYuvConstants)
+ STOREARGB
- // Step 3: Weave into ARGB
- punpcklbw xmm0, xmm1 // BG
- punpcklbw xmm2, xmm5 // RA
- movdqa xmm1, xmm0
- punpcklwd xmm0, xmm2 // BGRA first 4 pixels
- punpckhwd xmm1, xmm2 // BGRA next 4 pixels
- movdqa [edx], xmm0
- movdqa [edx + 16], xmm1
- lea edx, [edx + 32]
sub ecx, 8
jg convertloop
@@ -2691,7 +2288,7 @@ void I411ToARGBRow_SSSE3(const uint8* y_buf,
}
}
-// 8 pixels, dest aligned 16.
+// 8 pixels.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
__declspec(naked) __declspec(align(16))
void NV12ToARGBRow_SSSE3(const uint8* y_buf,
@@ -2705,22 +2302,12 @@ void NV12ToARGBRow_SSSE3(const uint8* y_buf,
mov edx, [esp + 4 + 12] // argb
mov ecx, [esp + 4 + 16] // width
pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
- pxor xmm4, xmm4
- align 4
convertloop:
READNV12
- YUVTORGB
+ YUVTORGB(kYuvConstants)
+ STOREARGB
- // Step 3: Weave into ARGB
- punpcklbw xmm0, xmm1 // BG
- punpcklbw xmm2, xmm5 // RA
- movdqa xmm1, xmm0
- punpcklwd xmm0, xmm2 // BGRA first 4 pixels
- punpckhwd xmm1, xmm2 // BGRA next 4 pixels
- movdqa [edx], xmm0
- movdqa [edx + 16], xmm1
- lea edx, [edx + 32]
sub ecx, 8
jg convertloop
@@ -2729,8 +2316,8 @@ void NV12ToARGBRow_SSSE3(const uint8* y_buf,
}
}
-// 8 pixels, dest aligned 16.
-// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
+// 8 pixels.
+// 4 VU values upsampled to 8 VU, mixed with 8 Y producing 8 ARGB (32 bytes).
__declspec(naked) __declspec(align(16))
void NV21ToARGBRow_SSSE3(const uint8* y_buf,
const uint8* uv_buf,
@@ -2739,234 +2326,16 @@ void NV21ToARGBRow_SSSE3(const uint8* y_buf,
__asm {
push esi
mov eax, [esp + 4 + 4] // Y
- mov esi, [esp + 4 + 8] // VU
- mov edx, [esp + 4 + 12] // argb
- mov ecx, [esp + 4 + 16] // width
- pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
- pxor xmm4, xmm4
-
- align 4
- convertloop:
- READNV12
- YVUTORGB
-
- // Step 3: Weave into ARGB
- punpcklbw xmm0, xmm1 // BG
- punpcklbw xmm2, xmm5 // RA
- movdqa xmm1, xmm0
- punpcklwd xmm0, xmm2 // BGRA first 4 pixels
- punpckhwd xmm1, xmm2 // BGRA next 4 pixels
- movdqa [edx], xmm0
- movdqa [edx + 16], xmm1
- lea edx, [edx + 32]
- sub ecx, 8
- jg convertloop
-
- pop esi
- ret
- }
-}
-
-// 8 pixels, unaligned.
-// 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked) __declspec(align(16))
-void I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_argb,
- int width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // Y
- mov esi, [esp + 8 + 8] // U
- mov edi, [esp + 8 + 12] // V
- mov edx, [esp + 8 + 16] // argb
- mov ecx, [esp + 8 + 20] // width
- sub edi, esi
- pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
- pxor xmm4, xmm4
-
- align 4
- convertloop:
- READYUV444
- YUVTORGB
-
- // Step 3: Weave into ARGB
- punpcklbw xmm0, xmm1 // BG
- punpcklbw xmm2, xmm5 // RA
- movdqa xmm1, xmm0
- punpcklwd xmm0, xmm2 // BGRA first 4 pixels
- punpckhwd xmm1, xmm2 // BGRA next 4 pixels
- movdqu [edx], xmm0
- movdqu [edx + 16], xmm1
- lea edx, [edx + 32]
- sub ecx, 8
- jg convertloop
-
- pop edi
- pop esi
- ret
- }
-}
-
-// 8 pixels, unaligned.
-// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked) __declspec(align(16))
-void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_argb,
- int width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // Y
- mov esi, [esp + 8 + 8] // U
- mov edi, [esp + 8 + 12] // V
- mov edx, [esp + 8 + 16] // argb
- mov ecx, [esp + 8 + 20] // width
- sub edi, esi
- pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
- pxor xmm4, xmm4
-
- align 4
- convertloop:
- READYUV422
- YUVTORGB
-
- // Step 3: Weave into ARGB
- punpcklbw xmm0, xmm1 // BG
- punpcklbw xmm2, xmm5 // RA
- movdqa xmm1, xmm0
- punpcklwd xmm0, xmm2 // BGRA first 4 pixels
- punpckhwd xmm1, xmm2 // BGRA next 4 pixels
- movdqu [edx], xmm0
- movdqu [edx + 16], xmm1
- lea edx, [edx + 32]
- sub ecx, 8
- jg convertloop
-
- pop edi
- pop esi
- ret
- }
-}
-
-// 8 pixels, unaligned.
-// 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
-// Similar to I420 but duplicate UV once more.
-__declspec(naked) __declspec(align(16))
-void I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_argb,
- int width) {
- __asm {
- push ebx
- push esi
- push edi
- mov eax, [esp + 12 + 4] // Y
- mov esi, [esp + 12 + 8] // U
- mov edi, [esp + 12 + 12] // V
- mov edx, [esp + 12 + 16] // argb
- mov ecx, [esp + 12 + 20] // width
- sub edi, esi
- pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
- pxor xmm4, xmm4
-
- align 4
- convertloop:
- READYUV411 // modifies EBX
- YUVTORGB
-
- // Step 3: Weave into ARGB
- punpcklbw xmm0, xmm1 // BG
- punpcklbw xmm2, xmm5 // RA
- movdqa xmm1, xmm0
- punpcklwd xmm0, xmm2 // BGRA first 4 pixels
- punpckhwd xmm1, xmm2 // BGRA next 4 pixels
- movdqu [edx], xmm0
- movdqu [edx + 16], xmm1
- lea edx, [edx + 32]
- sub ecx, 8
- jg convertloop
-
- pop edi
- pop esi
- pop ebx
- ret
- }
-}
-
-// 8 pixels, dest aligned 16.
-// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked) __declspec(align(16))
-void NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
- const uint8* uv_buf,
- uint8* dst_argb,
- int width) {
- __asm {
- push esi
- mov eax, [esp + 4 + 4] // Y
mov esi, [esp + 4 + 8] // UV
mov edx, [esp + 4 + 12] // argb
mov ecx, [esp + 4 + 16] // width
pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
- pxor xmm4, xmm4
- align 4
convertloop:
READNV12
- YUVTORGB
+ YUVTORGB(kYvuConstants)
+ STOREARGB
- // Step 3: Weave into ARGB
- punpcklbw xmm0, xmm1 // BG
- punpcklbw xmm2, xmm5 // RA
- movdqa xmm1, xmm0
- punpcklwd xmm0, xmm2 // BGRA first 4 pixels
- punpckhwd xmm1, xmm2 // BGRA next 4 pixels
- movdqu [edx], xmm0
- movdqu [edx + 16], xmm1
- lea edx, [edx + 32]
- sub ecx, 8
- jg convertloop
-
- pop esi
- ret
- }
-}
-
-// 8 pixels, dest aligned 16.
-// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked) __declspec(align(16))
-void NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
- const uint8* uv_buf,
- uint8* dst_argb,
- int width) {
- __asm {
- push esi
- mov eax, [esp + 4 + 4] // Y
- mov esi, [esp + 4 + 8] // VU
- mov edx, [esp + 4 + 12] // argb
- mov ecx, [esp + 4 + 16] // width
- pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
- pxor xmm4, xmm4
-
- align 4
- convertloop:
- READNV12
- YVUTORGB
-
- // Step 3: Weave into ARGB
- punpcklbw xmm0, xmm1 // BG
- punpcklbw xmm2, xmm5 // RA
- movdqa xmm1, xmm0
- punpcklwd xmm0, xmm2 // BGRA first 4 pixels
- punpckhwd xmm1, xmm2 // BGRA next 4 pixels
- movdqu [edx], xmm0
- movdqu [edx + 16], xmm1
- lea edx, [edx + 32]
sub ecx, 8
jg convertloop
@@ -2990,64 +2359,12 @@ void I422ToBGRARow_SSSE3(const uint8* y_buf,
mov edx, [esp + 8 + 16] // bgra
mov ecx, [esp + 8 + 20] // width
sub edi, esi
- pxor xmm4, xmm4
-
- align 4
- convertloop:
- READYUV422
- YUVTORGB
-
- // Step 3: Weave into BGRA
- pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
- punpcklbw xmm1, xmm0 // GB
- punpcklbw xmm5, xmm2 // AR
- movdqa xmm0, xmm5
- punpcklwd xmm5, xmm1 // BGRA first 4 pixels
- punpckhwd xmm0, xmm1 // BGRA next 4 pixels
- movdqa [edx], xmm5
- movdqa [edx + 16], xmm0
- lea edx, [edx + 32]
- sub ecx, 8
- jg convertloop
-
- pop edi
- pop esi
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_bgra,
- int width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // Y
- mov esi, [esp + 8 + 8] // U
- mov edi, [esp + 8 + 12] // V
- mov edx, [esp + 8 + 16] // bgra
- mov ecx, [esp + 8 + 20] // width
- sub edi, esi
- pxor xmm4, xmm4
- align 4
convertloop:
READYUV422
- YUVTORGB
+ YUVTORGB(kYuvConstants)
+ STOREBGRA
- // Step 3: Weave into BGRA
- pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
- punpcklbw xmm1, xmm0 // GB
- punpcklbw xmm5, xmm2 // AR
- movdqa xmm0, xmm5
- punpcklwd xmm5, xmm1 // BGRA first 4 pixels
- punpckhwd xmm0, xmm1 // BGRA next 4 pixels
- movdqu [edx], xmm5
- movdqu [edx + 16], xmm0
- lea edx, [edx + 32]
sub ecx, 8
jg convertloop
@@ -3073,63 +2390,12 @@ void I422ToABGRRow_SSSE3(const uint8* y_buf,
mov ecx, [esp + 8 + 20] // width
sub edi, esi
pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
- pxor xmm4, xmm4
- align 4
convertloop:
READYUV422
- YUVTORGB
+ YUVTORGB(kYuvConstants)
+ STOREABGR
- // Step 3: Weave into ARGB
- punpcklbw xmm2, xmm1 // RG
- punpcklbw xmm0, xmm5 // BA
- movdqa xmm1, xmm2
- punpcklwd xmm2, xmm0 // RGBA first 4 pixels
- punpckhwd xmm1, xmm0 // RGBA next 4 pixels
- movdqa [edx], xmm2
- movdqa [edx + 16], xmm1
- lea edx, [edx + 32]
- sub ecx, 8
- jg convertloop
-
- pop edi
- pop esi
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_abgr,
- int width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // Y
- mov esi, [esp + 8 + 8] // U
- mov edi, [esp + 8 + 12] // V
- mov edx, [esp + 8 + 16] // abgr
- mov ecx, [esp + 8 + 20] // width
- sub edi, esi
- pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
- pxor xmm4, xmm4
-
- align 4
- convertloop:
- READYUV422
- YUVTORGB
-
- // Step 3: Weave into ARGB
- punpcklbw xmm2, xmm1 // RG
- punpcklbw xmm0, xmm5 // BA
- movdqa xmm1, xmm2
- punpcklwd xmm2, xmm0 // RGBA first 4 pixels
- punpckhwd xmm1, xmm0 // RGBA next 4 pixels
- movdqu [edx], xmm2
- movdqu [edx + 16], xmm1
- lea edx, [edx + 32]
sub ecx, 8
jg convertloop
@@ -3154,64 +2420,12 @@ void I422ToRGBARow_SSSE3(const uint8* y_buf,
mov edx, [esp + 8 + 16] // rgba
mov ecx, [esp + 8 + 20] // width
sub edi, esi
- pxor xmm4, xmm4
- align 4
convertloop:
READYUV422
- YUVTORGB
-
- // Step 3: Weave into RGBA
- pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
- punpcklbw xmm1, xmm2 // GR
- punpcklbw xmm5, xmm0 // AB
- movdqa xmm0, xmm5
- punpcklwd xmm5, xmm1 // RGBA first 4 pixels
- punpckhwd xmm0, xmm1 // RGBA next 4 pixels
- movdqa [edx], xmm5
- movdqa [edx + 16], xmm0
- lea edx, [edx + 32]
- sub ecx, 8
- jg convertloop
-
- pop edi
- pop esi
- ret
- }
-}
+ YUVTORGB(kYuvConstants)
+ STORERGBA
-__declspec(naked) __declspec(align(16))
-void I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_rgba,
- int width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // Y
- mov esi, [esp + 8 + 8] // U
- mov edi, [esp + 8 + 12] // V
- mov edx, [esp + 8 + 16] // rgba
- mov ecx, [esp + 8 + 20] // width
- sub edi, esi
- pxor xmm4, xmm4
-
- align 4
- convertloop:
- READYUV422
- YUVTORGB
-
- // Step 3: Weave into RGBA
- pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
- punpcklbw xmm1, xmm2 // GR
- punpcklbw xmm5, xmm0 // AB
- movdqa xmm0, xmm5
- punpcklwd xmm5, xmm1 // RGBA first 4 pixels
- punpckhwd xmm0, xmm1 // RGBA next 4 pixels
- movdqu [edx], xmm5
- movdqu [edx + 16], xmm0
- lea edx, [edx + 32]
sub ecx, 8
jg convertloop
@@ -3224,32 +2438,32 @@ void I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf,
#endif // HAS_I422TOARGBROW_SSSE3
#ifdef HAS_YTOARGBROW_SSE2
+// 8 pixels of Y converted to 8 pixels of ARGB (32 bytes).
__declspec(naked) __declspec(align(16))
void YToARGBRow_SSE2(const uint8* y_buf,
uint8* rgb_buf,
int width) {
__asm {
- pxor xmm5, xmm5
- pcmpeqb xmm4, xmm4 // generate mask 0xff000000
- pslld xmm4, 24
- mov eax, 0x00100010
- movd xmm3, eax
- pshufd xmm3, xmm3, 0
- mov eax, 0x004a004a // 74
+ mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256)
movd xmm2, eax
    pshufd     xmm2, xmm2, 0
+ mov eax, 0x04880488 // 0488 = 1160 = round(1.164 * 64 * 16)
+ movd xmm3, eax
+ pshufd xmm3, xmm3, 0
+ pcmpeqb xmm4, xmm4 // generate mask 0xff000000
+ pslld xmm4, 24
+
mov eax, [esp + 4] // Y
mov edx, [esp + 8] // rgb
mov ecx, [esp + 12] // width
- align 4
convertloop:
// Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
movq xmm0, qword ptr [eax]
lea eax, [eax + 8]
- punpcklbw xmm0, xmm5 // 0.Y
+ punpcklbw xmm0, xmm0 // Y.Y
+ pmulhuw xmm0, xmm2
psubusw xmm0, xmm3
- pmullw xmm0, xmm2
psrlw xmm0, 6
packuswb xmm0, xmm0 // G
@@ -3260,23 +2474,74 @@ void YToARGBRow_SSE2(const uint8* y_buf,
punpckhwd xmm1, xmm1 // BGRA next 4 pixels
por xmm0, xmm4
por xmm1, xmm4
- movdqa [edx], xmm0
- movdqa [edx + 16], xmm1
+ movdqu [edx], xmm0
+ movdqu [edx + 16], xmm1
lea edx, [edx + 32]
sub ecx, 8
jg convertloop
-
ret
}
}
#endif // HAS_YTOARGBROW_SSE2
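// A quick scalar check of the constants above, assuming the same
// pmulhuw-based evaluation: G = (((y * 0x0101) * 0x4a35 >> 16) - 0x0488) >> 6
// with unsigned saturation, so y = 16 maps to 0 and y = 235 maps to 255.
#include <assert.h>
#include <stdint.h>

static uint8_t scale_y(uint8_t y) {
  uint32_t g = ((uint32_t)(y * 0x0101) * 0x4a35) >> 16;  // high half, as pmulhuw
  g = g > 0x0488 ? g - 0x0488 : 0;                       // psubusw saturates at 0
  g >>= 6;
  return g > 255 ? 255 : (uint8_t)g;                     // packuswb saturates at 255
}

static void check_scale_y(void) {
  assert(scale_y(16) == 0);     // black level
  assert(scale_y(235) == 255);  // white level
}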
+#ifdef HAS_YTOARGBROW_AVX2
+// 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
+// note: vpunpcklbw mutates and vpackuswb unmutates.
+__declspec(naked) __declspec(align(16))
+void YToARGBRow_AVX2(const uint8* y_buf,
+ uint8* rgb_buf,
+ int width) {
+ __asm {
+ mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256)
+ vmovd xmm2, eax
+ vbroadcastss ymm2, xmm2
+ mov eax, 0x04880488 // 0488 = 1160 = round(1.164 * 64 * 16)
+ vmovd xmm3, eax
+ vbroadcastss ymm3, xmm3
+ vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xff000000
+ vpslld ymm4, ymm4, 24
+
+ mov eax, [esp + 4] // Y
+ mov edx, [esp + 8] // rgb
+ mov ecx, [esp + 12] // width
+
+ convertloop:
+ // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164
+ vmovdqu xmm0, [eax]
+ lea eax, [eax + 16]
+ vpermq ymm0, ymm0, 0xd8 // vpunpcklbw mutates
+ vpunpcklbw ymm0, ymm0, ymm0 // Y.Y
+ vpmulhuw ymm0, ymm0, ymm2
+ vpsubusw ymm0, ymm0, ymm3
+ vpsrlw ymm0, ymm0, 6
+ vpackuswb ymm0, ymm0, ymm0 // G. still mutated: 3120
+
+ // TODO(fbarchard): Weave alpha with unpack.
+ // Step 2: Weave into ARGB
+ vpunpcklbw ymm1, ymm0, ymm0 // GG - mutates
+ vpermq ymm1, ymm1, 0xd8
+ vpunpcklwd ymm0, ymm1, ymm1 // GGGG first 8 pixels
+ vpunpckhwd ymm1, ymm1, ymm1 // GGGG next 8 pixels
+ vpor ymm0, ymm0, ymm4
+ vpor ymm1, ymm1, ymm4
+ vmovdqu [edx], ymm0
+ vmovdqu [edx + 32], ymm1
+ lea edx, [edx + 64]
+ sub ecx, 16
+ jg convertloop
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_YTOARGBROW_AVX2
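// Why the vpermq 0xd8 shuffles keep appearing: 256-bit unpack instructions
// interleave within each 128-bit lane, so a plain vpunpcklbw pairs bytes
// 0..7 with 16..23 ("mutates"). Pre-permuting qwords into 0,2,1,3 order
// restores a linear result. A sketch with AVX2 intrinsics:
#include <immintrin.h>

static __m256i interleave_bytes_linear(__m256i a, __m256i b) {
  a = _mm256_permute4x64_epi64(a, 0xd8);  // qword order 0,2,1,3
  b = _mm256_permute4x64_epi64(b, 0xd8);
  return _mm256_unpacklo_epi8(a, b);      // bytes 0..15 of a and b, interleaved
}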
+
#ifdef HAS_MIRRORROW_SSSE3
// Shuffle table for reversing the bytes.
static const uvec8 kShuffleMirror = {
15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
};
+// TODO(fbarchard): Replace lea with -16 offset.
__declspec(naked) __declspec(align(16))
void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
__asm {
@@ -3284,15 +2549,13 @@ void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
mov edx, [esp + 8] // dst
mov ecx, [esp + 12] // width
movdqa xmm5, kShuffleMirror
- lea eax, [eax - 16]
- align 4
convertloop:
- movdqa xmm0, [eax + ecx]
+ movdqu xmm0, [eax - 16 + ecx]
pshufb xmm0, xmm5
- sub ecx, 16
- movdqa [edx], xmm0
+ movdqu [edx], xmm0
lea edx, [edx + 16]
+ sub ecx, 16
jg convertloop
ret
}
@@ -3300,29 +2563,21 @@ void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
#endif // HAS_MIRRORROW_SSSE3
#ifdef HAS_MIRRORROW_AVX2
-// Shuffle table for reversing the bytes.
-static const ulvec8 kShuffleMirror_AVX2 = {
- 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u,
- 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
-};
-
__declspec(naked) __declspec(align(16))
void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
__asm {
mov eax, [esp + 4] // src
mov edx, [esp + 8] // dst
mov ecx, [esp + 12] // width
- vmovdqa ymm5, kShuffleMirror_AVX2
- lea eax, [eax - 32]
+ vbroadcastf128 ymm5, kShuffleMirror
- align 4
convertloop:
- vmovdqu ymm0, [eax + ecx]
+ vmovdqu ymm0, [eax - 32 + ecx]
vpshufb ymm0, ymm0, ymm5
    vpermq     ymm0, ymm0, 0x4e  // swap high and low halves
- sub ecx, 32
vmovdqu [edx], ymm0
lea edx, [edx + 32]
+ sub ecx, 32
jg convertloop
vzeroupper
ret
@@ -3331,19 +2586,15 @@ void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
#endif // HAS_MIRRORROW_AVX2
#ifdef HAS_MIRRORROW_SSE2
-// SSE2 version has movdqu so it can be used on unaligned buffers when SSSE3
-// version can not.
__declspec(naked) __declspec(align(16))
void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
__asm {
mov eax, [esp + 4] // src
mov edx, [esp + 8] // dst
mov ecx, [esp + 12] // width
- lea eax, [eax - 16]
- align 4
convertloop:
- movdqu xmm0, [eax + ecx]
+ movdqu xmm0, [eax - 16 + ecx]
movdqa xmm1, xmm0 // swap bytes
psllw xmm0, 8
psrlw xmm1, 8
@@ -3351,9 +2602,9 @@ void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
pshuflw xmm0, xmm0, 0x1b // swap words
pshufhw xmm0, xmm0, 0x1b
pshufd xmm0, xmm0, 0x4e // swap qwords
- sub ecx, 16
movdqu [edx], xmm0
lea edx, [edx + 16]
+ sub ecx, 16
jg convertloop
ret
}
@@ -3379,15 +2630,14 @@ void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
lea eax, [eax + ecx * 2 - 16]
sub edi, edx
- align 4
convertloop:
- movdqa xmm0, [eax]
+ movdqu xmm0, [eax]
lea eax, [eax - 16]
pshufb xmm0, xmm1
- sub ecx, 8
movlpd qword ptr [edx], xmm0
movhpd qword ptr [edx + edi], xmm0
lea edx, [edx + 8]
+ sub ecx, 8
jg convertloop
pop edi
@@ -3396,34 +2646,27 @@ void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
}
#endif // HAS_MIRRORROW_UV_SSSE3
-#ifdef HAS_ARGBMIRRORROW_SSSE3
-// Shuffle table for reversing the bytes.
-static const uvec8 kARGBShuffleMirror = {
- 12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u, 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u
-};
-
+#ifdef HAS_ARGBMIRRORROW_SSE2
__declspec(naked) __declspec(align(16))
-void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
+void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
__asm {
mov eax, [esp + 4] // src
mov edx, [esp + 8] // dst
mov ecx, [esp + 12] // width
lea eax, [eax - 16 + ecx * 4] // last 4 pixels.
- movdqa xmm5, kARGBShuffleMirror
- align 4
convertloop:
- movdqa xmm0, [eax]
+ movdqu xmm0, [eax]
lea eax, [eax - 16]
- pshufb xmm0, xmm5
- sub ecx, 4
- movdqa [edx], xmm0
+ pshufd xmm0, xmm0, 0x1b
+ movdqu [edx], xmm0
lea edx, [edx + 16]
+ sub ecx, 4
jg convertloop
ret
}
}
-#endif // HAS_ARGBMIRRORROW_SSSE3
+#endif // HAS_ARGBMIRRORROW_SSE2
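// Scalar equivalent of ARGBMirrorRow_SSE2 above: whole 32-bit pixels are
// reversed, not bytes, which is why pshufd 0x1b (dword order 3,2,1,0)
// replaces the byte shuffle table the SSSE3 version used. A sketch:
#include <stdint.h>

static void argb_mirror_row(const uint32_t* src, uint32_t* dst, int width) {
  for (int i = 0; i < width; ++i) {
    dst[i] = src[width - 1 - i];  // copy pixels back to front
  }
}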
#ifdef HAS_ARGBMIRRORROW_AVX2
// Shuffle table for reversing the bytes.
@@ -3437,15 +2680,13 @@ void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
mov eax, [esp + 4] // src
mov edx, [esp + 8] // dst
mov ecx, [esp + 12] // width
- lea eax, [eax - 32]
- vmovdqa ymm5, kARGBShuffleMirror_AVX2
+ vmovdqu ymm5, kARGBShuffleMirror_AVX2
- align 4
convertloop:
- vpermd ymm0, ymm5, [eax + ecx * 4] // permute dword order
- sub ecx, 8
+ vpermd ymm0, ymm5, [eax - 32 + ecx * 4] // permute dword order
vmovdqu [edx], ymm0
lea edx, [edx + 32]
+ sub ecx, 8
jg convertloop
vzeroupper
ret
@@ -3466,44 +2707,6 @@ void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
psrlw xmm5, 8
sub edi, edx
- align 4
- convertloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- lea eax, [eax + 32]
- movdqa xmm2, xmm0
- movdqa xmm3, xmm1
- pand xmm0, xmm5 // even bytes
- pand xmm1, xmm5
- packuswb xmm0, xmm1
- psrlw xmm2, 8 // odd bytes
- psrlw xmm3, 8
- packuswb xmm2, xmm3
- movdqa [edx], xmm0
- movdqa [edx + edi], xmm2
- lea edx, [edx + 16]
- sub ecx, 16
- jg convertloop
-
- pop edi
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void SplitUVRow_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
- int pix) {
- __asm {
- push edi
- mov eax, [esp + 4 + 4] // src_uv
- mov edx, [esp + 4 + 8] // dst_u
- mov edi, [esp + 4 + 12] // dst_v
- mov ecx, [esp + 4 + 16] // pix
- pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
- psrlw xmm5, 8
- sub edi, edx
-
- align 4
convertloop:
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
@@ -3526,6 +2729,7 @@ void SplitUVRow_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
ret
}
}
+
#endif // HAS_SPLITUVROW_SSE2
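// Scalar equivalent of SplitUVRow above: pand with 0x00ff00ff selects the
// even (U) bytes and psrlw 8 the odd (V) bytes before packing. A sketch;
// MergeUVRow below is the inverse interleave.
#include <stdint.h>

static void split_uv_row(const uint8_t* src_uv, uint8_t* dst_u,
                         uint8_t* dst_v, int pix) {
  for (int i = 0; i < pix; ++i) {
    dst_u[i] = src_uv[2 * i + 0];  // even bytes
    dst_v[i] = src_uv[2 * i + 1];  // odd bytes
  }
}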
#ifdef HAS_SPLITUVROW_AVX2
@@ -3541,7 +2745,6 @@ void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
vpsrlw ymm5, ymm5, 8
sub edi, edx
- align 4
convertloop:
vmovdqu ymm0, [eax]
vmovdqu ymm1, [eax + 32]
@@ -3579,37 +2782,6 @@ void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
mov ecx, [esp + 4 + 16] // width
sub edx, eax
- align 4
- convertloop:
- movdqa xmm0, [eax] // read 16 U's
- movdqa xmm1, [eax + edx] // and 16 V's
- lea eax, [eax + 16]
- movdqa xmm2, xmm0
- punpcklbw xmm0, xmm1 // first 8 UV pairs
- punpckhbw xmm2, xmm1 // next 8 UV pairs
- movdqa [edi], xmm0
- movdqa [edi + 16], xmm2
- lea edi, [edi + 32]
- sub ecx, 16
- jg convertloop
-
- pop edi
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void MergeUVRow_Unaligned_SSE2(const uint8* src_u, const uint8* src_v,
- uint8* dst_uv, int width) {
- __asm {
- push edi
- mov eax, [esp + 4 + 4] // src_u
- mov edx, [esp + 4 + 8] // src_v
- mov edi, [esp + 4 + 12] // dst_uv
- mov ecx, [esp + 4 + 16] // width
- sub edx, eax
-
- align 4
convertloop:
movdqu xmm0, [eax] // read 16 U's
movdqu xmm1, [eax + edx] // and 16 V's
@@ -3641,17 +2813,16 @@ void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
mov ecx, [esp + 4 + 16] // width
sub edx, eax
- align 4
convertloop:
vmovdqu ymm0, [eax] // read 32 U's
vmovdqu ymm1, [eax + edx] // and 32 V's
lea eax, [eax + 32]
vpunpcklbw ymm2, ymm0, ymm1 // low 16 UV pairs. mutated qqword 0,2
vpunpckhbw ymm0, ymm0, ymm1 // high 16 UV pairs. mutated qqword 1,3
- vperm2i128 ymm1, ymm2, ymm0, 0x20 // low 128 of ymm2 and low 128 of ymm0
- vperm2i128 ymm2, ymm2, ymm0, 0x31 // high 128 of ymm2 and high 128 of ymm0
- vmovdqu [edi], ymm1
- vmovdqu [edi + 32], ymm2
+ vextractf128 [edi], ymm2, 0 // bytes 0..15
+ vextractf128 [edi + 16], ymm0, 0 // bytes 16..31
+ vextractf128 [edi + 32], ymm2, 1 // bytes 32..47
+    vextractf128 [edi + 48], ymm0, 1  // bytes 48..63
lea edi, [edi + 64]
sub ecx, 32
jg convertloop
@@ -3672,13 +2843,12 @@ void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
mov edx, [esp + 8] // dst
mov ecx, [esp + 12] // count
- align 4
convertloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
lea eax, [eax + 32]
- movdqa [edx], xmm0
- movdqa [edx + 16], xmm1
+ movdqu [edx], xmm0
+ movdqu [edx + 16], xmm1
lea edx, [edx + 32]
sub ecx, 32
jg convertloop
@@ -3687,39 +2857,46 @@ void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
}
#endif // HAS_COPYROW_SSE2
-// Unaligned Multiple of 1.
+#ifdef HAS_COPYROW_AVX
+// CopyRow copies 'count' bytes using 32 byte loads/stores, 64 bytes at a time.
__declspec(naked) __declspec(align(16))
-void CopyRow_ERMS(const uint8* src, uint8* dst, int count) {
+void CopyRow_AVX(const uint8* src, uint8* dst, int count) {
__asm {
- mov eax, esi
- mov edx, edi
- mov esi, [esp + 4] // src
- mov edi, [esp + 8] // dst
+ mov eax, [esp + 4] // src
+ mov edx, [esp + 8] // dst
mov ecx, [esp + 12] // count
- rep movsb
- mov edi, edx
- mov esi, eax
+
+ convertloop:
+ vmovdqu ymm0, [eax]
+ vmovdqu ymm1, [eax + 32]
+ lea eax, [eax + 64]
+ vmovdqu [edx], ymm0
+ vmovdqu [edx + 32], ymm1
+ lea edx, [edx + 64]
+ sub ecx, 64
+ jg convertloop
+
+ vzeroupper
ret
}
}
+#endif // HAS_COPYROW_AVX
-#ifdef HAS_COPYROW_X86
+// Multiple of 1.
__declspec(naked) __declspec(align(16))
-void CopyRow_X86(const uint8* src, uint8* dst, int count) {
+void CopyRow_ERMS(const uint8* src, uint8* dst, int count) {
__asm {
mov eax, esi
mov edx, edi
mov esi, [esp + 4] // src
mov edi, [esp + 8] // dst
mov ecx, [esp + 12] // count
- shr ecx, 2
- rep movsd
+ rep movsb
mov edi, edx
mov esi, eax
ret
}
}
-#endif // HAS_COPYROW_X86
#ifdef HAS_ARGBCOPYALPHAROW_SSE2
// width in pixels
@@ -3734,21 +2911,20 @@ void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff
psrld xmm1, 8
- align 4
convertloop:
- movdqa xmm2, [eax]
- movdqa xmm3, [eax + 16]
+ movdqu xmm2, [eax]
+ movdqu xmm3, [eax + 16]
lea eax, [eax + 32]
- movdqa xmm4, [edx]
- movdqa xmm5, [edx + 16]
+ movdqu xmm4, [edx]
+ movdqu xmm5, [edx + 16]
pand xmm2, xmm0
pand xmm3, xmm0
pand xmm4, xmm1
pand xmm5, xmm1
por xmm2, xmm4
por xmm3, xmm5
- movdqa [edx], xmm2
- movdqa [edx + 16], xmm3
+ movdqu [edx], xmm2
+ movdqu [edx + 16], xmm3
lea edx, [edx + 32]
sub ecx, 8
jg convertloop
@@ -3769,7 +2945,6 @@ void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
vpcmpeqb ymm0, ymm0, ymm0
vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff
- align 4
convertloop:
vmovdqu ymm1, [eax]
vmovdqu ymm2, [eax + 32]
@@ -3801,23 +2976,22 @@ void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff
psrld xmm1, 8
- align 4
convertloop:
movq xmm2, qword ptr [eax] // 8 Y's
lea eax, [eax + 8]
punpcklbw xmm2, xmm2
punpckhwd xmm3, xmm2
punpcklwd xmm2, xmm2
- movdqa xmm4, [edx]
- movdqa xmm5, [edx + 16]
+ movdqu xmm4, [edx]
+ movdqu xmm5, [edx + 16]
pand xmm2, xmm0
pand xmm3, xmm0
pand xmm4, xmm1
pand xmm5, xmm1
por xmm2, xmm4
por xmm3, xmm5
- movdqa [edx], xmm2
- movdqa [edx + 16], xmm3
+ movdqu [edx], xmm2
+ movdqu [edx + 16], xmm3
lea edx, [edx + 32]
sub ecx, 8
jg convertloop
@@ -3838,7 +3012,6 @@ void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
vpcmpeqb ymm0, ymm0, ymm0
vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff
- align 4
convertloop:
vpmovzxbd ymm1, qword ptr [eax]
vpmovzxbd ymm2, qword ptr [eax + 8]
@@ -3860,13 +3033,16 @@ void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
#endif // HAS_ARGBCOPYYTOALPHAROW_AVX2
#ifdef HAS_SETROW_X86
-// SetRow8 writes 'count' bytes using a 32 bit value repeated.
+// Write 'count' bytes using an 8 bit value repeated.
+// Count should be a multiple of 4.
__declspec(naked) __declspec(align(16))
-void SetRow_X86(uint8* dst, uint32 v32, int count) {
+void SetRow_X86(uint8* dst, uint8 v8, int count) {
__asm {
+ movzx eax, byte ptr [esp + 8] // v8
+ mov edx, 0x01010101 // Duplicate byte to all bytes.
+ mul edx // overwrites edx with upper part of result.
mov edx, edi
mov edi, [esp + 4] // dst
- mov eax, [esp + 8] // v32
mov ecx, [esp + 12] // count
shr ecx, 2
rep stosd
@@ -3875,33 +3051,30 @@ void SetRow_X86(uint8* dst, uint32 v32, int count) {
}
}
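// The movzx/mul pair above splats the byte across a 32-bit word
// (v8 * 0x01010101) so rep stosd can store four bytes per iteration.
// The same idea in C, assuming count is a multiple of 4 as documented:
#include <stdint.h>

static void set_row_c(uint8_t* dst, uint8_t v8, int count) {
  uint32_t v32 = (uint32_t)v8 * 0x01010101u;  // e.g. 0x5a -> 0x5a5a5a5a
  uint32_t* d = (uint32_t*)dst;
  for (int i = 0; i < count / 4; ++i) {
    d[i] = v32;
  }
}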
-// SetRow32 writes 'count' words using a 32 bit value repeated.
+// Write 'count' bytes using an 8 bit value repeated.
__declspec(naked) __declspec(align(16))
-void ARGBSetRows_X86(uint8* dst, uint32 v32, int width,
- int dst_stride, int height) {
+void SetRow_ERMS(uint8* dst, uint8 v8, int count) {
__asm {
- push esi
- push edi
- push ebp
- mov edi, [esp + 12 + 4] // dst
- mov eax, [esp + 12 + 8] // v32
- mov ebp, [esp + 12 + 12] // width
- mov edx, [esp + 12 + 16] // dst_stride
- mov esi, [esp + 12 + 20] // height
- lea ecx, [ebp * 4]
- sub edx, ecx // stride - width * 4
-
- align 4
- convertloop:
- mov ecx, ebp
- rep stosd
- add edi, edx
- sub esi, 1
- jg convertloop
+ mov edx, edi
+ mov edi, [esp + 4] // dst
+ mov eax, [esp + 8] // v8
+ mov ecx, [esp + 12] // count
+ rep stosb
+ mov edi, edx
+ ret
+ }
+}
- pop ebp
- pop edi
- pop esi
+// Write 'count' 32 bit values.
+__declspec(naked) __declspec(align(16))
+void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count) {
+ __asm {
+ mov edx, edi
+ mov edi, [esp + 4] // dst
+ mov eax, [esp + 8] // v32
+ mov ecx, [esp + 12] // count
+ rep stosd
+ mov edi, edx
ret
}
}
@@ -3918,7 +3091,6 @@ void YUY2ToYRow_AVX2(const uint8* src_yuy2,
vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
vpsrlw ymm5, ymm5, 8
- align 4
convertloop:
vmovdqu ymm0, [eax]
vmovdqu ymm1, [eax + 32]
@@ -3927,9 +3099,9 @@ void YUY2ToYRow_AVX2(const uint8* src_yuy2,
vpand ymm1, ymm1, ymm5
vpackuswb ymm0, ymm0, ymm1 // mutates.
vpermq ymm0, ymm0, 0xd8
- sub ecx, 32
vmovdqu [edx], ymm0
lea edx, [edx + 32]
+ sub ecx, 32
jg convertloop
vzeroupper
ret
@@ -3951,7 +3123,6 @@ void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
vpsrlw ymm5, ymm5, 8
sub edi, edx
- align 4
convertloop:
vmovdqu ymm0, [eax]
vmovdqu ymm1, [eax + 32]
@@ -3994,7 +3165,6 @@ void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
vpsrlw ymm5, ymm5, 8
sub edi, edx
- align 4
convertloop:
vmovdqu ymm0, [eax]
vmovdqu ymm1, [eax + 32]
@@ -4029,7 +3199,6 @@ void UYVYToYRow_AVX2(const uint8* src_uyvy,
mov edx, [esp + 8] // dst_y
mov ecx, [esp + 12] // pix
- align 4
convertloop:
vmovdqu ymm0, [eax]
vmovdqu ymm1, [eax + 32]
@@ -4038,12 +3207,12 @@ void UYVYToYRow_AVX2(const uint8* src_uyvy,
vpsrlw ymm1, ymm1, 8
vpackuswb ymm0, ymm0, ymm1 // mutates.
vpermq ymm0, ymm0, 0xd8
- sub ecx, 32
vmovdqu [edx], ymm0
lea edx, [edx + 32]
+ sub ecx, 32
jg convertloop
- ret
vzeroupper
+ ret
}
}
@@ -4062,7 +3231,6 @@ void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
vpsrlw ymm5, ymm5, 8
sub edi, edx
- align 4
convertloop:
vmovdqu ymm0, [eax]
vmovdqu ymm1, [eax + 32]
@@ -4105,7 +3273,6 @@ void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
vpsrlw ymm5, ymm5, 8
sub edi, edx
- align 4
convertloop:
vmovdqu ymm0, [eax]
vmovdqu ymm1, [eax + 32]
@@ -4144,114 +3311,6 @@ void YUY2ToYRow_SSE2(const uint8* src_yuy2,
pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
psrlw xmm5, 8
- align 4
- convertloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- lea eax, [eax + 32]
- pand xmm0, xmm5 // even bytes are Y
- pand xmm1, xmm5
- packuswb xmm0, xmm1
- sub ecx, 16
- movdqa [edx], xmm0
- lea edx, [edx + 16]
- jg convertloop
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
- uint8* dst_u, uint8* dst_v, int pix) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // src_yuy2
- mov esi, [esp + 8 + 8] // stride_yuy2
- mov edx, [esp + 8 + 12] // dst_u
- mov edi, [esp + 8 + 16] // dst_v
- mov ecx, [esp + 8 + 20] // pix
- pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
- psrlw xmm5, 8
- sub edi, edx
-
- align 4
- convertloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- movdqa xmm2, [eax + esi]
- movdqa xmm3, [eax + esi + 16]
- lea eax, [eax + 32]
- pavgb xmm0, xmm2
- pavgb xmm1, xmm3
- psrlw xmm0, 8 // YUYV -> UVUV
- psrlw xmm1, 8
- packuswb xmm0, xmm1
- movdqa xmm1, xmm0
- pand xmm0, xmm5 // U
- packuswb xmm0, xmm0
- psrlw xmm1, 8 // V
- packuswb xmm1, xmm1
- movq qword ptr [edx], xmm0
- movq qword ptr [edx + edi], xmm1
- lea edx, [edx + 8]
- sub ecx, 16
- jg convertloop
-
- pop edi
- pop esi
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
- uint8* dst_u, uint8* dst_v, int pix) {
- __asm {
- push edi
- mov eax, [esp + 4 + 4] // src_yuy2
- mov edx, [esp + 4 + 8] // dst_u
- mov edi, [esp + 4 + 12] // dst_v
- mov ecx, [esp + 4 + 16] // pix
- pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
- psrlw xmm5, 8
- sub edi, edx
-
- align 4
- convertloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- lea eax, [eax + 32]
- psrlw xmm0, 8 // YUYV -> UVUV
- psrlw xmm1, 8
- packuswb xmm0, xmm1
- movdqa xmm1, xmm0
- pand xmm0, xmm5 // U
- packuswb xmm0, xmm0
- psrlw xmm1, 8 // V
- packuswb xmm1, xmm1
- movq qword ptr [edx], xmm0
- movq qword ptr [edx + edi], xmm1
- lea edx, [edx + 8]
- sub ecx, 16
- jg convertloop
-
- pop edi
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
- uint8* dst_y, int pix) {
- __asm {
- mov eax, [esp + 4] // src_yuy2
- mov edx, [esp + 8] // dst_y
- mov ecx, [esp + 12] // pix
- pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
- psrlw xmm5, 8
-
- align 4
convertloop:
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
@@ -4259,17 +3318,17 @@ void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
pand xmm0, xmm5 // even bytes are Y
pand xmm1, xmm5
packuswb xmm0, xmm1
- sub ecx, 16
movdqu [edx], xmm0
lea edx, [edx + 16]
+ sub ecx, 16
jg convertloop
ret
}
}
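// Byte layout these rows rely on: YUY2 is Y0 U0 Y1 V0 Y2 U1 Y3 V1 ...,
// so the 0x00ff00ff mask keeps the even (Y) bytes and psrlw 8 keeps the
// odd (chroma) bytes; UYVY is the opposite. Scalar sketch of YUY2ToYRow:
#include <stdint.h>

static void yuy2_to_y_row(const uint8_t* src_yuy2, uint8_t* dst_y, int pix) {
  for (int i = 0; i < pix; ++i) {
    dst_y[i] = src_yuy2[2 * i];  // for UYVY, Y sits on the odd bytes instead
  }
}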
__declspec(naked) __declspec(align(16))
-void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, int stride_yuy2,
- uint8* dst_u, uint8* dst_v, int pix) {
+void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix) {
__asm {
push esi
push edi
@@ -4282,7 +3341,6 @@ void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, int stride_yuy2,
psrlw xmm5, 8
sub edi, edx
- align 4
convertloop:
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
@@ -4312,8 +3370,8 @@ void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, int stride_yuy2,
}
__declspec(naked) __declspec(align(16))
-void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2,
- uint8* dst_u, uint8* dst_v, int pix) {
+void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix) {
__asm {
push edi
mov eax, [esp + 4 + 4] // src_yuy2
@@ -4324,7 +3382,6 @@ void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2,
psrlw xmm5, 8
sub edi, edx
- align 4
convertloop:
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
@@ -4356,112 +3413,6 @@ void UYVYToYRow_SSE2(const uint8* src_uyvy,
mov edx, [esp + 8] // dst_y
mov ecx, [esp + 12] // pix
- align 4
- convertloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- lea eax, [eax + 32]
- psrlw xmm0, 8 // odd bytes are Y
- psrlw xmm1, 8
- packuswb xmm0, xmm1
- sub ecx, 16
- movdqa [edx], xmm0
- lea edx, [edx + 16]
- jg convertloop
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
- uint8* dst_u, uint8* dst_v, int pix) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // src_yuy2
- mov esi, [esp + 8 + 8] // stride_yuy2
- mov edx, [esp + 8 + 12] // dst_u
- mov edi, [esp + 8 + 16] // dst_v
- mov ecx, [esp + 8 + 20] // pix
- pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
- psrlw xmm5, 8
- sub edi, edx
-
- align 4
- convertloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- movdqa xmm2, [eax + esi]
- movdqa xmm3, [eax + esi + 16]
- lea eax, [eax + 32]
- pavgb xmm0, xmm2
- pavgb xmm1, xmm3
- pand xmm0, xmm5 // UYVY -> UVUV
- pand xmm1, xmm5
- packuswb xmm0, xmm1
- movdqa xmm1, xmm0
- pand xmm0, xmm5 // U
- packuswb xmm0, xmm0
- psrlw xmm1, 8 // V
- packuswb xmm1, xmm1
- movq qword ptr [edx], xmm0
- movq qword ptr [edx + edi], xmm1
- lea edx, [edx + 8]
- sub ecx, 16
- jg convertloop
-
- pop edi
- pop esi
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
- uint8* dst_u, uint8* dst_v, int pix) {
- __asm {
- push edi
- mov eax, [esp + 4 + 4] // src_yuy2
- mov edx, [esp + 4 + 8] // dst_u
- mov edi, [esp + 4 + 12] // dst_v
- mov ecx, [esp + 4 + 16] // pix
- pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
- psrlw xmm5, 8
- sub edi, edx
-
- align 4
- convertloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- lea eax, [eax + 32]
- pand xmm0, xmm5 // UYVY -> UVUV
- pand xmm1, xmm5
- packuswb xmm0, xmm1
- movdqa xmm1, xmm0
- pand xmm0, xmm5 // U
- packuswb xmm0, xmm0
- psrlw xmm1, 8 // V
- packuswb xmm1, xmm1
- movq qword ptr [edx], xmm0
- movq qword ptr [edx + edi], xmm1
- lea edx, [edx + 8]
- sub ecx, 16
- jg convertloop
-
- pop edi
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
- uint8* dst_y, int pix) {
- __asm {
- mov eax, [esp + 4] // src_uyvy
- mov edx, [esp + 8] // dst_y
- mov ecx, [esp + 12] // pix
-
- align 4
convertloop:
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
@@ -4469,17 +3420,17 @@ void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
psrlw xmm0, 8 // odd bytes are Y
psrlw xmm1, 8
packuswb xmm0, xmm1
- sub ecx, 16
movdqu [edx], xmm0
lea edx, [edx + 16]
+ sub ecx, 16
jg convertloop
ret
}
}
__declspec(naked) __declspec(align(16))
-void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
- uint8* dst_u, uint8* dst_v, int pix) {
+void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix) {
__asm {
push esi
push edi
@@ -4492,7 +3443,6 @@ void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
psrlw xmm5, 8
sub edi, edx
- align 4
convertloop:
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
@@ -4522,8 +3472,8 @@ void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
}
__declspec(naked) __declspec(align(16))
-void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy,
- uint8* dst_u, uint8* dst_v, int pix) {
+void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix) {
__asm {
push edi
mov eax, [esp + 4 + 4] // src_yuy2
@@ -4534,7 +3484,6 @@ void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy,
psrlw xmm5, 8
sub edi, edx
- align 4
convertloop:
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
@@ -4607,9 +3556,9 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
paddusb xmm0, xmm2 // + src argb
pand xmm1, xmm5 // a_g_ convert to 8 bits again
paddusb xmm0, xmm1 // + src argb
- sub ecx, 1
movd [edx], xmm0
lea edx, [edx + 4]
+ sub ecx, 1
jge alignloop1
alignloop1b:
@@ -4638,9 +3587,9 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
paddusb xmm0, xmm2 // + src argb
pand xmm1, xmm5 // a_g_ convert to 8 bits again
paddusb xmm0, xmm1 // + src argb
- sub ecx, 4
- movdqa [edx], xmm0
+ movdqu [edx], xmm0
lea edx, [edx + 16]
+ sub ecx, 4
jge convertloop4
convertloop4b:
@@ -4669,9 +3618,9 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
paddusb xmm0, xmm2 // + src argb
pand xmm1, xmm5 // a_g_ convert to 8 bits again
paddusb xmm0, xmm1 // + src argb
- sub ecx, 1
movd [edx], xmm0
lea edx, [edx + 4]
+ sub ecx, 1
jge convertloop1
convertloop1b:
@@ -4739,48 +3688,17 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
paddusb xmm0, xmm2 // + src argb
pand xmm1, xmm5 // a_g_ convert to 8 bits again
paddusb xmm0, xmm1 // + src argb
- sub ecx, 1
movd [edx], xmm0
lea edx, [edx + 4]
+ sub ecx, 1
jge alignloop1
alignloop1b:
add ecx, 1 - 4
jl convertloop4b
- test eax, 15 // unaligned?
- jne convertuloop4
- test esi, 15 // unaligned?
- jne convertuloop4
-
// 4 pixel loop.
convertloop4:
- movdqa xmm3, [eax] // src argb
- lea eax, [eax + 16]
- movdqa xmm0, xmm3 // src argb
- pxor xmm3, xmm4 // ~alpha
- movdqa xmm2, [esi] // _r_b
- pshufb xmm3, kShuffleAlpha // alpha
- pand xmm2, xmm6 // _r_b
- paddw xmm3, xmm7 // 256 - alpha
- pmullw xmm2, xmm3 // _r_b * alpha
- movdqa xmm1, [esi] // _a_g
- lea esi, [esi + 16]
- psrlw xmm1, 8 // _a_g
- por xmm0, xmm4 // set alpha to 255
- pmullw xmm1, xmm3 // _a_g * alpha
- psrlw xmm2, 8 // _r_b convert to 8 bits again
- paddusb xmm0, xmm2 // + src argb
- pand xmm1, xmm5 // a_g_ convert to 8 bits again
- paddusb xmm0, xmm1 // + src argb
- sub ecx, 4
- movdqa [edx], xmm0
- lea edx, [edx + 16]
- jge convertloop4
- jmp convertloop4b
-
- // 4 pixel unaligned loop.
- convertuloop4:
movdqu xmm3, [eax] // src argb
lea eax, [eax + 16]
movdqa xmm0, xmm3 // src argb
@@ -4799,10 +3717,10 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
paddusb xmm0, xmm2 // + src argb
pand xmm1, xmm5 // a_g_ convert to 8 bits again
paddusb xmm0, xmm1 // + src argb
- sub ecx, 4
- movdqa [edx], xmm0
+ movdqu [edx], xmm0
lea edx, [edx + 16]
- jge convertuloop4
+ sub ecx, 4
+ jge convertloop4
convertloop4b:
add ecx, 4 - 1
@@ -4828,9 +3746,9 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
paddusb xmm0, xmm2 // + src argb
pand xmm1, xmm5 // a_g_ convert to 8 bits again
paddusb xmm0, xmm1 // + src argb
- sub ecx, 1
movd [edx], xmm0
lea edx, [edx + 4]
+ sub ecx, 1
jge convertloop1
convertloop1b:
@@ -4842,7 +3760,6 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
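// The blend loops above compute "over" in 8.8 fixed point with the source
// alpha inverted and bumped to 256: out = src + dst * (256 - a) / 256, and
// the result alpha is forced opaque. Per-channel scalar sketch:
#include <stdint.h>

static uint8_t blend_channel(uint8_t src, uint8_t dst, uint8_t src_a) {
  uint32_t v = src + ((dst * (256 - src_a)) >> 8);  // pmullw + psrlw 8
  return v > 255 ? 255 : (uint8_t)v;                // paddusb saturates
}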
#ifdef HAS_ARGBATTENUATEROW_SSE2
// Attenuate 4 pixels at a time.
-// Aligned to 16 bytes.
__declspec(naked) __declspec(align(16))
void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
__asm {
@@ -4854,19 +3771,18 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
pcmpeqb xmm5, xmm5 // generate mask 0x00ffffff
psrld xmm5, 8
- align 4
convertloop:
- movdqa xmm0, [eax] // read 4 pixels
+ movdqu xmm0, [eax] // read 4 pixels
punpcklbw xmm0, xmm0 // first 2
pshufhw xmm2, xmm0, 0FFh // 8 alpha words
pshuflw xmm2, xmm2, 0FFh
pmulhuw xmm0, xmm2 // rgb * a
- movdqa xmm1, [eax] // read 4 pixels
+ movdqu xmm1, [eax] // read 4 pixels
punpckhbw xmm1, xmm1 // next 2 pixels
pshufhw xmm2, xmm1, 0FFh // 8 alpha words
pshuflw xmm2, xmm2, 0FFh
pmulhuw xmm1, xmm2 // rgb * a
- movdqa xmm2, [eax] // alphas
+ movdqu xmm2, [eax] // alphas
lea eax, [eax + 16]
psrlw xmm0, 8
pand xmm2, xmm4
@@ -4874,9 +3790,9 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
packuswb xmm0, xmm1
pand xmm0, xmm5 // keep original alphas
por xmm0, xmm2
- sub ecx, 4
- movdqa [edx], xmm0
+ movdqu [edx], xmm0
lea edx, [edx + 16]
+ sub ecx, 4
jg convertloop
ret
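
For reference, a scalar model of the attenuation (alpha premultiply) this loop approximates, assuming BGRA byte order: punpcklbw against itself widens each byte v to the word v*257, and pmulhuw plus the final psrlw keep the top bits, which works out to roughly rgb * alpha / 255 (illustrative helper only):

  static void AttenuatePixel(const unsigned char src[4], unsigned char dst[4]) {
    unsigned int a = src[3];
    for (int c = 0; c < 3; ++c)   // (v*257 * a*257) >> 24  ~=  v * a / 255
      dst[c] = (unsigned char)((src[c] * 257u * (a * 257u)) >> 24);
    dst[3] = (unsigned char)a;    // pand/por keep the original alphas
  }
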
@@ -4904,7 +3820,6 @@ void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
movdqa xmm4, kShuffleAlpha0
movdqa xmm5, kShuffleAlpha1
- align 4
convertloop:
movdqu xmm0, [eax] // read 4 pixels
pshufb xmm0, xmm4 // isolate first 2 alphas
@@ -4923,9 +3838,9 @@ void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
psrlw xmm1, 8
packuswb xmm0, xmm1
por xmm0, xmm2 // copy original alpha
- sub ecx, 4
movdqu [edx], xmm0
lea edx, [edx + 16]
+ sub ecx, 4
jg convertloop
ret
@@ -4935,11 +3850,8 @@ void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
#ifdef HAS_ARGBATTENUATEROW_AVX2
// Shuffle table duplicating alpha.
-static const ulvec8 kShuffleAlpha_AVX2 = {
- 6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u,
- 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u,
- 6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u,
- 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u,
+static const uvec8 kShuffleAlpha_AVX2 = {
+ 6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u
};
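
The table change above pairs with the vbroadcastf128 load in the next hunk: the 32-byte ulvec8 (pattern written out twice, loaded with vmovdqa and therefore 32-byte aligned) becomes a 16-byte uvec8 that is duplicated into both lanes of ymm4 at load time. A rough intrinsics equivalent, assuming AVX2 headers (the asm uses the float-domain vbroadcastf128; the integer-domain intrinsic is shown):

  #include <immintrin.h>

  static __m256i LoadShuffleAlpha(const unsigned char tbl16[16]) {
    __m128i pattern = _mm_loadu_si128((const __m128i*)tbl16);
    return _mm256_broadcastsi128_si256(pattern);  // same 16 bytes in both lanes
  }
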
__declspec(naked) __declspec(align(16))
void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
@@ -4948,11 +3860,10 @@ void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
mov edx, [esp + 8] // dst_argb
mov ecx, [esp + 12] // width
sub edx, eax
- vmovdqa ymm4, kShuffleAlpha_AVX2
+ vbroadcastf128 ymm4, kShuffleAlpha_AVX2
vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000
vpslld ymm5, ymm5, 24
- align 4
convertloop:
vmovdqu ymm6, [eax] // read 8 pixels.
vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated.
@@ -4966,9 +3877,9 @@ void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
vpsrlw ymm1, ymm1, 8
vpackuswb ymm0, ymm0, ymm1 // unmutated.
vpor ymm0, ymm0, ymm6 // copy original alpha
- sub ecx, 8
vmovdqu [eax + edx], ymm0
lea eax, [eax + 32]
+ sub ecx, 8
jg convertloop
vzeroupper
@@ -4979,7 +3890,6 @@ void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
#ifdef HAS_ARGBUNATTENUATEROW_SSE2
// Unattenuate 4 pixels at a time.
-// Aligned to 16 bytes.
__declspec(naked) __declspec(align(16))
void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
int width) {
@@ -4990,7 +3900,6 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
mov edx, [esp + 8 + 8] // dst_argb
mov ecx, [esp + 8 + 12] // width
- align 4
convertloop:
movdqu xmm0, [eax] // read 4 pixels
movzx esi, byte ptr [eax + 3] // first alpha
@@ -5016,9 +3925,9 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
lea eax, [eax + 16]
packuswb xmm0, xmm1
- sub ecx, 4
movdqu [edx], xmm0
lea edx, [edx + 16]
+ sub ecx, 4
jg convertloop
pop edi
pop esi
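
Unattenuation is the inverse operation: the loop above looks up a fixed-point reciprocal of the alpha byte (libyuv's fixed_invtbl8) and multiplies it back in. A scalar sketch of the intended result, ignoring the table's rounding details (helper name illustrative):

  // rgb' = min(255, rgb * 255 / alpha); alpha itself is preserved.
  static void UnattenuatePixel(const unsigned char src[4], unsigned char dst[4]) {
    unsigned int a = src[3];
    for (int c = 0; c < 3; ++c) {
      unsigned int v = a ? (src[c] * 255u) / a : src[c];
      dst[c] = v > 255u ? 255u : v;
    }
    dst[3] = (unsigned char)a;
  }
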
@@ -5029,9 +3938,8 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
#ifdef HAS_ARGBUNATTENUATEROW_AVX2
// Shuffle table duplicating alpha.
-static const ulvec8 kUnattenShuffleAlpha_AVX2 = {
- 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15,
- 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15,
+static const uvec8 kUnattenShuffleAlpha_AVX2 = {
+ 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u
};
// TODO(fbarchard): Enable USE_GATHER for future hardware if faster.
// USE_GATHER is not on by default, due to being a slow instruction.
@@ -5044,9 +3952,8 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
mov edx, [esp + 8] // dst_argb
mov ecx, [esp + 12] // width
sub edx, eax
- vmovdqa ymm4, kUnattenShuffleAlpha_AVX2
+ vbroadcastf128 ymm4, kUnattenShuffleAlpha_AVX2
- align 4
convertloop:
vmovdqu ymm6, [eax] // read 8 pixels.
vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xffffffff for gather.
@@ -5061,9 +3968,9 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
vpmulhuw ymm0, ymm0, ymm2 // rgb * ia
vpmulhuw ymm1, ymm1, ymm3 // rgb * ia
vpackuswb ymm0, ymm0, ymm1 // unmutated.
- sub ecx, 8
vmovdqu [eax + edx], ymm0
lea eax, [eax + 32]
+ sub ecx, 8
jg convertloop
vzeroupper
@@ -5080,12 +3987,11 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
mov edx, [esp + 8] // dst_argb
mov ecx, [esp + 12] // width
sub edx, eax
- vmovdqa ymm5, kUnattenShuffleAlpha_AVX2
+ vbroadcastf128 ymm5, kUnattenShuffleAlpha_AVX2
push esi
push edi
- align 4
convertloop:
// replace VPGATHER
movzx esi, byte ptr [eax + 3] // alpha0
@@ -5123,9 +4029,9 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
vpmulhuw ymm0, ymm0, ymm2 // rgb * ia
vpmulhuw ymm1, ymm1, ymm3 // rgb * ia
vpackuswb ymm0, ymm0, ymm1 // unmutated.
- sub ecx, 8
vmovdqu [eax + edx], ymm0
lea eax, [eax + 32]
+ sub ecx, 8
jg convertloop
pop edi
@@ -5148,18 +4054,17 @@ void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
movdqa xmm4, kARGBToYJ
movdqa xmm5, kAddYJ64
- align 4
convertloop:
- movdqa xmm0, [eax] // G
- movdqa xmm1, [eax + 16]
+ movdqu xmm0, [eax] // G
+ movdqu xmm1, [eax + 16]
pmaddubsw xmm0, xmm4
pmaddubsw xmm1, xmm4
phaddw xmm0, xmm1
paddw xmm0, xmm5 // Add .5 for rounding.
psrlw xmm0, 7
packuswb xmm0, xmm0 // 8 G bytes
- movdqa xmm2, [eax] // A
- movdqa xmm3, [eax + 16]
+ movdqu xmm2, [eax] // A
+ movdqu xmm3, [eax + 16]
lea eax, [eax + 32]
psrld xmm2, 24
psrld xmm3, 24
@@ -5171,10 +4076,10 @@ void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
movdqa xmm1, xmm0
punpcklwd xmm0, xmm3 // GGGA first 4
punpckhwd xmm1, xmm3 // GGGA next 4
- sub ecx, 8
- movdqa [edx], xmm0
- movdqa [edx + 16], xmm1
+ movdqu [edx], xmm0
+ movdqu [edx + 16], xmm1
lea edx, [edx + 32]
+ sub ecx, 8
jg convertloop
ret
}
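
ARGBGrayRow computes a JPEG-range luma with pmaddubsw and replicates it into B, G and R while keeping alpha. Assuming the usual kARGBToYJ weights (15, 75, 38 for B, G, R, summing to 128) and BGRA byte order, the per-pixel math is:

  // Gray = (B*15 + G*75 + R*38 + 64) >> 7, then the pixel becomes (G, G, G, A).
  static unsigned char GrayFromARGB(const unsigned char bgra[4]) {
    return (unsigned char)((bgra[0] * 15 + bgra[1] * 75 + bgra[2] * 38 + 64) >> 7);
  }
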
@@ -5208,32 +4113,31 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
movdqa xmm3, kARGBToSepiaG
movdqa xmm4, kARGBToSepiaR
- align 4
convertloop:
- movdqa xmm0, [eax] // B
- movdqa xmm6, [eax + 16]
+ movdqu xmm0, [eax] // B
+ movdqu xmm6, [eax + 16]
pmaddubsw xmm0, xmm2
pmaddubsw xmm6, xmm2
phaddw xmm0, xmm6
psrlw xmm0, 7
packuswb xmm0, xmm0 // 8 B values
- movdqa xmm5, [eax] // G
- movdqa xmm1, [eax + 16]
+ movdqu xmm5, [eax] // G
+ movdqu xmm1, [eax + 16]
pmaddubsw xmm5, xmm3
pmaddubsw xmm1, xmm3
phaddw xmm5, xmm1
psrlw xmm5, 7
packuswb xmm5, xmm5 // 8 G values
punpcklbw xmm0, xmm5 // 8 BG values
- movdqa xmm5, [eax] // R
- movdqa xmm1, [eax + 16]
+ movdqu xmm5, [eax] // R
+ movdqu xmm1, [eax + 16]
pmaddubsw xmm5, xmm4
pmaddubsw xmm1, xmm4
phaddw xmm5, xmm1
psrlw xmm5, 7
packuswb xmm5, xmm5 // 8 R values
- movdqa xmm6, [eax] // A
- movdqa xmm1, [eax + 16]
+ movdqu xmm6, [eax] // A
+ movdqu xmm1, [eax + 16]
psrld xmm6, 24
psrld xmm1, 24
packuswb xmm6, xmm1
@@ -5242,10 +4146,10 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
movdqa xmm1, xmm0 // Weave BG, RA together
punpcklwd xmm0, xmm5 // BGRA first 4
punpckhwd xmm1, xmm5 // BGRA next 4
- sub ecx, 8
- movdqa [eax], xmm0
- movdqa [eax + 16], xmm1
+ movdqu [eax], xmm0
+ movdqu [eax + 16], xmm1
lea eax, [eax + 32]
+ sub ecx, 8
jg convertloop
ret
}
@@ -5271,14 +4175,13 @@ void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
pshufd xmm5, xmm5, 0xff
mov ecx, [esp + 16] /* width */
- align 4
convertloop:
- movdqa xmm0, [eax] // B
- movdqa xmm7, [eax + 16]
+ movdqu xmm0, [eax] // B
+ movdqu xmm7, [eax + 16]
pmaddubsw xmm0, xmm2
pmaddubsw xmm7, xmm2
- movdqa xmm6, [eax] // G
- movdqa xmm1, [eax + 16]
+ movdqu xmm6, [eax] // G
+ movdqu xmm1, [eax + 16]
pmaddubsw xmm6, xmm3
pmaddubsw xmm1, xmm3
phaddsw xmm0, xmm7 // B
@@ -5288,13 +4191,13 @@ void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
packuswb xmm0, xmm0 // 8 B values
packuswb xmm6, xmm6 // 8 G values
punpcklbw xmm0, xmm6 // 8 BG values
- movdqa xmm1, [eax] // R
- movdqa xmm7, [eax + 16]
+ movdqu xmm1, [eax] // R
+ movdqu xmm7, [eax + 16]
pmaddubsw xmm1, xmm4
pmaddubsw xmm7, xmm4
phaddsw xmm1, xmm7 // R
- movdqa xmm6, [eax] // A
- movdqa xmm7, [eax + 16]
+ movdqu xmm6, [eax] // A
+ movdqu xmm7, [eax + 16]
pmaddubsw xmm6, xmm5
pmaddubsw xmm7, xmm5
phaddsw xmm6, xmm7 // A
@@ -5306,11 +4209,11 @@ void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
movdqa xmm6, xmm0 // Weave BG, RA together
punpcklwd xmm0, xmm1 // BGRA first 4
punpckhwd xmm6, xmm1 // BGRA next 4
- sub ecx, 8
- movdqa [edx], xmm0
- movdqa [edx + 16], xmm6
+ movdqu [edx], xmm0
+ movdqu [edx + 16], xmm6
lea eax, [eax + 32]
lea edx, [edx + 32]
+ sub ecx, 8
jg convertloop
ret
}
@@ -5319,7 +4222,6 @@ void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
#ifdef HAS_ARGBQUANTIZEROW_SSE2
// Quantize 4 ARGB pixels (16 bytes).
-// Aligned to 16 bytes.
__declspec(naked) __declspec(align(16))
void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
int interval_offset, int width) {
@@ -5339,25 +4241,24 @@ void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
pcmpeqb xmm6, xmm6 // generate mask 0xff000000
pslld xmm6, 24
- align 4
convertloop:
- movdqa xmm0, [eax] // read 4 pixels
+ movdqu xmm0, [eax] // read 4 pixels
punpcklbw xmm0, xmm5 // first 2 pixels
pmulhuw xmm0, xmm2 // pixel * scale >> 16
- movdqa xmm1, [eax] // read 4 pixels
+ movdqu xmm1, [eax] // read 4 pixels
punpckhbw xmm1, xmm5 // next 2 pixels
pmulhuw xmm1, xmm2
pmullw xmm0, xmm3 // * interval_size
- movdqa xmm7, [eax] // read 4 pixels
+ movdqu xmm7, [eax] // read 4 pixels
pmullw xmm1, xmm3
pand xmm7, xmm6 // mask alpha
paddw xmm0, xmm4 // + interval_size / 2
paddw xmm1, xmm4
packuswb xmm0, xmm1
por xmm0, xmm7
- sub ecx, 4
- movdqa [eax], xmm0
+ movdqu [eax], xmm0
lea eax, [eax + 16]
+ sub ecx, 4
jg convertloop
ret
}
@@ -5366,7 +4267,6 @@ void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
#ifdef HAS_ARGBSHADEROW_SSE2
// Shade 4 pixels at a time by specified value.
-// Aligned to 16 bytes.
__declspec(naked) __declspec(align(16))
void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
uint32 value) {
@@ -5378,9 +4278,8 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
punpcklbw xmm2, xmm2
punpcklqdq xmm2, xmm2
- align 4
convertloop:
- movdqa xmm0, [eax] // read 4 pixels
+ movdqu xmm0, [eax] // read 4 pixels
lea eax, [eax + 16]
movdqa xmm1, xmm0
punpcklbw xmm0, xmm0 // first 2
@@ -5390,9 +4289,9 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
psrlw xmm0, 8
psrlw xmm1, 8
packuswb xmm0, xmm1
- sub ecx, 4
- movdqa [edx], xmm0
+ movdqu [edx], xmm0
lea edx, [edx + 16]
+ sub ecx, 4
jg convertloop
ret
@@ -5413,7 +4312,6 @@ void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
mov ecx, [esp + 4 + 16] // width
pxor xmm5, xmm5 // constant 0
- align 4
convertloop:
movdqu xmm0, [eax] // read 4 pixels from src_argb0
movdqu xmm2, [esi] // read 4 pixels from src_argb1
@@ -5428,9 +4326,9 @@ void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
lea eax, [eax + 16]
lea esi, [esi + 16]
packuswb xmm0, xmm1
- sub ecx, 4
movdqu [edx], xmm0
lea edx, [edx + 16]
+ sub ecx, 4
jg convertloop
pop esi
@@ -5455,16 +4353,15 @@ void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
sub ecx, 4
jl convertloop49
- align 4
convertloop4:
movdqu xmm0, [eax] // read 4 pixels from src_argb0
lea eax, [eax + 16]
movdqu xmm1, [esi] // read 4 pixels from src_argb1
lea esi, [esi + 16]
paddusb xmm0, xmm1 // src_argb0 + src_argb1
- sub ecx, 4
movdqu [edx], xmm0
lea edx, [edx + 16]
+ sub ecx, 4
jge convertloop4
convertloop49:
@@ -5477,9 +4374,9 @@ void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
movd xmm1, [esi] // read 1 pixel from src_argb1

lea esi, [esi + 4]
paddusb xmm0, xmm1 // src_argb0 + src_argb1
- sub ecx, 1
movd [edx], xmm0
lea edx, [edx + 4]
+ sub ecx, 1
jge convertloop1
convertloop19:
@@ -5501,16 +4398,15 @@ void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
mov edx, [esp + 4 + 12] // dst_argb
mov ecx, [esp + 4 + 16] // width
- align 4
convertloop:
movdqu xmm0, [eax] // read 4 pixels from src_argb0
lea eax, [eax + 16]
movdqu xmm1, [esi] // read 4 pixels from src_argb1
lea esi, [esi + 16]
psubusb xmm0, xmm1 // src_argb0 - src_argb1
- sub ecx, 4
movdqu [edx], xmm0
lea edx, [edx + 16]
+ sub ecx, 4
jg convertloop
pop esi
@@ -5532,7 +4428,6 @@ void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
mov ecx, [esp + 4 + 16] // width
vpxor ymm5, ymm5, ymm5 // constant 0
- align 4
convertloop:
vmovdqu ymm1, [eax] // read 8 pixels from src_argb0
lea eax, [eax + 32]
@@ -5569,7 +4464,6 @@ void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
mov edx, [esp + 4 + 12] // dst_argb
mov ecx, [esp + 4 + 16] // width
- align 4
convertloop:
vmovdqu ymm0, [eax] // read 8 pixels from src_argb0
lea eax, [eax + 32]
@@ -5599,7 +4493,6 @@ void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
mov edx, [esp + 4 + 12] // dst_argb
mov ecx, [esp + 4 + 16] // width
- align 4
convertloop:
vmovdqu ymm0, [eax] // read 8 pixels from src_argb0
lea eax, [eax + 32]
@@ -5638,7 +4531,6 @@ void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
sub edx, eax
pxor xmm5, xmm5 // constant 0
- align 4
convertloop:
movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0]
movq xmm1, qword ptr [eax + 2] // read 8 pixels from src_y0[2]
@@ -5662,9 +4554,9 @@ void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
psubw xmm1, xmm0
pmaxsw xmm0, xmm1
packuswb xmm0, xmm0
- sub ecx, 8
movq qword ptr [eax + edx], xmm0
lea eax, [eax + 8]
+ sub ecx, 8
jg convertloop
pop edi
@@ -5692,7 +4584,6 @@ void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
sub edx, eax
pxor xmm5, xmm5 // constant 0
- align 4
convertloop:
movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0]
movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0]
@@ -5716,9 +4607,9 @@ void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
psubw xmm1, xmm0
pmaxsw xmm0, xmm1
packuswb xmm0, xmm0
- sub ecx, 8
movq qword ptr [eax + edx], xmm0
lea eax, [eax + 8]
+ sub ecx, 8
jg convertloop
pop esi
@@ -5746,10 +4637,9 @@ void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
pcmpeqb xmm5, xmm5 // alpha 255
pslld xmm5, 24 // 0xff000000
- align 4
convertloop:
- movdqa xmm0, [eax] // read 16 pixels src_sobelx
- movdqa xmm1, [eax + esi] // read 16 pixels src_sobely
+ movdqu xmm0, [eax] // read 16 pixels src_sobelx
+ movdqu xmm1, [eax + esi] // read 16 pixels src_sobely
lea eax, [eax + 16]
paddusb xmm0, xmm1 // sobel = sobelx + sobely
movdqa xmm2, xmm0 // GG
@@ -5765,12 +4655,12 @@ void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
punpckhwd xmm0, xmm0 // Last 4
por xmm3, xmm5 // GGGA
por xmm0, xmm5
- sub ecx, 16
- movdqa [edx], xmm1
- movdqa [edx + 16], xmm2
- movdqa [edx + 32], xmm3
- movdqa [edx + 48], xmm0
+ movdqu [edx], xmm1
+ movdqu [edx + 16], xmm2
+ movdqu [edx + 32], xmm3
+ movdqu [edx + 48], xmm0
lea edx, [edx + 64]
+ sub ecx, 16
jg convertloop
pop esi
@@ -5792,15 +4682,14 @@ void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
mov ecx, [esp + 4 + 16] // width
sub esi, eax
- align 4
convertloop:
- movdqa xmm0, [eax] // read 16 pixels src_sobelx
- movdqa xmm1, [eax + esi] // read 16 pixels src_sobely
+ movdqu xmm0, [eax] // read 16 pixels src_sobelx
+ movdqu xmm1, [eax + esi] // read 16 pixels src_sobely
lea eax, [eax + 16]
paddusb xmm0, xmm1 // sobel = sobelx + sobely
- sub ecx, 16
- movdqa [edx], xmm0
+ movdqu [edx], xmm0
lea edx, [edx + 16]
+ sub ecx, 16
jg convertloop
pop esi
@@ -5827,10 +4716,9 @@ void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
sub esi, eax
pcmpeqb xmm5, xmm5 // alpha 255
- align 4
convertloop:
- movdqa xmm0, [eax] // read 16 pixels src_sobelx
- movdqa xmm1, [eax + esi] // read 16 pixels src_sobely
+ movdqu xmm0, [eax] // read 16 pixels src_sobelx
+ movdqu xmm1, [eax + esi] // read 16 pixels src_sobely
lea eax, [eax + 16]
movdqa xmm2, xmm0
paddusb xmm2, xmm1 // sobel = sobelx + sobely
@@ -5846,12 +4734,12 @@ void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
movdqa xmm7, xmm1 // YSXA
punpcklwd xmm7, xmm0 // Next 4
punpckhwd xmm1, xmm0 // Last 4
- sub ecx, 16
- movdqa [edx], xmm6
- movdqa [edx + 16], xmm4
- movdqa [edx + 32], xmm7
- movdqa [edx + 48], xmm1
+ movdqu [edx], xmm6
+ movdqu [edx + 16], xmm4
+ movdqu [edx + 32], xmm7
+ movdqu [edx + 48], xmm1
lea edx, [edx + 64]
+ sub ecx, 16
jg convertloop
pop esi
@@ -5872,8 +4760,7 @@ void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
// area is the number of pixels in the area being averaged.
// dst points to pixel to store result to.
// count is number of averaged pixels to produce.
-// Does 4 pixels at a time, requires CumulativeSum pointers to be 16 byte
-// aligned.
+// Does 4 pixels at a time.
void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
int width, int area, uint8* dst,
int count) {
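
The loops below evaluate a box filter from a summed-area table: the sum over any rectangle is TL - TR - BL + BR, taken per channel. A scalar model, assuming width is the box width in pixels with four int32 per pixel (matching the asm's [eax + edx * 4] addressing):

  static void CumulativeSumToAverage_C(const int* topleft, const int* botleft,
                                       int width, int area, unsigned char* dst,
                                       int count) {
    for (int i = 0; i < count; ++i) {
      for (int c = 0; c < 4; ++c) {
        int sum = topleft[c] - topleft[width * 4 + c]   // TL - TR
                - botleft[c] + botleft[width * 4 + c];  // - BL + BR
        dst[c] = (unsigned char)(sum / area);
      }
      topleft += 4;
      botleft += 4;
      dst += 4;
    }
  }
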
@@ -5903,13 +4790,12 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
packssdw xmm5, xmm5 // 16 bit shorts
// 4 pixel loop small blocks.
- align 4
s4:
// top left
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- movdqa xmm2, [eax + 32]
- movdqa xmm3, [eax + 48]
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
// - top right
psubd xmm0, [eax + edx * 4]
@@ -5946,13 +4832,12 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
jmp l4b
// 4 pixel loop
- align 4
l4:
// top left
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- movdqa xmm2, [eax + 32]
- movdqa xmm3, [eax + 48]
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
// - top right
psubd xmm0, [eax + edx * 4]
@@ -5999,9 +4884,8 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
jl l1b
// 1 pixel loop
- align 4
l1:
- movdqa xmm0, [eax]
+ movdqu xmm0, [eax]
psubd xmm0, [eax + edx * 4]
lea eax, [eax + 16]
psubd xmm0, [esi]
@@ -6040,7 +4924,6 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
jne l4b
// 4 pixel loop
- align 4
l4:
movdqu xmm2, [eax] // 4 argb pixels 16 bytes.
lea eax, [eax + 16]
@@ -6057,26 +4940,26 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
punpckhwd xmm5, xmm1
paddd xmm0, xmm2
- movdqa xmm2, [esi] // previous row above.
+ movdqu xmm2, [esi] // previous row above.
paddd xmm2, xmm0
paddd xmm0, xmm3
- movdqa xmm3, [esi + 16]
+ movdqu xmm3, [esi + 16]
paddd xmm3, xmm0
paddd xmm0, xmm4
- movdqa xmm4, [esi + 32]
+ movdqu xmm4, [esi + 32]
paddd xmm4, xmm0
paddd xmm0, xmm5
- movdqa xmm5, [esi + 48]
+ movdqu xmm5, [esi + 48]
lea esi, [esi + 64]
paddd xmm5, xmm0
- movdqa [edx], xmm2
- movdqa [edx + 16], xmm3
- movdqa [edx + 32], xmm4
- movdqa [edx + 48], xmm5
+ movdqu [edx], xmm2
+ movdqu [edx + 16], xmm3
+ movdqu [edx + 32], xmm4
+ movdqu [edx + 48], xmm5
lea edx, [edx + 64]
sub ecx, 4
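
ComputeCumulativeSumRow builds the table the averaging loop consumes: each output is a running left-to-right sum of this row's bytes plus the finished row above. A scalar version, close to libyuv's own C fallback:

  static void ComputeCumulativeSumRow_C(const unsigned char* row, int* cumsum,
                                        const int* previous_cumsum, int width) {
    int sum[4] = {0, 0, 0, 0};
    for (int x = 0; x < width; ++x) {
      for (int c = 0; c < 4; ++c) {
        sum[c] += row[x * 4 + c];
        cumsum[x * 4 + c] = sum[c] + previous_cumsum[x * 4 + c];
      }
    }
  }
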
@@ -6087,7 +4970,6 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
jl l1b
// 1 pixel loop
- align 4
l1:
movd xmm2, dword ptr [eax] // 1 argb pixel 4 bytes.
lea eax, [eax + 4]
@@ -6142,7 +5024,6 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
addps xmm4, xmm4 // dudv *= 4
// 4 pixel loop
- align 4
l4:
cvttps2dq xmm0, xmm2 // x, y float to int first 2
cvttps2dq xmm1, xmm3 // x, y float to int next 2
@@ -6164,9 +5045,9 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
movd xmm0, [eax + edi] // read pixel 3
punpckldq xmm6, xmm0 // combine pixel 2 and 3
addps xmm3, xmm4 // x, y += dx, dy next 2
- sub ecx, 4
movq qword ptr 8[edx], xmm6
lea edx, [edx + 16]
+ sub ecx, 4
jge l4
l4b:
@@ -6174,7 +5055,6 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
jl l1b
// 1 pixel loop
- align 4
l1:
cvttps2dq xmm0, xmm2 // x, y float to int
packssdw xmm0, xmm0 // x, y as shorts
@@ -6182,9 +5062,9 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
addps xmm2, xmm7 // x, y += dx, dy
movd esi, xmm0
movd xmm0, [eax + esi] // copy a pixel
- sub ecx, 1
movd [edx], xmm0
lea edx, [edx + 4]
+ sub ecx, 1
jge l1
l1b:
pop edi
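
ARGBAffineRow walks a (u, v) source coordinate by a constant (du, dv) per output pixel and point-samples the source, which is what the cvttps2dq/addps pairs above do four at a time. A scalar sketch using the same uv_dudv = {u, v, du, dv} convention:

  static void ARGBAffineRow_C(const unsigned char* src_argb, int src_stride,
                              unsigned char* dst_argb, const float* uv_dudv,
                              int width) {
    float u = uv_dudv[0], v = uv_dudv[1];
    for (int i = 0; i < width; ++i) {
      int x = (int)u, y = (int)v;  // truncate like cvttps2dq
      const unsigned char* p = src_argb + y * src_stride + x * 4;
      for (int c = 0; c < 4; ++c) dst_argb[i * 4 + c] = p[c];
      u += uv_dudv[2];
      v += uv_dudv[3];
    }
  }
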
@@ -6195,11 +5075,11 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
#endif // HAS_ARGBAFFINEROW_SSE2
#ifdef HAS_INTERPOLATEROW_AVX2
-// Bilinear filter 16x2 -> 16x1
+// Bilinear filter 32x2 -> 32x1
__declspec(naked) __declspec(align(16))
void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
- ptrdiff_t src_stride, int dst_width,
- int source_y_fraction) {
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) {
__asm {
push esi
push edi
@@ -6229,7 +5109,6 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
vpxor ymm0, ymm0, ymm0
vpermd ymm5, ymm0, ymm5
- align 4
xloop:
vmovdqu ymm0, [esi]
vmovdqu ymm2, [esi + edx]
@@ -6240,51 +5119,49 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
vpsrlw ymm0, ymm0, 7
vpsrlw ymm1, ymm1, 7
vpackuswb ymm0, ymm0, ymm1 // unmutates
- sub ecx, 32
- vmovdqu [esi + edi], ymm0
- lea esi, [esi + 32]
- jg xloop
- jmp xloop99
-
- // Blend 25 / 75.
- align 4
- xloop25:
- vmovdqu ymm0, [esi]
- vpavgb ymm0, ymm0, [esi + edx]
- vpavgb ymm0, ymm0, [esi + edx]
- sub ecx, 32
- vmovdqu [esi + edi], ymm0
- lea esi, [esi + 32]
- jg xloop25
- jmp xloop99
-
- // Blend 50 / 50.
- align 4
- xloop50:
- vmovdqu ymm0, [esi]
- vpavgb ymm0, ymm0, [esi + edx]
- sub ecx, 32
vmovdqu [esi + edi], ymm0
lea esi, [esi + 32]
- jg xloop50
- jmp xloop99
-
- // Blend 75 / 25.
- align 4
- xloop75:
- vmovdqu ymm0, [esi + edx]
- vpavgb ymm0, ymm0, [esi]
- vpavgb ymm0, ymm0, [esi]
sub ecx, 32
- vmovdqu [esi + edi], ymm0
- lea esi, [esi + 32]
- jg xloop75
+ jg xloop
jmp xloop99
- // Blend 100 / 0 - Copy row unchanged.
- align 4
- xloop100:
- rep movsb
+ // Blend 25 / 75.
+ xloop25:
+ vmovdqu ymm0, [esi]
+ vmovdqu ymm1, [esi + edx]
+ vpavgb ymm0, ymm0, ymm1
+ vpavgb ymm0, ymm0, ymm1
+ vmovdqu [esi + edi], ymm0
+ lea esi, [esi + 32]
+ sub ecx, 32
+ jg xloop25
+ jmp xloop99
+
+ // Blend 50 / 50.
+ xloop50:
+ vmovdqu ymm0, [esi]
+ vpavgb ymm0, ymm0, [esi + edx]
+ vmovdqu [esi + edi], ymm0
+ lea esi, [esi + 32]
+ sub ecx, 32
+ jg xloop50
+ jmp xloop99
+
+ // Blend 75 / 25.
+ xloop75:
+ vmovdqu ymm1, [esi]
+ vmovdqu ymm0, [esi + edx]
+ vpavgb ymm0, ymm0, ymm1
+ vpavgb ymm0, ymm0, ymm1
+ vmovdqu [esi + edi], ymm0
+ lea esi, [esi + 32]
+ sub ecx, 32
+ jg xloop75
+ jmp xloop99
+
+ // Blend 100 / 0 - Copy row unchanged.
+ xloop100:
+ rep movsb
xloop99:
pop edi
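
All the row interpolators in this file, AVX2 included, blend two rows with an 8-bit fraction and special-case 0, 25, 50, 75 and 100 percent with pavgb or a plain copy; the patch keeps only the unaligned loads and, as elsewhere, moves `sub ecx` after the store. The general case reduces to this scalar model:

  #include <stddef.h>

  // dst = (row0 * (256 - f) + row1 * f) >> 8, f = source_y_fraction in 0..255.
  static void InterpolateRow_Model(unsigned char* dst, const unsigned char* src,
                                   ptrdiff_t src_stride, int width, int f) {
    const unsigned char* row1 = src + src_stride;
    for (int x = 0; x < width; ++x)
      dst[x] = (unsigned char)((src[x] * (256 - f) + row1[x] * f) >> 8);
  }
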
@@ -6295,7 +5172,6 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
}
#endif // HAS_INTERPOLATEROW_AVX2
-#ifdef HAS_INTERPOLATEROW_SSSE3
// Bilinear filter 16x2 -> 16x1
__declspec(naked) __declspec(align(16))
void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
@@ -6329,226 +5205,6 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
punpcklwd xmm5, xmm5
pshufd xmm5, xmm5, 0
- align 4
- xloop:
- movdqa xmm0, [esi]
- movdqa xmm2, [esi + edx]
- movdqa xmm1, xmm0
- punpcklbw xmm0, xmm2
- punpckhbw xmm1, xmm2
- pmaddubsw xmm0, xmm5
- pmaddubsw xmm1, xmm5
- psrlw xmm0, 7
- psrlw xmm1, 7
- packuswb xmm0, xmm1
- sub ecx, 16
- movdqa [esi + edi], xmm0
- lea esi, [esi + 16]
- jg xloop
- jmp xloop99
-
- // Blend 25 / 75.
- align 4
- xloop25:
- movdqa xmm0, [esi]
- movdqa xmm1, [esi + edx]
- pavgb xmm0, xmm1
- pavgb xmm0, xmm1
- sub ecx, 16
- movdqa [esi + edi], xmm0
- lea esi, [esi + 16]
- jg xloop25
- jmp xloop99
-
- // Blend 50 / 50.
- align 4
- xloop50:
- movdqa xmm0, [esi]
- movdqa xmm1, [esi + edx]
- pavgb xmm0, xmm1
- sub ecx, 16
- movdqa [esi + edi], xmm0
- lea esi, [esi + 16]
- jg xloop50
- jmp xloop99
-
- // Blend 75 / 25.
- align 4
- xloop75:
- movdqa xmm1, [esi]
- movdqa xmm0, [esi + edx]
- pavgb xmm0, xmm1
- pavgb xmm0, xmm1
- sub ecx, 16
- movdqa [esi + edi], xmm0
- lea esi, [esi + 16]
- jg xloop75
- jmp xloop99
-
- // Blend 100 / 0 - Copy row unchanged.
- align 4
- xloop100:
- movdqa xmm0, [esi]
- sub ecx, 16
- movdqa [esi + edi], xmm0
- lea esi, [esi + 16]
- jg xloop100
-
- xloop99:
- pop edi
- pop esi
- ret
- }
-}
-#endif // HAS_INTERPOLATEROW_SSSE3
-
-#ifdef HAS_INTERPOLATEROW_SSE2
-// Bilinear filter 16x2 -> 16x1
-__declspec(naked) __declspec(align(16))
-void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
- ptrdiff_t src_stride, int dst_width,
- int source_y_fraction) {
- __asm {
- push esi
- push edi
- mov edi, [esp + 8 + 4] // dst_ptr
- mov esi, [esp + 8 + 8] // src_ptr
- mov edx, [esp + 8 + 12] // src_stride
- mov ecx, [esp + 8 + 16] // dst_width
- mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
- sub edi, esi
- // Dispatch to specialized filters if applicable.
- cmp eax, 0
- je xloop100 // 0 / 256. Blend 100 / 0.
- cmp eax, 64
- je xloop75 // 64 / 256 is 0.25. Blend 75 / 25.
- cmp eax, 128
- je xloop50 // 128 / 256 is 0.50. Blend 50 / 50.
- cmp eax, 192
- je xloop25 // 192 / 256 is 0.75. Blend 25 / 75.
-
- movd xmm5, eax // xmm5 = y fraction
- punpcklbw xmm5, xmm5
- psrlw xmm5, 1
- punpcklwd xmm5, xmm5
- punpckldq xmm5, xmm5
- punpcklqdq xmm5, xmm5
- pxor xmm4, xmm4
-
- align 4
- xloop:
- movdqa xmm0, [esi] // row0
- movdqa xmm2, [esi + edx] // row1
- movdqa xmm1, xmm0
- movdqa xmm3, xmm2
- punpcklbw xmm2, xmm4
- punpckhbw xmm3, xmm4
- punpcklbw xmm0, xmm4
- punpckhbw xmm1, xmm4
- psubw xmm2, xmm0 // row1 - row0
- psubw xmm3, xmm1
- paddw xmm2, xmm2 // 9 bits * 15 bits = 8.16
- paddw xmm3, xmm3
- pmulhw xmm2, xmm5 // scale diff
- pmulhw xmm3, xmm5
- paddw xmm0, xmm2 // sum rows
- paddw xmm1, xmm3
- packuswb xmm0, xmm1
- sub ecx, 16
- movdqa [esi + edi], xmm0
- lea esi, [esi + 16]
- jg xloop
- jmp xloop99
-
- // Blend 25 / 75.
- align 4
- xloop25:
- movdqa xmm0, [esi]
- movdqa xmm1, [esi + edx]
- pavgb xmm0, xmm1
- pavgb xmm0, xmm1
- sub ecx, 16
- movdqa [esi + edi], xmm0
- lea esi, [esi + 16]
- jg xloop25
- jmp xloop99
-
- // Blend 50 / 50.
- align 4
- xloop50:
- movdqa xmm0, [esi]
- movdqa xmm1, [esi + edx]
- pavgb xmm0, xmm1
- sub ecx, 16
- movdqa [esi + edi], xmm0
- lea esi, [esi + 16]
- jg xloop50
- jmp xloop99
-
- // Blend 75 / 25.
- align 4
- xloop75:
- movdqa xmm1, [esi]
- movdqa xmm0, [esi + edx]
- pavgb xmm0, xmm1
- pavgb xmm0, xmm1
- sub ecx, 16
- movdqa [esi + edi], xmm0
- lea esi, [esi + 16]
- jg xloop75
- jmp xloop99
-
- // Blend 100 / 0 - Copy row unchanged.
- align 4
- xloop100:
- movdqa xmm0, [esi]
- sub ecx, 16
- movdqa [esi + edi], xmm0
- lea esi, [esi + 16]
- jg xloop100
-
- xloop99:
- pop edi
- pop esi
- ret
- }
-}
-#endif // HAS_INTERPOLATEROW_SSE2
-
-// Bilinear filter 16x2 -> 16x1
-__declspec(naked) __declspec(align(16))
-void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
- ptrdiff_t src_stride, int dst_width,
- int source_y_fraction) {
- __asm {
- push esi
- push edi
- mov edi, [esp + 8 + 4] // dst_ptr
- mov esi, [esp + 8 + 8] // src_ptr
- mov edx, [esp + 8 + 12] // src_stride
- mov ecx, [esp + 8 + 16] // dst_width
- mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
- sub edi, esi
- shr eax, 1
- // Dispatch to specialized filters if applicable.
- cmp eax, 0
- je xloop100 // 0 / 128. Blend 100 / 0.
- cmp eax, 32
- je xloop75 // 32 / 128 is 0.25. Blend 75 / 25.
- cmp eax, 64
- je xloop50 // 64 / 128 is 0.50. Blend 50 / 50.
- cmp eax, 96
- je xloop25 // 96 / 128 is 0.75. Blend 25 / 75.
-
- movd xmm0, eax // high fraction 0..127
- neg eax
- add eax, 128
- movd xmm5, eax // low fraction 128..1
- punpcklbw xmm5, xmm0
- punpcklwd xmm5, xmm5
- pshufd xmm5, xmm5, 0
-
- align 4
xloop:
movdqu xmm0, [esi]
movdqu xmm2, [esi + edx]
@@ -6560,57 +5216,53 @@ void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
psrlw xmm0, 7
psrlw xmm1, 7
packuswb xmm0, xmm1
- sub ecx, 16
movdqu [esi + edi], xmm0
lea esi, [esi + 16]
+ sub ecx, 16
jg xloop
jmp xloop99
// Blend 25 / 75.
- align 4
xloop25:
movdqu xmm0, [esi]
movdqu xmm1, [esi + edx]
pavgb xmm0, xmm1
pavgb xmm0, xmm1
- sub ecx, 16
movdqu [esi + edi], xmm0
lea esi, [esi + 16]
+ sub ecx, 16
jg xloop25
jmp xloop99
// Blend 50 / 50.
- align 4
xloop50:
movdqu xmm0, [esi]
movdqu xmm1, [esi + edx]
pavgb xmm0, xmm1
- sub ecx, 16
movdqu [esi + edi], xmm0
lea esi, [esi + 16]
+ sub ecx, 16
jg xloop50
jmp xloop99
// Blend 75 / 25.
- align 4
xloop75:
movdqu xmm1, [esi]
movdqu xmm0, [esi + edx]
pavgb xmm0, xmm1
pavgb xmm0, xmm1
- sub ecx, 16
movdqu [esi + edi], xmm0
lea esi, [esi + 16]
+ sub ecx, 16
jg xloop75
jmp xloop99
// Blend 100 / 0 - Copy row unchanged.
- align 4
xloop100:
movdqu xmm0, [esi]
- sub ecx, 16
movdqu [esi + edi], xmm0
lea esi, [esi + 16]
+ sub ecx, 16
jg xloop100
xloop99:
@@ -6623,9 +5275,9 @@ void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
#ifdef HAS_INTERPOLATEROW_SSE2
// Bilinear filter 16x2 -> 16x1
__declspec(naked) __declspec(align(16))
-void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr,
- ptrdiff_t src_stride, int dst_width,
- int source_y_fraction) {
+void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) {
__asm {
push esi
push edi
@@ -6653,7 +5305,6 @@ void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr,
punpcklqdq xmm5, xmm5
pxor xmm4, xmm4
- align 4
xloop:
movdqu xmm0, [esi] // row0
movdqu xmm2, [esi + edx] // row1
@@ -6672,57 +5323,53 @@ void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr,
paddw xmm0, xmm2 // sum rows
paddw xmm1, xmm3
packuswb xmm0, xmm1
- sub ecx, 16
movdqu [esi + edi], xmm0
lea esi, [esi + 16]
+ sub ecx, 16
jg xloop
jmp xloop99
// Blend 25 / 75.
- align 4
xloop25:
movdqu xmm0, [esi]
movdqu xmm1, [esi + edx]
pavgb xmm0, xmm1
pavgb xmm0, xmm1
- sub ecx, 16
movdqu [esi + edi], xmm0
lea esi, [esi + 16]
+ sub ecx, 16
jg xloop25
jmp xloop99
// Blend 50 / 50.
- align 4
xloop50:
movdqu xmm0, [esi]
movdqu xmm1, [esi + edx]
pavgb xmm0, xmm1
- sub ecx, 16
movdqu [esi + edi], xmm0
lea esi, [esi + 16]
+ sub ecx, 16
jg xloop50
jmp xloop99
// Blend 75 / 25.
- align 4
xloop75:
movdqu xmm1, [esi]
movdqu xmm0, [esi + edx]
pavgb xmm0, xmm1
pavgb xmm0, xmm1
- sub ecx, 16
movdqu [esi + edi], xmm0
lea esi, [esi + 16]
+ sub ecx, 16
jg xloop75
jmp xloop99
// Blend 100 / 0 - Copy row unchanged.
- align 4
xloop100:
movdqu xmm0, [esi]
- sub ecx, 16
movdqu [esi + edi], xmm0
lea esi, [esi + 16]
+ sub ecx, 16
jg xloop100
xloop99:
@@ -6733,84 +5380,6 @@ void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr,
}
#endif // HAS_INTERPOLATEROW_SSE2
-__declspec(naked) __declspec(align(16))
-void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
- uint8* dst_uv, int pix) {
- __asm {
- push edi
- mov eax, [esp + 4 + 4] // src_uv
- mov edx, [esp + 4 + 8] // src_uv_stride
- mov edi, [esp + 4 + 12] // dst_v
- mov ecx, [esp + 4 + 16] // pix
- sub edi, eax
-
- align 4
- convertloop:
- movdqa xmm0, [eax]
- pavgb xmm0, [eax + edx]
- sub ecx, 16
- movdqa [eax + edi], xmm0
- lea eax, [eax + 16]
- jg convertloop
- pop edi
- ret
- }
-}
-
-#ifdef HAS_HALFROW_AVX2
-__declspec(naked) __declspec(align(16))
-void HalfRow_AVX2(const uint8* src_uv, int src_uv_stride,
- uint8* dst_uv, int pix) {
- __asm {
- push edi
- mov eax, [esp + 4 + 4] // src_uv
- mov edx, [esp + 4 + 8] // src_uv_stride
- mov edi, [esp + 4 + 12] // dst_v
- mov ecx, [esp + 4 + 16] // pix
- sub edi, eax
-
- align 4
- convertloop:
- vmovdqu ymm0, [eax]
- vpavgb ymm0, ymm0, [eax + edx]
- sub ecx, 32
- vmovdqu [eax + edi], ymm0
- lea eax, [eax + 32]
- jg convertloop
-
- pop edi
- vzeroupper
- ret
- }
-}
-#endif // HAS_HALFROW_AVX2
-
-__declspec(naked) __declspec(align(16))
-void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
- uint32 selector, int pix) {
- __asm {
- mov eax, [esp + 4] // src_argb
- mov edx, [esp + 8] // dst_bayer
- movd xmm5, [esp + 12] // selector
- mov ecx, [esp + 16] // pix
- pshufd xmm5, xmm5, 0
-
- align 4
- wloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- lea eax, [eax + 32]
- pshufb xmm0, xmm5
- pshufb xmm1, xmm5
- punpckldq xmm0, xmm1
- sub ecx, 8
- movq qword ptr [edx], xmm0
- lea edx, [edx + 8]
- jg wloop
- ret
- }
-}
-
// Specialized ARGB to Bayer that just isolates G channel.
__declspec(naked) __declspec(align(16))
void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer,
@@ -6823,10 +5392,9 @@ void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer,
pcmpeqb xmm5, xmm5 // generate mask 0x000000ff
psrld xmm5, 24
- align 4
wloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
lea eax, [eax + 32]
psrld xmm0, 8 // Move green to bottom.
psrld xmm1, 8
@@ -6834,9 +5402,9 @@ void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer,
pand xmm1, xmm5
packssdw xmm0, xmm1
packuswb xmm0, xmm1
- sub ecx, 8
movq qword ptr [edx], xmm0
lea edx, [edx + 8]
+ sub ecx, 8
jg wloop
ret
}
@@ -6850,46 +5418,19 @@ void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
mov eax, [esp + 4] // src_argb
mov edx, [esp + 8] // dst_argb
mov ecx, [esp + 12] // shuffler
- movdqa xmm5, [ecx]
- mov ecx, [esp + 16] // pix
-
- align 4
- wloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- lea eax, [eax + 32]
- pshufb xmm0, xmm5
- pshufb xmm1, xmm5
- sub ecx, 8
- movdqa [edx], xmm0
- movdqa [edx + 16], xmm1
- lea edx, [edx + 32]
- jg wloop
- ret
- }
-}
-
-__declspec(naked) __declspec(align(16))
-void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb,
- const uint8* shuffler, int pix) {
- __asm {
- mov eax, [esp + 4] // src_argb
- mov edx, [esp + 8] // dst_argb
- mov ecx, [esp + 12] // shuffler
- movdqa xmm5, [ecx]
+ movdqu xmm5, [ecx]
mov ecx, [esp + 16] // pix
- align 4
wloop:
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
lea eax, [eax + 32]
pshufb xmm0, xmm5
pshufb xmm1, xmm5
- sub ecx, 8
movdqu [edx], xmm0
movdqu [edx + 16], xmm1
lea edx, [edx + 32]
+ sub ecx, 8
jg wloop
ret
}
@@ -6906,17 +5447,16 @@ void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
vbroadcastf128 ymm5, [ecx] // same shuffle in high as low.
mov ecx, [esp + 16] // pix
- align 4
wloop:
vmovdqu ymm0, [eax]
vmovdqu ymm1, [eax + 32]
lea eax, [eax + 64]
vpshufb ymm0, ymm0, ymm5
vpshufb ymm1, ymm1, ymm5
- sub ecx, 16
vmovdqu [edx], ymm0
vmovdqu [edx + 32], ymm1
lea edx, [edx + 64]
+ sub ecx, 16
jg wloop
vzeroupper
@@ -6967,7 +5507,6 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
jg shuf_any1
jmp shuf99
- align 4
shuf_0123:
movdqu xmm0, [eax]
lea eax, [eax + 16]
@@ -6979,13 +5518,12 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
pshufhw xmm1, xmm1, 01Bh
pshuflw xmm1, xmm1, 01Bh
packuswb xmm0, xmm1
- sub ecx, 4
movdqu [edx], xmm0
lea edx, [edx + 16]
+ sub ecx, 4
jg shuf_0123
jmp shuf99
- align 4
shuf_0321:
movdqu xmm0, [eax]
lea eax, [eax + 16]
@@ -6997,13 +5535,12 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
pshufhw xmm1, xmm1, 039h
pshuflw xmm1, xmm1, 039h
packuswb xmm0, xmm1
- sub ecx, 4
movdqu [edx], xmm0
lea edx, [edx + 16]
+ sub ecx, 4
jg shuf_0321
jmp shuf99
- align 4
shuf_2103:
movdqu xmm0, [eax]
lea eax, [eax + 16]
@@ -7015,13 +5552,12 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
pshufhw xmm1, xmm1, 093h
pshuflw xmm1, xmm1, 093h
packuswb xmm0, xmm1
- sub ecx, 4
movdqu [edx], xmm0
lea edx, [edx + 16]
+ sub ecx, 4
jg shuf_2103
jmp shuf99
- align 4
shuf_3012:
movdqu xmm0, [eax]
lea eax, [eax + 16]
@@ -7033,9 +5569,9 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
pshufhw xmm1, xmm1, 0C6h
pshuflw xmm1, xmm1, 0C6h
packuswb xmm0, xmm1
- sub ecx, 4
movdqu [edx], xmm0
lea edx, [edx + 16]
+ sub ecx, 4
jg shuf_3012
shuf99:
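
The four fixed shufflers above cover the common channel permutations without SSSE3: after punpcklbw widens bytes to words, each 2-bit field of the pshuflw/pshufhw immediate selects one source word, so 0x1B reverses the four channels and 0x39, 0x93 and 0xC6 rotate or swap them. A scalar way to read those immediates (illustrative, not libyuv API):

  // dst word c comes from src word (imm >> (2 * c)) & 3; e.g. imm 0x1B
  // yields the map {3, 2, 1, 0}, a full channel reversal.
  static void ShufflePixel(const unsigned char src[4], unsigned int imm,
                           unsigned char dst[4]) {
    for (int c = 0; c < 4; ++c)
      dst[c] = src[(imm >> (2 * c)) & 3];
  }
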
@@ -7066,7 +5602,6 @@ void I422ToYUY2Row_SSE2(const uint8* src_y,
mov ecx, [esp + 8 + 20] // width
sub edx, esi
- align 4
convertloop:
movq xmm2, qword ptr [esi] // U
movq xmm3, qword ptr [esi + edx] // V
@@ -7104,7 +5639,6 @@ void I422ToUYVYRow_SSE2(const uint8* src_y,
mov ecx, [esp + 8 + 20] // width
sub edx, esi
- align 4
convertloop:
movq xmm2, qword ptr [esi] // U
movq xmm3, qword ptr [esi + edx] // V
@@ -7141,7 +5675,6 @@ void ARGBPolynomialRow_SSE2(const uint8* src_argb,
pxor xmm3, xmm3 // 0 constant for zero extending bytes to ints.
// 2 pixel loop.
- align 4
convertloop:
// pmovzxbd xmm0, dword ptr [eax] // BGRA pixel
// pmovzxbd xmm4, dword ptr [eax + 4] // BGRA pixel
@@ -7177,9 +5710,9 @@ void ARGBPolynomialRow_SSE2(const uint8* src_argb,
cvttps2dq xmm4, xmm4
packuswb xmm0, xmm4
packuswb xmm0, xmm0
- sub ecx, 2
movq qword ptr [edx], xmm0
lea edx, [edx + 8]
+ sub ecx, 2
jg convertloop
pop esi
ret
@@ -7203,7 +5736,6 @@ void ARGBPolynomialRow_AVX2(const uint8* src_argb,
mov ecx, [esp + 16] /* width */
// 2 pixel loop.
- align 4
convertloop:
vpmovzxbd ymm0, qword ptr [eax] // 2 BGRA pixels
lea eax, [eax + 8]
@@ -7217,9 +5749,9 @@ void ARGBPolynomialRow_AVX2(const uint8* src_argb,
vpackusdw ymm0, ymm0, ymm0 // b0g0r0a0_00000000_b0g0r0a0_00000000
vpermq ymm0, ymm0, 0xd8 // b0g0r0a0_b0g0r0a0_00000000_00000000
vpackuswb xmm0, xmm0, xmm0 // bgrabgra_00000000_00000000_00000000
- sub ecx, 2
vmovq qword ptr [edx], xmm0
lea edx, [edx + 8]
+ sub ecx, 2
jg convertloop
vzeroupper
ret
@@ -7239,7 +5771,6 @@ void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
mov ecx, [esp + 4 + 12] /* width */
// 1 pixel loop.
- align 4
convertloop:
movzx edx, byte ptr [eax]
lea eax, [eax + 4]
@@ -7273,7 +5804,6 @@ void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
mov ecx, [esp + 4 + 12] /* width */
// 1 pixel loop.
- align 4
convertloop:
movzx edx, byte ptr [eax]
lea eax, [eax + 4]
@@ -7315,7 +5845,6 @@ void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
pxor xmm5, xmm5
// 4 pixel loop.
- align 4
convertloop:
movdqu xmm0, qword ptr [eax] // generate luma ptr
pmaddubsw xmm0, xmm3
@@ -7382,9 +5911,9 @@ void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
movzx edx, byte ptr [eax + 15] // copy alpha.
mov byte ptr [edi + 15], dl
- sub ecx, 4
lea eax, [eax + 16]
lea edi, [edi + 16]
+ sub ecx, 4
jg convertloop
pop edi
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/scale.cc b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/scale.cc
index 5b33b5f048d..482c5a61e35 100644
--- a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/scale.cc
+++ b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/scale.cc
@@ -57,20 +57,15 @@ static void ScalePlaneDown2(int src_width, int src_height,
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 16)) {
ScaleRowDown2 = filtering ? ScaleRowDown2Box_NEON : ScaleRowDown2_NEON;
}
-#elif defined(HAS_SCALEROWDOWN2_SSE2)
+#endif
+#if defined(HAS_SCALEROWDOWN2_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 16)) {
- ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Unaligned_SSE2 :
- (filtering == kFilterLinear ? ScaleRowDown2Linear_Unaligned_SSE2 :
- ScaleRowDown2Box_Unaligned_SSE2);
- if (IS_ALIGNED(src_ptr, 16) &&
- IS_ALIGNED(src_stride, 16) && IS_ALIGNED(row_stride, 16) &&
- IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
- ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_SSE2 :
- (filtering == kFilterLinear ? ScaleRowDown2Linear_SSE2 :
- ScaleRowDown2Box_SSE2);
- }
+ ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_SSE2 :
+ (filtering == kFilterLinear ? ScaleRowDown2Linear_SSE2 :
+ ScaleRowDown2Box_SSE2);
}
-#elif defined(HAS_SCALEROWDOWN2_MIPS_DSPR2)
+#endif
+#if defined(HAS_SCALEROWDOWN2_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(src_ptr, 4) &&
IS_ALIGNED(src_stride, 4) && IS_ALIGNED(row_stride, 4) &&
IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
@@ -112,21 +107,15 @@ static void ScalePlaneDown2_16(int src_width, int src_height,
ScaleRowDown2 = filtering ? ScaleRowDown2Box_16_NEON :
ScaleRowDown2_16_NEON;
}
-#elif defined(HAS_SCALEROWDOWN2_16_SSE2)
+#endif
+#if defined(HAS_SCALEROWDOWN2_16_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 16)) {
- ScaleRowDown2 = filtering == kFilterNone ?
- ScaleRowDown2_Unaligned_16_SSE2 :
- (filtering == kFilterLinear ? ScaleRowDown2Linear_Unaligned_16_SSE2 :
- ScaleRowDown2Box_Unaligned_16_SSE2);
- if (IS_ALIGNED(src_ptr, 16) &&
- IS_ALIGNED(src_stride, 16) && IS_ALIGNED(row_stride, 16) &&
- IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
- ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_16_SSE2 :
- (filtering == kFilterLinear ? ScaleRowDown2Linear_16_SSE2 :
- ScaleRowDown2Box_16_SSE2);
- }
+ ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_16_SSE2 :
+ (filtering == kFilterLinear ? ScaleRowDown2Linear_16_SSE2 :
+ ScaleRowDown2Box_16_SSE2);
}
-#elif defined(HAS_SCALEROWDOWN2_16_MIPS_DSPR2)
+#endif
+#if defined(HAS_SCALEROWDOWN2_16_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(src_ptr, 4) &&
IS_ALIGNED(src_stride, 4) && IS_ALIGNED(row_stride, 4) &&
IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
@@ -168,13 +157,13 @@ static void ScalePlaneDown4(int src_width, int src_height,
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8)) {
ScaleRowDown4 = filtering ? ScaleRowDown4Box_NEON : ScaleRowDown4_NEON;
}
-#elif defined(HAS_SCALEROWDOWN4_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) &&
- IS_ALIGNED(dst_width, 8) && IS_ALIGNED(row_stride, 16) &&
- IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
+#endif
+#if defined(HAS_SCALEROWDOWN4_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
ScaleRowDown4 = filtering ? ScaleRowDown4Box_SSE2 : ScaleRowDown4_SSE2;
}
-#elif defined(HAS_SCALEROWDOWN4_MIPS_DSPR2)
+#endif
+#if defined(HAS_SCALEROWDOWN4_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(row_stride, 4) &&
IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
@@ -212,14 +201,14 @@ static void ScalePlaneDown4_16(int src_width, int src_height,
ScaleRowDown4 = filtering ? ScaleRowDown4Box_16_NEON :
ScaleRowDown4_16_NEON;
}
-#elif defined(HAS_SCALEROWDOWN4_16_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) &&
- IS_ALIGNED(dst_width, 8) && IS_ALIGNED(row_stride, 16) &&
- IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
+#endif
+#if defined(HAS_SCALEROWDOWN4_16_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
ScaleRowDown4 = filtering ? ScaleRowDown4Box_16_SSE2 :
ScaleRowDown4_16_SSE2;
}
-#elif defined(HAS_SCALEROWDOWN4_16_MIPS_DSPR2)
+#endif
+#if defined(HAS_SCALEROWDOWN4_16_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(row_stride, 4) &&
IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
@@ -271,8 +260,7 @@ static void ScalePlaneDown34(int src_width, int src_height,
}
#endif
#if defined(HAS_SCALEROWDOWN34_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0) &&
- IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
+ if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0)) {
if (!filtering) {
ScaleRowDown34_0 = ScaleRowDown34_SSSE3;
ScaleRowDown34_1 = ScaleRowDown34_SSSE3;
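
From here on, the scale.cc hunks repeat one pattern: each SIMD path gets its own #if/#endif block instead of an #elif chain, so every compiled-in backend is considered in turn, and the src/dst pointer and stride alignment tests disappear because the row functions above are now unaligned-safe. A sketch of the resulting dispatch shape, with hypothetical row names (TestCpuFlag and IS_ALIGNED are the real libyuv helpers):

  #include <stddef.h>
  #include <stdint.h>

  typedef void (*ScaleRowFn)(const uint8_t* src, ptrdiff_t stride,
                             uint8_t* dst, int dst_width);
  void ScaleRow_C(const uint8_t*, ptrdiff_t, uint8_t*, int);     // fallback
  void ScaleRow_NEON(const uint8_t*, ptrdiff_t, uint8_t*, int);  // hypothetical
  void ScaleRow_SSE2(const uint8_t*, ptrdiff_t, uint8_t*, int);  // hypothetical

  static ScaleRowFn PickScaleRow(int dst_width) {
    ScaleRowFn fn = ScaleRow_C;  // portable C first, SIMD only if it applies
  #if defined(HAS_SCALEROW_NEON)
    if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 16)) fn = ScaleRow_NEON;
  #endif
  #if defined(HAS_SCALEROW_SSE2)
    if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 16)) fn = ScaleRow_SSE2;
  #endif
    return fn;
  }
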
@@ -351,8 +339,7 @@ static void ScalePlaneDown34_16(int src_width, int src_height,
}
#endif
#if defined(HAS_SCALEROWDOWN34_16_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0) &&
- IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
+ if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0)) {
if (!filtering) {
ScaleRowDown34_0 = ScaleRowDown34_16_SSSE3;
ScaleRowDown34_1 = ScaleRowDown34_16_SSSE3;
@@ -445,9 +432,9 @@ static void ScalePlaneDown38(int src_width, int src_height,
ScaleRowDown38_2 = ScaleRowDown38_2_Box_NEON;
}
}
-#elif defined(HAS_SCALEROWDOWN38_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0) &&
- IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
+#endif
+#if defined(HAS_SCALEROWDOWN38_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0)) {
if (!filtering) {
ScaleRowDown38_3 = ScaleRowDown38_SSSE3;
ScaleRowDown38_2 = ScaleRowDown38_SSSE3;
@@ -456,7 +443,8 @@ static void ScalePlaneDown38(int src_width, int src_height,
ScaleRowDown38_2 = ScaleRowDown38_2_Box_SSSE3;
}
}
-#elif defined(HAS_SCALEROWDOWN38_MIPS_DSPR2)
+#endif
+#if defined(HAS_SCALEROWDOWN38_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && (dst_width % 12 == 0) &&
IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
@@ -522,9 +510,9 @@ static void ScalePlaneDown38_16(int src_width, int src_height,
ScaleRowDown38_2 = ScaleRowDown38_2_Box_16_NEON;
}
}
-#elif defined(HAS_SCALEROWDOWN38_16_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0) &&
- IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
+#endif
+#if defined(HAS_SCALEROWDOWN38_16_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0)) {
if (!filtering) {
ScaleRowDown38_3 = ScaleRowDown38_16_SSSE3;
ScaleRowDown38_2 = ScaleRowDown38_16_SSSE3;
@@ -533,7 +521,8 @@ static void ScalePlaneDown38_16(int src_width, int src_height,
ScaleRowDown38_2 = ScaleRowDown38_2_Box_16_SSSE3;
}
}
-#elif defined(HAS_SCALEROWDOWN38_16_MIPS_DSPR2)
+#endif
+#if defined(HAS_SCALEROWDOWN38_16_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && (dst_width % 12 == 0) &&
IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
@@ -758,11 +747,11 @@ static void ScalePlaneBox(int src_width, int src_height,
uint16* dst_ptr, int src_width, int src_height) = ScaleAddRows_C;
#if defined(HAS_SCALEADDROWS_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) &&
+ if (TestCpuFlag(kCpuHasSSE2)
#ifdef AVOID_OVERREAD
- IS_ALIGNED(src_width, 16) &&
+ && IS_ALIGNED(src_width, 16)
#endif
- IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
+ ) {
ScaleAddRows = ScaleAddRows_SSE2;
}
#endif
@@ -830,11 +819,11 @@ static void ScalePlaneBox_16(int src_width, int src_height,
uint32* dst_ptr, int src_width, int src_height) = ScaleAddRows_16_C;
#if defined(HAS_SCALEADDROWS_16_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) &&
+ if (TestCpuFlag(kCpuHasSSE2)
#ifdef AVOID_OVERREAD
- IS_ALIGNED(src_width, 16) &&
+ && IS_ALIGNED(src_width, 16)
#endif
- IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
+ ) {
ScaleAddRows = ScaleAddRows_16_SSE2;
}
#endif
@@ -886,29 +875,23 @@ void ScalePlaneBilinearDown(int src_width, int src_height,
src_width = Abs(src_width);
#if defined(HAS_INTERPOLATEROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && src_width >= 16) {
+ if (TestCpuFlag(kCpuHasSSE2)) {
InterpolateRow = InterpolateRow_Any_SSE2;
if (IS_ALIGNED(src_width, 16)) {
- InterpolateRow = InterpolateRow_Unaligned_SSE2;
- if (IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
- InterpolateRow = InterpolateRow_SSE2;
- }
+ InterpolateRow = InterpolateRow_SSE2;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && src_width >= 16) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
InterpolateRow = InterpolateRow_Any_SSSE3;
if (IS_ALIGNED(src_width, 16)) {
- InterpolateRow = InterpolateRow_Unaligned_SSSE3;
- if (IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
- InterpolateRow = InterpolateRow_SSSE3;
- }
+ InterpolateRow = InterpolateRow_SSSE3;
}
}
#endif
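
The simplification above also drops the src_width >= 16 pre-check and the _Unaligned_ middle tier: the Any_ variant now covers every width, and the exact variant is chosen only when the width is a multiple of the vector step. A hypothetical sketch of what an Any_ wrapper does (the real ones are macro-generated, and libyuv spells the byte type uint8):

  #include <stddef.h>
  #include <stdint.h>

  void InterpolateRow_SSE2(uint8_t*, const uint8_t*, ptrdiff_t, int, int);
  void InterpolateRow_C(uint8_t*, const uint8_t*, ptrdiff_t, int, int);

  static void InterpolateRow_Any(uint8_t* dst, const uint8_t* src,
                                 ptrdiff_t stride, int width, int frac) {
    int n = width & ~15;  // widest multiple of the 16-pixel SIMD step
    if (n) InterpolateRow_SSE2(dst, src, stride, n, frac);
    InterpolateRow_C(dst + n, src + n, stride, width - n, frac);  // C tail
  }
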
#if defined(HAS_INTERPOLATEROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && src_width >= 32) {
+ if (TestCpuFlag(kCpuHasAVX2)) {
InterpolateRow = InterpolateRow_Any_AVX2;
if (IS_ALIGNED(src_width, 32)) {
InterpolateRow = InterpolateRow_AVX2;
@@ -916,7 +899,7 @@ void ScalePlaneBilinearDown(int src_width, int src_height,
}
#endif
#if defined(HAS_INTERPOLATEROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && src_width >= 16) {
+ if (TestCpuFlag(kCpuHasNEON)) {
InterpolateRow = InterpolateRow_Any_NEON;
if (IS_ALIGNED(src_width, 16)) {
InterpolateRow = InterpolateRow_NEON;
@@ -924,7 +907,7 @@ void ScalePlaneBilinearDown(int src_width, int src_height,
}
#endif
#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2)
- if (TestCpuFlag(kCpuHasMIPS_DSPR2) && src_width >= 4) {
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2)) {
InterpolateRow = InterpolateRow_Any_MIPS_DSPR2;
if (IS_ALIGNED(src_width, 4)) {
InterpolateRow = InterpolateRow_MIPS_DSPR2;
@@ -988,29 +971,23 @@ void ScalePlaneBilinearDown_16(int src_width, int src_height,
src_width = Abs(src_width);
#if defined(HAS_INTERPOLATEROW_16_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && src_width >= 16) {
+ if (TestCpuFlag(kCpuHasSSE2)) {
InterpolateRow = InterpolateRow_Any_16_SSE2;
if (IS_ALIGNED(src_width, 16)) {
- InterpolateRow = InterpolateRow_Unaligned_16_SSE2;
- if (IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
- InterpolateRow = InterpolateRow_16_SSE2;
- }
+ InterpolateRow = InterpolateRow_16_SSE2;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_16_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && src_width >= 16) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
InterpolateRow = InterpolateRow_Any_16_SSSE3;
if (IS_ALIGNED(src_width, 16)) {
- InterpolateRow = InterpolateRow_Unaligned_16_SSSE3;
- if (IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
- InterpolateRow = InterpolateRow_16_SSSE3;
- }
+ InterpolateRow = InterpolateRow_16_SSSE3;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_16_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && src_width >= 32) {
+ if (TestCpuFlag(kCpuHasAVX2)) {
InterpolateRow = InterpolateRow_Any_16_AVX2;
if (IS_ALIGNED(src_width, 32)) {
InterpolateRow = InterpolateRow_16_AVX2;
@@ -1018,7 +995,7 @@ void ScalePlaneBilinearDown_16(int src_width, int src_height,
}
#endif
#if defined(HAS_INTERPOLATEROW_16_NEON)
- if (TestCpuFlag(kCpuHasNEON) && src_width >= 16) {
+ if (TestCpuFlag(kCpuHasNEON)) {
InterpolateRow = InterpolateRow_Any_16_NEON;
if (IS_ALIGNED(src_width, 16)) {
InterpolateRow = InterpolateRow_16_NEON;
@@ -1026,7 +1003,7 @@ void ScalePlaneBilinearDown_16(int src_width, int src_height,
}
#endif
#if defined(HAS_INTERPOLATEROW_16_MIPS_DSPR2)
- if (TestCpuFlag(kCpuHasMIPS_DSPR2) && src_width >= 4) {
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2)) {
InterpolateRow = InterpolateRow_Any_16_MIPS_DSPR2;
if (IS_ALIGNED(src_width, 4)) {
InterpolateRow = InterpolateRow_16_MIPS_DSPR2;
@@ -1087,29 +1064,23 @@ void ScalePlaneBilinearUp(int src_width, int src_height,
src_width = Abs(src_width);
#if defined(HAS_INTERPOLATEROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && dst_width >= 16) {
+ if (TestCpuFlag(kCpuHasSSE2)) {
InterpolateRow = InterpolateRow_Any_SSE2;
if (IS_ALIGNED(dst_width, 16)) {
- InterpolateRow = InterpolateRow_Unaligned_SSE2;
- if (IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
- InterpolateRow = InterpolateRow_SSE2;
- }
+ InterpolateRow = InterpolateRow_SSE2;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && dst_width >= 16) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
InterpolateRow = InterpolateRow_Any_SSSE3;
if (IS_ALIGNED(dst_width, 16)) {
- InterpolateRow = InterpolateRow_Unaligned_SSSE3;
- if (IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
- InterpolateRow = InterpolateRow_SSSE3;
- }
+ InterpolateRow = InterpolateRow_SSSE3;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && dst_width >= 32) {
+ if (TestCpuFlag(kCpuHasAVX2)) {
InterpolateRow = InterpolateRow_Any_AVX2;
if (IS_ALIGNED(dst_width, 32)) {
InterpolateRow = InterpolateRow_AVX2;
@@ -1117,7 +1088,7 @@ void ScalePlaneBilinearUp(int src_width, int src_height,
}
#endif
#if defined(HAS_INTERPOLATEROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && dst_width >= 16) {
+ if (TestCpuFlag(kCpuHasNEON)) {
InterpolateRow = InterpolateRow_Any_NEON;
if (IS_ALIGNED(dst_width, 16)) {
InterpolateRow = InterpolateRow_NEON;
@@ -1125,7 +1096,7 @@ void ScalePlaneBilinearUp(int src_width, int src_height,
}
#endif
#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2)
- if (TestCpuFlag(kCpuHasMIPS_DSPR2) && dst_width >= 4) {
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2)) {
InterpolateRow = InterpolateRow_Any_MIPS_DSPR2;
if (IS_ALIGNED(dst_width, 4)) {
InterpolateRow = InterpolateRow_MIPS_DSPR2;
@@ -1144,9 +1115,7 @@ void ScalePlaneBilinearUp(int src_width, int src_height,
if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
ScaleFilterCols = ScaleColsUp2_C;
#if defined(HAS_SCALECOLS_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) &&
- IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) &&
- IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
ScaleFilterCols = ScaleColsUp2_SSE2;
}
#endif
@@ -1226,29 +1195,23 @@ void ScalePlaneBilinearUp_16(int src_width, int src_height,
src_width = Abs(src_width);
#if defined(HAS_INTERPOLATEROW_16_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && dst_width >= 16) {
+ if (TestCpuFlag(kCpuHasSSE2)) {
InterpolateRow = InterpolateRow_Any_16_SSE2;
if (IS_ALIGNED(dst_width, 16)) {
- InterpolateRow = InterpolateRow_Unaligned_16_SSE2;
- if (IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
- InterpolateRow = InterpolateRow_16_SSE2;
- }
+ InterpolateRow = InterpolateRow_16_SSE2;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_16_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && dst_width >= 16) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
InterpolateRow = InterpolateRow_Any_16_SSSE3;
if (IS_ALIGNED(dst_width, 16)) {
- InterpolateRow = InterpolateRow_Unaligned_16_SSSE3;
- if (IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
- InterpolateRow = InterpolateRow_16_SSSE3;
- }
+ InterpolateRow = InterpolateRow_16_SSSE3;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_16_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && dst_width >= 32) {
+ if (TestCpuFlag(kCpuHasAVX2)) {
InterpolateRow = InterpolateRow_Any_16_AVX2;
if (IS_ALIGNED(dst_width, 32)) {
InterpolateRow = InterpolateRow_16_AVX2;
@@ -1256,7 +1219,7 @@ void ScalePlaneBilinearUp_16(int src_width, int src_height,
}
#endif
#if defined(HAS_INTERPOLATEROW_16_NEON)
- if (TestCpuFlag(kCpuHasNEON) && dst_width >= 16) {
+ if (TestCpuFlag(kCpuHasNEON)) {
InterpolateRow = InterpolateRow_Any_16_NEON;
if (IS_ALIGNED(dst_width, 16)) {
InterpolateRow = InterpolateRow_16_NEON;
@@ -1264,7 +1227,7 @@ void ScalePlaneBilinearUp_16(int src_width, int src_height,
}
#endif
#if defined(HAS_INTERPOLATEROW_16_MIPS_DSPR2)
- if (TestCpuFlag(kCpuHasMIPS_DSPR2) && dst_width >= 4) {
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2)) {
InterpolateRow = InterpolateRow_Any_16_MIPS_DSPR2;
if (IS_ALIGNED(dst_width, 4)) {
InterpolateRow = InterpolateRow_16_MIPS_DSPR2;
@@ -1283,9 +1246,7 @@ void ScalePlaneBilinearUp_16(int src_width, int src_height,
if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
ScaleFilterCols = ScaleColsUp2_16_C;
#if defined(HAS_SCALECOLS_16_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) &&
- IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) &&
- IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
ScaleFilterCols = ScaleColsUp2_16_SSE2;
}
#endif
@@ -1366,9 +1327,7 @@ static void ScalePlaneSimple(int src_width, int src_height,
if (src_width * 2 == dst_width && x < 0x8000) {
ScaleCols = ScaleColsUp2_C;
#if defined(HAS_SCALECOLS_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) &&
- IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) &&
- IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
ScaleCols = ScaleColsUp2_SSE2;
}
#endif
@@ -1401,9 +1360,7 @@ static void ScalePlaneSimple_16(int src_width, int src_height,
if (src_width * 2 == dst_width && x < 0x8000) {
ScaleCols = ScaleColsUp2_16_C;
#if defined(HAS_SCALECOLS_16_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) &&
- IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) &&
- IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
ScaleCols = ScaleColsUp2_16_SSE2;
}
#endif
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/scale_argb.cc b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/scale_argb.cc
index e339cd7c791..05b58e1baba 100644
--- a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/scale_argb.cc
+++ b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/scale_argb.cc
@@ -53,16 +53,14 @@ static void ScaleARGBDown2(int src_width, int src_height,
}
#if defined(HAS_SCALEARGBROWDOWN2_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4) &&
- IS_ALIGNED(src_argb, 16) && IS_ALIGNED(row_stride, 16) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4)) {
ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_SSE2 :
(filtering == kFilterLinear ? ScaleARGBRowDown2Linear_SSE2 :
ScaleARGBRowDown2Box_SSE2);
}
-#elif defined(HAS_SCALEARGBROWDOWN2_NEON)
- if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8) &&
- IS_ALIGNED(src_argb, 4) && IS_ALIGNED(row_stride, 4)) {
+#endif
+#if defined(HAS_SCALEARGBROWDOWN2_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8)) {
ScaleARGBRowDown2 = filtering ? ScaleARGBRowDown2Box_NEON :
ScaleARGBRowDown2_NEON;
}
@@ -98,14 +96,12 @@ static void ScaleARGBDown4Box(int src_width, int src_height,
assert(dx == 65536 * 4); // Test scale factor of 4.
assert((dy & 0x3ffff) == 0); // Test vertical scale is multiple of 4.
#if defined(HAS_SCALEARGBROWDOWN2_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4) &&
- IS_ALIGNED(src_argb, 16) && IS_ALIGNED(row_stride, 16) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4)) {
ScaleARGBRowDown2 = ScaleARGBRowDown2Box_SSE2;
}
-#elif defined(HAS_SCALEARGBROWDOWN2_NEON)
- if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8) &&
- IS_ALIGNED(src_argb, 4) && IS_ALIGNED(row_stride, 4)) {
+#endif
+#if defined(HAS_SCALEARGBROWDOWN2_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8)) {
ScaleARGBRowDown2 = ScaleARGBRowDown2Box_NEON;
}
#endif
@@ -139,14 +135,13 @@ static void ScaleARGBDownEven(int src_width, int src_height,
assert(IS_ALIGNED(src_height, 2));
src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
#if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4)) {
ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_SSE2 :
ScaleARGBRowDownEven_SSE2;
}
-#elif defined(HAS_SCALEARGBROWDOWNEVEN_NEON)
- if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 4) &&
- IS_ALIGNED(src_argb, 4)) {
+#endif
+#if defined(HAS_SCALEARGBROWDOWNEVEN_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 4)) {
ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_NEON :
ScaleARGBRowDownEven_NEON;
}
@@ -190,29 +185,23 @@ static void ScaleARGBBilinearDown(int src_width, int src_height,
src_argb += xl * 4;
x -= (int)(xl << 16);
#if defined(HAS_INTERPOLATEROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && clip_src_width >= 16) {
+ if (TestCpuFlag(kCpuHasSSE2)) {
InterpolateRow = InterpolateRow_Any_SSE2;
if (IS_ALIGNED(clip_src_width, 16)) {
- InterpolateRow = InterpolateRow_Unaligned_SSE2;
- if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16)) {
- InterpolateRow = InterpolateRow_SSE2;
- }
+ InterpolateRow = InterpolateRow_SSE2;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && clip_src_width >= 16) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
InterpolateRow = InterpolateRow_Any_SSSE3;
if (IS_ALIGNED(clip_src_width, 16)) {
- InterpolateRow = InterpolateRow_Unaligned_SSSE3;
- if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16)) {
- InterpolateRow = InterpolateRow_SSSE3;
- }
+ InterpolateRow = InterpolateRow_SSSE3;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && clip_src_width >= 32) {
+ if (TestCpuFlag(kCpuHasAVX2)) {
InterpolateRow = InterpolateRow_Any_AVX2;
if (IS_ALIGNED(clip_src_width, 32)) {
InterpolateRow = InterpolateRow_AVX2;
@@ -220,15 +209,15 @@ static void ScaleARGBBilinearDown(int src_width, int src_height,
}
#endif
#if defined(HAS_INTERPOLATEROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && clip_src_width >= 16) {
+ if (TestCpuFlag(kCpuHasNEON)) {
InterpolateRow = InterpolateRow_Any_NEON;
if (IS_ALIGNED(clip_src_width, 16)) {
InterpolateRow = InterpolateRow_NEON;
}
}
#endif
-#if defined(HAS_INTERPOLATEROWS_MIPS_DSPR2)
- if (TestCpuFlag(kCpuHasMIPS_DSPR2) && clip_src_width >= 4 &&
+#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4)) {
InterpolateRow = InterpolateRow_Any_MIPS_DSPR2;
if (IS_ALIGNED(clip_src_width, 4)) {
@@ -286,29 +275,23 @@ static void ScaleARGBBilinearUp(int src_width, int src_height,
filtering ? ScaleARGBFilterCols_C : ScaleARGBCols_C;
const int max_y = (src_height - 1) << 16;
#if defined(HAS_INTERPOLATEROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && dst_width >= 4) {
+ if (TestCpuFlag(kCpuHasSSE2)) {
InterpolateRow = InterpolateRow_Any_SSE2;
if (IS_ALIGNED(dst_width, 4)) {
- InterpolateRow = InterpolateRow_Unaligned_SSE2;
- if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
- InterpolateRow = InterpolateRow_SSE2;
- }
+ InterpolateRow = InterpolateRow_SSE2;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && dst_width >= 4) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
InterpolateRow = InterpolateRow_Any_SSSE3;
if (IS_ALIGNED(dst_width, 4)) {
- InterpolateRow = InterpolateRow_Unaligned_SSSE3;
- if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
- InterpolateRow = InterpolateRow_SSSE3;
- }
+ InterpolateRow = InterpolateRow_SSSE3;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && dst_width >= 8) {
+ if (TestCpuFlag(kCpuHasAVX2)) {
InterpolateRow = InterpolateRow_Any_AVX2;
if (IS_ALIGNED(dst_width, 8)) {
InterpolateRow = InterpolateRow_AVX2;
@@ -316,15 +299,15 @@ static void ScaleARGBBilinearUp(int src_width, int src_height,
}
#endif
#if defined(HAS_INTERPOLATEROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && dst_width >= 4) {
+ if (TestCpuFlag(kCpuHasNEON)) {
InterpolateRow = InterpolateRow_Any_NEON;
if (IS_ALIGNED(dst_width, 4)) {
InterpolateRow = InterpolateRow_NEON;
}
}
#endif
-#if defined(HAS_INTERPOLATEROWS_MIPS_DSPR2)
- if (TestCpuFlag(kCpuHasMIPS_DSPR2) && dst_width >= 1 &&
+#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) {
InterpolateRow = InterpolateRow_MIPS_DSPR2;
}
@@ -346,9 +329,7 @@ static void ScaleARGBBilinearUp(int src_width, int src_height,
if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
ScaleARGBFilterCols = ScaleARGBColsUp2_C;
#if defined(HAS_SCALEARGBCOLSUP2_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) &&
- IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
ScaleARGBFilterCols = ScaleARGBColsUp2_SSE2;
}
#endif
@@ -427,18 +408,15 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height,
uint8* rgb_buf,
int width) = I422ToARGBRow_C;
#if defined(HAS_I422TOARGBROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && src_width >= 8) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
if (IS_ALIGNED(src_width, 8)) {
- I422ToARGBRow = I422ToARGBRow_Unaligned_SSSE3;
- if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- I422ToARGBRow = I422ToARGBRow_SSSE3;
- }
+ I422ToARGBRow = I422ToARGBRow_SSSE3;
}
}
#endif
#if defined(HAS_I422TOARGBROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && src_width >= 16) {
+ if (TestCpuFlag(kCpuHasAVX2)) {
I422ToARGBRow = I422ToARGBRow_Any_AVX2;
if (IS_ALIGNED(src_width, 16)) {
I422ToARGBRow = I422ToARGBRow_AVX2;
@@ -446,7 +424,7 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height,
}
#endif
#if defined(HAS_I422TOARGBROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && src_width >= 8) {
+ if (TestCpuFlag(kCpuHasNEON)) {
I422ToARGBRow = I422ToARGBRow_Any_NEON;
if (IS_ALIGNED(src_width, 8)) {
I422ToARGBRow = I422ToARGBRow_NEON;
@@ -467,29 +445,23 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height,
ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
InterpolateRow_C;
#if defined(HAS_INTERPOLATEROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && dst_width >= 4) {
+ if (TestCpuFlag(kCpuHasSSE2)) {
InterpolateRow = InterpolateRow_Any_SSE2;
if (IS_ALIGNED(dst_width, 4)) {
- InterpolateRow = InterpolateRow_Unaligned_SSE2;
- if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- InterpolateRow = InterpolateRow_SSE2;
- }
+ InterpolateRow = InterpolateRow_SSE2;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && dst_width >= 4) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
InterpolateRow = InterpolateRow_Any_SSSE3;
if (IS_ALIGNED(dst_width, 4)) {
- InterpolateRow = InterpolateRow_Unaligned_SSSE3;
- if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- InterpolateRow = InterpolateRow_SSSE3;
- }
+ InterpolateRow = InterpolateRow_SSSE3;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && dst_width >= 8) {
+ if (TestCpuFlag(kCpuHasAVX2)) {
InterpolateRow = InterpolateRow_Any_AVX2;
if (IS_ALIGNED(dst_width, 8)) {
InterpolateRow = InterpolateRow_AVX2;
@@ -497,15 +469,15 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height,
}
#endif
#if defined(HAS_INTERPOLATEROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && dst_width >= 4) {
+ if (TestCpuFlag(kCpuHasNEON)) {
InterpolateRow = InterpolateRow_Any_NEON;
if (IS_ALIGNED(dst_width, 4)) {
InterpolateRow = InterpolateRow_NEON;
}
}
#endif
-#if defined(HAS_INTERPOLATEROWS_MIPS_DSPR2)
- if (TestCpuFlag(kCpuHasMIPS_DSPR2) && dst_width >= 1 &&
+#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
InterpolateRow = InterpolateRow_MIPS_DSPR2;
}
@@ -531,9 +503,7 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height,
if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
ScaleARGBFilterCols = ScaleARGBColsUp2_C;
#if defined(HAS_SCALEARGBCOLSUP2_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) &&
- IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
ScaleARGBFilterCols = ScaleARGBColsUp2_SSE2;
}
#endif
@@ -640,9 +610,7 @@ static void ScaleARGBSimple(int src_width, int src_height,
if (src_width * 2 == dst_width && x < 0x8000) {
ScaleARGBCols = ScaleARGBColsUp2_C;
#if defined(HAS_SCALEARGBCOLSUP2_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) &&
- IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
ScaleARGBCols = ScaleARGBColsUp2_SSE2;
}
#endif
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/scale_common.cc b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/scale_common.cc
index e4b2acc41b1..96e2564b00d 100644
--- a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/scale_common.cc
+++ b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/scale_common.cc
@@ -885,31 +885,23 @@ void ScalePlaneVertical(int src_height,
assert(dst_height > 0);
src_argb += (x >> 16) * bpp;
#if defined(HAS_INTERPOLATEROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && dst_width_bytes >= 16) {
+ if (TestCpuFlag(kCpuHasSSE2)) {
InterpolateRow = InterpolateRow_Any_SSE2;
if (IS_ALIGNED(dst_width_bytes, 16)) {
- InterpolateRow = InterpolateRow_Unaligned_SSE2;
- if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
- InterpolateRow = InterpolateRow_SSE2;
- }
+ InterpolateRow = InterpolateRow_SSE2;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && dst_width_bytes >= 16) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
InterpolateRow = InterpolateRow_Any_SSSE3;
if (IS_ALIGNED(dst_width_bytes, 16)) {
- InterpolateRow = InterpolateRow_Unaligned_SSSE3;
- if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
- InterpolateRow = InterpolateRow_SSSE3;
- }
+ InterpolateRow = InterpolateRow_SSSE3;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && dst_width_bytes >= 32) {
+ if (TestCpuFlag(kCpuHasAVX2)) {
InterpolateRow = InterpolateRow_Any_AVX2;
if (IS_ALIGNED(dst_width_bytes, 32)) {
InterpolateRow = InterpolateRow_AVX2;
@@ -917,15 +909,15 @@ void ScalePlaneVertical(int src_height,
}
#endif
#if defined(HAS_INTERPOLATEROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && dst_width_bytes >= 16) {
+ if (TestCpuFlag(kCpuHasNEON)) {
InterpolateRow = InterpolateRow_Any_NEON;
if (IS_ALIGNED(dst_width_bytes, 16)) {
InterpolateRow = InterpolateRow_NEON;
}
}
#endif
-#if defined(HAS_INTERPOLATEROWS_MIPS_DSPR2)
- if (TestCpuFlag(kCpuHasMIPS_DSPR2) && dst_width_bytes >= 4 &&
+#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4) &&
IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) {
InterpolateRow = InterpolateRow_Any_MIPS_DSPR2;
@@ -967,31 +959,23 @@ void ScalePlaneVertical_16(int src_height,
assert(dst_height > 0);
src_argb += (x >> 16) * wpp;
#if defined(HAS_INTERPOLATEROW_16_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && dst_width_bytes >= 16) {
+ if (TestCpuFlag(kCpuHasSSE2)) {
InterpolateRow = InterpolateRow_Any_16_SSE2;
if (IS_ALIGNED(dst_width_bytes, 16)) {
- InterpolateRow = InterpolateRow_Unaligned_16_SSE2;
- if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
- InterpolateRow = InterpolateRow_16_SSE2;
- }
+ InterpolateRow = InterpolateRow_16_SSE2;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_16_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && dst_width_bytes >= 16) {
+ if (TestCpuFlag(kCpuHasSSSE3)) {
InterpolateRow = InterpolateRow_Any_16_SSSE3;
if (IS_ALIGNED(dst_width_bytes, 16)) {
- InterpolateRow = InterpolateRow_Unaligned_16_SSSE3;
- if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
- InterpolateRow = InterpolateRow_16_SSSE3;
- }
+ InterpolateRow = InterpolateRow_16_SSSE3;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_16_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && dst_width_bytes >= 32) {
+ if (TestCpuFlag(kCpuHasAVX2)) {
InterpolateRow = InterpolateRow_Any_16_AVX2;
if (IS_ALIGNED(dst_width_bytes, 32)) {
InterpolateRow = InterpolateRow_16_AVX2;
@@ -999,15 +983,15 @@ void ScalePlaneVertical_16(int src_height,
}
#endif
#if defined(HAS_INTERPOLATEROW_16_NEON)
- if (TestCpuFlag(kCpuHasNEON) && dst_width_bytes >= 16) {
+ if (TestCpuFlag(kCpuHasNEON)) {
InterpolateRow = InterpolateRow_Any_16_NEON;
if (IS_ALIGNED(dst_width_bytes, 16)) {
InterpolateRow = InterpolateRow_16_NEON;
}
}
#endif
-#if defined(HAS_INTERPOLATEROWS_16_MIPS_DSPR2)
- if (TestCpuFlag(kCpuHasMIPS_DSPR2) && dst_width_bytes >= 4 &&
+#if defined(HAS_INTERPOLATEROW_16_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4) &&
IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) {
InterpolateRow = InterpolateRow_Any_16_MIPS_DSPR2;
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/scale_neon.cc b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/scale_neon.cc
index 1b8a5ba58f2..7921219b5fa 100644
--- a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/scale_neon.cc
+++ b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/scale_neon.cc
@@ -16,7 +16,8 @@ extern "C" {
#endif
// This module is for GCC Neon.
-#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
+ !defined(__aarch64__)
// NEON downscalers with interpolation.
// Provided by Fritz Koenig
@@ -756,7 +757,7 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
);
}
-#endif // __ARM_NEON__
+#endif // defined(__ARM_NEON__) && !defined(__aarch64__)
#ifdef __cplusplus
} // extern "C"
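The guard change here pairs with scale_neon64.cc below so that exactly one NEON translation unit compiles per target; previously a toolchain defining __ARM_NEON__ for an AArch64 target could pull this file's 32-bit asm into a 64-bit build. Condensed, the two guards are now mutually exclusive:

/* scale_neon.cc: 32-bit ARMv7 kernels only. */
#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
    !defined(__aarch64__)
/* ... vld1/vst1-style kernels ... */
#endif
/* scale_neon64.cc: 64-bit ARMv8 kernels only. */
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
/* ... ld1/st1-style kernels ... */
#endif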
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/scale_neon64.cc b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/scale_neon64.cc
index 64c7d10dbe1..fb68b67d29c 100644
--- a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/scale_neon64.cc
+++ b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/scale_neon64.cc
@@ -8,133 +8,122 @@
* be found in the AUTHORS file in the root of the source tree.
*/
+#include "libyuv/scale.h"
#include "libyuv/row.h"
+#include "libyuv/scale_row.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
-// This module is for GCC Neon.
+// This module is for GCC Neon armv8 64-bit.
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
-#ifdef HAS_SCALEROWDOWN2_NEON
+
// Read 32x1, throw away even pixels, and write 16x1.
void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
asm volatile (
- ".p2align 2 \n"
"1: \n"
- // load even pixels into q0, odd into q1
+ // load even pixels into v0, odd into v1
MEMACCESS(0)
- "vld2.8 {q0, q1}, [%0]! \n"
+ "ld2 {v0.16b,v1.16b}, [%0], #32 \n"
"subs %2, %2, #16 \n" // 16 processed per loop
MEMACCESS(1)
- "vst1.8 {q1}, [%1]! \n" // store odd pixels
- "bgt 1b \n"
+ "st1 {v1.16b}, [%1], #16 \n" // store odd pixels
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst), // %1
"+r"(dst_width) // %2
:
- : "q0", "q1" // Clobber List
+ : "v0", "v1" // Clobber List
);
}
-#endif //HAS_SCALEROWDOWN2_NEON
-#ifdef HAS_SCALEROWDOWN2_NEON
// Read 32x2, average down, and write 16x1.
void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
asm volatile (
// change the stride to row 2 pointer
- "add %1, %0 \n"
- ".p2align 2 \n"
+ "add %1, %1, %0 \n"
"1: \n"
MEMACCESS(0)
- "vld1.8 {q0, q1}, [%0]! \n" // load row 1 and post inc
+ "ld1 {v0.16b,v1.16b}, [%0], #32 \n" // load row 1 and post inc
MEMACCESS(1)
- "vld1.8 {q2, q3}, [%1]! \n" // load row 2 and post inc
+ "ld1 {v2.16b, v3.16b}, [%1], #32 \n" // load row 2 and post inc
"subs %3, %3, #16 \n" // 16 processed per loop
- "vpaddl.u8 q0, q0 \n" // row 1 add adjacent
- "vpaddl.u8 q1, q1 \n"
- "vpadal.u8 q0, q2 \n" // row 2 add adjacent + row1
- "vpadal.u8 q1, q3 \n"
- "vrshrn.u16 d0, q0, #2 \n" // downshift, round and pack
- "vrshrn.u16 d1, q1, #2 \n"
+ "uaddlp v0.8h, v0.16b \n" // row 1 add adjacent
+ "uaddlp v1.8h, v1.16b \n"
+ "uadalp v0.8h, v2.16b \n" // row 2 add adjacent + row1
+ "uadalp v1.8h, v3.16b \n"
+ "rshrn v0.8b, v0.8h, #2 \n" // downshift, round and pack
+ "rshrn2 v0.16b, v1.8h, #2 \n"
MEMACCESS(2)
- "vst1.8 {q0}, [%2]! \n"
- "bgt 1b \n"
+ "st1 {v0.16b}, [%2], #16 \n"
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(src_stride), // %1
"+r"(dst), // %2
"+r"(dst_width) // %3
:
- : "q0", "q1", "q2", "q3" // Clobber List
+ : "v0", "v1", "v2", "v3" // Clobber List
);
}
-#endif //HAS_SCALEROWDOWN2_NEON
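For reference, a C model of the 2x2 box filter this kernel vectorizes, mirroring libyuv's ScaleRowDown2Box_C (sum the four neighbors, add 2 to round, shift right by 2):

#include <stddef.h>  /* ptrdiff_t */

void ScaleRowDown2Box_C_ref(const unsigned char* s, ptrdiff_t src_stride,
                            unsigned char* dst, int dst_width) {
  const unsigned char* t = s + src_stride;  /* second source row */
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst[x] = (unsigned char)((s[0] + s[1] + t[0] + t[1] + 2) >> 2);
    s += 2;
    t += 2;
  }
}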
-#ifdef HAS_SCALEROWDOWN4_NEON
void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
- ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
- "subs %2, %2, #8 \n" // 8 processed per loop
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
+ "subs %2, %2, #8 \n" // 8 processed per loop
MEMACCESS(1)
- "vst1.8 {d2}, [%1]! \n"
- "bgt 1b \n"
+ "st1 {v2.8b}, [%1], #8 \n"
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
:
- : "q0", "q1", "memory", "cc"
+ : "v0", "v1", "v2", "v3", "memory", "cc"
);
}
-#endif //HAS_SCALEROWDOWN4_NEON
-#ifdef HAS_SCALEROWDOWN4_NEON
void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
const uint8* src_ptr1 = src_ptr + src_stride;
const uint8* src_ptr2 = src_ptr + src_stride * 2;
const uint8* src_ptr3 = src_ptr + src_stride * 3;
asm volatile (
- ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "vld1.8 {q0}, [%0]! \n" // load up 16x4
+ "ld1 {v0.16b}, [%0], #16 \n" // load up 16x4
MEMACCESS(3)
- "vld1.8 {q1}, [%3]! \n"
+ "ld1 {v1.16b}, [%2], #16 \n"
MEMACCESS(4)
- "vld1.8 {q2}, [%4]! \n"
+ "ld1 {v2.16b}, [%3], #16 \n"
MEMACCESS(5)
- "vld1.8 {q3}, [%5]! \n"
- "subs %2, %2, #4 \n"
- "vpaddl.u8 q0, q0 \n"
- "vpadal.u8 q0, q1 \n"
- "vpadal.u8 q0, q2 \n"
- "vpadal.u8 q0, q3 \n"
- "vpaddl.u16 q0, q0 \n"
- "vrshrn.u32 d0, q0, #4 \n" // divide by 16 w/rounding
- "vmovn.u16 d0, q0 \n"
+ "ld1 {v3.16b}, [%4], #16 \n"
+ "subs %5, %5, #4 \n"
+ "uaddlp v0.8h, v0.16b \n"
+ "uadalp v0.8h, v1.16b \n"
+ "uadalp v0.8h, v2.16b \n"
+ "uadalp v0.8h, v3.16b \n"
+ "addp v0.8h, v0.8h, v0.8h \n"
+ "rshrn v0.8b, v0.8h, #4 \n" // divide by 16 w/rounding
MEMACCESS(1)
- "vst1.32 {d0[0]}, [%1]! \n"
- "bgt 1b \n"
+ "st1 {v0.s}[0], [%1], #4 \n"
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
- "+r"(dst_width), // %2
- "+r"(src_ptr1), // %3
- "+r"(src_ptr2), // %4
- "+r"(src_ptr3) // %5
+ "+r"(src_ptr1), // %2
+ "+r"(src_ptr2), // %3
+ "+r"(src_ptr3), // %4
+ "+r"(dst_width) // %5
:
- : "q0", "q1", "q2", "q3", "memory", "cc"
+ : "v0", "v1", "v2", "v3", "memory", "cc"
);
}
-#endif //HAS_SCALEROWDOWN4_NEON
-#ifdef HAS_SCALEROWDOWN34_NEON
// Down scale from 4 to 3 pixels. Use the NEON multilane read/write
// to load every 4th pixel into 4 different registers.
// Point samples 32 pixels to 24 pixels.
@@ -142,136 +131,129 @@ void ScaleRowDown34_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
- ".p2align 2 \n"
- "1: \n"
+ "1: \n"
MEMACCESS(0)
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
- "subs %2, %2, #24 \n"
- "vmov d2, d3 \n" // order d0, d1, d2
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
+ "subs %2, %2, #24 \n"
+ "orr v2.16b, v3.16b, v3.16b \n" // order v0, v1, v2
MEMACCESS(1)
- "vst3.8 {d0, d1, d2}, [%1]! \n"
- "bgt 1b \n"
+ "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
:
- : "d0", "d1", "d2", "d3", "memory", "cc"
+ : "v0", "v1", "v2", "v3", "memory", "cc"
);
}
-#endif //HAS_SCALEROWDOWN34_NEON
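The ld4/orr/st3 sequence above deinterleaves every group of four pixels into v0..v3, copies v3 over v2, and stores three of the four lanes, so pixels 0, 1 and 3 of each group survive. A C model of that point-sampling pattern (it matches libyuv's ScaleRowDown34_C; dst_width is assumed to be a multiple of 3):

void ScaleRowDown34_C_ref(const unsigned char* src, unsigned char* dst,
                          int dst_width) {
  int x;
  for (x = 0; x < dst_width; x += 3) {
    dst[0] = src[0];
    dst[1] = src[1];
    dst[2] = src[3];  /* pixel 2 of each group of 4 is dropped */
    dst += 3;
    src += 4;
  }
}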
-#ifdef HAS_SCALEROWDOWN34_NEON
void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
- "vmov.u8 d24, #3 \n"
- "add %3, %0 \n"
- ".p2align 2 \n"
- "1: \n"
+ "movi v20.8b, #3 \n"
+ "add %3, %3, %0 \n"
+ "1: \n"
MEMACCESS(0)
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
MEMACCESS(3)
- "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
- "subs %2, %2, #24 \n"
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1
+ "subs %2, %2, #24 \n"
// filter src line 0 with src line 1
// expand chars to shorts to allow for room
// when adding lines together
- "vmovl.u8 q8, d4 \n"
- "vmovl.u8 q9, d5 \n"
- "vmovl.u8 q10, d6 \n"
- "vmovl.u8 q11, d7 \n"
+ "ushll v16.8h, v4.8b, #0 \n"
+ "ushll v17.8h, v5.8b, #0 \n"
+ "ushll v18.8h, v6.8b, #0 \n"
+ "ushll v19.8h, v7.8b, #0 \n"
// 3 * line_0 + line_1
- "vmlal.u8 q8, d0, d24 \n"
- "vmlal.u8 q9, d1, d24 \n"
- "vmlal.u8 q10, d2, d24 \n"
- "vmlal.u8 q11, d3, d24 \n"
+ "umlal v16.8h, v0.8b, v20.8b \n"
+ "umlal v17.8h, v1.8b, v20.8b \n"
+ "umlal v18.8h, v2.8b, v20.8b \n"
+ "umlal v19.8h, v3.8b, v20.8b \n"
// (3 * line_0 + line_1) >> 2
- "vqrshrn.u16 d0, q8, #2 \n"
- "vqrshrn.u16 d1, q9, #2 \n"
- "vqrshrn.u16 d2, q10, #2 \n"
- "vqrshrn.u16 d3, q11, #2 \n"
+ "uqrshrn v0.8b, v16.8h, #2 \n"
+ "uqrshrn v1.8b, v17.8h, #2 \n"
+ "uqrshrn v2.8b, v18.8h, #2 \n"
+ "uqrshrn v3.8b, v19.8h, #2 \n"
// a0 = (src[0] * 3 + s[1] * 1) >> 2
- "vmovl.u8 q8, d1 \n"
- "vmlal.u8 q8, d0, d24 \n"
- "vqrshrn.u16 d0, q8, #2 \n"
+ "ushll v16.8h, v1.8b, #0 \n"
+ "umlal v16.8h, v0.8b, v20.8b \n"
+ "uqrshrn v0.8b, v16.8h, #2 \n"
// a1 = (src[1] * 1 + s[2] * 1) >> 1
- "vrhadd.u8 d1, d1, d2 \n"
+ "urhadd v1.8b, v1.8b, v2.8b \n"
// a2 = (src[2] * 1 + s[3] * 3) >> 2
- "vmovl.u8 q8, d2 \n"
- "vmlal.u8 q8, d3, d24 \n"
- "vqrshrn.u16 d2, q8, #2 \n"
+ "ushll v16.8h, v2.8b, #0 \n"
+ "umlal v16.8h, v3.8b, v20.8b \n"
+ "uqrshrn v2.8b, v16.8h, #2 \n"
MEMACCESS(1)
- "vst3.8 {d0, d1, d2}, [%1]! \n"
+ "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
- "bgt 1b \n"
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
"+r"(src_stride) // %3
:
- : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory", "cc"
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19",
+ "v20", "memory", "cc"
);
}
-#endif //ScaleRowDown34_0_Box_NEON
-#ifdef HAS_SCALEROWDOWN34_NEON
void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
- "vmov.u8 d24, #3 \n"
- "add %3, %0 \n"
- ".p2align 2 \n"
- "1: \n"
+ "movi v20.8b, #3 \n"
+ "add %3, %3, %0 \n"
+ "1: \n"
MEMACCESS(0)
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
MEMACCESS(3)
- "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
- "subs %2, %2, #24 \n"
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1
+ "subs %2, %2, #24 \n"
// average src line 0 with src line 1
- "vrhadd.u8 q0, q0, q2 \n"
- "vrhadd.u8 q1, q1, q3 \n"
+ "urhadd v0.8b, v0.8b, v4.8b \n"
+ "urhadd v1.8b, v1.8b, v5.8b \n"
+ "urhadd v2.8b, v2.8b, v6.8b \n"
+ "urhadd v3.8b, v3.8b, v7.8b \n"
// a0 = (src[0] * 3 + s[1] * 1) >> 2
- "vmovl.u8 q3, d1 \n"
- "vmlal.u8 q3, d0, d24 \n"
- "vqrshrn.u16 d0, q3, #2 \n"
+ "ushll v4.8h, v1.8b, #0 \n"
+ "umlal v4.8h, v0.8b, v20.8b \n"
+ "uqrshrn v0.8b, v4.8h, #2 \n"
// a1 = (src[1] * 1 + s[2] * 1) >> 1
- "vrhadd.u8 d1, d1, d2 \n"
+ "urhadd v1.8b, v1.8b, v2.8b \n"
// a2 = (src[2] * 1 + s[3] * 3) >> 2
- "vmovl.u8 q3, d2 \n"
- "vmlal.u8 q3, d3, d24 \n"
- "vqrshrn.u16 d2, q3, #2 \n"
+ "ushll v4.8h, v2.8b, #0 \n"
+ "umlal v4.8h, v3.8b, v20.8b \n"
+ "uqrshrn v2.8b, v4.8h, #2 \n"
MEMACCESS(1)
- "vst3.8 {d0, d1, d2}, [%1]! \n"
- "bgt 1b \n"
+ "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
"+r"(src_stride) // %3
:
- : "r4", "q0", "q1", "q2", "q3", "d24", "memory", "cc"
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "memory", "cc"
);
}
-#endif //HAS_SCALEROWDOWN34_NEON
-#ifdef HAS_SCALEROWDOWN38_NEON
-#define HAS_SCALEROWDOWN38_NEON
static uvec8 kShuf38 =
{ 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 };
static uvec8 kShuf38_2 =
- { 0, 8, 16, 2, 10, 17, 4, 12, 18, 6, 14, 19, 0, 0, 0, 0 };
+ { 0, 16, 32, 2, 18, 33, 4, 20, 34, 6, 22, 35, 0, 0, 0, 0 };
static vec16 kMult38_Div6 =
{ 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12,
65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 };
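The divisor tables hold 65536/12 for a divide by 6 (and, in the kMult38_Div9 table referenced below, presumably 65536/18 for a divide by 9) because vqrdmulh/sqrdmulh is a doubling, rounding, high-half multiply. A C model of the arithmetic, ignoring the instruction's saturation:

static short sqrdmulh16(short a, short b) {
  return (short)((2 * (int)a * (int)b + (1 << 15)) >> 16);  /* sqrdmulh */
}
/* For a sum s of six pixels:
 *   sqrdmulh16(s, 65536 / 12) = (s * 10922 + 32768) >> 16 ~= s / 6,
 * since 2 * (65536 / 12) = 10922 and 10922 / 65536 ~= 1 / 6. */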
@@ -285,504 +267,498 @@ void ScaleRowDown38_NEON(const uint8* src_ptr,
uint8* dst_ptr, int dst_width) {
asm volatile (
MEMACCESS(3)
- "vld1.8 {q3}, [%3] \n"
- ".p2align 2 \n"
- "1: \n"
+ "ld1 {v3.16b}, [%3] \n"
+ "1: \n"
MEMACCESS(0)
- "vld1.8 {d0, d1, d2, d3}, [%0]! \n"
- "subs %2, %2, #12 \n"
- "vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n"
- "vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n"
+ "ld1 {v0.16b,v1.16b}, [%0], #32 \n"
+ "subs %2, %2, #12 \n"
+ "tbl v2.16b, {v0.16b,v1.16b}, v3.16b \n"
MEMACCESS(1)
- "vst1.8 {d4}, [%1]! \n"
+ "st1 {v2.8b}, [%1], #8 \n"
MEMACCESS(1)
- "vst1.32 {d5[0]}, [%1]! \n"
- "bgt 1b \n"
+ "st1 {v2.s}[2], [%1], #4 \n"
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"(&kShuf38) // %3
- : "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc"
+ : "v0", "v1", "v2", "v3", "memory", "cc"
);
}
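kShuf38 is unchanged because both the ARM32 {d0-d3} table and the AArch64 {v0,v1} table span 32 bytes with flat indexing; kShuf38_2 changes (8 becomes 16, 16 becomes 32, and so on) because vtbl indexes 8-byte d registers while tbl indexes 16-byte v registers, so byte 0 of the second and third table registers moves from index 8 and 16 to 16 and 32. A C model of the lookup across a three-register table:

/* tbl across {v0, v1, v2}: a flat 48-byte table; out-of-range indices
 * produce 0, matching the instruction. */
void tbl48(const unsigned char table[48], const unsigned char idx[16],
           unsigned char out[16]) {
  int i;
  for (i = 0; i < 16; ++i)
    out[i] = (idx[i] < 48) ? table[idx[i]] : 0;
}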
-#endif //HAS_SCALEROWDOWN38_NEON
-
-#ifdef HAS_SCALEROWDOWN38_NEON
// 32x3 -> 12x1
void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
const uint8* src_ptr1 = src_ptr + src_stride * 2;
+ ptrdiff_t tmp_src_stride = src_stride;
asm volatile (
MEMACCESS(5)
- "vld1.16 {q13}, [%5] \n"
+ "ld1 {v29.8h}, [%5] \n"
MEMACCESS(6)
- "vld1.8 {q14}, [%6] \n"
+ "ld1 {v30.16b}, [%6] \n"
MEMACCESS(7)
- "vld1.8 {q15}, [%7] \n"
- "add %3, %0 \n"
- ".p2align 2 \n"
- "1: \n"
-
- // d0 = 00 40 01 41 02 42 03 43
- // d1 = 10 50 11 51 12 52 13 53
- // d2 = 20 60 21 61 22 62 23 63
- // d3 = 30 70 31 71 32 72 33 73
+ "ld1 {v31.8h}, [%7] \n"
+ "add %2, %2, %0 \n"
+ "1: \n"
+
+ // 00 40 01 41 02 42 03 43
+ // 10 50 11 51 12 52 13 53
+ // 20 60 21 61 22 62 23 63
+ // 30 70 31 71 32 72 33 73
MEMACCESS(0)
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"
MEMACCESS(3)
- "vld4.8 {d4, d5, d6, d7}, [%3]! \n"
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n"
MEMACCESS(4)
- "vld4.8 {d16, d17, d18, d19}, [%4]! \n"
- "subs %2, %2, #12 \n"
+ "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%3], #32 \n"
+ "subs %4, %4, #12 \n"
    // Shuffle the input data around to align the data
    //  so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
- // d0 = 00 10 01 11 02 12 03 13
- // d1 = 40 50 41 51 42 52 43 53
- "vtrn.u8 d0, d1 \n"
- "vtrn.u8 d4, d5 \n"
- "vtrn.u8 d16, d17 \n"
-
- // d2 = 20 30 21 31 22 32 23 33
- // d3 = 60 70 61 71 62 72 63 73
- "vtrn.u8 d2, d3 \n"
- "vtrn.u8 d6, d7 \n"
- "vtrn.u8 d18, d19 \n"
-
- // d0 = 00+10 01+11 02+12 03+13
- // d2 = 40+50 41+51 42+52 43+53
- "vpaddl.u8 q0, q0 \n"
- "vpaddl.u8 q2, q2 \n"
- "vpaddl.u8 q8, q8 \n"
-
- // d3 = 60+70 61+71 62+72 63+73
- "vpaddl.u8 d3, d3 \n"
- "vpaddl.u8 d7, d7 \n"
- "vpaddl.u8 d19, d19 \n"
+ // 00 10 01 11 02 12 03 13
+ // 40 50 41 51 42 52 43 53
+ "trn1 v20.8b, v0.8b, v1.8b \n"
+ "trn2 v21.8b, v0.8b, v1.8b \n"
+ "trn1 v22.8b, v4.8b, v5.8b \n"
+ "trn2 v23.8b, v4.8b, v5.8b \n"
+ "trn1 v24.8b, v16.8b, v17.8b \n"
+ "trn2 v25.8b, v16.8b, v17.8b \n"
+
+ // 20 30 21 31 22 32 23 33
+ // 60 70 61 71 62 72 63 73
+ "trn1 v0.8b, v2.8b, v3.8b \n"
+ "trn2 v1.8b, v2.8b, v3.8b \n"
+ "trn1 v4.8b, v6.8b, v7.8b \n"
+ "trn2 v5.8b, v6.8b, v7.8b \n"
+ "trn1 v16.8b, v18.8b, v19.8b \n"
+ "trn2 v17.8b, v18.8b, v19.8b \n"
+
+ // 00+10 01+11 02+12 03+13
+ // 40+50 41+51 42+52 43+53
+ "uaddlp v20.4h, v20.8b \n"
+ "uaddlp v21.4h, v21.8b \n"
+ "uaddlp v22.4h, v22.8b \n"
+ "uaddlp v23.4h, v23.8b \n"
+ "uaddlp v24.4h, v24.8b \n"
+ "uaddlp v25.4h, v25.8b \n"
+
+ // 60+70 61+71 62+72 63+73
+ "uaddlp v1.4h, v1.8b \n"
+ "uaddlp v5.4h, v5.8b \n"
+ "uaddlp v17.4h, v17.8b \n"
// combine source lines
- "vadd.u16 q0, q2 \n"
- "vadd.u16 q0, q8 \n"
- "vadd.u16 d4, d3, d7 \n"
- "vadd.u16 d4, d19 \n"
+ "add v20.4h, v20.4h, v22.4h \n"
+ "add v21.4h, v21.4h, v23.4h \n"
+ "add v20.4h, v20.4h, v24.4h \n"
+ "add v21.4h, v21.4h, v25.4h \n"
+ "add v2.4h, v1.4h, v5.4h \n"
+ "add v2.4h, v2.4h, v17.4h \n"
// dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
// + s[6 + st * 1] + s[7 + st * 1]
// + s[6 + st * 2] + s[7 + st * 2]) / 6
- "vqrdmulh.s16 q2, q2, q13 \n"
- "vmovn.u16 d4, q2 \n"
+ "sqrdmulh v2.8h, v2.8h, v29.8h \n"
+ "xtn v2.8b, v2.8h \n"
// Shuffle 2,3 reg around so that 2 can be added to the
// 0,1 reg and 3 can be added to the 4,5 reg. This
// requires expanding from u8 to u16 as the 0,1 and 4,5
// registers are already expanded. Then do transposes
// to get aligned.
- // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
- "vmovl.u8 q1, d2 \n"
- "vmovl.u8 q3, d6 \n"
- "vmovl.u8 q9, d18 \n"
+ // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
+ "ushll v16.8h, v16.8b, #0 \n"
+ "uaddl v0.8h, v0.8b, v4.8b \n"
// combine source lines
- "vadd.u16 q1, q3 \n"
- "vadd.u16 q1, q9 \n"
-
- // d4 = xx 20 xx 30 xx 22 xx 32
- // d5 = xx 21 xx 31 xx 23 xx 33
- "vtrn.u32 d2, d3 \n"
+ "add v0.8h, v0.8h, v16.8h \n"
- // d4 = xx 20 xx 21 xx 22 xx 23
- // d5 = xx 30 xx 31 xx 32 xx 33
- "vtrn.u16 d2, d3 \n"
+ // xx 20 xx 21 xx 22 xx 23
+ // xx 30 xx 31 xx 32 xx 33
+ "trn1 v1.8h, v0.8h, v0.8h \n"
+ "trn2 v4.8h, v0.8h, v0.8h \n"
+ "xtn v0.4h, v1.4s \n"
+ "xtn v4.4h, v4.4s \n"
// 0+1+2, 3+4+5
- "vadd.u16 q0, q1 \n"
+ "add v20.8h, v20.8h, v0.8h \n"
+ "add v21.8h, v21.8h, v4.8h \n"
    // Need to divide, but can't downshift as the value
// isn't a power of 2. So multiply by 65536 / n
// and take the upper 16 bits.
- "vqrdmulh.s16 q0, q0, q15 \n"
+ "sqrdmulh v0.8h, v20.8h, v31.8h \n"
+ "sqrdmulh v1.8h, v21.8h, v31.8h \n"
    // Align for table lookup; vtbl requires registers to
    //  be adjacent.
- "vmov.u8 d2, d4 \n"
-
- "vtbl.u8 d3, {d0, d1, d2}, d28 \n"
- "vtbl.u8 d4, {d0, d1, d2}, d29 \n"
+ "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v30.16b \n"
MEMACCESS(1)
- "vst1.8 {d3}, [%1]! \n"
+ "st1 {v3.8b}, [%1], #8 \n"
MEMACCESS(1)
- "vst1.32 {d4[0]}, [%1]! \n"
- "bgt 1b \n"
+ "st1 {v3.s}[2], [%1], #4 \n"
+ "b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
- "+r"(dst_width), // %2
- "+r"(src_stride), // %3
- "+r"(src_ptr1) // %4
+ "+r"(tmp_src_stride), // %2
+ "+r"(src_ptr1), // %3
+ "+r"(dst_width) // %4
: "r"(&kMult38_Div6), // %5
"r"(&kShuf38_2), // %6
"r"(&kMult38_Div9) // %7
- : "q0", "q1", "q2", "q3", "q8", "q9", "q13", "q14", "q15", "memory", "cc"
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17",
+ "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v29",
+ "v30", "v31", "memory", "cc"
);
}
-#endif //HAS_SCALEROWDOWN38_NEON
-#ifdef HAS_SCALEROWDOWN38_NEON
// 32x2 -> 12x1
void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
+ // TODO(fbarchard): use src_stride directly for clang 3.5+.
+ ptrdiff_t tmp_src_stride = src_stride;
asm volatile (
MEMACCESS(4)
- "vld1.16 {q13}, [%4] \n"
+ "ld1 {v30.8h}, [%4] \n"
MEMACCESS(5)
- "vld1.8 {q14}, [%5] \n"
- "add %3, %0 \n"
- ".p2align 2 \n"
- "1: \n"
-
- // d0 = 00 40 01 41 02 42 03 43
- // d1 = 10 50 11 51 12 52 13 53
- // d2 = 20 60 21 61 22 62 23 63
- // d3 = 30 70 31 71 32 72 33 73
+ "ld1 {v31.16b}, [%5] \n"
+ "add %2, %2, %0 \n"
+ "1: \n"
+
+ // 00 40 01 41 02 42 03 43
+ // 10 50 11 51 12 52 13 53
+ // 20 60 21 61 22 62 23 63
+ // 30 70 31 71 32 72 33 73
MEMACCESS(0)
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n"
+ "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"
MEMACCESS(3)
- "vld4.8 {d4, d5, d6, d7}, [%3]! \n"
- "subs %2, %2, #12 \n"
+ "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n"
+ "subs %3, %3, #12 \n"
    // Shuffle the input data around to align the data
    //  so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
- // d0 = 00 10 01 11 02 12 03 13
- // d1 = 40 50 41 51 42 52 43 53
- "vtrn.u8 d0, d1 \n"
- "vtrn.u8 d4, d5 \n"
-
- // d2 = 20 30 21 31 22 32 23 33
- // d3 = 60 70 61 71 62 72 63 73
- "vtrn.u8 d2, d3 \n"
- "vtrn.u8 d6, d7 \n"
-
- // d0 = 00+10 01+11 02+12 03+13
- // d2 = 40+50 41+51 42+52 43+53
- "vpaddl.u8 q0, q0 \n"
- "vpaddl.u8 q2, q2 \n"
-
- // d3 = 60+70 61+71 62+72 63+73
- "vpaddl.u8 d3, d3 \n"
- "vpaddl.u8 d7, d7 \n"
+ // 00 10 01 11 02 12 03 13
+ // 40 50 41 51 42 52 43 53
+ "trn1 v16.8b, v0.8b, v1.8b \n"
+ "trn2 v17.8b, v0.8b, v1.8b \n"
+ "trn1 v18.8b, v4.8b, v5.8b \n"
+ "trn2 v19.8b, v4.8b, v5.8b \n"
+
+ // 20 30 21 31 22 32 23 33
+ // 60 70 61 71 62 72 63 73
+ "trn1 v0.8b, v2.8b, v3.8b \n"
+ "trn2 v1.8b, v2.8b, v3.8b \n"
+ "trn1 v4.8b, v6.8b, v7.8b \n"
+ "trn2 v5.8b, v6.8b, v7.8b \n"
+
+ // 00+10 01+11 02+12 03+13
+ // 40+50 41+51 42+52 43+53
+ "uaddlp v16.4h, v16.8b \n"
+ "uaddlp v17.4h, v17.8b \n"
+ "uaddlp v18.4h, v18.8b \n"
+ "uaddlp v19.4h, v19.8b \n"
+
+ // 60+70 61+71 62+72 63+73
+ "uaddlp v1.4h, v1.8b \n"
+ "uaddlp v5.4h, v5.8b \n"
// combine source lines
- "vadd.u16 q0, q2 \n"
- "vadd.u16 d4, d3, d7 \n"
+ "add v16.4h, v16.4h, v18.4h \n"
+ "add v17.4h, v17.4h, v19.4h \n"
+ "add v2.4h, v1.4h, v5.4h \n"
// dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
- "vqrshrn.u16 d4, q2, #2 \n"
+ "uqrshrn v2.8b, v2.8h, #2 \n"
// Shuffle 2,3 reg around so that 2 can be added to the
// 0,1 reg and 3 can be added to the 4,5 reg. This
// requires expanding from u8 to u16 as the 0,1 and 4,5
// registers are already expanded. Then do transposes
// to get aligned.
- // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
- "vmovl.u8 q1, d2 \n"
- "vmovl.u8 q3, d6 \n"
+ // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
// combine source lines
- "vadd.u16 q1, q3 \n"
-
- // d4 = xx 20 xx 30 xx 22 xx 32
- // d5 = xx 21 xx 31 xx 23 xx 33
- "vtrn.u32 d2, d3 \n"
+ "uaddl v0.8h, v0.8b, v4.8b \n"
- // d4 = xx 20 xx 21 xx 22 xx 23
- // d5 = xx 30 xx 31 xx 32 xx 33
- "vtrn.u16 d2, d3 \n"
+ // xx 20 xx 21 xx 22 xx 23
+ // xx 30 xx 31 xx 32 xx 33
+ "trn1 v1.8h, v0.8h, v0.8h \n"
+ "trn2 v4.8h, v0.8h, v0.8h \n"
+ "xtn v0.4h, v1.4s \n"
+ "xtn v4.4h, v4.4s \n"
// 0+1+2, 3+4+5
- "vadd.u16 q0, q1 \n"
+ "add v16.8h, v16.8h, v0.8h \n"
+ "add v17.8h, v17.8h, v4.8h \n"
    // Need to divide, but can't downshift as the value
// isn't a power of 2. So multiply by 65536 / n
// and take the upper 16 bits.
- "vqrdmulh.s16 q0, q0, q13 \n"
+ "sqrdmulh v0.8h, v16.8h, v30.8h \n"
+ "sqrdmulh v1.8h, v17.8h, v30.8h \n"
    // Align for table lookup; vtbl requires registers to
    //  be adjacent.
- "vmov.u8 d2, d4 \n"
- "vtbl.u8 d3, {d0, d1, d2}, d28 \n"
- "vtbl.u8 d4, {d0, d1, d2}, d29 \n"
+ "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v31.16b \n"
MEMACCESS(1)
- "vst1.8 {d3}, [%1]! \n"
+ "st1 {v3.8b}, [%1], #8 \n"
MEMACCESS(1)
- "vst1.32 {d4[0]}, [%1]! \n"
- "bgt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width), // %2
- "+r"(src_stride) // %3
- : "r"(&kMult38_Div6), // %4
- "r"(&kShuf38_2) // %5
- : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc"
+ "st1 {v3.s}[2], [%1], #4 \n"
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(tmp_src_stride), // %2
+ "+r"(dst_width) // %3
+ : "r"(&kMult38_Div6), // %4
+ "r"(&kShuf38_2) // %5
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17",
+ "v18", "v19", "v30", "v31", "memory", "cc"
);
}
-#endif //HAS_SCALEROWDOWN38_NEON
-#if 0
// 16x2 -> 16x1
void ScaleFilterRows_NEON(uint8* dst_ptr,
const uint8* src_ptr, ptrdiff_t src_stride,
int dst_width, int source_y_fraction) {
+ int y_fraction = 256 - source_y_fraction;
asm volatile (
"cmp %4, #0 \n"
- "beq 100f \n"
- "add %2, %1 \n"
+ "b.eq 100f \n"
+ "add %2, %2, %1 \n"
"cmp %4, #64 \n"
- "beq 75f \n"
+ "b.eq 75f \n"
"cmp %4, #128 \n"
- "beq 50f \n"
+ "b.eq 50f \n"
"cmp %4, #192 \n"
- "beq 25f \n"
+ "b.eq 25f \n"
- "vdup.8 d5, %4 \n"
- "rsb %4, #256 \n"
- "vdup.8 d4, %4 \n"
+ "dup v5.8b, %w4 \n"
+ "dup v4.8b, %w5 \n"
// General purpose row blend.
"1: \n"
MEMACCESS(1)
- "vld1.8 {q0}, [%1]! \n"
+ "ld1 {v0.16b}, [%1], #16 \n"
MEMACCESS(2)
- "vld1.8 {q1}, [%2]! \n"
+ "ld1 {v1.16b}, [%2], #16 \n"
"subs %3, %3, #16 \n"
- "vmull.u8 q13, d0, d4 \n"
- "vmull.u8 q14, d1, d4 \n"
- "vmlal.u8 q13, d2, d5 \n"
- "vmlal.u8 q14, d3, d5 \n"
- "vrshrn.u16 d0, q13, #8 \n"
- "vrshrn.u16 d1, q14, #8 \n"
+ "umull v6.8h, v0.8b, v4.8b \n"
+ "umull2 v7.8h, v0.16b, v4.16b \n"
+ "umlal v6.8h, v1.8b, v5.8b \n"
+ "umlal2 v7.8h, v1.16b, v5.16b \n"
+ "rshrn v0.8b, v6.8h, #8 \n"
+ "rshrn2 v0.16b, v7.8h, #8 \n"
MEMACCESS(0)
- "vst1.8 {q0}, [%0]! \n"
- "bgt 1b \n"
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 1b \n"
"b 99f \n"
// Blend 25 / 75.
"25: \n"
MEMACCESS(1)
- "vld1.8 {q0}, [%1]! \n"
+ "ld1 {v0.16b}, [%1], #16 \n"
MEMACCESS(2)
- "vld1.8 {q1}, [%2]! \n"
+ "ld1 {v1.16b}, [%2], #16 \n"
"subs %3, %3, #16 \n"
- "vrhadd.u8 q0, q1 \n"
- "vrhadd.u8 q0, q1 \n"
+ "urhadd v0.16b, v0.16b, v1.16b \n"
+ "urhadd v0.16b, v0.16b, v1.16b \n"
MEMACCESS(0)
- "vst1.8 {q0}, [%0]! \n"
- "bgt 25b \n"
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 25b \n"
"b 99f \n"
// Blend 50 / 50.
"50: \n"
MEMACCESS(1)
- "vld1.8 {q0}, [%1]! \n"
+ "ld1 {v0.16b}, [%1], #16 \n"
MEMACCESS(2)
- "vld1.8 {q1}, [%2]! \n"
+ "ld1 {v1.16b}, [%2], #16 \n"
"subs %3, %3, #16 \n"
- "vrhadd.u8 q0, q1 \n"
+ "urhadd v0.16b, v0.16b, v1.16b \n"
MEMACCESS(0)
- "vst1.8 {q0}, [%0]! \n"
- "bgt 50b \n"
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 50b \n"
"b 99f \n"
// Blend 75 / 25.
"75: \n"
MEMACCESS(1)
- "vld1.8 {q1}, [%1]! \n"
+ "ld1 {v1.16b}, [%1], #16 \n"
MEMACCESS(2)
- "vld1.8 {q0}, [%2]! \n"
+ "ld1 {v0.16b}, [%2], #16 \n"
"subs %3, %3, #16 \n"
- "vrhadd.u8 q0, q1 \n"
- "vrhadd.u8 q0, q1 \n"
+ "urhadd v0.16b, v0.16b, v1.16b \n"
+ "urhadd v0.16b, v0.16b, v1.16b \n"
MEMACCESS(0)
- "vst1.8 {q0}, [%0]! \n"
- "bgt 75b \n"
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 75b \n"
"b 99f \n"
// Blend 100 / 0 - Copy row unchanged.
"100: \n"
MEMACCESS(1)
- "vld1.8 {q0}, [%1]! \n"
+ "ld1 {v0.16b}, [%1], #16 \n"
"subs %3, %3, #16 \n"
MEMACCESS(0)
- "vst1.8 {q0}, [%0]! \n"
- "bgt 100b \n"
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 100b \n"
"99: \n"
MEMACCESS(0)
- "vst1.8 {d1[7]}, [%0] \n"
+ "st1 {v0.b}[15], [%0] \n"
: "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1
"+r"(src_stride), // %2
"+r"(dst_width), // %3
- "+r"(source_y_fraction) // %4
+ "+r"(source_y_fraction),// %4
+ "+r"(y_fraction) // %5
:
- : "q0", "q1", "d4", "d5", "q13", "q14", "memory", "cc"
+ : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "memory", "cc"
);
}
-#endif //0
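The fraction fast paths above replace the general weighted blend with rounding halving adds. A C model of the cases, where r0/r1 are corresponding bytes of the two source rows and f is source_y_fraction:

static unsigned char rhadd8(unsigned char a, unsigned char b) {
  return (unsigned char)((a + b + 1) >> 1);  /* urhadd */
}
/* 50/50 (f == 128): rhadd8(r0, r1)             ~= (r0 + r1) / 2
 * 25/75 (f == 192): rhadd8(rhadd8(r0, r1), r1) ~= (r0 + 3 * r1) / 4
 * general:          (r0 * (256 - f) + r1 * f + 128) >> 8,
 *                   done with umull/umlal and rshrn above. */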
-#ifdef HAS_SCALEARGBROWDOWN2_NEON
void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
asm volatile (
- ".p2align 2 \n"
"1: \n"
// load even pixels into q0, odd into q1
- MEMACCESS(0)
- "vld2.32 {q0, q1}, [%0]! \n"
- MEMACCESS(0)
- "vld2.32 {q2, q3}, [%0]! \n"
+ MEMACCESS(0)
+ "ld2 {v0.4s, v1.4s}, [%0], #32 \n"
+ MEMACCESS(0)
+ "ld2 {v2.4s, v3.4s}, [%0], #32 \n"
"subs %2, %2, #8 \n" // 8 processed per loop
- MEMACCESS(1)
- "vst1.8 {q1}, [%1]! \n" // store odd pixels
- MEMACCESS(1)
- "vst1.8 {q3}, [%1]! \n"
- "bgt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst), // %1
- "+r"(dst_width) // %2
+ MEMACCESS(1)
+ "st1 {v1.16b}, [%1], #16 \n" // store odd pixels
+ MEMACCESS(1)
+ "st1 {v3.16b}, [%1], #16 \n"
+ "b.gt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst), // %1
+ "+r"(dst_width) // %2
:
- : "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List
+ : "memory", "cc", "v0", "v1", "v2", "v3" // Clobber List
);
}
-#endif //HAS_SCALEARGBROWDOWN2_NEON
-#ifdef HAS_SCALEARGBROWDOWN2_NEON
void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
asm volatile (
// change the stride to row 2 pointer
"add %1, %1, %0 \n"
- ".p2align 2 \n"
"1: \n"
- MEMACCESS(0)
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
- MEMACCESS(0)
- "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
+ MEMACCESS(0)
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 8 ARGB pixels.
"subs %3, %3, #8 \n" // 8 processed per loop.
- "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
- "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
- "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
- "vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts.
- MEMACCESS(1)
- "vld4.8 {d16, d18, d20, d22}, [%1]! \n" // load 8 more ARGB pixels.
- MEMACCESS(1)
- "vld4.8 {d17, d19, d21, d23}, [%1]! \n" // load last 8 ARGB pixels.
- "vpadal.u8 q0, q8 \n" // B 16 bytes -> 8 shorts.
- "vpadal.u8 q1, q9 \n" // G 16 bytes -> 8 shorts.
- "vpadal.u8 q2, q10 \n" // R 16 bytes -> 8 shorts.
- "vpadal.u8 q3, q11 \n" // A 16 bytes -> 8 shorts.
- "vrshrn.u16 d0, q0, #2 \n" // downshift, round and pack
- "vrshrn.u16 d1, q1, #2 \n"
- "vrshrn.u16 d2, q2, #2 \n"
- "vrshrn.u16 d3, q3, #2 \n"
- MEMACCESS(2)
- "vst4.8 {d0, d1, d2, d3}, [%2]! \n"
- "bgt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(src_stride), // %1
- "+r"(dst), // %2
- "+r"(dst_width) // %3
+ "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
+ "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
+ "uaddlp v3.8h, v3.16b \n" // A 16 bytes -> 8 shorts.
+ MEMACCESS(1)
+ "ld4 {v16.16b,v17.16b,v18.16b,v19.16b}, [%1], #64 \n" // load 8 more ARGB pixels.
+ "uadalp v0.8h, v16.16b \n" // B 16 bytes -> 8 shorts.
+ "uadalp v1.8h, v17.16b \n" // G 16 bytes -> 8 shorts.
+ "uadalp v2.8h, v18.16b \n" // R 16 bytes -> 8 shorts.
+ "uadalp v3.8h, v19.16b \n" // A 16 bytes -> 8 shorts.
+ "rshrn v0.8b, v0.8h, #2 \n" // downshift, round and pack
+ "rshrn v1.8b, v1.8h, #2 \n"
+ "rshrn v2.8b, v2.8h, #2 \n"
+ "rshrn v3.8b, v3.8h, #2 \n"
+ MEMACCESS(2)
+ "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"
+ "b.gt 1b \n"
+ : "+r" (src_ptr), // %0
+ "+r" (src_stride), // %1
+ "+r" (dst), // %2
+ "+r" (dst_width) // %3
:
- : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
+ : "memory", "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19"
);
}
-#endif //HAS_SCALEARGBROWDOWN2_NEON
-#ifdef HAS_SCALEARGBROWDOWNEVEN_NEON
// Reads 4 pixels at a time.
// Alignment requirement: src_argb 4-byte aligned.
void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride,
int src_stepx, uint8* dst_argb, int dst_width) {
asm volatile (
- "mov r12, %3, lsl #2 \n"
- ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "vld1.32 {d0[0]}, [%0], r12 \n"
+ "ld1 {v0.s}[0], [%0], %3 \n"
MEMACCESS(0)
- "vld1.32 {d0[1]}, [%0], r12 \n"
+ "ld1 {v0.s}[1], [%0], %3 \n"
MEMACCESS(0)
- "vld1.32 {d1[0]}, [%0], r12 \n"
+ "ld1 {v0.s}[2], [%0], %3 \n"
MEMACCESS(0)
- "vld1.32 {d1[1]}, [%0], r12 \n"
+ "ld1 {v0.s}[3], [%0], %3 \n"
"subs %2, %2, #4 \n" // 4 pixels per loop.
MEMACCESS(1)
- "vst1.8 {q0}, [%1]! \n"
- "bgt 1b \n"
+ "st1 {v0.16b}, [%1], #16 \n"
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(dst_width) // %2
- : "r"(src_stepx) // %3
- : "memory", "cc", "r12", "q0"
+ : "r"(static_cast<ptrdiff_t>(src_stepx * 4)) // %3
+ : "memory", "cc", "v0"
);
}
-#endif //HAS_SCALEARGBROWDOWNEVEN_NEON
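A simplified C model of the even down-sampler above: point-sample one ARGB pixel (4 bytes) every src_stepx pixels, which is why the asm passes src_stepx * 4 as the post-increment. The uint32 access relies on the 4-byte alignment documented above.

typedef unsigned int uint32;  /* one ARGB pixel */

void ScaleARGBRowDownEven_C_ref(const unsigned char* src_argb, int src_stepx,
                                unsigned char* dst_argb, int dst_width) {
  const uint32* src = (const uint32*)src_argb;
  uint32* dst = (uint32*)dst_argb;
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst[x] = src[x * src_stepx];
  }
}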
-#ifdef HAS_SCALEARGBROWDOWNEVEN_NEON
// Reads 4 pixels at a time.
// Alignment requirement: src_argb 4-byte aligned.
+// TODO: might be worth another optimization pass in the future.
+// It could be upgraded to 8 pixels at a time to start with.
void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
int src_stepx,
uint8* dst_argb, int dst_width) {
asm volatile (
- "mov r12, %4, lsl #2 \n"
"add %1, %1, %0 \n"
- ".p2align 2 \n"
"1: \n"
MEMACCESS(0)
- "vld1.8 {d0}, [%0], r12 \n" // Read 4 2x2 blocks -> 2x1
+ "ld1 {v0.8b}, [%0], %4 \n" // Read 4 2x2 blocks -> 2x1
MEMACCESS(1)
- "vld1.8 {d1}, [%1], r12 \n"
+ "ld1 {v1.8b}, [%1], %4 \n"
MEMACCESS(0)
- "vld1.8 {d2}, [%0], r12 \n"
+ "ld1 {v2.8b}, [%0], %4 \n"
MEMACCESS(1)
- "vld1.8 {d3}, [%1], r12 \n"
+ "ld1 {v3.8b}, [%1], %4 \n"
MEMACCESS(0)
- "vld1.8 {d4}, [%0], r12 \n"
+ "ld1 {v4.8b}, [%0], %4 \n"
MEMACCESS(1)
- "vld1.8 {d5}, [%1], r12 \n"
+ "ld1 {v5.8b}, [%1], %4 \n"
MEMACCESS(0)
- "vld1.8 {d6}, [%0], r12 \n"
+ "ld1 {v6.8b}, [%0], %4 \n"
MEMACCESS(1)
- "vld1.8 {d7}, [%1], r12 \n"
- "vaddl.u8 q0, d0, d1 \n"
- "vaddl.u8 q1, d2, d3 \n"
- "vaddl.u8 q2, d4, d5 \n"
- "vaddl.u8 q3, d6, d7 \n"
- "vswp.8 d1, d2 \n" // ab_cd -> ac_bd
- "vswp.8 d5, d6 \n" // ef_gh -> eg_fh
- "vadd.u16 q0, q0, q1 \n" // (a+b)_(c+d)
- "vadd.u16 q2, q2, q3 \n" // (e+f)_(g+h)
- "vrshrn.u16 d0, q0, #2 \n" // first 2 pixels.
- "vrshrn.u16 d1, q2, #2 \n" // next 2 pixels.
+ "ld1 {v7.8b}, [%1], %4 \n"
+ "uaddl v0.8h, v0.8b, v1.8b \n"
+ "uaddl v2.8h, v2.8b, v3.8b \n"
+ "uaddl v4.8h, v4.8b, v5.8b \n"
+ "uaddl v6.8h, v6.8b, v7.8b \n"
+ "mov v16.d[1], v0.d[1] \n" // ab_cd -> ac_bd
+ "mov v0.d[1], v2.d[0] \n"
+ "mov v2.d[0], v16.d[1] \n"
+ "mov v16.d[1], v4.d[1] \n" // ef_gh -> eg_fh
+ "mov v4.d[1], v6.d[0] \n"
+ "mov v6.d[0], v16.d[1] \n"
+ "add v0.8h, v0.8h, v2.8h \n" // (a+b)_(c+d)
+ "add v4.8h, v4.8h, v6.8h \n" // (e+f)_(g+h)
+ "rshrn v0.8b, v0.8h, #2 \n" // first 2 pixels.
+ "rshrn2 v0.16b, v4.8h, #2 \n" // next 2 pixels.
"subs %3, %3, #4 \n" // 4 pixels per loop.
MEMACCESS(2)
- "vst1.8 {q0}, [%2]! \n"
- "bgt 1b \n"
+ "st1 {v0.16b}, [%2], #16 \n"
+ "b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(src_stride), // %1
"+r"(dst_argb), // %2
"+r"(dst_width) // %3
- : "r"(src_stepx) // %4
- : "memory", "cc", "r12", "q0", "q1", "q2", "q3"
+ : "r"(src_stepx * 4) // %4
+ : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
);
}
-#endif // HAS_SCALEARGBROWDOWNEVEN_NEON
-#endif // __aarch64__
+#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
#ifdef __cplusplus
} // extern "C"
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/scale_posix.cc b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/scale_posix.cc
index 352e6678221..bb6e57efe32 100644
--- a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/scale_posix.cc
+++ b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/scale_posix.cc
@@ -101,110 +101,6 @@ void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
asm volatile (
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "psrlw $0x8,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- :
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1"
-#endif
- );
-}
-
-void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- asm volatile (
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $0x8,%%xmm5 \n"
-
- LABELALIGN
- "1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10, 0) ",%%xmm1 \n"
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "psrlw $0x8,%%xmm0 \n"
- "movdqa %%xmm1,%%xmm3 \n"
- "psrlw $0x8,%%xmm1 \n"
- "pand %%xmm5,%%xmm2 \n"
- "pand %%xmm5,%%xmm3 \n"
- "pavgw %%xmm2,%%xmm0 \n"
- "pavgw %%xmm3,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- :
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm5"
-#endif
- );
-}
-
-void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- asm volatile (
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $0x8,%%xmm5 \n"
-
- LABELALIGN
- "1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- MEMOPREG(movdqa,0x00,0,3,1,xmm2) // movdqa (%0,%3,1),%%xmm2
- BUNDLEALIGN
- MEMOPREG(movdqa,0x10,0,3,1,xmm3) // movdqa 0x10(%0,%3,1),%%xmm3
- "lea " MEMLEA(0x20,0) ",%0 \n"
- "pavgb %%xmm2,%%xmm0 \n"
- "pavgb %%xmm3,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "psrlw $0x8,%%xmm0 \n"
- "movdqa %%xmm1,%%xmm3 \n"
- "psrlw $0x8,%%xmm1 \n"
- "pand %%xmm5,%%xmm2 \n"
- "pand %%xmm5,%%xmm3 \n"
- "pavgw %%xmm2,%%xmm0 \n"
- "pavgw %%xmm3,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x10,1) ",%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- : "r"((intptr_t)(src_stride)) // %3
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-#endif
- );
-}
-
-void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- asm volatile (
- LABELALIGN
- "1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
@@ -218,17 +114,12 @@ void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
- :
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1"
-#endif
+ :: "memory", "cc", "xmm0", "xmm1"
);
}
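The hunk above deletes the movdqa (aligned) kernels outright; the body that survives is the former _Unaligned_ version, renamed, since movdqu is safe for any pointer and costs about the same as movdqa on aligned data on current x86 cores. A sketch of the surviving ScaleRowDown2 with SSE2 intrinsics (a model, not the shipped asm; assumes dst_width is a positive multiple of 16 as the jg loop does, with the _Any_ wrapper handling tails):

#include <emmintrin.h>  /* SSE2 */

void ScaleRowDown2_model(const unsigned char* src_ptr,
                         unsigned char* dst_ptr, int dst_width) {
  int x;
  for (x = 0; x < dst_width; x += 16) {
    __m128i a = _mm_loadu_si128((const __m128i*)(src_ptr));       /* movdqu   */
    __m128i b = _mm_loadu_si128((const __m128i*)(src_ptr + 16));
    a = _mm_srli_epi16(a, 8);               /* keep odd bytes      (psrlw)   */
    b = _mm_srli_epi16(b, 8);
    _mm_storeu_si128((__m128i*)(dst_ptr),
                     _mm_packus_epi16(a, b));                     /* packuswb */
    src_ptr += 32;
    dst_ptr += 16;
  }
}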
-void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
+void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
@@ -236,7 +127,7 @@ void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr,
LABELALIGN
"1: \n"
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x10, 0) ",%%xmm1 \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
"movdqa %%xmm0,%%xmm2 \n"
"psrlw $0x8,%%xmm0 \n"
@@ -254,17 +145,12 @@ void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr,
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
- :
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm5"
-#endif
+ :: "memory", "cc", "xmm0", "xmm1", "xmm5"
);
}
-void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
+void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
@@ -274,7 +160,6 @@ void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr,
"movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
MEMOPREG(movdqu,0x00,0,3,1,xmm2) // movdqu (%0,%3,1),%%xmm2
- BUNDLEALIGN
MEMOPREG(movdqu,0x10,0,3,1,xmm3) // movdqu 0x10(%0,%3,1),%%xmm3
"lea " MEMLEA(0x20,0) ",%0 \n"
"pavgb %%xmm2,%%xmm0 \n"
@@ -296,13 +181,8 @@ void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr,
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)) // %3
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-#endif
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
);
}
@@ -315,8 +195,8 @@ void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
"pand %%xmm5,%%xmm0 \n"
"pand %%xmm5,%%xmm1 \n"
@@ -330,11 +210,7 @@ void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
- :
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm5"
-#endif
+ :: "memory", "cc", "xmm0", "xmm1", "xmm5"
);
}
@@ -348,18 +224,16 @@ void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- MEMOPREG(movdqa,0x00,0,4,1,xmm2) // movdqa (%0,%4,1),%%xmm2
- BUNDLEALIGN
- MEMOPREG(movdqa,0x10,0,4,1,xmm3) // movdqa 0x10(%0,%4,1),%%xmm3
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2
+ MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3
"pavgb %%xmm2,%%xmm0 \n"
"pavgb %%xmm3,%%xmm1 \n"
- MEMOPREG(movdqa,0x00,0,4,2,xmm2) // movdqa (%0,%4,2),%%xmm2
- BUNDLEALIGN
- MEMOPREG(movdqa,0x10,0,4,2,xmm3) // movdqa 0x10(%0,%4,2),%%xmm3
- MEMOPREG(movdqa,0x00,0,3,1,xmm4) // movdqa (%0,%3,1),%%xmm4
- MEMOPREG(movdqa,0x10,0,3,1,xmm5) // movdqa 0x10(%0,%3,1),%%xmm5
+ MEMOPREG(movdqu,0x00,0,4,2,xmm2) // movdqu (%0,%4,2),%%xmm2
+ MEMOPREG(movdqu,0x10,0,4,2,xmm3) // movdqu 0x10(%0,%4,2),%%xmm3
+ MEMOPREG(movdqu,0x00,0,3,1,xmm4) // movdqu (%0,%3,1),%%xmm4
+ MEMOPREG(movdqu,0x10,0,3,1,xmm5) // movdqu 0x10(%0,%3,1),%%xmm5
"lea " MEMLEA(0x20,0) ",%0 \n"
"pavgb %%xmm4,%%xmm2 \n"
"pavgb %%xmm2,%%xmm0 \n"
@@ -388,13 +262,8 @@ void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
"+r"(dst_width), // %2
"+r"(stridex3) // %3
: "r"((intptr_t)(src_stride)) // %4
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm7"
-#endif
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm7"
);
}
@@ -412,8 +281,8 @@ void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
asm volatile (
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm2 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm2 \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
"movdqa %%xmm2,%%xmm1 \n"
"palignr $0x8,%%xmm0,%%xmm1 \n"
@@ -429,11 +298,7 @@ void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
- :
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
+ :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
);
}
@@ -461,8 +326,8 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
asm volatile (
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm6 \n"
- MEMOPREG(movdqa,0x00,0,3,1,xmm7) // movdqa (%0,%3),%%xmm7
+ "movdqu " MEMACCESS(0) ",%%xmm6 \n"
+ MEMOPREG(movdqu,0x00,0,3,1,xmm7) // movdqu (%0,%3),%%xmm7
"pavgb %%xmm7,%%xmm6 \n"
"pshufb %%xmm2,%%xmm6 \n"
"pmaddubsw %%xmm5,%%xmm6 \n"
@@ -479,9 +344,8 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
"psrlw $0x2,%%xmm6 \n"
"packuswb %%xmm6,%%xmm6 \n"
"movq %%xmm6," MEMACCESS2(0x8,1) " \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm6 \n"
- BUNDLEALIGN
- MEMOPREG(movdqa,0x10,0,3,1,xmm7) // movdqa 0x10(%0,%3),%%xmm7
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm6 \n"
+ MEMOPREG(movdqu,0x10,0,3,1,xmm7) // movdqu 0x10(%0,%3),%%xmm7
"lea " MEMLEA(0x20,0) ",%0 \n"
"pavgb %%xmm7,%%xmm6 \n"
"pshufb %%xmm4,%%xmm6 \n"
@@ -498,13 +362,8 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
"+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)), // %3
"m"(kMadd21) // %4
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-#endif
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
);
}
@@ -533,8 +392,8 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
asm volatile (
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm6 \n"
- MEMOPREG(movdqa,0x00,0,3,1,xmm7) // movdqa (%0,%3,1),%%xmm7
+ "movdqu " MEMACCESS(0) ",%%xmm6 \n"
+ MEMOPREG(movdqu,0x00,0,3,1,xmm7) // movdqu (%0,%3,1),%%xmm7
"pavgb %%xmm6,%%xmm7 \n"
"pavgb %%xmm7,%%xmm6 \n"
"pshufb %%xmm2,%%xmm6 \n"
@@ -553,8 +412,8 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
"psrlw $0x2,%%xmm6 \n"
"packuswb %%xmm6,%%xmm6 \n"
"movq %%xmm6," MEMACCESS2(0x8,1) " \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm6 \n"
- MEMOPREG(movdqa,0x10,0,3,1,xmm7) // movdqa 0x10(%0,%3,1),%%xmm7
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm6 \n"
+ MEMOPREG(movdqu,0x10,0,3,1,xmm7) // movdqu 0x10(%0,%3,1),%%xmm7
"lea " MEMLEA(0x20,0) ",%0 \n"
"pavgb %%xmm6,%%xmm7 \n"
"pavgb %%xmm7,%%xmm6 \n"
@@ -572,13 +431,8 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
"+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)), // %3
"m"(kMadd21) // %4
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-#endif
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
);
}
@@ -590,8 +444,8 @@ void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
"pshufb %%xmm4,%%xmm0 \n"
"pshufb %%xmm5,%%xmm1 \n"
@@ -607,10 +461,7 @@ void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
"+r"(dst_width) // %2
: "m"(kShuf38a), // %3
"m"(kShuf38b) // %4
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm4", "xmm5"
-#endif
+ : "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5"
);
}
@@ -631,9 +482,10 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
asm volatile (
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- MEMOPREG(pavgb,0x00,0,3,1,xmm0) // pavgb (%0,%3,1),%%xmm0
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ MEMOPREG(movdqu,0x00,0,3,1,xmm1) // movdqu (%0,%3,1),%%xmm1
"lea " MEMLEA(0x10,0) ",%0 \n"
+ "pavgb %%xmm1,%%xmm0 \n"
"movdqa %%xmm0,%%xmm1 \n"
"pshufb %%xmm2,%%xmm1 \n"
"movdqa %%xmm0,%%xmm6 \n"
@@ -643,23 +495,18 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
"paddusw %%xmm0,%%xmm1 \n"
"pmulhuw %%xmm5,%%xmm1 \n"
"packuswb %%xmm1,%%xmm1 \n"
- "sub $0x6,%2 \n"
"movd %%xmm1," MEMACCESS(1) " \n"
"psrlq $0x10,%%xmm1 \n"
"movd %%xmm1," MEMACCESS2(0x2,1) " \n"
"lea " MEMLEA(0x6,1) ",%1 \n"
+ "sub $0x6,%2 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)) // %3
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
-#endif
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
);
}
@@ -679,8 +526,8 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
asm volatile (
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- MEMOPREG(movdqa,0x00,0,3,1,xmm6) // movdqa (%0,%3,1),%%xmm6
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ MEMOPREG(movdqu,0x00,0,3,1,xmm6) // movdqu (%0,%3,1),%%xmm6
"movhlps %%xmm0,%%xmm1 \n"
"movhlps %%xmm6,%%xmm7 \n"
"punpcklbw %%xmm5,%%xmm0 \n"
@@ -689,7 +536,7 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
"punpcklbw %%xmm5,%%xmm7 \n"
"paddusw %%xmm6,%%xmm0 \n"
"paddusw %%xmm7,%%xmm1 \n"
- MEMOPREG(movdqa,0x00,0,3,2,xmm6) // movdqa (%0,%3,2),%%xmm6
+ MEMOPREG(movdqu,0x00,0,3,2,xmm6) // movdqu (%0,%3,2),%%xmm6
"lea " MEMLEA(0x10,0) ",%0 \n"
"movhlps %%xmm6,%%xmm7 \n"
"punpcklbw %%xmm5,%%xmm6 \n"
@@ -711,23 +558,18 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
"paddusw %%xmm7,%%xmm6 \n"
"pmulhuw %%xmm4,%%xmm6 \n"
"packuswb %%xmm6,%%xmm6 \n"
- "sub $0x6,%2 \n"
"movd %%xmm6," MEMACCESS(1) " \n"
"psrlq $0x10,%%xmm6 \n"
"movd %%xmm6," MEMACCESS2(0x2,1) " \n"
"lea " MEMLEA(0x6,1) ",%1 \n"
+ "sub $0x6,%2 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)) // %3
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-#endif
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
);
}
@@ -741,7 +583,7 @@ void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
"mov %0,%3 \n"
"add %6,%0 \n"
"movdqa %%xmm0,%%xmm1 \n"
@@ -753,7 +595,7 @@ void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
LABELALIGN
"2: \n"
- "movdqa " MEMACCESS(0) ",%%xmm2 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm2 \n"
"add %6,%0 \n"
"movdqa %%xmm2,%%xmm3 \n"
"punpcklbw %%xmm4,%%xmm2 \n"
@@ -765,8 +607,8 @@ void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
LABELALIGN
"3: \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
- "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
"lea " MEMLEA(0x10,3) ",%0 \n"
"lea " MEMLEA(0x20,1) ",%1 \n"
"sub $0x10,%4 \n"
@@ -778,10 +620,7 @@ void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
"+r"(src_width), // %4
"+rm"(src_height) // %5
: "rm"((intptr_t)(src_stride)) // %6
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
-#endif
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
);
}
@@ -813,7 +652,6 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
MEMOPARG(movzwl,0x00,1,3,1,k2) // movzwl (%1,%3,1),%k2
"movd %k2,%%xmm0 \n"
"psrlw $0x9,%%xmm1 \n"
- BUNDLEALIGN
MEMOPARG(movzwl,0x00,1,4,1,k2) // movzwl (%1,%4,1),%k2
"movd %k2,%%xmm4 \n"
"pshufb %%xmm5,%%xmm1 \n"
@@ -853,13 +691,8 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
"+rm"(dst_width) // %5
: "rm"(x), // %6
"rm"(dx) // %7
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
-#endif
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
);
}
@@ -870,25 +703,21 @@ void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
asm volatile (
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(1) ",%%xmm0 \n"
+ "movdqu " MEMACCESS(1) ",%%xmm0 \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
"movdqa %%xmm0,%%xmm1 \n"
"punpcklbw %%xmm0,%%xmm0 \n"
"punpckhbw %%xmm1,%%xmm1 \n"
- "sub $0x20,%2 \n"
- "movdqa %%xmm0," MEMACCESS(0) " \n"
- "movdqa %%xmm1," MEMACCESS2(0x10,0) " \n"
+ "movdqu %%xmm0," MEMACCESS(0) " \n"
+ "movdqu %%xmm1," MEMACCESS2(0x10,0) " \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
+ "sub $0x20,%2 \n"
"jg 1b \n"
: "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1
"+r"(dst_width) // %2
- :
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1"
-#endif
+ :: "memory", "cc", "xmm0", "xmm1"
);
}
@@ -898,22 +727,18 @@ void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
asm volatile (
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
"shufps $0xdd,%%xmm1,%%xmm0 \n"
- "sub $0x4,%2 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x4,%2 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(dst_width) // %2
- :
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1"
-#endif
+ :: "memory", "cc", "xmm0", "xmm1"
);
}
@@ -923,25 +748,21 @@ void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
asm volatile (
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
"movdqa %%xmm0,%%xmm2 \n"
"shufps $0x88,%%xmm1,%%xmm0 \n"
"shufps $0xdd,%%xmm1,%%xmm2 \n"
"pavgb %%xmm2,%%xmm0 \n"
- "sub $0x4,%2 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x4,%2 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(dst_width) // %2
- :
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1"
-#endif
+ :: "memory", "cc", "xmm0", "xmm1"
);
}
@@ -951,11 +772,10 @@ void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
asm volatile (
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
- BUNDLEALIGN
- MEMOPREG(movdqa,0x00,0,3,1,xmm2) // movdqa (%0,%3,1),%%xmm2
- MEMOPREG(movdqa,0x10,0,3,1,xmm3) // movdqa 0x10(%0,%3,1),%%xmm3
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ MEMOPREG(movdqu,0x00,0,3,1,xmm2) // movdqu (%0,%3,1),%%xmm2
+ MEMOPREG(movdqu,0x10,0,3,1,xmm3) // movdqu 0x10(%0,%3,1),%%xmm3
"lea " MEMLEA(0x20,0) ",%0 \n"
"pavgb %%xmm2,%%xmm0 \n"
"pavgb %%xmm3,%%xmm1 \n"
@@ -963,21 +783,16 @@ void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
"shufps $0x88,%%xmm1,%%xmm0 \n"
"shufps $0xdd,%%xmm1,%%xmm2 \n"
"pavgb %%xmm2,%%xmm0 \n"
- "sub $0x4,%2 \n"
- "movdqa %%xmm0," MEMACCESS(1) " \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x4,%2 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)) // %3
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3"
-#endif
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3"
);
}
@@ -996,29 +811,22 @@ void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
"movd " MEMACCESS(0) ",%%xmm0 \n"
MEMOPREG(movd,0x00,0,1,1,xmm1) // movd (%0,%1,1),%%xmm1
"punpckldq %%xmm1,%%xmm0 \n"
- BUNDLEALIGN
MEMOPREG(movd,0x00,0,1,2,xmm2) // movd (%0,%1,2),%%xmm2
MEMOPREG(movd,0x00,0,4,1,xmm3) // movd (%0,%4,1),%%xmm3
"lea " MEMLEA4(0x00,0,1,4) ",%0 \n"
"punpckldq %%xmm3,%%xmm2 \n"
"punpcklqdq %%xmm2,%%xmm0 \n"
- "sub $0x4,%3 \n"
- "movdqa %%xmm0," MEMACCESS(2) " \n"
+ "movdqu %%xmm0," MEMACCESS(2) " \n"
"lea " MEMLEA(0x10,2) ",%2 \n"
+ "sub $0x4,%3 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(src_stepx_x4), // %1
"+r"(dst_argb), // %2
"+r"(dst_width), // %3
"+r"(src_stepx_x12) // %4
- :
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3"
-#endif
+ :: "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3"
);
}
@@ -1040,11 +848,9 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
"movq " MEMACCESS(0) ",%%xmm0 \n"
MEMOPREG(movhps,0x00,0,1,1,xmm0) // movhps (%0,%1,1),%%xmm0
MEMOPREG(movq,0x00,0,1,2,xmm1) // movq (%0,%1,2),%%xmm1
- BUNDLEALIGN
MEMOPREG(movhps,0x00,0,4,1,xmm1) // movhps (%0,%4,1),%%xmm1
"lea " MEMLEA4(0x00,0,1,4) ",%0 \n"
"movq " MEMACCESS(5) ",%%xmm2 \n"
- BUNDLEALIGN
MEMOPREG(movhps,0x00,5,1,1,xmm2) // movhps (%5,%1,1),%%xmm2
MEMOPREG(movq,0x00,5,1,2,xmm3) // movq (%5,%1,2),%%xmm3
MEMOPREG(movhps,0x00,5,4,1,xmm3) // movhps (%5,%4,1),%%xmm3
@@ -1055,9 +861,9 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
"shufps $0x88,%%xmm1,%%xmm0 \n"
"shufps $0xdd,%%xmm1,%%xmm2 \n"
"pavgb %%xmm2,%%xmm0 \n"
- "sub $0x4,%3 \n"
- "movdqa %%xmm0," MEMACCESS(2) " \n"
+ "movdqu %%xmm0," MEMACCESS(2) " \n"
"lea " MEMLEA(0x10,2) ",%2 \n"
+ "sub $0x4,%3 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(src_stepx_x4), // %1
@@ -1065,14 +871,8 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
"+rm"(dst_width), // %3
"+r"(src_stepx_x12), // %4
"+r"(row1) // %5
- :
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3"
-#endif
+ :: "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3"
);
}
@@ -1111,15 +911,14 @@ void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
"pextrw $0x3,%%xmm2,%k1 \n"
"punpckldq %%xmm4,%%xmm1 \n"
"punpcklqdq %%xmm1,%%xmm0 \n"
- "sub $0x4,%4 \n"
"movdqu %%xmm0," MEMACCESS(2) " \n"
"lea " MEMLEA(0x10,2) ",%2 \n"
+ "sub $0x4,%4 \n"
"jge 40b \n"
"49: \n"
"test $0x2,%4 \n"
"je 29f \n"
- BUNDLEALIGN
MEMOPREG(movd,0x00,3,0,4,xmm0) // movd (%3,%0,4),%%xmm0
MEMOPREG(movd,0x00,3,1,4,xmm1) // movd (%3,%1,4),%%xmm1
"pextrw $0x5,%%xmm2,%k0 \n"
@@ -1139,13 +938,8 @@ void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
"+r"(dst_width) // %4
: "rm"(x), // %5
"rm"(dx) // %6
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
-#endif
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
);
}
@@ -1156,28 +950,22 @@ void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
asm volatile (
LABELALIGN
"1: \n"
- "movdqa " MEMACCESS(1) ",%%xmm0 \n"
+ "movdqu " MEMACCESS(1) ",%%xmm0 \n"
"lea " MEMLEA(0x10,1) ",%1 \n"
"movdqa %%xmm0,%%xmm1 \n"
"punpckldq %%xmm0,%%xmm0 \n"
"punpckhdq %%xmm1,%%xmm1 \n"
- "sub $0x8,%2 \n"
- "movdqa %%xmm0," MEMACCESS(0) " \n"
- "movdqa %%xmm1," MEMACCESS2(0x10,0) " \n"
+ "movdqu %%xmm0," MEMACCESS(0) " \n"
+ "movdqu %%xmm1," MEMACCESS2(0x10,0) " \n"
"lea " MEMLEA(0x20,0) ",%0 \n"
+ "sub $0x8,%2 \n"
"jg 1b \n"
: "+r"(dst_argb), // %0
"+r"(src_argb), // %1
"+r"(dst_width) // %2
- :
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1"
-#endif
+ :: "memory", "cc", NACL_R14
+ "xmm0", "xmm1"
);
}
@@ -1225,7 +1013,6 @@ void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
"paddd %%xmm3,%%xmm2 \n"
MEMOPREG(movq,0x00,1,3,4,xmm0) // movq (%1,%3,4),%%xmm0
"psrlw $0x9,%%xmm1 \n"
- BUNDLEALIGN
MEMOPREG(movhps,0x00,1,4,4,xmm0) // movhps (%1,%4,4),%%xmm0
"pshufb %%xmm5,%%xmm1 \n"
"pshufb %%xmm4,%%xmm0 \n"
@@ -1245,7 +1032,6 @@ void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
"add $0x1,%2 \n"
"jl 99f \n"
"psrlw $0x9,%%xmm2 \n"
- BUNDLEALIGN
MEMOPREG(movq,0x00,1,3,4,xmm0) // movq (%1,%3,4),%%xmm0
"pshufb %%xmm5,%%xmm2 \n"
"pshufb %%xmm4,%%xmm0 \n"
@@ -1264,13 +1050,8 @@ void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
"+r"(x1) // %4
: "rm"(x), // %5
"rm"(dx) // %6
- : "memory", "cc"
-#if defined(__native_client__) && defined(__x86_64__)
- , "r14"
-#endif
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
-#endif
+ : "memory", "cc", NACL_R14
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
);
}
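
The pattern running through the scale_gcc.cc hunks above: every aligned movdqa load and store becomes an unaligned movdqu, so the row functions no longer require 16-byte-aligned pointers or strides; the BUNDLEALIGN markers and the open-coded __native_client__ r14 clobbers collapse into the NACL_R14 macro; and the #if defined(__SSE2__) guards around the xmm clobber lists are dropped in favor of always naming the registers. A minimal SSE2 intrinsics sketch, not from the patch, of the aligned/unaligned distinction being removed:

    #include <emmintrin.h>
    #include <stdint.h>

    /* movdqa (_mm_load_si128) faults unless p is 16-byte aligned;
     * movdqu (_mm_loadu_si128) accepts any address, which is what the
     * rewritten row functions rely on. */
    static __m128i load_16_bytes(const uint8_t *p) {
      if (((uintptr_t)p & 15) == 0)
        return _mm_load_si128((const __m128i *)p);   /* movdqa */
      return _mm_loadu_si128((const __m128i *)p);    /* movdqu */
    }

On SSE2-era hardware the aligned form was enough faster that libyuv kept separate _Unaligned variants; on current cores movdqu on aligned data costs essentially the same, so a single unaligned path can serve both, and the scale_win.cc diff below deletes the _Unaligned twins outright.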
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/scale_win.cc b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/scale_win.cc
index 840b9738da5..e0209cdec8c 100644
--- a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/scale_win.cc
+++ b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/scale_win.cc
@@ -103,118 +103,6 @@ void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
mov edx, [esp + 12] // dst_ptr
mov ecx, [esp + 16] // dst_width
- align 4
- wloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- lea eax, [eax + 32]
- psrlw xmm0, 8 // isolate odd pixels.
- psrlw xmm1, 8
- packuswb xmm0, xmm1
- sub ecx, 16
- movdqa [edx], xmm0
- lea edx, [edx + 16]
- jg wloop
-
- ret
- }
-}
-
-// Blends 32x1 rectangle to 16x1.
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
-__declspec(naked) __declspec(align(16))
-void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- __asm {
- mov eax, [esp + 4] // src_ptr
- // src_stride
- mov edx, [esp + 12] // dst_ptr
- mov ecx, [esp + 16] // dst_width
- pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
- psrlw xmm5, 8
-
- align 4
- wloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- lea eax, [eax + 32]
-
- movdqa xmm2, xmm0 // average columns (32 to 16 pixels)
- psrlw xmm0, 8
- movdqa xmm3, xmm1
- psrlw xmm1, 8
- pand xmm2, xmm5
- pand xmm3, xmm5
- pavgw xmm0, xmm2
- pavgw xmm1, xmm3
- packuswb xmm0, xmm1
-
- sub ecx, 16
- movdqa [edx], xmm0
- lea edx, [edx + 16]
- jg wloop
-
- ret
- }
-}
-
-// Blends 32x2 rectangle to 16x1.
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
-__declspec(naked) __declspec(align(16))
-void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- __asm {
- push esi
- mov eax, [esp + 4 + 4] // src_ptr
- mov esi, [esp + 4 + 8] // src_stride
- mov edx, [esp + 4 + 12] // dst_ptr
- mov ecx, [esp + 4 + 16] // dst_width
- pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
- psrlw xmm5, 8
-
- align 4
- wloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- movdqa xmm2, [eax + esi]
- movdqa xmm3, [eax + esi + 16]
- lea eax, [eax + 32]
- pavgb xmm0, xmm2 // average rows
- pavgb xmm1, xmm3
-
- movdqa xmm2, xmm0 // average columns (32 to 16 pixels)
- psrlw xmm0, 8
- movdqa xmm3, xmm1
- psrlw xmm1, 8
- pand xmm2, xmm5
- pand xmm3, xmm5
- pavgw xmm0, xmm2
- pavgw xmm1, xmm3
- packuswb xmm0, xmm1
-
- sub ecx, 16
- movdqa [edx], xmm0
- lea edx, [edx + 16]
- jg wloop
-
- pop esi
- ret
- }
-}
-
-// Reads 32 pixels, throws half away and writes 16 pixels.
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
-__declspec(naked) __declspec(align(16))
-void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- __asm {
- mov eax, [esp + 4] // src_ptr
- // src_stride ignored
- mov edx, [esp + 12] // dst_ptr
- mov ecx, [esp + 16] // dst_width
-
- align 4
wloop:
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
@@ -222,9 +110,9 @@ void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr,
psrlw xmm0, 8 // isolate odd pixels.
psrlw xmm1, 8
packuswb xmm0, xmm1
- sub ecx, 16
movdqu [edx], xmm0
lea edx, [edx + 16]
+ sub ecx, 16
jg wloop
ret
@@ -234,9 +122,8 @@ void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr,
// Blends 32x1 rectangle to 16x1.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
__declspec(naked) __declspec(align(16))
-void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
+void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
__asm {
mov eax, [esp + 4] // src_ptr
// src_stride
@@ -245,7 +132,6 @@ void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr,
pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
psrlw xmm5, 8
- align 4
wloop:
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
@@ -261,9 +147,9 @@ void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr,
pavgw xmm1, xmm3
packuswb xmm0, xmm1
- sub ecx, 16
movdqu [edx], xmm0
lea edx, [edx + 16]
+ sub ecx, 16
jg wloop
ret
@@ -273,9 +159,8 @@ void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr,
// Blends 32x2 rectangle to 16x1.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
__declspec(naked) __declspec(align(16))
-void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
+void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
__asm {
push esi
mov eax, [esp + 4 + 4] // src_ptr
@@ -285,7 +170,6 @@ void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr,
pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
psrlw xmm5, 8
- align 4
wloop:
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
@@ -305,9 +189,9 @@ void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr,
pavgw xmm1, xmm3
packuswb xmm0, xmm1
- sub ecx, 16
movdqu [edx], xmm0
lea edx, [edx + 16]
+ sub ecx, 16
jg wloop
pop esi
@@ -329,19 +213,18 @@ void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
psrld xmm5, 24
pslld xmm5, 16
- align 4
wloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
lea eax, [eax + 32]
pand xmm0, xmm5
pand xmm1, xmm5
packuswb xmm0, xmm1
psrlw xmm0, 8
packuswb xmm0, xmm0
- sub ecx, 8
movq qword ptr [edx], xmm0
lea edx, [edx + 8]
+ sub ecx, 8
jg wloop
ret
@@ -364,18 +247,17 @@ void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff
psrlw xmm7, 8
- align 4
wloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- movdqa xmm2, [eax + esi]
- movdqa xmm3, [eax + esi + 16]
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + esi]
+ movdqu xmm3, [eax + esi + 16]
pavgb xmm0, xmm2 // average rows
pavgb xmm1, xmm3
- movdqa xmm2, [eax + esi * 2]
- movdqa xmm3, [eax + esi * 2 + 16]
- movdqa xmm4, [eax + edi]
- movdqa xmm5, [eax + edi + 16]
+ movdqu xmm2, [eax + esi * 2]
+ movdqu xmm3, [eax + esi * 2 + 16]
+ movdqu xmm4, [eax + edi]
+ movdqu xmm5, [eax + edi + 16]
lea eax, [eax + 32]
pavgb xmm2, xmm4
pavgb xmm3, xmm5
@@ -398,9 +280,9 @@ void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
pavgw xmm0, xmm2
packuswb xmm0, xmm0
- sub ecx, 8
movq qword ptr [edx], xmm0
lea edx, [edx + 8]
+ sub ecx, 8
jg wloop
pop edi
@@ -427,10 +309,9 @@ void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
movdqa xmm4, kShuf1
movdqa xmm5, kShuf2
- align 4
wloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
lea eax, [eax + 32]
movdqa xmm2, xmm1
palignr xmm1, xmm0, 8
@@ -481,10 +362,9 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
movdqa xmm6, kMadd11
movdqa xmm7, kRound34
- align 4
wloop:
- movdqa xmm0, [eax] // pixels 0..7
- movdqa xmm1, [eax + esi]
+ movdqu xmm0, [eax] // pixels 0..7
+ movdqu xmm1, [eax + esi]
pavgb xmm0, xmm1
pshufb xmm0, xmm2
pmaddubsw xmm0, xmm5
@@ -501,8 +381,8 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
psrlw xmm0, 2
packuswb xmm0, xmm0
movq qword ptr [edx + 8], xmm0
- movdqa xmm0, [eax + 16] // pixels 16..23
- movdqa xmm1, [eax + esi + 16]
+ movdqu xmm0, [eax + 16] // pixels 16..23
+ movdqu xmm1, [eax + esi + 16]
lea eax, [eax + 32]
pavgb xmm0, xmm1
pshufb xmm0, xmm4
@@ -511,9 +391,9 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
paddsw xmm0, xmm7
psrlw xmm0, 2
packuswb xmm0, xmm0
- sub ecx, 24
movq qword ptr [edx + 16], xmm0
lea edx, [edx + 24]
+ sub ecx, 24
jg wloop
pop esi
@@ -540,10 +420,9 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
movdqa xmm6, kMadd11
movdqa xmm7, kRound34
- align 4
wloop:
- movdqa xmm0, [eax] // pixels 0..7
- movdqa xmm1, [eax + esi]
+ movdqu xmm0, [eax] // pixels 0..7
+ movdqu xmm1, [eax + esi]
pavgb xmm1, xmm0
pavgb xmm0, xmm1
pshufb xmm0, xmm2
@@ -562,8 +441,8 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
psrlw xmm0, 2
packuswb xmm0, xmm0
movq qword ptr [edx + 8], xmm0
- movdqa xmm0, [eax + 16] // pixels 16..23
- movdqa xmm1, [eax + esi + 16]
+ movdqu xmm0, [eax + 16] // pixels 16..23
+ movdqu xmm1, [eax + esi + 16]
lea eax, [eax + 32]
pavgb xmm1, xmm0
pavgb xmm0, xmm1
@@ -573,9 +452,9 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
paddsw xmm0, xmm7
psrlw xmm0, 2
packuswb xmm0, xmm0
- sub ecx, 24
movq qword ptr [edx + 16], xmm0
lea edx, [edx+24]
+ sub ecx, 24
jg wloop
pop esi
@@ -597,20 +476,19 @@ void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
movdqa xmm4, kShuf38a
movdqa xmm5, kShuf38b
- align 4
xloop:
- movdqa xmm0, [eax] // 16 pixels -> 0,1,2,3,4,5
- movdqa xmm1, [eax + 16] // 16 pixels -> 6,7,8,9,10,11
+ movdqu xmm0, [eax] // 16 pixels -> 0,1,2,3,4,5
+ movdqu xmm1, [eax + 16] // 16 pixels -> 6,7,8,9,10,11
lea eax, [eax + 32]
pshufb xmm0, xmm4
pshufb xmm1, xmm5
paddusb xmm0, xmm1
- sub ecx, 12
movq qword ptr [edx], xmm0 // write 12 pixels
movhlps xmm1, xmm0
movd [edx + 8], xmm1
lea edx, [edx + 12]
+ sub ecx, 12
jg xloop
ret
@@ -633,10 +511,9 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
movdqa xmm4, kScaleAc33
pxor xmm5, xmm5
- align 4
xloop:
- movdqa xmm0, [eax] // sum up 3 rows into xmm0/1
- movdqa xmm6, [eax + esi]
+ movdqu xmm0, [eax] // sum up 3 rows into xmm0/1
+ movdqu xmm6, [eax + esi]
movhlps xmm1, xmm0
movhlps xmm7, xmm6
punpcklbw xmm0, xmm5
@@ -645,7 +522,7 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
punpcklbw xmm7, xmm5
paddusw xmm0, xmm6
paddusw xmm1, xmm7
- movdqa xmm6, [eax + esi * 2]
+ movdqu xmm6, [eax + esi * 2]
lea eax, [eax + 16]
movhlps xmm7, xmm6
punpcklbw xmm6, xmm5
@@ -671,11 +548,11 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
pmulhuw xmm6, xmm4 // divide by 9,9,6, 9,9,6
packuswb xmm6, xmm6
- sub ecx, 6
movd [edx], xmm6 // write 6 pixels
psrlq xmm6, 16
movd [edx + 2], xmm6
lea edx, [edx + 6]
+ sub ecx, 6
jg xloop
pop esi
@@ -699,11 +576,11 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
movdqa xmm4, kShufAb2
movdqa xmm5, kScaleAb2
- align 4
xloop:
- movdqa xmm0, [eax] // average 2 rows into xmm0
- pavgb xmm0, [eax + esi]
+ movdqu xmm0, [eax] // average 2 rows into xmm0
+ movdqu xmm1, [eax + esi]
lea eax, [eax + 16]
+ pavgb xmm0, xmm1
movdqa xmm1, xmm0 // 16 pixels -> 0,1,2,3,4,5 of xmm1
pshufb xmm1, xmm2
@@ -716,11 +593,11 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
pmulhuw xmm1, xmm5 // divide by 3,3,2, 3,3,2
packuswb xmm1, xmm1
- sub ecx, 6
movd [edx], xmm1 // write 6 pixels
psrlq xmm1, 16
movd [edx + 2], xmm1
lea edx, [edx + 6]
+ sub ecx, 6
jg xloop
pop esi
@@ -747,10 +624,9 @@ void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
pxor xmm4, xmm4
dec ebx
- align 4
xloop:
// first row
- movdqa xmm0, [esi]
+ movdqu xmm0, [esi]
lea eax, [esi + edx]
movdqa xmm1, xmm0
punpcklbw xmm0, xmm4
@@ -761,9 +637,8 @@ void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
je ydone
// sum remaining rows
- align 4
yloop:
- movdqa xmm2, [eax] // read 16 pixels
+ movdqu xmm2, [eax] // read 16 pixels
lea eax, [eax + edx] // advance to next row
movdqa xmm3, xmm2
punpcklbw xmm2, xmm4
@@ -773,10 +648,9 @@ void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
sub ebp, 1
jg yloop
- align 4
ydone:
- movdqa [edi], xmm0
- movdqa [edi + 16], xmm1
+ movdqu [edi], xmm0
+ movdqu [edi + 16], xmm1
lea edi, [edi + 32]
sub ecx, 16
@@ -828,7 +702,6 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
pextrw edx, xmm2, 3 // get x1 integer. preroll
// 2 Pixel loop.
- align 4
xloop2:
movdqa xmm1, xmm2 // x0, x1 fractions.
paddd xmm2, xmm3 // x += dx
@@ -851,7 +724,6 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
sub ecx, 2 // 2 pixels
jge xloop2
- align 4
xloop29:
add ecx, 2 - 1
@@ -869,7 +741,6 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
movd ebx, xmm0
mov [edi], bl
- align 4
xloop99:
pop edi
@@ -889,17 +760,16 @@ void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
mov eax, [esp + 8] // src_ptr
mov ecx, [esp + 12] // dst_width
- align 4
wloop:
- movdqa xmm0, [eax]
+ movdqu xmm0, [eax]
lea eax, [eax + 16]
movdqa xmm1, xmm0
punpcklbw xmm0, xmm0
punpckhbw xmm1, xmm1
- sub ecx, 32
- movdqa [edx], xmm0
- movdqa [edx + 16], xmm1
+ movdqu [edx], xmm0
+ movdqu [edx + 16], xmm1
lea edx, [edx + 32]
+ sub ecx, 32
jg wloop
ret
@@ -918,15 +788,14 @@ void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
mov edx, [esp + 12] // dst_argb
mov ecx, [esp + 16] // dst_width
- align 4
wloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
lea eax, [eax + 32]
shufps xmm0, xmm1, 0xdd
- sub ecx, 4
- movdqa [edx], xmm0
+ movdqu [edx], xmm0
lea edx, [edx + 16]
+ sub ecx, 4
jg wloop
ret
@@ -945,18 +814,17 @@ void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
mov edx, [esp + 12] // dst_argb
mov ecx, [esp + 16] // dst_width
- align 4
wloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
lea eax, [eax + 32]
movdqa xmm2, xmm0
shufps xmm0, xmm1, 0x88 // even pixels
shufps xmm2, xmm1, 0xdd // odd pixels
pavgb xmm0, xmm2
- sub ecx, 4
- movdqa [edx], xmm0
+ movdqu [edx], xmm0
lea edx, [edx + 16]
+ sub ecx, 4
jg wloop
ret
@@ -976,12 +844,11 @@ void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
mov edx, [esp + 4 + 12] // dst_argb
mov ecx, [esp + 4 + 16] // dst_width
- align 4
wloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- movdqa xmm2, [eax + esi]
- movdqa xmm3, [eax + esi + 16]
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + esi]
+ movdqu xmm3, [eax + esi + 16]
lea eax, [eax + 32]
pavgb xmm0, xmm2 // average rows
pavgb xmm1, xmm3
@@ -989,9 +856,9 @@ void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
shufps xmm0, xmm1, 0x88 // even pixels
shufps xmm2, xmm1, 0xdd // odd pixels
pavgb xmm0, xmm2
- sub ecx, 4
- movdqa [edx], xmm0
+ movdqu [edx], xmm0
lea edx, [edx + 16]
+ sub ecx, 4
jg wloop
pop esi
@@ -1016,7 +883,6 @@ void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
lea ebx, [ebx * 4]
lea edi, [ebx + ebx * 2]
- align 4
wloop:
movd xmm0, [eax]
movd xmm1, [eax + ebx]
@@ -1026,9 +892,9 @@ void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
lea eax, [eax + ebx * 4]
punpckldq xmm2, xmm3
punpcklqdq xmm0, xmm2
- sub ecx, 4
- movdqa [edx], xmm0
+ movdqu [edx], xmm0
lea edx, [edx + 16]
+ sub ecx, 4
jg wloop
pop edi
@@ -1057,7 +923,6 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
lea ebx, [ebx * 4]
lea edi, [ebx + ebx * 2]
- align 4
wloop:
movq xmm0, qword ptr [eax] // row0 4 pairs
movhps xmm0, qword ptr [eax + ebx]
@@ -1075,9 +940,9 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
shufps xmm0, xmm1, 0x88 // even pixels
shufps xmm2, xmm1, 0xdd // odd pixels
pavgb xmm0, xmm2
- sub ecx, 4
- movdqa [edx], xmm0
+ movdqu [edx], xmm0
lea edx, [edx + 16]
+ sub ecx, 4
jg wloop
pop edi
@@ -1118,7 +983,6 @@ void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
jl xloop49
// 4 Pixel loop.
- align 4
xloop4:
movd xmm0, [esi + eax * 4] // 1 source x0 pixels
movd xmm1, [esi + edx * 4] // 1 source x1 pixels
@@ -1133,12 +997,11 @@ void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
pextrw edx, xmm2, 3 // get x1 integer. next iteration.
punpckldq xmm1, xmm4 // x2 x3
punpcklqdq xmm0, xmm1 // x0 x1 x2 x3
- sub ecx, 4 // 4 pixels
movdqu [edi], xmm0
lea edi, [edi + 16]
+ sub ecx, 4 // 4 pixels
jge xloop4
- align 4
xloop49:
test ecx, 2
je xloop29
@@ -1159,7 +1022,6 @@ void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
// 1 Pixels.
movd xmm0, [esi + eax * 4] // 1 source x2 pixels
movd dword ptr [edi], xmm0
- align 4
xloop99:
pop esi
@@ -1209,7 +1071,6 @@ void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
pextrw edx, xmm2, 3 // get x1 integer. preroll
// 2 Pixel loop.
- align 4
xloop2:
movdqa xmm1, xmm2 // x0, x1 fractions.
paddd xmm2, xmm3 // x += dx
@@ -1229,7 +1090,6 @@ void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
sub ecx, 2 // 2 pixels
jge xloop2
- align 4
xloop29:
add ecx, 2 - 1
@@ -1246,7 +1106,6 @@ void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
packuswb xmm0, xmm0 // argb 8 bits, 1 pixel.
movd [edi], xmm0
- align 4
xloop99:
pop edi
@@ -1265,17 +1124,16 @@ void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
mov eax, [esp + 8] // src_argb
mov ecx, [esp + 12] // dst_width
- align 4
wloop:
- movdqa xmm0, [eax]
+ movdqu xmm0, [eax]
lea eax, [eax + 16]
movdqa xmm1, xmm0
punpckldq xmm0, xmm0
punpckhdq xmm1, xmm1
- sub ecx, 8
- movdqa [edx], xmm0
- movdqa [edx + 16], xmm1
+ movdqu [edx], xmm0
+ movdqu [edx + 16], xmm1
lea edx, [edx + 32]
+ sub ecx, 8
jg wloop
ret
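
Two more edits recur through the scale_win.cc hunks: the align 4 directives ahead of the loop labels are deleted, and each sub ecx, N moves from before the store to immediately before jg. The sub is what sets the flags jg tests, so pairing them keeps the flag producer and consumer adjacent (and allows sub/jcc macro-fusion on modern Intel cores). A scalar C sketch of the resulting loop shape, using ScaleRowDown2's keep-the-odd-pixels behavior; the helper name is hypothetical and dst_width is assumed a multiple of 16:

    #include <stdint.h>

    static void scale_row_down2(const uint8_t *src, uint8_t *dst,
                                int dst_width) {
      do {
        int i;
        for (i = 0; i < 16; ++i)
          dst[i] = src[2 * i + 1];  /* psrlw 8 + packuswb keep odd bytes */
        src += 32;
        dst += 16;
        dst_width -= 16;            /* sub ecx, 16 sets the flags...     */
      } while (dst_width > 0);      /* ...that jg wloop consumes         */
    }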
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/video_common.cc b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/video_common.cc
index efbedf46e2b..379a0669ae6 100644
--- a/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/video_common.cc
+++ b/chromium/third_party/libvpx/source/libvpx/third_party/libyuv/source/video_common.cc
@@ -33,7 +33,7 @@ static const struct FourCCAliasEntry kFourCCAliases[] = {
{FOURCC_2VUY, FOURCC_UYVY}, // kCMPixelFormat_422YpCbCr8
{FOURCC_JPEG, FOURCC_MJPG}, // Note: JPEG has DHT while MJPG does not.
{FOURCC_DMB1, FOURCC_MJPG},
- {FOURCC_BA81, FOURCC_BGGR},
+ {FOURCC_BA81, FOURCC_BGGR}, // deprecated.
{FOURCC_RGB3, FOURCC_RAW },
{FOURCC_BGR3, FOURCC_24BG},
{FOURCC_CM32, FOURCC_BGRA}, // kCMPixelFormat_32ARGB
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/x86inc/README.libvpx b/chromium/third_party/libvpx/source/libvpx/third_party/x86inc/README.libvpx
index 02cd9ab4edc..343bcf9624b 100644
--- a/chromium/third_party/libvpx/source/libvpx/third_party/x86inc/README.libvpx
+++ b/chromium/third_party/libvpx/source/libvpx/third_party/x86inc/README.libvpx
@@ -9,3 +9,4 @@ defines that help automatically allow assembly to work cross-platform.
Local Modifications:
Some modifications to allow PIC to work with x86inc.
+Conditionally define program_name to allow overriding.
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/x86inc/x86inc.asm b/chromium/third_party/libvpx/source/libvpx/third_party/x86inc/x86inc.asm
index 99453a99854..bc8116995dd 100644
--- a/chromium/third_party/libvpx/source/libvpx/third_party/x86inc/x86inc.asm
+++ b/chromium/third_party/libvpx/source/libvpx/third_party/x86inc/x86inc.asm
@@ -36,7 +36,9 @@
%include "vpx_config.asm"
+%ifndef program_name
%define program_name vp9
+%endif
%define UNIX64 0
@@ -78,6 +80,9 @@
%macro SECTION_RODATA 0-1 16
%ifidn __OUTPUT_FORMAT__,macho64
SECTION .text align=%1
+ %elifidn __OUTPUT_FORMAT__,macho32
+ SECTION .text align=%1
+ fakegot:
%elifidn __OUTPUT_FORMAT__,macho
SECTION .text align=%1
fakegot:
@@ -617,9 +622,17 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
%elifidn __OUTPUT_FORMAT__,elf64
global %1:function hidden
%elifidn __OUTPUT_FORMAT__,macho32
- global %1:private_extern
+ %ifdef __NASM_VER__
+ global %1
+ %else
+ global %1:private_extern
+ %endif
%elifidn __OUTPUT_FORMAT__,macho64
- global %1:private_extern
+ %ifdef __NASM_VER__
+ global %1
+ %else
+ global %1:private_extern
+ %endif
%else
global %1
%endif
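
The x86inc.asm changes are build plumbing: wrapping %define program_name vp9 in %ifndef lets an embedder pre-define the symbol prefix (for example, %define program_name vpx before the include, as the README note above advertises) instead of hard-coding vp9; the new macho32 branch gives 32-bit Mach-O the same .text-with-fakegot RODATA placement the plain macho target already had; and the __NASM_VER__ checks fall back to a bare global because nasm, unlike yasm, does not accept the global sym:private_extern form.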
diff --git a/chromium/third_party/libvpx/source/libvpx/usage.dox b/chromium/third_party/libvpx/source/libvpx/usage.dox
index 237b8dc42bf..88235202d17 100644
--- a/chromium/third_party/libvpx/source/libvpx/usage.dox
+++ b/chromium/third_party/libvpx/source/libvpx/usage.dox
@@ -12,13 +12,13 @@
- \ref usage_init
- \ref usage_errors
- Fore more information on decoder and encoder specific usage, see the
+ For more information on decoder and encoder specific usage, see the
following pages:
\if decoder
- - \subpage usage_decode
+ \li \subpage usage_decode
\endif
- \if decoder
- - \subpage usage_encode
+ \if encoder
+ \li \subpage usage_encode
\endif
\section usage_types Important Data Types
@@ -80,10 +80,13 @@
The available initialization methods are:
- \if encoder - #vpx_codec_enc_init (calls vpx_codec_enc_init_ver()) \endif
- \if multi-encoder - #vpx_codec_enc_init_multi (calls vpx_codec_enc_init_multi_ver()) \endif
- \if decoder - #vpx_codec_dec_init (calls vpx_codec_dec_init_ver()) \endif
-
+ \if encoder
+ \li #vpx_codec_enc_init (calls vpx_codec_enc_init_ver())
+ \li #vpx_codec_enc_init_multi (calls vpx_codec_enc_init_multi_ver())
+ \endif
+ \if decoder
+ \li #vpx_codec_dec_init (calls vpx_codec_dec_init_ver())
+ \endif
\section usage_errors Error Handling
diff --git a/chromium/third_party/libvpx/source/libvpx/usage_cx.dox b/chromium/third_party/libvpx/source/libvpx/usage_cx.dox
index 62f3e450b0e..92b0d34ef4d 100644
--- a/chromium/third_party/libvpx/source/libvpx/usage_cx.dox
+++ b/chromium/third_party/libvpx/source/libvpx/usage_cx.dox
@@ -1,4 +1,4 @@
-/*! \page usage_encode Encode
+/*! \page usage_encode Encoding
The vpx_codec_encode() function is at the core of the encode loop. It
processes raw images passed by the application, producing packets of
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/alloccommon.c b/chromium/third_party/libvpx/source/libvpx/vp8/common/alloccommon.c
index 54afc13355a..b9d875a2ff7 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/alloccommon.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/alloccommon.c
@@ -103,9 +103,9 @@ int vp8_alloc_frame_buffers(VP8_COMMON *oci, int width, int height)
goto allocation_fail;
oci->post_proc_buffer_int_used = 0;
- vpx_memset(&oci->postproc_state, 0, sizeof(oci->postproc_state));
- vpx_memset(oci->post_proc_buffer.buffer_alloc, 128,
- oci->post_proc_buffer.frame_size);
+ memset(&oci->postproc_state, 0, sizeof(oci->postproc_state));
+ memset(oci->post_proc_buffer.buffer_alloc, 128,
+ oci->post_proc_buffer.frame_size);
/* Allocate buffer to store post-processing filter coefficients.
*
@@ -176,7 +176,7 @@ void vp8_create_common(VP8_COMMON *oci)
oci->clamp_type = RECON_CLAMP_REQUIRED;
/* Initialize reference frame sign bias structure to defaults */
- vpx_memset(oci->ref_frame_sign_bias, 0, sizeof(oci->ref_frame_sign_bias));
+ memset(oci->ref_frame_sign_bias, 0, sizeof(oci->ref_frame_sign_bias));
/* Default disable buffer to buffer copying */
oci->copy_buffer_to_gf = 0;
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/armv6/dequant_idct_v6.asm b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/armv6/dequant_idct_v6.asm
index 2510ad83835..db48ded5827 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/armv6/dequant_idct_v6.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/armv6/dequant_idct_v6.asm
@@ -165,7 +165,7 @@ vp8_dequant_idct_loop2_v6
str r1, [r2], r12 ; store output to dst
bne vp8_dequant_idct_loop2_v6
-; vpx_memset
+; memset
sub r0, r0, #32
add sp, sp, #4
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/filter_arm.c b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/filter_arm.c
index 7fe39674eb6..d6a6781d862 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/filter_arm.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/filter_arm.c
@@ -99,7 +99,7 @@ void vp8_sixtap_predict4x4_armv6
{
const short *HFilter;
const short *VFilter;
- DECLARE_ALIGNED_ARRAY(4, short, FData, 12*4); /* Temp data buffer used in filtering */
+ DECLARE_ALIGNED(4, short, FData[12*4]); /* Temp data buffer used in filtering */
HFilter = vp8_sub_pel_filters[xoffset]; /* 6 tap */
@@ -147,7 +147,7 @@ void vp8_sixtap_predict8x8_armv6
{
const short *HFilter;
const short *VFilter;
- DECLARE_ALIGNED_ARRAY(4, short, FData, 16*8); /* Temp data buffer used in filtering */
+ DECLARE_ALIGNED(4, short, FData[16*8]); /* Temp data buffer used in filtering */
HFilter = vp8_sub_pel_filters[xoffset]; /* 6 tap */
VFilter = vp8_sub_pel_filters[yoffset]; /* 6 tap */
@@ -189,7 +189,7 @@ void vp8_sixtap_predict16x16_armv6
{
const short *HFilter;
const short *VFilter;
- DECLARE_ALIGNED_ARRAY(4, short, FData, 24*16); /* Temp data buffer used in filtering */
+ DECLARE_ALIGNED(4, short, FData[24*16]); /* Temp data buffer used in filtering */
HFilter = vp8_sub_pel_filters[xoffset]; /* 6 tap */
VFilter = vp8_sub_pel_filters[yoffset]; /* 6 tap */
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/sad_neon.c b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/sad_neon.c
deleted file mode 100644
index 6595ac0519b..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/sad_neon.c
+++ /dev/null
@@ -1,184 +0,0 @@
-/*
- * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <arm_neon.h>
-
-unsigned int vp8_sad8x8_neon(
- unsigned char *src_ptr,
- int src_stride,
- unsigned char *ref_ptr,
- int ref_stride) {
- uint8x8_t d0, d8;
- uint16x8_t q12;
- uint32x4_t q1;
- uint64x2_t q3;
- uint32x2_t d5;
- int i;
-
- d0 = vld1_u8(src_ptr);
- src_ptr += src_stride;
- d8 = vld1_u8(ref_ptr);
- ref_ptr += ref_stride;
- q12 = vabdl_u8(d0, d8);
-
- for (i = 0; i < 7; i++) {
- d0 = vld1_u8(src_ptr);
- src_ptr += src_stride;
- d8 = vld1_u8(ref_ptr);
- ref_ptr += ref_stride;
- q12 = vabal_u8(q12, d0, d8);
- }
-
- q1 = vpaddlq_u16(q12);
- q3 = vpaddlq_u32(q1);
- d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
- vreinterpret_u32_u64(vget_high_u64(q3)));
-
- return vget_lane_u32(d5, 0);
-}
-
-unsigned int vp8_sad8x16_neon(
- unsigned char *src_ptr,
- int src_stride,
- unsigned char *ref_ptr,
- int ref_stride) {
- uint8x8_t d0, d8;
- uint16x8_t q12;
- uint32x4_t q1;
- uint64x2_t q3;
- uint32x2_t d5;
- int i;
-
- d0 = vld1_u8(src_ptr);
- src_ptr += src_stride;
- d8 = vld1_u8(ref_ptr);
- ref_ptr += ref_stride;
- q12 = vabdl_u8(d0, d8);
-
- for (i = 0; i < 15; i++) {
- d0 = vld1_u8(src_ptr);
- src_ptr += src_stride;
- d8 = vld1_u8(ref_ptr);
- ref_ptr += ref_stride;
- q12 = vabal_u8(q12, d0, d8);
- }
-
- q1 = vpaddlq_u16(q12);
- q3 = vpaddlq_u32(q1);
- d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
- vreinterpret_u32_u64(vget_high_u64(q3)));
-
- return vget_lane_u32(d5, 0);
-}
-
-unsigned int vp8_sad4x4_neon(
- unsigned char *src_ptr,
- int src_stride,
- unsigned char *ref_ptr,
- int ref_stride) {
- uint8x8_t d0, d8;
- uint16x8_t q12;
- uint32x2_t d1;
- uint64x1_t d3;
- int i;
-
- d0 = vld1_u8(src_ptr);
- src_ptr += src_stride;
- d8 = vld1_u8(ref_ptr);
- ref_ptr += ref_stride;
- q12 = vabdl_u8(d0, d8);
-
- for (i = 0; i < 3; i++) {
- d0 = vld1_u8(src_ptr);
- src_ptr += src_stride;
- d8 = vld1_u8(ref_ptr);
- ref_ptr += ref_stride;
- q12 = vabal_u8(q12, d0, d8);
- }
-
- d1 = vpaddl_u16(vget_low_u16(q12));
- d3 = vpaddl_u32(d1);
-
- return vget_lane_u32(vreinterpret_u32_u64(d3), 0);
-}
-
-unsigned int vp8_sad16x16_neon(
- unsigned char *src_ptr,
- int src_stride,
- unsigned char *ref_ptr,
- int ref_stride) {
- uint8x16_t q0, q4;
- uint16x8_t q12, q13;
- uint32x4_t q1;
- uint64x2_t q3;
- uint32x2_t d5;
- int i;
-
- q0 = vld1q_u8(src_ptr);
- src_ptr += src_stride;
- q4 = vld1q_u8(ref_ptr);
- ref_ptr += ref_stride;
- q12 = vabdl_u8(vget_low_u8(q0), vget_low_u8(q4));
- q13 = vabdl_u8(vget_high_u8(q0), vget_high_u8(q4));
-
- for (i = 0; i < 15; i++) {
- q0 = vld1q_u8(src_ptr);
- src_ptr += src_stride;
- q4 = vld1q_u8(ref_ptr);
- ref_ptr += ref_stride;
- q12 = vabal_u8(q12, vget_low_u8(q0), vget_low_u8(q4));
- q13 = vabal_u8(q13, vget_high_u8(q0), vget_high_u8(q4));
- }
-
- q12 = vaddq_u16(q12, q13);
- q1 = vpaddlq_u16(q12);
- q3 = vpaddlq_u32(q1);
- d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
- vreinterpret_u32_u64(vget_high_u64(q3)));
-
- return vget_lane_u32(d5, 0);
-}
-
-unsigned int vp8_sad16x8_neon(
- unsigned char *src_ptr,
- int src_stride,
- unsigned char *ref_ptr,
- int ref_stride) {
- uint8x16_t q0, q4;
- uint16x8_t q12, q13;
- uint32x4_t q1;
- uint64x2_t q3;
- uint32x2_t d5;
- int i;
-
- q0 = vld1q_u8(src_ptr);
- src_ptr += src_stride;
- q4 = vld1q_u8(ref_ptr);
- ref_ptr += ref_stride;
- q12 = vabdl_u8(vget_low_u8(q0), vget_low_u8(q4));
- q13 = vabdl_u8(vget_high_u8(q0), vget_high_u8(q4));
-
- for (i = 0; i < 7; i++) {
- q0 = vld1q_u8(src_ptr);
- src_ptr += src_stride;
- q4 = vld1q_u8(ref_ptr);
- ref_ptr += ref_stride;
- q12 = vabal_u8(q12, vget_low_u8(q0), vget_low_u8(q4));
- q13 = vabal_u8(q13, vget_high_u8(q0), vget_high_u8(q4));
- }
-
- q12 = vaddq_u16(q12, q13);
- q1 = vpaddlq_u16(q12);
- q3 = vpaddlq_u32(q1);
- d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
- vreinterpret_u32_u64(vget_high_u64(q3)));
-
- return vget_lane_u32(d5, 0);
-}
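
All five deleted vp8_sad*_neon kernels follow one template: widen-and-accumulate absolute differences row by row (vabdl_u8, then vabal_u8 in the loop), then pairwise-add the lanes down to a single 32-bit total. The patch retires these VP8-local copies; the mfqe.c hunk further below moves the remaining callers onto the shared vpx_sad* kernels. A scalar reference, not from the tree, for what all of them compute:

    #include <stdlib.h>

    /* Sum of absolute differences over a width x height block. */
    static unsigned int sad_block(const unsigned char *src, int src_stride,
                                  const unsigned char *ref, int ref_stride,
                                  int width, int height) {
      unsigned int total = 0;
      int r, c;
      for (r = 0; r < height; ++r) {
        for (c = 0; c < width; ++c)
          total += abs(src[c] - ref[c]);
        src += src_stride;
        ref += ref_stride;
      }
      return total;
    }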
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/vp8_subpixelvariance_neon.c b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/vp8_subpixelvariance_neon.c
index 8308d555b37..974d3b6532b 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/vp8_subpixelvariance_neon.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/neon/vp8_subpixelvariance_neon.c
@@ -32,7 +32,7 @@ unsigned int vp8_sub_pixel_variance16x16_neon_func(
int dst_pixels_per_line,
unsigned int *sse) {
int i;
- DECLARE_ALIGNED_ARRAY(16, unsigned char, tmp, 528);
+ DECLARE_ALIGNED(16, unsigned char, tmp[528]);
unsigned char *tmpp;
unsigned char *tmpp2;
uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8;
@@ -911,12 +911,6 @@ unsigned int vp8_variance_halfpixvar16x16_hv_neon(
return vget_lane_u32(d0u32, 0);
}
-enum { kWidth8 = 8 };
-enum { kHeight8 = 8 };
-enum { kHeight8PlusOne = 9 };
-enum { kPixelStepOne = 1 };
-enum { kAlign16 = 16 };
-
#define FILTER_BITS 7
static INLINE int horizontal_add_s16x8(const int16x8_t v_16x8) {
@@ -968,8 +962,8 @@ static unsigned int variance8x8_neon(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
unsigned int *sse) {
int sum;
- variance_neon_w8(a, a_stride, b, b_stride, kWidth8, kHeight8, sse, &sum);
- return *sse - (((int64_t)sum * sum) / (kWidth8 * kHeight8));
+ variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, &sum);
+ return *sse - (((int64_t)sum * sum) / (8 * 8));
}
static void var_filter_block2d_bil_w8(const uint8_t *src_ptr,
@@ -1003,22 +997,21 @@ unsigned int vp8_sub_pixel_variance8x8_neon(
const unsigned char *dst,
int dst_stride,
unsigned int *sse) {
- DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, temp2, kHeight8 * kWidth8);
- DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, fdata3, kHeight8PlusOne * kWidth8);
+ DECLARE_ALIGNED(16, uint8_t, temp2[9 * 8]);
+ DECLARE_ALIGNED(16, uint8_t, fdata3[9 * 8]);
if (xoffset == 0) {
- var_filter_block2d_bil_w8(src, temp2, src_stride, kWidth8, kHeight8,
- kWidth8, bilinear_taps_coeff[yoffset]);
+ var_filter_block2d_bil_w8(src, temp2, src_stride, 8, 8,
+ 8, bilinear_taps_coeff[yoffset]);
} else if (yoffset == 0) {
- var_filter_block2d_bil_w8(src, temp2, src_stride, kPixelStepOne,
- kHeight8PlusOne, kWidth8,
+ var_filter_block2d_bil_w8(src, temp2, src_stride, 1,
+ 9, 8,
bilinear_taps_coeff[xoffset]);
} else {
- var_filter_block2d_bil_w8(src, fdata3, src_stride, kPixelStepOne,
- kHeight8PlusOne, kWidth8,
+ var_filter_block2d_bil_w8(src, fdata3, src_stride, 1,
+ 9, 8,
bilinear_taps_coeff[xoffset]);
- var_filter_block2d_bil_w8(fdata3, temp2, kWidth8, kWidth8, kHeight8,
- kWidth8, bilinear_taps_coeff[yoffset]);
+ var_filter_block2d_bil_w8(fdata3, temp2, 8, 8, 8,
+ 8, bilinear_taps_coeff[yoffset]);
}
- return variance8x8_neon(temp2, kWidth8, dst, dst_stride, sse);
+ return variance8x8_neon(temp2, 8, dst, dst_stride, sse);
}
-
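
The vp8_subpixelvariance_neon.c hunks swap DECLARE_ALIGNED_ARRAY for DECLARE_ALIGNED and inline the kWidth8/kHeight8-style named constants as plain 8, 9, 1 and 16; the arithmetic itself is untouched. variance8x8_neon still applies the standard identity, sketched here, not from the tree:

    #include <stdint.h>

    /* variance = SSE - sum^2 / N over the N = w*h pixels of a block;
     * the int64_t cast keeps sum*sum from overflowing. */
    static unsigned int variance_of_block(unsigned int sse, int sum,
                                          int w, int h) {
      return sse - (unsigned int)(((int64_t)sum * sum) / (w * h));
    }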
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/blockd.h b/chromium/third_party/libvpx/source/libvpx/vp8/common/blockd.h
index ea1a6a4adfd..192108a06db 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/blockd.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/blockd.h
@@ -187,8 +187,12 @@ typedef struct
{
FRAME_TYPE frame_type;
int is_frame_dropped;
+ // The frame rate for the lowest resolution.
+ double low_res_framerate;
/* The frame number of each reference frames */
unsigned int low_res_ref_frames[MAX_REF_FRAMES];
+ // The video frame counter value for the key frame, for lowest resolution.
+ unsigned int key_frame_counter_value;
LOWER_RES_MB_INFO *mb_info;
} LOWER_RES_FRAME_INFO;
#endif
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/common.h b/chromium/third_party/libvpx/source/libvpx/vp8/common/common.h
index 17262d6983c..ba3d9f54d1e 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/common.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/common.h
@@ -29,19 +29,19 @@ extern "C" {
#define vp8_copy( Dest, Src) { \
assert( sizeof( Dest) == sizeof( Src)); \
- vpx_memcpy( Dest, Src, sizeof( Src)); \
+ memcpy( Dest, Src, sizeof( Src)); \
}
/* Use this for variably-sized arrays. */
#define vp8_copy_array( Dest, Src, N) { \
assert( sizeof( *Dest) == sizeof( *Src)); \
- vpx_memcpy( Dest, Src, N * sizeof( *Src)); \
+ memcpy( Dest, Src, N * sizeof( *Src)); \
}
-#define vp8_zero( Dest) vpx_memset( &Dest, 0, sizeof( Dest));
+#define vp8_zero( Dest) memset( &Dest, 0, sizeof( Dest));
-#define vp8_zero_array( Dest, N) vpx_memset( Dest, 0, N * sizeof( *Dest));
+#define vp8_zero_array( Dest, N) memset( Dest, 0, N * sizeof( *Dest));
#ifdef __cplusplus
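
The vpx_memcpy/vpx_memset wrappers removed throughout these vp8/common hunks were direct aliases for the libc routines, so vp8_copy and friends keep their behavior, including the whole-object sizeof assert. A small stand-in with a hypothetical name, showing what that assert buys:

    #include <assert.h>
    #include <string.h>

    #define copy_whole(Dest, Src) do { \
        assert(sizeof(Dest) == sizeof(Src)); /* whole-array sizes match */ \
        memcpy(Dest, Src, sizeof(Src)); \
      } while (0)

    static void demo(void) {
      int prev[16] = {0}, curr[16];
      copy_whole(curr, prev);  /* 64 bytes; passing a pointer would assert */
    }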
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/copy_c.c b/chromium/third_party/libvpx/source/libvpx/vp8/common/copy_c.c
new file mode 100644
index 00000000000..fd96c863491
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/copy_c.c
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <string.h>
+
+#include "vpx_config.h"
+#include "vpx/vpx_integer.h"
+
+/* Copy 2 macroblocks to a buffer */
+void vp8_copy32xn_c(unsigned char *src_ptr, int src_stride,
+ unsigned char *dst_ptr, int dst_stride,
+ int height)
+{
+ int r;
+
+ for (r = 0; r < height; r++)
+ {
+ memcpy(dst_ptr, src_ptr, 32);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+
+ }
+}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/debugmodes.c b/chromium/third_party/libvpx/source/libvpx/vp8/common/debugmodes.c
index 46064e61d53..159fddc6a76 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/debugmodes.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/debugmodes.c
@@ -81,7 +81,6 @@ void vp8_print_modes_and_motion_vectors(MODE_INFO *mi, int rows, int cols, int f
fprintf(mvs, "\n");
/* print out the block modes */
- mb_index = 0;
fprintf(mvs, "Mbs for Frame %d\n", frame);
{
int b_row;
@@ -129,7 +128,6 @@ void vp8_print_modes_and_motion_vectors(MODE_INFO *mi, int rows, int cols, int f
/* print out the block modes */
- mb_index = 0;
fprintf(mvs, "MVs for Frame %d\n", frame);
{
int b_row;
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/dequantize.c b/chromium/third_party/libvpx/source/libvpx/vp8/common/dequantize.c
index 6e2f69a773e..f8b04fa4ee5 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/dequantize.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/dequantize.c
@@ -38,6 +38,6 @@ void vp8_dequant_idct_add_c(short *input, short *dq,
vp8_short_idct4x4llm_c(input, dest, stride, dest, stride);
- vpx_memset(input, 0, 32);
+ memset(input, 0, 32);
}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/entropy.c b/chromium/third_party/libvpx/source/libvpx/vp8/common/entropy.c
index 8c046a4f57c..c00e565f063 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/entropy.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/entropy.c
@@ -183,7 +183,6 @@ const vp8_extra_bit_struct vp8_extra_bits[12] =
void vp8_default_coef_probs(VP8_COMMON *pc)
{
- vpx_memcpy(pc->fc.coef_probs, default_coef_probs,
- sizeof(default_coef_probs));
+ memcpy(pc->fc.coef_probs, default_coef_probs, sizeof(default_coef_probs));
}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/entropymode.c b/chromium/third_party/libvpx/source/libvpx/vp8/common/entropymode.c
index 091e4c732b0..8981a8d3c2a 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/entropymode.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/entropymode.c
@@ -159,13 +159,13 @@ const vp8_tree_index vp8_small_mvtree [14] =
void vp8_init_mbmode_probs(VP8_COMMON *x)
{
- vpx_memcpy(x->fc.ymode_prob, vp8_ymode_prob, sizeof(vp8_ymode_prob));
- vpx_memcpy(x->fc.uv_mode_prob, vp8_uv_mode_prob, sizeof(vp8_uv_mode_prob));
- vpx_memcpy(x->fc.sub_mv_ref_prob, sub_mv_ref_prob, sizeof(sub_mv_ref_prob));
+ memcpy(x->fc.ymode_prob, vp8_ymode_prob, sizeof(vp8_ymode_prob));
+ memcpy(x->fc.uv_mode_prob, vp8_uv_mode_prob, sizeof(vp8_uv_mode_prob));
+ memcpy(x->fc.sub_mv_ref_prob, sub_mv_ref_prob, sizeof(sub_mv_ref_prob));
}
void vp8_default_bmode_probs(vp8_prob p [VP8_BINTRAMODES-1])
{
- vpx_memcpy(p, vp8_bmode_prob, sizeof(vp8_bmode_prob));
+ memcpy(p, vp8_bmode_prob, sizeof(vp8_bmode_prob));
}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/extend.c b/chromium/third_party/libvpx/source/libvpx/vp8/common/extend.c
index c9bdd21897d..2d938ad7825 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/extend.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/extend.c
@@ -40,9 +40,9 @@ static void copy_and_extend_plane
for (i = 0; i < h; i++)
{
- vpx_memset(dest_ptr1, src_ptr1[0], el);
- vpx_memcpy(dest_ptr1 + el, src_ptr1, w);
- vpx_memset(dest_ptr2, src_ptr2[0], er);
+ memset(dest_ptr1, src_ptr1[0], el);
+ memcpy(dest_ptr1 + el, src_ptr1, w);
+ memset(dest_ptr2, src_ptr2[0], er);
src_ptr1 += sp;
src_ptr2 += sp;
dest_ptr1 += dp;
@@ -60,13 +60,13 @@ static void copy_and_extend_plane
for (i = 0; i < et; i++)
{
- vpx_memcpy(dest_ptr1, src_ptr1, linesize);
+ memcpy(dest_ptr1, src_ptr1, linesize);
dest_ptr1 += dp;
}
for (i = 0; i < eb; i++)
{
- vpx_memcpy(dest_ptr2, src_ptr2, linesize);
+ memcpy(dest_ptr2, src_ptr2, linesize);
dest_ptr2 += dp;
}
}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/idct_blk.c b/chromium/third_party/libvpx/source/libvpx/vp8/common/idct_blk.c
index 65d5002c8bc..8aa7d9bf0ff 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/idct_blk.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/idct_blk.c
@@ -33,7 +33,7 @@ void vp8_dequant_idct_add_y_block_c
else
{
vp8_dc_only_idct_add_c (q[0]*dq[0], dst, stride, dst, stride);
- vpx_memset(q, 0, 2 * sizeof(q[0]));
+ memset(q, 0, 2 * sizeof(q[0]));
}
q += 16;
@@ -59,7 +59,7 @@ void vp8_dequant_idct_add_uv_block_c
else
{
vp8_dc_only_idct_add_c (q[0]*dq[0], dstu, stride, dstu, stride);
- vpx_memset(q, 0, 2 * sizeof(q[0]));
+ memset(q, 0, 2 * sizeof(q[0]));
}
q += 16;
@@ -78,7 +78,7 @@ void vp8_dequant_idct_add_uv_block_c
else
{
vp8_dc_only_idct_add_c (q[0]*dq[0], dstv, stride, dstv, stride);
- vpx_memset(q, 0, 2 * sizeof(q[0]));
+ memset(q, 0, 2 * sizeof(q[0]));
}
q += 16;
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/loopfilter.c b/chromium/third_party/libvpx/source/libvpx/vp8/common/loopfilter.c
index 7a07e76fc41..8b55dff92bf 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/loopfilter.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/loopfilter.c
@@ -82,11 +82,10 @@ void vp8_loop_filter_update_sharpness(loop_filter_info_n *lfi,
if (block_inside_limit < 1)
block_inside_limit = 1;
- vpx_memset(lfi->lim[i], block_inside_limit, SIMD_WIDTH);
- vpx_memset(lfi->blim[i], (2 * filt_lvl + block_inside_limit),
- SIMD_WIDTH);
- vpx_memset(lfi->mblim[i], (2 * (filt_lvl + 2) + block_inside_limit),
- SIMD_WIDTH);
+ memset(lfi->lim[i], block_inside_limit, SIMD_WIDTH);
+ memset(lfi->blim[i], (2 * filt_lvl + block_inside_limit), SIMD_WIDTH);
+ memset(lfi->mblim[i], (2 * (filt_lvl + 2) + block_inside_limit),
+ SIMD_WIDTH);
}
}
@@ -105,7 +104,7 @@ void vp8_loop_filter_init(VP8_COMMON *cm)
/* init hev threshold const vectors */
for(i = 0; i < 4 ; i++)
{
- vpx_memset(lfi->hev_thr[i], i, SIMD_WIDTH);
+ memset(lfi->hev_thr[i], i, SIMD_WIDTH);
}
}
@@ -151,7 +150,7 @@ void vp8_loop_filter_frame_init(VP8_COMMON *cm,
/* we could get rid of this if we assume that deltas are set to
* zero when not in use; encoder always uses deltas
*/
- vpx_memset(lfi->lvl[seg][0], lvl_seg, 4 * 4 );
+ memset(lfi->lvl[seg][0], lvl_seg, 4 * 4 );
continue;
}
@@ -261,6 +260,7 @@ void vp8_loop_filter_row_simple(VP8_COMMON *cm, MODE_INFO *mode_info_context,
int mb_col;
int filter_level;
loop_filter_info_n *lfi_n = &cm->lf_info;
+ (void)post_uvstride;
for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
{
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/mfqe.c b/chromium/third_party/libvpx/source/libvpx/vp8/common/mfqe.c
index 069332660e3..d12dea19364 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/mfqe.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/mfqe.c
@@ -17,10 +17,11 @@
* higher quality.
*/
-#include "postproc.h"
-#include "variance.h"
+#include "./vp8_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vp8/common/postproc.h"
+#include "vp8/common/variance.h"
#include "vpx_mem/vpx_mem.h"
-#include "vp8_rtcd.h"
#include "vpx_scale/yv12config.h"
#include <limits.h>
@@ -153,16 +154,16 @@ static void multiframe_quality_enhance_block
actd = (vp8_variance16x16(yd, yd_stride, VP8_ZEROS, 0, &sse)+128)>>8;
act = (vp8_variance16x16(y, y_stride, VP8_ZEROS, 0, &sse)+128)>>8;
#ifdef USE_SSD
- sad = (vp8_variance16x16(y, y_stride, yd, yd_stride, &sse));
+ vp8_variance16x16(y, y_stride, yd, yd_stride, &sse);
sad = (sse + 128)>>8;
- usad = (vp8_variance8x8(u, uv_stride, ud, uvd_stride, &sse));
+ vp8_variance8x8(u, uv_stride, ud, uvd_stride, &sse);
usad = (sse + 32)>>6;
- vsad = (vp8_variance8x8(v, uv_stride, vd, uvd_stride, &sse));
+ vp8_variance8x8(v, uv_stride, vd, uvd_stride, &sse);
vsad = (sse + 32)>>6;
#else
- sad = (vp8_sad16x16(y, y_stride, yd, yd_stride, UINT_MAX) + 128) >> 8;
- usad = (vp8_sad8x8(u, uv_stride, ud, uvd_stride, UINT_MAX) + 32) >> 6;
- vsad = (vp8_sad8x8(v, uv_stride, vd, uvd_stride, UINT_MAX)+ 32) >> 6;
+ sad = (vpx_sad16x16(y, y_stride, yd, yd_stride) + 128) >> 8;
+ usad = (vpx_sad8x8(u, uv_stride, ud, uvd_stride) + 32) >> 6;
+ vsad = (vpx_sad8x8(v, uv_stride, vd, uvd_stride)+ 32) >> 6;
#endif
}
else /* if (blksize == 8) */
@@ -170,16 +171,16 @@ static void multiframe_quality_enhance_block
actd = (vp8_variance8x8(yd, yd_stride, VP8_ZEROS, 0, &sse)+32)>>6;
act = (vp8_variance8x8(y, y_stride, VP8_ZEROS, 0, &sse)+32)>>6;
#ifdef USE_SSD
- sad = (vp8_variance8x8(y, y_stride, yd, yd_stride, &sse));
+ vp8_variance8x8(y, y_stride, yd, yd_stride, &sse);
sad = (sse + 32)>>6;
- usad = (vp8_variance4x4(u, uv_stride, ud, uvd_stride, &sse));
+ vp8_variance4x4(u, uv_stride, ud, uvd_stride, &sse);
usad = (sse + 8)>>4;
- vsad = (vp8_variance4x4(v, uv_stride, vd, uvd_stride, &sse));
+ vp8_variance4x4(v, uv_stride, vd, uvd_stride, &sse);
vsad = (sse + 8)>>4;
#else
- sad = (vp8_sad8x8(y, y_stride, yd, yd_stride, UINT_MAX) + 32) >> 6;
- usad = (vp8_sad4x4(u, uv_stride, ud, uvd_stride, UINT_MAX) + 8) >> 4;
- vsad = (vp8_sad4x4(v, uv_stride, vd, uvd_stride, UINT_MAX) + 8) >> 4;
+ sad = (vpx_sad8x8(y, y_stride, yd, yd_stride) + 32) >> 6;
+ usad = (vpx_sad4x4(u, uv_stride, ud, uvd_stride) + 8) >> 4;
+ vsad = (vpx_sad4x4(v, uv_stride, vd, uvd_stride) + 8) >> 4;
#endif
}
@@ -231,9 +232,9 @@ static void multiframe_quality_enhance_block
{
vp8_copy_mem8x8(y, y_stride, yd, yd_stride);
for (up = u, udp = ud, i = 0; i < uvblksize; ++i, up += uv_stride, udp += uvd_stride)
- vpx_memcpy(udp, up, uvblksize);
+ memcpy(udp, up, uvblksize);
for (vp = v, vdp = vd, i = 0; i < uvblksize; ++i, vp += uv_stride, vdp += uvd_stride)
- vpx_memcpy(vdp, vp, uvblksize);
+ memcpy(vdp, vp, uvblksize);
}
}
}
@@ -341,8 +342,8 @@ void vp8_multiframe_quality_enhance
for (k = 0; k < 4; ++k, up += show->uv_stride, udp += dest->uv_stride,
vp += show->uv_stride, vdp += dest->uv_stride)
{
- vpx_memcpy(udp, up, 4);
- vpx_memcpy(vdp, vp, 4);
+ memcpy(udp, up, 4);
+ memcpy(vdp, vp, 4);
}
}
}
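Two related API shifts appear in the mfqe.c hunks above. In the USE_SSD branches the variance helpers' return value is now discarded and only the sse output parameter is used. In the SAD branches the old vp8_sad* helpers, whose final max_sad argument was an early-termination bound (passed as UINT_MAX, i.e. never terminate early), give way to vpx_dsp's vpx_sad*, which take no bound. The (x + 128) >> 8 and (x + 32) >> 6 steps turn a block SAD into a rounded per-pixel average (a 16x16 block has 256 pixels, an 8x8 block 64). A scalar sketch of the quantity being computed, assuming the conventional SAD definition:

    static unsigned int block_sad(const unsigned char *src, int src_stride,
                                  const unsigned char *ref, int ref_stride,
                                  int width, int height) {
        unsigned int sad = 0;
        int r, c;
        for (r = 0; r < height; ++r) {
            for (c = 0; c < width; ++c)
                sad += src[c] > ref[c] ? src[c] - ref[c] : ref[c] - src[c];
            src += src_stride;
            ref += ref_stride;
        }
        return sad;  /* (block_sad(..., 16, 16) + 128) >> 8 is the mean */
    }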
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/mips/dspr2/dequantize_dspr2.c b/chromium/third_party/libvpx/source/libvpx/vp8/common/mips/dspr2/dequantize_dspr2.c
index 619ee808d8d..fc3bb8ad9db 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/mips/dspr2/dequantize_dspr2.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/mips/dspr2/dequantize_dspr2.c
@@ -26,7 +26,7 @@ void vp8_dequant_idct_add_dspr2(short *input, short *dq,
vp8_short_idct4x4llm_dspr2(input, dest, stride, dest, stride);
- vpx_memset(input, 0, 32);
+ memset(input, 0, 32);
}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/onyx.h b/chromium/third_party/libvpx/source/libvpx/vp8/common/onyx.h
index d48c4fe5e3d..f39b675cd50 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/onyx.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/onyx.h
@@ -122,6 +122,7 @@ extern "C"
int Sharpness;
int cpu_used;
unsigned int rc_max_intra_bitrate_pct;
+ unsigned int screen_content_mode;
/* mode ->
*(0)=Realtime/Live Encoding. This mode is optimized for realtim
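screen_content_mode is a new field in the VP8 encoder configuration above. In this snapshot the corresponding application-facing knob appears to be the VP8E_SET_SCREEN_CONTENT_MODE codec control in vpx/vp8cx.h; the value semantics assumed below (0 = off, nonzero = on) are an inference, not confirmed by this diff. A hedged usage sketch:

    #include "vpx/vpx_encoder.h"
    #include "vpx/vp8cx.h"

    /* assumed mapping onto VP8_CONFIG::screen_content_mode */
    static void enable_screen_content(vpx_codec_ctx_t *encoder) {
        vpx_codec_control(encoder, VP8E_SET_SCREEN_CONTENT_MODE, 1);
    }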
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/postproc.c b/chromium/third_party/libvpx/source/libvpx/vp8/common/postproc.c
index 277f37194a9..266431a3240 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/postproc.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/postproc.c
@@ -355,8 +355,8 @@ void vp8_deblock(VP8_COMMON *cm,
else
mb_ppl = (unsigned char)ppl;
- vpx_memset(ylptr, mb_ppl, 16);
- vpx_memset(uvlptr, mb_ppl, 8);
+ memset(ylptr, mb_ppl, 16);
+ memset(uvlptr, mb_ppl, 8);
ylptr += 16;
uvlptr += 8;
@@ -403,7 +403,7 @@ void vp8_de_noise(VP8_COMMON *cm,
(void) low_var_thresh;
(void) flag;
- vpx_memset(limits, (unsigned char)ppl, 16 * mb_cols);
+ memset(limits, (unsigned char)ppl, 16 * mb_cols);
    /* TODO: The original code doesn't filter the 2 outer rows and columns. */
for (mbr = 0; mbr < mb_rows; mbr++)
@@ -518,6 +518,7 @@ void vp8_plane_add_noise_c(unsigned char *Start, char *noise,
unsigned int Width, unsigned int Height, int Pitch)
{
unsigned int i, j;
+ (void)bothclamp;
for (i = 0; i < Height; i++)
{
@@ -762,7 +763,7 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t
        /* ensure that postproc is set to all 0's so that post proc
* doesn't pull random data in from edge
*/
- vpx_memset((&oci->post_proc_buffer_int)->buffer_alloc,128,(&oci->post_proc_buffer)->frame_size);
+ memset((&oci->post_proc_buffer_int)->buffer_alloc,128,(&oci->post_proc_buffer)->frame_size);
}
}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/ppc/copy_altivec.asm b/chromium/third_party/libvpx/source/libvpx/vp8/common/ppc/copy_altivec.asm
deleted file mode 100644
index a4ce9158342..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/ppc/copy_altivec.asm
+++ /dev/null
@@ -1,47 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- .globl copy_mem16x16_ppc
-
-;# r3 unsigned char *src
-;# r4 int src_stride
-;# r5 unsigned char *dst
-;# r6 int dst_stride
-
-;# Make the assumption that input will not be aligned,
-;# but the output will be. So two reads and a perm
-;# for the input, but only one store for the output.
-copy_mem16x16_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xe000
- mtspr 256, r12 ;# set VRSAVE
-
- li r10, 16
- mtctr r10
-
-cp_16x16_loop:
-    lvsl v0, 0, r3 ;# permute value for alignment
-
- lvx v1, 0, r3
- lvx v2, r10, r3
-
- vperm v1, v1, v2, v0
-
- stvx v1, 0, r5
-
- add r3, r3, r4 ;# increment source pointer
- add r5, r5, r6 ;# increment destination pointer
-
- bdnz cp_16x16_loop
-
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/ppc/filter_altivec.asm b/chromium/third_party/libvpx/source/libvpx/vp8/common/ppc/filter_altivec.asm
deleted file mode 100644
index 4da2e94f959..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/ppc/filter_altivec.asm
+++ /dev/null
@@ -1,1013 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- .globl sixtap_predict_ppc
- .globl sixtap_predict8x4_ppc
- .globl sixtap_predict8x8_ppc
- .globl sixtap_predict16x16_ppc
-
-.macro load_c V, LABEL, OFF, R0, R1
- lis \R0, \LABEL@ha
- la \R1, \LABEL@l(\R0)
- lvx \V, \OFF, \R1
-.endm
-
-.macro load_hfilter V0, V1
- load_c \V0, HFilter, r5, r9, r10
-
- addi r5, r5, 16
- lvx \V1, r5, r10
-.endm
-
-;# Vertical filtering
-.macro Vprolog
- load_c v0, VFilter, r6, r3, r10
-
- vspltish v5, 8
- vspltish v6, 3
- vslh v6, v5, v6 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
- vspltb v1, v0, 1
- vspltb v2, v0, 2
- vspltb v3, v0, 3
- vspltb v4, v0, 4
- vspltb v5, v0, 5
- vspltb v0, v0, 0
-.endm
-
-.macro vpre_load
- Vprolog
- li r10, 16
- lvx v10, 0, r9 ;# v10..v14 = first 5 rows
- lvx v11, r10, r9
- addi r9, r9, 32
- lvx v12, 0, r9
- lvx v13, r10, r9
- addi r9, r9, 32
- lvx v14, 0, r9
-.endm
-
-.macro Msum Re, Ro, V, T, TMP
- ;# (Re,Ro) += (V*T)
- vmuleub \TMP, \V, \T ;# trashes v8
- vadduhm \Re, \Re, \TMP ;# Re = evens, saturation unnecessary
- vmuloub \TMP, \V, \T
- vadduhm \Ro, \Ro, \TMP ;# Ro = odds
-.endm
-
-.macro vinterp_no_store P0 P1 P2 P3 P4 P5
- vmuleub v8, \P0, v0 ;# 64 + 4 positive taps
- vadduhm v16, v6, v8
- vmuloub v8, \P0, v0
- vadduhm v17, v6, v8
- Msum v16, v17, \P2, v2, v8
- Msum v16, v17, \P3, v3, v8
- Msum v16, v17, \P5, v5, v8
-
- vmuleub v18, \P1, v1 ;# 2 negative taps
- vmuloub v19, \P1, v1
- Msum v18, v19, \P4, v4, v8
-
- vsubuhs v16, v16, v18 ;# subtract neg from pos
- vsubuhs v17, v17, v19
- vsrh v16, v16, v7 ;# divide by 128
- vsrh v17, v17, v7 ;# v16 v17 = evens, odds
- vmrghh v18, v16, v17 ;# v18 v19 = 16-bit result in order
- vmrglh v19, v16, v17
- vpkuhus \P0, v18, v19 ;# P0 = 8-bit result
-.endm
-
-.macro vinterp_no_store_8x8 P0 P1 P2 P3 P4 P5
- vmuleub v24, \P0, v13 ;# 64 + 4 positive taps
- vadduhm v21, v20, v24
- vmuloub v24, \P0, v13
- vadduhm v22, v20, v24
- Msum v21, v22, \P2, v15, v25
- Msum v21, v22, \P3, v16, v25
- Msum v21, v22, \P5, v18, v25
-
- vmuleub v23, \P1, v14 ;# 2 negative taps
- vmuloub v24, \P1, v14
- Msum v23, v24, \P4, v17, v25
-
- vsubuhs v21, v21, v23 ;# subtract neg from pos
- vsubuhs v22, v22, v24
- vsrh v21, v21, v19 ;# divide by 128
- vsrh v22, v22, v19 ;# v16 v17 = evens, odds
- vmrghh v23, v21, v22 ;# v18 v19 = 16-bit result in order
- vmrglh v24, v21, v22
- vpkuhus \P0, v23, v24 ;# P0 = 8-bit result
-.endm
-
-
-.macro Vinterp P0 P1 P2 P3 P4 P5
- vinterp_no_store \P0, \P1, \P2, \P3, \P4, \P5
- stvx \P0, 0, r7
- add r7, r7, r8 ;# 33 ops per 16 pels
-.endm
-
-
-.macro luma_v P0, P1, P2, P3, P4, P5
- addi r9, r9, 16 ;# P5 = newest input row
- lvx \P5, 0, r9
- Vinterp \P0, \P1, \P2, \P3, \P4, \P5
-.endm
-
-.macro luma_vtwo
- luma_v v10, v11, v12, v13, v14, v15
- luma_v v11, v12, v13, v14, v15, v10
-.endm
-
-.macro luma_vfour
- luma_vtwo
- luma_v v12, v13, v14, v15, v10, v11
- luma_v v13, v14, v15, v10, v11, v12
-.endm
-
-.macro luma_vsix
- luma_vfour
- luma_v v14, v15, v10, v11, v12, v13
- luma_v v15, v10, v11, v12, v13, v14
-.endm
-
-.macro Interp4 R I I4
- vmsummbm \R, v13, \I, v15
- vmsummbm \R, v14, \I4, \R
-.endm
-
-.macro Read8x8 VD, RS, RP, increment_counter
-    lvsl v21, 0, \RS ;# permute value for alignment
-
-    ;# input to filter is 21 bytes wide, output is 16 bytes.
-    ;# input can span three vectors if not aligned correctly.
- lvx \VD, 0, \RS
- lvx v20, r10, \RS
-
-.if \increment_counter
- add \RS, \RS, \RP
-.endif
-
- vperm \VD, \VD, v20, v21
-.endm
-
-.macro interp_8x8 R
- vperm v20, \R, \R, v16 ;# v20 = 0123 1234 2345 3456
- vperm v21, \R, \R, v17 ;# v21 = 4567 5678 6789 789A
- Interp4 v20, v20, v21 ;# v20 = result 0 1 2 3
- vperm \R, \R, \R, v18 ;# R = 89AB 9ABC ABCx BCxx
- Interp4 v21, v21, \R ;# v21 = result 4 5 6 7
-
- vpkswus \R, v20, v21 ;# R = 0 1 2 3 4 5 6 7
- vsrh \R, \R, v19
-
- vpkuhus \R, \R, \R ;# saturate and pack
-
-.endm
-
-.macro Read4x4 VD, RS, RP, increment_counter
-    lvsl v21, 0, \RS ;# permute value for alignment
-
-    ;# input to filter is 21 bytes wide, output is 16 bytes.
-    ;# input can span three vectors if not aligned correctly.
- lvx v20, 0, \RS
-
-.if \increment_counter
- add \RS, \RS, \RP
-.endif
-
- vperm \VD, v20, v20, v21
-.endm
- .text
-
- .align 2
-;# r3 unsigned char * src
-;# r4 int src_pitch
-;# r5 int x_offset
-;# r6 int y_offset
-;# r7 unsigned char * dst
-;# r8 int dst_pitch
-sixtap_predict_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xff87
- ori r12, r12, 0xffc0
- mtspr 256, r12 ;# set VRSAVE
-
- stwu r1,-32(r1) ;# create space on the stack
-
- slwi. r5, r5, 5 ;# index into horizontal filter array
-
- vspltish v19, 7
-
- ;# If there isn't any filtering to be done for the horizontal, then
- ;# just skip to the second pass.
- beq- vertical_only_4x4
-
- ;# load up horizontal filter
- load_hfilter v13, v14
-
- ;# rounding added in on the multiply
- vspltisw v16, 8
- vspltisw v15, 3
- vslw v15, v16, v15 ;# 0x00000040000000400000004000000040
-
- ;# Load up permutation constants
- load_c v16, B_0123, 0, r9, r10
- load_c v17, B_4567, 0, r9, r10
- load_c v18, B_89AB, 0, r9, r10
-
- ;# Back off input buffer by 2 bytes. Need 2 before and 3 after
- addi r3, r3, -2
-
- addi r9, r3, 0
- li r10, 16
- Read8x8 v2, r3, r4, 1
- Read8x8 v3, r3, r4, 1
- Read8x8 v4, r3, r4, 1
- Read8x8 v5, r3, r4, 1
-
- slwi. r6, r6, 4 ;# index into vertical filter array
-
- ;# filter a line
- interp_8x8 v2
- interp_8x8 v3
- interp_8x8 v4
- interp_8x8 v5
-
- ;# Finished filtering main horizontal block. If there is no
- ;# vertical filtering, jump to storing the data. Otherwise
- ;# load up and filter the additional 5 lines that are needed
- ;# for the vertical filter.
- beq- store_4x4
-
- ;# only needed if there is a vertical filter present
- ;# if the second filter is not null then need to back off by 2*pitch
- sub r9, r9, r4
- sub r9, r9, r4
-
- Read8x8 v0, r9, r4, 1
- Read8x8 v1, r9, r4, 0
- Read8x8 v6, r3, r4, 1
- Read8x8 v7, r3, r4, 1
- Read8x8 v8, r3, r4, 0
-
- interp_8x8 v0
- interp_8x8 v1
- interp_8x8 v6
- interp_8x8 v7
- interp_8x8 v8
-
- b second_pass_4x4
-
-vertical_only_4x4:
- ;# only needed if there is a vertical filter present
- ;# if the second filter is not null then need to back off by 2*pitch
- sub r3, r3, r4
- sub r3, r3, r4
- li r10, 16
-
- Read8x8 v0, r3, r4, 1
- Read8x8 v1, r3, r4, 1
- Read8x8 v2, r3, r4, 1
- Read8x8 v3, r3, r4, 1
- Read8x8 v4, r3, r4, 1
- Read8x8 v5, r3, r4, 1
- Read8x8 v6, r3, r4, 1
- Read8x8 v7, r3, r4, 1
- Read8x8 v8, r3, r4, 0
-
- slwi r6, r6, 4 ;# index into vertical filter array
-
-second_pass_4x4:
- load_c v20, b_hilo_4x4, 0, r9, r10
- load_c v21, b_hilo, 0, r9, r10
-
- ;# reposition input so that it can go through the
- ;# filtering phase with one pass.
- vperm v0, v0, v1, v20 ;# 0 1 x x
- vperm v2, v2, v3, v20 ;# 2 3 x x
- vperm v4, v4, v5, v20 ;# 4 5 x x
- vperm v6, v6, v7, v20 ;# 6 7 x x
-
- vperm v0, v0, v2, v21 ;# 0 1 2 3
- vperm v4, v4, v6, v21 ;# 4 5 6 7
-
- vsldoi v1, v0, v4, 4
- vsldoi v2, v0, v4, 8
- vsldoi v3, v0, v4, 12
-
- vsldoi v5, v4, v8, 4
-
- load_c v13, VFilter, r6, r9, r10
-
- vspltish v15, 8
- vspltish v20, 3
- vslh v20, v15, v20 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
- vspltb v14, v13, 1
- vspltb v15, v13, 2
- vspltb v16, v13, 3
- vspltb v17, v13, 4
- vspltb v18, v13, 5
- vspltb v13, v13, 0
-
- vinterp_no_store_8x8 v0, v1, v2, v3, v4, v5
-
- stvx v0, 0, r1
-
- lwz r0, 0(r1)
- stw r0, 0(r7)
- add r7, r7, r8
-
- lwz r0, 4(r1)
- stw r0, 0(r7)
- add r7, r7, r8
-
- lwz r0, 8(r1)
- stw r0, 0(r7)
- add r7, r7, r8
-
- lwz r0, 12(r1)
- stw r0, 0(r7)
-
- b exit_4x4
-
-store_4x4:
-
- stvx v2, 0, r1
- lwz r0, 0(r1)
- stw r0, 0(r7)
- add r7, r7, r8
-
- stvx v3, 0, r1
- lwz r0, 0(r1)
- stw r0, 0(r7)
- add r7, r7, r8
-
- stvx v4, 0, r1
- lwz r0, 0(r1)
- stw r0, 0(r7)
- add r7, r7, r8
-
- stvx v5, 0, r1
- lwz r0, 0(r1)
- stw r0, 0(r7)
-
-exit_4x4:
-
- addi r1, r1, 32 ;# recover stack
-
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
-
-.macro w_8x8 V, D, R, P
- stvx \V, 0, r1
- lwz \R, 0(r1)
- stw \R, 0(r7)
- lwz \R, 4(r1)
- stw \R, 4(r7)
- add \D, \D, \P
-.endm
-
- .align 2
-;# r3 unsigned char * src
-;# r4 int src_pitch
-;# r5 int x_offset
-;# r6 int y_offset
-;# r7 unsigned char * dst
-;# r8 int dst_pitch
-
-sixtap_predict8x4_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xffff
- ori r12, r12, 0xffc0
- mtspr 256, r12 ;# set VRSAVE
-
- stwu r1,-32(r1) ;# create space on the stack
-
- slwi. r5, r5, 5 ;# index into horizontal filter array
-
- vspltish v19, 7
-
- ;# If there isn't any filtering to be done for the horizontal, then
- ;# just skip to the second pass.
- beq- second_pass_pre_copy_8x4
-
- load_hfilter v13, v14
-
- ;# rounding added in on the multiply
- vspltisw v16, 8
- vspltisw v15, 3
- vslw v15, v16, v15 ;# 0x00000040000000400000004000000040
-
- ;# Load up permutation constants
- load_c v16, B_0123, 0, r9, r10
- load_c v17, B_4567, 0, r9, r10
- load_c v18, B_89AB, 0, r9, r10
-
- ;# Back off input buffer by 2 bytes. Need 2 before and 3 after
- addi r3, r3, -2
-
- addi r9, r3, 0
- li r10, 16
- Read8x8 v2, r3, r4, 1
- Read8x8 v3, r3, r4, 1
- Read8x8 v4, r3, r4, 1
- Read8x8 v5, r3, r4, 1
-
- slwi. r6, r6, 4 ;# index into vertical filter array
-
- ;# filter a line
- interp_8x8 v2
- interp_8x8 v3
- interp_8x8 v4
- interp_8x8 v5
-
- ;# Finished filtering main horizontal block. If there is no
- ;# vertical filtering, jump to storing the data. Otherwise
- ;# load up and filter the additional 5 lines that are needed
- ;# for the vertical filter.
- beq- store_8x4
-
- ;# only needed if there is a vertical filter present
- ;# if the second filter is not null then need to back off by 2*pitch
- sub r9, r9, r4
- sub r9, r9, r4
-
- Read8x8 v0, r9, r4, 1
- Read8x8 v1, r9, r4, 0
- Read8x8 v6, r3, r4, 1
- Read8x8 v7, r3, r4, 1
- Read8x8 v8, r3, r4, 0
-
- interp_8x8 v0
- interp_8x8 v1
- interp_8x8 v6
- interp_8x8 v7
- interp_8x8 v8
-
- b second_pass_8x4
-
-second_pass_pre_copy_8x4:
- ;# only needed if there is a vertical filter present
- ;# if the second filter is not null then need to back off by 2*pitch
- sub r3, r3, r4
- sub r3, r3, r4
- li r10, 16
-
- Read8x8 v0, r3, r4, 1
- Read8x8 v1, r3, r4, 1
- Read8x8 v2, r3, r4, 1
- Read8x8 v3, r3, r4, 1
- Read8x8 v4, r3, r4, 1
- Read8x8 v5, r3, r4, 1
- Read8x8 v6, r3, r4, 1
- Read8x8 v7, r3, r4, 1
- Read8x8 v8, r3, r4, 1
-
- slwi r6, r6, 4 ;# index into vertical filter array
-
-second_pass_8x4:
- load_c v13, VFilter, r6, r9, r10
-
- vspltish v15, 8
- vspltish v20, 3
- vslh v20, v15, v20 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
- vspltb v14, v13, 1
- vspltb v15, v13, 2
- vspltb v16, v13, 3
- vspltb v17, v13, 4
- vspltb v18, v13, 5
- vspltb v13, v13, 0
-
- vinterp_no_store_8x8 v0, v1, v2, v3, v4, v5
- vinterp_no_store_8x8 v1, v2, v3, v4, v5, v6
- vinterp_no_store_8x8 v2, v3, v4, v5, v6, v7
- vinterp_no_store_8x8 v3, v4, v5, v6, v7, v8
-
- cmpi cr0, r8, 8
- beq cr0, store_aligned_8x4
-
- w_8x8 v0, r7, r0, r8
- w_8x8 v1, r7, r0, r8
- w_8x8 v2, r7, r0, r8
- w_8x8 v3, r7, r0, r8
-
- b exit_8x4
-
-store_aligned_8x4:
-
- load_c v10, b_hilo, 0, r9, r10
-
- vperm v0, v0, v1, v10
- vperm v2, v2, v3, v10
-
- stvx v0, 0, r7
- addi r7, r7, 16
- stvx v2, 0, r7
-
- b exit_8x4
-
-store_8x4:
- cmpi cr0, r8, 8
- beq cr0, store_aligned2_8x4
-
- w_8x8 v2, r7, r0, r8
- w_8x8 v3, r7, r0, r8
- w_8x8 v4, r7, r0, r8
- w_8x8 v5, r7, r0, r8
-
- b exit_8x4
-
-store_aligned2_8x4:
- load_c v10, b_hilo, 0, r9, r10
-
- vperm v2, v2, v3, v10
- vperm v4, v4, v5, v10
-
- stvx v2, 0, r7
- addi r7, r7, 16
- stvx v4, 0, r7
-
-exit_8x4:
-
- addi r1, r1, 32 ;# recover stack
-
- mtspr 256, r11 ;# reset old VRSAVE
-
-
- blr
-
- .align 2
-;# r3 unsigned char * src
-;# r4 int src_pitch
-;# r5 int x_offset
-;# r6 int y_offset
-;# r7 unsigned char * dst
-;# r8 int dst_pitch
-
-;# Because the width that needs to be filtered will fit in a single altivec
-;# register, there is no need to loop. Everything can stay in registers.
-sixtap_predict8x8_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xffff
- ori r12, r12, 0xffc0
- mtspr 256, r12 ;# set VRSAVE
-
- stwu r1,-32(r1) ;# create space on the stack
-
- slwi. r5, r5, 5 ;# index into horizontal filter array
-
- vspltish v19, 7
-
- ;# If there isn't any filtering to be done for the horizontal, then
- ;# just skip to the second pass.
- beq- second_pass_pre_copy_8x8
-
- load_hfilter v13, v14
-
- ;# rounding added in on the multiply
- vspltisw v16, 8
- vspltisw v15, 3
- vslw v15, v16, v15 ;# 0x00000040000000400000004000000040
-
- ;# Load up permutation constants
- load_c v16, B_0123, 0, r9, r10
- load_c v17, B_4567, 0, r9, r10
- load_c v18, B_89AB, 0, r9, r10
-
- ;# Back off input buffer by 2 bytes. Need 2 before and 3 after
- addi r3, r3, -2
-
- addi r9, r3, 0
- li r10, 16
- Read8x8 v2, r3, r4, 1
- Read8x8 v3, r3, r4, 1
- Read8x8 v4, r3, r4, 1
- Read8x8 v5, r3, r4, 1
- Read8x8 v6, r3, r4, 1
- Read8x8 v7, r3, r4, 1
- Read8x8 v8, r3, r4, 1
- Read8x8 v9, r3, r4, 1
-
- slwi. r6, r6, 4 ;# index into vertical filter array
-
- ;# filter a line
- interp_8x8 v2
- interp_8x8 v3
- interp_8x8 v4
- interp_8x8 v5
- interp_8x8 v6
- interp_8x8 v7
- interp_8x8 v8
- interp_8x8 v9
-
- ;# Finished filtering main horizontal block. If there is no
- ;# vertical filtering, jump to storing the data. Otherwise
- ;# load up and filter the additional 5 lines that are needed
- ;# for the vertical filter.
- beq- store_8x8
-
- ;# only needed if there is a vertical filter present
- ;# if the second filter is not null then need to back off by 2*pitch
- sub r9, r9, r4
- sub r9, r9, r4
-
- Read8x8 v0, r9, r4, 1
- Read8x8 v1, r9, r4, 0
- Read8x8 v10, r3, r4, 1
- Read8x8 v11, r3, r4, 1
- Read8x8 v12, r3, r4, 0
-
- interp_8x8 v0
- interp_8x8 v1
- interp_8x8 v10
- interp_8x8 v11
- interp_8x8 v12
-
- b second_pass_8x8
-
-second_pass_pre_copy_8x8:
- ;# only needed if there is a vertical filter present
- ;# if the second filter is not null then need to back off by 2*pitch
- sub r3, r3, r4
- sub r3, r3, r4
- li r10, 16
-
- Read8x8 v0, r3, r4, 1
- Read8x8 v1, r3, r4, 1
- Read8x8 v2, r3, r4, 1
- Read8x8 v3, r3, r4, 1
- Read8x8 v4, r3, r4, 1
- Read8x8 v5, r3, r4, 1
- Read8x8 v6, r3, r4, 1
- Read8x8 v7, r3, r4, 1
- Read8x8 v8, r3, r4, 1
- Read8x8 v9, r3, r4, 1
- Read8x8 v10, r3, r4, 1
- Read8x8 v11, r3, r4, 1
- Read8x8 v12, r3, r4, 0
-
- slwi r6, r6, 4 ;# index into vertical filter array
-
-second_pass_8x8:
- load_c v13, VFilter, r6, r9, r10
-
- vspltish v15, 8
- vspltish v20, 3
- vslh v20, v15, v20 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
- vspltb v14, v13, 1
- vspltb v15, v13, 2
- vspltb v16, v13, 3
- vspltb v17, v13, 4
- vspltb v18, v13, 5
- vspltb v13, v13, 0
-
- vinterp_no_store_8x8 v0, v1, v2, v3, v4, v5
- vinterp_no_store_8x8 v1, v2, v3, v4, v5, v6
- vinterp_no_store_8x8 v2, v3, v4, v5, v6, v7
- vinterp_no_store_8x8 v3, v4, v5, v6, v7, v8
- vinterp_no_store_8x8 v4, v5, v6, v7, v8, v9
- vinterp_no_store_8x8 v5, v6, v7, v8, v9, v10
- vinterp_no_store_8x8 v6, v7, v8, v9, v10, v11
- vinterp_no_store_8x8 v7, v8, v9, v10, v11, v12
-
- cmpi cr0, r8, 8
- beq cr0, store_aligned_8x8
-
- w_8x8 v0, r7, r0, r8
- w_8x8 v1, r7, r0, r8
- w_8x8 v2, r7, r0, r8
- w_8x8 v3, r7, r0, r8
- w_8x8 v4, r7, r0, r8
- w_8x8 v5, r7, r0, r8
- w_8x8 v6, r7, r0, r8
- w_8x8 v7, r7, r0, r8
-
- b exit_8x8
-
-store_aligned_8x8:
-
- load_c v10, b_hilo, 0, r9, r10
-
- vperm v0, v0, v1, v10
- vperm v2, v2, v3, v10
- vperm v4, v4, v5, v10
- vperm v6, v6, v7, v10
-
- stvx v0, 0, r7
- addi r7, r7, 16
- stvx v2, 0, r7
- addi r7, r7, 16
- stvx v4, 0, r7
- addi r7, r7, 16
- stvx v6, 0, r7
-
- b exit_8x8
-
-store_8x8:
- cmpi cr0, r8, 8
- beq cr0, store_aligned2_8x8
-
- w_8x8 v2, r7, r0, r8
- w_8x8 v3, r7, r0, r8
- w_8x8 v4, r7, r0, r8
- w_8x8 v5, r7, r0, r8
- w_8x8 v6, r7, r0, r8
- w_8x8 v7, r7, r0, r8
- w_8x8 v8, r7, r0, r8
- w_8x8 v9, r7, r0, r8
-
- b exit_8x8
-
-store_aligned2_8x8:
- load_c v10, b_hilo, 0, r9, r10
-
- vperm v2, v2, v3, v10
- vperm v4, v4, v5, v10
- vperm v6, v6, v7, v10
- vperm v8, v8, v9, v10
-
- stvx v2, 0, r7
- addi r7, r7, 16
- stvx v4, 0, r7
- addi r7, r7, 16
- stvx v6, 0, r7
- addi r7, r7, 16
- stvx v8, 0, r7
-
-exit_8x8:
-
- addi r1, r1, 32 ;# recover stack
-
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
-
- .align 2
-;# r3 unsigned char * src
-;# r4 int src_pitch
-;# r5 int x_offset
-;# r6 int y_offset
-;# r7 unsigned char * dst
-;# r8 int dst_pitch
-
-;# Two pass filtering. First pass is Horizontal edges, second pass is vertical
-;# edges. One of the filters can be null, but both won't be. Needs to use a
-;# temporary buffer because the source buffer can't be modified and the buffer
-;# for the destination is not large enough to hold the temporary data.
-sixtap_predict16x16_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xffff
- ori r12, r12, 0xf000
- mtspr 256, r12 ;# set VRSAVE
-
- stwu r1,-416(r1) ;# create space on the stack
-
-    ;# Three possibilities
- ;# 1. First filter is null. Don't use a temp buffer.
- ;# 2. Second filter is null. Don't use a temp buffer.
- ;# 3. Neither are null, use temp buffer.
-
- ;# First Pass (horizontal edge)
- ;# setup pointers for src
-    ;# if possibility (1) then set up the src pointer to be the original and jump
- ;# to second pass. this is based on if x_offset is 0.
-
- ;# load up horizontal filter
- slwi. r5, r5, 5 ;# index into horizontal filter array
-
- load_hfilter v4, v5
-
- beq- copy_horizontal_16x21
-
- ;# Back off input buffer by 2 bytes. Need 2 before and 3 after
- addi r3, r3, -2
-
- slwi. r6, r6, 4 ;# index into vertical filter array
-
- ;# setup constants
- ;# v14 permutation value for alignment
- load_c v14, b_hperm, 0, r9, r10
-
- ;# These statements are guessing that there won't be a second pass,
- ;# but if there is then inside the bypass they need to be set
- li r0, 16 ;# prepare for no vertical filter
-
- ;# Change the output pointer and pitch to be the actual
-    ;# destination instead of a temporary buffer.
- addi r9, r7, 0
- addi r5, r8, 0
-
- ;# no vertical filter, so write the output from the first pass
- ;# directly into the output buffer.
- beq- no_vertical_filter_bypass
-
- ;# if the second filter is not null then need to back off by 2*pitch
- sub r3, r3, r4
- sub r3, r3, r4
-
- ;# setup counter for the number of lines that are going to be filtered
- li r0, 21
-
- ;# use the stack as temporary storage
- la r9, 48(r1)
- li r5, 16
-
-no_vertical_filter_bypass:
-
- mtctr r0
-
- ;# rounding added in on the multiply
- vspltisw v10, 8
- vspltisw v12, 3
- vslw v12, v10, v12 ;# 0x00000040000000400000004000000040
-
- ;# downshift by 7 ( divide by 128 ) at the end
- vspltish v13, 7
-
- ;# index to the next set of vectors in the row.
- li r10, 16
- li r12, 32
-
-horizontal_loop_16x16:
-
-    lvsl v15, 0, r3 ;# permute value for alignment
-
-    ;# input to filter is 21 bytes wide, output is 16 bytes.
-    ;# input can span three vectors if not aligned correctly.
- lvx v1, 0, r3
- lvx v2, r10, r3
- lvx v3, r12, r3
-
- vperm v8, v1, v2, v15
- vperm v9, v2, v3, v15 ;# v8 v9 = 21 input pixels left-justified
-
- vsldoi v11, v8, v9, 4
-
- ;# set 0
- vmsummbm v6, v4, v8, v12 ;# taps times elements
- vmsummbm v0, v5, v11, v6
-
- ;# set 1
- vsldoi v10, v8, v9, 1
- vsldoi v11, v8, v9, 5
-
- vmsummbm v6, v4, v10, v12
- vmsummbm v1, v5, v11, v6
-
- ;# set 2
- vsldoi v10, v8, v9, 2
- vsldoi v11, v8, v9, 6
-
- vmsummbm v6, v4, v10, v12
- vmsummbm v2, v5, v11, v6
-
- ;# set 3
- vsldoi v10, v8, v9, 3
- vsldoi v11, v8, v9, 7
-
- vmsummbm v6, v4, v10, v12
- vmsummbm v3, v5, v11, v6
-
- vpkswus v0, v0, v1 ;# v0 = 0 4 8 C 1 5 9 D (16-bit)
- vpkswus v1, v2, v3 ;# v1 = 2 6 A E 3 7 B F
-
- vsrh v0, v0, v13 ;# divide v0, v1 by 128
- vsrh v1, v1, v13
-
- vpkuhus v0, v0, v1 ;# v0 = scrambled 8-bit result
- vperm v0, v0, v0, v14 ;# v0 = correctly-ordered result
-
- stvx v0, 0, r9
- add r9, r9, r5
-
- add r3, r3, r4
-
- bdnz horizontal_loop_16x16
-
- ;# check again to see if vertical filter needs to be done.
- cmpi cr0, r6, 0
- beq cr0, end_16x16
-
- ;# yes there is, so go to the second pass
- b second_pass_16x16
-
-copy_horizontal_16x21:
- li r10, 21
- mtctr r10
-
- li r10, 16
-
- sub r3, r3, r4
- sub r3, r3, r4
-
- ;# this is done above if there is a horizontal filter,
- ;# if not it needs to be done down here.
- slwi r6, r6, 4 ;# index into vertical filter array
-
- ;# always write to the stack when doing a horizontal copy
- la r9, 48(r1)
-
-copy_horizontal_loop_16x21:
-    lvsl v15, 0, r3 ;# permute value for alignment
-
- lvx v1, 0, r3
- lvx v2, r10, r3
-
- vperm v8, v1, v2, v15
-
- stvx v8, 0, r9
- addi r9, r9, 16
-
- add r3, r3, r4
-
- bdnz copy_horizontal_loop_16x21
-
-second_pass_16x16:
-
- ;# always read from the stack when doing a vertical filter
- la r9, 48(r1)
-
- ;# downshift by 7 ( divide by 128 ) at the end
- vspltish v7, 7
-
- vpre_load
-
- luma_vsix
- luma_vsix
- luma_vfour
-
-end_16x16:
-
- addi r1, r1, 416 ;# recover stack
-
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
-
- .data
-
- .align 4
-HFilter:
- .byte 0, 0,128, 0, 0, 0,128, 0, 0, 0,128, 0, 0, 0,128, 0
- .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
- .byte 0, -6,123, 12, 0, -6,123, 12, 0, -6,123, 12, 0, -6,123, 12
- .byte -1, 0, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0
- .byte 2,-11,108, 36, 2,-11,108, 36, 2,-11,108, 36, 2,-11,108, 36
- .byte -8, 1, 0, 0, -8, 1, 0, 0, -8, 1, 0, 0, -8, 1, 0, 0
- .byte 0, -9, 93, 50, 0, -9, 93, 50, 0, -9, 93, 50, 0, -9, 93, 50
- .byte -6, 0, 0, 0, -6, 0, 0, 0, -6, 0, 0, 0, -6, 0, 0, 0
- .byte 3,-16, 77, 77, 3,-16, 77, 77, 3,-16, 77, 77, 3,-16, 77, 77
- .byte -16, 3, 0, 0,-16, 3, 0, 0,-16, 3, 0, 0,-16, 3, 0, 0
- .byte 0, -6, 50, 93, 0, -6, 50, 93, 0, -6, 50, 93, 0, -6, 50, 93
- .byte -9, 0, 0, 0, -9, 0, 0, 0, -9, 0, 0, 0, -9, 0, 0, 0
- .byte 1, -8, 36,108, 1, -8, 36,108, 1, -8, 36,108, 1, -8, 36,108
- .byte -11, 2, 0, 0,-11, 2, 0, 0,-11, 2, 0, 0,-11, 2, 0, 0
- .byte 0, -1, 12,123, 0, -1, 12,123, 0, -1, 12,123, 0, -1, 12,123
- .byte -6, 0, 0, 0, -6, 0, 0, 0, -6, 0, 0, 0, -6, 0, 0, 0
-
- .align 4
-VFilter:
- .byte 0, 0,128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
- .byte 0, 6,123, 12, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
- .byte 2, 11,108, 36, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
- .byte 0, 9, 93, 50, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
- .byte 3, 16, 77, 77, 16, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
- .byte 0, 6, 50, 93, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
- .byte 1, 8, 36,108, 11, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
- .byte 0, 1, 12,123, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-
- .align 4
-b_hperm:
- .byte 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15
-
- .align 4
-B_0123:
- .byte 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
-
- .align 4
-B_4567:
- .byte 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
-
- .align 4
-B_89AB:
- .byte 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
-
- .align 4
-b_hilo:
- .byte 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23
-
- .align 4
-b_hilo_4x4:
- .byte 0, 1, 2, 3, 16, 17, 18, 19, 0, 0, 0, 0, 0, 0, 0, 0
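The deleted filter_altivec.asm implemented VP8's two-pass six-tap subpel prediction with AltiVec; the HFilter/VFilter tables above carry the eight tap sets, and the comments record the 0x40 rounding term and the divide-by-128 downshift. A scalar reference of a single tap application, reconstructed from those tables and comments (six_tap and its calling convention are illustrative, not the library API):

    /* p points at the centre pixel; taps[0..5] weight p[-2]..p[3] */
    static unsigned char six_tap(const unsigned char *p, const int *taps) {
        int sum = taps[0] * p[-2] + taps[1] * p[-1] + taps[2] * p[0] +
                  taps[3] * p[1]  + taps[4] * p[2]  + taps[5] * p[3];
        sum = (sum + 64) >> 7;  /* rounding, then divide by 128 */
        if (sum < 0) sum = 0;   /* clamp, as the saturating vpkuhus pack does */
        if (sum > 255) sum = 255;
        return (unsigned char)sum;
    }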
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/ppc/filter_bilinear_altivec.asm b/chromium/third_party/libvpx/source/libvpx/vp8/common/ppc/filter_bilinear_altivec.asm
deleted file mode 100644
index fd8aa665fdf..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/ppc/filter_bilinear_altivec.asm
+++ /dev/null
@@ -1,677 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- .globl bilinear_predict4x4_ppc
- .globl bilinear_predict8x4_ppc
- .globl bilinear_predict8x8_ppc
- .globl bilinear_predict16x16_ppc
-
-.macro load_c V, LABEL, OFF, R0, R1
- lis \R0, \LABEL@ha
- la \R1, \LABEL@l(\R0)
- lvx \V, \OFF, \R1
-.endm
-
-.macro load_vfilter V0, V1
- load_c \V0, vfilter_b, r6, r9, r10
-
- addi r6, r6, 16
- lvx \V1, r6, r10
-.endm
-
-.macro HProlog jump_label
- ;# load up horizontal filter
- slwi. r5, r5, 4 ;# index into horizontal filter array
-
- ;# index to the next set of vectors in the row.
- li r10, 16
- li r12, 32
-
- ;# downshift by 7 ( divide by 128 ) at the end
- vspltish v19, 7
-
- ;# If there isn't any filtering to be done for the horizontal, then
- ;# just skip to the second pass.
- beq \jump_label
-
- load_c v20, hfilter_b, r5, r9, r0
-
- ;# setup constants
- ;# v14 permutation value for alignment
- load_c v28, b_hperm_b, 0, r9, r0
-
- ;# rounding added in on the multiply
- vspltisw v21, 8
- vspltisw v18, 3
- vslw v18, v21, v18 ;# 0x00000040000000400000004000000040
-
- slwi. r6, r6, 5 ;# index into vertical filter array
-.endm
-
-;# Filters a horizontal line
-;# expects:
-;# r3 src_ptr
-;# r4 pitch
-;# r10 16
-;# r12 32
-;# v17 perm input
-;# v18 rounding
-;# v19 shift
-;# v20 filter taps
-;# v21 tmp
-;# v22 tmp
-;# v23 tmp
-;# v24 tmp
-;# v25 tmp
-;# v26 tmp
-;# v27 tmp
-;# v28 perm output
-;#
-.macro HFilter V
- vperm v24, v21, v21, v10 ;# v20 = 0123 1234 2345 3456
- vperm v25, v21, v21, v11 ;# v21 = 4567 5678 6789 789A
-
- vmsummbm v24, v20, v24, v18
- vmsummbm v25, v20, v25, v18
-
- vpkswus v24, v24, v25 ;# v24 = 0 4 8 C 1 5 9 D (16-bit)
-
- vsrh v24, v24, v19 ;# divide v0, v1 by 128
-
- vpkuhus \V, v24, v24 ;# \V = scrambled 8-bit result
-.endm
-
-.macro hfilter_8 V, increment_counter
-    lvsl v17, 0, r3 ;# permute value for alignment
-
- ;# input to filter is 9 bytes wide, output is 8 bytes.
- lvx v21, 0, r3
- lvx v22, r10, r3
-
-.if \increment_counter
- add r3, r3, r4
-.endif
- vperm v21, v21, v22, v17
-
- HFilter \V
-.endm
-
-
-.macro load_and_align_8 V, increment_counter
-    lvsl v17, 0, r3 ;# permute value for alignment
-
-    ;# input to filter is 21 bytes wide, output is 16 bytes.
-    ;# input can span three vectors if not aligned correctly.
- lvx v21, 0, r3
- lvx v22, r10, r3
-
-.if \increment_counter
- add r3, r3, r4
-.endif
-
- vperm \V, v21, v22, v17
-.endm
-
-.macro write_aligned_8 V, increment_counter
- stvx \V, 0, r7
-
-.if \increment_counter
- add r7, r7, r8
-.endif
-.endm
-
-.macro vfilter_16 P0 P1
- vmuleub v22, \P0, v20 ;# 64 + 4 positive taps
- vadduhm v22, v18, v22
- vmuloub v23, \P0, v20
- vadduhm v23, v18, v23
-
- vmuleub v24, \P1, v21
- vadduhm v22, v22, v24 ;# Re = evens, saturation unnecessary
- vmuloub v25, \P1, v21
- vadduhm v23, v23, v25 ;# Ro = odds
-
- vsrh v22, v22, v19 ;# divide by 128
- vsrh v23, v23, v19 ;# v16 v17 = evens, odds
- vmrghh \P0, v22, v23 ;# v18 v19 = 16-bit result in order
- vmrglh v23, v22, v23
- vpkuhus \P0, \P0, v23 ;# P0 = 8-bit result
-.endm
-
-
-.macro w_8x8 V, D, R, P
- stvx \V, 0, r1
- lwz \R, 0(r1)
- stw \R, 0(r7)
- lwz \R, 4(r1)
- stw \R, 4(r7)
- add \D, \D, \P
-.endm
-
-
- .align 2
-;# r3 unsigned char * src
-;# r4 int src_pitch
-;# r5 int x_offset
-;# r6 int y_offset
-;# r7 unsigned char * dst
-;# r8 int dst_pitch
-bilinear_predict4x4_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xf830
- ori r12, r12, 0xfff8
- mtspr 256, r12 ;# set VRSAVE
-
- stwu r1,-32(r1) ;# create space on the stack
-
- HProlog second_pass_4x4_pre_copy_b
-
- ;# Load up permutation constants
- load_c v10, b_0123_b, 0, r9, r12
- load_c v11, b_4567_b, 0, r9, r12
-
- hfilter_8 v0, 1
- hfilter_8 v1, 1
- hfilter_8 v2, 1
- hfilter_8 v3, 1
-
- ;# Finished filtering main horizontal block. If there is no
- ;# vertical filtering, jump to storing the data. Otherwise
- ;# load up and filter the additional line that is needed
- ;# for the vertical filter.
- beq store_out_4x4_b
-
- hfilter_8 v4, 0
-
- b second_pass_4x4_b
-
-second_pass_4x4_pre_copy_b:
- slwi r6, r6, 5 ;# index into vertical filter array
-
- load_and_align_8 v0, 1
- load_and_align_8 v1, 1
- load_and_align_8 v2, 1
- load_and_align_8 v3, 1
- load_and_align_8 v4, 1
-
-second_pass_4x4_b:
- vspltish v20, 8
- vspltish v18, 3
- vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
- load_vfilter v20, v21
-
- vfilter_16 v0, v1
- vfilter_16 v1, v2
- vfilter_16 v2, v3
- vfilter_16 v3, v4
-
-store_out_4x4_b:
-
- stvx v0, 0, r1
- lwz r0, 0(r1)
- stw r0, 0(r7)
- add r7, r7, r8
-
- stvx v1, 0, r1
- lwz r0, 0(r1)
- stw r0, 0(r7)
- add r7, r7, r8
-
- stvx v2, 0, r1
- lwz r0, 0(r1)
- stw r0, 0(r7)
- add r7, r7, r8
-
- stvx v3, 0, r1
- lwz r0, 0(r1)
- stw r0, 0(r7)
-
-exit_4x4:
-
- addi r1, r1, 32 ;# recover stack
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
-
- .align 2
-;# r3 unsigned char * src
-;# r4 int src_pitch
-;# r5 int x_offset
-;# r6 int y_offset
-;# r7 unsigned char * dst
-;# r8 int dst_pitch
-bilinear_predict8x4_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xf830
- ori r12, r12, 0xfff8
- mtspr 256, r12 ;# set VRSAVE
-
- stwu r1,-32(r1) ;# create space on the stack
-
- HProlog second_pass_8x4_pre_copy_b
-
- ;# Load up permutation constants
- load_c v10, b_0123_b, 0, r9, r12
- load_c v11, b_4567_b, 0, r9, r12
-
- hfilter_8 v0, 1
- hfilter_8 v1, 1
- hfilter_8 v2, 1
- hfilter_8 v3, 1
-
- ;# Finished filtering main horizontal block. If there is no
- ;# vertical filtering, jump to storing the data. Otherwise
- ;# load up and filter the additional line that is needed
- ;# for the vertical filter.
- beq store_out_8x4_b
-
- hfilter_8 v4, 0
-
- b second_pass_8x4_b
-
-second_pass_8x4_pre_copy_b:
- slwi r6, r6, 5 ;# index into vertical filter array
-
- load_and_align_8 v0, 1
- load_and_align_8 v1, 1
- load_and_align_8 v2, 1
- load_and_align_8 v3, 1
- load_and_align_8 v4, 1
-
-second_pass_8x4_b:
- vspltish v20, 8
- vspltish v18, 3
- vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
- load_vfilter v20, v21
-
- vfilter_16 v0, v1
- vfilter_16 v1, v2
- vfilter_16 v2, v3
- vfilter_16 v3, v4
-
-store_out_8x4_b:
-
- cmpi cr0, r8, 8
- beq cr0, store_aligned_8x4_b
-
- w_8x8 v0, r7, r0, r8
- w_8x8 v1, r7, r0, r8
- w_8x8 v2, r7, r0, r8
- w_8x8 v3, r7, r0, r8
-
- b exit_8x4
-
-store_aligned_8x4_b:
- load_c v10, b_hilo_b, 0, r9, r10
-
- vperm v0, v0, v1, v10
- vperm v2, v2, v3, v10
-
- stvx v0, 0, r7
- addi r7, r7, 16
- stvx v2, 0, r7
-
-exit_8x4:
-
- addi r1, r1, 32 ;# recover stack
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
-
- .align 2
-;# r3 unsigned char * src
-;# r4 int src_pitch
-;# r5 int x_offset
-;# r6 int y_offset
-;# r7 unsigned char * dst
-;# r8 int dst_pitch
-bilinear_predict8x8_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xfff0
- ori r12, r12, 0xffff
- mtspr 256, r12 ;# set VRSAVE
-
- stwu r1,-32(r1) ;# create space on the stack
-
- HProlog second_pass_8x8_pre_copy_b
-
- ;# Load up permutation constants
- load_c v10, b_0123_b, 0, r9, r12
- load_c v11, b_4567_b, 0, r9, r12
-
- hfilter_8 v0, 1
- hfilter_8 v1, 1
- hfilter_8 v2, 1
- hfilter_8 v3, 1
- hfilter_8 v4, 1
- hfilter_8 v5, 1
- hfilter_8 v6, 1
- hfilter_8 v7, 1
-
- ;# Finished filtering main horizontal block. If there is no
- ;# vertical filtering, jump to storing the data. Otherwise
- ;# load up and filter the additional line that is needed
- ;# for the vertical filter.
- beq store_out_8x8_b
-
- hfilter_8 v8, 0
-
- b second_pass_8x8_b
-
-second_pass_8x8_pre_copy_b:
- slwi r6, r6, 5 ;# index into vertical filter array
-
- load_and_align_8 v0, 1
- load_and_align_8 v1, 1
- load_and_align_8 v2, 1
- load_and_align_8 v3, 1
- load_and_align_8 v4, 1
- load_and_align_8 v5, 1
- load_and_align_8 v6, 1
- load_and_align_8 v7, 1
- load_and_align_8 v8, 0
-
-second_pass_8x8_b:
- vspltish v20, 8
- vspltish v18, 3
- vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
- load_vfilter v20, v21
-
- vfilter_16 v0, v1
- vfilter_16 v1, v2
- vfilter_16 v2, v3
- vfilter_16 v3, v4
- vfilter_16 v4, v5
- vfilter_16 v5, v6
- vfilter_16 v6, v7
- vfilter_16 v7, v8
-
-store_out_8x8_b:
-
- cmpi cr0, r8, 8
- beq cr0, store_aligned_8x8_b
-
- w_8x8 v0, r7, r0, r8
- w_8x8 v1, r7, r0, r8
- w_8x8 v2, r7, r0, r8
- w_8x8 v3, r7, r0, r8
- w_8x8 v4, r7, r0, r8
- w_8x8 v5, r7, r0, r8
- w_8x8 v6, r7, r0, r8
- w_8x8 v7, r7, r0, r8
-
- b exit_8x8
-
-store_aligned_8x8_b:
- load_c v10, b_hilo_b, 0, r9, r10
-
- vperm v0, v0, v1, v10
- vperm v2, v2, v3, v10
- vperm v4, v4, v5, v10
- vperm v6, v6, v7, v10
-
- stvx v0, 0, r7
- addi r7, r7, 16
- stvx v2, 0, r7
- addi r7, r7, 16
- stvx v4, 0, r7
- addi r7, r7, 16
- stvx v6, 0, r7
-
-exit_8x8:
-
- addi r1, r1, 32 ;# recover stack
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
-
-;# Filters a horizontal line
-;# expects:
-;# r3 src_ptr
-;# r4 pitch
-;# r10 16
-;# r12 32
-;# v17 perm input
-;# v18 rounding
-;# v19 shift
-;# v20 filter taps
-;# v21 tmp
-;# v22 tmp
-;# v23 tmp
-;# v24 tmp
-;# v25 tmp
-;# v26 tmp
-;# v27 tmp
-;# v28 perm output
-;#
-.macro hfilter_16 V, increment_counter
-
-    lvsl v17, 0, r3 ;# permute value for alignment
-
-    ;# input to filter is 21 bytes wide, output is 16 bytes.
-    ;# input can span three vectors if not aligned correctly.
- lvx v21, 0, r3
- lvx v22, r10, r3
- lvx v23, r12, r3
-
-.if \increment_counter
- add r3, r3, r4
-.endif
- vperm v21, v21, v22, v17
- vperm v22, v22, v23, v17 ;# v8 v9 = 21 input pixels left-justified
-
- ;# set 0
- vmsummbm v24, v20, v21, v18 ;# taps times elements
-
- ;# set 1
- vsldoi v23, v21, v22, 1
- vmsummbm v25, v20, v23, v18
-
- ;# set 2
- vsldoi v23, v21, v22, 2
- vmsummbm v26, v20, v23, v18
-
- ;# set 3
- vsldoi v23, v21, v22, 3
- vmsummbm v27, v20, v23, v18
-
- vpkswus v24, v24, v25 ;# v24 = 0 4 8 C 1 5 9 D (16-bit)
- vpkswus v25, v26, v27 ;# v25 = 2 6 A E 3 7 B F
-
- vsrh v24, v24, v19 ;# divide v0, v1 by 128
- vsrh v25, v25, v19
-
- vpkuhus \V, v24, v25 ;# \V = scrambled 8-bit result
- vperm \V, \V, v0, v28 ;# \V = correctly-ordered result
-.endm
-
-.macro load_and_align_16 V, increment_counter
-    lvsl v17, 0, r3 ;# permute value for alignment
-
-    ;# input to filter is 21 bytes wide, output is 16 bytes.
-    ;# input can span three vectors if not aligned correctly.
- lvx v21, 0, r3
- lvx v22, r10, r3
-
-.if \increment_counter
- add r3, r3, r4
-.endif
-
- vperm \V, v21, v22, v17
-.endm
-
-.macro write_16 V, increment_counter
- stvx \V, 0, r7
-
-.if \increment_counter
- add r7, r7, r8
-.endif
-.endm
-
- .align 2
-;# r3 unsigned char * src
-;# r4 int src_pitch
-;# r5 int x_offset
-;# r6 int y_offset
-;# r7 unsigned char * dst
-;# r8 int dst_pitch
-bilinear_predict16x16_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xffff
- ori r12, r12, 0xfff8
- mtspr 256, r12 ;# set VRSAVE
-
- HProlog second_pass_16x16_pre_copy_b
-
- hfilter_16 v0, 1
- hfilter_16 v1, 1
- hfilter_16 v2, 1
- hfilter_16 v3, 1
- hfilter_16 v4, 1
- hfilter_16 v5, 1
- hfilter_16 v6, 1
- hfilter_16 v7, 1
- hfilter_16 v8, 1
- hfilter_16 v9, 1
- hfilter_16 v10, 1
- hfilter_16 v11, 1
- hfilter_16 v12, 1
- hfilter_16 v13, 1
- hfilter_16 v14, 1
- hfilter_16 v15, 1
-
- ;# Finished filtering main horizontal block. If there is no
- ;# vertical filtering, jump to storing the data. Otherwise
- ;# load up and filter the additional line that is needed
- ;# for the vertical filter.
- beq store_out_16x16_b
-
- hfilter_16 v16, 0
-
- b second_pass_16x16_b
-
-second_pass_16x16_pre_copy_b:
- slwi r6, r6, 5 ;# index into vertical filter array
-
- load_and_align_16 v0, 1
- load_and_align_16 v1, 1
- load_and_align_16 v2, 1
- load_and_align_16 v3, 1
- load_and_align_16 v4, 1
- load_and_align_16 v5, 1
- load_and_align_16 v6, 1
- load_and_align_16 v7, 1
- load_and_align_16 v8, 1
- load_and_align_16 v9, 1
- load_and_align_16 v10, 1
- load_and_align_16 v11, 1
- load_and_align_16 v12, 1
- load_and_align_16 v13, 1
- load_and_align_16 v14, 1
- load_and_align_16 v15, 1
- load_and_align_16 v16, 0
-
-second_pass_16x16_b:
- vspltish v20, 8
- vspltish v18, 3
- vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
- load_vfilter v20, v21
-
- vfilter_16 v0, v1
- vfilter_16 v1, v2
- vfilter_16 v2, v3
- vfilter_16 v3, v4
- vfilter_16 v4, v5
- vfilter_16 v5, v6
- vfilter_16 v6, v7
- vfilter_16 v7, v8
- vfilter_16 v8, v9
- vfilter_16 v9, v10
- vfilter_16 v10, v11
- vfilter_16 v11, v12
- vfilter_16 v12, v13
- vfilter_16 v13, v14
- vfilter_16 v14, v15
- vfilter_16 v15, v16
-
-store_out_16x16_b:
-
- write_16 v0, 1
- write_16 v1, 1
- write_16 v2, 1
- write_16 v3, 1
- write_16 v4, 1
- write_16 v5, 1
- write_16 v6, 1
- write_16 v7, 1
- write_16 v8, 1
- write_16 v9, 1
- write_16 v10, 1
- write_16 v11, 1
- write_16 v12, 1
- write_16 v13, 1
- write_16 v14, 1
- write_16 v15, 0
-
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
-
- .data
-
- .align 4
-hfilter_b:
- .byte 128, 0, 0, 0,128, 0, 0, 0,128, 0, 0, 0,128, 0, 0, 0
- .byte 112, 16, 0, 0,112, 16, 0, 0,112, 16, 0, 0,112, 16, 0, 0
- .byte 96, 32, 0, 0, 96, 32, 0, 0, 96, 32, 0, 0, 96, 32, 0, 0
- .byte 80, 48, 0, 0, 80, 48, 0, 0, 80, 48, 0, 0, 80, 48, 0, 0
- .byte 64, 64, 0, 0, 64, 64, 0, 0, 64, 64, 0, 0, 64, 64, 0, 0
- .byte 48, 80, 0, 0, 48, 80, 0, 0, 48, 80, 0, 0, 48, 80, 0, 0
- .byte 32, 96, 0, 0, 32, 96, 0, 0, 32, 96, 0, 0, 32, 96, 0, 0
- .byte 16,112, 0, 0, 16,112, 0, 0, 16,112, 0, 0, 16,112, 0, 0
-
- .align 4
-vfilter_b:
- .byte 128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128
- .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
- .byte 112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112
- .byte 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
- .byte 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96
- .byte 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
- .byte 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80
- .byte 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48
- .byte 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
- .byte 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
- .byte 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48
- .byte 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80
- .byte 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
- .byte 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96
- .byte 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
- .byte 112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112
-
- .align 4
-b_hperm_b:
- .byte 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15
-
- .align 4
-b_0123_b:
- .byte 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
-
- .align 4
-b_4567_b:
- .byte 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
-
-b_hilo_b:
- .byte 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23
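filter_bilinear_altivec.asm, deleted above, is the two-tap counterpart: every row of hfilter_b/vfilter_b is a pair summing to 128, so the same +64 rounding and >>7 shift renormalize the result. A scalar sketch derived from those tables (the offset parameterization is inferred from the 16-per-step progression in hfilter_b):

    /* offset = 1/8-pel position 0..7; taps are (128 - 16*offset, 16*offset) */
    static unsigned char bilinear_tap(unsigned char a, unsigned char b,
                                      int offset) {
        const int t0 = 128 - 16 * offset;
        const int t1 = 16 * offset;
        return (unsigned char)((a * t0 + b * t1 + 64) >> 7);
    }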
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/ppc/idctllm_altivec.asm b/chromium/third_party/libvpx/source/libvpx/vp8/common/ppc/idctllm_altivec.asm
deleted file mode 100644
index 117d9cfc8e8..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/ppc/idctllm_altivec.asm
+++ /dev/null
@@ -1,189 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- .globl short_idct4x4llm_ppc
-
-.macro load_c V, LABEL, OFF, R0, R1
- lis \R0, \LABEL@ha
- la \R1, \LABEL@l(\R0)
- lvx \V, \OFF, \R1
-.endm
-
-;# r3 short *input
-;# r4 short *output
-;# r5 int pitch
- .align 2
-short_idct4x4llm_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xfff8
- mtspr 256, r12 ;# set VRSAVE
-
- load_c v8, sinpi8sqrt2, 0, r9, r10
- load_c v9, cospi8sqrt2minus1, 0, r9, r10
- load_c v10, hi_hi, 0, r9, r10
- load_c v11, lo_lo, 0, r9, r10
- load_c v12, shift_16, 0, r9, r10
-
- li r10, 16
- lvx v0, 0, r3 ;# input ip[0], ip[ 4]
- lvx v1, r10, r3 ;# input ip[8], ip[12]
-
- ;# first pass
- vupkhsh v2, v0
- vupkhsh v3, v1
- vaddsws v6, v2, v3 ;# a1 = ip[0]+ip[8]
- vsubsws v7, v2, v3 ;# b1 = ip[0]-ip[8]
-
- vupklsh v0, v0
- vmulosh v4, v0, v8
- vsraw v4, v4, v12
- vaddsws v4, v4, v0 ;# ip[ 4] * sin(pi/8) * sqrt(2)
-
- vupklsh v1, v1
- vmulosh v5, v1, v9
- vsraw v5, v5, v12 ;# ip[12] * cos(pi/8) * sqrt(2)
- vaddsws v5, v5, v1
-
- vsubsws v4, v4, v5 ;# c1
-
- vmulosh v3, v1, v8
- vsraw v3, v3, v12
- vaddsws v3, v3, v1 ;# ip[12] * sin(pi/8) * sqrt(2)
-
- vmulosh v5, v0, v9
- vsraw v5, v5, v12 ;# ip[ 4] * cos(pi/8) * sqrt(2)
- vaddsws v5, v5, v0
-
- vaddsws v3, v3, v5 ;# d1
-
- vaddsws v0, v6, v3 ;# a1 + d1
- vsubsws v3, v6, v3 ;# a1 - d1
-
- vaddsws v1, v7, v4 ;# b1 + c1
- vsubsws v2, v7, v4 ;# b1 - c1
-
- ;# transpose input
- vmrghw v4, v0, v1 ;# a0 b0 a1 b1
- vmrghw v5, v2, v3 ;# c0 d0 c1 d1
-
- vmrglw v6, v0, v1 ;# a2 b2 a3 b3
- vmrglw v7, v2, v3 ;# c2 d2 c3 d3
-
- vperm v0, v4, v5, v10 ;# a0 b0 c0 d0
- vperm v1, v4, v5, v11 ;# a1 b1 c1 d1
-
- vperm v2, v6, v7, v10 ;# a2 b2 c2 d2
- vperm v3, v6, v7, v11 ;# a3 b3 c3 d3
-
- ;# second pass
- vaddsws v6, v0, v2 ;# a1 = ip[0]+ip[8]
- vsubsws v7, v0, v2 ;# b1 = ip[0]-ip[8]
-
- vmulosh v4, v1, v8
- vsraw v4, v4, v12
- vaddsws v4, v4, v1 ;# ip[ 4] * sin(pi/8) * sqrt(2)
-
- vmulosh v5, v3, v9
- vsraw v5, v5, v12 ;# ip[12] * cos(pi/8) * sqrt(2)
- vaddsws v5, v5, v3
-
- vsubsws v4, v4, v5 ;# c1
-
- vmulosh v2, v3, v8
- vsraw v2, v2, v12
- vaddsws v2, v2, v3 ;# ip[12] * sin(pi/8) * sqrt(2)
-
- vmulosh v5, v1, v9
- vsraw v5, v5, v12 ;# ip[ 4] * cos(pi/8) * sqrt(2)
- vaddsws v5, v5, v1
-
- vaddsws v3, v2, v5 ;# d1
-
- vaddsws v0, v6, v3 ;# a1 + d1
- vsubsws v3, v6, v3 ;# a1 - d1
-
- vaddsws v1, v7, v4 ;# b1 + c1
- vsubsws v2, v7, v4 ;# b1 - c1
-
- vspltish v6, 4
- vspltish v7, 3
-
- vpkswss v0, v0, v1
- vpkswss v1, v2, v3
-
- vaddshs v0, v0, v6
- vaddshs v1, v1, v6
-
- vsrah v0, v0, v7
- vsrah v1, v1, v7
-
- ;# transpose output
- vmrghh v2, v0, v1 ;# a0 c0 a1 c1 a2 c2 a3 c3
- vmrglh v3, v0, v1 ;# b0 d0 b1 d1 b2 d2 b3 d3
-
- vmrghh v0, v2, v3 ;# a0 b0 c0 d0 a1 b1 c1 d1
- vmrglh v1, v2, v3 ;# a2 b2 c2 d2 a3 b3 c3 d3
-
- stwu r1,-416(r1) ;# create space on the stack
-
- stvx v0, 0, r1
- lwz r6, 0(r1)
- stw r6, 0(r4)
- lwz r6, 4(r1)
- stw r6, 4(r4)
-
- add r4, r4, r5
-
- lwz r6, 8(r1)
- stw r6, 0(r4)
- lwz r6, 12(r1)
- stw r6, 4(r4)
-
- add r4, r4, r5
-
- stvx v1, 0, r1
- lwz r6, 0(r1)
- stw r6, 0(r4)
- lwz r6, 4(r1)
- stw r6, 4(r4)
-
- add r4, r4, r5
-
- lwz r6, 8(r1)
- stw r6, 0(r4)
- lwz r6, 12(r1)
- stw r6, 4(r4)
-
- addi r1, r1, 416 ;# recover stack
-
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
-
- .align 4
-sinpi8sqrt2:
- .short 35468, 35468, 35468, 35468, 35468, 35468, 35468, 35468
-
- .align 4
-cospi8sqrt2minus1:
- .short 20091, 20091, 20091, 20091, 20091, 20091, 20091, 20091
-
- .align 4
-shift_16:
- .long 16, 16, 16, 16
-
- .align 4
-hi_hi:
- .byte 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23
-
- .align 4
-lo_lo:
- .byte 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31
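The constants in the deleted idctllm_altivec.asm are Q16 fixed-point encodings: 35468 is approximately sin(pi/8)*sqrt(2)*65536 and 20091 approximately (cos(pi/8)*sqrt(2) - 1)*65536. Storing the cosine factor minus one keeps that constant inside signed 16-bit range, hence the add of the input after that multiply; in 32-bit C the sine factor needs no such correction. Scalar equivalents of the vmulosh/vsraw/vaddsws sequences, assuming 32-bit int arithmetic:

    static const int sinpi8sqrt2 = 35468;       /* sin(pi/8)*sqrt(2), Q16 */
    static const int cospi8sqrt2minus1 = 20091; /* cos(pi/8)*sqrt(2)-1, Q16 */

    static int mul_sin(int x) { return (x * sinpi8sqrt2) >> 16; }
    static int mul_cos(int x) { return x + ((x * cospi8sqrt2minus1) >> 16); }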
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/ppc/loopfilter_altivec.c b/chromium/third_party/libvpx/source/libvpx/vp8/common/ppc/loopfilter_altivec.c
deleted file mode 100644
index 71bf6e2d759..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/ppc/loopfilter_altivec.c
+++ /dev/null
@@ -1,135 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "loopfilter.h"
-#include "onyxc_int.h"
-
-typedef void loop_filter_function_y_ppc
-(
- unsigned char *s, // source pointer
- int p, // pitch
- const signed char *flimit,
- const signed char *limit,
- const signed char *thresh
-);
-
-typedef void loop_filter_function_uv_ppc
-(
- unsigned char *u, // source pointer
- unsigned char *v, // source pointer
- int p, // pitch
- const signed char *flimit,
- const signed char *limit,
- const signed char *thresh
-);
-
-typedef void loop_filter_function_s_ppc
-(
- unsigned char *s, // source pointer
- int p, // pitch
- const signed char *flimit
-);
-
-loop_filter_function_y_ppc mbloop_filter_horizontal_edge_y_ppc;
-loop_filter_function_y_ppc mbloop_filter_vertical_edge_y_ppc;
-loop_filter_function_y_ppc loop_filter_horizontal_edge_y_ppc;
-loop_filter_function_y_ppc loop_filter_vertical_edge_y_ppc;
-
-loop_filter_function_uv_ppc mbloop_filter_horizontal_edge_uv_ppc;
-loop_filter_function_uv_ppc mbloop_filter_vertical_edge_uv_ppc;
-loop_filter_function_uv_ppc loop_filter_horizontal_edge_uv_ppc;
-loop_filter_function_uv_ppc loop_filter_vertical_edge_uv_ppc;
-
-loop_filter_function_s_ppc loop_filter_simple_horizontal_edge_ppc;
-loop_filter_function_s_ppc loop_filter_simple_vertical_edge_ppc;
-
-// Horizontal MB filtering
-void loop_filter_mbh_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
- int y_stride, int uv_stride, loop_filter_info *lfi)
-{
- mbloop_filter_horizontal_edge_y_ppc(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr);
-
- if (u_ptr)
- mbloop_filter_horizontal_edge_uv_ppc(u_ptr, v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr);
-}
-
-void loop_filter_mbhs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
- int y_stride, int uv_stride, loop_filter_info *lfi)
-{
- (void)u_ptr;
- (void)v_ptr;
- (void)uv_stride;
- loop_filter_simple_horizontal_edge_ppc(y_ptr, y_stride, lfi->mbflim);
-}
-
-// Vertical MB Filtering
-void loop_filter_mbv_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
- int y_stride, int uv_stride, loop_filter_info *lfi)
-{
- mbloop_filter_vertical_edge_y_ppc(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr);
-
- if (u_ptr)
- mbloop_filter_vertical_edge_uv_ppc(u_ptr, v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr);
-}
-
-void loop_filter_mbvs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
- int y_stride, int uv_stride, loop_filter_info *lfi)
-{
- (void)u_ptr;
- (void)v_ptr;
- (void)uv_stride;
- loop_filter_simple_vertical_edge_ppc(y_ptr, y_stride, lfi->mbflim);
-}
-
-// Horizontal B Filtering
-void loop_filter_bh_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
- int y_stride, int uv_stride, loop_filter_info *lfi)
-{
- // These should all be done at once with one call, instead of 3
- loop_filter_horizontal_edge_y_ppc(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr);
- loop_filter_horizontal_edge_y_ppc(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr);
- loop_filter_horizontal_edge_y_ppc(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr);
-
- if (u_ptr)
- loop_filter_horizontal_edge_uv_ppc(u_ptr + 4 * uv_stride, v_ptr + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr);
-}
-
-void loop_filter_bhs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
- int y_stride, int uv_stride, loop_filter_info *lfi)
-{
- (void)u_ptr;
- (void)v_ptr;
- (void)uv_stride;
- loop_filter_simple_horizontal_edge_ppc(y_ptr + 4 * y_stride, y_stride, lfi->flim);
- loop_filter_simple_horizontal_edge_ppc(y_ptr + 8 * y_stride, y_stride, lfi->flim);
- loop_filter_simple_horizontal_edge_ppc(y_ptr + 12 * y_stride, y_stride, lfi->flim);
-}
-
-// Vertical B Filtering
-void loop_filter_bv_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
- int y_stride, int uv_stride, loop_filter_info *lfi)
-{
- loop_filter_vertical_edge_y_ppc(y_ptr, y_stride, lfi->flim, lfi->lim, lfi->thr);
-
- if (u_ptr)
- loop_filter_vertical_edge_uv_ppc(u_ptr + 4, v_ptr + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr);
-}
-
-void loop_filter_bvs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
- int y_stride, int uv_stride, loop_filter_info *lfi)
-{
- (void)u_ptr;
- (void)v_ptr;
- (void)uv_stride;
- loop_filter_simple_vertical_edge_ppc(y_ptr + 4, y_stride, lfi->flim);
- loop_filter_simple_vertical_edge_ppc(y_ptr + 8, y_stride, lfi->flim);
- loop_filter_simple_vertical_edge_ppc(y_ptr + 12, y_stride, lfi->flim);
-}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/ppc/loopfilter_filters_altivec.asm b/chromium/third_party/libvpx/source/libvpx/vp8/common/ppc/loopfilter_filters_altivec.asm
deleted file mode 100644
index 61df4e97639..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/ppc/loopfilter_filters_altivec.asm
+++ /dev/null
@@ -1,1253 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- .globl mbloop_filter_horizontal_edge_y_ppc
- .globl loop_filter_horizontal_edge_y_ppc
- .globl mbloop_filter_vertical_edge_y_ppc
- .globl loop_filter_vertical_edge_y_ppc
-
- .globl mbloop_filter_horizontal_edge_uv_ppc
- .globl loop_filter_horizontal_edge_uv_ppc
- .globl mbloop_filter_vertical_edge_uv_ppc
- .globl loop_filter_vertical_edge_uv_ppc
-
- .globl loop_filter_simple_horizontal_edge_ppc
- .globl loop_filter_simple_vertical_edge_ppc
-
- .text
-;# We often need to perform transposes (and other transpose-like operations)
-;# on matrices of data. This is simplified by the fact that we usually
-;# operate on hunks of data whose dimensions are powers of 2, or at least
-;# divisible by highish powers of 2.
-;#
-;# These operations can be very confusing. They become more straightforward
-;# when we think of them as permutations of address bits: Concatenate a
-;# group of vector registers and think of it as occupying a block of
-;# memory beginning at address zero. The low four bits 0...3 of the
-;# address then correspond to position within a register, the higher-order
-;# address bits select the register.
-;#
-;# Although register selection, at the code level, is arbitrary, things
-;# are simpler if we use contiguous ranges of register numbers, simpler
-;# still if the low-order bits of the register number correspond to
-;# conceptual address bits. We do this whenever reasonable.
-;#
-;# A 16x16 transpose can then be thought of as an operation on
-;# a 256-element block of memory. It takes 8 bits 0...7 to address this
-;# memory and the effect of a transpose is to interchange address bit
-;# 0 with 4, 1 with 5, 2 with 6, and 3 with 7. Bits 0...3 index the
-;# column, which is interchanged with the row addressed by bits 4..7.
-;#
-;# The altivec merge instructions provide a rapid means of effecting
-;# many of these transforms. They operate at three widths (8,16,32).
-;# Writing V(x) for vector register #x, paired merges permute address
-;# indices as follows.
-;#
-;# 0->1 1->2 2->3 3->(4+d) (4+s)->0:
-;#
-;# vmrghb V( x), V( y), V( y + (1<<s))
-;# vmrglb V( x + (1<<d)), V( y), V( y + (1<<s))
-;#
-;#
-;# =0= 1->2 2->3 3->(4+d) (4+s)->1:
-;#
-;# vmrghh V( x), V( y), V( y + (1<<s))
-;# vmrglh V( x + (1<<d)), V( y), V( y + (1<<s))
-;#
-;#
-;# =0= =1= 2->3 3->(4+d) (4+s)->2:
-;#
-;# vmrghw V( x), V( y), V( y + (1<<s))
-;# vmrglw V( x + (1<<d)), V( y), V( y + (1<<s))
-;#
-;#
-;# Unfortunately, there is no doubleword merge instruction.
-;# The following sequence uses "vperm" as a substitute.
-;# Assuming that the selection masks b_hihi and b_lolo (defined in LFppc.c)
-;# are in registers Vhihi and Vlolo, we can also effect the permutation
-;#
-;# =0= =1= =2= 3->(4+d) (4+s)->3 by the sequence:
-;#
-;# vperm V( x), V( y), V( y + (1<<s)), Vhihi
-;# vperm V( x + (1<<d)), V( y), V( y + (1<<s)), Vlolo
-;#
-;#
-;# Except for bits s and d, the other relationships between register
-;# number (= high-order part of address) bits are at the disposal of
-;# the programmer.
-;#
-
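-;# As an illustrative scalar sketch (not part of the original source),
-;# the 16x16 transpose is exactly this address-bit swap in C, where
-;# src/dst stand for the 256-byte register block described above:
-;#
-;#   for (int a = 0; a < 256; ++a)            /* 8 address bits */
-;#       dst[((a & 0x0f) << 4) | (a >> 4)] = src[a];
-;#
-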
-;# To avoid excess transposes, we filter all 3 vertical luma subblock
-;# edges together. This requires a single 16x16 transpose, which, in
-;# the above language, amounts to the following permutation of address
-;# indices: 0<->4 1<->5 2<->6 3<->7, which we accomplish by
-;# 4 iterations of the cyclic transform 0->1->2->3->4->5->6->7->0.
-;#
-;# Except for the fact that the destination registers get written
-;# before we are done referencing the old contents, the cyclic transform
-;# is effected by
-;#
-;# x = 0; do {
-;#   vmrghb V(2x),   V(x), V(x+8);
-;#   vmrglb V(2x+1), V(x), V(x+8);
-;# } while( ++x < 8);
-;#
-;# For clarity, and because we can afford it, we do this transpose
-;# using all 32 registers, alternating the banks 0..15 and 16 .. 31,
-;# leaving the final result in 16 .. 31, as the lower registers are
-;# used in the filtering itself.
-;#
-.macro Tpair A, B, X, Y
- vmrghb \A, \X, \Y
- vmrglb \B, \X, \Y
-.endm
-
-;# Each step takes 8*2 = 16 instructions
-
-.macro t16_even
- Tpair v16,v17, v0,v8
- Tpair v18,v19, v1,v9
- Tpair v20,v21, v2,v10
- Tpair v22,v23, v3,v11
- Tpair v24,v25, v4,v12
- Tpair v26,v27, v5,v13
- Tpair v28,v29, v6,v14
- Tpair v30,v31, v7,v15
-.endm
-
-.macro t16_odd
- Tpair v0,v1, v16,v24
- Tpair v2,v3, v17,v25
- Tpair v4,v5, v18,v26
- Tpair v6,v7, v19,v27
- Tpair v8,v9, v20,v28
- Tpair v10,v11, v21,v29
- Tpair v12,v13, v22,v30
- Tpair v14,v15, v23,v31
-.endm
-
-;# Whole transpose takes 4*16 = 64 instructions
-
-.macro t16_full
- t16_odd
- t16_even
- t16_odd
- t16_even
-.endm
-
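-;# Scalar check of the claim above (a sketch, not from the original
-;# source): each merge pass rotates the 8 address bits left by one,
-;# and four passes compose to the nibble swap, i.e. the transpose:
-;#
-;#   unsigned rot1(unsigned a) { return ((a << 1) | (a >> 7)) & 0xff; }
-;#   /* rot1(rot1(rot1(rot1(a)))) == ((a & 0x0f) << 4) | (a >> 4) */
-;#
-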
-;# Vertical edge filtering requires transposes. For the simple filter,
-;# we need to convert 16 rows of 4 pels each into 4 registers of 16 pels
-;# each. Writing 0 ... 63 for the pixel indices, the desired result is:
-;#
-;# v0 = 0 1 ... 14 15
-;# v1 = 16 17 ... 30 31
-;#    v2 = 32 33 ... 46 47
-;#    v3 = 48 49 ... 62 63
-;#
-;# In frame-buffer memory, the layout is:
-;#
-;# 0 16 32 48
-;# 1 17 33 49
-;# ...
-;# 15 31 47 63.
-;#
-;# We begin by reading the data 32 bits at a time (using scalar operations)
-;# into a temporary array, reading the rows of the array into vector registers,
-;# with the following layout:
-;#
-;# v0 = 0 16 32 48 4 20 36 52 8 24 40 56 12 28 44 60
-;# v1 = 1 17 33 49 5 21 ... 45 61
-;# v2 = 2 18 ... 46 62
-;# v3 = 3 19 ... 47 63
-;#
-;# From the "address-bit" perspective discussed above, we simply need to
-;# interchange bits 0 <-> 4 and 1 <-> 5, leaving bits 2 and 3 alone.
-;# In other words, we transpose each of the four 4x4 submatrices.
-;#
-;# This transformation is its own inverse, and we need to perform it
-;# again before writing the pixels back into the frame buffer.
-;#
-;# It acts in place on registers v0...v3, uses v4...v7 as temporaries,
-;# and assumes that v14/v15 contain the b_hihi/b_lolo selectors
-;# defined above. We think of both groups of 4 registers as having
-;# "addresses" {0,1,2,3} * 16.
-;#
-.macro Transpose4times4x4 Vlo, Vhi
-
- ;# d=s=0 0->1 1->2 2->3 3->4 4->0 =5=
-
- vmrghb v4, v0, v1
- vmrglb v5, v0, v1
- vmrghb v6, v2, v3
- vmrglb v7, v2, v3
-
- ;# d=0 s=1 =0= 1->2 2->3 3->4 4->5 5->1
-
- vmrghh v0, v4, v6
- vmrglh v1, v4, v6
- vmrghh v2, v5, v7
- vmrglh v3, v5, v7
-
- ;# d=s=0 =0= =1= 2->3 3->4 4->2 =5=
-
- vmrghw v4, v0, v1
- vmrglw v5, v0, v1
- vmrghw v6, v2, v3
- vmrglw v7, v2, v3
-
- ;# d=0 s=1 =0= =1= =2= 3->4 4->5 5->3
-
- vperm v0, v4, v6, \Vlo
- vperm v1, v4, v6, \Vhi
- vperm v2, v5, v7, \Vlo
- vperm v3, v5, v7, \Vhi
-.endm
-;# end Transpose4times4x4
-
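-;# Scalar model of this permutation (illustrative, not from the
-;# original source): over the 64-byte block, bits 0,1 and 4,5 of the
-;# address trade places while bits 2,3 stay put:
-;#
-;#   for (int a = 0; a < 64; ++a) {
-;#       int b = (a & 0x0c)             /* bits 2,3 unchanged */
-;#             | ((a & 0x03) << 4)      /* bits 0,1 -> 4,5    */
-;#             | ((a >> 4) & 0x03);     /* bits 4,5 -> 0,1    */
-;#       dst[b] = src[a];
-;#   }
-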
-
-;# Normal mb vertical edge filter transpose.
-;#
-;# We read 8 columns of data, initially in the following pattern:
-;#
-;# (0,0) (1,0) ... (7,0) (0,1) (1,1) ... (7,1)
-;# (0,2) (1,2) ... (7,2) (0,3) (1,3) ... (7,3)
-;# ...
-;# (0,14) (1,14) .. (7,14) (0,15) (1,15) .. (7,15)
-;#
-;# and wish to convert to:
-;#
-;# (0,0) ... (0,15)
-;# (1,0) ... (1,15)
-;# ...
-;# (7,0) ... (7,15).
-;#
-;# In "address bit" language, we wish to map
-;#
-;# 0->4 1->5 2->6 3->0 4->1 5->2 6->3, i.e., I -> (I+4) mod 7.
-;#
-;# This can be accomplished by 4 iterations of the cyclic transform
-;#
-;# I -> (I+1) mod 7;
-;#
-;# each iteration can be realized by (d=0, s=2):
-;#
-;# x = 0; do Tpair( V(2x),V(2x+1), V(x),V(x+4)) while( ++x < 4);
-;#
-;# The input/output is in registers v0...v7. We use v10...v17 as mirrors;
-;# preserving v8 = sign converter.
-;#
-;# Inverse transpose is similar, except here I -> (I+3) mod 7 and the
-;# result lands in the "mirror" registers v10...v17
-;#
-.macro t8x16_odd
- Tpair v10, v11, v0, v4
- Tpair v12, v13, v1, v5
- Tpair v14, v15, v2, v6
- Tpair v16, v17, v3, v7
-.endm
-
-.macro t8x16_even
- Tpair v0, v1, v10, v14
- Tpair v2, v3, v11, v15
- Tpair v4, v5, v12, v16
- Tpair v6, v7, v13, v17
-.endm
-
-.macro transpose8x16_fwd
- t8x16_odd
- t8x16_even
- t8x16_odd
- t8x16_even
-.endm
-
-.macro transpose8x16_inv
- t8x16_odd
- t8x16_even
- t8x16_odd
-.endm
-
-.macro Transpose16x16
- vmrghb v0, v16, v24
- vmrglb v1, v16, v24
- vmrghb v2, v17, v25
- vmrglb v3, v17, v25
- vmrghb v4, v18, v26
- vmrglb v5, v18, v26
- vmrghb v6, v19, v27
- vmrglb v7, v19, v27
- vmrghb v8, v20, v28
- vmrglb v9, v20, v28
- vmrghb v10, v21, v29
- vmrglb v11, v21, v29
- vmrghb v12, v22, v30
- vmrglb v13, v22, v30
- vmrghb v14, v23, v31
- vmrglb v15, v23, v31
- vmrghb v16, v0, v8
- vmrglb v17, v0, v8
- vmrghb v18, v1, v9
- vmrglb v19, v1, v9
- vmrghb v20, v2, v10
- vmrglb v21, v2, v10
- vmrghb v22, v3, v11
- vmrglb v23, v3, v11
- vmrghb v24, v4, v12
- vmrglb v25, v4, v12
- vmrghb v26, v5, v13
- vmrglb v27, v5, v13
- vmrghb v28, v6, v14
- vmrglb v29, v6, v14
- vmrghb v30, v7, v15
- vmrglb v31, v7, v15
- vmrghb v0, v16, v24
- vmrglb v1, v16, v24
- vmrghb v2, v17, v25
- vmrglb v3, v17, v25
- vmrghb v4, v18, v26
- vmrglb v5, v18, v26
- vmrghb v6, v19, v27
- vmrglb v7, v19, v27
- vmrghb v8, v20, v28
- vmrglb v9, v20, v28
- vmrghb v10, v21, v29
- vmrglb v11, v21, v29
- vmrghb v12, v22, v30
- vmrglb v13, v22, v30
- vmrghb v14, v23, v31
- vmrglb v15, v23, v31
- vmrghb v16, v0, v8
- vmrglb v17, v0, v8
- vmrghb v18, v1, v9
- vmrglb v19, v1, v9
- vmrghb v20, v2, v10
- vmrglb v21, v2, v10
- vmrghb v22, v3, v11
- vmrglb v23, v3, v11
- vmrghb v24, v4, v12
- vmrglb v25, v4, v12
- vmrghb v26, v5, v13
- vmrglb v27, v5, v13
- vmrghb v28, v6, v14
- vmrglb v29, v6, v14
- vmrghb v30, v7, v15
- vmrglb v31, v7, v15
-.endm
-
-;# load_g loads a global vector (whose address is in the local variable Gptr)
-;# into vector register Vreg. Trashes r0
-.macro load_g Vreg, Gptr
- lwz r0, \Gptr
- lvx \Vreg, 0, r0
-.endm
-
-;# exploit the saturation here. if the answer is negative
-;# it will be clamped to 0. orring 0 with a positive
-;# number will be the positive number (abs)
-;# RES = abs( A-B), trashes TMP
-.macro Abs RES, TMP, A, B
- vsububs \RES, \A, \B
- vsububs \TMP, \B, \A
- vor \RES, \RES, \TMP
-.endm
-
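-;# Scalar model of the Abs trick (illustrative, not from the original
-;# source): saturating unsigned subtraction clamps a negative result
-;# to zero, so OR-ing both orders gives |a - b| with no branch:
-;#
-;#   unsigned char sat_sub(unsigned char a, unsigned char b) {
-;#       return a > b ? (unsigned char)(a - b) : 0;
-;#   }
-;#   unsigned char abs_diff(unsigned char a, unsigned char b) {
-;#       return sat_sub(a, b) | sat_sub(b, a);
-;#   }
-;#
-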
-;# RES = Max( RES, abs( A-B)), trashes TMP
-.macro max_abs RES, TMP, A, B
- vsububs \TMP, \A, \B
- vmaxub \RES, \RES, \TMP
- vsububs \TMP, \B, \A
- vmaxub \RES, \RES, \TMP
-.endm
-
-.macro Masks
- ;# build masks
-    ;# input is all 8-bit unsigned (0-255). need to
-    ;# do abs(val_a - val_b) > limit, but there is no need to compare
-    ;# each value to the limit: find the max of the absolute
-    ;# differences and compare that to the limit once.
- ;# First hev
- Abs v14, v13, v2, v3 ;# |P1 - P0|
- max_abs v14, v13, v5, v4 ;# |Q1 - Q0|
-
- vcmpgtub v10, v14, v10 ;# HEV = true if thresh exceeded
-
- ;# Next limit
- max_abs v14, v13, v0, v1 ;# |P3 - P2|
- max_abs v14, v13, v1, v2 ;# |P2 - P1|
- max_abs v14, v13, v6, v5 ;# |Q2 - Q1|
- max_abs v14, v13, v7, v6 ;# |Q3 - Q2|
-
- vcmpgtub v9, v14, v9 ;# R = true if limit exceeded
-
- ;# flimit
- Abs v14, v13, v3, v4 ;# |P0 - Q0|
-
- vcmpgtub v8, v14, v8 ;# X = true if flimit exceeded
-
- vor v8, v8, v9 ;# R = true if flimit or limit exceeded
- ;# done building masks
-.endm
-
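-;# One-lane scalar model of Masks (an illustrative sketch, not from
-;# the original source); the vector code evaluates this for all 16
-;# lanes at once:
-;#
-;#   int a    = max(abs(p1 - p0), abs(q1 - q0));
-;#   int hev  = a > thresh;                 /* high edge variance */
-;#   a = max(a, abs(p3 - p2));  a = max(a, abs(p2 - p1));
-;#   a = max(a, abs(q2 - q1));  a = max(a, abs(q3 - q2));
-;#   int skip = (a > limit) || (abs(p0 - q0) > flimit);
-;#   /* the filter value is later ANDed with ~skip, so those lanes
-;#      are left untouched */
-;#
-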
-.macro build_constants RFL, RLI, RTH, FL, LI, TH
- ;# build constants
- lvx \FL, 0, \RFL ;# flimit
- lvx \LI, 0, \RLI ;# limit
- lvx \TH, 0, \RTH ;# thresh
-
- vspltisb v11, 8
- vspltisb v12, 4
- vslb v11, v11, v12 ;# 0x80808080808080808080808080808080
-.endm
-
-.macro load_data_y
- ;# setup strides/pointers to be able to access
- ;# all of the data
- add r5, r4, r4 ;# r5 = 2 * stride
- sub r6, r3, r5 ;# r6 -> 2 rows back
- neg r7, r4 ;# r7 = -stride
-
- ;# load 16 pixels worth of data to work on
- sub r0, r6, r5 ;# r0 -> 4 rows back (temp)
- lvx v0, 0, r0 ;# P3 (read only)
- lvx v1, r7, r6 ;# P2
- lvx v2, 0, r6 ;# P1
- lvx v3, r7, r3 ;# P0
- lvx v4, 0, r3 ;# Q0
- lvx v5, r4, r3 ;# Q1
- lvx v6, r5, r3 ;# Q2
- add r0, r3, r5 ;# r0 -> 2 rows fwd (temp)
- lvx v7, r4, r0 ;# Q3 (read only)
-.endm
-
-;# Expects
-;# v10 == HEV
-;# v13 == tmp
-;# v14 == tmp
-.macro common_adjust P0, Q0, P1, Q1, HEV_PRESENT
- vxor \P1, \P1, v11 ;# SP1
- vxor \P0, \P0, v11 ;# SP0
- vxor \Q0, \Q0, v11 ;# SQ0
- vxor \Q1, \Q1, v11 ;# SQ1
-
- vsubsbs v13, \P1, \Q1 ;# f = c (P1 - Q1)
-.if \HEV_PRESENT
- vand v13, v13, v10 ;# f &= hev
-.endif
- vsubsbs v14, \Q0, \P0 ;# -126 <= X = Q0-P0 <= +126
- vaddsbs v13, v13, v14
- vaddsbs v13, v13, v14
- vaddsbs v13, v13, v14 ;# A = c( c(P1-Q1) + 3*(Q0-P0))
-
- vandc v13, v13, v8 ;# f &= mask
-
- vspltisb v8, 3
- vspltisb v9, 4
-
- vaddsbs v14, v13, v9 ;# f1 = c (f+4)
- vaddsbs v15, v13, v8 ;# f2 = c (f+3)
-
- vsrab v13, v14, v8 ;# f1 >>= 3
- vsrab v15, v15, v8 ;# f2 >>= 3
-
- vsubsbs \Q0, \Q0, v13 ;# u1 = c (SQ0 - f1)
- vaddsbs \P0, \P0, v15 ;# u2 = c (SP0 + f2)
-.endm
-
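-;# Per-lane scalar model of common_adjust (an illustrative sketch,
-;# not from the original source); c() is saturation to a signed
-;# byte, and pels are already XORed with 0x80:
-;#
-;#   int f  = c(c(p1 - q1) + 3 * (q0 - p0)); /* p1-q1 term is     */
-;#   f &= mask;                              /* hev-gated if used */
-;#   int f1 = c(f + 4) >> 3;                 /* arithmetic shift  */
-;#   int f2 = c(f + 3) >> 3;
-;#   q0 = c(q0 - f1);
-;#   p0 = c(p0 + f2);
-;#
-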
-.macro vp8_mbfilter
- Masks
-
-    ;# start the filtering here
- vxor v1, v1, v11 ;# SP2
- vxor v2, v2, v11 ;# SP1
- vxor v3, v3, v11 ;# SP0
- vxor v4, v4, v11 ;# SQ0
- vxor v5, v5, v11 ;# SQ1
- vxor v6, v6, v11 ;# SQ2
-
- ;# add outer taps if we have high edge variance
- vsubsbs v13, v2, v5 ;# f = c (SP1-SQ1)
-
- vsubsbs v14, v4, v3 ;# SQ0-SP0
- vaddsbs v13, v13, v14
- vaddsbs v13, v13, v14
- vaddsbs v13, v13, v14 ;# f = c( c(SP1-SQ1) + 3*(SQ0-SP0))
-
- vandc v13, v13, v8 ;# f &= mask
- vand v15, v13, v10 ;# f2 = f & hev
-
- ;# save bottom 3 bits so that we round one side +4 and the other +3
- vspltisb v8, 3
- vspltisb v9, 4
-
- vaddsbs v14, v15, v9 ;# f1 = c (f+4)
- vaddsbs v15, v15, v8 ;# f2 = c (f+3)
-
- vsrab v14, v14, v8 ;# f1 >>= 3
- vsrab v15, v15, v8 ;# f2 >>= 3
-
- vsubsbs v4, v4, v14 ;# u1 = c (SQ0 - f1)
- vaddsbs v3, v3, v15 ;# u2 = c (SP0 + f2)
-
- ;# only apply wider filter if not high edge variance
- vandc v13, v13, v10 ;# f &= ~hev
-
- vspltisb v9, 2
- vnor v8, v8, v8
- vsrb v9, v8, v9 ;# 0x3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f
- vupkhsb v9, v9 ;# 0x003f003f003f003f003f003f003f003f
- vspltisb v8, 9
-
- ;# roughly 1/7th difference across boundary
- vspltish v10, 7
-    vmulosb v14, v8, v13        ;# 9 * f, odd bytes
-    vmulesb v15, v8, v13        ;# 9 * f, even bytes
- vaddshs v14, v14, v9 ;# += 63
- vaddshs v15, v15, v9
- vsrah v14, v14, v10 ;# >>= 7
- vsrah v15, v15, v10
- vmrglh v10, v15, v14
- vmrghh v15, v15, v14
-
- vpkshss v10, v15, v10 ;# X = saturated down to bytes
-
- vsubsbs v6, v6, v10 ;# subtract from Q and add to P
- vaddsbs v1, v1, v10
-
- vxor v6, v6, v11
- vxor v1, v1, v11
-
- ;# roughly 2/7th difference across boundary
- vspltish v10, 7
- vaddubm v12, v8, v8
-    vmulosb v14, v12, v13       ;# 18 * f, odd bytes
-    vmulesb v15, v12, v13       ;# 18 * f, even bytes
- vaddshs v14, v14, v9
- vaddshs v15, v15, v9
- vsrah v14, v14, v10 ;# >>= 7
- vsrah v15, v15, v10
- vmrglh v10, v15, v14
- vmrghh v15, v15, v14
-
- vpkshss v10, v15, v10 ;# X = saturated down to bytes
-
- vsubsbs v5, v5, v10 ;# subtract from Q and add to P
- vaddsbs v2, v2, v10
-
- vxor v5, v5, v11
- vxor v2, v2, v11
-
- ;# roughly 3/7th difference across boundary
- vspltish v10, 7
- vaddubm v12, v12, v8
-    vmulosb v14, v12, v13       ;# 27 * f, odd bytes
-    vmulesb v15, v12, v13       ;# 27 * f, even bytes
- vaddshs v14, v14, v9
- vaddshs v15, v15, v9
- vsrah v14, v14, v10 ;# >>= 7
- vsrah v15, v15, v10
- vmrglh v10, v15, v14
- vmrghh v15, v15, v14
-
- vpkshss v10, v15, v10 ;# X = saturated down to bytes
-
- vsubsbs v4, v4, v10 ;# subtract from Q and add to P
- vaddsbs v3, v3, v10
-
- vxor v4, v4, v11
- vxor v3, v3, v11
-.endm
-
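-;# The three "1/7th, 2/7th, 3/7th" passes above amount to, per lane
-;# (an illustrative scalar sketch, not from the original source;
-;# c() saturates to a signed byte, w is the non-hev filter value):
-;#
-;#   int a9  = (9  * w + 63) >> 7;  q2 = c(q2 - a9);  p2 = c(p2 + a9);
-;#   int a18 = (18 * w + 63) >> 7;  q1 = c(q1 - a18); p1 = c(p1 + a18);
-;#   int a27 = (27 * w + 63) >> 7;  q0 = c(q0 - a27); p0 = c(p0 + a27);
-;#
-;# the vmulosb/vmulesb pairs perform the 9/18/27 multiplies on the
-;# odd and even bytes in parallel.
-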
-.macro SBFilter
- Masks
-
- common_adjust v3, v4, v2, v5, 1
-
- ;# outer tap adjustments
- vspltisb v8, 1
-
- vaddubm v13, v13, v8 ;# f += 1
- vsrab v13, v13, v8 ;# f >>= 1
-
- vandc v13, v13, v10 ;# f &= ~hev
-
- vsubsbs v5, v5, v13 ;# u1 = c (SQ1 - f)
- vaddsbs v2, v2, v13 ;# u2 = c (SP1 + f)
-
- vxor v2, v2, v11
- vxor v3, v3, v11
- vxor v4, v4, v11
- vxor v5, v5, v11
-.endm
-
- .align 2
-mbloop_filter_horizontal_edge_y_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xffff
- mtspr 256, r12 ;# set VRSAVE
-
- build_constants r5, r6, r7, v8, v9, v10
-
- load_data_y
-
- vp8_mbfilter
-
- stvx v1, r7, r6 ;# P2
- stvx v2, 0, r6 ;# P1
- stvx v3, r7, r3 ;# P0
- stvx v4, 0, r3 ;# Q0
- stvx v5, r4, r3 ;# Q1
- stvx v6, r5, r3 ;# Q2
-
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
-
- .align 2
-;# r3 unsigned char *s
-;# r4 int p
-;# r5 const signed char *flimit
-;# r6 const signed char *limit
-;# r7 const signed char *thresh
-loop_filter_horizontal_edge_y_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xffff
- mtspr 256, r12 ;# set VRSAVE
-
- build_constants r5, r6, r7, v8, v9, v10
-
- load_data_y
-
- SBFilter
-
- stvx v2, 0, r6 ;# P1
- stvx v3, r7, r3 ;# P0
- stvx v4, 0, r3 ;# Q0
- stvx v5, r4, r3 ;# Q1
-
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
-
-;# Filtering a vertical mb. Each mb is aligned on a 16 byte boundary.
-;# So we can read in an entire mb aligned. However if we want to filter the mb
-;# edge we run into problems. For the loopfilter we require 4 bytes before the mb
-;# and 4 after, for a total of 8 bytes. Reading 16 bytes in order to get 4 is a bit
-;# of a waste. So this is an even uglier way to get around that.
-;# Using the regular register file words are read in and then saved back out to
-;# memory to align and order them up. Then they are read in using the
-;# vector register file.
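-;#
-;# Scalar sketch of RLVmb (illustrative, not from the original
-;# source): two rows of the 8 pels straddling the edge are packed
-;# into one aligned 16-byte temp so a single lvx fetches both rows:
-;#
-;#   for (int r = 0; r < 2; ++r) {
-;#       s += stride;
-;#       memcpy(tmp + 8 * r,     s - 4, 4);  /* 4 pels before edge */
-;#       memcpy(tmp + 8 * r + 4, s,     4);  /* 4 pels after edge  */
-;#   }
-;#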
-.macro RLVmb V, R
- lwzux r0, r3, r4
- stw r0, 4(\R)
- lwz r0,-4(r3)
- stw r0, 0(\R)
- lwzux r0, r3, r4
- stw r0,12(\R)
- lwz r0,-4(r3)
- stw r0, 8(\R)
- lvx \V, 0, \R
-.endm
-
-.macro WLVmb V, R
- stvx \V, 0, \R
- lwz r0,12(\R)
- stwux r0, r3, r4
- lwz r0, 8(\R)
- stw r0,-4(r3)
- lwz r0, 4(\R)
- stwux r0, r3, r4
- lwz r0, 0(\R)
- stw r0,-4(r3)
-.endm
-
- .align 2
-;# r3 unsigned char *s
-;# r4 int p
-;# r5 const signed char *flimit
-;# r6 const signed char *limit
-;# r7 const signed char *thresh
-mbloop_filter_vertical_edge_y_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xffff
- ori r12, r12, 0xc000
- mtspr 256, r12 ;# set VRSAVE
-
- la r9, -48(r1) ;# temporary space for reading in vectors
- sub r3, r3, r4
-
- RLVmb v0, r9
- RLVmb v1, r9
- RLVmb v2, r9
- RLVmb v3, r9
- RLVmb v4, r9
- RLVmb v5, r9
- RLVmb v6, r9
- RLVmb v7, r9
-
- transpose8x16_fwd
-
- build_constants r5, r6, r7, v8, v9, v10
-
- vp8_mbfilter
-
- transpose8x16_inv
-
- add r3, r3, r4
- neg r4, r4
-
- WLVmb v17, r9
- WLVmb v16, r9
- WLVmb v15, r9
- WLVmb v14, r9
- WLVmb v13, r9
- WLVmb v12, r9
- WLVmb v11, r9
- WLVmb v10, r9
-
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
-
-.macro RL V, R, P
- lvx \V, 0, \R
- add \R, \R, \P
-.endm
-
-.macro WL V, R, P
- stvx \V, 0, \R
- add \R, \R, \P
-.endm
-
-.macro Fil P3, P2, P1, P0, Q0, Q1, Q2, Q3
- ;# K = |P0-P1| already
- Abs v14, v13, \Q0, \Q1 ;# M = |Q0-Q1|
- vmaxub v14, v14, v4 ;# M = max( |P0-P1|, |Q0-Q1|)
- vcmpgtub v10, v14, v0
-
-    Abs v4, v5, \Q2, \Q3        ;# K = |Q2-Q3| = next |P0-P1|
-
- max_abs v14, v13, \Q1, \Q2 ;# M = max( M, |Q1-Q2|)
- max_abs v14, v13, \P1, \P2 ;# M = max( M, |P1-P2|)
- max_abs v14, v13, \P2, \P3 ;# M = max( M, |P2-P3|)
-
- vmaxub v14, v14, v4 ;# M = max interior abs diff
- vcmpgtub v9, v14, v2 ;# M = true if int_l exceeded
-
- Abs v14, v13, \P0, \Q0 ;# X = Abs( P0-Q0)
- vcmpgtub v8, v14, v3 ;# X = true if edge_l exceeded
- vor v8, v8, v9 ;# M = true if edge_l or int_l exceeded
-
- ;# replace P1,Q1 w/signed versions
- common_adjust \P0, \Q0, \P1, \Q1, 1
-
- vaddubm v13, v13, v1 ;# -16 <= M <= 15, saturation irrelevant
- vsrab v13, v13, v1
- vandc v13, v13, v10 ;# adjust P1,Q1 by (M+1)>>1 if ! hev
- vsubsbs \Q1, \Q1, v13
- vaddsbs \P1, \P1, v13
-
- vxor \P1, \P1, v11 ;# P1
- vxor \P0, \P0, v11 ;# P0
- vxor \Q0, \Q0, v11 ;# Q0
- vxor \Q1, \Q1, v11 ;# Q1
-.endm
-
-
- .align 2
-;# r3 unsigned char *s
-;# r4 int p
-;# r5 const signed char *flimit
-;# r6 const signed char *limit
-;# r7 const signed char *thresh
-loop_filter_vertical_edge_y_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xffff
- ori r12, r12, 0xffff
- mtspr 256, r12 ;# set VRSAVE
-
- addi r9, r3, 0
- RL v16, r9, r4
- RL v17, r9, r4
- RL v18, r9, r4
- RL v19, r9, r4
- RL v20, r9, r4
- RL v21, r9, r4
- RL v22, r9, r4
- RL v23, r9, r4
- RL v24, r9, r4
- RL v25, r9, r4
- RL v26, r9, r4
- RL v27, r9, r4
- RL v28, r9, r4
- RL v29, r9, r4
- RL v30, r9, r4
- lvx v31, 0, r9
-
- Transpose16x16
-
- vspltisb v1, 1
-
- build_constants r5, r6, r7, v3, v2, v0
-
-    Abs v4, v5, v19, v18        ;# K(v4) = first |P0-P1|
-
- Fil v16, v17, v18, v19, v20, v21, v22, v23
- Fil v20, v21, v22, v23, v24, v25, v26, v27
- Fil v24, v25, v26, v27, v28, v29, v30, v31
-
- Transpose16x16
-
- addi r9, r3, 0
- WL v16, r9, r4
- WL v17, r9, r4
- WL v18, r9, r4
- WL v19, r9, r4
- WL v20, r9, r4
- WL v21, r9, r4
- WL v22, r9, r4
- WL v23, r9, r4
- WL v24, r9, r4
- WL v25, r9, r4
- WL v26, r9, r4
- WL v27, r9, r4
- WL v28, r9, r4
- WL v29, r9, r4
- WL v30, r9, r4
- stvx v31, 0, r9
-
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
-
-;# -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- UV FILTERING -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
-.macro active_chroma_sel V
- andi. r7, r3, 8 ;# row origin modulo 16
- add r7, r7, r7 ;# selects selectors
- lis r12, _chromaSelectors@ha
- la r0, _chromaSelectors@l(r12)
- lwzux r0, r7, r0 ;# leave selector addr in r7
-
- lvx \V, 0, r0 ;# mask to concatenate active U,V pels
-.endm
-
-.macro hread_uv Dest, U, V, Offs, VMask
- lvx \U, \Offs, r3
- lvx \V, \Offs, r4
- vperm \Dest, \U, \V, \VMask ;# Dest = active part of U then V
-.endm
-
-.macro hwrite_uv New, U, V, Offs, Umask, Vmask
- vperm \U, \New, \U, \Umask ;# Combine new pels with siblings
- vperm \V, \New, \V, \Vmask
- stvx \U, \Offs, r3 ;# Write to frame buffer
- stvx \V, \Offs, r4
-.endm
-
-;# Process U,V in parallel.
-.macro load_chroma_h
- neg r9, r5 ;# r9 = -1 * stride
- add r8, r9, r9 ;# r8 = -2 * stride
- add r10, r5, r5 ;# r10 = 2 * stride
-
- active_chroma_sel v12
-
- ;# P3, Q3 are read-only; need not save addresses or sibling pels
- add r6, r8, r8 ;# r6 = -4 * stride
- hread_uv v0, v14, v15, r6, v12
- add r6, r10, r5 ;# r6 = 3 * stride
- hread_uv v7, v14, v15, r6, v12
-
- ;# Others are read/write; save addresses and sibling pels
-
- add r6, r8, r9 ;# r6 = -3 * stride
- hread_uv v1, v16, v17, r6, v12
- hread_uv v2, v18, v19, r8, v12
- hread_uv v3, v20, v21, r9, v12
- hread_uv v4, v22, v23, 0, v12
- hread_uv v5, v24, v25, r5, v12
- hread_uv v6, v26, v27, r10, v12
-.endm
-
-.macro uresult_sel V
- load_g \V, 4(r7)
-.endm
-
-.macro vresult_sel V
- load_g \V, 8(r7)
-.endm
-
-;# always write P1,P0,Q0,Q1
-.macro store_chroma_h
- uresult_sel v11
- vresult_sel v12
- hwrite_uv v2, v18, v19, r8, v11, v12
- hwrite_uv v3, v20, v21, r9, v11, v12
- hwrite_uv v4, v22, v23, 0, v11, v12
- hwrite_uv v5, v24, v25, r5, v11, v12
-.endm
-
- .align 2
-;# r3 unsigned char *u
-;# r4 unsigned char *v
-;# r5 int p
-;# r6 const signed char *flimit
-;# r7 const signed char *limit
-;# r8 const signed char *thresh
-mbloop_filter_horizontal_edge_uv_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xffff
- ori r12, r12, 0xffff
- mtspr 256, r12 ;# set VRSAVE
-
- build_constants r6, r7, r8, v8, v9, v10
-
- load_chroma_h
-
- vp8_mbfilter
-
- store_chroma_h
-
- hwrite_uv v1, v16, v17, r6, v11, v12 ;# v1 == P2
- hwrite_uv v6, v26, v27, r10, v11, v12 ;# v6 == Q2
-
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
-
- .align 2
-;# r3 unsigned char *u
-;# r4 unsigned char *v
-;# r5 int p
-;# r6 const signed char *flimit
-;# r7 const signed char *limit
-;# r8 const signed char *thresh
-loop_filter_horizontal_edge_uv_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xffff
- ori r12, r12, 0xffff
- mtspr 256, r12 ;# set VRSAVE
-
- build_constants r6, r7, r8, v8, v9, v10
-
- load_chroma_h
-
- SBFilter
-
- store_chroma_h
-
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
-
-.macro R V, R
- lwzux r0, r3, r5
- stw r0, 4(\R)
- lwz r0,-4(r3)
- stw r0, 0(\R)
- lwzux r0, r4, r5
- stw r0,12(\R)
- lwz r0,-4(r4)
- stw r0, 8(\R)
- lvx \V, 0, \R
-.endm
-
-
-.macro W V, R
- stvx \V, 0, \R
- lwz r0,12(\R)
- stwux r0, r4, r5
- lwz r0, 8(\R)
- stw r0,-4(r4)
- lwz r0, 4(\R)
- stwux r0, r3, r5
- lwz r0, 0(\R)
- stw r0,-4(r3)
-.endm
-
-.macro chroma_vread R
- sub r3, r3, r5 ;# back up one line for simplicity
- sub r4, r4, r5
-
- R v0, \R
- R v1, \R
- R v2, \R
- R v3, \R
- R v4, \R
- R v5, \R
- R v6, \R
- R v7, \R
-
- transpose8x16_fwd
-.endm
-
-.macro chroma_vwrite R
-
- transpose8x16_inv
-
- add r3, r3, r5
- add r4, r4, r5
- neg r5, r5 ;# Write rows back in reverse order
-
- W v17, \R
- W v16, \R
- W v15, \R
- W v14, \R
- W v13, \R
- W v12, \R
- W v11, \R
- W v10, \R
-.endm
-
- .align 2
-;# r3 unsigned char *u
-;# r4 unsigned char *v
-;# r5 int p
-;# r6 const signed char *flimit
-;# r7 const signed char *limit
-;# r8 const signed char *thresh
-mbloop_filter_vertical_edge_uv_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xffff
- ori r12, r12, 0xc000
- mtspr 256, r12 ;# set VRSAVE
-
- la r9, -48(r1) ;# temporary space for reading in vectors
-
- chroma_vread r9
-
- build_constants r6, r7, r8, v8, v9, v10
-
- vp8_mbfilter
-
- chroma_vwrite r9
-
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
-
- .align 2
-;# r3 unsigned char *u
-;# r4 unsigned char *v
-;# r5 int p
-;# r6 const signed char *flimit
-;# r7 const signed char *limit
-;# r8 const signed char *thresh
-loop_filter_vertical_edge_uv_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xffff
- ori r12, r12, 0xc000
- mtspr 256, r12 ;# set VRSAVE
-
- la r9, -48(r1) ;# temporary space for reading in vectors
-
- chroma_vread r9
-
- build_constants r6, r7, r8, v8, v9, v10
-
- SBFilter
-
- chroma_vwrite r9
-
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
-
-;# -=-=-=-=-=-=-=-=-=-=-=-=-=-= SIMPLE LOOP FILTER =-=-=-=-=-=-=-=-=-=-=-=-=-=-
-
-.macro vp8_simple_filter
- Abs v14, v13, v1, v2 ;# M = abs( P0 - Q0)
-    vcmpgtub v8, v14, v8        ;# v8 = true if _over_ limit
-
- ;# preserve unsigned v0 and v3
- common_adjust v1, v2, v0, v3, 0
-
- vxor v1, v1, v11
- vxor v2, v2, v11 ;# cvt Q0, P0 back to pels
-.endm
-
-.macro simple_vertical
- addi r8, 0, 16
- addi r7, r5, 32
-
- lvx v0, 0, r5
- lvx v1, r8, r5
- lvx v2, 0, r7
- lvx v3, r8, r7
-
- lis r12, _B_hihi@ha
- la r0, _B_hihi@l(r12)
- lvx v16, 0, r0
-
- lis r12, _B_lolo@ha
- la r0, _B_lolo@l(r12)
- lvx v17, 0, r0
-
- Transpose4times4x4 v16, v17
- vp8_simple_filter
-
- vxor v0, v0, v11
- vxor v3, v3, v11 ;# cvt Q0, P0 back to pels
-
- Transpose4times4x4 v16, v17
-
- stvx v0, 0, r5
- stvx v1, r8, r5
- stvx v2, 0, r7
- stvx v3, r8, r7
-.endm
-
- .align 2
-;# r3 unsigned char *s
-;# r4 int p
-;# r5 const signed char *flimit
-loop_filter_simple_horizontal_edge_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xffff
- mtspr 256, r12 ;# set VRSAVE
-
- ;# build constants
- lvx v8, 0, r5 ;# flimit
-
- vspltisb v11, 8
- vspltisb v12, 4
- vslb v11, v11, v12 ;# 0x80808080808080808080808080808080
-
- neg r5, r4 ;# r5 = -1 * stride
- add r6, r5, r5 ;# r6 = -2 * stride
-
- lvx v0, r6, r3 ;# v0 = P1 = 16 pels two rows above edge
- lvx v1, r5, r3 ;# v1 = P0 = 16 pels one row above edge
- lvx v2, 0, r3 ;# v2 = Q0 = 16 pels one row below edge
- lvx v3, r4, r3 ;# v3 = Q1 = 16 pels two rows below edge
-
- vp8_simple_filter
-
- stvx v1, r5, r3 ;# store P0
- stvx v2, 0, r3 ;# store Q0
-
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
-
-.macro RLV Offs
- stw r0, (\Offs*4)(r5)
- lwzux r0, r7, r4
-.endm
-
-.macro WLV Offs
- lwz r0, (\Offs*4)(r5)
- stwux r0, r7, r4
-.endm
-
- .align 2
-;# r3 unsigned char *s
-;# r4 int p
-;# r5 const signed char *flimit
-loop_filter_simple_vertical_edge_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xffff
- ori r12, r12, 0xc000
- mtspr 256, r12 ;# set VRSAVE
-
- ;# build constants
- lvx v8, 0, r5 ;# flimit
-
- vspltisb v11, 8
- vspltisb v12, 4
- vslb v11, v11, v12 ;# 0x80808080808080808080808080808080
-
- la r5, -96(r1) ;# temporary space for reading in vectors
-
- ;# Store 4 pels at word "Offs" in temp array, then advance r7
- ;# to next row and read another 4 pels from the frame buffer.
-
- subi r7, r3, 2 ;# r7 -> 2 pels before start
- lwzx r0, 0, r7 ;# read first 4 pels
-
- ;# 16 unaligned word accesses
- RLV 0
- RLV 4
- RLV 8
- RLV 12
- RLV 1
- RLV 5
- RLV 9
- RLV 13
- RLV 2
- RLV 6
- RLV 10
- RLV 14
- RLV 3
- RLV 7
- RLV 11
-
- stw r0, (15*4)(r5) ;# write last 4 pels
-
- simple_vertical
-
- ;# Read temp array, write frame buffer.
- subi r7, r3, 2 ;# r7 -> 2 pels before start
- lwzx r0, 0, r5 ;# read/write first 4 pels
- stwx r0, 0, r7
-
- WLV 4
- WLV 8
- WLV 12
- WLV 1
- WLV 5
- WLV 9
- WLV 13
- WLV 2
- WLV 6
- WLV 10
- WLV 14
- WLV 3
- WLV 7
- WLV 11
- WLV 15
-
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
-
- .data
-
-_chromaSelectors:
- .long _B_hihi
- .long _B_Ures0
- .long _B_Vres0
- .long 0
- .long _B_lolo
- .long _B_Ures8
- .long _B_Vres8
- .long 0
-
- .align 4
-_B_Vres8:
- .byte 16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15
-
- .align 4
-_B_Ures8:
- .byte 16, 17, 18, 19, 20, 21, 22, 23, 0, 1, 2, 3, 4, 5, 6, 7
-
- .align 4
-_B_lolo:
- .byte 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31
-
- .align 4
-_B_Vres0:
- .byte 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31
- .align 4
-_B_Ures0:
- .byte 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-
- .align 4
-_B_hihi:
- .byte 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/ppc/platform_altivec.asm b/chromium/third_party/libvpx/source/libvpx/vp8/common/ppc/platform_altivec.asm
deleted file mode 100644
index f81d86f740e..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/ppc/platform_altivec.asm
+++ /dev/null
@@ -1,59 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- .globl save_platform_context
- .globl restore_platform_context
-
-.macro W V P
- stvx \V, 0, \P
- addi \P, \P, 16
-.endm
-
-.macro R V P
- lvx \V, 0, \P
- addi \P, \P, 16
-.endm
-
-;# r3 context_ptr
- .align 2
-save_platform_context:
- W v20, r3
- W v21, r3
- W v22, r3
- W v23, r3
- W v24, r3
- W v25, r3
- W v26, r3
- W v27, r3
- W v28, r3
- W v29, r3
- W v30, r3
- W v31, r3
-
- blr
-
-;# r3 context_ptr
- .align 2
-restore_platform_context:
- R v20, r3
- R v21, r3
- R v22, r3
- R v23, r3
- R v24, r3
- R v25, r3
- R v26, r3
- R v27, r3
- R v28, r3
- R v29, r3
- R v30, r3
- R v31, r3
-
- blr
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/ppc/recon_altivec.asm b/chromium/third_party/libvpx/source/libvpx/vp8/common/ppc/recon_altivec.asm
deleted file mode 100644
index dd39e05a836..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/ppc/recon_altivec.asm
+++ /dev/null
@@ -1,175 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- .globl recon4b_ppc
- .globl recon2b_ppc
- .globl recon_b_ppc
-
-.macro row_of16 Diff Pred Dst Stride
- lvx v1, 0, \Pred ;# v1 = pred = p0..p15
- addi \Pred, \Pred, 16 ;# next pred
- vmrghb v2, v0, v1 ;# v2 = 16-bit p0..p7
- lvx v3, 0, \Diff ;# v3 = d0..d7
- vaddshs v2, v2, v3 ;# v2 = r0..r7
- vmrglb v1, v0, v1 ;# v1 = 16-bit p8..p15
- lvx v3, r8, \Diff ;# v3 = d8..d15
- addi \Diff, \Diff, 32 ;# next diff
- vaddshs v3, v3, v1 ;# v3 = r8..r15
- vpkshus v2, v2, v3 ;# v2 = 8-bit r0..r15
- stvx v2, 0, \Dst ;# to dst
- add \Dst, \Dst, \Stride ;# next dst
-.endm
-
- .text
- .align 2
-;# r3 = short *diff_ptr,
-;# r4 = unsigned char *pred_ptr,
-;# r5 = unsigned char *dst_ptr,
-;# r6 = int stride
-recon4b_ppc:
- mfspr r0, 256 ;# get old VRSAVE
- stw r0, -8(r1) ;# save old VRSAVE to stack
- oris r0, r0, 0xf000
- mtspr 256,r0 ;# set VRSAVE
-
- vxor v0, v0, v0
- li r8, 16
-
- row_of16 r3, r4, r5, r6
- row_of16 r3, r4, r5, r6
- row_of16 r3, r4, r5, r6
- row_of16 r3, r4, r5, r6
-
- lwz r12, -8(r1) ;# restore old VRSAVE from stack
- mtspr 256, r12 ;# reset old VRSAVE
-
- blr
-
-.macro two_rows_of8 Diff Pred Dst Stride write_first_four_pels
- lvx v1, 0, \Pred ;# v1 = pred = p0..p15
- vmrghb v2, v0, v1 ;# v2 = 16-bit p0..p7
- lvx v3, 0, \Diff ;# v3 = d0..d7
- vaddshs v2, v2, v3 ;# v2 = r0..r7
- vmrglb v1, v0, v1 ;# v1 = 16-bit p8..p15
-    lvx v3, r8, \Diff           ;# v3 = d8..d15
- vaddshs v3, v3, v1 ;# v3 = r8..r15
-    vpkshus v2, v2, v3          ;# v2 = 8-bit r0..r15
- stvx v2, 0, r10 ;# 2 rows to dst from buf
- lwz r0, 0(r10)
-.if \write_first_four_pels
- stw r0, 0(\Dst)
-.else
- stwux r0, \Dst, \Stride
-.endif
- lwz r0, 4(r10)
- stw r0, 4(\Dst)
- lwz r0, 8(r10)
- stwux r0, \Dst, \Stride ;# advance dst to next row
- lwz r0, 12(r10)
- stw r0, 4(\Dst)
-.endm
-
- .align 2
-;# r3 = short *diff_ptr,
-;# r4 = unsigned char *pred_ptr,
-;# r5 = unsigned char *dst_ptr,
-;# r6 = int stride
-
-recon2b_ppc:
- mfspr r0, 256 ;# get old VRSAVE
- stw r0, -8(r1) ;# save old VRSAVE to stack
- oris r0, r0, 0xf000
- mtspr 256,r0 ;# set VRSAVE
-
- vxor v0, v0, v0
- li r8, 16
-
- la r10, -48(r1) ;# buf
-
- two_rows_of8 r3, r4, r5, r6, 1
-
- addi r4, r4, 16; ;# next pred
- addi r3, r3, 32; ;# next diff
-
- two_rows_of8 r3, r4, r5, r6, 0
-
- lwz r12, -8(r1) ;# restore old VRSAVE from stack
- mtspr 256, r12 ;# reset old VRSAVE
-
- blr
-
-.macro get_two_diff_rows
- stw r0, 0(r10)
- lwz r0, 4(r3)
- stw r0, 4(r10)
- lwzu r0, 32(r3)
- stw r0, 8(r10)
- lwz r0, 4(r3)
- stw r0, 12(r10)
- lvx v3, 0, r10
-.endm
-
- .align 2
-;# r3 = short *diff_ptr,
-;# r4 = unsigned char *pred_ptr,
-;# r5 = unsigned char *dst_ptr,
-;# r6 = int stride
-recon_b_ppc:
- mfspr r0, 256 ;# get old VRSAVE
- stw r0, -8(r1) ;# save old VRSAVE to stack
- oris r0, r0, 0xf000
- mtspr 256,r0 ;# set VRSAVE
-
- vxor v0, v0, v0
-
- la r10, -48(r1) ;# buf
-
- lwz r0, 0(r4)
- stw r0, 0(r10)
- lwz r0, 16(r4)
- stw r0, 4(r10)
- lwz r0, 32(r4)
- stw r0, 8(r10)
- lwz r0, 48(r4)
- stw r0, 12(r10)
-
- lvx v1, 0, r10; ;# v1 = pred = p0..p15
-
- lwz r0, 0(r3) ;# v3 = d0..d7
-
- get_two_diff_rows
-
- vmrghb v2, v0, v1; ;# v2 = 16-bit p0..p7
- vaddshs v2, v2, v3; ;# v2 = r0..r7
-
- lwzu r0, 32(r3) ;# v3 = d8..d15
-
- get_two_diff_rows
-
- vmrglb v1, v0, v1; ;# v1 = 16-bit p8..p15
- vaddshs v3, v3, v1; ;# v3 = r8..r15
-
- vpkshus v2, v2, v3; ;# v2 = 8-bit r0..r15
- stvx v2, 0, r10; ;# 16 pels to dst from buf
-
- lwz r0, 0(r10)
- stw r0, 0(r5)
- lwz r0, 4(r10)
- stwux r0, r5, r6
- lwz r0, 8(r10)
- stwux r0, r5, r6
- lwz r0, 12(r10)
- stwx r0, r5, r6
-
- lwz r12, -8(r1) ;# restore old VRSAVE from stack
- mtspr 256, r12 ;# reset old VRSAVE
-
- blr
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/ppc/sad_altivec.asm b/chromium/third_party/libvpx/source/libvpx/vp8/common/ppc/sad_altivec.asm
deleted file mode 100644
index e5f26380f96..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/ppc/sad_altivec.asm
+++ /dev/null
@@ -1,277 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- .globl vp8_sad16x16_ppc
- .globl vp8_sad16x8_ppc
- .globl vp8_sad8x16_ppc
- .globl vp8_sad8x8_ppc
- .globl vp8_sad4x4_ppc
-
-.macro load_aligned_16 V R O
- lvsl v3, 0, \R ;# permutate value for alignment
-
- lvx v1, 0, \R
- lvx v2, \O, \R
-
- vperm \V, v1, v2, v3
-.endm
-
-.macro prologue
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xffc0
- mtspr 256, r12 ;# set VRSAVE
-
- stwu r1, -32(r1) ;# create space on the stack
-
- li r10, 16 ;# load offset and loop counter
-
- vspltisw v8, 0 ;# zero out total to start
-.endm
-
-.macro epilogue
- addi r1, r1, 32 ;# recover stack
-
- mtspr 256, r11 ;# reset old VRSAVE
-.endm
-
-.macro SAD_16
- ;# v6 = abs (v4 - v5)
- vsububs v6, v4, v5
- vsububs v7, v5, v4
- vor v6, v6, v7
-
- ;# v8 += abs (v4 - v5)
- vsum4ubs v8, v6, v8
-.endm
-
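-;# Scalar reference for what the vector loops accumulate (a sketch,
-;# not from the original source):
-;#
-;#   unsigned sad = 0;
-;#   for (int i = 0; i < h; ++i)
-;#       for (int j = 0; j < w; ++j)
-;#           sad += abs(src[i * sp + j] - ref[i * rp + j]);
-;#
-;# vsum4ubs folds four byte differences into each word of v8, and
-;# the final vsumsws reduces v8 to the single total returned in r3.
-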
-.macro sad_16_loop loop_label
- lvsl v3, 0, r5 ;# only needs to be done once per block
-
- ;# preload a line of data before getting into the loop
- lvx v4, 0, r3
- lvx v1, 0, r5
- lvx v2, r10, r5
-
- add r5, r5, r6
- add r3, r3, r4
-
- vperm v5, v1, v2, v3
-
- .align 4
-\loop_label:
- ;# compute difference on first row
- vsububs v6, v4, v5
- vsububs v7, v5, v4
-
- ;# load up next set of data
- lvx v9, 0, r3
- lvx v1, 0, r5
- lvx v2, r10, r5
-
- ;# perform abs() of difference
- vor v6, v6, v7
- add r3, r3, r4
-
- ;# add to the running tally
- vsum4ubs v8, v6, v8
-
- ;# now onto the next line
- vperm v5, v1, v2, v3
- add r5, r5, r6
- lvx v4, 0, r3
-
- ;# compute difference on second row
- vsububs v6, v9, v5
- lvx v1, 0, r5
- vsububs v7, v5, v9
- lvx v2, r10, r5
- vor v6, v6, v7
- add r3, r3, r4
- vsum4ubs v8, v6, v8
- vperm v5, v1, v2, v3
- add r5, r5, r6
-
- bdnz \loop_label
-
- vspltisw v7, 0
-
- vsumsws v8, v8, v7
-
- stvx v8, 0, r1
- lwz r3, 12(r1)
-.endm
-
-.macro sad_8_loop loop_label
- .align 4
-\loop_label:
- ;# only one of the inputs should need to be aligned.
- load_aligned_16 v4, r3, r10
- load_aligned_16 v5, r5, r10
-
- ;# move onto the next line
- add r3, r3, r4
- add r5, r5, r6
-
- ;# only one of the inputs should need to be aligned.
- load_aligned_16 v6, r3, r10
- load_aligned_16 v7, r5, r10
-
- ;# move onto the next line
- add r3, r3, r4
- add r5, r5, r6
-
- vmrghb v4, v4, v6
- vmrghb v5, v5, v7
-
- SAD_16
-
- bdnz \loop_label
-
- vspltisw v7, 0
-
- vsumsws v8, v8, v7
-
- stvx v8, 0, r1
- lwz r3, 12(r1)
-.endm
-
- .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int src_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int ref_stride
-;#
-;# r3 return value
-vp8_sad16x16_ppc:
-
- prologue
-
- li r9, 8
- mtctr r9
-
- sad_16_loop sad16x16_loop
-
- epilogue
-
- blr
-
- .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int src_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int ref_stride
-;#
-;# r3 return value
-vp8_sad16x8_ppc:
-
- prologue
-
- li r9, 4
- mtctr r9
-
- sad_16_loop sad16x8_loop
-
- epilogue
-
- blr
-
- .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int src_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int ref_stride
-;#
-;# r3 return value
-vp8_sad8x16_ppc:
-
- prologue
-
- li r9, 8
- mtctr r9
-
- sad_8_loop sad8x16_loop
-
- epilogue
-
- blr
-
- .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int src_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int ref_stride
-;#
-;# r3 return value
-vp8_sad8x8_ppc:
-
- prologue
-
- li r9, 4
- mtctr r9
-
- sad_8_loop sad8x8_loop
-
- epilogue
-
- blr
-
-.macro transfer_4x4 I P
- lwz r0, 0(\I)
- add \I, \I, \P
-
- lwz r7, 0(\I)
- add \I, \I, \P
-
- lwz r8, 0(\I)
- add \I, \I, \P
-
- lwz r9, 0(\I)
-
- stw r0, 0(r1)
- stw r7, 4(r1)
- stw r8, 8(r1)
- stw r9, 12(r1)
-.endm
-
- .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int src_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int ref_stride
-;#
-;# r3 return value
-vp8_sad4x4_ppc:
-
- prologue
-
- transfer_4x4 r3, r4
- lvx v4, 0, r1
-
- transfer_4x4 r5, r6
- lvx v5, 0, r1
-
- vspltisw v8, 0 ;# zero out total to start
-
- ;# v6 = abs (v4 - v5)
- vsububs v6, v4, v5
- vsububs v7, v5, v4
- vor v6, v6, v7
-
- ;# v8 += abs (v4 - v5)
- vsum4ubs v7, v6, v8
- vsumsws v7, v7, v8
-
- stvx v7, 0, r1
- lwz r3, 12(r1)
-
- epilogue
-
- blr
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/ppc/systemdependent.c b/chromium/third_party/libvpx/source/libvpx/vp8/common/ppc/systemdependent.c
deleted file mode 100644
index 6899c0e71cb..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/ppc/systemdependent.c
+++ /dev/null
@@ -1,165 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "subpixel.h"
-#include "loopfilter.h"
-#include "recon.h"
-#include "onyxc_int.h"
-
-extern void (*vp8_post_proc_down_and_across_mb_row)(
- unsigned char *src_ptr,
- unsigned char *dst_ptr,
- int src_pixels_per_line,
- int dst_pixels_per_line,
- int cols,
- unsigned char *f,
- int size
-);
-
-extern void (*vp8_mbpost_proc_down)(unsigned char *dst, int pitch, int rows, int cols, int flimit);
-extern void vp8_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols, int flimit);
-extern void (*vp8_mbpost_proc_across_ip)(unsigned char *src, int pitch, int rows, int cols, int flimit);
-extern void vp8_mbpost_proc_across_ip_c(unsigned char *src, int pitch, int rows, int cols, int flimit);
-
-extern void vp8_post_proc_down_and_across_mb_row_c
-(
- unsigned char *src_ptr,
- unsigned char *dst_ptr,
- int src_pixels_per_line,
- int dst_pixels_per_line,
- int cols,
- unsigned char *f,
- int size
-);
-void vp8_plane_add_noise_c(unsigned char *Start, unsigned int Width, unsigned int Height, int Pitch, int q, int a);
-
-extern copy_mem_block_function *vp8_copy_mem16x16;
-extern copy_mem_block_function *vp8_copy_mem8x8;
-extern copy_mem_block_function *vp8_copy_mem8x4;
-
-// PPC
-extern subpixel_predict_function sixtap_predict_ppc;
-extern subpixel_predict_function sixtap_predict8x4_ppc;
-extern subpixel_predict_function sixtap_predict8x8_ppc;
-extern subpixel_predict_function sixtap_predict16x16_ppc;
-extern subpixel_predict_function bilinear_predict4x4_ppc;
-extern subpixel_predict_function bilinear_predict8x4_ppc;
-extern subpixel_predict_function bilinear_predict8x8_ppc;
-extern subpixel_predict_function bilinear_predict16x16_ppc;
-
-extern copy_mem_block_function copy_mem16x16_ppc;
-
-void recon_b_ppc(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride);
-void recon2b_ppc(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride);
-void recon4b_ppc(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride);
-
-extern void short_idct4x4llm_ppc(short *input, short *output, int pitch);
-
-// Generic C
-extern subpixel_predict_function vp8_sixtap_predict_c;
-extern subpixel_predict_function vp8_sixtap_predict8x4_c;
-extern subpixel_predict_function vp8_sixtap_predict8x8_c;
-extern subpixel_predict_function vp8_sixtap_predict16x16_c;
-extern subpixel_predict_function vp8_bilinear_predict4x4_c;
-extern subpixel_predict_function vp8_bilinear_predict8x4_c;
-extern subpixel_predict_function vp8_bilinear_predict8x8_c;
-extern subpixel_predict_function vp8_bilinear_predict16x16_c;
-
-extern copy_mem_block_function vp8_copy_mem16x16_c;
-extern copy_mem_block_function vp8_copy_mem8x8_c;
-extern copy_mem_block_function vp8_copy_mem8x4_c;
-
-void vp8_recon_b_c(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride);
-void vp8_recon2b_c(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride);
-void vp8_recon4b_c(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride);
-
-extern void vp8_short_idct4x4llm_1_c(short *input, short *output, int pitch);
-extern void vp8_short_idct4x4llm_c(short *input, short *output, int pitch);
-extern void vp8_dc_only_idct_c(short input_dc, short *output, int pitch);
-
-// PPC
-extern loop_filter_block_function loop_filter_mbv_ppc;
-extern loop_filter_block_function loop_filter_bv_ppc;
-extern loop_filter_block_function loop_filter_mbh_ppc;
-extern loop_filter_block_function loop_filter_bh_ppc;
-
-extern loop_filter_block_function loop_filter_mbvs_ppc;
-extern loop_filter_block_function loop_filter_bvs_ppc;
-extern loop_filter_block_function loop_filter_mbhs_ppc;
-extern loop_filter_block_function loop_filter_bhs_ppc;
-
-// Generic C
-extern loop_filter_block_function vp8_loop_filter_mbv_c;
-extern loop_filter_block_function vp8_loop_filter_bv_c;
-extern loop_filter_block_function vp8_loop_filter_mbh_c;
-extern loop_filter_block_function vp8_loop_filter_bh_c;
-
-extern loop_filter_block_function vp8_loop_filter_mbvs_c;
-extern loop_filter_block_function vp8_loop_filter_bvs_c;
-extern loop_filter_block_function vp8_loop_filter_mbhs_c;
-extern loop_filter_block_function vp8_loop_filter_bhs_c;
-
-extern loop_filter_block_function *vp8_lf_mbvfull;
-extern loop_filter_block_function *vp8_lf_mbhfull;
-extern loop_filter_block_function *vp8_lf_bvfull;
-extern loop_filter_block_function *vp8_lf_bhfull;
-
-extern loop_filter_block_function *vp8_lf_mbvsimple;
-extern loop_filter_block_function *vp8_lf_mbhsimple;
-extern loop_filter_block_function *vp8_lf_bvsimple;
-extern loop_filter_block_function *vp8_lf_bhsimple;
-
-void vp8_clear_c(void)
-{
-}
-
-void vp8_machine_specific_config(void)
-{
- // Pure C:
- vp8_clear_system_state = vp8_clear_c;
- vp8_recon_b = vp8_recon_b_c;
- vp8_recon4b = vp8_recon4b_c;
- vp8_recon2b = vp8_recon2b_c;
-
- vp8_bilinear_predict16x16 = bilinear_predict16x16_ppc;
- vp8_bilinear_predict8x8 = bilinear_predict8x8_ppc;
- vp8_bilinear_predict8x4 = bilinear_predict8x4_ppc;
- vp8_bilinear_predict = bilinear_predict4x4_ppc;
-
- vp8_sixtap_predict16x16 = sixtap_predict16x16_ppc;
- vp8_sixtap_predict8x8 = sixtap_predict8x8_ppc;
- vp8_sixtap_predict8x4 = sixtap_predict8x4_ppc;
- vp8_sixtap_predict = sixtap_predict_ppc;
-
- vp8_short_idct4x4_1 = vp8_short_idct4x4llm_1_c;
- vp8_short_idct4x4 = short_idct4x4llm_ppc;
- vp8_dc_only_idct = vp8_dc_only_idct_c;
-
- vp8_lf_mbvfull = loop_filter_mbv_ppc;
- vp8_lf_bvfull = loop_filter_bv_ppc;
- vp8_lf_mbhfull = loop_filter_mbh_ppc;
- vp8_lf_bhfull = loop_filter_bh_ppc;
-
- vp8_lf_mbvsimple = loop_filter_mbvs_ppc;
- vp8_lf_bvsimple = loop_filter_bvs_ppc;
- vp8_lf_mbhsimple = loop_filter_mbhs_ppc;
- vp8_lf_bhsimple = loop_filter_bhs_ppc;
-
- vp8_post_proc_down_and_across_mb_row = vp8_post_proc_down_and_across_mb_row_c;
- vp8_mbpost_proc_down = vp8_mbpost_proc_down_c;
- vp8_mbpost_proc_across_ip = vp8_mbpost_proc_across_ip_c;
- vp8_plane_add_noise = vp8_plane_add_noise_c;
-
- vp8_copy_mem16x16 = copy_mem16x16_ppc;
- vp8_copy_mem8x8 = vp8_copy_mem8x8_c;
- vp8_copy_mem8x4 = vp8_copy_mem8x4_c;
-
-}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/ppc/variance_altivec.asm b/chromium/third_party/libvpx/source/libvpx/vp8/common/ppc/variance_altivec.asm
deleted file mode 100644
index fb8d5bb1d9c..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/ppc/variance_altivec.asm
+++ /dev/null
@@ -1,375 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- .globl vp8_get8x8var_ppc
- .globl vp8_get16x16var_ppc
- .globl vp8_mse16x16_ppc
- .globl vp8_variance16x16_ppc
- .globl vp8_variance16x8_ppc
- .globl vp8_variance8x16_ppc
- .globl vp8_variance8x8_ppc
- .globl vp8_variance4x4_ppc
-
-.macro load_aligned_16 V R O
- lvsl v3, 0, \R ;# permutate value for alignment
-
- lvx v1, 0, \R
- lvx v2, \O, \R
-
- vperm \V, v1, v2, v3
-.endm
-
-.macro prologue
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xffc0
- mtspr 256, r12 ;# set VRSAVE
-
- stwu r1, -32(r1) ;# create space on the stack
-
- li r10, 16 ;# load offset and loop counter
-
- vspltisw v7, 0 ;# zero for merging
- vspltisw v8, 0 ;# zero out total to start
- vspltisw v9, 0 ;# zero out total for dif^2
-.endm
-
-.macro epilogue
- addi r1, r1, 32 ;# recover stack
-
- mtspr 256, r11 ;# reset old VRSAVE
-.endm
-
-.macro compute_sum_sse
-    ;# Compute sum first. Unpack so that signed subtract
-    ;# can be used. Only a half-word signed subtract is
-    ;# available. Do high, then low.
- vmrghb v2, v7, v4
- vmrghb v3, v7, v5
- vsubshs v2, v2, v3
- vsum4shs v8, v2, v8
-
- vmrglb v2, v7, v4
- vmrglb v3, v7, v5
- vsubshs v2, v2, v3
- vsum4shs v8, v2, v8
-
- ;# Now compute sse.
- vsububs v2, v4, v5
- vsububs v3, v5, v4
- vor v2, v2, v3
-
- vmsumubm v9, v2, v2, v9
-.endm
-
-.macro variance_16 DS loop_label store_sum
-\loop_label:
- ;# only one of the inputs should need to be aligned.
- load_aligned_16 v4, r3, r10
- load_aligned_16 v5, r5, r10
-
- ;# move onto the next line
- add r3, r3, r4
- add r5, r5, r6
-
- compute_sum_sse
-
- bdnz \loop_label
-
- vsumsws v8, v8, v7
- vsumsws v9, v9, v7
-
- stvx v8, 0, r1
- lwz r3, 12(r1)
-
- stvx v9, 0, r1
- lwz r4, 12(r1)
-
-.if \store_sum
- stw r3, 0(r8) ;# sum
-.endif
- stw r4, 0(r7) ;# sse
-
- mullw r3, r3, r3 ;# sum*sum
- srlwi r3, r3, \DS ;# (sum*sum) >> DS
- subf r3, r3, r4 ;# sse - ((sum*sum) >> DS)
-.endm
-
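-;# Scalar reference for the variance macros (a sketch, not from the
-;# original source), with DS = log2(w * h):
-;#
-;#   int sum = 0;  unsigned sse = 0;
-;#   for (int i = 0; i < h; ++i)
-;#       for (int j = 0; j < w; ++j) {
-;#           int d = src[i * sp + j] - ref[i * rp + j];
-;#           sum += d;  sse += d * d;
-;#       }
-;#   return sse - ((unsigned)(sum * sum) >> DS);
-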
-.macro variance_8 DS loop_label store_sum
-\loop_label:
- ;# only one of the inputs should need to be aligned.
- load_aligned_16 v4, r3, r10
- load_aligned_16 v5, r5, r10
-
- ;# move onto the next line
- add r3, r3, r4
- add r5, r5, r6
-
- ;# only one of the inputs should need to be aligned.
- load_aligned_16 v6, r3, r10
- load_aligned_16 v0, r5, r10
-
- ;# move onto the next line
- add r3, r3, r4
- add r5, r5, r6
-
- vmrghb v4, v4, v6
- vmrghb v5, v5, v0
-
- compute_sum_sse
-
- bdnz \loop_label
-
- vsumsws v8, v8, v7
- vsumsws v9, v9, v7
-
- stvx v8, 0, r1
- lwz r3, 12(r1)
-
- stvx v9, 0, r1
- lwz r4, 12(r1)
-
-.if \store_sum
- stw r3, 0(r8) ;# sum
-.endif
- stw r4, 0(r7) ;# sse
-
- mullw r3, r3, r3 ;# sum*sum
-    srlwi r3, r3, \DS           ;# (sum*sum) >> DS
-    subf r3, r3, r4             ;# sse - ((sum*sum) >> DS)
-.endm
-
- .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int source_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int recon_stride
-;# r7 unsigned int *SSE
-;# r8 int *Sum
-;#
-;# r3 return value
-vp8_get8x8var_ppc:
-
- prologue
-
- li r9, 4
- mtctr r9
-
- variance_8 6, get8x8var_loop, 1
-
- epilogue
-
- blr
-
- .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int source_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int recon_stride
-;# r7 unsigned int *SSE
-;# r8 int *Sum
-;#
-;# r3 return value
-vp8_get16x16var_ppc:
-
- prologue
-
- mtctr r10
-
- variance_16 8, get16x16var_loop, 1
-
- epilogue
-
- blr
-
- .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int source_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int recon_stride
-;# r7 unsigned int *sse
-;#
-;# r3 return value
-vp8_mse16x16_ppc:
- prologue
-
- mtctr r10
-
-mse16x16_loop:
- ;# only one of the inputs should need to be aligned.
- load_aligned_16 v4, r3, r10
- load_aligned_16 v5, r5, r10
-
- ;# move onto the next line
- add r3, r3, r4
- add r5, r5, r6
-
- ;# Now compute sse.
- vsububs v2, v4, v5
- vsububs v3, v5, v4
- vor v2, v2, v3
-
- vmsumubm v9, v2, v2, v9
-
- bdnz mse16x16_loop
-
- vsumsws v9, v9, v7
-
- stvx v9, 0, r1
- lwz r3, 12(r1)
-
- stw r3, 0(r7) ;# sse
-
- epilogue
-
- blr
-
- .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int source_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int recon_stride
-;# r7 unsigned int *sse
-;#
-;# r3 return value
-vp8_variance16x16_ppc:
-
- prologue
-
- mtctr r10
-
- variance_16 8, variance16x16_loop, 0
-
- epilogue
-
- blr
-
- .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int source_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int recon_stride
-;# r7 unsigned int *sse
-;#
-;# r3 return value
-vp8_variance16x8_ppc:
-
- prologue
-
- li r9, 8
- mtctr r9
-
- variance_16 7, variance16x8_loop, 0
-
- epilogue
-
- blr
-
- .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int source_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int recon_stride
-;# r7 unsigned int *sse
-;#
-;# r3 return value
-vp8_variance8x16_ppc:
-
- prologue
-
- li r9, 8
- mtctr r9
-
- variance_8 7, variance8x16_loop, 0
-
- epilogue
-
- blr
-
- .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int source_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int recon_stride
-;# r7 unsigned int *sse
-;#
-;# r3 return value
-vp8_variance8x8_ppc:
-
- prologue
-
- li r9, 4
- mtctr r9
-
- variance_8 6, variance8x8_loop, 0
-
- epilogue
-
- blr
-
-.macro transfer_4x4 I P
- lwz r0, 0(\I)
- add \I, \I, \P
-
- lwz r10,0(\I)
- add \I, \I, \P
-
- lwz r8, 0(\I)
- add \I, \I, \P
-
- lwz r9, 0(\I)
-
- stw r0, 0(r1)
- stw r10, 4(r1)
- stw r8, 8(r1)
- stw r9, 12(r1)
-.endm
-
- .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int source_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int recon_stride
-;# r7 unsigned int *sse
-;#
-;# r3 return value
-vp8_variance4x4_ppc:
-
- prologue
-
- transfer_4x4 r3, r4
- lvx v4, 0, r1
-
- transfer_4x4 r5, r6
- lvx v5, 0, r1
-
- compute_sum_sse
-
- vsumsws v8, v8, v7
- vsumsws v9, v9, v7
-
- stvx v8, 0, r1
- lwz r3, 12(r1)
-
- stvx v9, 0, r1
- lwz r4, 12(r1)
-
- stw r4, 0(r7) ;# sse
-
- mullw r3, r3, r3 ;# sum*sum
- srlwi r3, r3, 4 ;# (sum*sum) >> 4
- subf r3, r3, r4 ;# sse - ((sum*sum) >> 4)
-
- epilogue
-
- blr
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/ppc/variance_subpixel_altivec.asm b/chromium/third_party/libvpx/source/libvpx/vp8/common/ppc/variance_subpixel_altivec.asm
deleted file mode 100644
index 2308373a1d2..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/ppc/variance_subpixel_altivec.asm
+++ /dev/null
@@ -1,865 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- .globl vp8_sub_pixel_variance4x4_ppc
- .globl vp8_sub_pixel_variance8x8_ppc
- .globl vp8_sub_pixel_variance8x16_ppc
- .globl vp8_sub_pixel_variance16x8_ppc
- .globl vp8_sub_pixel_variance16x16_ppc
-
-.macro load_c V, LABEL, OFF, R0, R1
- lis \R0, \LABEL@ha
- la \R1, \LABEL@l(\R0)
- lvx \V, \OFF, \R1
-.endm
-
-.macro load_vfilter V0, V1
- load_c \V0, vfilter_b, r6, r12, r10
-
- addi r6, r6, 16
- lvx \V1, r6, r10
-.endm
-
-.macro HProlog jump_label
- ;# load up horizontal filter
- slwi. r5, r5, 4 ;# index into horizontal filter array
-
- ;# index to the next set of vectors in the row.
- li r10, 16
-
- ;# downshift by 7 ( divide by 128 ) at the end
- vspltish v19, 7
-
- ;# If there isn't any filtering to be done for the horizontal, then
- ;# just skip to the second pass.
- beq \jump_label
-
- load_c v20, hfilter_b, r5, r12, r0
-
- ;# setup constants
- ;# v14 permutation value for alignment
- load_c v28, b_hperm_b, 0, r12, r0
-
- ;# index to the next set of vectors in the row.
- li r12, 32
-
- ;# rounding added in on the multiply
- vspltisw v21, 8
- vspltisw v18, 3
- vslw v18, v21, v18 ;# 0x00000040000000400000004000000040
-
- slwi. r6, r6, 5 ;# index into vertical filter array
-.endm
-
-;# Filters a horizontal line
-;# expects:
-;# r3 src_ptr
-;# r4 pitch
-;# r10 16
-;# r12 32
-;# v17 perm input
-;# v18 rounding
-;# v19 shift
-;# v20 filter taps
-;# v21 tmp
-;# v22 tmp
-;# v23 tmp
-;# v24 tmp
-;# v25 tmp
-;# v26 tmp
-;# v27 tmp
-;# v28 perm output
-;#
-
-.macro hfilter_8 V, hp, lp, increment_counter
- lvsl v17, 0, r3 ;# permutate value for alignment
-
- ;# input to filter is 9 bytes wide, output is 8 bytes.
- lvx v21, 0, r3
- lvx v22, r10, r3
-
-.if \increment_counter
- add r3, r3, r4
-.endif
- vperm v21, v21, v22, v17
-
-    vperm v24, v21, v21, \hp    ;# v24 = 0123 1234 2345 3456
-    vperm v25, v21, v21, \lp    ;# v25 = 4567 5678 6789 789A
-
- vmsummbm v24, v20, v24, v18
- vmsummbm v25, v20, v25, v18
-
- vpkswus v24, v24, v25 ;# v24 = 0 4 8 C 1 5 9 D (16-bit)
-
- vsrh v24, v24, v19 ;# divide v0, v1 by 128
-
- vpkuhus \V, v24, v24 ;# \V = scrambled 8-bit result
-.endm
-
-.macro vfilter_16 P0 P1
- vmuleub v22, \P0, v20 ;# 64 + 4 positive taps
- vadduhm v22, v18, v22
- vmuloub v23, \P0, v20
- vadduhm v23, v18, v23
-
- vmuleub v24, \P1, v21
- vadduhm v22, v22, v24 ;# Re = evens, saturation unnecessary
- vmuloub v25, \P1, v21
- vadduhm v23, v23, v25 ;# Ro = odds
-
- vsrh v22, v22, v19 ;# divide by 128
-    vsrh v23, v23, v19          ;# v22 v23 = evens, odds
-    vmrghh \P0, v22, v23        ;# merge back to 16-bit results in order
- vmrglh v23, v22, v23
- vpkuhus \P0, \P0, v23 ;# P0 = 8-bit result
-.endm
-
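-;# Per-pel scalar model of a two-tap pass such as vfilter_16 (an
-;# illustrative sketch, not from the original source), assuming the
-;# usual bilinear taps f0 + f1 == 128 with the +64 rounding term:
-;#
-;#   unsigned char tap2(unsigned char a, unsigned char b,
-;#                      int f0, int f1) {
-;#       return (unsigned char)((a * f0 + b * f1 + 64) >> 7);
-;#   }
-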
-.macro compute_sum_sse src, ref, sum, sse, t1, t2, z0
-    ;# Compute sum first. Unpack so that signed subtract
-    ;# can be used. Only a half-word signed subtract is
-    ;# available. Do high, then low.
- vmrghb \t1, \z0, \src
- vmrghb \t2, \z0, \ref
- vsubshs \t1, \t1, \t2
- vsum4shs \sum, \t1, \sum
-
- vmrglb \t1, \z0, \src
- vmrglb \t2, \z0, \ref
- vsubshs \t1, \t1, \t2
- vsum4shs \sum, \t1, \sum
-
- ;# Now compute sse.
- vsububs \t1, \src, \ref
- vsububs \t2, \ref, \src
- vor \t1, \t1, \t2
-
- vmsumubm \sse, \t1, \t1, \sse
-.endm
-
-.macro variance_final sum, sse, z0, DS
- vsumsws \sum, \sum, \z0
- vsumsws \sse, \sse, \z0
-
- stvx \sum, 0, r1
- lwz r3, 12(r1)
-
- stvx \sse, 0, r1
- lwz r4, 12(r1)
-
- stw r4, 0(r9) ;# sse
-
- mullw r3, r3, r3 ;# sum*sum
-    srlwi r3, r3, \DS           ;# (sum*sum) >> DS
-    subf r3, r3, r4             ;# sse - ((sum*sum) >> DS)
-.endm
-
-.macro compute_sum_sse_16 V, increment_counter
- load_and_align_16 v16, r7, r8, \increment_counter
- compute_sum_sse \V, v16, v18, v19, v20, v21, v23
-.endm
-
-.macro load_and_align_16 V, R, P, increment_counter
- lvsl v17, 0, \R ;# permute value for alignment
-
- ;# output is 16 bytes; the unaligned input
- ;# can span two vectors.
- lvx v21, 0, \R
- lvx v22, r10, \R
-
-.if \increment_counter
- add \R, \R, \P
-.endif
-
- vperm \V, v21, v22, v17
-.endm
-
- .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int src_pixels_per_line
-;# r5 int xoffset
-;# r6 int yoffset
-;# r7 unsigned char *dst_ptr
-;# r8 int dst_pixels_per_line
-;# r9 unsigned int *sse
-;#
-;# r3 return value
-vp8_sub_pixel_variance4x4_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xf830
- ori r12, r12, 0xfff8
- mtspr 256, r12 ;# set VRSAVE
-
- stwu r1,-32(r1) ;# create space on the stack
-
- HProlog second_pass_4x4_pre_copy_b
-
- ;# Load up permutation constants
- load_c v10, b_0123_b, 0, r12, r0
- load_c v11, b_4567_b, 0, r12, r0
-
- hfilter_8 v0, v10, v11, 1
- hfilter_8 v1, v10, v11, 1
- hfilter_8 v2, v10, v11, 1
- hfilter_8 v3, v10, v11, 1
-
- ;# Finished filtering main horizontal block. If there is no
- ;# vertical filtering, jump straight to computing the sum and
- ;# sse. Otherwise load up and filter the additional line that
- ;# is needed for the vertical filter.
- beq compute_sum_sse_4x4_b
-
- hfilter_8 v4, v10, v11, 0
-
- b second_pass_4x4_b
-
-second_pass_4x4_pre_copy_b:
- slwi r6, r6, 5 ;# index into vertical filter array
-
- load_and_align_16 v0, r3, r4, 1
- load_and_align_16 v1, r3, r4, 1
- load_and_align_16 v2, r3, r4, 1
- load_and_align_16 v3, r3, r4, 1
- load_and_align_16 v4, r3, r4, 0
-
-second_pass_4x4_b:
- vspltish v20, 8
- vspltish v18, 3
- vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
- load_vfilter v20, v21
-
- vfilter_16 v0, v1
- vfilter_16 v1, v2
- vfilter_16 v2, v3
- vfilter_16 v3, v4
-
-compute_sum_sse_4x4_b:
- vspltish v18, 0 ;# sum
- vspltish v19, 0 ;# sse
- vspltish v23, 0 ;# unpack
- li r10, 16
-
- load_and_align_16 v4, r7, r8, 1
- load_and_align_16 v5, r7, r8, 1
- load_and_align_16 v6, r7, r8, 1
- load_and_align_16 v7, r7, r8, 1
-
- vmrghb v0, v0, v1
- vmrghb v1, v2, v3
-
- vmrghb v2, v4, v5
- vmrghb v3, v6, v7
-
- load_c v10, b_hilo_b, 0, r12, r0
-
- vperm v0, v0, v1, v10
- vperm v1, v2, v3, v10
-
- compute_sum_sse v0, v1, v18, v19, v20, v21, v23
-
- variance_final v18, v19, v23, 4
-
- addi r1, r1, 32 ;# recover stack
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
-
- .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int src_pixels_per_line
-;# r5 int xoffset
-;# r6 int yoffset
-;# r7 unsigned char *dst_ptr
-;# r8 int dst_pixels_per_line
-;# r9 unsigned int *sse
-;#
-;# r3 return value
-vp8_sub_pixel_variance8x8_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xfff0
- ori r12, r12, 0xffff
- mtspr 256, r12 ;# set VRSAVE
-
- stwu r1,-32(r1) ;# create space on the stack
-
- HProlog second_pass_8x8_pre_copy_b
-
- ;# Load up permutation constants
- load_c v10, b_0123_b, 0, r12, r0
- load_c v11, b_4567_b, 0, r12, r0
-
- hfilter_8 v0, v10, v11, 1
- hfilter_8 v1, v10, v11, 1
- hfilter_8 v2, v10, v11, 1
- hfilter_8 v3, v10, v11, 1
- hfilter_8 v4, v10, v11, 1
- hfilter_8 v5, v10, v11, 1
- hfilter_8 v6, v10, v11, 1
- hfilter_8 v7, v10, v11, 1
-
- ;# Finished filtering main horizontal block. If there is no
- ;# vertical filtering, jump straight to computing the sum and
- ;# sse. Otherwise load up and filter the additional line that
- ;# is needed for the vertical filter.
- beq compute_sum_sse_8x8_b
-
- hfilter_8 v8, v10, v11, 0
-
- b second_pass_8x8_b
-
-second_pass_8x8_pre_copy_b:
- slwi. r6, r6, 5 ;# index into vertical filter array
-
- load_and_align_16 v0, r3, r4, 1
- load_and_align_16 v1, r3, r4, 1
- load_and_align_16 v2, r3, r4, 1
- load_and_align_16 v3, r3, r4, 1
- load_and_align_16 v4, r3, r4, 1
- load_and_align_16 v5, r3, r4, 1
- load_and_align_16 v6, r3, r4, 1
- load_and_align_16 v7, r3, r4, 1
- load_and_align_16 v8, r3, r4, 0
-
- beq compute_sum_sse_8x8_b
-
-second_pass_8x8_b:
- vspltish v20, 8
- vspltish v18, 3
- vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
- load_vfilter v20, v21
-
- vfilter_16 v0, v1
- vfilter_16 v1, v2
- vfilter_16 v2, v3
- vfilter_16 v3, v4
- vfilter_16 v4, v5
- vfilter_16 v5, v6
- vfilter_16 v6, v7
- vfilter_16 v7, v8
-
-compute_sum_sse_8x8_b:
- vspltish v18, 0 ;# sum
- vspltish v19, 0 ;# sse
- vspltish v23, 0 ;# unpack
- li r10, 16
-
- vmrghb v0, v0, v1
- vmrghb v1, v2, v3
- vmrghb v2, v4, v5
- vmrghb v3, v6, v7
-
- load_and_align_16 v4, r7, r8, 1
- load_and_align_16 v5, r7, r8, 1
- load_and_align_16 v6, r7, r8, 1
- load_and_align_16 v7, r7, r8, 1
- load_and_align_16 v8, r7, r8, 1
- load_and_align_16 v9, r7, r8, 1
- load_and_align_16 v10, r7, r8, 1
- load_and_align_16 v11, r7, r8, 0
-
- vmrghb v4, v4, v5
- vmrghb v5, v6, v7
- vmrghb v6, v8, v9
- vmrghb v7, v10, v11
-
- compute_sum_sse v0, v4, v18, v19, v20, v21, v23
- compute_sum_sse v1, v5, v18, v19, v20, v21, v23
- compute_sum_sse v2, v6, v18, v19, v20, v21, v23
- compute_sum_sse v3, v7, v18, v19, v20, v21, v23
-
- variance_final v18, v19, v23, 6
-
- addi r1, r1, 32 ;# recover stack
- mtspr 256, r11 ;# reset old VRSAVE
- blr
-
- .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int src_pixels_per_line
-;# r5 int xoffset
-;# r6 int yoffset
-;# r7 unsigned char *dst_ptr
-;# r8 int dst_pixels_per_line
-;# r9 unsigned int *sse
-;#
-;# r3 return value
-vp8_sub_pixel_variance8x16_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xffff
- ori r12, r12, 0xfffc
- mtspr 256, r12 ;# set VRSAVE
-
- stwu r1,-32(r1) ;# create space on the stack
-
- HProlog second_pass_8x16_pre_copy_b
-
- ;# Load up permutation constants
- load_c v29, b_0123_b, 0, r12, r0
- load_c v30, b_4567_b, 0, r12, r0
-
- hfilter_8 v0, v29, v30, 1
- hfilter_8 v1, v29, v30, 1
- hfilter_8 v2, v29, v30, 1
- hfilter_8 v3, v29, v30, 1
- hfilter_8 v4, v29, v30, 1
- hfilter_8 v5, v29, v30, 1
- hfilter_8 v6, v29, v30, 1
- hfilter_8 v7, v29, v30, 1
- hfilter_8 v8, v29, v30, 1
- hfilter_8 v9, v29, v30, 1
- hfilter_8 v10, v29, v30, 1
- hfilter_8 v11, v29, v30, 1
- hfilter_8 v12, v29, v30, 1
- hfilter_8 v13, v29, v30, 1
- hfilter_8 v14, v29, v30, 1
- hfilter_8 v15, v29, v30, 1
-
- ;# Finished filtering main horizontal block. If there is no
- ;# vertical filtering, jump straight to computing the sum and
- ;# sse. Otherwise load up and filter the additional line that
- ;# is needed for the vertical filter.
- beq compute_sum_sse_8x16_b
-
- hfilter_8 v16, v29, v30, 0
-
- b second_pass_8x16_b
-
-second_pass_8x16_pre_copy_b:
- slwi. r6, r6, 5 ;# index into vertical filter array
-
- load_and_align_16 v0, r3, r4, 1
- load_and_align_16 v1, r3, r4, 1
- load_and_align_16 v2, r3, r4, 1
- load_and_align_16 v3, r3, r4, 1
- load_and_align_16 v4, r3, r4, 1
- load_and_align_16 v5, r3, r4, 1
- load_and_align_16 v6, r3, r4, 1
- load_and_align_16 v7, r3, r4, 1
- load_and_align_16 v8, r3, r4, 1
- load_and_align_16 v9, r3, r4, 1
- load_and_align_16 v10, r3, r4, 1
- load_and_align_16 v11, r3, r4, 1
- load_and_align_16 v12, r3, r4, 1
- load_and_align_16 v13, r3, r4, 1
- load_and_align_16 v14, r3, r4, 1
- load_and_align_16 v15, r3, r4, 1
- load_and_align_16 v16, r3, r4, 0
-
- beq compute_sum_sse_8x16_b
-
-second_pass_8x16_b:
- vspltish v20, 8
- vspltish v18, 3
- vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
- load_vfilter v20, v21
-
- vfilter_16 v0, v1
- vfilter_16 v1, v2
- vfilter_16 v2, v3
- vfilter_16 v3, v4
- vfilter_16 v4, v5
- vfilter_16 v5, v6
- vfilter_16 v6, v7
- vfilter_16 v7, v8
- vfilter_16 v8, v9
- vfilter_16 v9, v10
- vfilter_16 v10, v11
- vfilter_16 v11, v12
- vfilter_16 v12, v13
- vfilter_16 v13, v14
- vfilter_16 v14, v15
- vfilter_16 v15, v16
-
-compute_sum_sse_8x16_b:
- vspltish v18, 0 ;# sum
- vspltish v19, 0 ;# sse
- vspltish v23, 0 ;# unpack
- li r10, 16
-
- vmrghb v0, v0, v1
- vmrghb v1, v2, v3
- vmrghb v2, v4, v5
- vmrghb v3, v6, v7
- vmrghb v4, v8, v9
- vmrghb v5, v10, v11
- vmrghb v6, v12, v13
- vmrghb v7, v14, v15
-
- load_and_align_16 v8, r7, r8, 1
- load_and_align_16 v9, r7, r8, 1
- load_and_align_16 v10, r7, r8, 1
- load_and_align_16 v11, r7, r8, 1
- load_and_align_16 v12, r7, r8, 1
- load_and_align_16 v13, r7, r8, 1
- load_and_align_16 v14, r7, r8, 1
- load_and_align_16 v15, r7, r8, 1
-
- vmrghb v8, v8, v9
- vmrghb v9, v10, v11
- vmrghb v10, v12, v13
- vmrghb v11, v14, v15
-
- compute_sum_sse v0, v8, v18, v19, v20, v21, v23
- compute_sum_sse v1, v9, v18, v19, v20, v21, v23
- compute_sum_sse v2, v10, v18, v19, v20, v21, v23
- compute_sum_sse v3, v11, v18, v19, v20, v21, v23
-
- load_and_align_16 v8, r7, r8, 1
- load_and_align_16 v9, r7, r8, 1
- load_and_align_16 v10, r7, r8, 1
- load_and_align_16 v11, r7, r8, 1
- load_and_align_16 v12, r7, r8, 1
- load_and_align_16 v13, r7, r8, 1
- load_and_align_16 v14, r7, r8, 1
- load_and_align_16 v15, r7, r8, 0
-
- vmrghb v8, v8, v9
- vmrghb v9, v10, v11
- vmrghb v10, v12, v13
- vmrghb v11, v14, v15
-
- compute_sum_sse v4, v8, v18, v19, v20, v21, v23
- compute_sum_sse v5, v9, v18, v19, v20, v21, v23
- compute_sum_sse v6, v10, v18, v19, v20, v21, v23
- compute_sum_sse v7, v11, v18, v19, v20, v21, v23
-
- variance_final v18, v19, v23, 7
-
- addi r1, r1, 32 ;# recover stack
- mtspr 256, r11 ;# reset old VRSAVE
- blr
-
-;# Filters a horizontal line
-;# expects:
-;# r3 src_ptr
-;# r4 pitch
-;# r10 16
-;# r12 32
- ;# v17 perm input
-;# v18 rounding
-;# v19 shift
-;# v20 filter taps
-;# v21 tmp
-;# v22 tmp
-;# v23 tmp
-;# v24 tmp
-;# v25 tmp
-;# v26 tmp
-;# v27 tmp
-;# v28 perm output
-;#
-.macro hfilter_16 V, increment_counter
-
- lvsl v17, 0, r3 ;# permute value for alignment
-
- ;# input to filter is 21 bytes wide, output is 16 bytes.
- ;# input can span three vectors if not aligned correctly.
- lvx v21, 0, r3
- lvx v22, r10, r3
- lvx v23, r12, r3
-
-.if \increment_counter
- add r3, r3, r4
-.endif
- vperm v21, v21, v22, v17
- vperm v22, v22, v23, v17 ;# v21 v22 = 21 input pixels left-justified
-
- ;# set 0
- vmsummbm v24, v20, v21, v18 ;# taps times elements
-
- ;# set 1
- vsldoi v23, v21, v22, 1
- vmsummbm v25, v20, v23, v18
-
- ;# set 2
- vsldoi v23, v21, v22, 2
- vmsummbm v26, v20, v23, v18
-
- ;# set 3
- vsldoi v23, v21, v22, 3
- vmsummbm v27, v20, v23, v18
-
- vpkswus v24, v24, v25 ;# v24 = 0 4 8 C 1 5 9 D (16-bit)
- vpkswus v25, v26, v27 ;# v25 = 2 6 A E 3 7 B F
-
- vsrh v24, v24, v19 ;# divide v24, v25 by 128
- vsrh v25, v25, v19
-
- vpkuhus \V, v24, v25 ;# \V = scrambled 8-bit result
- vperm \V, \V, v0, v28 ;# \V = correctly-ordered result
-.endm
-
- .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int src_pixels_per_line
-;# r5 int xoffset
-;# r6 int yoffset
-;# r7 unsigned char *dst_ptr
-;# r8 int dst_pixels_per_line
-;# r9 unsigned int *sse
-;#
-;# r3 return value
-vp8_sub_pixel_variance16x8_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xffff
- ori r12, r12, 0xfff8
- mtspr 256, r12 ;# set VRSAVE
-
- stwu r1, -32(r1) ;# create space on the stack
-
- HProlog second_pass_16x8_pre_copy_b
-
- hfilter_16 v0, 1
- hfilter_16 v1, 1
- hfilter_16 v2, 1
- hfilter_16 v3, 1
- hfilter_16 v4, 1
- hfilter_16 v5, 1
- hfilter_16 v6, 1
- hfilter_16 v7, 1
-
- ;# Finished filtering main horizontal block. If there is no
- ;# vertical filtering, jump straight to computing the sum and
- ;# sse. Otherwise load up and filter the additional line that
- ;# is needed for the vertical filter.
- beq compute_sum_sse_16x8_b
-
- hfilter_16 v8, 0
-
- b second_pass_16x8_b
-
-second_pass_16x8_pre_copy_b:
- slwi. r6, r6, 5 ;# index into vertical filter array
-
- load_and_align_16 v0, r3, r4, 1
- load_and_align_16 v1, r3, r4, 1
- load_and_align_16 v2, r3, r4, 1
- load_and_align_16 v3, r3, r4, 1
- load_and_align_16 v4, r3, r4, 1
- load_and_align_16 v5, r3, r4, 1
- load_and_align_16 v6, r3, r4, 1
- load_and_align_16 v7, r3, r4, 1
- load_and_align_16 v8, r3, r4, 1
-
- beq compute_sum_sse_16x8_b
-
-second_pass_16x8_b:
- vspltish v20, 8
- vspltish v18, 3
- vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
- load_vfilter v20, v21
-
- vfilter_16 v0, v1
- vfilter_16 v1, v2
- vfilter_16 v2, v3
- vfilter_16 v3, v4
- vfilter_16 v4, v5
- vfilter_16 v5, v6
- vfilter_16 v6, v7
- vfilter_16 v7, v8
-
-compute_sum_sse_16x8_b:
- vspltish v18, 0 ;# sum
- vspltish v19, 0 ;# sse
- vspltish v23, 0 ;# unpack
- li r10, 16
-
- compute_sum_sse_16 v0, 1
- compute_sum_sse_16 v1, 1
- compute_sum_sse_16 v2, 1
- compute_sum_sse_16 v3, 1
- compute_sum_sse_16 v4, 1
- compute_sum_sse_16 v5, 1
- compute_sum_sse_16 v6, 1
- compute_sum_sse_16 v7, 0
-
- variance_final v18, v19, v23, 7
-
- addi r1, r1, 32 ;# recover stack
-
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
-
- .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int src_pixels_per_line
-;# r5 int xoffset
-;# r6 int yoffset
-;# r7 unsigned char *dst_ptr
-;# r8 int dst_pixels_per_line
-;# r9 unsigned int *sse
-;#
-;# r3 return value
-vp8_sub_pixel_variance16x16_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xffff
- ori r12, r12, 0xfff8
- mtspr 256, r12 ;# set VRSAVE
-
- stwu r1, -32(r1) ;# create space on the stack
-
- HProlog second_pass_16x16_pre_copy_b
-
- hfilter_16 v0, 1
- hfilter_16 v1, 1
- hfilter_16 v2, 1
- hfilter_16 v3, 1
- hfilter_16 v4, 1
- hfilter_16 v5, 1
- hfilter_16 v6, 1
- hfilter_16 v7, 1
- hfilter_16 v8, 1
- hfilter_16 v9, 1
- hfilter_16 v10, 1
- hfilter_16 v11, 1
- hfilter_16 v12, 1
- hfilter_16 v13, 1
- hfilter_16 v14, 1
- hfilter_16 v15, 1
-
- ;# Finished filtering main horizontal block. If there is no
- ;# vertical filtering, jump straight to computing the sum and
- ;# sse. Otherwise load up and filter the additional line that
- ;# is needed for the vertical filter.
- beq compute_sum_sse_16x16_b
-
- hfilter_16 v16, 0
-
- b second_pass_16x16_b
-
-second_pass_16x16_pre_copy_b:
- slwi. r6, r6, 5 ;# index into vertical filter array
-
- load_and_align_16 v0, r3, r4, 1
- load_and_align_16 v1, r3, r4, 1
- load_and_align_16 v2, r3, r4, 1
- load_and_align_16 v3, r3, r4, 1
- load_and_align_16 v4, r3, r4, 1
- load_and_align_16 v5, r3, r4, 1
- load_and_align_16 v6, r3, r4, 1
- load_and_align_16 v7, r3, r4, 1
- load_and_align_16 v8, r3, r4, 1
- load_and_align_16 v9, r3, r4, 1
- load_and_align_16 v10, r3, r4, 1
- load_and_align_16 v11, r3, r4, 1
- load_and_align_16 v12, r3, r4, 1
- load_and_align_16 v13, r3, r4, 1
- load_and_align_16 v14, r3, r4, 1
- load_and_align_16 v15, r3, r4, 1
- load_and_align_16 v16, r3, r4, 0
-
- beq compute_sum_sse_16x16_b
-
-second_pass_16x16_b:
- vspltish v20, 8
- vspltish v18, 3
- vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
- load_vfilter v20, v21
-
- vfilter_16 v0, v1
- vfilter_16 v1, v2
- vfilter_16 v2, v3
- vfilter_16 v3, v4
- vfilter_16 v4, v5
- vfilter_16 v5, v6
- vfilter_16 v6, v7
- vfilter_16 v7, v8
- vfilter_16 v8, v9
- vfilter_16 v9, v10
- vfilter_16 v10, v11
- vfilter_16 v11, v12
- vfilter_16 v12, v13
- vfilter_16 v13, v14
- vfilter_16 v14, v15
- vfilter_16 v15, v16
-
-compute_sum_sse_16x16_b:
- vspltish v18, 0 ;# sum
- vspltish v19, 0 ;# sse
- vspltish v23, 0 ;# unpack
- li r10, 16
-
- compute_sum_sse_16 v0, 1
- compute_sum_sse_16 v1, 1
- compute_sum_sse_16 v2, 1
- compute_sum_sse_16 v3, 1
- compute_sum_sse_16 v4, 1
- compute_sum_sse_16 v5, 1
- compute_sum_sse_16 v6, 1
- compute_sum_sse_16 v7, 1
- compute_sum_sse_16 v8, 1
- compute_sum_sse_16 v9, 1
- compute_sum_sse_16 v10, 1
- compute_sum_sse_16 v11, 1
- compute_sum_sse_16 v12, 1
- compute_sum_sse_16 v13, 1
- compute_sum_sse_16 v14, 1
- compute_sum_sse_16 v15, 0
-
- variance_final v18, v19, v23, 8
-
- addi r1, r1, 32 ;# recover stack
-
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
-
- .data
-
- .align 4
-hfilter_b:
- .byte 128, 0, 0, 0,128, 0, 0, 0,128, 0, 0, 0,128, 0, 0, 0
- .byte 112, 16, 0, 0,112, 16, 0, 0,112, 16, 0, 0,112, 16, 0, 0
- .byte 96, 32, 0, 0, 96, 32, 0, 0, 96, 32, 0, 0, 96, 32, 0, 0
- .byte 80, 48, 0, 0, 80, 48, 0, 0, 80, 48, 0, 0, 80, 48, 0, 0
- .byte 64, 64, 0, 0, 64, 64, 0, 0, 64, 64, 0, 0, 64, 64, 0, 0
- .byte 48, 80, 0, 0, 48, 80, 0, 0, 48, 80, 0, 0, 48, 80, 0, 0
- .byte 32, 96, 0, 0, 32, 96, 0, 0, 32, 96, 0, 0, 32, 96, 0, 0
- .byte 16,112, 0, 0, 16,112, 0, 0, 16,112, 0, 0, 16,112, 0, 0
-
- .align 4
-vfilter_b:
- .byte 128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128
- .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
- .byte 112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112
- .byte 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
- .byte 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96
- .byte 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
- .byte 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80
- .byte 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48
- .byte 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
- .byte 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
- .byte 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48
- .byte 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80
- .byte 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
- .byte 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96
- .byte 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
- .byte 112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112
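
The two tables above encode the eight VP8 bilinear filter pairs: for sub-pel offset o in 0..7 the taps are (128 - 16*o, 16*o), replicated across the vector. A small illustrative program (an assumption for clarity, not code from this file) that regenerates the taps and works one sample:

#include <stdio.h>

int main(void)
{
    int o;
    for (o = 0; o < 8; o++)  /* reproduces one row pair of the tables */
        printf("offset %d: taps %3d %3d\n", o, 128 - 16 * o, 16 * o);
    /* e.g. offset 3 -> taps (80, 48); filtering pixels 100 and 200:
     * (80*100 + 48*200 + 64) >> 7 == 17664 >> 7 == 138 */
    return 0;
}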
-
- .align 4
-b_hperm_b:
- .byte 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15
-
- .align 4
-b_0123_b:
- .byte 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
-
- .align 4
-b_4567_b:
- .byte 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
-
-b_hilo_b:
- .byte 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/reconinter.c b/chromium/third_party/libvpx/source/libvpx/vp8/common/reconinter.c
index bac3c9474ee..e3025955871 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/reconinter.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/reconinter.c
@@ -10,6 +10,8 @@
#include <limits.h>
+#include <string.h>
+
#include "vpx_config.h"
#include "vp8_rtcd.h"
#include "vpx/vpx_integer.h"
@@ -30,31 +32,8 @@ void vp8_copy_mem16x16_c(
for (r = 0; r < 16; r++)
{
-#if !(CONFIG_FAST_UNALIGNED)
- dst[0] = src[0];
- dst[1] = src[1];
- dst[2] = src[2];
- dst[3] = src[3];
- dst[4] = src[4];
- dst[5] = src[5];
- dst[6] = src[6];
- dst[7] = src[7];
- dst[8] = src[8];
- dst[9] = src[9];
- dst[10] = src[10];
- dst[11] = src[11];
- dst[12] = src[12];
- dst[13] = src[13];
- dst[14] = src[14];
- dst[15] = src[15];
-
-#else
- ((uint32_t *)dst)[0] = ((uint32_t *)src)[0] ;
- ((uint32_t *)dst)[1] = ((uint32_t *)src)[1] ;
- ((uint32_t *)dst)[2] = ((uint32_t *)src)[2] ;
- ((uint32_t *)dst)[3] = ((uint32_t *)src)[3] ;
+ memcpy(dst, src, 16);
-#endif
src += src_stride;
dst += dst_stride;
@@ -72,19 +51,8 @@ void vp8_copy_mem8x8_c(
for (r = 0; r < 8; r++)
{
-#if !(CONFIG_FAST_UNALIGNED)
- dst[0] = src[0];
- dst[1] = src[1];
- dst[2] = src[2];
- dst[3] = src[3];
- dst[4] = src[4];
- dst[5] = src[5];
- dst[6] = src[6];
- dst[7] = src[7];
-#else
- ((uint32_t *)dst)[0] = ((uint32_t *)src)[0] ;
- ((uint32_t *)dst)[1] = ((uint32_t *)src)[1] ;
-#endif
+ memcpy(dst, src, 8);
+
src += src_stride;
dst += dst_stride;
@@ -102,19 +70,8 @@ void vp8_copy_mem8x4_c(
for (r = 0; r < 4; r++)
{
-#if !(CONFIG_FAST_UNALIGNED)
- dst[0] = src[0];
- dst[1] = src[1];
- dst[2] = src[2];
- dst[3] = src[3];
- dst[4] = src[4];
- dst[5] = src[5];
- dst[6] = src[6];
- dst[7] = src[7];
-#else
- ((uint32_t *)dst)[0] = ((uint32_t *)src)[0] ;
- ((uint32_t *)dst)[1] = ((uint32_t *)src)[1] ;
-#endif
+ memcpy(dst, src, 8);
+
src += src_stride;
dst += dst_stride;
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/reconintra.c b/chromium/third_party/libvpx/source/libvpx/vp8/common/reconintra.c
index ec51ffe40d9..0a6c51b3531 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/reconintra.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/reconintra.c
@@ -70,10 +70,10 @@ void vp8_build_intra_predictors_mby_s_c(MACROBLOCKD *x,
expected_dc = 128;
}
- /*vpx_memset(ypred_ptr, expected_dc, 256);*/
+ /*memset(ypred_ptr, expected_dc, 256);*/
for (r = 0; r < 16; r++)
{
- vpx_memset(ypred_ptr, expected_dc, 16);
+ memset(ypred_ptr, expected_dc, 16);
ypred_ptr += y_stride;
}
}
@@ -98,7 +98,7 @@ void vp8_build_intra_predictors_mby_s_c(MACROBLOCKD *x,
for (r = 0; r < 16; r++)
{
- vpx_memset(ypred_ptr, yleft_col[r], 16);
+ memset(ypred_ptr, yleft_col[r], 16);
ypred_ptr += y_stride;
}
@@ -202,12 +202,12 @@ void vp8_build_intra_predictors_mbuv_s_c(MACROBLOCKD *x,
}
- /*vpx_memset(upred_ptr,expected_udc,64);*/
- /*vpx_memset(vpred_ptr,expected_vdc,64);*/
+ /*memset(upred_ptr,expected_udc,64);*/
+ /*memset(vpred_ptr,expected_vdc,64);*/
for (i = 0; i < 8; i++)
{
- vpx_memset(upred_ptr, expected_udc, 8);
- vpx_memset(vpred_ptr, expected_vdc, 8);
+ memset(upred_ptr, expected_udc, 8);
+ memset(vpred_ptr, expected_vdc, 8);
upred_ptr += pred_stride;
vpred_ptr += pred_stride;
}
@@ -217,8 +217,8 @@ void vp8_build_intra_predictors_mbuv_s_c(MACROBLOCKD *x,
{
for (i = 0; i < 8; i++)
{
- vpx_memcpy(upred_ptr, uabove_row, 8);
- vpx_memcpy(vpred_ptr, vabove_row, 8);
+ memcpy(upred_ptr, uabove_row, 8);
+ memcpy(vpred_ptr, vabove_row, 8);
upred_ptr += pred_stride;
vpred_ptr += pred_stride;
}
@@ -229,8 +229,8 @@ void vp8_build_intra_predictors_mbuv_s_c(MACROBLOCKD *x,
{
for (i = 0; i < 8; i++)
{
- vpx_memset(upred_ptr, uleft_col[i], 8);
- vpx_memset(vpred_ptr, vleft_col[i], 8);
+ memset(upred_ptr, uleft_col[i], 8);
+ memset(vpred_ptr, vleft_col[i], 8);
upred_ptr += pred_stride;
vpred_ptr += pred_stride;
}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/rtcd.c b/chromium/third_party/libvpx/source/libvpx/vp8/common/rtcd.c
index 0b371b094aa..ab0e9b47fe8 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/rtcd.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/rtcd.c
@@ -7,15 +7,13 @@
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
-#include "vpx_config.h"
+#include "./vpx_config.h"
#define RTCD_C
-#include "vp8_rtcd.h"
+#include "./vp8_rtcd.h"
#include "vpx_ports/vpx_once.h"
-extern void vpx_scale_rtcd(void);
void vp8_rtcd()
{
- vpx_scale_rtcd();
once(setup_rtcd_internal);
}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/rtcd_defs.pl b/chromium/third_party/libvpx/source/libvpx/vp8/common/rtcd_defs.pl
index c73ecf93f15..56b7db7ec33 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/rtcd_defs.pl
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/rtcd_defs.pl
@@ -304,88 +304,6 @@ $vp8_variance_halfpixvar16x16_hv_sse2=vp8_variance_halfpixvar16x16_hv_wmt;
$vp8_variance_halfpixvar16x16_hv_media=vp8_variance_halfpixvar16x16_hv_armv6;
#
-# Single block SAD
-#
-add_proto qw/unsigned int vp8_sad4x4/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad";
-specialize qw/vp8_sad4x4 mmx sse2 neon/;
-$vp8_sad4x4_sse2=vp8_sad4x4_wmt;
-
-add_proto qw/unsigned int vp8_sad8x8/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad";
-specialize qw/vp8_sad8x8 mmx sse2 neon/;
-$vp8_sad8x8_sse2=vp8_sad8x8_wmt;
-
-add_proto qw/unsigned int vp8_sad8x16/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad";
-specialize qw/vp8_sad8x16 mmx sse2 neon/;
-$vp8_sad8x16_sse2=vp8_sad8x16_wmt;
-
-add_proto qw/unsigned int vp8_sad16x8/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad";
-specialize qw/vp8_sad16x8 mmx sse2 neon/;
-$vp8_sad16x8_sse2=vp8_sad16x8_wmt;
-
-add_proto qw/unsigned int vp8_sad16x16/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int max_sad";
-specialize qw/vp8_sad16x16 mmx sse2 sse3 media neon/;
-$vp8_sad16x16_sse2=vp8_sad16x16_wmt;
-$vp8_sad16x16_media=vp8_sad16x16_armv6;
-
-#
-# Multi-block SAD, comparing a reference to N blocks 1 pixel apart horizontally
-#
-add_proto qw/void vp8_sad4x4x3/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array";
-specialize qw/vp8_sad4x4x3 sse3/;
-
-add_proto qw/void vp8_sad8x8x3/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array";
-specialize qw/vp8_sad8x8x3 sse3/;
-
-add_proto qw/void vp8_sad8x16x3/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array";
-specialize qw/vp8_sad8x16x3 sse3/;
-
-add_proto qw/void vp8_sad16x8x3/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array";
-specialize qw/vp8_sad16x8x3 sse3 ssse3/;
-
-add_proto qw/void vp8_sad16x16x3/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array";
-specialize qw/vp8_sad16x16x3 sse3 ssse3/;
-
-# Note the only difference in the following prototypes is that they return into
-# an array of short
-add_proto qw/void vp8_sad4x4x8/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array";
-specialize qw/vp8_sad4x4x8 sse4_1/;
-$vp8_sad4x4x8_sse4_1=vp8_sad4x4x8_sse4;
-
-add_proto qw/void vp8_sad8x8x8/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array";
-specialize qw/vp8_sad8x8x8 sse4_1/;
-$vp8_sad8x8x8_sse4_1=vp8_sad8x8x8_sse4;
-
-add_proto qw/void vp8_sad8x16x8/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array";
-specialize qw/vp8_sad8x16x8 sse4_1/;
-$vp8_sad8x16x8_sse4_1=vp8_sad8x16x8_sse4;
-
-add_proto qw/void vp8_sad16x8x8/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array";
-specialize qw/vp8_sad16x8x8 sse4_1/;
-$vp8_sad16x8x8_sse4_1=vp8_sad16x8x8_sse4;
-
-add_proto qw/void vp8_sad16x16x8/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array";
-specialize qw/vp8_sad16x16x8 sse4_1/;
-$vp8_sad16x16x8_sse4_1=vp8_sad16x16x8_sse4;
-
-#
-# Multi-block SAD, comparing a reference to N independent blocks
-#
-add_proto qw/void vp8_sad4x4x4d/, "const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array";
-specialize qw/vp8_sad4x4x4d sse3/;
-
-add_proto qw/void vp8_sad8x8x4d/, "const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array";
-specialize qw/vp8_sad8x8x4d sse3/;
-
-add_proto qw/void vp8_sad8x16x4d/, "const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array";
-specialize qw/vp8_sad8x16x4d sse3/;
-
-add_proto qw/void vp8_sad16x8x4d/, "const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array";
-specialize qw/vp8_sad16x8x4d sse3/;
-
-add_proto qw/void vp8_sad16x16x4d/, "const unsigned char *src_ptr, int src_stride, const unsigned char * const ref_ptr[], int ref_stride, unsigned int *sad_array";
-specialize qw/vp8_sad16x16x4d sse3/;
-
-#
# Encoder functions below this point.
#
if (vpx_config("CONFIG_VP8_ENCODER") eq "yes") {
@@ -454,25 +372,7 @@ add_proto qw/void vp8_regular_quantize_b/, "struct block *, struct blockd *";
specialize qw/vp8_regular_quantize_b sse2 sse4_1/;
add_proto qw/void vp8_fast_quantize_b/, "struct block *, struct blockd *";
-specialize qw/vp8_fast_quantize_b sse2 ssse3 media neon_asm/;
-$vp8_fast_quantize_b_media=vp8_fast_quantize_b_armv6;
-$vp8_fast_quantize_b_neon_asm=vp8_fast_quantize_b_neon;
-
-add_proto qw/void vp8_regular_quantize_b_pair/, "struct block *b1, struct block *b2, struct blockd *d1, struct blockd *d2";
-# no asm yet
-
-add_proto qw/void vp8_fast_quantize_b_pair/, "struct block *b1, struct block *b2, struct blockd *d1, struct blockd *d2";
-specialize qw/vp8_fast_quantize_b_pair neon_asm/;
-$vp8_fast_quantize_b_pair_neon_asm=vp8_fast_quantize_b_pair_neon;
-
-add_proto qw/void vp8_quantize_mb/, "struct macroblock *";
-specialize qw/vp8_quantize_mb neon/;
-
-add_proto qw/void vp8_quantize_mby/, "struct macroblock *";
-specialize qw/vp8_quantize_mby neon/;
-
-add_proto qw/void vp8_quantize_mbuv/, "struct macroblock *";
-specialize qw/vp8_quantize_mbuv neon/;
+specialize qw/vp8_fast_quantize_b sse2 ssse3 neon/;
#
# Block subtraction
@@ -490,16 +390,13 @@ specialize qw/vp8_mbuverror mmx sse2/;
$vp8_mbuverror_sse2=vp8_mbuverror_xmm;
add_proto qw/void vp8_subtract_b/, "struct block *be, struct blockd *bd, int pitch";
-specialize qw/vp8_subtract_b mmx sse2 media neon/;
-$vp8_subtract_b_media=vp8_subtract_b_armv6;
+specialize qw/vp8_subtract_b mmx sse2 neon/;
add_proto qw/void vp8_subtract_mby/, "short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride";
-specialize qw/vp8_subtract_mby mmx sse2 media neon/;
-$vp8_subtract_mby_media=vp8_subtract_mby_armv6;
+specialize qw/vp8_subtract_mby mmx sse2 neon/;
add_proto qw/void vp8_subtract_mbuv/, "short *diff, unsigned char *usrc, unsigned char *vsrc, int src_stride, unsigned char *upred, unsigned char *vpred, int pred_stride";
-specialize qw/vp8_subtract_mbuv mmx sse2 media neon/;
-$vp8_subtract_mbuv_media=vp8_subtract_mbuv_armv6;
+specialize qw/vp8_subtract_mbuv mmx sse2 neon/;
#
# Motion search
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/sad_c.c b/chromium/third_party/libvpx/source/libvpx/vp8/common/sad_c.c
deleted file mode 100644
index 5f36fc96e86..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/sad_c.c
+++ /dev/null
@@ -1,302 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include <limits.h>
-#include <stdlib.h>
-#include "vpx_config.h"
-#include "vpx/vpx_integer.h"
-
-static unsigned int sad_mx_n_c(const unsigned char *src_ptr, int src_stride,
- const unsigned char *ref_ptr, int ref_stride,
- unsigned int max_sad, int m, int n)
-{
- int r, c;
- unsigned int sad = 0;
-
- for (r = 0; r < n; r++)
- {
- for (c = 0; c < m; c++)
- {
- sad += abs(src_ptr[c] - ref_ptr[c]);
- }
-
- if (sad > max_sad)
- break;
-
- src_ptr += src_stride;
- ref_ptr += ref_stride;
- }
-
- return sad;
-}
-
-/* max_sad is provided as an optional optimization point. Alternative
- * implementations of these functions are not required to check it.
- */
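
A hedged usage sketch of that optimization point (pick_best_candidate is a hypothetical helper, not a function from this file): a motion search passes its current best SAD as max_sad, so sad_mx_n_c above can abandon a candidate row-by-row once it is already worse than the best match found so far.

/* Sketch only: UINT_MAX comes from <limits.h>, already included above. */
static int pick_best_candidate(const unsigned char *src, int src_stride,
                               const unsigned char *const cand[], int n,
                               int ref_stride)
{
    int i, best = 0;
    unsigned int best_sad = UINT_MAX;
    for (i = 0; i < n; i++) {
        unsigned int sad = sad_mx_n_c(src, src_stride, cand[i], ref_stride,
                                      best_sad, 16, 16);
        if (sad < best_sad) {  /* early-exited candidates return > best_sad */
            best_sad = sad;
            best = i;
        }
    }
    return best;
}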
-
-unsigned int vp8_sad16x16_c(const unsigned char *src_ptr, int src_stride,
- const unsigned char *ref_ptr, int ref_stride,
- unsigned int max_sad)
-{
- return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, max_sad, 16, 16);
-}
-
-unsigned int vp8_sad8x8_c(const unsigned char *src_ptr, int src_stride,
- const unsigned char *ref_ptr, int ref_stride,
- unsigned int max_sad)
-{
- return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, max_sad, 8, 8);
-}
-
-unsigned int vp8_sad16x8_c(const unsigned char *src_ptr, int src_stride,
- const unsigned char *ref_ptr, int ref_stride,
- unsigned int max_sad)
-{
- return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, max_sad, 16, 8);
-
-}
-
-unsigned int vp8_sad8x16_c(const unsigned char *src_ptr, int src_stride,
- const unsigned char *ref_ptr, int ref_stride,
- unsigned int max_sad)
-{
- return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, max_sad, 8, 16);
-}
-
-unsigned int vp8_sad4x4_c(const unsigned char *src_ptr, int src_stride,
- const unsigned char *ref_ptr, int ref_stride,
- unsigned int max_sad)
-{
- return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, max_sad, 4, 4);
-}
-
-void vp8_sad16x16x3_c(const unsigned char *src_ptr, int src_stride,
- const unsigned char *ref_ptr, int ref_stride,
- unsigned int *sad_array)
-{
- sad_array[0] = vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 0, ref_stride, UINT_MAX);
- sad_array[1] = vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, UINT_MAX);
- sad_array[2] = vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, UINT_MAX);
-}
-
-void vp8_sad16x16x8_c(const unsigned char *src_ptr, int src_stride,
- const unsigned char *ref_ptr, int ref_stride,
- unsigned short *sad_array)
-{
- sad_array[0] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 0, ref_stride, UINT_MAX);
- sad_array[1] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, UINT_MAX);
- sad_array[2] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, UINT_MAX);
- sad_array[3] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 3, ref_stride, UINT_MAX);
- sad_array[4] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 4, ref_stride, UINT_MAX);
- sad_array[5] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 5, ref_stride, UINT_MAX);
- sad_array[6] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 6, ref_stride, UINT_MAX);
- sad_array[7] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 7, ref_stride, UINT_MAX);
-}
-
-void vp8_sad16x8x3_c(const unsigned char *src_ptr, int src_stride,
- const unsigned char *ref_ptr, int ref_stride,
- unsigned int *sad_array)
-{
- sad_array[0] = vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 0, ref_stride, UINT_MAX);
- sad_array[1] = vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, UINT_MAX);
- sad_array[2] = vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, UINT_MAX);
-}
-
-void vp8_sad16x8x8_c(const unsigned char *src_ptr, int src_stride,
- const unsigned char *ref_ptr, int ref_stride,
- unsigned short *sad_array)
-{
- sad_array[0] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 0, ref_stride, UINT_MAX);
- sad_array[1] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, UINT_MAX);
- sad_array[2] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, UINT_MAX);
- sad_array[3] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 3, ref_stride, UINT_MAX);
- sad_array[4] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 4, ref_stride, UINT_MAX);
- sad_array[5] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 5, ref_stride, UINT_MAX);
- sad_array[6] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 6, ref_stride, UINT_MAX);
- sad_array[7] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 7, ref_stride, UINT_MAX);
-}
-
-void vp8_sad8x8x3_c(const unsigned char *src_ptr, int src_stride,
- const unsigned char *ref_ptr, int ref_stride,
- unsigned int *sad_array)
-{
- sad_array[0] = vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 0, ref_stride, UINT_MAX);
- sad_array[1] = vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, UINT_MAX);
- sad_array[2] = vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, UINT_MAX);
-}
-
-void vp8_sad8x8x8_c(const unsigned char *src_ptr, int src_stride,
- const unsigned char *ref_ptr, int ref_stride,
- unsigned short *sad_array)
-{
- sad_array[0] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 0, ref_stride, UINT_MAX);
- sad_array[1] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, UINT_MAX);
- sad_array[2] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, UINT_MAX);
- sad_array[3] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 3, ref_stride, UINT_MAX);
- sad_array[4] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 4, ref_stride, UINT_MAX);
- sad_array[5] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 5, ref_stride, UINT_MAX);
- sad_array[6] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 6, ref_stride, UINT_MAX);
- sad_array[7] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 7, ref_stride, UINT_MAX);
-}
-
-void vp8_sad8x16x3_c(const unsigned char *src_ptr, int src_stride,
- const unsigned char *ref_ptr, int ref_stride,
- unsigned int *sad_array)
-{
- sad_array[0] = vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 0, ref_stride, UINT_MAX);
- sad_array[1] = vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, UINT_MAX);
- sad_array[2] = vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, UINT_MAX);
-}
-
-void vp8_sad8x16x8_c(const unsigned char *src_ptr, int src_stride,
- const unsigned char *ref_ptr, int ref_stride,
- unsigned short *sad_array)
-{
- sad_array[0] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 0, ref_stride, UINT_MAX);
- sad_array[1] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, UINT_MAX);
- sad_array[2] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, UINT_MAX);
- sad_array[3] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 3, ref_stride, UINT_MAX);
- sad_array[4] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 4, ref_stride, UINT_MAX);
- sad_array[5] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 5, ref_stride, UINT_MAX);
- sad_array[6] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 6, ref_stride, UINT_MAX);
- sad_array[7] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 7, ref_stride, UINT_MAX);
-}
-
-void vp8_sad4x4x3_c(const unsigned char *src_ptr, int src_stride,
- const unsigned char *ref_ptr, int ref_stride,
- unsigned int *sad_array)
-{
- sad_array[0] = vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 0, ref_stride, UINT_MAX);
- sad_array[1] = vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, UINT_MAX);
- sad_array[2] = vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, UINT_MAX);
-}
-
-void vp8_sad4x4x8_c(const unsigned char *src_ptr, int src_stride,
- const unsigned char *ref_ptr, int ref_stride,
- unsigned short *sad_array)
-{
- sad_array[0] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 0, ref_stride, UINT_MAX);
- sad_array[1] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, UINT_MAX);
- sad_array[2] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, UINT_MAX);
- sad_array[3] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 3, ref_stride, UINT_MAX);
- sad_array[4] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 4, ref_stride, UINT_MAX);
- sad_array[5] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 5, ref_stride, UINT_MAX);
- sad_array[6] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 6, ref_stride, UINT_MAX);
- sad_array[7] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 7, ref_stride, UINT_MAX);
-}
-
-void vp8_sad16x16x4d_c(const unsigned char *src_ptr, int src_stride,
- const unsigned char * const ref_ptr[], int ref_stride,
- unsigned int *sad_array)
-{
- sad_array[0] = vp8_sad16x16_c(src_ptr, src_stride, ref_ptr[0], ref_stride, UINT_MAX);
- sad_array[1] = vp8_sad16x16_c(src_ptr, src_stride, ref_ptr[1], ref_stride, UINT_MAX);
- sad_array[2] = vp8_sad16x16_c(src_ptr, src_stride, ref_ptr[2], ref_stride, UINT_MAX);
- sad_array[3] = vp8_sad16x16_c(src_ptr, src_stride, ref_ptr[3], ref_stride, UINT_MAX);
-}
-
-void vp8_sad16x8x4d_c(const unsigned char *src_ptr, int src_stride,
- const unsigned char * const ref_ptr[], int ref_stride,
- unsigned int *sad_array)
-{
- sad_array[0] = vp8_sad16x8_c(src_ptr, src_stride, ref_ptr[0], ref_stride, UINT_MAX);
- sad_array[1] = vp8_sad16x8_c(src_ptr, src_stride, ref_ptr[1], ref_stride, UINT_MAX);
- sad_array[2] = vp8_sad16x8_c(src_ptr, src_stride, ref_ptr[2], ref_stride, UINT_MAX);
- sad_array[3] = vp8_sad16x8_c(src_ptr, src_stride, ref_ptr[3], ref_stride, UINT_MAX);
-}
-
-void vp8_sad8x8x4d_c(const unsigned char *src_ptr, int src_stride,
- const unsigned char * const ref_ptr[], int ref_stride,
- unsigned int *sad_array)
-{
- sad_array[0] = vp8_sad8x8_c(src_ptr, src_stride, ref_ptr[0], ref_stride, UINT_MAX);
- sad_array[1] = vp8_sad8x8_c(src_ptr, src_stride, ref_ptr[1], ref_stride, UINT_MAX);
- sad_array[2] = vp8_sad8x8_c(src_ptr, src_stride, ref_ptr[2], ref_stride, UINT_MAX);
- sad_array[3] = vp8_sad8x8_c(src_ptr, src_stride, ref_ptr[3], ref_stride, UINT_MAX);
-}
-
-void vp8_sad8x16x4d_c(const unsigned char *src_ptr, int src_stride,
- const unsigned char * const ref_ptr[], int ref_stride,
- unsigned int *sad_array)
-{
- sad_array[0] = vp8_sad8x16_c(src_ptr, src_stride, ref_ptr[0], ref_stride, UINT_MAX);
- sad_array[1] = vp8_sad8x16_c(src_ptr, src_stride, ref_ptr[1], ref_stride, UINT_MAX);
- sad_array[2] = vp8_sad8x16_c(src_ptr, src_stride, ref_ptr[2], ref_stride, UINT_MAX);
- sad_array[3] = vp8_sad8x16_c(src_ptr, src_stride, ref_ptr[3], ref_stride, UINT_MAX);
-}
-
-void vp8_sad4x4x4d_c(const unsigned char *src_ptr, int src_stride,
- const unsigned char * const ref_ptr[], int ref_stride,
- unsigned int *sad_array)
-{
- sad_array[0] = vp8_sad4x4_c(src_ptr, src_stride, ref_ptr[0], ref_stride, UINT_MAX);
- sad_array[1] = vp8_sad4x4_c(src_ptr, src_stride, ref_ptr[1], ref_stride, UINT_MAX);
- sad_array[2] = vp8_sad4x4_c(src_ptr, src_stride, ref_ptr[2], ref_stride, UINT_MAX);
- sad_array[3] = vp8_sad4x4_c(src_ptr, src_stride, ref_ptr[3], ref_stride, UINT_MAX);
-}
-
-/* Copy 2 macroblocks to a buffer */
-void vp8_copy32xn_c(unsigned char *src_ptr, int src_stride,
- unsigned char *dst_ptr, int dst_stride,
- int height)
-{
- int r;
-
- for (r = 0; r < height; r++)
- {
-#if !(CONFIG_FAST_UNALIGNED)
- dst_ptr[0] = src_ptr[0];
- dst_ptr[1] = src_ptr[1];
- dst_ptr[2] = src_ptr[2];
- dst_ptr[3] = src_ptr[3];
- dst_ptr[4] = src_ptr[4];
- dst_ptr[5] = src_ptr[5];
- dst_ptr[6] = src_ptr[6];
- dst_ptr[7] = src_ptr[7];
- dst_ptr[8] = src_ptr[8];
- dst_ptr[9] = src_ptr[9];
- dst_ptr[10] = src_ptr[10];
- dst_ptr[11] = src_ptr[11];
- dst_ptr[12] = src_ptr[12];
- dst_ptr[13] = src_ptr[13];
- dst_ptr[14] = src_ptr[14];
- dst_ptr[15] = src_ptr[15];
- dst_ptr[16] = src_ptr[16];
- dst_ptr[17] = src_ptr[17];
- dst_ptr[18] = src_ptr[18];
- dst_ptr[19] = src_ptr[19];
- dst_ptr[20] = src_ptr[20];
- dst_ptr[21] = src_ptr[21];
- dst_ptr[22] = src_ptr[22];
- dst_ptr[23] = src_ptr[23];
- dst_ptr[24] = src_ptr[24];
- dst_ptr[25] = src_ptr[25];
- dst_ptr[26] = src_ptr[26];
- dst_ptr[27] = src_ptr[27];
- dst_ptr[28] = src_ptr[28];
- dst_ptr[29] = src_ptr[29];
- dst_ptr[30] = src_ptr[30];
- dst_ptr[31] = src_ptr[31];
-#else
- ((uint32_t *)dst_ptr)[0] = ((uint32_t *)src_ptr)[0] ;
- ((uint32_t *)dst_ptr)[1] = ((uint32_t *)src_ptr)[1] ;
- ((uint32_t *)dst_ptr)[2] = ((uint32_t *)src_ptr)[2] ;
- ((uint32_t *)dst_ptr)[3] = ((uint32_t *)src_ptr)[3] ;
- ((uint32_t *)dst_ptr)[4] = ((uint32_t *)src_ptr)[4] ;
- ((uint32_t *)dst_ptr)[5] = ((uint32_t *)src_ptr)[5] ;
- ((uint32_t *)dst_ptr)[6] = ((uint32_t *)src_ptr)[6] ;
- ((uint32_t *)dst_ptr)[7] = ((uint32_t *)src_ptr)[7] ;
-#endif
- src_ptr += src_stride;
- dst_ptr += dst_stride;
-
- }
-}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/setupintrarecon.c b/chromium/third_party/libvpx/source/libvpx/vp8/common/setupintrarecon.c
index 60afe519f56..669564db42b 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/setupintrarecon.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/setupintrarecon.c
@@ -17,15 +17,15 @@ void vp8_setup_intra_recon(YV12_BUFFER_CONFIG *ybf)
int i;
/* set up frame new frame for intra coded blocks */
- vpx_memset(ybf->y_buffer - 1 - ybf->y_stride, 127, ybf->y_width + 5);
+ memset(ybf->y_buffer - 1 - ybf->y_stride, 127, ybf->y_width + 5);
for (i = 0; i < ybf->y_height; i++)
ybf->y_buffer[ybf->y_stride *i - 1] = (unsigned char) 129;
- vpx_memset(ybf->u_buffer - 1 - ybf->uv_stride, 127, ybf->uv_width + 5);
+ memset(ybf->u_buffer - 1 - ybf->uv_stride, 127, ybf->uv_width + 5);
for (i = 0; i < ybf->uv_height; i++)
ybf->u_buffer[ybf->uv_stride *i - 1] = (unsigned char) 129;
- vpx_memset(ybf->v_buffer - 1 - ybf->uv_stride, 127, ybf->uv_width + 5);
+ memset(ybf->v_buffer - 1 - ybf->uv_stride, 127, ybf->uv_width + 5);
for (i = 0; i < ybf->uv_height; i++)
ybf->v_buffer[ybf->uv_stride *i - 1] = (unsigned char) 129;
@@ -33,7 +33,7 @@ void vp8_setup_intra_recon(YV12_BUFFER_CONFIG *ybf)
void vp8_setup_intra_recon_top_line(YV12_BUFFER_CONFIG *ybf)
{
- vpx_memset(ybf->y_buffer - 1 - ybf->y_stride, 127, ybf->y_width + 5);
- vpx_memset(ybf->u_buffer - 1 - ybf->uv_stride, 127, ybf->uv_width + 5);
- vpx_memset(ybf->v_buffer - 1 - ybf->uv_stride, 127, ybf->uv_width + 5);
+ memset(ybf->y_buffer - 1 - ybf->y_stride, 127, ybf->y_width + 5);
+ memset(ybf->u_buffer - 1 - ybf->uv_stride, 127, ybf->uv_width + 5);
+ memset(ybf->v_buffer - 1 - ybf->uv_stride, 127, ybf->uv_width + 5);
}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/variance.h b/chromium/third_party/libvpx/source/libvpx/vp8/common/variance.h
index 89a32a72268..552a28025e6 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/variance.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/variance.h
@@ -14,16 +14,17 @@
#include "vpx_config.h"
+#include "vpx/vpx_integer.h"
+
#ifdef __cplusplus
extern "C" {
#endif
-typedef unsigned int(*vp8_sad_fn_t)(
- const unsigned char *src_ptr,
+typedef unsigned int(*vpx_sad_fn_t)(
+ const uint8_t *src_ptr,
int source_stride,
- const unsigned char *ref_ptr,
- int ref_stride,
- unsigned int max_sad);
+ const uint8_t *ref_ptr,
+ int ref_stride);
typedef void (*vp8_copy32xn_fn_t)(
const unsigned char *src_ptr,
@@ -32,27 +33,17 @@ typedef void (*vp8_copy32xn_fn_t)(
int ref_stride,
int n);
-typedef void (*vp8_sad_multi_fn_t)(
+typedef void (*vpx_sad_multi_fn_t)(
const unsigned char *src_ptr,
int source_stride,
- const unsigned char *ref_ptr,
+ const unsigned char *ref_array,
int ref_stride,
unsigned int *sad_array);
-
-typedef void (*vp8_sad_multi1_fn_t)
- (
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int ref_stride,
- unsigned short *sad_array
- );
-
-typedef void (*vp8_sad_multi_d_fn_t)
+typedef void (*vpx_sad_multi_d_fn_t)
(
const unsigned char *src_ptr,
int source_stride,
- const unsigned char * const ref_ptr[],
+ const unsigned char * const ref_array[],
int ref_stride,
unsigned int *sad_array
);
@@ -102,15 +93,15 @@ typedef unsigned int (*vp8_get16x16prederror_fn_t)
typedef struct variance_vtable
{
- vp8_sad_fn_t sdf;
+ vpx_sad_fn_t sdf;
vp8_variance_fn_t vf;
vp8_subpixvariance_fn_t svf;
vp8_variance_fn_t svf_halfpix_h;
vp8_variance_fn_t svf_halfpix_v;
vp8_variance_fn_t svf_halfpix_hv;
- vp8_sad_multi_fn_t sdx3f;
- vp8_sad_multi1_fn_t sdx8f;
- vp8_sad_multi_d_fn_t sdx4df;
+ vpx_sad_multi_fn_t sdx3f;
+ vpx_sad_multi_fn_t sdx8f;
+ vpx_sad_multi_d_fn_t sdx4df;
#if ARCH_X86 || ARCH_X86_64
vp8_copy32xn_fn_t copymem;
#endif
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/x86/copy_sse2.asm b/chromium/third_party/libvpx/source/libvpx/vp8/common/x86/copy_sse2.asm
new file mode 100644
index 00000000000..86fae269563
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/x86/copy_sse2.asm
@@ -0,0 +1,93 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+
+;void vp8_copy32xn_sse2(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *dst_ptr,
+; int dst_stride,
+; int height);
+global sym(vp8_copy32xn_sse2) PRIVATE
+sym(vp8_copy32xn_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;dst_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;dst_stride
+ movsxd rcx, dword ptr arg(4) ;height
+
+.block_copy_sse2_loopx4:
+ movdqu xmm0, XMMWORD PTR [rsi]
+ movdqu xmm1, XMMWORD PTR [rsi + 16]
+ movdqu xmm2, XMMWORD PTR [rsi + rax]
+ movdqu xmm3, XMMWORD PTR [rsi + rax + 16]
+
+ lea rsi, [rsi+rax*2]
+
+ movdqu xmm4, XMMWORD PTR [rsi]
+ movdqu xmm5, XMMWORD PTR [rsi + 16]
+ movdqu xmm6, XMMWORD PTR [rsi + rax]
+ movdqu xmm7, XMMWORD PTR [rsi + rax + 16]
+
+ lea rsi, [rsi+rax*2]
+
+ movdqa XMMWORD PTR [rdi], xmm0
+ movdqa XMMWORD PTR [rdi + 16], xmm1
+ movdqa XMMWORD PTR [rdi + rdx], xmm2
+ movdqa XMMWORD PTR [rdi + rdx + 16], xmm3
+
+ lea rdi, [rdi+rdx*2]
+
+ movdqa XMMWORD PTR [rdi], xmm4
+ movdqa XMMWORD PTR [rdi + 16], xmm5
+ movdqa XMMWORD PTR [rdi + rdx], xmm6
+ movdqa XMMWORD PTR [rdi + rdx + 16], xmm7
+
+ lea rdi, [rdi+rdx*2]
+
+ sub rcx, 4
+ cmp rcx, 4
+ jge .block_copy_sse2_loopx4
+
+ cmp rcx, 0
+ je .copy_is_done
+
+.block_copy_sse2_loop:
+ movdqu xmm0, XMMWORD PTR [rsi]
+ movdqu xmm1, XMMWORD PTR [rsi + 16]
+ lea rsi, [rsi+rax]
+
+ movdqa XMMWORD PTR [rdi], xmm0
+ movdqa XMMWORD PTR [rdi + 16], xmm1
+ lea rdi, [rdi+rdx]
+
+ sub rcx, 1
+ jne .block_copy_sse2_loop
+
+.copy_is_done:
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
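
A rough C equivalent of the structure of vp8_copy32xn_sse2 (copy32xn_c is an illustration under the assumptions that height >= 4 on entry and, as the movdqa stores above require, that dst is 16-byte aligned):

#include <string.h>

static void copy32xn_c(const unsigned char *src, int src_stride,
                       unsigned char *dst, int dst_stride, int height)
{
    while (height >= 4) {            /* main loop, unrolled by four rows */
        int r;
        for (r = 0; r < 4; r++)
            memcpy(dst + r * dst_stride, src + r * src_stride, 32);
        src += 4 * src_stride;
        dst += 4 * dst_stride;
        height -= 4;
    }
    while (height--) {               /* remainder rows, one at a time */
        memcpy(dst, src, 32);
        src += src_stride;
        dst += dst_stride;
    }
}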
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/x86/copy_sse3.asm b/chromium/third_party/libvpx/source/libvpx/vp8/common/x86/copy_sse3.asm
new file mode 100644
index 00000000000..d789a40ccf7
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/x86/copy_sse3.asm
@@ -0,0 +1,146 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%macro STACK_FRAME_CREATE_X3 0
+%if ABI_IS_32BIT
+ %define src_ptr rsi
+ %define src_stride rax
+ %define ref_ptr rdi
+ %define ref_stride rdx
+ %define end_ptr rcx
+ %define ret_var rbx
+ %define result_ptr arg(4)
+ %define max_sad arg(4)
+ %define height dword ptr arg(4)
+ push rbp
+ mov rbp, rsp
+ push rsi
+ push rdi
+ push rbx
+
+ mov rsi, arg(0) ; src_ptr
+ mov rdi, arg(2) ; ref_ptr
+
+ movsxd rax, dword ptr arg(1) ; src_stride
+ movsxd rdx, dword ptr arg(3) ; ref_stride
+%else
+ %if LIBVPX_YASM_WIN64
+ SAVE_XMM 7, u
+ %define src_ptr rcx
+ %define src_stride rdx
+ %define ref_ptr r8
+ %define ref_stride r9
+ %define end_ptr r10
+ %define ret_var r11
+ %define result_ptr [rsp+xmm_stack_space+8+4*8]
+ %define max_sad [rsp+xmm_stack_space+8+4*8]
+ %define height dword ptr [rsp+xmm_stack_space+8+4*8]
+ %else
+ %define src_ptr rdi
+ %define src_stride rsi
+ %define ref_ptr rdx
+ %define ref_stride rcx
+ %define end_ptr r9
+ %define ret_var r10
+ %define result_ptr r8
+ %define max_sad r8
+ %define height r8
+ %endif
+%endif
+
+%endmacro
+
+%macro STACK_FRAME_DESTROY_X3 0
+ %define src_ptr
+ %define src_stride
+ %define ref_ptr
+ %define ref_stride
+ %define end_ptr
+ %define ret_var
+ %define result_ptr
+ %define max_sad
+ %define height
+
+%if ABI_IS_32BIT
+ pop rbx
+ pop rdi
+ pop rsi
+ pop rbp
+%else
+ %if LIBVPX_YASM_WIN64
+ RESTORE_XMM
+ %endif
+%endif
+ ret
+%endmacro
+
+
+;void vp8_copy32xn_sse3(
+; unsigned char *src_ptr,
+; int src_stride,
+; unsigned char *dst_ptr,
+; int dst_stride,
+; int height);
+global sym(vp8_copy32xn_sse3) PRIVATE
+sym(vp8_copy32xn_sse3):
+
+ STACK_FRAME_CREATE_X3
+
+.block_copy_sse3_loopx4:
+ lea end_ptr, [src_ptr+src_stride*2]
+
+ movdqu xmm0, XMMWORD PTR [src_ptr]
+ movdqu xmm1, XMMWORD PTR [src_ptr + 16]
+ movdqu xmm2, XMMWORD PTR [src_ptr + src_stride]
+ movdqu xmm3, XMMWORD PTR [src_ptr + src_stride + 16]
+ movdqu xmm4, XMMWORD PTR [end_ptr]
+ movdqu xmm5, XMMWORD PTR [end_ptr + 16]
+ movdqu xmm6, XMMWORD PTR [end_ptr + src_stride]
+ movdqu xmm7, XMMWORD PTR [end_ptr + src_stride + 16]
+
+ lea src_ptr, [src_ptr+src_stride*4]
+
+ lea end_ptr, [ref_ptr+ref_stride*2]
+
+ movdqa XMMWORD PTR [ref_ptr], xmm0
+ movdqa XMMWORD PTR [ref_ptr + 16], xmm1
+ movdqa XMMWORD PTR [ref_ptr + ref_stride], xmm2
+ movdqa XMMWORD PTR [ref_ptr + ref_stride + 16], xmm3
+ movdqa XMMWORD PTR [end_ptr], xmm4
+ movdqa XMMWORD PTR [end_ptr + 16], xmm5
+ movdqa XMMWORD PTR [end_ptr + ref_stride], xmm6
+ movdqa XMMWORD PTR [end_ptr + ref_stride + 16], xmm7
+
+ lea ref_ptr, [ref_ptr+ref_stride*4]
+
+ sub height, 4
+ cmp height, 4
+ jge .block_copy_sse3_loopx4
+
+ ;Check to see if there are more rows that need to be copied.
+ cmp height, 0
+ je .copy_is_done
+
+.block_copy_sse3_loop:
+ movdqu xmm0, XMMWORD PTR [src_ptr]
+ movdqu xmm1, XMMWORD PTR [src_ptr + 16]
+ lea src_ptr, [src_ptr+src_stride]
+
+ movdqa XMMWORD PTR [ref_ptr], xmm0
+ movdqa XMMWORD PTR [ref_ptr + 16], xmm1
+ lea ref_ptr, [ref_ptr+ref_stride]
+
+ sub height, 1
+ jne .block_copy_sse3_loop
+
+.copy_is_done:
+ STACK_FRAME_DESTROY_X3
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/x86/idct_blk_mmx.c b/chromium/third_party/libvpx/source/libvpx/vp8/common/x86/idct_blk_mmx.c
index a1e4ce6b329..f2532b34da2 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/x86/idct_blk_mmx.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/x86/idct_blk_mmx.c
@@ -36,7 +36,7 @@ void vp8_dequant_idct_add_y_block_mmx
else if (eobs[0] == 1)
{
vp8_dc_only_idct_add_mmx (q[0]*dq[0], dst, stride, dst, stride);
- vpx_memset(q, 0, 2 * sizeof(q[0]));
+ memset(q, 0, 2 * sizeof(q[0]));
}
if (eobs[1] > 1)
@@ -45,7 +45,7 @@ void vp8_dequant_idct_add_y_block_mmx
{
vp8_dc_only_idct_add_mmx (q[16]*dq[0], dst+4, stride,
dst+4, stride);
- vpx_memset(q + 16, 0, 2 * sizeof(q[0]));
+ memset(q + 16, 0, 2 * sizeof(q[0]));
}
if (eobs[2] > 1)
@@ -54,7 +54,7 @@ void vp8_dequant_idct_add_y_block_mmx
{
vp8_dc_only_idct_add_mmx (q[32]*dq[0], dst+8, stride,
dst+8, stride);
- vpx_memset(q + 32, 0, 2 * sizeof(q[0]));
+ memset(q + 32, 0, 2 * sizeof(q[0]));
}
if (eobs[3] > 1)
@@ -63,7 +63,7 @@ void vp8_dequant_idct_add_y_block_mmx
{
vp8_dc_only_idct_add_mmx (q[48]*dq[0], dst+12, stride,
dst+12, stride);
- vpx_memset(q + 48, 0, 2 * sizeof(q[0]));
+ memset(q + 48, 0, 2 * sizeof(q[0]));
}
q += 64;
@@ -85,7 +85,7 @@ void vp8_dequant_idct_add_uv_block_mmx
else if (eobs[0] == 1)
{
vp8_dc_only_idct_add_mmx (q[0]*dq[0], dstu, stride, dstu, stride);
- vpx_memset(q, 0, 2 * sizeof(q[0]));
+ memset(q, 0, 2 * sizeof(q[0]));
}
if (eobs[1] > 1)
@@ -94,7 +94,7 @@ void vp8_dequant_idct_add_uv_block_mmx
{
vp8_dc_only_idct_add_mmx (q[16]*dq[0], dstu+4, stride,
dstu+4, stride);
- vpx_memset(q + 16, 0, 2 * sizeof(q[0]));
+ memset(q + 16, 0, 2 * sizeof(q[0]));
}
q += 32;
@@ -109,7 +109,7 @@ void vp8_dequant_idct_add_uv_block_mmx
else if (eobs[0] == 1)
{
vp8_dc_only_idct_add_mmx (q[0]*dq[0], dstv, stride, dstv, stride);
- vpx_memset(q, 0, 2 * sizeof(q[0]));
+ memset(q, 0, 2 * sizeof(q[0]));
}
if (eobs[1] > 1)
@@ -118,7 +118,7 @@ void vp8_dequant_idct_add_uv_block_mmx
{
vp8_dc_only_idct_add_mmx (q[16]*dq[0], dstv+4, stride,
dstv+4, stride);
- vpx_memset(q + 16, 0, 2 * sizeof(q[0]));
+ memset(q + 16, 0, 2 * sizeof(q[0]));
}
q += 32;
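
The pattern above repeats per 4x4 subblock and can be summarized in C (a sketch under the assumption that the prototypes match their use above; dequant_idct_dispatch itself is hypothetical): more than one coded coefficient means a full dequantize+IDCT, exactly one means only DC is present, so a cheap DC-only add is used and the stored coefficients are cleared by hand, since only the full IDCT path clears them itself.

#include <string.h>

/* Prototypes as used in this file. */
void vp8_dequant_idct_add_mmx(short *input, short *dq,
                              unsigned char *dest, int stride);
void vp8_dc_only_idct_add_mmx(short input_dc, unsigned char *pred,
                              int pred_stride, unsigned char *dst,
                              int dst_stride);

static void dequant_idct_dispatch(short *q, const short *dq,
                                  unsigned char *dst, int stride, char eob)
{
    if (eob > 1) {
        vp8_dequant_idct_add_mmx(q, (short *)dq, dst, stride); /* clears q */
    } else if (eob == 1) {
        vp8_dc_only_idct_add_mmx(q[0] * dq[0], dst, stride, dst, stride);
        memset(q, 0, 2 * sizeof(q[0]));  /* clear the stale DC coefficient */
    }
    /* eob == 0: nothing coded, nothing to add */
}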
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/x86/sad_sse2.asm b/chromium/third_party/libvpx/source/libvpx/vp8/common/x86/sad_sse2.asm
deleted file mode 100644
index 8d86abc0758..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/x86/sad_sse2.asm
+++ /dev/null
@@ -1,410 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;unsigned int vp8_sad16x16_wmt(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride)
-global sym(vp8_sad16x16_wmt) PRIVATE
-sym(vp8_sad16x16_wmt):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 4
- SAVE_XMM 6
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;ref_ptr
-
- movsxd rax, dword ptr arg(1) ;src_stride
- movsxd rdx, dword ptr arg(3) ;ref_stride
-
- lea rcx, [rsi+rax*8]
-
- lea rcx, [rcx+rax*8]
- pxor xmm6, xmm6
-
-.x16x16sad_wmt_loop:
-
- movq xmm0, QWORD PTR [rsi]
- movq xmm2, QWORD PTR [rsi+8]
-
- movq xmm1, QWORD PTR [rdi]
- movq xmm3, QWORD PTR [rdi+8]
-
- movq xmm4, QWORD PTR [rsi+rax]
- movq xmm5, QWORD PTR [rdi+rdx]
-
-
- punpcklbw xmm0, xmm2
- punpcklbw xmm1, xmm3
-
- psadbw xmm0, xmm1
- movq xmm2, QWORD PTR [rsi+rax+8]
-
- movq xmm3, QWORD PTR [rdi+rdx+8]
- lea rsi, [rsi+rax*2]
-
- lea rdi, [rdi+rdx*2]
- punpcklbw xmm4, xmm2
-
- punpcklbw xmm5, xmm3
- psadbw xmm4, xmm5
-
- paddw xmm6, xmm0
- paddw xmm6, xmm4
-
- cmp rsi, rcx
- jne .x16x16sad_wmt_loop
-
- movq xmm0, xmm6
- psrldq xmm6, 8
-
- paddw xmm0, xmm6
- movq rax, xmm0
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;unsigned int vp8_sad8x16_wmt(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride,
-; int max_sad)
-global sym(vp8_sad8x16_wmt) PRIVATE
-sym(vp8_sad8x16_wmt):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- push rbx
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;ref_ptr
-
- movsxd rbx, dword ptr arg(1) ;src_stride
- movsxd rdx, dword ptr arg(3) ;ref_stride
-
- lea rcx, [rsi+rbx*8]
-
- lea rcx, [rcx+rbx*8]
- pxor mm7, mm7
-
-.x8x16sad_wmt_loop:
-
- movq rax, mm7
- cmp eax, arg(4)
- ja .x8x16sad_wmt_early_exit
-
- movq mm0, QWORD PTR [rsi]
- movq mm1, QWORD PTR [rdi]
-
- movq mm2, QWORD PTR [rsi+rbx]
- movq mm3, QWORD PTR [rdi+rdx]
-
- psadbw mm0, mm1
- psadbw mm2, mm3
-
- lea rsi, [rsi+rbx*2]
- lea rdi, [rdi+rdx*2]
-
- paddw mm7, mm0
- paddw mm7, mm2
-
- cmp rsi, rcx
- jne .x8x16sad_wmt_loop
-
- movq rax, mm7
-
-.x8x16sad_wmt_early_exit:
-
- ; begin epilog
- pop rdi
- pop rsi
- pop rbx
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;unsigned int vp8_sad8x8_wmt(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride)
-global sym(vp8_sad8x8_wmt) PRIVATE
-sym(vp8_sad8x8_wmt):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- push rbx
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;ref_ptr
-
- movsxd rbx, dword ptr arg(1) ;src_stride
- movsxd rdx, dword ptr arg(3) ;ref_stride
-
- lea rcx, [rsi+rbx*8]
- pxor mm7, mm7
-
-.x8x8sad_wmt_loop:
-
- movq rax, mm7
- cmp eax, arg(4)
- ja .x8x8sad_wmt_early_exit
-
- movq mm0, QWORD PTR [rsi]
- movq mm1, QWORD PTR [rdi]
-
- psadbw mm0, mm1
- lea rsi, [rsi+rbx]
-
- add rdi, rdx
- paddw mm7, mm0
-
- cmp rsi, rcx
- jne .x8x8sad_wmt_loop
-
- movq rax, mm7
-.x8x8sad_wmt_early_exit:
-
- ; begin epilog
- pop rdi
- pop rsi
- pop rbx
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;unsigned int vp8_sad4x4_wmt(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride)
-global sym(vp8_sad4x4_wmt) PRIVATE
-sym(vp8_sad4x4_wmt):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 4
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;ref_ptr
-
- movsxd rax, dword ptr arg(1) ;src_stride
- movsxd rdx, dword ptr arg(3) ;ref_stride
-
- movd mm0, DWORD PTR [rsi]
- movd mm1, DWORD PTR [rdi]
-
- movd mm2, DWORD PTR [rsi+rax]
- movd mm3, DWORD PTR [rdi+rdx]
-
- punpcklbw mm0, mm2
- punpcklbw mm1, mm3
-
- psadbw mm0, mm1
- lea rsi, [rsi+rax*2]
-
- lea rdi, [rdi+rdx*2]
- movd mm4, DWORD PTR [rsi]
-
- movd mm5, DWORD PTR [rdi]
- movd mm6, DWORD PTR [rsi+rax]
-
- movd mm7, DWORD PTR [rdi+rdx]
- punpcklbw mm4, mm6
-
- punpcklbw mm5, mm7
- psadbw mm4, mm5
-
- paddw mm0, mm4
- movq rax, mm0
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;unsigned int vp8_sad16x8_wmt(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride)
-global sym(vp8_sad16x8_wmt) PRIVATE
-sym(vp8_sad16x8_wmt):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- push rbx
- push rsi
- push rdi
- ; end prolog
-
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;ref_ptr
-
- movsxd rbx, dword ptr arg(1) ;src_stride
- movsxd rdx, dword ptr arg(3) ;ref_stride
-
- lea rcx, [rsi+rbx*8]
- pxor mm7, mm7
-
-.x16x8sad_wmt_loop:
-
- movq rax, mm7
- cmp eax, arg(4)
- ja .x16x8sad_wmt_early_exit
-
- movq mm0, QWORD PTR [rsi]
- movq mm2, QWORD PTR [rsi+8]
-
- movq mm1, QWORD PTR [rdi]
- movq mm3, QWORD PTR [rdi+8]
-
- movq mm4, QWORD PTR [rsi+rbx]
- movq mm5, QWORD PTR [rdi+rdx]
-
- psadbw mm0, mm1
- psadbw mm2, mm3
-
- movq mm1, QWORD PTR [rsi+rbx+8]
- movq mm3, QWORD PTR [rdi+rdx+8]
-
- psadbw mm4, mm5
- psadbw mm1, mm3
-
- lea rsi, [rsi+rbx*2]
- lea rdi, [rdi+rdx*2]
-
- paddw mm0, mm2
- paddw mm4, mm1
-
- paddw mm7, mm0
- paddw mm7, mm4
-
- cmp rsi, rcx
- jne .x16x8sad_wmt_loop
-
- movq rax, mm7
-
-.x16x8sad_wmt_early_exit:
-
- ; begin epilog
- pop rdi
- pop rsi
- pop rbx
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;void vp8_copy32xn_sse2(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *dst_ptr,
-; int dst_stride,
-; int height);
-global sym(vp8_copy32xn_sse2) PRIVATE
-sym(vp8_copy32xn_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- SAVE_XMM 7
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;dst_ptr
-
- movsxd rax, dword ptr arg(1) ;src_stride
- movsxd rdx, dword ptr arg(3) ;dst_stride
- movsxd rcx, dword ptr arg(4) ;height
-
-.block_copy_sse2_loopx4:
- movdqu xmm0, XMMWORD PTR [rsi]
- movdqu xmm1, XMMWORD PTR [rsi + 16]
- movdqu xmm2, XMMWORD PTR [rsi + rax]
- movdqu xmm3, XMMWORD PTR [rsi + rax + 16]
-
- lea rsi, [rsi+rax*2]
-
- movdqu xmm4, XMMWORD PTR [rsi]
- movdqu xmm5, XMMWORD PTR [rsi + 16]
- movdqu xmm6, XMMWORD PTR [rsi + rax]
- movdqu xmm7, XMMWORD PTR [rsi + rax + 16]
-
- lea rsi, [rsi+rax*2]
-
- movdqa XMMWORD PTR [rdi], xmm0
- movdqa XMMWORD PTR [rdi + 16], xmm1
- movdqa XMMWORD PTR [rdi + rdx], xmm2
- movdqa XMMWORD PTR [rdi + rdx + 16], xmm3
-
- lea rdi, [rdi+rdx*2]
-
- movdqa XMMWORD PTR [rdi], xmm4
- movdqa XMMWORD PTR [rdi + 16], xmm5
- movdqa XMMWORD PTR [rdi + rdx], xmm6
- movdqa XMMWORD PTR [rdi + rdx + 16], xmm7
-
- lea rdi, [rdi+rdx*2]
-
- sub rcx, 4
- cmp rcx, 4
- jge .block_copy_sse2_loopx4
-
- cmp rcx, 0
- je .copy_is_done
-
-.block_copy_sse2_loop:
- movdqu xmm0, XMMWORD PTR [rsi]
- movdqu xmm1, XMMWORD PTR [rsi + 16]
- lea rsi, [rsi+rax]
-
- movdqa XMMWORD PTR [rdi], xmm0
- movdqa XMMWORD PTR [rdi + 16], xmm1
- lea rdi, [rdi+rdx]
-
- sub rcx, 1
- jne .block_copy_sse2_loop
-
-.copy_is_done:
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
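For orientation: the file deleted above implemented sum-of-absolute-differences (SAD) kernels with psadbw, plus the vp8_copy32xn_sse2 block copy; equivalent implementations presumably live elsewhere in the tree after this update. A plain C reference for the SAD semantics, a sketch rather than the library's actual code:

#include <stdlib.h>

/* Reference SAD: sum of absolute pixel differences over a w x h block.
 * vp8_sad16x16_wmt and friends computed this, vectorized with psadbw. */
static unsigned int sad_c(const unsigned char *src, int src_stride,
                          const unsigned char *ref, int ref_stride,
                          int w, int h)
{
    unsigned int sad = 0;
    int r, c;

    for (r = 0; r < h; ++r) {
        for (c = 0; c < w; ++c)
            sad += abs(src[c] - ref[c]);
        src += src_stride;
        ref += ref_stride;
    }
    return sad;
}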
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/x86/sad_sse3.asm b/chromium/third_party/libvpx/source/libvpx/vp8/common/x86/sad_sse3.asm
deleted file mode 100644
index 69c8d376973..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/x86/sad_sse3.asm
+++ /dev/null
@@ -1,960 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-%include "vpx_ports/x86_abi_support.asm"
-
-%macro STACK_FRAME_CREATE_X3 0
-%if ABI_IS_32BIT
- %define src_ptr rsi
- %define src_stride rax
- %define ref_ptr rdi
- %define ref_stride rdx
- %define end_ptr rcx
- %define ret_var rbx
- %define result_ptr arg(4)
- %define max_sad arg(4)
- %define height dword ptr arg(4)
- push rbp
- mov rbp, rsp
- push rsi
- push rdi
- push rbx
-
- mov rsi, arg(0) ; src_ptr
- mov rdi, arg(2) ; ref_ptr
-
- movsxd rax, dword ptr arg(1) ; src_stride
- movsxd rdx, dword ptr arg(3) ; ref_stride
-%else
- %if LIBVPX_YASM_WIN64
- SAVE_XMM 7, u
- %define src_ptr rcx
- %define src_stride rdx
- %define ref_ptr r8
- %define ref_stride r9
- %define end_ptr r10
- %define ret_var r11
- %define result_ptr [rsp+xmm_stack_space+8+4*8]
- %define max_sad [rsp+xmm_stack_space+8+4*8]
- %define height dword ptr [rsp+xmm_stack_space+8+4*8]
- %else
- %define src_ptr rdi
- %define src_stride rsi
- %define ref_ptr rdx
- %define ref_stride rcx
- %define end_ptr r9
- %define ret_var r10
- %define result_ptr r8
- %define max_sad r8
- %define height r8
- %endif
-%endif
-
-%endmacro
-
-%macro STACK_FRAME_DESTROY_X3 0
- %define src_ptr
- %define src_stride
- %define ref_ptr
- %define ref_stride
- %define end_ptr
- %define ret_var
- %define result_ptr
- %define max_sad
- %define height
-
-%if ABI_IS_32BIT
- pop rbx
- pop rdi
- pop rsi
- pop rbp
-%else
- %if LIBVPX_YASM_WIN64
- RESTORE_XMM
- %endif
-%endif
- ret
-%endmacro
-
-%macro STACK_FRAME_CREATE_X4 0
-%if ABI_IS_32BIT
- %define src_ptr rsi
- %define src_stride rax
- %define r0_ptr rcx
- %define r1_ptr rdx
- %define r2_ptr rbx
- %define r3_ptr rdi
- %define ref_stride rbp
- %define result_ptr arg(4)
- push rbp
- mov rbp, rsp
- push rsi
- push rdi
- push rbx
-
- push rbp
- mov rdi, arg(2) ; ref_ptr_base
-
- LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi
-
- mov rsi, arg(0) ; src_ptr
-
- movsxd rbx, dword ptr arg(1) ; src_stride
- movsxd rbp, dword ptr arg(3) ; ref_stride
-
- xchg rbx, rax
-%else
- %if LIBVPX_YASM_WIN64
- SAVE_XMM 7, u
- %define src_ptr rcx
- %define src_stride rdx
- %define r0_ptr rsi
- %define r1_ptr r10
- %define r2_ptr r11
- %define r3_ptr r8
- %define ref_stride r9
- %define result_ptr [rsp+xmm_stack_space+16+4*8]
- push rsi
-
- LOAD_X4_ADDRESSES r8, r0_ptr, r1_ptr, r2_ptr, r3_ptr
- %else
- %define src_ptr rdi
- %define src_stride rsi
- %define r0_ptr r9
- %define r1_ptr r10
- %define r2_ptr r11
- %define r3_ptr rdx
- %define ref_stride rcx
- %define result_ptr r8
-
- LOAD_X4_ADDRESSES rdx, r0_ptr, r1_ptr, r2_ptr, r3_ptr
-
- %endif
-%endif
-%endmacro
-
-%macro STACK_FRAME_DESTROY_X4 0
- %define src_ptr
- %define src_stride
- %define r0_ptr
- %define r1_ptr
- %define r2_ptr
- %define r3_ptr
- %define ref_stride
- %define result_ptr
-
-%if ABI_IS_32BIT
- pop rbx
- pop rdi
- pop rsi
- pop rbp
-%else
- %if LIBVPX_YASM_WIN64
- pop rsi
- RESTORE_XMM
- %endif
-%endif
- ret
-%endmacro
-
-%macro PROCESS_16X2X3 5
-%if %1==0
- movdqa xmm0, XMMWORD PTR [%2]
- lddqu xmm5, XMMWORD PTR [%3]
- lddqu xmm6, XMMWORD PTR [%3+1]
- lddqu xmm7, XMMWORD PTR [%3+2]
-
- psadbw xmm5, xmm0
- psadbw xmm6, xmm0
- psadbw xmm7, xmm0
-%else
- movdqa xmm0, XMMWORD PTR [%2]
- lddqu xmm1, XMMWORD PTR [%3]
- lddqu xmm2, XMMWORD PTR [%3+1]
- lddqu xmm3, XMMWORD PTR [%3+2]
-
- psadbw xmm1, xmm0
- psadbw xmm2, xmm0
- psadbw xmm3, xmm0
-
- paddw xmm5, xmm1
- paddw xmm6, xmm2
- paddw xmm7, xmm3
-%endif
- movdqa xmm0, XMMWORD PTR [%2+%4]
- lddqu xmm1, XMMWORD PTR [%3+%5]
- lddqu xmm2, XMMWORD PTR [%3+%5+1]
- lddqu xmm3, XMMWORD PTR [%3+%5+2]
-
-%if %1==0 || %1==1
- lea %2, [%2+%4*2]
- lea %3, [%3+%5*2]
-%endif
-
- psadbw xmm1, xmm0
- psadbw xmm2, xmm0
- psadbw xmm3, xmm0
-
- paddw xmm5, xmm1
- paddw xmm6, xmm2
- paddw xmm7, xmm3
-%endmacro
-
-%macro PROCESS_8X2X3 5
-%if %1==0
- movq mm0, QWORD PTR [%2]
- movq mm5, QWORD PTR [%3]
- movq mm6, QWORD PTR [%3+1]
- movq mm7, QWORD PTR [%3+2]
-
- psadbw mm5, mm0
- psadbw mm6, mm0
- psadbw mm7, mm0
-%else
- movq mm0, QWORD PTR [%2]
- movq mm1, QWORD PTR [%3]
- movq mm2, QWORD PTR [%3+1]
- movq mm3, QWORD PTR [%3+2]
-
- psadbw mm1, mm0
- psadbw mm2, mm0
- psadbw mm3, mm0
-
- paddw mm5, mm1
- paddw mm6, mm2
- paddw mm7, mm3
-%endif
- movq mm0, QWORD PTR [%2+%4]
- movq mm1, QWORD PTR [%3+%5]
- movq mm2, QWORD PTR [%3+%5+1]
- movq mm3, QWORD PTR [%3+%5+2]
-
-%if %1==0 || %1==1
- lea %2, [%2+%4*2]
- lea %3, [%3+%5*2]
-%endif
-
- psadbw mm1, mm0
- psadbw mm2, mm0
- psadbw mm3, mm0
-
- paddw mm5, mm1
- paddw mm6, mm2
- paddw mm7, mm3
-%endmacro
-
-%macro LOAD_X4_ADDRESSES 5
- mov %2, [%1+REG_SZ_BYTES*0]
- mov %3, [%1+REG_SZ_BYTES*1]
-
- mov %4, [%1+REG_SZ_BYTES*2]
- mov %5, [%1+REG_SZ_BYTES*3]
-%endmacro
-
-%macro PROCESS_16X2X4 8
-%if %1==0
- movdqa xmm0, XMMWORD PTR [%2]
- lddqu xmm4, XMMWORD PTR [%3]
- lddqu xmm5, XMMWORD PTR [%4]
- lddqu xmm6, XMMWORD PTR [%5]
- lddqu xmm7, XMMWORD PTR [%6]
-
- psadbw xmm4, xmm0
- psadbw xmm5, xmm0
- psadbw xmm6, xmm0
- psadbw xmm7, xmm0
-%else
- movdqa xmm0, XMMWORD PTR [%2]
- lddqu xmm1, XMMWORD PTR [%3]
- lddqu xmm2, XMMWORD PTR [%4]
- lddqu xmm3, XMMWORD PTR [%5]
-
- psadbw xmm1, xmm0
- psadbw xmm2, xmm0
- psadbw xmm3, xmm0
-
- paddw xmm4, xmm1
- lddqu xmm1, XMMWORD PTR [%6]
- paddw xmm5, xmm2
- paddw xmm6, xmm3
-
- psadbw xmm1, xmm0
- paddw xmm7, xmm1
-%endif
- movdqa xmm0, XMMWORD PTR [%2+%7]
- lddqu xmm1, XMMWORD PTR [%3+%8]
- lddqu xmm2, XMMWORD PTR [%4+%8]
- lddqu xmm3, XMMWORD PTR [%5+%8]
-
- psadbw xmm1, xmm0
- psadbw xmm2, xmm0
- psadbw xmm3, xmm0
-
- paddw xmm4, xmm1
- lddqu xmm1, XMMWORD PTR [%6+%8]
- paddw xmm5, xmm2
- paddw xmm6, xmm3
-
-%if %1==0 || %1==1
- lea %2, [%2+%7*2]
- lea %3, [%3+%8*2]
-
- lea %4, [%4+%8*2]
- lea %5, [%5+%8*2]
-
- lea %6, [%6+%8*2]
-%endif
- psadbw xmm1, xmm0
- paddw xmm7, xmm1
-
-%endmacro
-
-%macro PROCESS_8X2X4 8
-%if %1==0
- movq mm0, QWORD PTR [%2]
- movq mm4, QWORD PTR [%3]
- movq mm5, QWORD PTR [%4]
- movq mm6, QWORD PTR [%5]
- movq mm7, QWORD PTR [%6]
-
- psadbw mm4, mm0
- psadbw mm5, mm0
- psadbw mm6, mm0
- psadbw mm7, mm0
-%else
- movq mm0, QWORD PTR [%2]
- movq mm1, QWORD PTR [%3]
- movq mm2, QWORD PTR [%4]
- movq mm3, QWORD PTR [%5]
-
- psadbw mm1, mm0
- psadbw mm2, mm0
- psadbw mm3, mm0
-
- paddw mm4, mm1
- movq mm1, QWORD PTR [%6]
- paddw mm5, mm2
- paddw mm6, mm3
-
- psadbw mm1, mm0
- paddw mm7, mm1
-%endif
- movq mm0, QWORD PTR [%2+%7]
- movq mm1, QWORD PTR [%3+%8]
- movq mm2, QWORD PTR [%4+%8]
- movq mm3, QWORD PTR [%5+%8]
-
- psadbw mm1, mm0
- psadbw mm2, mm0
- psadbw mm3, mm0
-
- paddw mm4, mm1
- movq mm1, QWORD PTR [%6+%8]
- paddw mm5, mm2
- paddw mm6, mm3
-
-%if %1==0 || %1==1
- lea %2, [%2+%7*2]
- lea %3, [%3+%8*2]
-
- lea %4, [%4+%8*2]
- lea %5, [%5+%8*2]
-
- lea %6, [%6+%8*2]
-%endif
- psadbw mm1, mm0
- paddw mm7, mm1
-
-%endmacro
-
-;void vp8_sad16x16x3_sse3(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride,
-; int *results)
-global sym(vp8_sad16x16x3_sse3) PRIVATE
-sym(vp8_sad16x16x3_sse3):
-
- STACK_FRAME_CREATE_X3
-
- PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
-
- mov rcx, result_ptr
-
- movq xmm0, xmm5
- psrldq xmm5, 8
-
- paddw xmm0, xmm5
- movd [rcx], xmm0
-;-
- movq xmm0, xmm6
- psrldq xmm6, 8
-
- paddw xmm0, xmm6
- movd [rcx+4], xmm0
-;-
- movq xmm0, xmm7
- psrldq xmm7, 8
-
- paddw xmm0, xmm7
- movd [rcx+8], xmm0
-
- STACK_FRAME_DESTROY_X3
-
-;void vp8_sad16x8x3_sse3(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride,
-; int *results)
-global sym(vp8_sad16x8x3_sse3) PRIVATE
-sym(vp8_sad16x8x3_sse3):
-
- STACK_FRAME_CREATE_X3
-
- PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
-
- mov rcx, result_ptr
-
- movq xmm0, xmm5
- psrldq xmm5, 8
-
- paddw xmm0, xmm5
- movd [rcx], xmm0
-;-
- movq xmm0, xmm6
- psrldq xmm6, 8
-
- paddw xmm0, xmm6
- movd [rcx+4], xmm0
-;-
- movq xmm0, xmm7
- psrldq xmm7, 8
-
- paddw xmm0, xmm7
- movd [rcx+8], xmm0
-
- STACK_FRAME_DESTROY_X3
-
-;void vp8_sad8x16x3_sse3(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride,
-; int *results)
-global sym(vp8_sad8x16x3_sse3) PRIVATE
-sym(vp8_sad8x16x3_sse3):
-
- STACK_FRAME_CREATE_X3
-
- PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
-
- mov rcx, result_ptr
-
- punpckldq mm5, mm6
-
- movq [rcx], mm5
- movd [rcx+8], mm7
-
- STACK_FRAME_DESTROY_X3
-
-;void vp8_sad8x8x3_sse3(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride,
-; int *results)
-global sym(vp8_sad8x8x3_sse3) PRIVATE
-sym(vp8_sad8x8x3_sse3):
-
- STACK_FRAME_CREATE_X3
-
- PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
- PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
-
- mov rcx, result_ptr
-
- punpckldq mm5, mm6
-
- movq [rcx], mm5
- movd [rcx+8], mm7
-
- STACK_FRAME_DESTROY_X3
-
-;void vp8_sad4x4x3_sse3(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride,
-; int *results)
-global sym(vp8_sad4x4x3_sse3) PRIVATE
-sym(vp8_sad4x4x3_sse3):
-
- STACK_FRAME_CREATE_X3
-
- movd mm0, DWORD PTR [src_ptr]
- movd mm1, DWORD PTR [ref_ptr]
-
- movd mm2, DWORD PTR [src_ptr+src_stride]
- movd mm3, DWORD PTR [ref_ptr+ref_stride]
-
- punpcklbw mm0, mm2
- punpcklbw mm1, mm3
-
- movd mm4, DWORD PTR [ref_ptr+1]
- movd mm5, DWORD PTR [ref_ptr+2]
-
- movd mm2, DWORD PTR [ref_ptr+ref_stride+1]
- movd mm3, DWORD PTR [ref_ptr+ref_stride+2]
-
- psadbw mm1, mm0
-
- punpcklbw mm4, mm2
- punpcklbw mm5, mm3
-
- psadbw mm4, mm0
- psadbw mm5, mm0
-
- lea src_ptr, [src_ptr+src_stride*2]
- lea ref_ptr, [ref_ptr+ref_stride*2]
-
- movd mm0, DWORD PTR [src_ptr]
- movd mm2, DWORD PTR [ref_ptr]
-
- movd mm3, DWORD PTR [src_ptr+src_stride]
- movd mm6, DWORD PTR [ref_ptr+ref_stride]
-
- punpcklbw mm0, mm3
- punpcklbw mm2, mm6
-
- movd mm3, DWORD PTR [ref_ptr+1]
- movd mm7, DWORD PTR [ref_ptr+2]
-
- psadbw mm2, mm0
-
- paddw mm1, mm2
-
- movd mm2, DWORD PTR [ref_ptr+ref_stride+1]
- movd mm6, DWORD PTR [ref_ptr+ref_stride+2]
-
- punpcklbw mm3, mm2
- punpcklbw mm7, mm6
-
- psadbw mm3, mm0
- psadbw mm7, mm0
-
- paddw mm3, mm4
- paddw mm7, mm5
-
- mov rcx, result_ptr
-
- punpckldq mm1, mm3
-
- movq [rcx], mm1
- movd [rcx+8], mm7
-
- STACK_FRAME_DESTROY_X3
-
-;unsigned int vp8_sad16x16_sse3(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride,
-; int max_sad)
-;%define lddqu movdqu
-global sym(vp8_sad16x16_sse3) PRIVATE
-sym(vp8_sad16x16_sse3):
-
- STACK_FRAME_CREATE_X3
-
- mov end_ptr, 4
- pxor xmm7, xmm7
-
-.vp8_sad16x16_sse3_loop:
- movdqa xmm0, XMMWORD PTR [src_ptr]
- movdqu xmm1, XMMWORD PTR [ref_ptr]
- movdqa xmm2, XMMWORD PTR [src_ptr+src_stride]
- movdqu xmm3, XMMWORD PTR [ref_ptr+ref_stride]
-
- lea src_ptr, [src_ptr+src_stride*2]
- lea ref_ptr, [ref_ptr+ref_stride*2]
-
- movdqa xmm4, XMMWORD PTR [src_ptr]
- movdqu xmm5, XMMWORD PTR [ref_ptr]
- movdqa xmm6, XMMWORD PTR [src_ptr+src_stride]
-
- psadbw xmm0, xmm1
-
- movdqu xmm1, XMMWORD PTR [ref_ptr+ref_stride]
-
- psadbw xmm2, xmm3
- psadbw xmm4, xmm5
- psadbw xmm6, xmm1
-
- lea src_ptr, [src_ptr+src_stride*2]
- lea ref_ptr, [ref_ptr+ref_stride*2]
-
- paddw xmm7, xmm0
- paddw xmm7, xmm2
- paddw xmm7, xmm4
- paddw xmm7, xmm6
-
- sub end_ptr, 1
- jne .vp8_sad16x16_sse3_loop
-
- movq xmm0, xmm7
- psrldq xmm7, 8
- paddw xmm0, xmm7
- movq rax, xmm0
-
- STACK_FRAME_DESTROY_X3
-
-;void vp8_copy32xn_sse3(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *dst_ptr,
-; int dst_stride,
-; int height);
-global sym(vp8_copy32xn_sse3) PRIVATE
-sym(vp8_copy32xn_sse3):
-
- STACK_FRAME_CREATE_X3
-
-.block_copy_sse3_loopx4:
- lea end_ptr, [src_ptr+src_stride*2]
-
- movdqu xmm0, XMMWORD PTR [src_ptr]
- movdqu xmm1, XMMWORD PTR [src_ptr + 16]
- movdqu xmm2, XMMWORD PTR [src_ptr + src_stride]
- movdqu xmm3, XMMWORD PTR [src_ptr + src_stride + 16]
- movdqu xmm4, XMMWORD PTR [end_ptr]
- movdqu xmm5, XMMWORD PTR [end_ptr + 16]
- movdqu xmm6, XMMWORD PTR [end_ptr + src_stride]
- movdqu xmm7, XMMWORD PTR [end_ptr + src_stride + 16]
-
- lea src_ptr, [src_ptr+src_stride*4]
-
- lea end_ptr, [ref_ptr+ref_stride*2]
-
- movdqa XMMWORD PTR [ref_ptr], xmm0
- movdqa XMMWORD PTR [ref_ptr + 16], xmm1
- movdqa XMMWORD PTR [ref_ptr + ref_stride], xmm2
- movdqa XMMWORD PTR [ref_ptr + ref_stride + 16], xmm3
- movdqa XMMWORD PTR [end_ptr], xmm4
- movdqa XMMWORD PTR [end_ptr + 16], xmm5
- movdqa XMMWORD PTR [end_ptr + ref_stride], xmm6
- movdqa XMMWORD PTR [end_ptr + ref_stride + 16], xmm7
-
- lea ref_ptr, [ref_ptr+ref_stride*4]
-
- sub height, 4
- cmp height, 4
- jge .block_copy_sse3_loopx4
-
-    ;Check to see if there are more rows that need to be copied.
- cmp height, 0
- je .copy_is_done
-
-.block_copy_sse3_loop:
- movdqu xmm0, XMMWORD PTR [src_ptr]
- movdqu xmm1, XMMWORD PTR [src_ptr + 16]
- lea src_ptr, [src_ptr+src_stride]
-
- movdqa XMMWORD PTR [ref_ptr], xmm0
- movdqa XMMWORD PTR [ref_ptr + 16], xmm1
- lea ref_ptr, [ref_ptr+ref_stride]
-
- sub height, 1
- jne .block_copy_sse3_loop
-
-.copy_is_done:
- STACK_FRAME_DESTROY_X3
-
-;void vp8_sad16x16x4d_sse3(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr_base,
-; int ref_stride,
-; int *results)
-global sym(vp8_sad16x16x4d_sse3) PRIVATE
-sym(vp8_sad16x16x4d_sse3):
-
- STACK_FRAME_CREATE_X4
-
- PROCESS_16X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
- PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
- PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
- PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
- PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
- PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
- PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
- PROCESS_16X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-
-%if ABI_IS_32BIT
- pop rbp
-%endif
- mov rcx, result_ptr
-
- movq xmm0, xmm4
- psrldq xmm4, 8
-
- paddw xmm0, xmm4
- movd [rcx], xmm0
-;-
- movq xmm0, xmm5
- psrldq xmm5, 8
-
- paddw xmm0, xmm5
- movd [rcx+4], xmm0
-;-
- movq xmm0, xmm6
- psrldq xmm6, 8
-
- paddw xmm0, xmm6
- movd [rcx+8], xmm0
-;-
- movq xmm0, xmm7
- psrldq xmm7, 8
-
- paddw xmm0, xmm7
- movd [rcx+12], xmm0
-
- STACK_FRAME_DESTROY_X4
-
-;void vp8_sad16x8x4d_sse3(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr_base,
-; int ref_stride,
-; int *results)
-global sym(vp8_sad16x8x4d_sse3) PRIVATE
-sym(vp8_sad16x8x4d_sse3):
-
- STACK_FRAME_CREATE_X4
-
- PROCESS_16X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
- PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
- PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
- PROCESS_16X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-
-%if ABI_IS_32BIT
- pop rbp
-%endif
- mov rcx, result_ptr
-
- movq xmm0, xmm4
- psrldq xmm4, 8
-
- paddw xmm0, xmm4
- movd [rcx], xmm0
-;-
- movq xmm0, xmm5
- psrldq xmm5, 8
-
- paddw xmm0, xmm5
- movd [rcx+4], xmm0
-;-
- movq xmm0, xmm6
- psrldq xmm6, 8
-
- paddw xmm0, xmm6
- movd [rcx+8], xmm0
-;-
- movq xmm0, xmm7
- psrldq xmm7, 8
-
- paddw xmm0, xmm7
- movd [rcx+12], xmm0
-
- STACK_FRAME_DESTROY_X4
-
-;void vp8_sad8x16x4d_sse3(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride,
-; int *results)
-global sym(vp8_sad8x16x4d_sse3) PRIVATE
-sym(vp8_sad8x16x4d_sse3):
-
- STACK_FRAME_CREATE_X4
-
- PROCESS_8X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
- PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
- PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
- PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
- PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
- PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
- PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
- PROCESS_8X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-
-%if ABI_IS_32BIT
- pop rbp
-%endif
- mov rcx, result_ptr
-
- punpckldq mm4, mm5
- punpckldq mm6, mm7
-
- movq [rcx], mm4
- movq [rcx+8], mm6
-
- STACK_FRAME_DESTROY_X4
-
-;void vp8_sad8x8x4d_sse3(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride,
-; int *results)
-global sym(vp8_sad8x8x4d_sse3) PRIVATE
-sym(vp8_sad8x8x4d_sse3):
-
- STACK_FRAME_CREATE_X4
-
- PROCESS_8X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
- PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
- PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
- PROCESS_8X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
-
-%if ABI_IS_32BIT
- pop rbp
-%endif
- mov rcx, result_ptr
-
- punpckldq mm4, mm5
- punpckldq mm6, mm7
-
- movq [rcx], mm4
- movq [rcx+8], mm6
-
- STACK_FRAME_DESTROY_X4
-
-;void vp8_sad4x4x4d_sse3(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride,
-; int *results)
-global sym(vp8_sad4x4x4d_sse3) PRIVATE
-sym(vp8_sad4x4x4d_sse3):
-
- STACK_FRAME_CREATE_X4
-
- movd mm0, DWORD PTR [src_ptr]
- movd mm1, DWORD PTR [r0_ptr]
-
- movd mm2, DWORD PTR [src_ptr+src_stride]
- movd mm3, DWORD PTR [r0_ptr+ref_stride]
-
- punpcklbw mm0, mm2
- punpcklbw mm1, mm3
-
- movd mm4, DWORD PTR [r1_ptr]
- movd mm5, DWORD PTR [r2_ptr]
-
- movd mm6, DWORD PTR [r3_ptr]
- movd mm2, DWORD PTR [r1_ptr+ref_stride]
-
- movd mm3, DWORD PTR [r2_ptr+ref_stride]
- movd mm7, DWORD PTR [r3_ptr+ref_stride]
-
- psadbw mm1, mm0
-
- punpcklbw mm4, mm2
- punpcklbw mm5, mm3
-
- punpcklbw mm6, mm7
- psadbw mm4, mm0
-
- psadbw mm5, mm0
- psadbw mm6, mm0
-
-
-
- lea src_ptr, [src_ptr+src_stride*2]
- lea r0_ptr, [r0_ptr+ref_stride*2]
-
- lea r1_ptr, [r1_ptr+ref_stride*2]
- lea r2_ptr, [r2_ptr+ref_stride*2]
-
- lea r3_ptr, [r3_ptr+ref_stride*2]
-
- movd mm0, DWORD PTR [src_ptr]
- movd mm2, DWORD PTR [r0_ptr]
-
- movd mm3, DWORD PTR [src_ptr+src_stride]
- movd mm7, DWORD PTR [r0_ptr+ref_stride]
-
- punpcklbw mm0, mm3
- punpcklbw mm2, mm7
-
- movd mm3, DWORD PTR [r1_ptr]
- movd mm7, DWORD PTR [r2_ptr]
-
- psadbw mm2, mm0
-%if ABI_IS_32BIT
- mov rax, rbp
-
- pop rbp
-%define ref_stride rax
-%endif
- mov rsi, result_ptr
-
- paddw mm1, mm2
- movd [rsi], mm1
-
- movd mm2, DWORD PTR [r1_ptr+ref_stride]
- movd mm1, DWORD PTR [r2_ptr+ref_stride]
-
- punpcklbw mm3, mm2
- punpcklbw mm7, mm1
-
- psadbw mm3, mm0
- psadbw mm7, mm0
-
- movd mm2, DWORD PTR [r3_ptr]
- movd mm1, DWORD PTR [r3_ptr+ref_stride]
-
- paddw mm3, mm4
- paddw mm7, mm5
-
- movd [rsi+4], mm3
- punpcklbw mm2, mm1
-
- movd [rsi+8], mm7
- psadbw mm2, mm0
-
- paddw mm2, mm6
- movd [rsi+12], mm2
-
-
- STACK_FRAME_DESTROY_X4
-
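The x3 and x4d kernels deleted above batch several SAD computations against one source block: the x3 variants score three horizontally adjacent reference positions (ref, ref+1, ref+2), while the x4d variants score four independent reference pointers. In terms of the sad_c sketch above (illustrative only):

/* Four-candidate SAD, matching the shape of vp8_sad16x16x4d_sse3:
 * one source block scored against four reference positions at once. */
static void sad16x16x4d_c(const unsigned char *src, int src_stride,
                          const unsigned char *const ref[4], int ref_stride,
                          int results[4])
{
    int i;

    for (i = 0; i < 4; ++i)
        results[i] = sad_c(src, src_stride, ref[i], ref_stride, 16, 16);
}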
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/x86/sad_sse4.asm b/chromium/third_party/libvpx/source/libvpx/vp8/common/x86/sad_sse4.asm
deleted file mode 100644
index f7fccd77c58..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/x86/sad_sse4.asm
+++ /dev/null
@@ -1,353 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-%macro PROCESS_16X2X8 1
-%if %1
- movdqa xmm0, XMMWORD PTR [rsi]
- movq xmm1, MMWORD PTR [rdi]
- movq xmm3, MMWORD PTR [rdi+8]
- movq xmm2, MMWORD PTR [rdi+16]
- punpcklqdq xmm1, xmm3
- punpcklqdq xmm3, xmm2
-
- movdqa xmm2, xmm1
- mpsadbw xmm1, xmm0, 0x0
- mpsadbw xmm2, xmm0, 0x5
-
- psrldq xmm0, 8
-
- movdqa xmm4, xmm3
- mpsadbw xmm3, xmm0, 0x0
- mpsadbw xmm4, xmm0, 0x5
-
- paddw xmm1, xmm2
- paddw xmm1, xmm3
- paddw xmm1, xmm4
-%else
- movdqa xmm0, XMMWORD PTR [rsi]
- movq xmm5, MMWORD PTR [rdi]
- movq xmm3, MMWORD PTR [rdi+8]
- movq xmm2, MMWORD PTR [rdi+16]
- punpcklqdq xmm5, xmm3
- punpcklqdq xmm3, xmm2
-
- movdqa xmm2, xmm5
- mpsadbw xmm5, xmm0, 0x0
- mpsadbw xmm2, xmm0, 0x5
-
- psrldq xmm0, 8
-
- movdqa xmm4, xmm3
- mpsadbw xmm3, xmm0, 0x0
- mpsadbw xmm4, xmm0, 0x5
-
- paddw xmm5, xmm2
- paddw xmm5, xmm3
- paddw xmm5, xmm4
-
- paddw xmm1, xmm5
-%endif
- movdqa xmm0, XMMWORD PTR [rsi + rax]
- movq xmm5, MMWORD PTR [rdi+ rdx]
- movq xmm3, MMWORD PTR [rdi+ rdx+8]
- movq xmm2, MMWORD PTR [rdi+ rdx+16]
- punpcklqdq xmm5, xmm3
- punpcklqdq xmm3, xmm2
-
- lea rsi, [rsi+rax*2]
- lea rdi, [rdi+rdx*2]
-
- movdqa xmm2, xmm5
- mpsadbw xmm5, xmm0, 0x0
- mpsadbw xmm2, xmm0, 0x5
-
- psrldq xmm0, 8
- movdqa xmm4, xmm3
- mpsadbw xmm3, xmm0, 0x0
- mpsadbw xmm4, xmm0, 0x5
-
- paddw xmm5, xmm2
- paddw xmm5, xmm3
- paddw xmm5, xmm4
-
- paddw xmm1, xmm5
-%endmacro
-
-%macro PROCESS_8X2X8 1
-%if %1
- movq xmm0, MMWORD PTR [rsi]
- movq xmm1, MMWORD PTR [rdi]
- movq xmm3, MMWORD PTR [rdi+8]
- punpcklqdq xmm1, xmm3
-
- movdqa xmm2, xmm1
- mpsadbw xmm1, xmm0, 0x0
- mpsadbw xmm2, xmm0, 0x5
- paddw xmm1, xmm2
-%else
- movq xmm0, MMWORD PTR [rsi]
- movq xmm5, MMWORD PTR [rdi]
- movq xmm3, MMWORD PTR [rdi+8]
- punpcklqdq xmm5, xmm3
-
- movdqa xmm2, xmm5
- mpsadbw xmm5, xmm0, 0x0
- mpsadbw xmm2, xmm0, 0x5
- paddw xmm5, xmm2
-
- paddw xmm1, xmm5
-%endif
- movq xmm0, MMWORD PTR [rsi + rax]
- movq xmm5, MMWORD PTR [rdi+ rdx]
- movq xmm3, MMWORD PTR [rdi+ rdx+8]
- punpcklqdq xmm5, xmm3
-
- lea rsi, [rsi+rax*2]
- lea rdi, [rdi+rdx*2]
-
- movdqa xmm2, xmm5
- mpsadbw xmm5, xmm0, 0x0
- mpsadbw xmm2, xmm0, 0x5
- paddw xmm5, xmm2
-
- paddw xmm1, xmm5
-%endmacro
-
-%macro PROCESS_4X2X8 1
-%if %1
- movd xmm0, [rsi]
- movq xmm1, MMWORD PTR [rdi]
- movq xmm3, MMWORD PTR [rdi+8]
- punpcklqdq xmm1, xmm3
-
- mpsadbw xmm1, xmm0, 0x0
-%else
- movd xmm0, [rsi]
- movq xmm5, MMWORD PTR [rdi]
- movq xmm3, MMWORD PTR [rdi+8]
- punpcklqdq xmm5, xmm3
-
- mpsadbw xmm5, xmm0, 0x0
-
- paddw xmm1, xmm5
-%endif
- movd xmm0, [rsi + rax]
- movq xmm5, MMWORD PTR [rdi+ rdx]
- movq xmm3, MMWORD PTR [rdi+ rdx+8]
- punpcklqdq xmm5, xmm3
-
- lea rsi, [rsi+rax*2]
- lea rdi, [rdi+rdx*2]
-
- mpsadbw xmm5, xmm0, 0x0
-
- paddw xmm1, xmm5
-%endmacro
-
-
-;void vp8_sad16x16x8_sse4(
-; const unsigned char *src_ptr,
-; int src_stride,
-; const unsigned char *ref_ptr,
-; int ref_stride,
-; unsigned short *sad_array);
-global sym(vp8_sad16x16x8_sse4) PRIVATE
-sym(vp8_sad16x16x8_sse4):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;ref_ptr
-
- movsxd rax, dword ptr arg(1) ;src_stride
- movsxd rdx, dword ptr arg(3) ;ref_stride
-
- PROCESS_16X2X8 1
- PROCESS_16X2X8 0
- PROCESS_16X2X8 0
- PROCESS_16X2X8 0
- PROCESS_16X2X8 0
- PROCESS_16X2X8 0
- PROCESS_16X2X8 0
- PROCESS_16X2X8 0
-
- mov rdi, arg(4) ;Results
- movdqa XMMWORD PTR [rdi], xmm1
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vp8_sad16x8x8_sse4(
-; const unsigned char *src_ptr,
-; int src_stride,
-; const unsigned char *ref_ptr,
-; int ref_stride,
-; unsigned short *sad_array
-;);
-global sym(vp8_sad16x8x8_sse4) PRIVATE
-sym(vp8_sad16x8x8_sse4):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;ref_ptr
-
- movsxd rax, dword ptr arg(1) ;src_stride
- movsxd rdx, dword ptr arg(3) ;ref_stride
-
- PROCESS_16X2X8 1
- PROCESS_16X2X8 0
- PROCESS_16X2X8 0
- PROCESS_16X2X8 0
-
- mov rdi, arg(4) ;Results
- movdqa XMMWORD PTR [rdi], xmm1
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vp8_sad8x8x8_sse4(
-; const unsigned char *src_ptr,
-; int src_stride,
-; const unsigned char *ref_ptr,
-; int ref_stride,
-; unsigned short *sad_array
-;);
-global sym(vp8_sad8x8x8_sse4) PRIVATE
-sym(vp8_sad8x8x8_sse4):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;ref_ptr
-
- movsxd rax, dword ptr arg(1) ;src_stride
- movsxd rdx, dword ptr arg(3) ;ref_stride
-
- PROCESS_8X2X8 1
- PROCESS_8X2X8 0
- PROCESS_8X2X8 0
- PROCESS_8X2X8 0
-
- mov rdi, arg(4) ;Results
- movdqa XMMWORD PTR [rdi], xmm1
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vp8_sad8x16x8_sse4(
-; const unsigned char *src_ptr,
-; int src_stride,
-; const unsigned char *ref_ptr,
-; int ref_stride,
-; unsigned short *sad_array
-;);
-global sym(vp8_sad8x16x8_sse4) PRIVATE
-sym(vp8_sad8x16x8_sse4):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;ref_ptr
-
- movsxd rax, dword ptr arg(1) ;src_stride
- movsxd rdx, dword ptr arg(3) ;ref_stride
-
- PROCESS_8X2X8 1
- PROCESS_8X2X8 0
- PROCESS_8X2X8 0
- PROCESS_8X2X8 0
- PROCESS_8X2X8 0
- PROCESS_8X2X8 0
- PROCESS_8X2X8 0
- PROCESS_8X2X8 0
- mov rdi, arg(4) ;Results
- movdqa XMMWORD PTR [rdi], xmm1
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vp8_sad4x4x8_sse4(
-; const unsigned char *src_ptr,
-; int src_stride,
-; const unsigned char *ref_ptr,
-; int ref_stride,
-; unsigned short *sad_array
-;);
-global sym(vp8_sad4x4x8_sse4) PRIVATE
-sym(vp8_sad4x4x8_sse4):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;ref_ptr
-
- movsxd rax, dword ptr arg(1) ;src_stride
- movsxd rdx, dword ptr arg(3) ;ref_stride
-
- PROCESS_4X2X8 1
- PROCESS_4X2X8 0
-
- mov rdi, arg(4) ;Results
- movdqa XMMWORD PTR [rdi], xmm1
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-
-
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/x86/vp8_asm_stubs.c b/chromium/third_party/libvpx/source/libvpx/vp8/common/x86/vp8_asm_stubs.c
index b4092938161..fb0b57eb1c1 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/x86/vp8_asm_stubs.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/x86/vp8_asm_stubs.c
@@ -127,7 +127,7 @@ void vp8_sixtap_predict4x4_mmx
int dst_pitch
)
{
-    DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 16*16);  /* Temp data buffer used in filtering */
+    DECLARE_ALIGNED(16, unsigned short, FData2[16*16]);  /* Temp data buffer used in filtering */
const short *HFilter, *VFilter;
HFilter = vp8_six_tap_mmx[xoffset];
vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 9, 8, HFilter);
@@ -148,7 +148,7 @@ void vp8_sixtap_predict16x16_mmx
)
{
-    DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 24*24);  /* Temp data buffer used in filtering */
+    DECLARE_ALIGNED(16, unsigned short, FData2[24*24]);  /* Temp data buffer used in filtering */
const short *HFilter, *VFilter;
@@ -180,7 +180,7 @@ void vp8_sixtap_predict8x8_mmx
)
{
-    DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 256);  /* Temp data buffer used in filtering */
+    DECLARE_ALIGNED(16, unsigned short, FData2[256]);  /* Temp data buffer used in filtering */
const short *HFilter, *VFilter;
@@ -206,7 +206,7 @@ void vp8_sixtap_predict8x4_mmx
)
{
-    DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 256);  /* Temp data buffer used in filtering */
+    DECLARE_ALIGNED(16, unsigned short, FData2[256]);  /* Temp data buffer used in filtering */
const short *HFilter, *VFilter;
@@ -252,7 +252,7 @@ void vp8_sixtap_predict16x16_sse2
)
{
-    DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 24*24);  /* Temp data buffer used in filtering */
+    DECLARE_ALIGNED(16, unsigned short, FData2[24*24]);  /* Temp data buffer used in filtering */
const short *HFilter, *VFilter;
@@ -292,7 +292,7 @@ void vp8_sixtap_predict8x8_sse2
int dst_pitch
)
{
-    DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 256);  /* Temp data buffer used in filtering */
+    DECLARE_ALIGNED(16, unsigned short, FData2[256]);  /* Temp data buffer used in filtering */
const short *HFilter, *VFilter;
if (xoffset)
@@ -330,7 +330,7 @@ void vp8_sixtap_predict8x4_sse2
int dst_pitch
)
{
-    DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 256);  /* Temp data buffer used in filtering */
+    DECLARE_ALIGNED(16, unsigned short, FData2[256]);  /* Temp data buffer used in filtering */
const short *HFilter, *VFilter;
if (xoffset)
@@ -432,7 +432,7 @@ void vp8_sixtap_predict16x16_ssse3
)
{
- DECLARE_ALIGNED_ARRAY(16, unsigned char, FData2, 24*24);
+ DECLARE_ALIGNED(16, unsigned char, FData2[24*24]);
if (xoffset)
{
@@ -480,7 +480,7 @@ void vp8_sixtap_predict8x8_ssse3
int dst_pitch
)
{
- DECLARE_ALIGNED_ARRAY(16, unsigned char, FData2, 256);
+ DECLARE_ALIGNED(16, unsigned char, FData2[256]);
if (xoffset)
{
@@ -528,7 +528,7 @@ void vp8_sixtap_predict8x4_ssse3
int dst_pitch
)
{
- DECLARE_ALIGNED_ARRAY(16, unsigned char, FData2, 256);
+ DECLARE_ALIGNED(16, unsigned char, FData2[256]);
if (xoffset)
{
@@ -576,7 +576,7 @@ void vp8_sixtap_predict4x4_ssse3
int dst_pitch
)
{
- DECLARE_ALIGNED_ARRAY(16, unsigned char, FData2, 4*9);
+ DECLARE_ALIGNED(16, unsigned char, FData2[4*9]);
if (xoffset)
{
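The stub hunks above migrate from DECLARE_ALIGNED_ARRAY, which took the element count as a separate fourth argument, to DECLARE_ALIGNED, which takes an ordinary array declarator. A sketch of the gcc/clang spelling of the macro, stated as an assumption about vpx_ports/mem.h rather than a quotation of it:

/* Assumed gcc/clang form; MSVC builds spell it with __declspec(align(n)). */
#define DECLARE_ALIGNED(n, typ, val) typ val __attribute__((aligned(n)))

void filter_stub_example(void)
{
    /* 16-byte-aligned temporary, as in the sixtap stubs above. */
    DECLARE_ALIGNED(16, unsigned short, FData2[24 * 24]);
    (void)FData2;
}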
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/decoder/decodeframe.c b/chromium/third_party/libvpx/source/libvpx/vp8/decoder/decodeframe.c
index e7cf0d9b9c6..fb300fe8827 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/decoder/decodeframe.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/decoder/decodeframe.c
@@ -101,6 +101,8 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
int i;
#if CONFIG_ERROR_CONCEALMENT
int corruption_detected = 0;
+#else
+ (void)mb_idx;
#endif
if (xd->mode_info_context->mbmi.mb_skip_coeff)
@@ -140,7 +142,7 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
* Better to use the predictor as reconstruction.
*/
pbi->frame_corrupt_residual = 1;
- vpx_memset(xd->qcoeff, 0, sizeof(xd->qcoeff));
+ memset(xd->qcoeff, 0, sizeof(xd->qcoeff));
vp8_conceal_corrupt_mb(xd);
@@ -149,7 +151,7 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
/* force idct to be skipped for B_PRED and use the
* prediction only for reconstruction
* */
- vpx_memset(xd->eobs, 0, 25);
+ memset(xd->eobs, 0, 25);
}
}
#endif
@@ -182,7 +184,7 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
/* clear out residual eob info */
if(xd->mode_info_context->mbmi.mb_skip_coeff)
- vpx_memset(xd->eobs, 0, 25);
+ memset(xd->eobs, 0, 25);
intra_prediction_down_copy(xd, xd->recon_above[0] + 16);
@@ -212,7 +214,7 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
(b->qcoeff[0] * DQC[0],
dst, dst_stride,
dst, dst_stride);
- vpx_memset(b->qcoeff, 0, 2 * sizeof(b->qcoeff[0]));
+ memset(b->qcoeff, 0, 2 * sizeof(b->qcoeff[0]));
}
}
}
@@ -249,14 +251,14 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
vp8_short_inv_walsh4x4(&b->dqcoeff[0],
xd->qcoeff);
- vpx_memset(b->qcoeff, 0, 16 * sizeof(b->qcoeff[0]));
+ memset(b->qcoeff, 0, 16 * sizeof(b->qcoeff[0]));
}
else
{
b->dqcoeff[0] = b->qcoeff[0] * xd->dequant_y2[0];
vp8_short_inv_walsh4x4_1(&b->dqcoeff[0],
xd->qcoeff);
- vpx_memset(b->qcoeff, 0, 2 * sizeof(b->qcoeff[0]));
+ memset(b->qcoeff, 0, 2 * sizeof(b->qcoeff[0]));
}
/* override the dc dequant constant in order to preserve the
@@ -321,7 +323,7 @@ static void yv12_extend_frame_top_c(YV12_BUFFER_CONFIG *ybf)
for (i = 0; i < (int)Border; i++)
{
- vpx_memcpy(dest_ptr1, src_ptr1, plane_stride);
+ memcpy(dest_ptr1, src_ptr1, plane_stride);
dest_ptr1 += plane_stride;
}
@@ -336,7 +338,7 @@ static void yv12_extend_frame_top_c(YV12_BUFFER_CONFIG *ybf)
for (i = 0; i < (int)(Border); i++)
{
- vpx_memcpy(dest_ptr1, src_ptr1, plane_stride);
+ memcpy(dest_ptr1, src_ptr1, plane_stride);
dest_ptr1 += plane_stride;
}
@@ -349,7 +351,7 @@ static void yv12_extend_frame_top_c(YV12_BUFFER_CONFIG *ybf)
for (i = 0; i < (int)(Border); i++)
{
- vpx_memcpy(dest_ptr1, src_ptr1, plane_stride);
+ memcpy(dest_ptr1, src_ptr1, plane_stride);
dest_ptr1 += plane_stride;
}
}
@@ -377,7 +379,7 @@ static void yv12_extend_frame_bottom_c(YV12_BUFFER_CONFIG *ybf)
for (i = 0; i < (int)Border; i++)
{
- vpx_memcpy(dest_ptr2, src_ptr2, plane_stride);
+ memcpy(dest_ptr2, src_ptr2, plane_stride);
dest_ptr2 += plane_stride;
}
@@ -395,7 +397,7 @@ static void yv12_extend_frame_bottom_c(YV12_BUFFER_CONFIG *ybf)
for (i = 0; i < (int)(Border); i++)
{
- vpx_memcpy(dest_ptr2, src_ptr2, plane_stride);
+ memcpy(dest_ptr2, src_ptr2, plane_stride);
dest_ptr2 += plane_stride;
}
@@ -409,7 +411,7 @@ static void yv12_extend_frame_bottom_c(YV12_BUFFER_CONFIG *ybf)
for (i = 0; i < (int)(Border); i++)
{
- vpx_memcpy(dest_ptr2, src_ptr2, plane_stride);
+ memcpy(dest_ptr2, src_ptr2, plane_stride);
dest_ptr2 += plane_stride;
}
}
@@ -444,8 +446,8 @@ static void yv12_extend_frame_left_right_c(YV12_BUFFER_CONFIG *ybf,
for (i = 0; i < plane_height; i++)
{
- vpx_memset(dest_ptr1, src_ptr1[0], Border);
- vpx_memset(dest_ptr2, src_ptr2[0], Border);
+ memset(dest_ptr1, src_ptr1[0], Border);
+ memset(dest_ptr2, src_ptr2[0], Border);
src_ptr1 += plane_stride;
src_ptr2 += plane_stride;
dest_ptr1 += plane_stride;
@@ -468,8 +470,8 @@ static void yv12_extend_frame_left_right_c(YV12_BUFFER_CONFIG *ybf,
for (i = 0; i < plane_height; i++)
{
- vpx_memset(dest_ptr1, src_ptr1[0], Border);
- vpx_memset(dest_ptr2, src_ptr2[0], Border);
+ memset(dest_ptr1, src_ptr1[0], Border);
+ memset(dest_ptr2, src_ptr2[0], Border);
src_ptr1 += plane_stride;
src_ptr2 += plane_stride;
dest_ptr1 += plane_stride;
@@ -488,8 +490,8 @@ static void yv12_extend_frame_left_right_c(YV12_BUFFER_CONFIG *ybf,
for (i = 0; i < plane_height; i++)
{
- vpx_memset(dest_ptr1, src_ptr1[0], Border);
- vpx_memset(dest_ptr2, src_ptr2[0], Border);
+ memset(dest_ptr1, src_ptr1[0], Border);
+ memset(dest_ptr2, src_ptr2[0], Border);
src_ptr1 += plane_stride;
src_ptr2 += plane_stride;
dest_ptr1 += plane_stride;
@@ -566,7 +568,7 @@ static void decode_mb_rows(VP8D_COMP *pbi)
/* reset contexts */
xd->above_context = pc->above_context;
- vpx_memset(xd->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES));
+ memset(xd->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES));
xd->left_available = 0;
@@ -916,19 +918,19 @@ static void init_frame(VP8D_COMP *pbi)
if (pc->frame_type == KEY_FRAME)
{
/* Various keyframe initializations */
- vpx_memcpy(pc->fc.mvc, vp8_default_mv_context, sizeof(vp8_default_mv_context));
+ memcpy(pc->fc.mvc, vp8_default_mv_context, sizeof(vp8_default_mv_context));
vp8_init_mbmode_probs(pc);
vp8_default_coef_probs(pc);
/* reset the segment feature data to 0 with delta coding (Default state). */
- vpx_memset(xd->segment_feature_data, 0, sizeof(xd->segment_feature_data));
+ memset(xd->segment_feature_data, 0, sizeof(xd->segment_feature_data));
xd->mb_segement_abs_delta = SEGMENT_DELTADATA;
/* reset the mode ref deltas for loop filter */
- vpx_memset(xd->ref_lf_deltas, 0, sizeof(xd->ref_lf_deltas));
- vpx_memset(xd->mode_lf_deltas, 0, sizeof(xd->mode_lf_deltas));
+ memset(xd->ref_lf_deltas, 0, sizeof(xd->ref_lf_deltas));
+ memset(xd->mode_lf_deltas, 0, sizeof(xd->mode_lf_deltas));
/* All buffers are implicitly updated on key frames. */
pc->refresh_golden_frame = 1;
@@ -1067,12 +1069,11 @@ int vp8_decode_frame(VP8D_COMP *pbi)
pc->vert_scale = clear[6] >> 6;
}
data += 7;
- clear += 7;
}
else
{
- vpx_memcpy(&xd->pre, yv12_fb_new, sizeof(YV12_BUFFER_CONFIG));
- vpx_memcpy(&xd->dst, yv12_fb_new, sizeof(YV12_BUFFER_CONFIG));
+ memcpy(&xd->pre, yv12_fb_new, sizeof(YV12_BUFFER_CONFIG));
+ memcpy(&xd->dst, yv12_fb_new, sizeof(YV12_BUFFER_CONFIG));
}
}
if ((!pbi->decoded_key_frame && pc->frame_type != KEY_FRAME))
@@ -1104,7 +1105,7 @@ int vp8_decode_frame(VP8D_COMP *pbi)
{
xd->mb_segement_abs_delta = (unsigned char)vp8_read_bit(bc);
- vpx_memset(xd->segment_feature_data, 0, sizeof(xd->segment_feature_data));
+ memset(xd->segment_feature_data, 0, sizeof(xd->segment_feature_data));
/* For each segmentation feature (Quant and loop filter level) */
for (i = 0; i < MB_LVL_MAX; i++)
@@ -1128,7 +1129,7 @@ int vp8_decode_frame(VP8D_COMP *pbi)
if (xd->update_mb_segmentation_map)
{
/* Which macro block level features are enabled */
- vpx_memset(xd->mb_segment_tree_probs, 255, sizeof(xd->mb_segment_tree_probs));
+ memset(xd->mb_segment_tree_probs, 255, sizeof(xd->mb_segment_tree_probs));
/* Read the probs used to decode the segment id for each macro block. */
for (i = 0; i < MB_FEATURE_TREE_PROBS; i++)
@@ -1277,7 +1278,7 @@ int vp8_decode_frame(VP8D_COMP *pbi)
#endif
if (pc->refresh_entropy_probs == 0)
{
- vpx_memcpy(&pc->lfc, &pc->fc, sizeof(pc->fc));
+ memcpy(&pc->lfc, &pc->fc, sizeof(pc->fc));
}
pc->refresh_last_frame = pc->frame_type == KEY_FRAME || vp8_read_bit(bc);
@@ -1326,7 +1327,7 @@ int vp8_decode_frame(VP8D_COMP *pbi)
}
/* clear out the coeff buffer */
- vpx_memset(xd->qcoeff, 0, sizeof(xd->qcoeff));
+ memset(xd->qcoeff, 0, sizeof(xd->qcoeff));
vp8_decode_mode_mvs(pbi);
@@ -1340,7 +1341,7 @@ int vp8_decode_frame(VP8D_COMP *pbi)
}
#endif
- vpx_memset(pc->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES) * pc->mb_cols);
+ memset(pc->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES) * pc->mb_cols);
pbi->frame_corrupt_residual = 0;
#if CONFIG_MULTITHREAD
@@ -1379,7 +1380,7 @@ int vp8_decode_frame(VP8D_COMP *pbi)
if (pc->refresh_entropy_probs == 0)
{
- vpx_memcpy(&pc->fc, &pc->lfc, sizeof(pc->fc));
+ memcpy(&pc->fc, &pc->lfc, sizeof(pc->fc));
pbi->independent_partitions = prev_independent_partitions;
}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/decoder/decodemv.c b/chromium/third_party/libvpx/source/libvpx/vp8/decoder/decodemv.c
index 35a22c7de5b..1d155e7e16d 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/decoder/decodemv.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/decoder/decodemv.c
@@ -591,6 +591,8 @@ static void read_mb_features(vp8_reader *r, MB_MODE_INFO *mi, MACROBLOCKD *x)
static void decode_mb_mode_mvs(VP8D_COMP *pbi, MODE_INFO *mi,
MB_MODE_INFO *mbmi)
{
+ (void)mbmi;
+
/* Read the Macroblock segmentation map if it is being updated explicitly
* this frame (reset to 0 above by default)
* By default on a key frame reset all MBs to segment 0
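Several hunks in this commit add (void) casts like the one above. This is the standard C idiom for marking a parameter as deliberately unused, typically to silence -Wunused-parameter when the parameter is only referenced under some build configuration. A minimal sketch with illustrative names:

static int concealment_helper(int mb_idx, int fallback)
{
#if CONFIG_ERROR_CONCEALMENT
    return mb_idx;      /* parameter is used in this configuration */
#else
    (void)mb_idx;       /* unused here; the cast suppresses the warning */
    return fallback;
#endif
}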
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/decoder/detokenize.c b/chromium/third_party/libvpx/source/libvpx/vp8/decoder/detokenize.c
index 452ff6cba3a..fcc7533c50f 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/decoder/detokenize.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/decoder/detokenize.c
@@ -20,8 +20,8 @@ void vp8_reset_mb_tokens_context(MACROBLOCKD *x)
ENTROPY_CONTEXT *a_ctx = ((ENTROPY_CONTEXT *)x->above_context);
ENTROPY_CONTEXT *l_ctx = ((ENTROPY_CONTEXT *)x->left_context);
- vpx_memset(a_ctx, 0, sizeof(ENTROPY_CONTEXT_PLANES)-1);
- vpx_memset(l_ctx, 0, sizeof(ENTROPY_CONTEXT_PLANES)-1);
+ memset(a_ctx, 0, sizeof(ENTROPY_CONTEXT_PLANES)-1);
+ memset(l_ctx, 0, sizeof(ENTROPY_CONTEXT_PLANES)-1);
/* Clear entropy contexts for Y2 blocks */
if (!x->mode_info_context->mbmi.is_4x4)
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/decoder/error_concealment.c b/chromium/third_party/libvpx/source/libvpx/vp8/decoder/error_concealment.c
index 4b304c83c78..bb6d443c475 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/decoder/error_concealment.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/decoder/error_concealment.c
@@ -350,7 +350,7 @@ static void estimate_missing_mvs(MB_OVERLAP *overlaps,
unsigned int first_corrupt)
{
int mb_row, mb_col;
- vpx_memset(overlaps, 0, sizeof(MB_OVERLAP) * mb_rows * mb_cols);
+ memset(overlaps, 0, sizeof(MB_OVERLAP) * mb_rows * mb_cols);
/* First calculate the overlaps for all blocks */
for (mb_row = 0; mb_row < mb_rows; ++mb_row)
{
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/decoder/onyxd_if.c b/chromium/third_party/libvpx/source/libvpx/vp8/decoder/onyxd_if.c
index 1d763b6bfae..d7b8c76dc26 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/decoder/onyxd_if.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/decoder/onyxd_if.c
@@ -58,7 +58,7 @@ static struct VP8D_COMP * create_decompressor(VP8D_CONFIG *oxcf)
if (!pbi)
return NULL;
- vpx_memset(pbi, 0, sizeof(VP8D_COMP));
+ memset(pbi, 0, sizeof(VP8D_COMP));
if (setjmp(pbi->common.error.jmp))
{
@@ -87,6 +87,7 @@ static struct VP8D_COMP * create_decompressor(VP8D_CONFIG *oxcf)
pbi->ec_enabled = oxcf->error_concealment;
pbi->overlaps = NULL;
#else
+ (void)oxcf;
pbi->ec_enabled = 0;
#endif
/* Error concealment is activated after a key frame has been
@@ -303,6 +304,8 @@ int vp8dx_receive_compressed_data(VP8D_COMP *pbi, size_t size,
{
VP8_COMMON *cm = &pbi->common;
int retcode = -1;
+ (void)size;
+ (void)source;
pbi->common.error.error_code = VPX_CODEC_OK;
@@ -407,6 +410,7 @@ int vp8dx_get_raw_frame(VP8D_COMP *pbi, YV12_BUFFER_CONFIG *sd, int64_t *time_st
#if CONFIG_POSTPROC
ret = vp8_post_proc_frame(&pbi->common, sd, flags);
#else
+ (void)flags;
if (pbi->common.frame_to_show)
{
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/decoder/threading.c b/chromium/third_party/libvpx/source/libvpx/vp8/decoder/threading.c
index fe290cffec7..6801532f118 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/decoder/threading.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/decoder/threading.c
@@ -60,12 +60,12 @@ static void setup_decoding_thread_data(VP8D_COMP *pbi, MACROBLOCKD *xd, MB_ROW_D
mbd->segmentation_enabled = xd->segmentation_enabled;
mbd->mb_segement_abs_delta = xd->mb_segement_abs_delta;
- vpx_memcpy(mbd->segment_feature_data, xd->segment_feature_data, sizeof(xd->segment_feature_data));
+ memcpy(mbd->segment_feature_data, xd->segment_feature_data, sizeof(xd->segment_feature_data));
/*signed char ref_lf_deltas[MAX_REF_LF_DELTAS];*/
- vpx_memcpy(mbd->ref_lf_deltas, xd->ref_lf_deltas, sizeof(xd->ref_lf_deltas));
+ memcpy(mbd->ref_lf_deltas, xd->ref_lf_deltas, sizeof(xd->ref_lf_deltas));
/*signed char mode_lf_deltas[MAX_MODE_LF_DELTAS];*/
- vpx_memcpy(mbd->mode_lf_deltas, xd->mode_lf_deltas, sizeof(xd->mode_lf_deltas));
+ memcpy(mbd->mode_lf_deltas, xd->mode_lf_deltas, sizeof(xd->mode_lf_deltas));
/*unsigned char mode_ref_lf_delta_enabled;
unsigned char mode_ref_lf_delta_update;*/
mbd->mode_ref_lf_delta_enabled = xd->mode_ref_lf_delta_enabled;
@@ -73,10 +73,10 @@ static void setup_decoding_thread_data(VP8D_COMP *pbi, MACROBLOCKD *xd, MB_ROW_D
mbd->current_bc = &pbi->mbc[0];
- vpx_memcpy(mbd->dequant_y1_dc, xd->dequant_y1_dc, sizeof(xd->dequant_y1_dc));
- vpx_memcpy(mbd->dequant_y1, xd->dequant_y1, sizeof(xd->dequant_y1));
- vpx_memcpy(mbd->dequant_y2, xd->dequant_y2, sizeof(xd->dequant_y2));
- vpx_memcpy(mbd->dequant_uv, xd->dequant_uv, sizeof(xd->dequant_uv));
+ memcpy(mbd->dequant_y1_dc, xd->dequant_y1_dc, sizeof(xd->dequant_y1_dc));
+ memcpy(mbd->dequant_y1, xd->dequant_y1, sizeof(xd->dequant_y1));
+ memcpy(mbd->dequant_y2, xd->dequant_y2, sizeof(xd->dequant_y2));
+ memcpy(mbd->dequant_uv, xd->dequant_uv, sizeof(xd->dequant_uv));
mbd->fullpixel_mask = 0xffffffff;
@@ -96,6 +96,8 @@ static void mt_decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
int i;
#if CONFIG_ERROR_CONCEALMENT
int corruption_detected = 0;
+#else
+ (void)mb_idx;
#endif
if (xd->mode_info_context->mbmi.mb_skip_coeff)
@@ -135,7 +137,7 @@ static void mt_decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
* Better to use the predictor as reconstruction.
*/
pbi->frame_corrupt_residual = 1;
- vpx_memset(xd->qcoeff, 0, sizeof(xd->qcoeff));
+ memset(xd->qcoeff, 0, sizeof(xd->qcoeff));
vp8_conceal_corrupt_mb(xd);
@@ -144,7 +146,7 @@ static void mt_decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
/* force idct to be skipped for B_PRED and use the
* prediction only for reconstruction
* */
- vpx_memset(xd->eobs, 0, 25);
+ memset(xd->eobs, 0, 25);
}
}
#endif
@@ -177,7 +179,7 @@ static void mt_decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
/* clear out residual eob info */
if(xd->mode_info_context->mbmi.mb_skip_coeff)
- vpx_memset(xd->eobs, 0, 25);
+ memset(xd->eobs, 0, 25);
intra_prediction_down_copy(xd, xd->recon_above[0] + 16);
@@ -227,7 +229,7 @@ static void mt_decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
{
vp8_dc_only_idct_add(b->qcoeff[0] * DQC[0],
dst, dst_stride, dst, dst_stride);
- vpx_memset(b->qcoeff, 0, 2 * sizeof(b->qcoeff[0]));
+ memset(b->qcoeff, 0, 2 * sizeof(b->qcoeff[0]));
}
}
}
@@ -264,14 +266,14 @@ static void mt_decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
vp8_short_inv_walsh4x4(&b->dqcoeff[0],
xd->qcoeff);
- vpx_memset(b->qcoeff, 0, 16 * sizeof(b->qcoeff[0]));
+ memset(b->qcoeff, 0, 16 * sizeof(b->qcoeff[0]));
}
else
{
b->dqcoeff[0] = b->qcoeff[0] * xd->dequant_y2[0];
vp8_short_inv_walsh4x4_1(&b->dqcoeff[0],
xd->qcoeff);
- vpx_memset(b->qcoeff, 0, 2 * sizeof(b->qcoeff[0]));
+ memset(b->qcoeff, 0, 2 * sizeof(b->qcoeff[0]));
}
/* override the dc dequant constant in order to preserve the
@@ -358,7 +360,7 @@ static void mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd, int start_mb_row)
/* reset contexts */
xd->above_context = pc->above_context;
- vpx_memset(xd->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES));
+ memset(xd->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES));
xd->left_available = 0;
@@ -497,9 +499,9 @@ static void mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd, int start_mb_row)
if( mb_row != pc->mb_rows-1 )
{
/* Save decoded MB last row data for next-row decoding */
- vpx_memcpy((pbi->mt_yabove_row[mb_row + 1] + 32 + mb_col*16), (xd->dst.y_buffer + 15 * recon_y_stride), 16);
- vpx_memcpy((pbi->mt_uabove_row[mb_row + 1] + 16 + mb_col*8), (xd->dst.u_buffer + 7 * recon_uv_stride), 8);
- vpx_memcpy((pbi->mt_vabove_row[mb_row + 1] + 16 + mb_col*8), (xd->dst.v_buffer + 7 * recon_uv_stride), 8);
+ memcpy((pbi->mt_yabove_row[mb_row + 1] + 32 + mb_col*16), (xd->dst.y_buffer + 15 * recon_y_stride), 16);
+ memcpy((pbi->mt_uabove_row[mb_row + 1] + 16 + mb_col*8), (xd->dst.u_buffer + 7 * recon_uv_stride), 8);
+ memcpy((pbi->mt_vabove_row[mb_row + 1] + 16 + mb_col*8), (xd->dst.v_buffer + 7 * recon_uv_stride), 8);
}
/* save left_col for next MB decoding */
@@ -874,23 +876,23 @@ void vp8mt_decode_mb_rows( VP8D_COMP *pbi, MACROBLOCKD *xd)
if (filter_level)
{
/* Set above_row buffer to 127 for decoding first MB row */
- vpx_memset(pbi->mt_yabove_row[0] + VP8BORDERINPIXELS-1, 127, yv12_fb_new->y_width + 5);
- vpx_memset(pbi->mt_uabove_row[0] + (VP8BORDERINPIXELS>>1)-1, 127, (yv12_fb_new->y_width>>1) +5);
- vpx_memset(pbi->mt_vabove_row[0] + (VP8BORDERINPIXELS>>1)-1, 127, (yv12_fb_new->y_width>>1) +5);
+ memset(pbi->mt_yabove_row[0] + VP8BORDERINPIXELS-1, 127, yv12_fb_new->y_width + 5);
+ memset(pbi->mt_uabove_row[0] + (VP8BORDERINPIXELS>>1)-1, 127, (yv12_fb_new->y_width>>1) +5);
+ memset(pbi->mt_vabove_row[0] + (VP8BORDERINPIXELS>>1)-1, 127, (yv12_fb_new->y_width>>1) +5);
for (j=1; j<pc->mb_rows; j++)
{
- vpx_memset(pbi->mt_yabove_row[j] + VP8BORDERINPIXELS-1, (unsigned char)129, 1);
- vpx_memset(pbi->mt_uabove_row[j] + (VP8BORDERINPIXELS>>1)-1, (unsigned char)129, 1);
- vpx_memset(pbi->mt_vabove_row[j] + (VP8BORDERINPIXELS>>1)-1, (unsigned char)129, 1);
+ memset(pbi->mt_yabove_row[j] + VP8BORDERINPIXELS-1, (unsigned char)129, 1);
+ memset(pbi->mt_uabove_row[j] + (VP8BORDERINPIXELS>>1)-1, (unsigned char)129, 1);
+ memset(pbi->mt_vabove_row[j] + (VP8BORDERINPIXELS>>1)-1, (unsigned char)129, 1);
}
/* Set left_col to 129 initially */
for (j=0; j<pc->mb_rows; j++)
{
- vpx_memset(pbi->mt_yleft_col[j], (unsigned char)129, 16);
- vpx_memset(pbi->mt_uleft_col[j], (unsigned char)129, 8);
- vpx_memset(pbi->mt_vleft_col[j], (unsigned char)129, 8);
+ memset(pbi->mt_yleft_col[j], (unsigned char)129, 16);
+ memset(pbi->mt_uleft_col[j], (unsigned char)129, 8);
+ memset(pbi->mt_vleft_col[j], (unsigned char)129, 8);
}
/* Initialize the loop filter for this frame. */
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/arm/armv5te/boolhuff_armv5te.asm b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/arm/armv5te/boolhuff_armv5te.asm
deleted file mode 100644
index 4abe818f188..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/arm/armv5te/boolhuff_armv5te.asm
+++ /dev/null
@@ -1,310 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_start_encode|
- EXPORT |vp8_encode_bool|
- EXPORT |vp8_stop_encode|
- EXPORT |vp8_encode_value|
- IMPORT |vp8_validate_buffer_arm|
-
- INCLUDE vp8_asm_enc_offsets.asm
-
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA |.text|, CODE, READONLY
-
- ; macro for validating write buffer position
- ; needs vp8_writer in r0
- ; start shall not be in r1
- MACRO
- VALIDATE_POS $start, $pos
- push {r0-r3, r12, lr} ; rest of regs are preserved by subroutine call
- ldr r2, [r0, #vp8_writer_buffer_end]
- ldr r3, [r0, #vp8_writer_error]
- mov r1, $pos
- mov r0, $start
- bl vp8_validate_buffer_arm
- pop {r0-r3, r12, lr}
- MEND
-
-; r0 BOOL_CODER *br
-; r1 unsigned char *source
-; r2 unsigned char *source_end
-|vp8_start_encode| PROC
- str r2, [r0, #vp8_writer_buffer_end]
- mov r12, #0
- mov r3, #255
- mvn r2, #23
- str r12, [r0, #vp8_writer_lowvalue]
- str r3, [r0, #vp8_writer_range]
- str r2, [r0, #vp8_writer_count]
- str r12, [r0, #vp8_writer_pos]
- str r1, [r0, #vp8_writer_buffer]
- bx lr
- ENDP
-
-; r0 BOOL_CODER *br
-; r1 int bit
-; r2 int probability
-|vp8_encode_bool| PROC
- push {r4-r10, lr}
-
- mov r4, r2
-
- ldr r2, [r0, #vp8_writer_lowvalue]
- ldr r5, [r0, #vp8_writer_range]
- ldr r3, [r0, #vp8_writer_count]
-
- sub r7, r5, #1 ; range-1
-
- cmp r1, #0
- mul r6, r4, r7 ; ((range-1) * probability)
-
- mov r7, #1
- add r4, r7, r6, lsr #8 ; 1 + (((range-1) * probability) >> 8)
-
- addne r2, r2, r4 ; if (bit) lowvalue += split
- subne r4, r5, r4 ; if (bit) range = range-split
-
- ; Counting the leading zeros is used to normalize range.
- clz r6, r4
- sub r6, r6, #24 ; shift
-
- ; Flag is set on the sum of count. This flag is used later
- ; to determine if count >= 0
- adds r3, r3, r6 ; count += shift
- lsl r5, r4, r6 ; range <<= shift
- bmi token_count_lt_zero ; if(count >= 0)
-
- sub r6, r6, r3 ; offset = shift - count
- sub r4, r6, #1 ; offset-1
- lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 )
- bpl token_high_bit_not_set
-
- ldr r4, [r0, #vp8_writer_pos] ; x
- sub r4, r4, #1 ; x = w->pos-1
- b token_zero_while_start
-token_zero_while_loop
- mov r9, #0
- strb r9, [r7, r4] ; w->buffer[x] =(unsigned char)0
- sub r4, r4, #1 ; x--
-token_zero_while_start
- cmp r4, #0
- ldrge r7, [r0, #vp8_writer_buffer]
- ldrb r1, [r7, r4]
- cmpge r1, #0xff
- beq token_zero_while_loop
-
- ldr r7, [r0, #vp8_writer_buffer]
- ldrb r9, [r7, r4] ; w->buffer[x]
- add r9, r9, #1
- strb r9, [r7, r4] ; w->buffer[x] + 1
-token_high_bit_not_set
- rsb r4, r6, #24 ; 24-offset
- ldr r9, [r0, #vp8_writer_buffer]
- lsr r7, r2, r4 ; lowvalue >> (24-offset)
- ldr r4, [r0, #vp8_writer_pos] ; w->pos
- lsl r2, r2, r6 ; lowvalue <<= offset
- mov r6, r3 ; shift = count
- add r1, r4, #1 ; w->pos++
- bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
- str r1, [r0, #vp8_writer_pos]
- sub r3, r3, #8 ; count -= 8
-
- VALIDATE_POS r9, r1 ; validate_buffer at pos
-
- strb r7, [r9, r4] ; w->buffer[w->pos++]
-
-token_count_lt_zero
- lsl r2, r2, r6 ; lowvalue <<= shift
-
- str r2, [r0, #vp8_writer_lowvalue]
- str r5, [r0, #vp8_writer_range]
- str r3, [r0, #vp8_writer_count]
- pop {r4-r10, pc}
- ENDP
-
-; r0 BOOL_CODER *br
-|vp8_stop_encode| PROC
- push {r4-r10, lr}
-
- ldr r2, [r0, #vp8_writer_lowvalue]
- ldr r5, [r0, #vp8_writer_range]
- ldr r3, [r0, #vp8_writer_count]
-
- mov r10, #32
-
-stop_encode_loop
- sub r7, r5, #1 ; range-1
-
- mov r4, r7, lsl #7 ; ((range-1) * 128)
-
- mov r7, #1
- add r4, r7, r4, lsr #8 ; 1 + (((range-1) * 128) >> 8)
-
- ; Counting the leading zeros is used to normalize range.
- clz r6, r4
- sub r6, r6, #24 ; shift
-
- ; Flag is set on the sum of count. This flag is used later
- ; to determine if count >= 0
- adds r3, r3, r6 ; count += shift
- lsl r5, r4, r6 ; range <<= shift
- bmi token_count_lt_zero_se ; if(count >= 0)
-
- sub r6, r6, r3 ; offset = shift - count
- sub r4, r6, #1 ; offset-1
- lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 )
- bpl token_high_bit_not_set_se
-
- ldr r4, [r0, #vp8_writer_pos] ; x
- sub r4, r4, #1 ; x = w->pos-1
- b token_zero_while_start_se
-token_zero_while_loop_se
- mov r9, #0
- strb r9, [r7, r4] ; w->buffer[x] =(unsigned char)0
- sub r4, r4, #1 ; x--
-token_zero_while_start_se
- cmp r4, #0
- ldrge r7, [r0, #vp8_writer_buffer]
- ldrb r1, [r7, r4]
- cmpge r1, #0xff
- beq token_zero_while_loop_se
-
- ldr r7, [r0, #vp8_writer_buffer]
- ldrb r9, [r7, r4] ; w->buffer[x]
- add r9, r9, #1
- strb r9, [r7, r4] ; w->buffer[x] + 1
-token_high_bit_not_set_se
- rsb r4, r6, #24 ; 24-offset
- ldr r9, [r0, #vp8_writer_buffer]
- lsr r7, r2, r4 ; lowvalue >> (24-offset)
- ldr r4, [r0, #vp8_writer_pos] ; w->pos
- lsl r2, r2, r6 ; lowvalue <<= offset
- mov r6, r3 ; shift = count
- add r1, r4, #1 ; w->pos++
- bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
- str r1, [r0, #vp8_writer_pos]
- sub r3, r3, #8 ; count -= 8
-
- VALIDATE_POS r9, r1 ; validate_buffer at pos
-
- strb r7, [r9, r4] ; w->buffer[w->pos++]
-
-token_count_lt_zero_se
- lsl r2, r2, r6 ; lowvalue <<= shift
-
- subs r10, r10, #1
- bne stop_encode_loop
-
- str r2, [r0, #vp8_writer_lowvalue]
- str r5, [r0, #vp8_writer_range]
- str r3, [r0, #vp8_writer_count]
- pop {r4-r10, pc}
-
- ENDP
-
-; r0 BOOL_CODER *br
-; r1 int data
-; r2 int bits
-|vp8_encode_value| PROC
- push {r4-r12, lr}
-
- mov r10, r2
-
- ldr r2, [r0, #vp8_writer_lowvalue]
- ldr r5, [r0, #vp8_writer_range]
- ldr r3, [r0, #vp8_writer_count]
-
- rsb r4, r10, #32 ; 32-n
-
- ; v is kept in r1 during the token pack loop
- lsl r1, r1, r4 ; r1 = v << 32 - n
-
-encode_value_loop
- sub r7, r5, #1 ; range-1
-
- ; Decisions are made based on the bit value shifted
- ; off of v, so set a flag here based on this.
- ; This value is referred to as "bb"
- lsls r1, r1, #1 ; bit = v >> n
- mov r4, r7, lsl #7 ; ((range-1) * 128)
-
- mov r7, #1
- add r4, r7, r4, lsr #8 ; 1 + (((range-1) * 128) >> 8)
-
- addcs r2, r2, r4 ; if (bit) lowvalue += split
- subcs r4, r5, r4 ; if (bit) range = range-split
-
- ; Counting the leading zeros is used to normalize range.
- clz r6, r4
- sub r6, r6, #24 ; shift
-
- ; Flag is set on the sum of count. This flag is used later
- ; to determine if count >= 0
- adds r3, r3, r6 ; count += shift
- lsl r5, r4, r6 ; range <<= shift
- bmi token_count_lt_zero_ev ; if(count >= 0)
-
- sub r6, r6, r3 ; offset = shift - count
- sub r4, r6, #1 ; offset-1
- lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 )
- bpl token_high_bit_not_set_ev
-
- ldr r4, [r0, #vp8_writer_pos] ; x
- sub r4, r4, #1 ; x = w->pos-1
- b token_zero_while_start_ev
-token_zero_while_loop_ev
- mov r9, #0
- strb r9, [r7, r4] ; w->buffer[x] =(unsigned char)0
- sub r4, r4, #1 ; x--
-token_zero_while_start_ev
- cmp r4, #0
- ldrge r7, [r0, #vp8_writer_buffer]
- ldrb r11, [r7, r4]
- cmpge r11, #0xff
- beq token_zero_while_loop_ev
-
- ldr r7, [r0, #vp8_writer_buffer]
- ldrb r9, [r7, r4] ; w->buffer[x]
- add r9, r9, #1
- strb r9, [r7, r4] ; w->buffer[x] + 1
-token_high_bit_not_set_ev
- rsb r4, r6, #24 ; 24-offset
- ldr r9, [r0, #vp8_writer_buffer]
- lsr r7, r2, r4 ; lowvalue >> (24-offset)
- ldr r4, [r0, #vp8_writer_pos] ; w->pos
- lsl r2, r2, r6 ; lowvalue <<= offset
- mov r6, r3 ; shift = count
- add r11, r4, #1 ; w->pos++
- bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
- str r11, [r0, #vp8_writer_pos]
- sub r3, r3, #8 ; count -= 8
-
- VALIDATE_POS r9, r11 ; validate_buffer at pos
-
- strb r7, [r9, r4] ; w->buffer[w->pos++]
-
-token_count_lt_zero_ev
- lsl r2, r2, r6 ; lowvalue <<= shift
-
- subs r10, r10, #1
- bne encode_value_loop
-
- str r2, [r0, #vp8_writer_lowvalue]
- str r5, [r0, #vp8_writer_range]
- str r3, [r0, #vp8_writer_count]
- pop {r4-r12, pc}
- ENDP
-
- END
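The file removed above implemented VP8's boolean (arithmetic) coder in hand-scheduled ARMv5TE assembly. For orientation, below is a minimal C sketch of the encode-one-bit step, reconstructed from the assembly's own comments: split = 1 + (((range-1) * probability) >> 8), normalization shift = clz(range) - 24, and carry propagation back through any run of 0xff bytes. Per |vp8_start_encode| above, the writer starts with lowvalue = 0, range = 255, count = -24 (mvn r2, #23) and pos = 0. The writer_sketch struct and helper name are illustrative stand-ins for the vp8_writer fields the offsets reference, not the exact libvpx source, and the sketch omits the vp8_validate_buffer_arm bounds check that the VALIDATE_POS macro performs before each store.

/* Illustrative stand-ins for the vp8_writer fields the assembly's
   offsets reference; not the libvpx definitions. */
typedef struct {
    unsigned int  lowvalue;   /* vp8_writer_lowvalue */
    unsigned int  range;      /* vp8_writer_range */
    int           count;      /* vp8_writer_count; start_encode sets -24 */
    int           pos;        /* vp8_writer_pos */
    unsigned char *buffer;    /* vp8_writer_buffer */
} writer_sketch;

/* Encode one bit, following the assembly's comments step by step. */
static void encode_bool_sketch(writer_sketch *w, int bit, int probability)
{
    const unsigned int split = 1 + (((w->range - 1) * probability) >> 8);
    int shift = 0;

    if (bit) {
        w->lowvalue += split;             /* if (bit) lowvalue += split */
        w->range    -= split;             /* if (bit) range = range-split */
    } else {
        w->range = split;
    }

    while (w->range < 128) {              /* shift = clz(range) - 24 */
        w->range <<= 1;
        shift++;
    }
    w->count += shift;

    if (w->count >= 0) {                  /* a whole byte is settled */
        const int offset = shift - w->count;

        if ((w->lowvalue << (offset - 1)) & 0x80000000) {
            int x = w->pos - 1;           /* carry: walk back over 0xff run */
            while (x >= 0 && w->buffer[x] == 0xff)
                w->buffer[x--] = 0;
            if (x >= 0)
                w->buffer[x] += 1;
        }
        w->buffer[w->pos++] = (unsigned char)(w->lowvalue >> (24 - offset));
        w->lowvalue = (w->lowvalue << offset) & 0xffffff;
        shift = w->count;
        w->count -= 8;
    }
    w->lowvalue <<= shift;                /* lowvalue <<= shift */
}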
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/arm/armv5te/vp8_packtokens_armv5.asm b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/arm/armv5te/vp8_packtokens_armv5.asm
deleted file mode 100644
index 90a141c6248..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/arm/armv5te/vp8_packtokens_armv5.asm
+++ /dev/null
@@ -1,317 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8cx_pack_tokens_armv5|
- IMPORT |vp8_validate_buffer_arm|
-
- INCLUDE vp8_asm_enc_offsets.asm
-
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA |.text|, CODE, READONLY
-
-
- ; macro for validating write buffer position
- ; needs vp8_writer in r0
- ; start shall not be in r1
- MACRO
- VALIDATE_POS $start, $pos
- push {r0-r3, r12, lr} ; rest of regs are preserved by subroutine call
- ldr r2, [r0, #vp8_writer_buffer_end]
- ldr r3, [r0, #vp8_writer_error]
- mov r1, $pos
- mov r0, $start
- bl vp8_validate_buffer_arm
- pop {r0-r3, r12, lr}
- MEND
-
-
-; r0 vp8_writer *w
-; r1 const TOKENEXTRA *p
-; r2 int xcount
-; r3 vp8_coef_encodings
-; s0 vp8_extra_bits
-; s1 vp8_coef_tree
-|vp8cx_pack_tokens_armv5| PROC
- push {r4-r12, lr}
- sub sp, sp, #16
-
- ; Add size of xcount * sizeof (TOKENEXTRA) to get stop
- ; sizeof (TOKENEXTRA) is 8
- add r2, r1, r2, lsl #3 ; stop = p + xcount*sizeof(TOKENEXTRA)
- str r2, [sp, #0]
- str r3, [sp, #8] ; save vp8_coef_encodings
- ldr r2, [r0, #vp8_writer_lowvalue]
- ldr r5, [r0, #vp8_writer_range]
- ldr r3, [r0, #vp8_writer_count]
- b check_p_lt_stop
-
-while_p_lt_stop
- ldrb r6, [r1, #tokenextra_token] ; t
- ldr r4, [sp, #8] ; vp8_coef_encodings
- mov lr, #0
- add r4, r4, r6, lsl #3 ; a = vp8_coef_encodings + t
- ldr r9, [r1, #tokenextra_context_tree] ; pp
-
- ldrb r7, [r1, #tokenextra_skip_eob_node]
-
- ldr r6, [r4, #vp8_token_value] ; v
- ldr r8, [r4, #vp8_token_len] ; n
-
- ; vp8 specific skip_eob_node
- cmp r7, #0
- movne lr, #2 ; i = 2
- subne r8, r8, #1 ; --n
-
- rsb r4, r8, #32 ; 32-n
- ldr r10, [sp, #60] ; vp8_coef_tree
-
- ; v is kept in r12 during the token pack loop
- lsl r12, r6, r4 ; r12 = v << 32 - n
-
-; loop start
-token_loop
- ldrb r4, [r9, lr, asr #1] ; pp [i>>1]
- sub r7, r5, #1 ; range-1
-
- ; Decisions are made based on the bit value shifted
- ; off of v, so set a flag here based on this.
- ; This value is referred to as "bb"
- lsls r12, r12, #1 ; bb = v >> n
- mul r6, r4, r7 ; ((range-1) * pp[i>>1]))
-
- ; bb can only be 0 or 1. So only execute this statement
- ; if bb == 1, otherwise it will act like i + 0
- addcs lr, lr, #1 ; i + bb
-
- mov r7, #1
- ldrsb lr, [r10, lr] ; i = vp8_coef_tree[i+bb]
- add r4, r7, r6, lsr #8 ; 1 + (((range-1) * pp[i>>1]) >> 8)
-
- addcs r2, r2, r4 ; if (bb) lowvalue += split
- subcs r4, r5, r4 ; if (bb) range = range-split
-
- ; Counting the leading zeros is used to normalize range.
- clz r6, r4
- sub r6, r6, #24 ; shift
-
- ; Flag is set on the sum of count. This flag is used later
- ; to determine if count >= 0
- adds r3, r3, r6 ; count += shift
- lsl r5, r4, r6 ; range <<= shift
- bmi token_count_lt_zero ; if(count >= 0)
-
- sub r6, r6, r3 ; offset = shift - count
- sub r4, r6, #1 ; offset-1
- lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 )
- bpl token_high_bit_not_set
-
- ldr r4, [r0, #vp8_writer_pos] ; x
- sub r4, r4, #1 ; x = w->pos-1
- b token_zero_while_start
-token_zero_while_loop
- mov r10, #0
- strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0
- sub r4, r4, #1 ; x--
-token_zero_while_start
- cmp r4, #0
- ldrge r7, [r0, #vp8_writer_buffer]
- ldrb r11, [r7, r4]
- cmpge r11, #0xff
- beq token_zero_while_loop
-
- ldr r7, [r0, #vp8_writer_buffer]
- ldrb r10, [r7, r4] ; w->buffer[x]
- add r10, r10, #1
- strb r10, [r7, r4] ; w->buffer[x] + 1
-token_high_bit_not_set
- rsb r4, r6, #24 ; 24-offset
- ldr r10, [r0, #vp8_writer_buffer]
- lsr r7, r2, r4 ; lowvalue >> (24-offset)
- ldr r4, [r0, #vp8_writer_pos] ; w->pos
- lsl r2, r2, r6 ; lowvalue <<= offset
- mov r6, r3 ; shift = count
- add r11, r4, #1 ; w->pos++
- bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
- str r11, [r0, #vp8_writer_pos]
- sub r3, r3, #8 ; count -= 8
-
- VALIDATE_POS r10, r11 ; validate_buffer at pos
-
- strb r7, [r10, r4] ; w->buffer[w->pos++]
-
- ; r10 is used earlier in the loop, but r10 is used as
- ; temp variable here. So after r10 is used, reload
- ; vp8_coef_tree into r10
- ldr r10, [sp, #60] ; vp8_coef_tree
-
-token_count_lt_zero
- lsl r2, r2, r6 ; lowvalue <<= shift
-
- subs r8, r8, #1 ; --n
- bne token_loop
-
- ldrb r6, [r1, #tokenextra_token] ; t
- ldr r7, [sp, #56] ; vp8_extra_bits
- ; Add t * sizeof (vp8_extra_bit_struct) to get the desired
- ; element. Here vp8_extra_bit_struct == 16
- add r12, r7, r6, lsl #4 ; b = vp8_extra_bits + t
-
- ldr r4, [r12, #vp8_extra_bit_struct_base_val]
- cmp r4, #0
- beq skip_extra_bits
-
-; if( b->base_val)
- ldr r8, [r12, #vp8_extra_bit_struct_len] ; L
- ldrsh lr, [r1, #tokenextra_extra] ; e = p->Extra
- cmp r8, #0 ; if( L)
- beq no_extra_bits
-
- ldr r9, [r12, #vp8_extra_bit_struct_prob]
- asr r7, lr, #1 ; v=e>>1
-
- ldr r10, [r12, #vp8_extra_bit_struct_tree]
- str r10, [sp, #4] ; b->tree
-
- rsb r4, r8, #32
- lsl r12, r7, r4
-
- mov lr, #0 ; i = 0
-
-extra_bits_loop
- ldrb r4, [r9, lr, asr #1] ; pp[i>>1]
- sub r7, r5, #1 ; range-1
- lsls r12, r12, #1 ; v >> n
- mul r6, r4, r7 ; (range-1) * pp[i>>1]
- addcs lr, lr, #1 ; i + bb
-
- mov r7, #1
- ldrsb lr, [r10, lr] ; i = b->tree[i+bb]
- add r4, r7, r6, lsr #8 ; split = 1 + (((range-1) * pp[i>>1]) >> 8)
-
- addcs r2, r2, r4 ; if (bb) lowvalue += split
- subcs r4, r5, r4 ; if (bb) range = range-split
-
- clz r6, r4
- sub r6, r6, #24
-
- adds r3, r3, r6 ; count += shift
- lsl r5, r4, r6 ; range <<= shift
- bmi extra_count_lt_zero ; if(count >= 0)
-
- sub r6, r6, r3 ; offset= shift - count
- sub r4, r6, #1 ; offset-1
- lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 )
- bpl extra_high_bit_not_set
-
- ldr r4, [r0, #vp8_writer_pos] ; x
- sub r4, r4, #1 ; x = w->pos - 1
- b extra_zero_while_start
-extra_zero_while_loop
- mov r10, #0
- strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0
- sub r4, r4, #1 ; x--
-extra_zero_while_start
- cmp r4, #0
- ldrge r7, [r0, #vp8_writer_buffer]
- ldrb r11, [r7, r4]
- cmpge r11, #0xff
- beq extra_zero_while_loop
-
- ldr r7, [r0, #vp8_writer_buffer]
- ldrb r10, [r7, r4]
- add r10, r10, #1
- strb r10, [r7, r4]
-extra_high_bit_not_set
- rsb r4, r6, #24 ; 24-offset
- ldr r10, [r0, #vp8_writer_buffer]
- lsr r7, r2, r4 ; lowvalue >> (24-offset)
- ldr r4, [r0, #vp8_writer_pos]
- lsl r2, r2, r6 ; lowvalue <<= offset
- mov r6, r3 ; shift = count
- add r11, r4, #1 ; w->pos++
- bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
- str r11, [r0, #vp8_writer_pos]
- sub r3, r3, #8 ; count -= 8
-
- VALIDATE_POS r10, r11 ; validate_buffer at pos
-
- strb r7, [r10, r4] ; w->buffer[w->pos++]=(lowvalue >> (24-offset))
- ldr r10, [sp, #4] ; b->tree
-extra_count_lt_zero
- lsl r2, r2, r6
-
- subs r8, r8, #1 ; --n
- bne extra_bits_loop ; while (n)
-
-no_extra_bits
- ldr lr, [r1, #4] ; e = p->Extra
- add r4, r5, #1 ; range + 1
- tst lr, #1
- lsr r4, r4, #1 ; split = (range + 1) >> 1
- addne r2, r2, r4 ; lowvalue += split
- subne r4, r5, r4 ; range = range-split
- tst r2, #0x80000000 ; lowvalue & 0x80000000
- lsl r5, r4, #1 ; range <<= 1
- beq end_high_bit_not_set
-
- ldr r4, [r0, #vp8_writer_pos]
- mov r7, #0
- sub r4, r4, #1
- b end_zero_while_start
-end_zero_while_loop
- strb r7, [r6, r4]
- sub r4, r4, #1 ; x--
-end_zero_while_start
- cmp r4, #0
- ldrge r6, [r0, #vp8_writer_buffer]
- ldrb r12, [r6, r4]
- cmpge r12, #0xff
- beq end_zero_while_loop
-
- ldr r6, [r0, #vp8_writer_buffer]
- ldrb r7, [r6, r4]
- add r7, r7, #1
- strb r7, [r6, r4]
-end_high_bit_not_set
- adds r3, r3, #1 ; ++count
- lsl r2, r2, #1 ; lowvalue <<= 1
- bne end_count_zero
-
- ldr r4, [r0, #vp8_writer_pos]
- mvn r3, #7
- ldr r7, [r0, #vp8_writer_buffer]
- lsr r6, r2, #24 ; lowvalue >> 24
- add r12, r4, #1 ; w->pos++
- bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
- str r12, [r0, #vp8_writer_pos]
-
- VALIDATE_POS r7, r12 ; validate_buffer at pos
-
- strb r6, [r7, r4]
-end_count_zero
-skip_extra_bits
- add r1, r1, #TOKENEXTRA_SZ ; ++p
-check_p_lt_stop
- ldr r4, [sp, #0] ; stop
- cmp r1, r4 ; while( p < stop)
- bcc while_p_lt_stop
-
- str r2, [r0, #vp8_writer_lowvalue]
- str r5, [r0, #vp8_writer_range]
- str r3, [r0, #vp8_writer_count]
- add sp, sp, #16
- pop {r4-r12, pc}
- ENDP
-
- END
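The token packer deleted above drives that same bit coder: for each TOKENEXTRA it walks vp8_coef_tree, coding one bit of the token's value per tree node at the per-context probability pp[i>>1]; it then repeats the same loop over b->tree for the token's extra bits and finally codes the sign at probability 128 (split = (range + 1) >> 1, the no_extra_bits section above). Below is a hedged C sketch of the core tree walk, reusing encode_bool_sketch and writer_sketch from the earlier sketch. The struct shapes mirror the 8-byte TOKENEXTRA and vp8_token layouts the offsets imply, but they are illustrative, not the libvpx definitions.

/* Minimal shapes matching the offsets above (context_tree at 0, Extra at
   4, then Token and skip_eob_node; sizeof == 8 on 32-bit ARM). */
typedef struct {
    const unsigned char *context_tree;   /* pp: per-node probabilities */
    short Extra;
    unsigned char Token;
    unsigned char skip_eob_node;
} token_sketch;

typedef struct {
    int value;                           /* vp8_token_value */
    int len;                             /* vp8_token_len */
} coef_encoding_sketch;

/* Pack one token: walk the coefficient tree, coding one bit of the
   token's value per node (msb first) at that node's probability. */
static void pack_token_sketch(writer_sketch *w, const token_sketch *p,
                              const coef_encoding_sketch *a,
                              const signed char *coef_tree)
{
    int i = 0;
    int n = a->len;
    unsigned int v;

    if (p->skip_eob_node) {              /* vp8-specific skip_eob_node */
        i = 2;
        --n;
    }
    v = (unsigned int)a->value << (32 - n);  /* keep v left-aligned; n >= 1 */

    while (n--) {
        const int bb = v >> 31;          /* next bit shifted off of v */
        v <<= 1;
        encode_bool_sketch(w, bb, p->context_tree[i >> 1]);
        i = coef_tree[i + bb];           /* i = vp8_coef_tree[i + bb] */
    }
}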
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/arm/armv5te/vp8_packtokens_mbrow_armv5.asm b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/arm/armv5te/vp8_packtokens_mbrow_armv5.asm
deleted file mode 100644
index 3a8d17a81b1..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/arm/armv5te/vp8_packtokens_mbrow_armv5.asm
+++ /dev/null
@@ -1,352 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8cx_pack_mb_row_tokens_armv5|
- IMPORT |vp8_validate_buffer_arm|
-
- INCLUDE vp8_asm_enc_offsets.asm
-
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA |.text|, CODE, READONLY
-
-
- ; macro for validating write buffer position
- ; needs vp8_writer in r0
- ; start shall not be in r1
- MACRO
- VALIDATE_POS $start, $pos
- push {r0-r3, r12, lr} ; rest of regs are preserved by subroutine call
- ldr r2, [r0, #vp8_writer_buffer_end]
- ldr r3, [r0, #vp8_writer_error]
- mov r1, $pos
- mov r0, $start
- bl vp8_validate_buffer_arm
- pop {r0-r3, r12, lr}
- MEND
-
-; r0 VP8_COMP *cpi
-; r1 vp8_writer *w
-; r2 vp8_coef_encodings
-; r3 vp8_extra_bits
-; s0 vp8_coef_tree
-
-|vp8cx_pack_mb_row_tokens_armv5| PROC
- push {r4-r12, lr}
- sub sp, sp, #24
-
- ; Compute address of cpi->common.mb_rows
- ldr r4, _VP8_COMP_common_
- ldr r6, _VP8_COMMON_MBrows_
- add r4, r0, r4
-
- ldr r5, [r4, r6] ; load up mb_rows
-
- str r2, [sp, #20] ; save vp8_coef_encodings
- str r5, [sp, #12] ; save mb_rows
- str r3, [sp, #8] ; save vp8_extra_bits
-
- ldr r4, _VP8_COMP_tplist_
- add r4, r0, r4
- ldr r7, [r4, #0] ; dereference cpi->tp_list
-
- mov r0, r1 ; keep same as other loops
-
- ldr r2, [r0, #vp8_writer_lowvalue]
- ldr r5, [r0, #vp8_writer_range]
- ldr r3, [r0, #vp8_writer_count]
-
-mb_row_loop
-
- ldr r1, [r7, #tokenlist_start]
- ldr r9, [r7, #tokenlist_stop]
- str r9, [sp, #0] ; save stop for later comparison
- str r7, [sp, #16] ; tokenlist address for next time
-
- b check_p_lt_stop
-
- ; actual work gets done here!
-
-while_p_lt_stop
- ldrb r6, [r1, #tokenextra_token] ; t
- ldr r4, [sp, #20] ; vp8_coef_encodings
- mov lr, #0
- add r4, r4, r6, lsl #3 ; a = vp8_coef_encodings + t
- ldr r9, [r1, #tokenextra_context_tree] ; pp
-
- ldrb r7, [r1, #tokenextra_skip_eob_node]
-
- ldr r6, [r4, #vp8_token_value] ; v
- ldr r8, [r4, #vp8_token_len] ; n
-
- ; vp8 specific skip_eob_node
- cmp r7, #0
- movne lr, #2 ; i = 2
- subne r8, r8, #1 ; --n
-
- rsb r4, r8, #32 ; 32-n
- ldr r10, [sp, #64] ; vp8_coef_tree
-
- ; v is kept in r12 during the token pack loop
- lsl r12, r6, r4 ; r12 = v << 32 - n
-
-; loop start
-token_loop
- ldrb r4, [r9, lr, asr #1] ; pp [i>>1]
- sub r7, r5, #1 ; range-1
-
- ; Decisions are made based on the bit value shifted
- ; off of v, so set a flag here based on this.
- ; This value is referred to as "bb"
- lsls r12, r12, #1 ; bb = v >> n
- mul r6, r4, r7 ; ((range-1) * pp[i>>1]))
-
- ; bb can only be 0 or 1. So only execute this statement
- ; if bb == 1, otherwise it will act like i + 0
- addcs lr, lr, #1 ; i + bb
-
- mov r7, #1
- ldrsb lr, [r10, lr] ; i = vp8_coef_tree[i+bb]
- add r4, r7, r6, lsr #8 ; 1 + (((range-1) * pp[i>>1]) >> 8)
-
- addcs r2, r2, r4 ; if (bb) lowvalue += split
- subcs r4, r5, r4 ; if (bb) range = range-split
-
- ; Counting the leading zeros is used to normalize range.
- clz r6, r4
- sub r6, r6, #24 ; shift
-
- ; Flag is set on the sum of count. This flag is used later
- ; to determine if count >= 0
- adds r3, r3, r6 ; count += shift
- lsl r5, r4, r6 ; range <<= shift
- bmi token_count_lt_zero ; if(count >= 0)
-
- sub r6, r6, r3 ; offset = shift - count
- sub r4, r6, #1 ; offset-1
- lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 )
- bpl token_high_bit_not_set
-
- ldr r4, [r0, #vp8_writer_pos] ; x
- sub r4, r4, #1 ; x = w->pos-1
- b token_zero_while_start
-token_zero_while_loop
- mov r10, #0
- strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0
- sub r4, r4, #1 ; x--
-token_zero_while_start
- cmp r4, #0
- ldrge r7, [r0, #vp8_writer_buffer]
- ldrb r11, [r7, r4]
- cmpge r11, #0xff
- beq token_zero_while_loop
-
- ldr r7, [r0, #vp8_writer_buffer]
- ldrb r10, [r7, r4] ; w->buffer[x]
- add r10, r10, #1
- strb r10, [r7, r4] ; w->buffer[x] + 1
-token_high_bit_not_set
- rsb r4, r6, #24 ; 24-offset
- ldr r10, [r0, #vp8_writer_buffer]
- lsr r7, r2, r4 ; lowvalue >> (24-offset)
- ldr r4, [r0, #vp8_writer_pos] ; w->pos
- lsl r2, r2, r6 ; lowvalue <<= offset
- mov r6, r3 ; shift = count
- add r11, r4, #1 ; w->pos++
- bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
- str r11, [r0, #vp8_writer_pos]
- sub r3, r3, #8 ; count -= 8
-
- VALIDATE_POS r10, r11 ; validate_buffer at pos
-
- strb r7, [r10, r4] ; w->buffer[w->pos++]
-
- ; r10 is used earlier in the loop, but r10 is used as
- ; temp variable here. So after r10 is used, reload
- ; vp8_coef_tree into r10
- ldr r10, [sp, #64] ; vp8_coef_tree
-
-token_count_lt_zero
- lsl r2, r2, r6 ; lowvalue <<= shift
-
- subs r8, r8, #1 ; --n
- bne token_loop
-
- ldrb r6, [r1, #tokenextra_token] ; t
- ldr r7, [sp, #8] ; vp8_extra_bits
- ; Add t * sizeof (vp8_extra_bit_struct) to get the desired
- ; element. Here vp8_extra_bit_struct == 16
- add r12, r7, r6, lsl #4 ; b = vp8_extra_bits + t
-
- ldr r4, [r12, #vp8_extra_bit_struct_base_val]
- cmp r4, #0
- beq skip_extra_bits
-
-; if( b->base_val)
- ldr r8, [r12, #vp8_extra_bit_struct_len] ; L
- ldrsh lr, [r1, #tokenextra_extra] ; e = p->Extra
- cmp r8, #0 ; if( L)
- beq no_extra_bits
-
- ldr r9, [r12, #vp8_extra_bit_struct_prob]
- asr r7, lr, #1 ; v=e>>1
-
- ldr r10, [r12, #vp8_extra_bit_struct_tree]
- str r10, [sp, #4] ; b->tree
-
- rsb r4, r8, #32
- lsl r12, r7, r4
-
- mov lr, #0 ; i = 0
-
-extra_bits_loop
- ldrb r4, [r9, lr, asr #1] ; pp[i>>1]
- sub r7, r5, #1 ; range-1
- lsls r12, r12, #1 ; v >> n
- mul r6, r4, r7 ; (range-1) * pp[i>>1]
- addcs lr, lr, #1 ; i + bb
-
- mov r7, #1
- ldrsb lr, [r10, lr] ; i = b->tree[i+bb]
- add r4, r7, r6, lsr #8 ; split = 1 + (((range-1) * pp[i>>1]) >> 8)
-
- addcs r2, r2, r4 ; if (bb) lowvalue += split
- subcs r4, r5, r4 ; if (bb) range = range-split
-
- clz r6, r4
- sub r6, r6, #24
-
- adds r3, r3, r6 ; count += shift
- lsl r5, r4, r6 ; range <<= shift
- bmi extra_count_lt_zero ; if(count >= 0)
-
- sub r6, r6, r3 ; offset= shift - count
- sub r4, r6, #1 ; offset-1
- lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 )
- bpl extra_high_bit_not_set
-
- ldr r4, [r0, #vp8_writer_pos] ; x
- sub r4, r4, #1 ; x = w->pos - 1
- b extra_zero_while_start
-extra_zero_while_loop
- mov r10, #0
- strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0
- sub r4, r4, #1 ; x--
-extra_zero_while_start
- cmp r4, #0
- ldrge r7, [r0, #vp8_writer_buffer]
- ldrb r11, [r7, r4]
- cmpge r11, #0xff
- beq extra_zero_while_loop
-
- ldr r7, [r0, #vp8_writer_buffer]
- ldrb r10, [r7, r4]
- add r10, r10, #1
- strb r10, [r7, r4]
-extra_high_bit_not_set
- rsb r4, r6, #24 ; 24-offset
- ldr r10, [r0, #vp8_writer_buffer]
- lsr r7, r2, r4 ; lowvalue >> (24-offset)
- ldr r4, [r0, #vp8_writer_pos]
- lsl r2, r2, r6 ; lowvalue <<= offset
- mov r6, r3 ; shift = count
- add r11, r4, #1 ; w->pos++
- bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
- str r11, [r0, #vp8_writer_pos]
- sub r3, r3, #8 ; count -= 8
-
- VALIDATE_POS r10, r11 ; validate_buffer at pos
-
- strb r7, [r10, r4] ; w->buffer[w->pos++]=(lowvalue >> (24-offset))
- ldr r10, [sp, #4] ; b->tree
-extra_count_lt_zero
- lsl r2, r2, r6
-
- subs r8, r8, #1 ; --n
- bne extra_bits_loop ; while (n)
-
-no_extra_bits
- ldr lr, [r1, #4] ; e = p->Extra
- add r4, r5, #1 ; range + 1
- tst lr, #1
- lsr r4, r4, #1 ; split = (range + 1) >> 1
- addne r2, r2, r4 ; lowvalue += split
- subne r4, r5, r4 ; range = range-split
- tst r2, #0x80000000 ; lowvalue & 0x80000000
- lsl r5, r4, #1 ; range <<= 1
- beq end_high_bit_not_set
-
- ldr r4, [r0, #vp8_writer_pos]
- mov r7, #0
- sub r4, r4, #1
- b end_zero_while_start
-end_zero_while_loop
- strb r7, [r6, r4]
- sub r4, r4, #1 ; x--
-end_zero_while_start
- cmp r4, #0
- ldrge r6, [r0, #vp8_writer_buffer]
- ldrb r12, [r6, r4]
- cmpge r12, #0xff
- beq end_zero_while_loop
-
- ldr r6, [r0, #vp8_writer_buffer]
- ldrb r7, [r6, r4]
- add r7, r7, #1
- strb r7, [r6, r4]
-end_high_bit_not_set
- adds r3, r3, #1 ; ++count
- lsl r2, r2, #1 ; lowvalue <<= 1
- bne end_count_zero
-
- ldr r4, [r0, #vp8_writer_pos]
- mvn r3, #7
- ldr r7, [r0, #vp8_writer_buffer]
- lsr r6, r2, #24 ; lowvalue >> 24
- add r12, r4, #1 ; w->pos++
- bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
- str r12, [r0, #vp8_writer_pos]
-
- VALIDATE_POS r7, r12 ; validate_buffer at pos
-
- strb r6, [r7, r4]
-end_count_zero
-skip_extra_bits
- add r1, r1, #TOKENEXTRA_SZ ; ++p
-check_p_lt_stop
- ldr r4, [sp, #0] ; stop
- cmp r1, r4 ; while( p < stop)
- bcc while_p_lt_stop
-
- ldr r6, [sp, #12] ; mb_rows
- ldr r7, [sp, #16] ; tokenlist address
- subs r6, r6, #1
- add r7, r7, #TOKENLIST_SZ ; next element in the array
- str r6, [sp, #12]
- bne mb_row_loop
-
- str r2, [r0, #vp8_writer_lowvalue]
- str r5, [r0, #vp8_writer_range]
- str r3, [r0, #vp8_writer_count]
- add sp, sp, #24
- pop {r4-r12, pc}
- ENDP
-
-_VP8_COMP_common_
- DCD vp8_comp_common
-_VP8_COMMON_MBrows_
- DCD vp8_common_mb_rows
-_VP8_COMP_tplist_
- DCD vp8_comp_tplist
-
- END
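The mb-row variant deleted above is the same token loop run once per macroblock row, taking each row's [start, stop) token range from cpi->tp_list (the tokenlist_start/tokenlist_stop offsets). A sketch of the wrapper, reusing the illustrative types from the previous sketches:

typedef struct {
    token_sketch *start;   /* tokenlist_start */
    token_sketch *stop;    /* tokenlist_stop */
} tokenlist_sketch;

static void pack_mb_row_tokens_sketch(writer_sketch *w,
                                      const tokenlist_sketch *tp_list,
                                      int mb_rows,
                                      const coef_encoding_sketch *encodings,
                                      const signed char *coef_tree)
{
    int mb_row;
    for (mb_row = 0; mb_row < mb_rows; mb_row++) {
        const token_sketch *p;
        for (p = tp_list[mb_row].start; p < tp_list[mb_row].stop; p++)
            pack_token_sketch(w, p, encodings + p->Token, coef_tree);
    }
}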
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/arm/armv5te/vp8_packtokens_partitions_armv5.asm b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/arm/armv5te/vp8_packtokens_partitions_armv5.asm
deleted file mode 100644
index e9aa4958f30..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/arm/armv5te/vp8_packtokens_partitions_armv5.asm
+++ /dev/null
@@ -1,471 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8cx_pack_tokens_into_partitions_armv5|
- IMPORT |vp8_validate_buffer_arm|
-
- INCLUDE vp8_asm_enc_offsets.asm
-
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA |.text|, CODE, READONLY
-
- ; macro for validating write buffer position
- ; needs vp8_writer in r0
- ; start shall not be in r1
- MACRO
- VALIDATE_POS $start, $pos
- push {r0-r3, r12, lr} ; rest of regs are preserved by subroutine call
- ldr r2, [r0, #vp8_writer_buffer_end]
- ldr r3, [r0, #vp8_writer_error]
- mov r1, $pos
- mov r0, $start
- bl vp8_validate_buffer_arm
- pop {r0-r3, r12, lr}
- MEND
-
-; r0 VP8_COMP *cpi
-; r1 unsigned char *cx_data
-; r2 const unsigned char *cx_data_end
-; r3 int num_part
-; s0 vp8_coef_encodings
-; s1 vp8_extra_bits,
-; s2 const vp8_tree_index *
-
-|vp8cx_pack_tokens_into_partitions_armv5| PROC
- push {r4-r12, lr}
- sub sp, sp, #40
-
- ; Compute address of cpi->common.mb_rows
- ldr r4, _VP8_COMP_common_
- ldr r6, _VP8_COMMON_MBrows_
- add r4, r0, r4
-
- ldr r5, [r4, r6] ; load up mb_rows
-
- str r5, [sp, #36] ; save mb_rows
- str r1, [sp, #24] ; save ptr = cx_data
- str r3, [sp, #20] ; save num_part
- str r2, [sp, #8] ; save cx_data_end
-
- ldr r4, _VP8_COMP_tplist_
- add r4, r0, r4
- ldr r7, [r4, #0] ; dereference cpi->tp_list
- str r7, [sp, #32] ; store start of cpi->tp_list
-
- ldr r11, _VP8_COMP_bc_ ; load up vp8_writer out of cpi
- add r0, r0, r11
-
- mov r11, #0
- str r11, [sp, #28] ; i
-
-numparts_loop
- ldr r2, _vp8_writer_sz_ ; load up sizeof(vp8_writer)
- add r0, r2 ; bc[i + 1]
-
- ldr r10, [sp, #24] ; ptr
- ldr r5, [sp, #36] ; move mb_rows to the counting section
- subs r5, r5, r11 ; move start point with each partition
- ; mb_rows starts at i
- str r5, [sp, #12]
-
- ; Reset all of the VP8 Writer data for each partition that
- ; is processed.
- ; start_encode
-
- ldr r3, [sp, #8]
- str r3, [r0, #vp8_writer_buffer_end]
-
- mov r2, #0 ; vp8_writer_lowvalue
- mov r5, #255 ; vp8_writer_range
- mvn r3, #23 ; vp8_writer_count
-
- str r2, [r0, #vp8_writer_pos]
- str r10, [r0, #vp8_writer_buffer]
-
- ble end_partition ; if (mb_rows <= 0) end partition
-
-mb_row_loop
-
- ldr r1, [r7, #tokenlist_start]
- ldr r9, [r7, #tokenlist_stop]
- str r9, [sp, #0] ; save stop for later comparison
- str r7, [sp, #16] ; tokenlist address for next time
-
- b check_p_lt_stop
-
- ; actual work gets done here!
-
-while_p_lt_stop
- ldrb r6, [r1, #tokenextra_token] ; t
- ldr r4, [sp, #80] ; vp8_coef_encodings
- mov lr, #0
- add r4, r4, r6, lsl #3 ; a = vp8_coef_encodings + t
- ldr r9, [r1, #tokenextra_context_tree] ; pp
-
- ldrb r7, [r1, #tokenextra_skip_eob_node]
-
- ldr r6, [r4, #vp8_token_value] ; v
- ldr r8, [r4, #vp8_token_len] ; n
-
- ; vp8 specific skip_eob_node
- cmp r7, #0
- movne lr, #2 ; i = 2
- subne r8, r8, #1 ; --n
-
- rsb r4, r8, #32 ; 32-n
- ldr r10, [sp, #88] ; vp8_coef_tree
-
- ; v is kept in r12 during the token pack loop
- lsl r12, r6, r4 ; r12 = v << 32 - n
-
-; loop start
-token_loop
- ldrb r4, [r9, lr, asr #1] ; pp [i>>1]
- sub r7, r5, #1 ; range-1
-
- ; Decisions are made based on the bit value shifted
- ; off of v, so set a flag here based on this.
- ; This value is referred to as "bb"
- lsls r12, r12, #1 ; bb = v >> n
- mul r6, r4, r7 ; ((range-1) * pp[i>>1]))
-
- ; bb can only be 0 or 1. So only execute this statement
- ; if bb == 1, otherwise it will act like i + 0
- addcs lr, lr, #1 ; i + bb
-
- mov r7, #1
- ldrsb lr, [r10, lr] ; i = vp8_coef_tree[i+bb]
- add r4, r7, r6, lsr #8 ; 1 + (((range-1) * pp[i>>1]) >> 8)
-
- addcs r2, r2, r4 ; if (bb) lowvalue += split
- subcs r4, r5, r4 ; if (bb) range = range-split
-
- ; Counting the leading zeros is used to normalize range.
- clz r6, r4
- sub r6, r6, #24 ; shift
-
- ; Flag is set on the sum of count. This flag is used later
- ; to determine if count >= 0
- adds r3, r3, r6 ; count += shift
- lsl r5, r4, r6 ; range <<= shift
- bmi token_count_lt_zero ; if(count >= 0)
-
- sub r6, r6, r3 ; offset = shift - count
- sub r4, r6, #1 ; offset-1
- lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 )
- bpl token_high_bit_not_set
-
- ldr r4, [r0, #vp8_writer_pos] ; x
- sub r4, r4, #1 ; x = w->pos-1
- b token_zero_while_start
-token_zero_while_loop
- mov r10, #0
- strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0
- sub r4, r4, #1 ; x--
-token_zero_while_start
- cmp r4, #0
- ldrge r7, [r0, #vp8_writer_buffer]
- ldrb r11, [r7, r4]
- cmpge r11, #0xff
- beq token_zero_while_loop
-
- ldr r7, [r0, #vp8_writer_buffer]
- ldrb r10, [r7, r4] ; w->buffer[x]
- add r10, r10, #1
- strb r10, [r7, r4] ; w->buffer[x] + 1
-token_high_bit_not_set
- rsb r4, r6, #24 ; 24-offset
- ldr r10, [r0, #vp8_writer_buffer]
- lsr r7, r2, r4 ; lowvalue >> (24-offset)
- ldr r4, [r0, #vp8_writer_pos] ; w->pos
- lsl r2, r2, r6 ; lowvalue <<= offset
- mov r6, r3 ; shift = count
- add r11, r4, #1 ; w->pos++
- bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
- str r11, [r0, #vp8_writer_pos]
- sub r3, r3, #8 ; count -= 8
-
- VALIDATE_POS r10, r11 ; validate_buffer at pos
-
- strb r7, [r10, r4] ; w->buffer[w->pos++]
-
- ; r10 is used earlier in the loop, but r10 is used as
- ; temp variable here. So after r10 is used, reload
- ; vp8_coef_tree into r10
- ldr r10, [sp, #88] ; vp8_coef_tree
-
-token_count_lt_zero
- lsl r2, r2, r6 ; lowvalue <<= shift
-
- subs r8, r8, #1 ; --n
- bne token_loop
-
- ldrb r6, [r1, #tokenextra_token] ; t
- ldr r7, [sp, #84] ; vp8_extra_bits
- ; Add t * sizeof (vp8_extra_bit_struct) to get the desired
- ; element. Here vp8_extra_bit_struct == 16
- add r12, r7, r6, lsl #4 ; b = vp8_extra_bits + t
-
- ldr r4, [r12, #vp8_extra_bit_struct_base_val]
- cmp r4, #0
- beq skip_extra_bits
-
-; if( b->base_val)
- ldr r8, [r12, #vp8_extra_bit_struct_len] ; L
- ldrsh lr, [r1, #tokenextra_extra] ; e = p->Extra
- cmp r8, #0 ; if( L)
- beq no_extra_bits
-
- ldr r9, [r12, #vp8_extra_bit_struct_prob]
- asr r7, lr, #1 ; v=e>>1
-
- ldr r10, [r12, #vp8_extra_bit_struct_tree]
- str r10, [sp, #4] ; b->tree
-
- rsb r4, r8, #32
- lsl r12, r7, r4
-
- mov lr, #0 ; i = 0
-
-extra_bits_loop
- ldrb r4, [r9, lr, asr #1] ; pp[i>>1]
- sub r7, r5, #1 ; range-1
- lsls r12, r12, #1 ; v >> n
- mul r6, r4, r7 ; (range-1) * pp[i>>1]
- addcs lr, lr, #1 ; i + bb
-
- mov r7, #1
- ldrsb lr, [r10, lr] ; i = b->tree[i+bb]
- add r4, r7, r6, lsr #8 ; split = 1 + (((range-1) * pp[i>>1]) >> 8)
-
- addcs r2, r2, r4 ; if (bb) lowvalue += split
- subcs r4, r5, r4 ; if (bb) range = range-split
-
- clz r6, r4
- sub r6, r6, #24
-
- adds r3, r3, r6 ; count += shift
- lsl r5, r4, r6 ; range <<= shift
- bmi extra_count_lt_zero ; if(count >= 0)
-
- sub r6, r6, r3 ; offset= shift - count
- sub r4, r6, #1 ; offset-1
- lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 )
- bpl extra_high_bit_not_set
-
- ldr r4, [r0, #vp8_writer_pos] ; x
- sub r4, r4, #1 ; x = w->pos - 1
- b extra_zero_while_start
-extra_zero_while_loop
- mov r10, #0
- strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0
- sub r4, r4, #1 ; x--
-extra_zero_while_start
- cmp r4, #0
- ldrge r7, [r0, #vp8_writer_buffer]
- ldrb r11, [r7, r4]
- cmpge r11, #0xff
- beq extra_zero_while_loop
-
- ldr r7, [r0, #vp8_writer_buffer]
- ldrb r10, [r7, r4]
- add r10, r10, #1
- strb r10, [r7, r4]
-extra_high_bit_not_set
- rsb r4, r6, #24 ; 24-offset
- ldr r10, [r0, #vp8_writer_buffer]
- lsr r7, r2, r4 ; lowvalue >> (24-offset)
- ldr r4, [r0, #vp8_writer_pos]
- lsl r2, r2, r6 ; lowvalue <<= offset
- mov r6, r3 ; shift = count
- add r11, r4, #1 ; w->pos++
- bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
- str r11, [r0, #vp8_writer_pos]
- sub r3, r3, #8 ; count -= 8
-
- VALIDATE_POS r10, r11 ; validate_buffer at pos
-
- strb r7, [r10, r4] ; w->buffer[w->pos++]=(lowvalue >> (24-offset))
- ldr r10, [sp, #4] ; b->tree
-extra_count_lt_zero
- lsl r2, r2, r6
-
- subs r8, r8, #1 ; --n
- bne extra_bits_loop ; while (n)
-
-no_extra_bits
- ldr lr, [r1, #4] ; e = p->Extra
- add r4, r5, #1 ; range + 1
- tst lr, #1
- lsr r4, r4, #1 ; split = (range + 1) >> 1
- addne r2, r2, r4 ; lowvalue += split
- subne r4, r5, r4 ; range = range-split
- tst r2, #0x80000000 ; lowvalue & 0x80000000
- lsl r5, r4, #1 ; range <<= 1
- beq end_high_bit_not_set
-
- ldr r4, [r0, #vp8_writer_pos]
- mov r7, #0
- sub r4, r4, #1
- b end_zero_while_start
-end_zero_while_loop
- strb r7, [r6, r4]
- sub r4, r4, #1 ; x--
-end_zero_while_start
- cmp r4, #0
- ldrge r6, [r0, #vp8_writer_buffer]
- ldrb r12, [r6, r4]
- cmpge r12, #0xff
- beq end_zero_while_loop
-
- ldr r6, [r0, #vp8_writer_buffer]
- ldrb r7, [r6, r4]
- add r7, r7, #1
- strb r7, [r6, r4]
-end_high_bit_not_set
- adds r3, r3, #1 ; ++count
- lsl r2, r2, #1 ; lowvalue <<= 1
- bne end_count_zero
-
- ldr r4, [r0, #vp8_writer_pos]
- mvn r3, #7 ; count = -8
- ldr r7, [r0, #vp8_writer_buffer]
- lsr r6, r2, #24 ; lowvalue >> 24
- add r12, r4, #1 ; w->pos++
- bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
- str r12, [r0, #vp8_writer_pos]
-
- VALIDATE_POS r7, r12 ; validate_buffer at pos
-
- strb r6, [r7, r4]
-end_count_zero
-skip_extra_bits
- add r1, r1, #TOKENEXTRA_SZ ; ++p
-check_p_lt_stop
- ldr r4, [sp, #0] ; stop
- cmp r1, r4 ; while( p < stop)
- bcc while_p_lt_stop
-
- ldr r10, [sp, #20] ; num_part
- mov r1, #TOKENLIST_SZ
- mul r1, r10, r1
-
- ldr r6, [sp, #12] ; mb_rows
- ldr r7, [sp, #16] ; tokenlist address
- subs r6, r6, r10
- add r7, r7, r1 ; next element in the array
- str r6, [sp, #12]
- bgt mb_row_loop
-
-end_partition
- mov r12, #32
-
-stop_encode_loop
- sub r7, r5, #1 ; range-1
-
- mov r4, r7, lsl #7 ; ((range-1) * 128)
-
- mov r7, #1
- add r4, r7, r4, lsr #8 ; 1 + (((range-1) * 128) >> 8)
-
- ; Counting the leading zeros is used to normalize range.
- clz r6, r4
- sub r6, r6, #24 ; shift
-
- ; Flag is set on the sum of count. This flag is used later
- ; to determine if count >= 0
- adds r3, r3, r6 ; count += shift
- lsl r5, r4, r6 ; range <<= shift
- bmi token_count_lt_zero_se ; if(count >= 0)
-
- sub r6, r6, r3 ; offset = shift - count
- sub r4, r6, #1 ; offset-1
- lsls r4, r2, r4 ; if((lowvalue<<(offset-1)) & 0x80000000 )
- bpl token_high_bit_not_set_se
-
- ldr r4, [r0, #vp8_writer_pos] ; x
- sub r4, r4, #1 ; x = w->pos-1
- b token_zero_while_start_se
-token_zero_while_loop_se
- mov r10, #0
- strb r10, [r7, r4] ; w->buffer[x] =(unsigned char)0
- sub r4, r4, #1 ; x--
-token_zero_while_start_se
- cmp r4, #0
- ldrge r7, [r0, #vp8_writer_buffer]
- ldrb r11, [r7, r4]
- cmpge r11, #0xff
- beq token_zero_while_loop_se
-
- ldr r7, [r0, #vp8_writer_buffer]
- ldrb r10, [r7, r4] ; w->buffer[x]
- add r10, r10, #1
- strb r10, [r7, r4] ; w->buffer[x] + 1
-token_high_bit_not_set_se
- rsb r4, r6, #24 ; 24-offset
- ldr r10, [r0, #vp8_writer_buffer]
- lsr r7, r2, r4 ; lowvalue >> (24-offset)
- ldr r4, [r0, #vp8_writer_pos] ; w->pos
- lsl r2, r2, r6 ; lowvalue <<= offset
- mov r6, r3 ; shift = count
- add r11, r4, #1 ; w->pos++
- bic r2, r2, #0xff000000 ; lowvalue &= 0xffffff
- str r11, [r0, #vp8_writer_pos]
- sub r3, r3, #8 ; count -= 8
-
- VALIDATE_POS r10, r11 ; validate_buffer at pos
-
- strb r7, [r10, r4] ; w->buffer[w->pos++]
-
-token_count_lt_zero_se
- lsl r2, r2, r6 ; lowvalue <<= shift
-
- subs r12, r12, #1
- bne stop_encode_loop
-
- ldr r4, [r0, #vp8_writer_pos] ; w->pos
- ldr r12, [sp, #24] ; ptr
- add r12, r12, r4 ; ptr += w->pos
- str r12, [sp, #24]
-
- ldr r11, [sp, #28] ; i
- ldr r10, [sp, #20] ; num_part
-
- add r11, r11, #1 ; i++
- str r11, [sp, #28]
-
- ldr r7, [sp, #32] ; cpi->tp_list[i]
- mov r1, #TOKENLIST_SZ
- add r7, r7, r1 ; next element in cpi->tp_list
- str r7, [sp, #32] ; cpi->tp_list[i+1]
-
- cmp r10, r11
- bgt numparts_loop
-
- add sp, sp, #40
- pop {r4-r12, pc}
- ENDP
-
-_VP8_COMP_common_
- DCD vp8_comp_common
-_VP8_COMMON_MBrows_
- DCD vp8_common_mb_rows
-_VP8_COMP_tplist_
- DCD vp8_comp_tplist
-_VP8_COMP_bc_
- DCD vp8_comp_bc
-_vp8_writer_sz_
- DCD vp8_writer_sz
-
- END
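The partition packer deleted above adds one more level of control flow: partition i gets its own writer (the assembly starts at cpi->bc + 1 and steps by sizeof(vp8_writer)), is reset by the inlined start_encode, packs macroblock rows i, i + num_part, i + 2*num_part and so on, and is flushed by the inlined stop_encode (32 bits coded at probability 128). A sketch under the same illustrative types as the earlier sketches:

static void pack_tokens_into_partitions_sketch(writer_sketch *bc, /* cpi->bc */
                                               const tokenlist_sketch *tp_list,
                                               int mb_rows,
                                               unsigned char *cx_data,
                                               int num_part,
                                               const coef_encoding_sketch *encodings,
                                               const signed char *coef_tree)
{
    unsigned char *ptr = cx_data;
    int i, j, mb_row;

    for (i = 0; i < num_part; i++) {
        writer_sketch *w = &bc[i + 1];        /* asm starts at cpi->bc + 1 */
        const token_sketch *p;

        /* inlined start_encode: reset the writer over the shared buffer */
        w->lowvalue = 0;
        w->range    = 255;
        w->count    = -24;
        w->pos      = 0;
        w->buffer   = ptr;

        for (mb_row = i; mb_row < mb_rows; mb_row += num_part)
            for (p = tp_list[mb_row].start; p < tp_list[mb_row].stop; p++)
                pack_token_sketch(w, p, encodings + p->Token, coef_tree);

        for (j = 0; j < 32; j++)              /* inlined stop_encode */
            encode_bool_sketch(w, 0, 128);

        ptr += w->pos;                        /* next partition follows */
    }
}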
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/arm/armv6/vp8_fast_quantize_b_armv6.asm b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/arm/armv6/vp8_fast_quantize_b_armv6.asm
deleted file mode 100644
index de35a1e13ca..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/arm/armv6/vp8_fast_quantize_b_armv6.asm
+++ /dev/null
@@ -1,225 +0,0 @@
-;
-; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_fast_quantize_b_armv6|
-
- INCLUDE vp8_asm_enc_offsets.asm
-
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; r0 BLOCK *b
-; r1 BLOCKD *d
-|vp8_fast_quantize_b_armv6| PROC
- stmfd sp!, {r1, r4-r11, lr}
-
- ldr r3, [r0, #vp8_block_coeff] ; coeff
- ldr r4, [r0, #vp8_block_quant_fast] ; quant_fast
- ldr r5, [r0, #vp8_block_round] ; round
- ldr r6, [r1, #vp8_blockd_qcoeff] ; qcoeff
- ldr r7, [r1, #vp8_blockd_dqcoeff] ; dqcoeff
- ldr r8, [r1, #vp8_blockd_dequant] ; dequant
-
- ldr r2, loop_count ; loop_count=0x1000000. 'lsls' instruction
- ; is used to update the counter so that
- ; it can be used to mark nonzero
- ; quantized coefficient pairs.
-
- mov r1, #0 ; flags for quantized coeffs
-
- ; PART 1: quantization and dequantization loop
-loop
- ldr r9, [r3], #4 ; [z1 | z0]
- ldr r10, [r5], #4 ; [r1 | r0]
- ldr r11, [r4], #4 ; [q1 | q0]
-
- ssat16 lr, #1, r9 ; [sz1 | sz0]
- eor r9, r9, lr ; [z1 ^ sz1 | z0 ^ sz0]
- ssub16 r9, r9, lr ; x = (z ^ sz) - sz
- sadd16 r9, r9, r10 ; [x1+r1 | x0+r0]
-
- ldr r12, [r3], #4 ; [z3 | z2]
-
- smulbb r0, r9, r11 ; [(x0+r0)*q0]
- smultt r9, r9, r11 ; [(x1+r1)*q1]
-
- ldr r10, [r5], #4 ; [r3 | r2]
-
- ssat16 r11, #1, r12 ; [sz3 | sz2]
- eor r12, r12, r11 ; [z3 ^ sz3 | z2 ^ sz2]
- pkhtb r0, r9, r0, asr #16 ; [y1 | y0]
- ldr r9, [r4], #4 ; [q3 | q2]
- ssub16 r12, r12, r11 ; x = (z ^ sz) - sz
-
- sadd16 r12, r12, r10 ; [x3+r3 | x2+r2]
-
- eor r0, r0, lr ; [(y1 ^ sz1) | (y0 ^ sz0)]
-
- smulbb r10, r12, r9 ; [(x2+r2)*q2]
- smultt r12, r12, r9 ; [(x3+r3)*q3]
-
- ssub16 r0, r0, lr ; x = (y ^ sz) - sz
-
- cmp r0, #0 ; check if zero
- orrne r1, r1, r2, lsr #24 ; add flag for nonzero coeffs
-
- str r0, [r6], #4 ; *qcoeff++ = x
- ldr r9, [r8], #4 ; [dq1 | dq0]
-
- pkhtb r10, r12, r10, asr #16 ; [y3 | y2]
- eor r10, r10, r11 ; [(y3 ^ sz3) | (y2 ^ sz2)]
- ssub16 r10, r10, r11 ; x = (y ^ sz) - sz
-
- cmp r10, #0 ; check if zero
- orrne r1, r1, r2, lsr #23 ; add flag for nonzero coeffs
-
- str r10, [r6], #4 ; *qcoeff++ = x
- ldr r11, [r8], #4 ; [dq3 | dq2]
-
- smulbb r12, r0, r9 ; [x0*dq0]
- smultt r0, r0, r9 ; [x1*dq1]
-
- smulbb r9, r10, r11 ; [x2*dq2]
- smultt r10, r10, r11 ; [x3*dq3]
-
- lsls r2, r2, #2 ; update loop counter
- strh r12, [r7, #0] ; dqcoeff[0] = [x0*dq0]
- strh r0, [r7, #2] ; dqcoeff[1] = [x1*dq1]
- strh r9, [r7, #4] ; dqcoeff[2] = [x2*dq2]
- strh r10, [r7, #6] ; dqcoeff[3] = [x3*dq3]
- add r7, r7, #8 ; dqcoeff += 8
- bne loop
-
- ; PART 2: check position for eob...
- ldr r11, [sp, #0] ; restore BLOCKD pointer
- mov lr, #0 ; init eob
- cmp r1, #0 ; coeffs after quantization?
- ldr r12, [r11, #vp8_blockd_eob]
- beq end ; skip eob calculations if all zero
-
- ldr r0, [r11, #vp8_blockd_qcoeff]
-
- ; check shortcut for nonzero qcoeffs
- tst r1, #0x80
- bne quant_coeff_15_14
- tst r1, #0x20
- bne quant_coeff_13_11
- tst r1, #0x8
- bne quant_coeff_12_7
- tst r1, #0x40
- bne quant_coeff_10_9
- tst r1, #0x10
- bne quant_coeff_8_3
- tst r1, #0x2
- bne quant_coeff_6_5
- tst r1, #0x4
- bne quant_coeff_4_2
- b quant_coeff_1_0
-
-quant_coeff_15_14
- ldrh r2, [r0, #30] ; rc=15, i=15
- mov lr, #16
- cmp r2, #0
- bne end
-
- ldrh r3, [r0, #28] ; rc=14, i=14
- mov lr, #15
- cmp r3, #0
- bne end
-
-quant_coeff_13_11
- ldrh r2, [r0, #22] ; rc=11, i=13
- mov lr, #14
- cmp r2, #0
- bne end
-
-quant_coeff_12_7
- ldrh r3, [r0, #14] ; rc=7, i=12
- mov lr, #13
- cmp r3, #0
- bne end
-
- ldrh r2, [r0, #20] ; rc=10, i=11
- mov lr, #12
- cmp r2, #0
- bne end
-
-quant_coeff_10_9
- ldrh r3, [r0, #26] ; rc=13, i=10
- mov lr, #11
- cmp r3, #0
- bne end
-
- ldrh r2, [r0, #24] ; rc=12, i=9
- mov lr, #10
- cmp r2, #0
- bne end
-
-quant_coeff_8_3
- ldrh r3, [r0, #18] ; rc=9, i=8
- mov lr, #9
- cmp r3, #0
- bne end
-
- ldrh r2, [r0, #12] ; rc=6, i=7
- mov lr, #8
- cmp r2, #0
- bne end
-
-quant_coeff_6_5
- ldrh r3, [r0, #6] ; rc=3, i=6
- mov lr, #7
- cmp r3, #0
- bne end
-
- ldrh r2, [r0, #4] ; rc=2, i=5
- mov lr, #6
- cmp r2, #0
- bne end
-
-quant_coeff_4_2
- ldrh r3, [r0, #10] ; rc=5, i=4
- mov lr, #5
- cmp r3, #0
- bne end
-
- ldrh r2, [r0, #16] ; rc=8, i=3
- mov lr, #4
- cmp r2, #0
- bne end
-
- ldrh r3, [r0, #8] ; rc=4, i=2
- mov lr, #3
- cmp r3, #0
- bne end
-
-quant_coeff_1_0
- ldrh r2, [r0, #2] ; rc=1, i=1
- mov lr, #2
- cmp r2, #0
- bne end
-
- mov lr, #1 ; rc=0, i=0
-
-end
- strb lr, [r12]
- ldmfd sp!, {r1, r4-r11, pc}
-
- ENDP
-
-loop_count
- DCD 0x1000000
-
- END
-
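The deleted ARMv6 fast quantizer processes coefficient pairs with ssat16/ssub16/smulbb, but the per-coefficient math is spelled out in its comments: x = (z ^ sz) - sz, y = ((x + round) * quant) >> 16, the sign restored the same way, dqcoeff = x * dequant, with eob tracked in zig-zag scan order (PART 2 above). A scalar C sketch of the same math, using the scan order implied by the inv_zig_zag table that appears further down in this diff; the function shape and names are illustrative:

static void fast_quantize_b_sketch(const short *coeff, const short *round,
                                   const short *quant_fast,
                                   const short *dequant,
                                   short *qcoeff, short *dqcoeff, char *eob)
{
    /* zig-zag scan order: the 0-based inverse of inv_zig_zag below */
    static const int zig_zag[16] = { 0, 1, 4, 8, 5, 2, 3, 6,
                                     9, 12, 13, 10, 7, 11, 14, 15 };
    int i, last = 0;

    for (i = 0; i < 16; i++) {
        const int rc = zig_zag[i];
        const int z  = coeff[rc];
        const int sz = z >> 31;                       /* 0 or -1 (sign) */
        int x = (z ^ sz) - sz;                        /* x = abs(z) */
        x = ((x + round[rc]) * quant_fast[rc]) >> 16; /* quantize */
        x = (x ^ sz) - sz;                            /* restore sign */
        qcoeff[rc]  = (short)x;
        dqcoeff[rc] = (short)(x * dequant[rc]);       /* dequantize */
        if (x)
            last = i + 1;                             /* 1-based eob */
    }
    *eob = (char)last;
}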
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/arm/armv6/vp8_subtract_armv6.asm b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/arm/armv6/vp8_subtract_armv6.asm
deleted file mode 100644
index 05746cf7fe9..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/arm/armv6/vp8_subtract_armv6.asm
+++ /dev/null
@@ -1,272 +0,0 @@
-;
-; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_subtract_mby_armv6|
- EXPORT |vp8_subtract_mbuv_armv6|
- EXPORT |vp8_subtract_b_armv6|
-
- INCLUDE vp8_asm_enc_offsets.asm
-
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; r0 BLOCK *be
-; r1 BLOCKD *bd
-; r2 int pitch
-|vp8_subtract_b_armv6| PROC
-
- stmfd sp!, {r4-r9}
-
- ldr r4, [r0, #vp8_block_base_src]
- ldr r5, [r0, #vp8_block_src]
- ldr r6, [r0, #vp8_block_src_diff]
-
- ldr r3, [r4]
- ldr r7, [r0, #vp8_block_src_stride]
- add r3, r3, r5 ; src = *base_src + src
- ldr r8, [r1, #vp8_blockd_predictor]
-
- mov r9, #4 ; loop count
-
-loop_block
-
- ldr r0, [r3], r7 ; src
- ldr r1, [r8], r2 ; pred
-
- uxtb16 r4, r0 ; [s2 | s0]
- uxtb16 r5, r1 ; [p2 | p0]
- uxtb16 r0, r0, ror #8 ; [s3 | s1]
- uxtb16 r1, r1, ror #8 ; [p3 | p1]
-
- usub16 r4, r4, r5 ; [d2 | d0]
- usub16 r5, r0, r1 ; [d3 | d1]
-
- subs r9, r9, #1 ; decrement loop counter
-
- pkhbt r0, r4, r5, lsl #16 ; [d1 | d0]
- pkhtb r1, r5, r4, asr #16 ; [d3 | d2]
-
- str r0, [r6, #0] ; diff
- str r1, [r6, #4] ; diff
-
- add r6, r6, r2, lsl #1 ; update diff pointer
- bne loop_block
-
- ldmfd sp!, {r4-r9}
- mov pc, lr
-
- ENDP
-
-
-; r0 short *diff
-; r1 unsigned char *usrc
-; r2 unsigned char *vsrc
-; r3 int src_stride
-; sp unsigned char *upred
-; sp unsigned char *vpred
-; sp int pred_stride
-|vp8_subtract_mbuv_armv6| PROC
-
- stmfd sp!, {r4-r11}
-
- add r0, r0, #512 ; set *diff point to Cb
- mov r4, #8 ; loop count
- ldr r5, [sp, #32] ; upred
- ldr r12, [sp, #40] ; pred_stride
-
- ; Subtract U block
-loop_u
- ldr r6, [r1] ; usrc (A)
- ldr r7, [r5] ; upred (A)
-
- uxtb16 r8, r6 ; [s2 | s0] (A)
- uxtb16 r9, r7 ; [p2 | p0] (A)
- uxtb16 r10, r6, ror #8 ; [s3 | s1] (A)
- uxtb16 r11, r7, ror #8 ; [p3 | p1] (A)
-
- usub16 r6, r8, r9 ; [d2 | d0] (A)
- usub16 r7, r10, r11 ; [d3 | d1] (A)
-
- ldr r10, [r1, #4] ; usrc (B)
- ldr r11, [r5, #4] ; upred (B)
-
- pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (A)
- pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (A)
-
- str r8, [r0], #4 ; diff (A)
- uxtb16 r8, r10 ; [s2 | s0] (B)
- str r9, [r0], #4 ; diff (A)
-
- uxtb16 r9, r11 ; [p2 | p0] (B)
- uxtb16 r10, r10, ror #8 ; [s3 | s1] (B)
- uxtb16 r11, r11, ror #8 ; [p3 | p1] (B)
-
- usub16 r6, r8, r9 ; [d2 | d0] (B)
- usub16 r7, r10, r11 ; [d3 | d1] (B)
-
- add r1, r1, r3 ; update usrc pointer
- add r5, r5, r12 ; update upred pointer
-
- pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (B)
- pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (B)
-
- str r8, [r0], #4 ; diff (B)
- subs r4, r4, #1 ; update loop counter
- str r9, [r0], #4 ; diff (B)
-
- bne loop_u
-
- ldr r5, [sp, #36] ; vpred
- mov r4, #8 ; loop count
-
- ; Subtract V block
-loop_v
- ldr r6, [r2] ; vsrc (A)
- ldr r7, [r5] ; vpred (A)
-
- uxtb16 r8, r6 ; [s2 | s0] (A)
- uxtb16 r9, r7 ; [p2 | p0] (A)
- uxtb16 r10, r6, ror #8 ; [s3 | s1] (A)
- uxtb16 r11, r7, ror #8 ; [p3 | p1] (A)
-
- usub16 r6, r8, r9 ; [d2 | d0] (A)
- usub16 r7, r10, r11 ; [d3 | d1] (A)
-
- ldr r10, [r2, #4] ; vsrc (B)
- ldr r11, [r5, #4] ; vpred (B)
-
- pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (A)
- pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (A)
-
- str r8, [r0], #4 ; diff (A)
- uxtb16 r8, r10 ; [s2 | s0] (B)
- str r9, [r0], #4 ; diff (A)
-
- uxtb16 r9, r11 ; [p2 | p0] (B)
- uxtb16 r10, r10, ror #8 ; [s3 | s1] (B)
- uxtb16 r11, r11, ror #8 ; [p3 | p1] (B)
-
- usub16 r6, r8, r9 ; [d2 | d0] (B)
- usub16 r7, r10, r11 ; [d3 | d1] (B)
-
- add r2, r2, r3 ; update vsrc pointer
- add r5, r5, r12 ; update vpred pointer
-
- pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (B)
- pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (B)
-
- str r8, [r0], #4 ; diff (B)
- subs r4, r4, #1 ; update loop counter
- str r9, [r0], #4 ; diff (B)
-
- bne loop_v
-
- ldmfd sp!, {r4-r11}
- bx lr
-
- ENDP
-
-
-; r0 short *diff
-; r1 unsigned char *src
-; r2 int src_stride
-; r3 unsigned char *pred
-; sp int pred_stride
-|vp8_subtract_mby_armv6| PROC
-
- stmfd sp!, {r4-r11}
- ldr r12, [sp, #32] ; pred_stride
- mov r4, #16
-loop
- ldr r6, [r1] ; src (A)
- ldr r7, [r3] ; pred (A)
-
- uxtb16 r8, r6 ; [s2 | s0] (A)
- uxtb16 r9, r7 ; [p2 | p0] (A)
- uxtb16 r10, r6, ror #8 ; [s3 | s1] (A)
- uxtb16 r11, r7, ror #8 ; [p3 | p1] (A)
-
- usub16 r6, r8, r9 ; [d2 | d0] (A)
- usub16 r7, r10, r11 ; [d3 | d1] (A)
-
- ldr r10, [r1, #4] ; src (B)
- ldr r11, [r3, #4] ; pred (B)
-
- pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (A)
- pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (A)
-
- str r8, [r0], #4 ; diff (A)
- uxtb16 r8, r10 ; [s2 | s0] (B)
- str r9, [r0], #4 ; diff (A)
-
- uxtb16 r9, r11 ; [p2 | p0] (B)
- uxtb16 r10, r10, ror #8 ; [s3 | s1] (B)
- uxtb16 r11, r11, ror #8 ; [p3 | p1] (B)
-
- usub16 r6, r8, r9 ; [d2 | d0] (B)
- usub16 r7, r10, r11 ; [d3 | d1] (B)
-
- ldr r10, [r1, #8] ; src (C)
- ldr r11, [r3, #8] ; pred (C)
-
- pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (B)
- pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (B)
-
- str r8, [r0], #4 ; diff (B)
- uxtb16 r8, r10 ; [s2 | s0] (C)
- str r9, [r0], #4 ; diff (B)
-
- uxtb16 r9, r11 ; [p2 | p0] (C)
- uxtb16 r10, r10, ror #8 ; [s3 | s1] (C)
- uxtb16 r11, r11, ror #8 ; [p3 | p1] (C)
-
- usub16 r6, r8, r9 ; [d2 | d0] (C)
- usub16 r7, r10, r11 ; [d3 | d1] (C)
-
- ldr r10, [r1, #12] ; src (D)
- ldr r11, [r3, #12] ; pred (D)
-
- pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (C)
- pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (C)
-
- str r8, [r0], #4 ; diff (C)
- uxtb16 r8, r10 ; [s2 | s0] (D)
- str r9, [r0], #4 ; diff (C)
-
- uxtb16 r9, r11 ; [p2 | p0] (D)
- uxtb16 r10, r10, ror #8 ; [s3 | s1] (D)
- uxtb16 r11, r11, ror #8 ; [p3 | p1] (D)
-
- usub16 r6, r8, r9 ; [d2 | d0] (D)
- usub16 r7, r10, r11 ; [d3 | d1] (D)
-
- add r1, r1, r2 ; update src pointer
- add r3, r3, r12 ; update pred pointer
-
- pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (D)
- pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (D)
-
- str r8, [r0], #4 ; diff (D)
- subs r4, r4, #1 ; update loop counter
- str r9, [r0], #4 ; diff (D)
-
- bne loop
-
- ldmfd sp!, {r4-r11}
- bx lr
-
- ENDP
-
- END
-
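The deleted subtract kernels pack four pixels per 32-bit word with uxtb16/usub16, but all three entry points compute the same elementwise residual diff = src - pred. A plain C sketch follows; vp8_subtract_mby covers the 16x16 luma block, and vp8_subtract_mbuv runs the same loop for the 8x8 U and V planes with diff offset by 256 shorts (the assembly advances the diff pointer by 512 bytes up front):

static void subtract_block_sketch(short *diff, int diff_stride,
                                  const unsigned char *src, int src_stride,
                                  const unsigned char *pred, int pred_stride,
                                  int rows, int cols)
{
    int r, c;
    for (r = 0; r < rows; r++) {
        for (c = 0; c < cols; c++)
            diff[c] = (short)(src[c] - pred[c]);
        diff += diff_stride;
        src  += src_stride;
        pred += pred_stride;
    }
}

/* e.g. luma: subtract_block_sketch(diff,       16, src,  src_stride,
                                    pred,  pred_stride, 16, 16);
        U:    subtract_block_sketch(diff + 256,  8, usrc, src_stride,
                                    upred, pred_stride,  8,  8);
        V:    subtract_block_sketch(diff + 320,  8, vsrc, src_stride,
                                    vpred, pred_stride,  8,  8);           */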
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/arm/boolhuff_arm.c b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/arm/boolhuff_arm.c
deleted file mode 100644
index 17a941bfc65..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/arm/boolhuff_arm.c
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vp8/encoder/boolhuff.h"
-#include "vpx/internal/vpx_codec_internal.h"
-
-const unsigned int vp8_prob_cost[256] =
-{
- 2047, 2047, 1791, 1641, 1535, 1452, 1385, 1328, 1279, 1235, 1196, 1161, 1129, 1099, 1072, 1046,
- 1023, 1000, 979, 959, 940, 922, 905, 889, 873, 858, 843, 829, 816, 803, 790, 778,
- 767, 755, 744, 733, 723, 713, 703, 693, 684, 675, 666, 657, 649, 641, 633, 625,
- 617, 609, 602, 594, 587, 580, 573, 567, 560, 553, 547, 541, 534, 528, 522, 516,
- 511, 505, 499, 494, 488, 483, 477, 472, 467, 462, 457, 452, 447, 442, 437, 433,
- 428, 424, 419, 415, 410, 406, 401, 397, 393, 389, 385, 381, 377, 373, 369, 365,
- 361, 357, 353, 349, 346, 342, 338, 335, 331, 328, 324, 321, 317, 314, 311, 307,
- 304, 301, 297, 294, 291, 288, 285, 281, 278, 275, 272, 269, 266, 263, 260, 257,
- 255, 252, 249, 246, 243, 240, 238, 235, 232, 229, 227, 224, 221, 219, 216, 214,
- 211, 208, 206, 203, 201, 198, 196, 194, 191, 189, 186, 184, 181, 179, 177, 174,
- 172, 170, 168, 165, 163, 161, 159, 156, 154, 152, 150, 148, 145, 143, 141, 139,
- 137, 135, 133, 131, 129, 127, 125, 123, 121, 119, 117, 115, 113, 111, 109, 107,
- 105, 103, 101, 99, 97, 95, 93, 92, 90, 88, 86, 84, 82, 81, 79, 77,
- 75, 73, 72, 70, 68, 66, 65, 63, 61, 60, 58, 56, 55, 53, 51, 50,
- 48, 46, 45, 43, 41, 40, 38, 37, 35, 33, 32, 30, 29, 27, 25, 24,
- 22, 21, 19, 18, 16, 15, 13, 12, 10, 9, 7, 6, 4, 3, 1, 1
-};
-
-int vp8_validate_buffer_arm(const unsigned char *start,
- size_t len,
- const unsigned char *end,
- struct vpx_internal_error_info *error)
-{
- return validate_buffer(start, len, end, error);
-}
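The vp8_prob_cost table removed above stores, for a bit of probability p/256, an approximation of its information content in 1/256-bit units, roughly -256 * log2(p / 256): entry 128 is about 256 (one bit) and entry 1 about 2048 (eight bits), with the stored values sitting slightly below the exact figures. A hedged generator sketch, an approximation rather than the exact formula libvpx used:

#include <math.h>

/* Approximate generator for the table above (sketch only; the stored
   values sit a hair below these at exact powers of two). */
static unsigned int prob_cost_sketch(int p)   /* p in [1, 255] */
{
    const double bits = -log2(p / 256.0);     /* information content */
    const unsigned int c = (unsigned int)(256.0 * bits);
    return c < 1 ? 1 : c;                     /* table clamps to >= 1 */
}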
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/arm/neon/fastquantizeb_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/arm/neon/fastquantizeb_neon.asm
deleted file mode 100644
index 9374310e58f..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/arm/neon/fastquantizeb_neon.asm
+++ /dev/null
@@ -1,258 +0,0 @@
-;
-; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_fast_quantize_b_neon|
- EXPORT |vp8_fast_quantize_b_pair_neon|
-
- INCLUDE vp8_asm_enc_offsets.asm
-
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=4
-
-;vp8_fast_quantize_b_pair_neon(BLOCK *b1, BLOCK *b2, BLOCKD *d1, BLOCKD *d2);
-|vp8_fast_quantize_b_pair_neon| PROC
-
- stmfd sp!, {r4-r9}
- vstmdb sp!, {q4-q7}
-
- ldr r4, [r0, #vp8_block_coeff]
- ldr r5, [r0, #vp8_block_quant_fast]
- ldr r6, [r0, #vp8_block_round]
-
- vld1.16 {q0, q1}, [r4@128] ; load z
-
- ldr r7, [r2, #vp8_blockd_qcoeff]
-
- vabs.s16 q4, q0 ; calculate x = abs(z)
- vabs.s16 q5, q1
-
- ;right shift 15 to get sign, all 0 if it is positive, all 1 if it is negative
- vshr.s16 q2, q0, #15 ; sz
- vshr.s16 q3, q1, #15
-
- vld1.s16 {q6, q7}, [r6@128] ; load round_ptr [0-15]
- vld1.s16 {q8, q9}, [r5@128] ; load quant_ptr [0-15]
-
- ldr r4, [r1, #vp8_block_coeff]
-
- vadd.s16 q4, q6 ; x + Round
- vadd.s16 q5, q7
-
- vld1.16 {q0, q1}, [r4@128] ; load z2
-
- vqdmulh.s16 q4, q8 ; y = ((Round+abs(z)) * Quant) >> 16
- vqdmulh.s16 q5, q9
-
- vabs.s16 q10, q0 ; calculate x2 = abs(z_2)
- vabs.s16 q11, q1
- vshr.s16 q12, q0, #15 ; sz2
- vshr.s16 q13, q1, #15
-
- ;modify data to have its original sign
- veor.s16 q4, q2 ; y^sz
- veor.s16 q5, q3
-
- vadd.s16 q10, q6 ; x2 + Round
- vadd.s16 q11, q7
-
- ldr r8, [r2, #vp8_blockd_dequant]
-
- vqdmulh.s16 q10, q8 ; y2 = ((Round+abs(z)) * Quant) >> 16
- vqdmulh.s16 q11, q9
-
- vshr.s16 q4, #1 ; right shift 1 after vqdmulh
- vshr.s16 q5, #1
-
- vld1.s16 {q6, q7}, [r8@128] ;load dequant_ptr[i]
-
- vsub.s16 q4, q2 ; x1=(y^sz)-sz = (y^sz)-(-1) (2's complement)
- vsub.s16 q5, q3
-
- vshr.s16 q10, #1 ; right shift 1 after vqdmulh
- vshr.s16 q11, #1
-
- ldr r9, [r2, #vp8_blockd_dqcoeff]
-
- veor.s16 q10, q12 ; y2^sz2
- veor.s16 q11, q13
-
- vst1.s16 {q4, q5}, [r7] ; store: qcoeff = x1
-
-
- vsub.s16 q10, q12 ; x2=(y^sz)-sz = (y^sz)-(-1) (2's complement)
- vsub.s16 q11, q13
-
- ldr r6, [r3, #vp8_blockd_qcoeff]
-
- vmul.s16 q2, q6, q4 ; x * Dequant
- vmul.s16 q3, q7, q5
-
- adr r0, inv_zig_zag ; load ptr of inverse zigzag table
-
- vceq.s16 q8, q8 ; set q8 to all 1
-
- vst1.s16 {q10, q11}, [r6] ; store: qcoeff = x2
-
- vmul.s16 q12, q6, q10 ; x2 * Dequant
- vmul.s16 q13, q7, q11
-
- vld1.16 {q6, q7}, [r0@128] ; load inverse scan order
-
- vtst.16 q14, q4, q8 ; now find eob
- vtst.16 q15, q5, q8 ; non-zero element is set to all 1
-
- vst1.s16 {q2, q3}, [r9] ; store dqcoeff = x * Dequant
-
- ldr r7, [r3, #vp8_blockd_dqcoeff]
-
- vand q0, q6, q14 ; get all valid numbers from scan array
- vand q1, q7, q15
-
- vst1.s16 {q12, q13}, [r7] ; store dqcoeff = x * Dequant
-
- vtst.16 q2, q10, q8 ; now find eob
- vtst.16 q3, q11, q8 ; non-zero element is set to all 1
-
- vmax.u16 q0, q0, q1 ; find maximum value in q0, q1
-
- vand q10, q6, q2 ; get all valid numbers from scan array
- vand q11, q7, q3
- vmax.u16 q10, q10, q11 ; find maximum value in q10, q11
-
- vmax.u16 d0, d0, d1
- vmax.u16 d20, d20, d21
- vmovl.u16 q0, d0
- vmovl.u16 q10, d20
-
- vmax.u32 d0, d0, d1
- vmax.u32 d20, d20, d21
- vpmax.u32 d0, d0, d0
- vpmax.u32 d20, d20, d20
-
- ldr r4, [r2, #vp8_blockd_eob]
- ldr r5, [r3, #vp8_blockd_eob]
-
- vst1.8 {d0[0]}, [r4] ; store eob
- vst1.8 {d20[0]}, [r5] ; store eob
-
- vldmia sp!, {q4-q7}
- ldmfd sp!, {r4-r9}
- bx lr
-
- ENDP
-
-;void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d)
-|vp8_fast_quantize_b_neon| PROC
-
- stmfd sp!, {r4-r7}
-
- ldr r3, [r0, #vp8_block_coeff]
- ldr r4, [r0, #vp8_block_quant_fast]
- ldr r5, [r0, #vp8_block_round]
-
- vld1.16 {q0, q1}, [r3@128] ; load z
- vorr.s16 q14, q0, q1 ; check if all zero (step 1)
- ldr r6, [r1, #vp8_blockd_qcoeff]
- ldr r7, [r1, #vp8_blockd_dqcoeff]
- vorr.s16 d28, d28, d29 ; check if all zero (step 2)
-
- vabs.s16 q12, q0 ; calculate x = abs(z)
- vabs.s16 q13, q1
-
- ;right shift 15 to get sign, all 0 if it is positive, all 1 if it is negative
- vshr.s16 q2, q0, #15 ; sz
- vmov r2, r3, d28 ; check if all zero (step 3)
- vshr.s16 q3, q1, #15
-
- vld1.s16 {q14, q15}, [r5@128]; load round_ptr [0-15]
- vld1.s16 {q8, q9}, [r4@128] ; load quant_ptr [0-15]
-
- vadd.s16 q12, q14 ; x + Round
- vadd.s16 q13, q15
-
- adr r0, inv_zig_zag ; load ptr of inverse zigzag table
-
- vqdmulh.s16 q12, q8 ; y = ((Round+abs(z)) * Quant) >> 16
- vqdmulh.s16 q13, q9
-
- vld1.16 {q10, q11}, [r0@128]; load inverse scan order
-
- vceq.s16 q8, q8 ; set q8 to all 1
-
- ldr r4, [r1, #vp8_blockd_dequant]
-
- vshr.s16 q12, #1 ; right shift 1 after vqdmulh
- vshr.s16 q13, #1
-
- ldr r5, [r1, #vp8_blockd_eob]
-
- orr r2, r2, r3 ; check if all zero (step 4)
- cmp r2, #0 ; check if all zero (step 5)
- beq zero_output ; check if all zero (step 6)
-
- ;modify data to have its original sign
- veor.s16 q12, q2 ; y^sz
- veor.s16 q13, q3
-
- vsub.s16 q12, q2 ; x1=(y^sz)-sz = (y^sz)-(-1) (2's complement)
- vsub.s16 q13, q3
-
- vld1.s16 {q2, q3}, [r4@128] ; load dequant_ptr[i]
-
- vtst.16 q14, q12, q8 ; now find eob
- vtst.16 q15, q13, q8 ; non-zero element is set to all 1
-
- vst1.s16 {q12, q13}, [r6@128]; store: qcoeff = x1
-
- vand q10, q10, q14 ; get all valid numbers from scan array
- vand q11, q11, q15
-
-
- vmax.u16 q0, q10, q11 ; find maximum value in q0, q1
- vmax.u16 d0, d0, d1
- vmovl.u16 q0, d0
-
- vmul.s16 q2, q12 ; x * Dequant
- vmul.s16 q3, q13
-
- vmax.u32 d0, d0, d1
- vpmax.u32 d0, d0, d0
-
- vst1.s16 {q2, q3}, [r7@128] ; store dqcoeff = x * Dequant
-
- vst1.8 {d0[0]}, [r5] ; store eob
-
- ldmfd sp!, {r4-r7}
- bx lr
-
-zero_output
- strb r2, [r5] ; store eob
- vst1.s16 {q0, q1}, [r6@128] ; qcoeff = 0
- vst1.s16 {q0, q1}, [r7@128] ; dqcoeff = 0
-
- ldmfd sp!, {r4-r7}
- bx lr
-
- ENDP
-
-; default inverse zigzag table is defined in vp8/common/entropy.c
- ALIGN 16 ; enable use of @128 bit aligned loads
-inv_zig_zag
- DCW 0x0001, 0x0002, 0x0006, 0x0007
- DCW 0x0003, 0x0005, 0x0008, 0x000d
- DCW 0x0004, 0x0009, 0x000c, 0x000e
- DCW 0x000a, 0x000b, 0x000f, 0x0010
-
- END
-
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/arm/neon/fastquantizeb_neon.c b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/arm/neon/fastquantizeb_neon.c
new file mode 100644
index 00000000000..e5824bfb217
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/arm/neon/fastquantizeb_neon.c
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include "vp8/encoder/block.h"
+
+static const uint16_t inv_zig_zag[16] = {
+ 1, 2, 6, 7,
+ 3, 5, 8, 13,
+ 4, 9, 12, 14,
+ 10, 11, 15, 16
+};
+
+void vp8_fast_quantize_b_neon(BLOCK *b, BLOCKD *d) {
+ const int16x8_t one_q = vdupq_n_s16(-1),
+ z0 = vld1q_s16(b->coeff),
+ z1 = vld1q_s16(b->coeff + 8),
+ round0 = vld1q_s16(b->round),
+ round1 = vld1q_s16(b->round + 8),
+ quant0 = vld1q_s16(b->quant_fast),
+ quant1 = vld1q_s16(b->quant_fast + 8),
+ dequant0 = vld1q_s16(d->dequant),
+ dequant1 = vld1q_s16(d->dequant + 8);
+ const uint16x8_t zig_zag0 = vld1q_u16(inv_zig_zag),
+ zig_zag1 = vld1q_u16(inv_zig_zag + 8);
+ int16x8_t x0, x1, sz0, sz1, y0, y1;
+ uint16x8_t eob0, eob1;
+ uint16x4_t eob_d16;
+ uint32x2_t eob_d32;
+ uint32x4_t eob_q32;
+
+ /* sign of z: z >> 15 */
+ sz0 = vshrq_n_s16(z0, 15);
+ sz1 = vshrq_n_s16(z1, 15);
+
+ /* x = abs(z) */
+ x0 = vabsq_s16(z0);
+ x1 = vabsq_s16(z1);
+
+ /* x += round */
+ x0 = vaddq_s16(x0, round0);
+ x1 = vaddq_s16(x1, round1);
+
+ /* y = 2 * (x * quant) >> 16 */
+ y0 = vqdmulhq_s16(x0, quant0);
+ y1 = vqdmulhq_s16(x1, quant1);
+
+ /* Compensate for doubling in vqdmulhq */
+ y0 = vshrq_n_s16(y0, 1);
+ y1 = vshrq_n_s16(y1, 1);
+
+ /* Restore sign bit */
+ y0 = veorq_s16(y0, sz0);
+ y1 = veorq_s16(y1, sz1);
+ x0 = vsubq_s16(y0, sz0);
+ x1 = vsubq_s16(y1, sz1);
+
+ /* find non-zero elements */
+ eob0 = vtstq_s16(x0, one_q);
+ eob1 = vtstq_s16(x1, one_q);
+
+ /* mask zig zag */
+ eob0 = vandq_u16(eob0, zig_zag0);
+ eob1 = vandq_u16(eob1, zig_zag1);
+
+ /* select the largest value */
+ eob0 = vmaxq_u16(eob0, eob1);
+ eob_d16 = vmax_u16(vget_low_u16(eob0), vget_high_u16(eob0));
+ eob_q32 = vmovl_u16(eob_d16);
+ eob_d32 = vmax_u32(vget_low_u32(eob_q32), vget_high_u32(eob_q32));
+ eob_d32 = vpmax_u32(eob_d32, eob_d32);
+
+ /* qcoeff = x */
+ vst1q_s16(d->qcoeff, x0);
+ vst1q_s16(d->qcoeff + 8, x1);
+
+ /* dqcoeff = x * dequant */
+ vst1q_s16(d->dqcoeff, vmulq_s16(dequant0, x0));
+ vst1q_s16(d->dqcoeff + 8, vmulq_s16(dequant1, x1));
+
+ vst1_lane_s8((int8_t *)d->eob, vreinterpret_s8_u32(eob_d32), 0);
+}
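
For readers following the port above: a minimal scalar sketch of what the intrinsics compute, assuming the BLOCK/BLOCKD field types from vp8/encoder/block.h (short coefficient arrays, char *eob) and reusing the inv_zig_zag table defined in the new file (scan position + 1 per coefficient). The vqdmulhq_s16 plus right-shift-by-1 pair is a saturating form of the plain (x * quant) >> 16 product shown here. This is a sketch, not the library's implementation.

    static void fast_quantize_b_sketch(BLOCK *b, BLOCKD *d) {
      int i, eob = 0;
      for (i = 0; i < 16; ++i) {
        const int z = b->coeff[i];
        const int sz = z >> 31;              /* 0 or -1: sign mask */
        int x = (z ^ sz) - sz;               /* x = abs(z) */
        int y = ((x + b->round[i]) * b->quant_fast[i]) >> 16;
        x = (y ^ sz) - sz;                   /* restore the sign of z */
        d->qcoeff[i] = (short)x;
        d->dqcoeff[i] = (short)(x * d->dequant[i]);
        if (x && inv_zig_zag[i] > eob)       /* last nonzero, in scan order */
          eob = inv_zig_zag[i];
      }
      *d->eob = (char)eob;
    }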
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/arm/quantize_arm.c b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/arm/quantize_arm.c
deleted file mode 100644
index 80d9ad05414..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/arm/quantize_arm.c
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_config.h"
-#include "vp8_rtcd.h"
-#include "vp8/encoder/block.h"
-#include <math.h>
-#include "vpx_mem/vpx_mem.h"
-#include "vp8/encoder/quantize.h"
-#include "vp8/common/entropy.h"
-
-
-#if HAVE_NEON
-
-/* vp8_quantize_mbX functions here differs from corresponding ones in
- * quantize.c only by using quantize_b_pair function pointer instead of
- * the regular quantize_b function pointer */
-void vp8_quantize_mby_neon(MACROBLOCK *x)
-{
- int i;
- int has_2nd_order = (x->e_mbd.mode_info_context->mbmi.mode != B_PRED
- && x->e_mbd.mode_info_context->mbmi.mode != SPLITMV);
-
- for (i = 0; i < 16; i+=2)
- x->quantize_b_pair(&x->block[i], &x->block[i+1],
- &x->e_mbd.block[i], &x->e_mbd.block[i+1]);
-
- if(has_2nd_order)
- x->quantize_b(&x->block[24], &x->e_mbd.block[24]);
-}
-
-void vp8_quantize_mb_neon(MACROBLOCK *x)
-{
- int i;
- int has_2nd_order=(x->e_mbd.mode_info_context->mbmi.mode != B_PRED
- && x->e_mbd.mode_info_context->mbmi.mode != SPLITMV);
-
- for (i = 0; i < 24; i+=2)
- x->quantize_b_pair(&x->block[i], &x->block[i+1],
- &x->e_mbd.block[i], &x->e_mbd.block[i+1]);
-
- if (has_2nd_order)
- x->quantize_b(&x->block[24], &x->e_mbd.block[24]);
-}
-
-
-void vp8_quantize_mbuv_neon(MACROBLOCK *x)
-{
- int i;
-
- for (i = 16; i < 24; i+=2)
- x->quantize_b_pair(&x->block[i], &x->block[i+1],
- &x->e_mbd.block[i], &x->e_mbd.block[i+1]);
-}
-
-#endif /* HAVE_NEON */
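
With the pair variants gone (the block.h and encodeframe.c hunks below also drop the quantize_b_pair pointer), blocks are presumably quantized one at a time through the remaining quantize_b pointer, as the generic helpers in vp8/encoder/quantize.c do. A hedged sketch of that shape, mirroring the loop bounds of the deleted functions above:

    /* Sketch only; the shipped helpers live in vp8/encoder/quantize.c. */
    void quantize_mby_sketch(MACROBLOCK *x) {
      int i;
      const int has_2nd_order =
          (x->e_mbd.mode_info_context->mbmi.mode != B_PRED &&
           x->e_mbd.mode_info_context->mbmi.mode != SPLITMV);

      for (i = 0; i < 16; i++)               /* 16 luma blocks */
        x->quantize_b(&x->block[i], &x->e_mbd.block[i]);

      if (has_2nd_order)                     /* Y2 (second order) block */
        x->quantize_b(&x->block[24], &x->e_mbd.block[24]);
    }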
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/bitstream.c b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/bitstream.c
index 9d0e69cf44c..ea279b32181 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/bitstream.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/bitstream.c
@@ -159,7 +159,7 @@ static void write_split(vp8_writer *bc, int x)
);
}
-void vp8_pack_tokens_c(vp8_writer *w, const TOKENEXTRA *p, int xcount)
+void vp8_pack_tokens(vp8_writer *w, const TOKENEXTRA *p, int xcount)
{
const TOKENEXTRA *stop = p + xcount;
unsigned int split;
@@ -374,7 +374,7 @@ static void write_partition_size(unsigned char *cx_data, int size)
}
-static void pack_tokens_into_partitions_c(VP8_COMP *cpi, unsigned char *cx_data,
+static void pack_tokens_into_partitions(VP8_COMP *cpi, unsigned char *cx_data,
unsigned char * cx_data_end,
int num_part)
{
@@ -398,7 +398,7 @@ static void pack_tokens_into_partitions_c(VP8_COMP *cpi, unsigned char *cx_data,
const TOKENEXTRA *stop = cpi->tplist[mb_row].stop;
int tokens = (int)(stop - p);
- vp8_pack_tokens_c(w, p, tokens);
+ vp8_pack_tokens(w, p, tokens);
}
vp8_stop_encode(w);
@@ -407,7 +407,7 @@ static void pack_tokens_into_partitions_c(VP8_COMP *cpi, unsigned char *cx_data,
}
-static void pack_mb_row_tokens_c(VP8_COMP *cpi, vp8_writer *w)
+static void pack_mb_row_tokens(VP8_COMP *cpi, vp8_writer *w)
{
int mb_row;
@@ -417,7 +417,7 @@ static void pack_mb_row_tokens_c(VP8_COMP *cpi, vp8_writer *w)
const TOKENEXTRA *stop = cpi->tplist[mb_row].stop;
int tokens = (int)(stop - p);
- vp8_pack_tokens_c(w, p, tokens);
+ vp8_pack_tokens(w, p, tokens);
}
}
@@ -1543,7 +1543,7 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, unsigned char * dest
if (pc->refresh_entropy_probs == 0)
{
/* save a copy for later refresh */
- vpx_memcpy(&cpi->common.lfc, &cpi->common.fc, sizeof(cpi->common.fc));
+ memcpy(&cpi->common.lfc, &cpi->common.fc, sizeof(cpi->common.fc));
}
vp8_update_coef_probs(cpi);
@@ -1620,7 +1620,7 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, unsigned char * dest
/* concatenate partition buffers */
for(i = 0; i < num_part; i++)
{
- vpx_memmove(dp, cpi->partition_d[i+1], cpi->partition_sz[i+1]);
+ memmove(dp, cpi->partition_d[i+1], cpi->partition_sz[i+1]);
cpi->partition_d[i+1] = dp;
dp += cpi->partition_sz[i+1];
}
@@ -1676,7 +1676,7 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, unsigned char * dest
pack_mb_row_tokens(cpi, &cpi->bc[1]);
else
#endif
- pack_tokens(&cpi->bc[1], cpi->tok, cpi->tok_count);
+ vp8_pack_tokens(&cpi->bc[1], cpi->tok, cpi->tok_count);
vp8_stop_encode(&cpi->bc[1]);
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/bitstream.h b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/bitstream.h
index 66f4bf67ee4..de69805513c 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/bitstream.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/bitstream.h
@@ -16,36 +16,7 @@
extern "C" {
#endif
-#if HAVE_EDSP
-void vp8cx_pack_tokens_armv5(vp8_writer *w, const TOKENEXTRA *p, int xcount,
- vp8_token *,
- const vp8_extra_bit_struct *,
- const vp8_tree_index *);
-void vp8cx_pack_tokens_into_partitions_armv5(VP8_COMP *,
- unsigned char * cx_data,
- const unsigned char *cx_data_end,
- int num_parts,
- vp8_token *,
- const vp8_extra_bit_struct *,
- const vp8_tree_index *);
-void vp8cx_pack_mb_row_tokens_armv5(VP8_COMP *cpi, vp8_writer *w,
- vp8_token *,
- const vp8_extra_bit_struct *,
- const vp8_tree_index *);
-# define pack_tokens(a,b,c) \
- vp8cx_pack_tokens_armv5(a,b,c,vp8_coef_encodings,vp8_extra_bits,vp8_coef_tree)
-# define pack_tokens_into_partitions(a,b,c,d) \
- vp8cx_pack_tokens_into_partitions_armv5(a,b,c,d,vp8_coef_encodings,vp8_extra_bits,vp8_coef_tree)
-# define pack_mb_row_tokens(a,b) \
- vp8cx_pack_mb_row_tokens_armv5(a,b,vp8_coef_encodings,vp8_extra_bits,vp8_coef_tree)
-#else
-
-void vp8_pack_tokens_c(vp8_writer *w, const TOKENEXTRA *p, int xcount);
-
-# define pack_tokens(a,b,c) vp8_pack_tokens_c(a,b,c)
-# define pack_tokens_into_partitions(a,b,c,d) pack_tokens_into_partitions_c(a,b,c,d)
-# define pack_mb_row_tokens(a,b) pack_mb_row_tokens_c(a,b)
-#endif
+void vp8_pack_tokens(vp8_writer *w, const TOKENEXTRA *p, int xcount);
#ifdef __cplusplus
} // extern "C"
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/block.h b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/block.h
index 1f212cae8be..248e79549bf 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/block.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/block.h
@@ -125,6 +125,8 @@ typedef struct macroblock
int optimize;
int q_index;
+ int is_skin;
+ int denoise_zeromv;
#if CONFIG_TEMPORAL_DENOISING
int increase_denoising;
@@ -160,8 +162,9 @@ typedef struct macroblock
void (*short_fdct8x4)(short *input, short *output, int pitch);
void (*short_walsh4x4)(short *input, short *output, int pitch);
void (*quantize_b)(BLOCK *b, BLOCKD *d);
- void (*quantize_b_pair)(BLOCK *b1, BLOCK *b2, BLOCKD *d0, BLOCKD *d1);
+ unsigned int mbs_zero_last_dot_suppress;
+ int zero_last_dot_suppress;
} MACROBLOCK;
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/denoising.c b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/denoising.c
index 75b2a3be456..d197f8f8166 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/denoising.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/denoising.c
@@ -374,7 +374,7 @@ void vp8_denoiser_set_parameters(VP8_DENOISER *denoiser, int mode) {
} else if (mode == 3) {
denoiser->denoiser_mode = kDenoiserOnYUVAggressive;
} else {
- denoiser->denoiser_mode = kDenoiserOnAdaptive;
+ denoiser->denoiser_mode = kDenoiserOnYUV;
}
if (denoiser->denoiser_mode != kDenoiserOnYUVAggressive) {
denoiser->denoise_pars.scale_sse_thresh = 1;
@@ -391,9 +391,9 @@ void vp8_denoiser_set_parameters(VP8_DENOISER *denoiser, int mode) {
denoiser->denoise_pars.scale_increase_filter = 1;
denoiser->denoise_pars.denoise_mv_bias = 60;
denoiser->denoise_pars.pickmode_mv_bias = 75;
- denoiser->denoise_pars.qp_thresh = 85;
+ denoiser->denoise_pars.qp_thresh = 80;
denoiser->denoise_pars.consec_zerolast = 15;
- denoiser->denoise_pars.spatial_blur = 20;
+ denoiser->denoise_pars.spatial_blur = 0;
}
}
@@ -415,8 +415,8 @@ int vp8_denoiser_allocate(VP8_DENOISER *denoiser, int width, int height,
vp8_denoiser_free(denoiser);
return 1;
}
- vpx_memset(denoiser->yv12_running_avg[i].buffer_alloc, 0,
- denoiser->yv12_running_avg[i].frame_size);
+ memset(denoiser->yv12_running_avg[i].buffer_alloc, 0,
+ denoiser->yv12_running_avg[i].frame_size);
}
denoiser->yv12_mc_running_avg.flags = 0;
@@ -428,19 +428,19 @@ int vp8_denoiser_allocate(VP8_DENOISER *denoiser, int width, int height,
return 1;
}
- vpx_memset(denoiser->yv12_mc_running_avg.buffer_alloc, 0,
- denoiser->yv12_mc_running_avg.frame_size);
+ memset(denoiser->yv12_mc_running_avg.buffer_alloc, 0,
+ denoiser->yv12_mc_running_avg.frame_size);
if (vp8_yv12_alloc_frame_buffer(&denoiser->yv12_last_source, width,
height, VP8BORDERINPIXELS) < 0) {
vp8_denoiser_free(denoiser);
return 1;
}
- vpx_memset(denoiser->yv12_last_source.buffer_alloc, 0,
- denoiser->yv12_last_source.frame_size);
+ memset(denoiser->yv12_last_source.buffer_alloc, 0,
+ denoiser->yv12_last_source.frame_size);
denoiser->denoise_state = vpx_calloc((num_mb_rows * num_mb_cols), 1);
- vpx_memset(denoiser->denoise_state, 0, (num_mb_rows * num_mb_cols));
+ memset(denoiser->denoise_state, 0, (num_mb_rows * num_mb_cols));
vp8_denoiser_set_parameters(denoiser, mode);
denoiser->nmse_source_diff = 0;
denoiser->nmse_source_diff_count = 0;
@@ -453,16 +453,16 @@ int vp8_denoiser_allocate(VP8_DENOISER *denoiser, int width, int height,
// Bitrate thresholds and noise metric (nmse) thresholds for switching to
// aggressive mode.
// TODO(marpan): Adjust thresholds, including effect on resolution.
- denoiser->bitrate_threshold = 300000; // (bits/sec).
- denoiser->threshold_aggressive_mode = 35;
+ denoiser->bitrate_threshold = 400000; // (bits/sec).
+ denoiser->threshold_aggressive_mode = 80;
if (width * height > 1280 * 720) {
- denoiser->bitrate_threshold = 2000000;
- denoiser->threshold_aggressive_mode = 1400;
+ denoiser->bitrate_threshold = 3000000;
+ denoiser->threshold_aggressive_mode = 200;
} else if (width * height > 960 * 540) {
- denoiser->bitrate_threshold = 800000;
- denoiser->threshold_aggressive_mode = 150;
+ denoiser->bitrate_threshold = 1200000;
+ denoiser->threshold_aggressive_mode = 120;
} else if (width * height > 640 * 480) {
- denoiser->bitrate_threshold = 500000;
+ denoiser->bitrate_threshold = 600000;
denoiser->threshold_aggressive_mode = 100;
}
return 0;
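
The retuned gates above, summarized by frame size (how these fields drive the switch to aggressive mode is not part of this hunk):

    /*   frame size     bitrate_threshold (bps)   threshold_aggressive_mode
     *   <= 640x480     400000 (default)          80 (default)
     *   >  640x480     600000                    100
     *   >  960x540     1200000                   120
     *   >  1280x720    3000000                   200
     */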
@@ -483,7 +483,6 @@ void vp8_denoiser_free(VP8_DENOISER *denoiser)
vpx_free(denoiser->denoise_state);
}
-
void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser,
MACROBLOCK *x,
unsigned int best_sse,
@@ -554,6 +553,7 @@ void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser,
* Note that any changes to the mode info only affect the
* denoising.
*/
+ x->denoise_zeromv = 1;
mbmi->ref_frame =
x->best_zeromv_reference_frame;
@@ -603,6 +603,12 @@ void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser,
motion_threshold = denoiser->denoise_pars.scale_motion_thresh *
NOISE_MOTION_THRESHOLD;
+ // If the block is considered to be a skin area, lower the motion
+ // threshold. In the current version the threshold is set to 1, so only
+ // very low (i.e., zero) mv on skin is denoised.
+ if (x->is_skin)
+ motion_threshold = 1;
+
if (motion_magnitude2 <
denoiser->denoise_pars.scale_increase_filter * NOISE_MOTION_THRESHOLD)
x->increase_denoising = 1;
@@ -662,6 +668,7 @@ void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser,
/* No filtering of this block; it differs too much from the predictor,
* or the motion vector magnitude is considered too big.
*/
+ x->denoise_zeromv = 0;
vp8_copy_mem16x16(
x->thismb, 16,
denoiser->yv12_running_avg[INTRA_FRAME].y_buffer + recon_yoffset,
@@ -692,7 +699,7 @@ void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser,
int uv_stride =denoiser->yv12_running_avg[INTRA_FRAME].uv_stride;
// Fix filter level to some nominal value for now.
- int filter_level = 32;
+ int filter_level = 48;
int hev_index = lfi_n->hev_thr_lut[INTER_FRAME][filter_level];
lfi.mblim = lfi_n->mblim[filter_level];
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/denoising.h b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/denoising.h
index 6c1f9e22baa..9a379a6a168 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/denoising.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/denoising.h
@@ -19,7 +19,7 @@ extern "C" {
#endif
#define SUM_DIFF_THRESHOLD (16 * 16 * 2)
-#define SUM_DIFF_THRESHOLD_HIGH (600)
+#define SUM_DIFF_THRESHOLD_HIGH (600) // ~(16 * 16 * 2.3)
#define MOTION_MAGNITUDE_THRESHOLD (8*3)
#define SUM_DIFF_THRESHOLD_UV (96) // (8 * 8 * 1.5)
@@ -27,7 +27,7 @@ extern "C" {
#define SUM_DIFF_FROM_AVG_THRESH_UV (8 * 8 * 8)
#define MOTION_MAGNITUDE_THRESHOLD_UV (8*3)
-#define MAX_GF_ARF_DENOISE_RANGE (16)
+#define MAX_GF_ARF_DENOISE_RANGE (8)
enum vp8_denoiser_decision
{
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/encodeframe.c b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/encodeframe.c
index aec6b9880c1..378e902c6a4 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/encodeframe.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/encodeframe.c
@@ -82,6 +82,7 @@ static unsigned int tt_activity_measure( VP8_COMP *cpi, MACROBLOCK *x )
{
unsigned int act;
unsigned int sse;
+ (void)cpi;
/* TODO: This could also be done over smaller areas (8x8), but that would
* require extensive changes elsewhere, as lambda is assumed to be fixed
* over an entire MB in most of the code.
@@ -154,8 +155,8 @@ static void calc_av_activity( VP8_COMP *cpi, int64_t activity_sum )
cpi->common.MBs));
/* Copy map to sort list */
- vpx_memcpy( sortlist, cpi->mb_activity_map,
- sizeof(unsigned int) * cpi->common.MBs );
+ memcpy( sortlist, cpi->mb_activity_map,
+ sizeof(unsigned int) * cpi->common.MBs );
/* Ripple each value down to its correct position */
@@ -522,7 +523,8 @@ void encode_mb_row(VP8_COMP *cpi,
}
#endif
- // Keep track of how many (consecutive) times a block is coded
+
+ // Keep track of how many (consecutive) times a block is coded
// as ZEROMV_LASTREF, for base layer frames.
// Reset to 0 if it's coded as anything else.
if (cpi->current_layer == 0) {
@@ -531,9 +533,14 @@ void encode_mb_row(VP8_COMP *cpi,
// Increment, check for wrap-around.
if (cpi->consec_zero_last[map_index+mb_col] < 255)
cpi->consec_zero_last[map_index+mb_col] += 1;
+ if (cpi->consec_zero_last_mvbias[map_index+mb_col] < 255)
+ cpi->consec_zero_last_mvbias[map_index+mb_col] += 1;
} else {
cpi->consec_zero_last[map_index+mb_col] = 0;
+ cpi->consec_zero_last_mvbias[map_index+mb_col] = 0;
}
+ if (x->zero_last_dot_suppress)
+ cpi->consec_zero_last_mvbias[map_index+mb_col] = 0;
}
/* Special case code for cyclic refresh
@@ -574,7 +581,7 @@ void encode_mb_row(VP8_COMP *cpi,
/* pack tokens for this MB */
{
int tok_count = *tp - tp_start;
- pack_tokens(w, tp_start, tok_count);
+ vp8_pack_tokens(w, tp_start, tok_count);
}
#endif
/* Increment pointer into gf usage flags structure. */
@@ -658,8 +665,7 @@ static void init_encode_frame_mb_context(VP8_COMP *cpi)
x->mvc = cm->fc.mvc;
- vpx_memset(cm->above_context, 0,
- sizeof(ENTROPY_CONTEXT_PLANES) * cm->mb_cols);
+ memset(cm->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES) * cm->mb_cols);
/* Special case treatment when GF and ARF are not sensible options
* for reference
@@ -737,7 +743,7 @@ void vp8_encode_frame(VP8_COMP *cpi)
const int num_part = (1 << cm->multi_token_partition);
#endif
- vpx_memset(segment_counts, 0, sizeof(segment_counts));
+ memset(segment_counts, 0, sizeof(segment_counts));
totalrate = 0;
if (cpi->compressor_speed == 2)
@@ -967,7 +973,7 @@ void vp8_encode_frame(VP8_COMP *cpi)
int i;
/* Set to defaults */
- vpx_memset(xd->mb_segment_tree_probs, 255 , sizeof(xd->mb_segment_tree_probs));
+ memset(xd->mb_segment_tree_probs, 255 , sizeof(xd->mb_segment_tree_probs));
tot_count = segment_counts[0] + segment_counts[1] + segment_counts[2] + segment_counts[3];
@@ -1143,6 +1149,8 @@ static void sum_intra_stats(VP8_COMP *cpi, MACROBLOCK *x)
while (++b < 16);
}
+#else
+ (void)cpi;
#endif
++x->ymode_count[m];
@@ -1252,7 +1260,6 @@ int vp8cx_encode_inter_macroblock
if(cpi->sf.use_fastquant_for_pick)
{
x->quantize_b = vp8_fast_quantize_b;
- x->quantize_b_pair = vp8_fast_quantize_b_pair;
/* the fast quantizer does not use zbin_extra, so
* do not recalculate */
@@ -1265,7 +1272,6 @@ int vp8cx_encode_inter_macroblock
if (cpi->sf.improved_quant)
{
x->quantize_b = vp8_regular_quantize_b;
- x->quantize_b_pair = vp8_regular_quantize_b_pair;
}
/* restore cpi->zbin_mode_boost_enabled */
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/encodemb.c b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/encodemb.c
index eb0619d9597..dfd0a237a5f 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/encodemb.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/encodemb.c
@@ -506,8 +506,8 @@ static void optimize_mb(MACROBLOCK *x)
ENTROPY_CONTEXT *ta;
ENTROPY_CONTEXT *tl;
- vpx_memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
- vpx_memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+ memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
+ memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
ta = (ENTROPY_CONTEXT *)&t_above;
tl = (ENTROPY_CONTEXT *)&t_left;
@@ -555,8 +555,8 @@ void vp8_optimize_mby(MACROBLOCK *x)
if (!x->e_mbd.left_context)
return;
- vpx_memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
- vpx_memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+ memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
+ memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
ta = (ENTROPY_CONTEXT *)&t_above;
tl = (ENTROPY_CONTEXT *)&t_left;
@@ -595,8 +595,8 @@ void vp8_optimize_mbuv(MACROBLOCK *x)
if (!x->e_mbd.left_context)
return;
- vpx_memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
- vpx_memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+ memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
+ memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
ta = (ENTROPY_CONTEXT *)&t_above;
tl = (ENTROPY_CONTEXT *)&t_left;
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/ethreading.c b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/ethreading.c
index 7b8b51f308b..977b0b0321e 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/ethreading.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/ethreading.c
@@ -215,11 +215,15 @@ THREAD_FUNCTION thread_encoding_proc(void *p_data)
LAST_FRAME) {
// Increment, check for wrap-around.
if (cpi->consec_zero_last[map_index+mb_col] < 255)
- cpi->consec_zero_last[map_index+mb_col] +=
- 1;
+ cpi->consec_zero_last[map_index+mb_col] += 1;
+ if (cpi->consec_zero_last_mvbias[map_index+mb_col] < 255)
+ cpi->consec_zero_last_mvbias[map_index+mb_col] += 1;
} else {
cpi->consec_zero_last[map_index+mb_col] = 0;
+ cpi->consec_zero_last_mvbias[map_index+mb_col] = 0;
}
+ if (x->zero_last_dot_suppress)
+ cpi->consec_zero_last_mvbias[map_index+mb_col] = 0;
}
/* Special case code for cyclic refresh
@@ -261,7 +265,7 @@ THREAD_FUNCTION thread_encoding_proc(void *p_data)
/* pack tokens for this MB */
{
int tok_count = tp - tp_start;
- pack_tokens(w, tp_start, tok_count);
+ vp8_pack_tokens(w, tp_start, tok_count);
}
#else
cpi->tplist[mb_row].stop = tp;
@@ -346,7 +350,6 @@ static void setup_mbby_copy(MACROBLOCK *mbdst, MACROBLOCK *mbsrc)
z->short_fdct8x4 = x->short_fdct8x4;
z->short_walsh4x4 = x->short_walsh4x4;
z->quantize_b = x->quantize_b;
- z->quantize_b_pair = x->quantize_b_pair;
z->optimize = x->optimize;
/*
@@ -413,14 +416,13 @@ static void setup_mbby_copy(MACROBLOCK *mbdst, MACROBLOCK *mbsrc)
zd->subpixel_predict16x16 = xd->subpixel_predict16x16;
zd->segmentation_enabled = xd->segmentation_enabled;
zd->mb_segement_abs_delta = xd->mb_segement_abs_delta;
- vpx_memcpy(zd->segment_feature_data, xd->segment_feature_data,
- sizeof(xd->segment_feature_data));
+ memcpy(zd->segment_feature_data, xd->segment_feature_data,
+ sizeof(xd->segment_feature_data));
- vpx_memcpy(zd->dequant_y1_dc, xd->dequant_y1_dc,
- sizeof(xd->dequant_y1_dc));
- vpx_memcpy(zd->dequant_y1, xd->dequant_y1, sizeof(xd->dequant_y1));
- vpx_memcpy(zd->dequant_y2, xd->dequant_y2, sizeof(xd->dequant_y2));
- vpx_memcpy(zd->dequant_uv, xd->dequant_uv, sizeof(xd->dequant_uv));
+ memcpy(zd->dequant_y1_dc, xd->dequant_y1_dc, sizeof(xd->dequant_y1_dc));
+ memcpy(zd->dequant_y1, xd->dequant_y1, sizeof(xd->dequant_y1));
+ memcpy(zd->dequant_y2, xd->dequant_y2, sizeof(xd->dequant_y2));
+ memcpy(zd->dequant_uv, xd->dequant_uv, sizeof(xd->dequant_uv));
#if 1
/*TODO: Remove dequant from BLOCKD. This is a temporary solution until
@@ -435,15 +437,14 @@ static void setup_mbby_copy(MACROBLOCK *mbdst, MACROBLOCK *mbsrc)
#endif
- vpx_memcpy(z->rd_threshes, x->rd_threshes, sizeof(x->rd_threshes));
- vpx_memcpy(z->rd_thresh_mult, x->rd_thresh_mult,
- sizeof(x->rd_thresh_mult));
+ memcpy(z->rd_threshes, x->rd_threshes, sizeof(x->rd_threshes));
+ memcpy(z->rd_thresh_mult, x->rd_thresh_mult, sizeof(x->rd_thresh_mult));
z->zbin_over_quant = x->zbin_over_quant;
z->zbin_mode_boost_enabled = x->zbin_mode_boost_enabled;
z->zbin_mode_boost = x->zbin_mode_boost;
- vpx_memset(z->error_bins, 0, sizeof(z->error_bins));
+ memset(z->error_bins, 0, sizeof(z->error_bins));
}
}
@@ -469,7 +470,7 @@ void vp8cx_init_mbrthread_data(VP8_COMP *cpi,
mbd->subpixel_predict16x16 = xd->subpixel_predict16x16;
mb->gf_active_ptr = x->gf_active_ptr;
- vpx_memset(mbr_ei[i].segment_counts, 0, sizeof(mbr_ei[i].segment_counts));
+ memset(mbr_ei[i].segment_counts, 0, sizeof(mbr_ei[i].segment_counts));
mbr_ei[i].totalrate = 0;
mb->partition_info = x->pi + x->e_mbd.mode_info_stride * (i + 1);
@@ -506,6 +507,7 @@ void vp8cx_init_mbrthread_data(VP8_COMP *cpi,
mb->intra_error = 0;
vp8_zero(mb->count_mb_ref_frame_usage);
mb->mbs_tested_so_far = 0;
+ mb->mbs_zero_last_dot_suppress = 0;
}
}
@@ -543,7 +545,7 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi)
vpx_malloc(sizeof(sem_t) * th_count));
CHECK_MEM_ERROR(cpi->mb_row_ei,
vpx_memalign(32, sizeof(MB_ROW_COMP) * th_count));
- vpx_memset(cpi->mb_row_ei, 0, sizeof(MB_ROW_COMP) * th_count);
+ memset(cpi->mb_row_ei, 0, sizeof(MB_ROW_COMP) * th_count);
CHECK_MEM_ERROR(cpi->en_thread_data,
vpx_malloc(sizeof(ENCODETHREAD_DATA) * th_count));
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/firstpass.c b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/firstpass.c
index 98e5a7115db..75c1362610f 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/firstpass.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/firstpass.c
@@ -132,6 +132,7 @@ static void output_stats(const VP8_COMP *cpi,
FIRSTPASS_STATS *stats)
{
struct vpx_codec_cx_pkt pkt;
+ (void)cpi;
pkt.kind = VPX_CODEC_STATS_PKT;
pkt.data.twopass_stats.buf = stats;
pkt.data.twopass_stats.sz = sizeof(FIRSTPASS_STATS);
@@ -418,6 +419,7 @@ static void zz_motion_search( VP8_COMP *cpi, MACROBLOCK * x,
int raw_stride = raw_buffer->y_stride;
unsigned char *ref_ptr;
int ref_stride = x->e_mbd.pre.y_stride;
+ (void)cpi;
/* Set up pointers for this macro block raw buffer */
raw_ptr = (unsigned char *)(raw_buffer->y_buffer + recon_yoffset
@@ -571,7 +573,7 @@ void vp8_first_pass(VP8_COMP *cpi)
{
int flag[2] = {1, 1};
vp8_initialize_rd_consts(cpi, x, vp8_dc_quant(cm->base_qindex, cm->y1dc_delta_q));
- vpx_memcpy(cm->fc.mvc, vp8_default_mv_context, sizeof(vp8_default_mv_context));
+ memcpy(cm->fc.mvc, vp8_default_mv_context, sizeof(vp8_default_mv_context));
vp8_build_component_cost_table(cpi->mb.mvcost, (const MV_CONTEXT *) cm->fc.mvc, flag);
}
@@ -1409,6 +1411,7 @@ void vp8_init_second_pass(VP8_COMP *cpi)
void vp8_end_second_pass(VP8_COMP *cpi)
{
+ (void)cpi;
}
/* This function gives an estimate of how badly we believe the prediction
@@ -1419,6 +1422,7 @@ static double get_prediction_decay_rate(VP8_COMP *cpi, FIRSTPASS_STATS *next_fra
double prediction_decay_rate;
double motion_decay;
double motion_pct = next_frame->pcnt_motion;
+ (void)cpi;
/* Initial basis is the % mbs inter coded */
prediction_decay_rate = next_frame->pcnt_inter;
@@ -1547,6 +1551,7 @@ static void accumulate_frame_motion_stats(
double this_frame_mvr_ratio;
double this_frame_mvc_ratio;
double motion_pct;
+ (void)cpi;
/* Accumulate motion stats. */
motion_pct = this_frame->pcnt_motion;
@@ -1774,7 +1779,7 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
start_pos = cpi->twopass.stats_in;
- vpx_memset(&next_frame, 0, sizeof(next_frame)); /* assure clean */
+ memset(&next_frame, 0, sizeof(next_frame)); /* assure clean */
/* Load stats for the current frame. */
mod_frame_err = calculate_modified_err(cpi, this_frame);
@@ -1870,7 +1875,7 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
break;
}
- vpx_memcpy(this_frame, &next_frame, sizeof(*this_frame));
+ memcpy(this_frame, &next_frame, sizeof(*this_frame));
old_boost_score = boost_score;
}
@@ -2440,7 +2445,7 @@ void vp8_second_pass(VP8_COMP *cpi)
if (cpi->twopass.frames_to_key == 0)
{
/* Define next KF group and assign bits to it */
- vpx_memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));
+ memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));
find_next_key_frame(cpi, &this_frame_copy);
/* Special case: error_resilient_mode does not make much
@@ -2466,7 +2471,7 @@ void vp8_second_pass(VP8_COMP *cpi)
if (cpi->frames_till_gf_update_due == 0)
{
/* Define next gf group and assign bits to it */
- vpx_memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));
+ memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));
define_gf_group(cpi, &this_frame_copy);
/* If we are going to code an altref frame at the end of the group
@@ -2482,7 +2487,7 @@ void vp8_second_pass(VP8_COMP *cpi)
* to the GF group
*/
int bak = cpi->per_frame_bandwidth;
- vpx_memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));
+ memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));
assign_std_frame_bits(cpi, &this_frame_copy);
cpi->per_frame_bandwidth = bak;
}
@@ -2505,14 +2510,14 @@ void vp8_second_pass(VP8_COMP *cpi)
if (cpi->common.frame_type != KEY_FRAME)
{
/* Assign bits from those allocated to the GF group */
- vpx_memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));
+ memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));
assign_std_frame_bits(cpi, &this_frame_copy);
}
}
else
{
/* Assign bits from those allocated to the GF group */
- vpx_memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));
+ memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));
assign_std_frame_bits(cpi, &this_frame_copy);
}
}
@@ -2653,7 +2658,7 @@ static int test_candidate_kf(VP8_COMP *cpi, FIRSTPASS_STATS *last_frame, FIRSTP
double decay_accumulator = 1.0;
double next_iiratio;
- vpx_memcpy(&local_next_frame, next_frame, sizeof(*next_frame));
+ memcpy(&local_next_frame, next_frame, sizeof(*next_frame));
/* Note the starting file position so we can reset to it */
start_pos = cpi->twopass.stats_in;
@@ -2730,7 +2735,7 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
double kf_group_coded_err = 0.0;
double recent_loop_decay[8] = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0};
- vpx_memset(&next_frame, 0, sizeof(next_frame));
+ memset(&next_frame, 0, sizeof(next_frame));
vp8_clear_system_state();
start_position = cpi->twopass.stats_in;
@@ -2751,7 +2756,7 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
cpi->twopass.frames_to_key = 1;
/* Take a copy of the initial frame details */
- vpx_memcpy(&first_frame, this_frame, sizeof(*this_frame));
+ memcpy(&first_frame, this_frame, sizeof(*this_frame));
cpi->twopass.kf_group_bits = 0;
cpi->twopass.kf_group_error_left = 0;
@@ -2774,7 +2779,7 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
kf_group_coded_err += this_frame->coded_error;
/* Load the next frame's stats. */
- vpx_memcpy(&last_frame, this_frame, sizeof(*this_frame));
+ memcpy(&last_frame, this_frame, sizeof(*this_frame));
input_stats(cpi, this_frame);
/* Provided that we are not at the end of the file... */
@@ -2842,7 +2847,7 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
cpi->twopass.frames_to_key /= 2;
/* Copy first frame details */
- vpx_memcpy(&tmp_frame, &first_frame, sizeof(first_frame));
+ memcpy(&tmp_frame, &first_frame, sizeof(first_frame));
/* Reset to the start of the group */
reset_fpf_position(cpi, start_position);
@@ -2964,7 +2969,6 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
*/
decay_accumulator = 1.0;
boost_score = 0.0;
- loop_decay_rate = 1.00; /* Starting decay rate */
for (i = 0 ; i < cpi->twopass.frames_to_key ; i++)
{
@@ -3208,7 +3212,7 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
int new_width = cpi->oxcf.Width;
int new_height = cpi->oxcf.Height;
- int projected_buffer_level = (int)cpi->buffer_level;
+ int projected_buffer_level;
int tmp_q;
double projected_bits_perframe;
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/mcomp.c b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/mcomp.c
index 545f2c89321..f848e8fb571 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/mcomp.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/mcomp.c
@@ -9,6 +9,8 @@
*/
+#include "./vp8_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
#include "onyx_int.h"
#include "mcomp.h"
#include "vpx_mem/vpx_mem.h"
@@ -888,6 +890,8 @@ int vp8_hex_search
fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
+ (void)mvcost;
+
/* adjust ref_mv to make sure it is within MV range */
vp8_clamp_mv(ref_mv, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max);
br = ref_mv->as_mv.row;
@@ -898,7 +902,7 @@ int vp8_hex_search
this_offset = base_offset + (br * (pre_stride)) + bc;
this_mv.as_mv.row = br;
this_mv.as_mv.col = bc;
- bestsad = vfp->sdf(what, what_stride, this_offset, in_what_stride, UINT_MAX)
+ bestsad = vfp->sdf(what, what_stride, this_offset, in_what_stride)
+ mvsad_err_cost(&this_mv, &fcenter_mv, mvsadcost, sad_per_bit);
#if CONFIG_MULTI_RES_ENCODING
@@ -911,6 +915,8 @@ int vp8_hex_search
else if (search_param >= 1) hex_range = 63;
dia_range = 8;
+#else
+ (void)search_param;
#endif
/* hex search */
@@ -923,7 +929,7 @@ int vp8_hex_search
this_mv.as_mv.row = br + hex[i].row;
this_mv.as_mv.col = bc + hex[i].col;
this_offset = base_offset + (this_mv.as_mv.row * in_what_stride) + this_mv.as_mv.col;
- thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad);
+ thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride);
CHECK_BETTER
}
}else
@@ -934,7 +940,7 @@ int vp8_hex_search
this_mv.as_mv.col = bc + hex[i].col;
CHECK_POINT
this_offset = base_offset + (this_mv.as_mv.row * in_what_stride) + this_mv.as_mv.col;
- thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad);
+ thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride);
CHECK_BETTER
}
}
@@ -960,7 +966,7 @@ int vp8_hex_search
this_mv.as_mv.row = br + next_chkpts[k][i].row;
this_mv.as_mv.col = bc + next_chkpts[k][i].col;
this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) + this_mv.as_mv.col;
- thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad);
+ thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride);
CHECK_BETTER
}
}else
@@ -971,7 +977,7 @@ int vp8_hex_search
this_mv.as_mv.col = bc + next_chkpts[k][i].col;
CHECK_POINT
this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) + this_mv.as_mv.col;
- thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad);
+ thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride);
CHECK_BETTER
}
}
@@ -1002,7 +1008,7 @@ cal_neighbors:
this_mv.as_mv.row = br + neighbors[i].row;
this_mv.as_mv.col = bc + neighbors[i].col;
this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) + this_mv.as_mv.col;
- thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad);
+ thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride);
CHECK_BETTER
}
}else
@@ -1013,7 +1019,7 @@ cal_neighbors:
this_mv.as_mv.col = bc + neighbors[i].col;
CHECK_POINT
this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) + this_mv.as_mv.col;
- thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride, bestsad);
+ thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride);
CHECK_BETTER
}
}
@@ -1097,7 +1103,7 @@ int vp8_diamond_search_sad_c
best_address = in_what;
/* Check the starting position */
- bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride, UINT_MAX)
+ bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride)
+ mvsad_err_cost(best_mv, &fcenter_mv, mvsadcost, sad_per_bit);
/* search_param determines the length of the initial step and hence
@@ -1122,7 +1128,7 @@ int vp8_diamond_search_sad_c
{
check_here = ss[i].offset + best_address;
- thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad);
+ thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride);
if (thissad < bestsad)
{
@@ -1221,7 +1227,7 @@ int vp8_diamond_search_sadx4
best_address = in_what;
/* Check the starting position */
- bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride, UINT_MAX)
+ bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride)
+ mvsad_err_cost(best_mv, &fcenter_mv, mvsadcost, sad_per_bit);
/* search_param determines the length of the initial step and hence the
@@ -1289,7 +1295,7 @@ int vp8_diamond_search_sadx4
(this_row_offset > x->mv_row_min) && (this_row_offset < x->mv_row_max))
{
check_here = ss[i].offset + best_address;
- thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad);
+ thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride);
if (thissad < bestsad)
{
@@ -1372,8 +1378,7 @@ int vp8_full_search_sad_c(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
best_mv->as_mv.col = ref_col;
/* Baseline value at the centre */
- bestsad = fn_ptr->sdf(what, what_stride, bestaddress,
- in_what_stride, UINT_MAX)
+ bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride)
+ mvsad_err_cost(best_mv, &fcenter_mv, mvsadcost, sad_per_bit);
/* Apply further limits to prevent us from searching using vectors that
@@ -1398,7 +1403,7 @@ int vp8_full_search_sad_c(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
for (c = col_min; c < col_max; c++)
{
- thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad);
+ thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride);
this_mv.as_mv.col = c;
thissad += mvsad_err_cost(&this_mv, &fcenter_mv,
@@ -1470,8 +1475,7 @@ int vp8_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
best_mv->as_mv.col = ref_col;
/* Baseline value at the centre */
- bestsad = fn_ptr->sdf(what, what_stride, bestaddress,
- in_what_stride, UINT_MAX)
+ bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride)
+ mvsad_err_cost(best_mv, &fcenter_mv, mvsadcost, sad_per_bit);
/* Apply further limits to prevent us from searching using vectors that stretch
@@ -1527,7 +1531,7 @@ int vp8_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
while (c < col_max)
{
- thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad);
+ thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride);
if (thissad < bestsad)
{
@@ -1586,7 +1590,8 @@ int vp8_full_search_sadx8(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
int col_min = ref_col - distance;
int col_max = ref_col + distance;
- DECLARE_ALIGNED_ARRAY(16, unsigned short, sad_array8, 8);
+ // TODO(johannkoenig): check if this alignment is necessary.
+ DECLARE_ALIGNED(16, unsigned int, sad_array8[8]);
unsigned int sad_array[3];
int *mvsadcost[2];
@@ -1605,8 +1610,7 @@ int vp8_full_search_sadx8(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
best_mv->as_mv.col = ref_col;
/* Baseline value at the centre */
- bestsad = fn_ptr->sdf(what, what_stride,
- bestaddress, in_what_stride, UINT_MAX)
+ bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride)
+ mvsad_err_cost(best_mv, &fcenter_mv, mvsadcost, sad_per_bit);
/* Apply further limits to prevent us from searching using vectors that stretch
@@ -1692,7 +1696,7 @@ int vp8_full_search_sadx8(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
while (c < col_max)
{
- thissad = fn_ptr->sdf(what, what_stride, check_here , in_what_stride, bestsad);
+ thissad = fn_ptr->sdf(what, what_stride, check_here , in_what_stride);
if (thissad < bestsad)
{
@@ -1750,8 +1754,7 @@ int vp8_refining_search_sad_c(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv
fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
- bestsad = fn_ptr->sdf(what, what_stride, best_address,
- in_what_stride, UINT_MAX)
+ bestsad = fn_ptr->sdf(what, what_stride, best_address, in_what_stride)
+ mvsad_err_cost(ref_mv, &fcenter_mv, mvsadcost, error_per_bit);
for (i=0; i<search_range; i++)
@@ -1767,7 +1770,7 @@ int vp8_refining_search_sad_c(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv
(this_row_offset > x->mv_row_min) && (this_row_offset < x->mv_row_max))
{
check_here = (neighbors[j].row)*in_what_stride + neighbors[j].col + best_address;
- thissad = fn_ptr->sdf(what, what_stride, check_here , in_what_stride, bestsad);
+ thissad = fn_ptr->sdf(what, what_stride, check_here , in_what_stride);
if (thissad < bestsad)
{
@@ -1830,8 +1833,7 @@ int vp8_refining_search_sadx4(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
- bestsad = fn_ptr->sdf(what, what_stride, best_address,
- in_what_stride, UINT_MAX)
+ bestsad = fn_ptr->sdf(what, what_stride, best_address, in_what_stride)
+ mvsad_err_cost(ref_mv, &fcenter_mv, mvsadcost, error_per_bit);
for (i=0; i<search_range; i++)
@@ -1882,7 +1884,7 @@ int vp8_refining_search_sadx4(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
(this_row_offset > x->mv_row_min) && (this_row_offset < x->mv_row_max))
{
check_here = (neighbors[j].row)*in_what_stride + neighbors[j].col + best_address;
- thissad = fn_ptr->sdf(what, what_stride, check_here , in_what_stride, bestsad);
+ thissad = fn_ptr->sdf(what, what_stride, check_here , in_what_stride);
if (thissad < bestsad)
{
@@ -1974,8 +1976,8 @@ void print_mode_context(void)
#ifdef VP8_ENTROPY_STATS
void init_mv_ref_counts()
{
- vpx_memset(mv_ref_ct, 0, sizeof(mv_ref_ct));
- vpx_memset(mv_mode_cts, 0, sizeof(mv_mode_cts));
+ memset(mv_ref_ct, 0, sizeof(mv_ref_ct));
+ memset(mv_mode_cts, 0, sizeof(mv_mode_cts));
}
void accum_mv_refs(MB_PREDICTION_MODE m, const int ct[4])
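
Every vfp->sdf call in the hunks above loses its trailing best-SAD bound: the SAD kernels now come from vpx_dsp (see the fn_ptr changes in onyx_if.c below) and no longer support early termination. A minimal scalar sketch of the new four-argument shape, assuming the vpx_sad16x16 signature; the shipped versions are heavily optimized:

    #include <stdlib.h>

    /* Sketch of a vpx_dsp-style SAD kernel: plain sum of absolute
     * differences over a 16x16 block, with no early-out parameter. */
    unsigned int sad16x16_sketch(const unsigned char *src, int src_stride,
                                 const unsigned char *ref, int ref_stride) {
      unsigned int sad = 0;
      int r, c;
      for (r = 0; r < 16; ++r) {
        for (c = 0; c < 16; ++c)
          sad += abs(src[c] - ref[c]);
        src += src_stride;
        ref += ref_stride;
      }
      return sad;
    }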
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/onyx_if.c b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/onyx_if.c
index d8eff669ed5..5b452312ed2 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/onyx_if.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/onyx_if.c
@@ -11,6 +11,8 @@
#include "vpx_config.h"
#include "./vpx_scale_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
+#include "./vp8_rtcd.h"
#include "vp8/common/onyxc_int.h"
#include "vp8/common/blockd.h"
#include "onyx_int.h"
@@ -427,10 +429,10 @@ static void setup_features(VP8_COMP *cpi)
cpi->mb.e_mbd.mode_ref_lf_delta_enabled = 0;
cpi->mb.e_mbd.mode_ref_lf_delta_update = 0;
- vpx_memset(cpi->mb.e_mbd.ref_lf_deltas, 0, sizeof(cpi->mb.e_mbd.ref_lf_deltas));
- vpx_memset(cpi->mb.e_mbd.mode_lf_deltas, 0, sizeof(cpi->mb.e_mbd.mode_lf_deltas));
- vpx_memset(cpi->mb.e_mbd.last_ref_lf_deltas, 0, sizeof(cpi->mb.e_mbd.ref_lf_deltas));
- vpx_memset(cpi->mb.e_mbd.last_mode_lf_deltas, 0, sizeof(cpi->mb.e_mbd.mode_lf_deltas));
+ memset(cpi->mb.e_mbd.ref_lf_deltas, 0, sizeof(cpi->mb.e_mbd.ref_lf_deltas));
+ memset(cpi->mb.e_mbd.mode_lf_deltas, 0, sizeof(cpi->mb.e_mbd.mode_lf_deltas));
+ memset(cpi->mb.e_mbd.last_ref_lf_deltas, 0, sizeof(cpi->mb.e_mbd.ref_lf_deltas));
+ memset(cpi->mb.e_mbd.last_mode_lf_deltas, 0, sizeof(cpi->mb.e_mbd.mode_lf_deltas));
set_default_lf_deltas(cpi);
@@ -507,7 +509,7 @@ static void disable_segmentation(VP8_COMP *cpi)
static void set_segmentation_map(VP8_COMP *cpi, unsigned char *segmentation_map)
{
/* Copy in the new segmentation map */
- vpx_memcpy(cpi->segmentation_map, segmentation_map, (cpi->common.mb_rows * cpi->common.mb_cols));
+ memcpy(cpi->segmentation_map, segmentation_map, (cpi->common.mb_rows * cpi->common.mb_cols));
/* Signal that the map should be updated. */
cpi->mb.e_mbd.update_mb_segmentation_map = 1;
@@ -529,7 +531,7 @@ static void set_segmentation_map(VP8_COMP *cpi, unsigned char *segmentation_map)
static void set_segment_data(VP8_COMP *cpi, signed char *feature_data, unsigned char abs_delta)
{
cpi->mb.e_mbd.mb_segement_abs_delta = abs_delta;
- vpx_memcpy(cpi->segment_feature_data, feature_data, sizeof(cpi->segment_feature_data));
+ memcpy(cpi->segment_feature_data, feature_data, sizeof(cpi->segment_feature_data));
}
@@ -579,11 +581,31 @@ static void cyclic_background_refresh(VP8_COMP *cpi, int Q, int lf_adjustment)
cpi->cyclic_refresh_q = Q / 2;
+ if (cpi->oxcf.screen_content_mode) {
+ // Modify quality ramp-up based on Q. Above some Q level, increase the
+ // number of blocks to be refreshed, and reduce it below the threshold.
+ // Turn it off under certain conditions (i.e., away from a key frame, and
+ // if we are at good quality (low Q) and most of the blocks were
+ // skip-encoded in the previous frame).
+ if (Q >= 100) {
+ cpi->cyclic_refresh_mode_max_mbs_perframe =
+ (cpi->common.mb_rows * cpi->common.mb_cols) / 10;
+ } else if (cpi->frames_since_key > 250 &&
+ Q < 20 &&
+ cpi->mb.skip_true_count > (int)(0.95 * mbs_in_frame)) {
+ cpi->cyclic_refresh_mode_max_mbs_perframe = 0;
+ } else {
+ cpi->cyclic_refresh_mode_max_mbs_perframe =
+ (cpi->common.mb_rows * cpi->common.mb_cols) / 20;
+ }
+ block_count = cpi->cyclic_refresh_mode_max_mbs_perframe;
+ }
+
// Set every macroblock to be eligible for update.
// For key frame this will reset seg map to 0.
- vpx_memset(cpi->segmentation_map, 0, mbs_in_frame);
+ memset(cpi->segmentation_map, 0, mbs_in_frame);
- if (cpi->common.frame_type != KEY_FRAME)
+ if (cpi->common.frame_type != KEY_FRAME && block_count > 0)
{
/* Cycle through the macro_block rows */
/* MB loop to set local segmentation map */
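
The screen-content branch added above picks a per-frame refresh budget from Q and recent skip statistics; in table form (mbs = mb_rows * mb_cols, checked in this order):

    /* Q >= 100                                           -> mbs / 10 (ramp up)
     * frames_since_key > 250 && Q < 20 && ~95% skipped   -> 0        (off)
     * otherwise                                          -> mbs / 20
     */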
@@ -617,15 +639,18 @@ static void cyclic_background_refresh(VP8_COMP *cpi, int Q, int lf_adjustment)
#if CONFIG_TEMPORAL_DENOISING
if (cpi->oxcf.noise_sensitivity > 0) {
if (cpi->denoiser.denoiser_mode == kDenoiserOnYUVAggressive &&
- Q < (int)cpi->denoiser.denoise_pars.qp_thresh) {
+ Q < (int)cpi->denoiser.denoise_pars.qp_thresh &&
+ (cpi->frames_since_key >
+ 2 * cpi->denoiser.denoise_pars.consec_zerolast)) {
// Under aggressive denoising, use segmentation to adjust the loop
- // filter below some qp thresh. The filter is turned off for all
+ // filter below some qp thresh. The filter is reduced for all
// blocks that have been encoded as ZEROMV LAST x frames in a row,
// where x is set by cpi->denoiser.denoise_pars.consec_zerolast.
// This is to avoid "dot" artifacts that can occur from repeated
// loop filtering on noisy input source.
cpi->cyclic_refresh_q = Q;
- lf_adjustment = -MAX_LOOP_FILTER;
+ // lf_adjustment = -MAX_LOOP_FILTER;
+ lf_adjustment = -40;
for (i = 0; i < mbs_in_frame; ++i) {
seg_map[i] = (cpi->consec_zero_last[i] >
cpi->denoiser.denoise_pars.consec_zerolast) ? 1 : 0;
@@ -662,8 +687,8 @@ static void set_default_lf_deltas(VP8_COMP *cpi)
cpi->mb.e_mbd.mode_ref_lf_delta_enabled = 1;
cpi->mb.e_mbd.mode_ref_lf_delta_update = 1;
- vpx_memset(cpi->mb.e_mbd.ref_lf_deltas, 0, sizeof(cpi->mb.e_mbd.ref_lf_deltas));
- vpx_memset(cpi->mb.e_mbd.mode_lf_deltas, 0, sizeof(cpi->mb.e_mbd.mode_lf_deltas));
+ memset(cpi->mb.e_mbd.ref_lf_deltas, 0, sizeof(cpi->mb.e_mbd.ref_lf_deltas));
+ memset(cpi->mb.e_mbd.mode_lf_deltas, 0, sizeof(cpi->mb.e_mbd.mode_lf_deltas));
/* Test of ref frame deltas */
cpi->mb.e_mbd.ref_lf_deltas[INTRA_FRAME] = 2;
@@ -786,6 +811,7 @@ void vp8_set_speed_features(VP8_COMP *cpi)
}
cpi->mb.mbs_tested_so_far = 0;
+ cpi->mb.mbs_zero_last_dot_suppress = 0;
/* best quality defaults */
sf->RD = 1;
@@ -853,6 +879,25 @@ void vp8_set_speed_features(VP8_COMP *cpi)
sf->thresh_mult[THR_SPLIT2] =
sf->thresh_mult[THR_SPLIT3] = speed_map(Speed, thresh_mult_map_split2);
+ // Special case for temporal layers.
+ // Reduce the thresholds for zero/nearest/near for GOLDEN, if GOLDEN is
+ // used as the second reference. We don't modify thresholds for the ALTREF
+ // case since ALTREF is usually used as a long-term reference in temporal
+ // layers.
+ if ((cpi->Speed <= 6) &&
+ (cpi->oxcf.number_of_layers > 1) &&
+ (cpi->ref_frame_flags & VP8_LAST_FRAME) &&
+ (cpi->ref_frame_flags & VP8_GOLD_FRAME)) {
+ if (cpi->closest_reference_frame == GOLDEN_FRAME) {
+ sf->thresh_mult[THR_ZERO2] = sf->thresh_mult[THR_ZERO2] >> 3;
+ sf->thresh_mult[THR_NEAREST2] = sf->thresh_mult[THR_NEAREST2] >> 3;
+ sf->thresh_mult[THR_NEAR2] = sf->thresh_mult[THR_NEAR2] >> 3;
+ } else {
+ sf->thresh_mult[THR_ZERO2] = sf->thresh_mult[THR_ZERO2] >> 1;
+ sf->thresh_mult[THR_NEAREST2] = sf->thresh_mult[THR_NEAREST2] >> 1;
+ sf->thresh_mult[THR_NEAR2] = sf->thresh_mult[THR_NEAR2] >> 1;
+ }
+ }
+
cpi->mode_check_freq[THR_ZERO1] =
cpi->mode_check_freq[THR_NEAREST1] =
cpi->mode_check_freq[THR_NEAR1] =
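
A worked example of the shifts added above, using a hypothetical base value (the real thresh_mult defaults are set elsewhere in vp8_set_speed_features):

    /* Hypothetical numbers, for illustration only:
     *   sf->thresh_mult[THR_ZERO2] = 4000;
     *   closest_reference_frame == GOLDEN_FRAME -> 4000 >> 3 == 500
     *   otherwise                               -> 4000 >> 1 == 2000
     * GOLDEN-predicted modes are evaluated far more eagerly when GOLDEN
     * is the nearer second reference in the temporal pattern. */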
@@ -1043,7 +1088,7 @@ void vp8_set_speed_features(VP8_COMP *cpi)
if (Speed >= 15)
sf->half_pixel_search = 0;
- vpx_memset(cpi->mb.error_bins, 0, sizeof(cpi->mb.error_bins));
+ memset(cpi->mb.error_bins, 0, sizeof(cpi->mb.error_bins));
}; /* switch */
@@ -1083,12 +1128,10 @@ void vp8_set_speed_features(VP8_COMP *cpi)
if (cpi->sf.improved_quant)
{
cpi->mb.quantize_b = vp8_regular_quantize_b;
- cpi->mb.quantize_b_pair = vp8_regular_quantize_b_pair;
}
else
{
cpi->mb.quantize_b = vp8_fast_quantize_b;
- cpi->mb.quantize_b_pair = vp8_fast_quantize_b_pair;
}
if (cpi->sf.improved_quant != last_improved_quant)
vp8cx_init_quantizer(cpi);
@@ -1256,7 +1299,7 @@ void vp8_alloc_compressor_data(VP8_COMP *cpi)
CHECK_MEM_ERROR(cpi->active_map,
vpx_calloc(cm->mb_rows * cm->mb_cols,
sizeof(*cpi->active_map)));
- vpx_memset(cpi->active_map , 1, (cm->mb_rows * cm->mb_cols));
+ memset(cpi->active_map , 1, (cm->mb_rows * cm->mb_cols));
#if CONFIG_MULTITHREAD
if (width < 640)
@@ -1363,20 +1406,31 @@ static void init_config(VP8_COMP *cpi, VP8_CONFIG *oxcf)
cm->version = oxcf->Version;
vp8_setup_version(cm);
- /* frame rate is not available on the first frame, as it's derived from
+ /* Frame rate is not available on the first frame, as it's derived from
* the observed timestamps. The actual value used here doesn't matter
- * too much, as it will adapt quickly. If the reciprocal of the timebase
- * seems like a reasonable framerate, then use that as a guess, otherwise
- * use 30.
+ * too much, as it will adapt quickly.
*/
- cpi->framerate = (double)(oxcf->timebase.den) /
- (double)(oxcf->timebase.num);
+ if (oxcf->timebase.num > 0) {
+ cpi->framerate = (double)(oxcf->timebase.den) /
+ (double)(oxcf->timebase.num);
+ } else {
+ cpi->framerate = 30;
+ }
+ /* If the reciprocal of the timebase seems like a reasonable framerate,
+ * then use that as a guess, otherwise use 30.
+ */
if (cpi->framerate > 180)
cpi->framerate = 30;
cpi->ref_framerate = cpi->framerate;
+ cpi->ref_frame_flags = VP8_ALTR_FRAME | VP8_GOLD_FRAME | VP8_LAST_FRAME;
+
+ cm->refresh_golden_frame = 0;
+ cm->refresh_last_frame = 1;
+ cm->refresh_entropy_probs = 1;
+
/* change includes all joint functionality */
vp8_change_config(cpi, oxcf);
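
The init_config hunk above adds a divide-by-zero guard and keeps the old plausibility check; isolated, the new logic looks like this (a sketch, not the shipped helper):

    /* Derive the initial framerate guess from the configured timebase,
     * falling back to 30 when the timebase is unusable or implausible. */
    static double initial_framerate_guess(int num, int den) {
      double fps = (num > 0) ? (double)den / (double)num : 30.0;
      if (fps > 180)   /* reciprocal is not a reasonable framerate */
        fps = 30.0;
      return fps;
    }
    /* e.g. {num = 1, den = 30} -> 30.0; {num = 0, den = 1000} -> 30.0 */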
@@ -1597,12 +1651,6 @@ void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf)
cpi->baseline_gf_interval =
cpi->oxcf.alt_freq ? cpi->oxcf.alt_freq : DEFAULT_GF_INTERVAL;
- cpi->ref_frame_flags = VP8_ALTR_FRAME | VP8_GOLD_FRAME | VP8_LAST_FRAME;
-
- cm->refresh_golden_frame = 0;
- cm->refresh_last_frame = 1;
- cm->refresh_entropy_probs = 1;
-
#if (CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING)
cpi->oxcf.token_partitions = 3;
#endif
@@ -1705,13 +1753,25 @@ void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf)
if (cpi->oxcf.number_of_layers != prev_number_of_layers)
{
// If the number of temporal layers is changed, we must start at the
- // base of the pattern cycle, so reset temporal_pattern_counter.
+ // base of the pattern cycle, so set the layer id to 0 and reset
+ // the temporal pattern counter.
+ if (cpi->temporal_layer_id > 0) {
+ cpi->temporal_layer_id = 0;
+ }
cpi->temporal_pattern_counter = 0;
reset_temporal_layer_change(cpi, oxcf, prev_number_of_layers);
}
+ if (!cpi->initial_width)
+ {
+ cpi->initial_width = cpi->oxcf.Width;
+ cpi->initial_height = cpi->oxcf.Height;
+ }
+
cm->Width = cpi->oxcf.Width;
cm->Height = cpi->oxcf.Height;
+ assert(cm->Width <= cpi->initial_width);
+ assert(cm->Height <= cpi->initial_height);
/* TODO(jkoleszar): if an internal spatial resampling is active,
* and we downsize the input image, maybe we should clear the
@@ -1832,7 +1892,7 @@ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf)
cm = &cpi->common;
- vpx_memset(cpi, 0, sizeof(VP8_COMP));
+ memset(cpi, 0, sizeof(VP8_COMP));
if (setjmp(cm->error.jmp))
{
@@ -1852,6 +1912,7 @@ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf)
memcpy(cpi->base_skip_false_prob, vp8cx_base_skip_false_prob, sizeof(vp8cx_base_skip_false_prob));
cpi->common.current_video_frame = 0;
cpi->temporal_pattern_counter = 0;
+ cpi->temporal_layer_id = -1;
cpi->kf_overspend_bits = 0;
cpi->kf_bitrate_adjustment = 0;
cpi->frames_till_gf_update_due = 0;
@@ -1904,6 +1965,8 @@ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf)
}
#endif
+ cpi->mse_source_denoised = 0;
+
/* Should we use the cyclic refresh method.
* Currently this is tied to error resilient mode
*/
@@ -1927,7 +1990,9 @@ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf)
cpi->cyclic_refresh_map = (signed char *) NULL;
CHECK_MEM_ERROR(cpi->consec_zero_last,
- vpx_calloc(cpi->common.mb_rows * cpi->common.mb_cols, 1));
+ vpx_calloc(cm->mb_rows * cm->mb_cols, 1));
+ CHECK_MEM_ERROR(cpi->consec_zero_last_mvbias,
+ vpx_calloc((cpi->common.mb_rows * cpi->common.mb_cols), 1));
#ifdef VP8_ENTROPY_STATS
init_context_counters();
@@ -2062,55 +2127,55 @@ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf)
}
#endif
- cpi->fn_ptr[BLOCK_16X16].sdf = vp8_sad16x16;
+ cpi->fn_ptr[BLOCK_16X16].sdf = vpx_sad16x16;
cpi->fn_ptr[BLOCK_16X16].vf = vp8_variance16x16;
cpi->fn_ptr[BLOCK_16X16].svf = vp8_sub_pixel_variance16x16;
cpi->fn_ptr[BLOCK_16X16].svf_halfpix_h = vp8_variance_halfpixvar16x16_h;
cpi->fn_ptr[BLOCK_16X16].svf_halfpix_v = vp8_variance_halfpixvar16x16_v;
cpi->fn_ptr[BLOCK_16X16].svf_halfpix_hv = vp8_variance_halfpixvar16x16_hv;
- cpi->fn_ptr[BLOCK_16X16].sdx3f = vp8_sad16x16x3;
- cpi->fn_ptr[BLOCK_16X16].sdx8f = vp8_sad16x16x8;
- cpi->fn_ptr[BLOCK_16X16].sdx4df = vp8_sad16x16x4d;
+ cpi->fn_ptr[BLOCK_16X16].sdx3f = vpx_sad16x16x3;
+ cpi->fn_ptr[BLOCK_16X16].sdx8f = vpx_sad16x16x8;
+ cpi->fn_ptr[BLOCK_16X16].sdx4df = vpx_sad16x16x4d;
- cpi->fn_ptr[BLOCK_16X8].sdf = vp8_sad16x8;
+ cpi->fn_ptr[BLOCK_16X8].sdf = vpx_sad16x8;
cpi->fn_ptr[BLOCK_16X8].vf = vp8_variance16x8;
cpi->fn_ptr[BLOCK_16X8].svf = vp8_sub_pixel_variance16x8;
cpi->fn_ptr[BLOCK_16X8].svf_halfpix_h = NULL;
cpi->fn_ptr[BLOCK_16X8].svf_halfpix_v = NULL;
cpi->fn_ptr[BLOCK_16X8].svf_halfpix_hv = NULL;
- cpi->fn_ptr[BLOCK_16X8].sdx3f = vp8_sad16x8x3;
- cpi->fn_ptr[BLOCK_16X8].sdx8f = vp8_sad16x8x8;
- cpi->fn_ptr[BLOCK_16X8].sdx4df = vp8_sad16x8x4d;
+ cpi->fn_ptr[BLOCK_16X8].sdx3f = vpx_sad16x8x3;
+ cpi->fn_ptr[BLOCK_16X8].sdx8f = vpx_sad16x8x8;
+ cpi->fn_ptr[BLOCK_16X8].sdx4df = vpx_sad16x8x4d;
- cpi->fn_ptr[BLOCK_8X16].sdf = vp8_sad8x16;
+ cpi->fn_ptr[BLOCK_8X16].sdf = vpx_sad8x16;
cpi->fn_ptr[BLOCK_8X16].vf = vp8_variance8x16;
cpi->fn_ptr[BLOCK_8X16].svf = vp8_sub_pixel_variance8x16;
cpi->fn_ptr[BLOCK_8X16].svf_halfpix_h = NULL;
cpi->fn_ptr[BLOCK_8X16].svf_halfpix_v = NULL;
cpi->fn_ptr[BLOCK_8X16].svf_halfpix_hv = NULL;
- cpi->fn_ptr[BLOCK_8X16].sdx3f = vp8_sad8x16x3;
- cpi->fn_ptr[BLOCK_8X16].sdx8f = vp8_sad8x16x8;
- cpi->fn_ptr[BLOCK_8X16].sdx4df = vp8_sad8x16x4d;
+ cpi->fn_ptr[BLOCK_8X16].sdx3f = vpx_sad8x16x3;
+ cpi->fn_ptr[BLOCK_8X16].sdx8f = vpx_sad8x16x8;
+ cpi->fn_ptr[BLOCK_8X16].sdx4df = vpx_sad8x16x4d;
- cpi->fn_ptr[BLOCK_8X8].sdf = vp8_sad8x8;
+ cpi->fn_ptr[BLOCK_8X8].sdf = vpx_sad8x8;
cpi->fn_ptr[BLOCK_8X8].vf = vp8_variance8x8;
cpi->fn_ptr[BLOCK_8X8].svf = vp8_sub_pixel_variance8x8;
cpi->fn_ptr[BLOCK_8X8].svf_halfpix_h = NULL;
cpi->fn_ptr[BLOCK_8X8].svf_halfpix_v = NULL;
cpi->fn_ptr[BLOCK_8X8].svf_halfpix_hv = NULL;
- cpi->fn_ptr[BLOCK_8X8].sdx3f = vp8_sad8x8x3;
- cpi->fn_ptr[BLOCK_8X8].sdx8f = vp8_sad8x8x8;
- cpi->fn_ptr[BLOCK_8X8].sdx4df = vp8_sad8x8x4d;
+ cpi->fn_ptr[BLOCK_8X8].sdx3f = vpx_sad8x8x3;
+ cpi->fn_ptr[BLOCK_8X8].sdx8f = vpx_sad8x8x8;
+ cpi->fn_ptr[BLOCK_8X8].sdx4df = vpx_sad8x8x4d;
- cpi->fn_ptr[BLOCK_4X4].sdf = vp8_sad4x4;
+ cpi->fn_ptr[BLOCK_4X4].sdf = vpx_sad4x4;
cpi->fn_ptr[BLOCK_4X4].vf = vp8_variance4x4;
cpi->fn_ptr[BLOCK_4X4].svf = vp8_sub_pixel_variance4x4;
cpi->fn_ptr[BLOCK_4X4].svf_halfpix_h = NULL;
cpi->fn_ptr[BLOCK_4X4].svf_halfpix_v = NULL;
cpi->fn_ptr[BLOCK_4X4].svf_halfpix_hv = NULL;
- cpi->fn_ptr[BLOCK_4X4].sdx3f = vp8_sad4x4x3;
- cpi->fn_ptr[BLOCK_4X4].sdx8f = vp8_sad4x4x8;
- cpi->fn_ptr[BLOCK_4X4].sdx4df = vp8_sad4x4x4d;
+ cpi->fn_ptr[BLOCK_4X4].sdx3f = vpx_sad4x4x3;
+ cpi->fn_ptr[BLOCK_4X4].sdx8f = vpx_sad4x4x8;
+ cpi->fn_ptr[BLOCK_4X4].sdx4df = vpx_sad4x4x4d;
#if ARCH_X86 || ARCH_X86_64
cpi->fn_ptr[BLOCK_16X16].copymem = vp8_copy32xn;
@@ -2206,9 +2271,6 @@ void vp8_remove_compressor(VP8_COMP **ptr)
if (cpi->b_calculate_psnr)
{
- YV12_BUFFER_CONFIG *lst_yv12 =
- &cpi->common.yv12_fb[cpi->common.lst_fb_idx];
-
if (cpi->oxcf.number_of_layers > 1)
{
int i;
@@ -2220,7 +2282,7 @@ void vp8_remove_compressor(VP8_COMP **ptr)
double dr = (double)cpi->bytes_in_layer[i] *
8.0 / 1000.0 / time_encoded;
double samples = 3.0 / 2 * cpi->frames_in_layer[i] *
- lst_yv12->y_width * lst_yv12->y_height;
+ cpi->common.Width * cpi->common.Height;
double total_psnr =
vpx_sse_to_psnr(samples, 255.0,
cpi->total_error2[i]);
@@ -2242,7 +2304,7 @@ void vp8_remove_compressor(VP8_COMP **ptr)
else
{
double samples = 3.0 / 2 * cpi->count *
- lst_yv12->y_width * lst_yv12->y_height;
+ cpi->common.Width * cpi->common.Height;
double total_psnr = vpx_sse_to_psnr(samples, 255.0,
cpi->total_sq_error);
double total_psnr2 = vpx_sse_to_psnr(samples, 255.0,
@@ -2450,6 +2512,7 @@ void vp8_remove_compressor(VP8_COMP **ptr)
vpx_free(cpi->tok);
vpx_free(cpi->cyclic_refresh_map);
vpx_free(cpi->consec_zero_last);
+ vpx_free(cpi->consec_zero_last_mvbias);
vp8_remove_common(&cpi->common);
vpx_free(cpi);
@@ -2805,7 +2868,7 @@ static void update_alt_ref_frame_stats(VP8_COMP *cpi)
}
/* Update data structure that monitors level of reference to last GF */
- vpx_memset(cpi->gf_active_flags, 1, (cm->mb_rows * cm->mb_cols));
+ memset(cpi->gf_active_flags, 1, (cm->mb_rows * cm->mb_cols));
cpi->gf_active_count = cm->mb_rows * cm->mb_cols;
/* this frame refreshes means next frames don't unless specified by user */
@@ -2854,7 +2917,7 @@ static void update_golden_frame_stats(VP8_COMP *cpi)
}
/* Update data structure that monitors level of reference to last GF */
- vpx_memset(cpi->gf_active_flags, 1, (cm->mb_rows * cm->mb_cols));
+ memset(cpi->gf_active_flags, 1, (cm->mb_rows * cm->mb_cols));
cpi->gf_active_count = cm->mb_rows * cm->mb_cols;
/* this frame refreshes means next frames don't unless specified by
@@ -3293,6 +3356,49 @@ static void update_reference_frames(VP8_COMP *cpi)
}
+static int measure_square_diff_partial(YV12_BUFFER_CONFIG *source,
+ YV12_BUFFER_CONFIG *dest,
+ VP8_COMP *cpi)
+ {
+ int i, j;
+ int Total = 0;
+ int num_blocks = 0;
+ int skip = 2;
+ int min_consec_zero_last = 10;
+ int tot_num_blocks = (source->y_height * source->y_width) >> 8;
+ unsigned char *src = source->y_buffer;
+ unsigned char *dst = dest->y_buffer;
+
+ /* Loop through the Y plane, every |skip| blocks along rows and columns,
+ * summing the square differences, and only for blocks that have used
+ * zero_last mode for at least |min_consec_zero_last| frames in a row.
+ */
+ for (i = 0; i < source->y_height; i += 16 * skip)
+ {
+ int block_index_row = (i >> 4) * cpi->common.mb_cols;
+ for (j = 0; j < source->y_width; j += 16 * skip)
+ {
+ int index = block_index_row + (j >> 4);
+ if (cpi->consec_zero_last[index] >= min_consec_zero_last) {
+ unsigned int sse;
+ Total += vp8_mse16x16(src + j,
+ source->y_stride,
+ dst + j, dest->y_stride,
+ &sse);
+ num_blocks++;
+ }
+ }
+ src += 16 * skip * source->y_stride;
+ dst += 16 * skip * dest->y_stride;
+ }
+ // Only return non-zero if we have at least ~1/16 of the blocks for the estimate.
+ if (num_blocks > (tot_num_blocks >> 4)) {
+ return (Total / num_blocks);
+ } else {
+ return 0;
+ }
+ }
+
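For scale: at 640x480, tot_num_blocks = (480 * 640) >> 8 = 1200 16x16 blocks. With skip = 2 the loops visit every other block row and column, i.e. at most 15 * 20 = 300 blocks, and a non-zero estimate is returned only when more than 1200 >> 4 = 75 of the visited blocks pass the consec_zero_last test.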
#if CONFIG_TEMPORAL_DENOISING
static void process_denoiser_mode_change(VP8_COMP *cpi) {
const VP8_COMMON *const cm = &cpi->common;
@@ -3305,12 +3411,12 @@ static void process_denoiser_mode_change(VP8_COMP *cpi) {
// Only select blocks for computing nmse that have been encoded
// as ZERO LAST min_consec_zero_last frames in a row.
// Scale with number of temporal layers.
- int min_consec_zero_last = 8 / cpi->oxcf.number_of_layers;
+ int min_consec_zero_last = 12 / cpi->oxcf.number_of_layers;
// Decision is tested for changing the denoising mode every
// num_mode_change times this function is called. Note that this
// function is called every 8 frames, so (8 * num_mode_change) is the
// number of frames over which a denoising mode change is tested.
- int num_mode_change = 15;
+ int num_mode_change = 20;
// Framerate factor, to compensate for larger mse at lower framerates.
// Use ref_framerate, which is full source framerate for temporal layers.
// TODO(marpan): Adjust this factor.
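Worked out: with num_mode_change = 20 and this function running once every 8 frames, a denoising mode switch is now considered every 8 * 20 = 160 frames (about 5.3 seconds at 30 fps), up from 8 * 15 = 120 frames.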
@@ -3322,7 +3428,12 @@ static void process_denoiser_mode_change(VP8_COMP *cpi) {
static const unsigned char const_source[16] = {
128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
128, 128, 128};
-
+ int bandwidth = (int)(cpi->target_bandwidth);
+ // For temporal layers, use full bandwidth (top layer).
+ if (cpi->oxcf.number_of_layers > 1) {
+ LAYER_CONTEXT *lc = &cpi->layer_context[cpi->oxcf.number_of_layers - 1];
+ bandwidth = (int)(lc->target_bandwidth);
+ }
// Loop through the Y plane, every skip blocks along rows and columns,
// summing the normalized mean square error, only for blocks that have
// been encoded as ZEROMV LAST at least min_consec_zero_last frames in
@@ -3334,11 +3445,6 @@ static void process_denoiser_mode_change(VP8_COMP *cpi) {
int index = block_index_row + (j >> 4);
if (cpi->consec_zero_last[index] >= min_consec_zero_last) {
unsigned int sse;
- const unsigned int mse = vp8_mse16x16(src + j,
- ystride,
- dst + j,
- ystride,
- &sse);
const unsigned int var = vp8_variance16x16(src + j,
ystride,
dst + j,
@@ -3347,14 +3453,15 @@ static void process_denoiser_mode_change(VP8_COMP *cpi) {
// Only consider this block as valid for noise measurement
// if the sum_diff average of the current and previous frame
// is small (to avoid effects from lighting change).
- if ((mse - var) < 256) {
+ if ((sse - var) < 128) {
+ unsigned int sse2;
const unsigned int act = vp8_variance16x16(src + j,
ystride,
const_source,
0,
- &sse);
+ &sse2);
if (act > 0)
- total += mse / act;
+ total += sse / act;
num_blocks++;
}
}
@@ -3370,16 +3477,17 @@ static void process_denoiser_mode_change(VP8_COMP *cpi) {
if (total > 0 &&
(num_blocks > (tot_num_blocks >> 4))) {
// Update the recursive mean square source_diff.
+ total = (total << 8) / num_blocks;
if (cpi->denoiser.nmse_source_diff_count == 0) {
// First sample in new interval.
cpi->denoiser.nmse_source_diff = total;
cpi->denoiser.qp_avg = cm->base_qindex;
} else {
// For subsequent samples, use average with weight ~1/4 for new sample.
- cpi->denoiser.nmse_source_diff = (int)((total >> 2) +
- 3 * (cpi->denoiser.nmse_source_diff >> 2));
- cpi->denoiser.qp_avg = (int)((cm->base_qindex >> 2) +
- 3 * (cpi->denoiser.qp_avg >> 2));
+ cpi->denoiser.nmse_source_diff = (int)((total +
+ 3 * cpi->denoiser.nmse_source_diff) >> 2);
+ cpi->denoiser.qp_avg = (int)((cm->base_qindex +
+ 3 * cpi->denoiser.qp_avg) >> 2);
}
cpi->denoiser.nmse_source_diff_count++;
}
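The update above is a first-order recursive (exponential) average with weight 1/4 on the new sample. A minimal sketch of the filter in isolation; note that summing before the shift, as the new code does, discards only one remainder instead of two:

    /* First-order recursive average: avg <- (new_sample + 3 * avg) / 4. */
    static int recursive_avg(int avg, int new_sample)
    {
        return (new_sample + 3 * avg) >> 2;
    }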
@@ -3391,7 +3499,7 @@ static void process_denoiser_mode_change(VP8_COMP *cpi) {
(cpi->denoiser.nmse_source_diff >
cpi->denoiser.threshold_aggressive_mode) &&
(cpi->denoiser.qp_avg < cpi->denoiser.qp_threshold_up &&
- cpi->target_bandwidth > cpi->denoiser.bitrate_threshold)) {
+ bandwidth > cpi->denoiser.bitrate_threshold)) {
vp8_denoiser_set_parameters(&cpi->denoiser, kDenoiserOnYUVAggressive);
} else {
// Check for going down: from aggressive to normal mode.
@@ -3400,7 +3508,7 @@ static void process_denoiser_mode_change(VP8_COMP *cpi) {
cpi->denoiser.threshold_aggressive_mode)) ||
((cpi->denoiser.denoiser_mode == kDenoiserOnYUVAggressive) &&
(cpi->denoiser.qp_avg > cpi->denoiser.qp_threshold_down ||
- cpi->target_bandwidth < cpi->denoiser.bitrate_threshold))) {
+ bandwidth < cpi->denoiser.bitrate_threshold))) {
vp8_denoiser_set_parameters(&cpi->denoiser, kDenoiserOnYUV);
}
}
@@ -3416,6 +3524,13 @@ void vp8_loopfilter_frame(VP8_COMP *cpi, VP8_COMMON *cm)
{
const FRAME_TYPE frame_type = cm->frame_type;
+ int update_any_ref_buffers = 1;
+ if (cpi->common.refresh_last_frame == 0 &&
+ cpi->common.refresh_golden_frame == 0 &&
+ cpi->common.refresh_alt_ref_frame == 0) {
+ update_any_ref_buffers = 0;
+ }
+
if (cm->no_lpf)
{
cm->filter_level = 0;
@@ -3427,11 +3542,36 @@ void vp8_loopfilter_frame(VP8_COMP *cpi, VP8_COMMON *cm)
vp8_clear_system_state();
vpx_usec_timer_start(&timer);
- if (cpi->sf.auto_filter == 0)
+ if (cpi->sf.auto_filter == 0) {
+#if CONFIG_TEMPORAL_DENOISING
+ if (cpi->oxcf.noise_sensitivity && cm->frame_type != KEY_FRAME) {
+ // Use the denoised buffer for selecting base loop filter level.
+ // Denoised signal for current frame is stored in INTRA_FRAME.
+ // No denoising on key frames.
+ vp8cx_pick_filter_level_fast(
+ &cpi->denoiser.yv12_running_avg[INTRA_FRAME], cpi);
+ } else {
+ vp8cx_pick_filter_level_fast(cpi->Source, cpi);
+ }
+#else
vp8cx_pick_filter_level_fast(cpi->Source, cpi);
-
- else
+#endif
+ } else {
+#if CONFIG_TEMPORAL_DENOISING
+ if (cpi->oxcf.noise_sensitivity && cm->frame_type != KEY_FRAME) {
+ // Use the denoised buffer for selecting base loop filter level.
+ // Denoised signal for current frame is stored in INTRA_FRAME.
+ // No denoising on key frames.
+ vp8cx_pick_filter_level(
+ &cpi->denoiser.yv12_running_avg[INTRA_FRAME], cpi);
+ } else {
+ vp8cx_pick_filter_level(cpi->Source, cpi);
+ }
+#else
vp8cx_pick_filter_level(cpi->Source, cpi);
+#endif
+ }
+
if (cm->filter_level > 0)
{
@@ -3447,7 +3587,9 @@ void vp8_loopfilter_frame(VP8_COMP *cpi, VP8_COMMON *cm)
sem_post(&cpi->h_event_end_lpf); /* signal that we have set filter_level */
#endif
- if (cm->filter_level > 0)
+ // No need to apply loop-filter if the encoded frame does not update
+ // any reference buffers.
+ if (cm->filter_level > 0 && update_any_ref_buffers)
{
vp8_loop_filter_frame(cm, &cpi->mb.e_mbd, frame_type);
}
@@ -3577,39 +3719,78 @@ static void encode_frame_to_data_rate
}
#if CONFIG_MULTI_RES_ENCODING
- /* In multi-resolution encoding, frame_type is decided by lowest-resolution
- * encoder. Same frame_type is adopted while encoding at other resolution.
- */
- if (cpi->oxcf.mr_encoder_id)
- {
- LOWER_RES_FRAME_INFO* low_res_frame_info
- = (LOWER_RES_FRAME_INFO*)cpi->oxcf.mr_low_res_mode_info;
+ if (cpi->oxcf.mr_total_resolutions > 1) {
+ LOWER_RES_FRAME_INFO* low_res_frame_info
+ = (LOWER_RES_FRAME_INFO*)cpi->oxcf.mr_low_res_mode_info;
+ if (cpi->oxcf.mr_encoder_id) {
+
+ // TODO(marpan): This constraint shouldn't be needed, as we would like
+ // to allow for key frame setting (forced or periodic) defined per
+ // spatial layer. For now, keep this in.
cm->frame_type = low_res_frame_info->frame_type;
+ // Check if lower resolution is available for motion vector reuse.
if(cm->frame_type != KEY_FRAME)
{
- cpi->mr_low_res_mv_avail = 1;
- cpi->mr_low_res_mv_avail &= !(low_res_frame_info->is_frame_dropped);
-
- if (cpi->ref_frame_flags & VP8_LAST_FRAME)
- cpi->mr_low_res_mv_avail &= (cpi->current_ref_frames[LAST_FRAME]
- == low_res_frame_info->low_res_ref_frames[LAST_FRAME]);
-
- if (cpi->ref_frame_flags & VP8_GOLD_FRAME)
- cpi->mr_low_res_mv_avail &= (cpi->current_ref_frames[GOLDEN_FRAME]
- == low_res_frame_info->low_res_ref_frames[GOLDEN_FRAME]);
+ cpi->mr_low_res_mv_avail = 1;
+ cpi->mr_low_res_mv_avail &= !(low_res_frame_info->is_frame_dropped);
+
+ if (cpi->ref_frame_flags & VP8_LAST_FRAME)
+ cpi->mr_low_res_mv_avail &= (cpi->current_ref_frames[LAST_FRAME]
+ == low_res_frame_info->low_res_ref_frames[LAST_FRAME]);
+
+ if (cpi->ref_frame_flags & VP8_GOLD_FRAME)
+ cpi->mr_low_res_mv_avail &= (cpi->current_ref_frames[GOLDEN_FRAME]
+ == low_res_frame_info->low_res_ref_frames[GOLDEN_FRAME]);
+
+ // Don't use altref to determine whether low res is available.
+ // TODO(marpan): Should we apply this type of condition on a
+ // per-reference-frame basis?
+ /*
+ if (cpi->ref_frame_flags & VP8_ALTR_FRAME)
+ cpi->mr_low_res_mv_avail &= (cpi->current_ref_frames[ALTREF_FRAME]
+ == low_res_frame_info->low_res_ref_frames[ALTREF_FRAME]);
+ */
+ }
+ }
- if (cpi->ref_frame_flags & VP8_ALTR_FRAME)
- cpi->mr_low_res_mv_avail &= (cpi->current_ref_frames[ALTREF_FRAME]
- == low_res_frame_info->low_res_ref_frames[ALTREF_FRAME]);
+ // On a key frame: For the lowest resolution, keep track of the key frame
+ // counter value. For the higher resolutions, reset the current video
+ // frame counter to that of the lowest resolution.
+ // This is done to handle the case where we may stop/start encoding
+ // higher layer(s). The restart of a higher layer is only signaled
+ // by a key frame for now.
+ // TODO (marpan): Add flag to indicate restart-encoding of higher layer.
+ if (cm->frame_type == KEY_FRAME) {
+ if (cpi->oxcf.mr_encoder_id) {
+ // If the initial starting value of the buffer level is zero (this can
+ // happen because we may not have started encoding this higher stream),
+ // then reset it to a non-zero value based on |starting_buffer_level|.
+ if (cpi->common.current_video_frame == 0 && cpi->buffer_level == 0) {
+ unsigned int i;
+ cpi->bits_off_target = cpi->oxcf.starting_buffer_level;
+ cpi->buffer_level = cpi->oxcf.starting_buffer_level;
+ for (i = 0; i < cpi->oxcf.number_of_layers; i++) {
+ LAYER_CONTEXT *lc = &cpi->layer_context[i];
+ lc->bits_off_target = lc->starting_buffer_level;
+ lc->buffer_level = lc->starting_buffer_level;
+ }
+ }
+ cpi->common.current_video_frame =
+ low_res_frame_info->key_frame_counter_value;
+ } else {
+ low_res_frame_info->key_frame_counter_value =
+ cpi->common.current_video_frame;
}
+ }
+
}
#endif
// Find the reference frame closest to the current frame.
cpi->closest_reference_frame = LAST_FRAME;
- if (cm->frame_type != KEY_FRAME) {
+ if(cm->frame_type != KEY_FRAME) {
int i;
MV_REFERENCE_FRAME closest_ref = INTRA_FRAME;
if (cpi->ref_frame_flags & VP8_LAST_FRAME) {
@@ -3619,12 +3800,12 @@ static void encode_frame_to_data_rate
} else if (cpi->ref_frame_flags & VP8_ALTR_FRAME) {
closest_ref = ALTREF_FRAME;
}
- for (i = 1; i <= 3; i++) {
+ for(i = 1; i <= 3; i++) {
vpx_ref_frame_type_t ref_frame_type = (vpx_ref_frame_type_t)
((i == 3) ? 4 : i);
if (cpi->ref_frame_flags & ref_frame_type) {
if ((cm->current_video_frame - cpi->current_ref_frames[i]) <
- (cm->current_video_frame - cpi->current_ref_frames[closest_ref])) {
+ (cm->current_video_frame - cpi->current_ref_frames[closest_ref])) {
closest_ref = i;
}
}
@@ -3650,7 +3831,9 @@ static void encode_frame_to_data_rate
}
// Reset the zero_last counter to 0 on key frame.
- vpx_memset(cpi->consec_zero_last, 0, cm->mb_rows * cm->mb_cols);
+ memset(cpi->consec_zero_last, 0, cm->mb_rows * cm->mb_cols);
+ memset(cpi->consec_zero_last_mvbias, 0,
+ (cpi->common.mb_rows * cpi->common.mb_cols));
}
#if 0
@@ -4179,8 +4362,10 @@ static void encode_frame_to_data_rate
else
disable_segmentation(cpi);
}
- // Reset the consec_zero_last counter on key frame.
- vpx_memset(cpi->consec_zero_last, 0, cm->mb_rows * cm->mb_cols);
+ // Reset the zero_last counter to 0 on key frame.
+ memset(cpi->consec_zero_last, 0, cm->mb_rows * cm->mb_cols);
+ memset(cpi->consec_zero_last_mvbias, 0,
+ (cpi->common.mb_rows * cpi->common.mb_cols));
vp8_set_quantizer(cpi, Q);
}
@@ -4203,7 +4388,7 @@ static void encode_frame_to_data_rate
if (cm->refresh_entropy_probs == 0)
{
/* save a copy for later refresh */
- vpx_memcpy(&cm->lfc, &cm->fc, sizeof(cm->fc));
+ memcpy(&cm->lfc, &cm->fc, sizeof(cm->fc));
}
vp8_update_coef_context(cpi);
@@ -4613,6 +4798,22 @@ static void encode_frame_to_data_rate
cm->frame_to_show = &cm->yv12_fb[cm->new_fb_idx];
#if CONFIG_TEMPORAL_DENOISING
+ // Get some measure of the amount of noise, by measuring the (partial) mse
+ // between the source and the denoised buffer, for the y channel. Partial
+ // refers to computing the sse on a sub-sample of the frame (i.e., skip
+ // x blocks along row/column), and only for blocks in that set that have
+ // used ZEROMV_LAST mode consecutively.
+ // Do this every ~8 frames, to further reduce complexity.
+ // TODO(marpan): Keep this for now for the case cpi->oxcf.noise_sensitivity < 4;
+ // it should be removed in favor of the process_denoiser_mode_change() function below.
+ if (cpi->oxcf.noise_sensitivity > 0 &&
+ cpi->oxcf.noise_sensitivity < 4 &&
+ !cpi->oxcf.screen_content_mode &&
+ cpi->frames_since_key%8 == 0 &&
+ cm->frame_type != KEY_FRAME) {
+ cpi->mse_source_denoised = measure_square_diff_partial(
+ &cpi->denoiser.yv12_running_avg[INTRA_FRAME], cpi->Source, cpi);
+ }
+
// For the adaptive denoising mode (noise_sensitivity == 4), sample the mse
// of source diff (between current and previous frame), and determine if we
// should switch the denoiser mode. Sampling refers to computing the mse for
@@ -4621,6 +4822,7 @@ static void encode_frame_to_data_rate
// constraint on the sum diff between blocks. This process is called every
// ~8 frames, to further reduce complexity.
if (cpi->oxcf.noise_sensitivity == 4 &&
+ !cpi->oxcf.screen_content_mode &&
cpi->frames_since_key % 8 == 0 &&
cm->frame_type != KEY_FRAME) {
process_denoiser_mode_change(cpi);
@@ -4758,6 +4960,13 @@ static void encode_frame_to_data_rate
if (cpi->bits_off_target > cpi->oxcf.maximum_buffer_size)
cpi->bits_off_target = cpi->oxcf.maximum_buffer_size;
+ // If the frame dropper is not enabled, don't let the buffer level go below
+ // some threshold, given here by -|maximum_buffer_size|. For now we only do
+ // this for screen content input.
+ if (cpi->drop_frames_allowed == 0 && cpi->oxcf.screen_content_mode &&
+ cpi->bits_off_target < -cpi->oxcf.maximum_buffer_size)
+ cpi->bits_off_target = -cpi->oxcf.maximum_buffer_size;
+
/* Rolling monitors of whether we are over or underspending used to
* help regulate min and Max Q in two pass.
*/
@@ -5232,7 +5441,26 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, unsigned l
cpi->ref_framerate = 10000000.0 / avg_duration;
}
-
+#if CONFIG_MULTI_RES_ENCODING
+ if (cpi->oxcf.mr_total_resolutions > 1) {
+ LOWER_RES_FRAME_INFO* low_res_frame_info = (LOWER_RES_FRAME_INFO*)
+ cpi->oxcf.mr_low_res_mode_info;
+ // Frame rate should be the same for all spatial layers in
+ // multi-res-encoding (simulcast), so we constrain the frame rate for
+ // higher layers to be that of the lowest resolution. This is needed
+ // as the application may decide to skip encoding a high layer and
+ // then start again, in which case a big jump in time-stamps will
+ // be received for that high layer, which will yield an incorrect
+ // frame rate (from the time-stamp adjustment in the above calculation).
+ if (cpi->oxcf.mr_encoder_id) {
+ cpi->ref_framerate = low_res_frame_info->low_res_framerate;
+ }
+ else {
+ // Keep track of frame rate for lowest resolution.
+ low_res_frame_info->low_res_framerate = cpi->ref_framerate;
+ }
+ }
+#endif
if (cpi->oxcf.number_of_layers > 1)
{
unsigned int i;
@@ -5262,8 +5490,12 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, unsigned l
update_layer_contexts (cpi);
/* Restore layer specific context & set frame rate */
- layer = cpi->oxcf.layer_id[
- cpi->temporal_pattern_counter % cpi->oxcf.periodicity];
+ if (cpi->temporal_layer_id >= 0) {
+ layer = cpi->temporal_layer_id;
+ } else {
+ layer = cpi->oxcf.layer_id[
+ cpi->temporal_pattern_counter % cpi->oxcf.periodicity];
+ }
restore_layer_context (cpi, layer);
vp8_new_framerate(cpi, cpi->layer_context[layer].framerate);
}
@@ -5382,19 +5614,19 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, unsigned l
if (cm->refresh_entropy_probs == 0)
{
- vpx_memcpy(&cm->fc, &cm->lfc, sizeof(cm->fc));
+ memcpy(&cm->fc, &cm->lfc, sizeof(cm->fc));
}
/* Save the contexts separately for alt ref, gold and last. */
/* (TODO jbb -> Optimize this with pointers to avoid extra copies. ) */
if(cm->refresh_alt_ref_frame)
- vpx_memcpy(&cpi->lfc_a, &cm->fc, sizeof(cm->fc));
+ memcpy(&cpi->lfc_a, &cm->fc, sizeof(cm->fc));
if(cm->refresh_golden_frame)
- vpx_memcpy(&cpi->lfc_g, &cm->fc, sizeof(cm->fc));
+ memcpy(&cpi->lfc_g, &cm->fc, sizeof(cm->fc));
if(cm->refresh_last_frame)
- vpx_memcpy(&cpi->lfc_n, &cm->fc, sizeof(cm->fc));
+ memcpy(&cpi->lfc_n, &cm->fc, sizeof(cm->fc));
/* if its a dropped frame honor the requests on subsequent frames */
if (*size > 0)
@@ -5439,19 +5671,23 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, unsigned l
double frame_psnr;
YV12_BUFFER_CONFIG *orig = cpi->Source;
YV12_BUFFER_CONFIG *recon = cpi->common.frame_to_show;
- int y_samples = orig->y_height * orig->y_width ;
- int uv_samples = orig->uv_height * orig->uv_width ;
+ unsigned int y_width = cpi->common.Width;
+ unsigned int y_height = cpi->common.Height;
+ unsigned int uv_width = (y_width + 1) / 2;
+ unsigned int uv_height = (y_height + 1) / 2;
+ int y_samples = y_height * y_width;
+ int uv_samples = uv_height * uv_width;
int t_samples = y_samples + 2 * uv_samples;
double sq_error;
ye = calc_plane_error(orig->y_buffer, orig->y_stride,
- recon->y_buffer, recon->y_stride, orig->y_width, orig->y_height);
+ recon->y_buffer, recon->y_stride, y_width, y_height);
ue = calc_plane_error(orig->u_buffer, orig->uv_stride,
- recon->u_buffer, recon->uv_stride, orig->uv_width, orig->uv_height);
+ recon->u_buffer, recon->uv_stride, uv_width, uv_height);
ve = calc_plane_error(orig->v_buffer, orig->uv_stride,
- recon->v_buffer, recon->uv_stride, orig->uv_width, orig->uv_height);
+ recon->v_buffer, recon->uv_stride, uv_width, uv_height);
sq_error = (double)(ye + ue + ve);
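vpx_sse_to_psnr() converts this total squared error into PSNR in dB. A minimal equivalent sketch; the 100 dB cap for (near-)lossless frames is an assumption taken from libvpx's MAX_PSNR convention:

    #include <math.h>
    /* PSNR in dB over |samples| values with peak value |peak|
     * (255.0 for 8-bit video), capped at an assumed 100 dB. */
    static double sse_to_psnr(double samples, double peak, double sse)
    {
        if (sse > 0.0) {
            const double psnr = 10.0 * log10(samples * peak * peak / sse);
            return psnr > 100.0 ? 100.0 : psnr;
        }
        return 100.0;
    }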
@@ -5473,13 +5709,13 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, unsigned l
vp8_clear_system_state();
ye = calc_plane_error(orig->y_buffer, orig->y_stride,
- pp->y_buffer, pp->y_stride, orig->y_width, orig->y_height);
+ pp->y_buffer, pp->y_stride, y_width, y_height);
ue = calc_plane_error(orig->u_buffer, orig->uv_stride,
- pp->u_buffer, pp->uv_stride, orig->uv_width, orig->uv_height);
+ pp->u_buffer, pp->uv_stride, uv_width, uv_height);
ve = calc_plane_error(orig->v_buffer, orig->uv_stride,
- pp->v_buffer, pp->uv_stride, orig->uv_width, orig->uv_height);
+ pp->v_buffer, pp->uv_stride, uv_width, uv_height);
sq_error2 = (double)(ye + ue + ve);
@@ -5606,6 +5842,7 @@ int vp8_get_preview_raw_frame(VP8_COMP *cpi, YV12_BUFFER_CONFIG *dest, vp8_ppfla
cpi->common.show_frame_mi = cpi->common.mi;
ret = vp8_post_proc_frame(&cpi->common, dest, flags);
#else
+ (void)flags;
if (cpi->common.frame_to_show)
{
@@ -5698,7 +5935,7 @@ int vp8_set_active_map(VP8_COMP *cpi, unsigned char *map, unsigned int rows, uns
{
if (map)
{
- vpx_memcpy(cpi->active_map, map, rows * cols);
+ memcpy(cpi->active_map, map, rows * cols);
cpi->active_map_enabled = 1;
}
else
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/onyx_int.h b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/onyx_int.h
index f0424e69ca5..82d7453902c 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/onyx_int.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/onyx_int.h
@@ -513,10 +513,18 @@ typedef struct VP8_COMP
signed char *cyclic_refresh_map;
// Count of how many (consecutive) times a macroblock uses ZEROMV_LAST.
unsigned char *consec_zero_last;
+ // Counter that is reset when a block is checked for a mode-bias against
+ // ZEROMV_LASTREF.
+ unsigned char *consec_zero_last_mvbias;
// Frame counter for the temporal pattern. Counter is reset when the temporal
// layers are changed dynamically (run-time change).
unsigned int temporal_pattern_counter;
+ // Temporal layer id.
+ int temporal_layer_id;
+
+ // Measure of average squared difference between source and denoised signal.
+ int mse_source_denoised;
#if CONFIG_MULTITHREAD
/* multithread data */
@@ -657,6 +665,9 @@ typedef struct VP8_COMP
int droppable;
+ int initial_width;
+ int initial_height;
+
#if CONFIG_TEMPORAL_DENOISING
VP8_DENOISER denoiser;
#endif
@@ -687,6 +698,7 @@ typedef struct VP8_COMP
#endif
/* The frame number of each reference frames */
unsigned int current_ref_frames[MAX_REF_FRAMES];
+ // Closest reference frame to current frame.
MV_REFERENCE_FRAME closest_reference_frame;
struct rd_costs_struct
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/pickinter.c b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/pickinter.c
index 9d5556dcdee..c4c0e7e9e23 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/pickinter.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/pickinter.c
@@ -40,6 +40,134 @@ extern const MB_PREDICTION_MODE vp8_mode_order[MAX_MODES];
extern int vp8_cost_mv_ref(MB_PREDICTION_MODE m, const int near_mv_ref_ct[4]);
+// Fixed-point implementation of a skin color classifier. Skin color
+// is modeled by a Gaussian distribution in the CbCr color space.
+// See ../../test/skin_color_detector_test.cc where the reference
+// skin color classifier is defined.
+
+// Fixed-point skin color model parameters.
+static const int skin_mean[2] = {7463, 9614}; // q6
+static const int skin_inv_cov[4] = {4107, 1663, 1663, 2157}; // q16
+static const int skin_threshold = 1570636; // q18
+
+// Evaluates the Mahalanobis distance measure for the input CbCr values.
+static int evaluate_skin_color_difference(int cb, int cr)
+{
+ const int cb_q6 = cb << 6;
+ const int cr_q6 = cr << 6;
+ const int cb_diff_q12 = (cb_q6 - skin_mean[0]) * (cb_q6 - skin_mean[0]);
+ const int cbcr_diff_q12 = (cb_q6 - skin_mean[0]) * (cr_q6 - skin_mean[1]);
+ const int cr_diff_q12 = (cr_q6 - skin_mean[1]) * (cr_q6 - skin_mean[1]);
+ const int cb_diff_q2 = (cb_diff_q12 + (1 << 9)) >> 10;
+ const int cbcr_diff_q2 = (cbcr_diff_q12 + (1 << 9)) >> 10;
+ const int cr_diff_q2 = (cr_diff_q12 + (1 << 9)) >> 10;
+ const int skin_diff = skin_inv_cov[0] * cb_diff_q2 +
+ skin_inv_cov[1] * cbcr_diff_q2 +
+ skin_inv_cov[2] * cbcr_diff_q2 +
+ skin_inv_cov[3] * cr_diff_q2;
+ return skin_diff;
+}
+
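For intuition, the same model in floating point; this is a sketch derived from the q6/q16/q18 constants above, not a reference implementation. The CbCr mean works out to (7463/64, 9614/64) ~= (116.6, 150.2) and the threshold to 1570636/2^18 ~= 5.99:

    /* Floating-point sketch of the fixed-point classifier above. */
    static int skin_diff_exceeded(int cb, int cr)
    {
        const double mean_cb = 7463.0 / 64.0;              /* q6 */
        const double mean_cr = 9614.0 / 64.0;              /* q6 */
        const double icov[4] = { 4107.0 / 65536.0, 1663.0 / 65536.0,
                                 1663.0 / 65536.0, 2157.0 / 65536.0 };  /* q16 */
        const double threshold = 1570636.0 / 262144.0;     /* q18 */
        const double dcb = cb - mean_cb;
        const double dcr = cr - mean_cr;
        /* Mahalanobis distance in CbCr. */
        const double d = icov[0] * dcb * dcb + (icov[1] + icov[2]) * dcb * dcr +
                         icov[3] * dcr * dcr;
        return d >= threshold;  /* large distance => not skin */
    }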
+static int macroblock_corner_grad(unsigned char* signal, int stride,
+ int offsetx, int offsety, int sgnx, int sgny)
+{
+ int y1 = signal[offsetx * stride + offsety];
+ int y2 = signal[offsetx * stride + offsety + sgny];
+ int y3 = signal[(offsetx + sgnx) * stride + offsety];
+ int y4 = signal[(offsetx + sgnx) * stride + offsety + sgny];
+ return MAX(MAX(abs(y1 - y2), abs(y1 - y3)), abs(y1 - y4));
+}
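For example, with y1 = 130 and the three neighbors y2 = 128, y3 = 100, y4 = 129, the measure is MAX(MAX(2, 30), 1) = 30: a strong one-pixel edge at the corner, well above the threshold1 = 6 used below.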
+
+static int check_dot_artifact_candidate(VP8_COMP *cpi,
+ MACROBLOCK *x,
+ unsigned char *target_last,
+ int stride,
+ unsigned char* last_ref,
+ int mb_row,
+ int mb_col,
+ int channel)
+{
+ int threshold1 = 6;
+ int threshold2 = 3;
+ unsigned int max_num = (cpi->common.MBs) / 10;
+ int grad_last = 0;
+ int grad_source = 0;
+ int index = mb_row * cpi->common.mb_cols + mb_col;
+ // Threshold for #consecutive (base layer) frames using zero_last mode.
+ int num_frames = 30;
+ int shift = 15;
+ if (channel > 0) {
+ shift = 7;
+ }
+ if (cpi->oxcf.number_of_layers > 1)
+ {
+ num_frames = 20;
+ }
+ x->zero_last_dot_suppress = 0;
+ // Blocks on base layer frames that have been using ZEROMV_LAST repeatedly
+ // (i.e., for at least |x| consecutive frames) are candidates for increasing
+ // the rd adjustment for zero_last mode.
+ // Only allow this for at most |max_num| blocks per frame.
+ // Don't allow this for screen content input.
+ if (cpi->current_layer == 0 &&
+ cpi->consec_zero_last_mvbias[index] > num_frames &&
+ x->mbs_zero_last_dot_suppress < max_num &&
+ !cpi->oxcf.screen_content_mode)
+ {
+ // If this block is checked here, label it so we don't check it again until
+ // ~|x| frames later.
+ x->zero_last_dot_suppress = 1;
+ // A dot artifact is noticeable as a strong gradient at the corners of a
+ // macroblock, in flat areas. As a simple detector for now, we look for a
+ // high corner gradient on the last ref, and a smaller gradient on the
+ // source. Check 4 corners; return if any corner satisfies the condition.
+ // Top-left:
+ grad_last = macroblock_corner_grad(last_ref, stride, 0, 0, 1, 1);
+ grad_source = macroblock_corner_grad(target_last, stride, 0, 0, 1, 1);
+ if (grad_last >= threshold1 && grad_source <= threshold2)
+ {
+ x->mbs_zero_last_dot_suppress++;
+ return 1;
+ }
+ // Top-right:
+ grad_last = macroblock_corner_grad(last_ref, stride, 0, shift, 1, -1);
+ grad_source = macroblock_corner_grad(target_last, stride, 0, shift, 1, -1);
+ if (grad_last >= threshold1 && grad_source <= threshold2)
+ {
+ x->mbs_zero_last_dot_suppress++;
+ return 1;
+ }
+ // Bottom-left:
+ grad_last = macroblock_corner_grad(last_ref, stride, shift, 0, -1, 1);
+ grad_source = macroblock_corner_grad(target_last, stride, shift, 0, -1, 1);
+ if (grad_last >= threshold1 && grad_source <= threshold2)
+ {
+ x->mbs_zero_last_dot_suppress++;
+ return 1;
+ }
+ // Bottom-right:
+ grad_last = macroblock_corner_grad(last_ref, stride, shift, shift, -1, -1);
+ grad_source = macroblock_corner_grad(target_last, stride, shift, shift, -1, -1);
+ if (grad_last >= threshold1 && grad_source <= threshold2)
+ {
+ x->mbs_zero_last_dot_suppress++;
+ return 1;
+ }
+ return 0;
+ }
+ return 0;
+}
+
+// Checks if the input YCbCr values correspond to skin color.
+static int is_skin_color(int y, int cb, int cr)
+{
+ if (y < 40 || y > 220)
+ {
+ return 0;
+ }
+ return (evaluate_skin_color_difference(cb, cr) < skin_threshold);
+}
+
int vp8_skip_fractional_mv_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d,
int_mv *bestmv, int_mv *ref_mv,
int error_per_bit,
@@ -52,6 +180,7 @@ int vp8_skip_fractional_mv_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d,
(void) ref_mv;
(void) error_per_bit;
(void) vfp;
+ (void) mb;
(void) mvcost;
(void) distortion;
(void) sse;
@@ -514,10 +643,17 @@ static int evaluate_inter_mode(unsigned int* sse, int rate2, int* distortion2,
#endif
// Adjust rd for ZEROMV and LAST, if LAST is the closest reference frame.
- if (this_mode == ZEROMV &&
- x->e_mbd.mode_info_context->mbmi.ref_frame == LAST_FRAME &&
- (denoise_aggressive || cpi->closest_reference_frame == LAST_FRAME)) {
- this_rd = ((int64_t)this_rd) * rd_adj / 100;
+ // TODO: We should also add a condition on the distance of the closest reference to the current frame.
+ if(!cpi->oxcf.screen_content_mode &&
+ this_mode == ZEROMV &&
+ x->e_mbd.mode_info_context->mbmi.ref_frame == LAST_FRAME &&
+ (denoise_aggressive || (cpi->closest_reference_frame == LAST_FRAME)))
+ {
+ // No adjustment if the block is considered to be a skin area.
+ if(x->is_skin)
+ rd_adj = 100;
+
+ this_rd = ((int64_t)this_rd) * rd_adj / 100;
}
check_for_encode_breakout(*sse, x);
@@ -597,6 +733,15 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
#endif
int sf_improved_mv_pred = cpi->sf.improved_mv_pred;
+
+#if CONFIG_MULTI_RES_ENCODING
+ int dissim = INT_MAX;
+ int parent_ref_frame = 0;
+ int_mv parent_ref_mv;
+ MB_PREDICTION_MODE parent_mode = 0;
+ int parent_ref_valid = 0;
+#endif
+
int_mv mvp;
int near_sadidx[8] = {0, 1, 2, 3, 4, 5, 6, 7};
@@ -607,14 +752,56 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
unsigned char *plane[4][3];
int ref_frame_map[4];
int sign_bias = 0;
+ int dot_artifact_candidate = 0;
+ get_predictor_pointers(cpi, plane, recon_yoffset, recon_uvoffset);
-#if CONFIG_MULTI_RES_ENCODING
- int dissim = INT_MAX;
- int parent_ref_frame = 0;
- int parent_ref_valid = cpi->oxcf.mr_encoder_id && cpi->mr_low_res_mv_avail;
- int_mv parent_ref_mv;
- MB_PREDICTION_MODE parent_mode = 0;
+ // If the current frame is using LAST as a reference, check for
+ // biasing the mode selection for dot artifacts.
+ if (cpi->ref_frame_flags & VP8_LAST_FRAME) {
+ unsigned char* target_y = x->src.y_buffer;
+ unsigned char* target_u = x->block[16].src + *x->block[16].base_src;
+ unsigned char* target_v = x->block[20].src + *x->block[20].base_src;
+ int stride = x->src.y_stride;
+ int stride_uv = x->block[16].src_stride;
+#if CONFIG_TEMPORAL_DENOISING
+ if (cpi->oxcf.noise_sensitivity) {
+ const int uv_denoise = (cpi->oxcf.noise_sensitivity >= 2) ? 1 : 0;
+ target_y =
+ cpi->denoiser.yv12_running_avg[LAST_FRAME].y_buffer + recon_yoffset;
+ stride = cpi->denoiser.yv12_running_avg[LAST_FRAME].y_stride;
+ if (uv_denoise) {
+ target_u =
+ cpi->denoiser.yv12_running_avg[LAST_FRAME].u_buffer +
+ recon_uvoffset;
+ target_v =
+ cpi->denoiser.yv12_running_avg[LAST_FRAME].v_buffer +
+ recon_uvoffset;
+ stride_uv = cpi->denoiser.yv12_running_avg[LAST_FRAME].uv_stride;
+ }
+ }
+#endif
+ dot_artifact_candidate =
+ check_dot_artifact_candidate(cpi, x, target_y, stride,
+ plane[LAST_FRAME][0], mb_row, mb_col, 0);
+ // If not found in Y channel, check UV channel.
+ if (!dot_artifact_candidate) {
+ dot_artifact_candidate =
+ check_dot_artifact_candidate(cpi, x, target_u, stride_uv,
+ plane[LAST_FRAME][1], mb_row, mb_col, 1);
+ if (!dot_artifact_candidate) {
+ dot_artifact_candidate =
+ check_dot_artifact_candidate(cpi, x, target_v, stride_uv,
+ plane[LAST_FRAME][2], mb_row, mb_col, 2);
+ }
+ }
+ }
+#if CONFIG_MULTI_RES_ENCODING
+ // |parent_ref_valid| will be set here if we can potentially do mv reuse for
+ // this higher resolution (|cpi->oxcf.mr_encoder_id| > 0) frame.
+ // |parent_ref_valid| may be reset depending on |parent_ref_frame| for
+ // the current macroblock below.
+ parent_ref_valid = cpi->oxcf.mr_encoder_id && cpi->mr_low_res_mv_avail;
if (parent_ref_valid)
{
int parent_ref_flag;
@@ -632,24 +819,51 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
* In this event, take the conservative approach of disabling the
* lower res info for this MB.
*/
+
parent_ref_flag = 0;
+ // Note availability for mv reuse is only based on last and golden.
if (parent_ref_frame == LAST_FRAME)
parent_ref_flag = (cpi->ref_frame_flags & VP8_LAST_FRAME);
else if (parent_ref_frame == GOLDEN_FRAME)
parent_ref_flag = (cpi->ref_frame_flags & VP8_GOLD_FRAME);
- else if (parent_ref_frame == ALTREF_FRAME)
- parent_ref_flag = (cpi->ref_frame_flags & VP8_ALTR_FRAME);
//assert(!parent_ref_frame || parent_ref_flag);
+
+ // If |parent_ref_frame| did not match either last or golden then
+ // shut off mv reuse.
if (parent_ref_frame && !parent_ref_flag)
parent_ref_valid = 0;
+
+ // Don't do mv reuse since we want to allow for another mode besides
+ // ZEROMV_LAST to remove dot artifact.
+ if (dot_artifact_candidate)
+ parent_ref_valid = 0;
+ }
+#endif
+
+ // Check if current macroblock is in skin area.
+ {
+ const int y = x->src.y_buffer[7 * x->src.y_stride + 7];
+ const int cb = x->src.u_buffer[3 * x->src.uv_stride + 3];
+ const int cr = x->src.v_buffer[3 * x->src.uv_stride + 3];
+ x->is_skin = 0;
+ if (!cpi->oxcf.screen_content_mode)
+ x->is_skin = is_skin_color(y, cb, cr);
+ }
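The sampled position is the center of the macroblock: (7, 7) in the 16x16 luma block and (3, 3) in the co-located 8x8 chroma blocks; a single pixel per plane keeps the per-macroblock check cheap.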
+#if CONFIG_TEMPORAL_DENOISING
+ if (cpi->oxcf.noise_sensitivity) {
+ // Under aggressive denoising mode, should we use the skin map to reduce
+ // the denoiser and ZEROMV bias? Will need to revisit the accuracy of this
+ // detection for very noisy input. For now keep this as is (i.e., don't
+ // turn it off).
+ // if (cpi->denoiser.denoiser_mode == kDenoiserOnYUVAggressive)
+ // x->is_skin = 0;
}
#endif
mode_mv = mode_mv_sb[sign_bias];
best_ref_mv.as_int = 0;
- vpx_memset(mode_mv_sb, 0, sizeof(mode_mv_sb));
- vpx_memset(&best_mbmode, 0, sizeof(best_mbmode));
+ memset(mode_mv_sb, 0, sizeof(mode_mv_sb));
+ memset(&best_mbmode, 0, sizeof(best_mbmode));
/* Setup search priorities */
#if CONFIG_MULTI_RES_ENCODING
@@ -680,8 +894,6 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
best_ref_mv.as_int = best_ref_mv_sb[sign_bias].as_int;
}
- get_predictor_pointers(cpi, plane, recon_yoffset, recon_uvoffset);
-
/* Count of the number of MBs tested so far this frame */
x->mbs_tested_so_far++;
@@ -691,9 +903,13 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
x->e_mbd.mode_info_context->mbmi.ref_frame = INTRA_FRAME;
/* If the frame has big static background and current MB is in low
- * motion area, its mode decision is biased to ZEROMV mode.
- */
- calculate_zeromv_rd_adjustment(cpi, x, &rd_adjustment);
+ * motion area, its mode decision is biased to ZEROMV mode.
+ * No adjustment if cpu_used is <= -12 (i.e., cpi->Speed >= 12).
+ * At such speed settings, ZEROMV is already heavily favored.
+ */
+ if (cpi->Speed < 12) {
+ calculate_zeromv_rd_adjustment(cpi, x, &rd_adjustment);
+ }
#if CONFIG_TEMPORAL_DENOISING
if (cpi->oxcf.noise_sensitivity) {
@@ -702,6 +918,13 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
}
#endif
+ if (dot_artifact_candidate)
+ {
+ // Bias against ZEROMV_LAST mode.
+ rd_adjustment = 150;
+ }
+
+
/* if we encode a new mv this is important
* find the best new motion vector
*/
@@ -887,14 +1110,17 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
step_param = cpi->sf.first_step + speed_adjust;
#if CONFIG_MULTI_RES_ENCODING
- /* If lower-res drops this frame, then higher-res encoder does
- motion search without any previous knowledge. Also, since
- last frame motion info is not stored, then we can not
+ /* If the lower-res frame is not available for mv reuse (because of
+ frame dropping or a different temporal layer pattern), then the higher
+ resolution encoder does motion search without any previous knowledge.
+ Also, since last frame motion info is not stored, we cannot
use improved_mv_pred. */
- if (cpi->oxcf.mr_encoder_id && !parent_ref_valid)
+ if (cpi->oxcf.mr_encoder_id)
sf_improved_mv_pred = 0;
- if (parent_ref_valid && parent_ref_frame)
+ // Only use parent MV as predictor if this candidate reference frame
+ // (|this_ref_frame|) is equal to |parent_ref_frame|.
+ if (parent_ref_valid && (parent_ref_frame == this_ref_frame))
{
/* Use parent MV as predictor. Adjust search range
* accordingly.
@@ -938,7 +1164,8 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
}
#if CONFIG_MULTI_RES_ENCODING
- if (parent_ref_valid && parent_ref_frame && dissim <= 2 &&
+ if (parent_ref_valid && (parent_ref_frame == this_ref_frame) &&
+ dissim <= 2 &&
MAX(abs(best_ref_mv.as_mv.row - parent_ref_mv.as_mv.row),
abs(best_ref_mv.as_mv.col - parent_ref_mv.as_mv.col)) <= 4)
{
@@ -975,10 +1202,12 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
* change the behavior in lowest-resolution encoder.
* Will improve it later.
*/
- /* Set step_param to 0 to ensure large-range motion search
- when encoder drops this frame at lower-resolution.
- */
- if (!parent_ref_valid)
+ /* Set step_param to 0 to ensure large-range motion search
+ * when mv reuse is not valid (i.e. |parent_ref_valid| = 0),
+ * or if this candidate reference frame (|this_ref_frame|) is
+ * not equal to |parent_ref_frame|.
+ */
+ if (!parent_ref_valid || (parent_ref_frame != this_ref_frame))
step_param = 0;
#endif
bestsme = vp8_hex_search(x, b, d, &mvp_full, &d->bmi.mv,
@@ -1080,7 +1309,6 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
#if CONFIG_TEMPORAL_DENOISING
if (cpi->oxcf.noise_sensitivity)
{
-
/* Store for later use by denoiser. */
// Don't denoise with GOLDEN or ALTREF if they are old reference
// frames (greater than MAX_GF_ARF_DENOISE_RANGE frames in past).
@@ -1096,7 +1324,7 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
x->e_mbd.mode_info_context->mbmi.ref_frame;
}
- /* Store the best NEWMV in x for later use in the denoiser. */
+ // Store the best NEWMV in x for later use in the denoiser.
if (x->e_mbd.mode_info_context->mbmi.mode == NEWMV &&
sse < best_sse && !skip_old_reference)
{
@@ -1120,8 +1348,8 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
*returndistortion = distortion2;
best_rd_sse = sse;
best_rd = this_rd;
- vpx_memcpy(&best_mbmode, &x->e_mbd.mode_info_context->mbmi,
- sizeof(MB_MODE_INFO));
+ memcpy(&best_mbmode, &x->e_mbd.mode_info_context->mbmi,
+ sizeof(MB_MODE_INFO));
/* Testing this mode gave rise to an improvement in best error
* score. Lower threshold a bit for next time
@@ -1184,6 +1412,8 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
if (cpi->oxcf.noise_sensitivity)
{
int block_index = mb_row * cpi->common.mb_cols + mb_col;
+ int reevaluate = 0;
+ int is_noisy = 0;
if (x->best_sse_inter_mode == DC_PRED)
{
/* No best MV found. */
@@ -1193,18 +1423,52 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
x->best_reference_frame = best_mbmode.ref_frame;
best_sse = best_rd_sse;
}
+ // For non-skin blocks that have selected ZEROMV for the current frame,
+ // and have been selecting ZEROMV_LAST (on the base layer frame) for at
+ // least |x~20| consecutive past frames in a row, label the block for a
+ // possible increase in denoising strength. We also condition this
+ // labeling on there being significant denoising in the scene.
+ if (cpi->oxcf.noise_sensitivity == 4) {
+ if (cpi->denoiser.nmse_source_diff >
+ 70 * cpi->denoiser.threshold_aggressive_mode / 100)
+ is_noisy = 1;
+ } else {
+ if (cpi->mse_source_denoised > 1000)
+ is_noisy = 1;
+ }
x->increase_denoising = 0;
+ if (!x->is_skin &&
+ x->best_sse_inter_mode == ZEROMV &&
+ (x->best_reference_frame == LAST_FRAME ||
+ x->best_reference_frame == cpi->closest_reference_frame) &&
+ cpi->consec_zero_last[block_index] >= 20 &&
+ is_noisy) {
+ x->increase_denoising = 1;
+ }
+ x->denoise_zeromv = 0;
vp8_denoiser_denoise_mb(&cpi->denoiser, x, best_sse, zero_mv_sse,
recon_yoffset, recon_uvoffset,
&cpi->common.lf_info, mb_row, mb_col,
block_index);
- /* Reevaluate ZEROMV after denoising. */
- if (best_mbmode.ref_frame == INTRA_FRAME &&
+ // Reevaluate ZEROMV after denoising: for large noise content
+ // (i.e., cpi->mse_source_denoised is above threshold), do this for all
+ // blocks that did not pick ZEROMV as best mode but are using ZEROMV
+ // for denoising. Otherwise, always re-evaluate for blocks that picked
+ // INTRA mode as best mode.
+ // Avoid blocks that have been biased against ZERO_LAST
+ // (i.e., dot artifact candidate blocks).
+ reevaluate = (best_mbmode.ref_frame == INTRA_FRAME) ||
+ (best_mbmode.mode != ZEROMV &&
+ x->denoise_zeromv &&
+ cpi->mse_source_denoised > 2000);
+ if (!dot_artifact_candidate &&
+ reevaluate &&
x->best_zeromv_reference_frame != INTRA_FRAME)
{
int this_rd = 0;
int this_ref_frame = x->best_zeromv_reference_frame;
+ rd_adjustment = 100;
rate2 = x->ref_frame_cost[this_ref_frame] +
vp8_cost_mv_ref(ZEROMV, mdcounts);
distortion2 = 0;
@@ -1223,8 +1487,8 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
if (this_rd < best_rd)
{
- vpx_memcpy(&best_mbmode, &x->e_mbd.mode_info_context->mbmi,
- sizeof(MB_MODE_INFO));
+ memcpy(&best_mbmode, &x->e_mbd.mode_info_context->mbmi,
+ sizeof(MB_MODE_INFO));
}
}
@@ -1248,8 +1512,8 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
/* set to the best mb mode, this copy can be skipped if x->skip since it
* already has the right content */
if (!x->skip)
- vpx_memcpy(&x->e_mbd.mode_info_context->mbmi, &best_mbmode,
- sizeof(MB_MODE_INFO));
+ memcpy(&x->e_mbd.mode_info_context->mbmi, &best_mbmode,
+ sizeof(MB_MODE_INFO));
if (best_mbmode.mode <= B_PRED)
{
@@ -1264,7 +1528,6 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
update_mvcount(x, &best_ref_mv);
}
-
void vp8_pick_intra_mode(MACROBLOCK *x, int *rate_)
{
int error4x4, error16x16 = INT_MAX;
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/picklpf.c b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/picklpf.c
index f0c8f28fc96..890053dcfdc 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/picklpf.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/picklpf.c
@@ -49,7 +49,7 @@ static void yv12_copy_partial_frame(YV12_BUFFER_CONFIG *src_ybc,
src_y = src_ybc->y_buffer + yoffset;
dst_y = dst_ybc->y_buffer + yoffset;
- vpx_memcpy(dst_y, src_y, ystride * linestocopy);
+ memcpy(dst_y, src_y, ystride * linestocopy);
}
static int calc_partial_ssl_err(YV12_BUFFER_CONFIG *source,
@@ -142,7 +142,7 @@ void vp8cx_pick_filter_level_fast(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi)
int min_filter_level = get_min_filter_level(cpi, cm->base_qindex);
int max_filter_level = get_max_filter_level(cpi, cm->base_qindex);
int filt_val;
- int best_filt_val = cm->filter_level;
+ int best_filt_val;
YV12_BUFFER_CONFIG * saved_frame = cm->frame_to_show;
/* Replace unfiltered frame buffer with a new one */
@@ -274,8 +274,7 @@ void vp8cx_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi)
int filter_step;
int filt_high = 0;
- /* Start search at previous frame filter level */
- int filt_mid = cm->filter_level;
+ int filt_mid;
int filt_low = 0;
int filt_best;
int filt_direction = 0;
@@ -287,7 +286,7 @@ void vp8cx_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi)
YV12_BUFFER_CONFIG * saved_frame = cm->frame_to_show;
- vpx_memset(ss_err, 0, sizeof(ss_err));
+ memset(ss_err, 0, sizeof(ss_err));
/* Replace unfiltered frame buffer with a new one */
cm->frame_to_show = &cpi->pick_lf_lvl_frame;
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/ppc/csystemdependent.c b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/ppc/csystemdependent.c
deleted file mode 100644
index 63f23578467..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/ppc/csystemdependent.c
+++ /dev/null
@@ -1,160 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vp8/encoder/variance.h"
-#include "vp8/encoder/onyx_int.h"
-
-SADFunction *vp8_sad16x16;
-SADFunction *vp8_sad16x8;
-SADFunction *vp8_sad8x16;
-SADFunction *vp8_sad8x8;
-SADFunction *vp8_sad4x4;
-
-variance_function *vp8_variance4x4;
-variance_function *vp8_variance8x8;
-variance_function *vp8_variance8x16;
-variance_function *vp8_variance16x8;
-variance_function *vp8_variance16x16;
-
-variance_function *vp8_mse16x16;
-
-sub_pixel_variance_function *vp8_sub_pixel_variance4x4;
-sub_pixel_variance_function *vp8_sub_pixel_variance8x8;
-sub_pixel_variance_function *vp8_sub_pixel_variance8x16;
-sub_pixel_variance_function *vp8_sub_pixel_variance16x8;
-sub_pixel_variance_function *vp8_sub_pixel_variance16x16;
-
-int (*vp8_block_error)(short *coeff, short *dqcoeff);
-int (*vp8_mbblock_error)(MACROBLOCK *mb, int dc);
-
-int (*vp8_mbuverror)(MACROBLOCK *mb);
-unsigned int (*vp8_get_mb_ss)(short *);
-void (*vp8_short_fdct4x4)(short *input, short *output, int pitch);
-void (*vp8_short_fdct8x4)(short *input, short *output, int pitch);
-void (*vp8_fast_fdct4x4)(short *input, short *output, int pitch);
-void (*vp8_fast_fdct8x4)(short *input, short *output, int pitch);
-void (*short_walsh4x4)(short *input, short *output, int pitch);
-
-void (*vp8_subtract_b)(BLOCK *be, BLOCKD *bd, int pitch);
-void (*vp8_subtract_mby)(short *diff, unsigned char *src, unsigned char *pred, int stride);
-void (*vp8_subtract_mbuv)(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride);
-void (*vp8_fast_quantize_b)(BLOCK *b, BLOCKD *d);
-
-unsigned int (*vp8_get4x4sse_cs)(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride);
-
-// c imports
-extern int block_error_c(short *coeff, short *dqcoeff);
-extern int vp8_mbblock_error_c(MACROBLOCK *mb, int dc);
-
-extern int vp8_mbuverror_c(MACROBLOCK *mb);
-extern unsigned int vp8_get8x8var_c(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum);
-extern void short_fdct4x4_c(short *input, short *output, int pitch);
-extern void short_fdct8x4_c(short *input, short *output, int pitch);
-extern void vp8_short_walsh4x4_c(short *input, short *output, int pitch);
-
-extern void vp8_subtract_b_c(BLOCK *be, BLOCKD *bd, int pitch);
-extern void subtract_mby_c(short *diff, unsigned char *src, unsigned char *pred, int stride);
-extern void subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride);
-extern void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d);
-
-extern SADFunction sad16x16_c;
-extern SADFunction sad16x8_c;
-extern SADFunction sad8x16_c;
-extern SADFunction sad8x8_c;
-extern SADFunction sad4x4_c;
-
-extern variance_function variance16x16_c;
-extern variance_function variance8x16_c;
-extern variance_function variance16x8_c;
-extern variance_function variance8x8_c;
-extern variance_function variance4x4_c;
-extern variance_function mse16x16_c;
-
-extern sub_pixel_variance_function sub_pixel_variance4x4_c;
-extern sub_pixel_variance_function sub_pixel_variance8x8_c;
-extern sub_pixel_variance_function sub_pixel_variance8x16_c;
-extern sub_pixel_variance_function sub_pixel_variance16x8_c;
-extern sub_pixel_variance_function sub_pixel_variance16x16_c;
-
-extern unsigned int vp8_get_mb_ss_c(short *);
-extern unsigned int vp8_get4x4sse_cs_c(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride);
-
-// ppc
-extern int vp8_block_error_ppc(short *coeff, short *dqcoeff);
-
-extern void vp8_short_fdct4x4_ppc(short *input, short *output, int pitch);
-extern void vp8_short_fdct8x4_ppc(short *input, short *output, int pitch);
-
-extern void vp8_subtract_mby_ppc(short *diff, unsigned char *src, unsigned char *pred, int stride);
-extern void vp8_subtract_mbuv_ppc(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride);
-
-extern SADFunction vp8_sad16x16_ppc;
-extern SADFunction vp8_sad16x8_ppc;
-extern SADFunction vp8_sad8x16_ppc;
-extern SADFunction vp8_sad8x8_ppc;
-extern SADFunction vp8_sad4x4_ppc;
-
-extern variance_function vp8_variance16x16_ppc;
-extern variance_function vp8_variance8x16_ppc;
-extern variance_function vp8_variance16x8_ppc;
-extern variance_function vp8_variance8x8_ppc;
-extern variance_function vp8_variance4x4_ppc;
-extern variance_function vp8_mse16x16_ppc;
-
-extern sub_pixel_variance_function vp8_sub_pixel_variance4x4_ppc;
-extern sub_pixel_variance_function vp8_sub_pixel_variance8x8_ppc;
-extern sub_pixel_variance_function vp8_sub_pixel_variance8x16_ppc;
-extern sub_pixel_variance_function vp8_sub_pixel_variance16x8_ppc;
-extern sub_pixel_variance_function vp8_sub_pixel_variance16x16_ppc;
-
-extern unsigned int vp8_get8x8var_ppc(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum);
-extern unsigned int vp8_get16x16var_ppc(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum);
-
-void vp8_cmachine_specific_config(void)
-{
- // Pure C:
- vp8_mbuverror = vp8_mbuverror_c;
- vp8_fast_quantize_b = vp8_fast_quantize_b_c;
- vp8_short_fdct4x4 = vp8_short_fdct4x4_ppc;
- vp8_short_fdct8x4 = vp8_short_fdct8x4_ppc;
- vp8_fast_fdct4x4 = vp8_short_fdct4x4_ppc;
- vp8_fast_fdct8x4 = vp8_short_fdct8x4_ppc;
- short_walsh4x4 = vp8_short_walsh4x4_c;
-
- vp8_variance4x4 = vp8_variance4x4_ppc;
- vp8_variance8x8 = vp8_variance8x8_ppc;
- vp8_variance8x16 = vp8_variance8x16_ppc;
- vp8_variance16x8 = vp8_variance16x8_ppc;
- vp8_variance16x16 = vp8_variance16x16_ppc;
- vp8_mse16x16 = vp8_mse16x16_ppc;
-
- vp8_sub_pixel_variance4x4 = vp8_sub_pixel_variance4x4_ppc;
- vp8_sub_pixel_variance8x8 = vp8_sub_pixel_variance8x8_ppc;
- vp8_sub_pixel_variance8x16 = vp8_sub_pixel_variance8x16_ppc;
- vp8_sub_pixel_variance16x8 = vp8_sub_pixel_variance16x8_ppc;
- vp8_sub_pixel_variance16x16 = vp8_sub_pixel_variance16x16_ppc;
-
- vp8_get_mb_ss = vp8_get_mb_ss_c;
- vp8_get4x4sse_cs = vp8_get4x4sse_cs_c;
-
- vp8_sad16x16 = vp8_sad16x16_ppc;
- vp8_sad16x8 = vp8_sad16x8_ppc;
- vp8_sad8x16 = vp8_sad8x16_ppc;
- vp8_sad8x8 = vp8_sad8x8_ppc;
- vp8_sad4x4 = vp8_sad4x4_ppc;
-
- vp8_block_error = vp8_block_error_ppc;
- vp8_mbblock_error = vp8_mbblock_error_c;
-
- vp8_subtract_b = vp8_subtract_b_c;
- vp8_subtract_mby = vp8_subtract_mby_ppc;
- vp8_subtract_mbuv = vp8_subtract_mbuv_ppc;
-}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/ppc/encodemb_altivec.asm b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/ppc/encodemb_altivec.asm
deleted file mode 100644
index 6e0099ddc88..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/ppc/encodemb_altivec.asm
+++ /dev/null
@@ -1,153 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- .globl vp8_subtract_mbuv_ppc
- .globl vp8_subtract_mby_ppc
-
-;# r3 short *diff
-;# r4 unsigned char *usrc
-;# r5 unsigned char *vsrc
-;# r6 unsigned char *pred
-;# r7 int stride
-vp8_subtract_mbuv_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xf000
- mtspr 256, r12 ;# set VRSAVE
-
- li r9, 256
- add r3, r3, r9
- add r3, r3, r9
- add r6, r6, r9
-
- li r10, 16
- li r9, 4
- mtctr r9
-
- vspltisw v0, 0
-
-mbu_loop:
- lvsl v5, 0, r4 ;# permutate value for alignment
- lvx v1, 0, r4 ;# src
- lvx v2, 0, r6 ;# pred
-
- add r4, r4, r7
- addi r6, r6, 16
-
- vperm v1, v1, v0, v5
-
- vmrghb v3, v0, v1 ;# unpack high src to short
- vmrghb v4, v0, v2 ;# unpack high pred to short
-
- lvsl v5, 0, r4 ;# permutate value for alignment
- lvx v1, 0, r4 ;# src
-
- add r4, r4, r7
-
- vsubshs v3, v3, v4
-
- stvx v3, 0, r3 ;# store out diff
-
- vperm v1, v1, v0, v5
-
- vmrghb v3, v0, v1 ;# unpack high src to short
- vmrglb v4, v0, v2 ;# unpack high pred to short
-
- vsubshs v3, v3, v4
-
- stvx v3, r10, r3 ;# store out diff
-
- addi r3, r3, 32
-
- bdnz mbu_loop
-
- mtctr r9
-
-mbv_loop:
- lvsl v5, 0, r5 ;# permutate value for alignment
- lvx v1, 0, r5 ;# src
- lvx v2, 0, r6 ;# pred
-
- add r5, r5, r7
- addi r6, r6, 16
-
- vperm v1, v1, v0, v5
-
- vmrghb v3, v0, v1 ;# unpack high src to short
- vmrghb v4, v0, v2 ;# unpack high pred to short
-
- lvsl v5, 0, r5 ;# permutate value for alignment
- lvx v1, 0, r5 ;# src
-
- add r5, r5, r7
-
- vsubshs v3, v3, v4
-
- stvx v3, 0, r3 ;# store out diff
-
- vperm v1, v1, v0, v5
-
- vmrghb v3, v0, v1 ;# unpack high src to short
- vmrglb v4, v0, v2 ;# unpack high pred to short
-
- vsubshs v3, v3, v4
-
- stvx v3, r10, r3 ;# store out diff
-
- addi r3, r3, 32
-
- bdnz mbv_loop
-
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
-
-;# r3 short *diff
-;# r4 unsigned char *src
-;# r5 unsigned char *pred
-;# r6 int stride
-vp8_subtract_mby_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xf800
- mtspr 256, r12 ;# set VRSAVE
-
- li r10, 16
- mtctr r10
-
- vspltisw v0, 0
-
-mby_loop:
- lvx v1, 0, r4 ;# src
- lvx v2, 0, r5 ;# pred
-
- add r4, r4, r6
- addi r5, r5, 16
-
- vmrghb v3, v0, v1 ;# unpack high src to short
- vmrghb v4, v0, v2 ;# unpack high pred to short
-
- vsubshs v3, v3, v4
-
- stvx v3, 0, r3 ;# store out diff
-
- vmrglb v3, v0, v1 ;# unpack low src to short
- vmrglb v4, v0, v2 ;# unpack low pred to short
-
- vsubshs v3, v3, v4
-
- stvx v3, r10, r3 ;# store out diff
-
- addi r3, r3, 32
-
- bdnz mby_loop
-
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/ppc/fdct_altivec.asm b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/ppc/fdct_altivec.asm
deleted file mode 100644
index 935d0cb0977..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/ppc/fdct_altivec.asm
+++ /dev/null
@@ -1,205 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- .globl vp8_short_fdct4x4_ppc
- .globl vp8_short_fdct8x4_ppc
-
-.macro load_c V, LABEL, OFF, R0, R1
- lis \R0, \LABEL@ha
- la \R1, \LABEL@l(\R0)
- lvx \V, \OFF, \R1
-.endm
-
-;# Forward and inverse DCTs are nearly identical; only differences are
-;# in normalization (fwd is twice unitary, inv is half unitary)
-;# and that they are of course transposes of each other.
-;#
-;# The following three accomplish most of the implementation and
-;# are used only by ppc_idct.c and ppc_fdct.c.
-.macro prologue
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xfffc
- mtspr 256, r12 ;# set VRSAVE
-
- stwu r1,-32(r1) ;# create space on the stack
-
- li r6, 16
-
- load_c v0, dct_tab, 0, r9, r10
- lvx v1, r6, r10
- addi r10, r10, 32
- lvx v2, 0, r10
- lvx v3, r6, r10
-
- load_c v4, ppc_dctperm_tab, 0, r9, r10
- load_c v5, ppc_dctperm_tab, r6, r9, r10
-
- load_c v6, round_tab, 0, r10, r9
-.endm
-
-.macro epilogue
- addi r1, r1, 32 ;# recover stack
-
- mtspr 256, r11 ;# reset old VRSAVE
-.endm
-
-;# Do horiz xf on two rows of coeffs v8 = a0 a1 a2 a3 b0 b1 b2 b3.
-;# a/A are the even rows 0,2; b/B are the odd rows 1,3.
-;# For fwd transform, indices are horizontal positions, then frequencies.
-;# For inverse transform, frequencies then positions.
-;# The two resulting A0..A3 B0..B3 are later combined
-;# and vertically transformed.
-
-.macro two_rows_horiz Dst
- vperm v9, v8, v8, v4 ;# v9 = a2 a3 a0 a1 b2 b3 b0 b1
-
- vmsumshm v10, v0, v8, v6
- vmsumshm v10, v1, v9, v10
- vsraw v10, v10, v7 ;# v10 = A0 A1 B0 B1
-
- vmsumshm v11, v2, v8, v6
- vmsumshm v11, v3, v9, v11
- vsraw v11, v11, v7 ;# v11 = A2 A3 B2 B3
-
- vpkuwum v10, v10, v11 ;# v10 = A0 A1 B0 B1 A2 A3 B2 B3
- vperm \Dst, v10, v10, v5 ;# Dest = A0 B0 A1 B1 A2 B2 A3 B3
-.endm
-
-;# Vertical xf on two rows. DCT values in comments are for inverse transform;
-;# forward transform uses transpose.
-
-.macro two_rows_vert Ceven, Codd
- vspltw v8, \Ceven, 0 ;# v8 = c00 c10 or c02 c12 four times
- vspltw v9, \Codd, 0 ;# v9 = c20 c30 or c22 c32 ""
- vmsumshm v8, v8, v12, v6
- vmsumshm v8, v9, v13, v8
- vsraw v10, v8, v7
-
- vspltw v8, \Codd, 1 ;# v8 = c01 c11 or c03 c13
- vspltw v9, \Ceven, 1 ;# v9 = c21 c31 or c23 c33
- vmsumshm v8, v8, v12, v6
- vmsumshm v8, v9, v13, v8
- vsraw v8, v8, v7
-
- vpkuwum v8, v10, v8 ;# v8 = rows 0,1 or 2,3
-.endm
-
-.macro two_rows_h Dest
- stw r0, 0(r8)
- lwz r0, 4(r3)
- stw r0, 4(r8)
- lwzux r0, r3,r5
- stw r0, 8(r8)
- lwz r0, 4(r3)
- stw r0, 12(r8)
- lvx v8, 0,r8
- two_rows_horiz \Dest
-.endm
-
- .align 2
-;# r3 short *input
-;# r4 short *output
-;# r5 int pitch
-vp8_short_fdct4x4_ppc:
-
- prologue
-
- vspltisw v7, 14 ;# == 14, fits in 5 signed bits
- addi r8, r1, 0
-
-
- lwz r0, 0(r3)
- two_rows_h v12 ;# v12 = H00 H10 H01 H11 H02 H12 H03 H13
-
- lwzux r0, r3, r5
- two_rows_h v13 ;# v13 = H20 H30 H21 H31 H22 H32 H23 H33
-
- lvx v6, r6, r9 ;# v6 = Vround
- vspltisw v7, -16 ;# == 16 == -16, only low 5 bits matter
-
- two_rows_vert v0, v1
- stvx v8, 0, r4
- two_rows_vert v2, v3
- stvx v8, r6, r4
-
- epilogue
-
- blr
-
- .align 2
-;# r3 short *input
-;# r4 short *output
-;# r5 int pitch
-vp8_short_fdct8x4_ppc:
- prologue
-
- vspltisw v7, 14 ;# == 14, fits in 5 signed bits
- addi r8, r1, 0
- addi r10, r3, 0
-
- lwz r0, 0(r3)
- two_rows_h v12 ;# v12 = H00 H10 H01 H11 H02 H12 H03 H13
-
- lwzux r0, r3, r5
- two_rows_h v13 ;# v13 = H20 H30 H21 H31 H22 H32 H23 H33
-
- lvx v6, r6, r9 ;# v6 = Vround
- vspltisw v7, -16 ;# == 16 == -16, only low 5 bits matter
-
- two_rows_vert v0, v1
- stvx v8, 0, r4
- two_rows_vert v2, v3
- stvx v8, r6, r4
-
- ;# Next block
- addi r3, r10, 8
- addi r4, r4, 32
- lvx v6, 0, r9 ;# v6 = Hround
-
- vspltisw v7, 14 ;# == 14, fits in 5 signed bits
- addi r8, r1, 0
-
- lwz r0, 0(r3)
- two_rows_h v12 ;# v12 = H00 H10 H01 H11 H02 H12 H03 H13
-
- lwzux r0, r3, r5
- two_rows_h v13 ;# v13 = H20 H30 H21 H31 H22 H32 H23 H33
-
- lvx v6, r6, r9 ;# v6 = Vround
- vspltisw v7, -16 ;# == 16 == -16, only low 5 bits matter
-
- two_rows_vert v0, v1
- stvx v8, 0, r4
- two_rows_vert v2, v3
- stvx v8, r6, r4
-
- epilogue
-
- blr
-
- .data
- .align 4
-ppc_dctperm_tab:
- .byte 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11
- .byte 0,1,4,5, 2,3,6,7, 8,9,12,13, 10,11,14,15
-
- .align 4
-dct_tab:
- .short 23170, 23170,-12540,-30274, 23170, 23170,-12540,-30274
- .short 23170, 23170, 30274, 12540, 23170, 23170, 30274, 12540
-
- .short 23170,-23170, 30274,-12540, 23170,-23170, 30274,-12540
- .short -23170, 23170, 12540,-30274,-23170, 23170, 12540,-30274
-
- .align 4
-round_tab:
- .long (1 << (14-1)), (1 << (14-1)), (1 << (14-1)), (1 << (14-1))
- .long (1 << (16-1)), (1 << (16-1)), (1 << (16-1)), (1 << (16-1))
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/ppc/rdopt_altivec.asm b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/ppc/rdopt_altivec.asm
deleted file mode 100644
index ba482300973..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/ppc/rdopt_altivec.asm
+++ /dev/null
@@ -1,51 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- .globl vp8_block_error_ppc
-
- .align 2
-;# r3 short *Coeff
-;# r4 short *dqcoeff
-vp8_block_error_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xf800
- mtspr 256, r12 ;# set VRSAVE
-
- stwu r1,-32(r1) ;# create space on the stack
-
- stw r5, 12(r1) ;# transfer dc to vector register
-
- lvx v0, 0, r3 ;# Coeff
- lvx v1, 0, r4 ;# dqcoeff
-
- li r10, 16
-
- vspltisw v3, 0
-
- vsubshs v0, v0, v1
-
- vmsumshm v2, v0, v0, v3 ;# multiply differences
-
- lvx v0, r10, r3 ;# Coeff
- lvx v1, r10, r4 ;# dqcoeff
-
- vsubshs v0, v0, v1
-
- vmsumshm v1, v0, v0, v2 ;# multiply differences
- vsumsws v1, v1, v3 ;# sum up
-
- stvx v1, 0, r1
- lwz r3, 12(r1) ;# return value
-
- addi r1, r1, 32 ;# recover stack
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
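
The routine deleted above computes the rate-distortion block error for one 4x4 block: the sum of squared differences between the original and dequantized coefficients, accumulated across all 16 values with vsubshs/vmsumshm/vsumsws. A scalar sketch of the same computation:

    /* Scalar equivalent of the deleted vp8_block_error_ppc: sum of squared
     * coefficient reconstruction errors over one 4x4 block. */
    static int block_error_c(const short *coeff, const short *dqcoeff)
    {
        int i, error = 0;
        for (i = 0; i < 16; i++)
        {
            int diff = coeff[i] - dqcoeff[i];
            error += diff * diff;
        }
        return error;
    }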
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/quantize.c b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/quantize.c
index 9953bd686f4..c5a7bc67039 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/quantize.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/quantize.c
@@ -65,8 +65,8 @@ void vp8_regular_quantize_b_c(BLOCK *b, BLOCKD *d)
short *dequant_ptr = d->dequant;
short zbin_oq_value = b->zbin_extra;
- vpx_memset(qcoeff_ptr, 0, 32);
- vpx_memset(dqcoeff_ptr, 0, 32);
+ memset(qcoeff_ptr, 0, 32);
+ memset(dqcoeff_ptr, 0, 32);
eob = -1;
@@ -101,7 +101,7 @@ void vp8_regular_quantize_b_c(BLOCK *b, BLOCKD *d)
*d->eob = (char)(eob + 1);
}
-void vp8_quantize_mby_c(MACROBLOCK *x)
+void vp8_quantize_mby(MACROBLOCK *x)
{
int i;
int has_2nd_order = (x->e_mbd.mode_info_context->mbmi.mode != B_PRED
@@ -114,7 +114,7 @@ void vp8_quantize_mby_c(MACROBLOCK *x)
x->quantize_b(&x->block[24], &x->e_mbd.block[24]);
}
-void vp8_quantize_mb_c(MACROBLOCK *x)
+void vp8_quantize_mb(MACROBLOCK *x)
{
int i;
int has_2nd_order=(x->e_mbd.mode_info_context->mbmi.mode != B_PRED
@@ -125,7 +125,7 @@ void vp8_quantize_mb_c(MACROBLOCK *x)
}
-void vp8_quantize_mbuv_c(MACROBLOCK *x)
+void vp8_quantize_mbuv(MACROBLOCK *x)
{
int i;
@@ -133,23 +133,6 @@ void vp8_quantize_mbuv_c(MACROBLOCK *x)
x->quantize_b(&x->block[i], &x->e_mbd.block[i]);
}
-/* quantize_b_pair function pointer in MACROBLOCK structure is set to one of
- * these two C functions if corresponding optimized routine is not available.
- * NEON optimized version implements currently the fast quantization for pair
- * of blocks. */
-void vp8_regular_quantize_b_pair(BLOCK *b1, BLOCK *b2, BLOCKD *d1, BLOCKD *d2)
-{
- vp8_regular_quantize_b(b1, d1);
- vp8_regular_quantize_b(b2, d2);
-}
-
-void vp8_fast_quantize_b_pair_c(BLOCK *b1, BLOCK *b2, BLOCKD *d1, BLOCKD *d2)
-{
- vp8_fast_quantize_b_c(b1, d1);
- vp8_fast_quantize_b_c(b2, d2);
-}
-
-
static const int qrounding_factors[129] =
{
48, 48, 48, 48, 48, 48, 48, 48,
@@ -552,6 +535,7 @@ void vp8_set_quantizer(struct VP8_COMP *cpi, int Q)
MACROBLOCKD *mbd = &cpi->mb.e_mbd;
int update = 0;
int new_delta_q;
+ int new_uv_delta_q;
cm->base_qindex = Q;
/* if any of the delta_q values are changing update flag has to be set */
@@ -559,8 +543,6 @@ void vp8_set_quantizer(struct VP8_COMP *cpi, int Q)
cm->y1dc_delta_q = 0;
cm->y2ac_delta_q = 0;
- cm->uvdc_delta_q = 0;
- cm->uvac_delta_q = 0;
if (Q < 4)
{
@@ -572,6 +554,21 @@ void vp8_set_quantizer(struct VP8_COMP *cpi, int Q)
update |= cm->y2dc_delta_q != new_delta_q;
cm->y2dc_delta_q = new_delta_q;
+ new_uv_delta_q = 0;
+ // For screen content, lower the q value for the UV channels. For now,
+ // select a conservative delta: the same delta for dc and ac, decreasing
+ // with lower Q, and set to 0 below some threshold. This may later be
+ // conditioned on the variance/energy in the UV channels.
+ if (cpi->oxcf.screen_content_mode && Q > 40) {
+ new_uv_delta_q = -(int)(0.15 * Q);
+ // Check range: magnitude of delta is 4 bits.
+ if (new_uv_delta_q < -15) {
+ new_uv_delta_q = -15;
+ }
+ }
+ update |= cm->uvdc_delta_q != new_uv_delta_q;
+ cm->uvdc_delta_q = new_uv_delta_q;
+ cm->uvac_delta_q = new_uv_delta_q;
/* Set Segment specific quantizers */
mbd->segment_feature_data[MB_LVL_ALT_Q][0] = cpi->segment_feature_data[MB_LVL_ALT_Q][0];
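
The screen-content hunk above derives a chroma delta-q from the base Q: -0.15*Q, active only above Q = 40 and clamped to the 4-bit delta range. A stand-alone sketch of the rule (not a drop-in function; the encoder applies the result to both the uvdc and uvac deltas):

    /* Chroma delta-q rule for screen content, mirroring the hunk above. */
    static int uv_delta_q_for_screen_content(int Q)
    {
        int delta = 0;
        if (Q > 40)                  /* no delta below the threshold */
        {
            delta = -(int)(0.15 * Q);
            if (delta < -15)         /* delta magnitude is 4 bits */
                delta = -15;
        }
        return delta;                /* e.g. Q = 60 -> -9, Q = 120 -> -15 */
    }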
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/quantize.h b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/quantize.h
index c739b2627b6..7d36c2b45fe 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/quantize.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/quantize.h
@@ -18,6 +18,9 @@ extern "C" {
struct VP8_COMP;
struct macroblock;
+extern void vp8_quantize_mb(struct macroblock *x);
+extern void vp8_quantize_mby(struct macroblock *x);
+extern void vp8_quantize_mbuv(struct macroblock *x);
extern void vp8_set_quantizer(struct VP8_COMP *cpi, int Q);
extern void vp8cx_frame_init_quantizer(struct VP8_COMP *cpi);
extern void vp8_update_zbin_extra(struct VP8_COMP *cpi, struct macroblock *x);
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/ratectrl.c b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/ratectrl.c
index c51650c3c26..25d7a4998cb 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/ratectrl.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/ratectrl.c
@@ -296,7 +296,7 @@ void vp8_setup_key_frame(VP8_COMP *cpi)
vp8_default_coef_probs(& cpi->common);
- vpx_memcpy(cpi->common.fc.mvc, vp8_default_mv_context, sizeof(vp8_default_mv_context));
+ memcpy(cpi->common.fc.mvc, vp8_default_mv_context, sizeof(vp8_default_mv_context));
{
int flag[2] = {1, 1};
vp8_build_component_cost_table(cpi->mb.mvcost, (const MV_CONTEXT *) cpi->common.fc.mvc, flag);
@@ -305,9 +305,9 @@ void vp8_setup_key_frame(VP8_COMP *cpi)
/* Make sure we initialize separate contexts for altref, gold, and normal.
* TODO shouldn't need 3 different copies of structure to do this!
*/
- vpx_memcpy(&cpi->lfc_a, &cpi->common.fc, sizeof(cpi->common.fc));
- vpx_memcpy(&cpi->lfc_g, &cpi->common.fc, sizeof(cpi->common.fc));
- vpx_memcpy(&cpi->lfc_n, &cpi->common.fc, sizeof(cpi->common.fc));
+ memcpy(&cpi->lfc_a, &cpi->common.fc, sizeof(cpi->common.fc));
+ memcpy(&cpi->lfc_g, &cpi->common.fc, sizeof(cpi->common.fc));
+ memcpy(&cpi->lfc_n, &cpi->common.fc, sizeof(cpi->common.fc));
cpi->common.filter_level = cpi->common.base_qindex * 3 / 8 ;
@@ -708,7 +708,13 @@ static void calc_pframe_target_size(VP8_COMP *cpi)
Adjustment = (cpi->this_frame_target - min_frame_target);
if (cpi->frames_since_golden == (cpi->current_gf_interval >> 1))
- cpi->this_frame_target += ((cpi->current_gf_interval - 1) * Adjustment);
+ {
+ Adjustment = (cpi->current_gf_interval - 1) * Adjustment;
+ // Limit adjustment to 10% of current target.
+ if (Adjustment > (10 * cpi->this_frame_target) / 100)
+ Adjustment = (10 * cpi->this_frame_target) / 100;
+ cpi->this_frame_target += Adjustment;
+ }
else
cpi->this_frame_target -= Adjustment;
}
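
The ratectrl.c change caps the mid-GF-interval boost at 10% of the current frame target instead of applying the full scaled adjustment. The arithmetic, isolated as a sketch:

    /* Capped golden-frame boost, as in the hunk above (integer math). */
    static int boosted_frame_target(int this_frame_target, int adjustment,
                                    int current_gf_interval)
    {
        int boost = (current_gf_interval - 1) * adjustment;
        int cap = (10 * this_frame_target) / 100;  /* 10% of current target */
        if (boost > cap)
            boost = cap;
        return this_frame_target + boost;
    }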
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/rdopt.c b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/rdopt.c
index 2f6f5d07c81..9ccd85eb93f 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/rdopt.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/rdopt.c
@@ -555,8 +555,8 @@ static int vp8_rdcost_mby(MACROBLOCK *mb)
ENTROPY_CONTEXT *ta;
ENTROPY_CONTEXT *tl;
- vpx_memcpy(&t_above, mb->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
- vpx_memcpy(&t_left, mb->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+ memcpy(&t_above, mb->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
+ memcpy(&t_left, mb->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
ta = (ENTROPY_CONTEXT *)&t_above;
tl = (ENTROPY_CONTEXT *)&t_left;
@@ -650,8 +650,8 @@ static int rd_pick_intra4x4block(
* a temp buffer that meets the stride requirements, but we are only
* interested in the left 4x4 block
* */
- DECLARE_ALIGNED_ARRAY(16, unsigned char, best_predictor, 16*4);
- DECLARE_ALIGNED_ARRAY(16, short, best_dqcoeff, 16);
+ DECLARE_ALIGNED(16, unsigned char, best_predictor[16*4]);
+ DECLARE_ALIGNED(16, short, best_dqcoeff[16]);
int dst_stride = x->e_mbd.dst.y_stride;
unsigned char *dst = x->e_mbd.dst.y_buffer + b->offset;
@@ -691,7 +691,7 @@ static int rd_pick_intra4x4block(
*a = tempa;
*l = templ;
copy_predictor(best_predictor, b->predictor);
- vpx_memcpy(best_dqcoeff, b->dqcoeff, 32);
+ memcpy(best_dqcoeff, b->dqcoeff, 32);
}
}
b->bmi.as_mode = *best_mode;
@@ -715,8 +715,8 @@ static int rd_pick_intra4x4mby_modes(MACROBLOCK *mb, int *Rate,
ENTROPY_CONTEXT *tl;
const int *bmode_costs;
- vpx_memcpy(&t_above, mb->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
- vpx_memcpy(&t_left, mb->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+ memcpy(&t_above, mb->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
+ memcpy(&t_left, mb->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
ta = (ENTROPY_CONTEXT *)&t_above;
tl = (ENTROPY_CONTEXT *)&t_left;
@@ -820,8 +820,8 @@ static int rd_cost_mbuv(MACROBLOCK *mb)
ENTROPY_CONTEXT *ta;
ENTROPY_CONTEXT *tl;
- vpx_memcpy(&t_above, mb->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
- vpx_memcpy(&t_left, mb->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+ memcpy(&t_above, mb->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
+ memcpy(&t_left, mb->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
ta = (ENTROPY_CONTEXT *)&t_above;
tl = (ENTROPY_CONTEXT *)&t_left;
@@ -837,6 +837,9 @@ static int rd_cost_mbuv(MACROBLOCK *mb)
static int rd_inter16x16_uv(VP8_COMP *cpi, MACROBLOCK *x, int *rate,
int *distortion, int fullpixel)
{
+ (void)cpi;
+ (void)fullpixel;
+
vp8_build_inter16x16_predictors_mbuv(&x->e_mbd);
vp8_subtract_mbuv(x->src_diff,
x->src.u_buffer, x->src.v_buffer, x->src.uv_stride,
@@ -854,6 +857,9 @@ static int rd_inter16x16_uv(VP8_COMP *cpi, MACROBLOCK *x, int *rate,
static int rd_inter4x4_uv(VP8_COMP *cpi, MACROBLOCK *x, int *rate,
int *distortion, int fullpixel)
{
+ (void)cpi;
+ (void)fullpixel;
+
vp8_build_inter4x4_predictors_mbuv(&x->e_mbd);
vp8_subtract_mbuv(x->src_diff,
x->src.u_buffer, x->src.v_buffer, x->src.uv_stride,
@@ -1122,8 +1128,8 @@ static void rd_check_segment(VP8_COMP *cpi, MACROBLOCK *x,
ENTROPY_CONTEXT *ta_b;
ENTROPY_CONTEXT *tl_b;
- vpx_memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
- vpx_memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
+ memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
+ memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));
ta = (ENTROPY_CONTEXT *)&t_above;
tl = (ENTROPY_CONTEXT *)&t_left;
@@ -1166,8 +1172,8 @@ static void rd_check_segment(VP8_COMP *cpi, MACROBLOCK *x,
ENTROPY_CONTEXT *ta_s;
ENTROPY_CONTEXT *tl_s;
- vpx_memcpy(&t_above_s, &t_above, sizeof(ENTROPY_CONTEXT_PLANES));
- vpx_memcpy(&t_left_s, &t_left, sizeof(ENTROPY_CONTEXT_PLANES));
+ memcpy(&t_above_s, &t_above, sizeof(ENTROPY_CONTEXT_PLANES));
+ memcpy(&t_left_s, &t_left, sizeof(ENTROPY_CONTEXT_PLANES));
ta_s = (ENTROPY_CONTEXT *)&t_above_s;
tl_s = (ENTROPY_CONTEXT *)&t_left_s;
@@ -1323,14 +1329,14 @@ static void rd_check_segment(VP8_COMP *cpi, MACROBLOCK *x,
mode_selected = this_mode;
best_label_rd = this_rd;
- vpx_memcpy(ta_b, ta_s, sizeof(ENTROPY_CONTEXT_PLANES));
- vpx_memcpy(tl_b, tl_s, sizeof(ENTROPY_CONTEXT_PLANES));
+ memcpy(ta_b, ta_s, sizeof(ENTROPY_CONTEXT_PLANES));
+ memcpy(tl_b, tl_s, sizeof(ENTROPY_CONTEXT_PLANES));
}
} /*for each 4x4 mode*/
- vpx_memcpy(ta, ta_b, sizeof(ENTROPY_CONTEXT_PLANES));
- vpx_memcpy(tl, tl_b, sizeof(ENTROPY_CONTEXT_PLANES));
+ memcpy(ta, ta_b, sizeof(ENTROPY_CONTEXT_PLANES));
+ memcpy(tl, tl_b, sizeof(ENTROPY_CONTEXT_PLANES));
labels2mode(x, labels, i, mode_selected, &mode_mv[mode_selected],
bsi->ref_mv, x->mvcost);
@@ -1386,7 +1392,7 @@ static int vp8_rd_pick_best_mbsegmentation(VP8_COMP *cpi, MACROBLOCK *x,
int i;
BEST_SEG_INFO bsi;
- vpx_memset(&bsi, 0, sizeof(bsi));
+ memset(&bsi, 0, sizeof(bsi));
bsi.segment_rd = best_rd;
bsi.ref_mv = best_ref_mv;
@@ -1655,7 +1661,6 @@ void vp8_mv_pred
mv.as_mv.row = mvx[vcnt/2];
mv.as_mv.col = mvy[vcnt/2];
- find = 1;
/* sr is set to 0 to allow calling function to decide the search
* range.
*/
@@ -1685,16 +1690,16 @@ void vp8_cal_sad(VP8_COMP *cpi, MACROBLOCKD *xd, MACROBLOCK *x, int recon_yoffse
}else if(xd->mb_to_top_edge==0)
{ /* only has left MB for sad calculation. */
near_sad[0] = near_sad[2] = INT_MAX;
- near_sad[1] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, xd->dst.y_buffer - 16,xd->dst.y_stride, UINT_MAX);
+ near_sad[1] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, xd->dst.y_buffer - 16,xd->dst.y_stride);
}else if(xd->mb_to_left_edge ==0)
{ /* only has above MB for sad calculation. */
near_sad[1] = near_sad[2] = INT_MAX;
- near_sad[0] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, xd->dst.y_buffer - xd->dst.y_stride *16,xd->dst.y_stride, UINT_MAX);
+ near_sad[0] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, xd->dst.y_buffer - xd->dst.y_stride *16,xd->dst.y_stride);
}else
{
- near_sad[0] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, xd->dst.y_buffer - xd->dst.y_stride *16,xd->dst.y_stride, UINT_MAX);
- near_sad[1] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, xd->dst.y_buffer - 16,xd->dst.y_stride, UINT_MAX);
- near_sad[2] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, xd->dst.y_buffer - xd->dst.y_stride *16 -16,xd->dst.y_stride, UINT_MAX);
+ near_sad[0] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, xd->dst.y_buffer - xd->dst.y_stride *16,xd->dst.y_stride);
+ near_sad[1] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, xd->dst.y_buffer - 16,xd->dst.y_stride);
+ near_sad[2] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, xd->dst.y_buffer - xd->dst.y_stride *16 -16,xd->dst.y_stride);
}
if(cpi->common.last_frame_type != KEY_FRAME)
@@ -1709,14 +1714,14 @@ void vp8_cal_sad(VP8_COMP *cpi, MACROBLOCKD *xd, MACROBLOCK *x, int recon_yoffse
if(xd->mb_to_bottom_edge==0) near_sad[7] = INT_MAX;
if(near_sad[4] != INT_MAX)
- near_sad[4] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, pre_y_buffer - pre_y_stride *16, pre_y_stride, UINT_MAX);
+ near_sad[4] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, pre_y_buffer - pre_y_stride *16, pre_y_stride);
if(near_sad[5] != INT_MAX)
- near_sad[5] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, pre_y_buffer - 16, pre_y_stride, UINT_MAX);
- near_sad[3] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, pre_y_buffer, pre_y_stride, UINT_MAX);
+ near_sad[5] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, pre_y_buffer - 16, pre_y_stride);
+ near_sad[3] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, pre_y_buffer, pre_y_stride);
if(near_sad[6] != INT_MAX)
- near_sad[6] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, pre_y_buffer + 16, pre_y_stride, UINT_MAX);
+ near_sad[6] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, pre_y_buffer + 16, pre_y_stride);
if(near_sad[7] != INT_MAX)
- near_sad[7] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, pre_y_buffer + pre_y_stride *16, pre_y_stride, UINT_MAX);
+ near_sad[7] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, pre_y_buffer + pre_y_stride *16, pre_y_stride);
}
if(cpi->common.last_frame_type != KEY_FRAME)
@@ -1920,8 +1925,8 @@ static void update_best_mode(BEST_MODE* best_mode, int this_rd,
(rd->distortion2-rd->distortion_uv));
best_mode->rd = this_rd;
- vpx_memcpy(&best_mode->mbmode, &x->e_mbd.mode_info_context->mbmi, sizeof(MB_MODE_INFO));
- vpx_memcpy(&best_mode->partition, x->partition_info, sizeof(PARTITION_INFO));
+ memcpy(&best_mode->mbmode, &x->e_mbd.mode_info_context->mbmi, sizeof(MB_MODE_INFO));
+ memcpy(&best_mode->partition, x->partition_info, sizeof(PARTITION_INFO));
if ((this_mode == B_PRED) || (this_mode == SPLITMV))
{
@@ -1983,9 +1988,9 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
best_mode.rd = INT_MAX;
best_mode.yrd = INT_MAX;
best_mode.intra_rd = INT_MAX;
- vpx_memset(mode_mv_sb, 0, sizeof(mode_mv_sb));
- vpx_memset(&best_mode.mbmode, 0, sizeof(best_mode.mbmode));
- vpx_memset(&best_mode.bmodes, 0, sizeof(best_mode.bmodes));
+ memset(mode_mv_sb, 0, sizeof(mode_mv_sb));
+ memset(&best_mode.mbmode, 0, sizeof(best_mode.mbmode));
+ memset(&best_mode.bmodes, 0, sizeof(best_mode.bmodes));
/* Setup search priorities */
get_reference_search_order(cpi, ref_frame_map);
@@ -2287,7 +2292,6 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
mode_mv[NEWMV].as_int = d->bmi.mv.as_int;
/* Further step/diamond searches as necessary */
- n = 0;
further_steps = (cpi->sf.max_step_search_steps - 1) - step_param;
n = num00;
@@ -2554,8 +2558,6 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
intra_rd_penalty, cpi, x);
if (this_rd < best_mode.rd || x->skip)
{
- /* Note index of best mode so far */
- best_mode_index = mode_index;
*returnrate = rd.rate2;
*returndistortion = rd.distortion2;
update_best_mode(&best_mode, this_rd, &rd, other_cost, x);
@@ -2580,7 +2582,7 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
/* macroblock modes */
- vpx_memcpy(&x->e_mbd.mode_info_context->mbmi, &best_mode.mbmode, sizeof(MB_MODE_INFO));
+ memcpy(&x->e_mbd.mode_info_context->mbmi, &best_mode.mbmode, sizeof(MB_MODE_INFO));
if (best_mode.mbmode.mode == B_PRED)
{
@@ -2593,7 +2595,7 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
for (i = 0; i < 16; i++)
xd->mode_info_context->bmi[i].mv.as_int = best_mode.bmodes[i].mv.as_int;
- vpx_memcpy(x->partition_info, &best_mode.partition, sizeof(PARTITION_INFO));
+ memcpy(x->partition_info, &best_mode.partition, sizeof(PARTITION_INFO));
x->e_mbd.mode_info_context->mbmi.mv.as_int =
x->partition_info->bmi[15].mv.as_int;
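
The sdf calls above lose their trailing UINT_MAX argument: the SAD function pointer no longer takes a max_sad early-out parameter. For reference, a plain C 16x16 SAD with the new four-argument shape (the real kernels are the optimized vpx_dsp versions):

    #include <stdlib.h>

    /* Straightforward 16x16 sum of absolute differences, matching the
     * four-argument signature the fn_ptr[].sdf hook now uses. */
    static unsigned int sad16x16(const unsigned char *src, int src_stride,
                                 const unsigned char *ref, int ref_stride)
    {
        unsigned int sad = 0;
        int r, c;
        for (r = 0; r < 16; r++)
        {
            for (c = 0; c < 16; c++)
                sad += abs(src[c] - ref[c]);
            src += src_stride;
            ref += ref_stride;
        }
        return sad;
    }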
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/segmentation.c b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/segmentation.c
index 37972e219a0..fdd22fceb6e 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/segmentation.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/segmentation.c
@@ -23,7 +23,7 @@ void vp8_update_gf_useage_maps(VP8_COMP *cpi, VP8_COMMON *cm, MACROBLOCK *x)
if ((cm->frame_type == KEY_FRAME) || (cm->refresh_golden_frame))
{
/* Reset Gf usage monitors */
- vpx_memset(cpi->gf_active_flags, 1, (cm->mb_rows * cm->mb_cols));
+ memset(cpi->gf_active_flags, 1, (cm->mb_rows * cm->mb_cols));
cpi->gf_active_count = cm->mb_rows * cm->mb_cols;
}
else
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/temporal_filter.c b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/temporal_filter.c
index 4dc0d959221..ba8b0097710 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/temporal_filter.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/temporal_filter.c
@@ -163,6 +163,8 @@ static int vp8_temporal_filter_find_matching_mb_c
int pre = d->offset;
int pre_stride = x->e_mbd.pre.y_stride;
+ (void)error_thresh;
+
best_ref_mv1.as_int = 0;
best_ref_mv1_full.as_mv.col = best_ref_mv1.as_mv.col >>3;
best_ref_mv1_full.as_mv.row = best_ref_mv1.as_mv.row >>3;
@@ -236,12 +238,12 @@ static void vp8_temporal_filter_iterate_c
int mb_rows = cpi->common.mb_rows;
int mb_y_offset = 0;
int mb_uv_offset = 0;
- DECLARE_ALIGNED_ARRAY(16, unsigned int, accumulator, 16*16 + 8*8 + 8*8);
- DECLARE_ALIGNED_ARRAY(16, unsigned short, count, 16*16 + 8*8 + 8*8);
+ DECLARE_ALIGNED(16, unsigned int, accumulator[16*16 + 8*8 + 8*8]);
+ DECLARE_ALIGNED(16, unsigned short, count[16*16 + 8*8 + 8*8]);
MACROBLOCKD *mbd = &cpi->mb.e_mbd;
YV12_BUFFER_CONFIG *f = cpi->frames[alt_ref_index];
unsigned char *dst1, *dst2;
- DECLARE_ALIGNED_ARRAY(16, unsigned char, predictor, 16*16 + 8*8 + 8*8);
+ DECLARE_ALIGNED(16, unsigned char, predictor[16*16 + 8*8 + 8*8]);
/* Save input state */
unsigned char *y_buffer = mbd->pre.y_buffer;
@@ -272,8 +274,8 @@ static void vp8_temporal_filter_iterate_c
int i, j, k;
int stride;
- vpx_memset(accumulator, 0, 384*sizeof(unsigned int));
- vpx_memset(count, 0, 384*sizeof(unsigned short));
+ memset(accumulator, 0, 384*sizeof(unsigned int));
+ memset(count, 0, 384*sizeof(unsigned short));
#if ALT_REF_MC_ENABLED
cpi->mb.mv_col_min = -((mb_col * 16) + (16 - 5));
@@ -500,7 +502,7 @@ void vp8_temporal_filter_prepare_c
start_frame = distance + frames_to_blur_forward;
/* Setup frame pointers, NULL indicates frame not included in filter */
- vpx_memset(cpi->frames, 0, max_frames*sizeof(YV12_BUFFER_CONFIG *));
+ memset(cpi->frames, 0, max_frames*sizeof(YV12_BUFFER_CONFIG *));
for (frame = 0; frame < frames_to_blur; frame++)
{
int which_buffer = start_frame - frame;
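
DECLARE_ALIGNED_ARRAY is replaced throughout by DECLARE_ALIGNED, which takes the array declarator directly. Assuming the GCC branch of vpx_ports/mem.h, the new form expands roughly as below; the old macro instead over-allocated a backing buffer and computed an aligned pointer into it by hand.

    /* Rough GCC expansion of
     *   DECLARE_ALIGNED(16, unsigned char, predictor[16*16 + 8*8 + 8*8]); */
    unsigned char predictor[16 * 16 + 8 * 8 + 8 * 8] __attribute__((aligned(16)));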
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/tokenize.c b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/tokenize.c
index 2dc8205278b..afd46fb2197 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/tokenize.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/tokenize.c
@@ -421,7 +421,7 @@ void vp8_tokenize_mb(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t)
void init_context_counters(void)
{
- vpx_memset(context_counters, 0, sizeof(context_counters));
+ memset(context_counters, 0, sizeof(context_counters));
}
void print_context_counters()
@@ -596,13 +596,13 @@ void vp8_fix_contexts(MACROBLOCKD *x)
/* Clear entropy contexts for Y2 blocks */
if (x->mode_info_context->mbmi.mode != B_PRED && x->mode_info_context->mbmi.mode != SPLITMV)
{
- vpx_memset(x->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES));
- vpx_memset(x->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES));
+ memset(x->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES));
+ memset(x->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES));
}
else
{
- vpx_memset(x->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES)-1);
- vpx_memset(x->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES)-1);
+ memset(x->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES)-1);
+ memset(x->left_context, 0, sizeof(ENTROPY_CONTEXT_PLANES)-1);
}
}
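
The sizeof(ENTROPY_CONTEXT_PLANES)-1 resets rely on the Y2 context being the final byte of the struct, so B_PRED/SPLITMV macroblocks (which code no Y2 block) keep it intact while the Y, U and V contexts are cleared. The layout, paraphrased from vp8/common/entropy.h:

    typedef char ENTROPY_CONTEXT;
    typedef struct
    {
        ENTROPY_CONTEXT y1[4];
        ENTROPY_CONTEXT u[2];
        ENTROPY_CONTEXT v[2];
        ENTROPY_CONTEXT y2[1];   /* last byte: spared by the sizeof-1 memset */
    } ENTROPY_CONTEXT_PLANES;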
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/vp8_asm_enc_offsets.c b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/vp8_asm_enc_offsets.c
deleted file mode 100644
index a4169b32f6a..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/vp8_asm_enc_offsets.c
+++ /dev/null
@@ -1,93 +0,0 @@
-/*
- * Copyright (c) 2011 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_ports/asm_offsets.h"
-#include "vpx_config.h"
-#include "block.h"
-#include "vp8/common/blockd.h"
-#include "onyx_int.h"
-#include "treewriter.h"
-#include "tokenize.h"
-
-BEGIN
-
-/* regular quantize */
-DEFINE(vp8_block_coeff, offsetof(BLOCK, coeff));
-DEFINE(vp8_block_zbin, offsetof(BLOCK, zbin));
-DEFINE(vp8_block_round, offsetof(BLOCK, round));
-DEFINE(vp8_block_quant, offsetof(BLOCK, quant));
-DEFINE(vp8_block_quant_fast, offsetof(BLOCK, quant_fast));
-DEFINE(vp8_block_zbin_extra, offsetof(BLOCK, zbin_extra));
-DEFINE(vp8_block_zrun_zbin_boost, offsetof(BLOCK, zrun_zbin_boost));
-DEFINE(vp8_block_quant_shift, offsetof(BLOCK, quant_shift));
-
-DEFINE(vp8_blockd_qcoeff, offsetof(BLOCKD, qcoeff));
-DEFINE(vp8_blockd_dequant, offsetof(BLOCKD, dequant));
-DEFINE(vp8_blockd_dqcoeff, offsetof(BLOCKD, dqcoeff));
-DEFINE(vp8_blockd_eob, offsetof(BLOCKD, eob));
-
-/* subtract */
-DEFINE(vp8_block_base_src, offsetof(BLOCK, base_src));
-DEFINE(vp8_block_src, offsetof(BLOCK, src));
-DEFINE(vp8_block_src_diff, offsetof(BLOCK, src_diff));
-DEFINE(vp8_block_src_stride, offsetof(BLOCK, src_stride));
-
-DEFINE(vp8_blockd_predictor, offsetof(BLOCKD, predictor));
-
-/* pack tokens */
-DEFINE(vp8_writer_lowvalue, offsetof(vp8_writer, lowvalue));
-DEFINE(vp8_writer_range, offsetof(vp8_writer, range));
-DEFINE(vp8_writer_count, offsetof(vp8_writer, count));
-DEFINE(vp8_writer_pos, offsetof(vp8_writer, pos));
-DEFINE(vp8_writer_buffer, offsetof(vp8_writer, buffer));
-DEFINE(vp8_writer_buffer_end, offsetof(vp8_writer, buffer_end));
-DEFINE(vp8_writer_error, offsetof(vp8_writer, error));
-
-DEFINE(tokenextra_token, offsetof(TOKENEXTRA, Token));
-DEFINE(tokenextra_extra, offsetof(TOKENEXTRA, Extra));
-DEFINE(tokenextra_context_tree, offsetof(TOKENEXTRA, context_tree));
-DEFINE(tokenextra_skip_eob_node, offsetof(TOKENEXTRA, skip_eob_node));
-DEFINE(TOKENEXTRA_SZ, sizeof(TOKENEXTRA));
-
-DEFINE(vp8_extra_bit_struct_sz, sizeof(vp8_extra_bit_struct));
-
-DEFINE(vp8_token_value, offsetof(vp8_token, value));
-DEFINE(vp8_token_len, offsetof(vp8_token, Len));
-
-DEFINE(vp8_extra_bit_struct_tree, offsetof(vp8_extra_bit_struct, tree));
-DEFINE(vp8_extra_bit_struct_prob, offsetof(vp8_extra_bit_struct, prob));
-DEFINE(vp8_extra_bit_struct_len, offsetof(vp8_extra_bit_struct, Len));
-DEFINE(vp8_extra_bit_struct_base_val, offsetof(vp8_extra_bit_struct, base_val));
-
-DEFINE(vp8_comp_tplist, offsetof(VP8_COMP, tplist));
-DEFINE(vp8_comp_common, offsetof(VP8_COMP, common));
-DEFINE(vp8_comp_bc , offsetof(VP8_COMP, bc));
-DEFINE(vp8_writer_sz , sizeof(vp8_writer));
-
-DEFINE(tokenlist_start, offsetof(TOKENLIST, start));
-DEFINE(tokenlist_stop, offsetof(TOKENLIST, stop));
-DEFINE(TOKENLIST_SZ, sizeof(TOKENLIST));
-
-DEFINE(vp8_common_mb_rows, offsetof(VP8_COMMON, mb_rows));
-
-END
-
-/* add asserts for any offset that is not supported by assembly code
- * add asserts for any size that is not supported by assembly code
-
- * These are used in vp8cx_pack_tokens. They are hard coded so if their sizes
- * change they will have to be adjusted.
- */
-
-#if HAVE_EDSP
-ct_assert(TOKENEXTRA_SZ, sizeof(TOKENEXTRA) == 8)
-ct_assert(vp8_extra_bit_struct_sz, sizeof(vp8_extra_bit_struct) == 16)
-#endif
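
The deleted offsets file fed the obj_int_extract step: each DEFINE baked an offsetof() result into a compiled object so hand-written assembly could address struct members by constant. With the EDSP pack-tokens assembly gone, the machinery is no longer needed. A runnable demo of the underlying idea (names are illustrative, not the vpx_ports macros):

    #include <stddef.h>
    #include <stdio.h>

    /* Assembly cannot evaluate offsetof(), so the build compiled DEFINE()
     * lines and extracted the constants. This just prints what such a step
     * would have emitted for a toy struct. */
    struct demo_block { short *coeff; short *zbin; short *round; };

    int main(void)
    {
        printf("demo_block_coeff EQU %zu\n", offsetof(struct demo_block, coeff));
        printf("demo_block_zbin  EQU %zu\n", offsetof(struct demo_block, zbin));
        return 0;
    }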
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/x86/denoising_sse2.c b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/x86/denoising_sse2.c
index 3a4cf7ee792..101d646ef43 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/x86/denoising_sse2.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/x86/denoising_sse2.c
@@ -121,12 +121,12 @@ int vp8_denoiser_filter_sse2(unsigned char *mc_running_avg_y,
if (increase_denoising) sum_diff_thresh = SUM_DIFF_THRESHOLD_HIGH;
if (abs_sum_diff > sum_diff_thresh) {
// Before returning to copy the block (i.e., apply no denoising),
- // checK if we can still apply some (weaker) temporal filtering to
+ // check if we can still apply some (weaker) temporal filtering to
// this block, that would otherwise not be denoised at all. Simplest
// is to apply an additional adjustment to running_avg_y to bring it
// closer to sig. The adjustment is capped by a maximum delta, and
// chosen such that in most cases the resulting sum_diff will be
- // within the accceptable range given by sum_diff_thresh.
+ // within the acceptable range given by sum_diff_thresh.
// The delta is set by the excess of absolute pixel diff over the
// threshold.
@@ -302,12 +302,12 @@ int vp8_denoiser_filter_uv_sse2(unsigned char *mc_running_avg,
if (increase_denoising) sum_diff_thresh = SUM_DIFF_THRESHOLD_HIGH_UV;
if (abs_sum_diff > sum_diff_thresh) {
// Before returning to copy the block (i.e., apply no denoising),
- // checK if we can still apply some (weaker) temporal filtering to
+ // check if we can still apply some (weaker) temporal filtering to
// this block, that would otherwise not be denoised at all. Simplest
// is to apply an additional adjustment to running_avg_y to bring it
// closer to sig. The adjustment is capped by a maximum delta, and
// chosen such that in most cases the resulting sum_diff will be
- // within the accceptable range given by sum_diff_thresh.
+ // within the acceptable range given by sum_diff_thresh.
// The delta is set by the excess of absolute pixel diff over the
// threshold.
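
The comments corrected above describe the denoiser's fallback: when the block-level sum of differences is too large to denoise outright, running_avg is nudged toward the signal by a per-pixel delta derived from the excess over the threshold, capped at a maximum. A scalar sketch of that delta (the SSE2 code uses a shift for the division; names and exact rounding here are illustrative):

    /* Per-pixel adjustment for the weaker-filtering fallback. */
    static int fallback_delta(int abs_sum_diff, int sum_diff_thresh,
                              int num_pixels, int max_delta)
    {
        int delta = (abs_sum_diff - sum_diff_thresh + num_pixels - 1)
                    / num_pixels;              /* round the excess up */
        return delta > max_delta ? max_delta : delta;
    }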
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/x86/quantize_sse2.c b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/x86/quantize_sse2.c
index 291d21992fe..b4e92e04b22 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/x86/quantize_sse2.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/x86/quantize_sse2.c
@@ -35,10 +35,10 @@
void vp8_regular_quantize_b_sse2(BLOCK *b, BLOCKD *d)
{
char eob = 0;
- short *zbin_boost_ptr = b->zrun_zbin_boost;
+ short *zbin_boost_ptr;
short *qcoeff_ptr = d->qcoeff;
- DECLARE_ALIGNED_ARRAY(16, short, x, 16);
- DECLARE_ALIGNED_ARRAY(16, short, y, 16);
+ DECLARE_ALIGNED(16, short, x[16]);
+ DECLARE_ALIGNED(16, short, y[16]);
__m128i sz0, x0, sz1, x1, y0, y1, x_minus_zbin0, x_minus_zbin1;
__m128i quant_shift0 = _mm_load_si128((__m128i *)(b->quant_shift));
@@ -55,7 +55,7 @@ void vp8_regular_quantize_b_sse2(BLOCK *b, BLOCKD *d)
__m128i dequant0 = _mm_load_si128((__m128i *)(d->dequant));
__m128i dequant1 = _mm_load_si128((__m128i *)(d->dequant + 8));
- vpx_memset(qcoeff_ptr, 0, 32);
+ memset(qcoeff_ptr, 0, 32);
/* Duplicate to all lanes. */
zbin_extra = _mm_shufflelo_epi16(zbin_extra, 0);
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/vp8_common.mk b/chromium/third_party/libvpx/source/libvpx/vp8/vp8_common.mk
index 9b11c0da329..b4c814075c7 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/vp8_common.mk
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/vp8_common.mk
@@ -15,6 +15,7 @@ VP8_COMMON_SRCS-yes += common/onyxd.h
VP8_COMMON_SRCS-yes += common/alloccommon.c
VP8_COMMON_SRCS-yes += common/blockd.c
VP8_COMMON_SRCS-yes += common/coefupdateprobs.h
+VP8_COMMON_SRCS-yes += common/copy_c.c
VP8_COMMON_SRCS-yes += common/debugmodes.c
VP8_COMMON_SRCS-yes += common/default_coef_probs.h
VP8_COMMON_SRCS-yes += common/dequantize.c
@@ -60,7 +61,6 @@ VP8_COMMON_SRCS-yes += common/quant_common.c
VP8_COMMON_SRCS-yes += common/reconinter.c
VP8_COMMON_SRCS-yes += common/reconintra.c
VP8_COMMON_SRCS-yes += common/reconintra4x4.c
-VP8_COMMON_SRCS-yes += common/sad_c.c
VP8_COMMON_SRCS-yes += common/setupintrarecon.c
VP8_COMMON_SRCS-yes += common/swapyv12buffer.c
VP8_COMMON_SRCS-yes += common/variance_c.c
@@ -85,26 +85,23 @@ VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/idctllm_mmx.asm
VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/iwalsh_mmx.asm
VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/loopfilter_mmx.asm
VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/recon_mmx.asm
-VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/sad_mmx.asm
VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/subpixel_mmx.asm
VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/variance_mmx.c
VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/variance_impl_mmx.asm
+VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/copy_sse2.asm
VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/idct_blk_sse2.c
VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/idctllm_sse2.asm
VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/recon_sse2.asm
VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/recon_wrapper_sse2.c
-VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/sad_sse2.asm
VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/subpixel_sse2.asm
VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/loopfilter_sse2.asm
VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/iwalsh_sse2.asm
VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/variance_sse2.c
VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/variance_impl_sse2.asm
-VP8_COMMON_SRCS-$(HAVE_SSE3) += common/x86/sad_sse3.asm
-VP8_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/sad_ssse3.asm
+VP8_COMMON_SRCS-$(HAVE_SSE3) += common/x86/copy_sse3.asm
VP8_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/subpixel_ssse3.asm
VP8_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/variance_ssse3.c
VP8_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/variance_impl_ssse3.asm
-VP8_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/sad_sse4.asm
ifeq ($(CONFIG_POSTPROC),yes)
VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/postproc_mmx.asm
@@ -148,7 +145,6 @@ VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/intra4x4_predict_v6$(ASM)
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/dequant_idct_v6$(ASM)
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/dequantize_v6$(ASM)
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/idct_blk_v6.c
-VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_sad16x16_armv6$(ASM)
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance8x8_armv6$(ASM)
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance16x16_armv6$(ASM)
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6$(ASM)
@@ -170,7 +166,6 @@ VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/loopfiltersimplehorizontaledge_
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/loopfiltersimpleverticaledge_neon.c
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/mbloopfilter_neon.c
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/reconintra_neon.c
-VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/sad_neon.c
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/shortidct4x4llm_neon.c
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/sixtappredict_neon.c
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/variance_neon.c
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/vp8_cx_iface.c b/chromium/third_party/libvpx/source/libvpx/vp8/vp8_cx_iface.c
index b1b079cb260..af9cc7320b9 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/vp8_cx_iface.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/vp8_cx_iface.c
@@ -10,7 +10,9 @@
#include "./vpx_config.h"
-#include "vp8_rtcd.h"
+#include "./vp8_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_scale_rtcd.h"
#include "vpx/vpx_codec.h"
#include "vpx/internal/vpx_codec_internal.h"
#include "vpx_version.h"
@@ -37,6 +39,7 @@ struct vp8_extracfg
vp8e_tuning tuning;
unsigned int cq_level; /* constrained quality level */
unsigned int rc_max_intra_bitrate_pct;
+ unsigned int screen_content_mode;
};
@@ -62,6 +65,7 @@ static struct vp8_extracfg default_extracfg = {
0, /* tuning*/
10, /* cq_level */
0, /* rc_max_intra_bitrate_pct */
+ 0, /* screen_content_mode */
};
struct vpx_codec_alg_priv
@@ -79,6 +83,7 @@ struct vpx_codec_alg_priv
/* pkt_list size depends on the maximum number of lagged frames allowed. */
vpx_codec_pkt_list_decl(64) pkt_list;
unsigned int fixed_kf_cntr;
+ vpx_enc_frame_flags_t control_frame_flags;
};
@@ -194,6 +199,7 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx,
RANGE_CHECK_HI(vp8_cfg, arnr_strength, 6);
RANGE_CHECK(vp8_cfg, arnr_type, 1, 3);
RANGE_CHECK(vp8_cfg, cq_level, 0, 63);
+ RANGE_CHECK_BOOL(vp8_cfg, screen_content_mode);
if (finalize && (cfg->rc_end_usage == VPX_CQ || cfg->rc_end_usage == VPX_Q))
RANGE_CHECK(vp8_cfg, cq_level,
cfg->rc_min_quantizer, cfg->rc_max_quantizer);
@@ -231,7 +237,8 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx,
RANGE_CHECK_HI(cfg, ts_periodicity, 16);
for (i=1; i<cfg->ts_number_layers; i++)
- if (cfg->ts_target_bitrate[i] <= cfg->ts_target_bitrate[i-1])
+ if (cfg->ts_target_bitrate[i] <= cfg->ts_target_bitrate[i-1] &&
+ cfg->rc_target_bitrate > 0)
ERROR("ts_target_bitrate entries are not strictly increasing");
RANGE_CHECK(cfg, ts_rate_decimator[cfg->ts_number_layers-1], 1, 1);
@@ -360,9 +367,9 @@ static vpx_codec_err_t set_vp8e_config(VP8_CONFIG *oxcf,
if (oxcf->number_of_layers > 1)
{
memcpy (oxcf->target_bitrate, cfg.ts_target_bitrate,
- sizeof(cfg.ts_target_bitrate));
+ sizeof(cfg.ts_target_bitrate));
memcpy (oxcf->rate_decimator, cfg.ts_rate_decimator,
- sizeof(cfg.ts_rate_decimator));
+ sizeof(cfg.ts_rate_decimator));
memcpy (oxcf->layer_id, cfg.ts_layer_id, sizeof(cfg.ts_layer_id));
}
@@ -379,6 +386,8 @@ static vpx_codec_err_t set_vp8e_config(VP8_CONFIG *oxcf,
oxcf->mr_down_sampling_factor.den = mr_cfg->mr_down_sampling_factor.den;
oxcf->mr_low_res_mode_info = mr_cfg->mr_low_res_mode_info;
}
+#else
+ (void)mr_cfg;
#endif
oxcf->cpu_used = vp8_cfg.cpu_used;
@@ -397,6 +406,8 @@ static vpx_codec_err_t set_vp8e_config(VP8_CONFIG *oxcf,
oxcf->tuning = vp8_cfg.tuning;
+ oxcf->screen_content_mode = vp8_cfg.screen_content_mode;
+
/*
printf("Current VP8 Settings: \n");
printf("target_bandwidth: %d\n", oxcf->target_bandwidth);
@@ -438,9 +449,14 @@ static vpx_codec_err_t vp8e_set_config(vpx_codec_alg_priv_t *ctx,
{
vpx_codec_err_t res;
- if (((cfg->g_w != ctx->cfg.g_w) || (cfg->g_h != ctx->cfg.g_h))
- && (cfg->g_lag_in_frames > 1 || cfg->g_pass != VPX_RC_ONE_PASS))
- ERROR("Cannot change width or height after initialization");
+ if (cfg->g_w != ctx->cfg.g_w || cfg->g_h != ctx->cfg.g_h)
+ {
+ if (cfg->g_lag_in_frames > 1 || cfg->g_pass != VPX_RC_ONE_PASS)
+ ERROR("Cannot change width or height after initialization");
+ if ((ctx->cpi->initial_width && (int)cfg->g_w > ctx->cpi->initial_width) ||
+ (ctx->cpi->initial_height && (int)cfg->g_h > ctx->cpi->initial_height))
+ ERROR("Cannot increast width or height larger than their initial values");
+ }
/* Prevent increasing lag_in_frames. This check is stricter than it needs
* to be -- the limit is not increasing past the first lag_in_frames
@@ -586,6 +602,15 @@ static vpx_codec_err_t set_rc_max_intra_bitrate_pct(vpx_codec_alg_priv_t *ctx,
return update_extracfg(ctx, &extra_cfg);
}
+static vpx_codec_err_t set_screen_content_mode(vpx_codec_alg_priv_t *ctx,
+ va_list args)
+{
+ struct vp8_extracfg extra_cfg = ctx->vp8_cfg;
+ extra_cfg.screen_content_mode =
+ CAST(VP8E_SET_SCREEN_CONTENT_MODE, args);
+ return update_extracfg(ctx, &extra_cfg);
+}
+
static vpx_codec_err_t vp8e_mr_alloc_mem(const vpx_codec_enc_cfg_t *cfg,
void **mem_loc)
{
@@ -612,6 +637,9 @@ static vpx_codec_err_t vp8e_mr_alloc_mem(const vpx_codec_enc_cfg_t *cfg,
*mem_loc = (void *)shared_mem_loc;
res = VPX_CODEC_OK;
}
+#else
+ (void)cfg;
+ (void)mem_loc;
#endif
return res;
}
@@ -623,6 +651,8 @@ static vpx_codec_err_t vp8e_init(vpx_codec_ctx_t *ctx,
vp8_rtcd();
+ vpx_dsp_rtcd();
+ vpx_scale_rtcd();
if (!ctx->priv)
{
@@ -768,27 +798,9 @@ static void pick_quickcompress_mode(vpx_codec_alg_priv_t *ctx,
}
}
-
-static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx,
- const vpx_image_t *img,
- vpx_codec_pts_t pts,
- unsigned long duration,
- vpx_enc_frame_flags_t flags,
- unsigned long deadline)
+static vpx_codec_err_t set_reference_and_update(vpx_codec_alg_priv_t *ctx,
+ int flags)
{
- vpx_codec_err_t res = VPX_CODEC_OK;
-
- if (!ctx->cfg.rc_target_bitrate)
- return res;
-
- if (img)
- res = validate_img(ctx, img);
-
- if (!res)
- res = validate_config(ctx, &ctx->cfg, &ctx->vp8_cfg, 1);
-
- pick_quickcompress_mode(ctx, duration, deadline);
- vpx_codec_pkt_list_init(&ctx->pkt_list);
/* Handle Flags */
if (((flags & VP8_EFLAG_NO_UPD_GF) && (flags & VP8_EFLAG_FORCE_GF))
@@ -838,6 +850,39 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx,
vp8_update_entropy(ctx->cpi, 0);
}
+ return VPX_CODEC_OK;
+}
+
+static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx,
+ const vpx_image_t *img,
+ vpx_codec_pts_t pts,
+ unsigned long duration,
+ vpx_enc_frame_flags_t flags,
+ unsigned long deadline)
+{
+ vpx_codec_err_t res = VPX_CODEC_OK;
+
+ if (!ctx->cfg.rc_target_bitrate)
+ return res;
+
+ if (img)
+ res = validate_img(ctx, img);
+
+ if (!res)
+ res = validate_config(ctx, &ctx->cfg, &ctx->vp8_cfg, 1);
+
+ pick_quickcompress_mode(ctx, duration, deadline);
+ vpx_codec_pkt_list_init(&ctx->pkt_list);
+
+ // If no flags are set in the encode call, then use the frame flags as
+ // defined via the control function: vp8e_set_frame_flags.
+ if (!flags) {
+ flags = ctx->control_frame_flags;
+ }
+ ctx->control_frame_flags = 0;
+
+ res = set_reference_and_update(ctx, flags);
+
/* Handle fixed keyframe intervals */
if (ctx->cfg.kf_mode == VPX_KF_AUTO
&& ctx->cfg.kf_min_dist == ctx->cfg.kf_max_dist)
@@ -1140,6 +1185,25 @@ static vpx_codec_err_t vp8e_use_reference(vpx_codec_alg_priv_t *ctx,
return VPX_CODEC_OK;
}
+static vpx_codec_err_t vp8e_set_frame_flags(vpx_codec_alg_priv_t *ctx,
+ va_list args)
+{
+ int frame_flags = va_arg(args, int);
+ ctx->control_frame_flags = frame_flags;
+ return set_reference_and_update(ctx, frame_flags);
+}
+
+static vpx_codec_err_t vp8e_set_temporal_layer_id(vpx_codec_alg_priv_t *ctx,
+ va_list args)
+{
+ int layer_id = va_arg(args, int);
+ if (layer_id < 0 || layer_id >= (int)ctx->cfg.ts_number_layers) {
+ return VPX_CODEC_INVALID_PARAM;
+ }
+ ctx->cpi->temporal_layer_id = layer_id;
+ return VPX_CODEC_OK;
+}
+
static vpx_codec_err_t vp8e_set_roi_map(vpx_codec_alg_priv_t *ctx,
va_list args)
{
@@ -1214,6 +1278,8 @@ static vpx_codec_ctrl_fn_map_t vp8e_ctf_maps[] =
{VP8E_UPD_ENTROPY, vp8e_update_entropy},
{VP8E_UPD_REFERENCE, vp8e_update_reference},
{VP8E_USE_REFERENCE, vp8e_use_reference},
+ {VP8E_SET_FRAME_FLAGS, vp8e_set_frame_flags},
+ {VP8E_SET_TEMPORAL_LAYER_ID, vp8e_set_temporal_layer_id},
{VP8E_SET_ROI_MAP, vp8e_set_roi_map},
{VP8E_SET_ACTIVEMAP, vp8e_set_activemap},
{VP8E_SET_SCALEMODE, vp8e_set_scalemode},
@@ -1231,6 +1297,7 @@ static vpx_codec_ctrl_fn_map_t vp8e_ctf_maps[] =
{VP8E_SET_TUNING, set_tuning},
{VP8E_SET_CQ_LEVEL, set_cq_level},
{VP8E_SET_MAX_INTRA_BITRATE_PCT, set_rc_max_intra_bitrate_pct},
+ {VP8E_SET_SCREEN_CONTENT_MODE, set_screen_content_mode},
{ -1, NULL},
};
@@ -1264,10 +1331,8 @@ static vpx_codec_enc_cfg_map_t vp8e_usage_cfg_map[] =
30, /* rc_resize_up_threshold */
VPX_VBR, /* rc_end_usage */
-#if VPX_ENCODER_ABI_VERSION > (1 + VPX_CODEC_ABI_VERSION)
{0}, /* rc_twopass_stats_in */
{0}, /* rc_firstpass_mb_stats_in */
-#endif
256, /* rc_target_bandwidth */
4, /* rc_min_quantizer */
63, /* rc_max_quantizer */
@@ -1287,9 +1352,6 @@ static vpx_codec_enc_cfg_map_t vp8e_usage_cfg_map[] =
0, /* kf_min_dist */
128, /* kf_max_dist */
-#if VPX_ENCODER_ABI_VERSION == (1 + VPX_CODEC_ABI_VERSION)
- "vp8.fpf" /* first pass filename */
-#endif
VPX_SS_DEFAULT_LAYERS, /* ss_number_layers */
{0},
{0}, /* ss_target_bitrate */
@@ -1320,12 +1382,13 @@ CODEC_INTERFACE(vpx_codec_vp8_cx) =
NULL, /* vpx_codec_get_si_fn_t get_si; */
NULL, /* vpx_codec_decode_fn_t decode; */
NULL, /* vpx_codec_frame_get_fn_t frame_get; */
+ NULL, /* vpx_codec_set_fb_fn_t set_fb_fn; */
},
{
1, /* 1 cfg map */
- vp8e_usage_cfg_map, /* vpx_codec_enc_cfg_map_t peek_si; */
+ vp8e_usage_cfg_map, /* vpx_codec_enc_cfg_map_t cfg_maps; */
vp8e_encode, /* vpx_codec_encode_fn_t encode; */
- vp8e_get_cxdata, /* vpx_codec_get_cx_data_fn_t frame_get; */
+ vp8e_get_cxdata, /* vpx_codec_get_cx_data_fn_t get_cx_data; */
vp8e_set_config,
NULL,
vp8e_get_preview,
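
Three encoder controls are added here: VP8E_SET_FRAME_FLAGS, VP8E_SET_TEMPORAL_LAYER_ID and VP8E_SET_SCREEN_CONTENT_MODE. From application code they go through the usual control interface; a sketch (error handling elided, an initialized encoder context assumed):

    #include "vpx/vpx_encoder.h"
    #include "vpx/vp8cx.h"

    /* Per-frame temporal-layer setup via the new controls. */
    static void configure_frame(vpx_codec_ctx_t *codec, int layer_id,
                                vpx_enc_frame_flags_t flags)
    {
        vpx_codec_control(codec, VP8E_SET_TEMPORAL_LAYER_ID, layer_id);
        vpx_codec_control(codec, VP8E_SET_FRAME_FLAGS, (int)flags);
    }

    /* Once, after vpx_codec_enc_init(), for screen-capture sources. */
    static void enable_screen_mode(vpx_codec_ctx_t *codec)
    {
        vpx_codec_control(codec, VP8E_SET_SCREEN_CONTENT_MODE, 1);
    }

Per the vp8e_encode hunk above, flags set this way are used only when the vpx_codec_encode() call itself passes no flags, and are consumed once.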
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/vp8_dx_iface.c b/chromium/third_party/libvpx/source/libvpx/vp8/vp8_dx_iface.c
index 5aa274dbb0b..72e4770c008 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/vp8_dx_iface.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/vp8_dx_iface.c
@@ -11,7 +11,9 @@
#include <stdlib.h>
#include <string.h>
-#include "vp8_rtcd.h"
+#include "./vp8_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_scale_rtcd.h"
#include "vpx/vpx_decoder.h"
#include "vpx/vp8dx.h"
#include "vpx/internal/vpx_codec_internal.h"
@@ -60,7 +62,6 @@ struct vpx_codec_alg_priv
vpx_decrypt_cb decrypt_cb;
void *decrypt_state;
vpx_image_t img;
- int flushed;
int img_setup;
struct frame_buffers yv12_frame_buffers;
void *user_priv;
@@ -75,6 +76,7 @@ static unsigned long vp8_priv_sz(const vpx_codec_dec_cfg_t *si, vpx_codec_flags_
* known)
*/
(void)si;
+ (void)flags;
return sizeof(vpx_codec_alg_priv_t);
}
@@ -89,7 +91,6 @@ static void vp8_init_ctx(vpx_codec_ctx_t *ctx)
priv->si.sz = sizeof(priv->si);
priv->decrypt_cb = NULL;
priv->decrypt_state = NULL;
- priv->flushed = 0;
if (ctx->config.dec)
{
@@ -107,6 +108,8 @@ static vpx_codec_err_t vp8_init(vpx_codec_ctx_t *ctx,
(void) data;
vp8_rtcd();
+ vpx_dsp_rtcd();
+ vpx_scale_rtcd();
/* This function only allocates space for the vpx_codec_alg_priv_t
* structure. More memory may be required at the time the stream
@@ -189,7 +192,7 @@ static vpx_codec_err_t vp8_peek_si_internal(const uint8_t *data,
/* vet via sync code */
if (clear[3] != 0x9d || clear[4] != 0x01 || clear[5] != 0x2a)
- res = VPX_CODEC_UNSUP_BITSTREAM;
+ return VPX_CODEC_UNSUP_BITSTREAM;
si->w = (clear[6] | (clear[7] << 8)) & 0x3fff;
si->h = (clear[8] | (clear[9] << 8)) & 0x3fff;
@@ -287,8 +290,8 @@ update_fragments(vpx_codec_alg_priv_t *ctx,
if (ctx->fragments.count == 0)
{
/* New frame, reset fragment pointers and sizes */
- vpx_memset((void*)ctx->fragments.ptrs, 0, sizeof(ctx->fragments.ptrs));
- vpx_memset(ctx->fragments.sizes, 0, sizeof(ctx->fragments.sizes));
+ memset((void*)ctx->fragments.ptrs, 0, sizeof(ctx->fragments.ptrs));
+ memset(ctx->fragments.sizes, 0, sizeof(ctx->fragments.sizes));
}
if (ctx->fragments.enabled && !(data == NULL && data_sz == 0))
{
@@ -307,6 +310,11 @@ update_fragments(vpx_codec_alg_priv_t *ctx,
return 0;
}
+ if (!ctx->fragments.enabled && (data == NULL && data_sz == 0))
+ {
+ return 0;
+ }
+
if (!ctx->fragments.enabled)
{
ctx->fragments.ptrs[0] = data;
@@ -327,14 +335,11 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx,
unsigned int resolution_change = 0;
unsigned int w, h;
- if (data == NULL && data_sz == 0) {
- ctx->flushed = 1;
- return VPX_CODEC_OK;
+ if (!ctx->fragments.enabled && (data == NULL && data_sz == 0))
+ {
+ return 0;
}
- /* Reset flushed when receiving a valid frame */
- ctx->flushed = 0;
-
/* Update the input fragment data */
if(update_fragments(ctx, data, data_sz, &res) <= 0)
return res;
@@ -401,7 +406,7 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx,
if (!res)
{
VP8D_COMP *pbi = ctx->yv12_frame_buffers.pbi[0];
- if(resolution_change)
+ if (resolution_change)
{
VP8_COMMON *const pc = & pbi->common;
MACROBLOCKD *const xd = & pbi->mb;
@@ -647,6 +652,8 @@ static vpx_codec_err_t vp8_set_postproc(vpx_codec_alg_priv_t *ctx,
return VPX_CODEC_INVALID_PARAM;
#else
+ (void)ctx;
+ (void)args;
return VPX_CODEC_INCAPABLE;
#endif
}
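
On the decoder side, removing the flushed flag changes what a data-less call means: for non-fragmented streams it is now simply a successful no-op. A sketch of the caller-visible "flush":

    #include "vpx/vpx_decoder.h"

    /* A NULL/zero-length decode call is now a no-op success for
     * non-fragmented streams rather than a state toggle. */
    static vpx_codec_err_t flush_decoder(vpx_codec_ctx_t *decoder)
    {
        return vpx_codec_decode(decoder, NULL, 0, NULL, 0);
    }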
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/vp8cx.mk b/chromium/third_party/libvpx/source/libvpx/vp8/vp8cx.mk
index a0dbdcfa92f..5e4ef05987e 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/vp8cx.mk
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/vp8cx.mk
@@ -75,7 +75,6 @@ VP8_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/postproc.c
VP8_CX_SRCS-yes += encoder/temporal_filter.c
VP8_CX_SRCS-$(CONFIG_MULTI_RES_ENCODING) += encoder/mr_dissim.c
VP8_CX_SRCS-$(CONFIG_MULTI_RES_ENCODING) += encoder/mr_dissim.h
-VP8_CX_SRCS-yes += encoder/vp8_asm_enc_offsets.c
ifeq ($(CONFIG_REALTIME_ONLY),yes)
VP8_CX_SRCS_REMOVE-yes += encoder/firstpass.c
@@ -107,6 +106,3 @@ VP8_CX_SRCS_REMOVE-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm
endif
VP8_CX_SRCS-yes := $(filter-out $(VP8_CX_SRCS_REMOVE-yes),$(VP8_CX_SRCS-yes))
-
-$(eval $(call asm_offsets_template,\
- vp8_asm_enc_offsets.asm, $(VP8_PREFIX)encoder/vp8_asm_enc_offsets.c))
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/vp8cx_arm.mk b/chromium/third_party/libvpx/source/libvpx/vp8/vp8cx_arm.mk
index ed19fd4e1d9..05003017982 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/vp8cx_arm.mk
+++ b/chromium/third_party/libvpx/source/libvpx/vp8/vp8cx_arm.mk
@@ -14,30 +14,17 @@ VP8_CX_SRCS-$(ARCH_ARM) += vp8cx_arm.mk
#File list for arm
# encoder
VP8_CX_SRCS-$(ARCH_ARM) += encoder/arm/dct_arm.c
-VP8_CX_SRCS-$(ARCH_ARM) += encoder/arm/quantize_arm.c
-
-#File list for edsp
-# encoder
-VP8_CX_SRCS-$(HAVE_EDSP) += encoder/arm/boolhuff_arm.c
-VP8_CX_SRCS_REMOVE-$(HAVE_EDSP) += encoder/boolhuff.c
-VP8_CX_SRCS-$(HAVE_EDSP) += encoder/arm/armv5te/boolhuff_armv5te$(ASM)
-VP8_CX_SRCS-$(HAVE_EDSP) += encoder/arm/armv5te/vp8_packtokens_armv5$(ASM)
-VP8_CX_SRCS-$(HAVE_EDSP) += encoder/arm/armv5te/vp8_packtokens_mbrow_armv5$(ASM)
-VP8_CX_SRCS-$(HAVE_EDSP) += encoder/arm/armv5te/vp8_packtokens_partitions_armv5$(ASM)
#File list for media
# encoder
-VP8_CX_SRCS-$(HAVE_MEDIA) += encoder/arm/armv6/vp8_subtract_armv6$(ASM)
VP8_CX_SRCS-$(HAVE_MEDIA) += encoder/arm/armv6/vp8_short_fdct4x4_armv6$(ASM)
-VP8_CX_SRCS-$(HAVE_MEDIA) += encoder/arm/armv6/vp8_fast_quantize_b_armv6$(ASM)
VP8_CX_SRCS-$(HAVE_MEDIA) += encoder/arm/armv6/vp8_mse16x16_armv6$(ASM)
VP8_CX_SRCS-$(HAVE_MEDIA) += encoder/arm/armv6/walsh_v6$(ASM)
#File list for neon
# encoder
-VP8_CX_SRCS-$(HAVE_NEON_ASM) += encoder/arm/neon/fastquantizeb_neon$(ASM)
-
VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/denoising_neon.c
+VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/fastquantizeb_neon.c
VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/shortfdct_neon.c
VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/subtract_neon.c
VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp8_mse16x16_neon.c
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_convolve8_avg_neon.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_convolve8_avg_neon.c
new file mode 100644
index 00000000000..dd569d348fb
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_convolve8_avg_neon.c
@@ -0,0 +1,390 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stddef.h>
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "vpx_ports/mem.h"
+
+void vp9_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h);
+void vp9_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h);
+
+static INLINE int32x4_t MULTIPLY_BY_Q0(
+ int16x4_t dsrc0,
+ int16x4_t dsrc1,
+ int16x4_t dsrc2,
+ int16x4_t dsrc3,
+ int16x4_t dsrc4,
+ int16x4_t dsrc5,
+ int16x4_t dsrc6,
+ int16x4_t dsrc7,
+ int16x8_t q0s16) {
+ int32x4_t qdst;
+ int16x4_t d0s16, d1s16;
+
+ d0s16 = vget_low_s16(q0s16);
+ d1s16 = vget_high_s16(q0s16);
+
+ qdst = vmull_lane_s16(dsrc0, d0s16, 0);
+ qdst = vmlal_lane_s16(qdst, dsrc1, d0s16, 1);
+ qdst = vmlal_lane_s16(qdst, dsrc2, d0s16, 2);
+ qdst = vmlal_lane_s16(qdst, dsrc3, d0s16, 3);
+ qdst = vmlal_lane_s16(qdst, dsrc4, d1s16, 0);
+ qdst = vmlal_lane_s16(qdst, dsrc5, d1s16, 1);
+ qdst = vmlal_lane_s16(qdst, dsrc6, d1s16, 2);
+ qdst = vmlal_lane_s16(qdst, dsrc7, d1s16, 3);
+ return qdst;
+}
+
+void vp9_convolve8_avg_horiz_neon(
+ uint8_t *src,
+ ptrdiff_t src_stride,
+ uint8_t *dst,
+ ptrdiff_t dst_stride,
+ const int16_t *filter_x,
+ int x_step_q4,
+ const int16_t *filter_y, // unused
+ int y_step_q4, // unused
+ int w,
+ int h) {
+ int width;
+ uint8_t *s, *d;
+ uint8x8_t d2u8, d3u8, d24u8, d25u8, d26u8, d27u8, d28u8, d29u8;
+ uint32x2_t d2u32, d3u32, d6u32, d7u32, d28u32, d29u32, d30u32, d31u32;
+ uint8x16_t q1u8, q3u8, q12u8, q13u8, q14u8, q15u8;
+ int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d22s16, d23s16;
+ int16x4_t d24s16, d25s16, d26s16, d27s16;
+ uint16x4_t d2u16, d3u16, d4u16, d5u16, d16u16, d17u16, d18u16, d19u16;
+ int16x8_t q0s16;
+ uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16;
+ int32x4_t q1s32, q2s32, q14s32, q15s32;
+ uint16x8x2_t q0x2u16;
+ uint8x8x2_t d0x2u8, d1x2u8;
+ uint32x2x2_t d0x2u32;
+ uint16x4x2_t d0x2u16, d1x2u16;
+ uint32x4x2_t q0x2u32;
+
+ if (x_step_q4 != 16) {
+ vp9_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
+ filter_x, x_step_q4,
+ filter_y, y_step_q4, w, h);
+ return;
+ }
+
+ q0s16 = vld1q_s16(filter_x);
+
+ src -= 3; // adjust for taps
+ for (; h > 0; h -= 4) { // loop_horiz_v
+ s = src;
+ d24u8 = vld1_u8(s);
+ s += src_stride;
+ d25u8 = vld1_u8(s);
+ s += src_stride;
+ d26u8 = vld1_u8(s);
+ s += src_stride;
+ d27u8 = vld1_u8(s);
+
+ q12u8 = vcombine_u8(d24u8, d25u8);
+ q13u8 = vcombine_u8(d26u8, d27u8);
+
+ q0x2u16 = vtrnq_u16(vreinterpretq_u16_u8(q12u8),
+ vreinterpretq_u16_u8(q13u8));
+ d24u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[0]));
+ d25u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[0]));
+ d26u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[1]));
+ d27u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[1]));
+ d0x2u8 = vtrn_u8(d24u8, d25u8);
+ d1x2u8 = vtrn_u8(d26u8, d27u8);
+
+ __builtin_prefetch(src + src_stride * 4);
+ __builtin_prefetch(src + src_stride * 5);
+
+ q8u16 = vmovl_u8(d0x2u8.val[0]);
+ q9u16 = vmovl_u8(d0x2u8.val[1]);
+ q10u16 = vmovl_u8(d1x2u8.val[0]);
+ q11u16 = vmovl_u8(d1x2u8.val[1]);
+
+ src += 7;
+ d16u16 = vget_low_u16(q8u16);
+ d17u16 = vget_high_u16(q8u16);
+ d18u16 = vget_low_u16(q9u16);
+ d19u16 = vget_high_u16(q9u16);
+ q8u16 = vcombine_u16(d16u16, d18u16); // vswp 17 18
+ q9u16 = vcombine_u16(d17u16, d19u16);
+
+ d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
+ d23s16 = vreinterpret_s16_u16(vget_high_u16(q10u16)); // vmov 23 21
+ for (width = w;
+ width > 0;
+ width -= 4, src += 4, dst += 4) { // loop_horiz
+ s = src;
+ d28u32 = vld1_dup_u32((const uint32_t *)s);
+ s += src_stride;
+ d29u32 = vld1_dup_u32((const uint32_t *)s);
+ s += src_stride;
+ d31u32 = vld1_dup_u32((const uint32_t *)s);
+ s += src_stride;
+ d30u32 = vld1_dup_u32((const uint32_t *)s);
+
+ __builtin_prefetch(src + 64);
+
+ d0x2u16 = vtrn_u16(vreinterpret_u16_u32(d28u32),
+ vreinterpret_u16_u32(d31u32));
+ d1x2u16 = vtrn_u16(vreinterpret_u16_u32(d29u32),
+ vreinterpret_u16_u32(d30u32));
+ d0x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[0]), // d28
+ vreinterpret_u8_u16(d1x2u16.val[0])); // d29
+ d1x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[1]), // d31
+ vreinterpret_u8_u16(d1x2u16.val[1])); // d30
+
+ __builtin_prefetch(src + 64 + src_stride);
+
+ q14u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]);
+ q15u8 = vcombine_u8(d1x2u8.val[1], d1x2u8.val[0]);
+ q0x2u32 = vtrnq_u32(vreinterpretq_u32_u8(q14u8),
+ vreinterpretq_u32_u8(q15u8));
+
+ d28u8 = vreinterpret_u8_u32(vget_low_u32(q0x2u32.val[0]));
+ d29u8 = vreinterpret_u8_u32(vget_high_u32(q0x2u32.val[0]));
+ q12u16 = vmovl_u8(d28u8);
+ q13u16 = vmovl_u8(d29u8);
+
+ __builtin_prefetch(src + 64 + src_stride * 2);
+
+ d = dst;
+ d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 0);
+ d += dst_stride;
+ d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 0);
+ d += dst_stride;
+ d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 1);
+ d += dst_stride;
+ d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 1);
+
+ d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16));
+ d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16));
+ d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
+ d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
+ d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
+ d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
+ d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
+ d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
+ d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
+
+ q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d20s16, d22s16,
+ d18s16, d19s16, d23s16, d24s16, q0s16);
+ q2s32 = MULTIPLY_BY_Q0(d17s16, d20s16, d22s16, d18s16,
+ d19s16, d23s16, d24s16, d26s16, q0s16);
+ q14s32 = MULTIPLY_BY_Q0(d20s16, d22s16, d18s16, d19s16,
+ d23s16, d24s16, d26s16, d27s16, q0s16);
+ q15s32 = MULTIPLY_BY_Q0(d22s16, d18s16, d19s16, d23s16,
+ d24s16, d26s16, d27s16, d25s16, q0s16);
+
+ __builtin_prefetch(src + 64 + src_stride * 3);
+
+ d2u16 = vqrshrun_n_s32(q1s32, 7);
+ d3u16 = vqrshrun_n_s32(q2s32, 7);
+ d4u16 = vqrshrun_n_s32(q14s32, 7);
+ d5u16 = vqrshrun_n_s32(q15s32, 7);
+
+ q1u16 = vcombine_u16(d2u16, d3u16);
+ q2u16 = vcombine_u16(d4u16, d5u16);
+
+ d2u8 = vqmovn_u16(q1u16);
+ d3u8 = vqmovn_u16(q2u16);
+
+ d0x2u16 = vtrn_u16(vreinterpret_u16_u8(d2u8),
+ vreinterpret_u16_u8(d3u8));
+ d0x2u32 = vtrn_u32(vreinterpret_u32_u16(d0x2u16.val[0]),
+ vreinterpret_u32_u16(d0x2u16.val[1]));
+ d0x2u8 = vtrn_u8(vreinterpret_u8_u32(d0x2u32.val[0]),
+ vreinterpret_u8_u32(d0x2u32.val[1]));
+
+ q1u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]);
+ q3u8 = vreinterpretq_u8_u32(vcombine_u32(d6u32, d7u32));
+
+ q1u8 = vrhaddq_u8(q1u8, q3u8);
+
+ d2u32 = vreinterpret_u32_u8(vget_low_u8(q1u8));
+ d3u32 = vreinterpret_u32_u8(vget_high_u8(q1u8));
+
+ d = dst;
+ vst1_lane_u32((uint32_t *)d, d2u32, 0);
+ d += dst_stride;
+ vst1_lane_u32((uint32_t *)d, d3u32, 0);
+ d += dst_stride;
+ vst1_lane_u32((uint32_t *)d, d2u32, 1);
+ d += dst_stride;
+ vst1_lane_u32((uint32_t *)d, d3u32, 1);
+
+ q8u16 = q9u16;
+ d20s16 = d23s16;
+ q11u16 = q12u16;
+ q9u16 = q13u16;
+ d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
+ }
+ src += src_stride * 4 - w - 7;
+ dst += dst_stride * 4 - w;
+ }
+ return;
+}
+
+void vp9_convolve8_avg_vert_neon(
+ uint8_t *src,
+ ptrdiff_t src_stride,
+ uint8_t *dst,
+ ptrdiff_t dst_stride,
+ const int16_t *filter_x, // unused
+ int x_step_q4, // unused
+ const int16_t *filter_y,
+ int y_step_q4,
+ int w,
+ int h) {
+ int height;
+ uint8_t *s, *d;
+ uint8x8_t d2u8, d3u8;
+ uint32x2_t d2u32, d3u32, d6u32, d7u32;
+ uint32x2_t d16u32, d18u32, d20u32, d22u32, d24u32, d26u32;
+ uint8x16_t q1u8, q3u8;
+ int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16;
+ int16x4_t d24s16, d25s16, d26s16, d27s16;
+ uint16x4_t d2u16, d3u16, d4u16, d5u16;
+ int16x8_t q0s16;
+ uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16;
+ int32x4_t q1s32, q2s32, q14s32, q15s32;
+
+ if (y_step_q4 != 16) {
+ vp9_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
+ filter_x, x_step_q4,
+ filter_y, y_step_q4, w, h);
+ return;
+ }
+
+ src -= src_stride * 3;
+ q0s16 = vld1q_s16(filter_y);
+ for (; w > 0; w -= 4, src += 4, dst += 4) { // loop_vert_h
+ s = src;
+ d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 0);
+ s += src_stride;
+ d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 1);
+ s += src_stride;
+ d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 0);
+ s += src_stride;
+ d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 1);
+ s += src_stride;
+ d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 0);
+ s += src_stride;
+ d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 1);
+ s += src_stride;
+ d22u32 = vld1_lane_u32((const uint32_t *)s, d22u32, 0);
+ s += src_stride;
+
+ q8u16 = vmovl_u8(vreinterpret_u8_u32(d16u32));
+ q9u16 = vmovl_u8(vreinterpret_u8_u32(d18u32));
+ q10u16 = vmovl_u8(vreinterpret_u8_u32(d20u32));
+ q11u16 = vmovl_u8(vreinterpret_u8_u32(d22u32));
+
+ d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
+ d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
+ d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
+ d = dst;
+ for (height = h; height > 0; height -= 4) { // loop_vert
+ d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 0);
+ s += src_stride;
+ d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 0);
+ s += src_stride;
+ d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 1);
+ s += src_stride;
+ d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 1);
+ s += src_stride;
+
+ q12u16 = vmovl_u8(vreinterpret_u8_u32(d24u32));
+ q13u16 = vmovl_u8(vreinterpret_u8_u32(d26u32));
+
+ d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 0);
+ d += dst_stride;
+ d6u32 = vld1_lane_u32((const uint32_t *)d, d6u32, 1);
+ d += dst_stride;
+ d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 0);
+ d += dst_stride;
+ d7u32 = vld1_lane_u32((const uint32_t *)d, d7u32, 1);
+ d -= dst_stride * 3;
+
+ d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16));
+ d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16));
+ d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
+ d21s16 = vreinterpret_s16_u16(vget_high_u16(q10u16));
+ d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
+ d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
+ d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
+ d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
+
+ __builtin_prefetch(s);
+ __builtin_prefetch(s + src_stride);
+ q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d18s16, d19s16,
+ d20s16, d21s16, d22s16, d24s16, q0s16);
+ __builtin_prefetch(s + src_stride * 2);
+ __builtin_prefetch(s + src_stride * 3);
+ q2s32 = MULTIPLY_BY_Q0(d17s16, d18s16, d19s16, d20s16,
+ d21s16, d22s16, d24s16, d26s16, q0s16);
+ __builtin_prefetch(d);
+ __builtin_prefetch(d + dst_stride);
+ q14s32 = MULTIPLY_BY_Q0(d18s16, d19s16, d20s16, d21s16,
+ d22s16, d24s16, d26s16, d27s16, q0s16);
+ __builtin_prefetch(d + dst_stride * 2);
+ __builtin_prefetch(d + dst_stride * 3);
+ q15s32 = MULTIPLY_BY_Q0(d19s16, d20s16, d21s16, d22s16,
+ d24s16, d26s16, d27s16, d25s16, q0s16);
+
+ d2u16 = vqrshrun_n_s32(q1s32, 7);
+ d3u16 = vqrshrun_n_s32(q2s32, 7);
+ d4u16 = vqrshrun_n_s32(q14s32, 7);
+ d5u16 = vqrshrun_n_s32(q15s32, 7);
+
+ q1u16 = vcombine_u16(d2u16, d3u16);
+ q2u16 = vcombine_u16(d4u16, d5u16);
+
+ d2u8 = vqmovn_u16(q1u16);
+ d3u8 = vqmovn_u16(q2u16);
+
+ q1u8 = vcombine_u8(d2u8, d3u8);
+ q3u8 = vreinterpretq_u8_u32(vcombine_u32(d6u32, d7u32));
+
+ q1u8 = vrhaddq_u8(q1u8, q3u8);
+
+ d2u32 = vreinterpret_u32_u8(vget_low_u8(q1u8));
+ d3u32 = vreinterpret_u32_u8(vget_high_u8(q1u8));
+
+ vst1_lane_u32((uint32_t *)d, d2u32, 0);
+ d += dst_stride;
+ vst1_lane_u32((uint32_t *)d, d2u32, 1);
+ d += dst_stride;
+ vst1_lane_u32((uint32_t *)d, d3u32, 0);
+ d += dst_stride;
+ vst1_lane_u32((uint32_t *)d, d3u32, 1);
+ d += dst_stride;
+
+ q8u16 = q10u16;
+ d18s16 = d22s16;
+ d19s16 = d24s16;
+ q10u16 = q13u16;
+ d22s16 = d25s16;
+ }
+ }
+ return;
+}
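
The intrinsics above follow the assembly closely: MULTIPLY_BY_Q0 accumulates the eight taps with vmull_lane_s16/vmlal_lane_s16, vqrshrun_n_s32(..., 7) performs the rounding shift by FILTER_BITS with unsigned saturation, vqmovn_u16 narrows to bytes, and vrhaddq_u8 folds in the rounding average with the existing destination pixels. A minimal scalar sketch of the per-pixel computation, using hypothetical helper names that are not part of this tree:

    #include <stdint.h>

    static uint8_t clip_u8(int v) {
      return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
    }

    /* One output pixel of the 8-tap averaging convolution: accumulate the
     * taps, round-shift by FILTER_BITS (7), clamp to 8 bits, then take the
     * rounding average with the current dst value -- the scalar twin of
     * vqrshrun_n_s32(..., 7) followed by vrhaddq_u8. */
    static uint8_t convolve8_avg_pixel(const uint8_t *src,  /* 3 left of center */
                                       const int16_t *filter, uint8_t dst) {
      int k, sum = 0;
      for (k = 0; k < 8; ++k)
        sum += src[k] * filter[k];
      return (uint8_t)((dst + clip_u8((sum + 64) >> 7) + 1) >> 1);
    }
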
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_convolve8_avg_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_convolve8_avg_neon_asm.asm
index 6b20cb9bf2a..4d85846f0a9 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_convolve8_avg_neon.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_convolve8_avg_neon_asm.asm
@@ -78,7 +78,7 @@
mov r10, r6 ; w loop counter
-loop_horiz_v
+vp9_convolve8_avg_loop_horiz_v
vld1.8 {d24}, [r0], r1
vld1.8 {d25}, [r0], r1
vld1.8 {d26}, [r0], r1
@@ -101,7 +101,7 @@ loop_horiz_v
add r0, r0, #3
-loop_horiz
+vp9_convolve8_avg_loop_horiz
add r5, r0, #64
vld1.32 {d28[]}, [r0], r1
@@ -170,14 +170,14 @@ loop_horiz
vmov q9, q13
subs r6, r6, #4 ; w -= 4
- bgt loop_horiz
+ bgt vp9_convolve8_avg_loop_horiz
; outer loop
mov r6, r10 ; restore w counter
add r0, r0, r9 ; src += src_stride * 4 - w
add r2, r2, r12 ; dst += dst_stride * 4 - w
subs r7, r7, #4 ; h -= 4
- bgt loop_horiz_v
+ bgt vp9_convolve8_avg_loop_horiz_v
pop {r4-r10, pc}
@@ -203,7 +203,7 @@ loop_horiz
lsl r1, r1, #1
lsl r3, r3, #1
-loop_vert_h
+vp9_convolve8_avg_loop_vert_h
mov r4, r0
add r7, r0, r1, asr #1
mov r5, r2
@@ -223,7 +223,7 @@ loop_vert_h
vmovl.u8 q10, d20
vmovl.u8 q11, d22
-loop_vert
+vp9_convolve8_avg_loop_vert
; always process a 4x4 block at a time
vld1.u32 {d24[0]}, [r7], r1
vld1.u32 {d26[0]}, [r4], r1
@@ -288,13 +288,13 @@ loop_vert
vmov d22, d25
subs r12, r12, #4 ; h -= 4
- bgt loop_vert
+ bgt vp9_convolve8_avg_loop_vert
; outer loop
add r0, r0, #4
add r2, r2, #4
subs r6, r6, #4 ; w -= 4
- bgt loop_vert_h
+ bgt vp9_convolve8_avg_loop_vert_h
pop {r4-r8, pc}
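
This and the matching hunks below only rename each NEON assembly file with an _asm suffix — making room for the new intrinsics .c file of the same base name — and give the local branch labels function-specific prefixes (loop_horiz becomes vp9_convolve8_avg_loop_horiz, and so on), apparently so the otherwise identical label names in the various convolve sources stay unique when the files are assembled into one library.
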
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_convolve8_neon.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_convolve8_neon.c
new file mode 100644
index 00000000000..5c555c45882
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_convolve8_neon.c
@@ -0,0 +1,357 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stddef.h>
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "vpx_ports/mem.h"
+
+void vp9_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h);
+void vp9_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h);
+
+static INLINE int32x4_t MULTIPLY_BY_Q0(
+ int16x4_t dsrc0,
+ int16x4_t dsrc1,
+ int16x4_t dsrc2,
+ int16x4_t dsrc3,
+ int16x4_t dsrc4,
+ int16x4_t dsrc5,
+ int16x4_t dsrc6,
+ int16x4_t dsrc7,
+ int16x8_t q0s16) {
+ int32x4_t qdst;
+ int16x4_t d0s16, d1s16;
+
+ d0s16 = vget_low_s16(q0s16);
+ d1s16 = vget_high_s16(q0s16);
+
+ qdst = vmull_lane_s16(dsrc0, d0s16, 0);
+ qdst = vmlal_lane_s16(qdst, dsrc1, d0s16, 1);
+ qdst = vmlal_lane_s16(qdst, dsrc2, d0s16, 2);
+ qdst = vmlal_lane_s16(qdst, dsrc3, d0s16, 3);
+ qdst = vmlal_lane_s16(qdst, dsrc4, d1s16, 0);
+ qdst = vmlal_lane_s16(qdst, dsrc5, d1s16, 1);
+ qdst = vmlal_lane_s16(qdst, dsrc6, d1s16, 2);
+ qdst = vmlal_lane_s16(qdst, dsrc7, d1s16, 3);
+ return qdst;
+}
+
+void vp9_convolve8_horiz_neon(
+ uint8_t *src,
+ ptrdiff_t src_stride,
+ uint8_t *dst,
+ ptrdiff_t dst_stride,
+ const int16_t *filter_x,
+ int x_step_q4,
+ const int16_t *filter_y, // unused
+ int y_step_q4, // unused
+ int w,
+ int h) {
+ int width;
+ uint8_t *s, *d, *psrc, *pdst;
+ uint8x8_t d2u8, d3u8, d24u8, d25u8, d26u8, d27u8, d28u8, d29u8;
+ uint32x2_t d2u32, d3u32, d28u32, d29u32, d30u32, d31u32;
+ uint8x16_t q12u8, q13u8, q14u8, q15u8;
+ int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d22s16, d23s16;
+ int16x4_t d24s16, d25s16, d26s16, d27s16;
+ uint16x4_t d2u16, d3u16, d4u16, d5u16, d16u16, d17u16, d18u16, d19u16;
+ int16x8_t q0s16;
+ uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16;
+ int32x4_t q1s32, q2s32, q14s32, q15s32;
+ uint16x8x2_t q0x2u16;
+ uint8x8x2_t d0x2u8, d1x2u8;
+ uint32x2x2_t d0x2u32;
+ uint16x4x2_t d0x2u16, d1x2u16;
+ uint32x4x2_t q0x2u32;
+
+ if (x_step_q4 != 16) {
+ vp9_convolve8_horiz_c(src, src_stride, dst, dst_stride,
+ filter_x, x_step_q4,
+ filter_y, y_step_q4, w, h);
+ return;
+ }
+
+ q0s16 = vld1q_s16(filter_x);
+
+ src -= 3; // adjust for taps
+ for (; h > 0; h -= 4,
+ src += src_stride * 4,
+ dst += dst_stride * 4) { // loop_horiz_v
+ s = src;
+ d24u8 = vld1_u8(s);
+ s += src_stride;
+ d25u8 = vld1_u8(s);
+ s += src_stride;
+ d26u8 = vld1_u8(s);
+ s += src_stride;
+ d27u8 = vld1_u8(s);
+
+ q12u8 = vcombine_u8(d24u8, d25u8);
+ q13u8 = vcombine_u8(d26u8, d27u8);
+
+ q0x2u16 = vtrnq_u16(vreinterpretq_u16_u8(q12u8),
+ vreinterpretq_u16_u8(q13u8));
+ d24u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[0]));
+ d25u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[0]));
+ d26u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[1]));
+ d27u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[1]));
+ d0x2u8 = vtrn_u8(d24u8, d25u8);
+ d1x2u8 = vtrn_u8(d26u8, d27u8);
+
+ __builtin_prefetch(src + src_stride * 4);
+ __builtin_prefetch(src + src_stride * 5);
+ __builtin_prefetch(src + src_stride * 6);
+
+ q8u16 = vmovl_u8(d0x2u8.val[0]);
+ q9u16 = vmovl_u8(d0x2u8.val[1]);
+ q10u16 = vmovl_u8(d1x2u8.val[0]);
+ q11u16 = vmovl_u8(d1x2u8.val[1]);
+
+ d16u16 = vget_low_u16(q8u16);
+ d17u16 = vget_high_u16(q8u16);
+ d18u16 = vget_low_u16(q9u16);
+ d19u16 = vget_high_u16(q9u16);
+ q8u16 = vcombine_u16(d16u16, d18u16); // vswp 17 18
+ q9u16 = vcombine_u16(d17u16, d19u16);
+
+ d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
+ d23s16 = vreinterpret_s16_u16(vget_high_u16(q10u16)); // vmov 23 21
+ for (width = w, psrc = src + 7, pdst = dst;
+ width > 0;
+ width -= 4, psrc += 4, pdst += 4) { // loop_horiz
+ s = psrc;
+ d28u32 = vld1_dup_u32((const uint32_t *)s);
+ s += src_stride;
+ d29u32 = vld1_dup_u32((const uint32_t *)s);
+ s += src_stride;
+ d31u32 = vld1_dup_u32((const uint32_t *)s);
+ s += src_stride;
+ d30u32 = vld1_dup_u32((const uint32_t *)s);
+
+ __builtin_prefetch(psrc + 64);
+
+ d0x2u16 = vtrn_u16(vreinterpret_u16_u32(d28u32),
+ vreinterpret_u16_u32(d31u32));
+ d1x2u16 = vtrn_u16(vreinterpret_u16_u32(d29u32),
+ vreinterpret_u16_u32(d30u32));
+ d0x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[0]), // d28
+ vreinterpret_u8_u16(d1x2u16.val[0])); // d29
+ d1x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[1]), // d31
+ vreinterpret_u8_u16(d1x2u16.val[1])); // d30
+
+ __builtin_prefetch(psrc + 64 + src_stride);
+
+ q14u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]);
+ q15u8 = vcombine_u8(d1x2u8.val[1], d1x2u8.val[0]);
+ q0x2u32 = vtrnq_u32(vreinterpretq_u32_u8(q14u8),
+ vreinterpretq_u32_u8(q15u8));
+
+ d28u8 = vreinterpret_u8_u32(vget_low_u32(q0x2u32.val[0]));
+ d29u8 = vreinterpret_u8_u32(vget_high_u32(q0x2u32.val[0]));
+ q12u16 = vmovl_u8(d28u8);
+ q13u16 = vmovl_u8(d29u8);
+
+ __builtin_prefetch(psrc + 64 + src_stride * 2);
+
+ d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16));
+ d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16));
+ d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
+ d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
+ d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
+ d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
+ d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
+ d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
+ d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
+
+ q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d20s16, d22s16,
+ d18s16, d19s16, d23s16, d24s16, q0s16);
+ q2s32 = MULTIPLY_BY_Q0(d17s16, d20s16, d22s16, d18s16,
+ d19s16, d23s16, d24s16, d26s16, q0s16);
+ q14s32 = MULTIPLY_BY_Q0(d20s16, d22s16, d18s16, d19s16,
+ d23s16, d24s16, d26s16, d27s16, q0s16);
+ q15s32 = MULTIPLY_BY_Q0(d22s16, d18s16, d19s16, d23s16,
+ d24s16, d26s16, d27s16, d25s16, q0s16);
+
+ __builtin_prefetch(psrc + 60 + src_stride * 3);
+
+ d2u16 = vqrshrun_n_s32(q1s32, 7);
+ d3u16 = vqrshrun_n_s32(q2s32, 7);
+ d4u16 = vqrshrun_n_s32(q14s32, 7);
+ d5u16 = vqrshrun_n_s32(q15s32, 7);
+
+ q1u16 = vcombine_u16(d2u16, d3u16);
+ q2u16 = vcombine_u16(d4u16, d5u16);
+
+ d2u8 = vqmovn_u16(q1u16);
+ d3u8 = vqmovn_u16(q2u16);
+
+ d0x2u16 = vtrn_u16(vreinterpret_u16_u8(d2u8),
+ vreinterpret_u16_u8(d3u8));
+ d0x2u32 = vtrn_u32(vreinterpret_u32_u16(d0x2u16.val[0]),
+ vreinterpret_u32_u16(d0x2u16.val[1]));
+ d0x2u8 = vtrn_u8(vreinterpret_u8_u32(d0x2u32.val[0]),
+ vreinterpret_u8_u32(d0x2u32.val[1]));
+
+ d2u32 = vreinterpret_u32_u8(d0x2u8.val[0]);
+ d3u32 = vreinterpret_u32_u8(d0x2u8.val[1]);
+
+ d = pdst;
+ vst1_lane_u32((uint32_t *)d, d2u32, 0);
+ d += dst_stride;
+ vst1_lane_u32((uint32_t *)d, d3u32, 0);
+ d += dst_stride;
+ vst1_lane_u32((uint32_t *)d, d2u32, 1);
+ d += dst_stride;
+ vst1_lane_u32((uint32_t *)d, d3u32, 1);
+
+ q8u16 = q9u16;
+ d20s16 = d23s16;
+ q11u16 = q12u16;
+ q9u16 = q13u16;
+ d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
+ }
+ }
+ return;
+}
+
+void vp9_convolve8_vert_neon(
+ uint8_t *src,
+ ptrdiff_t src_stride,
+ uint8_t *dst,
+ ptrdiff_t dst_stride,
+ const int16_t *filter_x, // unused
+ int x_step_q4, // unused
+ const int16_t *filter_y,
+ int y_step_q4,
+ int w,
+ int h) {
+ int height;
+ uint8_t *s, *d;
+ uint32x2_t d2u32, d3u32;
+ uint32x2_t d16u32, d18u32, d20u32, d22u32, d24u32, d26u32;
+ int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16;
+ int16x4_t d24s16, d25s16, d26s16, d27s16;
+ uint16x4_t d2u16, d3u16, d4u16, d5u16;
+ int16x8_t q0s16;
+ uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16;
+ int32x4_t q1s32, q2s32, q14s32, q15s32;
+
+ if (y_step_q4 != 16) {
+ vp9_convolve8_vert_c(src, src_stride, dst, dst_stride,
+ filter_x, x_step_q4,
+ filter_y, y_step_q4, w, h);
+ return;
+ }
+
+ src -= src_stride * 3;
+ q0s16 = vld1q_s16(filter_y);
+ for (; w > 0; w -= 4, src += 4, dst += 4) { // loop_vert_h
+ s = src;
+ d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 0);
+ s += src_stride;
+ d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 1);
+ s += src_stride;
+ d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 0);
+ s += src_stride;
+ d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 1);
+ s += src_stride;
+ d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 0);
+ s += src_stride;
+ d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 1);
+ s += src_stride;
+ d22u32 = vld1_lane_u32((const uint32_t *)s, d22u32, 0);
+ s += src_stride;
+
+ q8u16 = vmovl_u8(vreinterpret_u8_u32(d16u32));
+ q9u16 = vmovl_u8(vreinterpret_u8_u32(d18u32));
+ q10u16 = vmovl_u8(vreinterpret_u8_u32(d20u32));
+ q11u16 = vmovl_u8(vreinterpret_u8_u32(d22u32));
+
+ d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
+ d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
+ d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
+ d = dst;
+ for (height = h; height > 0; height -= 4) { // loop_vert
+ d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 0);
+ s += src_stride;
+ d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 0);
+ s += src_stride;
+ d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 1);
+ s += src_stride;
+ d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 1);
+ s += src_stride;
+
+ q12u16 = vmovl_u8(vreinterpret_u8_u32(d24u32));
+ q13u16 = vmovl_u8(vreinterpret_u8_u32(d26u32));
+
+ d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16));
+ d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16));
+ d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
+ d21s16 = vreinterpret_s16_u16(vget_high_u16(q10u16));
+ d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
+ d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
+ d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
+ d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
+
+ __builtin_prefetch(d);
+ __builtin_prefetch(d + dst_stride);
+ q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d18s16, d19s16,
+ d20s16, d21s16, d22s16, d24s16, q0s16);
+ __builtin_prefetch(d + dst_stride * 2);
+ __builtin_prefetch(d + dst_stride * 3);
+ q2s32 = MULTIPLY_BY_Q0(d17s16, d18s16, d19s16, d20s16,
+ d21s16, d22s16, d24s16, d26s16, q0s16);
+ __builtin_prefetch(s);
+ __builtin_prefetch(s + src_stride);
+ q14s32 = MULTIPLY_BY_Q0(d18s16, d19s16, d20s16, d21s16,
+ d22s16, d24s16, d26s16, d27s16, q0s16);
+ __builtin_prefetch(s + src_stride * 2);
+ __builtin_prefetch(s + src_stride * 3);
+ q15s32 = MULTIPLY_BY_Q0(d19s16, d20s16, d21s16, d22s16,
+ d24s16, d26s16, d27s16, d25s16, q0s16);
+
+ d2u16 = vqrshrun_n_s32(q1s32, 7);
+ d3u16 = vqrshrun_n_s32(q2s32, 7);
+ d4u16 = vqrshrun_n_s32(q14s32, 7);
+ d5u16 = vqrshrun_n_s32(q15s32, 7);
+
+ q1u16 = vcombine_u16(d2u16, d3u16);
+ q2u16 = vcombine_u16(d4u16, d5u16);
+
+ d2u32 = vreinterpret_u32_u8(vqmovn_u16(q1u16));
+ d3u32 = vreinterpret_u32_u8(vqmovn_u16(q2u16));
+
+ vst1_lane_u32((uint32_t *)d, d2u32, 0);
+ d += dst_stride;
+ vst1_lane_u32((uint32_t *)d, d2u32, 1);
+ d += dst_stride;
+ vst1_lane_u32((uint32_t *)d, d3u32, 0);
+ d += dst_stride;
+ vst1_lane_u32((uint32_t *)d, d3u32, 1);
+ d += dst_stride;
+
+ q8u16 = q10u16;
+ d18s16 = d22s16;
+ d19s16 = d24s16;
+ q10u16 = q13u16;
+ d22s16 = d25s16;
+ }
+ }
+ return;
+}
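
Both horizontal passes work on 4x4 tiles: four rows are loaded, transposed in registers with the vtrn family so one set of multiply-accumulates yields four neighbouring outputs, and transposed back before the store. A standalone sketch of such a register transpose for a 4x4 byte tile, built from the same vtrn cascade used above (illustrative helper, not from this tree):

    #include <arm_neon.h>

    /* Transpose a 4x4 block of bytes: on entry *a0 holds rows 0 and 1
     * (4 bytes each), *a1 holds rows 2 and 3. On exit *a0 holds the
     * transposed rows 0 and 2, *a1 the transposed rows 1 and 3. */
    static void transpose_u8_4x4(uint8x8_t *a0, uint8x8_t *a1) {
      /* Swap 16-bit pairs: 00 01 20 21 10 11 30 31 / 02 03 22 23 12 13 32 33 */
      const uint16x4x2_t b =
          vtrn_u16(vreinterpret_u16_u8(*a0), vreinterpret_u16_u8(*a1));
      /* Swap 32-bit halves: 00 01 20 21 02 03 22 23 / 10 11 30 31 12 13 32 33 */
      const uint32x2x2_t c = vtrn_u32(vreinterpret_u32_u16(b.val[0]),
                                      vreinterpret_u32_u16(b.val[1]));
      /* Swap bytes: 00 10 20 30 02 12 22 32 / 01 11 21 31 03 13 23 33 */
      const uint8x8x2_t d =
          vtrn_u8(vreinterpret_u8_u32(c.val[0]), vreinterpret_u8_u32(c.val[1]));
      *a0 = d.val[0];
      *a1 = d.val[1];
    }
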
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_convolve8_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_convolve8_neon_asm.asm
index 45258454cab..184c3ad679c 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_convolve8_neon.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_convolve8_neon_asm.asm
@@ -78,7 +78,7 @@
mov r10, r6 ; w loop counter
-loop_horiz_v
+vp9_convolve8_loop_horiz_v
vld1.8 {d24}, [r0], r1
vld1.8 {d25}, [r0], r1
vld1.8 {d26}, [r0], r1
@@ -101,7 +101,7 @@ loop_horiz_v
add r0, r0, #3
-loop_horiz
+vp9_convolve8_loop_horiz
add r5, r0, #64
vld1.32 {d28[]}, [r0], r1
@@ -159,14 +159,14 @@ loop_horiz
vmov q9, q13
subs r6, r6, #4 ; w -= 4
- bgt loop_horiz
+ bgt vp9_convolve8_loop_horiz
; outer loop
mov r6, r10 ; restore w counter
add r0, r0, r9 ; src += src_stride * 4 - w
add r2, r2, r12 ; dst += dst_stride * 4 - w
subs r7, r7, #4 ; h -= 4
- bgt loop_horiz_v
+ bgt vp9_convolve8_loop_horiz_v
pop {r4-r10, pc}
@@ -192,7 +192,7 @@ loop_horiz
lsl r1, r1, #1
lsl r3, r3, #1
-loop_vert_h
+vp9_convolve8_loop_vert_h
mov r4, r0
add r7, r0, r1, asr #1
mov r5, r2
@@ -212,7 +212,7 @@ loop_vert_h
vmovl.u8 q10, d20
vmovl.u8 q11, d22
-loop_vert
+vp9_convolve8_loop_vert
; always process a 4x4 block at a time
vld1.u32 {d24[0]}, [r7], r1
vld1.u32 {d26[0]}, [r4], r1
@@ -266,13 +266,13 @@ loop_vert
vmov d22, d25
subs r12, r12, #4 ; h -= 4
- bgt loop_vert
+ bgt vp9_convolve8_loop_vert
; outer loop
add r0, r0, #4
add r2, r2, #4
subs r6, r6, #4 ; w -= 4
- bgt loop_vert_h
+ bgt vp9_convolve8_loop_vert_h
pop {r4-r8, pc}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_convolve_avg_neon.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_convolve_avg_neon.c
new file mode 100644
index 00000000000..3a3db353e8b
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_convolve_avg_neon.c
@@ -0,0 +1,145 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stddef.h>
+#include <arm_neon.h>
+
+void vp9_convolve_avg_neon(
+ const uint8_t *src, // r0
+ ptrdiff_t src_stride, // r1
+ uint8_t *dst, // r2
+ ptrdiff_t dst_stride, // r3
+ const int16_t *filter_x,
+ int filter_x_stride,
+ const int16_t *filter_y,
+ int filter_y_stride,
+ int w,
+ int h) {
+ uint8_t *d;
+ uint8x8_t d0u8, d1u8, d2u8, d3u8;
+ uint32x2_t d0u32, d2u32;
+ uint8x16_t q0u8, q1u8, q2u8, q3u8, q8u8, q9u8, q10u8, q11u8;
+ (void)filter_x; (void)filter_x_stride;
+ (void)filter_y; (void)filter_y_stride;
+
+ d = dst;
+ if (w > 32) { // avg64
+ for (; h > 0; h -= 1) {
+ q0u8 = vld1q_u8(src);
+ q1u8 = vld1q_u8(src + 16);
+ q2u8 = vld1q_u8(src + 32);
+ q3u8 = vld1q_u8(src + 48);
+ src += src_stride;
+ q8u8 = vld1q_u8(d);
+ q9u8 = vld1q_u8(d + 16);
+ q10u8 = vld1q_u8(d + 32);
+ q11u8 = vld1q_u8(d + 48);
+ d += dst_stride;
+
+ q0u8 = vrhaddq_u8(q0u8, q8u8);
+ q1u8 = vrhaddq_u8(q1u8, q9u8);
+ q2u8 = vrhaddq_u8(q2u8, q10u8);
+ q3u8 = vrhaddq_u8(q3u8, q11u8);
+
+ vst1q_u8(dst, q0u8);
+ vst1q_u8(dst + 16, q1u8);
+ vst1q_u8(dst + 32, q2u8);
+ vst1q_u8(dst + 48, q3u8);
+ dst += dst_stride;
+ }
+ } else if (w == 32) { // avg32
+ for (; h > 0; h -= 2) {
+ q0u8 = vld1q_u8(src);
+ q1u8 = vld1q_u8(src + 16);
+ src += src_stride;
+ q2u8 = vld1q_u8(src);
+ q3u8 = vld1q_u8(src + 16);
+ src += src_stride;
+ q8u8 = vld1q_u8(d);
+ q9u8 = vld1q_u8(d + 16);
+ d += dst_stride;
+ q10u8 = vld1q_u8(d);
+ q11u8 = vld1q_u8(d + 16);
+ d += dst_stride;
+
+ q0u8 = vrhaddq_u8(q0u8, q8u8);
+ q1u8 = vrhaddq_u8(q1u8, q9u8);
+ q2u8 = vrhaddq_u8(q2u8, q10u8);
+ q3u8 = vrhaddq_u8(q3u8, q11u8);
+
+ vst1q_u8(dst, q0u8);
+ vst1q_u8(dst + 16, q1u8);
+ dst += dst_stride;
+ vst1q_u8(dst, q2u8);
+ vst1q_u8(dst + 16, q3u8);
+ dst += dst_stride;
+ }
+ } else if (w > 8) { // avg16
+ for (; h > 0; h -= 2) {
+ q0u8 = vld1q_u8(src);
+ src += src_stride;
+ q1u8 = vld1q_u8(src);
+ src += src_stride;
+ q2u8 = vld1q_u8(d);
+ d += dst_stride;
+ q3u8 = vld1q_u8(d);
+ d += dst_stride;
+
+ q0u8 = vrhaddq_u8(q0u8, q2u8);
+ q1u8 = vrhaddq_u8(q1u8, q3u8);
+
+ vst1q_u8(dst, q0u8);
+ dst += dst_stride;
+ vst1q_u8(dst, q1u8);
+ dst += dst_stride;
+ }
+ } else if (w == 8) { // avg8
+ for (; h > 0; h -= 2) {
+ d0u8 = vld1_u8(src);
+ src += src_stride;
+ d1u8 = vld1_u8(src);
+ src += src_stride;
+ d2u8 = vld1_u8(d);
+ d += dst_stride;
+ d3u8 = vld1_u8(d);
+ d += dst_stride;
+
+ q0u8 = vcombine_u8(d0u8, d1u8);
+ q1u8 = vcombine_u8(d2u8, d3u8);
+ q0u8 = vrhaddq_u8(q0u8, q1u8);
+
+ vst1_u8(dst, vget_low_u8(q0u8));
+ dst += dst_stride;
+ vst1_u8(dst, vget_high_u8(q0u8));
+ dst += dst_stride;
+ }
+ } else { // avg4
+ for (; h > 0; h -= 2) {
+ d0u32 = vld1_lane_u32((const uint32_t *)src, d0u32, 0);
+ src += src_stride;
+ d0u32 = vld1_lane_u32((const uint32_t *)src, d0u32, 1);
+ src += src_stride;
+ d2u32 = vld1_lane_u32((const uint32_t *)d, d2u32, 0);
+ d += dst_stride;
+ d2u32 = vld1_lane_u32((const uint32_t *)d, d2u32, 1);
+ d += dst_stride;
+
+ d0u8 = vrhadd_u8(vreinterpret_u8_u32(d0u32),
+ vreinterpret_u8_u32(d2u32));
+
+ d0u32 = vreinterpret_u32_u8(d0u8);
+ vst1_lane_u32((uint32_t *)dst, d0u32, 0);
+ dst += dst_stride;
+ vst1_lane_u32((uint32_t *)dst, d0u32, 1);
+ dst += dst_stride;
+ }
+ }
+ return;
+}
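
All five width classes reduce to the same per-byte operation; vrhaddq_u8 and vrhadd_u8 compute the rounding halving add lane-wise. A scalar reference of the whole kernel (hypothetical helper; the filter arguments are omitted because the NEON code ignores them):

    #include <stddef.h>
    #include <stdint.h>

    /* Scalar equivalent of vp9_convolve_avg_neon: dst = round((dst + src) / 2). */
    static void convolve_avg_ref(const uint8_t *src, ptrdiff_t src_stride,
                                 uint8_t *dst, ptrdiff_t dst_stride,
                                 int w, int h) {
      int x, y;
      for (y = 0; y < h; ++y) {
        for (x = 0; x < w; ++x)
          dst[x] = (uint8_t)((dst[x] + src[x] + 1) >> 1);
        src += src_stride;
        dst += dst_stride;
      }
    }
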
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_avg_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_convolve_avg_neon_asm.asm
index 7d245302177..7d245302177 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_avg_neon.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_convolve_avg_neon_asm.asm
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_convolve_neon.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_convolve_neon.c
index f0881b5ae9c..2e28cb20ebd 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_convolve_neon.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_convolve_neon.c
@@ -20,7 +20,7 @@ void vp9_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride,
/* Given our constraints: w <= 64, h <= 64, taps == 8 we can reduce the
* maximum buffer size to 64 * 64 + 7 (+ 1 to make it divisible by 4).
*/
- DECLARE_ALIGNED_ARRAY(8, uint8_t, temp, 64 * 72);
+ DECLARE_ALIGNED(8, uint8_t, temp[64 * 72]);
// Account for the vertical phase needing 3 lines prior and 4 lines post
int intermediate_height = h + 7;
@@ -56,7 +56,7 @@ void vp9_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h) {
- DECLARE_ALIGNED_ARRAY(8, uint8_t, temp, 64 * 72);
+ DECLARE_ALIGNED(8, uint8_t, temp[64 * 72]);
int intermediate_height = h + 7;
if (x_step_q4 != 16 || y_step_q4 != 16) {
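
The only change here tracks the updated vpx_ports/mem.h macro: DECLARE_ALIGNED_ARRAY(align, type, name, size) is replaced by DECLARE_ALIGNED(align, type, name[size]), which takes an ordinary array declarator. The surrounding function is the usual separable two-pass scheme; roughly, assuming the prototypes from the files above and vpx_ports/mem.h (a sketch, not the verbatim body):

    /* Filter horizontally into a 64x72 scratch buffer, h + 7 rows deep so
     * the vertical 8-tap phase has the 3 rows before and 4 rows after it
     * needs, then filter vertically from the scratch buffer into dst. */
    static void convolve8_two_pass_sketch(
        const uint8_t *src, ptrdiff_t src_stride,
        uint8_t *dst, ptrdiff_t dst_stride,
        const int16_t *filter_x, int x_step_q4,
        const int16_t *filter_y, int y_step_q4, int w, int h) {
      DECLARE_ALIGNED(8, uint8_t, temp[64 * 72]);
      const int intermediate_height = h + 7;
      vp9_convolve8_horiz_neon((uint8_t *)src - src_stride * 3, src_stride,
                               temp, 64, filter_x, x_step_q4,
                               filter_y, y_step_q4, w, intermediate_height);
      vp9_convolve8_vert_neon(temp + 64 * 3, 64, dst, dst_stride,
                              filter_x, x_step_q4, filter_y, y_step_q4, w, h);
    }
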
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_copy_neon.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_copy_neon.c
new file mode 100644
index 00000000000..f334abe1130
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_copy_neon.c
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stddef.h>
+#include <arm_neon.h>
+
+void vp9_convolve_copy_neon(
+ const uint8_t *src, // r0
+ ptrdiff_t src_stride, // r1
+ uint8_t *dst, // r2
+ ptrdiff_t dst_stride, // r3
+ const int16_t *filter_x,
+ int filter_x_stride,
+ const int16_t *filter_y,
+ int filter_y_stride,
+ int w,
+ int h) {
+ uint8x8_t d0u8, d2u8;
+ uint8x16_t q0u8, q1u8, q2u8, q3u8;
+ (void)filter_x; (void)filter_x_stride;
+ (void)filter_y; (void)filter_y_stride;
+
+ if (w > 32) { // copy64
+ for (; h > 0; h--) {
+ q0u8 = vld1q_u8(src);
+ q1u8 = vld1q_u8(src + 16);
+ q2u8 = vld1q_u8(src + 32);
+ q3u8 = vld1q_u8(src + 48);
+ src += src_stride;
+
+ vst1q_u8(dst, q0u8);
+ vst1q_u8(dst + 16, q1u8);
+ vst1q_u8(dst + 32, q2u8);
+ vst1q_u8(dst + 48, q3u8);
+ dst += dst_stride;
+ }
+ } else if (w == 32) { // copy32
+ for (; h > 0; h -= 2) {
+ q0u8 = vld1q_u8(src);
+ q1u8 = vld1q_u8(src + 16);
+ src += src_stride;
+ q2u8 = vld1q_u8(src);
+ q3u8 = vld1q_u8(src + 16);
+ src += src_stride;
+
+ vst1q_u8(dst, q0u8);
+ vst1q_u8(dst + 16, q1u8);
+ dst += dst_stride;
+ vst1q_u8(dst, q2u8);
+ vst1q_u8(dst + 16, q3u8);
+ dst += dst_stride;
+ }
+ } else if (w > 8) { // copy16
+ for (; h > 0; h -= 2) {
+ q0u8 = vld1q_u8(src);
+ src += src_stride;
+ q1u8 = vld1q_u8(src);
+ src += src_stride;
+
+ vst1q_u8(dst, q0u8);
+ dst += dst_stride;
+ vst1q_u8(dst, q1u8);
+ dst += dst_stride;
+ }
+ } else if (w == 8) { // copy8
+ for (; h > 0; h -= 2) {
+ d0u8 = vld1_u8(src);
+ src += src_stride;
+ d2u8 = vld1_u8(src);
+ src += src_stride;
+
+ vst1_u8(dst, d0u8);
+ dst += dst_stride;
+ vst1_u8(dst, d2u8);
+ dst += dst_stride;
+ }
+ } else { // copy4
+ for (; h > 0; h--) {
+ *(uint32_t *)dst = *(const uint32_t *)src;
+ src += src_stride;
+ dst += dst_stride;
+ }
+ }
+ return;
+}
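
The copy kernel buckets widths like the averaging kernel above: w > 32 handles 64-wide blocks with four q-register loads per row, the 32-, 16- and 8-wide cases unroll two rows per iteration, and only the 4-wide tail falls back to a plain 32-bit scalar copy per row (which assumes the rows are 4-byte aligned).
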
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_copy_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_copy_neon_asm.asm
index a0bd04a35f0..a0bd04a35f0 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_copy_neon.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_copy_neon_asm.asm
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_dc_only_idct_add_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_dc_only_idct_add_neon.asm
deleted file mode 100644
index 60a0d98c56c..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_dc_only_idct_add_neon.asm
+++ /dev/null
@@ -1,69 +0,0 @@
-;
-; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license and patent
-; grant that can be found in the LICENSE file in the root of the source
-; tree. All contributing project authors may be found in the AUTHORS
-; file in the root of the source tree.
-;
-
-
- EXPORT |vp9_dc_only_idct_add_neon|
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
-;void vp9_dc_only_idct_add_neon(int input_dc, uint8_t *pred_ptr,
-; uint8_t *dst_ptr, int pitch, int stride)
-;
-; r0 int input_dc
-; r1 uint8_t *pred_ptr
-; r2 uint8_t *dst_ptr
-; r3 int pitch
-; sp int stride
-
-|vp9_dc_only_idct_add_neon| PROC
-
- ; generate cospi_16_64 = 11585
- mov r12, #0x2d00
- add r12, #0x41
-
- ; dct_const_round_shift(input_dc * cospi_16_64)
- mul r0, r0, r12 ; input_dc * cospi_16_64
- add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1))
- asr r0, r0, #14 ; >> DCT_CONST_BITS
-
- ; dct_const_round_shift(out * cospi_16_64)
- mul r0, r0, r12 ; out * cospi_16_64
- add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1))
- asr r0, r0, #14 ; >> DCT_CONST_BITS
-
- ; ROUND_POWER_OF_TWO(out, 4)
- add r0, r0, #8 ; + (1 <<((4) - 1))
- asr r0, r0, #4 ; >> 4
-
- vdup.16 q0, r0; ; duplicate a1
- ldr r12, [sp] ; load stride
-
- vld1.32 {d2[0]}, [r1], r3
- vld1.32 {d2[1]}, [r1], r3
- vld1.32 {d4[0]}, [r1], r3
- vld1.32 {d4[1]}, [r1]
-
- vaddw.u8 q1, q0, d2 ; a1 + pred_ptr[c]
- vaddw.u8 q2, q0, d4
-
- vqmovun.s16 d2, q1 ; clip_pixel
- vqmovun.s16 d4, q2
-
- vst1.32 {d2[0]}, [r2], r12
- vst1.32 {d2[1]}, [r2], r12
- vst1.32 {d4[0]}, [r2], r12
- vst1.32 {d4[1]}, [r2]
-
- bx lr
- ENDP ; |vp9_dc_only_idct_add_neon|
-
- END
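
The removed file implemented the 4x4 DC-only reconstruction case directly in assembly; its arithmetic, spelled out by the comments above, is the scalar sequence below (a sketch whose helper names mirror vp9/common/vp9_idct.h, not code from this tree):

    #include <stdint.h>

    #define DCT_CONST_BITS 14
    static const int16_t kCospi16_64 = 11585;  /* round(cos(pi/4) * 2^14) */

    static int dct_const_round_shift_ref(int input) {
      return (input + (1 << (DCT_CONST_BITS - 1))) >> DCT_CONST_BITS;
    }

    static void dc_only_idct_add_ref(int input_dc, const uint8_t *pred,
                                     uint8_t *dst, int pitch, int stride) {
      int r, c;
      int out = dct_const_round_shift_ref(input_dc * kCospi16_64);
      int a1;
      out = dct_const_round_shift_ref(out * kCospi16_64);
      a1 = (out + 8) >> 4;  /* ROUND_POWER_OF_TWO(out, 4) */
      for (r = 0; r < 4; ++r) {
        for (c = 0; c < 4; ++c) {
          const int v = pred[c] + a1;
          dst[c] = (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);  /* clip_pixel */
        }
        pred += pitch;
        dst += stride;
      }
    }
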
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct16x16_1_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct16x16_1_add_neon.c
new file mode 100644
index 00000000000..3c8c6a9348d
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct16x16_1_add_neon.c
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include "vp9/common/vp9_idct.h"
+
+void vp9_idct16x16_1_add_neon(
+ int16_t *input,
+ uint8_t *dest,
+ int dest_stride) {
+ uint8x8_t d2u8, d3u8, d30u8, d31u8;
+ uint64x1_t d2u64, d3u64, d4u64, d5u64;
+ uint16x8_t q0u16, q9u16, q10u16, q11u16, q12u16;
+ int16x8_t q0s16;
+ uint8_t *d1, *d2;
+ int16_t i, j, a1, cospi_16_64 = 11585;
+ int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
+ out = dct_const_round_shift(out * cospi_16_64);
+ a1 = ROUND_POWER_OF_TWO(out, 6);
+
+ q0s16 = vdupq_n_s16(a1);
+ q0u16 = vreinterpretq_u16_s16(q0s16);
+
+ for (d1 = d2 = dest, i = 0; i < 4; i++) {
+ for (j = 0; j < 2; j++) {
+ d2u64 = vld1_u64((const uint64_t *)d1);
+ d3u64 = vld1_u64((const uint64_t *)(d1 + 8));
+ d1 += dest_stride;
+ d4u64 = vld1_u64((const uint64_t *)d1);
+ d5u64 = vld1_u64((const uint64_t *)(d1 + 8));
+ d1 += dest_stride;
+
+ q9u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d2u64));
+ q10u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d3u64));
+ q11u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d4u64));
+ q12u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d5u64));
+
+ d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+ d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
+ d30u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
+ d31u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
+
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
+ vst1_u64((uint64_t *)(d2 + 8), vreinterpret_u64_u8(d3u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d30u8));
+ vst1_u64((uint64_t *)(d2 + 8), vreinterpret_u64_u8(d31u8));
+ d2 += dest_stride;
+ }
+ }
+ return;
+}
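
Since cospi_16_64 = 11585 ≈ 2^14 · cos(π/4) and cos²(π/4) = 1/2, the two dct_const_round_shift calls scale the DC coefficient by roughly one half, and ROUND_POWER_OF_TWO(out, 6) supplies the remaining 1/64 normalisation of the 16x16 inverse transform (versus the shift of 4 in the removed 4x4 path above). For example, input[0] = 1024 gives out = 724, then 512, then a1 = 8, and the loop body adds 8 to all 256 destination pixels via vaddw_u8 plus the saturating narrow vqmovun_s16.
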
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct16x16_1_add_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct16x16_1_add_neon_asm.asm
index b1fd21bb61f..b1fd21bb61f 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct16x16_1_add_neon.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct16x16_1_add_neon_asm.asm
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct16x16_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct16x16_add_neon.c
new file mode 100644
index 00000000000..5fa3f5c017b
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct16x16_add_neon.c
@@ -0,0 +1,1332 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+
+static int16_t cospi_2_64 = 16305;
+static int16_t cospi_4_64 = 16069;
+static int16_t cospi_6_64 = 15679;
+static int16_t cospi_8_64 = 15137;
+static int16_t cospi_10_64 = 14449;
+static int16_t cospi_12_64 = 13623;
+static int16_t cospi_14_64 = 12665;
+static int16_t cospi_16_64 = 11585;
+static int16_t cospi_18_64 = 10394;
+static int16_t cospi_20_64 = 9102;
+static int16_t cospi_22_64 = 7723;
+static int16_t cospi_24_64 = 6270;
+static int16_t cospi_26_64 = 4756;
+static int16_t cospi_28_64 = 3196;
+static int16_t cospi_30_64 = 1606;
+
+static INLINE void TRANSPOSE8X8(
+ int16x8_t *q8s16,
+ int16x8_t *q9s16,
+ int16x8_t *q10s16,
+ int16x8_t *q11s16,
+ int16x8_t *q12s16,
+ int16x8_t *q13s16,
+ int16x8_t *q14s16,
+ int16x8_t *q15s16) {
+ int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
+ int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
+ int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32;
+ int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16;
+
+ d16s16 = vget_low_s16(*q8s16);
+ d17s16 = vget_high_s16(*q8s16);
+ d18s16 = vget_low_s16(*q9s16);
+ d19s16 = vget_high_s16(*q9s16);
+ d20s16 = vget_low_s16(*q10s16);
+ d21s16 = vget_high_s16(*q10s16);
+ d22s16 = vget_low_s16(*q11s16);
+ d23s16 = vget_high_s16(*q11s16);
+ d24s16 = vget_low_s16(*q12s16);
+ d25s16 = vget_high_s16(*q12s16);
+ d26s16 = vget_low_s16(*q13s16);
+ d27s16 = vget_high_s16(*q13s16);
+ d28s16 = vget_low_s16(*q14s16);
+ d29s16 = vget_high_s16(*q14s16);
+ d30s16 = vget_low_s16(*q15s16);
+ d31s16 = vget_high_s16(*q15s16);
+
+ *q8s16 = vcombine_s16(d16s16, d24s16); // vswp d17, d24
+ *q9s16 = vcombine_s16(d18s16, d26s16); // vswp d19, d26
+ *q10s16 = vcombine_s16(d20s16, d28s16); // vswp d21, d28
+ *q11s16 = vcombine_s16(d22s16, d30s16); // vswp d23, d30
+ *q12s16 = vcombine_s16(d17s16, d25s16);
+ *q13s16 = vcombine_s16(d19s16, d27s16);
+ *q14s16 = vcombine_s16(d21s16, d29s16);
+ *q15s16 = vcombine_s16(d23s16, d31s16);
+
+ q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q8s16),
+ vreinterpretq_s32_s16(*q10s16));
+ q1x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q9s16),
+ vreinterpretq_s32_s16(*q11s16));
+ q2x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q12s16),
+ vreinterpretq_s32_s16(*q14s16));
+ q3x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q13s16),
+ vreinterpretq_s32_s16(*q15s16));
+
+ q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]), // q8
+ vreinterpretq_s16_s32(q1x2s32.val[0])); // q9
+ q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]), // q10
+ vreinterpretq_s16_s32(q1x2s32.val[1])); // q11
+ q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]), // q12
+ vreinterpretq_s16_s32(q3x2s32.val[0])); // q13
+ q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]), // q14
+ vreinterpretq_s16_s32(q3x2s32.val[1])); // q15
+
+ *q8s16 = q0x2s16.val[0];
+ *q9s16 = q0x2s16.val[1];
+ *q10s16 = q1x2s16.val[0];
+ *q11s16 = q1x2s16.val[1];
+ *q12s16 = q2x2s16.val[0];
+ *q13s16 = q2x2s16.val[1];
+ *q14s16 = q3x2s16.val[0];
+ *q15s16 = q3x2s16.val[1];
+ return;
+}
+
+void vp9_idct16x16_256_add_neon_pass1(
+ int16_t *in,
+ int16_t *out,
+ int output_stride) {
+ int16x4_t d0s16, d1s16, d2s16, d3s16;
+ int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
+ int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
+ int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
+ uint64x1_t d16u64, d17u64, d18u64, d19u64, d20u64, d21u64, d22u64, d23u64;
+ uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64;
+ int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
+ int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
+ int32x4_t q0s32, q1s32, q2s32, q3s32, q5s32, q6s32, q9s32;
+ int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32;
+ int16x8x2_t q0x2s16;
+
+ q0x2s16 = vld2q_s16(in);
+ q8s16 = q0x2s16.val[0];
+ in += 16;
+ q0x2s16 = vld2q_s16(in);
+ q9s16 = q0x2s16.val[0];
+ in += 16;
+ q0x2s16 = vld2q_s16(in);
+ q10s16 = q0x2s16.val[0];
+ in += 16;
+ q0x2s16 = vld2q_s16(in);
+ q11s16 = q0x2s16.val[0];
+ in += 16;
+ q0x2s16 = vld2q_s16(in);
+ q12s16 = q0x2s16.val[0];
+ in += 16;
+ q0x2s16 = vld2q_s16(in);
+ q13s16 = q0x2s16.val[0];
+ in += 16;
+ q0x2s16 = vld2q_s16(in);
+ q14s16 = q0x2s16.val[0];
+ in += 16;
+ q0x2s16 = vld2q_s16(in);
+ q15s16 = q0x2s16.val[0];
+
+ TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
+ &q12s16, &q13s16, &q14s16, &q15s16);
+
+ d16s16 = vget_low_s16(q8s16);
+ d17s16 = vget_high_s16(q8s16);
+ d18s16 = vget_low_s16(q9s16);
+ d19s16 = vget_high_s16(q9s16);
+ d20s16 = vget_low_s16(q10s16);
+ d21s16 = vget_high_s16(q10s16);
+ d22s16 = vget_low_s16(q11s16);
+ d23s16 = vget_high_s16(q11s16);
+ d24s16 = vget_low_s16(q12s16);
+ d25s16 = vget_high_s16(q12s16);
+ d26s16 = vget_low_s16(q13s16);
+ d27s16 = vget_high_s16(q13s16);
+ d28s16 = vget_low_s16(q14s16);
+ d29s16 = vget_high_s16(q14s16);
+ d30s16 = vget_low_s16(q15s16);
+ d31s16 = vget_high_s16(q15s16);
+
+ // stage 3
+ d0s16 = vdup_n_s16(cospi_28_64);
+ d1s16 = vdup_n_s16(cospi_4_64);
+
+ q2s32 = vmull_s16(d18s16, d0s16);
+ q3s32 = vmull_s16(d19s16, d0s16);
+ q5s32 = vmull_s16(d18s16, d1s16);
+ q6s32 = vmull_s16(d19s16, d1s16);
+
+ q2s32 = vmlsl_s16(q2s32, d30s16, d1s16);
+ q3s32 = vmlsl_s16(q3s32, d31s16, d1s16);
+ q5s32 = vmlal_s16(q5s32, d30s16, d0s16);
+ q6s32 = vmlal_s16(q6s32, d31s16, d0s16);
+
+ d2s16 = vdup_n_s16(cospi_12_64);
+ d3s16 = vdup_n_s16(cospi_20_64);
+
+ d8s16 = vqrshrn_n_s32(q2s32, 14);
+ d9s16 = vqrshrn_n_s32(q3s32, 14);
+ d14s16 = vqrshrn_n_s32(q5s32, 14);
+ d15s16 = vqrshrn_n_s32(q6s32, 14);
+ q4s16 = vcombine_s16(d8s16, d9s16);
+ q7s16 = vcombine_s16(d14s16, d15s16);
+
+ q2s32 = vmull_s16(d26s16, d2s16);
+ q3s32 = vmull_s16(d27s16, d2s16);
+ q9s32 = vmull_s16(d26s16, d3s16);
+ q15s32 = vmull_s16(d27s16, d3s16);
+
+ q2s32 = vmlsl_s16(q2s32, d22s16, d3s16);
+ q3s32 = vmlsl_s16(q3s32, d23s16, d3s16);
+ q9s32 = vmlal_s16(q9s32, d22s16, d2s16);
+ q15s32 = vmlal_s16(q15s32, d23s16, d2s16);
+
+ d10s16 = vqrshrn_n_s32(q2s32, 14);
+ d11s16 = vqrshrn_n_s32(q3s32, 14);
+ d12s16 = vqrshrn_n_s32(q9s32, 14);
+ d13s16 = vqrshrn_n_s32(q15s32, 14);
+ q5s16 = vcombine_s16(d10s16, d11s16);
+ q6s16 = vcombine_s16(d12s16, d13s16);
+
+ // stage 4
+ d30s16 = vdup_n_s16(cospi_16_64);
+
+ q2s32 = vmull_s16(d16s16, d30s16);
+ q11s32 = vmull_s16(d17s16, d30s16);
+ q0s32 = vmull_s16(d24s16, d30s16);
+ q1s32 = vmull_s16(d25s16, d30s16);
+
+ d30s16 = vdup_n_s16(cospi_24_64);
+ d31s16 = vdup_n_s16(cospi_8_64);
+
+ q3s32 = vaddq_s32(q2s32, q0s32);
+ q12s32 = vaddq_s32(q11s32, q1s32);
+ q13s32 = vsubq_s32(q2s32, q0s32);
+ q1s32 = vsubq_s32(q11s32, q1s32);
+
+ d16s16 = vqrshrn_n_s32(q3s32, 14);
+ d17s16 = vqrshrn_n_s32(q12s32, 14);
+ d18s16 = vqrshrn_n_s32(q13s32, 14);
+ d19s16 = vqrshrn_n_s32(q1s32, 14);
+ q8s16 = vcombine_s16(d16s16, d17s16);
+ q9s16 = vcombine_s16(d18s16, d19s16);
+
+ q0s32 = vmull_s16(d20s16, d31s16);
+ q1s32 = vmull_s16(d21s16, d31s16);
+ q12s32 = vmull_s16(d20s16, d30s16);
+ q13s32 = vmull_s16(d21s16, d30s16);
+
+ q0s32 = vmlal_s16(q0s32, d28s16, d30s16);
+ q1s32 = vmlal_s16(q1s32, d29s16, d30s16);
+ q12s32 = vmlsl_s16(q12s32, d28s16, d31s16);
+ q13s32 = vmlsl_s16(q13s32, d29s16, d31s16);
+
+ d22s16 = vqrshrn_n_s32(q0s32, 14);
+ d23s16 = vqrshrn_n_s32(q1s32, 14);
+ d20s16 = vqrshrn_n_s32(q12s32, 14);
+ d21s16 = vqrshrn_n_s32(q13s32, 14);
+ q10s16 = vcombine_s16(d20s16, d21s16);
+ q11s16 = vcombine_s16(d22s16, d23s16);
+
+ q13s16 = vsubq_s16(q4s16, q5s16);
+ q4s16 = vaddq_s16(q4s16, q5s16);
+ q14s16 = vsubq_s16(q7s16, q6s16);
+ q15s16 = vaddq_s16(q6s16, q7s16);
+ d26s16 = vget_low_s16(q13s16);
+ d27s16 = vget_high_s16(q13s16);
+ d28s16 = vget_low_s16(q14s16);
+ d29s16 = vget_high_s16(q14s16);
+
+ // stage 5
+ q0s16 = vaddq_s16(q8s16, q11s16);
+ q1s16 = vaddq_s16(q9s16, q10s16);
+ q2s16 = vsubq_s16(q9s16, q10s16);
+ q3s16 = vsubq_s16(q8s16, q11s16);
+
+ d16s16 = vdup_n_s16(cospi_16_64);
+
+ q11s32 = vmull_s16(d26s16, d16s16);
+ q12s32 = vmull_s16(d27s16, d16s16);
+ q9s32 = vmull_s16(d28s16, d16s16);
+ q10s32 = vmull_s16(d29s16, d16s16);
+
+ q6s32 = vsubq_s32(q9s32, q11s32);
+ q13s32 = vsubq_s32(q10s32, q12s32);
+ q9s32 = vaddq_s32(q9s32, q11s32);
+ q10s32 = vaddq_s32(q10s32, q12s32);
+
+ d10s16 = vqrshrn_n_s32(q6s32, 14);
+ d11s16 = vqrshrn_n_s32(q13s32, 14);
+ d12s16 = vqrshrn_n_s32(q9s32, 14);
+ d13s16 = vqrshrn_n_s32(q10s32, 14);
+ q5s16 = vcombine_s16(d10s16, d11s16);
+ q6s16 = vcombine_s16(d12s16, d13s16);
+
+ // stage 6
+ q8s16 = vaddq_s16(q0s16, q15s16);
+ q9s16 = vaddq_s16(q1s16, q6s16);
+ q10s16 = vaddq_s16(q2s16, q5s16);
+ q11s16 = vaddq_s16(q3s16, q4s16);
+ q12s16 = vsubq_s16(q3s16, q4s16);
+ q13s16 = vsubq_s16(q2s16, q5s16);
+ q14s16 = vsubq_s16(q1s16, q6s16);
+ q15s16 = vsubq_s16(q0s16, q15s16);
+
+ d16u64 = vreinterpret_u64_s16(vget_low_s16(q8s16));
+ d17u64 = vreinterpret_u64_s16(vget_high_s16(q8s16));
+ d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16));
+ d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16));
+ d20u64 = vreinterpret_u64_s16(vget_low_s16(q10s16));
+ d21u64 = vreinterpret_u64_s16(vget_high_s16(q10s16));
+ d22u64 = vreinterpret_u64_s16(vget_low_s16(q11s16));
+ d23u64 = vreinterpret_u64_s16(vget_high_s16(q11s16));
+ d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
+ d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
+ d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
+ d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
+ d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16));
+ d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16));
+ d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16));
+ d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16));
+
+ // store the data
+ output_stride >>= 1; // output_stride / 2, out is int16_t
+ vst1_u64((uint64_t *)out, d16u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d17u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d18u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d19u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d20u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d21u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d22u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d23u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d24u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d25u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d26u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d27u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d28u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d29u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d30u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d31u64);
+ return;
+}
+
+void vp9_idct16x16_256_add_neon_pass2(
+ int16_t *src,
+ int16_t *out,
+ int16_t *pass1Output,
+ int16_t skip_adding,
+ uint8_t *dest,
+ int dest_stride) {
+ uint8_t *d;
+ uint8x8_t d12u8, d13u8;
+ int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
+ int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
+ int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
+ int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
+ uint64x1_t d24u64, d25u64, d26u64, d27u64;
+ int64x1_t d12s64, d13s64;
+ uint16x8_t q2u16, q3u16, q4u16, q5u16, q8u16;
+ uint16x8_t q9u16, q12u16, q13u16, q14u16, q15u16;
+ int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
+ int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
+ int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q8s32, q9s32;
+ int32x4_t q10s32, q11s32, q12s32, q13s32;
+ int16x8x2_t q0x2s16;
+
+ q0x2s16 = vld2q_s16(src);
+ q8s16 = q0x2s16.val[0];
+ src += 16;
+ q0x2s16 = vld2q_s16(src);
+ q9s16 = q0x2s16.val[0];
+ src += 16;
+ q0x2s16 = vld2q_s16(src);
+ q10s16 = q0x2s16.val[0];
+ src += 16;
+ q0x2s16 = vld2q_s16(src);
+ q11s16 = q0x2s16.val[0];
+ src += 16;
+ q0x2s16 = vld2q_s16(src);
+ q12s16 = q0x2s16.val[0];
+ src += 16;
+ q0x2s16 = vld2q_s16(src);
+ q13s16 = q0x2s16.val[0];
+ src += 16;
+ q0x2s16 = vld2q_s16(src);
+ q14s16 = q0x2s16.val[0];
+ src += 16;
+ q0x2s16 = vld2q_s16(src);
+ q15s16 = q0x2s16.val[0];
+
+ TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
+ &q12s16, &q13s16, &q14s16, &q15s16);
+
+ d16s16 = vget_low_s16(q8s16);
+ d17s16 = vget_high_s16(q8s16);
+ d18s16 = vget_low_s16(q9s16);
+ d19s16 = vget_high_s16(q9s16);
+ d20s16 = vget_low_s16(q10s16);
+ d21s16 = vget_high_s16(q10s16);
+ d22s16 = vget_low_s16(q11s16);
+ d23s16 = vget_high_s16(q11s16);
+ d24s16 = vget_low_s16(q12s16);
+ d25s16 = vget_high_s16(q12s16);
+ d26s16 = vget_low_s16(q13s16);
+ d27s16 = vget_high_s16(q13s16);
+ d28s16 = vget_low_s16(q14s16);
+ d29s16 = vget_high_s16(q14s16);
+ d30s16 = vget_low_s16(q15s16);
+ d31s16 = vget_high_s16(q15s16);
+
+ // stage 3
+ d12s16 = vdup_n_s16(cospi_30_64);
+ d13s16 = vdup_n_s16(cospi_2_64);
+
+ q2s32 = vmull_s16(d16s16, d12s16);
+ q3s32 = vmull_s16(d17s16, d12s16);
+ q1s32 = vmull_s16(d16s16, d13s16);
+ q4s32 = vmull_s16(d17s16, d13s16);
+
+ q2s32 = vmlsl_s16(q2s32, d30s16, d13s16);
+ q3s32 = vmlsl_s16(q3s32, d31s16, d13s16);
+ q1s32 = vmlal_s16(q1s32, d30s16, d12s16);
+ q4s32 = vmlal_s16(q4s32, d31s16, d12s16);
+
+ d0s16 = vqrshrn_n_s32(q2s32, 14);
+ d1s16 = vqrshrn_n_s32(q3s32, 14);
+ d14s16 = vqrshrn_n_s32(q1s32, 14);
+ d15s16 = vqrshrn_n_s32(q4s32, 14);
+ q0s16 = vcombine_s16(d0s16, d1s16);
+ q7s16 = vcombine_s16(d14s16, d15s16);
+
+ d30s16 = vdup_n_s16(cospi_14_64);
+ d31s16 = vdup_n_s16(cospi_18_64);
+
+ q2s32 = vmull_s16(d24s16, d30s16);
+ q3s32 = vmull_s16(d25s16, d30s16);
+ q4s32 = vmull_s16(d24s16, d31s16);
+ q5s32 = vmull_s16(d25s16, d31s16);
+
+ q2s32 = vmlsl_s16(q2s32, d22s16, d31s16);
+ q3s32 = vmlsl_s16(q3s32, d23s16, d31s16);
+ q4s32 = vmlal_s16(q4s32, d22s16, d30s16);
+ q5s32 = vmlal_s16(q5s32, d23s16, d30s16);
+
+ d2s16 = vqrshrn_n_s32(q2s32, 14);
+ d3s16 = vqrshrn_n_s32(q3s32, 14);
+ d12s16 = vqrshrn_n_s32(q4s32, 14);
+ d13s16 = vqrshrn_n_s32(q5s32, 14);
+ q1s16 = vcombine_s16(d2s16, d3s16);
+ q6s16 = vcombine_s16(d12s16, d13s16);
+
+ d30s16 = vdup_n_s16(cospi_22_64);
+ d31s16 = vdup_n_s16(cospi_10_64);
+
+ q11s32 = vmull_s16(d20s16, d30s16);
+ q12s32 = vmull_s16(d21s16, d30s16);
+ q4s32 = vmull_s16(d20s16, d31s16);
+ q5s32 = vmull_s16(d21s16, d31s16);
+
+ q11s32 = vmlsl_s16(q11s32, d26s16, d31s16);
+ q12s32 = vmlsl_s16(q12s32, d27s16, d31s16);
+ q4s32 = vmlal_s16(q4s32, d26s16, d30s16);
+ q5s32 = vmlal_s16(q5s32, d27s16, d30s16);
+
+ d4s16 = vqrshrn_n_s32(q11s32, 14);
+ d5s16 = vqrshrn_n_s32(q12s32, 14);
+ d11s16 = vqrshrn_n_s32(q5s32, 14);
+ d10s16 = vqrshrn_n_s32(q4s32, 14);
+ q2s16 = vcombine_s16(d4s16, d5s16);
+ q5s16 = vcombine_s16(d10s16, d11s16);
+
+ d30s16 = vdup_n_s16(cospi_6_64);
+ d31s16 = vdup_n_s16(cospi_26_64);
+
+ q10s32 = vmull_s16(d28s16, d30s16);
+ q11s32 = vmull_s16(d29s16, d30s16);
+ q12s32 = vmull_s16(d28s16, d31s16);
+ q13s32 = vmull_s16(d29s16, d31s16);
+
+ q10s32 = vmlsl_s16(q10s32, d18s16, d31s16);
+ q11s32 = vmlsl_s16(q11s32, d19s16, d31s16);
+ q12s32 = vmlal_s16(q12s32, d18s16, d30s16);
+ q13s32 = vmlal_s16(q13s32, d19s16, d30s16);
+
+ d6s16 = vqrshrn_n_s32(q10s32, 14);
+ d7s16 = vqrshrn_n_s32(q11s32, 14);
+ d8s16 = vqrshrn_n_s32(q12s32, 14);
+ d9s16 = vqrshrn_n_s32(q13s32, 14);
+ q3s16 = vcombine_s16(d6s16, d7s16);
+ q4s16 = vcombine_s16(d8s16, d9s16);
+
+ // stage 3
+ q9s16 = vsubq_s16(q0s16, q1s16);
+ q0s16 = vaddq_s16(q0s16, q1s16);
+ q10s16 = vsubq_s16(q3s16, q2s16);
+ q11s16 = vaddq_s16(q2s16, q3s16);
+ q12s16 = vaddq_s16(q4s16, q5s16);
+ q13s16 = vsubq_s16(q4s16, q5s16);
+ q14s16 = vsubq_s16(q7s16, q6s16);
+ q7s16 = vaddq_s16(q6s16, q7s16);
+
+ // stage 4
+ d18s16 = vget_low_s16(q9s16);
+ d19s16 = vget_high_s16(q9s16);
+ d20s16 = vget_low_s16(q10s16);
+ d21s16 = vget_high_s16(q10s16);
+ d26s16 = vget_low_s16(q13s16);
+ d27s16 = vget_high_s16(q13s16);
+ d28s16 = vget_low_s16(q14s16);
+ d29s16 = vget_high_s16(q14s16);
+
+ d30s16 = vdup_n_s16(cospi_8_64);
+ d31s16 = vdup_n_s16(cospi_24_64);
+
+ q2s32 = vmull_s16(d18s16, d31s16);
+ q3s32 = vmull_s16(d19s16, d31s16);
+ q4s32 = vmull_s16(d28s16, d31s16);
+ q5s32 = vmull_s16(d29s16, d31s16);
+
+ q2s32 = vmlal_s16(q2s32, d28s16, d30s16);
+ q3s32 = vmlal_s16(q3s32, d29s16, d30s16);
+ q4s32 = vmlsl_s16(q4s32, d18s16, d30s16);
+ q5s32 = vmlsl_s16(q5s32, d19s16, d30s16);
+
+ d12s16 = vqrshrn_n_s32(q2s32, 14);
+ d13s16 = vqrshrn_n_s32(q3s32, 14);
+ d2s16 = vqrshrn_n_s32(q4s32, 14);
+ d3s16 = vqrshrn_n_s32(q5s32, 14);
+ q1s16 = vcombine_s16(d2s16, d3s16);
+ q6s16 = vcombine_s16(d12s16, d13s16);
+
+ q3s16 = q11s16;
+ q4s16 = q12s16;
+
+ d30s16 = vdup_n_s16(-cospi_8_64);
+ q11s32 = vmull_s16(d26s16, d30s16);
+ q12s32 = vmull_s16(d27s16, d30s16);
+ q8s32 = vmull_s16(d20s16, d30s16);
+ q9s32 = vmull_s16(d21s16, d30s16);
+
+ q11s32 = vmlsl_s16(q11s32, d20s16, d31s16);
+ q12s32 = vmlsl_s16(q12s32, d21s16, d31s16);
+ q8s32 = vmlal_s16(q8s32, d26s16, d31s16);
+ q9s32 = vmlal_s16(q9s32, d27s16, d31s16);
+
+ d4s16 = vqrshrn_n_s32(q11s32, 14);
+ d5s16 = vqrshrn_n_s32(q12s32, 14);
+ d10s16 = vqrshrn_n_s32(q8s32, 14);
+ d11s16 = vqrshrn_n_s32(q9s32, 14);
+ q2s16 = vcombine_s16(d4s16, d5s16);
+ q5s16 = vcombine_s16(d10s16, d11s16);
+
+ // stage 5
+ q8s16 = vaddq_s16(q0s16, q3s16);
+ q9s16 = vaddq_s16(q1s16, q2s16);
+ q10s16 = vsubq_s16(q1s16, q2s16);
+ q11s16 = vsubq_s16(q0s16, q3s16);
+ q12s16 = vsubq_s16(q7s16, q4s16);
+ q13s16 = vsubq_s16(q6s16, q5s16);
+ q14s16 = vaddq_s16(q6s16, q5s16);
+ q15s16 = vaddq_s16(q7s16, q4s16);
+
+ // stage 6
+ d20s16 = vget_low_s16(q10s16);
+ d21s16 = vget_high_s16(q10s16);
+ d22s16 = vget_low_s16(q11s16);
+ d23s16 = vget_high_s16(q11s16);
+ d24s16 = vget_low_s16(q12s16);
+ d25s16 = vget_high_s16(q12s16);
+ d26s16 = vget_low_s16(q13s16);
+ d27s16 = vget_high_s16(q13s16);
+
+ d14s16 = vdup_n_s16(cospi_16_64);
+
+ q3s32 = vmull_s16(d26s16, d14s16);
+ q4s32 = vmull_s16(d27s16, d14s16);
+ q0s32 = vmull_s16(d20s16, d14s16);
+ q1s32 = vmull_s16(d21s16, d14s16);
+
+ q5s32 = vsubq_s32(q3s32, q0s32);
+ q6s32 = vsubq_s32(q4s32, q1s32);
+ q10s32 = vaddq_s32(q3s32, q0s32);
+ q4s32 = vaddq_s32(q4s32, q1s32);
+
+ d4s16 = vqrshrn_n_s32(q5s32, 14);
+ d5s16 = vqrshrn_n_s32(q6s32, 14);
+ d10s16 = vqrshrn_n_s32(q10s32, 14);
+ d11s16 = vqrshrn_n_s32(q4s32, 14);
+ q2s16 = vcombine_s16(d4s16, d5s16);
+ q5s16 = vcombine_s16(d10s16, d11s16);
+
+ q0s32 = vmull_s16(d22s16, d14s16);
+ q1s32 = vmull_s16(d23s16, d14s16);
+ q13s32 = vmull_s16(d24s16, d14s16);
+ q6s32 = vmull_s16(d25s16, d14s16);
+
+ q10s32 = vsubq_s32(q13s32, q0s32);
+ q4s32 = vsubq_s32(q6s32, q1s32);
+ q13s32 = vaddq_s32(q13s32, q0s32);
+ q6s32 = vaddq_s32(q6s32, q1s32);
+
+ d6s16 = vqrshrn_n_s32(q10s32, 14);
+ d7s16 = vqrshrn_n_s32(q4s32, 14);
+ d8s16 = vqrshrn_n_s32(q13s32, 14);
+ d9s16 = vqrshrn_n_s32(q6s32, 14);
+ q3s16 = vcombine_s16(d6s16, d7s16);
+ q4s16 = vcombine_s16(d8s16, d9s16);
+
+ // stage 7
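+ // When skip_adding != 0 this is the final (column) pass: combine with
+ // the rows saved in pass 1, round by 6 bits and accumulate into dest.
+ // Otherwise the result is written to the intermediate out buffer.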
+ if (skip_adding != 0) {
+ d = dest;
+ // load the data produced in pass 1
+ q0s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ q1s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ d12s64 = vld1_s64((int64_t *)dest);
+ dest += dest_stride;
+ d13s64 = vld1_s64((int64_t *)dest);
+ dest += dest_stride;
+
+ q12s16 = vaddq_s16(q0s16, q15s16);
+ q13s16 = vaddq_s16(q1s16, q14s16);
+ q12s16 = vrshrq_n_s16(q12s16, 6);
+ q13s16 = vrshrq_n_s16(q13s16, 6);
+ q12u16 = vaddw_u8(vreinterpretq_u16_s16(q12s16),
+ vreinterpret_u8_s64(d12s64));
+ q13u16 = vaddw_u8(vreinterpretq_u16_s16(q13s16),
+ vreinterpret_u8_s64(d13s64));
+ d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
+ d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
+ vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+ d += dest_stride;
+ vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8));
+ d += dest_stride;
+ q14s16 = vsubq_s16(q1s16, q14s16);
+ q15s16 = vsubq_s16(q0s16, q15s16);
+
+ q10s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ q11s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ d12s64 = vld1_s64((int64_t *)dest);
+ dest += dest_stride;
+ d13s64 = vld1_s64((int64_t *)dest);
+ dest += dest_stride;
+ q12s16 = vaddq_s16(q10s16, q5s16);
+ q13s16 = vaddq_s16(q11s16, q4s16);
+ q12s16 = vrshrq_n_s16(q12s16, 6);
+ q13s16 = vrshrq_n_s16(q13s16, 6);
+ q12u16 = vaddw_u8(vreinterpretq_u16_s16(q12s16),
+ vreinterpret_u8_s64(d12s64));
+ q13u16 = vaddw_u8(vreinterpretq_u16_s16(q13s16),
+ vreinterpret_u8_s64(d13s64));
+ d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
+ d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
+ vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+ d += dest_stride;
+ vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8));
+ d += dest_stride;
+ q4s16 = vsubq_s16(q11s16, q4s16);
+ q5s16 = vsubq_s16(q10s16, q5s16);
+
+ q0s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ q1s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ d12s64 = vld1_s64((int64_t *)dest);
+ dest += dest_stride;
+ d13s64 = vld1_s64((int64_t *)dest);
+ dest += dest_stride;
+ q12s16 = vaddq_s16(q0s16, q3s16);
+ q13s16 = vaddq_s16(q1s16, q2s16);
+ q12s16 = vrshrq_n_s16(q12s16, 6);
+ q13s16 = vrshrq_n_s16(q13s16, 6);
+ q12u16 = vaddw_u8(vreinterpretq_u16_s16(q12s16),
+ vreinterpret_u8_s64(d12s64));
+ q13u16 = vaddw_u8(vreinterpretq_u16_s16(q13s16),
+ vreinterpret_u8_s64(d13s64));
+ d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
+ d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
+ vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+ d += dest_stride;
+ vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8));
+ d += dest_stride;
+ q2s16 = vsubq_s16(q1s16, q2s16);
+ q3s16 = vsubq_s16(q0s16, q3s16);
+
+ q10s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ q11s16 = vld1q_s16(pass1Output);
+ d12s64 = vld1_s64((int64_t *)dest);
+ dest += dest_stride;
+ d13s64 = vld1_s64((int64_t *)dest);
+ dest += dest_stride;
+ q12s16 = vaddq_s16(q10s16, q9s16);
+ q13s16 = vaddq_s16(q11s16, q8s16);
+ q12s16 = vrshrq_n_s16(q12s16, 6);
+ q13s16 = vrshrq_n_s16(q13s16, 6);
+ q12u16 = vaddw_u8(vreinterpretq_u16_s16(q12s16),
+ vreinterpret_u8_s64(d12s64));
+ q13u16 = vaddw_u8(vreinterpretq_u16_s16(q13s16),
+ vreinterpret_u8_s64(d13s64));
+ d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
+ d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16));
+ vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+ d += dest_stride;
+ vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8));
+ d += dest_stride;
+ q8s16 = vsubq_s16(q11s16, q8s16);
+ q9s16 = vsubq_s16(q10s16, q9s16);
+
+ // add and store output rows 8,9,10,11,12,13,14,15
+ d12s64 = vld1_s64((int64_t *)dest);
+ dest += dest_stride;
+ q8s16 = vrshrq_n_s16(q8s16, 6);
+ q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16),
+ vreinterpret_u8_s64(d12s64));
+ d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
+ vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+ d += dest_stride;
+
+ d12s64 = vld1_s64((int64_t *)dest);
+ dest += dest_stride;
+ q9s16 = vrshrq_n_s16(q9s16, 6);
+ q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16),
+ vreinterpret_u8_s64(d12s64));
+ d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+ vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+ d += dest_stride;
+
+ d12s64 = vld1_s64((int64_t *)dest);
+ dest += dest_stride;
+ q2s16 = vrshrq_n_s16(q2s16, 6);
+ q2u16 = vaddw_u8(vreinterpretq_u16_s16(q2s16),
+ vreinterpret_u8_s64(d12s64));
+ d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q2u16));
+ vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+ d += dest_stride;
+
+ d12s64 = vld1_s64((int64_t *)dest);
+ dest += dest_stride;
+ q3s16 = vrshrq_n_s16(q3s16, 6);
+ q3u16 = vaddw_u8(vreinterpretq_u16_s16(q3s16),
+ vreinterpret_u8_s64(d12s64));
+ d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q3u16));
+ vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+ d += dest_stride;
+
+ d12s64 = vld1_s64((int64_t *)dest);
+ dest += dest_stride;
+ q4s16 = vrshrq_n_s16(q4s16, 6);
+ q4u16 = vaddw_u8(vreinterpretq_u16_s16(q4s16),
+ vreinterpret_u8_s64(d12s64));
+ d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q4u16));
+ vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+ d += dest_stride;
+
+ d12s64 = vld1_s64((int64_t *)dest);
+ dest += dest_stride;
+ q5s16 = vrshrq_n_s16(q5s16, 6);
+ q5u16 = vaddw_u8(vreinterpretq_u16_s16(q5s16),
+ vreinterpret_u8_s64(d12s64));
+ d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q5u16));
+ vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+ d += dest_stride;
+
+ d12s64 = vld1_s64((int64_t *)dest);
+ dest += dest_stride;
+ q14s16 = vrshrq_n_s16(q14s16, 6);
+ q14u16 = vaddw_u8(vreinterpretq_u16_s16(q14s16),
+ vreinterpret_u8_s64(d12s64));
+ d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q14u16));
+ vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+ d += dest_stride;
+
+ d12s64 = vld1_s64((int64_t *)dest);
+ q15s16 = vrshrq_n_s16(q15s16, 6);
+ q15u16 = vaddw_u8(vreinterpretq_u16_s16(q15s16),
+ vreinterpret_u8_s64(d12s64));
+ d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q15u16));
+ vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8));
+ } else { // skip_adding_dest
+ q0s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ q1s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ q12s16 = vaddq_s16(q0s16, q15s16);
+ q13s16 = vaddq_s16(q1s16, q14s16);
+ d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
+ d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
+ d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
+ d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
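+ // two 4-wide stores lay down 8 contiguous int16s; the trailing
+ // "out += 12" then jumps to the same position in the next 16-int16 row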
+ vst1_u64((uint64_t *)out, d24u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d25u64);
+ out += 12;
+ vst1_u64((uint64_t *)out, d26u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d27u64);
+ out += 12;
+ q14s16 = vsubq_s16(q1s16, q14s16);
+ q15s16 = vsubq_s16(q0s16, q15s16);
+
+ q10s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ q11s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ q12s16 = vaddq_s16(q10s16, q5s16);
+ q13s16 = vaddq_s16(q11s16, q4s16);
+ d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
+ d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
+ d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
+ d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
+ vst1_u64((uint64_t *)out, d24u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d25u64);
+ out += 12;
+ vst1_u64((uint64_t *)out, d26u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d27u64);
+ out += 12;
+ q4s16 = vsubq_s16(q11s16, q4s16);
+ q5s16 = vsubq_s16(q10s16, q5s16);
+
+ q0s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ q1s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ q12s16 = vaddq_s16(q0s16, q3s16);
+ q13s16 = vaddq_s16(q1s16, q2s16);
+ d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
+ d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
+ d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
+ d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
+ vst1_u64((uint64_t *)out, d24u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d25u64);
+ out += 12;
+ vst1_u64((uint64_t *)out, d26u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d27u64);
+ out += 12;
+ q2s16 = vsubq_s16(q1s16, q2s16);
+ q3s16 = vsubq_s16(q0s16, q3s16);
+
+ q10s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ q11s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ q12s16 = vaddq_s16(q10s16, q9s16);
+ q13s16 = vaddq_s16(q11s16, q8s16);
+ d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
+ d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
+ d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
+ d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
+ vst1_u64((uint64_t *)out, d24u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d25u64);
+ out += 12;
+ vst1_u64((uint64_t *)out, d26u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d27u64);
+ out += 12;
+ q8s16 = vsubq_s16(q11s16, q8s16);
+ q9s16 = vsubq_s16(q10s16, q9s16);
+
+ vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q8s16)));
+ out += 4;
+ vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q8s16)));
+ out += 12;
+ vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q9s16)));
+ out += 4;
+ vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q9s16)));
+ out += 12;
+ vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q2s16)));
+ out += 4;
+ vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q2s16)));
+ out += 12;
+ vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q3s16)));
+ out += 4;
+ vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q3s16)));
+ out += 12;
+ vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q4s16)));
+ out += 4;
+ vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q4s16)));
+ out += 12;
+ vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q5s16)));
+ out += 4;
+ vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q5s16)));
+ out += 12;
+ vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q14s16)));
+ out += 4;
+ vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q14s16)));
+ out += 12;
+ vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q15s16)));
+ out += 4;
+ vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q15s16)));
+ }
+ return;
+}
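+
+// Throughout these passes the cospi products are kept in Q14:
+// vqrshrn_n_s32(x, 14) narrows with rounding ((x + (1 << 13)) >> 14)
+// and saturation back to int16_t.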
+
+void vp9_idct16x16_10_add_neon_pass1(
+ int16_t *in,
+ int16_t *out,
+ int output_stride) {
+ int16x4_t d4s16;
+ int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
+ uint64x1_t d4u64, d5u64, d18u64, d19u64, d20u64, d21u64, d22u64, d23u64;
+ uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64;
+ int16x8_t q0s16, q1s16, q2s16, q4s16, q5s16, q6s16, q7s16;
+ int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
+ int32x4_t q6s32, q9s32;
+ int32x4_t q10s32, q11s32, q12s32, q15s32;
+ int16x8x2_t q0x2s16;
+
+ q0x2s16 = vld2q_s16(in);
+ q8s16 = q0x2s16.val[0];
+ in += 16;
+ q0x2s16 = vld2q_s16(in);
+ q9s16 = q0x2s16.val[0];
+ in += 16;
+ q0x2s16 = vld2q_s16(in);
+ q10s16 = q0x2s16.val[0];
+ in += 16;
+ q0x2s16 = vld2q_s16(in);
+ q11s16 = q0x2s16.val[0];
+ in += 16;
+ q0x2s16 = vld2q_s16(in);
+ q12s16 = q0x2s16.val[0];
+ in += 16;
+ q0x2s16 = vld2q_s16(in);
+ q13s16 = q0x2s16.val[0];
+ in += 16;
+ q0x2s16 = vld2q_s16(in);
+ q14s16 = q0x2s16.val[0];
+ in += 16;
+ q0x2s16 = vld2q_s16(in);
+ q15s16 = q0x2s16.val[0];
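+
+ // vld2q_s16 de-interleaves adjacent pairs, so val[0] above collects
+ // elements 0,2,4,...,14 of each 16-wide row -- the even columns this
+ // pass operates on.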
+
+ TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
+ &q12s16, &q13s16, &q14s16, &q15s16);
+
+ // stage 3
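+ // vqrdmulhq_s16 returns (a * b * 2 + (1 << 15)) >> 16, so multiplying
+ // by a doubled cospi constant reproduces the Q14 rounding multiply
+ // ((a * c + (1 << 13)) >> 14) done elsewhere with vmull/vqrshrn.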
+ q0s16 = vdupq_n_s16(cospi_28_64 * 2);
+ q1s16 = vdupq_n_s16(cospi_4_64 * 2);
+
+ q4s16 = vqrdmulhq_s16(q9s16, q0s16);
+ q7s16 = vqrdmulhq_s16(q9s16, q1s16);
+
+ // stage 4
+ q1s16 = vdupq_n_s16(cospi_16_64 * 2);
+ d4s16 = vdup_n_s16(cospi_16_64);
+
+ q8s16 = vqrdmulhq_s16(q8s16, q1s16);
+
+ d8s16 = vget_low_s16(q4s16);
+ d9s16 = vget_high_s16(q4s16);
+ d14s16 = vget_low_s16(q7s16);
+ d15s16 = vget_high_s16(q7s16);
+ q9s32 = vmull_s16(d14s16, d4s16);
+ q10s32 = vmull_s16(d15s16, d4s16);
+ q12s32 = vmull_s16(d9s16, d4s16);
+ q11s32 = vmull_s16(d8s16, d4s16);
+
+ q15s32 = vsubq_s32(q10s32, q12s32);
+ q6s32 = vsubq_s32(q9s32, q11s32);
+ q9s32 = vaddq_s32(q9s32, q11s32);
+ q10s32 = vaddq_s32(q10s32, q12s32);
+
+ d11s16 = vqrshrn_n_s32(q15s32, 14);
+ d10s16 = vqrshrn_n_s32(q6s32, 14);
+ d12s16 = vqrshrn_n_s32(q9s32, 14);
+ d13s16 = vqrshrn_n_s32(q10s32, 14);
+ q5s16 = vcombine_s16(d10s16, d11s16);
+ q6s16 = vcombine_s16(d12s16, d13s16);
+
+ // stage 6
+ q2s16 = vaddq_s16(q8s16, q7s16);
+ q9s16 = vaddq_s16(q8s16, q6s16);
+ q10s16 = vaddq_s16(q8s16, q5s16);
+ q11s16 = vaddq_s16(q8s16, q4s16);
+ q12s16 = vsubq_s16(q8s16, q4s16);
+ q13s16 = vsubq_s16(q8s16, q5s16);
+ q14s16 = vsubq_s16(q8s16, q6s16);
+ q15s16 = vsubq_s16(q8s16, q7s16);
+
+ d4u64 = vreinterpret_u64_s16(vget_low_s16(q2s16));
+ d5u64 = vreinterpret_u64_s16(vget_high_s16(q2s16));
+ d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16));
+ d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16));
+ d20u64 = vreinterpret_u64_s16(vget_low_s16(q10s16));
+ d21u64 = vreinterpret_u64_s16(vget_high_s16(q10s16));
+ d22u64 = vreinterpret_u64_s16(vget_low_s16(q11s16));
+ d23u64 = vreinterpret_u64_s16(vget_high_s16(q11s16));
+ d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
+ d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
+ d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
+ d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
+ d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16));
+ d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16));
+ d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16));
+ d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16));
+
+ // store the data
+ output_stride >>= 1; // output_stride / 2, out is int16_t
+ vst1_u64((uint64_t *)out, d4u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d5u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d18u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d19u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d20u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d21u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d22u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d23u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d24u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d25u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d26u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d27u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d28u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d29u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d30u64);
+ out += output_stride;
+ vst1_u64((uint64_t *)out, d31u64);
+ return;
+}
+
+void vp9_idct16x16_10_add_neon_pass2(
+ int16_t *src,
+ int16_t *out,
+ int16_t *pass1Output,
+ int16_t skip_adding,
+ uint8_t *dest,
+ int dest_stride) {
+ int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
+ int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
+ int16x4_t d20s16, d21s16, d22s16, d23s16;
+ int16x4_t d24s16, d25s16, d26s16, d27s16, d30s16, d31s16;
+ uint64x1_t d4u64, d5u64, d6u64, d7u64, d8u64, d9u64, d10u64, d11u64;
+ uint64x1_t d16u64, d17u64, d18u64, d19u64;
+ uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64;
+ int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
+ int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
+ int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q8s32, q9s32;
+ int32x4_t q10s32, q11s32, q12s32, q13s32;
+ int16x8x2_t q0x2s16;
+ (void)skip_adding;
+ (void)dest;
+ (void)dest_stride;
+
+ q0x2s16 = vld2q_s16(src);
+ q8s16 = q0x2s16.val[0];
+ src += 16;
+ q0x2s16 = vld2q_s16(src);
+ q9s16 = q0x2s16.val[0];
+ src += 16;
+ q0x2s16 = vld2q_s16(src);
+ q10s16 = q0x2s16.val[0];
+ src += 16;
+ q0x2s16 = vld2q_s16(src);
+ q11s16 = q0x2s16.val[0];
+ src += 16;
+ q0x2s16 = vld2q_s16(src);
+ q12s16 = q0x2s16.val[0];
+ src += 16;
+ q0x2s16 = vld2q_s16(src);
+ q13s16 = q0x2s16.val[0];
+ src += 16;
+ q0x2s16 = vld2q_s16(src);
+ q14s16 = q0x2s16.val[0];
+ src += 16;
+ q0x2s16 = vld2q_s16(src);
+ q15s16 = q0x2s16.val[0];
+
+ TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
+ &q12s16, &q13s16, &q14s16, &q15s16);
+
+ // stage 3
+ q6s16 = vdupq_n_s16(cospi_30_64 * 2);
+ q0s16 = vqrdmulhq_s16(q8s16, q6s16);
+ q6s16 = vdupq_n_s16(cospi_2_64 * 2);
+ q7s16 = vqrdmulhq_s16(q8s16, q6s16);
+
+ q15s16 = vdupq_n_s16(-cospi_26_64 * 2);
+ q14s16 = vdupq_n_s16(cospi_6_64 * 2);
+ q3s16 = vqrdmulhq_s16(q9s16, q15s16);
+ q4s16 = vqrdmulhq_s16(q9s16, q14s16);
+
+ // stage 4
+ d0s16 = vget_low_s16(q0s16);
+ d1s16 = vget_high_s16(q0s16);
+ d6s16 = vget_low_s16(q3s16);
+ d7s16 = vget_high_s16(q3s16);
+ d8s16 = vget_low_s16(q4s16);
+ d9s16 = vget_high_s16(q4s16);
+ d14s16 = vget_low_s16(q7s16);
+ d15s16 = vget_high_s16(q7s16);
+
+ d30s16 = vdup_n_s16(cospi_8_64);
+ d31s16 = vdup_n_s16(cospi_24_64);
+
+ q12s32 = vmull_s16(d14s16, d31s16);
+ q5s32 = vmull_s16(d15s16, d31s16);
+ q2s32 = vmull_s16(d0s16, d31s16);
+ q11s32 = vmull_s16(d1s16, d31s16);
+
+ q12s32 = vmlsl_s16(q12s32, d0s16, d30s16);
+ q5s32 = vmlsl_s16(q5s32, d1s16, d30s16);
+ q2s32 = vmlal_s16(q2s32, d14s16, d30s16);
+ q11s32 = vmlal_s16(q11s32, d15s16, d30s16);
+
+ d2s16 = vqrshrn_n_s32(q12s32, 14);
+ d3s16 = vqrshrn_n_s32(q5s32, 14);
+ d12s16 = vqrshrn_n_s32(q2s32, 14);
+ d13s16 = vqrshrn_n_s32(q11s32, 14);
+ q1s16 = vcombine_s16(d2s16, d3s16);
+ q6s16 = vcombine_s16(d12s16, d13s16);
+
+ d30s16 = vdup_n_s16(-cospi_8_64);
+ q10s32 = vmull_s16(d8s16, d30s16);
+ q13s32 = vmull_s16(d9s16, d30s16);
+ q8s32 = vmull_s16(d6s16, d30s16);
+ q9s32 = vmull_s16(d7s16, d30s16);
+
+ q10s32 = vmlsl_s16(q10s32, d6s16, d31s16);
+ q13s32 = vmlsl_s16(q13s32, d7s16, d31s16);
+ q8s32 = vmlal_s16(q8s32, d8s16, d31s16);
+ q9s32 = vmlal_s16(q9s32, d9s16, d31s16);
+
+ d4s16 = vqrshrn_n_s32(q10s32, 14);
+ d5s16 = vqrshrn_n_s32(q13s32, 14);
+ d10s16 = vqrshrn_n_s32(q8s32, 14);
+ d11s16 = vqrshrn_n_s32(q9s32, 14);
+ q2s16 = vcombine_s16(d4s16, d5s16);
+ q5s16 = vcombine_s16(d10s16, d11s16);
+
+ // stage 5
+ q8s16 = vaddq_s16(q0s16, q3s16);
+ q9s16 = vaddq_s16(q1s16, q2s16);
+ q10s16 = vsubq_s16(q1s16, q2s16);
+ q11s16 = vsubq_s16(q0s16, q3s16);
+ q12s16 = vsubq_s16(q7s16, q4s16);
+ q13s16 = vsubq_s16(q6s16, q5s16);
+ q14s16 = vaddq_s16(q6s16, q5s16);
+ q15s16 = vaddq_s16(q7s16, q4s16);
+
+ // stage 6
+ d20s16 = vget_low_s16(q10s16);
+ d21s16 = vget_high_s16(q10s16);
+ d22s16 = vget_low_s16(q11s16);
+ d23s16 = vget_high_s16(q11s16);
+ d24s16 = vget_low_s16(q12s16);
+ d25s16 = vget_high_s16(q12s16);
+ d26s16 = vget_low_s16(q13s16);
+ d27s16 = vget_high_s16(q13s16);
+
+ d14s16 = vdup_n_s16(cospi_16_64);
+ q3s32 = vmull_s16(d26s16, d14s16);
+ q4s32 = vmull_s16(d27s16, d14s16);
+ q0s32 = vmull_s16(d20s16, d14s16);
+ q1s32 = vmull_s16(d21s16, d14s16);
+
+ q5s32 = vsubq_s32(q3s32, q0s32);
+ q6s32 = vsubq_s32(q4s32, q1s32);
+ q0s32 = vaddq_s32(q3s32, q0s32);
+ q4s32 = vaddq_s32(q4s32, q1s32);
+
+ d4s16 = vqrshrn_n_s32(q5s32, 14);
+ d5s16 = vqrshrn_n_s32(q6s32, 14);
+ d10s16 = vqrshrn_n_s32(q0s32, 14);
+ d11s16 = vqrshrn_n_s32(q4s32, 14);
+ q2s16 = vcombine_s16(d4s16, d5s16);
+ q5s16 = vcombine_s16(d10s16, d11s16);
+
+ q0s32 = vmull_s16(d22s16, d14s16);
+ q1s32 = vmull_s16(d23s16, d14s16);
+ q13s32 = vmull_s16(d24s16, d14s16);
+ q6s32 = vmull_s16(d25s16, d14s16);
+
+ q10s32 = vsubq_s32(q13s32, q0s32);
+ q4s32 = vsubq_s32(q6s32, q1s32);
+ q13s32 = vaddq_s32(q13s32, q0s32);
+ q6s32 = vaddq_s32(q6s32, q1s32);
+
+ d6s16 = vqrshrn_n_s32(q10s32, 14);
+ d7s16 = vqrshrn_n_s32(q4s32, 14);
+ d8s16 = vqrshrn_n_s32(q13s32, 14);
+ d9s16 = vqrshrn_n_s32(q6s32, 14);
+ q3s16 = vcombine_s16(d6s16, d7s16);
+ q4s16 = vcombine_s16(d8s16, d9s16);
+
+ // stage 7
+ q0s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ q1s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ q12s16 = vaddq_s16(q0s16, q15s16);
+ q13s16 = vaddq_s16(q1s16, q14s16);
+ d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
+ d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
+ d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
+ d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
+ vst1_u64((uint64_t *)out, d24u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d25u64);
+ out += 12;
+ vst1_u64((uint64_t *)out, d26u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d27u64);
+ out += 12;
+ q14s16 = vsubq_s16(q1s16, q14s16);
+ q15s16 = vsubq_s16(q0s16, q15s16);
+
+ q10s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ q11s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ q12s16 = vaddq_s16(q10s16, q5s16);
+ q13s16 = vaddq_s16(q11s16, q4s16);
+ d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
+ d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
+ d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
+ d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
+ vst1_u64((uint64_t *)out, d24u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d25u64);
+ out += 12;
+ vst1_u64((uint64_t *)out, d26u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d27u64);
+ out += 12;
+ q4s16 = vsubq_s16(q11s16, q4s16);
+ q5s16 = vsubq_s16(q10s16, q5s16);
+
+ q0s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ q1s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ q12s16 = vaddq_s16(q0s16, q3s16);
+ q13s16 = vaddq_s16(q1s16, q2s16);
+ d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
+ d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
+ d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
+ d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
+ vst1_u64((uint64_t *)out, d24u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d25u64);
+ out += 12;
+ vst1_u64((uint64_t *)out, d26u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d27u64);
+ out += 12;
+ q2s16 = vsubq_s16(q1s16, q2s16);
+ q3s16 = vsubq_s16(q0s16, q3s16);
+
+ q10s16 = vld1q_s16(pass1Output);
+ pass1Output += 8;
+ q11s16 = vld1q_s16(pass1Output);
+ q12s16 = vaddq_s16(q10s16, q9s16);
+ q13s16 = vaddq_s16(q11s16, q8s16);
+ d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16));
+ d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16));
+ d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16));
+ d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16));
+ vst1_u64((uint64_t *)out, d24u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d25u64);
+ out += 12;
+ vst1_u64((uint64_t *)out, d26u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d27u64);
+ out += 12;
+ q8s16 = vsubq_s16(q11s16, q8s16);
+ q9s16 = vsubq_s16(q10s16, q9s16);
+
+ d4u64 = vreinterpret_u64_s16(vget_low_s16(q2s16));
+ d5u64 = vreinterpret_u64_s16(vget_high_s16(q2s16));
+ d6u64 = vreinterpret_u64_s16(vget_low_s16(q3s16));
+ d7u64 = vreinterpret_u64_s16(vget_high_s16(q3s16));
+ d8u64 = vreinterpret_u64_s16(vget_low_s16(q4s16));
+ d9u64 = vreinterpret_u64_s16(vget_high_s16(q4s16));
+ d10u64 = vreinterpret_u64_s16(vget_low_s16(q5s16));
+ d11u64 = vreinterpret_u64_s16(vget_high_s16(q5s16));
+ d16u64 = vreinterpret_u64_s16(vget_low_s16(q8s16));
+ d17u64 = vreinterpret_u64_s16(vget_high_s16(q8s16));
+ d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16));
+ d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16));
+ d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16));
+ d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16));
+ d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16));
+ d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16));
+
+ vst1_u64((uint64_t *)out, d16u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d17u64);
+ out += 12;
+ vst1_u64((uint64_t *)out, d18u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d19u64);
+ out += 12;
+ vst1_u64((uint64_t *)out, d4u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d5u64);
+ out += 12;
+ vst1_u64((uint64_t *)out, d6u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d7u64);
+ out += 12;
+ vst1_u64((uint64_t *)out, d8u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d9u64);
+ out += 12;
+ vst1_u64((uint64_t *)out, d10u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d11u64);
+ out += 12;
+ vst1_u64((uint64_t *)out, d28u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d29u64);
+ out += 12;
+ vst1_u64((uint64_t *)out, d30u64);
+ out += 4;
+ vst1_u64((uint64_t *)out, d31u64);
+ return;
+}
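
For reference, a minimal scalar sketch of the pattern the intrinsics above
repeat -- widen with vmull_s16, accumulate with vmlal_s16/vmlsl_s16, then
narrow with vqrshrn_n_s32(x, 14). The helper name and the standalone
DCT_CONST_BITS define are illustrative, not part of the patch:

#include <stdint.h>

#define DCT_CONST_BITS 14  /* the cospi_*_64 constants are in Q14 */

/* One half of a butterfly, per lane:
 * round((a * c1 - b * c2) / 2^14), saturated to int16_t. */
static int16_t butterfly_sub(int16_t a, int16_t b, int16_t c1, int16_t c2) {
  int32_t x = (int32_t)a * c1 - (int32_t)b * c2;
  x = (x + (1 << (DCT_CONST_BITS - 1))) >> DCT_CONST_BITS;
  if (x > INT16_MAX) x = INT16_MAX;  /* vqrshrn_n_s32 also saturates */
  if (x < INT16_MIN) x = INT16_MIN;
  return (int16_t)x;
}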
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct16x16_add_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct16x16_add_neon_asm.asm
index a13c0d04b83..a13c0d04b83 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct16x16_add_neon.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct16x16_add_neon_asm.asm
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct16x16_neon.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct16x16_neon.c
index 0b9fc09abd7..f2c4ec4518c 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct16x16_neon.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct16x16_neon.c
@@ -30,18 +30,24 @@ void vp9_idct16x16_10_add_neon_pass2(const int16_t *src,
uint8_t *dest,
int dest_stride);
+#if HAVE_NEON_ASM
/* For ARM NEON, d8-d15 are callee-saved registers, and need to be saved. */
extern void vp9_push_neon(int64_t *store);
extern void vp9_pop_neon(int64_t *store);
+#endif // HAVE_NEON_ASM
void vp9_idct16x16_256_add_neon(const int16_t *input,
uint8_t *dest, int dest_stride) {
+#if HAVE_NEON_ASM
int64_t store_reg[8];
+#endif
int16_t pass1_output[16*16] = {0};
int16_t row_idct_output[16*16] = {0};
+#if HAVE_NEON_ASM
// save d8-d15 register values.
vp9_push_neon(store_reg);
+#endif
/* Parallel idct on the upper 8 rows */
// First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and saves the
@@ -103,20 +109,26 @@ void vp9_idct16x16_256_add_neon(const int16_t *input,
dest+8,
dest_stride);
+#if HAVE_NEON_ASM
// restore d8-d15 register values.
vp9_pop_neon(store_reg);
+#endif
return;
}
void vp9_idct16x16_10_add_neon(const int16_t *input,
uint8_t *dest, int dest_stride) {
+#if HAVE_NEON_ASM
int64_t store_reg[8];
+#endif
int16_t pass1_output[16*16] = {0};
int16_t row_idct_output[16*16] = {0};
+#if HAVE_NEON_ASM
// save d8-d15 register values.
vp9_push_neon(store_reg);
+#endif
/* Parallel idct on the upper 8 rows */
// First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and saves the
@@ -165,8 +177,10 @@ void vp9_idct16x16_10_add_neon(const int16_t *input,
dest+8,
dest_stride);
+#if HAVE_NEON_ASM
// restore d8-d15 register values.
vp9_pop_neon(store_reg);
+#endif
return;
}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct32x32_1_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct32x32_1_add_neon.c
new file mode 100644
index 00000000000..d0e4b4f4014
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct32x32_1_add_neon.c
@@ -0,0 +1,163 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include "vp9/common/vp9_idct.h"
+
+#include "./vpx_config.h"
+
+static INLINE void LD_16x8(
+ uint8_t *d,
+ int d_stride,
+ uint8x16_t *q8u8,
+ uint8x16_t *q9u8,
+ uint8x16_t *q10u8,
+ uint8x16_t *q11u8,
+ uint8x16_t *q12u8,
+ uint8x16_t *q13u8,
+ uint8x16_t *q14u8,
+ uint8x16_t *q15u8) {
+ *q8u8 = vld1q_u8(d);
+ d += d_stride;
+ *q9u8 = vld1q_u8(d);
+ d += d_stride;
+ *q10u8 = vld1q_u8(d);
+ d += d_stride;
+ *q11u8 = vld1q_u8(d);
+ d += d_stride;
+ *q12u8 = vld1q_u8(d);
+ d += d_stride;
+ *q13u8 = vld1q_u8(d);
+ d += d_stride;
+ *q14u8 = vld1q_u8(d);
+ d += d_stride;
+ *q15u8 = vld1q_u8(d);
+ return;
+}
+
+static INLINE void ADD_DIFF_16x8(
+ uint8x16_t qdiffu8,
+ uint8x16_t *q8u8,
+ uint8x16_t *q9u8,
+ uint8x16_t *q10u8,
+ uint8x16_t *q11u8,
+ uint8x16_t *q12u8,
+ uint8x16_t *q13u8,
+ uint8x16_t *q14u8,
+ uint8x16_t *q15u8) {
+ *q8u8 = vqaddq_u8(*q8u8, qdiffu8);
+ *q9u8 = vqaddq_u8(*q9u8, qdiffu8);
+ *q10u8 = vqaddq_u8(*q10u8, qdiffu8);
+ *q11u8 = vqaddq_u8(*q11u8, qdiffu8);
+ *q12u8 = vqaddq_u8(*q12u8, qdiffu8);
+ *q13u8 = vqaddq_u8(*q13u8, qdiffu8);
+ *q14u8 = vqaddq_u8(*q14u8, qdiffu8);
+ *q15u8 = vqaddq_u8(*q15u8, qdiffu8);
+ return;
+}
+
+static INLINE void SUB_DIFF_16x8(
+ uint8x16_t qdiffu8,
+ uint8x16_t *q8u8,
+ uint8x16_t *q9u8,
+ uint8x16_t *q10u8,
+ uint8x16_t *q11u8,
+ uint8x16_t *q12u8,
+ uint8x16_t *q13u8,
+ uint8x16_t *q14u8,
+ uint8x16_t *q15u8) {
+ *q8u8 = vqsubq_u8(*q8u8, qdiffu8);
+ *q9u8 = vqsubq_u8(*q9u8, qdiffu8);
+ *q10u8 = vqsubq_u8(*q10u8, qdiffu8);
+ *q11u8 = vqsubq_u8(*q11u8, qdiffu8);
+ *q12u8 = vqsubq_u8(*q12u8, qdiffu8);
+ *q13u8 = vqsubq_u8(*q13u8, qdiffu8);
+ *q14u8 = vqsubq_u8(*q14u8, qdiffu8);
+ *q15u8 = vqsubq_u8(*q15u8, qdiffu8);
+ return;
+}
+
+static INLINE void ST_16x8(
+ uint8_t *d,
+ int d_stride,
+ uint8x16_t *q8u8,
+ uint8x16_t *q9u8,
+ uint8x16_t *q10u8,
+ uint8x16_t *q11u8,
+ uint8x16_t *q12u8,
+ uint8x16_t *q13u8,
+ uint8x16_t *q14u8,
+ uint8x16_t *q15u8) {
+ vst1q_u8(d, *q8u8);
+ d += d_stride;
+ vst1q_u8(d, *q9u8);
+ d += d_stride;
+ vst1q_u8(d, *q10u8);
+ d += d_stride;
+ vst1q_u8(d, *q11u8);
+ d += d_stride;
+ vst1q_u8(d, *q12u8);
+ d += d_stride;
+ vst1q_u8(d, *q13u8);
+ d += d_stride;
+ vst1q_u8(d, *q14u8);
+ d += d_stride;
+ vst1q_u8(d, *q15u8);
+ return;
+}
+
+void vp9_idct32x32_1_add_neon(
+ int16_t *input,
+ uint8_t *dest,
+ int dest_stride) {
+ uint8x16_t q0u8, q8u8, q9u8, q10u8, q11u8, q12u8, q13u8, q14u8, q15u8;
+ int i, j, dest_stride8;
+ uint8_t *d;
+ int16_t a1, cospi_16_64 = 11585;
+ int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
+
+ out = dct_const_round_shift(out * cospi_16_64);
+ a1 = ROUND_POWER_OF_TWO(out, 6);
+
+ dest_stride8 = dest_stride * 8;
+ if (a1 >= 0) { // diff_positive_32_32
+ a1 = a1 < 0 ? 0 : a1 > 255 ? 255 : a1;
+ q0u8 = vdupq_n_u8(a1);
+ for (i = 0; i < 2; i++, dest += 16) { // diff_positive_32_32_loop
+ d = dest;
+ for (j = 0; j < 4; j++) {
+ LD_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8,
+ &q12u8, &q13u8, &q14u8, &q15u8);
+ ADD_DIFF_16x8(q0u8, &q8u8, &q9u8, &q10u8, &q11u8,
+ &q12u8, &q13u8, &q14u8, &q15u8);
+ ST_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8,
+ &q12u8, &q13u8, &q14u8, &q15u8);
+ d += dest_stride8;
+ }
+ }
+ } else { // diff_negative_32_32
+ a1 = -a1;
+ a1 = a1 < 0 ? 0 : a1 > 255 ? 255 : a1;
+ q0u8 = vdupq_n_u8(a1);
+ for (i = 0; i < 2; i++, dest += 16) { // diff_negative_32_32_loop
+ d = dest;
+ for (j = 0; j < 4; j++) {
+ LD_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8,
+ &q12u8, &q13u8, &q14u8, &q15u8);
+ SUB_DIFF_16x8(q0u8, &q8u8, &q9u8, &q10u8, &q11u8,
+ &q12u8, &q13u8, &q14u8, &q15u8);
+ ST_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8,
+ &q12u8, &q13u8, &q14u8, &q15u8);
+ d += dest_stride8;
+ }
+ }
+ }
+ return;
+}
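
For reference, the DC shortcut above in scalar form. ROUND_POWER_OF_TWO
matches the libvpx macro; the wrapper function is illustrative only:

#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))

/* input[0] is scaled by cos(pi/4) in Q14 once per 1-D pass; the final
 * 6-bit rounding yields the constant that is added to (or, negated,
 * subtracted from) every pixel of the 32x32 block. */
static int16_t idct32_dc(const int16_t *input) {
  const int16_t cospi_16_64 = 11585;                             /* Q14 */
  int16_t out = ROUND_POWER_OF_TWO(input[0] * cospi_16_64, 14);  /* rows */
  out = ROUND_POWER_OF_TWO(out * cospi_16_64, 14);               /* cols */
  return ROUND_POWER_OF_TWO(out, 6);
}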
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct32x32_1_add_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct32x32_1_add_neon_asm.asm
index d290d07531c..d290d07531c 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct32x32_1_add_neon.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct32x32_1_add_neon_asm.asm
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct32x32_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct32x32_add_neon.c
new file mode 100644
index 00000000000..309bdf8d756
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct32x32_add_neon.c
@@ -0,0 +1,750 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+
+static int16_t cospi_1_64 = 16364;
+static int16_t cospi_2_64 = 16305;
+static int16_t cospi_3_64 = 16207;
+static int16_t cospi_4_64 = 16069;
+static int16_t cospi_5_64 = 15893;
+static int16_t cospi_6_64 = 15679;
+static int16_t cospi_7_64 = 15426;
+static int16_t cospi_8_64 = 15137;
+static int16_t cospi_9_64 = 14811;
+static int16_t cospi_10_64 = 14449;
+static int16_t cospi_11_64 = 14053;
+static int16_t cospi_12_64 = 13623;
+static int16_t cospi_13_64 = 13160;
+static int16_t cospi_14_64 = 12665;
+static int16_t cospi_15_64 = 12140;
+static int16_t cospi_16_64 = 11585;
+static int16_t cospi_17_64 = 11003;
+static int16_t cospi_18_64 = 10394;
+static int16_t cospi_19_64 = 9760;
+static int16_t cospi_20_64 = 9102;
+static int16_t cospi_21_64 = 8423;
+static int16_t cospi_22_64 = 7723;
+static int16_t cospi_23_64 = 7005;
+static int16_t cospi_24_64 = 6270;
+static int16_t cospi_25_64 = 5520;
+static int16_t cospi_26_64 = 4756;
+static int16_t cospi_27_64 = 3981;
+static int16_t cospi_28_64 = 3196;
+static int16_t cospi_29_64 = 2404;
+static int16_t cospi_30_64 = 1606;
+static int16_t cospi_31_64 = 804;
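+// (Each cospi_k_64 above equals round(cos(k * PI / 64) * (1 << 14)),
+// i.e. the DCT cosines in Q14 fixed point.)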
+
+#define LOAD_FROM_TRANSPOSED(prev, first, second) \
+ q14s16 = vld1q_s16(trans_buf + first * 8); \
+ q13s16 = vld1q_s16(trans_buf + second * 8);
+
+#define LOAD_FROM_OUTPUT(prev, first, second, qA, qB) \
+ qA = vld1q_s16(out + first * 32); \
+ qB = vld1q_s16(out + second * 32);
+
+#define STORE_IN_OUTPUT(prev, first, second, qA, qB) \
+ vst1q_s16(out + first * 32, qA); \
+ vst1q_s16(out + second * 32, qB);
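+// (The unused "prev" argument of the three macros above appears to carry
+// over the pointer-offset bookkeeping of the original assembly source.)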
+
+#define STORE_COMBINE_CENTER_RESULTS(r10, r9) \
+ __STORE_COMBINE_CENTER_RESULTS(r10, r9, stride, \
+ q6s16, q7s16, q8s16, q9s16);
+static INLINE void __STORE_COMBINE_CENTER_RESULTS(
+ uint8_t *p1,
+ uint8_t *p2,
+ int stride,
+ int16x8_t q6s16,
+ int16x8_t q7s16,
+ int16x8_t q8s16,
+ int16x8_t q9s16) {
+ int16x4_t d8s16, d9s16, d10s16, d11s16;
+
+ d8s16 = vld1_s16((int16_t *)p1);
+ p1 += stride;
+ d11s16 = vld1_s16((int16_t *)p2);
+ p2 -= stride;
+ d9s16 = vld1_s16((int16_t *)p1);
+ d10s16 = vld1_s16((int16_t *)p2);
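+ // the s16 loads above are plain 8-byte moves of pixel data; the lanes
+ // are reinterpreted to u8 before the widening adds below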
+
+ q7s16 = vrshrq_n_s16(q7s16, 6);
+ q8s16 = vrshrq_n_s16(q8s16, 6);
+ q9s16 = vrshrq_n_s16(q9s16, 6);
+ q6s16 = vrshrq_n_s16(q6s16, 6);
+
+ q7s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q7s16),
+ vreinterpret_u8_s16(d9s16)));
+ q8s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q8s16),
+ vreinterpret_u8_s16(d10s16)));
+ q9s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q9s16),
+ vreinterpret_u8_s16(d11s16)));
+ q6s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q6s16),
+ vreinterpret_u8_s16(d8s16)));
+
+ d9s16 = vreinterpret_s16_u8(vqmovun_s16(q7s16));
+ d10s16 = vreinterpret_s16_u8(vqmovun_s16(q8s16));
+ d11s16 = vreinterpret_s16_u8(vqmovun_s16(q9s16));
+ d8s16 = vreinterpret_s16_u8(vqmovun_s16(q6s16));
+
+ vst1_s16((int16_t *)p1, d9s16);
+ p1 -= stride;
+ vst1_s16((int16_t *)p2, d10s16);
+ p2 += stride;
+ vst1_s16((int16_t *)p1, d8s16);
+ vst1_s16((int16_t *)p2, d11s16);
+ return;
+}
+
+#define STORE_COMBINE_EXTREME_RESULTS(r7, r6) \
+ __STORE_COMBINE_EXTREME_RESULTS(r7, r6, stride, \
+ q4s16, q5s16, q6s16, q7s16);
+static INLINE void __STORE_COMBINE_EXTREME_RESULTS(
+ uint8_t *p1,
+ uint8_t *p2,
+ int stride,
+ int16x8_t q4s16,
+ int16x8_t q5s16,
+ int16x8_t q6s16,
+ int16x8_t q7s16) {
+ int16x4_t d4s16, d5s16, d6s16, d7s16;
+
+ d4s16 = vld1_s16((int16_t *)p1);
+ p1 += stride;
+ d7s16 = vld1_s16((int16_t *)p2);
+ p2 -= stride;
+ d5s16 = vld1_s16((int16_t *)p1);
+ d6s16 = vld1_s16((int16_t *)p2);
+
+ q5s16 = vrshrq_n_s16(q5s16, 6);
+ q6s16 = vrshrq_n_s16(q6s16, 6);
+ q7s16 = vrshrq_n_s16(q7s16, 6);
+ q4s16 = vrshrq_n_s16(q4s16, 6);
+
+ q5s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q5s16),
+ vreinterpret_u8_s16(d5s16)));
+ q6s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q6s16),
+ vreinterpret_u8_s16(d6s16)));
+ q7s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q7s16),
+ vreinterpret_u8_s16(d7s16)));
+ q4s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q4s16),
+ vreinterpret_u8_s16(d4s16)));
+
+ d5s16 = vreinterpret_s16_u8(vqmovun_s16(q5s16));
+ d6s16 = vreinterpret_s16_u8(vqmovun_s16(q6s16));
+ d7s16 = vreinterpret_s16_u8(vqmovun_s16(q7s16));
+ d4s16 = vreinterpret_s16_u8(vqmovun_s16(q4s16));
+
+ vst1_s16((int16_t *)p1, d5s16);
+ p1 -= stride;
+ vst1_s16((int16_t *)p2, d6s16);
+ p2 += stride;
+ vst1_s16((int16_t *)p2, d7s16);
+ vst1_s16((int16_t *)p1, d4s16);
+ return;
+}
+
+#define DO_BUTTERFLY_STD(const_1, const_2, qA, qB) \
+ DO_BUTTERFLY(q14s16, q13s16, const_1, const_2, qA, qB);
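+// DO_BUTTERFLY computes the rotation
+//   *qAs16 = round14(first_const * q14s16 - second_const * q13s16)
+//   *qBs16 = round14(first_const * q13s16 + second_const * q14s16)
+// where round14(x) = (x + (1 << 13)) >> 14 with int16 saturation.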
+static INLINE void DO_BUTTERFLY(
+ int16x8_t q14s16,
+ int16x8_t q13s16,
+ int16_t first_const,
+ int16_t second_const,
+ int16x8_t *qAs16,
+ int16x8_t *qBs16) {
+ int16x4_t d30s16, d31s16;
+ int32x4_t q8s32, q9s32, q10s32, q11s32, q12s32, q15s32;
+ int16x4_t dCs16, dDs16, dAs16, dBs16;
+
+ dCs16 = vget_low_s16(q14s16);
+ dDs16 = vget_high_s16(q14s16);
+ dAs16 = vget_low_s16(q13s16);
+ dBs16 = vget_high_s16(q13s16);
+
+ d30s16 = vdup_n_s16(first_const);
+ d31s16 = vdup_n_s16(second_const);
+
+ q8s32 = vmull_s16(dCs16, d30s16);
+ q10s32 = vmull_s16(dAs16, d31s16);
+ q9s32 = vmull_s16(dDs16, d30s16);
+ q11s32 = vmull_s16(dBs16, d31s16);
+ q12s32 = vmull_s16(dCs16, d31s16);
+
+ q8s32 = vsubq_s32(q8s32, q10s32);
+ q9s32 = vsubq_s32(q9s32, q11s32);
+
+ q10s32 = vmull_s16(dDs16, d31s16);
+ q11s32 = vmull_s16(dAs16, d30s16);
+ q15s32 = vmull_s16(dBs16, d30s16);
+
+ q11s32 = vaddq_s32(q12s32, q11s32);
+ q10s32 = vaddq_s32(q10s32, q15s32);
+
+ *qAs16 = vcombine_s16(vqrshrn_n_s32(q8s32, 14),
+ vqrshrn_n_s32(q9s32, 14));
+ *qBs16 = vcombine_s16(vqrshrn_n_s32(q11s32, 14),
+ vqrshrn_n_s32(q10s32, 14));
+ return;
+}
+
+static INLINE void idct32_transpose_pair(
+ int16_t *input,
+ int16_t *t_buf) {
+ int16_t *in;
+ int i;
+ const int stride = 32;
+ int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
+ int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
+ int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
+ int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32;
+ int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16;
+
+ for (i = 0; i < 4; i++, input += 8) {
+ in = input;
+ q8s16 = vld1q_s16(in);
+ in += stride;
+ q9s16 = vld1q_s16(in);
+ in += stride;
+ q10s16 = vld1q_s16(in);
+ in += stride;
+ q11s16 = vld1q_s16(in);
+ in += stride;
+ q12s16 = vld1q_s16(in);
+ in += stride;
+ q13s16 = vld1q_s16(in);
+ in += stride;
+ q14s16 = vld1q_s16(in);
+ in += stride;
+ q15s16 = vld1q_s16(in);
+
+ d16s16 = vget_low_s16(q8s16);
+ d17s16 = vget_high_s16(q8s16);
+ d18s16 = vget_low_s16(q9s16);
+ d19s16 = vget_high_s16(q9s16);
+ d20s16 = vget_low_s16(q10s16);
+ d21s16 = vget_high_s16(q10s16);
+ d22s16 = vget_low_s16(q11s16);
+ d23s16 = vget_high_s16(q11s16);
+ d24s16 = vget_low_s16(q12s16);
+ d25s16 = vget_high_s16(q12s16);
+ d26s16 = vget_low_s16(q13s16);
+ d27s16 = vget_high_s16(q13s16);
+ d28s16 = vget_low_s16(q14s16);
+ d29s16 = vget_high_s16(q14s16);
+ d30s16 = vget_low_s16(q15s16);
+ d31s16 = vget_high_s16(q15s16);
+
+ q8s16 = vcombine_s16(d16s16, d24s16); // vswp d17, d24
+ q9s16 = vcombine_s16(d18s16, d26s16); // vswp d19, d26
+ q10s16 = vcombine_s16(d20s16, d28s16); // vswp d21, d28
+ q11s16 = vcombine_s16(d22s16, d30s16); // vswp d23, d30
+ q12s16 = vcombine_s16(d17s16, d25s16);
+ q13s16 = vcombine_s16(d19s16, d27s16);
+ q14s16 = vcombine_s16(d21s16, d29s16);
+ q15s16 = vcombine_s16(d23s16, d31s16);
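+ // the vcombine pairs above stand in for the asm's vswp: they exchange
+ // the high halves of q8-q11 with the low halves of q12-q15, the first
+ // step of the 8x8 transpose finished by the vtrn pairs below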
+
+ q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q8s16),
+ vreinterpretq_s32_s16(q10s16));
+ q1x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q9s16),
+ vreinterpretq_s32_s16(q11s16));
+ q2x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q12s16),
+ vreinterpretq_s32_s16(q14s16));
+ q3x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q13s16),
+ vreinterpretq_s32_s16(q15s16));
+
+ q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]), // q8
+ vreinterpretq_s16_s32(q1x2s32.val[0])); // q9
+ q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]), // q10
+ vreinterpretq_s16_s32(q1x2s32.val[1])); // q11
+ q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]), // q12
+ vreinterpretq_s16_s32(q3x2s32.val[0])); // q13
+ q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]), // q14
+ vreinterpretq_s16_s32(q3x2s32.val[1])); // q15
+
+ vst1q_s16(t_buf, q0x2s16.val[0]);
+ t_buf += 8;
+ vst1q_s16(t_buf, q0x2s16.val[1]);
+ t_buf += 8;
+ vst1q_s16(t_buf, q1x2s16.val[0]);
+ t_buf += 8;
+ vst1q_s16(t_buf, q1x2s16.val[1]);
+ t_buf += 8;
+ vst1q_s16(t_buf, q2x2s16.val[0]);
+ t_buf += 8;
+ vst1q_s16(t_buf, q2x2s16.val[1]);
+ t_buf += 8;
+ vst1q_s16(t_buf, q3x2s16.val[0]);
+ t_buf += 8;
+ vst1q_s16(t_buf, q3x2s16.val[1]);
+ t_buf += 8;
+ }
+ return;
+}
+
+static INLINE void idct32_bands_end_1st_pass(
+ int16_t *out,
+ int16x8_t q2s16,
+ int16x8_t q3s16,
+ int16x8_t q6s16,
+ int16x8_t q7s16,
+ int16x8_t q8s16,
+ int16x8_t q9s16,
+ int16x8_t q10s16,
+ int16x8_t q11s16,
+ int16x8_t q12s16,
+ int16x8_t q13s16,
+ int16x8_t q14s16,
+ int16x8_t q15s16) {
+ int16x8_t q0s16, q1s16, q4s16, q5s16;
+
+ STORE_IN_OUTPUT(17, 16, 17, q6s16, q7s16);
+ STORE_IN_OUTPUT(17, 14, 15, q8s16, q9s16);
+
+ LOAD_FROM_OUTPUT(15, 30, 31, q0s16, q1s16);
+ q4s16 = vaddq_s16(q2s16, q1s16);
+ q5s16 = vaddq_s16(q3s16, q0s16);
+ q6s16 = vsubq_s16(q3s16, q0s16);
+ q7s16 = vsubq_s16(q2s16, q1s16);
+ STORE_IN_OUTPUT(31, 30, 31, q6s16, q7s16);
+ STORE_IN_OUTPUT(31, 0, 1, q4s16, q5s16);
+
+ LOAD_FROM_OUTPUT(1, 12, 13, q0s16, q1s16);
+ q2s16 = vaddq_s16(q10s16, q1s16);
+ q3s16 = vaddq_s16(q11s16, q0s16);
+ q4s16 = vsubq_s16(q11s16, q0s16);
+ q5s16 = vsubq_s16(q10s16, q1s16);
+
+ LOAD_FROM_OUTPUT(13, 18, 19, q0s16, q1s16);
+ q8s16 = vaddq_s16(q4s16, q1s16);
+ q9s16 = vaddq_s16(q5s16, q0s16);
+ q6s16 = vsubq_s16(q5s16, q0s16);
+ q7s16 = vsubq_s16(q4s16, q1s16);
+ STORE_IN_OUTPUT(19, 18, 19, q6s16, q7s16);
+ STORE_IN_OUTPUT(19, 12, 13, q8s16, q9s16);
+
+ LOAD_FROM_OUTPUT(13, 28, 29, q0s16, q1s16);
+ q4s16 = vaddq_s16(q2s16, q1s16);
+ q5s16 = vaddq_s16(q3s16, q0s16);
+ q6s16 = vsubq_s16(q3s16, q0s16);
+ q7s16 = vsubq_s16(q2s16, q1s16);
+ STORE_IN_OUTPUT(29, 28, 29, q6s16, q7s16);
+ STORE_IN_OUTPUT(29, 2, 3, q4s16, q5s16);
+
+ LOAD_FROM_OUTPUT(3, 10, 11, q0s16, q1s16);
+ q2s16 = vaddq_s16(q12s16, q1s16);
+ q3s16 = vaddq_s16(q13s16, q0s16);
+ q4s16 = vsubq_s16(q13s16, q0s16);
+ q5s16 = vsubq_s16(q12s16, q1s16);
+
+ LOAD_FROM_OUTPUT(11, 20, 21, q0s16, q1s16);
+ q8s16 = vaddq_s16(q4s16, q1s16);
+ q9s16 = vaddq_s16(q5s16, q0s16);
+ q6s16 = vsubq_s16(q5s16, q0s16);
+ q7s16 = vsubq_s16(q4s16, q1s16);
+ STORE_IN_OUTPUT(21, 20, 21, q6s16, q7s16);
+ STORE_IN_OUTPUT(21, 10, 11, q8s16, q9s16);
+
+ LOAD_FROM_OUTPUT(11, 26, 27, q0s16, q1s16);
+ q4s16 = vaddq_s16(q2s16, q1s16);
+ q5s16 = vaddq_s16(q3s16, q0s16);
+ q6s16 = vsubq_s16(q3s16, q0s16);
+ q7s16 = vsubq_s16(q2s16, q1s16);
+ STORE_IN_OUTPUT(27, 26, 27, q6s16, q7s16);
+ STORE_IN_OUTPUT(27, 4, 5, q4s16, q5s16);
+
+ LOAD_FROM_OUTPUT(5, 8, 9, q0s16, q1s16);
+ q2s16 = vaddq_s16(q14s16, q1s16);
+ q3s16 = vaddq_s16(q15s16, q0s16);
+ q4s16 = vsubq_s16(q15s16, q0s16);
+ q5s16 = vsubq_s16(q14s16, q1s16);
+
+ LOAD_FROM_OUTPUT(9, 22, 23, q0s16, q1s16);
+ q8s16 = vaddq_s16(q4s16, q1s16);
+ q9s16 = vaddq_s16(q5s16, q0s16);
+ q6s16 = vsubq_s16(q5s16, q0s16);
+ q7s16 = vsubq_s16(q4s16, q1s16);
+ STORE_IN_OUTPUT(23, 22, 23, q6s16, q7s16);
+ STORE_IN_OUTPUT(23, 8, 9, q8s16, q9s16);
+
+ LOAD_FROM_OUTPUT(9, 24, 25, q0s16, q1s16);
+ q4s16 = vaddq_s16(q2s16, q1s16);
+ q5s16 = vaddq_s16(q3s16, q0s16);
+ q6s16 = vsubq_s16(q3s16, q0s16);
+ q7s16 = vsubq_s16(q2s16, q1s16);
+ STORE_IN_OUTPUT(25, 24, 25, q6s16, q7s16);
+ STORE_IN_OUTPUT(25, 6, 7, q4s16, q5s16);
+ return;
+}
+
+static INLINE void idct32_bands_end_2nd_pass(
+ int16_t *out,
+ uint8_t *dest,
+ int stride,
+ int16x8_t q2s16,
+ int16x8_t q3s16,
+ int16x8_t q6s16,
+ int16x8_t q7s16,
+ int16x8_t q8s16,
+ int16x8_t q9s16,
+ int16x8_t q10s16,
+ int16x8_t q11s16,
+ int16x8_t q12s16,
+ int16x8_t q13s16,
+ int16x8_t q14s16,
+ int16x8_t q15s16) {
+ uint8_t *r6 = dest + 31 * stride;
+ uint8_t *r7 = dest/* + 0 * stride*/;
+ uint8_t *r9 = dest + 15 * stride;
+ uint8_t *r10 = dest + 16 * stride;
+ int str2 = stride << 1;
+ int16x8_t q0s16, q1s16, q4s16, q5s16;
+
+ STORE_COMBINE_CENTER_RESULTS(r10, r9);
+ r10 += str2; r9 -= str2;
+
+ LOAD_FROM_OUTPUT(17, 30, 31, q0s16, q1s16)
+ q4s16 = vaddq_s16(q2s16, q1s16);
+ q5s16 = vaddq_s16(q3s16, q0s16);
+ q6s16 = vsubq_s16(q3s16, q0s16);
+ q7s16 = vsubq_s16(q2s16, q1s16);
+ STORE_COMBINE_EXTREME_RESULTS(r7, r6);
+ r7 += str2; r6 -= str2;
+
+ LOAD_FROM_OUTPUT(31, 12, 13, q0s16, q1s16)
+ q2s16 = vaddq_s16(q10s16, q1s16);
+ q3s16 = vaddq_s16(q11s16, q0s16);
+ q4s16 = vsubq_s16(q11s16, q0s16);
+ q5s16 = vsubq_s16(q10s16, q1s16);
+
+ LOAD_FROM_OUTPUT(13, 18, 19, q0s16, q1s16)
+ q8s16 = vaddq_s16(q4s16, q1s16);
+ q9s16 = vaddq_s16(q5s16, q0s16);
+ q6s16 = vsubq_s16(q5s16, q0s16);
+ q7s16 = vsubq_s16(q4s16, q1s16);
+ STORE_COMBINE_CENTER_RESULTS(r10, r9);
+ r10 += str2; r9 -= str2;
+
+ LOAD_FROM_OUTPUT(19, 28, 29, q0s16, q1s16)
+ q4s16 = vaddq_s16(q2s16, q1s16);
+ q5s16 = vaddq_s16(q3s16, q0s16);
+ q6s16 = vsubq_s16(q3s16, q0s16);
+ q7s16 = vsubq_s16(q2s16, q1s16);
+ STORE_COMBINE_EXTREME_RESULTS(r7, r6);
+ r7 += str2; r6 -= str2;
+
+ LOAD_FROM_OUTPUT(29, 10, 11, q0s16, q1s16)
+ q2s16 = vaddq_s16(q12s16, q1s16);
+ q3s16 = vaddq_s16(q13s16, q0s16);
+ q4s16 = vsubq_s16(q13s16, q0s16);
+ q5s16 = vsubq_s16(q12s16, q1s16);
+
+ LOAD_FROM_OUTPUT(11, 20, 21, q0s16, q1s16)
+ q8s16 = vaddq_s16(q4s16, q1s16);
+ q9s16 = vaddq_s16(q5s16, q0s16);
+ q6s16 = vsubq_s16(q5s16, q0s16);
+ q7s16 = vsubq_s16(q4s16, q1s16);
+ STORE_COMBINE_CENTER_RESULTS(r10, r9);
+ r10 += str2; r9 -= str2;
+
+ LOAD_FROM_OUTPUT(21, 26, 27, q0s16, q1s16)
+ q4s16 = vaddq_s16(q2s16, q1s16);
+ q5s16 = vaddq_s16(q3s16, q0s16);
+ q6s16 = vsubq_s16(q3s16, q0s16);
+ q7s16 = vsubq_s16(q2s16, q1s16);
+ STORE_COMBINE_EXTREME_RESULTS(r7, r6);
+ r7 += str2; r6 -= str2;
+
+ LOAD_FROM_OUTPUT(27, 8, 9, q0s16, q1s16)
+ q2s16 = vaddq_s16(q14s16, q1s16);
+ q3s16 = vaddq_s16(q15s16, q0s16);
+ q4s16 = vsubq_s16(q15s16, q0s16);
+ q5s16 = vsubq_s16(q14s16, q1s16);
+
+ LOAD_FROM_OUTPUT(9, 22, 23, q0s16, q1s16)
+ q8s16 = vaddq_s16(q4s16, q1s16);
+ q9s16 = vaddq_s16(q5s16, q0s16);
+ q6s16 = vsubq_s16(q5s16, q0s16);
+ q7s16 = vsubq_s16(q4s16, q1s16);
+ STORE_COMBINE_CENTER_RESULTS(r10, r9);
+
+ LOAD_FROM_OUTPUT(23, 24, 25, q0s16, q1s16)
+ q4s16 = vaddq_s16(q2s16, q1s16);
+ q5s16 = vaddq_s16(q3s16, q0s16);
+ q6s16 = vsubq_s16(q3s16, q0s16);
+ q7s16 = vsubq_s16(q2s16, q1s16);
+ STORE_COMBINE_EXTREME_RESULTS(r7, r6);
+ return;
+}
+
+void vp9_idct32x32_1024_add_neon(
+ int16_t *input,
+ uint8_t *dest,
+ int stride) {
+ int i, idct32_pass_loop;
+ int16_t trans_buf[32 * 8];
+ int16_t pass1[32 * 32];
+ int16_t pass2[32 * 32];
+ int16_t *out;
+ int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
+ int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
+
+ for (idct32_pass_loop = 0, out = pass1;
+ idct32_pass_loop < 2;
+ idct32_pass_loop++,
+ input = pass1, // the input of pass2 is the result of pass1
+ out = pass2) {
+ for (i = 0;
+ i < 4; i++,
+ input += 32 * 8, out += 8) { // idct32_bands_loop
+ idct32_transpose_pair(input, trans_buf);
+
+ // -----------------------------------------
+ // BLOCK A: 16-19,28-31
+ // -----------------------------------------
+ // generate 16,17,30,31
+ // part of stage 1
+ LOAD_FROM_TRANSPOSED(0, 1, 31)
+ DO_BUTTERFLY_STD(cospi_31_64, cospi_1_64, &q0s16, &q2s16)
+ LOAD_FROM_TRANSPOSED(31, 17, 15)
+ DO_BUTTERFLY_STD(cospi_15_64, cospi_17_64, &q1s16, &q3s16)
+ // part of stage 2
+ q4s16 = vaddq_s16(q0s16, q1s16);
+ q13s16 = vsubq_s16(q0s16, q1s16);
+ q6s16 = vaddq_s16(q2s16, q3s16);
+ q14s16 = vsubq_s16(q2s16, q3s16);
+ // part of stage 3
+ DO_BUTTERFLY_STD(cospi_28_64, cospi_4_64, &q5s16, &q7s16)
+
+ // generate 18,19,28,29
+ // part of stage 1
+ LOAD_FROM_TRANSPOSED(15, 9, 23)
+ DO_BUTTERFLY_STD(cospi_23_64, cospi_9_64, &q0s16, &q2s16)
+ LOAD_FROM_TRANSPOSED(23, 25, 7)
+ DO_BUTTERFLY_STD(cospi_7_64, cospi_25_64, &q1s16, &q3s16)
+ // part of stage 2
+ q13s16 = vsubq_s16(q3s16, q2s16);
+ q3s16 = vaddq_s16(q3s16, q2s16);
+ q14s16 = vsubq_s16(q1s16, q0s16);
+ q2s16 = vaddq_s16(q1s16, q0s16);
+ // part of stage 3
+ DO_BUTTERFLY_STD(-cospi_4_64, -cospi_28_64, &q1s16, &q0s16)
+ // part of stage 4
+ q8s16 = vaddq_s16(q4s16, q2s16);
+ q9s16 = vaddq_s16(q5s16, q0s16);
+ q10s16 = vaddq_s16(q7s16, q1s16);
+ q15s16 = vaddq_s16(q6s16, q3s16);
+ q13s16 = vsubq_s16(q5s16, q0s16);
+ q14s16 = vsubq_s16(q7s16, q1s16);
+ STORE_IN_OUTPUT(0, 16, 31, q8s16, q15s16)
+ STORE_IN_OUTPUT(31, 17, 30, q9s16, q10s16)
+ // part of stage 5
+ DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q0s16, &q1s16)
+ STORE_IN_OUTPUT(30, 29, 18, q1s16, q0s16)
+ // part of stage 4
+ q13s16 = vsubq_s16(q4s16, q2s16);
+ q14s16 = vsubq_s16(q6s16, q3s16);
+ // part of stage 5
+ DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q4s16, &q6s16)
+ STORE_IN_OUTPUT(18, 19, 28, q4s16, q6s16)
+
+ // -----------------------------------------
+ // BLOCK B: 20-23,24-27
+ // -----------------------------------------
+ // generate 20,21,26,27
+ // part of stage 1
+ LOAD_FROM_TRANSPOSED(7, 5, 27)
+ DO_BUTTERFLY_STD(cospi_27_64, cospi_5_64, &q0s16, &q2s16)
+ LOAD_FROM_TRANSPOSED(27, 21, 11)
+ DO_BUTTERFLY_STD(cospi_11_64, cospi_21_64, &q1s16, &q3s16)
+ // part of stage 2
+ q13s16 = vsubq_s16(q0s16, q1s16);
+ q0s16 = vaddq_s16(q0s16, q1s16);
+ q14s16 = vsubq_s16(q2s16, q3s16);
+ q2s16 = vaddq_s16(q2s16, q3s16);
+ // part of stage 3
+ DO_BUTTERFLY_STD(cospi_12_64, cospi_20_64, &q1s16, &q3s16)
+
+ // generate 22,23,24,25
+ // part of stage 1
+ LOAD_FROM_TRANSPOSED(11, 13, 19)
+ DO_BUTTERFLY_STD(cospi_19_64, cospi_13_64, &q5s16, &q7s16)
+ LOAD_FROM_TRANSPOSED(19, 29, 3)
+ DO_BUTTERFLY_STD(cospi_3_64, cospi_29_64, &q4s16, &q6s16)
+ // part of stage 2
+ q14s16 = vsubq_s16(q4s16, q5s16);
+ q5s16 = vaddq_s16(q4s16, q5s16);
+ q13s16 = vsubq_s16(q6s16, q7s16);
+ q6s16 = vaddq_s16(q6s16, q7s16);
+ // part of stage 3
+ DO_BUTTERFLY_STD(-cospi_20_64, -cospi_12_64, &q4s16, &q7s16)
+ // part of stage 4
+ q10s16 = vaddq_s16(q7s16, q1s16);
+ q11s16 = vaddq_s16(q5s16, q0s16);
+ q12s16 = vaddq_s16(q6s16, q2s16);
+ q15s16 = vaddq_s16(q4s16, q3s16);
+ // part of stage 6
+ LOAD_FROM_OUTPUT(28, 16, 17, q14s16, q13s16)
+ q8s16 = vaddq_s16(q14s16, q11s16);
+ q9s16 = vaddq_s16(q13s16, q10s16);
+ q13s16 = vsubq_s16(q13s16, q10s16);
+ q11s16 = vsubq_s16(q14s16, q11s16);
+ STORE_IN_OUTPUT(17, 17, 16, q9s16, q8s16)
+ LOAD_FROM_OUTPUT(16, 30, 31, q14s16, q9s16)
+ q8s16 = vsubq_s16(q9s16, q12s16);
+ q10s16 = vaddq_s16(q14s16, q15s16);
+ q14s16 = vsubq_s16(q14s16, q15s16);
+ q12s16 = vaddq_s16(q9s16, q12s16);
+ STORE_IN_OUTPUT(31, 30, 31, q10s16, q12s16)
+ // part of stage 7
+ DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16)
+ STORE_IN_OUTPUT(31, 25, 22, q14s16, q13s16)
+ q13s16 = q11s16;
+ q14s16 = q8s16;
+ DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16)
+ STORE_IN_OUTPUT(22, 24, 23, q14s16, q13s16)
+ // part of stage 4
+ q14s16 = vsubq_s16(q5s16, q0s16);
+ q13s16 = vsubq_s16(q6s16, q2s16);
+ DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q5s16, &q6s16);
+ q14s16 = vsubq_s16(q7s16, q1s16);
+ q13s16 = vsubq_s16(q4s16, q3s16);
+ DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q0s16, &q1s16);
+ // part of stage 6
+ LOAD_FROM_OUTPUT(23, 18, 19, q14s16, q13s16)
+ q8s16 = vaddq_s16(q14s16, q1s16);
+ q9s16 = vaddq_s16(q13s16, q6s16);
+ q13s16 = vsubq_s16(q13s16, q6s16);
+ q1s16 = vsubq_s16(q14s16, q1s16);
+ STORE_IN_OUTPUT(19, 18, 19, q8s16, q9s16)
+ LOAD_FROM_OUTPUT(19, 28, 29, q8s16, q9s16)
+ q14s16 = vsubq_s16(q8s16, q5s16);
+ q10s16 = vaddq_s16(q8s16, q5s16);
+ q11s16 = vaddq_s16(q9s16, q0s16);
+ q0s16 = vsubq_s16(q9s16, q0s16);
+ STORE_IN_OUTPUT(29, 28, 29, q10s16, q11s16)
+ // part of stage 7
+ DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16)
+ STORE_IN_OUTPUT(29, 20, 27, q13s16, q14s16)
+ DO_BUTTERFLY(q0s16, q1s16, cospi_16_64, cospi_16_64,
+ &q1s16, &q0s16);
+ STORE_IN_OUTPUT(27, 21, 26, q1s16, q0s16)
+
+ // -----------------------------------------
+ // BLOCK C: 8-10,11-15
+ // -----------------------------------------
+ // generate 8,9,14,15
+ // part of stage 2
+ LOAD_FROM_TRANSPOSED(3, 2, 30)
+ DO_BUTTERFLY_STD(cospi_30_64, cospi_2_64, &q0s16, &q2s16)
+ LOAD_FROM_TRANSPOSED(30, 18, 14)
+ DO_BUTTERFLY_STD(cospi_14_64, cospi_18_64, &q1s16, &q3s16)
+ // part of stage 3
+ q13s16 = vsubq_s16(q0s16, q1s16);
+ q0s16 = vaddq_s16(q0s16, q1s16);
+ q14s16 = vsubq_s16(q2s16, q3s16);
+ q2s16 = vaddq_s16(q2s16, q3s16);
+ // part of stage 4
+ DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q1s16, &q3s16)
+
+ // generate 10,11,12,13
+ // part of stage 2
+ LOAD_FROM_TRANSPOSED(14, 10, 22)
+ DO_BUTTERFLY_STD(cospi_22_64, cospi_10_64, &q5s16, &q7s16)
+ LOAD_FROM_TRANSPOSED(22, 26, 6)
+ DO_BUTTERFLY_STD(cospi_6_64, cospi_26_64, &q4s16, &q6s16)
+ // part of stage 3
+ q14s16 = vsubq_s16(q4s16, q5s16);
+ q5s16 = vaddq_s16(q4s16, q5s16);
+ q13s16 = vsubq_s16(q6s16, q7s16);
+ q6s16 = vaddq_s16(q6s16, q7s16);
+ // part of stage 4
+ DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q4s16, &q7s16)
+ // part of stage 5
+ q8s16 = vaddq_s16(q0s16, q5s16);
+ q9s16 = vaddq_s16(q1s16, q7s16);
+ q13s16 = vsubq_s16(q1s16, q7s16);
+ q14s16 = vsubq_s16(q3s16, q4s16);
+ q10s16 = vaddq_s16(q3s16, q4s16);
+ q15s16 = vaddq_s16(q2s16, q6s16);
+ STORE_IN_OUTPUT(26, 8, 15, q8s16, q15s16)
+ STORE_IN_OUTPUT(15, 9, 14, q9s16, q10s16)
+ // part of stage 6
+ DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16)
+ STORE_IN_OUTPUT(14, 13, 10, q3s16, q1s16)
+ q13s16 = vsubq_s16(q0s16, q5s16);
+ q14s16 = vsubq_s16(q2s16, q6s16);
+ DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16)
+ STORE_IN_OUTPUT(10, 11, 12, q1s16, q3s16)
+
+ // -----------------------------------------
+ // BLOCK D: 0-3,4-7
+ // -----------------------------------------
+ // generate 4,5,6,7
+ // part of stage 3
+ LOAD_FROM_TRANSPOSED(6, 4, 28)
+ DO_BUTTERFLY_STD(cospi_28_64, cospi_4_64, &q0s16, &q2s16)
+ LOAD_FROM_TRANSPOSED(28, 20, 12)
+ DO_BUTTERFLY_STD(cospi_12_64, cospi_20_64, &q1s16, &q3s16)
+ // part of stage 4
+ q13s16 = vsubq_s16(q0s16, q1s16);
+ q0s16 = vaddq_s16(q0s16, q1s16);
+ q14s16 = vsubq_s16(q2s16, q3s16);
+ q2s16 = vaddq_s16(q2s16, q3s16);
+ // part of stage 5
+ DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16)
+
+ // generate 0,1,2,3
+ // part of stage 4
+ LOAD_FROM_TRANSPOSED(12, 0, 16)
+ DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q5s16, &q7s16)
+ LOAD_FROM_TRANSPOSED(16, 8, 24)
+ DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q14s16, &q6s16)
+ // part of stage 5
+ q4s16 = vaddq_s16(q7s16, q6s16);
+ q7s16 = vsubq_s16(q7s16, q6s16);
+ q6s16 = vsubq_s16(q5s16, q14s16);
+ q5s16 = vaddq_s16(q5s16, q14s16);
+ // part of stage 6
+ q8s16 = vaddq_s16(q4s16, q2s16);
+ q9s16 = vaddq_s16(q5s16, q3s16);
+ q10s16 = vaddq_s16(q6s16, q1s16);
+ q11s16 = vaddq_s16(q7s16, q0s16);
+ q12s16 = vsubq_s16(q7s16, q0s16);
+ q13s16 = vsubq_s16(q6s16, q1s16);
+ q14s16 = vsubq_s16(q5s16, q3s16);
+ q15s16 = vsubq_s16(q4s16, q2s16);
+ // part of stage 7
+ LOAD_FROM_OUTPUT(12, 14, 15, q0s16, q1s16)
+ q2s16 = vaddq_s16(q8s16, q1s16);
+ q3s16 = vaddq_s16(q9s16, q0s16);
+ q4s16 = vsubq_s16(q9s16, q0s16);
+ q5s16 = vsubq_s16(q8s16, q1s16);
+ LOAD_FROM_OUTPUT(15, 16, 17, q0s16, q1s16)
+ q8s16 = vaddq_s16(q4s16, q1s16);
+ q9s16 = vaddq_s16(q5s16, q0s16);
+ q6s16 = vsubq_s16(q5s16, q0s16);
+ q7s16 = vsubq_s16(q4s16, q1s16);
+
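+    // The first pass appears to store intermediate rows back to the
+    // scratch buffer, while the second pass adds the final values into
+    // dest, advancing eight columns per iteration.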
+ if (idct32_pass_loop == 0) {
+ idct32_bands_end_1st_pass(out,
+ q2s16, q3s16, q6s16, q7s16, q8s16, q9s16,
+ q10s16, q11s16, q12s16, q13s16, q14s16, q15s16);
+ } else {
+ idct32_bands_end_2nd_pass(out, dest, stride,
+ q2s16, q3s16, q6s16, q7s16, q8s16, q9s16,
+ q10s16, q11s16, q12s16, q13s16, q14s16, q15s16);
+ dest += 8;
+ }
+ }
+ }
+ return;
+}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct32x32_add_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct32x32_add_neon_asm.asm
index 72e933eee96..72e933eee96 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct32x32_add_neon.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct32x32_add_neon_asm.asm
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct4x4_1_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct4x4_1_add_neon.c
new file mode 100644
index 00000000000..7c8a930b645
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct4x4_1_add_neon.c
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include "vp9/common/vp9_idct.h"
+
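+// DC-only inverse 4x4 transform: the single DC coefficient is passed
+// through dct_const_round_shift() twice (the row and column passes
+// collapse to two scalings by cospi_16_64), rounded by
+// ROUND_POWER_OF_TWO(out, 4) and added to every pixel of the block.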
+void vp9_idct4x4_1_add_neon(
+ int16_t *input,
+ uint8_t *dest,
+ int dest_stride) {
+ uint8x8_t d6u8;
+ uint32x2_t d2u32 = vdup_n_u32(0);
+ uint16x8_t q8u16;
+ int16x8_t q0s16;
+ uint8_t *d1, *d2;
+ int16_t i, a1, cospi_16_64 = 11585;
+ int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
+ out = dct_const_round_shift(out * cospi_16_64);
+ a1 = ROUND_POWER_OF_TWO(out, 4);
+
+ q0s16 = vdupq_n_s16(a1);
+
+ // dc_only_idct_add
+ d1 = d2 = dest;
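+    // Each loop iteration handles two rows: two 4-pixel rows are loaded
+    // into the lanes of d2u32, widened and offset by a1, then saturated
+    // back to u8 and stored.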
+ for (i = 0; i < 2; i++) {
+ d2u32 = vld1_lane_u32((const uint32_t *)d1, d2u32, 0);
+ d1 += dest_stride;
+ d2u32 = vld1_lane_u32((const uint32_t *)d1, d2u32, 1);
+ d1 += dest_stride;
+
+ q8u16 = vaddw_u8(vreinterpretq_u16_s16(q0s16),
+ vreinterpret_u8_u32(d2u32));
+ d6u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
+
+ vst1_lane_u32((uint32_t *)d2, vreinterpret_u32_u8(d6u8), 0);
+ d2 += dest_stride;
+ vst1_lane_u32((uint32_t *)d2, vreinterpret_u32_u8(d6u8), 1);
+ d2 += dest_stride;
+ }
+ return;
+}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct4x4_1_add_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct4x4_1_add_neon_asm.asm
index 0d4a721c4d3..0d4a721c4d3 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct4x4_1_add_neon.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct4x4_1_add_neon_asm.asm
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct4x4_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct4x4_add_neon.c
new file mode 100644
index 00000000000..dc91e0f3027
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct4x4_add_neon.c
@@ -0,0 +1,151 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+void vp9_idct4x4_16_add_neon(
+ int16_t *input,
+ uint8_t *dest,
+ int dest_stride) {
+ uint8x8_t d26u8, d27u8;
+ uint32x2_t d26u32, d27u32;
+ uint16x8_t q8u16, q9u16;
+ int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16;
+ int16x4_t d22s16, d23s16, d24s16, d26s16, d27s16, d28s16, d29s16;
+ int16x8_t q8s16, q9s16, q13s16, q14s16;
+ int32x4_t q1s32, q13s32, q14s32, q15s32;
+ int16x4x2_t d0x2s16, d1x2s16;
+ int32x4x2_t q0x2s32;
+ uint8_t *d;
+ int16_t cospi_8_64 = 15137;
+ int16_t cospi_16_64 = 11585;
+ int16_t cospi_24_64 = 6270;
+
+ d26u32 = d27u32 = vdup_n_u32(0);
+
+ q8s16 = vld1q_s16(input);
+ q9s16 = vld1q_s16(input + 8);
+
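+    // Transpose the 4x4 block: vtrn_s16 interleaves the 16-bit pairs,
+    // then vtrnq_s32 below interleaves the 32-bit pairs; the same
+    // sequence is repeated between the two 1D passes.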
+ d16s16 = vget_low_s16(q8s16);
+ d17s16 = vget_high_s16(q8s16);
+ d18s16 = vget_low_s16(q9s16);
+ d19s16 = vget_high_s16(q9s16);
+
+ d0x2s16 = vtrn_s16(d16s16, d17s16);
+ d1x2s16 = vtrn_s16(d18s16, d19s16);
+ q8s16 = vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]);
+ q9s16 = vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]);
+
+ d20s16 = vdup_n_s16(cospi_8_64);
+ d21s16 = vdup_n_s16(cospi_16_64);
+
+ q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q8s16),
+ vreinterpretq_s32_s16(q9s16));
+ d16s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[0]));
+ d17s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[0]));
+ d18s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[1]));
+ d19s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[1]));
+
+ d22s16 = vdup_n_s16(cospi_24_64);
+
+ // stage 1
+ d23s16 = vadd_s16(d16s16, d18s16);
+ d24s16 = vsub_s16(d16s16, d18s16);
+
+ q15s32 = vmull_s16(d17s16, d22s16);
+ q1s32 = vmull_s16(d17s16, d20s16);
+ q13s32 = vmull_s16(d23s16, d21s16);
+ q14s32 = vmull_s16(d24s16, d21s16);
+
+ q15s32 = vmlsl_s16(q15s32, d19s16, d20s16);
+ q1s32 = vmlal_s16(q1s32, d19s16, d22s16);
+
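+    // vqrshrn_n_s32(x, 14) implements dct_const_round_shift():
+    // (x + (1 << 13)) >> 14, narrowed to 16 bits with saturation.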
+ d26s16 = vqrshrn_n_s32(q13s32, 14);
+ d27s16 = vqrshrn_n_s32(q14s32, 14);
+ d29s16 = vqrshrn_n_s32(q15s32, 14);
+ d28s16 = vqrshrn_n_s32(q1s32, 14);
+ q13s16 = vcombine_s16(d26s16, d27s16);
+ q14s16 = vcombine_s16(d28s16, d29s16);
+
+ // stage 2
+ q8s16 = vaddq_s16(q13s16, q14s16);
+ q9s16 = vsubq_s16(q13s16, q14s16);
+
+ d16s16 = vget_low_s16(q8s16);
+ d17s16 = vget_high_s16(q8s16);
+ d18s16 = vget_high_s16(q9s16); // vswp d18 d19
+ d19s16 = vget_low_s16(q9s16);
+
+ d0x2s16 = vtrn_s16(d16s16, d17s16);
+ d1x2s16 = vtrn_s16(d18s16, d19s16);
+ q8s16 = vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]);
+ q9s16 = vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]);
+
+ q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q8s16),
+ vreinterpretq_s32_s16(q9s16));
+ d16s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[0]));
+ d17s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[0]));
+ d18s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[1]));
+ d19s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[1]));
+
+ // do the transform on columns
+ // stage 1
+ d23s16 = vadd_s16(d16s16, d18s16);
+ d24s16 = vsub_s16(d16s16, d18s16);
+
+ q15s32 = vmull_s16(d17s16, d22s16);
+ q1s32 = vmull_s16(d17s16, d20s16);
+ q13s32 = vmull_s16(d23s16, d21s16);
+ q14s32 = vmull_s16(d24s16, d21s16);
+
+ q15s32 = vmlsl_s16(q15s32, d19s16, d20s16);
+ q1s32 = vmlal_s16(q1s32, d19s16, d22s16);
+
+ d26s16 = vqrshrn_n_s32(q13s32, 14);
+ d27s16 = vqrshrn_n_s32(q14s32, 14);
+ d29s16 = vqrshrn_n_s32(q15s32, 14);
+ d28s16 = vqrshrn_n_s32(q1s32, 14);
+ q13s16 = vcombine_s16(d26s16, d27s16);
+ q14s16 = vcombine_s16(d28s16, d29s16);
+
+ // stage 2
+ q8s16 = vaddq_s16(q13s16, q14s16);
+ q9s16 = vsubq_s16(q13s16, q14s16);
+
+ q8s16 = vrshrq_n_s16(q8s16, 4);
+ q9s16 = vrshrq_n_s16(q9s16, 4);
+
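+    // Rows 2 and 3 are still swapped from the vswp above, so the third
+    // destination row goes through lane 1 of d27u32 and the fourth
+    // through lane 0, for both the loads and the stores below.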
+ d = dest;
+ d26u32 = vld1_lane_u32((const uint32_t *)d, d26u32, 0);
+ d += dest_stride;
+ d26u32 = vld1_lane_u32((const uint32_t *)d, d26u32, 1);
+ d += dest_stride;
+ d27u32 = vld1_lane_u32((const uint32_t *)d, d27u32, 1);
+ d += dest_stride;
+ d27u32 = vld1_lane_u32((const uint32_t *)d, d27u32, 0);
+
+ q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16),
+ vreinterpret_u8_u32(d26u32));
+ q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16),
+ vreinterpret_u8_u32(d27u32));
+
+ d26u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
+ d27u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+
+ d = dest;
+ vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d26u8), 0);
+ d += dest_stride;
+ vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d26u8), 1);
+ d += dest_stride;
+ vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d27u8), 1);
+ d += dest_stride;
+ vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d27u8), 0);
+ return;
+}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct4x4_add_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct4x4_add_neon_asm.asm
index 00283fc8d78..00283fc8d78 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct4x4_add_neon.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct4x4_add_neon_asm.asm
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct8x8_1_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct8x8_1_add_neon.c
new file mode 100644
index 00000000000..24c29fb77f6
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct8x8_1_add_neon.c
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include "vp9/common/vp9_idct.h"
+
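+// DC-only inverse 8x8 transform: input[0] is passed through
+// dct_const_round_shift() twice, rounded by ROUND_POWER_OF_TWO(out, 5),
+// and the resulting constant a1 is added to all 64 destination pixels.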
+void vp9_idct8x8_1_add_neon(
+ int16_t *input,
+ uint8_t *dest,
+ int dest_stride) {
+ uint8x8_t d2u8, d3u8, d30u8, d31u8;
+ uint64x1_t d2u64, d3u64, d4u64, d5u64;
+ uint16x8_t q0u16, q9u16, q10u16, q11u16, q12u16;
+ int16x8_t q0s16;
+ uint8_t *d1, *d2;
+ int16_t i, a1, cospi_16_64 = 11585;
+ int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
+ out = dct_const_round_shift(out * cospi_16_64);
+ a1 = ROUND_POWER_OF_TWO(out, 5);
+
+ q0s16 = vdupq_n_s16(a1);
+ q0u16 = vreinterpretq_u16_s16(q0s16);
+
+ d1 = d2 = dest;
+ for (i = 0; i < 2; i++) {
+ d2u64 = vld1_u64((const uint64_t *)d1);
+ d1 += dest_stride;
+ d3u64 = vld1_u64((const uint64_t *)d1);
+ d1 += dest_stride;
+ d4u64 = vld1_u64((const uint64_t *)d1);
+ d1 += dest_stride;
+ d5u64 = vld1_u64((const uint64_t *)d1);
+ d1 += dest_stride;
+
+ q9u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d2u64));
+ q10u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d3u64));
+ q11u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d4u64));
+ q12u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d5u64));
+
+ d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+ d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
+ d30u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
+ d31u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16));
+
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d30u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d31u8));
+ d2 += dest_stride;
+ }
+ return;
+}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct8x8_1_add_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct8x8_1_add_neon_asm.asm
index 421d202d403..421d202d403 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct8x8_1_add_neon.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct8x8_1_add_neon_asm.asm
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct8x8_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct8x8_add_neon.c
new file mode 100644
index 00000000000..2b3c1ce6065
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct8x8_add_neon.c
@@ -0,0 +1,547 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+
+static int16_t cospi_4_64 = 16069;
+static int16_t cospi_8_64 = 15137;
+static int16_t cospi_12_64 = 13623;
+static int16_t cospi_16_64 = 11585;
+static int16_t cospi_20_64 = 9102;
+static int16_t cospi_24_64 = 6270;
+static int16_t cospi_28_64 = 3196;
+
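+// 8x8 transpose using the same scheme as the assembly TRANSPOSE8X8
+// macro: swap the high/low d-register halves (the vswp steps, done here
+// with vcombine_s16), then interleave 32-bit and 16-bit elements with
+// vtrnq_s32/vtrnq_s16.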
+static INLINE void TRANSPOSE8X8(
+ int16x8_t *q8s16,
+ int16x8_t *q9s16,
+ int16x8_t *q10s16,
+ int16x8_t *q11s16,
+ int16x8_t *q12s16,
+ int16x8_t *q13s16,
+ int16x8_t *q14s16,
+ int16x8_t *q15s16) {
+ int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
+ int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
+ int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32;
+ int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16;
+
+ d16s16 = vget_low_s16(*q8s16);
+ d17s16 = vget_high_s16(*q8s16);
+ d18s16 = vget_low_s16(*q9s16);
+ d19s16 = vget_high_s16(*q9s16);
+ d20s16 = vget_low_s16(*q10s16);
+ d21s16 = vget_high_s16(*q10s16);
+ d22s16 = vget_low_s16(*q11s16);
+ d23s16 = vget_high_s16(*q11s16);
+ d24s16 = vget_low_s16(*q12s16);
+ d25s16 = vget_high_s16(*q12s16);
+ d26s16 = vget_low_s16(*q13s16);
+ d27s16 = vget_high_s16(*q13s16);
+ d28s16 = vget_low_s16(*q14s16);
+ d29s16 = vget_high_s16(*q14s16);
+ d30s16 = vget_low_s16(*q15s16);
+ d31s16 = vget_high_s16(*q15s16);
+
+ *q8s16 = vcombine_s16(d16s16, d24s16); // vswp d17, d24
+ *q9s16 = vcombine_s16(d18s16, d26s16); // vswp d19, d26
+ *q10s16 = vcombine_s16(d20s16, d28s16); // vswp d21, d28
+ *q11s16 = vcombine_s16(d22s16, d30s16); // vswp d23, d30
+ *q12s16 = vcombine_s16(d17s16, d25s16);
+ *q13s16 = vcombine_s16(d19s16, d27s16);
+ *q14s16 = vcombine_s16(d21s16, d29s16);
+ *q15s16 = vcombine_s16(d23s16, d31s16);
+
+ q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q8s16),
+ vreinterpretq_s32_s16(*q10s16));
+ q1x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q9s16),
+ vreinterpretq_s32_s16(*q11s16));
+ q2x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q12s16),
+ vreinterpretq_s32_s16(*q14s16));
+ q3x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q13s16),
+ vreinterpretq_s32_s16(*q15s16));
+
+ q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]), // q8
+ vreinterpretq_s16_s32(q1x2s32.val[0])); // q9
+ q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]), // q10
+ vreinterpretq_s16_s32(q1x2s32.val[1])); // q11
+ q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]), // q12
+ vreinterpretq_s16_s32(q3x2s32.val[0])); // q13
+ q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]), // q14
+ vreinterpretq_s16_s32(q3x2s32.val[1])); // q15
+
+ *q8s16 = q0x2s16.val[0];
+ *q9s16 = q0x2s16.val[1];
+ *q10s16 = q1x2s16.val[0];
+ *q11s16 = q1x2s16.val[1];
+ *q12s16 = q2x2s16.val[0];
+ *q13s16 = q2x2s16.val[1];
+ *q14s16 = q3x2s16.val[0];
+ *q15s16 = q3x2s16.val[1];
+ return;
+}
+
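+// One parallel 1D 8-point IDCT across q8-q15: stage 1 computes the
+// odd-half butterflies, stages 2 and 3 the even half, and stage 4
+// recombines, mirroring the assembly IDCT8x8_1D macro.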
+static INLINE void IDCT8x8_1D(
+ int16x8_t *q8s16,
+ int16x8_t *q9s16,
+ int16x8_t *q10s16,
+ int16x8_t *q11s16,
+ int16x8_t *q12s16,
+ int16x8_t *q13s16,
+ int16x8_t *q14s16,
+ int16x8_t *q15s16) {
+ int16x4_t d0s16, d1s16, d2s16, d3s16;
+ int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
+ int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
+ int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
+ int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
+ int32x4_t q2s32, q3s32, q5s32, q6s32, q8s32, q9s32;
+ int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32;
+
+ d0s16 = vdup_n_s16(cospi_28_64);
+ d1s16 = vdup_n_s16(cospi_4_64);
+ d2s16 = vdup_n_s16(cospi_12_64);
+ d3s16 = vdup_n_s16(cospi_20_64);
+
+ d16s16 = vget_low_s16(*q8s16);
+ d17s16 = vget_high_s16(*q8s16);
+ d18s16 = vget_low_s16(*q9s16);
+ d19s16 = vget_high_s16(*q9s16);
+ d20s16 = vget_low_s16(*q10s16);
+ d21s16 = vget_high_s16(*q10s16);
+ d22s16 = vget_low_s16(*q11s16);
+ d23s16 = vget_high_s16(*q11s16);
+ d24s16 = vget_low_s16(*q12s16);
+ d25s16 = vget_high_s16(*q12s16);
+ d26s16 = vget_low_s16(*q13s16);
+ d27s16 = vget_high_s16(*q13s16);
+ d28s16 = vget_low_s16(*q14s16);
+ d29s16 = vget_high_s16(*q14s16);
+ d30s16 = vget_low_s16(*q15s16);
+ d31s16 = vget_high_s16(*q15s16);
+
+ q2s32 = vmull_s16(d18s16, d0s16);
+ q3s32 = vmull_s16(d19s16, d0s16);
+ q5s32 = vmull_s16(d26s16, d2s16);
+ q6s32 = vmull_s16(d27s16, d2s16);
+
+ q2s32 = vmlsl_s16(q2s32, d30s16, d1s16);
+ q3s32 = vmlsl_s16(q3s32, d31s16, d1s16);
+ q5s32 = vmlsl_s16(q5s32, d22s16, d3s16);
+ q6s32 = vmlsl_s16(q6s32, d23s16, d3s16);
+
+ d8s16 = vqrshrn_n_s32(q2s32, 14);
+ d9s16 = vqrshrn_n_s32(q3s32, 14);
+ d10s16 = vqrshrn_n_s32(q5s32, 14);
+ d11s16 = vqrshrn_n_s32(q6s32, 14);
+ q4s16 = vcombine_s16(d8s16, d9s16);
+ q5s16 = vcombine_s16(d10s16, d11s16);
+
+ q2s32 = vmull_s16(d18s16, d1s16);
+ q3s32 = vmull_s16(d19s16, d1s16);
+ q9s32 = vmull_s16(d26s16, d3s16);
+ q13s32 = vmull_s16(d27s16, d3s16);
+
+ q2s32 = vmlal_s16(q2s32, d30s16, d0s16);
+ q3s32 = vmlal_s16(q3s32, d31s16, d0s16);
+ q9s32 = vmlal_s16(q9s32, d22s16, d2s16);
+ q13s32 = vmlal_s16(q13s32, d23s16, d2s16);
+
+ d14s16 = vqrshrn_n_s32(q2s32, 14);
+ d15s16 = vqrshrn_n_s32(q3s32, 14);
+ d12s16 = vqrshrn_n_s32(q9s32, 14);
+ d13s16 = vqrshrn_n_s32(q13s32, 14);
+ q6s16 = vcombine_s16(d12s16, d13s16);
+ q7s16 = vcombine_s16(d14s16, d15s16);
+
+ d0s16 = vdup_n_s16(cospi_16_64);
+
+ q2s32 = vmull_s16(d16s16, d0s16);
+ q3s32 = vmull_s16(d17s16, d0s16);
+ q13s32 = vmull_s16(d16s16, d0s16);
+ q15s32 = vmull_s16(d17s16, d0s16);
+
+ q2s32 = vmlal_s16(q2s32, d24s16, d0s16);
+ q3s32 = vmlal_s16(q3s32, d25s16, d0s16);
+ q13s32 = vmlsl_s16(q13s32, d24s16, d0s16);
+ q15s32 = vmlsl_s16(q15s32, d25s16, d0s16);
+
+ d0s16 = vdup_n_s16(cospi_24_64);
+ d1s16 = vdup_n_s16(cospi_8_64);
+
+ d18s16 = vqrshrn_n_s32(q2s32, 14);
+ d19s16 = vqrshrn_n_s32(q3s32, 14);
+ d22s16 = vqrshrn_n_s32(q13s32, 14);
+ d23s16 = vqrshrn_n_s32(q15s32, 14);
+ *q9s16 = vcombine_s16(d18s16, d19s16);
+ *q11s16 = vcombine_s16(d22s16, d23s16);
+
+ q2s32 = vmull_s16(d20s16, d0s16);
+ q3s32 = vmull_s16(d21s16, d0s16);
+ q8s32 = vmull_s16(d20s16, d1s16);
+ q12s32 = vmull_s16(d21s16, d1s16);
+
+ q2s32 = vmlsl_s16(q2s32, d28s16, d1s16);
+ q3s32 = vmlsl_s16(q3s32, d29s16, d1s16);
+ q8s32 = vmlal_s16(q8s32, d28s16, d0s16);
+ q12s32 = vmlal_s16(q12s32, d29s16, d0s16);
+
+ d26s16 = vqrshrn_n_s32(q2s32, 14);
+ d27s16 = vqrshrn_n_s32(q3s32, 14);
+ d30s16 = vqrshrn_n_s32(q8s32, 14);
+ d31s16 = vqrshrn_n_s32(q12s32, 14);
+ *q13s16 = vcombine_s16(d26s16, d27s16);
+ *q15s16 = vcombine_s16(d30s16, d31s16);
+
+ q0s16 = vaddq_s16(*q9s16, *q15s16);
+ q1s16 = vaddq_s16(*q11s16, *q13s16);
+ q2s16 = vsubq_s16(*q11s16, *q13s16);
+ q3s16 = vsubq_s16(*q9s16, *q15s16);
+
+ *q13s16 = vsubq_s16(q4s16, q5s16);
+ q4s16 = vaddq_s16(q4s16, q5s16);
+ *q14s16 = vsubq_s16(q7s16, q6s16);
+ q7s16 = vaddq_s16(q7s16, q6s16);
+ d26s16 = vget_low_s16(*q13s16);
+ d27s16 = vget_high_s16(*q13s16);
+ d28s16 = vget_low_s16(*q14s16);
+ d29s16 = vget_high_s16(*q14s16);
+
+ d16s16 = vdup_n_s16(cospi_16_64);
+
+ q9s32 = vmull_s16(d28s16, d16s16);
+ q10s32 = vmull_s16(d29s16, d16s16);
+ q11s32 = vmull_s16(d28s16, d16s16);
+ q12s32 = vmull_s16(d29s16, d16s16);
+
+ q9s32 = vmlsl_s16(q9s32, d26s16, d16s16);
+ q10s32 = vmlsl_s16(q10s32, d27s16, d16s16);
+ q11s32 = vmlal_s16(q11s32, d26s16, d16s16);
+ q12s32 = vmlal_s16(q12s32, d27s16, d16s16);
+
+ d10s16 = vqrshrn_n_s32(q9s32, 14);
+ d11s16 = vqrshrn_n_s32(q10s32, 14);
+ d12s16 = vqrshrn_n_s32(q11s32, 14);
+ d13s16 = vqrshrn_n_s32(q12s32, 14);
+ q5s16 = vcombine_s16(d10s16, d11s16);
+ q6s16 = vcombine_s16(d12s16, d13s16);
+
+ *q8s16 = vaddq_s16(q0s16, q7s16);
+ *q9s16 = vaddq_s16(q1s16, q6s16);
+ *q10s16 = vaddq_s16(q2s16, q5s16);
+ *q11s16 = vaddq_s16(q3s16, q4s16);
+ *q12s16 = vsubq_s16(q3s16, q4s16);
+ *q13s16 = vsubq_s16(q2s16, q5s16);
+ *q14s16 = vsubq_s16(q1s16, q6s16);
+ *q15s16 = vsubq_s16(q0s16, q7s16);
+ return;
+}
+
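+// Full 8x8 inverse DCT: two 1D passes separated by transposes, then
+// round by ROUND_POWER_OF_TWO(x, 5) and add the result to the
+// destination in two batches of four rows.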
+void vp9_idct8x8_64_add_neon(
+ int16_t *input,
+ uint8_t *dest,
+ int dest_stride) {
+ uint8_t *d1, *d2;
+ uint8x8_t d0u8, d1u8, d2u8, d3u8;
+ uint64x1_t d0u64, d1u64, d2u64, d3u64;
+ int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
+ uint16x8_t q8u16, q9u16, q10u16, q11u16;
+
+ q8s16 = vld1q_s16(input);
+ q9s16 = vld1q_s16(input + 8);
+ q10s16 = vld1q_s16(input + 16);
+ q11s16 = vld1q_s16(input + 24);
+ q12s16 = vld1q_s16(input + 32);
+ q13s16 = vld1q_s16(input + 40);
+ q14s16 = vld1q_s16(input + 48);
+ q15s16 = vld1q_s16(input + 56);
+
+ TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
+ &q12s16, &q13s16, &q14s16, &q15s16);
+
+ IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16,
+ &q12s16, &q13s16, &q14s16, &q15s16);
+
+ TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
+ &q12s16, &q13s16, &q14s16, &q15s16);
+
+ IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16,
+ &q12s16, &q13s16, &q14s16, &q15s16);
+
+ q8s16 = vrshrq_n_s16(q8s16, 5);
+ q9s16 = vrshrq_n_s16(q9s16, 5);
+ q10s16 = vrshrq_n_s16(q10s16, 5);
+ q11s16 = vrshrq_n_s16(q11s16, 5);
+ q12s16 = vrshrq_n_s16(q12s16, 5);
+ q13s16 = vrshrq_n_s16(q13s16, 5);
+ q14s16 = vrshrq_n_s16(q14s16, 5);
+ q15s16 = vrshrq_n_s16(q15s16, 5);
+
+ d1 = d2 = dest;
+
+ d0u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+ d1u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+ d2u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+ d3u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+
+ q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16),
+ vreinterpret_u8_u64(d0u64));
+ q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16),
+ vreinterpret_u8_u64(d1u64));
+ q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16),
+ vreinterpret_u8_u64(d2u64));
+ q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16),
+ vreinterpret_u8_u64(d3u64));
+
+ d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
+ d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+ d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
+ d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
+
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
+ d2 += dest_stride;
+
+ q8s16 = q12s16;
+ q9s16 = q13s16;
+ q10s16 = q14s16;
+ q11s16 = q15s16;
+
+ d0u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+ d1u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+ d2u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+ d3u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+
+ q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16),
+ vreinterpret_u8_u64(d0u64));
+ q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16),
+ vreinterpret_u8_u64(d1u64));
+ q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16),
+ vreinterpret_u8_u64(d2u64));
+ q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16),
+ vreinterpret_u8_u64(d3u64));
+
+ d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
+ d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+ d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
+ d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
+
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
+ d2 += dest_stride;
+ return;
+}
+
+void vp9_idct8x8_12_add_neon(
+ int16_t *input,
+ uint8_t *dest,
+ int dest_stride) {
+ uint8_t *d1, *d2;
+ uint8x8_t d0u8, d1u8, d2u8, d3u8;
+ int16x4_t d10s16, d11s16, d12s16, d13s16, d16s16;
+ int16x4_t d26s16, d27s16, d28s16, d29s16;
+ uint64x1_t d0u64, d1u64, d2u64, d3u64;
+ int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
+ int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
+ uint16x8_t q8u16, q9u16, q10u16, q11u16;
+ int32x4_t q9s32, q10s32, q11s32, q12s32;
+
+ q8s16 = vld1q_s16(input);
+ q9s16 = vld1q_s16(input + 8);
+ q10s16 = vld1q_s16(input + 16);
+ q11s16 = vld1q_s16(input + 24);
+ q12s16 = vld1q_s16(input + 32);
+ q13s16 = vld1q_s16(input + 40);
+ q14s16 = vld1q_s16(input + 48);
+ q15s16 = vld1q_s16(input + 56);
+
+ TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
+ &q12s16, &q13s16, &q14s16, &q15s16);
+
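+  // Only q8-q11 are read in this first pass; the remaining coefficients
+  // are assumed zero in the *_12 case. Each dct_const_round_shift() is
+  // folded into a single multiply:
+  // vqrdmulhq_s16(x, 2 * c) == (x * c + (1 << 13)) >> 14.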
+ // First transform rows
+ // stage 1
+ q0s16 = vdupq_n_s16(cospi_28_64 * 2);
+ q1s16 = vdupq_n_s16(cospi_4_64 * 2);
+
+ q4s16 = vqrdmulhq_s16(q9s16, q0s16);
+
+ q0s16 = vdupq_n_s16(-cospi_20_64 * 2);
+
+ q7s16 = vqrdmulhq_s16(q9s16, q1s16);
+
+ q1s16 = vdupq_n_s16(cospi_12_64 * 2);
+
+ q5s16 = vqrdmulhq_s16(q11s16, q0s16);
+
+ q0s16 = vdupq_n_s16(cospi_16_64 * 2);
+
+ q6s16 = vqrdmulhq_s16(q11s16, q1s16);
+
+ // stage 2 & stage 3 - even half
+ q1s16 = vdupq_n_s16(cospi_24_64 * 2);
+
+ q9s16 = vqrdmulhq_s16(q8s16, q0s16);
+
+ q0s16 = vdupq_n_s16(cospi_8_64 * 2);
+
+ q13s16 = vqrdmulhq_s16(q10s16, q1s16);
+
+ q15s16 = vqrdmulhq_s16(q10s16, q0s16);
+
+  // stage 3 - odd half
+ q0s16 = vaddq_s16(q9s16, q15s16);
+ q1s16 = vaddq_s16(q9s16, q13s16);
+ q2s16 = vsubq_s16(q9s16, q13s16);
+ q3s16 = vsubq_s16(q9s16, q15s16);
+
+ // stage 2 - odd half
+ q13s16 = vsubq_s16(q4s16, q5s16);
+ q4s16 = vaddq_s16(q4s16, q5s16);
+ q14s16 = vsubq_s16(q7s16, q6s16);
+ q7s16 = vaddq_s16(q7s16, q6s16);
+ d26s16 = vget_low_s16(q13s16);
+ d27s16 = vget_high_s16(q13s16);
+ d28s16 = vget_low_s16(q14s16);
+ d29s16 = vget_high_s16(q14s16);
+
+ d16s16 = vdup_n_s16(cospi_16_64);
+ q9s32 = vmull_s16(d28s16, d16s16);
+ q10s32 = vmull_s16(d29s16, d16s16);
+ q11s32 = vmull_s16(d28s16, d16s16);
+ q12s32 = vmull_s16(d29s16, d16s16);
+
+ q9s32 = vmlsl_s16(q9s32, d26s16, d16s16);
+ q10s32 = vmlsl_s16(q10s32, d27s16, d16s16);
+ q11s32 = vmlal_s16(q11s32, d26s16, d16s16);
+ q12s32 = vmlal_s16(q12s32, d27s16, d16s16);
+
+ d10s16 = vqrshrn_n_s32(q9s32, 14);
+ d11s16 = vqrshrn_n_s32(q10s32, 14);
+ d12s16 = vqrshrn_n_s32(q11s32, 14);
+ d13s16 = vqrshrn_n_s32(q12s32, 14);
+ q5s16 = vcombine_s16(d10s16, d11s16);
+ q6s16 = vcombine_s16(d12s16, d13s16);
+
+ // stage 4
+ q8s16 = vaddq_s16(q0s16, q7s16);
+ q9s16 = vaddq_s16(q1s16, q6s16);
+ q10s16 = vaddq_s16(q2s16, q5s16);
+ q11s16 = vaddq_s16(q3s16, q4s16);
+ q12s16 = vsubq_s16(q3s16, q4s16);
+ q13s16 = vsubq_s16(q2s16, q5s16);
+ q14s16 = vsubq_s16(q1s16, q6s16);
+ q15s16 = vsubq_s16(q0s16, q7s16);
+
+ TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
+ &q12s16, &q13s16, &q14s16, &q15s16);
+
+ IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16,
+ &q12s16, &q13s16, &q14s16, &q15s16);
+
+ q8s16 = vrshrq_n_s16(q8s16, 5);
+ q9s16 = vrshrq_n_s16(q9s16, 5);
+ q10s16 = vrshrq_n_s16(q10s16, 5);
+ q11s16 = vrshrq_n_s16(q11s16, 5);
+ q12s16 = vrshrq_n_s16(q12s16, 5);
+ q13s16 = vrshrq_n_s16(q13s16, 5);
+ q14s16 = vrshrq_n_s16(q14s16, 5);
+ q15s16 = vrshrq_n_s16(q15s16, 5);
+
+ d1 = d2 = dest;
+
+ d0u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+ d1u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+ d2u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+ d3u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+
+ q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16),
+ vreinterpret_u8_u64(d0u64));
+ q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16),
+ vreinterpret_u8_u64(d1u64));
+ q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16),
+ vreinterpret_u8_u64(d2u64));
+ q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16),
+ vreinterpret_u8_u64(d3u64));
+
+ d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
+ d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+ d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
+ d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
+
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
+ d2 += dest_stride;
+
+ q8s16 = q12s16;
+ q9s16 = q13s16;
+ q10s16 = q14s16;
+ q11s16 = q15s16;
+
+ d0u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+ d1u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+ d2u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+ d3u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+
+ q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16),
+ vreinterpret_u8_u64(d0u64));
+ q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16),
+ vreinterpret_u8_u64(d1u64));
+ q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16),
+ vreinterpret_u8_u64(d2u64));
+ q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16),
+ vreinterpret_u8_u64(d3u64));
+
+ d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
+ d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+ d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
+ d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
+
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
+ d2 += dest_stride;
+ return;
+}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct8x8_add_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct8x8_add_neon_asm.asm
index ab5bb69202a..ab5bb69202a 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct8x8_add_neon.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_idct8x8_add_neon_asm.asm
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.asm
deleted file mode 100644
index 2f326e24c9e..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.asm
+++ /dev/null
@@ -1,237 +0,0 @@
-;
-; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
- EXPORT |vp9_iht4x4_16_add_neon|
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
- ; Parallel 1D IDCT on all the columns of a 4x4 16bits data matrix which are
- ; loaded in d16-d19. d0 must contain cospi_8_64. d1 must contain
- ; cospi_16_64. d2 must contain cospi_24_64. The output will be stored back
- ; into d16-d19 registers. This macro will touch q10- q15 registers and use
- ; them as buffer during calculation.
- MACRO
- IDCT4x4_1D
- ; stage 1
- vadd.s16 d23, d16, d18 ; (input[0] + input[2])
- vsub.s16 d24, d16, d18 ; (input[0] - input[2])
-
- vmull.s16 q15, d17, d2 ; input[1] * cospi_24_64
- vmull.s16 q10, d17, d0 ; input[1] * cospi_8_64
- vmull.s16 q13, d23, d1 ; (input[0] + input[2]) * cospi_16_64
- vmull.s16 q14, d24, d1 ; (input[0] - input[2]) * cospi_16_64
- vmlsl.s16 q15, d19, d0 ; input[1] * cospi_24_64 - input[3] * cospi_8_64
- vmlal.s16 q10, d19, d2 ; input[1] * cospi_8_64 + input[3] * cospi_24_64
-
- ; dct_const_round_shift
- vqrshrn.s32 d26, q13, #14
- vqrshrn.s32 d27, q14, #14
- vqrshrn.s32 d29, q15, #14
- vqrshrn.s32 d28, q10, #14
-
- ; stage 2
- ; output[0] = step[0] + step[3];
- ; output[1] = step[1] + step[2];
- ; output[3] = step[0] - step[3];
- ; output[2] = step[1] - step[2];
- vadd.s16 q8, q13, q14
- vsub.s16 q9, q13, q14
- vswp d18, d19
- MEND
-
- ; Parallel 1D IADST on all the columns of a 4x4 16bits data matrix which
- ; loaded in d16-d19. d3 must contain sinpi_1_9. d4 must contain sinpi_2_9.
- ; d5 must contain sinpi_4_9. d6 must contain sinpi_3_9. The output will be
- ; stored back into d16-d19 registers. This macro will touch q11,q12,q13,
- ; q14,q15 registers and use them as buffer during calculation.
- MACRO
- IADST4x4_1D
- vmull.s16 q10, d3, d16 ; s0 = sinpi_1_9 * x0
- vmull.s16 q11, d4, d16 ; s1 = sinpi_2_9 * x0
- vmull.s16 q12, d6, d17 ; s2 = sinpi_3_9 * x1
- vmull.s16 q13, d5, d18 ; s3 = sinpi_4_9 * x2
- vmull.s16 q14, d3, d18 ; s4 = sinpi_1_9 * x2
- vmovl.s16 q15, d16 ; expand x0 from 16 bit to 32 bit
- vaddw.s16 q15, q15, d19 ; x0 + x3
- vmull.s16 q8, d4, d19 ; s5 = sinpi_2_9 * x3
- vsubw.s16 q15, q15, d18 ; s7 = x0 + x3 - x2
- vmull.s16 q9, d5, d19 ; s6 = sinpi_4_9 * x3
-
- vadd.s32 q10, q10, q13 ; x0 = s0 + s3 + s5
- vadd.s32 q10, q10, q8
- vsub.s32 q11, q11, q14 ; x1 = s1 - s4 - s6
- vdup.32 q8, r0 ; duplicate sinpi_3_9
- vsub.s32 q11, q11, q9
- vmul.s32 q15, q15, q8 ; x2 = sinpi_3_9 * s7
-
- vadd.s32 q13, q10, q12 ; s0 = x0 + x3
- vadd.s32 q10, q10, q11 ; x0 + x1
- vadd.s32 q14, q11, q12 ; s1 = x1 + x3
- vsub.s32 q10, q10, q12 ; s3 = x0 + x1 - x3
-
- ; dct_const_round_shift
- vqrshrn.s32 d16, q13, #14
- vqrshrn.s32 d17, q14, #14
- vqrshrn.s32 d18, q15, #14
- vqrshrn.s32 d19, q10, #14
- MEND
-
- ; Generate cosine constants in d6 - d8 for the IDCT
- MACRO
- GENERATE_COSINE_CONSTANTS
- ; cospi_8_64 = 15137 = 0x3b21
- mov r0, #0x3b00
- add r0, #0x21
- ; cospi_16_64 = 11585 = 0x2d41
- mov r3, #0x2d00
- add r3, #0x41
- ; cospi_24_64 = 6270 = 0x187e
- mov r12, #0x1800
- add r12, #0x7e
-
- ; generate constant vectors
- vdup.16 d0, r0 ; duplicate cospi_8_64
- vdup.16 d1, r3 ; duplicate cospi_16_64
- vdup.16 d2, r12 ; duplicate cospi_24_64
- MEND
-
- ; Generate sine constants in d1 - d4 for the IADST.
- MACRO
- GENERATE_SINE_CONSTANTS
- ; sinpi_1_9 = 5283 = 0x14A3
- mov r0, #0x1400
- add r0, #0xa3
- ; sinpi_2_9 = 9929 = 0x26C9
- mov r3, #0x2600
- add r3, #0xc9
- ; sinpi_4_9 = 15212 = 0x3B6C
- mov r12, #0x3b00
- add r12, #0x6c
-
- ; generate constant vectors
- vdup.16 d3, r0 ; duplicate sinpi_1_9
-
- ; sinpi_3_9 = 13377 = 0x3441
- mov r0, #0x3400
- add r0, #0x41
-
- vdup.16 d4, r3 ; duplicate sinpi_2_9
- vdup.16 d5, r12 ; duplicate sinpi_4_9
- vdup.16 q3, r0 ; duplicate sinpi_3_9
- MEND
-
- ; Transpose a 4x4 16bits data matrix. Datas are loaded in d16-d19.
- MACRO
- TRANSPOSE4X4
- vtrn.16 d16, d17
- vtrn.16 d18, d19
- vtrn.32 q8, q9
- MEND
-
- AREA Block, CODE, READONLY ; name this block of code
-;void vp9_iht4x4_16_add_neon(int16_t *input, uint8_t *dest,
-; int dest_stride, int tx_type)
-;
-; r0 int16_t input
-; r1 uint8_t *dest
-; r2 int dest_stride
-; r3 int tx_type)
-; This function will only handle tx_type of 1,2,3.
-|vp9_iht4x4_16_add_neon| PROC
-
- ; load the inputs into d16-d19
- vld1.s16 {q8,q9}, [r0]!
-
- ; transpose the input data
- TRANSPOSE4X4
-
- ; decide the type of transform
- cmp r3, #2
- beq idct_iadst
- cmp r3, #3
- beq iadst_iadst
-
-iadst_idct
- ; generate constants
- GENERATE_COSINE_CONSTANTS
- GENERATE_SINE_CONSTANTS
-
- ; first transform rows
- IDCT4x4_1D
-
- ; transpose the matrix
- TRANSPOSE4X4
-
- ; then transform columns
- IADST4x4_1D
-
- b end_vp9_iht4x4_16_add_neon
-
-idct_iadst
- ; generate constants
- GENERATE_COSINE_CONSTANTS
- GENERATE_SINE_CONSTANTS
-
- ; first transform rows
- IADST4x4_1D
-
- ; transpose the matrix
- TRANSPOSE4X4
-
- ; then transform columns
- IDCT4x4_1D
-
- b end_vp9_iht4x4_16_add_neon
-
-iadst_iadst
- ; generate constants
- GENERATE_SINE_CONSTANTS
-
- ; first transform rows
- IADST4x4_1D
-
- ; transpose the matrix
- TRANSPOSE4X4
-
- ; then transform columns
- IADST4x4_1D
-
-end_vp9_iht4x4_16_add_neon
- ; ROUND_POWER_OF_TWO(temp_out[j], 4)
- vrshr.s16 q8, q8, #4
- vrshr.s16 q9, q9, #4
-
- vld1.32 {d26[0]}, [r1], r2
- vld1.32 {d26[1]}, [r1], r2
- vld1.32 {d27[0]}, [r1], r2
- vld1.32 {d27[1]}, [r1]
-
- ; ROUND_POWER_OF_TWO(temp_out[j], 4) + dest[j * dest_stride + i]
- vaddw.u8 q8, q8, d26
- vaddw.u8 q9, q9, d27
-
- ; clip_pixel
- vqmovun.s16 d26, q8
- vqmovun.s16 d27, q9
-
- ; do the stores in reverse order with negative post-increment, by changing
- ; the sign of the stride
- rsb r2, r2, #0
- vst1.32 {d27[1]}, [r1], r2
- vst1.32 {d27[0]}, [r1], r2
- vst1.32 {d26[1]}, [r1], r2
- vst1.32 {d26[0]}, [r1] ; no post-increment
- bx lr
- ENDP ; |vp9_iht4x4_16_add_neon|
-
- END
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.c
new file mode 100644
index 00000000000..1761fada2fa
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.c
@@ -0,0 +1,248 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vp9_rtcd.h"
+#include "./vpx_config.h"
+#include "vp9/common/vp9_common.h"
+
+static int16_t sinpi_1_9 = 0x14a3;
+static int16_t sinpi_2_9 = 0x26c9;
+static int16_t sinpi_3_9 = 0x3441;
+static int16_t sinpi_4_9 = 0x3b6c;
+static int16_t cospi_8_64 = 0x3b21;
+static int16_t cospi_16_64 = 0x2d41;
+static int16_t cospi_24_64 = 0x187e;
+
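+// 4x4 transpose: vtrn_s16 on the d-register halves followed by
+// vtrnq_s32, the intrinsics equivalent of the assembly TRANSPOSE4X4
+// macro (vtrn.16 / vtrn.32).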
+static INLINE void TRANSPOSE4X4(
+ int16x8_t *q8s16,
+ int16x8_t *q9s16) {
+ int32x4_t q8s32, q9s32;
+ int16x4x2_t d0x2s16, d1x2s16;
+ int32x4x2_t q0x2s32;
+
+ d0x2s16 = vtrn_s16(vget_low_s16(*q8s16), vget_high_s16(*q8s16));
+ d1x2s16 = vtrn_s16(vget_low_s16(*q9s16), vget_high_s16(*q9s16));
+
+ q8s32 = vreinterpretq_s32_s16(vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]));
+ q9s32 = vreinterpretq_s32_s16(vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]));
+ q0x2s32 = vtrnq_s32(q8s32, q9s32);
+
+ *q8s16 = vreinterpretq_s16_s32(q0x2s32.val[0]);
+ *q9s16 = vreinterpretq_s16_s32(q0x2s32.val[1]);
+ return;
+}
+
+static INLINE void GENERATE_COSINE_CONSTANTS(
+ int16x4_t *d0s16,
+ int16x4_t *d1s16,
+ int16x4_t *d2s16) {
+ *d0s16 = vdup_n_s16(cospi_8_64);
+ *d1s16 = vdup_n_s16(cospi_16_64);
+ *d2s16 = vdup_n_s16(cospi_24_64);
+ return;
+}
+
+static INLINE void GENERATE_SINE_CONSTANTS(
+ int16x4_t *d3s16,
+ int16x4_t *d4s16,
+ int16x4_t *d5s16,
+ int16x8_t *q3s16) {
+ *d3s16 = vdup_n_s16(sinpi_1_9);
+ *d4s16 = vdup_n_s16(sinpi_2_9);
+ *q3s16 = vdupq_n_s16(sinpi_3_9);
+ *d5s16 = vdup_n_s16(sinpi_4_9);
+ return;
+}
+
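+// One parallel 1D 4-point IDCT matching the assembly IDCT4x4_1D macro:
+// stage 1 butterflies with cospi_8/16/24_64, then the stage 2
+// recombination with the row swap folded into the final vcombine.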
+static INLINE void IDCT4x4_1D(
+ int16x4_t *d0s16,
+ int16x4_t *d1s16,
+ int16x4_t *d2s16,
+ int16x8_t *q8s16,
+ int16x8_t *q9s16) {
+ int16x4_t d16s16, d17s16, d18s16, d19s16, d23s16, d24s16;
+ int16x4_t d26s16, d27s16, d28s16, d29s16;
+ int32x4_t q10s32, q13s32, q14s32, q15s32;
+ int16x8_t q13s16, q14s16;
+
+ d16s16 = vget_low_s16(*q8s16);
+ d17s16 = vget_high_s16(*q8s16);
+ d18s16 = vget_low_s16(*q9s16);
+ d19s16 = vget_high_s16(*q9s16);
+
+ d23s16 = vadd_s16(d16s16, d18s16);
+ d24s16 = vsub_s16(d16s16, d18s16);
+
+ q15s32 = vmull_s16(d17s16, *d2s16);
+ q10s32 = vmull_s16(d17s16, *d0s16);
+ q13s32 = vmull_s16(d23s16, *d1s16);
+ q14s32 = vmull_s16(d24s16, *d1s16);
+ q15s32 = vmlsl_s16(q15s32, d19s16, *d0s16);
+ q10s32 = vmlal_s16(q10s32, d19s16, *d2s16);
+
+ d26s16 = vqrshrn_n_s32(q13s32, 14);
+ d27s16 = vqrshrn_n_s32(q14s32, 14);
+ d29s16 = vqrshrn_n_s32(q15s32, 14);
+ d28s16 = vqrshrn_n_s32(q10s32, 14);
+
+ q13s16 = vcombine_s16(d26s16, d27s16);
+ q14s16 = vcombine_s16(d28s16, d29s16);
+ *q8s16 = vaddq_s16(q13s16, q14s16);
+ *q9s16 = vsubq_s16(q13s16, q14s16);
+ *q9s16 = vcombine_s16(vget_high_s16(*q9s16),
+ vget_low_s16(*q9s16)); // vswp
+ return;
+}
+
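+// One parallel 1D 4-point IADST; following the assembly macro, the
+// intermediate products are s0 = sinpi_1_9 * x0, s1 = sinpi_2_9 * x0,
+// s2 = sinpi_3_9 * x1, s3 = sinpi_4_9 * x2, s4 = sinpi_1_9 * x2,
+// s5 = sinpi_2_9 * x3, s6 = sinpi_4_9 * x3 and s7 = x0 + x3 - x2.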
+static INLINE void IADST4x4_1D(
+ int16x4_t *d3s16,
+ int16x4_t *d4s16,
+ int16x4_t *d5s16,
+ int16x8_t *q3s16,
+ int16x8_t *q8s16,
+ int16x8_t *q9s16) {
+ int16x4_t d6s16, d16s16, d17s16, d18s16, d19s16;
+ int32x4_t q8s32, q9s32, q10s32, q11s32, q12s32, q13s32, q14s32, q15s32;
+
+ d6s16 = vget_low_s16(*q3s16);
+
+ d16s16 = vget_low_s16(*q8s16);
+ d17s16 = vget_high_s16(*q8s16);
+ d18s16 = vget_low_s16(*q9s16);
+ d19s16 = vget_high_s16(*q9s16);
+
+ q10s32 = vmull_s16(*d3s16, d16s16);
+ q11s32 = vmull_s16(*d4s16, d16s16);
+ q12s32 = vmull_s16(d6s16, d17s16);
+ q13s32 = vmull_s16(*d5s16, d18s16);
+ q14s32 = vmull_s16(*d3s16, d18s16);
+ q15s32 = vmovl_s16(d16s16);
+ q15s32 = vaddw_s16(q15s32, d19s16);
+ q8s32 = vmull_s16(*d4s16, d19s16);
+ q15s32 = vsubw_s16(q15s32, d18s16);
+ q9s32 = vmull_s16(*d5s16, d19s16);
+
+ q10s32 = vaddq_s32(q10s32, q13s32);
+ q10s32 = vaddq_s32(q10s32, q8s32);
+ q11s32 = vsubq_s32(q11s32, q14s32);
+ q8s32 = vdupq_n_s32(sinpi_3_9);
+ q11s32 = vsubq_s32(q11s32, q9s32);
+ q15s32 = vmulq_s32(q15s32, q8s32);
+
+ q13s32 = vaddq_s32(q10s32, q12s32);
+ q10s32 = vaddq_s32(q10s32, q11s32);
+ q14s32 = vaddq_s32(q11s32, q12s32);
+ q10s32 = vsubq_s32(q10s32, q12s32);
+
+ d16s16 = vqrshrn_n_s32(q13s32, 14);
+ d17s16 = vqrshrn_n_s32(q14s32, 14);
+ d18s16 = vqrshrn_n_s32(q15s32, 14);
+ d19s16 = vqrshrn_n_s32(q10s32, 14);
+
+ *q8s16 = vcombine_s16(d16s16, d17s16);
+ *q9s16 = vcombine_s16(d18s16, d19s16);
+ return;
+}
+
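+// Hybrid 4x4 inverse transform: tx_type selects IDCT or IADST per
+// direction (rows are transformed first, then columns). tx_type 0
+// (DCT_DCT) falls back to the C implementation; like the assembly
+// version, the NEON path only handles tx_type 1-3.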
+void vp9_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest,
+ int dest_stride, int tx_type) {
+ uint8x8_t d26u8, d27u8;
+ int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16;
+ uint32x2_t d26u32, d27u32;
+ int16x8_t q3s16, q8s16, q9s16;
+ uint16x8_t q8u16, q9u16;
+
+ d26u32 = d27u32 = vdup_n_u32(0);
+
+ q8s16 = vld1q_s16(input);
+ q9s16 = vld1q_s16(input + 8);
+
+ TRANSPOSE4X4(&q8s16, &q9s16);
+
+ switch (tx_type) {
+    case 0:  // idct_idct is not supported. Fall back to C
+      vp9_iht4x4_16_add_c(input, dest, dest_stride, tx_type);
+      return;
+ case 1: // iadst_idct
+ // generate constants
+ GENERATE_COSINE_CONSTANTS(&d0s16, &d1s16, &d2s16);
+ GENERATE_SINE_CONSTANTS(&d3s16, &d4s16, &d5s16, &q3s16);
+
+ // first transform rows
+ IDCT4x4_1D(&d0s16, &d1s16, &d2s16, &q8s16, &q9s16);
+
+ // transpose the matrix
+ TRANSPOSE4X4(&q8s16, &q9s16);
+
+ // then transform columns
+ IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16);
+ break;
+ case 2: // idct_iadst
+      // generate constants
+ GENERATE_COSINE_CONSTANTS(&d0s16, &d1s16, &d2s16);
+ GENERATE_SINE_CONSTANTS(&d3s16, &d4s16, &d5s16, &q3s16);
+
+ // first transform rows
+ IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16);
+
+ // transpose the matrix
+ TRANSPOSE4X4(&q8s16, &q9s16);
+
+ // then transform columns
+ IDCT4x4_1D(&d0s16, &d1s16, &d2s16, &q8s16, &q9s16);
+ break;
+ case 3: // iadst_iadst
+ // generate constants
+ GENERATE_SINE_CONSTANTS(&d3s16, &d4s16, &d5s16, &q3s16);
+
+ // first transform rows
+ IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16);
+
+ // transpose the matrix
+ TRANSPOSE4X4(&q8s16, &q9s16);
+
+ // then transform columns
+ IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16);
+ break;
+    default:  // invalid tx_type
+ assert(0);
+ break;
+ }
+
+ q8s16 = vrshrq_n_s16(q8s16, 4);
+ q9s16 = vrshrq_n_s16(q9s16, 4);
+
+ d26u32 = vld1_lane_u32((const uint32_t *)dest, d26u32, 0);
+ dest += dest_stride;
+ d26u32 = vld1_lane_u32((const uint32_t *)dest, d26u32, 1);
+ dest += dest_stride;
+ d27u32 = vld1_lane_u32((const uint32_t *)dest, d27u32, 0);
+ dest += dest_stride;
+ d27u32 = vld1_lane_u32((const uint32_t *)dest, d27u32, 1);
+
+ q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u32(d26u32));
+ q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u32(d27u32));
+
+ d26u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
+ d27u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+
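+  // dest was advanced three rows during the loads above, so store the
+  // rows in reverse order while stepping back up, as the assembly
+  // version did with a negated stride.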
+ vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d27u8), 1);
+ dest -= dest_stride;
+ vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d27u8), 0);
+ dest -= dest_stride;
+ vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d26u8), 1);
+ dest -= dest_stride;
+ vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d26u8), 0);
+ return;
+}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_iht8x8_add_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_iht8x8_add_neon.asm
deleted file mode 100644
index b41f5661b80..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_iht8x8_add_neon.asm
+++ /dev/null
@@ -1,698 +0,0 @@
-;
-; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
- EXPORT |vp9_iht8x8_64_add_neon|
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
- ; Generate IADST constants in r0 - r12 for the IADST.
- MACRO
- GENERATE_IADST_CONSTANTS
- ; generate cospi_2_64 = 16305
- mov r0, #0x3f00
- add r0, #0xb1
-
- ; generate cospi_30_64 = 1606
- mov r1, #0x600
- add r1, #0x46
-
- ; generate cospi_10_64 = 14449
- mov r2, #0x3800
- add r2, #0x71
-
- ; generate cospi_22_64 = 7723
- mov r3, #0x1e00
- add r3, #0x2b
-
- ; generate cospi_18_64 = 10394
- mov r4, #0x2800
- add r4, #0x9a
-
- ; generate cospi_14_64 = 12665
- mov r5, #0x3100
- add r5, #0x79
-
- ; generate cospi_26_64 = 4756
- mov r6, #0x1200
- add r6, #0x94
-
- ; generate cospi_6_64 = 15679
- mov r7, #0x3d00
- add r7, #0x3f
-
- ; generate cospi_8_64 = 15137
- mov r8, #0x3b00
- add r8, #0x21
-
- ; generate cospi_24_64 = 6270
- mov r9, #0x1800
- add r9, #0x7e
-
- ; generate 0
- mov r10, #0
-
- ; generate cospi_16_64 = 11585
- mov r12, #0x2d00
- add r12, #0x41
- MEND
-
- ; Generate IDCT constants in r3 - r9 for the IDCT.
- MACRO
- GENERATE_IDCT_CONSTANTS
- ; generate cospi_28_64 = 3196
- mov r3, #0x0c00
- add r3, #0x7c
-
- ; generate cospi_4_64 = 16069
- mov r4, #0x3e00
- add r4, #0xc5
-
- ; generate cospi_12_64 = 13623
- mov r5, #0x3500
- add r5, #0x37
-
- ; generate cospi_20_64 = 9102
- mov r6, #0x2300
- add r6, #0x8e
-
- ; generate cospi_16_64 = 11585
- mov r7, #0x2d00
- add r7, #0x41
-
- ; generate cospi_24_64 = 6270
- mov r8, #0x1800
- add r8, #0x7e
-
- ; generate cospi_8_64 = 15137
- mov r9, #0x3b00
- add r9, #0x21
- MEND
-
- ; Transpose a 8x8 16bits data matrix. Datas are loaded in q8-q15.
- MACRO
- TRANSPOSE8X8
- vswp d17, d24
- vswp d23, d30
- vswp d21, d28
- vswp d19, d26
- vtrn.32 q8, q10
- vtrn.32 q9, q11
- vtrn.32 q12, q14
- vtrn.32 q13, q15
- vtrn.16 q8, q9
- vtrn.16 q10, q11
- vtrn.16 q12, q13
- vtrn.16 q14, q15
- MEND
-
- ; Parallel 1D IDCT on all the columns of a 8x8 16bits data matrix which are
- ; loaded in q8-q15. The IDCT constants are loaded in r3 - r9. The output
- ; will be stored back into q8-q15 registers. This macro will touch q0-q7
- ; registers and use them as buffer during calculation.
- MACRO
- IDCT8x8_1D
- ; stage 1
- vdup.16 d0, r3 ; duplicate cospi_28_64
- vdup.16 d1, r4 ; duplicate cospi_4_64
- vdup.16 d2, r5 ; duplicate cospi_12_64
- vdup.16 d3, r6 ; duplicate cospi_20_64
-
- ; input[1] * cospi_28_64
- vmull.s16 q2, d18, d0
- vmull.s16 q3, d19, d0
-
- ; input[5] * cospi_12_64
- vmull.s16 q5, d26, d2
- vmull.s16 q6, d27, d2
-
- ; input[1]*cospi_28_64-input[7]*cospi_4_64
- vmlsl.s16 q2, d30, d1
- vmlsl.s16 q3, d31, d1
-
- ; input[5] * cospi_12_64 - input[3] * cospi_20_64
- vmlsl.s16 q5, d22, d3
- vmlsl.s16 q6, d23, d3
-
- ; dct_const_round_shift(input_dc * cospi_16_64)
- vqrshrn.s32 d8, q2, #14 ; >> 14
- vqrshrn.s32 d9, q3, #14 ; >> 14
-
- ; dct_const_round_shift(input_dc * cospi_16_64)
- vqrshrn.s32 d10, q5, #14 ; >> 14
- vqrshrn.s32 d11, q6, #14 ; >> 14
-
- ; input[1] * cospi_4_64
- vmull.s16 q2, d18, d1
- vmull.s16 q3, d19, d1
-
- ; input[5] * cospi_20_64
- vmull.s16 q9, d26, d3
- vmull.s16 q13, d27, d3
-
- ; input[1]*cospi_4_64+input[7]*cospi_28_64
- vmlal.s16 q2, d30, d0
- vmlal.s16 q3, d31, d0
-
- ; input[5] * cospi_20_64 + input[3] * cospi_12_64
- vmlal.s16 q9, d22, d2
- vmlal.s16 q13, d23, d2
-
- ; dct_const_round_shift(input_dc * cospi_16_64)
- vqrshrn.s32 d14, q2, #14 ; >> 14
- vqrshrn.s32 d15, q3, #14 ; >> 14
-
- ; stage 2 & stage 3 - even half
- vdup.16 d0, r7 ; duplicate cospi_16_64
-
- ; dct_const_round_shift(input_dc * cospi_16_64)
- vqrshrn.s32 d12, q9, #14 ; >> 14
- vqrshrn.s32 d13, q13, #14 ; >> 14
-
- ; input[0] * cospi_16_64
- vmull.s16 q2, d16, d0
- vmull.s16 q3, d17, d0
-
- ; input[0] * cospi_16_64
- vmull.s16 q13, d16, d0
- vmull.s16 q15, d17, d0
-
- ; (input[0] + input[2]) * cospi_16_64
- vmlal.s16 q2, d24, d0
- vmlal.s16 q3, d25, d0
-
- ; (input[0] - input[2]) * cospi_16_64
- vmlsl.s16 q13, d24, d0
- vmlsl.s16 q15, d25, d0
-
- vdup.16 d0, r8 ; duplicate cospi_24_64
- vdup.16 d1, r9 ; duplicate cospi_8_64
-
- ; dct_const_round_shift(input_dc * cospi_16_64)
- vqrshrn.s32 d18, q2, #14 ; >> 14
- vqrshrn.s32 d19, q3, #14 ; >> 14
-
- ; dct_const_round_shift(input_dc * cospi_16_64)
- vqrshrn.s32 d22, q13, #14 ; >> 14
- vqrshrn.s32 d23, q15, #14 ; >> 14
-
- ; input[1] * cospi_24_64
- vmull.s16 q2, d20, d0
- vmull.s16 q3, d21, d0
-
- ; input[1] * cospi_8_64
- vmull.s16 q8, d20, d1
- vmull.s16 q12, d21, d1
-
- ; input[1] * cospi_24_64 - input[3] * cospi_8_64
- vmlsl.s16 q2, d28, d1
- vmlsl.s16 q3, d29, d1
-
- ; input[1] * cospi_8_64 + input[3] * cospi_24_64
- vmlal.s16 q8, d28, d0
- vmlal.s16 q12, d29, d0
-
- ; dct_const_round_shift(input_dc * cospi_16_64)
- vqrshrn.s32 d26, q2, #14 ; >> 14
- vqrshrn.s32 d27, q3, #14 ; >> 14
-
- ; dct_const_round_shift(input_dc * cospi_16_64)
- vqrshrn.s32 d30, q8, #14 ; >> 14
- vqrshrn.s32 d31, q12, #14 ; >> 14
-
- vadd.s16 q0, q9, q15 ; output[0] = step[0] + step[3]
- vadd.s16 q1, q11, q13 ; output[1] = step[1] + step[2]
- vsub.s16 q2, q11, q13 ; output[2] = step[1] - step[2]
- vsub.s16 q3, q9, q15 ; output[3] = step[0] - step[3]
-
- ; stage 3 -odd half
- vdup.16 d16, r7 ; duplicate cospi_16_64
-
- ; stage 2 - odd half
- vsub.s16 q13, q4, q5 ; step2[5] = step1[4] - step1[5]
- vadd.s16 q4, q4, q5 ; step2[4] = step1[4] + step1[5]
- vsub.s16 q14, q7, q6 ; step2[6] = -step1[6] + step1[7]
- vadd.s16 q7, q7, q6 ; step2[7] = step1[6] + step1[7]
-
- ; step2[6] * cospi_16_64
- vmull.s16 q9, d28, d16
- vmull.s16 q10, d29, d16
-
- ; step2[6] * cospi_16_64
- vmull.s16 q11, d28, d16
- vmull.s16 q12, d29, d16
-
- ; (step2[6] - step2[5]) * cospi_16_64
- vmlsl.s16 q9, d26, d16
- vmlsl.s16 q10, d27, d16
-
- ; (step2[5] + step2[6]) * cospi_16_64
- vmlal.s16 q11, d26, d16
- vmlal.s16 q12, d27, d16
-
- ; dct_const_round_shift(input_dc * cospi_16_64)
- vqrshrn.s32 d10, q9, #14 ; >> 14
- vqrshrn.s32 d11, q10, #14 ; >> 14
-
- ; dct_const_round_shift(input_dc * cospi_16_64)
- vqrshrn.s32 d12, q11, #14 ; >> 14
- vqrshrn.s32 d13, q12, #14 ; >> 14
-
- ; stage 4
- vadd.s16 q8, q0, q7 ; output[0] = step1[0] + step1[7];
- vadd.s16 q9, q1, q6 ; output[1] = step1[1] + step1[6];
- vadd.s16 q10, q2, q5 ; output[2] = step1[2] + step1[5];
- vadd.s16 q11, q3, q4 ; output[3] = step1[3] + step1[4];
- vsub.s16 q12, q3, q4 ; output[4] = step1[3] - step1[4];
- vsub.s16 q13, q2, q5 ; output[5] = step1[2] - step1[5];
- vsub.s16 q14, q1, q6 ; output[6] = step1[1] - step1[6];
- vsub.s16 q15, q0, q7 ; output[7] = step1[0] - step1[7];
- MEND
-
- ; Parallel 1D IADST on all the columns of a 8x8 16bits data matrix which
- ; loaded in q8-q15. IADST constants are loaded in r0 - r12 registers. The
- ; output will be stored back into q8-q15 registers. This macro will touch
- ; q0 - q7 registers and use them as buffer during calculation.
- MACRO
- IADST8X8_1D
- vdup.16 d14, r0 ; duplicate cospi_2_64
- vdup.16 d15, r1 ; duplicate cospi_30_64
-
- ; cospi_2_64 * x0
- vmull.s16 q1, d30, d14
- vmull.s16 q2, d31, d14
-
- ; cospi_30_64 * x0
- vmull.s16 q3, d30, d15
- vmull.s16 q4, d31, d15
-
- vdup.16 d30, r4 ; duplicate cospi_18_64
- vdup.16 d31, r5 ; duplicate cospi_14_64
-
- ; s0 = cospi_2_64 * x0 + cospi_30_64 * x1;
- vmlal.s16 q1, d16, d15
- vmlal.s16 q2, d17, d15
-
- ; s1 = cospi_30_64 * x0 - cospi_2_64 * x1
- vmlsl.s16 q3, d16, d14
- vmlsl.s16 q4, d17, d14
-
- ; cospi_18_64 * x4
- vmull.s16 q5, d22, d30
- vmull.s16 q6, d23, d30
-
- ; cospi_14_64 * x4
- vmull.s16 q7, d22, d31
- vmull.s16 q8, d23, d31
-
- ; s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
- vmlal.s16 q5, d24, d31
- vmlal.s16 q6, d25, d31
-
- ; s5 = cospi_14_64 * x4 - cospi_18_64 * x5
- vmlsl.s16 q7, d24, d30
- vmlsl.s16 q8, d25, d30
-
- ; (s0 + s4)
- vadd.s32 q11, q1, q5
- vadd.s32 q12, q2, q6
-
- vdup.16 d0, r2 ; duplicate cospi_10_64
- vdup.16 d1, r3 ; duplicate cospi_22_64
-
- ; (s0 - s4)
- vsub.s32 q1, q1, q5
- vsub.s32 q2, q2, q6
-
- ; x0 = dct_const_round_shift(s0 + s4);
- vqrshrn.s32 d22, q11, #14 ; >> 14
- vqrshrn.s32 d23, q12, #14 ; >> 14
-
- ; (s1 + s5)
- vadd.s32 q12, q3, q7
- vadd.s32 q15, q4, q8
-
- ; (s1 - s5)
- vsub.s32 q3, q3, q7
- vsub.s32 q4, q4, q8
-
- ; x4 = dct_const_round_shift(s0 - s4);
- vqrshrn.s32 d2, q1, #14 ; >> 14
- vqrshrn.s32 d3, q2, #14 ; >> 14
-
- ; x1 = dct_const_round_shift(s1 + s5);
- vqrshrn.s32 d24, q12, #14 ; >> 14
- vqrshrn.s32 d25, q15, #14 ; >> 14
-
- ; x5 = dct_const_round_shift(s1 - s5);
- vqrshrn.s32 d6, q3, #14 ; >> 14
- vqrshrn.s32 d7, q4, #14 ; >> 14
-
- ; cospi_10_64 * x2
- vmull.s16 q4, d26, d0
- vmull.s16 q5, d27, d0
-
- ; cospi_22_64 * x2
- vmull.s16 q2, d26, d1
- vmull.s16 q6, d27, d1
-
- vdup.16 d30, r6 ; duplicate cospi_26_64
- vdup.16 d31, r7 ; duplicate cospi_6_64
-
- ; s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
- vmlal.s16 q4, d20, d1
- vmlal.s16 q5, d21, d1
-
- ; s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
- vmlsl.s16 q2, d20, d0
- vmlsl.s16 q6, d21, d0
-
- ; cospi_26_64 * x6
- vmull.s16 q0, d18, d30
- vmull.s16 q13, d19, d30
-
- ; s6 = cospi_26_64 * x6 + cospi_6_64 * x7;
- vmlal.s16 q0, d28, d31
- vmlal.s16 q13, d29, d31
-
- ; cospi_6_64 * x6
- vmull.s16 q10, d18, d31
- vmull.s16 q9, d19, d31
-
- ; s7 = cospi_6_64 * x6 - cospi_26_64 * x7;
- vmlsl.s16 q10, d28, d30
- vmlsl.s16 q9, d29, d30
-
- ; (s3 + s7)
- vadd.s32 q14, q2, q10
- vadd.s32 q15, q6, q9
-
- ; (s3 - s7)
- vsub.s32 q2, q2, q10
- vsub.s32 q6, q6, q9
-
- ; x3 = dct_const_round_shift(s3 + s7);
- vqrshrn.s32 d28, q14, #14 ; >> 14
- vqrshrn.s32 d29, q15, #14 ; >> 14
-
- ; x7 = dct_const_round_shift(s3 - s7);
- vqrshrn.s32 d4, q2, #14 ; >> 14
- vqrshrn.s32 d5, q6, #14 ; >> 14
-
- ; (s2 + s6)
- vadd.s32 q9, q4, q0
- vadd.s32 q10, q5, q13
-
- ; (s2 - s6)
- vsub.s32 q4, q4, q0
- vsub.s32 q5, q5, q13
-
- vdup.16 d30, r8 ; duplicate cospi_8_64
- vdup.16 d31, r9 ; duplicate cospi_24_64
-
- ; x2 = dct_const_round_shift(s2 + s6);
- vqrshrn.s32 d18, q9, #14 ; >> 14
- vqrshrn.s32 d19, q10, #14 ; >> 14
-
- ; x6 = dct_const_round_shift(s2 - s6);
- vqrshrn.s32 d8, q4, #14 ; >> 14
- vqrshrn.s32 d9, q5, #14 ; >> 14
-
- ; cospi_8_64 * x4
- vmull.s16 q5, d2, d30
- vmull.s16 q6, d3, d30
-
- ; cospi_24_64 * x4
- vmull.s16 q7, d2, d31
- vmull.s16 q0, d3, d31
-
- ; s4 = cospi_8_64 * x4 + cospi_24_64 * x5;
- vmlal.s16 q5, d6, d31
- vmlal.s16 q6, d7, d31
-
- ; s5 = cospi_24_64 * x4 - cospi_8_64 * x5;
- vmlsl.s16 q7, d6, d30
- vmlsl.s16 q0, d7, d30
-
- ; cospi_8_64 * x7
- vmull.s16 q1, d4, d30
- vmull.s16 q3, d5, d30
-
- ; cospi_24_64 * x7
- vmull.s16 q10, d4, d31
- vmull.s16 q2, d5, d31
-
- ; s6 = -cospi_24_64 * x6 + cospi_8_64 * x7;
- vmlsl.s16 q1, d8, d31
- vmlsl.s16 q3, d9, d31
-
- ; s7 = cospi_8_64 * x6 + cospi_24_64 * x7;
- vmlal.s16 q10, d8, d30
- vmlal.s16 q2, d9, d30
-
- vadd.s16 q8, q11, q9 ; x0 = s0 + s2;
-
- vsub.s16 q11, q11, q9 ; x2 = s0 - s2;
-
- vadd.s16 q4, q12, q14 ; x1 = s1 + s3;
-
- vsub.s16 q12, q12, q14 ; x3 = s1 - s3;
-
- ; (s4 + s6)
- vadd.s32 q14, q5, q1
- vadd.s32 q15, q6, q3
-
- ; (s4 - s6)
- vsub.s32 q5, q5, q1
- vsub.s32 q6, q6, q3
-
- ; x4 = dct_const_round_shift(s4 + s6);
- vqrshrn.s32 d18, q14, #14 ; >> 14
- vqrshrn.s32 d19, q15, #14 ; >> 14
-
- ; x6 = dct_const_round_shift(s4 - s6);
- vqrshrn.s32 d10, q5, #14 ; >> 14
- vqrshrn.s32 d11, q6, #14 ; >> 14
-
- ; (s5 + s7)
- vadd.s32 q1, q7, q10
- vadd.s32 q3, q0, q2
-
- ; (s5 - s7))
- vsub.s32 q7, q7, q10
- vsub.s32 q0, q0, q2
-
- ; x5 = dct_const_round_shift(s5 + s7);
- vqrshrn.s32 d28, q1, #14 ; >> 14
- vqrshrn.s32 d29, q3, #14 ; >> 14
-
- ; x7 = dct_const_round_shift(s5 - s7);
- vqrshrn.s32 d14, q7, #14 ; >> 14
- vqrshrn.s32 d15, q0, #14 ; >> 14
-
- vdup.16 d30, r12 ; duplicate cospi_16_64
-
- ; cospi_16_64 * x2
- vmull.s16 q2, d22, d30
- vmull.s16 q3, d23, d30
-
- ; cospi_16_64 * x2
- vmull.s16 q13, d22, d30
- vmull.s16 q1, d23, d30
-
- ; cospi_16_64 * x2 + cospi_16_64 * x3;
- vmlal.s16 q2, d24, d30
- vmlal.s16 q3, d25, d30
-
- ; cospi_16_64 * x2 - cospi_16_64 * x3;
- vmlsl.s16 q13, d24, d30
- vmlsl.s16 q1, d25, d30
-
- ; x2 = dct_const_round_shift(s2);
- vqrshrn.s32 d4, q2, #14 ; >> 14
- vqrshrn.s32 d5, q3, #14 ; >> 14
-
- ;x3 = dct_const_round_shift(s3);
- vqrshrn.s32 d24, q13, #14 ; >> 14
- vqrshrn.s32 d25, q1, #14 ; >> 14
-
- ; cospi_16_64 * x6
- vmull.s16 q13, d10, d30
- vmull.s16 q1, d11, d30
-
- ; cospi_16_64 * x6
- vmull.s16 q11, d10, d30
- vmull.s16 q0, d11, d30
-
- ; cospi_16_64 * x6 + cospi_16_64 * x7;
- vmlal.s16 q13, d14, d30
- vmlal.s16 q1, d15, d30
-
- ; cospi_16_64 * x6 - cospi_16_64 * x7;
- vmlsl.s16 q11, d14, d30
- vmlsl.s16 q0, d15, d30
-
- ; x6 = dct_const_round_shift(s6);
- vqrshrn.s32 d20, q13, #14 ; >> 14
- vqrshrn.s32 d21, q1, #14 ; >> 14
-
- ;x7 = dct_const_round_shift(s7);
- vqrshrn.s32 d12, q11, #14 ; >> 14
- vqrshrn.s32 d13, q0, #14 ; >> 14
-
- vdup.16 q5, r10 ; duplicate 0
-
- vsub.s16 q9, q5, q9 ; output[1] = -x4;
- vsub.s16 q11, q5, q2 ; output[3] = -x2;
- vsub.s16 q13, q5, q6 ; output[5] = -x7;
- vsub.s16 q15, q5, q4 ; output[7] = -x1;
- MEND
-
-
- AREA Block, CODE, READONLY ; name this block of code
-;void vp9_iht8x8_64_add_neon(int16_t *input, uint8_t *dest,
-; int dest_stride, int tx_type)
-;
-; r0 int16_t *input
-; r1 uint8_t *dest
-; r2 int dest_stride
-; r3 int tx_type
-; This function handles only tx_type values 1, 2 and 3.
-|vp9_iht8x8_64_add_neon| PROC
-
- ; load the inputs into q8-q15 (d16-d31)
- vld1.s16 {q8,q9}, [r0]!
- vld1.s16 {q10,q11}, [r0]!
- vld1.s16 {q12,q13}, [r0]!
- vld1.s16 {q14,q15}, [r0]!
-
- push {r0-r10}
- vpush {d8-d15}
-
- ; transpose the input data
- TRANSPOSE8X8
-
- ; decide the type of transform
- cmp r3, #2
- beq idct_iadst
- cmp r3, #3
- beq iadst_iadst
-
-iadst_idct
- ; generate IDCT constants
- GENERATE_IDCT_CONSTANTS
-
- ; first transform rows
- IDCT8x8_1D
-
- ; transpose the matrix
- TRANSPOSE8X8
-
- ; generate IADST constants
- GENERATE_IADST_CONSTANTS
-
- ; then transform columns
- IADST8X8_1D
-
- b end_vp9_iht8x8_64_add_neon
-
-idct_iadst
- ; generate IADST constants
- GENERATE_IADST_CONSTANTS
-
- ; first transform rows
- IADST8X8_1D
-
- ; transpose the matrix
- TRANSPOSE8X8
-
- ; generate IDCT constants
- GENERATE_IDCT_CONSTANTS
-
- ; then transform columns
- IDCT8x8_1D
-
- b end_vp9_iht8x8_64_add_neon
-
-iadst_iadst
- ; generate IADST constants
- GENERATE_IADST_CONSTANTS
-
- ; first transform rows
- IADST8X8_1D
-
- ; transpose the matrix
- TRANSPOSE8X8
-
- ; then transform columns
- IADST8X8_1D
-
-end_vp9_iht8x8_64_add_neon
- vpop {d8-d15}
- pop {r0-r10}
-
- ; ROUND_POWER_OF_TWO(temp_out[j], 5)
- vrshr.s16 q8, q8, #5
- vrshr.s16 q9, q9, #5
- vrshr.s16 q10, q10, #5
- vrshr.s16 q11, q11, #5
- vrshr.s16 q12, q12, #5
- vrshr.s16 q13, q13, #5
- vrshr.s16 q14, q14, #5
- vrshr.s16 q15, q15, #5
-
- ; save dest pointer
- mov r0, r1
-
- ; load destination data
- vld1.64 {d0}, [r1], r2
- vld1.64 {d1}, [r1], r2
- vld1.64 {d2}, [r1], r2
- vld1.64 {d3}, [r1], r2
- vld1.64 {d4}, [r1], r2
- vld1.64 {d5}, [r1], r2
- vld1.64 {d6}, [r1], r2
- vld1.64 {d7}, [r1]
-
- ; ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * dest_stride + i]
- vaddw.u8 q8, q8, d0
- vaddw.u8 q9, q9, d1
- vaddw.u8 q10, q10, d2
- vaddw.u8 q11, q11, d3
- vaddw.u8 q12, q12, d4
- vaddw.u8 q13, q13, d5
- vaddw.u8 q14, q14, d6
- vaddw.u8 q15, q15, d7
-
- ; clip_pixel
- vqmovun.s16 d0, q8
- vqmovun.s16 d1, q9
- vqmovun.s16 d2, q10
- vqmovun.s16 d3, q11
- vqmovun.s16 d4, q12
- vqmovun.s16 d5, q13
- vqmovun.s16 d6, q14
- vqmovun.s16 d7, q15
-
- ; store the data
- vst1.64 {d0}, [r0], r2
- vst1.64 {d1}, [r0], r2
- vst1.64 {d2}, [r0], r2
- vst1.64 {d3}, [r0], r2
- vst1.64 {d4}, [r0], r2
- vst1.64 {d5}, [r0], r2
- vst1.64 {d6}, [r0], r2
- vst1.64 {d7}, [r0], r2
- bx lr
- ENDP ; |vp9_iht8x8_64_add_neon|
-
- END
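
A note on the rounding used in the shift-by-14 steps above and in the intrinsics rewrite that follows: vqrshrn.s32 #14 (vqrshrn_n_s32(..., 14) in C) is the vector form of libvpx's dct_const_round_shift(). A minimal scalar sketch, with the DCT_CONST_BITS and ROUND_POWER_OF_TWO definitions reproduced from vp9's headers for illustration:

#include <stdint.h>

#define DCT_CONST_BITS 14
#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))

/* Round a 32-bit accumulator back to the 14-bit fixed-point scale; the
   NEON instruction additionally saturates the result to the int16_t range. */
static int16_t dct_const_round_shift(int32_t input) {
  return (int16_t)ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
}
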
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_iht8x8_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_iht8x8_add_neon.c
new file mode 100644
index 00000000000..04b342c3d34
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_iht8x8_add_neon.c
@@ -0,0 +1,624 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vp9_rtcd.h"
+#include "./vpx_config.h"
+#include "vp9/common/vp9_common.h"
+
+static int16_t cospi_2_64 = 16305;
+static int16_t cospi_4_64 = 16069;
+static int16_t cospi_6_64 = 15679;
+static int16_t cospi_8_64 = 15137;
+static int16_t cospi_10_64 = 14449;
+static int16_t cospi_12_64 = 13623;
+static int16_t cospi_14_64 = 12665;
+static int16_t cospi_16_64 = 11585;
+static int16_t cospi_18_64 = 10394;
+static int16_t cospi_20_64 = 9102;
+static int16_t cospi_22_64 = 7723;
+static int16_t cospi_24_64 = 6270;
+static int16_t cospi_26_64 = 4756;
+static int16_t cospi_28_64 = 3196;
+static int16_t cospi_30_64 = 1606;
+
+static INLINE void TRANSPOSE8X8(
+ int16x8_t *q8s16,
+ int16x8_t *q9s16,
+ int16x8_t *q10s16,
+ int16x8_t *q11s16,
+ int16x8_t *q12s16,
+ int16x8_t *q13s16,
+ int16x8_t *q14s16,
+ int16x8_t *q15s16) {
+ int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
+ int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
+ int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32;
+ int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16;
+
+ d16s16 = vget_low_s16(*q8s16);
+ d17s16 = vget_high_s16(*q8s16);
+ d18s16 = vget_low_s16(*q9s16);
+ d19s16 = vget_high_s16(*q9s16);
+ d20s16 = vget_low_s16(*q10s16);
+ d21s16 = vget_high_s16(*q10s16);
+ d22s16 = vget_low_s16(*q11s16);
+ d23s16 = vget_high_s16(*q11s16);
+ d24s16 = vget_low_s16(*q12s16);
+ d25s16 = vget_high_s16(*q12s16);
+ d26s16 = vget_low_s16(*q13s16);
+ d27s16 = vget_high_s16(*q13s16);
+ d28s16 = vget_low_s16(*q14s16);
+ d29s16 = vget_high_s16(*q14s16);
+ d30s16 = vget_low_s16(*q15s16);
+ d31s16 = vget_high_s16(*q15s16);
+
+ *q8s16 = vcombine_s16(d16s16, d24s16); // vswp d17, d24
+ *q9s16 = vcombine_s16(d18s16, d26s16); // vswp d19, d26
+ *q10s16 = vcombine_s16(d20s16, d28s16); // vswp d21, d28
+ *q11s16 = vcombine_s16(d22s16, d30s16); // vswp d23, d30
+ *q12s16 = vcombine_s16(d17s16, d25s16);
+ *q13s16 = vcombine_s16(d19s16, d27s16);
+ *q14s16 = vcombine_s16(d21s16, d29s16);
+ *q15s16 = vcombine_s16(d23s16, d31s16);
+
+ q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q8s16),
+ vreinterpretq_s32_s16(*q10s16));
+ q1x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q9s16),
+ vreinterpretq_s32_s16(*q11s16));
+ q2x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q12s16),
+ vreinterpretq_s32_s16(*q14s16));
+ q3x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q13s16),
+ vreinterpretq_s32_s16(*q15s16));
+
+ q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]), // q8
+ vreinterpretq_s16_s32(q1x2s32.val[0])); // q9
+ q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]), // q10
+ vreinterpretq_s16_s32(q1x2s32.val[1])); // q11
+ q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]), // q12
+ vreinterpretq_s16_s32(q3x2s32.val[0])); // q13
+ q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]), // q14
+ vreinterpretq_s16_s32(q3x2s32.val[1])); // q15
+
+ *q8s16 = q0x2s16.val[0];
+ *q9s16 = q0x2s16.val[1];
+ *q10s16 = q1x2s16.val[0];
+ *q11s16 = q1x2s16.val[1];
+ *q12s16 = q2x2s16.val[0];
+ *q13s16 = q2x2s16.val[1];
+ *q14s16 = q3x2s16.val[0];
+ *q15s16 = q3x2s16.val[1];
+ return;
+}
+
+static INLINE void IDCT8x8_1D(
+ int16x8_t *q8s16,
+ int16x8_t *q9s16,
+ int16x8_t *q10s16,
+ int16x8_t *q11s16,
+ int16x8_t *q12s16,
+ int16x8_t *q13s16,
+ int16x8_t *q14s16,
+ int16x8_t *q15s16) {
+ int16x4_t d0s16, d1s16, d2s16, d3s16;
+ int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
+ int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
+ int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
+ int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
+ int32x4_t q2s32, q3s32, q5s32, q6s32, q8s32, q9s32;
+ int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32;
+
+ d0s16 = vdup_n_s16(cospi_28_64);
+ d1s16 = vdup_n_s16(cospi_4_64);
+ d2s16 = vdup_n_s16(cospi_12_64);
+ d3s16 = vdup_n_s16(cospi_20_64);
+
+ d16s16 = vget_low_s16(*q8s16);
+ d17s16 = vget_high_s16(*q8s16);
+ d18s16 = vget_low_s16(*q9s16);
+ d19s16 = vget_high_s16(*q9s16);
+ d20s16 = vget_low_s16(*q10s16);
+ d21s16 = vget_high_s16(*q10s16);
+ d22s16 = vget_low_s16(*q11s16);
+ d23s16 = vget_high_s16(*q11s16);
+ d24s16 = vget_low_s16(*q12s16);
+ d25s16 = vget_high_s16(*q12s16);
+ d26s16 = vget_low_s16(*q13s16);
+ d27s16 = vget_high_s16(*q13s16);
+ d28s16 = vget_low_s16(*q14s16);
+ d29s16 = vget_high_s16(*q14s16);
+ d30s16 = vget_low_s16(*q15s16);
+ d31s16 = vget_high_s16(*q15s16);
+
+ q2s32 = vmull_s16(d18s16, d0s16);
+ q3s32 = vmull_s16(d19s16, d0s16);
+ q5s32 = vmull_s16(d26s16, d2s16);
+ q6s32 = vmull_s16(d27s16, d2s16);
+
+ q2s32 = vmlsl_s16(q2s32, d30s16, d1s16);
+ q3s32 = vmlsl_s16(q3s32, d31s16, d1s16);
+ q5s32 = vmlsl_s16(q5s32, d22s16, d3s16);
+ q6s32 = vmlsl_s16(q6s32, d23s16, d3s16);
+
+ d8s16 = vqrshrn_n_s32(q2s32, 14);
+ d9s16 = vqrshrn_n_s32(q3s32, 14);
+ d10s16 = vqrshrn_n_s32(q5s32, 14);
+ d11s16 = vqrshrn_n_s32(q6s32, 14);
+ q4s16 = vcombine_s16(d8s16, d9s16);
+ q5s16 = vcombine_s16(d10s16, d11s16);
+
+ q2s32 = vmull_s16(d18s16, d1s16);
+ q3s32 = vmull_s16(d19s16, d1s16);
+ q9s32 = vmull_s16(d26s16, d3s16);
+ q13s32 = vmull_s16(d27s16, d3s16);
+
+ q2s32 = vmlal_s16(q2s32, d30s16, d0s16);
+ q3s32 = vmlal_s16(q3s32, d31s16, d0s16);
+ q9s32 = vmlal_s16(q9s32, d22s16, d2s16);
+ q13s32 = vmlal_s16(q13s32, d23s16, d2s16);
+
+ d14s16 = vqrshrn_n_s32(q2s32, 14);
+ d15s16 = vqrshrn_n_s32(q3s32, 14);
+ d12s16 = vqrshrn_n_s32(q9s32, 14);
+ d13s16 = vqrshrn_n_s32(q13s32, 14);
+ q6s16 = vcombine_s16(d12s16, d13s16);
+ q7s16 = vcombine_s16(d14s16, d15s16);
+
+ d0s16 = vdup_n_s16(cospi_16_64);
+
+ q2s32 = vmull_s16(d16s16, d0s16);
+ q3s32 = vmull_s16(d17s16, d0s16);
+ q13s32 = vmull_s16(d16s16, d0s16);
+ q15s32 = vmull_s16(d17s16, d0s16);
+
+ q2s32 = vmlal_s16(q2s32, d24s16, d0s16);
+ q3s32 = vmlal_s16(q3s32, d25s16, d0s16);
+ q13s32 = vmlsl_s16(q13s32, d24s16, d0s16);
+ q15s32 = vmlsl_s16(q15s32, d25s16, d0s16);
+
+ d0s16 = vdup_n_s16(cospi_24_64);
+ d1s16 = vdup_n_s16(cospi_8_64);
+
+ d18s16 = vqrshrn_n_s32(q2s32, 14);
+ d19s16 = vqrshrn_n_s32(q3s32, 14);
+ d22s16 = vqrshrn_n_s32(q13s32, 14);
+ d23s16 = vqrshrn_n_s32(q15s32, 14);
+ *q9s16 = vcombine_s16(d18s16, d19s16);
+ *q11s16 = vcombine_s16(d22s16, d23s16);
+
+ q2s32 = vmull_s16(d20s16, d0s16);
+ q3s32 = vmull_s16(d21s16, d0s16);
+ q8s32 = vmull_s16(d20s16, d1s16);
+ q12s32 = vmull_s16(d21s16, d1s16);
+
+ q2s32 = vmlsl_s16(q2s32, d28s16, d1s16);
+ q3s32 = vmlsl_s16(q3s32, d29s16, d1s16);
+ q8s32 = vmlal_s16(q8s32, d28s16, d0s16);
+ q12s32 = vmlal_s16(q12s32, d29s16, d0s16);
+
+ d26s16 = vqrshrn_n_s32(q2s32, 14);
+ d27s16 = vqrshrn_n_s32(q3s32, 14);
+ d30s16 = vqrshrn_n_s32(q8s32, 14);
+ d31s16 = vqrshrn_n_s32(q12s32, 14);
+ *q13s16 = vcombine_s16(d26s16, d27s16);
+ *q15s16 = vcombine_s16(d30s16, d31s16);
+
+ q0s16 = vaddq_s16(*q9s16, *q15s16);
+ q1s16 = vaddq_s16(*q11s16, *q13s16);
+ q2s16 = vsubq_s16(*q11s16, *q13s16);
+ q3s16 = vsubq_s16(*q9s16, *q15s16);
+
+ *q13s16 = vsubq_s16(q4s16, q5s16);
+ q4s16 = vaddq_s16(q4s16, q5s16);
+ *q14s16 = vsubq_s16(q7s16, q6s16);
+ q7s16 = vaddq_s16(q7s16, q6s16);
+ d26s16 = vget_low_s16(*q13s16);
+ d27s16 = vget_high_s16(*q13s16);
+ d28s16 = vget_low_s16(*q14s16);
+ d29s16 = vget_high_s16(*q14s16);
+
+ d16s16 = vdup_n_s16(cospi_16_64);
+
+ q9s32 = vmull_s16(d28s16, d16s16);
+ q10s32 = vmull_s16(d29s16, d16s16);
+ q11s32 = vmull_s16(d28s16, d16s16);
+ q12s32 = vmull_s16(d29s16, d16s16);
+
+ q9s32 = vmlsl_s16(q9s32, d26s16, d16s16);
+ q10s32 = vmlsl_s16(q10s32, d27s16, d16s16);
+ q11s32 = vmlal_s16(q11s32, d26s16, d16s16);
+ q12s32 = vmlal_s16(q12s32, d27s16, d16s16);
+
+ d10s16 = vqrshrn_n_s32(q9s32, 14);
+ d11s16 = vqrshrn_n_s32(q10s32, 14);
+ d12s16 = vqrshrn_n_s32(q11s32, 14);
+ d13s16 = vqrshrn_n_s32(q12s32, 14);
+ q5s16 = vcombine_s16(d10s16, d11s16);
+ q6s16 = vcombine_s16(d12s16, d13s16);
+
+ *q8s16 = vaddq_s16(q0s16, q7s16);
+ *q9s16 = vaddq_s16(q1s16, q6s16);
+ *q10s16 = vaddq_s16(q2s16, q5s16);
+ *q11s16 = vaddq_s16(q3s16, q4s16);
+ *q12s16 = vsubq_s16(q3s16, q4s16);
+ *q13s16 = vsubq_s16(q2s16, q5s16);
+ *q14s16 = vsubq_s16(q1s16, q6s16);
+ *q15s16 = vsubq_s16(q0s16, q7s16);
+ return;
+}
+
+static INLINE void IADST8X8_1D(
+ int16x8_t *q8s16,
+ int16x8_t *q9s16,
+ int16x8_t *q10s16,
+ int16x8_t *q11s16,
+ int16x8_t *q12s16,
+ int16x8_t *q13s16,
+ int16x8_t *q14s16,
+ int16x8_t *q15s16) {
+ int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
+ int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
+ int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
+ int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
+ int16x8_t q2s16, q4s16, q5s16, q6s16;
+ int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q7s32, q8s32;
+ int32x4_t q9s32, q10s32, q11s32, q12s32, q13s32, q14s32, q15s32;
+
+ d16s16 = vget_low_s16(*q8s16);
+ d17s16 = vget_high_s16(*q8s16);
+ d18s16 = vget_low_s16(*q9s16);
+ d19s16 = vget_high_s16(*q9s16);
+ d20s16 = vget_low_s16(*q10s16);
+ d21s16 = vget_high_s16(*q10s16);
+ d22s16 = vget_low_s16(*q11s16);
+ d23s16 = vget_high_s16(*q11s16);
+ d24s16 = vget_low_s16(*q12s16);
+ d25s16 = vget_high_s16(*q12s16);
+ d26s16 = vget_low_s16(*q13s16);
+ d27s16 = vget_high_s16(*q13s16);
+ d28s16 = vget_low_s16(*q14s16);
+ d29s16 = vget_high_s16(*q14s16);
+ d30s16 = vget_low_s16(*q15s16);
+ d31s16 = vget_high_s16(*q15s16);
+
+ d14s16 = vdup_n_s16(cospi_2_64);
+ d15s16 = vdup_n_s16(cospi_30_64);
+
+ q1s32 = vmull_s16(d30s16, d14s16);
+ q2s32 = vmull_s16(d31s16, d14s16);
+ q3s32 = vmull_s16(d30s16, d15s16);
+ q4s32 = vmull_s16(d31s16, d15s16);
+
+ d30s16 = vdup_n_s16(cospi_18_64);
+ d31s16 = vdup_n_s16(cospi_14_64);
+
+ q1s32 = vmlal_s16(q1s32, d16s16, d15s16);
+ q2s32 = vmlal_s16(q2s32, d17s16, d15s16);
+ q3s32 = vmlsl_s16(q3s32, d16s16, d14s16);
+ q4s32 = vmlsl_s16(q4s32, d17s16, d14s16);
+
+ q5s32 = vmull_s16(d22s16, d30s16);
+ q6s32 = vmull_s16(d23s16, d30s16);
+ q7s32 = vmull_s16(d22s16, d31s16);
+ q8s32 = vmull_s16(d23s16, d31s16);
+
+ q5s32 = vmlal_s16(q5s32, d24s16, d31s16);
+ q6s32 = vmlal_s16(q6s32, d25s16, d31s16);
+ q7s32 = vmlsl_s16(q7s32, d24s16, d30s16);
+ q8s32 = vmlsl_s16(q8s32, d25s16, d30s16);
+
+ q11s32 = vaddq_s32(q1s32, q5s32);
+ q12s32 = vaddq_s32(q2s32, q6s32);
+ q1s32 = vsubq_s32(q1s32, q5s32);
+ q2s32 = vsubq_s32(q2s32, q6s32);
+
+ d22s16 = vqrshrn_n_s32(q11s32, 14);
+ d23s16 = vqrshrn_n_s32(q12s32, 14);
+ *q11s16 = vcombine_s16(d22s16, d23s16);
+
+ q12s32 = vaddq_s32(q3s32, q7s32);
+ q15s32 = vaddq_s32(q4s32, q8s32);
+ q3s32 = vsubq_s32(q3s32, q7s32);
+ q4s32 = vsubq_s32(q4s32, q8s32);
+
+ d2s16 = vqrshrn_n_s32(q1s32, 14);
+ d3s16 = vqrshrn_n_s32(q2s32, 14);
+ d24s16 = vqrshrn_n_s32(q12s32, 14);
+ d25s16 = vqrshrn_n_s32(q15s32, 14);
+ d6s16 = vqrshrn_n_s32(q3s32, 14);
+ d7s16 = vqrshrn_n_s32(q4s32, 14);
+ *q12s16 = vcombine_s16(d24s16, d25s16);
+
+ d0s16 = vdup_n_s16(cospi_10_64);
+ d1s16 = vdup_n_s16(cospi_22_64);
+ q4s32 = vmull_s16(d26s16, d0s16);
+ q5s32 = vmull_s16(d27s16, d0s16);
+ q2s32 = vmull_s16(d26s16, d1s16);
+ q6s32 = vmull_s16(d27s16, d1s16);
+
+ d30s16 = vdup_n_s16(cospi_26_64);
+ d31s16 = vdup_n_s16(cospi_6_64);
+
+ q4s32 = vmlal_s16(q4s32, d20s16, d1s16);
+ q5s32 = vmlal_s16(q5s32, d21s16, d1s16);
+ q2s32 = vmlsl_s16(q2s32, d20s16, d0s16);
+ q6s32 = vmlsl_s16(q6s32, d21s16, d0s16);
+
+ q0s32 = vmull_s16(d18s16, d30s16);
+ q13s32 = vmull_s16(d19s16, d30s16);
+
+ q0s32 = vmlal_s16(q0s32, d28s16, d31s16);
+ q13s32 = vmlal_s16(q13s32, d29s16, d31s16);
+
+ q10s32 = vmull_s16(d18s16, d31s16);
+ q9s32 = vmull_s16(d19s16, d31s16);
+
+ q10s32 = vmlsl_s16(q10s32, d28s16, d30s16);
+ q9s32 = vmlsl_s16(q9s32, d29s16, d30s16);
+
+ q14s32 = vaddq_s32(q2s32, q10s32);
+ q15s32 = vaddq_s32(q6s32, q9s32);
+ q2s32 = vsubq_s32(q2s32, q10s32);
+ q6s32 = vsubq_s32(q6s32, q9s32);
+
+ d28s16 = vqrshrn_n_s32(q14s32, 14);
+ d29s16 = vqrshrn_n_s32(q15s32, 14);
+ d4s16 = vqrshrn_n_s32(q2s32, 14);
+ d5s16 = vqrshrn_n_s32(q6s32, 14);
+ *q14s16 = vcombine_s16(d28s16, d29s16);
+
+ q9s32 = vaddq_s32(q4s32, q0s32);
+ q10s32 = vaddq_s32(q5s32, q13s32);
+ q4s32 = vsubq_s32(q4s32, q0s32);
+ q5s32 = vsubq_s32(q5s32, q13s32);
+
+ d30s16 = vdup_n_s16(cospi_8_64);
+ d31s16 = vdup_n_s16(cospi_24_64);
+
+ d18s16 = vqrshrn_n_s32(q9s32, 14);
+ d19s16 = vqrshrn_n_s32(q10s32, 14);
+ d8s16 = vqrshrn_n_s32(q4s32, 14);
+ d9s16 = vqrshrn_n_s32(q5s32, 14);
+ *q9s16 = vcombine_s16(d18s16, d19s16);
+
+ q5s32 = vmull_s16(d2s16, d30s16);
+ q6s32 = vmull_s16(d3s16, d30s16);
+ q7s32 = vmull_s16(d2s16, d31s16);
+ q0s32 = vmull_s16(d3s16, d31s16);
+
+ q5s32 = vmlal_s16(q5s32, d6s16, d31s16);
+ q6s32 = vmlal_s16(q6s32, d7s16, d31s16);
+ q7s32 = vmlsl_s16(q7s32, d6s16, d30s16);
+ q0s32 = vmlsl_s16(q0s32, d7s16, d30s16);
+
+ q1s32 = vmull_s16(d4s16, d30s16);
+ q3s32 = vmull_s16(d5s16, d30s16);
+ q10s32 = vmull_s16(d4s16, d31s16);
+ q2s32 = vmull_s16(d5s16, d31s16);
+
+ q1s32 = vmlsl_s16(q1s32, d8s16, d31s16);
+ q3s32 = vmlsl_s16(q3s32, d9s16, d31s16);
+ q10s32 = vmlal_s16(q10s32, d8s16, d30s16);
+ q2s32 = vmlal_s16(q2s32, d9s16, d30s16);
+
+ *q8s16 = vaddq_s16(*q11s16, *q9s16);
+ *q11s16 = vsubq_s16(*q11s16, *q9s16);
+ q4s16 = vaddq_s16(*q12s16, *q14s16);
+ *q12s16 = vsubq_s16(*q12s16, *q14s16);
+
+ q14s32 = vaddq_s32(q5s32, q1s32);
+ q15s32 = vaddq_s32(q6s32, q3s32);
+ q5s32 = vsubq_s32(q5s32, q1s32);
+ q6s32 = vsubq_s32(q6s32, q3s32);
+
+ d18s16 = vqrshrn_n_s32(q14s32, 14);
+ d19s16 = vqrshrn_n_s32(q15s32, 14);
+ d10s16 = vqrshrn_n_s32(q5s32, 14);
+ d11s16 = vqrshrn_n_s32(q6s32, 14);
+ *q9s16 = vcombine_s16(d18s16, d19s16);
+
+ q1s32 = vaddq_s32(q7s32, q10s32);
+ q3s32 = vaddq_s32(q0s32, q2s32);
+ q7s32 = vsubq_s32(q7s32, q10s32);
+ q0s32 = vsubq_s32(q0s32, q2s32);
+
+ d28s16 = vqrshrn_n_s32(q1s32, 14);
+ d29s16 = vqrshrn_n_s32(q3s32, 14);
+ d14s16 = vqrshrn_n_s32(q7s32, 14);
+ d15s16 = vqrshrn_n_s32(q0s32, 14);
+ *q14s16 = vcombine_s16(d28s16, d29s16);
+
+ d30s16 = vdup_n_s16(cospi_16_64);
+
+ d22s16 = vget_low_s16(*q11s16);
+ d23s16 = vget_high_s16(*q11s16);
+ q2s32 = vmull_s16(d22s16, d30s16);
+ q3s32 = vmull_s16(d23s16, d30s16);
+ q13s32 = vmull_s16(d22s16, d30s16);
+ q1s32 = vmull_s16(d23s16, d30s16);
+
+ d24s16 = vget_low_s16(*q12s16);
+ d25s16 = vget_high_s16(*q12s16);
+ q2s32 = vmlal_s16(q2s32, d24s16, d30s16);
+ q3s32 = vmlal_s16(q3s32, d25s16, d30s16);
+ q13s32 = vmlsl_s16(q13s32, d24s16, d30s16);
+ q1s32 = vmlsl_s16(q1s32, d25s16, d30s16);
+
+ d4s16 = vqrshrn_n_s32(q2s32, 14);
+ d5s16 = vqrshrn_n_s32(q3s32, 14);
+ d24s16 = vqrshrn_n_s32(q13s32, 14);
+ d25s16 = vqrshrn_n_s32(q1s32, 14);
+ q2s16 = vcombine_s16(d4s16, d5s16);
+ *q12s16 = vcombine_s16(d24s16, d25s16);
+
+ q13s32 = vmull_s16(d10s16, d30s16);
+ q1s32 = vmull_s16(d11s16, d30s16);
+ q11s32 = vmull_s16(d10s16, d30s16);
+ q0s32 = vmull_s16(d11s16, d30s16);
+
+ q13s32 = vmlal_s16(q13s32, d14s16, d30s16);
+ q1s32 = vmlal_s16(q1s32, d15s16, d30s16);
+ q11s32 = vmlsl_s16(q11s32, d14s16, d30s16);
+ q0s32 = vmlsl_s16(q0s32, d15s16, d30s16);
+
+ d20s16 = vqrshrn_n_s32(q13s32, 14);
+ d21s16 = vqrshrn_n_s32(q1s32, 14);
+ d12s16 = vqrshrn_n_s32(q11s32, 14);
+ d13s16 = vqrshrn_n_s32(q0s32, 14);
+ *q10s16 = vcombine_s16(d20s16, d21s16);
+ q6s16 = vcombine_s16(d12s16, d13s16);
+
+ q5s16 = vdupq_n_s16(0);
+
+ *q9s16 = vsubq_s16(q5s16, *q9s16);
+ *q11s16 = vsubq_s16(q5s16, q2s16);
+ *q13s16 = vsubq_s16(q5s16, q6s16);
+ *q15s16 = vsubq_s16(q5s16, q4s16);
+ return;
+}
+
+void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest,
+ int dest_stride, int tx_type) {
+ int i;
+ uint8_t *d1, *d2;
+ uint8x8_t d0u8, d1u8, d2u8, d3u8;
+ uint64x1_t d0u64, d1u64, d2u64, d3u64;
+ int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
+ uint16x8_t q8u16, q9u16, q10u16, q11u16;
+
+ q8s16 = vld1q_s16(input);
+ q9s16 = vld1q_s16(input + 8);
+ q10s16 = vld1q_s16(input + 8 * 2);
+ q11s16 = vld1q_s16(input + 8 * 3);
+ q12s16 = vld1q_s16(input + 8 * 4);
+ q13s16 = vld1q_s16(input + 8 * 5);
+ q14s16 = vld1q_s16(input + 8 * 6);
+ q15s16 = vld1q_s16(input + 8 * 7);
+
+ TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
+ &q12s16, &q13s16, &q14s16, &q15s16);
+
+ switch (tx_type) {
+ case 0: // idct_idct is not supported. Fall back to C
+ vp9_iht8x8_64_add_c(input, dest, dest_stride, tx_type);
+ return;
+ break;
+ case 1: // iadst_idct
+ // generate IDCT constants
+ // GENERATE_IDCT_CONSTANTS
+
+ // first transform rows
+ IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16,
+ &q12s16, &q13s16, &q14s16, &q15s16);
+
+ // transpose the matrix
+ TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
+ &q12s16, &q13s16, &q14s16, &q15s16);
+
+ // generate IADST constants
+ // GENERATE_IADST_CONSTANTS
+
+ // then transform columns
+ IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16,
+ &q12s16, &q13s16, &q14s16, &q15s16);
+ break;
+ case 2: // idct_iadst
+ // generate IADST constants
+ // GENERATE_IADST_CONSTANTS
+
+ // first transform rows
+ IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16,
+ &q12s16, &q13s16, &q14s16, &q15s16);
+
+ // transpose the matrix
+ TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
+ &q12s16, &q13s16, &q14s16, &q15s16);
+
+ // generate IDCT constants
+ // GENERATE_IDCT_CONSTANTS
+
+ // then transform columns
+ IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16,
+ &q12s16, &q13s16, &q14s16, &q15s16);
+ break;
+ case 3: // iadst_iadst
+ // generate IADST constants
+ // GENERATE_IADST_CONSTANTS
+
+ // first transform rows
+ IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16,
+ &q12s16, &q13s16, &q14s16, &q15s16);
+
+ // transpose the matrix
+ TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
+ &q12s16, &q13s16, &q14s16, &q15s16);
+
+ // then transform columns
+ IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16,
+ &q12s16, &q13s16, &q14s16, &q15s16);
+ break;
+ default: // invalid tx_type
+ assert(0);
+ break;
+ }
+
+ q8s16 = vrshrq_n_s16(q8s16, 5);
+ q9s16 = vrshrq_n_s16(q9s16, 5);
+ q10s16 = vrshrq_n_s16(q10s16, 5);
+ q11s16 = vrshrq_n_s16(q11s16, 5);
+ q12s16 = vrshrq_n_s16(q12s16, 5);
+ q13s16 = vrshrq_n_s16(q13s16, 5);
+ q14s16 = vrshrq_n_s16(q14s16, 5);
+ q15s16 = vrshrq_n_s16(q15s16, 5);
+
+ for (d1 = d2 = dest, i = 0; i < 2; i++) {
+ if (i != 0) {
+ q8s16 = q12s16;
+ q9s16 = q13s16;
+ q10s16 = q14s16;
+ q11s16 = q15s16;
+ }
+
+ d0u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+ d1u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+ d2u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+ d3u64 = vld1_u64((uint64_t *)d1);
+ d1 += dest_stride;
+
+ q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16),
+ vreinterpret_u8_u64(d0u64));
+ q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16),
+ vreinterpret_u8_u64(d1u64));
+ q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16),
+ vreinterpret_u8_u64(d2u64));
+ q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16),
+ vreinterpret_u8_u64(d3u64));
+
+ d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
+ d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
+ d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
+ d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
+
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
+ d2 += dest_stride;
+ vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
+ d2 += dest_stride;
+ }
+ return;
+}
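
The epilogue of vp9_iht8x8_64_add_neon above (vrshrq_n_s16 by 5, vaddw_u8, vqmovun_s16) rounds each residual by ROUND_POWER_OF_TWO(x, 5), adds it to the prediction and clips to 8 bits. A scalar sketch of that step; clip_pixel mirrors the helper in vp9_common.h, while add_residual_8x8 is an illustrative name:

#include <stdint.h>

static uint8_t clip_pixel(int val) {
  return (uint8_t)(val > 255 ? 255 : (val < 0 ? 0 : val));
}

/* Add the rounded 8x8 residual to the destination block, as the
   vaddw_u8/vqmovun_s16 loop above does eight pixels at a time. */
static void add_residual_8x8(const int16_t *res, uint8_t *dest,
                             int dest_stride) {
  int r, c;
  for (r = 0; r < 8; ++r)
    for (c = 0; c < 8; ++c)
      dest[r * dest_stride + c] = clip_pixel(
          ((res[r * 8 + c] + 16) >> 5) + dest[r * dest_stride + c]);
}
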
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_loopfilter_16_neon.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_loopfilter_16_neon.c
index bc6a17cd16f..c69ee1009f5 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_loopfilter_16_neon.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_loopfilter_16_neon.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
@@ -8,46 +8,172 @@
* be found in the AUTHORS file in the root of the source tree.
*/
+#include <arm_neon.h>
+
#include "./vp9_rtcd.h"
+#include "./vpx_config.h"
#include "vpx/vpx_integer.h"
-void vp9_lpf_horizontal_8_dual_neon(uint8_t *s, int p /* pitch */,
+static INLINE void vp9_loop_filter_neon_16(
+ uint8x16_t qblimit, // blimit
+ uint8x16_t qlimit, // limit
+ uint8x16_t qthresh, // thresh
+ uint8x16_t q3, // p3
+ uint8x16_t q4, // p2
+ uint8x16_t q5, // p1
+ uint8x16_t q6, // p0
+ uint8x16_t q7, // q0
+ uint8x16_t q8, // q1
+ uint8x16_t q9, // q2
+ uint8x16_t q10, // q3
+ uint8x16_t *q5r, // p1
+ uint8x16_t *q6r, // p0
+ uint8x16_t *q7r, // q0
+ uint8x16_t *q8r) { // q1
+ uint8x16_t q1u8, q2u8, q11u8, q12u8, q13u8, q14u8, q15u8;
+ int16x8_t q2s16, q11s16;
+ uint16x8_t q4u16;
+ int8x16_t q0s8, q1s8, q2s8, q11s8, q12s8, q13s8;
+ int8x8_t d2s8, d3s8;
+
+ q11u8 = vabdq_u8(q3, q4);
+ q12u8 = vabdq_u8(q4, q5);
+ q13u8 = vabdq_u8(q5, q6);
+ q14u8 = vabdq_u8(q8, q7);
+ q3 = vabdq_u8(q9, q8);
+ q4 = vabdq_u8(q10, q9);
+
+ q11u8 = vmaxq_u8(q11u8, q12u8);
+ q12u8 = vmaxq_u8(q13u8, q14u8);
+ q3 = vmaxq_u8(q3, q4);
+ q15u8 = vmaxq_u8(q11u8, q12u8);
+
+ q9 = vabdq_u8(q6, q7);
+
+ // vp8_hevmask
+ q13u8 = vcgtq_u8(q13u8, qthresh);
+ q14u8 = vcgtq_u8(q14u8, qthresh);
+ q15u8 = vmaxq_u8(q15u8, q3);
+
+ q2u8 = vabdq_u8(q5, q8);
+ q9 = vqaddq_u8(q9, q9);
+
+ q15u8 = vcgeq_u8(qlimit, q15u8);
+
+ // vp8_filter() function
+ // convert to signed
+ q10 = vdupq_n_u8(0x80);
+ q8 = veorq_u8(q8, q10);
+ q7 = veorq_u8(q7, q10);
+ q6 = veorq_u8(q6, q10);
+ q5 = veorq_u8(q5, q10);
+
+ q2u8 = vshrq_n_u8(q2u8, 1);
+ q9 = vqaddq_u8(q9, q2u8);
+
+ q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7)),
+ vget_low_s8(vreinterpretq_s8_u8(q6)));
+ q11s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7)),
+ vget_high_s8(vreinterpretq_s8_u8(q6)));
+
+ q9 = vcgeq_u8(qblimit, q9);
+
+ q1s8 = vqsubq_s8(vreinterpretq_s8_u8(q5),
+ vreinterpretq_s8_u8(q8));
+
+ q14u8 = vorrq_u8(q13u8, q14u8);
+
+ q4u16 = vdupq_n_u16(3);
+ q2s16 = vmulq_s16(q2s16, vreinterpretq_s16_u16(q4u16));
+ q11s16 = vmulq_s16(q11s16, vreinterpretq_s16_u16(q4u16));
+
+ q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q14u8);
+ q15u8 = vandq_u8(q15u8, q9);
+
+ q1s8 = vreinterpretq_s8_u8(q1u8);
+ q2s16 = vaddw_s8(q2s16, vget_low_s8(q1s8));
+ q11s16 = vaddw_s8(q11s16, vget_high_s8(q1s8));
+
+ q4 = vdupq_n_u8(3);
+ q9 = vdupq_n_u8(4);
+ // vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0))
+ d2s8 = vqmovn_s16(q2s16);
+ d3s8 = vqmovn_s16(q11s16);
+ q1s8 = vcombine_s8(d2s8, d3s8);
+ q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q15u8);
+ q1s8 = vreinterpretq_s8_u8(q1u8);
+
+ q2s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q4));
+ q1s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q9));
+ q2s8 = vshrq_n_s8(q2s8, 3);
+ q1s8 = vshrq_n_s8(q1s8, 3);
+
+ q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q6), q2s8);
+ q0s8 = vqsubq_s8(vreinterpretq_s8_u8(q7), q1s8);
+
+ q1s8 = vrshrq_n_s8(q1s8, 1);
+ q1s8 = vbicq_s8(q1s8, vreinterpretq_s8_u8(q14u8));
+
+ q13s8 = vqaddq_s8(vreinterpretq_s8_u8(q5), q1s8);
+ q12s8 = vqsubq_s8(vreinterpretq_s8_u8(q8), q1s8);
+
+ *q8r = veorq_u8(vreinterpretq_u8_s8(q12s8), q10);
+ *q7r = veorq_u8(vreinterpretq_u8_s8(q0s8), q10);
+ *q6r = veorq_u8(vreinterpretq_u8_s8(q11s8), q10);
+ *q5r = veorq_u8(vreinterpretq_u8_s8(q13s8), q10);
+ return;
+}
+
+void vp9_lpf_horizontal_4_dual_neon(uint8_t *s, int p /* pitch */,
const uint8_t *blimit0,
const uint8_t *limit0,
const uint8_t *thresh0,
const uint8_t *blimit1,
const uint8_t *limit1,
const uint8_t *thresh1) {
- vp9_lpf_horizontal_8(s, p, blimit0, limit0, thresh0, 1);
- vp9_lpf_horizontal_8(s + 8, p, blimit1, limit1, thresh1, 1);
-}
+ uint8x8_t dblimit0, dlimit0, dthresh0, dblimit1, dlimit1, dthresh1;
+ uint8x16_t qblimit, qlimit, qthresh;
+ uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8, q10u8;
-void vp9_lpf_vertical_4_dual_neon(uint8_t *s, int p,
- const uint8_t *blimit0,
- const uint8_t *limit0,
- const uint8_t *thresh0,
- const uint8_t *blimit1,
- const uint8_t *limit1,
- const uint8_t *thresh1) {
- vp9_lpf_vertical_4_neon(s, p, blimit0, limit0, thresh0, 1);
- vp9_lpf_vertical_4_neon(s + 8 * p, p, blimit1, limit1, thresh1, 1);
-}
+ dblimit0 = vld1_u8(blimit0);
+ dlimit0 = vld1_u8(limit0);
+ dthresh0 = vld1_u8(thresh0);
+ dblimit1 = vld1_u8(blimit1);
+ dlimit1 = vld1_u8(limit1);
+ dthresh1 = vld1_u8(thresh1);
+ qblimit = vcombine_u8(dblimit0, dblimit1);
+ qlimit = vcombine_u8(dlimit0, dlimit1);
+ qthresh = vcombine_u8(dthresh0, dthresh1);
-void vp9_lpf_vertical_8_dual_neon(uint8_t *s, int p,
- const uint8_t *blimit0,
- const uint8_t *limit0,
- const uint8_t *thresh0,
- const uint8_t *blimit1,
- const uint8_t *limit1,
- const uint8_t *thresh1) {
- vp9_lpf_vertical_8_neon(s, p, blimit0, limit0, thresh0, 1);
- vp9_lpf_vertical_8_neon(s + 8 * p, p, blimit1, limit1, thresh1, 1);
-}
+ s -= (p << 2);
+
+ q3u8 = vld1q_u8(s);
+ s += p;
+ q4u8 = vld1q_u8(s);
+ s += p;
+ q5u8 = vld1q_u8(s);
+ s += p;
+ q6u8 = vld1q_u8(s);
+ s += p;
+ q7u8 = vld1q_u8(s);
+ s += p;
+ q8u8 = vld1q_u8(s);
+ s += p;
+ q9u8 = vld1q_u8(s);
+ s += p;
+ q10u8 = vld1q_u8(s);
+
+ vp9_loop_filter_neon_16(qblimit, qlimit, qthresh,
+ q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8, q10u8,
+ &q5u8, &q6u8, &q7u8, &q8u8);
-void vp9_lpf_vertical_16_dual_neon(uint8_t *s, int p,
- const uint8_t *blimit,
- const uint8_t *limit,
- const uint8_t *thresh) {
- vp9_lpf_vertical_16_neon(s, p, blimit, limit, thresh);
- vp9_lpf_vertical_16_neon(s + 8 * p, p, blimit, limit, thresh);
+ s -= (p * 5);
+ vst1q_u8(s, q5u8);
+ s += p;
+ vst1q_u8(s, q6u8);
+ s += p;
+ vst1q_u8(s, q7u8);
+ s += p;
+ vst1q_u8(s, q8u8);
+ return;
}
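
The vabdq_u8/vmaxq_u8/vcgeq_u8 sequence at the top of vp9_loop_filter_neon_16 evaluates, sixteen columns at a time, the same condition as the scalar filter_mask logic in vp9_loopfilter.c. A per-column sketch, returning a boolean instead of the 0/-1 lane masks the vector code carries:

#include <stdint.h>
#include <stdlib.h>

/* 1 if this column should be filtered: every neighbouring difference is
   within limit and the weighted edge difference is within blimit. */
static int filter_mask(uint8_t limit, uint8_t blimit,
                       uint8_t p3, uint8_t p2, uint8_t p1, uint8_t p0,
                       uint8_t q0, uint8_t q1, uint8_t q2, uint8_t q3) {
  return abs(p3 - p2) <= limit && abs(p2 - p1) <= limit &&
         abs(p1 - p0) <= limit && abs(q1 - q0) <= limit &&
         abs(q2 - q1) <= limit && abs(q3 - q2) <= limit &&
         abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= blimit;
}
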
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_loopfilter_16_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_loopfilter_16_neon_asm.asm
index 5b8ec20287d..5b8ec20287d 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_loopfilter_16_neon.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_loopfilter_16_neon_asm.asm
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_loopfilter_4_neon.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_loopfilter_4_neon.c
new file mode 100644
index 00000000000..fd9db6187c6
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_loopfilter_4_neon.c
@@ -0,0 +1,274 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+
+static INLINE void vp9_loop_filter_neon(
+ uint8x8_t dblimit, // flimit
+ uint8x8_t dlimit, // limit
+ uint8x8_t dthresh, // thresh
+ uint8x8_t d3u8, // p3
+ uint8x8_t d4u8, // p2
+ uint8x8_t d5u8, // p1
+ uint8x8_t d6u8, // p0
+ uint8x8_t d7u8, // q0
+ uint8x8_t d16u8, // q1
+ uint8x8_t d17u8, // q2
+ uint8x8_t d18u8, // q3
+ uint8x8_t *d4ru8, // p1
+ uint8x8_t *d5ru8, // p0
+ uint8x8_t *d6ru8, // q0
+ uint8x8_t *d7ru8) { // q1
+ uint8x8_t d19u8, d20u8, d21u8, d22u8, d23u8, d27u8, d28u8;
+ int16x8_t q12s16;
+ int8x8_t d19s8, d20s8, d21s8, d26s8, d27s8, d28s8;
+
+ d19u8 = vabd_u8(d3u8, d4u8);
+ d20u8 = vabd_u8(d4u8, d5u8);
+ d21u8 = vabd_u8(d5u8, d6u8);
+ d22u8 = vabd_u8(d16u8, d7u8);
+ d3u8 = vabd_u8(d17u8, d16u8);
+ d4u8 = vabd_u8(d18u8, d17u8);
+
+ d19u8 = vmax_u8(d19u8, d20u8);
+ d20u8 = vmax_u8(d21u8, d22u8);
+ d3u8 = vmax_u8(d3u8, d4u8);
+ d23u8 = vmax_u8(d19u8, d20u8);
+
+ d17u8 = vabd_u8(d6u8, d7u8);
+
+ d21u8 = vcgt_u8(d21u8, dthresh);
+ d22u8 = vcgt_u8(d22u8, dthresh);
+ d23u8 = vmax_u8(d23u8, d3u8);
+
+ d28u8 = vabd_u8(d5u8, d16u8);
+ d17u8 = vqadd_u8(d17u8, d17u8);
+
+ d23u8 = vcge_u8(dlimit, d23u8);
+
+ d18u8 = vdup_n_u8(0x80);
+ d5u8 = veor_u8(d5u8, d18u8);
+ d6u8 = veor_u8(d6u8, d18u8);
+ d7u8 = veor_u8(d7u8, d18u8);
+ d16u8 = veor_u8(d16u8, d18u8);
+
+ d28u8 = vshr_n_u8(d28u8, 1);
+ d17u8 = vqadd_u8(d17u8, d28u8);
+
+ d19u8 = vdup_n_u8(3);
+
+ d28s8 = vsub_s8(vreinterpret_s8_u8(d7u8),
+ vreinterpret_s8_u8(d6u8));
+
+ d17u8 = vcge_u8(dblimit, d17u8);
+
+ d27s8 = vqsub_s8(vreinterpret_s8_u8(d5u8),
+ vreinterpret_s8_u8(d16u8));
+
+ d22u8 = vorr_u8(d21u8, d22u8);
+
+ q12s16 = vmull_s8(d28s8, vreinterpret_s8_u8(d19u8));
+
+ d27u8 = vand_u8(vreinterpret_u8_s8(d27s8), d22u8);
+ d23u8 = vand_u8(d23u8, d17u8);
+
+ q12s16 = vaddw_s8(q12s16, vreinterpret_s8_u8(d27u8));
+
+ d17u8 = vdup_n_u8(4);
+
+ d27s8 = vqmovn_s16(q12s16);
+ d27u8 = vand_u8(vreinterpret_u8_s8(d27s8), d23u8);
+ d27s8 = vreinterpret_s8_u8(d27u8);
+
+ d28s8 = vqadd_s8(d27s8, vreinterpret_s8_u8(d19u8));
+ d27s8 = vqadd_s8(d27s8, vreinterpret_s8_u8(d17u8));
+ d28s8 = vshr_n_s8(d28s8, 3);
+ d27s8 = vshr_n_s8(d27s8, 3);
+
+ d19s8 = vqadd_s8(vreinterpret_s8_u8(d6u8), d28s8);
+ d26s8 = vqsub_s8(vreinterpret_s8_u8(d7u8), d27s8);
+
+ d27s8 = vrshr_n_s8(d27s8, 1);
+ d27s8 = vbic_s8(d27s8, vreinterpret_s8_u8(d22u8));
+
+ d21s8 = vqadd_s8(vreinterpret_s8_u8(d5u8), d27s8);
+ d20s8 = vqsub_s8(vreinterpret_s8_u8(d16u8), d27s8);
+
+ *d4ru8 = veor_u8(vreinterpret_u8_s8(d21s8), d18u8);
+ *d5ru8 = veor_u8(vreinterpret_u8_s8(d19s8), d18u8);
+ *d6ru8 = veor_u8(vreinterpret_u8_s8(d26s8), d18u8);
+ *d7ru8 = veor_u8(vreinterpret_u8_s8(d20s8), d18u8);
+ return;
+}
+
+void vp9_lpf_horizontal_4_neon(
+ unsigned char *src,
+ int pitch,
+ unsigned char *blimit,
+ unsigned char *limit,
+ unsigned char *thresh,
+ int count) {
+ int i;
+ uint8_t *s, *psrc;
+ uint8x8_t dblimit, dlimit, dthresh;
+ uint8x8_t d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8;
+
+ if (count == 0) // end_vp9_lf_h_edge
+ return;
+
+ dblimit = vld1_u8(blimit);
+ dlimit = vld1_u8(limit);
+ dthresh = vld1_u8(thresh);
+
+ psrc = src - (pitch << 2);
+ for (i = 0; i < count; i++) {
+ s = psrc + i * 8;
+
+ d3u8 = vld1_u8(s);
+ s += pitch;
+ d4u8 = vld1_u8(s);
+ s += pitch;
+ d5u8 = vld1_u8(s);
+ s += pitch;
+ d6u8 = vld1_u8(s);
+ s += pitch;
+ d7u8 = vld1_u8(s);
+ s += pitch;
+ d16u8 = vld1_u8(s);
+ s += pitch;
+ d17u8 = vld1_u8(s);
+ s += pitch;
+ d18u8 = vld1_u8(s);
+
+ vp9_loop_filter_neon(dblimit, dlimit, dthresh,
+ d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8,
+ &d4u8, &d5u8, &d6u8, &d7u8);
+
+ s -= (pitch * 5);
+ vst1_u8(s, d4u8);
+ s += pitch;
+ vst1_u8(s, d5u8);
+ s += pitch;
+ vst1_u8(s, d6u8);
+ s += pitch;
+ vst1_u8(s, d7u8);
+ }
+ return;
+}
+
+void vp9_lpf_vertical_4_neon(
+ unsigned char *src,
+ int pitch,
+ unsigned char *blimit,
+ unsigned char *limit,
+ unsigned char *thresh,
+ int count) {
+ int i, pitch8;
+ uint8_t *s;
+ uint8x8_t dblimit, dlimit, dthresh;
+ uint8x8_t d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8;
+ uint32x2x2_t d2tmp0, d2tmp1, d2tmp2, d2tmp3;
+ uint16x4x2_t d2tmp4, d2tmp5, d2tmp6, d2tmp7;
+ uint8x8x2_t d2tmp8, d2tmp9, d2tmp10, d2tmp11;
+ uint8x8x4_t d4Result;
+
+ if (count == 0) // end_vp9_lf_v_edge
+ return;
+
+ dblimit = vld1_u8(blimit);
+ dlimit = vld1_u8(limit);
+ dthresh = vld1_u8(thresh);
+
+ pitch8 = pitch * 8;
+ for (i = 0; i < count; i++, src += pitch8) {
+ s = src - (i + 1) * 4;
+
+ d3u8 = vld1_u8(s);
+ s += pitch;
+ d4u8 = vld1_u8(s);
+ s += pitch;
+ d5u8 = vld1_u8(s);
+ s += pitch;
+ d6u8 = vld1_u8(s);
+ s += pitch;
+ d7u8 = vld1_u8(s);
+ s += pitch;
+ d16u8 = vld1_u8(s);
+ s += pitch;
+ d17u8 = vld1_u8(s);
+ s += pitch;
+ d18u8 = vld1_u8(s);
+
+ d2tmp0 = vtrn_u32(vreinterpret_u32_u8(d3u8),
+ vreinterpret_u32_u8(d7u8));
+ d2tmp1 = vtrn_u32(vreinterpret_u32_u8(d4u8),
+ vreinterpret_u32_u8(d16u8));
+ d2tmp2 = vtrn_u32(vreinterpret_u32_u8(d5u8),
+ vreinterpret_u32_u8(d17u8));
+ d2tmp3 = vtrn_u32(vreinterpret_u32_u8(d6u8),
+ vreinterpret_u32_u8(d18u8));
+
+ d2tmp4 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[0]),
+ vreinterpret_u16_u32(d2tmp2.val[0]));
+ d2tmp5 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[0]),
+ vreinterpret_u16_u32(d2tmp3.val[0]));
+ d2tmp6 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[1]),
+ vreinterpret_u16_u32(d2tmp2.val[1]));
+ d2tmp7 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[1]),
+ vreinterpret_u16_u32(d2tmp3.val[1]));
+
+ d2tmp8 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[0]),
+ vreinterpret_u8_u16(d2tmp5.val[0]));
+ d2tmp9 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[1]),
+ vreinterpret_u8_u16(d2tmp5.val[1]));
+ d2tmp10 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[0]),
+ vreinterpret_u8_u16(d2tmp7.val[0]));
+ d2tmp11 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[1]),
+ vreinterpret_u8_u16(d2tmp7.val[1]));
+
+ d3u8 = d2tmp8.val[0];
+ d4u8 = d2tmp8.val[1];
+ d5u8 = d2tmp9.val[0];
+ d6u8 = d2tmp9.val[1];
+ d7u8 = d2tmp10.val[0];
+ d16u8 = d2tmp10.val[1];
+ d17u8 = d2tmp11.val[0];
+ d18u8 = d2tmp11.val[1];
+
+ vp9_loop_filter_neon(dblimit, dlimit, dthresh,
+ d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8,
+ &d4u8, &d5u8, &d6u8, &d7u8);
+
+ d4Result.val[0] = d4u8;
+ d4Result.val[1] = d5u8;
+ d4Result.val[2] = d6u8;
+ d4Result.val[3] = d7u8;
+
+ src -= 2;
+ vst4_lane_u8(src, d4Result, 0);
+ src += pitch;
+ vst4_lane_u8(src, d4Result, 1);
+ src += pitch;
+ vst4_lane_u8(src, d4Result, 2);
+ src += pitch;
+ vst4_lane_u8(src, d4Result, 3);
+ src += pitch;
+ vst4_lane_u8(src, d4Result, 4);
+ src += pitch;
+ vst4_lane_u8(src, d4Result, 5);
+ src += pitch;
+ vst4_lane_u8(src, d4Result, 6);
+ src += pitch;
+ vst4_lane_u8(src, d4Result, 7);
+ }
+ return;
+}
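
The vtrn_u32/vtrn_u16/vtrn_u8 cascade in vp9_lpf_vertical_4_neon is a three-stage 8x8 byte transpose: swapping 4x4, 2x2 and finally 1x1 sub-blocks turns eight row vectors into eight column vectors without leaving the NEON registers. The scalar equivalent, for reference:

#include <stdint.h>

/* Plain 8x8 byte transpose; the three vtrn passes above produce the
   same layout in-register. */
static void transpose_8x8(const uint8_t in[8][8], uint8_t out[8][8]) {
  int r, c;
  for (r = 0; r < 8; ++r)
    for (c = 0; c < 8; ++c)
      out[c][r] = in[r][c];
}
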
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_loopfilter_4_neon_asm.asm b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_loopfilter_4_neon_asm.asm
new file mode 100644
index 00000000000..7738e0d3abc
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_loopfilter_4_neon_asm.asm
@@ -0,0 +1,277 @@
+;
+; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+ EXPORT |vp9_lpf_horizontal_4_neon|
+ EXPORT |vp9_lpf_vertical_4_neon|
+ ARM
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; Currently vp9 works on only 8 iterations at a time, whereas the vp8 loop
+; filter works on 16 iterations at a time.
+; TODO(fgalligan): See about removing the count code as this function is only
+; called with a count of 1.
+;
+; void vp9_lpf_horizontal_4_neon(uint8_t *s,
+; int p /* pitch */,
+; const uint8_t *blimit,
+; const uint8_t *limit,
+; const uint8_t *thresh,
+; int count)
+;
+; r0 uint8_t *s,
+; r1 int p, /* pitch */
+; r2 const uint8_t *blimit,
+; r3 const uint8_t *limit,
+; sp const uint8_t *thresh,
+; sp+4 int count
+|vp9_lpf_horizontal_4_neon| PROC
+ push {lr}
+
+ vld1.8 {d0[]}, [r2] ; duplicate *blimit
+ ldr r12, [sp, #8] ; load count
+ ldr r2, [sp, #4] ; load thresh
+ add r1, r1, r1 ; double pitch
+
+ cmp r12, #0
+ beq end_vp9_lf_h_edge
+
+ vld1.8 {d1[]}, [r3] ; duplicate *limit
+ vld1.8 {d2[]}, [r2] ; duplicate *thresh
+
+count_lf_h_loop
+ sub r2, r0, r1, lsl #1 ; move src pointer down by 4 lines
+ add r3, r2, r1, lsr #1 ; set to 3 lines down
+
+ vld1.u8 {d3}, [r2@64], r1 ; p3
+ vld1.u8 {d4}, [r3@64], r1 ; p2
+ vld1.u8 {d5}, [r2@64], r1 ; p1
+ vld1.u8 {d6}, [r3@64], r1 ; p0
+ vld1.u8 {d7}, [r2@64], r1 ; q0
+ vld1.u8 {d16}, [r3@64], r1 ; q1
+ vld1.u8 {d17}, [r2@64] ; q2
+ vld1.u8 {d18}, [r3@64] ; q3
+
+ sub r2, r2, r1, lsl #1
+ sub r3, r3, r1, lsl #1
+
+ bl vp9_loop_filter_neon
+
+ vst1.u8 {d4}, [r2@64], r1 ; store op1
+ vst1.u8 {d5}, [r3@64], r1 ; store op0
+ vst1.u8 {d6}, [r2@64], r1 ; store oq0
+ vst1.u8 {d7}, [r3@64], r1 ; store oq1
+
+ add r0, r0, #8
+ subs r12, r12, #1
+ bne count_lf_h_loop
+
+end_vp9_lf_h_edge
+ pop {pc}
+ ENDP ; |vp9_lpf_horizontal_4_neon|
+
+; Currently vp9 works on only 8 iterations at a time, whereas the vp8 loop
+; filter works on 16 iterations at a time.
+; TODO(fgalligan): See about removing the count code as this function is only
+; called with a count of 1.
+;
+; void vp9_lpf_vertical_4_neon(uint8_t *s,
+; int p /* pitch */,
+; const uint8_t *blimit,
+; const uint8_t *limit,
+; const uint8_t *thresh,
+; int count)
+;
+; r0 uint8_t *s,
+; r1 int p, /* pitch */
+; r2 const uint8_t *blimit,
+; r3 const uint8_t *limit,
+; sp const uint8_t *thresh,
+; sp+4 int count
+|vp9_lpf_vertical_4_neon| PROC
+ push {lr}
+
+ vld1.8 {d0[]}, [r2] ; duplicate *blimit
+ ldr r12, [sp, #8] ; load count
+ vld1.8 {d1[]}, [r3] ; duplicate *limit
+
+ ldr r3, [sp, #4] ; load thresh
+ sub r2, r0, #4 ; move s pointer down by 4 columns
+ cmp r12, #0
+ beq end_vp9_lf_v_edge
+
+ vld1.8 {d2[]}, [r3] ; duplicate *thresh
+
+count_lf_v_loop
+ vld1.u8 {d3}, [r2], r1 ; load s data
+ vld1.u8 {d4}, [r2], r1
+ vld1.u8 {d5}, [r2], r1
+ vld1.u8 {d6}, [r2], r1
+ vld1.u8 {d7}, [r2], r1
+ vld1.u8 {d16}, [r2], r1
+ vld1.u8 {d17}, [r2], r1
+ vld1.u8 {d18}, [r2]
+
+ ;transpose the 8x8 matrix
+ vtrn.32 d3, d7
+ vtrn.32 d4, d16
+ vtrn.32 d5, d17
+ vtrn.32 d6, d18
+
+ vtrn.16 d3, d5
+ vtrn.16 d4, d6
+ vtrn.16 d7, d17
+ vtrn.16 d16, d18
+
+ vtrn.8 d3, d4
+ vtrn.8 d5, d6
+ vtrn.8 d7, d16
+ vtrn.8 d17, d18
+
+ bl vp9_loop_filter_neon
+
+ sub r0, r0, #2
+
+ ;store op1, op0, oq0, oq1
+ vst4.8 {d4[0], d5[0], d6[0], d7[0]}, [r0], r1
+ vst4.8 {d4[1], d5[1], d6[1], d7[1]}, [r0], r1
+ vst4.8 {d4[2], d5[2], d6[2], d7[2]}, [r0], r1
+ vst4.8 {d4[3], d5[3], d6[3], d7[3]}, [r0], r1
+ vst4.8 {d4[4], d5[4], d6[4], d7[4]}, [r0], r1
+ vst4.8 {d4[5], d5[5], d6[5], d7[5]}, [r0], r1
+ vst4.8 {d4[6], d5[6], d6[6], d7[6]}, [r0], r1
+ vst4.8 {d4[7], d5[7], d6[7], d7[7]}, [r0]
+
+ add r0, r0, r1, lsl #3 ; s += pitch * 8
+ subs r12, r12, #1
+ subne r2, r0, #4 ; move s pointer down by 4 columns
+ bne count_lf_v_loop
+
+end_vp9_lf_v_edge
+ pop {pc}
+ ENDP ; |vp9_lpf_vertical_4_neon|
+
+; void vp9_loop_filter_neon();
+; This is a helper function for the loopfilters. The individual functions do the
+; necessary load, transpose (if necessary) and store. The function does not use
+; registers d8-d15.
+;
+; Inputs:
+; r0-r3, r12 PRESERVE
+; d0 blimit
+; d1 limit
+; d2 thresh
+; d3 p3
+; d4 p2
+; d5 p1
+; d6 p0
+; d7 q0
+; d16 q1
+; d17 q2
+; d18 q3
+;
+; Outputs:
+; d4 op1
+; d5 op0
+; d6 oq0
+; d7 oq1
+|vp9_loop_filter_neon| PROC
+ ; filter_mask
+ vabd.u8 d19, d3, d4 ; m1 = abs(p3 - p2)
+ vabd.u8 d20, d4, d5 ; m2 = abs(p2 - p1)
+ vabd.u8 d21, d5, d6 ; m3 = abs(p1 - p0)
+ vabd.u8 d22, d16, d7 ; m4 = abs(q1 - q0)
+ vabd.u8 d3, d17, d16 ; m5 = abs(q2 - q1)
+ vabd.u8 d4, d18, d17 ; m6 = abs(q3 - q2)
+
+ ; only compare the largest value to limit
+ vmax.u8 d19, d19, d20 ; m1 = max(m1, m2)
+ vmax.u8 d20, d21, d22 ; m2 = max(m3, m4)
+
+ vabd.u8 d17, d6, d7 ; abs(p0 - q0)
+
+ vmax.u8 d3, d3, d4 ; m3 = max(m5, m6)
+
+ vmov.u8 d18, #0x80
+
+ vmax.u8 d23, d19, d20 ; m1 = max(m1, m2)
+
+ ; hevmask
+ vcgt.u8 d21, d21, d2 ; (abs(p1 - p0) > thresh)*-1
+ vcgt.u8 d22, d22, d2 ; (abs(q1 - q0) > thresh)*-1
+ vmax.u8 d23, d23, d3 ; m1 = max(m1, m3)
+
+ vabd.u8 d28, d5, d16 ; a = abs(p1 - q1)
+ vqadd.u8 d17, d17, d17 ; b = abs(p0 - q0) * 2
+
+ veor d7, d7, d18 ; qs0
+
+ vcge.u8 d23, d1, d23 ; m1 <= limit
+
+ ; filter() function
+ ; convert to signed
+
+ vshr.u8 d28, d28, #1 ; a = a / 2
+ veor d6, d6, d18 ; ps0
+
+ veor d5, d5, d18 ; ps1
+ vqadd.u8 d17, d17, d28 ; a = b + a
+
+ veor d16, d16, d18 ; qs1
+
+ vmov.u8 d19, #3
+
+ vsub.s8 d28, d7, d6 ; ( qs0 - ps0)
+
+ vcge.u8 d17, d0, d17 ; a <= blimit
+
+ vqsub.s8 d27, d5, d16 ; filter = clamp(ps1-qs1)
+ vorr d22, d21, d22 ; hevmask
+
+ vmull.s8 q12, d28, d19 ; 3 * ( qs0 - ps0)
+
+ vand d27, d27, d22 ; filter &= hev
+ vand d23, d23, d17 ; filter_mask
+
+ vaddw.s8 q12, q12, d27 ; filter + 3 * (qs0 - ps0)
+
+ vmov.u8 d17, #4
+
+ ; filter = clamp(filter + 3 * ( qs0 - ps0))
+ vqmovn.s16 d27, q12
+
+ vand d27, d27, d23 ; filter &= mask
+
+ vqadd.s8 d28, d27, d19 ; filter2 = clamp(filter+3)
+ vqadd.s8 d27, d27, d17 ; filter1 = clamp(filter+4)
+ vshr.s8 d28, d28, #3 ; filter2 >>= 3
+ vshr.s8 d27, d27, #3 ; filter1 >>= 3
+
+ vqadd.s8 d19, d6, d28 ; u = clamp(ps0 + filter2)
+ vqsub.s8 d26, d7, d27 ; u = clamp(qs0 - filter1)
+
+ ; outer tap adjustments
+ vrshr.s8 d27, d27, #1 ; filter = (filter1 + 1) >> 1
+
+ veor d6, d26, d18 ; *oq0 = u^0x80
+
+ vbic d27, d27, d22 ; filter &= ~hev
+
+ vqadd.s8 d21, d5, d27 ; u = clamp(ps1 + filter)
+ vqsub.s8 d20, d16, d27 ; u = clamp(qs1 - filter)
+
+ veor d5, d19, d18 ; *op0 = u^0x80
+ veor d4, d21, d18 ; *op1 = u^0x80
+ veor d7, d20, d18 ; *oq1 = u^0x80
+
+ bx lr
+ ENDP ; |vp9_loop_filter_neon|
+
+ END
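
The register-level comments in |vp9_loop_filter_neon| above track vp9's scalar 4-tap filter update. A per-column sketch of the same arithmetic, assuming the values are already biased into signed range by the XOR with 0x80 and that mask/hev arrive as 0 or -1 lane masks, as in vp9_loopfilter.c:

#include <stdint.h>

static int8_t signed_char_clamp(int t) {
  return (int8_t)(t < -128 ? -128 : (t > 127 ? 127 : t));
}

static void filter4(int8_t mask, int8_t hev,
                    int8_t *ps1, int8_t *ps0, int8_t *qs0, int8_t *qs1) {
  int8_t filter, filter1, filter2;

  filter = signed_char_clamp(*ps1 - *qs1) & hev;  /* inner taps only if hev */
  filter = signed_char_clamp(filter + 3 * (*qs0 - *ps0)) & mask;

  filter1 = signed_char_clamp(filter + 4) >> 3;
  filter2 = signed_char_clamp(filter + 3) >> 3;

  *qs0 = signed_char_clamp(*qs0 - filter1);
  *ps0 = signed_char_clamp(*ps0 + filter2);

  /* outer tap adjustment, skipped for high edge variance (hev) columns */
  filter = (int8_t)((filter1 + 1) >> 1) & ~hev;
  *qs1 = signed_char_clamp(*qs1 - filter);
  *ps1 = signed_char_clamp(*ps1 + filter);
}
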
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_loopfilter_8_neon.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_loopfilter_8_neon.c
new file mode 100644
index 00000000000..33068a8a203
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_loopfilter_8_neon.c
@@ -0,0 +1,453 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+
+static INLINE void vp9_mbloop_filter_neon(
+ uint8x8_t dblimit, // mblimit
+ uint8x8_t dlimit, // limit
+ uint8x8_t dthresh, // thresh
+ uint8x8_t d3u8, // p3
+ uint8x8_t d4u8, // p2
+ uint8x8_t d5u8, // p1
+ uint8x8_t d6u8, // p0
+ uint8x8_t d7u8, // q0
+ uint8x8_t d16u8, // q1
+ uint8x8_t d17u8, // q2
+ uint8x8_t d18u8, // q3
+ uint8x8_t *d0ru8, // p2
+ uint8x8_t *d1ru8, // p1
+ uint8x8_t *d2ru8, // p0
+ uint8x8_t *d3ru8, // q0
+ uint8x8_t *d4ru8, // q1
+ uint8x8_t *d5ru8) { // q2
+ uint32_t flat;
+ uint8x8_t d0u8, d1u8, d2u8, d19u8, d20u8, d21u8, d22u8, d23u8, d24u8;
+ uint8x8_t d25u8, d26u8, d27u8, d28u8, d29u8, d30u8, d31u8;
+ int16x8_t q15s16;
+ uint16x8_t q10u16, q14u16;
+ int8x8_t d21s8, d24s8, d25s8, d26s8, d28s8, d29s8, d30s8;
+
+ d19u8 = vabd_u8(d3u8, d4u8);
+ d20u8 = vabd_u8(d4u8, d5u8);
+ d21u8 = vabd_u8(d5u8, d6u8);
+ d22u8 = vabd_u8(d16u8, d7u8);
+ d23u8 = vabd_u8(d17u8, d16u8);
+ d24u8 = vabd_u8(d18u8, d17u8);
+
+ d19u8 = vmax_u8(d19u8, d20u8);
+ d20u8 = vmax_u8(d21u8, d22u8);
+
+ d25u8 = vabd_u8(d6u8, d4u8);
+
+ d23u8 = vmax_u8(d23u8, d24u8);
+
+ d26u8 = vabd_u8(d7u8, d17u8);
+
+ d19u8 = vmax_u8(d19u8, d20u8);
+
+ d24u8 = vabd_u8(d6u8, d7u8);
+ d27u8 = vabd_u8(d3u8, d6u8);
+ d28u8 = vabd_u8(d18u8, d7u8);
+
+ d19u8 = vmax_u8(d19u8, d23u8);
+
+ d23u8 = vabd_u8(d5u8, d16u8);
+ d24u8 = vqadd_u8(d24u8, d24u8);
+
+
+ d19u8 = vcge_u8(dlimit, d19u8);
+
+
+ d25u8 = vmax_u8(d25u8, d26u8);
+ d26u8 = vmax_u8(d27u8, d28u8);
+
+ d23u8 = vshr_n_u8(d23u8, 1);
+
+ d25u8 = vmax_u8(d25u8, d26u8);
+
+ d24u8 = vqadd_u8(d24u8, d23u8);
+
+ d20u8 = vmax_u8(d20u8, d25u8);
+
+ d23u8 = vdup_n_u8(1);
+ d24u8 = vcge_u8(dblimit, d24u8);
+
+ d21u8 = vcgt_u8(d21u8, dthresh);
+
+ d20u8 = vcge_u8(d23u8, d20u8);
+
+ d19u8 = vand_u8(d19u8, d24u8);
+
+ d23u8 = vcgt_u8(d22u8, dthresh);
+
+ d20u8 = vand_u8(d20u8, d19u8);
+
+ d22u8 = vdup_n_u8(0x80);
+
+ d23u8 = vorr_u8(d21u8, d23u8);
+
+ q10u16 = vcombine_u16(vreinterpret_u16_u8(d20u8),
+ vreinterpret_u16_u8(d21u8));
+
+ d30u8 = vshrn_n_u16(q10u16, 4);
+ flat = vget_lane_u32(vreinterpret_u32_u8(d30u8), 0);
+
+ if (flat == 0xffffffff) { // Check for all 1's, power_branch_only
+ d27u8 = vdup_n_u8(3);
+ d21u8 = vdup_n_u8(2);
+ q14u16 = vaddl_u8(d6u8, d7u8);
+ q14u16 = vmlal_u8(q14u16, d3u8, d27u8);
+ q14u16 = vmlal_u8(q14u16, d4u8, d21u8);
+ q14u16 = vaddw_u8(q14u16, d5u8);
+ *d0ru8 = vqrshrn_n_u16(q14u16, 3);
+
+ q14u16 = vsubw_u8(q14u16, d3u8);
+ q14u16 = vsubw_u8(q14u16, d4u8);
+ q14u16 = vaddw_u8(q14u16, d5u8);
+ q14u16 = vaddw_u8(q14u16, d16u8);
+ *d1ru8 = vqrshrn_n_u16(q14u16, 3);
+
+ q14u16 = vsubw_u8(q14u16, d3u8);
+ q14u16 = vsubw_u8(q14u16, d5u8);
+ q14u16 = vaddw_u8(q14u16, d6u8);
+ q14u16 = vaddw_u8(q14u16, d17u8);
+ *d2ru8 = vqrshrn_n_u16(q14u16, 3);
+
+ q14u16 = vsubw_u8(q14u16, d3u8);
+ q14u16 = vsubw_u8(q14u16, d6u8);
+ q14u16 = vaddw_u8(q14u16, d7u8);
+ q14u16 = vaddw_u8(q14u16, d18u8);
+ *d3ru8 = vqrshrn_n_u16(q14u16, 3);
+
+ q14u16 = vsubw_u8(q14u16, d4u8);
+ q14u16 = vsubw_u8(q14u16, d7u8);
+ q14u16 = vaddw_u8(q14u16, d16u8);
+ q14u16 = vaddw_u8(q14u16, d18u8);
+ *d4ru8 = vqrshrn_n_u16(q14u16, 3);
+
+ q14u16 = vsubw_u8(q14u16, d5u8);
+ q14u16 = vsubw_u8(q14u16, d16u8);
+ q14u16 = vaddw_u8(q14u16, d17u8);
+ q14u16 = vaddw_u8(q14u16, d18u8);
+ *d5ru8 = vqrshrn_n_u16(q14u16, 3);
+ } else {
+ d21u8 = veor_u8(d7u8, d22u8);
+ d24u8 = veor_u8(d6u8, d22u8);
+ d25u8 = veor_u8(d5u8, d22u8);
+ d26u8 = veor_u8(d16u8, d22u8);
+
+ d27u8 = vdup_n_u8(3);
+
+ d28s8 = vsub_s8(vreinterpret_s8_u8(d21u8), vreinterpret_s8_u8(d24u8));
+ d29s8 = vqsub_s8(vreinterpret_s8_u8(d25u8), vreinterpret_s8_u8(d26u8));
+
+ q15s16 = vmull_s8(d28s8, vreinterpret_s8_u8(d27u8));
+
+ d29s8 = vand_s8(d29s8, vreinterpret_s8_u8(d23u8));
+
+ q15s16 = vaddw_s8(q15s16, d29s8);
+
+ d29u8 = vdup_n_u8(4);
+
+ d28s8 = vqmovn_s16(q15s16);
+
+ d28s8 = vand_s8(d28s8, vreinterpret_s8_u8(d19u8));
+
+ d30s8 = vqadd_s8(d28s8, vreinterpret_s8_u8(d27u8));
+ d29s8 = vqadd_s8(d28s8, vreinterpret_s8_u8(d29u8));
+ d30s8 = vshr_n_s8(d30s8, 3);
+ d29s8 = vshr_n_s8(d29s8, 3);
+
+ d24s8 = vqadd_s8(vreinterpret_s8_u8(d24u8), d30s8);
+ d21s8 = vqsub_s8(vreinterpret_s8_u8(d21u8), d29s8);
+
+ d29s8 = vrshr_n_s8(d29s8, 1);
+ d29s8 = vbic_s8(d29s8, vreinterpret_s8_u8(d23u8));
+
+ d25s8 = vqadd_s8(vreinterpret_s8_u8(d25u8), d29s8);
+ d26s8 = vqsub_s8(vreinterpret_s8_u8(d26u8), d29s8);
+
+ if (flat == 0) { // filter_branch_only
+ *d0ru8 = d4u8;
+ *d1ru8 = veor_u8(vreinterpret_u8_s8(d25s8), d22u8);
+ *d2ru8 = veor_u8(vreinterpret_u8_s8(d24s8), d22u8);
+ *d3ru8 = veor_u8(vreinterpret_u8_s8(d21s8), d22u8);
+ *d4ru8 = veor_u8(vreinterpret_u8_s8(d26s8), d22u8);
+ *d5ru8 = d17u8;
+ return;
+ }
+
+ d21u8 = veor_u8(vreinterpret_u8_s8(d21s8), d22u8);
+ d24u8 = veor_u8(vreinterpret_u8_s8(d24s8), d22u8);
+ d25u8 = veor_u8(vreinterpret_u8_s8(d25s8), d22u8);
+ d26u8 = veor_u8(vreinterpret_u8_s8(d26s8), d22u8);
+
+ d23u8 = vdup_n_u8(2);
+ q14u16 = vaddl_u8(d6u8, d7u8);
+ q14u16 = vmlal_u8(q14u16, d3u8, d27u8);
+ q14u16 = vmlal_u8(q14u16, d4u8, d23u8);
+
+ d0u8 = vbsl_u8(d20u8, dblimit, d4u8);
+
+ q14u16 = vaddw_u8(q14u16, d5u8);
+
+ d1u8 = vbsl_u8(d20u8, dlimit, d25u8);
+
+ d30u8 = vqrshrn_n_u16(q14u16, 3);
+
+ q14u16 = vsubw_u8(q14u16, d3u8);
+ q14u16 = vsubw_u8(q14u16, d4u8);
+ q14u16 = vaddw_u8(q14u16, d5u8);
+ q14u16 = vaddw_u8(q14u16, d16u8);
+
+ d2u8 = vbsl_u8(d20u8, dthresh, d24u8);
+
+ d31u8 = vqrshrn_n_u16(q14u16, 3);
+
+ q14u16 = vsubw_u8(q14u16, d3u8);
+ q14u16 = vsubw_u8(q14u16, d5u8);
+ q14u16 = vaddw_u8(q14u16, d6u8);
+ q14u16 = vaddw_u8(q14u16, d17u8);
+
+ *d0ru8 = vbsl_u8(d20u8, d30u8, d0u8);
+
+ d23u8 = vqrshrn_n_u16(q14u16, 3);
+
+ q14u16 = vsubw_u8(q14u16, d3u8);
+ q14u16 = vsubw_u8(q14u16, d6u8);
+ q14u16 = vaddw_u8(q14u16, d7u8);
+
+ *d1ru8 = vbsl_u8(d20u8, d31u8, d1u8);
+
+ q14u16 = vaddw_u8(q14u16, d18u8);
+
+ *d2ru8 = vbsl_u8(d20u8, d23u8, d2u8);
+
+ d22u8 = vqrshrn_n_u16(q14u16, 3);
+
+ q14u16 = vsubw_u8(q14u16, d4u8);
+ q14u16 = vsubw_u8(q14u16, d7u8);
+ q14u16 = vaddw_u8(q14u16, d16u8);
+
+ d3u8 = vbsl_u8(d20u8, d3u8, d21u8);
+
+ q14u16 = vaddw_u8(q14u16, d18u8);
+
+ d4u8 = vbsl_u8(d20u8, d4u8, d26u8);
+
+ d6u8 = vqrshrn_n_u16(q14u16, 3);
+
+ q14u16 = vsubw_u8(q14u16, d5u8);
+ q14u16 = vsubw_u8(q14u16, d16u8);
+ q14u16 = vaddw_u8(q14u16, d17u8);
+ q14u16 = vaddw_u8(q14u16, d18u8);
+
+ d5u8 = vbsl_u8(d20u8, d5u8, d17u8);
+
+ d7u8 = vqrshrn_n_u16(q14u16, 3);
+
+ *d3ru8 = vbsl_u8(d20u8, d22u8, d3u8);
+ *d4ru8 = vbsl_u8(d20u8, d6u8, d4u8);
+ *d5ru8 = vbsl_u8(d20u8, d7u8, d5u8);
+ }
+ return;
+}
+
+void vp9_lpf_horizontal_8_neon(
+ unsigned char *src,
+ int pitch,
+ unsigned char *blimit,
+ unsigned char *limit,
+ unsigned char *thresh,
+ int count) {
+ int i;
+ uint8_t *s, *psrc;
+ uint8x8_t dblimit, dlimit, dthresh;
+ uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
+ uint8x8_t d16u8, d17u8, d18u8;
+
+ if (count == 0) // end_vp9_mblf_h_edge
+ return;
+
+ dblimit = vld1_u8(blimit);
+ dlimit = vld1_u8(limit);
+ dthresh = vld1_u8(thresh);
+
+ psrc = src - (pitch << 2);
+ for (i = 0; i < count; i++) {
+ s = psrc + i * 8;
+
+ d3u8 = vld1_u8(s);
+ s += pitch;
+ d4u8 = vld1_u8(s);
+ s += pitch;
+ d5u8 = vld1_u8(s);
+ s += pitch;
+ d6u8 = vld1_u8(s);
+ s += pitch;
+ d7u8 = vld1_u8(s);
+ s += pitch;
+ d16u8 = vld1_u8(s);
+ s += pitch;
+ d17u8 = vld1_u8(s);
+ s += pitch;
+ d18u8 = vld1_u8(s);
+
+ vp9_mbloop_filter_neon(dblimit, dlimit, dthresh,
+ d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8,
+ &d0u8, &d1u8, &d2u8, &d3u8, &d4u8, &d5u8);
+
+ s -= (pitch * 6);
+ vst1_u8(s, d0u8);
+ s += pitch;
+ vst1_u8(s, d1u8);
+ s += pitch;
+ vst1_u8(s, d2u8);
+ s += pitch;
+ vst1_u8(s, d3u8);
+ s += pitch;
+ vst1_u8(s, d4u8);
+ s += pitch;
+ vst1_u8(s, d5u8);
+ }
+ return;
+}
+
+void vp9_lpf_vertical_8_neon(
+ unsigned char *src,
+ int pitch,
+ unsigned char *blimit,
+ unsigned char *limit,
+ unsigned char *thresh,
+ int count) {
+ int i;
+ uint8_t *s;
+ uint8x8_t dblimit, dlimit, dthresh;
+ uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
+ uint8x8_t d16u8, d17u8, d18u8;
+ uint32x2x2_t d2tmp0, d2tmp1, d2tmp2, d2tmp3;
+ uint16x4x2_t d2tmp4, d2tmp5, d2tmp6, d2tmp7;
+ uint8x8x2_t d2tmp8, d2tmp9, d2tmp10, d2tmp11;
+ uint8x8x4_t d4Result;
+ uint8x8x2_t d2Result;
+
+ if (count == 0)
+ return;
+
+ dblimit = vld1_u8(blimit);
+ dlimit = vld1_u8(limit);
+ dthresh = vld1_u8(thresh);
+
+ for (i = 0; i < count; i++) {
+ s = src + (i * (pitch << 3)) - 4;
+
+ d3u8 = vld1_u8(s);
+ s += pitch;
+ d4u8 = vld1_u8(s);
+ s += pitch;
+ d5u8 = vld1_u8(s);
+ s += pitch;
+ d6u8 = vld1_u8(s);
+ s += pitch;
+ d7u8 = vld1_u8(s);
+ s += pitch;
+ d16u8 = vld1_u8(s);
+ s += pitch;
+ d17u8 = vld1_u8(s);
+ s += pitch;
+ d18u8 = vld1_u8(s);
+
+ d2tmp0 = vtrn_u32(vreinterpret_u32_u8(d3u8),
+ vreinterpret_u32_u8(d7u8));
+ d2tmp1 = vtrn_u32(vreinterpret_u32_u8(d4u8),
+ vreinterpret_u32_u8(d16u8));
+ d2tmp2 = vtrn_u32(vreinterpret_u32_u8(d5u8),
+ vreinterpret_u32_u8(d17u8));
+ d2tmp3 = vtrn_u32(vreinterpret_u32_u8(d6u8),
+ vreinterpret_u32_u8(d18u8));
+
+ d2tmp4 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[0]),
+ vreinterpret_u16_u32(d2tmp2.val[0]));
+ d2tmp5 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[0]),
+ vreinterpret_u16_u32(d2tmp3.val[0]));
+ d2tmp6 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[1]),
+ vreinterpret_u16_u32(d2tmp2.val[1]));
+ d2tmp7 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[1]),
+ vreinterpret_u16_u32(d2tmp3.val[1]));
+
+ d2tmp8 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[0]),
+ vreinterpret_u8_u16(d2tmp5.val[0]));
+ d2tmp9 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[1]),
+ vreinterpret_u8_u16(d2tmp5.val[1]));
+ d2tmp10 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[0]),
+ vreinterpret_u8_u16(d2tmp7.val[0]));
+ d2tmp11 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[1]),
+ vreinterpret_u8_u16(d2tmp7.val[1]));
+
+ d3u8 = d2tmp8.val[0];
+ d4u8 = d2tmp8.val[1];
+ d5u8 = d2tmp9.val[0];
+ d6u8 = d2tmp9.val[1];
+ d7u8 = d2tmp10.val[0];
+ d16u8 = d2tmp10.val[1];
+ d17u8 = d2tmp11.val[0];
+ d18u8 = d2tmp11.val[1];
+
+ vp9_mbloop_filter_neon(dblimit, dlimit, dthresh,
+ d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8,
+ &d0u8, &d1u8, &d2u8, &d3u8, &d4u8, &d5u8);
+
+ d4Result.val[0] = d0u8;
+ d4Result.val[1] = d1u8;
+ d4Result.val[2] = d2u8;
+ d4Result.val[3] = d3u8;
+
+ d2Result.val[0] = d4u8;
+ d2Result.val[1] = d5u8;
+
+ s = src - 3;
+ vst4_lane_u8(s, d4Result, 0);
+ s += pitch;
+ vst4_lane_u8(s, d4Result, 1);
+ s += pitch;
+ vst4_lane_u8(s, d4Result, 2);
+ s += pitch;
+ vst4_lane_u8(s, d4Result, 3);
+ s += pitch;
+ vst4_lane_u8(s, d4Result, 4);
+ s += pitch;
+ vst4_lane_u8(s, d4Result, 5);
+ s += pitch;
+ vst4_lane_u8(s, d4Result, 6);
+ s += pitch;
+ vst4_lane_u8(s, d4Result, 7);
+
+ s = src + 1;
+ vst2_lane_u8(s, d2Result, 0);
+ s += pitch;
+ vst2_lane_u8(s, d2Result, 1);
+ s += pitch;
+ vst2_lane_u8(s, d2Result, 2);
+ s += pitch;
+ vst2_lane_u8(s, d2Result, 3);
+ s += pitch;
+ vst2_lane_u8(s, d2Result, 4);
+ s += pitch;
+ vst2_lane_u8(s, d2Result, 5);
+ s += pitch;
+ vst2_lane_u8(s, d2Result, 6);
+ s += pitch;
+ vst2_lane_u8(s, d2Result, 7);
+ }
+ return;
+}
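
The vertical variant reuses the same horizontal filter kernel by transposing the 8x8 block in registers: vtrn_u32, then vtrn_u16, then vtrn_u8 swap progressively finer lanes. A scalar sketch of the equivalent operation (transpose8x8 is a hypothetical helper, not a libvpx function):

#include <stdint.h>

static void transpose8x8(const uint8_t in[8][8], uint8_t out[8][8]) {
  int r, c;
  for (r = 0; r < 8; r++)
    for (c = 0; c < 8; c++)
      out[c][r] = in[r][c];  /* output column r is input row r */
}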
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_loopfilter_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_loopfilter_8_neon_asm.asm
index 4430322171d..91aaec04eaf 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_loopfilter_neon.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_loopfilter_8_neon_asm.asm
@@ -8,8 +8,6 @@
; be found in the AUTHORS file in the root of the source tree.
;
- EXPORT |vp9_lpf_horizontal_4_neon|
- EXPORT |vp9_lpf_vertical_4_neon|
EXPORT |vp9_lpf_horizontal_8_neon|
EXPORT |vp9_lpf_vertical_8_neon|
ARM
@@ -21,261 +19,6 @@
; TODO(fgalligan): See about removing the count code as this function is only
; called with a count of 1.
;
-; void vp9_lpf_horizontal_4_neon(uint8_t *s,
-; int p /* pitch */,
-; const uint8_t *blimit,
-; const uint8_t *limit,
-; const uint8_t *thresh,
-; int count)
-;
-; r0 uint8_t *s,
-; r1 int p, /* pitch */
-; r2 const uint8_t *blimit,
-; r3 const uint8_t *limit,
-; sp const uint8_t *thresh,
-; sp+4 int count
-|vp9_lpf_horizontal_4_neon| PROC
- push {lr}
-
- vld1.8 {d0[]}, [r2] ; duplicate *blimit
- ldr r12, [sp, #8] ; load count
- ldr r2, [sp, #4] ; load thresh
- add r1, r1, r1 ; double pitch
-
- cmp r12, #0
- beq end_vp9_lf_h_edge
-
- vld1.8 {d1[]}, [r3] ; duplicate *limit
- vld1.8 {d2[]}, [r2] ; duplicate *thresh
-
-count_lf_h_loop
- sub r2, r0, r1, lsl #1 ; move src pointer down by 4 lines
- add r3, r2, r1, lsr #1 ; set to 3 lines down
-
- vld1.u8 {d3}, [r2@64], r1 ; p3
- vld1.u8 {d4}, [r3@64], r1 ; p2
- vld1.u8 {d5}, [r2@64], r1 ; p1
- vld1.u8 {d6}, [r3@64], r1 ; p0
- vld1.u8 {d7}, [r2@64], r1 ; q0
- vld1.u8 {d16}, [r3@64], r1 ; q1
- vld1.u8 {d17}, [r2@64] ; q2
- vld1.u8 {d18}, [r3@64] ; q3
-
- sub r2, r2, r1, lsl #1
- sub r3, r3, r1, lsl #1
-
- bl vp9_loop_filter_neon
-
- vst1.u8 {d4}, [r2@64], r1 ; store op1
- vst1.u8 {d5}, [r3@64], r1 ; store op0
- vst1.u8 {d6}, [r2@64], r1 ; store oq0
- vst1.u8 {d7}, [r3@64], r1 ; store oq1
-
- add r0, r0, #8
- subs r12, r12, #1
- bne count_lf_h_loop
-
-end_vp9_lf_h_edge
- pop {pc}
- ENDP ; |vp9_lpf_horizontal_4_neon|
-
-; Currently vp9 only works on 8 iterations at a time. The vp8 loop filter
-; works on 16 iterations at a time.
-; TODO(fgalligan): See about removing the count code as this function is only
-; called with a count of 1.
-;
-; void vp9_lpf_vertical_4_neon(uint8_t *s,
-; int p /* pitch */,
-; const uint8_t *blimit,
-; const uint8_t *limit,
-; const uint8_t *thresh,
-; int count)
-;
-; r0 uint8_t *s,
-; r1 int p, /* pitch */
-; r2 const uint8_t *blimit,
-; r3 const uint8_t *limit,
-; sp const uint8_t *thresh,
-; sp+4 int count
-|vp9_lpf_vertical_4_neon| PROC
- push {lr}
-
- vld1.8 {d0[]}, [r2] ; duplicate *blimit
- ldr r12, [sp, #8] ; load count
- vld1.8 {d1[]}, [r3] ; duplicate *limit
-
- ldr r3, [sp, #4] ; load thresh
- sub r2, r0, #4 ; move s pointer down by 4 columns
- cmp r12, #0
- beq end_vp9_lf_v_edge
-
- vld1.8 {d2[]}, [r3] ; duplicate *thresh
-
-count_lf_v_loop
- vld1.u8 {d3}, [r2], r1 ; load s data
- vld1.u8 {d4}, [r2], r1
- vld1.u8 {d5}, [r2], r1
- vld1.u8 {d6}, [r2], r1
- vld1.u8 {d7}, [r2], r1
- vld1.u8 {d16}, [r2], r1
- vld1.u8 {d17}, [r2], r1
- vld1.u8 {d18}, [r2]
-
- ;transpose the 8x8 block
- vtrn.32 d3, d7
- vtrn.32 d4, d16
- vtrn.32 d5, d17
- vtrn.32 d6, d18
-
- vtrn.16 d3, d5
- vtrn.16 d4, d6
- vtrn.16 d7, d17
- vtrn.16 d16, d18
-
- vtrn.8 d3, d4
- vtrn.8 d5, d6
- vtrn.8 d7, d16
- vtrn.8 d17, d18
-
- bl vp9_loop_filter_neon
-
- sub r0, r0, #2
-
- ;store op1, op0, oq0, oq1
- vst4.8 {d4[0], d5[0], d6[0], d7[0]}, [r0], r1
- vst4.8 {d4[1], d5[1], d6[1], d7[1]}, [r0], r1
- vst4.8 {d4[2], d5[2], d6[2], d7[2]}, [r0], r1
- vst4.8 {d4[3], d5[3], d6[3], d7[3]}, [r0], r1
- vst4.8 {d4[4], d5[4], d6[4], d7[4]}, [r0], r1
- vst4.8 {d4[5], d5[5], d6[5], d7[5]}, [r0], r1
- vst4.8 {d4[6], d5[6], d6[6], d7[6]}, [r0], r1
- vst4.8 {d4[7], d5[7], d6[7], d7[7]}, [r0]
-
- add r0, r0, r1, lsl #3 ; s += pitch * 8
- subs r12, r12, #1
- subne r2, r0, #4 ; move s pointer down by 4 columns
- bne count_lf_v_loop
-
-end_vp9_lf_v_edge
- pop {pc}
- ENDP ; |vp9_lpf_vertical_4_neon|
-
-; void vp9_loop_filter_neon();
-; This is a helper function for the loopfilters. The individual functions do the
-; necessary load, transpose (if necessary) and store. The function does not use
-; registers d8-d15.
-;
-; Inputs:
-; r0-r3, r12 PRESERVE
-; d0 blimit
-; d1 limit
-; d2 thresh
-; d3 p3
-; d4 p2
-; d5 p1
-; d6 p0
-; d7 q0
-; d16 q1
-; d17 q2
-; d18 q3
-;
-; Outputs:
-; d4 op1
-; d5 op0
-; d6 oq0
-; d7 oq1
-|vp9_loop_filter_neon| PROC
- ; filter_mask
- vabd.u8 d19, d3, d4 ; m1 = abs(p3 - p2)
- vabd.u8 d20, d4, d5 ; m2 = abs(p2 - p1)
- vabd.u8 d21, d5, d6 ; m3 = abs(p1 - p0)
- vabd.u8 d22, d16, d7 ; m4 = abs(q1 - q0)
- vabd.u8 d3, d17, d16 ; m5 = abs(q2 - q1)
- vabd.u8 d4, d18, d17 ; m6 = abs(q3 - q2)
-
- ; only compare the largest value to limit
- vmax.u8 d19, d19, d20 ; m1 = max(m1, m2)
- vmax.u8 d20, d21, d22 ; m2 = max(m3, m4)
-
- vabd.u8 d17, d6, d7 ; abs(p0 - q0)
-
- vmax.u8 d3, d3, d4 ; m3 = max(m5, m6)
-
- vmov.u8 d18, #0x80
-
- vmax.u8 d23, d19, d20 ; m1 = max(m1, m2)
-
- ; hevmask
- vcgt.u8 d21, d21, d2 ; (abs(p1 - p0) > thresh)*-1
- vcgt.u8 d22, d22, d2 ; (abs(q1 - q0) > thresh)*-1
- vmax.u8 d23, d23, d3 ; m1 = max(m1, m3)
-
- vabd.u8 d28, d5, d16 ; a = abs(p1 - q1)
- vqadd.u8 d17, d17, d17 ; b = abs(p0 - q0) * 2
-
- veor d7, d7, d18 ; qs0
-
- vcge.u8 d23, d1, d23 ; mask: limit >= m1
-
- ; filter() function
- ; convert to signed
-
- vshr.u8 d28, d28, #1 ; a = a / 2
- veor d6, d6, d18 ; ps0
-
- veor d5, d5, d18 ; ps1
- vqadd.u8 d17, d17, d28 ; a = b + a
-
- veor d16, d16, d18 ; qs1
-
- vmov.u8 d19, #3
-
- vsub.s8 d28, d7, d6 ; ( qs0 - ps0)
-
- vcge.u8 d17, d0, d17 ; mask: blimit >= a
-
- vqsub.s8 d27, d5, d16 ; filter = clamp(ps1-qs1)
- vorr d22, d21, d22 ; hevmask
-
- vmull.s8 q12, d28, d19 ; 3 * ( qs0 - ps0)
-
- vand d27, d27, d22 ; filter &= hev
- vand d23, d23, d17 ; filter_mask
-
- vaddw.s8 q12, q12, d27 ; filter + 3 * (qs0 - ps0)
-
- vmov.u8 d17, #4
-
- ; filter = clamp(filter + 3 * ( qs0 - ps0))
- vqmovn.s16 d27, q12
-
- vand d27, d27, d23 ; filter &= mask
-
- vqadd.s8 d28, d27, d19 ; filter2 = clamp(filter+3)
- vqadd.s8 d27, d27, d17 ; filter1 = clamp(filter+4)
- vshr.s8 d28, d28, #3 ; filter2 >>= 3
- vshr.s8 d27, d27, #3 ; filter1 >>= 3
-
- vqadd.s8 d19, d6, d28 ; u = clamp(ps0 + filter2)
- vqsub.s8 d26, d7, d27 ; u = clamp(qs0 - filter1)
-
- ; outer tap adjustments
- vrshr.s8 d27, d27, #1 ; filter = (filter1 + 1) >> 1
-
- veor d6, d26, d18 ; *oq0 = u^0x80
-
- vbic d27, d27, d22 ; filter &= ~hev
-
- vqadd.s8 d21, d5, d27 ; u = clamp(ps1 + filter)
- vqsub.s8 d20, d16, d27 ; u = clamp(qs1 - filter)
-
- veor d5, d19, d18 ; *op0 = u^0x80
- veor d4, d21, d18 ; *op1 = u^0x80
- veor d7, d20, d18 ; *oq1 = u^0x80
-
- bx lr
- ENDP ; |vp9_loop_filter_neon|
-
; void vp9_lpf_horizontal_8_neon(uint8_t *s, int p,
; const uint8_t *blimit,
; const uint8_t *limit,
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_loopfilter_neon.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_loopfilter_neon.c
new file mode 100644
index 00000000000..31fcc63ba06
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_loopfilter_neon.c
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vp9_rtcd.h"
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+
+void vp9_lpf_vertical_4_dual_neon(uint8_t *s, int p,
+ const uint8_t *blimit0,
+ const uint8_t *limit0,
+ const uint8_t *thresh0,
+ const uint8_t *blimit1,
+ const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ vp9_lpf_vertical_4_neon(s, p, blimit0, limit0, thresh0, 1);
+ vp9_lpf_vertical_4_neon(s + 8 * p, p, blimit1, limit1, thresh1, 1);
+}
+
+#if HAVE_NEON_ASM
+void vp9_lpf_horizontal_8_dual_neon(uint8_t *s, int p /* pitch */,
+ const uint8_t *blimit0,
+ const uint8_t *limit0,
+ const uint8_t *thresh0,
+ const uint8_t *blimit1,
+ const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ vp9_lpf_horizontal_8_neon(s, p, blimit0, limit0, thresh0, 1);
+ vp9_lpf_horizontal_8_neon(s + 8, p, blimit1, limit1, thresh1, 1);
+}
+
+void vp9_lpf_vertical_8_dual_neon(uint8_t *s, int p,
+ const uint8_t *blimit0,
+ const uint8_t *limit0,
+ const uint8_t *thresh0,
+ const uint8_t *blimit1,
+ const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ vp9_lpf_vertical_8_neon(s, p, blimit0, limit0, thresh0, 1);
+ vp9_lpf_vertical_8_neon(s + 8 * p, p, blimit1, limit1, thresh1, 1);
+}
+
+void vp9_lpf_vertical_16_dual_neon(uint8_t *s, int p,
+ const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh) {
+ vp9_lpf_vertical_16_neon(s, p, blimit, limit, thresh);
+ vp9_lpf_vertical_16_neon(s + 8 * p, p, blimit, limit, thresh);
+}
+#endif // HAVE_NEON_ASM
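
The dual wrappers cover a 16-sample edge by running the single-edge kernel twice: vertical duals advance 8 rows (s + 8 * p), horizontal duals advance 8 columns (s + 8). A hedged usage sketch, assuming the file is linked into a libvpx build; the buffer size, pitch, and threshold values are illustrative only (real thresholds come from the loop-filter threshold tables):

#include <stdint.h>
#include <string.h>

void vp9_lpf_vertical_4_dual_neon(uint8_t *s, int p,
                                  const uint8_t *blimit0,
                                  const uint8_t *limit0,
                                  const uint8_t *thresh0,
                                  const uint8_t *blimit1,
                                  const uint8_t *limit1,
                                  const uint8_t *thresh1);

int main(void) {
  static uint8_t frame[24 * 32];                      /* 24 rows, pitch 32 */
  static const uint8_t blimit[8] = { 16, 16, 16, 16, 16, 16, 16, 16 };
  static const uint8_t limit[8]  = {  8,  8,  8,  8,  8,  8,  8,  8 };
  static const uint8_t thresh[8] = {  4,  4,  4,  4,  4,  4,  4,  4 };
  memset(frame, 128, sizeof(frame));
  /* filter the vertical edge at column 8 across rows 0..15 */
  vp9_lpf_vertical_4_dual_neon(frame + 8, 32, blimit, limit, thresh,
                               blimit, limit, thresh);
  return 0;
}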
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_reconintra_neon.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_reconintra_neon.c
new file mode 100644
index 00000000000..d0beaa7208f
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_reconintra_neon.c
@@ -0,0 +1,473 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stddef.h>
+#include <arm_neon.h>
+
+void vp9_v_predictor_4x4_neon(
+ uint8_t *dst,
+ ptrdiff_t y_stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ int i;
+ uint32x2_t d0u32 = vdup_n_u32(0);
+ (void)left;
+
+ d0u32 = vld1_lane_u32((const uint32_t *)above, d0u32, 0);
+ for (i = 0; i < 4; i++, dst += y_stride)
+ vst1_lane_u32((uint32_t *)dst, d0u32, 0);
+ return;
+}
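
The V predictors simply broadcast the row above the block into every output row. A scalar reference form (the generic block-size parameter is an editorial convenience; libvpx keys these functions by fixed sizes):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

static void v_predictor_c(uint8_t *dst, ptrdiff_t stride, int bs,
                          const uint8_t *above) {
  int r;
  for (r = 0; r < bs; r++, dst += stride)
    memcpy(dst, above, bs);  /* every row repeats the row above the block */
}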
+
+void vp9_v_predictor_8x8_neon(
+ uint8_t *dst,
+ ptrdiff_t y_stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ int i;
+ uint8x8_t d0u8 = vdup_n_u8(0);
+ (void)left;
+
+ d0u8 = vld1_u8(above);
+ for (i = 0; i < 8; i++, dst += y_stride)
+ vst1_u8(dst, d0u8);
+ return;
+}
+
+void vp9_v_predictor_16x16_neon(
+ uint8_t *dst,
+ ptrdiff_t y_stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ int i;
+ uint8x16_t q0u8 = vdupq_n_u8(0);
+ (void)left;
+
+ q0u8 = vld1q_u8(above);
+ for (i = 0; i < 16; i++, dst += y_stride)
+ vst1q_u8(dst, q0u8);
+ return;
+}
+
+void vp9_v_predictor_32x32_neon(
+ uint8_t *dst,
+ ptrdiff_t y_stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ int i;
+ uint8x16_t q0u8 = vdupq_n_u8(0);
+ uint8x16_t q1u8 = vdupq_n_u8(0);
+ (void)left;
+
+ q0u8 = vld1q_u8(above);
+ q1u8 = vld1q_u8(above + 16);
+ for (i = 0; i < 32; i++, dst += y_stride) {
+ vst1q_u8(dst, q0u8);
+ vst1q_u8(dst + 16, q1u8);
+ }
+ return;
+}
+
+void vp9_h_predictor_4x4_neon(
+ uint8_t *dst,
+ ptrdiff_t y_stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ uint8x8_t d0u8 = vdup_n_u8(0);
+ uint32x2_t d1u32 = vdup_n_u32(0);
+ (void)above;
+
+ d1u32 = vld1_lane_u32((const uint32_t *)left, d1u32, 0);
+
+ d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 0);
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
+ dst += y_stride;
+ d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 1);
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
+ dst += y_stride;
+ d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 2);
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
+ dst += y_stride;
+ d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 3);
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
+ return;
+}
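
Likewise, the H predictors replicate each left-column pixel across its row; the vdup_lane_u8 calls above are per-row broadcasts. A scalar sketch under the same caveat:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

static void h_predictor_c(uint8_t *dst, ptrdiff_t stride, int bs,
                          const uint8_t *left) {
  int r;
  for (r = 0; r < bs; r++, dst += stride)
    memset(dst, left[r], bs);  /* each row replicates its left neighbor */
}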
+
+void vp9_h_predictor_8x8_neon(
+ uint8_t *dst,
+ ptrdiff_t y_stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ uint8x8_t d0u8 = vdup_n_u8(0);
+ uint64x1_t d1u64 = vdup_n_u64(0);
+ (void)above;
+
+ d1u64 = vld1_u64((const uint64_t *)left);
+
+ d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 0);
+ vst1_u8(dst, d0u8);
+ dst += y_stride;
+ d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 1);
+ vst1_u8(dst, d0u8);
+ dst += y_stride;
+ d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 2);
+ vst1_u8(dst, d0u8);
+ dst += y_stride;
+ d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 3);
+ vst1_u8(dst, d0u8);
+ dst += y_stride;
+ d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 4);
+ vst1_u8(dst, d0u8);
+ dst += y_stride;
+ d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 5);
+ vst1_u8(dst, d0u8);
+ dst += y_stride;
+ d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 6);
+ vst1_u8(dst, d0u8);
+ dst += y_stride;
+ d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 7);
+ vst1_u8(dst, d0u8);
+ return;
+}
+
+void vp9_h_predictor_16x16_neon(
+ uint8_t *dst,
+ ptrdiff_t y_stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ int j;
+ uint8x8_t d2u8 = vdup_n_u8(0);
+ uint8x16_t q0u8 = vdupq_n_u8(0);
+ uint8x16_t q1u8 = vdupq_n_u8(0);
+ (void)above;
+
+ q1u8 = vld1q_u8(left);
+ d2u8 = vget_low_u8(q1u8);
+ for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) {
+ q0u8 = vdupq_lane_u8(d2u8, 0);
+ vst1q_u8(dst, q0u8);
+ dst += y_stride;
+ q0u8 = vdupq_lane_u8(d2u8, 1);
+ vst1q_u8(dst, q0u8);
+ dst += y_stride;
+ q0u8 = vdupq_lane_u8(d2u8, 2);
+ vst1q_u8(dst, q0u8);
+ dst += y_stride;
+ q0u8 = vdupq_lane_u8(d2u8, 3);
+ vst1q_u8(dst, q0u8);
+ dst += y_stride;
+ q0u8 = vdupq_lane_u8(d2u8, 4);
+ vst1q_u8(dst, q0u8);
+ dst += y_stride;
+ q0u8 = vdupq_lane_u8(d2u8, 5);
+ vst1q_u8(dst, q0u8);
+ dst += y_stride;
+ q0u8 = vdupq_lane_u8(d2u8, 6);
+ vst1q_u8(dst, q0u8);
+ dst += y_stride;
+ q0u8 = vdupq_lane_u8(d2u8, 7);
+ vst1q_u8(dst, q0u8);
+ dst += y_stride;
+ }
+ return;
+}
+
+void vp9_h_predictor_32x32_neon(
+ uint8_t *dst,
+ ptrdiff_t y_stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ int j, k;
+ uint8x8_t d2u8 = vdup_n_u8(0);
+ uint8x16_t q0u8 = vdupq_n_u8(0);
+ uint8x16_t q1u8 = vdupq_n_u8(0);
+ (void)above;
+
+ for (k = 0; k < 2; k++, left += 16) {
+ q1u8 = vld1q_u8(left);
+ d2u8 = vget_low_u8(q1u8);
+ for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) {
+ q0u8 = vdupq_lane_u8(d2u8, 0);
+ vst1q_u8(dst, q0u8);
+ vst1q_u8(dst + 16, q0u8);
+ dst += y_stride;
+ q0u8 = vdupq_lane_u8(d2u8, 1);
+ vst1q_u8(dst, q0u8);
+ vst1q_u8(dst + 16, q0u8);
+ dst += y_stride;
+ q0u8 = vdupq_lane_u8(d2u8, 2);
+ vst1q_u8(dst, q0u8);
+ vst1q_u8(dst + 16, q0u8);
+ dst += y_stride;
+ q0u8 = vdupq_lane_u8(d2u8, 3);
+ vst1q_u8(dst, q0u8);
+ vst1q_u8(dst + 16, q0u8);
+ dst += y_stride;
+ q0u8 = vdupq_lane_u8(d2u8, 4);
+ vst1q_u8(dst, q0u8);
+ vst1q_u8(dst + 16, q0u8);
+ dst += y_stride;
+ q0u8 = vdupq_lane_u8(d2u8, 5);
+ vst1q_u8(dst, q0u8);
+ vst1q_u8(dst + 16, q0u8);
+ dst += y_stride;
+ q0u8 = vdupq_lane_u8(d2u8, 6);
+ vst1q_u8(dst, q0u8);
+ vst1q_u8(dst + 16, q0u8);
+ dst += y_stride;
+ q0u8 = vdupq_lane_u8(d2u8, 7);
+ vst1q_u8(dst, q0u8);
+ vst1q_u8(dst + 16, q0u8);
+ dst += y_stride;
+ }
+ }
+ return;
+}
+
+void vp9_tm_predictor_4x4_neon(
+ uint8_t *dst,
+ ptrdiff_t y_stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ int i;
+ uint16x8_t q1u16, q3u16;
+ int16x8_t q1s16;
+ uint8x8_t d0u8 = vdup_n_u8(0);
+ uint32x2_t d2u32 = vdup_n_u32(0);
+
+ d0u8 = vdup_n_u8(above[-1]);
+ d2u32 = vld1_lane_u32((const uint32_t *)above, d2u32, 0);
+ q3u16 = vsubl_u8(vreinterpret_u8_u32(d2u32), d0u8);
+ for (i = 0; i < 4; i++, dst += y_stride) {
+ q1u16 = vdupq_n_u16((uint16_t)left[i]);
+ q1s16 = vaddq_s16(vreinterpretq_s16_u16(q1u16),
+ vreinterpretq_s16_u16(q3u16));
+ d0u8 = vqmovun_s16(q1s16);
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
+ }
+ return;
+}
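
The TM (TrueMotion) predictors compute pred[r][c] = clip(left[r] + above[c] - above[-1]); the vsubl_u8 above forms (above[c] - top_left) once per block, and vqmovun_s16 supplies the clamp. A scalar sketch (clip_u8 is a hypothetical helper):

#include <stddef.h>
#include <stdint.h>

static uint8_t clip_u8(int v) {
  return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
}

static void tm_predictor_c(uint8_t *dst, ptrdiff_t stride, int bs,
                           const uint8_t *above, const uint8_t *left) {
  int r, c;
  const int top_left = above[-1];  /* pixel above-left of the block */
  for (r = 0; r < bs; r++, dst += stride)
    for (c = 0; c < bs; c++)
      dst[c] = clip_u8(left[r] + above[c] - top_left);
}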
+
+void vp9_tm_predictor_8x8_neon(
+ uint8_t *dst,
+ ptrdiff_t y_stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ int j;
+ uint16x8_t q0u16, q3u16, q10u16;
+ int16x8_t q0s16;
+ uint16x4_t d20u16;
+ uint8x8_t d0u8, d2u8, d30u8;
+
+ d0u8 = vdup_n_u8(above[-1]);
+ d30u8 = vld1_u8(left);
+ d2u8 = vld1_u8(above);
+ q10u16 = vmovl_u8(d30u8);
+ q3u16 = vsubl_u8(d2u8, d0u8);
+ d20u16 = vget_low_u16(q10u16);
+ for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) {
+ q0u16 = vdupq_lane_u16(d20u16, 0);
+ q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16),
+ vreinterpretq_s16_u16(q0u16));
+ d0u8 = vqmovun_s16(q0s16);
+ vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
+ dst += y_stride;
+ q0u16 = vdupq_lane_u16(d20u16, 1);
+ q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16),
+ vreinterpretq_s16_u16(q0u16));
+ d0u8 = vqmovun_s16(q0s16);
+ vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
+ dst += y_stride;
+ q0u16 = vdupq_lane_u16(d20u16, 2);
+ q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16),
+ vreinterpretq_s16_u16(q0u16));
+ d0u8 = vqmovun_s16(q0s16);
+ vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
+ dst += y_stride;
+ q0u16 = vdupq_lane_u16(d20u16, 3);
+ q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16),
+ vreinterpretq_s16_u16(q0u16));
+ d0u8 = vqmovun_s16(q0s16);
+ vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
+ dst += y_stride;
+ }
+ return;
+}
+
+void vp9_tm_predictor_16x16_neon(
+ uint8_t *dst,
+ ptrdiff_t y_stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ int j, k;
+ uint16x8_t q0u16, q2u16, q3u16, q8u16, q10u16;
+ uint8x16_t q0u8, q1u8;
+ int16x8_t q0s16, q1s16, q8s16, q11s16;
+ uint16x4_t d20u16;
+ uint8x8_t d2u8, d3u8, d18u8, d22u8, d23u8;
+
+ q0u8 = vdupq_n_u8(above[-1]);
+ q1u8 = vld1q_u8(above);
+ q2u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q0u8));
+ q3u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q0u8));
+ for (k = 0; k < 2; k++, left += 8) {
+ d18u8 = vld1_u8(left);
+ q10u16 = vmovl_u8(d18u8);
+ d20u16 = vget_low_u16(q10u16);
+ for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) {
+ q0u16 = vdupq_lane_u16(d20u16, 0);
+ q8u16 = vdupq_lane_u16(d20u16, 1);
+ q1s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q2u16));
+ q0s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q3u16));
+ q11s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16),
+ vreinterpretq_s16_u16(q2u16));
+ q8s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16),
+ vreinterpretq_s16_u16(q3u16));
+ d2u8 = vqmovun_s16(q1s16);
+ d3u8 = vqmovun_s16(q0s16);
+ d22u8 = vqmovun_s16(q11s16);
+ d23u8 = vqmovun_s16(q8s16);
+ vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d2u8));
+ vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d3u8));
+ dst += y_stride;
+ vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d22u8));
+ vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d23u8));
+ dst += y_stride;
+
+ q0u16 = vdupq_lane_u16(d20u16, 2);
+ q8u16 = vdupq_lane_u16(d20u16, 3);
+ q1s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q2u16));
+ q0s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q3u16));
+ q11s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16),
+ vreinterpretq_s16_u16(q2u16));
+ q8s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16),
+ vreinterpretq_s16_u16(q3u16));
+ d2u8 = vqmovun_s16(q1s16);
+ d3u8 = vqmovun_s16(q0s16);
+ d22u8 = vqmovun_s16(q11s16);
+ d23u8 = vqmovun_s16(q8s16);
+ vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d2u8));
+ vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d3u8));
+ dst += y_stride;
+ vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d22u8));
+ vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d23u8));
+ dst += y_stride;
+ }
+ }
+ return;
+}
+
+void vp9_tm_predictor_32x32_neon(
+ uint8_t *dst,
+ ptrdiff_t y_stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ int j, k;
+ uint16x8_t q0u16, q3u16, q8u16, q9u16, q10u16, q11u16;
+ uint8x16_t q0u8, q1u8, q2u8;
+ int16x8_t q12s16, q13s16, q14s16, q15s16;
+ uint16x4_t d6u16;
+ uint8x8_t d0u8, d1u8, d2u8, d3u8, d26u8;
+
+ q0u8 = vdupq_n_u8(above[-1]);
+ q1u8 = vld1q_u8(above);
+ q2u8 = vld1q_u8(above + 16);
+ q8u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q0u8));
+ q9u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q0u8));
+ q10u16 = vsubl_u8(vget_low_u8(q2u8), vget_low_u8(q0u8));
+ q11u16 = vsubl_u8(vget_high_u8(q2u8), vget_high_u8(q0u8));
+ for (k = 0; k < 4; k++, left += 8) {
+ d26u8 = vld1_u8(left);
+ q3u16 = vmovl_u8(d26u8);
+ d6u16 = vget_low_u16(q3u16);
+ for (j = 0; j < 2; j++, d6u16 = vget_high_u16(q3u16)) {
+ q0u16 = vdupq_lane_u16(d6u16, 0);
+ q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q8u16));
+ q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q9u16));
+ q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q10u16));
+ q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q11u16));
+ d0u8 = vqmovun_s16(q12s16);
+ d1u8 = vqmovun_s16(q13s16);
+ d2u8 = vqmovun_s16(q14s16);
+ d3u8 = vqmovun_s16(q15s16);
+ q0u8 = vcombine_u8(d0u8, d1u8);
+ q1u8 = vcombine_u8(d2u8, d3u8);
+ vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
+ vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
+ dst += y_stride;
+
+ q0u16 = vdupq_lane_u16(d6u16, 1);
+ q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q8u16));
+ q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q9u16));
+ q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q10u16));
+ q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q11u16));
+ d0u8 = vqmovun_s16(q12s16);
+ d1u8 = vqmovun_s16(q13s16);
+ d2u8 = vqmovun_s16(q14s16);
+ d3u8 = vqmovun_s16(q15s16);
+ q0u8 = vcombine_u8(d0u8, d1u8);
+ q1u8 = vcombine_u8(d2u8, d3u8);
+ vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
+ vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
+ dst += y_stride;
+
+ q0u16 = vdupq_lane_u16(d6u16, 2);
+ q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q8u16));
+ q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q9u16));
+ q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q10u16));
+ q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q11u16));
+ d0u8 = vqmovun_s16(q12s16);
+ d1u8 = vqmovun_s16(q13s16);
+ d2u8 = vqmovun_s16(q14s16);
+ d3u8 = vqmovun_s16(q15s16);
+ q0u8 = vcombine_u8(d0u8, d1u8);
+ q1u8 = vcombine_u8(d2u8, d3u8);
+ vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
+ vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
+ dst += y_stride;
+
+ q0u16 = vdupq_lane_u16(d6u16, 3);
+ q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q8u16));
+ q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q9u16));
+ q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q10u16));
+ q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q11u16));
+ d0u8 = vqmovun_s16(q12s16);
+ d1u8 = vqmovun_s16(q13s16);
+ d2u8 = vqmovun_s16(q14s16);
+ d3u8 = vqmovun_s16(q15s16);
+ q0u8 = vcombine_u8(d0u8, d1u8);
+ q1u8 = vcombine_u8(d2u8, d3u8);
+ vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
+ vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
+ dst += y_stride;
+ }
+ }
+ return;
+}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_reconintra_neon.asm b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_reconintra_neon_asm.asm
index dc9856fa887..dc9856fa887 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_reconintra_neon.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_reconintra_neon_asm.asm
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_convolve8_avg_dspr2.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_convolve8_avg_dspr2.c
index ab18490dce3..17422798c13 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_convolve8_avg_dspr2.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_convolve8_avg_dspr2.c
@@ -413,7 +413,7 @@ void vp9_convolve8_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride,
const int16_t *filter_y, int y_step_q4,
int w, int h) {
/* Fixed size intermediate buffer places limits on parameters. */
- DECLARE_ALIGNED_ARRAY(32, uint8_t, temp, 64 * 135);
+ DECLARE_ALIGNED(32, uint8_t, temp[64 * 135]);
int32_t intermediate_height = ((h * y_step_q4) >> 4) + 7;
assert(w <= 64);
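
DECLARE_ALIGNED_ARRAY over-allocated and aligned the array by hand; DECLARE_ALIGNED leans on compiler alignment attributes instead. Roughly what the new macro expands to, paraphrased from vpx_ports/mem.h rather than copied verbatim:

#if defined(__GNUC__)
#define DECLARE_ALIGNED(n, typ, val) typ val __attribute__((aligned(n)))
#elif defined(_MSC_VER)
#define DECLARE_ALIGNED(n, typ, val) __declspec(align(n)) typ val
#else
#define DECLARE_ALIGNED(n, typ, val) typ val
#endif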
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_convolve8_dspr2.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_convolve8_dspr2.c
index 0ef9dd508e5..58b50d2df93 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_convolve8_dspr2.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_convolve8_dspr2.c
@@ -950,7 +950,7 @@ void vp9_convolve8_dspr2(const uint8_t *src, ptrdiff_t src_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h) {
- DECLARE_ALIGNED_ARRAY(32, uint8_t, temp, 64 * 135);
+ DECLARE_ALIGNED(32, uint8_t, temp[64 * 135]);
int32_t intermediate_height = ((h * y_step_q4) >> 4) + 7;
uint32_t pos = 38;
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_convolve8_horiz_msa.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_convolve8_horiz_msa.c
new file mode 100644
index 00000000000..e2247435e88
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_convolve8_horiz_msa.c
@@ -0,0 +1,1045 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp9_rtcd.h"
+#include "vp9/common/mips/msa/vp9_convolve_msa.h"
+
+static void common_hz_8t_4x4_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter) {
+ v16i8 filt0, filt1, filt2, filt3;
+ v16i8 src0, src1, src2, src3;
+ v16u8 mask0, mask1, mask2, mask3;
+ v8i16 filt, out0, out1;
+
+ mask0 = LOAD_UB(&mc_filt_mask_arr[16]);
+
+ src -= 3;
+
+ /* rearranging filter */
+ filt = LOAD_SH(filter);
+ filt0 = (v16i8)__msa_splati_h(filt, 0);
+ filt1 = (v16i8)__msa_splati_h(filt, 1);
+ filt2 = (v16i8)__msa_splati_h(filt, 2);
+ filt3 = (v16i8)__msa_splati_h(filt, 3);
+
+ mask1 = mask0 + 2;
+ mask2 = mask0 + 4;
+ mask3 = mask0 + 6;
+
+ LOAD_4VECS_SB(src, src_stride, src0, src1, src2, src3);
+
+ XORI_B_4VECS_SB(src0, src1, src2, src3, src0, src1, src2, src3, 128);
+
+ HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
+ filt0, filt1, filt2, filt3, out0, out1);
+
+ out0 = SRARI_SATURATE_SIGNED_H(out0, FILTER_BITS, 7);
+ out1 = SRARI_SATURATE_SIGNED_H(out1, FILTER_BITS, 7);
+
+ PCKEV_2B_XORI128_STORE_4_BYTES_4(out0, out1, dst, dst_stride);
+}
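
Each 8-tap horizontal pass computes out(x) = clip((sum over k of filter[k] * src[x + k - 3] + 64) >> FILTER_BITS) with FILTER_BITS == 7; the SRARI_SATURATE step above is that rounded shift plus clamp. A scalar equivalent (hz_8tap_row_c and clip_u8 are hypothetical names; src needs three bytes of left context, matching the src -= 3 above):

#include <stdint.h>

#define FILTER_BITS 7

static uint8_t clip_u8(int v) {
  return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
}

static void hz_8tap_row_c(const uint8_t *src, uint8_t *dst, int w,
                          const int8_t *filter) {
  int x, k;
  for (x = 0; x < w; x++) {
    int sum = 0;
    for (k = 0; k < 8; k++)
      sum += filter[k] * src[x + k - 3];  /* taps span src[x-3] .. src[x+4] */
    dst[x] = clip_u8((sum + (1 << (FILTER_BITS - 1))) >> FILTER_BITS);
  }
}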
+
+static void common_hz_8t_4x8_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter) {
+ v16i8 filt0, filt1, filt2, filt3;
+ v16i8 src0, src1, src2, src3;
+ v16u8 mask0, mask1, mask2, mask3;
+ v8i16 filt, out0, out1, out2, out3;
+
+ mask0 = LOAD_UB(&mc_filt_mask_arr[16]);
+
+ src -= 3;
+
+ /* rearranging filter */
+ filt = LOAD_SH(filter);
+ filt0 = (v16i8)__msa_splati_h(filt, 0);
+ filt1 = (v16i8)__msa_splati_h(filt, 1);
+ filt2 = (v16i8)__msa_splati_h(filt, 2);
+ filt3 = (v16i8)__msa_splati_h(filt, 3);
+
+ mask1 = mask0 + 2;
+ mask2 = mask0 + 4;
+ mask3 = mask0 + 6;
+
+ LOAD_4VECS_SB(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+
+ XORI_B_4VECS_SB(src0, src1, src2, src3, src0, src1, src2, src3, 128);
+
+ HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
+ filt0, filt1, filt2, filt3, out0, out1);
+
+ LOAD_4VECS_SB(src, src_stride, src0, src1, src2, src3);
+
+ XORI_B_4VECS_SB(src0, src1, src2, src3, src0, src1, src2, src3, 128);
+
+ HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
+ filt0, filt1, filt2, filt3, out2, out3);
+
+ out0 = SRARI_SATURATE_SIGNED_H(out0, FILTER_BITS, 7);
+ out1 = SRARI_SATURATE_SIGNED_H(out1, FILTER_BITS, 7);
+ out2 = SRARI_SATURATE_SIGNED_H(out2, FILTER_BITS, 7);
+ out3 = SRARI_SATURATE_SIGNED_H(out3, FILTER_BITS, 7);
+
+ PCKEV_2B_XORI128_STORE_4_BYTES_4(out0, out1, dst, dst_stride);
+ dst += (4 * dst_stride);
+ PCKEV_2B_XORI128_STORE_4_BYTES_4(out2, out3, dst, dst_stride);
+}
+
+static void common_hz_8t_4w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ if (4 == height) {
+ common_hz_8t_4x4_msa(src, src_stride, dst, dst_stride, filter);
+ } else if (8 == height) {
+ common_hz_8t_4x8_msa(src, src_stride, dst, dst_stride, filter);
+ }
+}
+
+static void common_hz_8t_8x4_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter) {
+ v16i8 filt0, filt1, filt2, filt3;
+ v16i8 src0, src1, src2, src3;
+ v16u8 mask0, mask1, mask2, mask3;
+ v8i16 filt, out0, out1, out2, out3;
+
+ mask0 = LOAD_UB(&mc_filt_mask_arr[0]);
+
+ src -= 3;
+
+ /* rearranging filter */
+ filt = LOAD_SH(filter);
+ filt0 = (v16i8)__msa_splati_h(filt, 0);
+ filt1 = (v16i8)__msa_splati_h(filt, 1);
+ filt2 = (v16i8)__msa_splati_h(filt, 2);
+ filt3 = (v16i8)__msa_splati_h(filt, 3);
+
+ mask1 = mask0 + 2;
+ mask2 = mask0 + 4;
+ mask3 = mask0 + 6;
+
+ LOAD_4VECS_SB(src, src_stride, src0, src1, src2, src3);
+
+ XORI_B_4VECS_SB(src0, src1, src2, src3, src0, src1, src2, src3, 128);
+
+ HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
+ filt0, filt1, filt2, filt3, out0, out1, out2,
+ out3);
+
+ out0 = SRARI_SATURATE_SIGNED_H(out0, FILTER_BITS, 7);
+ out1 = SRARI_SATURATE_SIGNED_H(out1, FILTER_BITS, 7);
+ out2 = SRARI_SATURATE_SIGNED_H(out2, FILTER_BITS, 7);
+ out3 = SRARI_SATURATE_SIGNED_H(out3, FILTER_BITS, 7);
+
+ PCKEV_B_4_XORI128_STORE_8_BYTES_4(out0, out1, out2, out3, dst, dst_stride);
+}
+
+static void common_hz_8t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt;
+ v16i8 filt0, filt1, filt2, filt3;
+ v16i8 src0, src1, src2, src3;
+ v16u8 mask0, mask1, mask2, mask3;
+ v8i16 filt, out0, out1, out2, out3;
+
+ mask0 = LOAD_UB(&mc_filt_mask_arr[0]);
+
+ src -= 3;
+
+ /* rearranging filter */
+ filt = LOAD_SH(filter);
+ filt0 = (v16i8)__msa_splati_h(filt, 0);
+ filt1 = (v16i8)__msa_splati_h(filt, 1);
+ filt2 = (v16i8)__msa_splati_h(filt, 2);
+ filt3 = (v16i8)__msa_splati_h(filt, 3);
+
+ mask1 = mask0 + 2;
+ mask2 = mask0 + 4;
+ mask3 = mask0 + 6;
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LOAD_4VECS_SB(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+
+ XORI_B_4VECS_SB(src0, src1, src2, src3, src0, src1, src2, src3, 128);
+
+ HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+ mask3, filt0, filt1, filt2, filt3, out0, out1,
+ out2, out3);
+
+ out0 = SRARI_SATURATE_SIGNED_H(out0, FILTER_BITS, 7);
+ out1 = SRARI_SATURATE_SIGNED_H(out1, FILTER_BITS, 7);
+ out2 = SRARI_SATURATE_SIGNED_H(out2, FILTER_BITS, 7);
+ out3 = SRARI_SATURATE_SIGNED_H(out3, FILTER_BITS, 7);
+
+ PCKEV_B_4_XORI128_STORE_8_BYTES_4(out0, out1, out2, out3, dst, dst_stride);
+ dst += (4 * dst_stride);
+ }
+}
+
+static void common_hz_8t_8w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ if (4 == height) {
+ common_hz_8t_8x4_msa(src, src_stride, dst, dst_stride, filter);
+ } else {
+ common_hz_8t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, height);
+ }
+}
+
+static void common_hz_8t_16w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3;
+ v16i8 filt0, filt1, filt2, filt3;
+ v16u8 mask0, mask1, mask2, mask3;
+ v8i16 filt, out0, out1, out2, out3;
+
+ mask0 = LOAD_UB(&mc_filt_mask_arr[0]);
+
+ src -= 3;
+
+ /* rearranging filter */
+ filt = LOAD_SH(filter);
+ filt0 = (v16i8)__msa_splati_h(filt, 0);
+ filt1 = (v16i8)__msa_splati_h(filt, 1);
+ filt2 = (v16i8)__msa_splati_h(filt, 2);
+ filt3 = (v16i8)__msa_splati_h(filt, 3);
+
+ mask1 = mask0 + 2;
+ mask2 = mask0 + 4;
+ mask3 = mask0 + 6;
+
+ for (loop_cnt = (height >> 1); loop_cnt--;) {
+ src0 = LOAD_SB(src);
+ src1 = LOAD_SB(src + 8);
+ src += src_stride;
+ src2 = LOAD_SB(src);
+ src3 = LOAD_SB(src + 8);
+ src += src_stride;
+
+ XORI_B_4VECS_SB(src0, src1, src2, src3, src0, src1, src2, src3, 128);
+
+ HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+ mask3, filt0, filt1, filt2, filt3, out0, out1,
+ out2, out3);
+
+ out0 = SRARI_SATURATE_SIGNED_H(out0, FILTER_BITS, 7);
+ out1 = SRARI_SATURATE_SIGNED_H(out1, FILTER_BITS, 7);
+ out2 = SRARI_SATURATE_SIGNED_H(out2, FILTER_BITS, 7);
+ out3 = SRARI_SATURATE_SIGNED_H(out3, FILTER_BITS, 7);
+
+ PCKEV_B_XORI128_STORE_VEC(out1, out0, dst);
+ dst += dst_stride;
+ PCKEV_B_XORI128_STORE_VEC(out3, out2, dst);
+ dst += dst_stride;
+ }
+}
+
+static void common_hz_8t_32w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3;
+ v16i8 filt0, filt1, filt2, filt3;
+ v16u8 mask0, mask1, mask2, mask3;
+ v8i16 filt, out0, out1, out2, out3;
+
+ mask0 = LOAD_UB(&mc_filt_mask_arr[0]);
+
+ src -= 3;
+
+ /* rearranging filter */
+ filt = LOAD_SH(filter);
+ filt0 = (v16i8)__msa_splati_h(filt, 0);
+ filt1 = (v16i8)__msa_splati_h(filt, 1);
+ filt2 = (v16i8)__msa_splati_h(filt, 2);
+ filt3 = (v16i8)__msa_splati_h(filt, 3);
+
+ mask1 = mask0 + 2;
+ mask2 = mask0 + 4;
+ mask3 = mask0 + 6;
+
+ for (loop_cnt = (height >> 1); loop_cnt--;) {
+ src0 = LOAD_SB(src);
+ src2 = LOAD_SB(src + 16);
+ src3 = LOAD_SB(src + 24);
+ src1 = __msa_sld_b((v16i8)src2, (v16i8)src0, 8);
+ src += src_stride;
+
+ XORI_B_4VECS_SB(src0, src1, src2, src3, src0, src1, src2, src3, 128);
+
+ HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+ mask3, filt0, filt1, filt2, filt3, out0, out1,
+ out2, out3);
+
+ out0 = SRARI_SATURATE_SIGNED_H(out0, FILTER_BITS, 7);
+ out1 = SRARI_SATURATE_SIGNED_H(out1, FILTER_BITS, 7);
+ out2 = SRARI_SATURATE_SIGNED_H(out2, FILTER_BITS, 7);
+ out3 = SRARI_SATURATE_SIGNED_H(out3, FILTER_BITS, 7);
+
+ src0 = LOAD_SB(src);
+ src2 = LOAD_SB(src + 16);
+ src3 = LOAD_SB(src + 24);
+ src1 = __msa_sld_b((v16i8)src2, (v16i8)src0, 8);
+
+ PCKEV_B_XORI128_STORE_VEC(out1, out0, dst);
+ PCKEV_B_XORI128_STORE_VEC(out3, out2, (dst + 16));
+ dst += dst_stride;
+
+ XORI_B_4VECS_SB(src0, src1, src2, src3, src0, src1, src2, src3, 128);
+
+ HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+ mask3, filt0, filt1, filt2, filt3, out0, out1,
+ out2, out3);
+
+ out0 = SRARI_SATURATE_SIGNED_H(out0, FILTER_BITS, 7);
+ out1 = SRARI_SATURATE_SIGNED_H(out1, FILTER_BITS, 7);
+ out2 = SRARI_SATURATE_SIGNED_H(out2, FILTER_BITS, 7);
+ out3 = SRARI_SATURATE_SIGNED_H(out3, FILTER_BITS, 7);
+
+ PCKEV_B_XORI128_STORE_VEC(out1, out0, dst);
+ PCKEV_B_XORI128_STORE_VEC(out3, out2, (dst + 16));
+
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+static void common_hz_8t_64w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt, cnt;
+ v16i8 src0, src1, src2, src3;
+ v16i8 filt0, filt1, filt2, filt3;
+ v16u8 mask0, mask1, mask2, mask3;
+ v8i16 filt, out0, out1, out2, out3;
+
+ mask0 = LOAD_UB(&mc_filt_mask_arr[0]);
+
+ src -= 3;
+
+ /* rearranging filter */
+ filt = LOAD_SH(filter);
+ filt0 = (v16i8)__msa_splati_h(filt, 0);
+ filt1 = (v16i8)__msa_splati_h(filt, 1);
+ filt2 = (v16i8)__msa_splati_h(filt, 2);
+ filt3 = (v16i8)__msa_splati_h(filt, 3);
+
+ mask1 = mask0 + 2;
+ mask2 = mask0 + 4;
+ mask3 = mask0 + 6;
+
+ for (loop_cnt = height; loop_cnt--;) {
+ for (cnt = 0; cnt < 2; ++cnt) {
+ src0 = LOAD_SB(&src[cnt << 5]);
+ src2 = LOAD_SB(&src[16 + (cnt << 5)]);
+ src3 = LOAD_SB(&src[24 + (cnt << 5)]);
+ src1 = __msa_sld_b((v16i8)src2, (v16i8)src0, 8);
+
+ XORI_B_4VECS_SB(src0, src1, src2, src3, src0, src1, src2, src3, 128);
+
+ HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+ mask3, filt0, filt1, filt2, filt3, out0, out1,
+ out2, out3);
+
+ out0 = SRARI_SATURATE_SIGNED_H(out0, FILTER_BITS, 7);
+ out1 = SRARI_SATURATE_SIGNED_H(out1, FILTER_BITS, 7);
+ out2 = SRARI_SATURATE_SIGNED_H(out2, FILTER_BITS, 7);
+ out3 = SRARI_SATURATE_SIGNED_H(out3, FILTER_BITS, 7);
+
+ PCKEV_B_XORI128_STORE_VEC(out1, out0, &dst[cnt << 5]);
+ PCKEV_B_XORI128_STORE_VEC(out3, out2, &dst[16 + (cnt << 5)]);
+ }
+
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+static void common_hz_2t_4x4_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter) {
+ uint32_t out0, out1, out2, out3;
+ v16i8 src0, src1, src2, src3, mask;
+ v16u8 vec0, vec1, filt0;
+ v16i8 res0, res1;
+ v8u16 vec2, vec3, filt, const255;
+
+ mask = LOAD_SB(&mc_filt_mask_arr[16]);
+
+ /* rearranging filter */
+ filt = LOAD_UH(filter);
+ filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+ const255 = (v8u16)__msa_ldi_h(255);
+
+ LOAD_4VECS_SB(src, src_stride, src0, src1, src2, src3);
+
+ vec0 = (v16u8)__msa_vshf_b(mask, src1, src0);
+ vec1 = (v16u8)__msa_vshf_b(mask, src3, src2);
+
+ vec2 = __msa_dotp_u_h(vec0, filt0);
+ vec3 = __msa_dotp_u_h(vec1, filt0);
+
+ vec2 = (v8u16)__msa_srari_h((v8i16)vec2, FILTER_BITS);
+ vec3 = (v8u16)__msa_srari_h((v8i16)vec3, FILTER_BITS);
+
+ vec2 = __msa_min_u_h(vec2, const255);
+ vec3 = __msa_min_u_h(vec3, const255);
+
+ res0 = __msa_pckev_b((v16i8)vec2, (v16i8)vec2);
+ res1 = __msa_pckev_b((v16i8)vec3, (v16i8)vec3);
+
+ out0 = __msa_copy_u_w((v4i32)res0, 0);
+ out1 = __msa_copy_u_w((v4i32)res0, 1);
+ out2 = __msa_copy_u_w((v4i32)res1, 0);
+ out3 = __msa_copy_u_w((v4i32)res1, 1);
+
+ STORE_WORD(dst, out0);
+ dst += dst_stride;
+ STORE_WORD(dst, out1);
+ dst += dst_stride;
+ STORE_WORD(dst, out2);
+ dst += dst_stride;
+ STORE_WORD(dst, out3);
+}
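
The 2-tap path is plain bilinear filtering: two adjacent pixels weighted by the two nonzero taps, rounded by FILTER_BITS, then clamped, which is what the __msa_min_u_h(..., 255) above does. A scalar sketch; filter here points at the two live taps, i.e. &filt_hor[3] in the caller at the bottom of this file:

#include <stdint.h>

#define FILTER_BITS 7

static void hz_2tap_row_c(const uint8_t *src, uint8_t *dst, int w,
                          const int8_t *filter) {
  int x;
  for (x = 0; x < w; x++) {
    int sum = filter[0] * src[x] + filter[1] * src[x + 1];
    sum = (sum + (1 << (FILTER_BITS - 1))) >> FILTER_BITS;  /* round */
    dst[x] = sum > 255 ? 255 : (uint8_t)sum;                /* clamp */
  }
}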
+
+static void common_hz_2t_4x8_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter) {
+ uint32_t out0, out1, out2, out3;
+ v16u8 filt0;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+ v16u8 vec0, vec1, vec2, vec3;
+ v8u16 vec4, vec5, vec6, vec7;
+ v16i8 res0, res1, res2, res3;
+ v8u16 filt, const255;
+
+ mask = LOAD_SB(&mc_filt_mask_arr[16]);
+
+ /* rearranging filter */
+ filt = LOAD_UH(filter);
+ filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+ const255 = (v8u16)__msa_ldi_h(255);
+
+ LOAD_8VECS_SB(src, src_stride,
+ src0, src1, src2, src3, src4, src5, src6, src7);
+
+ vec0 = (v16u8)__msa_vshf_b(mask, src1, src0);
+ vec1 = (v16u8)__msa_vshf_b(mask, src3, src2);
+ vec2 = (v16u8)__msa_vshf_b(mask, src5, src4);
+ vec3 = (v16u8)__msa_vshf_b(mask, src7, src6);
+
+ vec4 = __msa_dotp_u_h(vec0, filt0);
+ vec5 = __msa_dotp_u_h(vec1, filt0);
+ vec6 = __msa_dotp_u_h(vec2, filt0);
+ vec7 = __msa_dotp_u_h(vec3, filt0);
+
+ vec4 = (v8u16)__msa_srari_h((v8i16)vec4, FILTER_BITS);
+ vec5 = (v8u16)__msa_srari_h((v8i16)vec5, FILTER_BITS);
+ vec6 = (v8u16)__msa_srari_h((v8i16)vec6, FILTER_BITS);
+ vec7 = (v8u16)__msa_srari_h((v8i16)vec7, FILTER_BITS);
+
+ vec4 = __msa_min_u_h(vec4, const255);
+ vec5 = __msa_min_u_h(vec5, const255);
+ vec6 = __msa_min_u_h(vec6, const255);
+ vec7 = __msa_min_u_h(vec7, const255);
+
+ res0 = __msa_pckev_b((v16i8)vec4, (v16i8)vec4);
+ res1 = __msa_pckev_b((v16i8)vec5, (v16i8)vec5);
+ res2 = __msa_pckev_b((v16i8)vec6, (v16i8)vec6);
+ res3 = __msa_pckev_b((v16i8)vec7, (v16i8)vec7);
+
+ out0 = __msa_copy_u_w((v4i32)res0, 0);
+ out1 = __msa_copy_u_w((v4i32)res0, 1);
+ out2 = __msa_copy_u_w((v4i32)res1, 0);
+ out3 = __msa_copy_u_w((v4i32)res1, 1);
+
+ STORE_WORD(dst, out0);
+ dst += dst_stride;
+ STORE_WORD(dst, out1);
+ dst += dst_stride;
+ STORE_WORD(dst, out2);
+ dst += dst_stride;
+ STORE_WORD(dst, out3);
+ dst += dst_stride;
+
+ out0 = __msa_copy_u_w((v4i32)res2, 0);
+ out1 = __msa_copy_u_w((v4i32)res2, 1);
+ out2 = __msa_copy_u_w((v4i32)res3, 0);
+ out3 = __msa_copy_u_w((v4i32)res3, 1);
+
+ STORE_WORD(dst, out0);
+ dst += dst_stride;
+ STORE_WORD(dst, out1);
+ dst += dst_stride;
+ STORE_WORD(dst, out2);
+ dst += dst_stride;
+ STORE_WORD(dst, out3);
+}
+
+static void common_hz_2t_4w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ if (4 == height) {
+ common_hz_2t_4x4_msa(src, src_stride, dst, dst_stride, filter);
+ } else if (8 == height) {
+ common_hz_2t_4x8_msa(src, src_stride, dst, dst_stride, filter);
+ }
+}
+
+static void common_hz_2t_8x4_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter) {
+ v16u8 filt0;
+ v16i8 src0, src1, src2, src3, mask;
+ v8u16 vec0, vec1, vec2, vec3;
+ v8u16 out0, out1, out2, out3;
+ v8u16 const255, filt;
+
+ mask = LOAD_SB(&mc_filt_mask_arr[0]);
+
+ /* rearranging filter */
+ filt = LOAD_UH(filter);
+ filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+ const255 = (v8u16)__msa_ldi_h(255);
+
+ LOAD_4VECS_SB(src, src_stride, src0, src1, src2, src3);
+
+ vec0 = (v8u16)__msa_vshf_b(mask, src0, src0);
+ vec1 = (v8u16)__msa_vshf_b(mask, src1, src1);
+ vec2 = (v8u16)__msa_vshf_b(mask, src2, src2);
+ vec3 = (v8u16)__msa_vshf_b(mask, src3, src3);
+
+ vec0 = __msa_dotp_u_h((v16u8)vec0, filt0);
+ vec1 = __msa_dotp_u_h((v16u8)vec1, filt0);
+ vec2 = __msa_dotp_u_h((v16u8)vec2, filt0);
+ vec3 = __msa_dotp_u_h((v16u8)vec3, filt0);
+
+ SRARI_H_4VECS_UH(vec0, vec1, vec2, vec3, vec0, vec1, vec2, vec3, FILTER_BITS);
+
+ out0 = __msa_min_u_h(vec0, const255);
+ out1 = __msa_min_u_h(vec1, const255);
+ out2 = __msa_min_u_h(vec2, const255);
+ out3 = __msa_min_u_h(vec3, const255);
+
+ PCKEV_B_STORE_8_BYTES_4(out0, out1, out2, out3, dst, dst_stride);
+}
+
+static void common_hz_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ v16u8 filt0;
+ v16i8 src0, src1, src2, src3, mask;
+ v8u16 vec0, vec1, vec2, vec3;
+ v8u16 filt, const255;
+
+ mask = LOAD_SB(&mc_filt_mask_arr[0]);
+
+ /* rearranging filter */
+ filt = LOAD_UH(filter);
+ filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+ const255 = (v8u16)__msa_ldi_h(255);
+
+ LOAD_4VECS_SB(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+
+ vec0 = (v8u16)__msa_vshf_b(mask, src0, src0);
+ vec1 = (v8u16)__msa_vshf_b(mask, src1, src1);
+ vec2 = (v8u16)__msa_vshf_b(mask, src2, src2);
+ vec3 = (v8u16)__msa_vshf_b(mask, src3, src3);
+
+ vec0 = __msa_dotp_u_h((v16u8)vec0, filt0);
+ vec1 = __msa_dotp_u_h((v16u8)vec1, filt0);
+ vec2 = __msa_dotp_u_h((v16u8)vec2, filt0);
+ vec3 = __msa_dotp_u_h((v16u8)vec3, filt0);
+
+ SRARI_H_4VECS_UH(vec0, vec1, vec2, vec3, vec0, vec1, vec2, vec3, FILTER_BITS);
+
+ vec0 = __msa_min_u_h(vec0, const255);
+ vec1 = __msa_min_u_h(vec1, const255);
+ vec2 = __msa_min_u_h(vec2, const255);
+ vec3 = __msa_min_u_h(vec3, const255);
+
+ LOAD_4VECS_SB(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+
+ PCKEV_B_STORE_8_BYTES_4(vec0, vec1, vec2, vec3, dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ vec0 = (v8u16)__msa_vshf_b(mask, src0, src0);
+ vec1 = (v8u16)__msa_vshf_b(mask, src1, src1);
+ vec2 = (v8u16)__msa_vshf_b(mask, src2, src2);
+ vec3 = (v8u16)__msa_vshf_b(mask, src3, src3);
+
+ vec0 = __msa_dotp_u_h((v16u8)vec0, filt0);
+ vec1 = __msa_dotp_u_h((v16u8)vec1, filt0);
+ vec2 = __msa_dotp_u_h((v16u8)vec2, filt0);
+ vec3 = __msa_dotp_u_h((v16u8)vec3, filt0);
+
+ SRARI_H_4VECS_UH(vec0, vec1, vec2, vec3, vec0, vec1, vec2, vec3, FILTER_BITS);
+
+ vec0 = __msa_min_u_h(vec0, const255);
+ vec1 = __msa_min_u_h(vec1, const255);
+ vec2 = __msa_min_u_h(vec2, const255);
+ vec3 = __msa_min_u_h(vec3, const255);
+
+ PCKEV_B_STORE_8_BYTES_4(vec0, vec1, vec2, vec3, dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ if (16 == height) {
+ LOAD_4VECS_SB(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+
+ vec0 = (v8u16)__msa_vshf_b(mask, src0, src0);
+ vec1 = (v8u16)__msa_vshf_b(mask, src1, src1);
+ vec2 = (v8u16)__msa_vshf_b(mask, src2, src2);
+ vec3 = (v8u16)__msa_vshf_b(mask, src3, src3);
+
+ vec0 = __msa_dotp_u_h((v16u8)vec0, filt0);
+ vec1 = __msa_dotp_u_h((v16u8)vec1, filt0);
+ vec2 = __msa_dotp_u_h((v16u8)vec2, filt0);
+ vec3 = __msa_dotp_u_h((v16u8)vec3, filt0);
+
+ SRARI_H_4VECS_UH(vec0, vec1, vec2, vec3,
+ vec0, vec1, vec2, vec3, FILTER_BITS);
+
+ vec0 = __msa_min_u_h(vec0, const255);
+ vec1 = __msa_min_u_h(vec1, const255);
+ vec2 = __msa_min_u_h(vec2, const255);
+ vec3 = __msa_min_u_h(vec3, const255);
+
+ LOAD_4VECS_SB(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+
+ PCKEV_B_STORE_8_BYTES_4(vec0, vec1, vec2, vec3, dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ vec0 = (v8u16)__msa_vshf_b(mask, src0, src0);
+ vec1 = (v8u16)__msa_vshf_b(mask, src1, src1);
+ vec2 = (v8u16)__msa_vshf_b(mask, src2, src2);
+ vec3 = (v8u16)__msa_vshf_b(mask, src3, src3);
+
+ vec0 = __msa_dotp_u_h((v16u8)vec0, filt0);
+ vec1 = __msa_dotp_u_h((v16u8)vec1, filt0);
+ vec2 = __msa_dotp_u_h((v16u8)vec2, filt0);
+ vec3 = __msa_dotp_u_h((v16u8)vec3, filt0);
+
+ SRARI_H_4VECS_UH(vec0, vec1, vec2, vec3,
+ vec0, vec1, vec2, vec3, FILTER_BITS);
+
+ vec0 = __msa_min_u_h(vec0, const255);
+ vec1 = __msa_min_u_h(vec1, const255);
+ vec2 = __msa_min_u_h(vec2, const255);
+ vec3 = __msa_min_u_h(vec3, const255);
+
+ PCKEV_B_STORE_8_BYTES_4(vec0, vec1, vec2, vec3, dst, dst_stride);
+ }
+}
+
+static void common_hz_2t_8w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ if (4 == height) {
+ common_hz_2t_8x4_msa(src, src_stride, dst, dst_stride, filter);
+ } else {
+ common_hz_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, height);
+ }
+}
+
+static void common_hz_2t_16w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+ v16u8 filt0;
+ v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ v8u16 out0, out1, out2, out3, out4, out5, out6, out7;
+ v8u16 filt, const255;
+
+ mask = LOAD_SB(&mc_filt_mask_arr[0]);
+
+ loop_cnt = (height >> 2) - 1;
+
+ /* rearranging filter */
+ filt = LOAD_UH(filter);
+ filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+ const255 = (v8u16)__msa_ldi_h(255);
+
+ src0 = LOAD_SB(src);
+ src1 = LOAD_SB(src + 8);
+ src += src_stride;
+ src2 = LOAD_SB(src);
+ src3 = LOAD_SB(src + 8);
+ src += src_stride;
+ src4 = LOAD_SB(src);
+ src5 = LOAD_SB(src + 8);
+ src += src_stride;
+ src6 = LOAD_SB(src);
+ src7 = LOAD_SB(src + 8);
+ src += src_stride;
+
+ vec0 = (v16u8)__msa_vshf_b(mask, src0, src0);
+ vec1 = (v16u8)__msa_vshf_b(mask, src1, src1);
+ vec2 = (v16u8)__msa_vshf_b(mask, src2, src2);
+ vec3 = (v16u8)__msa_vshf_b(mask, src3, src3);
+ vec4 = (v16u8)__msa_vshf_b(mask, src4, src4);
+ vec5 = (v16u8)__msa_vshf_b(mask, src5, src5);
+ vec6 = (v16u8)__msa_vshf_b(mask, src6, src6);
+ vec7 = (v16u8)__msa_vshf_b(mask, src7, src7);
+
+ out0 = __msa_dotp_u_h(vec0, filt0);
+ out1 = __msa_dotp_u_h(vec1, filt0);
+ out2 = __msa_dotp_u_h(vec2, filt0);
+ out3 = __msa_dotp_u_h(vec3, filt0);
+ out4 = __msa_dotp_u_h(vec4, filt0);
+ out5 = __msa_dotp_u_h(vec5, filt0);
+ out6 = __msa_dotp_u_h(vec6, filt0);
+ out7 = __msa_dotp_u_h(vec7, filt0);
+
+ out0 = (v8u16)__msa_srari_h((v8i16)out0, FILTER_BITS);
+ out1 = (v8u16)__msa_srari_h((v8i16)out1, FILTER_BITS);
+ out2 = (v8u16)__msa_srari_h((v8i16)out2, FILTER_BITS);
+ out3 = (v8u16)__msa_srari_h((v8i16)out3, FILTER_BITS);
+ out4 = (v8u16)__msa_srari_h((v8i16)out4, FILTER_BITS);
+ out5 = (v8u16)__msa_srari_h((v8i16)out5, FILTER_BITS);
+ out6 = (v8u16)__msa_srari_h((v8i16)out6, FILTER_BITS);
+ out7 = (v8u16)__msa_srari_h((v8i16)out7, FILTER_BITS);
+
+ out0 = __msa_min_u_h(out0, const255);
+ out1 = __msa_min_u_h(out1, const255);
+ out2 = __msa_min_u_h(out2, const255);
+ out3 = __msa_min_u_h(out3, const255);
+ out4 = __msa_min_u_h(out4, const255);
+ out5 = __msa_min_u_h(out5, const255);
+ out6 = __msa_min_u_h(out6, const255);
+ out7 = __msa_min_u_h(out7, const255);
+
+ PCKEV_B_STORE_VEC(out1, out0, dst);
+ dst += dst_stride;
+ PCKEV_B_STORE_VEC(out3, out2, dst);
+ dst += dst_stride;
+ PCKEV_B_STORE_VEC(out5, out4, dst);
+ dst += dst_stride;
+ PCKEV_B_STORE_VEC(out7, out6, dst);
+ dst += dst_stride;
+
+ for (; loop_cnt--;) {
+ src0 = LOAD_SB(src);
+ src1 = LOAD_SB(src + 8);
+ src += src_stride;
+ src2 = LOAD_SB(src);
+ src3 = LOAD_SB(src + 8);
+ src += src_stride;
+ src4 = LOAD_SB(src);
+ src5 = LOAD_SB(src + 8);
+ src += src_stride;
+ src6 = LOAD_SB(src);
+ src7 = LOAD_SB(src + 8);
+ src += src_stride;
+
+ vec0 = (v16u8)__msa_vshf_b(mask, src0, src0);
+ vec1 = (v16u8)__msa_vshf_b(mask, src1, src1);
+ vec2 = (v16u8)__msa_vshf_b(mask, src2, src2);
+ vec3 = (v16u8)__msa_vshf_b(mask, src3, src3);
+ vec4 = (v16u8)__msa_vshf_b(mask, src4, src4);
+ vec5 = (v16u8)__msa_vshf_b(mask, src5, src5);
+ vec6 = (v16u8)__msa_vshf_b(mask, src6, src6);
+ vec7 = (v16u8)__msa_vshf_b(mask, src7, src7);
+
+ out0 = __msa_dotp_u_h(vec0, filt0);
+ out1 = __msa_dotp_u_h(vec1, filt0);
+ out2 = __msa_dotp_u_h(vec2, filt0);
+ out3 = __msa_dotp_u_h(vec3, filt0);
+ out4 = __msa_dotp_u_h(vec4, filt0);
+ out5 = __msa_dotp_u_h(vec5, filt0);
+ out6 = __msa_dotp_u_h(vec6, filt0);
+ out7 = __msa_dotp_u_h(vec7, filt0);
+
+ out0 = (v8u16)__msa_srari_h((v8i16)out0, FILTER_BITS);
+ out1 = (v8u16)__msa_srari_h((v8i16)out1, FILTER_BITS);
+ out2 = (v8u16)__msa_srari_h((v8i16)out2, FILTER_BITS);
+ out3 = (v8u16)__msa_srari_h((v8i16)out3, FILTER_BITS);
+ out4 = (v8u16)__msa_srari_h((v8i16)out4, FILTER_BITS);
+ out5 = (v8u16)__msa_srari_h((v8i16)out5, FILTER_BITS);
+ out6 = (v8u16)__msa_srari_h((v8i16)out6, FILTER_BITS);
+ out7 = (v8u16)__msa_srari_h((v8i16)out7, FILTER_BITS);
+
+ out0 = __msa_min_u_h(out0, const255);
+ out1 = __msa_min_u_h(out1, const255);
+ out2 = __msa_min_u_h(out2, const255);
+ out3 = __msa_min_u_h(out3, const255);
+ out4 = __msa_min_u_h(out4, const255);
+ out5 = __msa_min_u_h(out5, const255);
+ out6 = __msa_min_u_h(out6, const255);
+ out7 = __msa_min_u_h(out7, const255);
+
+ PCKEV_B_STORE_VEC(out1, out0, dst);
+ dst += dst_stride;
+ PCKEV_B_STORE_VEC(out3, out2, dst);
+ dst += dst_stride;
+ PCKEV_B_STORE_VEC(out5, out4, dst);
+ dst += dst_stride;
+ PCKEV_B_STORE_VEC(out7, out6, dst);
+ dst += dst_stride;
+ }
+}
+
+static void common_hz_2t_32w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+ v16u8 filt0;
+ v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ v8u16 out0, out1, out2, out3, out4, out5, out6, out7;
+ v8u16 filt, const255;
+
+ mask = LOAD_SB(&mc_filt_mask_arr[0]);
+
+ /* rearranging filter */
+ filt = LOAD_UH(filter);
+ filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+ const255 = (v8u16)__msa_ldi_h(255);
+
+ for (loop_cnt = height >> 1; loop_cnt--;) {
+ src0 = LOAD_SB(src);
+ src2 = LOAD_SB(src + 16);
+ src3 = LOAD_SB(src + 24);
+ src1 = __msa_sld_b(src2, src0, 8);
+ src += src_stride;
+ src4 = LOAD_SB(src);
+ src6 = LOAD_SB(src + 16);
+ src7 = LOAD_SB(src + 24);
+ src5 = __msa_sld_b(src6, src4, 8);
+ src += src_stride;
+
+ vec0 = (v16u8)__msa_vshf_b(mask, src0, src0);
+ vec1 = (v16u8)__msa_vshf_b(mask, src1, src1);
+ vec2 = (v16u8)__msa_vshf_b(mask, src2, src2);
+ vec3 = (v16u8)__msa_vshf_b(mask, src3, src3);
+ vec4 = (v16u8)__msa_vshf_b(mask, src4, src4);
+ vec5 = (v16u8)__msa_vshf_b(mask, src5, src5);
+ vec6 = (v16u8)__msa_vshf_b(mask, src6, src6);
+ vec7 = (v16u8)__msa_vshf_b(mask, src7, src7);
+
+ out0 = __msa_dotp_u_h(vec0, filt0);
+ out1 = __msa_dotp_u_h(vec1, filt0);
+ out2 = __msa_dotp_u_h(vec2, filt0);
+ out3 = __msa_dotp_u_h(vec3, filt0);
+ out4 = __msa_dotp_u_h(vec4, filt0);
+ out5 = __msa_dotp_u_h(vec5, filt0);
+ out6 = __msa_dotp_u_h(vec6, filt0);
+ out7 = __msa_dotp_u_h(vec7, filt0);
+
+ out0 = (v8u16)__msa_srari_h((v8i16)out0, FILTER_BITS);
+ out1 = (v8u16)__msa_srari_h((v8i16)out1, FILTER_BITS);
+ out2 = (v8u16)__msa_srari_h((v8i16)out2, FILTER_BITS);
+ out3 = (v8u16)__msa_srari_h((v8i16)out3, FILTER_BITS);
+ out4 = (v8u16)__msa_srari_h((v8i16)out4, FILTER_BITS);
+ out5 = (v8u16)__msa_srari_h((v8i16)out5, FILTER_BITS);
+ out6 = (v8u16)__msa_srari_h((v8i16)out6, FILTER_BITS);
+ out7 = (v8u16)__msa_srari_h((v8i16)out7, FILTER_BITS);
+
+ out0 = __msa_min_u_h(out0, const255);
+ out1 = __msa_min_u_h(out1, const255);
+ out2 = __msa_min_u_h(out2, const255);
+ out3 = __msa_min_u_h(out3, const255);
+ out4 = __msa_min_u_h(out4, const255);
+ out5 = __msa_min_u_h(out5, const255);
+ out6 = __msa_min_u_h(out6, const255);
+ out7 = __msa_min_u_h(out7, const255);
+
+ PCKEV_B_STORE_VEC(out1, out0, dst);
+ PCKEV_B_STORE_VEC(out3, out2, dst + 16);
+ dst += dst_stride;
+ PCKEV_B_STORE_VEC(out5, out4, dst);
+ PCKEV_B_STORE_VEC(out7, out6, dst + 16);
+ dst += dst_stride;
+ }
+}
+
+static void common_hz_2t_64w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+ v16u8 filt0;
+ v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ v8u16 out0, out1, out2, out3, out4, out5, out6, out7;
+ v8u16 filt, const255;
+
+ mask = LOAD_SB(&mc_filt_mask_arr[0]);
+
+ /* rearranging filter */
+ filt = LOAD_UH(filter);
+ filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+ const255 = (v8u16)__msa_ldi_h(255);
+
+ for (loop_cnt = height; loop_cnt--;) {
+ src0 = LOAD_SB(src);
+ src2 = LOAD_SB(src + 16);
+ src4 = LOAD_SB(src + 32);
+ src6 = LOAD_SB(src + 48);
+ src7 = LOAD_SB(src + 56);
+ src1 = __msa_sld_b(src2, src0, 8);
+ src3 = __msa_sld_b(src4, src2, 8);
+ src5 = __msa_sld_b(src6, src4, 8);
+ src += src_stride;
+
+ vec0 = (v16u8)__msa_vshf_b(mask, src0, src0);
+ vec1 = (v16u8)__msa_vshf_b(mask, src1, src1);
+ vec2 = (v16u8)__msa_vshf_b(mask, src2, src2);
+ vec3 = (v16u8)__msa_vshf_b(mask, src3, src3);
+ vec4 = (v16u8)__msa_vshf_b(mask, src4, src4);
+ vec5 = (v16u8)__msa_vshf_b(mask, src5, src5);
+ vec6 = (v16u8)__msa_vshf_b(mask, src6, src6);
+ vec7 = (v16u8)__msa_vshf_b(mask, src7, src7);
+
+ out0 = __msa_dotp_u_h(vec0, filt0);
+ out1 = __msa_dotp_u_h(vec1, filt0);
+ out2 = __msa_dotp_u_h(vec2, filt0);
+ out3 = __msa_dotp_u_h(vec3, filt0);
+ out4 = __msa_dotp_u_h(vec4, filt0);
+ out5 = __msa_dotp_u_h(vec5, filt0);
+ out6 = __msa_dotp_u_h(vec6, filt0);
+ out7 = __msa_dotp_u_h(vec7, filt0);
+
+ out0 = (v8u16)__msa_srari_h((v8i16)out0, FILTER_BITS);
+ out1 = (v8u16)__msa_srari_h((v8i16)out1, FILTER_BITS);
+ out2 = (v8u16)__msa_srari_h((v8i16)out2, FILTER_BITS);
+ out3 = (v8u16)__msa_srari_h((v8i16)out3, FILTER_BITS);
+ out4 = (v8u16)__msa_srari_h((v8i16)out4, FILTER_BITS);
+ out5 = (v8u16)__msa_srari_h((v8i16)out5, FILTER_BITS);
+ out6 = (v8u16)__msa_srari_h((v8i16)out6, FILTER_BITS);
+ out7 = (v8u16)__msa_srari_h((v8i16)out7, FILTER_BITS);
+
+ out0 = __msa_min_u_h(out0, const255);
+ out1 = __msa_min_u_h(out1, const255);
+ out2 = __msa_min_u_h(out2, const255);
+ out3 = __msa_min_u_h(out3, const255);
+ out4 = __msa_min_u_h(out4, const255);
+ out5 = __msa_min_u_h(out5, const255);
+ out6 = __msa_min_u_h(out6, const255);
+ out7 = __msa_min_u_h(out7, const255);
+
+ PCKEV_B_STORE_VEC(out1, out0, dst);
+ PCKEV_B_STORE_VEC(out3, out2, dst + 16);
+ PCKEV_B_STORE_VEC(out5, out4, dst + 32);
+ PCKEV_B_STORE_VEC(out7, out6, dst + 48);
+ dst += dst_stride;
+ }
+}
+
+void vp9_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
+ int8_t cnt, filt_hor[8];
+
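+  /* x_step_q4 is a Q4 fixed-point step: 16 means one source pixel per
+   * output pixel.  Any other step implies scaling, which these kernels
+   * do not handle, so defer to the C version. */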
+ if (16 != x_step_q4) {
+ vp9_convolve8_horiz_c(src, src_stride, dst, dst_stride,
+ filter_x, x_step_q4, filter_y, y_step_q4,
+ w, h);
+ return;
+ }
+
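+  /* Taps 2..3 read as one little-endian word: 0x800000 means tap 3 is
+   * 128 (1.0 in Q7) and tap 2 is 0, i.e. a normalized VP9 filter that
+   * degenerates to the identity, so a plain copy suffices. */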
+ if (((const int32_t *)filter_x)[1] == 0x800000) {
+ vp9_convolve_copy(src, src_stride, dst, dst_stride,
+ filter_x, x_step_q4, filter_y, y_step_q4,
+ w, h);
+ return;
+ }
+
+ for (cnt = 0; cnt < 8; ++cnt) {
+ filt_hor[cnt] = filter_x[cnt];
+ }
+
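+  /* Taps 0..1 being zero marks VP9's bilinear filters (only taps 3..4
+   * are live), so the cheap 2-tap kernels apply; &filt_hor[3] hands
+   * them the two live taps packed in one halfword. */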
+ if (((const int32_t *)filter_x)[0] == 0) {
+ switch (w) {
+ case 4:
+ common_hz_2t_4w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ &filt_hor[3], h);
+ break;
+ case 8:
+ common_hz_2t_8w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ &filt_hor[3], h);
+ break;
+ case 16:
+ common_hz_2t_16w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ &filt_hor[3], h);
+ break;
+ case 32:
+ common_hz_2t_32w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ &filt_hor[3], h);
+ break;
+ case 64:
+ common_hz_2t_64w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ &filt_hor[3], h);
+ break;
+ default:
+ vp9_convolve8_horiz_c(src, src_stride, dst, dst_stride,
+ filter_x, x_step_q4, filter_y, y_step_q4,
+ w, h);
+ break;
+ }
+ } else {
+ switch (w) {
+ case 4:
+ common_hz_8t_4w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ filt_hor, h);
+ break;
+ case 8:
+ common_hz_8t_8w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ filt_hor, h);
+ break;
+ case 16:
+ common_hz_8t_16w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ filt_hor, h);
+ break;
+ case 32:
+ common_hz_8t_32w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ filt_hor, h);
+ break;
+ case 64:
+ common_hz_8t_64w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ filt_hor, h);
+ break;
+ default:
+ vp9_convolve8_horiz_c(src, src_stride, dst, dst_stride,
+ filter_x, x_step_q4, filter_y, y_step_q4,
+ w, h);
+ break;
+ }
+ }
+}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_convolve8_msa.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_convolve8_msa.c
new file mode 100644
index 00000000000..d0c374648c9
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_convolve8_msa.c
@@ -0,0 +1,880 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp9_rtcd.h"
+#include "vp9/common/mips/msa/vp9_convolve_msa.h"
+
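+/* Shuffle indices for __msa_vshf_b: each pair of entries selects two
+ * adjacent source bytes, so one shuffle feeds a two-byte dot product
+ * per output pixel.  Entries of 16 and above pull from a second source
+ * vector, which lets the 4-width masks pack two rows per register. */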
+const uint8_t mc_filt_mask_arr[16 * 3] = {
+ /* 8 width cases */
+ 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
+ /* 4 width cases */
+ 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
+ /* 4 width cases */
+ 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
+};
+
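+/* Horizontal-then-vertical 8-tap path for 4-wide blocks.  Seven setup
+ * rows are filtered horizontally, then each loop iteration filters four
+ * more rows and runs the vertical 8-tap over the sliding window of
+ * horizontal results.  height is assumed to be a multiple of 4. */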
+static void common_hv_8ht_8vt_4w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert,
+ int32_t height) {
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+ v16i8 filt_horiz0, filt_horiz1, filt_horiz2, filt_horiz3;
+ v16u8 mask0, mask1, mask2, mask3;
+ v8i16 filt_horiz;
+ v8i16 horiz_out0, horiz_out1, horiz_out2, horiz_out3, horiz_out4;
+ v8i16 horiz_out5, horiz_out6, horiz_out7, horiz_out8, horiz_out9;
+ v8i16 tmp0, tmp1, out0, out1, out2, out3, out4;
+ v8i16 filt, filt_vert0, filt_vert1, filt_vert2, filt_vert3;
+
+ mask0 = LOAD_UB(&mc_filt_mask_arr[16]);
+
+ src -= (3 + 3 * src_stride);
+
+ /* rearranging filter */
+ filt_horiz = LOAD_SH(filter_horiz);
+ filt_horiz0 = (v16i8)__msa_splati_h(filt_horiz, 0);
+ filt_horiz1 = (v16i8)__msa_splati_h(filt_horiz, 1);
+ filt_horiz2 = (v16i8)__msa_splati_h(filt_horiz, 2);
+ filt_horiz3 = (v16i8)__msa_splati_h(filt_horiz, 3);
+
+ mask1 = mask0 + 2;
+ mask2 = mask0 + 4;
+ mask3 = mask0 + 6;
+
+ LOAD_7VECS_SB(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+ src += (7 * src_stride);
+
+ XORI_B_7VECS_SB(src0, src1, src2, src3, src4, src5, src6,
+ src0, src1, src2, src3, src4, src5, src6, 128);
+
+ horiz_out0 = HORIZ_8TAP_FILT_2VECS(src0, src1, mask0, mask1, mask2, mask3,
+ filt_horiz0, filt_horiz1, filt_horiz2,
+ filt_horiz3);
+ horiz_out2 = HORIZ_8TAP_FILT_2VECS(src2, src3, mask0, mask1, mask2, mask3,
+ filt_horiz0, filt_horiz1, filt_horiz2,
+ filt_horiz3);
+ horiz_out4 = HORIZ_8TAP_FILT_2VECS(src4, src5, mask0, mask1, mask2, mask3,
+ filt_horiz0, filt_horiz1, filt_horiz2,
+ filt_horiz3);
+ horiz_out5 = HORIZ_8TAP_FILT_2VECS(src5, src6, mask0, mask1, mask2, mask3,
+ filt_horiz0, filt_horiz1, filt_horiz2,
+ filt_horiz3);
+ horiz_out1 = (v8i16)__msa_sldi_b((v16i8)horiz_out2, (v16i8)horiz_out0, 8);
+ horiz_out3 = (v8i16)__msa_sldi_b((v16i8)horiz_out4, (v16i8)horiz_out2, 8);
+
+ filt = LOAD_SH(filter_vert);
+ filt_vert0 = __msa_splati_h(filt, 0);
+ filt_vert1 = __msa_splati_h(filt, 1);
+ filt_vert2 = __msa_splati_h(filt, 2);
+ filt_vert3 = __msa_splati_h(filt, 3);
+
+ out0 = (v8i16)__msa_ilvev_b((v16i8)horiz_out1, (v16i8)horiz_out0);
+ out1 = (v8i16)__msa_ilvev_b((v16i8)horiz_out3, (v16i8)horiz_out2);
+ out2 = (v8i16)__msa_ilvev_b((v16i8)horiz_out5, (v16i8)horiz_out4);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LOAD_4VECS_SB(src, src_stride, src7, src8, src9, src10);
+ src += (4 * src_stride);
+
+ XORI_B_4VECS_SB(src7, src8, src9, src10, src7, src8, src9, src10, 128);
+
+ horiz_out7 = HORIZ_8TAP_FILT_2VECS(src7, src8, mask0, mask1, mask2, mask3,
+ filt_horiz0, filt_horiz1, filt_horiz2,
+ filt_horiz3);
+ horiz_out6 = (v8i16)__msa_sldi_b((v16i8)horiz_out7, (v16i8)horiz_out5, 8);
+
+ out3 = (v8i16)__msa_ilvev_b((v16i8)horiz_out7, (v16i8)horiz_out6);
+
+ tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vert0, filt_vert1,
+ filt_vert2, filt_vert3);
+
+ horiz_out9 = HORIZ_8TAP_FILT_2VECS(src9, src10, mask0, mask1, mask2, mask3,
+ filt_horiz0, filt_horiz1, filt_horiz2,
+ filt_horiz3);
+ horiz_out8 = (v8i16)__msa_sldi_b((v16i8)horiz_out9, (v16i8)horiz_out7, 8);
+
+ out4 = (v8i16)__msa_ilvev_b((v16i8)horiz_out9, (v16i8)horiz_out8);
+
+ tmp1 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out4, filt_vert0, filt_vert1,
+ filt_vert2, filt_vert3);
+ tmp0 = SRARI_SATURATE_SIGNED_H(tmp0, FILTER_BITS, 7);
+ tmp1 = SRARI_SATURATE_SIGNED_H(tmp1, FILTER_BITS, 7);
+
+ PCKEV_2B_XORI128_STORE_4_BYTES_4(tmp0, tmp1, dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ horiz_out5 = horiz_out9;
+
+ out0 = out2;
+ out1 = out3;
+ out2 = out4;
+ }
+}
+
+static void common_hv_8ht_8vt_8w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert,
+ int32_t height) {
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+ v16i8 filt_horiz0, filt_horiz1, filt_horiz2, filt_horiz3;
+ v8i16 filt_horiz, filt, filt_vert0, filt_vert1, filt_vert2, filt_vert3;
+ v16u8 mask0, mask1, mask2, mask3;
+ v8i16 horiz_out0, horiz_out1, horiz_out2, horiz_out3;
+ v8i16 horiz_out4, horiz_out5, horiz_out6, horiz_out7;
+ v8i16 horiz_out8, horiz_out9, horiz_out10;
+ v8i16 out0, out1, out2, out3, out4, out5, out6, out7, out8, out9;
+ v8i16 tmp0, tmp1, tmp2, tmp3;
+
+ mask0 = LOAD_UB(&mc_filt_mask_arr[0]);
+
+ src -= (3 + 3 * src_stride);
+
+ /* rearranging filter */
+ filt_horiz = LOAD_SH(filter_horiz);
+ filt_horiz0 = (v16i8)__msa_splati_h(filt_horiz, 0);
+ filt_horiz1 = (v16i8)__msa_splati_h(filt_horiz, 1);
+ filt_horiz2 = (v16i8)__msa_splati_h(filt_horiz, 2);
+ filt_horiz3 = (v16i8)__msa_splati_h(filt_horiz, 3);
+
+ mask1 = mask0 + 2;
+ mask2 = mask0 + 4;
+ mask3 = mask0 + 6;
+
+ LOAD_7VECS_SB(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+ src += (7 * src_stride);
+
+ XORI_B_7VECS_SB(src0, src1, src2, src3, src4, src5, src6,
+ src0, src1, src2, src3, src4, src5, src6, 128);
+
+ horiz_out0 = HORIZ_8TAP_FILT(src0, mask0, mask1, mask2, mask3, filt_horiz0,
+ filt_horiz1, filt_horiz2, filt_horiz3);
+ horiz_out1 = HORIZ_8TAP_FILT(src1, mask0, mask1, mask2, mask3, filt_horiz0,
+ filt_horiz1, filt_horiz2, filt_horiz3);
+ horiz_out2 = HORIZ_8TAP_FILT(src2, mask0, mask1, mask2, mask3, filt_horiz0,
+ filt_horiz1, filt_horiz2, filt_horiz3);
+ horiz_out3 = HORIZ_8TAP_FILT(src3, mask0, mask1, mask2, mask3, filt_horiz0,
+ filt_horiz1, filt_horiz2, filt_horiz3);
+ horiz_out4 = HORIZ_8TAP_FILT(src4, mask0, mask1, mask2, mask3, filt_horiz0,
+ filt_horiz1, filt_horiz2, filt_horiz3);
+ horiz_out5 = HORIZ_8TAP_FILT(src5, mask0, mask1, mask2, mask3, filt_horiz0,
+ filt_horiz1, filt_horiz2, filt_horiz3);
+ horiz_out6 = HORIZ_8TAP_FILT(src6, mask0, mask1, mask2, mask3, filt_horiz0,
+ filt_horiz1, filt_horiz2, filt_horiz3);
+
+ filt = LOAD_SH(filter_vert);
+ filt_vert0 = __msa_splati_h(filt, 0);
+ filt_vert1 = __msa_splati_h(filt, 1);
+ filt_vert2 = __msa_splati_h(filt, 2);
+ filt_vert3 = __msa_splati_h(filt, 3);
+
+ out0 = (v8i16)__msa_ilvev_b((v16i8)horiz_out1, (v16i8)horiz_out0);
+ out1 = (v8i16)__msa_ilvev_b((v16i8)horiz_out3, (v16i8)horiz_out2);
+ out2 = (v8i16)__msa_ilvev_b((v16i8)horiz_out5, (v16i8)horiz_out4);
+ out4 = (v8i16)__msa_ilvev_b((v16i8)horiz_out2, (v16i8)horiz_out1);
+ out5 = (v8i16)__msa_ilvev_b((v16i8)horiz_out4, (v16i8)horiz_out3);
+ out6 = (v8i16)__msa_ilvev_b((v16i8)horiz_out6, (v16i8)horiz_out5);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LOAD_4VECS_SB(src, src_stride, src7, src8, src9, src10);
+ src += (4 * src_stride);
+
+ XORI_B_4VECS_SB(src7, src8, src9, src10, src7, src8, src9, src10, 128);
+
+ horiz_out7 = HORIZ_8TAP_FILT(src7, mask0, mask1, mask2, mask3, filt_horiz0,
+ filt_horiz1, filt_horiz2, filt_horiz3);
+
+ out3 = (v8i16)__msa_ilvev_b((v16i8)horiz_out7, (v16i8)horiz_out6);
+ tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vert0, filt_vert1,
+ filt_vert2, filt_vert3);
+ tmp0 = SRARI_SATURATE_SIGNED_H(tmp0, FILTER_BITS, 7);
+
+ horiz_out8 = HORIZ_8TAP_FILT(src8, mask0, mask1, mask2, mask3, filt_horiz0,
+ filt_horiz1, filt_horiz2, filt_horiz3);
+
+ out7 = (v8i16)__msa_ilvev_b((v16i8)horiz_out8, (v16i8)horiz_out7);
+ tmp1 = FILT_8TAP_DPADD_S_H(out4, out5, out6, out7, filt_vert0, filt_vert1,
+ filt_vert2, filt_vert3);
+ tmp1 = SRARI_SATURATE_SIGNED_H(tmp1, FILTER_BITS, 7);
+
+ horiz_out9 = HORIZ_8TAP_FILT(src9, mask0, mask1, mask2, mask3, filt_horiz0,
+ filt_horiz1, filt_horiz2, filt_horiz3);
+
+ out8 = (v8i16)__msa_ilvev_b((v16i8)horiz_out9, (v16i8)horiz_out8);
+ tmp2 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out8, filt_vert0, filt_vert1,
+ filt_vert2, filt_vert3);
+ tmp2 = SRARI_SATURATE_SIGNED_H(tmp2, FILTER_BITS, 7);
+
+ horiz_out10 = HORIZ_8TAP_FILT(src10, mask0, mask1, mask2, mask3,
+ filt_horiz0, filt_horiz1, filt_horiz2,
+ filt_horiz3);
+
+ out9 = (v8i16)__msa_ilvev_b((v16i8)horiz_out10, (v16i8)horiz_out9);
+ tmp3 = FILT_8TAP_DPADD_S_H(out5, out6, out7, out9, filt_vert0, filt_vert1,
+ filt_vert2, filt_vert3);
+ tmp3 = SRARI_SATURATE_SIGNED_H(tmp3, FILTER_BITS, 7);
+
+ PCKEV_B_4_XORI128_STORE_8_BYTES_4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ horiz_out6 = horiz_out10;
+
+ out0 = out2;
+ out1 = out3;
+ out2 = out8;
+ out4 = out6;
+ out5 = out7;
+ out6 = out9;
+ }
+}
+
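+/* The 16-, 32- and 64-wide H+V 8-tap variants below tile the 8-wide
+ * kernel across the block, eight columns at a time. */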
+static void common_hv_8ht_8vt_16w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert,
+ int32_t height) {
+ int32_t multiple8_cnt;
+ for (multiple8_cnt = 2; multiple8_cnt--;) {
+ common_hv_8ht_8vt_8w_msa(src, src_stride, dst, dst_stride, filter_horiz,
+ filter_vert, height);
+ src += 8;
+ dst += 8;
+ }
+}
+
+static void common_hv_8ht_8vt_32w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert,
+ int32_t height) {
+ int32_t multiple8_cnt;
+ for (multiple8_cnt = 4; multiple8_cnt--;) {
+ common_hv_8ht_8vt_8w_msa(src, src_stride, dst, dst_stride, filter_horiz,
+ filter_vert, height);
+ src += 8;
+ dst += 8;
+ }
+}
+
+static void common_hv_8ht_8vt_64w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert,
+ int32_t height) {
+ int32_t multiple8_cnt;
+ for (multiple8_cnt = 8; multiple8_cnt--;) {
+ common_hv_8ht_8vt_8w_msa(src, src_stride, dst, dst_stride, filter_horiz,
+ filter_vert, height);
+ src += 8;
+ dst += 8;
+ }
+}
+
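+/* Combined 2-tap (bilinear) horizontal + vertical path for 4x4 blocks:
+ * each stage is one dot product per row pair followed by a rounding
+ * shift and a clamp to 8 bits. */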
+static void common_hv_2ht_2vt_4x4_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz,
+ int8_t *filter_vert) {
+ uint32_t out0, out1, out2, out3;
+ v16i8 src0, src1, src2, src3, src4, mask;
+ v16u8 res0, res1, horiz_vec;
+ v16u8 filt_vert, filt_horiz, vec0, vec1;
+ v8u16 filt, tmp0, tmp1;
+ v8u16 horiz_out0, horiz_out1, horiz_out2, horiz_out3, horiz_out4;
+
+ mask = LOAD_SB(&mc_filt_mask_arr[16]);
+
+ /* rearranging filter */
+ filt = LOAD_UH(filter_horiz);
+ filt_horiz = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+ filt = LOAD_UH(filter_vert);
+ filt_vert = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+ LOAD_5VECS_SB(src, src_stride, src0, src1, src2, src3, src4);
+
+ horiz_vec = (v16u8)__msa_vshf_b(mask, src1, src0);
+ horiz_out0 = __msa_dotp_u_h(horiz_vec, filt_horiz);
+ horiz_out0 = SRARI_SATURATE_UNSIGNED_H(horiz_out0, FILTER_BITS, 7);
+
+ horiz_vec = (v16u8)__msa_vshf_b(mask, src3, src2);
+ horiz_out2 = __msa_dotp_u_h(horiz_vec, filt_horiz);
+ horiz_out2 = SRARI_SATURATE_UNSIGNED_H(horiz_out2, FILTER_BITS, 7);
+
+ horiz_vec = (v16u8)__msa_vshf_b(mask, src4, src4);
+ horiz_out4 = __msa_dotp_u_h(horiz_vec, filt_horiz);
+ horiz_out4 = SRARI_SATURATE_UNSIGNED_H(horiz_out4, FILTER_BITS, 7);
+
+ horiz_out1 = (v8u16)__msa_sldi_b((v16i8)horiz_out2, (v16i8)horiz_out0, 8);
+ horiz_out3 = (v8u16)__msa_pckod_d((v2i64)horiz_out4, (v2i64)horiz_out2);
+
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out1, (v16i8)horiz_out0);
+ vec1 = (v16u8)__msa_ilvev_b((v16i8)horiz_out3, (v16i8)horiz_out2);
+
+ tmp0 = __msa_dotp_u_h(vec0, filt_vert);
+ tmp1 = __msa_dotp_u_h(vec1, filt_vert);
+ tmp0 = SRARI_SATURATE_UNSIGNED_H(tmp0, FILTER_BITS, 7);
+ tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7);
+
+ res0 = (v16u8)__msa_pckev_b((v16i8)tmp0, (v16i8)tmp0);
+ res1 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp1);
+
+ out0 = __msa_copy_u_w((v4i32)res0, 0);
+ out1 = __msa_copy_u_w((v4i32)res0, 1);
+ out2 = __msa_copy_u_w((v4i32)res1, 0);
+ out3 = __msa_copy_u_w((v4i32)res1, 1);
+
+ STORE_WORD(dst, out0);
+ dst += dst_stride;
+ STORE_WORD(dst, out1);
+ dst += dst_stride;
+ STORE_WORD(dst, out2);
+ dst += dst_stride;
+ STORE_WORD(dst, out3);
+}
+
+static void common_hv_2ht_2vt_4x8_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz,
+ int8_t *filter_vert) {
+ uint32_t out0, out1, out2, out3;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask;
+ v16u8 filt_horiz, filt_vert, horiz_vec;
+ v16u8 vec0, vec1, vec2, vec3;
+ v8u16 horiz_out0, horiz_out1, horiz_out2, horiz_out3;
+ v8u16 vec4, vec5, vec6, vec7, filt;
+ v8u16 horiz_out4, horiz_out5, horiz_out6, horiz_out7, horiz_out8;
+ v16i8 res0, res1, res2, res3;
+
+ mask = LOAD_SB(&mc_filt_mask_arr[16]);
+
+ /* rearranging filter */
+ filt = LOAD_UH(filter_horiz);
+ filt_horiz = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+ filt = LOAD_UH(filter_vert);
+ filt_vert = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+ LOAD_8VECS_SB(src, src_stride,
+ src0, src1, src2, src3, src4, src5, src6, src7);
+ src += (8 * src_stride);
+ src8 = LOAD_SB(src);
+
+ horiz_vec = (v16u8)__msa_vshf_b(mask, src1, src0);
+ horiz_out0 = __msa_dotp_u_h(horiz_vec, filt_horiz);
+ horiz_out0 = SRARI_SATURATE_UNSIGNED_H(horiz_out0, FILTER_BITS, 7);
+
+ horiz_vec = (v16u8)__msa_vshf_b(mask, src3, src2);
+ horiz_out2 = __msa_dotp_u_h(horiz_vec, filt_horiz);
+ horiz_out2 = SRARI_SATURATE_UNSIGNED_H(horiz_out2, FILTER_BITS, 7);
+
+ horiz_vec = (v16u8)__msa_vshf_b(mask, src5, src4);
+ horiz_out4 = __msa_dotp_u_h(horiz_vec, filt_horiz);
+ horiz_out4 = SRARI_SATURATE_UNSIGNED_H(horiz_out4, FILTER_BITS, 7);
+
+ horiz_vec = (v16u8)__msa_vshf_b(mask, src7, src6);
+ horiz_out6 = __msa_dotp_u_h(horiz_vec, filt_horiz);
+ horiz_out6 = SRARI_SATURATE_UNSIGNED_H(horiz_out6, FILTER_BITS, 7);
+
+ horiz_vec = (v16u8)__msa_vshf_b(mask, src8, src8);
+ horiz_out8 = __msa_dotp_u_h(horiz_vec, filt_horiz);
+ horiz_out8 = SRARI_SATURATE_UNSIGNED_H(horiz_out8, FILTER_BITS, 7);
+
+ horiz_out1 = (v8u16)__msa_sldi_b((v16i8)horiz_out2, (v16i8)horiz_out0, 8);
+ horiz_out3 = (v8u16)__msa_sldi_b((v16i8)horiz_out4, (v16i8)horiz_out2, 8);
+ horiz_out5 = (v8u16)__msa_sldi_b((v16i8)horiz_out6, (v16i8)horiz_out4, 8);
+ horiz_out7 = (v8u16)__msa_pckod_d((v2i64)horiz_out8, (v2i64)horiz_out6);
+
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out1, (v16i8)horiz_out0);
+ vec1 = (v16u8)__msa_ilvev_b((v16i8)horiz_out3, (v16i8)horiz_out2);
+ vec2 = (v16u8)__msa_ilvev_b((v16i8)horiz_out5, (v16i8)horiz_out4);
+ vec3 = (v16u8)__msa_ilvev_b((v16i8)horiz_out7, (v16i8)horiz_out6);
+
+ vec4 = __msa_dotp_u_h(vec0, filt_vert);
+ vec5 = __msa_dotp_u_h(vec1, filt_vert);
+ vec6 = __msa_dotp_u_h(vec2, filt_vert);
+ vec7 = __msa_dotp_u_h(vec3, filt_vert);
+
+ vec4 = SRARI_SATURATE_UNSIGNED_H(vec4, FILTER_BITS, 7);
+ vec5 = SRARI_SATURATE_UNSIGNED_H(vec5, FILTER_BITS, 7);
+ vec6 = SRARI_SATURATE_UNSIGNED_H(vec6, FILTER_BITS, 7);
+ vec7 = SRARI_SATURATE_UNSIGNED_H(vec7, FILTER_BITS, 7);
+
+ res0 = __msa_pckev_b((v16i8)vec4, (v16i8)vec4);
+ res1 = __msa_pckev_b((v16i8)vec5, (v16i8)vec5);
+ res2 = __msa_pckev_b((v16i8)vec6, (v16i8)vec6);
+ res3 = __msa_pckev_b((v16i8)vec7, (v16i8)vec7);
+
+ out0 = __msa_copy_u_w((v4i32)res0, 0);
+ out1 = __msa_copy_u_w((v4i32)res0, 1);
+ out2 = __msa_copy_u_w((v4i32)res1, 0);
+ out3 = __msa_copy_u_w((v4i32)res1, 1);
+
+ STORE_WORD(dst, out0);
+ dst += dst_stride;
+ STORE_WORD(dst, out1);
+ dst += dst_stride;
+ STORE_WORD(dst, out2);
+ dst += dst_stride;
+ STORE_WORD(dst, out3);
+ dst += dst_stride;
+
+ out0 = __msa_copy_u_w((v4i32)res2, 0);
+ out1 = __msa_copy_u_w((v4i32)res2, 1);
+ out2 = __msa_copy_u_w((v4i32)res3, 0);
+ out3 = __msa_copy_u_w((v4i32)res3, 1);
+
+ STORE_WORD(dst, out0);
+ dst += dst_stride;
+ STORE_WORD(dst, out1);
+ dst += dst_stride;
+ STORE_WORD(dst, out2);
+ dst += dst_stride;
+ STORE_WORD(dst, out3);
+}
+
+static void common_hv_2ht_2vt_4w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz,
+ int8_t *filter_vert,
+ int32_t height) {
+ if (4 == height) {
+ common_hv_2ht_2vt_4x4_msa(src, src_stride, dst, dst_stride,
+ filter_horiz, filter_vert);
+ } else if (8 == height) {
+ common_hv_2ht_2vt_4x8_msa(src, src_stride, dst, dst_stride,
+ filter_horiz, filter_vert);
+ }
+}
+
+static void common_hv_2ht_2vt_8x4_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz,
+ int8_t *filter_vert) {
+ v16i8 src0, src1, src2, src3, src4, mask;
+ v16u8 filt_horiz, filt_vert, horiz_vec;
+ v16u8 vec0, vec1, vec2, vec3;
+ v8u16 horiz_out0, horiz_out1;
+ v8u16 tmp0, tmp1, tmp2, tmp3;
+ v8i16 filt;
+
+ mask = LOAD_SB(&mc_filt_mask_arr[0]);
+
+ /* rearranging filter */
+ filt = LOAD_SH(filter_horiz);
+ filt_horiz = (v16u8)__msa_splati_h(filt, 0);
+
+ filt = LOAD_SH(filter_vert);
+ filt_vert = (v16u8)__msa_splati_h(filt, 0);
+
+ LOAD_5VECS_SB(src, src_stride, src0, src1, src2, src3, src4);
+ src += (5 * src_stride);
+
+ horiz_vec = (v16u8)__msa_vshf_b(mask, src0, src0);
+ horiz_out0 = __msa_dotp_u_h(horiz_vec, filt_horiz);
+ horiz_out0 = SRARI_SATURATE_UNSIGNED_H(horiz_out0, FILTER_BITS, 7);
+
+ horiz_vec = (v16u8)__msa_vshf_b(mask, src1, src1);
+ horiz_out1 = __msa_dotp_u_h(horiz_vec, filt_horiz);
+ horiz_out1 = SRARI_SATURATE_UNSIGNED_H(horiz_out1, FILTER_BITS, 7);
+
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out1, (v16i8)horiz_out0);
+ tmp0 = __msa_dotp_u_h(vec0, filt_vert);
+
+ horiz_vec = (v16u8)__msa_vshf_b(mask, src2, src2);
+ horiz_out0 = __msa_dotp_u_h(horiz_vec, filt_horiz);
+ horiz_out0 = SRARI_SATURATE_UNSIGNED_H(horiz_out0, FILTER_BITS, 7);
+
+ vec1 = (v16u8)__msa_ilvev_b((v16i8)horiz_out0, (v16i8)horiz_out1);
+ tmp1 = __msa_dotp_u_h(vec1, filt_vert);
+
+ horiz_vec = (v16u8)__msa_vshf_b(mask, src3, src3);
+ horiz_out1 = __msa_dotp_u_h(horiz_vec, filt_horiz);
+ horiz_out1 = SRARI_SATURATE_UNSIGNED_H(horiz_out1, FILTER_BITS, 7);
+
+ vec2 = (v16u8)__msa_ilvev_b((v16i8)horiz_out1, (v16i8)horiz_out0);
+ tmp2 = __msa_dotp_u_h(vec2, filt_vert);
+
+ horiz_vec = (v16u8)__msa_vshf_b(mask, src4, src4);
+ horiz_out0 = __msa_dotp_u_h(horiz_vec, filt_horiz);
+ horiz_out0 = SRARI_SATURATE_UNSIGNED_H(horiz_out0, FILTER_BITS, 7);
+
+ vec3 = (v16u8)__msa_ilvev_b((v16i8)horiz_out0, (v16i8)horiz_out1);
+ tmp3 = __msa_dotp_u_h(vec3, filt_vert);
+
+ tmp0 = SRARI_SATURATE_UNSIGNED_H(tmp0, FILTER_BITS, 7);
+ tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7);
+ tmp2 = SRARI_SATURATE_UNSIGNED_H(tmp2, FILTER_BITS, 7);
+ tmp3 = SRARI_SATURATE_UNSIGNED_H(tmp3, FILTER_BITS, 7);
+
+ PCKEV_B_STORE_8_BYTES_4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
+}
+
+static void common_hv_2ht_2vt_8x8mult_msa(const uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst,
+ int32_t dst_stride,
+ int8_t *filter_horiz,
+ int8_t *filter_vert,
+ int32_t height) {
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, mask;
+ v16u8 filt_horiz, filt_vert, vec0, horiz_vec;
+ v8u16 horiz_out0, horiz_out1;
+ v8u16 tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
+ v8i16 filt;
+
+ mask = LOAD_SB(&mc_filt_mask_arr[0]);
+
+ /* rearranging filter */
+ filt = LOAD_SH(filter_horiz);
+ filt_horiz = (v16u8)__msa_splati_h(filt, 0);
+
+ filt = LOAD_SH(filter_vert);
+ filt_vert = (v16u8)__msa_splati_h(filt, 0);
+
+ src0 = LOAD_SB(src);
+ src += src_stride;
+
+ horiz_vec = (v16u8)__msa_vshf_b(mask, src0, src0);
+ horiz_out0 = __msa_dotp_u_h(horiz_vec, filt_horiz);
+ horiz_out0 = SRARI_SATURATE_UNSIGNED_H(horiz_out0, FILTER_BITS, 7);
+
+ for (loop_cnt = (height >> 3); loop_cnt--;) {
+ LOAD_4VECS_SB(src, src_stride, src1, src2, src3, src4);
+ src += (4 * src_stride);
+
+ horiz_vec = (v16u8)__msa_vshf_b(mask, src1, src1);
+ horiz_out1 = __msa_dotp_u_h(horiz_vec, filt_horiz);
+ horiz_out1 = SRARI_SATURATE_UNSIGNED_H(horiz_out1, FILTER_BITS, 7);
+
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out1, (v16i8)horiz_out0);
+ tmp1 = __msa_dotp_u_h(vec0, filt_vert);
+
+ horiz_vec = (v16u8)__msa_vshf_b(mask, src2, src2);
+ horiz_out0 = __msa_dotp_u_h(horiz_vec, filt_horiz);
+ horiz_out0 = SRARI_SATURATE_UNSIGNED_H(horiz_out0, FILTER_BITS, 7);
+
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out0, (v16i8)horiz_out1);
+    tmp2 = __msa_dotp_u_h(vec0, filt_vert);
+
+ tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7);
+ tmp2 = SRARI_SATURATE_UNSIGNED_H(tmp2, FILTER_BITS, 7);
+
+ horiz_vec = (v16u8)__msa_vshf_b(mask, src3, src3);
+ horiz_out1 = __msa_dotp_u_h(horiz_vec, filt_horiz);
+ horiz_out1 = SRARI_SATURATE_UNSIGNED_H(horiz_out1, FILTER_BITS, 7);
+
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out1, (v16i8)horiz_out0);
+ tmp3 = __msa_dotp_u_h(vec0, filt_vert);
+
+ horiz_vec = (v16u8)__msa_vshf_b(mask, src4, src4);
+ horiz_out0 = __msa_dotp_u_h(horiz_vec, filt_horiz);
+ horiz_out0 = SRARI_SATURATE_UNSIGNED_H(horiz_out0, FILTER_BITS, 7);
+
+ LOAD_4VECS_SB(src, src_stride, src1, src2, src3, src4);
+ src += (4 * src_stride);
+
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out0, (v16i8)horiz_out1);
+ tmp4 = __msa_dotp_u_h(vec0, filt_vert);
+
+ tmp3 = SRARI_SATURATE_UNSIGNED_H(tmp3, FILTER_BITS, 7);
+ tmp4 = SRARI_SATURATE_UNSIGNED_H(tmp4, FILTER_BITS, 7);
+
+ PCKEV_B_STORE_8_BYTES_4(tmp1, tmp2, tmp3, tmp4, dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ horiz_vec = (v16u8)__msa_vshf_b(mask, src1, src1);
+ horiz_out1 = __msa_dotp_u_h(horiz_vec, filt_horiz);
+ horiz_out1 = SRARI_SATURATE_UNSIGNED_H(horiz_out1, FILTER_BITS, 7);
+
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out1, (v16i8)horiz_out0);
+ tmp5 = __msa_dotp_u_h(vec0, filt_vert);
+
+ horiz_vec = (v16u8)__msa_vshf_b(mask, src2, src2);
+ horiz_out0 = __msa_dotp_u_h(horiz_vec, filt_horiz);
+ horiz_out0 = SRARI_SATURATE_UNSIGNED_H(horiz_out0, FILTER_BITS, 7);
+
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out0, (v16i8)horiz_out1);
+ tmp6 = __msa_dotp_u_h(vec0, filt_vert);
+
+ horiz_vec = (v16u8)__msa_vshf_b(mask, src3, src3);
+ horiz_out1 = __msa_dotp_u_h(horiz_vec, filt_horiz);
+ horiz_out1 = SRARI_SATURATE_UNSIGNED_H(horiz_out1, FILTER_BITS, 7);
+
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out1, (v16i8)horiz_out0);
+ tmp7 = __msa_dotp_u_h(vec0, filt_vert);
+
+ horiz_vec = (v16u8)__msa_vshf_b(mask, src4, src4);
+ horiz_out0 = __msa_dotp_u_h(horiz_vec, filt_horiz);
+ horiz_out0 = SRARI_SATURATE_UNSIGNED_H(horiz_out0, FILTER_BITS, 7);
+
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out0, (v16i8)horiz_out1);
+ tmp8 = __msa_dotp_u_h(vec0, filt_vert);
+
+ tmp5 = SRARI_SATURATE_UNSIGNED_H(tmp5, FILTER_BITS, 7);
+ tmp6 = SRARI_SATURATE_UNSIGNED_H(tmp6, FILTER_BITS, 7);
+ tmp7 = SRARI_SATURATE_UNSIGNED_H(tmp7, FILTER_BITS, 7);
+ tmp8 = SRARI_SATURATE_UNSIGNED_H(tmp8, FILTER_BITS, 7);
+
+ PCKEV_B_STORE_8_BYTES_4(tmp5, tmp6, tmp7, tmp8, dst, dst_stride);
+ dst += (4 * dst_stride);
+ }
+}
+
+static void common_hv_2ht_2vt_8w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert,
+ int32_t height) {
+ if (4 == height) {
+ common_hv_2ht_2vt_8x4_msa(src, src_stride, dst, dst_stride, filter_horiz,
+ filter_vert);
+ } else {
+ common_hv_2ht_2vt_8x8mult_msa(src, src_stride, dst, dst_stride,
+ filter_horiz, filter_vert, height);
+ }
+}
+
+static void common_hv_2ht_2vt_16w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert,
+ int32_t height) {
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
+ v16u8 filt_horiz, filt_vert, vec0, horiz_vec;
+ v8u16 horiz_vec0, horiz_vec1, tmp1, tmp2;
+ v8u16 horiz_out0, horiz_out1, horiz_out2, horiz_out3;
+ v8i16 filt;
+
+ mask = LOAD_SB(&mc_filt_mask_arr[0]);
+
+ /* rearranging filter */
+ filt = LOAD_SH(filter_horiz);
+ filt_horiz = (v16u8)__msa_splati_h(filt, 0);
+
+ filt = LOAD_SH(filter_vert);
+ filt_vert = (v16u8)__msa_splati_h(filt, 0);
+
+ src0 = LOAD_SB(src);
+ src1 = LOAD_SB(src + 8);
+
+ horiz_vec = (v16u8)__msa_vshf_b(mask, src0, src0);
+ horiz_vec0 = __msa_dotp_u_h(horiz_vec, filt_horiz);
+ horiz_out0 = SRARI_SATURATE_UNSIGNED_H(horiz_vec0, FILTER_BITS, 7);
+
+ horiz_vec = (v16u8)__msa_vshf_b(mask, src1, src1);
+ horiz_vec1 = __msa_dotp_u_h(horiz_vec, filt_horiz);
+ horiz_out2 = SRARI_SATURATE_UNSIGNED_H(horiz_vec1, FILTER_BITS, 7);
+
+ src += src_stride;
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LOAD_4VECS_SB(src, src_stride, src0, src2, src4, src6);
+ LOAD_4VECS_SB(src + 8, src_stride, src1, src3, src5, src7);
+ src += (4 * src_stride);
+
+ horiz_vec = (v16u8)__msa_vshf_b(mask, src0, src0);
+ horiz_vec0 = __msa_dotp_u_h(horiz_vec, filt_horiz);
+ horiz_out1 = SRARI_SATURATE_UNSIGNED_H(horiz_vec0, FILTER_BITS, 7);
+
+ horiz_vec = (v16u8)__msa_vshf_b(mask, src1, src1);
+ horiz_vec1 = __msa_dotp_u_h(horiz_vec, filt_horiz);
+ horiz_out3 = SRARI_SATURATE_UNSIGNED_H(horiz_vec1, FILTER_BITS, 7);
+
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out1, (v16i8)horiz_out0);
+ tmp1 = __msa_dotp_u_h(vec0, filt_vert);
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out3, (v16i8)horiz_out2);
+ tmp2 = __msa_dotp_u_h(vec0, filt_vert);
+ tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7);
+ tmp2 = SRARI_SATURATE_UNSIGNED_H(tmp2, FILTER_BITS, 7);
+
+ PCKEV_B_STORE_VEC(tmp2, tmp1, dst);
+ dst += dst_stride;
+
+ horiz_vec = (v16u8)__msa_vshf_b(mask, src2, src2);
+ horiz_vec0 = __msa_dotp_u_h(horiz_vec, filt_horiz);
+ horiz_out0 = SRARI_SATURATE_UNSIGNED_H(horiz_vec0, FILTER_BITS, 7);
+
+ horiz_vec = (v16u8)__msa_vshf_b(mask, src3, src3);
+ horiz_vec1 = __msa_dotp_u_h(horiz_vec, filt_horiz);
+ horiz_out2 = SRARI_SATURATE_UNSIGNED_H(horiz_vec1, FILTER_BITS, 7);
+
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out0, (v16i8)horiz_out1);
+ tmp1 = __msa_dotp_u_h(vec0, filt_vert);
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out2, (v16i8)horiz_out3);
+ tmp2 = __msa_dotp_u_h(vec0, filt_vert);
+ tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7);
+ tmp2 = SRARI_SATURATE_UNSIGNED_H(tmp2, FILTER_BITS, 7);
+
+ PCKEV_B_STORE_VEC(tmp2, tmp1, dst);
+ dst += dst_stride;
+
+ horiz_vec = (v16u8)__msa_vshf_b(mask, src4, src4);
+ horiz_vec0 = __msa_dotp_u_h(horiz_vec, filt_horiz);
+ horiz_out1 = SRARI_SATURATE_UNSIGNED_H(horiz_vec0, FILTER_BITS, 7);
+
+ horiz_vec = (v16u8)__msa_vshf_b(mask, src5, src5);
+ horiz_vec1 = __msa_dotp_u_h(horiz_vec, filt_horiz);
+ horiz_out3 = SRARI_SATURATE_UNSIGNED_H(horiz_vec1, FILTER_BITS, 7);
+
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out1, (v16i8)horiz_out0);
+ tmp1 = __msa_dotp_u_h(vec0, filt_vert);
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out3, (v16i8)horiz_out2);
+ tmp2 = __msa_dotp_u_h(vec0, filt_vert);
+ tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7);
+ tmp2 = SRARI_SATURATE_UNSIGNED_H(tmp2, FILTER_BITS, 7);
+
+ PCKEV_B_STORE_VEC(tmp2, tmp1, dst);
+ dst += dst_stride;
+
+ horiz_vec = (v16u8)__msa_vshf_b(mask, src6, src6);
+ horiz_vec0 = __msa_dotp_u_h(horiz_vec, filt_horiz);
+ horiz_out0 = SRARI_SATURATE_UNSIGNED_H(horiz_vec0, FILTER_BITS, 7);
+
+ horiz_vec = (v16u8)__msa_vshf_b(mask, src7, src7);
+ horiz_vec1 = __msa_dotp_u_h(horiz_vec, filt_horiz);
+ horiz_out2 = SRARI_SATURATE_UNSIGNED_H(horiz_vec1, FILTER_BITS, 7);
+
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out0, (v16i8)horiz_out1);
+ tmp1 = __msa_dotp_u_h(vec0, filt_vert);
+ vec0 = (v16u8)__msa_ilvev_b((v16i8)horiz_out2, (v16i8)horiz_out3);
+ tmp2 = __msa_dotp_u_h(vec0, filt_vert);
+ tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7);
+ tmp2 = SRARI_SATURATE_UNSIGNED_H(tmp2, FILTER_BITS, 7);
+
+ PCKEV_B_STORE_VEC(tmp2, tmp1, dst);
+ dst += dst_stride;
+ }
+}
+
+static void common_hv_2ht_2vt_32w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert,
+ int32_t height) {
+ int32_t multiple8_cnt;
+ for (multiple8_cnt = 2; multiple8_cnt--;) {
+ common_hv_2ht_2vt_16w_msa(src, src_stride, dst, dst_stride, filter_horiz,
+ filter_vert, height);
+ src += 16;
+ dst += 16;
+ }
+}
+
+static void common_hv_2ht_2vt_64w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter_horiz, int8_t *filter_vert,
+ int32_t height) {
+ int32_t multiple8_cnt;
+ for (multiple8_cnt = 4; multiple8_cnt--;) {
+ common_hv_2ht_2vt_16w_msa(src, src_stride, dst, dst_stride, filter_horiz,
+ filter_vert, height);
+ src += 16;
+ dst += 16;
+ }
+}
+
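+/* Full 2-D 8-tap entry point.  Requires unit steps (no scaling), turns
+ * the identity filter into a copy, routes double-bilinear cases to the
+ * 2-tap kernels, falls back to C when only one filter is bilinear, and
+ * otherwise runs the horizontal-then-vertical 8-tap kernels. */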
+void vp9_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int32_t x_step_q4,
+ const int16_t *filter_y, int32_t y_step_q4,
+ int32_t w, int32_t h) {
+ int8_t cnt, filt_hor[8], filt_ver[8];
+
+ if (16 != x_step_q4 || 16 != y_step_q4) {
+ vp9_convolve8_c(src, src_stride, dst, dst_stride,
+ filter_x, x_step_q4, filter_y, y_step_q4,
+ w, h);
+ return;
+ }
+
+ if (((const int32_t *)filter_x)[1] == 0x800000 &&
+ ((const int32_t *)filter_y)[1] == 0x800000) {
+ vp9_convolve_copy(src, src_stride, dst, dst_stride,
+ filter_x, x_step_q4, filter_y, y_step_q4,
+ w, h);
+ return;
+ }
+
+ for (cnt = 0; cnt < 8; ++cnt) {
+ filt_hor[cnt] = filter_x[cnt];
+ filt_ver[cnt] = filter_y[cnt];
+ }
+
+ if (((const int32_t *)filter_x)[0] == 0 &&
+ ((const int32_t *)filter_y)[0] == 0) {
+ switch (w) {
+ case 4:
+ common_hv_2ht_2vt_4w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ &filt_hor[3], &filt_ver[3], (int32_t)h);
+ break;
+ case 8:
+ common_hv_2ht_2vt_8w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ &filt_hor[3], &filt_ver[3], (int32_t)h);
+ break;
+ case 16:
+ common_hv_2ht_2vt_16w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ &filt_hor[3], &filt_ver[3], (int32_t)h);
+ break;
+ case 32:
+ common_hv_2ht_2vt_32w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ &filt_hor[3], &filt_ver[3], (int32_t)h);
+ break;
+ case 64:
+ common_hv_2ht_2vt_64w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ &filt_hor[3], &filt_ver[3], (int32_t)h);
+ break;
+ default:
+ vp9_convolve8_c(src, src_stride, dst, dst_stride,
+ filter_x, x_step_q4, filter_y, y_step_q4,
+ w, h);
+ break;
+ }
+ } else if (((const int32_t *)filter_x)[0] == 0 ||
+ ((const int32_t *)filter_y)[0] == 0) {
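+    /* Exactly one of the two filters is bilinear here; there is no MSA
+     * kernel for a mixed 2-tap/8-tap pass, so use the C reference. */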
+ vp9_convolve8_c(src, src_stride, dst, dst_stride,
+ filter_x, x_step_q4, filter_y, y_step_q4,
+ w, h);
+ } else {
+ switch (w) {
+ case 4:
+ common_hv_8ht_8vt_4w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ filt_hor, filt_ver, (int32_t)h);
+ break;
+ case 8:
+ common_hv_8ht_8vt_8w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ filt_hor, filt_ver, (int32_t)h);
+ break;
+ case 16:
+ common_hv_8ht_8vt_16w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ filt_hor, filt_ver, (int32_t)h);
+ break;
+ case 32:
+ common_hv_8ht_8vt_32w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ filt_hor, filt_ver, (int32_t)h);
+ break;
+ case 64:
+ common_hv_8ht_8vt_64w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ filt_hor, filt_ver, (int32_t)h);
+ break;
+ default:
+ vp9_convolve8_c(src, src_stride, dst, dst_stride,
+ filter_x, x_step_q4, filter_y, y_step_q4,
+ w, h);
+ break;
+ }
+ }
+}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_convolve8_vert_msa.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_convolve8_vert_msa.c
new file mode 100644
index 00000000000..6b71ec1c0e4
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_convolve8_vert_msa.c
@@ -0,0 +1,856 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp9_rtcd.h"
+#include "vp9/common/mips/msa/vp9_convolve_msa.h"
+
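+/* Vertical 8-tap filter for 4-wide blocks.  Adjacent rows are byte
+ * interleaved so every __msa_dotp applies one pair of taps; four such
+ * products accumulate the full 8-tap sum, and the interleaved history
+ * slides down four rows per loop iteration. */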
+static void common_vt_8t_4w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+ v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
+ v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
+ v16i8 src2110, src4332, src6554, src8776, src10998;
+ v8i16 filt, out10, out32;
+ v16i8 filt0, filt1, filt2, filt3;
+
+ src -= (3 * src_stride);
+
+ filt = LOAD_SH(filter);
+ filt0 = (v16i8)__msa_splati_h(filt, 0);
+ filt1 = (v16i8)__msa_splati_h(filt, 1);
+ filt2 = (v16i8)__msa_splati_h(filt, 2);
+ filt3 = (v16i8)__msa_splati_h(filt, 3);
+
+ LOAD_7VECS_SB(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+ src += (7 * src_stride);
+
+ ILVR_B_6VECS_SB(src0, src2, src4, src1, src3, src5,
+ src1, src3, src5, src2, src4, src6,
+ src10_r, src32_r, src54_r, src21_r, src43_r, src65_r);
+
+ ILVR_D_3VECS_SB(src2110, src21_r, src10_r, src4332, src43_r, src32_r,
+ src6554, src65_r, src54_r);
+
+ XORI_B_3VECS_SB(src2110, src4332, src6554, src2110, src4332, src6554, 128);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LOAD_4VECS_SB(src, src_stride, src7, src8, src9, src10);
+ src += (4 * src_stride);
+
+ ILVR_B_4VECS_SB(src6, src7, src8, src9, src7, src8, src9, src10,
+ src76_r, src87_r, src98_r, src109_r);
+
+ ILVR_D_2VECS_SB(src8776, src87_r, src76_r, src10998, src109_r, src98_r);
+
+ XORI_B_2VECS_SB(src8776, src10998, src8776, src10998, 128);
+
+ out10 = FILT_8TAP_DPADD_S_H(src2110, src4332, src6554, src8776,
+ filt0, filt1, filt2, filt3);
+ out32 = FILT_8TAP_DPADD_S_H(src4332, src6554, src8776, src10998,
+ filt0, filt1, filt2, filt3);
+
+ out10 = SRARI_SATURATE_SIGNED_H(out10, FILTER_BITS, 7);
+ out32 = SRARI_SATURATE_SIGNED_H(out32, FILTER_BITS, 7);
+
+ PCKEV_2B_XORI128_STORE_4_BYTES_4(out10, out32, dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ src2110 = src6554;
+ src4332 = src8776;
+ src6554 = src10998;
+
+ src6 = src10;
+ }
+}
+
+static void common_vt_8t_8w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+ v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
+ v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
+ v16i8 filt0, filt1, filt2, filt3;
+ v8i16 filt, out0_r, out1_r, out2_r, out3_r;
+
+ src -= (3 * src_stride);
+
+ filt = LOAD_SH(filter);
+ filt0 = (v16i8)__msa_splati_h(filt, 0);
+ filt1 = (v16i8)__msa_splati_h(filt, 1);
+ filt2 = (v16i8)__msa_splati_h(filt, 2);
+ filt3 = (v16i8)__msa_splati_h(filt, 3);
+
+ LOAD_7VECS_SB(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
+ src += (7 * src_stride);
+
+ XORI_B_7VECS_SB(src0, src1, src2, src3, src4, src5, src6,
+ src0, src1, src2, src3, src4, src5, src6, 128);
+
+ ILVR_B_6VECS_SB(src0, src2, src4, src1, src3, src5,
+ src1, src3, src5, src2, src4, src6,
+ src10_r, src32_r, src54_r, src21_r, src43_r, src65_r);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LOAD_4VECS_SB(src, src_stride, src7, src8, src9, src10);
+ src += (4 * src_stride);
+
+ XORI_B_4VECS_SB(src7, src8, src9, src10, src7, src8, src9, src10, 128);
+
+ ILVR_B_4VECS_SB(src6, src7, src8, src9, src7, src8, src9, src10,
+ src76_r, src87_r, src98_r, src109_r);
+
+ out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r,
+ filt0, filt1, filt2, filt3);
+ out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r,
+ filt0, filt1, filt2, filt3);
+ out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r,
+ filt0, filt1, filt2, filt3);
+ out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r,
+ filt0, filt1, filt2, filt3);
+
+ out0_r = SRARI_SATURATE_SIGNED_H(out0_r, FILTER_BITS, 7);
+ out1_r = SRARI_SATURATE_SIGNED_H(out1_r, FILTER_BITS, 7);
+ out2_r = SRARI_SATURATE_SIGNED_H(out2_r, FILTER_BITS, 7);
+ out3_r = SRARI_SATURATE_SIGNED_H(out3_r, FILTER_BITS, 7);
+
+ PCKEV_B_4_XORI128_STORE_8_BYTES_4(out0_r, out1_r, out2_r, out3_r,
+ dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ src10_r = src54_r;
+ src32_r = src76_r;
+ src54_r = src98_r;
+ src21_r = src65_r;
+ src43_r = src87_r;
+ src65_r = src109_r;
+
+ src6 = src10;
+ }
+}
+
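+/* Vertical 8-tap core for 16-wide columns, tiled width/16 times.  Both
+ * the low (ilvr) and high (ilvl) byte interleaves of each row are
+ * filtered so every iteration emits four full 16-pixel rows. */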
+static void common_vt_8t_16w_mult_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height,
+ int32_t width) {
+ const uint8_t *src_tmp;
+ uint8_t *dst_tmp;
+ uint32_t loop_cnt, cnt;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+ v16i8 filt0, filt1, filt2, filt3;
+ v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
+ v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
+ v16i8 src10_l, src32_l, src54_l, src76_l, src98_l;
+ v16i8 src21_l, src43_l, src65_l, src87_l, src109_l;
+ v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
+ v8i16 filt;
+ v16u8 tmp0, tmp1, tmp2, tmp3;
+
+ src -= (3 * src_stride);
+
+ filt = LOAD_SH(filter);
+ filt0 = (v16i8)__msa_splati_h(filt, 0);
+ filt1 = (v16i8)__msa_splati_h(filt, 1);
+ filt2 = (v16i8)__msa_splati_h(filt, 2);
+ filt3 = (v16i8)__msa_splati_h(filt, 3);
+
+ for (cnt = (width >> 4); cnt--;) {
+ src_tmp = src;
+ dst_tmp = dst;
+
+ LOAD_7VECS_SB(src_tmp, src_stride,
+ src0, src1, src2, src3, src4, src5, src6);
+ src_tmp += (7 * src_stride);
+
+ XORI_B_7VECS_SB(src0, src1, src2, src3, src4, src5, src6,
+ src0, src1, src2, src3, src4, src5, src6, 128);
+
+ ILVR_B_6VECS_SB(src0, src2, src4, src1, src3, src5,
+ src1, src3, src5, src2, src4, src6,
+ src10_r, src32_r, src54_r, src21_r, src43_r, src65_r);
+
+ ILVL_B_6VECS_SB(src0, src2, src4, src1, src3, src5,
+ src1, src3, src5, src2, src4, src6,
+ src10_l, src32_l, src54_l, src21_l, src43_l, src65_l);
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LOAD_4VECS_SB(src_tmp, src_stride, src7, src8, src9, src10);
+ src_tmp += (4 * src_stride);
+
+ XORI_B_4VECS_SB(src7, src8, src9, src10, src7, src8, src9, src10, 128);
+
+ ILVR_B_4VECS_SB(src6, src7, src8, src9, src7, src8, src9, src10,
+ src76_r, src87_r, src98_r, src109_r);
+
+ ILVL_B_4VECS_SB(src6, src7, src8, src9, src7, src8, src9, src10,
+ src76_l, src87_l, src98_l, src109_l);
+
+ out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r,
+ filt0, filt1, filt2, filt3);
+ out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r,
+ filt0, filt1, filt2, filt3);
+ out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r,
+ filt0, filt1, filt2, filt3);
+ out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r,
+ filt0, filt1, filt2, filt3);
+
+ out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l,
+ filt0, filt1, filt2, filt3);
+ out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l,
+ filt0, filt1, filt2, filt3);
+ out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l,
+ filt0, filt1, filt2, filt3);
+ out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l,
+ filt0, filt1, filt2, filt3);
+
+ out0_r = SRARI_SATURATE_SIGNED_H(out0_r, FILTER_BITS, 7);
+ out1_r = SRARI_SATURATE_SIGNED_H(out1_r, FILTER_BITS, 7);
+ out2_r = SRARI_SATURATE_SIGNED_H(out2_r, FILTER_BITS, 7);
+ out3_r = SRARI_SATURATE_SIGNED_H(out3_r, FILTER_BITS, 7);
+ out0_l = SRARI_SATURATE_SIGNED_H(out0_l, FILTER_BITS, 7);
+ out1_l = SRARI_SATURATE_SIGNED_H(out1_l, FILTER_BITS, 7);
+ out2_l = SRARI_SATURATE_SIGNED_H(out2_l, FILTER_BITS, 7);
+ out3_l = SRARI_SATURATE_SIGNED_H(out3_l, FILTER_BITS, 7);
+
+ out0_r = (v8i16)__msa_pckev_b((v16i8)out0_l, (v16i8)out0_r);
+ out1_r = (v8i16)__msa_pckev_b((v16i8)out1_l, (v16i8)out1_r);
+ out2_r = (v8i16)__msa_pckev_b((v16i8)out2_l, (v16i8)out2_r);
+ out3_r = (v8i16)__msa_pckev_b((v16i8)out3_l, (v16i8)out3_r);
+
+ XORI_B_4VECS_UB(out0_r, out1_r, out2_r, out3_r,
+ tmp0, tmp1, tmp2, tmp3, 128);
+
+ STORE_4VECS_UB(dst_tmp, dst_stride, tmp0, tmp1, tmp2, tmp3);
+ dst_tmp += (4 * dst_stride);
+
+ src10_r = src54_r;
+ src32_r = src76_r;
+ src54_r = src98_r;
+ src21_r = src65_r;
+ src43_r = src87_r;
+ src65_r = src109_r;
+
+ src10_l = src54_l;
+ src32_l = src76_l;
+ src54_l = src98_l;
+ src21_l = src65_l;
+ src43_l = src87_l;
+ src65_l = src109_l;
+
+ src6 = src10;
+ }
+
+ src += 16;
+ dst += 16;
+ }
+}
+
+static void common_vt_8t_16w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride,
+ filter, height, 16);
+}
+
+static void common_vt_8t_32w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride,
+ filter, height, 32);
+}
+
+static void common_vt_8t_64w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride,
+ filter, height, 64);
+}
+
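+/* Vertical 2-tap (bilinear) path for 4x4 blocks: two picture rows are
+ * packed per vector, filtered with a single dot product each, then
+ * rounded, clamped to 8 bits and stored one word at a time. */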
+static void common_vt_2t_4x4_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter) {
+ uint32_t out0, out1, out2, out3;
+ v16i8 src0, src1, src2, src3, src4;
+ v16i8 src10_r, src32_r, src21_r, src43_r, src2110, src4332;
+ v16i8 filt0;
+ v8u16 filt;
+
+ filt = LOAD_UH(filter);
+ filt0 = (v16i8)__msa_splati_h((v8i16)filt, 0);
+
+ LOAD_5VECS_SB(src, src_stride, src0, src1, src2, src3, src4);
+ src += (5 * src_stride);
+
+ ILVR_B_4VECS_SB(src0, src1, src2, src3, src1, src2, src3, src4,
+ src10_r, src21_r, src32_r, src43_r);
+
+ ILVR_D_2VECS_SB(src2110, src21_r, src10_r, src4332, src43_r, src32_r);
+
+ src2110 = (v16i8)__msa_dotp_u_h((v16u8)src2110, (v16u8)filt0);
+ src4332 = (v16i8)__msa_dotp_u_h((v16u8)src4332, (v16u8)filt0);
+
+ src2110 = (v16i8)SRARI_SATURATE_UNSIGNED_H(src2110, FILTER_BITS, 7);
+ src4332 = (v16i8)SRARI_SATURATE_UNSIGNED_H(src4332, FILTER_BITS, 7);
+
+ src2110 = (v16i8)__msa_pckev_b((v16i8)src4332, (v16i8)src2110);
+
+ out0 = __msa_copy_u_w((v4i32)src2110, 0);
+ out1 = __msa_copy_u_w((v4i32)src2110, 1);
+ out2 = __msa_copy_u_w((v4i32)src2110, 2);
+ out3 = __msa_copy_u_w((v4i32)src2110, 3);
+
+ STORE_WORD(dst, out0);
+ dst += dst_stride;
+ STORE_WORD(dst, out1);
+ dst += dst_stride;
+ STORE_WORD(dst, out2);
+ dst += dst_stride;
+ STORE_WORD(dst, out3);
+}
+
+static void common_vt_2t_4x8_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter) {
+ uint32_t out0, out1, out2, out3, out4, out5, out6, out7;
+ v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+ v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r;
+ v16i8 src65_r, src87_r, src2110, src4332, src6554, src8776;
+ v16i8 filt0;
+ v8u16 filt;
+
+ filt = LOAD_UH(filter);
+ filt0 = (v16i8)__msa_splati_h((v8i16)filt, 0);
+
+ LOAD_8VECS_SB(src, src_stride,
+ src0, src1, src2, src3, src4, src5, src6, src7);
+ src += (8 * src_stride);
+
+ src8 = LOAD_SB(src);
+ src += src_stride;
+
+ ILVR_B_8VECS_SB(src0, src1, src2, src3, src4, src5, src6, src7,
+ src1, src2, src3, src4, src5, src6, src7, src8,
+ src10_r, src21_r, src32_r, src43_r,
+ src54_r, src65_r, src76_r, src87_r);
+
+ ILVR_D_4VECS_SB(src2110, src21_r, src10_r, src4332, src43_r, src32_r,
+ src6554, src65_r, src54_r, src8776, src87_r, src76_r);
+
+ src2110 = (v16i8)__msa_dotp_u_h((v16u8)src2110, (v16u8)filt0);
+ src4332 = (v16i8)__msa_dotp_u_h((v16u8)src4332, (v16u8)filt0);
+ src6554 = (v16i8)__msa_dotp_u_h((v16u8)src6554, (v16u8)filt0);
+ src8776 = (v16i8)__msa_dotp_u_h((v16u8)src8776, (v16u8)filt0);
+
+ src2110 = (v16i8)SRARI_SATURATE_UNSIGNED_H(src2110, FILTER_BITS, 7);
+ src4332 = (v16i8)SRARI_SATURATE_UNSIGNED_H(src4332, FILTER_BITS, 7);
+ src6554 = (v16i8)SRARI_SATURATE_UNSIGNED_H(src6554, FILTER_BITS, 7);
+ src8776 = (v16i8)SRARI_SATURATE_UNSIGNED_H(src8776, FILTER_BITS, 7);
+
+ src2110 = (v16i8)__msa_pckev_b((v16i8)src4332, (v16i8)src2110);
+ src4332 = (v16i8)__msa_pckev_b((v16i8)src8776, (v16i8)src6554);
+
+ out0 = __msa_copy_u_w((v4i32)src2110, 0);
+ out1 = __msa_copy_u_w((v4i32)src2110, 1);
+ out2 = __msa_copy_u_w((v4i32)src2110, 2);
+ out3 = __msa_copy_u_w((v4i32)src2110, 3);
+ out4 = __msa_copy_u_w((v4i32)src4332, 0);
+ out5 = __msa_copy_u_w((v4i32)src4332, 1);
+ out6 = __msa_copy_u_w((v4i32)src4332, 2);
+ out7 = __msa_copy_u_w((v4i32)src4332, 3);
+
+ STORE_WORD(dst, out0);
+ dst += dst_stride;
+ STORE_WORD(dst, out1);
+ dst += dst_stride;
+ STORE_WORD(dst, out2);
+ dst += dst_stride;
+ STORE_WORD(dst, out3);
+ dst += dst_stride;
+ STORE_WORD(dst, out4);
+ dst += dst_stride;
+ STORE_WORD(dst, out5);
+ dst += dst_stride;
+ STORE_WORD(dst, out6);
+ dst += dst_stride;
+ STORE_WORD(dst, out7);
+}
+
+static void common_vt_2t_4w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ if (4 == height) {
+ common_vt_2t_4x4_msa(src, src_stride, dst, dst_stride, filter);
+ } else if (8 == height) {
+ common_vt_2t_4x8_msa(src, src_stride, dst, dst_stride, filter);
+ }
+}
+
+static void common_vt_2t_8x4_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter) {
+ v16u8 src0, src1, src2, src3, src4;
+ v16u8 vec0, vec1, vec2, vec3, filt0;
+ v8u16 tmp0, tmp1, tmp2, tmp3;
+ v8u16 filt;
+
+ /* rearranging filter_y */
+ filt = LOAD_UH(filter);
+ filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+ LOAD_5VECS_UB(src, src_stride, src0, src1, src2, src3, src4);
+
+ ILVR_B_2VECS_UB(src0, src1, src1, src2, vec0, vec1);
+ ILVR_B_2VECS_UB(src2, src3, src3, src4, vec2, vec3);
+
+ /* filter calc */
+ tmp0 = __msa_dotp_u_h(vec0, filt0);
+ tmp1 = __msa_dotp_u_h(vec1, filt0);
+ tmp2 = __msa_dotp_u_h(vec2, filt0);
+ tmp3 = __msa_dotp_u_h(vec3, filt0);
+
+ tmp0 = SRARI_SATURATE_UNSIGNED_H(tmp0, FILTER_BITS, 7);
+ tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7);
+ tmp2 = SRARI_SATURATE_UNSIGNED_H(tmp2, FILTER_BITS, 7);
+ tmp3 = SRARI_SATURATE_UNSIGNED_H(tmp3, FILTER_BITS, 7);
+
+ PCKEV_B_STORE_8_BYTES_4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
+}
+
+static void common_vt_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+ v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+ v8u16 tmp0, tmp1, tmp2, tmp3;
+ v8u16 filt;
+
+ /* rearranging filter_y */
+ filt = LOAD_UH(filter);
+ filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+ src0 = LOAD_UB(src);
+ src += src_stride;
+
+ for (loop_cnt = (height >> 3); loop_cnt--;) {
+ LOAD_8VECS_UB(src, src_stride,
+ src1, src2, src3, src4, src5, src6, src7, src8);
+ src += (8 * src_stride);
+
+ ILVR_B_4VECS_UB(src0, src1, src2, src3, src1, src2, src3, src4,
+ vec0, vec1, vec2, vec3);
+
+ ILVR_B_4VECS_UB(src4, src5, src6, src7, src5, src6, src7, src8,
+ vec4, vec5, vec6, vec7);
+
+ tmp0 = __msa_dotp_u_h(vec0, filt0);
+ tmp1 = __msa_dotp_u_h(vec1, filt0);
+ tmp2 = __msa_dotp_u_h(vec2, filt0);
+ tmp3 = __msa_dotp_u_h(vec3, filt0);
+
+ tmp0 = SRARI_SATURATE_UNSIGNED_H(tmp0, FILTER_BITS, 7);
+ tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7);
+ tmp2 = SRARI_SATURATE_UNSIGNED_H(tmp2, FILTER_BITS, 7);
+ tmp3 = SRARI_SATURATE_UNSIGNED_H(tmp3, FILTER_BITS, 7);
+
+ PCKEV_B_STORE_8_BYTES_4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ tmp0 = __msa_dotp_u_h(vec4, filt0);
+ tmp1 = __msa_dotp_u_h(vec5, filt0);
+ tmp2 = __msa_dotp_u_h(vec6, filt0);
+ tmp3 = __msa_dotp_u_h(vec7, filt0);
+
+ tmp0 = SRARI_SATURATE_UNSIGNED_H(tmp0, FILTER_BITS, 7);
+ tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7);
+ tmp2 = SRARI_SATURATE_UNSIGNED_H(tmp2, FILTER_BITS, 7);
+ tmp3 = SRARI_SATURATE_UNSIGNED_H(tmp3, FILTER_BITS, 7);
+
+ PCKEV_B_STORE_8_BYTES_4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
+ dst += (4 * dst_stride);
+
+ src0 = src8;
+ }
+}
+
+static void common_vt_2t_8w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ if (4 == height) {
+ common_vt_2t_8x4_msa(src, src_stride, dst, dst_stride, filter);
+ } else {
+ common_vt_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, height);
+ }
+}
+
+static void common_vt_2t_16w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt;
+ v16u8 src0, src1, src2, src3, src4;
+ v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+ v8u16 tmp0, tmp1, tmp2, tmp3;
+ v8u16 filt;
+
+ /* rearranging filter_y */
+ filt = LOAD_UH(filter);
+ filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+ src0 = LOAD_UB(src);
+ src += src_stride;
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LOAD_4VECS_UB(src, src_stride, src1, src2, src3, src4);
+ src += (4 * src_stride);
+
+ ILV_B_LRLR_UB(src0, src1, src1, src2, vec1, vec0, vec3, vec2);
+
+ tmp0 = __msa_dotp_u_h(vec0, filt0);
+ tmp1 = __msa_dotp_u_h(vec1, filt0);
+
+ tmp0 = SRARI_SATURATE_UNSIGNED_H(tmp0, FILTER_BITS, 7);
+ tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7);
+
+ PCKEV_B_STORE_VEC(tmp1, tmp0, dst);
+ dst += dst_stride;
+
+ ILV_B_LRLR_UB(src2, src3, src3, src4, vec5, vec4, vec7, vec6);
+
+ tmp2 = __msa_dotp_u_h(vec2, filt0);
+ tmp3 = __msa_dotp_u_h(vec3, filt0);
+
+    tmp2 = SRARI_SATURATE_UNSIGNED_H(tmp2, FILTER_BITS, 7);
+    tmp3 = SRARI_SATURATE_UNSIGNED_H(tmp3, FILTER_BITS, 7);
+
+ PCKEV_B_STORE_VEC(tmp3, tmp2, dst);
+ dst += dst_stride;
+
+ tmp0 = __msa_dotp_u_h(vec4, filt0);
+ tmp1 = __msa_dotp_u_h(vec5, filt0);
+
+ tmp0 = SRARI_SATURATE_UNSIGNED_H(tmp0, FILTER_BITS, 7);
+ tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7);
+
+ PCKEV_B_STORE_VEC(tmp1, tmp0, dst);
+ dst += dst_stride;
+
+ tmp2 = __msa_dotp_u_h(vec6, filt0);
+ tmp3 = __msa_dotp_u_h(vec7, filt0);
+
+ tmp2 = SRARI_SATURATE_UNSIGNED_H(tmp2, FILTER_BITS, 7);
+ tmp3 = SRARI_SATURATE_UNSIGNED_H(tmp3, FILTER_BITS, 7);
+
+ PCKEV_B_STORE_VEC(tmp3, tmp2, dst);
+ dst += dst_stride;
+
+ src0 = src4;
+ }
+}
+
+static void common_vt_2t_32w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
+ v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+ v8u16 tmp0, tmp1, tmp2, tmp3;
+ v8u16 filt;
+
+ /* rearranging filter_y */
+ filt = LOAD_UH(filter);
+ filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+ src0 = LOAD_UB(src);
+ src5 = LOAD_UB(src + 16);
+ src += src_stride;
+
+ for (loop_cnt = (height >> 2); loop_cnt--;) {
+ LOAD_4VECS_UB(src, src_stride, src1, src2, src3, src4);
+
+ ILV_B_LRLR_UB(src0, src1, src1, src2, vec1, vec0, vec3, vec2);
+
+ LOAD_4VECS_UB(src + 16, src_stride, src6, src7, src8, src9);
+ src += (4 * src_stride);
+
+ tmp0 = __msa_dotp_u_h(vec0, filt0);
+ tmp1 = __msa_dotp_u_h(vec1, filt0);
+
+ tmp0 = SRARI_SATURATE_UNSIGNED_H(tmp0, FILTER_BITS, 7);
+ tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7);
+
+ PCKEV_B_STORE_VEC(tmp1, tmp0, dst);
+
+ tmp2 = __msa_dotp_u_h(vec2, filt0);
+ tmp3 = __msa_dotp_u_h(vec3, filt0);
+
+ tmp2 = SRARI_SATURATE_UNSIGNED_H(tmp2, FILTER_BITS, 7);
+ tmp3 = SRARI_SATURATE_UNSIGNED_H(tmp3, FILTER_BITS, 7);
+
+ PCKEV_B_STORE_VEC(tmp3, tmp2, dst + dst_stride);
+
+ ILV_B_LRLR_UB(src2, src3, src3, src4, vec5, vec4, vec7, vec6);
+
+ tmp0 = __msa_dotp_u_h(vec4, filt0);
+ tmp1 = __msa_dotp_u_h(vec5, filt0);
+
+ tmp0 = SRARI_SATURATE_UNSIGNED_H(tmp0, FILTER_BITS, 7);
+ tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7);
+
+ PCKEV_B_STORE_VEC(tmp1, tmp0, dst + 2 * dst_stride);
+
+ tmp2 = __msa_dotp_u_h(vec6, filt0);
+ tmp3 = __msa_dotp_u_h(vec7, filt0);
+
+ tmp2 = SRARI_SATURATE_UNSIGNED_H(tmp2, FILTER_BITS, 7);
+ tmp3 = SRARI_SATURATE_UNSIGNED_H(tmp3, FILTER_BITS, 7);
+
+ PCKEV_B_STORE_VEC(tmp3, tmp2, dst + 3 * dst_stride);
+
+ ILV_B_LRLR_UB(src5, src6, src6, src7, vec1, vec0, vec3, vec2);
+
+ tmp0 = __msa_dotp_u_h(vec0, filt0);
+ tmp1 = __msa_dotp_u_h(vec1, filt0);
+
+ tmp0 = SRARI_SATURATE_UNSIGNED_H(tmp0, FILTER_BITS, 7);
+ tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7);
+
+ PCKEV_B_STORE_VEC(tmp1, tmp0, dst + 16);
+
+ tmp2 = __msa_dotp_u_h(vec2, filt0);
+ tmp3 = __msa_dotp_u_h(vec3, filt0);
+
+ tmp2 = SRARI_SATURATE_UNSIGNED_H(tmp2, FILTER_BITS, 7);
+ tmp3 = SRARI_SATURATE_UNSIGNED_H(tmp3, FILTER_BITS, 7);
+
+ PCKEV_B_STORE_VEC(tmp3, tmp2, dst + 16 + dst_stride);
+
+ ILV_B_LRLR_UB(src7, src8, src8, src9, vec5, vec4, vec7, vec6);
+
+ tmp0 = __msa_dotp_u_h(vec4, filt0);
+ tmp1 = __msa_dotp_u_h(vec5, filt0);
+
+ tmp0 = SRARI_SATURATE_UNSIGNED_H(tmp0, FILTER_BITS, 7);
+ tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7);
+
+ PCKEV_B_STORE_VEC(tmp1, tmp0, dst + 16 + 2 * dst_stride);
+
+ tmp2 = __msa_dotp_u_h(vec6, filt0);
+ tmp3 = __msa_dotp_u_h(vec7, filt0);
+
+ tmp2 = SRARI_SATURATE_UNSIGNED_H(tmp2, FILTER_BITS, 7);
+ tmp3 = SRARI_SATURATE_UNSIGNED_H(tmp3, FILTER_BITS, 7);
+
+ PCKEV_B_STORE_VEC(tmp3, tmp2, dst + 16 + 3 * dst_stride);
+ dst += (4 * dst_stride);
+
+ src0 = src4;
+ src5 = src9;
+ }
+}
+
+static void common_vt_2t_64w_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int8_t *filter, int32_t height) {
+ uint32_t loop_cnt;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+ v16u8 src8, src9, src10, src11;
+ v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+ v8u16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ v8u16 filt;
+
+ /* rearranging filter_y */
+ filt = LOAD_UH(filter);
+ filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
+
+ LOAD_4VECS_UB(src, 16, src0, src3, src6, src9);
+ src += src_stride;
+
+ for (loop_cnt = (height >> 1); loop_cnt--;) {
+ LOAD_2VECS_UB(src, src_stride, src1, src2);
+ LOAD_2VECS_UB(src + 16, src_stride, src4, src5);
+ LOAD_2VECS_UB(src + 32, src_stride, src7, src8);
+ LOAD_2VECS_UB(src + 48, src_stride, src10, src11);
+ src += (2 * src_stride);
+
+ ILV_B_LRLR_UB(src0, src1, src1, src2, vec1, vec0, vec3, vec2);
+
+ tmp0 = __msa_dotp_u_h(vec0, filt0);
+ tmp1 = __msa_dotp_u_h(vec1, filt0);
+
+ tmp0 = SRARI_SATURATE_UNSIGNED_H(tmp0, FILTER_BITS, 7);
+ tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7);
+
+ PCKEV_B_STORE_VEC(tmp1, tmp0, dst);
+
+ tmp2 = __msa_dotp_u_h(vec2, filt0);
+ tmp3 = __msa_dotp_u_h(vec3, filt0);
+
+    tmp2 = SRARI_SATURATE_UNSIGNED_H(tmp2, FILTER_BITS, 7);
+    tmp3 = SRARI_SATURATE_UNSIGNED_H(tmp3, FILTER_BITS, 7);
+
+ PCKEV_B_STORE_VEC(tmp3, tmp2, dst + dst_stride);
+
+ ILV_B_LRLR_UB(src3, src4, src4, src5, vec5, vec4, vec7, vec6);
+
+ tmp4 = __msa_dotp_u_h(vec4, filt0);
+ tmp5 = __msa_dotp_u_h(vec5, filt0);
+
+ tmp4 = SRARI_SATURATE_UNSIGNED_H(tmp4, FILTER_BITS, 7);
+ tmp5 = SRARI_SATURATE_UNSIGNED_H(tmp5, FILTER_BITS, 7);
+
+ PCKEV_B_STORE_VEC(tmp5, tmp4, dst + 16);
+
+ tmp6 = __msa_dotp_u_h(vec6, filt0);
+ tmp7 = __msa_dotp_u_h(vec7, filt0);
+
+ tmp6 = SRARI_SATURATE_UNSIGNED_H(tmp6, FILTER_BITS, 7);
+ tmp7 = SRARI_SATURATE_UNSIGNED_H(tmp7, FILTER_BITS, 7);
+
+ PCKEV_B_STORE_VEC(tmp7, tmp6, dst + 16 + dst_stride);
+
+ ILV_B_LRLR_UB(src6, src7, src7, src8, vec1, vec0, vec3, vec2);
+
+ tmp0 = __msa_dotp_u_h(vec0, filt0);
+ tmp1 = __msa_dotp_u_h(vec1, filt0);
+
+ tmp0 = SRARI_SATURATE_UNSIGNED_H(tmp0, FILTER_BITS, 7);
+ tmp1 = SRARI_SATURATE_UNSIGNED_H(tmp1, FILTER_BITS, 7);
+
+ PCKEV_B_STORE_VEC(tmp1, tmp0, dst + 32);
+
+ tmp2 = __msa_dotp_u_h(vec2, filt0);
+ tmp3 = __msa_dotp_u_h(vec3, filt0);
+
+ tmp2 = SRARI_SATURATE_UNSIGNED_H(tmp2, FILTER_BITS, 7);
+ tmp3 = SRARI_SATURATE_UNSIGNED_H(tmp3, FILTER_BITS, 7);
+
+ PCKEV_B_STORE_VEC(tmp3, tmp2, dst + 32 + dst_stride);
+
+ ILV_B_LRLR_UB(src9, src10, src10, src11, vec5, vec4, vec7, vec6);
+
+ tmp4 = __msa_dotp_u_h(vec4, filt0);
+ tmp5 = __msa_dotp_u_h(vec5, filt0);
+
+ tmp4 = SRARI_SATURATE_UNSIGNED_H(tmp4, FILTER_BITS, 7);
+ tmp5 = SRARI_SATURATE_UNSIGNED_H(tmp5, FILTER_BITS, 7);
+
+ PCKEV_B_STORE_VEC(tmp5, tmp4, dst + 48);
+
+ tmp6 = __msa_dotp_u_h(vec6, filt0);
+ tmp7 = __msa_dotp_u_h(vec7, filt0);
+
+ tmp6 = SRARI_SATURATE_UNSIGNED_H(tmp6, FILTER_BITS, 7);
+ tmp7 = SRARI_SATURATE_UNSIGNED_H(tmp7, FILTER_BITS, 7);
+
+ PCKEV_B_STORE_VEC(tmp7, tmp6, dst + 48 + dst_stride);
+ dst += (2 * dst_stride);
+
+ src0 = src2;
+ src3 = src5;
+ src6 = src8;
+ src9 = src11;
+ }
+}
+
+void vp9_convolve8_vert_msa(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
+ int8_t cnt, filt_ver[8];
+
+ if (16 != y_step_q4) {
+ vp9_convolve8_vert_c(src, src_stride, dst, dst_stride,
+ filter_x, x_step_q4, filter_y, y_step_q4,
+ w, h);
+ return;
+ }
+
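+ /* taps 2..3 of filter_y, read as one 32-bit word (little-endian), equal
+ * { 0, 128 }: the unit filter, so the convolution reduces to a plain copy */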
+ if (((const int32_t *)filter_y)[1] == 0x800000) {
+ vp9_convolve_copy(src, src_stride, dst, dst_stride,
+ filter_x, x_step_q4, filter_y, y_step_q4,
+ w, h);
+ return;
+ }
+
+ for (cnt = 8; cnt--;) {
+ filt_ver[cnt] = filter_y[cnt];
+ }
+
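+ /* taps 0..1 are zero, so only the bilinear pair at taps 3..4 is active
+ * and the faster 2-tap paths apply */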
+ if (((const int32_t *)filter_y)[0] == 0) {
+ switch (w) {
+ case 4:
+ common_vt_2t_4w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ &filt_ver[3], h);
+ break;
+ case 8:
+ common_vt_2t_8w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ &filt_ver[3], h);
+ break;
+ case 16:
+ common_vt_2t_16w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ &filt_ver[3], h);
+ break;
+ case 32:
+ common_vt_2t_32w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ &filt_ver[3], h);
+ break;
+ case 64:
+ common_vt_2t_64w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ &filt_ver[3], h);
+ break;
+ default:
+ vp9_convolve8_vert_c(src, src_stride, dst, dst_stride,
+ filter_x, x_step_q4, filter_y, y_step_q4,
+ w, h);
+ break;
+ }
+ } else {
+ switch (w) {
+ case 4:
+ common_vt_8t_4w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ filt_ver, h);
+ break;
+ case 8:
+ common_vt_8t_8w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ filt_ver, h);
+ break;
+ case 16:
+ common_vt_8t_16w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ filt_ver, h);
+ break;
+ case 32:
+ common_vt_8t_32w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ filt_ver, h);
+ break;
+ case 64:
+ common_vt_8t_64w_msa(src, (int32_t)src_stride,
+ dst, (int32_t)dst_stride,
+ filt_ver, h);
+ break;
+ default:
+ vp9_convolve8_vert_c(src, src_stride, dst, dst_stride,
+ filter_x, x_step_q4, filter_y, y_step_q4,
+ w, h);
+ break;
+ }
+ }
+}
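+
+/* Reference-only scalar sketch of the common_vt_2t_* paths (assumes
+ * FILTER_BITS == 7 as in vp9_filter.h; ref_clip_pixel and common_vt_2t_ref
+ * are illustrative helpers, not libvpx APIs): each output pixel is the
+ * rounded, clipped two-tap dot product of two vertically adjacent source
+ * pixels. */
+static uint8_t ref_clip_pixel(int32_t val) {
+  return (val < 0) ? 0 : ((val > 255) ? 255 : (uint8_t)val);
+}
+
+static void common_vt_2t_ref(const uint8_t *src, int32_t src_stride,
+                             uint8_t *dst, int32_t dst_stride,
+                             const int8_t *filter, /* taps 3 and 4 */
+                             int32_t w, int32_t h) {
+  int32_t x, y;
+
+  for (y = 0; y < h; ++y) {
+    for (x = 0; x < w; ++x) {
+      const int32_t sum = src[x] * filter[0] + src[x + src_stride] * filter[1];
+
+      dst[x] = ref_clip_pixel((sum + 64) >> 7);  /* round at FILTER_BITS */
+    }
+    src += src_stride;
+    dst += dst_stride;
+  }
+}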
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_convolve_avg_msa.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_convolve_avg_msa.c
new file mode 100644
index 00000000000..72b8ab71397
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_convolve_avg_msa.c
@@ -0,0 +1,335 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vp9/common/mips/msa/vp9_macros_msa.h"
+
+static void avg_width4_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride, int32_t height) {
+ int32_t cnt;
+ uint32_t out0, out1, out2, out3;
+ v16u8 src0, src1, src2, src3;
+ v16u8 dst0, dst1, dst2, dst3;
+
+ if (0 == (height % 4)) {
+ for (cnt = (height / 4); cnt--;) {
+ LOAD_4VECS_UB(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+
+ LOAD_4VECS_UB(dst, dst_stride, dst0, dst1, dst2, dst3);
+
+ dst0 = __msa_aver_u_b(src0, dst0);
+ dst1 = __msa_aver_u_b(src1, dst1);
+ dst2 = __msa_aver_u_b(src2, dst2);
+ dst3 = __msa_aver_u_b(src3, dst3);
+
+ out0 = __msa_copy_u_w((v4i32)dst0, 0);
+ out1 = __msa_copy_u_w((v4i32)dst1, 0);
+ out2 = __msa_copy_u_w((v4i32)dst2, 0);
+ out3 = __msa_copy_u_w((v4i32)dst3, 0);
+
+ STORE_WORD(dst, out0);
+ dst += dst_stride;
+ STORE_WORD(dst, out1);
+ dst += dst_stride;
+ STORE_WORD(dst, out2);
+ dst += dst_stride;
+ STORE_WORD(dst, out3);
+ dst += dst_stride;
+ }
+ } else if (0 == (height % 2)) {
+ for (cnt = (height / 2); cnt--;) {
+ LOAD_2VECS_UB(src, src_stride, src0, src1);
+ src += (2 * src_stride);
+
+ LOAD_2VECS_UB(dst, dst_stride, dst0, dst1);
+
+ dst0 = __msa_aver_u_b(src0, dst0);
+ dst1 = __msa_aver_u_b(src1, dst1);
+
+ out0 = __msa_copy_u_w((v4i32)dst0, 0);
+ out1 = __msa_copy_u_w((v4i32)dst1, 0);
+
+ STORE_WORD(dst, out0);
+ dst += dst_stride;
+ STORE_WORD(dst, out1);
+ dst += dst_stride;
+ }
+ }
+}
+
+static void avg_width8_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride, int32_t height) {
+ int32_t cnt;
+ uint64_t out0, out1, out2, out3;
+ v16u8 src0, src1, src2, src3;
+ v16u8 dst0, dst1, dst2, dst3;
+
+ for (cnt = (height / 4); cnt--;) {
+ LOAD_4VECS_UB(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+
+ LOAD_4VECS_UB(dst, dst_stride, dst0, dst1, dst2, dst3);
+
+ dst0 = __msa_aver_u_b(src0, dst0);
+ dst1 = __msa_aver_u_b(src1, dst1);
+ dst2 = __msa_aver_u_b(src2, dst2);
+ dst3 = __msa_aver_u_b(src3, dst3);
+
+ out0 = __msa_copy_u_d((v2i64)dst0, 0);
+ out1 = __msa_copy_u_d((v2i64)dst1, 0);
+ out2 = __msa_copy_u_d((v2i64)dst2, 0);
+ out3 = __msa_copy_u_d((v2i64)dst3, 0);
+
+ STORE_DWORD(dst, out0);
+ dst += dst_stride;
+ STORE_DWORD(dst, out1);
+ dst += dst_stride;
+ STORE_DWORD(dst, out2);
+ dst += dst_stride;
+ STORE_DWORD(dst, out3);
+ dst += dst_stride;
+ }
+}
+
+static void avg_width16_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride, int32_t height) {
+ int32_t cnt;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+ v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+
+ for (cnt = (height / 8); cnt--;) {
+ LOAD_8VECS_UB(src, src_stride,
+ src0, src1, src2, src3, src4, src5, src6, src7);
+ src += (8 * src_stride);
+
+ LOAD_8VECS_UB(dst, dst_stride,
+ dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
+
+ dst0 = __msa_aver_u_b(src0, dst0);
+ dst1 = __msa_aver_u_b(src1, dst1);
+ dst2 = __msa_aver_u_b(src2, dst2);
+ dst3 = __msa_aver_u_b(src3, dst3);
+ dst4 = __msa_aver_u_b(src4, dst4);
+ dst5 = __msa_aver_u_b(src5, dst5);
+ dst6 = __msa_aver_u_b(src6, dst6);
+ dst7 = __msa_aver_u_b(src7, dst7);
+
+ STORE_8VECS_UB(dst, dst_stride,
+ dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
+ dst += (8 * dst_stride);
+ }
+}
+
+static void avg_width32_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride, int32_t height) {
+ int32_t cnt;
+ uint8_t *dst_dup = dst;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+ v16u8 src8, src9, src10, src11, src12, src13, src14, src15;
+ v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+ v16u8 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;
+
+ for (cnt = (height / 8); cnt--;) {
+ src0 = LOAD_UB(src);
+ src1 = LOAD_UB(src + 16);
+ src += src_stride;
+ src2 = LOAD_UB(src);
+ src3 = LOAD_UB(src + 16);
+ src += src_stride;
+ src4 = LOAD_UB(src);
+ src5 = LOAD_UB(src + 16);
+ src += src_stride;
+ src6 = LOAD_UB(src);
+ src7 = LOAD_UB(src + 16);
+ src += src_stride;
+
+ dst0 = LOAD_UB(dst_dup);
+ dst1 = LOAD_UB(dst_dup + 16);
+ dst_dup += dst_stride;
+ dst2 = LOAD_UB(dst_dup);
+ dst3 = LOAD_UB(dst_dup + 16);
+ dst_dup += dst_stride;
+ dst4 = LOAD_UB(dst_dup);
+ dst5 = LOAD_UB(dst_dup + 16);
+ dst_dup += dst_stride;
+ dst6 = LOAD_UB(dst_dup);
+ dst7 = LOAD_UB(dst_dup + 16);
+ dst_dup += dst_stride;
+
+ src8 = LOAD_UB(src);
+ src9 = LOAD_UB(src + 16);
+ src += src_stride;
+ src10 = LOAD_UB(src);
+ src11 = LOAD_UB(src + 16);
+ src += src_stride;
+ src12 = LOAD_UB(src);
+ src13 = LOAD_UB(src + 16);
+ src += src_stride;
+ src14 = LOAD_UB(src);
+ src15 = LOAD_UB(src + 16);
+ src += src_stride;
+
+ dst8 = LOAD_UB(dst_dup);
+ dst9 = LOAD_UB(dst_dup + 16);
+ dst_dup += dst_stride;
+ dst10 = LOAD_UB(dst_dup);
+ dst11 = LOAD_UB(dst_dup + 16);
+ dst_dup += dst_stride;
+ dst12 = LOAD_UB(dst_dup);
+ dst13 = LOAD_UB(dst_dup + 16);
+ dst_dup += dst_stride;
+ dst14 = LOAD_UB(dst_dup);
+ dst15 = LOAD_UB(dst_dup + 16);
+ dst_dup += dst_stride;
+
+ dst0 = __msa_aver_u_b(src0, dst0);
+ dst1 = __msa_aver_u_b(src1, dst1);
+ dst2 = __msa_aver_u_b(src2, dst2);
+ dst3 = __msa_aver_u_b(src3, dst3);
+ dst4 = __msa_aver_u_b(src4, dst4);
+ dst5 = __msa_aver_u_b(src5, dst5);
+ dst6 = __msa_aver_u_b(src6, dst6);
+ dst7 = __msa_aver_u_b(src7, dst7);
+ dst8 = __msa_aver_u_b(src8, dst8);
+ dst9 = __msa_aver_u_b(src9, dst9);
+ dst10 = __msa_aver_u_b(src10, dst10);
+ dst11 = __msa_aver_u_b(src11, dst11);
+ dst12 = __msa_aver_u_b(src12, dst12);
+ dst13 = __msa_aver_u_b(src13, dst13);
+ dst14 = __msa_aver_u_b(src14, dst14);
+ dst15 = __msa_aver_u_b(src15, dst15);
+
+ STORE_UB(dst0, dst);
+ STORE_UB(dst1, dst + 16);
+ dst += dst_stride;
+ STORE_UB(dst2, dst);
+ STORE_UB(dst3, dst + 16);
+ dst += dst_stride;
+ STORE_UB(dst4, dst);
+ STORE_UB(dst5, dst + 16);
+ dst += dst_stride;
+ STORE_UB(dst6, dst);
+ STORE_UB(dst7, dst + 16);
+ dst += dst_stride;
+ STORE_UB(dst8, dst);
+ STORE_UB(dst9, dst + 16);
+ dst += dst_stride;
+ STORE_UB(dst10, dst);
+ STORE_UB(dst11, dst + 16);
+ dst += dst_stride;
+ STORE_UB(dst12, dst);
+ STORE_UB(dst13, dst + 16);
+ dst += dst_stride;
+ STORE_UB(dst14, dst);
+ STORE_UB(dst15, dst + 16);
+ dst += dst_stride;
+ }
+}
+
+static void avg_width64_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride, int32_t height) {
+ int32_t cnt;
+ uint8_t *dst_dup = dst;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+ v16u8 src8, src9, src10, src11, src12, src13, src14, src15;
+ v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+ v16u8 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;
+
+ for (cnt = (height / 4); cnt--;) {
+ LOAD_4VECS_UB(src, 16, src0, src1, src2, src3);
+ src += src_stride;
+ LOAD_4VECS_UB(src, 16, src4, src5, src6, src7);
+ src += src_stride;
+ LOAD_4VECS_UB(src, 16, src8, src9, src10, src11);
+ src += src_stride;
+ LOAD_4VECS_UB(src, 16, src12, src13, src14, src15);
+ src += src_stride;
+
+ LOAD_4VECS_UB(dst_dup, 16, dst0, dst1, dst2, dst3);
+ dst_dup += dst_stride;
+ LOAD_4VECS_UB(dst_dup, 16, dst4, dst5, dst6, dst7);
+ dst_dup += dst_stride;
+ LOAD_4VECS_UB(dst_dup, 16, dst8, dst9, dst10, dst11);
+ dst_dup += dst_stride;
+ LOAD_4VECS_UB(dst_dup, 16, dst12, dst13, dst14, dst15);
+ dst_dup += dst_stride;
+
+ dst0 = __msa_aver_u_b(src0, dst0);
+ dst1 = __msa_aver_u_b(src1, dst1);
+ dst2 = __msa_aver_u_b(src2, dst2);
+ dst3 = __msa_aver_u_b(src3, dst3);
+ dst4 = __msa_aver_u_b(src4, dst4);
+ dst5 = __msa_aver_u_b(src5, dst5);
+ dst6 = __msa_aver_u_b(src6, dst6);
+ dst7 = __msa_aver_u_b(src7, dst7);
+ dst8 = __msa_aver_u_b(src8, dst8);
+ dst9 = __msa_aver_u_b(src9, dst9);
+ dst10 = __msa_aver_u_b(src10, dst10);
+ dst11 = __msa_aver_u_b(src11, dst11);
+ dst12 = __msa_aver_u_b(src12, dst12);
+ dst13 = __msa_aver_u_b(src13, dst13);
+ dst14 = __msa_aver_u_b(src14, dst14);
+ dst15 = __msa_aver_u_b(src15, dst15);
+
+ STORE_4VECS_UB(dst, 16, dst0, dst1, dst2, dst3);
+ dst += dst_stride;
+ STORE_4VECS_UB(dst, 16, dst4, dst5, dst6, dst7);
+ dst += dst_stride;
+ STORE_4VECS_UB(dst, 16, dst8, dst9, dst10, dst11);
+ dst += dst_stride;
+ STORE_4VECS_UB(dst, 16, dst12, dst13, dst14, dst15);
+ dst += dst_stride;
+ }
+}
+
+void vp9_convolve_avg_msa(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int32_t filter_x_stride,
+ const int16_t *filter_y, int32_t filter_y_stride,
+ int32_t w, int32_t h) {
+ (void)filter_x;
+ (void)filter_y;
+ (void)filter_x_stride;
+ (void)filter_y_stride;
+
+ switch (w) {
+ case 4: {
+ avg_width4_msa(src, src_stride, dst, dst_stride, h);
+ break;
+ }
+ case 8: {
+ avg_width8_msa(src, src_stride, dst, dst_stride, h);
+ break;
+ }
+ case 16: {
+ avg_width16_msa(src, src_stride, dst, dst_stride, h);
+ break;
+ }
+ case 32: {
+ avg_width32_msa(src, src_stride, dst, dst_stride, h);
+ break;
+ }
+ case 64: {
+ avg_width64_msa(src, src_stride, dst, dst_stride, h);
+ break;
+ }
+ default: {
+ int32_t lp, cnt;
+ for (cnt = h; cnt--;) {
+ for (lp = 0; lp < w; ++lp) {
+ dst[lp] = (((dst[lp] + src[lp]) + 1) >> 1);
+ }
+ src += src_stride;
+ dst += dst_stride;
+ }
+ break;
+ }
+ }
+}
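+
+/* Reference-only scalar sketch (avg_round_ref is an illustrative helper,
+ * not a libvpx API): __msa_aver_u_b applies this rounding average to each
+ * of its sixteen byte lanes, which is why the vector paths and the scalar
+ * fallback above agree bit-for-bit. */
+static uint8_t avg_round_ref(uint8_t a, uint8_t b) {
+  return (uint8_t)((a + b + 1) >> 1);
+}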
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_convolve_copy_msa.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_convolve_copy_msa.c
new file mode 100644
index 00000000000..064ba762fa0
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_convolve_copy_msa.c
@@ -0,0 +1,300 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <string.h>
+#include "vp9/common/mips/msa/vp9_macros_msa.h"
+
+static void copy_width8_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int32_t height) {
+ int32_t cnt;
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+
+ if (0 == height % 12) {
+ for (cnt = (height / 12); cnt--;) {
+ LOAD_8VECS_UB(src, src_stride,
+ src0, src1, src2, src3, src4, src5, src6, src7);
+ src += (8 * src_stride);
+
+ out0 = __msa_copy_u_d((v2i64)src0, 0);
+ out1 = __msa_copy_u_d((v2i64)src1, 0);
+ out2 = __msa_copy_u_d((v2i64)src2, 0);
+ out3 = __msa_copy_u_d((v2i64)src3, 0);
+ out4 = __msa_copy_u_d((v2i64)src4, 0);
+ out5 = __msa_copy_u_d((v2i64)src5, 0);
+ out6 = __msa_copy_u_d((v2i64)src6, 0);
+ out7 = __msa_copy_u_d((v2i64)src7, 0);
+
+ STORE_DWORD(dst, out0);
+ dst += dst_stride;
+ STORE_DWORD(dst, out1);
+ dst += dst_stride;
+ STORE_DWORD(dst, out2);
+ dst += dst_stride;
+ STORE_DWORD(dst, out3);
+ dst += dst_stride;
+ STORE_DWORD(dst, out4);
+ dst += dst_stride;
+ STORE_DWORD(dst, out5);
+ dst += dst_stride;
+ STORE_DWORD(dst, out6);
+ dst += dst_stride;
+ STORE_DWORD(dst, out7);
+ dst += dst_stride;
+
+ LOAD_4VECS_UB(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+
+ out0 = __msa_copy_u_d((v2i64)src0, 0);
+ out1 = __msa_copy_u_d((v2i64)src1, 0);
+ out2 = __msa_copy_u_d((v2i64)src2, 0);
+ out3 = __msa_copy_u_d((v2i64)src3, 0);
+
+ STORE_DWORD(dst, out0);
+ dst += dst_stride;
+ STORE_DWORD(dst, out1);
+ dst += dst_stride;
+ STORE_DWORD(dst, out2);
+ dst += dst_stride;
+ STORE_DWORD(dst, out3);
+ dst += dst_stride;
+ }
+ } else if (0 == height % 8) {
+ for (cnt = height >> 3; cnt--;) {
+ LOAD_8VECS_UB(src, src_stride,
+ src0, src1, src2, src3, src4, src5, src6, src7);
+ src += (8 * src_stride);
+
+ out0 = __msa_copy_u_d((v2i64)src0, 0);
+ out1 = __msa_copy_u_d((v2i64)src1, 0);
+ out2 = __msa_copy_u_d((v2i64)src2, 0);
+ out3 = __msa_copy_u_d((v2i64)src3, 0);
+ out4 = __msa_copy_u_d((v2i64)src4, 0);
+ out5 = __msa_copy_u_d((v2i64)src5, 0);
+ out6 = __msa_copy_u_d((v2i64)src6, 0);
+ out7 = __msa_copy_u_d((v2i64)src7, 0);
+
+ STORE_DWORD(dst, out0);
+ dst += dst_stride;
+ STORE_DWORD(dst, out1);
+ dst += dst_stride;
+ STORE_DWORD(dst, out2);
+ dst += dst_stride;
+ STORE_DWORD(dst, out3);
+ dst += dst_stride;
+ STORE_DWORD(dst, out4);
+ dst += dst_stride;
+ STORE_DWORD(dst, out5);
+ dst += dst_stride;
+ STORE_DWORD(dst, out6);
+ dst += dst_stride;
+ STORE_DWORD(dst, out7);
+ dst += dst_stride;
+ }
+ } else if (0 == height % 4) {
+ for (cnt = (height / 4); cnt--;) {
+ LOAD_4VECS_UB(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+
+ out0 = __msa_copy_u_d((v2i64)src0, 0);
+ out1 = __msa_copy_u_d((v2i64)src1, 0);
+ out2 = __msa_copy_u_d((v2i64)src2, 0);
+ out3 = __msa_copy_u_d((v2i64)src3, 0);
+
+ STORE_DWORD(dst, out0);
+ dst += dst_stride;
+ STORE_DWORD(dst, out1);
+ dst += dst_stride;
+ STORE_DWORD(dst, out2);
+ dst += dst_stride;
+ STORE_DWORD(dst, out3);
+ dst += dst_stride;
+ }
+ } else if (0 == height % 2) {
+ for (cnt = (height / 2); cnt--;) {
+ LOAD_2VECS_UB(src, src_stride, src0, src1);
+ src += (2 * src_stride);
+
+ out0 = __msa_copy_u_d((v2i64)src0, 0);
+ out1 = __msa_copy_u_d((v2i64)src1, 0);
+
+ STORE_DWORD(dst, out0);
+ dst += dst_stride;
+ STORE_DWORD(dst, out1);
+ dst += dst_stride;
+ }
+ }
+}
+
+static void copy_16multx8mult_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int32_t height, int32_t width) {
+ int32_t cnt, loop_cnt;
+ const uint8_t *src_tmp;
+ uint8_t *dst_tmp;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+
+ for (cnt = (width >> 4); cnt--;) {
+ src_tmp = src;
+ dst_tmp = dst;
+
+ for (loop_cnt = (height >> 3); loop_cnt--;) {
+ LOAD_8VECS_UB(src_tmp, src_stride,
+ src0, src1, src2, src3, src4, src5, src6, src7);
+ src_tmp += (8 * src_stride);
+
+ STORE_8VECS_UB(dst_tmp, dst_stride,
+ src0, src1, src2, src3, src4, src5, src6, src7);
+ dst_tmp += (8 * dst_stride);
+ }
+
+ src += 16;
+ dst += 16;
+ }
+}
+
+static void copy_width16_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int32_t height) {
+ int32_t cnt;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+
+ if (0 == height % 12) {
+ for (cnt = (height / 12); cnt--;) {
+ LOAD_8VECS_UB(src, src_stride,
+ src0, src1, src2, src3, src4, src5, src6, src7);
+ src += (8 * src_stride);
+
+ STORE_8VECS_UB(dst, dst_stride,
+ src0, src1, src2, src3, src4, src5, src6, src7);
+ dst += (8 * dst_stride);
+
+ LOAD_4VECS_UB(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+
+ STORE_4VECS_UB(dst, dst_stride, src0, src1, src2, src3);
+ dst += (4 * dst_stride);
+ }
+ } else if (0 == height % 8) {
+ copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 16);
+ } else if (0 == height % 4) {
+ for (cnt = (height >> 2); cnt--;) {
+ LOAD_4VECS_UB(src, src_stride, src0, src1, src2, src3);
+ src += (4 * src_stride);
+
+ STORE_4VECS_UB(dst, dst_stride, src0, src1, src2, src3);
+ dst += (4 * dst_stride);
+ }
+ }
+}
+
+static void copy_width32_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int32_t height) {
+ int32_t cnt;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+
+ if (0 == height % 12) {
+ for (cnt = (height / 12); cnt--;) {
+ LOAD_4VECS_UB(src, src_stride, src0, src1, src2, src3);
+ LOAD_4VECS_UB(src + 16, src_stride, src4, src5, src6, src7);
+ src += (4 * src_stride);
+
+ STORE_4VECS_UB(dst, dst_stride, src0, src1, src2, src3);
+ STORE_4VECS_UB(dst + 16, dst_stride, src4, src5, src6, src7);
+ dst += (4 * dst_stride);
+
+ LOAD_4VECS_UB(src, src_stride, src0, src1, src2, src3);
+ LOAD_4VECS_UB(src + 16, src_stride, src4, src5, src6, src7);
+ src += (4 * src_stride);
+
+ STORE_4VECS_UB(dst, dst_stride, src0, src1, src2, src3);
+ STORE_4VECS_UB(dst + 16, dst_stride, src4, src5, src6, src7);
+ dst += (4 * dst_stride);
+
+ LOAD_4VECS_UB(src, src_stride, src0, src1, src2, src3);
+ LOAD_4VECS_UB(src + 16, src_stride, src4, src5, src6, src7);
+ src += (4 * src_stride);
+
+ STORE_4VECS_UB(dst, dst_stride, src0, src1, src2, src3);
+ STORE_4VECS_UB(dst + 16, dst_stride, src4, src5, src6, src7);
+ dst += (4 * dst_stride);
+ }
+ } else if (0 == height % 8) {
+ copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 32);
+ } else if (0 == height % 4) {
+ for (cnt = (height >> 2); cnt--;) {
+ LOAD_4VECS_UB(src, src_stride, src0, src1, src2, src3);
+ LOAD_4VECS_UB(src + 16, src_stride, src4, src5, src6, src7);
+ src += (4 * src_stride);
+
+ STORE_4VECS_UB(dst, dst_stride, src0, src1, src2, src3);
+ STORE_4VECS_UB(dst + 16, dst_stride, src4, src5, src6, src7);
+ dst += (4 * dst_stride);
+ }
+ }
+}
+
+static void copy_width64_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride,
+ int32_t height) {
+ copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 64);
+}
+
+void vp9_convolve_copy_msa(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int32_t filter_x_stride,
+ const int16_t *filter_y, int32_t filter_y_stride,
+ int32_t w, int32_t h) {
+ (void)filter_x;
+ (void)filter_y;
+ (void)filter_x_stride;
+ (void)filter_y_stride;
+
+ switch (w) {
+ case 4: {
+ uint32_t cnt, tmp;
+ /* copy one 32-bit word per 4-pixel row */
+ for (cnt = h; cnt--;) {
+ tmp = LOAD_WORD(src);
+ STORE_WORD(dst, tmp);
+ src += src_stride;
+ dst += dst_stride;
+ }
+ break;
+ }
+ case 8: {
+ copy_width8_msa(src, src_stride, dst, dst_stride, h);
+ break;
+ }
+ case 16: {
+ copy_width16_msa(src, src_stride, dst, dst_stride, h);
+ break;
+ }
+ case 32: {
+ copy_width32_msa(src, src_stride, dst, dst_stride, h);
+ break;
+ }
+ case 64: {
+ copy_width64_msa(src, src_stride, dst, dst_stride, h);
+ break;
+ }
+ default: {
+ uint32_t cnt;
+ for (cnt = h; cnt--;) {
+ memcpy(dst, src, w);
+ src += src_stride;
+ dst += dst_stride;
+ }
+ break;
+ }
+ }
+}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_convolve_msa.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_convolve_msa.h
new file mode 100644
index 00000000000..b109a4014f8
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_convolve_msa.h
@@ -0,0 +1,157 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_COMMON_MIPS_MSA_VP9_CONVOLVE_MSA_H_
+#define VP9_COMMON_MIPS_MSA_VP9_CONVOLVE_MSA_H_
+
+#include "vp9/common/vp9_filter.h"
+#include "vp9/common/mips/msa/vp9_macros_msa.h"
+
+extern const uint8_t mc_filt_mask_arr[16 * 3];
+
+#define HORIZ_8TAP_FILT(src, mask0, mask1, mask2, mask3, \
+ filt_h0, filt_h1, filt_h2, filt_h3) ({ \
+ v8i16 vec0, vec1, vec2, vec3, horiz_out; \
+ \
+ vec0 = (v8i16)__msa_vshf_b((v16i8)(mask0), (v16i8)(src), (v16i8)(src)); \
+ vec0 = __msa_dotp_s_h((v16i8)vec0, (v16i8)(filt_h0)); \
+ vec1 = (v8i16)__msa_vshf_b((v16i8)(mask1), (v16i8)(src), (v16i8)(src)); \
+ vec0 = __msa_dpadd_s_h(vec0, (v16i8)(filt_h1), (v16i8)vec1); \
+ vec2 = (v8i16)__msa_vshf_b((v16i8)(mask2), (v16i8)(src), (v16i8)(src)); \
+ vec2 = __msa_dotp_s_h((v16i8)vec2, (v16i8)(filt_h2)); \
+ vec3 = (v8i16)__msa_vshf_b((v16i8)(mask3), (v16i8)(src), (v16i8)(src)); \
+ vec2 = __msa_dpadd_s_h(vec2, (v16i8)(filt_h3), (v16i8)vec3); \
+ vec0 = __msa_adds_s_h(vec0, vec2); \
+ horiz_out = SRARI_SATURATE_SIGNED_H(vec0, FILTER_BITS, 7); \
+ \
+ horiz_out; \
+})
+
+#define HORIZ_8TAP_FILT_2VECS(src0, src1, mask0, mask1, mask2, mask3, \
+ filt_h0, filt_h1, filt_h2, filt_h3) ({ \
+ v8i16 vec0, vec1, vec2, vec3, horiz_out; \
+ \
+ vec0 = (v8i16)__msa_vshf_b((v16i8)(mask0), (v16i8)(src1), (v16i8)(src0)); \
+ vec0 = __msa_dotp_s_h((v16i8)vec0, (v16i8)(filt_h0)); \
+ vec1 = (v8i16)__msa_vshf_b((v16i8)(mask1), (v16i8)(src1), (v16i8)(src0)); \
+ vec0 = __msa_dpadd_s_h(vec0, (v16i8)(filt_h1), (v16i8)vec1); \
+ vec2 = (v8i16)__msa_vshf_b((v16i8)(mask2), (v16i8)(src1), (v16i8)(src0)); \
+ vec2 = __msa_dotp_s_h((v16i8)vec2, (v16i8)(filt_h2)); \
+ vec3 = (v8i16)__msa_vshf_b((v16i8)(mask3), (v16i8)(src1), (v16i8)(src0)); \
+ vec2 = __msa_dpadd_s_h(vec2, (v16i8)(filt_h3), (v16i8)vec3); \
+ vec0 = __msa_adds_s_h(vec0, vec2); \
+ horiz_out = (v8i16)SRARI_SATURATE_SIGNED_H(vec0, FILTER_BITS, 7); \
+ \
+ horiz_out; \
+})
+
+#define FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3, \
+ filt0, filt1, filt2, filt3) ({ \
+ v8i16 tmp0, tmp1; \
+ \
+ tmp0 = __msa_dotp_s_h((v16i8)(vec0), (v16i8)(filt0)); \
+ tmp0 = __msa_dpadd_s_h(tmp0, (v16i8)(vec1), (v16i8)(filt1)); \
+ tmp1 = __msa_dotp_s_h((v16i8)(vec2), (v16i8)(filt2)); \
+ tmp1 = __msa_dpadd_s_h(tmp1, (v16i8)(vec3), (v16i8)(filt3)); \
+ tmp0 = __msa_adds_s_h(tmp0, tmp1); \
+ \
+ tmp0; \
+})
+
+#define HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, \
+ mask0, mask1, mask2, mask3, \
+ filt0, filt1, filt2, filt3, \
+ out0, out1) { \
+ v8i16 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
+ v8i16 res0_m, res1_m, res2_m, res3_m; \
+ \
+ vec0_m = (v8i16)__msa_vshf_b((v16i8)(mask0), (v16i8)(src1), (v16i8)(src0)); \
+ vec1_m = (v8i16)__msa_vshf_b((v16i8)(mask0), (v16i8)(src3), (v16i8)(src2)); \
+ \
+ res0_m = __msa_dotp_s_h((v16i8)vec0_m, (v16i8)(filt0)); \
+ res1_m = __msa_dotp_s_h((v16i8)vec1_m, (v16i8)(filt0)); \
+ \
+ vec2_m = (v8i16)__msa_vshf_b((v16i8)(mask1), (v16i8)(src1), (v16i8)(src0)); \
+ vec3_m = (v8i16)__msa_vshf_b((v16i8)(mask1), (v16i8)(src3), (v16i8)(src2)); \
+ \
+ res0_m = __msa_dpadd_s_h(res0_m, (v16i8)(filt1), (v16i8)vec2_m); \
+ res1_m = __msa_dpadd_s_h(res1_m, (v16i8)(filt1), (v16i8)vec3_m); \
+ \
+ vec4_m = (v8i16)__msa_vshf_b((v16i8)(mask2), (v16i8)(src1), (v16i8)(src0)); \
+ vec5_m = (v8i16)__msa_vshf_b((v16i8)(mask2), (v16i8)(src3), (v16i8)(src2)); \
+ \
+ res2_m = __msa_dotp_s_h((v16i8)vec4_m, (v16i8)(filt2)); \
+ res3_m = __msa_dotp_s_h((v16i8)vec5_m, (v16i8)(filt2)); \
+ \
+ vec6_m = (v8i16)__msa_vshf_b((v16i8)(mask3), (v16i8)(src1), (v16i8)(src0)); \
+ vec7_m = (v8i16)__msa_vshf_b((v16i8)(mask3), (v16i8)(src3), (v16i8)(src2)); \
+ \
+ res2_m = __msa_dpadd_s_h(res2_m, (v16i8)(filt3), (v16i8)vec6_m); \
+ res3_m = __msa_dpadd_s_h(res3_m, (v16i8)(filt3), (v16i8)vec7_m); \
+ \
+ out0 = __msa_adds_s_h(res0_m, res2_m); \
+ out1 = __msa_adds_s_h(res1_m, res3_m); \
+}
+
+#define HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, \
+ mask0, mask1, mask2, mask3, \
+ filt0, filt1, filt2, filt3, \
+ out0, out1, out2, out3) { \
+ v8i16 vec0_m, vec1_m, vec2_m, vec3_m; \
+ v8i16 vec4_m, vec5_m, vec6_m, vec7_m; \
+ v8i16 res0_m, res1_m, res2_m, res3_m; \
+ v8i16 res4_m, res5_m, res6_m, res7_m; \
+ \
+ vec0_m = (v8i16)__msa_vshf_b((v16i8)(mask0), (v16i8)(src0), (v16i8)(src0)); \
+ vec1_m = (v8i16)__msa_vshf_b((v16i8)(mask0), (v16i8)(src1), (v16i8)(src1)); \
+ vec2_m = (v8i16)__msa_vshf_b((v16i8)(mask0), (v16i8)(src2), (v16i8)(src2)); \
+ vec3_m = (v8i16)__msa_vshf_b((v16i8)(mask0), (v16i8)(src3), (v16i8)(src3)); \
+ \
+ res0_m = __msa_dotp_s_h((v16i8)vec0_m, (v16i8)(filt0)); \
+ res1_m = __msa_dotp_s_h((v16i8)vec1_m, (v16i8)(filt0)); \
+ res2_m = __msa_dotp_s_h((v16i8)vec2_m, (v16i8)(filt0)); \
+ res3_m = __msa_dotp_s_h((v16i8)vec3_m, (v16i8)(filt0)); \
+ \
+ vec0_m = (v8i16)__msa_vshf_b((v16i8)(mask2), (v16i8)(src0), (v16i8)(src0)); \
+ vec1_m = (v8i16)__msa_vshf_b((v16i8)(mask2), (v16i8)(src1), (v16i8)(src1)); \
+ vec2_m = (v8i16)__msa_vshf_b((v16i8)(mask2), (v16i8)(src2), (v16i8)(src2)); \
+ vec3_m = (v8i16)__msa_vshf_b((v16i8)(mask2), (v16i8)(src3), (v16i8)(src3)); \
+ \
+ res4_m = __msa_dotp_s_h((v16i8)vec0_m, (v16i8)(filt2)); \
+ res5_m = __msa_dotp_s_h((v16i8)vec1_m, (v16i8)(filt2)); \
+ res6_m = __msa_dotp_s_h((v16i8)vec2_m, (v16i8)(filt2)); \
+ res7_m = __msa_dotp_s_h((v16i8)vec3_m, (v16i8)(filt2)); \
+ \
+ vec4_m = (v8i16)__msa_vshf_b((v16i8)(mask1), (v16i8)(src0), (v16i8)(src0)); \
+ vec5_m = (v8i16)__msa_vshf_b((v16i8)(mask1), (v16i8)(src1), (v16i8)(src1)); \
+ vec6_m = (v8i16)__msa_vshf_b((v16i8)(mask1), (v16i8)(src2), (v16i8)(src2)); \
+ vec7_m = (v8i16)__msa_vshf_b((v16i8)(mask1), (v16i8)(src3), (v16i8)(src3)); \
+ \
+ res0_m = __msa_dpadd_s_h(res0_m, (v16i8)(filt1), (v16i8)vec4_m); \
+ res1_m = __msa_dpadd_s_h(res1_m, (v16i8)(filt1), (v16i8)vec5_m); \
+ res2_m = __msa_dpadd_s_h(res2_m, (v16i8)(filt1), (v16i8)vec6_m); \
+ res3_m = __msa_dpadd_s_h(res3_m, (v16i8)(filt1), (v16i8)vec7_m); \
+ \
+ vec4_m = (v8i16)__msa_vshf_b((v16i8)(mask3), (v16i8)(src0), (v16i8)(src0)); \
+ vec5_m = (v8i16)__msa_vshf_b((v16i8)(mask3), (v16i8)(src1), (v16i8)(src1)); \
+ vec6_m = (v8i16)__msa_vshf_b((v16i8)(mask3), (v16i8)(src2), (v16i8)(src2)); \
+ vec7_m = (v8i16)__msa_vshf_b((v16i8)(mask3), (v16i8)(src3), (v16i8)(src3)); \
+ \
+ res4_m = __msa_dpadd_s_h(res4_m, (v16i8)(filt3), (v16i8)vec4_m); \
+ res5_m = __msa_dpadd_s_h(res5_m, (v16i8)(filt3), (v16i8)vec5_m); \
+ res6_m = __msa_dpadd_s_h(res6_m, (v16i8)(filt3), (v16i8)vec6_m); \
+ res7_m = __msa_dpadd_s_h(res7_m, (v16i8)(filt3), (v16i8)vec7_m); \
+ \
+ out0 = __msa_adds_s_h(res0_m, res4_m); \
+ out1 = __msa_adds_s_h(res1_m, res5_m); \
+ out2 = __msa_adds_s_h(res2_m, res6_m); \
+ out3 = __msa_adds_s_h(res3_m, res7_m); \
+}
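+
+/* Reference-only scalar sketch (filt_8tap_ref is an illustrative helper,
+ * not part of the MSA API): the net effect of the bias, dot-product,
+ * round and saturate pipeline built from the macros above is an 8-tap
+ * filter rounded at FILTER_BITS (assumed == 7) and clipped to [0, 255].
+ * src points at the output pixel and needs 3 pixels of left context. */
+static uint8_t filt_8tap_ref(const uint8_t *src, const int8_t *filter) {
+  int32_t k, sum = 0;
+
+  for (k = 0; k < 8; ++k) {
+    sum += src[k - 3] * filter[k];
+  }
+  sum = (sum + 64) >> 7;
+
+  return (sum < 0) ? 0 : ((sum > 255) ? 255 : (uint8_t)sum);
+}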
+#endif /* VP9_COMMON_MIPS_MSA_VP9_CONVOLVE_MSA_H_ */
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_idct16x16_msa.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_idct16x16_msa.c
new file mode 100644
index 00000000000..e5c0eaa32f3
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_idct16x16_msa.c
@@ -0,0 +1,948 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "vpx_ports/mem.h"
+#include "vp9/common/vp9_idct.h"
+#include "vp9/common/mips/msa/vp9_macros_msa.h"
+
+#define SET_COSPI_PAIR(c0_h, c1_h) ({ \
+ v8i16 out0, r0_m, r1_m; \
+ \
+ r0_m = __msa_fill_h(c0_h); \
+ r1_m = __msa_fill_h(c1_h); \
+ out0 = __msa_ilvev_h(r1_m, r0_m); \
+ \
+ out0; \
+})
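+
+/* SET_COSPI_PAIR broadcasts the halfword pattern { c0, c1, c0, c1, ... }
+ * consumed by the dot-product rotation macros below. */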
+
+#define DOTP_CONST_PAIR(reg0, reg1, const0, const1, out0, out1) { \
+ v8i16 k0_m = __msa_fill_h(const0); \
+ v8i16 s0_m, s1_m, s2_m, s3_m; \
+ \
+ s0_m = __msa_fill_h(const1); \
+ k0_m = __msa_ilvev_h(s0_m, k0_m); \
+ \
+ s0_m = __msa_ilvl_h(-reg1, reg0); \
+ s1_m = __msa_ilvr_h(-reg1, reg0); \
+ s2_m = __msa_ilvl_h(reg0, reg1); \
+ s3_m = __msa_ilvr_h(reg0, reg1); \
+ s1_m = (v8i16)__msa_dotp_s_w(s1_m, k0_m); \
+ s0_m = (v8i16)__msa_dotp_s_w(s0_m, k0_m); \
+ s1_m = (v8i16)__msa_srari_w((v4i32)s1_m, DCT_CONST_BITS); \
+ s0_m = (v8i16)__msa_srari_w((v4i32)s0_m, DCT_CONST_BITS); \
+ out0 = __msa_pckev_h(s0_m, s1_m); \
+ \
+ s1_m = (v8i16)__msa_dotp_s_w(s3_m, k0_m); \
+ s0_m = (v8i16)__msa_dotp_s_w(s2_m, k0_m); \
+ s1_m = (v8i16)__msa_srari_w((v4i32)s1_m, DCT_CONST_BITS); \
+ s0_m = (v8i16)__msa_srari_w((v4i32)s0_m, DCT_CONST_BITS); \
+ out1 = __msa_pckev_h(s0_m, s1_m); \
+}
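+
+/* DOTP_CONST_PAIR evaluates, lane-wise and rounded at DCT_CONST_BITS:
+ *   out0 = round(reg0 * const0 - reg1 * const1)
+ *   out1 = round(reg1 * const0 + reg0 * const1)
+ * i.e. the pairwise rotation of the fixed-point inverse DCT. */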
+
+#define VP9_MADD_SHORT(m0, m1, c0, c1, res0, res1) { \
+ v4i32 madd0_m, madd1_m, madd2_m, madd3_m; \
+ v8i16 madd_s0_m, madd_s1_m; \
+ \
+ ILV_H_LR_SH(m0, m1, madd_s1_m, madd_s0_m); \
+ \
+ DOTP_S_W_4VECS_SW(madd_s0_m, c0, madd_s1_m, c0, \
+ madd_s0_m, c1, madd_s1_m, c1, \
+ madd0_m, madd1_m, madd2_m, madd3_m); \
+ \
+ SRARI_W_4VECS_SW(madd0_m, madd1_m, madd2_m, madd3_m, \
+ madd0_m, madd1_m, madd2_m, madd3_m, \
+ DCT_CONST_BITS); \
+ \
+ PCKEV_H_2VECS_SH(madd1_m, madd0_m, madd3_m, madd2_m, \
+ res0, res1); \
+}
+
+#define VP9_MADD_BF(inp0, inp1, inp2, inp3, \
+ cst0, cst1, cst2, cst3, \
+ out0, out1, out2, out3) { \
+ v8i16 madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m; \
+ v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
+ v4i32 m4_m, m5_m; \
+ \
+ ILV_H_LRLR_SH(inp0, inp1, inp2, inp3, \
+ madd_s1_m, madd_s0_m, madd_s3_m, madd_s2_m); \
+ \
+ DOTP_S_W_4VECS_SW(madd_s0_m, cst0, madd_s1_m, cst0, \
+ madd_s2_m, cst2, madd_s3_m, cst2, \
+ tmp0_m, tmp1_m, tmp2_m, tmp3_m); \
+ \
+ m4_m = tmp0_m + tmp2_m; \
+ m5_m = tmp1_m + tmp3_m; \
+ tmp3_m = tmp1_m - tmp3_m; \
+ tmp2_m = tmp0_m - tmp2_m; \
+ \
+ SRARI_W_4VECS_SW(m4_m, m5_m, tmp2_m, tmp3_m, \
+ m4_m, m5_m, tmp2_m, tmp3_m, \
+ DCT_CONST_BITS); \
+ \
+ PCKEV_H_2VECS_SH(m5_m, m4_m, tmp3_m, tmp2_m, out0, out1); \
+ \
+ DOTP_S_W_4VECS_SW(madd_s0_m, cst1, madd_s1_m, cst1, \
+ madd_s2_m, cst3, madd_s3_m, cst3, \
+ tmp0_m, tmp1_m, tmp2_m, tmp3_m); \
+ \
+ m4_m = tmp0_m + tmp2_m; \
+ m5_m = tmp1_m + tmp3_m; \
+ tmp3_m = tmp1_m - tmp3_m; \
+ tmp2_m = tmp0_m - tmp2_m; \
+ \
+ SRARI_W_4VECS_SW(m4_m, m5_m, tmp2_m, tmp3_m, \
+ m4_m, m5_m, tmp2_m, tmp3_m, \
+ DCT_CONST_BITS); \
+ \
+ PCKEV_H_2VECS_SH(m5_m, m4_m, tmp3_m, tmp2_m, out2, out3); \
+}
+
+#define TRANSPOSE8x8_H1(in0, in1, in2, in3, \
+ in4, in5, in6, in7, \
+ out0, out1, out2, out3, \
+ out4, out5, out6, out7) { \
+ v8i16 loc0_m, loc1_m; \
+ v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
+ v8i16 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \
+ \
+ loc0_m = __msa_ilvr_h((in6), (in4)); \
+ loc1_m = __msa_ilvr_h((in7), (in5)); \
+ tmp0_m = __msa_ilvr_h(loc1_m, loc0_m); \
+ tmp1_m = __msa_ilvl_h(loc1_m, loc0_m); \
+ \
+ loc0_m = __msa_ilvl_h((in6), (in4)); \
+ loc1_m = __msa_ilvl_h((in7), (in5)); \
+ tmp2_m = __msa_ilvr_h(loc1_m, loc0_m); \
+ tmp3_m = __msa_ilvl_h(loc1_m, loc0_m); \
+ \
+ loc0_m = __msa_ilvr_h((in2), (in0)); \
+ loc1_m = __msa_ilvr_h((in3), (in1)); \
+ tmp4_m = __msa_ilvr_h(loc1_m, loc0_m); \
+ tmp5_m = __msa_ilvl_h(loc1_m, loc0_m); \
+ \
+ loc0_m = __msa_ilvl_h((in2), (in0)); \
+ loc1_m = __msa_ilvl_h((in3), (in1)); \
+ tmp6_m = __msa_ilvr_h(loc1_m, loc0_m); \
+ tmp7_m = __msa_ilvl_h(loc1_m, loc0_m); \
+ \
+ out0 = (v8i16)__msa_pckev_d((v2i64)tmp0_m, (v2i64)tmp4_m); \
+ out1 = (v8i16)__msa_pckod_d((v2i64)tmp0_m, (v2i64)tmp4_m); \
+ out2 = (v8i16)__msa_pckev_d((v2i64)tmp1_m, (v2i64)tmp5_m); \
+ out3 = (v8i16)__msa_pckod_d((v2i64)tmp1_m, (v2i64)tmp5_m); \
+ out4 = (v8i16)__msa_pckev_d((v2i64)tmp2_m, (v2i64)tmp6_m); \
+ out5 = (v8i16)__msa_pckod_d((v2i64)tmp2_m, (v2i64)tmp6_m); \
+ out6 = (v8i16)__msa_pckev_d((v2i64)tmp3_m, (v2i64)tmp7_m); \
+ out7 = (v8i16)__msa_pckod_d((v2i64)tmp3_m, (v2i64)tmp7_m); \
+}
+
+#define VP9_IADST8x16_1D(r0, r1, r2, r3, r4, r5, r6, r7, \
+ r8, r9, r10, r11, r12, r13, r14, r15, \
+ out0, out1, out2, out3, out4, out5, out6, out7, \
+ out8, out9, out10, out11, \
+ out12, out13, out14, out15) { \
+ v8i16 g0_m, g1_m, g2_m, g3_m, g4_m, g5_m, g6_m, g7_m; \
+ v8i16 g8_m, g9_m, g10_m, g11_m, g12_m, g13_m, g14_m, g15_m; \
+ v8i16 h0_m, h1_m, h2_m, h3_m, h4_m, h5_m, h6_m, h7_m; \
+ v8i16 h8_m, h9_m, h10_m, h11_m; \
+ v8i16 k0_m, k1_m, k2_m, k3_m; \
+ \
+ /* stage 1 */ \
+ k0_m = SET_COSPI_PAIR(cospi_1_64, cospi_31_64); \
+ k1_m = SET_COSPI_PAIR(cospi_31_64, -cospi_1_64); \
+ k2_m = SET_COSPI_PAIR(cospi_17_64, cospi_15_64); \
+ k3_m = SET_COSPI_PAIR(cospi_15_64, -cospi_17_64); \
+ VP9_MADD_BF(r15, r0, r7, r8, k0_m, k1_m, k2_m, k3_m, \
+ g0_m, g1_m, g2_m, g3_m); \
+ \
+ k0_m = SET_COSPI_PAIR(cospi_5_64, cospi_27_64); \
+ k1_m = SET_COSPI_PAIR(cospi_27_64, -cospi_5_64); \
+ k2_m = SET_COSPI_PAIR(cospi_21_64, cospi_11_64); \
+ k3_m = SET_COSPI_PAIR(cospi_11_64, -cospi_21_64); \
+ VP9_MADD_BF(r13, r2, r5, r10, k0_m, k1_m, k2_m, k3_m, \
+ g4_m, g5_m, g6_m, g7_m); \
+ \
+ k0_m = SET_COSPI_PAIR(cospi_9_64, cospi_23_64); \
+ k1_m = SET_COSPI_PAIR(cospi_23_64, -cospi_9_64); \
+ k2_m = SET_COSPI_PAIR(cospi_25_64, cospi_7_64); \
+ k3_m = SET_COSPI_PAIR(cospi_7_64, -cospi_25_64); \
+ VP9_MADD_BF(r11, r4, r3, r12, k0_m, k1_m, k2_m, k3_m, \
+ g8_m, g9_m, g10_m, g11_m); \
+ \
+ k0_m = SET_COSPI_PAIR(cospi_13_64, cospi_19_64); \
+ k1_m = SET_COSPI_PAIR(cospi_19_64, -cospi_13_64); \
+ k2_m = SET_COSPI_PAIR(cospi_29_64, cospi_3_64); \
+ k3_m = SET_COSPI_PAIR(cospi_3_64, -cospi_29_64); \
+ VP9_MADD_BF(r9, r6, r1, r14, k0_m, k1_m, k2_m, k3_m, \
+ g12_m, g13_m, g14_m, g15_m); \
+ \
+ /* stage 2 */ \
+ k0_m = SET_COSPI_PAIR(cospi_4_64, cospi_28_64); \
+ k1_m = SET_COSPI_PAIR(cospi_28_64, -cospi_4_64); \
+ k2_m = SET_COSPI_PAIR(-cospi_28_64, cospi_4_64); \
+ VP9_MADD_BF(g1_m, g3_m, g9_m, g11_m, k0_m, k1_m, k2_m, k0_m, \
+ h0_m, h1_m, h2_m, h3_m); \
+ \
+ k0_m = SET_COSPI_PAIR(cospi_12_64, cospi_20_64); \
+ k1_m = SET_COSPI_PAIR(-cospi_20_64, cospi_12_64); \
+ k2_m = SET_COSPI_PAIR(cospi_20_64, -cospi_12_64); \
+ VP9_MADD_BF(g7_m, g5_m, g15_m, g13_m, k0_m, k1_m, k2_m, k0_m, \
+ h4_m, h5_m, h6_m, h7_m); \
+ \
+ BUTTERFLY_4(h0_m, h2_m, h6_m, h4_m, out8, out9, out11, out10); \
+ \
+ BUTTERFLY_8(g0_m, g2_m, g4_m, g6_m, g14_m, g12_m, g10_m, g8_m, \
+ h8_m, h9_m, h10_m, h11_m, h6_m, h4_m, h2_m, h0_m); \
+ \
+ /* stage 3 */ \
+ BUTTERFLY_4(h8_m, h9_m, h11_m, h10_m, out0, out1, h11_m, h10_m); \
+ \
+ k0_m = SET_COSPI_PAIR(cospi_8_64, cospi_24_64); \
+ k1_m = SET_COSPI_PAIR(cospi_24_64, -cospi_8_64); \
+ k2_m = SET_COSPI_PAIR(-cospi_24_64, cospi_8_64); \
+ VP9_MADD_BF(h0_m, h2_m, h4_m, h6_m, k0_m, k1_m, k2_m, k0_m, \
+ out4, out6, out5, out7); \
+ VP9_MADD_BF(h1_m, h3_m, h5_m, h7_m, k0_m, k1_m, k2_m, k0_m, \
+ out12, out14, out13, out15); \
+ \
+ /* stage 4 */ \
+ k0_m = SET_COSPI_PAIR(cospi_16_64, cospi_16_64); \
+ k1_m = SET_COSPI_PAIR(-cospi_16_64, -cospi_16_64); \
+ k2_m = SET_COSPI_PAIR(cospi_16_64, -cospi_16_64); \
+ k3_m = SET_COSPI_PAIR(-cospi_16_64, cospi_16_64); \
+ VP9_MADD_SHORT(h10_m, h11_m, k1_m, k2_m, out2, out3); \
+ VP9_MADD_SHORT(out6, out7, k0_m, k3_m, out6, out7); \
+ VP9_MADD_SHORT(out10, out11, k0_m, k3_m, out10, out11); \
+ VP9_MADD_SHORT(out14, out15, k1_m, k2_m, out14, out15); \
+}
+
+#define VP9_ADDBLK_CLIP_AND_STORE_8_BYTES_4(dest, dest_stride, \
+ in0, in1, in2, in3) { \
+ uint64_t out0_m, out1_m, out2_m, out3_m; \
+ v8i16 res0_m, res1_m, res2_m, res3_m; \
+ v16u8 dest0_m, dest1_m, dest2_m, dest3_m; \
+ v16i8 tmp0_m, tmp1_m; \
+ v16i8 zero_m = { 0 }; \
+ uint8_t *dst_m = (uint8_t *)(dest); \
+ \
+ LOAD_4VECS_UB(dst_m, (dest_stride), \
+ dest0_m, dest1_m, dest2_m, dest3_m); \
+ \
+ res0_m = (v8i16)__msa_ilvr_b(zero_m, (v16i8)dest0_m); \
+ res1_m = (v8i16)__msa_ilvr_b(zero_m, (v16i8)dest1_m); \
+ res2_m = (v8i16)__msa_ilvr_b(zero_m, (v16i8)dest2_m); \
+ res3_m = (v8i16)__msa_ilvr_b(zero_m, (v16i8)dest3_m); \
+ \
+ res0_m += (v8i16)(in0); \
+ res1_m += (v8i16)(in1); \
+ res2_m += (v8i16)(in2); \
+ res3_m += (v8i16)(in3); \
+ \
+ res0_m = CLIP_UNSIGNED_CHAR_H(res0_m); \
+ res1_m = CLIP_UNSIGNED_CHAR_H(res1_m); \
+ res2_m = CLIP_UNSIGNED_CHAR_H(res2_m); \
+ res3_m = CLIP_UNSIGNED_CHAR_H(res3_m); \
+ \
+ tmp0_m = __msa_pckev_b((v16i8)res1_m, (v16i8)res0_m); \
+ tmp1_m = __msa_pckev_b((v16i8)res3_m, (v16i8)res2_m); \
+ \
+ out0_m = __msa_copy_u_d((v2i64)tmp0_m, 0); \
+ out1_m = __msa_copy_u_d((v2i64)tmp0_m, 1); \
+ out2_m = __msa_copy_u_d((v2i64)tmp1_m, 0); \
+ out3_m = __msa_copy_u_d((v2i64)tmp1_m, 1); \
+ \
+ STORE_DWORD(dst_m, out0_m); \
+ dst_m += (dest_stride); \
+ STORE_DWORD(dst_m, out1_m); \
+ dst_m += (dest_stride); \
+ STORE_DWORD(dst_m, out2_m); \
+ dst_m += (dest_stride); \
+ STORE_DWORD(dst_m, out3_m); \
+}
+
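+/* Each 1-D pass below follows the standard fixed-point idct16 flow:
+ * rotate coefficient pairs with DOTP_CONST_PAIR, combine the results
+ * with butterflies, and round at DCT_CONST_BITS between stages. */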
+void vp9_idct16_1d_rows_msa(const int16_t *input, int16_t *output) {
+ v8i16 loc0, loc1, loc2, loc3;
+ v8i16 reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14;
+ v8i16 reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15;
+ v8i16 tmp5, tmp6, tmp7;
+
+ /* load left top 8x8 */
+ LOAD_8VECS_SH(input, 16, reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7);
+
+ /* load right top 8x8 */
+ LOAD_8VECS_SH((input + 8), 16,
+ reg8, reg9, reg10, reg11, reg12, reg13, reg14, reg15);
+
+ /* transpose block */
+ TRANSPOSE8x8_H1(reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7,
+ reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7);
+
+ /* transpose block */
+ TRANSPOSE8x8_H1(reg8, reg9, reg10, reg11, reg12, reg13, reg14, reg15,
+ reg8, reg9, reg10, reg11, reg12, reg13, reg14, reg15);
+
+ DOTP_CONST_PAIR(reg2, reg14, cospi_28_64, cospi_4_64, reg2, reg14);
+ DOTP_CONST_PAIR(reg10, reg6, cospi_12_64, cospi_20_64, reg10, reg6);
+
+ loc0 = reg2 + reg10;
+ reg2 = reg2 - reg10;
+ loc1 = reg14 + reg6;
+ reg14 = reg14 - reg6;
+
+ DOTP_CONST_PAIR(reg14, reg2, cospi_16_64, cospi_16_64, loc2, loc3);
+ DOTP_CONST_PAIR(reg0, reg8, cospi_16_64, cospi_16_64, reg0, reg8);
+ DOTP_CONST_PAIR(reg4, reg12, cospi_24_64, cospi_8_64, reg4, reg12);
+
+ reg14 = reg8 - reg12;
+ reg2 = reg8 + reg12;
+ reg10 = reg0 - reg4;
+ reg6 = reg0 + reg4;
+
+ reg0 = reg2 - loc1;
+ reg2 = reg2 + loc1;
+ reg12 = reg14 - loc0;
+ reg14 = reg14 + loc0;
+ reg4 = reg6 - loc3;
+ reg6 = reg6 + loc3;
+ reg8 = reg10 - loc2;
+ reg10 = reg10 + loc2;
+
+ /* stage 2 */
+ DOTP_CONST_PAIR(reg1, reg15, cospi_30_64, cospi_2_64, reg1, reg15);
+ DOTP_CONST_PAIR(reg9, reg7, cospi_14_64, cospi_18_64, loc2, loc3);
+
+ reg9 = reg1 - loc2;
+ reg1 = reg1 + loc2;
+ reg7 = reg15 - loc3;
+ reg15 = reg15 + loc3;
+
+ DOTP_CONST_PAIR(reg5, reg11, cospi_22_64, cospi_10_64, reg5, reg11);
+ DOTP_CONST_PAIR(reg13, reg3, cospi_6_64, cospi_26_64, loc0, loc1);
+
+ reg13 = loc0 + reg5;
+ reg5 = loc0 - reg5;
+ reg3 = loc1 + reg11;
+ reg11 = loc1 - reg11;
+
+ loc1 = reg15 + reg3;
+ reg3 = reg15 - reg3;
+ loc2 = reg2 + loc1;
+ reg15 = reg2 - loc1;
+
+ loc1 = reg1 + reg13;
+ reg13 = reg1 - reg13;
+ loc0 = reg0 + loc1;
+ loc1 = reg0 - loc1;
+ tmp6 = loc0;
+ tmp7 = loc1;
+ reg0 = loc2;
+
+ DOTP_CONST_PAIR(reg7, reg9, cospi_24_64, cospi_8_64, reg7, reg9);
+ DOTP_CONST_PAIR((-reg5), (-reg11), cospi_8_64, cospi_24_64, reg5, reg11);
+
+ loc0 = reg9 + reg5;
+ reg5 = reg9 - reg5;
+ reg2 = reg6 + loc0;
+ reg1 = reg6 - loc0;
+
+ loc0 = reg7 + reg11;
+ reg11 = reg7 - reg11;
+ loc1 = reg4 + loc0;
+ loc2 = reg4 - loc0;
+ tmp5 = loc1;
+
+ DOTP_CONST_PAIR(reg5, reg11, cospi_16_64, cospi_16_64, reg5, reg11);
+
+ loc0 = reg8 + reg5;
+ loc1 = reg8 - reg5;
+ reg4 = reg10 + reg11;
+ reg9 = reg10 - reg11;
+ reg10 = loc0;
+ reg11 = loc1;
+
+ DOTP_CONST_PAIR(reg3, reg13, cospi_16_64, cospi_16_64, reg3, reg13);
+
+ reg8 = reg12 + reg3;
+ reg5 = reg12 - reg3;
+ reg6 = reg14 + reg13;
+ reg7 = reg14 - reg13;
+ reg13 = loc2;
+
+ /* Transpose and store the output */
+ reg12 = tmp5;
+ reg14 = tmp6;
+ reg3 = tmp7;
+
+ /* transpose block */
+ TRANSPOSE8x8_H1(reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14,
+ reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14);
+
+ STORE_8VECS_SH(output, 16, reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14);
+
+ /* transpose block */
+ TRANSPOSE8x8_H1(reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15,
+ reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15);
+
+ STORE_8VECS_SH((output + 8), 16,
+ reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15);
+}
+
+void vp9_idct16_1d_columns_addblk_msa(int16_t *input, uint8_t *dest,
+ int32_t dest_stride) {
+ v8i16 loc0, loc1, loc2, loc3;
+ v8i16 reg0, reg2, reg4, reg6, reg8, reg10, reg12, reg14;
+ v8i16 reg3, reg13, reg11, reg5, reg7, reg9, reg1, reg15;
+ v8i16 tmp5, tmp6, tmp7;
+
+ /* load top 8x8 */
+ LOAD_8VECS_SH(input, 16, reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7);
+
+ /* load bottom 8x8 */
+ LOAD_8VECS_SH((input + 8 * 16), 16,
+ reg8, reg9, reg10, reg11, reg12, reg13, reg14, reg15);
+
+ DOTP_CONST_PAIR(reg2, reg14, cospi_28_64, cospi_4_64, reg2, reg14);
+ DOTP_CONST_PAIR(reg10, reg6, cospi_12_64, cospi_20_64, reg10, reg6);
+
+ loc0 = reg2 + reg10;
+ reg2 = reg2 - reg10;
+ loc1 = reg14 + reg6;
+ reg14 = reg14 - reg6;
+
+ DOTP_CONST_PAIR(reg14, reg2, cospi_16_64, cospi_16_64, loc2, loc3);
+ DOTP_CONST_PAIR(reg0, reg8, cospi_16_64, cospi_16_64, reg0, reg8);
+ DOTP_CONST_PAIR(reg4, reg12, cospi_24_64, cospi_8_64, reg4, reg12);
+
+ reg14 = reg8 - reg12;
+ reg2 = reg8 + reg12;
+ reg10 = reg0 - reg4;
+ reg6 = reg0 + reg4;
+
+ reg0 = reg2 - loc1;
+ reg2 = reg2 + loc1;
+ reg12 = reg14 - loc0;
+ reg14 = reg14 + loc0;
+ reg4 = reg6 - loc3;
+ reg6 = reg6 + loc3;
+ reg8 = reg10 - loc2;
+ reg10 = reg10 + loc2;
+
+ /* stage 2 */
+ DOTP_CONST_PAIR(reg1, reg15, cospi_30_64, cospi_2_64, reg1, reg15);
+ DOTP_CONST_PAIR(reg9, reg7, cospi_14_64, cospi_18_64, loc2, loc3);
+
+ reg9 = reg1 - loc2;
+ reg1 = reg1 + loc2;
+ reg7 = reg15 - loc3;
+ reg15 = reg15 + loc3;
+
+ DOTP_CONST_PAIR(reg5, reg11, cospi_22_64, cospi_10_64, reg5, reg11);
+ DOTP_CONST_PAIR(reg13, reg3, cospi_6_64, cospi_26_64, loc0, loc1);
+
+ reg13 = loc0 + reg5;
+ reg5 = loc0 - reg5;
+ reg3 = loc1 + reg11;
+ reg11 = loc1 - reg11;
+
+ loc1 = reg15 + reg3;
+ reg3 = reg15 - reg3;
+ loc2 = reg2 + loc1;
+ reg15 = reg2 - loc1;
+
+ loc1 = reg1 + reg13;
+ reg13 = reg1 - reg13;
+ loc0 = reg0 + loc1;
+ loc1 = reg0 - loc1;
+ tmp6 = loc0;
+ tmp7 = loc1;
+ reg0 = loc2;
+
+ DOTP_CONST_PAIR(reg7, reg9, cospi_24_64, cospi_8_64, reg7, reg9);
+ DOTP_CONST_PAIR((-reg5), (-reg11), cospi_8_64, cospi_24_64, reg5, reg11);
+
+ loc0 = reg9 + reg5;
+ reg5 = reg9 - reg5;
+ reg2 = reg6 + loc0;
+ reg1 = reg6 - loc0;
+
+ loc0 = reg7 + reg11;
+ reg11 = reg7 - reg11;
+ loc1 = reg4 + loc0;
+ loc2 = reg4 - loc0;
+ tmp5 = loc1;
+
+ DOTP_CONST_PAIR(reg5, reg11, cospi_16_64, cospi_16_64, reg5, reg11);
+
+ loc0 = reg8 + reg5;
+ loc1 = reg8 - reg5;
+ reg4 = reg10 + reg11;
+ reg9 = reg10 - reg11;
+ reg10 = loc0;
+ reg11 = loc1;
+
+ DOTP_CONST_PAIR(reg3, reg13, cospi_16_64, cospi_16_64, reg3, reg13);
+
+ reg8 = reg12 + reg3;
+ reg5 = reg12 - reg3;
+ reg6 = reg14 + reg13;
+ reg7 = reg14 - reg13;
+ reg13 = loc2;
+
+ /* restore the saved terms, then round, add the prediction and store */
+ reg12 = tmp5;
+ reg14 = tmp6;
+ reg3 = tmp7;
+
+ SRARI_H_4VECS_SH(reg0, reg2, reg4, reg6, reg0, reg2, reg4, reg6, 6);
+ VP9_ADDBLK_CLIP_AND_STORE_8_BYTES_4(dest, dest_stride,
+ reg0, reg2, reg4, reg6);
+ SRARI_H_4VECS_SH(reg8, reg10, reg12, reg14, reg8, reg10, reg12, reg14, 6);
+ VP9_ADDBLK_CLIP_AND_STORE_8_BYTES_4((dest + (4 * dest_stride)),
+ dest_stride, reg8, reg10, reg12, reg14);
+ SRARI_H_4VECS_SH(reg3, reg13, reg11, reg5, reg3, reg13, reg11, reg5, 6);
+ VP9_ADDBLK_CLIP_AND_STORE_8_BYTES_4((dest + (8 * dest_stride)),
+ dest_stride, reg3, reg13, reg11, reg5);
+ SRARI_H_4VECS_SH(reg7, reg9, reg1, reg15, reg7, reg9, reg1, reg15, 6);
+ VP9_ADDBLK_CLIP_AND_STORE_8_BYTES_4((dest + (12 * dest_stride)),
+ dest_stride, reg7, reg9, reg1, reg15);
+}
+
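+/* The 16x16 inverse DCT is separable: two row passes transform the top
+ * and bottom 16x8 halves into the intermediate buffer, then two column
+ * passes transform the left and right 8x16 halves and add the result
+ * into the prediction in dest. */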
+void vp9_idct16x16_256_add_msa(const int16_t *input, uint8_t *dest,
+ int32_t dest_stride) {
+ int32_t i;
+ DECLARE_ALIGNED(32, int16_t, out_arr[16 * 16]);
+ int16_t *out = out_arr;
+
+ /* transform rows */
+ for (i = 0; i < 2; ++i) {
+ /* process 16 * 8 block */
+ vp9_idct16_1d_rows_msa((input + (i << 7)), (out + (i << 7)));
+ }
+
+ /* transform columns */
+ for (i = 0; i < 2; ++i) {
+ /* process 8 * 16 block */
+ vp9_idct16_1d_columns_addblk_msa((out + (i << 3)), (dest + (i << 3)),
+ dest_stride);
+ }
+}
+
+void vp9_idct16x16_10_add_msa(const int16_t *input, uint8_t *dest,
+ int32_t dest_stride) {
+ uint8_t i;
+ DECLARE_ALIGNED(32, int16_t, out_arr[16 * 16]);
+ int16_t *out = out_arr;
+
+ /* process 16 * 8 block */
+ vp9_idct16_1d_rows_msa(input, out);
+
+ /* the 10-coefficient case yields valid data only in the top 4 rows, so
+ * zero the remaining 12 rows before the column transform */
+ out += 4 * 16;
+ for (i = 12; i--;) {
+ __asm__ __volatile__ (
+ "sw $zero, 0(%[out]) \n\t"
+ "sw $zero, 4(%[out]) \n\t"
+ "sw $zero, 8(%[out]) \n\t"
+ "sw $zero, 12(%[out]) \n\t"
+ "sw $zero, 16(%[out]) \n\t"
+ "sw $zero, 20(%[out]) \n\t"
+ "sw $zero, 24(%[out]) \n\t"
+ "sw $zero, 28(%[out]) \n\t"
+
+ :
+ : [out] "r" (out)
+ );
+
+ out += 16;
+ }
+
+ out = out_arr;
+
+ /* transform columns */
+ for (i = 0; i < 2; ++i) {
+ /* process 8 * 16 block */
+ vp9_idct16_1d_columns_addblk_msa((out + (i << 3)), (dest + (i << 3)),
+ dest_stride);
+ }
+}
+
+void vp9_idct16x16_1_add_msa(const int16_t *input, uint8_t *dest,
+ int32_t dest_stride) {
+ uint8_t i;
+ int32_t const1;
+ int16_t out;
+ v8i16 const2, res0, res1, res2, res3, res4, res5, res6, res7;
+ v16u8 dest0, dest1, dest2, dest3;
+ v16u8 tmp0, tmp1, tmp2, tmp3;
+ v16i8 zero = { 0 };
+
+ out = dct_const_round_shift(input[0] * cospi_16_64);
+ out = dct_const_round_shift(out * cospi_16_64);
+ const1 = ROUND_POWER_OF_TWO(out, 6);
+
+ const2 = __msa_fill_h(const1);
+
+ for (i = 0; i < 4; ++i) {
+ LOAD_4VECS_UB(dest, dest_stride, dest0, dest1, dest2, dest3);
+
+ res0 = (v8i16)__msa_ilvr_b(zero, (v16i8)dest0);
+ res1 = (v8i16)__msa_ilvr_b(zero, (v16i8)dest1);
+ res2 = (v8i16)__msa_ilvr_b(zero, (v16i8)dest2);
+ res3 = (v8i16)__msa_ilvr_b(zero, (v16i8)dest3);
+ res4 = (v8i16)__msa_ilvl_b(zero, (v16i8)dest0);
+ res5 = (v8i16)__msa_ilvl_b(zero, (v16i8)dest1);
+ res6 = (v8i16)__msa_ilvl_b(zero, (v16i8)dest2);
+ res7 = (v8i16)__msa_ilvl_b(zero, (v16i8)dest3);
+
+ res0 += const2;
+ res1 += const2;
+ res2 += const2;
+ res3 += const2;
+ res4 += const2;
+ res5 += const2;
+ res6 += const2;
+ res7 += const2;
+
+ res0 = CLIP_UNSIGNED_CHAR_H(res0);
+ res1 = CLIP_UNSIGNED_CHAR_H(res1);
+ res2 = CLIP_UNSIGNED_CHAR_H(res2);
+ res3 = CLIP_UNSIGNED_CHAR_H(res3);
+ res4 = CLIP_UNSIGNED_CHAR_H(res4);
+ res5 = CLIP_UNSIGNED_CHAR_H(res5);
+ res6 = CLIP_UNSIGNED_CHAR_H(res6);
+ res7 = CLIP_UNSIGNED_CHAR_H(res7);
+
+ tmp0 = (v16u8)__msa_pckev_b((v16i8)res4, (v16i8)res0);
+ tmp1 = (v16u8)__msa_pckev_b((v16i8)res5, (v16i8)res1);
+ tmp2 = (v16u8)__msa_pckev_b((v16i8)res6, (v16i8)res2);
+ tmp3 = (v16u8)__msa_pckev_b((v16i8)res7, (v16i8)res3);
+
+ STORE_4VECS_UB(dest, dest_stride, tmp0, tmp1, tmp2, tmp3);
+ dest += (4 * dest_stride);
+ }
+}
+
+static void vp9_iadst16_1d_rows_msa(const int16_t *input, int16_t *output) {
+ v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15;
+ v8i16 l0, l1, l2, l3, l4, l5, l6, l7, l8, l9, l10, l11, l12, l13, l14, l15;
+
+ /* load input data */
+ LOAD_16VECS_SH(input, 8,
+ l0, l8, l1, l9, l2, l10, l3, l11,
+ l4, l12, l5, l13, l6, l14, l7, l15);
+
+ TRANSPOSE8x8_H_SH(l0, l1, l2, l3, l4, l5, l6, l7,
+ l0, l1, l2, l3, l4, l5, l6, l7);
+
+ TRANSPOSE8x8_H_SH(l8, l9, l10, l11, l12, l13, l14, l15,
+ l8, l9, l10, l11, l12, l13, l14, l15);
+
+ /* horizontal ADST pass */
+ VP9_IADST8x16_1D(l0, l1, l2, l3, l4, l5, l6, l7,
+ l8, l9, l10, l11, l12, l13, l14, l15,
+ r0, r1, r2, r3, r4, r5, r6, r7,
+ r8, r9, r10, r11, r12, r13, r14, r15);
+
+ l1 = -r8;
+ l3 = -r4;
+ l13 = -r13;
+ l15 = -r1;
+
+ TRANSPOSE8x8_H_SH(r0, l1, r12, l3, r6, r14, r10, r2,
+ l0, l1, l2, l3, l4, l5, l6, l7);
+
+ STORE_8VECS_SH(output, 16, l0, l1, l2, l3, l4, l5, l6, l7);
+
+ TRANSPOSE8x8_H_SH(r3, r11, r15, r7, r5, l13, r9, l15,
+ l8, l9, l10, l11, l12, l13, l14, l15);
+
+ STORE_8VECS_SH((output + 8), 16, l8, l9, l10, l11, l12, l13, l14, l15);
+}
+
+static void vp9_iadst16_1d_columns_addblk_msa(int16_t *input, uint8_t *dest,
+ int32_t dest_stride) {
+ v8i16 v0, v2, v4, v6, k0, k1, k2, k3;
+ v8i16 r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15;
+ v8i16 out0, out1, out2, out3, out4, out5, out6, out7;
+ v8i16 out8, out9, out10, out11, out12, out13, out14, out15;
+ v8i16 g0, g1, g2, g3, g4, g5, g6, g7, g8, g9, g10, g11, g12, g13, g14, g15;
+ v8i16 h0, h1, h2, h3, h4, h5, h6, h7, h8, h9, h10, h11;
+ v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
+ v8i16 res8, res9, res10, res11, res12, res13, res14, res15;
+ v16u8 dest0, dest1, dest2, dest3, dest4, dest5, dest6, dest7;
+ v16u8 dest8, dest9, dest10, dest11, dest12, dest13, dest14, dest15;
+ v16i8 zero = { 0 };
+
+ r0 = LOAD_SH(input + 0 * 16);
+ r3 = LOAD_SH(input + 3 * 16);
+ r4 = LOAD_SH(input + 4 * 16);
+ r7 = LOAD_SH(input + 7 * 16);
+ r8 = LOAD_SH(input + 8 * 16);
+ r11 = LOAD_SH(input + 11 * 16);
+ r12 = LOAD_SH(input + 12 * 16);
+ r15 = LOAD_SH(input + 15 * 16);
+
+ /* stage 1 */
+ k0 = SET_COSPI_PAIR(cospi_1_64, cospi_31_64);
+ k1 = SET_COSPI_PAIR(cospi_31_64, -cospi_1_64);
+ k2 = SET_COSPI_PAIR(cospi_17_64, cospi_15_64);
+ k3 = SET_COSPI_PAIR(cospi_15_64, -cospi_17_64);
+ VP9_MADD_BF(r15, r0, r7, r8, k0, k1, k2, k3, g0, g1, g2, g3);
+
+ k0 = SET_COSPI_PAIR(cospi_9_64, cospi_23_64);
+ k1 = SET_COSPI_PAIR(cospi_23_64, -cospi_9_64);
+ k2 = SET_COSPI_PAIR(cospi_25_64, cospi_7_64);
+ k3 = SET_COSPI_PAIR(cospi_7_64, -cospi_25_64);
+ VP9_MADD_BF(r11, r4, r3, r12, k0, k1, k2, k3, g8, g9, g10, g11);
+
+ BUTTERFLY_4(g0, g2, g10, g8, h8, h9, v2, v0);
+
+ k0 = SET_COSPI_PAIR(cospi_4_64, cospi_28_64);
+ k1 = SET_COSPI_PAIR(cospi_28_64, -cospi_4_64);
+ k2 = SET_COSPI_PAIR(-cospi_28_64, cospi_4_64);
+ VP9_MADD_BF(g1, g3, g9, g11, k0, k1, k2, k0, h0, h1, h2, h3);
+
+ r1 = LOAD_SH(input + 1 * 16);
+ r2 = LOAD_SH(input + 2 * 16);
+ r5 = LOAD_SH(input + 5 * 16);
+ r6 = LOAD_SH(input + 6 * 16);
+ r9 = LOAD_SH(input + 9 * 16);
+ r10 = LOAD_SH(input + 10 * 16);
+ r13 = LOAD_SH(input + 13 * 16);
+ r14 = LOAD_SH(input + 14 * 16);
+
+ k0 = SET_COSPI_PAIR(cospi_5_64, cospi_27_64);
+ k1 = SET_COSPI_PAIR(cospi_27_64, -cospi_5_64);
+ k2 = SET_COSPI_PAIR(cospi_21_64, cospi_11_64);
+ k3 = SET_COSPI_PAIR(cospi_11_64, -cospi_21_64);
+ VP9_MADD_BF(r13, r2, r5, r10, k0, k1, k2, k3, g4, g5, g6, g7);
+
+ k0 = SET_COSPI_PAIR(cospi_13_64, cospi_19_64);
+ k1 = SET_COSPI_PAIR(cospi_19_64, -cospi_13_64);
+ k2 = SET_COSPI_PAIR(cospi_29_64, cospi_3_64);
+ k3 = SET_COSPI_PAIR(cospi_3_64, -cospi_29_64);
+ VP9_MADD_BF(r9, r6, r1, r14, k0, k1, k2, k3, g12, g13, g14, g15);
+
+ BUTTERFLY_4(g4, g6, g14, g12, h10, h11, v6, v4);
+
+ BUTTERFLY_4(h8, h9, h11, h10, out0, out1, h11, h10);
+ out1 = -out1;
+ out0 = __msa_srari_h(out0, 6);
+ out1 = __msa_srari_h(out1, 6);
+ dest0 = LOAD_UB(dest + 0 * dest_stride);
+ dest1 = LOAD_UB(dest + 15 * dest_stride);
+ res0 = (v8i16)__msa_ilvr_b(zero, (v16i8)dest0);
+ res1 = (v8i16)__msa_ilvr_b(zero, (v16i8)dest1);
+ res0 += out0;
+ res1 += out1;
+ res0 = CLIP_UNSIGNED_CHAR_H(res0);
+ res1 = CLIP_UNSIGNED_CHAR_H(res1);
+ res0 = (v8i16)__msa_pckev_b((v16i8)res0, (v16i8)res0);
+ res1 = (v8i16)__msa_pckev_b((v16i8)res1, (v16i8)res1);
+ STORE_DWORD(dest, __msa_copy_u_d((v2i64)res0, 0));
+ STORE_DWORD(dest + 15 * dest_stride, __msa_copy_u_d((v2i64)res1, 0));
+
+ k0 = SET_COSPI_PAIR(cospi_12_64, cospi_20_64);
+ k1 = SET_COSPI_PAIR(-cospi_20_64, cospi_12_64);
+ k2 = SET_COSPI_PAIR(cospi_20_64, -cospi_12_64);
+ VP9_MADD_BF(g7, g5, g15, g13, k0, k1, k2, k0, h4, h5, h6, h7);
+
+ BUTTERFLY_4(h0, h2, h6, h4, out8, out9, out11, out10);
+ out8 = -out8;
+
+ out8 = __msa_srari_h(out8, 6);
+ out9 = __msa_srari_h(out9, 6);
+ dest8 = LOAD_UB(dest + 1 * dest_stride);
+ dest9 = LOAD_UB(dest + 14 * dest_stride);
+ res8 = (v8i16)__msa_ilvr_b(zero, (v16i8)dest8);
+ res9 = (v8i16)__msa_ilvr_b(zero, (v16i8)dest9);
+ res8 += out8;
+ res9 += out9;
+ res8 = CLIP_UNSIGNED_CHAR_H(res8);
+ res9 = CLIP_UNSIGNED_CHAR_H(res9);
+ res8 = (v8i16)__msa_pckev_b((v16i8)res8, (v16i8)res8);
+ res9 = (v8i16)__msa_pckev_b((v16i8)res9, (v16i8)res9);
+ STORE_DWORD(dest + dest_stride, __msa_copy_u_d((v2i64)res8, 0));
+ STORE_DWORD(dest + 14 * dest_stride, __msa_copy_u_d((v2i64)res9, 0));
+
+ k0 = SET_COSPI_PAIR(cospi_8_64, cospi_24_64);
+ k1 = SET_COSPI_PAIR(cospi_24_64, -cospi_8_64);
+ k2 = SET_COSPI_PAIR(-cospi_24_64, cospi_8_64);
+ VP9_MADD_BF(v0, v2, v4, v6, k0, k1, k2, k0, out4, out6, out5, out7);
+ out4 = -out4;
+ out4 = __msa_srari_h(out4, 6);
+ out5 = __msa_srari_h(out5, 6);
+ dest4 = LOAD_UB(dest + 3 * dest_stride);
+ dest5 = LOAD_UB(dest + 12 * dest_stride);
+ res4 = (v8i16)__msa_ilvr_b(zero, (v16i8)dest4);
+ res5 = (v8i16)__msa_ilvr_b(zero, (v16i8)dest5);
+ res4 += out4;
+ res5 += out5;
+ res4 = CLIP_UNSIGNED_CHAR_H(res4);
+ res5 = CLIP_UNSIGNED_CHAR_H(res5);
+ res4 = (v8i16)__msa_pckev_b((v16i8)res4, (v16i8)res4);
+ res5 = (v8i16)__msa_pckev_b((v16i8)res5, (v16i8)res5);
+ STORE_DWORD(dest + 3 * dest_stride, __msa_copy_u_d((v2i64)res4, 0));
+ STORE_DWORD(dest + 12 * dest_stride, __msa_copy_u_d((v2i64)res5, 0));
+
+ VP9_MADD_BF(h1, h3, h5, h7, k0, k1, k2, k0, out12, out14, out13, out15);
+ out13 = -out13;
+ out12 = __msa_srari_h(out12, 6);
+ out13 = __msa_srari_h(out13, 6);
+ dest12 = LOAD_UB(dest + 2 * dest_stride);
+ dest13 = LOAD_UB(dest + 13 * dest_stride);
+ res12 = (v8i16)__msa_ilvr_b(zero, (v16i8)dest12);
+ res13 = (v8i16)__msa_ilvr_b(zero, (v16i8)dest13);
+ res12 += out12;
+ res13 += out13;
+ res12 = CLIP_UNSIGNED_CHAR_H(res12);
+ res13 = CLIP_UNSIGNED_CHAR_H(res13);
+ res12 = (v8i16)__msa_pckev_b((v16i8)res12, (v16i8)res12);
+ res13 = (v8i16)__msa_pckev_b((v16i8)res13, (v16i8)res13);
+ STORE_DWORD(dest + 2 * dest_stride, __msa_copy_u_d((v2i64)res12, 0));
+ STORE_DWORD(dest + 13 * dest_stride, __msa_copy_u_d((v2i64)res13, 0));
+
+ k0 = SET_COSPI_PAIR(cospi_16_64, cospi_16_64);
+ k3 = SET_COSPI_PAIR(-cospi_16_64, cospi_16_64);
+ VP9_MADD_SHORT(out6, out7, k0, k3, out6, out7);
+ out6 = __msa_srari_h(out6, 6);
+ out7 = __msa_srari_h(out7, 6);
+ dest6 = LOAD_UB(dest + 4 * dest_stride);
+ dest7 = LOAD_UB(dest + 11 * dest_stride);
+ res6 = (v8i16)__msa_ilvr_b(zero, (v16i8)dest6);
+ res7 = (v8i16)__msa_ilvr_b(zero, (v16i8)dest7);
+ res6 += out6;
+ res7 += out7;
+ res6 = CLIP_UNSIGNED_CHAR_H(res6);
+ res7 = CLIP_UNSIGNED_CHAR_H(res7);
+ res6 = (v8i16)__msa_pckev_b((v16i8)res6, (v16i8)res6);
+ res7 = (v8i16)__msa_pckev_b((v16i8)res7, (v16i8)res7);
+ STORE_DWORD(dest + 4 * dest_stride, __msa_copy_u_d((v2i64)res6, 0));
+ STORE_DWORD(dest + 11 * dest_stride, __msa_copy_u_d((v2i64)res7, 0));
+
+ VP9_MADD_SHORT(out10, out11, k0, k3, out10, out11);
+ out10 = __msa_srari_h(out10, 6);
+ out11 = __msa_srari_h(out11, 6);
+ dest10 = LOAD_UB(dest + 6 * dest_stride);
+ dest11 = LOAD_UB(dest + 9 * dest_stride);
+ res10 = (v8i16)__msa_ilvr_b(zero, (v16i8)dest10);
+ res11 = (v8i16)__msa_ilvr_b(zero, (v16i8)dest11);
+ res10 += out10;
+ res11 += out11;
+ res10 = CLIP_UNSIGNED_CHAR_H(res10);
+ res11 = CLIP_UNSIGNED_CHAR_H(res11);
+ res10 = (v8i16)__msa_pckev_b((v16i8)res10, (v16i8)res10);
+ res11 = (v8i16)__msa_pckev_b((v16i8)res11, (v16i8)res11);
+ STORE_DWORD(dest + 6 * dest_stride, __msa_copy_u_d((v2i64)res10, 0));
+ STORE_DWORD(dest + 9 * dest_stride, __msa_copy_u_d((v2i64)res11, 0));
+
+ k1 = SET_COSPI_PAIR(-cospi_16_64, -cospi_16_64);
+ k2 = SET_COSPI_PAIR(cospi_16_64, -cospi_16_64);
+ VP9_MADD_SHORT(h10, h11, k1, k2, out2, out3);
+ out2 = __msa_srari_h(out2, 6);
+ out3 = __msa_srari_h(out3, 6);
+ dest2 = LOAD_UB(dest + 7 * dest_stride);
+ dest3 = LOAD_UB(dest + 8 * dest_stride);
+ res2 = (v8i16)__msa_ilvr_b(zero, (v16i8)dest2);
+ res3 = (v8i16)__msa_ilvr_b(zero, (v16i8)dest3);
+ res2 += out2;
+ res3 += out3;
+ res2 = CLIP_UNSIGNED_CHAR_H(res2);
+ res3 = CLIP_UNSIGNED_CHAR_H(res3);
+ res2 = (v8i16)__msa_pckev_b((v16i8)res2, (v16i8)res2);
+ res3 = (v8i16)__msa_pckev_b((v16i8)res3, (v16i8)res3);
+ STORE_DWORD(dest + 7 * dest_stride, __msa_copy_u_d((v2i64)res2, 0));
+ STORE_DWORD(dest + 8 * dest_stride, __msa_copy_u_d((v2i64)res3, 0));
+
+ VP9_MADD_SHORT(out14, out15, k1, k2, out14, out15);
+ out14 = __msa_srari_h(out14, 6);
+ out15 = __msa_srari_h(out15, 6);
+ dest14 = LOAD_UB(dest + 5 * dest_stride);
+ dest15 = LOAD_UB(dest + 10 * dest_stride);
+ res14 = (v8i16)__msa_ilvr_b(zero, (v16i8)dest14);
+ res15 = (v8i16)__msa_ilvr_b(zero, (v16i8)dest15);
+ res14 += out14;
+ res15 += out15;
+ res14 = CLIP_UNSIGNED_CHAR_H(res14);
+ res15 = CLIP_UNSIGNED_CHAR_H(res15);
+ res14 = (v8i16)__msa_pckev_b((v16i8)res14, (v16i8)res14);
+ res15 = (v8i16)__msa_pckev_b((v16i8)res15, (v16i8)res15);
+ STORE_DWORD(dest + 5 * dest_stride, __msa_copy_u_d((v2i64)res14, 0));
+ STORE_DWORD(dest + 10 * dest_stride, __msa_copy_u_d((v2i64)res15, 0));
+}
+
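+/* Hybrid 16x16 inverse transform. tx_type selects DCT or ADST independently
+ * per dimension; rows are transformed first in two 16x8 strips, then the
+ * columns in two 8x16 strips that also add the result into dest.
+ */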
+void vp9_iht16x16_256_add_msa(const int16_t *input, uint8_t *dest,
+ int32_t dest_stride, int32_t tx_type) {
+ int32_t i;
+ DECLARE_ALIGNED(32, int16_t, out[16 * 16]);
+ int16_t *out_ptr = &out[0];
+
+ switch (tx_type) {
+ case DCT_DCT:
+ /* transform rows */
+ for (i = 0; i < 2; ++i) {
+ /* process 16 * 8 block */
+ vp9_idct16_1d_rows_msa((input + (i << 7)), (out_ptr + (i << 7)));
+ }
+
+ /* transform columns */
+ for (i = 0; i < 2; ++i) {
+ /* process 8 * 16 block */
+ vp9_idct16_1d_columns_addblk_msa((out_ptr + (i << 3)),
+ (dest + (i << 3)), dest_stride);
+ }
+ break;
+ case ADST_DCT:
+ /* transform rows */
+ for (i = 0; i < 2; ++i) {
+ /* process 16 * 8 block */
+ vp9_idct16_1d_rows_msa((input + (i << 7)), (out_ptr + (i << 7)));
+ }
+
+ /* transform columns */
+ for (i = 0; i < 2; ++i) {
+ vp9_iadst16_1d_columns_addblk_msa((out_ptr + (i << 3)),
+ (dest + (i << 3)), dest_stride);
+ }
+ break;
+ case DCT_ADST:
+ /* transform rows */
+ for (i = 0; i < 2; ++i) {
+ /* process 16 * 8 block */
+ vp9_iadst16_1d_rows_msa((input + (i << 7)), (out_ptr + (i << 7)));
+ }
+
+ /* transform columns */
+ for (i = 0; i < 2; ++i) {
+ /* process 8 * 16 block */
+ vp9_idct16_1d_columns_addblk_msa((out_ptr + (i << 3)),
+ (dest + (i << 3)), dest_stride);
+ }
+ break;
+ case ADST_ADST:
+ /* transform rows */
+ for (i = 0; i < 2; ++i) {
+ /* process 16 * 8 block */
+ vp9_iadst16_1d_rows_msa((input + (i << 7)), (out_ptr + (i << 7)));
+ }
+
+ /* transform columns */
+ for (i = 0; i < 2; ++i) {
+ vp9_iadst16_1d_columns_addblk_msa((out_ptr + (i << 3)),
+ (dest + (i << 3)), dest_stride);
+ }
+ break;
+ default:
+ assert(0);
+ break;
+ }
+}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_idct32x32_msa.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_idct32x32_msa.c
new file mode 100644
index 00000000000..f576b50ea07
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_idct32x32_msa.c
@@ -0,0 +1,1077 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_ports/mem.h"
+#include "vp9/common/vp9_idct.h"
+#include "vp9/common/mips/msa/vp9_macros_msa.h"
+
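+/* Fixed-point rotation butterfly used throughout the inverse transforms.
+ * Per 16-bit lane:
+ *   out0[i] = ROUND_POWER_OF_TWO(reg0[i] * const0 - reg1[i] * const1,
+ *                                DCT_CONST_BITS);
+ *   out1[i] = ROUND_POWER_OF_TWO(reg1[i] * const0 + reg0[i] * const1,
+ *                                DCT_CONST_BITS);
+ * The two constants are replicated and interleaved once, so each output
+ * needs only a widening dot product and a rounding shift.
+ */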
+#define DOTP_CONST_PAIR(reg0, reg1, const0, const1, out0, out1) { \
+ v8i16 k0_m = __msa_fill_h(const0); \
+ v8i16 s0_m, s1_m, s2_m, s3_m; \
+ \
+ s0_m = __msa_fill_h(const1); \
+ k0_m = __msa_ilvev_h(s0_m, k0_m); \
+ \
+ s0_m = __msa_ilvl_h(-reg1, reg0); \
+ s1_m = __msa_ilvr_h(-reg1, reg0); \
+ s2_m = __msa_ilvl_h(reg0, reg1); \
+ s3_m = __msa_ilvr_h(reg0, reg1); \
+ s1_m = (v8i16)__msa_dotp_s_w(s1_m, k0_m); \
+ s0_m = (v8i16)__msa_dotp_s_w(s0_m, k0_m); \
+ s1_m = (v8i16)__msa_srari_w((v4i32)s1_m, DCT_CONST_BITS); \
+ s0_m = (v8i16)__msa_srari_w((v4i32)s0_m, DCT_CONST_BITS); \
+ out0 = __msa_pckev_h(s0_m, s1_m); \
+ \
+ s1_m = (v8i16)__msa_dotp_s_w(s3_m, k0_m); \
+ s0_m = (v8i16)__msa_dotp_s_w(s2_m, k0_m); \
+ s1_m = (v8i16)__msa_srari_w((v4i32)s1_m, DCT_CONST_BITS); \
+ s0_m = (v8i16)__msa_srari_w((v4i32)s0_m, DCT_CONST_BITS); \
+ out1 = __msa_pckev_h(s0_m, s1_m); \
+}
+
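+/* Reconstruction helper for an 8-pixel-wide strip: adds four residual
+ * vectors to the destination rows at offsets 0, 4, 8 and 12 strides,
+ * clips the sums to [0, 255] and stores them back as 8-byte writes.
+ */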
+#define VP9_ADDBLK_CLIP_AND_STORE_OFF_4H_VECS(dest, dest_stride, \
+ in0, in1, in2, in3) { \
+ uint64_t out0_m, out1_m, out2_m, out3_m; \
+ v8i16 res0_m, res1_m, res2_m, res3_m; \
+ v16u8 dest0_m, dest1_m, dest2_m, dest3_m; \
+ v16i8 tmp0_m, tmp1_m; \
+ v16i8 zero_m = { 0 }; \
+ uint8_t *dst_m = (uint8_t *)(dest); \
+ \
+ dest0_m = LOAD_UB(dst_m); \
+ dest1_m = LOAD_UB(dst_m + 4 * dest_stride); \
+ dest2_m = LOAD_UB(dst_m + 8 * dest_stride); \
+ dest3_m = LOAD_UB(dst_m + 12 * dest_stride); \
+ \
+ res0_m = (v8i16)__msa_ilvr_b(zero_m, (v16i8)dest0_m); \
+ res1_m = (v8i16)__msa_ilvr_b(zero_m, (v16i8)dest1_m); \
+ res2_m = (v8i16)__msa_ilvr_b(zero_m, (v16i8)dest2_m); \
+ res3_m = (v8i16)__msa_ilvr_b(zero_m, (v16i8)dest3_m); \
+ \
+ res0_m += (v8i16)(in0); \
+ res1_m += (v8i16)(in1); \
+ res2_m += (v8i16)(in2); \
+ res3_m += (v8i16)(in3); \
+ \
+ res0_m = CLIP_UNSIGNED_CHAR_H(res0_m); \
+ res1_m = CLIP_UNSIGNED_CHAR_H(res1_m); \
+ res2_m = CLIP_UNSIGNED_CHAR_H(res2_m); \
+ res3_m = CLIP_UNSIGNED_CHAR_H(res3_m); \
+ \
+ tmp0_m = __msa_pckev_b((v16i8)res1_m, (v16i8)res0_m); \
+ tmp1_m = __msa_pckev_b((v16i8)res3_m, (v16i8)res2_m); \
+ \
+ out0_m = __msa_copy_u_d((v2i64)tmp0_m, 0); \
+ out1_m = __msa_copy_u_d((v2i64)tmp0_m, 1); \
+ out2_m = __msa_copy_u_d((v2i64)tmp1_m, 0); \
+ out3_m = __msa_copy_u_d((v2i64)tmp1_m, 1); \
+ \
+ STORE_DWORD(dst_m, out0_m); \
+ dst_m += (4 * dest_stride); \
+ STORE_DWORD(dst_m, out1_m); \
+ dst_m += (4 * dest_stride); \
+ STORE_DWORD(dst_m, out2_m); \
+ dst_m += (4 * dest_stride); \
+ STORE_DWORD(dst_m, out3_m); \
+}
+
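+/* Transpose one 32x8 strip of coefficients into tmp_buf as four 8x8
+ * blocks, so the 1-D passes below can load contiguous 8-element vectors
+ * with a pitch of 8.
+ */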
+static void vp9_idct32x8_row_transpose_store(const int16_t *input,
+ int16_t *tmp_buf) {
+ v8i16 m0, m1, m2, m3, m4, m5, m6, m7;
+ v8i16 n0, n1, n2, n3, n4, n5, n6, n7;
+
+ /* 1st & 2nd 8x8 */
+ LOAD_8VECS_SH(input, 32, m0, n0, m1, n1, m2, n2, m3, n3);
+ LOAD_8VECS_SH((input + 8), 32, m4, n4, m5, n5, m6, n6, m7, n7);
+ TRANSPOSE8x8_H_SH(m0, n0, m1, n1, m2, n2, m3, n3,
+ m0, n0, m1, n1, m2, n2, m3, n3);
+ TRANSPOSE8x8_H_SH(m4, n4, m5, n5, m6, n6, m7, n7,
+ m4, n4, m5, n5, m6, n6, m7, n7);
+ STORE_4VECS_SH((tmp_buf), 8, m0, n0, m1, n1);
+ STORE_4VECS_SH((tmp_buf + 4 * 8), 8, m2, n2, m3, n3);
+ STORE_4VECS_SH((tmp_buf + 8 * 8), 8, m4, n4, m5, n5);
+ STORE_4VECS_SH((tmp_buf + 12 * 8), 8, m6, n6, m7, n7);
+
+ /* 3rd & 4th 8x8 */
+ LOAD_8VECS_SH((input + 16), 32, m0, n0, m1, n1, m2, n2, m3, n3);
+ LOAD_8VECS_SH((input + 24), 32, m4, n4, m5, n5, m6, n6, m7, n7);
+ TRANSPOSE8x8_H_SH(m0, n0, m1, n1, m2, n2, m3, n3,
+ m0, n0, m1, n1, m2, n2, m3, n3);
+ TRANSPOSE8x8_H_SH(m4, n4, m5, n5, m6, n6, m7, n7,
+ m4, n4, m5, n5, m6, n6, m7, n7);
+ STORE_4VECS_SH((tmp_buf + 16 * 8), 8, m0, n0, m1, n1);
+ STORE_4VECS_SH((tmp_buf + 20 * 8), 8, m2, n2, m3, n3);
+ STORE_4VECS_SH((tmp_buf + 24 * 8), 8, m4, n4, m5, n5);
+ STORE_4VECS_SH((tmp_buf + 28 * 8), 8, m6, n6, m7, n7);
+}
+
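+/* Even half of the 32-point 1-D IDCT (row pass): consumes the 16
+ * even-indexed input vectors and leaves 16 intermediate vectors in
+ * tmp_eve_buf.
+ */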
+static void vp9_idct32x8_row_even_process_store(int16_t *tmp_buf,
+ int16_t *tmp_eve_buf) {
+ v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
+ v8i16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+ v8i16 stp0, stp1, stp2, stp3, stp4, stp5, stp6, stp7;
+
+ /* Even stage 1 */
+ LOAD_8VECS_SH(tmp_buf, 32, reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7);
+
+ DOTP_CONST_PAIR(reg1, reg7, cospi_28_64, cospi_4_64, reg1, reg7);
+ DOTP_CONST_PAIR(reg5, reg3, cospi_12_64, cospi_20_64, reg5, reg3);
+
+ vec0 = reg1 - reg5;
+ vec1 = reg1 + reg5;
+ vec2 = reg7 - reg3;
+ vec3 = reg7 + reg3;
+
+ DOTP_CONST_PAIR(vec2, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
+
+ loc1 = vec3;
+ loc0 = vec1;
+
+ DOTP_CONST_PAIR(reg0, reg4, cospi_16_64, cospi_16_64, reg0, reg4);
+ DOTP_CONST_PAIR(reg2, reg6, cospi_24_64, cospi_8_64, reg2, reg6);
+
+ vec0 = reg4 - reg6;
+ vec1 = reg4 + reg6;
+ vec2 = reg0 - reg2;
+ vec3 = reg0 + reg2;
+
+ stp4 = vec0 - loc0;
+ stp3 = vec0 + loc0;
+ stp7 = vec1 - loc1;
+ stp0 = vec1 + loc1;
+ stp5 = vec2 - loc2;
+ stp2 = vec2 + loc2;
+ stp6 = vec3 - loc3;
+ stp1 = vec3 + loc3;
+
+ /* Even stage 2 */
+ LOAD_8VECS_SH((tmp_buf + 16), 32,
+ reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7);
+
+ DOTP_CONST_PAIR(reg0, reg7, cospi_30_64, cospi_2_64, reg0, reg7);
+ DOTP_CONST_PAIR(reg4, reg3, cospi_14_64, cospi_18_64, reg4, reg3);
+ DOTP_CONST_PAIR(reg2, reg5, cospi_22_64, cospi_10_64, reg2, reg5);
+ DOTP_CONST_PAIR(reg6, reg1, cospi_6_64, cospi_26_64, reg6, reg1);
+
+ vec0 = reg0 + reg4;
+ reg0 = reg0 - reg4;
+ reg4 = reg6 + reg2;
+ reg6 = reg6 - reg2;
+ reg2 = reg1 + reg5;
+ reg1 = reg1 - reg5;
+ reg5 = reg7 + reg3;
+ reg7 = reg7 - reg3;
+ reg3 = vec0;
+
+ vec1 = reg2;
+ reg2 = reg3 + reg4;
+ reg3 = reg3 - reg4;
+ reg4 = reg5 - vec1;
+ reg5 = reg5 + vec1;
+
+ DOTP_CONST_PAIR(reg7, reg0, cospi_24_64, cospi_8_64, reg0, reg7);
+ DOTP_CONST_PAIR((-reg6), reg1, cospi_24_64, cospi_8_64, reg6, reg1);
+
+ vec0 = reg0 - reg6;
+ reg0 = reg0 + reg6;
+ vec1 = reg7 - reg1;
+ reg7 = reg7 + reg1;
+
+ DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, reg6, reg1);
+ DOTP_CONST_PAIR(reg4, reg3, cospi_16_64, cospi_16_64, reg3, reg4);
+
+ /* Even stage 3: dependency on Even stage 1 & Even stage 2 */
+ loc0 = stp0 - reg5;
+ loc1 = stp0 + reg5;
+ loc2 = stp1 - reg7;
+ loc3 = stp1 + reg7;
+ STORE_SH(loc0, (tmp_eve_buf + 15 * 8));
+ STORE_SH(loc1, (tmp_eve_buf));
+ STORE_SH(loc2, (tmp_eve_buf + 14 * 8));
+ STORE_SH(loc3, (tmp_eve_buf + 8));
+
+ loc0 = stp2 - reg1;
+ loc1 = stp2 + reg1;
+ loc2 = stp3 - reg4;
+ loc3 = stp3 + reg4;
+ STORE_SH(loc0, (tmp_eve_buf + 13 * 8));
+ STORE_SH(loc1, (tmp_eve_buf + 2 * 8));
+ STORE_SH(loc2, (tmp_eve_buf + 12 * 8));
+ STORE_SH(loc3, (tmp_eve_buf + 3 * 8));
+
+ /* Store 8 */
+ loc0 = stp4 - reg3;
+ loc1 = stp4 + reg3;
+ loc2 = stp5 - reg6;
+ loc3 = stp5 + reg6;
+ STORE_SH(loc0, (tmp_eve_buf + 11 * 8));
+ STORE_SH(loc1, (tmp_eve_buf + 4 * 8));
+ STORE_SH(loc2, (tmp_eve_buf + 10 * 8));
+ STORE_SH(loc3, (tmp_eve_buf + 5 * 8));
+
+ loc0 = stp6 - reg0;
+ loc1 = stp6 + reg0;
+ loc2 = stp7 - reg2;
+ loc3 = stp7 + reg2;
+ STORE_SH(loc0, (tmp_eve_buf + 9 * 8));
+ STORE_SH(loc1, (tmp_eve_buf + 6 * 8));
+ STORE_SH(loc2, (tmp_eve_buf + 8 * 8));
+ STORE_SH(loc3, (tmp_eve_buf + 7 * 8));
+}
+
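+/* Odd half of the 32-point 1-D IDCT (row pass): consumes the 16
+ * odd-indexed input vectors and leaves 16 intermediate vectors in
+ * tmp_odd_buf.
+ */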
+static void vp9_idct32x8_row_odd_process_store(int16_t *tmp_buf,
+ int16_t *tmp_odd_buf) {
+ v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
+ v8i16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+
+ /* Odd stage 1 */
+ reg0 = LOAD_SH(tmp_buf + 8);
+ reg1 = LOAD_SH(tmp_buf + 7 * 8);
+ reg2 = LOAD_SH(tmp_buf + 9 * 8);
+ reg3 = LOAD_SH(tmp_buf + 15 * 8);
+ reg4 = LOAD_SH(tmp_buf + 17 * 8);
+ reg5 = LOAD_SH(tmp_buf + 23 * 8);
+ reg6 = LOAD_SH(tmp_buf + 25 * 8);
+ reg7 = LOAD_SH(tmp_buf + 31 * 8);
+
+ DOTP_CONST_PAIR(reg0, reg7, cospi_31_64, cospi_1_64, reg0, reg7);
+ DOTP_CONST_PAIR(reg4, reg3, cospi_15_64, cospi_17_64, reg3, reg4);
+ DOTP_CONST_PAIR(reg2, reg5, cospi_23_64, cospi_9_64, reg2, reg5);
+ DOTP_CONST_PAIR(reg6, reg1, cospi_7_64, cospi_25_64, reg1, reg6);
+
+ vec0 = reg0 + reg3;
+ reg0 = reg0 - reg3;
+ reg3 = reg7 + reg4;
+ reg7 = reg7 - reg4;
+ reg4 = reg1 + reg2;
+ reg1 = reg1 - reg2;
+ reg2 = reg6 + reg5;
+ reg6 = reg6 - reg5;
+ reg5 = vec0;
+
+ /* 4 Stores */
+ vec0 = reg5 + reg4;
+ vec1 = reg3 + reg2;
+ STORE_SH(vec0, (tmp_odd_buf + 4 * 8));
+ STORE_SH(vec1, (tmp_odd_buf + 5 * 8));
+
+ vec0 = reg5 - reg4;
+ vec1 = reg3 - reg2;
+ DOTP_CONST_PAIR(vec1, vec0, cospi_24_64, cospi_8_64, vec0, vec1);
+ STORE_SH(vec0, (tmp_odd_buf));
+ STORE_SH(vec1, (tmp_odd_buf + 8));
+
+ /* 4 Stores */
+ DOTP_CONST_PAIR(reg7, reg0, cospi_28_64, cospi_4_64, reg0, reg7);
+ DOTP_CONST_PAIR(reg6, reg1, -cospi_4_64, cospi_28_64, reg1, reg6);
+
+ vec0 = reg0 + reg1;
+ vec2 = reg7 - reg6;
+ vec1 = reg7 + reg6;
+ vec3 = reg0 - reg1;
+ STORE_SH(vec0, (tmp_odd_buf + 6 * 8));
+ STORE_SH(vec1, (tmp_odd_buf + 7 * 8));
+
+ DOTP_CONST_PAIR(vec2, vec3, cospi_24_64, cospi_8_64, vec2, vec3);
+ STORE_SH(vec2, (tmp_odd_buf + 2 * 8));
+ STORE_SH(vec3, (tmp_odd_buf + 3 * 8));
+
+ /* Odd stage 2 */
+
+ /* 8 loads */
+ reg0 = LOAD_SH(tmp_buf + 3 * 8);
+ reg1 = LOAD_SH(tmp_buf + 5 * 8);
+ reg2 = LOAD_SH(tmp_buf + 11 * 8);
+ reg3 = LOAD_SH(tmp_buf + 13 * 8);
+ reg4 = LOAD_SH(tmp_buf + 19 * 8);
+ reg5 = LOAD_SH(tmp_buf + 21 * 8);
+ reg6 = LOAD_SH(tmp_buf + 27 * 8);
+ reg7 = LOAD_SH(tmp_buf + 29 * 8);
+
+ DOTP_CONST_PAIR(reg1, reg6, cospi_27_64, cospi_5_64, reg1, reg6);
+ DOTP_CONST_PAIR(reg5, reg2, cospi_11_64, cospi_21_64, reg2, reg5);
+ DOTP_CONST_PAIR(reg3, reg4, cospi_19_64, cospi_13_64, reg3, reg4);
+ DOTP_CONST_PAIR(reg7, reg0, cospi_3_64, cospi_29_64, reg0, reg7);
+
+ /* 4 Stores */
+ vec0 = reg1 - reg2;
+ vec1 = reg6 - reg5;
+ vec2 = reg0 - reg3;
+ vec3 = reg7 - reg4;
+ DOTP_CONST_PAIR(vec1, vec0, cospi_12_64, cospi_20_64, loc0, loc1);
+ DOTP_CONST_PAIR(vec3, vec2, -cospi_20_64, cospi_12_64, loc2, loc3);
+
+ vec2 = loc2 - loc0;
+ vec3 = loc3 - loc1;
+ vec0 = loc2 + loc0;
+ vec1 = loc3 + loc1;
+ STORE_SH(vec0, (tmp_odd_buf + 12 * 8));
+ STORE_SH(vec1, (tmp_odd_buf + 15 * 8));
+
+ DOTP_CONST_PAIR(vec3, vec2, -cospi_8_64, cospi_24_64, vec0, vec1);
+
+ STORE_SH(vec0, (tmp_odd_buf + 10 * 8));
+ STORE_SH(vec1, (tmp_odd_buf + 11 * 8));
+
+ /* 4 Stores */
+ vec0 = reg0 + reg3;
+ vec1 = reg1 + reg2;
+ vec2 = reg6 + reg5;
+ vec3 = reg7 + reg4;
+ reg0 = vec0 + vec1;
+ reg1 = vec3 + vec2;
+ reg2 = vec0 - vec1;
+ reg3 = vec3 - vec2;
+ STORE_SH(reg0, (tmp_odd_buf + 13 * 8));
+ STORE_SH(reg1, (tmp_odd_buf + 14 * 8));
+
+ DOTP_CONST_PAIR(reg3, reg2, -cospi_8_64, cospi_24_64, reg0, reg1);
+
+ STORE_SH(reg0, (tmp_odd_buf + 8 * 8));
+ STORE_SH(reg1, (tmp_odd_buf + 9 * 8));
+
+ /* Odd stage 3: dependency on Odd stage 1 & Odd stage 2 */
+
+ /* Load 8 & Store 8 */
+ reg0 = LOAD_SH(tmp_odd_buf);
+ reg1 = LOAD_SH(tmp_odd_buf + 1 * 8);
+ reg2 = LOAD_SH(tmp_odd_buf + 2 * 8);
+ reg3 = LOAD_SH(tmp_odd_buf + 3 * 8);
+ reg4 = LOAD_SH(tmp_odd_buf + 8 * 8);
+ reg5 = LOAD_SH(tmp_odd_buf + 9 * 8);
+ reg6 = LOAD_SH(tmp_odd_buf + 10 * 8);
+ reg7 = LOAD_SH(tmp_odd_buf + 11 * 8);
+
+ loc0 = reg0 + reg4;
+ loc1 = reg1 + reg5;
+ loc2 = reg2 + reg6;
+ loc3 = reg3 + reg7;
+ STORE_SH(loc0, (tmp_odd_buf));
+ STORE_SH(loc1, (tmp_odd_buf + 1 * 8));
+ STORE_SH(loc2, (tmp_odd_buf + 2 * 8));
+ STORE_SH(loc3, (tmp_odd_buf + 3 * 8));
+
+ vec0 = reg0 - reg4;
+ vec1 = reg1 - reg5;
+ DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1);
+
+ vec0 = reg2 - reg6;
+ vec1 = reg3 - reg7;
+ DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
+
+ STORE_SH(loc0, (tmp_odd_buf + 8 * 8));
+ STORE_SH(loc1, (tmp_odd_buf + 9 * 8));
+ STORE_SH(loc2, (tmp_odd_buf + 10 * 8));
+ STORE_SH(loc3, (tmp_odd_buf + 11 * 8));
+
+ /* Load 8 & Store 8 */
+ reg1 = LOAD_SH(tmp_odd_buf + 4 * 8);
+ reg2 = LOAD_SH(tmp_odd_buf + 5 * 8);
+ reg0 = LOAD_SH(tmp_odd_buf + 6 * 8);
+ reg3 = LOAD_SH(tmp_odd_buf + 7 * 8);
+ reg4 = LOAD_SH(tmp_odd_buf + 12 * 8);
+ reg5 = LOAD_SH(tmp_odd_buf + 13 * 8);
+ reg6 = LOAD_SH(tmp_odd_buf + 14 * 8);
+ reg7 = LOAD_SH(tmp_odd_buf + 15 * 8);
+
+ loc0 = reg0 + reg4;
+ loc1 = reg1 + reg5;
+ loc2 = reg2 + reg6;
+ loc3 = reg3 + reg7;
+ STORE_SH(loc0, (tmp_odd_buf + 4 * 8));
+ STORE_SH(loc1, (tmp_odd_buf + 5 * 8));
+ STORE_SH(loc2, (tmp_odd_buf + 6 * 8));
+ STORE_SH(loc3, (tmp_odd_buf + 7 * 8));
+
+ vec0 = reg0 - reg4;
+ vec1 = reg3 - reg7;
+ DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1);
+
+ vec0 = reg1 - reg5;
+ vec1 = reg2 - reg6;
+ DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
+
+ STORE_SH(loc0, (tmp_odd_buf + 12 * 8));
+ STORE_SH(loc1, (tmp_odd_buf + 13 * 8));
+ STORE_SH(loc2, (tmp_odd_buf + 14 * 8));
+ STORE_SH(loc3, (tmp_odd_buf + 15 * 8));
+}
+
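+/* Final butterfly of the row pass: combines the even and odd halves into
+ * 32 output vectors and transposes them back to row order in the output
+ * buffer (pitch 32).
+ */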
+static void vp9_idct_butterfly_transpose_store(int16_t *tmp_buf,
+ int16_t *tmp_eve_buf,
+ int16_t *tmp_odd_buf,
+ int16_t *dest) {
+ v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
+ v8i16 m0, m1, m2, m3, m4, m5, m6, m7;
+ v8i16 n0, n1, n2, n3, n4, n5, n6, n7;
+
+ /* FINAL BUTTERFLY: dependency on even & odd stages */
+ /* Total: 32 loads, 32 stores */
+ vec0 = LOAD_SH(tmp_odd_buf);
+ vec1 = LOAD_SH(tmp_odd_buf + 9 * 8);
+ vec2 = LOAD_SH(tmp_odd_buf + 14 * 8);
+ vec3 = LOAD_SH(tmp_odd_buf + 6 * 8);
+ loc0 = LOAD_SH(tmp_eve_buf);
+ loc1 = LOAD_SH(tmp_eve_buf + 8 * 8);
+ loc2 = LOAD_SH(tmp_eve_buf + 4 * 8);
+ loc3 = LOAD_SH(tmp_eve_buf + 12 * 8);
+
+ m0 = (loc0 + vec3);
+ STORE_SH((loc0 - vec3), (tmp_buf + 31 * 8));
+ STORE_SH((loc1 - vec2), (tmp_buf + 23 * 8));
+ m4 = (loc1 + vec2);
+ STORE_SH((loc2 - vec1), (tmp_buf + 27 * 8));
+ m2 = (loc2 + vec1);
+ STORE_SH((loc3 - vec0), (tmp_buf + 19 * 8));
+ m6 = (loc3 + vec0);
+
+ /* Load 8 & Store 8 */
+ vec0 = LOAD_SH(tmp_odd_buf + 4 * 8);
+ vec1 = LOAD_SH(tmp_odd_buf + 13 * 8);
+ vec2 = LOAD_SH(tmp_odd_buf + 10 * 8);
+ vec3 = LOAD_SH(tmp_odd_buf + 3 * 8);
+ loc0 = LOAD_SH(tmp_eve_buf + 2 * 8);
+ loc1 = LOAD_SH(tmp_eve_buf + 10 * 8);
+ loc2 = LOAD_SH(tmp_eve_buf + 6 * 8);
+ loc3 = LOAD_SH(tmp_eve_buf + 14 * 8);
+
+ m1 = (loc0 + vec3);
+ STORE_SH((loc0 - vec3), (tmp_buf + 29 * 8));
+ STORE_SH((loc1 - vec2), (tmp_buf + 21 * 8));
+ m5 = (loc1 + vec2);
+ STORE_SH((loc2 - vec1), (tmp_buf + 25 * 8));
+ m3 = (loc2 + vec1);
+ STORE_SH((loc3 - vec0), (tmp_buf + 17 * 8));
+ m7 = (loc3 + vec0);
+
+ /* Load 8 & Store 8 */
+ vec0 = LOAD_SH(tmp_odd_buf + 2 * 8);
+ vec1 = LOAD_SH(tmp_odd_buf + 11 * 8);
+ vec2 = LOAD_SH(tmp_odd_buf + 12 * 8);
+ vec3 = LOAD_SH(tmp_odd_buf + 7 * 8);
+ loc0 = LOAD_SH(tmp_eve_buf + 1 * 8);
+ loc1 = LOAD_SH(tmp_eve_buf + 9 * 8);
+ loc2 = LOAD_SH(tmp_eve_buf + 5 * 8);
+ loc3 = LOAD_SH(tmp_eve_buf + 13 * 8);
+
+ n0 = (loc0 + vec3);
+ STORE_SH((loc0 - vec3), (tmp_buf + 30 * 8));
+ STORE_SH((loc1 - vec2), (tmp_buf + 22 * 8));
+ n4 = (loc1 + vec2);
+ STORE_SH((loc2 - vec1), (tmp_buf + 26 * 8));
+ n2 = (loc2 + vec1);
+ STORE_SH((loc3 - vec0), (tmp_buf + 18 * 8));
+ n6 = (loc3 + vec0);
+
+ /* Load 8 & Store 8 */
+ vec0 = LOAD_SH(tmp_odd_buf + 5 * 8);
+ vec1 = LOAD_SH(tmp_odd_buf + 15 * 8);
+ vec2 = LOAD_SH(tmp_odd_buf + 8 * 8);
+ vec3 = LOAD_SH(tmp_odd_buf + 1 * 8);
+ loc0 = LOAD_SH(tmp_eve_buf + 3 * 8);
+ loc1 = LOAD_SH(tmp_eve_buf + 11 * 8);
+ loc2 = LOAD_SH(tmp_eve_buf + 7 * 8);
+ loc3 = LOAD_SH(tmp_eve_buf + 15 * 8);
+
+ n1 = (loc0 + vec3);
+ STORE_SH((loc0 - vec3), (tmp_buf + 28 * 8));
+ STORE_SH((loc1 - vec2), (tmp_buf + 20 * 8));
+ n5 = (loc1 + vec2);
+ STORE_SH((loc2 - vec1), (tmp_buf + 24 * 8));
+ n3 = (loc2 + vec1);
+ STORE_SH((loc3 - vec0), (tmp_buf + 16 * 8));
+ n7 = (loc3 + vec0);
+
+ /* Transpose : 16 vectors */
+ /* 1st & 2nd 8x8 */
+ TRANSPOSE8x8_H_SH(m0, n0, m1, n1, m2, n2, m3, n3,
+ m0, n0, m1, n1, m2, n2, m3, n3);
+ STORE_4VECS_SH((dest + 0), 32, m0, n0, m1, n1);
+ STORE_4VECS_SH((dest + 4 * 32), 32, m2, n2, m3, n3);
+
+ TRANSPOSE8x8_H_SH(m4, n4, m5, n5, m6, n6, m7, n7,
+ m4, n4, m5, n5, m6, n6, m7, n7);
+ STORE_4VECS_SH((dest + 8), 32, m4, n4, m5, n5);
+ STORE_4VECS_SH((dest + 8 + 4 * 32), 32, m6, n6, m7, n7);
+
+ /* 3rd & 4th 8x8 */
+ LOAD_8VECS_SH((tmp_buf + 8 * 16), 8, m0, n0, m1, n1, m2, n2, m3, n3);
+ LOAD_8VECS_SH((tmp_buf + 12 * 16), 8, m4, n4, m5, n5, m6, n6, m7, n7);
+ TRANSPOSE8x8_H_SH(m0, n0, m1, n1, m2, n2, m3, n3,
+ m0, n0, m1, n1, m2, n2, m3, n3);
+ STORE_4VECS_SH((dest + 16), 32, m0, n0, m1, n1);
+ STORE_4VECS_SH((dest + 16 + 4 * 32), 32, m2, n2, m3, n3);
+
+ TRANSPOSE8x8_H_SH(m4, n4, m5, n5, m6, n6, m7, n7,
+ m4, n4, m5, n5, m6, n6, m7, n7);
+ STORE_4VECS_SH((dest + 24), 32, m4, n4, m5, n5);
+ STORE_4VECS_SH((dest + 24 + 4 * 32), 32, m6, n6, m7, n7);
+}
+
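+/* One 32x8 strip of the row transform: transpose in, process the even and
+ * odd halves, then butterfly and transpose the result out.
+ */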
+static void vp9_idct32x8_1d_rows_msa(const int16_t *input, int16_t *output) {
+ DECLARE_ALIGNED(32, int16_t, tmp_buf[8 * 32]);
+ DECLARE_ALIGNED(32, int16_t, tmp_odd_buf[16 * 8]);
+ DECLARE_ALIGNED(32, int16_t, tmp_eve_buf[16 * 8]);
+
+ vp9_idct32x8_row_transpose_store(input, &tmp_buf[0]);
+
+ vp9_idct32x8_row_even_process_store(&tmp_buf[0], &tmp_eve_buf[0]);
+
+ vp9_idct32x8_row_odd_process_store(&tmp_buf[0], &tmp_odd_buf[0]);
+
+ vp9_idct_butterfly_transpose_store(&tmp_buf[0], &tmp_eve_buf[0],
+ &tmp_odd_buf[0], output);
+}
+
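+/* Column-pass variant of the even half: same math as the row pass, but the
+ * inputs are read straight from the row-transformed buffer with a pitch of
+ * 32, so no transpose is needed.
+ */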
+static void vp9_idct8x32_column_even_process_store(int16_t *tmp_buf,
+ int16_t *tmp_eve_buf) {
+ v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
+ v8i16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+ v8i16 stp0, stp1, stp2, stp3, stp4, stp5, stp6, stp7;
+
+ /* Even stage 1 */
+ LOAD_8VECS_SH(tmp_buf, (4 * 32),
+ reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7);
+
+ DOTP_CONST_PAIR(reg1, reg7, cospi_28_64, cospi_4_64, reg1, reg7);
+ DOTP_CONST_PAIR(reg5, reg3, cospi_12_64, cospi_20_64, reg5, reg3);
+
+ vec0 = reg1 - reg5;
+ vec1 = reg1 + reg5;
+ vec2 = reg7 - reg3;
+ vec3 = reg7 + reg3;
+
+ DOTP_CONST_PAIR(vec2, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
+
+ loc1 = vec3;
+ loc0 = vec1;
+
+ DOTP_CONST_PAIR(reg0, reg4, cospi_16_64, cospi_16_64, reg0, reg4);
+ DOTP_CONST_PAIR(reg2, reg6, cospi_24_64, cospi_8_64, reg2, reg6);
+
+ vec0 = reg4 - reg6;
+ vec1 = reg4 + reg6;
+ vec2 = reg0 - reg2;
+ vec3 = reg0 + reg2;
+
+ stp4 = vec0 - loc0;
+ stp3 = vec0 + loc0;
+ stp7 = vec1 - loc1;
+ stp0 = vec1 + loc1;
+ stp5 = vec2 - loc2;
+ stp2 = vec2 + loc2;
+ stp6 = vec3 - loc3;
+ stp1 = vec3 + loc3;
+
+ /* Even stage 2 */
+ /* Load 8 */
+ LOAD_8VECS_SH((tmp_buf + 2 * 32), (4 * 32),
+ reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7);
+
+ DOTP_CONST_PAIR(reg0, reg7, cospi_30_64, cospi_2_64, reg0, reg7);
+ DOTP_CONST_PAIR(reg4, reg3, cospi_14_64, cospi_18_64, reg4, reg3);
+ DOTP_CONST_PAIR(reg2, reg5, cospi_22_64, cospi_10_64, reg2, reg5);
+ DOTP_CONST_PAIR(reg6, reg1, cospi_6_64, cospi_26_64, reg6, reg1);
+
+ vec0 = reg0 + reg4;
+ reg0 = reg0 - reg4;
+ reg4 = reg6 + reg2;
+ reg6 = reg6 - reg2;
+ reg2 = reg1 + reg5;
+ reg1 = reg1 - reg5;
+ reg5 = reg7 + reg3;
+ reg7 = reg7 - reg3;
+ reg3 = vec0;
+
+ vec1 = reg2;
+ reg2 = reg3 + reg4;
+ reg3 = reg3 - reg4;
+ reg4 = reg5 - vec1;
+ reg5 = reg5 + vec1;
+
+ DOTP_CONST_PAIR(reg7, reg0, cospi_24_64, cospi_8_64, reg0, reg7);
+ DOTP_CONST_PAIR((-reg6), reg1, cospi_24_64, cospi_8_64, reg6, reg1);
+
+ vec0 = reg0 - reg6;
+ reg0 = reg0 + reg6;
+ vec1 = reg7 - reg1;
+ reg7 = reg7 + reg1;
+
+ DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, reg6, reg1);
+ DOTP_CONST_PAIR(reg4, reg3, cospi_16_64, cospi_16_64, reg3, reg4);
+
+ /* Even stage 3: dependency on Even stage 1 & Even stage 2 */
+ /* Store 8 */
+ loc0 = stp0 - reg5;
+ loc1 = stp0 + reg5;
+ loc2 = stp1 - reg7;
+ loc3 = stp1 + reg7;
+ STORE_SH(loc0, (tmp_eve_buf + 15 * 8));
+ STORE_SH(loc1, (tmp_eve_buf));
+ STORE_SH(loc2, (tmp_eve_buf + 14 * 8));
+ STORE_SH(loc3, (tmp_eve_buf + 1 * 8));
+
+ loc0 = stp2 - reg1;
+ loc1 = stp2 + reg1;
+ loc2 = stp3 - reg4;
+ loc3 = stp3 + reg4;
+ STORE_SH(loc0, (tmp_eve_buf + 13 * 8));
+ STORE_SH(loc1, (tmp_eve_buf + 2 * 8));
+ STORE_SH(loc2, (tmp_eve_buf + 12 * 8));
+ STORE_SH(loc3, (tmp_eve_buf + 3 * 8));
+
+ /* Store 8 */
+ loc0 = stp4 - reg3;
+ loc1 = stp4 + reg3;
+ loc2 = stp5 - reg6;
+ loc3 = stp5 + reg6;
+ STORE_SH(loc0, (tmp_eve_buf + 11 * 8));
+ STORE_SH(loc1, (tmp_eve_buf + 4 * 8));
+ STORE_SH(loc2, (tmp_eve_buf + 10 * 8));
+ STORE_SH(loc3, (tmp_eve_buf + 5 * 8));
+
+ loc0 = stp6 - reg0;
+ loc1 = stp6 + reg0;
+ loc2 = stp7 - reg2;
+ loc3 = stp7 + reg2;
+ STORE_SH(loc0, (tmp_eve_buf + 9 * 8));
+ STORE_SH(loc1, (tmp_eve_buf + 6 * 8));
+ STORE_SH(loc2, (tmp_eve_buf + 8 * 8));
+ STORE_SH(loc3, (tmp_eve_buf + 7 * 8));
+}
+
+static void vp9_idct8x32_column_odd_process_store(int16_t *tmp_buf,
+ int16_t *tmp_odd_buf) {
+ v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
+ v8i16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+
+ /* Odd stage 1 */
+ reg0 = LOAD_SH(tmp_buf + 32);
+ reg1 = LOAD_SH(tmp_buf + 7 * 32);
+ reg2 = LOAD_SH(tmp_buf + 9 * 32);
+ reg3 = LOAD_SH(tmp_buf + 15 * 32);
+ reg4 = LOAD_SH(tmp_buf + 17 * 32);
+ reg5 = LOAD_SH(tmp_buf + 23 * 32);
+ reg6 = LOAD_SH(tmp_buf + 25 * 32);
+ reg7 = LOAD_SH(tmp_buf + 31 * 32);
+
+ DOTP_CONST_PAIR(reg0, reg7, cospi_31_64, cospi_1_64, reg0, reg7);
+ DOTP_CONST_PAIR(reg4, reg3, cospi_15_64, cospi_17_64, reg3, reg4);
+ DOTP_CONST_PAIR(reg2, reg5, cospi_23_64, cospi_9_64, reg2, reg5);
+ DOTP_CONST_PAIR(reg6, reg1, cospi_7_64, cospi_25_64, reg1, reg6);
+
+ vec0 = reg0 + reg3;
+ reg0 = reg0 - reg3;
+ reg3 = reg7 + reg4;
+ reg7 = reg7 - reg4;
+ reg4 = reg1 + reg2;
+ reg1 = reg1 - reg2;
+ reg2 = reg6 + reg5;
+ reg6 = reg6 - reg5;
+ reg5 = vec0;
+
+ /* 4 Stores */
+ vec0 = reg5 + reg4;
+ vec1 = reg3 + reg2;
+ STORE_SH(vec0, (tmp_odd_buf + 4 * 8));
+ STORE_SH(vec1, (tmp_odd_buf + 5 * 8));
+
+ vec0 = reg5 - reg4;
+ vec1 = reg3 - reg2;
+ DOTP_CONST_PAIR(vec1, vec0, cospi_24_64, cospi_8_64, vec0, vec1);
+ STORE_SH(vec0, (tmp_odd_buf));
+ STORE_SH(vec1, (tmp_odd_buf + 1 * 8));
+
+ /* 4 Stores */
+ DOTP_CONST_PAIR(reg7, reg0, cospi_28_64, cospi_4_64, reg0, reg7);
+ DOTP_CONST_PAIR(reg6, reg1, -cospi_4_64, cospi_28_64, reg1, reg6);
+
+ vec0 = reg0 + reg1;
+ vec2 = reg7 - reg6;
+ vec1 = reg7 + reg6;
+ vec3 = reg0 - reg1;
+ STORE_SH(vec0, (tmp_odd_buf + 6 * 8));
+ STORE_SH(vec1, (tmp_odd_buf + 7 * 8));
+
+ DOTP_CONST_PAIR(vec2, vec3, cospi_24_64, cospi_8_64, vec2, vec3);
+ STORE_SH(vec2, (tmp_odd_buf + 2 * 8));
+ STORE_SH(vec3, (tmp_odd_buf + 3 * 8));
+
+ /* Odd stage 2 */
+ /* 8 loads */
+ reg0 = LOAD_SH(tmp_buf + 3 * 32);
+ reg1 = LOAD_SH(tmp_buf + 5 * 32);
+ reg2 = LOAD_SH(tmp_buf + 11 * 32);
+ reg3 = LOAD_SH(tmp_buf + 13 * 32);
+ reg4 = LOAD_SH(tmp_buf + 19 * 32);
+ reg5 = LOAD_SH(tmp_buf + 21 * 32);
+ reg6 = LOAD_SH(tmp_buf + 27 * 32);
+ reg7 = LOAD_SH(tmp_buf + 29 * 32);
+
+ DOTP_CONST_PAIR(reg1, reg6, cospi_27_64, cospi_5_64, reg1, reg6);
+ DOTP_CONST_PAIR(reg5, reg2, cospi_11_64, cospi_21_64, reg2, reg5);
+ DOTP_CONST_PAIR(reg3, reg4, cospi_19_64, cospi_13_64, reg3, reg4);
+ DOTP_CONST_PAIR(reg7, reg0, cospi_3_64, cospi_29_64, reg0, reg7);
+
+ /* 4 Stores */
+ vec0 = reg1 - reg2;
+ vec1 = reg6 - reg5;
+ vec2 = reg0 - reg3;
+ vec3 = reg7 - reg4;
+ DOTP_CONST_PAIR(vec1, vec0, cospi_12_64, cospi_20_64, loc0, loc1);
+ DOTP_CONST_PAIR(vec3, vec2, -cospi_20_64, cospi_12_64, loc2, loc3);
+
+ vec2 = loc2 - loc0;
+ vec3 = loc3 - loc1;
+ vec0 = loc2 + loc0;
+ vec1 = loc3 + loc1;
+ STORE_SH(vec0, (tmp_odd_buf + 12 * 8));
+ STORE_SH(vec1, (tmp_odd_buf + 15 * 8));
+
+ DOTP_CONST_PAIR(vec3, vec2, -cospi_8_64, cospi_24_64, vec0, vec1);
+
+ STORE_SH(vec0, (tmp_odd_buf + 10 * 8));
+ STORE_SH(vec1, (tmp_odd_buf + 11 * 8));
+
+ /* 4 Stores */
+ vec0 = reg0 + reg3;
+ vec1 = reg1 + reg2;
+ vec2 = reg6 + reg5;
+ vec3 = reg7 + reg4;
+ reg0 = vec0 + vec1;
+ reg1 = vec3 + vec2;
+ reg2 = vec0 - vec1;
+ reg3 = vec3 - vec2;
+ STORE_SH(reg0, (tmp_odd_buf + 13 * 8));
+ STORE_SH(reg1, (tmp_odd_buf + 14 * 8));
+
+ DOTP_CONST_PAIR(reg3, reg2, -cospi_8_64, cospi_24_64, reg0, reg1);
+
+ STORE_SH(reg0, (tmp_odd_buf + 8 * 8));
+ STORE_SH(reg1, (tmp_odd_buf + 9 * 8));
+
+ /* Odd stage 3: dependency on Odd stage 1 & Odd stage 2 */
+ /* Load 8 & Store 8 */
+ reg0 = LOAD_SH(tmp_odd_buf);
+ reg1 = LOAD_SH(tmp_odd_buf + 1 * 8);
+ reg2 = LOAD_SH(tmp_odd_buf + 2 * 8);
+ reg3 = LOAD_SH(tmp_odd_buf + 3 * 8);
+ reg4 = LOAD_SH(tmp_odd_buf + 8 * 8);
+ reg5 = LOAD_SH(tmp_odd_buf + 9 * 8);
+ reg6 = LOAD_SH(tmp_odd_buf + 10 * 8);
+ reg7 = LOAD_SH(tmp_odd_buf + 11 * 8);
+
+ loc0 = reg0 + reg4;
+ loc1 = reg1 + reg5;
+ loc2 = reg2 + reg6;
+ loc3 = reg3 + reg7;
+ STORE_SH(loc0, (tmp_odd_buf));
+ STORE_SH(loc1, (tmp_odd_buf + 1 * 8));
+ STORE_SH(loc2, (tmp_odd_buf + 2 * 8));
+ STORE_SH(loc3, (tmp_odd_buf + 3 * 8));
+
+ vec0 = reg0 - reg4;
+ vec1 = reg1 - reg5;
+ DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1);
+
+ vec0 = reg2 - reg6;
+ vec1 = reg3 - reg7;
+ DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
+
+ STORE_SH(loc0, (tmp_odd_buf + 8 * 8));
+ STORE_SH(loc1, (tmp_odd_buf + 9 * 8));
+ STORE_SH(loc2, (tmp_odd_buf + 10 * 8));
+ STORE_SH(loc3, (tmp_odd_buf + 11 * 8));
+
+ /* Load 8 & Store 8 */
+ reg1 = LOAD_SH(tmp_odd_buf + 4 * 8);
+ reg2 = LOAD_SH(tmp_odd_buf + 5 * 8);
+ reg0 = LOAD_SH(tmp_odd_buf + 6 * 8);
+ reg3 = LOAD_SH(tmp_odd_buf + 7 * 8);
+ reg4 = LOAD_SH(tmp_odd_buf + 12 * 8);
+ reg5 = LOAD_SH(tmp_odd_buf + 13 * 8);
+ reg6 = LOAD_SH(tmp_odd_buf + 14 * 8);
+ reg7 = LOAD_SH(tmp_odd_buf + 15 * 8);
+
+ loc0 = reg0 + reg4;
+ loc1 = reg1 + reg5;
+ loc2 = reg2 + reg6;
+ loc3 = reg3 + reg7;
+ STORE_SH(loc0, (tmp_odd_buf + 4 * 8));
+ STORE_SH(loc1, (tmp_odd_buf + 5 * 8));
+ STORE_SH(loc2, (tmp_odd_buf + 6 * 8));
+ STORE_SH(loc3, (tmp_odd_buf + 7 * 8));
+
+ vec0 = reg0 - reg4;
+ vec1 = reg3 - reg7;
+ DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1);
+
+ vec0 = reg1 - reg5;
+ vec1 = reg2 - reg6;
+ DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
+
+ STORE_SH(loc0, (tmp_odd_buf + 12 * 8));
+ STORE_SH(loc1, (tmp_odd_buf + 13 * 8));
+ STORE_SH(loc2, (tmp_odd_buf + 14 * 8));
+ STORE_SH(loc3, (tmp_odd_buf + 15 * 8));
+}
+
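+/* Final butterfly of the column pass: combines the even and odd halves,
+ * rounds each result by 6 bits and accumulates it into the destination
+ * block through the add/clip/store helper above.
+ */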
+static void vp9_idct8x32_column_butterfly_addblk(int16_t *tmp_eve_buf,
+ int16_t *tmp_odd_buf,
+ uint8_t *dest,
+ int32_t dest_stride) {
+ v8i16 vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
+ v8i16 m0, m1, m2, m3, m4, m5, m6, m7;
+ v8i16 n0, n1, n2, n3, n4, n5, n6, n7;
+
+ /* FINAL BUTTERFLY: dependency on even & odd stages */
+ vec0 = LOAD_SH(tmp_odd_buf);
+ vec1 = LOAD_SH(tmp_odd_buf + 9 * 8);
+ vec2 = LOAD_SH(tmp_odd_buf + 14 * 8);
+ vec3 = LOAD_SH(tmp_odd_buf + 6 * 8);
+ loc0 = LOAD_SH(tmp_eve_buf);
+ loc1 = LOAD_SH(tmp_eve_buf + 8 * 8);
+ loc2 = LOAD_SH(tmp_eve_buf + 4 * 8);
+ loc3 = LOAD_SH(tmp_eve_buf + 12 * 8);
+
+ m0 = (loc0 + vec3);
+ m4 = (loc1 + vec2);
+ m2 = (loc2 + vec1);
+ m6 = (loc3 + vec0);
+ SRARI_H_4VECS_SH(m0, m2, m4, m6, m0, m2, m4, m6, 6);
+ VP9_ADDBLK_CLIP_AND_STORE_OFF_4H_VECS(dest, dest_stride, m0, m2, m4, m6);
+
+ m6 = (loc0 - vec3);
+ m2 = (loc1 - vec2);
+ m4 = (loc2 - vec1);
+ m0 = (loc3 - vec0);
+ SRARI_H_4VECS_SH(m0, m2, m4, m6, m0, m2, m4, m6, 6);
+ VP9_ADDBLK_CLIP_AND_STORE_OFF_4H_VECS((dest + 19 * dest_stride),
+ dest_stride, m0, m2, m4, m6);
+
+ /* Load 8 & Store 8 */
+ vec0 = LOAD_SH(tmp_odd_buf + 4 * 8);
+ vec1 = LOAD_SH(tmp_odd_buf + 13 * 8);
+ vec2 = LOAD_SH(tmp_odd_buf + 10 * 8);
+ vec3 = LOAD_SH(tmp_odd_buf + 3 * 8);
+ loc0 = LOAD_SH(tmp_eve_buf + 2 * 8);
+ loc1 = LOAD_SH(tmp_eve_buf + 10 * 8);
+ loc2 = LOAD_SH(tmp_eve_buf + 6 * 8);
+ loc3 = LOAD_SH(tmp_eve_buf + 14 * 8);
+
+ m1 = (loc0 + vec3);
+ m5 = (loc1 + vec2);
+ m3 = (loc2 + vec1);
+ m7 = (loc3 + vec0);
+ SRARI_H_4VECS_SH(m1, m3, m5, m7, m1, m3, m5, m7, 6);
+ VP9_ADDBLK_CLIP_AND_STORE_OFF_4H_VECS((dest + 2 * dest_stride),
+ dest_stride, m1, m3, m5, m7);
+
+ m7 = (loc0 - vec3);
+ m3 = (loc1 - vec2);
+ m5 = (loc2 - vec1);
+ m1 = (loc3 - vec0);
+ SRARI_H_4VECS_SH(m1, m3, m5, m7, m1, m3, m5, m7, 6);
+ VP9_ADDBLK_CLIP_AND_STORE_OFF_4H_VECS((dest + 17 * dest_stride),
+ dest_stride, m1, m3, m5, m7);
+
+ /* Load 8 & Store 8 */
+ vec0 = LOAD_SH(tmp_odd_buf + 2 * 8);
+ vec1 = LOAD_SH(tmp_odd_buf + 11 * 8);
+ vec2 = LOAD_SH(tmp_odd_buf + 12 * 8);
+ vec3 = LOAD_SH(tmp_odd_buf + 7 * 8);
+ loc0 = LOAD_SH(tmp_eve_buf + 1 * 8);
+ loc1 = LOAD_SH(tmp_eve_buf + 9 * 8);
+ loc2 = LOAD_SH(tmp_eve_buf + 5 * 8);
+ loc3 = LOAD_SH(tmp_eve_buf + 13 * 8);
+
+ n0 = (loc0 + vec3);
+ n4 = (loc1 + vec2);
+ n2 = (loc2 + vec1);
+ n6 = (loc3 + vec0);
+ SRARI_H_4VECS_SH(n0, n2, n4, n6, n0, n2, n4, n6, 6);
+ VP9_ADDBLK_CLIP_AND_STORE_OFF_4H_VECS((dest + 1 * dest_stride),
+ dest_stride, n0, n2, n4, n6);
+
+ n6 = (loc0 - vec3);
+ n2 = (loc1 - vec2);
+ n4 = (loc2 - vec1);
+ n0 = (loc3 - vec0);
+ SRARI_H_4VECS_SH(n0, n2, n4, n6, n0, n2, n4, n6, 6);
+ VP9_ADDBLK_CLIP_AND_STORE_OFF_4H_VECS((dest + 18 * dest_stride),
+ dest_stride, n0, n2, n4, n6);
+
+ /* Load 8 & Store 8 */
+ vec0 = LOAD_SH(tmp_odd_buf + 5 * 8);
+ vec1 = LOAD_SH(tmp_odd_buf + 15 * 8);
+ vec2 = LOAD_SH(tmp_odd_buf + 8 * 8);
+ vec3 = LOAD_SH(tmp_odd_buf + 1 * 8);
+ loc0 = LOAD_SH(tmp_eve_buf + 3 * 8);
+ loc1 = LOAD_SH(tmp_eve_buf + 11 * 8);
+ loc2 = LOAD_SH(tmp_eve_buf + 7 * 8);
+ loc3 = LOAD_SH(tmp_eve_buf + 15 * 8);
+
+ n1 = (loc0 + vec3);
+ n5 = (loc1 + vec2);
+ n3 = (loc2 + vec1);
+ n7 = (loc3 + vec0);
+ SRARI_H_4VECS_SH(n1, n3, n5, n7, n1, n3, n5, n7, 6);
+ VP9_ADDBLK_CLIP_AND_STORE_OFF_4H_VECS((dest + 3 * dest_stride),
+ dest_stride, n1, n3, n5, n7);
+
+ n7 = (loc0 - vec3);
+ n3 = (loc1 - vec2);
+ n5 = (loc2 - vec1);
+ n1 = (loc3 - vec0);
+ SRARI_H_4VECS_SH(n1, n3, n5, n7, n1, n3, n5, n7, 6);
+ VP9_ADDBLK_CLIP_AND_STORE_OFF_4H_VECS((dest + 16 * dest_stride),
+ dest_stride, n1, n3, n5, n7);
+}
+
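+/* One 8x32 strip of the column transform, including reconstruction. */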
+static void vp9_idct8x32_1d_columns_addblk_msa(int16_t *input, uint8_t *dest,
+ int32_t dest_stride) {
+ DECLARE_ALIGNED(32, int16_t, tmp_odd_buf[16 * 8]);
+ DECLARE_ALIGNED(32, int16_t, tmp_eve_buf[16 * 8]);
+
+ vp9_idct8x32_column_even_process_store(input, &tmp_eve_buf[0]);
+
+ vp9_idct8x32_column_odd_process_store(input, &tmp_odd_buf[0]);
+
+ vp9_idct8x32_column_butterfly_addblk(&tmp_eve_buf[0], &tmp_odd_buf[0],
+ dest, dest_stride);
+}
+
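+/* Full 32x32 inverse DCT: four 32x8 row strips into a scratch buffer,
+ * followed by four 8x32 column strips that add the result into dest.
+ */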
+void vp9_idct32x32_1024_add_msa(const int16_t *input, uint8_t *dest,
+ int32_t dest_stride) {
+ int32_t i;
+ DECLARE_ALIGNED(32, int16_t, out_arr[32 * 32]);
+ int16_t *out_ptr = out_arr;
+
+ /* transform rows */
+ for (i = 0; i < 4; ++i) {
+ /* process 32 * 8 block */
+ vp9_idct32x8_1d_rows_msa((input + (i << 8)), (out_ptr + (i << 8)));
+ }
+
+ /* transform columns */
+ for (i = 0; i < 4; ++i) {
+ /* process 8 * 32 block */
+ vp9_idct8x32_1d_columns_addblk_msa((out_ptr + (i << 3)), (dest + (i << 3)),
+ dest_stride);
+ }
+}
+
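+/* 32x32 inverse DCT for at most 34 non-zero coefficients, which all lie in
+ * the upper-left 8x8 corner: zero the scratch buffer, transform a single
+ * row strip, then run the column pass over the full width.
+ */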
+void vp9_idct32x32_34_add_msa(const int16_t *input, uint8_t *dest,
+ int32_t dest_stride) {
+ int32_t i;
+ DECLARE_ALIGNED(32, int16_t, out_arr[32 * 32]);
+ int16_t *out_ptr = out_arr;
+
+ for (i = 32; i--;) {
+ __asm__ __volatile__ (
+ "sw $zero, 0(%[out_ptr]) \n\t"
+ "sw $zero, 4(%[out_ptr]) \n\t"
+ "sw $zero, 8(%[out_ptr]) \n\t"
+ "sw $zero, 12(%[out_ptr]) \n\t"
+ "sw $zero, 16(%[out_ptr]) \n\t"
+ "sw $zero, 20(%[out_ptr]) \n\t"
+ "sw $zero, 24(%[out_ptr]) \n\t"
+ "sw $zero, 28(%[out_ptr]) \n\t"
+ "sw $zero, 32(%[out_ptr]) \n\t"
+ "sw $zero, 36(%[out_ptr]) \n\t"
+ "sw $zero, 40(%[out_ptr]) \n\t"
+ "sw $zero, 44(%[out_ptr]) \n\t"
+ "sw $zero, 48(%[out_ptr]) \n\t"
+ "sw $zero, 52(%[out_ptr]) \n\t"
+ "sw $zero, 56(%[out_ptr]) \n\t"
+ "sw $zero, 60(%[out_ptr]) \n\t"
+
+ :
+ : [out_ptr] "r" (out_ptr)
+ );
+
+ out_ptr += 32;
+ }
+
+ out_ptr = out_arr;
+
+ /* transform rows: only the upper-left 8x8 block has non-zero coefficients */
+ vp9_idct32x8_1d_rows_msa(input, out_ptr);
+
+ /* transform columns */
+ for (i = 0; i < 4; ++i) {
+ /* process 8 * 32 block */
+ vp9_idct8x32_1d_columns_addblk_msa((out_ptr + (i << 3)), (dest + (i << 3)),
+ dest_stride);
+ }
+}
+
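+/* DC-only 32x32 path. The lone DC coefficient reduces to one constant,
+ *   dc = ROUND_POWER_OF_TWO(dct_const_round_shift(
+ *            dct_const_round_shift(input[0] * cospi_16_64) * cospi_16_64),
+ *        6),
+ * which is splatted across a vector and added, with clipping, to every
+ * pixel; each loop iteration covers two 32-pixel rows.
+ */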
+void vp9_idct32x32_1_add_msa(const int16_t *input, uint8_t *dest,
+ int32_t dest_stride) {
+ int32_t i, const1;
+ v8i16 const2;
+ int16_t out;
+ v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
+ v16u8 dest0, dest1, dest2, dest3;
+ v16u8 tmp0, tmp1, tmp2, tmp3;
+ v16i8 zero = { 0 };
+
+ out = dct_const_round_shift(input[0] * cospi_16_64);
+ out = dct_const_round_shift(out * cospi_16_64);
+ const1 = ROUND_POWER_OF_TWO(out, 6);
+
+ const2 = __msa_fill_h(const1);
+
+ for (i = 0; i < 16; ++i) {
+ dest0 = LOAD_UB(dest);
+ dest1 = LOAD_UB(dest + 16);
+ dest2 = LOAD_UB(dest + dest_stride);
+ dest3 = LOAD_UB(dest + dest_stride + 16);
+
+ res0 = (v8i16)__msa_ilvr_b(zero, (v16i8)dest0);
+ res1 = (v8i16)__msa_ilvr_b(zero, (v16i8)dest1);
+ res2 = (v8i16)__msa_ilvr_b(zero, (v16i8)dest2);
+ res3 = (v8i16)__msa_ilvr_b(zero, (v16i8)dest3);
+ res4 = (v8i16)__msa_ilvl_b(zero, (v16i8)dest0);
+ res5 = (v8i16)__msa_ilvl_b(zero, (v16i8)dest1);
+ res6 = (v8i16)__msa_ilvl_b(zero, (v16i8)dest2);
+ res7 = (v8i16)__msa_ilvl_b(zero, (v16i8)dest3);
+
+ res0 += const2;
+ res1 += const2;
+ res2 += const2;
+ res3 += const2;
+ res4 += const2;
+ res5 += const2;
+ res6 += const2;
+ res7 += const2;
+
+ res0 = CLIP_UNSIGNED_CHAR_H(res0);
+ res1 = CLIP_UNSIGNED_CHAR_H(res1);
+ res2 = CLIP_UNSIGNED_CHAR_H(res2);
+ res3 = CLIP_UNSIGNED_CHAR_H(res3);
+ res4 = CLIP_UNSIGNED_CHAR_H(res4);
+ res5 = CLIP_UNSIGNED_CHAR_H(res5);
+ res6 = CLIP_UNSIGNED_CHAR_H(res6);
+ res7 = CLIP_UNSIGNED_CHAR_H(res7);
+
+ tmp0 = (v16u8)__msa_pckev_b((v16i8)res4, (v16i8)res0);
+ tmp1 = (v16u8)__msa_pckev_b((v16i8)res5, (v16i8)res1);
+ tmp2 = (v16u8)__msa_pckev_b((v16i8)res6, (v16i8)res2);
+ tmp3 = (v16u8)__msa_pckev_b((v16i8)res7, (v16i8)res3);
+
+ STORE_UB(tmp0, dest);
+ STORE_UB(tmp1, dest + 16);
+ dest += dest_stride;
+ STORE_UB(tmp2, dest);
+ STORE_UB(tmp3, dest + 16);
+ dest += dest_stride;
+ }
+}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_macros_msa.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_macros_msa.h
new file mode 100644
index 00000000000..d7aabbb8898
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/msa/vp9_macros_msa.h
@@ -0,0 +1,867 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_COMMON_MIPS_MSA_VP9_MACROS_MSA_H_
+#define VP9_COMMON_MIPS_MSA_VP9_MACROS_MSA_H_
+
+#include <msa.h>
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+
+#if HAVE_MSA
+/* load macros */
+#define LOAD_UB(psrc) *((const v16u8 *)(psrc))
+#define LOAD_SB(psrc) *((const v16i8 *)(psrc))
+#define LOAD_UH(psrc) *((const v8u16 *)(psrc))
+#define LOAD_SH(psrc) *((const v8i16 *)(psrc))
+#define LOAD_UW(psrc) *((const v4u32 *)(psrc))
+#define LOAD_SW(psrc) *((const v4i32 *)(psrc))
+#define LOAD_UD(psrc) *((const v2u64 *)(psrc))
+#define LOAD_SD(psrc) *((const v2i64 *)(psrc))
+
+/* store macros */
+#define STORE_UB(vec, pdest) *((v16u8 *)(pdest)) = (vec)
+#define STORE_SB(vec, pdest) *((v16i8 *)(pdest)) = (vec)
+#define STORE_UH(vec, pdest) *((v8u16 *)(pdest)) = (vec)
+#define STORE_SH(vec, pdest) *((v8i16 *)(pdest)) = (vec)
+#define STORE_UW(vec, pdest) *((v4u32 *)(pdest)) = (vec)
+#define STORE_SW(vec, pdest) *((v4i32 *)(pdest)) = (vec)
+#define STORE_UD(vec, pdest) *((v2u64 *)(pdest)) = (vec)
+#define STORE_SD(vec, pdest) *((v2i64 *)(pdest)) = (vec)
+
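+/* Scalar word/dword access helpers. MIPS release 6 supports unaligned
+ * lw/sw (and ld/sd on 64-bit) directly, so plain loads/stores are used
+ * there; pre-R6 builds fall back to the ulw/usw/uld unaligned-access
+ * macros instead.
+ */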
+#if (__mips_isa_rev >= 6)
+#define LOAD_WORD(psrc) ({ \
+ const uint8_t *src_m = (const uint8_t *)(psrc); \
+ uint32_t val_m; \
+ \
+ __asm__ __volatile__ ( \
+ "lw %[val_m], %[src_m] \n\t" \
+ \
+ : [val_m] "=r" (val_m) \
+ : [src_m] "m" (*src_m) \
+ ); \
+ \
+ val_m; \
+})
+
+#if (__mips == 64)
+#define LOAD_DWORD(psrc) ({ \
+ const uint8_t *src_m = (const uint8_t *)(psrc); \
+ uint64_t val_m = 0; \
+ \
+ __asm__ __volatile__ ( \
+ "ld %[val_m], %[src_m] \n\t" \
+ \
+ : [val_m] "=r" (val_m) \
+ : [src_m] "m" (*src_m) \
+ ); \
+ \
+ val_m; \
+})
+#else // !(__mips == 64)
+#define LOAD_DWORD(psrc) ({ \
+ const uint8_t *src1_m = (const uint8_t *)(psrc); \
+ const uint8_t *src2_m = ((const uint8_t *)(psrc)) + 4; \
+ uint32_t val0_m, val1_m; \
+ uint64_t genval_m = 0; \
+ \
+ __asm__ __volatile__ ( \
+ "lw %[val0_m], %[src1_m] \n\t" \
+ \
+ : [val0_m] "=r" (val0_m) \
+ : [src1_m] "m" (*src1_m) \
+ ); \
+ \
+ __asm__ __volatile__ ( \
+ "lw %[val1_m], %[src2_m] \n\t" \
+ \
+ : [val1_m] "=r" (val1_m) \
+ : [src2_m] "m" (*src2_m) \
+ ); \
+ \
+ genval_m = (uint64_t)(val1_m); \
+ genval_m = (uint64_t)((genval_m << 32) & 0xFFFFFFFF00000000); \
+ genval_m = (uint64_t)(genval_m | (uint64_t)val0_m); \
+ \
+ genval_m; \
+})
+#endif // (__mips == 64)
+#define STORE_WORD_WITH_OFFSET_1(pdst, val) { \
+ uint8_t *dst_ptr_m = ((uint8_t *)(pdst)) + 1; \
+ const uint32_t val_m = (val); \
+ \
+ __asm__ __volatile__ ( \
+ "sw %[val_m], %[dst_ptr_m] \n\t" \
+ \
+ : [dst_ptr_m] "=m" (*dst_ptr_m) \
+ : [val_m] "r" (val_m) \
+ ); \
+}
+
+#define STORE_WORD(pdst, val) { \
+ uint8_t *dst_ptr_m = (uint8_t *)(pdst); \
+ const uint32_t val_m = (val); \
+ \
+ __asm__ __volatile__ ( \
+ "sw %[val_m], %[dst_ptr_m] \n\t" \
+ \
+ : [dst_ptr_m] "=m" (*dst_ptr_m) \
+ : [val_m] "r" (val_m) \
+ ); \
+}
+
+#define STORE_DWORD(pdst, val) { \
+ uint8_t *dst_ptr_m = (uint8_t *)(pdst); \
+ const uint64_t val_m = (val); \
+ \
+ __asm__ __volatile__ ( \
+ "sd %[val_m], %[dst_ptr_m] \n\t" \
+ \
+ : [dst_ptr_m] "=m" (*dst_ptr_m) \
+ : [val_m] "r" (val_m) \
+ ); \
+}
+#else // !(__mips_isa_rev >= 6)
+#define LOAD_WORD(psrc) ({ \
+ const uint8_t *src_m = (const uint8_t *)(psrc); \
+ uint32_t val_m; \
+ \
+ __asm__ __volatile__ ( \
+ "ulw %[val_m], %[src_m] \n\t" \
+ \
+ : [val_m] "=r" (val_m) \
+ : [src_m] "m" (*src_m) \
+ ); \
+ \
+ val_m; \
+})
+
+#if (__mips == 64)
+#define LOAD_DWORD(psrc) ({ \
+ const uint8_t *src_m = (const uint8_t *)(psrc); \
+ uint64_t val_m = 0; \
+ \
+ __asm__ __volatile__ ( \
+ "uld %[val_m], %[src_m] \n\t" \
+ \
+ : [val_m] "=r" (val_m) \
+ : [src_m] "m" (*src_m) \
+ ); \
+ \
+ val_m; \
+})
+#else // !(__mips == 64)
+#define LOAD_DWORD(psrc) ({ \
+ const uint8_t *src1_m = (const uint8_t *)(psrc); \
+ const uint8_t *src2_m = ((const uint8_t *)(psrc)) + 4; \
+ uint32_t val0_m, val1_m; \
+ uint64_t genval_m = 0; \
+ \
+ __asm__ __volatile__ ( \
+ "ulw %[val0_m], %[src1_m] \n\t" \
+ \
+ : [val0_m] "=r" (val0_m) \
+ : [src1_m] "m" (*src1_m) \
+ ); \
+ \
+ __asm__ __volatile__ ( \
+ "ulw %[val1_m], %[src2_m] \n\t" \
+ \
+ : [val1_m] "=r" (val1_m) \
+ : [src2_m] "m" (*src2_m) \
+ ); \
+ \
+ genval_m = (uint64_t)(val1_m); \
+ genval_m = (uint64_t)((genval_m << 32) & 0xFFFFFFFF00000000); \
+ genval_m = (uint64_t)(genval_m | (uint64_t)val0_m); \
+ \
+ genval_m; \
+})
+#endif // (__mips == 64)
+
+#define STORE_WORD_WITH_OFFSET_1(pdst, val) { \
+ uint8_t *dst_ptr_m = ((uint8_t *)(pdst)) + 1; \
+ const uint32_t val_m = (val); \
+ \
+ __asm__ __volatile__ ( \
+ "usw %[val_m], %[dst_ptr_m] \n\t" \
+ \
+ : [dst_ptr_m] "=m" (*dst_ptr_m) \
+ : [val_m] "r" (val_m) \
+ ); \
+}
+
+#define STORE_WORD(pdst, val) { \
+ uint8_t *dst_ptr_m = (uint8_t *)(pdst); \
+ const uint32_t val_m = (val); \
+ \
+ __asm__ __volatile__ ( \
+ "usw %[val_m], %[dst_ptr_m] \n\t" \
+ \
+ : [dst_ptr_m] "=m" (*dst_ptr_m) \
+ : [val_m] "r" (val_m) \
+ ); \
+}
+
+#define STORE_DWORD(pdst, val) { \
+ uint8_t *dst1_m = (uint8_t *)(pdst); \
+ uint8_t *dst2_m = ((uint8_t *)(pdst)) + 4; \
+ uint32_t val0_m, val1_m; \
+ \
+ val0_m = (uint32_t)((val) & 0x00000000FFFFFFFF); \
+ val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \
+ \
+ __asm__ __volatile__ ( \
+ "usw %[val0_m], %[dst1_m] \n\t" \
+ "usw %[val1_m], %[dst2_m] \n\t" \
+ \
+ : [dst1_m] "=m" (*dst1_m), [dst2_m] "=m" (*dst2_m) \
+ : [val0_m] "r" (val0_m), [val1_m] "r" (val1_m) \
+ ); \
+}
+#endif // (__mips_isa_rev >= 6)
+
+#define LOAD_2VECS_UB(psrc, stride, \
+ val0, val1) { \
+ val0 = LOAD_UB(psrc + 0 * stride); \
+ val1 = LOAD_UB(psrc + 1 * stride); \
+}
+
+#define LOAD_4VECS_UB(psrc, stride, \
+ val0, val1, val2, val3) { \
+ val0 = LOAD_UB(psrc + 0 * stride); \
+ val1 = LOAD_UB(psrc + 1 * stride); \
+ val2 = LOAD_UB(psrc + 2 * stride); \
+ val3 = LOAD_UB(psrc + 3 * stride); \
+}
+
+#define LOAD_4VECS_SB(psrc, stride, \
+ val0, val1, val2, val3) { \
+ val0 = LOAD_SB(psrc + 0 * stride); \
+ val1 = LOAD_SB(psrc + 1 * stride); \
+ val2 = LOAD_SB(psrc + 2 * stride); \
+ val3 = LOAD_SB(psrc + 3 * stride); \
+}
+
+#define LOAD_5VECS_UB(psrc, stride, \
+ out0, out1, out2, out3, out4) { \
+ LOAD_4VECS_UB((psrc), (stride), \
+ (out0), (out1), (out2), (out3)); \
+ out4 = LOAD_UB(psrc + 4 * stride); \
+}
+
+#define LOAD_5VECS_SB(psrc, stride, \
+ out0, out1, out2, out3, out4) { \
+ LOAD_4VECS_SB((psrc), (stride), \
+ (out0), (out1), (out2), (out3)); \
+ out4 = LOAD_SB(psrc + 4 * stride); \
+}
+
+#define LOAD_7VECS_SB(psrc, stride, \
+ val0, val1, val2, val3, \
+ val4, val5, val6) { \
+ val0 = LOAD_SB((psrc) + 0 * (stride)); \
+ val1 = LOAD_SB((psrc) + 1 * (stride)); \
+ val2 = LOAD_SB((psrc) + 2 * (stride)); \
+ val3 = LOAD_SB((psrc) + 3 * (stride)); \
+ val4 = LOAD_SB((psrc) + 4 * (stride)); \
+ val5 = LOAD_SB((psrc) + 5 * (stride)); \
+ val6 = LOAD_SB((psrc) + 6 * (stride)); \
+}
+
+#define LOAD_8VECS_UB(psrc, stride, \
+ out0, out1, out2, out3, \
+ out4, out5, out6, out7) { \
+ LOAD_4VECS_UB((psrc), (stride), \
+ (out0), (out1), (out2), (out3)); \
+ LOAD_4VECS_UB((psrc + 4 * stride), (stride), \
+ (out4), (out5), (out6), (out7)); \
+}
+
+#define LOAD_8VECS_SB(psrc, stride, \
+ out0, out1, out2, out3, \
+ out4, out5, out6, out7) { \
+ LOAD_4VECS_SB((psrc), (stride), \
+ (out0), (out1), (out2), (out3)); \
+ LOAD_4VECS_SB((psrc + 4 * stride), (stride), \
+ (out4), (out5), (out6), (out7)); \
+}
+
+#define LOAD_2VECS_SH(psrc, stride, \
+ val0, val1) { \
+ val0 = LOAD_SH((psrc) + 0 * (stride)); \
+ val1 = LOAD_SH((psrc) + 1 * (stride)); \
+}
+
+#define LOAD_4VECS_SH(psrc, stride, \
+ val0, val1, val2, val3) { \
+ LOAD_2VECS_SH((psrc), (stride), val0, val1); \
+ LOAD_2VECS_SH((psrc + 2 * stride), (stride), val2, val3); \
+}
+
+#define LOAD_8VECS_SH(psrc, stride, \
+ val0, val1, val2, val3, \
+ val4, val5, val6, val7) { \
+ LOAD_4VECS_SH((psrc), (stride), \
+ val0, val1, val2, val3); \
+ LOAD_4VECS_SH((psrc + 4 * stride), (stride), \
+ val4, val5, val6, val7); \
+}
+
+#define LOAD_16VECS_SH(psrc, stride, \
+ val0, val1, val2, val3, \
+ val4, val5, val6, val7, \
+ val8, val9, val10, val11, \
+ val12, val13, val14, val15) { \
+ LOAD_8VECS_SH((psrc), (stride), \
+ val0, val1, val2, val3, \
+ val4, val5, val6, val7); \
+ LOAD_8VECS_SH((psrc + 8 * (stride)), (stride), \
+ val8, val9, val10, val11, \
+ val12, val13, val14, val15); \
+}
+
+#define STORE_4VECS_UB(dst_out, pitch, \
+ in0, in1, in2, in3) { \
+ STORE_UB((in0), (dst_out)); \
+ STORE_UB((in1), ((dst_out) + (pitch))); \
+ STORE_UB((in2), ((dst_out) + 2 * (pitch))); \
+ STORE_UB((in3), ((dst_out) + 3 * (pitch))); \
+}
+
+#define STORE_8VECS_UB(dst_out, pitch_in, \
+ in0, in1, in2, in3, \
+ in4, in5, in6, in7) { \
+ STORE_4VECS_UB(dst_out, pitch_in, \
+ in0, in1, in2, in3); \
+ STORE_4VECS_UB((dst_out + 4 * (pitch_in)), pitch_in, \
+ in4, in5, in6, in7); \
+}
+
+#define VEC_INSERT_4W_UB(src, src0, src1, src2, src3) { \
+ src = (v16u8)__msa_insert_w((v4i32)(src), 0, (src0)); \
+ src = (v16u8)__msa_insert_w((v4i32)(src), 1, (src1)); \
+ src = (v16u8)__msa_insert_w((v4i32)(src), 2, (src2)); \
+ src = (v16u8)__msa_insert_w((v4i32)(src), 3, (src3)); \
+}
+
+#define VEC_INSERT_2DW_UB(src, src0, src1) { \
+ src = (v16u8)__msa_insert_d((v2i64)(src), 0, (src0)); \
+ src = (v16u8)__msa_insert_d((v2i64)(src), 1, (src1)); \
+}
+
+#define STORE_4VECS_SH(ptr, stride, \
+ in0, in1, in2, in3) { \
+ STORE_SH(in0, ((ptr) + 0 * stride)); \
+ STORE_SH(in1, ((ptr) + 1 * stride)); \
+ STORE_SH(in2, ((ptr) + 2 * stride)); \
+ STORE_SH(in3, ((ptr) + 3 * stride)); \
+}
+
+#define STORE_8VECS_SH(ptr, stride, \
+ in0, in1, in2, in3, \
+ in4, in5, in6, in7) { \
+ STORE_SH(in0, ((ptr) + 0 * stride)); \
+ STORE_SH(in1, ((ptr) + 1 * stride)); \
+ STORE_SH(in2, ((ptr) + 2 * stride)); \
+ STORE_SH(in3, ((ptr) + 3 * stride)); \
+ STORE_SH(in4, ((ptr) + 4 * stride)); \
+ STORE_SH(in5, ((ptr) + 5 * stride)); \
+ STORE_SH(in6, ((ptr) + 6 * stride)); \
+ STORE_SH(in7, ((ptr) + 7 * stride)); \
+}
+
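+/* Clamp each signed halfword lane to the unsigned 8-bit range [0, 255]
+ * ahead of packing back down to bytes.
+ */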
+#define CLIP_UNSIGNED_CHAR_H(in) ({ \
+ v8i16 max_m = __msa_ldi_h(255); \
+ v8i16 out_m; \
+ \
+ out_m = __msa_maxi_s_h((v8i16)(in), 0); \
+ out_m = __msa_min_s_h((v8i16)max_m, (v8i16)out_m); \
+ out_m; \
+})
+
+/* halfword 8x8 transpose macro */
+#define TRANSPOSE8x8_H_SH(in0, in1, in2, in3, \
+ in4, in5, in6, in7, \
+ out0, out1, out2, out3, \
+ out4, out5, out6, out7) { \
+ v8i16 s0_m, s1_m; \
+ v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
+ v8i16 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \
+ \
+ s0_m = __msa_ilvr_h((v8i16)(in6), (v8i16)(in4)); \
+ s1_m = __msa_ilvr_h((v8i16)(in7), (v8i16)(in5)); \
+ tmp0_m = __msa_ilvr_h((v8i16)s1_m, (v8i16)s0_m); \
+ tmp1_m = __msa_ilvl_h((v8i16)s1_m, (v8i16)s0_m); \
+ \
+ s0_m = __msa_ilvl_h((v8i16)(in6), (v8i16)(in4)); \
+ s1_m = __msa_ilvl_h((v8i16)(in7), (v8i16)(in5)); \
+ tmp2_m = __msa_ilvr_h((v8i16)s1_m, (v8i16)s0_m); \
+ tmp3_m = __msa_ilvl_h((v8i16)s1_m, (v8i16)s0_m); \
+ \
+ s0_m = __msa_ilvr_h((v8i16)(in2), (v8i16)(in0)); \
+ s1_m = __msa_ilvr_h((v8i16)(in3), (v8i16)(in1)); \
+ tmp4_m = __msa_ilvr_h((v8i16)s1_m, (v8i16)s0_m); \
+ tmp5_m = __msa_ilvl_h((v8i16)s1_m, (v8i16)s0_m); \
+ \
+ s0_m = __msa_ilvl_h((v8i16)(in2), (v8i16)(in0)); \
+ s1_m = __msa_ilvl_h((v8i16)(in3), (v8i16)(in1)); \
+ tmp6_m = __msa_ilvr_h((v8i16)s1_m, (v8i16)s0_m); \
+ tmp7_m = __msa_ilvl_h((v8i16)s1_m, (v8i16)s0_m); \
+ \
+ out0 = (v8i16)__msa_pckev_d((v2i64)tmp0_m, (v2i64)tmp4_m); \
+ out1 = (v8i16)__msa_pckod_d((v2i64)tmp0_m, (v2i64)tmp4_m); \
+ out2 = (v8i16)__msa_pckev_d((v2i64)tmp1_m, (v2i64)tmp5_m); \
+ out3 = (v8i16)__msa_pckod_d((v2i64)tmp1_m, (v2i64)tmp5_m); \
+ out4 = (v8i16)__msa_pckev_d((v2i64)tmp2_m, (v2i64)tmp6_m); \
+ out5 = (v8i16)__msa_pckod_d((v2i64)tmp2_m, (v2i64)tmp6_m); \
+ out6 = (v8i16)__msa_pckev_d((v2i64)tmp3_m, (v2i64)tmp7_m); \
+ out7 = (v8i16)__msa_pckod_d((v2i64)tmp3_m, (v2i64)tmp7_m); \
+}
+
+/* interleave macros */
+/* no in-place support: output operands must not alias the inputs */
+#define ILV_B_LRLR_UB(in0, in1, in2, in3, \
+ out0, out1, out2, out3) { \
+ out0 = (v16u8)__msa_ilvl_b((v16i8)(in1), (v16i8)(in0)); \
+ out1 = (v16u8)__msa_ilvr_b((v16i8)(in1), (v16i8)(in0)); \
+ out2 = (v16u8)__msa_ilvl_b((v16i8)(in3), (v16i8)(in2)); \
+ out3 = (v16u8)__msa_ilvr_b((v16i8)(in3), (v16i8)(in2)); \
+}
+
+#define ILV_H_LRLR_SH(in0, in1, in2, in3, \
+ out0, out1, out2, out3) { \
+ out0 = __msa_ilvl_h((v8i16)(in1), (v8i16)(in0)); \
+ out1 = __msa_ilvr_h((v8i16)(in1), (v8i16)(in0)); \
+ out2 = __msa_ilvl_h((v8i16)(in3), (v8i16)(in2)); \
+ out3 = __msa_ilvr_h((v8i16)(in3), (v8i16)(in2)); \
+}
+
+#define ILV_H_LR_SH(in0, in1, out0, out1) { \
+ out0 = __msa_ilvl_h((v8i16)(in1), (v8i16)(in0)); \
+ out1 = __msa_ilvr_h((v8i16)(in1), (v8i16)(in0)); \
+}
+
+#define ILVR_B_2VECS_UB(in0_r, in1_r, in0_l, in1_l, \
+ out0, out1) { \
+ out0 = (v16u8)__msa_ilvr_b((v16i8)(in0_l), (v16i8)(in0_r)); \
+ out1 = (v16u8)__msa_ilvr_b((v16i8)(in1_l), (v16i8)(in1_r)); \
+}
+
+#define ILVR_B_2VECS_SB(in0_r, in1_r, in0_l, in1_l, \
+ out0, out1) { \
+ out0 = __msa_ilvr_b((v16i8)(in0_l), (v16i8)(in0_r)); \
+ out1 = __msa_ilvr_b((v16i8)(in1_l), (v16i8)(in1_r)); \
+}
+
+#define ILVR_B_4VECS_UB(in0_r, in1_r, in2_r, in3_r, \
+ in0_l, in1_l, in2_l, in3_l, \
+ out0, out1, out2, out3) { \
+ ILVR_B_2VECS_UB(in0_r, in1_r, in0_l, in1_l, \
+ out0, out1); \
+ ILVR_B_2VECS_UB(in2_r, in3_r, in2_l, in3_l, \
+ out2, out3); \
+}
+
+#define ILVR_B_4VECS_SB(in0_r, in1_r, in2_r, in3_r, \
+ in0_l, in1_l, in2_l, in3_l, \
+ out0, out1, out2, out3) { \
+ ILVR_B_2VECS_SB(in0_r, in1_r, in0_l, in1_l, \
+ out0, out1); \
+ ILVR_B_2VECS_SB(in2_r, in3_r, in2_l, in3_l, \
+ out2, out3); \
+}
+
+#define ILVR_B_6VECS_SB(in0_r, in1_r, in2_r, \
+ in3_r, in4_r, in5_r, \
+ in0_l, in1_l, in2_l, \
+ in3_l, in4_l, in5_l, \
+ out0, out1, out2, \
+ out3, out4, out5) { \
+ ILVR_B_2VECS_SB(in0_r, in1_r, in0_l, in1_l, \
+ out0, out1); \
+ ILVR_B_2VECS_SB(in2_r, in3_r, in2_l, in3_l, \
+ out2, out3); \
+ ILVR_B_2VECS_SB(in4_r, in5_r, in4_l, in5_l, \
+ out4, out5); \
+}
+
+#define ILVR_B_8VECS_SB(in0_r, in1_r, in2_r, in3_r, \
+ in4_r, in5_r, in6_r, in7_r, \
+ in0_l, in1_l, in2_l, in3_l, \
+ in4_l, in5_l, in6_l, in7_l, \
+ out0, out1, out2, out3, \
+ out4, out5, out6, out7) { \
+ ILVR_B_2VECS_SB(in0_r, in1_r, in0_l, in1_l, \
+ out0, out1); \
+ ILVR_B_2VECS_SB(in2_r, in3_r, in2_l, in3_l, \
+ out2, out3); \
+ ILVR_B_2VECS_SB(in4_r, in5_r, in4_l, in5_l, \
+ out4, out5); \
+ ILVR_B_2VECS_SB(in6_r, in7_r, in6_l, in7_l, \
+ out6, out7); \
+}
+
+#define ILVL_B_2VECS_SB(in0_r, in1_r, in0_l, in1_l, \
+ out0, out1) { \
+ out0 = __msa_ilvl_b((v16i8)(in0_l), (v16i8)(in0_r)); \
+ out1 = __msa_ilvl_b((v16i8)(in1_l), (v16i8)(in1_r)); \
+}
+
+#define ILVL_B_4VECS_SB(in0_r, in1_r, in2_r, in3_r, \
+ in0_l, in1_l, in2_l, in3_l, \
+ out0, out1, out2, out3) { \
+ ILVL_B_2VECS_SB(in0_r, in1_r, in0_l, in1_l, \
+ out0, out1); \
+ ILVL_B_2VECS_SB(in2_r, in3_r, in2_l, in3_l, \
+ out2, out3); \
+}
+
+#define ILVL_B_6VECS_SB(in0_r, in1_r, in2_r, \
+ in3_r, in4_r, in5_r, \
+ in0_l, in1_l, in2_l, \
+ in3_l, in4_l, in5_l, \
+ out0, out1, out2, \
+ out3, out4, out5) { \
+ ILVL_B_2VECS_SB(in0_r, in1_r, in0_l, in1_l, \
+ out0, out1); \
+ ILVL_B_2VECS_SB(in2_r, in3_r, in2_l, in3_l, \
+ out2, out3); \
+ ILVL_B_2VECS_SB(in4_r, in5_r, in4_l, in5_l, \
+ out4, out5); \
+}
+
+#define ILVR_D_2VECS_SB(out0, in0_l, in0_r, \
+ out1, in1_l, in1_r) { \
+ out0 = (v16i8)__msa_ilvr_d((v2i64)(in0_l), (v2i64)(in0_r)); \
+ out1 = (v16i8)__msa_ilvr_d((v2i64)(in1_l), (v2i64)(in1_r)); \
+}
+
+#define ILVR_D_3VECS_SB(out0, in0_l, in0_r, \
+ out1, in1_l, in1_r, \
+ out2, in2_l, in2_r) { \
+ ILVR_D_2VECS_SB(out0, in0_l, in0_r, \
+ out1, in1_l, in1_r); \
+ out2 = (v16i8)__msa_ilvr_d((v2i64)(in2_l), (v2i64)(in2_r)); \
+}
+
+#define ILVR_D_4VECS_SB(out0, in0_l, in0_r, \
+ out1, in1_l, in1_r, \
+ out2, in2_l, in2_r, \
+ out3, in3_l, in3_r) { \
+ ILVR_D_2VECS_SB(out0, in0_l, in0_r, \
+ out1, in1_l, in1_r); \
+ ILVR_D_2VECS_SB(out2, in2_l, in2_r, \
+ out3, in3_l, in3_r); \
+}
+
+#define DOTP_S_W_4VECS_SW(m0, c0, m1, c1, \
+ m2, c2, m3, c3, \
+ out0, out1, out2, out3) { \
+ out0 = __msa_dotp_s_w((v8i16)(m0), (v8i16)(c0)); \
+ out1 = __msa_dotp_s_w((v8i16)(m1), (v8i16)(c1)); \
+ out2 = __msa_dotp_s_w((v8i16)(m2), (v8i16)(c2)); \
+ out3 = __msa_dotp_s_w((v8i16)(m3), (v8i16)(c3)); \
+}
+
+#define PCKEV_H_2VECS_SH(in0_l, in0_r, in1_l, in1_r, \
+ out0, out1) { \
+ out0 = __msa_pckev_h((v8i16)(in0_l), (v8i16)(in0_r)); \
+ out1 = __msa_pckev_h((v8i16)(in1_l), (v8i16)(in1_r)); \
+}
+
+#define XORI_B_2VECS_UB(val0, val1, \
+ out0, out1, xor_val) { \
+ out0 = __msa_xori_b((v16u8)(val0), (xor_val)); \
+ out1 = __msa_xori_b((v16u8)(val1), (xor_val)); \
+}
+
+#define XORI_B_2VECS_SB(val0, val1, \
+ out0, out1, xor_val) { \
+ out0 = (v16i8)__msa_xori_b((v16u8)(val0), (xor_val)); \
+ out1 = (v16i8)__msa_xori_b((v16u8)(val1), (xor_val)); \
+}
+
+#define XORI_B_3VECS_SB(val0, val1, val2, \
+ out0, out1, out2, xor_val) { \
+ XORI_B_2VECS_SB(val0, val1, out0, out1, xor_val); \
+ out2 = (v16i8)__msa_xori_b((v16u8)(val2), (xor_val)); \
+}
+
+#define XORI_B_4VECS_UB(val0, val1, val2, val3, \
+ out0, out1, out2, out3, \
+ xor_val) { \
+ XORI_B_2VECS_UB(val0, val1, out0, out1, xor_val); \
+ XORI_B_2VECS_UB(val2, val3, out2, out3, xor_val); \
+}
+
+#define XORI_B_4VECS_SB(val0, val1, val2, val3, \
+ out0, out1, out2, out3, \
+ xor_val) { \
+ XORI_B_2VECS_SB(val0, val1, out0, out1, xor_val); \
+ XORI_B_2VECS_SB(val2, val3, out2, out3, xor_val); \
+}
+
+#define XORI_B_7VECS_SB(val0, val1, val2, val3, \
+ val4, val5, val6, \
+ out0, out1, out2, out3, \
+ out4, out5, out6, \
+ xor_val) { \
+ XORI_B_4VECS_SB(val0, val1, val2, val3, \
+ out0, out1, out2, out3, xor_val); \
+ XORI_B_3VECS_SB(val4, val5, val6, \
+ out4, out5, out6, xor_val); \
+}
+
+#define SRARI_H_4VECS_UH(val0, val1, val2, val3, \
+ out0, out1, out2, out3, \
+ shift_right_val) { \
+ out0 = (v8u16)__msa_srari_h((v8i16)(val0), (shift_right_val)); \
+ out1 = (v8u16)__msa_srari_h((v8i16)(val1), (shift_right_val)); \
+ out2 = (v8u16)__msa_srari_h((v8i16)(val2), (shift_right_val)); \
+ out3 = (v8u16)__msa_srari_h((v8i16)(val3), (shift_right_val)); \
+}
+
+#define SRARI_H_4VECS_SH(val0, val1, val2, val3, \
+ out0, out1, out2, out3, \
+ shift_right_val) { \
+ out0 = __msa_srari_h((v8i16)(val0), (shift_right_val)); \
+ out1 = __msa_srari_h((v8i16)(val1), (shift_right_val)); \
+ out2 = __msa_srari_h((v8i16)(val2), (shift_right_val)); \
+ out3 = __msa_srari_h((v8i16)(val3), (shift_right_val)); \
+}
+
+#define SRARI_W_4VECS_SW(val0, val1, val2, val3, \
+ out0, out1, out2, out3, \
+ shift_right_val) { \
+ out0 = __msa_srari_w((v4i32)(val0), (shift_right_val)); \
+ out1 = __msa_srari_w((v4i32)(val1), (shift_right_val)); \
+ out2 = __msa_srari_w((v4i32)(val2), (shift_right_val)); \
+ out3 = __msa_srari_w((v4i32)(val3), (shift_right_val)); \
+}
+
+#define SRARI_SATURATE_UNSIGNED_H(input, right_shift_val, sat_val) ({ \
+ v8u16 out_m; \
+ \
+ out_m = (v8u16)__msa_srari_h((v8i16)(input), (right_shift_val)); \
+ out_m = __msa_sat_u_h(out_m, (sat_val)); \
+ out_m; \
+})
+
+#define SRARI_SATURATE_SIGNED_H(input, right_shift_val, sat_val) ({ \
+ v8i16 out_m; \
+ \
+ out_m = __msa_srari_h((v8i16)(input), (right_shift_val)); \
+ out_m = __msa_sat_s_h(out_m, (sat_val)); \
+ out_m; \
+})
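
The two statement-expression macros above pair a rounding arithmetic right shift with a saturate. A scalar sketch of the same per-lane pattern, assuming the usual MSA semantics where __msa_sat_u_h with immediate n clamps to the (n+1)-bit unsigned maximum (e.g. 7 -> 255); names here are illustrative, not libvpx API:

#include <stdint.h>

/* Scalar model of SRARI_SATURATE_UNSIGNED_H on one lane: shift right
 * with round-to-nearest, reinterpret the lane as unsigned, then clamp
 * to the (sat_bits + 1)-bit unsigned maximum. */
static uint16_t srari_sat_u(int16_t v, int shift, int sat_bits) {
  int32_t r = ((int32_t)v + (1 << (shift - 1))) >> shift;   /* srari_h */
  uint16_t u = (uint16_t)r;            /* lane reinterpreted unsigned */
  uint16_t max = (uint16_t)((1 << (sat_bits + 1)) - 1);     /* sat_u_h */
  return u > max ? max : u;
}
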
+
+#define PCKEV_2B_XORI128_STORE_4_BYTES_4(in1, in2, \
+ pdst, stride) { \
+ uint32_t out0_m, out1_m, out2_m, out3_m; \
+ v16i8 tmp0_m; \
+ uint8_t *dst_m = (uint8_t *)(pdst); \
+ \
+ tmp0_m = __msa_pckev_b((v16i8)(in2), (v16i8)(in1)); \
+ tmp0_m = (v16i8)__msa_xori_b((v16u8)tmp0_m, 128); \
+ \
+ out0_m = __msa_copy_u_w((v4i32)tmp0_m, 0); \
+ out1_m = __msa_copy_u_w((v4i32)tmp0_m, 1); \
+ out2_m = __msa_copy_u_w((v4i32)tmp0_m, 2); \
+ out3_m = __msa_copy_u_w((v4i32)tmp0_m, 3); \
+ \
+ STORE_WORD(dst_m, out0_m); \
+ dst_m += stride; \
+ STORE_WORD(dst_m, out1_m); \
+ dst_m += stride; \
+ STORE_WORD(dst_m, out2_m); \
+ dst_m += stride; \
+ STORE_WORD(dst_m, out3_m); \
+}
+
+#define PCKEV_B_4_XORI128_STORE_8_BYTES_4(in1, in2, \
+ in3, in4, \
+ pdst, stride) { \
+ uint64_t out0_m, out1_m, out2_m, out3_m; \
+ v16i8 tmp0_m, tmp1_m; \
+ uint8_t *dst_m = (uint8_t *)(pdst); \
+ \
+ tmp0_m = __msa_pckev_b((v16i8)(in2), (v16i8)(in1)); \
+ tmp1_m = __msa_pckev_b((v16i8)(in4), (v16i8)(in3)); \
+ \
+ tmp0_m = (v16i8)__msa_xori_b((v16u8)tmp0_m, 128); \
+ tmp1_m = (v16i8)__msa_xori_b((v16u8)tmp1_m, 128); \
+ \
+ out0_m = __msa_copy_u_d((v2i64)tmp0_m, 0); \
+ out1_m = __msa_copy_u_d((v2i64)tmp0_m, 1); \
+ out2_m = __msa_copy_u_d((v2i64)tmp1_m, 0); \
+ out3_m = __msa_copy_u_d((v2i64)tmp1_m, 1); \
+ \
+ STORE_DWORD(dst_m, out0_m); \
+ dst_m += stride; \
+ STORE_DWORD(dst_m, out1_m); \
+ dst_m += stride; \
+ STORE_DWORD(dst_m, out2_m); \
+ dst_m += stride; \
+ STORE_DWORD(dst_m, out3_m); \
+}
+
+/* Only for signed vecs */
+#define PCKEV_B_XORI128_STORE_VEC(in1, in2, pdest) { \
+ v16i8 tmp_m; \
+ \
+ tmp_m = __msa_pckev_b((v16i8)(in1), (v16i8)(in2)); \
+ tmp_m = (v16i8)__msa_xori_b((v16u8)tmp_m, 128); \
+ STORE_SB(tmp_m, (pdest)); \
+}
+
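
A note on the xori-with-128 step used throughout these store macros: for 8-bit lanes, x ^ 0x80 equals x + 128 modulo 256, which re-biases signed pixel math in [-128, 127] back to the unsigned [0, 255] range. A standalone scalar check (illustrative only):

#include <assert.h>
#include <stdint.h>

int main(void) {
  int v;
  /* XOR with 0x80 flips the sign bit, i.e. adds 128 modulo 256. */
  for (v = -128; v <= 127; ++v)
    assert((uint8_t)(v ^ 0x80) == (uint8_t)(v + 128));
  return 0;
}
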
+/* Only for signed vecs */
+#define PCKEV_B_4_XORI128_AVG_STORE_8_BYTES_4(in1, dst0, \
+ in2, dst1, \
+ in3, dst2, \
+ in4, dst3, \
+ pdst, stride) { \
+ uint64_t out0_m, out1_m, out2_m, out3_m; \
+ v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
+ uint8_t *dst_m = (uint8_t *)(pdst); \
+ \
+ tmp0_m = (v16u8)__msa_pckev_b((v16i8)(in2), (v16i8)(in1)); \
+ tmp1_m = (v16u8)__msa_pckev_b((v16i8)(in4), (v16i8)(in3)); \
+ \
+ tmp2_m = (v16u8)__msa_ilvr_d((v2i64)(dst1), (v2i64)(dst0)); \
+ tmp3_m = (v16u8)__msa_ilvr_d((v2i64)(dst3), (v2i64)(dst2)); \
+ \
+ tmp0_m = __msa_xori_b(tmp0_m, 128); \
+ tmp1_m = __msa_xori_b(tmp1_m, 128); \
+ \
+ tmp0_m = __msa_aver_u_b(tmp0_m, tmp2_m); \
+ tmp1_m = __msa_aver_u_b(tmp1_m, tmp3_m); \
+ \
+ out0_m = __msa_copy_u_d((v2i64)tmp0_m, 0); \
+ out1_m = __msa_copy_u_d((v2i64)tmp0_m, 1); \
+ out2_m = __msa_copy_u_d((v2i64)tmp1_m, 0); \
+ out3_m = __msa_copy_u_d((v2i64)tmp1_m, 1); \
+ \
+ STORE_DWORD(dst_m, out0_m); \
+ dst_m += stride; \
+ STORE_DWORD(dst_m, out1_m); \
+ dst_m += stride; \
+ STORE_DWORD(dst_m, out2_m); \
+ dst_m += stride; \
+ STORE_DWORD(dst_m, out3_m); \
+}
+
+/* Only for signed vecs */
+#define PCKEV_B_XORI128_AVG_STORE_VEC(in1, in2, dst, pdest) { \
+ v16u8 tmp_m; \
+ \
+ tmp_m = (v16u8)__msa_pckev_b((v16i8)(in1), (v16i8)(in2)); \
+ tmp_m = __msa_xori_b(tmp_m, 128); \
+ tmp_m = __msa_aver_u_b(tmp_m, (v16u8)(dst)); \
+ STORE_UB(tmp_m, (pdest)); \
+}
+
+#define PCKEV_B_STORE_8_BYTES_4(in1, in2, in3, in4, \
+ pdst, stride) { \
+ uint64_t out0_m, out1_m, out2_m, out3_m; \
+ v16i8 tmp0_m, tmp1_m; \
+ uint8_t *dst_m = (uint8_t *)(pdst); \
+ \
+ tmp0_m = __msa_pckev_b((v16i8)(in2), (v16i8)(in1)); \
+ tmp1_m = __msa_pckev_b((v16i8)(in4), (v16i8)(in3)); \
+ \
+ out0_m = __msa_copy_u_d((v2i64)tmp0_m, 0); \
+ out1_m = __msa_copy_u_d((v2i64)tmp0_m, 1); \
+ out2_m = __msa_copy_u_d((v2i64)tmp1_m, 0); \
+ out3_m = __msa_copy_u_d((v2i64)tmp1_m, 1); \
+ \
+ STORE_DWORD(dst_m, out0_m); \
+ dst_m += stride; \
+ STORE_DWORD(dst_m, out1_m); \
+ dst_m += stride; \
+ STORE_DWORD(dst_m, out2_m); \
+ dst_m += stride; \
+ STORE_DWORD(dst_m, out3_m); \
+}
+
+/* Only for unsigned vecs */
+#define PCKEV_B_STORE_VEC(in1, in2, pdest) { \
+ v16i8 tmp_m; \
+ \
+ tmp_m = __msa_pckev_b((v16i8)(in1), (v16i8)(in2)); \
+ STORE_SB(tmp_m, (pdest)); \
+}
+
+#define PCKEV_B_AVG_STORE_8_BYTES_4(in1, dst0, in2, dst1, \
+ in3, dst2, in4, dst3, \
+ pdst, stride) { \
+ uint64_t out0_m, out1_m, out2_m, out3_m; \
+ v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
+ uint8_t *dst_m = (uint8_t *)(pdst); \
+ \
+ tmp0_m = (v16u8)__msa_pckev_b((v16i8)(in2), (v16i8)(in1)); \
+ tmp1_m = (v16u8)__msa_pckev_b((v16i8)(in4), (v16i8)(in3)); \
+ \
+ tmp2_m = (v16u8)__msa_pckev_d((v2i64)(dst1), (v2i64)(dst0)); \
+ tmp3_m = (v16u8)__msa_pckev_d((v2i64)(dst3), (v2i64)(dst2)); \
+ \
+ tmp0_m = __msa_aver_u_b(tmp0_m, tmp2_m); \
+ tmp1_m = __msa_aver_u_b(tmp1_m, tmp3_m); \
+ \
+ out0_m = __msa_copy_u_d((v2i64)tmp0_m, 0); \
+ out1_m = __msa_copy_u_d((v2i64)tmp0_m, 1); \
+ out2_m = __msa_copy_u_d((v2i64)tmp1_m, 0); \
+ out3_m = __msa_copy_u_d((v2i64)tmp1_m, 1); \
+ \
+ STORE_DWORD(dst_m, out0_m); \
+ dst_m += stride; \
+ STORE_DWORD(dst_m, out1_m); \
+ dst_m += stride; \
+ STORE_DWORD(dst_m, out2_m); \
+ dst_m += stride; \
+ STORE_DWORD(dst_m, out3_m); \
+}
+
+#define PCKEV_B_AVG_STORE_VEC(in1, in2, dst, pdest) { \
+ v16u8 tmp_m; \
+ \
+ tmp_m = (v16u8)__msa_pckev_b((v16i8)(in1), (v16i8)(in2)); \
+ tmp_m = __msa_aver_u_b(tmp_m, (v16u8)(dst)); \
+ STORE_UB(tmp_m, (pdest)); \
+}
+
+/* Generic for Vector types and GP operations */
+#define BUTTERFLY_4(in0, in1, in2, in3, \
+ out0, out1, out2, out3) { \
+ out0 = (in0) + (in3); \
+ out1 = (in1) + (in2); \
+ \
+ out2 = (in1) - (in2); \
+ out3 = (in0) - (in3); \
+}
+
+/* Generic for Vector types and GP operations */
+#define BUTTERFLY_8(in0, in1, in2, in3, \
+ in4, in5, in6, in7, \
+ out0, out1, out2, out3, \
+ out4, out5, out6, out7) { \
+ out0 = (in0) + (in7); \
+ out1 = (in1) + (in6); \
+ out2 = (in2) + (in5); \
+ out3 = (in3) + (in4); \
+ \
+ out4 = (in3) - (in4); \
+ out5 = (in2) - (in5); \
+ out6 = (in1) - (in6); \
+ out7 = (in0) - (in7); \
+}
+#endif /* HAVE_MSA */
+#endif /* VP9_COMMON_MIPS_MSA_VP9_MACROS_MSA_H_ */
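
BUTTERFLY_4 and BUTTERFLY_8 above are deliberately type-generic (only + and -), so they work on MSA vectors and plain integers alike. A minimal scalar sketch of the 4-wide crossed add/sub pattern used in the transform stages:

#include <stdio.h>

#define BUTTERFLY_4(in0, in1, in2, in3, out0, out1, out2, out3) { \
  out0 = (in0) + (in3);                                           \
  out1 = (in1) + (in2);                                           \
  out2 = (in1) - (in2);                                           \
  out3 = (in0) - (in3);                                           \
}

int main(void) {
  int s0, s1, d0, d1;
  BUTTERFLY_4(1, 2, 3, 4, s0, s1, d0, d1);
  printf("%d %d %d %d\n", s0, s1, d0, d1);  /* 5 5 -1 -3 */
  return 0;
}
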
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_alloccommon.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_alloccommon.c
index 8b3b9dbe0f7..7db210c3ae0 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_alloccommon.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_alloccommon.c
@@ -17,15 +17,22 @@
#include "vp9/common/vp9_onyxc_int.h"
#include "vp9/common/vp9_systemdependent.h"
-static void clear_mi_border(const VP9_COMMON *cm, MODE_INFO *mi) {
- int i;
-
- // Top border row
- vpx_memset(mi, 0, sizeof(*mi) * cm->mi_stride);
+// TODO(hkuang): Don't need to lock the whole pool after implementing atomic
+// frame reference count.
+void lock_buffer_pool(BufferPool *const pool) {
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(&pool->pool_mutex);
+#else
+ (void)pool;
+#endif
+}
- // Left border column
- for (i = 1; i < cm->mi_rows + 1; ++i)
- vpx_memset(&mi[i * cm->mi_stride], 0, sizeof(*mi));
+void unlock_buffer_pool(BufferPool *const pool) {
+#if CONFIG_MULTITHREAD
+ pthread_mutex_unlock(&pool->pool_mutex);
+#else
+ (void)pool;
+#endif
}
void vp9_set_mb_mi(VP9_COMMON *cm, int width, int height) {
@@ -41,73 +48,70 @@ void vp9_set_mb_mi(VP9_COMMON *cm, int width, int height) {
cm->MBs = cm->mb_rows * cm->mb_cols;
}
-static void setup_mi(VP9_COMMON *cm) {
- cm->mi = cm->mip + cm->mi_stride + 1;
- cm->prev_mi = cm->prev_mip + cm->mi_stride + 1;
-
- vpx_memset(cm->mip, 0, cm->mi_stride * (cm->mi_rows + 1) * sizeof(*cm->mip));
- clear_mi_border(cm, cm->prev_mip);
-}
-
-static int alloc_mi(VP9_COMMON *cm, int mi_size) {
+static int alloc_seg_map(VP9_COMMON *cm, int seg_map_size) {
int i;
- for (i = 0; i < 2; ++i) {
- cm->mip_array[i] =
- (MODE_INFO *)vpx_calloc(mi_size, sizeof(MODE_INFO));
- if (cm->mip_array[i] == NULL)
+ for (i = 0; i < NUM_PING_PONG_BUFFERS; ++i) {
+ cm->seg_map_array[i] = (uint8_t *)vpx_calloc(seg_map_size, 1);
+ if (cm->seg_map_array[i] == NULL)
return 1;
}
- cm->mi_alloc_size = mi_size;
-
// Init the index.
- cm->mi_idx = 0;
- cm->prev_mi_idx = 1;
+ cm->seg_map_idx = 0;
+ cm->prev_seg_map_idx = 1;
- cm->mip = cm->mip_array[cm->mi_idx];
- cm->prev_mip = cm->mip_array[cm->prev_mi_idx];
+ cm->current_frame_seg_map = cm->seg_map_array[cm->seg_map_idx];
+ if (!cm->frame_parallel_decode)
+ cm->last_frame_seg_map = cm->seg_map_array[cm->prev_seg_map_idx];
return 0;
}
-static void free_mi(VP9_COMMON *cm) {
+static void free_seg_map(VP9_COMMON *cm) {
int i;
- for (i = 0; i < 2; ++i) {
- vpx_free(cm->mip_array[i]);
- cm->mip_array[i] = NULL;
+ for (i = 0; i < NUM_PING_PONG_BUFFERS; ++i) {
+ vpx_free(cm->seg_map_array[i]);
+ cm->seg_map_array[i] = NULL;
}
- cm->mip = NULL;
- cm->prev_mip = NULL;
+ cm->current_frame_seg_map = NULL;
+
+ if (!cm->frame_parallel_decode) {
+ cm->last_frame_seg_map = NULL;
+ }
}
-void vp9_free_ref_frame_buffers(VP9_COMMON *cm) {
+void vp9_free_ref_frame_buffers(BufferPool *pool) {
int i;
for (i = 0; i < FRAME_BUFFERS; ++i) {
- vp9_free_frame_buffer(&cm->frame_bufs[i].buf);
-
- if (cm->frame_bufs[i].ref_count > 0 &&
- cm->frame_bufs[i].raw_frame_buffer.data != NULL) {
- cm->release_fb_cb(cm->cb_priv, &cm->frame_bufs[i].raw_frame_buffer);
- cm->frame_bufs[i].ref_count = 0;
+ if (pool->frame_bufs[i].ref_count > 0 &&
+ pool->frame_bufs[i].raw_frame_buffer.data != NULL) {
+ pool->release_fb_cb(pool->cb_priv, &pool->frame_bufs[i].raw_frame_buffer);
+ pool->frame_bufs[i].ref_count = 0;
}
+ vpx_free(pool->frame_bufs[i].mvs);
+ pool->frame_bufs[i].mvs = NULL;
+ vp9_free_frame_buffer(&pool->frame_bufs[i].buf);
}
+}
+void vp9_free_postproc_buffers(VP9_COMMON *cm) {
+#if CONFIG_VP9_POSTPROC
vp9_free_frame_buffer(&cm->post_proc_buffer);
+ vp9_free_frame_buffer(&cm->post_proc_buffer_int);
+#else
+ (void)cm;
+#endif
}
void vp9_free_context_buffers(VP9_COMMON *cm) {
- free_mi(cm);
-
- vpx_free(cm->last_frame_seg_map);
- cm->last_frame_seg_map = NULL;
-
+ cm->free_mi(cm);
+ free_seg_map(cm);
vpx_free(cm->above_context);
cm->above_context = NULL;
-
vpx_free(cm->above_seg_context);
cm->above_seg_context = NULL;
}
@@ -116,11 +120,13 @@ int vp9_alloc_context_buffers(VP9_COMMON *cm, int width, int height) {
vp9_free_context_buffers(cm);
vp9_set_mb_mi(cm, width, height);
- if (alloc_mi(cm, cm->mi_stride * calc_mi_size(cm->mi_rows)))
+ if (cm->alloc_mi(cm, cm->mi_stride * calc_mi_size(cm->mi_rows)))
goto fail;
- cm->last_frame_seg_map = (uint8_t *)vpx_calloc(cm->mi_rows * cm->mi_cols, 1);
- if (!cm->last_frame_seg_map) goto fail;
+ // Create the segmentation map structure and set it to 0.
+ free_seg_map(cm);
+ if (alloc_seg_map(cm, cm->mi_rows * cm->mi_cols))
+ goto fail;
cm->above_context = (ENTROPY_CONTEXT *)vpx_calloc(
2 * mi_cols_aligned_to_sb(cm->mi_cols) * MAX_MB_PLANE,
@@ -138,77 +144,27 @@ int vp9_alloc_context_buffers(VP9_COMMON *cm, int width, int height) {
return 1;
}
-static void init_frame_bufs(VP9_COMMON *cm) {
- int i;
-
- cm->new_fb_idx = FRAME_BUFFERS - 1;
- cm->frame_bufs[cm->new_fb_idx].ref_count = 1;
-
- for (i = 0; i < REF_FRAMES; ++i) {
- cm->ref_frame_map[i] = i;
- cm->frame_bufs[i].ref_count = 1;
- }
-}
-
-int vp9_alloc_ref_frame_buffers(VP9_COMMON *cm, int width, int height) {
- int i;
- const int ss_x = cm->subsampling_x;
- const int ss_y = cm->subsampling_y;
-
- vp9_free_ref_frame_buffers(cm);
-
- for (i = 0; i < FRAME_BUFFERS; ++i) {
- cm->frame_bufs[i].ref_count = 0;
- if (vp9_alloc_frame_buffer(&cm->frame_bufs[i].buf, width, height,
- ss_x, ss_y,
-#if CONFIG_VP9_HIGHBITDEPTH
- cm->use_highbitdepth,
-#endif
- VP9_ENC_BORDER_IN_PIXELS) < 0)
- goto fail;
- }
-
- init_frame_bufs(cm);
-
-#if CONFIG_INTERNAL_STATS || CONFIG_VP9_POSTPROC
- if (vp9_alloc_frame_buffer(&cm->post_proc_buffer, width, height, ss_x, ss_y,
-#if CONFIG_VP9_HIGHBITDEPTH
- cm->use_highbitdepth,
-#endif
- VP9_ENC_BORDER_IN_PIXELS) < 0)
- goto fail;
-#endif
-
- return 0;
-
- fail:
- vp9_free_ref_frame_buffers(cm);
- return 1;
-}
-
void vp9_remove_common(VP9_COMMON *cm) {
- vp9_free_ref_frame_buffers(cm);
vp9_free_context_buffers(cm);
- vp9_free_internal_frame_buffers(&cm->int_frame_buffers);
+
+ vpx_free(cm->fc);
+ cm->fc = NULL;
+ vpx_free(cm->frame_contexts);
+ cm->frame_contexts = NULL;
}
void vp9_init_context_buffers(VP9_COMMON *cm) {
- setup_mi(cm);
- if (cm->last_frame_seg_map)
- vpx_memset(cm->last_frame_seg_map, 0, cm->mi_rows * cm->mi_cols);
+ cm->setup_mi(cm);
+ if (cm->last_frame_seg_map && !cm->frame_parallel_decode)
+ memset(cm->last_frame_seg_map, 0, cm->mi_rows * cm->mi_cols);
}
-void vp9_swap_mi_and_prev_mi(VP9_COMMON *cm) {
+void vp9_swap_current_and_last_seg_map(VP9_COMMON *cm) {
// Swap indices.
- const int tmp = cm->mi_idx;
- cm->mi_idx = cm->prev_mi_idx;
- cm->prev_mi_idx = tmp;
-
- // Current mip will be the prev_mip for the next frame.
- cm->mip = cm->mip_array[cm->mi_idx];
- cm->prev_mip = cm->mip_array[cm->prev_mi_idx];
+ const int tmp = cm->seg_map_idx;
+ cm->seg_map_idx = cm->prev_seg_map_idx;
+ cm->prev_seg_map_idx = tmp;
- // Update the upper left visible macroblock ptrs.
- cm->mi = cm->mip + cm->mi_stride + 1;
- cm->prev_mi = cm->prev_mip + cm->mi_stride + 1;
+ cm->current_frame_seg_map = cm->seg_map_array[cm->seg_map_idx];
+ cm->last_frame_seg_map = cm->seg_map_array[cm->prev_seg_map_idx];
}
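
The seg_map_array change above replaces a single allocation with NUM_PING_PONG_BUFFERS index-swapped buffers, so vp9_swap_current_and_last_seg_map flips two indices instead of copying a frame-sized map. A minimal sketch of the ping-pong pattern with simplified types (not the libvpx structs):

#include <stdint.h>

/* Simplified ping-pong holder; libvpx keeps these fields in VP9_COMMON. */
typedef struct {
  uint8_t *seg_map_array[2];
  int seg_map_idx, prev_seg_map_idx;
  uint8_t *current_frame_seg_map, *last_frame_seg_map;
} PingPong;

static void swap_seg_maps(PingPong *pp) {
  const int tmp = pp->seg_map_idx;          /* swap indices, not data */
  pp->seg_map_idx = pp->prev_seg_map_idx;
  pp->prev_seg_map_idx = tmp;
  pp->current_frame_seg_map = pp->seg_map_array[pp->seg_map_idx];
  pp->last_frame_seg_map = pp->seg_map_array[pp->prev_seg_map_idx];
}
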
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_alloccommon.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_alloccommon.h
index c5b893facca..c0e51a6ce64 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_alloccommon.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_alloccommon.h
@@ -12,11 +12,14 @@
#ifndef VP9_COMMON_VP9_ALLOCCOMMON_H_
#define VP9_COMMON_VP9_ALLOCCOMMON_H_
+#define INVALID_IDX -1 // Invalid buffer index.
+
#ifdef __cplusplus
extern "C" {
#endif
struct VP9Common;
+struct BufferPool;
void vp9_remove_common(struct VP9Common *cm);
@@ -24,14 +27,15 @@ int vp9_alloc_context_buffers(struct VP9Common *cm, int width, int height);
void vp9_init_context_buffers(struct VP9Common *cm);
void vp9_free_context_buffers(struct VP9Common *cm);
-int vp9_alloc_ref_frame_buffers(struct VP9Common *cm, int width, int height);
-void vp9_free_ref_frame_buffers(struct VP9Common *cm);
+void vp9_free_ref_frame_buffers(struct BufferPool *pool);
+void vp9_free_postproc_buffers(struct VP9Common *cm);
int vp9_alloc_state_buffers(struct VP9Common *cm, int width, int height);
void vp9_free_state_buffers(struct VP9Common *cm);
void vp9_set_mb_mi(struct VP9Common *cm, int width, int height);
-void vp9_swap_mi_and_prev_mi(struct VP9Common *cm);
+
+void vp9_swap_current_and_last_seg_map(struct VP9Common *cm);
#ifdef __cplusplus
} // extern "C"
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_blockd.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_blockd.c
index 7094a0118c0..b2bb1818893 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_blockd.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_blockd.c
@@ -40,7 +40,7 @@ void vp9_foreach_transformed_block_in_plane(
const MACROBLOCKD *const xd, BLOCK_SIZE bsize, int plane,
foreach_transformed_block_visitor visit, void *arg) {
const struct macroblockd_plane *const pd = &xd->plane[plane];
- const MB_MODE_INFO* mbmi = &xd->mi[0].src_mi->mbmi;
+ const MB_MODE_INFO* mbmi = &xd->mi[0]->mbmi;
// block and transform sizes, in number of 4x4 blocks log 2 ("*_b")
// 4x4=0, 8x8=2, 16x16=4, 32x32=6, 64x64=8
// transform size varies per plane, look it up in a common way.
@@ -50,39 +50,25 @@ void vp9_foreach_transformed_block_in_plane(
const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
const int step = 1 << (tx_size << 1);
- int i;
+ int i = 0, r, c;
// If mb_to_right_edge is < 0 we are in a situation in which
// the current block size extends into the UMV and we won't
// visit the sub blocks that are wholly within the UMV.
- if (xd->mb_to_right_edge < 0 || xd->mb_to_bottom_edge < 0) {
- int r, c;
-
- int max_blocks_wide = num_4x4_w;
- int max_blocks_high = num_4x4_h;
-
- // xd->mb_to_right_edge is in units of pixels * 8. This converts
- // it to 4x4 block sizes.
- if (xd->mb_to_right_edge < 0)
- max_blocks_wide += (xd->mb_to_right_edge >> (5 + pd->subsampling_x));
-
- if (xd->mb_to_bottom_edge < 0)
- max_blocks_high += (xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
-
- i = 0;
- // Unlike the normal case - in here we have to keep track of the
- // row and column of the blocks we use so that we know if we are in
- // the unrestricted motion border.
- for (r = 0; r < num_4x4_h; r += (1 << tx_size)) {
- for (c = 0; c < num_4x4_w; c += (1 << tx_size)) {
- if (r < max_blocks_high && c < max_blocks_wide)
- visit(plane, i, plane_bsize, tx_size, arg);
- i += step;
- }
+ const int max_blocks_wide = num_4x4_w + (xd->mb_to_right_edge >= 0 ? 0 :
+ xd->mb_to_right_edge >> (5 + pd->subsampling_x));
+ const int max_blocks_high = num_4x4_h + (xd->mb_to_bottom_edge >= 0 ? 0 :
+ xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
+
+ // Keep track of the row and column of the blocks we use so that we know
+ // if we are in the unrestricted motion border.
+ for (r = 0; r < max_blocks_high; r += (1 << tx_size)) {
+ for (c = 0; c < num_4x4_w; c += (1 << tx_size)) {
+ // Skip visiting the sub blocks that are wholly within the UMV.
+ if (c < max_blocks_wide)
+ visit(plane, i, plane_bsize, tx_size, arg);
+ i += step;
}
- } else {
- for (i = 0; i < num_4x4_w * num_4x4_h; i += step)
- visit(plane, i, plane_bsize, tx_size, arg);
}
}
@@ -117,7 +103,7 @@ void vp9_set_contexts(const MACROBLOCKD *xd, struct macroblockd_plane *pd,
for (i = above_contexts; i < tx_size_in_blocks; ++i)
a[i] = 0;
} else {
- vpx_memset(a, has_eob, sizeof(ENTROPY_CONTEXT) * tx_size_in_blocks);
+ memset(a, has_eob, sizeof(ENTROPY_CONTEXT) * tx_size_in_blocks);
}
// left
@@ -134,7 +120,7 @@ void vp9_set_contexts(const MACROBLOCKD *xd, struct macroblockd_plane *pd,
for (i = left_contexts; i < tx_size_in_blocks; ++i)
l[i] = 0;
} else {
- vpx_memset(l, has_eob, sizeof(ENTROPY_CONTEXT) * tx_size_in_blocks);
+ memset(l, has_eob, sizeof(ENTROPY_CONTEXT) * tx_size_in_blocks);
}
}
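
On the unit conversion in the refactored loop bounds above: mb_to_right_edge is in eighth-pel units, so >> 5 divides by 32 = 8 (subpel steps per pixel) * 4 (pixels per 4x4 block), and the extra subsampling shift halves it again for chroma. A worked example with assumed values:

#include <stdio.h>

int main(void) {
  const int mb_to_right_edge = -64;  /* 8 pixels past the edge, in 1/8 pel */
  const int subsampling_x = 1;       /* 4:2:0 chroma plane */
  const int num_4x4_w = 2;
  const int max_blocks_wide = num_4x4_w +
      (mb_to_right_edge >= 0 ? 0 : mb_to_right_edge >> (5 + subsampling_x));
  printf("%d\n", max_blocks_wide);   /* 2 + (-64 >> 6) = 1 */
  return 0;
}
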
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_blockd.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_blockd.h
index 1234d54c7f0..018a9c2b975 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_blockd.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_blockd.h
@@ -126,10 +126,10 @@ typedef struct {
int_mv ref_mvs[MAX_REF_FRAMES][MAX_MV_REF_CANDIDATES];
uint8_t mode_context[MAX_REF_FRAMES];
INTERP_FILTER interp_filter;
+
} MB_MODE_INFO;
typedef struct MODE_INFO {
- struct MODE_INFO *src_mi;
MB_MODE_INFO mbmi;
b_mode_info bmi[4];
} MODE_INFO;
@@ -190,7 +190,11 @@ typedef struct macroblockd {
int mi_stride;
- MODE_INFO *mi;
+ MODE_INFO **mi;
+ MODE_INFO *left_mi;
+ MODE_INFO *above_mi;
+ MB_MODE_INFO *left_mbmi;
+ MB_MODE_INFO *above_mbmi;
int up_available;
int left_available;
@@ -207,6 +211,12 @@ typedef struct macroblockd {
/* pointer to current frame */
const YV12_BUFFER_CONFIG *cur_buf;
+ ENTROPY_CONTEXT *above_context[MAX_MB_PLANE];
+ ENTROPY_CONTEXT left_context[MAX_MB_PLANE][16];
+
+ PARTITION_CONTEXT *above_seg_context;
+ PARTITION_CONTEXT left_seg_context[8];
+
/* mc buffer */
DECLARE_ALIGNED(16, uint8_t, mc_buf[80 * 2 * 80 * 2]);
@@ -216,17 +226,13 @@ typedef struct macroblockd {
DECLARE_ALIGNED(16, uint16_t, mc_buf_high[80 * 2 * 80 * 2]);
#endif
- int lossless;
+ /* dqcoeff is shared by all the planes, so planes must be decoded serially */
+ DECLARE_ALIGNED(16, tran_low_t, dqcoeff[64 * 64]);
+ int lossless;
int corrupted;
- DECLARE_ALIGNED(16, tran_low_t, dqcoeff[MAX_MB_PLANE][64 * 64]);
-
- ENTROPY_CONTEXT *above_context[MAX_MB_PLANE];
- ENTROPY_CONTEXT left_context[MAX_MB_PLANE][16];
-
- PARTITION_CONTEXT *above_seg_context;
- PARTITION_CONTEXT left_seg_context[8];
+ struct vpx_internal_error_info *error_info;
} MACROBLOCKD;
static INLINE BLOCK_SIZE get_subsize(BLOCK_SIZE bsize,
@@ -238,16 +244,17 @@ extern const TX_TYPE intra_mode_to_tx_type_lookup[INTRA_MODES];
static INLINE TX_TYPE get_tx_type(PLANE_TYPE plane_type,
const MACROBLOCKD *xd) {
- const MB_MODE_INFO *const mbmi = &xd->mi[0].src_mi->mbmi;
+ const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
- if (plane_type != PLANE_TYPE_Y || is_inter_block(mbmi))
+ if (plane_type != PLANE_TYPE_Y || xd->lossless || is_inter_block(mbmi))
return DCT_DCT;
+
return intra_mode_to_tx_type_lookup[mbmi->mode];
}
static INLINE TX_TYPE get_tx_type_4x4(PLANE_TYPE plane_type,
const MACROBLOCKD *xd, int ib) {
- const MODE_INFO *const mi = xd->mi[0].src_mi;
+ const MODE_INFO *const mi = xd->mi[0];
if (plane_type != PLANE_TYPE_Y || xd->lossless || is_inter_block(&mi->mbmi))
return DCT_DCT;
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_common.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_common.h
index 6801dd3a2b6..d06b8e0405e 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_common.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_common.h
@@ -36,17 +36,17 @@ extern "C" {
// Only need this for fixed-size arrays, for structs just assign.
#define vp9_copy(dest, src) { \
assert(sizeof(dest) == sizeof(src)); \
- vpx_memcpy(dest, src, sizeof(src)); \
+ memcpy(dest, src, sizeof(src)); \
}
// Use this for variably-sized arrays.
#define vp9_copy_array(dest, src, n) { \
assert(sizeof(*dest) == sizeof(*src)); \
- vpx_memcpy(dest, src, n * sizeof(*src)); \
+ memcpy(dest, src, n * sizeof(*src)); \
}
-#define vp9_zero(dest) vpx_memset(&(dest), 0, sizeof(dest))
-#define vp9_zero_array(dest, n) vpx_memset(dest, 0, n * sizeof(*dest))
+#define vp9_zero(dest) memset(&(dest), 0, sizeof(dest))
+#define vp9_zero_array(dest, n) memset(dest, 0, n * sizeof(*dest))
static INLINE uint8_t clip_pixel(int val) {
return (val > 255) ? 255 : (val < 0) ? 0 : val;
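
The vp9_copy/vp9_zero macros above now map straight onto memcpy/memset; note the sizeof-based guard only works for true arrays, not pointers. A small usage sketch (standalone, with the two macros repeated from the diff):

#include <assert.h>
#include <string.h>

#define vp9_copy(dest, src) {            \
  assert(sizeof(dest) == sizeof(src));   \
  memcpy(dest, src, sizeof(src));        \
}
#define vp9_zero(dest) memset(&(dest), 0, sizeof(dest))

int main(void) {
  int a[4] = {1, 2, 3, 4}, b[4];
  vp9_copy(b, a);  /* arrays: sizeof gives the full 16 bytes */
  vp9_zero(a);     /* zeroes the whole array, not just a pointer */
  return b[3] - 4; /* 0 */
}
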
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_convolve.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_convolve.c
index 7b65651ba88..90e337fd66e 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_convolve.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_convolve.c
@@ -236,7 +236,7 @@ void vp9_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride,
const int16_t *filter_y, int y_step_q4,
int w, int h) {
/* Fixed size intermediate buffer places limits on parameters. */
- DECLARE_ALIGNED_ARRAY(16, uint8_t, temp, 64 * 64);
+ DECLARE_ALIGNED(16, uint8_t, temp[64 * 64]);
assert(w <= 64);
assert(h <= 64);
@@ -256,7 +256,7 @@ void vp9_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride,
(void)filter_y; (void)filter_y_stride;
for (r = h; r > 0; --r) {
- vpx_memcpy(dst, src, w);
+ memcpy(dst, src, w);
src += src_stride;
dst += dst_stride;
}
@@ -501,7 +501,7 @@ void vp9_highbd_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride,
const int16_t *filter_y, int y_step_q4,
int w, int h, int bd) {
// Fixed size intermediate buffer places limits on parameters.
- DECLARE_ALIGNED_ARRAY(16, uint16_t, temp, 64 * 64);
+ DECLARE_ALIGNED(16, uint16_t, temp[64 * 64]);
assert(w <= 64);
assert(h <= 64);
@@ -526,7 +526,7 @@ void vp9_highbd_convolve_copy_c(const uint8_t *src8, ptrdiff_t src_stride,
(void)bd;
for (r = h; r > 0; --r) {
- vpx_memcpy(dst, src, w * sizeof(uint16_t));
+ memcpy(dst, src, w * sizeof(uint16_t));
src += src_stride;
dst += dst_stride;
}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_debugmodes.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_debugmodes.c
index d9dace6ac8a..3d80103d21b 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_debugmodes.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_debugmodes.c
@@ -25,55 +25,65 @@ static void log_frame_info(VP9_COMMON *cm, const char *str, FILE *f) {
static void print_mi_data(VP9_COMMON *cm, FILE *file, const char *descriptor,
size_t member_offset) {
int mi_row, mi_col;
- int mi_index = 0;
- // TODO(hkuang): Fix this debug function.
- MODE_INFO **mi = &cm->mi;
+ MODE_INFO **mi = cm->mi_grid_visible;
int rows = cm->mi_rows;
int cols = cm->mi_cols;
char prefix = descriptor[0];
log_frame_info(cm, descriptor, file);
- mi_index = 0;
for (mi_row = 0; mi_row < rows; mi_row++) {
fprintf(file, "%c ", prefix);
for (mi_col = 0; mi_col < cols; mi_col++) {
fprintf(file, "%2d ",
- *((int*) ((char *) (&mi[mi_index]->mbmi) +
- member_offset)));
- mi_index++;
+ *((int*) ((char *) (&mi[0]->mbmi) +
+ member_offset)));
+ mi++;
}
fprintf(file, "\n");
- mi_index += 8;
+ mi += 8;
}
fprintf(file, "\n");
}
+
void vp9_print_modes_and_motion_vectors(VP9_COMMON *cm, const char *file) {
int mi_row;
int mi_col;
- int mi_index = 0;
FILE *mvs = fopen(file, "a");
- // TODO(hkuang): Fix this debug function.
- MODE_INFO **mi = &cm->mi;
+ MODE_INFO **mi = cm->mi_grid_visible;
int rows = cm->mi_rows;
int cols = cm->mi_cols;
print_mi_data(cm, mvs, "Partitions:", offsetof(MB_MODE_INFO, sb_type));
print_mi_data(cm, mvs, "Modes:", offsetof(MB_MODE_INFO, mode));
- print_mi_data(cm, mvs, "Skips:", offsetof(MB_MODE_INFO, skip));
print_mi_data(cm, mvs, "Ref frame:", offsetof(MB_MODE_INFO, ref_frame[0]));
print_mi_data(cm, mvs, "Transform:", offsetof(MB_MODE_INFO, tx_size));
print_mi_data(cm, mvs, "UV Modes:", offsetof(MB_MODE_INFO, uv_mode));
+ // output skip information.
+ log_frame_info(cm, "Skips:", mvs);
+ for (mi_row = 0; mi_row < rows; mi_row++) {
+ fprintf(mvs, "S ");
+ for (mi_col = 0; mi_col < cols; mi_col++) {
+ fprintf(mvs, "%2d ", mi[0]->mbmi.skip);
+ mi++;
+ }
+ fprintf(mvs, "\n");
+ mi += 8;
+ }
+ fprintf(mvs, "\n");
+
+ // output motion vectors.
log_frame_info(cm, "Vectors ", mvs);
+ mi = cm->mi_grid_visible;
for (mi_row = 0; mi_row < rows; mi_row++) {
fprintf(mvs, "V ");
for (mi_col = 0; mi_col < cols; mi_col++) {
- fprintf(mvs, "%4d:%4d ", mi[mi_index]->mbmi.mv[0].as_mv.row,
- mi[mi_index]->mbmi.mv[0].as_mv.col);
- mi_index++;
+ fprintf(mvs, "%4d:%4d ", mi[0]->mbmi.mv[0].as_mv.row,
+ mi[0]->mbmi.mv[0].as_mv.col);
+ mi++;
}
fprintf(mvs, "\n");
- mi_index += 8;
+ mi += 8;
}
fprintf(mvs, "\n");
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_entropy.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_entropy.c
index c3fdeb48a6c..a2584e8da5b 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_entropy.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_entropy.c
@@ -15,6 +15,18 @@
#include "vpx_mem/vpx_mem.h"
#include "vpx/vpx_integer.h"
+// Unconstrained Node Tree
+const vp9_tree_index vp9_coef_con_tree[TREE_SIZE(ENTROPY_TOKENS)] = {
+ 2, 6, // 0 = LOW_VAL
+ -TWO_TOKEN, 4, // 1 = TWO
+ -THREE_TOKEN, -FOUR_TOKEN, // 2 = THREE
+ 8, 10, // 3 = HIGH_LOW
+ -CATEGORY1_TOKEN, -CATEGORY2_TOKEN, // 4 = CAT_ONE
+ 12, 14, // 5 = CAT_THREEFOUR
+ -CATEGORY3_TOKEN, -CATEGORY4_TOKEN, // 6 = CAT_THREE
+ -CATEGORY5_TOKEN, -CATEGORY6_TOKEN // 7 = CAT_FIVE
+};
+
const vp9_prob vp9_cat1_prob[] = { 159 };
const vp9_prob vp9_cat2_prob[] = { 165, 145 };
const vp9_prob vp9_cat3_prob[] = { 173, 148, 140 };
@@ -737,21 +749,21 @@ static const vp9_coeff_probs_model default_coef_probs_32x32[PLANE_TYPES] = {
};
static void extend_to_full_distribution(vp9_prob *probs, vp9_prob p) {
- vpx_memcpy(probs, vp9_pareto8_full[p = 0 ? 0 : p - 1],
- MODEL_NODES * sizeof(vp9_prob));
+ memcpy(probs, vp9_pareto8_full[p == 0 ? 0 : p - 1],
+ MODEL_NODES * sizeof(vp9_prob));
}
void vp9_model_to_full_probs(const vp9_prob *model, vp9_prob *full) {
if (full != model)
- vpx_memcpy(full, model, sizeof(vp9_prob) * UNCONSTRAINED_NODES);
+ memcpy(full, model, sizeof(vp9_prob) * UNCONSTRAINED_NODES);
extend_to_full_distribution(&full[UNCONSTRAINED_NODES], model[PIVOT_NODE]);
}
void vp9_default_coef_probs(VP9_COMMON *cm) {
- vp9_copy(cm->fc.coef_probs[TX_4X4], default_coef_probs_4x4);
- vp9_copy(cm->fc.coef_probs[TX_8X8], default_coef_probs_8x8);
- vp9_copy(cm->fc.coef_probs[TX_16X16], default_coef_probs_16x16);
- vp9_copy(cm->fc.coef_probs[TX_32X32], default_coef_probs_32x32);
+ vp9_copy(cm->fc->coef_probs[TX_4X4], default_coef_probs_4x4);
+ vp9_copy(cm->fc->coef_probs[TX_8X8], default_coef_probs_8x8);
+ vp9_copy(cm->fc->coef_probs[TX_16X16], default_coef_probs_16x16);
+ vp9_copy(cm->fc->coef_probs[TX_32X32], default_coef_probs_32x32);
}
#define COEF_COUNT_SAT 24
@@ -765,7 +777,7 @@ static void adapt_coef_probs(VP9_COMMON *cm, TX_SIZE tx_size,
unsigned int count_sat,
unsigned int update_factor) {
const FRAME_CONTEXT *pre_fc = &cm->frame_contexts[cm->frame_context_idx];
- vp9_coeff_probs_model *const probs = cm->fc.coef_probs[tx_size];
+ vp9_coeff_probs_model *const probs = cm->fc->coef_probs[tx_size];
const vp9_coeff_probs_model *const pre_probs = pre_fc->coef_probs[tx_size];
vp9_coeff_count_model *counts = cm->counts.coef[tx_size];
unsigned int (*eob_counts)[REF_TYPES][COEF_BANDS][COEFF_CONTEXTS] =
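
On the new vp9_coef_con_tree table above: vp9_tree_index arrays encode a binary tree in which a positive entry is the offset of the next node pair and a non-positive entry is a leaf storing the negated token. A hedged sketch of the usual walk (the bit reader is abstracted here; this is not the literal libvpx decoder loop):

typedef signed char my_tree_index;  /* stand-in for vp9_tree_index */

/* Walk the tree one decoded bit at a time until a leaf is reached.
 * read_bit/ctx are assumed; leaves hold negated token values. */
static int read_token(const my_tree_index *tree,
                      int (*read_bit)(void *ctx), void *ctx) {
  my_tree_index i = 0;
  while ((i = tree[i + read_bit(ctx)]) > 0)
    continue;
  return -i;
}
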
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_entropy.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_entropy.h
index 239c0494c63..5a9007b5417 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_entropy.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_entropy.h
@@ -81,6 +81,7 @@ typedef struct {
const vp9_prob *prob;
int len;
int base_val;
+ const int16_t *cost;
} vp9_extra_bit;
// indexed by token value
@@ -141,10 +142,10 @@ static INLINE void reset_skip_context(MACROBLOCKD *xd, BLOCK_SIZE bsize) {
for (i = 0; i < MAX_MB_PLANE; i++) {
struct macroblockd_plane *const pd = &xd->plane[i];
const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
- vpx_memset(pd->above_context, 0, sizeof(ENTROPY_CONTEXT) *
- num_4x4_blocks_wide_lookup[plane_bsize]);
- vpx_memset(pd->left_context, 0, sizeof(ENTROPY_CONTEXT) *
- num_4x4_blocks_high_lookup[plane_bsize]);
+ memset(pd->above_context, 0,
+ sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_wide_lookup[plane_bsize]);
+ memset(pd->left_context, 0,
+ sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_high_lookup[plane_bsize]);
}
}
@@ -172,6 +173,7 @@ static INLINE const uint8_t *get_band_translate(TX_SIZE tx_size) {
#define PIVOT_NODE 2 // which node is pivot
#define MODEL_NODES (ENTROPY_NODES - UNCONSTRAINED_NODES)
+extern const vp9_tree_index vp9_coef_con_tree[TREE_SIZE(ENTROPY_TOKENS)];
extern const vp9_prob vp9_pareto8_full[COEFF_PROB_MODELS][MODEL_NODES];
typedef vp9_prob vp9_coeff_probs_model[REF_TYPES][COEF_BANDS]
@@ -214,7 +216,7 @@ static INLINE int get_entropy_context(TX_SIZE tx_size, const ENTROPY_CONTEXT *a,
static INLINE const scan_order *get_scan(const MACROBLOCKD *xd, TX_SIZE tx_size,
PLANE_TYPE type, int block_idx) {
- const MODE_INFO *const mi = xd->mi[0].src_mi;
+ const MODE_INFO *const mi = xd->mi[0];
if (is_inter_block(&mi->mbmi) || type != PLANE_TYPE_Y || xd->lossless) {
return &vp9_default_scan_orders[tx_size];
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_entropymode.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_entropymode.c
index 5b00b0082a1..424451fee39 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_entropymode.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_entropymode.c
@@ -334,60 +334,48 @@ const vp9_tree_index vp9_switchable_interp_tree
-EIGHTTAP_SMOOTH, -EIGHTTAP_SHARP
};
-#define COUNT_SAT 20
-#define MAX_UPDATE_FACTOR 128
-
-static int adapt_prob(vp9_prob pre_prob, const unsigned int ct[2]) {
- return merge_probs(pre_prob, ct, COUNT_SAT, MAX_UPDATE_FACTOR);
-}
-
-static void adapt_probs(const vp9_tree_index *tree,
- const vp9_prob *pre_probs, const unsigned int *counts,
- vp9_prob *probs) {
- vp9_tree_merge_probs(tree, pre_probs, counts, COUNT_SAT, MAX_UPDATE_FACTOR,
- probs);
-}
-
void vp9_adapt_mode_probs(VP9_COMMON *cm) {
int i, j;
- FRAME_CONTEXT *fc = &cm->fc;
+ FRAME_CONTEXT *fc = cm->fc;
const FRAME_CONTEXT *pre_fc = &cm->frame_contexts[cm->frame_context_idx];
const FRAME_COUNTS *counts = &cm->counts;
for (i = 0; i < INTRA_INTER_CONTEXTS; i++)
- fc->intra_inter_prob[i] = adapt_prob(pre_fc->intra_inter_prob[i],
- counts->intra_inter[i]);
+ fc->intra_inter_prob[i] = mode_mv_merge_probs(pre_fc->intra_inter_prob[i],
+ counts->intra_inter[i]);
for (i = 0; i < COMP_INTER_CONTEXTS; i++)
- fc->comp_inter_prob[i] = adapt_prob(pre_fc->comp_inter_prob[i],
- counts->comp_inter[i]);
+ fc->comp_inter_prob[i] = mode_mv_merge_probs(pre_fc->comp_inter_prob[i],
+ counts->comp_inter[i]);
for (i = 0; i < REF_CONTEXTS; i++)
- fc->comp_ref_prob[i] = adapt_prob(pre_fc->comp_ref_prob[i],
- counts->comp_ref[i]);
+ fc->comp_ref_prob[i] = mode_mv_merge_probs(pre_fc->comp_ref_prob[i],
+ counts->comp_ref[i]);
for (i = 0; i < REF_CONTEXTS; i++)
for (j = 0; j < 2; j++)
- fc->single_ref_prob[i][j] = adapt_prob(pre_fc->single_ref_prob[i][j],
- counts->single_ref[i][j]);
+ fc->single_ref_prob[i][j] = mode_mv_merge_probs(
+ pre_fc->single_ref_prob[i][j], counts->single_ref[i][j]);
for (i = 0; i < INTER_MODE_CONTEXTS; i++)
- adapt_probs(vp9_inter_mode_tree, pre_fc->inter_mode_probs[i],
+ vp9_tree_merge_probs(vp9_inter_mode_tree, pre_fc->inter_mode_probs[i],
counts->inter_mode[i], fc->inter_mode_probs[i]);
for (i = 0; i < BLOCK_SIZE_GROUPS; i++)
- adapt_probs(vp9_intra_mode_tree, pre_fc->y_mode_prob[i],
+ vp9_tree_merge_probs(vp9_intra_mode_tree, pre_fc->y_mode_prob[i],
counts->y_mode[i], fc->y_mode_prob[i]);
for (i = 0; i < INTRA_MODES; ++i)
- adapt_probs(vp9_intra_mode_tree, pre_fc->uv_mode_prob[i],
- counts->uv_mode[i], fc->uv_mode_prob[i]);
+ vp9_tree_merge_probs(vp9_intra_mode_tree, pre_fc->uv_mode_prob[i],
+ counts->uv_mode[i], fc->uv_mode_prob[i]);
for (i = 0; i < PARTITION_CONTEXTS; i++)
- adapt_probs(vp9_partition_tree, pre_fc->partition_prob[i],
- counts->partition[i], fc->partition_prob[i]);
+ vp9_tree_merge_probs(vp9_partition_tree, pre_fc->partition_prob[i],
+ counts->partition[i], fc->partition_prob[i]);
if (cm->interp_filter == SWITCHABLE) {
for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
- adapt_probs(vp9_switchable_interp_tree, pre_fc->switchable_interp_prob[i],
- counts->switchable_interp[i], fc->switchable_interp_prob[i]);
+ vp9_tree_merge_probs(vp9_switchable_interp_tree,
+ pre_fc->switchable_interp_prob[i],
+ counts->switchable_interp[i],
+ fc->switchable_interp_prob[i]);
}
if (cm->tx_mode == TX_MODE_SELECT) {
@@ -399,23 +387,24 @@ void vp9_adapt_mode_probs(VP9_COMMON *cm) {
for (i = 0; i < TX_SIZE_CONTEXTS; ++i) {
tx_counts_to_branch_counts_8x8(counts->tx.p8x8[i], branch_ct_8x8p);
for (j = 0; j < TX_SIZES - 3; ++j)
- fc->tx_probs.p8x8[i][j] = adapt_prob(pre_fc->tx_probs.p8x8[i][j],
- branch_ct_8x8p[j]);
+ fc->tx_probs.p8x8[i][j] = mode_mv_merge_probs(
+ pre_fc->tx_probs.p8x8[i][j], branch_ct_8x8p[j]);
tx_counts_to_branch_counts_16x16(counts->tx.p16x16[i], branch_ct_16x16p);
for (j = 0; j < TX_SIZES - 2; ++j)
- fc->tx_probs.p16x16[i][j] = adapt_prob(pre_fc->tx_probs.p16x16[i][j],
- branch_ct_16x16p[j]);
+ fc->tx_probs.p16x16[i][j] = mode_mv_merge_probs(
+ pre_fc->tx_probs.p16x16[i][j], branch_ct_16x16p[j]);
tx_counts_to_branch_counts_32x32(counts->tx.p32x32[i], branch_ct_32x32p);
for (j = 0; j < TX_SIZES - 1; ++j)
- fc->tx_probs.p32x32[i][j] = adapt_prob(pre_fc->tx_probs.p32x32[i][j],
- branch_ct_32x32p[j]);
+ fc->tx_probs.p32x32[i][j] = mode_mv_merge_probs(
+ pre_fc->tx_probs.p32x32[i][j], branch_ct_32x32p[j]);
}
}
for (i = 0; i < SKIP_CONTEXTS; ++i)
- fc->skip_probs[i] = adapt_prob(pre_fc->skip_probs[i], counts->skip[i]);
+ fc->skip_probs[i] = mode_mv_merge_probs(
+ pre_fc->skip_probs[i], counts->skip[i]);
}
static void set_default_lf_deltas(struct loopfilter *lf) {
@@ -439,8 +428,12 @@ void vp9_setup_past_independence(VP9_COMMON *cm) {
int i;
vp9_clearall_segfeatures(&cm->seg);
cm->seg.abs_delta = SEGMENT_DELTADATA;
- if (cm->last_frame_seg_map)
- vpx_memset(cm->last_frame_seg_map, 0, (cm->mi_rows * cm->mi_cols));
+
+ if (cm->last_frame_seg_map && !cm->frame_parallel_decode)
+ memset(cm->last_frame_seg_map, 0, (cm->mi_rows * cm->mi_cols));
+
+ if (cm->current_frame_seg_map)
+ memset(cm->current_frame_seg_map, 0, (cm->mi_rows * cm->mi_cols));
// Reset the mode ref deltas for loop filter
vp9_zero(lf->last_ref_deltas);
@@ -451,24 +444,24 @@ void vp9_setup_past_independence(VP9_COMMON *cm) {
lf->last_sharpness_level = -1;
vp9_default_coef_probs(cm);
- vp9_init_mode_probs(&cm->fc);
+ vp9_init_mode_probs(cm->fc);
vp9_init_mv_probs(cm);
+ cm->fc->initialized = 1;
if (cm->frame_type == KEY_FRAME ||
cm->error_resilient_mode || cm->reset_frame_context == 3) {
// Reset all frame contexts.
for (i = 0; i < FRAME_CONTEXTS; ++i)
- cm->frame_contexts[i] = cm->fc;
+ cm->frame_contexts[i] = *cm->fc;
} else if (cm->reset_frame_context == 2) {
// Reset only the frame context specified in the frame header.
- cm->frame_contexts[cm->frame_context_idx] = cm->fc;
+ cm->frame_contexts[cm->frame_context_idx] = *cm->fc;
}
- if (frame_is_intra_only(cm))
- vpx_memset(cm->prev_mip, 0, cm->mi_stride * (cm->mi_rows + 1) *
- sizeof(*cm->prev_mip));
-
- vpx_memset(cm->mip, 0, cm->mi_stride * (cm->mi_rows + 1) * sizeof(*cm->mip));
+ // prev_mip will only be allocated in the encoder.
+ if (frame_is_intra_only(cm) && cm->prev_mip && !cm->frame_parallel_decode)
+ memset(cm->prev_mip, 0,
+ cm->mi_stride * (cm->mi_rows + 1) * sizeof(*cm->prev_mip));
vp9_zero(cm->ref_frame_sign_bias);
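
The adapt_prob/adapt_probs wrappers removed above were thin aliases for merge_probs with COUNT_SAT 20 and MAX_UPDATE_FACTOR 128; the diff switches to the shared mode_mv_merge_probs/vp9_tree_merge_probs helpers with the same constants. A hedged standalone sketch of the merge step (helper shapes assumed from libvpx conventions, not copied verbatim):

typedef unsigned char vp9_prob;

#define MODE_MV_COUNT_SAT 20
#define MODE_MV_MAX_UPDATE_FACTOR 128

/* Probability observed from the counts, rounded, clipped to [1, 255]. */
static vp9_prob get_prob(unsigned int num, unsigned int den) {
  unsigned int p;
  if (den == 0) return 128;
  p = (num * 256 + (den >> 1)) / den;
  return (vp9_prob)(p < 1 ? 1 : p > 255 ? 255 : p);
}

/* Blend the previous probability toward the observed one; the weight
 * saturates once ct[0] + ct[1] reaches MODE_MV_COUNT_SAT. */
static vp9_prob merge(vp9_prob pre, const unsigned int ct[2]) {
  const unsigned int total = ct[0] + ct[1];
  const unsigned int count =
      total < MODE_MV_COUNT_SAT ? total : MODE_MV_COUNT_SAT;
  const unsigned int factor =
      MODE_MV_MAX_UPDATE_FACTOR * count / MODE_MV_COUNT_SAT;
  const vp9_prob obs = get_prob(ct[0], total);
  return (vp9_prob)((pre * (256 - factor) + obs * factor + 128) >> 8);
}
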
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_entropymode.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_entropymode.h
index 6831d3f8738..f4e20e1af8b 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_entropymode.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_entropymode.h
@@ -33,6 +33,7 @@ struct tx_counts {
unsigned int p32x32[TX_SIZE_CONTEXTS][TX_SIZES];
unsigned int p16x16[TX_SIZE_CONTEXTS][TX_SIZES - 1];
unsigned int p8x8[TX_SIZE_CONTEXTS][TX_SIZES - 2];
+ unsigned int tx_totals[TX_SIZES];
};
typedef struct frame_contexts {
@@ -50,9 +51,10 @@ typedef struct frame_contexts {
struct tx_probs tx_probs;
vp9_prob skip_probs[SKIP_CONTEXTS];
nmv_context nmvc;
+ int initialized;
} FRAME_CONTEXT;
-typedef struct {
+typedef struct FRAME_COUNTS {
unsigned int y_mode[BLOCK_SIZE_GROUPS][INTRA_MODES];
unsigned int uv_mode[INTRA_MODES][INTRA_MODES];
unsigned int partition[PARTITION_CONTEXTS][PARTITION_TYPES];
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_entropymv.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_entropymv.c
index 5bb048202b7..2477e6ef326 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_entropymv.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_entropymv.c
@@ -11,9 +11,6 @@
#include "vp9/common/vp9_onyxc_int.h"
#include "vp9/common/vp9_entropymv.h"
-#define MV_COUNT_SAT 20
-#define MV_MAX_UPDATE_FACTOR 128
-
// Integer pel reference mv threshold for use of high-precision 1/8 mv
#define COMPANDED_MVREF_THRESH 8
@@ -183,51 +180,43 @@ void vp9_inc_mv(const MV *mv, nmv_context_counts *counts) {
}
}
-static vp9_prob adapt_prob(vp9_prob prep, const unsigned int ct[2]) {
- return merge_probs(prep, ct, MV_COUNT_SAT, MV_MAX_UPDATE_FACTOR);
-}
-
-static void adapt_probs(const vp9_tree_index *tree, const vp9_prob *pre_probs,
- const unsigned int *counts, vp9_prob *probs) {
- vp9_tree_merge_probs(tree, pre_probs, counts, MV_COUNT_SAT,
- MV_MAX_UPDATE_FACTOR, probs);
-}
-
void vp9_adapt_mv_probs(VP9_COMMON *cm, int allow_hp) {
int i, j;
- nmv_context *fc = &cm->fc.nmvc;
+ nmv_context *fc = &cm->fc->nmvc;
const nmv_context *pre_fc = &cm->frame_contexts[cm->frame_context_idx].nmvc;
const nmv_context_counts *counts = &cm->counts.mv;
- adapt_probs(vp9_mv_joint_tree, pre_fc->joints, counts->joints, fc->joints);
+ vp9_tree_merge_probs(vp9_mv_joint_tree, pre_fc->joints, counts->joints,
+ fc->joints);
for (i = 0; i < 2; ++i) {
nmv_component *comp = &fc->comps[i];
const nmv_component *pre_comp = &pre_fc->comps[i];
const nmv_component_counts *c = &counts->comps[i];
- comp->sign = adapt_prob(pre_comp->sign, c->sign);
- adapt_probs(vp9_mv_class_tree, pre_comp->classes, c->classes,
- comp->classes);
- adapt_probs(vp9_mv_class0_tree, pre_comp->class0, c->class0, comp->class0);
+ comp->sign = mode_mv_merge_probs(pre_comp->sign, c->sign);
+ vp9_tree_merge_probs(vp9_mv_class_tree, pre_comp->classes, c->classes,
+ comp->classes);
+ vp9_tree_merge_probs(vp9_mv_class0_tree, pre_comp->class0, c->class0,
+ comp->class0);
for (j = 0; j < MV_OFFSET_BITS; ++j)
- comp->bits[j] = adapt_prob(pre_comp->bits[j], c->bits[j]);
+ comp->bits[j] = mode_mv_merge_probs(pre_comp->bits[j], c->bits[j]);
for (j = 0; j < CLASS0_SIZE; ++j)
- adapt_probs(vp9_mv_fp_tree, pre_comp->class0_fp[j], c->class0_fp[j],
- comp->class0_fp[j]);
+ vp9_tree_merge_probs(vp9_mv_fp_tree, pre_comp->class0_fp[j],
+ c->class0_fp[j], comp->class0_fp[j]);
- adapt_probs(vp9_mv_fp_tree, pre_comp->fp, c->fp, comp->fp);
+ vp9_tree_merge_probs(vp9_mv_fp_tree, pre_comp->fp, c->fp, comp->fp);
if (allow_hp) {
- comp->class0_hp = adapt_prob(pre_comp->class0_hp, c->class0_hp);
- comp->hp = adapt_prob(pre_comp->hp, c->hp);
+ comp->class0_hp = mode_mv_merge_probs(pre_comp->class0_hp, c->class0_hp);
+ comp->hp = mode_mv_merge_probs(pre_comp->hp, c->hp);
}
}
}
void vp9_init_mv_probs(VP9_COMMON *cm) {
- cm->fc.nmvc = default_nmv_context;
+ cm->fc->nmvc = default_nmv_context;
}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_enums.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_enums.h
index f83d21fe3a3..7938fc10a11 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_enums.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_enums.h
@@ -99,17 +99,6 @@ typedef enum {
} TX_TYPE;
typedef enum {
- UNKNOWN = 0,
- BT_601 = 1, // YUV
- BT_709 = 2, // YUV
- SMPTE_170 = 3, // YUV
- SMPTE_240 = 4, // YUV
- RESERVED_1 = 5,
- RESERVED_2 = 6,
- SRGB = 7 // RGB
-} COLOR_SPACE;
-
-typedef enum {
VP9_LAST_FLAG = 1 << 0,
VP9_GOLD_FLAG = 1 << 1,
VP9_ALT_FLAG = 1 << 2,
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_filter.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_filter.h
index 3377d45fc0e..d963ee23569 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_filter.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_filter.h
@@ -31,17 +31,14 @@ typedef enum {
EIGHTTAP = 0,
EIGHTTAP_SMOOTH = 1,
EIGHTTAP_SHARP = 2,
+ SWITCHABLE_FILTERS = 3, /* Number of switchable filters */
BILINEAR = 3,
+ // The codec can operate in four possible inter prediction filter modes:
+ // 8-tap, 8-tap-smooth, 8-tap-sharp, and switching between the three.
+ SWITCHABLE_FILTER_CONTEXTS = SWITCHABLE_FILTERS + 1,
SWITCHABLE = 4 /* should be the last one */
} INTERP_FILTER;
-// Number of switchable filters
-#define SWITCHABLE_FILTERS 3
-
-// The codec can operate in four possible inter prediction filter mode:
-// 8-tap, 8-tap-smooth, 8-tap-sharp, and switching between the three.
-#define SWITCHABLE_FILTER_CONTEXTS (SWITCHABLE_FILTERS + 1)
-
typedef int16_t InterpKernel[SUBPEL_TAPS];
const InterpKernel *vp9_get_interp_kernel(INTERP_FILTER filter);
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_frame_buffers.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_frame_buffers.c
index 34795b74ec2..0f41d66985f 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_frame_buffers.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_frame_buffers.c
@@ -64,7 +64,7 @@ int vp9_get_frame_buffer(void *cb_priv, size_t min_size,
// This memset is needed to fix a valgrind error from the C loop filter
// due to accessing uninitialized memory in the frame border. It could be
// removed if the border is totally removed.
- vpx_memset(int_fb_list->int_fb[i].data, 0, min_size);
+ memset(int_fb_list->int_fb[i].data, 0, min_size);
int_fb_list->int_fb[i].size = min_size;
}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_idct.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_idct.c
index d5b6f39b3cd..3b214371c47 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_idct.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_idct.c
@@ -11,39 +11,9 @@
#include <math.h>
#include "./vp9_rtcd.h"
-#include "vp9/common/vp9_systemdependent.h"
#include "vp9/common/vp9_blockd.h"
#include "vp9/common/vp9_idct.h"
-
-#if CONFIG_EMULATE_HARDWARE
-// When CONFIG_EMULATE_HARDWARE is 1 the transform performs a
-// non-normative method to handle overflows. A stream that causes
-// overflows in the inverse transform is considered invalid in VP9,
-// and a hardware implementer is free to choose any reasonable
-// method to handle overflows. However to aid in hardware
-// verification they can use a specific implementation of the
-// WRAPLOW() macro below that is identical to their intended
-// hardware implementation (and also use configure options to trigger
-// the C-implementation of the transform).
-//
-// The particular WRAPLOW implementation below performs strict
-// overflow wrapping to match common hardware implementations.
-// bd of 8 uses trans_low with 16bits, need to remove 16bits
-// bd of 10 uses trans_low with 18bits, need to remove 14bits
-// bd of 12 uses trans_low with 20bits, need to remove 12bits
-// bd of x uses trans_low with 8+x bits, need to remove 24-x bits
-#define WRAPLOW(x, bd) ((((int32_t)(x)) << (24 - bd)) >> (24 - bd))
-#else
-#define WRAPLOW(x, bd) (x)
-#endif // CONFIG_EMULATE_HARDWARE
-
-#if CONFIG_VP9_HIGHBITDEPTH
-static INLINE uint16_t highbd_clip_pixel_add(uint16_t dest, tran_high_t trans,
- int bd) {
- trans = WRAPLOW(trans, bd);
- return clip_pixel_highbd(WRAPLOW(dest + trans, bd), bd);
-}
-#endif // CONFIG_VP9_HIGHBITDEPTH
+#include "vp9/common/vp9_systemdependent.h"
static INLINE uint8_t clip_pixel_add(uint8_t dest, tran_high_t trans) {
trans = WRAPLOW(trans, 8);
@@ -276,10 +246,10 @@ void vp9_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
static void iadst4(const tran_low_t *input, tran_low_t *output) {
tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
- tran_high_t x0 = input[0];
- tran_high_t x1 = input[1];
- tran_high_t x2 = input[2];
- tran_high_t x3 = input[3];
+ tran_low_t x0 = input[0];
+ tran_low_t x1 = input[1];
+ tran_low_t x2 = input[2];
+ tran_low_t x3 = input[3];
if (!(x0 | x1 | x2 | x3)) {
output[0] = output[1] = output[2] = output[3] = 0;
@@ -295,24 +265,19 @@ static void iadst4(const tran_low_t *input, tran_low_t *output) {
s6 = sinpi_4_9 * x3;
s7 = x0 - x2 + x3;
- x0 = s0 + s3 + s5;
- x1 = s1 - s4 - s6;
- x2 = sinpi_3_9 * s7;
- x3 = s2;
-
- s0 = x0 + x3;
- s1 = x1 + x3;
- s2 = x2;
- s3 = x0 + x1 - x3;
+ s0 = s0 + s3 + s5;
+ s1 = s1 - s4 - s6;
+ s3 = s2;
+ s2 = sinpi_3_9 * s7;
// 1-D transform scaling factor is sqrt(2).
// The overall dynamic range is 14b (input) + 14b (multiplication scaling)
// + 1b (addition) = 29b.
// Hence the output bit depth is 15b.
- output[0] = WRAPLOW(dct_const_round_shift(s0), 8);
- output[1] = WRAPLOW(dct_const_round_shift(s1), 8);
+ output[0] = WRAPLOW(dct_const_round_shift(s0 + s3), 8);
+ output[1] = WRAPLOW(dct_const_round_shift(s1 + s3), 8);
output[2] = WRAPLOW(dct_const_round_shift(s2), 8);
- output[3] = WRAPLOW(dct_const_round_shift(s3), 8);
+ output[3] = WRAPLOW(dct_const_round_shift(s0 + s1 - s3), 8);
}
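
For reference on the WRAPLOW(..., 8) calls in the rewritten iadst4 above: per the comment removed earlier in this file, the CONFIG_EMULATE_HARDWARE build wraps intermediates to 8 + bd bits with sign extension (and is the identity otherwise). A standalone worked example, assuming an arithmetic right shift on the target:

#include <stdint.h>
#include <stdio.h>

/* Emulate-hardware WRAPLOW: keep the low (8 + bd) bits, sign-extended.
 * Relies on arithmetic right shift of negative values (implementation-
 * defined in C, but universal on the targets libvpx supports). */
#define WRAPLOW(x, bd) ((((int32_t)(x)) << (24 - (bd))) >> (24 - (bd)))

int main(void) {
  printf("%d\n", WRAPLOW(32768, 8));   /* 16-bit wrap: -32768 */
  printf("%d\n", WRAPLOW(70000, 10));  /* fits in 18 bits: 70000 */
  return 0;
}
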
void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride,
@@ -367,14 +332,14 @@ static void iadst8(const tran_low_t *input, tran_low_t *output) {
}
// stage 1
- s0 = cospi_2_64 * x0 + cospi_30_64 * x1;
- s1 = cospi_30_64 * x0 - cospi_2_64 * x1;
- s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
- s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
- s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
- s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
- s6 = cospi_26_64 * x6 + cospi_6_64 * x7;
- s7 = cospi_6_64 * x6 - cospi_26_64 * x7;
+ s0 = (int)(cospi_2_64 * x0 + cospi_30_64 * x1);
+ s1 = (int)(cospi_30_64 * x0 - cospi_2_64 * x1);
+ s2 = (int)(cospi_10_64 * x2 + cospi_22_64 * x3);
+ s3 = (int)(cospi_22_64 * x2 - cospi_10_64 * x3);
+ s4 = (int)(cospi_18_64 * x4 + cospi_14_64 * x5);
+ s5 = (int)(cospi_14_64 * x4 - cospi_18_64 * x5);
+ s6 = (int)(cospi_26_64 * x6 + cospi_6_64 * x7);
+ s7 = (int)(cospi_6_64 * x6 - cospi_26_64 * x7);
x0 = WRAPLOW(dct_const_round_shift(s0 + s4), 8);
x1 = WRAPLOW(dct_const_round_shift(s1 + s5), 8);
@@ -386,14 +351,14 @@ static void iadst8(const tran_low_t *input, tran_low_t *output) {
x7 = WRAPLOW(dct_const_round_shift(s3 - s7), 8);
// stage 2
- s0 = x0;
- s1 = x1;
- s2 = x2;
- s3 = x3;
- s4 = cospi_8_64 * x4 + cospi_24_64 * x5;
- s5 = cospi_24_64 * x4 - cospi_8_64 * x5;
- s6 = -cospi_24_64 * x6 + cospi_8_64 * x7;
- s7 = cospi_8_64 * x6 + cospi_24_64 * x7;
+ s0 = (int)x0;
+ s1 = (int)x1;
+ s2 = (int)x2;
+ s3 = (int)x3;
+ s4 = (int)(cospi_8_64 * x4 + cospi_24_64 * x5);
+ s5 = (int)(cospi_24_64 * x4 - cospi_8_64 * x5);
+ s6 = (int)(-cospi_24_64 * x6 + cospi_8_64 * x7);
+ s7 = (int)(cospi_8_64 * x6 + cospi_24_64 * x7);
x0 = WRAPLOW(s0 + s2, 8);
x1 = WRAPLOW(s1 + s3, 8);
@@ -405,10 +370,10 @@ static void iadst8(const tran_low_t *input, tran_low_t *output) {
x7 = WRAPLOW(dct_const_round_shift(s5 - s7), 8);
// stage 3
- s2 = cospi_16_64 * (x2 + x3);
- s3 = cospi_16_64 * (x2 - x3);
- s6 = cospi_16_64 * (x6 + x7);
- s7 = cospi_16_64 * (x6 - x7);
+ s2 = (int)(cospi_16_64 * (x2 + x3));
+ s3 = (int)(cospi_16_64 * (x2 - x3));
+ s6 = (int)(cospi_16_64 * (x6 + x7));
+ s7 = (int)(cospi_16_64 * (x6 - x7));
x2 = WRAPLOW(dct_const_round_shift(s2), 8);
x3 = WRAPLOW(dct_const_round_shift(s3), 8);
@@ -1311,7 +1276,7 @@ void vp9_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest,
if (zero_coeff[0] | zero_coeff[1])
idct32(input, outptr);
else
- vpx_memset(outptr, 0, sizeof(tran_low_t) * 32);
+ memset(outptr, 0, sizeof(tran_low_t) * 32);
input += 32;
outptr += 32;
}
@@ -1545,19 +1510,19 @@ void vp9_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8,
}
}
-static void highbd_idct4(const tran_low_t *input, tran_low_t *output, int bd) {
+void vp9_highbd_idct4(const tran_low_t *input, tran_low_t *output, int bd) {
tran_low_t step[4];
tran_high_t temp1, temp2;
(void) bd;
// stage 1
temp1 = (input[0] + input[2]) * cospi_16_64;
temp2 = (input[0] - input[2]) * cospi_16_64;
- step[0] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step[1] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step[0] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step[1] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
- step[2] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step[3] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step[2] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step[3] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
// stage 2
output[0] = WRAPLOW(step[0] + step[3], bd);
@@ -1576,7 +1541,7 @@ void vp9_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
// Rows
for (i = 0; i < 4; ++i) {
- highbd_idct4(input, outptr, bd);
+ vp9_highbd_idct4(input, outptr, bd);
input += 4;
outptr += 4;
}
@@ -1585,7 +1550,7 @@ void vp9_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
for (i = 0; i < 4; ++i) {
for (j = 0; j < 4; ++j)
temp_in[j] = out[j * 4 + i];
- highbd_idct4(temp_in, temp_out, bd);
+ vp9_highbd_idct4(temp_in, temp_out, bd);
for (j = 0; j < 4; ++j) {
dest[j * stride + i] = highbd_clip_pixel_add(
dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
@@ -1597,10 +1562,11 @@ void vp9_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest8,
int dest_stride, int bd) {
int i;
tran_high_t a1;
- tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
+ tran_low_t out = WRAPLOW(
+ highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd);
uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
- out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
+ out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd);
a1 = ROUND_POWER_OF_TWO(out, 4);
for (i = 0; i < 4; i++) {
@@ -1612,7 +1578,7 @@ void vp9_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest8,
}
}
-static void highbd_idct8(const tran_low_t *input, tran_low_t *output, int bd) {
+void vp9_highbd_idct8(const tran_low_t *input, tran_low_t *output, int bd) {
tran_low_t step1[8], step2[8];
tran_high_t temp1, temp2;
// stage 1
@@ -1622,15 +1588,15 @@ static void highbd_idct8(const tran_low_t *input, tran_low_t *output, int bd) {
step1[3] = input[6];
temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
- step1[4] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[7] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[4] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[7] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
- step1[5] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[6] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
// stage 2 & stage 3 - even half
- highbd_idct4(step1, step1, bd);
+ vp9_highbd_idct4(step1, step1, bd);
// stage 2 - odd half
step2[4] = WRAPLOW(step1[4] + step1[5], bd);
@@ -1642,8 +1608,8 @@ static void highbd_idct8(const tran_low_t *input, tran_low_t *output, int bd) {
step1[4] = step2[4];
temp1 = (step2[6] - step2[5]) * cospi_16_64;
temp2 = (step2[5] + step2[6]) * cospi_16_64;
- step1[5] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[6] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
step1[7] = step2[7];
// stage 4
@@ -1667,7 +1633,7 @@ void vp9_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest8,
// First transform rows.
for (i = 0; i < 8; ++i) {
- highbd_idct8(input, outptr, bd);
+ vp9_highbd_idct8(input, outptr, bd);
input += 8;
outptr += 8;
}
@@ -1676,7 +1642,7 @@ void vp9_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest8,
for (i = 0; i < 8; ++i) {
for (j = 0; j < 8; ++j)
temp_in[j] = out[j * 8 + i];
- highbd_idct8(temp_in, temp_out, bd);
+ vp9_highbd_idct8(temp_in, temp_out, bd);
for (j = 0; j < 8; ++j) {
dest[j * stride + i] = highbd_clip_pixel_add(
dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
@@ -1688,9 +1654,10 @@ void vp9_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest8,
int stride, int bd) {
int i, j;
tran_high_t a1;
- tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
+ tran_low_t out = WRAPLOW(
+ highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd);
uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
- out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
+ out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd);
a1 = ROUND_POWER_OF_TWO(out, 5);
for (j = 0; j < 8; ++j) {
for (i = 0; i < 8; ++i)
@@ -1702,14 +1669,14 @@ void vp9_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest8,
static void highbd_iadst4(const tran_low_t *input, tran_low_t *output, int bd) {
tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
- tran_high_t x0 = input[0];
- tran_high_t x1 = input[1];
- tran_high_t x2 = input[2];
- tran_high_t x3 = input[3];
+ tran_low_t x0 = input[0];
+ tran_low_t x1 = input[1];
+ tran_low_t x2 = input[2];
+ tran_low_t x3 = input[3];
(void) bd;
if (!(x0 | x1 | x2 | x3)) {
- vpx_memset(output, 0, 4 * sizeof(*output));
+ memset(output, 0, 4 * sizeof(*output));
return;
}
@@ -1720,34 +1687,29 @@ static void highbd_iadst4(const tran_low_t *input, tran_low_t *output, int bd) {
s4 = sinpi_1_9 * x2;
s5 = sinpi_2_9 * x3;
s6 = sinpi_4_9 * x3;
- s7 = x0 - x2 + x3;
+ s7 = (tran_high_t)(x0 - x2 + x3);
- x0 = s0 + s3 + s5;
- x1 = s1 - s4 - s6;
- x2 = sinpi_3_9 * s7;
- x3 = s2;
-
- s0 = x0 + x3;
- s1 = x1 + x3;
- s2 = x2;
- s3 = x0 + x1 - x3;
+ s0 = s0 + s3 + s5;
+ s1 = s1 - s4 - s6;
+ s3 = s2;
+ s2 = sinpi_3_9 * s7;
// 1-D transform scaling factor is sqrt(2).
// The overall dynamic range is 14b (input) + 14b (multiplication scaling)
// + 1b (addition) = 29b.
// Hence the output bit depth is 15b.
- output[0] = WRAPLOW(dct_const_round_shift(s0), bd);
- output[1] = WRAPLOW(dct_const_round_shift(s1), bd);
- output[2] = WRAPLOW(dct_const_round_shift(s2), bd);
- output[3] = WRAPLOW(dct_const_round_shift(s3), bd);
+ output[0] = WRAPLOW(highbd_dct_const_round_shift(s0 + s3, bd), bd);
+ output[1] = WRAPLOW(highbd_dct_const_round_shift(s1 + s3, bd), bd);
+ output[2] = WRAPLOW(highbd_dct_const_round_shift(s2, bd), bd);
+ output[3] = WRAPLOW(highbd_dct_const_round_shift(s0 + s1 - s3, bd), bd);
}
void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
int stride, int tx_type, int bd) {
const highbd_transform_2d IHT_4[] = {
- { highbd_idct4, highbd_idct4 }, // DCT_DCT = 0
- { highbd_iadst4, highbd_idct4 }, // ADST_DCT = 1
- { highbd_idct4, highbd_iadst4 }, // DCT_ADST = 2
+ { vp9_highbd_idct4, vp9_highbd_idct4 }, // DCT_DCT = 0
+ { highbd_iadst4, vp9_highbd_idct4 }, // ADST_DCT = 1
+ { vp9_highbd_idct4, highbd_iadst4 }, // DCT_ADST = 2
{ highbd_iadst4, highbd_iadst4 } // ADST_ADST = 3
};
uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
@@ -1779,18 +1741,18 @@ void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
static void highbd_iadst8(const tran_low_t *input, tran_low_t *output, int bd) {
tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
- tran_high_t x0 = input[7];
- tran_high_t x1 = input[0];
- tran_high_t x2 = input[5];
- tran_high_t x3 = input[2];
- tran_high_t x4 = input[3];
- tran_high_t x5 = input[4];
- tran_high_t x6 = input[1];
- tran_high_t x7 = input[6];
+ tran_low_t x0 = input[7];
+ tran_low_t x1 = input[0];
+ tran_low_t x2 = input[5];
+ tran_low_t x3 = input[2];
+ tran_low_t x4 = input[3];
+ tran_low_t x5 = input[4];
+ tran_low_t x6 = input[1];
+ tran_low_t x7 = input[6];
(void) bd;
if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
- vpx_memset(output, 0, 8 * sizeof(*output));
+ memset(output, 0, 8 * sizeof(*output));
return;
}
@@ -1804,14 +1766,14 @@ static void highbd_iadst8(const tran_low_t *input, tran_low_t *output, int bd) {
s6 = cospi_26_64 * x6 + cospi_6_64 * x7;
s7 = cospi_6_64 * x6 - cospi_26_64 * x7;
- x0 = WRAPLOW(dct_const_round_shift(s0 + s4), bd);
- x1 = WRAPLOW(dct_const_round_shift(s1 + s5), bd);
- x2 = WRAPLOW(dct_const_round_shift(s2 + s6), bd);
- x3 = WRAPLOW(dct_const_round_shift(s3 + s7), bd);
- x4 = WRAPLOW(dct_const_round_shift(s0 - s4), bd);
- x5 = WRAPLOW(dct_const_round_shift(s1 - s5), bd);
- x6 = WRAPLOW(dct_const_round_shift(s2 - s6), bd);
- x7 = WRAPLOW(dct_const_round_shift(s3 - s7), bd);
+ x0 = WRAPLOW(highbd_dct_const_round_shift(s0 + s4, bd), bd);
+ x1 = WRAPLOW(highbd_dct_const_round_shift(s1 + s5, bd), bd);
+ x2 = WRAPLOW(highbd_dct_const_round_shift(s2 + s6, bd), bd);
+ x3 = WRAPLOW(highbd_dct_const_round_shift(s3 + s7, bd), bd);
+ x4 = WRAPLOW(highbd_dct_const_round_shift(s0 - s4, bd), bd);
+ x5 = WRAPLOW(highbd_dct_const_round_shift(s1 - s5, bd), bd);
+ x6 = WRAPLOW(highbd_dct_const_round_shift(s2 - s6, bd), bd);
+ x7 = WRAPLOW(highbd_dct_const_round_shift(s3 - s7, bd), bd);
// stage 2
s0 = x0;
@@ -1827,10 +1789,10 @@ static void highbd_iadst8(const tran_low_t *input, tran_low_t *output, int bd) {
x1 = WRAPLOW(s1 + s3, bd);
x2 = WRAPLOW(s0 - s2, bd);
x3 = WRAPLOW(s1 - s3, bd);
- x4 = WRAPLOW(dct_const_round_shift(s4 + s6), bd);
- x5 = WRAPLOW(dct_const_round_shift(s5 + s7), bd);
- x6 = WRAPLOW(dct_const_round_shift(s4 - s6), bd);
- x7 = WRAPLOW(dct_const_round_shift(s5 - s7), bd);
+ x4 = WRAPLOW(highbd_dct_const_round_shift(s4 + s6, bd), bd);
+ x5 = WRAPLOW(highbd_dct_const_round_shift(s5 + s7, bd), bd);
+ x6 = WRAPLOW(highbd_dct_const_round_shift(s4 - s6, bd), bd);
+ x7 = WRAPLOW(highbd_dct_const_round_shift(s5 - s7, bd), bd);
// stage 3
s2 = cospi_16_64 * (x2 + x3);
@@ -1838,10 +1800,10 @@ static void highbd_iadst8(const tran_low_t *input, tran_low_t *output, int bd) {
s6 = cospi_16_64 * (x6 + x7);
s7 = cospi_16_64 * (x6 - x7);
- x2 = WRAPLOW(dct_const_round_shift(s2), bd);
- x3 = WRAPLOW(dct_const_round_shift(s3), bd);
- x6 = WRAPLOW(dct_const_round_shift(s6), bd);
- x7 = WRAPLOW(dct_const_round_shift(s7), bd);
+ x2 = WRAPLOW(highbd_dct_const_round_shift(s2, bd), bd);
+ x3 = WRAPLOW(highbd_dct_const_round_shift(s3, bd), bd);
+ x6 = WRAPLOW(highbd_dct_const_round_shift(s6, bd), bd);
+ x7 = WRAPLOW(highbd_dct_const_round_shift(s7, bd), bd);
output[0] = WRAPLOW(x0, bd);
output[1] = WRAPLOW(-x4, bd);
@@ -1854,9 +1816,9 @@ static void highbd_iadst8(const tran_low_t *input, tran_low_t *output, int bd) {
}
static const highbd_transform_2d HIGH_IHT_8[] = {
- { highbd_idct8, highbd_idct8 }, // DCT_DCT = 0
- { highbd_iadst8, highbd_idct8 }, // ADST_DCT = 1
- { highbd_idct8, highbd_iadst8 }, // DCT_ADST = 2
+ { vp9_highbd_idct8, vp9_highbd_idct8 }, // DCT_DCT = 0
+ { highbd_iadst8, vp9_highbd_idct8 }, // ADST_DCT = 1
+ { vp9_highbd_idct8, highbd_iadst8 }, // DCT_ADST = 2
{ highbd_iadst8, highbd_iadst8 } // ADST_ADST = 3
};
@@ -1899,7 +1861,7 @@ void vp9_highbd_idct8x8_10_add_c(const tran_low_t *input, uint8_t *dest8,
// First transform rows.
// Only the first 4 rows have non-zero coefs.
for (i = 0; i < 4; ++i) {
- highbd_idct8(input, outptr, bd);
+ vp9_highbd_idct8(input, outptr, bd);
input += 8;
outptr += 8;
}
@@ -1907,7 +1869,7 @@ void vp9_highbd_idct8x8_10_add_c(const tran_low_t *input, uint8_t *dest8,
for (i = 0; i < 8; ++i) {
for (j = 0; j < 8; ++j)
temp_in[j] = out[j * 8 + i];
- highbd_idct8(temp_in, temp_out, bd);
+ vp9_highbd_idct8(temp_in, temp_out, bd);
for (j = 0; j < 8; ++j) {
dest[j * stride + i] = highbd_clip_pixel_add(
dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
@@ -1915,7 +1877,7 @@ void vp9_highbd_idct8x8_10_add_c(const tran_low_t *input, uint8_t *dest8,
}
}
-static void highbd_idct16(const tran_low_t *input, tran_low_t *output, int bd) {
+void vp9_highbd_idct16(const tran_low_t *input, tran_low_t *output, int bd) {
tran_low_t step1[16], step2[16];
tran_high_t temp1, temp2;
(void) bd;
@@ -1950,23 +1912,23 @@ static void highbd_idct16(const tran_low_t *input, tran_low_t *output, int bd) {
temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
- step2[8] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step2[15] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step2[8] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[15] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
- step2[9] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step2[14] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
- step2[10] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step2[13] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
- step2[11] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step2[12] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
// stage 3
step1[0] = step2[0];
@@ -1976,12 +1938,12 @@ static void highbd_idct16(const tran_low_t *input, tran_low_t *output, int bd) {
temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
- step1[4] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[7] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[4] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[7] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
- step1[5] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[6] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
step1[8] = WRAPLOW(step2[8] + step2[9], bd);
step1[9] = WRAPLOW(step2[8] - step2[9], bd);
@@ -1995,12 +1957,12 @@ static void highbd_idct16(const tran_low_t *input, tran_low_t *output, int bd) {
// stage 4
temp1 = (step1[0] + step1[1]) * cospi_16_64;
temp2 = (step1[0] - step1[1]) * cospi_16_64;
- step2[0] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step2[1] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step2[0] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[1] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
- step2[2] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step2[3] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step2[2] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[3] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
step2[4] = WRAPLOW(step1[4] + step1[5], bd);
step2[5] = WRAPLOW(step1[4] - step1[5], bd);
step2[6] = WRAPLOW(-step1[6] + step1[7], bd);
@@ -2010,12 +1972,12 @@ static void highbd_idct16(const tran_low_t *input, tran_low_t *output, int bd) {
step2[15] = step1[15];
temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
- step2[9] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step2[14] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
- step2[10] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step2[13] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
step2[11] = step1[11];
step2[12] = step1[12];
@@ -2027,8 +1989,8 @@ static void highbd_idct16(const tran_low_t *input, tran_low_t *output, int bd) {
step1[4] = step2[4];
temp1 = (step2[6] - step2[5]) * cospi_16_64;
temp2 = (step2[5] + step2[6]) * cospi_16_64;
- step1[5] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[6] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
step1[7] = step2[7];
step1[8] = WRAPLOW(step2[8] + step2[11], bd);
@@ -2053,12 +2015,12 @@ static void highbd_idct16(const tran_low_t *input, tran_low_t *output, int bd) {
step2[9] = step1[9];
temp1 = (-step1[10] + step1[13]) * cospi_16_64;
temp2 = (step1[10] + step1[13]) * cospi_16_64;
- step2[10] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step2[13] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = (-step1[11] + step1[12]) * cospi_16_64;
temp2 = (step1[11] + step1[12]) * cospi_16_64;
- step2[11] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step2[12] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
step2[14] = step1[14];
step2[15] = step1[15];
@@ -2091,7 +2053,7 @@ void vp9_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,
// First transform rows.
for (i = 0; i < 16; ++i) {
- highbd_idct16(input, outptr, bd);
+ vp9_highbd_idct16(input, outptr, bd);
input += 16;
outptr += 16;
}
@@ -2100,7 +2062,7 @@ void vp9_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,
for (i = 0; i < 16; ++i) {
for (j = 0; j < 16; ++j)
temp_in[j] = out[j * 16 + i];
- highbd_idct16(temp_in, temp_out, bd);
+ vp9_highbd_idct16(temp_in, temp_out, bd);
for (j = 0; j < 16; ++j) {
dest[j * stride + i] = highbd_clip_pixel_add(
dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
@@ -2113,27 +2075,27 @@ static void highbd_iadst16(const tran_low_t *input, tran_low_t *output,
tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
tran_high_t s9, s10, s11, s12, s13, s14, s15;
- tran_high_t x0 = input[15];
- tran_high_t x1 = input[0];
- tran_high_t x2 = input[13];
- tran_high_t x3 = input[2];
- tran_high_t x4 = input[11];
- tran_high_t x5 = input[4];
- tran_high_t x6 = input[9];
- tran_high_t x7 = input[6];
- tran_high_t x8 = input[7];
- tran_high_t x9 = input[8];
- tran_high_t x10 = input[5];
- tran_high_t x11 = input[10];
- tran_high_t x12 = input[3];
- tran_high_t x13 = input[12];
- tran_high_t x14 = input[1];
- tran_high_t x15 = input[14];
+ tran_low_t x0 = input[15];
+ tran_low_t x1 = input[0];
+ tran_low_t x2 = input[13];
+ tran_low_t x3 = input[2];
+ tran_low_t x4 = input[11];
+ tran_low_t x5 = input[4];
+ tran_low_t x6 = input[9];
+ tran_low_t x7 = input[6];
+ tran_low_t x8 = input[7];
+ tran_low_t x9 = input[8];
+ tran_low_t x10 = input[5];
+ tran_low_t x11 = input[10];
+ tran_low_t x12 = input[3];
+ tran_low_t x13 = input[12];
+ tran_low_t x14 = input[1];
+ tran_low_t x15 = input[14];
(void) bd;
if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8
| x9 | x10 | x11 | x12 | x13 | x14 | x15)) {
- vpx_memset(output, 0, 16 * sizeof(*output));
+ memset(output, 0, 16 * sizeof(*output));
return;
}
@@ -2155,22 +2117,22 @@ static void highbd_iadst16(const tran_low_t *input, tran_low_t *output,
s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
s15 = x14 * cospi_3_64 - x15 * cospi_29_64;
- x0 = WRAPLOW(dct_const_round_shift(s0 + s8), bd);
- x1 = WRAPLOW(dct_const_round_shift(s1 + s9), bd);
- x2 = WRAPLOW(dct_const_round_shift(s2 + s10), bd);
- x3 = WRAPLOW(dct_const_round_shift(s3 + s11), bd);
- x4 = WRAPLOW(dct_const_round_shift(s4 + s12), bd);
- x5 = WRAPLOW(dct_const_round_shift(s5 + s13), bd);
- x6 = WRAPLOW(dct_const_round_shift(s6 + s14), bd);
- x7 = WRAPLOW(dct_const_round_shift(s7 + s15), bd);
- x8 = WRAPLOW(dct_const_round_shift(s0 - s8), bd);
- x9 = WRAPLOW(dct_const_round_shift(s1 - s9), bd);
- x10 = WRAPLOW(dct_const_round_shift(s2 - s10), bd);
- x11 = WRAPLOW(dct_const_round_shift(s3 - s11), bd);
- x12 = WRAPLOW(dct_const_round_shift(s4 - s12), bd);
- x13 = WRAPLOW(dct_const_round_shift(s5 - s13), bd);
- x14 = WRAPLOW(dct_const_round_shift(s6 - s14), bd);
- x15 = WRAPLOW(dct_const_round_shift(s7 - s15), bd);
+ x0 = WRAPLOW(highbd_dct_const_round_shift(s0 + s8, bd), bd);
+ x1 = WRAPLOW(highbd_dct_const_round_shift(s1 + s9, bd), bd);
+ x2 = WRAPLOW(highbd_dct_const_round_shift(s2 + s10, bd), bd);
+ x3 = WRAPLOW(highbd_dct_const_round_shift(s3 + s11, bd), bd);
+ x4 = WRAPLOW(highbd_dct_const_round_shift(s4 + s12, bd), bd);
+ x5 = WRAPLOW(highbd_dct_const_round_shift(s5 + s13, bd), bd);
+ x6 = WRAPLOW(highbd_dct_const_round_shift(s6 + s14, bd), bd);
+ x7 = WRAPLOW(highbd_dct_const_round_shift(s7 + s15, bd), bd);
+ x8 = WRAPLOW(highbd_dct_const_round_shift(s0 - s8, bd), bd);
+ x9 = WRAPLOW(highbd_dct_const_round_shift(s1 - s9, bd), bd);
+ x10 = WRAPLOW(highbd_dct_const_round_shift(s2 - s10, bd), bd);
+ x11 = WRAPLOW(highbd_dct_const_round_shift(s3 - s11, bd), bd);
+ x12 = WRAPLOW(highbd_dct_const_round_shift(s4 - s12, bd), bd);
+ x13 = WRAPLOW(highbd_dct_const_round_shift(s5 - s13, bd), bd);
+ x14 = WRAPLOW(highbd_dct_const_round_shift(s6 - s14, bd), bd);
+ x15 = WRAPLOW(highbd_dct_const_round_shift(s7 - s15, bd), bd);
// stage 2
s0 = x0;
@@ -2198,14 +2160,14 @@ static void highbd_iadst16(const tran_low_t *input, tran_low_t *output,
x5 = WRAPLOW(s1 - s5, bd);
x6 = WRAPLOW(s2 - s6, bd);
x7 = WRAPLOW(s3 - s7, bd);
- x8 = WRAPLOW(dct_const_round_shift(s8 + s12), bd);
- x9 = WRAPLOW(dct_const_round_shift(s9 + s13), bd);
- x10 = WRAPLOW(dct_const_round_shift(s10 + s14), bd);
- x11 = WRAPLOW(dct_const_round_shift(s11 + s15), bd);
- x12 = WRAPLOW(dct_const_round_shift(s8 - s12), bd);
- x13 = WRAPLOW(dct_const_round_shift(s9 - s13), bd);
- x14 = WRAPLOW(dct_const_round_shift(s10 - s14), bd);
- x15 = WRAPLOW(dct_const_round_shift(s11 - s15), bd);
+ x8 = WRAPLOW(highbd_dct_const_round_shift(s8 + s12, bd), bd);
+ x9 = WRAPLOW(highbd_dct_const_round_shift(s9 + s13, bd), bd);
+ x10 = WRAPLOW(highbd_dct_const_round_shift(s10 + s14, bd), bd);
+ x11 = WRAPLOW(highbd_dct_const_round_shift(s11 + s15, bd), bd);
+ x12 = WRAPLOW(highbd_dct_const_round_shift(s8 - s12, bd), bd);
+ x13 = WRAPLOW(highbd_dct_const_round_shift(s9 - s13, bd), bd);
+ x14 = WRAPLOW(highbd_dct_const_round_shift(s10 - s14, bd), bd);
+ x15 = WRAPLOW(highbd_dct_const_round_shift(s11 - s15, bd), bd);
// stage 3
s0 = x0;
@@ -2229,18 +2191,18 @@ static void highbd_iadst16(const tran_low_t *input, tran_low_t *output,
x1 = WRAPLOW(s1 + s3, bd);
x2 = WRAPLOW(s0 - s2, bd);
x3 = WRAPLOW(s1 - s3, bd);
- x4 = WRAPLOW(dct_const_round_shift(s4 + s6), bd);
- x5 = WRAPLOW(dct_const_round_shift(s5 + s7), bd);
- x6 = WRAPLOW(dct_const_round_shift(s4 - s6), bd);
- x7 = WRAPLOW(dct_const_round_shift(s5 - s7), bd);
+ x4 = WRAPLOW(highbd_dct_const_round_shift(s4 + s6, bd), bd);
+ x5 = WRAPLOW(highbd_dct_const_round_shift(s5 + s7, bd), bd);
+ x6 = WRAPLOW(highbd_dct_const_round_shift(s4 - s6, bd), bd);
+ x7 = WRAPLOW(highbd_dct_const_round_shift(s5 - s7, bd), bd);
x8 = WRAPLOW(s8 + s10, bd);
x9 = WRAPLOW(s9 + s11, bd);
x10 = WRAPLOW(s8 - s10, bd);
x11 = WRAPLOW(s9 - s11, bd);
- x12 = WRAPLOW(dct_const_round_shift(s12 + s14), bd);
- x13 = WRAPLOW(dct_const_round_shift(s13 + s15), bd);
- x14 = WRAPLOW(dct_const_round_shift(s12 - s14), bd);
- x15 = WRAPLOW(dct_const_round_shift(s13 - s15), bd);
+ x12 = WRAPLOW(highbd_dct_const_round_shift(s12 + s14, bd), bd);
+ x13 = WRAPLOW(highbd_dct_const_round_shift(s13 + s15, bd), bd);
+ x14 = WRAPLOW(highbd_dct_const_round_shift(s12 - s14, bd), bd);
+ x15 = WRAPLOW(highbd_dct_const_round_shift(s13 - s15, bd), bd);
// stage 4
s2 = (- cospi_16_64) * (x2 + x3);
@@ -2252,14 +2214,14 @@ static void highbd_iadst16(const tran_low_t *input, tran_low_t *output,
s14 = (- cospi_16_64) * (x14 + x15);
s15 = cospi_16_64 * (x14 - x15);
- x2 = WRAPLOW(dct_const_round_shift(s2), bd);
- x3 = WRAPLOW(dct_const_round_shift(s3), bd);
- x6 = WRAPLOW(dct_const_round_shift(s6), bd);
- x7 = WRAPLOW(dct_const_round_shift(s7), bd);
- x10 = WRAPLOW(dct_const_round_shift(s10), bd);
- x11 = WRAPLOW(dct_const_round_shift(s11), bd);
- x14 = WRAPLOW(dct_const_round_shift(s14), bd);
- x15 = WRAPLOW(dct_const_round_shift(s15), bd);
+ x2 = WRAPLOW(highbd_dct_const_round_shift(s2, bd), bd);
+ x3 = WRAPLOW(highbd_dct_const_round_shift(s3, bd), bd);
+ x6 = WRAPLOW(highbd_dct_const_round_shift(s6, bd), bd);
+ x7 = WRAPLOW(highbd_dct_const_round_shift(s7, bd), bd);
+ x10 = WRAPLOW(highbd_dct_const_round_shift(s10, bd), bd);
+ x11 = WRAPLOW(highbd_dct_const_round_shift(s11, bd), bd);
+ x14 = WRAPLOW(highbd_dct_const_round_shift(s14, bd), bd);
+ x15 = WRAPLOW(highbd_dct_const_round_shift(s15, bd), bd);
output[0] = WRAPLOW(x0, bd);
output[1] = WRAPLOW(-x8, bd);
@@ -2280,9 +2242,9 @@ static void highbd_iadst16(const tran_low_t *input, tran_low_t *output,
}
static const highbd_transform_2d HIGH_IHT_16[] = {
- { highbd_idct16, highbd_idct16 }, // DCT_DCT = 0
- { highbd_iadst16, highbd_idct16 }, // ADST_DCT = 1
- { highbd_idct16, highbd_iadst16 }, // DCT_ADST = 2
+ { vp9_highbd_idct16, vp9_highbd_idct16 }, // DCT_DCT = 0
+ { highbd_iadst16, vp9_highbd_idct16 }, // ADST_DCT = 1
+ { vp9_highbd_idct16, highbd_iadst16 }, // DCT_ADST = 2
{ highbd_iadst16, highbd_iadst16 } // ADST_ADST = 3
};
@@ -2325,7 +2287,7 @@ void vp9_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest8,
// First transform rows. Since all non-zero dct coefficients are in the
// upper-left 4x4 area, we only need to calculate the first 4 rows here.
for (i = 0; i < 4; ++i) {
- highbd_idct16(input, outptr, bd);
+ vp9_highbd_idct16(input, outptr, bd);
input += 16;
outptr += 16;
}
@@ -2334,7 +2296,7 @@ void vp9_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest8,
for (i = 0; i < 16; ++i) {
for (j = 0; j < 16; ++j)
temp_in[j] = out[j*16 + i];
- highbd_idct16(temp_in, temp_out, bd);
+ vp9_highbd_idct16(temp_in, temp_out, bd);
for (j = 0; j < 16; ++j) {
dest[j * stride + i] = highbd_clip_pixel_add(
dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
@@ -2346,10 +2308,11 @@ void vp9_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest8,
int stride, int bd) {
int i, j;
tran_high_t a1;
- tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
+ tran_low_t out = WRAPLOW(
+ highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd);
uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
- out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
+ out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd);
a1 = ROUND_POWER_OF_TWO(out, 6);
for (j = 0; j < 16; ++j) {
for (i = 0; i < 16; ++i)
@@ -2383,43 +2346,43 @@ static void highbd_idct32(const tran_low_t *input, tran_low_t *output, int bd) {
temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
- step1[16] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[31] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[16] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[31] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
- step1[17] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[30] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[17] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[30] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
- step1[18] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[29] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[18] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[29] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
- step1[19] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[28] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[19] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[28] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
- step1[20] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[27] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[20] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[27] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
- step1[21] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[26] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
- step1[22] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[25] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[22] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[25] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
- step1[23] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[24] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[23] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[24] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
// stage 2
step2[0] = step1[0];
@@ -2433,23 +2396,23 @@ static void highbd_idct32(const tran_low_t *input, tran_low_t *output, int bd) {
temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
- step2[8] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step2[15] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step2[8] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[15] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
- step2[9] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step2[14] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
- step2[10] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step2[13] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
- step2[11] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step2[12] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
step2[16] = WRAPLOW(step1[16] + step1[17], bd);
step2[17] = WRAPLOW(step1[16] - step1[17], bd);
@@ -2476,12 +2439,12 @@ static void highbd_idct32(const tran_low_t *input, tran_low_t *output, int bd) {
temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
- step1[4] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[7] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[4] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[7] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
- step1[5] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[6] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
step1[8] = WRAPLOW(step2[8] + step2[9], bd);
step1[9] = WRAPLOW(step2[8] - step2[9], bd);
@@ -2496,22 +2459,22 @@ static void highbd_idct32(const tran_low_t *input, tran_low_t *output, int bd) {
step1[31] = step2[31];
temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
- step1[17] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[30] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[17] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[30] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
- step1[18] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[29] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[18] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[29] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
step1[19] = step2[19];
step1[20] = step2[20];
temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
- step1[21] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[26] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
- step1[22] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[25] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[22] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[25] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
step1[23] = step2[23];
step1[24] = step2[24];
step1[27] = step2[27];
@@ -2520,12 +2483,12 @@ static void highbd_idct32(const tran_low_t *input, tran_low_t *output, int bd) {
// stage 4
temp1 = (step1[0] + step1[1]) * cospi_16_64;
temp2 = (step1[0] - step1[1]) * cospi_16_64;
- step2[0] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step2[1] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step2[0] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[1] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
- step2[2] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step2[3] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step2[2] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[3] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
step2[4] = WRAPLOW(step1[4] + step1[5], bd);
step2[5] = WRAPLOW(step1[4] - step1[5], bd);
step2[6] = WRAPLOW(-step1[6] + step1[7], bd);
@@ -2535,12 +2498,12 @@ static void highbd_idct32(const tran_low_t *input, tran_low_t *output, int bd) {
step2[15] = step1[15];
temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
- step2[9] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step2[14] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
- step2[10] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step2[13] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
step2[11] = step1[11];
step2[12] = step1[12];
@@ -2570,8 +2533,8 @@ static void highbd_idct32(const tran_low_t *input, tran_low_t *output, int bd) {
step1[4] = step2[4];
temp1 = (step2[6] - step2[5]) * cospi_16_64;
temp2 = (step2[5] + step2[6]) * cospi_16_64;
- step1[5] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[6] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
step1[7] = step2[7];
step1[8] = WRAPLOW(step2[8] + step2[11], bd);
@@ -2587,20 +2550,20 @@ static void highbd_idct32(const tran_low_t *input, tran_low_t *output, int bd) {
step1[17] = step2[17];
temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
- step1[18] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[29] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[18] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[29] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
- step1[19] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[28] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[19] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[28] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
- step1[20] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[27] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[20] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[27] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
- step1[21] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[26] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
step1[22] = step2[22];
step1[23] = step2[23];
step1[24] = step2[24];
@@ -2621,12 +2584,12 @@ static void highbd_idct32(const tran_low_t *input, tran_low_t *output, int bd) {
step2[9] = step1[9];
temp1 = (-step1[10] + step1[13]) * cospi_16_64;
temp2 = (step1[10] + step1[13]) * cospi_16_64;
- step2[10] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step2[13] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = (-step1[11] + step1[12]) * cospi_16_64;
temp2 = (step1[11] + step1[12]) * cospi_16_64;
- step2[11] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step2[12] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
step2[14] = step1[14];
step2[15] = step1[15];
@@ -2672,20 +2635,20 @@ static void highbd_idct32(const tran_low_t *input, tran_low_t *output, int bd) {
step1[19] = step2[19];
temp1 = (-step2[20] + step2[27]) * cospi_16_64;
temp2 = (step2[20] + step2[27]) * cospi_16_64;
- step1[20] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[27] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[20] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[27] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = (-step2[21] + step2[26]) * cospi_16_64;
temp2 = (step2[21] + step2[26]) * cospi_16_64;
- step1[21] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[26] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = (-step2[22] + step2[25]) * cospi_16_64;
temp2 = (step2[22] + step2[25]) * cospi_16_64;
- step1[22] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[25] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[22] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[25] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
temp1 = (-step2[23] + step2[24]) * cospi_16_64;
temp2 = (step2[23] + step2[24]) * cospi_16_64;
- step1[23] = WRAPLOW(dct_const_round_shift(temp1), bd);
- step1[24] = WRAPLOW(dct_const_round_shift(temp2), bd);
+ step1[23] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
+ step1[24] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
step1[28] = step2[28];
step1[29] = step2[29];
step1[30] = step2[30];
@@ -2749,7 +2712,7 @@ void vp9_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8,
if (zero_coeff[0] | zero_coeff[1])
highbd_idct32(input, outptr, bd);
else
- vpx_memset(outptr, 0, sizeof(tran_low_t) * 32);
+ memset(outptr, 0, sizeof(tran_low_t) * 32);
input += 32;
outptr += 32;
}
@@ -2799,8 +2762,9 @@ void vp9_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest8,
int a1;
uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
- tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
- out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
+ tran_low_t out = WRAPLOW(
+ highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd);
+ out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd);
a1 = ROUND_POWER_OF_TWO(out, 6);
for (j = 0; j < 32; ++j) {
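The vp9_idct.c hunks above apply one mechanical substitution throughout the high-bitdepth inverse transforms: every dct_const_round_shift(x) feeding WRAPLOW now goes through the bd-aware highbd_dct_const_round_shift(x, bd) added in the vp9_idct.h diff below. A self-contained sketch of the recurring rotate-and-round pattern; the helper name highbd_butterfly is hypothetical (not part of libvpx), and the typedefs are stand-ins for the libvpx high-bitdepth definitions:

#include <stdint.h>

typedef int32_t tran_low_t;   /* stand-in for the libvpx highbd typedef */
typedef int64_t tran_high_t;  /* stand-in for the libvpx highbd typedef */

#define DCT_CONST_BITS 14
#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))
#define WRAPLOW(x, bd) (x)    /* non-emulation build: pass-through */

static tran_low_t highbd_dct_const_round_shift(tran_high_t input, int bd) {
  (void)bd;  /* the range assertions are elided from this sketch */
  return (tran_low_t)ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
}

/* Hypothetical helper showing the coefficient-pair rotation that the
 * hunks above repeat at each butterfly stage. */
static void highbd_butterfly(tran_high_t a, tran_high_t b,
                             tran_high_t c1, tran_high_t c2,
                             tran_low_t *lo, tran_low_t *hi, int bd) {
  *lo = WRAPLOW(highbd_dct_const_round_shift(a * c1 - b * c2, bd), bd);
  *hi = WRAPLOW(highbd_dct_const_round_shift(a * c2 + b * c1, bd), bd);
}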
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_idct.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_idct.h
index 12569b9dee9..6e2551dd4bc 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_idct.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_idct.h
@@ -29,10 +29,12 @@ extern "C" {
#define UNIT_QUANT_FACTOR (1 << UNIT_QUANT_SHIFT)
#define pair_set_epi16(a, b) \
- _mm_set_epi16(b, a, b, a, b, a, b, a)
+ _mm_set_epi16((int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
+ (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a))
#define dual_set_epi16(a, b) \
- _mm_set_epi16(b, b, b, b, a, a, a, a)
+ _mm_set_epi16((int16_t)(b), (int16_t)(b), (int16_t)(b), (int16_t)(b), \
+ (int16_t)(a), (int16_t)(a), (int16_t)(a), (int16_t)(a))
// Constants:
// for (int i = 1; i< 32; ++i)
@@ -78,13 +80,7 @@ static const tran_high_t sinpi_3_9 = 13377;
static const tran_high_t sinpi_4_9 = 15212;
static INLINE tran_low_t check_range(tran_high_t input) {
-#if CONFIG_VP9_HIGHBITDEPTH
- // For valid highbitdepth VP9 streams, intermediate stage coefficients will
- // stay within the ranges:
- // - 8 bit: signed 16 bit integer
- // - 10 bit: signed 18 bit integer
- // - 12 bit: signed 20 bit integer
-#elif CONFIG_COEFFICIENT_RANGE_CHECKING
+#if CONFIG_COEFFICIENT_RANGE_CHECKING
// For valid VP9 input streams, intermediate stage coefficients should always
// stay within the range of a signed 16 bit integer. Coefficients can go out
// of this range for invalid/corrupt VP9 streams. However, strictly checking
@@ -93,7 +89,7 @@ static INLINE tran_low_t check_range(tran_high_t input) {
// --enable-coefficient-range-checking.
assert(INT16_MIN <= input);
assert(input <= INT16_MAX);
-#endif
+#endif // CONFIG_COEFFICIENT_RANGE_CHECKING
return (tran_low_t)input;
}
@@ -102,6 +98,32 @@ static INLINE tran_low_t dct_const_round_shift(tran_high_t input) {
return check_range(rv);
}
+#if CONFIG_VP9_HIGHBITDEPTH
+static INLINE tran_low_t highbd_check_range(tran_high_t input,
+ int bd) {
+#if CONFIG_COEFFICIENT_RANGE_CHECKING
+ // For valid highbitdepth VP9 streams, intermediate stage coefficients will
+ // stay within the ranges:
+ // - 8 bit: signed 16 bit integer
+ // - 10 bit: signed 18 bit integer
+ // - 12 bit: signed 20 bit integer
+ const int32_t int_max = (1 << (7 + bd)) - 1;
+ const int32_t int_min = -int_max - 1;
+ assert(int_min <= input);
+ assert(input <= int_max);
+ (void) int_min;
+#endif // CONFIG_COEFFICIENT_RANGE_CHECKING
+ (void) bd;
+ return (tran_low_t)input;
+}
+
+static INLINE tran_low_t highbd_dct_const_round_shift(tran_high_t input,
+ int bd) {
+ tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
+ return highbd_check_range(rv, bd);
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
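For reference, the int_max bound in highbd_check_range above works out to 32767 for bd == 8, 131071 for bd == 10, and 524287 for bd == 12, i.e. the signed 16/18/20-bit ranges named in the comment. A quick standalone sanity check (not part of the patch):

#include <assert.h>

int main(void) {
  assert(((1 << (7 + 8)) - 1) == 32767);    /* bd = 8: signed 16-bit max */
  assert(((1 << (7 + 10)) - 1) == 131071);  /* bd = 10: signed 18-bit max */
  assert(((1 << (7 + 12)) - 1) == 524287);  /* bd = 12: signed 20-bit max */
  return 0;
}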
typedef void (*transform_1d)(const tran_low_t*, tran_low_t*);
typedef struct {
@@ -116,6 +138,28 @@ typedef struct {
} highbd_transform_2d;
#endif // CONFIG_VP9_HIGHBITDEPTH
+#if CONFIG_EMULATE_HARDWARE
+// When CONFIG_EMULATE_HARDWARE is 1, the transform uses a
+// non-normative method to handle overflows. A stream that causes
+// overflows in the inverse transform is considered invalid in VP9,
+// and a hardware implementer is free to choose any reasonable
+// method to handle overflows. However, to aid in hardware
+// verification, they can use a specific implementation of the
+// WRAPLOW() macro below that is identical to their intended
+// hardware implementation (and also use configure options to trigger
+// the C implementation of the transform).
+//
+// The particular WRAPLOW implementation below performs strict
+// overflow wrapping to match common hardware implementations.
+// A bd of 8 uses tran_low with 16 bits, so 16 bits must be removed;
+// a bd of 10 uses tran_low with 18 bits, so 14 bits must be removed;
+// a bd of 12 uses tran_low with 20 bits, so 12 bits must be removed;
+// a bd of x uses tran_low with 8+x bits, so 24-x bits must be removed.
+#define WRAPLOW(x, bd) ((((int32_t)(x)) << (24 - bd)) >> (24 - bd))
+#else
+#define WRAPLOW(x, bd) (x)
+#endif // CONFIG_EMULATE_HARDWARE
+
void vp9_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
int eob);
void vp9_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
@@ -135,6 +179,9 @@ void vp9_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest,
int stride, int eob);
#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_highbd_idct4(const tran_low_t *input, tran_low_t *output, int bd);
+void vp9_highbd_idct8(const tran_low_t *input, tran_low_t *output, int bd);
+void vp9_highbd_idct16(const tran_low_t *input, tran_low_t *output, int bd);
void vp9_highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
int eob, int bd);
void vp9_highbd_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
@@ -151,6 +198,11 @@ void vp9_highbd_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input,
uint8_t *dest, int stride, int eob, int bd);
void vp9_highbd_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input,
uint8_t *dest, int stride, int eob, int bd);
+static INLINE uint16_t highbd_clip_pixel_add(uint16_t dest, tran_high_t trans,
+ int bd) {
+ trans = WRAPLOW(trans, bd);
+ return clip_pixel_highbd(WRAPLOW(dest + trans, bd), bd);
+}
#endif // CONFIG_VP9_HIGHBITDEPTH
#ifdef __cplusplus
} // extern "C"
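A worked example of the emulate-hardware WRAPLOW added above, for bd == 8 (where the macro keeps the low 16 bits as a signed value). The sketch below restates the wrap with explicit casts rather than the macro's shift pair, since shifting an overflowing value is only well-defined on common two's-complement compilers; the assertions are plain modular arithmetic:

#include <assert.h>
#include <stdint.h>

/* Portable restatement of WRAPLOW for bd == 8: keep the low 16 bits of
 * the coefficient and sign-extend, as a 16-bit hardware datapath would. */
static int32_t wraplow_bd8(int32_t x) {
  return (int16_t)(uint16_t)x;
}

int main(void) {
  assert(wraplow_bd8(40000) == -25536);   /* 40000 - 65536 */
  assert(wraplow_bd8(-40000) == 25536);   /* -40000 + 65536 */
  assert(wraplow_bd8(1234) == 1234);      /* in-range values pass through */
  return 0;
}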
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_loopfilter.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_loopfilter.c
index aca8d7b33bd..69d393ef469 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_loopfilter.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_loopfilter.c
@@ -34,10 +34,10 @@
//
// A loopfilter should be applied to every other 8x8 horizontally.
static const uint64_t left_64x64_txform_mask[TX_SIZES]= {
- 0xffffffffffffffff, // TX_4X4
- 0xffffffffffffffff, // TX_8x8
- 0x5555555555555555, // TX_16x16
- 0x1111111111111111, // TX_32x32
+ 0xffffffffffffffffULL, // TX_4X4
+ 0xffffffffffffffffULL, // TX_8x8
+ 0x5555555555555555ULL, // TX_16x16
+ 0x1111111111111111ULL, // TX_32x32
};
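The ULL suffixes added throughout these tables make the 64-bit type of each constant explicit instead of relying on integer-literal promotion rules, which avoids compiler warnings (and, with some older toolchains, truncation) where the values are wider than long. For readers of the mask layout: each byte of a 64-bit mask is one row of eight 8x8 blocks in a 64x64 superblock, and bit i of a row selects column i for filtering. A small sketch decoding the patterns above:

#include <assert.h>
#include <stdint.h>

/* Extract one 8-block row from a 64-bit loopfilter mask. */
static unsigned mask_row(uint64_t mask, int row) {
  return (unsigned)((mask >> (row * 8)) & 0xff);
}

int main(void) {
  const uint64_t tx16 = 0x5555555555555555ULL;  /* TX_16X16 */
  const uint64_t tx32 = 0x1111111111111111ULL;  /* TX_32X32 */
  assert(mask_row(tx16, 0) == 0x55);  /* every other 8x8 column */
  assert(mask_row(tx32, 3) == 0x11);  /* every fourth 8x8 column */
  return 0;
}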
// 64 bit masks for above transform size. Each 1 represents a position where
@@ -58,10 +58,10 @@ static const uint64_t left_64x64_txform_mask[TX_SIZES]= {
//
// A loopfilter should be applied to every other 8x8 row vertically.
static const uint64_t above_64x64_txform_mask[TX_SIZES]= {
- 0xffffffffffffffff, // TX_4X4
- 0xffffffffffffffff, // TX_8x8
- 0x00ff00ff00ff00ff, // TX_16x16
- 0x000000ff000000ff, // TX_32x32
+ 0xffffffffffffffffULL, // TX_4X4
+ 0xffffffffffffffffULL, // TX_8x8
+ 0x00ff00ff00ff00ffULL, // TX_16x16
+ 0x000000ff000000ffULL, // TX_32x32
};
// 64 bit masks for prediction sizes (left). Each 1 represents a position
@@ -80,59 +80,59 @@ static const uint64_t above_64x64_txform_mask[TX_SIZES]= {
// 00000000
// 00000000
static const uint64_t left_prediction_mask[BLOCK_SIZES] = {
- 0x0000000000000001, // BLOCK_4X4,
- 0x0000000000000001, // BLOCK_4X8,
- 0x0000000000000001, // BLOCK_8X4,
- 0x0000000000000001, // BLOCK_8X8,
- 0x0000000000000101, // BLOCK_8X16,
- 0x0000000000000001, // BLOCK_16X8,
- 0x0000000000000101, // BLOCK_16X16,
- 0x0000000001010101, // BLOCK_16X32,
- 0x0000000000000101, // BLOCK_32X16,
- 0x0000000001010101, // BLOCK_32X32,
- 0x0101010101010101, // BLOCK_32X64,
- 0x0000000001010101, // BLOCK_64X32,
- 0x0101010101010101, // BLOCK_64X64
+ 0x0000000000000001ULL, // BLOCK_4X4,
+ 0x0000000000000001ULL, // BLOCK_4X8,
+ 0x0000000000000001ULL, // BLOCK_8X4,
+ 0x0000000000000001ULL, // BLOCK_8X8,
+ 0x0000000000000101ULL, // BLOCK_8X16,
+ 0x0000000000000001ULL, // BLOCK_16X8,
+ 0x0000000000000101ULL, // BLOCK_16X16,
+ 0x0000000001010101ULL, // BLOCK_16X32,
+ 0x0000000000000101ULL, // BLOCK_32X16,
+ 0x0000000001010101ULL, // BLOCK_32X32,
+ 0x0101010101010101ULL, // BLOCK_32X64,
+ 0x0000000001010101ULL, // BLOCK_64X32,
+ 0x0101010101010101ULL, // BLOCK_64X64
};
// 64 bit mask to shift and set for each prediction size.
static const uint64_t above_prediction_mask[BLOCK_SIZES] = {
- 0x0000000000000001, // BLOCK_4X4
- 0x0000000000000001, // BLOCK_4X8
- 0x0000000000000001, // BLOCK_8X4
- 0x0000000000000001, // BLOCK_8X8
- 0x0000000000000001, // BLOCK_8X16,
- 0x0000000000000003, // BLOCK_16X8
- 0x0000000000000003, // BLOCK_16X16
- 0x0000000000000003, // BLOCK_16X32,
- 0x000000000000000f, // BLOCK_32X16,
- 0x000000000000000f, // BLOCK_32X32,
- 0x000000000000000f, // BLOCK_32X64,
- 0x00000000000000ff, // BLOCK_64X32,
- 0x00000000000000ff, // BLOCK_64X64
+ 0x0000000000000001ULL, // BLOCK_4X4
+ 0x0000000000000001ULL, // BLOCK_4X8
+ 0x0000000000000001ULL, // BLOCK_8X4
+ 0x0000000000000001ULL, // BLOCK_8X8
+ 0x0000000000000001ULL, // BLOCK_8X16,
+ 0x0000000000000003ULL, // BLOCK_16X8
+ 0x0000000000000003ULL, // BLOCK_16X16
+ 0x0000000000000003ULL, // BLOCK_16X32,
+ 0x000000000000000fULL, // BLOCK_32X16,
+ 0x000000000000000fULL, // BLOCK_32X32,
+ 0x000000000000000fULL, // BLOCK_32X64,
+ 0x00000000000000ffULL, // BLOCK_64X32,
+ 0x00000000000000ffULL, // BLOCK_64X64
};
// 64 bit mask to shift and set for each prediction size. A bit is set for
// each 8x8 block that would be in the left most block of the given block
// size in the 64x64 block.
static const uint64_t size_mask[BLOCK_SIZES] = {
- 0x0000000000000001, // BLOCK_4X4
- 0x0000000000000001, // BLOCK_4X8
- 0x0000000000000001, // BLOCK_8X4
- 0x0000000000000001, // BLOCK_8X8
- 0x0000000000000101, // BLOCK_8X16,
- 0x0000000000000003, // BLOCK_16X8
- 0x0000000000000303, // BLOCK_16X16
- 0x0000000003030303, // BLOCK_16X32,
- 0x0000000000000f0f, // BLOCK_32X16,
- 0x000000000f0f0f0f, // BLOCK_32X32,
- 0x0f0f0f0f0f0f0f0f, // BLOCK_32X64,
- 0x00000000ffffffff, // BLOCK_64X32,
- 0xffffffffffffffff, // BLOCK_64X64
+ 0x0000000000000001ULL, // BLOCK_4X4
+ 0x0000000000000001ULL, // BLOCK_4X8
+ 0x0000000000000001ULL, // BLOCK_8X4
+ 0x0000000000000001ULL, // BLOCK_8X8
+ 0x0000000000000101ULL, // BLOCK_8X16,
+ 0x0000000000000003ULL, // BLOCK_16X8
+ 0x0000000000000303ULL, // BLOCK_16X16
+ 0x0000000003030303ULL, // BLOCK_16X32,
+ 0x0000000000000f0fULL, // BLOCK_32X16,
+ 0x000000000f0f0f0fULL, // BLOCK_32X32,
+ 0x0f0f0f0f0f0f0f0fULL, // BLOCK_32X64,
+ 0x00000000ffffffffULL, // BLOCK_64X32,
+ 0xffffffffffffffffULL, // BLOCK_64X64
};
// These are used for masking the left and above borders.
-static const uint64_t left_border = 0x1111111111111111;
-static const uint64_t above_border = 0x000000ff000000ff;
+static const uint64_t left_border = 0x1111111111111111ULL;
+static const uint64_t above_border = 0x000000ff000000ffULL;
// 16 bit masks for uv transform sizes.
static const uint16_t left_64x64_txform_mask_uv[TX_SIZES]= {
@@ -222,9 +222,9 @@ static void update_sharpness(loop_filter_info_n *lfi, int sharpness_lvl) {
if (block_inside_limit < 1)
block_inside_limit = 1;
- vpx_memset(lfi->lfthr[lvl].lim, block_inside_limit, SIMD_WIDTH);
- vpx_memset(lfi->lfthr[lvl].mblim, (2 * (lvl + 2) + block_inside_limit),
- SIMD_WIDTH);
+ memset(lfi->lfthr[lvl].lim, block_inside_limit, SIMD_WIDTH);
+ memset(lfi->lfthr[lvl].mblim, (2 * (lvl + 2) + block_inside_limit),
+ SIMD_WIDTH);
}
}
@@ -245,7 +245,7 @@ void vp9_loop_filter_init(VP9_COMMON *cm) {
// init hev threshold const vectors
for (lvl = 0; lvl <= MAX_LOOP_FILTER; lvl++)
- vpx_memset(lfi->lfthr[lvl].hev_thr, (lvl >> 4), SIMD_WIDTH);
+ memset(lfi->lfthr[lvl].hev_thr, (lvl >> 4), SIMD_WIDTH);
}
void vp9_loop_filter_frame_init(VP9_COMMON *cm, int default_filt_lvl) {
@@ -276,7 +276,7 @@ void vp9_loop_filter_frame_init(VP9_COMMON *cm, int default_filt_lvl) {
if (!lf->mode_ref_delta_enabled) {
// we could get rid of this if we assume that deltas are set to
// zero when not in use; encoder always uses deltas
- vpx_memset(lfi->lvl[seg_id], lvl_seg, sizeof(lfi->lvl[seg_id]));
+ memset(lfi->lvl[seg_id], lvl_seg, sizeof(lfi->lvl[seg_id]));
} else {
int ref, mode;
const int intra_lvl = lvl_seg + lf->ref_deltas[INTRA_FRAME] * scale;
@@ -293,7 +293,7 @@ void vp9_loop_filter_frame_init(VP9_COMMON *cm, int default_filt_lvl) {
}
}
-static void filter_selectively_vert_row2(PLANE_TYPE plane_type,
+static void filter_selectively_vert_row2(int subsampling_factor,
uint8_t *s, int pitch,
unsigned int mask_16x16_l,
unsigned int mask_8x8_l,
@@ -301,9 +301,9 @@ static void filter_selectively_vert_row2(PLANE_TYPE plane_type,
unsigned int mask_4x4_int_l,
const loop_filter_info_n *lfi_n,
const uint8_t *lfl) {
- const int mask_shift = plane_type ? 4 : 8;
- const int mask_cutoff = plane_type ? 0xf : 0xff;
- const int lfl_forward = plane_type ? 4 : 8;
+ const int mask_shift = subsampling_factor ? 4 : 8;
+ const int mask_cutoff = subsampling_factor ? 0xf : 0xff;
+ const int lfl_forward = subsampling_factor ? 4 : 8;
unsigned int mask_16x16_0 = mask_16x16_l & mask_cutoff;
unsigned int mask_8x8_0 = mask_8x8_l & mask_cutoff;
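The rename from plane_type to subsampling_factor in this hunk matches what the value actually encodes (callers now pass plane->subsampling_x, as the new vp9_filter_block_plane_ss00 below shows): a subsampled chroma plane packs 4 block columns per mask row and advances the filter-level array by 4, versus 8 for full-resolution luma. A standalone restatement of that bookkeeping, with hypothetical names:

#include <assert.h>

/* Hypothetical sketch of the mask_shift/mask_cutoff logic above:
 * extract the second row's bits from a two-row packed mask. */
static unsigned second_row_mask(unsigned two_row_mask, int subsampling) {
  const int mask_shift = subsampling ? 4 : 8;
  const unsigned mask_cutoff = subsampling ? 0xfu : 0xffu;
  return (two_row_mask >> mask_shift) & mask_cutoff;
}

int main(void) {
  assert(second_row_mask(0xabu, 1) == 0xa);     /* chroma: 4-bit rows */
  assert(second_row_mask(0xabcdu, 0) == 0xab);  /* luma: 8-bit rows */
  return 0;
}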
@@ -393,7 +393,7 @@ static void filter_selectively_vert_row2(PLANE_TYPE plane_type,
}
#if CONFIG_VP9_HIGHBITDEPTH
-static void highbd_filter_selectively_vert_row2(PLANE_TYPE plane_type,
+static void highbd_filter_selectively_vert_row2(int subsampling_factor,
uint16_t *s, int pitch,
unsigned int mask_16x16_l,
unsigned int mask_8x8_l,
@@ -401,9 +401,9 @@ static void highbd_filter_selectively_vert_row2(PLANE_TYPE plane_type,
unsigned int mask_4x4_int_l,
const loop_filter_info_n *lfi_n,
const uint8_t *lfl, int bd) {
- const int mask_shift = plane_type ? 4 : 8;
- const int mask_cutoff = plane_type ? 0xf : 0xff;
- const int lfl_forward = plane_type ? 4 : 8;
+ const int mask_shift = subsampling_factor ? 4 : 8;
+ const int mask_cutoff = subsampling_factor ? 0xf : 0xff;
+ const int lfl_forward = subsampling_factor ? 4 : 8;
unsigned int mask_16x16_0 = mask_16x16_l & mask_cutoff;
unsigned int mask_8x8_0 = mask_8x8_l & mask_cutoff;
@@ -727,7 +727,7 @@ static void build_masks(const loop_filter_info_n *const lfi_n,
const int h = num_8x8_blocks_high_lookup[block_size];
int index = shift_y;
for (i = 0; i < h; i++) {
- vpx_memset(&lfm->lfl_y[index], filter_level, w);
+ memset(&lfm->lfl_y[index], filter_level, w);
index += 8;
}
}
@@ -773,7 +773,7 @@ static void build_masks(const loop_filter_info_n *const lfi_n,
// an 8x8 in that the internal ones can be skipped and don't depend on
// the prediction block size.
if (tx_size_y == TX_4X4)
- *int_4x4_y |= (size_mask[block_size] & 0xffffffffffffffff) << shift_y;
+ *int_4x4_y |= (size_mask[block_size] & 0xffffffffffffffffULL) << shift_y;
if (tx_size_uv == TX_4X4)
*int_4x4_uv |= (size_mask_uv[block_size] & 0xffff) << shift_uv;
@@ -801,7 +801,7 @@ static void build_y_mask(const loop_filter_info_n *const lfi_n,
const int h = num_8x8_blocks_high_lookup[block_size];
int index = shift_y;
for (i = 0; i < h; i++) {
- vpx_memset(&lfm->lfl_y[index], filter_level, w);
+ memset(&lfm->lfl_y[index], filter_level, w);
index += 8;
}
}
@@ -819,19 +819,19 @@ static void build_y_mask(const loop_filter_info_n *const lfi_n,
left_64x64_txform_mask[tx_size_y]) << shift_y;
if (tx_size_y == TX_4X4)
- *int_4x4_y |= (size_mask[block_size] & 0xffffffffffffffff) << shift_y;
+ *int_4x4_y |= (size_mask[block_size] & 0xffffffffffffffffULL) << shift_y;
}
// This function sets up the bit masks for the entire 64x64 region represented
// by mi_row, mi_col.
// TODO(JBB): This function only works for yv12.
void vp9_setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col,
- MODE_INFO *mi, const int mode_info_stride,
+ MODE_INFO **mi, const int mode_info_stride,
LOOP_FILTER_MASK *lfm) {
int idx_32, idx_16, idx_8;
const loop_filter_info_n *const lfi_n = &cm->lf_info;
- MODE_INFO *mip = mi;
- MODE_INFO *mip2 = mi;
+ MODE_INFO **mip = mi;
+ MODE_INFO **mip2 = mi;
// These are offsets to the next mi in the 64x64 block. It is what gets
// added to the mi ptr as we go through each loop. It helps us to avoid
@@ -859,28 +859,28 @@ void vp9_setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col,
cm->mi_cols - mi_col : MI_BLOCK_SIZE);
vp9_zero(*lfm);
- assert(mip != NULL);
+ assert(mip[0] != NULL);
// TODO(jimbankoski): Try moving most of the following code into decode
// loop and storing lfm in the mbmi structure so that we don't have to go
// through the recursive loop structure multiple times.
- switch (mip->mbmi.sb_type) {
+ switch (mip[0]->mbmi.sb_type) {
case BLOCK_64X64:
- build_masks(lfi_n, mip , 0, 0, lfm);
+ build_masks(lfi_n, mip[0] , 0, 0, lfm);
break;
case BLOCK_64X32:
- build_masks(lfi_n, mip, 0, 0, lfm);
+ build_masks(lfi_n, mip[0], 0, 0, lfm);
mip2 = mip + mode_info_stride * 4;
if (4 >= max_rows)
break;
- build_masks(lfi_n, mip2, 32, 8, lfm);
+ build_masks(lfi_n, mip2[0], 32, 8, lfm);
break;
case BLOCK_32X64:
- build_masks(lfi_n, mip, 0, 0, lfm);
+ build_masks(lfi_n, mip[0], 0, 0, lfm);
mip2 = mip + 4;
if (4 >= max_cols)
break;
- build_masks(lfi_n, mip2, 4, 2, lfm);
+ build_masks(lfi_n, mip2[0], 4, 2, lfm);
break;
default:
for (idx_32 = 0; idx_32 < 4; mip += offset_32[idx_32], ++idx_32) {
@@ -890,23 +890,23 @@ void vp9_setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col,
const int mi_32_row_offset = ((idx_32 >> 1) << 2);
if (mi_32_col_offset >= max_cols || mi_32_row_offset >= max_rows)
continue;
- switch (mip->mbmi.sb_type) {
+ switch (mip[0]->mbmi.sb_type) {
case BLOCK_32X32:
- build_masks(lfi_n, mip, shift_y, shift_uv, lfm);
+ build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm);
break;
case BLOCK_32X16:
- build_masks(lfi_n, mip, shift_y, shift_uv, lfm);
+ build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm);
if (mi_32_row_offset + 2 >= max_rows)
continue;
mip2 = mip + mode_info_stride * 2;
- build_masks(lfi_n, mip2, shift_y + 16, shift_uv + 4, lfm);
+ build_masks(lfi_n, mip2[0], shift_y + 16, shift_uv + 4, lfm);
break;
case BLOCK_16X32:
- build_masks(lfi_n, mip, shift_y, shift_uv, lfm);
+ build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm);
if (mi_32_col_offset + 2 >= max_cols)
continue;
mip2 = mip + 2;
- build_masks(lfi_n, mip2, shift_y + 2, shift_uv + 1, lfm);
+ build_masks(lfi_n, mip2[0], shift_y + 2, shift_uv + 1, lfm);
break;
default:
for (idx_16 = 0; idx_16 < 4; mip += offset_16[idx_16], ++idx_16) {
@@ -920,29 +920,29 @@ void vp9_setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col,
if (mi_16_col_offset >= max_cols || mi_16_row_offset >= max_rows)
continue;
- switch (mip->mbmi.sb_type) {
+ switch (mip[0]->mbmi.sb_type) {
case BLOCK_16X16:
- build_masks(lfi_n, mip, shift_y, shift_uv, lfm);
+ build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm);
break;
case BLOCK_16X8:
- build_masks(lfi_n, mip, shift_y, shift_uv, lfm);
+ build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm);
if (mi_16_row_offset + 1 >= max_rows)
continue;
mip2 = mip + mode_info_stride;
- build_y_mask(lfi_n, mip2, shift_y+8, lfm);
+ build_y_mask(lfi_n, mip2[0], shift_y+8, lfm);
break;
case BLOCK_8X16:
- build_masks(lfi_n, mip, shift_y, shift_uv, lfm);
+ build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm);
if (mi_16_col_offset +1 >= max_cols)
continue;
mip2 = mip + 1;
- build_y_mask(lfi_n, mip2, shift_y+1, lfm);
+ build_y_mask(lfi_n, mip2[0], shift_y+1, lfm);
break;
default: {
const int shift_y = shift_32_y[idx_32] +
shift_16_y[idx_16] +
shift_8_y[0];
- build_masks(lfi_n, mip, shift_y, shift_uv, lfm);
+ build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm);
mip += offset[0];
for (idx_8 = 1; idx_8 < 4; mip += offset[idx_8], ++idx_8) {
const int shift_y = shift_32_y[idx_32] +
@@ -956,7 +956,7 @@ void vp9_setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col,
if (mi_8_col_offset >= max_cols ||
mi_8_row_offset >= max_rows)
continue;
- build_y_mask(lfi_n, mip, shift_y, lfm);
+ build_y_mask(lfi_n, mip[0], shift_y, lfm);
}
break;
}
@@ -968,7 +968,7 @@ void vp9_setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col,
break;
}
// The largest loopfilter we have is 16x16 so we use the 16x16 mask
- // for 32x32 transforms also also.
+ // for 32x32 transforms also.
lfm->left_y[TX_16X16] |= lfm->left_y[TX_32X32];
lfm->above_y[TX_16X16] |= lfm->above_y[TX_32X32];
lfm->left_uv[TX_16X16] |= lfm->left_uv[TX_32X32];
@@ -1021,7 +1021,7 @@ void vp9_setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col,
    // Each pixel inside the border gets a 1; the multiply copies the border
    // to where we need it.
- const uint64_t mask_y = (((1 << columns) - 1)) * 0x0101010101010101;
+ const uint64_t mask_y = (((1 << columns) - 1)) * 0x0101010101010101ULL;
const uint16_t mask_uv = ((1 << ((columns + 1) >> 1)) - 1) * 0x1111;
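+    // Worked example (illustrative): if columns == 3, (1 << 3) - 1 == 0x07,
+    // so mask_y == 0x0707070707070707 -- each byte (one row of 8x8 blocks)
+    // keeps its low 3 columns. For chroma, ((3 + 1) >> 1) == 2, giving
+    // mask_uv == 0x3 * 0x1111 == 0x3333 -- one low-2-bit nibble per row.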
// Internal edges are not applied on the last column of the image so
@@ -1053,7 +1053,7 @@ void vp9_setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col,
// out.
if (mi_col == 0) {
for (i = 0; i < TX_32X32; i++) {
- lfm->left_y[i] &= 0xfefefefefefefefe;
+ lfm->left_y[i] &= 0xfefefefefefefefeULL;
lfm->left_uv[i] &= 0xeeee;
}
}
@@ -1149,10 +1149,10 @@ static void highbd_filter_selectively_vert(uint16_t *s, int pitch,
}
#endif // CONFIG_VP9_HIGHBITDEPTH
-static void filter_block_plane_non420(VP9_COMMON *cm,
- struct macroblockd_plane *plane,
- MODE_INFO *mi_8x8,
- int mi_row, int mi_col) {
+void vp9_filter_block_plane_non420(VP9_COMMON *cm,
+ struct macroblockd_plane *plane,
+ MODE_INFO **mi_8x8,
+ int mi_row, int mi_col) {
const int ss_x = plane->subsampling_x;
const int ss_y = plane->subsampling_y;
const int row_step = 1 << ss_y;
@@ -1175,7 +1175,7 @@ static void filter_block_plane_non420(VP9_COMMON *cm,
// Determine the vertical edges that need filtering
for (c = 0; c < MI_BLOCK_SIZE && mi_col + c < cm->mi_cols; c += col_step) {
- const MODE_INFO *mi = mi_8x8[c].src_mi;
+ const MODE_INFO *mi = mi_8x8[c];
const BLOCK_SIZE sb_type = mi[0].mbmi.sb_type;
const int skip_this = mi[0].mbmi.skip && is_inter_block(&mi[0].mbmi);
// left edge of current unit is block/partition edge -> no skip
@@ -1326,248 +1326,203 @@ static void filter_block_plane_non420(VP9_COMMON *cm,
}
}
-void vp9_filter_block_plane(VP9_COMMON *const cm,
- struct macroblockd_plane *const plane,
- int mi_row,
- LOOP_FILTER_MASK *lfm) {
+void vp9_filter_block_plane_ss00(VP9_COMMON *const cm,
+ struct macroblockd_plane *const plane,
+ int mi_row,
+ LOOP_FILTER_MASK *lfm) {
struct buf_2d *const dst = &plane->dst;
- uint8_t* const dst0 = dst->buf;
- int r, c;
+ uint8_t *const dst0 = dst->buf;
+ int r;
+ uint64_t mask_16x16 = lfm->left_y[TX_16X16];
+ uint64_t mask_8x8 = lfm->left_y[TX_8X8];
+ uint64_t mask_4x4 = lfm->left_y[TX_4X4];
+ uint64_t mask_4x4_int = lfm->int_4x4_y;
+
+ assert(plane->subsampling_x == 0 && plane->subsampling_y == 0);
+
+ // Vertical pass: do 2 rows at one time
+ for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += 2) {
+ unsigned int mask_16x16_l = mask_16x16 & 0xffff;
+ unsigned int mask_8x8_l = mask_8x8 & 0xffff;
+ unsigned int mask_4x4_l = mask_4x4 & 0xffff;
+ unsigned int mask_4x4_int_l = mask_4x4_int & 0xffff;
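+    // Each 64-bit left mask holds one bit per 8x8 block of the 64x64
+    // superblock, 8 bits per row, so the low 16 bits sliced off above cover
+    // the two rows handled in this iteration (layout inferred from the
+    // 16-bit shifts below).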
+
+// Disable filtering on the leftmost column.
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (cm->use_highbitdepth) {
+ highbd_filter_selectively_vert_row2(
+ plane->subsampling_x, CONVERT_TO_SHORTPTR(dst->buf), dst->stride,
+ mask_16x16_l, mask_8x8_l, mask_4x4_l, mask_4x4_int_l, &cm->lf_info,
+ &lfm->lfl_y[r << 3], (int)cm->bit_depth);
+ } else {
+ filter_selectively_vert_row2(
+ plane->subsampling_x, dst->buf, dst->stride, mask_16x16_l, mask_8x8_l,
+ mask_4x4_l, mask_4x4_int_l, &cm->lf_info, &lfm->lfl_y[r << 3]);
+ }
+#else
+ filter_selectively_vert_row2(
+ plane->subsampling_x, dst->buf, dst->stride, mask_16x16_l, mask_8x8_l,
+ mask_4x4_l, mask_4x4_int_l, &cm->lf_info, &lfm->lfl_y[r << 3]);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ dst->buf += 16 * dst->stride;
+ mask_16x16 >>= 16;
+ mask_8x8 >>= 16;
+ mask_4x4 >>= 16;
+ mask_4x4_int >>= 16;
+ }
- if (!plane->plane_type) {
- uint64_t mask_16x16 = lfm->left_y[TX_16X16];
- uint64_t mask_8x8 = lfm->left_y[TX_8X8];
- uint64_t mask_4x4 = lfm->left_y[TX_4X4];
- uint64_t mask_4x4_int = lfm->int_4x4_y;
+ // Horizontal pass
+ dst->buf = dst0;
+ mask_16x16 = lfm->above_y[TX_16X16];
+ mask_8x8 = lfm->above_y[TX_8X8];
+ mask_4x4 = lfm->above_y[TX_4X4];
+ mask_4x4_int = lfm->int_4x4_y;
+
+ for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r++) {
+ unsigned int mask_16x16_r;
+ unsigned int mask_8x8_r;
+ unsigned int mask_4x4_r;
- // Vertical pass: do 2 rows at one time
- for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += 2) {
- unsigned int mask_16x16_l = mask_16x16 & 0xffff;
- unsigned int mask_8x8_l = mask_8x8 & 0xffff;
- unsigned int mask_4x4_l = mask_4x4 & 0xffff;
- unsigned int mask_4x4_int_l = mask_4x4_int & 0xffff;
+ if (mi_row + r == 0) {
+ mask_16x16_r = 0;
+ mask_8x8_r = 0;
+ mask_4x4_r = 0;
+ } else {
+ mask_16x16_r = mask_16x16 & 0xff;
+ mask_8x8_r = mask_8x8 & 0xff;
+ mask_4x4_r = mask_4x4 & 0xff;
+ }
- // Disable filtering on the leftmost column.
#if CONFIG_VP9_HIGHBITDEPTH
- if (cm->use_highbitdepth) {
- highbd_filter_selectively_vert_row2(plane->plane_type,
- CONVERT_TO_SHORTPTR(dst->buf),
- dst->stride,
- mask_16x16_l,
- mask_8x8_l,
- mask_4x4_l,
- mask_4x4_int_l,
- &cm->lf_info, &lfm->lfl_y[r << 3],
- (int)cm->bit_depth);
- } else {
- filter_selectively_vert_row2(plane->plane_type,
- dst->buf, dst->stride,
- mask_16x16_l,
- mask_8x8_l,
- mask_4x4_l,
- mask_4x4_int_l,
- &cm->lf_info,
- &lfm->lfl_y[r << 3]);
- }
+ if (cm->use_highbitdepth) {
+ highbd_filter_selectively_horiz(
+ CONVERT_TO_SHORTPTR(dst->buf), dst->stride, mask_16x16_r, mask_8x8_r,
+ mask_4x4_r, mask_4x4_int & 0xff, &cm->lf_info, &lfm->lfl_y[r << 3],
+ (int)cm->bit_depth);
+ } else {
+ filter_selectively_horiz(dst->buf, dst->stride, mask_16x16_r, mask_8x8_r,
+ mask_4x4_r, mask_4x4_int & 0xff, &cm->lf_info,
+ &lfm->lfl_y[r << 3]);
+ }
#else
- filter_selectively_vert_row2(plane->plane_type,
- dst->buf, dst->stride,
- mask_16x16_l,
- mask_8x8_l,
- mask_4x4_l,
- mask_4x4_int_l,
- &cm->lf_info, &lfm->lfl_y[r << 3]);
+ filter_selectively_horiz(dst->buf, dst->stride, mask_16x16_r, mask_8x8_r,
+ mask_4x4_r, mask_4x4_int & 0xff, &cm->lf_info,
+ &lfm->lfl_y[r << 3]);
#endif // CONFIG_VP9_HIGHBITDEPTH
- dst->buf += 16 * dst->stride;
- mask_16x16 >>= 16;
- mask_8x8 >>= 16;
- mask_4x4 >>= 16;
- mask_4x4_int >>= 16;
- }
- // Horizontal pass
- dst->buf = dst0;
- mask_16x16 = lfm->above_y[TX_16X16];
- mask_8x8 = lfm->above_y[TX_8X8];
- mask_4x4 = lfm->above_y[TX_4X4];
- mask_4x4_int = lfm->int_4x4_y;
-
- for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r++) {
- unsigned int mask_16x16_r;
- unsigned int mask_8x8_r;
- unsigned int mask_4x4_r;
-
- if (mi_row + r == 0) {
- mask_16x16_r = 0;
- mask_8x8_r = 0;
- mask_4x4_r = 0;
- } else {
- mask_16x16_r = mask_16x16 & 0xff;
- mask_8x8_r = mask_8x8 & 0xff;
- mask_4x4_r = mask_4x4 & 0xff;
+ dst->buf += 8 * dst->stride;
+ mask_16x16 >>= 8;
+ mask_8x8 >>= 8;
+ mask_4x4 >>= 8;
+ mask_4x4_int >>= 8;
+ }
+}
+
+void vp9_filter_block_plane_ss11(VP9_COMMON *const cm,
+ struct macroblockd_plane *const plane,
+ int mi_row,
+ LOOP_FILTER_MASK *lfm) {
+ struct buf_2d *const dst = &plane->dst;
+ uint8_t *const dst0 = dst->buf;
+ int r, c;
+
+ uint16_t mask_16x16 = lfm->left_uv[TX_16X16];
+ uint16_t mask_8x8 = lfm->left_uv[TX_8X8];
+ uint16_t mask_4x4 = lfm->left_uv[TX_4X4];
+ uint16_t mask_4x4_int = lfm->int_4x4_uv;
+
+ assert(plane->subsampling_x == 1 && plane->subsampling_y == 1);
+
+ // Vertical pass: do 2 rows at one time
+ for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += 4) {
+ if (plane->plane_type == 1) {
+ for (c = 0; c < (MI_BLOCK_SIZE >> 1); c++) {
+ lfm->lfl_uv[(r << 1) + c] = lfm->lfl_y[(r << 3) + (c << 1)];
+ lfm->lfl_uv[((r + 2) << 1) + c] = lfm->lfl_y[((r + 2) << 3) + (c << 1)];
}
+ }
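+    // In the copies above, each chroma 8x8 block covers a 16x16 luma area,
+    // so the chroma filter levels are taken from every other row and column
+    // of the luma lfl grid.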
+
+ {
+ unsigned int mask_16x16_l = mask_16x16 & 0xff;
+ unsigned int mask_8x8_l = mask_8x8 & 0xff;
+ unsigned int mask_4x4_l = mask_4x4 & 0xff;
+ unsigned int mask_4x4_int_l = mask_4x4_int & 0xff;
+// Disable filtering on the leftmost column.
#if CONFIG_VP9_HIGHBITDEPTH
if (cm->use_highbitdepth) {
- highbd_filter_selectively_horiz(CONVERT_TO_SHORTPTR(dst->buf),
- dst->stride,
- mask_16x16_r,
- mask_8x8_r,
- mask_4x4_r,
- mask_4x4_int & 0xff,
- &cm->lf_info,
- &lfm->lfl_y[r << 3],
- (int)cm->bit_depth);
+ highbd_filter_selectively_vert_row2(
+ plane->subsampling_x, CONVERT_TO_SHORTPTR(dst->buf), dst->stride,
+ mask_16x16_l, mask_8x8_l, mask_4x4_l, mask_4x4_int_l, &cm->lf_info,
+ &lfm->lfl_uv[r << 1], (int)cm->bit_depth);
} else {
- filter_selectively_horiz(dst->buf, dst->stride,
- mask_16x16_r,
- mask_8x8_r,
- mask_4x4_r,
- mask_4x4_int & 0xff,
- &cm->lf_info,
- &lfm->lfl_y[r << 3]);
+ filter_selectively_vert_row2(
+ plane->subsampling_x, dst->buf, dst->stride,
+ mask_16x16_l, mask_8x8_l, mask_4x4_l, mask_4x4_int_l, &cm->lf_info,
+ &lfm->lfl_uv[r << 1]);
}
#else
- filter_selectively_horiz(dst->buf, dst->stride,
- mask_16x16_r,
- mask_8x8_r,
- mask_4x4_r,
- mask_4x4_int & 0xff,
- &cm->lf_info,
- &lfm->lfl_y[r << 3]);
+ filter_selectively_vert_row2(
+ plane->subsampling_x, dst->buf, dst->stride,
+ mask_16x16_l, mask_8x8_l, mask_4x4_l, mask_4x4_int_l, &cm->lf_info,
+ &lfm->lfl_uv[r << 1]);
#endif // CONFIG_VP9_HIGHBITDEPTH
- dst->buf += 8 * dst->stride;
+ dst->buf += 16 * dst->stride;
mask_16x16 >>= 8;
mask_8x8 >>= 8;
mask_4x4 >>= 8;
mask_4x4_int >>= 8;
}
- } else {
- uint16_t mask_16x16 = lfm->left_uv[TX_16X16];
- uint16_t mask_8x8 = lfm->left_uv[TX_8X8];
- uint16_t mask_4x4 = lfm->left_uv[TX_4X4];
- uint16_t mask_4x4_int = lfm->int_4x4_uv;
-
- // Vertical pass: do 2 rows at one time
- for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += 4) {
- if (plane->plane_type == 1) {
- for (c = 0; c < (MI_BLOCK_SIZE >> 1); c++) {
- lfm->lfl_uv[(r << 1) + c] = lfm->lfl_y[(r << 3) + (c << 1)];
- lfm->lfl_uv[((r + 2) << 1) + c] = lfm->lfl_y[((r + 2) << 3) +
- (c << 1)];
- }
- }
-
- {
- unsigned int mask_16x16_l = mask_16x16 & 0xff;
- unsigned int mask_8x8_l = mask_8x8 & 0xff;
- unsigned int mask_4x4_l = mask_4x4 & 0xff;
- unsigned int mask_4x4_int_l = mask_4x4_int & 0xff;
+ }
- // Disable filtering on the leftmost column.
-#if CONFIG_VP9_HIGHBITDEPTH
- if (cm->use_highbitdepth) {
- highbd_filter_selectively_vert_row2(plane->plane_type,
- CONVERT_TO_SHORTPTR(dst->buf),
- dst->stride,
- mask_16x16_l,
- mask_8x8_l,
- mask_4x4_l,
- mask_4x4_int_l,
- &cm->lf_info,
- &lfm->lfl_uv[r << 1],
- (int)cm->bit_depth);
- } else {
- filter_selectively_vert_row2(plane->plane_type,
- dst->buf, dst->stride,
- mask_16x16_l,
- mask_8x8_l,
- mask_4x4_l,
- mask_4x4_int_l,
- &cm->lf_info,
- &lfm->lfl_uv[r << 1]);
- }
-#else
- filter_selectively_vert_row2(plane->plane_type,
- dst->buf, dst->stride,
- mask_16x16_l,
- mask_8x8_l,
- mask_4x4_l,
- mask_4x4_int_l,
- &cm->lf_info,
- &lfm->lfl_uv[r << 1]);
-#endif // CONFIG_VP9_HIGHBITDEPTH
+ // Horizontal pass
+ dst->buf = dst0;
+ mask_16x16 = lfm->above_uv[TX_16X16];
+ mask_8x8 = lfm->above_uv[TX_8X8];
+ mask_4x4 = lfm->above_uv[TX_4X4];
+ mask_4x4_int = lfm->int_4x4_uv;
+
+ for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += 2) {
+ const int skip_border_4x4_r = mi_row + r == cm->mi_rows - 1;
+ const unsigned int mask_4x4_int_r =
+ skip_border_4x4_r ? 0 : (mask_4x4_int & 0xf);
+ unsigned int mask_16x16_r;
+ unsigned int mask_8x8_r;
+ unsigned int mask_4x4_r;
- dst->buf += 16 * dst->stride;
- mask_16x16 >>= 8;
- mask_8x8 >>= 8;
- mask_4x4 >>= 8;
- mask_4x4_int >>= 8;
- }
+ if (mi_row + r == 0) {
+ mask_16x16_r = 0;
+ mask_8x8_r = 0;
+ mask_4x4_r = 0;
+ } else {
+ mask_16x16_r = mask_16x16 & 0xf;
+ mask_8x8_r = mask_8x8 & 0xf;
+ mask_4x4_r = mask_4x4 & 0xf;
}
- // Horizontal pass
- dst->buf = dst0;
- mask_16x16 = lfm->above_uv[TX_16X16];
- mask_8x8 = lfm->above_uv[TX_8X8];
- mask_4x4 = lfm->above_uv[TX_4X4];
- mask_4x4_int = lfm->int_4x4_uv;
-
- for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += 2) {
- const int skip_border_4x4_r = mi_row + r == cm->mi_rows - 1;
- const unsigned int mask_4x4_int_r = skip_border_4x4_r ?
- 0 : (mask_4x4_int & 0xf);
- unsigned int mask_16x16_r;
- unsigned int mask_8x8_r;
- unsigned int mask_4x4_r;
-
- if (mi_row + r == 0) {
- mask_16x16_r = 0;
- mask_8x8_r = 0;
- mask_4x4_r = 0;
- } else {
- mask_16x16_r = mask_16x16 & 0xf;
- mask_8x8_r = mask_8x8 & 0xf;
- mask_4x4_r = mask_4x4 & 0xf;
- }
-
#if CONFIG_VP9_HIGHBITDEPTH
- if (cm->use_highbitdepth) {
- highbd_filter_selectively_horiz(CONVERT_TO_SHORTPTR(dst->buf),
- dst->stride,
- mask_16x16_r,
- mask_8x8_r,
- mask_4x4_r,
- mask_4x4_int_r,
- &cm->lf_info,
- &lfm->lfl_uv[r << 1],
- (int)cm->bit_depth);
- } else {
- filter_selectively_horiz(dst->buf, dst->stride,
- mask_16x16_r,
- mask_8x8_r,
- mask_4x4_r,
- mask_4x4_int_r,
- &cm->lf_info,
- &lfm->lfl_uv[r << 1]);
- }
-#else
- filter_selectively_horiz(dst->buf, dst->stride,
- mask_16x16_r,
- mask_8x8_r,
- mask_4x4_r,
- mask_4x4_int_r,
- &cm->lf_info,
+ if (cm->use_highbitdepth) {
+ highbd_filter_selectively_horiz(CONVERT_TO_SHORTPTR(dst->buf),
+ dst->stride, mask_16x16_r, mask_8x8_r,
+ mask_4x4_r, mask_4x4_int_r, &cm->lf_info,
+ &lfm->lfl_uv[r << 1], (int)cm->bit_depth);
+ } else {
+ filter_selectively_horiz(dst->buf, dst->stride, mask_16x16_r, mask_8x8_r,
+ mask_4x4_r, mask_4x4_int_r, &cm->lf_info,
&lfm->lfl_uv[r << 1]);
+ }
+#else
+ filter_selectively_horiz(dst->buf, dst->stride, mask_16x16_r, mask_8x8_r,
+ mask_4x4_r, mask_4x4_int_r, &cm->lf_info,
+ &lfm->lfl_uv[r << 1]);
#endif // CONFIG_VP9_HIGHBITDEPTH
- dst->buf += 8 * dst->stride;
- mask_16x16 >>= 4;
- mask_8x8 >>= 4;
- mask_4x4 >>= 4;
- mask_4x4_int >>= 4;
- }
+ dst->buf += 8 * dst->stride;
+ mask_16x16 >>= 4;
+ mask_8x8 >>= 4;
+ mask_4x4 >>= 4;
+ mask_4x4_int >>= 4;
}
}
@@ -1576,13 +1531,21 @@ void vp9_loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer,
struct macroblockd_plane planes[MAX_MB_PLANE],
int start, int stop, int y_only) {
const int num_planes = y_only ? 1 : MAX_MB_PLANE;
- const int use_420 = y_only || (planes[1].subsampling_y == 1 &&
- planes[1].subsampling_x == 1);
+ enum lf_path path;
LOOP_FILTER_MASK lfm;
int mi_row, mi_col;
+ if (y_only)
+ path = LF_PATH_444;
+ else if (planes[1].subsampling_y == 1 && planes[1].subsampling_x == 1)
+ path = LF_PATH_420;
+ else if (planes[1].subsampling_y == 0 && planes[1].subsampling_x == 0)
+ path = LF_PATH_444;
+ else
+ path = LF_PATH_SLOW;
+
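+  // 4:2:0 chroma (subsampling 1,1) takes the ss11 path, 4:4:4 and y_only
+  // filtering take the ss00 path, and any other layout (e.g. 4:2:2) falls
+  // back to the slow non-420 filter.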
for (mi_row = start; mi_row < stop; mi_row += MI_BLOCK_SIZE) {
- MODE_INFO *mi = cm->mi + mi_row * cm->mi_stride;
+ MODE_INFO **mi = cm->mi_grid_visible + mi_row * cm->mi_stride;
for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MI_BLOCK_SIZE) {
int plane;
@@ -1590,16 +1553,23 @@ void vp9_loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer,
vp9_setup_dst_planes(planes, frame_buffer, mi_row, mi_col);
// TODO(JBB): Make setup_mask work for non 420.
- if (use_420)
- vp9_setup_mask(cm, mi_row, mi_col, mi + mi_col, cm->mi_stride,
- &lfm);
-
- for (plane = 0; plane < num_planes; ++plane) {
- if (use_420)
- vp9_filter_block_plane(cm, &planes[plane], mi_row, &lfm);
- else
- filter_block_plane_non420(cm, &planes[plane], mi + mi_col,
- mi_row, mi_col);
+ vp9_setup_mask(cm, mi_row, mi_col, mi + mi_col, cm->mi_stride,
+ &lfm);
+
+ vp9_filter_block_plane_ss00(cm, &planes[0], mi_row, &lfm);
+ for (plane = 1; plane < num_planes; ++plane) {
+ switch (path) {
+ case LF_PATH_420:
+ vp9_filter_block_plane_ss11(cm, &planes[plane], mi_row, &lfm);
+ break;
+ case LF_PATH_444:
+ vp9_filter_block_plane_ss00(cm, &planes[plane], mi_row, &lfm);
+ break;
+ case LF_PATH_SLOW:
+ vp9_filter_block_plane_non420(cm, &planes[plane], mi + mi_col,
+ mi_row, mi_col);
+ break;
+ }
}
}
}
@@ -1625,6 +1595,17 @@ void vp9_loop_filter_frame(YV12_BUFFER_CONFIG *frame,
y_only);
}
+void vp9_loop_filter_data_reset(
+ LFWorkerData *lf_data, YV12_BUFFER_CONFIG *frame_buffer,
+ struct VP9Common *cm, const struct macroblockd_plane planes[MAX_MB_PLANE]) {
+ lf_data->frame_buffer = frame_buffer;
+ lf_data->cm = cm;
+ lf_data->start = 0;
+ lf_data->stop = 0;
+ lf_data->y_only = 0;
+ memcpy(lf_data->planes, planes, sizeof(lf_data->planes));
+}
+
int vp9_loop_filter_worker(LFWorkerData *const lf_data, void *unused) {
(void)unused;
vp9_loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes,
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_loopfilter.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_loopfilter.h
index 0ede58ae481..f7cbde678de 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_loopfilter.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_loopfilter.h
@@ -29,6 +29,12 @@ extern "C" {
#define MAX_REF_LF_DELTAS 4
#define MAX_MODE_LF_DELTAS 2
+enum lf_path {
+ LF_PATH_420,
+ LF_PATH_444,
+ LF_PATH_SLOW,
+};
+
struct loopfilter {
int filter_level;
@@ -89,13 +95,23 @@ struct VP9LfSyncData;
// by mi_row, mi_col.
void vp9_setup_mask(struct VP9Common *const cm,
const int mi_row, const int mi_col,
- MODE_INFO *mi_8x8, const int mode_info_stride,
+ MODE_INFO **mi_8x8, const int mode_info_stride,
LOOP_FILTER_MASK *lfm);
-void vp9_filter_block_plane(struct VP9Common *const cm,
- struct macroblockd_plane *const plane,
- int mi_row,
- LOOP_FILTER_MASK *lfm);
+void vp9_filter_block_plane_ss00(struct VP9Common *const cm,
+ struct macroblockd_plane *const plane,
+ int mi_row,
+ LOOP_FILTER_MASK *lfm);
+
+void vp9_filter_block_plane_ss11(struct VP9Common *const cm,
+ struct macroblockd_plane *const plane,
+ int mi_row,
+ LOOP_FILTER_MASK *lfm);
+
+void vp9_filter_block_plane_non420(struct VP9Common *cm,
+ struct macroblockd_plane *plane,
+ MODE_INFO **mi_8x8,
+ int mi_row, int mi_col);
void vp9_loop_filter_init(struct VP9Common *cm);
@@ -124,11 +140,12 @@ typedef struct LoopFilterWorkerData {
int start;
int stop;
int y_only;
-
- struct VP9LfSyncData *lf_sync;
- int num_lf_workers;
} LFWorkerData;
+void vp9_loop_filter_data_reset(
+ LFWorkerData *lf_data, YV12_BUFFER_CONFIG *frame_buffer,
+ struct VP9Common *cm, const struct macroblockd_plane planes[MAX_MB_PLANE]);
+
// Operates on the rows described by 'lf_data'.
int vp9_loop_filter_worker(LFWorkerData *const lf_data, void *unused);
#ifdef __cplusplus
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_mfqe.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_mfqe.c
new file mode 100644
index 00000000000..57189df16ec
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_mfqe.c
@@ -0,0 +1,394 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_scale_rtcd.h"
+
+#include "vp9/common/vp9_onyxc_int.h"
+#include "vp9/common/vp9_postproc.h"
+
+// TODO(jackychen): Replace this function with SSE2 code. There is
+// one SSE2 implementation in vp8, so we will consider how to share
+// it between vp8 and vp9.
+static void filter_by_weight(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride,
+ int block_size, int src_weight) {
+ const int dst_weight = (1 << MFQE_PRECISION) - src_weight;
+ const int rounding_bit = 1 << (MFQE_PRECISION - 1);
+ int r, c;
+
+ for (r = 0; r < block_size; r++) {
+ for (c = 0; c < block_size; c++) {
+ dst[c] = (src[c] * src_weight + dst[c] * dst_weight + rounding_bit)
+ >> MFQE_PRECISION;
+ }
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
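+
+// Worked example (assuming MFQE_PRECISION == 4, which matches the callers
+// below): src_weight == 12 gives dst_weight == 4 and rounding_bit == 8, so
+// each output pixel becomes (src * 12 + dst * 4 + 8) >> 4, i.e. a 3:1 blend
+// in favor of the source block.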
+
+void vp9_filter_by_weight8x8_c(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride, int src_weight) {
+ filter_by_weight(src, src_stride, dst, dst_stride, 8, src_weight);
+}
+
+void vp9_filter_by_weight16x16_c(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride,
+ int src_weight) {
+ filter_by_weight(src, src_stride, dst, dst_stride, 16, src_weight);
+}
+
+static void filter_by_weight32x32(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride, int weight) {
+ vp9_filter_by_weight16x16(src, src_stride, dst, dst_stride, weight);
+ vp9_filter_by_weight16x16(src + 16, src_stride, dst + 16, dst_stride,
+ weight);
+ vp9_filter_by_weight16x16(src + src_stride * 16, src_stride,
+ dst + dst_stride * 16, dst_stride, weight);
+ vp9_filter_by_weight16x16(src + src_stride * 16 + 16, src_stride,
+ dst + dst_stride * 16 + 16, dst_stride, weight);
+}
+
+static void filter_by_weight64x64(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride, int weight) {
+ filter_by_weight32x32(src, src_stride, dst, dst_stride, weight);
+ filter_by_weight32x32(src + 32, src_stride, dst + 32,
+ dst_stride, weight);
+ filter_by_weight32x32(src + src_stride * 32, src_stride,
+ dst + dst_stride * 32, dst_stride, weight);
+ filter_by_weight32x32(src + src_stride * 32 + 32, src_stride,
+ dst + dst_stride * 32 + 32, dst_stride, weight);
+}
+
+static void apply_ifactor(const uint8_t *y, int y_stride, uint8_t *yd,
+ int yd_stride, const uint8_t *u, const uint8_t *v,
+ int uv_stride, uint8_t *ud, uint8_t *vd,
+ int uvd_stride, BLOCK_SIZE block_size,
+ int weight) {
+ if (block_size == BLOCK_16X16) {
+ vp9_filter_by_weight16x16(y, y_stride, yd, yd_stride, weight);
+ vp9_filter_by_weight8x8(u, uv_stride, ud, uvd_stride, weight);
+ vp9_filter_by_weight8x8(v, uv_stride, vd, uvd_stride, weight);
+ } else if (block_size == BLOCK_32X32) {
+ filter_by_weight32x32(y, y_stride, yd, yd_stride, weight);
+ vp9_filter_by_weight16x16(u, uv_stride, ud, uvd_stride, weight);
+ vp9_filter_by_weight16x16(v, uv_stride, vd, uvd_stride, weight);
+ } else if (block_size == BLOCK_64X64) {
+ filter_by_weight64x64(y, y_stride, yd, yd_stride, weight);
+ filter_by_weight32x32(u, uv_stride, ud, uvd_stride, weight);
+ filter_by_weight32x32(v, uv_stride, vd, uvd_stride, weight);
+ }
+}
+
+// TODO(jackychen): Determine whether to replace this with assembly code.
+static void copy_mem8x8(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride) {
+ int r;
+ for (r = 0; r < 8; r++) {
+ memcpy(dst, src, 8);
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+static void copy_mem16x16(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride) {
+ int r;
+ for (r = 0; r < 16; r++) {
+ memcpy(dst, src, 16);
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+static void copy_mem32x32(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride) {
+ copy_mem16x16(src, src_stride, dst, dst_stride);
+ copy_mem16x16(src + 16, src_stride, dst + 16, dst_stride);
+ copy_mem16x16(src + src_stride * 16, src_stride,
+ dst + dst_stride * 16, dst_stride);
+ copy_mem16x16(src + src_stride * 16 + 16, src_stride,
+ dst + dst_stride * 16 + 16, dst_stride);
+}
+
+static void copy_mem64x64(const uint8_t *src, int src_stride,
+                          uint8_t *dst, int dst_stride) {
+  copy_mem32x32(src, src_stride, dst, dst_stride);
+  copy_mem32x32(src + 32, src_stride, dst + 32, dst_stride);
+  copy_mem32x32(src + src_stride * 32, src_stride,
+                dst + dst_stride * 32, dst_stride);
+  copy_mem32x32(src + src_stride * 32 + 32, src_stride,
+                dst + dst_stride * 32 + 32, dst_stride);
+}
+
+static void copy_block(const uint8_t *y, const uint8_t *u, const uint8_t *v,
+ int y_stride, int uv_stride, uint8_t *yd, uint8_t *ud,
+ uint8_t *vd, int yd_stride, int uvd_stride,
+ BLOCK_SIZE bs) {
+ if (bs == BLOCK_16X16) {
+ copy_mem16x16(y, y_stride, yd, yd_stride);
+ copy_mem8x8(u, uv_stride, ud, uvd_stride);
+ copy_mem8x8(v, uv_stride, vd, uvd_stride);
+ } else if (bs == BLOCK_32X32) {
+ copy_mem32x32(y, y_stride, yd, yd_stride);
+ copy_mem16x16(u, uv_stride, ud, uvd_stride);
+ copy_mem16x16(v, uv_stride, vd, uvd_stride);
+ } else {
+ copy_mem64x64(y, y_stride, yd, yd_stride);
+ copy_mem32x32(u, uv_stride, ud, uvd_stride);
+ copy_mem32x32(v, uv_stride, vd, uvd_stride);
+ }
+}
+
+static void get_thr(BLOCK_SIZE bs, int qdiff, int *sad_thr, int *vdiff_thr) {
+ const int adj = qdiff >> MFQE_PRECISION;
+ if (bs == BLOCK_16X16) {
+ *sad_thr = 7 + adj;
+ } else if (bs == BLOCK_32X32) {
+ *sad_thr = 6 + adj;
+ } else { // BLOCK_64X64
+ *sad_thr = 5 + adj;
+ }
+ *vdiff_thr = 125 + qdiff;
+}
+
+static void mfqe_block(BLOCK_SIZE bs, const uint8_t *y, const uint8_t *u,
+ const uint8_t *v, int y_stride, int uv_stride,
+ uint8_t *yd, uint8_t *ud, uint8_t *vd, int yd_stride,
+ int uvd_stride, int qdiff) {
+ int sad, sad_thr, vdiff, vdiff_thr;
+ uint32_t sse;
+
+ get_thr(bs, qdiff, &sad_thr, &vdiff_thr);
+
+ if (bs == BLOCK_16X16) {
+ vdiff = (vp9_variance16x16(y, y_stride, yd, yd_stride, &sse) + 128) >> 8;
+ sad = (vpx_sad16x16(y, y_stride, yd, yd_stride) + 128) >> 8;
+ } else if (bs == BLOCK_32X32) {
+ vdiff = (vp9_variance32x32(y, y_stride, yd, yd_stride, &sse) + 512) >> 10;
+ sad = (vpx_sad32x32(y, y_stride, yd, yd_stride) + 512) >> 10;
+ } else /* if (bs == BLOCK_64X64) */ {
+ vdiff = (vp9_variance64x64(y, y_stride, yd, yd_stride, &sse) + 2048) >> 12;
+ sad = (vpx_sad64x64(y, y_stride, yd, yd_stride) + 2048) >> 12;
+ }
+
+  // Requiring vdiff > sad * 3 means vdiff should not be too small;
+  // otherwise the difference might just be a lighting change in a smooth
+  // area, and doing MFQE on a lighting change in a smooth area is dangerous.
+ if (sad > 1 && vdiff > sad * 3) {
+ const int weight = 1 << MFQE_PRECISION;
+ int ifactor = weight * sad * vdiff / (sad_thr * vdiff_thr);
+ // When ifactor equals weight, no MFQE is done.
+ if (ifactor > weight) {
+ ifactor = weight;
+ }
+ apply_ifactor(y, y_stride, yd, yd_stride, u, v, uv_stride, ud, vd,
+ uvd_stride, bs, ifactor);
+ } else {
+    // Copy the block from the current frame (i.e., no MFQE is done).
+ copy_block(y, u, v, y_stride, uv_stride, yd, ud, vd,
+ yd_stride, uvd_stride, bs);
+ }
+}
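+
+// Illustrative arithmetic for the ifactor computation above: with weight ==
+// 16 (assuming MFQE_PRECISION == 4), sad == 4, vdiff == 20, sad_thr == 7 and
+// vdiff_thr == 125, ifactor == 16 * 4 * 20 / (7 * 125) == 1, so the output
+// is blended almost entirely from the higher-quality last frame; larger sad
+// or vdiff values push ifactor toward weight, i.e. toward keeping the
+// current frame unchanged.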
+
+static int mfqe_decision(MODE_INFO *mi, BLOCK_SIZE cur_bs) {
+  // Check the motion in the current block (for an inter frame),
+  // or the motion in the correlated block in the last frame (for a keyframe).
+ const int mv_len_square = mi->mbmi.mv[0].as_mv.row *
+ mi->mbmi.mv[0].as_mv.row +
+ mi->mbmi.mv[0].as_mv.col *
+ mi->mbmi.mv[0].as_mv.col;
+ const int mv_threshold = 100;
+ return mi->mbmi.mode >= NEARESTMV && // Not an intra block
+ cur_bs >= BLOCK_16X16 &&
+ mv_len_square <= mv_threshold;
+}
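+
+// Note: VP9 motion vectors are in 1/8-pel units, so mv_threshold == 100
+// accepts squared lengths up to 100, i.e. motion of at most sqrt(100) / 8 ==
+// 1.25 pixels -- effectively stationary blocks.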
+
+// Process each partition in a superblock, recursively.
+static void mfqe_partition(VP9_COMMON *cm, MODE_INFO *mi, BLOCK_SIZE bs,
+ const uint8_t *y, const uint8_t *u,
+ const uint8_t *v, int y_stride, int uv_stride,
+ uint8_t *yd, uint8_t *ud, uint8_t *vd,
+ int yd_stride, int uvd_stride) {
+ int mi_offset, y_offset, uv_offset;
+ const BLOCK_SIZE cur_bs = mi->mbmi.sb_type;
+ const int qdiff = cm->base_qindex - cm->postproc_state.last_base_qindex;
+ const int bsl = b_width_log2_lookup[bs];
+ PARTITION_TYPE partition = partition_lookup[bsl][cur_bs];
+ const BLOCK_SIZE subsize = get_subsize(bs, partition);
+
+ if (cur_bs < BLOCK_8X8) {
+    // If there are blocks smaller than 8x8, they must be on the boundary.
+ return;
+ }
+ // No MFQE on blocks smaller than 16x16
+ if (bs == BLOCK_16X16) {
+ partition = PARTITION_NONE;
+ }
+ if (bs == BLOCK_64X64) {
+ mi_offset = 4;
+ y_offset = 32;
+ uv_offset = 16;
+ } else {
+ mi_offset = 2;
+ y_offset = 16;
+ uv_offset = 8;
+ }
+ switch (partition) {
+ BLOCK_SIZE mfqe_bs, bs_tmp;
+ case PARTITION_HORZ:
+ if (bs == BLOCK_64X64) {
+ mfqe_bs = BLOCK_64X32;
+ bs_tmp = BLOCK_32X32;
+ } else {
+ mfqe_bs = BLOCK_32X16;
+ bs_tmp = BLOCK_16X16;
+ }
+ if (mfqe_decision(mi, mfqe_bs)) {
+ // Do mfqe on the first square partition.
+ mfqe_block(bs_tmp, y, u, v, y_stride, uv_stride,
+ yd, ud, vd, yd_stride, uvd_stride, qdiff);
+ // Do mfqe on the second square partition.
+ mfqe_block(bs_tmp, y + y_offset, u + uv_offset, v + uv_offset,
+ y_stride, uv_stride, yd + y_offset, ud + uv_offset,
+ vd + uv_offset, yd_stride, uvd_stride, qdiff);
+ }
+ if (mfqe_decision(mi + mi_offset * cm->mi_stride, mfqe_bs)) {
+ // Do mfqe on the first square partition.
+ mfqe_block(bs_tmp, y + y_offset * y_stride, u + uv_offset * uv_stride,
+ v + uv_offset * uv_stride, y_stride, uv_stride,
+ yd + y_offset * yd_stride, ud + uv_offset * uvd_stride,
+ vd + uv_offset * uvd_stride, yd_stride, uvd_stride, qdiff);
+ // Do mfqe on the second square partition.
+ mfqe_block(bs_tmp, y + y_offset * y_stride + y_offset,
+ u + uv_offset * uv_stride + uv_offset,
+ v + uv_offset * uv_stride + uv_offset, y_stride,
+ uv_stride, yd + y_offset * yd_stride + y_offset,
+ ud + uv_offset * uvd_stride + uv_offset,
+ vd + uv_offset * uvd_stride + uv_offset,
+ yd_stride, uvd_stride, qdiff);
+ }
+ break;
+ case PARTITION_VERT:
+ if (bs == BLOCK_64X64) {
+ mfqe_bs = BLOCK_32X64;
+ bs_tmp = BLOCK_32X32;
+ } else {
+ mfqe_bs = BLOCK_16X32;
+ bs_tmp = BLOCK_16X16;
+ }
+ if (mfqe_decision(mi, mfqe_bs)) {
+ // Do mfqe on the first square partition.
+ mfqe_block(bs_tmp, y, u, v, y_stride, uv_stride,
+ yd, ud, vd, yd_stride, uvd_stride, qdiff);
+ // Do mfqe on the second square partition.
+ mfqe_block(bs_tmp, y + y_offset * y_stride, u + uv_offset * uv_stride,
+ v + uv_offset * uv_stride, y_stride, uv_stride,
+ yd + y_offset * yd_stride, ud + uv_offset * uvd_stride,
+ vd + uv_offset * uvd_stride, yd_stride, uvd_stride, qdiff);
+ }
+ if (mfqe_decision(mi + mi_offset, mfqe_bs)) {
+ // Do mfqe on the first square partition.
+ mfqe_block(bs_tmp, y + y_offset, u + uv_offset, v + uv_offset,
+ y_stride, uv_stride, yd + y_offset, ud + uv_offset,
+ vd + uv_offset, yd_stride, uvd_stride, qdiff);
+ // Do mfqe on the second square partition.
+ mfqe_block(bs_tmp, y + y_offset * y_stride + y_offset,
+ u + uv_offset * uv_stride + uv_offset,
+ v + uv_offset * uv_stride + uv_offset, y_stride,
+ uv_stride, yd + y_offset * yd_stride + y_offset,
+ ud + uv_offset * uvd_stride + uv_offset,
+ vd + uv_offset * uvd_stride + uv_offset,
+ yd_stride, uvd_stride, qdiff);
+ }
+ break;
+ case PARTITION_NONE:
+ if (mfqe_decision(mi, cur_bs)) {
+ // Do mfqe on this partition.
+ mfqe_block(cur_bs, y, u, v, y_stride, uv_stride,
+ yd, ud, vd, yd_stride, uvd_stride, qdiff);
+ } else {
+        // Copy the block from the current frame (i.e., no MFQE is done).
+ copy_block(y, u, v, y_stride, uv_stride, yd, ud, vd,
+ yd_stride, uvd_stride, bs);
+ }
+ break;
+ case PARTITION_SPLIT:
+ // Recursion on four square partitions, e.g. if bs is 64X64,
+ // then look into four 32X32 blocks in it.
+ mfqe_partition(cm, mi, subsize, y, u, v, y_stride, uv_stride, yd, ud, vd,
+ yd_stride, uvd_stride);
+ mfqe_partition(cm, mi + mi_offset, subsize, y + y_offset, u + uv_offset,
+ v + uv_offset, y_stride, uv_stride, yd + y_offset,
+ ud + uv_offset, vd + uv_offset, yd_stride, uvd_stride);
+ mfqe_partition(cm, mi + mi_offset * cm->mi_stride, subsize,
+ y + y_offset * y_stride, u + uv_offset * uv_stride,
+ v + uv_offset * uv_stride, y_stride, uv_stride,
+ yd + y_offset * yd_stride, ud + uv_offset * uvd_stride,
+ vd + uv_offset * uvd_stride, yd_stride, uvd_stride);
+ mfqe_partition(cm, mi + mi_offset * cm->mi_stride + mi_offset,
+ subsize, y + y_offset * y_stride + y_offset,
+ u + uv_offset * uv_stride + uv_offset,
+ v + uv_offset * uv_stride + uv_offset, y_stride,
+ uv_stride, yd + y_offset * yd_stride + y_offset,
+ ud + uv_offset * uvd_stride + uv_offset,
+ vd + uv_offset * uvd_stride + uv_offset,
+ yd_stride, uvd_stride);
+ break;
+ default:
+ assert(0);
+ }
+}
+
+void vp9_mfqe(VP9_COMMON *cm) {
+ int mi_row, mi_col;
+ // Current decoded frame.
+ const YV12_BUFFER_CONFIG *show = cm->frame_to_show;
+  // The last decoded frame; it will also store the MFQE result.
+ YV12_BUFFER_CONFIG *dest = &cm->post_proc_buffer;
+ // Loop through each super block.
+ for (mi_row = 0; mi_row < cm->mi_rows; mi_row += MI_BLOCK_SIZE) {
+ for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MI_BLOCK_SIZE) {
+ MODE_INFO *mi;
+ MODE_INFO *mi_local = cm->mi + (mi_row * cm->mi_stride + mi_col);
+      // Motion info from the last frame.
+ MODE_INFO *mi_prev = cm->postproc_state.prev_mi +
+ (mi_row * cm->mi_stride + mi_col);
+ const uint32_t y_stride = show->y_stride;
+ const uint32_t uv_stride = show->uv_stride;
+ const uint32_t yd_stride = dest->y_stride;
+ const uint32_t uvd_stride = dest->uv_stride;
+ const uint32_t row_offset_y = mi_row << 3;
+ const uint32_t row_offset_uv = mi_row << 2;
+ const uint32_t col_offset_y = mi_col << 3;
+ const uint32_t col_offset_uv = mi_col << 2;
+ const uint8_t *y = show->y_buffer + row_offset_y * y_stride +
+ col_offset_y;
+ const uint8_t *u = show->u_buffer + row_offset_uv * uv_stride +
+ col_offset_uv;
+ const uint8_t *v = show->v_buffer + row_offset_uv * uv_stride +
+ col_offset_uv;
+ uint8_t *yd = dest->y_buffer + row_offset_y * yd_stride + col_offset_y;
+ uint8_t *ud = dest->u_buffer + row_offset_uv * uvd_stride +
+ col_offset_uv;
+ uint8_t *vd = dest->v_buffer + row_offset_uv * uvd_stride +
+ col_offset_uv;
+ if (frame_is_intra_only(cm)) {
+ mi = mi_prev;
+ } else {
+ mi = mi_local;
+ }
+ mfqe_partition(cm, mi, BLOCK_64X64, y, u, v, y_stride, uv_stride, yd, ud,
+ vd, yd_stride, uvd_stride);
+ }
+ }
+}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_mfqe.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_mfqe.h
new file mode 100644
index 00000000000..dfff8c23d65
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_mfqe.h
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_COMMON_VP9_MFQE_H_
+#define VP9_COMMON_VP9_MFQE_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Multiframe Quality Enhancement.
+// The aim for MFQE is to replace pixel blocks in the current frame with
+// the correlated pixel blocks (with higher quality) in the last frame.
+// The replacement is only applied to stationary blocks, selected by
+// checking the motion of the blocks and other conditions such as the
+// SAD between the current block and the correlated block, the variance
+// of the block difference, etc.
+void vp9_mfqe(struct VP9Common *cm);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VP9_COMMON_VP9_MFQE_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_mvref_common.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_mvref_common.c
index 3b34050a840..51e147e0056 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_mvref_common.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_mvref_common.c
@@ -17,19 +17,18 @@ static void find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd,
const TileInfo *const tile,
MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame,
int_mv *mv_ref_list,
- int block, int mi_row, int mi_col) {
+ int block, int mi_row, int mi_col,
+ find_mv_refs_sync sync, void *const data) {
const int *ref_sign_bias = cm->ref_frame_sign_bias;
int i, refmv_count = 0;
- const MODE_INFO *prev_mi = !cm->error_resilient_mode && cm->prev_mi
- ? cm->prev_mi[mi_row * xd->mi_stride + mi_col].src_mi
- : NULL;
- const MB_MODE_INFO *const prev_mbmi = prev_mi ? &prev_mi->src_mi->mbmi : NULL;
const POSITION *const mv_ref_search = mv_ref_blocks[mi->mbmi.sb_type];
int different_ref_found = 0;
int context_counter = 0;
+ const MV_REF *const prev_frame_mvs = cm->use_prev_frame_mvs ?
+ cm->prev_frame->mvs + mi_row * cm->mi_cols + mi_col : NULL;
// Blank the reference vector list
- vpx_memset(mv_ref_list, 0, sizeof(*mv_ref_list) * MAX_MV_REF_CANDIDATES);
+ memset(mv_ref_list, 0, sizeof(*mv_ref_list) * MAX_MV_REF_CANDIDATES);
// The nearest 2 blocks are treated differently
// if the size < 8x8 we get the mv from the bmi substructure,
@@ -38,16 +37,18 @@ static void find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd,
const POSITION *const mv_ref = &mv_ref_search[i];
if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) {
const MODE_INFO *const candidate_mi = xd->mi[mv_ref->col + mv_ref->row *
- xd->mi_stride].src_mi;
+ xd->mi_stride];
const MB_MODE_INFO *const candidate = &candidate_mi->mbmi;
// Keep counts for entropy encoding.
context_counter += mode_2_counter[candidate->mode];
different_ref_found = 1;
if (candidate->ref_frame[0] == ref_frame)
- ADD_MV_REF_LIST(get_sub_block_mv(candidate_mi, 0, mv_ref->col, block));
+ ADD_MV_REF_LIST(get_sub_block_mv(candidate_mi, 0, mv_ref->col, block),
+ refmv_count, mv_ref_list, Done);
else if (candidate->ref_frame[1] == ref_frame)
- ADD_MV_REF_LIST(get_sub_block_mv(candidate_mi, 1, mv_ref->col, block));
+ ADD_MV_REF_LIST(get_sub_block_mv(candidate_mi, 1, mv_ref->col, block),
+ refmv_count, mv_ref_list, Done);
}
}
@@ -58,22 +59,38 @@ static void find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd,
const POSITION *const mv_ref = &mv_ref_search[i];
if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) {
const MB_MODE_INFO *const candidate = &xd->mi[mv_ref->col + mv_ref->row *
- xd->mi_stride].src_mi->mbmi;
+ xd->mi_stride]->mbmi;
different_ref_found = 1;
if (candidate->ref_frame[0] == ref_frame)
- ADD_MV_REF_LIST(candidate->mv[0]);
+ ADD_MV_REF_LIST(candidate->mv[0], refmv_count, mv_ref_list, Done);
else if (candidate->ref_frame[1] == ref_frame)
- ADD_MV_REF_LIST(candidate->mv[1]);
+ ADD_MV_REF_LIST(candidate->mv[1], refmv_count, mv_ref_list, Done);
}
}
+  // TODO(hkuang): Remove this sync after fixing pthread_cond_broadcast
+  // on the Windows platform. The sync here is unnecessary if
+  // use_prev_frame_mvs is 0. But after removing it, the unit test hangs on
+  // Windows because several threads end up waiting for a thread's signal.
+#if defined(_WIN32) && !HAVE_PTHREAD_H
+ if (cm->frame_parallel_decode && sync != NULL) {
+ sync(data, mi_row);
+ }
+#endif
+
// Check the last frame's mode and mv info.
- if (prev_mbmi) {
- if (prev_mbmi->ref_frame[0] == ref_frame)
- ADD_MV_REF_LIST(prev_mbmi->mv[0]);
- else if (prev_mbmi->ref_frame[1] == ref_frame)
- ADD_MV_REF_LIST(prev_mbmi->mv[1]);
+ if (cm->use_prev_frame_mvs) {
+ // Synchronize here for frame parallel decode if sync function is provided.
+ if (cm->frame_parallel_decode && sync != NULL) {
+ sync(data, mi_row);
+ }
+
+ if (prev_frame_mvs->ref_frame[0] == ref_frame) {
+ ADD_MV_REF_LIST(prev_frame_mvs->mv[0], refmv_count, mv_ref_list, Done);
+ } else if (prev_frame_mvs->ref_frame[1] == ref_frame) {
+ ADD_MV_REF_LIST(prev_frame_mvs->mv[1], refmv_count, mv_ref_list, Done);
+ }
}
// Since we couldn't find 2 mvs from the same reference frame
@@ -84,17 +101,40 @@ static void find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd,
const POSITION *mv_ref = &mv_ref_search[i];
if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) {
const MB_MODE_INFO *const candidate = &xd->mi[mv_ref->col + mv_ref->row
- * xd->mi_stride].src_mi->mbmi;
+ * xd->mi_stride]->mbmi;
// If the candidate is INTRA we don't want to consider its mv.
- IF_DIFF_REF_FRAME_ADD_MV(candidate);
+ IF_DIFF_REF_FRAME_ADD_MV(candidate, ref_frame, ref_sign_bias,
+ refmv_count, mv_ref_list, Done);
}
}
}
// Since we still don't have a candidate we'll try the last frame.
- if (prev_mbmi)
- IF_DIFF_REF_FRAME_ADD_MV(prev_mbmi);
+ if (cm->use_prev_frame_mvs) {
+ if (prev_frame_mvs->ref_frame[0] != ref_frame &&
+ prev_frame_mvs->ref_frame[0] > INTRA_FRAME) {
+ int_mv mv = prev_frame_mvs->mv[0];
+ if (ref_sign_bias[prev_frame_mvs->ref_frame[0]] !=
+ ref_sign_bias[ref_frame]) {
+ mv.as_mv.row *= -1;
+ mv.as_mv.col *= -1;
+ }
+ ADD_MV_REF_LIST(mv, refmv_count, mv_ref_list, Done);
+ }
+
+ if (prev_frame_mvs->ref_frame[1] > INTRA_FRAME &&
+ prev_frame_mvs->ref_frame[1] != ref_frame &&
+ prev_frame_mvs->mv[1].as_int != prev_frame_mvs->mv[0].as_int) {
+ int_mv mv = prev_frame_mvs->mv[1];
+ if (ref_sign_bias[prev_frame_mvs->ref_frame[1]] !=
+ ref_sign_bias[ref_frame]) {
+ mv.as_mv.row *= -1;
+ mv.as_mv.col *= -1;
+ }
+ ADD_MV_REF_LIST(mv, refmv_count, mv_ref_list, Done);
+ }
+ }
Done:
@@ -109,9 +149,10 @@ void vp9_find_mv_refs(const VP9_COMMON *cm, const MACROBLOCKD *xd,
const TileInfo *const tile,
MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame,
int_mv *mv_ref_list,
- int mi_row, int mi_col) {
+ int mi_row, int mi_col,
+ find_mv_refs_sync sync, void *const data) {
find_mv_refs_idx(cm, xd, tile, mi, ref_frame, mv_ref_list, -1,
- mi_row, mi_col);
+ mi_row, mi_col, sync, data);
}
static void lower_mv_precision(MV *mv, int allow_hp) {
@@ -125,43 +166,44 @@ static void lower_mv_precision(MV *mv, int allow_hp) {
}
void vp9_find_best_ref_mvs(MACROBLOCKD *xd, int allow_hp,
- int_mv *mvlist, int_mv *nearest, int_mv *near) {
+ int_mv *mvlist, int_mv *nearest_mv,
+ int_mv *near_mv) {
int i;
// Make sure all the candidates are properly clamped etc
for (i = 0; i < MAX_MV_REF_CANDIDATES; ++i) {
lower_mv_precision(&mvlist[i].as_mv, allow_hp);
clamp_mv2(&mvlist[i].as_mv, xd);
}
- *nearest = mvlist[0];
- *near = mvlist[1];
+ *nearest_mv = mvlist[0];
+ *near_mv = mvlist[1];
}
void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd,
const TileInfo *const tile,
int block, int ref, int mi_row, int mi_col,
- int_mv *nearest, int_mv *near) {
+ int_mv *nearest_mv, int_mv *near_mv) {
int_mv mv_list[MAX_MV_REF_CANDIDATES];
- MODE_INFO *const mi = xd->mi[0].src_mi;
+ MODE_INFO *const mi = xd->mi[0];
b_mode_info *bmi = mi->bmi;
int n;
assert(MAX_MV_REF_CANDIDATES == 2);
find_mv_refs_idx(cm, xd, tile, mi, mi->mbmi.ref_frame[ref], mv_list, block,
- mi_row, mi_col);
+ mi_row, mi_col, NULL, NULL);
- near->as_int = 0;
+ near_mv->as_int = 0;
switch (block) {
case 0:
- nearest->as_int = mv_list[0].as_int;
- near->as_int = mv_list[1].as_int;
+ nearest_mv->as_int = mv_list[0].as_int;
+ near_mv->as_int = mv_list[1].as_int;
break;
case 1:
case 2:
- nearest->as_int = bmi[0].as_mv[ref].as_int;
+ nearest_mv->as_int = bmi[0].as_mv[ref].as_int;
for (n = 0; n < MAX_MV_REF_CANDIDATES; ++n)
- if (nearest->as_int != mv_list[n].as_int) {
- near->as_int = mv_list[n].as_int;
+ if (nearest_mv->as_int != mv_list[n].as_int) {
+ near_mv->as_int = mv_list[n].as_int;
break;
}
break;
@@ -172,10 +214,10 @@ void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd,
candidates[2] = mv_list[0];
candidates[3] = mv_list[1];
- nearest->as_int = bmi[2].as_mv[ref].as_int;
+ nearest_mv->as_int = bmi[2].as_mv[ref].as_int;
for (n = 0; n < 2 + MAX_MV_REF_CANDIDATES; ++n)
- if (nearest->as_int != candidates[n].as_int) {
- near->as_int = candidates[n].as_int;
+ if (nearest_mv->as_int != candidates[n].as_int) {
+ near_mv->as_int = candidates[n].as_int;
break;
}
break;
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_mvref_common.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_mvref_common.h
index a937b7823a5..f1df521468f 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_mvref_common.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_mvref_common.h
@@ -158,29 +158,32 @@ static INLINE int_mv scale_mv(const MB_MODE_INFO *mbmi, int ref,
// This macro is used to add a motion vector to the mv_ref list if it isn't
// already in the list. If it's the second motion vector it will also
// skip all additional processing and jump to Done!
-#define ADD_MV_REF_LIST(mv) \
+#define ADD_MV_REF_LIST(mv, refmv_count, mv_ref_list, Done) \
do { \
if (refmv_count) { \
- if ((mv).as_int != mv_ref_list[0].as_int) { \
- mv_ref_list[refmv_count] = (mv); \
+ if ((mv).as_int != (mv_ref_list)[0].as_int) { \
+ (mv_ref_list)[(refmv_count)] = (mv); \
goto Done; \
} \
} else { \
- mv_ref_list[refmv_count++] = (mv); \
+ (mv_ref_list)[(refmv_count)++] = (mv); \
} \
} while (0)
// If either reference frame is different, not INTRA, and the mvs are
// different from each other, scale and add the mv to our list.
-#define IF_DIFF_REF_FRAME_ADD_MV(mbmi) \
+#define IF_DIFF_REF_FRAME_ADD_MV(mbmi, ref_frame, ref_sign_bias, refmv_count, \
+ mv_ref_list, Done) \
do { \
if (is_inter_block(mbmi)) { \
if ((mbmi)->ref_frame[0] != ref_frame) \
- ADD_MV_REF_LIST(scale_mv((mbmi), 0, ref_frame, ref_sign_bias)); \
+ ADD_MV_REF_LIST(scale_mv((mbmi), 0, ref_frame, ref_sign_bias), \
+ refmv_count, mv_ref_list, Done); \
if (has_second_ref(mbmi) && \
(mbmi)->ref_frame[1] != ref_frame && \
(mbmi)->mv[1].as_int != (mbmi)->mv[0].as_int) \
- ADD_MV_REF_LIST(scale_mv((mbmi), 1, ref_frame, ref_sign_bias)); \
+ ADD_MV_REF_LIST(scale_mv((mbmi), 1, ref_frame, ref_sign_bias), \
+ refmv_count, mv_ref_list, Done); \
} \
} while (0)
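+// Illustrative expansion: with refmv_count == 1 and an inter candidate whose
+// first reference frame differs from ref_frame,
+//   IF_DIFF_REF_FRAME_ADD_MV(candidate, ref_frame, ref_sign_bias,
+//                            refmv_count, mv_ref_list, Done);
+// stores the scaled mv[0] as mv_ref_list[1] (if it differs from entry 0) and
+// jumps to Done.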
@@ -204,21 +207,23 @@ static INLINE void clamp_mv2(MV *mv, const MACROBLOCKD *xd) {
xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN);
}
+typedef void (*find_mv_refs_sync)(void *const data, int mi_row);
void vp9_find_mv_refs(const VP9_COMMON *cm, const MACROBLOCKD *xd,
const TileInfo *const tile,
MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame,
- int_mv *mv_ref_list, int mi_row, int mi_col);
+ int_mv *mv_ref_list, int mi_row, int mi_col,
+ find_mv_refs_sync sync, void *const data);
// Check a list of motion vectors by SAD score using a number of rows of
// pixels above and a number of cols of pixels to the left to select the one
// with the best score to use as the ref motion vector.
void vp9_find_best_ref_mvs(MACROBLOCKD *xd, int allow_hp,
- int_mv *mvlist, int_mv *nearest, int_mv *near);
+ int_mv *mvlist, int_mv *nearest_mv, int_mv *near_mv);
void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd,
const TileInfo *const tile,
int block, int ref, int mi_row, int mi_col,
- int_mv *nearest, int_mv *near);
+ int_mv *nearest_mv, int_mv *near_mv);
#ifdef __cplusplus
} // extern "C"
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_onyxc_int.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_onyxc_int.h
index f1eda911737..5179c690633 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_onyxc_int.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_onyxc_int.h
@@ -20,6 +20,7 @@
#include "vp9/common/vp9_entropymode.h"
#include "vp9/common/vp9_frame_buffers.h"
#include "vp9/common/vp9_quant_common.h"
+#include "vp9/common/vp9_thread.h"
#include "vp9/common/vp9_tile_common.h"
#if CONFIG_VP9_POSTPROC
@@ -35,14 +36,19 @@ extern "C" {
#define REF_FRAMES_LOG2 3
#define REF_FRAMES (1 << REF_FRAMES_LOG2)
-// 1 scratch frame for the new frame, 3 for scaled references on the encoder
+// 4 scratch frames for the new frames to support a maximum of 4 cores decoding
+// in parallel, 3 for scaled references on the encoder.
+// TODO(hkuang): Add on-demand frame buffers instead of hardcoding the number
+// of frame buffers.
// TODO(jkoleszar): These 3 extra references could probably come from the
// normal reference pool.
-#define FRAME_BUFFERS (REF_FRAMES + 4)
+#define FRAME_BUFFERS (REF_FRAMES + 7)
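+// That is, REF_FRAMES (8) + 4 decode scratch frames + 3 encoder scaled
+// references == 15 buffers in total.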
#define FRAME_CONTEXTS_LOG2 2
#define FRAME_CONTEXTS (1 << FRAME_CONTEXTS_LOG2)
+#define NUM_PING_PONG_BUFFERS 2
+
extern const struct {
PARTITION_CONTEXT above;
PARTITION_CONTEXT left;
@@ -56,21 +62,55 @@ typedef enum {
REFERENCE_MODES = 3,
} REFERENCE_MODE;
+typedef struct {
+ int_mv mv[2];
+ MV_REFERENCE_FRAME ref_frame[2];
+} MV_REF;
typedef struct {
int ref_count;
+ MV_REF *mvs;
+ int mi_rows;
+ int mi_cols;
vpx_codec_frame_buffer_t raw_frame_buffer;
YV12_BUFFER_CONFIG buf;
+
+ // The Following variables will only be used in frame parallel decode.
+
+ // frame_worker_owner indicates which FrameWorker owns this buffer. NULL means
+ // that no FrameWorker owns, or is decoding, this buffer.
+ VP9Worker *frame_worker_owner;
+
+  // row and col indicate how far the frame has been decoded, in real pixel
+  // units. They are reset to -1 when decoding begins and set to INT_MAX
+  // when the frame is fully decoded.
+ int row;
+ int col;
} RefCntBuffer;
-typedef struct VP9Common {
- struct vpx_internal_error_info error;
+typedef struct BufferPool {
+ // Protect BufferPool from being accessed by several FrameWorkers at
+ // the same time during frame parallel decode.
+ // TODO(hkuang): Try to use atomic variable instead of locking the whole pool.
+#if CONFIG_MULTITHREAD
+ pthread_mutex_t pool_mutex;
+#endif
+
+ // Private data associated with the frame buffer callbacks.
+ void *cb_priv;
- DECLARE_ALIGNED(16, int16_t, y_dequant[QINDEX_RANGE][8]);
- DECLARE_ALIGNED(16, int16_t, uv_dequant[QINDEX_RANGE][8]);
+ vpx_get_frame_buffer_cb_fn_t get_fb_cb;
+ vpx_release_frame_buffer_cb_fn_t release_fb_cb;
+
+ RefCntBuffer frame_bufs[FRAME_BUFFERS];
- COLOR_SPACE color_space;
+ // Frame buffers allocated internally by the codec.
+ InternalFrameBufferList int_frame_buffers;
+} BufferPool;
+typedef struct VP9Common {
+ struct vpx_internal_error_info error;
+ vpx_color_space_t color_space;
int width;
int height;
int display_width;
@@ -89,11 +129,17 @@ typedef struct VP9Common {
#endif
YV12_BUFFER_CONFIG *frame_to_show;
+ RefCntBuffer *prev_frame;
- RefCntBuffer frame_bufs[FRAME_BUFFERS];
+ // TODO(hkuang): Combine this with cur_buf in macroblockd.
+ RefCntBuffer *cur_frame;
int ref_frame_map[REF_FRAMES]; /* maps fb_idx to reference slot */
+ // Prepare ref_frame_map for the next frame.
+ // Only used in frame parallel decode.
+ int next_ref_frame_map[REF_FRAMES];
+
// TODO(jkoleszar): could expand active_ref_idx to 4, with 0 as intra, and
// roll new_fb_idx into it.
@@ -102,7 +148,10 @@ typedef struct VP9Common {
int new_fb_idx;
+#if CONFIG_VP9_POSTPROC
YV12_BUFFER_CONFIG post_proc_buffer;
+ YV12_BUFFER_CONFIG post_proc_buffer_int;
+#endif
FRAME_TYPE last_frame_type; /* last frame's frame type for motion search.*/
FRAME_TYPE frame_type;
@@ -135,22 +184,42 @@ typedef struct VP9Common {
int y_dc_delta_q;
int uv_dc_delta_q;
int uv_ac_delta_q;
+ int16_t y_dequant[MAX_SEGMENTS][2];
+ int16_t uv_dequant[MAX_SEGMENTS][2];
/* We allocate a MODE_INFO struct for each macroblock, together with
an extra row on top and column on the left to simplify prediction. */
-
- int mi_idx;
- int prev_mi_idx;
int mi_alloc_size;
- MODE_INFO *mip_array[2];
-
MODE_INFO *mip; /* Base of allocated array */
MODE_INFO *mi; /* Corresponds to upper left visible macroblock */
+
+ // TODO(agrange): Move prev_mi into encoder structure.
+ // prev_mip and prev_mi will only be allocated in VP9 encoder.
MODE_INFO *prev_mip; /* MODE_INFO array 'mip' from last decoded frame */
MODE_INFO *prev_mi; /* 'mi' from last frame (points into prev_mip) */
+ // Separate mi functions between encoder and decoder.
+ int (*alloc_mi)(struct VP9Common *cm, int mi_size);
+ void (*free_mi)(struct VP9Common *cm);
+ void (*setup_mi)(struct VP9Common *cm);
+
+ // Grid of pointers to 8x8 MODE_INFO structs. Any 8x8 not in the visible
+ // area will be NULL.
+ MODE_INFO **mi_grid_base;
+ MODE_INFO **mi_grid_visible;
+ MODE_INFO **prev_mi_grid_base;
+ MODE_INFO **prev_mi_grid_visible;
+
+ // Whether to use previous frame's motion vectors for prediction.
+ int use_prev_frame_mvs;
+
// Persistent mb segment id map used in prediction.
- unsigned char *last_frame_seg_map;
+ int seg_map_idx;
+ int prev_seg_map_idx;
+
+ uint8_t *seg_map_array[NUM_PING_PONG_BUFFERS];
+ uint8_t *last_frame_seg_map;
+ uint8_t *current_frame_seg_map;
INTERP_FILTER interp_filter;
@@ -163,14 +232,17 @@ typedef struct VP9Common {
struct loopfilter lf;
struct segmentation seg;
+ // TODO(hkuang): Remove this as it is the same as frame_parallel_decode
+ // in pbi.
+ int frame_parallel_decode; // frame-based threading.
+
// Context probabilities for reference frame prediction
- int allow_comp_inter_inter;
MV_REFERENCE_FRAME comp_fixed_ref;
MV_REFERENCE_FRAME comp_var_ref[2];
REFERENCE_MODE reference_mode;
- FRAME_CONTEXT fc; /* this frame entropy */
- FRAME_CONTEXT frame_contexts[FRAME_CONTEXTS];
+ FRAME_CONTEXT *fc; /* this frame entropy */
+ FRAME_CONTEXT *frame_contexts; // FRAME_CONTEXTS
unsigned int frame_context_idx; /* Context to use/update */
FRAME_COUNTS counts;
@@ -189,6 +261,7 @@ typedef struct VP9Common {
int frame_parallel_decoding_mode;
int log2_tile_cols, log2_tile_rows;
+ int byte_alignment;
// Private data associated with the frame buffer callbacks.
void *cb_priv;
@@ -198,31 +271,43 @@ typedef struct VP9Common {
// Handles memory for the codec.
InternalFrameBufferList int_frame_buffers;
+ // External BufferPool passed from outside.
+ BufferPool *buffer_pool;
+
PARTITION_CONTEXT *above_seg_context;
ENTROPY_CONTEXT *above_context;
} VP9_COMMON;
+// TODO(hkuang): Don't need to lock the whole pool after implementing atomic
+// frame reference count.
+void lock_buffer_pool(BufferPool *const pool);
+void unlock_buffer_pool(BufferPool *const pool);
+
static INLINE YV12_BUFFER_CONFIG *get_ref_frame(VP9_COMMON *cm, int index) {
if (index < 0 || index >= REF_FRAMES)
return NULL;
if (cm->ref_frame_map[index] < 0)
return NULL;
assert(cm->ref_frame_map[index] < FRAME_BUFFERS);
- return &cm->frame_bufs[cm->ref_frame_map[index]].buf;
+ return &cm->buffer_pool->frame_bufs[cm->ref_frame_map[index]].buf;
}
static INLINE YV12_BUFFER_CONFIG *get_frame_new_buffer(VP9_COMMON *cm) {
- return &cm->frame_bufs[cm->new_fb_idx].buf;
+ return &cm->buffer_pool->frame_bufs[cm->new_fb_idx].buf;
}
static INLINE int get_free_fb(VP9_COMMON *cm) {
+ RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
int i;
- for (i = 0; i < FRAME_BUFFERS; i++)
- if (cm->frame_bufs[i].ref_count == 0)
+
+ lock_buffer_pool(cm->buffer_pool);
+ for (i = 0; i < FRAME_BUFFERS; ++i)
+ if (frame_bufs[i].ref_count == 0)
break;
assert(i < FRAME_BUFFERS);
- cm->frame_bufs[i].ref_count = 1;
+ frame_bufs[i].ref_count = 1;
+ unlock_buffer_pool(cm->buffer_pool);
return i;
}
@@ -245,13 +330,14 @@ static INLINE void init_macroblockd(VP9_COMMON *cm, MACROBLOCKD *xd) {
int i;
for (i = 0; i < MAX_MB_PLANE; ++i) {
- xd->plane[i].dqcoeff = xd->dqcoeff[i];
+ xd->plane[i].dqcoeff = xd->dqcoeff;
xd->above_context[i] = cm->above_context +
i * sizeof(*cm->above_context) * 2 * mi_cols_aligned_to_sb(cm->mi_cols);
}
xd->above_seg_context = cm->above_seg_context;
xd->mi_stride = cm->mi_stride;
+ xd->error_info = &cm->error;
}
static INLINE int frame_is_intra_only(const VP9_COMMON *const cm) {
@@ -261,7 +347,7 @@ static INLINE int frame_is_intra_only(const VP9_COMMON *const cm) {
static INLINE const vp9_prob* get_partition_probs(const VP9_COMMON *cm,
int ctx) {
return frame_is_intra_only(cm) ? vp9_kf_partition_probs[ctx]
- : cm->fc.partition_prob[ctx];
+ : cm->fc->partition_prob[ctx];
}
static INLINE void set_skip_context(MACROBLOCKD *xd, int mi_row, int mi_col) {
@@ -292,17 +378,23 @@ static INLINE void set_mi_row_col(MACROBLOCKD *xd, const TileInfo *const tile,
// Are edges available for intra prediction?
xd->up_available = (mi_row != 0);
xd->left_available = (mi_col > tile->mi_col_start);
-}
+ if (xd->up_available) {
+ xd->above_mi = xd->mi[-xd->mi_stride];
+ // above_mi may be NULL in VP9 encoder's first pass.
+ xd->above_mbmi = xd->above_mi ? &xd->above_mi->mbmi : NULL;
+ } else {
+ xd->above_mi = NULL;
+ xd->above_mbmi = NULL;
+ }
-static INLINE void set_prev_mi(VP9_COMMON *cm) {
- const int use_prev_in_find_mv_refs = cm->width == cm->last_width &&
- cm->height == cm->last_height &&
- !cm->intra_only &&
- cm->last_show_frame;
- // Special case: set prev_mi to NULL when the previous mode info
- // context cannot be used.
- cm->prev_mi = use_prev_in_find_mv_refs ?
- cm->prev_mip + cm->mi_stride + 1 : NULL;
+ if (xd->left_available) {
+ xd->left_mi = xd->mi[-1];
+ // left_mi may be NULL in VP9 encoder's first pass.
+ xd->left_mbmi = xd->left_mi ? &xd->left_mi->mbmi : NULL;
+ } else {
+ xd->left_mi = NULL;
+ xd->left_mbmi = NULL;
+ }
}
static INLINE void update_partition_context(MACROBLOCKD *xd,
@@ -318,8 +410,8 @@ static INLINE void update_partition_context(MACROBLOCKD *xd,
// update the partition context at the end notes. set partition bits
// of block sizes larger than the current one to be one, and partition
// bits of smaller block sizes to be zero.
- vpx_memset(above_ctx, partition_context_lookup[subsize].above, bs);
- vpx_memset(left_ctx, partition_context_lookup[subsize].left, bs);
+ memset(above_ctx, partition_context_lookup[subsize].above, bs);
+ memset(left_ctx, partition_context_lookup[subsize].left, bs);
}
static INLINE int partition_plane_context(const MACROBLOCKD *xd,
@@ -327,21 +419,12 @@ static INLINE int partition_plane_context(const MACROBLOCKD *xd,
BLOCK_SIZE bsize) {
const PARTITION_CONTEXT *above_ctx = xd->above_seg_context + mi_col;
const PARTITION_CONTEXT *left_ctx = xd->left_seg_context + (mi_row & MI_MASK);
-
const int bsl = mi_width_log2_lookup[bsize];
- const int bs = 1 << bsl;
- int above = 0, left = 0, i;
+  int above = (*above_ctx >> bsl) & 1, left = (*left_ctx >> bsl) & 1;
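+  // update_partition_context() fills all the affected context entries with
+  // the same lookup value, so testing bit bsl of a single entry recovers the
+  // same above/left flags as OR-ing all bs entries.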
assert(b_width_log2_lookup[bsize] == b_height_log2_lookup[bsize]);
assert(bsl >= 0);
- for (i = 0; i < bs; i++) {
- above |= above_ctx[i];
- left |= left_ctx[i];
- }
- above = (above & bs) > 0;
- left = (left & bs) > 0;
-
return (left * 2 + above) + bsl * PARTITION_PLOFFSET;
}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_postproc.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_postproc.c
index 575ffbc30ac..983a4744dd6 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_postproc.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_postproc.c
@@ -79,6 +79,9 @@ const short vp9_rv[] = {
0, 9, 5, 5, 11, 10, 13, 9, 10, 13,
};
+static const uint8_t q_diff_thresh = 20;
+static const uint8_t last_q_thresh = 170;
+
void vp9_post_proc_down_and_across_c(const uint8_t *src_ptr,
uint8_t *dst_ptr,
int src_pixels_per_line,
@@ -88,10 +91,7 @@ void vp9_post_proc_down_and_across_c(const uint8_t *src_ptr,
int flimit) {
uint8_t const *p_src;
uint8_t *p_dst;
- int row;
- int col;
- int i;
- int v;
+ int row, col, i, v, kernel;
int pitch = src_pixels_per_line;
uint8_t d[8];
(void)dst_pixels_per_line;
@@ -102,8 +102,8 @@ void vp9_post_proc_down_and_across_c(const uint8_t *src_ptr,
p_dst = dst_ptr;
for (col = 0; col < cols; col++) {
- int kernel = 4;
- int v = p_src[col];
+ kernel = 4;
+ v = p_src[col];
for (i = -2; i <= 2; i++) {
if (abs(v - p_src[col + i * pitch]) > flimit)
@@ -125,7 +125,7 @@ void vp9_post_proc_down_and_across_c(const uint8_t *src_ptr,
d[i] = p_src[i];
for (col = 0; col < cols; col++) {
- int kernel = 4;
+ kernel = 4;
v = p_src[col];
d[col & 7] = v;
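The kernel/v pair hoisted in these hunks feeds a 5-tap smoothing filter whose body the hunk context elides. A simplified sketch of one output pixel of the vertical pass, assuming the conventional [1 1 4 1 1]/8 taps; treat the tap weights and the skip path as illustrative, not a verbatim copy of the elided code:

#include <stdlib.h>  /* abs() */

/* Sketch: accumulate five weighted taps starting from a rounding bias
 * of 4 (the weights sum to 8), but keep the centre pixel untouched
 * when any tap differs from it by more than flimit. */
static unsigned char down_pixel_sketch(const unsigned char *p, int pitch,
                                       int flimit) {
  static const int taps[5] = { 1, 1, 4, 1, 1 };
  const int v = p[0];
  int kernel = 4;
  int i;
  for (i = -2; i <= 2; i++) {
    if (abs(v - p[i * pitch]) > flimit)
      return (unsigned char)v;  /* edge: leave the pixel alone */
    kernel += taps[2 + i] * p[i * pitch];
  }
  return (unsigned char)(kernel >> 3);
}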
@@ -165,10 +165,7 @@ void vp9_highbd_post_proc_down_and_across_c(const uint16_t *src_ptr,
int flimit) {
uint16_t const *p_src;
uint16_t *p_dst;
- int row;
- int col;
- int i;
- int v;
+ int row, col, i, v, kernel;
int pitch = src_pixels_per_line;
uint16_t d[8];
@@ -178,8 +175,8 @@ void vp9_highbd_post_proc_down_and_across_c(const uint16_t *src_ptr,
p_dst = dst_ptr;
for (col = 0; col < cols; col++) {
- int kernel = 4;
- int v = p_src[col];
+ kernel = 4;
+ v = p_src[col];
for (i = -2; i <= 2; i++) {
if (abs(v - p_src[col + i * pitch]) > flimit)
@@ -202,7 +199,7 @@ void vp9_highbd_post_proc_down_and_across_c(const uint16_t *src_ptr,
d[i] = p_src[i];
for (col = 0; col < cols; col++) {
- int kernel = 4;
+ kernel = 4;
v = p_src[col];
d[col & 7] = v;
@@ -515,22 +512,24 @@ void vp9_denoise(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst,
assert((src->flags & YV12_FLAG_HIGHBITDEPTH) ==
(dst->flags & YV12_FLAG_HIGHBITDEPTH));
if (src->flags & YV12_FLAG_HIGHBITDEPTH) {
- const uint16_t *const src = CONVERT_TO_SHORTPTR(srcs[i] + 2 * src_stride
- + 2);
- uint16_t *const dst = CONVERT_TO_SHORTPTR(dsts[i] + 2 * dst_stride + 2);
- vp9_highbd_post_proc_down_and_across(src, dst, src_stride, dst_stride,
- src_height, src_width, ppl);
+ const uint16_t *const src_plane = CONVERT_TO_SHORTPTR(
+ srcs[i] + 2 * src_stride + 2);
+ uint16_t *const dst_plane = CONVERT_TO_SHORTPTR(
+ dsts[i] + 2 * dst_stride + 2);
+ vp9_highbd_post_proc_down_and_across(src_plane, dst_plane, src_stride,
+ dst_stride, src_height, src_width,
+ ppl);
} else {
- const uint8_t *const src = srcs[i] + 2 * src_stride + 2;
- uint8_t *const dst = dsts[i] + 2 * dst_stride + 2;
+ const uint8_t *const src_plane = srcs[i] + 2 * src_stride + 2;
+ uint8_t *const dst_plane = dsts[i] + 2 * dst_stride + 2;
- vp9_post_proc_down_and_across(src, dst, src_stride, dst_stride,
- src_height, src_width, ppl);
+ vp9_post_proc_down_and_across(src_plane, dst_plane, src_stride,
+ dst_stride, src_height, src_width, ppl);
}
#else
- const uint8_t *const src = srcs[i] + 2 * src_stride + 2;
- uint8_t *const dst = dsts[i] + 2 * dst_stride + 2;
- vp9_post_proc_down_and_across(src, dst, src_stride, dst_stride,
+ const uint8_t *const src_plane = srcs[i] + 2 * src_stride + 2;
+ uint8_t *const dst_plane = dsts[i] + 2 * dst_stride + 2;
+ vp9_post_proc_down_and_across(src_plane, dst_plane, src_stride, dst_stride,
src_height, src_width, ppl);
#endif
}
@@ -555,16 +554,15 @@ static void fillrd(struct postproc_state *state, int q, int a) {
* a gaussian distribution with sigma determined by q.
*/
{
- double i;
int next, j;
next = 0;
for (i = -32; i < 32; i++) {
- int a = (int)(0.5 + 256 * gaussian(sigma, 0, i));
+ int a_i = (int)(0.5 + 256 * gaussian(sigma, 0, i));
- if (a) {
- for (j = 0; j < a; j++) {
+ if (a_i) {
+ for (j = 0; j < a_i; j++) {
char_dist[next + j] = (char) i;
}
@@ -616,9 +614,20 @@ void vp9_plane_add_noise_c(uint8_t *start, char *noise,
}
}
+static void swap_mi_and_prev_mi(VP9_COMMON *cm) {
+ // Current mip will be the prev_mip for the next frame.
+ MODE_INFO *temp = cm->postproc_state.prev_mip;
+ cm->postproc_state.prev_mip = cm->mip;
+ cm->mip = temp;
+
+ // Update the upper left visible macroblock ptrs.
+ cm->mi = cm->mip + cm->mi_stride + 1;
+ cm->postproc_state.prev_mi = cm->postproc_state.prev_mip + cm->mi_stride + 1;
+}
+
int vp9_post_proc_frame(struct VP9Common *cm,
YV12_BUFFER_CONFIG *dest, vp9_ppflags_t *ppflags) {
- const int q = MIN(63, cm->lf.filter_level * 10 / 6);
+ const int q = MIN(105, cm->lf.filter_level * 2);
const int flags = ppflags->post_proc_flag;
YV12_BUFFER_CONFIG *const ppbuf = &cm->post_proc_buffer;
struct postproc_state *const ppstate = &cm->postproc_state;
@@ -633,18 +642,74 @@ int vp9_post_proc_frame(struct VP9Common *cm,
vp9_clear_system_state();
-#if CONFIG_VP9_POSTPROC || CONFIG_INTERNAL_STATS
+ // Allocate memory for prev_mip on the first frame.
+ if (cm->current_video_frame == 1) {
+ cm->postproc_state.last_base_qindex = cm->base_qindex;
+ cm->postproc_state.last_frame_valid = 1;
+ ppstate->prev_mip = vpx_calloc(cm->mi_alloc_size, sizeof(*cm->mip));
+ if (!ppstate->prev_mip) {
+ return 1;
+ }
+ ppstate->prev_mi = ppstate->prev_mip + cm->mi_stride + 1;
+ memset(ppstate->prev_mip, 0,
+ cm->mi_stride * (cm->mi_rows + 1) * sizeof(*cm->mip));
+ }
+
+ // Allocate post_proc_buffer_int if needed.
+ if ((flags & VP9D_MFQE) && !cm->post_proc_buffer_int.buffer_alloc) {
+ if ((flags & VP9D_DEMACROBLOCK) || (flags & VP9D_DEBLOCK)) {
+ const int width = ALIGN_POWER_OF_TWO(cm->width, 4);
+ const int height = ALIGN_POWER_OF_TWO(cm->height, 4);
+
+ if (vp9_alloc_frame_buffer(&cm->post_proc_buffer_int, width, height,
+ cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+ cm->use_highbitdepth,
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ VP9_ENC_BORDER_IN_PIXELS,
+ cm->byte_alignment) < 0) {
+ vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+ "Failed to allocate MFQE framebuffer");
+ }
+
+ // Initialize the MFQE buffer to mid-gray (128) so post processing
+ // doesn't pull random data in from the edge.
+ memset(cm->post_proc_buffer_int.buffer_alloc, 128,
+ cm->post_proc_buffer.frame_size);
+ }
+ }
+
if (vp9_realloc_frame_buffer(&cm->post_proc_buffer, cm->width, cm->height,
cm->subsampling_x, cm->subsampling_y,
#if CONFIG_VP9_HIGHBITDEPTH
cm->use_highbitdepth,
#endif
- VP9_DEC_BORDER_IN_PIXELS, NULL, NULL, NULL) < 0)
+ VP9_DEC_BORDER_IN_PIXELS, cm->byte_alignment,
+ NULL, NULL, NULL) < 0)
vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
"Failed to allocate post-processing buffer");
-#endif
- if (flags & VP9D_DEMACROBLOCK) {
+ if ((flags & VP9D_MFQE) && cm->current_video_frame >= 2 &&
+ cm->postproc_state.last_frame_valid && cm->bit_depth == 8 &&
+ cm->postproc_state.last_base_qindex <= last_q_thresh &&
+ cm->base_qindex - cm->postproc_state.last_base_qindex >= q_diff_thresh) {
+ vp9_mfqe(cm);
+ // TODO(jackychen): Consider whether to enable deblocking by default
+ // when mfqe is enabled. Both the quality and the speed need to be
+ // taken into consideration.
+ if ((flags & VP9D_DEMACROBLOCK) || (flags & VP9D_DEBLOCK)) {
+ vp8_yv12_copy_frame(ppbuf, &cm->post_proc_buffer_int);
+ }
+ if ((flags & VP9D_DEMACROBLOCK) && cm->post_proc_buffer_int.buffer_alloc) {
+ deblock_and_de_macro_block(&cm->post_proc_buffer_int, ppbuf,
+ q + (ppflags->deblocking_level - 5) * 10,
+ 1, 0);
+ } else if (flags & VP9D_DEBLOCK) {
+ vp9_deblock(&cm->post_proc_buffer_int, ppbuf, q);
+ } else {
+ vp8_yv12_copy_frame(&cm->post_proc_buffer_int, ppbuf);
+ }
+ } else if (flags & VP9D_DEMACROBLOCK) {
deblock_and_de_macro_block(cm->frame_to_show, ppbuf,
q + (ppflags->deblocking_level - 5) * 10, 1, 0);
} else if (flags & VP9D_DEBLOCK) {
@@ -653,6 +718,9 @@ int vp9_post_proc_frame(struct VP9Common *cm,
vp8_yv12_copy_frame(cm->frame_to_show, ppbuf);
}
+ cm->postproc_state.last_base_qindex = cm->base_qindex;
+ cm->postproc_state.last_frame_valid = 1;
+
if (flags & VP9D_ADDNOISE) {
const int noise_level = ppflags->noise_level;
if (ppstate->last_q != q ||
@@ -673,6 +741,7 @@ int vp9_post_proc_frame(struct VP9Common *cm,
dest->uv_width = dest->y_width >> cm->subsampling_x;
dest->uv_height = dest->y_height >> cm->subsampling_y;
+ swap_mi_and_prev_mi(cm);
return 0;
}
-#endif
+#endif // CONFIG_VP9_POSTPROC
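The MFQE branch above fires only under a quality-jump heuristic: at least one decoded frame behind us, 8-bit content, a previously good frame (base qindex <= 170), and a marked quality drop (qindex jump >= 20). A condensed restatement of the gate as a reading aid, using the file-scope thresholds introduced earlier:

/* Reading aid, not a drop-in: the condition guarding vp9_mfqe()
 * in vp9_post_proc_frame(), folded into one predicate. */
static int mfqe_enabled_sketch(const struct VP9Common *cm, int flags) {
  return (flags & VP9D_MFQE) &&
         cm->current_video_frame >= 2 &&            /* prev_mip exists */
         cm->postproc_state.last_frame_valid &&
         cm->bit_depth == 8 &&
         cm->postproc_state.last_base_qindex <= last_q_thresh &&
         cm->base_qindex - cm->postproc_state.last_base_qindex >=
             q_diff_thresh;
}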
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_postproc.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_postproc.h
index ebebc1ae346..035c9cdf846 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_postproc.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_postproc.h
@@ -14,6 +14,8 @@
#include "vpx_ports/mem.h"
#include "vpx_scale/yv12config.h"
+#include "vp9/common/vp9_blockd.h"
+#include "vp9/common/vp9_mfqe.h"
#include "vp9/common/vp9_ppflags.h"
#ifdef __cplusplus
@@ -24,6 +26,10 @@ struct postproc_state {
int last_q;
int last_noise;
char noise[3072];
+ int last_base_qindex;
+ int last_frame_valid;
+ MODE_INFO *prev_mip;
+ MODE_INFO *prev_mi;
DECLARE_ALIGNED(16, char, blackclamp[16]);
DECLARE_ALIGNED(16, char, whiteclamp[16]);
DECLARE_ALIGNED(16, char, bothclamp[16]);
@@ -31,6 +37,8 @@ struct postproc_state {
struct VP9Common;
+#define MFQE_PRECISION 4
+
int vp9_post_proc_frame(struct VP9Common *cm,
YV12_BUFFER_CONFIG *dest, vp9_ppflags_t *flags);
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_ppflags.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_ppflags.h
index 1644a1bbbe8..12b989f43a5 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_ppflags.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_ppflags.h
@@ -26,7 +26,8 @@ enum {
VP9D_DEBUG_TXT_RATE_INFO = 1 << 6,
VP9D_DEBUG_DRAW_MV = 1 << 7,
VP9D_DEBUG_CLR_BLK_MODES = 1 << 8,
- VP9D_DEBUG_CLR_FRM_REF_BLKS = 1 << 9
+ VP9D_DEBUG_CLR_FRM_REF_BLKS = 1 << 9,
+ VP9D_MFQE = 1 << 10
};
typedef struct {
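Callers opt in to the new bit through the post_proc_flag word carried by vp9_ppflags_t. A minimal sketch of a decoder-side caller enabling MFQE together with deblocking; level 5 is neutral because the filter strength is q + (level - 5) * 10:

/* Sketch: building the flags consumed by vp9_post_proc_frame(). */
static vp9_ppflags_t make_mfqe_ppflags(void) {
  vp9_ppflags_t ppflags = { 0 };
  ppflags.post_proc_flag = VP9D_MFQE | VP9D_DEBLOCK;
  ppflags.deblocking_level = 5;  /* neutral: q + (5 - 5) * 10 == q */
  return ppflags;
}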
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_pred_common.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_pred_common.c
index 901a043f69b..0aac4a9e677 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_pred_common.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_pred_common.c
@@ -15,21 +15,17 @@
#include "vp9/common/vp9_pred_common.h"
#include "vp9/common/vp9_seg_common.h"
-static INLINE const MB_MODE_INFO *get_mbmi(const MODE_INFO *const mi) {
- return (mi != NULL) ? &mi->mbmi : NULL;
-}
-
// Returns a context number for the given MB prediction signal
int vp9_get_pred_context_switchable_interp(const MACROBLOCKD *xd) {
// Note:
// The mode info data structure has a one element border above and to the
// left of the entries corresponding to real macroblocks.
// The prediction flags in these dummy entries are initialized to 0.
- const MB_MODE_INFO *const left_mbmi = get_mbmi(get_left_mi(xd));
- const int left_type = left_mbmi != NULL && is_inter_block(left_mbmi) ?
- left_mbmi->interp_filter : SWITCHABLE_FILTERS;
- const MB_MODE_INFO *const above_mbmi = get_mbmi(get_above_mi(xd));
- const int above_type = above_mbmi != NULL && is_inter_block(above_mbmi) ?
+ const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
+ const int left_type = xd->left_available && is_inter_block(left_mbmi) ?
+ left_mbmi->interp_filter : SWITCHABLE_FILTERS;
+ const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
+ const int above_type = xd->up_available && is_inter_block(above_mbmi) ?
above_mbmi->interp_filter : SWITCHABLE_FILTERS;
if (left_type == above_type)
@@ -50,10 +46,10 @@ int vp9_get_pred_context_switchable_interp(const MACROBLOCKD *xd) {
// 2 - intra/--, --/intra
// 3 - intra/intra
int vp9_get_intra_inter_context(const MACROBLOCKD *xd) {
- const MB_MODE_INFO *const above_mbmi = get_mbmi(get_above_mi(xd));
- const MB_MODE_INFO *const left_mbmi = get_mbmi(get_left_mi(xd));
- const int has_above = above_mbmi != NULL;
- const int has_left = left_mbmi != NULL;
+ const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
+ const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
+ const int has_above = xd->up_available;
+ const int has_left = xd->left_available;
if (has_above && has_left) { // both edges available
const int above_intra = !is_inter_block(above_mbmi);
@@ -70,10 +66,10 @@ int vp9_get_intra_inter_context(const MACROBLOCKD *xd) {
int vp9_get_reference_mode_context(const VP9_COMMON *cm,
const MACROBLOCKD *xd) {
int ctx;
- const MB_MODE_INFO *const above_mbmi = get_mbmi(get_above_mi(xd));
- const MB_MODE_INFO *const left_mbmi = get_mbmi(get_left_mi(xd));
- const int has_above = above_mbmi != NULL;
- const int has_left = left_mbmi != NULL;
+ const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
+ const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
+ const int has_above = xd->up_available;
+ const int has_left = xd->left_available;
// Note:
// The mode info data structure has a one element border above and to the
// left of the entries corresponding to real macroblocks.
@@ -113,10 +109,10 @@ int vp9_get_reference_mode_context(const VP9_COMMON *cm,
int vp9_get_pred_context_comp_ref_p(const VP9_COMMON *cm,
const MACROBLOCKD *xd) {
int pred_context;
- const MB_MODE_INFO *const above_mbmi = get_mbmi(get_above_mi(xd));
- const MB_MODE_INFO *const left_mbmi = get_mbmi(get_left_mi(xd));
- const int above_in_image = above_mbmi != NULL;
- const int left_in_image = left_mbmi != NULL;
+ const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
+ const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
+ const int above_in_image = xd->up_available;
+ const int left_in_image = xd->left_available;
// Note:
// The mode info data structure has a one element border above and to the
@@ -194,10 +190,10 @@ int vp9_get_pred_context_comp_ref_p(const VP9_COMMON *cm,
int vp9_get_pred_context_single_ref_p1(const MACROBLOCKD *xd) {
int pred_context;
- const MB_MODE_INFO *const above_mbmi = get_mbmi(get_above_mi(xd));
- const MB_MODE_INFO *const left_mbmi = get_mbmi(get_left_mi(xd));
- const int has_above = above_mbmi != NULL;
- const int has_left = left_mbmi != NULL;
+ const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
+ const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
+ const int has_above = xd->up_available;
+ const int has_left = xd->left_available;
// Note:
// The mode info data structure has a one element border above and to the
// left of the entries corresponding to real macroblocks.
@@ -260,10 +256,10 @@ int vp9_get_pred_context_single_ref_p1(const MACROBLOCKD *xd) {
int vp9_get_pred_context_single_ref_p2(const MACROBLOCKD *xd) {
int pred_context;
- const MB_MODE_INFO *const above_mbmi = get_mbmi(get_above_mi(xd));
- const MB_MODE_INFO *const left_mbmi = get_mbmi(get_left_mi(xd));
- const int has_above = above_mbmi != NULL;
- const int has_left = left_mbmi != NULL;
+ const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
+ const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
+ const int has_above = xd->up_available;
+ const int has_left = xd->left_available;
// Note:
// The mode info data structure has a one element border above and to the
@@ -348,11 +344,11 @@ int vp9_get_pred_context_single_ref_p2(const MACROBLOCKD *xd) {
// left of the entries corresponding to real blocks.
// The prediction flags in these dummy entries are initialized to 0.
int vp9_get_tx_size_context(const MACROBLOCKD *xd) {
- const int max_tx_size = max_txsize_lookup[xd->mi[0].src_mi->mbmi.sb_type];
- const MB_MODE_INFO *const above_mbmi = get_mbmi(get_above_mi(xd));
- const MB_MODE_INFO *const left_mbmi = get_mbmi(get_left_mi(xd));
- const int has_above = above_mbmi != NULL;
- const int has_left = left_mbmi != NULL;
+ const int max_tx_size = max_txsize_lookup[xd->mi[0]->mbmi.sb_type];
+ const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
+ const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
+ const int has_above = xd->up_available;
+ const int has_left = xd->left_available;
int above_ctx = (has_above && !above_mbmi->skip) ? (int)above_mbmi->tx_size
: max_tx_size;
int left_ctx = (has_left && !left_mbmi->skip) ? (int)left_mbmi->tx_size
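The refactor repeats one pattern across this file: read the cached neighbor, substitute a neutral value when it is unavailable or skipped, then fold both sides into a small index. The tx-size case cut off by the hunk completes as sketched below, mirroring the lines above:

/* Sketch of how the tx_size context resolves: a missing side copies
 * the other, and the sum compares against max_tx_size to give a
 * binary context. Mirrors vp9_get_tx_size_context(). */
static int tx_size_context_sketch(int has_above, int has_left,
                                  int above_ctx, int left_ctx,
                                  int max_tx_size) {
  if (!has_left) left_ctx = above_ctx;
  if (!has_above) above_ctx = left_ctx;
  return (above_ctx + left_ctx) > max_tx_size;
}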
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_pred_common.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_pred_common.h
index 39774f14285..bc19d28b906 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_pred_common.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_pred_common.h
@@ -18,20 +18,12 @@
extern "C" {
#endif
-static INLINE const MODE_INFO *get_above_mi(const MACROBLOCKD *const xd) {
- return xd->up_available ? xd->mi[-xd->mi_stride].src_mi : NULL;
-}
-
-static INLINE const MODE_INFO *get_left_mi(const MACROBLOCKD *const xd) {
- return xd->left_available ? xd->mi[-1].src_mi : NULL;
-}
-
int vp9_get_segment_id(const VP9_COMMON *cm, const uint8_t *segment_ids,
BLOCK_SIZE bsize, int mi_row, int mi_col);
static INLINE int vp9_get_pred_context_seg_id(const MACROBLOCKD *xd) {
- const MODE_INFO *const above_mi = get_above_mi(xd);
- const MODE_INFO *const left_mi = get_left_mi(xd);
+ const MODE_INFO *const above_mi = xd->above_mi;
+ const MODE_INFO *const left_mi = xd->left_mi;
const int above_sip = (above_mi != NULL) ?
above_mi->mbmi.seg_id_predicted : 0;
const int left_sip = (left_mi != NULL) ? left_mi->mbmi.seg_id_predicted : 0;
@@ -45,8 +37,8 @@ static INLINE vp9_prob vp9_get_pred_prob_seg_id(const struct segmentation *seg,
}
static INLINE int vp9_get_skip_context(const MACROBLOCKD *xd) {
- const MODE_INFO *const above_mi = get_above_mi(xd);
- const MODE_INFO *const left_mi = get_left_mi(xd);
+ const MODE_INFO *const above_mi = xd->above_mi;
+ const MODE_INFO *const left_mi = xd->left_mi;
const int above_skip = (above_mi != NULL) ? above_mi->mbmi.skip : 0;
const int left_skip = (left_mi != NULL) ? left_mi->mbmi.skip : 0;
return above_skip + left_skip;
@@ -54,7 +46,7 @@ static INLINE int vp9_get_skip_context(const MACROBLOCKD *xd) {
static INLINE vp9_prob vp9_get_skip_prob(const VP9_COMMON *cm,
const MACROBLOCKD *xd) {
- return cm->fc.skip_probs[vp9_get_skip_context(xd)];
+ return cm->fc->skip_probs[vp9_get_skip_context(xd)];
}
int vp9_get_pred_context_switchable_interp(const MACROBLOCKD *xd);
@@ -63,14 +55,14 @@ int vp9_get_intra_inter_context(const MACROBLOCKD *xd);
static INLINE vp9_prob vp9_get_intra_inter_prob(const VP9_COMMON *cm,
const MACROBLOCKD *xd) {
- return cm->fc.intra_inter_prob[vp9_get_intra_inter_context(xd)];
+ return cm->fc->intra_inter_prob[vp9_get_intra_inter_context(xd)];
}
int vp9_get_reference_mode_context(const VP9_COMMON *cm, const MACROBLOCKD *xd);
static INLINE vp9_prob vp9_get_reference_mode_prob(const VP9_COMMON *cm,
const MACROBLOCKD *xd) {
- return cm->fc.comp_inter_prob[vp9_get_reference_mode_context(cm, xd)];
+ return cm->fc->comp_inter_prob[vp9_get_reference_mode_context(cm, xd)];
}
int vp9_get_pred_context_comp_ref_p(const VP9_COMMON *cm,
@@ -79,21 +71,21 @@ int vp9_get_pred_context_comp_ref_p(const VP9_COMMON *cm,
static INLINE vp9_prob vp9_get_pred_prob_comp_ref_p(const VP9_COMMON *cm,
const MACROBLOCKD *xd) {
const int pred_context = vp9_get_pred_context_comp_ref_p(cm, xd);
- return cm->fc.comp_ref_prob[pred_context];
+ return cm->fc->comp_ref_prob[pred_context];
}
int vp9_get_pred_context_single_ref_p1(const MACROBLOCKD *xd);
static INLINE vp9_prob vp9_get_pred_prob_single_ref_p1(const VP9_COMMON *cm,
const MACROBLOCKD *xd) {
- return cm->fc.single_ref_prob[vp9_get_pred_context_single_ref_p1(xd)][0];
+ return cm->fc->single_ref_prob[vp9_get_pred_context_single_ref_p1(xd)][0];
}
int vp9_get_pred_context_single_ref_p2(const MACROBLOCKD *xd);
static INLINE vp9_prob vp9_get_pred_prob_single_ref_p2(const VP9_COMMON *cm,
const MACROBLOCKD *xd) {
- return cm->fc.single_ref_prob[vp9_get_pred_context_single_ref_p2(xd)][1];
+ return cm->fc->single_ref_prob[vp9_get_pred_context_single_ref_p2(xd)][1];
}
int vp9_get_tx_size_context(const MACROBLOCKD *xd);
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_prob.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_prob.c
index a1befc63e88..3b7b9bf3b39 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_prob.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_prob.c
@@ -29,33 +29,25 @@ const uint8_t vp9_norm[256] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
-
static unsigned int tree_merge_probs_impl(unsigned int i,
const vp9_tree_index *tree,
const vp9_prob *pre_probs,
const unsigned int *counts,
- unsigned int count_sat,
- unsigned int max_update,
vp9_prob *probs) {
const int l = tree[i];
const unsigned int left_count = (l <= 0)
? counts[-l]
- : tree_merge_probs_impl(l, tree, pre_probs, counts,
- count_sat, max_update, probs);
+ : tree_merge_probs_impl(l, tree, pre_probs, counts, probs);
const int r = tree[i + 1];
const unsigned int right_count = (r <= 0)
? counts[-r]
- : tree_merge_probs_impl(r, tree, pre_probs, counts,
- count_sat, max_update, probs);
+ : tree_merge_probs_impl(r, tree, pre_probs, counts, probs);
const unsigned int ct[2] = { left_count, right_count };
- probs[i >> 1] = merge_probs(pre_probs[i >> 1], ct,
- count_sat, max_update);
+ probs[i >> 1] = mode_mv_merge_probs(pre_probs[i >> 1], ct);
return left_count + right_count;
}
void vp9_tree_merge_probs(const vp9_tree_index *tree, const vp9_prob *pre_probs,
- const unsigned int *counts, unsigned int count_sat,
- unsigned int max_update_factor, vp9_prob *probs) {
- tree_merge_probs_impl(0, tree, pre_probs, counts, count_sat,
- max_update_factor, probs);
+ const unsigned int *counts, vp9_prob *probs) {
+ tree_merge_probs_impl(0, tree, pre_probs, counts, probs);
}
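vp9_tree_index arrays encode a binary tree compactly: a non-positive entry is a leaf holding a negated symbol index into counts[], a positive entry is the offset of the child node pair. A toy 3-symbol example of the simplified merge, with assumed counts and flat prior probabilities:

/* Toy tree: node 0 splits off symbol 0; node 2 splits symbols 1, 2.
 * tree_merge_probs_impl() recurses into node 2 first (merging
 * probs[1] from counts {30, 60}), then merges probs[0] from
 * {10, 30 + 60}. */
static void toy_tree_merge(void) {
  static const vp9_tree_index toy_tree[4] = { 0 /* -sym0 */, 2, -1, -2 };
  static const vp9_prob toy_pre_probs[2] = { 128, 128 };
  static const unsigned int toy_counts[3] = { 10, 30, 60 };
  vp9_prob toy_probs[2];  /* one probability per internal node */
  vp9_tree_merge_probs(toy_tree, toy_pre_probs, toy_counts, toy_probs);
}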
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_prob.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_prob.h
index bc1511a5e18..c69c62c81f8 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_prob.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_prob.h
@@ -33,6 +33,8 @@ typedef int8_t vp9_tree_index;
#define vp9_complement(x) (255 - x)
+#define MODE_MV_COUNT_SAT 20
+
/* We build coding trees compactly in arrays.
Each node of the tree is a pair of vp9_tree_indices.
Array index often references a corresponding probability table.
@@ -69,9 +71,28 @@ static INLINE vp9_prob merge_probs(vp9_prob pre_prob,
return weighted_prob(pre_prob, prob, factor);
}
+// Each entry is MODE_MV_MAX_UPDATE_FACTOR (128) * count / MODE_MV_COUNT_SAT.
+static const int count_to_update_factor[MODE_MV_COUNT_SAT + 1] = {
+ 0, 6, 12, 19, 25, 32, 38, 44, 51, 57, 64,
+ 70, 76, 83, 89, 96, 102, 108, 115, 121, 128
+};
+
+static INLINE vp9_prob mode_mv_merge_probs(vp9_prob pre_prob,
+ const unsigned int ct[2]) {
+ const unsigned int den = ct[0] + ct[1];
+ if (den == 0) {
+ return pre_prob;
+ } else {
+ const unsigned int count = MIN(den, MODE_MV_COUNT_SAT);
+ const unsigned int factor = count_to_update_factor[count];
+ const vp9_prob prob =
+ clip_prob(((int64_t)(ct[0]) * 256 + (den >> 1)) / den);
+ return weighted_prob(pre_prob, prob, factor);
+ }
+}
+
void vp9_tree_merge_probs(const vp9_tree_index *tree, const vp9_prob *pre_probs,
- const unsigned int *counts, unsigned int count_sat,
- unsigned int max_update_factor, vp9_prob *probs);
+ const unsigned int *counts, vp9_prob *probs);
DECLARE_ALIGNED(16, extern const uint8_t, vp9_norm[256]);
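The table above is MODE_MV_MAX_UPDATE_FACTOR (128) scaled by count / MODE_MV_COUNT_SAT and truncated, so the merge is a fixed-point blend toward the observed frequency that strengthens with more observations. A worked example with assumed counts:

/* Worked example of mode_mv_merge_probs() with assumed inputs:
 *   pre_prob = 100, ct = {30, 10}  ->  den = 40
 *   count  = MIN(40, 20) = 20      ->  factor = 128 (full strength)
 *   prob   = clip_prob((30 * 256 + 20) / 40) = 192
 *   result = weighted_prob(100, 192, 128)
 *          = (100 * 128 + 192 * 128 + 128) >> 8 = 146
 * With sparse counts, say ct = {3, 1}: den = 4, factor = 25,
 * prob = 192, result = (100 * 231 + 192 * 25 + 128) >> 8 = 109,
 * which stays much closer to pre_prob. */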
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_reconinter.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_reconinter.c
index 3492a23d01c..11eaf2e2d70 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_reconinter.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_reconinter.c
@@ -20,97 +20,7 @@
#include "vp9/common/vp9_reconinter.h"
#include "vp9/common/vp9_reconintra.h"
-static void build_mc_border(const uint8_t *src, int src_stride,
- uint8_t *dst, int dst_stride,
- int x, int y, int b_w, int b_h, int w, int h) {
- // Get a pointer to the start of the real data for this row.
- const uint8_t *ref_row = src - x - y * src_stride;
-
- if (y >= h)
- ref_row += (h - 1) * src_stride;
- else if (y > 0)
- ref_row += y * src_stride;
-
- do {
- int right = 0, copy;
- int left = x < 0 ? -x : 0;
-
- if (left > b_w)
- left = b_w;
-
- if (x + b_w > w)
- right = x + b_w - w;
-
- if (right > b_w)
- right = b_w;
-
- copy = b_w - left - right;
-
- if (left)
- memset(dst, ref_row[0], left);
-
- if (copy)
- memcpy(dst + left, ref_row + x + left, copy);
-
- if (right)
- memset(dst + left + copy, ref_row[w - 1], right);
-
- dst += dst_stride;
- ++y;
-
- if (y > 0 && y < h)
- ref_row += src_stride;
- } while (--b_h);
-}
-
-#if CONFIG_VP9_HIGHBITDEPTH
-static void high_build_mc_border(const uint8_t *src8, int src_stride,
- uint16_t *dst, int dst_stride,
- int x, int y, int b_w, int b_h,
- int w, int h) {
- // Get a pointer to the start of the real data for this row.
- const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
- const uint16_t *ref_row = src - x - y * src_stride;
-
- if (y >= h)
- ref_row += (h - 1) * src_stride;
- else if (y > 0)
- ref_row += y * src_stride;
-
- do {
- int right = 0, copy;
- int left = x < 0 ? -x : 0;
-
- if (left > b_w)
- left = b_w;
-
- if (x + b_w > w)
- right = x + b_w - w;
-
- if (right > b_w)
- right = b_w;
-
- copy = b_w - left - right;
-
- if (left)
- vpx_memset16(dst, ref_row[0], left);
-
- if (copy)
- memcpy(dst + left, ref_row + x + left, copy * sizeof(uint16_t));
-
- if (right)
- vpx_memset16(dst + left + copy, ref_row[w - 1], right);
-
- dst += dst_stride;
- ++y;
-
- if (y > 0 && y < h)
- ref_row += src_stride;
- } while (--b_h);
-}
-#endif // CONFIG_VP9_HIGHBITDEPTH
-
-static void inter_predictor(const uint8_t *src, int src_stride,
+void inter_predictor(const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride,
const int subpel_x,
const int subpel_y,
@@ -123,29 +33,8 @@ static void inter_predictor(const uint8_t *src, int src_stride,
kernel[subpel_x], xs, kernel[subpel_y], ys, w, h);
}
-void vp9_build_inter_predictor(const uint8_t *src, int src_stride,
- uint8_t *dst, int dst_stride,
- const MV *src_mv,
- const struct scale_factors *sf,
- int w, int h, int ref,
- const InterpKernel *kernel,
- enum mv_precision precision,
- int x, int y) {
- const int is_q4 = precision == MV_PRECISION_Q4;
- const MV mv_q4 = { is_q4 ? src_mv->row : src_mv->row * 2,
- is_q4 ? src_mv->col : src_mv->col * 2 };
- MV32 mv = vp9_scale_mv(&mv_q4, x, y, sf);
- const int subpel_x = mv.col & SUBPEL_MASK;
- const int subpel_y = mv.row & SUBPEL_MASK;
-
- src += (mv.row >> SUBPEL_BITS) * src_stride + (mv.col >> SUBPEL_BITS);
-
- inter_predictor(src, src_stride, dst, dst_stride, subpel_x, subpel_y,
- sf, w, h, ref, kernel, sf->x_step_q4, sf->y_step_q4);
-}
-
#if CONFIG_VP9_HIGHBITDEPTH
-static void high_inter_predictor(const uint8_t *src, int src_stride,
+void high_inter_predictor(const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride,
const int subpel_x,
const int subpel_y,
@@ -180,6 +69,27 @@ void vp9_highbd_build_inter_predictor(const uint8_t *src, int src_stride,
}
#endif // CONFIG_VP9_HIGHBITDEPTH
+void vp9_build_inter_predictor(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride,
+ const MV *src_mv,
+ const struct scale_factors *sf,
+ int w, int h, int ref,
+ const InterpKernel *kernel,
+ enum mv_precision precision,
+ int x, int y) {
+ const int is_q4 = precision == MV_PRECISION_Q4;
+ const MV mv_q4 = { is_q4 ? src_mv->row : src_mv->row * 2,
+ is_q4 ? src_mv->col : src_mv->col * 2 };
+ MV32 mv = vp9_scale_mv(&mv_q4, x, y, sf);
+ const int subpel_x = mv.col & SUBPEL_MASK;
+ const int subpel_y = mv.row & SUBPEL_MASK;
+
+ src += (mv.row >> SUBPEL_BITS) * src_stride + (mv.col >> SUBPEL_BITS);
+
+ inter_predictor(src, src_stride, dst, dst_stride, subpel_x, subpel_y,
+ sf, w, h, ref, kernel, sf->x_step_q4, sf->y_step_q4);
+}
+
static INLINE int round_mv_comp_q4(int value) {
return (value < 0 ? value - 2 : value + 2) / 4;
}
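vp9_build_inter_predictor() splits the (possibly scaled) motion vector into an integer pixel offset folded into the src pointer and a 1/16-pel phase selecting the filter kernel. A worked example with an assumed q4 MV:

/* Worked example of the split above, with SUBPEL_BITS == 4 and
 * SUBPEL_MASK == 15. Assume a scaled MV of { 37, -9 } in 1/16-pel:
 *   row: 37 >> 4 == 2 full pixels down,  phase 37 & 15 == 5
 *   col: -9 >> 4 == -1 full pixel left,  phase -9 & 15 == 7
 * so src advances by 2 * src_stride - 1, and the convolution runs
 * kernel[7] horizontally and kernel[5] vertically. */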
@@ -234,8 +144,8 @@ MV clamp_mv_to_umv_border_sb(const MACROBLOCKD *xd, const MV *src_mv,
return clamped_mv;
}
-static MV average_split_mvs(const struct macroblockd_plane *pd,
- const MODE_INFO *mi, int ref, int block) {
+MV average_split_mvs(const struct macroblockd_plane *pd,
+ const MODE_INFO *mi, int ref, int block) {
const int ss_idx = ((pd->subsampling_x > 0) << 1) | (pd->subsampling_y > 0);
MV res = {0, 0};
switch (ss_idx) {
@@ -252,17 +162,17 @@ static MV average_split_mvs(const struct macroblockd_plane *pd,
res = mi_mv_pred_q4(mi, ref);
break;
default:
- assert(ss_idx <= 3 || ss_idx >= 0);
+ assert(ss_idx <= 3 && ss_idx >= 0);
}
return res;
}
-static void build_inter_predictors(MACROBLOCKD *xd, int plane, int block,
+void build_inter_predictors(MACROBLOCKD *xd, int plane, int block,
int bw, int bh,
int x, int y, int w, int h,
int mi_x, int mi_y) {
struct macroblockd_plane *const pd = &xd->plane[plane];
- const MODE_INFO *mi = xd->mi[0].src_mi;
+ const MODE_INFO *mi = xd->mi[0];
const int is_compound = has_second_ref(&mi->mbmi);
const InterpKernel *kernel = vp9_get_interp_kernel(mi->mbmi.interp_filter);
int ref;
@@ -336,7 +246,7 @@ static void build_inter_predictors_for_planes(MACROBLOCKD *xd, BLOCK_SIZE bsize,
const int bw = 4 * num_4x4_w;
const int bh = 4 * num_4x4_h;
- if (xd->mi[0].src_mi->mbmi.sb_type < BLOCK_8X8) {
+ if (xd->mi[0]->mbmi.sb_type < BLOCK_8X8) {
int i = 0, x, y;
assert(bsize == BLOCK_8X8);
for (y = 0; y < num_4x4_h; ++y)
@@ -354,231 +264,31 @@ void vp9_build_inter_predictors_sby(MACROBLOCKD *xd, int mi_row, int mi_col,
BLOCK_SIZE bsize) {
build_inter_predictors_for_planes(xd, bsize, mi_row, mi_col, 0, 0);
}
+
+void vp9_build_inter_predictors_sbp(MACROBLOCKD *xd, int mi_row, int mi_col,
+ BLOCK_SIZE bsize, int plane) {
+ build_inter_predictors_for_planes(xd, bsize, mi_row, mi_col, plane, plane);
+}
+
void vp9_build_inter_predictors_sbuv(MACROBLOCKD *xd, int mi_row, int mi_col,
BLOCK_SIZE bsize) {
build_inter_predictors_for_planes(xd, bsize, mi_row, mi_col, 1,
MAX_MB_PLANE - 1);
}
+
void vp9_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col,
BLOCK_SIZE bsize) {
build_inter_predictors_for_planes(xd, bsize, mi_row, mi_col, 0,
MAX_MB_PLANE - 1);
}
-// TODO(jingning): This function serves as a placeholder for decoder prediction
-// using on demand border extension. It should be moved to /decoder/ directory.
-static void dec_build_inter_predictors(MACROBLOCKD *xd, int plane, int block,
- int bw, int bh,
- int x, int y, int w, int h,
- int mi_x, int mi_y) {
- struct macroblockd_plane *const pd = &xd->plane[plane];
- const MODE_INFO *mi = xd->mi[0].src_mi;
- const int is_compound = has_second_ref(&mi->mbmi);
- const InterpKernel *kernel = vp9_get_interp_kernel(mi->mbmi.interp_filter);
- int ref;
-
- for (ref = 0; ref < 1 + is_compound; ++ref) {
- const struct scale_factors *const sf = &xd->block_refs[ref]->sf;
- struct buf_2d *const pre_buf = &pd->pre[ref];
- struct buf_2d *const dst_buf = &pd->dst;
- uint8_t *const dst = dst_buf->buf + dst_buf->stride * y + x;
- const MV mv = mi->mbmi.sb_type < BLOCK_8X8
- ? average_split_mvs(pd, mi, ref, block)
- : mi->mbmi.mv[ref].as_mv;
-
- const MV mv_q4 = clamp_mv_to_umv_border_sb(xd, &mv, bw, bh,
- pd->subsampling_x,
- pd->subsampling_y);
-
- MV32 scaled_mv;
- int xs, ys, x0, y0, x0_16, y0_16, frame_width, frame_height, buf_stride,
- subpel_x, subpel_y;
- uint8_t *ref_frame, *buf_ptr;
- const YV12_BUFFER_CONFIG *ref_buf = xd->block_refs[ref]->buf;
- const int is_scaled = vp9_is_scaled(sf);
-
- // Get reference frame pointer, width and height.
- if (plane == 0) {
- frame_width = ref_buf->y_crop_width;
- frame_height = ref_buf->y_crop_height;
- ref_frame = ref_buf->y_buffer;
- } else {
- frame_width = ref_buf->uv_crop_width;
- frame_height = ref_buf->uv_crop_height;
- ref_frame = plane == 1 ? ref_buf->u_buffer : ref_buf->v_buffer;
- }
-
- if (is_scaled) {
- // Co-ordinate of containing block to pixel precision.
- int x_start = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x));
- int y_start = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y));
-
- // Co-ordinate of the block to 1/16th pixel precision.
- x0_16 = (x_start + x) << SUBPEL_BITS;
- y0_16 = (y_start + y) << SUBPEL_BITS;
-
- // Co-ordinate of current block in reference frame
- // to 1/16th pixel precision.
- x0_16 = sf->scale_value_x(x0_16, sf);
- y0_16 = sf->scale_value_y(y0_16, sf);
-
- // Map the top left corner of the block into the reference frame.
- x0 = sf->scale_value_x(x_start + x, sf);
- y0 = sf->scale_value_y(y_start + y, sf);
-
- // Scale the MV and incorporate the sub-pixel offset of the block
- // in the reference frame.
- scaled_mv = vp9_scale_mv(&mv_q4, mi_x + x, mi_y + y, sf);
- xs = sf->x_step_q4;
- ys = sf->y_step_q4;
- } else {
- // Co-ordinate of containing block to pixel precision.
- x0 = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x)) + x;
- y0 = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y)) + y;
-
- // Co-ordinate of the block to 1/16th pixel precision.
- x0_16 = x0 << SUBPEL_BITS;
- y0_16 = y0 << SUBPEL_BITS;
-
- scaled_mv.row = mv_q4.row;
- scaled_mv.col = mv_q4.col;
- xs = ys = 16;
- }
- subpel_x = scaled_mv.col & SUBPEL_MASK;
- subpel_y = scaled_mv.row & SUBPEL_MASK;
-
- // Calculate the top left corner of the best matching block in the
- // reference frame.
- x0 += scaled_mv.col >> SUBPEL_BITS;
- y0 += scaled_mv.row >> SUBPEL_BITS;
- x0_16 += scaled_mv.col;
- y0_16 += scaled_mv.row;
-
- // Get reference block pointer.
- buf_ptr = ref_frame + y0 * pre_buf->stride + x0;
- buf_stride = pre_buf->stride;
-
- // Do border extension if there is motion or the
- // width/height is not a multiple of 8 pixels.
- if (is_scaled || scaled_mv.col || scaled_mv.row ||
- (frame_width & 0x7) || (frame_height & 0x7)) {
- // Get reference block bottom right coordinate.
- int x1 = ((x0_16 + (w - 1) * xs) >> SUBPEL_BITS) + 1;
- int y1 = ((y0_16 + (h - 1) * ys) >> SUBPEL_BITS) + 1;
- int x_pad = 0, y_pad = 0;
-
- if (subpel_x || (sf->x_step_q4 != SUBPEL_SHIFTS)) {
- x0 -= VP9_INTERP_EXTEND - 1;
- x1 += VP9_INTERP_EXTEND;
- x_pad = 1;
- }
-
- if (subpel_y || (sf->y_step_q4 != SUBPEL_SHIFTS)) {
- y0 -= VP9_INTERP_EXTEND - 1;
- y1 += VP9_INTERP_EXTEND;
- y_pad = 1;
- }
-
- // Skip border extension if block is inside the frame.
- if (x0 < 0 || x0 > frame_width - 1 || x1 < 0 || x1 > frame_width - 1 ||
- y0 < 0 || y0 > frame_height - 1 || y1 < 0 || y1 > frame_height - 1) {
- uint8_t *buf_ptr1 = ref_frame + y0 * pre_buf->stride + x0;
- // Extend the border.
-#if CONFIG_VP9_HIGHBITDEPTH
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- high_build_mc_border(buf_ptr1,
- pre_buf->stride,
- xd->mc_buf_high,
- x1 - x0 + 1,
- x0,
- y0,
- x1 - x0 + 1,
- y1 - y0 + 1,
- frame_width,
- frame_height);
- buf_stride = x1 - x0 + 1;
- buf_ptr = CONVERT_TO_BYTEPTR(xd->mc_buf_high) +
- y_pad * 3 * buf_stride + x_pad * 3;
- } else {
- build_mc_border(buf_ptr1,
- pre_buf->stride,
- xd->mc_buf,
- x1 - x0 + 1,
- x0,
- y0,
- x1 - x0 + 1,
- y1 - y0 + 1,
- frame_width,
- frame_height);
- buf_stride = x1 - x0 + 1;
- buf_ptr = xd->mc_buf + y_pad * 3 * buf_stride + x_pad * 3;
- }
-#else
- build_mc_border(buf_ptr1,
- pre_buf->stride,
- xd->mc_buf,
- x1 - x0 + 1,
- x0,
- y0,
- x1 - x0 + 1,
- y1 - y0 + 1,
- frame_width,
- frame_height);
- buf_stride = x1 - x0 + 1;
- buf_ptr = xd->mc_buf + y_pad * 3 * buf_stride + x_pad * 3;
-#endif // CONFIG_VP9_HIGHBITDEPTH
- }
- }
-
-#if CONFIG_VP9_HIGHBITDEPTH
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- high_inter_predictor(buf_ptr, buf_stride, dst, dst_buf->stride, subpel_x,
- subpel_y, sf, w, h, ref, kernel, xs, ys, xd->bd);
- } else {
- inter_predictor(buf_ptr, buf_stride, dst, dst_buf->stride, subpel_x,
- subpel_y, sf, w, h, ref, kernel, xs, ys);
- }
-#else
- inter_predictor(buf_ptr, buf_stride, dst, dst_buf->stride, subpel_x,
- subpel_y, sf, w, h, ref, kernel, xs, ys);
-#endif // CONFIG_VP9_HIGHBITDEPTH
- }
-}
-
-void vp9_dec_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col,
- BLOCK_SIZE bsize) {
- int plane;
- const int mi_x = mi_col * MI_SIZE;
- const int mi_y = mi_row * MI_SIZE;
- for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
- const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize,
- &xd->plane[plane]);
- const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
- const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
- const int bw = 4 * num_4x4_w;
- const int bh = 4 * num_4x4_h;
-
- if (xd->mi[0].src_mi->mbmi.sb_type < BLOCK_8X8) {
- int i = 0, x, y;
- assert(bsize == BLOCK_8X8);
- for (y = 0; y < num_4x4_h; ++y)
- for (x = 0; x < num_4x4_w; ++x)
- dec_build_inter_predictors(xd, plane, i++, bw, bh,
- 4 * x, 4 * y, 4, 4, mi_x, mi_y);
- } else {
- dec_build_inter_predictors(xd, plane, 0, bw, bh,
- 0, 0, bw, bh, mi_x, mi_y);
- }
- }
-}
-
void vp9_setup_dst_planes(struct macroblockd_plane planes[MAX_MB_PLANE],
const YV12_BUFFER_CONFIG *src,
int mi_row, int mi_col) {
- uint8_t *const buffers[4] = {src->y_buffer, src->u_buffer, src->v_buffer,
- src->alpha_buffer};
- const int strides[4] = {src->y_stride, src->uv_stride, src->uv_stride,
- src->alpha_stride};
+ uint8_t *const buffers[MAX_MB_PLANE] = { src->y_buffer, src->u_buffer,
+ src->v_buffer};
+ const int strides[MAX_MB_PLANE] = { src->y_stride, src->uv_stride,
+ src->uv_stride};
int i;
for (i = 0; i < MAX_MB_PLANE; ++i) {
@@ -594,11 +304,10 @@ void vp9_setup_pre_planes(MACROBLOCKD *xd, int idx,
const struct scale_factors *sf) {
if (src != NULL) {
int i;
- uint8_t *const buffers[4] = {src->y_buffer, src->u_buffer, src->v_buffer,
- src->alpha_buffer};
- const int strides[4] = {src->y_stride, src->uv_stride, src->uv_stride,
- src->alpha_stride};
-
+ uint8_t *const buffers[MAX_MB_PLANE] = { src->y_buffer, src->u_buffer,
+ src->v_buffer};
+ const int strides[MAX_MB_PLANE] = { src->y_stride, src->uv_stride,
+ src->uv_stride};
for (i = 0; i < MAX_MB_PLANE; ++i) {
struct macroblockd_plane *const pd = &xd->plane[i];
setup_pred_plane(&pd->pre[idx], buffers[i], strides[i], mi_row, mi_col,
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_reconinter.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_reconinter.h
index 3eaf07cf85f..e7057445a07 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_reconinter.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_reconinter.h
@@ -18,18 +18,49 @@
extern "C" {
#endif
+void inter_predictor(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride,
+ const int subpel_x,
+ const int subpel_y,
+ const struct scale_factors *sf,
+ int w, int h, int ref,
+ const InterpKernel *kernel,
+ int xs, int ys);
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void high_inter_predictor(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride,
+ const int subpel_x,
+ const int subpel_y,
+ const struct scale_factors *sf,
+ int w, int h, int ref,
+ const InterpKernel *kernel,
+ int xs, int ys, int bd);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+MV average_split_mvs(const struct macroblockd_plane *pd, const MODE_INFO *mi,
+ int ref, int block);
+
+MV clamp_mv_to_umv_border_sb(const MACROBLOCKD *xd, const MV *src_mv,
+ int bw, int bh, int ss_x, int ss_y);
+
+void build_inter_predictors(MACROBLOCKD *xd, int plane, int block,
+ int bw, int bh,
+ int x, int y, int w, int h,
+ int mi_x, int mi_y);
+
void vp9_build_inter_predictors_sby(MACROBLOCKD *xd, int mi_row, int mi_col,
BLOCK_SIZE bsize);
+void vp9_build_inter_predictors_sbp(MACROBLOCKD *xd, int mi_row, int mi_col,
+ BLOCK_SIZE bsize, int plane);
+
void vp9_build_inter_predictors_sbuv(MACROBLOCKD *xd, int mi_row, int mi_col,
BLOCK_SIZE bsize);
void vp9_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col,
BLOCK_SIZE bsize);
-void vp9_dec_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col,
- BLOCK_SIZE bsize);
-
void vp9_build_inter_predictor(const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride,
const MV *mv_q3,
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_reconintra.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_reconintra.c
index 720bb445de5..825d03d69b2 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_reconintra.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_reconintra.c
@@ -12,6 +12,7 @@
#include "./vp9_rtcd.h"
#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/vpx_once.h"
#include "vp9/common/vp9_reconintra.h"
#include "vp9/common/vp9_onyxc_int.h"
@@ -29,6 +30,25 @@ const TX_TYPE intra_mode_to_tx_type_lookup[INTRA_MODES] = {
ADST_ADST, // TM
};
+enum {
+ NEED_LEFT = 1 << 1,
+ NEED_ABOVE = 1 << 2,
+ NEED_ABOVERIGHT = 1 << 3,
+};
+
+static const uint8_t extend_modes[INTRA_MODES] = {
+ NEED_ABOVE | NEED_LEFT, // DC
+ NEED_ABOVE, // V
+ NEED_LEFT, // H
+ NEED_ABOVERIGHT, // D45
+ NEED_LEFT | NEED_ABOVE, // D135
+ NEED_LEFT | NEED_ABOVE, // D117
+ NEED_LEFT | NEED_ABOVE, // D153
+ NEED_LEFT, // D207
+ NEED_ABOVERIGHT, // D63
+ NEED_LEFT | NEED_ABOVE, // TM
+};
+
// This serves as a wrapper function, so that all the prediction functions
// can be unified and accessed as a pointer array. Note that the boundary
// pixels above and to the left are not necessarily used all the time.
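The extend_modes table added above lets build_intra_predictors() prepare only the borders a given mode actually reads, instead of always extending 2 * bs above pixels. A sketch of the gating this enables; the fill values 129 and 127 match the unavailable-edge constants used further down:

/* Sketch: per-mode border preparation keyed off extend_modes[]. */
static void prepare_borders_sketch(PREDICTION_MODE mode) {
  if (extend_modes[mode] & NEED_LEFT) {
    /* fill left_col[0..bs-1] from ref, else memset(left_col, 129, bs) */
  }
  if (extend_modes[mode] & NEED_ABOVE) {
    /* fill above_row[0..bs-1], else memset(above_row, 127, bs) */
  }
  if (extend_modes[mode] & NEED_ABOVERIGHT) {
    /* fill above_row[0..2*bs-1], extending past the frame edge */
  }
}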
@@ -225,7 +245,7 @@ static INLINE void highbd_v_predictor(uint16_t *dst, ptrdiff_t stride,
(void) left;
(void) bd;
for (r = 0; r < bs; r++) {
- vpx_memcpy(dst, above, bs * sizeof(uint16_t));
+ memcpy(dst, above, bs * sizeof(uint16_t));
dst += stride;
}
}
@@ -468,7 +488,7 @@ static INLINE void v_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
(void) left;
for (r = 0; r < bs; r++) {
- vpx_memcpy(dst, above, bs);
+ memcpy(dst, above, bs);
dst += stride;
}
}
@@ -480,7 +500,7 @@ static INLINE void h_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
(void) above;
for (r = 0; r < bs; r++) {
- vpx_memset(dst, left[r], bs);
+ memset(dst, left[r], bs);
dst += stride;
}
}
@@ -506,7 +526,7 @@ static INLINE void dc_128_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
(void) left;
for (r = 0; r < bs; r++) {
- vpx_memset(dst, 128, bs);
+ memset(dst, 128, bs);
dst += stride;
}
}
@@ -523,7 +543,7 @@ static INLINE void dc_left_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
expected_dc = (sum + (bs >> 1)) / bs;
for (r = 0; r < bs; r++) {
- vpx_memset(dst, expected_dc, bs);
+ memset(dst, expected_dc, bs);
dst += stride;
}
}
@@ -539,7 +559,7 @@ static INLINE void dc_top_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
expected_dc = (sum + (bs >> 1)) / bs;
for (r = 0; r < bs; r++) {
- vpx_memset(dst, expected_dc, bs);
+ memset(dst, expected_dc, bs);
dst += stride;
}
}
@@ -558,7 +578,7 @@ static INLINE void dc_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
expected_dc = (sum + (count >> 1)) / count;
for (r = 0; r < bs; r++) {
- vpx_memset(dst, expected_dc, bs);
+ memset(dst, expected_dc, bs);
dst += stride;
}
}
@@ -579,7 +599,7 @@ static intra_high_pred_fn pred_high[INTRA_MODES][4];
static intra_high_pred_fn dc_pred_high[2][2][4];
#endif // CONFIG_VP9_HIGHBITDEPTH
-void vp9_init_intra_predictors() {
+static void vp9_init_intra_predictors_internal(void) {
#define INIT_ALL_SIZES(p, type) \
p[TX_4X4] = vp9_##type##_predictor_4x4; \
p[TX_8X8] = vp9_##type##_predictor_8x8; \
@@ -637,8 +657,8 @@ static void build_intra_predictors_high(const MACROBLOCKD *xd,
int i;
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
- DECLARE_ALIGNED_ARRAY(16, uint16_t, left_col, 64);
- DECLARE_ALIGNED_ARRAY(16, uint16_t, above_data, 128 + 16);
+ DECLARE_ALIGNED(16, uint16_t, left_col[32]);
+ DECLARE_ALIGNED(16, uint16_t, above_data[128 + 16]);
uint16_t *above_row = above_data + 16;
const uint16_t *const_above_row = above_row;
const int bs = 4 << tx_size;
@@ -698,32 +718,26 @@ static void build_intra_predictors_high(const MACROBLOCKD *xd,
/* slower path if the block needs border extension */
if (x0 + 2 * bs <= frame_width) {
if (right_available && bs == 4) {
- vpx_memcpy(above_row, above_ref, 2 * bs * sizeof(uint16_t));
+ memcpy(above_row, above_ref, 2 * bs * sizeof(uint16_t));
} else {
- vpx_memcpy(above_row, above_ref, bs * sizeof(uint16_t));
+ memcpy(above_row, above_ref, bs * sizeof(uint16_t));
vpx_memset16(above_row + bs, above_row[bs - 1], bs);
}
} else if (x0 + bs <= frame_width) {
const int r = frame_width - x0;
if (right_available && bs == 4) {
- vpx_memcpy(above_row, above_ref, r * sizeof(uint16_t));
+ memcpy(above_row, above_ref, r * sizeof(uint16_t));
vpx_memset16(above_row + r, above_row[r - 1],
x0 + 2 * bs - frame_width);
} else {
- vpx_memcpy(above_row, above_ref, bs * sizeof(uint16_t));
+ memcpy(above_row, above_ref, bs * sizeof(uint16_t));
vpx_memset16(above_row + bs, above_row[bs - 1], bs);
}
} else if (x0 <= frame_width) {
const int r = frame_width - x0;
- if (right_available && bs == 4) {
- vpx_memcpy(above_row, above_ref, r * sizeof(uint16_t));
- vpx_memset16(above_row + r, above_row[r - 1],
- x0 + 2 * bs - frame_width);
- } else {
- vpx_memcpy(above_row, above_ref, r * sizeof(uint16_t));
- vpx_memset16(above_row + r, above_row[r - 1],
+ memcpy(above_row, above_ref, r * sizeof(uint16_t));
+ vpx_memset16(above_row + r, above_row[r - 1],
x0 + 2 * bs - frame_width);
- }
}
// TODO(Peter) this value should probably change for high bitdepth
above_row[-1] = left_available ? above_ref[-1] : (base+1);
@@ -732,9 +746,9 @@ static void build_intra_predictors_high(const MACROBLOCKD *xd,
if (bs == 4 && right_available && left_available) {
const_above_row = above_ref;
} else {
- vpx_memcpy(above_row, above_ref, bs * sizeof(uint16_t));
+ memcpy(above_row, above_ref, bs * sizeof(uint16_t));
if (bs == 4 && right_available)
- vpx_memcpy(above_row + bs, above_ref + bs, bs * sizeof(uint16_t));
+ memcpy(above_row + bs, above_ref + bs, bs * sizeof(uint16_t));
else
vpx_memset16(above_row + bs, above_row[bs - 1], bs);
// TODO(Peter): this value should probably change for high bitdepth
@@ -766,8 +780,8 @@ static void build_intra_predictors(const MACROBLOCKD *xd, const uint8_t *ref,
int right_available, int x, int y,
int plane) {
int i;
- DECLARE_ALIGNED_ARRAY(16, uint8_t, left_col, 64);
- DECLARE_ALIGNED_ARRAY(16, uint8_t, above_data, 128 + 16);
+ DECLARE_ALIGNED(16, uint8_t, left_col[32]);
+ DECLARE_ALIGNED(16, uint8_t, above_data[128 + 16]);
uint8_t *above_row = above_data + 16;
const uint8_t *const_above_row = above_row;
const int bs = 4 << tx_size;
@@ -795,81 +809,103 @@ static void build_intra_predictors(const MACROBLOCKD *xd, const uint8_t *ref,
x0 = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x)) + x;
y0 = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y)) + y;
- vpx_memset(left_col, 129, 64);
-
- // left
- if (left_available) {
- if (xd->mb_to_bottom_edge < 0) {
- /* slower path if the block needs border extension */
- if (y0 + bs <= frame_height) {
- for (i = 0; i < bs; ++i)
- left_col[i] = ref[i * ref_stride - 1];
+ // NEED_LEFT
+ if (extend_modes[mode] & NEED_LEFT) {
+ if (left_available) {
+ if (xd->mb_to_bottom_edge < 0) {
+ /* slower path if the block needs border extension */
+ if (y0 + bs <= frame_height) {
+ for (i = 0; i < bs; ++i)
+ left_col[i] = ref[i * ref_stride - 1];
+ } else {
+ const int extend_bottom = frame_height - y0;
+ for (i = 0; i < extend_bottom; ++i)
+ left_col[i] = ref[i * ref_stride - 1];
+ for (; i < bs; ++i)
+ left_col[i] = ref[(extend_bottom - 1) * ref_stride - 1];
+ }
} else {
- const int extend_bottom = frame_height - y0;
- for (i = 0; i < extend_bottom; ++i)
+ /* faster path if the block does not need extension */
+ for (i = 0; i < bs; ++i)
left_col[i] = ref[i * ref_stride - 1];
- for (; i < bs; ++i)
- left_col[i] = ref[(extend_bottom - 1) * ref_stride - 1];
}
} else {
- /* faster path if the block does not need extension */
- for (i = 0; i < bs; ++i)
- left_col[i] = ref[i * ref_stride - 1];
+ memset(left_col, 129, bs);
}
}
- // TODO(hkuang) do not extend 2*bs pixels for all modes.
- // above
- if (up_available) {
- const uint8_t *above_ref = ref - ref_stride;
- if (xd->mb_to_right_edge < 0) {
- /* slower path if the block needs border extension */
- if (x0 + 2 * bs <= frame_width) {
- if (right_available && bs == 4) {
- vpx_memcpy(above_row, above_ref, 2 * bs);
- } else {
- vpx_memcpy(above_row, above_ref, bs);
- vpx_memset(above_row + bs, above_row[bs - 1], bs);
- }
- } else if (x0 + bs <= frame_width) {
- const int r = frame_width - x0;
- if (right_available && bs == 4) {
- vpx_memcpy(above_row, above_ref, r);
- vpx_memset(above_row + r, above_row[r - 1],
- x0 + 2 * bs - frame_width);
- } else {
- vpx_memcpy(above_row, above_ref, bs);
- vpx_memset(above_row + bs, above_row[bs - 1], bs);
+ // NEED_ABOVE
+ if (extend_modes[mode] & NEED_ABOVE) {
+ if (up_available) {
+ const uint8_t *above_ref = ref - ref_stride;
+ if (xd->mb_to_right_edge < 0) {
+ /* slower path if the block needs border extension */
+ if (x0 + bs <= frame_width) {
+ memcpy(above_row, above_ref, bs);
+ } else if (x0 <= frame_width) {
+ const int r = frame_width - x0;
+ memcpy(above_row, above_ref, r);
+ memset(above_row + r, above_row[r - 1], x0 + bs - frame_width);
}
- } else if (x0 <= frame_width) {
- const int r = frame_width - x0;
- if (right_available && bs == 4) {
- vpx_memcpy(above_row, above_ref, r);
- vpx_memset(above_row + r, above_row[r - 1],
- x0 + 2 * bs - frame_width);
+ } else {
+ /* faster path if the block does not need extension */
+ if (bs == 4 && right_available && left_available) {
+ const_above_row = above_ref;
} else {
- vpx_memcpy(above_row, above_ref, r);
- vpx_memset(above_row + r, above_row[r - 1],
- x0 + 2 * bs - frame_width);
+ memcpy(above_row, above_ref, bs);
}
}
above_row[-1] = left_available ? above_ref[-1] : 129;
} else {
- /* faster path if the block does not need extension */
- if (bs == 4 && right_available && left_available) {
- const_above_row = above_ref;
+ memset(above_row, 127, bs);
+ above_row[-1] = 127;
+ }
+ }
+
+ // NEED_ABOVERIGHT
+ if (extend_modes[mode] & NEED_ABOVERIGHT) {
+ if (up_available) {
+ const uint8_t *above_ref = ref - ref_stride;
+ if (xd->mb_to_right_edge < 0) {
+ /* slower path if the block needs border extension */
+ if (x0 + 2 * bs <= frame_width) {
+ if (right_available && bs == 4) {
+ memcpy(above_row, above_ref, 2 * bs);
+ } else {
+ memcpy(above_row, above_ref, bs);
+ memset(above_row + bs, above_row[bs - 1], bs);
+ }
+ } else if (x0 + bs <= frame_width) {
+ const int r = frame_width - x0;
+ if (right_available && bs == 4) {
+ memcpy(above_row, above_ref, r);
+ memset(above_row + r, above_row[r - 1], x0 + 2 * bs - frame_width);
+ } else {
+ memcpy(above_row, above_ref, bs);
+ memset(above_row + bs, above_row[bs - 1], bs);
+ }
+ } else if (x0 <= frame_width) {
+ const int r = frame_width - x0;
+ memcpy(above_row, above_ref, r);
+ memset(above_row + r, above_row[r - 1], x0 + 2 * bs - frame_width);
+ }
} else {
- vpx_memcpy(above_row, above_ref, bs);
- if (bs == 4 && right_available)
- vpx_memcpy(above_row + bs, above_ref + bs, bs);
- else
- vpx_memset(above_row + bs, above_row[bs - 1], bs);
- above_row[-1] = left_available ? above_ref[-1] : 129;
+ /* faster path if the block does not need extension */
+ if (bs == 4 && right_available && left_available) {
+ const_above_row = above_ref;
+ } else {
+ memcpy(above_row, above_ref, bs);
+ if (bs == 4 && right_available)
+ memcpy(above_row + bs, above_ref + bs, bs);
+ else
+ memset(above_row + bs, above_row[bs - 1], bs);
+ }
}
+ above_row[-1] = left_available ? above_ref[-1] : 129;
+ } else {
+ memset(above_row, 127, bs * 2);
+ above_row[-1] = 127;
}
- } else {
- vpx_memset(above_row, 127, bs * 2);
- above_row[-1] = 127;
}
// predict
@@ -906,3 +942,7 @@ void vp9_predict_intra_block(const MACROBLOCKD *xd, int block_idx, int bwl_in,
build_intra_predictors(xd, ref, ref_stride, dst, dst_stride, mode, tx_size,
have_top, have_left, have_right, x, y, plane);
}
+
+void vp9_init_intra_predictors() {
+ once(vp9_init_intra_predictors_internal);
+}
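Wrapping the table setup in once() makes vp9_init_intra_predictors() idempotent and safe under concurrent callers. A sketch of the idiom with hypothetical names, assuming the vpx_once.h helper included above:

/* Sketch of the once() idiom now shared by the rtcd and predictor
 * setup paths: the internal initializer runs exactly once no matter
 * how many threads race into the public entry point. */
#include "vpx_ports/vpx_once.h"

static void init_tables_internal(void) {
  /* populate function-pointer tables here */
}

void init_tables(void) {
  once(init_tables_internal);
}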
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_rtcd.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_rtcd.c
index dc15a84ff17..2dfa09f50e0 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_rtcd.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_rtcd.c
@@ -12,9 +12,8 @@
#include "./vp9_rtcd.h"
#include "vpx_ports/vpx_once.h"
-void vpx_scale_rtcd(void);
-
void vp9_rtcd() {
- vpx_scale_rtcd();
+ // TODO(JBB): Remove this once, by ensuring that both the encoder and
+ // decoder setup functions are protected by once();
once(setup_rtcd_internal);
}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_rtcd_defs.pl b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_rtcd_defs.pl
index 0530f3a303e..d05afa525c3 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_rtcd_defs.pl
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_rtcd_defs.pl
@@ -66,8 +66,7 @@ add_proto qw/void vp9_d63_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, con
specialize qw/vp9_d63_predictor_4x4/, "$ssse3_x86inc";
add_proto qw/void vp9_h_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_h_predictor_4x4 neon_asm dspr2/, "$ssse3_x86inc";
-$vp9_h_predictor_4x4_neon_asm=vp9_h_predictor_4x4_neon;
+specialize qw/vp9_h_predictor_4x4 neon dspr2/, "$ssse3_x86inc";
add_proto qw/void vp9_d117_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
specialize qw/vp9_d117_predictor_4x4/;
@@ -79,24 +78,22 @@ add_proto qw/void vp9_d153_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, co
specialize qw/vp9_d153_predictor_4x4/, "$ssse3_x86inc";
add_proto qw/void vp9_v_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_v_predictor_4x4 neon_asm/, "$sse_x86inc";
-$vp9_v_predictor_4x4_neon_asm=vp9_v_predictor_4x4_neon;
+specialize qw/vp9_v_predictor_4x4 neon/, "$sse_x86inc";
add_proto qw/void vp9_tm_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_tm_predictor_4x4 neon_asm dspr2/, "$sse_x86inc";
-$vp9_tm_predictor_4x4_neon_asm=vp9_tm_predictor_4x4_neon;
+specialize qw/vp9_tm_predictor_4x4 neon dspr2/, "$sse_x86inc";
add_proto qw/void vp9_dc_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
specialize qw/vp9_dc_predictor_4x4 dspr2/, "$sse_x86inc";
add_proto qw/void vp9_dc_top_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_top_predictor_4x4/;
+specialize qw/vp9_dc_top_predictor_4x4/, "$sse_x86inc";
add_proto qw/void vp9_dc_left_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_left_predictor_4x4/;
+specialize qw/vp9_dc_left_predictor_4x4/, "$sse_x86inc";
add_proto qw/void vp9_dc_128_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_128_predictor_4x4/;
+specialize qw/vp9_dc_128_predictor_4x4/, "$sse_x86inc";
add_proto qw/void vp9_d207_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
specialize qw/vp9_d207_predictor_8x8/, "$ssse3_x86inc";
@@ -108,8 +105,7 @@ add_proto qw/void vp9_d63_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, con
specialize qw/vp9_d63_predictor_8x8/, "$ssse3_x86inc";
add_proto qw/void vp9_h_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_h_predictor_8x8 neon_asm dspr2/, "$ssse3_x86inc";
-$vp9_h_predictor_8x8_neon_asm=vp9_h_predictor_8x8_neon;
+specialize qw/vp9_h_predictor_8x8 neon dspr2/, "$ssse3_x86inc";
add_proto qw/void vp9_d117_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
specialize qw/vp9_d117_predictor_8x8/;
@@ -121,24 +117,22 @@ add_proto qw/void vp9_d153_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, co
specialize qw/vp9_d153_predictor_8x8/, "$ssse3_x86inc";
add_proto qw/void vp9_v_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_v_predictor_8x8 neon_asm/, "$sse_x86inc";
-$vp9_v_predictor_8x8_neon_asm=vp9_v_predictor_8x8_neon;
+specialize qw/vp9_v_predictor_8x8 neon/, "$sse_x86inc";
add_proto qw/void vp9_tm_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_tm_predictor_8x8 neon_asm dspr2/, "$sse2_x86inc";
-$vp9_tm_predictor_8x8_neon_asm=vp9_tm_predictor_8x8_neon;
+specialize qw/vp9_tm_predictor_8x8 neon dspr2/, "$sse2_x86inc";
add_proto qw/void vp9_dc_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
specialize qw/vp9_dc_predictor_8x8 dspr2/, "$sse_x86inc";
add_proto qw/void vp9_dc_top_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_top_predictor_8x8/;
+specialize qw/vp9_dc_top_predictor_8x8/, "$sse_x86inc";
add_proto qw/void vp9_dc_left_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_left_predictor_8x8/;
+specialize qw/vp9_dc_left_predictor_8x8/, "$sse_x86inc";
add_proto qw/void vp9_dc_128_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_128_predictor_8x8/;
+specialize qw/vp9_dc_128_predictor_8x8/, "$sse_x86inc";
add_proto qw/void vp9_d207_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
specialize qw/vp9_d207_predictor_16x16/, "$ssse3_x86inc";
@@ -150,8 +144,7 @@ add_proto qw/void vp9_d63_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, c
specialize qw/vp9_d63_predictor_16x16/, "$ssse3_x86inc";
add_proto qw/void vp9_h_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_h_predictor_16x16 neon_asm dspr2/, "$ssse3_x86inc";
-$vp9_h_predictor_16x16_neon_asm=vp9_h_predictor_16x16_neon;
+specialize qw/vp9_h_predictor_16x16 neon dspr2/, "$ssse3_x86inc";
add_proto qw/void vp9_d117_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
specialize qw/vp9_d117_predictor_16x16/;
@@ -163,24 +156,22 @@ add_proto qw/void vp9_d153_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride,
specialize qw/vp9_d153_predictor_16x16/, "$ssse3_x86inc";
add_proto qw/void vp9_v_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_v_predictor_16x16 neon_asm/, "$sse2_x86inc";
-$vp9_v_predictor_16x16_neon_asm=vp9_v_predictor_16x16_neon;
+specialize qw/vp9_v_predictor_16x16 neon/, "$sse2_x86inc";
add_proto qw/void vp9_tm_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_tm_predictor_16x16 neon_asm/, "$sse2_x86inc";
-$vp9_tm_predictor_16x16_neon_asm=vp9_tm_predictor_16x16_neon;
+specialize qw/vp9_tm_predictor_16x16 neon/, "$sse2_x86inc";
add_proto qw/void vp9_dc_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
specialize qw/vp9_dc_predictor_16x16 dspr2/, "$sse2_x86inc";
add_proto qw/void vp9_dc_top_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_top_predictor_16x16/;
+specialize qw/vp9_dc_top_predictor_16x16/, "$sse2_x86inc";
add_proto qw/void vp9_dc_left_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_left_predictor_16x16/;
+specialize qw/vp9_dc_left_predictor_16x16/, "$sse2_x86inc";
add_proto qw/void vp9_dc_128_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_128_predictor_16x16/;
+specialize qw/vp9_dc_128_predictor_16x16/, "$sse2_x86inc";
add_proto qw/void vp9_d207_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
specialize qw/vp9_d207_predictor_32x32/, "$ssse3_x86inc";
@@ -192,8 +183,7 @@ add_proto qw/void vp9_d63_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, c
specialize qw/vp9_d63_predictor_32x32/, "$ssse3_x86inc";
add_proto qw/void vp9_h_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_h_predictor_32x32 neon_asm/, "$ssse3_x86inc";
-$vp9_h_predictor_32x32_neon_asm=vp9_h_predictor_32x32_neon;
+specialize qw/vp9_h_predictor_32x32 neon/, "$ssse3_x86inc";
add_proto qw/void vp9_d117_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
specialize qw/vp9_d117_predictor_32x32/;
@@ -205,24 +195,22 @@ add_proto qw/void vp9_d153_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride,
specialize qw/vp9_d153_predictor_32x32/;
add_proto qw/void vp9_v_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_v_predictor_32x32 neon_asm/, "$sse2_x86inc";
-$vp9_v_predictor_32x32_neon_asm=vp9_v_predictor_32x32_neon;
+specialize qw/vp9_v_predictor_32x32 neon/, "$sse2_x86inc";
add_proto qw/void vp9_tm_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_tm_predictor_32x32 neon_asm/, "$sse2_x86_64";
-$vp9_tm_predictor_32x32_neon_asm=vp9_tm_predictor_32x32_neon;
+specialize qw/vp9_tm_predictor_32x32 neon/, "$sse2_x86_64";
add_proto qw/void vp9_dc_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
specialize qw/vp9_dc_predictor_32x32/, "$sse2_x86inc";
add_proto qw/void vp9_dc_top_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_top_predictor_32x32/;
+specialize qw/vp9_dc_top_predictor_32x32/, "$sse2_x86inc";
add_proto qw/void vp9_dc_left_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_left_predictor_32x32/;
+specialize qw/vp9_dc_left_predictor_32x32/, "$sse2_x86inc";
add_proto qw/void vp9_dc_128_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vp9_dc_128_predictor_32x32/;
+specialize qw/vp9_dc_128_predictor_32x32/, "$sse2_x86inc";
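For context on the pattern running through this hunk: `add_proto` declares the C signature of a codec entry point and `specialize` lists the per-ISA variants the generated vp9_rtcd.h may dispatch to. The rename from `neon_asm` to `neon` means the NEON builds now register under the canonical `_neon` suffix directly, so the explicit `$..._neon_asm=..._neon` alias lines become unnecessary. A minimal sketch of the dispatch idea, not the generated header, with illustrative names:

    /* Sketch of RTCD-style dispatch: one C reference implementation plus a
     * function pointer that init code would repoint at the best available
     * ISA variant (e.g. _sse or _neon). */
    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    static void vp9_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
      (void)left;  /* vertical prediction only uses the row above */
      for (int r = 0; r < 4; ++r)
        memcpy(dst + r * y_stride, above, 4);
    }

    /* One pointer per prototype; CPU-feature probing selects the target. */
    static void (*vp9_v_predictor_4x4)(uint8_t *, ptrdiff_t, const uint8_t *,
                                       const uint8_t *) = vp9_v_predictor_4x4_c;

    int main(void) {
      const uint8_t above[4] = {1, 2, 3, 4}, left[4] = {0};
      uint8_t dst[16];
      vp9_v_predictor_4x4(dst, 4, above, left);
      return dst[15] == 4 ? 0 : 1;  /* every output row repeats `above` */
    }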
#
# Loopfilter
@@ -244,12 +232,10 @@ specialize qw/vp9_lpf_vertical_8_dual sse2 neon_asm dspr2/;
$vp9_lpf_vertical_8_dual_neon_asm=vp9_lpf_vertical_8_dual_neon;
add_proto qw/void vp9_lpf_vertical_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count";
-specialize qw/vp9_lpf_vertical_4 mmx neon_asm dspr2/;
-$vp9_lpf_vertical_4_neon_asm=vp9_lpf_vertical_4_neon;
+specialize qw/vp9_lpf_vertical_4 mmx neon dspr2/;
add_proto qw/void vp9_lpf_vertical_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
-specialize qw/vp9_lpf_vertical_4_dual sse2 neon_asm dspr2/;
-$vp9_lpf_vertical_4_dual_neon_asm=vp9_lpf_vertical_4_dual_neon;
+specialize qw/vp9_lpf_vertical_4_dual sse2 neon dspr2/;
add_proto qw/void vp9_lpf_horizontal_16/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count";
specialize qw/vp9_lpf_horizontal_16 sse2 avx2 neon_asm dspr2/;
@@ -264,12 +250,10 @@ specialize qw/vp9_lpf_horizontal_8_dual sse2 neon_asm dspr2/;
$vp9_lpf_horizontal_8_dual_neon_asm=vp9_lpf_horizontal_8_dual_neon;
add_proto qw/void vp9_lpf_horizontal_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count";
-specialize qw/vp9_lpf_horizontal_4 mmx neon_asm dspr2/;
-$vp9_lpf_horizontal_4_neon_asm=vp9_lpf_horizontal_4_neon;
+specialize qw/vp9_lpf_horizontal_4 mmx neon dspr2/;
add_proto qw/void vp9_lpf_horizontal_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
-specialize qw/vp9_lpf_horizontal_4_dual sse2 neon_asm dspr2/;
-$vp9_lpf_horizontal_4_dual_neon_asm=vp9_lpf_horizontal_4_dual_neon;
+specialize qw/vp9_lpf_horizontal_4_dual sse2 neon dspr2/;
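In these loop-filter prototypes, `blimit`, `limit` and `thresh` are per-edge strength thresholds: pixels are only smoothed where the step across the block edge and the local gradients on each side are small enough. A hedged sketch of that gating test for the 4-tap case (the real mask additionally uses `thresh` for a high-edge-variance check):

    #include <stdint.h>
    #include <stdlib.h>

    /* p1,p0 sit on one side of the edge, q0,q1 on the other. */
    static int filter_mask4(uint8_t p1, uint8_t p0, uint8_t q0, uint8_t q1,
                            int blimit, int limit) {
      return abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= blimit &&  /* edge step */
             abs(p1 - p0) <= limit && abs(q1 - q0) <= limit;   /* flat sides */
    }

    int main(void) {
      /* Step of 10 with matching p1/q1: 2*10 + 10/2 = 25 <= blimit, so the
       * edge qualifies for filtering. */
      return filter_mask4(100, 100, 110, 110, 25, 3) ? 0 : 1;
    }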
#
# post proc
@@ -290,42 +274,40 @@ $vp9_post_proc_down_and_across_sse2=vp9_post_proc_down_and_across_xmm;
add_proto qw/void vp9_plane_add_noise/, "uint8_t *Start, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int Width, unsigned int Height, int Pitch";
specialize qw/vp9_plane_add_noise sse2/;
$vp9_plane_add_noise_sse2=vp9_plane_add_noise_wmt;
+
+add_proto qw/void vp9_filter_by_weight16x16/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int src_weight";
+specialize qw/vp9_filter_by_weight16x16 sse2/;
+
+add_proto qw/void vp9_filter_by_weight8x8/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int src_weight";
+specialize qw/vp9_filter_by_weight8x8 sse2/;
}
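The two `vp9_filter_by_weight*` hooks added above feed the MFQE (multi-frame quality enhancement) post-processing path, which blends each block with its co-located block from a previous good frame. A sketch of the blend, assuming a 4-bit fixed-point weight scheme (source and destination weights summing to 16, round-to-nearest):

    #include <stdint.h>

    enum { PREC = 4, HALF = 1 << (PREC - 1) };

    /* Weighted average of src into dst; src_weight assumed in [0, 16]. */
    static void filter_by_weight(const uint8_t *src, int src_stride,
                                 uint8_t *dst, int dst_stride,
                                 int size, int src_weight) {
      const int dst_weight = (1 << PREC) - src_weight;
      for (int r = 0; r < size; ++r) {
        for (int c = 0; c < size; ++c)
          dst[c] = (uint8_t)((src[c] * src_weight + dst[c] * dst_weight + HALF)
                             >> PREC);
        src += src_stride;
        dst += dst_stride;
      }
    }

    int main(void) {
      uint8_t s[4] = {16, 16, 16, 16}, d[4] = {0, 0, 0, 0};
      filter_by_weight(s, 2, d, 2, 2, 8);  /* equal weights: plain average */
      return d[0] == 8 ? 0 : 1;
    }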
#
# Sub Pixel Filters
#
add_proto qw/void vp9_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vp9_convolve_copy neon_asm dspr2/, "$sse2_x86inc";
-$vp9_convolve_copy_neon_asm=vp9_convolve_copy_neon;
+specialize qw/vp9_convolve_copy neon dspr2 msa/, "$sse2_x86inc";
add_proto qw/void vp9_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vp9_convolve_avg neon_asm dspr2/, "$sse2_x86inc";
-$vp9_convolve_avg_neon_asm=vp9_convolve_avg_neon;
+specialize qw/vp9_convolve_avg neon dspr2 msa/, "$sse2_x86inc";
add_proto qw/void vp9_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vp9_convolve8 sse2 ssse3 neon_asm dspr2/, "$avx2_ssse3";
-$vp9_convolve8_neon_asm=vp9_convolve8_neon;
+specialize qw/vp9_convolve8 sse2 ssse3 neon dspr2 msa/, "$avx2_ssse3";
add_proto qw/void vp9_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vp9_convolve8_horiz sse2 ssse3 neon_asm dspr2/, "$avx2_ssse3";
-$vp9_convolve8_horiz_neon_asm=vp9_convolve8_horiz_neon;
+specialize qw/vp9_convolve8_horiz sse2 ssse3 neon dspr2 msa/, "$avx2_ssse3";
add_proto qw/void vp9_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vp9_convolve8_vert sse2 ssse3 neon_asm dspr2/, "$avx2_ssse3";
-$vp9_convolve8_vert_neon_asm=vp9_convolve8_vert_neon;
+specialize qw/vp9_convolve8_vert sse2 ssse3 neon dspr2 msa/, "$avx2_ssse3";
add_proto qw/void vp9_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vp9_convolve8_avg sse2 ssse3 neon_asm dspr2/;
-$vp9_convolve8_avg_neon_asm=vp9_convolve8_avg_neon;
+specialize qw/vp9_convolve8_avg sse2 ssse3 neon dspr2/;
add_proto qw/void vp9_convolve8_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vp9_convolve8_avg_horiz sse2 ssse3 neon_asm dspr2/;
-$vp9_convolve8_avg_horiz_neon_asm=vp9_convolve8_avg_horiz_neon;
+specialize qw/vp9_convolve8_avg_horiz sse2 ssse3 neon dspr2/;
add_proto qw/void vp9_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vp9_convolve8_avg_vert sse2 ssse3 neon_asm dspr2/;
-$vp9_convolve8_avg_vert_neon_asm=vp9_convolve8_avg_vert_neon;
+specialize qw/vp9_convolve8_avg_vert sse2 ssse3 neon dspr2/;
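`msa`, newly appearing in several specialization lists here, is the MIPS SIMD Architecture backend. Of the convolve family, `vp9_convolve_copy` is the degenerate case worth spelling out: its filter and step arguments exist only so it can share the common prototype, and the behaviour is a plain block copy. A sketch with the unused filter arguments omitted:

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    /* Block copy; the real prototype also carries filter_x/filter_y and
     * x_step_q4/y_step_q4, which this path ignores. */
    static void convolve_copy(const uint8_t *src, ptrdiff_t src_stride,
                              uint8_t *dst, ptrdiff_t dst_stride,
                              int w, int h) {
      for (int y = 0; y < h; ++y) {
        memcpy(dst, src, (size_t)w);
        src += src_stride;
        dst += dst_stride;
      }
    }

    int main(void) {
      uint8_t src[8] = {1, 2, 3, 4, 5, 6, 7, 8}, dst[8] = {0};
      convolve_copy(src, 4, dst, 4, 4, 2);  /* copy a 4x2 block */
      return dst[7] == 8 ? 0 : 1;
    }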
#
# dct
@@ -437,59 +419,48 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vp9_iwht4x4_16_add/;
} else {
add_proto qw/void vp9_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/vp9_idct4x4_1_add sse2 neon_asm dspr2/;
- $vp9_idct4x4_1_add_neon_asm=vp9_idct4x4_1_add_neon;
+ specialize qw/vp9_idct4x4_1_add sse2 neon dspr2/;
add_proto qw/void vp9_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/vp9_idct4x4_16_add sse2 neon_asm dspr2/;
- $vp9_idct4x4_16_add_neon_asm=vp9_idct4x4_16_add_neon;
+ specialize qw/vp9_idct4x4_16_add sse2 neon dspr2/;
add_proto qw/void vp9_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/vp9_idct8x8_1_add sse2 neon_asm dspr2/;
- $vp9_idct8x8_1_add_neon_asm=vp9_idct8x8_1_add_neon;
+ specialize qw/vp9_idct8x8_1_add sse2 neon dspr2/;
add_proto qw/void vp9_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/vp9_idct8x8_64_add sse2 neon_asm dspr2/, "$ssse3_x86_64";
- $vp9_idct8x8_64_add_neon_asm=vp9_idct8x8_64_add_neon;
+ specialize qw/vp9_idct8x8_64_add sse2 neon dspr2/, "$ssse3_x86_64";
add_proto qw/void vp9_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/vp9_idct8x8_12_add sse2 neon_asm dspr2/, "$ssse3_x86_64";
- $vp9_idct8x8_12_add_neon_asm=vp9_idct8x8_12_add_neon;
+ specialize qw/vp9_idct8x8_12_add sse2 neon dspr2/, "$ssse3_x86_64";
add_proto qw/void vp9_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/vp9_idct16x16_1_add sse2 neon_asm dspr2/;
- $vp9_idct16x16_1_add_neon_asm=vp9_idct16x16_1_add_neon;
+ specialize qw/vp9_idct16x16_1_add sse2 neon dspr2 msa/;
add_proto qw/void vp9_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/vp9_idct16x16_256_add sse2 ssse3 neon_asm dspr2/;
- $vp9_idct16x16_256_add_neon_asm=vp9_idct16x16_256_add_neon;
+ specialize qw/vp9_idct16x16_256_add sse2 neon dspr2 msa/;
add_proto qw/void vp9_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/vp9_idct16x16_10_add sse2 ssse3 neon_asm dspr2/;
- $vp9_idct16x16_10_add_neon_asm=vp9_idct16x16_10_add_neon;
+ specialize qw/vp9_idct16x16_10_add sse2 neon dspr2 msa/;
add_proto qw/void vp9_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/vp9_idct32x32_1024_add sse2 neon_asm dspr2/;
- $vp9_idct32x32_1024_add_neon_asm=vp9_idct32x32_1024_add_neon;
+ specialize qw/vp9_idct32x32_1024_add sse2 neon dspr2 msa/;
add_proto qw/void vp9_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/vp9_idct32x32_34_add sse2 neon_asm dspr2/;
+ specialize qw/vp9_idct32x32_34_add sse2 neon_asm dspr2 msa/;
+ # NOTE: the alias below maps the 34-coefficient NEON hook onto the 1024-coefficient kernel; flagged upstream as a possible typo.
$vp9_idct32x32_34_add_neon_asm=vp9_idct32x32_1024_add_neon;
add_proto qw/void vp9_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/vp9_idct32x32_1_add sse2 neon_asm dspr2/;
- $vp9_idct32x32_1_add_neon_asm=vp9_idct32x32_1_add_neon;
+ specialize qw/vp9_idct32x32_1_add sse2 neon dspr2 msa/;
add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
- specialize qw/vp9_iht4x4_16_add sse2 neon_asm dspr2/;
- $vp9_iht4x4_16_add_neon_asm=vp9_iht4x4_16_add_neon;
+ specialize qw/vp9_iht4x4_16_add sse2 neon dspr2/;
add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
- specialize qw/vp9_iht8x8_64_add sse2 neon_asm dspr2/;
- $vp9_iht8x8_64_add_neon_asm=vp9_iht8x8_64_add_neon;
+ specialize qw/vp9_iht8x8_64_add sse2 neon dspr2/;
add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
- specialize qw/vp9_iht16x16_256_add sse2 dspr2/;
+ specialize qw/vp9_iht16x16_256_add sse2 dspr2 msa/;
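The numeric suffixes on these inverse transforms (`_1`, `_12`, `_34`, `_256`, `_1024`, ...) encode an upper bound on how many coefficients can be nonzero, so the decoder can pick a cheaper kernel from the end-of-block (eob) position. A sketch of that selection for the 32x32 case, with stub kernels standing in for the real ones:

    #include <stdint.h>

    typedef int16_t tran_low_t;  /* assumption: non-high-bitdepth build */

    /* Stubs standing in for the real transform kernels. */
    static void idct32x32_1_add(const tran_low_t *in, uint8_t *d, int s) {
      (void)in; (void)d; (void)s;
    }
    static void idct32x32_34_add(const tran_low_t *in, uint8_t *d, int s) {
      (void)in; (void)d; (void)s;
    }
    static void idct32x32_1024_add(const tran_low_t *in, uint8_t *d, int s) {
      (void)in; (void)d; (void)s;
    }

    static void idct32x32_add(const tran_low_t *input, uint8_t *dest,
                              int stride, int eob) {
      if (eob == 1)
        idct32x32_1_add(input, dest, stride);     /* DC-only block */
      else if (eob <= 34)
        idct32x32_34_add(input, dest, stride);    /* low-frequency corner */
      else
        idct32x32_1024_add(input, dest, stride);  /* full transform */
    }

    int main(void) {
      tran_low_t coeff[1024] = {7};   /* DC coefficient only */
      uint8_t dest[32 * 32] = {0};
      idct32x32_add(coeff, dest, 32, 1);
      return 0;
    }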
# dct and add
@@ -528,7 +499,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vp9_highbd_d153_predictor_4x4/;
add_proto qw/void vp9_highbd_v_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
- specialize qw/vp9_highbd_v_predictor_4x4 neon/, "$sse_x86inc";
+ specialize qw/vp9_highbd_v_predictor_4x4/, "$sse_x86inc";
add_proto qw/void vp9_highbd_tm_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vp9_highbd_tm_predictor_4x4/, "$sse_x86inc";
@@ -606,7 +577,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vp9_highbd_d153_predictor_16x16/;
add_proto qw/void vp9_highbd_v_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
- specialize qw/vp9_highbd_v_predictor_16x16 neon/, "$sse2_x86inc";
+ specialize qw/vp9_highbd_v_predictor_16x16/, "$sse2_x86inc";
add_proto qw/void vp9_highbd_tm_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vp9_highbd_tm_predictor_16x16/, "$sse2_x86_64";
@@ -750,27 +721,12 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vp9_highbd_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp9_highbd_idct4x4_1_add/;
- add_proto qw/void vp9_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
- specialize qw/vp9_highbd_idct4x4_16_add/;
-
add_proto qw/void vp9_highbd_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp9_highbd_idct8x8_1_add/;
- add_proto qw/void vp9_highbd_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
- specialize qw/vp9_highbd_idct8x8_64_add/;
-
- add_proto qw/void vp9_highbd_idct8x8_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
- specialize qw/vp9_highbd_idct8x8_10_add/;
-
add_proto qw/void vp9_highbd_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp9_highbd_idct16x16_1_add/;
- add_proto qw/void vp9_highbd_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
- specialize qw/vp9_highbd_idct16x16_256_add/;
-
- add_proto qw/void vp9_highbd_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
- specialize qw/vp9_highbd_idct16x16_10_add/;
-
add_proto qw/void vp9_highbd_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp9_highbd_idct32x32_1024_add/;
@@ -796,6 +752,42 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vp9_highbd_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
specialize qw/vp9_highbd_iwht4x4_16_add/;
+
+ # Force C versions if CONFIG_EMULATE_HARDWARE is 1
+ if (vpx_config("CONFIG_EMULATE_HARDWARE") eq "yes") {
+
+ add_proto qw/void vp9_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+ specialize qw/vp9_highbd_idct4x4_16_add/;
+
+ add_proto qw/void vp9_highbd_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+ specialize qw/vp9_highbd_idct8x8_64_add/;
+
+ add_proto qw/void vp9_highbd_idct8x8_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+ specialize qw/vp9_highbd_idct8x8_10_add/;
+
+ add_proto qw/void vp9_highbd_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+ specialize qw/vp9_highbd_idct16x16_256_add/;
+
+ add_proto qw/void vp9_highbd_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+ specialize qw/vp9_highbd_idct16x16_10_add/;
+
+ } else {
+
+ add_proto qw/void vp9_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+ specialize qw/vp9_highbd_idct4x4_16_add sse2/;
+
+ add_proto qw/void vp9_highbd_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+ specialize qw/vp9_highbd_idct8x8_64_add sse2/;
+
+ add_proto qw/void vp9_highbd_idct8x8_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+ specialize qw/vp9_highbd_idct8x8_10_add sse2/;
+
+ add_proto qw/void vp9_highbd_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+ specialize qw/vp9_highbd_idct16x16_256_add sse2/;
+
+ add_proto qw/void vp9_highbd_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+ specialize qw/vp9_highbd_idct16x16_10_add sse2/;
+ }
}
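The new branch above makes `CONFIG_EMULATE_HARDWARE=1` re-declare the high-bitdepth inverse transforms with no SIMD specializations, so hardware-verification builds exercise the bit-exact C reference paths, while normal builds pick up the newly added sse2 versions. The same idea reduced to a compile-time switch, where `EMULATE_HARDWARE` is a stand-in macro:

    #include <stdio.h>

    static void idct_c(void)    { puts("reference C path"); }
    static void idct_sse2(void) { puts("sse2 path"); }

    /* Build with -DEMULATE_HARDWARE to force the reference path. */
    #ifdef EMULATE_HARDWARE
    #define idct idct_c
    #else
    #define idct idct_sse2
    #endif

    int main(void) {
      idct();
      return 0;
    }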
#
@@ -812,16 +804,16 @@ add_proto qw/unsigned int vp9_variance16x32/, "const uint8_t *src_ptr, int sourc
specialize qw/vp9_variance16x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance64x32 avx2/, "$sse2_x86inc";
+specialize qw/vp9_variance64x32 avx2 neon/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance32x64/, "$sse2_x86inc";
+specialize qw/vp9_variance32x64 neon/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_variance32x32 avx2 neon/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance64x64 avx2/, "$sse2_x86inc";
+specialize qw/vp9_variance64x64 avx2 neon/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_variance16x16 avx2 neon/, "$sse2_x86inc";
@@ -851,7 +843,7 @@ add_proto qw/unsigned int vp9_variance4x4/, "const uint8_t *src_ptr, int source_
specialize qw/vp9_variance4x4/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_sub_pixel_variance64x64 avx2/, "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vp9_sub_pixel_variance64x64 avx2 neon/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/unsigned int vp9_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_sub_pixel_avg_variance64x64 avx2/, "$sse2_x86inc", "$ssse3_x86inc";
@@ -930,172 +922,6 @@ specialize qw/vp9_sub_pixel_variance4x4/, "$sse_x86inc", "$ssse3_x86inc";
add_proto qw/unsigned int vp9_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_sub_pixel_avg_variance4x4/, "$sse_x86inc", "$ssse3_x86inc";
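NEON coverage is extended above to the 64x32, 32x64 and 64x64 variance kernels and to the 64x64 sub-pixel variant. All variance kernels share one definition: accumulate the sum and the sum of squares of the source/reference differences, then return sse minus sum squared over the pixel count. A sketch for an arbitrary block:

    #include <stdint.h>

    /* Returns the block variance and writes the raw SSE through *sse. */
    static unsigned variance(const uint8_t *src, int src_stride,
                             const uint8_t *ref, int ref_stride,
                             int w, int h, unsigned *sse) {
      int64_t sum = 0;
      uint64_t sq = 0;
      for (int y = 0; y < h; ++y) {
        for (int x = 0; x < w; ++x) {
          const int d = src[x] - ref[x];
          sum += d;
          sq += (uint64_t)(d * d);
        }
        src += src_stride;
        ref += ref_stride;
      }
      *sse = (unsigned)sq;
      return (unsigned)(sq - (uint64_t)((sum * sum) / (w * h)));
    }

    int main(void) {
      const uint8_t a[4] = {10, 10, 10, 10}, b[4] = {8, 12, 8, 12};
      unsigned sse;
      return variance(a, 2, b, 2, 2, 2, &sse) == 16 ? 0 : 1;
    }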
-add_proto qw/unsigned int vp9_sad64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vp9_sad64x64 neon avx2/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_sad32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vp9_sad32x64 avx2/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_sad64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vp9_sad64x32 avx2/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_sad32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vp9_sad32x16 avx2/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_sad16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vp9_sad16x32/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_sad32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vp9_sad32x32 neon avx2/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_sad16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vp9_sad16x16 neon/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_sad16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vp9_sad16x8/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_sad8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vp9_sad8x16/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_sad8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vp9_sad8x8 neon/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_sad8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vp9_sad8x4/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_sad4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vp9_sad4x8/, "$sse_x86inc";
-
-add_proto qw/unsigned int vp9_sad4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vp9_sad4x4/, "$sse_x86inc";
-
-add_proto qw/unsigned int vp9_sad64x64_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vp9_sad64x64_avg avx2/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_sad32x64_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vp9_sad32x64_avg avx2/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_sad64x32_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vp9_sad64x32_avg avx2/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_sad32x16_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vp9_sad32x16_avg avx2/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_sad16x32_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vp9_sad16x32_avg/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_sad32x32_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vp9_sad32x32_avg avx2/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_sad16x16_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vp9_sad16x16_avg/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_sad16x8_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vp9_sad16x8_avg/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_sad8x16_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vp9_sad8x16_avg/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_sad8x8_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vp9_sad8x8_avg/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_sad8x4_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vp9_sad8x4_avg/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_sad4x8_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vp9_sad4x8_avg/, "$sse_x86inc";
-
-add_proto qw/unsigned int vp9_sad4x4_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vp9_sad4x4_avg/, "$sse_x86inc";
-
-add_proto qw/void vp9_sad64x64x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array";
-specialize qw/vp9_sad64x64x3/;
-
-add_proto qw/void vp9_sad32x32x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array";
-specialize qw/vp9_sad32x32x3/;
-
-add_proto qw/void vp9_sad16x16x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array";
-specialize qw/vp9_sad16x16x3 sse3 ssse3/;
-
-add_proto qw/void vp9_sad16x8x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array";
-specialize qw/vp9_sad16x8x3 sse3 ssse3/;
-
-add_proto qw/void vp9_sad8x16x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array";
-specialize qw/vp9_sad8x16x3 sse3/;
-
-add_proto qw/void vp9_sad8x8x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array";
-specialize qw/vp9_sad8x8x3 sse3/;
-
-add_proto qw/void vp9_sad4x4x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array";
-specialize qw/vp9_sad4x4x3 sse3/;
-
-add_proto qw/void vp9_sad64x64x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vp9_sad64x64x8/;
-
-add_proto qw/void vp9_sad32x32x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vp9_sad32x32x8/;
-
-add_proto qw/void vp9_sad16x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vp9_sad16x16x8 sse4/;
-
-add_proto qw/void vp9_sad16x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vp9_sad16x8x8 sse4/;
-
-add_proto qw/void vp9_sad8x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vp9_sad8x16x8 sse4/;
-
-add_proto qw/void vp9_sad8x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vp9_sad8x8x8 sse4/;
-
-add_proto qw/void vp9_sad8x4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vp9_sad8x4x8/;
-
-add_proto qw/void vp9_sad4x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vp9_sad4x8x8/;
-
-add_proto qw/void vp9_sad4x4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vp9_sad4x4x8 sse4/;
-
-add_proto qw/void vp9_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
-specialize qw/vp9_sad64x64x4d sse2 avx2/;
-
-add_proto qw/void vp9_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
-specialize qw/vp9_sad32x64x4d sse2/;
-
-add_proto qw/void vp9_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
-specialize qw/vp9_sad64x32x4d sse2/;
-
-add_proto qw/void vp9_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
-specialize qw/vp9_sad32x16x4d sse2/;
-
-add_proto qw/void vp9_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
-specialize qw/vp9_sad16x32x4d sse2/;
-
-add_proto qw/void vp9_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
-specialize qw/vp9_sad32x32x4d sse2 avx2/;
-
-add_proto qw/void vp9_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
-specialize qw/vp9_sad16x16x4d sse2/;
-
-add_proto qw/void vp9_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
-specialize qw/vp9_sad16x8x4d sse2/;
-
-add_proto qw/void vp9_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
-specialize qw/vp9_sad8x16x4d sse2/;
-
-add_proto qw/void vp9_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
-specialize qw/vp9_sad8x8x4d sse2/;
-
-# TODO(jingning): need to convert these 4x8/8x4 functions into sse2 form
-add_proto qw/void vp9_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
-specialize qw/vp9_sad8x4x4d sse2/;
-
-add_proto qw/void vp9_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
-specialize qw/vp9_sad4x8x4d sse/;
-
-add_proto qw/void vp9_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
-specialize qw/vp9_sad4x4x4d sse/;
-
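The entire SAD block above, including the `_avg` second-predictor forms, the x3/x8 multi-offset forms, and the x4d four-reference forms, is removed from the VP9-specific RTCD list in this update. The kernel itself is the plain sum of absolute differences over the block:

    #include <stdint.h>
    #include <stdlib.h>

    /* Sum of absolute differences over a w x h block. The removed _avg
     * variants first average a second predictor into ref; the x4d forms
     * evaluate four candidate ref pointers per call. */
    static unsigned sad(const uint8_t *src, int src_stride,
                        const uint8_t *ref, int ref_stride, int w, int h) {
      unsigned acc = 0;
      for (int y = 0; y < h; ++y) {
        for (int x = 0; x < w; ++x) acc += (unsigned)abs(src[x] - ref[x]);
        src += src_stride;
        ref += ref_stride;
      }
      return acc;
    }

    int main(void) {
      const uint8_t a[4] = {1, 2, 3, 4}, b[4] = {4, 3, 2, 1};
      return sad(a, 2, b, 2, 2, 2) == 8 ? 0 : 1;  /* 3 + 1 + 1 + 3 */
    }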
add_proto qw/unsigned int vp9_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vp9_mse16x16 avx2/, "$sse2_x86inc";
@@ -1112,11 +938,39 @@ add_proto qw/unsigned int vp9_get_mb_ss/, "const int16_t *";
specialize qw/vp9_get_mb_ss/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_avg_8x8/, "const uint8_t *, int p";
-specialize qw/vp9_avg_8x8 sse2/;
+specialize qw/vp9_avg_8x8 sse2 neon/;
+
+add_proto qw/unsigned int vp9_avg_4x4/, "const uint8_t *, int p";
+specialize qw/vp9_avg_4x4 sse2/;
+
+add_proto qw/void vp9_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max";
+specialize qw/vp9_minmax_8x8 sse2/;
+
+add_proto qw/void vp9_hadamard_8x8/, "int16_t const *src_diff, int src_stride, int16_t *coeff";
+specialize qw/vp9_hadamard_8x8 sse2/, "$ssse3_x86_64";
+
+add_proto qw/void vp9_hadamard_16x16/, "int16_t const *src_diff, int src_stride, int16_t *coeff";
+specialize qw/vp9_hadamard_16x16 sse2/;
+
+add_proto qw/int16_t vp9_satd/, "const int16_t *coeff, int length";
+specialize qw/vp9_satd sse2/;
+
+add_proto qw/void vp9_int_pro_row/, "int16_t *hbuf, uint8_t const *ref, const int ref_stride, const int height";
+specialize qw/vp9_int_pro_row sse2/;
+
+add_proto qw/int16_t vp9_int_pro_col/, "uint8_t const *ref, const int width";
+specialize qw/vp9_int_pro_col sse2/;
+
+add_proto qw/int vp9_vector_var/, "int16_t const *ref, int16_t const *src, const int bwl";
+specialize qw/vp9_vector_var sse2/;
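The block of new prototypes above (`vp9_avg_*`, `vp9_minmax_8x8`, `vp9_hadamard_*`, `vp9_satd`, `vp9_int_pro_*`, `vp9_vector_var`) supplies cheap block statistics for the speed-oriented encoder paths, where they drive partitioning decisions and coarse motion search. The simplest of them, sketched under the assumption of round-to-nearest averaging (64 pixels, so add 32 before shifting by 6):

    #include <stdint.h>

    /* Mean of an 8x8 block with row stride p, rounded to nearest. */
    static unsigned avg_8x8(const uint8_t *s, int p) {
      unsigned sum = 0;
      for (int r = 0; r < 8; ++r, s += p)
        for (int c = 0; c < 8; ++c) sum += s[c];
      return (sum + 32) >> 6;
    }

    int main(void) {
      uint8_t block[64];
      for (int i = 0; i < 64; ++i) block[i] = (uint8_t)i;  /* mean 31.5 */
      return avg_8x8(block, 8) == 32 ? 0 : 1;              /* rounds up */
    }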
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/unsigned int vp9_highbd_avg_8x8/, "const uint8_t *, int p";
specialize qw/vp9_highbd_avg_8x8/;
+ add_proto qw/unsigned int vp9_highbd_avg_4x4/, "const uint8_t *, int p";
+ specialize qw/vp9_highbd_avg_4x4/;
+ add_proto qw/unsigned int vp9_highbd_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max";
+ specialize qw/vp9_highbd_minmax_8x8/;
}
# ENCODEMB INVOKE
@@ -1138,32 +992,41 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/int64_t vp9_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
specialize qw/vp9_block_error/;
- add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/vp9_quantize_fp/;
- add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/vp9_quantize_fp_32x32/;
- add_proto qw/void vp9_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ add_proto qw/void vp9_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/vp9_quantize_b/;
- add_proto qw/void vp9_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ add_proto qw/void vp9_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/vp9_quantize_b_32x32/;
+
+ add_proto qw/void vp9_fdct8x8_quant/, "const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ specialize qw/vp9_fdct8x8_quant/;
} else {
add_proto qw/int64_t vp9_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
specialize qw/vp9_block_error avx2/, "$sse2_x86inc";
- add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
- specialize qw/vp9_quantize_fp neon/, "$ssse3_x86_64";
+ add_proto qw/int64_t vp9_block_error_fp/, "const int16_t *coeff, const int16_t *dqcoeff, int block_size";
+ specialize qw/vp9_block_error_fp sse2/;
- add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ specialize qw/vp9_quantize_fp neon sse2/, "$ssse3_x86_64";
+
+ add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/vp9_quantize_fp_32x32/, "$ssse3_x86_64";
- add_proto qw/void vp9_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ add_proto qw/void vp9_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/vp9_quantize_b sse2/, "$ssse3_x86_64";
- add_proto qw/void vp9_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ add_proto qw/void vp9_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/vp9_quantize_b_32x32/, "$ssse3_x86_64";
+
+ add_proto qw/void vp9_fdct8x8_quant/, "const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ specialize qw/vp9_fdct8x8_quant sse2 ssse3 neon/;
}
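Two things change throughout this hunk: the unused `zbin_oq_value` argument is dropped from every quantizer prototype, and a fused `vp9_fdct8x8_quant` (forward transform plus quantize in one pass) is introduced with sse2/ssse3/neon versions. A hedged scalar sketch of the "fp" quantizer core, assuming Q16 fixed-point quantizer steps and ignoring `skip_block` and the scan tables:

    #include <stdint.h>
    #include <stdlib.h>

    typedef int16_t tran_low_t;  /* assumption: non-high-bitdepth build */

    /* round/quant/dequant hold two entries each: [0] for DC, [1] for AC. */
    static void quantize_fp(const tran_low_t *coeff, intptr_t n_coeffs,
                            const int16_t *round, const int16_t *quant,
                            const int16_t *dequant, tran_low_t *qcoeff,
                            tran_low_t *dqcoeff, uint16_t *eob) {
      int last_nz = -1;
      for (intptr_t i = 0; i < n_coeffs; ++i) {
        const int k = (i == 0) ? 0 : 1;     /* DC vs AC table entry */
        const int neg = coeff[i] < 0;
        const int abs_c = abs(coeff[i]);
        int q = (int)(((abs_c + round[k]) * (int64_t)quant[k]) >> 16);
        if (neg) q = -q;
        qcoeff[i] = (tran_low_t)q;
        dqcoeff[i] = (tran_low_t)(q * dequant[k]);
        if (q) last_nz = (int)i;
      }
      *eob = (uint16_t)(last_nz + 1);       /* one past the last nonzero */
    }

    int main(void) {
      const tran_low_t c[4] = {100, -40, 0, 3};
      const int16_t round[2] = {8, 8}, quant[2] = {1 << 14, 1 << 14},
                    dequant[2] = {4, 4};
      tran_low_t q[4], dq[4];
      uint16_t eob;
      quantize_fp(c, 4, round, quant, dequant, q, dq, &eob);
      return (q[0] == 27 && eob == 4) ? 0 : 1;
    }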
#
@@ -1181,43 +1044,43 @@ if (vpx_config("CONFIG_INTERNAL_STATS") eq "yes") {
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vp9_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
- specialize qw/vp9_fht4x4/;
+ specialize qw/vp9_fht4x4 sse2/;
add_proto qw/void vp9_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
- specialize qw/vp9_fht8x8/;
+ specialize qw/vp9_fht8x8 sse2/;
add_proto qw/void vp9_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
- specialize qw/vp9_fht16x16/;
+ specialize qw/vp9_fht16x16 sse2/;
add_proto qw/void vp9_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vp9_fwht4x4/;
+ specialize qw/vp9_fwht4x4/, "$mmx_x86inc";
add_proto qw/void vp9_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vp9_fdct4x4_1/;
+ specialize qw/vp9_fdct4x4_1 sse2/;
add_proto qw/void vp9_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vp9_fdct4x4/;
+ specialize qw/vp9_fdct4x4 sse2/;
add_proto qw/void vp9_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vp9_fdct8x8_1/;
+ specialize qw/vp9_fdct8x8_1 sse2/;
add_proto qw/void vp9_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vp9_fdct8x8/;
+ specialize qw/vp9_fdct8x8 sse2/;
add_proto qw/void vp9_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vp9_fdct16x16_1/;
+ specialize qw/vp9_fdct16x16_1 sse2/;
add_proto qw/void vp9_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vp9_fdct16x16/;
+ specialize qw/vp9_fdct16x16 sse2/;
add_proto qw/void vp9_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vp9_fdct32x32_1/;
+ specialize qw/vp9_fdct32x32_1 sse2/;
add_proto qw/void vp9_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vp9_fdct32x32/;
+ specialize qw/vp9_fdct32x32 sse2/;
add_proto qw/void vp9_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vp9_fdct32x32_rd/;
+ specialize qw/vp9_fdct32x32_rd sse2/;
} else {
add_proto qw/void vp9_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
specialize qw/vp9_fht4x4 sse2/;
@@ -1267,9 +1130,6 @@ specialize qw/vp9_full_search_sad sse3 sse4_1/;
$vp9_full_search_sad_sse3=vp9_full_search_sadx3;
$vp9_full_search_sad_sse4_1=vp9_full_search_sadx8;
-add_proto qw/int vp9_refining_search_sad/, "const struct macroblock *x, struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv";
-specialize qw/vp9_refining_search_sad/;
-
add_proto qw/int vp9_diamond_search_sad/, "const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv";
specialize qw/vp9_diamond_search_sad/;
@@ -1283,34 +1143,34 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
# variance
add_proto qw/unsigned int vp9_highbd_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_variance32x16/;
+ specialize qw/vp9_highbd_variance32x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_variance16x32/;
+ specialize qw/vp9_highbd_variance16x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_variance64x32/;
+ specialize qw/vp9_highbd_variance64x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_variance32x64/;
+ specialize qw/vp9_highbd_variance32x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_variance32x32/;
+ specialize qw/vp9_highbd_variance32x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_variance64x64/;
+ specialize qw/vp9_highbd_variance64x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_variance16x16/;
+ specialize qw/vp9_highbd_variance16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_variance16x8/;
+ specialize qw/vp9_highbd_variance16x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_variance8x16/;
+ specialize qw/vp9_highbd_variance8x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_variance8x8/;
+ specialize qw/vp9_highbd_variance8x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_variance8x4/;
@@ -1322,40 +1182,40 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vp9_highbd_variance4x4/;
add_proto qw/void vp9_highbd_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
- specialize qw/vp9_highbd_get8x8var/;
+ specialize qw/vp9_highbd_get8x8var/, "$sse2_x86inc";
add_proto qw/void vp9_highbd_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
- specialize qw/vp9_highbd_get16x16var/;
+ specialize qw/vp9_highbd_get16x16var/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_variance32x16/;
+ specialize qw/vp9_highbd_10_variance32x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_variance16x32/;
+ specialize qw/vp9_highbd_10_variance16x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_variance64x32/;
+ specialize qw/vp9_highbd_10_variance64x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_variance32x64/;
+ specialize qw/vp9_highbd_10_variance32x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_variance32x32/;
+ specialize qw/vp9_highbd_10_variance32x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_variance64x64/;
+ specialize qw/vp9_highbd_10_variance64x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_variance16x16/;
+ specialize qw/vp9_highbd_10_variance16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_variance16x8/;
+ specialize qw/vp9_highbd_10_variance16x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_variance8x16/;
+ specialize qw/vp9_highbd_10_variance8x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_variance8x8/;
+ specialize qw/vp9_highbd_10_variance8x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_variance8x4/;
@@ -1367,40 +1227,40 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vp9_highbd_10_variance4x4/;
add_proto qw/void vp9_highbd_10_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
- specialize qw/vp9_highbd_10_get8x8var/;
+ specialize qw/vp9_highbd_10_get8x8var/, "$sse2_x86inc";
add_proto qw/void vp9_highbd_10_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
- specialize qw/vp9_highbd_10_get16x16var/;
+ specialize qw/vp9_highbd_10_get16x16var/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_variance32x16/;
+ specialize qw/vp9_highbd_12_variance32x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_variance16x32/;
+ specialize qw/vp9_highbd_12_variance16x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_variance64x32/;
+ specialize qw/vp9_highbd_12_variance64x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_variance32x64/;
+ specialize qw/vp9_highbd_12_variance32x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_variance32x32/;
+ specialize qw/vp9_highbd_12_variance32x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_variance64x64/;
+ specialize qw/vp9_highbd_12_variance64x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_variance16x16/;
+ specialize qw/vp9_highbd_12_variance16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_variance16x8/;
+ specialize qw/vp9_highbd_12_variance16x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_variance8x16/;
+ specialize qw/vp9_highbd_12_variance8x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_variance8x8/;
+ specialize qw/vp9_highbd_12_variance8x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_variance8x4/;
@@ -1412,76 +1272,76 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vp9_highbd_12_variance4x4/;
add_proto qw/void vp9_highbd_12_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
- specialize qw/vp9_highbd_12_get8x8var/;
+ specialize qw/vp9_highbd_12_get8x8var/, "$sse2_x86inc";
add_proto qw/void vp9_highbd_12_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
- specialize qw/vp9_highbd_12_get16x16var/;
+ specialize qw/vp9_highbd_12_get16x16var/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_sub_pixel_variance64x64/;
+ specialize qw/vp9_highbd_sub_pixel_variance64x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_sub_pixel_avg_variance64x64/;
+ specialize qw/vp9_highbd_sub_pixel_avg_variance64x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_sub_pixel_variance32x64/;
+ specialize qw/vp9_highbd_sub_pixel_variance32x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_sub_pixel_avg_variance32x64/;
+ specialize qw/vp9_highbd_sub_pixel_avg_variance32x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_sub_pixel_variance64x32/;
+ specialize qw/vp9_highbd_sub_pixel_variance64x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_sub_pixel_avg_variance64x32/;
+ specialize qw/vp9_highbd_sub_pixel_avg_variance64x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_sub_pixel_variance32x16/;
+ specialize qw/vp9_highbd_sub_pixel_variance32x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_sub_pixel_avg_variance32x16/;
+ specialize qw/vp9_highbd_sub_pixel_avg_variance32x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_sub_pixel_variance16x32/;
+ specialize qw/vp9_highbd_sub_pixel_variance16x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_sub_pixel_avg_variance16x32/;
+ specialize qw/vp9_highbd_sub_pixel_avg_variance16x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_sub_pixel_variance32x32/;
+ specialize qw/vp9_highbd_sub_pixel_variance32x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_sub_pixel_avg_variance32x32/;
+ specialize qw/vp9_highbd_sub_pixel_avg_variance32x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_sub_pixel_variance16x16/;
+ specialize qw/vp9_highbd_sub_pixel_variance16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_sub_pixel_avg_variance16x16/;
+ specialize qw/vp9_highbd_sub_pixel_avg_variance16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_sub_pixel_variance8x16/;
+ specialize qw/vp9_highbd_sub_pixel_variance8x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_sub_pixel_avg_variance8x16/;
+ specialize qw/vp9_highbd_sub_pixel_avg_variance8x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_sub_pixel_variance16x8/;
+ specialize qw/vp9_highbd_sub_pixel_variance16x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_sub_pixel_avg_variance16x8/;
+ specialize qw/vp9_highbd_sub_pixel_avg_variance16x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_sub_pixel_variance8x8/;
+ specialize qw/vp9_highbd_sub_pixel_variance8x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_sub_pixel_avg_variance8x8/;
+ specialize qw/vp9_highbd_sub_pixel_avg_variance8x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_sub_pixel_variance8x4/;
+ specialize qw/vp9_highbd_sub_pixel_variance8x4/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_sub_pixel_avg_variance8x4/;
+ specialize qw/vp9_highbd_sub_pixel_avg_variance8x4/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_sub_pixel_variance4x8/;
@@ -1496,70 +1356,70 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vp9_highbd_sub_pixel_avg_variance4x4/;
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_sub_pixel_variance64x64/;
+ specialize qw/vp9_highbd_10_sub_pixel_variance64x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_10_sub_pixel_avg_variance64x64/;
+ specialize qw/vp9_highbd_10_sub_pixel_avg_variance64x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_sub_pixel_variance32x64/;
+ specialize qw/vp9_highbd_10_sub_pixel_variance32x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_10_sub_pixel_avg_variance32x64/;
+ specialize qw/vp9_highbd_10_sub_pixel_avg_variance32x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_sub_pixel_variance64x32/;
+ specialize qw/vp9_highbd_10_sub_pixel_variance64x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_10_sub_pixel_avg_variance64x32/;
+ specialize qw/vp9_highbd_10_sub_pixel_avg_variance64x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_sub_pixel_variance32x16/;
+ specialize qw/vp9_highbd_10_sub_pixel_variance32x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_10_sub_pixel_avg_variance32x16/;
+ specialize qw/vp9_highbd_10_sub_pixel_avg_variance32x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_sub_pixel_variance16x32/;
+ specialize qw/vp9_highbd_10_sub_pixel_variance16x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_10_sub_pixel_avg_variance16x32/;
+ specialize qw/vp9_highbd_10_sub_pixel_avg_variance16x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_sub_pixel_variance32x32/;
+ specialize qw/vp9_highbd_10_sub_pixel_variance32x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_10_sub_pixel_avg_variance32x32/;
+ specialize qw/vp9_highbd_10_sub_pixel_avg_variance32x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_sub_pixel_variance16x16/;
+ specialize qw/vp9_highbd_10_sub_pixel_variance16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_10_sub_pixel_avg_variance16x16/;
+ specialize qw/vp9_highbd_10_sub_pixel_avg_variance16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_sub_pixel_variance8x16/;
+ specialize qw/vp9_highbd_10_sub_pixel_variance8x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_10_sub_pixel_avg_variance8x16/;
+ specialize qw/vp9_highbd_10_sub_pixel_avg_variance8x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_sub_pixel_variance16x8/;
+ specialize qw/vp9_highbd_10_sub_pixel_variance16x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_10_sub_pixel_avg_variance16x8/;
+ specialize qw/vp9_highbd_10_sub_pixel_avg_variance16x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_sub_pixel_variance8x8/;
+ specialize qw/vp9_highbd_10_sub_pixel_variance8x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_10_sub_pixel_avg_variance8x8/;
+ specialize qw/vp9_highbd_10_sub_pixel_avg_variance8x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_sub_pixel_variance8x4/;
+ specialize qw/vp9_highbd_10_sub_pixel_variance8x4/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_10_sub_pixel_avg_variance8x4/;
+ specialize qw/vp9_highbd_10_sub_pixel_avg_variance8x4/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_sub_pixel_variance4x8/;
@@ -1574,70 +1434,70 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vp9_highbd_10_sub_pixel_avg_variance4x4/;
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_sub_pixel_variance64x64/;
+ specialize qw/vp9_highbd_12_sub_pixel_variance64x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_12_sub_pixel_avg_variance64x64/;
+ specialize qw/vp9_highbd_12_sub_pixel_avg_variance64x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_sub_pixel_variance32x64/;
+ specialize qw/vp9_highbd_12_sub_pixel_variance32x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_12_sub_pixel_avg_variance32x64/;
+ specialize qw/vp9_highbd_12_sub_pixel_avg_variance32x64/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_sub_pixel_variance64x32/;
+ specialize qw/vp9_highbd_12_sub_pixel_variance64x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_12_sub_pixel_avg_variance64x32/;
+ specialize qw/vp9_highbd_12_sub_pixel_avg_variance64x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_sub_pixel_variance32x16/;
+ specialize qw/vp9_highbd_12_sub_pixel_variance32x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_12_sub_pixel_avg_variance32x16/;
+ specialize qw/vp9_highbd_12_sub_pixel_avg_variance32x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_sub_pixel_variance16x32/;
+ specialize qw/vp9_highbd_12_sub_pixel_variance16x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_12_sub_pixel_avg_variance16x32/;
+ specialize qw/vp9_highbd_12_sub_pixel_avg_variance16x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_sub_pixel_variance32x32/;
+ specialize qw/vp9_highbd_12_sub_pixel_variance32x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_12_sub_pixel_avg_variance32x32/;
+ specialize qw/vp9_highbd_12_sub_pixel_avg_variance32x32/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_sub_pixel_variance16x16/;
+ specialize qw/vp9_highbd_12_sub_pixel_variance16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_12_sub_pixel_avg_variance16x16/;
+ specialize qw/vp9_highbd_12_sub_pixel_avg_variance16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_sub_pixel_variance8x16/;
+ specialize qw/vp9_highbd_12_sub_pixel_variance8x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_12_sub_pixel_avg_variance8x16/;
+ specialize qw/vp9_highbd_12_sub_pixel_avg_variance8x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_sub_pixel_variance16x8/;
+ specialize qw/vp9_highbd_12_sub_pixel_variance16x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_12_sub_pixel_avg_variance16x8/;
+ specialize qw/vp9_highbd_12_sub_pixel_avg_variance16x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_sub_pixel_variance8x8/;
+ specialize qw/vp9_highbd_12_sub_pixel_variance8x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_12_sub_pixel_avg_variance8x8/;
+ specialize qw/vp9_highbd_12_sub_pixel_avg_variance8x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_sub_pixel_variance8x4/;
+ specialize qw/vp9_highbd_12_sub_pixel_variance8x4/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
- specialize qw/vp9_highbd_12_sub_pixel_avg_variance8x4/;
+ specialize qw/vp9_highbd_12_sub_pixel_avg_variance8x4/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_sub_pixel_variance4x8/;
@@ -1651,174 +1511,8 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_12_sub_pixel_avg_variance4x4/;
- add_proto qw/unsigned int vp9_highbd_sad64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
- specialize qw/vp9_highbd_sad64x64/;
-
- add_proto qw/unsigned int vp9_highbd_sad32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
- specialize qw/vp9_highbd_sad32x64/;
-
- add_proto qw/unsigned int vp9_highbd_sad64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
- specialize qw/vp9_highbd_sad64x32/;
-
- add_proto qw/unsigned int vp9_highbd_sad32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
- specialize qw/vp9_highbd_sad32x16/;
-
- add_proto qw/unsigned int vp9_highbd_sad16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
- specialize qw/vp9_highbd_sad16x32/;
-
- add_proto qw/unsigned int vp9_highbd_sad32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
- specialize qw/vp9_highbd_sad32x32/;
-
- add_proto qw/unsigned int vp9_highbd_sad16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
- specialize qw/vp9_highbd_sad16x16/;
-
- add_proto qw/unsigned int vp9_highbd_sad16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
- specialize qw/vp9_highbd_sad16x8/;
-
- add_proto qw/unsigned int vp9_highbd_sad8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
- specialize qw/vp9_highbd_sad8x16/;
-
- add_proto qw/unsigned int vp9_highbd_sad8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
- specialize qw/vp9_highbd_sad8x8/;
-
- add_proto qw/unsigned int vp9_highbd_sad8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
- specialize qw/vp9_highbd_sad8x4/;
-
- add_proto qw/unsigned int vp9_highbd_sad4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
- specialize qw/vp9_highbd_sad4x8/;
-
- add_proto qw/unsigned int vp9_highbd_sad4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
- specialize qw/vp9_highbd_sad4x4/;
-
- add_proto qw/unsigned int vp9_highbd_sad64x64_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
- specialize qw/vp9_highbd_sad64x64_avg/;
-
- add_proto qw/unsigned int vp9_highbd_sad32x64_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
- specialize qw/vp9_highbd_sad32x64_avg/;
-
- add_proto qw/unsigned int vp9_highbd_sad64x32_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
- specialize qw/vp9_highbd_sad64x32_avg/;
-
- add_proto qw/unsigned int vp9_highbd_sad32x16_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
- specialize qw/vp9_highbd_sad32x16_avg/;
-
- add_proto qw/unsigned int vp9_highbd_sad16x32_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
- specialize qw/vp9_highbd_sad16x32_avg/;
-
- add_proto qw/unsigned int vp9_highbd_sad32x32_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
- specialize qw/vp9_highbd_sad32x32_avg/;
-
- add_proto qw/unsigned int vp9_highbd_sad16x16_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
- specialize qw/vp9_highbd_sad16x16_avg/;
-
- add_proto qw/unsigned int vp9_highbd_sad16x8_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
- specialize qw/vp9_highbd_sad16x8_avg/;
-
- add_proto qw/unsigned int vp9_highbd_sad8x16_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
- specialize qw/vp9_highbd_sad8x16_avg/;
-
- add_proto qw/unsigned int vp9_highbd_sad8x8_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
- specialize qw/vp9_highbd_sad8x8_avg/;
-
- add_proto qw/unsigned int vp9_highbd_sad8x4_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
- specialize qw/vp9_highbd_sad8x4_avg/;
-
- add_proto qw/unsigned int vp9_highbd_sad4x8_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
- specialize qw/vp9_highbd_sad4x8_avg/;
-
- add_proto qw/unsigned int vp9_highbd_sad4x4_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
- specialize qw/vp9_highbd_sad4x4_avg/;
-
- add_proto qw/void vp9_highbd_sad64x64x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array";
- specialize qw/vp9_highbd_sad64x64x3/;
-
- add_proto qw/void vp9_highbd_sad32x32x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array";
- specialize qw/vp9_highbd_sad32x32x3/;
-
- add_proto qw/void vp9_highbd_sad16x16x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array";
- specialize qw/vp9_highbd_sad16x16x3/;
-
- add_proto qw/void vp9_highbd_sad16x8x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array";
- specialize qw/vp9_highbd_sad16x8x3/;
-
- add_proto qw/void vp9_highbd_sad8x16x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array";
- specialize qw/vp9_highbd_sad8x16x3/;
-
- add_proto qw/void vp9_highbd_sad8x8x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array";
- specialize qw/vp9_highbd_sad8x8x3/;
-
- add_proto qw/void vp9_highbd_sad4x4x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array";
- specialize qw/vp9_highbd_sad4x4x3/;
-
- add_proto qw/void vp9_highbd_sad64x64x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
- specialize qw/vp9_highbd_sad64x64x8/;
-
- add_proto qw/void vp9_highbd_sad32x32x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
- specialize qw/vp9_highbd_sad32x32x8/;
-
- add_proto qw/void vp9_highbd_sad16x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
- specialize qw/vp9_highbd_sad16x16x8/;
-
- add_proto qw/void vp9_highbd_sad16x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
- specialize qw/vp9_highbd_sad16x8x8/;
-
- add_proto qw/void vp9_highbd_sad8x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
- specialize qw/vp9_highbd_sad8x16x8/;
-
- add_proto qw/void vp9_highbd_sad8x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
- specialize qw/vp9_highbd_sad8x8x8/;
-
- add_proto qw/void vp9_highbd_sad8x4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
- specialize qw/vp9_highbd_sad8x4x8/;
-
- add_proto qw/void vp9_highbd_sad4x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
- specialize qw/vp9_highbd_sad4x8x8/;
-
- add_proto qw/void vp9_highbd_sad4x4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
- specialize qw/vp9_highbd_sad4x4x8/;
-
- add_proto qw/void vp9_highbd_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
- specialize qw/vp9_highbd_sad64x64x4d/;
-
- add_proto qw/void vp9_highbd_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
- specialize qw/vp9_highbd_sad32x64x4d/;
-
- add_proto qw/void vp9_highbd_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
- specialize qw/vp9_highbd_sad64x32x4d/;
-
- add_proto qw/void vp9_highbd_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
- specialize qw/vp9_highbd_sad32x16x4d/;
-
- add_proto qw/void vp9_highbd_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
- specialize qw/vp9_highbd_sad16x32x4d/;
-
- add_proto qw/void vp9_highbd_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
- specialize qw/vp9_highbd_sad32x32x4d/;
-
- add_proto qw/void vp9_highbd_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
- specialize qw/vp9_highbd_sad16x16x4d/;
-
- add_proto qw/void vp9_highbd_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
- specialize qw/vp9_highbd_sad16x8x4d/;
-
- add_proto qw/void vp9_highbd_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
- specialize qw/vp9_highbd_sad8x16x4d/;
-
- add_proto qw/void vp9_highbd_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
- specialize qw/vp9_highbd_sad8x8x4d/;
-
- # TODO(jingning): need to convert these 4x8/8x4 functions into sse2 form
- add_proto qw/void vp9_highbd_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
- specialize qw/vp9_highbd_sad8x4x4d/;
-
- add_proto qw/void vp9_highbd_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
- specialize qw/vp9_highbd_sad4x8x4d/;
-
- add_proto qw/void vp9_highbd_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
- specialize qw/vp9_highbd_sad4x4x4d/;
-
add_proto qw/unsigned int vp9_highbd_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
- specialize qw/vp9_highbd_mse16x16/;
+ specialize qw/vp9_highbd_mse16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vp9_highbd_mse8x16/;
@@ -1827,10 +1521,10 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vp9_highbd_mse16x8/;
add_proto qw/unsigned int vp9_highbd_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
- specialize qw/vp9_highbd_mse8x8/;
+ specialize qw/vp9_highbd_mse8x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_mse16x16/;
+ specialize qw/vp9_highbd_10_mse16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_10_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vp9_highbd_10_mse8x16/;
@@ -1839,10 +1533,10 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vp9_highbd_10_mse16x8/;
add_proto qw/unsigned int vp9_highbd_10_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_mse8x8/;
+ specialize qw/vp9_highbd_10_mse8x8/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_mse16x16/;
+ specialize qw/vp9_highbd_12_mse16x16/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_highbd_12_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vp9_highbd_12_mse8x16/;
@@ -1851,27 +1545,27 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vp9_highbd_12_mse16x8/;
add_proto qw/unsigned int vp9_highbd_12_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_mse8x8/;
+ specialize qw/vp9_highbd_12_mse8x8/, "$sse2_x86inc";
# ENCODEMB INVOKE
add_proto qw/int64_t vp9_highbd_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd";
- specialize qw/vp9_highbd_block_error/;
+ specialize qw/vp9_highbd_block_error sse2/;
add_proto qw/void vp9_highbd_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd";
specialize qw/vp9_highbd_subtract_block/;
- add_proto qw/void vp9_highbd_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ add_proto qw/void vp9_highbd_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/vp9_highbd_quantize_fp/;
- add_proto qw/void vp9_highbd_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ add_proto qw/void vp9_highbd_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/vp9_highbd_quantize_fp_32x32/;
- add_proto qw/void vp9_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
- specialize qw/vp9_highbd_quantize_b/;
+ add_proto qw/void vp9_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ specialize qw/vp9_highbd_quantize_b sse2/;
- add_proto qw/void vp9_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
- specialize qw/vp9_highbd_quantize_b_32x32/;
+ add_proto qw/void vp9_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ specialize qw/vp9_highbd_quantize_b_32x32 sse2/;
#
# Structured Similarity (SSIM)
@@ -1883,40 +1577,40 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
# fdct functions
add_proto qw/void vp9_highbd_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
- specialize qw/vp9_highbd_fht4x4/;
+ specialize qw/vp9_highbd_fht4x4 sse2/;
add_proto qw/void vp9_highbd_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
- specialize qw/vp9_highbd_fht8x8/;
+ specialize qw/vp9_highbd_fht8x8 sse2/;
add_proto qw/void vp9_highbd_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
- specialize qw/vp9_highbd_fht16x16/;
+ specialize qw/vp9_highbd_fht16x16 sse2/;
add_proto qw/void vp9_highbd_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp9_highbd_fwht4x4/;
add_proto qw/void vp9_highbd_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vp9_highbd_fdct4x4/;
+ specialize qw/vp9_highbd_fdct4x4 sse2/;
add_proto qw/void vp9_highbd_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp9_highbd_fdct8x8_1/;
add_proto qw/void vp9_highbd_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vp9_highbd_fdct8x8/;
+ specialize qw/vp9_highbd_fdct8x8 sse2/;
add_proto qw/void vp9_highbd_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp9_highbd_fdct16x16_1/;
add_proto qw/void vp9_highbd_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vp9_highbd_fdct16x16/;
+ specialize qw/vp9_highbd_fdct16x16 sse2/;
add_proto qw/void vp9_highbd_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vp9_highbd_fdct32x32_1/;
add_proto qw/void vp9_highbd_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vp9_highbd_fdct32x32/;
+ specialize qw/vp9_highbd_fdct32x32 sse2/;
add_proto qw/void vp9_highbd_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vp9_highbd_fdct32x32_rd/;
+ specialize qw/vp9_highbd_fdct32x32_rd sse2/;
add_proto qw/void vp9_highbd_temporal_filter_apply/, "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count";
specialize qw/vp9_highbd_temporal_filter_apply/;
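[Editor's note: the specializations above feed the run-time dispatch in the generated header. A sketch of the assumed setup_rtcd_internal() shape, with x86_simd_caps() supplying CPUID-derived capability bits:

    static void setup_rtcd_internal(void) {
      int flags = x86_simd_caps();

      /* Default every pointer to the C implementation... */
      vp9_highbd_fdct8x8 = vp9_highbd_fdct8x8_c;
      /* ...then upgrade it when the CPU advertises SSE2. */
      if (flags & HAS_SSE2) vp9_highbd_fdct8x8 = vp9_highbd_fdct8x8_sse2;
    }

This is why the .pl changes alone are enough to enable the new SSE2 paths: no call site changes, only the pointer targets.]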
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_systemdependent.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_systemdependent.h
index e9711582303..161c381ad08 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_systemdependent.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_systemdependent.h
@@ -17,8 +17,10 @@
# include <intrin.h>
# define USE_MSC_INTRIN
# endif
+#if _MSC_VER < 1900
# define snprintf _snprintf
#endif
+#endif
#ifdef __cplusplus
extern "C" {
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_thread.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_thread.h
index 864579c03c3..12848fedeff 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_thread.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_thread.h
@@ -22,9 +22,13 @@
extern "C" {
#endif
+// Cap the number of decode threads at 8, given the limit on frame buffers
+// and the shortage of semaphores in the emulation layer on Windows.
+#define MAX_DECODE_THREADS 8
+
#if CONFIG_MULTITHREAD
-#if defined(_WIN32)
+#if defined(_WIN32) && !HAVE_PTHREAD_H
#include <errno.h> // NOLINT
#include <process.h> // NOLINT
#include <windows.h> // NOLINT
@@ -103,8 +107,8 @@ static INLINE int pthread_cond_destroy(pthread_cond_t *const condition) {
static INLINE int pthread_cond_init(pthread_cond_t *const condition,
void* cond_attr) {
(void)cond_attr;
- condition->waiting_sem_ = CreateSemaphore(NULL, 0, 1, NULL);
- condition->received_sem_ = CreateSemaphore(NULL, 0, 1, NULL);
+ condition->waiting_sem_ = CreateSemaphore(NULL, 0, MAX_DECODE_THREADS, NULL);
+ condition->received_sem_ = CreateSemaphore(NULL, 0, MAX_DECODE_THREADS, NULL);
condition->signal_event_ = CreateEvent(NULL, FALSE, FALSE, NULL);
if (condition->waiting_sem_ == NULL ||
condition->received_sem_ == NULL ||
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_thread_common.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_thread_common.c
new file mode 100644
index 00000000000..cba57ff41aa
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_thread_common.c
@@ -0,0 +1,436 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_config.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vp9/common/vp9_entropymode.h"
+#include "vp9/common/vp9_thread_common.h"
+#include "vp9/common/vp9_reconinter.h"
+#include "vp9/common/vp9_loopfilter.h"
+
+#if CONFIG_MULTITHREAD
+static INLINE void mutex_lock(pthread_mutex_t *const mutex) {
+ const int kMaxTryLocks = 4000;
+ int locked = 0;
+ int i;
+
+ for (i = 0; i < kMaxTryLocks; ++i) {
+ if (!pthread_mutex_trylock(mutex)) {
+ locked = 1;
+ break;
+ }
+ }
+
+ if (!locked)
+ pthread_mutex_lock(mutex);
+}
+#endif // CONFIG_MULTITHREAD
+
+static INLINE void sync_read(VP9LfSync *const lf_sync, int r, int c) {
+#if CONFIG_MULTITHREAD
+ const int nsync = lf_sync->sync_range;
+
+ if (r && !(c & (nsync - 1))) {
+ pthread_mutex_t *const mutex = &lf_sync->mutex_[r - 1];
+ mutex_lock(mutex);
+
+ while (c > lf_sync->cur_sb_col[r - 1] - nsync) {
+ pthread_cond_wait(&lf_sync->cond_[r - 1], mutex);
+ }
+ pthread_mutex_unlock(mutex);
+ }
+#else
+ (void)lf_sync;
+ (void)r;
+ (void)c;
+#endif // CONFIG_MULTITHREAD
+}
+
+static INLINE void sync_write(VP9LfSync *const lf_sync, int r, int c,
+ const int sb_cols) {
+#if CONFIG_MULTITHREAD
+ const int nsync = lf_sync->sync_range;
+ int cur;
+ // Only signal when enough superblocks have been filtered for the next row to run.
+ int sig = 1;
+
+ if (c < sb_cols - 1) {
+ cur = c;
+ if (c % nsync)
+ sig = 0;
+ } else {
+ cur = sb_cols + nsync;
+ }
+
+ if (sig) {
+ mutex_lock(&lf_sync->mutex_[r]);
+
+ lf_sync->cur_sb_col[r] = cur;
+
+ pthread_cond_signal(&lf_sync->cond_[r]);
+ pthread_mutex_unlock(&lf_sync->mutex_[r]);
+ }
+#else
+ (void)lf_sync;
+ (void)r;
+ (void)c;
+ (void)sb_cols;
+#endif // CONFIG_MULTITHREAD
+}
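[Editor's note: a standalone model of the invariant the two helpers above maintain (hypothetical names; nsync comes from get_sync_range() later in this file):

    #include <assert.h>

    /* A superblock at (row, col) may be filtered once the row above has
     * progressed at least nsync columns past it; sync_write() publishes
     * cur_sb_col and sync_read() blocks until this predicate holds. */
    static int may_filter(int col, int cur_sb_col_above, int nsync) {
      return cur_sb_col_above - nsync >= col;
    }

    int main(void) {
      assert(!may_filter(0, 3, 4));  /* row above only at column 3: wait */
      assert(may_filter(0, 4, 4));   /* column 4 reached: proceed */
      return 0;
    }
]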
+
+// Implement row loopfiltering for each thread.
+static INLINE
+void thread_loop_filter_rows(const YV12_BUFFER_CONFIG *const frame_buffer,
+ VP9_COMMON *const cm,
+ struct macroblockd_plane planes[MAX_MB_PLANE],
+ int start, int stop, int y_only,
+ VP9LfSync *const lf_sync) {
+ const int num_planes = y_only ? 1 : MAX_MB_PLANE;
+ const int sb_cols = mi_cols_aligned_to_sb(cm->mi_cols) >> MI_BLOCK_SIZE_LOG2;
+ int mi_row, mi_col;
+ enum lf_path path;
+ if (y_only)
+ path = LF_PATH_444;
+ else if (planes[1].subsampling_y == 1 && planes[1].subsampling_x == 1)
+ path = LF_PATH_420;
+ else if (planes[1].subsampling_y == 0 && planes[1].subsampling_x == 0)
+ path = LF_PATH_444;
+ else
+ path = LF_PATH_SLOW;
+
+ for (mi_row = start; mi_row < stop;
+ mi_row += lf_sync->num_workers * MI_BLOCK_SIZE) {
+ MODE_INFO **const mi = cm->mi_grid_visible + mi_row * cm->mi_stride;
+
+ for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MI_BLOCK_SIZE) {
+ const int r = mi_row >> MI_BLOCK_SIZE_LOG2;
+ const int c = mi_col >> MI_BLOCK_SIZE_LOG2;
+ LOOP_FILTER_MASK lfm;
+ int plane;
+
+ sync_read(lf_sync, r, c);
+
+ vp9_setup_dst_planes(planes, frame_buffer, mi_row, mi_col);
+
+ // TODO(JBB): Make setup_mask work for non-420 subsampling.
+ vp9_setup_mask(cm, mi_row, mi_col, mi + mi_col, cm->mi_stride,
+ &lfm);
+
+ vp9_filter_block_plane_ss00(cm, &planes[0], mi_row, &lfm);
+ for (plane = 1; plane < num_planes; ++plane) {
+ switch (path) {
+ case LF_PATH_420:
+ vp9_filter_block_plane_ss11(cm, &planes[plane], mi_row, &lfm);
+ break;
+ case LF_PATH_444:
+ vp9_filter_block_plane_ss00(cm, &planes[plane], mi_row, &lfm);
+ break;
+ case LF_PATH_SLOW:
+ vp9_filter_block_plane_non420(cm, &planes[plane], mi + mi_col,
+ mi_row, mi_col);
+ break;
+ }
+ }
+
+ sync_write(lf_sync, r, c, sb_cols);
+ }
+ }
+}
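[Editor's note: work distribution in thread_loop_filter_rows() is a simple stride: worker i starts at SB row i and advances num_workers SB rows at a time. Illustratively (hypothetical helper, not in the source):

    /* With num_workers == 3, worker 0 filters SB rows 0, 3, 6, ...,
     * worker 1 rows 1, 4, 7, ..., and worker 2 rows 2, 5, 8, ... */
    static int worker_for_sb_row(int sb_row, int num_workers) {
      return sb_row % num_workers;
    }
]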
+
+// Row-based multi-threaded loopfilter hook
+static int loop_filter_row_worker(VP9LfSync *const lf_sync,
+ LFWorkerData *const lf_data) {
+ thread_loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes,
+ lf_data->start, lf_data->stop, lf_data->y_only,
+ lf_sync);
+ return 1;
+}
+
+static void loop_filter_rows_mt(YV12_BUFFER_CONFIG *frame,
+ VP9_COMMON *cm,
+ struct macroblockd_plane planes[MAX_MB_PLANE],
+ int start, int stop, int y_only,
+ VP9Worker *workers, int nworkers,
+ VP9LfSync *lf_sync) {
+ const VP9WorkerInterface *const winterface = vp9_get_worker_interface();
+ // Number of superblock rows.
+ const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2;
+ // The decoder may allocate more threads than the number of tiles, based
+ // on the user's input.
+ const int tile_cols = 1 << cm->log2_tile_cols;
+ const int num_workers = MIN(nworkers, tile_cols);
+ int i;
+
+ if (!lf_sync->sync_range || sb_rows != lf_sync->rows ||
+ num_workers > lf_sync->num_workers) {
+ vp9_loop_filter_dealloc(lf_sync);
+ vp9_loop_filter_alloc(lf_sync, cm, sb_rows, cm->width, num_workers);
+ }
+
+ // Initialize cur_sb_col to -1 for all SB rows.
+ memset(lf_sync->cur_sb_col, -1, sizeof(*lf_sync->cur_sb_col) * sb_rows);
+
+ // Set up loopfilter thread data.
+ // The decoder caps num_workers because using more loopfilter threads than
+ // cores has been observed to hurt performance on Android: the system
+ // schedules the tile decode workers on only as many cores as there are
+ // tile columns, so extra loopfilter threads merely contend with them.
+ // If the multithreading code changes in the future, the number of workers
+ // used by the loopfilter should be revisited.
+ for (i = 0; i < num_workers; ++i) {
+ VP9Worker *const worker = &workers[i];
+ LFWorkerData *const lf_data = &lf_sync->lfdata[i];
+
+ worker->hook = (VP9WorkerHook)loop_filter_row_worker;
+ worker->data1 = lf_sync;
+ worker->data2 = lf_data;
+
+ // Loopfilter data
+ vp9_loop_filter_data_reset(lf_data, frame, cm, planes);
+ lf_data->start = start + i * MI_BLOCK_SIZE;
+ lf_data->stop = stop;
+ lf_data->y_only = y_only;
+
+ // Start loopfiltering
+ if (i == num_workers - 1) {
+ winterface->execute(worker);
+ } else {
+ winterface->launch(worker);
+ }
+ }
+
+ // Wait until all rows are finished.
+ for (i = 0; i < num_workers; ++i) {
+ winterface->sync(&workers[i]);
+ }
+}
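[Editor's note: the launch/execute split above follows the usual VP9Worker idiom: the first num_workers - 1 jobs run on their own threads while the caller's thread executes the last job itself, so no core idles during the filter pass. Condensed restatement of the dispatch loop:

    for (i = 0; i < num_workers; ++i) {
      if (i == num_workers - 1)
        winterface->execute(&workers[i]);  /* run on the calling thread */
      else
        winterface->launch(&workers[i]);   /* run asynchronously */
    }
    for (i = 0; i < num_workers; ++i)
      winterface->sync(&workers[i]);       /* join / collect status */
]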
+
+void vp9_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame,
+ VP9_COMMON *cm,
+ struct macroblockd_plane planes[MAX_MB_PLANE],
+ int frame_filter_level,
+ int y_only, int partial_frame,
+ VP9Worker *workers, int num_workers,
+ VP9LfSync *lf_sync) {
+ int start_mi_row, end_mi_row, mi_rows_to_filter;
+
+ if (!frame_filter_level) return;
+
+ start_mi_row = 0;
+ mi_rows_to_filter = cm->mi_rows;
+ if (partial_frame && cm->mi_rows > 8) {
+ start_mi_row = cm->mi_rows >> 1;
+ start_mi_row &= 0xfffffff8;
+ mi_rows_to_filter = MAX(cm->mi_rows / 8, 8);
+ }
+ end_mi_row = start_mi_row + mi_rows_to_filter;
+ vp9_loop_filter_frame_init(cm, frame_filter_level);
+
+ loop_filter_rows_mt(frame, cm, planes, start_mi_row, end_mi_row,
+ y_only, workers, num_workers, lf_sync);
+}
+
+// Set up nsync by width.
+static INLINE int get_sync_range(int width) {
+ // nsync numbers are picked by testing. For example, for 4k
+ // video, using 4 gives the best performance.
+ if (width < 640)
+ return 1;
+ else if (width <= 1280)
+ return 2;
+ else if (width <= 4096)
+ return 4;
+ else
+ return 8;
+}
+
+// Allocate memory for lf row synchronization
+void vp9_loop_filter_alloc(VP9LfSync *lf_sync, VP9_COMMON *cm, int rows,
+ int width, int num_workers) {
+ lf_sync->rows = rows;
+#if CONFIG_MULTITHREAD
+ {
+ int i;
+
+ CHECK_MEM_ERROR(cm, lf_sync->mutex_,
+ vpx_malloc(sizeof(*lf_sync->mutex_) * rows));
+ if (lf_sync->mutex_) {
+ for (i = 0; i < rows; ++i) {
+ pthread_mutex_init(&lf_sync->mutex_[i], NULL);
+ }
+ }
+
+ CHECK_MEM_ERROR(cm, lf_sync->cond_,
+ vpx_malloc(sizeof(*lf_sync->cond_) * rows));
+ if (lf_sync->cond_) {
+ for (i = 0; i < rows; ++i) {
+ pthread_cond_init(&lf_sync->cond_[i], NULL);
+ }
+ }
+ }
+#endif // CONFIG_MULTITHREAD
+
+ CHECK_MEM_ERROR(cm, lf_sync->lfdata,
+ vpx_malloc(num_workers * sizeof(*lf_sync->lfdata)));
+ lf_sync->num_workers = num_workers;
+
+ CHECK_MEM_ERROR(cm, lf_sync->cur_sb_col,
+ vpx_malloc(sizeof(*lf_sync->cur_sb_col) * rows));
+
+ // Set up nsync.
+ lf_sync->sync_range = get_sync_range(width);
+}
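[Editor's note: CHECK_MEM_ERROR is assumed here to be libvpx's usual allocation guard, which assigns and then bails out through the codec's error handler on failure rather than returning a status. A sketch, not the verbatim macro:

    #define CHECK_MEM_ERROR(cm, lval, expr)                      \
      do {                                                       \
        (lval) = (expr);                                         \
        if (!(lval))                                             \
          vpx_internal_error(&(cm)->error, VPX_CODEC_MEM_ERROR,  \
                             "Failed to allocate " #lval);       \
      } while (0)
]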
+
+// Deallocate the loopfilter synchronization mutexes and data.
+void vp9_loop_filter_dealloc(VP9LfSync *lf_sync) {
+ if (lf_sync != NULL) {
+#if CONFIG_MULTITHREAD
+ int i;
+
+ if (lf_sync->mutex_ != NULL) {
+ for (i = 0; i < lf_sync->rows; ++i) {
+ pthread_mutex_destroy(&lf_sync->mutex_[i]);
+ }
+ vpx_free(lf_sync->mutex_);
+ }
+ if (lf_sync->cond_ != NULL) {
+ for (i = 0; i < lf_sync->rows; ++i) {
+ pthread_cond_destroy(&lf_sync->cond_[i]);
+ }
+ vpx_free(lf_sync->cond_);
+ }
+#endif // CONFIG_MULTITHREAD
+ vpx_free(lf_sync->lfdata);
+ vpx_free(lf_sync->cur_sb_col);
+ // Clear the structure: the caller may be handling a resize, in which case
+ // this call will be followed by an _alloc() that may itself fail.
+ vp9_zero(*lf_sync);
+ }
+}
+
+// Accumulate frame counts.
+void vp9_accumulate_frame_counts(VP9_COMMON *cm, FRAME_COUNTS *counts,
+ int is_dec) {
+ int i, j, k, l, m;
+
+ for (i = 0; i < BLOCK_SIZE_GROUPS; i++)
+ for (j = 0; j < INTRA_MODES; j++)
+ cm->counts.y_mode[i][j] += counts->y_mode[i][j];
+
+ for (i = 0; i < INTRA_MODES; i++)
+ for (j = 0; j < INTRA_MODES; j++)
+ cm->counts.uv_mode[i][j] += counts->uv_mode[i][j];
+
+ for (i = 0; i < PARTITION_CONTEXTS; i++)
+ for (j = 0; j < PARTITION_TYPES; j++)
+ cm->counts.partition[i][j] += counts->partition[i][j];
+
+ if (is_dec) {
+ int n;
+ for (i = 0; i < TX_SIZES; i++)
+ for (j = 0; j < PLANE_TYPES; j++)
+ for (k = 0; k < REF_TYPES; k++)
+ for (l = 0; l < COEF_BANDS; l++)
+ for (m = 0; m < COEFF_CONTEXTS; m++) {
+ cm->counts.eob_branch[i][j][k][l][m] +=
+ counts->eob_branch[i][j][k][l][m];
+ for (n = 0; n < UNCONSTRAINED_NODES + 1; n++)
+ cm->counts.coef[i][j][k][l][m][n] +=
+ counts->coef[i][j][k][l][m][n];
+ }
+ } else {
+ for (i = 0; i < TX_SIZES; i++)
+ for (j = 0; j < PLANE_TYPES; j++)
+ for (k = 0; k < REF_TYPES; k++)
+ for (l = 0; l < COEF_BANDS; l++)
+ for (m = 0; m < COEFF_CONTEXTS; m++)
+ cm->counts.eob_branch[i][j][k][l][m] +=
+ counts->eob_branch[i][j][k][l][m];
+ // In the encoder, cm->counts.coef is only updated at the frame
+ // level, so there is no need to accumulate it here.
+ // for (n = 0; n < UNCONSTRAINED_NODES + 1; n++)
+ // cm->counts.coef[i][j][k][l][m][n] +=
+ // counts->coef[i][j][k][l][m][n];
+ }
+
+ for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
+ for (j = 0; j < SWITCHABLE_FILTERS; j++)
+ cm->counts.switchable_interp[i][j] += counts->switchable_interp[i][j];
+
+ for (i = 0; i < INTER_MODE_CONTEXTS; i++)
+ for (j = 0; j < INTER_MODES; j++)
+ cm->counts.inter_mode[i][j] += counts->inter_mode[i][j];
+
+ for (i = 0; i < INTRA_INTER_CONTEXTS; i++)
+ for (j = 0; j < 2; j++)
+ cm->counts.intra_inter[i][j] += counts->intra_inter[i][j];
+
+ for (i = 0; i < COMP_INTER_CONTEXTS; i++)
+ for (j = 0; j < 2; j++)
+ cm->counts.comp_inter[i][j] += counts->comp_inter[i][j];
+
+ for (i = 0; i < REF_CONTEXTS; i++)
+ for (j = 0; j < 2; j++)
+ for (k = 0; k < 2; k++)
+ cm->counts.single_ref[i][j][k] += counts->single_ref[i][j][k];
+
+ for (i = 0; i < REF_CONTEXTS; i++)
+ for (j = 0; j < 2; j++)
+ cm->counts.comp_ref[i][j] += counts->comp_ref[i][j];
+
+ for (i = 0; i < TX_SIZE_CONTEXTS; i++) {
+ for (j = 0; j < TX_SIZES; j++)
+ cm->counts.tx.p32x32[i][j] += counts->tx.p32x32[i][j];
+
+ for (j = 0; j < TX_SIZES - 1; j++)
+ cm->counts.tx.p16x16[i][j] += counts->tx.p16x16[i][j];
+
+ for (j = 0; j < TX_SIZES - 2; j++)
+ cm->counts.tx.p8x8[i][j] += counts->tx.p8x8[i][j];
+ }
+
+ for (i = 0; i < TX_SIZES; i++)
+ cm->counts.tx.tx_totals[i] += counts->tx.tx_totals[i];
+
+ for (i = 0; i < SKIP_CONTEXTS; i++)
+ for (j = 0; j < 2; j++)
+ cm->counts.skip[i][j] += counts->skip[i][j];
+
+ for (i = 0; i < MV_JOINTS; i++)
+ cm->counts.mv.joints[i] += counts->mv.joints[i];
+
+ for (k = 0; k < 2; k++) {
+ nmv_component_counts *comps = &cm->counts.mv.comps[k];
+ nmv_component_counts *comps_t = &counts->mv.comps[k];
+
+ for (i = 0; i < 2; i++) {
+ comps->sign[i] += comps_t->sign[i];
+ comps->class0_hp[i] += comps_t->class0_hp[i];
+ comps->hp[i] += comps_t->hp[i];
+ }
+
+ for (i = 0; i < MV_CLASSES; i++)
+ comps->classes[i] += comps_t->classes[i];
+
+ for (i = 0; i < CLASS0_SIZE; i++) {
+ comps->class0[i] += comps_t->class0[i];
+ for (j = 0; j < MV_FP_SIZE; j++)
+ comps->class0_fp[i][j] += comps_t->class0_fp[i][j];
+ }
+
+ for (i = 0; i < MV_OFFSET_BITS; i++)
+ for (j = 0; j < 2; j++)
+ comps->bits[i][j] += comps_t->bits[i][j];
+
+ for (i = 0; i < MV_FP_SIZE; i++)
+ comps->fp[i] += comps_t->fp[i];
+ }
+}
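
For reference, a minimal self-contained sketch of the merge pattern vp9_accumulate_frame_counts() applies above: each worker tallies into a private histogram and the totals are summed serially afterwards. The table sizes and names here are toy stand-ins for the real FRAME_COUNTS arrays, not libvpx API.

#include <stdio.h>

#define NUM_WORKERS 4
#define NUM_CONTEXTS 3
#define NUM_SYMBOLS 2

typedef struct {
  unsigned int mode[NUM_CONTEXTS][NUM_SYMBOLS];
} toy_counts;

// Merge one worker's partial histogram into the shared totals.
static void accumulate(toy_counts *total, const toy_counts *partial) {
  int i, j;
  for (i = 0; i < NUM_CONTEXTS; i++)
    for (j = 0; j < NUM_SYMBOLS; j++)
      total->mode[i][j] += partial->mode[i][j];
}

int main(void) {
  toy_counts total = { { { 0 } } };
  toy_counts per_worker[NUM_WORKERS];
  int w, i, j;
  // Fill each worker's histogram with dummy tallies.
  for (w = 0; w < NUM_WORKERS; w++)
    for (i = 0; i < NUM_CONTEXTS; i++)
      for (j = 0; j < NUM_SYMBOLS; j++)
        per_worker[w].mode[i][j] = (unsigned int)(w + i + j);
  for (w = 0; w < NUM_WORKERS; w++)
    accumulate(&total, &per_worker[w]);
  printf("total.mode[0][0] = %u\n", total.mode[0][0]);  // prints 6
  return 0;
}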
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_thread_common.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_thread_common.h
new file mode 100644
index 00000000000..3b3a6996ae9
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_thread_common.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_COMMON_VP9_LOOPFILTER_THREAD_H_
+#define VP9_COMMON_VP9_LOOPFILTER_THREAD_H_
+#include "./vpx_config.h"
+#include "vp9/common/vp9_loopfilter.h"
+#include "vp9/common/vp9_thread.h"
+
+struct VP9Common;
+struct FRAME_COUNTS;
+
+// Loopfilter row synchronization
+typedef struct VP9LfSyncData {
+#if CONFIG_MULTITHREAD
+ pthread_mutex_t *mutex_;
+ pthread_cond_t *cond_;
+#endif
+ // Storage for the loop-filtered superblock index in each row.
+ int *cur_sb_col;
+ // The optimal sync_range for different resolutions and platforms should be
+ // determined by testing. Currently, it is chosen to be a power-of-2 number.
+ int sync_range;
+ int rows;
+
+ // Row-based parallel loopfilter data
+ LFWorkerData *lfdata;
+ int num_workers;
+} VP9LfSync;
+
+// Allocate memory for loopfilter row synchronization.
+void vp9_loop_filter_alloc(VP9LfSync *lf_sync, struct VP9Common *cm, int rows,
+ int width, int num_workers);
+
+// Deallocate loopfilter synchronization-related mutexes and data.
+void vp9_loop_filter_dealloc(VP9LfSync *lf_sync);
+
+// Multi-threaded loopfilter that uses the tile threads.
+void vp9_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame,
+ struct VP9Common *cm,
+ struct macroblockd_plane planes[MAX_MB_PLANE],
+ int frame_filter_level,
+ int y_only, int partial_frame,
+ VP9Worker *workers, int num_workers,
+ VP9LfSync *lf_sync);
+
+void vp9_accumulate_frame_counts(struct VP9Common *cm,
+ struct FRAME_COUNTS *counts, int is_dec);
+
+#endif // VP9_COMMON_VP9_LOOPFILTER_THREAD_H_
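
The cur_sb_col/sync_range fields above support a wait/post protocol between adjacent superblock rows: a row may only be filtered once the row above has advanced sync_range superblocks past the current column. The sketch below shows one plausible shape of that protocol under that assumption; the names and layout are illustrative, not the library's internals.

#include <pthread.h>

typedef struct {
  pthread_mutex_t *mutex_;  // one mutex per row
  pthread_cond_t *cond_;    // one condition variable per row
  int *cur_sb_col;          // last superblock column completed, per row
  int sync_range;
  int rows;
} row_sync;

// Block until the row above has been filtered far enough ahead of c.
static void row_sync_wait(row_sync *s, int row, int c) {
  if (row > 0) {
    pthread_mutex_lock(&s->mutex_[row - 1]);
    while (s->cur_sb_col[row - 1] < c + s->sync_range)
      pthread_cond_wait(&s->cond_[row - 1], &s->mutex_[row - 1]);
    pthread_mutex_unlock(&s->mutex_[row - 1]);
  }
}

// Publish progress on this row and wake any waiter blocked on it.
static void row_sync_post(row_sync *s, int row, int c) {
  pthread_mutex_lock(&s->mutex_[row]);
  s->cur_sb_col[row] = c;
  pthread_cond_signal(&s->cond_[row]);
  pthread_mutex_unlock(&s->mutex_[row]);
}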
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_tile_common.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_tile_common.c
index 8c4a30353c1..7a20e0a9e73 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_tile_common.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_tile_common.c
@@ -36,24 +36,24 @@ void vp9_tile_init(TileInfo *tile, const VP9_COMMON *cm, int row, int col) {
vp9_tile_set_col(tile, cm, col);
}
-void vp9_get_tile_n_bits(int mi_cols,
- int *min_log2_tile_cols, int *max_log2_tile_cols) {
- const int sb_cols = mi_cols_aligned_to_sb(mi_cols) >> MI_BLOCK_SIZE_LOG2;
- int min_log2 = 0, max_log2 = 0;
-
- // max
- while ((sb_cols >> max_log2) >= MIN_TILE_WIDTH_B64)
- ++max_log2;
- --max_log2;
- if (max_log2 < 0)
- max_log2 = 0;
-
- // min
- while ((MAX_TILE_WIDTH_B64 << min_log2) < sb_cols)
+static int get_min_log2_tile_cols(const int sb64_cols) {
+ int min_log2 = 0;
+ while ((MAX_TILE_WIDTH_B64 << min_log2) < sb64_cols)
++min_log2;
+ return min_log2;
+}
- assert(min_log2 <= max_log2);
+static int get_max_log2_tile_cols(const int sb64_cols) {
+ int max_log2 = 1;
+ while ((sb64_cols >> max_log2) >= MIN_TILE_WIDTH_B64)
+ ++max_log2;
+ return max_log2 - 1;
+}
- *min_log2_tile_cols = min_log2;
- *max_log2_tile_cols = max_log2;
+void vp9_get_tile_n_bits(int mi_cols,
+ int *min_log2_tile_cols, int *max_log2_tile_cols) {
+ const int sb64_cols = mi_cols_aligned_to_sb(mi_cols) >> MI_BLOCK_SIZE_LOG2;
+ *min_log2_tile_cols = get_min_log2_tile_cols(sb64_cols);
+ *max_log2_tile_cols = get_max_log2_tile_cols(sb64_cols);
+ assert(*min_log2_tile_cols <= *max_log2_tile_cols);
}
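
A small standalone check of the refactored bounds, assuming the usual VP9 constants MIN_TILE_WIDTH_B64 == 4 and MAX_TILE_WIDTH_B64 == 64 (vp9_tile_common.h holds the canonical values):

#include <assert.h>

#define MIN_TILE_WIDTH_B64 4
#define MAX_TILE_WIDTH_B64 64

static int min_log2_tile_cols(int sb64_cols) {
  int v = 0;
  while ((MAX_TILE_WIDTH_B64 << v) < sb64_cols) ++v;
  return v;
}

static int max_log2_tile_cols(int sb64_cols) {
  int v = 1;
  while ((sb64_cols >> v) >= MIN_TILE_WIDTH_B64) ++v;
  return v - 1;
}

int main(void) {
  // 1920 luma pixels -> 30 columns of 64x64 superblocks.
  assert(min_log2_tile_cols(30) == 0);  // one tile column is narrow enough
  assert(max_log2_tile_cols(30) == 2);  // at most 1 << 2 = 4 tile columns
  return 0;
}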
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_asm_stubs.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_asm_stubs.c
index a0a599691c8..963023c53b1 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_asm_stubs.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_asm_stubs.c
@@ -118,7 +118,7 @@ void vp9_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \
if (x_step_q4 == 16 && y_step_q4 == 16) { \
if (filter_x[0] || filter_x[1] || filter_x[2] || filter_x[3] == 128 || \
filter_y[0] || filter_y[1] || filter_y[2] || filter_y[3] == 128) { \
- DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71); \
+ DECLARE_ALIGNED(16, unsigned char, fdata2[64 * 71]); \
vp9_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2, 64, \
filter_x, x_step_q4, filter_y, y_step_q4, \
w, h + 7); \
@@ -126,7 +126,7 @@ void vp9_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \
filter_x, x_step_q4, filter_y, \
y_step_q4, w, h); \
} else { \
- DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 65); \
+ DECLARE_ALIGNED(16, unsigned char, fdata2[64 * 65]); \
vp9_convolve8_horiz_##opt(src, src_stride, fdata2, 64, \
filter_x, x_step_q4, filter_y, y_step_q4, \
w, h + 1); \
@@ -259,7 +259,7 @@ void vp9_highbd_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \
if (x_step_q4 == 16 && y_step_q4 == 16) { \
if (filter_x[0] || filter_x[1] || filter_x[2] || filter_x[3] == 128 || \
filter_y[0] || filter_y[1] || filter_y[2] || filter_y[3] == 128) { \
- DECLARE_ALIGNED_ARRAY(16, uint16_t, fdata2, 64 * 71); \
+ DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 71]); \
vp9_highbd_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, \
CONVERT_TO_BYTEPTR(fdata2), 64, \
filter_x, x_step_q4, \
@@ -271,7 +271,7 @@ void vp9_highbd_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \
filter_y, y_step_q4, \
w, h, bd); \
} else { \
- DECLARE_ALIGNED_ARRAY(16, uint16_t, fdata2, 64 * 65); \
+ DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 65]); \
vp9_highbd_convolve8_horiz_##opt(src, src_stride, \
CONVERT_TO_BYTEPTR(fdata2), 64, \
filter_x, x_step_q4, \
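
The DECLARE_ALIGNED() form adopted above aligns the array object itself, where the removed DECLARE_ALIGNED_ARRAY() had to over-allocate a buffer and round a pointer up by hand. A rough sketch of the per-compiler definition follows; the real one lives in vpx_ports/mem.h, and the MY_ prefix marks this as illustrative.

#if defined(__GNUC__)
#define MY_DECLARE_ALIGNED(n, typ, val) typ __attribute__((aligned(n))) val
#elif defined(_MSC_VER)
#define MY_DECLARE_ALIGNED(n, typ, val) __declspec(align(n)) typ val
#else
#define MY_DECLARE_ALIGNED(n, typ, val) typ val
#endif

static void example(void) {
  // A 16-byte-aligned stack array, safe for movdqa-style aligned loads.
  MY_DECLARE_ALIGNED(16, unsigned char, fdata2[64 * 71]);
  (void)fdata2;
}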
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_high_intrapred_sse2.asm b/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_high_intrapred_sse2.asm
index 721126c7825..b12d29c0ad8 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_high_intrapred_sse2.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_high_intrapred_sse2.asm
@@ -345,7 +345,7 @@ cglobal highbd_tm_predictor_8x8, 5, 6, 5, dst, stride, above, left, bps, one
%if ARCH_X86_64
INIT_XMM sse2
-cglobal highbd_tm_predictor_16x16, 5, 6, 8, dst, stride, above, left, bps, one
+cglobal highbd_tm_predictor_16x16, 5, 6, 9, dst, stride, above, left, bps, one
movd m2, [aboveq-2]
mova m0, [aboveq]
mova m1, [aboveq+16]
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_high_loopfilter_intrin_sse2.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_high_loopfilter_intrin_sse2.c
index 7e63f389ead..1637f0e545a 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_high_loopfilter_intrin_sse2.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_high_loopfilter_intrin_sse2.c
@@ -15,24 +15,38 @@
#include "vpx_ports/emmintrin_compat.h"
static INLINE __m128i signed_char_clamp_bd_sse2(__m128i value, int bd) {
- __m128i ubounded;
- __m128i lbounded;
- __m128i retval;
-
- const __m128i zero = _mm_set1_epi16(0);
- const __m128i one = _mm_set1_epi16(1);
- const __m128i t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), bd - 8);
- const __m128i max = _mm_subs_epi16(
- _mm_subs_epi16(_mm_slli_epi16(one, bd), one), t80);
- const __m128i min = _mm_subs_epi16(zero, t80);
- ubounded = _mm_cmpgt_epi16(value, max);
- lbounded = _mm_cmplt_epi16(value, min);
- retval = _mm_andnot_si128(_mm_or_si128(ubounded, lbounded), value);
- ubounded = _mm_and_si128(ubounded, max);
- lbounded = _mm_and_si128(lbounded, min);
- retval = _mm_or_si128(retval, ubounded);
- retval = _mm_or_si128(retval, lbounded);
- return retval;
+ __m128i ubounded;
+ __m128i lbounded;
+ __m128i retval;
+
+ const __m128i zero = _mm_set1_epi16(0);
+ const __m128i one = _mm_set1_epi16(1);
+ __m128i t80, max, min;
+
+ if (bd == 8) {
+ t80 = _mm_set1_epi16(0x80);
+ max = _mm_subs_epi16(
+ _mm_subs_epi16(_mm_slli_epi16(one, 8), one), t80);
+ } else if (bd == 10) {
+ t80 = _mm_set1_epi16(0x200);
+ max = _mm_subs_epi16(
+ _mm_subs_epi16(_mm_slli_epi16(one, 10), one), t80);
+ } else { // bd == 12
+ t80 = _mm_set1_epi16(0x800);
+ max = _mm_subs_epi16(
+ _mm_subs_epi16(_mm_slli_epi16(one, 12), one), t80);
+ }
+
+ min = _mm_subs_epi16(zero, t80);
+
+ ubounded = _mm_cmpgt_epi16(value, max);
+ lbounded = _mm_cmplt_epi16(value, min);
+ retval = _mm_andnot_si128(_mm_or_si128(ubounded, lbounded), value);
+ ubounded = _mm_and_si128(ubounded, max);
+ lbounded = _mm_and_si128(lbounded, min);
+ retval = _mm_or_si128(retval, ubounded);
+ retval = _mm_or_si128(retval, lbounded);
+ return retval;
}
// TODO(debargha, peter): Break up large functions into smaller ones
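
A scalar reference for the clamp above: values are biased by t80 = 0x80 << (bd - 8), so the legal interval is [-t80, (1 << bd) - 1 - t80]. This is a sketch for clarity, not part of libvpx:

#include <stdint.h>

static int16_t signed_char_clamp_bd(int32_t value, int bd) {
  const int32_t t80 = 0x80 << (bd - 8);
  const int32_t max = (1 << bd) - 1 - t80;  // matches ((1 << bd) - 1) - t80
  const int32_t min = -t80;                 // matches 0 - t80
  if (value > max) return (int16_t)max;
  if (value < min) return (int16_t)min;
  return (int16_t)value;
}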
@@ -45,14 +59,7 @@ static void highbd_mb_lpf_horizontal_edge_w_sse2_8(uint16_t *s,
int bd) {
const __m128i zero = _mm_set1_epi16(0);
const __m128i one = _mm_set1_epi16(1);
- const __m128i blimit = _mm_slli_epi16(
- _mm_unpacklo_epi8(
- _mm_load_si128((const __m128i *)_blimit), zero), bd - 8);
- const __m128i limit = _mm_slli_epi16(
- _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), bd - 8);
- const __m128i thresh = _mm_slli_epi16(
- _mm_unpacklo_epi8(
- _mm_load_si128((const __m128i *)_thresh), zero), bd - 8);
+ __m128i blimit, limit, thresh;
__m128i q7, p7, q6, p6, q5, p5, q4, p4, q3, p3, q2, p2, q1, p1, q0, p0;
__m128i mask, hev, flat, flat2, abs_p1p0, abs_q1q0;
__m128i ps1, qs1, ps0, qs0;
@@ -68,6 +75,26 @@ static void highbd_mb_lpf_horizontal_edge_w_sse2_8(uint16_t *s,
__m128i t4, t3, t80, t1;
__m128i eight, four;
+ if (bd == 8) {
+ blimit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero);
+ limit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero);
+ thresh = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero);
+ } else if (bd == 10) {
+ blimit = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 2);
+ limit = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2);
+ thresh = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2);
+ } else { // bd == 12
+ blimit = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 4);
+ limit = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4);
+ thresh = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4);
+ }
+
q4 = _mm_load_si128((__m128i *)(s + 4 * p));
p4 = _mm_load_si128((__m128i *)(s - 5 * p));
q3 = _mm_load_si128((__m128i *)(s + 3 * p));
@@ -121,7 +148,13 @@ static void highbd_mb_lpf_horizontal_edge_w_sse2_8(uint16_t *s,
// highbd_filter4
t4 = _mm_set1_epi16(4);
t3 = _mm_set1_epi16(3);
- t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), bd - 8);
+ if (bd == 8)
+ t80 = _mm_set1_epi16(0x80);
+ else if (bd == 10)
+ t80 = _mm_set1_epi16(0x200);
+ else // bd == 12
+ t80 = _mm_set1_epi16(0x800);
+
t1 = _mm_set1_epi16(0x1);
ps1 = _mm_subs_epi16(p1, t80);
@@ -136,7 +169,6 @@ static void highbd_mb_lpf_horizontal_edge_w_sse2_8(uint16_t *s,
filt = _mm_adds_epi16(filt, work_a);
filt = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, work_a), bd);
filt = _mm_and_si128(filt, mask);
-
filter1 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t4), bd);
filter2 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t3), bd);
@@ -153,13 +185,13 @@ static void highbd_mb_lpf_horizontal_edge_w_sse2_8(uint16_t *s,
filt = _mm_adds_epi16(filter1, t1);
filt = _mm_srai_epi16(filt, 1);
filt = _mm_andnot_si128(hev, filt);
-
qs1 = _mm_adds_epi16(
signed_char_clamp_bd_sse2(_mm_subs_epi16(qs1, filt), bd),
t80);
ps1 = _mm_adds_epi16(
signed_char_clamp_bd_sse2(_mm_adds_epi16(ps1, filt), bd),
t80);
+
// end highbd_filter4
// loopfilter done
@@ -175,7 +207,14 @@ static void highbd_mb_lpf_horizontal_edge_w_sse2_8(uint16_t *s,
flat = _mm_max_epi16(work, flat);
work = _mm_max_epi16(abs_p1p0, abs_q1q0);
flat = _mm_max_epi16(work, flat);
- flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, bd - 8));
+
+ if (bd == 8)
+ flat = _mm_subs_epu16(flat, one);
+ else if (bd == 10)
+ flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 2));
+ else // bd == 12
+ flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 4));
+
flat = _mm_cmpeq_epi16(flat, zero);
// end flat_mask4
@@ -215,7 +254,13 @@ static void highbd_mb_lpf_horizontal_edge_w_sse2_8(uint16_t *s,
_mm_subs_epu16(q0, q7)));
flat2 = _mm_max_epi16(work, flat2);
- flat2 = _mm_subs_epu16(flat2, _mm_slli_epi16(one, bd - 8));
+ if (bd == 8)
+ flat2 = _mm_subs_epu16(flat2, one);
+ else if (bd == 10)
+ flat2 = _mm_subs_epu16(flat2, _mm_slli_epi16(one, 2));
+ else // bd == 12
+ flat2 = _mm_subs_epu16(flat2, _mm_slli_epi16(one, 4));
+
flat2 = _mm_cmpeq_epi16(flat2, zero);
flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask
// end highbd_flat_mask5
@@ -479,22 +524,14 @@ void vp9_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p,
const uint8_t *_limit,
const uint8_t *_thresh,
int count, int bd) {
- DECLARE_ALIGNED_ARRAY(16, uint16_t, flat_op2, 16);
- DECLARE_ALIGNED_ARRAY(16, uint16_t, flat_op1, 16);
- DECLARE_ALIGNED_ARRAY(16, uint16_t, flat_op0, 16);
- DECLARE_ALIGNED_ARRAY(16, uint16_t, flat_oq2, 16);
- DECLARE_ALIGNED_ARRAY(16, uint16_t, flat_oq1, 16);
- DECLARE_ALIGNED_ARRAY(16, uint16_t, flat_oq0, 16);
+ DECLARE_ALIGNED(16, uint16_t, flat_op2[16]);
+ DECLARE_ALIGNED(16, uint16_t, flat_op1[16]);
+ DECLARE_ALIGNED(16, uint16_t, flat_op0[16]);
+ DECLARE_ALIGNED(16, uint16_t, flat_oq2[16]);
+ DECLARE_ALIGNED(16, uint16_t, flat_oq1[16]);
+ DECLARE_ALIGNED(16, uint16_t, flat_oq0[16]);
const __m128i zero = _mm_set1_epi16(0);
- const __m128i blimit = _mm_slli_epi16(
- _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero),
- bd - 8);
- const __m128i limit = _mm_slli_epi16(
- _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero),
- bd - 8);
- const __m128i thresh = _mm_slli_epi16(
- _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero),
- bd - 8);
+ __m128i blimit, limit, thresh;
__m128i mask, hev, flat;
__m128i p3 = _mm_load_si128((__m128i *)(s - 4 * p));
__m128i q3 = _mm_load_si128((__m128i *)(s + 3 * p));
@@ -512,18 +549,43 @@ void vp9_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p,
const __m128i t4 = _mm_set1_epi16(4);
const __m128i t3 = _mm_set1_epi16(3);
- const __m128i t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), bd - 8);
+ __m128i t80;
const __m128i t1 = _mm_set1_epi16(0x1);
- const __m128i ps1 = _mm_subs_epi16(p1, t80);
- const __m128i ps0 = _mm_subs_epi16(p0, t80);
- const __m128i qs0 = _mm_subs_epi16(q0, t80);
- const __m128i qs1 = _mm_subs_epi16(q1, t80);
+ __m128i ps1, ps0, qs0, qs1;
__m128i filt;
__m128i work_a;
__m128i filter1, filter2;
(void)count;
+ if (bd == 8) {
+ blimit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero);
+ limit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero);
+ thresh = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero);
+ t80 = _mm_set1_epi16(0x80);
+ } else if (bd == 10) {
+ blimit = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 2);
+ limit = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2);
+ thresh = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2);
+ t80 = _mm_set1_epi16(0x200);
+ } else { // bd == 12
+ blimit = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 4);
+ limit = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4);
+ thresh = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4);
+ t80 = _mm_set1_epi16(0x800);
+ }
+
+ ps1 = _mm_subs_epi16(p1, t80);
+ ps0 = _mm_subs_epi16(p0, t80);
+ qs0 = _mm_subs_epi16(q0, t80);
+ qs1 = _mm_subs_epi16(q1, t80);
+
// filter_mask and hev_mask
abs_p1p0 = _mm_or_si128(_mm_subs_epu16(p1, p0),
_mm_subs_epu16(p0, p1));
@@ -575,7 +637,14 @@ void vp9_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p,
flat = _mm_max_epi16(work, flat);
flat = _mm_max_epi16(abs_p1p0, flat);
flat = _mm_max_epi16(abs_q1q0, flat);
- flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, bd - 8));
+
+ if (bd == 8)
+ flat = _mm_subs_epu16(flat, one);
+ else if (bd == 10)
+ flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 2));
+ else // bd == 12
+ flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 4));
+
flat = _mm_cmpeq_epi16(flat, zero);
flat = _mm_and_si128(flat, mask); // flat & mask
@@ -706,15 +775,7 @@ void vp9_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p,
const uint8_t *_thresh,
int count, int bd) {
const __m128i zero = _mm_set1_epi16(0);
- const __m128i blimit = _mm_slli_epi16(
- _mm_unpacklo_epi8(
- _mm_load_si128((const __m128i *)_blimit), zero), bd - 8);
- const __m128i limit = _mm_slli_epi16(
- _mm_unpacklo_epi8(
- _mm_load_si128((const __m128i *)_limit), zero), bd - 8);
- const __m128i thresh = _mm_slli_epi16(
- _mm_unpacklo_epi8(
- _mm_load_si128((const __m128i *)_thresh), zero), bd - 8);
+ __m128i blimit, limit, thresh;
__m128i mask, hev, flat;
__m128i p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
__m128i p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
@@ -737,30 +798,63 @@ void vp9_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p,
__m128i work;
const __m128i t4 = _mm_set1_epi16(4);
const __m128i t3 = _mm_set1_epi16(3);
- const __m128i t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), bd - 8);
- const __m128i tff80 = _mm_slli_epi16(_mm_set1_epi16(0xff80), bd - 8);
- const __m128i tffe0 = _mm_slli_epi16(_mm_set1_epi16(0xffe0), bd - 8);
- const __m128i t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 16 - bd);
+ __m128i t80;
+ __m128i tff80;
+ __m128i tffe0;
+ __m128i t1f;
// equivalent to shifting 0x1f left by bitdepth - 8
// and setting new bits to 1
const __m128i t1 = _mm_set1_epi16(0x1);
- const __m128i t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 16 - bd);
+ __m128i t7f;
// equivalent to shifting 0x7f left by bitdepth - 8
// and setting new bits to 1
- const __m128i ps1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 2 * p)),
- t80);
- const __m128i ps0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 1 * p)),
- t80);
- const __m128i qs0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 0 * p)),
- t80);
- const __m128i qs1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 1 * p)),
- t80);
+ __m128i ps1, ps0, qs0, qs1;
__m128i filt;
__m128i work_a;
__m128i filter1, filter2;
(void)count;
+ if (bd == 8) {
+ blimit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero);
+ limit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero);
+ thresh = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero);
+ t80 = _mm_set1_epi16(0x80);
+ tff80 = _mm_set1_epi16(0xff80);
+ tffe0 = _mm_set1_epi16(0xffe0);
+ t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 8);
+ t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 8);
+ } else if (bd == 10) {
+ blimit = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 2);
+ limit = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2);
+ thresh = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2);
+ t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), 2);
+ tff80 = _mm_slli_epi16(_mm_set1_epi16(0xff80), 2);
+ tffe0 = _mm_slli_epi16(_mm_set1_epi16(0xffe0), 2);
+ t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 6);
+ t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 6);
+ } else { // bd == 12
+ blimit = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 4);
+ limit = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4);
+ thresh = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4);
+ t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), 4);
+ tff80 = _mm_slli_epi16(_mm_set1_epi16(0xff80), 4);
+ tffe0 = _mm_slli_epi16(_mm_set1_epi16(0xffe0), 4);
+ t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 4);
+ t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 4);
+ }
+
+ ps1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 2 * p)), t80);
+ ps0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 1 * p)), t80);
+ qs0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 0 * p)), t80);
+ qs1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 1 * p)), t80);
+
// filter_mask and hev_mask
flat = _mm_max_epi16(abs_p1p0, abs_q1q0);
hev = _mm_subs_epu16(flat, thresh);
@@ -796,6 +890,7 @@ void vp9_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p,
filt = _mm_adds_epi16(filt, work_a);
filt = _mm_adds_epi16(filt, work_a);
filt = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, work_a), bd);
+
// (vp9_filter + 3 * (qs0 - ps0)) & mask
filt = _mm_and_si128(filt, mask);
@@ -964,7 +1059,7 @@ void vp9_highbd_lpf_vertical_4_sse2(uint16_t *s, int p,
const uint8_t *limit,
const uint8_t *thresh,
int count, int bd) {
- DECLARE_ALIGNED_ARRAY(16, uint16_t, t_dst, 8 * 8);
+ DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 8]);
uint16_t *src[1];
uint16_t *dst[1];
(void)count;
@@ -994,7 +1089,7 @@ void vp9_highbd_lpf_vertical_4_dual_sse2(uint16_t *s, int p,
const uint8_t *limit1,
const uint8_t *thresh1,
int bd) {
- DECLARE_ALIGNED_ARRAY(16, uint16_t, t_dst, 16 * 8);
+ DECLARE_ALIGNED(16, uint16_t, t_dst[16 * 8]);
uint16_t *src[2];
uint16_t *dst[2];
@@ -1018,7 +1113,7 @@ void vp9_highbd_lpf_vertical_8_sse2(uint16_t *s, int p,
const uint8_t *limit,
const uint8_t *thresh,
int count, int bd) {
- DECLARE_ALIGNED_ARRAY(16, uint16_t, t_dst, 8 * 8);
+ DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 8]);
uint16_t *src[1];
uint16_t *dst[1];
(void)count;
@@ -1048,7 +1143,7 @@ void vp9_highbd_lpf_vertical_8_dual_sse2(uint16_t *s, int p,
const uint8_t *limit1,
const uint8_t *thresh1,
int bd) {
- DECLARE_ALIGNED_ARRAY(16, uint16_t, t_dst, 16 * 8);
+ DECLARE_ALIGNED(16, uint16_t, t_dst[16 * 8]);
uint16_t *src[2];
uint16_t *dst[2];
@@ -1073,7 +1168,7 @@ void vp9_highbd_lpf_vertical_16_sse2(uint16_t *s, int p,
const uint8_t *limit,
const uint8_t *thresh,
int bd) {
- DECLARE_ALIGNED_ARRAY(16, uint16_t, t_dst, 8 * 16);
+ DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 16]);
uint16_t *src[2];
uint16_t *dst[2];
@@ -1103,7 +1198,7 @@ void vp9_highbd_lpf_vertical_16_dual_sse2(uint16_t *s,
const uint8_t *limit,
const uint8_t *thresh,
int bd) {
- DECLARE_ALIGNED_ARRAY(16, uint16_t, t_dst, 256);
+ DECLARE_ALIGNED(16, uint16_t, t_dst[256]);
// Transpose 16x16
highbd_transpose8x16(s - 8, s - 8 + 8 * p, p, t_dst, 16);
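
Throughout this file the generic "x << (bd - 8)" scaling is replaced by explicit bd == 8/10/12 arms, presumably so the shift counts become compile-time immediates. A scalar sketch of the mapping the branches hard-code (the helper name is illustrative):

#include <assert.h>
#include <stdint.h>

static uint16_t t80_for_bd(int bd) {
  return (uint16_t)(0x80 << (bd - 8));
}

int main(void) {
  assert(t80_for_bd(8) == 0x80);    // bd == 8: no shift
  assert(t80_for_bd(10) == 0x200);  // bd == 10: << 2
  assert(t80_for_bd(12) == 0x800);  // bd == 12: << 4
  return 0;
}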
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c
index df609872b75..0385c7955c9 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c
@@ -9,6 +9,7 @@
*/
#include "vp9/common/x86/vp9_idct_intrin_sse2.h"
+#include "vp9/common/vp9_idct.h"
#define RECON_AND_STORE4X4(dest, in_x) \
{ \
@@ -16,17 +17,16 @@
d0 = _mm_unpacklo_epi8(d0, zero); \
d0 = _mm_add_epi16(in_x, d0); \
d0 = _mm_packus_epi16(d0, d0); \
- *(int *)dest = _mm_cvtsi128_si32(d0); \
- dest += stride; \
+ *(int *)(dest) = _mm_cvtsi128_si32(d0); \
}
void vp9_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
const __m128i zero = _mm_setzero_si128();
const __m128i eight = _mm_set1_epi16(8);
- const __m128i cst = _mm_setr_epi16((int16_t)cospi_16_64, (int16_t)cospi_16_64,
- (int16_t)cospi_16_64, (int16_t)-cospi_16_64,
- (int16_t)cospi_24_64, (int16_t)-cospi_8_64,
- (int16_t)cospi_8_64, (int16_t)cospi_24_64);
+ const __m128i cst = _mm_setr_epi16(
+ (int16_t)cospi_16_64, (int16_t)cospi_16_64, (int16_t)cospi_16_64,
+ (int16_t)-cospi_16_64, (int16_t)cospi_24_64, (int16_t)-cospi_8_64,
+ (int16_t)cospi_8_64, (int16_t)cospi_24_64);
const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
__m128i input0, input1, input2, input3;
@@ -125,28 +125,28 @@ void vp9_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
// Reconstruction and Store
{
- __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest));
- __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
- d0 = _mm_unpacklo_epi32(d0,
- _mm_cvtsi32_si128(*(const int *) (dest + stride)));
- d2 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(
- *(const int *) (dest + stride * 3)), d2);
- d0 = _mm_unpacklo_epi8(d0, zero);
- d2 = _mm_unpacklo_epi8(d2, zero);
- d0 = _mm_add_epi16(d0, input2);
- d2 = _mm_add_epi16(d2, input3);
- d0 = _mm_packus_epi16(d0, d2);
- // store input0
- *(int *)dest = _mm_cvtsi128_si32(d0);
- // store input1
- d0 = _mm_srli_si128(d0, 4);
- *(int *)(dest + stride) = _mm_cvtsi128_si32(d0);
- // store input2
- d0 = _mm_srli_si128(d0, 4);
- *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0);
- // store input3
- d0 = _mm_srli_si128(d0, 4);
- *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0);
+ __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest));
+ __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
+ d0 = _mm_unpacklo_epi32(d0,
+ _mm_cvtsi32_si128(*(const int *)(dest + stride)));
+ d2 = _mm_unpacklo_epi32(
+ _mm_cvtsi32_si128(*(const int *)(dest + stride * 3)), d2);
+ d0 = _mm_unpacklo_epi8(d0, zero);
+ d2 = _mm_unpacklo_epi8(d2, zero);
+ d0 = _mm_add_epi16(d0, input2);
+ d2 = _mm_add_epi16(d2, input3);
+ d0 = _mm_packus_epi16(d0, d2);
+ // store input0
+ *(int *)dest = _mm_cvtsi128_si32(d0);
+ // store input1
+ d0 = _mm_srli_si128(d0, 4);
+ *(int *)(dest + stride) = _mm_cvtsi128_si32(d0);
+ // store input2
+ d0 = _mm_srli_si128(d0, 4);
+ *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0);
+ // store input3
+ d0 = _mm_srli_si128(d0, 4);
+ *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0);
}
}
@@ -161,10 +161,10 @@ void vp9_idct4x4_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
dc_value = _mm_set1_epi16(a);
- RECON_AND_STORE4X4(dest, dc_value);
- RECON_AND_STORE4X4(dest, dc_value);
- RECON_AND_STORE4X4(dest, dc_value);
- RECON_AND_STORE4X4(dest, dc_value);
+ RECON_AND_STORE4X4(dest + 0 * stride, dc_value);
+ RECON_AND_STORE4X4(dest + 1 * stride, dc_value);
+ RECON_AND_STORE4X4(dest + 2 * stride, dc_value);
+ RECON_AND_STORE4X4(dest + 3 * stride, dc_value);
}
static INLINE void transpose_4x4(__m128i *res) {
@@ -216,7 +216,7 @@ static void iadst4_sse2(__m128i *in) {
const __m128i k__sinpi_p03_p02 = pair_set_epi16(sinpi_3_9, sinpi_2_9);
const __m128i k__sinpi_p02_m01 = pair_set_epi16(sinpi_2_9, -sinpi_1_9);
const __m128i k__sinpi_p03_m04 = pair_set_epi16(sinpi_3_9, -sinpi_4_9);
- const __m128i k__sinpi_p03_p03 = _mm_set1_epi16(sinpi_3_9);
+ const __m128i k__sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi_3_9);
const __m128i kZero = _mm_set1_epi16(0);
const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
__m128i u[8], v[8], in7;
@@ -266,8 +266,8 @@ void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride,
const __m128i zero = _mm_setzero_si128();
const __m128i eight = _mm_set1_epi16(8);
- in[0]= _mm_loadu_si128((const __m128i *)(input));
- in[1]= _mm_loadu_si128((const __m128i *)(input + 8));
+ in[0] = _mm_loadu_si128((const __m128i *)(input));
+ in[1] = _mm_loadu_si128((const __m128i *)(input + 8));
switch (tx_type) {
case 0: // DCT_DCT
@@ -300,28 +300,28 @@ void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride,
// Reconstruction and Store
{
- __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest));
- __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
- d0 = _mm_unpacklo_epi32(d0,
- _mm_cvtsi32_si128(*(const int *) (dest + stride)));
- d2 = _mm_unpacklo_epi32(d2, _mm_cvtsi32_si128(
- *(const int *) (dest + stride * 3)));
- d0 = _mm_unpacklo_epi8(d0, zero);
- d2 = _mm_unpacklo_epi8(d2, zero);
- d0 = _mm_add_epi16(d0, in[0]);
- d2 = _mm_add_epi16(d2, in[1]);
- d0 = _mm_packus_epi16(d0, d2);
- // store result[0]
- *(int *)dest = _mm_cvtsi128_si32(d0);
- // store result[1]
- d0 = _mm_srli_si128(d0, 4);
- *(int *)(dest + stride) = _mm_cvtsi128_si32(d0);
- // store result[2]
- d0 = _mm_srli_si128(d0, 4);
- *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0);
- // store result[3]
- d0 = _mm_srli_si128(d0, 4);
- *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0);
+ __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest));
+ __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
+ d0 = _mm_unpacklo_epi32(d0,
+ _mm_cvtsi32_si128(*(const int *)(dest + stride)));
+ d2 = _mm_unpacklo_epi32(
+ d2, _mm_cvtsi32_si128(*(const int *)(dest + stride * 3)));
+ d0 = _mm_unpacklo_epi8(d0, zero);
+ d2 = _mm_unpacklo_epi8(d2, zero);
+ d0 = _mm_add_epi16(d0, in[0]);
+ d2 = _mm_add_epi16(d2, in[1]);
+ d0 = _mm_packus_epi16(d0, d2);
+ // store result[0]
+ *(int *)dest = _mm_cvtsi128_si32(d0);
+ // store result[1]
+ d0 = _mm_srli_si128(d0, 4);
+ *(int *)(dest + stride) = _mm_cvtsi128_si32(d0);
+ // store result[2]
+ d0 = _mm_srli_si128(d0, 4);
+ *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0);
+ // store result[3]
+ d0 = _mm_srli_si128(d0, 4);
+ *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0);
}
}
@@ -516,7 +516,7 @@ void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride,
void vp9_idct8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
const __m128i zero = _mm_setzero_si128();
const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
- const __m128i final_rounding = _mm_set1_epi16(1<<4);
+ const __m128i final_rounding = _mm_set1_epi16(1 << 4);
const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
@@ -550,7 +550,7 @@ void vp9_idct8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
// 4-stage 1D idct8x8
IDCT8(in0, in1, in2, in3, in4, in5, in6, in7,
- in0, in1, in2, in3, in4, in5, in6, in7);
+ in0, in1, in2, in3, in4, in5, in6, in7);
}
// Final rounding and shift
@@ -572,14 +572,14 @@ void vp9_idct8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
in6 = _mm_srai_epi16(in6, 5);
in7 = _mm_srai_epi16(in7, 5);
- RECON_AND_STORE(dest, in0);
- RECON_AND_STORE(dest, in1);
- RECON_AND_STORE(dest, in2);
- RECON_AND_STORE(dest, in3);
- RECON_AND_STORE(dest, in4);
- RECON_AND_STORE(dest, in5);
- RECON_AND_STORE(dest, in6);
- RECON_AND_STORE(dest, in7);
+ RECON_AND_STORE(dest + 0 * stride, in0);
+ RECON_AND_STORE(dest + 1 * stride, in1);
+ RECON_AND_STORE(dest + 2 * stride, in2);
+ RECON_AND_STORE(dest + 3 * stride, in3);
+ RECON_AND_STORE(dest + 4 * stride, in4);
+ RECON_AND_STORE(dest + 5 * stride, in5);
+ RECON_AND_STORE(dest + 6 * stride, in6);
+ RECON_AND_STORE(dest + 7 * stride, in7);
}
void vp9_idct8x8_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
@@ -593,14 +593,14 @@ void vp9_idct8x8_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
dc_value = _mm_set1_epi16(a);
- RECON_AND_STORE(dest, dc_value);
- RECON_AND_STORE(dest, dc_value);
- RECON_AND_STORE(dest, dc_value);
- RECON_AND_STORE(dest, dc_value);
- RECON_AND_STORE(dest, dc_value);
- RECON_AND_STORE(dest, dc_value);
- RECON_AND_STORE(dest, dc_value);
- RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest + 0 * stride, dc_value);
+ RECON_AND_STORE(dest + 1 * stride, dc_value);
+ RECON_AND_STORE(dest + 2 * stride, dc_value);
+ RECON_AND_STORE(dest + 3 * stride, dc_value);
+ RECON_AND_STORE(dest + 4 * stride, dc_value);
+ RECON_AND_STORE(dest + 5 * stride, dc_value);
+ RECON_AND_STORE(dest + 6 * stride, dc_value);
+ RECON_AND_STORE(dest + 7 * stride, dc_value);
}
static void idct8_sse2(__m128i *in) {
@@ -625,7 +625,7 @@ static void idct8_sse2(__m128i *in) {
// 4-stage 1D idct8x8
IDCT8(in0, in1, in2, in3, in4, in5, in6, in7,
- in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7]);
+ in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7]);
}
static void iadst8_sse2(__m128i *in) {
@@ -641,7 +641,7 @@ static void iadst8_sse2(__m128i *in) {
const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
const __m128i k__const_0 = _mm_set1_epi16(0);
const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
@@ -655,14 +655,14 @@ static void iadst8_sse2(__m128i *in) {
array_transpose_8x8(in, in);
// properly aligned for butterfly input
- in0 = in[7];
- in1 = in[0];
- in2 = in[5];
- in3 = in[2];
- in4 = in[3];
- in5 = in[4];
- in6 = in[1];
- in7 = in[6];
+ in0 = in[7];
+ in1 = in[0];
+ in2 = in[5];
+ in3 = in[2];
+ in4 = in[3];
+ in5 = in[4];
+ in6 = in[1];
+ in7 = in[6];
// column transformation
// stage 1
@@ -856,12 +856,11 @@ static void iadst8_sse2(__m128i *in) {
in[7] = _mm_sub_epi16(k__const_0, s1);
}
-
void vp9_iht8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride,
int tx_type) {
__m128i in[8];
const __m128i zero = _mm_setzero_si128();
- const __m128i final_rounding = _mm_set1_epi16(1<<4);
+ const __m128i final_rounding = _mm_set1_epi16(1 << 4);
// load input data
in[0] = _mm_load_si128((const __m128i *)input);
@@ -914,20 +913,20 @@ void vp9_iht8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride,
in[6] = _mm_srai_epi16(in[6], 5);
in[7] = _mm_srai_epi16(in[7], 5);
- RECON_AND_STORE(dest, in[0]);
- RECON_AND_STORE(dest, in[1]);
- RECON_AND_STORE(dest, in[2]);
- RECON_AND_STORE(dest, in[3]);
- RECON_AND_STORE(dest, in[4]);
- RECON_AND_STORE(dest, in[5]);
- RECON_AND_STORE(dest, in[6]);
- RECON_AND_STORE(dest, in[7]);
+ RECON_AND_STORE(dest + 0 * stride, in[0]);
+ RECON_AND_STORE(dest + 1 * stride, in[1]);
+ RECON_AND_STORE(dest + 2 * stride, in[2]);
+ RECON_AND_STORE(dest + 3 * stride, in[3]);
+ RECON_AND_STORE(dest + 4 * stride, in[4]);
+ RECON_AND_STORE(dest + 5 * stride, in[5]);
+ RECON_AND_STORE(dest + 6 * stride, in[6]);
+ RECON_AND_STORE(dest + 7 * stride, in[7]);
}
void vp9_idct8x8_12_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
const __m128i zero = _mm_setzero_si128();
const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
- const __m128i final_rounding = _mm_set1_epi16(1<<4);
+ const __m128i final_rounding = _mm_set1_epi16(1 << 4);
const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
@@ -952,7 +951,7 @@ void vp9_idct8x8_12_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
// 8x4 Transpose
TRANSPOSE_8X8_10(in0, in1, in2, in3, in0, in1);
// Stage1
- { //NOLINT
+ {
const __m128i lo_17 = _mm_unpackhi_epi16(in0, zero);
const __m128i lo_35 = _mm_unpackhi_epi16(in1, zero);
@@ -975,7 +974,7 @@ void vp9_idct8x8_12_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
}
// Stage2
- { //NOLINT
+ {
const __m128i lo_04 = _mm_unpacklo_epi16(in0, zero);
const __m128i lo_26 = _mm_unpacklo_epi16(in1, zero);
@@ -1005,7 +1004,7 @@ void vp9_idct8x8_12_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
}
// Stage3
- { //NOLINT
+ {
const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6);
tmp4 = _mm_adds_epi16(stp2_0, stp2_2);
@@ -1034,7 +1033,7 @@ void vp9_idct8x8_12_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, in0, in1, in2, in3)
IDCT8(in0, in1, in2, in3, zero, zero, zero, zero,
- in0, in1, in2, in3, in4, in5, in6, in7);
+ in0, in1, in2, in3, in4, in5, in6, in7);
// Final rounding and shift
in0 = _mm_adds_epi16(in0, final_rounding);
in1 = _mm_adds_epi16(in1, final_rounding);
@@ -1054,14 +1053,14 @@ void vp9_idct8x8_12_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
in6 = _mm_srai_epi16(in6, 5);
in7 = _mm_srai_epi16(in7, 5);
- RECON_AND_STORE(dest, in0);
- RECON_AND_STORE(dest, in1);
- RECON_AND_STORE(dest, in2);
- RECON_AND_STORE(dest, in3);
- RECON_AND_STORE(dest, in4);
- RECON_AND_STORE(dest, in5);
- RECON_AND_STORE(dest, in6);
- RECON_AND_STORE(dest, in7);
+ RECON_AND_STORE(dest + 0 * stride, in0);
+ RECON_AND_STORE(dest + 1 * stride, in1);
+ RECON_AND_STORE(dest + 2 * stride, in2);
+ RECON_AND_STORE(dest + 3 * stride, in3);
+ RECON_AND_STORE(dest + 4 * stride, in4);
+ RECON_AND_STORE(dest + 5 * stride, in5);
+ RECON_AND_STORE(dest + 6 * stride, in6);
+ RECON_AND_STORE(dest + 7 * stride, in7);
}
#define IDCT16 \
@@ -1304,7 +1303,7 @@ void vp9_idct8x8_12_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
void vp9_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest,
int stride) {
const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
- const __m128i final_rounding = _mm_set1_epi16(1<<5);
+ const __m128i final_rounding = _mm_set1_epi16(1 << 5);
const __m128i zero = _mm_setzero_si128();
const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
@@ -1343,130 +1342,86 @@ void vp9_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest,
curr1 = l;
for (i = 0; i < 2; i++) {
- // 1-D idct
-
- // Load input data.
- in[0] = _mm_load_si128((const __m128i *)input);
- in[8] = _mm_load_si128((const __m128i *)(input + 8 * 1));
- in[1] = _mm_load_si128((const __m128i *)(input + 8 * 2));
- in[9] = _mm_load_si128((const __m128i *)(input + 8 * 3));
- in[2] = _mm_load_si128((const __m128i *)(input + 8 * 4));
- in[10] = _mm_load_si128((const __m128i *)(input + 8 * 5));
- in[3] = _mm_load_si128((const __m128i *)(input + 8 * 6));
- in[11] = _mm_load_si128((const __m128i *)(input + 8 * 7));
- in[4] = _mm_load_si128((const __m128i *)(input + 8 * 8));
- in[12] = _mm_load_si128((const __m128i *)(input + 8 * 9));
- in[5] = _mm_load_si128((const __m128i *)(input + 8 * 10));
- in[13] = _mm_load_si128((const __m128i *)(input + 8 * 11));
- in[6] = _mm_load_si128((const __m128i *)(input + 8 * 12));
- in[14] = _mm_load_si128((const __m128i *)(input + 8 * 13));
- in[7] = _mm_load_si128((const __m128i *)(input + 8 * 14));
- in[15] = _mm_load_si128((const __m128i *)(input + 8 * 15));
-
- array_transpose_8x8(in, in);
- array_transpose_8x8(in+8, in+8);
-
- IDCT16
-
- // Stage7
- curr1[0] = _mm_add_epi16(stp2_0, stp1_15);
- curr1[1] = _mm_add_epi16(stp2_1, stp1_14);
- curr1[2] = _mm_add_epi16(stp2_2, stp2_13);
- curr1[3] = _mm_add_epi16(stp2_3, stp2_12);
- curr1[4] = _mm_add_epi16(stp2_4, stp2_11);
- curr1[5] = _mm_add_epi16(stp2_5, stp2_10);
- curr1[6] = _mm_add_epi16(stp2_6, stp1_9);
- curr1[7] = _mm_add_epi16(stp2_7, stp1_8);
- curr1[8] = _mm_sub_epi16(stp2_7, stp1_8);
- curr1[9] = _mm_sub_epi16(stp2_6, stp1_9);
- curr1[10] = _mm_sub_epi16(stp2_5, stp2_10);
- curr1[11] = _mm_sub_epi16(stp2_4, stp2_11);
- curr1[12] = _mm_sub_epi16(stp2_3, stp2_12);
- curr1[13] = _mm_sub_epi16(stp2_2, stp2_13);
- curr1[14] = _mm_sub_epi16(stp2_1, stp1_14);
- curr1[15] = _mm_sub_epi16(stp2_0, stp1_15);
-
- curr1 = r;
- input += 128;
+ // 1-D idct
+
+ // Load input data.
+ in[0] = _mm_load_si128((const __m128i *)input);
+ in[8] = _mm_load_si128((const __m128i *)(input + 8 * 1));
+ in[1] = _mm_load_si128((const __m128i *)(input + 8 * 2));
+ in[9] = _mm_load_si128((const __m128i *)(input + 8 * 3));
+ in[2] = _mm_load_si128((const __m128i *)(input + 8 * 4));
+ in[10] = _mm_load_si128((const __m128i *)(input + 8 * 5));
+ in[3] = _mm_load_si128((const __m128i *)(input + 8 * 6));
+ in[11] = _mm_load_si128((const __m128i *)(input + 8 * 7));
+ in[4] = _mm_load_si128((const __m128i *)(input + 8 * 8));
+ in[12] = _mm_load_si128((const __m128i *)(input + 8 * 9));
+ in[5] = _mm_load_si128((const __m128i *)(input + 8 * 10));
+ in[13] = _mm_load_si128((const __m128i *)(input + 8 * 11));
+ in[6] = _mm_load_si128((const __m128i *)(input + 8 * 12));
+ in[14] = _mm_load_si128((const __m128i *)(input + 8 * 13));
+ in[7] = _mm_load_si128((const __m128i *)(input + 8 * 14));
+ in[15] = _mm_load_si128((const __m128i *)(input + 8 * 15));
+
+ array_transpose_8x8(in, in);
+ array_transpose_8x8(in + 8, in + 8);
+
+ IDCT16
+
+ // Stage7
+ curr1[0] = _mm_add_epi16(stp2_0, stp1_15);
+ curr1[1] = _mm_add_epi16(stp2_1, stp1_14);
+ curr1[2] = _mm_add_epi16(stp2_2, stp2_13);
+ curr1[3] = _mm_add_epi16(stp2_3, stp2_12);
+ curr1[4] = _mm_add_epi16(stp2_4, stp2_11);
+ curr1[5] = _mm_add_epi16(stp2_5, stp2_10);
+ curr1[6] = _mm_add_epi16(stp2_6, stp1_9);
+ curr1[7] = _mm_add_epi16(stp2_7, stp1_8);
+ curr1[8] = _mm_sub_epi16(stp2_7, stp1_8);
+ curr1[9] = _mm_sub_epi16(stp2_6, stp1_9);
+ curr1[10] = _mm_sub_epi16(stp2_5, stp2_10);
+ curr1[11] = _mm_sub_epi16(stp2_4, stp2_11);
+ curr1[12] = _mm_sub_epi16(stp2_3, stp2_12);
+ curr1[13] = _mm_sub_epi16(stp2_2, stp2_13);
+ curr1[14] = _mm_sub_epi16(stp2_1, stp1_14);
+ curr1[15] = _mm_sub_epi16(stp2_0, stp1_15);
+
+ curr1 = r;
+ input += 128;
}
for (i = 0; i < 2; i++) {
- // 1-D idct
- array_transpose_8x8(l+i*8, in);
- array_transpose_8x8(r+i*8, in+8);
-
- IDCT16
-
- // 2-D
- in[0] = _mm_add_epi16(stp2_0, stp1_15);
- in[1] = _mm_add_epi16(stp2_1, stp1_14);
- in[2] = _mm_add_epi16(stp2_2, stp2_13);
- in[3] = _mm_add_epi16(stp2_3, stp2_12);
- in[4] = _mm_add_epi16(stp2_4, stp2_11);
- in[5] = _mm_add_epi16(stp2_5, stp2_10);
- in[6] = _mm_add_epi16(stp2_6, stp1_9);
- in[7] = _mm_add_epi16(stp2_7, stp1_8);
- in[8] = _mm_sub_epi16(stp2_7, stp1_8);
- in[9] = _mm_sub_epi16(stp2_6, stp1_9);
- in[10] = _mm_sub_epi16(stp2_5, stp2_10);
- in[11] = _mm_sub_epi16(stp2_4, stp2_11);
- in[12] = _mm_sub_epi16(stp2_3, stp2_12);
- in[13] = _mm_sub_epi16(stp2_2, stp2_13);
- in[14] = _mm_sub_epi16(stp2_1, stp1_14);
- in[15] = _mm_sub_epi16(stp2_0, stp1_15);
+ int j;
+ // 1-D idct
+ array_transpose_8x8(l + i * 8, in);
+ array_transpose_8x8(r + i * 8, in + 8);
+
+ IDCT16
+ // 2-D
+ in[0] = _mm_add_epi16(stp2_0, stp1_15);
+ in[1] = _mm_add_epi16(stp2_1, stp1_14);
+ in[2] = _mm_add_epi16(stp2_2, stp2_13);
+ in[3] = _mm_add_epi16(stp2_3, stp2_12);
+ in[4] = _mm_add_epi16(stp2_4, stp2_11);
+ in[5] = _mm_add_epi16(stp2_5, stp2_10);
+ in[6] = _mm_add_epi16(stp2_6, stp1_9);
+ in[7] = _mm_add_epi16(stp2_7, stp1_8);
+ in[8] = _mm_sub_epi16(stp2_7, stp1_8);
+ in[9] = _mm_sub_epi16(stp2_6, stp1_9);
+ in[10] = _mm_sub_epi16(stp2_5, stp2_10);
+ in[11] = _mm_sub_epi16(stp2_4, stp2_11);
+ in[12] = _mm_sub_epi16(stp2_3, stp2_12);
+ in[13] = _mm_sub_epi16(stp2_2, stp2_13);
+ in[14] = _mm_sub_epi16(stp2_1, stp1_14);
+ in[15] = _mm_sub_epi16(stp2_0, stp1_15);
+
+ for (j = 0; j < 16; ++j) {
// Final rounding and shift
- in[0] = _mm_adds_epi16(in[0], final_rounding);
- in[1] = _mm_adds_epi16(in[1], final_rounding);
- in[2] = _mm_adds_epi16(in[2], final_rounding);
- in[3] = _mm_adds_epi16(in[3], final_rounding);
- in[4] = _mm_adds_epi16(in[4], final_rounding);
- in[5] = _mm_adds_epi16(in[5], final_rounding);
- in[6] = _mm_adds_epi16(in[6], final_rounding);
- in[7] = _mm_adds_epi16(in[7], final_rounding);
- in[8] = _mm_adds_epi16(in[8], final_rounding);
- in[9] = _mm_adds_epi16(in[9], final_rounding);
- in[10] = _mm_adds_epi16(in[10], final_rounding);
- in[11] = _mm_adds_epi16(in[11], final_rounding);
- in[12] = _mm_adds_epi16(in[12], final_rounding);
- in[13] = _mm_adds_epi16(in[13], final_rounding);
- in[14] = _mm_adds_epi16(in[14], final_rounding);
- in[15] = _mm_adds_epi16(in[15], final_rounding);
-
- in[0] = _mm_srai_epi16(in[0], 6);
- in[1] = _mm_srai_epi16(in[1], 6);
- in[2] = _mm_srai_epi16(in[2], 6);
- in[3] = _mm_srai_epi16(in[3], 6);
- in[4] = _mm_srai_epi16(in[4], 6);
- in[5] = _mm_srai_epi16(in[5], 6);
- in[6] = _mm_srai_epi16(in[6], 6);
- in[7] = _mm_srai_epi16(in[7], 6);
- in[8] = _mm_srai_epi16(in[8], 6);
- in[9] = _mm_srai_epi16(in[9], 6);
- in[10] = _mm_srai_epi16(in[10], 6);
- in[11] = _mm_srai_epi16(in[11], 6);
- in[12] = _mm_srai_epi16(in[12], 6);
- in[13] = _mm_srai_epi16(in[13], 6);
- in[14] = _mm_srai_epi16(in[14], 6);
- in[15] = _mm_srai_epi16(in[15], 6);
-
- RECON_AND_STORE(dest, in[0]);
- RECON_AND_STORE(dest, in[1]);
- RECON_AND_STORE(dest, in[2]);
- RECON_AND_STORE(dest, in[3]);
- RECON_AND_STORE(dest, in[4]);
- RECON_AND_STORE(dest, in[5]);
- RECON_AND_STORE(dest, in[6]);
- RECON_AND_STORE(dest, in[7]);
- RECON_AND_STORE(dest, in[8]);
- RECON_AND_STORE(dest, in[9]);
- RECON_AND_STORE(dest, in[10]);
- RECON_AND_STORE(dest, in[11]);
- RECON_AND_STORE(dest, in[12]);
- RECON_AND_STORE(dest, in[13]);
- RECON_AND_STORE(dest, in[14]);
- RECON_AND_STORE(dest, in[15]);
-
- dest += 8 - (stride * 16);
+ in[j] = _mm_adds_epi16(in[j], final_rounding);
+ in[j] = _mm_srai_epi16(in[j], 6);
+ RECON_AND_STORE(dest + j * stride, in[j]);
+ }
+
+ dest += 8;
}
}
@@ -1482,23 +1437,23 @@ void vp9_idct16x16_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
dc_value = _mm_set1_epi16(a);
for (i = 0; i < 2; ++i) {
- RECON_AND_STORE(dest, dc_value);
- RECON_AND_STORE(dest, dc_value);
- RECON_AND_STORE(dest, dc_value);
- RECON_AND_STORE(dest, dc_value);
- RECON_AND_STORE(dest, dc_value);
- RECON_AND_STORE(dest, dc_value);
- RECON_AND_STORE(dest, dc_value);
- RECON_AND_STORE(dest, dc_value);
- RECON_AND_STORE(dest, dc_value);
- RECON_AND_STORE(dest, dc_value);
- RECON_AND_STORE(dest, dc_value);
- RECON_AND_STORE(dest, dc_value);
- RECON_AND_STORE(dest, dc_value);
- RECON_AND_STORE(dest, dc_value);
- RECON_AND_STORE(dest, dc_value);
- RECON_AND_STORE(dest, dc_value);
- dest += 8 - (stride * 16);
+ RECON_AND_STORE(dest + 0 * stride, dc_value);
+ RECON_AND_STORE(dest + 1 * stride, dc_value);
+ RECON_AND_STORE(dest + 2 * stride, dc_value);
+ RECON_AND_STORE(dest + 3 * stride, dc_value);
+ RECON_AND_STORE(dest + 4 * stride, dc_value);
+ RECON_AND_STORE(dest + 5 * stride, dc_value);
+ RECON_AND_STORE(dest + 6 * stride, dc_value);
+ RECON_AND_STORE(dest + 7 * stride, dc_value);
+ RECON_AND_STORE(dest + 8 * stride, dc_value);
+ RECON_AND_STORE(dest + 9 * stride, dc_value);
+ RECON_AND_STORE(dest + 10 * stride, dc_value);
+ RECON_AND_STORE(dest + 11 * stride, dc_value);
+ RECON_AND_STORE(dest + 12 * stride, dc_value);
+ RECON_AND_STORE(dest + 13 * stride, dc_value);
+ RECON_AND_STORE(dest + 14 * stride, dc_value);
+ RECON_AND_STORE(dest + 15 * stride, dc_value);
+ dest += 8;
}
}
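
With rows now addressed explicitly as dest + j * stride, the destination pointer only ever steps across the two 8-pixel column halves, so the plain "dest += 8" replaces the old "dest += 8 - (stride * 16)" rewind. A toy sketch of the addressing:

#include <stdint.h>

static void fill_block16(uint8_t *dest, int stride, uint8_t v) {
  int half, j, k;
  for (half = 0; half < 2; ++half) {
    for (j = 0; j < 16; ++j) {
      uint8_t *row = dest + j * stride;  // row j of this 8-wide half
      for (k = 0; k < 8; ++k) row[k] = v;
    }
    dest += 8;  // advance right to the next 8 columns; no rewind needed
  }
}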
@@ -1530,8 +1485,8 @@ static void iadst16_8col(__m128i *in) {
const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
- const __m128i k__cospi_m16_m16 = _mm_set1_epi16(-cospi_16_64);
- const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+ const __m128i k__cospi_m16_m16 = _mm_set1_epi16((int16_t)-cospi_16_64);
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
@@ -1985,7 +1940,7 @@ static void idct16_8col(__m128i *in) {
const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
- const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
@@ -2366,7 +2321,7 @@ void vp9_iht16x16_256_add_sse2(const int16_t *input, uint8_t *dest, int stride,
void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
int stride) {
const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
- const __m128i final_rounding = _mm_set1_epi16(1<<5);
+ const __m128i final_rounding = _mm_set1_epi16(1 << 5);
const __m128i zero = _mm_setzero_si128();
const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
@@ -2405,7 +2360,7 @@ void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
// Stage2
{
const __m128i lo_1_15 = _mm_unpackhi_epi16(in[0], zero);
- const __m128i lo_13_3 = _mm_unpackhi_epi16(zero, in[1]);
+ const __m128i lo_13_3 = _mm_unpackhi_epi16(zero, in[1]);
tmp0 = _mm_madd_epi16(lo_1_15, stg2_0);
tmp2 = _mm_madd_epi16(lo_1_15, stg2_1);
@@ -2566,7 +2521,8 @@ void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
// Second 1-D inverse transform, performed per 8x16 block
for (i = 0; i < 2; i++) {
- array_transpose_4X8(l + 8*i, in);
+ int j;
+ array_transpose_4X8(l + 8 * i, in);
IDCT16_10
@@ -2588,59 +2544,14 @@ void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
in[14] = _mm_sub_epi16(stp2_1, stp1_14);
in[15] = _mm_sub_epi16(stp2_0, stp1_15);
- // Final rounding and shift
- in[0] = _mm_adds_epi16(in[0], final_rounding);
- in[1] = _mm_adds_epi16(in[1], final_rounding);
- in[2] = _mm_adds_epi16(in[2], final_rounding);
- in[3] = _mm_adds_epi16(in[3], final_rounding);
- in[4] = _mm_adds_epi16(in[4], final_rounding);
- in[5] = _mm_adds_epi16(in[5], final_rounding);
- in[6] = _mm_adds_epi16(in[6], final_rounding);
- in[7] = _mm_adds_epi16(in[7], final_rounding);
- in[8] = _mm_adds_epi16(in[8], final_rounding);
- in[9] = _mm_adds_epi16(in[9], final_rounding);
- in[10] = _mm_adds_epi16(in[10], final_rounding);
- in[11] = _mm_adds_epi16(in[11], final_rounding);
- in[12] = _mm_adds_epi16(in[12], final_rounding);
- in[13] = _mm_adds_epi16(in[13], final_rounding);
- in[14] = _mm_adds_epi16(in[14], final_rounding);
- in[15] = _mm_adds_epi16(in[15], final_rounding);
-
- in[0] = _mm_srai_epi16(in[0], 6);
- in[1] = _mm_srai_epi16(in[1], 6);
- in[2] = _mm_srai_epi16(in[2], 6);
- in[3] = _mm_srai_epi16(in[3], 6);
- in[4] = _mm_srai_epi16(in[4], 6);
- in[5] = _mm_srai_epi16(in[5], 6);
- in[6] = _mm_srai_epi16(in[6], 6);
- in[7] = _mm_srai_epi16(in[7], 6);
- in[8] = _mm_srai_epi16(in[8], 6);
- in[9] = _mm_srai_epi16(in[9], 6);
- in[10] = _mm_srai_epi16(in[10], 6);
- in[11] = _mm_srai_epi16(in[11], 6);
- in[12] = _mm_srai_epi16(in[12], 6);
- in[13] = _mm_srai_epi16(in[13], 6);
- in[14] = _mm_srai_epi16(in[14], 6);
- in[15] = _mm_srai_epi16(in[15], 6);
-
- RECON_AND_STORE(dest, in[0]);
- RECON_AND_STORE(dest, in[1]);
- RECON_AND_STORE(dest, in[2]);
- RECON_AND_STORE(dest, in[3]);
- RECON_AND_STORE(dest, in[4]);
- RECON_AND_STORE(dest, in[5]);
- RECON_AND_STORE(dest, in[6]);
- RECON_AND_STORE(dest, in[7]);
- RECON_AND_STORE(dest, in[8]);
- RECON_AND_STORE(dest, in[9]);
- RECON_AND_STORE(dest, in[10]);
- RECON_AND_STORE(dest, in[11]);
- RECON_AND_STORE(dest, in[12]);
- RECON_AND_STORE(dest, in[13]);
- RECON_AND_STORE(dest, in[14]);
- RECON_AND_STORE(dest, in[15]);
-
- dest += 8 - (stride * 16);
+ for (j = 0; j < 16; ++j) {
+ // Final rounding and shift
+ in[j] = _mm_adds_epi16(in[j], final_rounding);
+ in[j] = _mm_srai_epi16(in[j], 6);
+ RECON_AND_STORE(dest + j * stride, in[j]);
+ }
+
+ dest += 8;
}
}
@@ -3285,7 +3196,7 @@ void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
// Only upper-left 8x8 has non-zero coeff
void vp9_idct32x32_34_add_sse2(const int16_t *input, uint8_t *dest,
- int stride) {
+ int stride) {
const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
const __m128i final_rounding = _mm_set1_epi16(1<<5);
@@ -3386,9 +3297,9 @@ void vp9_idct32x32_34_add_sse2(const int16_t *input, uint8_t *dest,
LOAD_DQCOEFF(in[31], input);
array_transpose_8x8(in, in);
- array_transpose_8x8(in+8, in+8);
- array_transpose_8x8(in+16, in+16);
- array_transpose_8x8(in+24, in+24);
+ array_transpose_8x8(in + 8, in + 8);
+ array_transpose_8x8(in + 16, in + 16);
+ array_transpose_8x8(in + 24, in + 24);
IDCT32
@@ -3426,153 +3337,61 @@ void vp9_idct32x32_34_add_sse2(const int16_t *input, uint8_t *dest,
col[30] = _mm_sub_epi16(stp1_1, stp1_30);
col[31] = _mm_sub_epi16(stp1_0, stp1_31);
for (i = 0; i < 4; i++) {
- const __m128i zero = _mm_setzero_si128();
- // Transpose 32x8 block to 8x32 block
- array_transpose_8x8(col+i*8, in);
- IDCT32_34
-
- // 2_D: Calculate the results and store them to destination.
- in[0] = _mm_add_epi16(stp1_0, stp1_31);
- in[1] = _mm_add_epi16(stp1_1, stp1_30);
- in[2] = _mm_add_epi16(stp1_2, stp1_29);
- in[3] = _mm_add_epi16(stp1_3, stp1_28);
- in[4] = _mm_add_epi16(stp1_4, stp1_27);
- in[5] = _mm_add_epi16(stp1_5, stp1_26);
- in[6] = _mm_add_epi16(stp1_6, stp1_25);
- in[7] = _mm_add_epi16(stp1_7, stp1_24);
- in[8] = _mm_add_epi16(stp1_8, stp1_23);
- in[9] = _mm_add_epi16(stp1_9, stp1_22);
- in[10] = _mm_add_epi16(stp1_10, stp1_21);
- in[11] = _mm_add_epi16(stp1_11, stp1_20);
- in[12] = _mm_add_epi16(stp1_12, stp1_19);
- in[13] = _mm_add_epi16(stp1_13, stp1_18);
- in[14] = _mm_add_epi16(stp1_14, stp1_17);
- in[15] = _mm_add_epi16(stp1_15, stp1_16);
- in[16] = _mm_sub_epi16(stp1_15, stp1_16);
- in[17] = _mm_sub_epi16(stp1_14, stp1_17);
- in[18] = _mm_sub_epi16(stp1_13, stp1_18);
- in[19] = _mm_sub_epi16(stp1_12, stp1_19);
- in[20] = _mm_sub_epi16(stp1_11, stp1_20);
- in[21] = _mm_sub_epi16(stp1_10, stp1_21);
- in[22] = _mm_sub_epi16(stp1_9, stp1_22);
- in[23] = _mm_sub_epi16(stp1_8, stp1_23);
- in[24] = _mm_sub_epi16(stp1_7, stp1_24);
- in[25] = _mm_sub_epi16(stp1_6, stp1_25);
- in[26] = _mm_sub_epi16(stp1_5, stp1_26);
- in[27] = _mm_sub_epi16(stp1_4, stp1_27);
- in[28] = _mm_sub_epi16(stp1_3, stp1_28);
- in[29] = _mm_sub_epi16(stp1_2, stp1_29);
- in[30] = _mm_sub_epi16(stp1_1, stp1_30);
- in[31] = _mm_sub_epi16(stp1_0, stp1_31);
-
+ int j;
+ const __m128i zero = _mm_setzero_si128();
+ // Transpose 32x8 block to 8x32 block
+ array_transpose_8x8(col + i * 8, in);
+ IDCT32_34
+
+ // 2-D: Calculate the results and store them to the destination.
+ in[0] = _mm_add_epi16(stp1_0, stp1_31);
+ in[1] = _mm_add_epi16(stp1_1, stp1_30);
+ in[2] = _mm_add_epi16(stp1_2, stp1_29);
+ in[3] = _mm_add_epi16(stp1_3, stp1_28);
+ in[4] = _mm_add_epi16(stp1_4, stp1_27);
+ in[5] = _mm_add_epi16(stp1_5, stp1_26);
+ in[6] = _mm_add_epi16(stp1_6, stp1_25);
+ in[7] = _mm_add_epi16(stp1_7, stp1_24);
+ in[8] = _mm_add_epi16(stp1_8, stp1_23);
+ in[9] = _mm_add_epi16(stp1_9, stp1_22);
+ in[10] = _mm_add_epi16(stp1_10, stp1_21);
+ in[11] = _mm_add_epi16(stp1_11, stp1_20);
+ in[12] = _mm_add_epi16(stp1_12, stp1_19);
+ in[13] = _mm_add_epi16(stp1_13, stp1_18);
+ in[14] = _mm_add_epi16(stp1_14, stp1_17);
+ in[15] = _mm_add_epi16(stp1_15, stp1_16);
+ in[16] = _mm_sub_epi16(stp1_15, stp1_16);
+ in[17] = _mm_sub_epi16(stp1_14, stp1_17);
+ in[18] = _mm_sub_epi16(stp1_13, stp1_18);
+ in[19] = _mm_sub_epi16(stp1_12, stp1_19);
+ in[20] = _mm_sub_epi16(stp1_11, stp1_20);
+ in[21] = _mm_sub_epi16(stp1_10, stp1_21);
+ in[22] = _mm_sub_epi16(stp1_9, stp1_22);
+ in[23] = _mm_sub_epi16(stp1_8, stp1_23);
+ in[24] = _mm_sub_epi16(stp1_7, stp1_24);
+ in[25] = _mm_sub_epi16(stp1_6, stp1_25);
+ in[26] = _mm_sub_epi16(stp1_5, stp1_26);
+ in[27] = _mm_sub_epi16(stp1_4, stp1_27);
+ in[28] = _mm_sub_epi16(stp1_3, stp1_28);
+ in[29] = _mm_sub_epi16(stp1_2, stp1_29);
+ in[30] = _mm_sub_epi16(stp1_1, stp1_30);
+ in[31] = _mm_sub_epi16(stp1_0, stp1_31);
+
+ for (j = 0; j < 32; ++j) {
// Final rounding and shift
- in[0] = _mm_adds_epi16(in[0], final_rounding);
- in[1] = _mm_adds_epi16(in[1], final_rounding);
- in[2] = _mm_adds_epi16(in[2], final_rounding);
- in[3] = _mm_adds_epi16(in[3], final_rounding);
- in[4] = _mm_adds_epi16(in[4], final_rounding);
- in[5] = _mm_adds_epi16(in[5], final_rounding);
- in[6] = _mm_adds_epi16(in[6], final_rounding);
- in[7] = _mm_adds_epi16(in[7], final_rounding);
- in[8] = _mm_adds_epi16(in[8], final_rounding);
- in[9] = _mm_adds_epi16(in[9], final_rounding);
- in[10] = _mm_adds_epi16(in[10], final_rounding);
- in[11] = _mm_adds_epi16(in[11], final_rounding);
- in[12] = _mm_adds_epi16(in[12], final_rounding);
- in[13] = _mm_adds_epi16(in[13], final_rounding);
- in[14] = _mm_adds_epi16(in[14], final_rounding);
- in[15] = _mm_adds_epi16(in[15], final_rounding);
- in[16] = _mm_adds_epi16(in[16], final_rounding);
- in[17] = _mm_adds_epi16(in[17], final_rounding);
- in[18] = _mm_adds_epi16(in[18], final_rounding);
- in[19] = _mm_adds_epi16(in[19], final_rounding);
- in[20] = _mm_adds_epi16(in[20], final_rounding);
- in[21] = _mm_adds_epi16(in[21], final_rounding);
- in[22] = _mm_adds_epi16(in[22], final_rounding);
- in[23] = _mm_adds_epi16(in[23], final_rounding);
- in[24] = _mm_adds_epi16(in[24], final_rounding);
- in[25] = _mm_adds_epi16(in[25], final_rounding);
- in[26] = _mm_adds_epi16(in[26], final_rounding);
- in[27] = _mm_adds_epi16(in[27], final_rounding);
- in[28] = _mm_adds_epi16(in[28], final_rounding);
- in[29] = _mm_adds_epi16(in[29], final_rounding);
- in[30] = _mm_adds_epi16(in[30], final_rounding);
- in[31] = _mm_adds_epi16(in[31], final_rounding);
-
- in[0] = _mm_srai_epi16(in[0], 6);
- in[1] = _mm_srai_epi16(in[1], 6);
- in[2] = _mm_srai_epi16(in[2], 6);
- in[3] = _mm_srai_epi16(in[3], 6);
- in[4] = _mm_srai_epi16(in[4], 6);
- in[5] = _mm_srai_epi16(in[5], 6);
- in[6] = _mm_srai_epi16(in[6], 6);
- in[7] = _mm_srai_epi16(in[7], 6);
- in[8] = _mm_srai_epi16(in[8], 6);
- in[9] = _mm_srai_epi16(in[9], 6);
- in[10] = _mm_srai_epi16(in[10], 6);
- in[11] = _mm_srai_epi16(in[11], 6);
- in[12] = _mm_srai_epi16(in[12], 6);
- in[13] = _mm_srai_epi16(in[13], 6);
- in[14] = _mm_srai_epi16(in[14], 6);
- in[15] = _mm_srai_epi16(in[15], 6);
- in[16] = _mm_srai_epi16(in[16], 6);
- in[17] = _mm_srai_epi16(in[17], 6);
- in[18] = _mm_srai_epi16(in[18], 6);
- in[19] = _mm_srai_epi16(in[19], 6);
- in[20] = _mm_srai_epi16(in[20], 6);
- in[21] = _mm_srai_epi16(in[21], 6);
- in[22] = _mm_srai_epi16(in[22], 6);
- in[23] = _mm_srai_epi16(in[23], 6);
- in[24] = _mm_srai_epi16(in[24], 6);
- in[25] = _mm_srai_epi16(in[25], 6);
- in[26] = _mm_srai_epi16(in[26], 6);
- in[27] = _mm_srai_epi16(in[27], 6);
- in[28] = _mm_srai_epi16(in[28], 6);
- in[29] = _mm_srai_epi16(in[29], 6);
- in[30] = _mm_srai_epi16(in[30], 6);
- in[31] = _mm_srai_epi16(in[31], 6);
-
- RECON_AND_STORE(dest, in[0]);
- RECON_AND_STORE(dest, in[1]);
- RECON_AND_STORE(dest, in[2]);
- RECON_AND_STORE(dest, in[3]);
- RECON_AND_STORE(dest, in[4]);
- RECON_AND_STORE(dest, in[5]);
- RECON_AND_STORE(dest, in[6]);
- RECON_AND_STORE(dest, in[7]);
- RECON_AND_STORE(dest, in[8]);
- RECON_AND_STORE(dest, in[9]);
- RECON_AND_STORE(dest, in[10]);
- RECON_AND_STORE(dest, in[11]);
- RECON_AND_STORE(dest, in[12]);
- RECON_AND_STORE(dest, in[13]);
- RECON_AND_STORE(dest, in[14]);
- RECON_AND_STORE(dest, in[15]);
- RECON_AND_STORE(dest, in[16]);
- RECON_AND_STORE(dest, in[17]);
- RECON_AND_STORE(dest, in[18]);
- RECON_AND_STORE(dest, in[19]);
- RECON_AND_STORE(dest, in[20]);
- RECON_AND_STORE(dest, in[21]);
- RECON_AND_STORE(dest, in[22]);
- RECON_AND_STORE(dest, in[23]);
- RECON_AND_STORE(dest, in[24]);
- RECON_AND_STORE(dest, in[25]);
- RECON_AND_STORE(dest, in[26]);
- RECON_AND_STORE(dest, in[27]);
- RECON_AND_STORE(dest, in[28]);
- RECON_AND_STORE(dest, in[29]);
- RECON_AND_STORE(dest, in[30]);
- RECON_AND_STORE(dest, in[31]);
-
- dest += 8 - (stride * 32);
+ in[j] = _mm_adds_epi16(in[j], final_rounding);
+ in[j] = _mm_srai_epi16(in[j], 6);
+ RECON_AND_STORE(dest + j * stride, in[j]);
}
+
+ dest += 8;
}
+}
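
The collapsed loop above is enabled by a macro change later in this diff: RECON_AND_STORE no longer advances dest as a side effect (see the vp9_idct_intrin_sse2.h hunk below), so the row offset is passed explicitly and the old column-step arithmetic dest += 8 - (stride * 32) reduces to dest += 8. A minimal scalar sketch of the addressing equivalence, with a hypothetical store8() standing in for the 8-pixel reconstruct-and-store:

    #include <stdint.h>

    static void store8(uint8_t *dst) { (void)dst; /* stands in for RECON_AND_STORE */ }

    /* Old scheme: the macro advanced dest one row per call, so each
     * 8-wide column pass ended with a rewind of 32 rows plus a step of
     * 8 pixels. */
    static void old_addressing(uint8_t *dest, int stride) {
      int i, j;
      for (i = 0; i < 4; i++) {
        for (j = 0; j < 32; j++) {
          store8(dest);
          dest += stride;
        }
        dest += 8 - (stride * 32);
      }
    }

    /* New scheme: the row offset is explicit and dest only steps across
     * columns; both loops visit exactly the same addresses. */
    static void new_addressing(uint8_t *dest, int stride) {
      int i, j;
      for (i = 0; i < 4; i++) {
        for (j = 0; j < 32; j++)
          store8(dest + j * stride);
        dest += 8;
      }
    }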
void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest,
int stride) {
const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
- const __m128i final_rounding = _mm_set1_epi16(1<<5);
+ const __m128i final_rounding = _mm_set1_epi16(1 << 5);
const __m128i zero = _mm_setzero_si128();
// idct constants for each stage
@@ -3639,304 +3458,211 @@ void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest,
for (i = 0; i < 4; i++) {
i32 = (i << 5);
- // First 1-D idct
- // Load input data.
- LOAD_DQCOEFF(in[0], input);
- LOAD_DQCOEFF(in[8], input);
- LOAD_DQCOEFF(in[16], input);
- LOAD_DQCOEFF(in[24], input);
- LOAD_DQCOEFF(in[1], input);
- LOAD_DQCOEFF(in[9], input);
- LOAD_DQCOEFF(in[17], input);
- LOAD_DQCOEFF(in[25], input);
- LOAD_DQCOEFF(in[2], input);
- LOAD_DQCOEFF(in[10], input);
- LOAD_DQCOEFF(in[18], input);
- LOAD_DQCOEFF(in[26], input);
- LOAD_DQCOEFF(in[3], input);
- LOAD_DQCOEFF(in[11], input);
- LOAD_DQCOEFF(in[19], input);
- LOAD_DQCOEFF(in[27], input);
-
- LOAD_DQCOEFF(in[4], input);
- LOAD_DQCOEFF(in[12], input);
- LOAD_DQCOEFF(in[20], input);
- LOAD_DQCOEFF(in[28], input);
- LOAD_DQCOEFF(in[5], input);
- LOAD_DQCOEFF(in[13], input);
- LOAD_DQCOEFF(in[21], input);
- LOAD_DQCOEFF(in[29], input);
- LOAD_DQCOEFF(in[6], input);
- LOAD_DQCOEFF(in[14], input);
- LOAD_DQCOEFF(in[22], input);
- LOAD_DQCOEFF(in[30], input);
- LOAD_DQCOEFF(in[7], input);
- LOAD_DQCOEFF(in[15], input);
- LOAD_DQCOEFF(in[23], input);
- LOAD_DQCOEFF(in[31], input);
-
- // checking if all entries are zero
- zero_idx[0] = _mm_or_si128(in[0], in[1]);
- zero_idx[1] = _mm_or_si128(in[2], in[3]);
- zero_idx[2] = _mm_or_si128(in[4], in[5]);
- zero_idx[3] = _mm_or_si128(in[6], in[7]);
- zero_idx[4] = _mm_or_si128(in[8], in[9]);
- zero_idx[5] = _mm_or_si128(in[10], in[11]);
- zero_idx[6] = _mm_or_si128(in[12], in[13]);
- zero_idx[7] = _mm_or_si128(in[14], in[15]);
- zero_idx[8] = _mm_or_si128(in[16], in[17]);
- zero_idx[9] = _mm_or_si128(in[18], in[19]);
- zero_idx[10] = _mm_or_si128(in[20], in[21]);
- zero_idx[11] = _mm_or_si128(in[22], in[23]);
- zero_idx[12] = _mm_or_si128(in[24], in[25]);
- zero_idx[13] = _mm_or_si128(in[26], in[27]);
- zero_idx[14] = _mm_or_si128(in[28], in[29]);
- zero_idx[15] = _mm_or_si128(in[30], in[31]);
-
- zero_idx[0] = _mm_or_si128(zero_idx[0], zero_idx[1]);
- zero_idx[1] = _mm_or_si128(zero_idx[2], zero_idx[3]);
- zero_idx[2] = _mm_or_si128(zero_idx[4], zero_idx[5]);
- zero_idx[3] = _mm_or_si128(zero_idx[6], zero_idx[7]);
- zero_idx[4] = _mm_or_si128(zero_idx[8], zero_idx[9]);
- zero_idx[5] = _mm_or_si128(zero_idx[10], zero_idx[11]);
- zero_idx[6] = _mm_or_si128(zero_idx[12], zero_idx[13]);
- zero_idx[7] = _mm_or_si128(zero_idx[14], zero_idx[15]);
-
- zero_idx[8] = _mm_or_si128(zero_idx[0], zero_idx[1]);
- zero_idx[9] = _mm_or_si128(zero_idx[2], zero_idx[3]);
- zero_idx[10] = _mm_or_si128(zero_idx[4], zero_idx[5]);
- zero_idx[11] = _mm_or_si128(zero_idx[6], zero_idx[7]);
- zero_idx[12] = _mm_or_si128(zero_idx[8], zero_idx[9]);
- zero_idx[13] = _mm_or_si128(zero_idx[10], zero_idx[11]);
- zero_idx[14] = _mm_or_si128(zero_idx[12], zero_idx[13]);
-
- if (_mm_movemask_epi8(_mm_cmpeq_epi32(zero_idx[14], zero)) == 0xFFFF) {
- col[i32 + 0] = _mm_setzero_si128();
- col[i32 + 1] = _mm_setzero_si128();
- col[i32 + 2] = _mm_setzero_si128();
- col[i32 + 3] = _mm_setzero_si128();
- col[i32 + 4] = _mm_setzero_si128();
- col[i32 + 5] = _mm_setzero_si128();
- col[i32 + 6] = _mm_setzero_si128();
- col[i32 + 7] = _mm_setzero_si128();
- col[i32 + 8] = _mm_setzero_si128();
- col[i32 + 9] = _mm_setzero_si128();
- col[i32 + 10] = _mm_setzero_si128();
- col[i32 + 11] = _mm_setzero_si128();
- col[i32 + 12] = _mm_setzero_si128();
- col[i32 + 13] = _mm_setzero_si128();
- col[i32 + 14] = _mm_setzero_si128();
- col[i32 + 15] = _mm_setzero_si128();
- col[i32 + 16] = _mm_setzero_si128();
- col[i32 + 17] = _mm_setzero_si128();
- col[i32 + 18] = _mm_setzero_si128();
- col[i32 + 19] = _mm_setzero_si128();
- col[i32 + 20] = _mm_setzero_si128();
- col[i32 + 21] = _mm_setzero_si128();
- col[i32 + 22] = _mm_setzero_si128();
- col[i32 + 23] = _mm_setzero_si128();
- col[i32 + 24] = _mm_setzero_si128();
- col[i32 + 25] = _mm_setzero_si128();
- col[i32 + 26] = _mm_setzero_si128();
- col[i32 + 27] = _mm_setzero_si128();
- col[i32 + 28] = _mm_setzero_si128();
- col[i32 + 29] = _mm_setzero_si128();
- col[i32 + 30] = _mm_setzero_si128();
- col[i32 + 31] = _mm_setzero_si128();
- continue;
- }
-
- // Transpose 32x8 block to 8x32 block
- array_transpose_8x8(in, in);
- array_transpose_8x8(in+8, in+8);
- array_transpose_8x8(in+16, in+16);
- array_transpose_8x8(in+24, in+24);
-
- IDCT32
-
- // 1_D: Store 32 intermediate results for each 8x32 block.
- col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31);
- col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30);
- col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29);
- col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28);
- col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27);
- col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26);
- col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25);
- col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24);
- col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23);
- col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22);
- col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21);
- col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20);
- col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19);
- col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18);
- col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17);
- col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16);
- col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16);
- col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17);
- col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18);
- col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19);
- col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20);
- col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21);
- col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22);
- col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23);
- col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24);
- col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25);
- col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26);
- col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27);
- col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28);
- col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29);
- col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30);
- col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
+ // First 1-D idct
+ // Load input data.
+ LOAD_DQCOEFF(in[0], input);
+ LOAD_DQCOEFF(in[8], input);
+ LOAD_DQCOEFF(in[16], input);
+ LOAD_DQCOEFF(in[24], input);
+ LOAD_DQCOEFF(in[1], input);
+ LOAD_DQCOEFF(in[9], input);
+ LOAD_DQCOEFF(in[17], input);
+ LOAD_DQCOEFF(in[25], input);
+ LOAD_DQCOEFF(in[2], input);
+ LOAD_DQCOEFF(in[10], input);
+ LOAD_DQCOEFF(in[18], input);
+ LOAD_DQCOEFF(in[26], input);
+ LOAD_DQCOEFF(in[3], input);
+ LOAD_DQCOEFF(in[11], input);
+ LOAD_DQCOEFF(in[19], input);
+ LOAD_DQCOEFF(in[27], input);
+
+ LOAD_DQCOEFF(in[4], input);
+ LOAD_DQCOEFF(in[12], input);
+ LOAD_DQCOEFF(in[20], input);
+ LOAD_DQCOEFF(in[28], input);
+ LOAD_DQCOEFF(in[5], input);
+ LOAD_DQCOEFF(in[13], input);
+ LOAD_DQCOEFF(in[21], input);
+ LOAD_DQCOEFF(in[29], input);
+ LOAD_DQCOEFF(in[6], input);
+ LOAD_DQCOEFF(in[14], input);
+ LOAD_DQCOEFF(in[22], input);
+ LOAD_DQCOEFF(in[30], input);
+ LOAD_DQCOEFF(in[7], input);
+ LOAD_DQCOEFF(in[15], input);
+ LOAD_DQCOEFF(in[23], input);
+ LOAD_DQCOEFF(in[31], input);
+
+    // Check whether all entries are zero
+ zero_idx[0] = _mm_or_si128(in[0], in[1]);
+ zero_idx[1] = _mm_or_si128(in[2], in[3]);
+ zero_idx[2] = _mm_or_si128(in[4], in[5]);
+ zero_idx[3] = _mm_or_si128(in[6], in[7]);
+ zero_idx[4] = _mm_or_si128(in[8], in[9]);
+ zero_idx[5] = _mm_or_si128(in[10], in[11]);
+ zero_idx[6] = _mm_or_si128(in[12], in[13]);
+ zero_idx[7] = _mm_or_si128(in[14], in[15]);
+ zero_idx[8] = _mm_or_si128(in[16], in[17]);
+ zero_idx[9] = _mm_or_si128(in[18], in[19]);
+ zero_idx[10] = _mm_or_si128(in[20], in[21]);
+ zero_idx[11] = _mm_or_si128(in[22], in[23]);
+ zero_idx[12] = _mm_or_si128(in[24], in[25]);
+ zero_idx[13] = _mm_or_si128(in[26], in[27]);
+ zero_idx[14] = _mm_or_si128(in[28], in[29]);
+ zero_idx[15] = _mm_or_si128(in[30], in[31]);
+
+ zero_idx[0] = _mm_or_si128(zero_idx[0], zero_idx[1]);
+ zero_idx[1] = _mm_or_si128(zero_idx[2], zero_idx[3]);
+ zero_idx[2] = _mm_or_si128(zero_idx[4], zero_idx[5]);
+ zero_idx[3] = _mm_or_si128(zero_idx[6], zero_idx[7]);
+ zero_idx[4] = _mm_or_si128(zero_idx[8], zero_idx[9]);
+ zero_idx[5] = _mm_or_si128(zero_idx[10], zero_idx[11]);
+ zero_idx[6] = _mm_or_si128(zero_idx[12], zero_idx[13]);
+ zero_idx[7] = _mm_or_si128(zero_idx[14], zero_idx[15]);
+
+ zero_idx[8] = _mm_or_si128(zero_idx[0], zero_idx[1]);
+ zero_idx[9] = _mm_or_si128(zero_idx[2], zero_idx[3]);
+ zero_idx[10] = _mm_or_si128(zero_idx[4], zero_idx[5]);
+ zero_idx[11] = _mm_or_si128(zero_idx[6], zero_idx[7]);
+ zero_idx[12] = _mm_or_si128(zero_idx[8], zero_idx[9]);
+ zero_idx[13] = _mm_or_si128(zero_idx[10], zero_idx[11]);
+ zero_idx[14] = _mm_or_si128(zero_idx[12], zero_idx[13]);
+
+ if (_mm_movemask_epi8(_mm_cmpeq_epi32(zero_idx[14], zero)) == 0xFFFF) {
+ col[i32 + 0] = _mm_setzero_si128();
+ col[i32 + 1] = _mm_setzero_si128();
+ col[i32 + 2] = _mm_setzero_si128();
+ col[i32 + 3] = _mm_setzero_si128();
+ col[i32 + 4] = _mm_setzero_si128();
+ col[i32 + 5] = _mm_setzero_si128();
+ col[i32 + 6] = _mm_setzero_si128();
+ col[i32 + 7] = _mm_setzero_si128();
+ col[i32 + 8] = _mm_setzero_si128();
+ col[i32 + 9] = _mm_setzero_si128();
+ col[i32 + 10] = _mm_setzero_si128();
+ col[i32 + 11] = _mm_setzero_si128();
+ col[i32 + 12] = _mm_setzero_si128();
+ col[i32 + 13] = _mm_setzero_si128();
+ col[i32 + 14] = _mm_setzero_si128();
+ col[i32 + 15] = _mm_setzero_si128();
+ col[i32 + 16] = _mm_setzero_si128();
+ col[i32 + 17] = _mm_setzero_si128();
+ col[i32 + 18] = _mm_setzero_si128();
+ col[i32 + 19] = _mm_setzero_si128();
+ col[i32 + 20] = _mm_setzero_si128();
+ col[i32 + 21] = _mm_setzero_si128();
+ col[i32 + 22] = _mm_setzero_si128();
+ col[i32 + 23] = _mm_setzero_si128();
+ col[i32 + 24] = _mm_setzero_si128();
+ col[i32 + 25] = _mm_setzero_si128();
+ col[i32 + 26] = _mm_setzero_si128();
+ col[i32 + 27] = _mm_setzero_si128();
+ col[i32 + 28] = _mm_setzero_si128();
+ col[i32 + 29] = _mm_setzero_si128();
+ col[i32 + 30] = _mm_setzero_si128();
+ col[i32 + 31] = _mm_setzero_si128();
+ continue;
}
- for (i = 0; i < 4; i++) {
- // Second 1-D idct
- j = i << 3;
-
- // Transpose 32x8 block to 8x32 block
- array_transpose_8x8(col+j, in);
- array_transpose_8x8(col+j+32, in+8);
- array_transpose_8x8(col+j+64, in+16);
- array_transpose_8x8(col+j+96, in+24);
-
- IDCT32
-
- // 2_D: Calculate the results and store them to destination.
- in[0] = _mm_add_epi16(stp1_0, stp1_31);
- in[1] = _mm_add_epi16(stp1_1, stp1_30);
- in[2] = _mm_add_epi16(stp1_2, stp1_29);
- in[3] = _mm_add_epi16(stp1_3, stp1_28);
- in[4] = _mm_add_epi16(stp1_4, stp1_27);
- in[5] = _mm_add_epi16(stp1_5, stp1_26);
- in[6] = _mm_add_epi16(stp1_6, stp1_25);
- in[7] = _mm_add_epi16(stp1_7, stp1_24);
- in[8] = _mm_add_epi16(stp1_8, stp1_23);
- in[9] = _mm_add_epi16(stp1_9, stp1_22);
- in[10] = _mm_add_epi16(stp1_10, stp1_21);
- in[11] = _mm_add_epi16(stp1_11, stp1_20);
- in[12] = _mm_add_epi16(stp1_12, stp1_19);
- in[13] = _mm_add_epi16(stp1_13, stp1_18);
- in[14] = _mm_add_epi16(stp1_14, stp1_17);
- in[15] = _mm_add_epi16(stp1_15, stp1_16);
- in[16] = _mm_sub_epi16(stp1_15, stp1_16);
- in[17] = _mm_sub_epi16(stp1_14, stp1_17);
- in[18] = _mm_sub_epi16(stp1_13, stp1_18);
- in[19] = _mm_sub_epi16(stp1_12, stp1_19);
- in[20] = _mm_sub_epi16(stp1_11, stp1_20);
- in[21] = _mm_sub_epi16(stp1_10, stp1_21);
- in[22] = _mm_sub_epi16(stp1_9, stp1_22);
- in[23] = _mm_sub_epi16(stp1_8, stp1_23);
- in[24] = _mm_sub_epi16(stp1_7, stp1_24);
- in[25] = _mm_sub_epi16(stp1_6, stp1_25);
- in[26] = _mm_sub_epi16(stp1_5, stp1_26);
- in[27] = _mm_sub_epi16(stp1_4, stp1_27);
- in[28] = _mm_sub_epi16(stp1_3, stp1_28);
- in[29] = _mm_sub_epi16(stp1_2, stp1_29);
- in[30] = _mm_sub_epi16(stp1_1, stp1_30);
- in[31] = _mm_sub_epi16(stp1_0, stp1_31);
+ // Transpose 32x8 block to 8x32 block
+ array_transpose_8x8(in, in);
+ array_transpose_8x8(in + 8, in + 8);
+ array_transpose_8x8(in + 16, in + 16);
+ array_transpose_8x8(in + 24, in + 24);
+
+ IDCT32
+
+ // 1_D: Store 32 intermediate results for each 8x32 block.
+ col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31);
+ col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30);
+ col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29);
+ col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28);
+ col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27);
+ col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26);
+ col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25);
+ col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24);
+ col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23);
+ col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22);
+ col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21);
+ col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20);
+ col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19);
+ col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18);
+ col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17);
+ col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16);
+ col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16);
+ col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17);
+ col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18);
+ col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19);
+ col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20);
+ col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21);
+ col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22);
+ col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23);
+ col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24);
+ col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25);
+ col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26);
+ col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27);
+ col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28);
+ col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29);
+ col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30);
+ col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
+ }
+ for (i = 0; i < 4; i++) {
+ // Second 1-D idct
+ j = i << 3;
+
+ // Transpose 32x8 block to 8x32 block
+ array_transpose_8x8(col + j, in);
+ array_transpose_8x8(col + j + 32, in + 8);
+ array_transpose_8x8(col + j + 64, in + 16);
+ array_transpose_8x8(col + j + 96, in + 24);
+
+ IDCT32
+
+    // 2_D: Calculate the results and store them to the destination.
+ in[0] = _mm_add_epi16(stp1_0, stp1_31);
+ in[1] = _mm_add_epi16(stp1_1, stp1_30);
+ in[2] = _mm_add_epi16(stp1_2, stp1_29);
+ in[3] = _mm_add_epi16(stp1_3, stp1_28);
+ in[4] = _mm_add_epi16(stp1_4, stp1_27);
+ in[5] = _mm_add_epi16(stp1_5, stp1_26);
+ in[6] = _mm_add_epi16(stp1_6, stp1_25);
+ in[7] = _mm_add_epi16(stp1_7, stp1_24);
+ in[8] = _mm_add_epi16(stp1_8, stp1_23);
+ in[9] = _mm_add_epi16(stp1_9, stp1_22);
+ in[10] = _mm_add_epi16(stp1_10, stp1_21);
+ in[11] = _mm_add_epi16(stp1_11, stp1_20);
+ in[12] = _mm_add_epi16(stp1_12, stp1_19);
+ in[13] = _mm_add_epi16(stp1_13, stp1_18);
+ in[14] = _mm_add_epi16(stp1_14, stp1_17);
+ in[15] = _mm_add_epi16(stp1_15, stp1_16);
+ in[16] = _mm_sub_epi16(stp1_15, stp1_16);
+ in[17] = _mm_sub_epi16(stp1_14, stp1_17);
+ in[18] = _mm_sub_epi16(stp1_13, stp1_18);
+ in[19] = _mm_sub_epi16(stp1_12, stp1_19);
+ in[20] = _mm_sub_epi16(stp1_11, stp1_20);
+ in[21] = _mm_sub_epi16(stp1_10, stp1_21);
+ in[22] = _mm_sub_epi16(stp1_9, stp1_22);
+ in[23] = _mm_sub_epi16(stp1_8, stp1_23);
+ in[24] = _mm_sub_epi16(stp1_7, stp1_24);
+ in[25] = _mm_sub_epi16(stp1_6, stp1_25);
+ in[26] = _mm_sub_epi16(stp1_5, stp1_26);
+ in[27] = _mm_sub_epi16(stp1_4, stp1_27);
+ in[28] = _mm_sub_epi16(stp1_3, stp1_28);
+ in[29] = _mm_sub_epi16(stp1_2, stp1_29);
+ in[30] = _mm_sub_epi16(stp1_1, stp1_30);
+ in[31] = _mm_sub_epi16(stp1_0, stp1_31);
+
+ for (j = 0; j < 32; ++j) {
// Final rounding and shift
- in[0] = _mm_adds_epi16(in[0], final_rounding);
- in[1] = _mm_adds_epi16(in[1], final_rounding);
- in[2] = _mm_adds_epi16(in[2], final_rounding);
- in[3] = _mm_adds_epi16(in[3], final_rounding);
- in[4] = _mm_adds_epi16(in[4], final_rounding);
- in[5] = _mm_adds_epi16(in[5], final_rounding);
- in[6] = _mm_adds_epi16(in[6], final_rounding);
- in[7] = _mm_adds_epi16(in[7], final_rounding);
- in[8] = _mm_adds_epi16(in[8], final_rounding);
- in[9] = _mm_adds_epi16(in[9], final_rounding);
- in[10] = _mm_adds_epi16(in[10], final_rounding);
- in[11] = _mm_adds_epi16(in[11], final_rounding);
- in[12] = _mm_adds_epi16(in[12], final_rounding);
- in[13] = _mm_adds_epi16(in[13], final_rounding);
- in[14] = _mm_adds_epi16(in[14], final_rounding);
- in[15] = _mm_adds_epi16(in[15], final_rounding);
- in[16] = _mm_adds_epi16(in[16], final_rounding);
- in[17] = _mm_adds_epi16(in[17], final_rounding);
- in[18] = _mm_adds_epi16(in[18], final_rounding);
- in[19] = _mm_adds_epi16(in[19], final_rounding);
- in[20] = _mm_adds_epi16(in[20], final_rounding);
- in[21] = _mm_adds_epi16(in[21], final_rounding);
- in[22] = _mm_adds_epi16(in[22], final_rounding);
- in[23] = _mm_adds_epi16(in[23], final_rounding);
- in[24] = _mm_adds_epi16(in[24], final_rounding);
- in[25] = _mm_adds_epi16(in[25], final_rounding);
- in[26] = _mm_adds_epi16(in[26], final_rounding);
- in[27] = _mm_adds_epi16(in[27], final_rounding);
- in[28] = _mm_adds_epi16(in[28], final_rounding);
- in[29] = _mm_adds_epi16(in[29], final_rounding);
- in[30] = _mm_adds_epi16(in[30], final_rounding);
- in[31] = _mm_adds_epi16(in[31], final_rounding);
-
- in[0] = _mm_srai_epi16(in[0], 6);
- in[1] = _mm_srai_epi16(in[1], 6);
- in[2] = _mm_srai_epi16(in[2], 6);
- in[3] = _mm_srai_epi16(in[3], 6);
- in[4] = _mm_srai_epi16(in[4], 6);
- in[5] = _mm_srai_epi16(in[5], 6);
- in[6] = _mm_srai_epi16(in[6], 6);
- in[7] = _mm_srai_epi16(in[7], 6);
- in[8] = _mm_srai_epi16(in[8], 6);
- in[9] = _mm_srai_epi16(in[9], 6);
- in[10] = _mm_srai_epi16(in[10], 6);
- in[11] = _mm_srai_epi16(in[11], 6);
- in[12] = _mm_srai_epi16(in[12], 6);
- in[13] = _mm_srai_epi16(in[13], 6);
- in[14] = _mm_srai_epi16(in[14], 6);
- in[15] = _mm_srai_epi16(in[15], 6);
- in[16] = _mm_srai_epi16(in[16], 6);
- in[17] = _mm_srai_epi16(in[17], 6);
- in[18] = _mm_srai_epi16(in[18], 6);
- in[19] = _mm_srai_epi16(in[19], 6);
- in[20] = _mm_srai_epi16(in[20], 6);
- in[21] = _mm_srai_epi16(in[21], 6);
- in[22] = _mm_srai_epi16(in[22], 6);
- in[23] = _mm_srai_epi16(in[23], 6);
- in[24] = _mm_srai_epi16(in[24], 6);
- in[25] = _mm_srai_epi16(in[25], 6);
- in[26] = _mm_srai_epi16(in[26], 6);
- in[27] = _mm_srai_epi16(in[27], 6);
- in[28] = _mm_srai_epi16(in[28], 6);
- in[29] = _mm_srai_epi16(in[29], 6);
- in[30] = _mm_srai_epi16(in[30], 6);
- in[31] = _mm_srai_epi16(in[31], 6);
-
- RECON_AND_STORE(dest, in[0]);
- RECON_AND_STORE(dest, in[1]);
- RECON_AND_STORE(dest, in[2]);
- RECON_AND_STORE(dest, in[3]);
- RECON_AND_STORE(dest, in[4]);
- RECON_AND_STORE(dest, in[5]);
- RECON_AND_STORE(dest, in[6]);
- RECON_AND_STORE(dest, in[7]);
- RECON_AND_STORE(dest, in[8]);
- RECON_AND_STORE(dest, in[9]);
- RECON_AND_STORE(dest, in[10]);
- RECON_AND_STORE(dest, in[11]);
- RECON_AND_STORE(dest, in[12]);
- RECON_AND_STORE(dest, in[13]);
- RECON_AND_STORE(dest, in[14]);
- RECON_AND_STORE(dest, in[15]);
- RECON_AND_STORE(dest, in[16]);
- RECON_AND_STORE(dest, in[17]);
- RECON_AND_STORE(dest, in[18]);
- RECON_AND_STORE(dest, in[19]);
- RECON_AND_STORE(dest, in[20]);
- RECON_AND_STORE(dest, in[21]);
- RECON_AND_STORE(dest, in[22]);
- RECON_AND_STORE(dest, in[23]);
- RECON_AND_STORE(dest, in[24]);
- RECON_AND_STORE(dest, in[25]);
- RECON_AND_STORE(dest, in[26]);
- RECON_AND_STORE(dest, in[27]);
- RECON_AND_STORE(dest, in[28]);
- RECON_AND_STORE(dest, in[29]);
- RECON_AND_STORE(dest, in[30]);
- RECON_AND_STORE(dest, in[31]);
-
- dest += 8 - (stride * 32);
+ in[j] = _mm_adds_epi16(in[j], final_rounding);
+ in[j] = _mm_srai_epi16(in[j], 6);
+ RECON_AND_STORE(dest + j * stride, in[j]);
}
-} //NOLINT
+
+ dest += 8;
+ }
+}
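
The early-out in the first pass folds all 32 input vectors into a single vector with a binary OR tree (zero_idx), then tests that vector against zero with one compare and movemask. Reduced to scalars, the predicate is just:

    #include <stdint.h>

    /* Sketch: scalar form of the zero_idx reduction. The SSE2 version
     * ORs 8 lanes at a time in a log2-depth tree and accepts the slice
     * as all-zero when _mm_movemask_epi8 of the equality mask is 0xFFFF. */
    static int slice_is_zero(const int16_t *coeffs, int n) {
      int16_t acc = 0;
      int i;
      for (i = 0; i < n; i++)
        acc |= coeffs[i];
      return acc == 0;
    }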
void vp9_idct32x32_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
__m128i dc_value;
@@ -3950,38 +3676,580 @@ void vp9_idct32x32_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
dc_value = _mm_set1_epi16(a);
for (i = 0; i < 4; ++i) {
- RECON_AND_STORE(dest, dc_value);
- RECON_AND_STORE(dest, dc_value);
- RECON_AND_STORE(dest, dc_value);
- RECON_AND_STORE(dest, dc_value);
- RECON_AND_STORE(dest, dc_value);
- RECON_AND_STORE(dest, dc_value);
- RECON_AND_STORE(dest, dc_value);
- RECON_AND_STORE(dest, dc_value);
- RECON_AND_STORE(dest, dc_value);
- RECON_AND_STORE(dest, dc_value);
- RECON_AND_STORE(dest, dc_value);
- RECON_AND_STORE(dest, dc_value);
- RECON_AND_STORE(dest, dc_value);
- RECON_AND_STORE(dest, dc_value);
- RECON_AND_STORE(dest, dc_value);
- RECON_AND_STORE(dest, dc_value);
- RECON_AND_STORE(dest, dc_value);
- RECON_AND_STORE(dest, dc_value);
- RECON_AND_STORE(dest, dc_value);
- RECON_AND_STORE(dest, dc_value);
- RECON_AND_STORE(dest, dc_value);
- RECON_AND_STORE(dest, dc_value);
- RECON_AND_STORE(dest, dc_value);
- RECON_AND_STORE(dest, dc_value);
- RECON_AND_STORE(dest, dc_value);
- RECON_AND_STORE(dest, dc_value);
- RECON_AND_STORE(dest, dc_value);
- RECON_AND_STORE(dest, dc_value);
- RECON_AND_STORE(dest, dc_value);
- RECON_AND_STORE(dest, dc_value);
- RECON_AND_STORE(dest, dc_value);
- RECON_AND_STORE(dest, dc_value);
- dest += 8 - (stride * 32);
+ int j;
+ for (j = 0; j < 32; ++j) {
+ RECON_AND_STORE(dest + j * stride, dc_value);
+ }
+ dest += 8;
+ }
+}
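
vp9_idct32x32_1_add_sse2 handles the DC-only case: a single value a, derived from input[0] in the lines elided by the hunk header, is broadcast and added to every pixel of the 32x32 block. A scalar sketch, assuming clip255() saturates the same way _mm_packus_epi16 does inside RECON_AND_STORE:

    #include <stdint.h>

    static uint8_t clip255(int v) {
      return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }

    /* Sketch: what the vectorised DC-only path computes per pixel. */
    static void dc_only_add_32x32(uint8_t *dest, int stride, int a) {
      int r, c;
      for (r = 0; r < 32; r++)
        for (c = 0; c < 32; c++)
          dest[r * stride + c] = clip255(dest[r * stride + c] + a);
    }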
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static INLINE __m128i clamp_high_sse2(__m128i value, int bd) {
+ __m128i ubounded, retval;
+ const __m128i zero = _mm_set1_epi16(0);
+ const __m128i one = _mm_set1_epi16(1);
+ const __m128i max = _mm_subs_epi16(_mm_slli_epi16(one, bd), one);
+ ubounded = _mm_cmpgt_epi16(value, max);
+ retval = _mm_andnot_si128(ubounded, value);
+ ubounded = _mm_and_si128(ubounded, max);
+ retval = _mm_or_si128(retval, ubounded);
+ retval = _mm_and_si128(retval, _mm_cmpgt_epi16(retval, zero));
+ return retval;
+}
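
clamp_high_sse2 saturates each 16-bit lane to the legal range of a bd-bit pixel: lanes above (1 << bd) - 1 are replaced by that maximum through the andnot/and/or select, and the final AND against cmpgt(retval, zero) zeroes every non-positive lane. Per lane it computes:

    /* Sketch: scalar equivalent of clamp_high_sse2 for one lane. */
    static short clamp_high_scalar(short value, int bd) {
      const short max = (short)((1 << bd) - 1);
      if (value > max) value = max;
      if (value < 0) value = 0;  /* cmpgt is strict, but 0 stays 0 either way */
      return value;
    }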
+
+void vp9_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest8,
+ int stride, int bd) {
+ tran_low_t out[4 * 4];
+ tran_low_t *outptr = out;
+ int i, j;
+ __m128i inptr[4];
+ __m128i sign_bits[2];
+ __m128i temp_mm, min_input, max_input;
+ int test;
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+ int optimised_cols = 0;
+ const __m128i zero = _mm_set1_epi16(0);
+ const __m128i eight = _mm_set1_epi16(8);
+ const __m128i max = _mm_set1_epi16(12043);
+ const __m128i min = _mm_set1_epi16(-12043);
+ // Load input into __m128i
+ inptr[0] = _mm_loadu_si128((const __m128i *)input);
+ inptr[1] = _mm_loadu_si128((const __m128i *)(input + 4));
+ inptr[2] = _mm_loadu_si128((const __m128i *)(input + 8));
+ inptr[3] = _mm_loadu_si128((const __m128i *)(input + 12));
+
+ // Pack to 16 bits
+ inptr[0] = _mm_packs_epi32(inptr[0], inptr[1]);
+ inptr[1] = _mm_packs_epi32(inptr[2], inptr[3]);
+
+ max_input = _mm_max_epi16(inptr[0], inptr[1]);
+ min_input = _mm_min_epi16(inptr[0], inptr[1]);
+ max_input = _mm_cmpgt_epi16(max_input, max);
+ min_input = _mm_cmplt_epi16(min_input, min);
+ temp_mm = _mm_or_si128(max_input, min_input);
+ test = _mm_movemask_epi8(temp_mm);
+
+ if (!test) {
+ // Do the row transform
+ idct4_sse2(inptr);
+
+ // Check the min & max values
+ max_input = _mm_max_epi16(inptr[0], inptr[1]);
+ min_input = _mm_min_epi16(inptr[0], inptr[1]);
+ max_input = _mm_cmpgt_epi16(max_input, max);
+ min_input = _mm_cmplt_epi16(min_input, min);
+ temp_mm = _mm_or_si128(max_input, min_input);
+ test = _mm_movemask_epi8(temp_mm);
+
+ if (test) {
+ transpose_4x4(inptr);
+ sign_bits[0] = _mm_cmplt_epi16(inptr[0], zero);
+ sign_bits[1] = _mm_cmplt_epi16(inptr[1], zero);
+ inptr[3] = _mm_unpackhi_epi16(inptr[1], sign_bits[1]);
+ inptr[2] = _mm_unpacklo_epi16(inptr[1], sign_bits[1]);
+ inptr[1] = _mm_unpackhi_epi16(inptr[0], sign_bits[0]);
+ inptr[0] = _mm_unpacklo_epi16(inptr[0], sign_bits[0]);
+ _mm_storeu_si128((__m128i *)outptr, inptr[0]);
+ _mm_storeu_si128((__m128i *)(outptr + 4), inptr[1]);
+ _mm_storeu_si128((__m128i *)(outptr + 8), inptr[2]);
+ _mm_storeu_si128((__m128i *)(outptr + 12), inptr[3]);
+ } else {
+ // Set to use the optimised transform for the column
+ optimised_cols = 1;
+ }
+ } else {
+ // Run the un-optimised row transform
+ for (i = 0; i < 4; ++i) {
+ vp9_highbd_idct4(input, outptr, bd);
+ input += 4;
+ outptr += 4;
+ }
+ }
+
+ if (optimised_cols) {
+ idct4_sse2(inptr);
+
+ // Final round and shift
+ inptr[0] = _mm_add_epi16(inptr[0], eight);
+ inptr[1] = _mm_add_epi16(inptr[1], eight);
+
+ inptr[0] = _mm_srai_epi16(inptr[0], 4);
+ inptr[1] = _mm_srai_epi16(inptr[1], 4);
+
+ // Reconstruction and Store
+ {
+ __m128i d0 = _mm_loadl_epi64((const __m128i *)dest);
+ __m128i d2 = _mm_loadl_epi64((const __m128i *)(dest + stride * 2));
+ d0 = _mm_unpacklo_epi64(
+ d0, _mm_loadl_epi64((const __m128i *)(dest + stride)));
+ d2 = _mm_unpacklo_epi64(
+ d2, _mm_loadl_epi64((const __m128i *)(dest + stride * 3)));
+ d0 = clamp_high_sse2(_mm_adds_epi16(d0, inptr[0]), bd);
+ d2 = clamp_high_sse2(_mm_adds_epi16(d2, inptr[1]), bd);
+ // store input0
+ _mm_storel_epi64((__m128i *)dest, d0);
+ // store input1
+ d0 = _mm_srli_si128(d0, 8);
+ _mm_storel_epi64((__m128i *)(dest + stride), d0);
+ // store input2
+ _mm_storel_epi64((__m128i *)(dest + stride * 2), d2);
+ // store input3
+ d2 = _mm_srli_si128(d2, 8);
+ _mm_storel_epi64((__m128i *)(dest + stride * 3), d2);
+ }
+ } else {
+ // Run the un-optimised column transform
+ tran_low_t temp_in[4], temp_out[4];
+ // Columns
+ for (i = 0; i < 4; ++i) {
+ for (j = 0; j < 4; ++j)
+ temp_in[j] = out[j * 4 + i];
+ vp9_highbd_idct4(temp_in, temp_out, bd);
+ for (j = 0; j < 4; ++j) {
+ dest[j * stride + i] = highbd_clip_pixel_add(
+ dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
+ }
+ }
+ }
+}
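
This function establishes the pattern all the high-bitdepth paths below follow: pack the 32-bit tran_low_t coefficients to 16 bits, take the SSE2 transform only while every coefficient's magnitude stays within a bound (12043 here; 6201 and 3155 for the 8x8 and 16x16 variants, presumably scaled to each transform's worst-case intermediate growth), and otherwise fall back to the C transform. The same gate repeats between the row and column passes. A scalar sketch of the predicate, with the bound as a parameter:

    #include <stdint.h>

    /* Sketch: the range gate applied before each 16-bit transform pass.
     * The SSE2 code computes the same predicate with _mm_max_epi16 /
     * _mm_min_epi16 folds, _mm_cmpgt/_mm_cmplt against +/-bound, and a
     * single _mm_movemask_epi8. */
    static int fits_16bit_path(const int16_t *coeffs, int n, int bound) {
      int i;
      for (i = 0; i < n; i++)
        if (coeffs[i] > bound || coeffs[i] < -bound)
          return 0;  /* an intermediate could overflow int16: take the C path */
      return 1;
    }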
+
+void vp9_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest8,
+ int stride, int bd) {
+ tran_low_t out[8 * 8];
+ tran_low_t *outptr = out;
+ int i, j, test;
+ __m128i inptr[8];
+ __m128i min_input, max_input, temp1, temp2, sign_bits;
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+ const __m128i zero = _mm_set1_epi16(0);
+ const __m128i sixteen = _mm_set1_epi16(16);
+ const __m128i max = _mm_set1_epi16(6201);
+ const __m128i min = _mm_set1_epi16(-6201);
+ int optimised_cols = 0;
+
+ // Load input into __m128i & pack to 16 bits
+ for (i = 0; i < 8; i++) {
+ temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i));
+ temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4));
+ inptr[i] = _mm_packs_epi32(temp1, temp2);
+ }
+
+ // Find the min & max for the row transform
+ max_input = _mm_max_epi16(inptr[0], inptr[1]);
+ min_input = _mm_min_epi16(inptr[0], inptr[1]);
+ for (i = 2; i < 8; i++) {
+ max_input = _mm_max_epi16(max_input, inptr[i]);
+ min_input = _mm_min_epi16(min_input, inptr[i]);
+ }
+ max_input = _mm_cmpgt_epi16(max_input, max);
+ min_input = _mm_cmplt_epi16(min_input, min);
+ temp1 = _mm_or_si128(max_input, min_input);
+ test = _mm_movemask_epi8(temp1);
+
+ if (!test) {
+ // Do the row transform
+ idct8_sse2(inptr);
+
+ // Find the min & max for the column transform
+ max_input = _mm_max_epi16(inptr[0], inptr[1]);
+ min_input = _mm_min_epi16(inptr[0], inptr[1]);
+ for (i = 2; i < 8; i++) {
+ max_input = _mm_max_epi16(max_input, inptr[i]);
+ min_input = _mm_min_epi16(min_input, inptr[i]);
+ }
+ max_input = _mm_cmpgt_epi16(max_input, max);
+ min_input = _mm_cmplt_epi16(min_input, min);
+ temp1 = _mm_or_si128(max_input, min_input);
+ test = _mm_movemask_epi8(temp1);
+
+ if (test) {
+ array_transpose_8x8(inptr, inptr);
+ for (i = 0; i < 8; i++) {
+ sign_bits = _mm_cmplt_epi16(inptr[i], zero);
+ temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits);
+ temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits);
+ _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1);
+ _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2);
+ }
+ } else {
+ // Set to use the optimised transform for the column
+ optimised_cols = 1;
+ }
+ } else {
+ // Run the un-optimised row transform
+ for (i = 0; i < 8; ++i) {
+ vp9_highbd_idct8(input, outptr, bd);
+ input += 8;
+ outptr += 8;
+ }
+ }
+
+ if (optimised_cols) {
+ idct8_sse2(inptr);
+
+    // Final round & shift, reconstruction, and store
+ {
+ __m128i d[8];
+ for (i = 0; i < 8; i++) {
+ inptr[i] = _mm_add_epi16(inptr[i], sixteen);
+ d[i] = _mm_loadu_si128((const __m128i *)(dest + stride*i));
+ inptr[i] = _mm_srai_epi16(inptr[i], 5);
+ d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd);
+ // Store
+ _mm_storeu_si128((__m128i *)(dest + stride*i), d[i]);
+ }
+ }
+ } else {
+ // Run the un-optimised column transform
+ tran_low_t temp_in[8], temp_out[8];
+ for (i = 0; i < 8; ++i) {
+ for (j = 0; j < 8; ++j)
+ temp_in[j] = out[j * 8 + i];
+ vp9_highbd_idct8(temp_in, temp_out, bd);
+ for (j = 0; j < 8; ++j) {
+ dest[j * stride + i] = highbd_clip_pixel_add(
+ dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
+ }
+ }
+ }
+}
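
The un-optimised column passes all round, shift, and clip with two helpers defined elsewhere in libvpx; the sketch below shows how they read, with the exact signatures taken as assumptions since they are not part of this diff:

    #include <stdint.h>

    /* ROUND_POWER_OF_TWO(v, n) is round-half-up division by 2^n. */
    #define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))

    /* Sketch of highbd_clip_pixel_add: add the residual to the predicted
     * pixel and clip into [0, (1 << bd) - 1]. */
    static uint16_t highbd_clip_pixel_add_sketch(uint16_t dest, int32_t trans,
                                                 int bd) {
      const int32_t v = (int32_t)dest + trans;
      const int32_t max = (1 << bd) - 1;
      return (uint16_t)(v < 0 ? 0 : (v > max ? max : v));
    }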
+
+void vp9_highbd_idct8x8_10_add_sse2(const tran_low_t *input, uint8_t *dest8,
+ int stride, int bd) {
+ tran_low_t out[8 * 8] = { 0 };
+ tran_low_t *outptr = out;
+ int i, j, test;
+ __m128i inptr[8];
+ __m128i min_input, max_input, temp1, temp2, sign_bits;
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+ const __m128i zero = _mm_set1_epi16(0);
+ const __m128i sixteen = _mm_set1_epi16(16);
+ const __m128i max = _mm_set1_epi16(6201);
+ const __m128i min = _mm_set1_epi16(-6201);
+ int optimised_cols = 0;
+
+ // Load input into __m128i & pack to 16 bits
+ for (i = 0; i < 8; i++) {
+ temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i));
+ temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4));
+ inptr[i] = _mm_packs_epi32(temp1, temp2);
+ }
+
+ // Find the min & max for the row transform
+  // Only the first 4 rows have non-zero coeffs
+ max_input = _mm_max_epi16(inptr[0], inptr[1]);
+ min_input = _mm_min_epi16(inptr[0], inptr[1]);
+ for (i = 2; i < 4; i++) {
+ max_input = _mm_max_epi16(max_input, inptr[i]);
+ min_input = _mm_min_epi16(min_input, inptr[i]);
+ }
+ max_input = _mm_cmpgt_epi16(max_input, max);
+ min_input = _mm_cmplt_epi16(min_input, min);
+ temp1 = _mm_or_si128(max_input, min_input);
+ test = _mm_movemask_epi8(temp1);
+
+ if (!test) {
+ // Do the row transform
+ idct8_sse2(inptr);
+
+ // Find the min & max for the column transform
+    // N.B. Only the first 4 cols contain non-zero coeffs
+ max_input = _mm_max_epi16(inptr[0], inptr[1]);
+ min_input = _mm_min_epi16(inptr[0], inptr[1]);
+ for (i = 2; i < 8; i++) {
+ max_input = _mm_max_epi16(max_input, inptr[i]);
+ min_input = _mm_min_epi16(min_input, inptr[i]);
+ }
+ max_input = _mm_cmpgt_epi16(max_input, max);
+ min_input = _mm_cmplt_epi16(min_input, min);
+ temp1 = _mm_or_si128(max_input, min_input);
+ test = _mm_movemask_epi8(temp1);
+
+ if (test) {
+      // Use the fact that only the first 4 rows contain non-zero coeffs
+ array_transpose_4X8(inptr, inptr);
+ for (i = 0; i < 4; i++) {
+ sign_bits = _mm_cmplt_epi16(inptr[i], zero);
+ temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits);
+ temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits);
+ _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1);
+ _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2);
+ }
+ } else {
+ // Set to use the optimised transform for the column
+ optimised_cols = 1;
+ }
+ } else {
+ // Run the un-optimised row transform
+ for (i = 0; i < 4; ++i) {
+ vp9_highbd_idct8(input, outptr, bd);
+ input += 8;
+ outptr += 8;
+ }
+ }
+
+ if (optimised_cols) {
+ idct8_sse2(inptr);
+
+    // Final round & shift, reconstruction, and store
+ {
+ __m128i d[8];
+ for (i = 0; i < 8; i++) {
+ inptr[i] = _mm_add_epi16(inptr[i], sixteen);
+ d[i] = _mm_loadu_si128((const __m128i *)(dest + stride*i));
+ inptr[i] = _mm_srai_epi16(inptr[i], 5);
+ d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd);
+ // Store
+ _mm_storeu_si128((__m128i *)(dest + stride*i), d[i]);
+ }
+ }
+ } else {
+ // Run the un-optimised column transform
+ tran_low_t temp_in[8], temp_out[8];
+ for (i = 0; i < 8; ++i) {
+ for (j = 0; j < 8; ++j)
+ temp_in[j] = out[j * 8 + i];
+ vp9_highbd_idct8(temp_in, temp_out, bd);
+ for (j = 0; j < 8; ++j) {
+ dest[j * stride + i] = highbd_clip_pixel_add(
+ dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
+ }
+ }
+ }
+}
+
+void vp9_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest8,
+ int stride, int bd) {
+ tran_low_t out[16 * 16];
+ tran_low_t *outptr = out;
+ int i, j, test;
+ __m128i inptr[32];
+ __m128i min_input, max_input, temp1, temp2, sign_bits;
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+ const __m128i zero = _mm_set1_epi16(0);
+ const __m128i rounding = _mm_set1_epi16(32);
+ const __m128i max = _mm_set1_epi16(3155);
+ const __m128i min = _mm_set1_epi16(-3155);
+ int optimised_cols = 0;
+
+ // Load input into __m128i & pack to 16 bits
+ for (i = 0; i < 16; i++) {
+ temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i));
+ temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4));
+ inptr[i] = _mm_packs_epi32(temp1, temp2);
+ temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8));
+ temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12));
+ inptr[i + 16] = _mm_packs_epi32(temp1, temp2);
+ }
+
+ // Find the min & max for the row transform
+ max_input = _mm_max_epi16(inptr[0], inptr[1]);
+ min_input = _mm_min_epi16(inptr[0], inptr[1]);
+ for (i = 2; i < 32; i++) {
+ max_input = _mm_max_epi16(max_input, inptr[i]);
+ min_input = _mm_min_epi16(min_input, inptr[i]);
+ }
+ max_input = _mm_cmpgt_epi16(max_input, max);
+ min_input = _mm_cmplt_epi16(min_input, min);
+ temp1 = _mm_or_si128(max_input, min_input);
+ test = _mm_movemask_epi8(temp1);
+
+ if (!test) {
+ // Do the row transform
+ idct16_sse2(inptr, inptr + 16);
+
+ // Find the min & max for the column transform
+ max_input = _mm_max_epi16(inptr[0], inptr[1]);
+ min_input = _mm_min_epi16(inptr[0], inptr[1]);
+ for (i = 2; i < 32; i++) {
+ max_input = _mm_max_epi16(max_input, inptr[i]);
+ min_input = _mm_min_epi16(min_input, inptr[i]);
+ }
+ max_input = _mm_cmpgt_epi16(max_input, max);
+ min_input = _mm_cmplt_epi16(min_input, min);
+ temp1 = _mm_or_si128(max_input, min_input);
+ test = _mm_movemask_epi8(temp1);
+
+ if (test) {
+ array_transpose_16x16(inptr, inptr + 16);
+ for (i = 0; i < 16; i++) {
+ sign_bits = _mm_cmplt_epi16(inptr[i], zero);
+ temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits);
+ temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits);
+ _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1);
+ _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2);
+ sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero);
+ temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits);
+ temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits);
+ _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1);
+ _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2);
+ }
+ } else {
+ // Set to use the optimised transform for the column
+ optimised_cols = 1;
+ }
+ } else {
+ // Run the un-optimised row transform
+ for (i = 0; i < 16; ++i) {
+ vp9_highbd_idct16(input, outptr, bd);
+ input += 16;
+ outptr += 16;
+ }
+ }
+
+ if (optimised_cols) {
+ idct16_sse2(inptr, inptr + 16);
+
+    // Final round & shift, reconstruction, and store
+ {
+ __m128i d[2];
+ for (i = 0; i < 16; i++) {
+ inptr[i ] = _mm_add_epi16(inptr[i ], rounding);
+ inptr[i+16] = _mm_add_epi16(inptr[i+16], rounding);
+ d[0] = _mm_loadu_si128((const __m128i *)(dest + stride*i));
+ d[1] = _mm_loadu_si128((const __m128i *)(dest + stride*i + 8));
+ inptr[i ] = _mm_srai_epi16(inptr[i ], 6);
+ inptr[i+16] = _mm_srai_epi16(inptr[i+16], 6);
+ d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i ]), bd);
+ d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i+16]), bd);
+ // Store
+ _mm_storeu_si128((__m128i *)(dest + stride*i), d[0]);
+ _mm_storeu_si128((__m128i *)(dest + stride*i + 8), d[1]);
+ }
+ }
+ } else {
+ // Run the un-optimised column transform
+ tran_low_t temp_in[16], temp_out[16];
+ for (i = 0; i < 16; ++i) {
+ for (j = 0; j < 16; ++j)
+ temp_in[j] = out[j * 16 + i];
+ vp9_highbd_idct16(temp_in, temp_out, bd);
+ for (j = 0; j < 16; ++j) {
+ dest[j * stride + i] = highbd_clip_pixel_add(
+ dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
+ }
+ }
}
}
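
Both 16x16 paths load each coefficient row as two halves: inptr[i] carries columns 0-7 of row i and inptr[i + 16] carries columns 8-15, which is why the final store loop indexes i and i+16 in lockstep. A small indexing sketch of that layout:

    /* Sketch: where coefficient (r, c) of the 16x16 block lands after
     * the pack loop. */
    static int vec_of(int r, int c) { return r + 16 * (c >> 3); }
    static int lane_of(int c) { return c & 7; }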
+
+void vp9_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest8,
+ int stride, int bd) {
+ tran_low_t out[16 * 16] = { 0 };
+ tran_low_t *outptr = out;
+ int i, j, test;
+ __m128i inptr[32];
+ __m128i min_input, max_input, temp1, temp2, sign_bits;
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+ const __m128i zero = _mm_set1_epi16(0);
+ const __m128i rounding = _mm_set1_epi16(32);
+ const __m128i max = _mm_set1_epi16(3155);
+ const __m128i min = _mm_set1_epi16(-3155);
+ int optimised_cols = 0;
+
+ // Load input into __m128i & pack to 16 bits
+ for (i = 0; i < 16; i++) {
+ temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i));
+ temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4));
+ inptr[i] = _mm_packs_epi32(temp1, temp2);
+ temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8));
+ temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12));
+ inptr[i + 16] = _mm_packs_epi32(temp1, temp2);
+ }
+
+ // Find the min & max for the row transform
+  // Since all non-zero dct coefficients are in the upper-left 4x4 area,
+  // we only need to consider the first 4 rows here.
+ max_input = _mm_max_epi16(inptr[0], inptr[1]);
+ min_input = _mm_min_epi16(inptr[0], inptr[1]);
+ for (i = 2; i < 4; i++) {
+ max_input = _mm_max_epi16(max_input, inptr[i]);
+ min_input = _mm_min_epi16(min_input, inptr[i]);
+ }
+ max_input = _mm_cmpgt_epi16(max_input, max);
+ min_input = _mm_cmplt_epi16(min_input, min);
+ temp1 = _mm_or_si128(max_input, min_input);
+ test = _mm_movemask_epi8(temp1);
+
+ if (!test) {
+ // Do the row transform (N.B. This transposes inptr)
+ idct16_sse2(inptr, inptr + 16);
+
+ // Find the min & max for the column transform
+    // N.B. Only the first 4 cols contain non-zero coeffs
+ max_input = _mm_max_epi16(inptr[0], inptr[1]);
+ min_input = _mm_min_epi16(inptr[0], inptr[1]);
+ for (i = 2; i < 16; i++) {
+ max_input = _mm_max_epi16(max_input, inptr[i]);
+ min_input = _mm_min_epi16(min_input, inptr[i]);
+ }
+ max_input = _mm_cmpgt_epi16(max_input, max);
+ min_input = _mm_cmplt_epi16(min_input, min);
+ temp1 = _mm_or_si128(max_input, min_input);
+ test = _mm_movemask_epi8(temp1);
+
+ if (test) {
+      // Use the fact that only the first 4 rows contain non-zero coeffs
+ array_transpose_8x8(inptr, inptr);
+ array_transpose_8x8(inptr + 8, inptr + 16);
+ for (i = 0; i < 4; i++) {
+ sign_bits = _mm_cmplt_epi16(inptr[i], zero);
+ temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits);
+ temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits);
+ _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1);
+ _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2);
+ sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero);
+ temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits);
+ temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits);
+ _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1);
+ _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2);
+ }
+ } else {
+ // Set to use the optimised transform for the column
+ optimised_cols = 1;
+ }
+ } else {
+ // Run the un-optimised row transform
+ for (i = 0; i < 4; ++i) {
+ vp9_highbd_idct16(input, outptr, bd);
+ input += 16;
+ outptr += 16;
+ }
+ }
+
+ if (optimised_cols) {
+ idct16_sse2(inptr, inptr + 16);
+
+    // Final round & shift, reconstruction, and store
+ {
+ __m128i d[2];
+ for (i = 0; i < 16; i++) {
+ inptr[i ] = _mm_add_epi16(inptr[i ], rounding);
+ inptr[i+16] = _mm_add_epi16(inptr[i+16], rounding);
+ d[0] = _mm_loadu_si128((const __m128i *)(dest + stride*i));
+ d[1] = _mm_loadu_si128((const __m128i *)(dest + stride*i + 8));
+ inptr[i ] = _mm_srai_epi16(inptr[i ], 6);
+ inptr[i+16] = _mm_srai_epi16(inptr[i+16], 6);
+ d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i ]), bd);
+ d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i+16]), bd);
+ // Store
+ _mm_storeu_si128((__m128i *)(dest + stride*i), d[0]);
+ _mm_storeu_si128((__m128i *)(dest + stride*i + 8), d[1]);
+ }
+ }
+ } else {
+ // Run the un-optimised column transform
+ tran_low_t temp_in[16], temp_out[16];
+ for (i = 0; i < 16; ++i) {
+ for (j = 0; j < 16; ++j)
+ temp_in[j] = out[j * 16 + i];
+ vp9_highbd_idct16(temp_in, temp_out, bd);
+ for (j = 0; j < 16; ++j) {
+ dest[j * stride + i] = highbd_clip_pixel_add(
+ dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
+ }
+ }
+ }
+}
+
+#endif // CONFIG_VP9_HIGHBITDEPTH
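
All of the high-bitdepth entry points above accept a uint8_t *dest8 and widen it with CONVERT_TO_SHORTPTR. The macro itself lives elsewhere in the tree; as an assumption about this vintage of libvpx, it amounts to re-addressing the frame buffer in 16-bit pixels:

    #include <stdint.h>

    /* Assumption: a sketch of the convention, not a definition from this
     * diff. High-bitdepth buffers are carried as pre-shifted byte
     * pointers and converted to uint16_t* at the point of use. */
    #define CONVERT_TO_SHORTPTR_SKETCH(x) ((uint16_t *)(((uintptr_t)(x)) << 1))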
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.h
index 0f179b49a57..984363d4035 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.h
@@ -115,7 +115,6 @@ static INLINE void load_buffer_8x16(const int16_t *input, __m128i *in) {
d0 = _mm_add_epi16(in_x, d0); \
d0 = _mm_packus_epi16(d0, d0); \
_mm_storel_epi64((__m128i *)(dest), d0); \
- dest += stride; \
}
static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) {
@@ -156,20 +155,20 @@ static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) {
in[14] = _mm_srai_epi16(in[14], 6);
in[15] = _mm_srai_epi16(in[15], 6);
- RECON_AND_STORE(dest, in[0]);
- RECON_AND_STORE(dest, in[1]);
- RECON_AND_STORE(dest, in[2]);
- RECON_AND_STORE(dest, in[3]);
- RECON_AND_STORE(dest, in[4]);
- RECON_AND_STORE(dest, in[5]);
- RECON_AND_STORE(dest, in[6]);
- RECON_AND_STORE(dest, in[7]);
- RECON_AND_STORE(dest, in[8]);
- RECON_AND_STORE(dest, in[9]);
- RECON_AND_STORE(dest, in[10]);
- RECON_AND_STORE(dest, in[11]);
- RECON_AND_STORE(dest, in[12]);
- RECON_AND_STORE(dest, in[13]);
- RECON_AND_STORE(dest, in[14]);
- RECON_AND_STORE(dest, in[15]);
+ RECON_AND_STORE(dest + 0 * stride, in[0]);
+ RECON_AND_STORE(dest + 1 * stride, in[1]);
+ RECON_AND_STORE(dest + 2 * stride, in[2]);
+ RECON_AND_STORE(dest + 3 * stride, in[3]);
+ RECON_AND_STORE(dest + 4 * stride, in[4]);
+ RECON_AND_STORE(dest + 5 * stride, in[5]);
+ RECON_AND_STORE(dest + 6 * stride, in[6]);
+ RECON_AND_STORE(dest + 7 * stride, in[7]);
+ RECON_AND_STORE(dest + 8 * stride, in[8]);
+ RECON_AND_STORE(dest + 9 * stride, in[9]);
+ RECON_AND_STORE(dest + 10 * stride, in[10]);
+ RECON_AND_STORE(dest + 11 * stride, in[11]);
+ RECON_AND_STORE(dest + 12 * stride, in[12]);
+ RECON_AND_STORE(dest + 13 * stride, in[13]);
+ RECON_AND_STORE(dest + 14 * stride, in[14]);
+ RECON_AND_STORE(dest + 15 * stride, in[15]);
}
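
The header hunk above removes the dest += stride side effect from RECON_AND_STORE, which is what lets every caller in this diff index with dest + n * stride. Only the macro's tail is visible in the hunk; filling in the usual load/widen prologue as an assumption, the whole macro now reads roughly:

    #include <emmintrin.h>

    /* Sketch: RECON_AND_STORE after this change. The first two
     * statements (load 8 pixels, widen to 16 bits against 'zero', a
     * zeroed __m128i from the enclosing scope) are assumed from context
     * outside this hunk. */
    #define RECON_AND_STORE_SKETCH(dest, in_x)             \
      {                                                    \
        __m128i d0 = _mm_loadl_epi64((__m128i *)(dest));   \
        d0 = _mm_unpacklo_epi8(d0, zero);                  \
        d0 = _mm_add_epi16(in_x, d0);                      \
        d0 = _mm_packus_epi16(d0, d0);                     \
        _mm_storel_epi64((__m128i *)(dest), d0);           \
      }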
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_idct_intrin_ssse3.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_idct_intrin_ssse3.c
deleted file mode 100644
index 73bf5d1d78e..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_idct_intrin_ssse3.c
+++ /dev/null
@@ -1,762 +0,0 @@
-/*
- * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#if defined(_MSC_VER) && _MSC_VER <= 1500
-// Need to include math.h before calling tmmintrin.h/intrin.h
-// in certain versions of MSVS.
-#include <math.h>
-#endif
-#include <tmmintrin.h> // SSSE3
-#include "vp9/common/x86/vp9_idct_intrin_sse2.h"
-
-static void idct16_8col(__m128i *in, int round) {
- const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
- const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
- const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
- const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
- const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
- const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
- const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
- const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
- const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
- const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
- const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
- const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
- const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
- const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
- const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
- const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
- const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
- const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
- const __m128i k__cospi_p16_p16_x2 = pair_set_epi16(23170, 23170);
- const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
- const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-
- __m128i v[16], u[16], s[16], t[16];
-
- // stage 1
- s[0] = in[0];
- s[1] = in[8];
- s[2] = in[4];
- s[3] = in[12];
- s[4] = in[2];
- s[5] = in[10];
- s[6] = in[6];
- s[7] = in[14];
- s[8] = in[1];
- s[9] = in[9];
- s[10] = in[5];
- s[11] = in[13];
- s[12] = in[3];
- s[13] = in[11];
- s[14] = in[7];
- s[15] = in[15];
-
- // stage 2
- u[0] = _mm_unpacklo_epi16(s[8], s[15]);
- u[1] = _mm_unpackhi_epi16(s[8], s[15]);
- u[2] = _mm_unpacklo_epi16(s[9], s[14]);
- u[3] = _mm_unpackhi_epi16(s[9], s[14]);
- u[4] = _mm_unpacklo_epi16(s[10], s[13]);
- u[5] = _mm_unpackhi_epi16(s[10], s[13]);
- u[6] = _mm_unpacklo_epi16(s[11], s[12]);
- u[7] = _mm_unpackhi_epi16(s[11], s[12]);
-
- v[0] = _mm_madd_epi16(u[0], k__cospi_p30_m02);
- v[1] = _mm_madd_epi16(u[1], k__cospi_p30_m02);
- v[2] = _mm_madd_epi16(u[0], k__cospi_p02_p30);
- v[3] = _mm_madd_epi16(u[1], k__cospi_p02_p30);
- v[4] = _mm_madd_epi16(u[2], k__cospi_p14_m18);
- v[5] = _mm_madd_epi16(u[3], k__cospi_p14_m18);
- v[6] = _mm_madd_epi16(u[2], k__cospi_p18_p14);
- v[7] = _mm_madd_epi16(u[3], k__cospi_p18_p14);
- v[8] = _mm_madd_epi16(u[4], k__cospi_p22_m10);
- v[9] = _mm_madd_epi16(u[5], k__cospi_p22_m10);
- v[10] = _mm_madd_epi16(u[4], k__cospi_p10_p22);
- v[11] = _mm_madd_epi16(u[5], k__cospi_p10_p22);
- v[12] = _mm_madd_epi16(u[6], k__cospi_p06_m26);
- v[13] = _mm_madd_epi16(u[7], k__cospi_p06_m26);
- v[14] = _mm_madd_epi16(u[6], k__cospi_p26_p06);
- v[15] = _mm_madd_epi16(u[7], k__cospi_p26_p06);
-
- u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
- u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
- u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
- u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
- u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
- u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
- u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
- u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
- u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
- u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
- u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
- u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
- u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
- u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
- u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
- u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
-
- u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
- u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
- u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
- u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
- u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
- u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
- u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
- u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
- u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
- u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
- u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
- u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
- u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
- u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
- u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
- u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
-
- s[8] = _mm_packs_epi32(u[0], u[1]);
- s[15] = _mm_packs_epi32(u[2], u[3]);
- s[9] = _mm_packs_epi32(u[4], u[5]);
- s[14] = _mm_packs_epi32(u[6], u[7]);
- s[10] = _mm_packs_epi32(u[8], u[9]);
- s[13] = _mm_packs_epi32(u[10], u[11]);
- s[11] = _mm_packs_epi32(u[12], u[13]);
- s[12] = _mm_packs_epi32(u[14], u[15]);
-
- // stage 3
- t[0] = s[0];
- t[1] = s[1];
- t[2] = s[2];
- t[3] = s[3];
- u[0] = _mm_unpacklo_epi16(s[4], s[7]);
- u[1] = _mm_unpackhi_epi16(s[4], s[7]);
- u[2] = _mm_unpacklo_epi16(s[5], s[6]);
- u[3] = _mm_unpackhi_epi16(s[5], s[6]);
-
- v[0] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
- v[1] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
- v[2] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
- v[3] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
- v[4] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
- v[5] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
- v[6] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
- v[7] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
-
- u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
- u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
- u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
- u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
- u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
- u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
- u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
- u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
-
- u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
- u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
- u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
- u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
- u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
- u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
- u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
- u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
-
- t[4] = _mm_packs_epi32(u[0], u[1]);
- t[7] = _mm_packs_epi32(u[2], u[3]);
- t[5] = _mm_packs_epi32(u[4], u[5]);
- t[6] = _mm_packs_epi32(u[6], u[7]);
- t[8] = _mm_add_epi16(s[8], s[9]);
- t[9] = _mm_sub_epi16(s[8], s[9]);
- t[10] = _mm_sub_epi16(s[11], s[10]);
- t[11] = _mm_add_epi16(s[10], s[11]);
- t[12] = _mm_add_epi16(s[12], s[13]);
- t[13] = _mm_sub_epi16(s[12], s[13]);
- t[14] = _mm_sub_epi16(s[15], s[14]);
- t[15] = _mm_add_epi16(s[14], s[15]);
-
- // stage 4
- u[0] = _mm_add_epi16(t[0], t[1]);
- u[1] = _mm_sub_epi16(t[0], t[1]);
- u[2] = _mm_unpacklo_epi16(t[2], t[3]);
- u[3] = _mm_unpackhi_epi16(t[2], t[3]);
- u[4] = _mm_unpacklo_epi16(t[9], t[14]);
- u[5] = _mm_unpackhi_epi16(t[9], t[14]);
- u[6] = _mm_unpacklo_epi16(t[10], t[13]);
- u[7] = _mm_unpackhi_epi16(t[10], t[13]);
-
- s[0] = _mm_mulhrs_epi16(u[0], k__cospi_p16_p16_x2);
- s[1] = _mm_mulhrs_epi16(u[1], k__cospi_p16_p16_x2);
- v[4] = _mm_madd_epi16(u[2], k__cospi_p24_m08);
- v[5] = _mm_madd_epi16(u[3], k__cospi_p24_m08);
- v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
- v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
- v[8] = _mm_madd_epi16(u[4], k__cospi_m08_p24);
- v[9] = _mm_madd_epi16(u[5], k__cospi_m08_p24);
- v[10] = _mm_madd_epi16(u[4], k__cospi_p24_p08);
- v[11] = _mm_madd_epi16(u[5], k__cospi_p24_p08);
- v[12] = _mm_madd_epi16(u[6], k__cospi_m24_m08);
- v[13] = _mm_madd_epi16(u[7], k__cospi_m24_m08);
- v[14] = _mm_madd_epi16(u[6], k__cospi_m08_p24);
- v[15] = _mm_madd_epi16(u[7], k__cospi_m08_p24);
-
- u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
- u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
- u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
- u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
- u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
- u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
- u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
- u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
- u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
- u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
- u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
- u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
-
- u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
- u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
- u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
- u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
- u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
- u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
- u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
- u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
- u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
- u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
- u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
- u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
-
- s[2] = _mm_packs_epi32(u[4], u[5]);
- s[3] = _mm_packs_epi32(u[6], u[7]);
- s[4] = _mm_add_epi16(t[4], t[5]);
- s[5] = _mm_sub_epi16(t[4], t[5]);
- s[6] = _mm_sub_epi16(t[7], t[6]);
- s[7] = _mm_add_epi16(t[6], t[7]);
- s[8] = t[8];
- s[15] = t[15];
- s[9] = _mm_packs_epi32(u[8], u[9]);
- s[14] = _mm_packs_epi32(u[10], u[11]);
- s[10] = _mm_packs_epi32(u[12], u[13]);
- s[13] = _mm_packs_epi32(u[14], u[15]);
- s[11] = t[11];
- s[12] = t[12];
-
- // stage 5
- t[0] = _mm_add_epi16(s[0], s[3]);
- t[1] = _mm_add_epi16(s[1], s[2]);
- t[2] = _mm_sub_epi16(s[1], s[2]);
- t[3] = _mm_sub_epi16(s[0], s[3]);
- t[4] = s[4];
- t[7] = s[7];
-
- u[0] = _mm_sub_epi16(s[6], s[5]);
- u[1] = _mm_add_epi16(s[6], s[5]);
- t[5] = _mm_mulhrs_epi16(u[0], k__cospi_p16_p16_x2);
- t[6] = _mm_mulhrs_epi16(u[1], k__cospi_p16_p16_x2);
-
- t[8] = _mm_add_epi16(s[8], s[11]);
- t[9] = _mm_add_epi16(s[9], s[10]);
- t[10] = _mm_sub_epi16(s[9], s[10]);
- t[11] = _mm_sub_epi16(s[8], s[11]);
- t[12] = _mm_sub_epi16(s[15], s[12]);
- t[13] = _mm_sub_epi16(s[14], s[13]);
- t[14] = _mm_add_epi16(s[13], s[14]);
- t[15] = _mm_add_epi16(s[12], s[15]);
-
- // stage 6
- if (round == 1) {
- s[0] = _mm_add_epi16(t[0], t[7]);
- s[1] = _mm_add_epi16(t[1], t[6]);
- s[2] = _mm_add_epi16(t[2], t[5]);
- s[3] = _mm_add_epi16(t[3], t[4]);
- s[4] = _mm_sub_epi16(t[3], t[4]);
- s[5] = _mm_sub_epi16(t[2], t[5]);
- s[6] = _mm_sub_epi16(t[1], t[6]);
- s[7] = _mm_sub_epi16(t[0], t[7]);
- s[8] = t[8];
- s[9] = t[9];
-
- u[0] = _mm_unpacklo_epi16(t[10], t[13]);
- u[1] = _mm_unpackhi_epi16(t[10], t[13]);
- u[2] = _mm_unpacklo_epi16(t[11], t[12]);
- u[3] = _mm_unpackhi_epi16(t[11], t[12]);
-
- v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
- v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
- v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
- v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
- v[4] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
- v[5] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
- v[6] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
- v[7] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
-
- u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
- u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
- u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
- u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
- u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
- u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
- u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
- u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
-
- u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
- u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
- u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
- u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
- u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
- u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
- u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
- u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
-
- s[10] = _mm_packs_epi32(u[0], u[1]);
- s[13] = _mm_packs_epi32(u[2], u[3]);
- s[11] = _mm_packs_epi32(u[4], u[5]);
- s[12] = _mm_packs_epi32(u[6], u[7]);
- s[14] = t[14];
- s[15] = t[15];
- } else {
- s[0] = _mm_add_epi16(t[0], t[7]);
- s[1] = _mm_add_epi16(t[1], t[6]);
- s[2] = _mm_add_epi16(t[2], t[5]);
- s[3] = _mm_add_epi16(t[3], t[4]);
- s[4] = _mm_sub_epi16(t[3], t[4]);
- s[5] = _mm_sub_epi16(t[2], t[5]);
- s[6] = _mm_sub_epi16(t[1], t[6]);
- s[7] = _mm_sub_epi16(t[0], t[7]);
- s[8] = t[8];
- s[9] = t[9];
-
- u[0] = _mm_sub_epi16(t[13], t[10]);
- u[1] = _mm_add_epi16(t[13], t[10]);
- u[2] = _mm_sub_epi16(t[12], t[11]);
- u[3] = _mm_add_epi16(t[12], t[11]);
-
- s[10] = _mm_mulhrs_epi16(u[0], k__cospi_p16_p16_x2);
- s[13] = _mm_mulhrs_epi16(u[1], k__cospi_p16_p16_x2);
- s[11] = _mm_mulhrs_epi16(u[2], k__cospi_p16_p16_x2);
- s[12] = _mm_mulhrs_epi16(u[3], k__cospi_p16_p16_x2);
- s[14] = t[14];
- s[15] = t[15];
- }
-
- // stage 7
- in[0] = _mm_add_epi16(s[0], s[15]);
- in[1] = _mm_add_epi16(s[1], s[14]);
- in[2] = _mm_add_epi16(s[2], s[13]);
- in[3] = _mm_add_epi16(s[3], s[12]);
- in[4] = _mm_add_epi16(s[4], s[11]);
- in[5] = _mm_add_epi16(s[5], s[10]);
- in[6] = _mm_add_epi16(s[6], s[9]);
- in[7] = _mm_add_epi16(s[7], s[8]);
- in[8] = _mm_sub_epi16(s[7], s[8]);
- in[9] = _mm_sub_epi16(s[6], s[9]);
- in[10] = _mm_sub_epi16(s[5], s[10]);
- in[11] = _mm_sub_epi16(s[4], s[11]);
- in[12] = _mm_sub_epi16(s[3], s[12]);
- in[13] = _mm_sub_epi16(s[2], s[13]);
- in[14] = _mm_sub_epi16(s[1], s[14]);
- in[15] = _mm_sub_epi16(s[0], s[15]);
-}
-
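
The deleted idct16 code above mixes two equivalent fixed-point butterflies: an exact path that widens with _mm_madd_epi16, adds k__DCT_CONST_ROUNDING and shifts right by DCT_CONST_BITS (14 in libvpx, so the rounding constant is 1 << 13), and a cheap path that feeds _mm_mulhrs_epi16 a pre-doubled cosine (the _x2 constants). Since mulhrs computes (a*b + (1 << 14)) >> 15, doubling the constant reproduces the exact path's (a*c + (1 << 13)) >> 14 in a single instruction. A minimal scalar model, assuming only the DCT_CONST_BITS value from vp9_idct.h:

#include <stdint.h>

#define DCT_CONST_BITS 14
#define DCT_CONST_ROUNDING (1 << (DCT_CONST_BITS - 1))

/* Exact path: the madd / add-rounding / arithmetic-shift sequence. */
static int16_t round_shift(int32_t v) {
  return (int16_t)((v + DCT_CONST_ROUNDING) >> DCT_CONST_BITS);
}

/* mulhrs path: identical result when the constant is pre-doubled. */
static int16_t mulhrs_model(int16_t a, int16_t k_x2) {
  return (int16_t)(((int32_t)a * k_x2 + (1 << 14)) >> 15);
}
/* round_shift(a * c) == mulhrs_model(a, 2 * c) for in-range inputs. */
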
-static void idct16_sse2(__m128i *in0, __m128i *in1, int round) {
- array_transpose_16x16(in0, in1);
- idct16_8col(in0, round);
- idct16_8col(in1, round);
-}
-
-void vp9_idct16x16_256_add_ssse3(const int16_t *input, uint8_t *dest,
- int stride) {
- __m128i in0[16], in1[16];
-
- load_buffer_8x16(input, in0);
- input += 8;
- load_buffer_8x16(input, in1);
-
- idct16_sse2(in0, in1, 0);
- idct16_sse2(in0, in1, 1);
-
- write_buffer_8x16(dest, in0, stride);
- dest += 8;
- write_buffer_8x16(dest, in1, stride);
-}
-
-static void idct16_10_r1(__m128i *in, __m128i *l) {
- const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
- const __m128i zero = _mm_setzero_si128();
-
- const __m128i stg2_01 = dual_set_epi16(3212, 32610);
- const __m128i stg2_67 = dual_set_epi16(-9512, 31358);
- const __m128i stg3_01 = dual_set_epi16(6392, 32138);
- const __m128i stg4_01 = dual_set_epi16(23170, 23170);
-
-
-
- const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
- const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
- const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
- const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-
- __m128i stp1_0, stp1_1, stp1_4, stp1_6,
- stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15;
- __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
- stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13;
- __m128i tmp0, tmp1, tmp2, tmp3, tmp4;
-
- // Stage2
- {
- const __m128i lo_1_15 = _mm_unpackhi_epi64(in[0], in[0]);
- const __m128i lo_13_3 = _mm_unpackhi_epi64(in[1], in[1]);
-
- stp2_8 = _mm_mulhrs_epi16(lo_1_15, stg2_01);
- stp2_11 = _mm_mulhrs_epi16(lo_13_3, stg2_67);
- }
-
- // Stage3
- {
- const __m128i lo_2_14 = _mm_unpacklo_epi64(in[1], in[1]);
- stp1_4 = _mm_mulhrs_epi16(lo_2_14, stg3_01);
-
- stp1_13 = _mm_unpackhi_epi64(stp2_11, zero);
- stp1_14 = _mm_unpackhi_epi64(stp2_8, zero);
- }
-
- // Stage4
- {
- const __m128i lo_0_8 = _mm_unpacklo_epi64(in[0], in[0]);
- const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp1_14);
- const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp1_13);
-
- tmp0 = _mm_mulhrs_epi16(lo_0_8, stg4_01);
- tmp1 = _mm_madd_epi16(lo_9_14, stg4_4);
- tmp3 = _mm_madd_epi16(lo_9_14, stg4_5);
- tmp2 = _mm_madd_epi16(lo_10_13, stg4_6);
- tmp4 = _mm_madd_epi16(lo_10_13, stg4_7);
-
- tmp1 = _mm_add_epi32(tmp1, rounding);
- tmp3 = _mm_add_epi32(tmp3, rounding);
- tmp2 = _mm_add_epi32(tmp2, rounding);
- tmp4 = _mm_add_epi32(tmp4, rounding);
-
- tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
- tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
- tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
-
- stp1_0 = _mm_unpacklo_epi64(tmp0, tmp0);
- stp1_1 = _mm_unpackhi_epi64(tmp0, tmp0);
- stp2_9 = _mm_packs_epi32(tmp1, tmp3);
- stp2_10 = _mm_packs_epi32(tmp2, tmp4);
-
- stp2_6 = _mm_unpackhi_epi64(stp1_4, zero);
- }
-
- // Stage5 and Stage6
- {
- tmp0 = _mm_add_epi16(stp2_8, stp2_11);
- tmp1 = _mm_sub_epi16(stp2_8, stp2_11);
- tmp2 = _mm_add_epi16(stp2_9, stp2_10);
- tmp3 = _mm_sub_epi16(stp2_9, stp2_10);
-
- stp1_9 = _mm_unpacklo_epi64(tmp2, zero);
- stp1_10 = _mm_unpacklo_epi64(tmp3, zero);
- stp1_8 = _mm_unpacklo_epi64(tmp0, zero);
- stp1_11 = _mm_unpacklo_epi64(tmp1, zero);
-
- stp1_13 = _mm_unpackhi_epi64(tmp3, zero);
- stp1_14 = _mm_unpackhi_epi64(tmp2, zero);
- stp1_12 = _mm_unpackhi_epi64(tmp1, zero);
- stp1_15 = _mm_unpackhi_epi64(tmp0, zero);
- }
-
- // Stage6
- {
- const __m128i lo_6_5 = _mm_add_epi16(stp2_6, stp1_4);
- const __m128i lo_6_6 = _mm_sub_epi16(stp2_6, stp1_4);
- const __m128i lo_10_13 = _mm_sub_epi16(stp1_13, stp1_10);
- const __m128i lo_10_14 = _mm_add_epi16(stp1_13, stp1_10);
- const __m128i lo_11_12 = _mm_sub_epi16(stp1_12, stp1_11);
- const __m128i lo_11_13 = _mm_add_epi16(stp1_12, stp1_11);
-
- tmp1 = _mm_unpacklo_epi64(lo_6_5, lo_6_6);
- tmp0 = _mm_unpacklo_epi64(lo_10_13, lo_10_14);
- tmp4 = _mm_unpacklo_epi64(lo_11_12, lo_11_13);
-
- stp1_6 = _mm_mulhrs_epi16(tmp1, stg4_01);
- tmp0 = _mm_mulhrs_epi16(tmp0, stg4_01);
- tmp4 = _mm_mulhrs_epi16(tmp4, stg4_01);
-
- stp2_10 = _mm_unpacklo_epi64(tmp0, zero);
- stp2_13 = _mm_unpackhi_epi64(tmp0, zero);
- stp2_11 = _mm_unpacklo_epi64(tmp4, zero);
- stp2_12 = _mm_unpackhi_epi64(tmp4, zero);
-
- tmp0 = _mm_add_epi16(stp1_0, stp1_4);
- tmp1 = _mm_sub_epi16(stp1_0, stp1_4);
- tmp2 = _mm_add_epi16(stp1_1, stp1_6);
- tmp3 = _mm_sub_epi16(stp1_1, stp1_6);
-
- stp2_0 = _mm_unpackhi_epi64(tmp0, zero);
- stp2_1 = _mm_unpacklo_epi64(tmp2, zero);
- stp2_2 = _mm_unpackhi_epi64(tmp2, zero);
- stp2_3 = _mm_unpacklo_epi64(tmp0, zero);
- stp2_4 = _mm_unpacklo_epi64(tmp1, zero);
- stp2_5 = _mm_unpackhi_epi64(tmp3, zero);
- stp2_6 = _mm_unpacklo_epi64(tmp3, zero);
- stp2_7 = _mm_unpackhi_epi64(tmp1, zero);
- }
-
- // Stage7. Left 8x16 only.
- l[0] = _mm_add_epi16(stp2_0, stp1_15);
- l[1] = _mm_add_epi16(stp2_1, stp1_14);
- l[2] = _mm_add_epi16(stp2_2, stp2_13);
- l[3] = _mm_add_epi16(stp2_3, stp2_12);
- l[4] = _mm_add_epi16(stp2_4, stp2_11);
- l[5] = _mm_add_epi16(stp2_5, stp2_10);
- l[6] = _mm_add_epi16(stp2_6, stp1_9);
- l[7] = _mm_add_epi16(stp2_7, stp1_8);
- l[8] = _mm_sub_epi16(stp2_7, stp1_8);
- l[9] = _mm_sub_epi16(stp2_6, stp1_9);
- l[10] = _mm_sub_epi16(stp2_5, stp2_10);
- l[11] = _mm_sub_epi16(stp2_4, stp2_11);
- l[12] = _mm_sub_epi16(stp2_3, stp2_12);
- l[13] = _mm_sub_epi16(stp2_2, stp2_13);
- l[14] = _mm_sub_epi16(stp2_1, stp1_14);
- l[15] = _mm_sub_epi16(stp2_0, stp1_15);
-}
-
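
In this eob <= 10 fast path only the top-left 4x4 of coefficients can be nonzero, so the first pass works on a 4x8 transpose and several stage outputs are assembled with 64-bit unpacks instead of full butterflies. The dual_set_epi16 constants are the vp9_idct.h cosines doubled for _mm_mulhrs_epi16; a sanity check, assuming those header values:

#include <assert.h>

/* cospi_*_64 constants as defined in vp9/common/vp9_idct.h. */
enum {
  cospi_2_64 = 16305, cospi_4_64 = 16069, cospi_6_64 = 15679,
  cospi_16_64 = 11585, cospi_26_64 = 4756, cospi_28_64 = 3196,
  cospi_30_64 = 1606
};

int main(void) {
  assert(2 * cospi_30_64 == 3212 && 2 * cospi_2_64 == 32610);   /* stg2_01 */
  assert(-2 * cospi_26_64 == -9512 && 2 * cospi_6_64 == 31358); /* stg2_67 */
  assert(2 * cospi_28_64 == 6392 && 2 * cospi_4_64 == 32138);   /* stg3_01 */
  assert(2 * cospi_16_64 == 23170);                             /* stg4_01 */
  return 0;
}
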
-static void idct16_10_r2(__m128i *in) {
- const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
-
- const __m128i stg2_0 = dual_set_epi16(3212, 3212);
- const __m128i stg2_1 = dual_set_epi16(32610, 32610);
- const __m128i stg2_6 = dual_set_epi16(-9512, -9512);
- const __m128i stg2_7 = dual_set_epi16(31358, 31358);
- const __m128i stg3_0 = dual_set_epi16(6392, 6392);
- const __m128i stg3_1 = dual_set_epi16(32138, 32138);
- const __m128i stg4_01 = dual_set_epi16(23170, 23170);
-
- const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
- const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
- const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
- const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-
- __m128i stp1_0, stp1_2, stp1_3, stp1_5, stp1_6,
- stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
- stp1_8_0, stp1_12_0;
- __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
- stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14;
- __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-
- /* Stage2 */
- {
- stp1_8_0 = _mm_mulhrs_epi16(in[1], stg2_0);
- stp1_15 = _mm_mulhrs_epi16(in[1], stg2_1);
- stp1_11 = _mm_mulhrs_epi16(in[3], stg2_6);
- stp1_12_0 = _mm_mulhrs_epi16(in[3], stg2_7);
- }
-
- /* Stage3 */
- {
- stp2_4 = _mm_mulhrs_epi16(in[2], stg3_0);
- stp2_7 = _mm_mulhrs_epi16(in[2], stg3_1);
-
- stp1_9 = stp1_8_0;
- stp1_10 = stp1_11;
-
- stp1_13 = stp1_12_0;
- stp1_14 = stp1_15;
- }
-
- /* Stage4 */
- {
- const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14);
- const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14);
- const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);
- const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13);
-
- stp1_0 = _mm_mulhrs_epi16(in[0], stg4_01);
-
- stp2_5 = stp2_4;
- stp2_6 = stp2_7;
-
-
- tmp0 = _mm_madd_epi16(lo_9_14, stg4_4);
- tmp1 = _mm_madd_epi16(hi_9_14, stg4_4);
- tmp2 = _mm_madd_epi16(lo_9_14, stg4_5);
- tmp3 = _mm_madd_epi16(hi_9_14, stg4_5);
- tmp4 = _mm_madd_epi16(lo_10_13, stg4_6);
- tmp5 = _mm_madd_epi16(hi_10_13, stg4_6);
- tmp6 = _mm_madd_epi16(lo_10_13, stg4_7);
- tmp7 = _mm_madd_epi16(hi_10_13, stg4_7);
-
- tmp0 = _mm_add_epi32(tmp0, rounding);
- tmp1 = _mm_add_epi32(tmp1, rounding);
- tmp2 = _mm_add_epi32(tmp2, rounding);
- tmp3 = _mm_add_epi32(tmp3, rounding);
- tmp4 = _mm_add_epi32(tmp4, rounding);
- tmp5 = _mm_add_epi32(tmp5, rounding);
- tmp6 = _mm_add_epi32(tmp6, rounding);
- tmp7 = _mm_add_epi32(tmp7, rounding);
-
- tmp0 = _mm_srai_epi32(tmp0, 14);
- tmp1 = _mm_srai_epi32(tmp1, 14);
- tmp2 = _mm_srai_epi32(tmp2, 14);
- tmp3 = _mm_srai_epi32(tmp3, 14);
- tmp4 = _mm_srai_epi32(tmp4, 14);
- tmp5 = _mm_srai_epi32(tmp5, 14);
- tmp6 = _mm_srai_epi32(tmp6, 14);
- tmp7 = _mm_srai_epi32(tmp7, 14);
-
- stp2_9 = _mm_packs_epi32(tmp0, tmp1);
- stp2_14 = _mm_packs_epi32(tmp2, tmp3);
- stp2_10 = _mm_packs_epi32(tmp4, tmp5);
- stp2_13 = _mm_packs_epi32(tmp6, tmp7);
- }
-
- /* Stage5 */
- {
- stp1_2 = stp1_0;
- stp1_3 = stp1_0;
-
- tmp0 = _mm_sub_epi16(stp2_6, stp2_5);
- tmp1 = _mm_add_epi16(stp2_6, stp2_5);
-
- stp1_5 = _mm_mulhrs_epi16(tmp0, stg4_01);
- stp1_6 = _mm_mulhrs_epi16(tmp1, stg4_01);
-
- stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11);
- stp1_9 = _mm_add_epi16(stp2_9, stp2_10);
- stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);
- stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11);
-
- stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0);
- stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);
- stp1_14 = _mm_add_epi16(stp2_14, stp2_13);
- stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0);
- }
-
- /* Stage6 */
- {
- stp2_0 = _mm_add_epi16(stp1_0, stp2_7);
- stp2_1 = _mm_add_epi16(stp1_0, stp1_6);
- stp2_2 = _mm_add_epi16(stp1_2, stp1_5);
- stp2_3 = _mm_add_epi16(stp1_3, stp2_4);
-
- tmp0 = _mm_sub_epi16(stp1_13, stp1_10);
- tmp1 = _mm_add_epi16(stp1_13, stp1_10);
- tmp2 = _mm_sub_epi16(stp1_12, stp1_11);
- tmp3 = _mm_add_epi16(stp1_12, stp1_11);
-
- stp2_4 = _mm_sub_epi16(stp1_3, stp2_4);
- stp2_5 = _mm_sub_epi16(stp1_2, stp1_5);
- stp2_6 = _mm_sub_epi16(stp1_0, stp1_6);
- stp2_7 = _mm_sub_epi16(stp1_0, stp2_7);
-
- stp2_10 = _mm_mulhrs_epi16(tmp0, stg4_01);
- stp2_13 = _mm_mulhrs_epi16(tmp1, stg4_01);
- stp2_11 = _mm_mulhrs_epi16(tmp2, stg4_01);
- stp2_12 = _mm_mulhrs_epi16(tmp3, stg4_01);
- }
-
- // Stage7
- in[0] = _mm_add_epi16(stp2_0, stp1_15);
- in[1] = _mm_add_epi16(stp2_1, stp1_14);
- in[2] = _mm_add_epi16(stp2_2, stp2_13);
- in[3] = _mm_add_epi16(stp2_3, stp2_12);
- in[4] = _mm_add_epi16(stp2_4, stp2_11);
- in[5] = _mm_add_epi16(stp2_5, stp2_10);
- in[6] = _mm_add_epi16(stp2_6, stp1_9);
- in[7] = _mm_add_epi16(stp2_7, stp1_8);
- in[8] = _mm_sub_epi16(stp2_7, stp1_8);
- in[9] = _mm_sub_epi16(stp2_6, stp1_9);
- in[10] = _mm_sub_epi16(stp2_5, stp2_10);
- in[11] = _mm_sub_epi16(stp2_4, stp2_11);
- in[12] = _mm_sub_epi16(stp2_3, stp2_12);
- in[13] = _mm_sub_epi16(stp2_2, stp2_13);
- in[14] = _mm_sub_epi16(stp2_1, stp1_14);
- in[15] = _mm_sub_epi16(stp2_0, stp1_15);
-}
-
-void vp9_idct16x16_10_add_ssse3(const int16_t *input, uint8_t *dest,
- int stride) {
- const __m128i final_rounding = _mm_set1_epi16(1<<5);
- const __m128i zero = _mm_setzero_si128();
- __m128i in[16], l[16];
-
- int i;
- // First 1-D inverse DCT
- // Load input data.
- in[0] = _mm_load_si128((const __m128i *)input);
- in[1] = _mm_load_si128((const __m128i *)(input + 8 * 2));
- in[2] = _mm_load_si128((const __m128i *)(input + 8 * 4));
- in[3] = _mm_load_si128((const __m128i *)(input + 8 * 6));
-
- TRANSPOSE_8X4(in[0], in[1], in[2], in[3], in[0], in[1]);
-
- idct16_10_r1(in, l);
-
- // Second 1-D inverse transform, performed per 8x16 block
- for (i = 0; i < 2; i++) {
- array_transpose_4X8(l + 8*i, in);
-
- idct16_10_r2(in);
-
- // Final rounding and shift
- in[0] = _mm_adds_epi16(in[0], final_rounding);
- in[1] = _mm_adds_epi16(in[1], final_rounding);
- in[2] = _mm_adds_epi16(in[2], final_rounding);
- in[3] = _mm_adds_epi16(in[3], final_rounding);
- in[4] = _mm_adds_epi16(in[4], final_rounding);
- in[5] = _mm_adds_epi16(in[5], final_rounding);
- in[6] = _mm_adds_epi16(in[6], final_rounding);
- in[7] = _mm_adds_epi16(in[7], final_rounding);
- in[8] = _mm_adds_epi16(in[8], final_rounding);
- in[9] = _mm_adds_epi16(in[9], final_rounding);
- in[10] = _mm_adds_epi16(in[10], final_rounding);
- in[11] = _mm_adds_epi16(in[11], final_rounding);
- in[12] = _mm_adds_epi16(in[12], final_rounding);
- in[13] = _mm_adds_epi16(in[13], final_rounding);
- in[14] = _mm_adds_epi16(in[14], final_rounding);
- in[15] = _mm_adds_epi16(in[15], final_rounding);
-
- in[0] = _mm_srai_epi16(in[0], 6);
- in[1] = _mm_srai_epi16(in[1], 6);
- in[2] = _mm_srai_epi16(in[2], 6);
- in[3] = _mm_srai_epi16(in[3], 6);
- in[4] = _mm_srai_epi16(in[4], 6);
- in[5] = _mm_srai_epi16(in[5], 6);
- in[6] = _mm_srai_epi16(in[6], 6);
- in[7] = _mm_srai_epi16(in[7], 6);
- in[8] = _mm_srai_epi16(in[8], 6);
- in[9] = _mm_srai_epi16(in[9], 6);
- in[10] = _mm_srai_epi16(in[10], 6);
- in[11] = _mm_srai_epi16(in[11], 6);
- in[12] = _mm_srai_epi16(in[12], 6);
- in[13] = _mm_srai_epi16(in[13], 6);
- in[14] = _mm_srai_epi16(in[14], 6);
- in[15] = _mm_srai_epi16(in[15], 6);
-
- RECON_AND_STORE(dest, in[0]);
- RECON_AND_STORE(dest, in[1]);
- RECON_AND_STORE(dest, in[2]);
- RECON_AND_STORE(dest, in[3]);
- RECON_AND_STORE(dest, in[4]);
- RECON_AND_STORE(dest, in[5]);
- RECON_AND_STORE(dest, in[6]);
- RECON_AND_STORE(dest, in[7]);
- RECON_AND_STORE(dest, in[8]);
- RECON_AND_STORE(dest, in[9]);
- RECON_AND_STORE(dest, in[10]);
- RECON_AND_STORE(dest, in[11]);
- RECON_AND_STORE(dest, in[12]);
- RECON_AND_STORE(dest, in[13]);
- RECON_AND_STORE(dest, in[14]);
- RECON_AND_STORE(dest, in[15]);
-
- dest += 8 - (stride * 16);
- }
-}
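
The 10-coefficient function above finishes the same way as the full 256-coefficient version: each residual is biased by 1 << 5, shifted right by 6, and RECON_AND_STORE adds it to the prediction with unsigned saturation. A scalar model of that tail (clip_pixel is written out here; the real macro widens dest with punpcklbw and re-packs with packuswb):

#include <stdint.h>

static uint8_t clip_pixel(int v) { return v < 0 ? 0 : (v > 255 ? 255 : v); }

/* Scalar model of the final_rounding / srai-by-6 / RECON_AND_STORE tail. */
static void recon_row_model(uint8_t *dest, const int16_t *res, int n) {
  int i;
  for (i = 0; i < n; i++)
    dest[i] = clip_pixel(dest[i] + ((res[i] + 32) >> 6));
}
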
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_intrapred_sse2.asm b/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_intrapred_sse2.asm
index 69b07f64575..22b5731886c 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_intrapred_sse2.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_intrapred_sse2.asm
@@ -15,6 +15,11 @@ pw_4: times 8 dw 4
pw_8: times 8 dw 8
pw_16: times 8 dw 16
pw_32: times 8 dw 32
+dc_128: times 16 db 128
+pw2_4: times 8 dw 2
+pw2_8: times 8 dw 4
+pw2_16: times 8 dw 8
+pw2_32: times 8 dw 16
SECTION .text
@@ -40,6 +45,46 @@ cglobal dc_predictor_4x4, 4, 5, 2, dst, stride, above, left, goffset
RET
INIT_MMX sse
+cglobal dc_left_predictor_4x4, 4, 5, 2, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ pxor m1, m1
+ movd m0, [leftq]
+ psadbw m0, m1
+ paddw m0, [GLOBAL(pw2_4)]
+ psraw m0, 2
+ pshufw m0, m0, 0x0
+ packuswb m0, m0
+ movd [dstq ], m0
+ movd [dstq+strideq], m0
+ lea dstq, [dstq+strideq*2]
+ movd [dstq ], m0
+ movd [dstq+strideq], m0
+
+ RESTORE_GOT
+ RET
+
+INIT_MMX sse
+cglobal dc_top_predictor_4x4, 4, 5, 2, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ pxor m1, m1
+ movd m0, [aboveq]
+ psadbw m0, m1
+ paddw m0, [GLOBAL(pw2_4)]
+ psraw m0, 2
+ pshufw m0, m0, 0x0
+ packuswb m0, m0
+ movd [dstq ], m0
+ movd [dstq+strideq], m0
+ lea dstq, [dstq+strideq*2]
+ movd [dstq ], m0
+ movd [dstq+strideq], m0
+
+ RESTORE_GOT
+ RET
+
+INIT_MMX sse
cglobal dc_predictor_8x8, 4, 5, 3, dst, stride, above, left, goffset
GET_GOT goffsetq
@@ -68,6 +113,91 @@ cglobal dc_predictor_8x8, 4, 5, 3, dst, stride, above, left, goffset
RESTORE_GOT
RET
+INIT_MMX sse
+cglobal dc_top_predictor_8x8, 4, 5, 3, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ pxor m1, m1
+ movq m0, [aboveq]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ psadbw m0, m1
+ paddw m0, [GLOBAL(pw2_8)]
+ psraw m0, 3
+ pshufw m0, m0, 0x0
+ packuswb m0, m0
+ movq [dstq ], m0
+ movq [dstq+strideq ], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ movq [dstq ], m0
+ movq [dstq+strideq ], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+
+ RESTORE_GOT
+ RET
+
+INIT_MMX sse
+cglobal dc_left_predictor_8x8, 4, 5, 3, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ pxor m1, m1
+ movq m0, [leftq]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ psadbw m0, m1
+ paddw m0, [GLOBAL(pw2_8)]
+ psraw m0, 3
+ pshufw m0, m0, 0x0
+ packuswb m0, m0
+ movq [dstq ], m0
+ movq [dstq+strideq ], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ movq [dstq ], m0
+ movq [dstq+strideq ], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+
+ RESTORE_GOT
+ RET
+
+INIT_MMX sse
+cglobal dc_128_predictor_4x4, 4, 5, 3, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ movd m0, [GLOBAL(dc_128)]
+ movd [dstq ], m0
+ movd [dstq+strideq ], m0
+ movd [dstq+strideq*2], m0
+ movd [dstq+stride3q ], m0
+ RESTORE_GOT
+ RET
+
+INIT_MMX sse
+cglobal dc_128_predictor_8x8, 4, 5, 3, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ movq m0, [GLOBAL(dc_128)]
+ movq [dstq ], m0
+ movq [dstq+strideq ], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ movq [dstq ], m0
+ movq [dstq+strideq ], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+ RESTORE_GOT
+ RET
+
INIT_XMM sse2
cglobal dc_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
GET_GOT goffsetq
@@ -100,6 +230,91 @@ cglobal dc_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
RESTORE_GOT
REP_RET
+
+INIT_XMM sse2
+cglobal dc_top_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ pxor m1, m1
+ pxor m2, m2
+ mova m0, [aboveq]
+ DEFINE_ARGS dst, stride, stride3, lines4
+ lea stride3q, [strideq*3]
+ mov lines4d, 4
+ psadbw m0, m1
+ psadbw m2, m1
+ paddw m0, m2
+ movhlps m2, m0
+ paddw m0, m2
+ paddw m0, [GLOBAL(pw2_16)]
+ psraw m0, 4
+ pshuflw m0, m0, 0x0
+ punpcklqdq m0, m0
+ packuswb m0, m0
+.loop:
+ mova [dstq ], m0
+ mova [dstq+strideq ], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ dec lines4d
+ jnz .loop
+
+ RESTORE_GOT
+ REP_RET
+
+INIT_XMM sse2
+cglobal dc_left_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ pxor m1, m1
+ pxor m2, m2
+ mova m0, [leftq]
+ DEFINE_ARGS dst, stride, stride3, lines4
+ lea stride3q, [strideq*3]
+ mov lines4d, 4
+ psadbw m0, m1
+ psadbw m2, m1
+ paddw m0, m2
+ movhlps m2, m0
+ paddw m0, m2
+ paddw m0, [GLOBAL(pw2_16)]
+ psraw m0, 4
+ pshuflw m0, m0, 0x0
+ punpcklqdq m0, m0
+ packuswb m0, m0
+.loop:
+ mova [dstq ], m0
+ mova [dstq+strideq ], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ dec lines4d
+ jnz .loop
+
+ RESTORE_GOT
+ REP_RET
+
+INIT_XMM sse2
+cglobal dc_128_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ DEFINE_ARGS dst, stride, stride3, lines4
+ lea stride3q, [strideq*3]
+ mov lines4d, 4
+ mova m0, [GLOBAL(dc_128)]
+.loop:
+ mova [dstq ], m0
+ mova [dstq+strideq ], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ dec lines4d
+ jnz .loop
+ RESTORE_GOT
+ RET
+
+
INIT_XMM sse2
cglobal dc_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset
GET_GOT goffsetq
@@ -142,6 +357,101 @@ cglobal dc_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset
RESTORE_GOT
REP_RET
+INIT_XMM sse2
+cglobal dc_top_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ pxor m1, m1
+ mova m0, [aboveq]
+ mova m2, [aboveq+16]
+ DEFINE_ARGS dst, stride, stride3, lines4
+ lea stride3q, [strideq*3]
+ mov lines4d, 8
+ psadbw m0, m1
+ psadbw m2, m1
+ paddw m0, m2
+ movhlps m2, m0
+ paddw m0, m2
+ paddw m0, [GLOBAL(pw2_32)]
+ psraw m0, 5
+ pshuflw m0, m0, 0x0
+ punpcklqdq m0, m0
+ packuswb m0, m0
+.loop:
+ mova [dstq ], m0
+ mova [dstq +16], m0
+ mova [dstq+strideq ], m0
+ mova [dstq+strideq +16], m0
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*2+16], m0
+ mova [dstq+stride3q ], m0
+ mova [dstq+stride3q +16], m0
+ lea dstq, [dstq+strideq*4]
+ dec lines4d
+ jnz .loop
+
+ RESTORE_GOT
+ REP_RET
+
+INIT_XMM sse2
+cglobal dc_left_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ pxor m1, m1
+ mova m0, [leftq]
+ mova m2, [leftq+16]
+ DEFINE_ARGS dst, stride, stride3, lines4
+ lea stride3q, [strideq*3]
+ mov lines4d, 8
+ psadbw m0, m1
+ psadbw m2, m1
+ paddw m0, m2
+ movhlps m2, m0
+ paddw m0, m2
+ paddw m0, [GLOBAL(pw2_32)]
+ psraw m0, 5
+ pshuflw m0, m0, 0x0
+ punpcklqdq m0, m0
+ packuswb m0, m0
+.loop:
+ mova [dstq ], m0
+ mova [dstq +16], m0
+ mova [dstq+strideq ], m0
+ mova [dstq+strideq +16], m0
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*2+16], m0
+ mova [dstq+stride3q ], m0
+ mova [dstq+stride3q +16], m0
+ lea dstq, [dstq+strideq*4]
+ dec lines4d
+ jnz .loop
+
+ RESTORE_GOT
+ REP_RET
+
+INIT_XMM sse2
+cglobal dc_128_predictor_32x32, 4, 5, 3, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ DEFINE_ARGS dst, stride, stride3, lines4
+ lea stride3q, [strideq*3]
+ mov lines4d, 8
+ mova m0, [GLOBAL(dc_128)]
+.loop:
+ mova [dstq ], m0
+ mova [dstq +16], m0
+ mova [dstq+strideq ], m0
+ mova [dstq+strideq +16], m0
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*2+16], m0
+ mova [dstq+stride3q ], m0
+ mova [dstq+stride3q +16], m0
+ lea dstq, [dstq+strideq*4]
+ dec lines4d
+ jnz .loop
+ RESTORE_GOT
+ RET
+
INIT_MMX sse
cglobal v_predictor_4x4, 3, 3, 1, dst, stride, above
movd m0, [aboveq]
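
The predictors added above fill in the border-only DC cases: dc_left and dc_top average a single edge, using the new pw2_* constants as halved rounding biases (pw2_8 == 4 for an 8-pixel sum shifted right by 3, and likewise for the other sizes), and dc_128 floods the block with mid-gray when neither edge is available. A scalar model of the 8x8 top-only case:

#include <stdint.h>

/* Scalar model of dc_top_predictor_8x8: psadbw sums the row above,
 * paddw pw2_8 adds the bias (4), psraw 3 divides, then the value is
 * broadcast and stored row by row. */
static void dc_top_8x8_model(uint8_t *dst, int stride, const uint8_t *above) {
  int sum = 0, r, c;
  uint8_t dc;
  for (c = 0; c < 8; c++) sum += above[c];
  dc = (uint8_t)((sum + 4) >> 3);
  for (r = 0; r < 8; r++)
    for (c = 0; c < 8; c++) dst[r * stride + c] = dc;
}
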
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_loopfilter_intrin_avx2.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_loopfilter_intrin_avx2.c
index 439c028f29d..0cb0912ad62 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_loopfilter_intrin_avx2.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_loopfilter_intrin_avx2.c
@@ -9,6 +9,7 @@
*/
#include <immintrin.h> /* AVX2 */
+#include "vpx_ports/mem.h"
static void mb_lpf_horizontal_edge_w_avx2_8(unsigned char *s, int p,
const unsigned char *_blimit, const unsigned char *_limit,
@@ -392,6 +393,11 @@ static void mb_lpf_horizontal_edge_w_avx2_8(unsigned char *s, int p,
}
}
+DECLARE_ALIGNED(32, static const uint8_t, filt_loopfilter_avx2[32]) = {
+ 0, 128, 1, 128, 2, 128, 3, 128, 4, 128, 5, 128, 6, 128, 7, 128,
+ 8, 128, 9, 128, 10, 128, 11, 128, 12, 128, 13, 128, 14, 128, 15, 128
+};
+
static void mb_lpf_horizontal_edge_w_avx2_16(unsigned char *s, int p,
const unsigned char *_blimit, const unsigned char *_limit,
const unsigned char *_thresh) {
@@ -401,6 +407,9 @@ static void mb_lpf_horizontal_edge_w_avx2_16(unsigned char *s, int p,
__m128i p7, p6, p5;
__m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
__m128i q5, q6, q7;
+ __m256i p256_7, q256_7, p256_6, q256_6, p256_5, q256_5, p256_4,
+ q256_4, p256_3, q256_3, p256_2, q256_2, p256_1, q256_1,
+ p256_0, q256_0;
const __m128i thresh = _mm_broadcastb_epi8(
_mm_cvtsi32_si128((int) _thresh[0]));
@@ -409,16 +418,37 @@ static void mb_lpf_horizontal_edge_w_avx2_16(unsigned char *s, int p,
const __m128i blimit = _mm_broadcastb_epi8(
_mm_cvtsi32_si128((int) _blimit[0]));
- p4 = _mm_loadu_si128((__m128i *) (s - 5 * p));
- p3 = _mm_loadu_si128((__m128i *) (s - 4 * p));
- p2 = _mm_loadu_si128((__m128i *) (s - 3 * p));
- p1 = _mm_loadu_si128((__m128i *) (s - 2 * p));
- p0 = _mm_loadu_si128((__m128i *) (s - 1 * p));
- q0 = _mm_loadu_si128((__m128i *) (s - 0 * p));
- q1 = _mm_loadu_si128((__m128i *) (s + 1 * p));
- q2 = _mm_loadu_si128((__m128i *) (s + 2 * p));
- q3 = _mm_loadu_si128((__m128i *) (s + 3 * p));
- q4 = _mm_loadu_si128((__m128i *) (s + 4 * p));
+ p256_4 = _mm256_castpd_si256(_mm256_broadcast_pd(
+ (__m128d const *)(s - 5 * p)));
+ p256_3 = _mm256_castpd_si256(_mm256_broadcast_pd(
+ (__m128d const *)(s - 4 * p)));
+ p256_2 = _mm256_castpd_si256(_mm256_broadcast_pd(
+ (__m128d const *)(s - 3 * p)));
+ p256_1 = _mm256_castpd_si256(_mm256_broadcast_pd(
+ (__m128d const *)(s - 2 * p)));
+ p256_0 = _mm256_castpd_si256(_mm256_broadcast_pd(
+ (__m128d const *)(s - 1 * p)));
+ q256_0 = _mm256_castpd_si256(_mm256_broadcast_pd(
+ (__m128d const *)(s - 0 * p)));
+ q256_1 = _mm256_castpd_si256(_mm256_broadcast_pd(
+ (__m128d const *)(s + 1 * p)));
+ q256_2 = _mm256_castpd_si256(_mm256_broadcast_pd(
+ (__m128d const *)(s + 2 * p)));
+ q256_3 = _mm256_castpd_si256(_mm256_broadcast_pd(
+ (__m128d const *)(s + 3 * p)));
+ q256_4 = _mm256_castpd_si256(_mm256_broadcast_pd(
+ (__m128d const *)(s + 4 * p)));
+
+ p4 = _mm256_castsi256_si128(p256_4);
+ p3 = _mm256_castsi256_si128(p256_3);
+ p2 = _mm256_castsi256_si128(p256_2);
+ p1 = _mm256_castsi256_si128(p256_1);
+ p0 = _mm256_castsi256_si128(p256_0);
+ q0 = _mm256_castsi256_si128(q256_0);
+ q1 = _mm256_castsi256_si128(q256_1);
+ q2 = _mm256_castsi256_si128(q256_2);
+ q3 = _mm256_castsi256_si128(q256_3);
+ q4 = _mm256_castsi256_si128(q256_4);
{
const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0),
@@ -534,23 +564,35 @@ static void mb_lpf_horizontal_edge_w_avx2_16(unsigned char *s, int p,
flat = _mm_cmpeq_epi8(flat, zero);
flat = _mm_and_si128(flat, mask);
- p5 = _mm_loadu_si128((__m128i *) (s - 6 * p));
- q5 = _mm_loadu_si128((__m128i *) (s + 5 * p));
+ p256_5 = _mm256_castpd_si256(_mm256_broadcast_pd(
+ (__m128d const *)(s - 6 * p)));
+ q256_5 = _mm256_castpd_si256(_mm256_broadcast_pd(
+ (__m128d const *)(s + 5 * p)));
+ p5 = _mm256_castsi256_si128(p256_5);
+ q5 = _mm256_castsi256_si128(q256_5);
flat2 = _mm_max_epu8(
_mm_or_si128(_mm_subs_epu8(p5, p0), _mm_subs_epu8(p0, p5)),
_mm_or_si128(_mm_subs_epu8(q5, q0), _mm_subs_epu8(q0, q5)));
flat2 = _mm_max_epu8(work, flat2);
- p6 = _mm_loadu_si128((__m128i *) (s - 7 * p));
- q6 = _mm_loadu_si128((__m128i *) (s + 6 * p));
+ p256_6 = _mm256_castpd_si256(_mm256_broadcast_pd(
+ (__m128d const *)(s - 7 * p)));
+ q256_6 = _mm256_castpd_si256(_mm256_broadcast_pd(
+ (__m128d const *)(s + 6 * p)));
+ p6 = _mm256_castsi256_si128(p256_6);
+ q6 = _mm256_castsi256_si128(q256_6);
work = _mm_max_epu8(
_mm_or_si128(_mm_subs_epu8(p6, p0), _mm_subs_epu8(p0, p6)),
_mm_or_si128(_mm_subs_epu8(q6, q0), _mm_subs_epu8(q0, q6)));
flat2 = _mm_max_epu8(work, flat2);
- p7 = _mm_loadu_si128((__m128i *) (s - 8 * p));
- q7 = _mm_loadu_si128((__m128i *) (s + 7 * p));
+ p256_7 = _mm256_castpd_si256(_mm256_broadcast_pd(
+ (__m128d const *)(s - 8 * p)));
+ q256_7 = _mm256_castpd_si256(_mm256_broadcast_pd(
+ (__m128d const *)(s + 7 * p)));
+ p7 = _mm256_castsi256_si128(p256_7);
+ q7 = _mm256_castsi256_si128(q256_7);
work = _mm_max_epu8(
_mm_or_si128(_mm_subs_epu8(p7, p0), _mm_subs_epu8(p0, p7)),
_mm_or_si128(_mm_subs_epu8(q7, q0), _mm_subs_epu8(q0, q7)));
@@ -566,29 +608,28 @@ static void mb_lpf_horizontal_edge_w_avx2_16(unsigned char *s, int p,
{
const __m256i eight = _mm256_set1_epi16(8);
const __m256i four = _mm256_set1_epi16(4);
- __m256i p256_7, q256_7, p256_6, q256_6, p256_5, q256_5, p256_4,
- q256_4, p256_3, q256_3, p256_2, q256_2, p256_1, q256_1,
- p256_0, q256_0;
__m256i pixelFilter_p, pixelFilter_q, pixetFilter_p2p1p0,
pixetFilter_q2q1q0, sum_p7, sum_q7, sum_p3, sum_q3, res_p,
res_q;
- p256_7 = _mm256_cvtepu8_epi16(p7);
- p256_6 = _mm256_cvtepu8_epi16(p6);
- p256_5 = _mm256_cvtepu8_epi16(p5);
- p256_4 = _mm256_cvtepu8_epi16(p4);
- p256_3 = _mm256_cvtepu8_epi16(p3);
- p256_2 = _mm256_cvtepu8_epi16(p2);
- p256_1 = _mm256_cvtepu8_epi16(p1);
- p256_0 = _mm256_cvtepu8_epi16(p0);
- q256_0 = _mm256_cvtepu8_epi16(q0);
- q256_1 = _mm256_cvtepu8_epi16(q1);
- q256_2 = _mm256_cvtepu8_epi16(q2);
- q256_3 = _mm256_cvtepu8_epi16(q3);
- q256_4 = _mm256_cvtepu8_epi16(q4);
- q256_5 = _mm256_cvtepu8_epi16(q5);
- q256_6 = _mm256_cvtepu8_epi16(q6);
- q256_7 = _mm256_cvtepu8_epi16(q7);
+ const __m256i filter = _mm256_load_si256(
+ (__m256i const *)filt_loopfilter_avx2);
+ p256_7 = _mm256_shuffle_epi8(p256_7, filter);
+ p256_6 = _mm256_shuffle_epi8(p256_6, filter);
+ p256_5 = _mm256_shuffle_epi8(p256_5, filter);
+ p256_4 = _mm256_shuffle_epi8(p256_4, filter);
+ p256_3 = _mm256_shuffle_epi8(p256_3, filter);
+ p256_2 = _mm256_shuffle_epi8(p256_2, filter);
+ p256_1 = _mm256_shuffle_epi8(p256_1, filter);
+ p256_0 = _mm256_shuffle_epi8(p256_0, filter);
+ q256_0 = _mm256_shuffle_epi8(q256_0, filter);
+ q256_1 = _mm256_shuffle_epi8(q256_1, filter);
+ q256_2 = _mm256_shuffle_epi8(q256_2, filter);
+ q256_3 = _mm256_shuffle_epi8(q256_3, filter);
+ q256_4 = _mm256_shuffle_epi8(q256_4, filter);
+ q256_5 = _mm256_shuffle_epi8(q256_5, filter);
+ q256_6 = _mm256_shuffle_epi8(q256_6, filter);
+ q256_7 = _mm256_shuffle_epi8(q256_7, filter);
pixelFilter_p = _mm256_add_epi16(_mm256_add_epi16(p256_6, p256_5),
_mm256_add_epi16(p256_4, p256_3));
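
The rewrite above drops the sixteen _mm256_cvtepu8_epi16 conversions: each row is loaded once with _mm256_broadcast_pd, which duplicates the 16 source bytes into both 128-bit halves, and _mm256_shuffle_epi8 against filt_loopfilter_avx2 then widens them, because pshufb emits a zero byte whenever an index has its high bit set. With the {0,128, 1,128, ..., 15,128} mask, the low half yields bytes 0..7 and the high half bytes 8..15, each zero-extended to 16 bits. A scalar model of the per-lane shuffle semantics this relies on:

#include <stdint.h>

/* Per-128-bit-lane semantics of (v)pshufb: an index with the high
 * bit set selects zero, otherwise the low four bits pick a byte. */
static void shuffle_epi8_model(const uint8_t src[16], const uint8_t idx[16],
                               uint8_t out[16]) {
  int i;
  for (i = 0; i < 16; i++)
    out[i] = (idx[i] & 0x80) ? 0 : src[idx[i] & 0x0f];
}
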
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_loopfilter_intrin_sse2.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_loopfilter_intrin_sse2.c
index 320328e2129..8723d32836d 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_loopfilter_intrin_sse2.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_loopfilter_intrin_sse2.c
@@ -729,12 +729,12 @@ void vp9_lpf_horizontal_8_sse2(unsigned char *s, int p,
const unsigned char *_blimit,
const unsigned char *_limit,
const unsigned char *_thresh, int count) {
- DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_op2, 16);
- DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_op1, 16);
- DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_op0, 16);
- DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_oq2, 16);
- DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_oq1, 16);
- DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_oq0, 16);
+ DECLARE_ALIGNED(16, unsigned char, flat_op2[16]);
+ DECLARE_ALIGNED(16, unsigned char, flat_op1[16]);
+ DECLARE_ALIGNED(16, unsigned char, flat_op0[16]);
+ DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]);
+ DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]);
+ DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]);
const __m128i zero = _mm_set1_epi16(0);
const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
const __m128i limit = _mm_load_si128((const __m128i *)_limit);
@@ -948,12 +948,12 @@ void vp9_lpf_horizontal_8_dual_sse2(uint8_t *s, int p,
const uint8_t *_blimit1,
const uint8_t *_limit1,
const uint8_t *_thresh1) {
- DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_op2, 16);
- DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_op1, 16);
- DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_op0, 16);
- DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_oq2, 16);
- DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_oq1, 16);
- DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_oq0, 16);
+ DECLARE_ALIGNED(16, unsigned char, flat_op2[16]);
+ DECLARE_ALIGNED(16, unsigned char, flat_op1[16]);
+ DECLARE_ALIGNED(16, unsigned char, flat_op0[16]);
+ DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]);
+ DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]);
+ DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]);
const __m128i zero = _mm_set1_epi16(0);
const __m128i blimit =
_mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_blimit0),
@@ -1461,7 +1461,7 @@ void vp9_lpf_vertical_4_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0,
const uint8_t *blimit1,
const uint8_t *limit1,
const uint8_t *thresh1) {
- DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 16 * 8);
+ DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 8]);
unsigned char *src[2];
unsigned char *dst[2];
@@ -1484,7 +1484,7 @@ void vp9_lpf_vertical_8_sse2(unsigned char *s, int p,
const unsigned char *blimit,
const unsigned char *limit,
const unsigned char *thresh, int count) {
- DECLARE_ALIGNED_ARRAY(8, unsigned char, t_dst, 8 * 8);
+ DECLARE_ALIGNED(8, unsigned char, t_dst[8 * 8]);
unsigned char *src[1];
unsigned char *dst[1];
(void)count;
@@ -1511,7 +1511,7 @@ void vp9_lpf_vertical_8_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0,
const uint8_t *blimit1,
const uint8_t *limit1,
const uint8_t *thresh1) {
- DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 16 * 8);
+ DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 8]);
unsigned char *src[2];
unsigned char *dst[2];
@@ -1535,7 +1535,7 @@ void vp9_lpf_vertical_16_sse2(unsigned char *s, int p,
const unsigned char *blimit,
const unsigned char *limit,
const unsigned char *thresh) {
- DECLARE_ALIGNED_ARRAY(8, unsigned char, t_dst, 8 * 16);
+ DECLARE_ALIGNED(8, unsigned char, t_dst[8 * 16]);
unsigned char *src[2];
unsigned char *dst[2];
@@ -1562,7 +1562,7 @@ void vp9_lpf_vertical_16_sse2(unsigned char *s, int p,
void vp9_lpf_vertical_16_dual_sse2(unsigned char *s, int p,
const uint8_t *blimit, const uint8_t *limit,
const uint8_t *thresh) {
- DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 256);
+ DECLARE_ALIGNED(16, unsigned char, t_dst[256]);
// Transpose 16x16
transpose8x16(s - 8, s - 8 + 8 * p, p, t_dst, 16);
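
These hunks migrate from the removed DECLARE_ALIGNED_ARRAY(alignment, type, name, count) form to DECLARE_ALIGNED(alignment, type, declarator), which folds the element count into the declarator. A sketch of what the macro expands to (the exact definitions live in vpx_ports/mem.h; abbreviated here):

/* Sketch only; see vpx_ports/mem.h for the real branches. */
#if defined(__GNUC__)
#define DECLARE_ALIGNED(n, typ, val) typ val __attribute__((aligned(n)))
#elif defined(_MSC_VER)
#define DECLARE_ALIGNED(n, typ, val) __declspec(align(n)) typ val
#else
#define DECLARE_ALIGNED(n, typ, val) typ val
#endif

/* Usage after the change: a 16-byte-aligned scratch buffer. */
DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 8]);
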
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_loopfilter_mmx.asm b/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_loopfilter_mmx.asm
index 91055b9f9d4..f5f7d5af784 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_loopfilter_mmx.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_loopfilter_mmx.asm
@@ -601,9 +601,6 @@ align 16
t80:
times 8 db 0x80
align 16
-t1s:
- times 8 db 0x01
-align 16
t3:
times 8 db 0x03
align 16
@@ -612,15 +609,3 @@ t4:
align 16
ones:
times 4 dw 0x0001
-align 16
-s27:
- times 4 dw 0x1b00
-align 16
-s18:
- times 4 dw 0x1200
-align 16
-s9:
- times 4 dw 0x0900
-align 16
-s63:
- times 4 dw 0x003f
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_mfqe_sse2.asm b/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_mfqe_sse2.asm
new file mode 100644
index 00000000000..6029420d114
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_mfqe_sse2.asm
@@ -0,0 +1,287 @@
+;
+; Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+; This file is a duplicate of mfqe_sse2.asm in VP8.
+; TODO(jackychen): Find a way to fix the duplicate.
+%include "vpx_ports/x86_abi_support.asm"
+
+;void vp9_filter_by_weight16x16_sse2
+;(
+; unsigned char *src,
+; int src_stride,
+; unsigned char *dst,
+; int dst_stride,
+; int src_weight
+;)
+global sym(vp9_filter_by_weight16x16_sse2) PRIVATE
+sym(vp9_filter_by_weight16x16_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ SAVE_XMM 6
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ movd xmm0, arg(4) ; src_weight
+ pshuflw xmm0, xmm0, 0x0 ; replicate to all low words
+ punpcklqdq xmm0, xmm0 ; replicate to all hi words
+
+ movdqa xmm1, [GLOBAL(tMFQE)]
+ psubw xmm1, xmm0 ; dst_weight
+
+ mov rax, arg(0) ; src
+ mov rsi, arg(1) ; src_stride
+ mov rdx, arg(2) ; dst
+ mov rdi, arg(3) ; dst_stride
+
+ mov rcx, 16 ; loop count
+ pxor xmm6, xmm6
+
+.combine
+ movdqa xmm2, [rax]
+ movdqa xmm4, [rdx]
+ add rax, rsi
+
+ ; src * src_weight
+ movdqa xmm3, xmm2
+ punpcklbw xmm2, xmm6
+ punpckhbw xmm3, xmm6
+ pmullw xmm2, xmm0
+ pmullw xmm3, xmm0
+
+ ; dst * dst_weight
+ movdqa xmm5, xmm4
+ punpcklbw xmm4, xmm6
+ punpckhbw xmm5, xmm6
+ pmullw xmm4, xmm1
+ pmullw xmm5, xmm1
+
+ ; sum, round and shift
+ paddw xmm2, xmm4
+ paddw xmm3, xmm5
+ paddw xmm2, [GLOBAL(tMFQE_round)]
+ paddw xmm3, [GLOBAL(tMFQE_round)]
+ psrlw xmm2, 4
+ psrlw xmm3, 4
+
+ packuswb xmm2, xmm3
+ movdqa [rdx], xmm2
+ add rdx, rdi
+
+ dec rcx
+ jnz .combine
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+
+ ret
+
+;void vp9_filter_by_weight8x8_sse2
+;(
+; unsigned char *src,
+; int src_stride,
+; unsigned char *dst,
+; int dst_stride,
+; int src_weight
+;)
+global sym(vp9_filter_by_weight8x8_sse2) PRIVATE
+sym(vp9_filter_by_weight8x8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ movd xmm0, arg(4) ; src_weight
+ pshuflw xmm0, xmm0, 0x0 ; replicate to all low words
+ punpcklqdq xmm0, xmm0 ; replicate to all hi words
+
+ movdqa xmm1, [GLOBAL(tMFQE)]
+ psubw xmm1, xmm0 ; dst_weight
+
+ mov rax, arg(0) ; src
+ mov rsi, arg(1) ; src_stride
+ mov rdx, arg(2) ; dst
+ mov rdi, arg(3) ; dst_stride
+
+ mov rcx, 8 ; loop count
+ pxor xmm4, xmm4
+
+.combine
+ movq xmm2, [rax]
+ movq xmm3, [rdx]
+ add rax, rsi
+
+ ; src * src_weight
+ punpcklbw xmm2, xmm4
+ pmullw xmm2, xmm0
+
+ ; dst * dst_weight
+ punpcklbw xmm3, xmm4
+ pmullw xmm3, xmm1
+
+ ; sum, round and shift
+ paddw xmm2, xmm3
+ paddw xmm2, [GLOBAL(tMFQE_round)]
+ psrlw xmm2, 4
+
+ packuswb xmm2, xmm4
+ movq [rdx], xmm2
+ add rdx, rdi
+
+ dec rcx
+ jnz .combine
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+
+ ret
+
+;void vp9_variance_and_sad_16x16_sse2 | arg
+;(
+; unsigned char *src1, 0
+; int stride1, 1
+; unsigned char *src2, 2
+; int stride2, 3
+; unsigned int *variance, 4
+; unsigned int *sad, 5
+;)
+global sym(vp9_variance_and_sad_16x16_sse2) PRIVATE
+sym(vp9_variance_and_sad_16x16_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rax, arg(0) ; src1
+ mov rcx, arg(1) ; stride1
+ mov rdx, arg(2) ; src2
+ mov rdi, arg(3) ; stride2
+
+ mov rsi, 16 ; block height
+
+ ; Prep accumulator registers
+ pxor xmm3, xmm3 ; SAD
+ pxor xmm4, xmm4 ; sum of src2
+ pxor xmm5, xmm5 ; sum of src2^2
+
+ ; Because we're working with the actual output frames
+ ; we can't depend on any kind of data alignment.
+.accumulate
+ movdqa xmm0, [rax] ; src1
+ movdqa xmm1, [rdx] ; src2
+ add rax, rcx ; src1 + stride1
+ add rdx, rdi ; src2 + stride2
+
+ ; SAD(src1, src2)
+ psadbw xmm0, xmm1
+ paddusw xmm3, xmm0
+
+ ; SUM(src2)
+ pxor xmm2, xmm2
+ psadbw xmm2, xmm1 ; sum src2 by misusing SAD against 0
+ paddusw xmm4, xmm2
+
+ ; pmaddubsw would be ideal if it took two unsigned values. instead,
+ ; it expects a signed and an unsigned value. so instead we zero extend
+ ; and operate on words.
+ pxor xmm2, xmm2
+ movdqa xmm0, xmm1
+ punpcklbw xmm0, xmm2
+ punpckhbw xmm1, xmm2
+ pmaddwd xmm0, xmm0
+ pmaddwd xmm1, xmm1
+ paddd xmm5, xmm0
+ paddd xmm5, xmm1
+
+ sub rsi, 1
+ jnz .accumulate
+
+ ; phaddd only operates on adjacent double words.
+ ; Finalize SAD and store
+ movdqa xmm0, xmm3
+ psrldq xmm0, 8
+ paddusw xmm0, xmm3
+ paddd xmm0, [GLOBAL(t128)]
+ psrld xmm0, 8
+
+ mov rax, arg(5)
+ movd [rax], xmm0
+
+ ; Accumulate sum of src2
+ movdqa xmm0, xmm4
+ psrldq xmm0, 8
+ paddusw xmm0, xmm4
+ ; Square src2. Ignore high value
+ pmuludq xmm0, xmm0
+ psrld xmm0, 8
+
+ ; phaddw could be used to sum adjacent values but we want
+ ; all the values summed. promote to doubles, accumulate,
+ ; shift and sum
+ pxor xmm2, xmm2
+ movdqa xmm1, xmm5
+ punpckldq xmm1, xmm2
+ punpckhdq xmm5, xmm2
+ paddd xmm1, xmm5
+ movdqa xmm2, xmm1
+ psrldq xmm1, 8
+ paddd xmm1, xmm2
+
+ psubd xmm1, xmm0
+
+ ; (variance + 128) >> 8
+ paddd xmm1, [GLOBAL(t128)]
+ psrld xmm1, 8
+ mov rax, arg(4)
+
+ movd [rax], xmm1
+
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+SECTION_RODATA
+align 16
+t128:
+%ifndef __NASM_VER__
+ ddq 128
+%elif CONFIG_BIG_ENDIAN
+ dq 0, 128
+%else
+ dq 128, 0
+%endif
+align 16
+tMFQE: ; 1 << MFQE_PRECISION
+ times 8 dw 0x10
+align 16
+tMFQE_round: ; 1 << (MFQE_PRECISION - 1)
+ times 8 dw 0x08
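
The new file implements the MFQE weighted blend: with MFQE_PRECISION == 4, tMFQE holds 1 << 4 and tMFQE_round holds 1 << 3, so each output pixel is (src * src_weight + dst * (16 - src_weight) + 8) >> 4, which the asm computes in 16-bit lanes with pmullw/paddw/psrlw. A scalar model covering both the 16x16 and 8x8 kernels:

#include <stdint.h>

/* Scalar model of vp9_filter_by_weight{16x16,8x8}_sse2. */
static void filter_by_weight_model(const uint8_t *src, int src_stride,
                                   uint8_t *dst, int dst_stride,
                                   int block_size, int src_weight) {
  const int dst_weight = 16 - src_weight;  /* psubw from tMFQE */
  int r, c;
  for (r = 0; r < block_size; r++)
    for (c = 0; c < block_size; c++)
      dst[r * dst_stride + c] =
          (uint8_t)((src[r * src_stride + c] * src_weight +
                     dst[r * dst_stride + c] * dst_weight + 8) >> 4);
}
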
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c
index c4efa6565f3..71dbb402dd4 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c
@@ -312,9 +312,11 @@ void vp9_filter_block1d8_v8_intrin_ssse3(unsigned char *src_ptr,
unsigned int out_pitch,
unsigned int output_height,
int16_t *filter) {
- __m128i addFilterReg64, filtersReg, minReg, srcRegFilt6;
+ __m128i addFilterReg64, filtersReg, minReg;
__m128i firstFilters, secondFilters, thirdFilters, forthFilters;
- __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4, srcRegFilt5;
+ __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt5;
+ __m128i srcReg1, srcReg2, srcReg3, srcReg4, srcReg5, srcReg6, srcReg7;
+ __m128i srcReg8;
unsigned int i;
// create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
@@ -333,27 +335,26 @@ void vp9_filter_block1d8_v8_intrin_ssse3(unsigned char *src_ptr,
  // duplicate only the fourth 16 bits in the filter
forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
+ // load the first 7 rows of 8 bytes
+ srcReg1 = _mm_loadl_epi64((__m128i *)&src_ptr[0]);
+ srcReg2 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch)[0]);
+ srcReg3 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 2)[0]);
+ srcReg4 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 3)[0]);
+ srcReg5 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 4)[0]);
+ srcReg6 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 5)[0]);
+ srcReg7 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 6)[0]);
+
for (i = 0; i < output_height; i++) {
- // load the first 8 bytes
- srcRegFilt1 = _mm_loadl_epi64((__m128i *)&src_ptr[0]);
- // load the next 8 bytes in stride of src_pitch
- srcRegFilt2 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch)[0]);
- srcRegFilt3 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*2)[0]);
- srcRegFilt4 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*3)[0]);
+ // load the last 8 bytes
+ srcReg8 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 7)[0]);
// merge the result together
- srcRegFilt1 = _mm_unpacklo_epi8(srcRegFilt1, srcRegFilt2);
- srcRegFilt3 = _mm_unpacklo_epi8(srcRegFilt3, srcRegFilt4);
-
- // load the next 8 bytes in stride of src_pitch
- srcRegFilt2 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*4)[0]);
- srcRegFilt4 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*5)[0]);
- srcRegFilt5 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*6)[0]);
- srcRegFilt6 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*7)[0]);
+ srcRegFilt1 = _mm_unpacklo_epi8(srcReg1, srcReg2);
+ srcRegFilt3 = _mm_unpacklo_epi8(srcReg3, srcReg4);
// merge the result together
- srcRegFilt2 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt4);
- srcRegFilt5 = _mm_unpacklo_epi8(srcRegFilt5, srcRegFilt6);
+ srcRegFilt2 = _mm_unpacklo_epi8(srcReg5, srcReg6);
+ srcRegFilt5 = _mm_unpacklo_epi8(srcReg7, srcReg8);
// multiply 2 adjacent elements with the filter and add the result
srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
@@ -377,6 +378,15 @@ void vp9_filter_block1d8_v8_intrin_ssse3(unsigned char *src_ptr,
src_ptr+=src_pitch;
+ // shift down a row
+ srcReg1 = srcReg2;
+ srcReg2 = srcReg3;
+ srcReg3 = srcReg4;
+ srcReg4 = srcReg5;
+ srcReg5 = srcReg6;
+ srcReg6 = srcReg7;
+ srcReg7 = srcReg8;
+
// save only 8 bytes convolve result
_mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1);
@@ -390,9 +400,11 @@ void vp9_filter_block1d16_v8_intrin_ssse3(unsigned char *src_ptr,
unsigned int out_pitch,
unsigned int output_height,
int16_t *filter) {
- __m128i addFilterReg64, filtersReg, srcRegFilt1, srcRegFilt2, srcRegFilt3;
+ __m128i addFilterReg64, filtersReg, srcRegFilt1, srcRegFilt3;
__m128i firstFilters, secondFilters, thirdFilters, forthFilters;
- __m128i srcRegFilt4, srcRegFilt5, srcRegFilt6, srcRegFilt7, srcRegFilt8;
+ __m128i srcRegFilt5, srcRegFilt6, srcRegFilt7, srcRegFilt8;
+ __m128i srcReg1, srcReg2, srcReg3, srcReg4, srcReg5, srcReg6, srcReg7;
+ __m128i srcReg8;
unsigned int i;
// create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
@@ -411,19 +423,24 @@ void vp9_filter_block1d16_v8_intrin_ssse3(unsigned char *src_ptr,
  // duplicate only the fourth 16 bits in the filter
forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
+ // load the first 7 rows of 16 bytes
+ srcReg1 = _mm_loadu_si128((__m128i *)(src_ptr));
+ srcReg2 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch));
+ srcReg3 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 2));
+ srcReg4 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 3));
+ srcReg5 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 4));
+ srcReg6 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 5));
+ srcReg7 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 6));
+
for (i = 0; i < output_height; i++) {
- // load the first 16 bytes
- srcRegFilt1 = _mm_loadu_si128((__m128i *)(src_ptr));
- // load the next 16 bytes in stride of src_pitch
- srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch));
- srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*6));
- srcRegFilt4 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*7));
+ // load the last 16 bytes
+ srcReg8 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 7));
// merge the result together
- srcRegFilt5 = _mm_unpacklo_epi8(srcRegFilt1, srcRegFilt2);
- srcRegFilt6 = _mm_unpacklo_epi8(srcRegFilt3, srcRegFilt4);
- srcRegFilt1 = _mm_unpackhi_epi8(srcRegFilt1, srcRegFilt2);
- srcRegFilt3 = _mm_unpackhi_epi8(srcRegFilt3, srcRegFilt4);
+ srcRegFilt5 = _mm_unpacklo_epi8(srcReg1, srcReg2);
+ srcRegFilt6 = _mm_unpacklo_epi8(srcReg7, srcReg8);
+ srcRegFilt1 = _mm_unpackhi_epi8(srcReg1, srcReg2);
+ srcRegFilt3 = _mm_unpackhi_epi8(srcReg7, srcReg8);
// multiply 2 adjacent elements with the filter and add the result
srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, firstFilters);
@@ -435,25 +452,17 @@ void vp9_filter_block1d16_v8_intrin_ssse3(unsigned char *src_ptr,
srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, srcRegFilt6);
srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3);
- // load the next 16 bytes in stride of two/three src_pitch
- srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*2));
- srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*3));
-
// merge the result together
- srcRegFilt4 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt3);
- srcRegFilt6 = _mm_unpackhi_epi8(srcRegFilt2, srcRegFilt3);
+ srcRegFilt3 = _mm_unpacklo_epi8(srcReg3, srcReg4);
+ srcRegFilt6 = _mm_unpackhi_epi8(srcReg3, srcReg4);
// multiply 2 adjacent elements with the filter and add the result
- srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, secondFilters);
+ srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);
srcRegFilt6 = _mm_maddubs_epi16(srcRegFilt6, secondFilters);
- // load the next 16 bytes in stride of four/five src_pitch
- srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*4));
- srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*5));
-
// merge the result together
- srcRegFilt7 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt3);
- srcRegFilt8 = _mm_unpackhi_epi8(srcRegFilt2, srcRegFilt3);
+ srcRegFilt7 = _mm_unpacklo_epi8(srcReg5, srcReg6);
+ srcRegFilt8 = _mm_unpackhi_epi8(srcReg5, srcReg6);
// multiply 2 adjacent elements with the filter and add the result
srcRegFilt7 = _mm_maddubs_epi16(srcRegFilt7, thirdFilters);
@@ -461,13 +470,13 @@ void vp9_filter_block1d16_v8_intrin_ssse3(unsigned char *src_ptr,
// add and saturate the results together
srcRegFilt5 = _mm_adds_epi16(srcRegFilt5,
- _mm_min_epi16(srcRegFilt4, srcRegFilt7));
+ _mm_min_epi16(srcRegFilt3, srcRegFilt7));
srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
_mm_min_epi16(srcRegFilt6, srcRegFilt8));
// add and saturate the results together
srcRegFilt5 = _mm_adds_epi16(srcRegFilt5,
- _mm_max_epi16(srcRegFilt4, srcRegFilt7));
+ _mm_max_epi16(srcRegFilt3, srcRegFilt7));
srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
_mm_max_epi16(srcRegFilt6, srcRegFilt8));
srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, addFilterReg64);
@@ -484,6 +493,15 @@ void vp9_filter_block1d16_v8_intrin_ssse3(unsigned char *src_ptr,
src_ptr+=src_pitch;
+ // shift down a row
+ srcReg1 = srcReg2;
+ srcReg2 = srcReg3;
+ srcReg3 = srcReg4;
+ srcReg4 = srcReg5;
+ srcReg5 = srcReg6;
+ srcReg6 = srcReg7;
+ srcReg7 = srcReg8;
+
// save 16 bytes convolve result
_mm_store_si128((__m128i*)output_ptr, srcRegFilt1);
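
Both vertical-filter rewrites above trade eight row loads per output row for one: the first seven source rows are loaded before the loop, each iteration loads only the eighth, and after the store the registers rotate down a row. A scalar model of the pattern (the intrinsics also bias with 64 and shift by 7, libvpx's FILTER_BITS rounding, which the model reproduces):

#include <stdint.h>

static void vfilter8_model(const uint8_t *src, int src_pitch,
                           uint8_t *dst, int dst_pitch,
                           int width, int height, const int16_t tap[8]) {
  const uint8_t *row[8];
  int r, c, k;
  for (k = 0; k < 7; k++) row[k] = src + k * src_pitch;  /* preload 0..6 */
  for (r = 0; r < height; r++) {
    row[7] = src + (r + 7) * src_pitch;  /* the one new row per step */
    for (c = 0; c < width; c++) {
      int sum = 64;  /* addFilterReg64 */
      for (k = 0; k < 8; k++) sum += row[k][c] * tap[k];
      sum >>= 7;  /* FILTER_BITS */
      dst[r * dst_pitch + c] =
          (uint8_t)(sum < 0 ? 0 : (sum > 255 ? 255 : sum));
    }
    for (k = 0; k < 7; k++) row[k] = row[k + 1];  /* rotate the window */
  }
}
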
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_subpixel_8t_ssse3.asm b/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_subpixel_8t_ssse3.asm
index fd781d4bc6a..4a5bf1b6003 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_subpixel_8t_ssse3.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/x86/vp9_subpixel_8t_ssse3.asm
@@ -18,7 +18,7 @@
mov rcx, 0x0400040
movdqa xmm4, [rdx] ;load filters
- movd xmm5, rcx
+ movq xmm5, rcx
packsswb xmm4, xmm4
pshuflw xmm0, xmm4, 0b ;k0_k1
pshuflw xmm1, xmm4, 01010101b ;k2_k3
@@ -661,7 +661,7 @@ sym(vp9_filter_block1d16_v8_avg_ssse3):
mov rcx, 0x0400040
movdqa xmm4, [rdx] ;load filters
- movd xmm5, rcx
+ movq xmm5, rcx
packsswb xmm4, xmm4
pshuflw xmm0, xmm4, 0b ;k0_k1
pshuflw xmm1, xmm4, 01010101b ;k2_k3
@@ -765,40 +765,50 @@ sym(vp9_filter_block1d16_v8_avg_ssse3):
movq xmm0, [rsi - 3] ;load src data
movq xmm4, [rsi + 5]
- movq xmm7, [rsi + 13]
+ movq xmm6, [rsi + 13]
punpcklqdq xmm0, xmm4
- punpcklqdq xmm4, xmm7
+ punpcklqdq xmm4, xmm6
+
+ movdqa xmm7, xmm0
+ punpcklbw xmm7, xmm7
+ punpckhbw xmm0, xmm0
movdqa xmm1, xmm0
movdqa xmm2, xmm0
movdqa xmm3, xmm0
- movdqa xmm5, xmm4
- movdqa xmm6, xmm4
- movdqa xmm7, xmm4
-
- pshufb xmm0, [GLOBAL(shuf_t0t1)]
- pshufb xmm1, [GLOBAL(shuf_t2t3)]
- pshufb xmm2, [GLOBAL(shuf_t4t5)]
- pshufb xmm3, [GLOBAL(shuf_t6t7)]
- pshufb xmm4, [GLOBAL(shuf_t0t1)]
- pshufb xmm5, [GLOBAL(shuf_t2t3)]
- pshufb xmm6, [GLOBAL(shuf_t4t5)]
- pshufb xmm7, [GLOBAL(shuf_t6t7)]
+ palignr xmm0, xmm7, 1
+ palignr xmm1, xmm7, 5
pmaddubsw xmm0, k0k1
+ palignr xmm2, xmm7, 9
pmaddubsw xmm1, k2k3
+ palignr xmm3, xmm7, 13
+
pmaddubsw xmm2, k4k5
pmaddubsw xmm3, k6k7
- pmaddubsw xmm4, k0k1
- pmaddubsw xmm5, k2k3
- pmaddubsw xmm6, k4k5
- pmaddubsw xmm7, k6k7
-
paddsw xmm0, xmm3
+
+ movdqa xmm3, xmm4
+ punpcklbw xmm3, xmm3
+ punpckhbw xmm4, xmm4
+
+ movdqa xmm5, xmm4
+ movdqa xmm6, xmm4
+ movdqa xmm7, xmm4
+
+ palignr xmm4, xmm3, 1
+ palignr xmm5, xmm3, 5
+ palignr xmm6, xmm3, 9
+ palignr xmm7, xmm3, 13
+
movdqa xmm3, xmm1
+ pmaddubsw xmm4, k0k1
pmaxsw xmm1, xmm2
+ pmaddubsw xmm5, k2k3
pminsw xmm2, xmm3
+ pmaddubsw xmm6, k4k5
paddsw xmm0, xmm2
+ pmaddubsw xmm7, k6k7
paddsw xmm0, xmm1
paddsw xmm4, xmm7
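
Two separate fixes in this file: movd becomes movq, the mnemonic assemblers expect when the source is a 64-bit general register such as rcx, and the shuf_t0t1..shuf_t6t7 table shuffles are replaced by doubling the bytes with punpcklbw/punpckhbw and slicing adjacent pairs out with palignr, removing four 16-byte constant loads from the inner loop. A scalar model of palignr's concatenate-and-shift semantics:

#include <stdint.h>
#include <string.h>

/* palignr dst, src, n: take 16 bytes of the 32-byte value dst:src,
 * starting n bytes up from the least-significant end (src side). */
static void palignr_model(const uint8_t dst_in[16], const uint8_t src[16],
                          int n, uint8_t out[16]) {
  uint8_t cat[32];
  memcpy(cat, src, 16);         /* low 16 bytes  */
  memcpy(cat + 16, dst_in, 16); /* high 16 bytes */
  memcpy(out, cat + n, 16);     /* valid for 0 <= n <= 16 */
}
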
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_decodeframe.c b/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_decodeframe.c
index dc712f04500..eb9b7971074 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_decodeframe.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_decodeframe.c
@@ -23,6 +23,7 @@
#include "vp9/common/vp9_entropy.h"
#include "vp9/common/vp9_entropymode.h"
#include "vp9/common/vp9_idct.h"
+#include "vp9/common/vp9_thread_common.h"
#include "vp9/common/vp9_pred_common.h"
#include "vp9/common/vp9_quant_common.h"
#include "vp9/common/vp9_reconintra.h"
@@ -36,7 +37,6 @@
#include "vp9/decoder/vp9_decodemv.h"
#include "vp9/decoder/vp9_decoder.h"
#include "vp9/decoder/vp9_dsubexp.h"
-#include "vp9/decoder/vp9_dthread.h"
#include "vp9/decoder/vp9_read_bit_buffer.h"
#include "vp9/decoder/vp9_reader.h"
@@ -127,7 +127,7 @@ static REFERENCE_MODE read_frame_reference_mode(const VP9_COMMON *cm,
}
static void read_frame_reference_mode_probs(VP9_COMMON *cm, vp9_reader *r) {
- FRAME_CONTEXT *const fc = &cm->fc;
+ FRAME_CONTEXT *const fc = cm->fc;
int i;
if (cm->reference_mode == REFERENCE_MODE_SELECT)
@@ -181,14 +181,6 @@ static void read_mv_probs(nmv_context *ctx, int allow_hp, vp9_reader *r) {
}
}
-static void setup_plane_dequants(VP9_COMMON *cm, MACROBLOCKD *xd, int q_index) {
- int i;
- xd->plane[0].dequant = cm->y_dequant[q_index];
-
- for (i = 1; i < MAX_MB_PLANE; i++)
- xd->plane[i].dequant = cm->uv_dequant[q_index];
-}
-
static void inverse_transform_block(MACROBLOCKD* xd, int plane, int block,
TX_SIZE tx_size, uint8_t *dst, int stride,
int eob) {
@@ -284,14 +276,14 @@ static void inverse_transform_block(MACROBLOCKD* xd, int plane, int block,
#endif // CONFIG_VP9_HIGHBITDEPTH
if (eob == 1) {
- vpx_memset(dqcoeff, 0, 2 * sizeof(dqcoeff[0]));
+ memset(dqcoeff, 0, 2 * sizeof(dqcoeff[0]));
} else {
if (tx_type == DCT_DCT && tx_size <= TX_16X16 && eob <= 10)
- vpx_memset(dqcoeff, 0, 4 * (4 << tx_size) * sizeof(dqcoeff[0]));
+ memset(dqcoeff, 0, 4 * (4 << tx_size) * sizeof(dqcoeff[0]));
else if (tx_size == TX_32X32 && eob <= 34)
- vpx_memset(dqcoeff, 0, 256 * sizeof(dqcoeff[0]));
+ memset(dqcoeff, 0, 256 * sizeof(dqcoeff[0]));
else
- vpx_memset(dqcoeff, 0, (16 << (tx_size << 1)) * sizeof(dqcoeff[0]));
+ memset(dqcoeff, 0, (16 << (tx_size << 1)) * sizeof(dqcoeff[0]));
}
}
}
@@ -299,7 +291,9 @@ static void inverse_transform_block(MACROBLOCKD* xd, int plane, int block,
struct intra_args {
VP9_COMMON *cm;
MACROBLOCKD *xd;
+ FRAME_COUNTS *counts;
vp9_reader *r;
+ int seg_id;
};
static void predict_and_reconstruct_intra_block(int plane, int block,
@@ -309,7 +303,7 @@ static void predict_and_reconstruct_intra_block(int plane, int block,
VP9_COMMON *const cm = args->cm;
MACROBLOCKD *const xd = args->xd;
struct macroblockd_plane *const pd = &xd->plane[plane];
- MODE_INFO *const mi = xd->mi[0].src_mi;
+ MODE_INFO *const mi = xd->mi[0];
const PREDICTION_MODE mode = (plane == 0) ? get_y_mode(mi, block)
: mi->mbmi.uv_mode;
int x, y;
@@ -323,9 +317,9 @@ static void predict_and_reconstruct_intra_block(int plane, int block,
x, y, plane);
if (!mi->mbmi.skip) {
- const int eob = vp9_decode_block_tokens(cm, xd, plane, block,
+ const int eob = vp9_decode_block_tokens(cm, xd, args->counts, plane, block,
plane_bsize, x, y, tx_size,
- args->r);
+ args->r, args->seg_id);
inverse_transform_block(xd, plane, block, tx_size, dst, pd->dst.stride,
eob);
}
@@ -335,7 +329,9 @@ struct inter_args {
VP9_COMMON *cm;
MACROBLOCKD *xd;
vp9_reader *r;
+ FRAME_COUNTS *counts;
int *eobtotal;
+ int seg_id;
};
static void reconstruct_inter_block(int plane, int block,
@@ -347,8 +343,8 @@ static void reconstruct_inter_block(int plane, int block,
struct macroblockd_plane *const pd = &xd->plane[plane];
int x, y, eob;
txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &x, &y);
- eob = vp9_decode_block_tokens(cm, xd, plane, block, plane_bsize, x, y,
- tx_size, args->r);
+ eob = vp9_decode_block_tokens(cm, xd, args->counts, plane, block, plane_bsize,
+ x, y, tx_size, args->r, args->seg_id);
inverse_transform_block(xd, plane, block, tx_size,
&pd->dst.buf[4 * y * pd->dst.stride + 4 * x],
pd->dst.stride, eob);
@@ -365,13 +361,12 @@ static MB_MODE_INFO *set_offsets(VP9_COMMON *const cm, MACROBLOCKD *const xd,
const int offset = mi_row * cm->mi_stride + mi_col;
int x, y;
- xd->mi = cm->mi + offset;
- xd->mi[0].src_mi = &xd->mi[0]; // Point to self.
- xd->mi[0].mbmi.sb_type = bsize;
-
+ xd->mi = cm->mi_grid_visible + offset;
+ xd->mi[0] = &cm->mi[offset];
+ xd->mi[0]->mbmi.sb_type = bsize;
for (y = 0; y < y_mis; ++y)
for (x = !y; x < x_mis; ++x) {
- xd->mi[y * cm->mi_stride + x].src_mi = &xd->mi[0];
+ xd->mi[y * cm->mi_stride + x] = xd->mi[0];
}
set_skip_context(xd, mi_row, mi_col);
@@ -381,40 +376,38 @@ static MB_MODE_INFO *set_offsets(VP9_COMMON *const cm, MACROBLOCKD *const xd,
set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, cm->mi_rows, cm->mi_cols);
vp9_setup_dst_planes(xd->plane, get_frame_new_buffer(cm), mi_row, mi_col);
- return &xd->mi[0].mbmi;
+ return &xd->mi[0]->mbmi;
}
-static void decode_block(VP9_COMMON *const cm, MACROBLOCKD *const xd,
+static void decode_block(VP9Decoder *const pbi, MACROBLOCKD *const xd,
+ FRAME_COUNTS *counts,
const TileInfo *const tile,
int mi_row, int mi_col,
vp9_reader *r, BLOCK_SIZE bsize) {
+ VP9_COMMON *const cm = &pbi->common;
const int less8x8 = bsize < BLOCK_8X8;
MB_MODE_INFO *mbmi = set_offsets(cm, xd, tile, bsize, mi_row, mi_col);
- vp9_read_mode_info(cm, xd, tile, mi_row, mi_col, r);
+ vp9_read_mode_info(pbi, xd, counts, tile, mi_row, mi_col, r);
if (less8x8)
bsize = BLOCK_8X8;
if (mbmi->skip) {
reset_skip_context(xd, bsize);
- } else {
- if (cm->seg.enabled)
- setup_plane_dequants(cm, xd, vp9_get_qindex(&cm->seg, mbmi->segment_id,
- cm->base_qindex));
}
if (!is_inter_block(mbmi)) {
- struct intra_args arg = { cm, xd, r };
+ struct intra_args arg = {cm, xd, counts, r, mbmi->segment_id};
vp9_foreach_transformed_block(xd, bsize,
predict_and_reconstruct_intra_block, &arg);
} else {
// Prediction
- vp9_dec_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
+ vp9_dec_build_inter_predictors_sb(pbi, xd, mi_row, mi_col, bsize);
// Reconstruction
if (!mbmi->skip) {
int eobtotal = 0;
- struct inter_args arg = { cm, xd, r, &eobtotal };
+ struct inter_args arg = {cm, xd, r, counts, &eobtotal, mbmi->segment_id};
vp9_foreach_transformed_block(xd, bsize, reconstruct_inter_block, &arg);
if (!less8x8 && eobtotal == 0)
mbmi->skip = 1; // skip loopfilter
@@ -424,7 +417,8 @@ static void decode_block(VP9_COMMON *const cm, MACROBLOCKD *const xd,
xd->corrupted |= vp9_reader_has_error(r);
}
-static PARTITION_TYPE read_partition(VP9_COMMON *cm, MACROBLOCKD *xd, int hbs,
+static PARTITION_TYPE read_partition(VP9_COMMON *cm, MACROBLOCKD *xd,
+ FRAME_COUNTS *counts, int hbs,
int mi_row, int mi_col, BLOCK_SIZE bsize,
vp9_reader *r) {
const int ctx = partition_plane_context(xd, mi_row, mi_col, bsize);
@@ -443,15 +437,17 @@ static PARTITION_TYPE read_partition(VP9_COMMON *cm, MACROBLOCKD *xd, int hbs,
p = PARTITION_SPLIT;
if (!cm->frame_parallel_decoding_mode)
- ++cm->counts.partition[ctx][p];
+ ++counts->partition[ctx][p];
return p;
}
-static void decode_partition(VP9_COMMON *const cm, MACROBLOCKD *const xd,
+static void decode_partition(VP9Decoder *const pbi, MACROBLOCKD *const xd,
+ FRAME_COUNTS *counts,
const TileInfo *const tile,
int mi_row, int mi_col,
vp9_reader* r, BLOCK_SIZE bsize) {
+ VP9_COMMON *const cm = &pbi->common;
const int hbs = num_8x8_blocks_wide_lookup[bsize] / 2;
PARTITION_TYPE partition;
BLOCK_SIZE subsize, uv_subsize;
@@ -459,34 +455,37 @@ static void decode_partition(VP9_COMMON *const cm, MACROBLOCKD *const xd,
if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
return;
- partition = read_partition(cm, xd, hbs, mi_row, mi_col, bsize, r);
+ partition = read_partition(cm, xd, counts, hbs, mi_row, mi_col, bsize, r);
subsize = get_subsize(bsize, partition);
uv_subsize = ss_size_lookup[subsize][cm->subsampling_x][cm->subsampling_y];
if (subsize >= BLOCK_8X8 && uv_subsize == BLOCK_INVALID)
- vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
- "Invalid block size.");
+ vpx_internal_error(xd->error_info,
+ VPX_CODEC_CORRUPT_FRAME, "Invalid block size.");
if (subsize < BLOCK_8X8) {
- decode_block(cm, xd, tile, mi_row, mi_col, r, subsize);
+ decode_block(pbi, xd, counts, tile, mi_row, mi_col, r, subsize);
} else {
switch (partition) {
case PARTITION_NONE:
- decode_block(cm, xd, tile, mi_row, mi_col, r, subsize);
+ decode_block(pbi, xd, counts, tile, mi_row, mi_col, r, subsize);
break;
case PARTITION_HORZ:
- decode_block(cm, xd, tile, mi_row, mi_col, r, subsize);
+ decode_block(pbi, xd, counts, tile, mi_row, mi_col, r, subsize);
if (mi_row + hbs < cm->mi_rows)
- decode_block(cm, xd, tile, mi_row + hbs, mi_col, r, subsize);
+ decode_block(pbi, xd, counts, tile, mi_row + hbs, mi_col, r, subsize);
break;
case PARTITION_VERT:
- decode_block(cm, xd, tile, mi_row, mi_col, r, subsize);
+ decode_block(pbi, xd, counts, tile, mi_row, mi_col, r, subsize);
if (mi_col + hbs < cm->mi_cols)
- decode_block(cm, xd, tile, mi_row, mi_col + hbs, r, subsize);
+ decode_block(pbi, xd, counts, tile, mi_row, mi_col + hbs, r, subsize);
break;
case PARTITION_SPLIT:
- decode_partition(cm, xd, tile, mi_row, mi_col, r, subsize);
- decode_partition(cm, xd, tile, mi_row, mi_col + hbs, r, subsize);
- decode_partition(cm, xd, tile, mi_row + hbs, mi_col, r, subsize);
- decode_partition(cm, xd, tile, mi_row + hbs, mi_col + hbs, r, subsize);
+ decode_partition(pbi, xd, counts, tile, mi_row, mi_col, r, subsize);
+ decode_partition(pbi, xd, counts, tile, mi_row, mi_col + hbs, r,
+ subsize);
+ decode_partition(pbi, xd, counts, tile, mi_row + hbs, mi_col, r,
+ subsize);
+ decode_partition(pbi, xd, counts, tile, mi_row + hbs, mi_col + hbs, r,
+ subsize);
break;
default:
assert(0 && "Invalid partition type");
@@ -617,34 +616,54 @@ static void setup_loopfilter(struct loopfilter *lf,
}
}
-static int read_delta_q(struct vp9_read_bit_buffer *rb, int *delta_q) {
- const int old = *delta_q;
- *delta_q = vp9_rb_read_bit(rb) ? vp9_rb_read_signed_literal(rb, 4) : 0;
- return old != *delta_q;
+static INLINE int read_delta_q(struct vp9_read_bit_buffer *rb) {
+ return vp9_rb_read_bit(rb) ? vp9_rb_read_signed_literal(rb, 4) : 0;
}
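The simplified read_delta_q() above matches the VP9 delta-q syntax: a one-bit presence flag followed by a 4-bit signed literal. A hedged sketch of the two bit-buffer helpers it relies on, assuming the usual MSB-first literal with a trailing sign bit (names prefixed sketch_ to mark them as illustrations):

static int sketch_rb_read_literal(struct vp9_read_bit_buffer *rb, int bits) {
  int value = 0, bit;
  for (bit = bits - 1; bit >= 0; bit--)
    value |= vp9_rb_read_bit(rb) << bit;  /* MSB first */
  return value;
}

static int sketch_rb_read_signed_literal(struct vp9_read_bit_buffer *rb,
                                         int bits) {
  const int value = sketch_rb_read_literal(rb, bits);
  return vp9_rb_read_bit(rb) ? -value : value;  /* trailing sign bit */
}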
static void setup_quantization(VP9_COMMON *const cm, MACROBLOCKD *const xd,
struct vp9_read_bit_buffer *rb) {
- int update = 0;
-
cm->base_qindex = vp9_rb_read_literal(rb, QINDEX_BITS);
- update |= read_delta_q(rb, &cm->y_dc_delta_q);
- update |= read_delta_q(rb, &cm->uv_dc_delta_q);
- update |= read_delta_q(rb, &cm->uv_ac_delta_q);
- if (update || cm->bit_depth != cm->dequant_bit_depth) {
- vp9_init_dequantizer(cm);
- cm->dequant_bit_depth = cm->bit_depth;
- }
-
+ cm->y_dc_delta_q = read_delta_q(rb);
+ cm->uv_dc_delta_q = read_delta_q(rb);
+ cm->uv_ac_delta_q = read_delta_q(rb);
+ cm->dequant_bit_depth = cm->bit_depth;
xd->lossless = cm->base_qindex == 0 &&
cm->y_dc_delta_q == 0 &&
cm->uv_dc_delta_q == 0 &&
cm->uv_ac_delta_q == 0;
+
#if CONFIG_VP9_HIGHBITDEPTH
xd->bd = (int)cm->bit_depth;
#endif
}
+static void setup_segmentation_dequant(VP9_COMMON *const cm) {
+ // Build y/uv dequant values based on segmentation.
+ if (cm->seg.enabled) {
+ int i;
+ for (i = 0; i < MAX_SEGMENTS; ++i) {
+ const int qindex = vp9_get_qindex(&cm->seg, i, cm->base_qindex);
+ cm->y_dequant[i][0] = vp9_dc_quant(qindex, cm->y_dc_delta_q,
+ cm->bit_depth);
+ cm->y_dequant[i][1] = vp9_ac_quant(qindex, 0, cm->bit_depth);
+ cm->uv_dequant[i][0] = vp9_dc_quant(qindex, cm->uv_dc_delta_q,
+ cm->bit_depth);
+ cm->uv_dequant[i][1] = vp9_ac_quant(qindex, cm->uv_ac_delta_q,
+ cm->bit_depth);
+ }
+ } else {
+ const int qindex = cm->base_qindex;
+ // When segmentation is disabled, only the first value is used; the
+ // remaining entries are don't-cares.
+ cm->y_dequant[0][0] = vp9_dc_quant(qindex, cm->y_dc_delta_q, cm->bit_depth);
+ cm->y_dequant[0][1] = vp9_ac_quant(qindex, 0, cm->bit_depth);
+ cm->uv_dequant[0][0] = vp9_dc_quant(qindex, cm->uv_dc_delta_q,
+ cm->bit_depth);
+ cm->uv_dequant[0][1] = vp9_ac_quant(qindex, cm->uv_ac_delta_q,
+ cm->bit_depth);
+ }
+}
+
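Each segment above resolves to a {DC, AC} dequantizer pair per plane type via vp9_get_qindex(). A hedged sketch of how such a pair is consumed downstream (generic illustration, not the libvpx API):

#include <stdint.h>

static void sketch_dequant_block(const int16_t dequant[2],
                                 const int16_t *qcoeff, int32_t *dqcoeff,
                                 int n) {
  int i;
  for (i = 0; i < n; ++i)
    dqcoeff[i] = qcoeff[i] * dequant[i != 0];  /* [0] scales DC, [1] the ACs */
}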
static INTERP_FILTER read_interp_filter(struct vp9_read_bit_buffer *rb) {
const INTERP_FILTER literal_to_filter[] = { EIGHTTAP_SMOOTH,
EIGHTTAP,
@@ -667,6 +686,14 @@ static void setup_display_size(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) {
vp9_read_frame_size(rb, &cm->display_width, &cm->display_height);
}
+static void resize_mv_buffer(VP9_COMMON *cm) {
+ vpx_free(cm->cur_frame->mvs);
+ cm->cur_frame->mi_rows = cm->mi_rows;
+ cm->cur_frame->mi_cols = cm->mi_cols;
+ cm->cur_frame->mvs = (MV_REF *)vpx_calloc(cm->mi_rows * cm->mi_cols,
+ sizeof(*cm->cur_frame->mvs));
+}
+
static void resize_context_buffers(VP9_COMMON *cm, int width, int height) {
#if CONFIG_SIZE_LIMIT
if (width > DECODE_WIDTH_LIMIT || height > DECODE_HEIGHT_LIMIT)
@@ -692,14 +719,20 @@ static void resize_context_buffers(VP9_COMMON *cm, int width, int height) {
cm->width = width;
cm->height = height;
}
+ if (cm->cur_frame->mvs == NULL || cm->mi_rows > cm->cur_frame->mi_rows ||
+ cm->mi_cols > cm->cur_frame->mi_cols) {
+ resize_mv_buffer(cm);
+ }
}
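resize_mv_buffer() is invoked under a grow-only policy: the check above reallocates only when the new mi geometry exceeds what cur_frame->mvs was last sized for, so shrinking frames keep reusing the allocation. The same pattern in isolation (generic sketch, not libvpx API):

#include <stdlib.h>

static void sketch_ensure_capacity(void **buf, size_t elem_size,
                                   int *cap_rows, int *cap_cols,
                                   int rows, int cols) {
  if (*buf == NULL || rows > *cap_rows || cols > *cap_cols) {
    free(*buf);  /* grow-only: never shrink, old contents are not kept */
    *buf = calloc((size_t)rows * cols, elem_size);
    *cap_rows = rows;
    *cap_cols = cols;
  }
}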
static void setup_frame_size(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) {
int width, height;
+ BufferPool *const pool = cm->buffer_pool;
vp9_read_frame_size(rb, &width, &height);
resize_context_buffers(cm, width, height);
setup_display_size(cm, rb);
+ lock_buffer_pool(pool);
if (vp9_realloc_frame_buffer(
get_frame_new_buffer(cm), cm->width, cm->height,
cm->subsampling_x, cm->subsampling_y,
@@ -707,14 +740,19 @@ static void setup_frame_size(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) {
cm->use_highbitdepth,
#endif
VP9_DEC_BORDER_IN_PIXELS,
- &cm->frame_bufs[cm->new_fb_idx].raw_frame_buffer, cm->get_fb_cb,
- cm->cb_priv)) {
+ cm->byte_alignment,
+ &pool->frame_bufs[cm->new_fb_idx].raw_frame_buffer, pool->get_fb_cb,
+ pool->cb_priv)) {
+ unlock_buffer_pool(pool);
vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
"Failed to allocate frame buffer");
}
- cm->frame_bufs[cm->new_fb_idx].buf.subsampling_x = cm->subsampling_x;
- cm->frame_bufs[cm->new_fb_idx].buf.subsampling_y = cm->subsampling_y;
- cm->frame_bufs[cm->new_fb_idx].buf.bit_depth = (unsigned int)cm->bit_depth;
+ unlock_buffer_pool(pool);
+
+ pool->frame_bufs[cm->new_fb_idx].buf.subsampling_x = cm->subsampling_x;
+ pool->frame_bufs[cm->new_fb_idx].buf.subsampling_y = cm->subsampling_y;
+ pool->frame_bufs[cm->new_fb_idx].buf.bit_depth = (unsigned int)cm->bit_depth;
+ pool->frame_bufs[cm->new_fb_idx].buf.color_space = cm->color_space;
}
static INLINE int valid_ref_frame_img_fmt(vpx_bit_depth_t ref_bit_depth,
@@ -730,15 +768,12 @@ static void setup_frame_size_with_refs(VP9_COMMON *cm,
int width, height;
int found = 0, i;
int has_valid_ref_frame = 0;
+ BufferPool *const pool = cm->buffer_pool;
for (i = 0; i < REFS_PER_FRAME; ++i) {
if (vp9_rb_read_bit(rb)) {
YV12_BUFFER_CONFIG *const buf = cm->frame_refs[i].buf;
width = buf->y_crop_width;
height = buf->y_crop_height;
- if (buf->corrupted) {
- vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
- "Frame reference is corrupt");
- }
found = 1;
break;
}
@@ -772,12 +807,13 @@ static void setup_frame_size_with_refs(VP9_COMMON *cm,
cm->subsampling_x,
cm->subsampling_y))
vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
- "Referenced frame has incompatible color space");
+ "Referenced frame has incompatible color format");
}
resize_context_buffers(cm, width, height);
setup_display_size(cm, rb);
+ lock_buffer_pool(pool);
if (vp9_realloc_frame_buffer(
get_frame_new_buffer(cm), cm->width, cm->height,
cm->subsampling_x, cm->subsampling_y,
@@ -785,14 +821,19 @@ static void setup_frame_size_with_refs(VP9_COMMON *cm,
cm->use_highbitdepth,
#endif
VP9_DEC_BORDER_IN_PIXELS,
- &cm->frame_bufs[cm->new_fb_idx].raw_frame_buffer, cm->get_fb_cb,
- cm->cb_priv)) {
+ cm->byte_alignment,
+ &pool->frame_bufs[cm->new_fb_idx].raw_frame_buffer, pool->get_fb_cb,
+ pool->cb_priv)) {
+ unlock_buffer_pool(pool);
vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
"Failed to allocate frame buffer");
}
- cm->frame_bufs[cm->new_fb_idx].buf.subsampling_x = cm->subsampling_x;
- cm->frame_bufs[cm->new_fb_idx].buf.subsampling_y = cm->subsampling_y;
- cm->frame_bufs[cm->new_fb_idx].buf.bit_depth = (unsigned int)cm->bit_depth;
+ unlock_buffer_pool(pool);
+
+ pool->frame_bufs[cm->new_fb_idx].buf.subsampling_x = cm->subsampling_x;
+ pool->frame_bufs[cm->new_fb_idx].buf.subsampling_y = cm->subsampling_y;
+ pool->frame_bufs[cm->new_fb_idx].buf.bit_depth = (unsigned int)cm->bit_depth;
+ pool->frame_bufs[cm->new_fb_idx].buf.color_space = cm->color_space;
}
static void setup_tile_info(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) {
@@ -902,12 +943,8 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi,
LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1;
// Be sure to sync as we might be resuming after a failed frame decode.
winterface->sync(&pbi->lf_worker);
- lf_data->frame_buffer = get_frame_new_buffer(cm);
- lf_data->cm = cm;
- vp9_copy(lf_data->planes, pbi->mb.plane);
- lf_data->stop = 0;
- lf_data->y_only = 0;
- vp9_loop_filter_frame_init(cm, cm->lf.filter_level);
+ vp9_loop_filter_data_reset(lf_data, get_frame_new_buffer(cm), cm,
+ pbi->mb.plane);
}
assert(tile_rows <= 4);
@@ -915,11 +952,11 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi,
// Note: this memset assumes above_context[0], [1] and [2]
// are allocated as part of the same buffer.
- vpx_memset(cm->above_context, 0,
- sizeof(*cm->above_context) * MAX_MB_PLANE * 2 * aligned_cols);
+ memset(cm->above_context, 0,
+ sizeof(*cm->above_context) * MAX_MB_PLANE * 2 * aligned_cols);
- vpx_memset(cm->above_seg_context, 0,
- sizeof(*cm->above_seg_context) * aligned_cols);
+ memset(cm->above_seg_context, 0,
+ sizeof(*cm->above_seg_context) * aligned_cols);
get_tile_buffers(pbi, data, data_end, tile_cols, tile_rows, tile_buffers);
@@ -947,7 +984,6 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi,
&tile_data->bit_reader, pbi->decrypt_cb,
pbi->decrypt_state);
init_macroblockd(cm, &tile_data->xd);
- vp9_zero(tile_data->xd.dqcoeff);
}
}
@@ -965,13 +1001,16 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi,
vp9_zero(tile_data->xd.left_seg_context);
for (mi_col = tile.mi_col_start; mi_col < tile.mi_col_end;
mi_col += MI_BLOCK_SIZE) {
- decode_partition(tile_data->cm, &tile_data->xd, &tile, mi_row, mi_col,
- &tile_data->bit_reader, BLOCK_64X64);
+ decode_partition(pbi, &tile_data->xd, &cm->counts, &tile, mi_row,
+ mi_col, &tile_data->bit_reader, BLOCK_64X64);
}
pbi->mb.corrupted |= tile_data->xd.corrupted;
+ if (pbi->mb.corrupted)
+ vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
+ "Failed to decode tile data");
}
// Loopfilter one row.
- if (cm->lf.filter_level && !pbi->mb.corrupted) {
+ if (cm->lf.filter_level) {
const int lf_start = mi_row - MI_BLOCK_SIZE;
LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1;
@@ -990,11 +1029,17 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi,
winterface->execute(&pbi->lf_worker);
}
}
+ // After loopfiltering, the last 7 rows of pixels in each superblock row
+ // may still be changed by the longest loopfilter of the next superblock
+ // row.
+ if (pbi->frame_parallel_decode)
+ vp9_frameworker_broadcast(pbi->cur_buf,
+ mi_row << MI_BLOCK_SIZE_LOG2);
}
}
// Loopfilter remaining rows in the frame.
- if (cm->lf.filter_level && !pbi->mb.corrupted) {
+ if (cm->lf.filter_level) {
LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1;
winterface->sync(&pbi->lf_worker);
lf_data->start = lf_data->stop;
@@ -1005,6 +1050,8 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi,
// Get last tile data.
tile_data = pbi->tile_data + tile_cols * tile_rows - 1;
+ if (pbi->frame_parallel_decode)
+ vp9_frameworker_broadcast(pbi->cur_buf, INT_MAX);
return vp9_reader_find_end(&tile_data->bit_reader);
}
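The vp9_frameworker_broadcast() calls above publish decoded-row progress so a later frame's worker can start referencing this frame before it finishes; INT_MAX marks completion. A minimal sketch of that producer/consumer hand-off in plain pthreads (the real vp9_frameworker_* helpers may differ in detail):

#include <pthread.h>

typedef struct {
  pthread_mutex_t mutex;
  pthread_cond_t cond;
  int rows_done;  /* highest fully-decoded pixel row, or INT_MAX at end */
} SketchRowProgress;

static void sketch_broadcast(SketchRowProgress *p, int row) {
  pthread_mutex_lock(&p->mutex);
  p->rows_done = row;
  pthread_cond_broadcast(&p->cond);  /* wake every waiting consumer */
  pthread_mutex_unlock(&p->mutex);
}

static void sketch_wait(SketchRowProgress *p, int row_needed) {
  pthread_mutex_lock(&p->mutex);
  while (p->rows_done < row_needed)
    pthread_cond_wait(&p->cond, &p->mutex);
  pthread_mutex_unlock(&p->mutex);
}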
@@ -1012,14 +1059,24 @@ static int tile_worker_hook(TileWorkerData *const tile_data,
const TileInfo *const tile) {
int mi_row, mi_col;
+ if (setjmp(tile_data->error_info.jmp)) {
+ tile_data->error_info.setjmp = 0;
+ tile_data->xd.corrupted = 1;
+ return 0;
+ }
+
+ tile_data->error_info.setjmp = 1;
+ tile_data->xd.error_info = &tile_data->error_info;
+
for (mi_row = tile->mi_row_start; mi_row < tile->mi_row_end;
mi_row += MI_BLOCK_SIZE) {
vp9_zero(tile_data->xd.left_context);
vp9_zero(tile_data->xd.left_seg_context);
for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
mi_col += MI_BLOCK_SIZE) {
- decode_partition(tile_data->cm, &tile_data->xd, tile,
- mi_row, mi_col, &tile_data->bit_reader, BLOCK_64X64);
+ decode_partition(tile_data->pbi, &tile_data->xd, &tile_data->counts,
+ tile, mi_row, mi_col, &tile_data->bit_reader,
+ BLOCK_64X64);
}
}
return !tile_data->xd.corrupted;
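The setjmp() at the top of the hook is what lets vpx_internal_error() abort a single tile without taking down the process: on a corrupt tile the error path longjmp()s back to the hook, which marks its own xd.corrupted and returns while the other workers keep going. A stripped-down sketch of the pattern (illustrative names):

#include <setjmp.h>

struct sketch_error_info {
  jmp_buf jmp;
  int setjmp_armed;
};

static void sketch_internal_error(struct sketch_error_info *info) {
  if (info->setjmp_armed)
    longjmp(info->jmp, 1);  /* unwinds straight back to the worker hook */
}

static int sketch_worker_hook(struct sketch_error_info *info) {
  if (setjmp(info->jmp)) {
    info->setjmp_armed = 0;
    return 0;              /* this tile failed; others are unaffected */
  }
  info->setjmp_armed = 1;
  /* ... decode the tile; any error longjmp()s to the setjmp above ... */
  return 1;
}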
@@ -1029,13 +1086,7 @@ static int tile_worker_hook(TileWorkerData *const tile_data,
static int compare_tile_buffers(const void *a, const void *b) {
const TileBuffer *const buf1 = (const TileBuffer*)a;
const TileBuffer *const buf2 = (const TileBuffer*)b;
- if (buf1->size < buf2->size) {
- return 1;
- } else if (buf1->size == buf2->size) {
- return 0;
- } else {
- return -1;
- }
+ return (int)(buf2->size - buf1->size);
}
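compare_tile_buffers() now orders descending by size so the largest tiles are dispatched first and the longest-running work starts earliest. Note the size_t subtraction orders correctly only while two tile sizes differ by less than INT_MAX, which holds for real bitstreams. Usage is the standard qsort() pairing (sketch):

#include <stdlib.h>

static void sketch_sort_tiles_largest_first(TileBuffer *bufs, int n) {
  /* Largest tiles first, so the slowest work is started earliest. */
  qsort(bufs, (size_t)n, sizeof(*bufs), compare_tile_buffers);
}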
static const uint8_t *decode_tiles_mt(VP9Decoder *pbi,
@@ -1065,14 +1116,19 @@ static const uint8_t *decode_tiles_mt(VP9Decoder *pbi,
// use num_threads - 1 workers.
CHECK_MEM_ERROR(cm, pbi->tile_workers,
vpx_malloc(num_threads * sizeof(*pbi->tile_workers)));
+ // Ensure tile data offsets will be properly aligned. This may fail on
+ // platforms without DECLARE_ALIGNED().
+ assert((sizeof(*pbi->tile_worker_data) % 16) == 0);
+ CHECK_MEM_ERROR(cm, pbi->tile_worker_data,
+ vpx_memalign(32, num_threads *
+ sizeof(*pbi->tile_worker_data)));
+ CHECK_MEM_ERROR(cm, pbi->tile_worker_info,
+ vpx_malloc(num_threads * sizeof(*pbi->tile_worker_info)));
for (i = 0; i < num_threads; ++i) {
VP9Worker *const worker = &pbi->tile_workers[i];
++pbi->num_tile_workers;
winterface->init(worker);
- CHECK_MEM_ERROR(cm, worker->data1,
- vpx_memalign(32, sizeof(TileWorkerData)));
- CHECK_MEM_ERROR(cm, worker->data2, vpx_malloc(sizeof(TileInfo)));
if (i < num_threads - 1 && !winterface->reset(worker)) {
vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
"Tile decoder thread creation failed");
@@ -1082,16 +1138,19 @@ static const uint8_t *decode_tiles_mt(VP9Decoder *pbi,
// Reset tile decoding hook
for (n = 0; n < num_workers; ++n) {
- winterface->sync(&pbi->tile_workers[n]);
- pbi->tile_workers[n].hook = (VP9WorkerHook)tile_worker_hook;
+ VP9Worker *const worker = &pbi->tile_workers[n];
+ winterface->sync(worker);
+ worker->hook = (VP9WorkerHook)tile_worker_hook;
+ worker->data1 = &pbi->tile_worker_data[n];
+ worker->data2 = &pbi->tile_worker_info[n];
}
// Note: this memset assumes above_context[0], [1] and [2]
// are allocated as part of the same buffer.
- vpx_memset(cm->above_context, 0,
- sizeof(*cm->above_context) * MAX_MB_PLANE * 2 * aligned_mi_cols);
- vpx_memset(cm->above_seg_context, 0,
- sizeof(*cm->above_seg_context) * aligned_mi_cols);
+ memset(cm->above_context, 0,
+ sizeof(*cm->above_context) * MAX_MB_PLANE * 2 * aligned_mi_cols);
+ memset(cm->above_seg_context, 0,
+ sizeof(*cm->above_seg_context) * aligned_mi_cols);
// Load tile data into tile_buffers
get_tile_buffers(pbi, data, data_end, tile_cols, tile_rows, tile_buffers);
@@ -1116,6 +1175,17 @@ static const uint8_t *decode_tiles_mt(VP9Decoder *pbi,
}
}
+ // Initialize thread frame counts.
+ if (!cm->frame_parallel_decoding_mode) {
+ int i;
+
+ for (i = 0; i < num_workers; ++i) {
+ TileWorkerData *const tile_data =
+ (TileWorkerData*)pbi->tile_workers[i].data1;
+ vp9_zero(tile_data->counts);
+ }
+ }
+
n = 0;
while (n < tile_cols) {
int i;
@@ -1125,15 +1195,14 @@ static const uint8_t *decode_tiles_mt(VP9Decoder *pbi,
TileInfo *const tile = (TileInfo*)worker->data2;
TileBuffer *const buf = &tile_buffers[0][n];
- tile_data->cm = cm;
+ tile_data->pbi = pbi;
tile_data->xd = pbi->mb;
tile_data->xd.corrupted = 0;
- vp9_tile_init(tile, tile_data->cm, 0, buf->col);
+ vp9_tile_init(tile, cm, 0, buf->col);
setup_token_decoder(buf->data, data_end, buf->size, &cm->error,
&tile_data->bit_reader, pbi->decrypt_cb,
pbi->decrypt_state);
init_macroblockd(cm, &tile_data->xd);
- vp9_zero(tile_data->xd.dqcoeff);
worker->had_error = 0;
if (i == num_workers - 1 || n == tile_cols - 1) {
@@ -1151,6 +1220,10 @@ static const uint8_t *decode_tiles_mt(VP9Decoder *pbi,
for (; i > 0; --i) {
VP9Worker *const worker = &pbi->tile_workers[i - 1];
+ // TODO(jzern): The tile may have specific error data associated with
+ // its vpx_internal_error_info which could be propagated to the main info
+ // in cm. Additionally, once the threads have been synced and an error is
+ // detected, there's no point in continuing to decode tiles.
pbi->mb.corrupted |= !winterface->sync(worker);
}
if (final_worker > -1) {
@@ -1159,6 +1232,15 @@ static const uint8_t *decode_tiles_mt(VP9Decoder *pbi,
bit_reader_end = vp9_reader_find_end(&tile_data->bit_reader);
final_worker = -1;
}
+
+ // Accumulate thread frame counts.
+ if (n >= tile_cols && !cm->frame_parallel_decoding_mode) {
+ for (i = 0; i < num_workers; ++i) {
+ TileWorkerData *const tile_data =
+ (TileWorkerData*)pbi->tile_workers[i].data1;
+ vp9_accumulate_frame_counts(cm, &tile_data->counts, 1);
+ }
+ }
}
return bit_reader_end;
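Zeroing each worker's counts before decoding and folding them back with vp9_accumulate_frame_counts() only after every worker has synced keeps the symbol statistics lock-free on the hot path. The accumulation itself is element-wise addition; a generic sketch of what it does across the FRAME_COUNTS fields:

static void sketch_accumulate(unsigned int *total,
                              const unsigned int *per_thread, int n) {
  int i;
  for (i = 0; i < n; ++i)
    total[i] += per_thread[i];  /* applied field-by-field over the
                                   FRAME_COUNTS structure */
}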
@@ -1196,8 +1278,8 @@ static void read_bitdepth_colorspace_sampling(
cm->use_highbitdepth = 0;
#endif
}
- cm->color_space = (COLOR_SPACE)vp9_rb_read_literal(rb, 3);
- if (cm->color_space != SRGB) {
+ cm->color_space = vp9_rb_read_literal(rb, 3);
+ if (cm->color_space != VPX_CS_SRGB) {
vp9_rb_read_bit(rb); // [16,235] (including xvycc) vs [0,255] range
if (cm->profile == PROFILE_1 || cm->profile == PROFILE_3) {
cm->subsampling_x = vp9_rb_read_bit(rb);
@@ -1229,8 +1311,10 @@ static void read_bitdepth_colorspace_sampling(
static size_t read_uncompressed_header(VP9Decoder *pbi,
struct vp9_read_bit_buffer *rb) {
VP9_COMMON *const cm = &pbi->common;
+ RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
+ BufferPool *const pool = pbi->common.buffer_pool;
+ int i, mask, ref_index = 0;
size_t sz;
- int i;
cm->last_frame_type = cm->frame_type;
@@ -1248,16 +1332,24 @@ static size_t read_uncompressed_header(VP9Decoder *pbi,
if (cm->show_existing_frame) {
// Show an existing frame directly.
const int frame_to_show = cm->ref_frame_map[vp9_rb_read_literal(rb, 3)];
-
- if (frame_to_show < 0 || cm->frame_bufs[frame_to_show].ref_count < 1)
+ lock_buffer_pool(pool);
+ if (frame_to_show < 0 || frame_bufs[frame_to_show].ref_count < 1) {
+ unlock_buffer_pool(pool);
vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
"Buffer %d does not contain a decoded frame",
frame_to_show);
+ }
- ref_cnt_fb(cm->frame_bufs, &cm->new_fb_idx, frame_to_show);
+ ref_cnt_fb(frame_bufs, &cm->new_fb_idx, frame_to_show);
+ unlock_buffer_pool(pool);
pbi->refresh_frame_flags = 0;
cm->lf.filter_level = 0;
cm->show_frame = 1;
+
+ if (pbi->frame_parallel_decode) {
+ for (i = 0; i < REF_FRAMES; ++i)
+ cm->next_ref_frame_map[i] = cm->ref_frame_map[i];
+ }
return 0;
}
@@ -1274,12 +1366,15 @@ static size_t read_uncompressed_header(VP9Decoder *pbi,
pbi->refresh_frame_flags = (1 << REF_FRAMES) - 1;
for (i = 0; i < REFS_PER_FRAME; ++i) {
- cm->frame_refs[i].idx = -1;
+ cm->frame_refs[i].idx = INVALID_IDX;
cm->frame_refs[i].buf = NULL;
}
setup_frame_size(cm, rb);
- pbi->need_resync = 0;
+ if (pbi->need_resync) {
+ memset(&cm->ref_frame_map, -1, sizeof(cm->ref_frame_map));
+ pbi->need_resync = 0;
+ }
} else {
cm->intra_only = cm->show_frame ? 0 : vp9_rb_read_bit(rb);
@@ -1295,9 +1390,9 @@ static size_t read_uncompressed_header(VP9Decoder *pbi,
} else {
// NOTE: The intra-only frame header does not include the specification
// of either the color format or color sub-sampling in profile 0. VP9
- // specifies that the default color space should be YUV 4:2:0 in this
+ // specifies that the default color format should be YUV 4:2:0 in this
// case (normative).
- cm->color_space = BT_601;
+ cm->color_space = VPX_CS_BT_601;
cm->subsampling_y = cm->subsampling_x = 1;
cm->bit_depth = VPX_BITS_8;
#if CONFIG_VP9_HIGHBITDEPTH
@@ -1307,15 +1402,18 @@ static size_t read_uncompressed_header(VP9Decoder *pbi,
pbi->refresh_frame_flags = vp9_rb_read_literal(rb, REF_FRAMES);
setup_frame_size(cm, rb);
- pbi->need_resync = 0;
- } else {
+ if (pbi->need_resync) {
+ memset(&cm->ref_frame_map, -1, sizeof(cm->ref_frame_map));
+ pbi->need_resync = 0;
+ }
+ } else if (pbi->need_resync != 1) { /* Skip if a resync is needed */
pbi->refresh_frame_flags = vp9_rb_read_literal(rb, REF_FRAMES);
for (i = 0; i < REFS_PER_FRAME; ++i) {
const int ref = vp9_rb_read_literal(rb, REF_FRAMES_LOG2);
const int idx = cm->ref_frame_map[ref];
RefBuffer *const ref_frame = &cm->frame_refs[i];
ref_frame->idx = idx;
- ref_frame->buf = &cm->frame_bufs[idx].buf;
+ ref_frame->buf = &frame_bufs[idx].buf;
cm->ref_frame_sign_bias[LAST_FRAME + i] = vp9_rb_read_bit(rb);
}
@@ -1338,14 +1436,13 @@ static size_t read_uncompressed_header(VP9Decoder *pbi,
ref_buf->buf->y_crop_height,
cm->width, cm->height);
#endif
- if (vp9_is_scaled(&ref_buf->sf))
- vp9_extend_frame_borders(ref_buf->buf);
}
}
}
#if CONFIG_VP9_HIGHBITDEPTH
get_frame_new_buffer(cm)->bit_depth = cm->bit_depth;
#endif
+ get_frame_new_buffer(cm)->color_space = cm->color_space;
if (pbi->need_resync) {
vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
@@ -1365,12 +1462,37 @@ static size_t read_uncompressed_header(VP9Decoder *pbi,
// below, forcing the use of context 0 for those frame types.
cm->frame_context_idx = vp9_rb_read_literal(rb, FRAME_CONTEXTS_LOG2);
+ // Generate next_ref_frame_map.
+ lock_buffer_pool(pool);
+ for (mask = pbi->refresh_frame_flags; mask; mask >>= 1) {
+ if (mask & 1) {
+ cm->next_ref_frame_map[ref_index] = cm->new_fb_idx;
+ ++frame_bufs[cm->new_fb_idx].ref_count;
+ } else {
+ cm->next_ref_frame_map[ref_index] = cm->ref_frame_map[ref_index];
+ }
+ // Current thread holds the reference frame.
+ if (cm->ref_frame_map[ref_index] >= 0)
+ ++frame_bufs[cm->ref_frame_map[ref_index]].ref_count;
+ ++ref_index;
+ }
+
+ for (; ref_index < REF_FRAMES; ++ref_index) {
+ cm->next_ref_frame_map[ref_index] = cm->ref_frame_map[ref_index];
+ // Current thread holds the reference frame.
+ if (cm->ref_frame_map[ref_index] >= 0)
+ ++frame_bufs[cm->ref_frame_map[ref_index]].ref_count;
+ }
+ unlock_buffer_pool(pool);
+ pbi->hold_ref_buf = 1;
+
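The loop above precomputes next_ref_frame_map and takes one reference per slot so that, in frame-parallel mode, the next frame's worker can adopt the new map while this thread still holds the old one pinned. The same bookkeeping restated without the pool locking (sketch only):

static void sketch_build_next_map(int *next_map, const int *cur_map,
                                  int *ref_counts, unsigned refresh_flags,
                                  int new_idx, int n) {
  int i;
  for (i = 0; i < n; ++i) {
    if ((refresh_flags >> i) & 1) {
      next_map[i] = new_idx;
      ++ref_counts[new_idx];     /* next map references the new buffer */
    } else {
      next_map[i] = cur_map[i];  /* slot carried over unchanged */
    }
    if (cur_map[i] >= 0)
      ++ref_counts[cur_map[i]];  /* this thread keeps the old slot alive */
  }
}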
if (frame_is_intra_only(cm) || cm->error_resilient_mode)
vp9_setup_past_independence(cm);
setup_loopfilter(&cm->lf, rb);
setup_quantization(cm, &pbi->mb, rb);
setup_segmentation(&cm->seg, rb);
+ setup_segmentation_dequant(cm);
setup_tile_info(cm, rb);
sz = vp9_rb_read_literal(rb, 16);
@@ -1386,7 +1508,7 @@ static int read_compressed_header(VP9Decoder *pbi, const uint8_t *data,
size_t partition_size) {
VP9_COMMON *const cm = &pbi->common;
MACROBLOCKD *const xd = &pbi->mb;
- FRAME_CONTEXT *const fc = &cm->fc;
+ FRAME_CONTEXT *const fc = cm->fc;
vp9_reader r;
int k;
@@ -1434,18 +1556,6 @@ static int read_compressed_header(VP9Decoder *pbi, const uint8_t *data,
return vp9_reader_has_error(&r);
}
-void vp9_init_dequantizer(VP9_COMMON *cm) {
- int q;
-
- for (q = 0; q < QINDEX_RANGE; q++) {
- cm->y_dequant[q][0] = vp9_dc_quant(q, cm->y_dc_delta_q, cm->bit_depth);
- cm->y_dequant[q][1] = vp9_ac_quant(q, 0, cm->bit_depth);
-
- cm->uv_dequant[q][0] = vp9_dc_quant(q, cm->uv_dc_delta_q, cm->bit_depth);
- cm->uv_dequant[q][1] = vp9_ac_quant(q, cm->uv_ac_delta_q, cm->bit_depth);
- }
-}
-
#ifdef NDEBUG
#define debug_check_frame_counts(cm) (void)0
#else // !NDEBUG
@@ -1510,7 +1620,7 @@ void vp9_decode_frame(VP9Decoder *pbi,
VP9_COMMON *const cm = &pbi->common;
MACROBLOCKD *const xd = &pbi->mb;
struct vp9_read_bit_buffer rb = { NULL, NULL, 0, NULL, 0};
-
+ int context_updated = 0;
uint8_t clear_data[MAX_VP9_HEADER_SIZE];
const size_t first_partition_size = read_uncompressed_header(pbi,
init_read_bit_buffer(pbi, &rb, data, data_end, clear_data));
@@ -1530,40 +1640,68 @@ void vp9_decode_frame(VP9Decoder *pbi,
vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
"Truncated packet or corrupt header length");
- init_macroblockd(cm, &pbi->mb);
-
- if (!cm->error_resilient_mode)
- set_prev_mi(cm);
- else
- cm->prev_mi = NULL;
+ cm->use_prev_frame_mvs = !cm->error_resilient_mode &&
+ cm->width == cm->last_width &&
+ cm->height == cm->last_height &&
+ !cm->intra_only &&
+ cm->last_show_frame;
- setup_plane_dequants(cm, xd, cm->base_qindex);
vp9_setup_block_planes(xd, cm->subsampling_x, cm->subsampling_y);
- cm->fc = cm->frame_contexts[cm->frame_context_idx];
+ *cm->fc = cm->frame_contexts[cm->frame_context_idx];
+ if (!cm->fc->initialized)
+ vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
+ "Uninitialized entropy context.");
+
vp9_zero(cm->counts);
- vp9_zero(xd->dqcoeff);
xd->corrupted = 0;
new_fb->corrupted = read_compressed_header(pbi, data, first_partition_size);
+ if (new_fb->corrupted)
+ vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
+ "Decode failed. Frame data header is corrupted.");
+
+ if (cm->lf.filter_level) {
+ vp9_loop_filter_frame_init(cm, cm->lf.filter_level);
+ }
- // TODO(jzern): remove frame_parallel_decoding_mode restriction for
- // single-frame tile decoding.
- if (pbi->max_threads > 1 && tile_rows == 1 && tile_cols > 1 &&
- cm->frame_parallel_decoding_mode) {
+ // If encoded in frame parallel mode, frame context is ready after decoding
+ // the frame header.
+ if (pbi->frame_parallel_decode && cm->frame_parallel_decoding_mode) {
+ VP9Worker *const worker = pbi->frame_worker_owner;
+ FrameWorkerData *const frame_worker_data = worker->data1;
+ if (cm->refresh_frame_context) {
+ context_updated = 1;
+ cm->frame_contexts[cm->frame_context_idx] = *cm->fc;
+ }
+ vp9_frameworker_lock_stats(worker);
+ pbi->cur_buf->row = -1;
+ pbi->cur_buf->col = -1;
+ frame_worker_data->frame_context_ready = 1;
+ // Signal the main thread that context is ready.
+ vp9_frameworker_signal_stats(worker);
+ vp9_frameworker_unlock_stats(worker);
+ }
+
+ if (pbi->max_threads > 1 && tile_rows == 1 && tile_cols > 1) {
+ // Multi-threaded tile decoder
*p_data_end = decode_tiles_mt(pbi, data + first_partition_size, data_end);
if (!xd->corrupted) {
// If multiple threads are used to decode tiles, then we use those threads
// to do parallel loopfiltering.
- vp9_loop_filter_frame_mt(new_fb, pbi, cm, cm->lf.filter_level, 0);
+ vp9_loop_filter_frame_mt(new_fb, cm, pbi->mb.plane, cm->lf.filter_level,
+ 0, 0, pbi->tile_workers, pbi->num_tile_workers,
+ &pbi->lf_row_sync);
+ } else {
+ vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
+ "Decode failed. Frame data is corrupted.");
+
}
} else {
*p_data_end = decode_tiles(pbi, data + first_partition_size, data_end);
}
- new_fb->corrupted |= xd->corrupted;
-
- if (!new_fb->corrupted) {
+ if (!xd->corrupted) {
if (!cm->error_resilient_mode && !cm->frame_parallel_decoding_mode) {
vp9_adapt_coef_probs(cm);
@@ -1579,6 +1717,331 @@ void vp9_decode_frame(VP9Decoder *pbi,
"Decode failed. Frame data is corrupted.");
}
- if (cm->refresh_frame_context)
- cm->frame_contexts[cm->frame_context_idx] = cm->fc;
+ // Non frame parallel update frame context here.
+ if (cm->refresh_frame_context && !context_updated)
+ cm->frame_contexts[cm->frame_context_idx] = *cm->fc;
+}
+
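When the stream was encoded with frame_parallel_decoding_mode, the entropy context is final once the headers are read, so the block above publishes it early: set the flag under the stats lock, then wake the waiting thread. In pthread terms, assuming the vp9_frameworker_*_stats helpers wrap a mutex/condvar pair (sketch):

#include <pthread.h>

static void sketch_publish_context_ready(pthread_mutex_t *mutex,
                                         pthread_cond_t *cond,
                                         int *frame_context_ready) {
  pthread_mutex_lock(mutex);
  *frame_context_ready = 1;
  pthread_cond_signal(cond);  /* wake the consumer waiting on the flag */
  pthread_mutex_unlock(mutex);
}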
+static void build_mc_border(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride,
+ int x, int y, int b_w, int b_h, int w, int h) {
+ // Get a pointer to the start of the real data for this row.
+ const uint8_t *ref_row = src - x - y * src_stride;
+
+ if (y >= h)
+ ref_row += (h - 1) * src_stride;
+ else if (y > 0)
+ ref_row += y * src_stride;
+
+ do {
+ int right = 0, copy;
+ int left = x < 0 ? -x : 0;
+
+ if (left > b_w)
+ left = b_w;
+
+ if (x + b_w > w)
+ right = x + b_w - w;
+
+ if (right > b_w)
+ right = b_w;
+
+ copy = b_w - left - right;
+
+ if (left)
+ memset(dst, ref_row[0], left);
+
+ if (copy)
+ memcpy(dst + left, ref_row + x + left, copy);
+
+ if (right)
+ memset(dst + left + copy, ref_row[w - 1], right);
+
+ dst += dst_stride;
+ ++y;
+
+ if (y > 0 && y < h)
+ ref_row += src_stride;
+ } while (--b_h);
+}
+
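build_mc_border() above replicates edge pixels so out-of-frame taps read the nearest valid sample, i.e. dst(x, y) = src(clamp(x, 0, w-1), clamp(y, 0, h-1)). A tiny runnable illustration of that clamping rule (not the function itself, which writes row-wise with memset/memcpy):

#include <stdint.h>
#include <stdio.h>

static uint8_t clamp_px(const uint8_t *src, int stride, int w, int h,
                        int x, int y) {
  if (x < 0) x = 0; else if (x > w - 1) x = w - 1;
  if (y < 0) y = 0; else if (y > h - 1) y = h - 1;
  return src[y * stride + x];
}

int main(void) {
  const uint8_t src[4] = { 10, 20, 30, 40 };  /* 2x2 frame, stride 2 */
  int x, y;
  /* Fetch a 4x4 block at x = -2, y = -1: edges replicate outward. */
  for (y = -1; y < 3; ++y) {
    for (x = -2; x < 2; ++x)
      printf("%3d ", clamp_px(src, 2, 2, 2, x, y));
    printf("\n");
  }
  return 0;
}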
+#if CONFIG_VP9_HIGHBITDEPTH
+static void high_build_mc_border(const uint8_t *src8, int src_stride,
+ uint16_t *dst, int dst_stride,
+ int x, int y, int b_w, int b_h,
+ int w, int h) {
+ // Get a pointer to the start of the real data for this row.
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *ref_row = src - x - y * src_stride;
+
+ if (y >= h)
+ ref_row += (h - 1) * src_stride;
+ else if (y > 0)
+ ref_row += y * src_stride;
+
+ do {
+ int right = 0, copy;
+ int left = x < 0 ? -x : 0;
+
+ if (left > b_w)
+ left = b_w;
+
+ if (x + b_w > w)
+ right = x + b_w - w;
+
+ if (right > b_w)
+ right = b_w;
+
+ copy = b_w - left - right;
+
+ if (left)
+ vpx_memset16(dst, ref_row[0], left);
+
+ if (copy)
+ memcpy(dst + left, ref_row + x + left, copy * sizeof(uint16_t));
+
+ if (right)
+ vpx_memset16(dst + left + copy, ref_row[w - 1], right);
+
+ dst += dst_stride;
+ ++y;
+
+ if (y > 0 && y < h)
+ ref_row += src_stride;
+ } while (--b_h);
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+void dec_build_inter_predictors(VP9Decoder *const pbi, MACROBLOCKD *xd,
+ int plane, int bw, int bh, int x,
+ int y, int w, int h, int mi_x, int mi_y,
+ const InterpKernel *kernel,
+ const struct scale_factors *sf,
+ struct buf_2d *pre_buf, struct buf_2d *dst_buf,
+ const MV* mv, RefCntBuffer *ref_frame_buf,
+ int is_scaled, int ref) {
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ uint8_t *const dst = dst_buf->buf + dst_buf->stride * y + x;
+ MV32 scaled_mv;
+ int xs, ys, x0, y0, x0_16, y0_16, frame_width, frame_height,
+ buf_stride, subpel_x, subpel_y;
+ uint8_t *ref_frame, *buf_ptr;
+
+ // Get reference frame pointer, width and height.
+ if (plane == 0) {
+ frame_width = ref_frame_buf->buf.y_crop_width;
+ frame_height = ref_frame_buf->buf.y_crop_height;
+ ref_frame = ref_frame_buf->buf.y_buffer;
+ } else {
+ frame_width = ref_frame_buf->buf.uv_crop_width;
+ frame_height = ref_frame_buf->buf.uv_crop_height;
+ ref_frame = plane == 1 ? ref_frame_buf->buf.u_buffer
+ : ref_frame_buf->buf.v_buffer;
+ }
+
+ if (is_scaled) {
+ const MV mv_q4 = clamp_mv_to_umv_border_sb(xd, mv, bw, bh,
+ pd->subsampling_x,
+ pd->subsampling_y);
+ // Co-ordinate of containing block to pixel precision.
+ int x_start = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x));
+ int y_start = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y));
+
+ // Co-ordinate of the block to 1/16th pixel precision.
+ x0_16 = (x_start + x) << SUBPEL_BITS;
+ y0_16 = (y_start + y) << SUBPEL_BITS;
+
+ // Co-ordinate of current block in reference frame
+ // to 1/16th pixel precision.
+ x0_16 = sf->scale_value_x(x0_16, sf);
+ y0_16 = sf->scale_value_y(y0_16, sf);
+
+ // Map the top left corner of the block into the reference frame.
+ x0 = sf->scale_value_x(x_start + x, sf);
+ y0 = sf->scale_value_y(y_start + y, sf);
+
+ // Scale the MV and incorporate the sub-pixel offset of the block
+ // in the reference frame.
+ scaled_mv = vp9_scale_mv(&mv_q4, mi_x + x, mi_y + y, sf);
+ xs = sf->x_step_q4;
+ ys = sf->y_step_q4;
+ } else {
+ // Co-ordinate of containing block to pixel precision.
+ x0 = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x)) + x;
+ y0 = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y)) + y;
+
+ // Co-ordinate of the block to 1/16th pixel precision.
+ x0_16 = x0 << SUBPEL_BITS;
+ y0_16 = y0 << SUBPEL_BITS;
+
+ scaled_mv.row = mv->row * (1 << (1 - pd->subsampling_y));
+ scaled_mv.col = mv->col * (1 << (1 - pd->subsampling_x));
+ xs = ys = 16;
+ }
+ subpel_x = scaled_mv.col & SUBPEL_MASK;
+ subpel_y = scaled_mv.row & SUBPEL_MASK;
+
+ // Calculate the top left corner of the best matching block in the
+ // reference frame.
+ x0 += scaled_mv.col >> SUBPEL_BITS;
+ y0 += scaled_mv.row >> SUBPEL_BITS;
+ x0_16 += scaled_mv.col;
+ y0_16 += scaled_mv.row;
+
+ // Get reference block pointer.
+ buf_ptr = ref_frame + y0 * pre_buf->stride + x0;
+ buf_stride = pre_buf->stride;
+
+ // Do border extension if there is motion or the
+ // width/height is not a multiple of 8 pixels.
+ if (is_scaled || scaled_mv.col || scaled_mv.row ||
+ (frame_width & 0x7) || (frame_height & 0x7)) {
+ int y1 = (y0_16 + (h - 1) * ys) >> SUBPEL_BITS;
+
+ // Get reference block bottom right horizontal coordinate.
+ int x1 = (x0_16 + (w - 1) * xs) >> SUBPEL_BITS;
+ int x_pad = 0, y_pad = 0;
+
+ if (subpel_x || (sf->x_step_q4 != SUBPEL_SHIFTS)) {
+ x0 -= VP9_INTERP_EXTEND - 1;
+ x1 += VP9_INTERP_EXTEND;
+ x_pad = 1;
+ }
+
+ if (subpel_y || (sf->y_step_q4 != SUBPEL_SHIFTS)) {
+ y0 -= VP9_INTERP_EXTEND - 1;
+ y1 += VP9_INTERP_EXTEND;
+ y_pad = 1;
+ }
+
+ // Wait until the reference block is ready. Pad 7 more pixels, as the
+ // last 7 rows of each superblock row can still be changed by the next one.
+ if (pbi->frame_parallel_decode)
+ vp9_frameworker_wait(pbi->frame_worker_owner, ref_frame_buf,
+ MAX(0, (y1 + 7)) << (plane == 0 ? 0 : 1));
+
+ // Extend the border only when the block reaches outside the frame.
+ if (x0 < 0 || x0 > frame_width - 1 || x1 < 0 || x1 > frame_width - 1 ||
+ y0 < 0 || y0 > frame_height - 1 || y1 < 0 || y1 > frame_height - 1) {
+ uint8_t *buf_ptr1 = ref_frame + y0 * pre_buf->stride + x0;
+ // Extend the border.
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ high_build_mc_border(buf_ptr1,
+ pre_buf->stride,
+ xd->mc_buf_high,
+ x1 - x0 + 1,
+ x0,
+ y0,
+ x1 - x0 + 1,
+ y1 - y0 + 1,
+ frame_width,
+ frame_height);
+ buf_stride = x1 - x0 + 1;
+ buf_ptr = CONVERT_TO_BYTEPTR(xd->mc_buf_high) +
+ y_pad * 3 * buf_stride + x_pad * 3;
+ } else {
+ build_mc_border(buf_ptr1,
+ pre_buf->stride,
+ xd->mc_buf,
+ x1 - x0 + 1,
+ x0,
+ y0,
+ x1 - x0 + 1,
+ y1 - y0 + 1,
+ frame_width,
+ frame_height);
+ buf_stride = x1 - x0 + 1;
+ buf_ptr = xd->mc_buf + y_pad * 3 * buf_stride + x_pad * 3;
+ }
+#else
+ build_mc_border(buf_ptr1,
+ pre_buf->stride,
+ xd->mc_buf,
+ x1 - x0 + 1,
+ x0,
+ y0,
+ x1 - x0 + 1,
+ y1 - y0 + 1,
+ frame_width,
+ frame_height);
+ buf_stride = x1 - x0 + 1;
+ buf_ptr = xd->mc_buf + y_pad * 3 * buf_stride + x_pad * 3;
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ }
+ } else {
+ // Wait until the reference block is ready. Pad 7 more pixels, as the
+ // last 7 rows of each superblock row can still be changed by the next one.
+ if (pbi->frame_parallel_decode) {
+ const int y1 = ((y0_16 + (h - 1) * ys) >> SUBPEL_BITS) + 1;
+ vp9_frameworker_wait(pbi->frame_worker_owner, ref_frame_buf,
+ MAX(0, (y1 + 7)) << (plane == 0 ? 0 : 1));
+ }
+ }
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ high_inter_predictor(buf_ptr, buf_stride, dst, dst_buf->stride, subpel_x,
+ subpel_y, sf, w, h, ref, kernel, xs, ys, xd->bd);
+ } else {
+ inter_predictor(buf_ptr, buf_stride, dst, dst_buf->stride, subpel_x,
+ subpel_y, sf, w, h, ref, kernel, xs, ys);
+ }
+#else
+ inter_predictor(buf_ptr, buf_stride, dst, dst_buf->stride, subpel_x,
+ subpel_y, sf, w, h, ref, kernel, xs, ys);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+}
+
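In the unscaled path above, the shift (1 << (1 - subsampling)) converts the 1/8-pel motion vector into the interpolator's 1/16-pel units: a full-resolution plane doubles the MV, while a half-size 4:2:0 chroma plane uses it as-is. The integer offset and sub-pel phase then fall out of SUBPEL_BITS (4) and SUBPEL_MASK (15). A worked check:

#include <assert.h>

int main(void) {
  const int mv_col = 37;                       /* 1/8-pel luma units */
  const int luma = mv_col * (1 << (1 - 0));    /* 74 in 1/16-pel */
  const int chroma = mv_col * (1 << (1 - 1));  /* 37 in 1/16-pel */
  assert(luma >> 4 == 4 && (luma & 15) == 10);    /* +4 px, phase 10/16 */
  assert(chroma >> 4 == 2 && (chroma & 15) == 5); /* +2 px, phase 5/16 */
  return 0;
}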
+void vp9_dec_build_inter_predictors_sb(VP9Decoder *const pbi, MACROBLOCKD *xd,
+ int mi_row, int mi_col,
+ BLOCK_SIZE bsize) {
+ int plane;
+ const int mi_x = mi_col * MI_SIZE;
+ const int mi_y = mi_row * MI_SIZE;
+ const MODE_INFO *mi = xd->mi[0];
+ const InterpKernel *kernel = vp9_get_interp_kernel(mi->mbmi.interp_filter);
+ const BLOCK_SIZE sb_type = mi->mbmi.sb_type;
+ const int is_compound = has_second_ref(&mi->mbmi);
+
+ for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+ const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize,
+ &xd->plane[plane]);
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ struct buf_2d *const dst_buf = &pd->dst;
+ const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
+ const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
+
+ const int bw = 4 * num_4x4_w;
+ const int bh = 4 * num_4x4_h;
+ int ref;
+
+ for (ref = 0; ref < 1 + is_compound; ++ref) {
+ const struct scale_factors *const sf = &xd->block_refs[ref]->sf;
+ struct buf_2d *const pre_buf = &pd->pre[ref];
+ const int idx = xd->block_refs[ref]->idx;
+ BufferPool *const pool = pbi->common.buffer_pool;
+ RefCntBuffer *const ref_frame_buf = &pool->frame_bufs[idx];
+ const int is_scaled = vp9_is_scaled(sf);
+
+ if (sb_type < BLOCK_8X8) {
+ int i = 0, x, y;
+ assert(bsize == BLOCK_8X8);
+ for (y = 0; y < num_4x4_h; ++y) {
+ for (x = 0; x < num_4x4_w; ++x) {
+ const MV mv = average_split_mvs(pd, mi, ref, i++);
+ dec_build_inter_predictors(pbi, xd, plane, bw, bh,
+ 4 * x, 4 * y, 4, 4, mi_x, mi_y, kernel,
+ sf, pre_buf, dst_buf, &mv,
+ ref_frame_buf, is_scaled, ref);
+ }
+ }
+ } else {
+ const MV mv = mi->mbmi.mv[ref].as_mv;
+ dec_build_inter_predictors(pbi, xd, plane, bw, bh,
+ 0, 0, bw, bh, mi_x, mi_y, kernel,
+ sf, pre_buf, dst_buf, &mv, ref_frame_buf,
+ is_scaled, ref);
+ }
+ }
+ }
}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_decodeframe.h b/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_decodeframe.h
index 10a9e34629b..8410c541e45 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_decodeframe.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_decodeframe.h
@@ -31,6 +31,9 @@ void vp9_read_frame_size(struct vp9_read_bit_buffer *rb,
int *width, int *height);
BITSTREAM_PROFILE vp9_read_profile(struct vp9_read_bit_buffer *rb);
+void vp9_dec_build_inter_predictors_sb(struct VP9Decoder *const pbi,
+ MACROBLOCKD *xd, int mi_row, int mi_col,
+ BLOCK_SIZE bsize);
#ifdef __cplusplus
} // extern "C"
#endif
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_decodemv.c b/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_decodemv.c
index a01fe842ee2..ce6ff997778 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_decodemv.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_decodemv.c
@@ -27,29 +27,31 @@ static PREDICTION_MODE read_intra_mode(vp9_reader *r, const vp9_prob *p) {
return (PREDICTION_MODE)vp9_read_tree(r, vp9_intra_mode_tree, p);
}
-static PREDICTION_MODE read_intra_mode_y(VP9_COMMON *cm, vp9_reader *r,
- int size_group) {
+static PREDICTION_MODE read_intra_mode_y(VP9_COMMON *cm, FRAME_COUNTS *counts,
+ vp9_reader *r, int size_group) {
const PREDICTION_MODE y_mode =
- read_intra_mode(r, cm->fc.y_mode_prob[size_group]);
+ read_intra_mode(r, cm->fc->y_mode_prob[size_group]);
if (!cm->frame_parallel_decoding_mode)
- ++cm->counts.y_mode[size_group][y_mode];
+ ++counts->y_mode[size_group][y_mode];
return y_mode;
}
-static PREDICTION_MODE read_intra_mode_uv(VP9_COMMON *cm, vp9_reader *r,
+static PREDICTION_MODE read_intra_mode_uv(VP9_COMMON *cm, FRAME_COUNTS *counts,
+ vp9_reader *r,
PREDICTION_MODE y_mode) {
const PREDICTION_MODE uv_mode = read_intra_mode(r,
- cm->fc.uv_mode_prob[y_mode]);
+ cm->fc->uv_mode_prob[y_mode]);
if (!cm->frame_parallel_decoding_mode)
- ++cm->counts.uv_mode[y_mode][uv_mode];
+ ++counts->uv_mode[y_mode][uv_mode];
return uv_mode;
}
-static PREDICTION_MODE read_inter_mode(VP9_COMMON *cm, vp9_reader *r, int ctx) {
+static PREDICTION_MODE read_inter_mode(VP9_COMMON *cm, FRAME_COUNTS *counts,
+ vp9_reader *r, int ctx) {
const int mode = vp9_read_tree(r, vp9_inter_mode_tree,
- cm->fc.inter_mode_probs[ctx]);
+ cm->fc->inter_mode_probs[ctx]);
if (!cm->frame_parallel_decoding_mode)
- ++cm->counts.inter_mode[ctx][mode];
+ ++counts->inter_mode[ctx][mode];
return NEARESTMV + mode;
}
@@ -59,9 +61,10 @@ static int read_segment_id(vp9_reader *r, const struct segmentation *seg) {
}
static TX_SIZE read_selected_tx_size(VP9_COMMON *cm, MACROBLOCKD *xd,
+ FRAME_COUNTS *counts,
TX_SIZE max_tx_size, vp9_reader *r) {
const int ctx = vp9_get_tx_size_context(xd);
- const vp9_prob *tx_probs = get_tx_probs(max_tx_size, ctx, &cm->fc.tx_probs);
+ const vp9_prob *tx_probs = get_tx_probs(max_tx_size, ctx, &cm->fc->tx_probs);
int tx_size = vp9_read(r, tx_probs[0]);
if (tx_size != TX_4X4 && max_tx_size >= TX_16X16) {
tx_size += vp9_read(r, tx_probs[1]);
@@ -70,15 +73,18 @@ static TX_SIZE read_selected_tx_size(VP9_COMMON *cm, MACROBLOCKD *xd,
}
if (!cm->frame_parallel_decoding_mode)
- ++get_tx_counts(max_tx_size, ctx, &cm->counts.tx)[tx_size];
+ ++get_tx_counts(max_tx_size, ctx, &counts->tx)[tx_size];
return (TX_SIZE)tx_size;
}
-static TX_SIZE read_tx_size(VP9_COMMON *cm, MACROBLOCKD *xd, TX_MODE tx_mode,
- BLOCK_SIZE bsize, int allow_select, vp9_reader *r) {
+static TX_SIZE read_tx_size(VP9_COMMON *cm, MACROBLOCKD *xd,
+ FRAME_COUNTS *counts,
+ int allow_select, vp9_reader *r) {
+ TX_MODE tx_mode = cm->tx_mode;
+ BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
const TX_SIZE max_tx_size = max_txsize_lookup[bsize];
if (allow_select && tx_mode == TX_MODE_SELECT && bsize >= BLOCK_8X8)
- return read_selected_tx_size(cm, xd, max_tx_size, r);
+ return read_selected_tx_size(cm, xd, counts, max_tx_size, r);
else
return MIN(max_tx_size, tx_mode_to_biggest_tx_size[tx_mode]);
}
@@ -96,21 +102,40 @@ static void set_segment_id(VP9_COMMON *cm, BLOCK_SIZE bsize,
for (y = 0; y < ymis; y++)
for (x = 0; x < xmis; x++)
- cm->last_frame_seg_map[mi_offset + y * cm->mi_cols + x] = segment_id;
+ cm->current_frame_seg_map[mi_offset + y * cm->mi_cols + x] = segment_id;
}
-static int read_intra_segment_id(VP9_COMMON *const cm, MACROBLOCKD *const xd,
+static void copy_segment_id(const VP9_COMMON *cm,
+ const uint8_t *last_segment_ids,
+ uint8_t *current_segment_ids,
+ BLOCK_SIZE bsize, int mi_row, int mi_col) {
+ const int mi_offset = mi_row * cm->mi_cols + mi_col;
+ const int bw = num_8x8_blocks_wide_lookup[bsize];
+ const int bh = num_8x8_blocks_high_lookup[bsize];
+ const int xmis = MIN(cm->mi_cols - mi_col, bw);
+ const int ymis = MIN(cm->mi_rows - mi_row, bh);
+ int x, y;
+
+ for (y = 0; y < ymis; y++)
+ for (x = 0; x < xmis; x++)
+ current_segment_ids[mi_offset + y * cm->mi_cols + x] = last_segment_ids ?
+ last_segment_ids[mi_offset + y * cm->mi_cols + x] : 0;
+}
+
+static int read_intra_segment_id(VP9_COMMON *const cm, BLOCK_SIZE bsize,
int mi_row, int mi_col,
vp9_reader *r) {
struct segmentation *const seg = &cm->seg;
- const BLOCK_SIZE bsize = xd->mi[0].src_mi->mbmi.sb_type;
int segment_id;
if (!seg->enabled)
return 0; // Default for disabled segmentation
- if (!seg->update_map)
+ if (!seg->update_map) {
+ copy_segment_id(cm, cm->last_frame_seg_map, cm->current_frame_seg_map,
+ bsize, mi_row, mi_col);
return 0;
+ }
segment_id = read_segment_id(r, seg);
set_segment_id(cm, bsize, mi_row, mi_col, segment_id);
@@ -120,17 +145,21 @@ static int read_intra_segment_id(VP9_COMMON *const cm, MACROBLOCKD *const xd,
static int read_inter_segment_id(VP9_COMMON *const cm, MACROBLOCKD *const xd,
int mi_row, int mi_col, vp9_reader *r) {
struct segmentation *const seg = &cm->seg;
- MB_MODE_INFO *const mbmi = &xd->mi[0].src_mi->mbmi;
+ MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
const BLOCK_SIZE bsize = mbmi->sb_type;
int predicted_segment_id, segment_id;
if (!seg->enabled)
return 0; // Default for disabled segmentation
- predicted_segment_id = vp9_get_segment_id(cm, cm->last_frame_seg_map,
- bsize, mi_row, mi_col);
- if (!seg->update_map)
+ predicted_segment_id = cm->last_frame_seg_map ?
+ vp9_get_segment_id(cm, cm->last_frame_seg_map, bsize, mi_row, mi_col) : 0;
+
+ if (!seg->update_map) {
+ copy_segment_id(cm, cm->last_frame_seg_map, cm->current_frame_seg_map,
+ bsize, mi_row, mi_col);
return predicted_segment_id;
+ }
if (seg->temporal_update) {
const vp9_prob pred_prob = vp9_get_pred_prob_seg_id(seg, xd);
@@ -145,31 +174,33 @@ static int read_inter_segment_id(VP9_COMMON *const cm, MACROBLOCKD *const xd,
}
static int read_skip(VP9_COMMON *cm, const MACROBLOCKD *xd,
+ FRAME_COUNTS *counts,
int segment_id, vp9_reader *r) {
if (vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) {
return 1;
} else {
const int ctx = vp9_get_skip_context(xd);
- const int skip = vp9_read(r, cm->fc.skip_probs[ctx]);
+ const int skip = vp9_read(r, cm->fc->skip_probs[ctx]);
if (!cm->frame_parallel_decoding_mode)
- ++cm->counts.skip[ctx][skip];
+ ++counts->skip[ctx][skip];
return skip;
}
}
static void read_intra_frame_mode_info(VP9_COMMON *const cm,
MACROBLOCKD *const xd,
+ FRAME_COUNTS *counts,
int mi_row, int mi_col, vp9_reader *r) {
- MODE_INFO *const mi = xd->mi[0].src_mi;
+ MODE_INFO *const mi = xd->mi[0];
MB_MODE_INFO *const mbmi = &mi->mbmi;
- const MODE_INFO *above_mi = xd->mi[-cm->mi_stride].src_mi;
- const MODE_INFO *left_mi = xd->left_available ? xd->mi[-1].src_mi : NULL;
+ const MODE_INFO *above_mi = xd->above_mi;
+ const MODE_INFO *left_mi = xd->left_mi;
const BLOCK_SIZE bsize = mbmi->sb_type;
int i;
- mbmi->segment_id = read_intra_segment_id(cm, xd, mi_row, mi_col, r);
- mbmi->skip = read_skip(cm, xd, mbmi->segment_id, r);
- mbmi->tx_size = read_tx_size(cm, xd, cm->tx_mode, bsize, 1, r);
+ mbmi->segment_id = read_intra_segment_id(cm, bsize, mi_row, mi_col, r);
+ mbmi->skip = read_skip(cm, xd, counts, mbmi->segment_id, r);
+ mbmi->tx_size = read_tx_size(cm, xd, counts, 1, r);
mbmi->ref_frame[0] = INTRA_FRAME;
mbmi->ref_frame[1] = NONE;
@@ -254,13 +285,14 @@ static INLINE void read_mv(vp9_reader *r, MV *mv, const MV *ref,
static REFERENCE_MODE read_block_reference_mode(VP9_COMMON *cm,
const MACROBLOCKD *xd,
+ FRAME_COUNTS *counts,
vp9_reader *r) {
if (cm->reference_mode == REFERENCE_MODE_SELECT) {
const int ctx = vp9_get_reference_mode_context(cm, xd);
const REFERENCE_MODE mode =
- (REFERENCE_MODE)vp9_read(r, cm->fc.comp_inter_prob[ctx]);
+ (REFERENCE_MODE)vp9_read(r, cm->fc->comp_inter_prob[ctx]);
if (!cm->frame_parallel_decoding_mode)
- ++cm->counts.comp_inter[ctx][mode];
+ ++counts->comp_inter[ctx][mode];
return mode; // SINGLE_REFERENCE or COMPOUND_REFERENCE
} else {
return cm->reference_mode;
@@ -269,17 +301,16 @@ static REFERENCE_MODE read_block_reference_mode(VP9_COMMON *cm,
// Read the reference frame
static void read_ref_frames(VP9_COMMON *const cm, MACROBLOCKD *const xd,
- vp9_reader *r,
+ FRAME_COUNTS *counts, vp9_reader *r,
int segment_id, MV_REFERENCE_FRAME ref_frame[2]) {
- FRAME_CONTEXT *const fc = &cm->fc;
- FRAME_COUNTS *const counts = &cm->counts;
+ FRAME_CONTEXT *const fc = cm->fc;
if (vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) {
ref_frame[0] = (MV_REFERENCE_FRAME)vp9_get_segdata(&cm->seg, segment_id,
SEG_LVL_REF_FRAME);
ref_frame[1] = NONE;
} else {
- const REFERENCE_MODE mode = read_block_reference_mode(cm, xd, r);
+ const REFERENCE_MODE mode = read_block_reference_mode(cm, xd, counts, r);
// FIXME(rbultje) I'm pretty sure this breaks segmentation ref frame coding
if (mode == COMPOUND_REFERENCE) {
const int idx = cm->ref_frame_sign_bias[cm->comp_fixed_ref];
@@ -313,17 +344,19 @@ static void read_ref_frames(VP9_COMMON *const cm, MACROBLOCKD *const xd,
static INLINE INTERP_FILTER read_switchable_interp_filter(
- VP9_COMMON *const cm, MACROBLOCKD *const xd, vp9_reader *r) {
+ VP9_COMMON *const cm, MACROBLOCKD *const xd,
+ FRAME_COUNTS *counts, vp9_reader *r) {
const int ctx = vp9_get_pred_context_switchable_interp(xd);
const INTERP_FILTER type =
(INTERP_FILTER)vp9_read_tree(r, vp9_switchable_interp_tree,
- cm->fc.switchable_interp_prob[ctx]);
+ cm->fc->switchable_interp_prob[ctx]);
if (!cm->frame_parallel_decoding_mode)
- ++cm->counts.switchable_interp[ctx][type];
+ ++counts->switchable_interp[ctx][type];
return type;
}
-static void read_intra_block_mode_info(VP9_COMMON *const cm, MODE_INFO *mi,
+static void read_intra_block_mode_info(VP9_COMMON *const cm,
+ FRAME_COUNTS *counts, MODE_INFO *mi,
vp9_reader *r) {
MB_MODE_INFO *const mbmi = &mi->mbmi;
const BLOCK_SIZE bsize = mi->mbmi.sb_type;
@@ -335,24 +368,26 @@ static void read_intra_block_mode_info(VP9_COMMON *const cm, MODE_INFO *mi,
switch (bsize) {
case BLOCK_4X4:
for (i = 0; i < 4; ++i)
- mi->bmi[i].as_mode = read_intra_mode_y(cm, r, 0);
+ mi->bmi[i].as_mode = read_intra_mode_y(cm, counts, r, 0);
mbmi->mode = mi->bmi[3].as_mode;
break;
case BLOCK_4X8:
- mi->bmi[0].as_mode = mi->bmi[2].as_mode = read_intra_mode_y(cm, r, 0);
+ mi->bmi[0].as_mode = mi->bmi[2].as_mode = read_intra_mode_y(cm, counts,
+ r, 0);
mi->bmi[1].as_mode = mi->bmi[3].as_mode = mbmi->mode =
- read_intra_mode_y(cm, r, 0);
+ read_intra_mode_y(cm, counts, r, 0);
break;
case BLOCK_8X4:
- mi->bmi[0].as_mode = mi->bmi[1].as_mode = read_intra_mode_y(cm, r, 0);
+ mi->bmi[0].as_mode = mi->bmi[1].as_mode = read_intra_mode_y(cm, counts,
+ r, 0);
mi->bmi[2].as_mode = mi->bmi[3].as_mode = mbmi->mode =
- read_intra_mode_y(cm, r, 0);
+ read_intra_mode_y(cm, counts, r, 0);
break;
default:
- mbmi->mode = read_intra_mode_y(cm, r, size_group_lookup[bsize]);
+ mbmi->mode = read_intra_mode_y(cm, counts, r, size_group_lookup[bsize]);
}
- mbmi->uv_mode = read_intra_mode_uv(cm, r, mbmi->mode);
+ mbmi->uv_mode = read_intra_mode_uv(cm, counts, r, mbmi->mode);
}
static INLINE int is_mv_valid(const MV *mv) {
@@ -360,7 +395,8 @@ static INLINE int is_mv_valid(const MV *mv) {
mv->col > MV_LOW && mv->col < MV_UPP;
}
-static INLINE int assign_mv(VP9_COMMON *cm, PREDICTION_MODE mode,
+static INLINE int assign_mv(VP9_COMMON *cm, FRAME_COUNTS *counts,
+ PREDICTION_MODE mode,
int_mv mv[2], int_mv ref_mv[2],
int_mv nearest_mv[2], int_mv near_mv[2],
int is_compound, int allow_hp, vp9_reader *r) {
@@ -370,9 +406,9 @@ static INLINE int assign_mv(VP9_COMMON *cm, PREDICTION_MODE mode,
switch (mode) {
case NEWMV: {
nmv_context_counts *const mv_counts = cm->frame_parallel_decoding_mode ?
- NULL : &cm->counts.mv;
+ NULL : &counts->mv;
for (i = 0; i < 1 + is_compound; ++i) {
- read_mv(r, &mv[i].as_mv, &ref_mv[i].as_mv, &cm->fc.nmvc, mv_counts,
+ read_mv(r, &mv[i].as_mv, &ref_mv[i].as_mv, &cm->fc->nmvc, mv_counts,
allow_hp);
ret = ret && is_mv_valid(&mv[i].as_mv);
}
@@ -404,32 +440,40 @@ static INLINE int assign_mv(VP9_COMMON *cm, PREDICTION_MODE mode,
}
static int read_is_inter_block(VP9_COMMON *const cm, MACROBLOCKD *const xd,
+ FRAME_COUNTS *counts,
int segment_id, vp9_reader *r) {
if (vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) {
return vp9_get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME) !=
INTRA_FRAME;
} else {
const int ctx = vp9_get_intra_inter_context(xd);
- const int is_inter = vp9_read(r, cm->fc.intra_inter_prob[ctx]);
+ const int is_inter = vp9_read(r, cm->fc->intra_inter_prob[ctx]);
if (!cm->frame_parallel_decoding_mode)
- ++cm->counts.intra_inter[ctx][is_inter];
+ ++counts->intra_inter[ctx][is_inter];
return is_inter;
}
}
-static void read_inter_block_mode_info(VP9_COMMON *const cm,
+static void fpm_sync(void *const data, int mi_row) {
+ VP9Decoder *const pbi = (VP9Decoder *)data;
+ vp9_frameworker_wait(pbi->frame_worker_owner, pbi->common.prev_frame,
+ mi_row << MI_BLOCK_SIZE_LOG2);
+}
+
+static void read_inter_block_mode_info(VP9Decoder *const pbi,
MACROBLOCKD *const xd,
+ FRAME_COUNTS *counts,
const TileInfo *const tile,
MODE_INFO *const mi,
int mi_row, int mi_col, vp9_reader *r) {
+ VP9_COMMON *const cm = &pbi->common;
MB_MODE_INFO *const mbmi = &mi->mbmi;
const BLOCK_SIZE bsize = mbmi->sb_type;
const int allow_hp = cm->allow_high_precision_mv;
-
int_mv nearestmv[2], nearmv[2];
int inter_mode_ctx, ref, is_compound;
- read_ref_frames(cm, xd, r, mbmi->segment_id, mbmi->ref_frame);
+ read_ref_frames(cm, xd, counts, r, mbmi->segment_id, mbmi->ref_frame);
is_compound = has_second_ref(mbmi);
for (ref = 0; ref < 1 + is_compound; ++ref) {
@@ -437,15 +481,12 @@ static void read_inter_block_mode_info(VP9_COMMON *const cm,
RefBuffer *ref_buf = &cm->frame_refs[frame - LAST_FRAME];
xd->block_refs[ref] = ref_buf;
if ((!vp9_is_valid_scale(&ref_buf->sf)))
- vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
+ vpx_internal_error(xd->error_info, VPX_CODEC_UNSUP_BITSTREAM,
"Reference frame has invalid dimensions");
- if (ref_buf->buf->corrupted)
- vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
- "Block reference is corrupt");
vp9_setup_pre_planes(xd, ref, ref_buf->buf, mi_row, mi_col,
&ref_buf->sf);
vp9_find_mv_refs(cm, xd, tile, mi, frame, mbmi->ref_mvs[frame],
- mi_row, mi_col);
+ mi_row, mi_col, fpm_sync, (void *)pbi);
}
inter_mode_ctx = mbmi->mode_context[mbmi->ref_frame[0]];
@@ -453,13 +494,13 @@ static void read_inter_block_mode_info(VP9_COMMON *const cm,
if (vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
mbmi->mode = ZEROMV;
if (bsize < BLOCK_8X8) {
- vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
+ vpx_internal_error(xd->error_info, VPX_CODEC_UNSUP_BITSTREAM,
"Invalid usage of segement feature on small blocks");
return;
}
} else {
if (bsize >= BLOCK_8X8)
- mbmi->mode = read_inter_mode(cm, r, inter_mode_ctx);
+ mbmi->mode = read_inter_mode(cm, counts, r, inter_mode_ctx);
}
if (bsize < BLOCK_8X8 || mbmi->mode != ZEROMV) {
@@ -470,7 +511,7 @@ static void read_inter_block_mode_info(VP9_COMMON *const cm,
}
mbmi->interp_filter = (cm->interp_filter == SWITCHABLE)
- ? read_switchable_interp_filter(cm, xd, r)
+ ? read_switchable_interp_filter(cm, xd, counts, r)
: cm->interp_filter;
if (bsize < BLOCK_8X8) {
@@ -483,7 +524,7 @@ static void read_inter_block_mode_info(VP9_COMMON *const cm,
for (idx = 0; idx < 2; idx += num_4x4_w) {
int_mv block[2];
const int j = idy * 2 + idx;
- b_mode = read_inter_mode(cm, r, inter_mode_ctx);
+ b_mode = read_inter_mode(cm, counts, r, inter_mode_ctx);
if (b_mode == NEARESTMV || b_mode == NEARMV)
for (ref = 0; ref < 1 + is_compound; ++ref)
@@ -491,7 +532,7 @@ static void read_inter_block_mode_info(VP9_COMMON *const cm,
&nearest_sub8x8[ref],
&near_sub8x8[ref]);
- if (!assign_mv(cm, b_mode, block, nearestmv,
+ if (!assign_mv(cm, counts, b_mode, block, nearestmv,
nearest_sub8x8, near_sub8x8,
is_compound, allow_hp, r)) {
xd->corrupted |= 1;
@@ -514,38 +555,60 @@ static void read_inter_block_mode_info(VP9_COMMON *const cm,
mbmi->mv[0].as_int = mi->bmi[3].as_mv[0].as_int;
mbmi->mv[1].as_int = mi->bmi[3].as_mv[1].as_int;
} else {
- xd->corrupted |= !assign_mv(cm, mbmi->mode, mbmi->mv, nearestmv,
+ xd->corrupted |= !assign_mv(cm, counts, mbmi->mode, mbmi->mv, nearestmv,
nearestmv, nearmv, is_compound, allow_hp, r);
}
}
-static void read_inter_frame_mode_info(VP9_COMMON *const cm,
+static void read_inter_frame_mode_info(VP9Decoder *const pbi,
MACROBLOCKD *const xd,
+ FRAME_COUNTS *counts,
const TileInfo *const tile,
int mi_row, int mi_col, vp9_reader *r) {
- MODE_INFO *const mi = xd->mi[0].src_mi;
+ VP9_COMMON *const cm = &pbi->common;
+ MODE_INFO *const mi = xd->mi[0];
MB_MODE_INFO *const mbmi = &mi->mbmi;
int inter_block;
mbmi->mv[0].as_int = 0;
mbmi->mv[1].as_int = 0;
mbmi->segment_id = read_inter_segment_id(cm, xd, mi_row, mi_col, r);
- mbmi->skip = read_skip(cm, xd, mbmi->segment_id, r);
- inter_block = read_is_inter_block(cm, xd, mbmi->segment_id, r);
- mbmi->tx_size = read_tx_size(cm, xd, cm->tx_mode, mbmi->sb_type,
- !mbmi->skip || !inter_block, r);
+ mbmi->skip = read_skip(cm, xd, counts, mbmi->segment_id, r);
+ inter_block = read_is_inter_block(cm, xd, counts, mbmi->segment_id, r);
+ mbmi->tx_size = read_tx_size(cm, xd, counts, !mbmi->skip || !inter_block, r);
if (inter_block)
- read_inter_block_mode_info(cm, xd, tile, mi, mi_row, mi_col, r);
+ read_inter_block_mode_info(pbi, xd, counts, tile, mi, mi_row, mi_col, r);
else
- read_intra_block_mode_info(cm, mi, r);
+ read_intra_block_mode_info(cm, counts, mi, r);
}
-void vp9_read_mode_info(VP9_COMMON *cm, MACROBLOCKD *xd,
+void vp9_read_mode_info(VP9Decoder *const pbi, MACROBLOCKD *xd,
+ FRAME_COUNTS *counts,
const TileInfo *const tile,
int mi_row, int mi_col, vp9_reader *r) {
+ VP9_COMMON *const cm = &pbi->common;
+ MODE_INFO *const mi = xd->mi[0];
+ const int bw = num_8x8_blocks_wide_lookup[mi->mbmi.sb_type];
+ const int bh = num_8x8_blocks_high_lookup[mi->mbmi.sb_type];
+ const int x_mis = MIN(bw, cm->mi_cols - mi_col);
+ const int y_mis = MIN(bh, cm->mi_rows - mi_row);
+ MV_REF* frame_mvs = cm->cur_frame->mvs + mi_row * cm->mi_cols + mi_col;
+ int w, h;
+
if (frame_is_intra_only(cm))
- read_intra_frame_mode_info(cm, xd, mi_row, mi_col, r);
+ read_intra_frame_mode_info(cm, xd, counts, mi_row, mi_col, r);
else
- read_inter_frame_mode_info(cm, xd, tile, mi_row, mi_col, r);
+ read_inter_frame_mode_info(pbi, xd, counts, tile, mi_row, mi_col, r);
+
+ for (h = 0; h < y_mis; ++h) {
+ MV_REF *const frame_mv = frame_mvs + h * cm->mi_cols;
+ for (w = 0; w < x_mis; ++w) {
+ MV_REF *const mv = frame_mv + w;
+ mv->ref_frame[0] = mi->mbmi.ref_frame[0];
+ mv->ref_frame[1] = mi->mbmi.ref_frame[1];
+ mv->mv[0].as_int = mi->mbmi.mv[0].as_int;
+ mv->mv[1].as_int = mi->mbmi.mv[1].as_int;
+ }
+ }
}
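
The loop appended to vp9_read_mode_info() above publishes each block's reference frames and motion vectors into a per-frame MV grid (cm->cur_frame->mvs) so that later frames can consult the previous motion field; the MIN() clamps keep blocks on the right and bottom frame edges from writing outside the grid. A minimal standalone sketch of that clipped replication, with MvRef and store_block_mvs as simplified stand-ins rather than the real libvpx types:

    #define MIN(a, b) ((a) < (b) ? (a) : (b))

    typedef struct { int ref_frame[2]; int mv_as_int[2]; } MvRef;

    /* Replicate one block's motion info across its footprint (bw x bh
     * 8x8 units) in a frame-wide grid, clipping at the frame edges. */
    static void store_block_mvs(MvRef *grid, int grid_cols, int grid_rows,
                                int mi_row, int mi_col, int bw, int bh,
                                const MvRef *block) {
      const int x_mis = MIN(bw, grid_cols - mi_col);  /* visible width  */
      const int y_mis = MIN(bh, grid_rows - mi_row);  /* visible height */
      int w, h;
      for (h = 0; h < y_mis; ++h) {
        MvRef *row = grid + (mi_row + h) * grid_cols + mi_col;
        for (w = 0; w < x_mis; ++w)
          row[w] = *block;  /* every unit carries the whole block's info */
      }
    }

Replicating the block's info into every 8x8 unit keeps the later lookup a plain array index, at the cost of some redundant stores.
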
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_decodemv.h b/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_decodemv.h
index 7394b62b451..c79dff71888 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_decodemv.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_decodemv.h
@@ -11,6 +11,7 @@
#ifndef VP9_DECODER_VP9_DECODEMV_H_
#define VP9_DECODER_VP9_DECODEMV_H_
+#include "vp9/decoder/vp9_decoder.h"
#include "vp9/decoder/vp9_reader.h"
#ifdef __cplusplus
@@ -19,7 +20,8 @@ extern "C" {
struct TileInfo;
-void vp9_read_mode_info(VP9_COMMON *cm, MACROBLOCKD *xd,
+void vp9_read_mode_info(VP9Decoder *const pbi, MACROBLOCKD *xd,
+ FRAME_COUNTS *counts,
const struct TileInfo *const tile,
int mi_row, int mi_col, vp9_reader *r);
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_decoder.c b/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_decoder.c
index baf6ab7ef52..288d8690ca2 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_decoder.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_decoder.c
@@ -12,9 +12,12 @@
#include <limits.h>
#include <stdio.h>
+#include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
#include "./vpx_scale_rtcd.h"
#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/vpx_once.h"
#include "vpx_ports/vpx_timer.h"
#include "vpx_scale/vpx_scale.h"
@@ -27,25 +30,53 @@
#include "vp9/common/vp9_quant_common.h"
#include "vp9/common/vp9_reconintra.h"
#include "vp9/common/vp9_systemdependent.h"
+#include "vp9/common/vp9_thread.h"
#include "vp9/decoder/vp9_decodeframe.h"
#include "vp9/decoder/vp9_decoder.h"
#include "vp9/decoder/vp9_detokenize.h"
-#include "vp9/decoder/vp9_dthread.h"
-static void initialize_dec() {
- static int init_done = 0;
+static void initialize_dec(void) {
+ static volatile int init_done = 0;
if (!init_done) {
vp9_rtcd();
+ vpx_dsp_rtcd();
+ vpx_scale_rtcd();
vp9_init_intra_predictors();
init_done = 1;
}
}
-VP9Decoder *vp9_decoder_create() {
- VP9Decoder *const pbi = vpx_memalign(32, sizeof(*pbi));
- VP9_COMMON *const cm = pbi ? &pbi->common : NULL;
+static void vp9_dec_setup_mi(VP9_COMMON *cm) {
+ cm->mi = cm->mip + cm->mi_stride + 1;
+ memset(cm->mip, 0, cm->mi_stride * (cm->mi_rows + 1) * sizeof(*cm->mip));
+ cm->mi_grid_visible = cm->mi_grid_base + cm->mi_stride + 1;
+ memset(cm->mi_grid_base, 0,
+ cm->mi_stride * (cm->mi_rows + 1) * sizeof(*cm->mi_grid_base));
+}
+
+static int vp9_dec_alloc_mi(VP9_COMMON *cm, int mi_size) {
+ cm->mip = vpx_calloc(mi_size, sizeof(*cm->mip));
+ if (!cm->mip)
+ return 1;
+ cm->mi_alloc_size = mi_size;
+ cm->mi_grid_base = (MODE_INFO **)vpx_calloc(mi_size, sizeof(MODE_INFO*));
+ if (!cm->mi_grid_base)
+ return 1;
+ return 0;
+}
+
+static void vp9_dec_free_mi(VP9_COMMON *cm) {
+ vpx_free(cm->mip);
+ cm->mip = NULL;
+ vpx_free(cm->mi_grid_base);
+ cm->mi_grid_base = NULL;
+}
+
+VP9Decoder *vp9_decoder_create(BufferPool *const pool) {
+ VP9Decoder *volatile const pbi = vpx_memalign(32, sizeof(*pbi));
+ VP9_COMMON *volatile const cm = pbi ? &pbi->common : NULL;
if (!cm)
return NULL;
@@ -59,21 +90,30 @@ VP9Decoder *vp9_decoder_create() {
}
cm->error.setjmp = 1;
+
+ CHECK_MEM_ERROR(cm, cm->fc,
+ (FRAME_CONTEXT *)vpx_calloc(1, sizeof(*cm->fc)));
+ CHECK_MEM_ERROR(cm, cm->frame_contexts,
+ (FRAME_CONTEXT *)vpx_calloc(FRAME_CONTEXTS,
+ sizeof(*cm->frame_contexts)));
+
pbi->need_resync = 1;
- initialize_dec();
+ once(initialize_dec);
// Initialize the references to not point to any frame buffers.
- vpx_memset(&cm->ref_frame_map, -1, sizeof(cm->ref_frame_map));
+ memset(&cm->ref_frame_map, -1, sizeof(cm->ref_frame_map));
+ memset(&cm->next_ref_frame_map, -1, sizeof(cm->next_ref_frame_map));
cm->current_video_frame = 0;
pbi->ready_for_new_data = 1;
+ pbi->common.buffer_pool = pool;
+
cm->bit_depth = VPX_BITS_8;
cm->dequant_bit_depth = VPX_BITS_8;
- // vp9_init_dequantizer() is first called here. Add check in
- // frame_init_dequantizer() to avoid unnecessary calling of
- // vp9_init_dequantizer() for every frame.
- vp9_init_dequantizer(cm);
+ cm->alloc_mi = vp9_dec_alloc_mi;
+ cm->free_mi = vp9_dec_free_mi;
+ cm->setup_mi = vp9_dec_setup_mi;
vp9_loop_filter_init(cm);
@@ -85,7 +125,6 @@ VP9Decoder *vp9_decoder_create() {
}
void vp9_decoder_remove(VP9Decoder *pbi) {
- VP9_COMMON *const cm = &pbi->common;
int i;
vp9_get_worker_interface()->end(&pbi->lf_worker);
@@ -94,16 +133,15 @@ void vp9_decoder_remove(VP9Decoder *pbi) {
for (i = 0; i < pbi->num_tile_workers; ++i) {
VP9Worker *const worker = &pbi->tile_workers[i];
vp9_get_worker_interface()->end(worker);
- vpx_free(worker->data1);
- vpx_free(worker->data2);
}
+ vpx_free(pbi->tile_worker_data);
+ vpx_free(pbi->tile_worker_info);
vpx_free(pbi->tile_workers);
if (pbi->num_tile_workers > 0) {
vp9_loop_filter_dealloc(&pbi->lf_row_sync);
}
- vp9_remove_common(cm);
vpx_free(pbi);
}
@@ -148,6 +186,7 @@ vpx_codec_err_t vp9_set_reference_dec(VP9_COMMON *cm,
VP9_REFFRAME ref_frame_flag,
YV12_BUFFER_CONFIG *sd) {
RefBuffer *ref_buf = NULL;
+ RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
// TODO(jkoleszar): The decoder doesn't have any real knowledge of what the
// encoder is using the frame buffers for. This is just a stub to keep the
@@ -175,11 +214,11 @@ vpx_codec_err_t vp9_set_reference_dec(VP9_COMMON *cm,
const int free_fb = get_free_fb(cm);
// Decrease ref_count since it will be increased again in
// ref_cnt_fb() below.
- cm->frame_bufs[free_fb].ref_count--;
+ --frame_bufs[free_fb].ref_count;
// Manage the reference counters and copy image.
- ref_cnt_fb(cm->frame_bufs, ref_fb_ptr, free_fb);
- ref_buf->buf = &cm->frame_bufs[*ref_fb_ptr].buf;
+ ref_cnt_fb(frame_bufs, ref_fb_ptr, free_fb);
+ ref_buf->buf = &frame_bufs[*ref_fb_ptr].buf;
vp8_yv12_copy_frame(sd, ref_buf->buf);
}
@@ -190,33 +229,51 @@ vpx_codec_err_t vp9_set_reference_dec(VP9_COMMON *cm,
static void swap_frame_buffers(VP9Decoder *pbi) {
int ref_index = 0, mask;
VP9_COMMON *const cm = &pbi->common;
+ BufferPool *const pool = cm->buffer_pool;
+ RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
+ lock_buffer_pool(pool);
for (mask = pbi->refresh_frame_flags; mask; mask >>= 1) {
- if (mask & 1) {
- const int old_idx = cm->ref_frame_map[ref_index];
- ref_cnt_fb(cm->frame_bufs, &cm->ref_frame_map[ref_index],
- cm->new_fb_idx);
- if (old_idx >= 0 && cm->frame_bufs[old_idx].ref_count == 0)
- cm->release_fb_cb(cm->cb_priv,
- &cm->frame_bufs[old_idx].raw_frame_buffer);
+ const int old_idx = cm->ref_frame_map[ref_index];
+    // The current thread releases its hold on the reference frame.
+    decrease_ref_count(old_idx, frame_bufs, pool);
+
+    // Release the reference frame in the reference map.
+ if ((mask & 1) && old_idx >= 0) {
+ decrease_ref_count(old_idx, frame_bufs, pool);
}
+ cm->ref_frame_map[ref_index] = cm->next_ref_frame_map[ref_index];
++ref_index;
}
+  // The current thread releases its hold on the reference frames.
+ for (; ref_index < REF_FRAMES && !cm->show_existing_frame; ++ref_index) {
+ const int old_idx = cm->ref_frame_map[ref_index];
+ decrease_ref_count(old_idx, frame_bufs, pool);
+ cm->ref_frame_map[ref_index] = cm->next_ref_frame_map[ref_index];
+ }
+ unlock_buffer_pool(pool);
+ pbi->hold_ref_buf = 0;
cm->frame_to_show = get_frame_new_buffer(cm);
- cm->frame_bufs[cm->new_fb_idx].ref_count--;
+
+ if (!pbi->frame_parallel_decode || !cm->show_frame) {
+ lock_buffer_pool(pool);
+ --frame_bufs[cm->new_fb_idx].ref_count;
+ unlock_buffer_pool(pool);
+ }
// Invalidate these references until the next frame starts.
for (ref_index = 0; ref_index < 3; ref_index++)
- cm->frame_refs[ref_index].idx = INT_MAX;
+ cm->frame_refs[ref_index].idx = -1;
}
int vp9_receive_compressed_data(VP9Decoder *pbi,
size_t size, const uint8_t **psource) {
- VP9_COMMON *const cm = &pbi->common;
+ VP9_COMMON *volatile const cm = &pbi->common;
+ BufferPool *volatile const pool = cm->buffer_pool;
+ RefCntBuffer *volatile const frame_bufs = cm->buffer_pool->frame_bufs;
const uint8_t *source = *psource;
int retcode = 0;
-
cm->error.error_code = VPX_CODEC_OK;
if (size == 0) {
@@ -228,57 +285,120 @@ int vp9_receive_compressed_data(VP9Decoder *pbi,
// TODO(jkoleszar): Error concealment is undefined and non-normative
// at this point, but if it becomes so, [0] may not always be the correct
// thing to do here.
- if (cm->frame_refs[0].idx != INT_MAX)
+ if (cm->frame_refs[0].idx > 0) {
+ assert(cm->frame_refs[0].buf != NULL);
cm->frame_refs[0].buf->corrupted = 1;
+ }
}
pbi->ready_for_new_data = 0;
// Check if the previous frame was a frame without any references to it.
- if (cm->new_fb_idx >= 0 && cm->frame_bufs[cm->new_fb_idx].ref_count == 0)
- cm->release_fb_cb(cm->cb_priv,
- &cm->frame_bufs[cm->new_fb_idx].raw_frame_buffer);
+ // Release frame buffer if not decoding in frame parallel mode.
+ if (!pbi->frame_parallel_decode && cm->new_fb_idx >= 0
+ && frame_bufs[cm->new_fb_idx].ref_count == 0)
+ pool->release_fb_cb(pool->cb_priv,
+ &frame_bufs[cm->new_fb_idx].raw_frame_buffer);
cm->new_fb_idx = get_free_fb(cm);
+  // Assign an MV array to the frame buffer.
+ cm->cur_frame = &pool->frame_bufs[cm->new_fb_idx];
+
+ pbi->hold_ref_buf = 0;
+ if (pbi->frame_parallel_decode) {
+ VP9Worker *const worker = pbi->frame_worker_owner;
+ vp9_frameworker_lock_stats(worker);
+ frame_bufs[cm->new_fb_idx].frame_worker_owner = worker;
+ // Reset decoding progress.
+ pbi->cur_buf = &frame_bufs[cm->new_fb_idx];
+ pbi->cur_buf->row = -1;
+ pbi->cur_buf->col = -1;
+ vp9_frameworker_unlock_stats(worker);
+ } else {
+ pbi->cur_buf = &frame_bufs[cm->new_fb_idx];
+ }
+
+
if (setjmp(cm->error.jmp)) {
- pbi->need_resync = 1;
+ const VP9WorkerInterface *const winterface = vp9_get_worker_interface();
+ int i;
+
cm->error.setjmp = 0;
- vp9_clear_system_state();
+ pbi->ready_for_new_data = 1;
- // We do not know if the missing frame(s) was supposed to update
- // any of the reference buffers, but we act conservative and
- // mark only the last buffer as corrupted.
- //
- // TODO(jkoleszar): Error concealment is undefined and non-normative
- // at this point, but if it becomes so, [0] may not always be the correct
- // thing to do here.
- if (cm->frame_refs[0].idx != INT_MAX && cm->frame_refs[0].buf != NULL)
- cm->frame_refs[0].buf->corrupted = 1;
+ // Synchronize all threads immediately as a subsequent decode call may
+ // cause a resize invalidating some allocations.
+ winterface->sync(&pbi->lf_worker);
+ for (i = 0; i < pbi->num_tile_workers; ++i) {
+ winterface->sync(&pbi->tile_workers[i]);
+ }
+
+ lock_buffer_pool(pool);
+ // Release all the reference buffers if worker thread is holding them.
+ if (pbi->hold_ref_buf == 1) {
+ int ref_index = 0, mask;
+ for (mask = pbi->refresh_frame_flags; mask; mask >>= 1) {
+ const int old_idx = cm->ref_frame_map[ref_index];
+      // The current thread releases its hold on the reference frame.
+      decrease_ref_count(old_idx, frame_bufs, pool);
+
+      // Release the reference frame in the reference map.
+ if ((mask & 1) && old_idx >= 0) {
+ decrease_ref_count(old_idx, frame_bufs, pool);
+ }
+ ++ref_index;
+ }
- if (cm->new_fb_idx > 0 && cm->frame_bufs[cm->new_fb_idx].ref_count > 0)
- cm->frame_bufs[cm->new_fb_idx].ref_count--;
+    // The current thread releases its hold on the reference frames.
+ for (; ref_index < REF_FRAMES && !cm->show_existing_frame; ++ref_index) {
+ const int old_idx = cm->ref_frame_map[ref_index];
+ decrease_ref_count(old_idx, frame_bufs, pool);
+ }
+ pbi->hold_ref_buf = 0;
+ }
+ // Release current frame.
+ decrease_ref_count(cm->new_fb_idx, frame_bufs, pool);
+ unlock_buffer_pool(pool);
+ vp9_clear_system_state();
return -1;
}
cm->error.setjmp = 1;
-
vp9_decode_frame(pbi, source, source + size, psource);
swap_frame_buffers(pbi);
vp9_clear_system_state();
- cm->last_width = cm->width;
- cm->last_height = cm->height;
-
- if (!cm->show_existing_frame)
+ if (!cm->show_existing_frame) {
cm->last_show_frame = cm->show_frame;
- if (cm->show_frame) {
- if (!cm->show_existing_frame)
- vp9_swap_mi_and_prev_mi(cm);
+ cm->prev_frame = cm->cur_frame;
+ if (cm->seg.enabled && !pbi->frame_parallel_decode)
+ vp9_swap_current_and_last_seg_map(cm);
+ }
+
+ // Update progress in frame parallel decode.
+ if (pbi->frame_parallel_decode) {
+ // Need to lock the mutex here as another thread may
+ // be accessing this buffer.
+ VP9Worker *const worker = pbi->frame_worker_owner;
+ FrameWorkerData *const frame_worker_data = worker->data1;
+ vp9_frameworker_lock_stats(worker);
- cm->current_video_frame++;
+ if (cm->show_frame) {
+ cm->current_video_frame++;
+ }
+ frame_worker_data->frame_decoded = 1;
+ frame_worker_data->frame_context_ready = 1;
+ vp9_frameworker_signal_stats(worker);
+ vp9_frameworker_unlock_stats(worker);
+ } else {
+ cm->last_width = cm->width;
+ cm->last_height = cm->height;
+ if (cm->show_frame) {
+ cm->current_video_frame++;
+ }
}
cm->error.setjmp = 0;
@@ -302,6 +422,8 @@ int vp9_get_raw_frame(VP9Decoder *pbi, YV12_BUFFER_CONFIG *sd,
if (!cm->show_frame)
return ret;
+ pbi->ready_for_new_data = 1;
+
#if CONFIG_VP9_POSTPROC
if (!cm->show_existing_frame) {
ret = vp9_post_proc_frame(cm, sd, flags);
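
The reworked vp9_receive_compressed_data() above leans on a setjmp/longjmp error path: the handler now synchronizes the loop-filter and tile workers before touching any allocations, then drops every reference the thread still holds under the buffer-pool lock. A compact sketch of that pattern, with stub types standing in for VP9Decoder and the worker interface (assumed names, not the real API):

    #include <setjmp.h>

    typedef struct { jmp_buf jmp; int setjmp_armed; } Dec;

    /* Deep decode code calls this on an unrecoverable bitstream error. */
    static void fatal(Dec *d) { longjmp(d->jmp, 1); }

    static int decode_frame(Dec *d, const unsigned char *data, int size) {
      if (setjmp(d->jmp)) {        /* re-entered via fatal() */
        d->setjmp_armed = 0;
        /* 1. sync worker threads so nothing races the cleanup        */
        /* 2. release held reference buffers under the pool mutex     */
        /* 3. release the current frame buffer and clear system state */
        return -1;
      }
      d->setjmp_armed = 1;
      (void)data; (void)size;      /* ...parse headers, decode tiles... */
      d->setjmp_armed = 0;
      return 0;
    }

Syncing the workers first matters because a subsequent decode call may resize and free buffers the workers are still reading.
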
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_decoder.h b/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_decoder.h
index 4f52bb9c473..c19f0ac3bc7 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_decoder.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_decoder.h
@@ -15,12 +15,12 @@
#include "vpx/vpx_codec.h"
#include "vpx_scale/yv12config.h"
-
+#include "vp9/common/vp9_thread_common.h"
#include "vp9/common/vp9_onyxc_int.h"
#include "vp9/common/vp9_ppflags.h"
#include "vp9/common/vp9_thread.h"
-
#include "vp9/decoder/vp9_dthread.h"
+#include "vp9/decoder/vp9_reader.h"
#ifdef __cplusplus
extern "C" {
@@ -33,6 +33,14 @@ typedef struct TileData {
DECLARE_ALIGNED(16, MACROBLOCKD, xd);
} TileData;
+typedef struct TileWorkerData {
+ struct VP9Decoder *pbi;
+ vp9_reader bit_reader;
+ FRAME_COUNTS counts;
+ DECLARE_ALIGNED(16, MACROBLOCKD, xd);
+ struct vpx_internal_error_info error_info;
+} TileWorkerData;
+
typedef struct VP9Decoder {
DECLARE_ALIGNED(16, MACROBLOCKD, mb);
@@ -44,8 +52,15 @@ typedef struct VP9Decoder {
int frame_parallel_decode; // frame-based threading.
+ // TODO(hkuang): Combine this with cur_buf in macroblockd as they are
+ // the same.
+ RefCntBuffer *cur_buf; // Current decoding frame buffer.
+
+ VP9Worker *frame_worker_owner; // frame_worker that owns this pbi.
VP9Worker lf_worker;
VP9Worker *tile_workers;
+ TileWorkerData *tile_worker_data;
+ TileInfo *tile_worker_info;
int num_tile_workers;
TileData *tile_data;
@@ -58,7 +73,8 @@ typedef struct VP9Decoder {
int max_threads;
int inv_tile_order;
- int need_resync; // wait for key/intra-only frame
+ int need_resync; // wait for key/intra-only frame.
+ int hold_ref_buf; // hold the reference buffer.
} VP9Decoder;
int vp9_receive_compressed_data(struct VP9Decoder *pbi,
@@ -75,10 +91,6 @@ vpx_codec_err_t vp9_set_reference_dec(VP9_COMMON *cm,
VP9_REFFRAME ref_frame_flag,
YV12_BUFFER_CONFIG *sd);
-struct VP9Decoder *vp9_decoder_create();
-
-void vp9_decoder_remove(struct VP9Decoder *pbi);
-
static INLINE uint8_t read_marker(vpx_decrypt_cb decrypt_cb,
void *decrypt_state,
const uint8_t *data) {
@@ -98,6 +110,25 @@ vpx_codec_err_t vp9_parse_superframe_index(const uint8_t *data,
vpx_decrypt_cb decrypt_cb,
void *decrypt_state);
+struct VP9Decoder *vp9_decoder_create(BufferPool *const pool);
+
+void vp9_decoder_remove(struct VP9Decoder *pbi);
+
+static INLINE void decrease_ref_count(int idx, RefCntBuffer *const frame_bufs,
+ BufferPool *const pool) {
+ if (idx >= 0) {
+ --frame_bufs[idx].ref_count;
+    // A worker may only get a free framebuffer index when calling get_free_fb,
+    // but the private buffer is not set up until the frame header has been
+    // decoded. So if an error occurs while decoding the header, frame_bufs
+    // will not yet have a valid priv buffer.
+ if (frame_bufs[idx].ref_count == 0 &&
+ frame_bufs[idx].raw_frame_buffer.priv) {
+ pool->release_fb_cb(pool->cb_priv, &frame_bufs[idx].raw_frame_buffer);
+ }
+ }
+}
+
#ifdef __cplusplus
} // extern "C"
#endif
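
The decrease_ref_count() helper added above encodes two invariants worth calling out: an index of -1 means "no buffer held", and a buffer is handed back to the pool only when its count reaches zero and its private storage was actually allocated, since errors during header decoding can leave priv unset. A self-contained sketch of the same logic with hypothetical stand-in types:

    typedef struct { int ref_count; void *priv; } Buf;

    static void unref(Buf *bufs, int idx,
                      void (*release)(void *ctx, Buf *b), void *ctx) {
      if (idx < 0)
        return;                     /* -1 marks "no buffer held" */
      --bufs[idx].ref_count;
      /* Only fully initialized buffers (priv set) go back to the pool. */
      if (bufs[idx].ref_count == 0 && bufs[idx].priv != NULL)
        release(ctx, &bufs[idx]);
    }
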
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_detokenize.c b/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_detokenize.c
index 421229a28c2..bb8c66fc09f 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_detokenize.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_detokenize.c
@@ -14,6 +14,9 @@
#include "vp9/common/vp9_blockd.h"
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_entropy.h"
+#if CONFIG_COEFFICIENT_RANGE_CHECKING
+#include "vp9/common/vp9_idct.h"
+#endif
#include "vp9/decoder/vp9_detokenize.h"
@@ -32,7 +35,7 @@
#define INCREMENT_COUNT(token) \
do { \
if (!cm->frame_parallel_decoding_mode) \
- ++coef_counts[band][ctx][token]; \
+ ++coef_counts[band][ctx][token]; \
} while (0)
static INLINE int read_coeff(const vp9_prob *probs, int n, vp9_reader *r) {
@@ -42,25 +45,14 @@ static INLINE int read_coeff(const vp9_prob *probs, int n, vp9_reader *r) {
return val;
}
-static const vp9_tree_index coeff_subtree_high[TREE_SIZE(ENTROPY_TOKENS)] = {
- 2, 6, /* 0 = LOW_VAL */
- -TWO_TOKEN, 4, /* 1 = TWO */
- -THREE_TOKEN, -FOUR_TOKEN, /* 2 = THREE */
- 8, 10, /* 3 = HIGH_LOW */
- -CATEGORY1_TOKEN, -CATEGORY2_TOKEN, /* 4 = CAT_ONE */
- 12, 14, /* 5 = CAT_THREEFOUR */
- -CATEGORY3_TOKEN, -CATEGORY4_TOKEN, /* 6 = CAT_THREE */
- -CATEGORY5_TOKEN, -CATEGORY6_TOKEN /* 7 = CAT_FIVE */
-};
-
-static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd, PLANE_TYPE type,
+static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd,
+ FRAME_COUNTS *counts, PLANE_TYPE type,
tran_low_t *dqcoeff, TX_SIZE tx_size, const int16_t *dq,
int ctx, const int16_t *scan, const int16_t *nb,
vp9_reader *r) {
const int max_eob = 16 << (tx_size << 1);
- const FRAME_CONTEXT *const fc = &cm->fc;
- FRAME_COUNTS *const counts = &cm->counts;
- const int ref = is_inter_block(&xd->mi[0].src_mi->mbmi);
+ const FRAME_CONTEXT *const fc = cm->fc;
+ const int ref = is_inter_block(&xd->mi[0]->mbmi);
int band, c = 0;
const vp9_prob (*coef_probs)[COEFF_CONTEXTS][UNCONSTRAINED_NODES] =
fc->coef_probs[tx_size][type][ref];
@@ -144,7 +136,7 @@ static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd, PLANE_TYPE type,
val = 1;
} else {
INCREMENT_COUNT(TWO_TOKEN);
- token = vp9_read_tree(r, coeff_subtree_high,
+ token = vp9_read_tree(r, vp9_coef_con_tree,
vp9_pareto8_full[prob[PIVOT_NODE] - 1]);
switch (token) {
case TWO_TOKEN:
@@ -191,10 +183,15 @@ static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd, PLANE_TYPE type,
}
v = (val * dqv) >> dq_shift;
#if CONFIG_COEFFICIENT_RANGE_CHECKING
+#if CONFIG_VP9_HIGHBITDEPTH
+ dqcoeff[scan[c]] = highbd_check_range((vp9_read_bit(r) ? -v : v),
+ cm->bit_depth);
+#else
dqcoeff[scan[c]] = check_range(vp9_read_bit(r) ? -v : v);
+#endif // CONFIG_VP9_HIGHBITDEPTH
#else
dqcoeff[scan[c]] = vp9_read_bit(r) ? -v : v;
-#endif
+#endif // CONFIG_COEFFICIENT_RANGE_CHECKING
token_cache[scan[c]] = vp9_pt_energy_class[token];
++c;
ctx = get_coef_context(nb, token_cache, c);
@@ -205,15 +202,19 @@ static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd, PLANE_TYPE type,
}
int vp9_decode_block_tokens(VP9_COMMON *cm, MACROBLOCKD *xd,
- int plane, int block, BLOCK_SIZE plane_bsize,
- int x, int y, TX_SIZE tx_size, vp9_reader *r) {
+ FRAME_COUNTS *counts, int plane, int block,
+ BLOCK_SIZE plane_bsize, int x, int y,
+ TX_SIZE tx_size, vp9_reader *r,
+ int seg_id) {
struct macroblockd_plane *const pd = &xd->plane[plane];
+ const int16_t *const dequant = (plane == 0) ? cm->y_dequant[seg_id]
+ : cm->uv_dequant[seg_id];
const int ctx = get_entropy_context(tx_size, pd->above_context + x,
pd->left_context + y);
const scan_order *so = get_scan(xd, tx_size, pd->plane_type, block);
- const int eob = decode_coefs(cm, xd, pd->plane_type,
+ const int eob = decode_coefs(cm, xd, counts, pd->plane_type,
BLOCK_OFFSET(pd->dqcoeff, block), tx_size,
- pd->dequant, ctx, so->scan, so->neighbors, r);
+ dequant, ctx, so->scan, so->neighbors, r);
vp9_set_contexts(xd, pd, plane_bsize, tx_size, eob > 0, x, y);
return eob;
}
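
vp9_decode_block_tokens() now picks the dequantization table per block from per-segment arrays instead of a per-plane pointer configured once per frame, which is what the new seg_id parameter feeds. A toy version of that selection (array shapes are illustrative, not the real layout):

    #define MAX_SEGMENTS 8

    static const short y_dequant[MAX_SEGMENTS][2];   /* [segment][dc, ac] */
    static const short uv_dequant[MAX_SEGMENTS][2];

    /* Plane 0 is luma; both chroma planes share the uv table. */
    static const short *select_dequant(int plane, int seg_id) {
      return (plane == 0) ? y_dequant[seg_id] : uv_dequant[seg_id];
    }
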
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_detokenize.h b/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_detokenize.h
index 5278e97a302..86126b6a19e 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_detokenize.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_detokenize.h
@@ -20,8 +20,10 @@ extern "C" {
#endif
int vp9_decode_block_tokens(VP9_COMMON *cm, MACROBLOCKD *xd,
- int plane, int block, BLOCK_SIZE plane_bsize,
- int x, int y, TX_SIZE tx_size, vp9_reader *r);
+ FRAME_COUNTS *counts, int plane, int block,
+ BLOCK_SIZE plane_bsize, int x, int y,
+ TX_SIZE tx_size, vp9_reader *r,
+ int seg_id);
#ifdef __cplusplus
} // extern "C"
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_dthread.c b/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_dthread.c
index 69e4fde8586..96a63bd9e14 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_dthread.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_dthread.c
@@ -9,265 +9,181 @@
*/
#include "./vpx_config.h"
-
#include "vpx_mem/vpx_mem.h"
-
#include "vp9/common/vp9_reconinter.h"
-
#include "vp9/decoder/vp9_dthread.h"
#include "vp9/decoder/vp9_decoder.h"
-#if CONFIG_MULTITHREAD
-static INLINE void mutex_lock(pthread_mutex_t *const mutex) {
- const int kMaxTryLocks = 4000;
- int locked = 0;
- int i;
-
- for (i = 0; i < kMaxTryLocks; ++i) {
- if (!pthread_mutex_trylock(mutex)) {
- locked = 1;
- break;
- }
- }
+// #define DEBUG_THREAD
- if (!locked)
- pthread_mutex_lock(mutex);
+// TODO(hkuang): Clean up all the #ifdef in this file.
+void vp9_frameworker_lock_stats(VP9Worker *const worker) {
+#if CONFIG_MULTITHREAD
+ FrameWorkerData *const worker_data = worker->data1;
+ pthread_mutex_lock(&worker_data->stats_mutex);
+#else
+ (void)worker;
+#endif
}
-#endif // CONFIG_MULTITHREAD
-static INLINE void sync_read(VP9LfSync *const lf_sync, int r, int c) {
+void vp9_frameworker_unlock_stats(VP9Worker *const worker) {
#if CONFIG_MULTITHREAD
- const int nsync = lf_sync->sync_range;
-
- if (r && !(c & (nsync - 1))) {
- pthread_mutex_t *const mutex = &lf_sync->mutex_[r - 1];
- mutex_lock(mutex);
-
- while (c > lf_sync->cur_sb_col[r - 1] - nsync) {
- pthread_cond_wait(&lf_sync->cond_[r - 1], mutex);
- }
- pthread_mutex_unlock(mutex);
- }
+ FrameWorkerData *const worker_data = worker->data1;
+ pthread_mutex_unlock(&worker_data->stats_mutex);
#else
- (void)lf_sync;
- (void)r;
- (void)c;
-#endif // CONFIG_MULTITHREAD
+ (void)worker;
+#endif
}
-static INLINE void sync_write(VP9LfSync *const lf_sync, int r, int c,
- const int sb_cols) {
+void vp9_frameworker_signal_stats(VP9Worker *const worker) {
#if CONFIG_MULTITHREAD
- const int nsync = lf_sync->sync_range;
- int cur;
- // Only signal when there are enough filtered SB for next row to run.
- int sig = 1;
+ FrameWorkerData *const worker_data = worker->data1;
- if (c < sb_cols - 1) {
- cur = c;
- if (c % nsync)
- sig = 0;
- } else {
- cur = sb_cols + nsync;
- }
-
- if (sig) {
- mutex_lock(&lf_sync->mutex_[r]);
-
- lf_sync->cur_sb_col[r] = cur;
+// TODO(hkuang): Fix the pthread_cond_broadcast in windows wrapper.
+#if defined(_WIN32) && !HAVE_PTHREAD_H
+ pthread_cond_signal(&worker_data->stats_cond);
+#else
+ pthread_cond_broadcast(&worker_data->stats_cond);
+#endif
- pthread_cond_signal(&lf_sync->cond_[r]);
- pthread_mutex_unlock(&lf_sync->mutex_[r]);
- }
#else
- (void)lf_sync;
- (void)r;
- (void)c;
- (void)sb_cols;
-#endif // CONFIG_MULTITHREAD
+ (void)worker;
+#endif
}
-// Implement row loopfiltering for each thread.
-static void loop_filter_rows_mt(const YV12_BUFFER_CONFIG *const frame_buffer,
- VP9_COMMON *const cm,
- struct macroblockd_plane planes[MAX_MB_PLANE],
- int start, int stop, int y_only,
- VP9LfSync *const lf_sync, int num_lf_workers) {
- const int num_planes = y_only ? 1 : MAX_MB_PLANE;
- int r, c; // SB row and col
- const int sb_cols = mi_cols_aligned_to_sb(cm->mi_cols) >> MI_BLOCK_SIZE_LOG2;
-
- for (r = start; r < stop; r += num_lf_workers) {
- const int mi_row = r << MI_BLOCK_SIZE_LOG2;
- MODE_INFO *const mi = cm->mi + mi_row * cm->mi_stride;
-
- for (c = 0; c < sb_cols; ++c) {
- const int mi_col = c << MI_BLOCK_SIZE_LOG2;
- LOOP_FILTER_MASK lfm;
- int plane;
+// This macro prevents thread_sanitizer from reporting known concurrent writes.
+#if defined(__has_feature)
+#if __has_feature(thread_sanitizer)
+#define BUILDING_WITH_TSAN
+#endif
+#endif
- sync_read(lf_sync, r, c);
-
- vp9_setup_dst_planes(planes, frame_buffer, mi_row, mi_col);
- vp9_setup_mask(cm, mi_row, mi_col, mi + mi_col, cm->mi_stride, &lfm);
+// TODO(hkuang): Remove worker parameter as it is only used in debug code.
+void vp9_frameworker_wait(VP9Worker *const worker, RefCntBuffer *const ref_buf,
+ int row) {
+#if CONFIG_MULTITHREAD
+ if (!ref_buf)
+ return;
- for (plane = 0; plane < num_planes; ++plane) {
- vp9_filter_block_plane(cm, &planes[plane], mi_row, &lfm);
- }
+#ifndef BUILDING_WITH_TSAN
+  // The following check triggers a harmless tsan warning, but it is the key
+  // to getting the best performance.
+ if (ref_buf->row >= row && ref_buf->buf.corrupted != 1) return;
+#endif
- sync_write(lf_sync, r, c, sb_cols);
+ {
+    // Find the worker thread that owns the reference frame. If the reference
+    // frame has been fully decoded, it may not have an owner.
+ VP9Worker *const ref_worker = ref_buf->frame_worker_owner;
+ FrameWorkerData *const ref_worker_data =
+ (FrameWorkerData *)ref_worker->data1;
+ const VP9Decoder *const pbi = ref_worker_data->pbi;
+
+#ifdef DEBUG_THREAD
+ {
+ FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1;
+ printf("%d %p worker is waiting for %d %p worker (%d) ref %d \r\n",
+ worker_data->worker_id, worker, ref_worker_data->worker_id,
+ ref_buf->frame_worker_owner, row, ref_buf->row);
}
- }
-}
-
-// Row-based multi-threaded loopfilter hook
-static int loop_filter_row_worker(TileWorkerData *const tile_data,
- void *unused) {
- LFWorkerData *const lf_data = &tile_data->lfdata;
- (void)unused;
- loop_filter_rows_mt(lf_data->frame_buffer, lf_data->cm, lf_data->planes,
- lf_data->start, lf_data->stop, lf_data->y_only,
- lf_data->lf_sync, lf_data->num_lf_workers);
- return 1;
-}
-
-// VP9 decoder: Implement multi-threaded loopfilter that uses the tile
-// threads.
-void vp9_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame,
- VP9Decoder *pbi, VP9_COMMON *cm,
- int frame_filter_level,
- int y_only) {
- VP9LfSync *const lf_sync = &pbi->lf_row_sync;
- const VP9WorkerInterface *const winterface = vp9_get_worker_interface();
- // Number of superblock rows and cols
- const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2;
- const int tile_cols = 1 << cm->log2_tile_cols;
- const int num_workers = MIN(pbi->max_threads & ~1, tile_cols);
- int i;
-
- if (!frame_filter_level) return;
-
- if (!lf_sync->sync_range || cm->last_height != cm->height) {
- vp9_loop_filter_dealloc(lf_sync);
- vp9_loop_filter_alloc(lf_sync, cm, sb_rows, cm->width);
- }
-
- vp9_loop_filter_frame_init(cm, frame_filter_level);
-
- // Initialize cur_sb_col to -1 for all SB rows.
- vpx_memset(lf_sync->cur_sb_col, -1, sizeof(*lf_sync->cur_sb_col) * sb_rows);
-
- // Set up loopfilter thread data.
- // The decoder is using num_workers instead of pbi->num_tile_workers
- // because it has been observed that using more threads on the
- // loopfilter, than there are tile columns in the frame will hurt
- // performance on Android. This is because the system will only
- // schedule the tile decode workers on cores equal to the number
- // of tile columns. Then if the decoder tries to use more threads for the
- // loopfilter, it will hurt performance because of contention. If the
- // multithreading code changes in the future then the number of workers
- // used by the loopfilter should be revisited.
- for (i = 0; i < num_workers; ++i) {
- VP9Worker *const worker = &pbi->tile_workers[i];
- TileWorkerData *const tile_data = (TileWorkerData*)worker->data1;
- LFWorkerData *const lf_data = &tile_data->lfdata;
-
- worker->hook = (VP9WorkerHook)loop_filter_row_worker;
+#endif
- // Loopfilter data
- lf_data->frame_buffer = frame;
- lf_data->cm = cm;
- vp9_copy(lf_data->planes, pbi->mb.plane);
- lf_data->start = i;
- lf_data->stop = sb_rows;
- lf_data->y_only = y_only; // always do all planes in decoder
-
- lf_data->lf_sync = lf_sync;
- lf_data->num_lf_workers = num_workers;
-
- // Start loopfiltering
- if (i == num_workers - 1) {
- winterface->execute(worker);
- } else {
- winterface->launch(worker);
+ vp9_frameworker_lock_stats(ref_worker);
+ while (ref_buf->row < row && pbi->cur_buf == ref_buf &&
+ ref_buf->buf.corrupted != 1) {
+ pthread_cond_wait(&ref_worker_data->stats_cond,
+ &ref_worker_data->stats_mutex);
}
- }
- // Wait till all rows are finished
- for (i = 0; i < num_workers; ++i) {
- winterface->sync(&pbi->tile_workers[i]);
+ if (ref_buf->buf.corrupted == 1) {
+ FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1;
+ vp9_frameworker_unlock_stats(ref_worker);
+ vpx_internal_error(&worker_data->pbi->common.error,
+ VPX_CODEC_CORRUPT_FRAME,
+ "Worker %p failed to decode frame", worker);
+ }
+ vp9_frameworker_unlock_stats(ref_worker);
}
+#else
+ (void)worker;
+  (void)ref_buf;
+  (void)row;
+#endif // CONFIG_MULTITHREAD
}
-// Set up nsync by width.
-static int get_sync_range(int width) {
- // nsync numbers are picked by testing. For example, for 4k
- // video, using 4 gives best performance.
- if (width < 640)
- return 1;
- else if (width <= 1280)
- return 2;
- else if (width <= 4096)
- return 4;
- else
- return 8;
-}
-
-// Allocate memory for lf row synchronization
-void vp9_loop_filter_alloc(VP9LfSync *lf_sync, VP9_COMMON *cm, int rows,
- int width) {
- lf_sync->rows = rows;
+void vp9_frameworker_broadcast(RefCntBuffer *const buf, int row) {
#if CONFIG_MULTITHREAD
- {
- int i;
-
- CHECK_MEM_ERROR(cm, lf_sync->mutex_,
- vpx_malloc(sizeof(*lf_sync->mutex_) * rows));
- if (lf_sync->mutex_) {
- for (i = 0; i < rows; ++i) {
- pthread_mutex_init(&lf_sync->mutex_[i], NULL);
- }
- }
+ VP9Worker *worker = buf->frame_worker_owner;
- CHECK_MEM_ERROR(cm, lf_sync->cond_,
- vpx_malloc(sizeof(*lf_sync->cond_) * rows));
- if (lf_sync->cond_) {
- for (i = 0; i < rows; ++i) {
- pthread_cond_init(&lf_sync->cond_[i], NULL);
- }
- }
+#ifdef DEBUG_THREAD
+ {
+ FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1;
+ printf("%d %p worker decode to (%d) \r\n", worker_data->worker_id,
+ buf->frame_worker_owner, row);
}
-#endif // CONFIG_MULTITHREAD
+#endif
- CHECK_MEM_ERROR(cm, lf_sync->cur_sb_col,
- vpx_malloc(sizeof(*lf_sync->cur_sb_col) * rows));
-
- // Set up nsync.
- lf_sync->sync_range = get_sync_range(width);
+ vp9_frameworker_lock_stats(worker);
+ buf->row = row;
+ vp9_frameworker_signal_stats(worker);
+ vp9_frameworker_unlock_stats(worker);
+#else
+ (void)buf;
+ (void)row;
+#endif // CONFIG_MULTITHREAD
}
-// Deallocate lf synchronization related mutex and data
-void vp9_loop_filter_dealloc(VP9LfSync *lf_sync) {
- if (lf_sync != NULL) {
+void vp9_frameworker_copy_context(VP9Worker *const dst_worker,
+ VP9Worker *const src_worker) {
#if CONFIG_MULTITHREAD
- int i;
+ FrameWorkerData *const src_worker_data = (FrameWorkerData *)src_worker->data1;
+ FrameWorkerData *const dst_worker_data = (FrameWorkerData *)dst_worker->data1;
+ VP9_COMMON *const src_cm = &src_worker_data->pbi->common;
+ VP9_COMMON *const dst_cm = &dst_worker_data->pbi->common;
+ int i;
- if (lf_sync->mutex_ != NULL) {
- for (i = 0; i < lf_sync->rows; ++i) {
- pthread_mutex_destroy(&lf_sync->mutex_[i]);
- }
- vpx_free(lf_sync->mutex_);
- }
- if (lf_sync->cond_ != NULL) {
- for (i = 0; i < lf_sync->rows; ++i) {
- pthread_cond_destroy(&lf_sync->cond_[i]);
- }
- vpx_free(lf_sync->cond_);
- }
-#endif // CONFIG_MULTITHREAD
- vpx_free(lf_sync->cur_sb_col);
- // clear the structure as the source of this call may be a resize in which
- // case this call will be followed by an _alloc() which may fail.
- vp9_zero(*lf_sync);
+ // Wait until source frame's context is ready.
+ vp9_frameworker_lock_stats(src_worker);
+ while (!src_worker_data->frame_context_ready) {
+ pthread_cond_wait(&src_worker_data->stats_cond,
+ &src_worker_data->stats_mutex);
}
+
+ dst_cm->last_frame_seg_map = src_cm->seg.enabled ?
+ src_cm->current_frame_seg_map : src_cm->last_frame_seg_map;
+ dst_worker_data->pbi->need_resync = src_worker_data->pbi->need_resync;
+ vp9_frameworker_unlock_stats(src_worker);
+
+ dst_cm->bit_depth = src_cm->bit_depth;
+#if CONFIG_VP9_HIGHBITDEPTH
+ dst_cm->use_highbitdepth = src_cm->use_highbitdepth;
+#endif
+ dst_cm->prev_frame = src_cm->show_existing_frame ?
+ src_cm->prev_frame : src_cm->cur_frame;
+ dst_cm->last_width = !src_cm->show_existing_frame ?
+ src_cm->width : src_cm->last_width;
+ dst_cm->last_height = !src_cm->show_existing_frame ?
+ src_cm->height : src_cm->last_height;
+ dst_cm->subsampling_x = src_cm->subsampling_x;
+ dst_cm->subsampling_y = src_cm->subsampling_y;
+ dst_cm->frame_type = src_cm->frame_type;
+ dst_cm->last_show_frame = !src_cm->show_existing_frame ?
+ src_cm->show_frame : src_cm->last_show_frame;
+ for (i = 0; i < REF_FRAMES; ++i)
+ dst_cm->ref_frame_map[i] = src_cm->next_ref_frame_map[i];
+
+ memcpy(dst_cm->lf_info.lfthr, src_cm->lf_info.lfthr,
+ (MAX_LOOP_FILTER + 1) * sizeof(loop_filter_thresh));
+ dst_cm->lf.last_sharpness_level = src_cm->lf.sharpness_level;
+ dst_cm->lf.filter_level = src_cm->lf.filter_level;
+ memcpy(dst_cm->lf.ref_deltas, src_cm->lf.ref_deltas, MAX_REF_LF_DELTAS);
+ memcpy(dst_cm->lf.mode_deltas, src_cm->lf.mode_deltas, MAX_MODE_LF_DELTAS);
+ dst_cm->seg = src_cm->seg;
+ memcpy(dst_cm->frame_contexts, src_cm->frame_contexts,
+ FRAME_CONTEXTS * sizeof(dst_cm->frame_contexts[0]));
+#else
+ (void) dst_worker;
+ (void) src_worker;
+#endif // CONFIG_MULTITHREAD
}
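
The rewritten vp9_dthread.c above replaces the row-based loop-filter synchronization with a frame-level progress handshake: a consumer blocks on the reference frame's condition variable until decoding has passed the row it needs, and the producer periodically publishes its progress and broadcasts. A minimal pthreads sketch of that protocol, with RefProgress as a simplified stand-in for RefCntBuffer/FrameWorkerData:

    #include <pthread.h>

    typedef struct {
      pthread_mutex_t mutex;
      pthread_cond_t cond;
      int row;        /* last fully decoded row; -1 before decoding starts */
      int corrupted;
    } RefProgress;

    /* Consumer side, cf. vp9_frameworker_wait(). */
    static void wait_for_row(RefProgress *p, int row) {
      pthread_mutex_lock(&p->mutex);
      while (p->row < row && !p->corrupted)
        pthread_cond_wait(&p->cond, &p->mutex);
      pthread_mutex_unlock(&p->mutex);
    }

    /* Producer side, cf. vp9_frameworker_broadcast(). */
    static void publish_row(RefProgress *p, int row) {
      pthread_mutex_lock(&p->mutex);
      p->row = row;
      pthread_cond_broadcast(&p->cond);  /* wake every waiting worker */
      pthread_mutex_unlock(&p->mutex);
    }

The unsynchronized early-out in the real code is an optimization layered on top of this: if the published row is already far enough along, the consumer skips the lock entirely.
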
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_dthread.h b/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_dthread.h
index b1fbdeb74a0..979cb3d8bd6 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_dthread.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_dthread.h
@@ -13,46 +13,54 @@
#include "./vpx_config.h"
#include "vp9/common/vp9_thread.h"
-#include "vp9/decoder/vp9_reader.h"
+#include "vpx/internal/vpx_codec_internal.h"
struct VP9Common;
struct VP9Decoder;
-typedef struct TileWorkerData {
- struct VP9Common *cm;
- vp9_reader bit_reader;
- DECLARE_ALIGNED(16, struct macroblockd, xd);
+// WorkerData for the FrameWorker thread. It contains all the worker state
+// and the decode structures needed to decode a frame.
+typedef struct FrameWorkerData {
+ struct VP9Decoder *pbi;
+ const uint8_t *data;
+ const uint8_t *data_end;
+ size_t data_size;
+ void *user_priv;
+ int result;
+ int worker_id;
+ int received_frame;
- // Row-based parallel loopfilter data
- LFWorkerData lfdata;
-} TileWorkerData;
+  // scratch_buffer is used in frame-parallel mode only; it holds a copy of
+  // the compressed data.
+ uint8_t *scratch_buffer;
+ size_t scratch_buffer_size;
-// Loopfilter row synchronization
-typedef struct VP9LfSyncData {
#if CONFIG_MULTITHREAD
- pthread_mutex_t *mutex_;
- pthread_cond_t *cond_;
+ pthread_mutex_t stats_mutex;
+ pthread_cond_t stats_cond;
#endif
- // Allocate memory to store the loop-filtered superblock index in each row.
- int *cur_sb_col;
- // The optimal sync_range for different resolution and platform should be
- // determined by testing. Currently, it is chosen to be a power-of-2 number.
- int sync_range;
- int rows;
-} VP9LfSync;
-
-// Allocate memory for loopfilter row synchronization.
-void vp9_loop_filter_alloc(VP9LfSync *lf_sync, VP9_COMMON *cm, int rows,
- int width);
-
-// Deallocate loopfilter synchronization related mutex and data.
-void vp9_loop_filter_dealloc(VP9LfSync *lf_sync);
-
-// Multi-threaded loopfilter that uses the tile threads.
-void vp9_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame,
- struct VP9Decoder *pbi,
- struct VP9Common *cm,
- int frame_filter_level,
- int y_only);
+
+ int frame_context_ready; // Current frame's context is ready to read.
+ int frame_decoded; // Finished decoding current frame.
+} FrameWorkerData;
+
+void vp9_frameworker_lock_stats(VP9Worker *const worker);
+void vp9_frameworker_unlock_stats(VP9Worker *const worker);
+void vp9_frameworker_signal_stats(VP9Worker *const worker);
+
+// Wait until ref_buf has been decoded up to row, in real pixel units.
+// Note: the worker may already have finished decoding ref_buf and released it
+// in order to start decoding the next frame, so callers must check whether the
+// worker is still decoding ref_buf.
+void vp9_frameworker_wait(VP9Worker *const worker, RefCntBuffer *const ref_buf,
+ int row);
+
+// FrameWorker broadcasts its decoding progress so other workers that are
+// waiting on it can resume decoding.
+void vp9_frameworker_broadcast(RefCntBuffer *const buf, int row);
+
+// Copy necessary decoding context from src worker to dst worker.
+void vp9_frameworker_copy_context(VP9Worker *const dst_worker,
+ VP9Worker *const src_worker);
#endif // VP9_DECODER_VP9_DTHREAD_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_read_bit_buffer.c b/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_read_bit_buffer.c
index 3eef72844c1..c3b38a9c710 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_read_bit_buffer.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_read_bit_buffer.c
@@ -10,20 +10,20 @@
#include "vp9/decoder/vp9_read_bit_buffer.h"
size_t vp9_rb_bytes_read(struct vp9_read_bit_buffer *rb) {
- return (rb->bit_offset + CHAR_BIT - 1) / CHAR_BIT;
+ return (rb->bit_offset + 7) >> 3;
}
int vp9_rb_read_bit(struct vp9_read_bit_buffer *rb) {
const size_t off = rb->bit_offset;
- const size_t p = off / CHAR_BIT;
- const int q = CHAR_BIT - 1 - (int)off % CHAR_BIT;
- if (rb->bit_buffer + p >= rb->bit_buffer_end) {
- rb->error_handler(rb->error_handler_data);
- return 0;
- } else {
- const int bit = (rb->bit_buffer[p] & (1 << q)) >> q;
+ const size_t p = off >> 3;
+ const int q = 7 - (int)(off & 0x7);
+ if (rb->bit_buffer + p < rb->bit_buffer_end) {
+ const int bit = (rb->bit_buffer[p] >> q) & 1;
rb->bit_offset = off + 1;
return bit;
+ } else {
+ rb->error_handler(rb->error_handler_data);
+ return 0;
}
}
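
The rewrite above trades CHAR_BIT divisions for shifts and masks, relying on two identities that hold for 8-bit bytes: (off + 7) >> 3 is the number of bytes needed to hold off bits, and (buf[off >> 3] >> (7 - (off & 7))) & 1 extracts bit off in MSB-first order. A quick self-check, assuming CHAR_BIT == 8 as the new code does:

    #include <assert.h>
    #include <stddef.h>

    static int get_bit(const unsigned char *buf, size_t off) {
      return (buf[off >> 3] >> (7 - (int)(off & 7))) & 1;
    }

    int main(void) {
      const unsigned char b[1] = { 0xA5 };          /* 1010 0101 */
      assert(get_bit(b, 0) == 1 && get_bit(b, 1) == 0);
      assert(get_bit(b, 6) == 0 && get_bit(b, 7) == 1);
      assert((((size_t)9) + 7) >> 3 == 2);          /* 9 bits -> 2 bytes */
      return 0;
    }
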
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_reader.h b/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_reader.h
index 2d9eccfbf93..a68a1d5925b 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_reader.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_reader.h
@@ -30,14 +30,15 @@ typedef size_t BD_VALUE;
#define BD_VALUE_SIZE ((int)sizeof(BD_VALUE) * CHAR_BIT)
typedef struct {
- const uint8_t *buffer_end;
- const uint8_t *buffer;
- uint8_t clear_buffer[sizeof(BD_VALUE) + 1];
+  // Be careful when reordering this struct; it may impact cache performance.
BD_VALUE value;
- int count;
unsigned int range;
+ int count;
+ const uint8_t *buffer_end;
+ const uint8_t *buffer;
vpx_decrypt_cb decrypt_cb;
void *decrypt_state;
+ uint8_t clear_buffer[sizeof(BD_VALUE) + 1];
} vp9_reader;
int vp9_reader_init(vp9_reader *r,
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/arm/neon/vp9_avg_neon.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/arm/neon/vp9_avg_neon.c
new file mode 100644
index 00000000000..f505fcb7ac0
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/arm/neon/vp9_avg_neon.c
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include "./vp9_rtcd.h"
+#include "./vpx_config.h"
+
+#include "vpx/vpx_integer.h"
+
+static INLINE unsigned int horizontal_add_u16x8(const uint16x8_t v_16x8) {
+ const uint32x4_t a = vpaddlq_u16(v_16x8);
+ const uint64x2_t b = vpaddlq_u32(a);
+ const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
+ vreinterpret_u32_u64(vget_high_u64(b)));
+ return vget_lane_u32(c, 0);
+}
+
+unsigned int vp9_avg_8x8_neon(const uint8_t *s, int p) {
+ uint8x8_t v_s0 = vld1_u8(s);
+ const uint8x8_t v_s1 = vld1_u8(s + p);
+ uint16x8_t v_sum = vaddl_u8(v_s0, v_s1);
+
+ v_s0 = vld1_u8(s + 2 * p);
+ v_sum = vaddw_u8(v_sum, v_s0);
+
+ v_s0 = vld1_u8(s + 3 * p);
+ v_sum = vaddw_u8(v_sum, v_s0);
+
+ v_s0 = vld1_u8(s + 4 * p);
+ v_sum = vaddw_u8(v_sum, v_s0);
+
+ v_s0 = vld1_u8(s + 5 * p);
+ v_sum = vaddw_u8(v_sum, v_s0);
+
+ v_s0 = vld1_u8(s + 6 * p);
+ v_sum = vaddw_u8(v_sum, v_s0);
+
+ v_s0 = vld1_u8(s + 7 * p);
+ v_sum = vaddw_u8(v_sum, v_s0);
+
+ return (horizontal_add_u16x8(v_sum) + 32) >> 6;
+}
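
The NEON routine above averages an 8x8 block with widening pairwise adds; its scalar meaning is simply "sum 64 pixels, then divide by 64 with rounding", where (sum + 32) >> 6 is the rounded divide. A scalar reference one might use to cross-check the NEON path (an assumed test helper, not part of the patch):

    #include <stdint.h>

    static unsigned int avg_8x8_c(const uint8_t *s, int p) {
      unsigned int sum = 0;
      int r, c;
      for (r = 0; r < 8; ++r)
        for (c = 0; c < 8; ++c)
          sum += s[r * p + c];     /* p is the row stride */
      return (sum + 32) >> 6;      /* round-to-nearest divide by 64 */
    }
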
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/arm/neon/vp9_dct_neon.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/arm/neon/vp9_dct_neon.c
index 6c66f5d5bc9..a6d4797adae 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/arm/neon/vp9_dct_neon.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/arm/neon/vp9_dct_neon.c
@@ -32,6 +32,24 @@ void vp9_fdct8x8_1_neon(const int16_t *input, int16_t *output, int stride) {
}
}
+void vp9_fdct8x8_quant_neon(const int16_t *input, int stride,
+ int16_t* coeff_ptr, intptr_t n_coeffs,
+ int skip_block, const int16_t* zbin_ptr,
+ const int16_t* round_ptr, const int16_t* quant_ptr,
+ const int16_t* quant_shift_ptr,
+ int16_t* qcoeff_ptr, int16_t* dqcoeff_ptr,
+ const int16_t* dequant_ptr, uint16_t* eob_ptr,
+ const int16_t* scan_ptr,
+ const int16_t* iscan_ptr) {
+ int16_t temp_buffer[64];
+ (void)coeff_ptr;
+
+ vp9_fdct8x8_neon(input, temp_buffer, stride);
+ vp9_quantize_fp_neon(temp_buffer, n_coeffs, skip_block, zbin_ptr, round_ptr,
+ quant_ptr, quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr,
+ dequant_ptr, eob_ptr, scan_ptr, iscan_ptr);
+}
+
void vp9_fdct8x8_neon(const int16_t *input, int16_t *final_output, int stride) {
int i;
// stage 1
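
The new vp9_fdct8x8_quant_neon() above fuses the forward transform and fast quantization by chaining the two existing NEON kernels through a 64-entry stack buffer, so the coefficients never round-trip through the caller's coefficient array. The pattern in the abstract, with function pointers as hypothetical stand-ins for the vp9_fdct8x8_neon / vp9_quantize_fp_neon pair:

    /* transform() and quantize() stand in for the chained NEON kernels. */
    static void fdct_quant_fused(const short *input, int stride, int n_coeffs,
                                 void (*transform)(const short *, short *, int),
                                 void (*quantize)(const short *, int)) {
      short temp[64];                  /* one 8x8 block, kept on the stack */
      transform(input, temp, stride);  /* forward DCT into temp */
      quantize(temp, n_coeffs);        /* consume coefficients while warm */
    }
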
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c
index 8c13d0da672..47363c75ba5 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c
@@ -26,13 +26,12 @@ void vp9_quantize_fp_neon(const int16_t *coeff_ptr, intptr_t count,
const int16_t *round_ptr, const int16_t *quant_ptr,
const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr,
int16_t *dqcoeff_ptr, const int16_t *dequant_ptr,
- int zbin_oq_value, uint16_t *eob_ptr,
+ uint16_t *eob_ptr,
const int16_t *scan, const int16_t *iscan) {
   // TODO(jingning) Decide whether these arguments are needed once the
   // quantization process is completed.
(void)zbin_ptr;
(void)quant_shift_ptr;
- (void)zbin_oq_value;
(void)scan;
if (!skip_block) {
@@ -112,8 +111,8 @@ void vp9_quantize_fp_neon(const int16_t *coeff_ptr, intptr_t count,
*eob_ptr = (uint16_t)vget_lane_s16(v_eobmax_final, 0);
}
} else {
- vpx_memset(qcoeff_ptr, 0, count * sizeof(int16_t));
- vpx_memset(dqcoeff_ptr, 0, count * sizeof(int16_t));
+ memset(qcoeff_ptr, 0, count * sizeof(int16_t));
+ memset(dqcoeff_ptr, 0, count * sizeof(int16_t));
*eob_ptr = 0;
}
}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/arm/neon/vp9_variance_neon.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/arm/neon/vp9_variance_neon.c
index 816fbda1fbe..cf82dd75d92 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/arm/neon/vp9_variance_neon.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/arm/neon/vp9_variance_neon.c
@@ -10,6 +10,7 @@
#include <arm_neon.h>
#include "./vp9_rtcd.h"
+#include "./vpx_config.h"
#include "vpx_ports/mem.h"
#include "vpx/vpx_integer.h"
@@ -19,18 +20,6 @@
#include "vp9/encoder/vp9_variance.h"
-enum { kWidth8 = 8 };
-enum { kHeight8 = 8 };
-enum { kHeight8PlusOne = 9 };
-enum { kWidth16 = 16 };
-enum { kHeight16 = 16 };
-enum { kHeight16PlusOne = 17 };
-enum { kWidth32 = 32 };
-enum { kHeight32 = 32 };
-enum { kHeight32PlusOne = 33 };
-enum { kPixelStepOne = 1 };
-enum { kAlign16 = 16 };
-
static INLINE int horizontal_add_s16x8(const int16x8_t v_16x8) {
const int32x4_t a = vpaddlq_s16(v_16x8);
const int64x2_t b = vpaddlq_s32(a);
@@ -46,9 +35,10 @@ static INLINE int horizontal_add_s32x4(const int32x4_t v_32x4) {
return vget_lane_s32(c, 0);
}
+// w * h must be less than 2048 or local variable v_sum may overflow.
static void variance_neon_w8(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
- int w, int h, unsigned int *sse, int *sum) {
+ int w, int h, uint32_t *sse, int *sum) {
int i, j;
int16x8_t v_sum = vdupq_n_s16(0);
int32x4_t v_sse_lo = vdupq_n_s32(0);
@@ -79,31 +69,31 @@ static void variance_neon_w8(const uint8_t *a, int a_stride,
void vp9_get8x8var_neon(const uint8_t *src_ptr, int source_stride,
const uint8_t *ref_ptr, int ref_stride,
unsigned int *sse, int *sum) {
- variance_neon_w8(src_ptr, source_stride, ref_ptr, ref_stride, kWidth8,
- kHeight8, sse, sum);
+ variance_neon_w8(src_ptr, source_stride, ref_ptr, ref_stride, 8,
+ 8, sse, sum);
}
unsigned int vp9_variance8x8_neon(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
unsigned int *sse) {
int sum;
- variance_neon_w8(a, a_stride, b, b_stride, kWidth8, kHeight8, sse, &sum);
- return *sse - (((int64_t)sum * sum) / (kWidth8 * kHeight8));
+ variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, &sum);
+  return *sse - (((int64_t)sum * sum) >> 6);  // >> 6 = / (8 * 8)
}
void vp9_get16x16var_neon(const uint8_t *src_ptr, int source_stride,
const uint8_t *ref_ptr, int ref_stride,
unsigned int *sse, int *sum) {
- variance_neon_w8(src_ptr, source_stride, ref_ptr, ref_stride, kWidth16,
- kHeight16, sse, sum);
+ variance_neon_w8(src_ptr, source_stride, ref_ptr, ref_stride, 16,
+ 16, sse, sum);
}
unsigned int vp9_variance16x16_neon(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
unsigned int *sse) {
int sum;
- variance_neon_w8(a, a_stride, b, b_stride, kWidth16, kHeight16, sse, &sum);
- return *sse - (((int64_t)sum * sum) / (kWidth16 * kHeight16));
+ variance_neon_w8(a, a_stride, b, b_stride, 16, 16, sse, &sum);
+  return *sse - (((int64_t)sum * sum) >> 8);  // >> 8 = / (16 * 16)
}
static void var_filter_block2d_bil_w8(const uint8_t *src_ptr,
@@ -164,15 +154,15 @@ unsigned int vp9_sub_pixel_variance8x8_neon(const uint8_t *src,
const uint8_t *dst,
int dst_stride,
unsigned int *sse) {
- DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, temp2, kHeight8 * kWidth8);
- DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, fdata3, kHeight8PlusOne * kWidth8);
+ DECLARE_ALIGNED(16, uint8_t, temp2[8 * 8]);
+ DECLARE_ALIGNED(16, uint8_t, fdata3[9 * 8]);
- var_filter_block2d_bil_w8(src, fdata3, src_stride, kPixelStepOne,
- kHeight8PlusOne, kWidth8,
+ var_filter_block2d_bil_w8(src, fdata3, src_stride, 1,
+ 9, 8,
BILINEAR_FILTERS_2TAP(xoffset));
- var_filter_block2d_bil_w8(fdata3, temp2, kWidth8, kWidth8, kHeight8,
- kWidth8, BILINEAR_FILTERS_2TAP(yoffset));
- return vp9_variance8x8_neon(temp2, kWidth8, dst, dst_stride, sse);
+ var_filter_block2d_bil_w8(fdata3, temp2, 8, 8, 8,
+ 8, BILINEAR_FILTERS_2TAP(yoffset));
+ return vp9_variance8x8_neon(temp2, 8, dst, dst_stride, sse);
}
unsigned int vp9_sub_pixel_variance16x16_neon(const uint8_t *src,
@@ -182,30 +172,85 @@ unsigned int vp9_sub_pixel_variance16x16_neon(const uint8_t *src,
const uint8_t *dst,
int dst_stride,
unsigned int *sse) {
- DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, temp2, kHeight16 * kWidth16);
- DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, fdata3, kHeight16PlusOne * kWidth16);
+ DECLARE_ALIGNED(16, uint8_t, temp2[16 * 16]);
+ DECLARE_ALIGNED(16, uint8_t, fdata3[17 * 16]);
- var_filter_block2d_bil_w16(src, fdata3, src_stride, kPixelStepOne,
- kHeight16PlusOne, kWidth16,
+ var_filter_block2d_bil_w16(src, fdata3, src_stride, 1,
+ 17, 16,
BILINEAR_FILTERS_2TAP(xoffset));
- var_filter_block2d_bil_w16(fdata3, temp2, kWidth16, kWidth16, kHeight16,
- kWidth16, BILINEAR_FILTERS_2TAP(yoffset));
- return vp9_variance16x16_neon(temp2, kWidth16, dst, dst_stride, sse);
+ var_filter_block2d_bil_w16(fdata3, temp2, 16, 16, 16,
+ 16, BILINEAR_FILTERS_2TAP(yoffset));
+ return vp9_variance16x16_neon(temp2, 16, dst, dst_stride, sse);
}
void vp9_get32x32var_neon(const uint8_t *src_ptr, int source_stride,
const uint8_t *ref_ptr, int ref_stride,
unsigned int *sse, int *sum) {
- variance_neon_w8(src_ptr, source_stride, ref_ptr, ref_stride, kWidth32,
- kHeight32, sse, sum);
+ variance_neon_w8(src_ptr, source_stride, ref_ptr, ref_stride, 32,
+ 32, sse, sum);
}
unsigned int vp9_variance32x32_neon(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
unsigned int *sse) {
int sum;
- variance_neon_w8(a, a_stride, b, b_stride, kWidth32, kHeight32, sse, &sum);
- return *sse - (((int64_t)sum * sum) / (kWidth32 * kHeight32));
+ variance_neon_w8(a, a_stride, b, b_stride, 32, 32, sse, &sum);
+  return *sse - (((int64_t)sum * sum) >> 10);  // >> 10 = / (32 * 32)
+}
+
+unsigned int vp9_variance32x64_neon(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ unsigned int *sse) {
+ int sum1, sum2;
+ uint32_t sse1, sse2;
+ variance_neon_w8(a, a_stride, b, b_stride, 32, 32, &sse1, &sum1);
+ variance_neon_w8(a + (32 * a_stride), a_stride,
+ b + (32 * b_stride), b_stride, 32, 32,
+ &sse2, &sum2);
+ *sse = sse1 + sse2;
+ sum1 += sum2;
+  return *sse - (((int64_t)sum1 * sum1) >> 11);  // >> 11 = / (32 * 64)
+}
+
+unsigned int vp9_variance64x32_neon(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ unsigned int *sse) {
+ int sum1, sum2;
+ uint32_t sse1, sse2;
+ variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1);
+ variance_neon_w8(a + (16 * a_stride), a_stride,
+ b + (16 * b_stride), b_stride, 64, 16,
+ &sse2, &sum2);
+ *sse = sse1 + sse2;
+ sum1 += sum2;
+  return *sse - (((int64_t)sum1 * sum1) >> 11);  // >> 11 = / (64 * 32)
+}
+
+unsigned int vp9_variance64x64_neon(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ unsigned int *sse) {
+ int sum1, sum2;
+ uint32_t sse1, sse2;
+
+ variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1);
+ variance_neon_w8(a + (16 * a_stride), a_stride,
+ b + (16 * b_stride), b_stride, 64, 16,
+ &sse2, &sum2);
+ sse1 += sse2;
+ sum1 += sum2;
+
+ variance_neon_w8(a + (16 * 2 * a_stride), a_stride,
+ b + (16 * 2 * b_stride), b_stride,
+ 64, 16, &sse2, &sum2);
+ sse1 += sse2;
+ sum1 += sum2;
+
+ variance_neon_w8(a + (16 * 3 * a_stride), a_stride,
+ b + (16 * 3 * b_stride), b_stride,
+ 64, 16, &sse2, &sum2);
+ *sse = sse1 + sse2;
+ sum1 += sum2;
+  return *sse - (((int64_t)sum1 * sum1) >> 12);  // >> 12 = / (64 * 64)
}
unsigned int vp9_sub_pixel_variance32x32_neon(const uint8_t *src,
@@ -215,13 +260,31 @@ unsigned int vp9_sub_pixel_variance32x32_neon(const uint8_t *src,
const uint8_t *dst,
int dst_stride,
unsigned int *sse) {
- DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, temp2, kHeight32 * kWidth32);
- DECLARE_ALIGNED_ARRAY(kAlign16, uint8_t, fdata3, kHeight32PlusOne * kWidth32);
+ DECLARE_ALIGNED(16, uint8_t, temp2[32 * 32]);
+ DECLARE_ALIGNED(16, uint8_t, fdata3[33 * 32]);
+
+ var_filter_block2d_bil_w16(src, fdata3, src_stride, 1,
+ 33, 32,
+ BILINEAR_FILTERS_2TAP(xoffset));
+ var_filter_block2d_bil_w16(fdata3, temp2, 32, 32, 32,
+ 32, BILINEAR_FILTERS_2TAP(yoffset));
+ return vp9_variance32x32_neon(temp2, 32, dst, dst_stride, sse);
+}
+
+unsigned int vp9_sub_pixel_variance64x64_neon(const uint8_t *src,
+ int src_stride,
+ int xoffset,
+ int yoffset,
+ const uint8_t *dst,
+ int dst_stride,
+ unsigned int *sse) {
+ DECLARE_ALIGNED(16, uint8_t, temp2[64 * 64]);
+ DECLARE_ALIGNED(16, uint8_t, fdata3[65 * 64]);
- var_filter_block2d_bil_w16(src, fdata3, src_stride, kPixelStepOne,
- kHeight32PlusOne, kWidth32,
+ var_filter_block2d_bil_w16(src, fdata3, src_stride, 1,
+ 65, 64,
BILINEAR_FILTERS_2TAP(xoffset));
- var_filter_block2d_bil_w16(fdata3, temp2, kWidth32, kWidth32, kHeight32,
- kWidth32, BILINEAR_FILTERS_2TAP(yoffset));
- return vp9_variance32x32_neon(temp2, kWidth32, dst, dst_stride, sse);
+ var_filter_block2d_bil_w16(fdata3, temp2, 64, 64, 64,
+ 64, BILINEAR_FILTERS_2TAP(yoffset));
+ return vp9_variance64x64_neon(temp2, 64, dst, dst_stride, sse);
}
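
Two threads run through the variance changes above: the divide in var = sse - sum^2 / (w * h) becomes a shift because every block size here has a power-of-two area, and the 64-pixel-wide sizes are accumulated in 64x16 slices so the 16-bit row accumulator respects the w * h < 2048 limit noted earlier. A scalar model of the computation (shift = log2(w * h); illustrative, not the libvpx C reference):

    #include <stdint.h>

    static unsigned int variance_c(const uint8_t *a, int a_stride,
                                   const uint8_t *b, int b_stride,
                                   int w, int h, int shift) {
      int64_t sum = 0;
      uint32_t sse = 0;
      int i, j;
      for (i = 0; i < h; ++i)
        for (j = 0; j < w; ++j) {
          const int d = a[i * a_stride + j] - b[i * b_stride + j];
          sum += d;                    /* signed sum of differences */
          sse += (uint32_t)(d * d);    /* sum of squared differences */
        }
      /* e.g. shift = 12 for 64x64, matching ">> 12 = / (64 * 64)" above. */
      return sse - (unsigned int)((sum * sum) >> shift);
    }
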
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_complexity.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_complexity.c
index f7fca0cde0a..9622ba1d67e 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_complexity.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_complexity.c
@@ -11,22 +11,34 @@
#include <limits.h>
#include <math.h>
+#include "vp9/encoder/vp9_aq_variance.h"
+#include "vp9/encoder/vp9_encodeframe.h"
#include "vp9/common/vp9_seg_common.h"
-
#include "vp9/encoder/vp9_segmentation.h"
-#define AQ_C_SEGMENTS 3
-#define AQ_C_STRENGTHS 3
-static const int aq_c_active_segments[AQ_C_STRENGTHS] = {1, 2, 3};
+#define AQ_C_SEGMENTS 5
+#define DEFAULT_AQ2_SEG 3 // Neutral Q segment
+#define AQ_C_STRENGTHS 3
static const double aq_c_q_adj_factor[AQ_C_STRENGTHS][AQ_C_SEGMENTS] =
- {{1.0, 1.0, 1.0}, {1.0, 2.0, 1.0}, {1.0, 1.5, 2.5}};
+ { {1.75, 1.25, 1.05, 1.00, 0.90},
+ {2.00, 1.50, 1.15, 1.00, 0.85},
+ {2.50, 1.75, 1.25, 1.00, 0.80} };
static const double aq_c_transitions[AQ_C_STRENGTHS][AQ_C_SEGMENTS] =
- {{1.0, 1.0, 1.0}, {1.0, 0.25, 0.0}, {1.0, 0.5, 0.25}};
+ { {0.15, 0.30, 0.55, 2.00, 100.0},
+ {0.20, 0.40, 0.65, 2.00, 100.0},
+ {0.25, 0.50, 0.75, 2.00, 100.0} };
+static const double aq_c_var_thresholds[AQ_C_STRENGTHS][AQ_C_SEGMENTS] =
+ { {-4.0, -3.0, -2.0, 100.00, 100.0},
+ {-3.5, -2.5, -1.5, 100.00, 100.0},
+ {-3.0, -2.0, -1.0, 100.00, 100.0} };
+
+#define DEFAULT_COMPLEXITY 64
+
static int get_aq_c_strength(int q_index, vpx_bit_depth_t bit_depth) {
// Approximate base quantizer (truncated to int)
const int base_quant = vp9_ac_quant(q_index, 0, bit_depth) / 4;
- return (base_quant > 20) + (base_quant > 45);
+ return (base_quant > 10) + (base_quant > 25);
}
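A worked example with an illustrative quantizer value: if vp9_ac_quant() returns 88, then base_quant = 88 / 4 = 22, so the strength is (22 > 10) + (22 > 25) = 1, selecting the middle rows of the tables above. The new 10/25 breakpoints move frames into the stronger settings at a much lower Q than the old 20/45 pair.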
void vp9_setup_in_frame_q_adj(VP9_COMP *cpi) {
@@ -41,13 +53,9 @@ void vp9_setup_in_frame_q_adj(VP9_COMP *cpi) {
(cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref)) {
int segment;
const int aq_strength = get_aq_c_strength(cm->base_qindex, cm->bit_depth);
- const int active_segments = aq_c_active_segments[aq_strength];
// Clear down the segment map.
- vpx_memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols);
-
- // Clear down the complexity map used for rd.
- vpx_memset(cpi->complexity_map, 0, cm->mi_rows * cm->mi_cols);
+ memset(cpi->segmentation_map, DEFAULT_AQ2_SEG, cm->mi_rows * cm->mi_cols);
vp9_clearall_segfeatures(seg);
@@ -63,15 +71,21 @@ void vp9_setup_in_frame_q_adj(VP9_COMP *cpi) {
// Select delta coding method.
seg->abs_delta = SEGMENT_DELTADATA;
- // Segment 0 "Q" feature is disabled so it defaults to the baseline Q.
- vp9_disable_segfeature(seg, 0, SEG_LVL_ALT_Q);
+ // Default segment "Q" feature is disabled so it defaults to the baseline Q.
+ vp9_disable_segfeature(seg, DEFAULT_AQ2_SEG, SEG_LVL_ALT_Q);
// Use some of the segments for in frame Q adjustment.
- for (segment = 1; segment < active_segments; ++segment) {
- int qindex_delta =
- vp9_compute_qdelta_by_rate(&cpi->rc, cm->frame_type, cm->base_qindex,
- aq_c_q_adj_factor[aq_strength][segment],
- cm->bit_depth);
+ for (segment = 0; segment < AQ_C_SEGMENTS; ++segment) {
+ int qindex_delta;
+
+ if (segment == DEFAULT_AQ2_SEG)
+ continue;
+
+ qindex_delta =
+ vp9_compute_qdelta_by_rate(&cpi->rc, cm->frame_type, cm->base_qindex,
+ aq_c_q_adj_factor[aq_strength][segment],
+ cm->bit_depth);
+
// For AQ complexity mode, we don't allow Q0 in a segment if the base
// Q is not 0. Q0 (lossless) implies 4x4 only and in AQ mode 2 a segment
@@ -88,55 +102,54 @@ void vp9_setup_in_frame_q_adj(VP9_COMP *cpi) {
}
}
-// Select a segment for the current SB64 block.
+#define DEFAULT_LV_THRESH 10.0
+#define MIN_DEFAULT_LV_THRESH 8.0
+#define VAR_STRENGTH_STEP 0.25
+// Select a segment for the current block.
// The choice of segment for a block depends on the ratio of the projected
-// bits for the block vs a target average.
-// An "aq_strength" value determines how many segments are supported,
-// the set of transition points to use and the extent of the quantizer
-// adjustment for each segment (configured in vp9_setup_in_frame_q_adj()).
-void vp9_select_in_frame_q_segment(VP9_COMP *cpi,
- int mi_row, int mi_col,
- int output_enabled, int projected_rate) {
+// bits for the block vs a target average and its spatial complexity.
+void vp9_caq_select_segment(VP9_COMP *cpi, MACROBLOCK *mb, BLOCK_SIZE bs,
+ int mi_row, int mi_col, int projected_rate) {
VP9_COMMON *const cm = &cpi->common;
const int mi_offset = mi_row * cm->mi_cols + mi_col;
const int bw = num_8x8_blocks_wide_lookup[BLOCK_64X64];
const int bh = num_8x8_blocks_high_lookup[BLOCK_64X64];
- const int xmis = MIN(cm->mi_cols - mi_col, bw);
- const int ymis = MIN(cm->mi_rows - mi_row, bh);
- int complexity_metric = 64;
+ const int xmis = MIN(cm->mi_cols - mi_col, num_8x8_blocks_wide_lookup[bs]);
+ const int ymis = MIN(cm->mi_rows - mi_row, num_8x8_blocks_high_lookup[bs]);
int x, y;
-
+ int i;
unsigned char segment;
- if (!output_enabled) {
- segment = 0;
+ if (0) {
+ segment = DEFAULT_AQ2_SEG;
} else {
// Rate depends on fraction of a SB64 in frame ((xmis * ymis) / (bw * bh)).
// It is converted to bits * 256 units.
const int target_rate = (cpi->rc.sb64_target_rate * xmis * ymis * 256) /
(bw * bh);
+ double logvar;
+ double low_var_thresh;
const int aq_strength = get_aq_c_strength(cm->base_qindex, cm->bit_depth);
- const int active_segments = aq_c_active_segments[aq_strength];
-
- // The number of segments considered and the transition points used to
- // select them is determined by the "aq_strength" value.
- // Currently this loop only supports segments that reduce Q (i.e. where
- // there is undershoot.
- // The loop counts down towards segment 0 which is the default segment
- // with no Q adjustment.
- segment = active_segments - 1;
- while (segment > 0) {
- if (projected_rate <
- (target_rate * aq_c_transitions[aq_strength][segment])) {
+
+ vp9_clear_system_state();
+ low_var_thresh = (cpi->oxcf.pass == 2)
+ ? MAX(cpi->twopass.mb_av_energy, MIN_DEFAULT_LV_THRESH)
+ : DEFAULT_LV_THRESH;
+
+ vp9_setup_src_planes(mb, cpi->Source, mi_row, mi_col);
+ logvar = vp9_log_block_var(cpi, mb, bs);
+
+ segment = AQ_C_SEGMENTS - 1; // In case there is no break out below.
+ for (i = 0; i < AQ_C_SEGMENTS; ++i) {
+ // Test rate against a threshold value and variance against a threshold.
+ // Increasing segment number (higher variance and complexity) = higher Q.
+ if ((projected_rate <
+ target_rate * aq_c_transitions[aq_strength][i]) &&
+ (logvar < (low_var_thresh + aq_c_var_thresholds[aq_strength][i]))) {
+ segment = i;
break;
}
- --segment;
- }
-
- if (target_rate > 0) {
- complexity_metric =
- clamp((int)((projected_rate * 64) / target_rate), 16, 255);
}
}
@@ -144,8 +157,6 @@ void vp9_select_in_frame_q_segment(VP9_COMP *cpi,
for (y = 0; y < ymis; y++) {
for (x = 0; x < xmis; x++) {
cpi->segmentation_map[mi_offset + y * cm->mi_cols + x] = segment;
- cpi->complexity_map[mi_offset + y * cm->mi_cols + x] =
- (unsigned char)complexity_metric;
}
}
}
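A worked pass through the selection loop above, with illustrative numbers: at aq_strength 1 with low_var_thresh 10.0, segment 0 requires projected_rate < 0.20 * target_rate and logvar < 6.5, segment 1 requires rate < 0.40 * target_rate and logvar < 7.5, and so on up the rows. A block at 30% of the target rate with logvar 7.0 therefore lands in segment 1 (Q adjustment factor 1.50), while a block at twice the target rate or more falls through to segment 4 (factor 0.85, i.e. a higher Q).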
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_complexity.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_complexity.h
index af031a46c6c..c0dce6c5b7d 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_complexity.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_complexity.h
@@ -17,11 +17,12 @@ extern "C" {
#endif
struct VP9_COMP;
+struct macroblock;
-// Select a segment for the current SB64.
-void vp9_select_in_frame_q_segment(struct VP9_COMP *cpi, int mi_row, int mi_col,
- int output_enabled, int projected_rate);
-
+// Select a segment for the current Block.
+void vp9_caq_select_segment(struct VP9_COMP *cpi, struct macroblock *,
+ BLOCK_SIZE bs,
+ int mi_row, int mi_col, int projected_rate);
// This function sets up a set of segments with delta Q values around
// the baseline frame quantizer.
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c
index 514ff7a52ad..24b427df575 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c
@@ -19,31 +19,38 @@
#include "vp9/encoder/vp9_segmentation.h"
struct CYCLIC_REFRESH {
- // Percentage of super-blocks per frame that are targeted as candidates
+ // Percentage of blocks per frame that are targeted as candidates
// for cyclic refresh.
- int max_sbs_perframe;
+ int percent_refresh;
// Maximum q-delta as percentage of base q.
int max_qdelta_perc;
- // Block size below which we don't apply cyclic refresh.
- BLOCK_SIZE min_block_size;
// Superblock starting index for cycling through the frame.
int sb_index;
- // Controls how long a block will need to wait to be refreshed again.
+ // Controls how long a block will need to wait to be refreshed again, in
+ // excess of the cycle time, i.e., in the case of all-zero motion a block
+ // will be refreshed every (100 / percent_refresh + time_for_refresh) frames.
int time_for_refresh;
- // Actual number of (8x8) blocks that were applied delta-q (segment 1).
- int num_seg_blocks;
- // Actual encoding bits for segment 1.
- int actual_seg_bits;
+ // Target number of (8x8) blocks that are set for delta-q.
+ int target_num_seg_blocks;
+ // Actual number of (8x8) blocks that were applied delta-q.
+ int actual_num_seg1_blocks;
+ int actual_num_seg2_blocks;
// RD mult. parameters for segment 1.
int rdmult;
// Cyclic refresh map.
signed char *map;
- // Projected rate and distortion for the current superblock.
- int64_t projected_rate_sb;
- int64_t projected_dist_sb;
- // Thresholds applied to projected rate/distortion of the superblock.
+ // Thresholds applied to the projected rate/distortion of the coding block,
+ // when deciding whether a block should be refreshed.
int64_t thresh_rate_sb;
int64_t thresh_dist_sb;
+ // Threshold applied to the motion vector (in units of 1/8 pel) of the
+ // coding block, when deciding whether a block should be refreshed.
+ int16_t motion_thresh;
+ // Rate target ratio to set q delta.
+ double rate_ratio_qdelta;
+ double low_content_avg;
+ int qindex_delta_seg1;
+ int qindex_delta_seg2;
};
CYCLIC_REFRESH *vp9_cyclic_refresh_alloc(int mi_rows, int mi_cols) {
@@ -73,10 +80,10 @@ static int apply_cyclic_refresh_bitrate(const VP9_COMMON *cm,
// with number of seg blocks, so compare available bits to number of blocks.
// Average bits available per frame = avg_frame_bandwidth
// Number of (8x8) blocks in frame = mi_rows * mi_cols;
- const float factor = 0.5;
+ const float factor = 0.25;
const int number_blocks = cm->mi_rows * cm->mi_cols;
// The condition below corresponds to turning off at target bitrates:
- // ~24kbps for CIF, 72kbps for VGA (at 30fps).
+ // (at 30fps), ~12kbps for CIF, 36kbps for VGA, 100kbps for HD/720p.
// Also turn off at very small frame sizes, to avoid too large fraction of
// superblocks to be refreshed per frame. Threshold below is less than QCIF.
if (rc->avg_frame_bandwidth < factor * number_blocks ||
@@ -92,33 +99,98 @@ static int apply_cyclic_refresh_bitrate(const VP9_COMMON *cm,
// mode, and rate/distortion.
static int candidate_refresh_aq(const CYCLIC_REFRESH *cr,
const MB_MODE_INFO *mbmi,
- BLOCK_SIZE bsize, int use_rd) {
- if (use_rd) {
- // If projected rate is below the thresh_rate (well below target,
- // so undershoot expected), accept it for lower-qp coding.
- if (cr->projected_rate_sb < cr->thresh_rate_sb)
- return 1;
- // Otherwise, reject the block for lower-qp coding if any of the following:
- // 1) prediction block size is below min_block_size
- // 2) mode is non-zero mv and projected distortion is above thresh_dist
- // 3) mode is an intra-mode (we may want to allow some of this under
- // another thresh_dist)
- else if (bsize < cr->min_block_size ||
- (mbmi->mv[0].as_int != 0 &&
- cr->projected_dist_sb > cr->thresh_dist_sb) ||
- !is_inter_block(mbmi))
- return 0;
- else
- return 1;
- } else {
- // Rate/distortion not used for update.
- if (bsize < cr->min_block_size ||
- mbmi->mv[0].as_int != 0 ||
- !is_inter_block(mbmi))
- return 0;
- else
- return 1;
+ int64_t rate,
+ int64_t dist,
+ int bsize) {
+ MV mv = mbmi->mv[0].as_mv;
+ // Reject the block for lower-qp coding if projected distortion
+ // is above the threshold, and any of the following is true:
+ // 1) mode uses large mv
+ // 2) mode is an intra-mode
+ // Otherwise accept for refresh.
+ if (dist > cr->thresh_dist_sb &&
+ (mv.row > cr->motion_thresh || mv.row < -cr->motion_thresh ||
+ mv.col > cr->motion_thresh || mv.col < -cr->motion_thresh ||
+ !is_inter_block(mbmi)))
+ return CR_SEGMENT_ID_BASE;
+ else if (bsize >= BLOCK_16X16 &&
+ rate < cr->thresh_rate_sb &&
+ is_inter_block(mbmi) &&
+ mbmi->mv[0].as_int == 0)
+ // More aggressive delta-q for bigger blocks with zero motion.
+ return CR_SEGMENT_ID_BOOST2;
+ else
+ return CR_SEGMENT_ID_BOOST1;
+}
+
+// Compute delta-q for the segment.
+static int compute_deltaq(const VP9_COMP *cpi, int q, double rate_factor) {
+ const CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+ const RATE_CONTROL *const rc = &cpi->rc;
+ int deltaq = vp9_compute_qdelta_by_rate(rc, cpi->common.frame_type,
+ q, rate_factor,
+ cpi->common.bit_depth);
+ if ((-deltaq) > cr->max_qdelta_perc * q / 100) {
+ deltaq = -cr->max_qdelta_perc * q / 100;
}
+ return deltaq;
+}
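A worked example of the clamp, with illustrative values: with max_qdelta_perc = 50 (set in vp9_cyclic_refresh_setup() below) and q = 60, a computed deltaq of -40 exceeds the 50% cap of -30 and is clamped to -30, bounding how far the boosted segments can drop below the base quantizer.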
+
+// For the just encoded frame, estimate the bits, incorporating the delta-q
+// from the non-base segments. For now, ignore the effect of multiple segments
+// (with different delta-q). Note this function is called in the postencode
+// stage (from rc_update_rate_correction_factors()).
+int vp9_cyclic_refresh_estimate_bits_at_q(const VP9_COMP *cpi,
+ double correction_factor) {
+ const VP9_COMMON *const cm = &cpi->common;
+ const CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+ int estimated_bits;
+ int mbs = cm->MBs;
+ int num8x8bl = mbs << 2;
+ // Weight for non-base segments: use actual number of blocks refreshed in
+ // previous/just encoded frame. Note number of blocks here is in 8x8 units.
+ double weight_segment1 = (double)cr->actual_num_seg1_blocks / num8x8bl;
+ double weight_segment2 = (double)cr->actual_num_seg2_blocks / num8x8bl;
+ // Take segment weighted average for estimated bits.
+ estimated_bits = (int)((1.0 - weight_segment1 - weight_segment2) *
+ vp9_estimate_bits_at_q(cm->frame_type, cm->base_qindex, mbs,
+ correction_factor, cm->bit_depth) +
+ weight_segment1 *
+ vp9_estimate_bits_at_q(cm->frame_type,
+ cm->base_qindex + cr->qindex_delta_seg1, mbs,
+ correction_factor, cm->bit_depth) +
+ weight_segment2 *
+ vp9_estimate_bits_at_q(cm->frame_type,
+ cm->base_qindex + cr->qindex_delta_seg2, mbs,
+ correction_factor, cm->bit_depth));
+ return estimated_bits;
+}
+
+// Prior to encoding the frame, estimate the bits per mb, for a given q = i and
+// a corresponding delta-q (for segment 1). This function is called from
+// rc_regulate_q() to set the base qp index.
+// Note: the segment map is set to either 0/CR_SEGMENT_ID_BASE (no refresh) or
+// to 1/CR_SEGMENT_ID_BOOST1 (refresh) for each superblock, prior to encoding.
+int vp9_cyclic_refresh_rc_bits_per_mb(const VP9_COMP *cpi, int i,
+ double correction_factor) {
+ const VP9_COMMON *const cm = &cpi->common;
+ CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+ int bits_per_mb;
+ int num8x8bl = cm->MBs << 2;
+ // Weight for segment prior to encoding: take the average of the target
+ // number for the frame to be encoded and the actual from the previous frame.
+ double weight_segment = (double)((cr->target_num_seg_blocks +
+ cr->actual_num_seg1_blocks + cr->actual_num_seg2_blocks) >> 1) /
+ num8x8bl;
+ // Compute delta-q corresponding to qindex i.
+ int deltaq = compute_deltaq(cpi, i, cr->rate_ratio_qdelta);
+ // Take segment weighted average for bits per mb.
+ bits_per_mb = (int)((1.0 - weight_segment) *
+ vp9_rc_bits_per_mb(cm->frame_type, i, correction_factor, cm->bit_depth) +
+ weight_segment *
+ vp9_rc_bits_per_mb(cm->frame_type, i + deltaq, correction_factor,
+ cm->bit_depth));
+ return bits_per_mb;
}
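Both estimators above are the same segment-weighted average, B = (1 - w) * R(q) + w * R(q + deltaq). A worked example with illustrative numbers: for a 640x360 frame, MBs = 40 * 23 = 920 and num8x8bl = 3680; if the target and actual refreshed block counts are both 368 (about 10%), then w = ((368 + 368 + 0) >> 1) / 3680 = 0.1, so bits_per_mb = 0.9 * R(q) + 0.1 * R(q + deltaq).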
// Prior to coding a given prediction block, of size bsize at (mi_row, mi_col),
@@ -127,7 +199,10 @@ static int candidate_refresh_aq(const CYCLIC_REFRESH *cr,
void vp9_cyclic_refresh_update_segment(VP9_COMP *const cpi,
MB_MODE_INFO *const mbmi,
int mi_row, int mi_col,
- BLOCK_SIZE bsize, int use_rd) {
+ BLOCK_SIZE bsize,
+ int64_t rate,
+ int64_t dist,
+ int skip) {
const VP9_COMMON *const cm = &cpi->common;
CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
const int bw = num_8x8_blocks_wide_lookup[bsize];
@@ -135,21 +210,26 @@ void vp9_cyclic_refresh_update_segment(VP9_COMP *const cpi,
const int xmis = MIN(cm->mi_cols - mi_col, bw);
const int ymis = MIN(cm->mi_rows - mi_row, bh);
const int block_index = mi_row * cm->mi_cols + mi_col;
- const int refresh_this_block = cpi->mb.in_static_area ||
- candidate_refresh_aq(cr, mbmi, bsize, use_rd);
+ const int refresh_this_block = candidate_refresh_aq(cr, mbmi, rate, dist,
+ bsize);
// Default is to not update the refresh map.
int new_map_value = cr->map[block_index];
int x = 0; int y = 0;
- // Check if we should reset the segment_id for this block.
- if (mbmi->segment_id > 0 && !refresh_this_block)
- mbmi->segment_id = 0;
+ // If this block is labeled for refresh, check if we should reset the
+ // segment_id.
+ if (cyclic_refresh_segment_id_boosted(mbmi->segment_id)) {
+ mbmi->segment_id = refresh_this_block;
+ // Reset segment_id if the block will be skipped.
+ if (skip)
+ mbmi->segment_id = CR_SEGMENT_ID_BASE;
+ }
// Update the cyclic refresh map, to be used for setting segmentation map
// for the next frame. If the block will be refreshed this frame, mark it
// as clean. The magnitude of the -ve influences how long before we consider
// it for refresh again.
- if (mbmi->segment_id == 1) {
+ if (cyclic_refresh_segment_id_boosted(mbmi->segment_id)) {
new_map_value = -cr->time_for_refresh;
} else if (refresh_this_block) {
// Else if it is accepted as candidate for refresh, and has not already
@@ -161,6 +241,7 @@ void vp9_cyclic_refresh_update_segment(VP9_COMP *const cpi,
// Leave it marked as block that is not candidate for refresh.
new_map_value = 1;
}
+
// Update entries in the cyclic refresh map with new_map_value, and
// copy mbmi->segment_id into global segmentation map.
for (y = 0; y < ymis; y++)
@@ -169,10 +250,188 @@ void vp9_cyclic_refresh_update_segment(VP9_COMP *const cpi,
cpi->segmentation_map[block_index + y * cm->mi_cols + x] =
mbmi->segment_id;
}
- // Keep track of actual number (in units of 8x8) of blocks in segment 1 used
- // for encoding this frame.
- if (mbmi->segment_id)
- cr->num_seg_blocks += xmis * ymis;
+}
+
+// Update the actual number of blocks to which the segment delta-q was applied.
+void vp9_cyclic_refresh_postencode(VP9_COMP *const cpi) {
+ VP9_COMMON *const cm = &cpi->common;
+ CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+ unsigned char *const seg_map = cpi->segmentation_map;
+ int mi_row, mi_col;
+ cr->actual_num_seg1_blocks = 0;
+ cr->actual_num_seg2_blocks = 0;
+ for (mi_row = 0; mi_row < cm->mi_rows; mi_row++)
+ for (mi_col = 0; mi_col < cm->mi_cols; mi_col++) {
+ if (cyclic_refresh_segment_id(
+ seg_map[mi_row * cm->mi_cols + mi_col]) == CR_SEGMENT_ID_BOOST1)
+ cr->actual_num_seg1_blocks++;
+ else if (cyclic_refresh_segment_id(
+ seg_map[mi_row * cm->mi_cols + mi_col]) == CR_SEGMENT_ID_BOOST2)
+ cr->actual_num_seg2_blocks++;
+ }
+}
+
+// Set golden frame update interval, for non-svc 1 pass CBR mode.
+void vp9_cyclic_refresh_set_golden_update(VP9_COMP *const cpi) {
+ RATE_CONTROL *const rc = &cpi->rc;
+ CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+ // Set minimum gf_interval for GF update to a multiple (== 4) of refresh
+ // period. Depending on past encoding stats, GF flag may be reset and update
+ // may not occur until next baseline_gf_interval.
+ if (cr->percent_refresh > 0)
+ rc->baseline_gf_interval = 4 * (100 / cr->percent_refresh);
+ else
+ rc->baseline_gf_interval = 40;
+}
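With the default percent_refresh of 10 (set in vp9_cyclic_refresh_update_parameters() below), this works out to 4 * (100 / 10) = 40 frames between scheduled golden-frame updates, i.e. four full refresh cycles.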
+
+// Update some encoding stats (from the just encoded frame). If this frame's
+// background has high motion, refresh the golden frame. Otherwise, if the
+// golden reference is to be updated, check whether we should NOT update the
+// golden ref.
+void vp9_cyclic_refresh_check_golden_update(VP9_COMP *const cpi) {
+ VP9_COMMON *const cm = &cpi->common;
+ CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+ int mi_row, mi_col;
+ double fraction_low = 0.0;
+ int low_content_frame = 0;
+
+ MODE_INFO **mi = cm->mi_grid_visible;
+ RATE_CONTROL *const rc = &cpi->rc;
+ const int rows = cm->mi_rows, cols = cm->mi_cols;
+ int cnt1 = 0, cnt2 = 0;
+ int force_gf_refresh = 0;
+
+ for (mi_row = 0; mi_row < rows; mi_row++) {
+ for (mi_col = 0; mi_col < cols; mi_col++) {
+ int16_t abs_mvr = mi[0]->mbmi.mv[0].as_mv.row >= 0 ?
+ mi[0]->mbmi.mv[0].as_mv.row : -1 * mi[0]->mbmi.mv[0].as_mv.row;
+ int16_t abs_mvc = mi[0]->mbmi.mv[0].as_mv.col >= 0 ?
+ mi[0]->mbmi.mv[0].as_mv.col : -1 * mi[0]->mbmi.mv[0].as_mv.col;
+
+ // Calculate the motion of the background.
+ if (abs_mvr <= 16 && abs_mvc <= 16) {
+ cnt1++;
+ if (abs_mvr == 0 && abs_mvc == 0)
+ cnt2++;
+ }
+ mi++;
+
+ // Accumulate low_content_frame.
+ if (cr->map[mi_row * cols + mi_col] < 1)
+ low_content_frame++;
+ }
+ mi += 8;
+ }
+
+ // For video conference clips, if the background has high motion in current
+ // frame because of the camera movement, set this frame as the golden frame.
+ // Use 70% and 5% as the thresholds for golden frame refreshing.
+ if (cnt1 * 10 > (70 * rows * cols) && cnt2 * 20 < cnt1) {
+ vp9_cyclic_refresh_set_golden_update(cpi);
+ rc->frames_till_gf_update_due = rc->baseline_gf_interval;
+
+ if (rc->frames_till_gf_update_due > rc->frames_to_key)
+ rc->frames_till_gf_update_due = rc->frames_to_key;
+ cpi->refresh_golden_frame = 1;
+ force_gf_refresh = 1;
+ }
+
+ fraction_low =
+ (double)low_content_frame / (rows * cols);
+ // Update average.
+ cr->low_content_avg = (fraction_low + 3 * cr->low_content_avg) / 4;
+ if (!force_gf_refresh && cpi->refresh_golden_frame == 1) {
+ // Don't update golden reference if the amount of low_content for the
+ // current encoded frame is small, or if the recursive average of the
+ // low_content over the update interval window falls below threshold.
+ if (fraction_low < 0.8 || cr->low_content_avg < 0.7)
+ cpi->refresh_golden_frame = 0;
+ // Reset for the next interval.
+ cr->low_content_avg = fraction_low;
+ }
+}
+
+// Update the segmentation map, and related quantities: cyclic refresh map,
+// refresh sb_index, and target number of blocks to be refreshed.
+// The map is set to either 0/CR_SEGMENT_ID_BASE (no refresh) or to
+// 1/CR_SEGMENT_ID_BOOST1 (refresh) for each superblock.
+// Blocks labeled as BOOST1 may later get set to BOOST2 (during the
+// encoding of the superblock).
+void vp9_cyclic_refresh_update_map(VP9_COMP *const cpi) {
+ VP9_COMMON *const cm = &cpi->common;
+ CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+ unsigned char *const seg_map = cpi->segmentation_map;
+ int i, block_count, bl_index, sb_rows, sb_cols, sbs_in_frame;
+ int xmis, ymis, x, y;
+ memset(seg_map, CR_SEGMENT_ID_BASE, cm->mi_rows * cm->mi_cols);
+ sb_cols = (cm->mi_cols + MI_BLOCK_SIZE - 1) / MI_BLOCK_SIZE;
+ sb_rows = (cm->mi_rows + MI_BLOCK_SIZE - 1) / MI_BLOCK_SIZE;
+ sbs_in_frame = sb_cols * sb_rows;
+ // Number of target blocks to get the q delta (segment 1).
+ block_count = cr->percent_refresh * cm->mi_rows * cm->mi_cols / 100;
+ // Set the segmentation map: cycle through the superblocks, starting at
+ // cr->sb_index, and stopping when either block_count blocks have been found
+ // to be refreshed, or we have passed through the whole frame.
+ assert(cr->sb_index < sbs_in_frame);
+ i = cr->sb_index;
+ cr->target_num_seg_blocks = 0;
+ do {
+ int sum_map = 0;
+ // Get the mi_row/mi_col corresponding to superblock index i.
+ int sb_row_index = (i / sb_cols);
+ int sb_col_index = i - sb_row_index * sb_cols;
+ int mi_row = sb_row_index * MI_BLOCK_SIZE;
+ int mi_col = sb_col_index * MI_BLOCK_SIZE;
+ assert(mi_row >= 0 && mi_row < cm->mi_rows);
+ assert(mi_col >= 0 && mi_col < cm->mi_cols);
+ bl_index = mi_row * cm->mi_cols + mi_col;
+ // Loop through all 8x8 blocks in superblock and update map.
+ xmis = MIN(cm->mi_cols - mi_col,
+ num_8x8_blocks_wide_lookup[BLOCK_64X64]);
+ ymis = MIN(cm->mi_rows - mi_row,
+ num_8x8_blocks_high_lookup[BLOCK_64X64]);
+ for (y = 0; y < ymis; y++) {
+ for (x = 0; x < xmis; x++) {
+ const int bl_index2 = bl_index + y * cm->mi_cols + x;
+ // If the block is a candidate for cleanup, then mark it
+ // for possible boost/refresh (segment 1). The segment id may get
+ // reset to 0 later if the block gets coded as anything other than ZEROMV.
+ if (cr->map[bl_index2] == 0) {
+ sum_map++;
+ } else if (cr->map[bl_index2] < 0) {
+ cr->map[bl_index2]++;
+ }
+ }
+ }
+ // Enforce constant segment over superblock.
+ // If segment is at least half of superblock, set to 1.
+ if (sum_map >= xmis * ymis / 2) {
+ for (y = 0; y < ymis; y++)
+ for (x = 0; x < xmis; x++) {
+ seg_map[bl_index + y * cm->mi_cols + x] = CR_SEGMENT_ID_BOOST1;
+ }
+ cr->target_num_seg_blocks += xmis * ymis;
+ }
+ i++;
+ if (i == sbs_in_frame) {
+ i = 0;
+ }
+ } while (cr->target_num_seg_blocks < block_count && i != cr->sb_index);
+ cr->sb_index = i;
+}
+
+// Set/update global/frame level cyclic refresh parameters.
+void vp9_cyclic_refresh_update_parameters(VP9_COMP *const cpi) {
+ const RATE_CONTROL *const rc = &cpi->rc;
+ CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+ cr->percent_refresh = 10;
+ // Use larger delta-qp (increase rate_ratio_qdelta) for first few (~4)
+ // periods of the refresh cycle, after a key frame. This corresponds to ~40
+ // frames with cr->percent_refresh = 10.
+ if (rc->frames_since_key < 40)
+ cr->rate_ratio_qdelta = 3.0;
+ else
+ cr->rate_ratio_qdelta = 2.0;
}
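With these defaults, the rate factor later used for segment BOOST2 in vp9_cyclic_refresh_setup() works out to MIN(4.0, 1.7 * 3.0) = 4.0 for roughly the first 40 frames after a key frame and MIN(4.0, 1.7 * 2.0) = 3.4 afterwards (CR_MAX_RATE_TARGET_RATIO and CR_BOOST2_FAC from the header change below).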
// Setup cyclic background refresh: set delta q and segmentation map.
@@ -181,47 +440,38 @@ void vp9_cyclic_refresh_setup(VP9_COMP *const cpi) {
const RATE_CONTROL *const rc = &cpi->rc;
CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
struct segmentation *const seg = &cm->seg;
- unsigned char *const seg_map = cpi->segmentation_map;
const int apply_cyclic_refresh = apply_cyclic_refresh_bitrate(cm, rc);
+ if (cm->current_video_frame == 0)
+ cr->low_content_avg = 0.0;
// Don't apply refresh on key frame or enhancement layer frames.
if (!apply_cyclic_refresh ||
(cm->frame_type == KEY_FRAME) ||
- (cpi->svc.temporal_layer_id > 0)) {
+ (cpi->svc.temporal_layer_id > 0) ||
+ (cpi->svc.spatial_layer_id > 0)) {
// Set segmentation map to 0 and disable.
- vpx_memset(seg_map, 0, cm->mi_rows * cm->mi_cols);
+ unsigned char *const seg_map = cpi->segmentation_map;
+ memset(seg_map, 0, cm->mi_rows * cm->mi_cols);
vp9_disable_segmentation(&cm->seg);
if (cm->frame_type == KEY_FRAME)
cr->sb_index = 0;
return;
} else {
int qindex_delta = 0;
- int i, block_count, bl_index, sb_rows, sb_cols, sbs_in_frame;
- int xmis, ymis, x, y, qindex2;
-
- // Rate target ratio to set q delta.
- const float rate_ratio_qdelta = 2.0;
+ int qindex2;
const double q = vp9_convert_qindex_to_q(cm->base_qindex, cm->bit_depth);
vp9_clear_system_state();
- // Some of these parameters may be set via codec-control function later.
- cr->max_sbs_perframe = 10;
cr->max_qdelta_perc = 50;
- cr->min_block_size = BLOCK_8X8;
- cr->time_for_refresh = 1;
- // Set rate threshold to some fraction of target (and scaled by 256).
- cr->thresh_rate_sb = (rc->sb64_target_rate * 256) >> 2;
+ cr->time_for_refresh = 0;
+ // Set rate threshold to some multiple (set to 2 for now) of the target
+ // rate (target is given by sb64_target_rate and scaled by 256).
+ cr->thresh_rate_sb = ((int64_t)(rc->sb64_target_rate) << 8) << 2;
// Distortion threshold, quadratic in Q, scale factor to be adjusted.
- cr->thresh_dist_sb = 8 * (int)(q * q);
- if (cpi->sf.use_nonrd_pick_mode) {
- // May want to be more conservative with thresholds in non-rd mode for now
- // as rate/distortion are derived from model based on prediction residual.
- cr->thresh_rate_sb = (rc->sb64_target_rate * 256) >> 3;
- cr->thresh_dist_sb = 4 * (int)(q * q);
- }
-
- cr->num_seg_blocks = 0;
+ // q will not exceed 457, so (q * q) is within 32bit; see:
+ // vp9_convert_qindex_to_q(), vp9_ac_quant(), ac_qlookup*[].
+ cr->thresh_dist_sb = ((int64_t)(q * q)) << 2;
+ cr->motion_thresh = 32;
// Set up segmentation.
// Clear down the segment map.
- vpx_memset(seg_map, 0, cm->mi_rows * cm->mi_cols);
vp9_enable_segmentation(&cm->seg);
vp9_clearall_segfeatures(seg);
// Select delta coding method.
@@ -234,89 +484,34 @@ void vp9_cyclic_refresh_setup(VP9_COMP *const cpi) {
// relative to 0 previous map.
// seg->temporal_update = 0;
- // Segment 0 "Q" feature is disabled so it defaults to the baseline Q.
- vp9_disable_segfeature(seg, 0, SEG_LVL_ALT_Q);
- // Use segment 1 for in-frame Q adjustment.
- vp9_enable_segfeature(seg, 1, SEG_LVL_ALT_Q);
-
- // Set the q delta for segment 1.
- qindex_delta = vp9_compute_qdelta_by_rate(rc, cm->frame_type,
- cm->base_qindex,
- rate_ratio_qdelta,
- cm->bit_depth);
- // TODO(marpan): Incorporate the actual-vs-target rate over/undershoot from
- // previous encoded frame.
- if (-qindex_delta > cr->max_qdelta_perc * cm->base_qindex / 100)
- qindex_delta = -cr->max_qdelta_perc * cm->base_qindex / 100;
-
- // Compute rd-mult for segment 1.
+ // Segment BASE "Q" feature is disabled so it defaults to the baseline Q.
+ vp9_disable_segfeature(seg, CR_SEGMENT_ID_BASE, SEG_LVL_ALT_Q);
+ // Use segment BOOST1 for in-frame Q adjustment.
+ vp9_enable_segfeature(seg, CR_SEGMENT_ID_BOOST1, SEG_LVL_ALT_Q);
+ // Use segment BOOST2 for more aggressive in-frame Q adjustment.
+ vp9_enable_segfeature(seg, CR_SEGMENT_ID_BOOST2, SEG_LVL_ALT_Q);
+
+ // Set the q delta for segment BOOST1.
+ qindex_delta = compute_deltaq(cpi, cm->base_qindex, cr->rate_ratio_qdelta);
+ cr->qindex_delta_seg1 = qindex_delta;
+
+ // Compute rd-mult for segment BOOST1.
qindex2 = clamp(cm->base_qindex + cm->y_dc_delta_q + qindex_delta, 0, MAXQ);
+
cr->rdmult = vp9_compute_rd_mult(cpi, qindex2);
- vp9_set_segdata(seg, 1, SEG_LVL_ALT_Q, qindex_delta);
-
- sb_cols = (cm->mi_cols + MI_BLOCK_SIZE - 1) / MI_BLOCK_SIZE;
- sb_rows = (cm->mi_rows + MI_BLOCK_SIZE - 1) / MI_BLOCK_SIZE;
- sbs_in_frame = sb_cols * sb_rows;
- // Number of target superblocks to get the q delta (segment 1).
- block_count = cr->max_sbs_perframe * sbs_in_frame / 100;
- // Set the segmentation map: cycle through the superblocks, starting at
- // cr->mb_index, and stopping when either block_count blocks have been found
- // to be refreshed, or we have passed through whole frame.
- assert(cr->sb_index < sbs_in_frame);
- i = cr->sb_index;
- do {
- int sum_map = 0;
- // Get the mi_row/mi_col corresponding to superblock index i.
- int sb_row_index = (i / sb_cols);
- int sb_col_index = i - sb_row_index * sb_cols;
- int mi_row = sb_row_index * MI_BLOCK_SIZE;
- int mi_col = sb_col_index * MI_BLOCK_SIZE;
- assert(mi_row >= 0 && mi_row < cm->mi_rows);
- assert(mi_col >= 0 && mi_col < cm->mi_cols);
- bl_index = mi_row * cm->mi_cols + mi_col;
- // Loop through all 8x8 blocks in superblock and update map.
- xmis = MIN(cm->mi_cols - mi_col,
- num_8x8_blocks_wide_lookup[BLOCK_64X64]);
- ymis = MIN(cm->mi_rows - mi_row,
- num_8x8_blocks_high_lookup[BLOCK_64X64]);
- for (y = 0; y < ymis; y++) {
- for (x = 0; x < xmis; x++) {
- const int bl_index2 = bl_index + y * cm->mi_cols + x;
- // If the block is as a candidate for clean up then mark it
- // for possible boost/refresh (segment 1). The segment id may get
- // reset to 0 later if block gets coded anything other than ZEROMV.
- if (cr->map[bl_index2] == 0) {
- seg_map[bl_index2] = 1;
- sum_map++;
- } else if (cr->map[bl_index2] < 0) {
- cr->map[bl_index2]++;
- }
- }
- }
- // Enforce constant segment over superblock.
- // If segment is partial over superblock, reset to either all 1 or 0.
- if (sum_map > 0 && sum_map < xmis * ymis) {
- const int new_value = (sum_map >= xmis * ymis / 2);
- for (y = 0; y < ymis; y++)
- for (x = 0; x < xmis; x++)
- seg_map[bl_index + y * cm->mi_cols + x] = new_value;
- }
- i++;
- if (i == sbs_in_frame) {
- i = 0;
- }
- if (sum_map >= xmis * ymis /2)
- block_count--;
- } while (block_count && i != cr->sb_index);
- cr->sb_index = i;
- }
-}
+ vp9_set_segdata(seg, CR_SEGMENT_ID_BOOST1, SEG_LVL_ALT_Q, qindex_delta);
-void vp9_cyclic_refresh_set_rate_and_dist_sb(CYCLIC_REFRESH *cr,
- int64_t rate_sb, int64_t dist_sb) {
- cr->projected_rate_sb = rate_sb;
- cr->projected_dist_sb = dist_sb;
+ // Set a more aggressive (higher) q delta for segment BOOST2.
+ qindex_delta = compute_deltaq(cpi, cm->base_qindex,
+ MIN(CR_MAX_RATE_TARGET_RATIO,
+ CR_BOOST2_FAC * cr->rate_ratio_qdelta));
+ cr->qindex_delta_seg2 = qindex_delta;
+ vp9_set_segdata(seg, CR_SEGMENT_ID_BOOST2, SEG_LVL_ALT_Q, qindex_delta);
+
+ // Update the segmentation and refresh map.
+ vp9_cyclic_refresh_update_map(cpi);
+ }
}
int vp9_cyclic_refresh_get_rdmult(const CYCLIC_REFRESH *cr) {
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.h
index f556d658bdc..21f114b5e59 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.h
@@ -18,6 +18,18 @@
extern "C" {
#endif
+// The segment ids used in cyclic refresh: from base (no boost) to increasing
+// boost (higher delta-qp).
+#define CR_SEGMENT_ID_BASE 0
+#define CR_SEGMENT_ID_BOOST1 1
+#define CR_SEGMENT_ID_BOOST2 2
+
+// Maximum rate target ratio for setting segment delta-qp.
+#define CR_MAX_RATE_TARGET_RATIO 4.0
+
+// Boost factor for rate target ratio, for segment CR_SEGMENT_ID_BOOST2.
+#define CR_BOOST2_FAC 1.7
+
struct VP9_COMP;
struct CYCLIC_REFRESH;
@@ -27,22 +39,59 @@ CYCLIC_REFRESH *vp9_cyclic_refresh_alloc(int mi_rows, int mi_cols);
void vp9_cyclic_refresh_free(CYCLIC_REFRESH *cr);
+// Estimate the bits, incorporating the delta-q from segment 1, after encoding
+// the frame.
+int vp9_cyclic_refresh_estimate_bits_at_q(const struct VP9_COMP *cpi,
+ double correction_factor);
+
+// Estimate the bits per mb, for a given q = i and a corresponding delta-q
+// (for segment 1), prior to encoding the frame.
+int vp9_cyclic_refresh_rc_bits_per_mb(const struct VP9_COMP *cpi, int i,
+ double correction_factor);
+
// Prior to coding a given prediction block, of size bsize at (mi_row, mi_col),
// check if we should reset the segment_id, and update the cyclic_refresh map
// and segmentation map.
void vp9_cyclic_refresh_update_segment(struct VP9_COMP *const cpi,
MB_MODE_INFO *const mbmi,
- int mi_row, int mi_col,
- BLOCK_SIZE bsize, int use_rd);
+ int mi_row, int mi_col, BLOCK_SIZE bsize,
+ int64_t rate, int64_t dist, int skip);
+
+// Update the segmentation map, and related quantities: cyclic refresh map,
+// refresh sb_index, and target number of blocks to be refreshed.
+void vp9_cyclic_refresh_update_map(struct VP9_COMP *const cpi);
+
+// Update the actual number of blocks to which the segment delta-q was applied.
+void vp9_cyclic_refresh_postencode(struct VP9_COMP *const cpi);
+
+// Set golden frame update interval, for non-svc 1 pass CBR mode.
+void vp9_cyclic_refresh_set_golden_update(struct VP9_COMP *const cpi);
+
+// Check if we should not update golden reference, based on past refresh stats.
+void vp9_cyclic_refresh_check_golden_update(struct VP9_COMP *const cpi);
+
+// Set/update global/frame level refresh parameters.
+void vp9_cyclic_refresh_update_parameters(struct VP9_COMP *const cpi);
// Setup cyclic background refresh: set delta q and segmentation map.
void vp9_cyclic_refresh_setup(struct VP9_COMP *const cpi);
-void vp9_cyclic_refresh_set_rate_and_dist_sb(CYCLIC_REFRESH *cr,
- int64_t rate_sb, int64_t dist_sb);
-
int vp9_cyclic_refresh_get_rdmult(const CYCLIC_REFRESH *cr);
+static INLINE int cyclic_refresh_segment_id_boosted(int segment_id) {
+ return segment_id == CR_SEGMENT_ID_BOOST1 ||
+ segment_id == CR_SEGMENT_ID_BOOST2;
+}
+
+static INLINE int cyclic_refresh_segment_id(int segment_id) {
+ if (segment_id == CR_SEGMENT_ID_BOOST1)
+ return CR_SEGMENT_ID_BOOST1;
+ else if (segment_id == CR_SEGMENT_ID_BOOST2)
+ return CR_SEGMENT_ID_BOOST2;
+ else
+ return CR_SEGMENT_ID_BASE;
+}
+
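A minimal usage sketch for the two helpers above; seg_map and num_blocks are hypothetical stand-ins for the encoder's segmentation map and its size:

/* Tally refreshed blocks from a segment map, the way
 * vp9_cyclic_refresh_postencode() does. */
int i, boost1 = 0, boost2 = 0;
for (i = 0; i < num_blocks; ++i) {
  if (cyclic_refresh_segment_id(seg_map[i]) == CR_SEGMENT_ID_BOOST1)
    ++boost1;
  else if (cyclic_refresh_segment_id(seg_map[i]) == CR_SEGMENT_ID_BOOST2)
    ++boost2;
}

Note cyclic_refresh_segment_id() also maps any out-of-range id back to CR_SEGMENT_ID_BASE, so callers need not validate the map entries first.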
#ifdef __cplusplus
} // extern "C"
#endif
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_variance.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_variance.c
index 7d75f09a418..be6f7e4ee53 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_variance.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_variance.c
@@ -19,18 +19,16 @@
#include "vp9/encoder/vp9_segmentation.h"
#include "vp9/common/vp9_systemdependent.h"
-#define ENERGY_MIN (-1)
+#define ENERGY_MIN (-4)
#define ENERGY_MAX (1)
#define ENERGY_SPAN (ENERGY_MAX - ENERGY_MIN + 1)
#define ENERGY_IN_BOUNDS(energy)\
assert((energy) >= ENERGY_MIN && (energy) <= ENERGY_MAX)
-static double q_ratio[MAX_SEGMENTS] = { 1, 1, 1, 1, 1, 1, 1, 1 };
-static double rdmult_ratio[MAX_SEGMENTS] = { 1, 1, 1, 1, 1, 1, 1, 1 };
-static int segment_id[MAX_SEGMENTS] = { 5, 3, 1, 0, 2, 4, 6, 7 };
+static const double rate_ratio[MAX_SEGMENTS] =
+ {2.5, 2.0, 1.5, 1.0, 0.75, 1.0, 1.0, 1.0};
+static const int segment_id[ENERGY_SPAN] = {0, 1, 1, 2, 3, 4};
-#define Q_RATIO(i) q_ratio[(i) - ENERGY_MIN]
-#define RDMULT_RATIO(i) rdmult_ratio[(i) - ENERGY_MIN]
#define SEGMENT_ID(i) segment_id[(i) - ENERGY_MIN]
DECLARE_ALIGNED(16, static const uint8_t, vp9_64_zeros[64]) = {0};
@@ -40,47 +38,12 @@ DECLARE_ALIGNED(16, static const uint16_t, vp9_highbd_64_zeros[64]) = {0};
unsigned int vp9_vaq_segment_id(int energy) {
ENERGY_IN_BOUNDS(energy);
-
return SEGMENT_ID(energy);
}
-double vp9_vaq_rdmult_ratio(int energy) {
- ENERGY_IN_BOUNDS(energy);
-
- vp9_clear_system_state();
-
- return RDMULT_RATIO(energy);
-}
-
-double vp9_vaq_inv_q_ratio(int energy) {
- ENERGY_IN_BOUNDS(energy);
-
- vp9_clear_system_state();
-
- return Q_RATIO(-energy);
-}
-
-void vp9_vaq_init() {
- int i;
- double base_ratio;
-
- assert(ENERGY_SPAN <= MAX_SEGMENTS);
-
- vp9_clear_system_state();
-
- base_ratio = 1.5;
-
- for (i = ENERGY_MIN; i <= ENERGY_MAX; i++) {
- Q_RATIO(i) = pow(base_ratio, i/3.0);
- }
-}
-
void vp9_vaq_frame_setup(VP9_COMP *cpi) {
VP9_COMMON *cm = &cpi->common;
struct segmentation *seg = &cm->seg;
- const double base_q = vp9_convert_qindex_to_q(cm->base_qindex, cm->bit_depth);
- const int base_rdmult = vp9_compute_rd_mult(cpi, cm->base_qindex +
- cm->y_dc_delta_q);
int i;
if (cm->frame_type == KEY_FRAME ||
@@ -91,26 +54,28 @@ void vp9_vaq_frame_setup(VP9_COMP *cpi) {
seg->abs_delta = SEGMENT_DELTADATA;
- vp9_clear_system_state();
+ vp9_clear_system_state();
- for (i = ENERGY_MIN; i <= ENERGY_MAX; i++) {
- int qindex_delta, segment_rdmult;
+ for (i = 0; i < MAX_SEGMENTS; ++i) {
+ int qindex_delta =
+ vp9_compute_qdelta_by_rate(&cpi->rc, cm->frame_type, cm->base_qindex,
+ rate_ratio[i], cm->bit_depth);
- if (Q_RATIO(i) == 1) {
- // No need to enable SEG_LVL_ALT_Q for this segment
- RDMULT_RATIO(i) = 1;
- continue;
+ // We don't allow qindex 0 in a segment if the base value is not 0.
+ // Q index 0 (lossless) implies 4x4 encoding only and in AQ mode a segment
+ // Q delta is sometimes applied without going back around the rd loop.
+ // This could lead to an illegal combination of partition size and q.
+ if ((cm->base_qindex != 0) && ((cm->base_qindex + qindex_delta) == 0)) {
+ qindex_delta = -cm->base_qindex + 1;
}
- qindex_delta = vp9_compute_qdelta(&cpi->rc, base_q, base_q * Q_RATIO(i),
- cm->bit_depth);
- vp9_set_segdata(seg, SEGMENT_ID(i), SEG_LVL_ALT_Q, qindex_delta);
- vp9_enable_segfeature(seg, SEGMENT_ID(i), SEG_LVL_ALT_Q);
-
- segment_rdmult = vp9_compute_rd_mult(cpi, cm->base_qindex + qindex_delta +
- cm->y_dc_delta_q);
+ // No need to enable SEG_LVL_ALT_Q for this segment.
+ if (rate_ratio[i] == 1.0) {
+ continue;
+ }
- RDMULT_RATIO(i) = (double) segment_rdmult / base_rdmult;
+ vp9_set_segdata(seg, i, SEG_LVL_ALT_Q, qindex_delta);
+ vp9_enable_segfeature(seg, i, SEG_LVL_ALT_Q);
}
}
}
@@ -167,12 +132,19 @@ static unsigned int block_variance(VP9_COMP *cpi, MACROBLOCK *x,
}
}
-int vp9_block_energy(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs) {
- double energy;
+double vp9_log_block_var(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs) {
unsigned int var = block_variance(cpi, x, bs);
-
vp9_clear_system_state();
+ return log(var + 1.0);
+}
- energy = 0.9 * (log(var + 1.0) - 10.0);
+#define DEFAULT_E_MIDPOINT 10.0
+int vp9_block_energy(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs) {
+ double energy;
+ double energy_midpoint;
+ vp9_clear_system_state();
+ energy_midpoint =
+ (cpi->oxcf.pass == 2) ? cpi->twopass.mb_av_energy : DEFAULT_E_MIDPOINT;
+ energy = vp9_log_block_var(cpi, x, bs) - energy_midpoint;
return clamp((int)round(energy), ENERGY_MIN, ENERGY_MAX);
}
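A worked example with an illustrative block variance: var = 8000 gives vp9_log_block_var() = log(8001) ≈ 8.99; against the default midpoint of 10.0 this rounds to an energy of -1, which lies within [-4, 1] and, via segment_id[] above, maps to segment 2 with rate_ratio 1.5.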
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_variance.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_variance.h
index d1a459fe9ec..a0effa31165 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_variance.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_variance.h
@@ -19,13 +19,10 @@ extern "C" {
#endif
unsigned int vp9_vaq_segment_id(int energy);
-double vp9_vaq_rdmult_ratio(int energy);
-double vp9_vaq_inv_q_ratio(int energy);
-
-void vp9_vaq_init();
void vp9_vaq_frame_setup(VP9_COMP *cpi);
int vp9_block_energy(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs);
+double vp9_log_block_var(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs);
#ifdef __cplusplus
} // extern "C"
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_avg.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_avg.c
index e9810c894d3..95b13bb7718 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_avg.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_avg.c
@@ -19,6 +19,156 @@ unsigned int vp9_avg_8x8_c(const uint8_t *s, int p) {
return (sum + 32) >> 6;
}
+unsigned int vp9_avg_4x4_c(const uint8_t *s, int p) {
+ int i, j;
+ int sum = 0;
+ for (i = 0; i < 4; ++i, s+=p)
+ for (j = 0; j < 4; sum += s[j], ++j) {}
+
+ return (sum + 8) >> 4;
+}
+
+static void hadamard_col8(const int16_t *src_diff, int src_stride,
+ int16_t *coeff) {
+ int16_t b0 = src_diff[0 * src_stride] + src_diff[1 * src_stride];
+ int16_t b1 = src_diff[0 * src_stride] - src_diff[1 * src_stride];
+ int16_t b2 = src_diff[2 * src_stride] + src_diff[3 * src_stride];
+ int16_t b3 = src_diff[2 * src_stride] - src_diff[3 * src_stride];
+ int16_t b4 = src_diff[4 * src_stride] + src_diff[5 * src_stride];
+ int16_t b5 = src_diff[4 * src_stride] - src_diff[5 * src_stride];
+ int16_t b6 = src_diff[6 * src_stride] + src_diff[7 * src_stride];
+ int16_t b7 = src_diff[6 * src_stride] - src_diff[7 * src_stride];
+
+ int16_t c0 = b0 + b2;
+ int16_t c1 = b1 + b3;
+ int16_t c2 = b0 - b2;
+ int16_t c3 = b1 - b3;
+ int16_t c4 = b4 + b6;
+ int16_t c5 = b5 + b7;
+ int16_t c6 = b4 - b6;
+ int16_t c7 = b5 - b7;
+
+ coeff[0] = c0 + c4;
+ coeff[7] = c1 + c5;
+ coeff[3] = c2 + c6;
+ coeff[4] = c3 + c7;
+ coeff[2] = c0 - c4;
+ coeff[6] = c1 - c5;
+ coeff[1] = c2 - c6;
+ coeff[5] = c3 - c7;
+}
+
+void vp9_hadamard_8x8_c(int16_t const *src_diff, int src_stride,
+ int16_t *coeff) {
+ int idx;
+ int16_t buffer[64];
+ int16_t *tmp_buf = &buffer[0];
+ for (idx = 0; idx < 8; ++idx) {
+ hadamard_col8(src_diff, src_stride, tmp_buf);
+ tmp_buf += 8;
+ ++src_diff;
+ }
+
+ tmp_buf = &buffer[0];
+ for (idx = 0; idx < 8; ++idx) {
+ hadamard_col8(tmp_buf, 8, coeff);
+ coeff += 8;
+ ++tmp_buf;
+ }
+}
+
+// 16x16 2D Hadamard transform; the cross-block stage is done in place on coeff.
+void vp9_hadamard_16x16_c(int16_t const *src_diff, int src_stride,
+ int16_t *coeff) {
+ int idx;
+ for (idx = 0; idx < 4; ++idx) {
+ int16_t const *src_ptr = src_diff + (idx >> 1) * 8 * src_stride
+ + (idx & 0x01) * 8;
+ vp9_hadamard_8x8_c(src_ptr, src_stride, coeff + idx * 64);
+ }
+
+ for (idx = 0; idx < 64; ++idx) {
+ int16_t a0 = coeff[0];
+ int16_t a1 = coeff[64];
+ int16_t a2 = coeff[128];
+ int16_t a3 = coeff[192];
+
+ int16_t b0 = a0 + a1;
+ int16_t b1 = a0 - a1;
+ int16_t b2 = a2 + a3;
+ int16_t b3 = a2 - a3;
+
+ coeff[0] = (b0 + b2) >> 1;
+ coeff[64] = (b1 + b3) >> 1;
+ coeff[128] = (b0 - b2) >> 1;
+ coeff[192] = (b1 - b3) >> 1;
+
+ ++coeff;
+ }
+}
+
+int16_t vp9_satd_c(const int16_t *coeff, int length) {
+ int i;
+ int satd = 0;
+ for (i = 0; i < length; ++i)
+ satd += abs(coeff[i]);
+
+ return (int16_t)satd;
+}
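A usage sketch combining the two new kernels, as an encoder cost model might; the residual buffer here is a hypothetical stand-in:

/* Cost one 8x8 residual block via Hadamard + SATD. */
int16_t diff[64];   /* src - pred residual, stored with stride 8 */
int16_t coeff[64];  /* transform output */
int cost;
/* ... fill diff[] with the prediction residual ... */
vp9_hadamard_8x8_c(diff, 8, coeff);
cost = vp9_satd_c(coeff, 64);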
+
+// Integer projection onto row vectors.
+void vp9_int_pro_row_c(int16_t *hbuf, uint8_t const *ref,
+ const int ref_stride, const int height) {
+ int idx;
+ const int norm_factor = MAX(8, height >> 1);
+ for (idx = 0; idx < 16; ++idx) {
+ int i;
+ hbuf[idx] = 0;
+ for (i = 0; i < height; ++i)
+ hbuf[idx] += ref[i * ref_stride];
+ hbuf[idx] /= norm_factor;
+ ++ref;
+ }
+}
+
+int16_t vp9_int_pro_col_c(uint8_t const *ref, const int width) {
+ int idx;
+ int16_t sum = 0;
+ for (idx = 0; idx < width; ++idx)
+ sum += ref[idx];
+ return sum;
+}
+
+int vp9_vector_var_c(int16_t const *ref, int16_t const *src,
+ const int bwl) {
+ int i;
+ int width = 4 << bwl;
+ int sse = 0, mean = 0, var;
+
+ for (i = 0; i < width; ++i) {
+ int diff = ref[i] - src[i];
+ mean += diff;
+ sse += diff * diff;
+ }
+
+ var = sse - ((mean * mean) >> (bwl + 2));
+ return var;
+}
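Here width = 4 << bwl, so dividing the squared mean by width is exactly the shift by (bwl + 2): var = sse - mean^2 / width. For example, bwl = 2 gives a 16-sample vector and the mean term (mean * mean) >> 4.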
+
+void vp9_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp,
+ int *min, int *max) {
+ int i, j;
+ *min = 255;
+ *max = 0;
+ for (i = 0; i < 8; ++i, s += p, d += dp) {
+ for (j = 0; j < 8; ++j) {
+ int diff = abs(s[j]-d[j]);
+ *min = diff < *min ? diff : *min;
+ *max = diff > *max ? diff : *max;
+ }
+ }
+}
+
#if CONFIG_VP9_HIGHBITDEPTH
unsigned int vp9_highbd_avg_8x8_c(const uint8_t *s8, int p) {
int i, j;
@@ -29,5 +179,32 @@ unsigned int vp9_highbd_avg_8x8_c(const uint8_t *s8, int p) {
return (sum + 32) >> 6;
}
+
+unsigned int vp9_highbd_avg_4x4_c(const uint8_t *s8, int p) {
+ int i, j;
+ int sum = 0;
+ const uint16_t* s = CONVERT_TO_SHORTPTR(s8);
+ for (i = 0; i < 4; ++i, s+=p)
+ for (j = 0; j < 4; sum += s[j], ++j) {}
+
+ return (sum + 8) >> 4;
+}
+
+void vp9_highbd_minmax_8x8_c(const uint8_t *s8, int p, const uint8_t *d8,
+ int dp, int *min, int *max) {
+ int i, j;
+ const uint16_t* s = CONVERT_TO_SHORTPTR(s8);
+ const uint16_t* d = CONVERT_TO_SHORTPTR(d8);
+ *min = 255;
+ *max = 0;
+ for (i = 0; i < 8; ++i, s += p, d += dp) {
+ for (j = 0; j < 8; ++j) {
+ int diff = abs(s[j]-d[j]);
+ *min = diff < *min ? diff : *min;
+ *max = diff > *max ? diff : *max;
+ }
+ }
+}
#endif // CONFIG_VP9_HIGHBITDEPTH
+
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_bitstream.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_bitstream.c
index 421e049697a..d20e067669f 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_bitstream.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_bitstream.c
@@ -34,17 +34,15 @@
#include "vp9/encoder/vp9_tokenize.h"
#include "vp9/encoder/vp9_write_bit_buffer.h"
-static struct vp9_token intra_mode_encodings[INTRA_MODES];
-static struct vp9_token switchable_interp_encodings[SWITCHABLE_FILTERS];
-static struct vp9_token partition_encodings[PARTITION_TYPES];
-static struct vp9_token inter_mode_encodings[INTER_MODES];
-
-void vp9_entropy_mode_init() {
- vp9_tokens_from_tree(intra_mode_encodings, vp9_intra_mode_tree);
- vp9_tokens_from_tree(switchable_interp_encodings, vp9_switchable_interp_tree);
- vp9_tokens_from_tree(partition_encodings, vp9_partition_tree);
- vp9_tokens_from_tree(inter_mode_encodings, vp9_inter_mode_tree);
-}
+static const struct vp9_token intra_mode_encodings[INTRA_MODES] = {
+ {0, 1}, {6, 3}, {28, 5}, {30, 5}, {58, 6}, {59, 6}, {126, 7}, {127, 7},
+ {62, 6}, {2, 2}};
+static const struct vp9_token switchable_interp_encodings[SWITCHABLE_FILTERS] =
+ {{0, 1}, {2, 2}, {3, 2}};
+static const struct vp9_token partition_encodings[PARTITION_TYPES] =
+ {{0, 1}, {2, 2}, {6, 3}, {7, 3}};
+static const struct vp9_token inter_mode_encodings[INTER_MODES] =
+ {{2, 2}, {6, 3}, {0, 1}, {7, 3}};
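Each vp9_token pairs a code value with its bit length, matching a depth-first walk of the corresponding tree. For example, assuming the usual NEARESTMV/NEARMV/ZEROMV/NEWMV ordering of INTER_MODES, ZEROMV is {0, 1} (a single 0 bit) and NEWMV is {7, 3} (bits 1,1,1), so the common zero-motion case is cheapest before entropy coding. Baking the tables in as const replaces the former runtime construction in vp9_entropy_mode_init(), removed above.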
static void write_intra_mode(vp9_writer *w, PREDICTION_MODE mode,
const vp9_prob *probs) {
@@ -79,12 +77,12 @@ static void prob_diff_update(const vp9_tree_index *tree,
}
static void write_selected_tx_size(const VP9_COMMON *cm,
- const MACROBLOCKD *xd,
- TX_SIZE tx_size, BLOCK_SIZE bsize,
- vp9_writer *w) {
+ const MACROBLOCKD *xd, vp9_writer *w) {
+ TX_SIZE tx_size = xd->mi[0]->mbmi.tx_size;
+ BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
const TX_SIZE max_tx_size = max_txsize_lookup[bsize];
const vp9_prob *const tx_probs = get_tx_probs2(max_tx_size, xd,
- &cm->fc.tx_probs);
+ &cm->fc->tx_probs);
vp9_write(w, tx_size != TX_4X4, tx_probs[0]);
if (tx_size != TX_4X4 && max_tx_size >= TX_16X16) {
vp9_write(w, tx_size != TX_8X8, tx_probs[1]);
@@ -104,19 +102,21 @@ static int write_skip(const VP9_COMMON *cm, const MACROBLOCKD *xd,
}
}
-static void update_skip_probs(VP9_COMMON *cm, vp9_writer *w) {
+static void update_skip_probs(VP9_COMMON *cm, vp9_writer *w,
+ FRAME_COUNTS *counts) {
int k;
for (k = 0; k < SKIP_CONTEXTS; ++k)
- vp9_cond_prob_diff_update(w, &cm->fc.skip_probs[k], cm->counts.skip[k]);
+ vp9_cond_prob_diff_update(w, &cm->fc->skip_probs[k], counts->skip[k]);
}
-static void update_switchable_interp_probs(VP9_COMMON *cm, vp9_writer *w) {
+static void update_switchable_interp_probs(VP9_COMMON *cm, vp9_writer *w,
+ FRAME_COUNTS *counts) {
int j;
for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; ++j)
prob_diff_update(vp9_switchable_interp_tree,
- cm->fc.switchable_interp_prob[j],
- cm->counts.switchable_interp[j], SWITCHABLE_FILTERS, w);
+ cm->fc->switchable_interp_prob[j],
+ counts->switchable_interp[j], SWITCHABLE_FILTERS, w);
}
static void pack_mb_tokens(vp9_writer *w,
@@ -201,7 +201,7 @@ static void write_segment_id(vp9_writer *w, const struct segmentation *seg,
// This function encodes the reference frame
static void write_ref_frames(const VP9_COMMON *cm, const MACROBLOCKD *xd,
vp9_writer *w) {
- const MB_MODE_INFO *const mbmi = &xd->mi[0].src_mi->mbmi;
+ const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
const int is_compound = has_second_ref(mbmi);
const int segment_id = mbmi->segment_id;
@@ -237,8 +237,8 @@ static void write_ref_frames(const VP9_COMMON *cm, const MACROBLOCKD *xd,
static void pack_inter_mode_mvs(VP9_COMP *cpi, const MODE_INFO *mi,
vp9_writer *w) {
VP9_COMMON *const cm = &cpi->common;
- const nmv_context *nmvc = &cm->fc.nmvc;
- const MACROBLOCK *const x = &cpi->mb;
+ const nmv_context *nmvc = &cm->fc->nmvc;
+ const MACROBLOCK *const x = &cpi->td.mb;
const MACROBLOCKD *const xd = &x->e_mbd;
const struct segmentation *const seg = &cm->seg;
const MB_MODE_INFO *const mbmi = &mi->mbmi;
@@ -268,14 +268,13 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, const MODE_INFO *mi,
vp9_write(w, is_inter, vp9_get_intra_inter_prob(cm, xd));
if (bsize >= BLOCK_8X8 && cm->tx_mode == TX_MODE_SELECT &&
- !(is_inter &&
- (skip || vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP)))) {
- write_selected_tx_size(cm, xd, mbmi->tx_size, bsize, w);
+ !(is_inter && skip)) {
+ write_selected_tx_size(cm, xd, w);
}
if (!is_inter) {
if (bsize >= BLOCK_8X8) {
- write_intra_mode(w, mode, cm->fc.y_mode_prob[size_group_lookup[bsize]]);
+ write_intra_mode(w, mode, cm->fc->y_mode_prob[size_group_lookup[bsize]]);
} else {
int idx, idy;
const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize];
@@ -283,28 +282,27 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, const MODE_INFO *mi,
for (idy = 0; idy < 2; idy += num_4x4_h) {
for (idx = 0; idx < 2; idx += num_4x4_w) {
const PREDICTION_MODE b_mode = mi->bmi[idy * 2 + idx].as_mode;
- write_intra_mode(w, b_mode, cm->fc.y_mode_prob[0]);
+ write_intra_mode(w, b_mode, cm->fc->y_mode_prob[0]);
}
}
}
- write_intra_mode(w, mbmi->uv_mode, cm->fc.uv_mode_prob[mode]);
+ write_intra_mode(w, mbmi->uv_mode, cm->fc->uv_mode_prob[mode]);
} else {
const int mode_ctx = mbmi->mode_context[mbmi->ref_frame[0]];
- const vp9_prob *const inter_probs = cm->fc.inter_mode_probs[mode_ctx];
+ const vp9_prob *const inter_probs = cm->fc->inter_mode_probs[mode_ctx];
write_ref_frames(cm, xd, w);
// If segment skip is not enabled code the mode.
if (!vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP)) {
if (bsize >= BLOCK_8X8) {
write_inter_mode(w, mode, inter_probs);
- ++cm->counts.inter_mode[mode_ctx][INTER_OFFSET(mode)];
}
}
if (cm->interp_filter == SWITCHABLE) {
const int ctx = vp9_get_pred_context_switchable_interp(xd);
vp9_write_token(w, vp9_switchable_interp_tree,
- cm->fc.switchable_interp_prob[ctx],
+ cm->fc->switchable_interp_prob[ctx],
&switchable_interp_encodings[mbmi->interp_filter]);
++cpi->interp_filter_selected[0][mbmi->interp_filter];
} else {
@@ -320,7 +318,6 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, const MODE_INFO *mi,
const int j = idy * 2 + idx;
const PREDICTION_MODE b_mode = mi->bmi[j].as_mode;
write_inter_mode(w, b_mode, inter_probs);
- ++cm->counts.inter_mode[mode_ctx][INTER_OFFSET(b_mode)];
if (b_mode == NEWMV) {
for (ref = 0; ref < 1 + is_compound; ++ref)
vp9_encode_mv(cpi, w, &mi->bmi[j].as_mv[ref].as_mv,
@@ -341,12 +338,11 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, const MODE_INFO *mi,
}
static void write_mb_modes_kf(const VP9_COMMON *cm, const MACROBLOCKD *xd,
- MODE_INFO *mi_8x8, vp9_writer *w) {
+ MODE_INFO **mi_8x8, vp9_writer *w) {
const struct segmentation *const seg = &cm->seg;
- const MODE_INFO *const mi = mi_8x8;
- const MODE_INFO *const above_mi = mi_8x8[-xd->mi_stride].src_mi;
- const MODE_INFO *const left_mi =
- xd->left_available ? mi_8x8[-1].src_mi : NULL;
+ const MODE_INFO *const mi = mi_8x8[0];
+ const MODE_INFO *const above_mi = xd->above_mi;
+ const MODE_INFO *const left_mi = xd->left_mi;
const MB_MODE_INFO *const mbmi = &mi->mbmi;
const BLOCK_SIZE bsize = mbmi->sb_type;
@@ -356,7 +352,7 @@ static void write_mb_modes_kf(const VP9_COMMON *cm, const MACROBLOCKD *xd,
write_skip(cm, xd, mbmi->segment_id, mi, w);
if (bsize >= BLOCK_8X8 && cm->tx_mode == TX_MODE_SELECT)
- write_selected_tx_size(cm, xd, mbmi->tx_size, bsize, w);
+ write_selected_tx_size(cm, xd, w);
if (bsize >= BLOCK_8X8) {
write_intra_mode(w, mbmi->mode, get_y_mode_probs(mi, above_mi, left_mi, 0));
@@ -382,11 +378,11 @@ static void write_modes_b(VP9_COMP *cpi, const TileInfo *const tile,
const TOKENEXTRA *const tok_end,
int mi_row, int mi_col) {
const VP9_COMMON *const cm = &cpi->common;
- MACROBLOCKD *const xd = &cpi->mb.e_mbd;
+ MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
MODE_INFO *m;
- xd->mi = cm->mi + (mi_row * cm->mi_stride + mi_col);
- m = xd->mi;
+ xd->mi = cm->mi_grid_visible + (mi_row * cm->mi_stride + mi_col);
+ m = xd->mi[0];
set_mi_row_col(xd, tile,
mi_row, num_8x8_blocks_high_lookup[m->mbmi.sb_type],
@@ -429,7 +425,7 @@ static void write_modes_sb(VP9_COMP *cpi,
TOKENEXTRA **tok, const TOKENEXTRA *const tok_end,
int mi_row, int mi_col, BLOCK_SIZE bsize) {
const VP9_COMMON *const cm = &cpi->common;
- MACROBLOCKD *const xd = &cpi->mb.e_mbd;
+ MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
const int bsl = b_width_log2_lookup[bsize];
const int bs = (1 << bsl) / 4;
@@ -440,7 +436,7 @@ static void write_modes_sb(VP9_COMP *cpi,
if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
return;
- m = cm->mi[mi_row * cm->mi_stride + mi_col].src_mi;
+ m = cm->mi_grid_visible[mi_row * cm->mi_stride + mi_col];
partition = partition_lookup[bsl][m->mbmi.sb_type];
write_partition(cm, xd, bs, mi_row, mi_col, partition, bsize, w);
@@ -485,11 +481,12 @@ static void write_modes_sb(VP9_COMP *cpi,
static void write_modes(VP9_COMP *cpi,
const TileInfo *const tile, vp9_writer *w,
TOKENEXTRA **tok, const TOKENEXTRA *const tok_end) {
+ MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
int mi_row, mi_col;
for (mi_row = tile->mi_row_start; mi_row < tile->mi_row_end;
mi_row += MI_BLOCK_SIZE) {
- vp9_zero(cpi->mb.e_mbd.left_seg_context);
+ vp9_zero(xd->left_seg_context);
for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
mi_col += MI_BLOCK_SIZE)
write_modes_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col,
@@ -500,7 +497,7 @@ static void write_modes(VP9_COMP *cpi,
static void build_tree_distribution(VP9_COMP *cpi, TX_SIZE tx_size,
vp9_coeff_stats *coef_branch_ct,
vp9_coeff_probs_model *coef_probs) {
- vp9_coeff_count *coef_counts = cpi->coef_counts[tx_size];
+ vp9_coeff_count *coef_counts = cpi->td.rd_counts.coef_counts[tx_size];
unsigned int (*eob_branch_ct)[REF_TYPES][COEF_BANDS][COEFF_CONTEXTS] =
cpi->common.counts.eob_branch[tx_size];
int i, j, k, l, m;
@@ -528,10 +525,12 @@ static void update_coef_probs_common(vp9_writer* const bc, VP9_COMP *cpi,
TX_SIZE tx_size,
vp9_coeff_stats *frame_branch_ct,
vp9_coeff_probs_model *new_coef_probs) {
- vp9_coeff_probs_model *old_coef_probs = cpi->common.fc.coef_probs[tx_size];
+ vp9_coeff_probs_model *old_coef_probs = cpi->common.fc->coef_probs[tx_size];
const vp9_prob upd = DIFF_UPDATE_PROB;
const int entropy_nodes_update = UNCONSTRAINED_NODES;
int i, j, k, l, t;
+ int stepsize = cpi->sf.coeff_prob_appx_step;
+
switch (cpi->sf.use_fast_coef_updates) {
case TWO_LOOP: {
/* dry run to see if there is any update at all needed */
@@ -549,7 +548,7 @@ static void update_coef_probs_common(vp9_writer* const bc, VP9_COMP *cpi,
if (t == PIVOT_NODE)
s = vp9_prob_diff_update_savings_search_model(
frame_branch_ct[i][j][k][l][0],
- old_coef_probs[i][j][k][l], &newp, upd);
+ old_coef_probs[i][j][k][l], &newp, upd, stepsize);
else
s = vp9_prob_diff_update_savings_search(
frame_branch_ct[i][j][k][l][t], oldp, &newp, upd);
@@ -587,7 +586,7 @@ static void update_coef_probs_common(vp9_writer* const bc, VP9_COMP *cpi,
if (t == PIVOT_NODE)
s = vp9_prob_diff_update_savings_search_model(
frame_branch_ct[i][j][k][l][0],
- old_coef_probs[i][j][k][l], &newp, upd);
+ old_coef_probs[i][j][k][l], &newp, upd, stepsize);
else
s = vp9_prob_diff_update_savings_search(
frame_branch_ct[i][j][k][l][t],
@@ -608,14 +607,7 @@ static void update_coef_probs_common(vp9_writer* const bc, VP9_COMP *cpi,
return;
}
- case ONE_LOOP:
case ONE_LOOP_REDUCED: {
- const int prev_coef_contexts_to_update =
- cpi->sf.use_fast_coef_updates == ONE_LOOP_REDUCED ?
- COEFF_CONTEXTS >> 1 : COEFF_CONTEXTS;
- const int coef_band_to_update =
- cpi->sf.use_fast_coef_updates == ONE_LOOP_REDUCED ?
- COEF_BANDS >> 1 : COEF_BANDS;
int updates = 0;
int noupdates_before_first = 0;
for (i = 0; i < PLANE_TYPES; ++i) {
@@ -628,21 +620,19 @@ static void update_coef_probs_common(vp9_writer* const bc, VP9_COMP *cpi,
vp9_prob *oldp = old_coef_probs[i][j][k][l] + t;
int s;
int u = 0;
- if (l >= prev_coef_contexts_to_update ||
- k >= coef_band_to_update) {
- u = 0;
+
+ if (t == PIVOT_NODE) {
+ s = vp9_prob_diff_update_savings_search_model(
+ frame_branch_ct[i][j][k][l][0],
+ old_coef_probs[i][j][k][l], &newp, upd, stepsize);
} else {
- if (t == PIVOT_NODE)
- s = vp9_prob_diff_update_savings_search_model(
- frame_branch_ct[i][j][k][l][0],
- old_coef_probs[i][j][k][l], &newp, upd);
- else
- s = vp9_prob_diff_update_savings_search(
- frame_branch_ct[i][j][k][l][t],
- *oldp, &newp, upd);
- if (s > 0 && newp != *oldp)
- u = 1;
+ s = vp9_prob_diff_update_savings_search(
+ frame_branch_ct[i][j][k][l][t],
+ *oldp, &newp, upd);
}
+
+ if (s > 0 && newp != *oldp)
+ u = 1;
updates += u;
if (u == 0 && updates == 0) {
noupdates_before_first++;
@@ -671,7 +661,6 @@ static void update_coef_probs_common(vp9_writer* const bc, VP9_COMP *cpi,
}
return;
}
-
default:
assert(0);
}
@@ -681,16 +670,19 @@ static void update_coef_probs(VP9_COMP *cpi, vp9_writer* w) {
const TX_MODE tx_mode = cpi->common.tx_mode;
const TX_SIZE max_tx_size = tx_mode_to_biggest_tx_size[tx_mode];
TX_SIZE tx_size;
- vp9_coeff_stats frame_branch_ct[TX_SIZES][PLANE_TYPES];
- vp9_coeff_probs_model frame_coef_probs[TX_SIZES][PLANE_TYPES];
-
- for (tx_size = TX_4X4; tx_size <= TX_32X32; ++tx_size)
- build_tree_distribution(cpi, tx_size, frame_branch_ct[tx_size],
- frame_coef_probs[tx_size]);
-
- for (tx_size = TX_4X4; tx_size <= max_tx_size; ++tx_size)
- update_coef_probs_common(w, cpi, tx_size, frame_branch_ct[tx_size],
- frame_coef_probs[tx_size]);
+ for (tx_size = TX_4X4; tx_size <= max_tx_size; ++tx_size) {
+ vp9_coeff_stats frame_branch_ct[PLANE_TYPES];
+ vp9_coeff_probs_model frame_coef_probs[PLANE_TYPES];
+ if (cpi->td.counts->tx.tx_totals[tx_size] <= 20 ||
+ (tx_size >= TX_16X16 && cpi->sf.tx_size_search_method == USE_TX_8X8)) {
+ vp9_write_bit(w, 0);
+ } else {
+ build_tree_distribution(cpi, tx_size, frame_branch_ct,
+ frame_coef_probs);
+ update_coef_probs_common(w, cpi, tx_size, frame_branch_ct,
+ frame_coef_probs);
+ }
+ }
}
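The rewritten update_coef_probs above gates the whole per-tx_size update behind two cheap checks before doing any tree-distribution work. A minimal sketch of that gating, with hypothetical names (tx_totals and search_uses_8x8_only stand in for cpi->td.counts->tx.tx_totals[tx_size] and cpi->sf.tx_size_search_method == USE_TX_8X8):

    /* Sketch only: nonzero when a coefficient-probability update is worth
     * signaling for this transform size. */
    static int worth_updating(int tx_totals, TX_SIZE tx_size,
                              int search_uses_8x8_only) {
      if (tx_totals <= 20)          /* too few blocks to amortize the cost */
        return 0;
      if (tx_size >= TX_16X16 && search_uses_8x8_only)
        return 0;                   /* sizes the RD search never selects */
      return 1;
    }

When this returns 0, the encoder writes a single 0 bit, which the decoder is assumed to read as the usual "no update" marker for that transform size.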
static void encode_loopfilter(struct loopfilter *lf,
@@ -813,7 +805,8 @@ static void encode_segmentation(VP9_COMMON *cm, MACROBLOCKD *xd,
}
}
-static void encode_txfm_probs(VP9_COMMON *cm, vp9_writer *w) {
+static void encode_txfm_probs(VP9_COMMON *cm, vp9_writer *w,
+ FRAME_COUNTS *counts) {
// Mode
vp9_write_literal(w, MIN(cm->tx_mode, ALLOW_32X32), 2);
if (cm->tx_mode >= ALLOW_32X32)
@@ -828,22 +821,22 @@ static void encode_txfm_probs(VP9_COMMON *cm, vp9_writer *w) {
for (i = 0; i < TX_SIZE_CONTEXTS; i++) {
- tx_counts_to_branch_counts_8x8(cm->counts.tx.p8x8[i], ct_8x8p);
+ tx_counts_to_branch_counts_8x8(counts->tx.p8x8[i], ct_8x8p);
for (j = 0; j < TX_SIZES - 3; j++)
- vp9_cond_prob_diff_update(w, &cm->fc.tx_probs.p8x8[i][j], ct_8x8p[j]);
+ vp9_cond_prob_diff_update(w, &cm->fc->tx_probs.p8x8[i][j], ct_8x8p[j]);
}
for (i = 0; i < TX_SIZE_CONTEXTS; i++) {
- tx_counts_to_branch_counts_16x16(cm->counts.tx.p16x16[i], ct_16x16p);
+ tx_counts_to_branch_counts_16x16(counts->tx.p16x16[i], ct_16x16p);
for (j = 0; j < TX_SIZES - 2; j++)
- vp9_cond_prob_diff_update(w, &cm->fc.tx_probs.p16x16[i][j],
+ vp9_cond_prob_diff_update(w, &cm->fc->tx_probs.p16x16[i][j],
ct_16x16p[j]);
}
for (i = 0; i < TX_SIZE_CONTEXTS; i++) {
- tx_counts_to_branch_counts_32x32(cm->counts.tx.p32x32[i], ct_32x32p);
+ tx_counts_to_branch_counts_32x32(counts->tx.p32x32[i], ct_32x32p);
for (j = 0; j < TX_SIZES - 1; j++)
- vp9_cond_prob_diff_update(w, &cm->fc.tx_probs.p32x32[i][j],
+ vp9_cond_prob_diff_update(w, &cm->fc->tx_probs.p32x32[i][j],
ct_32x32p[j]);
}
}
@@ -858,7 +851,7 @@ static void write_interp_filter(INTERP_FILTER filter,
vp9_wb_write_literal(wb, filter_to_literal[filter], 2);
}
-static void fix_interp_filter(VP9_COMMON *cm) {
+static void fix_interp_filter(VP9_COMMON *cm, FRAME_COUNTS *counts) {
if (cm->interp_filter == SWITCHABLE) {
// Check to see if only one of the filters is actually used
int count[SWITCHABLE_FILTERS];
@@ -866,7 +859,7 @@ static void fix_interp_filter(VP9_COMMON *cm) {
for (i = 0; i < SWITCHABLE_FILTERS; ++i) {
count[i] = 0;
for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; ++j)
- count[i] += cm->counts.switchable_interp[j][i];
+ count[i] += counts->switchable_interp[j][i];
c += (count[i] > 0);
}
if (c == 1) {
@@ -929,42 +922,31 @@ static int get_refresh_mask(VP9_COMP *cpi) {
static size_t encode_tiles(VP9_COMP *cpi, uint8_t *data_ptr) {
VP9_COMMON *const cm = &cpi->common;
vp9_writer residual_bc;
-
int tile_row, tile_col;
- TOKENEXTRA *tok[4][1 << 6], *tok_end;
+ TOKENEXTRA *tok_end;
size_t total_size = 0;
const int tile_cols = 1 << cm->log2_tile_cols;
const int tile_rows = 1 << cm->log2_tile_rows;
- TileInfo tile[4][1 << 6];
- TOKENEXTRA *pre_tok = cpi->tok;
- int tile_tok = 0;
- vpx_memset(cm->above_seg_context, 0, sizeof(*cm->above_seg_context) *
- mi_cols_aligned_to_sb(cm->mi_cols));
-
- for (tile_row = 0; tile_row < tile_rows; ++tile_row) {
- for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
- vp9_tile_init(&tile[tile_row][tile_col], cm, tile_row, tile_col);
-
- tok[tile_row][tile_col] = pre_tok + tile_tok;
- pre_tok = tok[tile_row][tile_col];
- tile_tok = allocated_tokens(tile[tile_row][tile_col]);
- }
- }
+ memset(cm->above_seg_context, 0,
+ sizeof(*cm->above_seg_context) * mi_cols_aligned_to_sb(cm->mi_cols));
for (tile_row = 0; tile_row < tile_rows; tile_row++) {
for (tile_col = 0; tile_col < tile_cols; tile_col++) {
- const TileInfo * const ptile = &tile[tile_row][tile_col];
+ int tile_idx = tile_row * tile_cols + tile_col;
+ TOKENEXTRA *tok = cpi->tile_tok[tile_row][tile_col];
- tok_end = tok[tile_row][tile_col] + cpi->tok_count[tile_row][tile_col];
+ tok_end = cpi->tile_tok[tile_row][tile_col] +
+ cpi->tok_count[tile_row][tile_col];
if (tile_col < tile_cols - 1 || tile_row < tile_rows - 1)
vp9_start_encode(&residual_bc, data_ptr + total_size + 4);
else
vp9_start_encode(&residual_bc, data_ptr + total_size);
- write_modes(cpi, ptile, &residual_bc, &tok[tile_row][tile_col], tok_end);
- assert(tok[tile_row][tile_col] == tok_end);
+ write_modes(cpi, &cpi->tile_data[tile_idx].tile_info,
+ &residual_bc, &tok, tok_end);
+ assert(tok == tok_end);
vp9_stop_encode(&residual_bc);
if (tile_col < tile_cols - 1 || tile_row < tile_rows - 1) {
// size of this tile
@@ -1006,8 +988,6 @@ static void write_frame_size_with_refs(VP9_COMP *cpi,
MV_REFERENCE_FRAME ref_frame;
for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
YV12_BUFFER_CONFIG *cfg = get_ref_frame_buffer(cpi, ref_frame);
- found = cm->width == cfg->y_crop_width &&
- cm->height == cfg->y_crop_height;
// Set "found" to 0 for temporal svc and for spatial svc key frame
if (cpi->use_svc &&
@@ -1020,6 +1000,9 @@ static void write_frame_size_with_refs(VP9_COMP *cpi,
cpi->svc.layer_context[0].frames_from_key_frame <
cpi->svc.number_temporal_layers + 1))) {
found = 0;
+ } else if (cfg != NULL) {
+ found = cm->width == cfg->y_crop_width &&
+ cm->height == cfg->y_crop_height;
}
vp9_wb_write_bit(wb, found);
if (found) {
@@ -1068,7 +1051,7 @@ static void write_bitdepth_colorspace_sampling(
vp9_wb_write_bit(wb, cm->bit_depth == VPX_BITS_10 ? 0 : 1);
}
vp9_wb_write_literal(wb, cm->color_space, 3);
- if (cm->color_space != SRGB) {
+ if (cm->color_space != VPX_CS_SRGB) {
vp9_wb_write_bit(wb, 0); // 0: [16, 235] (i.e. xvYCC), 1: [0, 255]
if (cm->profile == PROFILE_1 || cm->profile == PROFILE_3) {
assert(cm->subsampling_x != 1 || cm->subsampling_y != 1);
@@ -1087,6 +1070,7 @@ static void write_bitdepth_colorspace_sampling(
static void write_uncompressed_header(VP9_COMP *cpi,
struct vp9_write_bit_buffer *wb) {
VP9_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
vp9_wb_write_literal(wb, VP9_FRAME_MARKER, 2);
@@ -1130,7 +1114,8 @@ static void write_uncompressed_header(VP9_COMP *cpi,
MV_REFERENCE_FRAME ref_frame;
vp9_wb_write_literal(wb, get_refresh_mask(cpi), REF_FRAMES);
for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
- vp9_wb_write_literal(wb, get_ref_frame_idx(cpi, ref_frame),
+ assert(get_ref_frame_map_idx(cpi, ref_frame) != INVALID_IDX);
+ vp9_wb_write_literal(wb, get_ref_frame_map_idx(cpi, ref_frame),
REF_FRAMES_LOG2);
vp9_wb_write_bit(wb, cm->ref_frame_sign_bias[ref_frame]);
}
@@ -1139,7 +1124,7 @@ static void write_uncompressed_header(VP9_COMP *cpi,
vp9_wb_write_bit(wb, cm->allow_high_precision_mv);
- fix_interp_filter(cm);
+ fix_interp_filter(cm, cpi->td.counts);
write_interp_filter(cm->interp_filter, wb);
}
}
@@ -1153,15 +1138,16 @@ static void write_uncompressed_header(VP9_COMP *cpi,
encode_loopfilter(&cm->lf, wb);
encode_quantization(cm, wb);
- encode_segmentation(cm, &cpi->mb.e_mbd, wb);
+ encode_segmentation(cm, xd, wb);
write_tile_info(cm, wb);
}
static size_t write_compressed_header(VP9_COMP *cpi, uint8_t *data) {
VP9_COMMON *const cm = &cpi->common;
- MACROBLOCKD *const xd = &cpi->mb.e_mbd;
- FRAME_CONTEXT *const fc = &cm->fc;
+ MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+ FRAME_CONTEXT *const fc = cm->fc;
+ FRAME_COUNTS *counts = cpi->td.counts;
vp9_writer header_bc;
vp9_start_encode(&header_bc, data);
@@ -1169,28 +1155,26 @@ static size_t write_compressed_header(VP9_COMP *cpi, uint8_t *data) {
if (xd->lossless)
cm->tx_mode = ONLY_4X4;
else
- encode_txfm_probs(cm, &header_bc);
+ encode_txfm_probs(cm, &header_bc, counts);
update_coef_probs(cpi, &header_bc);
- update_skip_probs(cm, &header_bc);
+ update_skip_probs(cm, &header_bc, counts);
if (!frame_is_intra_only(cm)) {
int i;
for (i = 0; i < INTER_MODE_CONTEXTS; ++i)
- prob_diff_update(vp9_inter_mode_tree, cm->fc.inter_mode_probs[i],
- cm->counts.inter_mode[i], INTER_MODES, &header_bc);
-
- vp9_zero(cm->counts.inter_mode);
+ prob_diff_update(vp9_inter_mode_tree, cm->fc->inter_mode_probs[i],
+ counts->inter_mode[i], INTER_MODES, &header_bc);
if (cm->interp_filter == SWITCHABLE)
- update_switchable_interp_probs(cm, &header_bc);
+ update_switchable_interp_probs(cm, &header_bc, counts);
for (i = 0; i < INTRA_INTER_CONTEXTS; i++)
vp9_cond_prob_diff_update(&header_bc, &fc->intra_inter_prob[i],
- cm->counts.intra_inter[i]);
+ counts->intra_inter[i]);
- if (cm->allow_comp_inter_inter) {
+ if (cpi->allow_comp_inter_inter) {
const int use_compound_pred = cm->reference_mode != SINGLE_REFERENCE;
const int use_hybrid_pred = cm->reference_mode == REFERENCE_MODE_SELECT;
@@ -1200,33 +1184,34 @@ static size_t write_compressed_header(VP9_COMP *cpi, uint8_t *data) {
if (use_hybrid_pred)
for (i = 0; i < COMP_INTER_CONTEXTS; i++)
vp9_cond_prob_diff_update(&header_bc, &fc->comp_inter_prob[i],
- cm->counts.comp_inter[i]);
+ counts->comp_inter[i]);
}
}
if (cm->reference_mode != COMPOUND_REFERENCE) {
for (i = 0; i < REF_CONTEXTS; i++) {
vp9_cond_prob_diff_update(&header_bc, &fc->single_ref_prob[i][0],
- cm->counts.single_ref[i][0]);
+ counts->single_ref[i][0]);
vp9_cond_prob_diff_update(&header_bc, &fc->single_ref_prob[i][1],
- cm->counts.single_ref[i][1]);
+ counts->single_ref[i][1]);
}
}
if (cm->reference_mode != SINGLE_REFERENCE)
for (i = 0; i < REF_CONTEXTS; i++)
vp9_cond_prob_diff_update(&header_bc, &fc->comp_ref_prob[i],
- cm->counts.comp_ref[i]);
+ counts->comp_ref[i]);
for (i = 0; i < BLOCK_SIZE_GROUPS; ++i)
- prob_diff_update(vp9_intra_mode_tree, cm->fc.y_mode_prob[i],
- cm->counts.y_mode[i], INTRA_MODES, &header_bc);
+ prob_diff_update(vp9_intra_mode_tree, cm->fc->y_mode_prob[i],
+ counts->y_mode[i], INTRA_MODES, &header_bc);
for (i = 0; i < PARTITION_CONTEXTS; ++i)
prob_diff_update(vp9_partition_tree, fc->partition_prob[i],
- cm->counts.partition[i], PARTITION_TYPES, &header_bc);
+ counts->partition[i], PARTITION_TYPES, &header_bc);
- vp9_write_nmv_probs(cm, cm->allow_high_precision_mv, &header_bc);
+ vp9_write_nmv_probs(cm, cm->allow_high_precision_mv, &header_bc,
+ &counts->mv);
}
vp9_stop_encode(&header_bc);
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_bitstream.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_bitstream.h
index b48826140f9..da6b4146422 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_bitstream.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_bitstream.h
@@ -18,8 +18,6 @@ extern "C" {
#include "vp9/encoder/vp9_encoder.h"
-void vp9_entropy_mode_init();
-
void vp9_pack_bitstream(VP9_COMP *cpi, uint8_t *dest, size_t *size);
static INLINE int vp9_preserve_existing_gf(VP9_COMP *cpi) {
@@ -29,7 +27,7 @@ static INLINE int vp9_preserve_existing_gf(VP9_COMP *cpi) {
(is_two_pass_svc(cpi) &&
cpi->svc.spatial_layer_id == 0 &&
cpi->svc.layer_context[0].gold_ref_idx >=0 &&
- cpi->oxcf.ss_play_alternate[0]));
+ cpi->oxcf.ss_enable_auto_arf[0]));
}
#ifdef __cplusplus
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_block.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_block.h
index 5194c4c276b..04a1b8f3c2f 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_block.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_block.h
@@ -40,8 +40,6 @@ struct macroblock_plane {
int16_t *round;
int64_t quant_thred[2];
- // Zbin Over Quant value
- int16_t zbin_extra;
};
/* The [2] dimension is for whether we skip the EOB node (i.e. if previous
@@ -67,6 +65,11 @@ struct macroblock {
int rdmult;
int mb_energy;
+ // These are set to their default values at the beginning, and then adjusted
+ // further in the encoding process.
+ BLOCK_SIZE min_partition_size;
+ BLOCK_SIZE max_partition_size;
+
int mv_best_ref_index[MAX_REF_FRAMES];
unsigned int max_mv_context[MAX_REF_FRAMES];
unsigned int source_variance;
@@ -98,8 +101,6 @@ struct macroblock {
// note that token_costs is the cost when eob node is skipped
vp9_coeff_cost token_costs[TX_SIZES];
- int in_static_area;
-
int optimize;
// indicate if it is in the rd search loop or encoding process
@@ -117,6 +118,10 @@ struct macroblock {
// Used to store sub partition's choices.
MV pred_mv[MAX_REF_FRAMES];
+ // Strong color activity detection. Used in RTC coding mode to enhance
+ // the visual quality at the boundary of moving color objects.
+ uint8_t color_sensitivity[2];
+
void (*fwd_txm4x4)(const int16_t *input, tran_low_t *output, int stride);
void (*itxm_add)(const tran_low_t *input, uint8_t *dest, int stride, int eob);
#if CONFIG_VP9_HIGHBITDEPTH
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_blockiness.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_blockiness.c
new file mode 100644
index 00000000000..b8629bd3bb5
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_blockiness.c
@@ -0,0 +1,138 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_convolve.h"
+#include "vp9/common/vp9_filter.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+
+static int horizontal_filter(const uint8_t *s) {
+ return (s[1] - s[-2]) * 2 + (s[-1] - s[0]) * 6;
+}
+
+static int vertical_filter(const uint8_t *s, int p) {
+ return (s[p] - s[-2 * p]) * 2 + (s[-p] - s[0]) * 6;
+}
+
+static int variance(int sum, int sum_squared, int size) {
+ return sum_squared / size - (sum / size) * (sum / size);
+}
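// Worked example of the integer variance above with size = 4: a flat run
// q = {10, 10, 10, 10} gives sum = 40 and sum_squared = 400, so
// 400 / 4 - (40 / 4) * (40 / 4) = 0, while q = {0, 0, 20, 20} gives
// 800 / 4 - 10 * 10 = 100. Flat edges therefore keep the
// 1 / (1 + var_0 + var_1) scaling near 1, which is what lets the metric
// emphasize them.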
+// Calculate a blockiness level for a vertical block edge.
+// This function returns a new blockiness metric that's defined as
+
+// p0 p1 p2 p3
+// q0 q1 q2 q3
+// block edge ->
+// r0 r1 r2 r3
+// s0 s1 s2 s3
+
+// blockiness = p0*-2+q0*6+r0*-6+s0*2 +
+// p1*-2+q1*6+r1*-6+s1*2 +
+// p2*-2+q2*6+r2*-6+s2*2 +
+// p3*-2+q3*6+r3*-6+s3*2 ;
+
+// reconstructed_blockiness = max(blockiness from reconstructed buffer -
+//                                blockiness from source buffer, 0)
+//
+// I make the assumption that flat blocks are much more visible than high
+// contrast blocks. As such, I scale the result of the blockiness calculation
+// by dividing the blockiness by the variance of the pixels on either side
+// of the edge as follows:
+// var_0 = (q0^2+q1^2+q2^2+q3^2)/4 - ((q0 + q1 + q2 + q3) / 4)^2
+// var_1 = (r0^2+r1^2+r2^2+r3^2)/4 - ((r0 + r1 + r2 + r3) / 4)^2
+// The returned blockiness is the scaled value
+// reconstructed_blockiness / (1 + var_0 + var_1);
+int blockiness_vertical(const uint8_t *s, int sp, const uint8_t *r, int rp,
+ int size) {
+ int s_blockiness = 0;
+ int r_blockiness = 0;
+ int sum_0 = 0;
+ int sum_sq_0 = 0;
+ int sum_1 = 0;
+ int sum_sq_1 = 0;
+ int i;
+ int var_0;
+ int var_1;
+ for (i = 0; i < size; ++i, s += sp, r += rp) {
+ s_blockiness += horizontal_filter(s);
+ r_blockiness += horizontal_filter(r);
+ sum_0 += s[0];
+ sum_sq_0 += s[0]*s[0];
+ sum_1 += s[-1];
+ sum_sq_1 += s[-1]*s[-1];
+ }
+ var_0 = variance(sum_0, sum_sq_0, size);
+ var_1 = variance(sum_1, sum_sq_1, size);
+ r_blockiness = abs(r_blockiness);
+ s_blockiness = abs(s_blockiness);
+
+ if (r_blockiness > s_blockiness)
+ return (r_blockiness - s_blockiness) / (1 + var_0 + var_1);
+ else
+ return 0;
+}
+
+// Calculate a blockiness level for a horizontal block edge,
+// in the same way as above.
+int blockiness_horizontal(const uint8_t *s, int sp, const uint8_t *r, int rp,
+ int size) {
+ int s_blockiness = 0;
+ int r_blockiness = 0;
+ int sum_0 = 0;
+ int sum_sq_0 = 0;
+ int sum_1 = 0;
+ int sum_sq_1 = 0;
+ int i;
+ int var_0;
+ int var_1;
+ for (i = 0; i < size; ++i, ++s, ++r) {
+ s_blockiness += vertical_filter(s, sp);
+ r_blockiness += vertical_filter(r, rp);
+ sum_0 += s[0];
+ sum_sq_0 += s[0] * s[0];
+ sum_1 += s[-sp];
+ sum_sq_1 += s[-sp] * s[-sp];
+ }
+ var_0 = variance(sum_0, sum_sq_0, size);
+ var_1 = variance(sum_1, sum_sq_1, size);
+ r_blockiness = abs(r_blockiness);
+ s_blockiness = abs(s_blockiness);
+
+ if (r_blockiness > s_blockiness)
+ return (r_blockiness - s_blockiness) / (1 + var_0 + var_1);
+ else
+ return 0;
+}
+
+// This function currently computes the blockiness for the entire frame by
+// looking at all block borders in steps of 4.
+double vp9_get_blockiness(const unsigned char *img1, int img1_pitch,
+ const unsigned char *img2, int img2_pitch,
+                          int width, int height) {
+ double blockiness = 0;
+ int i, j;
+ vp9_clear_system_state();
+ for (i = 0; i < height; i += 4, img1 += img1_pitch * 4,
+ img2 += img2_pitch * 4) {
+ for (j = 0; j < width; j += 4) {
+ if (i > 0 && i < height && j > 0 && j < width) {
+ blockiness += blockiness_vertical(img1 + j, img1_pitch,
+ img2 + j, img2_pitch, 4);
+ blockiness += blockiness_horizontal(img1 + j, img1_pitch,
+ img2 + j, img2_pitch, 4);
+ }
+ }
+ }
+ blockiness /= width * height / 16;
+ return blockiness;
+}
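A short usage sketch for the new metric (not part of the patch); src and recon are hypothetical YV12_BUFFER_CONFIG pointers of matching dimensions:

    // Hedged sketch: compare a reconstructed luma plane against its source.
    double b = vp9_get_blockiness(src->y_buffer, src->y_stride,
                                  recon->y_buffer, recon->y_stride,
                                  src->y_crop_width, src->y_crop_height);
    // b > 0 indicates blockiness added by coding; 0 means none was detected.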
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_context_tree.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_context_tree.c
index 12acc51143a..f647ab39535 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_context_tree.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_context_tree.c
@@ -87,7 +87,7 @@ static void free_tree_contexts(PC_TREE *tree) {
// partition level. There are contexts for none, horizontal, vertical, and
// split. Along with a block_size value and a selected block_size which
// represents the state of our search.
-void vp9_setup_pc_tree(VP9_COMMON *cm, VP9_COMP *cpi) {
+void vp9_setup_pc_tree(VP9_COMMON *cm, ThreadData *td) {
int i, j;
const int leaf_nodes = 64;
const int tree_nodes = 64 + 16 + 4 + 1;
@@ -97,24 +97,24 @@ void vp9_setup_pc_tree(VP9_COMMON *cm, VP9_COMP *cpi) {
int square_index = 1;
int nodes;
- vpx_free(cpi->leaf_tree);
- CHECK_MEM_ERROR(cm, cpi->leaf_tree, vpx_calloc(leaf_nodes,
- sizeof(*cpi->leaf_tree)));
- vpx_free(cpi->pc_tree);
- CHECK_MEM_ERROR(cm, cpi->pc_tree, vpx_calloc(tree_nodes,
- sizeof(*cpi->pc_tree)));
+ vpx_free(td->leaf_tree);
+ CHECK_MEM_ERROR(cm, td->leaf_tree, vpx_calloc(leaf_nodes,
+ sizeof(*td->leaf_tree)));
+ vpx_free(td->pc_tree);
+ CHECK_MEM_ERROR(cm, td->pc_tree, vpx_calloc(tree_nodes,
+ sizeof(*td->pc_tree)));
- this_pc = &cpi->pc_tree[0];
- this_leaf = &cpi->leaf_tree[0];
+ this_pc = &td->pc_tree[0];
+ this_leaf = &td->leaf_tree[0];
// 4x4 blocks smaller than 8x8 but in the same 8x8 block share the same
// context so we only need to allocate 1 for each 8x8 block.
for (i = 0; i < leaf_nodes; ++i)
- alloc_mode_context(cm, 1, &cpi->leaf_tree[i]);
+ alloc_mode_context(cm, 1, &td->leaf_tree[i]);
// Sets up all the leaf nodes in the tree.
for (pc_tree_index = 0; pc_tree_index < leaf_nodes; ++pc_tree_index) {
- PC_TREE *const tree = &cpi->pc_tree[pc_tree_index];
+ PC_TREE *const tree = &td->pc_tree[pc_tree_index];
tree->block_size = square[0];
alloc_tree_contexts(cm, tree, 4);
tree->leaf_split[0] = this_leaf++;
@@ -126,7 +126,7 @@ void vp9_setup_pc_tree(VP9_COMMON *cm, VP9_COMP *cpi) {
// from leafs to the root.
for (nodes = 16; nodes > 0; nodes >>= 2) {
for (i = 0; i < nodes; ++i) {
- PC_TREE *const tree = &cpi->pc_tree[pc_tree_index];
+ PC_TREE *const tree = &td->pc_tree[pc_tree_index];
alloc_tree_contexts(cm, tree, 4 << (2 * square_index));
tree->block_size = square[square_index];
for (j = 0; j < 4; j++)
@@ -135,24 +135,24 @@ void vp9_setup_pc_tree(VP9_COMMON *cm, VP9_COMP *cpi) {
}
++square_index;
}
- cpi->pc_root = &cpi->pc_tree[tree_nodes - 1];
- cpi->pc_root[0].none.best_mode_index = 2;
+ td->pc_root = &td->pc_tree[tree_nodes - 1];
+ td->pc_root[0].none.best_mode_index = 2;
}
-void vp9_free_pc_tree(VP9_COMP *cpi) {
+void vp9_free_pc_tree(ThreadData *td) {
const int tree_nodes = 64 + 16 + 4 + 1;
int i;
// Set up all 4x4 mode contexts
for (i = 0; i < 64; ++i)
- free_mode_context(&cpi->leaf_tree[i]);
+ free_mode_context(&td->leaf_tree[i]);
// Sets up all the leaf nodes in the tree.
for (i = 0; i < tree_nodes; ++i)
- free_tree_contexts(&cpi->pc_tree[i]);
+ free_tree_contexts(&td->pc_tree[i]);
- vpx_free(cpi->pc_tree);
- cpi->pc_tree = NULL;
- vpx_free(cpi->leaf_tree);
- cpi->leaf_tree = NULL;
+ vpx_free(td->pc_tree);
+ td->pc_tree = NULL;
+ vpx_free(td->leaf_tree);
+ td->leaf_tree = NULL;
}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_context_tree.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_context_tree.h
index 6b28ee59182..70bf032c34f 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_context_tree.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_context_tree.h
@@ -15,6 +15,7 @@
struct VP9_COMP;
struct VP9Common;
+struct ThreadData;
// Structure to hold snapshot of coding context during the mode picking process
typedef struct {
@@ -34,6 +35,7 @@ typedef struct {
int is_coded;
int num_4x4_blk;
int skip;
+ int pred_pixel_ready;
// For current partition, only if all Y, U, and V transform blocks'
// coefficients are quantized to 0, skippable is set to 0.
int skippable;
@@ -45,6 +47,11 @@ typedef struct {
int64_t tx_rd_diff[TX_MODES];
int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS];
+  // TODO(jingning) Use RD_COST struct here instead. This involves a broader
+  // scope of refactoring.
+ int rate;
+ int64_t dist;
+
#if CONFIG_VP9_TEMPORAL_DENOISING
unsigned int newmv_sse;
unsigned int zeromv_sse;
@@ -73,7 +80,7 @@ typedef struct PC_TREE {
};
} PC_TREE;
-void vp9_setup_pc_tree(struct VP9Common *cm, struct VP9_COMP *cpi);
-void vp9_free_pc_tree(struct VP9_COMP *cpi);
+void vp9_setup_pc_tree(struct VP9Common *cm, struct ThreadData *td);
+void vp9_free_pc_tree(struct ThreadData *td);
#endif /* VP9_ENCODER_VP9_CONTEXT_TREE_H_ */
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_dct.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_dct.c
index 1090d04bb18..9e6ca3d594c 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_dct.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_dct.c
@@ -17,6 +17,7 @@
#include "vp9/common/vp9_blockd.h"
#include "vp9/common/vp9_idct.h"
#include "vp9/common/vp9_systemdependent.h"
+#include "vp9/encoder/vp9_dct.h"
static INLINE tran_high_t fdct_round_shift(tran_high_t input) {
tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
@@ -26,7 +27,7 @@ static INLINE tran_high_t fdct_round_shift(tran_high_t input) {
return rv;
}
-static void fdct4(const tran_low_t *input, tran_low_t *output) {
+void vp9_fdct4(const tran_low_t *input, tran_low_t *output) {
tran_high_t step[4];
tran_high_t temp1, temp2;
@@ -37,12 +38,12 @@ static void fdct4(const tran_low_t *input, tran_low_t *output) {
temp1 = (step[0] + step[1]) * cospi_16_64;
temp2 = (step[0] - step[1]) * cospi_16_64;
- output[0] = fdct_round_shift(temp1);
- output[2] = fdct_round_shift(temp2);
+ output[0] = (tran_low_t)fdct_round_shift(temp1);
+ output[2] = (tran_low_t)fdct_round_shift(temp2);
temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64;
temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64;
- output[1] = fdct_round_shift(temp1);
- output[3] = fdct_round_shift(temp2);
+ output[1] = (tran_low_t)fdct_round_shift(temp1);
+ output[3] = (tran_low_t)fdct_round_shift(temp2);
}
void vp9_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride) {
@@ -98,12 +99,12 @@ void vp9_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride) {
step[3] = input[0] - input[3];
temp1 = (step[0] + step[1]) * cospi_16_64;
temp2 = (step[0] - step[1]) * cospi_16_64;
- out[0] = fdct_round_shift(temp1);
- out[2] = fdct_round_shift(temp2);
+ out[0] = (tran_low_t)fdct_round_shift(temp1);
+ out[2] = (tran_low_t)fdct_round_shift(temp2);
temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64;
temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64;
- out[1] = fdct_round_shift(temp1);
- out[3] = fdct_round_shift(temp2);
+ out[1] = (tran_low_t)fdct_round_shift(temp1);
+ out[3] = (tran_low_t)fdct_round_shift(temp2);
// Do next column (which is a transposed row in second/horizontal pass)
in_pass0++;
in++;
@@ -123,7 +124,7 @@ void vp9_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride) {
}
}
-static void fadst4(const tran_low_t *input, tran_low_t *output) {
+void vp9_fadst4(const tran_low_t *input, tran_low_t *output) {
tran_high_t x0, x1, x2, x3;
tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
@@ -157,26 +158,18 @@ static void fadst4(const tran_low_t *input, tran_low_t *output) {
s3 = x2 - x0 + x3;
// 1-D transform scaling factor is sqrt(2).
- output[0] = fdct_round_shift(s0);
- output[1] = fdct_round_shift(s1);
- output[2] = fdct_round_shift(s2);
- output[3] = fdct_round_shift(s3);
+ output[0] = (tran_low_t)fdct_round_shift(s0);
+ output[1] = (tran_low_t)fdct_round_shift(s1);
+ output[2] = (tran_low_t)fdct_round_shift(s2);
+ output[3] = (tran_low_t)fdct_round_shift(s3);
}
-static const transform_2d FHT_4[] = {
- { fdct4, fdct4 }, // DCT_DCT = 0
- { fadst4, fdct4 }, // ADST_DCT = 1
- { fdct4, fadst4 }, // DCT_ADST = 2
- { fadst4, fadst4 } // ADST_ADST = 3
-};
-
void vp9_fht4x4_c(const int16_t *input, tran_low_t *output,
int stride, int tx_type) {
if (tx_type == DCT_DCT) {
vp9_fdct4x4_c(input, output, stride);
} else {
tran_low_t out[4 * 4];
- tran_low_t *outptr = &out[0];
int i, j;
tran_low_t temp_in[4], temp_out[4];
const transform_2d ht = FHT_4[tx_type];
@@ -189,7 +182,7 @@ void vp9_fht4x4_c(const int16_t *input, tran_low_t *output,
temp_in[0] += 1;
ht.cols(temp_in, temp_out);
for (j = 0; j < 4; ++j)
- outptr[j * 4 + i] = temp_out[j];
+ out[j * 4 + i] = temp_out[j];
}
// Rows
@@ -203,7 +196,7 @@ void vp9_fht4x4_c(const int16_t *input, tran_low_t *output,
}
}
-static void fdct8(const tran_low_t *input, tran_low_t *output) {
+void vp9_fdct8(const tran_low_t *input, tran_low_t *output) {
tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16
tran_high_t t0, t1, t2, t3; // needs32
tran_high_t x0, x1, x2, x3; // canbe16
@@ -227,16 +220,16 @@ static void fdct8(const tran_low_t *input, tran_low_t *output) {
t1 = (x0 - x1) * cospi_16_64;
t2 = x2 * cospi_24_64 + x3 * cospi_8_64;
t3 = -x2 * cospi_8_64 + x3 * cospi_24_64;
- output[0] = fdct_round_shift(t0);
- output[2] = fdct_round_shift(t2);
- output[4] = fdct_round_shift(t1);
- output[6] = fdct_round_shift(t3);
+ output[0] = (tran_low_t)fdct_round_shift(t0);
+ output[2] = (tran_low_t)fdct_round_shift(t2);
+ output[4] = (tran_low_t)fdct_round_shift(t1);
+ output[6] = (tran_low_t)fdct_round_shift(t3);
// Stage 2
t0 = (s6 - s5) * cospi_16_64;
t1 = (s6 + s5) * cospi_16_64;
- t2 = fdct_round_shift(t0);
- t3 = fdct_round_shift(t1);
+ t2 = (tran_low_t)fdct_round_shift(t0);
+ t3 = (tran_low_t)fdct_round_shift(t1);
// Stage 3
x0 = s4 + t2;
@@ -249,10 +242,10 @@ static void fdct8(const tran_low_t *input, tran_low_t *output) {
t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
- output[1] = fdct_round_shift(t0);
- output[3] = fdct_round_shift(t2);
- output[5] = fdct_round_shift(t1);
- output[7] = fdct_round_shift(t3);
+ output[1] = (tran_low_t)fdct_round_shift(t0);
+ output[3] = (tran_low_t)fdct_round_shift(t2);
+ output[5] = (tran_low_t)fdct_round_shift(t1);
+ output[7] = (tran_low_t)fdct_round_shift(t3);
}
void vp9_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride) {
@@ -298,10 +291,10 @@ void vp9_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) {
t1 = (x0 - x1) * cospi_16_64;
t2 = x2 * cospi_24_64 + x3 * cospi_8_64;
t3 = -x2 * cospi_8_64 + x3 * cospi_24_64;
- output[0 * 8] = fdct_round_shift(t0);
- output[2 * 8] = fdct_round_shift(t2);
- output[4 * 8] = fdct_round_shift(t1);
- output[6 * 8] = fdct_round_shift(t3);
+ output[0 * 8] = (tran_low_t)fdct_round_shift(t0);
+ output[2 * 8] = (tran_low_t)fdct_round_shift(t2);
+ output[4 * 8] = (tran_low_t)fdct_round_shift(t1);
+ output[6 * 8] = (tran_low_t)fdct_round_shift(t3);
// Stage 2
t0 = (s6 - s5) * cospi_16_64;
@@ -320,10 +313,10 @@ void vp9_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) {
t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
- output[1 * 8] = fdct_round_shift(t0);
- output[3 * 8] = fdct_round_shift(t2);
- output[5 * 8] = fdct_round_shift(t1);
- output[7 * 8] = fdct_round_shift(t3);
+ output[1 * 8] = (tran_low_t)fdct_round_shift(t0);
+ output[3 * 8] = (tran_low_t)fdct_round_shift(t2);
+ output[5 * 8] = (tran_low_t)fdct_round_shift(t1);
+ output[7 * 8] = (tran_low_t)fdct_round_shift(t3);
input++;
output++;
}
@@ -331,12 +324,124 @@ void vp9_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) {
// Rows
for (i = 0; i < 8; ++i) {
- fdct8(&intermediate[i * 8], &final_output[i * 8]);
+ vp9_fdct8(&intermediate[i * 8], &final_output[i * 8]);
for (j = 0; j < 8; ++j)
final_output[j + i * 8] /= 2;
}
}
+void vp9_fdct8x8_quant_c(const int16_t *input, int stride,
+ tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ int skip_block,
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr,
+ uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ int eob = -1;
+
+ int i, j;
+ tran_low_t intermediate[64];
+
+ // Transform columns
+ {
+ tran_low_t *output = intermediate;
+ tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16
+ tran_high_t t0, t1, t2, t3; // needs32
+ tran_high_t x0, x1, x2, x3; // canbe16
+
+ int i;
+ for (i = 0; i < 8; i++) {
+ // stage 1
+ s0 = (input[0 * stride] + input[7 * stride]) * 4;
+ s1 = (input[1 * stride] + input[6 * stride]) * 4;
+ s2 = (input[2 * stride] + input[5 * stride]) * 4;
+ s3 = (input[3 * stride] + input[4 * stride]) * 4;
+ s4 = (input[3 * stride] - input[4 * stride]) * 4;
+ s5 = (input[2 * stride] - input[5 * stride]) * 4;
+ s6 = (input[1 * stride] - input[6 * stride]) * 4;
+ s7 = (input[0 * stride] - input[7 * stride]) * 4;
+
+ // fdct4(step, step);
+ x0 = s0 + s3;
+ x1 = s1 + s2;
+ x2 = s1 - s2;
+ x3 = s0 - s3;
+ t0 = (x0 + x1) * cospi_16_64;
+ t1 = (x0 - x1) * cospi_16_64;
+ t2 = x2 * cospi_24_64 + x3 * cospi_8_64;
+ t3 = -x2 * cospi_8_64 + x3 * cospi_24_64;
+ output[0 * 8] = (tran_low_t)fdct_round_shift(t0);
+ output[2 * 8] = (tran_low_t)fdct_round_shift(t2);
+ output[4 * 8] = (tran_low_t)fdct_round_shift(t1);
+ output[6 * 8] = (tran_low_t)fdct_round_shift(t3);
+
+ // Stage 2
+ t0 = (s6 - s5) * cospi_16_64;
+ t1 = (s6 + s5) * cospi_16_64;
+ t2 = fdct_round_shift(t0);
+ t3 = fdct_round_shift(t1);
+
+ // Stage 3
+ x0 = s4 + t2;
+ x1 = s4 - t2;
+ x2 = s7 - t3;
+ x3 = s7 + t3;
+
+ // Stage 4
+ t0 = x0 * cospi_28_64 + x3 * cospi_4_64;
+ t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
+ t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
+ t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
+ output[1 * 8] = (tran_low_t)fdct_round_shift(t0);
+ output[3 * 8] = (tran_low_t)fdct_round_shift(t2);
+ output[5 * 8] = (tran_low_t)fdct_round_shift(t1);
+ output[7 * 8] = (tran_low_t)fdct_round_shift(t3);
+ input++;
+ output++;
+ }
+ }
+
+ // Rows
+ for (i = 0; i < 8; ++i) {
+ vp9_fdct8(&intermediate[i * 8], &coeff_ptr[i * 8]);
+ for (j = 0; j < 8; ++j)
+ coeff_ptr[j + i * 8] /= 2;
+ }
+
+  // TODO(jingning) Decide whether these arguments are still needed once the
+  // quantization process is completed.
+ (void)zbin_ptr;
+ (void)quant_shift_ptr;
+ (void)iscan;
+
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+ if (!skip_block) {
+    // Quantization pass: quantize every coefficient in scan order and track
+    // the index of the last nonzero quantized coefficient in eob.
+ for (i = 0; i < n_coeffs; i++) {
+ const int rc = scan[i];
+ const int coeff = coeff_ptr[rc];
+ const int coeff_sign = (coeff >> 31);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+
+ int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
+ tmp = (tmp * quant_ptr[rc != 0]) >> 16;
+
+ qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
+ dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
+
+ if (tmp)
+ eob = i;
+ }
+ }
+ *eob_ptr = eob + 1;
+}
+
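// Worked example of the quantization step above for a single AC coefficient
// (values are illustrative, not taken from a real quant table): with
// coeff = -73, round_ptr[1] = 12, quant_ptr[1] = 2731 (~65536 / 24) and
// dequant_ptr[1] = 24, we get abs_coeff = 73, tmp = 73 + 12 = 85, then
// (85 * 2731) >> 16 = 3, so qcoeff = -3 and dqcoeff = -3 * 24 = -72,
// which reconstructs close to the original -73.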
void vp9_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride) {
int r, c;
tran_low_t sum = 0;
@@ -434,10 +539,10 @@ void vp9_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride) {
t1 = (x0 - x1) * cospi_16_64;
t2 = x3 * cospi_8_64 + x2 * cospi_24_64;
t3 = x3 * cospi_24_64 - x2 * cospi_8_64;
- out[0] = fdct_round_shift(t0);
- out[4] = fdct_round_shift(t2);
- out[8] = fdct_round_shift(t1);
- out[12] = fdct_round_shift(t3);
+ out[0] = (tran_low_t)fdct_round_shift(t0);
+ out[4] = (tran_low_t)fdct_round_shift(t2);
+ out[8] = (tran_low_t)fdct_round_shift(t1);
+ out[12] = (tran_low_t)fdct_round_shift(t3);
// Stage 2
t0 = (s6 - s5) * cospi_16_64;
@@ -456,10 +561,10 @@ void vp9_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride) {
t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
- out[2] = fdct_round_shift(t0);
- out[6] = fdct_round_shift(t2);
- out[10] = fdct_round_shift(t1);
- out[14] = fdct_round_shift(t3);
+ out[2] = (tran_low_t)fdct_round_shift(t0);
+ out[6] = (tran_low_t)fdct_round_shift(t2);
+ out[10] = (tran_low_t)fdct_round_shift(t1);
+ out[14] = (tran_low_t)fdct_round_shift(t3);
}
// Work on the next eight values; step1 -> odd_results
{
@@ -502,20 +607,20 @@ void vp9_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride) {
// step 6
temp1 = step1[0] * cospi_30_64 + step1[7] * cospi_2_64;
temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64;
- out[1] = fdct_round_shift(temp1);
- out[9] = fdct_round_shift(temp2);
+ out[1] = (tran_low_t)fdct_round_shift(temp1);
+ out[9] = (tran_low_t)fdct_round_shift(temp2);
temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64;
temp2 = step1[3] * cospi_6_64 + step1[4] * cospi_26_64;
- out[5] = fdct_round_shift(temp1);
- out[13] = fdct_round_shift(temp2);
+ out[5] = (tran_low_t)fdct_round_shift(temp1);
+ out[13] = (tran_low_t)fdct_round_shift(temp2);
temp1 = step1[3] * -cospi_26_64 + step1[4] * cospi_6_64;
temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64;
- out[3] = fdct_round_shift(temp1);
- out[11] = fdct_round_shift(temp2);
+ out[3] = (tran_low_t)fdct_round_shift(temp1);
+ out[11] = (tran_low_t)fdct_round_shift(temp2);
temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64;
temp2 = step1[0] * -cospi_2_64 + step1[7] * cospi_30_64;
- out[7] = fdct_round_shift(temp1);
- out[15] = fdct_round_shift(temp2);
+ out[7] = (tran_low_t)fdct_round_shift(temp1);
+ out[15] = (tran_low_t)fdct_round_shift(temp2);
}
// Do next column (which is a transposed row in second/horizontal pass)
in++;
@@ -528,7 +633,7 @@ void vp9_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride) {
}
}
-static void fadst8(const tran_low_t *input, tran_low_t *output) {
+void vp9_fadst8(const tran_low_t *input, tran_low_t *output) {
tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
tran_high_t x0 = input[7];
@@ -589,30 +694,22 @@ static void fadst8(const tran_low_t *input, tran_low_t *output) {
x6 = fdct_round_shift(s6);
x7 = fdct_round_shift(s7);
- output[0] = x0;
- output[1] = - x4;
- output[2] = x6;
- output[3] = - x2;
- output[4] = x3;
- output[5] = - x7;
- output[6] = x5;
- output[7] = - x1;
+ output[0] = (tran_low_t)x0;
+ output[1] = (tran_low_t)-x4;
+ output[2] = (tran_low_t)x6;
+ output[3] = (tran_low_t)-x2;
+ output[4] = (tran_low_t)x3;
+ output[5] = (tran_low_t)-x7;
+ output[6] = (tran_low_t)x5;
+ output[7] = (tran_low_t)-x1;
}
-static const transform_2d FHT_8[] = {
- { fdct8, fdct8 }, // DCT_DCT = 0
- { fadst8, fdct8 }, // ADST_DCT = 1
- { fdct8, fadst8 }, // DCT_ADST = 2
- { fadst8, fadst8 } // ADST_ADST = 3
-};
-
void vp9_fht8x8_c(const int16_t *input, tran_low_t *output,
int stride, int tx_type) {
if (tx_type == DCT_DCT) {
vp9_fdct8x8_c(input, output, stride);
} else {
tran_low_t out[64];
- tran_low_t *outptr = &out[0];
int i, j;
tran_low_t temp_in[8], temp_out[8];
const transform_2d ht = FHT_8[tx_type];
@@ -623,7 +720,7 @@ void vp9_fht8x8_c(const int16_t *input, tran_low_t *output,
temp_in[j] = input[j * stride + i] * 4;
ht.cols(temp_in, temp_out);
for (j = 0; j < 8; ++j)
- outptr[j * 8 + i] = temp_out[j];
+ out[j * 8 + i] = temp_out[j];
}
// Rows
@@ -659,10 +756,10 @@ void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride) {
c1 = e1 - c1;
a1 -= c1;
d1 += b1;
- op[0] = a1;
- op[4] = c1;
- op[8] = d1;
- op[12] = b1;
+ op[0] = (tran_low_t)a1;
+ op[4] = (tran_low_t)c1;
+ op[8] = (tran_low_t)d1;
+ op[12] = (tran_low_t)b1;
ip_pass0++;
op++;
@@ -683,10 +780,10 @@ void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride) {
c1 = e1 - c1;
a1 -= c1;
d1 += b1;
- op[0] = a1 * UNIT_QUANT_FACTOR;
- op[1] = c1 * UNIT_QUANT_FACTOR;
- op[2] = d1 * UNIT_QUANT_FACTOR;
- op[3] = b1 * UNIT_QUANT_FACTOR;
+ op[0] = (tran_low_t)(a1 * UNIT_QUANT_FACTOR);
+ op[1] = (tran_low_t)(c1 * UNIT_QUANT_FACTOR);
+ op[2] = (tran_low_t)(d1 * UNIT_QUANT_FACTOR);
+ op[3] = (tran_low_t)(b1 * UNIT_QUANT_FACTOR);
ip += 4;
op += 4;
@@ -694,7 +791,7 @@ void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride) {
}
// Rewrote to use same algorithm as others.
-static void fdct16(const tran_low_t in[16], tran_low_t out[16]) {
+void vp9_fdct16(const tran_low_t in[16], tran_low_t out[16]) {
tran_high_t step1[8]; // canbe16
tran_high_t step2[8]; // canbe16
tran_high_t step3[8]; // canbe16
@@ -745,10 +842,10 @@ static void fdct16(const tran_low_t in[16], tran_low_t out[16]) {
t1 = (x0 - x1) * cospi_16_64;
t2 = x3 * cospi_8_64 + x2 * cospi_24_64;
t3 = x3 * cospi_24_64 - x2 * cospi_8_64;
- out[0] = fdct_round_shift(t0);
- out[4] = fdct_round_shift(t2);
- out[8] = fdct_round_shift(t1);
- out[12] = fdct_round_shift(t3);
+ out[0] = (tran_low_t)fdct_round_shift(t0);
+ out[4] = (tran_low_t)fdct_round_shift(t2);
+ out[8] = (tran_low_t)fdct_round_shift(t1);
+ out[12] = (tran_low_t)fdct_round_shift(t3);
// Stage 2
t0 = (s6 - s5) * cospi_16_64;
@@ -767,10 +864,10 @@ static void fdct16(const tran_low_t in[16], tran_low_t out[16]) {
t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
- out[2] = fdct_round_shift(t0);
- out[6] = fdct_round_shift(t2);
- out[10] = fdct_round_shift(t1);
- out[14] = fdct_round_shift(t3);
+ out[2] = (tran_low_t)fdct_round_shift(t0);
+ out[6] = (tran_low_t)fdct_round_shift(t2);
+ out[10] = (tran_low_t)fdct_round_shift(t1);
+ out[14] = (tran_low_t)fdct_round_shift(t3);
}
// step 2
@@ -816,26 +913,26 @@ static void fdct16(const tran_low_t in[16], tran_low_t out[16]) {
// step 6
temp1 = step1[0] * cospi_30_64 + step1[7] * cospi_2_64;
temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64;
- out[1] = fdct_round_shift(temp1);
- out[9] = fdct_round_shift(temp2);
+ out[1] = (tran_low_t)fdct_round_shift(temp1);
+ out[9] = (tran_low_t)fdct_round_shift(temp2);
temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64;
temp2 = step1[3] * cospi_6_64 + step1[4] * cospi_26_64;
- out[5] = fdct_round_shift(temp1);
- out[13] = fdct_round_shift(temp2);
+ out[5] = (tran_low_t)fdct_round_shift(temp1);
+ out[13] = (tran_low_t)fdct_round_shift(temp2);
temp1 = step1[3] * -cospi_26_64 + step1[4] * cospi_6_64;
temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64;
- out[3] = fdct_round_shift(temp1);
- out[11] = fdct_round_shift(temp2);
+ out[3] = (tran_low_t)fdct_round_shift(temp1);
+ out[11] = (tran_low_t)fdct_round_shift(temp2);
temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64;
temp2 = step1[0] * -cospi_2_64 + step1[7] * cospi_30_64;
- out[7] = fdct_round_shift(temp1);
- out[15] = fdct_round_shift(temp2);
+ out[7] = (tran_low_t)fdct_round_shift(temp1);
+ out[15] = (tran_low_t)fdct_round_shift(temp2);
}
-static void fadst16(const tran_low_t *input, tran_low_t *output) {
+void vp9_fadst16(const tran_low_t *input, tran_low_t *output) {
tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
tran_high_t s9, s10, s11, s12, s13, s14, s15;
@@ -980,38 +1077,30 @@ static void fadst16(const tran_low_t *input, tran_low_t *output) {
x14 = fdct_round_shift(s14);
x15 = fdct_round_shift(s15);
- output[0] = x0;
- output[1] = - x8;
- output[2] = x12;
- output[3] = - x4;
- output[4] = x6;
- output[5] = x14;
- output[6] = x10;
- output[7] = x2;
- output[8] = x3;
- output[9] = x11;
- output[10] = x15;
- output[11] = x7;
- output[12] = x5;
- output[13] = - x13;
- output[14] = x9;
- output[15] = - x1;
+ output[0] = (tran_low_t)x0;
+ output[1] = (tran_low_t)-x8;
+ output[2] = (tran_low_t)x12;
+ output[3] = (tran_low_t)-x4;
+ output[4] = (tran_low_t)x6;
+ output[5] = (tran_low_t)x14;
+ output[6] = (tran_low_t)x10;
+ output[7] = (tran_low_t)x2;
+ output[8] = (tran_low_t)x3;
+ output[9] = (tran_low_t)x11;
+ output[10] = (tran_low_t)x15;
+ output[11] = (tran_low_t)x7;
+ output[12] = (tran_low_t)x5;
+ output[13] = (tran_low_t)-x13;
+ output[14] = (tran_low_t)x9;
+ output[15] = (tran_low_t)-x1;
}
-static const transform_2d FHT_16[] = {
- { fdct16, fdct16 }, // DCT_DCT = 0
- { fadst16, fdct16 }, // ADST_DCT = 1
- { fdct16, fadst16 }, // DCT_ADST = 2
- { fadst16, fadst16 } // ADST_ADST = 3
-};
-
void vp9_fht16x16_c(const int16_t *input, tran_low_t *output,
int stride, int tx_type) {
if (tx_type == DCT_DCT) {
vp9_fdct16x16_c(input, output, stride);
} else {
tran_low_t out[256];
- tran_low_t *outptr = &out[0];
int i, j;
tran_low_t temp_in[16], temp_out[16];
const transform_2d ht = FHT_16[tx_type];
@@ -1022,7 +1111,7 @@ void vp9_fht16x16_c(const int16_t *input, tran_low_t *output,
temp_in[j] = input[j * stride + i] * 4;
ht.cols(temp_in, temp_out);
for (j = 0; j < 16; ++j)
- outptr[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
+ out[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
}
// Rows
@@ -1049,7 +1138,7 @@ static INLINE tran_high_t half_round_shift(tran_high_t input) {
return rv;
}
-static void fdct32(const tran_high_t *input, tran_high_t *output, int round) {
+void vp9_fdct32(const tran_high_t *input, tran_high_t *output, int round) {
tran_high_t step[32];
// Stage 1
step[0] = input[0] + input[(32 - 1)];
@@ -1392,7 +1481,7 @@ void vp9_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) {
tran_high_t temp_in[32], temp_out[32];
for (j = 0; j < 32; ++j)
temp_in[j] = input[j * stride + i] * 4;
- fdct32(temp_in, temp_out, 0);
+ vp9_fdct32(temp_in, temp_out, 0);
for (j = 0; j < 32; ++j)
output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
}
@@ -1402,9 +1491,10 @@ void vp9_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) {
tran_high_t temp_in[32], temp_out[32];
for (j = 0; j < 32; ++j)
temp_in[j] = output[j + i * 32];
- fdct32(temp_in, temp_out, 0);
+ vp9_fdct32(temp_in, temp_out, 0);
for (j = 0; j < 32; ++j)
- out[j + i * 32] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
+ out[j + i * 32] =
+ (tran_low_t)((temp_out[j] + 1 + (temp_out[j] < 0)) >> 2);
}
}
@@ -1420,7 +1510,7 @@ void vp9_fdct32x32_rd_c(const int16_t *input, tran_low_t *out, int stride) {
tran_high_t temp_in[32], temp_out[32];
for (j = 0; j < 32; ++j)
temp_in[j] = input[j * stride + i] * 4;
- fdct32(temp_in, temp_out, 0);
+ vp9_fdct32(temp_in, temp_out, 0);
for (j = 0; j < 32; ++j)
// TODO(cd): see quality impact of only doing
// output[j * 32 + i] = (temp_out[j] + 1) >> 2;
@@ -1433,9 +1523,9 @@ void vp9_fdct32x32_rd_c(const int16_t *input, tran_low_t *out, int stride) {
tran_high_t temp_in[32], temp_out[32];
for (j = 0; j < 32; ++j)
temp_in[j] = output[j + i * 32];
- fdct32(temp_in, temp_out, 1);
+ vp9_fdct32(temp_in, temp_out, 1);
for (j = 0; j < 32; ++j)
- out[j + i * 32] = temp_out[j];
+ out[j + i * 32] = (tran_low_t)temp_out[j];
}
}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_dct.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_dct.h
new file mode 100644
index 00000000000..49afcbbd5b8
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_dct.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_ENCODER_VP9_DCT_H_
+#define VP9_ENCODER_VP9_DCT_H_
+
+#include "vp9/common/vp9_idct.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vp9_highbd_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride);
+void vp9_highbd_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride);
+void vp9_highbd_fdct16x16_c(const int16_t *input, tran_low_t *output,
+ int stride);
+void vp9_highbd_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride);
+void vp9_highbd_fdct32x32_rd_c(const int16_t *input, tran_low_t *out,
+ int stride);
+
+void vp9_fdct4(const tran_low_t *input, tran_low_t *output);
+void vp9_fadst4(const tran_low_t *input, tran_low_t *output);
+void vp9_fdct8(const tran_low_t *input, tran_low_t *output);
+void vp9_fadst8(const tran_low_t *input, tran_low_t *output);
+void vp9_fdct16(const tran_low_t in[16], tran_low_t out[16]);
+void vp9_fadst16(const tran_low_t *input, tran_low_t *output);
+void vp9_fdct32(const tran_high_t *input, tran_high_t *output, int round);
+
+static const transform_2d FHT_4[] = {
+ { vp9_fdct4, vp9_fdct4 }, // DCT_DCT = 0
+ { vp9_fadst4, vp9_fdct4 }, // ADST_DCT = 1
+ { vp9_fdct4, vp9_fadst4 }, // DCT_ADST = 2
+ { vp9_fadst4, vp9_fadst4 } // ADST_ADST = 3
+};
+
+static const transform_2d FHT_8[] = {
+ { vp9_fdct8, vp9_fdct8 }, // DCT_DCT = 0
+ { vp9_fadst8, vp9_fdct8 }, // ADST_DCT = 1
+ { vp9_fdct8, vp9_fadst8 }, // DCT_ADST = 2
+ { vp9_fadst8, vp9_fadst8 } // ADST_ADST = 3
+};
+
+static const transform_2d FHT_16[] = {
+ { vp9_fdct16, vp9_fdct16 }, // DCT_DCT = 0
+ { vp9_fadst16, vp9_fdct16 }, // ADST_DCT = 1
+ { vp9_fdct16, vp9_fadst16 }, // DCT_ADST = 2
+ { vp9_fadst16, vp9_fadst16 } // ADST_ADST = 3
+};
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VP9_ENCODER_VP9_DCT_H_
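A brief usage sketch for the now-exported hybrid-transform tables, following the column/row pattern visible in vp9_fht4x4_c; the buffers here are hypothetical:

    const transform_2d ht = FHT_4[ADST_DCT];  // vp9_fadst4 cols, vp9_fdct4 rows
    tran_low_t temp_in[4], temp_out[4];
    /* fill temp_in with one scaled input column, then: */
    ht.cols(temp_in, temp_out);  // columns first; rows via ht.rows afterwards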
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_denoiser.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_denoiser.c
index 4deeed2170c..08134e152aa 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_denoiser.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_denoiser.c
@@ -45,34 +45,29 @@ static int delta_thresh(BLOCK_SIZE bs, int increase_denoising) {
static int noise_motion_thresh(BLOCK_SIZE bs, int increase_denoising) {
(void)bs;
(void)increase_denoising;
- return 25 * 25;
+ return 625;
}
static unsigned int sse_thresh(BLOCK_SIZE bs, int increase_denoising) {
- return (4 << b_width_log2_lookup[bs]) *
- (4 << b_height_log2_lookup[bs]) *
- (increase_denoising ? 60 : 40);
+ return (1 << num_pels_log2_lookup[bs]) * (increase_denoising ? 60 : 40);
}
static int sse_diff_thresh(BLOCK_SIZE bs, int increase_denoising,
- int mv_row, int mv_col) {
- if (mv_row * mv_row + mv_col * mv_col >
+ int motion_magnitude) {
+ if (motion_magnitude >
noise_motion_thresh(bs, increase_denoising)) {
return 0;
} else {
- return (4 << b_width_log2_lookup[bs]) *
- (4 << b_height_log2_lookup[bs]) * 20;
+ return (1 << num_pels_log2_lookup[bs]) * 20;
}
}
int total_adj_strong_thresh(BLOCK_SIZE bs, int increase_denoising) {
- return (4 << b_width_log2_lookup[bs]) *
- (4 << b_height_log2_lookup[bs]) * (increase_denoising ? 3 : 2);
+ return (1 << num_pels_log2_lookup[bs]) * (increase_denoising ? 3 : 2);
}
static int total_adj_weak_thresh(BLOCK_SIZE bs, int increase_denoising) {
- return (4 << b_width_log2_lookup[bs]) *
- (4 << b_height_log2_lookup[bs]) * (increase_denoising ? 3 : 2);
+ return (1 << num_pels_log2_lookup[bs]) * (increase_denoising ? 3 : 2);
}
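// Note on the rewritten thresholds: (4 << b_width_log2_lookup[bs]) *
// (4 << b_height_log2_lookup[bs]) is 16 << (bw_log2 + bh_log2), i.e. the
// pixel count of the block, which is exactly 1 << num_pels_log2_lookup[bs]
// (for BLOCK_8X8 both forms give 64), so these rewrites are assumed to be
// pure simplifications with no behavioral change.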
// TODO(jackychen): If increase_denoising is enabled in the future,
@@ -195,16 +190,6 @@ static uint8_t *block_start(uint8_t *framebuf, int stride,
return framebuf + (stride * mi_row * 8) + (mi_col * 8);
}
-static void copy_block(uint8_t *dest, int dest_stride,
- const uint8_t *src, int src_stride, BLOCK_SIZE bs) {
- int r;
- for (r = 0; r < (4 << b_height_log2_lookup[bs]); ++r) {
- vpx_memcpy(dest, src, (4 << b_width_log2_lookup[bs]));
- dest += dest_stride;
- src += src_stride;
- }
-}
-
static VP9_DENOISER_DECISION perform_motion_compensation(VP9_DENOISER *denoiser,
MACROBLOCK *mb,
BLOCK_SIZE bs,
@@ -218,33 +203,23 @@ static VP9_DENOISER_DECISION perform_motion_compensation(VP9_DENOISER *denoiser,
int sse_diff = ctx->zeromv_sse - ctx->newmv_sse;
MV_REFERENCE_FRAME frame;
MACROBLOCKD *filter_mbd = &mb->e_mbd;
- MB_MODE_INFO *mbmi = &filter_mbd->mi[0].src_mi->mbmi;
-
+ MB_MODE_INFO *mbmi = &filter_mbd->mi[0]->mbmi;
MB_MODE_INFO saved_mbmi;
int i, j;
struct buf_2d saved_dst[MAX_MB_PLANE];
struct buf_2d saved_pre[MAX_MB_PLANE][2]; // 2 pre buffers
- // We will restore these after motion compensation.
- saved_mbmi = *mbmi;
- for (i = 0; i < MAX_MB_PLANE; ++i) {
- for (j = 0; j < 2; ++j) {
- saved_pre[i][j] = filter_mbd->plane[i].pre[j];
- }
- saved_dst[i] = filter_mbd->plane[i].dst;
- }
-
mv_col = ctx->best_sse_mv.as_mv.col;
mv_row = ctx->best_sse_mv.as_mv.row;
-
*motion_magnitude = mv_row * mv_row + mv_col * mv_col;
-
frame = ctx->best_reference_frame;
+ saved_mbmi = *mbmi;
+
// If the best reference frame uses inter-prediction and there is enough of a
// difference in sum-squared-error, use it.
if (frame != INTRA_FRAME &&
- sse_diff > sse_diff_thresh(bs, increase_denoising, mv_row, mv_col)) {
+ sse_diff > sse_diff_thresh(bs, increase_denoising, *motion_magnitude)) {
mbmi->ref_frame[0] = ctx->best_reference_frame;
mbmi->mode = ctx->best_sse_inter_mode;
mbmi->mv[0] = ctx->best_sse_mv;
@@ -261,6 +236,26 @@ static VP9_DENOISER_DECISION perform_motion_compensation(VP9_DENOISER *denoiser,
ctx->newmv_sse = ctx->zeromv_sse;
}
+ if (ctx->newmv_sse > sse_thresh(bs, increase_denoising)) {
+ // Restore everything to its original state
+ *mbmi = saved_mbmi;
+ return COPY_BLOCK;
+ }
+ if (*motion_magnitude >
+ (noise_motion_thresh(bs, increase_denoising) << 3)) {
+ // Restore everything to its original state
+ *mbmi = saved_mbmi;
+ return COPY_BLOCK;
+ }
+
+ // We will restore these after motion compensation.
+ for (i = 0; i < MAX_MB_PLANE; ++i) {
+ for (j = 0; j < 2; ++j) {
+ saved_pre[i][j] = filter_mbd->plane[i].pre[j];
+ }
+ saved_dst[i] = filter_mbd->plane[i].dst;
+ }
+
// Set the pointers in the MACROBLOCKD to point to the buffers in the denoiser
// struct.
for (j = 0; j < 2; ++j) {
@@ -313,13 +308,6 @@ static VP9_DENOISER_DECISION perform_motion_compensation(VP9_DENOISER *denoiser,
mv_row = ctx->best_sse_mv.as_mv.row;
mv_col = ctx->best_sse_mv.as_mv.col;
- if (ctx->newmv_sse > sse_thresh(bs, increase_denoising)) {
- return COPY_BLOCK;
- }
- if (mv_row * mv_row + mv_col * mv_col >
- 8 * noise_motion_thresh(bs, increase_denoising)) {
- return COPY_BLOCK;
- }
return FILTER_BLOCK;
}
@@ -348,9 +336,15 @@ void vp9_denoiser_denoise(VP9_DENOISER *denoiser, MACROBLOCK *mb,
}
if (decision == FILTER_BLOCK) {
- copy_block(src.buf, src.stride, avg_start, avg.y_stride, bs);
+ vp9_convolve_copy(avg_start, avg.y_stride, src.buf, src.stride,
+ NULL, 0, NULL, 0,
+ num_4x4_blocks_wide_lookup[bs] << 2,
+ num_4x4_blocks_high_lookup[bs] << 2);
} else { // COPY_BLOCK
- copy_block(avg_start, avg.y_stride, src.buf, src.stride, bs);
+ vp9_convolve_copy(src.buf, src.stride, avg_start, avg.y_stride,
+ NULL, 0, NULL, 0,
+ num_4x4_blocks_wide_lookup[bs] << 2,
+ num_4x4_blocks_high_lookup[bs] << 2);
}
}
@@ -358,16 +352,26 @@ static void copy_frame(YV12_BUFFER_CONFIG dest, const YV12_BUFFER_CONFIG src) {
int r;
const uint8_t *srcbuf = src.y_buffer;
uint8_t *destbuf = dest.y_buffer;
+
assert(dest.y_width == src.y_width);
assert(dest.y_height == src.y_height);
for (r = 0; r < dest.y_height; ++r) {
- vpx_memcpy(destbuf, srcbuf, dest.y_width);
+ memcpy(destbuf, srcbuf, dest.y_width);
destbuf += dest.y_stride;
srcbuf += src.y_stride;
}
}
+static void swap_frame_buffer(YV12_BUFFER_CONFIG *dest,
+ YV12_BUFFER_CONFIG *src) {
+ uint8_t *tmp_buf = dest->y_buffer;
+ assert(dest->y_width == src->y_width);
+ assert(dest->y_height == src->y_height);
+ dest->y_buffer = src->y_buffer;
+ src->y_buffer = tmp_buf;
+}
+
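// The pointer swap above replaces copy_frame()'s full luma-plane memcpy with
// an O(1) buffer exchange on reference refresh; this is assumed safe because
// the INTRA_FRAME running average is rewritten for every denoised frame, so
// its previous contents are no longer needed after the swap.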
void vp9_denoiser_update_frame_info(VP9_DENOISER *denoiser,
YV12_BUFFER_CONFIG src,
FRAME_TYPE frame_type,
@@ -377,22 +381,23 @@ void vp9_denoiser_update_frame_info(VP9_DENOISER *denoiser,
if (frame_type == KEY_FRAME) {
int i;
// Start at 1 so as not to overwrite the INTRA_FRAME
- for (i = 1; i < MAX_REF_FRAMES; ++i) {
+ for (i = 1; i < MAX_REF_FRAMES; ++i)
copy_frame(denoiser->running_avg_y[i], src);
- }
- } else { /* For non key frames */
- if (refresh_alt_ref_frame) {
- copy_frame(denoiser->running_avg_y[ALTREF_FRAME],
- denoiser->running_avg_y[INTRA_FRAME]);
- }
- if (refresh_golden_frame) {
- copy_frame(denoiser->running_avg_y[GOLDEN_FRAME],
- denoiser->running_avg_y[INTRA_FRAME]);
- }
- if (refresh_last_frame) {
- copy_frame(denoiser->running_avg_y[LAST_FRAME],
- denoiser->running_avg_y[INTRA_FRAME]);
- }
+ return;
+ }
+
+ /* For non key frames */
+ if (refresh_alt_ref_frame) {
+ swap_frame_buffer(&denoiser->running_avg_y[ALTREF_FRAME],
+ &denoiser->running_avg_y[INTRA_FRAME]);
+ }
+ if (refresh_golden_frame) {
+ swap_frame_buffer(&denoiser->running_avg_y[GOLDEN_FRAME],
+ &denoiser->running_avg_y[INTRA_FRAME]);
+ }
+ if (refresh_last_frame) {
+ swap_frame_buffer(&denoiser->running_avg_y[LAST_FRAME],
+ &denoiser->running_avg_y[INTRA_FRAME]);
}
}
@@ -410,7 +415,7 @@ void vp9_denoiser_update_frame_stats(MB_MODE_INFO *mbmi, unsigned int sse,
ctx->best_zeromv_reference_frame = mbmi->ref_frame[0];
}
- if (mode == NEWMV) {
+ if (mbmi->mv[0].as_int != 0 && sse < ctx->newmv_sse) {
ctx->newmv_sse = sse;
ctx->best_sse_inter_mode = mode;
ctx->best_sse_mv = mbmi->mv[0];
@@ -425,6 +430,7 @@ int vp9_denoiser_alloc(VP9_DENOISER *denoiser, int width, int height,
#endif
int border) {
int i, fail;
+ const int legacy_byte_alignment = 0;
assert(denoiser != NULL);
for (i = 0; i < MAX_REF_FRAMES; ++i) {
@@ -433,7 +439,7 @@ int vp9_denoiser_alloc(VP9_DENOISER *denoiser, int width, int height,
#if CONFIG_VP9_HIGHBITDEPTH
use_highbitdepth,
#endif
- border);
+ border, legacy_byte_alignment);
if (fail) {
vp9_denoiser_free(denoiser);
return 1;
@@ -448,7 +454,7 @@ int vp9_denoiser_alloc(VP9_DENOISER *denoiser, int width, int height,
#if CONFIG_VP9_HIGHBITDEPTH
use_highbitdepth,
#endif
- border);
+ border, legacy_byte_alignment);
if (fail) {
vp9_denoiser_free(denoiser);
return 1;
@@ -457,23 +463,21 @@ int vp9_denoiser_alloc(VP9_DENOISER *denoiser, int width, int height,
make_grayscale(&denoiser->running_avg_y[i]);
#endif
denoiser->increase_denoising = 0;
+ denoiser->frame_buffer_initialized = 1;
return 0;
}
void vp9_denoiser_free(VP9_DENOISER *denoiser) {
int i;
+ denoiser->frame_buffer_initialized = 0;
if (denoiser == NULL) {
return;
}
for (i = 0; i < MAX_REF_FRAMES; ++i) {
- if (&denoiser->running_avg_y[i] != NULL) {
- vp9_free_frame_buffer(&denoiser->running_avg_y[i]);
- }
- }
- if (&denoiser->mc_running_avg_y != NULL) {
- vp9_free_frame_buffer(&denoiser->mc_running_avg_y);
+ vp9_free_frame_buffer(&denoiser->running_avg_y[i]);
}
+ vp9_free_frame_buffer(&denoiser->mc_running_avg_y);
}
#ifdef OUTPUT_YUV_DENOISED
@@ -482,15 +486,13 @@ static void make_grayscale(YV12_BUFFER_CONFIG *yuv) {
uint8_t *u = yuv->u_buffer;
uint8_t *v = yuv->v_buffer;
- // The '/2's are there because we have a 440 buffer, but we want to output
- // 420.
- for (r = 0; r < yuv->uv_height / 2; ++r) {
- for (c = 0; c < yuv->uv_width / 2; ++c) {
+ for (r = 0; r < yuv->uv_height; ++r) {
+ for (c = 0; c < yuv->uv_width; ++c) {
u[c] = UINT8_MAX / 2;
v[c] = UINT8_MAX / 2;
}
- u += yuv->uv_stride + yuv->uv_width / 2;
- v += yuv->uv_stride + yuv->uv_width / 2;
+ u += yuv->uv_stride;
+ v += yuv->uv_stride;
}
}
#endif
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_denoiser.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_denoiser.h
index 421dfcd0cca..8eb5da1b8aa 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_denoiser.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_denoiser.h
@@ -29,6 +29,7 @@ typedef struct vp9_denoiser {
YV12_BUFFER_CONFIG running_avg_y[MAX_REF_FRAMES];
YV12_BUFFER_CONFIG mc_running_avg_y;
int increase_denoising;
+ int frame_buffer_initialized;
} VP9_DENOISER;
void vp9_denoiser_update_frame_info(VP9_DENOISER *denoiser,
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodeframe.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodeframe.c
index 6eff8c501ba..0e74784e9b6 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodeframe.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodeframe.c
@@ -36,6 +36,7 @@
#include "vp9/encoder/vp9_encodeframe.h"
#include "vp9/encoder/vp9_encodemb.h"
#include "vp9/encoder/vp9_encodemv.h"
+#include "vp9/encoder/vp9_ethread.h"
#include "vp9/encoder/vp9_extend.h"
#include "vp9/encoder/vp9_pickmode.h"
#include "vp9/encoder/vp9_rd.h"
@@ -43,19 +44,11 @@
#include "vp9/encoder/vp9_segmentation.h"
#include "vp9/encoder/vp9_tokenize.h"
-#define GF_ZEROMV_ZBIN_BOOST 0
-#define LF_ZEROMV_ZBIN_BOOST 0
-#define MV_ZBIN_BOOST 0
-#define SPLIT_MV_ZBIN_BOOST 0
-#define INTRA_ZBIN_BOOST 0
-
-static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled,
+static void encode_superblock(VP9_COMP *cpi, ThreadData * td,
+ TOKENEXTRA **t, int output_enabled,
int mi_row, int mi_col, BLOCK_SIZE bsize,
PICK_MODE_CONTEXT *ctx);
-// Motion vector component magnitude threshold for defining fast motion.
-#define FAST_MOTION_MV_THRESH 24
-
// This is used as a reference when computing the source variance for the
// purposes of activity masking.
// Eventually this should be replaced by custom no-reference routines,
@@ -106,9 +99,9 @@ static const uint16_t VP9_HIGH_VAR_OFFS_12[64] = {
};
#endif // CONFIG_VP9_HIGHBITDEPTH
-static unsigned int get_sby_perpixel_variance(VP9_COMP *cpi,
- const struct buf_2d *ref,
- BLOCK_SIZE bs) {
+unsigned int vp9_get_sby_perpixel_variance(VP9_COMP *cpi,
+ const struct buf_2d *ref,
+ BLOCK_SIZE bs) {
unsigned int sse;
const unsigned int var = cpi->fn_ptr[bs].vf(ref->buf, ref->stride,
VP9_VAR_OFFS, 0, &sse);
@@ -116,7 +109,7 @@ static unsigned int get_sby_perpixel_variance(VP9_COMP *cpi,
}
#if CONFIG_VP9_HIGHBITDEPTH
-static unsigned int high_get_sby_perpixel_variance(
+unsigned int vp9_high_get_sby_perpixel_variance(
VP9_COMP *cpi, const struct buf_2d *ref, BLOCK_SIZE bs, int bd) {
unsigned int var, sse;
switch (bd) {
@@ -145,19 +138,21 @@ static unsigned int get_sby_perpixel_diff_variance(VP9_COMP *cpi,
const struct buf_2d *ref,
int mi_row, int mi_col,
BLOCK_SIZE bs) {
+ unsigned int sse, var;
+ uint8_t *last_y;
const YV12_BUFFER_CONFIG *last = get_ref_frame_buffer(cpi, LAST_FRAME);
- const uint8_t* last_y = &last->y_buffer[mi_row * MI_SIZE * last->y_stride +
- mi_col * MI_SIZE];
- unsigned int sse;
- const unsigned int var = cpi->fn_ptr[bs].vf(ref->buf, ref->stride,
- last_y, last->y_stride, &sse);
+
+ assert(last != NULL);
+ last_y =
+ &last->y_buffer[mi_row * MI_SIZE * last->y_stride + mi_col * MI_SIZE];
+ var = cpi->fn_ptr[bs].vf(ref->buf, ref->stride, last_y, last->y_stride, &sse);
return ROUND_POWER_OF_TWO(var, num_pels_log2_lookup[bs]);
}
-static BLOCK_SIZE get_rd_var_based_fixed_partition(VP9_COMP *cpi,
+static BLOCK_SIZE get_rd_var_based_fixed_partition(VP9_COMP *cpi, MACROBLOCK *x,
int mi_row,
int mi_col) {
- unsigned int var = get_sby_perpixel_diff_variance(cpi, &cpi->mb.plane[0].src,
+ unsigned int var = get_sby_perpixel_diff_variance(cpi, &x->plane[0].src,
mi_row, mi_col,
BLOCK_64X64);
if (var < 8)
@@ -170,34 +165,20 @@ static BLOCK_SIZE get_rd_var_based_fixed_partition(VP9_COMP *cpi,
return BLOCK_8X8;
}
-static BLOCK_SIZE get_nonrd_var_based_fixed_partition(VP9_COMP *cpi,
- int mi_row,
- int mi_col) {
- unsigned int var = get_sby_perpixel_diff_variance(cpi, &cpi->mb.plane[0].src,
- mi_row, mi_col,
- BLOCK_64X64);
- if (var < 4)
- return BLOCK_64X64;
- else if (var < 10)
- return BLOCK_32X32;
- else
- return BLOCK_16X16;
-}
-
// Lighter version of set_offsets that only sets the mode info
// pointers.
-static INLINE void set_modeinfo_offsets(VP9_COMMON *const cm,
- MACROBLOCKD *const xd,
- int mi_row,
- int mi_col) {
+static INLINE void set_mode_info_offsets(VP9_COMMON *const cm,
+ MACROBLOCKD *const xd,
+ int mi_row,
+ int mi_col) {
const int idx_str = xd->mi_stride * mi_row + mi_col;
- xd->mi = cm->mi + idx_str;
- xd->mi[0].src_mi = &xd->mi[0];
+ xd->mi = cm->mi_grid_visible + idx_str;
+ xd->mi[0] = cm->mi + idx_str;
}
static void set_offsets(VP9_COMP *cpi, const TileInfo *const tile,
- int mi_row, int mi_col, BLOCK_SIZE bsize) {
- MACROBLOCK *const x = &cpi->mb;
+ MACROBLOCK *const x, int mi_row, int mi_col,
+ BLOCK_SIZE bsize) {
VP9_COMMON *const cm = &cpi->common;
MACROBLOCKD *const xd = &x->e_mbd;
MB_MODE_INFO *mbmi;
@@ -207,9 +188,9 @@ static void set_offsets(VP9_COMP *cpi, const TileInfo *const tile,
set_skip_context(xd, mi_row, mi_col);
- set_modeinfo_offsets(cm, xd, mi_row, mi_col);
+ set_mode_info_offsets(cm, xd, mi_row, mi_col);
- mbmi = &xd->mi[0].src_mi->mbmi;
+ mbmi = &xd->mi[0]->mbmi;
// Set up destination pointers.
vp9_setup_dst_planes(xd->plane, get_frame_new_buffer(cm), mi_row, mi_col);
@@ -258,25 +239,24 @@ static void duplicate_mode_info_in_sb(VP9_COMMON *cm, MACROBLOCKD *xd,
for (j = 0; j < block_height; ++j)
for (i = 0; i < block_width; ++i) {
if (mi_row + j < cm->mi_rows && mi_col + i < cm->mi_cols)
- xd->mi[j * xd->mi_stride + i].src_mi = &xd->mi[0];
+ xd->mi[j * xd->mi_stride + i] = xd->mi[0];
}
}
static void set_block_size(VP9_COMP * const cpi,
+ MACROBLOCKD *const xd,
int mi_row, int mi_col,
BLOCK_SIZE bsize) {
if (cpi->common.mi_cols > mi_col && cpi->common.mi_rows > mi_row) {
- MACROBLOCKD *const xd = &cpi->mb.e_mbd;
- set_modeinfo_offsets(&cpi->common, xd, mi_row, mi_col);
- xd->mi[0].src_mi->mbmi.sb_type = bsize;
- duplicate_mode_info_in_sb(&cpi->common, xd, mi_row, mi_col, bsize);
+ set_mode_info_offsets(&cpi->common, xd, mi_row, mi_col);
+ xd->mi[0]->mbmi.sb_type = bsize;
}
}
typedef struct {
int64_t sum_square_error;
int64_t sum_error;
- int count;
+ int log2_count;
int variance;
} var;
@@ -289,6 +269,11 @@ typedef struct {
typedef struct {
partition_variance part_variances;
var split[4];
+} v4x4;
+
+typedef struct {
+ partition_variance part_variances;
+ v4x4 split[4];
} v8x8;
typedef struct {
@@ -320,7 +305,6 @@ typedef enum {
static void tree_to_node(void *data, BLOCK_SIZE bsize, variance_node *node) {
int i;
node->part_variances = NULL;
- vpx_memset(node->split, 0, sizeof(node->split));
switch (bsize) {
case BLOCK_64X64: {
v64x64 *vt = (v64x64 *) data;
@@ -347,6 +331,13 @@ static void tree_to_node(void *data, BLOCK_SIZE bsize, variance_node *node) {
v8x8 *vt = (v8x8 *) data;
node->part_variances = &vt->part_variances;
for (i = 0; i < 4; i++)
+ node->split[i] = &vt->split[i].part_variances.none;
+ break;
+ }
+ case BLOCK_4X4: {
+ v4x4 *vt = (v4x4 *) data;
+ node->part_variances = &vt->part_variances;
+ for (i = 0; i < 4; i++)
node->split[i] = &vt->split[i];
break;
}
@@ -361,18 +352,18 @@ static void tree_to_node(void *data, BLOCK_SIZE bsize, variance_node *node) {
static void fill_variance(int64_t s2, int64_t s, int c, var *v) {
v->sum_square_error = s2;
v->sum_error = s;
- v->count = c;
- if (c > 0)
- v->variance = (int)(256 *
- (v->sum_square_error - v->sum_error * v->sum_error /
- v->count) / v->count);
- else
- v->variance = 0;
+ v->log2_count = c;
+}
+
+static void get_variance(var *v) {
+ v->variance = (int)(256 * (v->sum_square_error -
+ ((v->sum_error * v->sum_error) >> v->log2_count)) >> v->log2_count);
}
void sum_2_variances(const var *a, const var *b, var *r) {
+ assert(a->log2_count == b->log2_count);
fill_variance(a->sum_square_error + b->sum_square_error,
- a->sum_error + b->sum_error, a->count + b->count, r);
+ a->sum_error + b->sum_error, a->log2_count + 1, r);
}
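// [Editorial worked example, not part of the patch] With log2_count = k a
// node aggregates N = 2^k samples, and get_variance() evaluates
//   variance = 256 * (sum_square_error - sum_error^2 / N) / N
// entirely with shifts. For sum_square_error = 1000, sum_error = 60, k = 4:
//   (60 * 60) >> 4     = 225
//   1000 - 225         = 775
//   (256 * 775) >> 4   = 12400
// sum_2_variances() can simply store a->log2_count + 1 because merging two
// equal-sized children doubles the sample count (k -> k + 1).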
static void fill_variance_tree(void *data, BLOCK_SIZE bsize) {
@@ -387,93 +378,312 @@ static void fill_variance_tree(void *data, BLOCK_SIZE bsize) {
}
static int set_vt_partitioning(VP9_COMP *cpi,
+ MACROBLOCKD *const xd,
void *data,
BLOCK_SIZE bsize,
int mi_row,
- int mi_col) {
+ int mi_col,
+ int64_t threshold,
+ BLOCK_SIZE bsize_min,
+ int force_split) {
VP9_COMMON * const cm = &cpi->common;
variance_node vt;
const int block_width = num_8x8_blocks_wide_lookup[bsize];
const int block_height = num_8x8_blocks_high_lookup[bsize];
- // TODO(debargha): Choose this more intelligently.
- const int threshold_multiplier = cm->frame_type == KEY_FRAME ? 64 : 4;
- int64_t threshold =
- (int64_t)(threshold_multiplier *
- vp9_convert_qindex_to_q(cm->base_qindex, cm->bit_depth));
+ const int low_res = (cm->width <= 352 && cm->height <= 288);
+
assert(block_height == block_width);
tree_to_node(data, bsize, &vt);
- // Split none is available only if we have more than half a block size
- // in width and height inside the visible image.
- if (mi_col + block_width / 2 < cm->mi_cols &&
- mi_row + block_height / 2 < cm->mi_rows &&
- vt.part_variances->none.variance < threshold) {
- set_block_size(cpi, mi_row, mi_col, bsize);
- return 1;
- }
-
- // Only allow split for blocks above 16x16.
- if (bsize > BLOCK_16X16) {
- // Vertical split is available on all but the bottom border.
- if (mi_row + block_height / 2 < cm->mi_rows &&
- vt.part_variances->vert[0].variance < threshold &&
- vt.part_variances->vert[1].variance < threshold) {
- BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_VERT);
- set_block_size(cpi, mi_row, mi_col, subsize);
- set_block_size(cpi, mi_row, mi_col + block_width / 2, subsize);
- return 1;
- }
+ if (force_split == 1)
+ return 0;
- // Horizontal split is available on all but the right border.
+ // For bsize == bsize_min (16x16 or 8x8 for 8x8 or 4x4 downsampling,
+ // respectively), select this size if its variance is below the
+ // threshold; otherwise split is selected. Vert/horiz splits are not
+ // checked, as there are too few samples for a variance estimate.
+ if (bsize == bsize_min) {
+ // Variance already computed to set the force_split.
+ if (low_res || cm->frame_type == KEY_FRAME)
+ get_variance(&vt.part_variances->none);
if (mi_col + block_width / 2 < cm->mi_cols &&
- vt.part_variances->horz[0].variance < threshold &&
- vt.part_variances->horz[1].variance < threshold) {
- BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_HORZ);
- set_block_size(cpi, mi_row, mi_col, subsize);
- set_block_size(cpi, mi_row + block_height / 2, mi_col, subsize);
+ mi_row + block_height / 2 < cm->mi_rows &&
+ vt.part_variances->none.variance < threshold) {
+ set_block_size(cpi, xd, mi_row, mi_col, bsize);
return 1;
}
- }
-
- // This will only allow 8x8 if the 16x16 variance is very large.
- if (bsize == BLOCK_16X16) {
+ return 0;
+ } else if (bsize > bsize_min) {
+ // Variance already computed to set the force_split.
+ if (low_res || cm->frame_type == KEY_FRAME)
+ get_variance(&vt.part_variances->none);
+ // For key frame: take split for bsize above 32x32 or very high variance.
+ if (cm->frame_type == KEY_FRAME &&
+ (bsize > BLOCK_32X32 ||
+ vt.part_variances->none.variance > (threshold << 4))) {
+ return 0;
+ }
+ // If variance is low, take the bsize (no split).
if (mi_col + block_width / 2 < cm->mi_cols &&
mi_row + block_height / 2 < cm->mi_rows &&
- vt.part_variances->none.variance < (threshold << 6)) {
- set_block_size(cpi, mi_row, mi_col, bsize);
+ vt.part_variances->none.variance < threshold) {
+ set_block_size(cpi, xd, mi_row, mi_col, bsize);
return 1;
}
+
+ // Check vertical split.
+ if (mi_row + block_height / 2 < cm->mi_rows) {
+ BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_VERT);
+ get_variance(&vt.part_variances->vert[0]);
+ get_variance(&vt.part_variances->vert[1]);
+ if (vt.part_variances->vert[0].variance < threshold &&
+ vt.part_variances->vert[1].variance < threshold &&
+ get_plane_block_size(subsize, &xd->plane[1]) < BLOCK_INVALID) {
+ set_block_size(cpi, xd, mi_row, mi_col, subsize);
+ set_block_size(cpi, xd, mi_row, mi_col + block_width / 2, subsize);
+ return 1;
+ }
+ }
+ // Check horizontal split.
+ if (mi_col + block_width / 2 < cm->mi_cols) {
+ BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_HORZ);
+ get_variance(&vt.part_variances->horz[0]);
+ get_variance(&vt.part_variances->horz[1]);
+ if (vt.part_variances->horz[0].variance < threshold &&
+ vt.part_variances->horz[1].variance < threshold &&
+ get_plane_block_size(subsize, &xd->plane[1]) < BLOCK_INVALID) {
+ set_block_size(cpi, xd, mi_row, mi_col, subsize);
+ set_block_size(cpi, xd, mi_row + block_height / 2, mi_col, subsize);
+ return 1;
+ }
+ }
+
+ return 0;
}
return 0;
}
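// [Editorial summary, not part of the patch] Decision order inside
// set_vt_partitioning(), per the code above:
//   1. force_split set by the caller  -> return 0 (caller splits further).
//   2. bsize == bsize_min             -> take NONE if the block is fully
//                                        inside the image and its variance
//                                        is below threshold; else split.
//   3. bsize > bsize_min, key frame   -> force split above BLOCK_32X32 or
//                                        when variance > (threshold << 4).
//   4. otherwise                      -> try NONE, then VERT, then HORZ,
//                                        each gated on the per-half
//                                        variances and a valid chroma
//                                        block size for the subsize.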
-// This function chooses partitioning based on the variance
-// between source and reconstructed last, where variance is
-// computed for 8x8 downsampled inputs. Some things to check:
-// using the last source rather than reconstructed last, and
-// allowing for small downsampling (4x4 or 2x2) for selection
-// of smaller block sizes (i.e., < 16x16).
-static void choose_partitioning(VP9_COMP *cpi,
+void vp9_set_vbp_thresholds(VP9_COMP *cpi, int q) {
+ SPEED_FEATURES *const sf = &cpi->sf;
+ if (sf->partition_search_type != VAR_BASED_PARTITION &&
+ sf->partition_search_type != REFERENCE_PARTITION) {
+ return;
+ } else {
+ VP9_COMMON *const cm = &cpi->common;
+ const int is_key_frame = (cm->frame_type == KEY_FRAME);
+ const int threshold_multiplier = is_key_frame ? 20 : 1;
+ const int64_t threshold_base = (int64_t)(threshold_multiplier *
+ cpi->y_dequant[q][1]);
+
+ // TODO(marpan): Allow 4x4 partitions for inter-frames.
+ // use_4x4_partition = (variance4x4downsample[i2 + j] == 1);
+ // If 4x4 partition is not used, then 8x8 partition will be selected
+ // if variance of 16x16 block is very high, so use larger threshold
+ // for 16x16 (threshold_bsize_min) in that case.
+
+ // Array index: 0 - threshold_64x64; 1 - threshold_32x32;
+ // 2 - threshold_16x16; 3 - vbp_threshold_8x8;
+ if (is_key_frame) {
+ cpi->vbp_thresholds[0] = threshold_base;
+ cpi->vbp_thresholds[1] = threshold_base >> 2;
+ cpi->vbp_thresholds[2] = threshold_base >> 2;
+ cpi->vbp_thresholds[3] = threshold_base << 2;
+ cpi->vbp_threshold_sad = 0;
+ cpi->vbp_bsize_min = BLOCK_8X8;
+ } else {
+ cpi->vbp_thresholds[1] = threshold_base;
+ if (cm->width <= 352 && cm->height <= 288) {
+ cpi->vbp_thresholds[0] = threshold_base >> 2;
+ cpi->vbp_thresholds[2] = threshold_base << 3;
+ cpi->vbp_threshold_sad = 100;
+ } else {
+ cpi->vbp_thresholds[0] = threshold_base;
+ cpi->vbp_thresholds[1] = (5 * threshold_base) >> 2;
+ cpi->vbp_thresholds[2] = threshold_base << cpi->oxcf.speed;
+ cpi->vbp_threshold_sad = (cpi->y_dequant[q][1] << 1) > 1000 ?
+ (cpi->y_dequant[q][1] << 1) : 1000;
+ }
+ cpi->vbp_bsize_min = BLOCK_16X16;
+ }
+ cpi->vbp_threshold_minmax = 15 + (q >> 3);
+ }
+}
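// [Editorial example, not part of the patch] Plugging a hypothetical dequant
// value y_dequant[q][1] == 40 into the logic above, purely for illustration:
//   key frame:  threshold_base = 20 * 40 = 800
//     vbp_thresholds = { 800, 200, 200, 3200 }, vbp_threshold_sad = 0,
//     vbp_bsize_min = BLOCK_8X8 (so 4x4 partitions are reachable).
//   inter frame, not low-res, hypothetical speed 6: threshold_base = 40
//     vbp_thresholds[0..2] = { 40, 50, 2560 },
//     vbp_threshold_sad = MAX(2 * 40, 1000) = 1000,
//     vbp_bsize_min = BLOCK_16X16.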
+
+// Compute the minmax over the 8x8 subblocks.
+static int compute_minmax_8x8(const uint8_t *s, int sp, const uint8_t *d,
+ int dp, int x16_idx, int y16_idx,
+#if CONFIG_VP9_HIGHBITDEPTH
+ int highbd_flag,
+#endif
+ int pixels_wide,
+ int pixels_high) {
+ int k;
+ int minmax_max = 0;
+ int minmax_min = 255;
+ // Loop over the 4 8x8 subblocks.
+ for (k = 0; k < 4; k++) {
+ int x8_idx = x16_idx + ((k & 1) << 3);
+ int y8_idx = y16_idx + ((k >> 1) << 3);
+ int min = 0;
+ int max = 0;
+ if (x8_idx < pixels_wide && y8_idx < pixels_high) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (highbd_flag & YV12_FLAG_HIGHBITDEPTH) {
+ vp9_highbd_minmax_8x8(s + y8_idx * sp + x8_idx, sp,
+ d + y8_idx * dp + x8_idx, dp,
+ &min, &max);
+ } else {
+ vp9_minmax_8x8(s + y8_idx * sp + x8_idx, sp,
+ d + y8_idx * dp + x8_idx, dp,
+ &min, &max);
+ }
+#else
+ vp9_minmax_8x8(s + y8_idx * sp + x8_idx, sp,
+ d + y8_idx * dp + x8_idx, dp,
+ &min, &max);
+#endif
+ if ((max - min) > minmax_max)
+ minmax_max = (max - min);
+ if ((max - min) < minmax_min)
+ minmax_min = (max - min);
+ }
+ }
+ return (minmax_max - minmax_min);
+}
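// [Editorial example, not part of the patch] The return value is the spread
// of the per-subblock (max - min) ranges, a cheap probe for inconsistent
// texture. E.g. if the four 8x8 subblocks yield ranges {30, 8, 12, 25}, the
// result is 30 - 8 = 22, which forces an 8x8 split whenever it exceeds
// vbp_threshold_minmax (15 + (q >> 3), set in vp9_set_vbp_thresholds above).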
+
+static void modify_vbp_thresholds(VP9_COMP *cpi, int64_t thresholds[], int q) {
+ VP9_COMMON *const cm = &cpi->common;
+ const int64_t threshold_base = (int64_t)(cpi->y_dequant[q][1]);
+
+ // Array index: 0 - threshold_64x64; 1 - threshold_32x32;
+ // 2 - threshold_16x16; 3 - vbp_threshold_8x8;
+ thresholds[1] = threshold_base;
+ if (cm->width <= 352 && cm->height <= 288) {
+ thresholds[0] = threshold_base >> 2;
+ thresholds[2] = threshold_base << 3;
+ } else {
+ thresholds[0] = threshold_base;
+ thresholds[1] = (5 * threshold_base) >> 2;
+ thresholds[2] = threshold_base << cpi->oxcf.speed;
+ }
+}
+
+static void fill_variance_4x4avg(const uint8_t *s, int sp, const uint8_t *d,
+ int dp, int x8_idx, int y8_idx, v8x8 *vst,
+#if CONFIG_VP9_HIGHBITDEPTH
+ int highbd_flag,
+#endif
+ int pixels_wide,
+ int pixels_high,
+ int is_key_frame) {
+ int k;
+ for (k = 0; k < 4; k++) {
+ int x4_idx = x8_idx + ((k & 1) << 2);
+ int y4_idx = y8_idx + ((k >> 1) << 2);
+ unsigned int sse = 0;
+ int sum = 0;
+ if (x4_idx < pixels_wide && y4_idx < pixels_high) {
+ int s_avg;
+ int d_avg = 128;
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (highbd_flag & YV12_FLAG_HIGHBITDEPTH) {
+ s_avg = vp9_highbd_avg_4x4(s + y4_idx * sp + x4_idx, sp);
+ if (!is_key_frame)
+ d_avg = vp9_highbd_avg_4x4(d + y4_idx * dp + x4_idx, dp);
+ } else {
+ s_avg = vp9_avg_4x4(s + y4_idx * sp + x4_idx, sp);
+ if (!is_key_frame)
+ d_avg = vp9_avg_4x4(d + y4_idx * dp + x4_idx, dp);
+ }
+#else
+ s_avg = vp9_avg_4x4(s + y4_idx * sp + x4_idx, sp);
+ if (!is_key_frame)
+ d_avg = vp9_avg_4x4(d + y4_idx * dp + x4_idx, dp);
+#endif
+ sum = s_avg - d_avg;
+ sse = sum * sum;
+ }
+ fill_variance(sse, sum, 0, &vst->split[k].part_variances.none);
+ }
+}
+
+static void fill_variance_8x8avg(const uint8_t *s, int sp, const uint8_t *d,
+ int dp, int x16_idx, int y16_idx, v16x16 *vst,
+#if CONFIG_VP9_HIGHBITDEPTH
+ int highbd_flag,
+#endif
+ int pixels_wide,
+ int pixels_high,
+ int is_key_frame) {
+ int k;
+ for (k = 0; k < 4; k++) {
+ int x8_idx = x16_idx + ((k & 1) << 3);
+ int y8_idx = y16_idx + ((k >> 1) << 3);
+ unsigned int sse = 0;
+ int sum = 0;
+ if (x8_idx < pixels_wide && y8_idx < pixels_high) {
+ int s_avg;
+ int d_avg = 128;
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (highbd_flag & YV12_FLAG_HIGHBITDEPTH) {
+ s_avg = vp9_highbd_avg_8x8(s + y8_idx * sp + x8_idx, sp);
+ if (!is_key_frame)
+ d_avg = vp9_highbd_avg_8x8(d + y8_idx * dp + x8_idx, dp);
+ } else {
+ s_avg = vp9_avg_8x8(s + y8_idx * sp + x8_idx, sp);
+ if (!is_key_frame)
+ d_avg = vp9_avg_8x8(d + y8_idx * dp + x8_idx, dp);
+ }
+#else
+ s_avg = vp9_avg_8x8(s + y8_idx * sp + x8_idx, sp);
+ if (!is_key_frame)
+ d_avg = vp9_avg_8x8(d + y8_idx * dp + x8_idx, dp);
+#endif
+ sum = s_avg - d_avg;
+ sse = sum * sum;
+ }
+ fill_variance(sse, sum, 0, &vst->split[k].part_variances.none);
+ }
+}
+
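// [Editorial note, not part of the patch] In both fill helpers above, each
// leaf contributes a single sample: sum = s_avg - d_avg and sse = sum * sum,
// with fill_variance() called at log2_count = 0. A lone leaf therefore has
// zero variance (256 * (sum^2 - sum^2) = 0); meaningful variance only
// emerges one level up, once sum_2_variances() merges leaves whose averages
// disagree.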
+// This function chooses partitioning based on the variance between source and
+// reconstructed last, where variance is computed for down-sampled inputs.
+static int choose_partitioning(VP9_COMP *cpi,
const TileInfo *const tile,
+ MACROBLOCK *x,
int mi_row, int mi_col) {
VP9_COMMON * const cm = &cpi->common;
- MACROBLOCK *x = &cpi->mb;
- MACROBLOCKD *xd = &cpi->mb.e_mbd;
-
- int i, j, k;
+ MACROBLOCKD *xd = &x->e_mbd;
+ int i, j, k, m;
v64x64 vt;
+ v16x16 vt2[16];
+ int force_split[21];
uint8_t *s;
const uint8_t *d;
int sp;
int dp;
int pixels_wide = 64, pixels_high = 64;
- int_mv nearest_mv, near_mv;
- const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, LAST_FRAME);
- const struct scale_factors *const sf = &cm->frame_refs[LAST_FRAME - 1].sf;
+ int64_t thresholds[4] = {cpi->vbp_thresholds[0], cpi->vbp_thresholds[1],
+ cpi->vbp_thresholds[2], cpi->vbp_thresholds[3]};
+
+ // Always use 4x4 partition for key frame.
+ const int is_key_frame = (cm->frame_type == KEY_FRAME);
+ const int use_4x4_partition = is_key_frame;
+ const int low_res = (cm->width <= 352 && cm->height <= 288);
+ int variance4x4downsample[16];
+
+ int segment_id = CR_SEGMENT_ID_BASE;
+ if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled) {
+ const uint8_t *const map = cm->seg.update_map ? cpi->segmentation_map :
+ cm->last_frame_seg_map;
+ segment_id = vp9_get_segment_id(cm, map, BLOCK_64X64, mi_row, mi_col);
+
+ if (cyclic_refresh_segment_id_boosted(segment_id)) {
+ int q = vp9_get_qindex(&cm->seg, segment_id, cm->base_qindex);
+ modify_vbp_thresholds(cpi, thresholds, q);
+ }
+ }
- vp9_clear_system_state();
- vp9_zero(vt);
- set_offsets(cpi, tile, mi_row, mi_col, BLOCK_64X64);
+ set_offsets(cpi, tile, x, mi_row, mi_col, BLOCK_64X64);
if (xd->mb_to_right_edge < 0)
pixels_wide += (xd->mb_to_right_edge >> 3);
@@ -483,20 +693,78 @@ static void choose_partitioning(VP9_COMP *cpi,
s = x->plane[0].src.buf;
sp = x->plane[0].src.stride;
- if (cm->frame_type != KEY_FRAME) {
- vp9_setup_pre_planes(xd, 0, yv12, mi_row, mi_col, sf);
+ if (!is_key_frame) {
+ MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+ unsigned int uv_sad;
+ const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, LAST_FRAME);
+
+ const YV12_BUFFER_CONFIG *yv12_g = get_ref_frame_buffer(cpi, GOLDEN_FRAME);
+ unsigned int y_sad, y_sad_g;
+ const BLOCK_SIZE bsize = BLOCK_32X32
+ + (mi_col + 4 < cm->mi_cols) * 2 + (mi_row + 4 < cm->mi_rows);
+
+ assert(yv12 != NULL);
+ if (yv12_g && yv12_g != yv12) {
+ vp9_setup_pre_planes(xd, 0, yv12_g, mi_row, mi_col,
+ &cm->frame_refs[GOLDEN_FRAME - 1].sf);
+ y_sad_g = cpi->fn_ptr[bsize].sdf(x->plane[0].src.buf,
+ x->plane[0].src.stride,
+ xd->plane[0].pre[0].buf,
+ xd->plane[0].pre[0].stride);
+ } else {
+ y_sad_g = UINT_MAX;
+ }
+
+ vp9_setup_pre_planes(xd, 0, yv12, mi_row, mi_col,
+ &cm->frame_refs[LAST_FRAME - 1].sf);
+ mbmi->ref_frame[0] = LAST_FRAME;
+ mbmi->ref_frame[1] = NONE;
+ mbmi->sb_type = BLOCK_64X64;
+ mbmi->mv[0].as_int = 0;
+ mbmi->interp_filter = BILINEAR;
+
+ y_sad = vp9_int_pro_motion_estimation(cpi, x, bsize);
+ if (y_sad_g < y_sad) {
+ vp9_setup_pre_planes(xd, 0, yv12_g, mi_row, mi_col,
+ &cm->frame_refs[GOLDEN_FRAME - 1].sf);
+ mbmi->ref_frame[0] = GOLDEN_FRAME;
+ mbmi->mv[0].as_int = 0;
+ y_sad = y_sad_g;
+ } else {
+ x->pred_mv[LAST_FRAME] = mbmi->mv[0].as_mv;
+ }
- xd->mi[0].src_mi->mbmi.ref_frame[0] = LAST_FRAME;
- xd->mi[0].src_mi->mbmi.sb_type = BLOCK_64X64;
- vp9_find_best_ref_mvs(xd, cm->allow_high_precision_mv,
- xd->mi[0].src_mi->mbmi.ref_mvs[LAST_FRAME],
- &nearest_mv, &near_mv);
+ vp9_build_inter_predictors_sb(xd, mi_row, mi_col, BLOCK_64X64);
- xd->mi[0].src_mi->mbmi.mv[0] = nearest_mv;
- vp9_build_inter_predictors_sby(xd, mi_row, mi_col, BLOCK_64X64);
+ for (i = 1; i <= 2; ++i) {
+ struct macroblock_plane *p = &x->plane[i];
+ struct macroblockd_plane *pd = &xd->plane[i];
+ const BLOCK_SIZE bs = get_plane_block_size(bsize, pd);
+
+ if (bs == BLOCK_INVALID)
+ uv_sad = UINT_MAX;
+ else
+ uv_sad = cpi->fn_ptr[bs].sdf(p->src.buf, p->src.stride,
+ pd->dst.buf, pd->dst.stride);
+
+ x->color_sensitivity[i - 1] = uv_sad > (y_sad >> 2);
+ }
d = xd->plane[0].dst.buf;
dp = xd->plane[0].dst.stride;
+
+ // If the y_sad is very small, take the 64x64 partition and exit.
+ // Don't check on boosted segment for now, as 64x64 is suppressed there.
+ if (segment_id == CR_SEGMENT_ID_BASE &&
+ y_sad < cpi->vbp_threshold_sad) {
+ const int block_width = num_8x8_blocks_wide_lookup[BLOCK_64X64];
+ const int block_height = num_8x8_blocks_high_lookup[BLOCK_64X64];
+ if (mi_col + block_width / 2 < cm->mi_cols &&
+ mi_row + block_height / 2 < cm->mi_rows) {
+ set_block_size(cpi, xd, mi_row, mi_col, BLOCK_64X64);
+ return 0;
+ }
+ }
} else {
d = VP9_VAR_OFFS;
dp = 0;
@@ -518,104 +786,188 @@ static void choose_partitioning(VP9_COMP *cpi,
#endif // CONFIG_VP9_HIGHBITDEPTH
}
- // Fill in the entire tree of 8x8 variances for splits.
+ // Index for force_split: 0 for 64x64, 1-4 for 32x32 blocks,
+ // 5-20 for the 16x16 blocks.
+ force_split[0] = 0;
+ // Fill in the entire tree of 8x8 (or 4x4 under some conditions) variances
+ // for splits.
for (i = 0; i < 4; i++) {
const int x32_idx = ((i & 1) << 5);
const int y32_idx = ((i >> 1) << 5);
+ const int i2 = i << 2;
+ force_split[i + 1] = 0;
for (j = 0; j < 4; j++) {
const int x16_idx = x32_idx + ((j & 1) << 4);
const int y16_idx = y32_idx + ((j >> 1) << 4);
+ const int split_index = 5 + i2 + j;
v16x16 *vst = &vt.split[i].split[j];
- for (k = 0; k < 4; k++) {
- int x_idx = x16_idx + ((k & 1) << 3);
- int y_idx = y16_idx + ((k >> 1) << 3);
- unsigned int sse = 0;
- int sum = 0;
-
- if (x_idx < pixels_wide && y_idx < pixels_high) {
- int s_avg, d_avg;
+ force_split[split_index] = 0;
+ variance4x4downsample[i2 + j] = 0;
+ if (!is_key_frame) {
+ fill_variance_8x8avg(s, sp, d, dp, x16_idx, y16_idx, vst,
#if CONFIG_VP9_HIGHBITDEPTH
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- s_avg = vp9_highbd_avg_8x8(s + y_idx * sp + x_idx, sp);
- d_avg = vp9_highbd_avg_8x8(d + y_idx * dp + x_idx, dp);
- } else {
- s_avg = vp9_avg_8x8(s + y_idx * sp + x_idx, sp);
- d_avg = vp9_avg_8x8(d + y_idx * dp + x_idx, dp);
+ xd->cur_buf->flags,
+#endif
+ pixels_wide,
+ pixels_high,
+ is_key_frame);
+ fill_variance_tree(&vt.split[i].split[j], BLOCK_16X16);
+ get_variance(&vt.split[i].split[j].part_variances.none);
+ if (vt.split[i].split[j].part_variances.none.variance >
+ thresholds[2]) {
+ // 16x16 variance is above threshold for split, so force split to 8x8
+ // for this 16x16 block (this also forces splits for upper levels).
+ force_split[split_index] = 1;
+ force_split[i + 1] = 1;
+ force_split[0] = 1;
+ } else if (vt.split[i].split[j].part_variances.none.variance >
+ thresholds[1] &&
+ !cyclic_refresh_segment_id_boosted(segment_id)) {
+ // We have some nominal amount of 16x16 variance (based on average),
+ // compute the minmax over the 8x8 sub-blocks, and if above threshold,
+ // force split to 8x8 block for this 16x16 block.
+ int minmax = compute_minmax_8x8(s, sp, d, dp, x16_idx, y16_idx,
+#if CONFIG_VP9_HIGHBITDEPTH
+ xd->cur_buf->flags,
+#endif
+ pixels_wide, pixels_high);
+ if (minmax > cpi->vbp_threshold_minmax) {
+ force_split[split_index] = 1;
+ force_split[i + 1] = 1;
+ force_split[0] = 1;
}
-#else
- s_avg = vp9_avg_8x8(s + y_idx * sp + x_idx, sp);
- d_avg = vp9_avg_8x8(d + y_idx * dp + x_idx, dp);
+ }
+ }
+ if (is_key_frame || (low_res &&
+ vt.split[i].split[j].part_variances.none.variance >
+ (thresholds[1] << 1))) {
+ force_split[split_index] = 0;
+ // Go down to 4x4 down-sampling for variance.
+ variance4x4downsample[i2 + j] = 1;
+ for (k = 0; k < 4; k++) {
+ int x8_idx = x16_idx + ((k & 1) << 3);
+ int y8_idx = y16_idx + ((k >> 1) << 3);
+ v8x8 *vst2 = is_key_frame ? &vst->split[k] :
+ &vt2[i2 + j].split[k];
+ fill_variance_4x4avg(s, sp, d, dp, x8_idx, y8_idx, vst2,
+#if CONFIG_VP9_HIGHBITDEPTH
+ xd->cur_buf->flags,
#endif
- sum = s_avg - d_avg;
- sse = sum * sum;
+ pixels_wide,
+ pixels_high,
+ is_key_frame);
}
- // For an 8x8 block we have just one value, the average of all 64
- // pixels, so use 1. This means of course that there is no variance
- // in an 8x8 block.
- fill_variance(sse, sum, 1, &vst->split[k].part_variances.none);
}
}
}
+
// Fill the rest of the variance tree by summing split partition values.
for (i = 0; i < 4; i++) {
+ const int i2 = i << 2;
for (j = 0; j < 4; j++) {
- fill_variance_tree(&vt.split[i].split[j], BLOCK_16X16);
+ if (variance4x4downsample[i2 + j] == 1) {
+ v16x16 *vtemp = (!is_key_frame) ? &vt2[i2 + j] :
+ &vt.split[i].split[j];
+ for (m = 0; m < 4; m++)
+ fill_variance_tree(&vtemp->split[m], BLOCK_8X8);
+ fill_variance_tree(vtemp, BLOCK_16X16);
+ }
}
fill_variance_tree(&vt.split[i], BLOCK_32X32);
+ // If variance of this 32x32 block is above the threshold, force the block
+ // to split. This also forces a split on the upper (64x64) level.
+ if (!force_split[i + 1]) {
+ get_variance(&vt.split[i].part_variances.none);
+ if (vt.split[i].part_variances.none.variance > thresholds[1]) {
+ force_split[i + 1] = 1;
+ force_split[0] = 1;
+ }
+ }
+ }
+ if (!force_split[0]) {
+ fill_variance_tree(&vt, BLOCK_64X64);
+ get_variance(&vt.part_variances.none);
}
- fill_variance_tree(&vt, BLOCK_64X64);
- // Now go through the entire structure, splitting every block size until
- // we get to one that's got a variance lower than our threshold, or we
- // hit 8x8.
+ // Now go through the entire structure, splitting every block size until
+ // we get to one that's got a variance lower than our threshold.
if ( mi_col + 8 > cm->mi_cols || mi_row + 8 > cm->mi_rows ||
- !set_vt_partitioning(cpi, &vt, BLOCK_64X64, mi_row, mi_col)) {
+ !set_vt_partitioning(cpi, xd, &vt, BLOCK_64X64, mi_row, mi_col,
+ thresholds[0], BLOCK_16X16, force_split[0])) {
for (i = 0; i < 4; ++i) {
const int x32_idx = ((i & 1) << 2);
const int y32_idx = ((i >> 1) << 2);
- if (!set_vt_partitioning(cpi, &vt.split[i], BLOCK_32X32,
- (mi_row + y32_idx), (mi_col + x32_idx))) {
+ const int i2 = i << 2;
+ if (!set_vt_partitioning(cpi, xd, &vt.split[i], BLOCK_32X32,
+ (mi_row + y32_idx), (mi_col + x32_idx),
+ thresholds[1], BLOCK_16X16,
+ force_split[i + 1])) {
for (j = 0; j < 4; ++j) {
const int x16_idx = ((j & 1) << 1);
const int y16_idx = ((j >> 1) << 1);
- // NOTE: Since this uses 8x8 downsampling for variance calculation
- // we cannot really select block size 8x8 (or even 8x16/16x8),
- // since we do not have sufficient samples for variance.
- // For now, 8x8 partition is only set if the variance of the 16x16
- // block is very high. This is controlled in set_vt_partitioning.
- if (!set_vt_partitioning(cpi, &vt.split[i].split[j],
- BLOCK_16X16,
+ // For inter frames: if variance4x4downsample[] == 1 for this 16x16
+ // block, then the variance is based on 4x4 down-sampling, so use vt2
+ // in set_vt_partitioning(), otherwise use vt.
+ v16x16 *vtemp = (!is_key_frame &&
+ variance4x4downsample[i2 + j] == 1) ?
+ &vt2[i2 + j] : &vt.split[i].split[j];
+ if (!set_vt_partitioning(cpi, xd, vtemp, BLOCK_16X16,
mi_row + y32_idx + y16_idx,
- mi_col + x32_idx + x16_idx)) {
+ mi_col + x32_idx + x16_idx,
+ thresholds[2],
+ cpi->vbp_bsize_min,
+ force_split[5 + i2 + j])) {
for (k = 0; k < 4; ++k) {
const int x8_idx = (k & 1);
const int y8_idx = (k >> 1);
- set_block_size(cpi,
- (mi_row + y32_idx + y16_idx + y8_idx),
- (mi_col + x32_idx + x16_idx + x8_idx),
- BLOCK_8X8);
+ if (use_4x4_partition) {
+ if (!set_vt_partitioning(cpi, xd, &vtemp->split[k],
+ BLOCK_8X8,
+ mi_row + y32_idx + y16_idx + y8_idx,
+ mi_col + x32_idx + x16_idx + x8_idx,
+ thresholds[3], BLOCK_8X8, 0)) {
+ set_block_size(cpi, xd,
+ (mi_row + y32_idx + y16_idx + y8_idx),
+ (mi_col + x32_idx + x16_idx + x8_idx),
+ BLOCK_4X4);
+ }
+ } else {
+ set_block_size(cpi, xd,
+ (mi_row + y32_idx + y16_idx + y8_idx),
+ (mi_col + x32_idx + x16_idx + x8_idx),
+ BLOCK_8X8);
+ }
}
}
}
}
}
}
+ return 0;
}
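// [Editorial note, not part of the patch] The force_split[] indexing used
// throughout choose_partitioning() above is:
//   index 0           -> the 64x64 block,
//   index 1 + i       -> 32x32 block i (0..3),
//   index 5 + 4*i + j -> 16x16 block j inside 32x32 block i,
// so e.g. (i = 2, j = 3) maps to 5 + 8 + 3 = 16, matching
// split_index = 5 + i2 + j with i2 = i << 2. A flag set at a 16x16 block is
// propagated to indices i + 1 and 0, forcing every enclosing level to split.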
-static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
+static void update_state(VP9_COMP *cpi, ThreadData *td,
+ PICK_MODE_CONTEXT *ctx,
int mi_row, int mi_col, BLOCK_SIZE bsize,
int output_enabled) {
int i, x_idx, y;
VP9_COMMON *const cm = &cpi->common;
- RD_OPT *const rd_opt = &cpi->rd;
- MACROBLOCK *const x = &cpi->mb;
+ RD_COUNTS *const rdc = &td->rd_counts;
+ MACROBLOCK *const x = &td->mb;
MACROBLOCKD *const xd = &x->e_mbd;
struct macroblock_plane *const p = x->plane;
struct macroblockd_plane *const pd = xd->plane;
MODE_INFO *mi = &ctx->mic;
- MB_MODE_INFO *const mbmi = &xd->mi[0].src_mi->mbmi;
- MODE_INFO *mi_addr = &xd->mi[0];
+ MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+ MODE_INFO *mi_addr = xd->mi[0];
const struct segmentation *const seg = &cm->seg;
+ const int bw = num_8x8_blocks_wide_lookup[mi->mbmi.sb_type];
+ const int bh = num_8x8_blocks_high_lookup[mi->mbmi.sb_type];
+ const int x_mis = MIN(bw, cm->mi_cols - mi_col);
+ const int y_mis = MIN(bh, cm->mi_rows - mi_row);
+ MV_REF *const frame_mvs =
+ cm->cur_frame->mvs + mi_row * cm->mi_cols + mi_col;
+ int w, h;
const int mis = cm->mi_stride;
const int mi_width = num_8x8_blocks_wide_lookup[bsize];
@@ -625,10 +977,9 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
assert(mi->mbmi.sb_type == bsize);
*mi_addr = *mi;
- mi_addr->src_mi = mi_addr;
// If segmentation in use
- if (seg->enabled && output_enabled) {
+ if (seg->enabled) {
// For in frame complexity AQ copy the segment id from the segment map.
if (cpi->oxcf.aq_mode == COMPLEXITY_AQ) {
const uint8_t *const map = seg->update_map ? cpi->segmentation_map
@@ -639,8 +990,9 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
// Else for cyclic refresh mode update the segment map, set the segment id
// and then update the quantizer.
if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) {
- vp9_cyclic_refresh_update_segment(cpi, &xd->mi[0].src_mi->mbmi,
- mi_row, mi_col, bsize, 1);
+ vp9_cyclic_refresh_update_segment(cpi, &xd->mi[0]->mbmi, mi_row,
+ mi_col, bsize, ctx->rate, ctx->dist,
+ x->skip);
}
}
@@ -665,7 +1017,7 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
for (x_idx = 0; x_idx < mi_width; x_idx++)
if ((xd->mb_to_right_edge >> (3 + MI_SIZE_LOG2)) + mi_width > x_idx
&& (xd->mb_to_bottom_edge >> (3 + MI_SIZE_LOG2)) + mi_height > y) {
- xd->mi[x_idx + y * mis].src_mi = mi_addr;
+ xd->mi[x_idx + y * mis] = mi_addr;
}
if (cpi->oxcf.aq_mode)
@@ -685,15 +1037,15 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
}
x->skip = ctx->skip;
- vpx_memcpy(x->zcoeff_blk[mbmi->tx_size], ctx->zcoeff_blk,
- sizeof(uint8_t) * ctx->num_4x4_blk);
+ memcpy(x->zcoeff_blk[mbmi->tx_size], ctx->zcoeff_blk,
+ sizeof(uint8_t) * ctx->num_4x4_blk);
if (!output_enabled)
return;
if (!vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
for (i = 0; i < TX_MODES; i++)
- rd_opt->tx_select_diff[i] += ctx->tx_rd_diff[i];
+ rdc->tx_select_diff[i] += ctx->tx_rd_diff[i];
}
#if CONFIG_INTERNAL_STATS
@@ -718,20 +1070,31 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
#endif
if (!frame_is_intra_only(cm)) {
if (is_inter_block(mbmi)) {
- vp9_update_mv_count(cm, xd);
+ vp9_update_mv_count(td);
if (cm->interp_filter == SWITCHABLE) {
const int ctx = vp9_get_pred_context_switchable_interp(xd);
- ++cm->counts.switchable_interp[ctx][mbmi->interp_filter];
+ ++td->counts->switchable_interp[ctx][mbmi->interp_filter];
}
}
- rd_opt->comp_pred_diff[SINGLE_REFERENCE] += ctx->single_pred_diff;
- rd_opt->comp_pred_diff[COMPOUND_REFERENCE] += ctx->comp_pred_diff;
- rd_opt->comp_pred_diff[REFERENCE_MODE_SELECT] += ctx->hybrid_pred_diff;
+ rdc->comp_pred_diff[SINGLE_REFERENCE] += ctx->single_pred_diff;
+ rdc->comp_pred_diff[COMPOUND_REFERENCE] += ctx->comp_pred_diff;
+ rdc->comp_pred_diff[REFERENCE_MODE_SELECT] += ctx->hybrid_pred_diff;
for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
- rd_opt->filter_diff[i] += ctx->best_filter_diff[i];
+ rdc->filter_diff[i] += ctx->best_filter_diff[i];
+ }
+
+ for (h = 0; h < y_mis; ++h) {
+ MV_REF *const frame_mv = frame_mvs + h * cm->mi_cols;
+ for (w = 0; w < x_mis; ++w) {
+ MV_REF *const mv = frame_mv + w;
+ mv->ref_frame[0] = mi->mbmi.ref_frame[0];
+ mv->ref_frame[1] = mi->mbmi.ref_frame[1];
+ mv->mv[0].as_int = mi->mbmi.mv[0].as_int;
+ mv->mv[1].as_int = mi->mbmi.mv[1].as_int;
+ }
}
}
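// [Editorial example, not part of the patch] The new frame_mvs bookkeeping
// above stores one MV_REF per 8x8 mi unit. A BLOCK_32X32 therefore spans
// bw = bh = 4 units per side, and x_mis/y_mis clip that footprint at the
// frame edge: for a block whose mi_col lies 2 units from cm->mi_cols,
// x_mis = MIN(4, 2) = 2, so the loop writes only the in-frame y_mis-by-2
// patch of MV_REFs.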
@@ -750,16 +1113,16 @@ void vp9_setup_src_planes(MACROBLOCK *x, const YV12_BUFFER_CONFIG *src,
x->e_mbd.plane[i].subsampling_y);
}
-static void set_mode_info_seg_skip(MACROBLOCK *x, TX_MODE tx_mode, int *rate,
- int64_t *dist, BLOCK_SIZE bsize) {
+static void set_mode_info_seg_skip(MACROBLOCK *x, TX_MODE tx_mode,
+ RD_COST *rd_cost, BLOCK_SIZE bsize) {
MACROBLOCKD *const xd = &x->e_mbd;
- MB_MODE_INFO *const mbmi = &xd->mi[0].src_mi->mbmi;
+ MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
INTERP_FILTER filter_ref;
if (xd->up_available)
- filter_ref = xd->mi[-xd->mi_stride].src_mi->mbmi.interp_filter;
+ filter_ref = xd->mi[-xd->mi_stride]->mbmi.interp_filter;
else if (xd->left_available)
- filter_ref = xd->mi[-1].src_mi->mbmi.interp_filter;
+ filter_ref = xd->mi[-1]->mbmi.interp_filter;
else
filter_ref = EIGHTTAP;
@@ -774,35 +1137,46 @@ static void set_mode_info_seg_skip(MACROBLOCK *x, TX_MODE tx_mode, int *rate,
mbmi->mv[0].as_int = 0;
mbmi->interp_filter = filter_ref;
- xd->mi[0].src_mi->bmi[0].as_mv[0].as_int = 0;
+ xd->mi[0]->bmi[0].as_mv[0].as_int = 0;
x->skip = 1;
- *rate = 0;
- *dist = 0;
+ vp9_rd_cost_init(rd_cost);
}
-static void rd_pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile,
+static int set_segment_rdmult(VP9_COMP *const cpi,
+ MACROBLOCK *const x,
+ int8_t segment_id) {
+ int segment_qindex;
+ VP9_COMMON *const cm = &cpi->common;
+ vp9_init_plane_quantizers(cpi, x);
+ vp9_clear_system_state();
+ segment_qindex = vp9_get_qindex(&cm->seg, segment_id,
+ cm->base_qindex);
+ return vp9_compute_rd_mult(cpi, segment_qindex + cm->y_dc_delta_q);
+}
+
+static void rd_pick_sb_modes(VP9_COMP *cpi,
+ TileDataEnc *tile_data,
+ MACROBLOCK *const x,
int mi_row, int mi_col, RD_COST *rd_cost,
BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
int64_t best_rd) {
VP9_COMMON *const cm = &cpi->common;
- MACROBLOCK *const x = &cpi->mb;
+ TileInfo *const tile_info = &tile_data->tile_info;
MACROBLOCKD *const xd = &x->e_mbd;
MB_MODE_INFO *mbmi;
struct macroblock_plane *const p = x->plane;
struct macroblockd_plane *const pd = xd->plane;
const AQ_MODE aq_mode = cpi->oxcf.aq_mode;
int i, orig_rdmult;
- double rdmult_ratio;
vp9_clear_system_state();
- rdmult_ratio = 1.0; // avoid uninitialized warnings
// Use the lower precision, but faster, 32x32 fdct for mode selection.
x->use_lp32x32fdct = 1;
- set_offsets(cpi, tile, mi_row, mi_col, bsize);
- mbmi = &xd->mi[0].src_mi->mbmi;
+ set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
+ mbmi = &xd->mi[0]->mbmi;
mbmi->sb_type = bsize;
for (i = 0; i < MAX_MB_PLANE; ++i) {
@@ -813,6 +1187,7 @@ static void rd_pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile,
}
ctx->is_coded = 0;
ctx->skippable = 0;
+ ctx->pred_pixel_ready = 0;
x->skip_recode = 0;
// Set to zero to make sure we do not use the previous encoded frame stats
@@ -821,13 +1196,15 @@ static void rd_pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile,
#if CONFIG_VP9_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
x->source_variance =
- high_get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize, xd->bd);
+ vp9_high_get_sby_perpixel_variance(cpi, &x->plane[0].src,
+ bsize, xd->bd);
} else {
x->source_variance =
- get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize);
+ vp9_get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize);
}
#else
- x->source_variance = get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize);
+ x->source_variance =
+ vp9_get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize);
#endif // CONFIG_VP9_HIGHBITDEPTH
// Save rdmult before it might be changed, so it can be restored later.
@@ -845,23 +1222,15 @@ static void rd_pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile,
: cm->last_frame_seg_map;
mbmi->segment_id = vp9_get_segment_id(cm, map, bsize, mi_row, mi_col);
}
-
- rdmult_ratio = vp9_vaq_rdmult_ratio(energy);
- vp9_init_plane_quantizers(cpi, x);
- vp9_clear_system_state();
- x->rdmult = (int)round(x->rdmult * rdmult_ratio);
+ x->rdmult = set_segment_rdmult(cpi, x, mbmi->segment_id);
} else if (aq_mode == COMPLEXITY_AQ) {
- const int mi_offset = mi_row * cm->mi_cols + mi_col;
- unsigned char complexity = cpi->complexity_map[mi_offset];
- const int is_edge = (mi_row <= 1) || (mi_row >= (cm->mi_rows - 2)) ||
- (mi_col <= 1) || (mi_col >= (cm->mi_cols - 2));
- if (!is_edge && (complexity > 128))
- x->rdmult += ((x->rdmult * (complexity - 128)) / 256);
+ x->rdmult = set_segment_rdmult(cpi, x, mbmi->segment_id);
} else if (aq_mode == CYCLIC_REFRESH_AQ) {
const uint8_t *const map = cm->seg.update_map ? cpi->segmentation_map
: cm->last_frame_seg_map;
- // If segment 1, use rdmult for that segment.
- if (vp9_get_segment_id(cm, map, bsize, mi_row, mi_col))
+ // If segment is boosted, use rdmult for that segment.
+ if (cyclic_refresh_segment_id_boosted(
+ vp9_get_segment_id(cm, map, bsize, mi_row, mi_col)))
x->rdmult = vp9_cyclic_refresh_get_rdmult(cpi->cyclic_refresh);
}
@@ -872,21 +1241,25 @@ static void rd_pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile,
} else {
if (bsize >= BLOCK_8X8) {
if (vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP))
- vp9_rd_pick_inter_mode_sb_seg_skip(cpi, x, rd_cost, bsize,
+ vp9_rd_pick_inter_mode_sb_seg_skip(cpi, tile_data, x, rd_cost, bsize,
ctx, best_rd);
else
- vp9_rd_pick_inter_mode_sb(cpi, x, tile, mi_row, mi_col,
+ vp9_rd_pick_inter_mode_sb(cpi, tile_data, x, mi_row, mi_col,
rd_cost, bsize, ctx, best_rd);
} else {
- vp9_rd_pick_inter_mode_sub8x8(cpi, x, tile, mi_row, mi_col, rd_cost,
- bsize, ctx, best_rd);
+ vp9_rd_pick_inter_mode_sub8x8(cpi, tile_data, x, mi_row, mi_col,
+ rd_cost, bsize, ctx, best_rd);
}
}
- if (aq_mode == VARIANCE_AQ && rd_cost->rate != INT_MAX) {
- vp9_clear_system_state();
- rd_cost->rate = (int)round(rd_cost->rate * rdmult_ratio);
- rd_cost->rdcost = RDCOST(x->rdmult, x->rddiv, rd_cost->rate, rd_cost->dist);
+
+ // Examine the resulting rate and, for AQ mode 2, make a segment choice.
+ if ((rd_cost->rate != INT_MAX) &&
+ (aq_mode == COMPLEXITY_AQ) && (bsize >= BLOCK_16X16) &&
+ (cm->frame_type == KEY_FRAME ||
+ cpi->refresh_alt_ref_frame ||
+ (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref))) {
+ vp9_caq_select_segment(cpi, x, bsize, mi_row, mi_col, rd_cost->rate);
}
x->rdmult = orig_rdmult;
@@ -895,28 +1268,30 @@ static void rd_pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile,
// refactored to provide proper exit/return handle.
if (rd_cost->rate == INT_MAX)
rd_cost->rdcost = INT64_MAX;
+
+ ctx->rate = rd_cost->rate;
+ ctx->dist = rd_cost->dist;
}
-static void update_stats(VP9_COMMON *cm, const MACROBLOCK *x) {
+static void update_stats(VP9_COMMON *cm, ThreadData *td) {
+ const MACROBLOCK *x = &td->mb;
const MACROBLOCKD *const xd = &x->e_mbd;
- const MODE_INFO *const mi = xd->mi[0].src_mi;
+ const MODE_INFO *const mi = xd->mi[0];
const MB_MODE_INFO *const mbmi = &mi->mbmi;
+ const BLOCK_SIZE bsize = mbmi->sb_type;
if (!frame_is_intra_only(cm)) {
+ FRAME_COUNTS *const counts = td->counts;
+ const int inter_block = is_inter_block(mbmi);
const int seg_ref_active = vp9_segfeature_active(&cm->seg, mbmi->segment_id,
SEG_LVL_REF_FRAME);
if (!seg_ref_active) {
- FRAME_COUNTS *const counts = &cm->counts;
- const int inter_block = is_inter_block(mbmi);
-
counts->intra_inter[vp9_get_intra_inter_context(xd)][inter_block]++;
-
// If the segment reference feature is enabled we have only a single
// reference frame allowed for the segment so exclude it from
// the reference frame counts used to work out probabilities.
if (inter_block) {
const MV_REFERENCE_FRAME ref0 = mbmi->ref_frame[0];
-
if (cm->reference_mode == REFERENCE_MODE_SELECT)
counts->comp_inter[vp9_get_reference_mode_context(cm, xd)]
[has_second_ref(mbmi)]++;
@@ -933,15 +1308,33 @@ static void update_stats(VP9_COMMON *cm, const MACROBLOCK *x) {
}
}
}
+ if (inter_block &&
+ !vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
+ const int mode_ctx = mbmi->mode_context[mbmi->ref_frame[0]];
+ if (bsize >= BLOCK_8X8) {
+ const PREDICTION_MODE mode = mbmi->mode;
+ ++counts->inter_mode[mode_ctx][INTER_OFFSET(mode)];
+ } else {
+ const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize];
+ const int num_4x4_h = num_4x4_blocks_high_lookup[bsize];
+ int idx, idy;
+ for (idy = 0; idy < 2; idy += num_4x4_h) {
+ for (idx = 0; idx < 2; idx += num_4x4_w) {
+ const int j = idy * 2 + idx;
+ const PREDICTION_MODE b_mode = mi->bmi[j].as_mode;
+ ++counts->inter_mode[mode_ctx][INTER_OFFSET(b_mode)];
+ }
+ }
+ }
+ }
}
}
-static void restore_context(VP9_COMP *cpi, int mi_row, int mi_col,
+static void restore_context(MACROBLOCK *const x, int mi_row, int mi_col,
ENTROPY_CONTEXT a[16 * MAX_MB_PLANE],
ENTROPY_CONTEXT l[16 * MAX_MB_PLANE],
PARTITION_CONTEXT sa[8], PARTITION_CONTEXT sl[8],
BLOCK_SIZE bsize) {
- MACROBLOCK *const x = &cpi->mb;
MACROBLOCKD *const xd = &x->e_mbd;
int p;
const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
@@ -949,30 +1342,29 @@ static void restore_context(VP9_COMP *cpi, int mi_row, int mi_col,
int mi_width = num_8x8_blocks_wide_lookup[bsize];
int mi_height = num_8x8_blocks_high_lookup[bsize];
for (p = 0; p < MAX_MB_PLANE; p++) {
- vpx_memcpy(
+ memcpy(
xd->above_context[p] + ((mi_col * 2) >> xd->plane[p].subsampling_x),
a + num_4x4_blocks_wide * p,
(sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_wide) >>
xd->plane[p].subsampling_x);
- vpx_memcpy(
+ memcpy(
xd->left_context[p]
+ ((mi_row & MI_MASK) * 2 >> xd->plane[p].subsampling_y),
l + num_4x4_blocks_high * p,
(sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_high) >>
xd->plane[p].subsampling_y);
}
- vpx_memcpy(xd->above_seg_context + mi_col, sa,
- sizeof(*xd->above_seg_context) * mi_width);
- vpx_memcpy(xd->left_seg_context + (mi_row & MI_MASK), sl,
- sizeof(xd->left_seg_context[0]) * mi_height);
+ memcpy(xd->above_seg_context + mi_col, sa,
+ sizeof(*xd->above_seg_context) * mi_width);
+ memcpy(xd->left_seg_context + (mi_row & MI_MASK), sl,
+ sizeof(xd->left_seg_context[0]) * mi_height);
}
-static void save_context(VP9_COMP *cpi, int mi_row, int mi_col,
+static void save_context(MACROBLOCK *const x, int mi_row, int mi_col,
ENTROPY_CONTEXT a[16 * MAX_MB_PLANE],
ENTROPY_CONTEXT l[16 * MAX_MB_PLANE],
PARTITION_CONTEXT sa[8], PARTITION_CONTEXT sl[8],
BLOCK_SIZE bsize) {
- const MACROBLOCK *const x = &cpi->mb;
const MACROBLOCKD *const xd = &x->e_mbd;
int p;
const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
@@ -982,46 +1374,49 @@ static void save_context(VP9_COMP *cpi, int mi_row, int mi_col,
// buffer the above/left context information of the block in search.
for (p = 0; p < MAX_MB_PLANE; ++p) {
- vpx_memcpy(
+ memcpy(
a + num_4x4_blocks_wide * p,
xd->above_context[p] + (mi_col * 2 >> xd->plane[p].subsampling_x),
(sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_wide) >>
xd->plane[p].subsampling_x);
- vpx_memcpy(
+ memcpy(
l + num_4x4_blocks_high * p,
xd->left_context[p]
+ ((mi_row & MI_MASK) * 2 >> xd->plane[p].subsampling_y),
(sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_high) >>
xd->plane[p].subsampling_y);
}
- vpx_memcpy(sa, xd->above_seg_context + mi_col,
- sizeof(*xd->above_seg_context) * mi_width);
- vpx_memcpy(sl, xd->left_seg_context + (mi_row & MI_MASK),
- sizeof(xd->left_seg_context[0]) * mi_height);
+ memcpy(sa, xd->above_seg_context + mi_col,
+ sizeof(*xd->above_seg_context) * mi_width);
+ memcpy(sl, xd->left_seg_context + (mi_row & MI_MASK),
+ sizeof(xd->left_seg_context[0]) * mi_height);
}
static void encode_b(VP9_COMP *cpi, const TileInfo *const tile,
+ ThreadData *td,
TOKENEXTRA **tp, int mi_row, int mi_col,
int output_enabled, BLOCK_SIZE bsize,
PICK_MODE_CONTEXT *ctx) {
- set_offsets(cpi, tile, mi_row, mi_col, bsize);
- update_state(cpi, ctx, mi_row, mi_col, bsize, output_enabled);
- encode_superblock(cpi, tp, output_enabled, mi_row, mi_col, bsize, ctx);
+ MACROBLOCK *const x = &td->mb;
+ set_offsets(cpi, tile, x, mi_row, mi_col, bsize);
+ update_state(cpi, td, ctx, mi_row, mi_col, bsize, output_enabled);
+ encode_superblock(cpi, td, tp, output_enabled, mi_row, mi_col, bsize, ctx);
if (output_enabled) {
- update_stats(&cpi->common, &cpi->mb);
+ update_stats(&cpi->common, td);
(*tp)->token = EOSB_TOKEN;
(*tp)++;
}
}
-static void encode_sb(VP9_COMP *cpi, const TileInfo *const tile,
+static void encode_sb(VP9_COMP *cpi, ThreadData *td,
+ const TileInfo *const tile,
TOKENEXTRA **tp, int mi_row, int mi_col,
int output_enabled, BLOCK_SIZE bsize,
PC_TREE *pc_tree) {
VP9_COMMON *const cm = &cpi->common;
- MACROBLOCK *const x = &cpi->mb;
+ MACROBLOCK *const x = &td->mb;
MACROBLOCKD *const xd = &x->e_mbd;
const int bsl = b_width_log2_lookup[bsize], hbs = (1 << bsl) / 4;
@@ -1042,46 +1437,46 @@ static void encode_sb(VP9_COMP *cpi, const TileInfo *const tile,
partition = partition_lookup[bsl][subsize];
if (output_enabled && bsize != BLOCK_4X4)
- cm->counts.partition[ctx][partition]++;
+ td->counts->partition[ctx][partition]++;
switch (partition) {
case PARTITION_NONE:
- encode_b(cpi, tile, tp, mi_row, mi_col, output_enabled, subsize,
+ encode_b(cpi, tile, td, tp, mi_row, mi_col, output_enabled, subsize,
&pc_tree->none);
break;
case PARTITION_VERT:
- encode_b(cpi, tile, tp, mi_row, mi_col, output_enabled, subsize,
+ encode_b(cpi, tile, td, tp, mi_row, mi_col, output_enabled, subsize,
&pc_tree->vertical[0]);
if (mi_col + hbs < cm->mi_cols && bsize > BLOCK_8X8) {
- encode_b(cpi, tile, tp, mi_row, mi_col + hbs, output_enabled, subsize,
- &pc_tree->vertical[1]);
+ encode_b(cpi, tile, td, tp, mi_row, mi_col + hbs, output_enabled,
+ subsize, &pc_tree->vertical[1]);
}
break;
case PARTITION_HORZ:
- encode_b(cpi, tile, tp, mi_row, mi_col, output_enabled, subsize,
+ encode_b(cpi, tile, td, tp, mi_row, mi_col, output_enabled, subsize,
&pc_tree->horizontal[0]);
if (mi_row + hbs < cm->mi_rows && bsize > BLOCK_8X8) {
- encode_b(cpi, tile, tp, mi_row + hbs, mi_col, output_enabled, subsize,
- &pc_tree->horizontal[1]);
+ encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col, output_enabled,
+ subsize, &pc_tree->horizontal[1]);
}
break;
case PARTITION_SPLIT:
if (bsize == BLOCK_8X8) {
- encode_b(cpi, tile, tp, mi_row, mi_col, output_enabled, subsize,
+ encode_b(cpi, tile, td, tp, mi_row, mi_col, output_enabled, subsize,
pc_tree->leaf_split[0]);
} else {
- encode_sb(cpi, tile, tp, mi_row, mi_col, output_enabled, subsize,
+ encode_sb(cpi, td, tile, tp, mi_row, mi_col, output_enabled, subsize,
pc_tree->split[0]);
- encode_sb(cpi, tile, tp, mi_row, mi_col + hbs, output_enabled, subsize,
- pc_tree->split[1]);
- encode_sb(cpi, tile, tp, mi_row + hbs, mi_col, output_enabled, subsize,
- pc_tree->split[2]);
- encode_sb(cpi, tile, tp, mi_row + hbs, mi_col + hbs, output_enabled,
+ encode_sb(cpi, td, tile, tp, mi_row, mi_col + hbs, output_enabled,
+ subsize, pc_tree->split[1]);
+ encode_sb(cpi, td, tile, tp, mi_row + hbs, mi_col, output_enabled,
+ subsize, pc_tree->split[2]);
+ encode_sb(cpi, td, tile, tp, mi_row + hbs, mi_col + hbs, output_enabled,
subsize, pc_tree->split[3]);
}
break;
default:
- assert("Invalid partition type.");
+ assert(0 && "Invalid partition type.");
break;
}
@@ -1111,15 +1506,15 @@ static BLOCK_SIZE find_partition_size(BLOCK_SIZE bsize,
static void set_partial_b64x64_partition(MODE_INFO *mi, int mis,
int bh_in, int bw_in, int row8x8_remaining, int col8x8_remaining,
- BLOCK_SIZE bsize, MODE_INFO *mi_8x8) {
+ BLOCK_SIZE bsize, MODE_INFO **mi_8x8) {
int bh = bh_in;
int r, c;
for (r = 0; r < MI_BLOCK_SIZE; r += bh) {
int bw = bw_in;
for (c = 0; c < MI_BLOCK_SIZE; c += bw) {
const int index = r * mis + c;
- mi_8x8[index].src_mi = mi + index;
- mi_8x8[index].src_mi->mbmi.sb_type = find_partition_size(bsize,
+ mi_8x8[index] = mi + index;
+ mi_8x8[index]->mbmi.sb_type = find_partition_size(bsize,
row8x8_remaining - r, col8x8_remaining - c, &bh, &bw);
}
}
@@ -1131,7 +1526,7 @@ static void set_partial_b64x64_partition(MODE_INFO *mi, int mis,
// may not be allowed in which case this code attempts to choose the largest
// allowable partition.
static void set_fixed_partitioning(VP9_COMP *cpi, const TileInfo *const tile,
- MODE_INFO *mi_8x8, int mi_row, int mi_col,
+ MODE_INFO **mi_8x8, int mi_row, int mi_col,
BLOCK_SIZE bsize) {
VP9_COMMON *const cm = &cpi->common;
const int mis = cm->mi_stride;
@@ -1150,8 +1545,8 @@ static void set_fixed_partitioning(VP9_COMP *cpi, const TileInfo *const tile,
for (block_row = 0; block_row < MI_BLOCK_SIZE; block_row += bh) {
for (block_col = 0; block_col < MI_BLOCK_SIZE; block_col += bw) {
int index = block_row * mis + block_col;
- mi_8x8[index].src_mi = mi_upper_left + index;
- mi_8x8[index].src_mi->mbmi.sb_type = bsize;
+ mi_8x8[index] = mi_upper_left + index;
+ mi_8x8[index]->mbmi.sb_type = bsize;
}
}
} else {
@@ -1161,79 +1556,6 @@ static void set_fixed_partitioning(VP9_COMP *cpi, const TileInfo *const tile,
}
}
-static void copy_partitioning(VP9_COMMON *cm, MODE_INFO *mi_8x8,
- MODE_INFO *prev_mi_8x8) {
- const int mis = cm->mi_stride;
- int block_row, block_col;
-
- for (block_row = 0; block_row < 8; ++block_row) {
- for (block_col = 0; block_col < 8; ++block_col) {
- MODE_INFO *const prev_mi =
- prev_mi_8x8[block_row * mis + block_col].src_mi;
- const BLOCK_SIZE sb_type = prev_mi ? prev_mi->mbmi.sb_type : 0;
-
- if (prev_mi) {
- const ptrdiff_t offset = prev_mi - cm->prev_mi;
- mi_8x8[block_row * mis + block_col].src_mi = cm->mi + offset;
- mi_8x8[block_row * mis + block_col].src_mi->mbmi.sb_type = sb_type;
- }
- }
- }
-}
-
-static void constrain_copy_partitioning(VP9_COMP *const cpi,
- const TileInfo *const tile,
- MODE_INFO *mi_8x8,
- MODE_INFO *prev_mi_8x8,
- int mi_row, int mi_col,
- BLOCK_SIZE bsize) {
- VP9_COMMON *const cm = &cpi->common;
- const int mis = cm->mi_stride;
- const int row8x8_remaining = tile->mi_row_end - mi_row;
- const int col8x8_remaining = tile->mi_col_end - mi_col;
- MODE_INFO *const mi_upper_left = cm->mi + mi_row * mis + mi_col;
- const int bh = num_8x8_blocks_high_lookup[bsize];
- const int bw = num_8x8_blocks_wide_lookup[bsize];
- int block_row, block_col;
-
- assert((row8x8_remaining > 0) && (col8x8_remaining > 0));
-
- // If the SB64 is all "in image".
- if ((col8x8_remaining >= MI_BLOCK_SIZE) &&
- (row8x8_remaining >= MI_BLOCK_SIZE)) {
- for (block_row = 0; block_row < MI_BLOCK_SIZE; block_row += bh) {
- for (block_col = 0; block_col < MI_BLOCK_SIZE; block_col += bw) {
- const int index = block_row * mis + block_col;
- MODE_INFO *prev_mi = prev_mi_8x8[index].src_mi;
- const BLOCK_SIZE sb_type = prev_mi ? prev_mi->mbmi.sb_type : 0;
- // Use previous partition if block size is not larger than bsize.
- if (prev_mi && sb_type <= bsize) {
- int block_row2, block_col2;
- for (block_row2 = 0; block_row2 < bh; ++block_row2) {
- for (block_col2 = 0; block_col2 < bw; ++block_col2) {
- const int index2 = (block_row + block_row2) * mis +
- block_col + block_col2;
- prev_mi = prev_mi_8x8[index2].src_mi;
- if (prev_mi) {
- const ptrdiff_t offset = prev_mi - cm->prev_mi;
- mi_8x8[index2].src_mi = cm->mi + offset;
- mi_8x8[index2].src_mi->mbmi.sb_type = prev_mi->mbmi.sb_type;
- }
- }
- }
- } else {
- // Otherwise, use fixed partition of size bsize.
- mi_8x8[index].src_mi = mi_upper_left + index;
- mi_8x8[index].src_mi->mbmi.sb_type = bsize;
- }
- }
- }
- } else {
- // Else this is a partial SB64, copy previous partition.
- copy_partitioning(cm, mi_8x8, prev_mi_8x8);
- }
-}
-
const struct {
int row;
int col;
@@ -1250,10 +1572,10 @@ const struct {
static void set_source_var_based_partition(VP9_COMP *cpi,
const TileInfo *const tile,
- MODE_INFO *mi_8x8,
+ MACROBLOCK *const x,
+ MODE_INFO **mi_8x8,
int mi_row, int mi_col) {
VP9_COMMON *const cm = &cpi->common;
- MACROBLOCK *const x = &cpi->mb;
const int mis = cm->mi_stride;
const int row8x8_remaining = tile->mi_row_end - mi_row;
const int col8x8_remaining = tile->mi_col_end - mi_col;
@@ -1274,7 +1596,7 @@ static void set_source_var_based_partition(VP9_COMP *cpi,
int use32x32 = 0;
unsigned int thr = cpi->source_var_thresh;
- vpx_memset(d32, 0, 4 * sizeof(diff));
+ memset(d32, 0, 4 * sizeof(diff));
for (i = 0; i < 4; i++) {
diff *d16[4];
@@ -1288,8 +1610,8 @@ static void set_source_var_based_partition(VP9_COMP *cpi,
d16[j] = cpi->source_diff_var + offset + boffset;
index = b_mi_row * mis + b_mi_col;
- mi_8x8[index].src_mi = mi_upper_left + index;
- mi_8x8[index].src_mi->mbmi.sb_type = BLOCK_16X16;
+ mi_8x8[index] = mi_upper_left + index;
+ mi_8x8[index]->mbmi.sb_type = BLOCK_16X16;
// TODO(yunqingwang): If d16[j].var is very large, use 8x8 partition
// size to further improve quality.
@@ -1310,8 +1632,8 @@ static void set_source_var_based_partition(VP9_COMP *cpi,
d32[i].var = d32[i].sse - (((int64_t)d32[i].sum * d32[i].sum) >> 10);
index = coord_lookup[i*4].row * mis + coord_lookup[i*4].col;
- mi_8x8[index].src_mi = mi_upper_left + index;
- mi_8x8[index].src_mi->mbmi.sb_type = BLOCK_32X32;
+ mi_8x8[index] = mi_upper_left + index;
+ mi_8x8[index]->mbmi.sb_type = BLOCK_32X32;
}
}
@@ -1322,8 +1644,8 @@ static void set_source_var_based_partition(VP9_COMP *cpi,
// Use 64x64 partition
if (is_larger_better) {
- mi_8x8[0].src_mi = mi_upper_left;
- mi_8x8[0].src_mi->mbmi.sb_type = BLOCK_64X64;
+ mi_8x8[0] = mi_upper_left;
+ mi_8x8[0]->mbmi.sb_type = BLOCK_64X64;
}
}
} else { // partial in-image SB64
@@ -1334,67 +1656,21 @@ static void set_source_var_based_partition(VP9_COMP *cpi,
}
}
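
The 32x32 statistics above come from summing four 16x16 sums and SSEs and applying the variance identity N*var = SSE - sum^2/N; a 32x32 block has 1024 samples, hence the >> 10. A small worked sketch under that assumption (this diff struct is a simplified stand-in for the libvpx one):

    #include <stdint.h>

    typedef struct { int64_t sum; int64_t sse; int64_t var; } diff;  /* stand-in */

    /* var (scaled by N) = SSE - sum^2 / N, with N = 1024 = 2^10 for 32x32. */
    static void compute_var32(diff *d32) {
      d32->var = d32->sse - ((d32->sum * d32->sum) >> 10);
    }

For example, 1024 samples of constant value 4 give sum = 4096 and sse = 16384, so var = 16384 - (4096 * 4096 >> 10) = 0, as expected for a flat block.
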
-static int is_background(const VP9_COMP *cpi, const TileInfo *const tile,
- int mi_row, int mi_col) {
- // This assumes the input source frames are of the same dimension.
- const int row8x8_remaining = tile->mi_row_end - mi_row;
- const int col8x8_remaining = tile->mi_col_end - mi_col;
- const int x = mi_col * MI_SIZE;
- const int y = mi_row * MI_SIZE;
- const int src_stride = cpi->Source->y_stride;
- const uint8_t *const src = &cpi->Source->y_buffer[y * src_stride + x];
- const int pre_stride = cpi->Last_Source->y_stride;
- const uint8_t *const pre = &cpi->Last_Source->y_buffer[y * pre_stride + x];
- int this_sad = 0;
- int threshold = 0;
-
- if (row8x8_remaining >= MI_BLOCK_SIZE &&
- col8x8_remaining >= MI_BLOCK_SIZE) {
- this_sad = cpi->fn_ptr[BLOCK_64X64].sdf(src, src_stride, pre, pre_stride);
- threshold = (1 << 12);
- } else {
- int r, c;
- for (r = 0; r < row8x8_remaining; r += 2)
- for (c = 0; c < col8x8_remaining; c += 2)
- this_sad += cpi->fn_ptr[BLOCK_16X16].sdf(src, src_stride,
- pre, pre_stride);
- threshold = (row8x8_remaining * col8x8_remaining) << 6;
- }
-
- return this_sad < 2 * threshold;
-}
-
-static int sb_has_motion(const VP9_COMMON *cm, MODE_INFO *prev_mi_8x8,
- const int motion_thresh) {
- const int mis = cm->mi_stride;
- int block_row, block_col;
-
- if (cm->prev_mi) {
- for (block_row = 0; block_row < 8; ++block_row) {
- for (block_col = 0; block_col < 8; ++block_col) {
- const MODE_INFO *prev_mi =
- prev_mi_8x8[block_row * mis + block_col].src_mi;
- if (prev_mi) {
- if (abs(prev_mi->mbmi.mv[0].as_mv.row) > motion_thresh ||
- abs(prev_mi->mbmi.mv[0].as_mv.col) > motion_thresh)
- return 1;
- }
- }
- }
- }
- return 0;
-}
-
-static void update_state_rt(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
+static void update_state_rt(VP9_COMP *cpi, ThreadData *td,
+ PICK_MODE_CONTEXT *ctx,
int mi_row, int mi_col, int bsize) {
VP9_COMMON *const cm = &cpi->common;
- MACROBLOCK *const x = &cpi->mb;
+ MACROBLOCK *const x = &td->mb;
MACROBLOCKD *const xd = &x->e_mbd;
- MB_MODE_INFO *const mbmi = &xd->mi[0].src_mi->mbmi;
+ MODE_INFO *const mi = xd->mi[0];
+ MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
const struct segmentation *const seg = &cm->seg;
+ const int bw = num_8x8_blocks_wide_lookup[mi->mbmi.sb_type];
+ const int bh = num_8x8_blocks_high_lookup[mi->mbmi.sb_type];
+ const int x_mis = MIN(bw, cm->mi_cols - mi_col);
+ const int y_mis = MIN(bh, cm->mi_rows - mi_row);
- *(xd->mi[0].src_mi) = ctx->mic;
- xd->mi[0].src_mi = &xd->mi[0];
+ *(xd->mi[0]) = ctx->mic;
if (seg->enabled && cpi->oxcf.aq_mode) {
// For in frame complexity AQ or variance AQ, copy segment_id from
@@ -1405,18 +1681,40 @@ static void update_state_rt(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
: cm->last_frame_seg_map;
mbmi->segment_id = vp9_get_segment_id(cm, map, bsize, mi_row, mi_col);
} else {
- // Setting segmentation map for cyclic_refresh
- vp9_cyclic_refresh_update_segment(cpi, mbmi, mi_row, mi_col, bsize, 1);
+ // Setting segmentation map for cyclic_refresh.
+ vp9_cyclic_refresh_update_segment(cpi, mbmi, mi_row, mi_col, bsize,
+ ctx->rate, ctx->dist, x->skip);
}
vp9_init_plane_quantizers(cpi, x);
}
if (is_inter_block(mbmi)) {
- vp9_update_mv_count(cm, xd);
-
+ vp9_update_mv_count(td);
if (cm->interp_filter == SWITCHABLE) {
const int pred_ctx = vp9_get_pred_context_switchable_interp(xd);
- ++cm->counts.switchable_interp[pred_ctx][mbmi->interp_filter];
+ ++td->counts->switchable_interp[pred_ctx][mbmi->interp_filter];
+ }
+
+ if (mbmi->sb_type < BLOCK_8X8) {
+ mbmi->mv[0].as_int = mi->bmi[3].as_mv[0].as_int;
+ mbmi->mv[1].as_int = mi->bmi[3].as_mv[1].as_int;
+ }
+ }
+
+ if (cm->use_prev_frame_mvs) {
+ MV_REF *const frame_mvs =
+ cm->cur_frame->mvs + mi_row * cm->mi_cols + mi_col;
+ int w, h;
+
+ for (h = 0; h < y_mis; ++h) {
+ MV_REF *const frame_mv = frame_mvs + h * cm->mi_cols;
+ for (w = 0; w < x_mis; ++w) {
+ MV_REF *const mv = frame_mv + w;
+ mv->ref_frame[0] = mi->mbmi.ref_frame[0];
+ mv->ref_frame[1] = mi->mbmi.ref_frame[1];
+ mv->mv[0].as_int = mi->mbmi.mv[0].as_int;
+ mv->mv[1].as_int = mi->mbmi.mv[1].as_int;
+ }
}
}
@@ -1424,33 +1722,37 @@ static void update_state_rt(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
x->skip_txfm[0] = mbmi->segment_id ? 0 : ctx->skip_txfm[0];
}
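
The new use_prev_frame_mvs block stores this block's reference frames and motion vectors into cm->cur_frame->mvs, a row-major grid with one MV_REF per 8x8 mi unit, clamping the written window at the right and bottom frame edges. A reduced sketch of that clamped copy (stub types, not the libvpx structs):

    #include <stdint.h>

    typedef struct { int32_t as_int; } int_mv;
    typedef struct { int8_t ref_frame[2]; int_mv mv[2]; } MV_REF;  /* stand-in */

    static int imin(int a, int b) { return a < b ? a : b; }

    /* Write one block's MV_REF into an x_mis-by-y_mis window of the
     * frame-wide grid; bw/bh are the block size in 8x8 mi units. */
    static void save_frame_mvs(MV_REF *grid, int mi_cols, int mi_rows,
                               int mi_row, int mi_col, int bw, int bh,
                               MV_REF cur) {
      const int x_mis = imin(bw, mi_cols - mi_col);
      const int y_mis = imin(bh, mi_rows - mi_row);
      int w, h;
      MV_REF *row = grid + mi_row * mi_cols + mi_col;
      for (h = 0; h < y_mis; ++h, row += mi_cols)
        for (w = 0; w < x_mis; ++w)
          row[w] = cur;  /* copies ref_frame[] and mv[] together */
    }
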
-static void encode_b_rt(VP9_COMP *cpi, const TileInfo *const tile,
+static void encode_b_rt(VP9_COMP *cpi, ThreadData *td,
+ const TileInfo *const tile,
TOKENEXTRA **tp, int mi_row, int mi_col,
- int output_enabled, BLOCK_SIZE bsize,
- PICK_MODE_CONTEXT *ctx) {
- set_offsets(cpi, tile, mi_row, mi_col, bsize);
- update_state_rt(cpi, ctx, mi_row, mi_col, bsize);
+ int output_enabled, BLOCK_SIZE bsize,
+ PICK_MODE_CONTEXT *ctx) {
+ MACROBLOCK *const x = &td->mb;
+ set_offsets(cpi, tile, x, mi_row, mi_col, bsize);
+ update_state_rt(cpi, td, ctx, mi_row, mi_col, bsize);
#if CONFIG_VP9_TEMPORAL_DENOISING
- if (cpi->oxcf.noise_sensitivity > 0 && output_enabled) {
- vp9_denoiser_denoise(&cpi->denoiser, &cpi->mb, mi_row, mi_col,
+ if (cpi->oxcf.noise_sensitivity > 0 && output_enabled &&
+ cpi->common.frame_type != KEY_FRAME) {
+ vp9_denoiser_denoise(&cpi->denoiser, x, mi_row, mi_col,
MAX(BLOCK_8X8, bsize), ctx);
}
#endif
- encode_superblock(cpi, tp, output_enabled, mi_row, mi_col, bsize, ctx);
- update_stats(&cpi->common, &cpi->mb);
+ encode_superblock(cpi, td, tp, output_enabled, mi_row, mi_col, bsize, ctx);
+ update_stats(&cpi->common, td);
(*tp)->token = EOSB_TOKEN;
(*tp)++;
}
-static void encode_sb_rt(VP9_COMP *cpi, const TileInfo *const tile,
+static void encode_sb_rt(VP9_COMP *cpi, ThreadData *td,
+ const TileInfo *const tile,
TOKENEXTRA **tp, int mi_row, int mi_col,
int output_enabled, BLOCK_SIZE bsize,
PC_TREE *pc_tree) {
VP9_COMMON *const cm = &cpi->common;
- MACROBLOCK *const x = &cpi->mb;
+ MACROBLOCK *const x = &td->mb;
MACROBLOCKD *const xd = &x->e_mbd;
const int bsl = b_width_log2_lookup[bsize], hbs = (1 << bsl) / 4;
@@ -1463,9 +1765,9 @@ static void encode_sb_rt(VP9_COMP *cpi, const TileInfo *const tile,
if (bsize >= BLOCK_8X8) {
const int idx_str = xd->mi_stride * mi_row + mi_col;
- MODE_INFO *mi_8x8 = cm->mi[idx_str].src_mi;
+ MODE_INFO ** mi_8x8 = cm->mi_grid_visible + idx_str;
ctx = partition_plane_context(xd, mi_row, mi_col, bsize);
- subsize = mi_8x8[0].src_mi->mbmi.sb_type;
+ subsize = mi_8x8[0]->mbmi.sb_type;
} else {
ctx = 0;
subsize = BLOCK_4X4;
@@ -1473,42 +1775,42 @@ static void encode_sb_rt(VP9_COMP *cpi, const TileInfo *const tile,
partition = partition_lookup[bsl][subsize];
if (output_enabled && bsize != BLOCK_4X4)
- cm->counts.partition[ctx][partition]++;
+ td->counts->partition[ctx][partition]++;
switch (partition) {
case PARTITION_NONE:
- encode_b_rt(cpi, tile, tp, mi_row, mi_col, output_enabled, subsize,
+ encode_b_rt(cpi, td, tile, tp, mi_row, mi_col, output_enabled, subsize,
&pc_tree->none);
break;
case PARTITION_VERT:
- encode_b_rt(cpi, tile, tp, mi_row, mi_col, output_enabled, subsize,
+ encode_b_rt(cpi, td, tile, tp, mi_row, mi_col, output_enabled, subsize,
&pc_tree->vertical[0]);
if (mi_col + hbs < cm->mi_cols && bsize > BLOCK_8X8) {
- encode_b_rt(cpi, tile, tp, mi_row, mi_col + hbs, output_enabled,
+ encode_b_rt(cpi, td, tile, tp, mi_row, mi_col + hbs, output_enabled,
subsize, &pc_tree->vertical[1]);
}
break;
case PARTITION_HORZ:
- encode_b_rt(cpi, tile, tp, mi_row, mi_col, output_enabled, subsize,
+ encode_b_rt(cpi, td, tile, tp, mi_row, mi_col, output_enabled, subsize,
&pc_tree->horizontal[0]);
if (mi_row + hbs < cm->mi_rows && bsize > BLOCK_8X8) {
- encode_b_rt(cpi, tile, tp, mi_row + hbs, mi_col, output_enabled,
+ encode_b_rt(cpi, td, tile, tp, mi_row + hbs, mi_col, output_enabled,
subsize, &pc_tree->horizontal[1]);
}
break;
case PARTITION_SPLIT:
subsize = get_subsize(bsize, PARTITION_SPLIT);
- encode_sb_rt(cpi, tile, tp, mi_row, mi_col, output_enabled, subsize,
+ encode_sb_rt(cpi, td, tile, tp, mi_row, mi_col, output_enabled, subsize,
pc_tree->split[0]);
- encode_sb_rt(cpi, tile, tp, mi_row, mi_col + hbs, output_enabled,
+ encode_sb_rt(cpi, td, tile, tp, mi_row, mi_col + hbs, output_enabled,
subsize, pc_tree->split[1]);
- encode_sb_rt(cpi, tile, tp, mi_row + hbs, mi_col, output_enabled,
+ encode_sb_rt(cpi, td, tile, tp, mi_row + hbs, mi_col, output_enabled,
subsize, pc_tree->split[2]);
- encode_sb_rt(cpi, tile, tp, mi_row + hbs, mi_col + hbs, output_enabled,
- subsize, pc_tree->split[3]);
+ encode_sb_rt(cpi, td, tile, tp, mi_row + hbs, mi_col + hbs,
+ output_enabled, subsize, pc_tree->split[3]);
break;
default:
- assert("Invalid partition type.");
+ assert(0 && "Invalid partition type.");
break;
}
@@ -1516,13 +1818,17 @@ static void encode_sb_rt(VP9_COMP *cpi, const TileInfo *const tile,
update_partition_context(xd, mi_row, mi_col, subsize, bsize);
}
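
encode_sb_rt, like its RD counterpart, walks the partition tree using hbs, half the block width in 8x8 mi units: (1 << b_width_log2_lookup[bsize]) is the width in 4x4 units, and dividing by 4 halves it in mi units. A sketch of the offset arithmetic, with the split-child numbering used above (child 1 right, child 2 below, child 3 diagonal):

    /* For square bsize, width_in_4x4 = 1 << bwl; dividing by 4 gives
     * half the width in 8x8 mi units (BLOCK_64X64: (1 << 4) / 4 = 4). */
    static int half_block_mi(int bwl) { return (1 << bwl) / 4; }

    /* Origin of split child i relative to the parent block. */
    static void split_child_origin(int mi_row, int mi_col, int hbs, int i,
                                   int *row, int *col) {
      *row = mi_row + (i >> 1) * hbs;  /* children 2, 3: lower half */
      *col = mi_col + (i & 1) * hbs;   /* children 1, 3: right half */
    }
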
-static void rd_use_partition(VP9_COMP *cpi, const TileInfo *const tile,
- MODE_INFO *mi_8x8, TOKENEXTRA **tp,
+static void rd_use_partition(VP9_COMP *cpi,
+ ThreadData *td,
+ TileDataEnc *tile_data,
+ MODE_INFO **mi_8x8, TOKENEXTRA **tp,
int mi_row, int mi_col,
- BLOCK_SIZE bsize, int *rate, int64_t *dist,
+ BLOCK_SIZE bsize,
+ int *rate, int64_t *dist,
int do_recon, PC_TREE *pc_tree) {
VP9_COMMON *const cm = &cpi->common;
- MACROBLOCK *const x = &cpi->mb;
+ TileInfo *const tile_info = &tile_data->tile_info;
+ MACROBLOCK *const x = &td->mb;
MACROBLOCKD *const xd = &x->e_mbd;
const int mis = cm->mi_stride;
const int bsl = b_width_log2_lookup[bsize];
@@ -1536,7 +1842,7 @@ static void rd_use_partition(VP9_COMP *cpi, const TileInfo *const tile,
RD_COST last_part_rdc, none_rdc, chosen_rdc;
BLOCK_SIZE sub_subsize = BLOCK_4X4;
int splits_below = 0;
- BLOCK_SIZE bs_type = mi_8x8[0].src_mi->mbmi.sb_type;
+ BLOCK_SIZE bs_type = mi_8x8[0]->mbmi.sb_type;
int do_partition_search = 1;
PICK_MODE_CONTEXT *ctx = &pc_tree->none;
@@ -1554,10 +1860,10 @@ static void rd_use_partition(VP9_COMP *cpi, const TileInfo *const tile,
subsize = get_subsize(bsize, partition);
pc_tree->partitioning = partition;
- save_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
+ save_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
if (bsize == BLOCK_16X16 && cpi->oxcf.aq_mode) {
- set_offsets(cpi, tile, mi_row, mi_col, bsize);
+ set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
x->mb_energy = vp9_block_energy(cpi, x, bsize);
}
@@ -1570,7 +1876,7 @@ static void rd_use_partition(VP9_COMP *cpi, const TileInfo *const tile,
splits_below = 1;
for (i = 0; i < 4; i++) {
int jj = i >> 1, ii = i & 0x01;
- MODE_INFO *this_mi = mi_8x8[jj * bss * mis + ii * bss].src_mi;
+ MODE_INFO *this_mi = mi_8x8[jj * bss * mis + ii * bss];
if (this_mi && this_mi->mbmi.sb_type >= sub_subsize) {
splits_below = 0;
}
@@ -1583,7 +1889,7 @@ static void rd_use_partition(VP9_COMP *cpi, const TileInfo *const tile,
mi_row + (mi_step >> 1) < cm->mi_rows &&
mi_col + (mi_step >> 1) < cm->mi_cols) {
pc_tree->partitioning = PARTITION_NONE;
- rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &none_rdc, bsize,
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &none_rdc, bsize,
ctx, INT64_MAX);
pl = partition_plane_context(xd, mi_row, mi_col, bsize);
@@ -1594,19 +1900,19 @@ static void rd_use_partition(VP9_COMP *cpi, const TileInfo *const tile,
none_rdc.dist);
}
- restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
- mi_8x8[0].src_mi->mbmi.sb_type = bs_type;
+ restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
+ mi_8x8[0]->mbmi.sb_type = bs_type;
pc_tree->partitioning = partition;
}
}
switch (partition) {
case PARTITION_NONE:
- rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &last_part_rdc,
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
bsize, ctx, INT64_MAX);
break;
case PARTITION_HORZ:
- rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &last_part_rdc,
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
subsize, &pc_tree->horizontal[0],
INT64_MAX);
if (last_part_rdc.rate != INT_MAX &&
@@ -1614,9 +1920,10 @@ static void rd_use_partition(VP9_COMP *cpi, const TileInfo *const tile,
RD_COST tmp_rdc;
PICK_MODE_CONTEXT *ctx = &pc_tree->horizontal[0];
vp9_rd_cost_init(&tmp_rdc);
- update_state(cpi, ctx, mi_row, mi_col, subsize, 0);
- encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize, ctx);
- rd_pick_sb_modes(cpi, tile, mi_row + (mi_step >> 1), mi_col, &tmp_rdc,
+ update_state(cpi, td, ctx, mi_row, mi_col, subsize, 0);
+ encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, ctx);
+ rd_pick_sb_modes(cpi, tile_data, x,
+ mi_row + (mi_step >> 1), mi_col, &tmp_rdc,
subsize, &pc_tree->horizontal[1], INT64_MAX);
if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
vp9_rd_cost_reset(&last_part_rdc);
@@ -1628,16 +1935,17 @@ static void rd_use_partition(VP9_COMP *cpi, const TileInfo *const tile,
}
break;
case PARTITION_VERT:
- rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &last_part_rdc,
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
subsize, &pc_tree->vertical[0], INT64_MAX);
if (last_part_rdc.rate != INT_MAX &&
bsize >= BLOCK_8X8 && mi_col + (mi_step >> 1) < cm->mi_cols) {
RD_COST tmp_rdc;
PICK_MODE_CONTEXT *ctx = &pc_tree->vertical[0];
vp9_rd_cost_init(&tmp_rdc);
- update_state(cpi, ctx, mi_row, mi_col, subsize, 0);
- encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize, ctx);
- rd_pick_sb_modes(cpi, tile, mi_row, mi_col + (mi_step >> 1), &tmp_rdc,
+ update_state(cpi, td, ctx, mi_row, mi_col, subsize, 0);
+ encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, ctx);
+ rd_pick_sb_modes(cpi, tile_data, x,
+ mi_row, mi_col + (mi_step >> 1), &tmp_rdc,
subsize, &pc_tree->vertical[bsize > BLOCK_8X8],
INT64_MAX);
if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
@@ -1651,7 +1959,7 @@ static void rd_use_partition(VP9_COMP *cpi, const TileInfo *const tile,
break;
case PARTITION_SPLIT:
if (bsize == BLOCK_8X8) {
- rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &last_part_rdc,
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
subsize, pc_tree->leaf_split[0], INT64_MAX);
break;
}
@@ -1667,7 +1975,8 @@ static void rd_use_partition(VP9_COMP *cpi, const TileInfo *const tile,
continue;
vp9_rd_cost_init(&tmp_rdc);
- rd_use_partition(cpi, tile, mi_8x8 + jj * bss * mis + ii * bss, tp,
+ rd_use_partition(cpi, td, tile_data,
+ mi_8x8 + jj * bss * mis + ii * bss, tp,
mi_row + y_idx, mi_col + x_idx, subsize,
&tmp_rdc.rate, &tmp_rdc.dist,
i != 3, pc_tree->split[i]);
@@ -1702,7 +2011,7 @@ static void rd_use_partition(VP9_COMP *cpi, const TileInfo *const tile,
BLOCK_SIZE split_subsize = get_subsize(bsize, PARTITION_SPLIT);
chosen_rdc.rate = 0;
chosen_rdc.dist = 0;
- restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
+ restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
pc_tree->partitioning = PARTITION_SPLIT;
// Split partition.
@@ -1716,12 +2025,13 @@ static void rd_use_partition(VP9_COMP *cpi, const TileInfo *const tile,
if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols))
continue;
- save_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
+ save_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
pc_tree->split[i]->partitioning = PARTITION_NONE;
- rd_pick_sb_modes(cpi, tile, mi_row + y_idx, mi_col + x_idx, &tmp_rdc,
+ rd_pick_sb_modes(cpi, tile_data, x,
+ mi_row + y_idx, mi_col + x_idx, &tmp_rdc,
split_subsize, &pc_tree->split[i]->none, INT64_MAX);
- restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
+ restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
vp9_rd_cost_reset(&chosen_rdc);
@@ -1732,7 +2042,7 @@ static void rd_use_partition(VP9_COMP *cpi, const TileInfo *const tile,
chosen_rdc.dist += tmp_rdc.dist;
if (i != 3)
- encode_sb(cpi, tile, tp, mi_row + y_idx, mi_col + x_idx, 0,
+ encode_sb(cpi, td, tile_info, tp, mi_row + y_idx, mi_col + x_idx, 0,
split_subsize, pc_tree->split[i]);
pl = partition_plane_context(xd, mi_row + y_idx, mi_col + x_idx,
@@ -1749,7 +2059,7 @@ static void rd_use_partition(VP9_COMP *cpi, const TileInfo *const tile,
// If last_part is better set the partitioning to that.
if (last_part_rdc.rdcost < chosen_rdc.rdcost) {
- mi_8x8[0].src_mi->mbmi.sb_type = bsize;
+ mi_8x8[0]->mbmi.sb_type = bsize;
if (bsize >= BLOCK_8X8)
pc_tree->partitioning = partition;
chosen_rdc = last_part_rdc;
@@ -1761,7 +2071,7 @@ static void rd_use_partition(VP9_COMP *cpi, const TileInfo *const tile,
chosen_rdc = none_rdc;
}
- restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
+ restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
// We must have chosen a partitioning and encoding or we'll fail later on.
// No other opportunities for success.
@@ -1770,19 +2080,7 @@ static void rd_use_partition(VP9_COMP *cpi, const TileInfo *const tile,
if (do_recon) {
int output_enabled = (bsize == BLOCK_64X64);
-
- // Check the projected output rate for this SB against its target
- // and if necessary apply a Q delta using segmentation to get
- // closer to the target.
- if ((cpi->oxcf.aq_mode == COMPLEXITY_AQ) && cm->seg.update_map) {
- vp9_select_in_frame_q_segment(cpi, mi_row, mi_col,
- output_enabled, chosen_rdc.rate);
- }
-
- if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
- vp9_cyclic_refresh_set_rate_and_dist_sb(cpi->cyclic_refresh,
- chosen_rdc.rate, chosen_rdc.dist);
- encode_sb(cpi, tile, tp, mi_row, mi_col, output_enabled, bsize,
+ encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, output_enabled, bsize,
pc_tree);
}
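
rd_use_partition decides between none_rdc, last_part_rdc and chosen_rdc purely on rdcost, the Lagrangian combination of rate and distortion. The libvpx RDCOST macro folds the multiplier into fixed point; a simplified illustrative form (not the literal macro, which also rounds the rate term):

    #include <stdint.h>

    /* cost = lambda * rate + distortion: rdmult is a Q8 fixed-point
     * lambda applied to rate, rddiv a shift applied to distortion. */
    static int64_t rd_cost(int rdmult, int rddiv, int rate, int64_t dist) {
      return (((int64_t)rate * rdmult) >> 8) + (dist << rddiv);
    }

A lower rdcost means a better rate/distortion trade-off at the current lambda, which is why each branch above compares its accumulated cost against the running best before committing a partitioning.
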
@@ -1813,7 +2111,7 @@ static const BLOCK_SIZE max_partition_size[BLOCK_SIZES] = {
//
// The min and max are assumed to have been initialized prior to calling this
// function so repeat calls can accumulate a min and max of more than one sb64.
-static void get_sb_partition_size_range(MACROBLOCKD *xd, MODE_INFO *mi_8x8,
+static void get_sb_partition_size_range(MACROBLOCKD *xd, MODE_INFO **mi_8x8,
BLOCK_SIZE *min_block_size,
BLOCK_SIZE *max_block_size,
int bs_hist[BLOCK_SIZES]) {
@@ -1825,7 +2123,7 @@ static void get_sb_partition_size_range(MACROBLOCKD *xd, MODE_INFO *mi_8x8,
// Check the sb_type for each block that belongs to this region.
for (i = 0; i < sb_height_in_blocks; ++i) {
for (j = 0; j < sb_width_in_blocks; ++j) {
- MODE_INFO *mi = mi_8x8[index+j].src_mi;
+ MODE_INFO *mi = mi_8x8[index+j];
BLOCK_SIZE sb_type = mi ? mi->mbmi.sb_type : 0;
bs_hist[sb_type]++;
*min_block_size = MIN(*min_block_size, sb_type);
@@ -1847,20 +2145,19 @@ static const BLOCK_SIZE next_square_size[BLOCK_SIZES] = {
// Look at neighboring blocks and set a min and max partition size based on
// what they chose.
static void rd_auto_partition_range(VP9_COMP *cpi, const TileInfo *const tile,
+ MACROBLOCKD *const xd,
int mi_row, int mi_col,
BLOCK_SIZE *min_block_size,
BLOCK_SIZE *max_block_size) {
VP9_COMMON *const cm = &cpi->common;
- MACROBLOCKD *const xd = &cpi->mb.e_mbd;
- MODE_INFO *mi = xd->mi[0].src_mi;
- const int left_in_image = xd->left_available && mi[-1].src_mi;
- const int above_in_image = xd->up_available && mi[-xd->mi_stride].src_mi;
+ MODE_INFO **mi = xd->mi;
+ const int left_in_image = xd->left_available && mi[-1];
+ const int above_in_image = xd->up_available && mi[-xd->mi_stride];
const int row8x8_remaining = tile->mi_row_end - mi_row;
const int col8x8_remaining = tile->mi_col_end - mi_col;
int bh, bw;
BLOCK_SIZE min_size = BLOCK_4X4;
BLOCK_SIZE max_size = BLOCK_64X64;
- int i = 0;
int bs_hist[BLOCK_SIZES] = {0};
// Trap case where we do not have a prediction.
@@ -1873,54 +2170,27 @@ static void rd_auto_partition_range(VP9_COMP *cpi, const TileInfo *const tile,
// passed in values for min and max as a starting point.
// Find the min and max partition used in previous frame at this location
if (cm->frame_type != KEY_FRAME) {
- MODE_INFO *prev_mi =
- cm->prev_mip + cm->mi_stride + 1 + mi_row * xd->mi_stride + mi_col;
-
+ MODE_INFO **prev_mi =
+ &cm->prev_mi_grid_visible[mi_row * xd->mi_stride + mi_col];
get_sb_partition_size_range(xd, prev_mi, &min_size, &max_size, bs_hist);
}
// Find the min and max partition sizes used in the left SB64
if (left_in_image) {
- MODE_INFO *left_sb64_mi = mi[-MI_BLOCK_SIZE].src_mi;
+ MODE_INFO **left_sb64_mi = &mi[-MI_BLOCK_SIZE];
get_sb_partition_size_range(xd, left_sb64_mi, &min_size, &max_size,
bs_hist);
}
// Find the min and max partition sizes used in the above SB64.
if (above_in_image) {
- MODE_INFO *above_sb64_mi = mi[-xd->mi_stride * MI_BLOCK_SIZE].src_mi;
+ MODE_INFO **above_sb64_mi = &mi[-xd->mi_stride * MI_BLOCK_SIZE];
get_sb_partition_size_range(xd, above_sb64_mi, &min_size, &max_size,
bs_hist);
}
- // adjust observed min and max
+ // Adjust observed min and max for "relaxed" auto partition case.
if (cpi->sf.auto_min_max_partition_size == RELAXED_NEIGHBORING_MIN_MAX) {
min_size = min_partition_size[min_size];
max_size = max_partition_size[max_size];
- } else if (cpi->sf.auto_min_max_partition_size ==
- CONSTRAIN_NEIGHBORING_MIN_MAX) {
- // adjust the search range based on the histogram of the observed
- // partition sizes from the left, above, and previous co-located blocks
- int sum = 0;
- int first_moment = 0;
- int second_moment = 0;
- int var_unnormalized = 0;
-
- for (i = 0; i < BLOCK_SIZES; i++) {
- sum += bs_hist[i];
- first_moment += bs_hist[i] * i;
- second_moment += bs_hist[i] * i * i;
- }
-
- // if variance is small enough,
- // adjust the range around its mean size, which gives a tighter range
- var_unnormalized = second_moment - first_moment * first_moment / sum;
- if (var_unnormalized <= 4 * sum) {
- int mean = first_moment / sum;
- min_size = min_partition_size[mean];
- max_size = max_partition_size[mean];
- } else {
- min_size = min_partition_size[min_size];
- max_size = max_partition_size[max_size];
- }
}
}
@@ -1928,7 +2198,7 @@ static void rd_auto_partition_range(VP9_COMP *cpi, const TileInfo *const tile,
max_size = find_partition_size(max_size,
row8x8_remaining, col8x8_remaining,
&bh, &bw);
- min_size = MIN(min_size, max_size);
+ min_size = MIN(cpi->sf.rd_auto_partition_min_limit, MIN(min_size, max_size));
// When use_square_partition_only is true, make sure at least one square
// partition is allowed by selecting the next smaller square size as
@@ -1943,15 +2213,14 @@ static void rd_auto_partition_range(VP9_COMP *cpi, const TileInfo *const tile,
}
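
The new clamp line above bounds the automatically derived minimum: rd_auto_partition_min_limit is a speed-feature ceiling on min_size, so neighbor statistics can never force the search to start above that limit, and min_size still cannot exceed max_size. A tiny sketch of the combined clamp, assuming the usual ordering of BLOCK_SIZE values from small to large:

    /* min_size <= max_size and min_size <= sf_limit; both constraints
     * are MINs, so the nesting order does not matter. */
    static int clamp_min_partition(int min_size, int max_size, int sf_limit) {
      const int m = min_size < max_size ? min_size : max_size;
      return sf_limit < m ? sf_limit : m;
    }
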
static void auto_partition_range(VP9_COMP *cpi, const TileInfo *const tile,
+ MACROBLOCKD *const xd,
int mi_row, int mi_col,
BLOCK_SIZE *min_block_size,
BLOCK_SIZE *max_block_size) {
VP9_COMMON *const cm = &cpi->common;
- MACROBLOCKD *const xd = &cpi->mb.e_mbd;
- MODE_INFO *mi_8x8 = xd->mi;
- const int left_in_image = xd->left_available && mi_8x8[-1].src_mi;
- const int above_in_image = xd->up_available &&
- mi_8x8[-xd->mi_stride].src_mi;
+ MODE_INFO **mi_8x8 = xd->mi;
+ const int left_in_image = xd->left_available && mi_8x8[-1];
+ const int above_in_image = xd->up_available && mi_8x8[-xd->mi_stride];
int row8x8_remaining = tile->mi_row_end - mi_row;
int col8x8_remaining = tile->mi_col_end - mi_col;
int bh, bw;
@@ -1964,15 +2233,15 @@ static void auto_partition_range(VP9_COMP *cpi, const TileInfo *const tile,
if (search_range_ctrl &&
(left_in_image || above_in_image || cm->frame_type != KEY_FRAME)) {
int block;
- MODE_INFO *mi;
+ MODE_INFO **mi;
BLOCK_SIZE sb_type;
// Find the min and max partition sizes used in the left SB64.
if (left_in_image) {
MODE_INFO *cur_mi;
- mi = mi_8x8[-1].src_mi;
+ mi = &mi_8x8[-1];
for (block = 0; block < MI_BLOCK_SIZE; ++block) {
- cur_mi = mi[block * xd->mi_stride].src_mi;
+ cur_mi = mi[block * xd->mi_stride];
sb_type = cur_mi ? cur_mi->mbmi.sb_type : 0;
min_size = MIN(min_size, sb_type);
max_size = MAX(max_size, sb_type);
@@ -1980,9 +2249,9 @@ static void auto_partition_range(VP9_COMP *cpi, const TileInfo *const tile,
}
// Find the min and max partition sizes used in the above SB64.
if (above_in_image) {
- mi = mi_8x8[-xd->mi_stride * MI_BLOCK_SIZE].src_mi;
+ mi = &mi_8x8[-xd->mi_stride * MI_BLOCK_SIZE];
for (block = 0; block < MI_BLOCK_SIZE; ++block) {
- sb_type = mi[block].src_mi ? mi[block].src_mi->mbmi.sb_type : 0;
+ sb_type = mi[block] ? mi[block]->mbmi.sb_type : 0;
min_size = MIN(min_size, sb_type);
max_size = MAX(max_size, sb_type);
}
@@ -2013,9 +2282,7 @@ static void set_partition_range(VP9_COMMON *cm, MACROBLOCKD *xd,
MODE_INFO *mi;
const int idx_str = cm->mi_stride * mi_row + mi_col;
- MODE_INFO *prev_mi = (cm->prev_mip + cm->mi_stride + 1 + idx_str)->src_mi;
-
-
+ MODE_INFO **prev_mi = &cm->prev_mi_grid_visible[idx_str];
BLOCK_SIZE bs, min_size, max_size;
min_size = BLOCK_64X64;
@@ -2024,7 +2291,7 @@ static void set_partition_range(VP9_COMMON *cm, MACROBLOCKD *xd,
if (prev_mi) {
for (idy = 0; idy < mi_height; ++idy) {
for (idx = 0; idx < mi_width; ++idx) {
- mi = prev_mi[idy * cm->mi_stride + idx].src_mi;
+ mi = prev_mi[idy * cm->mi_stride + idx];
bs = mi ? mi->mbmi.sb_type : bsize;
min_size = MIN(min_size, bs);
max_size = MAX(max_size, bs);
@@ -2034,7 +2301,7 @@ static void set_partition_range(VP9_COMMON *cm, MACROBLOCKD *xd,
if (xd->left_available) {
for (idy = 0; idy < mi_height; ++idy) {
- mi = xd->mi[idy * cm->mi_stride - 1].src_mi;
+ mi = xd->mi[idy * cm->mi_stride - 1];
bs = mi ? mi->mbmi.sb_type : bsize;
min_size = MIN(min_size, bs);
max_size = MAX(max_size, bs);
@@ -2043,7 +2310,7 @@ static void set_partition_range(VP9_COMMON *cm, MACROBLOCKD *xd,
if (xd->up_available) {
for (idx = 0; idx < mi_width; ++idx) {
- mi = xd->mi[idx - cm->mi_stride].src_mi;
+ mi = xd->mi[idx - cm->mi_stride];
bs = mi ? mi->mbmi.sb_type : bsize;
min_size = MIN(min_size, bs);
max_size = MAX(max_size, bs);
@@ -2060,11 +2327,11 @@ static void set_partition_range(VP9_COMMON *cm, MACROBLOCKD *xd,
}
static INLINE void store_pred_mv(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) {
- vpx_memcpy(ctx->pred_mv, x->pred_mv, sizeof(x->pred_mv));
+ memcpy(ctx->pred_mv, x->pred_mv, sizeof(x->pred_mv));
}
static INLINE void load_pred_mv(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) {
- vpx_memcpy(x->pred_mv, ctx->pred_mv, sizeof(x->pred_mv));
+ memcpy(x->pred_mv, ctx->pred_mv, sizeof(x->pred_mv));
}
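
store_pred_mv and load_pred_mv checkpoint the running motion-vector predictors around each partition trial so one candidate's search cannot leak into the next; sizeof(x->pred_mv) works because pred_mv is a true array member, not a pointer. The generic shape of that by-value checkpoint, with hypothetical stand-in types:

    #include <stdint.h>
    #include <string.h>

    typedef struct { int16_t row, col; } MV;        /* stand-in */
    typedef struct { MV pred_mv[4]; } SearchState;  /* hypothetical holder */

    static void checkpoint_mvs(SearchState *dst, const SearchState *src) {
      /* sizeof on the embedded array copies every element. */
      memcpy(dst->pred_mv, src->pred_mv, sizeof(src->pred_mv));
    }
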
#if CONFIG_FP_MB_STATS
@@ -2115,12 +2382,14 @@ static INLINE int get_motion_inconsistency(MOTION_DIRECTION this_mv,
// TODO(jingning,jimbankoski,rbultje): properly skip partition types that are
// unlikely to be selected depending on previous rate-distortion optimization
// results, for encoding speed-up.
-static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
+static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td,
+ TileDataEnc *tile_data,
TOKENEXTRA **tp, int mi_row, int mi_col,
BLOCK_SIZE bsize, RD_COST *rd_cost,
int64_t best_rd, PC_TREE *pc_tree) {
VP9_COMMON *const cm = &cpi->common;
- MACROBLOCK *const x = &cpi->mb;
+ TileInfo *const tile_info = &tile_data->tile_info;
+ MACROBLOCK *const x = &td->mb;
MACROBLOCKD *const xd = &x->e_mbd;
const int mi_step = num_8x8_blocks_wide_lookup[bsize] / 2;
ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE];
@@ -2139,8 +2408,8 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
const int xss = x->e_mbd.plane[1].subsampling_x;
const int yss = x->e_mbd.plane[1].subsampling_y;
- BLOCK_SIZE min_size = cpi->sf.min_partition_size;
- BLOCK_SIZE max_size = cpi->sf.max_partition_size;
+ BLOCK_SIZE min_size = x->min_partition_size;
+ BLOCK_SIZE max_size = x->max_partition_size;
#if CONFIG_FP_MB_STATS
unsigned int src_diff_var = UINT_MAX;
@@ -2162,7 +2431,7 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
vp9_rd_cost_reset(&best_rdc);
best_rdc.rdcost = best_rd;
- set_offsets(cpi, tile, mi_row, mi_col, bsize);
+ set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
if (bsize == BLOCK_16X16 && cpi->oxcf.aq_mode)
x->mb_energy = vp9_block_energy(cpi, x, bsize);
@@ -2190,12 +2459,12 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
partition_vert_allowed &= force_vert_split;
}
- save_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
+ save_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
#if CONFIG_FP_MB_STATS
if (cpi->use_fp_mb_stats) {
- set_offsets(cpi, tile, mi_row, mi_col, bsize);
- src_diff_var = get_sby_perpixel_diff_variance(cpi, &cpi->mb.plane[0].src,
+ set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
+ src_diff_var = get_sby_perpixel_diff_variance(cpi, &x->plane[0].src,
mi_row, mi_col, bsize);
}
#endif
@@ -2253,8 +2522,8 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
// PARTITION_NONE
if (partition_none_allowed) {
- rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &this_rdc, bsize, ctx,
- best_rdc.rdcost);
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col,
+ &this_rdc, bsize, ctx, best_rdc.rdcost);
if (this_rdc.rate != INT_MAX) {
if (bsize >= BLOCK_8X8) {
pl = partition_plane_context(xd, mi_row, mi_col, bsize);
@@ -2323,9 +2592,9 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
}
if (skip) {
if (src_diff_var == UINT_MAX) {
- set_offsets(cpi, tile, mi_row, mi_col, bsize);
+ set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
src_diff_var = get_sby_perpixel_diff_variance(
- cpi, &cpi->mb.plane[0].src, mi_row, mi_col, bsize);
+ cpi, &x->plane[0].src, mi_row, mi_col, bsize);
}
if (src_diff_var < 8) {
do_split = 0;
@@ -2336,7 +2605,7 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
#endif
}
}
- restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
+ restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
}
// store estimated motion vector
@@ -2353,7 +2622,7 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
if (cpi->sf.adaptive_pred_interp_filter && partition_none_allowed)
pc_tree->leaf_split[0]->pred_interp_filter =
ctx->mic.mbmi.interp_filter;
- rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &sum_rdc, subsize,
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc, subsize,
pc_tree->leaf_split[0], best_rdc.rdcost);
if (sum_rdc.rate == INT_MAX)
sum_rdc.rdcost = INT64_MAX;
@@ -2369,7 +2638,8 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
load_pred_mv(x, ctx);
pc_tree->split[i]->index = i;
- rd_pick_partition(cpi, tile, tp, mi_row + y_idx, mi_col + x_idx,
+ rd_pick_partition(cpi, td, tile_data, tp,
+ mi_row + y_idx, mi_col + x_idx,
subsize, &this_rdc,
best_rdc.rdcost - sum_rdc.rdcost, pc_tree->split[i]);
@@ -2400,7 +2670,7 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
if (cpi->sf.less_rectangular_check)
do_rect &= !partition_none_allowed;
}
- restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
+ restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
}
// PARTITION_HORZ
@@ -2412,14 +2682,14 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
partition_none_allowed)
pc_tree->horizontal[0].pred_interp_filter =
ctx->mic.mbmi.interp_filter;
- rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &sum_rdc, subsize,
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc, subsize,
&pc_tree->horizontal[0], best_rdc.rdcost);
if (sum_rdc.rdcost < best_rdc.rdcost && mi_row + mi_step < cm->mi_rows &&
bsize > BLOCK_8X8) {
PICK_MODE_CONTEXT *ctx = &pc_tree->horizontal[0];
- update_state(cpi, ctx, mi_row, mi_col, subsize, 0);
- encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize, ctx);
+ update_state(cpi, td, ctx, mi_row, mi_col, subsize, 0);
+ encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, ctx);
if (cpi->sf.adaptive_motion_search)
load_pred_mv(x, ctx);
@@ -2427,8 +2697,8 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
partition_none_allowed)
pc_tree->horizontal[1].pred_interp_filter =
ctx->mic.mbmi.interp_filter;
- rd_pick_sb_modes(cpi, tile, mi_row + mi_step, mi_col, &this_rdc,
- subsize, &pc_tree->horizontal[1],
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row + mi_step, mi_col,
+ &this_rdc, subsize, &pc_tree->horizontal[1],
best_rdc.rdcost - sum_rdc.rdcost);
if (this_rdc.rate == INT_MAX) {
sum_rdc.rdcost = INT64_MAX;
@@ -2448,7 +2718,7 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
pc_tree->partitioning = PARTITION_HORZ;
}
}
- restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
+ restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
}
// PARTITION_VERT
if (partition_vert_allowed && do_rect) {
@@ -2460,12 +2730,12 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
partition_none_allowed)
pc_tree->vertical[0].pred_interp_filter =
ctx->mic.mbmi.interp_filter;
- rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &sum_rdc, subsize,
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc, subsize,
&pc_tree->vertical[0], best_rdc.rdcost);
if (sum_rdc.rdcost < best_rdc.rdcost && mi_col + mi_step < cm->mi_cols &&
bsize > BLOCK_8X8) {
- update_state(cpi, &pc_tree->vertical[0], mi_row, mi_col, subsize, 0);
- encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize,
+ update_state(cpi, td, &pc_tree->vertical[0], mi_row, mi_col, subsize, 0);
+ encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize,
&pc_tree->vertical[0]);
if (cpi->sf.adaptive_motion_search)
@@ -2474,7 +2744,8 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
partition_none_allowed)
pc_tree->vertical[1].pred_interp_filter =
ctx->mic.mbmi.interp_filter;
- rd_pick_sb_modes(cpi, tile, mi_row, mi_col + mi_step, &this_rdc, subsize,
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + mi_step,
+ &this_rdc, subsize,
&pc_tree->vertical[1], best_rdc.rdcost - sum_rdc.rdcost);
if (this_rdc.rate == INT_MAX) {
sum_rdc.rdcost = INT64_MAX;
@@ -2495,7 +2766,7 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
pc_tree->partitioning = PARTITION_VERT;
}
}
- restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
+ restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
}
// TODO(jbb): This code is added so that we avoid static analysis
@@ -2509,18 +2780,8 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
if (best_rdc.rate < INT_MAX && best_rdc.dist < INT64_MAX &&
pc_tree->index != 3) {
int output_enabled = (bsize == BLOCK_64X64);
-
- // Check the projected output rate for this SB against its target
- // and if necessary apply a Q delta using segmentation to get
- // closer to the target.
- if ((cpi->oxcf.aq_mode == COMPLEXITY_AQ) && cm->seg.update_map)
- vp9_select_in_frame_q_segment(cpi, mi_row, mi_col, output_enabled,
- best_rdc.rate);
- if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
- vp9_cyclic_refresh_set_rate_and_dist_sb(cpi->cyclic_refresh,
- best_rdc.rate, best_rdc.dist);
-
- encode_sb(cpi, tile, tp, mi_row, mi_col, output_enabled, bsize, pc_tree);
+ encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, output_enabled,
+ bsize, pc_tree);
}
if (bsize == BLOCK_64X64) {
@@ -2532,104 +2793,93 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
}
}
-static void encode_rd_sb_row(VP9_COMP *cpi, const TileInfo *const tile,
- int mi_row, TOKENEXTRA **tp) {
+static void encode_rd_sb_row(VP9_COMP *cpi,
+ ThreadData *td,
+ TileDataEnc *tile_data,
+ int mi_row,
+ TOKENEXTRA **tp) {
VP9_COMMON *const cm = &cpi->common;
- MACROBLOCKD *const xd = &cpi->mb.e_mbd;
+ TileInfo *const tile_info = &tile_data->tile_info;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
SPEED_FEATURES *const sf = &cpi->sf;
int mi_col;
// Initialize the left context for the new SB row
- vpx_memset(&xd->left_context, 0, sizeof(xd->left_context));
- vpx_memset(xd->left_seg_context, 0, sizeof(xd->left_seg_context));
+ memset(&xd->left_context, 0, sizeof(xd->left_context));
+ memset(xd->left_seg_context, 0, sizeof(xd->left_seg_context));
// Code each SB in the row
- for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
+ for (mi_col = tile_info->mi_col_start; mi_col < tile_info->mi_col_end;
mi_col += MI_BLOCK_SIZE) {
+ const struct segmentation *const seg = &cm->seg;
int dummy_rate;
int64_t dummy_dist;
RD_COST dummy_rdc;
int i;
+ int seg_skip = 0;
const int idx_str = cm->mi_stride * mi_row + mi_col;
- MODE_INFO *mi = cm->mi + idx_str;
- MODE_INFO *prev_mi = NULL;
-
- if (cm->frame_type != KEY_FRAME)
- prev_mi = (cm->prev_mip + cm->mi_stride + 1 + idx_str)->src_mi;
+ MODE_INFO **mi = cm->mi_grid_visible + idx_str;
if (sf->adaptive_pred_interp_filter) {
for (i = 0; i < 64; ++i)
- cpi->leaf_tree[i].pred_interp_filter = SWITCHABLE;
+ td->leaf_tree[i].pred_interp_filter = SWITCHABLE;
for (i = 0; i < 64; ++i) {
- cpi->pc_tree[i].vertical[0].pred_interp_filter = SWITCHABLE;
- cpi->pc_tree[i].vertical[1].pred_interp_filter = SWITCHABLE;
- cpi->pc_tree[i].horizontal[0].pred_interp_filter = SWITCHABLE;
- cpi->pc_tree[i].horizontal[1].pred_interp_filter = SWITCHABLE;
- }
- }
-
- vp9_zero(cpi->mb.pred_mv);
- cpi->pc_root->index = 0;
-
- // TODO(yunqingwang): use_lastframe_partitioning is no longer used in good-
- // quality encoding. Need to evaluate it in real-time encoding later to
- // decide if it can be removed too. And then, do the code cleanup.
- cpi->mb.source_variance = UINT_MAX;
- if (sf->partition_search_type == FIXED_PARTITION) {
- set_offsets(cpi, tile, mi_row, mi_col, BLOCK_64X64);
- set_fixed_partitioning(cpi, tile, mi, mi_row, mi_col,
- sf->always_this_block_size);
- rd_use_partition(cpi, tile, mi, tp, mi_row, mi_col, BLOCK_64X64,
- &dummy_rate, &dummy_dist, 1, cpi->pc_root);
+ td->pc_tree[i].vertical[0].pred_interp_filter = SWITCHABLE;
+ td->pc_tree[i].vertical[1].pred_interp_filter = SWITCHABLE;
+ td->pc_tree[i].horizontal[0].pred_interp_filter = SWITCHABLE;
+ td->pc_tree[i].horizontal[1].pred_interp_filter = SWITCHABLE;
+ }
+ }
+
+ vp9_zero(x->pred_mv);
+ td->pc_root->index = 0;
+
+ if (seg->enabled) {
+ const uint8_t *const map = seg->update_map ? cpi->segmentation_map
+ : cm->last_frame_seg_map;
+ int segment_id = vp9_get_segment_id(cm, map, BLOCK_64X64, mi_row, mi_col);
+ seg_skip = vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP);
+ }
+
+ x->source_variance = UINT_MAX;
+ if (sf->partition_search_type == FIXED_PARTITION || seg_skip) {
+ const BLOCK_SIZE bsize =
+ seg_skip ? BLOCK_64X64 : sf->always_this_block_size;
+ set_offsets(cpi, tile_info, x, mi_row, mi_col, BLOCK_64X64);
+ set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize);
+ rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col,
+ BLOCK_64X64, &dummy_rate, &dummy_dist, 1, td->pc_root);
} else if (cpi->partition_search_skippable_frame) {
BLOCK_SIZE bsize;
- set_offsets(cpi, tile, mi_row, mi_col, BLOCK_64X64);
- bsize = get_rd_var_based_fixed_partition(cpi, mi_row, mi_col);
- set_fixed_partitioning(cpi, tile, mi, mi_row, mi_col, bsize);
- rd_use_partition(cpi, tile, mi, tp, mi_row, mi_col, BLOCK_64X64,
- &dummy_rate, &dummy_dist, 1, cpi->pc_root);
+ set_offsets(cpi, tile_info, x, mi_row, mi_col, BLOCK_64X64);
+ bsize = get_rd_var_based_fixed_partition(cpi, x, mi_row, mi_col);
+ set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize);
+ rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col,
+ BLOCK_64X64, &dummy_rate, &dummy_dist, 1, td->pc_root);
} else if (sf->partition_search_type == VAR_BASED_PARTITION &&
- cm->frame_type != KEY_FRAME ) {
- choose_partitioning(cpi, tile, mi_row, mi_col);
- rd_use_partition(cpi, tile, mi, tp, mi_row, mi_col, BLOCK_64X64,
- &dummy_rate, &dummy_dist, 1, cpi->pc_root);
- } else if (sf->partition_search_type == SEARCH_PARTITION &&
- sf->use_lastframe_partitioning &&
- (cpi->rc.frames_since_key %
- sf->last_partitioning_redo_frequency) &&
- cm->prev_mi &&
- cm->show_frame &&
- cm->frame_type != KEY_FRAME &&
- !cpi->rc.is_src_frame_alt_ref &&
- ((sf->use_lastframe_partitioning !=
- LAST_FRAME_PARTITION_LOW_MOTION) ||
- !sb_has_motion(cm, prev_mi, sf->lf_motion_threshold))) {
- if (sf->constrain_copy_partition &&
- sb_has_motion(cm, prev_mi, sf->lf_motion_threshold))
- constrain_copy_partitioning(cpi, tile, mi, prev_mi,
- mi_row, mi_col, BLOCK_16X16);
- else
- copy_partitioning(cm, mi, prev_mi);
- rd_use_partition(cpi, tile, mi, tp, mi_row, mi_col, BLOCK_64X64,
- &dummy_rate, &dummy_dist, 1, cpi->pc_root);
+ cm->frame_type != KEY_FRAME) {
+ choose_partitioning(cpi, tile_info, x, mi_row, mi_col);
+ rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col,
+ BLOCK_64X64, &dummy_rate, &dummy_dist, 1, td->pc_root);
} else {
// If required, set upper and lower partition size limits
if (sf->auto_min_max_partition_size) {
- set_offsets(cpi, tile, mi_row, mi_col, BLOCK_64X64);
- rd_auto_partition_range(cpi, tile, mi_row, mi_col,
- &sf->min_partition_size,
- &sf->max_partition_size);
+ set_offsets(cpi, tile_info, x, mi_row, mi_col, BLOCK_64X64);
+ rd_auto_partition_range(cpi, tile_info, xd, mi_row, mi_col,
+ &x->min_partition_size,
+ &x->max_partition_size);
}
- rd_pick_partition(cpi, tile, tp, mi_row, mi_col, BLOCK_64X64,
- &dummy_rdc, INT64_MAX, cpi->pc_root);
+ rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, BLOCK_64X64,
+ &dummy_rdc, INT64_MAX, td->pc_root);
}
}
}
static void init_encode_frame_mb_context(VP9_COMP *cpi) {
- MACROBLOCK *const x = &cpi->mb;
+ MACROBLOCK *const x = &cpi->td.mb;
VP9_COMMON *const cm = &cpi->common;
MACROBLOCKD *const xd = &x->e_mbd;
const int aligned_mi_cols = mi_cols_aligned_to_sb(cm->mi_cols);
@@ -2641,11 +2891,11 @@ static void init_encode_frame_mb_context(VP9_COMP *cpi) {
// Note: this memset assumes above_context[0], [1] and [2]
// are allocated as part of the same buffer.
- vpx_memset(xd->above_context[0], 0,
- sizeof(*xd->above_context[0]) *
- 2 * aligned_mi_cols * MAX_MB_PLANE);
- vpx_memset(xd->above_seg_context, 0,
- sizeof(*xd->above_seg_context) * aligned_mi_cols);
+ memset(xd->above_context[0], 0,
+ sizeof(*xd->above_context[0]) *
+ 2 * aligned_mi_cols * MAX_MB_PLANE);
+ memset(xd->above_seg_context, 0,
+ sizeof(*xd->above_seg_context) * aligned_mi_cols);
}
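
The memsets above rely on above_context[0..2] (one per plane) living in a single contiguous allocation, as the in-code note says: the cleared region is 2 entropy contexts per SB64-aligned mi column, per plane. The size arithmetic under that assumption:

    #include <string.h>

    #define MAX_MB_PLANE 3
    typedef char ENTROPY_CONTEXT;  /* matches libvpx's char-sized context */

    /* One memset covers all three planes because they share a buffer:
     * 2 contexts per aligned mi column, times MAX_MB_PLANE planes. */
    static void clear_above_context(ENTROPY_CONTEXT *ctx, int aligned_mi_cols) {
      memset(ctx, 0, sizeof(*ctx) * 2 * aligned_mi_cols * MAX_MB_PLANE);
    }
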
static int check_dual_ref_flags(VP9_COMP *cpi) {
@@ -2662,12 +2912,12 @@ static int check_dual_ref_flags(VP9_COMP *cpi) {
static void reset_skip_tx_size(VP9_COMMON *cm, TX_SIZE max_tx_size) {
int mi_row, mi_col;
const int mis = cm->mi_stride;
- MODE_INFO *mi_ptr = cm->mi;
+ MODE_INFO **mi_ptr = cm->mi_grid_visible;
for (mi_row = 0; mi_row < cm->mi_rows; ++mi_row, mi_ptr += mis) {
for (mi_col = 0; mi_col < cm->mi_cols; ++mi_col) {
- if (mi_ptr[mi_col].src_mi->mbmi.tx_size > max_tx_size)
- mi_ptr[mi_col].src_mi->mbmi.tx_size = max_tx_size;
+ if (mi_ptr[mi_col]->mbmi.tx_size > max_tx_size)
+ mi_ptr[mi_col]->mbmi.tx_size = max_tx_size;
}
}
}
@@ -2683,9 +2933,13 @@ static MV_REFERENCE_FRAME get_frame_type(const VP9_COMP *cpi) {
return LAST_FRAME;
}
-static TX_MODE select_tx_mode(const VP9_COMP *cpi) {
- if (cpi->mb.e_mbd.lossless)
+static TX_MODE select_tx_mode(const VP9_COMP *cpi, MACROBLOCKD *const xd) {
+ if (xd->lossless)
return ONLY_4X4;
+ if (cpi->common.frame_type == KEY_FRAME &&
+ cpi->sf.use_nonrd_pick_mode &&
+ cpi->sf.partition_search_type == VAR_BASED_PARTITION)
+ return ALLOW_16X16;
if (cpi->sf.tx_size_search_method == USE_LARGESTALL)
return ALLOW_32X32;
else if (cpi->sf.tx_size_search_method == USE_FULL_RD ||
@@ -2695,37 +2949,59 @@ static TX_MODE select_tx_mode(const VP9_COMP *cpi) {
return cpi->common.tx_mode;
}
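
select_tx_mode now checks, in order: lossless forces 4x4-only transforms; the new real-time key-frame case (non-RD mode picking with variance-based partitioning) caps at 16x16; otherwise the tx-size search method chooses between largest-only and full per-block selection. A condensed decision sketch with stand-in flags (the real TX_MODE enum has more members):

    typedef enum { ONLY_4X4, ALLOW_16X16, ALLOW_32X32, TX_MODE_SELECT } TX_MODE;

    static TX_MODE pick_tx_mode(int lossless, int rt_keyframe,
                                int use_largest_all, int use_full_rd,
                                TX_MODE frame_tx_mode) {
      if (lossless) return ONLY_4X4;        /* no transform size choice */
      if (rt_keyframe) return ALLOW_16X16;  /* new fast key-frame path */
      if (use_largest_all) return ALLOW_32X32;
      if (use_full_rd) return TX_MODE_SELECT;
      return frame_tx_mode;                 /* fall back to frame setting */
    }
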
-static void nonrd_pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile,
- int mi_row, int mi_col,
- int *rate, int64_t *dist,
+static void hybrid_intra_mode_search(VP9_COMP *cpi, MACROBLOCK *const x,
+ RD_COST *rd_cost, BLOCK_SIZE bsize,
+ PICK_MODE_CONTEXT *ctx) {
+ if (bsize < BLOCK_16X16)
+ vp9_rd_pick_intra_mode_sb(cpi, x, rd_cost, bsize, ctx, INT64_MAX);
+ else
+ vp9_pick_intra_mode(cpi, x, rd_cost, bsize, ctx);
+}
+
+static void nonrd_pick_sb_modes(VP9_COMP *cpi,
+ TileDataEnc *tile_data, MACROBLOCK *const x,
+ int mi_row, int mi_col, RD_COST *rd_cost,
BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx) {
VP9_COMMON *const cm = &cpi->common;
- MACROBLOCK *const x = &cpi->mb;
+ TileInfo *const tile_info = &tile_data->tile_info;
MACROBLOCKD *const xd = &x->e_mbd;
MB_MODE_INFO *mbmi;
- set_offsets(cpi, tile, mi_row, mi_col, bsize);
- mbmi = &xd->mi[0].src_mi->mbmi;
+ set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
+ mbmi = &xd->mi[0]->mbmi;
mbmi->sb_type = bsize;
if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled)
- if (mbmi->segment_id && x->in_static_area)
+ if (cyclic_refresh_segment_id_boosted(mbmi->segment_id))
x->rdmult = vp9_cyclic_refresh_get_rdmult(cpi->cyclic_refresh);
- if (vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP))
- set_mode_info_seg_skip(x, cm->tx_mode, rate, dist, bsize);
+ if (cm->frame_type == KEY_FRAME)
+ hybrid_intra_mode_search(cpi, x, rd_cost, bsize, ctx);
+ else if (vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP))
+ set_mode_info_seg_skip(x, cm->tx_mode, rd_cost, bsize);
+ else if (bsize >= BLOCK_8X8)
+ vp9_pick_inter_mode(cpi, x, tile_data, mi_row, mi_col,
+ rd_cost, bsize, ctx);
else
- vp9_pick_inter_mode(cpi, x, tile, mi_row, mi_col, rate, dist, bsize, ctx);
+ vp9_pick_inter_mode_sub8x8(cpi, x, tile_data, mi_row, mi_col,
+ rd_cost, bsize, ctx);
duplicate_mode_info_in_sb(cm, xd, mi_row, mi_col, bsize);
+
+ if (rd_cost->rate == INT_MAX)
+ vp9_rd_cost_reset(rd_cost);
+
+ ctx->rate = rd_cost->rate;
+ ctx->dist = rd_cost->dist;
}
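
nonrd_pick_sb_modes now normalizes a failed search (rate left at INT_MAX) with vp9_rd_cost_reset before caching rate and dist in the mode context, so downstream consumers never mix sentinel and real values. The two conventions, sketched with a stand-in struct (init = accumulable zero, reset = invalid/worst in every field):

    #include <limits.h>
    #include <stdint.h>

    typedef struct { int rate; int64_t dist; int64_t rdcost; } RD_COST;  /* stand-in */

    static void rd_cost_init(RD_COST *rd) {
      rd->rate = 0; rd->dist = 0; rd->rdcost = 0;  /* safe to += into */
    }
    static void rd_cost_reset(RD_COST *rd) {
      rd->rate = INT_MAX;                          /* unmistakably invalid */
      rd->dist = INT64_MAX;
      rd->rdcost = INT64_MAX;
    }
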
static void fill_mode_info_sb(VP9_COMMON *cm, MACROBLOCK *x,
int mi_row, int mi_col,
- BLOCK_SIZE bsize, BLOCK_SIZE subsize,
+ BLOCK_SIZE bsize,
PC_TREE *pc_tree) {
MACROBLOCKD *xd = &x->e_mbd;
int bsl = b_width_log2_lookup[bsize], hbs = (1 << bsl) / 4;
PARTITION_TYPE partition = pc_tree->partitioning;
+ BLOCK_SIZE subsize = get_subsize(bsize, partition);
assert(bsize >= BLOCK_8X8);
@@ -2734,41 +3010,39 @@ static void fill_mode_info_sb(VP9_COMMON *cm, MACROBLOCK *x,
switch (partition) {
case PARTITION_NONE:
- set_modeinfo_offsets(cm, xd, mi_row, mi_col);
- *(xd->mi[0].src_mi) = pc_tree->none.mic;
+ set_mode_info_offsets(cm, xd, mi_row, mi_col);
+ *(xd->mi[0]) = pc_tree->none.mic;
duplicate_mode_info_in_sb(cm, xd, mi_row, mi_col, bsize);
break;
case PARTITION_VERT:
- set_modeinfo_offsets(cm, xd, mi_row, mi_col);
- *(xd->mi[0].src_mi) = pc_tree->vertical[0].mic;
- duplicate_mode_info_in_sb(cm, xd, mi_row, mi_col, bsize);
+ set_mode_info_offsets(cm, xd, mi_row, mi_col);
+ *(xd->mi[0]) = pc_tree->vertical[0].mic;
+ duplicate_mode_info_in_sb(cm, xd, mi_row, mi_col, subsize);
if (mi_col + hbs < cm->mi_cols) {
- set_modeinfo_offsets(cm, xd, mi_row, mi_col + hbs);
- *(xd->mi[0].src_mi) = pc_tree->vertical[1].mic;
- duplicate_mode_info_in_sb(cm, xd, mi_row, mi_col + hbs, bsize);
+ set_mode_info_offsets(cm, xd, mi_row, mi_col + hbs);
+ *(xd->mi[0]) = pc_tree->vertical[1].mic;
+ duplicate_mode_info_in_sb(cm, xd, mi_row, mi_col + hbs, subsize);
}
break;
case PARTITION_HORZ:
- set_modeinfo_offsets(cm, xd, mi_row, mi_col);
- *(xd->mi[0].src_mi) = pc_tree->horizontal[0].mic;
- duplicate_mode_info_in_sb(cm, xd, mi_row, mi_col, bsize);
+ set_mode_info_offsets(cm, xd, mi_row, mi_col);
+ *(xd->mi[0]) = pc_tree->horizontal[0].mic;
+ duplicate_mode_info_in_sb(cm, xd, mi_row, mi_col, subsize);
if (mi_row + hbs < cm->mi_rows) {
- set_modeinfo_offsets(cm, xd, mi_row + hbs, mi_col);
- *(xd->mi[0].src_mi) = pc_tree->horizontal[1].mic;
- duplicate_mode_info_in_sb(cm, xd, mi_row + hbs, mi_col, bsize);
+ set_mode_info_offsets(cm, xd, mi_row + hbs, mi_col);
+ *(xd->mi[0]) = pc_tree->horizontal[1].mic;
+ duplicate_mode_info_in_sb(cm, xd, mi_row + hbs, mi_col, subsize);
}
break;
case PARTITION_SPLIT: {
- BLOCK_SIZE subsubsize = get_subsize(subsize, PARTITION_SPLIT);
- fill_mode_info_sb(cm, x, mi_row, mi_col, subsize,
- subsubsize, pc_tree->split[0]);
+ fill_mode_info_sb(cm, x, mi_row, mi_col, subsize, pc_tree->split[0]);
fill_mode_info_sb(cm, x, mi_row, mi_col + hbs, subsize,
- subsubsize, pc_tree->split[1]);
+ pc_tree->split[1]);
fill_mode_info_sb(cm, x, mi_row + hbs, mi_col, subsize,
- subsubsize, pc_tree->split[2]);
+ pc_tree->split[2]);
fill_mode_info_sb(cm, x, mi_row + hbs, mi_col + hbs, subsize,
- subsubsize, pc_tree->split[3]);
+ pc_tree->split[3]);
break;
}
default:
@@ -2776,24 +3050,39 @@ static void fill_mode_info_sb(VP9_COMMON *cm, MACROBLOCK *x,
}
}
-static void nonrd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
+// Reset the prediction pixel ready flag recursively.
+static void pred_pixel_ready_reset(PC_TREE *pc_tree, BLOCK_SIZE bsize) {
+ pc_tree->none.pred_pixel_ready = 0;
+ pc_tree->horizontal[0].pred_pixel_ready = 0;
+ pc_tree->horizontal[1].pred_pixel_ready = 0;
+ pc_tree->vertical[0].pred_pixel_ready = 0;
+ pc_tree->vertical[1].pred_pixel_ready = 0;
+
+ if (bsize > BLOCK_8X8) {
+ BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_SPLIT);
+ int i;
+ for (i = 0; i < 4; ++i)
+ pred_pixel_ready_reset(pc_tree->split[i], subsize);
+ }
+}
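
pred_pixel_ready is set optimistically on candidate contexts before a rectangular trial (see the hunks below) and must be rolled back across the whole subtree if that trial loses the RD comparison, which is what the recursive reset above does. The generic optimistic-flag/rollback shape, with a stand-in node type:

    typedef struct node { int flag; struct node *child[4]; } node;  /* stand-in */

    static void rollback_flags(node *n) {
      int i;
      if (n == NULL) return;
      n->flag = 0;  /* undo the optimistic marking */
      for (i = 0; i < 4; ++i) rollback_flags(n->child[i]);
    }

Note the libvpx version bounds the recursion by block size rather than by null children, since leaf contexts are embedded directly in the tree.
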
+
+static void nonrd_pick_partition(VP9_COMP *cpi, ThreadData *td,
+ TileDataEnc *tile_data,
TOKENEXTRA **tp, int mi_row,
- int mi_col, BLOCK_SIZE bsize, int *rate,
- int64_t *dist, int do_recon, int64_t best_rd,
+ int mi_col, BLOCK_SIZE bsize, RD_COST *rd_cost,
+ int do_recon, int64_t best_rd,
PC_TREE *pc_tree) {
const SPEED_FEATURES *const sf = &cpi->sf;
- const VP9EncoderConfig *const oxcf = &cpi->oxcf;
VP9_COMMON *const cm = &cpi->common;
- MACROBLOCK *const x = &cpi->mb;
+ TileInfo *const tile_info = &tile_data->tile_info;
+ MACROBLOCK *const x = &td->mb;
MACROBLOCKD *const xd = &x->e_mbd;
const int ms = num_8x8_blocks_wide_lookup[bsize] / 2;
TOKENEXTRA *tp_orig = *tp;
PICK_MODE_CONTEXT *ctx = &pc_tree->none;
int i;
BLOCK_SIZE subsize = bsize;
- int this_rate, sum_rate = 0, best_rate = INT_MAX;
- int64_t this_dist, sum_dist = 0, best_dist = INT64_MAX;
- int64_t sum_rd = 0;
+ RD_COST this_rdc, sum_rdc, best_rdc;
int do_split = bsize >= BLOCK_8X8;
int do_rect = 1;
// Override skipping rectangular partition operations for edge blocks
@@ -2812,38 +3101,47 @@ static void nonrd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
assert(num_8x8_blocks_wide_lookup[bsize] ==
num_8x8_blocks_high_lookup[bsize]);
+ vp9_rd_cost_init(&sum_rdc);
+ vp9_rd_cost_reset(&best_rdc);
+ best_rdc.rdcost = best_rd;
+
// Determine which partition types to search according to the speed features.
// The thresholds set here have to be square block sizes.
if (sf->auto_min_max_partition_size) {
- partition_none_allowed &= (bsize <= sf->max_partition_size &&
- bsize >= sf->min_partition_size);
- partition_horz_allowed &= ((bsize <= sf->max_partition_size &&
- bsize > sf->min_partition_size) ||
+ partition_none_allowed &= (bsize <= x->max_partition_size &&
+ bsize >= x->min_partition_size);
+ partition_horz_allowed &= ((bsize <= x->max_partition_size &&
+ bsize > x->min_partition_size) ||
force_horz_split);
- partition_vert_allowed &= ((bsize <= sf->max_partition_size &&
- bsize > sf->min_partition_size) ||
+ partition_vert_allowed &= ((bsize <= x->max_partition_size &&
+ bsize > x->min_partition_size) ||
force_vert_split);
- do_split &= bsize > sf->min_partition_size;
+ do_split &= bsize > x->min_partition_size;
}
if (sf->use_square_partition_only) {
partition_horz_allowed &= force_horz_split;
partition_vert_allowed &= force_vert_split;
}
+ ctx->pred_pixel_ready = !(partition_vert_allowed ||
+ partition_horz_allowed ||
+ do_split);
+
// PARTITION_NONE
if (partition_none_allowed) {
- nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col,
- &this_rate, &this_dist, bsize, ctx);
- ctx->mic.mbmi = xd->mi[0].src_mi->mbmi;
+ nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col,
+ &this_rdc, bsize, ctx);
+ ctx->mic.mbmi = xd->mi[0]->mbmi;
ctx->skip_txfm[0] = x->skip_txfm[0];
ctx->skip = x->skip;
- if (this_rate != INT_MAX) {
+ if (this_rdc.rate != INT_MAX) {
int pl = partition_plane_context(xd, mi_row, mi_col, bsize);
- this_rate += cpi->partition_cost[pl][PARTITION_NONE];
- sum_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_dist);
- if (sum_rd < best_rd) {
- int dist_breakout_thr = sf->partition_search_breakout_dist_thr;
+ this_rdc.rate += cpi->partition_cost[pl][PARTITION_NONE];
+ this_rdc.rdcost = RDCOST(x->rdmult, x->rddiv,
+ this_rdc.rate, this_rdc.dist);
+ if (this_rdc.rdcost < best_rdc.rdcost) {
+ int64_t dist_breakout_thr = sf->partition_search_breakout_dist_thr;
int64_t rate_breakout_thr = sf->partition_search_breakout_rate_thr;
dist_breakout_thr >>= 8 - (b_width_log2_lookup[bsize] +
@@ -2851,15 +3149,13 @@ static void nonrd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
rate_breakout_thr *= num_pels_log2_lookup[bsize];
- best_rate = this_rate;
- best_dist = this_dist;
- best_rd = sum_rd;
+ best_rdc = this_rdc;
if (bsize >= BLOCK_8X8)
pc_tree->partitioning = PARTITION_NONE;
if (!x->e_mbd.lossless &&
- this_rate < rate_breakout_thr &&
- this_dist < dist_breakout_thr) {
+ this_rdc.rate < rate_breakout_thr &&
+ this_rdc.dist < dist_breakout_thr) {
do_split = 0;
do_rect = 0;
}
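
The breakout above scales both thresholds with block size before comparing: the distortion bound is calibrated at 64x64 (where b_width_log2 + b_height_log2 == 8, so the shift is 0) and halves with each halving of block area, while the rate bound grows with log2 of the pixel count. A sketch of that scaling:

    #include <stdint.h>

    /* dist_thr shrinks with area (one >> per lost log2 step); rate_thr
     * grows with log2(pixels), e.g. num_pels_log2 == 12 at 64x64. */
    static void scale_breakout_thresholds(int64_t *dist_thr, int64_t *rate_thr,
                                          int bwl, int bhl, int num_pels_log2) {
      *dist_thr >>= 8 - (bwl + bhl);
      *rate_thr *= num_pels_log2;
    }

Hitting both scaled bounds for PARTITION_NONE turns off the split and rectangular trials (do_split = do_rect = 0), which is the main speed win of this path.
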
@@ -2871,35 +3167,34 @@ static void nonrd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
store_pred_mv(x, ctx);
// PARTITION_SPLIT
- sum_rd = 0;
if (do_split) {
int pl = partition_plane_context(xd, mi_row, mi_col, bsize);
- sum_rate += cpi->partition_cost[pl][PARTITION_SPLIT];
+ sum_rdc.rate += cpi->partition_cost[pl][PARTITION_SPLIT];
+ sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist);
subsize = get_subsize(bsize, PARTITION_SPLIT);
- for (i = 0; i < 4 && sum_rd < best_rd; ++i) {
+ for (i = 0; i < 4 && sum_rdc.rdcost < best_rdc.rdcost; ++i) {
const int x_idx = (i & 1) * ms;
const int y_idx = (i >> 1) * ms;
if (mi_row + y_idx >= cm->mi_rows || mi_col + x_idx >= cm->mi_cols)
continue;
load_pred_mv(x, ctx);
- nonrd_pick_partition(cpi, tile, tp, mi_row + y_idx, mi_col + x_idx,
- subsize, &this_rate, &this_dist, 0,
- best_rd - sum_rd, pc_tree->split[i]);
+ nonrd_pick_partition(cpi, td, tile_data, tp,
+ mi_row + y_idx, mi_col + x_idx,
+ subsize, &this_rdc, 0,
+ best_rdc.rdcost - sum_rdc.rdcost, pc_tree->split[i]);
- if (this_rate == INT_MAX) {
- sum_rd = INT64_MAX;
+ if (this_rdc.rate == INT_MAX) {
+ vp9_rd_cost_reset(&sum_rdc);
} else {
- sum_rate += this_rate;
- sum_dist += this_dist;
- sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
+ sum_rdc.rate += this_rdc.rate;
+ sum_rdc.dist += this_rdc.dist;
+ sum_rdc.rdcost += this_rdc.rdcost;
}
}
- if (sum_rd < best_rd) {
- best_rate = sum_rate;
- best_dist = sum_dist;
- best_rd = sum_rd;
+ if (sum_rdc.rdcost < best_rdc.rdcost) {
+ best_rdc = sum_rdc;
pc_tree->partitioning = PARTITION_SPLIT;
} else {
// skip rectangular partition test when larger block size
@@ -2914,300 +3209,432 @@ static void nonrd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
subsize = get_subsize(bsize, PARTITION_HORZ);
if (sf->adaptive_motion_search)
load_pred_mv(x, ctx);
-
- nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col,
- &this_rate, &this_dist, subsize,
+ pc_tree->horizontal[0].pred_pixel_ready = 1;
+ nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc, subsize,
&pc_tree->horizontal[0]);
- pc_tree->horizontal[0].mic.mbmi = xd->mi[0].src_mi->mbmi;
+ pc_tree->horizontal[0].mic.mbmi = xd->mi[0]->mbmi;
pc_tree->horizontal[0].skip_txfm[0] = x->skip_txfm[0];
pc_tree->horizontal[0].skip = x->skip;
- sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
-
- if (sum_rd < best_rd && mi_row + ms < cm->mi_rows) {
+ if (sum_rdc.rdcost < best_rdc.rdcost && mi_row + ms < cm->mi_rows) {
load_pred_mv(x, ctx);
- nonrd_pick_sb_modes(cpi, tile, mi_row + ms, mi_col,
- &this_rate, &this_dist, subsize,
+ pc_tree->horizontal[1].pred_pixel_ready = 1;
+ nonrd_pick_sb_modes(cpi, tile_data, x, mi_row + ms, mi_col,
+ &this_rdc, subsize,
&pc_tree->horizontal[1]);
- pc_tree->horizontal[1].mic.mbmi = xd->mi[0].src_mi->mbmi;
+ pc_tree->horizontal[1].mic.mbmi = xd->mi[0]->mbmi;
pc_tree->horizontal[1].skip_txfm[0] = x->skip_txfm[0];
pc_tree->horizontal[1].skip = x->skip;
- if (this_rate == INT_MAX) {
- sum_rd = INT64_MAX;
+ if (this_rdc.rate == INT_MAX) {
+ vp9_rd_cost_reset(&sum_rdc);
} else {
int pl = partition_plane_context(xd, mi_row, mi_col, bsize);
- this_rate += cpi->partition_cost[pl][PARTITION_HORZ];
- sum_rate += this_rate;
- sum_dist += this_dist;
- sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
+ this_rdc.rate += cpi->partition_cost[pl][PARTITION_HORZ];
+ sum_rdc.rate += this_rdc.rate;
+ sum_rdc.dist += this_rdc.dist;
+ sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv,
+ sum_rdc.rate, sum_rdc.dist);
}
}
- if (sum_rd < best_rd) {
- best_rd = sum_rd;
- best_rate = sum_rate;
- best_dist = sum_dist;
+
+ if (sum_rdc.rdcost < best_rdc.rdcost) {
+ best_rdc = sum_rdc;
pc_tree->partitioning = PARTITION_HORZ;
+ } else {
+ pred_pixel_ready_reset(pc_tree, bsize);
}
}
// PARTITION_VERT
if (partition_vert_allowed && do_rect) {
subsize = get_subsize(bsize, PARTITION_VERT);
-
if (sf->adaptive_motion_search)
load_pred_mv(x, ctx);
-
- nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col,
- &this_rate, &this_dist, subsize,
+ pc_tree->vertical[0].pred_pixel_ready = 1;
+ nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc, subsize,
&pc_tree->vertical[0]);
- pc_tree->vertical[0].mic.mbmi = xd->mi[0].src_mi->mbmi;
+ pc_tree->vertical[0].mic.mbmi = xd->mi[0]->mbmi;
pc_tree->vertical[0].skip_txfm[0] = x->skip_txfm[0];
pc_tree->vertical[0].skip = x->skip;
- sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
- if (sum_rd < best_rd && mi_col + ms < cm->mi_cols) {
+
+ if (sum_rdc.rdcost < best_rdc.rdcost && mi_col + ms < cm->mi_cols) {
load_pred_mv(x, ctx);
- nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col + ms,
- &this_rate, &this_dist, subsize,
+ pc_tree->vertical[1].pred_pixel_ready = 1;
+ nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + ms,
+ &this_rdc, subsize,
&pc_tree->vertical[1]);
- pc_tree->vertical[1].mic.mbmi = xd->mi[0].src_mi->mbmi;
+ pc_tree->vertical[1].mic.mbmi = xd->mi[0]->mbmi;
pc_tree->vertical[1].skip_txfm[0] = x->skip_txfm[0];
pc_tree->vertical[1].skip = x->skip;
- if (this_rate == INT_MAX) {
- sum_rd = INT64_MAX;
+
+ if (this_rdc.rate == INT_MAX) {
+ vp9_rd_cost_reset(&sum_rdc);
} else {
int pl = partition_plane_context(xd, mi_row, mi_col, bsize);
- this_rate += cpi->partition_cost[pl][PARTITION_VERT];
- sum_rate += this_rate;
- sum_dist += this_dist;
- sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
+ sum_rdc.rate += cpi->partition_cost[pl][PARTITION_VERT];
+ sum_rdc.rate += this_rdc.rate;
+ sum_rdc.dist += this_rdc.dist;
+ sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv,
+ sum_rdc.rate, sum_rdc.dist);
}
}
- if (sum_rd < best_rd) {
- best_rate = sum_rate;
- best_dist = sum_dist;
- best_rd = sum_rd;
+
+ if (sum_rdc.rdcost < best_rdc.rdcost) {
+ best_rdc = sum_rdc;
pc_tree->partitioning = PARTITION_VERT;
+ } else {
+ pred_pixel_ready_reset(pc_tree, bsize);
}
}
- // TODO(JBB): The following line is here just to avoid a static warning
- // that occurs because at this point we never again reuse best_rd
- // despite setting it here. The code should be refactored to avoid this.
- (void) best_rd;
- *rate = best_rate;
- *dist = best_dist;
+ *rd_cost = best_rdc;
- if (best_rate == INT_MAX)
+ if (best_rdc.rate == INT_MAX) {
+ vp9_rd_cost_reset(rd_cost);
return;
+ }
// update mode info array
- subsize = get_subsize(bsize, pc_tree->partitioning);
- fill_mode_info_sb(cm, x, mi_row, mi_col, bsize, subsize,
- pc_tree);
+ fill_mode_info_sb(cm, x, mi_row, mi_col, bsize, pc_tree);
- if (best_rate < INT_MAX && best_dist < INT64_MAX && do_recon) {
+ if (best_rdc.rate < INT_MAX && best_rdc.dist < INT64_MAX && do_recon) {
int output_enabled = (bsize == BLOCK_64X64);
-
-      // Check the projected output rate for this SB against its target
-      // and, if necessary, apply a Q delta using segmentation to get
-      // closer to the target.
- if ((oxcf->aq_mode == COMPLEXITY_AQ) && cm->seg.update_map) {
- vp9_select_in_frame_q_segment(cpi, mi_row, mi_col, output_enabled,
- best_rate);
- }
-
- if (oxcf->aq_mode == CYCLIC_REFRESH_AQ)
- vp9_cyclic_refresh_set_rate_and_dist_sb(cpi->cyclic_refresh,
- best_rate, best_dist);
-
- encode_sb_rt(cpi, tile, tp, mi_row, mi_col, output_enabled, bsize, pc_tree);
+ encode_sb_rt(cpi, td, tile_info, tp, mi_row, mi_col, output_enabled,
+ bsize, pc_tree);
}
- if (bsize == BLOCK_64X64) {
+ if (bsize == BLOCK_64X64 && do_recon) {
assert(tp_orig < *tp);
- assert(best_rate < INT_MAX);
- assert(best_dist < INT64_MAX);
+ assert(best_rdc.rate < INT_MAX);
+ assert(best_rdc.dist < INT64_MAX);
} else {
assert(tp_orig == *tp);
}
}
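
The hunks above fold the loose (rate, dist, rd) triples into the encoder's RD_COST struct, so a candidate partitioning's cost moves through nonrd_pick_partition() as one value and "keep the best" becomes a single struct copy. A minimal sketch of the pattern, with an illustrative RDCOST macro (the real one lives in vp9_rd.h; the exact shift amounts below are assumptions):

    #include <limits.h>
    #include <stdint.h>

    typedef struct RD_COST {
      int rate;        /* bits */
      int64_t dist;    /* distortion */
      int64_t rdcost;  /* combined rate-distortion cost */
    } RD_COST;

    /* Illustrative stand-in for the RDCOST macro in vp9_rd.h. */
    #define RDCOST(rdmult, rddiv, R, D) \
      (((128 + (int64_t)(R) * (rdmult)) >> 8) + ((int64_t)(D) << (rddiv)))

    /* "Worst possible" sentinel, mirroring vp9_rd_cost_reset(). */
    static void rd_cost_reset(RD_COST *rd) {
      rd->rate = INT_MAX;
      rd->dist = INT64_MAX;
      rd->rdcost = INT64_MAX;
    }

    /* The comparison the loops above now do with one struct copy:
     * best_rdc = this_rdc whenever the candidate is cheaper. */
    static void rd_cost_keep_min(RD_COST *best, const RD_COST *cand) {
      if (cand->rdcost < best->rdcost) *best = *cand;
    }
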
+static void nonrd_select_partition(VP9_COMP *cpi,
+ ThreadData *td,
+ TileDataEnc *tile_data,
+ MODE_INFO **mi,
+ TOKENEXTRA **tp,
+ int mi_row, int mi_col,
+ BLOCK_SIZE bsize, int output_enabled,
+ RD_COST *rd_cost, PC_TREE *pc_tree) {
+ VP9_COMMON *const cm = &cpi->common;
+ TileInfo *const tile_info = &tile_data->tile_info;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const int bsl = b_width_log2_lookup[bsize], hbs = (1 << bsl) / 4;
+ const int mis = cm->mi_stride;
+ PARTITION_TYPE partition;
+ BLOCK_SIZE subsize;
+ RD_COST this_rdc;
+
+ vp9_rd_cost_reset(&this_rdc);
+ if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
+ return;
+
+ subsize = (bsize >= BLOCK_8X8) ? mi[0]->mbmi.sb_type : BLOCK_4X4;
+ partition = partition_lookup[bsl][subsize];
+
+ if (bsize == BLOCK_32X32 && partition != PARTITION_NONE &&
+ subsize >= BLOCK_16X16) {
+ x->max_partition_size = BLOCK_32X32;
+ x->min_partition_size = BLOCK_8X8;
+ nonrd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, bsize,
+ rd_cost, 0, INT64_MAX, pc_tree);
+ } else if (bsize == BLOCK_16X16 && partition != PARTITION_NONE) {
+ x->max_partition_size = BLOCK_16X16;
+ x->min_partition_size = BLOCK_8X8;
+ nonrd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, bsize,
+ rd_cost, 0, INT64_MAX, pc_tree);
+ } else {
+ switch (partition) {
+ case PARTITION_NONE:
+ pc_tree->none.pred_pixel_ready = 1;
+ nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, rd_cost,
+ subsize, &pc_tree->none);
+ pc_tree->none.mic.mbmi = xd->mi[0]->mbmi;
+ pc_tree->none.skip_txfm[0] = x->skip_txfm[0];
+ pc_tree->none.skip = x->skip;
+ break;
+ case PARTITION_VERT:
+ pc_tree->vertical[0].pred_pixel_ready = 1;
+ nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, rd_cost,
+ subsize, &pc_tree->vertical[0]);
+ pc_tree->vertical[0].mic.mbmi = xd->mi[0]->mbmi;
+ pc_tree->vertical[0].skip_txfm[0] = x->skip_txfm[0];
+ pc_tree->vertical[0].skip = x->skip;
+ if (mi_col + hbs < cm->mi_cols) {
+ pc_tree->vertical[1].pred_pixel_ready = 1;
+ nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + hbs,
+ &this_rdc, subsize, &pc_tree->vertical[1]);
+ pc_tree->vertical[1].mic.mbmi = xd->mi[0]->mbmi;
+ pc_tree->vertical[1].skip_txfm[0] = x->skip_txfm[0];
+ pc_tree->vertical[1].skip = x->skip;
+ if (this_rdc.rate != INT_MAX && this_rdc.dist != INT64_MAX &&
+ rd_cost->rate != INT_MAX && rd_cost->dist != INT64_MAX) {
+ rd_cost->rate += this_rdc.rate;
+ rd_cost->dist += this_rdc.dist;
+ }
+ }
+ break;
+ case PARTITION_HORZ:
+ pc_tree->horizontal[0].pred_pixel_ready = 1;
+ nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, rd_cost,
+ subsize, &pc_tree->horizontal[0]);
+ pc_tree->horizontal[0].mic.mbmi = xd->mi[0]->mbmi;
+ pc_tree->horizontal[0].skip_txfm[0] = x->skip_txfm[0];
+ pc_tree->horizontal[0].skip = x->skip;
+ if (mi_row + hbs < cm->mi_rows) {
+ pc_tree->horizontal[1].pred_pixel_ready = 1;
+ nonrd_pick_sb_modes(cpi, tile_data, x, mi_row + hbs, mi_col,
+ &this_rdc, subsize, &pc_tree->horizontal[1]);
+ pc_tree->horizontal[1].mic.mbmi = xd->mi[0]->mbmi;
+ pc_tree->horizontal[1].skip_txfm[0] = x->skip_txfm[0];
+ pc_tree->horizontal[1].skip = x->skip;
+ if (this_rdc.rate != INT_MAX && this_rdc.dist != INT64_MAX &&
+ rd_cost->rate != INT_MAX && rd_cost->dist != INT64_MAX) {
+ rd_cost->rate += this_rdc.rate;
+ rd_cost->dist += this_rdc.dist;
+ }
+ }
+ break;
+ case PARTITION_SPLIT:
+ subsize = get_subsize(bsize, PARTITION_SPLIT);
+ nonrd_select_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col,
+ subsize, output_enabled, rd_cost,
+ pc_tree->split[0]);
+ nonrd_select_partition(cpi, td, tile_data, mi + hbs, tp,
+ mi_row, mi_col + hbs, subsize, output_enabled,
+ &this_rdc, pc_tree->split[1]);
+ if (this_rdc.rate != INT_MAX && this_rdc.dist != INT64_MAX &&
+ rd_cost->rate != INT_MAX && rd_cost->dist != INT64_MAX) {
+ rd_cost->rate += this_rdc.rate;
+ rd_cost->dist += this_rdc.dist;
+ }
+ nonrd_select_partition(cpi, td, tile_data, mi + hbs * mis, tp,
+ mi_row + hbs, mi_col, subsize, output_enabled,
+ &this_rdc, pc_tree->split[2]);
+ if (this_rdc.rate != INT_MAX && this_rdc.dist != INT64_MAX &&
+ rd_cost->rate != INT_MAX && rd_cost->dist != INT64_MAX) {
+ rd_cost->rate += this_rdc.rate;
+ rd_cost->dist += this_rdc.dist;
+ }
+ nonrd_select_partition(cpi, td, tile_data, mi + hbs * mis + hbs, tp,
+ mi_row + hbs, mi_col + hbs, subsize,
+ output_enabled, &this_rdc, pc_tree->split[3]);
+ if (this_rdc.rate != INT_MAX && this_rdc.dist != INT64_MAX &&
+ rd_cost->rate != INT_MAX && rd_cost->dist != INT64_MAX) {
+ rd_cost->rate += this_rdc.rate;
+ rd_cost->dist += this_rdc.dist;
+ }
+ break;
+ default:
+ assert(0 && "Invalid partition type.");
+ break;
+ }
+ }
+
+ if (bsize == BLOCK_64X64 && output_enabled)
+ encode_sb_rt(cpi, td, tile_info, tp, mi_row, mi_col, 1, bsize, pc_tree);
+}
+
+
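
nonrd_select_partition() above repeats the same guarded accumulation four times: a sub-block's cost is folded into the running total only when neither side holds the INT_MAX/INT64_MAX "invalid" sentinel. The pattern as a helper (hypothetical -- the source writes the checks out longhand):

    #include <limits.h>
    #include <stdint.h>

    typedef struct { int rate; int64_t dist; int64_t rdcost; } RD_COST;

    /* Fold src into acc unless either side carries the invalid sentinel. */
    static void rd_cost_accumulate(RD_COST *acc, const RD_COST *src) {
      if (src->rate != INT_MAX && src->dist != INT64_MAX &&
          acc->rate != INT_MAX && acc->dist != INT64_MAX) {
        acc->rate += src->rate;
        acc->dist += src->dist;
      }
    }
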
static void nonrd_use_partition(VP9_COMP *cpi,
- const TileInfo *const tile,
- MODE_INFO *mi,
+ ThreadData *td,
+ TileDataEnc *tile_data,
+ MODE_INFO **mi,
TOKENEXTRA **tp,
int mi_row, int mi_col,
BLOCK_SIZE bsize, int output_enabled,
- int *totrate, int64_t *totdist,
- PC_TREE *pc_tree) {
+ RD_COST *dummy_cost, PC_TREE *pc_tree) {
VP9_COMMON *const cm = &cpi->common;
- MACROBLOCK *const x = &cpi->mb;
+ TileInfo *tile_info = &tile_data->tile_info;
+ MACROBLOCK *const x = &td->mb;
MACROBLOCKD *const xd = &x->e_mbd;
const int bsl = b_width_log2_lookup[bsize], hbs = (1 << bsl) / 4;
const int mis = cm->mi_stride;
PARTITION_TYPE partition;
BLOCK_SIZE subsize;
- int rate = INT_MAX;
- int64_t dist = INT64_MAX;
if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
return;
- subsize = (bsize >= BLOCK_8X8) ? mi[0].src_mi->mbmi.sb_type : BLOCK_4X4;
+ subsize = (bsize >= BLOCK_8X8) ? mi[0]->mbmi.sb_type : BLOCK_4X4;
partition = partition_lookup[bsl][subsize];
+ if (output_enabled && bsize != BLOCK_4X4) {
+ int ctx = partition_plane_context(xd, mi_row, mi_col, bsize);
+ td->counts->partition[ctx][partition]++;
+ }
+
switch (partition) {
case PARTITION_NONE:
- nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col, totrate, totdist,
+ pc_tree->none.pred_pixel_ready = 1;
+ nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, dummy_cost,
subsize, &pc_tree->none);
- pc_tree->none.mic.mbmi = xd->mi[0].src_mi->mbmi;
+ pc_tree->none.mic.mbmi = xd->mi[0]->mbmi;
pc_tree->none.skip_txfm[0] = x->skip_txfm[0];
pc_tree->none.skip = x->skip;
+ encode_b_rt(cpi, td, tile_info, tp, mi_row, mi_col, output_enabled,
+ subsize, &pc_tree->none);
break;
case PARTITION_VERT:
- nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col, totrate, totdist,
+ pc_tree->vertical[0].pred_pixel_ready = 1;
+ nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, dummy_cost,
subsize, &pc_tree->vertical[0]);
- pc_tree->vertical[0].mic.mbmi = xd->mi[0].src_mi->mbmi;
+ pc_tree->vertical[0].mic.mbmi = xd->mi[0]->mbmi;
pc_tree->vertical[0].skip_txfm[0] = x->skip_txfm[0];
pc_tree->vertical[0].skip = x->skip;
- if (mi_col + hbs < cm->mi_cols) {
- nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col + hbs,
- &rate, &dist, subsize, &pc_tree->vertical[1]);
- pc_tree->vertical[1].mic.mbmi = xd->mi[0].src_mi->mbmi;
+ encode_b_rt(cpi, td, tile_info, tp, mi_row, mi_col, output_enabled,
+ subsize, &pc_tree->vertical[0]);
+ if (mi_col + hbs < cm->mi_cols && bsize > BLOCK_8X8) {
+ pc_tree->vertical[1].pred_pixel_ready = 1;
+ nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + hbs,
+ dummy_cost, subsize, &pc_tree->vertical[1]);
+ pc_tree->vertical[1].mic.mbmi = xd->mi[0]->mbmi;
pc_tree->vertical[1].skip_txfm[0] = x->skip_txfm[0];
pc_tree->vertical[1].skip = x->skip;
- if (rate != INT_MAX && dist != INT64_MAX &&
- *totrate != INT_MAX && *totdist != INT64_MAX) {
- *totrate += rate;
- *totdist += dist;
- }
+ encode_b_rt(cpi, td, tile_info, tp, mi_row, mi_col + hbs,
+ output_enabled, subsize, &pc_tree->vertical[1]);
}
break;
case PARTITION_HORZ:
- nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col, totrate, totdist,
+ pc_tree->horizontal[0].pred_pixel_ready = 1;
+ nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, dummy_cost,
subsize, &pc_tree->horizontal[0]);
- pc_tree->horizontal[0].mic.mbmi = xd->mi[0].src_mi->mbmi;
+ pc_tree->horizontal[0].mic.mbmi = xd->mi[0]->mbmi;
pc_tree->horizontal[0].skip_txfm[0] = x->skip_txfm[0];
pc_tree->horizontal[0].skip = x->skip;
- if (mi_row + hbs < cm->mi_rows) {
- nonrd_pick_sb_modes(cpi, tile, mi_row + hbs, mi_col,
- &rate, &dist, subsize, &pc_tree->horizontal[0]);
- pc_tree->horizontal[1].mic.mbmi = xd->mi[0].src_mi->mbmi;
+ encode_b_rt(cpi, td, tile_info, tp, mi_row, mi_col, output_enabled,
+ subsize, &pc_tree->horizontal[0]);
+
+ if (mi_row + hbs < cm->mi_rows && bsize > BLOCK_8X8) {
+ pc_tree->horizontal[1].pred_pixel_ready = 1;
+ nonrd_pick_sb_modes(cpi, tile_data, x, mi_row + hbs, mi_col,
+ dummy_cost, subsize, &pc_tree->horizontal[1]);
+ pc_tree->horizontal[1].mic.mbmi = xd->mi[0]->mbmi;
pc_tree->horizontal[1].skip_txfm[0] = x->skip_txfm[0];
pc_tree->horizontal[1].skip = x->skip;
- if (rate != INT_MAX && dist != INT64_MAX &&
- *totrate != INT_MAX && *totdist != INT64_MAX) {
- *totrate += rate;
- *totdist += dist;
- }
+ encode_b_rt(cpi, td, tile_info, tp, mi_row + hbs, mi_col,
+ output_enabled, subsize, &pc_tree->horizontal[1]);
}
break;
case PARTITION_SPLIT:
subsize = get_subsize(bsize, PARTITION_SPLIT);
- nonrd_use_partition(cpi, tile, mi, tp, mi_row, mi_col,
- subsize, output_enabled, totrate, totdist,
- pc_tree->split[0]);
- nonrd_use_partition(cpi, tile, mi + hbs, tp,
- mi_row, mi_col + hbs, subsize, output_enabled,
- &rate, &dist, pc_tree->split[1]);
- if (rate != INT_MAX && dist != INT64_MAX &&
- *totrate != INT_MAX && *totdist != INT64_MAX) {
- *totrate += rate;
- *totdist += dist;
- }
- nonrd_use_partition(cpi, tile, mi + hbs * mis, tp,
- mi_row + hbs, mi_col, subsize, output_enabled,
- &rate, &dist, pc_tree->split[2]);
- if (rate != INT_MAX && dist != INT64_MAX &&
- *totrate != INT_MAX && *totdist != INT64_MAX) {
- *totrate += rate;
- *totdist += dist;
- }
- nonrd_use_partition(cpi, tile, mi + hbs * mis + hbs, tp,
- mi_row + hbs, mi_col + hbs, subsize, output_enabled,
- &rate, &dist, pc_tree->split[3]);
- if (rate != INT_MAX && dist != INT64_MAX &&
- *totrate != INT_MAX && *totdist != INT64_MAX) {
- *totrate += rate;
- *totdist += dist;
+ if (bsize == BLOCK_8X8) {
+ nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, dummy_cost,
+ subsize, pc_tree->leaf_split[0]);
+ encode_b_rt(cpi, td, tile_info, tp, mi_row, mi_col,
+ output_enabled, subsize, pc_tree->leaf_split[0]);
+ } else {
+ nonrd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col,
+ subsize, output_enabled, dummy_cost,
+ pc_tree->split[0]);
+ nonrd_use_partition(cpi, td, tile_data, mi + hbs, tp,
+ mi_row, mi_col + hbs, subsize, output_enabled,
+ dummy_cost, pc_tree->split[1]);
+ nonrd_use_partition(cpi, td, tile_data, mi + hbs * mis, tp,
+ mi_row + hbs, mi_col, subsize, output_enabled,
+ dummy_cost, pc_tree->split[2]);
+ nonrd_use_partition(cpi, td, tile_data, mi + hbs * mis + hbs, tp,
+ mi_row + hbs, mi_col + hbs, subsize, output_enabled,
+ dummy_cost, pc_tree->split[3]);
}
break;
default:
- assert("Invalid partition type.");
+ assert(0 && "Invalid partition type.");
break;
}
- if (bsize == BLOCK_64X64 && output_enabled) {
- if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
- vp9_cyclic_refresh_set_rate_and_dist_sb(cpi->cyclic_refresh,
- *totrate, *totdist);
- encode_sb_rt(cpi, tile, tp, mi_row, mi_col, 1, bsize, pc_tree);
- }
+ if (partition != PARTITION_SPLIT || bsize == BLOCK_8X8)
+ update_partition_context(xd, mi_row, mi_col, subsize, bsize);
}
-static void encode_nonrd_sb_row(VP9_COMP *cpi, const TileInfo *const tile,
- int mi_row, TOKENEXTRA **tp) {
+static void encode_nonrd_sb_row(VP9_COMP *cpi,
+ ThreadData *td,
+ TileDataEnc *tile_data,
+ int mi_row,
+ TOKENEXTRA **tp) {
SPEED_FEATURES *const sf = &cpi->sf;
VP9_COMMON *const cm = &cpi->common;
- MACROBLOCK *const x = &cpi->mb;
+ TileInfo *const tile_info = &tile_data->tile_info;
+ MACROBLOCK *const x = &td->mb;
MACROBLOCKD *const xd = &x->e_mbd;
int mi_col;
// Initialize the left context for the new SB row
- vpx_memset(&xd->left_context, 0, sizeof(xd->left_context));
- vpx_memset(xd->left_seg_context, 0, sizeof(xd->left_seg_context));
+ memset(&xd->left_context, 0, sizeof(xd->left_context));
+ memset(xd->left_seg_context, 0, sizeof(xd->left_seg_context));
// Code each SB in the row
- for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
+ for (mi_col = tile_info->mi_col_start; mi_col < tile_info->mi_col_end;
mi_col += MI_BLOCK_SIZE) {
- int dummy_rate = 0;
- int64_t dummy_dist = 0;
+ const struct segmentation *const seg = &cm->seg;
+ RD_COST dummy_rdc;
const int idx_str = cm->mi_stride * mi_row + mi_col;
- MODE_INFO *mi = cm->mi + idx_str;
- BLOCK_SIZE bsize;
- x->in_static_area = 0;
+ MODE_INFO **mi = cm->mi_grid_visible + idx_str;
+ PARTITION_SEARCH_TYPE partition_search_type = sf->partition_search_type;
+ BLOCK_SIZE bsize = BLOCK_64X64;
+ int seg_skip = 0;
x->source_variance = UINT_MAX;
vp9_zero(x->pred_mv);
+ vp9_rd_cost_init(&dummy_rdc);
+ x->color_sensitivity[0] = 0;
+ x->color_sensitivity[1] = 0;
+
+ if (seg->enabled) {
+ const uint8_t *const map = seg->update_map ? cpi->segmentation_map
+ : cm->last_frame_seg_map;
+ int segment_id = vp9_get_segment_id(cm, map, BLOCK_64X64, mi_row, mi_col);
+ seg_skip = vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP);
+ if (seg_skip) {
+ partition_search_type = FIXED_PARTITION;
+ }
+ }
// Set the partition type of the 64X64 block
- switch (sf->partition_search_type) {
+ switch (partition_search_type) {
case VAR_BASED_PARTITION:
- choose_partitioning(cpi, tile, mi_row, mi_col);
- nonrd_use_partition(cpi, tile, mi, tp, mi_row, mi_col, BLOCK_64X64,
- 1, &dummy_rate, &dummy_dist, cpi->pc_root);
+ // TODO(jingning, marpan): The mode decision and encoding process
+ // support both intra and inter sub8x8 block coding for RTC mode.
+ // Tune the thresholds accordingly to use sub8x8 block coding for
+ // coding performance improvement.
+ choose_partitioning(cpi, tile_info, x, mi_row, mi_col);
+ nonrd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col,
+ BLOCK_64X64, 1, &dummy_rdc, td->pc_root);
break;
case SOURCE_VAR_BASED_PARTITION:
- set_source_var_based_partition(cpi, tile, mi, mi_row, mi_col);
- nonrd_use_partition(cpi, tile, mi, tp, mi_row, mi_col, BLOCK_64X64,
- 1, &dummy_rate, &dummy_dist, cpi->pc_root);
+ set_source_var_based_partition(cpi, tile_info, x, mi, mi_row, mi_col);
+ nonrd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col,
+ BLOCK_64X64, 1, &dummy_rdc, td->pc_root);
break;
case FIXED_PARTITION:
- bsize = sf->partition_search_type == FIXED_PARTITION ?
- sf->always_this_block_size :
- get_nonrd_var_based_fixed_partition(cpi, mi_row, mi_col);
- set_fixed_partitioning(cpi, tile, mi, mi_row, mi_col, bsize);
- nonrd_use_partition(cpi, tile, mi, tp, mi_row, mi_col, BLOCK_64X64,
- 1, &dummy_rate, &dummy_dist, cpi->pc_root);
+ if (!seg_skip)
+ bsize = sf->always_this_block_size;
+ set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize);
+ nonrd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col,
+ BLOCK_64X64, 1, &dummy_rdc, td->pc_root);
break;
case REFERENCE_PARTITION:
- if (sf->partition_check ||
- !(x->in_static_area = is_background(cpi, tile, mi_row, mi_col))) {
- set_modeinfo_offsets(cm, xd, mi_row, mi_col);
- auto_partition_range(cpi, tile, mi_row, mi_col,
- &sf->min_partition_size,
- &sf->max_partition_size);
- nonrd_pick_partition(cpi, tile, tp, mi_row, mi_col, BLOCK_64X64,
- &dummy_rate, &dummy_dist, 1, INT64_MAX,
- cpi->pc_root);
+ set_offsets(cpi, tile_info, x, mi_row, mi_col, BLOCK_64X64);
+ if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled &&
+ xd->mi[0]->mbmi.segment_id) {
+ x->max_partition_size = BLOCK_64X64;
+ x->min_partition_size = BLOCK_8X8;
+ nonrd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col,
+ BLOCK_64X64, &dummy_rdc, 1,
+ INT64_MAX, td->pc_root);
} else {
- choose_partitioning(cpi, tile, mi_row, mi_col);
- nonrd_use_partition(cpi, tile, mi, tp, mi_row, mi_col,
- BLOCK_64X64, 1, &dummy_rate, &dummy_dist,
- cpi->pc_root);
+ choose_partitioning(cpi, tile_info, x, mi_row, mi_col);
+ nonrd_select_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col,
+ BLOCK_64X64, 1, &dummy_rdc, td->pc_root);
}
+
break;
default:
assert(0);
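
encode_nonrd_sb_row() now consults the segmentation map before picking a search strategy: when the 64x64 block's segment has SEG_LVL_SKIP active, the partition search is demoted to FIXED_PARTITION and the whole block is coded at BLOCK_64X64. A standalone model of that test (the feature-mask layout and the ordinal of SEG_LVL_SKIP are assumptions patterned on vp9_seg_common.h):

    #include <stdint.h>

    enum { SEG_LVL_SKIP = 3 };  /* assumed ordinal */

    typedef struct {
      int enabled;
      uint32_t feature_mask[8];  /* per-segment bitmask of active features */
    } SegModel;

    /* Mirrors the seg_skip check added above: if SKIP is active for this
     * segment, skip the partition search and use one fixed partition. */
    static int seg_skip_active(const SegModel *seg, int segment_id) {
      return seg->enabled &&
             (seg->feature_mask[segment_id] & (1u << SEG_LVL_SKIP)) != 0;
    }
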
@@ -3230,13 +3657,13 @@ static int set_var_thresh_from_histogram(VP9_COMP *cpi) {
const int cutoff = (MIN(cm->width, cm->height) >= 720) ?
(cm->MBs * VAR_HIST_LARGE_CUT_OFF / 100) :
(cm->MBs * VAR_HIST_SMALL_CUT_OFF / 100);
- DECLARE_ALIGNED_ARRAY(16, int, hist, VAR_HIST_BINS);
+ DECLARE_ALIGNED(16, int, hist[VAR_HIST_BINS]);
diff *var16 = cpi->source_diff_var;
int sum = 0;
int i, j;
- vpx_memset(hist, 0, VAR_HIST_BINS * sizeof(hist[0]));
+ memset(hist, 0, VAR_HIST_BINS * sizeof(hist[0]));
for (i = 0; i < cm->mb_rows; i++) {
for (j = 0; j < cm->mb_cols; j++) {
@@ -3315,9 +3742,9 @@ static void source_var_based_partition_search_method(VP9_COMP *cpi) {
if (cpi->source_diff_var)
vpx_free(cpi->source_diff_var);
- CHECK_MEM_ERROR(cm, cpi->source_diff_var,
- vpx_calloc(cm->MBs, sizeof(diff)));
- }
+ CHECK_MEM_ERROR(cm, cpi->source_diff_var,
+ vpx_calloc(cm->MBs, sizeof(diff)));
+ }
if (!cpi->frames_till_next_var_check)
cpi->frames_till_next_var_check = set_var_thresh_from_histogram(cpi);
@@ -3329,13 +3756,13 @@ static void source_var_based_partition_search_method(VP9_COMP *cpi) {
}
}
-static int get_skip_encode_frame(const VP9_COMMON *cm) {
+static int get_skip_encode_frame(const VP9_COMMON *cm, ThreadData *const td) {
unsigned int intra_count = 0, inter_count = 0;
int j;
for (j = 0; j < INTRA_INTER_CONTEXTS; ++j) {
- intra_count += cm->counts.intra_inter[j][0];
- inter_count += cm->counts.intra_inter[j][1];
+ intra_count += td->counts->intra_inter[j][0];
+ inter_count += td->counts->intra_inter[j][1];
}
return (intra_count << 2) < inter_count &&
@@ -3343,47 +3770,80 @@ static int get_skip_encode_frame(const VP9_COMMON *cm) {
cm->show_frame;
}
-static void encode_tiles(VP9_COMP *cpi) {
- const VP9_COMMON *const cm = &cpi->common;
+void vp9_init_tile_data(VP9_COMP *cpi) {
+ VP9_COMMON *const cm = &cpi->common;
const int tile_cols = 1 << cm->log2_tile_cols;
const int tile_rows = 1 << cm->log2_tile_rows;
-
int tile_col, tile_row;
- TileInfo tile[4][1 << 6];
- TOKENEXTRA *tok[4][1 << 6];
- TOKENEXTRA *pre_tok = cpi->tok;
+ TOKENEXTRA *pre_tok = cpi->tile_tok[0][0];
int tile_tok = 0;
- for (tile_row = 0; tile_row < tile_rows; ++tile_row) {
- for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
- vp9_tile_init(&tile[tile_row][tile_col], cm, tile_row, tile_col);
-
- tok[tile_row][tile_col] = pre_tok + tile_tok;
- pre_tok = tok[tile_row][tile_col];
- tile_tok = allocated_tokens(tile[tile_row][tile_col]);
- }
+ if (cpi->tile_data == NULL) {
+ CHECK_MEM_ERROR(cm, cpi->tile_data,
+ vpx_malloc(tile_cols * tile_rows * sizeof(*cpi->tile_data)));
+ for (tile_row = 0; tile_row < tile_rows; ++tile_row)
+ for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
+ TileDataEnc *tile_data =
+ &cpi->tile_data[tile_row * tile_cols + tile_col];
+ int i, j;
+ for (i = 0; i < BLOCK_SIZES; ++i) {
+ for (j = 0; j < MAX_MODES; ++j) {
+ tile_data->thresh_freq_fact[i][j] = 32;
+ tile_data->mode_map[i][j] = j;
+ }
+ }
+ }
}
for (tile_row = 0; tile_row < tile_rows; ++tile_row) {
for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
- const TileInfo * const ptile = &tile[tile_row][tile_col];
- TOKENEXTRA * const old_tok = tok[tile_row][tile_col];
- int mi_row;
+ TileInfo *tile_info =
+ &cpi->tile_data[tile_row * tile_cols + tile_col].tile_info;
+ vp9_tile_init(tile_info, cm, tile_row, tile_col);
- for (mi_row = ptile->mi_row_start; mi_row < ptile->mi_row_end;
- mi_row += MI_BLOCK_SIZE) {
- if (cpi->sf.use_nonrd_pick_mode && !frame_is_intra_only(cm))
- encode_nonrd_sb_row(cpi, ptile, mi_row, &tok[tile_row][tile_col]);
- else
- encode_rd_sb_row(cpi, ptile, mi_row, &tok[tile_row][tile_col]);
- }
- cpi->tok_count[tile_row][tile_col] =
- (unsigned int)(tok[tile_row][tile_col] - old_tok);
- assert(tok[tile_row][tile_col] - old_tok <= allocated_tokens(*ptile));
+ cpi->tile_tok[tile_row][tile_col] = pre_tok + tile_tok;
+ pre_tok = cpi->tile_tok[tile_row][tile_col];
+ tile_tok = allocated_tokens(*tile_info);
}
}
}
+void vp9_encode_tile(VP9_COMP *cpi, ThreadData *td,
+ int tile_row, int tile_col) {
+ VP9_COMMON *const cm = &cpi->common;
+ const int tile_cols = 1 << cm->log2_tile_cols;
+ TileDataEnc *this_tile =
+ &cpi->tile_data[tile_row * tile_cols + tile_col];
+ const TileInfo * const tile_info = &this_tile->tile_info;
+ TOKENEXTRA *tok = cpi->tile_tok[tile_row][tile_col];
+ int mi_row;
+
+ for (mi_row = tile_info->mi_row_start; mi_row < tile_info->mi_row_end;
+ mi_row += MI_BLOCK_SIZE) {
+ if (cpi->sf.use_nonrd_pick_mode)
+ encode_nonrd_sb_row(cpi, td, this_tile, mi_row, &tok);
+ else
+ encode_rd_sb_row(cpi, td, this_tile, mi_row, &tok);
+ }
+ cpi->tok_count[tile_row][tile_col] =
+ (unsigned int)(tok - cpi->tile_tok[tile_row][tile_col]);
+ assert(tok - cpi->tile_tok[tile_row][tile_col] <=
+ allocated_tokens(*tile_info));
+}
+
+static void encode_tiles(VP9_COMP *cpi) {
+ VP9_COMMON *const cm = &cpi->common;
+ const int tile_cols = 1 << cm->log2_tile_cols;
+ const int tile_rows = 1 << cm->log2_tile_rows;
+ int tile_col, tile_row;
+
+ vp9_init_tile_data(cpi);
+
+ for (tile_row = 0; tile_row < tile_rows; ++tile_row)
+ for (tile_col = 0; tile_col < tile_cols; ++tile_col)
+ vp9_encode_tile(cpi, &cpi->td, tile_row, tile_col);
+}
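
vp9_init_tile_data() hoists per-tile state out of encode_tiles()'s stack arrays into the heap-allocated cpi->tile_data, addressed row-major so worker threads can later reach any tile independently:

    /* Row-major addressing used for cpi->tile_data above: tile (r, c) in
     * a grid with tile_cols columns lives at one flat index. */
    static int tile_index(int tile_row, int tile_col, int tile_cols) {
      return tile_row * tile_cols + tile_col;
    }

With tile_cols = 1 << cm->log2_tile_cols, &cpi->tile_data[tile_index(r, c, tile_cols)] names one tile's TileDataEnc regardless of which thread asks.
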
+
#if CONFIG_FP_MB_STATS
static int input_fpmb_stats(FIRSTPASS_MB_STATS *firstpass_mb_stats,
VP9_COMMON *cm, uint8_t **this_frame_mb_stats) {
@@ -3402,18 +3862,20 @@ static int input_fpmb_stats(FIRSTPASS_MB_STATS *firstpass_mb_stats,
static void encode_frame_internal(VP9_COMP *cpi) {
SPEED_FEATURES *const sf = &cpi->sf;
RD_OPT *const rd_opt = &cpi->rd;
- MACROBLOCK *const x = &cpi->mb;
+ ThreadData *const td = &cpi->td;
+ MACROBLOCK *const x = &td->mb;
VP9_COMMON *const cm = &cpi->common;
MACROBLOCKD *const xd = &x->e_mbd;
+ RD_COUNTS *const rdc = &cpi->td.rd_counts;
- xd->mi = cm->mi;
- xd->mi[0].src_mi = &xd->mi[0];
+ xd->mi = cm->mi_grid_visible;
+ xd->mi[0] = cm->mi;
- vp9_zero(cm->counts);
- vp9_zero(cpi->coef_counts);
- vp9_zero(rd_opt->comp_pred_diff);
- vp9_zero(rd_opt->filter_diff);
- vp9_zero(rd_opt->tx_select_diff);
+ vp9_zero(*td->counts);
+ vp9_zero(rdc->coef_counts);
+ vp9_zero(rdc->comp_pred_diff);
+ vp9_zero(rdc->filter_diff);
+ vp9_zero(rdc->tx_select_diff);
vp9_zero(rd_opt->tx_select_threshes);
xd->lossless = cm->base_qindex == 0 &&
@@ -3421,13 +3883,11 @@ static void encode_frame_internal(VP9_COMP *cpi) {
cm->uv_dc_delta_q == 0 &&
cm->uv_ac_delta_q == 0;
- cm->tx_mode = select_tx_mode(cpi);
-
#if CONFIG_VP9_HIGHBITDEPTH
if (cm->use_highbitdepth)
- x->fwd_txm4x4 = xd->lossless ? vp9_fwht4x4 : vp9_fdct4x4;
- else
x->fwd_txm4x4 = xd->lossless ? vp9_highbd_fwht4x4 : vp9_highbd_fdct4x4;
+ else
+ x->fwd_txm4x4 = xd->lossless ? vp9_fwht4x4 : vp9_fdct4x4;
x->highbd_itxm_add = xd->lossless ? vp9_highbd_iwht4x4_add :
vp9_highbd_idct4x4_add;
#else
@@ -3435,18 +3895,25 @@ static void encode_frame_internal(VP9_COMP *cpi) {
#endif // CONFIG_VP9_HIGHBITDEPTH
x->itxm_add = xd->lossless ? vp9_iwht4x4_add : vp9_idct4x4_add;
- if (xd->lossless) {
+ if (xd->lossless)
x->optimize = 0;
- cm->lf.filter_level = 0;
- cpi->zbin_mode_boost_enabled = 0;
- }
+
+ cm->tx_mode = select_tx_mode(cpi, xd);
vp9_frame_init_quantizer(cpi);
vp9_initialize_rd_consts(cpi);
- vp9_initialize_me_consts(cpi, cm->base_qindex);
+ vp9_initialize_me_consts(cpi, x, cm->base_qindex);
init_encode_frame_mb_context(cpi);
- set_prev_mi(cm);
+ cm->use_prev_frame_mvs = !cm->error_resilient_mode &&
+ cm->width == cm->last_width &&
+ cm->height == cm->last_height &&
+ !cm->intra_only &&
+ cm->last_show_frame;
+ // Special case: set prev_mi to NULL when the previous mode info
+ // context cannot be used.
+ cm->prev_mi = cm->use_prev_frame_mvs ?
+ cm->prev_mip + cm->mi_stride + 1 : NULL;
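
The replacement for set_prev_mi() spells out when the previous frame's motion vectors are reusable: error resilience off, unchanged frame dimensions, not intra-only, and the previous frame actually shown. As a standalone predicate (a sketch; the source assigns the expression inline):

    /* Restates the cm->use_prev_frame_mvs condition above. */
    static int can_use_prev_frame_mvs(int error_resilient_mode,
                                      int width, int last_width,
                                      int height, int last_height,
                                      int intra_only, int last_show_frame) {
      return !error_resilient_mode &&
             width == last_width && height == last_height &&
             !intra_only && last_show_frame;
    }
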
x->quant_fp = cpi->sf.use_quant_fp;
vp9_zero(x->skip_txfm);
@@ -3456,7 +3923,7 @@ static void encode_frame_internal(VP9_COMP *cpi) {
int i;
struct macroblock_plane *const p = x->plane;
struct macroblockd_plane *const pd = xd->plane;
- PICK_MODE_CONTEXT *ctx = &cpi->pc_root->none;
+ PICK_MODE_CONTEXT *ctx = &cpi->td.pc_root->none;
for (i = 0; i < MAX_MB_PLANE; ++i) {
p[i].coeff = ctx->coeff_pbuf[i][0];
@@ -3466,6 +3933,9 @@ static void encode_frame_internal(VP9_COMP *cpi) {
}
vp9_zero(x->zcoeff_blk);
+ if (cm->frame_type != KEY_FRAME && cpi->rc.frames_since_golden == 0)
+ cpi->ref_frame_flags &= (~VP9_GOLD_FLAG);
+
if (sf->partition_search_type == SOURCE_VAR_BASED_PARTITION)
source_var_based_partition_search_method(cpi);
}
@@ -3481,13 +3951,18 @@ static void encode_frame_internal(VP9_COMP *cpi) {
}
#endif
- encode_tiles(cpi);
+  // If allowed, encode tiles in parallel, one thread handling one tile.
+ if (MIN(cpi->oxcf.max_threads, 1 << cm->log2_tile_cols) > 1)
+ vp9_encode_tiles_mt(cpi);
+ else
+ encode_tiles(cpi);
vpx_usec_timer_mark(&emr_timer);
cpi->time_encode_sb_row += vpx_usec_timer_elapsed(&emr_timer);
}
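
Tile encoding forks into vp9_encode_tiles_mt() only when more than one worker is actually usable; the thread count is capped by the number of tile columns, since one thread owns one tile column. The sizing rule in isolation:

    /* At most one worker per tile column, capped by the configured
     * maximum; multithreading engages only when this exceeds 1. */
    static int num_tile_workers(int max_threads, int log2_tile_cols) {
      const int tile_cols = 1 << log2_tile_cols;
      return max_threads < tile_cols ? max_threads : tile_cols;
    }
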
- sf->skip_encode_frame = sf->skip_encode_sb ? get_skip_encode_frame(cm) : 0;
+ sf->skip_encode_frame = sf->skip_encode_sb ?
+ get_skip_encode_frame(cm, td) : 0;
#if 0
// Keep record of the total distortion this time around for future use
@@ -3514,7 +3989,6 @@ static INTERP_FILTER get_interp_filter(
void vp9_encode_frame(VP9_COMP *cpi) {
VP9_COMMON *const cm = &cpi->common;
- RD_OPT *const rd_opt = &cpi->rd;
// In the longer term the encoder should be generalized to match the
// decoder such that we allow compound where one of the 3 buffers has a
@@ -3527,9 +4001,9 @@ void vp9_encode_frame(VP9_COMP *cpi) {
cm->ref_frame_sign_bias[GOLDEN_FRAME]) ||
(cm->ref_frame_sign_bias[ALTREF_FRAME] ==
cm->ref_frame_sign_bias[LAST_FRAME])) {
- cm->allow_comp_inter_inter = 0;
+ cpi->allow_comp_inter_inter = 0;
} else {
- cm->allow_comp_inter_inter = 1;
+ cpi->allow_comp_inter_inter = 1;
cm->comp_fixed_ref = ALTREF_FRAME;
cm->comp_var_ref[0] = LAST_FRAME;
cm->comp_var_ref[1] = GOLDEN_FRAME;
@@ -3538,6 +4012,9 @@ void vp9_encode_frame(VP9_COMP *cpi) {
if (cpi->sf.frame_parameter_update) {
int i;
+ RD_OPT *const rd_opt = &cpi->rd;
+ FRAME_COUNTS *counts = cpi->td.counts;
+ RD_COUNTS *const rdc = &cpi->td.rd_counts;
// This code does a single RD pass over the whole frame assuming
// either compound, single or hybrid prediction as per whatever has
@@ -3553,7 +4030,7 @@ void vp9_encode_frame(VP9_COMP *cpi) {
const int is_alt_ref = frame_type == ALTREF_FRAME;
/* prediction (compound, single or hybrid) mode selection */
- if (is_alt_ref || !cm->allow_comp_inter_inter)
+ if (is_alt_ref || !cpi->allow_comp_inter_inter)
cm->reference_mode = SINGLE_REFERENCE;
else if (mode_thrs[COMPOUND_REFERENCE] > mode_thrs[SINGLE_REFERENCE] &&
mode_thrs[COMPOUND_REFERENCE] >
@@ -3572,15 +4049,16 @@ void vp9_encode_frame(VP9_COMP *cpi) {
encode_frame_internal(cpi);
for (i = 0; i < REFERENCE_MODES; ++i)
- mode_thrs[i] = (mode_thrs[i] + rd_opt->comp_pred_diff[i] / cm->MBs) / 2;
+ mode_thrs[i] = (mode_thrs[i] + rdc->comp_pred_diff[i] / cm->MBs) / 2;
for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
- filter_thrs[i] = (filter_thrs[i] + rd_opt->filter_diff[i] / cm->MBs) / 2;
+ filter_thrs[i] = (filter_thrs[i] + rdc->filter_diff[i] / cm->MBs) / 2;
for (i = 0; i < TX_MODES; ++i) {
- int64_t pd = rd_opt->tx_select_diff[i];
+ int64_t pd = rdc->tx_select_diff[i];
if (i == TX_MODE_SELECT)
- pd -= RDCOST(cpi->mb.rdmult, cpi->mb.rddiv, 2048 * (TX_SIZES - 1), 0);
+ pd -= RDCOST(cpi->td.mb.rdmult, cpi->td.mb.rddiv, 2048 * (TX_SIZES - 1),
+ 0);
tx_thrs[i] = (tx_thrs[i] + (int)(pd / cm->MBs)) / 2;
}
@@ -3589,16 +4067,16 @@ void vp9_encode_frame(VP9_COMP *cpi) {
int comp_count_zero = 0;
for (i = 0; i < COMP_INTER_CONTEXTS; i++) {
- single_count_zero += cm->counts.comp_inter[i][0];
- comp_count_zero += cm->counts.comp_inter[i][1];
+ single_count_zero += counts->comp_inter[i][0];
+ comp_count_zero += counts->comp_inter[i][1];
}
if (comp_count_zero == 0) {
cm->reference_mode = SINGLE_REFERENCE;
- vp9_zero(cm->counts.comp_inter);
+ vp9_zero(counts->comp_inter);
} else if (single_count_zero == 0) {
cm->reference_mode = COMPOUND_REFERENCE;
- vp9_zero(cm->counts.comp_inter);
+ vp9_zero(counts->comp_inter);
}
}
@@ -3609,19 +4087,18 @@ void vp9_encode_frame(VP9_COMP *cpi) {
int count32x32 = 0;
for (i = 0; i < TX_SIZE_CONTEXTS; ++i) {
- count4x4 += cm->counts.tx.p32x32[i][TX_4X4];
- count4x4 += cm->counts.tx.p16x16[i][TX_4X4];
- count4x4 += cm->counts.tx.p8x8[i][TX_4X4];
+ count4x4 += counts->tx.p32x32[i][TX_4X4];
+ count4x4 += counts->tx.p16x16[i][TX_4X4];
+ count4x4 += counts->tx.p8x8[i][TX_4X4];
- count8x8_lp += cm->counts.tx.p32x32[i][TX_8X8];
- count8x8_lp += cm->counts.tx.p16x16[i][TX_8X8];
- count8x8_8x8p += cm->counts.tx.p8x8[i][TX_8X8];
+ count8x8_lp += counts->tx.p32x32[i][TX_8X8];
+ count8x8_lp += counts->tx.p16x16[i][TX_8X8];
+ count8x8_8x8p += counts->tx.p8x8[i][TX_8X8];
- count16x16_16x16p += cm->counts.tx.p16x16[i][TX_16X16];
- count16x16_lp += cm->counts.tx.p32x32[i][TX_16X16];
- count32x32 += cm->counts.tx.p32x32[i][TX_32X32];
+ count16x16_16x16p += counts->tx.p16x16[i][TX_16X16];
+ count16x16_lp += counts->tx.p32x32[i][TX_16X16];
+ count32x32 += counts->tx.p32x32[i][TX_32X32];
}
-
if (count4x4 == 0 && count16x16_lp == 0 && count16x16_16x16p == 0 &&
count32x32 == 0) {
cm->tx_mode = ALLOW_8X8;
@@ -3662,32 +4139,15 @@ static void sum_intra_stats(FRAME_COUNTS *counts, const MODE_INFO *mi) {
++counts->uv_mode[y_mode][uv_mode];
}
-static int get_zbin_mode_boost(const MB_MODE_INFO *mbmi, int enabled) {
- if (enabled) {
- if (is_inter_block(mbmi)) {
- if (mbmi->mode == ZEROMV) {
- return mbmi->ref_frame[0] != LAST_FRAME ? GF_ZEROMV_ZBIN_BOOST
- : LF_ZEROMV_ZBIN_BOOST;
- } else {
- return mbmi->sb_type < BLOCK_8X8 ? SPLIT_MV_ZBIN_BOOST
- : MV_ZBIN_BOOST;
- }
- } else {
- return INTRA_ZBIN_BOOST;
- }
- } else {
- return 0;
- }
-}
-
-static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled,
+static void encode_superblock(VP9_COMP *cpi, ThreadData *td,
+ TOKENEXTRA **t, int output_enabled,
int mi_row, int mi_col, BLOCK_SIZE bsize,
PICK_MODE_CONTEXT *ctx) {
VP9_COMMON *const cm = &cpi->common;
- MACROBLOCK *const x = &cpi->mb;
+ MACROBLOCK *const x = &td->mb;
MACROBLOCKD *const xd = &x->e_mbd;
- MODE_INFO *mi_8x8 = xd->mi;
- MODE_INFO *mi = mi_8x8;
+ MODE_INFO **mi_8x8 = xd->mi;
+ MODE_INFO *mi = mi_8x8[0];
MB_MODE_INFO *mbmi = &mi->mbmi;
const int seg_skip = vp9_segfeature_active(&cm->seg, mbmi->segment_id,
SEG_LVL_SKIP);
@@ -3701,7 +4161,7 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled,
cpi->sf.allow_skip_recode;
if (!x->skip_recode && !cpi->sf.use_nonrd_pick_mode)
- vpx_memset(x->skip_txfm, 0, sizeof(x->skip_txfm));
+ memset(x->skip_txfm, 0, sizeof(x->skip_txfm));
x->skip_optimize = ctx->is_coded;
ctx->is_coded = 1;
@@ -3714,36 +4174,31 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled,
set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
- // Experimental code. Special case for gf and arf zeromv modes.
- // Increase zbin size to suppress noise
- cpi->zbin_mode_boost = get_zbin_mode_boost(mbmi,
- cpi->zbin_mode_boost_enabled);
- vp9_update_zbin_extra(cpi, x);
-
if (!is_inter_block(mbmi)) {
int plane;
mbmi->skip = 1;
for (plane = 0; plane < MAX_MB_PLANE; ++plane)
vp9_encode_intra_block_plane(x, MAX(bsize, BLOCK_8X8), plane);
if (output_enabled)
- sum_intra_stats(&cm->counts, mi);
- vp9_tokenize_sb(cpi, t, !output_enabled, MAX(bsize, BLOCK_8X8));
+ sum_intra_stats(td->counts, mi);
+ vp9_tokenize_sb(cpi, td, t, !output_enabled, MAX(bsize, BLOCK_8X8));
} else {
int ref;
const int is_compound = has_second_ref(mbmi);
for (ref = 0; ref < 1 + is_compound; ++ref) {
YV12_BUFFER_CONFIG *cfg = get_ref_frame_buffer(cpi,
mbmi->ref_frame[ref]);
+ assert(cfg != NULL);
vp9_setup_pre_planes(xd, ref, cfg, mi_row, mi_col,
&xd->block_refs[ref]->sf);
}
- if (!cpi->sf.reuse_inter_pred_sby || seg_skip)
+ if (!(cpi->sf.reuse_inter_pred_sby && ctx->pred_pixel_ready) || seg_skip)
vp9_build_inter_predictors_sby(xd, mi_row, mi_col, MAX(bsize, BLOCK_8X8));
vp9_build_inter_predictors_sbuv(xd, mi_row, mi_col, MAX(bsize, BLOCK_8X8));
vp9_encode_sb(x, MAX(bsize, BLOCK_8X8));
- vp9_tokenize_sb(cpi, t, !output_enabled, MAX(bsize, BLOCK_8X8));
+ vp9_tokenize_sb(cpi, td, t, !output_enabled, MAX(bsize, BLOCK_8X8));
}
if (output_enabled) {
@@ -3751,7 +4206,7 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled,
mbmi->sb_type >= BLOCK_8X8 &&
!(is_inter_block(mbmi) && (mbmi->skip || seg_skip))) {
++get_tx_counts(max_txsize_lookup[bsize], vp9_get_tx_size_context(xd),
- &cm->counts.tx)[mbmi->tx_size];
+ &td->counts->tx)[mbmi->tx_size];
} else {
int x, y;
TX_SIZE tx_size;
@@ -3766,7 +4221,9 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled,
for (y = 0; y < mi_height; y++)
for (x = 0; x < mi_width; x++)
if (mi_col + x < cm->mi_cols && mi_row + y < cm->mi_rows)
- mi_8x8[mis * y + x].src_mi->mbmi.tx_size = tx_size;
+ mi_8x8[mis * y + x]->mbmi.tx_size = tx_size;
}
+ ++td->counts->tx.tx_totals[mbmi->tx_size];
+ ++td->counts->tx.tx_totals[get_uv_tx_size(mbmi, &xd->plane[1])];
}
}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodeframe.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodeframe.h
index fd1c9aa6427..1acde0283e9 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodeframe.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodeframe.h
@@ -12,6 +12,8 @@
#ifndef VP9_ENCODER_VP9_ENCODEFRAME_H_
#define VP9_ENCODER_VP9_ENCODEFRAME_H_
+#include "vpx/vpx_integer.h"
+
#ifdef __cplusplus
extern "C" {
#endif
@@ -19,6 +21,7 @@ extern "C" {
struct macroblock;
struct yv12_buffer_config;
struct VP9_COMP;
+struct ThreadData;
// Constants used in SOURCE_VAR_BASED_PARTITION
#define VAR_HIST_MAX_BG_VAR 1000
@@ -33,6 +36,12 @@ void vp9_setup_src_planes(struct macroblock *x,
void vp9_encode_frame(struct VP9_COMP *cpi);
+void vp9_init_tile_data(struct VP9_COMP *cpi);
+void vp9_encode_tile(struct VP9_COMP *cpi, struct ThreadData *td,
+ int tile_row, int tile_col);
+
+void vp9_set_vbp_thresholds(struct VP9_COMP *cpi, int q);
+
#ifdef __cplusplus
} // extern "C"
#endif
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodemb.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodemb.c
index f5faa7c23ac..9a4e61ec882 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodemb.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodemb.c
@@ -29,12 +29,6 @@ struct optimize_ctx {
ENTROPY_CONTEXT tl[MAX_MB_PLANE][16];
};
-struct encode_b_args {
- MACROBLOCK *x;
- struct optimize_ctx *ctx;
- int8_t *skip;
-};
-
void vp9_subtract_block_c(int rows, int cols,
int16_t *diff, ptrdiff_t diff_stride,
const uint8_t *src, ptrdiff_t src_stride,
@@ -99,7 +93,7 @@ typedef struct vp9_token_state {
int rate;
int error;
int next;
- signed char token;
+ int16_t token;
short qc;
} vp9_token_state;
@@ -134,7 +128,7 @@ static int optimize_b(MACROBLOCK *mb, int plane, int block,
MACROBLOCKD *const xd = &mb->e_mbd;
struct macroblock_plane *const p = &mb->plane[plane];
struct macroblockd_plane *const pd = &xd->plane[plane];
- const int ref = is_inter_block(&xd->mi[0].src_mi->mbmi);
+ const int ref = is_inter_block(&xd->mi[0]->mbmi);
vp9_token_state tokens[1025][2];
unsigned best_index[1025][2];
uint8_t token_cache[1024];
@@ -153,10 +147,15 @@ static int optimize_b(MACROBLOCK *mb, int plane, int block,
int next = eob, sz = 0;
int64_t rdmult = mb->rdmult * plane_rd_mult[type], rddiv = mb->rddiv;
int64_t rd_cost0, rd_cost1;
- int rate0, rate1, error0, error1, t0, t1;
+ int rate0, rate1, error0, error1;
+ int16_t t0, t1;
+ EXTRABIT e0;
int best, band, pt, i, final_eob;
- const TOKENVALUE *dct_value_tokens;
- const int16_t *dct_value_cost;
+#if CONFIG_VP9_HIGHBITDEPTH
+ const int16_t *cat6_high_cost = vp9_get_high_cost_table(xd->bd);
+#else
+ const int16_t *cat6_high_cost = vp9_get_high_cost_table(8);
+#endif
assert((!type && !plane) || (type && plane));
assert(eob <= default_eob);
@@ -173,24 +172,9 @@ static int optimize_b(MACROBLOCK *mb, int plane, int block,
tokens[eob][0].qc = 0;
tokens[eob][1] = tokens[eob][0];
-#if CONFIG_VP9_HIGHBITDEPTH
- if (xd->bd == 12) {
- dct_value_tokens = vp9_dct_value_tokens_high12_ptr;
- dct_value_cost = vp9_dct_value_cost_high12_ptr;
- } else if (xd->bd == 10) {
- dct_value_tokens = vp9_dct_value_tokens_high10_ptr;
- dct_value_cost = vp9_dct_value_cost_high10_ptr;
- } else {
- dct_value_tokens = vp9_dct_value_tokens_ptr;
- dct_value_cost = vp9_dct_value_cost_ptr;
- }
-#else
- dct_value_tokens = vp9_dct_value_tokens_ptr;
- dct_value_cost = vp9_dct_value_cost_ptr;
-#endif
for (i = 0; i < eob; i++)
token_cache[scan[i]] =
- vp9_pt_energy_class[dct_value_tokens[qcoeff[scan[i]]].token];
+ vp9_pt_energy_class[vp9_get_token(qcoeff[scan[i]])];
for (i = eob; i-- > 0;) {
int base_bits, d2, dx;
@@ -204,7 +188,7 @@ static int optimize_b(MACROBLOCK *mb, int plane, int block,
/* Evaluate the first possibility for this state. */
rate0 = tokens[next][0].rate;
rate1 = tokens[next][1].rate;
- t0 = (dct_value_tokens + x)->token;
+ vp9_get_token_extra(x, &t0, &e0);
/* Consider both possible successor states. */
if (next < default_eob) {
band = band_translate[i + 1];
@@ -217,7 +201,7 @@ static int optimize_b(MACROBLOCK *mb, int plane, int block,
UPDATE_RD_COST();
/* And pick the best. */
best = rd_cost1 < rd_cost0;
- base_bits = dct_value_cost[x];
+ base_bits = vp9_get_cost(t0, e0, cat6_high_cost);
dx = mul * (dqcoeff[rc] - coeff[rc]);
#if CONFIG_VP9_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
@@ -255,8 +239,10 @@ static int optimize_b(MACROBLOCK *mb, int plane, int block,
*/
t0 = tokens[next][0].token == EOB_TOKEN ? EOB_TOKEN : ZERO_TOKEN;
t1 = tokens[next][1].token == EOB_TOKEN ? EOB_TOKEN : ZERO_TOKEN;
+ e0 = 0;
} else {
- t0 = t1 = (dct_value_tokens + x)->token;
+ vp9_get_token_extra(x, &t0, &e0);
+ t1 = t0;
}
if (next < default_eob) {
band = band_translate[i + 1];
@@ -275,7 +261,7 @@ static int optimize_b(MACROBLOCK *mb, int plane, int block,
UPDATE_RD_COST();
/* And pick the best. */
best = rd_cost1 < rd_cost0;
- base_bits = dct_value_cost[x];
+ base_bits = vp9_get_cost(t0, e0, cat6_high_cost);
if (shortcut) {
#if CONFIG_VP9_HIGHBITDEPTH
@@ -333,8 +319,8 @@ static int optimize_b(MACROBLOCK *mb, int plane, int block,
UPDATE_RD_COST();
best = rd_cost1 < rd_cost0;
final_eob = -1;
- vpx_memset(qcoeff, 0, sizeof(*qcoeff) * (16 << (tx_size * 2)));
- vpx_memset(dqcoeff, 0, sizeof(*dqcoeff) * (16 << (tx_size * 2)));
+ memset(qcoeff, 0, sizeof(*qcoeff) * (16 << (tx_size * 2)));
+ memset(dqcoeff, 0, sizeof(*dqcoeff) * (16 << (tx_size * 2)));
for (i = next; i < eob; i = next) {
const int x = tokens[i][best].qc;
const int rc = scan[i];
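
optimize_b() previously chose one of three precomputed (token, cost) tables by bit depth; the new code computes each coefficient's token and extra bits on the fly with vp9_get_token()/vp9_get_token_extra() and prices only the category-6 tail through a bit-depth-specific table from vp9_get_high_cost_table(). A schematic of that compute-instead-of-lookup shape (token classes, thresholds, and costs below are invented for the sketch, not VP9's real category boundaries):

    #include <stdint.h>

    typedef int16_t EXTRABIT;  /* modeled on the EXTRABIT type in the diff */

    /* Split a quantized value into a token class plus residual "extra
     * bits", roughly what vp9_get_token_extra() does. */
    static void get_token_extra(int v, int16_t *token, EXTRABIT *extra) {
      const int a = v < 0 ? -v : v;
      *token = (int16_t)(a < 5 ? a : 5);   /* 5 = "large value" class */
      *extra = (EXTRABIT)(a >= 5 ? a - 5 : 0);
    }

    /* Price the token, adding the table-driven cost of any extra bits,
     * roughly what vp9_get_cost() does with cat6_high_cost. */
    static int get_cost(int16_t token, EXTRABIT extra,
                        const int16_t *high_cost_table) {
      static const int16_t base_cost[6] = {0, 256, 384, 512, 640, 768};
      return base_cost[token] + (token == 5 ? high_cost_table[extra] : 0);
    }
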
@@ -397,28 +383,28 @@ void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block,
vp9_highbd_quantize_fp_32x32(coeff, 1024, x->skip_block, p->zbin,
p->round_fp, p->quant_fp, p->quant_shift,
qcoeff, dqcoeff, pd->dequant,
- p->zbin_extra, eob, scan_order->scan,
+ eob, scan_order->scan,
scan_order->iscan);
break;
case TX_16X16:
vp9_highbd_fdct16x16(src_diff, coeff, diff_stride);
vp9_highbd_quantize_fp(coeff, 256, x->skip_block, p->zbin, p->round_fp,
p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
- pd->dequant, p->zbin_extra, eob,
+ pd->dequant, eob,
scan_order->scan, scan_order->iscan);
break;
case TX_8X8:
vp9_highbd_fdct8x8(src_diff, coeff, diff_stride);
vp9_highbd_quantize_fp(coeff, 64, x->skip_block, p->zbin, p->round_fp,
p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
- pd->dequant, p->zbin_extra, eob,
+ pd->dequant, eob,
scan_order->scan, scan_order->iscan);
break;
case TX_4X4:
x->fwd_txm4x4(src_diff, coeff, diff_stride);
vp9_highbd_quantize_fp(coeff, 16, x->skip_block, p->zbin, p->round_fp,
p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
- pd->dequant, p->zbin_extra, eob,
+ pd->dequant, eob,
scan_order->scan, scan_order->iscan);
break;
default:
@@ -433,28 +419,28 @@ void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block,
fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
vp9_quantize_fp_32x32(coeff, 1024, x->skip_block, p->zbin, p->round_fp,
p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
- pd->dequant, p->zbin_extra, eob, scan_order->scan,
+ pd->dequant, eob, scan_order->scan,
scan_order->iscan);
break;
case TX_16X16:
vp9_fdct16x16(src_diff, coeff, diff_stride);
vp9_quantize_fp(coeff, 256, x->skip_block, p->zbin, p->round_fp,
p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
- pd->dequant, p->zbin_extra, eob,
+ pd->dequant, eob,
scan_order->scan, scan_order->iscan);
break;
case TX_8X8:
- vp9_fdct8x8(src_diff, coeff, diff_stride);
- vp9_quantize_fp(coeff, 64, x->skip_block, p->zbin, p->round_fp,
- p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
- pd->dequant, p->zbin_extra, eob,
- scan_order->scan, scan_order->iscan);
+ vp9_fdct8x8_quant(src_diff, diff_stride, coeff, 64,
+ x->skip_block, p->zbin, p->round_fp,
+ p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
+ pd->dequant, eob,
+ scan_order->scan, scan_order->iscan);
break;
case TX_4X4:
x->fwd_txm4x4(src_diff, coeff, diff_stride);
vp9_quantize_fp(coeff, 16, x->skip_block, p->zbin, p->round_fp,
p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
- pd->dequant, p->zbin_extra, eob,
+ pd->dequant, eob,
scan_order->scan, scan_order->iscan);
break;
default:
@@ -490,19 +476,19 @@ void vp9_xform_quant_dc(MACROBLOCK *x, int plane, int block,
break;
case TX_16X16:
vp9_highbd_fdct16x16_1(src_diff, coeff, diff_stride);
- vp9_highbd_quantize_dc(coeff, x->skip_block, p->round,
+ vp9_highbd_quantize_dc(coeff, 256, x->skip_block, p->round,
p->quant_fp[0], qcoeff, dqcoeff,
pd->dequant[0], eob);
break;
case TX_8X8:
vp9_highbd_fdct8x8_1(src_diff, coeff, diff_stride);
- vp9_highbd_quantize_dc(coeff, x->skip_block, p->round,
+ vp9_highbd_quantize_dc(coeff, 64, x->skip_block, p->round,
p->quant_fp[0], qcoeff, dqcoeff,
pd->dequant[0], eob);
break;
case TX_4X4:
x->fwd_txm4x4(src_diff, coeff, diff_stride);
- vp9_highbd_quantize_dc(coeff, x->skip_block, p->round,
+ vp9_highbd_quantize_dc(coeff, 16, x->skip_block, p->round,
p->quant_fp[0], qcoeff, dqcoeff,
pd->dequant[0], eob);
break;
@@ -522,19 +508,19 @@ void vp9_xform_quant_dc(MACROBLOCK *x, int plane, int block,
break;
case TX_16X16:
vp9_fdct16x16_1(src_diff, coeff, diff_stride);
- vp9_quantize_dc(coeff, x->skip_block, p->round,
+ vp9_quantize_dc(coeff, 256, x->skip_block, p->round,
p->quant_fp[0], qcoeff, dqcoeff,
pd->dequant[0], eob);
break;
case TX_8X8:
vp9_fdct8x8_1(src_diff, coeff, diff_stride);
- vp9_quantize_dc(coeff, x->skip_block, p->round,
+ vp9_quantize_dc(coeff, 64, x->skip_block, p->round,
p->quant_fp[0], qcoeff, dqcoeff,
pd->dequant[0], eob);
break;
case TX_4X4:
x->fwd_txm4x4(src_diff, coeff, diff_stride);
- vp9_quantize_dc(coeff, x->skip_block, p->round,
+ vp9_quantize_dc(coeff, 16, x->skip_block, p->round,
p->quant_fp[0], qcoeff, dqcoeff,
pd->dequant[0], eob);
break;
@@ -567,28 +553,28 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block,
highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
vp9_highbd_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin,
p->round, p->quant, p->quant_shift, qcoeff,
- dqcoeff, pd->dequant, p->zbin_extra, eob,
+ dqcoeff, pd->dequant, eob,
scan_order->scan, scan_order->iscan);
break;
case TX_16X16:
vp9_highbd_fdct16x16(src_diff, coeff, diff_stride);
vp9_highbd_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
p->quant, p->quant_shift, qcoeff, dqcoeff,
- pd->dequant, p->zbin_extra, eob,
+ pd->dequant, eob,
scan_order->scan, scan_order->iscan);
break;
case TX_8X8:
vp9_highbd_fdct8x8(src_diff, coeff, diff_stride);
vp9_highbd_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round,
p->quant, p->quant_shift, qcoeff, dqcoeff,
- pd->dequant, p->zbin_extra, eob,
+ pd->dequant, eob,
scan_order->scan, scan_order->iscan);
break;
case TX_4X4:
x->fwd_txm4x4(src_diff, coeff, diff_stride);
vp9_highbd_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round,
p->quant, p->quant_shift, qcoeff, dqcoeff,
- pd->dequant, p->zbin_extra, eob,
+ pd->dequant, eob,
scan_order->scan, scan_order->iscan);
break;
default:
@@ -603,28 +589,28 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block,
fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
vp9_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round,
p->quant, p->quant_shift, qcoeff, dqcoeff,
- pd->dequant, p->zbin_extra, eob, scan_order->scan,
+ pd->dequant, eob, scan_order->scan,
scan_order->iscan);
break;
case TX_16X16:
vp9_fdct16x16(src_diff, coeff, diff_stride);
vp9_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
p->quant, p->quant_shift, qcoeff, dqcoeff,
- pd->dequant, p->zbin_extra, eob,
+ pd->dequant, eob,
scan_order->scan, scan_order->iscan);
break;
case TX_8X8:
vp9_fdct8x8(src_diff, coeff, diff_stride);
vp9_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round,
p->quant, p->quant_shift, qcoeff, dqcoeff,
- pd->dequant, p->zbin_extra, eob,
+ pd->dequant, eob,
scan_order->scan, scan_order->iscan);
break;
case TX_4X4:
x->fwd_txm4x4(src_diff, coeff, diff_stride);
vp9_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round,
p->quant, p->quant_shift, qcoeff, dqcoeff,
- pd->dequant, p->zbin_extra, eob,
+ pd->dequant, eob,
scan_order->scan, scan_order->iscan);
break;
default:
@@ -659,24 +645,34 @@ static void encode_block(int plane, int block, BLOCK_SIZE plane_bsize,
}
if (!x->skip_recode) {
- if (max_txsize_lookup[plane_bsize] == tx_size) {
- if (x->skip_txfm[(plane << 2) + (block >> (tx_size << 1))] == 0) {
- // full forward transform and quantization
- if (x->quant_fp)
- vp9_xform_quant_fp(x, plane, block, plane_bsize, tx_size);
- else
- vp9_xform_quant(x, plane, block, plane_bsize, tx_size);
- } else if (x->skip_txfm[(plane << 2) + (block >> (tx_size << 1))] == 2) {
- // fast path forward transform and quantization
- vp9_xform_quant_dc(x, plane, block, plane_bsize, tx_size);
- } else {
+ if (x->quant_fp) {
+ // Encoding process for rtc mode
+ if (x->skip_txfm[0] == 1 && plane == 0) {
// skip forward transform
p->eobs[block] = 0;
*a = *l = 0;
return;
+ } else {
+ vp9_xform_quant_fp(x, plane, block, plane_bsize, tx_size);
}
} else {
- vp9_xform_quant(x, plane, block, plane_bsize, tx_size);
+ if (max_txsize_lookup[plane_bsize] == tx_size) {
+ int txfm_blk_index = (plane << 2) + (block >> (tx_size << 1));
+ if (x->skip_txfm[txfm_blk_index] == 0) {
+ // full forward transform and quantization
+ vp9_xform_quant(x, plane, block, plane_bsize, tx_size);
+          } else if (x->skip_txfm[txfm_blk_index] == 2) {
+ // fast path forward transform and quantization
+ vp9_xform_quant_dc(x, plane, block, plane_bsize, tx_size);
+ } else {
+ // skip forward transform
+ p->eobs[block] = 0;
+ *a = *l = 0;
+ return;
+ }
+ } else {
+ vp9_xform_quant(x, plane, block, plane_bsize, tx_size);
+ }
}
}
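
After the restructuring, encode_block() first branches on x->quant_fp (the RTC fast path, where only the luma skip flag short-circuits) and otherwise keeps the three-way skip_txfm dispatch: 0 runs the full transform and quantization, 2 takes the DC-only fast path, anything else skips the block. The skeleton of that decision (codes read off the diff; the enum names are mine):

    /* skip_txfm codes as used above: 0 = full, 2 = DC-only, else skip. */
    enum { TXFM_FULL = 0, TXFM_SKIP = 1, TXFM_DC_ONLY = 2 };

    static const char *pick_txfm_path(int quant_fp, int luma_skip,
                                      int plane, int skip_txfm_code) {
      if (quant_fp)  /* RTC mode: only the luma skip flag matters */
        return (luma_skip && plane == 0) ? "skip" : "xform_quant_fp";
      if (skip_txfm_code == TXFM_FULL) return "xform_quant";
      if (skip_txfm_code == TXFM_DC_ONLY) return "xform_quant_dc";
      return "skip";
    }
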
@@ -777,7 +773,7 @@ void vp9_encode_sby_pass1(MACROBLOCK *x, BLOCK_SIZE bsize) {
void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize) {
MACROBLOCKD *const xd = &x->e_mbd;
struct optimize_ctx ctx;
- MB_MODE_INFO *mbmi = &xd->mi[0].src_mi->mbmi;
+ MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
struct encode_b_args arg = {x, &ctx, &mbmi->skip};
int plane;
@@ -802,12 +798,12 @@ void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize) {
}
}
-static void encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
+void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
TX_SIZE tx_size, void *arg) {
struct encode_b_args* const args = arg;
MACROBLOCK *const x = args->x;
MACROBLOCKD *const xd = &x->e_mbd;
- MB_MODE_INFO *mbmi = &xd->mi[0].src_mi->mbmi;
+ MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
struct macroblock_plane *const p = &x->plane[plane];
struct macroblockd_plane *const pd = &xd->plane[plane];
tran_low_t *coeff = BLOCK_OFFSET(p->coeff, block);
@@ -845,8 +841,7 @@ static void encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
vp9_highbd_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin,
p->round, p->quant, p->quant_shift,
- qcoeff, dqcoeff, pd->dequant,
- p->zbin_extra, eob,
+ qcoeff, dqcoeff, pd->dequant, eob,
scan_order->scan, scan_order->iscan);
}
if (!x->skip_encode && *eob) {
@@ -867,7 +862,7 @@ static void encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
vp9_highbd_fht16x16(src_diff, coeff, diff_stride, tx_type);
vp9_highbd_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
p->quant, p->quant_shift, qcoeff, dqcoeff,
- pd->dequant, p->zbin_extra, eob,
+ pd->dequant, eob,
scan_order->scan, scan_order->iscan);
}
if (!x->skip_encode && *eob) {
@@ -889,7 +884,7 @@ static void encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
vp9_highbd_fht8x8(src_diff, coeff, diff_stride, tx_type);
vp9_highbd_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round,
p->quant, p->quant_shift, qcoeff, dqcoeff,
- pd->dequant, p->zbin_extra, eob,
+ pd->dequant, eob,
scan_order->scan, scan_order->iscan);
}
if (!x->skip_encode && *eob) {
@@ -900,7 +895,7 @@ static void encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
case TX_4X4:
tx_type = get_tx_type_4x4(pd->plane_type, xd, block);
scan_order = &vp9_scan_orders[TX_4X4][tx_type];
- mode = plane == 0 ? get_y_mode(xd->mi[0].src_mi, block) : mbmi->uv_mode;
+ mode = plane == 0 ? get_y_mode(xd->mi[0], block) : mbmi->uv_mode;
vp9_predict_intra_block(xd, block, bwl, TX_4X4, mode,
x->skip_encode ? src : dst,
x->skip_encode ? src_stride : dst_stride,
@@ -915,7 +910,7 @@ static void encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
x->fwd_txm4x4(src_diff, coeff, diff_stride);
vp9_highbd_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round,
p->quant, p->quant_shift, qcoeff, dqcoeff,
- pd->dequant, p->zbin_extra, eob,
+ pd->dequant, eob,
scan_order->scan, scan_order->iscan);
}
@@ -954,7 +949,7 @@ static void encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
vp9_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round,
p->quant, p->quant_shift, qcoeff, dqcoeff,
- pd->dequant, p->zbin_extra, eob, scan_order->scan,
+ pd->dequant, eob, scan_order->scan,
scan_order->iscan);
}
if (!x->skip_encode && *eob)
@@ -974,7 +969,7 @@ static void encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
vp9_fht16x16(src_diff, coeff, diff_stride, tx_type);
vp9_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
p->quant, p->quant_shift, qcoeff, dqcoeff,
- pd->dequant, p->zbin_extra, eob, scan_order->scan,
+ pd->dequant, eob, scan_order->scan,
scan_order->iscan);
}
if (!x->skip_encode && *eob)
@@ -994,7 +989,7 @@ static void encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
vp9_fht8x8(src_diff, coeff, diff_stride, tx_type);
vp9_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round, p->quant,
p->quant_shift, qcoeff, dqcoeff,
- pd->dequant, p->zbin_extra, eob, scan_order->scan,
+ pd->dequant, eob, scan_order->scan,
scan_order->iscan);
}
if (!x->skip_encode && *eob)
@@ -1003,7 +998,7 @@ static void encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
case TX_4X4:
tx_type = get_tx_type_4x4(pd->plane_type, xd, block);
scan_order = &vp9_scan_orders[TX_4X4][tx_type];
- mode = plane == 0 ? get_y_mode(xd->mi[0].src_mi, block) : mbmi->uv_mode;
+ mode = plane == 0 ? get_y_mode(xd->mi[0], block) : mbmi->uv_mode;
vp9_predict_intra_block(xd, block, bwl, TX_4X4, mode,
x->skip_encode ? src : dst,
x->skip_encode ? src_stride : dst_stride,
@@ -1018,7 +1013,7 @@ static void encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
x->fwd_txm4x4(src_diff, coeff, diff_stride);
vp9_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round, p->quant,
p->quant_shift, qcoeff, dqcoeff,
- pd->dequant, p->zbin_extra, eob, scan_order->scan,
+ pd->dequant, eob, scan_order->scan,
scan_order->iscan);
}
@@ -1040,18 +1035,10 @@ static void encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
*(args->skip) = 0;
}
-void vp9_encode_block_intra(MACROBLOCK *x, int plane, int block,
- BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
- int8_t *skip) {
- struct encode_b_args arg = {x, NULL, skip};
- encode_block_intra(plane, block, plane_bsize, tx_size, &arg);
-}
-
-
void vp9_encode_intra_block_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) {
const MACROBLOCKD *const xd = &x->e_mbd;
- struct encode_b_args arg = {x, NULL, &xd->mi[0].src_mi->mbmi.skip};
+ struct encode_b_args arg = {x, NULL, &xd->mi[0]->mbmi.skip};
- vp9_foreach_transformed_block_in_plane(xd, bsize, plane, encode_block_intra,
- &arg);
+ vp9_foreach_transformed_block_in_plane(xd, bsize, plane,
+ vp9_encode_block_intra, &arg);
}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodemb.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodemb.h
index 54d2b375110..97df8a66be7 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodemb.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodemb.h
@@ -18,6 +18,11 @@
extern "C" {
#endif
+struct encode_b_args {
+ MACROBLOCK *x;
+ struct optimize_ctx *ctx;
+ int8_t *skip;
+};
void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize);
void vp9_encode_sby_pass1(MACROBLOCK *x, BLOCK_SIZE bsize);
void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block,
@@ -29,9 +34,8 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block,
void vp9_subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane);
-void vp9_encode_block_intra(MACROBLOCK *x, int plane, int block,
- BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
- int8_t *skip);
+void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
+ TX_SIZE tx_size, void *arg);
void vp9_encode_intra_block_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane);
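This header change completes the refactor visible in vp9_encodemb.c above: vp9_encode_block_intra now matches the per-block visitor signature expected by vp9_foreach_transformed_block_in_plane, with its state carried through the newly exported struct encode_b_args. A minimal sketch of the callback pattern, with hypothetical names (foreach_block, visit_fn, and my_args are illustrative, not the libvpx API):

    typedef void (*visit_fn)(int plane, int block, void *arg);

    static void foreach_block(int plane, int nblocks, visit_fn visit, void *arg) {
      int block;
      for (block = 0; block < nblocks; ++block)
        visit(plane, block, arg);  /* state travels through the opaque arg */
    }

    struct my_args { int count; };

    static void count_blocks(int plane, int block, void *arg) {
      struct my_args *a = (struct my_args *)arg;
      (void)plane; (void)block;
      ++a->count;  /* the visitor recovers its typed state from arg */
    }

Passing the state through a void * is what lets the same iterator drive both the inter and intra encode paths without a per-caller wrapper such as the deleted vp9_encode_block_intra shim.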
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodemv.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodemv.c
index 08983956763..af73fcbdcc3 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodemv.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodemv.c
@@ -161,10 +161,10 @@ static void write_mv_update(const vp9_tree_index *tree,
update_mv(w, branch_ct[i], &probs[i], MV_UPDATE_PROB);
}
-void vp9_write_nmv_probs(VP9_COMMON *cm, int usehp, vp9_writer *w) {
+void vp9_write_nmv_probs(VP9_COMMON *cm, int usehp, vp9_writer *w,
+ nmv_context_counts *const counts) {
int i, j;
- nmv_context *const mvc = &cm->fc.nmvc;
- nmv_context_counts *const counts = &cm->counts.mv;
+ nmv_context *const mvc = &cm->fc->nmvc;
write_mv_update(vp9_mv_joint_tree, mvc->joints, counts->joints, MV_JOINTS, w);
@@ -241,8 +241,9 @@ static void inc_mvs(const MB_MODE_INFO *mbmi, const int_mv mvs[2],
}
}
-void vp9_update_mv_count(VP9_COMMON *cm, const MACROBLOCKD *xd) {
- const MODE_INFO *mi = xd->mi[0].src_mi;
+void vp9_update_mv_count(ThreadData *td) {
+ const MACROBLOCKD *xd = &td->mb.e_mbd;
+ const MODE_INFO *mi = xd->mi[0];
const MB_MODE_INFO *const mbmi = &mi->mbmi;
if (mbmi->sb_type < BLOCK_8X8) {
@@ -254,12 +255,12 @@ void vp9_update_mv_count(VP9_COMMON *cm, const MACROBLOCKD *xd) {
for (idx = 0; idx < 2; idx += num_4x4_w) {
const int i = idy * 2 + idx;
if (mi->bmi[i].as_mode == NEWMV)
- inc_mvs(mbmi, mi->bmi[i].as_mv, &cm->counts.mv);
+ inc_mvs(mbmi, mi->bmi[i].as_mv, &td->counts->mv);
}
}
} else {
if (mbmi->mode == NEWMV)
- inc_mvs(mbmi, mbmi->mv, &cm->counts.mv);
+ inc_mvs(mbmi, mbmi->mv, &td->counts->mv);
}
}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodemv.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodemv.h
index e67f9e3b075..0ae473749ab 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodemv.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodemv.h
@@ -20,7 +20,8 @@ extern "C" {
void vp9_entropy_mv_init();
-void vp9_write_nmv_probs(VP9_COMMON *cm, int usehp, vp9_writer *w);
+void vp9_write_nmv_probs(VP9_COMMON *cm, int usehp, vp9_writer *w,
+ nmv_context_counts *const counts);
void vp9_encode_mv(VP9_COMP *cpi, vp9_writer* w, const MV* mv, const MV* ref,
const nmv_context* mvctx, int usehp);
@@ -28,7 +29,7 @@ void vp9_encode_mv(VP9_COMP *cpi, vp9_writer* w, const MV* mv, const MV* ref,
void vp9_build_nmv_cost_table(int *mvjoint, int *mvcost[2],
const nmv_context* mvctx, int usehp);
-void vp9_update_mv_count(VP9_COMMON *cm, const MACROBLOCKD *xd);
+void vp9_update_mv_count(ThreadData *td);
#ifdef __cplusplus
} // extern "C"
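These signature changes are part of the multi-threaded encoder work: MV statistics are now accumulated into a per-thread nmv_context_counts reached through ThreadData, rather than directly into VP9_COMMON, so tile workers do not race on shared counters. A sketch of the general pattern, assuming hypothetical types rather than the libvpx structures:

    #define NUM_JOINTS 4

    typedef struct { unsigned int joints[NUM_JOINTS]; } counts_t;

    /* Each worker owns a counts_t; the owner folds them together
     * after the workers have joined, so no locking is needed on
     * the hot counting path. */
    static void merge_counts(counts_t *dst, const counts_t *src) {
      int i;
      for (i = 0; i < NUM_JOINTS; ++i)
        dst->joints[i] += src->joints[i];
    }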
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encoder.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encoder.c
index 1758e3fdb90..a1018adb88f 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encoder.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encoder.c
@@ -13,6 +13,8 @@
#include <limits.h>
#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
#include "./vpx_scale_rtcd.h"
#include "vpx/internal/vpx_psnr.h"
#include "vpx_ports/vpx_timer.h"
@@ -35,22 +37,25 @@
#include "vp9/encoder/vp9_context_tree.h"
#include "vp9/encoder/vp9_encodeframe.h"
#include "vp9/encoder/vp9_encodemv.h"
+#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_ethread.h"
#include "vp9/encoder/vp9_firstpass.h"
#include "vp9/encoder/vp9_mbgraph.h"
-#include "vp9/encoder/vp9_encoder.h"
#include "vp9/encoder/vp9_picklpf.h"
#include "vp9/encoder/vp9_ratectrl.h"
#include "vp9/encoder/vp9_rd.h"
+#include "vp9/encoder/vp9_resize.h"
#include "vp9/encoder/vp9_segmentation.h"
+#include "vp9/encoder/vp9_skin_detection.h"
#include "vp9/encoder/vp9_speed_features.h"
#if CONFIG_INTERNAL_STATS
#include "vp9/encoder/vp9_ssim.h"
#endif
-#include "vp9/encoder/vp9_temporal_filter.h"
-#include "vp9/encoder/vp9_resize.h"
#include "vp9/encoder/vp9_svc_layercontext.h"
+#include "vp9/encoder/vp9_temporal_filter.h"
-void vp9_coef_tree_initialize();
+#define AM_SEGMENT_ID_INACTIVE 7
+#define AM_SEGMENT_ID_ACTIVE 0
#define SHARP_FILTER_QTHRESH 0 /* Q threshold for 8-tap sharp filter */
@@ -60,12 +65,14 @@ void vp9_coef_tree_initialize();
// mv. Choose a very high value for
// now so that HIGH_PRECISION is always
// chosen.
-
// #define OUTPUT_YUV_REC
#ifdef OUTPUT_YUV_DENOISED
FILE *yuv_denoised_file = NULL;
#endif
+#ifdef OUTPUT_YUV_SKINMAP
+FILE *yuv_skinmap_file = NULL;
+#endif
#ifdef OUTPUT_YUV_REC
FILE *yuv_rec_file;
#endif
@@ -102,8 +109,111 @@ static INLINE void Scale2Ratio(VPX_SCALING mode, int *hr, int *hs) {
}
}
+// Mark all inactive blocks as active. Other segmentation features may be set,
+// so memset cannot be used; instead, only inactive blocks should be reset.
+void vp9_suppress_active_map(VP9_COMP *cpi) {
+ unsigned char *const seg_map = cpi->segmentation_map;
+ int i;
+ if (cpi->active_map.enabled || cpi->active_map.update)
+ for (i = 0; i < cpi->common.mi_rows * cpi->common.mi_cols; ++i)
+ if (seg_map[i] == AM_SEGMENT_ID_INACTIVE)
+ seg_map[i] = AM_SEGMENT_ID_ACTIVE;
+}
+
+void vp9_apply_active_map(VP9_COMP *cpi) {
+ struct segmentation *const seg = &cpi->common.seg;
+ unsigned char *const seg_map = cpi->segmentation_map;
+ const unsigned char *const active_map = cpi->active_map.map;
+ int i;
+
+ assert(AM_SEGMENT_ID_ACTIVE == CR_SEGMENT_ID_BASE);
+
+ if (frame_is_intra_only(&cpi->common)) {
+ cpi->active_map.enabled = 0;
+ cpi->active_map.update = 1;
+ }
+
+ if (cpi->active_map.update) {
+ if (cpi->active_map.enabled) {
+ for (i = 0; i < cpi->common.mi_rows * cpi->common.mi_cols; ++i)
+ if (seg_map[i] == AM_SEGMENT_ID_ACTIVE) seg_map[i] = active_map[i];
+ vp9_enable_segmentation(seg);
+ vp9_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_SKIP);
+ vp9_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF);
+ // Setting the data to -MAX_LOOP_FILTER will result in the computed loop
+ // filter level being zero regardless of the value of seg->abs_delta.
+ vp9_set_segdata(seg, AM_SEGMENT_ID_INACTIVE,
+ SEG_LVL_ALT_LF, -MAX_LOOP_FILTER);
+ } else {
+ vp9_disable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_SKIP);
+ vp9_disable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF);
+ if (seg->enabled) {
+ seg->update_data = 1;
+ seg->update_map = 1;
+ }
+ }
+ cpi->active_map.update = 0;
+ }
+}
+
+int vp9_set_active_map(VP9_COMP* cpi,
+ unsigned char* new_map_16x16,
+ int rows,
+ int cols) {
+ if (rows == cpi->common.mb_rows && cols == cpi->common.mb_cols) {
+ unsigned char *const active_map_8x8 = cpi->active_map.map;
+ const int mi_rows = cpi->common.mi_rows;
+ const int mi_cols = cpi->common.mi_cols;
+ cpi->active_map.update = 1;
+ if (new_map_16x16) {
+ int r, c;
+ for (r = 0; r < mi_rows; ++r) {
+ for (c = 0; c < mi_cols; ++c) {
+ active_map_8x8[r * mi_cols + c] =
+ new_map_16x16[(r >> 1) * cols + (c >> 1)]
+ ? AM_SEGMENT_ID_ACTIVE
+ : AM_SEGMENT_ID_INACTIVE;
+ }
+ }
+ cpi->active_map.enabled = 1;
+ } else {
+ cpi->active_map.enabled = 0;
+ }
+ return 0;
+ } else {
+ return -1;
+ }
+}
+
+int vp9_get_active_map(VP9_COMP* cpi,
+ unsigned char* new_map_16x16,
+ int rows,
+ int cols) {
+ if (rows == cpi->common.mb_rows && cols == cpi->common.mb_cols &&
+ new_map_16x16) {
+ unsigned char* const seg_map_8x8 = cpi->segmentation_map;
+ const int mi_rows = cpi->common.mi_rows;
+ const int mi_cols = cpi->common.mi_cols;
+ memset(new_map_16x16, !cpi->active_map.enabled, rows * cols);
+ if (cpi->active_map.enabled) {
+ int r, c;
+ for (r = 0; r < mi_rows; ++r) {
+ for (c = 0; c < mi_cols; ++c) {
+ // Cyclic refresh segments are considered active despite not having
+ // AM_SEGMENT_ID_ACTIVE.
+ new_map_16x16[(r >> 1) * cols + (c >> 1)] |=
+ seg_map_8x8[r * mi_cols + c] != AM_SEGMENT_ID_INACTIVE;
+ }
+ }
+ }
+ return 0;
+ } else {
+ return -1;
+ }
+}
+
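Note the (r >> 1, c >> 1) indexing in both functions: the caller supplies the map at 16x16-macroblock granularity while the encoder stores it on the 8x8 mi grid, so each input entry fans out to a 2x2 group of mi units. From the application side this is driven through the codec control interface, as in examples/set_maps.c; a condensed sketch, assuming codec and cfg were initialized as in that example (error handling omitted):

    vpx_active_map_t map;
    unsigned char *active;  /* map.rows * map.cols entries, 1 = encode, 0 = skip */

    map.rows = (cfg.g_h + 15) / 16;   /* one entry per 16x16 block */
    map.cols = (cfg.g_w + 15) / 16;
    active = build_my_map(map.rows, map.cols);  /* hypothetical helper */
    map.active_map = active;
    vpx_codec_control(&codec, VP8E_SET_ACTIVEMAP, &map);

    map.active_map = NULL;            /* a NULL map re-enables the whole frame */
    vpx_codec_control(&codec, VP8E_SET_ACTIVEMAP, &map);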
void vp9_set_high_precision_mv(VP9_COMP *cpi, int allow_high_precision_mv) {
- MACROBLOCK *const mb = &cpi->mb;
+ MACROBLOCK *const mb = &cpi->td.mb;
cpi->common.allow_high_precision_mv = allow_high_precision_mv;
if (cpi->common.allow_high_precision_mv) {
mb->mvcost = mb->nmvcost_hp;
@@ -134,23 +244,87 @@ static void setup_frame(VP9_COMP *cpi) {
cpi->refresh_alt_ref_frame = 1;
vp9_zero(cpi->interp_filter_selected);
} else {
- cm->fc = cm->frame_contexts[cm->frame_context_idx];
+ *cm->fc = cm->frame_contexts[cm->frame_context_idx];
vp9_zero(cpi->interp_filter_selected[0]);
}
}
-void vp9_initialize_enc() {
- static int init_done = 0;
+static void vp9_enc_setup_mi(VP9_COMMON *cm) {
+ int i;
+ cm->mi = cm->mip + cm->mi_stride + 1;
+ memset(cm->mip, 0, cm->mi_stride * (cm->mi_rows + 1) * sizeof(*cm->mip));
+ cm->prev_mi = cm->prev_mip + cm->mi_stride + 1;
+ // Clear top border row
+ memset(cm->prev_mip, 0, sizeof(*cm->prev_mip) * cm->mi_stride);
+ // Clear left border column
+ for (i = 1; i < cm->mi_rows + 1; ++i)
+ memset(&cm->prev_mip[i * cm->mi_stride], 0, sizeof(*cm->prev_mip));
+
+ cm->mi_grid_visible = cm->mi_grid_base + cm->mi_stride + 1;
+ cm->prev_mi_grid_visible = cm->prev_mi_grid_base + cm->mi_stride + 1;
+
+ memset(cm->mi_grid_base, 0,
+ cm->mi_stride * (cm->mi_rows + 1) * sizeof(*cm->mi_grid_base));
+}
+
+static int vp9_enc_alloc_mi(VP9_COMMON *cm, int mi_size) {
+ cm->mip = vpx_calloc(mi_size, sizeof(*cm->mip));
+ if (!cm->mip)
+ return 1;
+ cm->prev_mip = vpx_calloc(mi_size, sizeof(*cm->prev_mip));
+ if (!cm->prev_mip)
+ return 1;
+ cm->mi_alloc_size = mi_size;
+
+ cm->mi_grid_base = (MODE_INFO **)vpx_calloc(mi_size, sizeof(MODE_INFO*));
+ if (!cm->mi_grid_base)
+ return 1;
+ cm->prev_mi_grid_base = (MODE_INFO **)vpx_calloc(mi_size, sizeof(MODE_INFO*));
+ if (!cm->prev_mi_grid_base)
+ return 1;
+
+ return 0;
+}
+
+static void vp9_enc_free_mi(VP9_COMMON *cm) {
+ vpx_free(cm->mip);
+ cm->mip = NULL;
+ vpx_free(cm->prev_mip);
+ cm->prev_mip = NULL;
+ vpx_free(cm->mi_grid_base);
+ cm->mi_grid_base = NULL;
+ vpx_free(cm->prev_mi_grid_base);
+ cm->prev_mi_grid_base = NULL;
+}
+
+static void vp9_swap_mi_and_prev_mi(VP9_COMMON *cm) {
+ // Current mip will be the prev_mip for the next frame.
+ MODE_INFO **temp_base = cm->prev_mi_grid_base;
+ MODE_INFO *temp = cm->prev_mip;
+ cm->prev_mip = cm->mip;
+ cm->mip = temp;
+
+ // Update the upper left visible macroblock ptrs.
+ cm->mi = cm->mip + cm->mi_stride + 1;
+ cm->prev_mi = cm->prev_mip + cm->mi_stride + 1;
+
+ cm->prev_mi_grid_base = cm->mi_grid_base;
+ cm->mi_grid_base = temp_base;
+ cm->mi_grid_visible = cm->mi_grid_base + cm->mi_stride + 1;
+ cm->prev_mi_grid_visible = cm->prev_mi_grid_base + cm->mi_stride + 1;
+}
+
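vp9_swap_mi_and_prev_mi makes the just-coded frame's mode-info arrays become prev_mi for the next frame by exchanging the base pointers and then rebasing the visible pointers, which avoids copying the whole mi array every frame. The pattern in isolation, with illustrative types rather than the libvpx ones:

    typedef struct {
      int *cur, *prev;   /* two equally sized buffers */
    } dbuf_t;

    static void dbuf_swap(dbuf_t *b) {
      int *tmp = b->prev;  /* O(1): only pointers move, the data stays put */
      b->prev = b->cur;
      b->cur = tmp;
    }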
+void vp9_initialize_enc(void) {
+ static volatile int init_done = 0;
if (!init_done) {
vp9_rtcd();
+ vpx_dsp_rtcd();
+ vpx_scale_rtcd();
vp9_init_intra_predictors();
- vp9_coef_tree_initialize();
- vp9_tokenize_initialize();
vp9_init_me_luts();
vp9_rc_init_minq_luts();
vp9_entropy_mv_init();
- vp9_entropy_mode_init();
vp9_temporal_filter_init();
init_done = 1;
}
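Marking init_done volatile keeps the flag read from being cached across the RTCD setup, but the guard is still a plain check-then-set and is not a synchronization primitive. If strict thread safety were required, one would reach for pthread_once or an equivalent; a hedged sketch of that stricter variant (an alternative illustration, not what libvpx does here):

    #include <pthread.h>

    static pthread_once_t enc_once = PTHREAD_ONCE_INIT;

    static void enc_init_once(void) {
      /* one-time table setup would go here, e.g. the rtcd dispatchers */
    }

    void initialize_enc_threadsafe(void) {
      pthread_once(&enc_once, enc_init_once);  /* safe under concurrent callers */
    }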
@@ -160,17 +334,15 @@ static void dealloc_compressor_data(VP9_COMP *cpi) {
VP9_COMMON *const cm = &cpi->common;
int i;
+ vpx_free(cpi->tile_data);
+ cpi->tile_data = NULL;
+
// Delete segmentation map
vpx_free(cpi->segmentation_map);
cpi->segmentation_map = NULL;
- vpx_free(cm->last_frame_seg_map);
- cm->last_frame_seg_map = NULL;
vpx_free(cpi->coding_context.last_frame_seg_map_copy);
cpi->coding_context.last_frame_seg_map_copy = NULL;
- vpx_free(cpi->complexity_map);
- cpi->complexity_map = NULL;
-
vpx_free(cpi->nmvcosts[0]);
vpx_free(cpi->nmvcosts[1]);
cpi->nmvcosts[0] = NULL;
@@ -194,7 +366,13 @@ static void dealloc_compressor_data(VP9_COMP *cpi) {
vp9_cyclic_refresh_free(cpi->cyclic_refresh);
cpi->cyclic_refresh = NULL;
- vp9_free_ref_frame_buffers(cm);
+ vpx_free(cpi->active_map.map);
+ cpi->active_map.map = NULL;
+
+ vp9_free_ref_frame_buffers(cm->buffer_pool);
+#if CONFIG_VP9_POSTPROC
+ vp9_free_postproc_buffers(cm);
+#endif
vp9_free_context_buffers(cm);
vp9_free_frame_buffer(&cpi->last_frame_uf);
@@ -203,10 +381,10 @@ static void dealloc_compressor_data(VP9_COMP *cpi) {
vp9_free_frame_buffer(&cpi->alt_ref_buffer);
vp9_lookahead_destroy(cpi->lookahead);
- vpx_free(cpi->tok);
- cpi->tok = 0;
+ vpx_free(cpi->tile_tok[0][0]);
+ cpi->tile_tok[0][0] = 0;
- vp9_free_pc_tree(cpi);
+ vp9_free_pc_tree(&cpi->td);
for (i = 0; i < cpi->svc.number_spatial_layers; ++i) {
LAYER_CONTEXT *const lc = &cpi->svc.layer_context[i];
@@ -223,11 +401,11 @@ static void dealloc_compressor_data(VP9_COMP *cpi) {
for (i = 0; i < MAX_LAG_BUFFERS; ++i) {
vp9_free_frame_buffer(&cpi->svc.scaled_frames[i]);
}
- vpx_memset(&cpi->svc.scaled_frames[0], 0,
- MAX_LAG_BUFFERS * sizeof(cpi->svc.scaled_frames[0]));
+ memset(&cpi->svc.scaled_frames[0], 0,
+ MAX_LAG_BUFFERS * sizeof(cpi->svc.scaled_frames[0]));
vp9_free_frame_buffer(&cpi->svc.empty_frame.img);
- vpx_memset(&cpi->svc.empty_frame, 0, sizeof(cpi->svc.empty_frame));
+ memset(&cpi->svc.empty_frame, 0, sizeof(cpi->svc.empty_frame));
}
static void save_coding_context(VP9_COMP *cpi) {
@@ -238,26 +416,26 @@ static void save_coding_context(VP9_COMP *cpi) {
// restored with a call to vp9_restore_coding_context. These functions are
// intended for use in a re-code loop in vp9_compress_frame where the
// quantizer value is adjusted between loop iterations.
- vp9_copy(cc->nmvjointcost, cpi->mb.nmvjointcost);
+ vp9_copy(cc->nmvjointcost, cpi->td.mb.nmvjointcost);
- vpx_memcpy(cc->nmvcosts[0], cpi->nmvcosts[0],
- MV_VALS * sizeof(*cpi->nmvcosts[0]));
- vpx_memcpy(cc->nmvcosts[1], cpi->nmvcosts[1],
- MV_VALS * sizeof(*cpi->nmvcosts[1]));
- vpx_memcpy(cc->nmvcosts_hp[0], cpi->nmvcosts_hp[0],
- MV_VALS * sizeof(*cpi->nmvcosts_hp[0]));
- vpx_memcpy(cc->nmvcosts_hp[1], cpi->nmvcosts_hp[1],
- MV_VALS * sizeof(*cpi->nmvcosts_hp[1]));
+ memcpy(cc->nmvcosts[0], cpi->nmvcosts[0],
+ MV_VALS * sizeof(*cpi->nmvcosts[0]));
+ memcpy(cc->nmvcosts[1], cpi->nmvcosts[1],
+ MV_VALS * sizeof(*cpi->nmvcosts[1]));
+ memcpy(cc->nmvcosts_hp[0], cpi->nmvcosts_hp[0],
+ MV_VALS * sizeof(*cpi->nmvcosts_hp[0]));
+ memcpy(cc->nmvcosts_hp[1], cpi->nmvcosts_hp[1],
+ MV_VALS * sizeof(*cpi->nmvcosts_hp[1]));
vp9_copy(cc->segment_pred_probs, cm->seg.pred_probs);
- vpx_memcpy(cpi->coding_context.last_frame_seg_map_copy,
- cm->last_frame_seg_map, (cm->mi_rows * cm->mi_cols));
+ memcpy(cpi->coding_context.last_frame_seg_map_copy,
+ cm->last_frame_seg_map, (cm->mi_rows * cm->mi_cols));
vp9_copy(cc->last_ref_lf_deltas, cm->lf.last_ref_deltas);
vp9_copy(cc->last_mode_lf_deltas, cm->lf.last_mode_deltas);
- cc->fc = cm->fc;
+ cc->fc = *cm->fc;
}
static void restore_coding_context(VP9_COMP *cpi) {
@@ -266,27 +444,25 @@ static void restore_coding_context(VP9_COMP *cpi) {
// Restore key state variables to the snapshot state stored in the
// previous call to vp9_save_coding_context.
- vp9_copy(cpi->mb.nmvjointcost, cc->nmvjointcost);
+ vp9_copy(cpi->td.mb.nmvjointcost, cc->nmvjointcost);
- vpx_memcpy(cpi->nmvcosts[0], cc->nmvcosts[0],
- MV_VALS * sizeof(*cc->nmvcosts[0]));
- vpx_memcpy(cpi->nmvcosts[1], cc->nmvcosts[1],
- MV_VALS * sizeof(*cc->nmvcosts[1]));
- vpx_memcpy(cpi->nmvcosts_hp[0], cc->nmvcosts_hp[0],
- MV_VALS * sizeof(*cc->nmvcosts_hp[0]));
- vpx_memcpy(cpi->nmvcosts_hp[1], cc->nmvcosts_hp[1],
- MV_VALS * sizeof(*cc->nmvcosts_hp[1]));
+ memcpy(cpi->nmvcosts[0], cc->nmvcosts[0], MV_VALS * sizeof(*cc->nmvcosts[0]));
+ memcpy(cpi->nmvcosts[1], cc->nmvcosts[1], MV_VALS * sizeof(*cc->nmvcosts[1]));
+ memcpy(cpi->nmvcosts_hp[0], cc->nmvcosts_hp[0],
+ MV_VALS * sizeof(*cc->nmvcosts_hp[0]));
+ memcpy(cpi->nmvcosts_hp[1], cc->nmvcosts_hp[1],
+ MV_VALS * sizeof(*cc->nmvcosts_hp[1]));
vp9_copy(cm->seg.pred_probs, cc->segment_pred_probs);
- vpx_memcpy(cm->last_frame_seg_map,
- cpi->coding_context.last_frame_seg_map_copy,
- (cm->mi_rows * cm->mi_cols));
+ memcpy(cm->last_frame_seg_map,
+ cpi->coding_context.last_frame_seg_map_copy,
+ (cm->mi_rows * cm->mi_cols));
vp9_copy(cm->lf.last_ref_deltas, cc->last_ref_lf_deltas);
vp9_copy(cm->lf.last_mode_deltas, cc->last_mode_lf_deltas);
- cm->fc = cc->fc;
+ *cm->fc = cc->fc;
}
static void configure_static_seg_features(VP9_COMP *cpi) {
@@ -300,7 +476,7 @@ static void configure_static_seg_features(VP9_COMP *cpi) {
// Disable and clear down for KF
if (cm->frame_type == KEY_FRAME) {
// Clear down the global segmentation map
- vpx_memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols);
+ memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols);
seg->update_map = 0;
seg->update_data = 0;
cpi->static_mb_pct = 0;
@@ -313,7 +489,7 @@ static void configure_static_seg_features(VP9_COMP *cpi) {
} else if (cpi->refresh_alt_ref_frame) {
// If this is an alt ref frame
// Clear down the global segmentation map
- vpx_memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols);
+ memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols);
seg->update_map = 0;
seg->update_data = 0;
cpi->static_mb_pct = 0;
@@ -374,7 +550,7 @@ static void configure_static_seg_features(VP9_COMP *cpi) {
vp9_disable_segmentation(seg);
- vpx_memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols);
+ memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols);
seg->update_map = 0;
seg->update_data = 0;
@@ -415,15 +591,15 @@ static void configure_static_seg_features(VP9_COMP *cpi) {
static void update_reference_segmentation_map(VP9_COMP *cpi) {
VP9_COMMON *const cm = &cpi->common;
- MODE_INFO *mi_8x8_ptr = cm->mi;
+ MODE_INFO **mi_8x8_ptr = cm->mi_grid_visible;
uint8_t *cache_ptr = cm->last_frame_seg_map;
int row, col;
for (row = 0; row < cm->mi_rows; row++) {
- MODE_INFO *mi_8x8 = mi_8x8_ptr;
+ MODE_INFO **mi_8x8 = mi_8x8_ptr;
uint8_t *cache = cache_ptr;
for (col = 0; col < cm->mi_cols; col++, mi_8x8++, cache++)
- cache[0] = mi_8x8[0].src_mi->mbmi.segment_id;
+ cache[0] = mi_8x8[0]->mbmi.segment_id;
mi_8x8_ptr += cm->mi_stride;
cache_ptr += cm->mi_cols;
}
@@ -433,8 +609,9 @@ static void alloc_raw_frame_buffers(VP9_COMP *cpi) {
VP9_COMMON *cm = &cpi->common;
const VP9EncoderConfig *oxcf = &cpi->oxcf;
- cpi->lookahead = vp9_lookahead_init(oxcf->width, oxcf->height,
- cm->subsampling_x, cm->subsampling_y,
+ if (!cpi->lookahead)
+ cpi->lookahead = vp9_lookahead_init(oxcf->width, oxcf->height,
+ cm->subsampling_x, cm->subsampling_y,
#if CONFIG_VP9_HIGHBITDEPTH
cm->use_highbitdepth,
#endif
@@ -443,24 +620,19 @@ static void alloc_raw_frame_buffers(VP9_COMP *cpi) {
vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
"Failed to allocate lag buffers");
+ // TODO(agrange) Check if ARF is enabled and skip allocation if not.
if (vp9_realloc_frame_buffer(&cpi->alt_ref_buffer,
oxcf->width, oxcf->height,
cm->subsampling_x, cm->subsampling_y,
#if CONFIG_VP9_HIGHBITDEPTH
cm->use_highbitdepth,
#endif
- VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL))
+ VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment,
+ NULL, NULL, NULL))
vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
"Failed to allocate altref buffer");
}
-static void alloc_ref_frame_buffers(VP9_COMP *cpi) {
- VP9_COMMON *const cm = &cpi->common;
- if (vp9_alloc_ref_frame_buffers(cm, cm->width, cm->height))
- vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
- "Failed to allocate frame buffers");
-}
-
static void alloc_util_frame_buffers(VP9_COMP *cpi) {
VP9_COMMON *const cm = &cpi->common;
if (vp9_realloc_frame_buffer(&cpi->last_frame_uf,
@@ -469,7 +641,8 @@ static void alloc_util_frame_buffers(VP9_COMP *cpi) {
#if CONFIG_VP9_HIGHBITDEPTH
cm->use_highbitdepth,
#endif
- VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL))
+ VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment,
+ NULL, NULL, NULL))
vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
"Failed to allocate last frame buffer");
@@ -479,7 +652,8 @@ static void alloc_util_frame_buffers(VP9_COMP *cpi) {
#if CONFIG_VP9_HIGHBITDEPTH
cm->use_highbitdepth,
#endif
- VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL))
+ VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment,
+ NULL, NULL, NULL))
vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
"Failed to allocate scaled source buffer");
@@ -489,7 +663,8 @@ static void alloc_util_frame_buffers(VP9_COMP *cpi) {
#if CONFIG_VP9_HIGHBITDEPTH
cm->use_highbitdepth,
#endif
- VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL))
+ VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment,
+ NULL, NULL, NULL))
vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
"Failed to allocate scaled last source buffer");
}
@@ -499,19 +674,20 @@ void vp9_alloc_compressor_data(VP9_COMP *cpi) {
vp9_alloc_context_buffers(cm, cm->width, cm->height);
- vpx_free(cpi->tok);
+ vpx_free(cpi->tile_tok[0][0]);
{
unsigned int tokens = get_token_alloc(cm->mb_rows, cm->mb_cols);
- CHECK_MEM_ERROR(cm, cpi->tok, vpx_calloc(tokens, sizeof(*cpi->tok)));
+ CHECK_MEM_ERROR(cm, cpi->tile_tok[0][0],
+ vpx_calloc(tokens, sizeof(*cpi->tile_tok[0][0])));
}
- vp9_setup_pc_tree(&cpi->common, cpi);
+ vp9_setup_pc_tree(&cpi->common, &cpi->td);
}
static void update_frame_size(VP9_COMP *cpi) {
VP9_COMMON *const cm = &cpi->common;
- MACROBLOCKD *const xd = &cpi->mb.e_mbd;
+ MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
vp9_set_mb_mi(cm, cm->width, cm->height);
vp9_init_context_buffers(cm);
@@ -524,7 +700,8 @@ static void update_frame_size(VP9_COMP *cpi) {
#if CONFIG_VP9_HIGHBITDEPTH
cm->use_highbitdepth,
#endif
- VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL))
+ VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment,
+ NULL, NULL, NULL))
vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
"Failed to reallocate alt_ref_buffer");
}
@@ -541,9 +718,14 @@ static void set_tile_limits(VP9_COMP *cpi) {
int min_log2_tile_cols, max_log2_tile_cols;
vp9_get_tile_n_bits(cm->mi_cols, &min_log2_tile_cols, &max_log2_tile_cols);
- cm->log2_tile_cols = clamp(cpi->oxcf.tile_columns,
- min_log2_tile_cols, max_log2_tile_cols);
- cm->log2_tile_rows = cpi->oxcf.tile_rows;
+ if (is_two_pass_svc(cpi) && cpi->svc.encode_empty_frame_state == ENCODING) {
+ cm->log2_tile_cols = 0;
+ cm->log2_tile_rows = 0;
+ } else {
+ cm->log2_tile_cols = clamp(cpi->oxcf.tile_columns,
+ min_log2_tile_cols, max_log2_tile_cols);
+ cm->log2_tile_rows = cpi->oxcf.tile_rows;
+ }
}
static void init_buffer_indices(VP9_COMP *cpi) {
@@ -563,12 +745,15 @@ static void init_config(struct VP9_COMP *cpi, VP9EncoderConfig *oxcf) {
#if CONFIG_VP9_HIGHBITDEPTH
cm->use_highbitdepth = oxcf->use_highbitdepth;
#endif
- cm->color_space = UNKNOWN;
+ cm->color_space = oxcf->color_space;
cm->width = oxcf->width;
cm->height = oxcf->height;
vp9_alloc_compressor_data(cpi);
+ // Single thread case: use counts in common.
+ cpi->td.counts = &cm->counts;
+
// Spatial scalability.
cpi->svc.number_spatial_layers = oxcf->ss_number_layers;
// Temporal scalability.
@@ -577,7 +762,7 @@ static void init_config(struct VP9_COMP *cpi, VP9EncoderConfig *oxcf) {
if ((cpi->svc.number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR) ||
((cpi->svc.number_temporal_layers > 1 ||
cpi->svc.number_spatial_layers > 1) &&
- cpi->oxcf.pass == 2)) {
+ cpi->oxcf.pass != 1)) {
vp9_init_layer_context(cpi);
}
@@ -746,61 +931,61 @@ static void fnname##_bits12(const uint8_t *src_ptr, \
sad_array[i] >>= 4; \
}
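The fragment above is the tail of the _bits12 SAD wrapper: SADs computed on high-bit-depth samples come back scaled by the extra precision, so the 12-bit variants shift results right by 4 (and, by the same logic, the _bits10 variants by 2) before handing them to the common motion-search code. A reduced sketch of the single-value wrapper shape, assuming a simplified signature (the real macros also emit the avg/x3/x8/x4d forms):

    #define MAKE_SAD_WRAPPER(fnname)                                          \
      static unsigned int fnname##_bits12(const uint8_t *src, int src_stride, \
                                          const uint8_t *ref, int ref_stride) { \
        /* rescale the 12-bit-depth SAD back into the 8-bit range */          \
        return fnname(src, src_stride, ref, ref_stride) >> 4;                 \
      }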
-MAKE_BFP_SAD_WRAPPER(vp9_highbd_sad32x16)
-MAKE_BFP_SADAVG_WRAPPER(vp9_highbd_sad32x16_avg)
-MAKE_BFP_SAD4D_WRAPPER(vp9_highbd_sad32x16x4d)
-MAKE_BFP_SAD_WRAPPER(vp9_highbd_sad16x32)
-MAKE_BFP_SADAVG_WRAPPER(vp9_highbd_sad16x32_avg)
-MAKE_BFP_SAD4D_WRAPPER(vp9_highbd_sad16x32x4d)
-MAKE_BFP_SAD_WRAPPER(vp9_highbd_sad64x32)
-MAKE_BFP_SADAVG_WRAPPER(vp9_highbd_sad64x32_avg)
-MAKE_BFP_SAD4D_WRAPPER(vp9_highbd_sad64x32x4d)
-MAKE_BFP_SAD_WRAPPER(vp9_highbd_sad32x64)
-MAKE_BFP_SADAVG_WRAPPER(vp9_highbd_sad32x64_avg)
-MAKE_BFP_SAD4D_WRAPPER(vp9_highbd_sad32x64x4d)
-MAKE_BFP_SAD_WRAPPER(vp9_highbd_sad32x32)
-MAKE_BFP_SADAVG_WRAPPER(vp9_highbd_sad32x32_avg)
-MAKE_BFP_SAD3_WRAPPER(vp9_highbd_sad32x32x3)
-MAKE_BFP_SAD8_WRAPPER(vp9_highbd_sad32x32x8)
-MAKE_BFP_SAD4D_WRAPPER(vp9_highbd_sad32x32x4d)
-MAKE_BFP_SAD_WRAPPER(vp9_highbd_sad64x64)
-MAKE_BFP_SADAVG_WRAPPER(vp9_highbd_sad64x64_avg)
-MAKE_BFP_SAD3_WRAPPER(vp9_highbd_sad64x64x3)
-MAKE_BFP_SAD8_WRAPPER(vp9_highbd_sad64x64x8)
-MAKE_BFP_SAD4D_WRAPPER(vp9_highbd_sad64x64x4d)
-MAKE_BFP_SAD_WRAPPER(vp9_highbd_sad16x16)
-MAKE_BFP_SADAVG_WRAPPER(vp9_highbd_sad16x16_avg)
-MAKE_BFP_SAD3_WRAPPER(vp9_highbd_sad16x16x3)
-MAKE_BFP_SAD8_WRAPPER(vp9_highbd_sad16x16x8)
-MAKE_BFP_SAD4D_WRAPPER(vp9_highbd_sad16x16x4d)
-MAKE_BFP_SAD_WRAPPER(vp9_highbd_sad16x8)
-MAKE_BFP_SADAVG_WRAPPER(vp9_highbd_sad16x8_avg)
-MAKE_BFP_SAD3_WRAPPER(vp9_highbd_sad16x8x3)
-MAKE_BFP_SAD8_WRAPPER(vp9_highbd_sad16x8x8)
-MAKE_BFP_SAD4D_WRAPPER(vp9_highbd_sad16x8x4d)
-MAKE_BFP_SAD_WRAPPER(vp9_highbd_sad8x16)
-MAKE_BFP_SADAVG_WRAPPER(vp9_highbd_sad8x16_avg)
-MAKE_BFP_SAD3_WRAPPER(vp9_highbd_sad8x16x3)
-MAKE_BFP_SAD8_WRAPPER(vp9_highbd_sad8x16x8)
-MAKE_BFP_SAD4D_WRAPPER(vp9_highbd_sad8x16x4d)
-MAKE_BFP_SAD_WRAPPER(vp9_highbd_sad8x8)
-MAKE_BFP_SADAVG_WRAPPER(vp9_highbd_sad8x8_avg)
-MAKE_BFP_SAD3_WRAPPER(vp9_highbd_sad8x8x3)
-MAKE_BFP_SAD8_WRAPPER(vp9_highbd_sad8x8x8)
-MAKE_BFP_SAD4D_WRAPPER(vp9_highbd_sad8x8x4d)
-MAKE_BFP_SAD_WRAPPER(vp9_highbd_sad8x4)
-MAKE_BFP_SADAVG_WRAPPER(vp9_highbd_sad8x4_avg)
-MAKE_BFP_SAD8_WRAPPER(vp9_highbd_sad8x4x8)
-MAKE_BFP_SAD4D_WRAPPER(vp9_highbd_sad8x4x4d)
-MAKE_BFP_SAD_WRAPPER(vp9_highbd_sad4x8)
-MAKE_BFP_SADAVG_WRAPPER(vp9_highbd_sad4x8_avg)
-MAKE_BFP_SAD8_WRAPPER(vp9_highbd_sad4x8x8)
-MAKE_BFP_SAD4D_WRAPPER(vp9_highbd_sad4x8x4d)
-MAKE_BFP_SAD_WRAPPER(vp9_highbd_sad4x4)
-MAKE_BFP_SADAVG_WRAPPER(vp9_highbd_sad4x4_avg)
-MAKE_BFP_SAD3_WRAPPER(vp9_highbd_sad4x4x3)
-MAKE_BFP_SAD8_WRAPPER(vp9_highbd_sad4x4x8)
-MAKE_BFP_SAD4D_WRAPPER(vp9_highbd_sad4x4x4d)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad32x16)
+MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad32x16_avg)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad32x16x4d)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad16x32)
+MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad16x32_avg)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad16x32x4d)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad64x32)
+MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad64x32_avg)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad64x32x4d)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad32x64)
+MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad32x64_avg)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad32x64x4d)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad32x32)
+MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad32x32_avg)
+MAKE_BFP_SAD3_WRAPPER(vpx_highbd_sad32x32x3)
+MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad32x32x8)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad32x32x4d)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad64x64)
+MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad64x64_avg)
+MAKE_BFP_SAD3_WRAPPER(vpx_highbd_sad64x64x3)
+MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad64x64x8)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad64x64x4d)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad16x16)
+MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad16x16_avg)
+MAKE_BFP_SAD3_WRAPPER(vpx_highbd_sad16x16x3)
+MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad16x16x8)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad16x16x4d)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad16x8)
+MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad16x8_avg)
+MAKE_BFP_SAD3_WRAPPER(vpx_highbd_sad16x8x3)
+MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad16x8x8)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad16x8x4d)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad8x16)
+MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad8x16_avg)
+MAKE_BFP_SAD3_WRAPPER(vpx_highbd_sad8x16x3)
+MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad8x16x8)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad8x16x4d)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad8x8)
+MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad8x8_avg)
+MAKE_BFP_SAD3_WRAPPER(vpx_highbd_sad8x8x3)
+MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad8x8x8)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad8x8x4d)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad8x4)
+MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad8x4_avg)
+MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad8x4x8)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad8x4x4d)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad4x8)
+MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad4x8_avg)
+MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad4x8x8)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad4x8x4d)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad4x4)
+MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad4x4_avg)
+MAKE_BFP_SAD3_WRAPPER(vpx_highbd_sad4x4x3)
+MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad4x4x8)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad4x4x4d)
static void highbd_set_var_fns(VP9_COMP *const cpi) {
VP9_COMMON *const cm = &cpi->common;
@@ -808,398 +993,398 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
switch (cm->bit_depth) {
case VPX_BITS_8:
HIGHBD_BFP(BLOCK_32X16,
- vp9_highbd_sad32x16_bits8,
- vp9_highbd_sad32x16_avg_bits8,
+ vpx_highbd_sad32x16_bits8,
+ vpx_highbd_sad32x16_avg_bits8,
vp9_highbd_variance32x16,
vp9_highbd_sub_pixel_variance32x16,
vp9_highbd_sub_pixel_avg_variance32x16,
NULL,
NULL,
- vp9_highbd_sad32x16x4d_bits8)
+ vpx_highbd_sad32x16x4d_bits8)
HIGHBD_BFP(BLOCK_16X32,
- vp9_highbd_sad16x32_bits8,
- vp9_highbd_sad16x32_avg_bits8,
+ vpx_highbd_sad16x32_bits8,
+ vpx_highbd_sad16x32_avg_bits8,
vp9_highbd_variance16x32,
vp9_highbd_sub_pixel_variance16x32,
vp9_highbd_sub_pixel_avg_variance16x32,
NULL,
NULL,
- vp9_highbd_sad16x32x4d_bits8)
+ vpx_highbd_sad16x32x4d_bits8)
HIGHBD_BFP(BLOCK_64X32,
- vp9_highbd_sad64x32_bits8,
- vp9_highbd_sad64x32_avg_bits8,
+ vpx_highbd_sad64x32_bits8,
+ vpx_highbd_sad64x32_avg_bits8,
vp9_highbd_variance64x32,
vp9_highbd_sub_pixel_variance64x32,
vp9_highbd_sub_pixel_avg_variance64x32,
NULL,
NULL,
- vp9_highbd_sad64x32x4d_bits8)
+ vpx_highbd_sad64x32x4d_bits8)
HIGHBD_BFP(BLOCK_32X64,
- vp9_highbd_sad32x64_bits8,
- vp9_highbd_sad32x64_avg_bits8,
+ vpx_highbd_sad32x64_bits8,
+ vpx_highbd_sad32x64_avg_bits8,
vp9_highbd_variance32x64,
vp9_highbd_sub_pixel_variance32x64,
vp9_highbd_sub_pixel_avg_variance32x64,
NULL,
NULL,
- vp9_highbd_sad32x64x4d_bits8)
+ vpx_highbd_sad32x64x4d_bits8)
HIGHBD_BFP(BLOCK_32X32,
- vp9_highbd_sad32x32_bits8,
- vp9_highbd_sad32x32_avg_bits8,
+ vpx_highbd_sad32x32_bits8,
+ vpx_highbd_sad32x32_avg_bits8,
vp9_highbd_variance32x32,
vp9_highbd_sub_pixel_variance32x32,
vp9_highbd_sub_pixel_avg_variance32x32,
- vp9_highbd_sad32x32x3_bits8,
- vp9_highbd_sad32x32x8_bits8,
- vp9_highbd_sad32x32x4d_bits8)
+ vpx_highbd_sad32x32x3_bits8,
+ vpx_highbd_sad32x32x8_bits8,
+ vpx_highbd_sad32x32x4d_bits8)
HIGHBD_BFP(BLOCK_64X64,
- vp9_highbd_sad64x64_bits8,
- vp9_highbd_sad64x64_avg_bits8,
+ vpx_highbd_sad64x64_bits8,
+ vpx_highbd_sad64x64_avg_bits8,
vp9_highbd_variance64x64,
vp9_highbd_sub_pixel_variance64x64,
vp9_highbd_sub_pixel_avg_variance64x64,
- vp9_highbd_sad64x64x3_bits8,
- vp9_highbd_sad64x64x8_bits8,
- vp9_highbd_sad64x64x4d_bits8)
+ vpx_highbd_sad64x64x3_bits8,
+ vpx_highbd_sad64x64x8_bits8,
+ vpx_highbd_sad64x64x4d_bits8)
HIGHBD_BFP(BLOCK_16X16,
- vp9_highbd_sad16x16_bits8,
- vp9_highbd_sad16x16_avg_bits8,
+ vpx_highbd_sad16x16_bits8,
+ vpx_highbd_sad16x16_avg_bits8,
vp9_highbd_variance16x16,
vp9_highbd_sub_pixel_variance16x16,
vp9_highbd_sub_pixel_avg_variance16x16,
- vp9_highbd_sad16x16x3_bits8,
- vp9_highbd_sad16x16x8_bits8,
- vp9_highbd_sad16x16x4d_bits8)
+ vpx_highbd_sad16x16x3_bits8,
+ vpx_highbd_sad16x16x8_bits8,
+ vpx_highbd_sad16x16x4d_bits8)
HIGHBD_BFP(BLOCK_16X8,
- vp9_highbd_sad16x8_bits8,
- vp9_highbd_sad16x8_avg_bits8,
+ vpx_highbd_sad16x8_bits8,
+ vpx_highbd_sad16x8_avg_bits8,
vp9_highbd_variance16x8,
vp9_highbd_sub_pixel_variance16x8,
vp9_highbd_sub_pixel_avg_variance16x8,
- vp9_highbd_sad16x8x3_bits8,
- vp9_highbd_sad16x8x8_bits8,
- vp9_highbd_sad16x8x4d_bits8)
+ vpx_highbd_sad16x8x3_bits8,
+ vpx_highbd_sad16x8x8_bits8,
+ vpx_highbd_sad16x8x4d_bits8)
HIGHBD_BFP(BLOCK_8X16,
- vp9_highbd_sad8x16_bits8,
- vp9_highbd_sad8x16_avg_bits8,
+ vpx_highbd_sad8x16_bits8,
+ vpx_highbd_sad8x16_avg_bits8,
vp9_highbd_variance8x16,
vp9_highbd_sub_pixel_variance8x16,
vp9_highbd_sub_pixel_avg_variance8x16,
- vp9_highbd_sad8x16x3_bits8,
- vp9_highbd_sad8x16x8_bits8,
- vp9_highbd_sad8x16x4d_bits8)
+ vpx_highbd_sad8x16x3_bits8,
+ vpx_highbd_sad8x16x8_bits8,
+ vpx_highbd_sad8x16x4d_bits8)
HIGHBD_BFP(BLOCK_8X8,
- vp9_highbd_sad8x8_bits8,
- vp9_highbd_sad8x8_avg_bits8,
+ vpx_highbd_sad8x8_bits8,
+ vpx_highbd_sad8x8_avg_bits8,
vp9_highbd_variance8x8,
vp9_highbd_sub_pixel_variance8x8,
vp9_highbd_sub_pixel_avg_variance8x8,
- vp9_highbd_sad8x8x3_bits8,
- vp9_highbd_sad8x8x8_bits8,
- vp9_highbd_sad8x8x4d_bits8)
+ vpx_highbd_sad8x8x3_bits8,
+ vpx_highbd_sad8x8x8_bits8,
+ vpx_highbd_sad8x8x4d_bits8)
HIGHBD_BFP(BLOCK_8X4,
- vp9_highbd_sad8x4_bits8,
- vp9_highbd_sad8x4_avg_bits8,
+ vpx_highbd_sad8x4_bits8,
+ vpx_highbd_sad8x4_avg_bits8,
vp9_highbd_variance8x4,
vp9_highbd_sub_pixel_variance8x4,
vp9_highbd_sub_pixel_avg_variance8x4,
NULL,
- vp9_highbd_sad8x4x8_bits8,
- vp9_highbd_sad8x4x4d_bits8)
+ vpx_highbd_sad8x4x8_bits8,
+ vpx_highbd_sad8x4x4d_bits8)
HIGHBD_BFP(BLOCK_4X8,
- vp9_highbd_sad4x8_bits8,
- vp9_highbd_sad4x8_avg_bits8,
+ vpx_highbd_sad4x8_bits8,
+ vpx_highbd_sad4x8_avg_bits8,
vp9_highbd_variance4x8,
vp9_highbd_sub_pixel_variance4x8,
vp9_highbd_sub_pixel_avg_variance4x8,
NULL,
- vp9_highbd_sad4x8x8_bits8,
- vp9_highbd_sad4x8x4d_bits8)
+ vpx_highbd_sad4x8x8_bits8,
+ vpx_highbd_sad4x8x4d_bits8)
HIGHBD_BFP(BLOCK_4X4,
- vp9_highbd_sad4x4_bits8,
- vp9_highbd_sad4x4_avg_bits8,
+ vpx_highbd_sad4x4_bits8,
+ vpx_highbd_sad4x4_avg_bits8,
vp9_highbd_variance4x4,
vp9_highbd_sub_pixel_variance4x4,
vp9_highbd_sub_pixel_avg_variance4x4,
- vp9_highbd_sad4x4x3_bits8,
- vp9_highbd_sad4x4x8_bits8,
- vp9_highbd_sad4x4x4d_bits8)
+ vpx_highbd_sad4x4x3_bits8,
+ vpx_highbd_sad4x4x8_bits8,
+ vpx_highbd_sad4x4x4d_bits8)
break;
case VPX_BITS_10:
HIGHBD_BFP(BLOCK_32X16,
- vp9_highbd_sad32x16_bits10,
- vp9_highbd_sad32x16_avg_bits10,
+ vpx_highbd_sad32x16_bits10,
+ vpx_highbd_sad32x16_avg_bits10,
vp9_highbd_10_variance32x16,
vp9_highbd_10_sub_pixel_variance32x16,
vp9_highbd_10_sub_pixel_avg_variance32x16,
NULL,
NULL,
- vp9_highbd_sad32x16x4d_bits10)
+ vpx_highbd_sad32x16x4d_bits10)
HIGHBD_BFP(BLOCK_16X32,
- vp9_highbd_sad16x32_bits10,
- vp9_highbd_sad16x32_avg_bits10,
+ vpx_highbd_sad16x32_bits10,
+ vpx_highbd_sad16x32_avg_bits10,
vp9_highbd_10_variance16x32,
vp9_highbd_10_sub_pixel_variance16x32,
vp9_highbd_10_sub_pixel_avg_variance16x32,
NULL,
NULL,
- vp9_highbd_sad16x32x4d_bits10)
+ vpx_highbd_sad16x32x4d_bits10)
HIGHBD_BFP(BLOCK_64X32,
- vp9_highbd_sad64x32_bits10,
- vp9_highbd_sad64x32_avg_bits10,
+ vpx_highbd_sad64x32_bits10,
+ vpx_highbd_sad64x32_avg_bits10,
vp9_highbd_10_variance64x32,
vp9_highbd_10_sub_pixel_variance64x32,
vp9_highbd_10_sub_pixel_avg_variance64x32,
NULL,
NULL,
- vp9_highbd_sad64x32x4d_bits10)
+ vpx_highbd_sad64x32x4d_bits10)
HIGHBD_BFP(BLOCK_32X64,
- vp9_highbd_sad32x64_bits10,
- vp9_highbd_sad32x64_avg_bits10,
+ vpx_highbd_sad32x64_bits10,
+ vpx_highbd_sad32x64_avg_bits10,
vp9_highbd_10_variance32x64,
vp9_highbd_10_sub_pixel_variance32x64,
vp9_highbd_10_sub_pixel_avg_variance32x64,
NULL,
NULL,
- vp9_highbd_sad32x64x4d_bits10)
+ vpx_highbd_sad32x64x4d_bits10)
HIGHBD_BFP(BLOCK_32X32,
- vp9_highbd_sad32x32_bits10,
- vp9_highbd_sad32x32_avg_bits10,
+ vpx_highbd_sad32x32_bits10,
+ vpx_highbd_sad32x32_avg_bits10,
vp9_highbd_10_variance32x32,
vp9_highbd_10_sub_pixel_variance32x32,
vp9_highbd_10_sub_pixel_avg_variance32x32,
- vp9_highbd_sad32x32x3_bits10,
- vp9_highbd_sad32x32x8_bits10,
- vp9_highbd_sad32x32x4d_bits10)
+ vpx_highbd_sad32x32x3_bits10,
+ vpx_highbd_sad32x32x8_bits10,
+ vpx_highbd_sad32x32x4d_bits10)
HIGHBD_BFP(BLOCK_64X64,
- vp9_highbd_sad64x64_bits10,
- vp9_highbd_sad64x64_avg_bits10,
+ vpx_highbd_sad64x64_bits10,
+ vpx_highbd_sad64x64_avg_bits10,
vp9_highbd_10_variance64x64,
vp9_highbd_10_sub_pixel_variance64x64,
vp9_highbd_10_sub_pixel_avg_variance64x64,
- vp9_highbd_sad64x64x3_bits10,
- vp9_highbd_sad64x64x8_bits10,
- vp9_highbd_sad64x64x4d_bits10)
+ vpx_highbd_sad64x64x3_bits10,
+ vpx_highbd_sad64x64x8_bits10,
+ vpx_highbd_sad64x64x4d_bits10)
HIGHBD_BFP(BLOCK_16X16,
- vp9_highbd_sad16x16_bits10,
- vp9_highbd_sad16x16_avg_bits10,
+ vpx_highbd_sad16x16_bits10,
+ vpx_highbd_sad16x16_avg_bits10,
vp9_highbd_10_variance16x16,
vp9_highbd_10_sub_pixel_variance16x16,
vp9_highbd_10_sub_pixel_avg_variance16x16,
- vp9_highbd_sad16x16x3_bits10,
- vp9_highbd_sad16x16x8_bits10,
- vp9_highbd_sad16x16x4d_bits10)
+ vpx_highbd_sad16x16x3_bits10,
+ vpx_highbd_sad16x16x8_bits10,
+ vpx_highbd_sad16x16x4d_bits10)
HIGHBD_BFP(BLOCK_16X8,
- vp9_highbd_sad16x8_bits10,
- vp9_highbd_sad16x8_avg_bits10,
+ vpx_highbd_sad16x8_bits10,
+ vpx_highbd_sad16x8_avg_bits10,
vp9_highbd_10_variance16x8,
vp9_highbd_10_sub_pixel_variance16x8,
vp9_highbd_10_sub_pixel_avg_variance16x8,
- vp9_highbd_sad16x8x3_bits10,
- vp9_highbd_sad16x8x8_bits10,
- vp9_highbd_sad16x8x4d_bits10)
+ vpx_highbd_sad16x8x3_bits10,
+ vpx_highbd_sad16x8x8_bits10,
+ vpx_highbd_sad16x8x4d_bits10)
HIGHBD_BFP(BLOCK_8X16,
- vp9_highbd_sad8x16_bits10,
- vp9_highbd_sad8x16_avg_bits10,
+ vpx_highbd_sad8x16_bits10,
+ vpx_highbd_sad8x16_avg_bits10,
vp9_highbd_10_variance8x16,
vp9_highbd_10_sub_pixel_variance8x16,
vp9_highbd_10_sub_pixel_avg_variance8x16,
- vp9_highbd_sad8x16x3_bits10,
- vp9_highbd_sad8x16x8_bits10,
- vp9_highbd_sad8x16x4d_bits10)
+ vpx_highbd_sad8x16x3_bits10,
+ vpx_highbd_sad8x16x8_bits10,
+ vpx_highbd_sad8x16x4d_bits10)
HIGHBD_BFP(BLOCK_8X8,
- vp9_highbd_sad8x8_bits10,
- vp9_highbd_sad8x8_avg_bits10,
+ vpx_highbd_sad8x8_bits10,
+ vpx_highbd_sad8x8_avg_bits10,
vp9_highbd_10_variance8x8,
vp9_highbd_10_sub_pixel_variance8x8,
vp9_highbd_10_sub_pixel_avg_variance8x8,
- vp9_highbd_sad8x8x3_bits10,
- vp9_highbd_sad8x8x8_bits10,
- vp9_highbd_sad8x8x4d_bits10)
+ vpx_highbd_sad8x8x3_bits10,
+ vpx_highbd_sad8x8x8_bits10,
+ vpx_highbd_sad8x8x4d_bits10)
HIGHBD_BFP(BLOCK_8X4,
- vp9_highbd_sad8x4_bits10,
- vp9_highbd_sad8x4_avg_bits10,
+ vpx_highbd_sad8x4_bits10,
+ vpx_highbd_sad8x4_avg_bits10,
vp9_highbd_10_variance8x4,
vp9_highbd_10_sub_pixel_variance8x4,
vp9_highbd_10_sub_pixel_avg_variance8x4,
NULL,
- vp9_highbd_sad8x4x8_bits10,
- vp9_highbd_sad8x4x4d_bits10)
+ vpx_highbd_sad8x4x8_bits10,
+ vpx_highbd_sad8x4x4d_bits10)
HIGHBD_BFP(BLOCK_4X8,
- vp9_highbd_sad4x8_bits10,
- vp9_highbd_sad4x8_avg_bits10,
+ vpx_highbd_sad4x8_bits10,
+ vpx_highbd_sad4x8_avg_bits10,
vp9_highbd_10_variance4x8,
vp9_highbd_10_sub_pixel_variance4x8,
vp9_highbd_10_sub_pixel_avg_variance4x8,
NULL,
- vp9_highbd_sad4x8x8_bits10,
- vp9_highbd_sad4x8x4d_bits10)
+ vpx_highbd_sad4x8x8_bits10,
+ vpx_highbd_sad4x8x4d_bits10)
HIGHBD_BFP(BLOCK_4X4,
- vp9_highbd_sad4x4_bits10,
- vp9_highbd_sad4x4_avg_bits10,
+ vpx_highbd_sad4x4_bits10,
+ vpx_highbd_sad4x4_avg_bits10,
vp9_highbd_10_variance4x4,
vp9_highbd_10_sub_pixel_variance4x4,
vp9_highbd_10_sub_pixel_avg_variance4x4,
- vp9_highbd_sad4x4x3_bits10,
- vp9_highbd_sad4x4x8_bits10,
- vp9_highbd_sad4x4x4d_bits10)
+ vpx_highbd_sad4x4x3_bits10,
+ vpx_highbd_sad4x4x8_bits10,
+ vpx_highbd_sad4x4x4d_bits10)
break;
case VPX_BITS_12:
HIGHBD_BFP(BLOCK_32X16,
- vp9_highbd_sad32x16_bits12,
- vp9_highbd_sad32x16_avg_bits12,
+ vpx_highbd_sad32x16_bits12,
+ vpx_highbd_sad32x16_avg_bits12,
vp9_highbd_12_variance32x16,
vp9_highbd_12_sub_pixel_variance32x16,
vp9_highbd_12_sub_pixel_avg_variance32x16,
NULL,
NULL,
- vp9_highbd_sad32x16x4d_bits12)
+ vpx_highbd_sad32x16x4d_bits12)
HIGHBD_BFP(BLOCK_16X32,
- vp9_highbd_sad16x32_bits12,
- vp9_highbd_sad16x32_avg_bits12,
+ vpx_highbd_sad16x32_bits12,
+ vpx_highbd_sad16x32_avg_bits12,
vp9_highbd_12_variance16x32,
vp9_highbd_12_sub_pixel_variance16x32,
vp9_highbd_12_sub_pixel_avg_variance16x32,
NULL,
NULL,
- vp9_highbd_sad16x32x4d_bits12)
+ vpx_highbd_sad16x32x4d_bits12)
HIGHBD_BFP(BLOCK_64X32,
- vp9_highbd_sad64x32_bits12,
- vp9_highbd_sad64x32_avg_bits12,
+ vpx_highbd_sad64x32_bits12,
+ vpx_highbd_sad64x32_avg_bits12,
vp9_highbd_12_variance64x32,
vp9_highbd_12_sub_pixel_variance64x32,
vp9_highbd_12_sub_pixel_avg_variance64x32,
NULL,
NULL,
- vp9_highbd_sad64x32x4d_bits12)
+ vpx_highbd_sad64x32x4d_bits12)
HIGHBD_BFP(BLOCK_32X64,
- vp9_highbd_sad32x64_bits12,
- vp9_highbd_sad32x64_avg_bits12,
+ vpx_highbd_sad32x64_bits12,
+ vpx_highbd_sad32x64_avg_bits12,
vp9_highbd_12_variance32x64,
vp9_highbd_12_sub_pixel_variance32x64,
vp9_highbd_12_sub_pixel_avg_variance32x64,
NULL,
NULL,
- vp9_highbd_sad32x64x4d_bits12)
+ vpx_highbd_sad32x64x4d_bits12)
HIGHBD_BFP(BLOCK_32X32,
- vp9_highbd_sad32x32_bits12,
- vp9_highbd_sad32x32_avg_bits12,
+ vpx_highbd_sad32x32_bits12,
+ vpx_highbd_sad32x32_avg_bits12,
vp9_highbd_12_variance32x32,
vp9_highbd_12_sub_pixel_variance32x32,
vp9_highbd_12_sub_pixel_avg_variance32x32,
- vp9_highbd_sad32x32x3_bits12,
- vp9_highbd_sad32x32x8_bits12,
- vp9_highbd_sad32x32x4d_bits12)
+ vpx_highbd_sad32x32x3_bits12,
+ vpx_highbd_sad32x32x8_bits12,
+ vpx_highbd_sad32x32x4d_bits12)
HIGHBD_BFP(BLOCK_64X64,
- vp9_highbd_sad64x64_bits12,
- vp9_highbd_sad64x64_avg_bits12,
+ vpx_highbd_sad64x64_bits12,
+ vpx_highbd_sad64x64_avg_bits12,
vp9_highbd_12_variance64x64,
vp9_highbd_12_sub_pixel_variance64x64,
vp9_highbd_12_sub_pixel_avg_variance64x64,
- vp9_highbd_sad64x64x3_bits12,
- vp9_highbd_sad64x64x8_bits12,
- vp9_highbd_sad64x64x4d_bits12)
+ vpx_highbd_sad64x64x3_bits12,
+ vpx_highbd_sad64x64x8_bits12,
+ vpx_highbd_sad64x64x4d_bits12)
HIGHBD_BFP(BLOCK_16X16,
- vp9_highbd_sad16x16_bits12,
- vp9_highbd_sad16x16_avg_bits12,
+ vpx_highbd_sad16x16_bits12,
+ vpx_highbd_sad16x16_avg_bits12,
vp9_highbd_12_variance16x16,
vp9_highbd_12_sub_pixel_variance16x16,
vp9_highbd_12_sub_pixel_avg_variance16x16,
- vp9_highbd_sad16x16x3_bits12,
- vp9_highbd_sad16x16x8_bits12,
- vp9_highbd_sad16x16x4d_bits12)
+ vpx_highbd_sad16x16x3_bits12,
+ vpx_highbd_sad16x16x8_bits12,
+ vpx_highbd_sad16x16x4d_bits12)
HIGHBD_BFP(BLOCK_16X8,
- vp9_highbd_sad16x8_bits12,
- vp9_highbd_sad16x8_avg_bits12,
+ vpx_highbd_sad16x8_bits12,
+ vpx_highbd_sad16x8_avg_bits12,
vp9_highbd_12_variance16x8,
vp9_highbd_12_sub_pixel_variance16x8,
vp9_highbd_12_sub_pixel_avg_variance16x8,
- vp9_highbd_sad16x8x3_bits12,
- vp9_highbd_sad16x8x8_bits12,
- vp9_highbd_sad16x8x4d_bits12)
+ vpx_highbd_sad16x8x3_bits12,
+ vpx_highbd_sad16x8x8_bits12,
+ vpx_highbd_sad16x8x4d_bits12)
HIGHBD_BFP(BLOCK_8X16,
- vp9_highbd_sad8x16_bits12,
- vp9_highbd_sad8x16_avg_bits12,
+ vpx_highbd_sad8x16_bits12,
+ vpx_highbd_sad8x16_avg_bits12,
vp9_highbd_12_variance8x16,
vp9_highbd_12_sub_pixel_variance8x16,
vp9_highbd_12_sub_pixel_avg_variance8x16,
- vp9_highbd_sad8x16x3_bits12,
- vp9_highbd_sad8x16x8_bits12,
- vp9_highbd_sad8x16x4d_bits12)
+ vpx_highbd_sad8x16x3_bits12,
+ vpx_highbd_sad8x16x8_bits12,
+ vpx_highbd_sad8x16x4d_bits12)
HIGHBD_BFP(BLOCK_8X8,
- vp9_highbd_sad8x8_bits12,
- vp9_highbd_sad8x8_avg_bits12,
+ vpx_highbd_sad8x8_bits12,
+ vpx_highbd_sad8x8_avg_bits12,
vp9_highbd_12_variance8x8,
vp9_highbd_12_sub_pixel_variance8x8,
vp9_highbd_12_sub_pixel_avg_variance8x8,
- vp9_highbd_sad8x8x3_bits12,
- vp9_highbd_sad8x8x8_bits12,
- vp9_highbd_sad8x8x4d_bits12)
+ vpx_highbd_sad8x8x3_bits12,
+ vpx_highbd_sad8x8x8_bits12,
+ vpx_highbd_sad8x8x4d_bits12)
HIGHBD_BFP(BLOCK_8X4,
- vp9_highbd_sad8x4_bits12,
- vp9_highbd_sad8x4_avg_bits12,
+ vpx_highbd_sad8x4_bits12,
+ vpx_highbd_sad8x4_avg_bits12,
vp9_highbd_12_variance8x4,
vp9_highbd_12_sub_pixel_variance8x4,
vp9_highbd_12_sub_pixel_avg_variance8x4,
NULL,
- vp9_highbd_sad8x4x8_bits12,
- vp9_highbd_sad8x4x4d_bits12)
+ vpx_highbd_sad8x4x8_bits12,
+ vpx_highbd_sad8x4x4d_bits12)
HIGHBD_BFP(BLOCK_4X8,
- vp9_highbd_sad4x8_bits12,
- vp9_highbd_sad4x8_avg_bits12,
+ vpx_highbd_sad4x8_bits12,
+ vpx_highbd_sad4x8_avg_bits12,
vp9_highbd_12_variance4x8,
vp9_highbd_12_sub_pixel_variance4x8,
vp9_highbd_12_sub_pixel_avg_variance4x8,
NULL,
- vp9_highbd_sad4x8x8_bits12,
- vp9_highbd_sad4x8x4d_bits12)
+ vpx_highbd_sad4x8x8_bits12,
+ vpx_highbd_sad4x8x4d_bits12)
HIGHBD_BFP(BLOCK_4X4,
- vp9_highbd_sad4x4_bits12,
- vp9_highbd_sad4x4_avg_bits12,
+ vpx_highbd_sad4x4_bits12,
+ vpx_highbd_sad4x4_avg_bits12,
vp9_highbd_12_variance4x4,
vp9_highbd_12_sub_pixel_variance4x4,
vp9_highbd_12_sub_pixel_avg_variance4x4,
- vp9_highbd_sad4x4x3_bits12,
- vp9_highbd_sad4x4x8_bits12,
- vp9_highbd_sad4x4x4d_bits12)
+ vpx_highbd_sad4x4x3_bits12,
+ vpx_highbd_sad4x4x8_bits12,
+ vpx_highbd_sad4x4x4d_bits12)
break;
default:
@@ -1210,6 +1395,32 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
}
#endif // CONFIG_VP9_HIGHBITDEPTH
+static void realloc_segmentation_maps(VP9_COMP *cpi) {
+ VP9_COMMON *const cm = &cpi->common;
+
+ // Create the encoder segmentation map and set all entries to 0
+ vpx_free(cpi->segmentation_map);
+ CHECK_MEM_ERROR(cm, cpi->segmentation_map,
+ vpx_calloc(cm->mi_rows * cm->mi_cols, 1));
+
+ // Create a map used for cyclic background refresh.
+ if (cpi->cyclic_refresh)
+ vp9_cyclic_refresh_free(cpi->cyclic_refresh);
+ CHECK_MEM_ERROR(cm, cpi->cyclic_refresh,
+ vp9_cyclic_refresh_alloc(cm->mi_rows, cm->mi_cols));
+
+ // Create a map used to mark inactive areas.
+ vpx_free(cpi->active_map.map);
+ CHECK_MEM_ERROR(cm, cpi->active_map.map,
+ vpx_calloc(cm->mi_rows * cm->mi_cols, 1));
+
+ // And a placeholder structure for the coding context, for use if we
+ // want to save and restore it
+ vpx_free(cpi->coding_context.last_frame_seg_map_copy);
+ CHECK_MEM_ERROR(cm, cpi->coding_context.last_frame_seg_map_copy,
+ vpx_calloc(cm->mi_rows * cm->mi_cols, 1));
+}
+
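realloc_segmentation_maps leans on CHECK_MEM_ERROR, which assigns and then aborts the encode via vpx_internal_error (a longjmp back to the setjmp armed in the caller) when the allocation fails; that is why the function needs no explicit unwinding. The macro's approximate shape, paraphrased rather than quoted (see the vp9 common headers for the exact definition):

    #define CHECK_MEM_ERROR(cm, lval, expr) do {                   \
        (lval) = (expr);                                           \
        if (!(lval))                                               \
          vpx_internal_error(&(cm)->error, VPX_CODEC_MEM_ERROR,    \
                             "Failed to allocate " #lval);         \
      } while (0)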
void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) {
VP9_COMMON *const cm = &cpi->common;
RATE_CONTROL *const rc = &cpi->rc;
@@ -1217,6 +1428,7 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) {
if (cm->profile != oxcf->profile)
cm->profile = oxcf->profile;
cm->bit_depth = oxcf->bit_depth;
+ cm->color_space = oxcf->color_space;
if (cm->profile <= PROFILE_1)
assert(cm->bit_depth == VPX_BITS_8);
@@ -1225,7 +1437,7 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) {
cpi->oxcf = *oxcf;
#if CONFIG_VP9_HIGHBITDEPTH
- cpi->mb.e_mbd.bd = (int)cm->bit_depth;
+ cpi->td.mb.e_mbd.bd = (int)cm->bit_depth;
#endif // CONFIG_VP9_HIGHBITDEPTH
rc->baseline_gf_interval = DEFAULT_GF_INTERVAL;
@@ -1264,13 +1476,16 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) {
cm->display_width = cpi->oxcf.width;
cm->display_height = cpi->oxcf.height;
+ cm->width = cpi->oxcf.width;
+ cm->height = cpi->oxcf.height;
if (cpi->initial_width) {
- // Increasing the size of the frame beyond the first seen frame, or some
- // otherwise signaled maximum size, is not supported.
- // TODO(jkoleszar): exit gracefully.
- assert(cm->width <= cpi->initial_width);
- assert(cm->height <= cpi->initial_height);
+ if (cm->width > cpi->initial_width || cm->height > cpi->initial_height) {
+ vp9_free_context_buffers(cm);
+ vp9_alloc_compressor_data(cpi);
+ realloc_segmentation_maps(cpi);
+ cpi->initial_width = cpi->initial_height = 0;
+ }
}
update_frame_size(cpi);
@@ -1278,7 +1493,7 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) {
cpi->oxcf.rc_mode == VPX_CBR) ||
((cpi->svc.number_temporal_layers > 1 ||
cpi->svc.number_spatial_layers > 1) &&
- cpi->oxcf.pass == 2)) {
+ cpi->oxcf.pass != 1)) {
vp9_update_layer_context_change_config(cpi,
(int)cpi->oxcf.target_bandwidth);
}
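The removed asserts mean that growing the frame beyond the first-seen size is no longer fatal: vp9_change_config now frees the context buffers, reallocates the compressor data and segmentation maps, and clears initial_width/height so the next frame re-establishes the baseline. In API terms this is what lets a caller enlarge the stream mid-encode, roughly as follows (codec and initial_cfg are assumed from earlier encoder setup):

    vpx_codec_enc_cfg_t cfg = initial_cfg;  /* the cfg the encoder was opened with */
    cfg.g_w = 1920;                         /* larger than the initial frames */
    cfg.g_h = 1080;
    /* routes into vp9_change_config(), which now reallocates as needed */
    vpx_codec_enc_config_set(&codec, &cfg);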
@@ -1300,17 +1515,6 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) {
#if CONFIG_VP9_HIGHBITDEPTH
highbd_set_var_fns(cpi);
#endif
-
-#if CONFIG_VP9_TEMPORAL_DENOISING
- if (cpi->oxcf.noise_sensitivity > 0) {
- vp9_denoiser_alloc(&(cpi->denoiser), cm->width, cm->height,
- cm->subsampling_x, cm->subsampling_y,
-#if CONFIG_VP9_HIGHBITDEPTH
- cm->use_highbitdepth,
-#endif
- VP9_ENC_BORDER_IN_PIXELS);
- }
-#endif
}
#ifndef M_LOG2_E
@@ -1356,10 +1560,11 @@ static void cal_nmvsadcosts_hp(int *mvsadcost[2]) {
}
-VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf) {
- unsigned int i, j;
- VP9_COMP *const cpi = vpx_memalign(32, sizeof(VP9_COMP));
- VP9_COMMON *const cm = cpi != NULL ? &cpi->common : NULL;
+VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf,
+ BufferPool *const pool) {
+ unsigned int i;
+ VP9_COMP *volatile const cpi = vpx_memalign(32, sizeof(VP9_COMP));
+ VP9_COMMON *volatile const cm = cpi != NULL ? &cpi->common : NULL;
if (!cm)
return NULL;
@@ -1373,31 +1578,27 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf) {
}
cm->error.setjmp = 1;
+ cm->alloc_mi = vp9_enc_alloc_mi;
+ cm->free_mi = vp9_enc_free_mi;
+ cm->setup_mi = vp9_enc_setup_mi;
+
+ CHECK_MEM_ERROR(cm, cm->fc,
+ (FRAME_CONTEXT *)vpx_calloc(1, sizeof(*cm->fc)));
+ CHECK_MEM_ERROR(cm, cm->frame_contexts,
+ (FRAME_CONTEXT *)vpx_calloc(FRAME_CONTEXTS,
+ sizeof(*cm->frame_contexts)));
cpi->use_svc = 0;
+ cpi->common.buffer_pool = pool;
init_config(cpi, oxcf);
vp9_rc_init(&cpi->oxcf, oxcf->pass, &cpi->rc);
cm->current_video_frame = 0;
cpi->partition_search_skippable_frame = 0;
+ cpi->tile_data = NULL;
- // Create the encoder segmentation map and set all entries to 0
- CHECK_MEM_ERROR(cm, cpi->segmentation_map,
- vpx_calloc(cm->mi_rows * cm->mi_cols, 1));
-
- // Create a complexity map used for rd adjustment
- CHECK_MEM_ERROR(cm, cpi->complexity_map,
- vpx_calloc(cm->mi_rows * cm->mi_cols, 1));
-
- // Create a map used for cyclic background refresh.
- CHECK_MEM_ERROR(cm, cpi->cyclic_refresh,
- vp9_cyclic_refresh_alloc(cm->mi_rows, cm->mi_cols));
-
- // And a place holder structure is the coding context
- // for use if we want to save and restore it
- CHECK_MEM_ERROR(cm, cpi->coding_context.last_frame_seg_map_copy,
- vpx_calloc(cm->mi_rows * cm->mi_cols, 1));
+ realloc_segmentation_maps(cpi);
CHECK_MEM_ERROR(cm, cpi->nmvcosts[0],
vpx_calloc(MV_VALS, sizeof(*cpi->nmvcosts[0])));
@@ -1435,45 +1636,24 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf) {
#endif
cpi->refresh_alt_ref_frame = 0;
-
- // Note that at the moment multi_arf will not work with svc.
- // For the current check in all the execution paths are defaulted to 0
- // pending further tuning and testing. The code is left in place here
- // as a place holder in regard to the required paths.
cpi->multi_arf_last_grp_enabled = 0;
- if (oxcf->pass == 2) {
- if (cpi->use_svc) {
- cpi->multi_arf_allowed = 0;
- cpi->multi_arf_enabled = 0;
- } else {
- // Disable by default for now.
- cpi->multi_arf_allowed = 0;
- cpi->multi_arf_enabled = 0;
- }
- } else {
- cpi->multi_arf_allowed = 0;
- cpi->multi_arf_enabled = 0;
- }
cpi->b_calculate_psnr = CONFIG_INTERNAL_STATS;
#if CONFIG_INTERNAL_STATS
cpi->b_calculate_ssimg = 0;
+ cpi->b_calculate_blockiness = 1;
+ cpi->b_calculate_consistency = 1;
+ cpi->total_inconsistency = 0;
+ cpi->psnr.worst = 100.0;
+ cpi->worst_ssim = 100.0;
cpi->count = 0;
cpi->bytes = 0;
if (cpi->b_calculate_psnr) {
- cpi->total_y = 0.0;
- cpi->total_u = 0.0;
- cpi->total_v = 0.0;
- cpi->total = 0.0;
cpi->total_sq_error = 0;
cpi->total_samples = 0;
- cpi->totalp_y = 0.0;
- cpi->totalp_u = 0.0;
- cpi->totalp_v = 0.0;
- cpi->totalp = 0.0;
cpi->totalp_sq_error = 0;
cpi->totalp_samples = 0;
@@ -1485,34 +1665,47 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf) {
}
if (cpi->b_calculate_ssimg) {
- cpi->total_ssimg_y = 0;
- cpi->total_ssimg_u = 0;
- cpi->total_ssimg_v = 0;
- cpi->total_ssimg_all = 0;
+ cpi->ssimg.worst = 100.0;
+ }
+ cpi->fastssim.worst = 100.0;
+
+ cpi->psnrhvs.worst = 100.0;
+
+ if (cpi->b_calculate_blockiness) {
+ cpi->total_blockiness = 0;
+ cpi->worst_blockiness = 0.0;
+ }
+
+ if (cpi->b_calculate_consistency) {
+ cpi->ssim_vars = vpx_malloc(sizeof(*cpi->ssim_vars) * 720 * 480);
+ cpi->worst_consistency = 100.0;
}
#endif
cpi->first_time_stamp_ever = INT64_MAX;
- cal_nmvjointsadcost(cpi->mb.nmvjointsadcost);
- cpi->mb.nmvcost[0] = &cpi->nmvcosts[0][MV_MAX];
- cpi->mb.nmvcost[1] = &cpi->nmvcosts[1][MV_MAX];
- cpi->mb.nmvsadcost[0] = &cpi->nmvsadcosts[0][MV_MAX];
- cpi->mb.nmvsadcost[1] = &cpi->nmvsadcosts[1][MV_MAX];
- cal_nmvsadcosts(cpi->mb.nmvsadcost);
+ cal_nmvjointsadcost(cpi->td.mb.nmvjointsadcost);
+ cpi->td.mb.nmvcost[0] = &cpi->nmvcosts[0][MV_MAX];
+ cpi->td.mb.nmvcost[1] = &cpi->nmvcosts[1][MV_MAX];
+ cpi->td.mb.nmvsadcost[0] = &cpi->nmvsadcosts[0][MV_MAX];
+ cpi->td.mb.nmvsadcost[1] = &cpi->nmvsadcosts[1][MV_MAX];
+ cal_nmvsadcosts(cpi->td.mb.nmvsadcost);
- cpi->mb.nmvcost_hp[0] = &cpi->nmvcosts_hp[0][MV_MAX];
- cpi->mb.nmvcost_hp[1] = &cpi->nmvcosts_hp[1][MV_MAX];
- cpi->mb.nmvsadcost_hp[0] = &cpi->nmvsadcosts_hp[0][MV_MAX];
- cpi->mb.nmvsadcost_hp[1] = &cpi->nmvsadcosts_hp[1][MV_MAX];
- cal_nmvsadcosts_hp(cpi->mb.nmvsadcost_hp);
+ cpi->td.mb.nmvcost_hp[0] = &cpi->nmvcosts_hp[0][MV_MAX];
+ cpi->td.mb.nmvcost_hp[1] = &cpi->nmvcosts_hp[1][MV_MAX];
+ cpi->td.mb.nmvsadcost_hp[0] = &cpi->nmvsadcosts_hp[0][MV_MAX];
+ cpi->td.mb.nmvsadcost_hp[1] = &cpi->nmvsadcosts_hp[1][MV_MAX];
+ cal_nmvsadcosts_hp(cpi->td.mb.nmvsadcost_hp);
#if CONFIG_VP9_TEMPORAL_DENOISING
#ifdef OUTPUT_YUV_DENOISED
yuv_denoised_file = fopen("denoised.yuv", "ab");
#endif
#endif
+#ifdef OUTPUT_YUV_SKINMAP
+ yuv_skinmap_file = fopen("skinmap.yuv", "ab");
+#endif
#ifdef OUTPUT_YUV_REC
yuv_rec_file = fopen("rec.yuv", "wb");
#endif
@@ -1589,7 +1782,8 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf) {
}
}
- vp9_set_speed_features(cpi);
+ vp9_set_speed_features_framesize_independent(cpi);
+ vp9_set_speed_features_framesize_dependent(cpi);
// Allocate memory to store variances for a frame.
CHECK_MEM_ERROR(cm, cpi->source_diff_var,
@@ -1597,14 +1791,6 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf) {
cpi->source_var_thresh = 0;
cpi->frames_till_next_var_check = 0;
- // Default rd threshold factors for mode selection
- for (i = 0; i < BLOCK_SIZES; ++i) {
- for (j = 0; j < MAX_MODES; ++j) {
- cpi->rd.thresh_freq_fact[i][j] = 32;
- cpi->rd.mode_map[i][j] = j;
- }
- }
-
#define BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX3F, SDX8F, SDX4DF)\
cpi->fn_ptr[BT].sdf = SDF; \
cpi->fn_ptr[BT].sdaf = SDAF; \
@@ -1615,64 +1801,64 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf) {
cpi->fn_ptr[BT].sdx8f = SDX8F; \
cpi->fn_ptr[BT].sdx4df = SDX4DF;
- BFP(BLOCK_32X16, vp9_sad32x16, vp9_sad32x16_avg,
+ BFP(BLOCK_32X16, vpx_sad32x16, vpx_sad32x16_avg,
vp9_variance32x16, vp9_sub_pixel_variance32x16,
- vp9_sub_pixel_avg_variance32x16, NULL, NULL, vp9_sad32x16x4d)
+ vp9_sub_pixel_avg_variance32x16, NULL, NULL, vpx_sad32x16x4d)
- BFP(BLOCK_16X32, vp9_sad16x32, vp9_sad16x32_avg,
+ BFP(BLOCK_16X32, vpx_sad16x32, vpx_sad16x32_avg,
vp9_variance16x32, vp9_sub_pixel_variance16x32,
- vp9_sub_pixel_avg_variance16x32, NULL, NULL, vp9_sad16x32x4d)
+ vp9_sub_pixel_avg_variance16x32, NULL, NULL, vpx_sad16x32x4d)
- BFP(BLOCK_64X32, vp9_sad64x32, vp9_sad64x32_avg,
+ BFP(BLOCK_64X32, vpx_sad64x32, vpx_sad64x32_avg,
vp9_variance64x32, vp9_sub_pixel_variance64x32,
- vp9_sub_pixel_avg_variance64x32, NULL, NULL, vp9_sad64x32x4d)
+ vp9_sub_pixel_avg_variance64x32, NULL, NULL, vpx_sad64x32x4d)
- BFP(BLOCK_32X64, vp9_sad32x64, vp9_sad32x64_avg,
+ BFP(BLOCK_32X64, vpx_sad32x64, vpx_sad32x64_avg,
vp9_variance32x64, vp9_sub_pixel_variance32x64,
- vp9_sub_pixel_avg_variance32x64, NULL, NULL, vp9_sad32x64x4d)
+ vp9_sub_pixel_avg_variance32x64, NULL, NULL, vpx_sad32x64x4d)
- BFP(BLOCK_32X32, vp9_sad32x32, vp9_sad32x32_avg,
+ BFP(BLOCK_32X32, vpx_sad32x32, vpx_sad32x32_avg,
vp9_variance32x32, vp9_sub_pixel_variance32x32,
- vp9_sub_pixel_avg_variance32x32, vp9_sad32x32x3, vp9_sad32x32x8,
- vp9_sad32x32x4d)
+ vp9_sub_pixel_avg_variance32x32, vpx_sad32x32x3, vpx_sad32x32x8,
+ vpx_sad32x32x4d)
- BFP(BLOCK_64X64, vp9_sad64x64, vp9_sad64x64_avg,
+ BFP(BLOCK_64X64, vpx_sad64x64, vpx_sad64x64_avg,
vp9_variance64x64, vp9_sub_pixel_variance64x64,
- vp9_sub_pixel_avg_variance64x64, vp9_sad64x64x3, vp9_sad64x64x8,
- vp9_sad64x64x4d)
+ vp9_sub_pixel_avg_variance64x64, vpx_sad64x64x3, vpx_sad64x64x8,
+ vpx_sad64x64x4d)
- BFP(BLOCK_16X16, vp9_sad16x16, vp9_sad16x16_avg,
+ BFP(BLOCK_16X16, vpx_sad16x16, vpx_sad16x16_avg,
vp9_variance16x16, vp9_sub_pixel_variance16x16,
- vp9_sub_pixel_avg_variance16x16, vp9_sad16x16x3, vp9_sad16x16x8,
- vp9_sad16x16x4d)
+ vp9_sub_pixel_avg_variance16x16, vpx_sad16x16x3, vpx_sad16x16x8,
+ vpx_sad16x16x4d)
- BFP(BLOCK_16X8, vp9_sad16x8, vp9_sad16x8_avg,
+ BFP(BLOCK_16X8, vpx_sad16x8, vpx_sad16x8_avg,
vp9_variance16x8, vp9_sub_pixel_variance16x8,
vp9_sub_pixel_avg_variance16x8,
- vp9_sad16x8x3, vp9_sad16x8x8, vp9_sad16x8x4d)
+ vpx_sad16x8x3, vpx_sad16x8x8, vpx_sad16x8x4d)
- BFP(BLOCK_8X16, vp9_sad8x16, vp9_sad8x16_avg,
+ BFP(BLOCK_8X16, vpx_sad8x16, vpx_sad8x16_avg,
vp9_variance8x16, vp9_sub_pixel_variance8x16,
vp9_sub_pixel_avg_variance8x16,
- vp9_sad8x16x3, vp9_sad8x16x8, vp9_sad8x16x4d)
+ vpx_sad8x16x3, vpx_sad8x16x8, vpx_sad8x16x4d)
- BFP(BLOCK_8X8, vp9_sad8x8, vp9_sad8x8_avg,
+ BFP(BLOCK_8X8, vpx_sad8x8, vpx_sad8x8_avg,
vp9_variance8x8, vp9_sub_pixel_variance8x8,
vp9_sub_pixel_avg_variance8x8,
- vp9_sad8x8x3, vp9_sad8x8x8, vp9_sad8x8x4d)
+ vpx_sad8x8x3, vpx_sad8x8x8, vpx_sad8x8x4d)
- BFP(BLOCK_8X4, vp9_sad8x4, vp9_sad8x4_avg,
+ BFP(BLOCK_8X4, vpx_sad8x4, vpx_sad8x4_avg,
vp9_variance8x4, vp9_sub_pixel_variance8x4,
- vp9_sub_pixel_avg_variance8x4, NULL, vp9_sad8x4x8, vp9_sad8x4x4d)
+ vp9_sub_pixel_avg_variance8x4, NULL, vpx_sad8x4x8, vpx_sad8x4x4d)
- BFP(BLOCK_4X8, vp9_sad4x8, vp9_sad4x8_avg,
+ BFP(BLOCK_4X8, vpx_sad4x8, vpx_sad4x8_avg,
vp9_variance4x8, vp9_sub_pixel_variance4x8,
- vp9_sub_pixel_avg_variance4x8, NULL, vp9_sad4x8x8, vp9_sad4x8x4d)
+ vp9_sub_pixel_avg_variance4x8, NULL, vpx_sad4x8x8, vpx_sad4x8x4d)
- BFP(BLOCK_4X4, vp9_sad4x4, vp9_sad4x4_avg,
+ BFP(BLOCK_4X4, vpx_sad4x4, vpx_sad4x4_avg,
vp9_variance4x4, vp9_sub_pixel_variance4x4,
vp9_sub_pixel_avg_variance4x4,
- vp9_sad4x4x3, vp9_sad4x4x8, vp9_sad4x4x4d)
+ vpx_sad4x4x3, vpx_sad4x4x8, vpx_sad4x4x4d)
#if CONFIG_VP9_HIGHBITDEPTH
highbd_set_var_fns(cpi);
@@ -1691,20 +1877,27 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf) {
return cpi;
}
+#define SNPRINT(H, T) \
+ snprintf((H) + strlen(H), sizeof(H) - strlen(H), (T))
+
+#define SNPRINT2(H, T, V) \
+ snprintf((H) + strlen(H), sizeof(H) - strlen(H), (T), (V))
void vp9_remove_compressor(VP9_COMP *cpi) {
+ VP9_COMMON *const cm = &cpi->common;
unsigned int i;
+ int t;
if (!cpi)
return;
- if (cpi && (cpi->common.current_video_frame > 0)) {
+ if (cpi && (cm->current_video_frame > 0)) {
#if CONFIG_INTERNAL_STATS
-
vp9_clear_system_state();
- // printf("\n8x8-4x4:%d-%d\n", cpi->t8x8_count, cpi->t4x4_count);
if (cpi->oxcf.pass != 1) {
+ char headings[512] = {0};
+ char results[512] = {0};
FILE *f = fopen("opsnr.stt", "a");
double time_encoded = (cpi->last_end_time_stamp_seen
- cpi->first_time_stamp_ever) / 10000000.000;
@@ -1722,25 +1915,50 @@ void vp9_remove_compressor(VP9_COMP *cpi) {
vpx_sse_to_psnr((double)cpi->totalp_samples, peak,
(double)cpi->totalp_sq_error);
const double total_ssim = 100 * pow(cpi->summed_quality /
- cpi->summed_weights, 8.0);
+ cpi->summed_weights, 8.0);
const double totalp_ssim = 100 * pow(cpi->summedp_quality /
- cpi->summedp_weights, 8.0);
-
- fprintf(f, "Bitrate\tAVGPsnr\tGLBPsnr\tAVPsnrP\tGLPsnrP\t"
- "VPXSSIM\tVPSSIMP\t Time(ms)\n");
- fprintf(f, "%7.2f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t%8.0f\n",
- dr, cpi->total / cpi->count, total_psnr,
- cpi->totalp / cpi->count, totalp_psnr, total_ssim, totalp_ssim,
- total_encode_time);
- }
+ cpi->summedp_weights, 8.0);
+
+ snprintf(headings, sizeof(headings),
+ "Bitrate\tAVGPsnr\tGLBPsnr\tAVPsnrP\tGLPsnrP\t"
+ "VPXSSIM\tVPSSIMP\tFASTSIM\tPSNRHVS\t"
+ "WstPsnr\tWstSsim\tWstFast\tWstHVS");
+ snprintf(results, sizeof(results),
+ "%7.2f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t"
+ "%7.3f\t%7.3f\t%7.3f\t%7.3f"
+ "%7.3f\t%7.3f\t%7.3f\t%7.3f",
+ dr, cpi->psnr.stat[ALL] / cpi->count, total_psnr,
+ cpi->psnrp.stat[ALL] / cpi->count, totalp_psnr,
+ total_ssim, totalp_ssim,
+ cpi->fastssim.stat[ALL] / cpi->count,
+ cpi->psnrhvs.stat[ALL] / cpi->count,
+ cpi->psnr.worst, cpi->worst_ssim, cpi->fastssim.worst,
+ cpi->psnrhvs.worst);
+
+ if (cpi->b_calculate_blockiness) {
+ SNPRINT(headings, "\t Block\tWstBlck");
+ SNPRINT2(results, "\t%7.3f", cpi->total_blockiness / cpi->count);
+ SNPRINT2(results, "\t%7.3f", cpi->worst_blockiness);
+ }
- if (cpi->b_calculate_ssimg) {
- fprintf(f, "BitRate\tSSIM_Y\tSSIM_U\tSSIM_V\tSSIM_A\t Time(ms)\n");
- fprintf(f, "%7.2f\t%6.4f\t%6.4f\t%6.4f\t%6.4f\t%8.0f\n", dr,
- cpi->total_ssimg_y / cpi->count,
- cpi->total_ssimg_u / cpi->count,
- cpi->total_ssimg_v / cpi->count,
- cpi->total_ssimg_all / cpi->count, total_encode_time);
+ if (cpi->b_calculate_consistency) {
+ double consistency =
+ vpx_sse_to_psnr((double)cpi->totalp_samples, peak,
+ (double)cpi->total_inconsistency);
+
+ SNPRINT(headings, "\tConsist\tWstCons");
+ SNPRINT2(results, "\t%7.3f", consistency);
+ SNPRINT2(results, "\t%7.3f", cpi->worst_consistency);
+ }
+
+ if (cpi->b_calculate_ssimg) {
+ SNPRINT(headings, "\t SSIMG\tWtSSIMG");
+ SNPRINT2(results, "\t%7.3f", cpi->ssimg.stat[ALL] / cpi->count);
+ SNPRINT2(results, "\t%7.3f", cpi->ssimg.worst);
+ }
+
+ fprintf(f, "%s\t Time\n", headings);
+ fprintf(f, "%s\t%8.0f\n", results, total_encode_time);
}
fclose(f);
@@ -1761,13 +1979,30 @@ void vp9_remove_compressor(VP9_COMP *cpi) {
}
#if CONFIG_VP9_TEMPORAL_DENOISING
- if (cpi->oxcf.noise_sensitivity > 0) {
- vp9_denoiser_free(&(cpi->denoiser));
- }
+ vp9_denoiser_free(&(cpi->denoiser));
#endif
+ for (t = 0; t < cpi->num_workers; ++t) {
+ VP9Worker *const worker = &cpi->workers[t];
+ EncWorkerData *const thread_data = &cpi->tile_thr_data[t];
+
+    // End the worker thread and wait for it to finish.
+ vp9_get_worker_interface()->end(worker);
+
+    // Free per-thread data; the last worker shares the main thread's.
+ if (t < cpi->num_workers - 1) {
+ vpx_free(thread_data->td->counts);
+ vp9_free_pc_tree(thread_data->td);
+ vpx_free(thread_data->td);
+ }
+ }
+ vpx_free(cpi->tile_thr_data);
+ vpx_free(cpi->workers);
+
+ if (cpi->num_workers > 1)
+ vp9_loop_filter_dealloc(&cpi->lf_row_sync);
+
dealloc_compressor_data(cpi);
- vpx_free(cpi->tok);
for (i = 0; i < sizeof(cpi->mbgraph_stats) /
sizeof(cpi->mbgraph_stats[0]); ++i) {
@@ -1781,7 +2016,11 @@ void vp9_remove_compressor(VP9_COMP *cpi) {
}
#endif
- vp9_remove_common(&cpi->common);
+ vp9_remove_common(cm);
+ vp9_free_ref_frame_buffers(cm->buffer_pool);
+#if CONFIG_VP9_POSTPROC
+ vp9_free_postproc_buffers(cm);
+#endif
vpx_free(cpi);
#if CONFIG_VP9_TEMPORAL_DENOISING
@@ -1789,6 +2028,9 @@ void vp9_remove_compressor(VP9_COMP *cpi) {
fclose(yuv_denoised_file);
#endif
#endif
+#ifdef OUTPUT_YUV_SKINMAP
+ fclose(yuv_skinmap_file);
+#endif
#ifdef OUTPUT_YUV_REC
fclose(yuv_rec_file);
#endif
@@ -1914,11 +2156,13 @@ typedef struct {
static void calc_psnr(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b,
PSNR_STATS *psnr) {
static const double peak = 255.0;
- const int widths[3] = {a->y_width, a->uv_width, a->uv_width };
- const int heights[3] = {a->y_height, a->uv_height, a->uv_height};
- const uint8_t *a_planes[3] = {a->y_buffer, a->u_buffer, a->v_buffer };
+ const int widths[3] = {
+ a->y_crop_width, a->uv_crop_width, a->uv_crop_width};
+ const int heights[3] = {
+ a->y_crop_height, a->uv_crop_height, a->uv_crop_height};
+ const uint8_t *a_planes[3] = {a->y_buffer, a->u_buffer, a->v_buffer};
const int a_strides[3] = {a->y_stride, a->uv_stride, a->uv_stride};
- const uint8_t *b_planes[3] = {b->y_buffer, b->u_buffer, b->v_buffer };
+ const uint8_t *b_planes[3] = {b->y_buffer, b->u_buffer, b->v_buffer};
const int b_strides[3] = {b->y_stride, b->uv_stride, b->uv_stride};
int i;
uint64_t total_sse = 0;
@@ -1951,8 +2195,10 @@ static void calc_highbd_psnr(const YV12_BUFFER_CONFIG *a,
PSNR_STATS *psnr,
unsigned int bit_depth,
unsigned int in_bit_depth) {
- const int widths[3] = {a->y_width, a->uv_width, a->uv_width };
- const int heights[3] = {a->y_height, a->uv_height, a->uv_height};
+ const int widths[3] =
+      {a->y_crop_width, a->uv_crop_width, a->uv_crop_width};
+ const int heights[3] =
+ {a->y_crop_height, a->uv_crop_height, a->uv_crop_height};
const uint8_t *a_planes[3] = {a->y_buffer, a->u_buffer, a->v_buffer };
const int a_strides[3] = {a->y_stride, a->uv_stride, a->uv_stride};
const uint8_t *b_planes[3] = {b->y_buffer, b->u_buffer, b->v_buffer };
@@ -2003,7 +2249,7 @@ static void generate_psnr_packet(VP9_COMP *cpi) {
PSNR_STATS psnr;
#if CONFIG_VP9_HIGHBITDEPTH
calc_highbd_psnr(cpi->Source, cpi->common.frame_to_show, &psnr,
- cpi->mb.e_mbd.bd, cpi->oxcf.input_bit_depth);
+ cpi->td.mb.e_mbd.bd, cpi->oxcf.input_bit_depth);
#else
calc_psnr(cpi->Source, cpi->common.frame_to_show, &psnr);
#endif
@@ -2076,8 +2322,7 @@ int vp9_update_entropy(VP9_COMP * cpi, int update) {
return 0;
}
-#if CONFIG_VP9_TEMPORAL_DENOISING
-#if defined(OUTPUT_YUV_DENOISED)
+#if defined(OUTPUT_YUV_DENOISED) || defined(OUTPUT_YUV_SKINMAP)
// The denoiser buffer is allocated as a YUV 440 buffer. This function writes it
// as YUV 420. We simply use the top-left pixels of the UV buffers, since we do
// not denoise the UV channels at this time. If ever we implement UV channel
@@ -2092,23 +2337,22 @@ void vp9_write_yuv_frame_420(YV12_BUFFER_CONFIG *s, FILE *f) {
} while (--h);
src = s->u_buffer;
- h = s->uv_height / 2;
+ h = s->uv_height;
do {
- fwrite(src, s->uv_width / 2, 1, f);
- src += s->uv_stride + s->uv_width / 2;
+ fwrite(src, s->uv_width, 1, f);
+ src += s->uv_stride;
} while (--h);
src = s->v_buffer;
- h = s->uv_height / 2;
+ h = s->uv_height;
do {
- fwrite(src, s->uv_width / 2, 1, f);
- src += s->uv_stride + s->uv_width / 2;
+ fwrite(src, s->uv_width, 1, f);
+ src += s->uv_stride;
} while (--h);
}
#endif
-#endif
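
vp9_write_yuv_frame_420() above dumps each plane one row at a time because rows sit stride bytes apart in memory while only the leftmost width bytes are payload. A generic sketch of that inner loop:

    #include <stdio.h>

    /* Dump one plane of width w and height h whose rows are `stride`
     * (>= w) bytes apart; only the first w bytes of each row are image
     * data, the rest is padding. */
    static void write_plane(const unsigned char *src, int w, int h,
                            int stride, FILE *f) {
      int y;
      for (y = 0; y < h; ++y) {
        fwrite(src, 1, (size_t)w, f);
        src += stride;
      }
    }
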
#ifdef OUTPUT_YUV_REC
void vp9_write_yuv_rec_frame(VP9_COMMON *cm) {
@@ -2267,28 +2511,43 @@ static void scale_and_extend_frame(const YV12_BUFFER_CONFIG *src,
vp9_extend_frame_borders(dst);
}
+static int scale_down(VP9_COMP *cpi, int q) {
+ RATE_CONTROL *const rc = &cpi->rc;
+ GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+ int scale = 0;
+ assert(frame_is_kf_gf_arf(cpi));
+
+ if (rc->frame_size_selector == UNSCALED &&
+ q >= rc->rf_level_maxq[gf_group->rf_level[gf_group->index]]) {
+ const int max_size_thresh = (int)(rate_thresh_mult[SCALE_STEP1]
+ * MAX(rc->this_frame_target, rc->avg_frame_bandwidth));
+ scale = rc->projected_frame_size > max_size_thresh ? 1 : 0;
+ }
+ return scale;
+}
+
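
scale_down() arms dynamic downscaling only for key/golden/alt-ref frames at the unscaled size, when q has already hit the rate-level maximum and the projected size still exceeds a multiple of the larger of the frame target and the average bandwidth. Worked numbers for that last test (the multiplier is assumed; the real rate_thresh_mult table may differ):

    #include <stdio.h>

    #define MAX(a, b) ((a) > (b) ? (a) : (b))

    int main(void) {
      const double rate_thresh_mult_step1 = 1.5;  /* assumed multiplier */
      const int this_frame_target = 80000;        /* bits, illustrative */
      const int avg_frame_bandwidth = 60000;      /* bits, illustrative */
      const int max_size_thresh = (int)(rate_thresh_mult_step1 *
          MAX(this_frame_target, avg_frame_bandwidth));
      const int projected_frame_size = 150000;
      printf("threshold=%d scale=%d\n", max_size_thresh,
             projected_frame_size > max_size_thresh ? 1 : 0); /* 120000 1 */
      return 0;
    }
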
// Function to test for conditions that indicate we should loop
// back and recode a frame.
-static int recode_loop_test(const VP9_COMP *cpi,
+static int recode_loop_test(VP9_COMP *cpi,
int high_limit, int low_limit,
int q, int maxq, int minq) {
- const VP9_COMMON *const cm = &cpi->common;
const RATE_CONTROL *const rc = &cpi->rc;
const VP9EncoderConfig *const oxcf = &cpi->oxcf;
+ const int frame_is_kfgfarf = frame_is_kf_gf_arf(cpi);
int force_recode = 0;
- // Special case trap if maximum allowed frame size exceeded.
- if (rc->projected_frame_size > rc->max_frame_bandwidth) {
- force_recode = 1;
-
- // Is frame recode allowed.
- // Yes if either recode mode 1 is selected or mode 2 is selected
- // and the frame is a key frame, golden frame or alt_ref_frame
- } else if ((cpi->sf.recode_loop == ALLOW_RECODE) ||
- ((cpi->sf.recode_loop == ALLOW_RECODE_KFARFGF) &&
- (cm->frame_type == KEY_FRAME ||
- cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame))) {
- // General over and under shoot tests
+ if ((cpi->sf.recode_loop == ALLOW_RECODE) ||
+ (frame_is_kfgfarf &&
+ (cpi->sf.recode_loop == ALLOW_RECODE_KFARFGF))) {
+ if (frame_is_kfgfarf &&
+ (oxcf->resize_mode == RESIZE_DYNAMIC) &&
+ scale_down(cpi, q)) {
+ // Code this group at a lower resolution.
+ cpi->resize_pending = 1;
+ return 1;
+ }
+
+ // TODO(agrange) high_limit could be greater than the scale-down threshold.
if ((rc->projected_frame_size > high_limit && q < maxq) ||
(rc->projected_frame_size < low_limit && q > minq)) {
force_recode = 1;
@@ -2306,13 +2565,14 @@ static int recode_loop_test(const VP9_COMP *cpi,
void vp9_update_reference_frames(VP9_COMP *cpi) {
VP9_COMMON * const cm = &cpi->common;
+ BufferPool *const pool = cm->buffer_pool;
// At this point the new frame has been encoded.
// If any buffer copy / swapping is signaled it should be done here.
if (cm->frame_type == KEY_FRAME) {
- ref_cnt_fb(cm->frame_bufs,
+ ref_cnt_fb(pool->frame_bufs,
&cm->ref_frame_map[cpi->gld_fb_idx], cm->new_fb_idx);
- ref_cnt_fb(cm->frame_bufs,
+ ref_cnt_fb(pool->frame_bufs,
&cm->ref_frame_map[cpi->alt_fb_idx], cm->new_fb_idx);
} else if (vp9_preserve_existing_gf(cpi)) {
// We have decided to preserve the previously existing golden frame as our
@@ -2325,7 +2585,7 @@ void vp9_update_reference_frames(VP9_COMP *cpi) {
// slot and, if we're updating the GF, the current frame becomes the new GF.
int tmp;
- ref_cnt_fb(cm->frame_bufs,
+ ref_cnt_fb(pool->frame_bufs,
&cm->ref_frame_map[cpi->alt_fb_idx], cm->new_fb_idx);
tmp = cpi->alt_fb_idx;
@@ -2344,34 +2604,34 @@ void vp9_update_reference_frames(VP9_COMP *cpi) {
arf_idx = gf_group->arf_update_idx[gf_group->index];
}
- ref_cnt_fb(cm->frame_bufs,
+ ref_cnt_fb(pool->frame_bufs,
&cm->ref_frame_map[arf_idx], cm->new_fb_idx);
- vpx_memcpy(cpi->interp_filter_selected[ALTREF_FRAME],
- cpi->interp_filter_selected[0],
- sizeof(cpi->interp_filter_selected[0]));
+ memcpy(cpi->interp_filter_selected[ALTREF_FRAME],
+ cpi->interp_filter_selected[0],
+ sizeof(cpi->interp_filter_selected[0]));
}
if (cpi->refresh_golden_frame) {
- ref_cnt_fb(cm->frame_bufs,
+ ref_cnt_fb(pool->frame_bufs,
&cm->ref_frame_map[cpi->gld_fb_idx], cm->new_fb_idx);
if (!cpi->rc.is_src_frame_alt_ref)
- vpx_memcpy(cpi->interp_filter_selected[GOLDEN_FRAME],
- cpi->interp_filter_selected[0],
- sizeof(cpi->interp_filter_selected[0]));
+ memcpy(cpi->interp_filter_selected[GOLDEN_FRAME],
+ cpi->interp_filter_selected[0],
+ sizeof(cpi->interp_filter_selected[0]));
else
- vpx_memcpy(cpi->interp_filter_selected[GOLDEN_FRAME],
- cpi->interp_filter_selected[ALTREF_FRAME],
- sizeof(cpi->interp_filter_selected[ALTREF_FRAME]));
+ memcpy(cpi->interp_filter_selected[GOLDEN_FRAME],
+ cpi->interp_filter_selected[ALTREF_FRAME],
+ sizeof(cpi->interp_filter_selected[ALTREF_FRAME]));
}
}
if (cpi->refresh_last_frame) {
- ref_cnt_fb(cm->frame_bufs,
+ ref_cnt_fb(pool->frame_bufs,
&cm->ref_frame_map[cpi->lst_fb_idx], cm->new_fb_idx);
if (!cpi->rc.is_src_frame_alt_ref)
- vpx_memcpy(cpi->interp_filter_selected[LAST_FRAME],
- cpi->interp_filter_selected[0],
- sizeof(cpi->interp_filter_selected[0]));
+ memcpy(cpi->interp_filter_selected[LAST_FRAME],
+ cpi->interp_filter_selected[0],
+ sizeof(cpi->interp_filter_selected[0]));
}
#if CONFIG_VP9_TEMPORAL_DENOISING
if (cpi->oxcf.noise_sensitivity > 0) {
@@ -2386,7 +2646,7 @@ void vp9_update_reference_frames(VP9_COMP *cpi) {
}
static void loopfilter_frame(VP9_COMP *cpi, VP9_COMMON *cm) {
- MACROBLOCKD *xd = &cpi->mb.e_mbd;
+ MACROBLOCKD *xd = &cpi->td.mb.e_mbd;
struct loopfilter *lf = &cm->lf;
if (xd->lossless) {
lf->filter_level = 0;
@@ -2404,42 +2664,83 @@ static void loopfilter_frame(VP9_COMP *cpi, VP9_COMMON *cm) {
}
if (lf->filter_level > 0) {
- vp9_loop_filter_frame(cm->frame_to_show, cm, xd, lf->filter_level, 0, 0);
+ if (cpi->num_workers > 1)
+ vp9_loop_filter_frame_mt(cm->frame_to_show, cm, xd->plane,
+ lf->filter_level, 0, 0,
+ cpi->workers, cpi->num_workers,
+ &cpi->lf_row_sync);
+ else
+ vp9_loop_filter_frame(cm->frame_to_show, cm, xd, lf->filter_level, 0, 0);
}
vp9_extend_frame_inner_borders(cm->frame_to_show);
}
+static INLINE void alloc_frame_mvs(const VP9_COMMON *cm,
+ int buffer_idx) {
+ RefCntBuffer *const new_fb_ptr = &cm->buffer_pool->frame_bufs[buffer_idx];
+ if (new_fb_ptr->mvs == NULL ||
+ new_fb_ptr->mi_rows < cm->mi_rows ||
+ new_fb_ptr->mi_cols < cm->mi_cols) {
+ vpx_free(new_fb_ptr->mvs);
+ new_fb_ptr->mvs =
+ (MV_REF *)vpx_calloc(cm->mi_rows * cm->mi_cols,
+ sizeof(*new_fb_ptr->mvs));
+ new_fb_ptr->mi_rows = cm->mi_rows;
+ new_fb_ptr->mi_cols = cm->mi_cols;
+ }
+}
+
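
alloc_frame_mvs() follows a grow-only pattern: the per-frame MV array is re-allocated only when the mi-unit grid is larger than what the buffer already covers, so steady-state encoding at a fixed size does no allocation. A self-contained sketch of the same idea (names are illustrative):

    #include <stdlib.h>

    typedef struct { void *mvs; int mi_rows, mi_cols; } MvBufSketch;

    /* Re-allocate only when the requested grid outgrows the buffer;
     * returns 0 on success, -1 on allocation failure. */
    static int ensure_mv_storage(MvBufSketch *buf, int mi_rows, int mi_cols,
                                 size_t elem_size) {
      if (buf->mvs == NULL || buf->mi_rows < mi_rows ||
          buf->mi_cols < mi_cols) {
        free(buf->mvs);
        buf->mvs = calloc((size_t)mi_rows * mi_cols, elem_size);
        if (buf->mvs == NULL) return -1;
        buf->mi_rows = mi_rows;
        buf->mi_cols = mi_cols;
      }
      return 0;
    }
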
void vp9_scale_references(VP9_COMP *cpi) {
VP9_COMMON *cm = &cpi->common;
MV_REFERENCE_FRAME ref_frame;
const VP9_REFFRAME ref_mask[3] = {VP9_LAST_FLAG, VP9_GOLD_FLAG, VP9_ALT_FLAG};
for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
- const int idx = cm->ref_frame_map[get_ref_frame_idx(cpi, ref_frame)];
- const YV12_BUFFER_CONFIG *const ref = &cm->frame_bufs[idx].buf;
-
// Need to convert from VP9_REFFRAME to index into ref_mask (subtract 1).
- if ((cpi->ref_frame_flags & ref_mask[ref_frame - 1]) &&
- (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height)) {
- const int new_fb = get_free_fb(cm);
- vp9_realloc_frame_buffer(&cm->frame_bufs[new_fb].buf,
- cm->width, cm->height,
- cm->subsampling_x, cm->subsampling_y,
-#if CONFIG_VP9_HIGHBITDEPTH
- cm->use_highbitdepth,
-#endif // CONFIG_VP9_HIGHBITDEPTH
- VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL);
+ if (cpi->ref_frame_flags & ref_mask[ref_frame - 1]) {
+ BufferPool *const pool = cm->buffer_pool;
+ const YV12_BUFFER_CONFIG *const ref = get_ref_frame_buffer(cpi,
+ ref_frame);
+
+ if (ref == NULL) {
+ cpi->scaled_ref_idx[ref_frame - 1] = INVALID_IDX;
+ continue;
+ }
+
#if CONFIG_VP9_HIGHBITDEPTH
- scale_and_extend_frame(ref, &cm->frame_bufs[new_fb].buf,
- (int)cm->bit_depth);
+ if (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height) {
+ const int new_fb = get_free_fb(cm);
+ RefCntBuffer *const new_fb_ptr = &pool->frame_bufs[new_fb];
+ cm->cur_frame = &pool->frame_bufs[new_fb];
+ vp9_realloc_frame_buffer(&pool->frame_bufs[new_fb].buf,
+ cm->width, cm->height,
+ cm->subsampling_x, cm->subsampling_y,
+ cm->use_highbitdepth,
+ VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment,
+ NULL, NULL, NULL);
+ scale_and_extend_frame(ref, &new_fb_ptr->buf, (int)cm->bit_depth);
#else
- scale_and_extend_frame(ref, &cm->frame_bufs[new_fb].buf);
+ if (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height) {
+ const int new_fb = get_free_fb(cm);
+ RefCntBuffer *const new_fb_ptr = &pool->frame_bufs[new_fb];
+ vp9_realloc_frame_buffer(&new_fb_ptr->buf,
+ cm->width, cm->height,
+ cm->subsampling_x, cm->subsampling_y,
+ VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment,
+ NULL, NULL, NULL);
+ scale_and_extend_frame(ref, &new_fb_ptr->buf);
#endif // CONFIG_VP9_HIGHBITDEPTH
- cpi->scaled_ref_idx[ref_frame - 1] = new_fb;
+ cpi->scaled_ref_idx[ref_frame - 1] = new_fb;
+
+ alloc_frame_mvs(cm, new_fb);
+ } else {
+ const int buf_idx = get_ref_frame_buf_idx(cpi, ref_frame);
+ cpi->scaled_ref_idx[ref_frame - 1] = buf_idx;
+ ++pool->frame_bufs[buf_idx].ref_count;
+ }
} else {
- cpi->scaled_ref_idx[ref_frame - 1] = idx;
- cm->frame_bufs[idx].ref_count++;
+ cpi->scaled_ref_idx[ref_frame - 1] = INVALID_IDX;
}
}
}
@@ -2447,9 +2748,15 @@ void vp9_scale_references(VP9_COMP *cpi) {
static void release_scaled_references(VP9_COMP *cpi) {
VP9_COMMON *cm = &cpi->common;
int i;
-
- for (i = 0; i < 3; i++)
- cm->frame_bufs[cpi->scaled_ref_idx[i]].ref_count--;
+ for (i = 0; i < MAX_REF_FRAMES; ++i) {
+ const int idx = cpi->scaled_ref_idx[i];
+ RefCntBuffer *const buf = idx != INVALID_IDX ?
+ &cm->buffer_pool->frame_bufs[idx] : NULL;
+ if (buf != NULL) {
+ --buf->ref_count;
+ cpi->scaled_ref_idx[i] = INVALID_IDX;
+ }
+ }
}
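
release_scaled_references() walks the per-reference slots, drops one reference count for each valid entry, and resets the slot to the INVALID_IDX sentinel so a second release of the same slot is harmless. The same pattern in isolation (types are illustrative):

    #define INVALID_IDX (-1)

    typedef struct { int ref_count; } RefCntBufSketch;

    /* Decrement the count for every valid slot, then poison the slot so
     * releasing twice has no effect. */
    static void release_refs(int *idx_slots, int n, RefCntBufSketch *bufs) {
      int i;
      for (i = 0; i < n; ++i) {
        if (idx_slots[i] != INVALID_IDX) {
          --bufs[idx_slots[i]].ref_count;
          idx_slots[i] = INVALID_IDX;
        }
      }
    }
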
static void full_to_model_count(unsigned int *model_count,
@@ -2478,20 +2785,22 @@ static void full_to_model_counts(vp9_coeff_count_model *model_count,
static void output_frame_level_debug_stats(VP9_COMP *cpi) {
VP9_COMMON *const cm = &cpi->common;
FILE *const f = fopen("tmp.stt", cm->current_video_frame ? "a" : "w");
- int recon_err;
+ int64_t recon_err;
vp9_clear_system_state();
recon_err = vp9_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
if (cpi->twopass.total_left_stats.coded_error != 0.0)
- fprintf(f, "%10u %10d %10d %10d %10d"
+ fprintf(f, "%10u %dx%d %10d %10d %10d %10d"
"%10"PRId64" %10"PRId64" %10"PRId64" %10"PRId64" %10d "
"%7.2lf %7.2lf %7.2lf %7.2lf %7.2lf"
"%6d %6d %5d %5d %5d "
"%10"PRId64" %10.3lf"
- "%10lf %8u %10d %10d %10d\n",
- cpi->common.current_video_frame, cpi->rc.this_frame_target,
+ "%10lf %8u %10"PRId64" %10d %10d\n",
+ cpi->common.current_video_frame,
+ cm->width, cm->height,
+ cpi->rc.this_frame_target,
cpi->rc.projected_frame_size,
cpi->rc.projected_frame_size / cpi->common.MBs,
(cpi->rc.projected_frame_size - cpi->rc.this_frame_target),
@@ -2534,12 +2843,194 @@ static void output_frame_level_debug_stats(VP9_COMP *cpi) {
}
#endif
-static void encode_without_recode_loop(VP9_COMP *cpi,
- int q) {
+static void set_mv_search_params(VP9_COMP *cpi) {
+ const VP9_COMMON *const cm = &cpi->common;
+ const unsigned int max_mv_def = MIN(cm->width, cm->height);
+
+ // Default based on max resolution.
+ cpi->mv_step_param = vp9_init_search_range(max_mv_def);
+
+ if (cpi->sf.mv.auto_mv_step_size) {
+ if (frame_is_intra_only(cm)) {
+ // Initialize max_mv_magnitude for use in the first INTER frame
+ // after a key/intra-only frame.
+ cpi->max_mv_magnitude = max_mv_def;
+ } else {
+ if (cm->show_frame) {
+ // Allow mv_steps to correspond to twice the max mv magnitude found
+ // in the previous frame, capped by the default max_mv_magnitude based
+ // on resolution.
+ cpi->mv_step_param =
+ vp9_init_search_range(MIN(max_mv_def, 2 * cpi->max_mv_magnitude));
+ }
+ cpi->max_mv_magnitude = 0;
+ }
+ }
+}
+
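
With auto_mv_step_size enabled, the search range defaults to min(width, height) and, after a shown inter frame, is narrowed to twice the largest motion vector magnitude seen in that frame. Illustrative numbers (vp9_init_search_range() itself converts such a range into a step count and is not reproduced here):

    #include <stdio.h>

    #define MIN(a, b) ((a) < (b) ? (a) : (b))

    int main(void) {
      const unsigned int width = 1280, height = 720;
      const unsigned int max_mv_def = MIN(width, height);      /* 720 */
      const unsigned int max_mv_magnitude = 24;  /* assumed prior frame */
      printf("default=%u adaptive=%u\n",
             max_mv_def, MIN(max_mv_def, 2 * max_mv_magnitude)); /* 720 48 */
      return 0;
    }
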
+static void set_size_independent_vars(VP9_COMP *cpi) {
+ vp9_set_speed_features_framesize_independent(cpi);
+ vp9_set_rd_speed_thresholds(cpi);
+ vp9_set_rd_speed_thresholds_sub8x8(cpi);
+ cpi->common.interp_filter = cpi->sf.default_interp_filter;
+}
+
+static void set_size_dependent_vars(VP9_COMP *cpi, int *q,
+ int *bottom_index, int *top_index) {
+ VP9_COMMON *const cm = &cpi->common;
+ const VP9EncoderConfig *const oxcf = &cpi->oxcf;
+
+ // Setup variables that depend on the dimensions of the frame.
+ vp9_set_speed_features_framesize_dependent(cpi);
+
+ // Decide q and q bounds.
+ *q = vp9_rc_pick_q_and_bounds(cpi, bottom_index, top_index);
+
+ if (!frame_is_intra_only(cm)) {
+ vp9_set_high_precision_mv(cpi, (*q) < HIGH_PRECISION_MV_QTHRESH);
+ }
+
+ // Configure experimental use of segmentation for enhanced coding of
+ // static regions if indicated.
+ // Only allowed in the second pass of a two pass encode, as it requires
+ // lagged coding, and if the relevant speed feature flag is set.
+ if (oxcf->pass == 2 && cpi->sf.static_segmentation)
+ configure_static_seg_features(cpi);
+
+#if CONFIG_VP9_POSTPROC
+ if (oxcf->noise_sensitivity > 0) {
+ int l = 0;
+ switch (oxcf->noise_sensitivity) {
+ case 1:
+ l = 20;
+ break;
+ case 2:
+ l = 40;
+ break;
+ case 3:
+ l = 60;
+ break;
+ case 4:
+ case 5:
+ l = 100;
+ break;
+ case 6:
+ l = 150;
+ break;
+ }
+ vp9_denoise(cpi->Source, cpi->Source, l);
+ }
+#endif // CONFIG_VP9_POSTPROC
+}
+
+static void init_motion_estimation(VP9_COMP *cpi) {
+ int y_stride = cpi->scaled_source.y_stride;
+
+ if (cpi->sf.mv.search_method == NSTEP) {
+ vp9_init3smotion_compensation(&cpi->ss_cfg, y_stride);
+ } else if (cpi->sf.mv.search_method == DIAMOND) {
+ vp9_init_dsmotion_compensation(&cpi->ss_cfg, y_stride);
+ }
+}
+
+void set_frame_size(VP9_COMP *cpi) {
+ int ref_frame;
VP9_COMMON *const cm = &cpi->common;
+ VP9EncoderConfig *const oxcf = &cpi->oxcf;
+ MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+
+ if (oxcf->pass == 2 &&
+ oxcf->rc_mode == VPX_VBR &&
+ ((oxcf->resize_mode == RESIZE_FIXED && cm->current_video_frame == 0) ||
+ (oxcf->resize_mode == RESIZE_DYNAMIC && cpi->resize_pending))) {
+ calculate_coded_size(
+ cpi, &oxcf->scaled_frame_width, &oxcf->scaled_frame_height);
+
+ // There has been a change in frame size.
+ vp9_set_size_literal(cpi, oxcf->scaled_frame_width,
+ oxcf->scaled_frame_height);
+ }
+
+ if ((oxcf->pass == 2) &&
+ (!cpi->use_svc ||
+ (is_two_pass_svc(cpi) &&
+ cpi->svc.encode_empty_frame_state != ENCODING))) {
+ vp9_set_target_rate(cpi);
+ }
+
+ alloc_frame_mvs(cm, cm->new_fb_idx);
+
+ // Reset the frame pointers to the current frame size.
+ vp9_realloc_frame_buffer(get_frame_new_buffer(cm),
+ cm->width, cm->height,
+ cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+ cm->use_highbitdepth,
+#endif
+ VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment,
+ NULL, NULL, NULL);
+
+ alloc_util_frame_buffers(cpi);
+ init_motion_estimation(cpi);
+
+ for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+ RefBuffer *const ref_buf = &cm->frame_refs[ref_frame - 1];
+ const int buf_idx = get_ref_frame_buf_idx(cpi, ref_frame);
+
+ ref_buf->idx = buf_idx;
+
+ if (buf_idx != INVALID_IDX) {
+ YV12_BUFFER_CONFIG *const buf = &cm->buffer_pool->frame_bufs[buf_idx].buf;
+ ref_buf->buf = buf;
+#if CONFIG_VP9_HIGHBITDEPTH
+ vp9_setup_scale_factors_for_frame(&ref_buf->sf,
+ buf->y_crop_width, buf->y_crop_height,
+ cm->width, cm->height,
+ (buf->flags & YV12_FLAG_HIGHBITDEPTH) ?
+ 1 : 0);
+#else
+ vp9_setup_scale_factors_for_frame(&ref_buf->sf,
+ buf->y_crop_width, buf->y_crop_height,
+ cm->width, cm->height);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ if (vp9_is_scaled(&ref_buf->sf))
+ vp9_extend_frame_borders(buf);
+ } else {
+ ref_buf->buf = NULL;
+ }
+ }
+
+ set_ref_ptrs(cm, xd, LAST_FRAME, LAST_FRAME);
+}
+
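
set_frame_size() installs per-reference scale factors so prediction can sample references whose dimensions differ from the coded frame; VP9 keeps the ratio in fixed point with 14 fractional bits. A sketch of that computation (the shift value is assumed to match REF_SCALE_SHIFT):

    #include <stdio.h>

    #define REF_SCALE_SHIFT 14  /* assumed fractional precision */

    static int fixed_point_scale(int other_size, int this_size) {
      return (other_size << REF_SCALE_SHIFT) / this_size;
    }

    int main(void) {
      /* A 1920-wide reference predicted onto a 1280-wide frame: 1.5x. */
      const int fp = fixed_point_scale(1920, 1280);
      printf("x_scale_fp=%d (%.3f)\n",
             fp, fp / (double)(1 << REF_SCALE_SHIFT)); /* 24576 (1.500) */
      return 0;
    }
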
+static void encode_without_recode_loop(VP9_COMP *cpi) {
+ VP9_COMMON *const cm = &cpi->common;
+ int q = 0, bottom_index = 0, top_index = 0; // Dummy variables.
+
vp9_clear_system_state();
+
+ set_frame_size(cpi);
+
+ cpi->Source = vp9_scale_if_required(cm, cpi->un_scaled_source,
+ &cpi->scaled_source);
+
+ if (cpi->unscaled_last_source != NULL)
+ cpi->Last_Source = vp9_scale_if_required(cm, cpi->unscaled_last_source,
+ &cpi->scaled_last_source);
+
+ if (frame_is_intra_only(cm) == 0) {
+ vp9_scale_references(cpi);
+ }
+
+ set_size_independent_vars(cpi);
+ set_size_dependent_vars(cpi, &q, &bottom_index, &top_index);
+
vp9_set_quantizer(cm, q);
+ vp9_set_vbp_thresholds(cpi, q);
+
setup_frame(cpi);
+
+ vp9_suppress_active_map(cpi);
// Variance adaptive and in frame q adjustment experiments are mutually
// exclusive.
if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
@@ -2549,9 +3040,19 @@ static void encode_without_recode_loop(VP9_COMP *cpi,
} else if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) {
vp9_cyclic_refresh_setup(cpi);
}
+ vp9_apply_active_map(cpi);
+
// transform / motion compensation build reconstruction frame
vp9_encode_frame(cpi);
+  // Update some stats from cyclic refresh, and check whether the golden
+  // reference update should be skipped, for non-SVC 1-pass CBR.
+ if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ &&
+ cm->frame_type != KEY_FRAME &&
+ !cpi->use_svc &&
+ (cpi->oxcf.pass == 0 && cpi->oxcf.rc_mode == VPX_CBR))
+ vp9_cyclic_refresh_check_golden_update(cpi);
+
// Update the skip mb flag probabilities based on the distribution
// seen in the last encoder iteration.
// update_base_skip_probs(cpi);
@@ -2560,28 +3061,66 @@ static void encode_without_recode_loop(VP9_COMP *cpi,
static void encode_with_recode_loop(VP9_COMP *cpi,
size_t *size,
- uint8_t *dest,
- int q,
- int bottom_index,
- int top_index) {
+ uint8_t *dest) {
VP9_COMMON *const cm = &cpi->common;
RATE_CONTROL *const rc = &cpi->rc;
+ int bottom_index, top_index;
int loop_count = 0;
+ int loop_at_this_size = 0;
int loop = 0;
int overshoot_seen = 0;
int undershoot_seen = 0;
- int q_low = bottom_index, q_high = top_index;
int frame_over_shoot_limit;
int frame_under_shoot_limit;
+ int q = 0, q_low = 0, q_high = 0;
- // Decide frame size bounds
- vp9_rc_compute_frame_size_bounds(cpi, rc->this_frame_target,
- &frame_under_shoot_limit,
- &frame_over_shoot_limit);
+ set_size_independent_vars(cpi);
do {
vp9_clear_system_state();
+ set_frame_size(cpi);
+
+ if (loop_count == 0 || cpi->resize_pending != 0) {
+ set_size_dependent_vars(cpi, &q, &bottom_index, &top_index);
+
+ // TODO(agrange) Scale cpi->max_mv_magnitude if frame-size has changed.
+ set_mv_search_params(cpi);
+
+ // Reset the loop state for new frame size.
+ overshoot_seen = 0;
+ undershoot_seen = 0;
+
+ // Reconfiguration for change in frame size has concluded.
+ cpi->resize_pending = 0;
+
+ q_low = bottom_index;
+ q_high = top_index;
+
+ loop_at_this_size = 0;
+ }
+
+ // Decide frame size bounds first time through.
+ if (loop_count == 0) {
+ vp9_rc_compute_frame_size_bounds(cpi, rc->this_frame_target,
+ &frame_under_shoot_limit,
+ &frame_over_shoot_limit);
+ }
+
+ cpi->Source = vp9_scale_if_required(cm, cpi->un_scaled_source,
+ &cpi->scaled_source);
+
+ if (cpi->unscaled_last_source != NULL)
+ cpi->Last_Source = vp9_scale_if_required(cm, cpi->unscaled_last_source,
+ &cpi->scaled_last_source);
+
+ if (frame_is_intra_only(cm) == 0) {
+ if (loop_count > 0) {
+ release_scaled_references(cpi);
+ }
+ vp9_scale_references(cpi);
+ }
+
vp9_set_quantizer(cm, q);
if (loop_count == 0)
@@ -2626,15 +3165,14 @@ static void encode_with_recode_loop(VP9_COMP *cpi,
rc->this_key_frame_forced &&
(rc->projected_frame_size < rc->max_frame_bandwidth)) {
int last_q = q;
- int kf_err;
+ int64_t kf_err;
- int high_err_target = cpi->ambient_err;
- int low_err_target = cpi->ambient_err >> 1;
+ int64_t high_err_target = cpi->ambient_err;
+ int64_t low_err_target = cpi->ambient_err >> 1;
#if CONFIG_VP9_HIGHBITDEPTH
if (cm->use_highbitdepth) {
- kf_err = vp9_highbd_get_y_sse(cpi->Source, get_frame_new_buffer(cm),
- cm->bit_depth);
+ kf_err = vp9_highbd_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
} else {
kf_err = vp9_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
}
@@ -2655,7 +3193,7 @@ static void encode_with_recode_loop(VP9_COMP *cpi,
q_high = q > q_low ? q - 1 : q_low;
// Adjust Q
- q = (q * high_err_target) / kf_err;
+ q = (int)((q * high_err_target) / kf_err);
q = MIN(q, (q_high + q_low) >> 1);
} else if (kf_err < low_err_target &&
rc->projected_frame_size >= frame_under_shoot_limit) {
@@ -2664,7 +3202,7 @@ static void encode_with_recode_loop(VP9_COMP *cpi,
q_low = q < q_high ? q + 1 : q_high;
// Adjust Q
- q = (q * low_err_target) / kf_err;
+ q = (int)((q * low_err_target) / kf_err);
q = MIN(q, (q_high + q_low + 1) >> 1);
}
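
For forced key frames the loop steers q so the reconstruction error lands between ambient_err/2 and ambient_err: q is scaled by the ratio of target to measured error, then clamped to the bisection midpoint of the current bracket. Worked numbers for the over-error branch (all values illustrative):

    #include <stdio.h>

    #define MIN(a, b) ((a) < (b) ? (a) : (b))

    int main(void) {
      int q = 40, q_low = 20;
      int q_high = q > q_low ? q - 1 : q_low;        /* 39 */
      const long long high_err_target = 1000000;     /* ambient error */
      const long long kf_err = 1600000;              /* measured y sse */
      q = (int)((q * high_err_target) / kf_err);     /* 25 */
      q = MIN(q, (q_high + q_low) >> 1);             /* min(25, 29) */
      printf("next q = %d\n", q);                    /* 25 */
      return 0;
    }
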
@@ -2680,6 +3218,20 @@ static void encode_with_recode_loop(VP9_COMP *cpi,
int last_q = q;
int retries = 0;
+ if (cpi->resize_pending == 1) {
+ // Change in frame size so go back around the recode loop.
+ cpi->rc.frame_size_selector =
+ SCALE_STEP1 - cpi->rc.frame_size_selector;
+ cpi->rc.next_frame_size_selector = cpi->rc.frame_size_selector;
+
+#if CONFIG_INTERNAL_STATS
+ ++cpi->tot_recode_hits;
+#endif
+ ++loop_count;
+ loop = 1;
+ continue;
+ }
+
// Frame size out of permitted range:
// Update correction factor & compute new Q to try...
@@ -2692,20 +3244,20 @@ static void encode_with_recode_loop(VP9_COMP *cpi,
// Raise Qlow as to at least the current value
q_low = q < q_high ? q + 1 : q_high;
- if (undershoot_seen || loop_count > 1) {
+ if (undershoot_seen || loop_at_this_size > 1) {
          // Update rate correction factor.
- vp9_rc_update_rate_correction_factors(cpi, 1);
+ vp9_rc_update_rate_correction_factors(cpi);
q = (q_high + q_low + 1) / 2;
} else {
          // Update rate correction factor.
- vp9_rc_update_rate_correction_factors(cpi, 0);
+ vp9_rc_update_rate_correction_factors(cpi);
q = vp9_rc_regulate_q(cpi, rc->this_frame_target,
bottom_index, MAX(q_high, top_index));
while (q < q_low && retries < 10) {
- vp9_rc_update_rate_correction_factors(cpi, 0);
+ vp9_rc_update_rate_correction_factors(cpi);
q = vp9_rc_regulate_q(cpi, rc->this_frame_target,
bottom_index, MAX(q_high, top_index));
retries++;
@@ -2717,11 +3269,11 @@ static void encode_with_recode_loop(VP9_COMP *cpi,
// Frame is too small
q_high = q > q_low ? q - 1 : q_low;
- if (overshoot_seen || loop_count > 1) {
- vp9_rc_update_rate_correction_factors(cpi, 1);
+ if (overshoot_seen || loop_at_this_size > 1) {
+ vp9_rc_update_rate_correction_factors(cpi);
q = (q_high + q_low) / 2;
} else {
- vp9_rc_update_rate_correction_factors(cpi, 0);
+ vp9_rc_update_rate_correction_factors(cpi);
q = vp9_rc_regulate_q(cpi, rc->this_frame_target,
bottom_index, top_index);
// Special case reset for qlow for constrained quality.
@@ -2734,7 +3286,7 @@ static void encode_with_recode_loop(VP9_COMP *cpi,
}
while (q > q_high && retries < 10) {
- vp9_rc_update_rate_correction_factors(cpi, 0);
+ vp9_rc_update_rate_correction_factors(cpi);
q = vp9_rc_regulate_q(cpi, rc->this_frame_target,
bottom_index, top_index);
retries++;
@@ -2747,7 +3299,7 @@ static void encode_with_recode_loop(VP9_COMP *cpi,
// Clamp Q to upper and lower limits:
q = clamp(q, q_low, q_high);
- loop = q != last_q;
+ loop = (q != last_q);
} else {
loop = 0;
}
@@ -2759,10 +3311,11 @@ static void encode_with_recode_loop(VP9_COMP *cpi,
loop = 0;
if (loop) {
- loop_count++;
+ ++loop_count;
+ ++loop_at_this_size;
#if CONFIG_INTERNAL_STATS
- cpi->tot_recode_hits++;
+ ++cpi->tot_recode_hits;
#endif
}
} while (loop);
@@ -2778,7 +3331,9 @@ static int get_ref_frame_flags(const VP9_COMP *cpi) {
if (gold_is_last)
flags &= ~VP9_GOLD_FLAG;
- if (cpi->rc.frames_till_gf_update_due == INT_MAX && !is_two_pass_svc(cpi))
+ if (cpi->rc.frames_till_gf_update_due == INT_MAX &&
+ (cpi->svc.number_temporal_layers == 1 &&
+ cpi->svc.number_spatial_layers == 1))
flags &= ~VP9_GOLD_FLAG;
if (alt_is_last)
@@ -2823,25 +3378,6 @@ YV12_BUFFER_CONFIG *vp9_scale_if_required(VP9_COMMON *cm,
}
}
-static int is_skippable_frame(const VP9_COMP *cpi) {
- // If the current frame does not have non-zero motion vector detected in the
- // first pass, and so do its previous and forward frames, then this frame
- // can be skipped for partition check, and the partition size is assigned
- // according to the variance
- const SVC *const svc = &cpi->svc;
- const TWO_PASS *const twopass = is_two_pass_svc(cpi) ?
- &svc->layer_context[svc->spatial_layer_id].twopass : &cpi->twopass;
-
- return (!frame_is_intra_only(&cpi->common) &&
- twopass->stats_in - 2 > twopass->stats_in_start &&
- twopass->stats_in < twopass->stats_in_end &&
- (twopass->stats_in - 1)->pcnt_inter - (twopass->stats_in - 1)->pcnt_motion
- == 1 &&
- (twopass->stats_in - 2)->pcnt_inter - (twopass->stats_in - 2)->pcnt_motion
- == 1 &&
- twopass->stats_in->pcnt_inter - twopass->stats_in->pcnt_motion == 1);
-}
-
static void set_arf_sign_bias(VP9_COMP *cpi) {
VP9_COMMON *const cm = &cpi->common;
int arf_sign_bias;
@@ -2858,31 +3394,6 @@ static void set_arf_sign_bias(VP9_COMP *cpi) {
cm->ref_frame_sign_bias[ALTREF_FRAME] = arf_sign_bias;
}
-static void set_mv_search_params(VP9_COMP *cpi) {
- const VP9_COMMON *const cm = &cpi->common;
- const unsigned int max_mv_def = MIN(cm->width, cm->height);
-
- // Default based on max resolution.
- cpi->mv_step_param = vp9_init_search_range(max_mv_def);
-
- if (cpi->sf.mv.auto_mv_step_size) {
- if (frame_is_intra_only(cm)) {
- // Initialize max_mv_magnitude for use in the first INTER frame
- // after a key/intra-only frame.
- cpi->max_mv_magnitude = max_mv_def;
- } else {
- if (cm->show_frame)
- // Allow mv_steps to correspond to twice the max mv magnitude found
- // in the previous frame, capped by the default max_mv_magnitude based
- // on resolution.
- cpi->mv_step_param =
- vp9_init_search_range(MIN(max_mv_def, 2 * cpi->max_mv_magnitude));
- cpi->max_mv_magnitude = 0;
- }
- }
-}
-
-
int setup_interp_filter_search_mask(VP9_COMP *cpi) {
INTERP_FILTER ifilter;
int ref_total[MAX_REF_FRAMES] = {0};
@@ -2917,43 +3428,21 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
const VP9EncoderConfig *const oxcf = &cpi->oxcf;
struct segmentation *const seg = &cm->seg;
TX_SIZE t;
- int q;
- int top_index;
- int bottom_index;
set_ext_overrides(cpi);
-
- cpi->Source = vp9_scale_if_required(cm, cpi->un_scaled_source,
- &cpi->scaled_source);
-
- if (cpi->unscaled_last_source != NULL)
- cpi->Last_Source = vp9_scale_if_required(cm, cpi->unscaled_last_source,
- &cpi->scaled_last_source);
-
- vp9_scale_references(cpi);
-
vp9_clear_system_state();
- // Enable or disable mode based tweaking of the zbin.
- // For 2 pass only used where GF/ARF prediction quality
- // is above a threshold.
- cpi->zbin_mode_boost = 0;
- cpi->zbin_mode_boost_enabled = 0;
-
// Set the arf sign bias for this frame.
set_arf_sign_bias(cpi);
// Set default state for segment based loop filter update flags.
cm->lf.mode_ref_delta_update = 0;
- set_mv_search_params(cpi);
-
if (cpi->oxcf.pass == 2 &&
cpi->sf.adaptive_interp_filter_search)
cpi->sf.interp_filter_search_mask =
setup_interp_filter_search_mask(cpi);
-
// Set various flags etc to special state if it is a key frame.
if (frame_is_intra_only(cm)) {
// Reset the loop filter deltas and segmentation map.
@@ -2969,6 +3458,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
cpi->rc.source_alt_ref_active = 0;
cm->error_resilient_mode = oxcf->error_resilient_mode;
+ cm->frame_parallel_decoding_mode = oxcf->frame_parallel_decoding_mode;
// By default, encoder assumes decoder can use prev_mi.
if (cm->error_resilient_mode) {
@@ -2976,7 +3466,6 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
cm->reset_frame_context = 0;
cm->refresh_frame_context = 0;
} else if (cm->intra_only) {
- cm->frame_parallel_decoding_mode = oxcf->frame_parallel_decoding_mode;
// Only reset the current context.
cm->reset_frame_context = 2;
}
@@ -3013,20 +3502,6 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
}
}
- // Configure experimental use of segmentation for enhanced coding of
- // static regions if indicated.
- // Only allowed in second pass of two pass (as requires lagged coding)
- // and if the relevant speed feature flag is set.
- if (oxcf->pass == 2 && cpi->sf.static_segmentation)
- configure_static_seg_features(cpi);
-
- // Check if the current frame is skippable for the partition search in the
- // second pass according to the first pass stats
- if (cpi->sf.allow_partition_search_skip && oxcf->pass == 2 &&
- (!cpi->use_svc || is_two_pass_svc(cpi))) {
- cpi->partition_search_skippable_frame = is_skippable_frame(cpi);
- }
-
// For 1 pass CBR, check if we are dropping this frame.
// Never drop on key frame.
if (oxcf->pass == 0 &&
@@ -3041,57 +3516,15 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
vp9_clear_system_state();
-#if CONFIG_VP9_POSTPROC
- if (oxcf->noise_sensitivity > 0) {
- int l = 0;
- switch (oxcf->noise_sensitivity) {
- case 1:
- l = 20;
- break;
- case 2:
- l = 40;
- break;
- case 3:
- l = 60;
- break;
- case 4:
- case 5:
- l = 100;
- break;
- case 6:
- l = 150;
- break;
- }
- vp9_denoise(cpi->Source, cpi->Source, l);
- }
-#endif
-
#if CONFIG_INTERNAL_STATS
- {
- int i;
- for (i = 0; i < MAX_MODES; ++i)
- cpi->mode_chosen_counts[i] = 0;
- }
+ memset(cpi->mode_chosen_counts, 0,
+ MAX_MODES * sizeof(*cpi->mode_chosen_counts));
#endif
- vp9_set_speed_features(cpi);
-
- vp9_set_rd_speed_thresholds(cpi);
- vp9_set_rd_speed_thresholds_sub8x8(cpi);
-
- // Decide q and q bounds.
- q = vp9_rc_pick_q_and_bounds(cpi, &bottom_index, &top_index);
-
- if (!frame_is_intra_only(cm)) {
- cm->interp_filter = cpi->sf.default_interp_filter;
- /* TODO: Decide this more intelligently */
- vp9_set_high_precision_mv(cpi, q < HIGH_PRECISION_MV_QTHRESH);
- }
-
if (cpi->sf.recode_loop == DISALLOW_RECODE) {
- encode_without_recode_loop(cpi, q);
+ encode_without_recode_loop(cpi);
} else {
- encode_with_recode_loop(cpi, size, dest, q, bottom_index, top_index);
+ encode_with_recode_loop(cpi, size, dest);
}
#if CONFIG_VP9_TEMPORAL_DENOISING
@@ -3102,7 +3535,11 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
}
#endif
#endif
-
+#ifdef OUTPUT_YUV_SKINMAP
+ if (cpi->common.current_video_frame > 1) {
+ vp9_compute_skin_map(cpi, yuv_skinmap_file);
+ }
+#endif
// Special case code to reduce pulsing when key frames are forced at a
// fixed interval. Note the reconstruction error if it is the frame before
@@ -3111,8 +3548,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
#if CONFIG_VP9_HIGHBITDEPTH
if (cm->use_highbitdepth) {
cpi->ambient_err = vp9_highbd_get_y_sse(cpi->Source,
- get_frame_new_buffer(cm),
- cm->bit_depth);
+ get_frame_new_buffer(cm));
} else {
cpi->ambient_err = vp9_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
}
@@ -3136,11 +3572,14 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
if (cm->seg.update_map)
update_reference_segmentation_map(cpi);
- release_scaled_references(cpi);
+ if (frame_is_intra_only(cm) == 0) {
+ release_scaled_references(cpi);
+ }
vp9_update_reference_frames(cpi);
for (t = TX_4X4; t <= TX_32X32; t++)
- full_to_model_counts(cm->counts.coef[t], cpi->coef_counts[t]);
+ full_to_model_counts(cpi->td.counts->coef[t],
+ cpi->td.rd_counts.coef_counts[t]);
if (!cm->error_resilient_mode && !cm->frame_parallel_decoding_mode)
vp9_adapt_coef_probs(cm);
@@ -3196,13 +3635,13 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
if (cm->show_frame) {
vp9_swap_mi_and_prev_mi(cm);
-
// Don't increment frame counters if this was an altref buffer
    // update, not a real frame.
++cm->current_video_frame;
if (cpi->use_svc)
vp9_inc_frame_in_layer(cpi);
}
+ cm->prev_frame = cm->cur_frame;
if (is_two_pass_svc(cpi))
cpi->svc.layer_context[cpi->svc.spatial_layer_id].last_frame_type =
@@ -3234,13 +3673,13 @@ static void Pass2Encode(VP9_COMP *cpi, size_t *size,
vp9_twopass_postencode_update(cpi);
}
-static void init_motion_estimation(VP9_COMP *cpi) {
- int y_stride = cpi->scaled_source.y_stride;
-
- if (cpi->sf.mv.search_method == NSTEP) {
- vp9_init3smotion_compensation(&cpi->ss_cfg, y_stride);
- } else if (cpi->sf.mv.search_method == DIAMOND) {
- vp9_init_dsmotion_compensation(&cpi->ss_cfg, y_stride);
+static void init_ref_frame_bufs(VP9_COMMON *cm) {
+ int i;
+ BufferPool *const pool = cm->buffer_pool;
+ cm->new_fb_idx = INVALID_IDX;
+ for (i = 0; i < REF_FRAMES; ++i) {
+ cm->ref_frame_map[i] = INVALID_IDX;
+ pool->frame_bufs[i].ref_count = 0;
}
}
@@ -3251,7 +3690,12 @@ static void check_initial_width(VP9_COMP *cpi,
int subsampling_x, int subsampling_y) {
VP9_COMMON *const cm = &cpi->common;
- if (!cpi->initial_width) {
+ if (!cpi->initial_width ||
+#if CONFIG_VP9_HIGHBITDEPTH
+ cm->use_highbitdepth != use_highbitdepth ||
+#endif
+ cm->subsampling_x != subsampling_x ||
+ cm->subsampling_y != subsampling_y) {
cm->subsampling_x = subsampling_x;
cm->subsampling_y = subsampling_y;
#if CONFIG_VP9_HIGHBITDEPTH
@@ -3259,16 +3703,31 @@ static void check_initial_width(VP9_COMP *cpi,
#endif
alloc_raw_frame_buffers(cpi);
- alloc_ref_frame_buffers(cpi);
+ init_ref_frame_bufs(cm);
alloc_util_frame_buffers(cpi);
- init_motion_estimation(cpi);
+ init_motion_estimation(cpi); // TODO(agrange) This can be removed.
cpi->initial_width = cm->width;
cpi->initial_height = cm->height;
+ cpi->initial_mbs = cm->MBs;
}
}
+#if CONFIG_VP9_TEMPORAL_DENOISING
+static void setup_denoiser_buffer(VP9_COMP *cpi) {
+ VP9_COMMON *const cm = &cpi->common;
+ if (cpi->oxcf.noise_sensitivity > 0 &&
+ !cpi->denoiser.frame_buffer_initialized) {
+ vp9_denoiser_alloc(&(cpi->denoiser), cm->width, cm->height,
+ cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+ cm->use_highbitdepth,
+#endif
+ VP9_ENC_BORDER_IN_PIXELS);
+ }
+}
+#endif
int vp9_receive_raw_frame(VP9_COMP *cpi, unsigned int frame_flags,
YV12_BUFFER_CONFIG *sd, int64_t time_stamp,
@@ -3285,9 +3744,16 @@ int vp9_receive_raw_frame(VP9_COMP *cpi, unsigned int frame_flags,
check_initial_width(cpi, subsampling_x, subsampling_y);
#endif // CONFIG_VP9_HIGHBITDEPTH
+#if CONFIG_VP9_TEMPORAL_DENOISING
+ setup_denoiser_buffer(cpi);
+#endif
vpx_usec_timer_start(&timer);
- if (vp9_lookahead_push(cpi->lookahead, sd, time_stamp, end_time, frame_flags))
+ if (vp9_lookahead_push(cpi->lookahead, sd, time_stamp, end_time,
+#if CONFIG_VP9_HIGHBITDEPTH
+ use_highbitdepth,
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ frame_flags))
res = -1;
vpx_usec_timer_mark(&timer);
cpi->time_receive_data += vpx_usec_timer_elapsed(&timer);
@@ -3295,13 +3761,13 @@ int vp9_receive_raw_frame(VP9_COMP *cpi, unsigned int frame_flags,
if ((cm->profile == PROFILE_0 || cm->profile == PROFILE_2) &&
(subsampling_x != 1 || subsampling_y != 1)) {
vpx_internal_error(&cm->error, VPX_CODEC_INVALID_PARAM,
- "Non-4:2:0 color space requires profile 1 or 3");
+ "Non-4:2:0 color format requires profile 1 or 3");
res = -1;
}
if ((cm->profile == PROFILE_1 || cm->profile == PROFILE_3) &&
(subsampling_x == 1 && subsampling_y == 1)) {
vpx_internal_error(&cm->error, VPX_CODEC_INVALID_PARAM,
- "4:2:0 color space requires profile 0 or 2");
+ "4:2:0 color format requires profile 0 or 2");
res = -1;
}
@@ -3402,19 +3868,33 @@ static void check_src_altref(VP9_COMP *cpi,
}
}
+#if CONFIG_INTERNAL_STATS
+extern double vp9_get_blockiness(const unsigned char *img1, int img1_pitch,
+ const unsigned char *img2, int img2_pitch,
+ int width, int height);
+#endif
+
+void adjust_image_stat(double y, double u, double v, double all, ImageStat *s) {
+ s->stat[Y] += y;
+ s->stat[U] += u;
+ s->stat[V] += v;
+ s->stat[ALL] += all;
+ s->worst = MIN(s->worst, all);
+}
+
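
adjust_image_stat() accumulates per-channel running sums and tracks the worst overall score as the minimum, since for PSNR/SSIM-style metrics lower means worse. A tiny illustration of that aggregation:

    #include <stdio.h>

    #define MIN(a, b) ((a) < (b) ? (a) : (b))

    int main(void) {
      double sum = 0.0, worst = 1e9;  /* assumed large initial value */
      const double frames[3] = {41.2, 38.7, 40.1};
      int i;
      for (i = 0; i < 3; ++i) {
        sum += frames[i];
        worst = MIN(worst, frames[i]);
      }
      printf("avg=%.3f worst=%.3f\n", sum / 3, worst); /* 40.000 38.700 */
      return 0;
    }
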
int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
size_t *size, uint8_t *dest,
int64_t *time_stamp, int64_t *time_end, int flush) {
const VP9EncoderConfig *const oxcf = &cpi->oxcf;
VP9_COMMON *const cm = &cpi->common;
- MACROBLOCKD *const xd = &cpi->mb.e_mbd;
+ BufferPool *const pool = cm->buffer_pool;
RATE_CONTROL *const rc = &cpi->rc;
struct vpx_usec_timer cmptimer;
YV12_BUFFER_CONFIG *force_src_buffer = NULL;
struct lookahead_entry *last_source = NULL;
struct lookahead_entry *source = NULL;
- MV_REFERENCE_FRAME ref_frame;
int arf_src_index;
+ int i;
if (is_two_pass_svc(cpi)) {
#if CONFIG_SPATIAL_SVC
@@ -3431,6 +3911,15 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
vp9_set_high_precision_mv(cpi, ALTREF_HIGH_PRECISION_MV);
+  // Check whether multi-arf is enabled.
+  // Note that at the moment multi_arf is only configured for 2 pass VBR and
+  // will not work properly with SVC.
+ if ((oxcf->pass == 2) && !cpi->use_svc &&
+ (cpi->oxcf.enable_auto_arf > 1))
+ cpi->multi_arf_allowed = 1;
+ else
+ cpi->multi_arf_allowed = 0;
+
// Normal defaults
cm->reset_frame_context = 0;
cm->refresh_frame_context = 1;
@@ -3456,7 +3945,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
int i;
// Reference a hidden frame from a lower layer
for (i = cpi->svc.spatial_layer_id - 1; i >= 0; --i) {
- if (oxcf->ss_play_alternate[i]) {
+ if (oxcf->ss_enable_auto_arf[i]) {
cpi->gld_fb_idx = cpi->svc.layer_context[i].alt_ref_idx;
break;
}
@@ -3500,6 +3989,13 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
if (source != NULL) {
cm->show_frame = 1;
cm->intra_only = 0;
+    // If the flags indicate an intra frame but the current picture is for a
+    // non-zero spatial layer, it should not be an intra picture.
+ // TODO(Won Kap): this needs to change if per-layer intra frame is
+ // allowed.
+ if ((source->flags & VPX_EFLAG_FORCE_KF) && cpi->svc.spatial_layer_id) {
+ source->flags &= ~(unsigned int)(VPX_EFLAG_FORCE_KF);
+ }
// Check to see if the frame should be encoded as an arf overlay.
check_src_altref(cpi, source);
@@ -3544,23 +4040,17 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
vp9_restore_layer_context(cpi);
}
- // start with a 0 size frame
- *size = 0;
-
- /* find a free buffer for the new frame, releasing the reference previously
- * held.
- */
- cm->frame_bufs[cm->new_fb_idx].ref_count--;
+ // Find a free buffer for the new frame, releasing the reference previously
+ // held.
+ if (cm->new_fb_idx != INVALID_IDX) {
+ --pool->frame_bufs[cm->new_fb_idx].ref_count;
+ }
cm->new_fb_idx = get_free_fb(cm);
- // For two pass encodes analyse the first pass stats and determine
- // the bit allocation and other parameters for this frame / group of frames.
- if ((oxcf->pass == 2) &&
- (!cpi->use_svc ||
- (is_two_pass_svc(cpi) &&
- cpi->svc.encode_empty_frame_state != ENCODING))) {
- vp9_rc_get_second_pass_params(cpi);
- }
+ if (cm->new_fb_idx == INVALID_IDX)
+ return -1;
+
+ cm->cur_frame = &pool->frame_bufs[cm->new_fb_idx];
if (!cpi->use_svc && cpi->multi_arf_allowed) {
if (cm->frame_type == KEY_FRAME) {
@@ -3571,70 +4061,38 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
}
}
- cpi->frame_flags = *frame_flags;
-
- if (oxcf->pass == 2 &&
- cm->current_video_frame == 0 &&
- oxcf->allow_spatial_resampling &&
- oxcf->rc_mode == VPX_VBR) {
- // Internal scaling is triggered on the first frame.
- vp9_set_size_literal(cpi, oxcf->scaled_frame_width,
- oxcf->scaled_frame_height);
- }
-
- // Reset the frame pointers to the current frame size
- vp9_realloc_frame_buffer(get_frame_new_buffer(cm),
- cm->width, cm->height,
- cm->subsampling_x, cm->subsampling_y,
-#if CONFIG_VP9_HIGHBITDEPTH
- cm->use_highbitdepth,
-#endif
- VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL);
+ // Start with a 0 size frame.
+ *size = 0;
- alloc_util_frame_buffers(cpi);
- init_motion_estimation(cpi);
+ cpi->frame_flags = *frame_flags;
- for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
- const int idx = cm->ref_frame_map[get_ref_frame_idx(cpi, ref_frame)];
- YV12_BUFFER_CONFIG *const buf = &cm->frame_bufs[idx].buf;
- RefBuffer *const ref_buf = &cm->frame_refs[ref_frame - 1];
- ref_buf->buf = buf;
- ref_buf->idx = idx;
-#if CONFIG_VP9_HIGHBITDEPTH
- vp9_setup_scale_factors_for_frame(&ref_buf->sf,
- buf->y_crop_width, buf->y_crop_height,
- cm->width, cm->height,
- (buf->flags & YV12_FLAG_HIGHBITDEPTH) ?
- 1 : 0);
-#else
- vp9_setup_scale_factors_for_frame(&ref_buf->sf,
- buf->y_crop_width, buf->y_crop_height,
- cm->width, cm->height);
-#endif // CONFIG_VP9_HIGHBITDEPTH
- if (vp9_is_scaled(&ref_buf->sf))
- vp9_extend_frame_borders(buf);
+ if ((oxcf->pass == 2) &&
+ (!cpi->use_svc ||
+ (is_two_pass_svc(cpi) &&
+ cpi->svc.encode_empty_frame_state != ENCODING))) {
+ vp9_rc_get_second_pass_params(cpi);
+ } else {
+ set_frame_size(cpi);
}
- set_ref_ptrs(cm, xd, LAST_FRAME, LAST_FRAME);
-
- if (oxcf->aq_mode == VARIANCE_AQ) {
- vp9_vaq_init();
- }
+ for (i = 0; i < MAX_REF_FRAMES; ++i)
+ cpi->scaled_ref_idx[i] = INVALID_IDX;
if (oxcf->pass == 1 &&
(!cpi->use_svc || is_two_pass_svc(cpi))) {
const int lossless = is_lossless_requested(oxcf);
#if CONFIG_VP9_HIGHBITDEPTH
if (cpi->oxcf.use_highbitdepth)
- cpi->mb.fwd_txm4x4 = lossless ? vp9_highbd_fwht4x4 : vp9_highbd_fdct4x4;
+ cpi->td.mb.fwd_txm4x4 = lossless ?
+ vp9_highbd_fwht4x4 : vp9_highbd_fdct4x4;
else
- cpi->mb.fwd_txm4x4 = lossless ? vp9_fwht4x4 : vp9_fdct4x4;
- cpi->mb.highbd_itxm_add = lossless ? vp9_highbd_iwht4x4_add :
+ cpi->td.mb.fwd_txm4x4 = lossless ? vp9_fwht4x4 : vp9_fdct4x4;
+ cpi->td.mb.highbd_itxm_add = lossless ? vp9_highbd_iwht4x4_add :
vp9_highbd_idct4x4_add;
#else
- cpi->mb.fwd_txm4x4 = lossless ? vp9_fwht4x4 : vp9_fdct4x4;
+ cpi->td.mb.fwd_txm4x4 = lossless ? vp9_fwht4x4 : vp9_fdct4x4;
#endif // CONFIG_VP9_HIGHBITDEPTH
- cpi->mb.itxm_add = lossless ? vp9_iwht4x4_add : vp9_idct4x4_add;
+ cpi->td.mb.itxm_add = lossless ? vp9_iwht4x4_add : vp9_idct4x4_add;
vp9_first_pass(cpi, source);
} else if (oxcf->pass == 2 &&
(!cpi->use_svc || is_two_pass_svc(cpi))) {
@@ -3647,10 +4105,10 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
}
if (cm->refresh_frame_context)
- cm->frame_contexts[cm->frame_context_idx] = cm->fc;
+ cm->frame_contexts[cm->frame_context_idx] = *cm->fc;
- // Frame was dropped, release scaled references.
- if (*size == 0) {
+ // No frame encoded, or frame was dropped, release scaled references.
+ if ((*size == 0) && (frame_is_intra_only(cm) == 0)) {
release_scaled_references(cpi);
}
@@ -3676,6 +4134,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
#if CONFIG_INTERNAL_STATS
if (oxcf->pass != 1) {
+ double samples;
cpi->bytes += (int)(*size);
if (cm->show_frame) {
@@ -3687,47 +4146,55 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
YV12_BUFFER_CONFIG *pp = &cm->post_proc_buffer;
PSNR_STATS psnr;
#if CONFIG_VP9_HIGHBITDEPTH
- calc_highbd_psnr(orig, recon, &psnr, cpi->mb.e_mbd.bd,
+ calc_highbd_psnr(orig, recon, &psnr, cpi->td.mb.e_mbd.bd,
cpi->oxcf.input_bit_depth);
#else
calc_psnr(orig, recon, &psnr);
#endif // CONFIG_VP9_HIGHBITDEPTH
- cpi->total += psnr.psnr[0];
- cpi->total_y += psnr.psnr[1];
- cpi->total_u += psnr.psnr[2];
- cpi->total_v += psnr.psnr[3];
+ adjust_image_stat(psnr.psnr[1], psnr.psnr[2], psnr.psnr[3],
+ psnr.psnr[0], &cpi->psnr);
cpi->total_sq_error += psnr.sse[0];
cpi->total_samples += psnr.samples[0];
+ samples = psnr.samples[0];
{
PSNR_STATS psnr2;
double frame_ssim2 = 0, weight = 0;
#if CONFIG_VP9_POSTPROC
- // TODO(agrange) Add resizing of post-proc buffer in here when the
- // encoder is changed to use on-demand buffer allocation.
+ if (vp9_alloc_frame_buffer(&cm->post_proc_buffer,
+ recon->y_crop_width, recon->y_crop_height,
+ cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+ cm->use_highbitdepth,
+#endif
+ VP9_ENC_BORDER_IN_PIXELS,
+ cm->byte_alignment) < 0) {
+ vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+ "Failed to allocate post processing buffer");
+ }
+
vp9_deblock(cm->frame_to_show, &cm->post_proc_buffer,
cm->lf.filter_level * 10 / 6);
#endif
vp9_clear_system_state();
#if CONFIG_VP9_HIGHBITDEPTH
- calc_highbd_psnr(orig, pp, &psnr, cpi->mb.e_mbd.bd,
+          calc_highbd_psnr(orig, pp, &psnr2, cpi->td.mb.e_mbd.bd,
cpi->oxcf.input_bit_depth);
#else
calc_psnr(orig, pp, &psnr2);
#endif // CONFIG_VP9_HIGHBITDEPTH
- cpi->totalp += psnr2.psnr[0];
- cpi->totalp_y += psnr2.psnr[1];
- cpi->totalp_u += psnr2.psnr[2];
- cpi->totalp_v += psnr2.psnr[3];
cpi->totalp_sq_error += psnr2.sse[0];
cpi->totalp_samples += psnr2.samples[0];
+ adjust_image_stat(psnr2.psnr[1], psnr2.psnr[2], psnr2.psnr[3],
+ psnr2.psnr[0], &cpi->psnrp);
#if CONFIG_VP9_HIGHBITDEPTH
if (cm->use_highbitdepth) {
- frame_ssim2 = vp9_highbd_calc_ssim(orig, recon, &weight, xd->bd);
+ frame_ssim2 = vp9_highbd_calc_ssim(orig, recon, &weight,
+ (int)cm->bit_depth);
} else {
frame_ssim2 = vp9_calc_ssim(orig, recon, &weight);
}
@@ -3735,13 +4202,14 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
frame_ssim2 = vp9_calc_ssim(orig, recon, &weight);
#endif // CONFIG_VP9_HIGHBITDEPTH
+          cpi->worst_ssim = MIN(cpi->worst_ssim, frame_ssim2);
cpi->summed_quality += frame_ssim2 * weight;
cpi->summed_weights += weight;
#if CONFIG_VP9_HIGHBITDEPTH
if (cm->use_highbitdepth) {
frame_ssim2 = vp9_highbd_calc_ssim(
- orig, &cm->post_proc_buffer, &weight, xd->bd);
+ orig, &cm->post_proc_buffer, &weight, (int)cm->bit_depth);
} else {
frame_ssim2 = vp9_calc_ssim(orig, &cm->post_proc_buffer, &weight);
}
@@ -3762,14 +4230,40 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
#endif
}
}
+ if (cpi->b_calculate_blockiness) {
+ double frame_blockiness = vp9_get_blockiness(
+ cpi->Source->y_buffer, cpi->Source->y_stride,
+ cm->frame_to_show->y_buffer, cm->frame_to_show->y_stride,
+ cpi->Source->y_width, cpi->Source->y_height);
+ cpi->worst_blockiness = MAX(cpi->worst_blockiness, frame_blockiness);
+ cpi->total_blockiness += frame_blockiness;
+ }
+ if (cpi->b_calculate_consistency) {
+ double this_inconsistency = vp9_get_ssim_metrics(
+ cpi->Source->y_buffer, cpi->Source->y_stride,
+ cm->frame_to_show->y_buffer, cm->frame_to_show->y_stride,
+ cpi->Source->y_width, cpi->Source->y_height, cpi->ssim_vars,
+ &cpi->metrics, 1);
+
+ const double peak = (double)((1 << cpi->oxcf.input_bit_depth) - 1);
+
+ double consistency = vpx_sse_to_psnr(samples, peak,
+ (double)cpi->total_inconsistency);
+
+ if (consistency > 0.0)
+ cpi->worst_consistency = MIN(cpi->worst_consistency,
+ consistency);
+ cpi->total_inconsistency += this_inconsistency;
+ }
if (cpi->b_calculate_ssimg) {
double y, u, v, frame_all;
#if CONFIG_VP9_HIGHBITDEPTH
if (cm->use_highbitdepth) {
frame_all = vp9_highbd_calc_ssimg(cpi->Source, cm->frame_to_show, &y,
- &u, &v, xd->bd);
+ &u, &v, (int)cm->bit_depth);
} else {
frame_all = vp9_calc_ssimg(cpi->Source, cm->frame_to_show, &y, &u,
&v);
@@ -3777,10 +4271,19 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
#else
frame_all = vp9_calc_ssimg(cpi->Source, cm->frame_to_show, &y, &u, &v);
#endif // CONFIG_VP9_HIGHBITDEPTH
- cpi->total_ssimg_y += y;
- cpi->total_ssimg_u += u;
- cpi->total_ssimg_v += v;
- cpi->total_ssimg_all += frame_all;
+ adjust_image_stat(y, u, v, frame_all, &cpi->ssimg);
+ }
+ {
+ double y, u, v, frame_all;
+ frame_all = vp9_calc_fastssim(cpi->Source, cm->frame_to_show, &y, &u,
+ &v);
+ adjust_image_stat(y, u, v, frame_all, &cpi->fastssim);
+ /* TODO(JBB): add 10/12 bit support */
+ }
+ {
+ double y, u, v, frame_all;
+ frame_all = vp9_psnrhvs(cpi->Source, cm->frame_to_show, &y, &u, &v);
+ adjust_image_stat(y, u, v, frame_all, &cpi->psnrhvs);
}
}
}
@@ -3833,29 +4336,6 @@ int vp9_get_preview_raw_frame(VP9_COMP *cpi, YV12_BUFFER_CONFIG *dest,
}
}
-int vp9_set_active_map(VP9_COMP *cpi, unsigned char *map, int rows, int cols) {
- if (rows == cpi->common.mb_rows && cols == cpi->common.mb_cols) {
- const int mi_rows = cpi->common.mi_rows;
- const int mi_cols = cpi->common.mi_cols;
- if (map) {
- int r, c;
- for (r = 0; r < mi_rows; r++) {
- for (c = 0; c < mi_cols; c++) {
- cpi->segmentation_map[r * mi_cols + c] =
- !map[(r >> 1) * cols + (c >> 1)];
- }
- }
- vp9_enable_segfeature(&cpi->common.seg, 1, SEG_LVL_SKIP);
- vp9_enable_segmentation(&cpi->common.seg);
- } else {
- vp9_disable_segmentation(&cpi->common.seg);
- }
- return 0;
- } else {
- return -1;
- }
-}
-
int vp9_set_internal_size(VP9_COMP *cpi,
VPX_SCALING horiz_mode, VPX_SCALING vert_mode) {
VP9_COMMON *cm = &cpi->common;
@@ -3882,11 +4362,15 @@ int vp9_set_size_literal(VP9_COMP *cpi, unsigned int width,
unsigned int height) {
VP9_COMMON *cm = &cpi->common;
#if CONFIG_VP9_HIGHBITDEPTH
- check_initial_width(cpi, 1, 1, cm->use_highbitdepth);
+ check_initial_width(cpi, cm->use_highbitdepth, 1, 1);
#else
check_initial_width(cpi, 1, 1);
#endif // CONFIG_VP9_HIGHBITDEPTH
+#if CONFIG_VP9_TEMPORAL_DENOISING
+ setup_denoiser_buffer(cpi);
+#endif
+
if (width) {
cm->width = width;
if (cm->width > cpi->initial_width) {
@@ -3915,41 +4399,25 @@ void vp9_set_svc(VP9_COMP *cpi, int use_svc) {
return;
}
-int vp9_get_y_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b) {
+int64_t vp9_get_y_sse(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b) {
assert(a->y_crop_width == b->y_crop_width);
assert(a->y_crop_height == b->y_crop_height);
- return (int)get_sse(a->y_buffer, a->y_stride, b->y_buffer, b->y_stride,
- a->y_crop_width, a->y_crop_height);
+ return get_sse(a->y_buffer, a->y_stride, b->y_buffer, b->y_stride,
+ a->y_crop_width, a->y_crop_height);
}
#if CONFIG_VP9_HIGHBITDEPTH
-int vp9_highbd_get_y_sse(const YV12_BUFFER_CONFIG *a,
- const YV12_BUFFER_CONFIG *b,
- vpx_bit_depth_t bit_depth) {
- unsigned int sse;
- int sum;
+int64_t vp9_highbd_get_y_sse(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b) {
assert(a->y_crop_width == b->y_crop_width);
assert(a->y_crop_height == b->y_crop_height);
assert((a->flags & YV12_FLAG_HIGHBITDEPTH) != 0);
assert((b->flags & YV12_FLAG_HIGHBITDEPTH) != 0);
- switch (bit_depth) {
- case VPX_BITS_8:
- highbd_variance(a->y_buffer, a->y_stride, b->y_buffer, b->y_stride,
- a->y_crop_width, a->y_crop_height, &sse, &sum);
- return (int) sse;
- case VPX_BITS_10:
- highbd_10_variance(a->y_buffer, a->y_stride, b->y_buffer, b->y_stride,
- a->y_crop_width, a->y_crop_height, &sse, &sum);
- return (int) sse;
- case VPX_BITS_12:
- highbd_12_variance(a->y_buffer, a->y_stride, b->y_buffer, b->y_stride,
- a->y_crop_width, a->y_crop_height, &sse, &sum);
- return (int) sse;
- default:
- assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12");
- return -1;
- }
+
+ return highbd_get_sse(a->y_buffer, a->y_stride, b->y_buffer, b->y_stride,
+ a->y_crop_width, a->y_crop_height);
}
#endif // CONFIG_VP9_HIGHBITDEPTH
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encoder.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encoder.h
index 2c56b81f360..41f1c13d493 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encoder.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encoder.h
@@ -17,9 +17,12 @@
#include "vpx/internal/vpx_codec_internal.h"
#include "vpx/vp8cx.h"
+#include "vp9/common/vp9_alloccommon.h"
#include "vp9/common/vp9_ppflags.h"
#include "vp9/common/vp9_entropymode.h"
+#include "vp9/common/vp9_thread_common.h"
#include "vp9/common/vp9_onyxc_int.h"
+#include "vp9/common/vp9_thread.h"
#include "vp9/encoder/vp9_aq_cyclicrefresh.h"
#include "vp9/encoder/vp9_context_tree.h"
@@ -31,10 +34,14 @@
#include "vp9/encoder/vp9_quantize.h"
#include "vp9/encoder/vp9_ratectrl.h"
#include "vp9/encoder/vp9_rd.h"
+#if CONFIG_INTERNAL_STATS
+#include "vp9/encoder/vp9_ssim.h"
+#endif
#include "vp9/encoder/vp9_speed_features.h"
#include "vp9/encoder/vp9_svc_layercontext.h"
#include "vp9/encoder/vp9_tokenize.h"
#include "vp9/encoder/vp9_variance.h"
+
#if CONFIG_VP9_TEMPORAL_DENOISING
#include "vp9/encoder/vp9_denoiser.h"
#endif
@@ -109,6 +116,11 @@ typedef enum {
AQ_MODE_COUNT // This should always be the last member of the enum
} AQ_MODE;
+typedef enum {
+ RESIZE_NONE = 0, // No frame resizing allowed (except for SVC).
+ RESIZE_FIXED = 1, // All frames are coded at the specified dimension.
+ RESIZE_DYNAMIC = 2 // Coded size of each frame is determined by the codec.
+} RESIZE_TYPE;
typedef struct VP9EncoderConfig {
BITSTREAM_PROFILE profile;
@@ -122,7 +134,12 @@ typedef struct VP9EncoderConfig {
  int noise_sensitivity;  // pre-processing blur: recommendation 0
  int sharpness;          // sharpening output: recommendation 0
int speed;
+ // maximum allowed bitrate for any intra frame in % of bitrate target.
unsigned int rc_max_intra_bitrate_pct;
+ // maximum allowed bitrate for any inter frame in % of bitrate target.
+ unsigned int rc_max_inter_bitrate_pct;
+ // percent of rate boost for golden frame in CBR mode.
+ unsigned int gf_cbr_boost_pct;
MODE mode;
int pass;
@@ -159,7 +176,7 @@ typedef struct VP9EncoderConfig {
AQ_MODE aq_mode; // Adaptive Quantization mode
// Internal frame size scaling.
- int allow_spatial_resampling;
+ RESIZE_TYPE resize_mode;
int scaled_frame_width;
int scaled_frame_height;
@@ -178,13 +195,12 @@ typedef struct VP9EncoderConfig {
int ts_number_layers; // Number of temporal layers.
// Bitrate allocation for spatial layers.
int ss_target_bitrate[VPX_SS_MAX_LAYERS];
- int ss_play_alternate[VPX_SS_MAX_LAYERS];
+ int ss_enable_auto_arf[VPX_SS_MAX_LAYERS];
// Bitrate allocation (CBR mode) and framerate factor, for temporal layers.
int ts_target_bitrate[VPX_TS_MAX_LAYERS];
int ts_rate_decimator[VPX_TS_MAX_LAYERS];
- // these parameters aren't to be used in final build don't use!!!
- int play_alternate;
+ int enable_auto_arf;
  int encode_breakout;  // early breakout: for video conf recommend 800
@@ -206,6 +222,8 @@ typedef struct VP9EncoderConfig {
int tile_columns;
int tile_rows;
+ int max_threads;
+
vpx_fixed_buf_t two_pass_stats_in;
struct vpx_codec_pkt_list *output_pkt_list;
@@ -218,15 +236,62 @@ typedef struct VP9EncoderConfig {
#if CONFIG_VP9_HIGHBITDEPTH
int use_highbitdepth;
#endif
+ vpx_color_space_t color_space;
} VP9EncoderConfig;
static INLINE int is_lossless_requested(const VP9EncoderConfig *cfg) {
return cfg->best_allowed_q == 0 && cfg->worst_allowed_q == 0;
}
+// TODO(jingning) All spatially adaptive variables should go to TileDataEnc.
+typedef struct TileDataEnc {
+ TileInfo tile_info;
+ int thresh_freq_fact[BLOCK_SIZES][MAX_MODES];
+ int mode_map[BLOCK_SIZES][MAX_MODES];
+} TileDataEnc;
+
+typedef struct RD_COUNTS {
+ vp9_coeff_count coef_counts[TX_SIZES][PLANE_TYPES];
+ int64_t comp_pred_diff[REFERENCE_MODES];
+ int64_t tx_select_diff[TX_MODES];
+ int64_t filter_diff[SWITCHABLE_FILTER_CONTEXTS];
+} RD_COUNTS;
+
+typedef struct ThreadData {
+ MACROBLOCK mb;
+ RD_COUNTS rd_counts;
+ FRAME_COUNTS *counts;
+
+ PICK_MODE_CONTEXT *leaf_tree;
+ PC_TREE *pc_tree;
+ PC_TREE *pc_root;
+} ThreadData;
+
+struct EncWorkerData;
+
+typedef struct ActiveMap {
+ int enabled;
+ int update;
+ unsigned char *map;
+} ActiveMap;
+
+typedef enum {
+ Y,
+ U,
+ V,
+ ALL
+} STAT_TYPE;
+
+typedef struct IMAGE_STAT {
+ double stat[ALL+1];
+ double worst;
+} ImageStat;
+
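/* Context (sketch, not part of this hunk): the ImageStat accumulators above
 * are updated once per frame by the adjust_image_stat() helper this patch
 * adds in vp9_encoder.c. A minimal sketch of such a helper, assuming the
 * Y/U/V/ALL indices from STAT_TYPE and the tree's MIN() macro: */
static void adjust_image_stat(double y, double u, double v, double all,
                              ImageStat *s) {
  s->stat[Y] += y;                /* per-plane running totals */
  s->stat[U] += u;
  s->stat[V] += v;
  s->stat[ALL] += all;            /* composite-score total */
  s->worst = MIN(s->worst, all);  /* lower is worse for PSNR/SSIM metrics */
}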
typedef struct VP9_COMP {
QUANTS quants;
- MACROBLOCK mb;
+ ThreadData td;
+ DECLARE_ALIGNED(16, int16_t, y_dequant[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, uv_dequant[QINDEX_RANGE][8]);
VP9_COMMON common;
VP9EncoderConfig oxcf;
struct lookahead_ctx *lookahead;
@@ -239,10 +304,12 @@ typedef struct VP9_COMP {
YV12_BUFFER_CONFIG *unscaled_last_source;
YV12_BUFFER_CONFIG scaled_last_source;
+ TileDataEnc *tile_data;
+
// For a still frame, this flag is set to 1 to skip partition search.
int partition_search_skippable_frame;
- int scaled_ref_idx[3];
+ int scaled_ref_idx[MAX_REF_FRAMES];
int lst_fb_idx;
int gld_fb_idx;
int alt_fb_idx;
@@ -261,11 +328,11 @@ typedef struct VP9_COMP {
YV12_BUFFER_CONFIG last_frame_uf;
- TOKENEXTRA *tok;
+ TOKENEXTRA *tile_tok[4][1 << 6];
unsigned int tok_count[4][1 << 6];
// Ambient reconstruction err target for force key frames
- int ambient_err;
+ int64_t ambient_err;
RD_OPT rd;
@@ -276,9 +343,6 @@ typedef struct VP9_COMP {
int *nmvsadcosts[2];
int *nmvsadcosts_hp[2];
- int zbin_mode_boost;
- int zbin_mode_boost_enabled;
-
int64_t last_time_stamp_seen;
int64_t last_end_time_stamp_seen;
int64_t first_time_stamp_ever;
@@ -286,7 +350,6 @@ typedef struct VP9_COMP {
RATE_CONTROL rc;
double framerate;
- vp9_coeff_count coef_counts[TX_SIZES][PLANE_TYPES];
int interp_filter_selected[MAX_REF_FRAMES][SWITCHABLE];
struct vpx_codec_pkt_list *output_pkt_list;
@@ -301,6 +364,8 @@ typedef struct VP9_COMP {
unsigned int max_mv_magnitude;
int mv_step_param;
+ int allow_comp_inter_inter;
+
// Default value is 1. From first pass stats, encode_breakout may be disabled.
ENCODE_BREAKOUT_TYPE allow_encode_breakout;
@@ -313,13 +378,11 @@ typedef struct VP9_COMP {
  // segment threshold for encode breakout
int segment_encode_breakout[MAX_SEGMENTS];
- unsigned char *complexity_map;
-
CYCLIC_REFRESH *cyclic_refresh;
+ ActiveMap active_map;
fractional_mv_step_fp *find_fractional_mv_step;
vp9_full_search_fn_t full_search_sad;
- vp9_refining_search_fn_t refining_search_sad;
vp9_diamond_search_fn_t diamond_search_sad;
vp9_variance_fn_ptr_t fn_ptr[BLOCK_SIZES];
uint64_t time_receive_data;
@@ -340,19 +403,16 @@ typedef struct VP9_COMP {
unsigned int mode_chosen_counts[MAX_MODES];
int count;
- double total_y;
- double total_u;
- double total_v;
- double total;
uint64_t total_sq_error;
uint64_t total_samples;
+ ImageStat psnr;
- double totalp_y;
- double totalp_u;
- double totalp_v;
- double totalp;
uint64_t totalp_sq_error;
uint64_t totalp_samples;
+ ImageStat psnrp;
+
+ double total_blockiness;
+ double worst_blockiness;
int bytes;
double summed_quality;
@@ -360,14 +420,21 @@ typedef struct VP9_COMP {
double summedp_quality;
double summedp_weights;
unsigned int tot_recode_hits;
+ double worst_ssim;
-
- double total_ssimg_y;
- double total_ssimg_u;
- double total_ssimg_v;
- double total_ssimg_all;
+ ImageStat ssimg;
+ ImageStat fastssim;
+ ImageStat psnrhvs;
int b_calculate_ssimg;
+ int b_calculate_blockiness;
+
+ int b_calculate_consistency;
+
+ double total_inconsistency;
+ double worst_consistency;
+ Ssimv *ssim_vars;
+ Metrics metrics;
#endif
int b_calculate_psnr;
@@ -375,6 +442,10 @@ typedef struct VP9_COMP {
int initial_width;
int initial_height;
+ int initial_mbs; // Number of MBs in the full-size frame; to be used to
+ // normalize the firstpass stats. This will differ from the
+ // number of MBs in the current frame when the frame is
+ // scaled.
int use_svc;
@@ -395,10 +466,6 @@ typedef struct VP9_COMP {
int intra_uv_mode_cost[FRAME_TYPES][INTRA_MODES];
int y_mode_costs[INTRA_MODES][INTRA_MODES][INTRA_MODES];
int switchable_interp_costs[SWITCHABLE_FILTER_CONTEXTS][SWITCHABLE_FILTERS];
-
- PICK_MODE_CONTEXT *leaf_tree;
- PC_TREE *pc_tree;
- PC_TREE *pc_root;
int partition_cost[PARTITION_CONTEXTS][PARTITION_TYPES];
int multi_arf_allowed;
@@ -408,11 +475,28 @@ typedef struct VP9_COMP {
#if CONFIG_VP9_TEMPORAL_DENOISING
VP9_DENOISER denoiser;
#endif
+
+ int resize_pending;
+
+ // VAR_BASED_PARTITION thresholds
+ // 0 - threshold_64x64; 1 - threshold_32x32;
+ // 2 - threshold_16x16; 3 - vbp_threshold_8x8;
+ int64_t vbp_thresholds[4];
+ int64_t vbp_threshold_minmax;
+ int64_t vbp_threshold_sad;
+ BLOCK_SIZE vbp_bsize_min;
+
+ // Multi-threading
+ int num_workers;
+ VP9Worker *workers;
+ struct EncWorkerData *tile_thr_data;
+ VP9LfSync lf_row_sync;
} VP9_COMP;
-void vp9_initialize_enc();
+void vp9_initialize_enc(void);
-struct VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf);
+struct VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf,
+ BufferPool *const pool);
void vp9_remove_compressor(VP9_COMP *cpi);
void vp9_change_config(VP9_COMP *cpi, const VP9EncoderConfig *oxcf);
@@ -444,6 +528,8 @@ int vp9_update_entropy(VP9_COMP *cpi, int update);
int vp9_set_active_map(VP9_COMP *cpi, unsigned char *map, int rows, int cols);
+int vp9_get_active_map(VP9_COMP *cpi, unsigned char *map, int rows, int cols);
+
int vp9_set_internal_size(VP9_COMP *cpi,
VPX_SCALING horiz_mode, VPX_SCALING vert_mode);
@@ -454,8 +540,14 @@ void vp9_set_svc(VP9_COMP *cpi, int use_svc);
int vp9_get_quantizer(struct VP9_COMP *cpi);
-static INLINE int get_ref_frame_idx(const VP9_COMP *cpi,
- MV_REFERENCE_FRAME ref_frame) {
+static INLINE int frame_is_kf_gf_arf(const VP9_COMP *cpi) {
+ return frame_is_intra_only(&cpi->common) ||
+ cpi->refresh_alt_ref_frame ||
+ (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref);
+}
+
+static INLINE int get_ref_frame_map_idx(const VP9_COMP *cpi,
+ MV_REFERENCE_FRAME ref_frame) {
if (ref_frame == LAST_FRAME) {
return cpi->lst_fb_idx;
} else if (ref_frame == GOLDEN_FRAME) {
@@ -465,11 +557,19 @@ static INLINE int get_ref_frame_idx(const VP9_COMP *cpi,
}
}
+static INLINE int get_ref_frame_buf_idx(const VP9_COMP *const cpi,
+ int ref_frame) {
+ const VP9_COMMON *const cm = &cpi->common;
+ const int map_idx = get_ref_frame_map_idx(cpi, ref_frame);
+ return (map_idx != INVALID_IDX) ? cm->ref_frame_map[map_idx] : INVALID_IDX;
+}
+
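/* Lookup chain formalized by these helpers (summary of the code in this
 * hunk): ref_frame (LAST/GOLDEN/ALTREF)
 *   -> get_ref_frame_map_idx(): slot in cm->ref_frame_map, selected by
 *      cpi->lst_fb_idx / gld_fb_idx / alt_fb_idx
 *   -> get_ref_frame_buf_idx(): index into cm->buffer_pool->frame_bufs
 *   -> get_ref_frame_buffer() (below): the YV12 buffer itself, or NULL when
 *      either step returns INVALID_IDX. */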
static INLINE YV12_BUFFER_CONFIG *get_ref_frame_buffer(
VP9_COMP *cpi, MV_REFERENCE_FRAME ref_frame) {
- VP9_COMMON * const cm = &cpi->common;
- return &cm->frame_bufs[cm->ref_frame_map[get_ref_frame_idx(cpi, ref_frame)]]
- .buf;
+ VP9_COMMON *const cm = &cpi->common;
+ const int buf_idx = get_ref_frame_buf_idx(cpi, ref_frame);
+ return
+ buf_idx != INVALID_IDX ? &cm->buffer_pool->frame_bufs[buf_idx].buf : NULL;
}
static INLINE int get_token_alloc(int mb_rows, int mb_cols) {
@@ -490,11 +590,10 @@ static INLINE int allocated_tokens(TileInfo tile) {
return get_token_alloc(tile_mb_rows, tile_mb_cols);
}
-int vp9_get_y_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b);
+int64_t vp9_get_y_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b);
#if CONFIG_VP9_HIGHBITDEPTH
-int vp9_highbd_get_y_sse(const YV12_BUFFER_CONFIG *a,
- const YV12_BUFFER_CONFIG *b,
- vpx_bit_depth_t bit_depth);
+int64_t vp9_highbd_get_y_sse(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b);
#endif // CONFIG_VP9_HIGHBITDEPTH
void vp9_alloc_compressor_data(VP9_COMP *cpi);
@@ -513,16 +612,15 @@ void vp9_apply_encoding_flags(VP9_COMP *cpi, vpx_enc_frame_flags_t flags);
static INLINE int is_two_pass_svc(const struct VP9_COMP *const cpi) {
return cpi->use_svc &&
- (cpi->svc.number_temporal_layers > 1 ||
- cpi->svc.number_spatial_layers > 1) &&
- (cpi->oxcf.pass == 1 || cpi->oxcf.pass == 2);
+ ((cpi->svc.number_spatial_layers > 1) ||
+ (cpi->svc.number_temporal_layers > 1 && cpi->oxcf.pass != 0));
}
static INLINE int is_altref_enabled(const VP9_COMP *const cpi) {
return cpi->oxcf.mode != REALTIME && cpi->oxcf.lag_in_frames > 0 &&
- (cpi->oxcf.play_alternate &&
+ (cpi->oxcf.enable_auto_arf &&
(!is_two_pass_svc(cpi) ||
- cpi->oxcf.ss_play_alternate[cpi->svc.spatial_layer_id]));
+ cpi->oxcf.ss_enable_auto_arf[cpi->svc.spatial_layer_id]));
}
static INLINE void set_ref_ptrs(VP9_COMMON *cm, MACROBLOCKD *xd,
@@ -542,6 +640,8 @@ static INLINE int *cond_cost_list(const struct VP9_COMP *cpi, int *cost_list) {
return cpi->sf.mv.subpel_search_method != SUBPEL_TREE ? cost_list : NULL;
}
+void vp9_new_framerate(VP9_COMP *cpi, double framerate);
+
#ifdef __cplusplus
} // extern "C"
#endif
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ethread.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ethread.c
new file mode 100644
index 00000000000..8700ccdaecd
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ethread.c
@@ -0,0 +1,179 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vp9/encoder/vp9_encodeframe.h"
+#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_ethread.h"
+
+static void accumulate_rd_opt(ThreadData *td, ThreadData *td_t) {
+ int i, j, k, l, m, n;
+
+ for (i = 0; i < REFERENCE_MODES; i++)
+ td->rd_counts.comp_pred_diff[i] += td_t->rd_counts.comp_pred_diff[i];
+
+ for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
+ td->rd_counts.filter_diff[i] += td_t->rd_counts.filter_diff[i];
+
+ for (i = 0; i < TX_MODES; i++)
+ td->rd_counts.tx_select_diff[i] += td_t->rd_counts.tx_select_diff[i];
+
+ for (i = 0; i < TX_SIZES; i++)
+ for (j = 0; j < PLANE_TYPES; j++)
+ for (k = 0; k < REF_TYPES; k++)
+ for (l = 0; l < COEF_BANDS; l++)
+ for (m = 0; m < COEFF_CONTEXTS; m++)
+ for (n = 0; n < ENTROPY_TOKENS; n++)
+ td->rd_counts.coef_counts[i][j][k][l][m][n] +=
+ td_t->rd_counts.coef_counts[i][j][k][l][m][n];
+}
+
+static int enc_worker_hook(EncWorkerData *const thread_data, void *unused) {
+ VP9_COMP *const cpi = thread_data->cpi;
+ const VP9_COMMON *const cm = &cpi->common;
+ const int tile_cols = 1 << cm->log2_tile_cols;
+ const int tile_rows = 1 << cm->log2_tile_rows;
+ int t;
+
+ (void) unused;
+
+ for (t = thread_data->start; t < tile_rows * tile_cols;
+ t += cpi->num_workers) {
+ int tile_row = t / tile_cols;
+ int tile_col = t % tile_cols;
+
+ vp9_encode_tile(cpi, thread_data->td, tile_row, tile_col);
+ }
+
+ return 0;
+}
+
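/* Illustration (not part of the patch): with tile_rows * tile_cols == 4 and
 * cpi->num_workers == 2, worker 0 (start == 0) encodes tiles 0 and 2 while
 * worker 1 (start == 1) encodes tiles 1 and 3; each worker advances the
 * tile index by num_workers, so the tiles are shared round-robin however
 * the counts divide. */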
+void vp9_encode_tiles_mt(VP9_COMP *cpi) {
+ VP9_COMMON *const cm = &cpi->common;
+ const int tile_cols = 1 << cm->log2_tile_cols;
+ const VP9WorkerInterface *const winterface = vp9_get_worker_interface();
+ const int num_workers = MIN(cpi->oxcf.max_threads, tile_cols);
+ int i;
+
+ vp9_init_tile_data(cpi);
+
+ // Only run once to create threads and allocate thread data.
+ if (cpi->num_workers == 0) {
+ CHECK_MEM_ERROR(cm, cpi->workers,
+ vpx_malloc(num_workers * sizeof(*cpi->workers)));
+
+ CHECK_MEM_ERROR(cm, cpi->tile_thr_data,
+ vpx_calloc(num_workers, sizeof(*cpi->tile_thr_data)));
+
+ for (i = 0; i < num_workers; i++) {
+ VP9Worker *const worker = &cpi->workers[i];
+ EncWorkerData *thread_data = &cpi->tile_thr_data[i];
+
+ ++cpi->num_workers;
+ winterface->init(worker);
+
+ if (i < num_workers - 1) {
+ thread_data->cpi = cpi;
+
+ // Allocate thread data.
+ CHECK_MEM_ERROR(cm, thread_data->td,
+ vpx_memalign(32, sizeof(*thread_data->td)));
+ vp9_zero(*thread_data->td);
+
+ // Set up pc_tree.
+ thread_data->td->leaf_tree = NULL;
+ thread_data->td->pc_tree = NULL;
+ vp9_setup_pc_tree(cm, thread_data->td);
+
+ // Allocate frame counters in thread data.
+ CHECK_MEM_ERROR(cm, thread_data->td->counts,
+ vpx_calloc(1, sizeof(*thread_data->td->counts)));
+
+ // Create threads
+ if (!winterface->reset(worker))
+ vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
+ "Tile encoder thread creation failed");
+ } else {
+ // Main thread acts as a worker and uses the thread data in cpi.
+ thread_data->cpi = cpi;
+ thread_data->td = &cpi->td;
+ }
+
+ winterface->sync(worker);
+ }
+ }
+
+ for (i = 0; i < num_workers; i++) {
+ VP9Worker *const worker = &cpi->workers[i];
+ EncWorkerData *thread_data;
+
+ worker->hook = (VP9WorkerHook)enc_worker_hook;
+ worker->data1 = &cpi->tile_thr_data[i];
+ worker->data2 = NULL;
+ thread_data = (EncWorkerData*)worker->data1;
+
+ // Before encoding a frame, copy the thread data from cpi.
+ if (thread_data->td != &cpi->td) {
+ thread_data->td->mb = cpi->td.mb;
+ thread_data->td->rd_counts = cpi->td.rd_counts;
+ }
+ if (thread_data->td->counts != &cpi->common.counts) {
+ memcpy(thread_data->td->counts, &cpi->common.counts,
+ sizeof(cpi->common.counts));
+ }
+
+ // Handle use_nonrd_pick_mode case.
+ if (cpi->sf.use_nonrd_pick_mode) {
+ MACROBLOCK *const x = &thread_data->td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ struct macroblock_plane *const p = x->plane;
+ struct macroblockd_plane *const pd = xd->plane;
+ PICK_MODE_CONTEXT *ctx = &thread_data->td->pc_root->none;
+ int j;
+
+ for (j = 0; j < MAX_MB_PLANE; ++j) {
+ p[j].coeff = ctx->coeff_pbuf[j][0];
+ p[j].qcoeff = ctx->qcoeff_pbuf[j][0];
+ pd[j].dqcoeff = ctx->dqcoeff_pbuf[j][0];
+ p[j].eobs = ctx->eobs_pbuf[j][0];
+ }
+ }
+ }
+
+ // Encode a frame
+ for (i = 0; i < num_workers; i++) {
+ VP9Worker *const worker = &cpi->workers[i];
+ EncWorkerData *const thread_data = (EncWorkerData*)worker->data1;
+
+ // Set the starting tile for each thread.
+ thread_data->start = i;
+
+ if (i == num_workers - 1)
+ winterface->execute(worker);
+ else
+ winterface->launch(worker);
+ }
+
+ // Encoding ends.
+ for (i = 0; i < num_workers; i++) {
+ VP9Worker *const worker = &cpi->workers[i];
+ winterface->sync(worker);
+ }
+
+ for (i = 0; i < num_workers; i++) {
+ VP9Worker *const worker = &cpi->workers[i];
+ EncWorkerData *const thread_data = (EncWorkerData*)worker->data1;
+
+ // Accumulate counters.
+ if (i < num_workers - 1) {
+ vp9_accumulate_frame_counts(cm, thread_data->td->counts, 0);
+ accumulate_rd_opt(&cpi->td, thread_data->td);
+ }
+ }
+}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ethread.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ethread.h
new file mode 100644
index 00000000000..e87c50bc712
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ethread.h
@@ -0,0 +1,25 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_ENCODER_VP9_ETHREAD_H_
+#define VP9_ENCODER_VP9_ETHREAD_H_
+
+struct VP9_COMP;
+struct ThreadData;
+
+typedef struct EncWorkerData {
+ struct VP9_COMP *cpi;
+ struct ThreadData *td;
+ int start;
+} EncWorkerData;
+
+void vp9_encode_tiles_mt(struct VP9_COMP *cpi);
+
+#endif // VP9_ENCODER_VP9_ETHREAD_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_extend.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_extend.c
index c9b2131426f..96f3598b1dc 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_extend.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_extend.c
@@ -27,9 +27,9 @@ static void copy_and_extend_plane(const uint8_t *src, int src_pitch,
uint8_t *dst_ptr2 = dst + w;
for (i = 0; i < h; i++) {
- vpx_memset(dst_ptr1, src_ptr1[0], extend_left);
- vpx_memcpy(dst_ptr1 + extend_left, src_ptr1, w);
- vpx_memset(dst_ptr2, src_ptr2[0], extend_right);
+ memset(dst_ptr1, src_ptr1[0], extend_left);
+ memcpy(dst_ptr1 + extend_left, src_ptr1, w);
+ memset(dst_ptr2, src_ptr2[0], extend_right);
src_ptr1 += src_pitch;
src_ptr2 += src_pitch;
dst_ptr1 += dst_pitch;
@@ -45,12 +45,12 @@ static void copy_and_extend_plane(const uint8_t *src, int src_pitch,
linesize = extend_left + extend_right + w;
for (i = 0; i < extend_top; i++) {
- vpx_memcpy(dst_ptr1, src_ptr1, linesize);
+ memcpy(dst_ptr1, src_ptr1, linesize);
dst_ptr1 += dst_pitch;
}
for (i = 0; i < extend_bottom; i++) {
- vpx_memcpy(dst_ptr2, src_ptr2, linesize);
+ memcpy(dst_ptr2, src_ptr2, linesize);
dst_ptr2 += dst_pitch;
}
}
@@ -73,7 +73,7 @@ static void highbd_copy_and_extend_plane(const uint8_t *src8, int src_pitch,
for (i = 0; i < h; i++) {
vpx_memset16(dst_ptr1, src_ptr1[0], extend_left);
- vpx_memcpy(dst_ptr1 + extend_left, src_ptr1, w * sizeof(uint16_t));
+ memcpy(dst_ptr1 + extend_left, src_ptr1, w * sizeof(uint16_t));
vpx_memset16(dst_ptr2, src_ptr2[0], extend_right);
src_ptr1 += src_pitch;
src_ptr2 += src_pitch;
@@ -90,12 +90,12 @@ static void highbd_copy_and_extend_plane(const uint8_t *src8, int src_pitch,
linesize = extend_left + extend_right + w;
for (i = 0; i < extend_top; i++) {
- vpx_memcpy(dst_ptr1, src_ptr1, linesize * sizeof(uint16_t));
+ memcpy(dst_ptr1, src_ptr1, linesize * sizeof(uint16_t));
dst_ptr1 += dst_pitch;
}
for (i = 0; i < extend_bottom; i++) {
- vpx_memcpy(dst_ptr2, src_ptr2, linesize * sizeof(uint16_t));
+ memcpy(dst_ptr2, src_ptr2, linesize * sizeof(uint16_t));
dst_ptr2 += dst_pitch;
}
}
@@ -110,9 +110,9 @@ void vp9_copy_and_extend_frame(const YV12_BUFFER_CONFIG *src,
// Motion estimation may use src block variance with the block size up
// to 64x64, so the right and bottom need to be extended to 64 multiple
// or up to 16, whichever is greater.
-  const int eb_y = MAX(src->y_width + 16, ALIGN_POWER_OF_TWO(src->y_width, 6))
-                   - src->y_crop_width;
-  const int er_y = MAX(src->y_height + 16, ALIGN_POWER_OF_TWO(src->y_height, 6))
-                   - src->y_crop_height;
+  const int er_y = MAX(src->y_width + 16, ALIGN_POWER_OF_TWO(src->y_width, 6))
+                   - src->y_crop_width;
+  const int eb_y = MAX(src->y_height + 16, ALIGN_POWER_OF_TWO(src->y_height, 6))
+                   - src->y_crop_height;
const int uv_width_subsampling = (src->uv_width != src->y_width);
const int uv_height_subsampling = (src->uv_height != src->y_height);
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_fastssim.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_fastssim.c
new file mode 100644
index 00000000000..f1d408cbe7b
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_fastssim.c
@@ -0,0 +1,465 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ *
+ * This code was originally written by: Nathan E. Egge, at the Daala
+ * project.
+ */
+#include <math.h>
+#include <string.h>
+#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+#include "vp9/encoder/vp9_ssim.h"
+/* TODO(jbb): High bit depth version of this code needed */
+typedef struct fs_level fs_level;
+typedef struct fs_ctx fs_ctx;
+
+#define SSIM_C1 (255 * 255 * 0.01 * 0.01)
+#define SSIM_C2 (255 * 255 * 0.03 * 0.03)
+
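/* These are the standard SSIM stabilization constants of Wang et al. (2004):
 * C1 = (K1 * L)^2 and C2 = (K2 * L)^2 with K1 = 0.01, K2 = 0.03 and dynamic
 * range L = 255. They keep the per-window ratio
 *   SSIM(x, y) = ((2*mu_x*mu_y + C1) * (2*sigma_xy + C2))
 *              / ((mu_x^2 + mu_y^2 + C1) * (sigma_x^2 + sigma_y^2 + C2))
 * well defined when the means or variances approach zero. */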
+#define FS_MINI(_a, _b) ((_a) < (_b) ? (_a) : (_b))
+#define FS_MAXI(_a, _b) ((_a) > (_b) ? (_a) : (_b))
+
+struct fs_level {
+ uint16_t *im1;
+ uint16_t *im2;
+ double *ssim;
+ int w;
+ int h;
+};
+
+struct fs_ctx {
+ fs_level *level;
+ int nlevels;
+ unsigned *col_buf;
+};
+
+static void fs_ctx_init(fs_ctx *_ctx, int _w, int _h, int _nlevels) {
+ unsigned char *data;
+ size_t data_size;
+ int lw;
+ int lh;
+ int l;
+ lw = (_w + 1) >> 1;
+ lh = (_h + 1) >> 1;
+ data_size = _nlevels * sizeof(fs_level)
+ + 2 * (lw + 8) * 8 * sizeof(*_ctx->col_buf);
+ for (l = 0; l < _nlevels; l++) {
+ size_t im_size;
+ size_t level_size;
+ im_size = lw * (size_t) lh;
+ level_size = 2 * im_size * sizeof(*_ctx->level[l].im1);
+ level_size += sizeof(*_ctx->level[l].ssim) - 1;
+ level_size /= sizeof(*_ctx->level[l].ssim);
+ level_size += im_size;
+ level_size *= sizeof(*_ctx->level[l].ssim);
+ data_size += level_size;
+ lw = (lw + 1) >> 1;
+ lh = (lh + 1) >> 1;
+ }
+ data = (unsigned char *) malloc(data_size);
+ _ctx->level = (fs_level *) data;
+ _ctx->nlevels = _nlevels;
+ data += _nlevels * sizeof(*_ctx->level);
+ lw = (_w + 1) >> 1;
+ lh = (_h + 1) >> 1;
+ for (l = 0; l < _nlevels; l++) {
+ size_t im_size;
+ size_t level_size;
+ _ctx->level[l].w = lw;
+ _ctx->level[l].h = lh;
+ im_size = lw * (size_t) lh;
+ level_size = 2 * im_size * sizeof(*_ctx->level[l].im1);
+ level_size += sizeof(*_ctx->level[l].ssim) - 1;
+ level_size /= sizeof(*_ctx->level[l].ssim);
+ level_size *= sizeof(*_ctx->level[l].ssim);
+ _ctx->level[l].im1 = (uint16_t *) data;
+ _ctx->level[l].im2 = _ctx->level[l].im1 + im_size;
+ data += level_size;
+ _ctx->level[l].ssim = (double *) data;
+ data += im_size * sizeof(*_ctx->level[l].ssim);
+ lw = (lw + 1) >> 1;
+ lh = (lh + 1) >> 1;
+ }
+ _ctx->col_buf = (unsigned *) data;
+}
+
+static void fs_ctx_clear(fs_ctx *_ctx) {
+ free(_ctx->level);
+}
+
+static void fs_downsample_level(fs_ctx *_ctx, int _l) {
+ const uint16_t *src1;
+ const uint16_t *src2;
+ uint16_t *dst1;
+ uint16_t *dst2;
+ int w2;
+ int h2;
+ int w;
+ int h;
+ int i;
+ int j;
+ w = _ctx->level[_l].w;
+ h = _ctx->level[_l].h;
+ dst1 = _ctx->level[_l].im1;
+ dst2 = _ctx->level[_l].im2;
+ w2 = _ctx->level[_l - 1].w;
+ h2 = _ctx->level[_l - 1].h;
+ src1 = _ctx->level[_l - 1].im1;
+ src2 = _ctx->level[_l - 1].im2;
+ for (j = 0; j < h; j++) {
+ int j0offs;
+ int j1offs;
+ j0offs = 2 * j * w2;
+ j1offs = FS_MINI(2 * j + 1, h2) * w2;
+ for (i = 0; i < w; i++) {
+ int i0;
+ int i1;
+ i0 = 2 * i;
+ i1 = FS_MINI(i0 + 1, w2);
+ dst1[j * w + i] = src1[j0offs + i0] + src1[j0offs + i1]
+ + src1[j1offs + i0] + src1[j1offs + i1];
+ dst2[j * w + i] = src2[j0offs + i0] + src2[j0offs + i1]
+ + src2[j1offs + i0] + src2[j1offs + i1];
+ }
+ }
+}
+
+static void fs_downsample_level0(fs_ctx *_ctx, const unsigned char *_src1,
+ int _s1ystride, const unsigned char *_src2,
+ int _s2ystride, int _w, int _h) {
+ uint16_t *dst1;
+ uint16_t *dst2;
+ int w;
+ int h;
+ int i;
+ int j;
+ w = _ctx->level[0].w;
+ h = _ctx->level[0].h;
+ dst1 = _ctx->level[0].im1;
+ dst2 = _ctx->level[0].im2;
+ for (j = 0; j < h; j++) {
+ int j0;
+ int j1;
+ j0 = 2 * j;
+ j1 = FS_MINI(j0 + 1, _h);
+ for (i = 0; i < w; i++) {
+ int i0;
+ int i1;
+ i0 = 2 * i;
+ i1 = FS_MINI(i0 + 1, _w);
+ dst1[j * w + i] = _src1[j0 * _s1ystride + i0]
+ + _src1[j0 * _s1ystride + i1] + _src1[j1 * _s1ystride + i0]
+ + _src1[j1 * _s1ystride + i1];
+ dst2[j * w + i] = _src2[j0 * _s2ystride + i0]
+ + _src2[j0 * _s2ystride + i1] + _src2[j1 * _s2ystride + i0]
+ + _src2[j1 * _s2ystride + i1];
+ }
+ }
+}
+
+static void fs_apply_luminance(fs_ctx *_ctx, int _l) {
+ unsigned *col_sums_x;
+ unsigned *col_sums_y;
+ uint16_t *im1;
+ uint16_t *im2;
+ double *ssim;
+ double c1;
+ int w;
+ int h;
+ int j0offs;
+ int j1offs;
+ int i;
+ int j;
+ w = _ctx->level[_l].w;
+ h = _ctx->level[_l].h;
+ col_sums_x = _ctx->col_buf;
+ col_sums_y = col_sums_x + w;
+ im1 = _ctx->level[_l].im1;
+ im2 = _ctx->level[_l].im2;
+ for (i = 0; i < w; i++)
+ col_sums_x[i] = 5 * im1[i];
+ for (i = 0; i < w; i++)
+ col_sums_y[i] = 5 * im2[i];
+ for (j = 1; j < 4; j++) {
+ j1offs = FS_MINI(j, h - 1) * w;
+ for (i = 0; i < w; i++)
+ col_sums_x[i] += im1[j1offs + i];
+ for (i = 0; i < w; i++)
+ col_sums_y[i] += im2[j1offs + i];
+ }
+ ssim = _ctx->level[_l].ssim;
+ c1 = (double) (SSIM_C1 * 4096 * (1 << 4 * _l));
+ for (j = 0; j < h; j++) {
+ unsigned mux;
+ unsigned muy;
+ int i0;
+ int i1;
+ mux = 5 * col_sums_x[0];
+ muy = 5 * col_sums_y[0];
+ for (i = 1; i < 4; i++) {
+ i1 = FS_MINI(i, w - 1);
+ mux += col_sums_x[i1];
+ muy += col_sums_y[i1];
+ }
+ for (i = 0; i < w; i++) {
+ ssim[j * w + i] *= (2 * mux * (double) muy + c1)
+ / (mux * (double) mux + muy * (double) muy + c1);
+ if (i + 1 < w) {
+ i0 = FS_MAXI(0, i - 4);
+ i1 = FS_MINI(i + 4, w - 1);
+ mux += col_sums_x[i1] - col_sums_x[i0];
+        muy += col_sums_y[i1] - col_sums_y[i0];
+ }
+ }
+ if (j + 1 < h) {
+ j0offs = FS_MAXI(0, j - 4) * w;
+ for (i = 0; i < w; i++)
+ col_sums_x[i] -= im1[j0offs + i];
+ for (i = 0; i < w; i++)
+ col_sums_y[i] -= im2[j0offs + i];
+ j1offs = FS_MINI(j + 4, h - 1) * w;
+ for (i = 0; i < w; i++)
+ col_sums_x[i] += im1[j1offs + i];
+ for (i = 0; i < w; i++)
+ col_sums_y[i] += im2[j1offs + i];
+ }
+ }
+}
+
+#define FS_COL_SET(_col, _joffs, _ioffs) \
+ do { \
+ unsigned gx; \
+ unsigned gy; \
+ gx = gx_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \
+ gy = gy_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \
+ col_sums_gx2[(_col)] = gx * (double)gx; \
+ col_sums_gy2[(_col)] = gy * (double)gy; \
+ col_sums_gxgy[(_col)] = gx * (double)gy; \
+ } \
+ while (0)
+
+#define FS_COL_ADD(_col, _joffs, _ioffs) \
+ do { \
+ unsigned gx; \
+ unsigned gy; \
+ gx = gx_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \
+ gy = gy_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \
+ col_sums_gx2[(_col)] += gx * (double)gx; \
+ col_sums_gy2[(_col)] += gy * (double)gy; \
+ col_sums_gxgy[(_col)] += gx * (double)gy; \
+ } \
+ while (0)
+
+#define FS_COL_SUB(_col, _joffs, _ioffs) \
+ do { \
+ unsigned gx; \
+ unsigned gy; \
+ gx = gx_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \
+ gy = gy_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \
+ col_sums_gx2[(_col)] -= gx * (double)gx; \
+ col_sums_gy2[(_col)] -= gy * (double)gy; \
+ col_sums_gxgy[(_col)] -= gx * (double)gy; \
+ } \
+ while (0)
+
+#define FS_COL_COPY(_col1, _col2) \
+ do { \
+ col_sums_gx2[(_col1)] = col_sums_gx2[(_col2)]; \
+ col_sums_gy2[(_col1)] = col_sums_gy2[(_col2)]; \
+ col_sums_gxgy[(_col1)] = col_sums_gxgy[(_col2)]; \
+ } \
+ while (0)
+
+#define FS_COL_HALVE(_col1, _col2) \
+ do { \
+ col_sums_gx2[(_col1)] = col_sums_gx2[(_col2)] * 0.5; \
+ col_sums_gy2[(_col1)] = col_sums_gy2[(_col2)] * 0.5; \
+ col_sums_gxgy[(_col1)] = col_sums_gxgy[(_col2)] * 0.5; \
+ } \
+ while (0)
+
+#define FS_COL_DOUBLE(_col1, _col2) \
+ do { \
+ col_sums_gx2[(_col1)] = col_sums_gx2[(_col2)] * 2; \
+ col_sums_gy2[(_col1)] = col_sums_gy2[(_col2)] * 2; \
+ col_sums_gxgy[(_col1)] = col_sums_gxgy[(_col2)] * 2; \
+ } \
+ while (0)
+
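/* Note on the macros above: they maintain eight running column sums of the
 * gradient products gx*gx, gy*gy and gx*gy taken from the ring-buffered rows
 * gx_buf/gy_buf. SET initializes a column, ADD/SUB slide it vertically, and
 * COPY/HALVE/DOUBLE derive one column's sums from a neighbor's, which lets
 * fs_calc_structure() below advance its weighted 8x8 window one pixel at a
 * time without recomputing it from scratch. */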
+static void fs_calc_structure(fs_ctx *_ctx, int _l) {
+ uint16_t *im1;
+ uint16_t *im2;
+ unsigned *gx_buf;
+ unsigned *gy_buf;
+ double *ssim;
+ double col_sums_gx2[8];
+ double col_sums_gy2[8];
+ double col_sums_gxgy[8];
+ double c2;
+ int stride;
+ int w;
+ int h;
+ int i;
+ int j;
+ w = _ctx->level[_l].w;
+ h = _ctx->level[_l].h;
+ im1 = _ctx->level[_l].im1;
+ im2 = _ctx->level[_l].im2;
+ ssim = _ctx->level[_l].ssim;
+ gx_buf = _ctx->col_buf;
+ stride = w + 8;
+ gy_buf = gx_buf + 8 * stride;
+ memset(gx_buf, 0, 2 * 8 * stride * sizeof(*gx_buf));
+ c2 = SSIM_C2 * (1 << 4 * _l) * 16 * 104;
+ for (j = 0; j < h + 4; j++) {
+ if (j < h - 1) {
+ for (i = 0; i < w - 1; i++) {
+ unsigned g1;
+ unsigned g2;
+ unsigned gx;
+ unsigned gy;
+ g1 = abs(im1[(j + 1) * w + i + 1] - im1[j * w + i]);
+ g2 = abs(im1[(j + 1) * w + i] - im1[j * w + i + 1]);
+ gx = 4 * FS_MAXI(g1, g2) + FS_MINI(g1, g2);
+ g1 = abs(im2[(j + 1) * w + i + 1] - im2[j * w + i]);
+ g2 = abs(im2[(j + 1) * w + i] - im2[j * w + i + 1]);
+ gy = 4 * FS_MAXI(g1, g2) + FS_MINI(g1, g2);
+ gx_buf[(j & 7) * stride + i + 4] = gx;
+ gy_buf[(j & 7) * stride + i + 4] = gy;
+ }
+ } else {
+ memset(gx_buf + (j & 7) * stride, 0, stride * sizeof(*gx_buf));
+ memset(gy_buf + (j & 7) * stride, 0, stride * sizeof(*gy_buf));
+ }
+ if (j >= 4) {
+ int k;
+ col_sums_gx2[3] = col_sums_gx2[2] = col_sums_gx2[1] = col_sums_gx2[0] = 0;
+ col_sums_gy2[3] = col_sums_gy2[2] = col_sums_gy2[1] = col_sums_gy2[0] = 0;
+ col_sums_gxgy[3] = col_sums_gxgy[2] = col_sums_gxgy[1] =
+ col_sums_gxgy[0] = 0;
+ for (i = 4; i < 8; i++) {
+ FS_COL_SET(i, -1, 0);
+ FS_COL_ADD(i, 0, 0);
+ for (k = 1; k < 8 - i; k++) {
+ FS_COL_DOUBLE(i, i);
+ FS_COL_ADD(i, -k - 1, 0);
+ FS_COL_ADD(i, k, 0);
+ }
+ }
+ for (i = 0; i < w; i++) {
+ double mugx2;
+ double mugy2;
+ double mugxgy;
+ mugx2 = col_sums_gx2[0];
+ for (k = 1; k < 8; k++)
+ mugx2 += col_sums_gx2[k];
+ mugy2 = col_sums_gy2[0];
+ for (k = 1; k < 8; k++)
+ mugy2 += col_sums_gy2[k];
+ mugxgy = col_sums_gxgy[0];
+ for (k = 1; k < 8; k++)
+ mugxgy += col_sums_gxgy[k];
+ ssim[(j - 4) * w + i] = (2 * mugxgy + c2) / (mugx2 + mugy2 + c2);
+ if (i + 1 < w) {
+ FS_COL_SET(0, -1, 1);
+ FS_COL_ADD(0, 0, 1);
+ FS_COL_SUB(2, -3, 2);
+ FS_COL_SUB(2, 2, 2);
+ FS_COL_HALVE(1, 2);
+ FS_COL_SUB(3, -4, 3);
+ FS_COL_SUB(3, 3, 3);
+ FS_COL_HALVE(2, 3);
+ FS_COL_COPY(3, 4);
+ FS_COL_DOUBLE(4, 5);
+ FS_COL_ADD(4, -4, 5);
+ FS_COL_ADD(4, 3, 5);
+ FS_COL_DOUBLE(5, 6);
+ FS_COL_ADD(5, -3, 6);
+ FS_COL_ADD(5, 2, 6);
+ FS_COL_DOUBLE(6, 7);
+ FS_COL_ADD(6, -2, 7);
+ FS_COL_ADD(6, 1, 7);
+ FS_COL_SET(7, -1, 8);
+ FS_COL_ADD(7, 0, 8);
+ }
+ }
+ }
+ }
+}
+
+#define FS_NLEVELS (4)
+
+/*These weights were derived from the default weights found in Wang's original
+ Matlab implementation: {0.0448, 0.2856, 0.3001, 0.2363, 0.1333}.
+ We drop the finest scale and renormalize the rest to sum to 1.*/
+
+static const double FS_WEIGHTS[FS_NLEVELS] = {0.2989654541015625,
+ 0.3141326904296875, 0.2473602294921875, 0.1395416259765625};
+
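/* Numerically (assuming Wang's five-scale MS-SSIM weights {0.0448, 0.2856,
 * 0.3001, 0.2363, 0.1333}): dropping the finest scale (0.0448) leaves a sum
 * of 0.9553, and e.g. 0.2856 / 0.9553 ~= 0.29897 and 0.3001 / 0.9553 ~=
 * 0.31413, matching the table above. */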
+static double fs_average(fs_ctx *_ctx, int _l) {
+ double *ssim;
+ double ret;
+ int w;
+ int h;
+ int i;
+ int j;
+ w = _ctx->level[_l].w;
+ h = _ctx->level[_l].h;
+ ssim = _ctx->level[_l].ssim;
+ ret = 0;
+ for (j = 0; j < h; j++)
+ for (i = 0; i < w; i++)
+ ret += ssim[j * w + i];
+ return pow(ret / (w * h), FS_WEIGHTS[_l]);
+}
+
+static double calc_ssim(const unsigned char *_src, int _systride,
+ const unsigned char *_dst, int _dystride, int _w, int _h) {
+ fs_ctx ctx;
+ double ret;
+ int l;
+ ret = 1;
+ fs_ctx_init(&ctx, _w, _h, FS_NLEVELS);
+ fs_downsample_level0(&ctx, _src, _systride, _dst, _dystride, _w, _h);
+ for (l = 0; l < FS_NLEVELS - 1; l++) {
+ fs_calc_structure(&ctx, l);
+ ret *= fs_average(&ctx, l);
+ fs_downsample_level(&ctx, l + 1);
+ }
+ fs_calc_structure(&ctx, l);
+ fs_apply_luminance(&ctx, l);
+ ret *= fs_average(&ctx, l);
+ fs_ctx_clear(&ctx);
+ return ret;
+}
+
+static double convert_ssim_db(double _ssim, double _weight) {
+ return 10 * (log10(_weight) - log10(_weight - _ssim));
+}
+
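/* For intuition: with _weight = 1.0 this is 10 * log10(1 / (1 - _ssim)); an
 * SSIM of 0.99 maps to 20 dB and 0.999 to 30 dB, so each extra 10 dB is a
 * tenfold reduction in the SSIM error term (illustrative values only, not
 * from the source). */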
+double vp9_calc_fastssim(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest,
+ double *ssim_y, double *ssim_u, double *ssim_v) {
+ double ssimv;
+ vp9_clear_system_state();
+
+ *ssim_y = calc_ssim(source->y_buffer, source->y_stride, dest->y_buffer,
+ dest->y_stride, source->y_crop_width,
+ source->y_crop_height);
+
+ *ssim_u = calc_ssim(source->u_buffer, source->uv_stride, dest->u_buffer,
+ dest->uv_stride, source->uv_crop_width,
+ source->uv_crop_height);
+
+ *ssim_v = calc_ssim(source->v_buffer, source->uv_stride, dest->v_buffer,
+ dest->uv_stride, source->uv_crop_width,
+ source->uv_crop_height);
+ ssimv = (*ssim_y) * .8 + .1 * ((*ssim_u) + (*ssim_v));
+
+ return convert_ssim_db(ssimv, 1.0);
+}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_firstpass.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_firstpass.c
index f1baf8323af..9752668b15d 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_firstpass.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_firstpass.c
@@ -38,6 +38,8 @@
#define OUTPUT_FPF 0
#define ARF_STATS_OUTPUT 0
+#define GROUP_ADAPTIVE_MAXQ 1
+
#define BOOST_BREAKOUT 12.5
#define BOOST_FACTOR 12.5
#define ERR_DIVISOR 128.0
@@ -49,10 +51,18 @@
#define KF_MAX_BOOST 128.0
#define MIN_ARF_GF_BOOST 240
#define MIN_DECAY_FACTOR 0.01
-#define MIN_GF_INTERVAL 4
#define MIN_KF_BOOST 300
#define NEW_MV_MODE_PENALTY 32
#define SVC_FACTOR_PT_LOW 0.45
+#define DARK_THRESH 64
+#define DEFAULT_GRP_WEIGHT 1.0
+#define RC_FACTOR_MIN 0.75
+#define RC_FACTOR_MAX 1.75
+
+#define NCOUNT_INTRA_THRESH 8192
+#define NCOUNT_INTRA_FACTOR 3
+#define NCOUNT_FRAME_II_THRESH 5.0
#define DOUBLE_DIVIDE_CHECK(x) ((x) < 0 ? (x) - 0.000001 : (x) + 0.000001)
@@ -60,12 +70,6 @@
unsigned int arf_count = 0;
#endif
-static void swap_yv12(YV12_BUFFER_CONFIG *a, YV12_BUFFER_CONFIG *b) {
- YV12_BUFFER_CONFIG temp = *a;
- *a = *b;
- *b = temp;
-}
-
// Resets the first pass file to the given position using a relative seek from
// the current position.
static void reset_fpf_position(TWO_PASS *p,
@@ -106,10 +110,11 @@ static void output_stats(FIRSTPASS_STATS *stats,
FILE *fpfile;
fpfile = fopen("firstpass.stt", "a");
- fprintf(fpfile, "%12.0f %12.0f %12.0f %12.0f %12.4f %12.4f"
- "%12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f"
- "%12.0f %12.0f %12.4f %12.0f %12.0f %12.4f\n",
+ fprintf(fpfile, "%12.0lf %12.4lf %12.0lf %12.0lf %12.0lf %12.4lf %12.4lf"
+ "%12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.4lf"
+ "%12.4lf %12.0lf %12.0lf %12.0lf %12.4lf\n",
stats->frame,
+ stats->weight,
stats->intra_error,
stats->coded_error,
stats->sr_coded_error,
@@ -138,13 +143,14 @@ static void output_fpmb_stats(uint8_t *this_frame_mb_stats, VP9_COMMON *cm,
struct vpx_codec_cx_pkt pkt;
pkt.kind = VPX_CODEC_FPMB_STATS_PKT;
pkt.data.firstpass_mb_stats.buf = this_frame_mb_stats;
- pkt.data.firstpass_mb_stats.sz = cm->MBs * sizeof(uint8_t);
+ pkt.data.firstpass_mb_stats.sz = cm->initial_mbs * sizeof(uint8_t);
vpx_codec_pkt_list_add(pktlist, &pkt);
}
#endif
static void zero_stats(FIRSTPASS_STATS *section) {
- section->frame = 0.0;
+ section->frame = 0.0;
+ section->weight = 0.0;
section->intra_error = 0.0;
section->coded_error = 0.0;
section->sr_coded_error = 0.0;
@@ -168,6 +174,7 @@ static void zero_stats(FIRSTPASS_STATS *section) {
static void accumulate_stats(FIRSTPASS_STATS *section,
const FIRSTPASS_STATS *frame) {
section->frame += frame->frame;
+ section->weight += frame->weight;
section->spatial_layer_id = frame->spatial_layer_id;
section->intra_error += frame->intra_error;
section->coded_error += frame->coded_error;
@@ -191,6 +198,7 @@ static void accumulate_stats(FIRSTPASS_STATS *section,
static void subtract_stats(FIRSTPASS_STATS *section,
const FIRSTPASS_STATS *frame) {
section->frame -= frame->frame;
+ section->weight -= frame->weight;
section->intra_error -= frame->intra_error;
section->coded_error -= frame->coded_error;
section->sr_coded_error -= frame->sr_coded_error;
@@ -217,10 +225,11 @@ static double calculate_modified_err(const TWO_PASS *twopass,
const VP9EncoderConfig *oxcf,
const FIRSTPASS_STATS *this_frame) {
const FIRSTPASS_STATS *const stats = &twopass->total_stats;
- const double av_err = stats->coded_error / stats->count;
- const double modified_error = av_err *
- pow(this_frame->coded_error / DOUBLE_DIVIDE_CHECK(av_err),
- oxcf->two_pass_vbrbias / 100.0);
+ const double av_weight = stats->weight / stats->count;
+ const double av_err = (stats->coded_error * av_weight) / stats->count;
+ const double modified_error =
+ av_err * pow(this_frame->coded_error * this_frame->weight /
+ DOUBLE_DIVIDE_CHECK(av_err), oxcf->two_pass_vbrbias / 100.0);
return fclamp(modified_error,
twopass->modified_error_min, twopass->modified_error_max);
}
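/* In equation form (b = oxcf->two_pass_vbrbias, result clamped to
 * twopass->modified_error_min/max), the code above computes:
 *   av_err       = (sum_coded_error * av_weight) / count
 *   modified_err = av_err * ((coded_error_f * weight_f) / av_err)^(b / 100)
 * where av_weight is the average first-pass weight and coded_error_f,
 * weight_f are this frame's coded error and weight. */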
@@ -332,9 +341,9 @@ static unsigned int highbd_get_prediction_error(BLOCK_SIZE bsize,
// Refine the motion search range according to the frame dimension
// for first pass test.
-static int get_search_range(const VP9_COMMON *cm) {
+static int get_search_range(const VP9_COMP *cpi) {
int sr = 0;
- const int dim = MIN(cm->width, cm->height);
+ const int dim = MIN(cpi->initial_width, cpi->initial_height);
while ((dim << sr) < MAX_FULL_PEL_VAL)
++sr;
@@ -348,13 +357,13 @@ static void first_pass_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
MV tmp_mv = {0, 0};
MV ref_mv_full = {ref_mv->row >> 3, ref_mv->col >> 3};
int num00, tmp_err, n;
- const BLOCK_SIZE bsize = xd->mi[0].src_mi->mbmi.sb_type;
+ const BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
vp9_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[bsize];
const int new_mv_mode_penalty = NEW_MV_MODE_PENALTY;
int step_param = 3;
int further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param;
- const int sr = get_search_range(&cpi->common);
+ const int sr = get_search_range(cpi);
step_param += sr;
further_steps -= sr;
@@ -444,22 +453,16 @@ static void set_first_pass_params(VP9_COMP *cpi) {
void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
int mb_row, mb_col;
- MACROBLOCK *const x = &cpi->mb;
+ MACROBLOCK *const x = &cpi->td.mb;
VP9_COMMON *const cm = &cpi->common;
MACROBLOCKD *const xd = &x->e_mbd;
TileInfo tile;
struct macroblock_plane *const p = x->plane;
struct macroblockd_plane *const pd = xd->plane;
- const PICK_MODE_CONTEXT *ctx = &cpi->pc_root->none;
+ const PICK_MODE_CONTEXT *ctx = &cpi->td.pc_root->none;
int i;
int recon_yoffset, recon_uvoffset;
- YV12_BUFFER_CONFIG *const lst_yv12 = get_ref_frame_buffer(cpi, LAST_FRAME);
- YV12_BUFFER_CONFIG *gld_yv12 = get_ref_frame_buffer(cpi, GOLDEN_FRAME);
- YV12_BUFFER_CONFIG *const new_yv12 = get_frame_new_buffer(cm);
- int recon_y_stride = lst_yv12->y_stride;
- int recon_uv_stride = lst_yv12->uv_stride;
- int uv_mb_height = 16 >> (lst_yv12->y_height > lst_yv12->uv_height);
int64_t intra_error = 0;
int64_t coded_error = 0;
int64_t sr_coded_error = 0;
@@ -471,24 +474,41 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
int intercount = 0;
int second_ref_count = 0;
const int intrapenalty = INTRA_MODE_PENALTY;
- int neutral_count = 0;
+ double neutral_count;
int new_mv_count = 0;
int sum_in_vectors = 0;
MV lastmv = {0, 0};
TWO_PASS *twopass = &cpi->twopass;
const MV zero_mv = {0, 0};
+ int recon_y_stride, recon_uv_stride, uv_mb_height;
+
+ YV12_BUFFER_CONFIG *const lst_yv12 = get_ref_frame_buffer(cpi, LAST_FRAME);
+ YV12_BUFFER_CONFIG *gld_yv12 = get_ref_frame_buffer(cpi, GOLDEN_FRAME);
+ YV12_BUFFER_CONFIG *const new_yv12 = get_frame_new_buffer(cm);
const YV12_BUFFER_CONFIG *first_ref_buf = lst_yv12;
+
LAYER_CONTEXT *const lc = is_two_pass_svc(cpi) ?
&cpi->svc.layer_context[cpi->svc.spatial_layer_id] : NULL;
+ double intra_factor;
+ double brightness_factor;
+ BufferPool *const pool = cm->buffer_pool;
+
+ // First pass code requires valid last and new frame buffers.
+ assert(new_yv12 != NULL);
+ assert((lc != NULL) || frame_is_intra_only(cm) || (lst_yv12 != NULL));
#if CONFIG_FP_MB_STATS
if (cpi->use_fp_mb_stats) {
- vp9_zero_array(cpi->twopass.frame_mb_stats_buf, cm->MBs);
+ vp9_zero_array(cpi->twopass.frame_mb_stats_buf, cm->initial_mbs);
}
#endif
vp9_clear_system_state();
+ intra_factor = 0.0;
+ brightness_factor = 0.0;
+ neutral_count = 0.0;
+
set_first_pass_params(cpi);
vp9_set_quantizer(cm, find_fp_qindex(cm->bit_depth));
@@ -521,20 +541,14 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
}
if (cpi->ref_frame_flags & VP9_GOLD_FLAG) {
- const int ref_idx =
- cm->ref_frame_map[get_ref_frame_idx(cpi, GOLDEN_FRAME)];
- const int scaled_idx = cpi->scaled_ref_idx[GOLDEN_FRAME - 1];
-
- gld_yv12 = (scaled_idx != ref_idx) ? &cm->frame_bufs[scaled_idx].buf :
- get_ref_frame_buffer(cpi, GOLDEN_FRAME);
+ gld_yv12 = vp9_get_scaled_ref_frame(cpi, GOLDEN_FRAME);
+ if (gld_yv12 == NULL) {
+ gld_yv12 = get_ref_frame_buffer(cpi, GOLDEN_FRAME);
+ }
} else {
gld_yv12 = NULL;
}
- recon_y_stride = new_yv12->y_stride;
- recon_uv_stride = new_yv12->uv_stride;
- uv_mb_height = 16 >> (new_yv12->y_height > new_yv12->uv_height);
-
set_ref_ptrs(cm, xd,
(cpi->ref_frame_flags & VP9_LAST_FLAG) ? LAST_FRAME: NONE,
(cpi->ref_frame_flags & VP9_GOLD_FLAG) ? GOLDEN_FRAME : NONE);
@@ -546,11 +560,14 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
vp9_setup_block_planes(&x->e_mbd, cm->subsampling_x, cm->subsampling_y);
vp9_setup_src_planes(x, cpi->Source, 0, 0);
- vp9_setup_pre_planes(xd, 0, first_ref_buf, 0, 0, NULL);
vp9_setup_dst_planes(xd->plane, new_yv12, 0, 0);
- xd->mi = cm->mi;
- xd->mi[0].src_mi = &xd->mi[0];
+ if (!frame_is_intra_only(cm)) {
+ vp9_setup_pre_planes(xd, 0, first_ref_buf, 0, 0, NULL);
+ }
+
+ xd->mi = cm->mi_grid_visible;
+ xd->mi[0] = cm->mi;
vp9_frame_init_quantizer(cpi);
@@ -568,6 +585,10 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
// Tiling is ignored in the first pass.
vp9_tile_init(&tile, cm, 0, 0);
+ recon_y_stride = new_yv12->y_stride;
+ recon_uv_stride = new_yv12->uv_stride;
+ uv_mb_height = 16 >> (new_yv12->y_height > new_yv12->uv_height);
+
for (mb_row = 0; mb_row < cm->mb_rows; ++mb_row) {
MV best_ref_mv = {0, 0};
@@ -585,8 +606,10 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
for (mb_col = 0; mb_col < cm->mb_cols; ++mb_col) {
int this_error;
const int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row);
- double error_weight = 1.0;
const BLOCK_SIZE bsize = get_bsize(cm, mb_row, mb_col);
+ double log_intra;
+ int level_sample;
+
#if CONFIG_FP_MB_STATS
const int mb_index = mb_row * cm->mb_cols + mb_col;
#endif
@@ -597,22 +620,17 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
xd->plane[1].dst.buf = new_yv12->u_buffer + recon_uvoffset;
xd->plane[2].dst.buf = new_yv12->v_buffer + recon_uvoffset;
xd->left_available = (mb_col != 0);
- xd->mi[0].src_mi->mbmi.sb_type = bsize;
- xd->mi[0].src_mi->mbmi.ref_frame[0] = INTRA_FRAME;
+ xd->mi[0]->mbmi.sb_type = bsize;
+ xd->mi[0]->mbmi.ref_frame[0] = INTRA_FRAME;
set_mi_row_col(xd, &tile,
mb_row << 1, num_8x8_blocks_high_lookup[bsize],
mb_col << 1, num_8x8_blocks_wide_lookup[bsize],
cm->mi_rows, cm->mi_cols);
- if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
- const int energy = vp9_block_energy(cpi, x, bsize);
- error_weight = vp9_vaq_inv_q_ratio(energy);
- }
-
// Do intra 16x16 prediction.
x->skip_encode = 0;
- xd->mi[0].src_mi->mbmi.mode = DC_PRED;
- xd->mi[0].src_mi->mbmi.tx_size = use_dc_pred ?
+ xd->mi[0]->mbmi.mode = DC_PRED;
+ xd->mi[0]->mbmi.tx_size = use_dc_pred ?
(bsize >= BLOCK_16X16 ? TX_16X16 : TX_8X8) : TX_4X4;
vp9_encode_intra_block_plane(x, bsize, 0);
this_error = vp9_get_mb_ss(x->plane[0].src_diff);
@@ -635,10 +653,25 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
}
#endif // CONFIG_VP9_HIGHBITDEPTH
- if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
- vp9_clear_system_state();
- this_error = (int)(this_error * error_weight);
- }
+ vp9_clear_system_state();
+ log_intra = log(this_error + 1.0);
+ if (log_intra < 10.0)
+ intra_factor += 1.0 + ((10.0 - log_intra) * 0.05);
+ else
+ intra_factor += 1.0;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (cm->use_highbitdepth)
+ level_sample = CONVERT_TO_SHORTPTR(x->plane[0].src.buf)[0];
+ else
+ level_sample = x->plane[0].src.buf[0];
+#else
+ level_sample = x->plane[0].src.buf[0];
+#endif
+ if ((level_sample < DARK_THRESH) && (log_intra < 9.0))
+ brightness_factor += 1.0 + (0.01 * (DARK_THRESH - level_sample));
+ else
+ brightness_factor += 1.0;
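/* Bounds check (derived from the code above): each MB contributes an intra
 * factor in [1.0, 1.5] (a log error of 10.0 or more contributes exactly 1.0)
 * and a brightness factor in [1.0, 1.64] (a fully dark sample of 0
 * contributes 1 + 0.01 * 64). Both are averaged over the MB count and
 * multiplied together below to form fps.weight, i.e. roughly [1.0, 2.46]
 * per frame. */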
// Intrapenalty below deals with situations where the intra and inter
// error scores are very low (e.g. a plain black frame).
@@ -711,20 +744,12 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
// Test last reference frame using the previous best mv as the
// starting point (best reference) for the search.
first_pass_motion_search(cpi, x, &best_ref_mv, &mv, &motion_error);
- if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
- vp9_clear_system_state();
- motion_error = (int)(motion_error * error_weight);
- }
// If the current best reference mv is not centered on 0,0 then do a
// 0,0 based search as well.
if (!is_zero_mv(&best_ref_mv)) {
tmp_err = INT_MAX;
first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv, &tmp_err);
- if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
- vp9_clear_system_state();
- tmp_err = (int)(tmp_err * error_weight);
- }
if (tmp_err < motion_error) {
motion_error = tmp_err;
@@ -755,10 +780,6 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv,
&gf_motion_error);
- if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
- vp9_clear_system_state();
- gf_motion_error = (int)(gf_motion_error * error_weight);
- }
if (gf_motion_error < motion_error && gf_motion_error < this_error)
++second_ref_count;
@@ -802,21 +823,30 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
#endif
if (motion_error <= this_error) {
+ vp9_clear_system_state();
+
// Keep a count of cases where the inter and intra were very close
// and very low. This helps with scene cut detection for example in
// cropped clips with black bars at the sides or top and bottom.
if (((this_error - intrapenalty) * 9 <= motion_error * 10) &&
- this_error < 2 * intrapenalty)
- ++neutral_count;
+ (this_error < (2 * intrapenalty))) {
+ neutral_count += 1.0;
+ // Also track cases where the intra is not much worse than the inter
+ // and use this in limiting the GF/arf group length.
+ } else if ((this_error > NCOUNT_INTRA_THRESH) &&
+ (this_error < (NCOUNT_INTRA_FACTOR * motion_error))) {
+ neutral_count += (double)motion_error /
+ DOUBLE_DIVIDE_CHECK((double)this_error);
+ }
mv.row *= 8;
mv.col *= 8;
this_error = motion_error;
- xd->mi[0].src_mi->mbmi.mode = NEWMV;
- xd->mi[0].src_mi->mbmi.mv[0].as_mv = mv;
- xd->mi[0].src_mi->mbmi.tx_size = TX_4X4;
- xd->mi[0].src_mi->mbmi.ref_frame[0] = LAST_FRAME;
- xd->mi[0].src_mi->mbmi.ref_frame[1] = NONE;
+ xd->mi[0]->mbmi.mode = NEWMV;
+ xd->mi[0]->mbmi.mv[0].as_mv = mv;
+ xd->mi[0]->mbmi.tx_size = TX_4X4;
+ xd->mi[0]->mbmi.ref_frame[0] = LAST_FRAME;
+ xd->mi[0]->mbmi.ref_frame[1] = NONE;
vp9_build_inter_predictors_sby(xd, mb_row << 1, mb_col << 1, bsize);
vp9_encode_sby_pass1(x, bsize);
sum_mvr += mv.row;
@@ -931,15 +961,20 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
vp9_clear_system_state();
}
- vp9_clear_system_state();
{
FIRSTPASS_STATS fps;
- // The minimum error here insures some bit alocation to frames even
+      // The minimum error here ensures some bit allocation to frames even
// in static regions. The allocation per MB declines for larger formats
// where the typical "real" energy per MB also falls.
// Initial estimate here uses sqrt(mbs) to define the min_err, where the
- // number of mbs is propotional to image area.
- const double min_err = 200 * sqrt(cm->MBs);
+ // number of mbs is proportional to the image area.
+ const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE)
+ ? cpi->initial_mbs : cpi->common.MBs;
+ const double min_err = 200 * sqrt(num_mbs);
+
+ intra_factor = intra_factor / (double)num_mbs;
+ brightness_factor = brightness_factor / (double)num_mbs;
+ fps.weight = intra_factor * brightness_factor;
fps.frame = cm->current_video_frame;
fps.spatial_layer_id = cpi->svc.spatial_layer_id;
@@ -947,9 +982,9 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
fps.sr_coded_error = (double)(sr_coded_error >> 8) + min_err;
fps.intra_error = (double)(intra_error >> 8) + min_err;
fps.count = 1.0;
- fps.pcnt_inter = (double)intercount / cm->MBs;
- fps.pcnt_second_ref = (double)second_ref_count / cm->MBs;
- fps.pcnt_neutral = (double)neutral_count / cm->MBs;
+ fps.pcnt_inter = (double)intercount / num_mbs;
+ fps.pcnt_second_ref = (double)second_ref_count / num_mbs;
+ fps.pcnt_neutral = (double)neutral_count / num_mbs;
if (mvcount > 0) {
fps.MVr = (double)sum_mvr / mvcount;
@@ -960,7 +995,7 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
fps.MVcv = ((double)sum_mvcs - (fps.MVc * fps.MVc / mvcount)) / mvcount;
fps.mv_in_out_count = (double)sum_in_vectors / (mvcount * 2);
fps.new_mv_count = new_mv_count;
- fps.pcnt_motion = (double)mvcount / cm->MBs;
+ fps.pcnt_motion = (double)mvcount / num_mbs;
} else {
fps.MVr = 0.0;
fps.mvr_abs = 0.0;
@@ -998,7 +1033,8 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
((twopass->this_frame_stats.intra_error /
DOUBLE_DIVIDE_CHECK(twopass->this_frame_stats.coded_error)) > 2.0))) {
if (gld_yv12 != NULL) {
- vp8_yv12_copy_frame(lst_yv12, gld_yv12);
+ ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->gld_fb_idx],
+ cm->ref_frame_map[cpi->lst_fb_idx]);
}
twopass->sr_update_lag = 1;
} else {
@@ -1010,14 +1046,17 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
if (lc != NULL) {
vp9_update_reference_frames(cpi);
} else {
- // Swap frame pointers so last frame refers to the frame we just compressed.
- swap_yv12(lst_yv12, new_yv12);
+ // The frame we just compressed now becomes the last frame.
+ ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->lst_fb_idx],
+ cm->new_fb_idx);
}
// Special case for the first frame. Copy into the GF buffer as a second
// reference.
- if (cm->current_video_frame == 0 && gld_yv12 != NULL && lc == NULL) {
- vp8_yv12_copy_frame(lst_yv12, gld_yv12);
+ if (cm->current_video_frame == 0 && cpi->gld_fb_idx != INVALID_IDX &&
+ lc == NULL) {
+ ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->gld_fb_idx],
+ cm->ref_frame_map[cpi->lst_fb_idx]);
}
// Use this to see what the first pass reconstruction looks like.
@@ -1066,16 +1105,17 @@ static double calc_correction_factor(double err_per_mb,
#define EDIV_SIZE_FACTOR 800
static int get_twopass_worst_quality(const VP9_COMP *cpi,
- const FIRSTPASS_STATS *stats,
- int section_target_bandwidth) {
+ const double section_err,
+ int section_target_bandwidth,
+ double group_weight_factor) {
const RATE_CONTROL *const rc = &cpi->rc;
const VP9EncoderConfig *const oxcf = &cpi->oxcf;
if (section_target_bandwidth <= 0) {
return rc->worst_quality; // Highest value allowed
} else {
- const int num_mbs = cpi->common.MBs;
- const double section_err = stats->coded_error / stats->count;
+ const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE)
+ ? cpi->initial_mbs : cpi->common.MBs;
const double err_per_mb = section_err / num_mbs;
const double speed_term = 1.0 + 0.04 * oxcf->speed;
const double ediv_size_correction = num_mbs / EDIV_SIZE_FACTOR;
@@ -1084,20 +1124,24 @@ static int get_twopass_worst_quality(const VP9_COMP *cpi,
int q;
int is_svc_upper_layer = 0;
+
if (is_two_pass_svc(cpi) && cpi->svc.spatial_layer_id > 0)
is_svc_upper_layer = 1;
+
// Try and pick a max Q that will be high enough to encode the
// content at the given rate.
for (q = rc->best_quality; q < rc->worst_quality; ++q) {
const double factor =
- calc_correction_factor(err_per_mb, ERR_DIVISOR - ediv_size_correction,
+ calc_correction_factor(err_per_mb,
+ ERR_DIVISOR - ediv_size_correction,
is_svc_upper_layer ? SVC_FACTOR_PT_LOW :
FACTOR_PT_LOW, FACTOR_PT_HIGH, q,
cpi->common.bit_depth);
- const int bits_per_mb = vp9_rc_bits_per_mb(INTER_FRAME, q,
- factor * speed_term,
- cpi->common.bit_depth);
+ const int bits_per_mb =
+ vp9_rc_bits_per_mb(INTER_FRAME, q,
+ factor * speed_term * group_weight_factor,
+ cpi->common.bit_depth);
if (bits_per_mb <= target_norm_bits_per_mb)
break;
}
@@ -1109,7 +1153,38 @@ static int get_twopass_worst_quality(const VP9_COMP *cpi,
}
}
-extern void vp9_new_framerate(VP9_COMP *cpi, double framerate);
+static void setup_rf_level_maxq(VP9_COMP *cpi) {
+ int i;
+ RATE_CONTROL *const rc = &cpi->rc;
+ for (i = INTER_NORMAL; i < RATE_FACTOR_LEVELS; ++i) {
+ int qdelta = vp9_frame_type_qdelta(cpi, i, rc->worst_quality);
+ rc->rf_level_maxq[i] = MAX(rc->worst_quality + qdelta, rc->best_quality);
+ }
+}
+
+void vp9_init_subsampling(VP9_COMP *cpi) {
+ const VP9_COMMON *const cm = &cpi->common;
+ RATE_CONTROL *const rc = &cpi->rc;
+ const int w = cm->width;
+ const int h = cm->height;
+ int i;
+
+ for (i = 0; i < FRAME_SCALE_STEPS; ++i) {
+ // Note: Frames with odd-sized dimensions may result from this scaling.
+ rc->frame_width[i] = (w * 16) / frame_scale_factor[i];
+ rc->frame_height[i] = (h * 16) / frame_scale_factor[i];
+ }
+
+ setup_rf_level_maxq(cpi);
+}
+
+void calculate_coded_size(VP9_COMP *cpi,
+ int *scaled_frame_width,
+ int *scaled_frame_height) {
+ RATE_CONTROL *const rc = &cpi->rc;
+ *scaled_frame_width = rc->frame_width[rc->frame_size_selector];
+ *scaled_frame_height = rc->frame_height[rc->frame_size_selector];
+}
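vp9_init_subsampling() precomputes the candidate coded sizes in 1/16th units so that later frame-size selection is a table lookup. A standalone sketch of that fixed-point arithmetic follows; the scale factors are assumed example values, since frame_scale_factor[] is not visible in this hunk:

    #include <stdio.h>

    /* Assumed factors in 1/16th units: 16 = 1x, 24 = 2/3x, 32 = 1/2x. */
    static const int kScaleFactor[3] = { 16, 24, 32 };

    int main(void) {
      const int w = 1280, h = 720;
      int i;
      for (i = 0; i < 3; ++i) {
        /* Integer math in 1/16th units; odd dimensions are possible, as
         * the in-tree comment warns (1280 * 16 / 24 = 853). */
        printf("step %d: %dx%d\n", i,
               (w * 16) / kScaleFactor[i], (h * 16) / kScaleFactor[i]);
      }
      return 0;
    }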
void vp9_init_second_pass(VP9_COMP *cpi) {
SVC *const svc = &cpi->svc;
@@ -1179,6 +1254,10 @@ void vp9_init_second_pass(VP9_COMP *cpi) {
// Static sequence monitor variables.
twopass->kf_zeromotion_pct = 100;
twopass->last_kfgroup_zeromotion_pct = 100;
+
+ if (oxcf->resize_mode != RESIZE_NONE) {
+ vp9_init_subsampling(cpi);
+ }
}
#define SR_DIFF_PART 0.0015
@@ -1188,38 +1267,50 @@ void vp9_init_second_pass(VP9_COMP *cpi) {
#define LOW_SR_DIFF_TRHESH 0.1
#define SR_DIFF_MAX 128.0
-static double get_sr_decay_rate(const VP9_COMMON *cm,
+static double get_sr_decay_rate(const VP9_COMP *cpi,
const FIRSTPASS_STATS *frame) {
- double sr_diff = (frame->sr_coded_error - frame->coded_error) / cm->MBs;
+ const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE)
+ ? cpi->initial_mbs : cpi->common.MBs;
+ double sr_diff =
+ (frame->sr_coded_error - frame->coded_error) / num_mbs;
double sr_decay = 1.0;
+ double modified_pct_inter;
+ double modified_pcnt_intra;
const double motion_amplitude_factor =
frame->pcnt_motion * ((frame->mvc_abs + frame->mvr_abs) / 2);
- const double pcnt_intra = 100 * (1.0 - frame->pcnt_inter);
+
+ modified_pct_inter = frame->pcnt_inter;
+ if ((frame->intra_error / DOUBLE_DIVIDE_CHECK(frame->coded_error)) <
+ (double)NCOUNT_FRAME_II_THRESH) {
+ modified_pct_inter = frame->pcnt_inter - frame->pcnt_neutral;
+ }
+ modified_pcnt_intra = 100 * (1.0 - modified_pct_inter);
+
if ((sr_diff > LOW_SR_DIFF_TRHESH)) {
sr_diff = MIN(sr_diff, SR_DIFF_MAX);
sr_decay = 1.0 - (SR_DIFF_PART * sr_diff) -
(MOTION_AMP_PART * motion_amplitude_factor) -
- (INTRA_PART * pcnt_intra);
+ (INTRA_PART * modified_pcnt_intra);
}
- return MAX(sr_decay, MIN(DEFAULT_DECAY_LIMIT, frame->pcnt_inter));
+ return MAX(sr_decay, MIN(DEFAULT_DECAY_LIMIT, modified_pct_inter));
}
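The reworked decay above discounts neutral blocks from pcnt_inter before converting to an intra percentage, but only for frames that already look strongly inter-coded. A hedged sketch of just that adjustment; NCOUNT_FRAME_II_THRESH is not visible in this hunk, so the threshold below is a placeholder:

    /* Discount neutral blocks only when the intra/coded error ratio is
     * low; 5.0 stands in for NCOUNT_FRAME_II_THRESH. The small epsilon
     * mimics DOUBLE_DIVIDE_CHECK. */
    static double modified_pct_inter(double pcnt_inter, double pcnt_neutral,
                                     double intra_error, double coded_error) {
      const double ii_ratio = intra_error / (coded_error + 0.000001);
      if (ii_ratio < 5.0)
        return pcnt_inter - pcnt_neutral;
      return pcnt_inter;
    }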
// This function gives an estimate of how badly we believe the prediction
// quality is decaying from frame to frame.
-static double get_zero_motion_factor(const VP9_COMMON *cm,
+static double get_zero_motion_factor(const VP9_COMP *cpi,
const FIRSTPASS_STATS *frame) {
const double zero_motion_pct = frame->pcnt_inter -
frame->pcnt_motion;
- double sr_decay = get_sr_decay_rate(cm, frame);
+ double sr_decay = get_sr_decay_rate(cpi, frame);
return MIN(sr_decay, zero_motion_pct);
}
#define ZM_POWER_FACTOR 0.75
-static double get_prediction_decay_rate(const VP9_COMMON *cm,
+static double get_prediction_decay_rate(const VP9_COMP *cpi,
const FIRSTPASS_STATS *next_frame) {
- const double sr_decay_rate = get_sr_decay_rate(cm, next_frame);
+ const double sr_decay_rate = get_sr_decay_rate(cpi, next_frame);
const double zero_motion_factor =
(0.95 * pow((next_frame->pcnt_inter - next_frame->pcnt_motion),
ZM_POWER_FACTOR));
@@ -1231,14 +1322,17 @@ static double get_prediction_decay_rate(const VP9_COMMON *cm,
// Function to test for a condition where a complex transition is followed
// by a static section. For example in slide shows where there is a fade
// between slides. This is to help with more optimal kf and gf positioning.
-static int detect_transition_to_still(const TWO_PASS *twopass,
+static int detect_transition_to_still(VP9_COMP *cpi,
int frame_interval, int still_interval,
double loop_decay_rate,
double last_decay_rate) {
+ TWO_PASS *const twopass = &cpi->twopass;
+ RATE_CONTROL *const rc = &cpi->rc;
+
// Break clause to detect very still sections after motion
// For example a static image after a fade or other transition
// instead of a clean scene cut.
- if (frame_interval > MIN_GF_INTERVAL &&
+ if (frame_interval > rc->min_gf_interval &&
loop_decay_rate >= 0.999 &&
last_decay_rate < 0.9) {
int j;
@@ -1313,12 +1407,14 @@ static double calc_frame_boost(VP9_COMP *cpi,
const double lq =
vp9_convert_qindex_to_q(cpi->rc.avg_frame_qindex[INTER_FRAME],
cpi->common.bit_depth);
- const double boost_correction = MIN((0.5 + (lq * 0.015)), 1.5);
+ const double boost_q_correction = MIN((0.5 + (lq * 0.015)), 1.5);
+ const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE)
+ ? cpi->initial_mbs : cpi->common.MBs;
// Underlying boost factor is based on inter error ratio.
- frame_boost = (BASELINE_ERR_PER_MB * cpi->common.MBs) /
+ frame_boost = (BASELINE_ERR_PER_MB * num_mbs) /
DOUBLE_DIVIDE_CHECK(this_frame->coded_error);
- frame_boost = frame_boost * BOOST_FACTOR * boost_correction;
+ frame_boost = frame_boost * BOOST_FACTOR * boost_q_correction;
// Increase boost for frames where new data coming into frame (e.g. zoom out).
// Slightly reduce boost if there is a net balance of motion out of the frame
@@ -1329,7 +1425,7 @@ static double calc_frame_boost(VP9_COMP *cpi,
else
frame_boost += frame_boost * (this_frame_mv_in_out / 2.0);
- return MIN(frame_boost, max_boost * boost_correction);
+ return MIN(frame_boost, max_boost * boost_q_correction);
}
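calc_frame_boost() above scales an inter-error ratio by a quantizer-dependent correction and caps the result. A worked standalone sketch of the same shape; BASELINE_ERR_PER_MB, BOOST_FACTOR, and the cap are not visible here, so those constants are assumptions:

    static double min_d(double a, double b) { return a < b ? a : b; }

    /* boost = (baseline_err_per_mb * num_mbs) / coded_error, scaled by a
     * boost factor and a q correction, then capped. Constants 500.0,
     * 12.5, and 100.0 are illustrative only. */
    static double frame_boost_sketch(double coded_error, int num_mbs,
                                     double avg_q) {
      const double boost_q_correction = min_d(0.5 + avg_q * 0.015, 1.5);
      double boost = (500.0 * num_mbs) / (coded_error + 0.000001);
      boost *= 12.5 * boost_q_correction;
      return min_d(boost, 100.0 * boost_q_correction);
    }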
static int calc_arf_boost(VP9_COMP *cpi, int offset,
@@ -1365,7 +1461,7 @@ static int calc_arf_boost(VP9_COMP *cpi, int offset,
// Accumulate the effect of prediction quality decay.
if (!flash_detected) {
- decay_accumulator *= get_prediction_decay_rate(&cpi->common, this_frame);
+ decay_accumulator *= get_prediction_decay_rate(cpi, this_frame);
decay_accumulator = decay_accumulator < MIN_DECAY_FACTOR
? MIN_DECAY_FACTOR : decay_accumulator;
}
@@ -1404,7 +1500,7 @@ static int calc_arf_boost(VP9_COMP *cpi, int offset,
// Cumulative effect of prediction quality decay.
if (!flash_detected) {
- decay_accumulator *= get_prediction_decay_rate(&cpi->common, this_frame);
+ decay_accumulator *= get_prediction_decay_rate(cpi, this_frame);
decay_accumulator = decay_accumulator < MIN_DECAY_FACTOR
? MIN_DECAY_FACTOR : decay_accumulator;
}
@@ -1666,8 +1762,9 @@ static void allocate_gf_group_bits(VP9_COMP *cpi, int64_t gf_group_bits,
// Analyse and define a gf/arf group.
static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
+ VP9_COMMON *const cm = &cpi->common;
RATE_CONTROL *const rc = &cpi->rc;
- const VP9EncoderConfig *const oxcf = &cpi->oxcf;
+ VP9EncoderConfig *const oxcf = &cpi->oxcf;
TWO_PASS *const twopass = &cpi->twopass;
FIRSTPASS_STATS next_frame;
const FIRSTPASS_STATS *const start_pos = twopass->stats_in;
@@ -1676,6 +1773,9 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
double boost_score = 0.0;
double old_boost_score = 0.0;
double gf_group_err = 0.0;
+#if GROUP_ADAPTIVE_MAXQ
+ double gf_group_raw_error = 0.0;
+#endif
double gf_first_frame_err = 0.0;
double mod_frame_err = 0.0;
@@ -1700,10 +1800,11 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
int64_t gf_group_bits;
double gf_group_error_left;
int gf_arf_bits;
+ int is_key_frame = frame_is_intra_only(cm);
// Reset the GF group data structures unless this is a key
// frame in which case it will already have been done.
- if (cpi->common.frame_type != KEY_FRAME) {
+ if (is_key_frame == 0) {
vp9_zero(twopass->gf_group);
}
@@ -1719,11 +1820,16 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
// If this is a key frame or the overlay from a previous arf then
// the error score / cost of this frame has already been accounted for.
- if (cpi->common.frame_type == KEY_FRAME || rc->source_alt_ref_active)
+ if (is_key_frame || rc->source_alt_ref_active) {
gf_group_err -= gf_first_frame_err;
+#if GROUP_ADAPTIVE_MAXQ
+ gf_group_raw_error -= this_frame->coded_error;
+#endif
+ }
// Motion breakout threshold for loop below depends on image size.
- mv_ratio_accumulator_thresh = (cpi->common.width + cpi->common.height) / 4.0;
+ mv_ratio_accumulator_thresh =
+ (cpi->initial_height + cpi->initial_width) / 4.0;
// Set a maximum and minimum interval for the GF group.
// If the image appears almost completely static we can extend beyond this.
@@ -1734,7 +1840,7 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
int int_lbq =
(int)(vp9_convert_qindex_to_q(rc->last_boosted_qindex,
cpi->common.bit_depth));
- active_min_gf_interval = MIN_GF_INTERVAL + MIN(2, int_max_q / 200);
+ active_min_gf_interval = rc->min_gf_interval + MIN(2, int_max_q / 200);
if (active_min_gf_interval > rc->max_gf_interval)
active_min_gf_interval = rc->max_gf_interval;
@@ -1748,6 +1854,8 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
active_max_gf_interval = 12 + MIN(4, (int_lbq / 6));
if (active_max_gf_interval > rc->max_gf_interval)
active_max_gf_interval = rc->max_gf_interval;
+ if (active_max_gf_interval < active_min_gf_interval)
+ active_max_gf_interval = active_min_gf_interval;
}
}
@@ -1758,6 +1866,9 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
// Accumulate error score of frames in this gf group.
mod_frame_err = calculate_modified_err(twopass, oxcf, this_frame);
gf_group_err += mod_frame_err;
+#if GROUP_ADAPTIVE_MAXQ
+ gf_group_raw_error += this_frame->coded_error;
+#endif
if (EOF == input_stats(twopass, &next_frame))
break;
@@ -1775,18 +1886,17 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
// Accumulate the effect of prediction quality decay.
if (!flash_detected) {
last_loop_decay_rate = loop_decay_rate;
- loop_decay_rate = get_prediction_decay_rate(&cpi->common, &next_frame);
+ loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame);
decay_accumulator = decay_accumulator * loop_decay_rate;
// Monitor for static sections.
zero_motion_accumulator =
- MIN(zero_motion_accumulator,
- get_zero_motion_factor(&cpi->common, &next_frame));
+ MIN(zero_motion_accumulator, get_zero_motion_factor(cpi, &next_frame));
// Break clause to detect very still sections after motion. For example,
// a static image after a fade or other transition.
- if (detect_transition_to_still(twopass, i, 5, loop_decay_rate,
+ if (detect_transition_to_still(cpi, i, 5, loop_decay_rate,
last_loop_decay_rate)) {
allow_alt_ref = 0;
break;
@@ -1820,8 +1930,11 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
twopass->gf_zeromotion_pct = (int)(zero_motion_accumulator * 1000.0);
+ // Was the group length constrained by the requirement for a new KF?
+ rc->constrained_gf_group = (i >= rc->frames_to_key) ? 1 : 0;
+
// Set the interval until the next gf.
- if (cpi->common.frame_type == KEY_FRAME || rc->source_alt_ref_active)
+ if (is_key_frame || rc->source_alt_ref_active)
rc->baseline_gf_interval = i - 1;
else
rc->baseline_gf_interval = i;
@@ -1837,6 +1950,9 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
if (EOF == input_stats(twopass, this_frame))
break;
gf_group_err += calculate_modified_err(twopass, oxcf, this_frame);
+#if GROUP_ADAPTIVE_MAXQ
+ gf_group_raw_error += this_frame->coded_error;
+#endif
}
rc->baseline_gf_interval = new_gf_interval;
}
@@ -1846,7 +1962,7 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
// Should we use the alternate reference frame.
if (allow_alt_ref &&
(i < cpi->oxcf.lag_in_frames) &&
- (i >= MIN_GF_INTERVAL)) {
+ (i >= rc->min_gf_interval)) {
// Calculate the boost for alt ref.
rc->gfu_boost = calc_arf_boost(cpi, 0, (i - 1), (i - 1), &f_boost,
&b_boost);
@@ -1867,6 +1983,34 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
// Calculate the bits to be allocated to the gf/arf group as a whole
gf_group_bits = calculate_total_gf_group_bits(cpi, gf_group_err);
+#if GROUP_ADAPTIVE_MAXQ
+ // Calculate an estimate of the maxq needed for the group.
+ // We are more aggressive about correcting for sections
+ // where there could be significant overshoot than for easier
+ // sections where we do not wish to risk creating an overshoot
+ // of the allocated bit budget.
+ if ((cpi->oxcf.rc_mode != VPX_Q) && (rc->baseline_gf_interval > 1)) {
+ const int vbr_group_bits_per_frame =
+ (int)(gf_group_bits / rc->baseline_gf_interval);
+ const double group_av_err = gf_group_raw_error / rc->baseline_gf_interval;
+ int tmp_q;
+ // rc factor is a weight factor that corrects for local rate control drift.
+ double rc_factor = 1.0;
+ if (rc->rate_error_estimate > 0) {
+ rc_factor = MAX(RC_FACTOR_MIN,
+ (double)(100 - rc->rate_error_estimate) / 100.0);
+ } else {
+ rc_factor = MIN(RC_FACTOR_MAX,
+ (double)(100 - rc->rate_error_estimate) / 100.0);
+ }
+ tmp_q =
+ get_twopass_worst_quality(cpi, group_av_err, vbr_group_bits_per_frame,
+ twopass->kfgroup_inter_fraction * rc_factor);
+ twopass->active_worst_quality =
+ MAX(tmp_q, twopass->active_worst_quality >> 1);
+ }
+#endif
+
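The rc_factor that feeds get_twopass_worst_quality() above is a percentage-drift weight clamped asymmetrically. A minimal sketch of that clamp; RC_FACTOR_MIN and RC_FACTOR_MAX are not shown in this hunk, so 0.25 and 2.0 are assumed stand-ins:

    /* rate_error_estimate is a signed percentage: positive = undershoot.
     * Undershoot lowers the weight (floored), overshoot raises it (capped). */
    static double rc_factor_sketch(int rate_error_estimate) {
      const double f = (100.0 - rate_error_estimate) / 100.0;
      if (rate_error_estimate > 0)
        return f < 0.25 ? 0.25 : f;  /* MAX(RC_FACTOR_MIN, f) */
      return f > 2.0 ? 2.0 : f;      /* MIN(RC_FACTOR_MAX, f) */
    }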
// Calculate the extra bits to be used for boosted frame(s)
gf_arf_bits = calculate_boost_bits(rc->baseline_gf_interval,
rc->gfu_boost, gf_group_bits);
@@ -1882,7 +2026,7 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
// also a key frame in which case it has already been accounted for.
if (rc->source_alt_ref_pending) {
gf_group_error_left = gf_group_err - mod_frame_err;
- } else if (cpi->common.frame_type != KEY_FRAME) {
+ } else if (is_key_frame == 0) {
gf_group_error_left = gf_group_err - gf_first_frame_err;
} else {
gf_group_error_left = gf_group_err;
@@ -1900,31 +2044,68 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
calculate_section_intra_ratio(start_pos, twopass->stats_in_end,
rc->baseline_gf_interval);
}
+
+ if (oxcf->resize_mode == RESIZE_DYNAMIC) {
+ // Default to starting GF groups at normal frame size.
+ cpi->rc.next_frame_size_selector = UNSCALED;
+ }
}
-// TODO(PGW) Re-examine the use of II ration in this code in the light of#
-// changes elsewhere
+// Threshold for use of the lagging second reference frame. High second ref
+// usage may point to a transient event like a flash or occlusion rather than
+// a real scene cut.
+#define SECOND_REF_USEAGE_THRESH 0.1
+// Minimum % intra coding observed in first pass (1.0 = 100%)
+#define MIN_INTRA_LEVEL 0.25
+// Minimum ratio between the % of intra coding and inter coding in the first
+// pass after discounting neutral blocks (discounting neutral blocks in this
+// way helps catch scene cuts in clips with very flat areas or letterbox
+// format clips with image padding).
+#define INTRA_VS_INTER_THRESH 2.0
+// Hard threshold where the first pass chooses intra for almost all blocks.
+// In such a case, even if the frame is not a scene cut, coding a key frame
+// may still be a good option.
+#define VERY_LOW_INTER_THRESH 0.05
+// Maximum threshold for the relative ratio of intra error score vs best
+// inter error score.
+#define KF_II_ERR_THRESHOLD 2.5
+// In real scene cuts there is almost always a sharp change in the intra
+// or inter error score.
+#define ERR_CHANGE_THRESHOLD 0.4
+// For real scene cuts we expect an improvement in the intra/inter error
+// ratio in the next frame.
+#define II_IMPROVEMENT_THRESHOLD 3.5
#define KF_II_MAX 128.0
+
static int test_candidate_kf(TWO_PASS *twopass,
const FIRSTPASS_STATS *last_frame,
const FIRSTPASS_STATS *this_frame,
const FIRSTPASS_STATS *next_frame) {
int is_viable_kf = 0;
+ double pcnt_intra = 1.0 - this_frame->pcnt_inter;
+ double modified_pcnt_inter =
+ this_frame->pcnt_inter - this_frame->pcnt_neutral;
// Does the frame satisfy the primary criteria of a key frame?
+ // See above for an explanation of the test criteria.
// If so, then examine how well it predicts subsequent frames.
- if ((this_frame->pcnt_second_ref < 0.10) &&
- (next_frame->pcnt_second_ref < 0.10) &&
- ((this_frame->pcnt_inter < 0.05) ||
- (((this_frame->pcnt_inter - this_frame->pcnt_neutral) < 0.35) &&
+ if ((this_frame->pcnt_second_ref < SECOND_REF_USEAGE_THRESH) &&
+ (next_frame->pcnt_second_ref < SECOND_REF_USEAGE_THRESH) &&
+ ((this_frame->pcnt_inter < VERY_LOW_INTER_THRESH) ||
+ ((pcnt_intra > MIN_INTRA_LEVEL) &&
+ (pcnt_intra > (INTRA_VS_INTER_THRESH * modified_pcnt_inter)) &&
((this_frame->intra_error /
- DOUBLE_DIVIDE_CHECK(this_frame->coded_error)) < 2.5) &&
+ DOUBLE_DIVIDE_CHECK(this_frame->coded_error)) <
+ KF_II_ERR_THRESHOLD) &&
((fabs(last_frame->coded_error - this_frame->coded_error) /
- DOUBLE_DIVIDE_CHECK(this_frame->coded_error) > 0.40) ||
+ DOUBLE_DIVIDE_CHECK(this_frame->coded_error) >
+ ERR_CHANGE_THRESHOLD) ||
(fabs(last_frame->intra_error - this_frame->intra_error) /
- DOUBLE_DIVIDE_CHECK(this_frame->intra_error) > 0.40) ||
+ DOUBLE_DIVIDE_CHECK(this_frame->intra_error) >
+ ERR_CHANGE_THRESHOLD) ||
((next_frame->intra_error /
- DOUBLE_DIVIDE_CHECK(next_frame->coded_error)) > 3.5))))) {
+ DOUBLE_DIVIDE_CHECK(next_frame->coded_error)) >
+ II_IMPROVEMENT_THRESHOLD))))) {
int i;
const FIRSTPASS_STATS *start_pos = twopass->stats_in;
FIRSTPASS_STATS local_next_frame = *next_frame;
@@ -2048,8 +2229,7 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
break;
// How fast is the prediction quality decaying?
- loop_decay_rate = get_prediction_decay_rate(&cpi->common,
- twopass->stats_in);
+ loop_decay_rate = get_prediction_decay_rate(cpi, twopass->stats_in);
// We want to know something about the recent past... rather than
// as used elsewhere where we are concerned with decay in prediction
@@ -2061,7 +2241,7 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
// Special check for transition or high motion followed by a
// static scene.
- if (detect_transition_to_still(twopass, i, cpi->oxcf.key_freq - i,
+ if (detect_transition_to_still(cpi, i, cpi->oxcf.key_freq - i,
loop_decay_rate, decay_accumulator))
break;
@@ -2091,7 +2271,7 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
// Reset to the start of the group.
reset_fpf_position(twopass, start_position);
- kf_group_err = 0;
+ kf_group_err = 0.0;
// Rescan to get the correct error data for the forced kf group.
for (i = 0; i < rc->frames_to_key; ++i) {
@@ -2160,7 +2340,7 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
// Monitor for static sections.
zero_motion_accumulator =
MIN(zero_motion_accumulator,
- get_zero_motion_factor(&cpi->common, &next_frame));
+ get_zero_motion_factor(cpi, &next_frame));
// Not all frames in the group are necessarily used in calculating boost.
if ((i <= rc->max_gf_interval) ||
@@ -2171,7 +2351,7 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
// How fast is prediction quality decaying.
if (!detect_flash(twopass, 0)) {
const double loop_decay_rate =
- get_prediction_decay_rate(&cpi->common, &next_frame);
+ get_prediction_decay_rate(cpi, &next_frame);
decay_accumulator *= loop_decay_rate;
decay_accumulator = MAX(decay_accumulator, MIN_DECAY_FACTOR);
av_decay_accumulator += decay_accumulator;
@@ -2201,6 +2381,16 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
kf_bits = calculate_boost_bits((rc->frames_to_key - 1),
rc->kf_boost, twopass->kf_group_bits);
+ // Work out the fraction of the kf group bits reserved for the inter frames
+ // within the group after discounting the bits for the kf itself.
+ if (twopass->kf_group_bits) {
+ twopass->kfgroup_inter_fraction =
+ (double)(twopass->kf_group_bits - kf_bits) /
+ (double)twopass->kf_group_bits;
+ } else {
+ twopass->kfgroup_inter_fraction = 1.0;
+ }
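This fraction later weights the group max-q estimate in define_gf_group() (via kfgroup_inter_fraction * rc_factor). The computation is easy to sanity-check in isolation:

    /* Mirrors the computation above: the share of a kf group's bits left
     * for its inter frames once the key frame's own bits are removed. */
    static double inter_fraction(long long kf_group_bits, long long kf_bits) {
      /* e.g. 1,000,000 group bits with a 200,000-bit key frame -> 0.8 */
      if (!kf_group_bits)
        return 1.0;
      return (double)(kf_group_bits - kf_bits) / (double)kf_group_bits;
    }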
+
twopass->kf_group_bits -= kf_bits;
// Save the bits to spend on the key frame.
@@ -2215,35 +2405,10 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
// The count of bits left is adjusted elsewhere based on real coded frame
// sizes.
twopass->modified_error_left -= kf_group_err;
-}
-#define VBR_PCT_ADJUSTMENT_LIMIT 50
-// For VBR...adjustment to the frame target based on error from previous frames
-void vbr_rate_correction(VP9_COMP *cpi,
- int * this_frame_target,
- const int64_t vbr_bits_off_target) {
- int max_delta;
- double position_factor = 1.0;
-
- // How far through the clip are we.
- // This number is used to damp the per frame rate correction.
- // Range 0 - 1.0
- if (cpi->twopass.total_stats.count) {
- position_factor = sqrt((double)cpi->common.current_video_frame /
- cpi->twopass.total_stats.count);
- }
- max_delta = (int)(position_factor *
- ((*this_frame_target * VBR_PCT_ADJUSTMENT_LIMIT) / 100));
-
- // vbr_bits_off_target > 0 means we have extra bits to spend
- if (vbr_bits_off_target > 0) {
- *this_frame_target +=
- (vbr_bits_off_target > max_delta) ? max_delta
- : (int)vbr_bits_off_target;
- } else {
- *this_frame_target -=
- (vbr_bits_off_target < -max_delta) ? max_delta
- : (int)-vbr_bits_off_target;
+ if (oxcf->resize_mode == RESIZE_DYNAMIC) {
+ // Default to normal-sized frame on keyframes.
+ cpi->rc.next_frame_size_selector = UNSCALED;
}
}
@@ -2295,6 +2460,24 @@ void configure_buffer_updates(VP9_COMP *cpi) {
}
}
+int is_skippable_frame(const VP9_COMP *cpi) {
+ // If no non-zero motion vector was detected for the current frame in the
+ // first pass, nor for its previous and forward frames, then the partition
+ // check can be skipped for this frame and the partition size assigned
+ // according to the variance.
+ const SVC *const svc = &cpi->svc;
+ const TWO_PASS *const twopass = is_two_pass_svc(cpi) ?
+ &svc->layer_context[svc->spatial_layer_id].twopass : &cpi->twopass;
+
+ return (!frame_is_intra_only(&cpi->common) &&
+ twopass->stats_in - 2 > twopass->stats_in_start &&
+ twopass->stats_in < twopass->stats_in_end &&
+ (twopass->stats_in - 1)->pcnt_inter - (twopass->stats_in - 1)->pcnt_motion
+ == 1 &&
+ (twopass->stats_in - 2)->pcnt_inter - (twopass->stats_in - 2)->pcnt_motion
+ == 1 &&
+ twopass->stats_in->pcnt_inter - twopass->stats_in->pcnt_motion == 1);
+}
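The equality tests above rely on pcnt_inter and pcnt_motion being exact fractions of the frame's macroblocks: their difference is 1.0 only when every block was inter-coded with zero motion. A minimal sketch, with an assumed stats struct:

    struct fp_stats { double pcnt_inter, pcnt_motion; };

    /* 1.0 here means: all blocks inter (pcnt_inter == 1) and none of
     * them carried motion (pcnt_motion == 0). */
    static int frame_is_still(const struct fp_stats *s) {
      return s->pcnt_inter - s->pcnt_motion == 1;
    }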
void vp9_rc_get_second_pass_params(VP9_COMP *cpi) {
VP9_COMMON *const cm = &cpi->common;
@@ -2303,7 +2486,6 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) {
GF_GROUP *const gf_group = &twopass->gf_group;
int frames_left;
FIRSTPASS_STATS this_frame;
- FIRSTPASS_STATS this_frame_copy;
int target_rate;
LAYER_CONTEXT *const lc = is_two_pass_svc(cpi) ?
@@ -2329,11 +2511,6 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) {
target_rate = vp9_rc_clamp_pframe_target_size(cpi, target_rate);
rc->base_frame_target = target_rate;
- // Correction to rate target based on prior over or under shoot.
- if (cpi->oxcf.rc_mode == VPX_VBR)
- vbr_rate_correction(cpi, &target_rate, rc->vbr_bits_off_target);
-
- vp9_rc_set_frame_target(cpi, target_rate);
cm->frame_type = INTER_FRAME;
if (lc != NULL) {
@@ -2347,6 +2524,13 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) {
}
}
+ // Do the firstpass stats indicate that this frame is skippable for the
+ // partition search?
+ if (cpi->sf.allow_partition_search_skip &&
+ cpi->oxcf.pass == 2 && (!cpi->use_svc || is_two_pass_svc(cpi))) {
+ cpi->partition_search_skippable_frame = is_skippable_frame(cpi);
+ }
+
return;
}
@@ -2359,9 +2543,14 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) {
// Special case code for first frame.
const int section_target_bandwidth = (int)(twopass->bits_left /
frames_left);
- const int tmp_q = get_twopass_worst_quality(cpi, &twopass->total_left_stats,
- section_target_bandwidth);
+ const double section_error =
+ twopass->total_left_stats.coded_error / twopass->total_left_stats.count;
+ const int tmp_q =
+ get_twopass_worst_quality(cpi, section_error,
+ section_target_bandwidth, DEFAULT_GRP_WEIGHT);
+
twopass->active_worst_quality = tmp_q;
+ twopass->baseline_active_worst_quality = tmp_q;
rc->ni_av_qi = tmp_q;
rc->last_q[INTER_FRAME] = tmp_q;
rc->avg_q = vp9_convert_qindex_to_q(tmp_q, cm->bit_depth);
@@ -2373,14 +2562,13 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) {
if (EOF == input_stats(twopass, &this_frame))
return;
- // Local copy of the current frame's first pass stats.
- this_frame_copy = this_frame;
-
// Keyframe and section processing.
- if (rc->frames_to_key == 0 ||
- (cpi->frame_flags & FRAMEFLAGS_KEY)) {
+ if (rc->frames_to_key == 0 || (cpi->frame_flags & FRAMEFLAGS_KEY)) {
+ FIRSTPASS_STATS this_frame_copy;
+ this_frame_copy = this_frame;
// Define next KF group and assign bits to it.
- find_next_key_frame(cpi, &this_frame_copy);
+ find_next_key_frame(cpi, &this_frame);
+ this_frame = this_frame_copy;
} else {
cm->frame_type = INTER_FRAME;
}
@@ -2409,7 +2597,7 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) {
// Define a new GF/ARF group. (Should always enter here for key frames).
if (rc->frames_till_gf_update_due == 0) {
- define_gf_group(cpi, &this_frame_copy);
+ define_gf_group(cpi, &this_frame);
rc->frames_till_gf_update_due = rc->baseline_gf_interval;
if (lc != NULL)
@@ -2431,6 +2619,13 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) {
configure_buffer_updates(cpi);
+ // Do the firstpass stats indicate that this frame is skippable for the
+ // partition search?
+ if (cpi->sf.allow_partition_search_skip && cpi->oxcf.pass == 2 &&
+ (!cpi->use_svc || is_two_pass_svc(cpi))) {
+ cpi->partition_search_skippable_frame = is_skippable_frame(cpi);
+ }
+
target_rate = gf_group->bit_allocation[gf_group->index];
if (cpi->common.frame_type == KEY_FRAME)
target_rate = vp9_rc_clamp_iframe_target_size(cpi, target_rate);
@@ -2439,18 +2634,21 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) {
rc->base_frame_target = target_rate;
- // Correction to rate target based on prior over or under shoot.
- if (cpi->oxcf.rc_mode == VPX_VBR)
- vbr_rate_correction(cpi, &target_rate, rc->vbr_bits_off_target);
-
- vp9_rc_set_frame_target(cpi, target_rate);
+ {
+ const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE)
+ ? cpi->initial_mbs : cpi->common.MBs;
+ // The multiplication by 256 reverses a scaling factor of (>> 8)
+ // applied when combining MB error values for the frame.
+ twopass->mb_av_energy =
+ log(((this_frame.intra_error * 256.0) / num_mbs) + 1.0);
+ }
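mb_av_energy above undoes the first pass's (>> 8) scaling before taking a log, giving a per-MB intra-energy measure that is stable across frame sizes. A standalone sketch of the same expression:

    #include <math.h>

    /* intra_error arrives pre-scaled by (>> 8); multiply by 256 to undo
     * that, average per MB, and compress with log(x + 1). */
    static double mb_av_energy_sketch(double intra_error, int num_mbs) {
      return log(((intra_error * 256.0) / num_mbs) + 1.0);
    }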
// Update the total stats remaining structure.
subtract_stats(&twopass->total_left_stats, &this_frame);
}
-#define MINQ_ADJ_LIMIT 32
-#define Q_LIMIT_STEP 1
+#define MINQ_ADJ_LIMIT 48
+#define MINQ_ADJ_LIMIT_CQ 20
void vp9_twopass_postencode_update(VP9_COMP *cpi) {
TWO_PASS *const twopass = &cpi->twopass;
RATE_CONTROL *const rc = &cpi->rc;
@@ -2483,31 +2681,39 @@ void vp9_twopass_postencode_update(VP9_COMP *cpi) {
// Increment the gf group index ready for the next frame.
++twopass->gf_group.index;
- // If the rate control is drifting consider adjustment ot min or maxq.
- // Only make adjustments on gf/arf
- if ((cpi->oxcf.rc_mode == VPX_VBR) &&
+ // If the rate control is drifting, consider adjusting min or max q.
+ if ((cpi->oxcf.rc_mode != VPX_Q) &&
(cpi->twopass.gf_zeromotion_pct < VLOW_MOTION_THRESHOLD) &&
!cpi->rc.is_src_frame_alt_ref) {
const int maxq_adj_limit =
rc->worst_quality - twopass->active_worst_quality;
+ const int minq_adj_limit =
+ (cpi->oxcf.rc_mode == VPX_CQ ? MINQ_ADJ_LIMIT_CQ : MINQ_ADJ_LIMIT);
// Undershoot.
if (rc->rate_error_estimate > cpi->oxcf.under_shoot_pct) {
--twopass->extend_maxq;
if (rc->rolling_target_bits >= rc->rolling_actual_bits)
- twopass->extend_minq += Q_LIMIT_STEP;
+ ++twopass->extend_minq;
// Overshoot.
} else if (rc->rate_error_estimate < -cpi->oxcf.over_shoot_pct) {
--twopass->extend_minq;
if (rc->rolling_target_bits < rc->rolling_actual_bits)
- twopass->extend_maxq += Q_LIMIT_STEP;
+ ++twopass->extend_maxq;
} else {
+ // Adjustment for extreme local overshoot.
+ if (rc->projected_frame_size > (2 * rc->base_frame_target) &&
+ rc->projected_frame_size > (2 * rc->avg_frame_bandwidth))
+ ++twopass->extend_maxq;
+
+ // Unwind undershoot or overshoot adjustment.
if (rc->rolling_target_bits < rc->rolling_actual_bits)
--twopass->extend_minq;
- if (rc->rolling_target_bits > rc->rolling_actual_bits)
+ else if (rc->rolling_target_bits > rc->rolling_actual_bits)
--twopass->extend_maxq;
}
- twopass->extend_minq = clamp(twopass->extend_minq, 0, MINQ_ADJ_LIMIT);
+
+ twopass->extend_minq = clamp(twopass->extend_minq, 0, minq_adj_limit);
twopass->extend_maxq = clamp(twopass->extend_maxq, 0, maxq_adj_limit);
}
}
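The post-encode update above nudges extend_minq/extend_maxq by one step per frame and clamps them to mode-dependent limits. A condensed sketch of one undershoot step (the rolling-bits guard on the minq increment is omitted for brevity):

    static int clamp_int(int v, int lo, int hi) {
      return v < lo ? lo : (v > hi ? hi : v);
    }

    /* One undershoot step: allow a deeper min-q extension next time and
     * pull the max-q extension back by one, then re-clamp both. */
    static void on_undershoot(int *extend_minq, int *extend_maxq,
                              int minq_adj_limit, int maxq_adj_limit) {
      --*extend_maxq;
      ++*extend_minq;
      *extend_minq = clamp_int(*extend_minq, 0, minq_adj_limit);
      *extend_maxq = clamp_int(*extend_maxq, 0, maxq_adj_limit);
    }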
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_firstpass.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_firstpass.h
index e21d86928da..08e7a8bf114 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_firstpass.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_firstpass.h
@@ -43,6 +43,7 @@ typedef struct {
typedef struct {
double frame;
+ double weight;
double intra_error;
double coded_error;
double sr_coded_error;
@@ -95,6 +96,7 @@ typedef struct {
double modified_error_min;
double modified_error_max;
double modified_error_left;
+ double mb_av_energy;
#if CONFIG_FP_MB_STATS
uint8_t *frame_mb_stats_buf;
@@ -107,12 +109,17 @@ typedef struct {
// Error score of frames still to be coded in kf group
int64_t kf_group_error_left;
+
+ // The fraction of a kf group's total bits allocated to the inter frames.
+ double kfgroup_inter_fraction;
+
int sr_update_lag;
int kf_zeromotion_pct;
int last_kfgroup_zeromotion_pct;
int gf_zeromotion_pct;
int active_worst_quality;
+ int baseline_active_worst_quality;
int extend_minq;
int extend_maxq;
@@ -131,6 +138,13 @@ void vp9_rc_get_second_pass_params(struct VP9_COMP *cpi);
// Post encode update of the rate control parameters for 2-pass
void vp9_twopass_postencode_update(struct VP9_COMP *cpi);
+
+void vp9_init_subsampling(struct VP9_COMP *cpi);
+
+void calculate_coded_size(struct VP9_COMP *cpi,
+ int *scaled_frame_width,
+ int *scaled_frame_height);
+
#ifdef __cplusplus
} // extern "C"
#endif
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_lookahead.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_lookahead.c
index 823e7a16242..b8e2ca88c8c 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_lookahead.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_lookahead.c
@@ -65,6 +65,7 @@ struct lookahead_ctx *vp9_lookahead_init(unsigned int width,
// Allocate the lookahead structures
ctx = calloc(1, sizeof(*ctx));
if (ctx) {
+ const int legacy_byte_alignment = 0;
unsigned int i;
ctx->max_sz = depth;
ctx->buf = calloc(depth, sizeof(*ctx->buf));
@@ -76,7 +77,8 @@ struct lookahead_ctx *vp9_lookahead_init(unsigned int width,
#if CONFIG_VP9_HIGHBITDEPTH
use_highbitdepth,
#endif
- VP9_ENC_BORDER_IN_PIXELS))
+ VP9_ENC_BORDER_IN_PIXELS,
+ legacy_byte_alignment))
goto bail;
}
return ctx;
@@ -88,19 +90,40 @@ struct lookahead_ctx *vp9_lookahead_init(unsigned int width,
#define USE_PARTIAL_COPY 0
int vp9_lookahead_push(struct lookahead_ctx *ctx, YV12_BUFFER_CONFIG *src,
- int64_t ts_start, int64_t ts_end, unsigned int flags) {
+ int64_t ts_start, int64_t ts_end,
+#if CONFIG_VP9_HIGHBITDEPTH
+ int use_highbitdepth,
+#endif
+ unsigned int flags) {
struct lookahead_entry *buf;
#if USE_PARTIAL_COPY
int row, col, active_end;
int mb_rows = (src->y_height + 15) >> 4;
int mb_cols = (src->y_width + 15) >> 4;
#endif
+ int width = src->y_crop_width;
+ int height = src->y_crop_height;
+ int uv_width = src->uv_crop_width;
+ int uv_height = src->uv_crop_height;
+ int subsampling_x = src->subsampling_x;
+ int subsampling_y = src->subsampling_y;
+ int larger_dimensions, new_dimensions;
if (ctx->sz + 1 + MAX_PRE_FRAMES > ctx->max_sz)
return 1;
ctx->sz++;
buf = pop(ctx, &ctx->write_idx);
+ new_dimensions = width != buf->img.y_crop_width ||
+ height != buf->img.y_crop_height ||
+ uv_width != buf->img.uv_crop_width ||
+ uv_height != buf->img.uv_crop_height;
+ larger_dimensions = width > buf->img.y_width ||
+ height > buf->img.y_height ||
+ uv_width > buf->img.uv_width ||
+ uv_height > buf->img.uv_height;
+ assert(!larger_dimensions || new_dimensions);
+
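The two flags above distinguish "the frame changed size" from "the frame no longer fits in the allocation"; the assert encodes that the second implies the first. A toy standalone check of the push-time policy (plane bookkeeping reduced to a single width for illustration):

    #include <assert.h>
    #include <stdio.h>

    int main(void) {
      int alloc_w = 1280, crop_w = 1280;  /* current buffer */
      int src_w = 640;                    /* incoming, dynamically resized */
      int new_dimensions = src_w != crop_w;
      int larger_dimensions = src_w > alloc_w;
      assert(!larger_dimensions || new_dimensions);
      if (larger_dimensions)
        printf("realloc to %d\n", src_w);        /* fresh buffer, swap in */
      else if (new_dimensions)
        printf("recrop to %d (storage kept at %d)\n", src_w, alloc_w);
      return 0;
    }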
#if USE_PARTIAL_COPY
// TODO(jkoleszar): This is disabled for now, as
// vp9_copy_and_extend_frame_with_rect is not subsampling/alpha aware.
@@ -109,7 +132,7 @@ int vp9_lookahead_push(struct lookahead_ctx *ctx, YV12_BUFFER_CONFIG *src,
// 1. Lookahead queue has has size of 1.
// 2. Active map is provided.
// 3. This is not a key frame, golden nor altref frame.
- if (ctx->max_sz == 1 && active_map && !flags) {
+ if (!new_dimensions && ctx->max_sz == 1 && active_map && !flags) {
for (row = 0; row < mb_rows; ++row) {
col = 0;
@@ -145,11 +168,32 @@ int vp9_lookahead_push(struct lookahead_ctx *ctx, YV12_BUFFER_CONFIG *src,
active_map += mb_cols;
}
} else {
+#endif
+ if (larger_dimensions) {
+ YV12_BUFFER_CONFIG new_img;
+ memset(&new_img, 0, sizeof(new_img));
+ if (vp9_alloc_frame_buffer(&new_img,
+ width, height, subsampling_x, subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+ use_highbitdepth,
+#endif
+ VP9_ENC_BORDER_IN_PIXELS,
+ 0))
+ return 1;
+ vp9_free_frame_buffer(&buf->img);
+ buf->img = new_img;
+ } else if (new_dimensions) {
+ buf->img.y_crop_width = src->y_crop_width;
+ buf->img.y_crop_height = src->y_crop_height;
+ buf->img.uv_crop_width = src->uv_crop_width;
+ buf->img.uv_crop_height = src->uv_crop_height;
+ buf->img.subsampling_x = src->subsampling_x;
+ buf->img.subsampling_y = src->subsampling_y;
+ }
+ // Partial copy not implemented yet
vp9_copy_and_extend_frame(src, &buf->img);
+#if USE_PARTIAL_COPY
}
-#else
- // Partial copy not implemented yet
- vp9_copy_and_extend_frame(src, &buf->img);
#endif
buf->ts_start = ts_start;
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_lookahead.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_lookahead.h
index a33d3002e5b..13820380ff4 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_lookahead.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_lookahead.h
@@ -79,7 +79,11 @@ void vp9_lookahead_destroy(struct lookahead_ctx *ctx);
* \param[in] active_map Map that specifies which macroblock is active
*/
int vp9_lookahead_push(struct lookahead_ctx *ctx, YV12_BUFFER_CONFIG *src,
- int64_t ts_start, int64_t ts_end, unsigned int flags);
+ int64_t ts_start, int64_t ts_end,
+#if CONFIG_VP9_HIGHBITDEPTH
+ int use_highbitdepth,
+#endif
+ unsigned int flags);
/**\brief Get the next source buffer to encode
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_mbgraph.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_mbgraph.c
index bd04c56a47d..d5eeb9cc546 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_mbgraph.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_mbgraph.c
@@ -10,6 +10,9 @@
#include <limits.h>
+#include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
+
#include "vpx_mem/vpx_mem.h"
#include "vp9/encoder/vp9_segmentation.h"
#include "vp9/encoder/vp9_mcomp.h"
@@ -24,7 +27,7 @@ static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi,
MV *dst_mv,
int mb_row,
int mb_col) {
- MACROBLOCK *const x = &cpi->mb;
+ MACROBLOCK *const x = &cpi->td.mb;
MACROBLOCKD *const xd = &x->e_mbd;
const MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv;
const vp9_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[BLOCK_16X16];
@@ -63,8 +66,8 @@ static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi,
&distortion, &sse, NULL, 0, 0);
}
- xd->mi[0].src_mi->mbmi.mode = NEWMV;
- xd->mi[0].src_mi->mbmi.mv[0].as_mv = *dst_mv;
+ xd->mi[0]->mbmi.mode = NEWMV;
+ xd->mi[0]->mbmi.mv[0].as_mv = *dst_mv;
vp9_build_inter_predictors_sby(xd, mb_row, mb_col, BLOCK_16X16);
@@ -74,20 +77,20 @@ static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi,
x->mv_row_min = tmp_row_min;
x->mv_row_max = tmp_row_max;
- return vp9_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride,
- xd->plane[0].dst.buf, xd->plane[0].dst.stride);
+ return vpx_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride,
+ xd->plane[0].dst.buf, xd->plane[0].dst.stride);
}
static int do_16x16_motion_search(VP9_COMP *cpi, const MV *ref_mv,
int_mv *dst_mv, int mb_row, int mb_col) {
- MACROBLOCK *const x = &cpi->mb;
+ MACROBLOCK *const x = &cpi->td.mb;
MACROBLOCKD *const xd = &x->e_mbd;
unsigned int err, tmp_err;
MV tmp_mv;
// Try zero MV first
// FIXME should really use something like near/nearest MV and/or MV prediction
- err = vp9_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride,
+ err = vpx_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride,
xd->plane[0].pre[0].buf, xd->plane[0].pre[0].stride);
dst_mv->as_int = 0;
@@ -117,13 +120,13 @@ static int do_16x16_motion_search(VP9_COMP *cpi, const MV *ref_mv,
}
static int do_16x16_zerozero_search(VP9_COMP *cpi, int_mv *dst_mv) {
- MACROBLOCK *const x = &cpi->mb;
+ MACROBLOCK *const x = &cpi->td.mb;
MACROBLOCKD *const xd = &x->e_mbd;
unsigned int err;
// Try zero MV first
// FIXME should really use something like near/nearest MV and/or MV prediction
- err = vp9_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride,
+ err = vpx_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride,
xd->plane[0].pre[0].buf, xd->plane[0].pre[0].stride);
dst_mv->as_int = 0;
@@ -131,7 +134,7 @@ static int do_16x16_zerozero_search(VP9_COMP *cpi, int_mv *dst_mv) {
return err;
}
static int find_best_16x16_intra(VP9_COMP *cpi, PREDICTION_MODE *pbest_mode) {
- MACROBLOCK *const x = &cpi->mb;
+ MACROBLOCK *const x = &cpi->td.mb;
MACROBLOCKD *const xd = &x->e_mbd;
PREDICTION_MODE best_mode = -1, mode;
unsigned int best_err = INT_MAX;
@@ -141,12 +144,12 @@ static int find_best_16x16_intra(VP9_COMP *cpi, PREDICTION_MODE *pbest_mode) {
for (mode = DC_PRED; mode <= TM_PRED; mode++) {
unsigned int err;
- xd->mi[0].src_mi->mbmi.mode = mode;
+ xd->mi[0]->mbmi.mode = mode;
vp9_predict_intra_block(xd, 0, 2, TX_16X16, mode,
x->plane[0].src.buf, x->plane[0].src.stride,
xd->plane[0].dst.buf, xd->plane[0].dst.stride,
0, 0, 0);
- err = vp9_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride,
+ err = vpx_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride,
xd->plane[0].dst.buf, xd->plane[0].dst.stride);
// find best
@@ -174,7 +177,7 @@ static void update_mbgraph_mb_stats
int mb_row,
int mb_col
) {
- MACROBLOCK *const x = &cpi->mb;
+ MACROBLOCK *const x = &cpi->td.mb;
MACROBLOCKD *const xd = &x->e_mbd;
int intra_error;
VP9_COMMON *cm = &cpi->common;
@@ -229,7 +232,7 @@ static void update_mbgraph_frame_stats(VP9_COMP *cpi,
YV12_BUFFER_CONFIG *buf,
YV12_BUFFER_CONFIG *golden_ref,
YV12_BUFFER_CONFIG *alt_ref) {
- MACROBLOCK *const x = &cpi->mb;
+ MACROBLOCK *const x = &cpi->td.mb;
MACROBLOCKD *const xd = &x->e_mbd;
VP9_COMMON *const cm = &cpi->common;
@@ -247,7 +250,7 @@ static void update_mbgraph_frame_stats(VP9_COMP *cpi,
xd->plane[0].dst.stride = buf->y_stride;
xd->plane[0].pre[0].stride = buf->y_stride;
xd->plane[1].dst.stride = buf->uv_stride;
- xd->mi[0].src_mi = &mi_local;
+ xd->mi[0] = &mi_local;
mi_local.mbmi.sb_type = BLOCK_16X16;
mi_local.mbmi.ref_frame[0] = LAST_FRAME;
mi_local.mbmi.ref_frame[1] = NONE;
@@ -376,6 +379,8 @@ void vp9_update_mbgraph_stats(VP9_COMP *cpi) {
int i, n_frames = vp9_lookahead_depth(cpi->lookahead);
YV12_BUFFER_CONFIG *golden_ref = get_ref_frame_buffer(cpi, GOLDEN_FRAME);
+ assert(golden_ref != NULL);
+
// we need to look ahead beyond where the ARF transitions into
// being a GF - so exit if we don't look ahead beyond that
if (n_frames <= cpi->rc.frames_till_gf_update_due)
@@ -387,9 +392,8 @@ void vp9_update_mbgraph_stats(VP9_COMP *cpi) {
cpi->mbgraph_n_frames = n_frames;
for (i = 0; i < n_frames; i++) {
MBGRAPH_FRAME_STATS *frame_stats = &cpi->mbgraph_stats[i];
- vpx_memset(frame_stats->mb_stats, 0,
- cm->mb_rows * cm->mb_cols *
- sizeof(*cpi->mbgraph_stats[i].mb_stats));
+ memset(frame_stats->mb_stats, 0,
+ cm->mb_rows * cm->mb_cols * sizeof(*cpi->mbgraph_stats[i].mb_stats));
}
// do motion search to find contribution of each reference to data
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_mcomp.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_mcomp.c
index 69b4193840d..80c509a1b49 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_mcomp.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_mcomp.c
@@ -90,13 +90,10 @@ static int mv_err_cost(const MV *mv, const MV *ref,
static int mvsad_err_cost(const MACROBLOCK *x, const MV *mv, const MV *ref,
int error_per_bit) {
- if (x->nmvsadcost) {
- const MV diff = { mv->row - ref->row,
- mv->col - ref->col };
- return ROUND_POWER_OF_TWO(mv_cost(&diff, x->nmvjointsadcost,
- x->nmvsadcost) * error_per_bit, 8);
- }
- return 0;
+ const MV diff = { mv->row - ref->row,
+ mv->col - ref->col };
+ return ROUND_POWER_OF_TWO(mv_cost(&diff, x->nmvjointsadcost,
+ x->nmvsadcost) * error_per_bit, 8);
}
void vp9_init_dsmotion_compensation(search_site_config *cfg, int stride) {
@@ -286,42 +283,53 @@ static INLINE const uint8_t *pre(const uint8_t *buf, int stride, int r, int c) {
bestmv->row *= 8; \
bestmv->col *= 8;
+static INLINE unsigned int setup_center_error(const MACROBLOCKD *xd,
+ const MV *bestmv,
+ const MV *ref_mv,
+ int error_per_bit,
+ const vp9_variance_fn_ptr_t *vfp,
+ const uint8_t *const src,
+ const int src_stride,
+ const uint8_t *const y,
+ int y_stride,
+ const uint8_t *second_pred,
+ int w, int h, int offset,
+ int *mvjcost, int *mvcost[2],
+ unsigned int *sse1,
+ int *distortion) {
+ unsigned int besterr;
#if CONFIG_VP9_HIGHBITDEPTH
-#define SETUP_CENTER_ERROR \
- if (second_pred != NULL) { \
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { \
- DECLARE_ALIGNED_ARRAY(16, uint16_t, comp_pred16, 64 * 64); \
- vp9_highbd_comp_avg_pred(comp_pred16, second_pred, w, h, y + offset, \
- y_stride); \
- besterr = vfp->vf(CONVERT_TO_BYTEPTR(comp_pred16), w, z, src_stride, \
- sse1); \
- } else { \
- DECLARE_ALIGNED_ARRAY(16, uint8_t, comp_pred, 64 * 64); \
- vp9_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride); \
- besterr = vfp->vf(comp_pred, w, z, src_stride, sse1); \
- } \
- } else { \
- besterr = vfp->vf(y + offset, y_stride, z, src_stride, sse1); \
- } \
- *distortion = besterr; \
+ if (second_pred != NULL) {
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ DECLARE_ALIGNED(16, uint16_t, comp_pred16[64 * 64]);
+ vp9_highbd_comp_avg_pred(comp_pred16, second_pred, w, h, y + offset,
+ y_stride);
+ besterr = vfp->vf(CONVERT_TO_BYTEPTR(comp_pred16), w, src, src_stride,
+ sse1);
+ } else {
+ DECLARE_ALIGNED(16, uint8_t, comp_pred[64 * 64]);
+ vp9_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride);
+ besterr = vfp->vf(comp_pred, w, src, src_stride, sse1);
+ }
+ } else {
+ besterr = vfp->vf(y + offset, y_stride, src, src_stride, sse1);
+ }
+ *distortion = besterr;
besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
-
#else
-
-#define SETUP_CENTER_ERROR \
- if (second_pred != NULL) { \
- DECLARE_ALIGNED_ARRAY(16, uint8_t, comp_pred, 64 * 64); \
- vp9_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride); \
- besterr = vfp->vf(comp_pred, w, z, src_stride, sse1); \
- } else { \
- besterr = vfp->vf(y + offset, y_stride, z, src_stride, sse1); \
- } \
- *distortion = besterr; \
+ (void) xd;
+ if (second_pred != NULL) {
+ DECLARE_ALIGNED(16, uint8_t, comp_pred[64 * 64]);
+ vp9_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride);
+ besterr = vfp->vf(comp_pred, w, src, src_stride, sse1);
+ } else {
+ besterr = vfp->vf(y + offset, y_stride, src, src_stride, sse1);
+ }
+ *distortion = besterr;
besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
#endif // CONFIG_VP9_HIGHBITDEPTH
-
-
-
+ return besterr;
+}
static INLINE int divide_and_round(const int n, const int d) {
return ((n < 0) ^ (d < 0)) ? ((n - d / 2) / d) : ((n + d / 2) / d);
@@ -365,7 +373,10 @@ int vp9_find_best_sub_pixel_tree_pruned_evenmore(
const uint8_t *second_pred,
int w, int h) {
SETUP_SUBPEL_SEARCH;
- SETUP_CENTER_ERROR;
+ besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp,
+ z, src_stride, y, y_stride, second_pred,
+ w, h, offset, mvjcost, mvcost,
+ sse1, distortion);
(void) halfiters;
(void) quarteriters;
(void) eighthiters;
@@ -441,7 +452,10 @@ int vp9_find_best_sub_pixel_tree_pruned_more(const MACROBLOCK *x,
const uint8_t *second_pred,
int w, int h) {
SETUP_SUBPEL_SEARCH;
- SETUP_CENTER_ERROR;
+ besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp,
+ z, src_stride, y, y_stride, second_pred,
+ w, h, offset, mvjcost, mvcost,
+ sse1, distortion);
if (cost_list &&
cost_list[0] != INT_MAX && cost_list[1] != INT_MAX &&
cost_list[2] != INT_MAX && cost_list[3] != INT_MAX &&
@@ -512,7 +526,10 @@ int vp9_find_best_sub_pixel_tree_pruned(const MACROBLOCK *x,
const uint8_t *second_pred,
int w, int h) {
SETUP_SUBPEL_SEARCH;
- SETUP_CENTER_ERROR;
+ besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp,
+ z, src_stride, y, y_stride, second_pred,
+ w, h, offset, mvjcost, mvcost,
+ sse1, distortion);
if (cost_list &&
cost_list[0] != INT_MAX && cost_list[1] != INT_MAX &&
cost_list[2] != INT_MAX && cost_list[3] != INT_MAX &&
@@ -590,6 +607,13 @@ int vp9_find_best_sub_pixel_tree_pruned(const MACROBLOCK *x,
return besterr;
}
+const MV search_step_table[12] = {
+ // left, right, up, down
+ {0, -4}, {0, 4}, {-4, 0}, {4, 0},
+ {0, -2}, {0, 2}, {-2, 0}, {2, 0},
+ {0, -1}, {0, 1}, {-1, 0}, {1, 0}
+};
+
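search_step_table packs three refinement rounds, half, quarter, and eighth pel, as left/right/up/down offsets of +-4, +-2, +-1 in 1/8-pel units; the loop consumes four entries per round (search_step += 4) while halving hstep. A skeleton of that round structure (the vp9_use_mv_hp() test is folded into allow_hp here):

    static const int kStepTable[3] = { 4, 2, 1 };  /* 1/8-pel units */

    static void subpel_rounds(int forced_stop, int allow_hp) {
      /* forced_stop: 0 = full, 1 = quarter only, 2 = half only. */
      int round = 3 - forced_stop;
      int iter;
      if (!allow_hp && round == 3)
        round = 2;  /* no eighth-pel round without high precision */
      for (iter = 0; iter < round; ++iter) {
        const int hstep = kStepTable[iter];
        /* probe {0,-hstep}, {0,+hstep}, {-hstep,0}, {+hstep,0}, then the
         * diagonal picked from the cheaper side of each axis */
        (void)hstep;
      }
    }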
int vp9_find_best_sub_pixel_tree(const MACROBLOCK *x,
MV *bestmv, const MV *ref_mv,
int allow_hp,
@@ -603,43 +627,129 @@ int vp9_find_best_sub_pixel_tree(const MACROBLOCK *x,
unsigned int *sse1,
const uint8_t *second_pred,
int w, int h) {
- SETUP_SUBPEL_SEARCH;
- SETUP_CENTER_ERROR;
+ const uint8_t *const z = x->plane[0].src.buf;
+ const uint8_t *const src_address = z;
+ const int src_stride = x->plane[0].src.stride;
+ const MACROBLOCKD *xd = &x->e_mbd;
+ unsigned int besterr = INT_MAX;
+ unsigned int sse;
+ unsigned int whichdir = 0;
+ int thismse;
+ const int y_stride = xd->plane[0].pre[0].stride;
+ const int offset = bestmv->row * y_stride + bestmv->col;
+ const uint8_t *const y = xd->plane[0].pre[0].buf;
+
+ int rr = ref_mv->row;
+ int rc = ref_mv->col;
+ int br = bestmv->row * 8;
+ int bc = bestmv->col * 8;
+ int hstep = 4;
+ int iter, round = 3 - forced_stop;
+ const int minc = MAX(x->mv_col_min * 8, ref_mv->col - MV_MAX);
+ const int maxc = MIN(x->mv_col_max * 8, ref_mv->col + MV_MAX);
+ const int minr = MAX(x->mv_row_min * 8, ref_mv->row - MV_MAX);
+ const int maxr = MIN(x->mv_row_max * 8, ref_mv->row + MV_MAX);
+ int tr = br;
+ int tc = bc;
+ const MV *search_step = search_step_table;
+ int idx, best_idx = -1;
+ unsigned int cost_array[5];
+
+ if (!(allow_hp && vp9_use_mv_hp(ref_mv)))
+ if (round == 3)
+ round = 2;
+
+ bestmv->row *= 8;
+ bestmv->col *= 8;
+
+ besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp,
+ z, src_stride, y, y_stride, second_pred,
+ w, h, offset, mvjcost, mvcost,
+ sse1, distortion);
+
(void) cost_list; // to silence compiler warning
- // Each subsequent iteration checks at least one point in
- // common with the last iteration could be 2 ( if diag selected)
- // 1/2 pel
- FIRST_LEVEL_CHECKS;
- if (halfiters > 1) {
- SECOND_LEVEL_CHECKS;
- }
- tr = br;
- tc = bc;
+ for (iter = 0; iter < round; ++iter) {
+ // Check vertical and horizontal sub-pixel positions.
+ for (idx = 0; idx < 4; ++idx) {
+ tr = br + search_step[idx].row;
+ tc = bc + search_step[idx].col;
+ if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) {
+ const uint8_t *const pre_address = y + (tr >> 3) * y_stride + (tc >> 3);
+ int row_offset = (tr & 0x07) << 1;
+ int col_offset = (tc & 0x07) << 1;
+ MV this_mv;
+ this_mv.row = tr;
+ this_mv.col = tc;
+ if (second_pred == NULL)
+ thismse = vfp->svf(pre_address, y_stride, col_offset, row_offset,
+ src_address, src_stride, &sse);
+ else
+ thismse = vfp->svaf(pre_address, y_stride, col_offset, row_offset,
+ src_address, src_stride, &sse, second_pred);
+ cost_array[idx] = thismse +
+ mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit);
+
+ if (cost_array[idx] < besterr) {
+ best_idx = idx;
+ besterr = cost_array[idx];
+ *distortion = thismse;
+ *sse1 = sse;
+ }
+ } else {
+ cost_array[idx] = INT_MAX;
+ }
+ }
- // Each subsequent iteration checks at least one point in common with
- // the last iteration could be 2 ( if diag selected) 1/4 pel
+ // Check diagonal sub-pixel position
+ tc = bc + (cost_array[0] < cost_array[1] ? -hstep : hstep);
+ tr = br + (cost_array[2] < cost_array[3] ? -hstep : hstep);
+ if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) {
+ const uint8_t *const pre_address = y + (tr >> 3) * y_stride + (tc >> 3);
+ int row_offset = (tr & 0x07) << 1;
+ int col_offset = (tc & 0x07) << 1;
+ MV this_mv = {tr, tc};
+ if (second_pred == NULL)
+ thismse = vfp->svf(pre_address, y_stride, col_offset, row_offset,
+ src_address, src_stride, &sse);
+ else
+ thismse = vfp->svaf(pre_address, y_stride, col_offset, row_offset,
+ src_address, src_stride, &sse, second_pred);
+ cost_array[4] = thismse +
+ mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit);
+
+ if (cost_array[4] < besterr) {
+ best_idx = 4;
+ besterr = cost_array[4];
+ *distortion = thismse;
+ *sse1 = sse;
+ }
+ } else {
+      cost_array[4] = INT_MAX;  // idx == 4 after the loop; make it explicit.
+ }
- // Note forced_stop: 0 - full, 1 - qtr only, 2 - half only
- if (forced_stop != 2) {
- hstep >>= 1;
- FIRST_LEVEL_CHECKS;
- if (quarteriters > 1) {
- SECOND_LEVEL_CHECKS;
+ if (best_idx < 4 && best_idx >= 0) {
+ br += search_step[best_idx].row;
+ bc += search_step[best_idx].col;
+ } else if (best_idx == 4) {
+ br = tr;
+ bc = tc;
}
- tr = br;
- tc = bc;
- }
- if (allow_hp && vp9_use_mv_hp(ref_mv) && forced_stop == 0) {
- hstep >>= 1;
- FIRST_LEVEL_CHECKS;
- if (eighthiters > 1) {
+ if (iters_per_step > 1)
SECOND_LEVEL_CHECKS;
- }
+
tr = br;
tc = bc;
+
+ search_step += 4;
+ hstep >>= 1;
+ best_idx = -1;
}
+
+  // Each subsequent iteration checks at least one point in common with the
+  // last iteration (two points if the diagonal was selected).
+
// These lines insure static analysis doesn't warn that
// tr and tc aren't used after the above point.
(void) tr;
@@ -1604,6 +1714,184 @@ int vp9_diamond_search_sad_c(const MACROBLOCK *x,
return bestsad;
}
+static int vector_match(int16_t *ref, int16_t *src, int bwl) {
+ int best_sad = INT_MAX;
+ int this_sad;
+ int d;
+ int center, offset = 0;
+  int bw = 4 << bwl;  // Redundant variable; to be removed in later experiments.
+ for (d = 0; d <= bw; d += 16) {
+ this_sad = vp9_vector_var(&ref[d], src, bwl);
+ if (this_sad < best_sad) {
+ best_sad = this_sad;
+ offset = d;
+ }
+ }
+ center = offset;
+
+ for (d = -8; d <= 8; d += 16) {
+ int this_pos = offset + d;
+ // check limit
+ if (this_pos < 0 || this_pos > bw)
+ continue;
+ this_sad = vp9_vector_var(&ref[this_pos], src, bwl);
+ if (this_sad < best_sad) {
+ best_sad = this_sad;
+ center = this_pos;
+ }
+ }
+ offset = center;
+
+ for (d = -4; d <= 4; d += 8) {
+ int this_pos = offset + d;
+ // check limit
+ if (this_pos < 0 || this_pos > bw)
+ continue;
+ this_sad = vp9_vector_var(&ref[this_pos], src, bwl);
+ if (this_sad < best_sad) {
+ best_sad = this_sad;
+ center = this_pos;
+ }
+ }
+ offset = center;
+
+ for (d = -2; d <= 2; d += 4) {
+ int this_pos = offset + d;
+ // check limit
+ if (this_pos < 0 || this_pos > bw)
+ continue;
+ this_sad = vp9_vector_var(&ref[this_pos], src, bwl);
+ if (this_sad < best_sad) {
+ best_sad = this_sad;
+ center = this_pos;
+ }
+ }
+ offset = center;
+
+ for (d = -1; d <= 1; d += 2) {
+ int this_pos = offset + d;
+ // check limit
+ if (this_pos < 0 || this_pos > bw)
+ continue;
+ this_sad = vp9_vector_var(&ref[this_pos], src, bwl);
+ if (this_sad < best_sad) {
+ best_sad = this_sad;
+ center = this_pos;
+ }
+ }
+
+ return (center - (bw >> 1));
+}
+
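vector_match() above is a coarse-to-fine 1-D search: a stride-16 scan followed by probes at +-8, +-4, +-2, +-1 around the running best, re-centering once per scale, and the final return re-expresses the match as an offset from the block center. The five hand-unrolled loops follow one pattern, sketched generically below (cost stands in for vp9_vector_var()):

    /* Refine around center at scales 8, 4, 2, 1; bw bounds valid positions. */
    static int refine_1d(int (*cost)(int), int center, int best_cost, int bw) {
      int d, s;
      for (d = 8; d >= 1; d >>= 1) {
        const int offset = center;  /* re-center once per scale */
        for (s = -d; s <= d; s += 2 * d) {
          const int p = offset + s;
          int this_cost;
          if (p < 0 || p > bw)
            continue;
          this_cost = cost(p);
          if (this_cost < best_cost) {
            best_cost = this_cost;
            center = p;
          }
        }
      }
      return center;
    }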
+static const MV search_pos[4] = {
+ {-1, 0}, {0, -1}, {0, 1}, {1, 0},
+};
+
+unsigned int vp9_int_pro_motion_estimation(const VP9_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ DECLARE_ALIGNED(16, int16_t, hbuf[128]);
+ DECLARE_ALIGNED(16, int16_t, vbuf[128]);
+ DECLARE_ALIGNED(16, int16_t, src_hbuf[64]);
+ DECLARE_ALIGNED(16, int16_t, src_vbuf[64]);
+ int idx;
+ const int bw = 4 << b_width_log2_lookup[bsize];
+ const int bh = 4 << b_height_log2_lookup[bsize];
+ const int search_width = bw << 1;
+ const int search_height = bh << 1;
+ const int src_stride = x->plane[0].src.stride;
+ const int ref_stride = xd->plane[0].pre[0].stride;
+ uint8_t const *ref_buf, *src_buf;
+ MV *tmp_mv = &xd->mi[0]->mbmi.mv[0].as_mv;
+ unsigned int best_sad, tmp_sad, this_sad[4];
+ MV this_mv;
+ const int norm_factor = 3 + (bw >> 5);
+
+#if CONFIG_VP9_HIGHBITDEPTH
+ tmp_mv->row = 0;
+ tmp_mv->col = 0;
+ return cpi->fn_ptr[bsize].sdf(x->plane[0].src.buf, src_stride,
+ xd->plane[0].pre[0].buf, ref_stride);
+#endif
+
+ // Set up prediction 1-D reference set
+ ref_buf = xd->plane[0].pre[0].buf - (bw >> 1);
+ for (idx = 0; idx < search_width; idx += 16) {
+ vp9_int_pro_row(&hbuf[idx], ref_buf, ref_stride, bh);
+ ref_buf += 16;
+ }
+
+ ref_buf = xd->plane[0].pre[0].buf - (bh >> 1) * ref_stride;
+ for (idx = 0; idx < search_height; ++idx) {
+ vbuf[idx] = vp9_int_pro_col(ref_buf, bw) >> norm_factor;
+ ref_buf += ref_stride;
+ }
+
+ // Set up src 1-D reference set
+ for (idx = 0; idx < bw; idx += 16) {
+ src_buf = x->plane[0].src.buf + idx;
+ vp9_int_pro_row(&src_hbuf[idx], src_buf, src_stride, bh);
+ }
+
+ src_buf = x->plane[0].src.buf;
+ for (idx = 0; idx < bh; ++idx) {
+ src_vbuf[idx] = vp9_int_pro_col(src_buf, bw) >> norm_factor;
+ src_buf += src_stride;
+ }
+
+ // Find the best match per 1-D search
+ tmp_mv->col = vector_match(hbuf, src_hbuf, b_width_log2_lookup[bsize]);
+ tmp_mv->row = vector_match(vbuf, src_vbuf, b_height_log2_lookup[bsize]);
+
+ this_mv = *tmp_mv;
+ src_buf = x->plane[0].src.buf;
+ ref_buf = xd->plane[0].pre[0].buf + this_mv.row * ref_stride + this_mv.col;
+ best_sad = cpi->fn_ptr[bsize].sdf(src_buf, src_stride, ref_buf, ref_stride);
+
+ {
+ const uint8_t * const pos[4] = {
+ ref_buf - ref_stride,
+ ref_buf - 1,
+ ref_buf + 1,
+ ref_buf + ref_stride,
+ };
+
+ cpi->fn_ptr[bsize].sdx4df(src_buf, src_stride, pos, ref_stride, this_sad);
+ }
+
+ for (idx = 0; idx < 4; ++idx) {
+ if (this_sad[idx] < best_sad) {
+ best_sad = this_sad[idx];
+ tmp_mv->row = search_pos[idx].row + this_mv.row;
+ tmp_mv->col = search_pos[idx].col + this_mv.col;
+ }
+ }
+
+ if (this_sad[0] < this_sad[3])
+ this_mv.row -= 1;
+ else
+ this_mv.row += 1;
+
+ if (this_sad[1] < this_sad[2])
+ this_mv.col -= 1;
+ else
+ this_mv.col += 1;
+
+ ref_buf = xd->plane[0].pre[0].buf + this_mv.row * ref_stride + this_mv.col;
+
+ tmp_sad = cpi->fn_ptr[bsize].sdf(src_buf, src_stride,
+ ref_buf, ref_stride);
+ if (best_sad > tmp_sad) {
+ *tmp_mv = this_mv;
+ best_sad = tmp_sad;
+ }
+
+ tmp_mv->row *= 8;
+ tmp_mv->col *= 8;
+
+ return best_sad;
+}
+
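vp9_int_pro_motion_estimation() avoids full 2-D SAD scans by collapsing blocks into row and column projections and matching those 1-D signatures with vector_match(). A minimal sketch of the projection step (the >> norm_factor normalization used above is omitted):

    #include <stdint.h>

    /* Collapse a bw x bh block into a column-sum horizontal signature and
     * a row-sum vertical signature. */
    static void project(const uint8_t *buf, int stride, int bw, int bh,
                        int16_t *hproj, int16_t *vproj) {
      int r, c;
      for (c = 0; c < bw; ++c) {  /* column sums -> horizontal signature */
        int sum = 0;
        for (r = 0; r < bh; ++r)
          sum += buf[r * stride + c];
        hproj[c] = (int16_t)sum;
      }
      for (r = 0; r < bh; ++r) {  /* row sums -> vertical signature */
        int sum = 0;
        for (c = 0; c < bw; ++c)
          sum += buf[r * stride + c];
        vproj[r] = (int16_t)sum;
      }
    }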
/* do_refine: If last step (1-away) of n-step search doesn't pick the center
point as the best match, we will do a final 1-away diamond
refining search */
@@ -1654,7 +1942,7 @@ int vp9_full_pixel_diamond(const VP9_COMP *cpi, MACROBLOCK *x,
if (do_refine) {
const int search_range = 8;
MV best_mv = *dst_mv;
- thissme = cpi->refining_search_sad(x, &best_mv, sadpb, search_range,
+ thissme = vp9_refining_search_sad(x, &best_mv, sadpb, search_range,
fn_ptr, ref_mv);
if (thissme < INT_MAX)
thissme = vp9_get_mvpred_var(x, &best_mv, ref_mv, fn_ptr, 1);
@@ -1729,7 +2017,7 @@ int vp9_full_search_sadx3(const MACROBLOCK *x, const MV *ref_mv,
if (fn_ptr->sdx3f != NULL) {
while ((c + 2) < col_max) {
int i;
- unsigned int sads[3];
+ DECLARE_ALIGNED(16, uint32_t, sads[3]);
fn_ptr->sdx3f(what->buf, what->stride, check_here, in_what->stride,
sads);
@@ -1794,7 +2082,7 @@ int vp9_full_search_sadx8(const MACROBLOCK *x, const MV *ref_mv,
if (fn_ptr->sdx8f != NULL) {
while ((c + 7) < col_max) {
int i;
- unsigned int sads[8];
+ DECLARE_ALIGNED(16, uint32_t, sads[8]);
fn_ptr->sdx8f(what->buf, what->stride, check_here, in_what->stride,
sads);
@@ -1818,7 +2106,7 @@ int vp9_full_search_sadx8(const MACROBLOCK *x, const MV *ref_mv,
if (fn_ptr->sdx3f != NULL) {
while ((c + 2) < col_max) {
int i;
- unsigned int sads[3];
+ DECLARE_ALIGNED(16, uint32_t, sads[3]);
fn_ptr->sdx3f(what->buf, what->stride, check_here, in_what->stride,
sads);
@@ -1858,11 +2146,11 @@ int vp9_full_search_sadx8(const MACROBLOCK *x, const MV *ref_mv,
return best_sad;
}
-int vp9_refining_search_sad_c(const MACROBLOCK *x,
- MV *ref_mv, int error_per_bit,
- int search_range,
- const vp9_variance_fn_ptr_t *fn_ptr,
- const MV *center_mv) {
+int vp9_refining_search_sad(const MACROBLOCK *x,
+ MV *ref_mv, int error_per_bit,
+ int search_range,
+ const vp9_variance_fn_ptr_t *fn_ptr,
+ const MV *center_mv) {
const MACROBLOCKD *const xd = &x->e_mbd;
const MV neighbors[4] = {{ -1, 0}, {0, -1}, {0, 1}, {1, 0}};
const struct buf_2d *const what = &x->plane[0].src;
@@ -2029,7 +2317,7 @@ int vp9_full_pixel_search(VP9_COMP *cpi, MACROBLOCK *x,
1, cost_list, fn_ptr, ref_mv, tmp_mv);
break;
default:
- assert(!"Invalid search method.");
+ assert(0 && "Invalid search method.");
}
if (method != NSTEP && rd && var < var_max)
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_mcomp.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_mcomp.h
index 9ddca250c7c..dd8a4607942 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_mcomp.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_mcomp.h
@@ -66,7 +66,13 @@ struct SPEED_FEATURES;
int vp9_init_search_range(int size);
-// Runs sequence of diamond searches in smaller steps for RD
+int vp9_refining_search_sad(const struct macroblock *x,
+ struct mv *ref_mv,
+ int sad_per_bit, int distance,
+ const struct vp9_variance_vtable *fn_ptr,
+ const struct mv *center_mv);
+
+// Runs sequence of diamond searches in smaller steps for RD.
int vp9_full_pixel_diamond(const struct VP9_COMP *cpi, MACROBLOCK *x,
MV *mvp_full, int step_param,
int sadpb, int further_steps, int do_refine,
@@ -74,6 +80,11 @@ int vp9_full_pixel_diamond(const struct VP9_COMP *cpi, MACROBLOCK *x,
const vp9_variance_fn_ptr_t *fn_ptr,
const MV *ref_mv, MV *dst_mv);
+// Perform integral-projection-based motion estimation.
+unsigned int vp9_int_pro_motion_estimation(const struct VP9_COMP *cpi,
+ MACROBLOCK *x,
+ BLOCK_SIZE bsize);
+
typedef int (integer_mv_pattern_search_fn) (
const MACROBLOCK *x,
MV *ref_mv,
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_picklpf.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_picklpf.c
index 85984fd7ef6..5eb5d542b78 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_picklpf.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_picklpf.c
@@ -33,16 +33,23 @@ static int get_max_filter_level(const VP9_COMP *cpi) {
}
-static int try_filter_frame(const YV12_BUFFER_CONFIG *sd, VP9_COMP *const cpi,
- int filt_level, int partial_frame) {
+static int64_t try_filter_frame(const YV12_BUFFER_CONFIG *sd,
+ VP9_COMP *const cpi,
+ int filt_level, int partial_frame) {
VP9_COMMON *const cm = &cpi->common;
- int filt_err;
+ int64_t filt_err;
+
+ if (cpi->num_workers > 1)
+ vp9_loop_filter_frame_mt(cm->frame_to_show, cm, cpi->td.mb.e_mbd.plane,
+ filt_level, 1, partial_frame,
+ cpi->workers, cpi->num_workers, &cpi->lf_row_sync);
+ else
+ vp9_loop_filter_frame(cm->frame_to_show, cm, &cpi->td.mb.e_mbd, filt_level,
+ 1, partial_frame);
- vp9_loop_filter_frame(cm->frame_to_show, cm, &cpi->mb.e_mbd, filt_level, 1,
- partial_frame);
#if CONFIG_VP9_HIGHBITDEPTH
if (cm->use_highbitdepth) {
- filt_err = vp9_highbd_get_y_sse(sd, cm->frame_to_show, cm->bit_depth);
+ filt_err = vp9_highbd_get_y_sse(sd, cm->frame_to_show);
} else {
filt_err = vp9_get_y_sse(sd, cm->frame_to_show);
}
@@ -63,17 +70,18 @@ static int search_filter_level(const YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi,
const int min_filter_level = 0;
const int max_filter_level = get_max_filter_level(cpi);
int filt_direction = 0;
- int best_err, filt_best;
+ int64_t best_err;
+ int filt_best;
// Start the search at the previous frame filter level unless it is now out of
// range.
int filt_mid = clamp(lf->filter_level, min_filter_level, max_filter_level);
int filter_step = filt_mid < 16 ? 4 : filt_mid / 4;
// Sum squared error at each filter level
- int ss_err[MAX_LOOP_FILTER + 1];
+ int64_t ss_err[MAX_LOOP_FILTER + 1];
// Set each entry to -1
- vpx_memset(ss_err, 0xFF, sizeof(ss_err));
+ memset(ss_err, 0xFF, sizeof(ss_err));
// Make a copy of the unfiltered / processed recon buffer
vpx_yv12_copy_y(cm->frame_to_show, &cpi->last_frame_uf);
@@ -87,7 +95,7 @@ static int search_filter_level(const YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi,
const int filt_low = MAX(filt_mid - filter_step, min_filter_level);
// Bias against raising loop filter in favor of lowering it.
- int bias = (best_err >> (15 - (filt_mid / 8))) * filter_step;
+ int64_t bias = (best_err >> (15 - (filt_mid / 8))) * filter_step;
if ((cpi->oxcf.pass == 2) && (cpi->twopass.section_intra_rating < 20))
bias = (bias * cpi->twopass.section_intra_rating) / 20;
@@ -153,7 +161,7 @@ void vp9_pick_filter_level(const YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi,
const int q = vp9_ac_quant(cm->base_qindex, 0, cm->bit_depth);
// These values were determined by linearly fitting the searched filter
// level against q: filt_guess = q * 0.316206 + 3.87252
-#if CONFIG_VP9_HIGHDEPTH
+#if CONFIG_VP9_HIGHBITDEPTH
int filt_guess;
switch (cm->bit_depth) {
case VPX_BITS_8:
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_pickmode.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_pickmode.c
index b74b2dd56a2..9fb7cfba7bf 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_pickmode.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_pickmode.c
@@ -20,9 +20,11 @@
#include "vp9/common/vp9_blockd.h"
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_mvref_common.h"
+#include "vp9/common/vp9_pred_common.h"
#include "vp9/common/vp9_reconinter.h"
#include "vp9/common/vp9_reconintra.h"
+#include "vp9/encoder/vp9_cost.h"
#include "vp9/encoder/vp9_encoder.h"
#include "vp9/encoder/vp9_pickmode.h"
#include "vp9/encoder/vp9_ratectrl.h"
@@ -49,7 +51,7 @@ static int mv_refs_rt(const VP9_COMMON *cm, const MACROBLOCKD *xd,
int const_motion = 0;
// Blank the reference vector list
- vpx_memset(mv_ref_list, 0, sizeof(*mv_ref_list) * MAX_MV_REF_CANDIDATES);
+ memset(mv_ref_list, 0, sizeof(*mv_ref_list) * MAX_MV_REF_CANDIDATES);
// The nearest 2 blocks are treated differently
// if the size < 8x8 we get the mv from the bmi substructure,
@@ -58,14 +60,15 @@ static int mv_refs_rt(const VP9_COMMON *cm, const MACROBLOCKD *xd,
const POSITION *const mv_ref = &mv_ref_search[i];
if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) {
const MODE_INFO *const candidate_mi = xd->mi[mv_ref->col + mv_ref->row *
- xd->mi_stride].src_mi;
+ xd->mi_stride];
const MB_MODE_INFO *const candidate = &candidate_mi->mbmi;
// Keep counts for entropy encoding.
context_counter += mode_2_counter[candidate->mode];
different_ref_found = 1;
if (candidate->ref_frame[0] == ref_frame)
- ADD_MV_REF_LIST(get_sub_block_mv(candidate_mi, 0, mv_ref->col, -1));
+ ADD_MV_REF_LIST(get_sub_block_mv(candidate_mi, 0, mv_ref->col, -1),
+ refmv_count, mv_ref_list, Done);
}
}
@@ -78,11 +81,11 @@ static int mv_refs_rt(const VP9_COMMON *cm, const MACROBLOCKD *xd,
const POSITION *const mv_ref = &mv_ref_search[i];
if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) {
const MB_MODE_INFO *const candidate = &xd->mi[mv_ref->col + mv_ref->row *
- xd->mi_stride].src_mi->mbmi;
+ xd->mi_stride]->mbmi;
different_ref_found = 1;
if (candidate->ref_frame[0] == ref_frame)
- ADD_MV_REF_LIST(candidate->mv[0]);
+ ADD_MV_REF_LIST(candidate->mv[0], refmv_count, mv_ref_list, Done);
}
}
@@ -94,10 +97,11 @@ static int mv_refs_rt(const VP9_COMMON *cm, const MACROBLOCKD *xd,
const POSITION *mv_ref = &mv_ref_search[i];
if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) {
const MB_MODE_INFO *const candidate = &xd->mi[mv_ref->col + mv_ref->row
- * xd->mi_stride].src_mi->mbmi;
+ * xd->mi_stride]->mbmi;
// If the candidate is INTRA we don't want to consider its mv.
- IF_DIFF_REF_FRAME_ADD_MV(candidate);
+ IF_DIFF_REF_FRAME_ADD_MV(candidate, ref_frame, ref_sign_bias,
+ refmv_count, mv_ref_list, Done);
}
}
}
@@ -118,7 +122,7 @@ static int combined_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
int_mv *tmp_mv, int *rate_mv,
int64_t best_rd_sofar) {
MACROBLOCKD *xd = &x->e_mbd;
- MB_MODE_INFO *mbmi = &xd->mi[0].src_mi->mbmi;
+ MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0, 0}};
const int step_param = cpi->sf.mv.fullpel_search_step_param;
const int sadpb = x->sadperbit16;
@@ -135,10 +139,6 @@ static int combined_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
int cost_list[5];
const YV12_BUFFER_CONFIG *scaled_ref_frame = vp9_get_scaled_ref_frame(cpi,
ref);
- if (cpi->common.show_frame &&
- (x->pred_mv_sad[ref] >> 3) > x->pred_mv_sad[LAST_FRAME])
- return rv;
-
if (scaled_ref_frame) {
int i;
// Swap out the reference frame for a version that's been scaled to
@@ -190,7 +190,8 @@ static int combined_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
cond_cost_list(cpi, cost_list),
x->nmvjointcost, x->mvcost,
&dis, &x->pred_sse[ref], NULL, 0, 0);
- x->pred_mv[ref] = tmp_mv->as_mv;
+ *rate_mv = vp9_mv_bit_cost(&tmp_mv->as_mv, &ref_mv,
+ x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
}
if (scaled_ref_frame) {
@@ -201,6 +202,250 @@ static int combined_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
return rv;
}
+static void block_variance(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ int w, int h, unsigned int *sse, int *sum,
+ int block_size, unsigned int *sse8x8,
+ int *sum8x8, unsigned int *var8x8) {
+ int i, j, k = 0;
+
+ *sse = 0;
+ *sum = 0;
+
+ for (i = 0; i < h; i += block_size) {
+ for (j = 0; j < w; j += block_size) {
+ vp9_get8x8var(src + src_stride * i + j, src_stride,
+ ref + ref_stride * i + j, ref_stride,
+ &sse8x8[k], &sum8x8[k]);
+ *sse += sse8x8[k];
+ *sum += sum8x8[k];
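+      // Per-8x8 variance: sse - mean^2 * 64, where sum^2 >> 6 is sum^2 / 64.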
+ var8x8[k] = sse8x8[k] - (((unsigned int)sum8x8[k] * sum8x8[k]) >> 6);
+ k++;
+ }
+ }
+}
+
+static void calculate_variance(int bw, int bh, TX_SIZE tx_size,
+ unsigned int *sse_i, int *sum_i,
+ unsigned int *var_o, unsigned int *sse_o,
+ int *sum_o) {
+ const BLOCK_SIZE unit_size = txsize_to_bsize[tx_size];
+ const int nw = 1 << (bw - b_width_log2_lookup[unit_size]);
+ const int nh = 1 << (bh - b_height_log2_lookup[unit_size]);
+ int i, j, k = 0;
+
+ for (i = 0; i < nh; i += 2) {
+ for (j = 0; j < nw; j += 2) {
+ sse_o[k] = sse_i[i * nw + j] + sse_i[i * nw + j + 1] +
+ sse_i[(i + 1) * nw + j] + sse_i[(i + 1) * nw + j + 1];
+ sum_o[k] = sum_i[i * nw + j] + sum_i[i * nw + j + 1] +
+ sum_i[(i + 1) * nw + j] + sum_i[(i + 1) * nw + j + 1];
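+      // The normalizing shift below is log2 of the merged block's pixel
+      // count (a 2x2 group of unit_size blocks).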
+ var_o[k] = sse_o[k] - (((unsigned int)sum_o[k] * sum_o[k]) >>
+ (b_width_log2_lookup[unit_size] +
+ b_height_log2_lookup[unit_size] + 6));
+ k++;
+ }
+ }
+}
+
+static void model_rd_for_sb_y_large(VP9_COMP *cpi, BLOCK_SIZE bsize,
+ MACROBLOCK *x, MACROBLOCKD *xd,
+ int *out_rate_sum, int64_t *out_dist_sum,
+ unsigned int *var_y, unsigned int *sse_y,
+ int mi_row, int mi_col, int *early_term) {
+ // Note our transform coeffs are 8 times an orthogonal transform.
+ // Hence quantizer step is also 8 times. To get effective quantizer
+ // we need to divide by 8 before sending to modeling function.
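+  // For example, with 8-bit input a dequant step of 40 becomes an effective
+  // quantizer of 40 >> 3 = 5 (>> (bd - 5) in the high bit-depth case).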
+ unsigned int sse;
+ int rate;
+ int64_t dist;
+ struct macroblock_plane *const p = &x->plane[0];
+ struct macroblockd_plane *const pd = &xd->plane[0];
+ const uint32_t dc_quant = pd->dequant[0];
+ const uint32_t ac_quant = pd->dequant[1];
+ const int64_t dc_thr = dc_quant * dc_quant >> 6;
+ const int64_t ac_thr = ac_quant * ac_quant >> 6;
+ unsigned int var;
+ int sum;
+ int skip_dc = 0;
+
+ const int bw = b_width_log2_lookup[bsize];
+ const int bh = b_height_log2_lookup[bsize];
+ const int num8x8 = 1 << (bw + bh - 2);
+ unsigned int sse8x8[64] = {0};
+ int sum8x8[64] = {0};
+ unsigned int var8x8[64] = {0};
+ TX_SIZE tx_size;
+ int i, k;
+
+ // Calculate variance for whole partition, and also save 8x8 blocks' variance
+ // to be used in following transform skipping test.
+ block_variance(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride,
+ 4 << bw, 4 << bh, &sse, &sum, 8, sse8x8, sum8x8, var8x8);
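+  // bw and bh are log2 dimensions in 4-pel units, so bw + bh + 4 is log2 of
+  // the partition's pixel count.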
+ var = sse - (((int64_t)sum * sum) >> (bw + bh + 4));
+
+ *var_y = var;
+ *sse_y = sse;
+
+ if (cpi->common.tx_mode == TX_MODE_SELECT) {
+ if (sse > (var << 2))
+ tx_size = MIN(max_txsize_lookup[bsize],
+ tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
+ else
+ tx_size = TX_8X8;
+
+ if (cpi->sf.partition_search_type == VAR_BASED_PARTITION) {
+ if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ &&
+ cyclic_refresh_segment_id_boosted(xd->mi[0]->mbmi.segment_id))
+ tx_size = TX_8X8;
+ else if (tx_size > TX_16X16)
+ tx_size = TX_16X16;
+ }
+ } else {
+ tx_size = MIN(max_txsize_lookup[bsize],
+ tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
+ }
+
+ assert(tx_size >= TX_8X8);
+ xd->mi[0]->mbmi.tx_size = tx_size;
+
+  // Evaluate if the partition block is a skippable block in the Y plane.
+ {
+ unsigned int sse16x16[16] = {0};
+ int sum16x16[16] = {0};
+ unsigned int var16x16[16] = {0};
+ const int num16x16 = num8x8 >> 2;
+
+ unsigned int sse32x32[4] = {0};
+ int sum32x32[4] = {0};
+ unsigned int var32x32[4] = {0};
+ const int num32x32 = num8x8 >> 4;
+
+ int ac_test = 1;
+ int dc_test = 1;
+ const int num = (tx_size == TX_8X8) ? num8x8 :
+ ((tx_size == TX_16X16) ? num16x16 : num32x32);
+ const unsigned int *sse_tx = (tx_size == TX_8X8) ? sse8x8 :
+ ((tx_size == TX_16X16) ? sse16x16 : sse32x32);
+ const unsigned int *var_tx = (tx_size == TX_8X8) ? var8x8 :
+ ((tx_size == TX_16X16) ? var16x16 : var32x32);
+
+ // Calculate variance if tx_size > TX_8X8
+ if (tx_size >= TX_16X16)
+ calculate_variance(bw, bh, TX_8X8, sse8x8, sum8x8, var16x16, sse16x16,
+ sum16x16);
+ if (tx_size == TX_32X32)
+ calculate_variance(bw, bh, TX_16X16, sse16x16, sum16x16, var32x32,
+ sse32x32, sum32x32);
+
+ // Skipping test
+ x->skip_txfm[0] = 0;
+ for (k = 0; k < num; k++)
+ // Check if all ac coefficients can be quantized to zero.
+ if (!(var_tx[k] < ac_thr || var == 0)) {
+ ac_test = 0;
+ break;
+ }
+
+ for (k = 0; k < num; k++)
+ // Check if dc coefficient can be quantized to zero.
+ if (!(sse_tx[k] - var_tx[k] < dc_thr || sse == var)) {
+ dc_test = 0;
+ break;
+ }
+
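+    // skip_txfm encoding: 0 = no skip, 2 = all AC coefficients quantize to
+    // zero, 1 = both the AC and DC coefficients quantize to zero.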
+ if (ac_test) {
+ x->skip_txfm[0] = 2;
+
+ if (dc_test)
+ x->skip_txfm[0] = 1;
+ } else if (dc_test) {
+ skip_dc = 1;
+ }
+ }
+
+ if (x->skip_txfm[0] == 1) {
+ int skip_uv[2] = {0};
+ unsigned int var_uv[2];
+ unsigned int sse_uv[2];
+
+ *out_rate_sum = 0;
+ *out_dist_sum = sse << 4;
+
+ // Transform skipping test in UV planes.
+ for (i = 1; i <= 2; i++) {
+ struct macroblock_plane *const p = &x->plane[i];
+ struct macroblockd_plane *const pd = &xd->plane[i];
+ const TX_SIZE uv_tx_size = get_uv_tx_size(&xd->mi[0]->mbmi, pd);
+ const BLOCK_SIZE unit_size = txsize_to_bsize[uv_tx_size];
+ const BLOCK_SIZE uv_bsize = get_plane_block_size(bsize, pd);
+ const int uv_bw = b_width_log2_lookup[uv_bsize];
+ const int uv_bh = b_height_log2_lookup[uv_bsize];
+ const int sf = (uv_bw - b_width_log2_lookup[unit_size]) +
+ (uv_bh - b_height_log2_lookup[unit_size]);
+ const uint32_t uv_dc_thr = pd->dequant[0] * pd->dequant[0] >> (6 - sf);
+ const uint32_t uv_ac_thr = pd->dequant[1] * pd->dequant[1] >> (6 - sf);
+ int j = i - 1;
+
+ vp9_build_inter_predictors_sbp(xd, mi_row, mi_col, bsize, i);
+ var_uv[j] = cpi->fn_ptr[uv_bsize].vf(p->src.buf, p->src.stride,
+ pd->dst.buf, pd->dst.stride, &sse_uv[j]);
+
+ if ((var_uv[j] < uv_ac_thr || var_uv[j] == 0) &&
+ (sse_uv[j] - var_uv[j] < uv_dc_thr || sse_uv[j] == var_uv[j]))
+ skip_uv[j] = 1;
+ else
+ break;
+ }
+
+    // If the transforms in all of the YUV planes are skippable, the mode
+    // search checks fewer inter modes and doesn't check intra modes.
+ if (skip_uv[0] & skip_uv[1]) {
+ *early_term = 1;
+ }
+
+ return;
+ }
+
+ if (!skip_dc) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ vp9_model_rd_from_var_lapndz(sse - var, num_pels_log2_lookup[bsize],
+ dc_quant >> (xd->bd - 5), &rate, &dist);
+ } else {
+ vp9_model_rd_from_var_lapndz(sse - var, num_pels_log2_lookup[bsize],
+ dc_quant >> 3, &rate, &dist);
+ }
+#else
+ vp9_model_rd_from_var_lapndz(sse - var, num_pels_log2_lookup[bsize],
+ dc_quant >> 3, &rate, &dist);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ }
+
+ if (!skip_dc) {
+ *out_rate_sum = rate >> 1;
+ *out_dist_sum = dist << 3;
+ } else {
+ *out_rate_sum = 0;
+ *out_dist_sum = (sse - var) << 4;
+ }
+
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ vp9_model_rd_from_var_lapndz(var, num_pels_log2_lookup[bsize],
+ ac_quant >> (xd->bd - 5), &rate, &dist);
+ } else {
+ vp9_model_rd_from_var_lapndz(var, num_pels_log2_lookup[bsize],
+ ac_quant >> 3, &rate, &dist);
+ }
+#else
+ vp9_model_rd_from_var_lapndz(var, num_pels_log2_lookup[bsize],
+ ac_quant >> 3, &rate, &dist);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+ *out_rate_sum += rate;
+ *out_dist_sum += dist << 4;
+}
static void model_rd_for_sb_y(VP9_COMP *cpi, BLOCK_SIZE bsize,
MACROBLOCK *x, MACROBLOCKD *xd,
@@ -214,79 +459,298 @@ static void model_rd_for_sb_y(VP9_COMP *cpi, BLOCK_SIZE bsize,
int64_t dist;
struct macroblock_plane *const p = &x->plane[0];
struct macroblockd_plane *const pd = &xd->plane[0];
+ const int64_t dc_thr = p->quant_thred[0] >> 6;
+ const int64_t ac_thr = p->quant_thred[1] >> 6;
const uint32_t dc_quant = pd->dequant[0];
const uint32_t ac_quant = pd->dequant[1];
unsigned int var = cpi->fn_ptr[bsize].vf(p->src.buf, p->src.stride,
pd->dst.buf, pd->dst.stride, &sse);
+ int skip_dc = 0;
+
*var_y = var;
*sse_y = sse;
- if (sse < dc_quant * dc_quant >> 6)
- x->skip_txfm[0] = 1;
- else if (var < ac_quant * ac_quant >> 6)
- x->skip_txfm[0] = 2;
- else
- x->skip_txfm[0] = 0;
-
if (cpi->common.tx_mode == TX_MODE_SELECT) {
if (sse > (var << 2))
- xd->mi[0].src_mi->mbmi.tx_size =
+ xd->mi[0]->mbmi.tx_size =
MIN(max_txsize_lookup[bsize],
tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
else
- xd->mi[0].src_mi->mbmi.tx_size = TX_8X8;
-
- if (cpi->sf.partition_search_type == VAR_BASED_PARTITION &&
- xd->mi[0].src_mi->mbmi.tx_size > TX_16X16)
- xd->mi[0].src_mi->mbmi.tx_size = TX_16X16;
+ xd->mi[0]->mbmi.tx_size = TX_8X8;
+
+ if (cpi->sf.partition_search_type == VAR_BASED_PARTITION) {
+ if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ &&
+ cyclic_refresh_segment_id_boosted(xd->mi[0]->mbmi.segment_id))
+ xd->mi[0]->mbmi.tx_size = TX_8X8;
+ else if (xd->mi[0]->mbmi.tx_size > TX_16X16)
+ xd->mi[0]->mbmi.tx_size = TX_16X16;
+ }
} else {
- xd->mi[0].src_mi->mbmi.tx_size =
+ xd->mi[0]->mbmi.tx_size =
MIN(max_txsize_lookup[bsize],
tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
}
-#if CONFIG_VP9_HIGHBITDEPTH
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- vp9_model_rd_from_var_lapndz(sse - var, 1 << num_pels_log2_lookup[bsize],
- dc_quant >> (xd->bd - 5), &rate, &dist);
- } else {
- vp9_model_rd_from_var_lapndz(sse - var, 1 << num_pels_log2_lookup[bsize],
- dc_quant >> 3, &rate, &dist);
+  // Evaluate if the partition block is a skippable block in the Y plane.
+ {
+ const BLOCK_SIZE unit_size =
+ txsize_to_bsize[xd->mi[0]->mbmi.tx_size];
+ const unsigned int num_blk_log2 =
+ (b_width_log2_lookup[bsize] - b_width_log2_lookup[unit_size]) +
+ (b_height_log2_lookup[bsize] - b_height_log2_lookup[unit_size]);
+ const unsigned int sse_tx = sse >> num_blk_log2;
+ const unsigned int var_tx = var >> num_blk_log2;
+
+ x->skip_txfm[0] = 0;
+ // Check if all ac coefficients can be quantized to zero.
+ if (var_tx < ac_thr || var == 0) {
+ x->skip_txfm[0] = 2;
+ // Check if dc coefficient can be quantized to zero.
+ if (sse_tx - var_tx < dc_thr || sse == var)
+ x->skip_txfm[0] = 1;
+ } else {
+ if (sse_tx - var_tx < dc_thr || sse == var)
+ skip_dc = 1;
+ }
}
+
+ if (x->skip_txfm[0] == 1) {
+ *out_rate_sum = 0;
+ *out_dist_sum = sse << 4;
+ return;
+ }
+
+ if (!skip_dc) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ vp9_model_rd_from_var_lapndz(sse - var, num_pels_log2_lookup[bsize],
+ dc_quant >> (xd->bd - 5), &rate, &dist);
+ } else {
+ vp9_model_rd_from_var_lapndz(sse - var, num_pels_log2_lookup[bsize],
+ dc_quant >> 3, &rate, &dist);
+ }
#else
- vp9_model_rd_from_var_lapndz(sse - var, 1 << num_pels_log2_lookup[bsize],
- dc_quant >> 3, &rate, &dist);
+ vp9_model_rd_from_var_lapndz(sse - var, num_pels_log2_lookup[bsize],
+ dc_quant >> 3, &rate, &dist);
#endif // CONFIG_VP9_HIGHBITDEPTH
+ }
- *out_rate_sum = rate >> 1;
- *out_dist_sum = dist << 3;
+ if (!skip_dc) {
+ *out_rate_sum = rate >> 1;
+ *out_dist_sum = dist << 3;
+ } else {
+ *out_rate_sum = 0;
+ *out_dist_sum = (sse - var) << 4;
+ }
#if CONFIG_VP9_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- vp9_model_rd_from_var_lapndz(var,
- 1 << num_pels_log2_lookup[bsize],
- ac_quant >> (xd->bd - 5),
- &rate,
- &dist);
+ vp9_model_rd_from_var_lapndz(var, num_pels_log2_lookup[bsize],
+ ac_quant >> (xd->bd - 5), &rate, &dist);
} else {
- vp9_model_rd_from_var_lapndz(var,
- 1 << num_pels_log2_lookup[bsize],
- ac_quant >> 3,
- &rate,
- &dist);
+ vp9_model_rd_from_var_lapndz(var, num_pels_log2_lookup[bsize],
+ ac_quant >> 3, &rate, &dist);
}
#else
- vp9_model_rd_from_var_lapndz(var,
- 1 << num_pels_log2_lookup[bsize],
- ac_quant >> 3,
- &rate,
- &dist);
+ vp9_model_rd_from_var_lapndz(var, num_pels_log2_lookup[bsize],
+ ac_quant >> 3, &rate, &dist);
#endif // CONFIG_VP9_HIGHBITDEPTH
*out_rate_sum += rate;
*out_dist_sum += dist << 4;
}
+#if CONFIG_VP9_HIGHBITDEPTH
+static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int64_t *dist,
+ int *skippable, int64_t *sse, int plane,
+ BLOCK_SIZE bsize, TX_SIZE tx_size) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ unsigned int var_y, sse_y;
+ (void)plane;
+ (void)tx_size;
+ model_rd_for_sb_y(cpi, bsize, x, xd, rate, dist, &var_y, &sse_y);
+ *sse = INT_MAX;
+ *skippable = 0;
+ return;
+}
+#else
+static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int64_t *dist,
+ int *skippable, int64_t *sse, int plane,
+ BLOCK_SIZE bsize, TX_SIZE tx_size) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ const struct macroblockd_plane *pd = &xd->plane[plane];
+ const struct macroblock_plane *const p = &x->plane[plane];
+ const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize];
+ const int num_4x4_h = num_4x4_blocks_high_lookup[bsize];
+ const int step = 1 << (tx_size << 1);
+ const int block_step = (1 << tx_size);
+ int block = 0, r, c;
+ int shift = tx_size == TX_32X32 ? 0 : 2;
+ const int max_blocks_wide = num_4x4_w + (xd->mb_to_right_edge >= 0 ? 0 :
+ xd->mb_to_right_edge >> (5 + pd->subsampling_x));
+ const int max_blocks_high = num_4x4_h + (xd->mb_to_bottom_edge >= 0 ? 0 :
+ xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
+ int eob_cost = 0;
+
+ (void)cpi;
+ vp9_subtract_plane(x, bsize, plane);
+ *skippable = 1;
+ // Keep track of the row and column of the blocks we use so that we know
+ // if we are in the unrestricted motion border.
+ for (r = 0; r < max_blocks_high; r += block_step) {
+ for (c = 0; c < num_4x4_w; c += block_step) {
+ if (c < max_blocks_wide) {
+ const scan_order *const scan_order = &vp9_default_scan_orders[tx_size];
+ tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
+ tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+ tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+ uint16_t *const eob = &p->eobs[block];
+ const int diff_stride = 4 * num_4x4_blocks_wide_lookup[bsize];
+ const int16_t *src_diff;
+ src_diff = &p->src_diff[(r * diff_stride + c) << 2];
+
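+          // 8x8 and 16x16 use a Hadamard transform as a cheap stand-in for
+          // the DCT in this non-RD estimate; 32x32 and 4x4 use the regular
+          // forward transforms.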
+ switch (tx_size) {
+ case TX_32X32:
+ vp9_fdct32x32_rd(src_diff, coeff, diff_stride);
+ vp9_quantize_fp_32x32(coeff, 1024, x->skip_block, p->zbin,
+ p->round_fp, p->quant_fp, p->quant_shift,
+ qcoeff, dqcoeff, pd->dequant, eob,
+ scan_order->scan, scan_order->iscan);
+ break;
+ case TX_16X16:
+ vp9_hadamard_16x16(src_diff, diff_stride, (int16_t *)coeff);
+ vp9_quantize_fp(coeff, 256, x->skip_block, p->zbin, p->round_fp,
+ p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
+ pd->dequant, eob,
+ scan_order->scan, scan_order->iscan);
+ break;
+ case TX_8X8:
+ vp9_hadamard_8x8(src_diff, diff_stride, (int16_t *)coeff);
+ vp9_quantize_fp(coeff, 64, x->skip_block, p->zbin, p->round_fp,
+ p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
+ pd->dequant, eob,
+ scan_order->scan, scan_order->iscan);
+ break;
+ case TX_4X4:
+ x->fwd_txm4x4(src_diff, coeff, diff_stride);
+ vp9_quantize_fp(coeff, 16, x->skip_block, p->zbin, p->round_fp,
+ p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
+ pd->dequant, eob,
+ scan_order->scan, scan_order->iscan);
+ break;
+ default:
+ assert(0);
+ break;
+ }
+ *skippable &= (*eob == 0);
+ eob_cost += 1;
+ }
+ block += step;
+ }
+ }
+
+ if (*skippable && *sse < INT64_MAX) {
+ *rate = 0;
+ *dist = (*sse << 6) >> shift;
+ *sse = *dist;
+ return;
+ }
+
+ block = 0;
+ *rate = 0;
+ *dist = 0;
+ *sse = (*sse << 6) >> shift;
+ for (r = 0; r < max_blocks_high; r += block_step) {
+ for (c = 0; c < num_4x4_w; c += block_step) {
+ if (c < max_blocks_wide) {
+ tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
+ tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+ tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+ uint16_t *const eob = &p->eobs[block];
+
+ if (*eob == 1)
+ *rate += (int)abs(qcoeff[0]);
+ else if (*eob > 1)
+ *rate += (int)vp9_satd((const int16_t *)qcoeff, step << 4);
+
+ *dist += vp9_block_error_fp(coeff, dqcoeff, step << 4) >> shift;
+ }
+ block += step;
+ }
+ }
+
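+  // Scale the SATD-based estimate into the rate units used by RDCOST; the
+  // shift amounts appear to be empirically chosen scale factors.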
+ if (*skippable == 0) {
+ *rate <<= 10;
+ *rate += (eob_cost << 8);
+ }
+}
+#endif
+
+static void model_rd_for_sb_uv(VP9_COMP *cpi, BLOCK_SIZE bsize,
+ MACROBLOCK *x, MACROBLOCKD *xd,
+ int *out_rate_sum, int64_t *out_dist_sum,
+ unsigned int *var_y, unsigned int *sse_y) {
+ // Note our transform coeffs are 8 times an orthogonal transform.
+ // Hence quantizer step is also 8 times. To get effective quantizer
+ // we need to divide by 8 before sending to modeling function.
+ unsigned int sse;
+ int rate;
+ int64_t dist;
+ int i;
+
+ *out_rate_sum = 0;
+ *out_dist_sum = 0;
+
+ for (i = 1; i <= 2; ++i) {
+ struct macroblock_plane *const p = &x->plane[i];
+ struct macroblockd_plane *const pd = &xd->plane[i];
+ const uint32_t dc_quant = pd->dequant[0];
+ const uint32_t ac_quant = pd->dequant[1];
+ const BLOCK_SIZE bs = get_plane_block_size(bsize, pd);
+ unsigned int var;
+
+ if (!x->color_sensitivity[i - 1])
+ continue;
+
+ var = cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride,
+ pd->dst.buf, pd->dst.stride, &sse);
+ *var_y += var;
+ *sse_y += sse;
+
+ #if CONFIG_VP9_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ vp9_model_rd_from_var_lapndz(sse - var, num_pels_log2_lookup[bs],
+ dc_quant >> (xd->bd - 5), &rate, &dist);
+ } else {
+ vp9_model_rd_from_var_lapndz(sse - var, num_pels_log2_lookup[bs],
+ dc_quant >> 3, &rate, &dist);
+ }
+ #else
+ vp9_model_rd_from_var_lapndz(sse - var, num_pels_log2_lookup[bs],
+ dc_quant >> 3, &rate, &dist);
+ #endif // CONFIG_VP9_HIGHBITDEPTH
+
+ *out_rate_sum += rate >> 1;
+ *out_dist_sum += dist << 3;
+
+ #if CONFIG_VP9_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ vp9_model_rd_from_var_lapndz(var, num_pels_log2_lookup[bs],
+ ac_quant >> (xd->bd - 5), &rate, &dist);
+ } else {
+ vp9_model_rd_from_var_lapndz(var, num_pels_log2_lookup[bs],
+ ac_quant >> 3, &rate, &dist);
+ }
+ #else
+ vp9_model_rd_from_var_lapndz(var, num_pels_log2_lookup[bs],
+ ac_quant >> 3, &rate, &dist);
+ #endif // CONFIG_VP9_HIGHBITDEPTH
+
+ *out_rate_sum += rate;
+ *out_dist_sum += dist << 4;
+ }
+}
+
static int get_pred_buffer(PRED_BUFFER *p, int len) {
int i;
@@ -312,7 +776,7 @@ static void encode_breakout_test(VP9_COMP *cpi, MACROBLOCK *x,
struct buf_2d yv12_mb[][MAX_MB_PLANE],
int *rate, int64_t *dist) {
MACROBLOCKD *xd = &x->e_mbd;
- MB_MODE_INFO *mbmi = &xd->mi[0].src_mi->mbmi;
+ MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
const BLOCK_SIZE uv_size = get_plane_block_size(bsize, &xd->plane[1]);
unsigned int var = var_y, sse = sse_y;
@@ -329,11 +793,11 @@ static void encode_breakout_test(VP9_COMP *cpi, MACROBLOCK *x,
const unsigned int min_thresh =
MIN(((unsigned int)x->encode_breakout << 4), max_thresh);
#if CONFIG_VP9_HIGHBITDEPTH
- const int shift = 2 * xd->bd - 16;
+ const int shift = (xd->bd << 1) - 16;
#endif
// Calculate threshold according to dequant value.
- thresh_ac = (xd->plane[0].dequant[1] * xd->plane[0].dequant[1]) / 9;
+ thresh_ac = (xd->plane[0].dequant[1] * xd->plane[0].dequant[1]) >> 3;
#if CONFIG_VP9_HIGHBITDEPTH
if ((xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) && shift > 0) {
thresh_ac = ROUND_POWER_OF_TWO(thresh_ac, shift);
@@ -375,14 +839,14 @@ static void encode_breakout_test(VP9_COMP *cpi, MACROBLOCK *x,
xd->plane[1].dst.stride, &sse_u);
// U skipping condition checking
- if ((var_u * 4 <= thresh_ac) && (sse_u - var_u <= thresh_dc)) {
+ if (((var_u << 2) <= thresh_ac) && (sse_u - var_u <= thresh_dc)) {
var_v = cpi->fn_ptr[uv_size].vf(x->plane[2].src.buf,
x->plane[2].src.stride,
xd->plane[2].dst.buf,
xd->plane[2].dst.stride, &sse_v);
// V skipping condition checking
- if ((var_v * 4 <= thresh_ac) && (sse_v - var_v <= thresh_dc)) {
+ if (((var_v << 2) <= thresh_ac) && (sse_v - var_v <= thresh_dc)) {
x->skip = 1;
// The cost of skip bit needs to be added.
@@ -428,7 +892,9 @@ static void estimate_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
int i, j;
int rate;
int64_t dist;
- unsigned int var_y, sse_y;
+ int64_t this_sse = INT64_MAX;
+ int is_skippable;
+
txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
assert(plane == 0);
(void) plane;
@@ -439,67 +905,189 @@ static void estimate_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
vp9_predict_intra_block(xd, block >> (2 * tx_size),
b_width_log2_lookup[plane_bsize],
tx_size, args->mode,
- p->src.buf, src_stride,
+ x->skip_encode ? p->src.buf : pd->dst.buf,
+ x->skip_encode ? src_stride : dst_stride,
pd->dst.buf, dst_stride,
i, j, 0);
- // This procedure assumes zero offset from p->src.buf and pd->dst.buf.
- model_rd_for_sb_y(cpi, bsize_tx, x, xd, &rate, &dist, &var_y, &sse_y);
+
+ // TODO(jingning): This needs further refactoring.
+ block_yrd(cpi, x, &rate, &dist, &is_skippable, &this_sse, 0,
+ bsize_tx, MIN(tx_size, TX_16X16));
+ x->skip_txfm[0] = is_skippable;
+ rate += vp9_cost_bit(vp9_get_skip_prob(&cpi->common, xd), is_skippable);
+
p->src.buf = src_buf_base;
pd->dst.buf = dst_buf_base;
args->rate += rate;
args->dist += dist;
}
-static const THR_MODES mode_idx[MAX_REF_FRAMES - 1][INTER_MODES] = {
+static const THR_MODES mode_idx[MAX_REF_FRAMES - 1][4] = {
+ {THR_DC, THR_V_PRED, THR_H_PRED, THR_TM},
{THR_NEARESTMV, THR_NEARMV, THR_ZEROMV, THR_NEWMV},
{THR_NEARESTG, THR_NEARG, THR_ZEROG, THR_NEWG},
- {THR_NEARESTA, THR_NEARA, THR_ZEROA, THR_NEWA},
+};
+
+static const PREDICTION_MODE intra_mode_list[] = {
+ DC_PRED, V_PRED, H_PRED, TM_PRED
+};
+
+static int mode_offset(const PREDICTION_MODE mode) {
+ if (mode >= NEARESTMV) {
+ return INTER_OFFSET(mode);
+ } else {
+ switch (mode) {
+ case DC_PRED:
+ return 0;
+ case V_PRED:
+ return 1;
+ case H_PRED:
+ return 2;
+ case TM_PRED:
+ return 3;
+ default:
+ return -1;
+ }
+ }
+}
+
+static INLINE void update_thresh_freq_fact(VP9_COMP *cpi,
+ TileDataEnc *tile_data,
+ BLOCK_SIZE bsize,
+ MV_REFERENCE_FRAME ref_frame,
+ THR_MODES best_mode_idx,
+ PREDICTION_MODE mode) {
+ THR_MODES thr_mode_idx = mode_idx[ref_frame][mode_offset(mode)];
+ int *freq_fact = &tile_data->thresh_freq_fact[bsize][thr_mode_idx];
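+  // Decay the threshold factor of the winning mode and inflate the others
+  // toward a cap, biasing future searches to recent winners.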
+ if (thr_mode_idx == best_mode_idx)
+ *freq_fact -= (*freq_fact >> 4);
+ else
+ *freq_fact = MIN(*freq_fact + RD_THRESH_INC,
+ cpi->sf.adaptive_rd_thresh * RD_THRESH_MAX_FACT);
+}
+
+void vp9_pick_intra_mode(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *rd_cost,
+ BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+ RD_COST this_rdc, best_rdc;
+ PREDICTION_MODE this_mode;
+ struct estimate_block_intra_args args = { cpi, x, DC_PRED, 0, 0 };
+ const TX_SIZE intra_tx_size =
+ MIN(max_txsize_lookup[bsize],
+ tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
+ MODE_INFO *const mic = xd->mi[0];
+ int *bmode_costs;
+ const MODE_INFO *above_mi = xd->mi[-xd->mi_stride];
+ const MODE_INFO *left_mi = xd->left_available ? xd->mi[-1] : NULL;
+ const PREDICTION_MODE A = vp9_above_block_mode(mic, above_mi, 0);
+ const PREDICTION_MODE L = vp9_left_block_mode(mic, left_mi, 0);
+ bmode_costs = cpi->y_mode_costs[A][L];
+
+ (void) ctx;
+ vp9_rd_cost_reset(&best_rdc);
+ vp9_rd_cost_reset(&this_rdc);
+
+ mbmi->ref_frame[0] = INTRA_FRAME;
+ mbmi->mv[0].as_int = INVALID_MV;
+ mbmi->uv_mode = DC_PRED;
+ memset(x->skip_txfm, 0, sizeof(x->skip_txfm));
+
+ // Change the limit of this loop to add other intra prediction
+ // mode tests.
+ for (this_mode = DC_PRED; this_mode <= H_PRED; ++this_mode) {
+ args.mode = this_mode;
+ args.rate = 0;
+ args.dist = 0;
+ mbmi->tx_size = intra_tx_size;
+ vp9_foreach_transformed_block_in_plane(xd, bsize, 0,
+ estimate_block_intra, &args);
+ this_rdc.rate = args.rate;
+ this_rdc.dist = args.dist;
+ this_rdc.rate += bmode_costs[this_mode];
+ this_rdc.rdcost = RDCOST(x->rdmult, x->rddiv,
+ this_rdc.rate, this_rdc.dist);
+
+ if (this_rdc.rdcost < best_rdc.rdcost) {
+ best_rdc = this_rdc;
+ mbmi->mode = this_mode;
+ }
+ }
+
+ *rd_cost = best_rdc;
+}
+
+static void init_ref_frame_cost(VP9_COMMON *const cm,
+ MACROBLOCKD *const xd,
+ int ref_frame_cost[MAX_REF_FRAMES]) {
+ vp9_prob intra_inter_p = vp9_get_intra_inter_prob(cm, xd);
+ vp9_prob ref_single_p1 = vp9_get_pred_prob_single_ref_p1(cm, xd);
+ vp9_prob ref_single_p2 = vp9_get_pred_prob_single_ref_p2(cm, xd);
+
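+  // Cost of signalling each reference: the intra/inter bit, plus the
+  // single-reference tree bits for the inter frames.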
+ ref_frame_cost[INTRA_FRAME] = vp9_cost_bit(intra_inter_p, 0);
+ ref_frame_cost[LAST_FRAME] = ref_frame_cost[GOLDEN_FRAME] =
+ ref_frame_cost[ALTREF_FRAME] = vp9_cost_bit(intra_inter_p, 1);
+
+ ref_frame_cost[LAST_FRAME] += vp9_cost_bit(ref_single_p1, 0);
+ ref_frame_cost[GOLDEN_FRAME] += vp9_cost_bit(ref_single_p1, 1);
+ ref_frame_cost[ALTREF_FRAME] += vp9_cost_bit(ref_single_p1, 1);
+ ref_frame_cost[GOLDEN_FRAME] += vp9_cost_bit(ref_single_p2, 0);
+ ref_frame_cost[ALTREF_FRAME] += vp9_cost_bit(ref_single_p2, 1);
+}
+
+typedef struct {
+ MV_REFERENCE_FRAME ref_frame;
+ PREDICTION_MODE pred_mode;
+} REF_MODE;
+
+#define RT_INTER_MODES 8
+static const REF_MODE ref_mode_set[RT_INTER_MODES] = {
+ {LAST_FRAME, ZEROMV},
+ {LAST_FRAME, NEARESTMV},
+ {GOLDEN_FRAME, ZEROMV},
+ {LAST_FRAME, NEARMV},
+ {LAST_FRAME, NEWMV},
+ {GOLDEN_FRAME, NEARESTMV},
+ {GOLDEN_FRAME, NEARMV},
+ {GOLDEN_FRAME, NEWMV}
};
// TODO(jingning) placeholder for inter-frame non-RD mode decision.
// This needs various further optimizations. To be continued.
void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
- const TileInfo *const tile,
- int mi_row, int mi_col,
- int *returnrate,
- int64_t *returndistortion,
- BLOCK_SIZE bsize,
- PICK_MODE_CONTEXT *ctx) {
+ TileDataEnc *tile_data,
+ int mi_row, int mi_col, RD_COST *rd_cost,
+ BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx) {
VP9_COMMON *const cm = &cpi->common;
+ TileInfo *const tile_info = &tile_data->tile_info;
MACROBLOCKD *const xd = &x->e_mbd;
- MB_MODE_INFO *const mbmi = &xd->mi[0].src_mi->mbmi;
+ MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
struct macroblockd_plane *const pd = &xd->plane[0];
PREDICTION_MODE best_mode = ZEROMV;
MV_REFERENCE_FRAME ref_frame, best_ref_frame = LAST_FRAME;
- TX_SIZE best_tx_size = MIN(max_txsize_lookup[bsize],
- tx_mode_to_biggest_tx_size[cm->tx_mode]);
+ MV_REFERENCE_FRAME usable_ref_frame;
+ TX_SIZE best_tx_size = TX_SIZES;
INTERP_FILTER best_pred_filter = EIGHTTAP;
int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
struct buf_2d yv12_mb[4][MAX_MB_PLANE];
static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
VP9_ALT_FLAG };
- int64_t best_rd = INT64_MAX;
- int64_t this_rd = INT64_MAX;
- uint8_t skip_txfm = 0;
- int rate = INT_MAX;
- int64_t dist = INT64_MAX;
+ RD_COST this_rdc, best_rdc;
+ uint8_t skip_txfm = 0, best_mode_skip_txfm = 0;
// var_y and sse_y are saved to be used in the skip checking below.
unsigned int var_y = UINT_MAX;
unsigned int sse_y = UINT_MAX;
// Reduce the intra cost penalty for small blocks (<=16x16).
const int reduction_fac =
(cpi->sf.partition_search_type == VAR_BASED_PARTITION &&
- bsize <= BLOCK_16X16) ? 4 : 1;
+ bsize <= BLOCK_16X16) ? ((bsize <= BLOCK_8X8) ? 4 : 2) : 0;
const int intra_cost_penalty = vp9_get_intra_cost_penalty(
- cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth) / reduction_fac;
+ cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth) >> reduction_fac;
const int64_t inter_mode_thresh = RDCOST(x->rdmult, x->rddiv,
intra_cost_penalty, 0);
- const int intra_mode_cost = 50;
-
- const int8_t segment_id = mbmi->segment_id;
- const int *const rd_threshes = cpi->rd.threshes[segment_id][bsize];
- const int *const rd_thresh_freq_fact = cpi->rd.thresh_freq_fact[bsize];
- INTERP_FILTER filter_ref = cm->interp_filter;
+ const int *const rd_threshes = cpi->rd.threshes[mbmi->segment_id][bsize];
+ const int *const rd_thresh_freq_fact = tile_data->thresh_freq_fact[bsize];
+ INTERP_FILTER filter_ref;
const int bsl = mi_width_log2_lookup[bsize];
const int pred_filter_search = cm->interp_filter == SWITCHABLE ?
(((mi_row + mi_col) >> bsl) +
@@ -511,16 +1099,24 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
// process.
// tmp[3] points to dst buffer, and the other 3 point to allocated buffers.
PRED_BUFFER tmp[4];
- DECLARE_ALIGNED_ARRAY(16, uint8_t, pred_buf, 3 * 64 * 64);
+ DECLARE_ALIGNED(16, uint8_t, pred_buf[3 * 64 * 64]);
#if CONFIG_VP9_HIGHBITDEPTH
- DECLARE_ALIGNED_ARRAY(16, uint16_t, pred_buf_16, 3 * 64 * 64);
+ DECLARE_ALIGNED(16, uint16_t, pred_buf_16[3 * 64 * 64]);
#endif
struct buf_2d orig_dst = pd->dst;
PRED_BUFFER *best_pred = NULL;
PRED_BUFFER *this_mode_pred = NULL;
const int pixels_in_block = bh * bw;
+ int reuse_inter_pred = cpi->sf.reuse_inter_pred_sby && ctx->pred_pixel_ready;
+ int ref_frame_skip_mask = 0;
+ int idx;
+ int best_pred_sad = INT_MAX;
+ int best_early_term = 0;
+ int ref_frame_cost[MAX_REF_FRAMES];
+
+ init_ref_frame_cost(cm, xd, ref_frame_cost);
- if (cpi->sf.reuse_inter_pred_sby) {
+ if (reuse_inter_pred) {
int i;
for (i = 0; i < 3; i++) {
#if CONFIG_VP9_HIGHBITDEPTH
@@ -542,42 +1138,52 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
x->skip_encode = cpi->sf.skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
x->skip = 0;
+ if (xd->up_available)
+ filter_ref = xd->mi[-xd->mi_stride]->mbmi.interp_filter;
+ else if (xd->left_available)
+ filter_ref = xd->mi[-1]->mbmi.interp_filter;
+ else
+ filter_ref = cm->interp_filter;
+
// initialize mode decisions
- *returnrate = INT_MAX;
- *returndistortion = INT64_MAX;
- vpx_memset(mbmi, 0, sizeof(MB_MODE_INFO));
+ vp9_rd_cost_reset(&best_rdc);
+ vp9_rd_cost_reset(rd_cost);
mbmi->sb_type = bsize;
mbmi->ref_frame[0] = NONE;
mbmi->ref_frame[1] = NONE;
mbmi->tx_size = MIN(max_txsize_lookup[bsize],
tx_mode_to_biggest_tx_size[cm->tx_mode]);
- mbmi->interp_filter = cm->interp_filter == SWITCHABLE ?
- EIGHTTAP : cm->interp_filter;
- mbmi->segment_id = segment_id;
- for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
- PREDICTION_MODE this_mode;
+#if CONFIG_VP9_TEMPORAL_DENOISING
+ vp9_denoiser_reset_frame_stats(ctx);
+#endif
+
+ if (cpi->rc.frames_since_golden == 0) {
+ usable_ref_frame = LAST_FRAME;
+ } else {
+ usable_ref_frame = GOLDEN_FRAME;
+ }
+
+ for (ref_frame = LAST_FRAME; ref_frame <= usable_ref_frame; ++ref_frame) {
+ const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame);
+
x->pred_mv_sad[ref_frame] = INT_MAX;
frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
frame_mv[ZEROMV][ref_frame].as_int = 0;
- if (xd->up_available)
- filter_ref = xd->mi[-xd->mi_stride].src_mi->mbmi.interp_filter;
- else if (xd->left_available)
- filter_ref = xd->mi[-1].src_mi->mbmi.interp_filter;
-
- if (cpi->ref_frame_flags & flag_list[ref_frame]) {
- const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame);
+ if ((cpi->ref_frame_flags & flag_list[ref_frame]) && (yv12 != NULL)) {
int_mv *const candidates = mbmi->ref_mvs[ref_frame];
const struct scale_factors *const sf = &cm->frame_refs[ref_frame - 1].sf;
+
vp9_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col,
sf, sf);
- if (!cm->error_resilient_mode)
- vp9_find_mv_refs(cm, xd, tile, xd->mi[0].src_mi, ref_frame,
- candidates, mi_row, mi_col);
+ if (cm->use_prev_frame_mvs)
+ vp9_find_mv_refs(cm, xd, tile_info, xd->mi[0], ref_frame,
+ candidates, mi_row, mi_col, NULL, NULL);
else
- const_motion[ref_frame] = mv_refs_rt(cm, xd, tile, xd->mi[0].src_mi,
+ const_motion[ref_frame] = mv_refs_rt(cm, xd, tile_info,
+ xd->mi[0],
ref_frame, candidates,
mi_row, mi_col);
@@ -589,195 +1195,288 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
vp9_mv_pred(cpi, x, yv12_mb[ref_frame][0].buf, yv12->y_stride,
ref_frame, bsize);
} else {
- continue;
+ ref_frame_skip_mask |= (1 << ref_frame);
}
+ }
- // Select prediction reference frames.
- xd->plane[0].pre[0] = yv12_mb[ref_frame][0];
+ for (idx = 0; idx < RT_INTER_MODES; ++idx) {
+ int rate_mv = 0;
+ int mode_rd_thresh;
+ int mode_index;
+ int i;
+ PREDICTION_MODE this_mode = ref_mode_set[idx].pred_mode;
+ int64_t this_sse;
+ int is_skippable;
+ int this_early_term = 0;
+
+ if (!(cpi->sf.inter_mode_mask[bsize] & (1 << this_mode)))
+ continue;
+
+ ref_frame = ref_mode_set[idx].ref_frame;
+ if (!(cpi->ref_frame_flags & flag_list[ref_frame]))
+ continue;
+ if (const_motion[ref_frame] && this_mode == NEARMV)
+ continue;
- clamp_mv2(&frame_mv[NEARESTMV][ref_frame].as_mv, xd);
- clamp_mv2(&frame_mv[NEARMV][ref_frame].as_mv, xd);
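+    // Skip this reference if its predicted-MV SAD is more than twice that of
+    // the other available reference.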
+ i = (ref_frame == LAST_FRAME) ? GOLDEN_FRAME : LAST_FRAME;
+ if (cpi->ref_frame_flags & flag_list[i])
+ if (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[i] << 1))
+ ref_frame_skip_mask |= (1 << ref_frame);
+ if (ref_frame_skip_mask & (1 << ref_frame))
+ continue;
+
+ // Select prediction reference frames.
+ for (i = 0; i < MAX_MB_PLANE; i++)
+ xd->plane[i].pre[0] = yv12_mb[ref_frame][i];
mbmi->ref_frame[0] = ref_frame;
+ set_ref_ptrs(cm, xd, ref_frame, NONE);
- for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) {
- int rate_mv = 0;
- int mode_rd_thresh;
+ mode_index = mode_idx[ref_frame][INTER_OFFSET(this_mode)];
+ mode_rd_thresh = best_mode_skip_txfm ?
+ rd_threshes[mode_index] << 1 : rd_threshes[mode_index];
+ if (rd_less_than_thresh(best_rdc.rdcost, mode_rd_thresh,
+ rd_thresh_freq_fact[mode_index]))
+ continue;
- if (const_motion[ref_frame] &&
- (this_mode == NEARMV || this_mode == ZEROMV))
- continue;
+ if (this_mode == NEWMV) {
+ if (ref_frame > LAST_FRAME) {
+ int tmp_sad;
+ int dis, cost_list[5];
- if (!(cpi->sf.inter_mode_mask[bsize] & (1 << this_mode)))
- continue;
+ if (bsize < BLOCK_16X16)
+ continue;
- mode_rd_thresh =
- rd_threshes[mode_idx[ref_frame -
- LAST_FRAME][INTER_OFFSET(this_mode)]];
- if (rd_less_than_thresh(best_rd, mode_rd_thresh,
- rd_thresh_freq_fact[this_mode]))
- continue;
+ tmp_sad = vp9_int_pro_motion_estimation(cpi, x, bsize);
- if (this_mode == NEWMV) {
- if (cpi->sf.partition_search_type != VAR_BASED_PARTITION &&
- this_rd < (int64_t)(1 << num_pels_log2_lookup[bsize]))
+ if (tmp_sad > x->pred_mv_sad[LAST_FRAME])
continue;
- if (!combined_motion_search(cpi, x, bsize, mi_row, mi_col,
- &frame_mv[NEWMV][ref_frame],
- &rate_mv, best_rd))
+ if (tmp_sad + (num_pels_log2_lookup[bsize] << 4) > best_pred_sad)
continue;
- }
- if (this_mode != NEARESTMV &&
- frame_mv[this_mode][ref_frame].as_int ==
- frame_mv[NEARESTMV][ref_frame].as_int)
+ frame_mv[NEWMV][ref_frame].as_int = mbmi->mv[0].as_int;
+ rate_mv = vp9_mv_bit_cost(&frame_mv[NEWMV][ref_frame].as_mv,
+ &mbmi->ref_mvs[ref_frame][0].as_mv,
+ x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
+ frame_mv[NEWMV][ref_frame].as_mv.row >>= 3;
+ frame_mv[NEWMV][ref_frame].as_mv.col >>= 3;
+
+ cpi->find_fractional_mv_step(x, &frame_mv[NEWMV][ref_frame].as_mv,
+ &mbmi->ref_mvs[ref_frame][0].as_mv,
+ cpi->common.allow_high_precision_mv,
+ x->errorperbit,
+ &cpi->fn_ptr[bsize],
+ cpi->sf.mv.subpel_force_stop,
+ cpi->sf.mv.subpel_iters_per_step,
+ cond_cost_list(cpi, cost_list),
+ x->nmvjointcost, x->mvcost, &dis,
+ &x->pred_sse[ref_frame], NULL, 0, 0);
+ } else if (!combined_motion_search(cpi, x, bsize, mi_row, mi_col,
+ &frame_mv[NEWMV][ref_frame], &rate_mv, best_rdc.rdcost)) {
continue;
+ }
+ }
- mbmi->mode = this_mode;
- mbmi->mv[0].as_int = frame_mv[this_mode][ref_frame].as_int;
-
- // Search for the best prediction filter type, when the resulting
- // motion vector is at sub-pixel accuracy level for luma component, i.e.,
- // the last three bits are all zeros.
- if (cpi->sf.reuse_inter_pred_sby) {
- if (this_mode == NEARESTMV) {
- this_mode_pred = &tmp[3];
- } else {
- this_mode_pred = &tmp[get_pred_buffer(tmp, 3)];
- pd->dst.buf = this_mode_pred->data;
- pd->dst.stride = bw;
- }
+ if (this_mode == NEWMV && ref_frame == LAST_FRAME &&
+ frame_mv[NEWMV][LAST_FRAME].as_int != INVALID_MV) {
+ const int pre_stride = xd->plane[0].pre[0].stride;
+ const uint8_t * const pre_buf = xd->plane[0].pre[0].buf +
+ (frame_mv[NEWMV][LAST_FRAME].as_mv.row >> 3) * pre_stride +
+ (frame_mv[NEWMV][LAST_FRAME].as_mv.col >> 3);
+ best_pred_sad = cpi->fn_ptr[bsize].sdf(x->plane[0].src.buf,
+ x->plane[0].src.stride,
+ pre_buf, pre_stride);
+ x->pred_mv_sad[LAST_FRAME] = best_pred_sad;
+ }
+
+ if (this_mode != NEARESTMV &&
+ frame_mv[this_mode][ref_frame].as_int ==
+ frame_mv[NEARESTMV][ref_frame].as_int)
+ continue;
+
+ mbmi->mode = this_mode;
+ mbmi->mv[0].as_int = frame_mv[this_mode][ref_frame].as_int;
+
+    // Search for the best prediction filter type, when the resulting
+    // motion vector is at sub-pixel accuracy level for the luma component,
+    // i.e., the last three bits are not all zero.
+ if (reuse_inter_pred) {
+ if (!this_mode_pred) {
+ this_mode_pred = &tmp[3];
+ } else {
+ this_mode_pred = &tmp[get_pred_buffer(tmp, 3)];
+ pd->dst.buf = this_mode_pred->data;
+ pd->dst.stride = bw;
}
+ }
+
+ if ((this_mode == NEWMV || filter_ref == SWITCHABLE) && pred_filter_search
+ && (ref_frame == LAST_FRAME)
+ && (((mbmi->mv[0].as_mv.row | mbmi->mv[0].as_mv.col) & 0x07) != 0)) {
+ int pf_rate[3];
+ int64_t pf_dist[3];
+ unsigned int pf_var[3];
+ unsigned int pf_sse[3];
+ TX_SIZE pf_tx_size[3];
+ int64_t best_cost = INT64_MAX;
+ INTERP_FILTER best_filter = SWITCHABLE, filter;
+ PRED_BUFFER *current_pred = this_mode_pred;
+
+ for (filter = EIGHTTAP; filter <= EIGHTTAP_SMOOTH; ++filter) {
+ int64_t cost;
+ mbmi->interp_filter = filter;
+ vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize);
+ model_rd_for_sb_y(cpi, bsize, x, xd, &pf_rate[filter], &pf_dist[filter],
+ &pf_var[filter], &pf_sse[filter]);
+ pf_rate[filter] += vp9_get_switchable_rate(cpi, xd);
+ cost = RDCOST(x->rdmult, x->rddiv, pf_rate[filter], pf_dist[filter]);
+ pf_tx_size[filter] = mbmi->tx_size;
+ if (cost < best_cost) {
+ best_filter = filter;
+ best_cost = cost;
+ skip_txfm = x->skip_txfm[0];
+
+ if (reuse_inter_pred) {
+ if (this_mode_pred != current_pred) {
+ free_pred_buffer(this_mode_pred);
+ this_mode_pred = current_pred;
+ }
- if ((this_mode == NEWMV || filter_ref == SWITCHABLE) &&
- pred_filter_search &&
- ((mbmi->mv[0].as_mv.row & 0x07) != 0 ||
- (mbmi->mv[0].as_mv.col & 0x07) != 0)) {
- int pf_rate[3];
- int64_t pf_dist[3];
- unsigned int pf_var[3];
- unsigned int pf_sse[3];
- TX_SIZE pf_tx_size[3];
- int64_t best_cost = INT64_MAX;
- INTERP_FILTER best_filter = SWITCHABLE, filter;
- PRED_BUFFER *current_pred = this_mode_pred;
-
- for (filter = EIGHTTAP; filter <= EIGHTTAP_SHARP; ++filter) {
- int64_t cost;
- mbmi->interp_filter = filter;
- vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize);
- model_rd_for_sb_y(cpi, bsize, x, xd, &pf_rate[filter],
- &pf_dist[filter], &pf_var[filter], &pf_sse[filter]);
- cost = RDCOST(x->rdmult, x->rddiv,
- vp9_get_switchable_rate(cpi) + pf_rate[filter],
- pf_dist[filter]);
- pf_tx_size[filter] = mbmi->tx_size;
- if (cost < best_cost) {
- best_filter = filter;
- best_cost = cost;
- skip_txfm = x->skip_txfm[0];
-
- if (cpi->sf.reuse_inter_pred_sby) {
- if (this_mode_pred != current_pred) {
- free_pred_buffer(this_mode_pred);
- this_mode_pred = current_pred;
- }
-
- if (filter < EIGHTTAP_SHARP) {
- current_pred = &tmp[get_pred_buffer(tmp, 3)];
- pd->dst.buf = current_pred->data;
- pd->dst.stride = bw;
- }
+ if (filter < EIGHTTAP_SHARP) {
+ current_pred = &tmp[get_pred_buffer(tmp, 3)];
+ pd->dst.buf = current_pred->data;
+ pd->dst.stride = bw;
}
}
}
+ }
- if (cpi->sf.reuse_inter_pred_sby && this_mode_pred != current_pred)
- free_pred_buffer(current_pred);
-
- mbmi->interp_filter = best_filter;
- mbmi->tx_size = pf_tx_size[mbmi->interp_filter];
- rate = pf_rate[mbmi->interp_filter];
- dist = pf_dist[mbmi->interp_filter];
- var_y = pf_var[mbmi->interp_filter];
- sse_y = pf_sse[mbmi->interp_filter];
- x->skip_txfm[0] = skip_txfm;
+ if (reuse_inter_pred && this_mode_pred != current_pred)
+ free_pred_buffer(current_pred);
+
+ mbmi->interp_filter = best_filter;
+ mbmi->tx_size = pf_tx_size[best_filter];
+ this_rdc.rate = pf_rate[best_filter];
+ this_rdc.dist = pf_dist[best_filter];
+ var_y = pf_var[best_filter];
+ sse_y = pf_sse[best_filter];
+ x->skip_txfm[0] = skip_txfm;
+ if (reuse_inter_pred) {
+ pd->dst.buf = this_mode_pred->data;
+ pd->dst.stride = this_mode_pred->stride;
+ }
+ } else {
+ mbmi->interp_filter = (filter_ref == SWITCHABLE) ? EIGHTTAP : filter_ref;
+ vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize);
+
+      // For large partition blocks, extra skip testing is done to allow
+      // early termination of the mode search.
+ if (bsize > BLOCK_32X32 &&
+ !cyclic_refresh_segment_id_boosted(xd->mi[0]->mbmi.segment_id) &&
+ cm->base_qindex) {
+ model_rd_for_sb_y_large(cpi, bsize, x, xd, &this_rdc.rate,
+ &this_rdc.dist, &var_y, &sse_y, mi_row, mi_col,
+ &this_early_term);
} else {
- mbmi->interp_filter = (filter_ref == SWITCHABLE) ? EIGHTTAP: filter_ref;
- vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize);
- model_rd_for_sb_y(cpi, bsize, x, xd, &rate, &dist, &var_y, &sse_y);
+ model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc.rate, &this_rdc.dist,
+ &var_y, &sse_y);
}
+ }
- rate += rate_mv;
- rate += cpi->inter_mode_cost[mbmi->mode_context[ref_frame]]
- [INTER_OFFSET(this_mode)];
- this_rd = RDCOST(x->rdmult, x->rddiv, rate, dist);
-
- // Skipping checking: test to see if this block can be reconstructed by
- // prediction only.
- if (cpi->allow_encode_breakout) {
- encode_breakout_test(cpi, x, bsize, mi_row, mi_col, ref_frame,
- this_mode, var_y, sse_y, yv12_mb, &rate, &dist);
- if (x->skip) {
- rate += rate_mv;
- this_rd = RDCOST(x->rdmult, x->rddiv, rate, dist);
+ if (!this_early_term) {
+ this_sse = (int64_t)sse_y;
+ block_yrd(cpi, x, &this_rdc.rate, &this_rdc.dist, &is_skippable,
+ &this_sse, 0, bsize, MIN(mbmi->tx_size, TX_16X16));
+ x->skip_txfm[0] = is_skippable;
+ if (is_skippable) {
+ this_rdc.rate = vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1);
+ } else {
+ if (RDCOST(x->rdmult, x->rddiv, this_rdc.rate, this_rdc.dist) <
+ RDCOST(x->rdmult, x->rddiv, 0, this_sse)) {
+ this_rdc.rate += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 0);
+ } else {
+ this_rdc.rate = vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1);
+ this_rdc.dist = this_sse;
+ x->skip_txfm[0] = 1;
}
}
-#if CONFIG_VP9_TEMPORAL_DENOISING
- if (cpi->oxcf.noise_sensitivity > 0) {
- vp9_denoiser_update_frame_stats(mbmi, sse_y, this_mode, ctx);
+ if (cm->interp_filter == SWITCHABLE) {
+ if ((mbmi->mv[0].as_mv.row | mbmi->mv[0].as_mv.col) & 0x07)
+ this_rdc.rate += vp9_get_switchable_rate(cpi, xd);
}
-#else
- (void)ctx;
-#endif
-
- if (this_rd < best_rd || x->skip) {
- best_rd = this_rd;
- *returnrate = rate;
- *returndistortion = dist;
- best_mode = this_mode;
- best_pred_filter = mbmi->interp_filter;
- best_tx_size = mbmi->tx_size;
- best_ref_frame = ref_frame;
- skip_txfm = x->skip_txfm[0];
+ } else {
+ this_rdc.rate += cm->interp_filter == SWITCHABLE ?
+ vp9_get_switchable_rate(cpi, xd) : 0;
+ this_rdc.rate += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1);
+ }
- if (cpi->sf.reuse_inter_pred_sby) {
- free_pred_buffer(best_pred);
+ if (x->color_sensitivity[0] || x->color_sensitivity[1]) {
+ int uv_rate = 0;
+ int64_t uv_dist = 0;
+ if (x->color_sensitivity[0])
+ vp9_build_inter_predictors_sbp(xd, mi_row, mi_col, bsize, 1);
+ if (x->color_sensitivity[1])
+ vp9_build_inter_predictors_sbp(xd, mi_row, mi_col, bsize, 2);
+ model_rd_for_sb_uv(cpi, bsize, x, xd, &uv_rate, &uv_dist,
+ &var_y, &sse_y);
+ this_rdc.rate += uv_rate;
+ this_rdc.dist += uv_dist;
+ }
- best_pred = this_mode_pred;
- }
- } else {
- if (cpi->sf.reuse_inter_pred_sby)
- free_pred_buffer(this_mode_pred);
+ this_rdc.rate += rate_mv;
+ this_rdc.rate +=
+ cpi->inter_mode_cost[mbmi->mode_context[ref_frame]][INTER_OFFSET(
+ this_mode)];
+ this_rdc.rate += ref_frame_cost[ref_frame];
+ this_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, this_rdc.rate, this_rdc.dist);
+
+    // Skip check: test to see if this block can be reconstructed by
+    // prediction only.
+ if (cpi->allow_encode_breakout) {
+ encode_breakout_test(cpi, x, bsize, mi_row, mi_col, ref_frame, this_mode,
+ var_y, sse_y, yv12_mb, &this_rdc.rate,
+ &this_rdc.dist);
+ if (x->skip) {
+ this_rdc.rate += rate_mv;
+ this_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, this_rdc.rate,
+ this_rdc.dist);
}
+ }
- if (x->skip)
- break;
+#if CONFIG_VP9_TEMPORAL_DENOISING
+ if (cpi->oxcf.noise_sensitivity > 0)
+ vp9_denoiser_update_frame_stats(mbmi, sse_y, this_mode, ctx);
+#else
+ (void)ctx;
+#endif
+
+ if (this_rdc.rdcost < best_rdc.rdcost || x->skip) {
+ best_rdc = this_rdc;
+ best_mode = this_mode;
+ best_pred_filter = mbmi->interp_filter;
+ best_tx_size = mbmi->tx_size;
+ best_ref_frame = ref_frame;
+ best_mode_skip_txfm = x->skip_txfm[0];
+ best_early_term = this_early_term;
+
+ if (reuse_inter_pred) {
+ free_pred_buffer(best_pred);
+ best_pred = this_mode_pred;
+ }
+ } else {
+ if (reuse_inter_pred)
+ free_pred_buffer(this_mode_pred);
}
- // If the current reference frame is valid and we found a usable mode,
- // we are done.
- if (best_rd < INT64_MAX)
+
+ if (x->skip)
break;
- }
- // If best prediction is not in dst buf, then copy the prediction block from
- // temp buf to dst buf.
- if (best_pred != NULL && cpi->sf.reuse_inter_pred_sby &&
- best_pred->data != orig_dst.buf) {
- pd->dst = orig_dst;
-#if CONFIG_VP9_HIGHBITDEPTH
- if (cm->use_highbitdepth) {
- vp9_highbd_convolve_copy(best_pred->data, bw, pd->dst.buf, pd->dst.stride,
- NULL, 0, NULL, 0, bw, bh, xd->bd);
- } else {
- vp9_convolve_copy(best_pred->data, bw, pd->dst.buf, pd->dst.stride,
- NULL, 0, NULL, 0, bw, bh);
+    // If the early termination flag is set and at least two modes have been
+    // checked, terminate the mode search.
+ if (best_early_term && idx > 0) {
+ x->skip = 1;
+ break;
}
-#else
- vp9_convolve_copy(best_pred->data, bw, pd->dst.buf, pd->dst.stride, NULL, 0,
- NULL, 0, bw, bh);
-#endif // CONFIG_VP9_HIGHBITDEPTH
}
mbmi->mode = best_mode;
@@ -785,53 +1484,404 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
mbmi->tx_size = best_tx_size;
mbmi->ref_frame[0] = best_ref_frame;
mbmi->mv[0].as_int = frame_mv[best_mode][best_ref_frame].as_int;
- xd->mi[0].src_mi->bmi[0].as_mv[0].as_int = mbmi->mv[0].as_int;
- x->skip_txfm[0] = skip_txfm;
+ xd->mi[0]->bmi[0].as_mv[0].as_int = mbmi->mv[0].as_int;
+ x->skip_txfm[0] = best_mode_skip_txfm;
// Perform an intra prediction search if the best RD cost is above a certain
// threshold.
- if (!x->skip && best_rd > inter_mode_thresh &&
- bsize <= cpi->sf.max_intra_bsize) {
- PREDICTION_MODE this_mode;
+ if (best_rdc.rdcost == INT64_MAX ||
+ (!x->skip && best_rdc.rdcost > inter_mode_thresh &&
+ bsize <= cpi->sf.max_intra_bsize)) {
struct estimate_block_intra_args args = { cpi, x, DC_PRED, 0, 0 };
const TX_SIZE intra_tx_size =
MIN(max_txsize_lookup[bsize],
tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
+ int i;
+ TX_SIZE best_intra_tx_size = TX_SIZES;
- if (cpi->sf.reuse_inter_pred_sby) {
- pd->dst.buf = tmp[0].data;
- pd->dst.stride = bw;
+ if (reuse_inter_pred && best_pred != NULL) {
+ if (best_pred->data == orig_dst.buf) {
+ this_mode_pred = &tmp[get_pred_buffer(tmp, 3)];
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (cm->use_highbitdepth)
+ vp9_highbd_convolve_copy(best_pred->data, best_pred->stride,
+ this_mode_pred->data, this_mode_pred->stride,
+ NULL, 0, NULL, 0, bw, bh, xd->bd);
+ else
+ vp9_convolve_copy(best_pred->data, best_pred->stride,
+ this_mode_pred->data, this_mode_pred->stride,
+ NULL, 0, NULL, 0, bw, bh);
+#else
+ vp9_convolve_copy(best_pred->data, best_pred->stride,
+ this_mode_pred->data, this_mode_pred->stride,
+ NULL, 0, NULL, 0, bw, bh);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ best_pred = this_mode_pred;
+ }
}
+ pd->dst = orig_dst;
+
+ for (i = 0; i < 4; ++i) {
+ const PREDICTION_MODE this_mode = intra_mode_list[i];
+ THR_MODES mode_index = mode_idx[INTRA_FRAME][mode_offset(this_mode)];
+ int mode_rd_thresh = rd_threshes[mode_index];
+
+ if (!((1 << this_mode) & cpi->sf.intra_y_mode_bsize_mask[bsize]))
+ continue;
- for (this_mode = DC_PRED; this_mode <= DC_PRED; ++this_mode) {
- const TX_SIZE saved_tx_size = mbmi->tx_size;
+ if (rd_less_than_thresh(best_rdc.rdcost, mode_rd_thresh,
+ rd_thresh_freq_fact[mode_index]))
+ continue;
+
+ mbmi->mode = this_mode;
+ mbmi->ref_frame[0] = INTRA_FRAME;
args.mode = this_mode;
args.rate = 0;
args.dist = 0;
mbmi->tx_size = intra_tx_size;
vp9_foreach_transformed_block_in_plane(xd, bsize, 0,
estimate_block_intra, &args);
- mbmi->tx_size = saved_tx_size;
- rate = args.rate;
- dist = args.dist;
- rate += cpi->mbmode_cost[this_mode];
- rate += intra_cost_penalty;
- this_rd = RDCOST(x->rdmult, x->rddiv, rate, dist);
-
- if (this_rd + intra_mode_cost < best_rd) {
- best_rd = this_rd;
- *returnrate = rate;
- *returndistortion = dist;
- mbmi->mode = this_mode;
- mbmi->tx_size = intra_tx_size;
- mbmi->ref_frame[0] = INTRA_FRAME;
+ this_rdc.rate = args.rate;
+ this_rdc.dist = args.dist;
+ this_rdc.rate += cpi->mbmode_cost[this_mode];
+ this_rdc.rate += ref_frame_cost[INTRA_FRAME];
+ this_rdc.rate += intra_cost_penalty;
+ this_rdc.rdcost = RDCOST(x->rdmult, x->rddiv,
+ this_rdc.rate, this_rdc.dist);
+
+ if (this_rdc.rdcost < best_rdc.rdcost) {
+ best_rdc = this_rdc;
+ best_mode = this_mode;
+ best_intra_tx_size = mbmi->tx_size;
+ best_ref_frame = INTRA_FRAME;
mbmi->uv_mode = this_mode;
mbmi->mv[0].as_int = INVALID_MV;
- } else {
- x->skip_txfm[0] = skip_txfm;
+ best_mode_skip_txfm = x->skip_txfm[0];
+ }
+ }
+
+ // Reset mb_mode_info to the best inter mode.
+ if (best_ref_frame != INTRA_FRAME) {
+ mbmi->tx_size = best_tx_size;
+ } else {
+ mbmi->tx_size = best_intra_tx_size;
+ }
+ }
+
+ pd->dst = orig_dst;
+ mbmi->mode = best_mode;
+ mbmi->ref_frame[0] = best_ref_frame;
+ x->skip_txfm[0] = best_mode_skip_txfm;
+
+ if (reuse_inter_pred && best_pred != NULL) {
+ if (best_pred->data != orig_dst.buf && is_inter_mode(mbmi->mode)) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (cm->use_highbitdepth)
+ vp9_highbd_convolve_copy(best_pred->data, best_pred->stride,
+ pd->dst.buf, pd->dst.stride, NULL, 0,
+ NULL, 0, bw, bh, xd->bd);
+ else
+ vp9_convolve_copy(best_pred->data, best_pred->stride,
+ pd->dst.buf, pd->dst.stride, NULL, 0,
+ NULL, 0, bw, bh);
+#else
+ vp9_convolve_copy(best_pred->data, best_pred->stride,
+ pd->dst.buf, pd->dst.stride, NULL, 0,
+ NULL, 0, bw, bh);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ }
+ }
+
+ if (cpi->sf.adaptive_rd_thresh) {
+ THR_MODES best_mode_idx = mode_idx[best_ref_frame][mode_offset(mbmi->mode)];
+ PREDICTION_MODE this_mode;
+
+ if (best_ref_frame == INTRA_FRAME) {
+ // Only consider the modes that are included in the intra_mode_list.
+ int intra_modes = sizeof(intra_mode_list)/sizeof(PREDICTION_MODE);
+ int i;
+
+ // TODO(yunqingwang): Check intra mode mask and only update freq_fact
+ // for those valid modes.
+ for (i = 0; i < intra_modes; i++) {
+ PREDICTION_MODE this_mode = intra_mode_list[i];
+ update_thresh_freq_fact(cpi, tile_data, bsize, INTRA_FRAME,
+ best_mode_idx, this_mode);
+ }
+ } else {
+ for (ref_frame = LAST_FRAME; ref_frame <= GOLDEN_FRAME; ++ref_frame) {
+ if (best_ref_frame != ref_frame) continue;
+ for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) {
+ update_thresh_freq_fact(cpi, tile_data, bsize, ref_frame,
+ best_mode_idx, this_mode);
+ }
}
}
- if (cpi->sf.reuse_inter_pred_sby)
- pd->dst = orig_dst;
}
+
+ *rd_cost = best_rdc;
+}
+
+void vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
+ TileDataEnc *tile_data,
+ int mi_row, int mi_col, RD_COST *rd_cost,
+ BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx) {
+ VP9_COMMON *const cm = &cpi->common;
+ TileInfo *const tile_info = &tile_data->tile_info;
+ SPEED_FEATURES *const sf = &cpi->sf;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+ const struct segmentation *const seg = &cm->seg;
+ MV_REFERENCE_FRAME ref_frame, second_ref_frame = NONE;
+ MV_REFERENCE_FRAME best_ref_frame = NONE;
+ unsigned char segment_id = mbmi->segment_id;
+ struct buf_2d yv12_mb[4][MAX_MB_PLANE];
+ static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
+ VP9_ALT_FLAG };
+ int64_t best_rd = INT64_MAX;
+ b_mode_info bsi[MAX_REF_FRAMES][4];
+ int ref_frame_skip_mask = 0;
+ const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
+ const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
+ int idx, idy;
+
+ x->skip_encode = sf->skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
+ ctx->pred_pixel_ready = 0;
+
+ for (ref_frame = LAST_FRAME; ref_frame <= GOLDEN_FRAME; ++ref_frame) {
+ const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame);
+ int_mv dummy_mv[2];
+ x->pred_mv_sad[ref_frame] = INT_MAX;
+
+ if ((cpi->ref_frame_flags & flag_list[ref_frame]) && (yv12 != NULL)) {
+ int_mv *const candidates = mbmi->ref_mvs[ref_frame];
+ const struct scale_factors *const sf =
+ &cm->frame_refs[ref_frame - 1].sf;
+ vp9_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col,
+ sf, sf);
+ vp9_find_mv_refs(cm, xd, tile_info, xd->mi[0], ref_frame,
+ candidates, mi_row, mi_col, NULL, NULL);
+
+ vp9_find_best_ref_mvs(xd, cm->allow_high_precision_mv, candidates,
+ &dummy_mv[0], &dummy_mv[1]);
+ } else {
+ ref_frame_skip_mask |= (1 << ref_frame);
+ }
+ }
+
+ mbmi->sb_type = bsize;
+ mbmi->tx_size = TX_4X4;
+ mbmi->uv_mode = DC_PRED;
+ mbmi->ref_frame[0] = LAST_FRAME;
+ mbmi->ref_frame[1] = NONE;
+ mbmi->interp_filter = cm->interp_filter == SWITCHABLE ? EIGHTTAP
+ : cm->interp_filter;
+
+ for (ref_frame = LAST_FRAME; ref_frame <= GOLDEN_FRAME; ++ref_frame) {
+ int64_t this_rd = 0;
+ int plane;
+
+ if (ref_frame_skip_mask & (1 << ref_frame))
+ continue;
+
+ // TODO(jingning, agrange): Scaling reference frame not supported for
+ // sub8x8 blocks. Is this supported now?
+ if (ref_frame > INTRA_FRAME &&
+ vp9_is_scaled(&cm->frame_refs[ref_frame - 1].sf))
+ continue;
+
+    // If the segment reference frame feature is enabled, skip any reference
+    // frame that is not allowed for the current segment.
+ if (vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) &&
+ vp9_get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame)
+ continue;
+
+ mbmi->ref_frame[0] = ref_frame;
+ x->skip = 0;
+ set_ref_ptrs(cm, xd, ref_frame, second_ref_frame);
+
+ // Select prediction reference frames.
+ for (plane = 0; plane < MAX_MB_PLANE; plane++)
+ xd->plane[plane].pre[0] = yv12_mb[ref_frame][plane];
+
+ for (idy = 0; idy < 2; idy += num_4x4_blocks_high) {
+ for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) {
+ int_mv b_mv[MB_MODE_COUNT];
+ int64_t b_best_rd = INT64_MAX;
+ const int i = idy * 2 + idx;
+ PREDICTION_MODE this_mode;
+ RD_COST this_rdc;
+ unsigned int var_y, sse_y;
+
+ struct macroblock_plane *p = &x->plane[0];
+ struct macroblockd_plane *pd = &xd->plane[0];
+
+ const struct buf_2d orig_src = p->src;
+ const struct buf_2d orig_dst = pd->dst;
+ struct buf_2d orig_pre[2];
+ memcpy(orig_pre, xd->plane[0].pre, sizeof(orig_pre));
+
+ // set buffer pointers for sub8x8 motion search.
+ p->src.buf =
+ &p->src.buf[vp9_raster_block_offset(BLOCK_8X8, i, p->src.stride)];
+ pd->dst.buf =
+ &pd->dst.buf[vp9_raster_block_offset(BLOCK_8X8, i, pd->dst.stride)];
+ pd->pre[0].buf =
+ &pd->pre[0].buf[vp9_raster_block_offset(BLOCK_8X8,
+ i, pd->pre[0].stride)];
+
+ b_mv[ZEROMV].as_int = 0;
+ b_mv[NEWMV].as_int = INVALID_MV;
+ vp9_append_sub8x8_mvs_for_idx(cm, xd, tile_info, i, 0, mi_row, mi_col,
+ &b_mv[NEARESTMV],
+ &b_mv[NEARMV]);
+
+ for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) {
+ int b_rate = 0;
+ xd->mi[0]->bmi[i].as_mv[0].as_int = b_mv[this_mode].as_int;
+
+ if (this_mode == NEWMV) {
+ const int step_param = cpi->sf.mv.fullpel_search_step_param;
+ MV mvp_full;
+ MV tmp_mv;
+ int cost_list[5];
+ const int tmp_col_min = x->mv_col_min;
+ const int tmp_col_max = x->mv_col_max;
+ const int tmp_row_min = x->mv_row_min;
+ const int tmp_row_max = x->mv_row_max;
+ int dummy_dist;
+
+ if (i == 0) {
+ mvp_full.row = b_mv[NEARESTMV].as_mv.row >> 3;
+ mvp_full.col = b_mv[NEARESTMV].as_mv.col >> 3;
+ } else {
+ mvp_full.row = xd->mi[0]->bmi[0].as_mv[0].as_mv.row >> 3;
+ mvp_full.col = xd->mi[0]->bmi[0].as_mv[0].as_mv.col >> 3;
+ }
+
+ vp9_set_mv_search_range(x, &mbmi->ref_mvs[0]->as_mv);
+
+ vp9_full_pixel_search(
+ cpi, x, bsize, &mvp_full, step_param, x->sadperbit4,
+ cond_cost_list(cpi, cost_list),
+ &mbmi->ref_mvs[ref_frame][0].as_mv, &tmp_mv,
+ INT_MAX, 0);
+
+ x->mv_col_min = tmp_col_min;
+ x->mv_col_max = tmp_col_max;
+ x->mv_row_min = tmp_row_min;
+ x->mv_row_max = tmp_row_max;
+
+          // Calculate the bit cost of the motion vector.
+ mvp_full.row = tmp_mv.row * 8;
+ mvp_full.col = tmp_mv.col * 8;
+
+ b_rate += vp9_mv_bit_cost(&mvp_full,
+ &mbmi->ref_mvs[ref_frame][0].as_mv,
+ x->nmvjointcost, x->mvcost,
+ MV_COST_WEIGHT);
+
+ b_rate += cpi->inter_mode_cost[mbmi->mode_context[ref_frame]]
+ [INTER_OFFSET(NEWMV)];
+ if (RDCOST(x->rdmult, x->rddiv, b_rate, 0) > b_best_rd)
+ continue;
+
+ cpi->find_fractional_mv_step(x, &tmp_mv,
+ &mbmi->ref_mvs[ref_frame][0].as_mv,
+ cpi->common.allow_high_precision_mv,
+ x->errorperbit,
+ &cpi->fn_ptr[bsize],
+ cpi->sf.mv.subpel_force_stop,
+ cpi->sf.mv.subpel_iters_per_step,
+ cond_cost_list(cpi, cost_list),
+ x->nmvjointcost, x->mvcost,
+ &dummy_dist,
+ &x->pred_sse[ref_frame], NULL, 0, 0);
+
+ xd->mi[0]->bmi[i].as_mv[0].as_mv = tmp_mv;
+ } else {
+ b_rate += cpi->inter_mode_cost[mbmi->mode_context[ref_frame]]
+ [INTER_OFFSET(this_mode)];
+ }
+
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ vp9_highbd_build_inter_predictor(pd->pre[0].buf, pd->pre[0].stride,
+ pd->dst.buf, pd->dst.stride,
+ &xd->mi[0]->bmi[i].as_mv[0].as_mv,
+ &xd->block_refs[0]->sf,
+ 4 * num_4x4_blocks_wide,
+ 4 * num_4x4_blocks_high, 0,
+ vp9_get_interp_kernel(mbmi->interp_filter),
+ MV_PRECISION_Q3,
+ mi_col * MI_SIZE + 4 * (i & 0x01),
+ mi_row * MI_SIZE + 4 * (i >> 1), xd->bd);
+ } else {
+#endif
+ vp9_build_inter_predictor(pd->pre[0].buf, pd->pre[0].stride,
+ pd->dst.buf, pd->dst.stride,
+ &xd->mi[0]->bmi[i].as_mv[0].as_mv,
+ &xd->block_refs[0]->sf,
+ 4 * num_4x4_blocks_wide,
+ 4 * num_4x4_blocks_high, 0,
+ vp9_get_interp_kernel(mbmi->interp_filter),
+ MV_PRECISION_Q3,
+ mi_col * MI_SIZE + 4 * (i & 0x01),
+ mi_row * MI_SIZE + 4 * (i >> 1));
+
+#if CONFIG_VP9_HIGHBITDEPTH
+ }
+#endif
+
+ model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc.rate, &this_rdc.dist,
+ &var_y, &sse_y);
+
+ this_rdc.rate += b_rate;
+ this_rdc.rdcost = RDCOST(x->rdmult, x->rddiv,
+ this_rdc.rate, this_rdc.dist);
+ if (this_rdc.rdcost < b_best_rd) {
+ b_best_rd = this_rdc.rdcost;
+ bsi[ref_frame][i].as_mode = this_mode;
+ bsi[ref_frame][i].as_mv[0].as_mv = xd->mi[0]->bmi[i].as_mv[0].as_mv;
+ }
+ } // mode search
+
+ // restore source and prediction buffer pointers.
+ p->src = orig_src;
+ pd->pre[0] = orig_pre[0];
+ pd->dst = orig_dst;
+ this_rd += b_best_rd;
+
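+        // Store the winning sub-block info; for 8x4 / 4x8 partitions it is
+        // replicated into the extra 4x4 position(s) the block spans.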
+ xd->mi[0]->bmi[i] = bsi[ref_frame][i];
+ if (num_4x4_blocks_wide > 1)
+ xd->mi[0]->bmi[i + 1] = xd->mi[0]->bmi[i];
+ if (num_4x4_blocks_high > 1)
+ xd->mi[0]->bmi[i + 2] = xd->mi[0]->bmi[i];
+ }
+ } // loop through sub8x8 blocks
+
+ if (this_rd < best_rd) {
+ best_rd = this_rd;
+ best_ref_frame = ref_frame;
+ }
+ } // reference frames
+
+ mbmi->tx_size = TX_4X4;
+ mbmi->ref_frame[0] = best_ref_frame;
+ for (idy = 0; idy < 2; idy += num_4x4_blocks_high) {
+ for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) {
+ const int block = idy * 2 + idx;
+ xd->mi[0]->bmi[block] = bsi[best_ref_frame][block];
+ if (num_4x4_blocks_wide > 1)
+ xd->mi[0]->bmi[block + 1] = bsi[best_ref_frame][block];
+ if (num_4x4_blocks_high > 1)
+ xd->mi[0]->bmi[block + 2] = bsi[best_ref_frame][block];
+ }
+ }
+ mbmi->mode = xd->mi[0]->bmi[3].as_mode;
+ ctx->mic = *(xd->mi[0]);
+ ctx->skip_txfm[0] = 0;
+ ctx->skip = 0;
+ // Dummy assignment for speed -5. No effect in speed -6.
+ rd_cost->rdcost = best_rd;
}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_pickmode.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_pickmode.h
index 97aeca76a7d..11f44099c1f 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_pickmode.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_pickmode.h
@@ -17,14 +17,21 @@
extern "C" {
#endif
+void vp9_pick_intra_mode(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *rd_cost,
+ BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx);
+
void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
- const struct TileInfo *const tile,
- int mi_row, int mi_col,
- int *returnrate,
- int64_t *returndistortion,
+ TileDataEnc *tile_data,
+ int mi_row, int mi_col, RD_COST *rd_cost,
BLOCK_SIZE bsize,
PICK_MODE_CONTEXT *ctx);
+void vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
+ TileDataEnc *tile_data,
+ int mi_row, int mi_col, RD_COST *rd_cost,
+ BLOCK_SIZE bsize,
+ PICK_MODE_CONTEXT *ctx);
+
#ifdef __cplusplus
} // extern "C"
#endif
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_psnrhvs.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_psnrhvs.c
new file mode 100644
index 00000000000..e10e0284c58
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_psnrhvs.c
@@ -0,0 +1,223 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ *
+ * This code was originally written by: Gregory Maxwell, at the Daala
+ * project.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+
+#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+#include "vp9/encoder/vp9_ssim.h"
+
+#if !defined(M_PI)
+# define M_PI (3.141592653589793238462643)
+#endif
+#include <string.h>
+
+void od_bin_fdct8x8(tran_low_t *y, int ystride, const int16_t *x, int xstride) {
+ (void) xstride;
+ vp9_fdct8x8_c(x, y, ystride);
+}
+
+/* Normalized inverse quantization matrix for 8x8 DCT at the point of
+ * transparency. This is not the JPEG-based matrix from the paper;
+ * this one gives a slightly higher MOS agreement. */
+float csf_y[8][8] = {{1.6193873005, 2.2901594831, 2.08509755623, 1.48366094411,
+ 1.00227514334, 0.678296995242, 0.466224900598, 0.3265091542}, {2.2901594831,
+ 1.94321815382, 2.04793073064, 1.68731108984, 1.2305666963, 0.868920337363,
+ 0.61280991668, 0.436405793551}, {2.08509755623, 2.04793073064,
+ 1.34329019223, 1.09205635862, 0.875748795257, 0.670882927016,
+ 0.501731932449, 0.372504254596}, {1.48366094411, 1.68731108984,
+ 1.09205635862, 0.772819797575, 0.605636379554, 0.48309405692,
+ 0.380429446972, 0.295774038565}, {1.00227514334, 1.2305666963,
+ 0.875748795257, 0.605636379554, 0.448996256676, 0.352889268808,
+ 0.283006984131, 0.226951348204}, {0.678296995242, 0.868920337363,
+ 0.670882927016, 0.48309405692, 0.352889268808, 0.27032073436,
+ 0.215017739696, 0.17408067321}, {0.466224900598, 0.61280991668,
+ 0.501731932449, 0.380429446972, 0.283006984131, 0.215017739696,
+ 0.168869545842, 0.136153931001}, {0.3265091542, 0.436405793551,
+ 0.372504254596, 0.295774038565, 0.226951348204, 0.17408067321,
+ 0.136153931001, 0.109083846276}};
+float csf_cb420[8][8] = {
+ {1.91113096927, 2.46074210438, 1.18284184739, 1.14982565193, 1.05017074788,
+ 0.898018824055, 0.74725392039, 0.615105596242}, {2.46074210438,
+ 1.58529308355, 1.21363250036, 1.38190029285, 1.33100189972,
+ 1.17428548929, 0.996404342439, 0.830890433625}, {1.18284184739,
+ 1.21363250036, 0.978712413627, 1.02624506078, 1.03145147362,
+ 0.960060382087, 0.849823426169, 0.731221236837}, {1.14982565193,
+ 1.38190029285, 1.02624506078, 0.861317501629, 0.801821139099,
+ 0.751437590932, 0.685398513368, 0.608694761374}, {1.05017074788,
+ 1.33100189972, 1.03145147362, 0.801821139099, 0.676555426187,
+ 0.605503172737, 0.55002013668, 0.495804539034}, {0.898018824055,
+ 1.17428548929, 0.960060382087, 0.751437590932, 0.605503172737,
+ 0.514674450957, 0.454353482512, 0.407050308965}, {0.74725392039,
+ 0.996404342439, 0.849823426169, 0.685398513368, 0.55002013668,
+ 0.454353482512, 0.389234902883, 0.342353999733}, {0.615105596242,
+ 0.830890433625, 0.731221236837, 0.608694761374, 0.495804539034,
+ 0.407050308965, 0.342353999733, 0.295530605237}};
+float csf_cr420[8][8] = {
+ {2.03871978502, 2.62502345193, 1.26180942886, 1.11019789803, 1.01397751469,
+ 0.867069376285, 0.721500455585, 0.593906509971}, {2.62502345193,
+ 1.69112867013, 1.17180569821, 1.3342742857, 1.28513006198,
+ 1.13381474809, 0.962064122248, 0.802254508198}, {1.26180942886,
+ 1.17180569821, 0.944981930573, 0.990876405848, 0.995903384143,
+ 0.926972725286, 0.820534991409, 0.706020324706}, {1.11019789803,
+ 1.3342742857, 0.990876405848, 0.831632933426, 0.77418706195,
+ 0.725539939514, 0.661776842059, 0.587716619023}, {1.01397751469,
+ 1.28513006198, 0.995903384143, 0.77418706195, 0.653238524286,
+ 0.584635025748, 0.531064164893, 0.478717061273}, {0.867069376285,
+ 1.13381474809, 0.926972725286, 0.725539939514, 0.584635025748,
+ 0.496936637883, 0.438694579826, 0.393021669543}, {0.721500455585,
+ 0.962064122248, 0.820534991409, 0.661776842059, 0.531064164893,
+ 0.438694579826, 0.375820256136, 0.330555063063}, {0.593906509971,
+ 0.802254508198, 0.706020324706, 0.587716619023, 0.478717061273,
+ 0.393021669543, 0.330555063063, 0.285345396658}};
+
+static double convert_score_db(double _score, double _weight) {
+ return 10 * (log10(255 * 255) - log10(_weight * _score));
+}
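+/* With _weight == 1.0 this reduces to 10 * log10(255^2 / _score): the usual
+   PSNR mapping, with the HVS-masked error playing the role of MSE. */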
+
+static double calc_psnrhvs(const unsigned char *_src, int _systride,
+ const unsigned char *_dst, int _dystride,
+ double _par, int _w, int _h, int _step,
+ float _csf[8][8]) {
+ float ret;
+ int16_t dct_s[8 * 8], dct_d[8 * 8];
+ tran_low_t dct_s_coef[8 * 8], dct_d_coef[8 * 8];
+ float mask[8][8];
+ int pixels;
+ int x;
+ int y;
+ (void) _par;
+ ret = pixels = 0;
+ /*In the PSNR-HVS-M paper[1] the authors describe the construction of
+ their masking table as "we have used the quantization table for the
+ color component Y of JPEG [6] that has been also obtained on the
+ basis of CSF. Note that the values in quantization table JPEG have
+ been normalized and then squared." Their CSF matrix (from PSNR-HVS)
+   was also constructed from the JPEG matrices. I cannot find any obvious
+ scheme of normalizing to produce their table, but if I multiply their
+ CSF by 0.38857 and square the result I get their masking table.
+ I have no idea where this constant comes from, but deviating from it
+ too greatly hurts MOS agreement.
+
+ [1] Nikolay Ponomarenko, Flavia Silvestri, Karen Egiazarian, Marco Carli,
+ Jaakko Astola, Vladimir Lukin, "On between-coefficient contrast masking
+ of DCT basis functions", CD-ROM Proceedings of the Third
+ International Workshop on Video Processing and Quality Metrics for Consumer
+ Electronics VPQM-07, Scottsdale, Arizona, USA, 25-26 January, 2007, 4 p.*/
+ for (x = 0; x < 8; x++)
+ for (y = 0; y < 8; y++)
+ mask[x][y] = (_csf[x][y] * 0.3885746225901003)
+ * (_csf[x][y] * 0.3885746225901003);
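+  /* Sanity check of the constant above: for csf_y[0][0] = 1.6193873005,
+     mask[0][0] = (1.6193873005 * 0.3885746...)^2 ~= 0.629^2 ~= 0.396. */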
+ for (y = 0; y < _h - 7; y += _step) {
+ for (x = 0; x < _w - 7; x += _step) {
+ int i;
+ int j;
+ float s_means[4];
+ float d_means[4];
+ float s_vars[4];
+ float d_vars[4];
+ float s_gmean = 0;
+ float d_gmean = 0;
+ float s_gvar = 0;
+ float d_gvar = 0;
+ float s_mask = 0;
+ float d_mask = 0;
+ for (i = 0; i < 4; i++)
+ s_means[i] = d_means[i] = s_vars[i] = d_vars[i] = 0;
+ for (i = 0; i < 8; i++) {
+ for (j = 0; j < 8; j++) {
+ int sub = ((i & 12) >> 2) + ((j & 12) >> 1);
+ dct_s[i * 8 + j] = _src[(y + i) * _systride + (j + x)];
+ dct_d[i * 8 + j] = _dst[(y + i) * _dystride + (j + x)];
+ s_gmean += dct_s[i * 8 + j];
+ d_gmean += dct_d[i * 8 + j];
+ s_means[sub] += dct_s[i * 8 + j];
+ d_means[sub] += dct_d[i * 8 + j];
+ }
+ }
+ s_gmean /= 64.f;
+ d_gmean /= 64.f;
+ for (i = 0; i < 4; i++)
+ s_means[i] /= 16.f;
+ for (i = 0; i < 4; i++)
+ d_means[i] /= 16.f;
+ for (i = 0; i < 8; i++) {
+ for (j = 0; j < 8; j++) {
+ int sub = ((i & 12) >> 2) + ((j & 12) >> 1);
+ s_gvar += (dct_s[i * 8 + j] - s_gmean) * (dct_s[i * 8 + j] - s_gmean);
+ d_gvar += (dct_d[i * 8 + j] - d_gmean) * (dct_d[i * 8 + j] - d_gmean);
+ s_vars[sub] += (dct_s[i * 8 + j] - s_means[sub])
+ * (dct_s[i * 8 + j] - s_means[sub]);
+ d_vars[sub] += (dct_d[i * 8 + j] - d_means[sub])
+ * (dct_d[i * 8 + j] - d_means[sub]);
+ }
+ }
+ s_gvar *= 1 / 63.f * 64;
+ d_gvar *= 1 / 63.f * 64;
+ for (i = 0; i < 4; i++)
+ s_vars[i] *= 1 / 15.f * 16;
+ for (i = 0; i < 4; i++)
+ d_vars[i] *= 1 / 15.f * 16;
+ if (s_gvar > 0)
+ s_gvar = (s_vars[0] + s_vars[1] + s_vars[2] + s_vars[3]) / s_gvar;
+ if (d_gvar > 0)
+ d_gvar = (d_vars[0] + d_vars[1] + d_vars[2] + d_vars[3]) / d_gvar;
+ od_bin_fdct8x8(dct_s_coef, 8, dct_s, 8);
+ od_bin_fdct8x8(dct_d_coef, 8, dct_d, 8);
+ for (i = 0; i < 8; i++)
+ for (j = (i == 0); j < 8; j++)
+ s_mask += dct_s_coef[i * 8 + j] * dct_s_coef[i * 8 + j] * mask[i][j];
+ for (i = 0; i < 8; i++)
+ for (j = (i == 0); j < 8; j++)
+ d_mask += dct_d_coef[i * 8 + j] * dct_d_coef[i * 8 + j] * mask[i][j];
+ s_mask = sqrt(s_mask * s_gvar) / 32.f;
+ d_mask = sqrt(d_mask * d_gvar) / 32.f;
+ if (d_mask > s_mask)
+ s_mask = d_mask;
+ for (i = 0; i < 8; i++) {
+ for (j = 0; j < 8; j++) {
+ float err;
+ err = fabs(dct_s_coef[i * 8 + j] - dct_d_coef[i * 8 + j]);
+ if (i != 0 || j != 0)
+ err = err < s_mask / mask[i][j] ? 0 : err - s_mask / mask[i][j];
+ ret += (err * _csf[i][j]) * (err * _csf[i][j]);
+ pixels++;
+ }
+ }
+ }
+ }
+ ret /= pixels;
+ return ret;
+}
+double vp9_psnrhvs(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest,
+ double *y_psnrhvs, double *u_psnrhvs, double *v_psnrhvs) {
+ double psnrhvs;
+ double par = 1.0;
+ int step = 7;
+ vp9_clear_system_state();
+ *y_psnrhvs = calc_psnrhvs(source->y_buffer, source->y_stride, dest->y_buffer,
+ dest->y_stride, par, source->y_crop_width,
+ source->y_crop_height, step, csf_y);
+
+ *u_psnrhvs = calc_psnrhvs(source->u_buffer, source->uv_stride, dest->u_buffer,
+ dest->uv_stride, par, source->uv_crop_width,
+ source->uv_crop_height, step, csf_cb420);
+
+ *v_psnrhvs = calc_psnrhvs(source->v_buffer, source->uv_stride, dest->v_buffer,
+ dest->uv_stride, par, source->uv_crop_width,
+ source->uv_crop_height, step, csf_cr420);
+ psnrhvs = (*y_psnrhvs) * .8 + .1 * ((*u_psnrhvs) + (*v_psnrhvs));
+
+ return convert_score_db(psnrhvs, 1.0);
+}
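+
+/* Minimal usage sketch (illustrative): `src` and `dst` are assumed to be two
+ * already-populated YV12_BUFFER_CONFIG frames of equal size. Note that the
+ * per-plane outputs are raw masked-error scores; only the return value is
+ * converted to dB:
+ *
+ *   double y, u, v;
+ *   const double db = vp9_psnrhvs(src, dst, &y, &u, &v);
+ *   printf("PSNR-HVS: %.2f dB\n", db);
+ */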
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_quantize.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_quantize.c
index 2ba1f922bc7..3c07e2c2437 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_quantize.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_quantize.c
@@ -19,7 +19,8 @@
#include "vp9/encoder/vp9_quantize.h"
#include "vp9/encoder/vp9_rd.h"
-void vp9_quantize_dc(const tran_low_t *coeff_ptr, int skip_block,
+void vp9_quantize_dc(const tran_low_t *coeff_ptr,
+ int n_coeffs, int skip_block,
const int16_t *round_ptr, const int16_t quant,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t dequant_ptr, uint16_t *eob_ptr) {
@@ -29,6 +30,9 @@ void vp9_quantize_dc(const tran_low_t *coeff_ptr, int skip_block,
const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
int tmp, eob = -1;
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
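+  // The up-front clears above guarantee well-defined, all-zero output on
+  // the skip_block path.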
if (!skip_block) {
tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
tmp = (tmp * quant) >> 16;
@@ -41,12 +45,16 @@ void vp9_quantize_dc(const tran_low_t *coeff_ptr, int skip_block,
}
#if CONFIG_VP9_HIGHBITDEPTH
-void vp9_highbd_quantize_dc(const tran_low_t *coeff_ptr, int skip_block,
+void vp9_highbd_quantize_dc(const tran_low_t *coeff_ptr,
+ int n_coeffs, int skip_block,
const int16_t *round_ptr, const int16_t quant,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t dequant_ptr, uint16_t *eob_ptr) {
int eob = -1;
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
if (!skip_block) {
const int rc = 0;
const int coeff = coeff_ptr[rc];
@@ -56,7 +64,7 @@ void vp9_highbd_quantize_dc(const tran_low_t *coeff_ptr, int skip_block,
const int64_t tmp =
(clamp(abs_coeff + round_ptr[rc != 0], INT32_MIN, INT32_MAX) *
quant) >> 16;
- qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
+ qcoeff_ptr[rc] = (tran_low_t)((tmp ^ coeff_sign) - coeff_sign);
dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr;
if (tmp)
eob = 0;
@@ -69,15 +77,20 @@ void vp9_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
const int16_t *round_ptr, const int16_t quant,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t dequant_ptr, uint16_t *eob_ptr) {
+ const int n_coeffs = 1024;
const int rc = 0;
const int coeff = coeff_ptr[rc];
const int coeff_sign = (coeff >> 31);
const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
int tmp, eob = -1;
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
if (!skip_block) {
- tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
+ tmp = clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1),
+ INT16_MIN, INT16_MAX);
tmp = (tmp * quant) >> 15;
qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr / 2;
@@ -96,8 +109,12 @@ void vp9_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr,
tran_low_t *dqcoeff_ptr,
const int16_t dequant_ptr,
uint16_t *eob_ptr) {
+ const int n_coeffs = 1024;
int eob = -1;
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
if (!skip_block) {
const int rc = 0;
const int coeff = coeff_ptr[rc];
@@ -105,9 +122,9 @@ void vp9_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr,
const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
const int64_t tmp =
- (clamp(abs_coeff + round_ptr[rc != 0], INT32_MIN, INT32_MAX) *
- quant) >> 15;
- qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
+ (clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1),
+ INT32_MIN, INT32_MAX) * quant) >> 15;
+ qcoeff_ptr[rc] = (tran_low_t)((tmp ^ coeff_sign) - coeff_sign);
dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr / 2;
if (tmp)
eob = 0;
@@ -122,18 +139,17 @@ void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
const int16_t *quant_ptr, const int16_t *quant_shift_ptr,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr,
- int zbin_oq_value, uint16_t *eob_ptr,
+ uint16_t *eob_ptr,
const int16_t *scan, const int16_t *iscan) {
int i, eob = -1;
  // TODO(jingning) Decide whether these arguments are needed after the
// quantization process is completed.
(void)zbin_ptr;
(void)quant_shift_ptr;
- (void)zbin_oq_value;
(void)iscan;
- vpx_memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
- vpx_memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
if (!skip_block) {
// Quantization pass: All coefficients with index >= zero_flag are
@@ -168,7 +184,6 @@ void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr,
tran_low_t *qcoeff_ptr,
tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr,
- int zbin_oq_value,
uint16_t *eob_ptr,
const int16_t *scan,
const int16_t *iscan) {
@@ -178,11 +193,10 @@ void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr,
// quantization process is completed.
(void)zbin_ptr;
(void)quant_shift_ptr;
- (void)zbin_oq_value;
(void)iscan;
- vpx_memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr));
- vpx_memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr));
+ memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr));
if (!skip_block) {
// Quantization pass: All coefficients with index >= zero_flag are
@@ -197,7 +211,7 @@ void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr,
(clamp(abs_coeff + round_ptr[rc != 0], INT32_MIN, INT32_MAX) *
quant_ptr[rc != 0]) >> 16;
- qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
+ qcoeff_ptr[rc] = (tran_low_t)((tmp ^ coeff_sign) - coeff_sign);
dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
if (tmp)
@@ -217,16 +231,15 @@ void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
const int16_t *quant_shift_ptr,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr,
- int zbin_oq_value, uint16_t *eob_ptr,
+ uint16_t *eob_ptr,
const int16_t *scan, const int16_t *iscan) {
int i, eob = -1;
(void)zbin_ptr;
(void)quant_shift_ptr;
- (void)zbin_oq_value;
(void)iscan;
- vpx_memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
- vpx_memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
if (!skip_block) {
for (i = 0; i < n_coeffs; i++) {
@@ -261,16 +274,15 @@ void vp9_highbd_quantize_fp_32x32_c(const tran_low_t *coeff_ptr,
tran_low_t *qcoeff_ptr,
tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr,
- int zbin_oq_value, uint16_t *eob_ptr,
+ uint16_t *eob_ptr,
const int16_t *scan, const int16_t *iscan) {
int i, eob = -1;
(void)zbin_ptr;
(void)quant_shift_ptr;
- (void)zbin_oq_value;
(void)iscan;
- vpx_memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
- vpx_memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
if (!skip_block) {
for (i = 0; i < n_coeffs; i++) {
@@ -284,7 +296,7 @@ void vp9_highbd_quantize_fp_32x32_c(const tran_low_t *coeff_ptr,
tmp = clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1),
INT32_MIN, INT32_MAX);
tmp = (tmp * quant_ptr[rc != 0]) >> 15;
- qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
+ qcoeff_ptr[rc] = (tran_low_t)((tmp ^ coeff_sign) - coeff_sign);
dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
}
@@ -302,17 +314,15 @@ void vp9_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
const int16_t *quant_ptr, const int16_t *quant_shift_ptr,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr,
- int zbin_oq_value, uint16_t *eob_ptr,
+ uint16_t *eob_ptr,
const int16_t *scan, const int16_t *iscan) {
int i, non_zero_count = (int)n_coeffs, eob = -1;
- const int zbins[2] = { zbin_ptr[0] + zbin_oq_value,
- zbin_ptr[1] + zbin_oq_value };
- const int nzbins[2] = { zbins[0] * -1,
- zbins[1] * -1 };
+ const int zbins[2] = {zbin_ptr[0], zbin_ptr[1]};
+ const int nzbins[2] = {zbins[0] * -1, zbins[1] * -1};
(void)iscan;
- vpx_memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
- vpx_memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
if (!skip_block) {
// Pre-scan pass
@@ -355,18 +365,16 @@ void vp9_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
const int16_t *round_ptr, const int16_t *quant_ptr,
const int16_t *quant_shift_ptr,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
- const int16_t *dequant_ptr, int zbin_oq_value,
+ const int16_t *dequant_ptr,
uint16_t *eob_ptr, const int16_t *scan,
const int16_t *iscan) {
int i, non_zero_count = (int)n_coeffs, eob = -1;
- const int zbins[2] = { zbin_ptr[0] + zbin_oq_value,
- zbin_ptr[1] + zbin_oq_value };
- const int nzbins[2] = { zbins[0] * -1,
- zbins[1] * -1 };
+ const int zbins[2] = {zbin_ptr[0], zbin_ptr[1]};
+ const int nzbins[2] = {zbins[0] * -1, zbins[1] * -1};
(void)iscan;
- vpx_memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
- vpx_memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
if (!skip_block) {
// Pre-scan pass
@@ -393,7 +401,7 @@ void vp9_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
INT32_MIN, INT32_MAX);
tmp = ((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) *
quant_shift_ptr[rc != 0]) >> 16; // quantization
- qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
+ qcoeff_ptr[rc] = (tran_low_t)((tmp ^ coeff_sign) - coeff_sign);
dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
if (tmp)
@@ -412,10 +420,10 @@ void vp9_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
const int16_t *quant_shift_ptr,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr,
- int zbin_oq_value, uint16_t *eob_ptr,
+ uint16_t *eob_ptr,
const int16_t *scan, const int16_t *iscan) {
- const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0] + zbin_oq_value, 1),
- ROUND_POWER_OF_TWO(zbin_ptr[1] + zbin_oq_value, 1) };
+ const int zbins[2] = {ROUND_POWER_OF_TWO(zbin_ptr[0], 1),
+ ROUND_POWER_OF_TWO(zbin_ptr[1], 1)};
const int nzbins[2] = {zbins[0] * -1, zbins[1] * -1};
int idx = 0;
@@ -423,8 +431,8 @@ void vp9_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
int i, eob = -1;
(void)iscan;
- vpx_memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
- vpx_memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
if (!skip_block) {
// Pre-scan pass
@@ -471,19 +479,19 @@ void vp9_highbd_quantize_b_32x32_c(const tran_low_t *coeff_ptr,
tran_low_t *qcoeff_ptr,
tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr,
- int zbin_oq_value, uint16_t *eob_ptr,
+ uint16_t *eob_ptr,
const int16_t *scan, const int16_t *iscan) {
- const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0] + zbin_oq_value, 1),
- ROUND_POWER_OF_TWO(zbin_ptr[1] + zbin_oq_value, 1) };
- const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 };
+ const int zbins[2] = {ROUND_POWER_OF_TWO(zbin_ptr[0], 1),
+ ROUND_POWER_OF_TWO(zbin_ptr[1], 1)};
+ const int nzbins[2] = {zbins[0] * -1, zbins[1] * -1};
int idx = 0;
int idx_arr[1024];
int i, eob = -1;
(void)iscan;
- vpx_memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
- vpx_memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
if (!skip_block) {
// Pre-scan pass
@@ -510,7 +518,7 @@ void vp9_highbd_quantize_b_32x32_c(const tran_low_t *coeff_ptr,
tmp = ((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) *
quant_shift_ptr[rc != 0]) >> 15;
- qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
+ qcoeff_ptr[rc] = (tran_low_t)((tmp ^ coeff_sign) - coeff_sign);
dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
if (tmp)
@@ -530,21 +538,21 @@ void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int plane, int block,
#if CONFIG_VP9_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
vp9_highbd_quantize_b(BLOCK_OFFSET(p->coeff, block),
- 16, x->skip_block,
- p->zbin, p->round, p->quant, p->quant_shift,
- BLOCK_OFFSET(p->qcoeff, block),
- BLOCK_OFFSET(pd->dqcoeff, block),
- pd->dequant, p->zbin_extra, &p->eobs[block],
- scan, iscan);
+ 16, x->skip_block,
+ p->zbin, p->round, p->quant, p->quant_shift,
+ BLOCK_OFFSET(p->qcoeff, block),
+ BLOCK_OFFSET(pd->dqcoeff, block),
+ pd->dequant, &p->eobs[block],
+ scan, iscan);
return;
}
#endif
vp9_quantize_b(BLOCK_OFFSET(p->coeff, block),
- 16, x->skip_block,
- p->zbin, p->round, p->quant, p->quant_shift,
- BLOCK_OFFSET(p->qcoeff, block),
- BLOCK_OFFSET(pd->dqcoeff, block),
- pd->dequant, p->zbin_extra, &p->eobs[block], scan, iscan);
+ 16, x->skip_block,
+ p->zbin, p->round, p->quant, p->quant_shift,
+ BLOCK_OFFSET(p->qcoeff, block),
+ BLOCK_OFFSET(pd->dqcoeff, block),
+ pd->dequant, &p->eobs[block], scan, iscan);
}
static void invert_quant(int16_t *quant, int16_t *shift, int d) {
@@ -600,7 +608,7 @@ void vp9_init_quantizer(VP9_COMP *cpi) {
quants->y_round_fp[q][i] = (qrounding_factor_fp * quant) >> 7;
quants->y_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant, 7);
quants->y_round[q][i] = (qrounding_factor * quant) >> 7;
- cm->y_dequant[q][i] = quant;
+ cpi->y_dequant[q][i] = quant;
// uv
quant = i == 0 ? vp9_dc_quant(q, cm->uv_dc_delta_q, cm->bit_depth)
@@ -611,7 +619,7 @@ void vp9_init_quantizer(VP9_COMP *cpi) {
quants->uv_round_fp[q][i] = (qrounding_factor_fp * quant) >> 7;
quants->uv_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant, 7);
quants->uv_round[q][i] = (qrounding_factor * quant) >> 7;
- cm->uv_dequant[q][i] = quant;
+ cpi->uv_dequant[q][i] = quant;
}
for (i = 2; i < 8; i++) {
@@ -621,7 +629,7 @@ void vp9_init_quantizer(VP9_COMP *cpi) {
quants->y_quant_shift[q][i] = quants->y_quant_shift[q][1];
quants->y_zbin[q][i] = quants->y_zbin[q][1];
quants->y_round[q][i] = quants->y_round[q][1];
- cm->y_dequant[q][i] = cm->y_dequant[q][1];
+ cpi->y_dequant[q][i] = cpi->y_dequant[q][1];
quants->uv_quant[q][i] = quants->uv_quant[q][1];
quants->uv_quant_fp[q][i] = quants->uv_quant_fp[q][1];
@@ -629,7 +637,7 @@ void vp9_init_quantizer(VP9_COMP *cpi) {
quants->uv_quant_shift[q][i] = quants->uv_quant_shift[q][1];
quants->uv_zbin[q][i] = quants->uv_zbin[q][1];
quants->uv_round[q][i] = quants->uv_round[q][1];
- cm->uv_dequant[q][i] = cm->uv_dequant[q][1];
+ cpi->uv_dequant[q][i] = cpi->uv_dequant[q][1];
}
}
}
@@ -638,10 +646,9 @@ void vp9_init_plane_quantizers(VP9_COMP *cpi, MACROBLOCK *x) {
const VP9_COMMON *const cm = &cpi->common;
MACROBLOCKD *const xd = &x->e_mbd;
QUANTS *const quants = &cpi->quants;
- const int segment_id = xd->mi[0].src_mi->mbmi.segment_id;
+ const int segment_id = xd->mi[0]->mbmi.segment_id;
const int qindex = vp9_get_qindex(&cm->seg, segment_id, cm->base_qindex);
const int rdmult = vp9_compute_rd_mult(cpi, qindex + cm->y_dc_delta_q);
- const int zbin = cpi->zbin_mode_boost;
int i;
// Y
@@ -651,13 +658,10 @@ void vp9_init_plane_quantizers(VP9_COMP *cpi, MACROBLOCK *x) {
x->plane[0].quant_shift = quants->y_quant_shift[qindex];
x->plane[0].zbin = quants->y_zbin[qindex];
x->plane[0].round = quants->y_round[qindex];
- x->plane[0].zbin_extra = (int16_t)((cm->y_dequant[qindex][1] * zbin) >> 7);
- xd->plane[0].dequant = cm->y_dequant[qindex];
+ xd->plane[0].dequant = cpi->y_dequant[qindex];
- x->plane[0].quant_thred[0] = (x->plane[0].zbin[0] + x->plane[0].zbin_extra) *
- (x->plane[0].zbin[0] + x->plane[0].zbin_extra);
- x->plane[0].quant_thred[1] = (x->plane[0].zbin[1] + x->plane[0].zbin_extra) *
- (x->plane[0].zbin[1] + x->plane[0].zbin_extra);
+ x->plane[0].quant_thred[0] = x->plane[0].zbin[0] * x->plane[0].zbin[0];
+ x->plane[0].quant_thred[1] = x->plane[0].zbin[1] * x->plane[0].zbin[1];
// UV
for (i = 1; i < 3; i++) {
@@ -667,15 +671,10 @@ void vp9_init_plane_quantizers(VP9_COMP *cpi, MACROBLOCK *x) {
x->plane[i].quant_shift = quants->uv_quant_shift[qindex];
x->plane[i].zbin = quants->uv_zbin[qindex];
x->plane[i].round = quants->uv_round[qindex];
- x->plane[i].zbin_extra = (int16_t)((cm->uv_dequant[qindex][1] * zbin) >> 7);
- xd->plane[i].dequant = cm->uv_dequant[qindex];
-
- x->plane[i].quant_thred[0] =
- (x->plane[i].zbin[0] + x->plane[i].zbin_extra) *
- (x->plane[i].zbin[0] + x->plane[i].zbin_extra);
- x->plane[i].quant_thred[1] =
- (x->plane[i].zbin[1] + x->plane[i].zbin_extra) *
- (x->plane[i].zbin[1] + x->plane[i].zbin_extra);
+ xd->plane[i].dequant = cpi->uv_dequant[qindex];
+
+ x->plane[i].quant_thred[0] = x->plane[i].zbin[0] * x->plane[i].zbin[0];
+ x->plane[i].quant_thred[1] = x->plane[i].zbin[1] * x->plane[i].zbin[1];
}
x->skip_block = vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP);
@@ -684,24 +683,11 @@ void vp9_init_plane_quantizers(VP9_COMP *cpi, MACROBLOCK *x) {
x->errorperbit = rdmult >> 6;
x->errorperbit += (x->errorperbit == 0);
- vp9_initialize_me_consts(cpi, x->q_index);
-}
-
-void vp9_update_zbin_extra(VP9_COMP *cpi, MACROBLOCK *x) {
- const int qindex = x->q_index;
- const int y_zbin_extra = (cpi->common.y_dequant[qindex][1] *
- cpi->zbin_mode_boost) >> 7;
- const int uv_zbin_extra = (cpi->common.uv_dequant[qindex][1] *
- cpi->zbin_mode_boost) >> 7;
-
- x->plane[0].zbin_extra = (int16_t)y_zbin_extra;
- x->plane[1].zbin_extra = (int16_t)uv_zbin_extra;
- x->plane[2].zbin_extra = (int16_t)uv_zbin_extra;
+ vp9_initialize_me_consts(cpi, x, x->q_index);
}
void vp9_frame_init_quantizer(VP9_COMP *cpi) {
- cpi->zbin_mode_boost = 0;
- vp9_init_plane_quantizers(cpi, &cpi->mb);
+ vp9_init_plane_quantizers(cpi, &cpi->td.mb);
}
void vp9_set_quantizer(VP9_COMMON *cm, int q) {
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_quantize.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_quantize.h
index cee46e7e033..55e546944a7 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_quantize.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_quantize.h
@@ -37,7 +37,8 @@ typedef struct {
DECLARE_ALIGNED(16, int16_t, uv_round[QINDEX_RANGE][8]);
} QUANTS;
-void vp9_quantize_dc(const tran_low_t *coeff_ptr, int skip_block,
+void vp9_quantize_dc(const tran_low_t *coeff_ptr,
+ int n_coeffs, int skip_block,
const int16_t *round_ptr, const int16_t quant_ptr,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t dequant_ptr, uint16_t *eob_ptr);
@@ -49,7 +50,8 @@ void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int plane, int block,
const int16_t *scan, const int16_t *iscan);
#if CONFIG_VP9_HIGHBITDEPTH
-void vp9_highbd_quantize_dc(const tran_low_t *coeff_ptr, int skip_block,
+void vp9_highbd_quantize_dc(const tran_low_t *coeff_ptr,
+ int n_coeffs, int skip_block,
const int16_t *round_ptr, const int16_t quant_ptr,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t dequant_ptr, uint16_t *eob_ptr);
@@ -68,8 +70,6 @@ struct VP9Common;
void vp9_frame_init_quantizer(struct VP9_COMP *cpi);
-void vp9_update_zbin_extra(struct VP9_COMP *cpi, MACROBLOCK *x);
-
void vp9_init_plane_quantizers(struct VP9_COMP *cpi, MACROBLOCK *x);
void vp9_init_quantizer(struct VP9_COMP *cpi);
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ratectrl.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ratectrl.c
index 65bca669a89..4c33ffd977b 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ratectrl.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ratectrl.c
@@ -18,6 +18,7 @@
#include "vpx_mem/vpx_mem.h"
#include "vp9/common/vp9_alloccommon.h"
+#include "vp9/encoder/vp9_aq_cyclicrefresh.h"
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_entropymode.h"
#include "vp9/common/vp9_quant_common.h"
@@ -185,9 +186,9 @@ int vp9_rc_bits_per_mb(FRAME_TYPE frame_type, int qindex,
return (int)(enumerator * correction_factor / q);
}
-static int estimate_bits_at_q(FRAME_TYPE frame_type, int q, int mbs,
- double correction_factor,
- vpx_bit_depth_t bit_depth) {
+int vp9_estimate_bits_at_q(FRAME_TYPE frame_type, int q, int mbs,
+ double correction_factor,
+ vpx_bit_depth_t bit_depth) {
const int bpm = (int)(vp9_rc_bits_per_mb(frame_type, q, correction_factor,
bit_depth));
return MAX(FRAME_OVERHEAD_BITS,
@@ -196,6 +197,7 @@ static int estimate_bits_at_q(FRAME_TYPE frame_type, int q, int mbs,
int vp9_rc_clamp_pframe_target_size(const VP9_COMP *const cpi, int target) {
const RATE_CONTROL *rc = &cpi->rc;
+ const VP9EncoderConfig *oxcf = &cpi->oxcf;
const int min_frame_target = MAX(rc->min_frame_bandwidth,
rc->avg_frame_bandwidth >> 5);
if (target < min_frame_target)
@@ -210,6 +212,11 @@ int vp9_rc_clamp_pframe_target_size(const VP9_COMP *const cpi, int target) {
// Clip the frame target to the maximum allowed value.
if (target > rc->max_frame_bandwidth)
target = rc->max_frame_bandwidth;
+ if (oxcf->rc_max_inter_bitrate_pct) {
+ const int max_rate = rc->avg_frame_bandwidth *
+ oxcf->rc_max_inter_bitrate_pct / 100;
+ target = MIN(target, max_rate);
+ }
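+  // E.g. rc_max_inter_bitrate_pct = 300 caps an inter frame at 3x the
+  // average frame bandwidth.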
return target;
}
@@ -226,7 +233,6 @@ int vp9_rc_clamp_iframe_target_size(const VP9_COMP *const cpi, int target) {
return target;
}
-
// Update the buffer level for higher layers, given the encoded current layer.
static void update_layer_buffer_level(SVC *svc, int encoded_frame_size) {
int temporal_layer = 0;
@@ -354,26 +360,34 @@ int vp9_rc_drop_frame(VP9_COMP *cpi) {
static double get_rate_correction_factor(const VP9_COMP *cpi) {
const RATE_CONTROL *const rc = &cpi->rc;
+ double rcf;
if (cpi->common.frame_type == KEY_FRAME) {
- return rc->rate_correction_factors[KF_STD];
+ rcf = rc->rate_correction_factors[KF_STD];
} else if (cpi->oxcf.pass == 2) {
RATE_FACTOR_LEVEL rf_lvl =
cpi->twopass.gf_group.rf_level[cpi->twopass.gf_group.index];
- return rc->rate_correction_factors[rf_lvl];
+ rcf = rc->rate_correction_factors[rf_lvl];
} else {
if ((cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame) &&
- !rc->is_src_frame_alt_ref &&
- !(cpi->use_svc && cpi->oxcf.rc_mode == VPX_CBR))
- return rc->rate_correction_factors[GF_ARF_STD];
+ !rc->is_src_frame_alt_ref && !cpi->use_svc &&
+ (cpi->oxcf.rc_mode != VPX_CBR || cpi->oxcf.gf_cbr_boost_pct > 20))
+ rcf = rc->rate_correction_factors[GF_ARF_STD];
else
- return rc->rate_correction_factors[INTER_NORMAL];
+ rcf = rc->rate_correction_factors[INTER_NORMAL];
}
+ rcf *= rcf_mult[rc->frame_size_selector];
+ return fclamp(rcf, MIN_BPB_FACTOR, MAX_BPB_FACTOR);
}
static void set_rate_correction_factor(VP9_COMP *cpi, double factor) {
RATE_CONTROL *const rc = &cpi->rc;
+ // Normalize RCF to account for the size-dependent scaling factor.
+ factor /= rcf_mult[cpi->rc.frame_size_selector];
+
+ factor = fclamp(factor, MIN_BPB_FACTOR, MAX_BPB_FACTOR);
+
if (cpi->common.frame_type == KEY_FRAME) {
rc->rate_correction_factors[KF_STD] = factor;
} else if (cpi->oxcf.pass == 2) {
@@ -382,15 +396,15 @@ static void set_rate_correction_factor(VP9_COMP *cpi, double factor) {
rc->rate_correction_factors[rf_lvl] = factor;
} else {
if ((cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame) &&
- !rc->is_src_frame_alt_ref &&
- !(cpi->use_svc && cpi->oxcf.rc_mode == VPX_CBR))
+ !rc->is_src_frame_alt_ref && !cpi->use_svc &&
+ (cpi->oxcf.rc_mode != VPX_CBR || cpi->oxcf.gf_cbr_boost_pct > 20))
rc->rate_correction_factors[GF_ARF_STD] = factor;
else
rc->rate_correction_factors[INTER_NORMAL] = factor;
}
}
-void vp9_rc_update_rate_correction_factors(VP9_COMP *cpi, int damp_var) {
+void vp9_rc_update_rate_correction_factors(VP9_COMP *cpi) {
const VP9_COMMON *const cm = &cpi->common;
int correction_factor = 100;
double rate_correction_factor = get_rate_correction_factor(cpi);
@@ -408,36 +422,41 @@ void vp9_rc_update_rate_correction_factors(VP9_COMP *cpi, int damp_var) {
// Work out how big we would have expected the frame to be at this Q given
// the current correction factor.
// Stay in double to avoid int overflow when values are large
- projected_size_based_on_q = estimate_bits_at_q(cm->frame_type,
- cm->base_qindex, cm->MBs,
- rate_correction_factor,
- cm->bit_depth);
+ if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cpi->common.seg.enabled) {
+ projected_size_based_on_q =
+ vp9_cyclic_refresh_estimate_bits_at_q(cpi, rate_correction_factor);
+ } else {
+ projected_size_based_on_q = vp9_estimate_bits_at_q(cpi->common.frame_type,
+ cm->base_qindex,
+ cm->MBs,
+ rate_correction_factor,
+ cm->bit_depth);
+ }
// Work out a size correction factor.
if (projected_size_based_on_q > FRAME_OVERHEAD_BITS)
- correction_factor = (100 * cpi->rc.projected_frame_size) /
- projected_size_based_on_q;
+ correction_factor = (int)((100 * (int64_t)cpi->rc.projected_frame_size) /
+ projected_size_based_on_q);
  // A more heavily damped adjustment is used if we have been oscillating on
  // either side of the target.
- switch (damp_var) {
- case 0:
- adjustment_limit = 0.75;
- break;
- case 1:
- adjustment_limit = 0.375;
- break;
- case 2:
- default:
- adjustment_limit = 0.25;
- break;
- }
+ adjustment_limit = 0.25 +
+ 0.5 * MIN(1, fabs(log10(0.01 * correction_factor)));
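+  // E.g. correction_factor = 200 gives 0.25 + 0.5 * log10(2) ~= 0.40;
+  // factors near 100 damp toward 0.25 and anything beyond 10x in either
+  // direction saturates at 0.75, matching the bounds of the old switch.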
+
+ cpi->rc.q_2_frame = cpi->rc.q_1_frame;
+ cpi->rc.q_1_frame = cm->base_qindex;
+ cpi->rc.rc_2_frame = cpi->rc.rc_1_frame;
+ if (correction_factor > 110)
+ cpi->rc.rc_1_frame = -1;
+ else if (correction_factor < 90)
+ cpi->rc.rc_1_frame = 1;
+ else
+ cpi->rc.rc_1_frame = 0;
if (correction_factor > 102) {
// We are not already at the worst allowable quality
correction_factor = (int)(100 + ((correction_factor - 100) *
adjustment_limit));
rate_correction_factor = (rate_correction_factor * correction_factor) / 100;
-
// Keep rate_correction_factor within limits
if (rate_correction_factor > MAX_BPB_FACTOR)
rate_correction_factor = MAX_BPB_FACTOR;
@@ -461,7 +480,7 @@ int vp9_rc_regulate_q(const VP9_COMP *cpi, int target_bits_per_frame,
const VP9_COMMON *const cm = &cpi->common;
int q = active_worst_quality;
int last_error = INT_MAX;
- int i, target_bits_per_mb;
+ int i, target_bits_per_mb, bits_per_mb_at_this_q;
const double correction_factor = get_rate_correction_factor(cpi);
// Calculate required scaling factor based on target frame size and size of
@@ -472,9 +491,14 @@ int vp9_rc_regulate_q(const VP9_COMP *cpi, int target_bits_per_frame,
i = active_best_quality;
do {
- const int bits_per_mb_at_this_q = (int)vp9_rc_bits_per_mb(cm->frame_type, i,
- correction_factor,
- cm->bit_depth);
+ if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled) {
+ bits_per_mb_at_this_q =
+ (int)vp9_cyclic_refresh_rc_bits_per_mb(cpi, i, correction_factor);
+ } else {
+ bits_per_mb_at_this_q = (int)vp9_rc_bits_per_mb(cm->frame_type, i,
+ correction_factor,
+ cm->bit_depth);
+ }
if (bits_per_mb_at_this_q <= target_bits_per_mb) {
if ((target_bits_per_mb - bits_per_mb_at_this_q) <= last_error)
@@ -488,6 +512,14 @@ int vp9_rc_regulate_q(const VP9_COMP *cpi, int target_bits_per_frame,
}
} while (++i <= active_worst_quality);
+ // In CBR mode, this makes sure q is between oscillating Qs to prevent
+ // resonance.
+ if (cpi->oxcf.rc_mode == VPX_CBR &&
+ (cpi->rc.rc_1_frame * cpi->rc.rc_2_frame == -1) &&
+ cpi->rc.q_1_frame != cpi->rc.q_2_frame) {
+ q = clamp(q, MIN(cpi->rc.q_1_frame, cpi->rc.q_2_frame),
+ MAX(cpi->rc.q_1_frame, cpi->rc.q_2_frame));
+ }
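+  // E.g. if the last frame overshot (rc_1_frame == -1) and the one before
+  // undershot (rc_2_frame == 1), q is pinned between those two frames' Qs.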
return q;
}
@@ -557,18 +589,23 @@ static int calc_active_worst_quality_one_pass_cbr(const VP9_COMP *cpi) {
const VP9_COMMON *const cm = &cpi->common;
const RATE_CONTROL *rc = &cpi->rc;
// Buffer level below which we push active_worst to worst_quality.
- int64_t critical_level = rc->optimal_buffer_level >> 2;
+ int64_t critical_level = rc->optimal_buffer_level >> 3;
int64_t buff_lvl_step = 0;
int adjustment = 0;
int active_worst_quality;
+ int ambient_qp;
if (cm->frame_type == KEY_FRAME)
- return rc->worst_quality * 4 / 5;
- if (cm->current_video_frame > 1)
- active_worst_quality = MIN(rc->worst_quality,
- rc->avg_frame_qindex[INTER_FRAME] * 5 / 4);
- else
- active_worst_quality = MIN(rc->worst_quality,
- rc->avg_frame_qindex[KEY_FRAME] * 3 / 2);
+ return rc->worst_quality;
+  // For ambient_qp we use the minimum of avg_frame_qindex[KEY_FRAME] and
+  // avg_frame_qindex[INTER_FRAME] for the first few frames following a key
+  // frame. Both are initialized to worst_quality and updated with a
+  // (3/4, 1/4) running average in postencode_update, so for the first few
+  // frames after a key frame, that key frame's QP is weighted into the
+  // active_worst_quality setting.
+ ambient_qp = (cm->current_video_frame < 5) ?
+ MIN(rc->avg_frame_qindex[INTER_FRAME], rc->avg_frame_qindex[KEY_FRAME]) :
+ rc->avg_frame_qindex[INTER_FRAME];
+ active_worst_quality = MIN(rc->worst_quality,
+ ambient_qp * 5 / 4);
if (rc->buffer_level > rc->optimal_buffer_level) {
// Adjust down.
// Maximum limit for down adjustment, ~30%.
@@ -586,12 +623,11 @@ static int calc_active_worst_quality_one_pass_cbr(const VP9_COMP *cpi) {
if (critical_level) {
buff_lvl_step = (rc->optimal_buffer_level - critical_level);
if (buff_lvl_step) {
- adjustment =
- (int)((rc->worst_quality - rc->avg_frame_qindex[INTER_FRAME]) *
- (rc->optimal_buffer_level - rc->buffer_level) /
- buff_lvl_step);
+ adjustment = (int)((rc->worst_quality - ambient_qp) *
+ (rc->optimal_buffer_level - rc->buffer_level) /
+ buff_lvl_step);
}
- active_worst_quality = rc->avg_frame_qindex[INTER_FRAME] + adjustment;
+ active_worst_quality = ambient_qp + adjustment;
}
} else {
// Set to worst_quality if buffer is below critical level.
@@ -720,7 +756,7 @@ static int rc_pick_q_and_bounds_one_pass_cbr(const VP9_COMP *cpi,
static int get_active_cq_level(const RATE_CONTROL *rc,
const VP9EncoderConfig *const oxcf) {
- static const double cq_adjust_threshold = 0.5;
+ static const double cq_adjust_threshold = 0.1;
int active_cq_level = oxcf->cq_level;
if (oxcf->rc_mode == VPX_CQ &&
rc->total_target_bits > 0) {
@@ -883,6 +919,23 @@ static int rc_pick_q_and_bounds_one_pass_vbr(const VP9_COMP *cpi,
return q;
}
+int vp9_frame_type_qdelta(const VP9_COMP *cpi, int rf_level, int q) {
+ static const double rate_factor_deltas[RATE_FACTOR_LEVELS] = {
+ 1.00, // INTER_NORMAL
+ 1.00, // INTER_HIGH
+ 1.50, // GF_ARF_LOW
+ 1.75, // GF_ARF_STD
+ 2.00, // KF_STD
+ };
+ static const FRAME_TYPE frame_type[RATE_FACTOR_LEVELS] =
+ {INTER_FRAME, INTER_FRAME, INTER_FRAME, INTER_FRAME, KEY_FRAME};
+ const VP9_COMMON *const cm = &cpi->common;
+ int qdelta = vp9_compute_qdelta_by_rate(&cpi->rc, frame_type[rf_level],
+ q, rate_factor_deltas[rf_level],
+ cm->bit_depth);
+ return qdelta;
+}
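+// For example, rf_level == KF_STD (rate factor 2.0) yields a qdelta that
+// budgets the frame roughly twice the bits-per-mb implied by q alone.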
+
#define STATIC_MOTION_THRESH 95
static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi,
int *bottom_index,
@@ -890,6 +943,7 @@ static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi,
const VP9_COMMON *const cm = &cpi->common;
const RATE_CONTROL *const rc = &cpi->rc;
const VP9EncoderConfig *const oxcf = &cpi->oxcf;
+ const GF_GROUP *gf_group = &cpi->twopass.gf_group;
const int cq_level = get_active_cq_level(rc, oxcf);
int active_best_quality;
int active_worst_quality = cpi->twopass.active_worst_quality;
@@ -971,7 +1025,12 @@ static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi,
if (!cpi->refresh_alt_ref_frame) {
active_best_quality = cq_level;
} else {
- active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth);
+ active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth);
+
+ // Modify best quality for second level arfs. For mode VPX_Q this
+ // becomes the baseline frame q.
+ if (gf_group->rf_level[gf_group->index] == GF_ARF_LOW)
+ active_best_quality = (active_best_quality + cq_level + 1) / 2;
}
} else {
active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth);
@@ -991,9 +1050,9 @@ static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi,
}
}
- // Extenstion to max or min Q if undershoot or overshoot is outside
+ // Extension to max or min Q if undershoot or overshoot is outside
// the permitted range.
- if ((cpi->oxcf.rc_mode == VPX_VBR) &&
+ if ((cpi->oxcf.rc_mode != VPX_Q) &&
(cpi->twopass.gf_zeromotion_pct < VLOW_MOTION_THRESHOLD)) {
if (frame_is_intra_only(cm) ||
(!rc->is_src_frame_alt_ref &&
@@ -1012,25 +1071,21 @@ static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi,
if (!((frame_is_intra_only(cm) || vp9_is_upper_layer_key_frame(cpi))) ||
!rc->this_key_frame_forced ||
(cpi->twopass.last_kfgroup_zeromotion_pct < STATIC_MOTION_THRESH)) {
- const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
- const double rate_factor_deltas[RATE_FACTOR_LEVELS] = {
- 1.00, // INTER_NORMAL
- 1.00, // INTER_HIGH
- 1.50, // GF_ARF_LOW
- 1.75, // GF_ARF_STD
- 2.00, // KF_STD
- };
- const double rate_factor =
- rate_factor_deltas[gf_group->rf_level[gf_group->index]];
- int qdelta = vp9_compute_qdelta_by_rate(&cpi->rc, cm->frame_type,
- active_worst_quality, rate_factor,
- cm->bit_depth);
- active_worst_quality = active_worst_quality + qdelta;
- active_worst_quality = MAX(active_worst_quality, active_best_quality);
+ int qdelta = vp9_frame_type_qdelta(cpi, gf_group->rf_level[gf_group->index],
+ active_worst_quality);
+ active_worst_quality = MAX(active_worst_quality + qdelta,
+ active_best_quality);
}
#endif
- // Clip the active best and worst quality values to limits.
+ // Modify active_best_quality for downscaled normal frames.
+ if (rc->frame_size_selector != UNSCALED && !frame_is_kf_gf_arf(cpi)) {
+ int qdelta = vp9_compute_qdelta_by_rate(rc, cm->frame_type,
+ active_best_quality, 2.0,
+ cm->bit_depth);
+ active_best_quality = MAX(active_best_quality + qdelta, rc->best_quality);
+ }
+
active_best_quality = clamp(active_best_quality,
rc->best_quality, rc->worst_quality);
active_worst_quality = clamp(active_worst_quality,
@@ -1117,6 +1172,12 @@ void vp9_rc_set_frame_target(VP9_COMP *cpi, int target) {
rc->this_frame_target = target;
+ // Modify frame size target when down-scaling.
+ if (cpi->oxcf.resize_mode == RESIZE_DYNAMIC &&
+ rc->frame_size_selector != UNSCALED)
+ rc->this_frame_target = (int)(rc->this_frame_target
+ * rate_thresh_mult[rc->frame_size_selector]);
+
// Target rate per SB64 (including partial SB64s).
rc->sb64_target_rate = ((int64_t)rc->this_frame_target * 64 * 64) /
(cm->width * cm->height);
@@ -1169,13 +1230,15 @@ void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) {
RATE_CONTROL *const rc = &cpi->rc;
const int qindex = cm->base_qindex;
+ if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled) {
+ vp9_cyclic_refresh_postencode(cpi);
+ }
+
// Update rate control heuristics
rc->projected_frame_size = (int)(bytes_used << 3);
// Post encode loop adjustment of Q prediction.
- vp9_rc_update_rate_correction_factors(
- cpi, (cpi->sf.recode_loop >= ALLOW_RECODE_KFARFGF ||
- oxcf->rc_mode == VPX_CBR) ? 2 : 0);
+ vp9_rc_update_rate_correction_factors(cpi);
// Keep a record of last Q and ambient average Q.
if (cm->frame_type == KEY_FRAME) {
@@ -1205,7 +1268,9 @@ void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) {
// better than that already stored.
// This is used to help set quality in forced key frames to reduce popping.
if ((qindex < rc->last_boosted_qindex) ||
- (((cm->frame_type == KEY_FRAME) || cpi->refresh_alt_ref_frame ||
+ (cm->frame_type == KEY_FRAME) ||
+ (!rc->constrained_gf_group &&
+ (cpi->refresh_alt_ref_frame ||
(cpi->refresh_golden_frame && !rc->is_src_frame_alt_ref)))) {
rc->last_boosted_qindex = qindex;
}
@@ -1247,14 +1312,20 @@ void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) {
rc->frames_since_key++;
rc->frames_to_key--;
}
+
+ // Flag a pending resize for the next frame if the scale selector changed.
+ cpi->resize_pending =
+ rc->next_frame_size_selector != rc->frame_size_selector;
+ rc->frame_size_selector = rc->next_frame_size_selector;
}
void vp9_rc_postencode_update_drop_frame(VP9_COMP *cpi) {
// Update buffer level with zero size, update frame counters, and return.
update_buffer_level(cpi, 0);
- cpi->common.last_frame_type = cpi->common.frame_type;
cpi->rc.frames_since_key++;
cpi->rc.frames_to_key--;
+ cpi->rc.rc_2_frame = 0;
+ cpi->rc.rc_1_frame = 0;
}
// Use this macro to turn on/off use of alt-refs in one-pass mode.
@@ -1307,8 +1378,12 @@ void vp9_rc_get_one_pass_vbr_params(VP9_COMP *cpi) {
rc->baseline_gf_interval = DEFAULT_GF_INTERVAL;
rc->frames_till_gf_update_due = rc->baseline_gf_interval;
// NOTE: frames_till_gf_update_due must be <= frames_to_key.
- if (rc->frames_till_gf_update_due > rc->frames_to_key)
+ if (rc->frames_till_gf_update_due > rc->frames_to_key) {
rc->frames_till_gf_update_due = rc->frames_to_key;
+ rc->constrained_gf_group = 1;
+ } else {
+ rc->constrained_gf_group = 0;
+ }
cpi->refresh_golden_frame = 1;
rc->source_alt_ref_pending = USE_ALTREF_FOR_ONE_PASS;
rc->gfu_boost = DEFAULT_GF_BOOST;
@@ -1327,7 +1402,18 @@ static int calc_pframe_target_size_one_pass_cbr(const VP9_COMP *cpi) {
const int64_t diff = rc->optimal_buffer_level - rc->buffer_level;
const int64_t one_pct_bits = 1 + rc->optimal_buffer_level / 100;
int min_frame_target = MAX(rc->avg_frame_bandwidth >> 4, FRAME_OVERHEAD_BITS);
- int target = rc->avg_frame_bandwidth;
+ int target;
+
+ if (oxcf->gf_cbr_boost_pct) {
+ const int af_ratio_pct = oxcf->gf_cbr_boost_pct + 100;
+ target = cpi->refresh_golden_frame ?
+ (rc->avg_frame_bandwidth * rc->baseline_gf_interval * af_ratio_pct) /
+ (rc->baseline_gf_interval * 100 + af_ratio_pct - 100) :
+ (rc->avg_frame_bandwidth * rc->baseline_gf_interval * 100) /
+ (rc->baseline_gf_interval * 100 + af_ratio_pct - 100);
+ } else {
+ target = rc->avg_frame_bandwidth;
+ }
if (svc->number_temporal_layers > 1 &&
oxcf->rc_mode == VPX_CBR) {
// Note that for layers, avg_frame_bandwidth is the cumulative
@@ -1347,6 +1433,11 @@ static int calc_pframe_target_size_one_pass_cbr(const VP9_COMP *cpi) {
const int pct_high = (int)MIN(-diff / one_pct_bits, oxcf->over_shoot_pct);
target += (target * pct_high) / 200;
}
+ if (oxcf->rc_max_inter_bitrate_pct) {
+ const int max_rate = rc->avg_frame_bandwidth *
+ oxcf->rc_max_inter_bitrate_pct / 100;
+ target = MIN(target, max_rate);
+ }
return MAX(min_frame_target, target);
}
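// A worked sketch of the gf_cbr_boost_pct split above, under assumed
// numbers: with af_ratio_pct = gf_cbr_boost_pct + 100, the golden frame
// receives af_ratio_pct/100 times the bits of a normal inter frame while the
// group total stays at avg_frame_bandwidth * baseline_gf_interval.
#include <assert.h>
#include <stdio.h>

int main(void) {
  const int avg_bw = 20000;      // avg_frame_bandwidth (assumed)
  const int interval = 8;        // baseline_gf_interval (assumed)
  const int af_ratio_pct = 200;  // gf_cbr_boost_pct = 100 -> 2x boost
  const int denom = interval * 100 + af_ratio_pct - 100;
  const int target_gf = (avg_bw * interval * af_ratio_pct) / denom;
  const int target_p = (avg_bw * interval * 100) / denom;

  // One boosted golden frame plus (interval - 1) normal frames spends the
  // same budget as interval average frames (up to integer truncation).
  printf("golden: %d, inter: %d, group: %d vs budget: %d\n",
         target_gf, target_p, target_gf + (interval - 1) * target_p,
         avg_bw * interval);
  assert(target_gf / target_p == af_ratio_pct / 100);
  return 0;
}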
@@ -1416,6 +1507,12 @@ void vp9_rc_get_svc_params(VP9_COMP *cpi) {
target = calc_pframe_target_size_one_pass_cbr(cpi);
}
}
+
+ // Any update/change of global cyclic refresh parameters (amount/delta-qp)
+ // should be done here, before the frame qp is selected.
+ if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
+ vp9_cyclic_refresh_update_parameters(cpi);
+
vp9_rc_set_frame_target(cpi, target);
rc->frames_till_gf_update_due = INT_MAX;
rc->baseline_gf_interval = INT_MAX;
@@ -1436,15 +1533,33 @@ void vp9_rc_get_one_pass_cbr_params(VP9_COMP *cpi) {
rc->frames_to_key = cpi->oxcf.key_freq;
rc->kf_boost = DEFAULT_KF_BOOST;
rc->source_alt_ref_active = 0;
- target = calc_iframe_target_size_one_pass_cbr(cpi);
} else {
cm->frame_type = INTER_FRAME;
- target = calc_pframe_target_size_one_pass_cbr(cpi);
}
+ if (rc->frames_till_gf_update_due == 0) {
+ if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
+ vp9_cyclic_refresh_set_golden_update(cpi);
+ else
+ rc->baseline_gf_interval = DEFAULT_GF_INTERVAL;
+ rc->frames_till_gf_update_due = rc->baseline_gf_interval;
+ // NOTE: frames_till_gf_update_due must be <= frames_to_key.
+ if (rc->frames_till_gf_update_due > rc->frames_to_key)
+ rc->frames_till_gf_update_due = rc->frames_to_key;
+ cpi->refresh_golden_frame = 1;
+ rc->gfu_boost = DEFAULT_GF_BOOST;
+ }
+
+ // Any update/change of global cyclic refresh parameters (amount/delta-qp)
+ // should be done here, before the frame qp is selected.
+ if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
+ vp9_cyclic_refresh_update_parameters(cpi);
+
+ if (cm->frame_type == KEY_FRAME)
+ target = calc_iframe_target_size_one_pass_cbr(cpi);
+ else
+ target = calc_pframe_target_size_one_pass_cbr(cpi);
+
vp9_rc_set_frame_target(cpi, target);
- // Don't use gf_update by default in CBR mode.
- rc->frames_till_gf_update_due = INT_MAX;
- rc->baseline_gf_interval = INT_MAX;
}
int vp9_compute_qdelta(const RATE_CONTROL *rc, double qstart, double qtarget,
@@ -1485,19 +1600,30 @@ int vp9_compute_qdelta_by_rate(const RATE_CONTROL *rc, FRAME_TYPE frame_type,
// Convert the q target to an index
for (i = rc->best_quality; i < rc->worst_quality; ++i) {
- target_index = i;
- if (vp9_rc_bits_per_mb(frame_type, i, 1.0, bit_depth) <= target_bits_per_mb)
+ if (vp9_rc_bits_per_mb(frame_type, i, 1.0, bit_depth) <=
+ target_bits_per_mb) {
+ target_index = i;
break;
+ }
}
-
return target_index - qindex;
}
-void vp9_rc_set_gf_max_interval(const VP9_COMP *const cpi,
- RATE_CONTROL *const rc) {
+#define MIN_GF_INTERVAL 4
+#define MAX_GF_INTERVAL 16
+void vp9_rc_set_gf_interval_range(const VP9_COMP *const cpi,
+ RATE_CONTROL *const rc) {
const VP9EncoderConfig *const oxcf = &cpi->oxcf;
- // Set Maximum gf/arf interval
- rc->max_gf_interval = 16;
+
+ // Set a minimum interval.
+ rc->min_gf_interval =
+ MIN(MAX_GF_INTERVAL, MAX(MIN_GF_INTERVAL, (int)(cpi->framerate * 0.125)));
+
+ // Set Maximum gf/arf interval.
+ rc->max_gf_interval =
+ MIN(MAX_GF_INTERVAL, (int)(cpi->framerate * 0.75));
+ // Round up to next even number if odd.
+ rc->max_gf_interval += (rc->max_gf_interval & 0x01);
// Extended interval for genuinely static scenes
rc->static_scene_max_gf_interval = MAX_LAG_BUFFERS * 2;
@@ -1509,6 +1635,9 @@ void vp9_rc_set_gf_max_interval(const VP9_COMP *const cpi,
if (rc->max_gf_interval > rc->static_scene_max_gf_interval)
rc->max_gf_interval = rc->static_scene_max_gf_interval;
+
+ // Clamp min to max
+ rc->min_gf_interval = MIN(rc->min_gf_interval, rc->max_gf_interval);
}
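// A quick check of the interval rules above for a few frame rates, ignoring
// the static-scene extension (which only raises the cap). MIN_GF_INTERVAL
// and MAX_GF_INTERVAL mirror the values defined in the patch (4 and 16).
#include <stdio.h>

#define MINV(a, b) ((a) < (b) ? (a) : (b))
#define MAXV(a, b) ((a) > (b) ? (a) : (b))

int main(void) {
  const double rates[] = {12.0, 30.0, 60.0};
  int i;
  for (i = 0; i < 3; ++i) {
    const double fr = rates[i];
    int min_gf = MINV(16, MAXV(4, (int)(fr * 0.125)));
    int max_gf = MINV(16, (int)(fr * 0.75));
    max_gf += (max_gf & 1);         // round odd intervals up to even
    min_gf = MINV(min_gf, max_gf);  // clamp min to max
    printf("%.0f fps -> min %d, max %d\n", fr, min_gf, max_gf);
  }
  return 0;  // 12 fps -> min 4, max 10; 30 -> 4, 16; 60 -> 7, 16
}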
void vp9_rc_update_framerate(VP9_COMP *cpi) {
@@ -1535,5 +1664,45 @@ void vp9_rc_update_framerate(VP9_COMP *cpi) {
rc->max_frame_bandwidth = MAX(MAX((cm->MBs * MAX_MB_RATE), MAXRATE_1080P),
vbr_max_bits);
- vp9_rc_set_gf_max_interval(cpi, rc);
+ vp9_rc_set_gf_interval_range(cpi, rc);
+}
+
+#define VBR_PCT_ADJUSTMENT_LIMIT 50
+// For VBR: adjust the frame target based on over/undershoot from previous frames.
+static void vbr_rate_correction(VP9_COMP *cpi,
+ int *this_frame_target,
+ int64_t vbr_bits_off_target) {
+ int max_delta;
+ double position_factor = 1.0;
+
+ // How far through the clip are we? This ratio, in the range 0.0 - 1.0,
+ // damps the per-frame rate correction.
+ if (cpi->twopass.total_stats.count) {
+ position_factor = sqrt((double)cpi->common.current_video_frame /
+ cpi->twopass.total_stats.count);
+ }
+ max_delta = (int)(position_factor *
+ ((*this_frame_target * VBR_PCT_ADJUSTMENT_LIMIT) / 100));
+
+ // vbr_bits_off_target > 0 means we have extra bits to spend
+ if (vbr_bits_off_target > 0) {
+ *this_frame_target +=
+ (vbr_bits_off_target > max_delta) ? max_delta
+ : (int)vbr_bits_off_target;
+ } else {
+ *this_frame_target -=
+ (vbr_bits_off_target < -max_delta) ? max_delta
+ : (int)-vbr_bits_off_target;
+ }
+}
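// The damping above scales the correction by sqrt(progress through the
// clip), so early frames move the target gently and late frames may use the
// full +/-50% (VBR_PCT_ADJUSTMENT_LIMIT) swing. A sketch of max_delta at a
// few positions, assuming a 30000-bit frame target:
#include <math.h>
#include <stdio.h>

int main(void) {
  const int frame_target = 30000;
  const double progress[] = {0.0625, 0.25, 1.0};
  int i;
  for (i = 0; i < 3; ++i) {
    const double position_factor = sqrt(progress[i]);
    const int max_delta =
        (int)(position_factor * ((frame_target * 50) / 100));
    printf("progress %.4f -> max_delta %d bits\n", progress[i], max_delta);
  }
  return 0;  // 0.0625 -> 3750, 0.25 -> 7500, 1.00 -> 15000
}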
+
+void vp9_set_target_rate(VP9_COMP *cpi) {
+ RATE_CONTROL *const rc = &cpi->rc;
+ int target_rate = rc->base_frame_target;
+
+ // Correction to rate target based on prior over or under shoot.
+ if (cpi->oxcf.rc_mode == VPX_VBR || cpi->oxcf.rc_mode == VPX_CQ)
+ vbr_rate_correction(cpi, &target_rate, rc->vbr_bits_off_target);
+ vp9_rc_set_frame_target(cpi, target_rate);
}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ratectrl.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ratectrl.h
index bc74129e591..869f6e59e97 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ratectrl.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ratectrl.h
@@ -33,6 +33,27 @@ typedef enum {
RATE_FACTOR_LEVELS = 5
} RATE_FACTOR_LEVEL;
+// Internal frame scaling level.
+typedef enum {
+ UNSCALED = 0, // Frame is unscaled.
+ SCALE_STEP1 = 1, // First-level down-scaling.
+ FRAME_SCALE_STEPS
+} FRAME_SCALE_LEVEL;
+
+// Frame dimension multipliers w.r.t. the native frame size, in 1/16th
+// units, specified for the scale-up case.
+// e.g. 24 => 16/24 = 2/3 of the native size. The restriction to 1/16th
+// units is intended to match the capabilities of the normative scaling
+// filters, giving precedence to up-scaling accuracy.
+static const int frame_scale_factor[FRAME_SCALE_STEPS] = {16, 24};
+
+// Multiplier of the target rate to be used as threshold for triggering scaling.
+static const double rate_thresh_mult[FRAME_SCALE_STEPS] = {1.0, 2.0};
+
+// Scale-dependent Rate Correction Factor multipliers, compensating for the
+// greater number of bits per pixel generated in down-scaled frames.
+static const double rcf_mult[FRAME_SCALE_STEPS] = {1.0, 2.0};
+
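// The three tables above cooperate: frame_scale_factor gives the dimension
// ratio in 1/16th units, rate_thresh_mult scales the target-rate threshold
// that triggers a scale switch, and rcf_mult compensates the rate correction
// factor for the denser bits/pixel of down-scaled frames. A dimension sketch
// (the real encoder also aligns sizes; this ignores alignment):
#include <stdio.h>

int main(void) {
  static const int frame_scale_factor[2] = {16, 24};  // UNSCALED, SCALE_STEP1
  const int native_w = 1280, native_h = 720;
  int level;
  for (level = 0; level < 2; ++level) {
    // Scale factor 24 means 16/24 = 2/3 of the native dimensions.
    const int w = native_w * 16 / frame_scale_factor[level];
    const int h = native_h * 16 / frame_scale_factor[level];
    printf("level %d: %dx%d\n", level, w, h);
  }
  return 0;  // level 0: 1280x720, level 1: 853x480
}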
typedef struct {
// Rate targetting variables
int base_frame_target; // A baseline frame target before adjustment
@@ -52,9 +73,11 @@ typedef struct {
int frames_since_golden;
int frames_till_gf_update_due;
+ int min_gf_interval;
int max_gf_interval;
int static_scene_max_gf_interval;
int baseline_gf_interval;
+ int constrained_gf_group;
int frames_to_key;
int frames_since_key;
int this_key_frame_forced;
@@ -99,7 +122,22 @@ typedef struct {
int64_t starting_buffer_level;
int64_t optimal_buffer_level;
int64_t maximum_buffer_size;
- // int active_best_quality;
+
+ // Rate control history for the last frame (1) and the frame before (2).
+ // -1: undershoot
+ //  1: overshoot
+ //  0: not initialized.
+ int rc_1_frame;
+ int rc_2_frame;
+ int q_1_frame;
+ int q_2_frame;
+
+ // Auto frame-scaling variables.
+ FRAME_SCALE_LEVEL frame_size_selector;
+ FRAME_SCALE_LEVEL next_frame_size_selector;
+ int frame_width[FRAME_SCALE_STEPS];
+ int frame_height[FRAME_SCALE_STEPS];
+ int rf_level_maxq[RATE_FACTOR_LEVELS];
} RATE_CONTROL;
struct VP9_COMP;
@@ -108,6 +146,10 @@ struct VP9EncoderConfig;
void vp9_rc_init(const struct VP9EncoderConfig *oxcf, int pass,
RATE_CONTROL *rc);
+int vp9_estimate_bits_at_q(FRAME_TYPE frame_kind, int q, int mbs,
+ double correction_factor,
+ vpx_bit_depth_t bit_depth);
+
double vp9_convert_qindex_to_q(int qindex, vpx_bit_depth_t bit_depth);
void vp9_rc_init_minq_luts();
@@ -148,7 +190,7 @@ void vp9_rc_postencode_update_drop_frame(struct VP9_COMP *cpi);
// Updates rate correction factors
// Changes only the rate correction factors in the rate control structure.
-void vp9_rc_update_rate_correction_factors(struct VP9_COMP *cpi, int damp_var);
+void vp9_rc_update_rate_correction_factors(struct VP9_COMP *cpi);
// Decide if we should drop this frame: For 1-pass CBR.
// Changes only the decimation count in the rate control structure
@@ -193,10 +235,14 @@ int vp9_compute_qdelta_by_rate(const RATE_CONTROL *rc, FRAME_TYPE frame_type,
int qindex, double rate_target_ratio,
vpx_bit_depth_t bit_depth);
+int vp9_frame_type_qdelta(const struct VP9_COMP *cpi, int rf_level, int q);
+
void vp9_rc_update_framerate(struct VP9_COMP *cpi);
-void vp9_rc_set_gf_max_interval(const struct VP9_COMP *const cpi,
- RATE_CONTROL *const rc);
+void vp9_rc_set_gf_interval_range(const struct VP9_COMP *const cpi,
+ RATE_CONTROL *const rc);
+
+void vp9_set_target_rate(struct VP9_COMP *cpi);
#ifdef __cplusplus
} // extern "C"
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_rd.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_rd.c
index 7f526fc4234..194001c51a2 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_rd.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_rd.c
@@ -65,7 +65,7 @@ static const uint8_t rd_thresh_block_size_factor[BLOCK_SIZES] = {
};
static void fill_mode_costs(VP9_COMP *cpi) {
- const FRAME_CONTEXT *const fc = &cpi->common.fc;
+ const FRAME_CONTEXT *const fc = cpi->common.fc;
int i, j;
for (i = 0; i < INTRA_MODES; ++i)
@@ -204,27 +204,28 @@ static int compute_rd_thresh_factor(int qindex, vpx_bit_depth_t bit_depth) {
return MAX((int)(pow(q, RD_THRESH_POW) * 5.12), 8);
}
-void vp9_initialize_me_consts(VP9_COMP *cpi, int qindex) {
+void vp9_initialize_me_consts(VP9_COMP *cpi, MACROBLOCK *x, int qindex) {
#if CONFIG_VP9_HIGHBITDEPTH
switch (cpi->common.bit_depth) {
case VPX_BITS_8:
- cpi->mb.sadperbit16 = sad_per_bit16lut_8[qindex];
- cpi->mb.sadperbit4 = sad_per_bit4lut_8[qindex];
+ x->sadperbit16 = sad_per_bit16lut_8[qindex];
+ x->sadperbit4 = sad_per_bit4lut_8[qindex];
break;
case VPX_BITS_10:
- cpi->mb.sadperbit16 = sad_per_bit16lut_10[qindex];
- cpi->mb.sadperbit4 = sad_per_bit4lut_10[qindex];
+ x->sadperbit16 = sad_per_bit16lut_10[qindex];
+ x->sadperbit4 = sad_per_bit4lut_10[qindex];
break;
case VPX_BITS_12:
- cpi->mb.sadperbit16 = sad_per_bit16lut_12[qindex];
- cpi->mb.sadperbit4 = sad_per_bit4lut_12[qindex];
+ x->sadperbit16 = sad_per_bit16lut_12[qindex];
+ x->sadperbit4 = sad_per_bit4lut_12[qindex];
break;
default:
assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12");
}
#else
- cpi->mb.sadperbit16 = sad_per_bit16lut_8[qindex];
- cpi->mb.sadperbit4 = sad_per_bit4lut_8[qindex];
+ (void)cpi;
+ x->sadperbit16 = sad_per_bit16lut_8[qindex];
+ x->sadperbit4 = sad_per_bit4lut_8[qindex];
#endif // CONFIG_VP9_HIGHBITDEPTH
}
@@ -262,7 +263,7 @@ static void set_block_thresholds(const VP9_COMMON *cm, RD_OPT *rd) {
void vp9_initialize_rd_consts(VP9_COMP *cpi) {
VP9_COMMON *const cm = &cpi->common;
- MACROBLOCK *const x = &cpi->mb;
+ MACROBLOCK *const x = &cpi->td.mb;
RD_OPT *const rd = &cpi->rd;
int i;
@@ -279,9 +280,11 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi) {
set_block_thresholds(cm, rd);
- if (!cpi->sf.use_nonrd_pick_mode || cm->frame_type == KEY_FRAME) {
- fill_token_costs(x->token_costs, cm->fc.coef_probs);
+ if (!cpi->sf.use_nonrd_pick_mode || cm->frame_type == KEY_FRAME)
+ fill_token_costs(x->token_costs, cm->fc->coef_probs);
+ if (cpi->sf.partition_search_type != VAR_BASED_PARTITION ||
+ cm->frame_type == KEY_FRAME) {
for (i = 0; i < PARTITION_CONTEXTS; ++i)
vp9_cost_tokens(cpi->partition_cost[i], get_partition_probs(cm, i),
vp9_partition_tree);
@@ -295,11 +298,11 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi) {
vp9_build_nmv_cost_table(x->nmvjointcost,
cm->allow_high_precision_mv ? x->nmvcost_hp
: x->nmvcost,
- &cm->fc.nmvc, cm->allow_high_precision_mv);
+ &cm->fc->nmvc, cm->allow_high_precision_mv);
for (i = 0; i < INTER_MODE_CONTEXTS; ++i)
vp9_cost_tokens((int *)cpi->inter_mode_cost[i],
- cm->fc.inter_mode_probs[i], vp9_inter_mode_tree);
+ cm->fc->inter_mode_probs[i], vp9_inter_mode_tree);
}
}
}
@@ -379,7 +382,7 @@ static void model_rd_norm(int xsq_q10, int *r_q10, int *d_q10) {
*d_q10 = (dist_tab_q10[xq] * b_q10 + dist_tab_q10[xq + 1] * a_q10) >> 10;
}
-void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n,
+void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n_log2,
unsigned int qstep, int *rate,
int64_t *dist) {
// This function models the rate and distortion for a Laplacian
@@ -395,10 +398,10 @@ void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n,
int d_q10, r_q10;
static const uint32_t MAX_XSQ_Q10 = 245727;
const uint64_t xsq_q10_64 =
- ((((uint64_t)qstep * qstep * n) << 10) + (var >> 1)) / var;
+ (((uint64_t)qstep * qstep << (n_log2 + 10)) + (var >> 1)) / var;
const int xsq_q10 = (int)MIN(xsq_q10_64, MAX_XSQ_Q10);
model_rd_norm(xsq_q10, &r_q10, &d_q10);
- *rate = (n * r_q10 + 2) >> 2;
+ *rate = ((r_q10 << n_log2) + 2) >> 2;
*dist = (var * (int64_t)d_q10 + 512) >> 10;
}
}
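// The signature change above replaces the pixel count n with its log2, so
// the multiplications become shifts; the results are bit-exact whenever n is
// a power of two, which block pixel counts always are. A quick equivalence
// check with assumed sample values:
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
  const unsigned int qstep = 40, var = 9000, n_log2 = 8;  // 16x16 = 256 px
  const unsigned int n = 1u << n_log2;
  const int r_q10 = 700;

  const uint64_t old_xsq =
      ((((uint64_t)qstep * qstep * n) << 10) + (var >> 1)) / var;
  const uint64_t new_xsq =
      (((uint64_t)qstep * qstep << (n_log2 + 10)) + (var >> 1)) / var;
  const int old_rate = (n * r_q10 + 2) >> 2;
  const int new_rate = (((unsigned)r_q10 << n_log2) + 2) >> 2;

  assert(old_xsq == new_xsq && old_rate == new_rate);
  printf("xsq_q10 = %llu, rate = %d\n",
         (unsigned long long)old_xsq, old_rate);
  return 0;
}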
@@ -416,8 +419,8 @@ void vp9_get_entropy_contexts(BLOCK_SIZE bsize, TX_SIZE tx_size,
int i;
switch (tx_size) {
case TX_4X4:
- vpx_memcpy(t_above, above, sizeof(ENTROPY_CONTEXT) * num_4x4_w);
- vpx_memcpy(t_left, left, sizeof(ENTROPY_CONTEXT) * num_4x4_h);
+ memcpy(t_above, above, sizeof(ENTROPY_CONTEXT) * num_4x4_w);
+ memcpy(t_left, left, sizeof(ENTROPY_CONTEXT) * num_4x4_h);
break;
case TX_8X8:
for (i = 0; i < num_4x4_w; i += 2)
@@ -447,41 +450,47 @@ void vp9_mv_pred(VP9_COMP *cpi, MACROBLOCK *x,
uint8_t *ref_y_buffer, int ref_y_stride,
int ref_frame, BLOCK_SIZE block_size) {
MACROBLOCKD *xd = &x->e_mbd;
- MB_MODE_INFO *mbmi = &xd->mi[0].src_mi->mbmi;
+ MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
int i;
int zero_seen = 0;
int best_index = 0;
int best_sad = INT_MAX;
int this_sad = INT_MAX;
int max_mv = 0;
+ int near_same_nearest;
uint8_t *src_y_ptr = x->plane[0].src.buf;
uint8_t *ref_y_ptr;
const int num_mv_refs = MAX_MV_REF_CANDIDATES +
(cpi->sf.adaptive_motion_search &&
- block_size < cpi->sf.max_partition_size);
+ block_size < x->max_partition_size);
MV pred_mv[3];
pred_mv[0] = mbmi->ref_mvs[ref_frame][0].as_mv;
pred_mv[1] = mbmi->ref_mvs[ref_frame][1].as_mv;
pred_mv[2] = x->pred_mv[ref_frame];
+ assert(num_mv_refs <= (int)(sizeof(pred_mv) / sizeof(pred_mv[0])));
+ near_same_nearest =
+ mbmi->ref_mvs[ref_frame][0].as_int == mbmi->ref_mvs[ref_frame][1].as_int;
// Get the sad for each candidate reference mv.
for (i = 0; i < num_mv_refs; ++i) {
const MV *this_mv = &pred_mv[i];
+ int fp_row, fp_col;
- max_mv = MAX(max_mv, MAX(abs(this_mv->row), abs(this_mv->col)) >> 3);
- if (is_zero_mv(this_mv) && zero_seen)
+ if (i == 1 && near_same_nearest)
continue;
+ fp_row = (this_mv->row + 3 + (this_mv->row >= 0)) >> 3;
+ fp_col = (this_mv->col + 3 + (this_mv->col >= 0)) >> 3;
+ max_mv = MAX(max_mv, MAX(abs(this_mv->row), abs(this_mv->col)) >> 3);
- zero_seen |= is_zero_mv(this_mv);
-
- ref_y_ptr =
- &ref_y_buffer[ref_y_stride * (this_mv->row >> 3) + (this_mv->col >> 3)];
+ if (fp_row == 0 && fp_col == 0 && zero_seen)
+ continue;
+ zero_seen |= (fp_row == 0 && fp_col == 0);
+ ref_y_ptr = &ref_y_buffer[ref_y_stride * fp_row + fp_col];
// Find sad for current vector.
this_sad = cpi->fn_ptr[block_size].sdf(src_y_ptr, x->plane[0].src.stride,
ref_y_ptr, ref_y_stride);
-
// Note if it is the best so far.
if (this_sad < best_sad) {
best_sad = this_sad;
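// The fp_row/fp_col expressions above round a 1/8-pel motion component to
// the nearest full pel, ties away from zero: the arithmetic right shift of
// a negative value floors, and the +(v >= 0) term offsets that for
// non-negative inputs. A small value table:
#include <stdio.h>

static int round_to_fullpel(int v_eighth_pel) {
  return (v_eighth_pel + 3 + (v_eighth_pel >= 0)) >> 3;
}

int main(void) {
  const int v[] = {-12, -5, -4, -3, 0, 3, 4, 5, 12};
  int i;
  for (i = 0; i < 9; ++i)
    printf("%d/8 pel -> %d\n", v[i], round_to_fullpel(v[i]));
  return 0;
  // -12 -> -2 (ties away from zero), -5 -> -1, -4 -> -1, -3 -> 0,
  //  0 -> 0, 3 -> 0, 4 -> 1 (ties away), 5 -> 1, 12 -> 2
}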
@@ -516,17 +525,32 @@ void vp9_setup_pred_block(const MACROBLOCKD *xd,
}
}
-const YV12_BUFFER_CONFIG *vp9_get_scaled_ref_frame(const VP9_COMP *cpi,
- int ref_frame) {
+int vp9_raster_block_offset(BLOCK_SIZE plane_bsize,
+ int raster_block, int stride) {
+ const int bw = b_width_log2_lookup[plane_bsize];
+ const int y = 4 * (raster_block >> bw);
+ const int x = 4 * (raster_block & ((1 << bw) - 1));
+ return y * stride + x;
+}
+
+int16_t* vp9_raster_block_offset_int16(BLOCK_SIZE plane_bsize,
+ int raster_block, int16_t *base) {
+ const int stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
+ return base + vp9_raster_block_offset(plane_bsize, raster_block, stride);
+}
+
+YV12_BUFFER_CONFIG *vp9_get_scaled_ref_frame(const VP9_COMP *cpi,
+ int ref_frame) {
const VP9_COMMON *const cm = &cpi->common;
- const int ref_idx = cm->ref_frame_map[get_ref_frame_idx(cpi, ref_frame)];
const int scaled_idx = cpi->scaled_ref_idx[ref_frame - 1];
- return (scaled_idx != ref_idx) ? &cm->frame_bufs[scaled_idx].buf : NULL;
+ const int ref_idx = get_ref_frame_buf_idx(cpi, ref_frame);
+ return
+ (scaled_idx != ref_idx && scaled_idx != INVALID_IDX) ?
+ &cm->buffer_pool->frame_bufs[scaled_idx].buf : NULL;
}
-int vp9_get_switchable_rate(const VP9_COMP *cpi) {
- const MACROBLOCKD *const xd = &cpi->mb.e_mbd;
- const MB_MODE_INFO *const mbmi = &xd->mi[0].src_mi->mbmi;
+int vp9_get_switchable_rate(const VP9_COMP *cpi, const MACROBLOCKD *const xd) {
+ const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
const int ctx = vp9_get_pred_context_switchable_interp(xd);
return SWITCHABLE_INTERP_RATE_FACTOR *
cpi->switchable_interp_costs[ctx][mbmi->interp_filter];
@@ -557,10 +581,6 @@ void vp9_set_rd_speed_thresholds(VP9_COMP *cpi) {
rd->thresh_mult[THR_NEWA] += 1000;
rd->thresh_mult[THR_NEWG] += 1000;
- // Adjust threshold only in real time mode, which only uses last
- // reference frame.
- rd->thresh_mult[THR_NEWMV] += sf->elevate_newmv_thresh;
-
rd->thresh_mult[THR_NEARMV] += 1000;
rd->thresh_mult[THR_NEARA] += 1000;
rd->thresh_mult[THR_COMP_NEARESTLA] += 1000;
@@ -591,24 +611,34 @@ void vp9_set_rd_speed_thresholds(VP9_COMP *cpi) {
}
void vp9_set_rd_speed_thresholds_sub8x8(VP9_COMP *cpi) {
- const SPEED_FEATURES *const sf = &cpi->sf;
+ static const int thresh_mult[2][MAX_REFS] =
+ {{2500, 2500, 2500, 4500, 4500, 2500},
+ {2000, 2000, 2000, 4000, 4000, 2000}};
RD_OPT *const rd = &cpi->rd;
- int i;
-
- for (i = 0; i < MAX_REFS; ++i)
- rd->thresh_mult_sub8x8[i] = cpi->oxcf.mode == BEST ? -500 : 0;
-
- rd->thresh_mult_sub8x8[THR_LAST] += 2500;
- rd->thresh_mult_sub8x8[THR_GOLD] += 2500;
- rd->thresh_mult_sub8x8[THR_ALTR] += 2500;
- rd->thresh_mult_sub8x8[THR_INTRA] += 2500;
- rd->thresh_mult_sub8x8[THR_COMP_LA] += 4500;
- rd->thresh_mult_sub8x8[THR_COMP_GA] += 4500;
+ const int idx = cpi->oxcf.mode == BEST;
+ memcpy(rd->thresh_mult_sub8x8, thresh_mult[idx], sizeof(thresh_mult[idx]));
+}
- // Check for masked out split cases.
- for (i = 0; i < MAX_REFS; ++i)
- if (sf->disable_split_mask & (1 << i))
- rd->thresh_mult_sub8x8[i] = INT_MAX;
+void vp9_update_rd_thresh_fact(int (*factor_buf)[MAX_MODES], int rd_thresh,
+ int bsize, int best_mode_index) {
+ if (rd_thresh > 0) {
+ const int top_mode = bsize < BLOCK_8X8 ? MAX_REFS : MAX_MODES;
+ int mode;
+ for (mode = 0; mode < top_mode; ++mode) {
+ const BLOCK_SIZE min_size = MAX(bsize - 1, BLOCK_4X4);
+ const BLOCK_SIZE max_size = MIN(bsize + 2, BLOCK_64X64);
+ BLOCK_SIZE bs;
+ for (bs = min_size; bs <= max_size; ++bs) {
+ int *const fact = &factor_buf[bs][mode];
+ if (mode == best_mode_index) {
+ *fact -= (*fact >> 4);
+ } else {
+ *fact = MIN(*fact + RD_THRESH_INC,
+ rd_thresh * RD_THRESH_MAX_FACT);
+ }
+ }
+ }
+ }
}
int vp9_get_intra_cost_penalty(int qindex, int qdelta,
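// A standalone sketch of the factor update in vp9_update_rd_thresh_fact
// above: the winning mode's factor decays multiplicatively (so it is pruned
// less often), while every other mode's factor creeps up by RD_THRESH_INC,
// pruning it more aggressively, until it saturates at
// rd_thresh * RD_THRESH_MAX_FACT.
#include <stdio.h>

#define RD_THRESH_MAX_FACT 64
#define RD_THRESH_INC 1
#define MINV(a, b) ((a) < (b) ? (a) : (b))

int main(void) {
  int fact_winner = 32, fact_loser = 32;
  const int rd_thresh = 1;  // cap becomes 64
  int i;
  for (i = 0; i < 20; ++i) {
    fact_winner -= fact_winner >> 4;  // ~ *15/16 per win
    fact_loser = MINV(fact_loser + RD_THRESH_INC,
                      rd_thresh * RD_THRESH_MAX_FACT);
  }
  printf("after 20 blocks: winner %d, loser %d\n", fact_winner, fact_loser);
  return 0;  // winner 15, loser 52
}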
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_rd.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_rd.h
index 1aa52663a60..4d247342b0a 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_rd.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_rd.h
@@ -36,6 +36,9 @@ extern "C" {
#define MAX_MODES 30
#define MAX_REFS 6
+#define RD_THRESH_MAX_FACT 64
+#define RD_THRESH_INC 1
+
// This enumerator type needs to be kept aligned with the mode order in
// const MODE_DEFINITION vp9_mode_order[MAX_MODES] used in the rd code.
typedef enum {
@@ -98,20 +101,12 @@ typedef struct RD_OPT {
int thresh_mult_sub8x8[MAX_REFS];
int threshes[MAX_SEGMENTS][BLOCK_SIZES][MAX_MODES];
- int thresh_freq_fact[BLOCK_SIZES][MAX_MODES];
-
- int mode_map[BLOCK_SIZES][MAX_MODES];
- int64_t comp_pred_diff[REFERENCE_MODES];
int64_t prediction_type_threshes[MAX_REF_FRAMES][REFERENCE_MODES];
- int64_t tx_select_diff[TX_MODES];
// TODO(agrange): can this overflow?
int tx_select_threshes[MAX_REF_FRAMES][TX_MODES];
- int64_t filter_diff[SWITCHABLE_FILTER_CONTEXTS];
int64_t filter_threshes[MAX_REF_FRAMES][SWITCHABLE_FILTER_CONTEXTS];
- int64_t filter_cache[SWITCHABLE_FILTER_CONTEXTS];
- int64_t mask_filter;
int RDMULT;
int RDDIV;
@@ -129,6 +124,7 @@ void vp9_rd_cost_reset(RD_COST *rd_cost);
void vp9_rd_cost_init(RD_COST *rd_cost);
struct TileInfo;
+struct TileDataEnc;
struct VP9_COMP;
struct macroblock;
@@ -136,16 +132,23 @@ int vp9_compute_rd_mult(const struct VP9_COMP *cpi, int qindex);
void vp9_initialize_rd_consts(struct VP9_COMP *cpi);
-void vp9_initialize_me_consts(struct VP9_COMP *cpi, int qindex);
+void vp9_initialize_me_consts(struct VP9_COMP *cpi, MACROBLOCK *x, int qindex);
void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n,
unsigned int qstep, int *rate,
int64_t *dist);
-int vp9_get_switchable_rate(const struct VP9_COMP *cpi);
+int vp9_get_switchable_rate(const struct VP9_COMP *cpi,
+ const MACROBLOCKD *const xd);
+
+int vp9_raster_block_offset(BLOCK_SIZE plane_bsize,
+ int raster_block, int stride);
-const YV12_BUFFER_CONFIG *vp9_get_scaled_ref_frame(const struct VP9_COMP *cpi,
- int ref_frame);
+int16_t* vp9_raster_block_offset_int16(BLOCK_SIZE plane_bsize,
+ int raster_block, int16_t *base);
+
+YV12_BUFFER_CONFIG *vp9_get_scaled_ref_frame(const struct VP9_COMP *cpi,
+ int ref_frame);
void vp9_init_me_luts();
@@ -158,6 +161,9 @@ void vp9_set_rd_speed_thresholds(struct VP9_COMP *cpi);
void vp9_set_rd_speed_thresholds_sub8x8(struct VP9_COMP *cpi);
+void vp9_update_rd_thresh_fact(int (*fact)[MAX_MODES], int rd_thresh,
+ int bsize, int best_mode_index);
+
static INLINE int rd_less_than_thresh(int64_t best_rd, int thresh,
int thresh_fact) {
return best_rd < ((int64_t)thresh * thresh_fact >> 5) || thresh == INT_MAX;
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_rdopt.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_rdopt.c
index eca8e588092..73825623748 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_rdopt.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_rdopt.c
@@ -37,9 +37,7 @@
#include "vp9/encoder/vp9_rd.h"
#include "vp9/encoder/vp9_rdopt.h"
#include "vp9/encoder/vp9_variance.h"
-
-#define RD_THRESH_MAX_FACT 64
-#define RD_THRESH_INC 1
+#include "vp9/encoder/vp9_aq_variance.h"
#define LAST_FRAME_MODE_MASK ((1 << GOLDEN_FRAME) | (1 << ALTREF_FRAME) | \
(1 << INTRA_FRAME))
@@ -51,6 +49,7 @@
#define SECOND_REF_FRAME_MASK ((1 << ALTREF_FRAME) | 0x01)
#define MIN_EARLY_TERM_INDEX 3
+#define NEW_MV_DISCOUNT_FACTOR 8
typedef struct {
PREDICTION_MODE mode;
@@ -78,6 +77,7 @@ struct rdcost_block_args {
const scan_order *so;
};
+#define LAST_NEW_MV_INDEX 6
static const MODE_DEFINITION vp9_mode_order[MAX_MODES] = {
{NEARESTMV, {LAST_FRAME, NONE}},
{NEARESTMV, {ALTREF_FRAME, NONE}},
@@ -129,19 +129,6 @@ static const REF_DEFINITION vp9_ref_order[MAX_REFS] = {
{{INTRA_FRAME, NONE}},
};
-static int raster_block_offset(BLOCK_SIZE plane_bsize,
- int raster_block, int stride) {
- const int bw = b_width_log2_lookup[plane_bsize];
- const int y = 4 * (raster_block >> bw);
- const int x = 4 * (raster_block & ((1 << bw) - 1));
- return y * stride + x;
-}
-static int16_t* raster_block_offset_int16(BLOCK_SIZE plane_bsize,
- int raster_block, int16_t *base) {
- const int stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
- return base + raster_block_offset(plane_bsize, raster_block, stride);
-}
-
static void swap_block_ptr(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
int m, int n, int min_plane, int max_plane) {
int i;
@@ -177,7 +164,7 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize,
int i;
int64_t rate_sum = 0;
int64_t dist_sum = 0;
- const int ref = xd->mi[0].src_mi->mbmi.ref_frame[0];
+ const int ref = xd->mi[0]->mbmi.ref_frame[0];
unsigned int sse;
unsigned int var = 0;
unsigned int sum_sse = 0;
@@ -268,15 +255,15 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize,
} else {
#if CONFIG_VP9_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- vp9_model_rd_from_var_lapndz(sum_sse, 1 << num_pels_log2_lookup[bs],
+ vp9_model_rd_from_var_lapndz(sum_sse, num_pels_log2_lookup[bs],
pd->dequant[1] >> (xd->bd - 5),
&rate, &dist);
} else {
- vp9_model_rd_from_var_lapndz(sum_sse, 1 << num_pels_log2_lookup[bs],
+ vp9_model_rd_from_var_lapndz(sum_sse, num_pels_log2_lookup[bs],
pd->dequant[1] >> 3, &rate, &dist);
}
#else
- vp9_model_rd_from_var_lapndz(sum_sse, 1 << num_pels_log2_lookup[bs],
+ vp9_model_rd_from_var_lapndz(sum_sse, num_pels_log2_lookup[bs],
pd->dequant[1] >> 3, &rate, &dist);
#endif // CONFIG_VP9_HIGHBITDEPTH
rate_sum += rate;
@@ -305,6 +292,18 @@ int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff,
return error;
}
+int64_t vp9_block_error_fp_c(const int16_t *coeff, const int16_t *dqcoeff,
+ int block_size) {
+ int i;
+ int64_t error = 0;
+
+ for (i = 0; i < block_size; i++) {
+ const int diff = coeff[i] - dqcoeff[i];
+ error += diff * diff;
+ }
+
+ return error;
+}
#if CONFIG_VP9_HIGHBITDEPTH
int64_t vp9_highbd_block_error_c(const tran_low_t *coeff,
@@ -341,14 +340,14 @@ static const int16_t band_counts[TX_SIZES][8] = {
{ 1, 2, 3, 4, 11, 256 - 21, 0 },
{ 1, 2, 3, 4, 11, 1024 - 21, 0 },
};
-static INLINE int cost_coeffs(MACROBLOCK *x,
- int plane, int block,
- ENTROPY_CONTEXT *A, ENTROPY_CONTEXT *L,
- TX_SIZE tx_size,
- const int16_t *scan, const int16_t *nb,
- int use_fast_coef_costing) {
+static int cost_coeffs(MACROBLOCK *x,
+ int plane, int block,
+ ENTROPY_CONTEXT *A, ENTROPY_CONTEXT *L,
+ TX_SIZE tx_size,
+ const int16_t *scan, const int16_t *nb,
+ int use_fast_coef_costing) {
MACROBLOCKD *const xd = &x->e_mbd;
- MB_MODE_INFO *mbmi = &xd->mi[0].src_mi->mbmi;
+ MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
const struct macroblock_plane *p = &x->plane[plane];
const struct macroblockd_plane *pd = &xd->plane[plane];
const PLANE_TYPE type = pd->plane_type;
@@ -360,6 +359,12 @@ static INLINE int cost_coeffs(MACROBLOCK *x,
uint8_t token_cache[32 * 32];
int pt = combine_entropy_contexts(*A, *L);
int c, cost;
+#if CONFIG_VP9_HIGHBITDEPTH
+ const int16_t *cat6_high_cost = vp9_get_high_cost_table(xd->bd);
+#else
+ const int16_t *cat6_high_cost = vp9_get_high_cost_table(8);
+#endif
+
// Check for consistency of tx_size with mode info
assert(type == PLANE_TYPE_Y ? mbmi->tx_size == tx_size
: get_uv_tx_size(mbmi, pd) == tx_size);
@@ -373,23 +378,29 @@ static INLINE int cost_coeffs(MACROBLOCK *x,
// dc token
int v = qcoeff[0];
- int prev_t = vp9_dct_value_tokens_ptr[v].token;
- cost = (*token_costs)[0][pt][prev_t] + vp9_dct_value_cost_ptr[v];
+ int16_t prev_t;
+ EXTRABIT e;
+ vp9_get_token_extra(v, &prev_t, &e);
+ cost = (*token_costs)[0][pt][prev_t] +
+ vp9_get_cost(prev_t, e, cat6_high_cost);
+
token_cache[0] = vp9_pt_energy_class[prev_t];
++token_costs;
// ac tokens
for (c = 1; c < eob; c++) {
const int rc = scan[c];
- int t;
+ int16_t t;
v = qcoeff[rc];
- t = vp9_dct_value_tokens_ptr[v].token;
+ vp9_get_token_extra(v, &t, &e);
if (use_fast_coef_costing) {
- cost += (*token_costs)[!prev_t][!prev_t][t] + vp9_dct_value_cost_ptr[v];
+ cost += (*token_costs)[!prev_t][!prev_t][t] +
+ vp9_get_cost(t, e, cat6_high_cost);
} else {
pt = get_coef_context(nb, token_cache, c);
- cost += (*token_costs)[!prev_t][pt][t] + vp9_dct_value_cost_ptr[v];
+ cost += (*token_costs)[!prev_t][pt][t] +
+ vp9_get_cost(t, e, cat6_high_cost);
token_cache[rc] = vp9_pt_energy_class[t];
}
prev_t = t;
@@ -441,7 +452,7 @@ static void dist_block(int plane, int block, TX_SIZE tx_size,
#endif // CONFIG_VP9_HIGHBITDEPTH
args->sse = this_sse >> shift;
- if (x->skip_encode && !is_inter_block(&xd->mi[0].src_mi->mbmi)) {
+ if (x->skip_encode && !is_inter_block(&xd->mi[0]->mbmi)) {
// TODO(jingning): tune the model to better capture the distortion.
int64_t p = (pd->dequant[1] * pd->dequant[1] *
(1 << ss_txfrm_size)) >> (shift + 2);
@@ -471,14 +482,15 @@ static void block_rd_txfm(int plane, int block, BLOCK_SIZE plane_bsize,
struct rdcost_block_args *args = arg;
MACROBLOCK *const x = args->x;
MACROBLOCKD *const xd = &x->e_mbd;
- MB_MODE_INFO *const mbmi = &xd->mi[0].src_mi->mbmi;
+ MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
int64_t rd1, rd2, rd;
if (args->skip)
return;
if (!is_inter_block(mbmi)) {
- vp9_encode_block_intra(x, plane, block, plane_bsize, tx_size, &mbmi->skip);
+ struct encode_b_args arg = {x, NULL, &mbmi->skip};
+ vp9_encode_block_intra(plane, block, plane_bsize, tx_size, &arg);
#if CONFIG_VP9_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
dist_block(plane, block, tx_size, args, xd->bd);
@@ -509,8 +521,9 @@ static void block_rd_txfm(int plane, int block, BLOCK_SIZE plane_bsize,
args->sse = x->bsse[(plane << 2) + (block >> (tx_size << 1))] << 4;
args->dist = args->sse;
if (x->plane[plane].eobs[block]) {
- int64_t dc_correct = coeff[0] * coeff[0] -
- (coeff[0] - dqcoeff[0]) * (coeff[0] - dqcoeff[0]);
+ const int64_t orig_sse = (int64_t)coeff[0] * coeff[0];
+ const int64_t resd_sse = coeff[0] - dqcoeff[0];
+ int64_t dc_correct = orig_sse - resd_sse * resd_sse;
#if CONFIG_VP9_HIGHBITDEPTH
dc_correct >>= ((xd->bd - 8) * 2);
#endif
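// dc_correct above estimates the distortion the coded DC coefficient repairs
// when the rest of the block is skipped: coeff[0]^2 is the error of dropping
// DC entirely, (coeff[0] - dqcoeff[0])^2 the error remaining after coding
// it. The subtraction from the skip-path SSE happens just past this hunk; a
// numeric sketch with assumed values:
#include <stdio.h>

int main(void) {
  const int coeff0 = 37, dqcoeff0 = 36;  // original and dequantized DC
  const long long orig_sse = (long long)coeff0 * coeff0;  // 1369
  const long long resd = coeff0 - dqcoeff0;               // 1
  const long long dc_correct = orig_sse - resd * resd;    // 1368
  const long long dist_skip_all = 2000;  // assumed SSE with all coeffs skipped
  printf("dist = %lld\n", dist_skip_all - dc_correct);  // 632
  return 0;
}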
@@ -575,7 +588,7 @@ static void txfm_rd_in_plane(MACROBLOCK *x,
args.use_fast_coef_costing = use_fast_coef_casting;
if (plane == 0)
- xd->mi[0].src_mi->mbmi.tx_size = tx_size;
+ xd->mi[0]->mbmi.tx_size = tx_size;
vp9_get_entropy_contexts(bsize, tx_size, pd, args.t_above, args.t_left);
@@ -605,7 +618,7 @@ static void choose_largest_tx_size(VP9_COMP *cpi, MACROBLOCK *x,
VP9_COMMON *const cm = &cpi->common;
const TX_SIZE largest_tx_size = tx_mode_to_biggest_tx_size[cm->tx_mode];
MACROBLOCKD *const xd = &x->e_mbd;
- MB_MODE_INFO *const mbmi = &xd->mi[0].src_mi->mbmi;
+ MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
mbmi->tx_size = MIN(max_tx_size, largest_tx_size);
@@ -625,7 +638,7 @@ static void choose_tx_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x,
const TX_SIZE max_tx_size = max_txsize_lookup[bs];
VP9_COMMON *const cm = &cpi->common;
MACROBLOCKD *const xd = &x->e_mbd;
- MB_MODE_INFO *const mbmi = &xd->mi[0].src_mi->mbmi;
+ MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
vp9_prob skip_prob = vp9_get_skip_prob(cm, xd);
int r[TX_SIZES][2], s[TX_SIZES];
int64_t d[TX_SIZES], sse[TX_SIZES];
@@ -639,7 +652,7 @@ static void choose_tx_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x,
int64_t best_rd = INT64_MAX;
TX_SIZE best_tx = max_tx_size;
- const vp9_prob *tx_probs = get_tx_probs2(max_tx_size, xd, &cm->fc.tx_probs);
+ const vp9_prob *tx_probs = get_tx_probs2(max_tx_size, xd, &cm->fc->tx_probs);
assert(skip_prob > 0);
s0 = vp9_cost_bit(skip_prob, 0);
s1 = vp9_cost_bit(skip_prob, 1);
@@ -712,10 +725,10 @@ static void super_block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
int64_t sse;
int64_t *ret_sse = psse ? psse : &sse;
- assert(bs == xd->mi[0].src_mi->mbmi.sb_type);
+ assert(bs == xd->mi[0]->mbmi.sb_type);
if (cpi->sf.tx_size_search_method == USE_LARGESTALL || xd->lossless) {
- vpx_memset(txfm_cache, 0, TX_MODES * sizeof(int64_t));
+ memset(txfm_cache, 0, TX_MODES * sizeof(int64_t));
choose_largest_tx_size(cpi, x, rate, distortion, skip, ret_sse, ref_best_rd,
bs);
} else {
@@ -760,10 +773,10 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
struct macroblockd_plane *pd = &xd->plane[0];
const int src_stride = p->src.stride;
const int dst_stride = pd->dst.stride;
- const uint8_t *src_init = &p->src.buf[raster_block_offset(BLOCK_8X8, ib,
- src_stride)];
- uint8_t *dst_init = &pd->dst.buf[raster_block_offset(BLOCK_8X8, ib,
- dst_stride)];
+ const uint8_t *src_init = &p->src.buf[vp9_raster_block_offset(BLOCK_8X8, ib,
+ src_stride)];
+ uint8_t *dst_init = &pd->dst.buf[vp9_raster_block_offset(BLOCK_8X8, ib,
+ dst_stride)];
ENTROPY_CONTEXT ta[2], tempa[2];
ENTROPY_CONTEXT tl[2], templ[2];
@@ -777,9 +790,9 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
assert(ib < 4);
- vpx_memcpy(ta, a, sizeof(ta));
- vpx_memcpy(tl, l, sizeof(tl));
- xd->mi[0].src_mi->mbmi.tx_size = TX_4X4;
+ memcpy(ta, a, sizeof(ta));
+ memcpy(tl, l, sizeof(tl));
+ xd->mi[0]->mbmi.tx_size = TX_4X4;
#if CONFIG_VP9_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
@@ -799,18 +812,19 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
continue;
}
- vpx_memcpy(tempa, ta, sizeof(ta));
- vpx_memcpy(templ, tl, sizeof(tl));
+ memcpy(tempa, ta, sizeof(ta));
+ memcpy(templ, tl, sizeof(tl));
for (idy = 0; idy < num_4x4_blocks_high; ++idy) {
for (idx = 0; idx < num_4x4_blocks_wide; ++idx) {
const int block = ib + idy * 2 + idx;
const uint8_t *const src = &src_init[idx * 4 + idy * 4 * src_stride];
uint8_t *const dst = &dst_init[idx * 4 + idy * 4 * dst_stride];
- int16_t *const src_diff = raster_block_offset_int16(BLOCK_8X8, block,
- p->src_diff);
+ int16_t *const src_diff = vp9_raster_block_offset_int16(BLOCK_8X8,
+ block,
+ p->src_diff);
tran_low_t *const coeff = BLOCK_OFFSET(x->plane[0].coeff, block);
- xd->mi[0].src_mi->bmi[block].as_mode = mode;
+ xd->mi[0]->bmi[block].as_mode = mode;
vp9_predict_intra_block(xd, block, 1,
TX_4X4, mode,
x->skip_encode ? src : dst,
@@ -859,12 +873,12 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
*bestdistortion = distortion;
best_rd = this_rd;
*best_mode = mode;
- vpx_memcpy(a, tempa, sizeof(tempa));
- vpx_memcpy(l, templ, sizeof(templ));
+ memcpy(a, tempa, sizeof(tempa));
+ memcpy(l, templ, sizeof(templ));
for (idy = 0; idy < num_4x4_blocks_high * 4; ++idy) {
- vpx_memcpy(best_dst16 + idy * 8,
- CONVERT_TO_SHORTPTR(dst_init + idy * dst_stride),
- num_4x4_blocks_wide * 4 * sizeof(uint16_t));
+ memcpy(best_dst16 + idy * 8,
+ CONVERT_TO_SHORTPTR(dst_init + idy * dst_stride),
+ num_4x4_blocks_wide * 4 * sizeof(uint16_t));
}
}
next_highbd:
@@ -874,9 +888,9 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
return best_rd;
for (idy = 0; idy < num_4x4_blocks_high * 4; ++idy) {
- vpx_memcpy(CONVERT_TO_SHORTPTR(dst_init + idy * dst_stride),
- best_dst16 + idy * 8,
- num_4x4_blocks_wide * 4 * sizeof(uint16_t));
+ memcpy(CONVERT_TO_SHORTPTR(dst_init + idy * dst_stride),
+ best_dst16 + idy * 8,
+ num_4x4_blocks_wide * 4 * sizeof(uint16_t));
}
return best_rd;
@@ -899,18 +913,18 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
continue;
}
- vpx_memcpy(tempa, ta, sizeof(ta));
- vpx_memcpy(templ, tl, sizeof(tl));
+ memcpy(tempa, ta, sizeof(ta));
+ memcpy(templ, tl, sizeof(tl));
for (idy = 0; idy < num_4x4_blocks_high; ++idy) {
for (idx = 0; idx < num_4x4_blocks_wide; ++idx) {
const int block = ib + idy * 2 + idx;
const uint8_t *const src = &src_init[idx * 4 + idy * 4 * src_stride];
uint8_t *const dst = &dst_init[idx * 4 + idy * 4 * dst_stride];
- int16_t *const src_diff = raster_block_offset_int16(BLOCK_8X8, block,
- p->src_diff);
+ int16_t *const src_diff =
+ vp9_raster_block_offset_int16(BLOCK_8X8, block, p->src_diff);
tran_low_t *const coeff = BLOCK_OFFSET(x->plane[0].coeff, block);
- xd->mi[0].src_mi->bmi[block].as_mode = mode;
+ xd->mi[0]->bmi[block].as_mode = mode;
vp9_predict_intra_block(xd, block, 1,
TX_4X4, mode,
x->skip_encode ? src : dst,
@@ -957,11 +971,11 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
*bestdistortion = distortion;
best_rd = this_rd;
*best_mode = mode;
- vpx_memcpy(a, tempa, sizeof(tempa));
- vpx_memcpy(l, templ, sizeof(templ));
+ memcpy(a, tempa, sizeof(tempa));
+ memcpy(l, templ, sizeof(templ));
for (idy = 0; idy < num_4x4_blocks_high * 4; ++idy)
- vpx_memcpy(best_dst + idy * 8, dst_init + idy * dst_stride,
- num_4x4_blocks_wide * 4);
+ memcpy(best_dst + idy * 8, dst_init + idy * dst_stride,
+ num_4x4_blocks_wide * 4);
}
next:
{}
@@ -971,8 +985,8 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
return best_rd;
for (idy = 0; idy < num_4x4_blocks_high * 4; ++idy)
- vpx_memcpy(dst_init + idy * dst_stride, best_dst + idy * 8,
- num_4x4_blocks_wide * 4);
+ memcpy(dst_init + idy * dst_stride, best_dst + idy * 8,
+ num_4x4_blocks_wide * 4);
return best_rd;
}
@@ -983,10 +997,10 @@ static int64_t rd_pick_intra_sub_8x8_y_mode(VP9_COMP *cpi, MACROBLOCK *mb,
int64_t best_rd) {
int i, j;
const MACROBLOCKD *const xd = &mb->e_mbd;
- MODE_INFO *const mic = xd->mi[0].src_mi;
- const MODE_INFO *above_mi = xd->mi[-xd->mi_stride].src_mi;
- const MODE_INFO *left_mi = xd->left_available ? xd->mi[-1].src_mi : NULL;
- const BLOCK_SIZE bsize = xd->mi[0].src_mi->mbmi.sb_type;
+ MODE_INFO *const mic = xd->mi[0];
+ const MODE_INFO *above_mi = xd->above_mi;
+ const MODE_INFO *left_mi = xd->left_mi;
+ const BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
int idx, idy;
@@ -997,8 +1011,8 @@ static int64_t rd_pick_intra_sub_8x8_y_mode(VP9_COMP *cpi, MACROBLOCK *mb,
ENTROPY_CONTEXT t_above[4], t_left[4];
const int *bmode_costs = cpi->mbmode_cost;
- vpx_memcpy(t_above, xd->plane[0].above_context, sizeof(t_above));
- vpx_memcpy(t_left, xd->plane[0].left_context, sizeof(t_left));
+ memcpy(t_above, xd->plane[0].above_context, sizeof(t_above));
+ memcpy(t_left, xd->plane[0].left_context, sizeof(t_left));
// Pick modes for each sub-block (of size 4x4, 4x8, or 8x4) in an 8x8 block.
for (idy = 0; idy < 2; idy += num_4x4_blocks_high) {
@@ -1054,14 +1068,14 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x,
PREDICTION_MODE mode;
PREDICTION_MODE mode_selected = DC_PRED;
MACROBLOCKD *const xd = &x->e_mbd;
- MODE_INFO *const mic = xd->mi[0].src_mi;
+ MODE_INFO *const mic = xd->mi[0];
int this_rate, this_rate_tokenonly, s;
int64_t this_distortion, this_rd;
TX_SIZE best_tx = TX_4X4;
int i;
int *bmode_costs;
- const MODE_INFO *above_mi = xd->mi[-xd->mi_stride].src_mi;
- const MODE_INFO *left_mi = xd->left_available ? xd->mi[-1].src_mi : NULL;
+ const MODE_INFO *above_mi = xd->above_mi;
+ const MODE_INFO *left_mi = xd->left_mi;
const PREDICTION_MODE A = vp9_above_block_mode(mic, above_mi, 0);
const PREDICTION_MODE L = vp9_left_block_mode(mic, left_mi, 0);
bmode_costs = cpi->y_mode_costs[A][L];
@@ -1070,10 +1084,20 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x,
for (i = 0; i < TX_MODES; i++)
tx_cache[i] = INT64_MAX;
- vpx_memset(x->skip_txfm, 0, sizeof(x->skip_txfm));
+ memset(x->skip_txfm, 0, sizeof(x->skip_txfm));
/* Y Search for intra prediction mode */
for (mode = DC_PRED; mode <= TM_PRED; mode++) {
int64_t local_tx_cache[TX_MODES];
+
+ if (cpi->sf.use_nonrd_pick_mode) {
+ // These speed features are turned on in hybrid non-RD and RD mode
+ // for key frame coding in the context of real-time setting.
+ if (conditional_skipintra(mode, mode_selected))
+ continue;
+ if (*skippable)
+ break;
+ }
+
mic->mbmi.mode = mode;
super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion,
@@ -1119,7 +1143,7 @@ static int super_block_uvrd(const VP9_COMP *cpi, MACROBLOCK *x,
int64_t *sse, BLOCK_SIZE bsize,
int64_t ref_best_rd) {
MACROBLOCKD *const xd = &x->e_mbd;
- MB_MODE_INFO *const mbmi = &xd->mi[0].src_mi->mbmi;
+ MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
const TX_SIZE uv_tx_size = get_uv_tx_size(mbmi, &xd->plane[1]);
int plane;
int pnrate = 0, pnskip = 1;
@@ -1177,12 +1201,12 @@ static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x,
int this_rate_tokenonly, this_rate, s;
int64_t this_distortion, this_sse;
- vpx_memset(x->skip_txfm, 0, sizeof(x->skip_txfm));
+ memset(x->skip_txfm, 0, sizeof(x->skip_txfm));
for (mode = DC_PRED; mode <= TM_PRED; ++mode) {
if (!(cpi->sf.intra_uv_mode_mask[max_tx_size] & (1 << mode)))
continue;
- xd->mi[0].src_mi->mbmi.uv_mode = mode;
+ xd->mi[0]->mbmi.uv_mode = mode;
if (!super_block_uvrd(cpi, x, &this_rate_tokenonly,
&this_distortion, &s, &this_sse, bsize, best_rd))
@@ -1203,7 +1227,7 @@ static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x,
}
}
- xd->mi[0].src_mi->mbmi.uv_mode = mode_selected;
+ xd->mi[0]->mbmi.uv_mode = mode_selected;
return best_rd;
}
@@ -1214,21 +1238,20 @@ static int64_t rd_sbuv_dcpred(const VP9_COMP *cpi, MACROBLOCK *x,
const VP9_COMMON *cm = &cpi->common;
int64_t unused;
- x->e_mbd.mi[0].src_mi->mbmi.uv_mode = DC_PRED;
- vpx_memset(x->skip_txfm, 0, sizeof(x->skip_txfm));
+ x->e_mbd.mi[0]->mbmi.uv_mode = DC_PRED;
+ memset(x->skip_txfm, 0, sizeof(x->skip_txfm));
super_block_uvrd(cpi, x, rate_tokenonly, distortion,
skippable, &unused, bsize, INT64_MAX);
*rate = *rate_tokenonly + cpi->intra_uv_mode_cost[cm->frame_type][DC_PRED];
return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
}
-static void choose_intra_uv_mode(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
+static void choose_intra_uv_mode(VP9_COMP *cpi, MACROBLOCK *const x,
+ PICK_MODE_CONTEXT *ctx,
BLOCK_SIZE bsize, TX_SIZE max_tx_size,
int *rate_uv, int *rate_uv_tokenonly,
int64_t *dist_uv, int *skip_uv,
PREDICTION_MODE *mode_uv) {
- MACROBLOCK *const x = &cpi->mb;
-
// Use an estimated rd for uv_intra based on DC_PRED if the
// appropriate speed flag is set.
if (cpi->sf.use_uv_intra_rd_estimate) {
@@ -1241,7 +1264,7 @@ static void choose_intra_uv_mode(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
rate_uv, rate_uv_tokenonly, dist_uv, skip_uv,
bsize < BLOCK_8X8 ? BLOCK_8X8 : bsize, max_tx_size);
}
- *mode_uv = x->e_mbd.mi[0].src_mi->mbmi.uv_mode;
+ *mode_uv = x->e_mbd.mi[0]->mbmi.uv_mode;
}
static int cost_mv_ref(const VP9_COMP *cpi, PREDICTION_MODE mode,
@@ -1250,20 +1273,13 @@ static int cost_mv_ref(const VP9_COMP *cpi, PREDICTION_MODE mode,
return cpi->inter_mode_cost[mode_context][INTER_OFFSET(mode)];
}
-static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
- BLOCK_SIZE bsize,
- int_mv *frame_mv,
- int mi_row, int mi_col,
- int_mv single_newmv[MAX_REF_FRAMES],
- int *rate_mv);
-
static int set_and_cost_bmi_mvs(VP9_COMP *cpi, MACROBLOCKD *xd, int i,
PREDICTION_MODE mode, int_mv this_mv[2],
int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES],
int_mv seg_mvs[MAX_REF_FRAMES],
int_mv *best_ref_mv[2], const int *mvjcost,
int *mvcost[2]) {
- MODE_INFO *const mic = xd->mi[0].src_mi;
+ MODE_INFO *const mic = xd->mi[0];
const MB_MODE_INFO *const mbmi = &mic->mbmi;
int thismvcost = 0;
int idx, idy;
@@ -1305,8 +1321,7 @@ static int set_and_cost_bmi_mvs(VP9_COMP *cpi, MACROBLOCKD *xd, int i,
for (idy = 0; idy < num_4x4_blocks_high; ++idy)
for (idx = 0; idx < num_4x4_blocks_wide; ++idx)
- vpx_memcpy(&mic->bmi[i + idy * 2 + idx],
- &mic->bmi[i], sizeof(mic->bmi[i]));
+ memmove(&mic->bmi[i + idy * 2 + idx], &mic->bmi[i], sizeof(mic->bmi[i]));
return cost_mv_ref(cpi, mode, mbmi->mode_context[mbmi->ref_frame[0]]) +
thismvcost;
@@ -1325,16 +1340,16 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi,
MACROBLOCKD *xd = &x->e_mbd;
struct macroblockd_plane *const pd = &xd->plane[0];
struct macroblock_plane *const p = &x->plane[0];
- MODE_INFO *const mi = xd->mi[0].src_mi;
+ MODE_INFO *const mi = xd->mi[0];
const BLOCK_SIZE plane_bsize = get_plane_block_size(mi->mbmi.sb_type, pd);
const int width = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
const int height = 4 * num_4x4_blocks_high_lookup[plane_bsize];
int idx, idy;
- const uint8_t *const src = &p->src.buf[raster_block_offset(BLOCK_8X8, i,
- p->src.stride)];
- uint8_t *const dst = &pd->dst.buf[raster_block_offset(BLOCK_8X8, i,
- pd->dst.stride)];
+ const uint8_t *const src =
+ &p->src.buf[vp9_raster_block_offset(BLOCK_8X8, i, p->src.stride)];
+ uint8_t *const dst = &pd->dst.buf[vp9_raster_block_offset(BLOCK_8X8, i,
+ pd->dst.stride)];
int64_t thisdistortion = 0, thissse = 0;
int thisrate = 0, ref;
const scan_order *so = &vp9_default_scan_orders[TX_4X4];
@@ -1342,7 +1357,7 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi,
const InterpKernel *kernel = vp9_get_interp_kernel(mi->mbmi.interp_filter);
for (ref = 0; ref < 1 + is_compound; ++ref) {
- const uint8_t *pre = &pd->pre[ref].buf[raster_block_offset(BLOCK_8X8, i,
+ const uint8_t *pre = &pd->pre[ref].buf[vp9_raster_block_offset(BLOCK_8X8, i,
pd->pre[ref].stride)];
#if CONFIG_VP9_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
@@ -1376,17 +1391,17 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi,
#if CONFIG_VP9_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
vp9_highbd_subtract_block(
- height, width, raster_block_offset_int16(BLOCK_8X8, i, p->src_diff), 8,
- src, p->src.stride, dst, pd->dst.stride, xd->bd);
+ height, width, vp9_raster_block_offset_int16(BLOCK_8X8, i, p->src_diff),
+ 8, src, p->src.stride, dst, pd->dst.stride, xd->bd);
} else {
vp9_subtract_block(
- height, width, raster_block_offset_int16(BLOCK_8X8, i, p->src_diff), 8,
- src, p->src.stride, dst, pd->dst.stride);
+ height, width, vp9_raster_block_offset_int16(BLOCK_8X8, i, p->src_diff),
+ 8, src, p->src.stride, dst, pd->dst.stride);
}
#else
vp9_subtract_block(height, width,
- raster_block_offset_int16(BLOCK_8X8, i, p->src_diff), 8,
- src, p->src.stride, dst, pd->dst.stride);
+ vp9_raster_block_offset_int16(BLOCK_8X8, i, p->src_diff),
+ 8, src, p->src.stride, dst, pd->dst.stride);
#endif // CONFIG_VP9_HIGHBITDEPTH
k = i;
@@ -1397,7 +1412,7 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi,
k += (idy * 2 + idx);
coeff = BLOCK_OFFSET(p->coeff, k);
- x->fwd_txm4x4(raster_block_offset_int16(BLOCK_8X8, k, p->src_diff),
+ x->fwd_txm4x4(vp9_raster_block_offset_int16(BLOCK_8X8, k, p->src_diff),
coeff, 8);
vp9_regular_quantize_b_4x4(x, 0, k, so->scan, so->iscan);
#if CONFIG_VP9_HIGHBITDEPTH
@@ -1466,22 +1481,23 @@ static INLINE int mv_check_bounds(const MACROBLOCK *x, const MV *mv) {
}
static INLINE void mi_buf_shift(MACROBLOCK *x, int i) {
- MB_MODE_INFO *const mbmi = &x->e_mbd.mi[0].src_mi->mbmi;
+ MB_MODE_INFO *const mbmi = &x->e_mbd.mi[0]->mbmi;
struct macroblock_plane *const p = &x->plane[0];
struct macroblockd_plane *const pd = &x->e_mbd.plane[0];
- p->src.buf = &p->src.buf[raster_block_offset(BLOCK_8X8, i, p->src.stride)];
+ p->src.buf = &p->src.buf[vp9_raster_block_offset(BLOCK_8X8, i,
+ p->src.stride)];
assert(((intptr_t)pd->pre[0].buf & 0x7) == 0);
- pd->pre[0].buf = &pd->pre[0].buf[raster_block_offset(BLOCK_8X8, i,
- pd->pre[0].stride)];
+ pd->pre[0].buf = &pd->pre[0].buf[vp9_raster_block_offset(BLOCK_8X8, i,
+ pd->pre[0].stride)];
if (has_second_ref(mbmi))
- pd->pre[1].buf = &pd->pre[1].buf[raster_block_offset(BLOCK_8X8, i,
- pd->pre[1].stride)];
+ pd->pre[1].buf = &pd->pre[1].buf[vp9_raster_block_offset(BLOCK_8X8, i,
+ pd->pre[1].stride)];
}
static INLINE void mi_buf_restore(MACROBLOCK *x, struct buf_2d orig_src,
struct buf_2d orig_pre[2]) {
- MB_MODE_INFO *mbmi = &x->e_mbd.mi[0].src_mi->mbmi;
+ MB_MODE_INFO *mbmi = &x->e_mbd.mi[0]->mbmi;
x->plane[0].src = orig_src;
x->e_mbd.plane[0].pre[0] = orig_pre[0];
if (has_second_ref(mbmi))
@@ -1529,6 +1545,190 @@ static int check_best_zero_mv(
return 1;
}
+static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize,
+ int_mv *frame_mv,
+ int mi_row, int mi_col,
+ int_mv single_newmv[MAX_REF_FRAMES],
+ int *rate_mv) {
+ const VP9_COMMON *const cm = &cpi->common;
+ const int pw = 4 * num_4x4_blocks_wide_lookup[bsize];
+ const int ph = 4 * num_4x4_blocks_high_lookup[bsize];
+ MACROBLOCKD *xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+ const int refs[2] = {mbmi->ref_frame[0],
+ mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]};
+ int_mv ref_mv[2];
+ int ite, ref;
+ const InterpKernel *kernel = vp9_get_interp_kernel(mbmi->interp_filter);
+ struct scale_factors sf;
+
+ // Do joint motion search in compound mode to get more accurate mv.
+ struct buf_2d backup_yv12[2][MAX_MB_PLANE];
+ int last_besterr[2] = {INT_MAX, INT_MAX};
+ const YV12_BUFFER_CONFIG *const scaled_ref_frame[2] = {
+ vp9_get_scaled_ref_frame(cpi, mbmi->ref_frame[0]),
+ vp9_get_scaled_ref_frame(cpi, mbmi->ref_frame[1])
+ };
+
+ // Prediction buffer from second frame.
+#if CONFIG_VP9_HIGHBITDEPTH
+ DECLARE_ALIGNED(16, uint16_t, second_pred_alloc_16[64 * 64]);
+ uint8_t *second_pred;
+#else
+ DECLARE_ALIGNED(16, uint8_t, second_pred[64 * 64]);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+ for (ref = 0; ref < 2; ++ref) {
+ ref_mv[ref] = mbmi->ref_mvs[refs[ref]][0];
+
+ if (scaled_ref_frame[ref]) {
+ int i;
+ // Swap out the reference frame for a version that's been scaled to
+ // match the resolution of the current frame, allowing the existing
+ // motion search code to be used without additional modifications.
+ for (i = 0; i < MAX_MB_PLANE; i++)
+ backup_yv12[ref][i] = xd->plane[i].pre[ref];
+ vp9_setup_pre_planes(xd, ref, scaled_ref_frame[ref], mi_row, mi_col,
+ NULL);
+ }
+
+ frame_mv[refs[ref]].as_int = single_newmv[refs[ref]].as_int;
+ }
+
+ // Since we have scaled the reference frames to match the size of the current
+ // frame, we must use a unit scaling factor during mode selection.
+#if CONFIG_VP9_HIGHBITDEPTH
+ vp9_setup_scale_factors_for_frame(&sf, cm->width, cm->height,
+ cm->width, cm->height,
+ cm->use_highbitdepth);
+#else
+ vp9_setup_scale_factors_for_frame(&sf, cm->width, cm->height,
+ cm->width, cm->height);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+ // Iterate the joint search over the two reference frames, and break out
+ // of the search loop once an iteration fails to find a better mv.
+ for (ite = 0; ite < 4; ite++) {
+ struct buf_2d ref_yv12[2];
+ int bestsme = INT_MAX;
+ int sadpb = x->sadperbit16;
+ MV tmp_mv;
+ int search_range = 3;
+
+ int tmp_col_min = x->mv_col_min;
+ int tmp_col_max = x->mv_col_max;
+ int tmp_row_min = x->mv_row_min;
+ int tmp_row_max = x->mv_row_max;
+ int id = ite % 2; // Even iterations search in the first reference frame,
+ // odd iterations search in the second. The predictor
+ // found for the 'other' reference frame is factored in.
+
+ // Initialized here because of a compiler problem in Visual Studio.
+ ref_yv12[0] = xd->plane[0].pre[0];
+ ref_yv12[1] = xd->plane[0].pre[1];
+
+ // Get the prediction block from the 'other' reference frame.
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ second_pred = CONVERT_TO_BYTEPTR(second_pred_alloc_16);
+ vp9_highbd_build_inter_predictor(ref_yv12[!id].buf,
+ ref_yv12[!id].stride,
+ second_pred, pw,
+ &frame_mv[refs[!id]].as_mv,
+ &sf, pw, ph, 0,
+ kernel, MV_PRECISION_Q3,
+ mi_col * MI_SIZE, mi_row * MI_SIZE,
+ xd->bd);
+ } else {
+ second_pred = (uint8_t *)second_pred_alloc_16;
+ vp9_build_inter_predictor(ref_yv12[!id].buf,
+ ref_yv12[!id].stride,
+ second_pred, pw,
+ &frame_mv[refs[!id]].as_mv,
+ &sf, pw, ph, 0,
+ kernel, MV_PRECISION_Q3,
+ mi_col * MI_SIZE, mi_row * MI_SIZE);
+ }
+#else
+ vp9_build_inter_predictor(ref_yv12[!id].buf,
+ ref_yv12[!id].stride,
+ second_pred, pw,
+ &frame_mv[refs[!id]].as_mv,
+ &sf, pw, ph, 0,
+ kernel, MV_PRECISION_Q3,
+ mi_col * MI_SIZE, mi_row * MI_SIZE);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+ // Do compound motion search on the current reference frame.
+ if (id)
+ xd->plane[0].pre[0] = ref_yv12[id];
+ vp9_set_mv_search_range(x, &ref_mv[id].as_mv);
+
+ // Use the mv result from the single mode as mv predictor.
+ tmp_mv = frame_mv[refs[id]].as_mv;
+
+ tmp_mv.col >>= 3;
+ tmp_mv.row >>= 3;
+
+ // Small-range full-pixel motion search.
+ bestsme = vp9_refining_search_8p_c(x, &tmp_mv, sadpb,
+ search_range,
+ &cpi->fn_ptr[bsize],
+ &ref_mv[id].as_mv, second_pred);
+ if (bestsme < INT_MAX)
+ bestsme = vp9_get_mvpred_av_var(x, &tmp_mv, &ref_mv[id].as_mv,
+ second_pred, &cpi->fn_ptr[bsize], 1);
+
+ x->mv_col_min = tmp_col_min;
+ x->mv_col_max = tmp_col_max;
+ x->mv_row_min = tmp_row_min;
+ x->mv_row_max = tmp_row_max;
+
+ if (bestsme < INT_MAX) {
+ int dis; /* TODO: use dis in distortion calculation later. */
+ unsigned int sse;
+ bestsme = cpi->find_fractional_mv_step(
+ x, &tmp_mv,
+ &ref_mv[id].as_mv,
+ cpi->common.allow_high_precision_mv,
+ x->errorperbit,
+ &cpi->fn_ptr[bsize],
+ 0, cpi->sf.mv.subpel_iters_per_step,
+ NULL,
+ x->nmvjointcost, x->mvcost,
+ &dis, &sse, second_pred,
+ pw, ph);
+ }
+
+ // Restore the pointer to the first (possibly scaled) prediction buffer.
+ if (id)
+ xd->plane[0].pre[0] = ref_yv12[0];
+
+ if (bestsme < last_besterr[id]) {
+ frame_mv[refs[id]].as_mv = tmp_mv;
+ last_besterr[id] = bestsme;
+ } else {
+ break;
+ }
+ }
+
+ *rate_mv = 0;
+
+ for (ref = 0; ref < 2; ++ref) {
+ if (scaled_ref_frame[ref]) {
+ // Restore the prediction frame pointers to their unscaled versions.
+ int i;
+ for (i = 0; i < MAX_MB_PLANE; i++)
+ xd->plane[i].pre[ref] = backup_yv12[ref][i];
+ }
+
+ *rate_mv += vp9_mv_bit_cost(&frame_mv[refs[ref]].as_mv,
+ &mbmi->ref_mvs[refs[ref]][0].as_mv,
+ x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
+ }
+}
+
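// The loop above alternates which reference is refined: even iterations hold
// the ref-1 predictor fixed while searching ref 0, odd iterations the
// reverse, stopping once an iteration fails to beat that reference's best
// error. A control-flow sketch with a stubbed search (search_one_ref is
// hypothetical, standing in for the refining + subpel steps):
#include <limits.h>
#include <stdio.h>

static int search_one_ref(int id, int ite) {
  // Stub: pretend errors improve for a few rounds, then stall.
  static const int err[4] = {100, 90, 85, 90};
  (void)id;
  return err[ite];
}

int main(void) {
  int last_besterr[2] = {INT_MAX, INT_MAX};
  int ite;
  for (ite = 0; ite < 4; ite++) {
    const int id = ite % 2;  // even: first reference, odd: second
    const int bestsme = search_one_ref(id, ite);
    if (bestsme < last_besterr[id]) {
      last_besterr[id] = bestsme;  // keep the refined mv for this ref
    } else {
      break;  // no further gain: stop alternating
    }
  }
  printf("stopped after iteration %d\n", ite);
  return 0;  // err sequence 100,90,85,90 -> stops at ite == 3
}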
static int64_t rd_pick_best_sub8x8_mode(VP9_COMP *cpi, MACROBLOCK *x,
const TileInfo * const tile,
int_mv *best_ref_mv,
@@ -1544,7 +1744,7 @@ static int64_t rd_pick_best_sub8x8_mode(VP9_COMP *cpi, MACROBLOCK *x,
int i;
BEST_SEG_INFO *bsi = bsi_buf + filter_idx;
MACROBLOCKD *xd = &x->e_mbd;
- MODE_INFO *mi = xd->mi[0].src_mi;
+ MODE_INFO *mi = xd->mi[0];
MB_MODE_INFO *mbmi = &mi->mbmi;
int mode_idx;
int k, br = 0, idx, idy;
@@ -1576,8 +1776,8 @@ static int64_t rd_pick_best_sub8x8_mode(VP9_COMP *cpi, MACROBLOCK *x,
for (i = 0; i < 4; i++)
bsi->modes[i] = ZEROMV;
- vpx_memcpy(t_above, pd->above_context, sizeof(t_above));
- vpx_memcpy(t_left, pd->left_context, sizeof(t_left));
+ memcpy(t_above, pd->above_context, sizeof(t_above));
+ memcpy(t_left, pd->left_context, sizeof(t_left));
// 64 makes this threshold really big effectively
// making it so that we very rarely check mvs on
@@ -1619,11 +1819,11 @@ static int64_t rd_pick_best_sub8x8_mode(VP9_COMP *cpi, MACROBLOCK *x,
this_mode, mbmi->ref_frame))
continue;
- vpx_memcpy(orig_pre, pd->pre, sizeof(orig_pre));
- vpx_memcpy(bsi->rdstat[i][mode_idx].ta, t_above,
- sizeof(bsi->rdstat[i][mode_idx].ta));
- vpx_memcpy(bsi->rdstat[i][mode_idx].tl, t_left,
- sizeof(bsi->rdstat[i][mode_idx].tl));
+ memcpy(orig_pre, pd->pre, sizeof(orig_pre));
+ memcpy(bsi->rdstat[i][mode_idx].ta, t_above,
+ sizeof(bsi->rdstat[i][mode_idx].ta));
+ memcpy(bsi->rdstat[i][mode_idx].tl, t_left,
+ sizeof(bsi->rdstat[i][mode_idx].tl));
// motion search for newmv (single predictor case only)
if (!has_second_rf && this_mode == NEWMV &&
@@ -1799,8 +1999,8 @@ static int64_t rd_pick_best_sub8x8_mode(VP9_COMP *cpi, MACROBLOCK *x,
if (!subpelmv && have_ref &&
ref_bsi->rdstat[i][mode_idx].brdcost < INT64_MAX) {
- vpx_memcpy(&bsi->rdstat[i][mode_idx], &ref_bsi->rdstat[i][mode_idx],
- sizeof(SEG_RDSTAT));
+ memcpy(&bsi->rdstat[i][mode_idx], &ref_bsi->rdstat[i][mode_idx],
+ sizeof(SEG_RDSTAT));
if (num_4x4_blocks_wide > 1)
bsi->rdstat[i + 1][mode_idx].eobs =
ref_bsi->rdstat[i + 1][mode_idx].eobs;
@@ -1848,12 +2048,12 @@ static int64_t rd_pick_best_sub8x8_mode(VP9_COMP *cpi, MACROBLOCK *x,
for (midx = 0; midx < INTER_MODES; ++midx)
bsi->rdstat[iy][midx].brdcost = INT64_MAX;
bsi->segment_rd = INT64_MAX;
- return INT64_MAX;;
+ return INT64_MAX;
}
mode_idx = INTER_OFFSET(mode_selected);
- vpx_memcpy(t_above, bsi->rdstat[i][mode_idx].ta, sizeof(t_above));
- vpx_memcpy(t_left, bsi->rdstat[i][mode_idx].tl, sizeof(t_left));
+ memcpy(t_above, bsi->rdstat[i][mode_idx].ta, sizeof(t_above));
+ memcpy(t_left, bsi->rdstat[i][mode_idx].tl, sizeof(t_left));
set_and_cost_bmi_mvs(cpi, xd, i, mode_selected, mode_mv[mode_selected],
frame_mv, seg_mvs[i], bsi->ref_mv, x->nmvjointcost,
@@ -1871,7 +2071,7 @@ static int64_t rd_pick_best_sub8x8_mode(VP9_COMP *cpi, MACROBLOCK *x,
for (midx = 0; midx < INTER_MODES; ++midx)
bsi->rdstat[iy][midx].brdcost = INT64_MAX;
bsi->segment_rd = INT64_MAX;
- return INT64_MAX;;
+ return INT64_MAX;
}
}
} /* for each label */
@@ -1920,8 +2120,8 @@ static void estimate_ref_frame_costs(const VP9_COMMON *cm,
int seg_ref_active = vp9_segfeature_active(&cm->seg, segment_id,
SEG_LVL_REF_FRAME);
if (seg_ref_active) {
- vpx_memset(ref_costs_single, 0, MAX_REF_FRAMES * sizeof(*ref_costs_single));
- vpx_memset(ref_costs_comp, 0, MAX_REF_FRAMES * sizeof(*ref_costs_comp));
+ memset(ref_costs_single, 0, MAX_REF_FRAMES * sizeof(*ref_costs_single));
+ memset(ref_costs_comp, 0, MAX_REF_FRAMES * sizeof(*ref_costs_comp));
*comp_mode_p = 128;
} else {
vp9_prob intra_inter_p = vp9_get_intra_inter_prob(cm, xd);
@@ -1985,14 +2185,14 @@ static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
ctx->skip = x->skip;
ctx->skippable = skippable;
ctx->best_mode_index = mode_index;
- ctx->mic = *xd->mi[0].src_mi;
+ ctx->mic = *xd->mi[0];
ctx->single_pred_diff = (int)comp_pred_diff[SINGLE_REFERENCE];
ctx->comp_pred_diff = (int)comp_pred_diff[COMPOUND_REFERENCE];
ctx->hybrid_pred_diff = (int)comp_pred_diff[REFERENCE_MODE_SELECT];
- vpx_memcpy(ctx->tx_rd_diff, tx_size_diff, sizeof(ctx->tx_rd_diff));
- vpx_memcpy(ctx->best_filter_diff, best_filter_diff,
- sizeof(*best_filter_diff) * SWITCHABLE_FILTER_CONTEXTS);
+ memcpy(ctx->tx_rd_diff, tx_size_diff, sizeof(ctx->tx_rd_diff));
+ memcpy(ctx->best_filter_diff, best_filter_diff,
+ sizeof(*best_filter_diff) * SWITCHABLE_FILTER_CONTEXTS);
}
static void setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x,
@@ -2006,16 +2206,19 @@ static void setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x,
const VP9_COMMON *cm = &cpi->common;
const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame);
MACROBLOCKD *const xd = &x->e_mbd;
- MODE_INFO *const mi = xd->mi[0].src_mi;
+ MODE_INFO *const mi = xd->mi[0];
int_mv *const candidates = mi->mbmi.ref_mvs[ref_frame];
const struct scale_factors *const sf = &cm->frame_refs[ref_frame - 1].sf;
+ assert(yv12 != NULL);
+
// TODO(jkoleszar): Is the UV buffer ever used here? If so, need to make this
// use the UV scaling factors.
vp9_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col, sf, sf);
// Gets an initial list of candidate vectors from neighbours and orders them
- vp9_find_mv_refs(cm, xd, tile, mi, ref_frame, candidates, mi_row, mi_col);
+ vp9_find_mv_refs(cm, xd, tile, mi, ref_frame, candidates, mi_row, mi_col,
+ NULL, NULL);
// Candidate refinement carried out at encoder and decoder
vp9_find_best_ref_mvs(xd, cm->allow_high_precision_mv, candidates,
@@ -2036,7 +2239,7 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
int_mv *tmp_mv, int *rate_mv) {
MACROBLOCKD *xd = &x->e_mbd;
const VP9_COMMON *cm = &cpi->common;
- MB_MODE_INFO *mbmi = &xd->mi[0].src_mi->mbmi;
+ MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0, 0}};
int bestsme = INT_MAX;
int step_param;
@@ -2093,24 +2296,27 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
if (cpi->sf.adaptive_motion_search) {
int bwl = b_width_log2_lookup[bsize];
int bhl = b_height_log2_lookup[bsize];
- int i;
int tlevel = x->pred_mv_sad[ref] >> (bwl + bhl + 4);
if (tlevel < 5)
step_param += 2;
- for (i = LAST_FRAME; i <= ALTREF_FRAME && cm->show_frame; ++i) {
- if ((x->pred_mv_sad[ref] >> 3) > x->pred_mv_sad[i]) {
- x->pred_mv[ref].row = 0;
- x->pred_mv[ref].col = 0;
- tmp_mv->as_int = INVALID_MV;
-
- if (scaled_ref_frame) {
- int i;
- for (i = 0; i < MAX_MB_PLANE; i++)
- xd->plane[i].pre[0] = backup_yv12[i];
+ // pred_mv_sad is not set up for dynamically scaled frames.
+ if (cpi->oxcf.resize_mode != RESIZE_DYNAMIC) {
+ int i;
+ for (i = LAST_FRAME; i <= ALTREF_FRAME && cm->show_frame; ++i) {
+ if ((x->pred_mv_sad[ref] >> 3) > x->pred_mv_sad[i]) {
+ x->pred_mv[ref].row = 0;
+ x->pred_mv[ref].col = 0;
+ tmp_mv->as_int = INVALID_MV;
+
+ if (scaled_ref_frame) {
+ int i;
+ for (i = 0; i < MAX_MB_PLANE; ++i)
+ xd->plane[i].pre[0] = backup_yv12[i];
+ }
+ return;
}
- return;
}
}
}
@@ -2154,189 +2360,7 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
}
}
-static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
- BLOCK_SIZE bsize,
- int_mv *frame_mv,
- int mi_row, int mi_col,
- int_mv single_newmv[MAX_REF_FRAMES],
- int *rate_mv) {
- const int pw = 4 * num_4x4_blocks_wide_lookup[bsize];
- const int ph = 4 * num_4x4_blocks_high_lookup[bsize];
- MACROBLOCKD *xd = &x->e_mbd;
- MB_MODE_INFO *mbmi = &xd->mi[0].src_mi->mbmi;
- const int refs[2] = { mbmi->ref_frame[0],
- mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1] };
- int_mv ref_mv[2];
- int ite, ref;
- // Prediction buffer from second frame.
-#if CONFIG_VP9_HIGHBITDEPTH
- uint8_t *second_pred;
- uint8_t *second_pred_alloc;
-#else
- uint8_t *second_pred = vpx_memalign(16, pw * ph * sizeof(uint8_t));
-#endif // CONFIG_VP9_HIGHBITDEPTH
- const InterpKernel *kernel = vp9_get_interp_kernel(mbmi->interp_filter);
-
- // Do joint motion search in compound mode to get more accurate mv.
- struct buf_2d backup_yv12[2][MAX_MB_PLANE];
- struct buf_2d scaled_first_yv12 = xd->plane[0].pre[0];
- int last_besterr[2] = {INT_MAX, INT_MAX};
- const YV12_BUFFER_CONFIG *const scaled_ref_frame[2] = {
- vp9_get_scaled_ref_frame(cpi, mbmi->ref_frame[0]),
- vp9_get_scaled_ref_frame(cpi, mbmi->ref_frame[1])
- };
-#if CONFIG_VP9_HIGHBITDEPTH
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- second_pred_alloc = vpx_memalign(16, pw * ph * sizeof(uint16_t));
- second_pred = CONVERT_TO_BYTEPTR(second_pred_alloc);
- } else {
- second_pred_alloc = vpx_memalign(16, pw * ph * sizeof(uint8_t));
- second_pred = second_pred_alloc;
- }
-#endif // CONFIG_VP9_HIGHBITDEPTH
-
- for (ref = 0; ref < 2; ++ref) {
- ref_mv[ref] = mbmi->ref_mvs[refs[ref]][0];
-
- if (scaled_ref_frame[ref]) {
- int i;
- // Swap out the reference frame for a version that's been scaled to
- // match the resolution of the current frame, allowing the existing
- // motion search code to be used without additional modifications.
- for (i = 0; i < MAX_MB_PLANE; i++)
- backup_yv12[ref][i] = xd->plane[i].pre[ref];
- vp9_setup_pre_planes(xd, ref, scaled_ref_frame[ref], mi_row, mi_col,
- NULL);
- }
-
- frame_mv[refs[ref]].as_int = single_newmv[refs[ref]].as_int;
- }
-
- // Allow joint search multiple times iteratively for each ref frame
- // and break out the search loop if it couldn't find better mv.
- for (ite = 0; ite < 4; ite++) {
- struct buf_2d ref_yv12[2];
- int bestsme = INT_MAX;
- int sadpb = x->sadperbit16;
- MV tmp_mv;
- int search_range = 3;
-
- int tmp_col_min = x->mv_col_min;
- int tmp_col_max = x->mv_col_max;
- int tmp_row_min = x->mv_row_min;
- int tmp_row_max = x->mv_row_max;
- int id = ite % 2;
- // Initialized here because of compiler problem in Visual Studio.
- ref_yv12[0] = xd->plane[0].pre[0];
- ref_yv12[1] = xd->plane[0].pre[1];
-
- // Get pred block from second frame.
-#if CONFIG_VP9_HIGHBITDEPTH
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- vp9_highbd_build_inter_predictor(ref_yv12[!id].buf,
- ref_yv12[!id].stride,
- second_pred, pw,
- &frame_mv[refs[!id]].as_mv,
- &xd->block_refs[!id]->sf,
- pw, ph, 0,
- kernel, MV_PRECISION_Q3,
- mi_col * MI_SIZE, mi_row * MI_SIZE,
- xd->bd);
- } else {
- vp9_build_inter_predictor(ref_yv12[!id].buf,
- ref_yv12[!id].stride,
- second_pred, pw,
- &frame_mv[refs[!id]].as_mv,
- &xd->block_refs[!id]->sf,
- pw, ph, 0,
- kernel, MV_PRECISION_Q3,
- mi_col * MI_SIZE, mi_row * MI_SIZE);
- }
-#else
- vp9_build_inter_predictor(ref_yv12[!id].buf,
- ref_yv12[!id].stride,
- second_pred, pw,
- &frame_mv[refs[!id]].as_mv,
- &xd->block_refs[!id]->sf,
- pw, ph, 0,
- kernel, MV_PRECISION_Q3,
- mi_col * MI_SIZE, mi_row * MI_SIZE);
-#endif // CONFIG_VP9_HIGHBITDEPTH
-
- // Compound motion search on first ref frame.
- if (id)
- xd->plane[0].pre[0] = ref_yv12[id];
- vp9_set_mv_search_range(x, &ref_mv[id].as_mv);
-
- // Use mv result from single mode as mvp.
- tmp_mv = frame_mv[refs[id]].as_mv;
-
- tmp_mv.col >>= 3;
- tmp_mv.row >>= 3;
-
- // Small-range full-pixel motion search
- bestsme = vp9_refining_search_8p_c(x, &tmp_mv, sadpb,
- search_range,
- &cpi->fn_ptr[bsize],
- &ref_mv[id].as_mv, second_pred);
- if (bestsme < INT_MAX)
- bestsme = vp9_get_mvpred_av_var(x, &tmp_mv, &ref_mv[id].as_mv,
- second_pred, &cpi->fn_ptr[bsize], 1);
-
- x->mv_col_min = tmp_col_min;
- x->mv_col_max = tmp_col_max;
- x->mv_row_min = tmp_row_min;
- x->mv_row_max = tmp_row_max;
-
- if (bestsme < INT_MAX) {
- int dis; /* TODO: use dis in distortion calculation later. */
- unsigned int sse;
- bestsme = cpi->find_fractional_mv_step(
- x, &tmp_mv,
- &ref_mv[id].as_mv,
- cpi->common.allow_high_precision_mv,
- x->errorperbit,
- &cpi->fn_ptr[bsize],
- 0, cpi->sf.mv.subpel_iters_per_step,
- NULL,
- x->nmvjointcost, x->mvcost,
- &dis, &sse, second_pred,
- pw, ph);
- }
-
- if (id)
- xd->plane[0].pre[0] = scaled_first_yv12;
-
- if (bestsme < last_besterr[id]) {
- frame_mv[refs[id]].as_mv = tmp_mv;
- last_besterr[id] = bestsme;
- } else {
- break;
- }
- }
-
- *rate_mv = 0;
-
- for (ref = 0; ref < 2; ++ref) {
- if (scaled_ref_frame[ref]) {
- // restore the predictor
- int i;
- for (i = 0; i < MAX_MB_PLANE; i++)
- xd->plane[i].pre[ref] = backup_yv12[ref][i];
- }
-
- *rate_mv += vp9_mv_bit_cost(&frame_mv[refs[ref]].as_mv,
- &mbmi->ref_mvs[refs[ref]][0].as_mv,
- x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
- }
-
-#if CONFIG_VP9_HIGHBITDEPTH
- vpx_free(second_pred_alloc);
-#else
- vpx_free(second_pred);
-#endif // CONFIG_VP9_HIGHBITDEPTH
-}
static INLINE void restore_dst_buf(MACROBLOCKD *xd,
uint8_t *orig_dst[MAX_MB_PLANE],
@@ -2348,6 +2372,27 @@ static INLINE void restore_dst_buf(MACROBLOCKD *xd,
}
}
+// In some situations we want to discount the apparent cost of a new motion
+// vector. Where there is a subtle motion field and especially where there is
+// low spatial complexity then it can be hard to cover the cost of a new motion
+// vector in a single block, even if that motion vector reduces distortion.
+// However, once established that vector may be usable through the nearest and
+// near mv modes to reduce distortion in subsequent blocks and also improve
+// visual quality.
+static int discount_newmv_test(const VP9_COMP *cpi,
+ int this_mode,
+ int_mv this_mv,
+ int_mv (*mode_mv)[MAX_REF_FRAMES],
+ int ref_frame) {
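+  // Discount only when this NEWMV is non-zero while both the nearest and
+  // near reference mvs are zero or unavailable, i.e. when no usable motion
+  // field has yet been established around this block.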
+ return (!cpi->rc.is_src_frame_alt_ref &&
+ (this_mode == NEWMV) &&
+ (this_mv.as_int != 0) &&
+ ((mode_mv[NEARESTMV][ref_frame].as_int == 0) ||
+ (mode_mv[NEARESTMV][ref_frame].as_int == INVALID_MV)) &&
+ ((mode_mv[NEARMV][ref_frame].as_int == 0) ||
+ (mode_mv[NEARMV][ref_frame].as_int == INVALID_MV)));
+}
+
static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
BLOCK_SIZE bsize,
int64_t txfm_cache[],
@@ -2361,11 +2406,12 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
INTERP_FILTER (*single_filter)[MAX_REF_FRAMES],
int (*single_skippable)[MAX_REF_FRAMES],
int64_t *psse,
- const int64_t ref_best_rd) {
+ const int64_t ref_best_rd,
+ int64_t *mask_filter,
+ int64_t filter_cache[]) {
VP9_COMMON *cm = &cpi->common;
- RD_OPT *rd_opt = &cpi->rd;
MACROBLOCKD *xd = &x->e_mbd;
- MB_MODE_INFO *mbmi = &xd->mi[0].src_mi->mbmi;
+ MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
const int is_comp_pred = has_second_ref(mbmi);
const int this_mode = mbmi->mode;
int_mv *frame_mv = mode_mv[this_mode];
@@ -2374,11 +2420,10 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
(mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]) };
int_mv cur_mv[2];
#if CONFIG_VP9_HIGHBITDEPTH
- DECLARE_ALIGNED_ARRAY(16, uint16_t, tmp_buf16, MAX_MB_PLANE * 64 * 64);
- DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp_buf8, MAX_MB_PLANE * 64 * 64);
+ DECLARE_ALIGNED(16, uint16_t, tmp_buf16[MAX_MB_PLANE * 64 * 64]);
uint8_t *tmp_buf;
#else
- DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp_buf, MAX_MB_PLANE * 64 * 64);
+ DECLARE_ALIGNED(16, uint8_t, tmp_buf[MAX_MB_PLANE * 64 * 64]);
#endif // CONFIG_VP9_HIGHBITDEPTH
int pred_exists = 0;
int intpel_mv;
@@ -2404,16 +2449,16 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
tmp_buf = CONVERT_TO_BYTEPTR(tmp_buf16);
} else {
- tmp_buf = tmp_buf8;
+ tmp_buf = (uint8_t *)tmp_buf16;
}
#endif // CONFIG_VP9_HIGHBITDEPTH
if (pred_filter_search) {
INTERP_FILTER af = SWITCHABLE, lf = SWITCHABLE;
if (xd->up_available)
- af = xd->mi[-xd->mi_stride].src_mi->mbmi.interp_filter;
+ af = xd->mi[-xd->mi_stride]->mbmi.interp_filter;
if (xd->left_available)
- lf = xd->mi[-1].src_mi->mbmi.interp_filter;
+ lf = xd->mi[-1]->mbmi.interp_filter;
if ((this_mode != NEWMV) || (af == lf))
best_filter = af;
@@ -2456,10 +2501,20 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
&tmp_mv, &rate_mv);
if (tmp_mv.as_int == INVALID_MV)
return INT64_MAX;
- *rate2 += rate_mv;
+
frame_mv[refs[0]].as_int =
- xd->mi[0].src_mi->bmi[0].as_mv[0].as_int = tmp_mv.as_int;
+ xd->mi[0]->bmi[0].as_mv[0].as_int = tmp_mv.as_int;
single_newmv[refs[0]].as_int = tmp_mv.as_int;
+
+ // Estimate the rate implications of a new mv but discount this
+ // under certain circumstances where we want to help initiate a weak
+ // motion field, where the distortion gain for a single block may not
+ // be enough to overcome the cost of a new mv.
+ if (discount_newmv_test(cpi, this_mode, tmp_mv, mode_mv, refs[0])) {
+ *rate2 += MAX((rate_mv / NEW_MV_DISCOUNT_FACTOR), 1);
+ } else {
+ *rate2 += rate_mv;
+ }
}
}
@@ -2484,11 +2539,20 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
orig_dst_stride[i] = xd->plane[i].dst.stride;
}
- /* We don't include the cost of the second reference here, because there
- * are only three options: Last/Golden, ARF/Last or Golden/ARF, or in other
- * words if you present them in that order, the second one is always known
- * if the first is known */
- *rate2 += cost_mv_ref(cpi, this_mode, mbmi->mode_context[refs[0]]);
+ // We don't include the cost of the second reference here, because there
+ // are only three options: Last/Golden, ARF/Last or Golden/ARF, or in other
+ // words if you present them in that order, the second one is always known
+ // if the first is known.
+ //
+ // Under some circumstances we discount the cost of new mv mode to encourage
+ // initiation of a motion field.
+ if (discount_newmv_test(cpi, this_mode, frame_mv[refs[0]],
+ mode_mv, refs[0])) {
+ *rate2 += MIN(cost_mv_ref(cpi, this_mode, mbmi->mode_context[refs[0]]),
+ cost_mv_ref(cpi, NEARESTMV, mbmi->mode_context[refs[0]]));
+ } else {
+ *rate2 += cost_mv_ref(cpi, this_mode, mbmi->mode_context[refs[0]]);
+ }
if (RDCOST(x->rdmult, x->rddiv, *rate2, 0) > ref_best_rd &&
mbmi->mode != NEARESTMV)
@@ -2502,9 +2566,8 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
// Search for best switchable filter by checking the variance of
// pred error irrespective of whether the filter will be used
- rd_opt->mask_filter = 0;
for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
- rd_opt->filter_cache[i] = INT64_MAX;
+ filter_cache[i] = INT64_MAX;
if (cm->interp_filter != BILINEAR) {
if (x->source_variance < cpi->sf.disable_filter_search_var_thresh) {
@@ -2521,17 +2584,17 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
int64_t tmp_skip_sse = INT64_MAX;
mbmi->interp_filter = i;
- rs = vp9_get_switchable_rate(cpi);
+ rs = vp9_get_switchable_rate(cpi, xd);
rs_rd = RDCOST(x->rdmult, x->rddiv, rs, 0);
if (i > 0 && intpel_mv) {
rd = RDCOST(x->rdmult, x->rddiv, tmp_rate_sum, tmp_dist_sum);
- rd_opt->filter_cache[i] = rd;
- rd_opt->filter_cache[SWITCHABLE_FILTERS] =
- MIN(rd_opt->filter_cache[SWITCHABLE_FILTERS], rd + rs_rd);
+ filter_cache[i] = rd;
+ filter_cache[SWITCHABLE_FILTERS] =
+ MIN(filter_cache[SWITCHABLE_FILTERS], rd + rs_rd);
if (cm->interp_filter == SWITCHABLE)
rd += rs_rd;
- rd_opt->mask_filter = MAX(rd_opt->mask_filter, rd);
+ *mask_filter = MAX(*mask_filter, rd);
} else {
int rate_sum = 0;
int64_t dist_sum = 0;
@@ -2559,12 +2622,12 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
&tmp_skip_sb, &tmp_skip_sse);
rd = RDCOST(x->rdmult, x->rddiv, rate_sum, dist_sum);
- rd_opt->filter_cache[i] = rd;
- rd_opt->filter_cache[SWITCHABLE_FILTERS] =
- MIN(rd_opt->filter_cache[SWITCHABLE_FILTERS], rd + rs_rd);
+ filter_cache[i] = rd;
+ filter_cache[SWITCHABLE_FILTERS] =
+ MIN(filter_cache[SWITCHABLE_FILTERS], rd + rs_rd);
if (cm->interp_filter == SWITCHABLE)
rd += rs_rd;
- rd_opt->mask_filter = MAX(rd_opt->mask_filter, rd);
+ *mask_filter = MAX(*mask_filter, rd);
if (i == 0 && intpel_mv) {
tmp_rate_sum = rate_sum;
@@ -2595,8 +2658,8 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
skip_txfm_sb = tmp_skip_sb;
skip_sse_sb = tmp_skip_sse;
- vpx_memcpy(skip_txfm, x->skip_txfm, sizeof(skip_txfm));
- vpx_memcpy(bsse, x->bsse, sizeof(bsse));
+ memcpy(skip_txfm, x->skip_txfm, sizeof(skip_txfm));
+ memcpy(bsse, x->bsse, sizeof(bsse));
}
}
restore_dst_buf(xd, orig_dst, orig_dst_stride);
@@ -2605,7 +2668,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
// Set the appropriate filter
mbmi->interp_filter = cm->interp_filter != SWITCHABLE ?
cm->interp_filter : best_filter;
- rs = cm->interp_filter == SWITCHABLE ? vp9_get_switchable_rate(cpi) : 0;
+ rs = cm->interp_filter == SWITCHABLE ? vp9_get_switchable_rate(cpi, xd) : 0;
if (pred_exists) {
if (best_needs_copy) {
@@ -2626,8 +2689,8 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
model_rd_for_sb(cpi, bsize, x, xd, &tmp_rate, &tmp_dist,
&skip_txfm_sb, &skip_sse_sb);
rd = RDCOST(x->rdmult, x->rddiv, rs + tmp_rate, tmp_dist);
- vpx_memcpy(skip_txfm, x->skip_txfm, sizeof(skip_txfm));
- vpx_memcpy(bsse, x->bsse, sizeof(bsse));
+ memcpy(skip_txfm, x->skip_txfm, sizeof(skip_txfm));
+ memcpy(bsse, x->bsse, sizeof(bsse));
}
if (!is_comp_pred)
@@ -2637,7 +2700,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
if (is_comp_pred)
if (single_skippable[this_mode][refs[0]] &&
single_skippable[this_mode][refs[1]])
- vpx_memset(skip_txfm, 1, sizeof(skip_txfm));
+ memset(skip_txfm, 1, sizeof(skip_txfm));
if (cpi->sf.use_rd_breakout && ref_best_rd < INT64_MAX) {
// if current pred_error modeled rd is substantially more than the best
@@ -2651,8 +2714,8 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
if (cm->interp_filter == SWITCHABLE)
*rate2 += rs;
- vpx_memcpy(x->skip_txfm, skip_txfm, sizeof(skip_txfm));
- vpx_memcpy(x->bsse, bsse, sizeof(bsse));
+ memcpy(x->skip_txfm, skip_txfm, sizeof(skip_txfm));
+ memcpy(x->bsse, bsse, sizeof(bsse));
if (!skip_txfm_sb) {
int skippable_y, skippable_uv;
@@ -2718,7 +2781,8 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
TX_SIZE max_uv_tx_size;
x->skip_encode = 0;
ctx->skip = 0;
- xd->mi[0].src_mi->mbmi.ref_frame[0] = INTRA_FRAME;
+ xd->mi[0]->mbmi.ref_frame[0] = INTRA_FRAME;
+ xd->mi[0]->mbmi.ref_frame[1] = NONE;
if (bsize >= BLOCK_8X8) {
if (rd_pick_intra_sby_mode(cpi, x, &rate_y, &rate_y_tokenonly,
@@ -2735,7 +2799,7 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
return;
}
}
- max_uv_tx_size = get_uv_tx_size_impl(xd->mi[0].src_mi->mbmi.tx_size, bsize,
+ max_uv_tx_size = get_uv_tx_size_impl(xd->mi[0]->mbmi.tx_size, bsize,
pd[1].subsampling_x,
pd[1].subsampling_y);
rd_pick_intra_sbuv_mode(cpi, x, ctx, &rate_uv, &rate_uv_tokenonly,
@@ -2761,43 +2825,82 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
}
}
- ctx->mic = *xd->mi[0].src_mi;
+ ctx->mic = *xd->mi[0];
rd_cost->rdcost = RDCOST(x->rdmult, x->rddiv, rd_cost->rate, rd_cost->dist);
}
-static void update_rd_thresh_fact(VP9_COMP *cpi, int bsize,
- int best_mode_index) {
- if (cpi->sf.adaptive_rd_thresh > 0) {
- const int top_mode = bsize < BLOCK_8X8 ? MAX_REFS : MAX_MODES;
- int mode;
- for (mode = 0; mode < top_mode; ++mode) {
- const BLOCK_SIZE min_size = MAX(bsize - 1, BLOCK_4X4);
- const BLOCK_SIZE max_size = MIN(bsize + 2, BLOCK_64X64);
- BLOCK_SIZE bs;
- for (bs = min_size; bs <= max_size; ++bs) {
- int *const fact = &cpi->rd.thresh_freq_fact[bs][mode];
- if (mode == best_mode_index) {
- *fact -= (*fact >> 4);
- } else {
- *fact = MIN(*fact + RD_THRESH_INC,
- cpi->sf.adaptive_rd_thresh * RD_THRESH_MAX_FACT);
- }
- }
- }
+// This function is designed to apply a bias or adjustment to an rd value based
+// on the relative variance of the source and reconstruction.
+#define LOW_VAR_THRESH 16
+#define VLOW_ADJ_MAX 25
+#define VHIGH_ADJ_MAX 8
+static void rd_variance_adjustment(VP9_COMP *cpi,
+ MACROBLOCK *x,
+ BLOCK_SIZE bsize,
+ int64_t *this_rd,
+ MV_REFERENCE_FRAME ref_frame,
+ unsigned int source_variance) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ unsigned int recon_variance;
+ unsigned int absvar_diff = 0;
+ int64_t var_error = 0;
+ int64_t var_factor = 0;
+
+ if (*this_rd == INT64_MAX)
+ return;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ recon_variance =
+ vp9_high_get_sby_perpixel_variance(cpi, &xd->plane[0].dst, bsize, xd->bd);
+ } else {
+ recon_variance =
+ vp9_get_sby_perpixel_variance(cpi, &xd->plane[0].dst, bsize);
+ }
+#else
+ recon_variance =
+ vp9_get_sby_perpixel_variance(cpi, &xd->plane[0].dst, bsize);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+ if ((source_variance + recon_variance) > LOW_VAR_THRESH) {
+ absvar_diff = (source_variance > recon_variance)
+ ? (source_variance - recon_variance)
+ : (recon_variance - source_variance);
+
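+  // var_error is a symmetric measure in [0, 100]: it is 0 when the source
+  // and reconstruction variances are equal and approaches 100 as they
+  // diverge.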
+ var_error = (200 * source_variance * recon_variance) /
+ ((source_variance * source_variance) +
+ (recon_variance * recon_variance));
+ var_error = 100 - var_error;
+ }
+
+ // Source variance above a threshold and ref frame is intra.
+ // This case is targeted mainly at discouraging intra modes that give rise
+ // to a predictor with a low spatial complexity compared to the source.
+ if ((source_variance > LOW_VAR_THRESH) && (ref_frame == INTRA_FRAME) &&
+ (source_variance > recon_variance)) {
+ var_factor = MIN(absvar_diff, MIN(VLOW_ADJ_MAX, var_error));
+ // A second possible case of interest is where the source variance
+ // is very low and we wish to discourage false texture or motion trails.
+ } else if ((source_variance < (LOW_VAR_THRESH >> 1)) &&
+ (recon_variance > source_variance)) {
+ var_factor = MIN(absvar_diff, MIN(VHIGH_ADJ_MAX, var_error));
}
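+  // var_factor is a percentage (at most VLOW_ADJ_MAX); inflate this_rd by
+  // that fraction to bias the search away from the flagged mode.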
+ *this_rd += (*this_rd * var_factor) / 100;
}
-void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
- const TileInfo *const tile,
+void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi,
+ TileDataEnc *tile_data,
+ MACROBLOCK *x,
int mi_row, int mi_col,
RD_COST *rd_cost, BLOCK_SIZE bsize,
PICK_MODE_CONTEXT *ctx,
int64_t best_rd_so_far) {
VP9_COMMON *const cm = &cpi->common;
+ TileInfo *const tile_info = &tile_data->tile_info;
RD_OPT *const rd_opt = &cpi->rd;
SPEED_FEATURES *const sf = &cpi->sf;
MACROBLOCKD *const xd = &x->e_mbd;
- MB_MODE_INFO *const mbmi = &xd->mi[0].src_mi->mbmi;
+ MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
const struct segmentation *const seg = &cm->seg;
PREDICTION_MODE this_mode;
MV_REFERENCE_FRAME ref_frame, second_ref_frame;
@@ -2836,14 +2939,20 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
uint16_t mode_skip_mask[MAX_REF_FRAMES] = { 0 };
int mode_skip_start = sf->mode_skip_start + 1;
const int *const rd_threshes = rd_opt->threshes[segment_id][bsize];
- const int *const rd_thresh_freq_fact = rd_opt->thresh_freq_fact[bsize];
+ const int *const rd_thresh_freq_fact = tile_data->thresh_freq_fact[bsize];
int64_t mode_threshold[MAX_MODES];
- int *mode_map = rd_opt->mode_map[bsize];
+ int *mode_map = tile_data->mode_map[bsize];
const int mode_search_skip_flags = sf->mode_search_skip_flags;
+ int64_t mask_filter = 0;
+ int64_t filter_cache[SWITCHABLE_FILTER_CONTEXTS];
+
vp9_zero(best_mbmode);
x->skip_encode = sf->skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
+ for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
+ filter_cache[i] = INT64_MAX;
+
estimate_ref_frame_costs(cm, xd, segment_id, ref_costs_single, ref_costs_comp,
&comp_mode_p);
@@ -2869,7 +2978,8 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
x->pred_mv_sad[ref_frame] = INT_MAX;
if (cpi->ref_frame_flags & flag_list[ref_frame]) {
- setup_buffer_inter(cpi, x, tile, ref_frame, bsize, mi_row, mi_col,
+ assert(get_ref_frame_buffer(cpi, ref_frame) != NULL);
+ setup_buffer_inter(cpi, x, tile_info, ref_frame, bsize, mi_row, mi_col,
frame_mv[NEARESTMV], frame_mv[NEARMV], yv12_mb);
}
frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
@@ -2948,7 +3058,9 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
mode_skip_mask[INTRA_FRAME] |=
~(sf->intra_y_mode_mask[max_txsize_lookup[bsize]]);
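+  // Modes up to LAST_NEW_MV_INDEX are never skipped by threshold; the
+  // remaining modes scale their base threshold by the per-tile frequency
+  // factor, normalized by the >> 5.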
- for (i = 0; i < MAX_MODES; ++i)
+ for (i = 0; i <= LAST_NEW_MV_INDEX; ++i)
+ mode_threshold[i] = 0;
+ for (i = LAST_NEW_MV_INDEX + 1; i < MAX_MODES; ++i)
mode_threshold[i] = ((int64_t)rd_threshes[i] * rd_thresh_freq_fact[i]) >> 5;
midx = sf->schedule_mode_search ? mode_skip_start : 0;
@@ -3007,8 +3119,8 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
}
}
- if (ref_frame_skip_mask[0] & (1 << ref_frame) &&
- ref_frame_skip_mask[1] & (1 << MAX(0, second_ref_frame)))
+ if ((ref_frame_skip_mask[0] & (1 << ref_frame)) &&
+ (ref_frame_skip_mask[1] & (1 << MAX(0, second_ref_frame))))
continue;
if (mode_skip_mask[ref_frame] & (1 << this_mode))
@@ -3023,9 +3135,9 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
if (sf->motion_field_mode_search) {
const int mi_width = MIN(num_8x8_blocks_wide_lookup[bsize],
- tile->mi_col_end - mi_col);
+ tile_info->mi_col_end - mi_col);
const int mi_height = MIN(num_8x8_blocks_high_lookup[bsize],
- tile->mi_row_end - mi_row);
+ tile_info->mi_row_end - mi_row);
const int bsl = mi_width_log2_lookup[bsize];
int cb_partition_search_ctrl = (((mi_row + mi_col) >> bsl)
+ get_chessboard_index(cm->current_video_frame)) & 0x1;
@@ -3036,24 +3148,24 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
int_mv ref_mv;
ref_mv.as_int = INVALID_MV;
- if ((mi_row - 1) >= tile->mi_row_start) {
- ref_mv = xd->mi[-xd->mi_stride].src_mi->mbmi.mv[0];
- rf = xd->mi[-xd->mi_stride].src_mi->mbmi.ref_frame[0];
+ if ((mi_row - 1) >= tile_info->mi_row_start) {
+ ref_mv = xd->mi[-xd->mi_stride]->mbmi.mv[0];
+ rf = xd->mi[-xd->mi_stride]->mbmi.ref_frame[0];
for (i = 0; i < mi_width; ++i) {
- ref_mbmi = &xd->mi[-xd->mi_stride + i].src_mi->mbmi;
+ ref_mbmi = &xd->mi[-xd->mi_stride + i]->mbmi;
const_motion &= (ref_mv.as_int == ref_mbmi->mv[0].as_int) &&
(ref_frame == ref_mbmi->ref_frame[0]);
skip_ref_frame &= (rf == ref_mbmi->ref_frame[0]);
}
}
- if ((mi_col - 1) >= tile->mi_col_start) {
+ if ((mi_col - 1) >= tile_info->mi_col_start) {
if (ref_mv.as_int == INVALID_MV)
- ref_mv = xd->mi[-1].src_mi->mbmi.mv[0];
+ ref_mv = xd->mi[-1]->mbmi.mv[0];
if (rf == NONE)
- rf = xd->mi[-1].src_mi->mbmi.ref_frame[0];
+ rf = xd->mi[-1]->mbmi.ref_frame[0];
for (i = 0; i < mi_height; ++i) {
- ref_mbmi = &xd->mi[i * xd->mi_stride - 1].src_mi->mbmi;
+ ref_mbmi = &xd->mi[i * xd->mi_stride - 1]->mbmi;
const_motion &= (ref_mv.as_int == ref_mbmi->mv[0].as_int) &&
(ref_frame == ref_mbmi->ref_frame[0]);
skip_ref_frame &= (rf == ref_mbmi->ref_frame[0]);
@@ -3072,7 +3184,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
comp_pred = second_ref_frame > INTRA_FRAME;
if (comp_pred) {
- if (!cm->allow_comp_inter_inter)
+ if (!cpi->allow_comp_inter_inter)
continue;
// Skip compound inter modes if ARF is not available.
@@ -3153,7 +3265,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
if (ref_frame == INTRA_FRAME) {
TX_SIZE uv_tx;
struct macroblockd_plane *const pd = &xd->plane[1];
- vpx_memset(x->skip_txfm, 0, sizeof(x->skip_txfm));
+ memset(x->skip_txfm, 0, sizeof(x->skip_txfm));
super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable,
NULL, bsize, tx_cache, best_rd);
if (rate_y == INT_MAX)
@@ -3162,7 +3274,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
uv_tx = get_uv_tx_size_impl(mbmi->tx_size, bsize, pd->subsampling_x,
pd->subsampling_y);
if (rate_uv_intra[uv_tx] == INT_MAX) {
- choose_intra_uv_mode(cpi, ctx, bsize, uv_tx,
+ choose_intra_uv_mode(cpi, x, ctx, bsize, uv_tx,
&rate_uv_intra[uv_tx], &rate_uv_tokenonly[uv_tx],
&dist_uv[uv_tx], &skip_uv[uv_tx], &mode_uv[uv_tx]);
}
@@ -3184,7 +3296,8 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
&disable_skip, frame_mv,
mi_row, mi_col,
single_newmv, single_inter_filter,
- single_skippable, &total_sse, best_rd);
+ single_skippable, &total_sse, best_rd,
+ &mask_filter, filter_cache);
if (this_rd == INT64_MAX)
continue;
@@ -3231,6 +3344,11 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
}
+ // Apply an adjustment to the rd value based on the similarity of the
+ // source variance and reconstructed variance.
+ rd_variance_adjustment(cpi, x, bsize, &this_rd,
+ ref_frame, x->source_variance);
+
if (ref_frame == INTRA_FRAME) {
// Keep record of best intra rd
if (this_rd < best_intra_rd) {
@@ -3271,8 +3389,8 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
if (!x->select_tx_size)
swap_block_ptr(x, ctx, 1, 0, 0, max_plane);
- vpx_memcpy(ctx->zcoeff_blk, x->zcoeff_blk[mbmi->tx_size],
- sizeof(uint8_t) * ctx->num_4x4_blk);
+ memcpy(ctx->zcoeff_blk, x->zcoeff_blk[mbmi->tx_size],
+ sizeof(uint8_t) * ctx->num_4x4_blk);
// TODO(debargha): enhance this test with a better distortion prediction
// based on qp, activity mask and history
@@ -3325,21 +3443,21 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
/* keep record of best filter type */
if (!mode_excluded && cm->interp_filter != BILINEAR) {
- int64_t ref = rd_opt->filter_cache[cm->interp_filter == SWITCHABLE ?
+ int64_t ref = filter_cache[cm->interp_filter == SWITCHABLE ?
SWITCHABLE_FILTERS : cm->interp_filter];
for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
int64_t adj_rd;
if (ref == INT64_MAX)
adj_rd = 0;
- else if (rd_opt->filter_cache[i] == INT64_MAX)
+ else if (filter_cache[i] == INT64_MAX)
// when early termination is triggered, the encoder does not have
// access to the rate-distortion cost. it only knows that the cost
// should be above the maximum valid value. hence it takes the known
// maximum plus an arbitrary constant as the rate-distortion cost.
- adj_rd = rd_opt->mask_filter - ref + 10;
+ adj_rd = mask_filter - ref + 10;
else
- adj_rd = rd_opt->filter_cache[i] - ref;
+ adj_rd = filter_cache[i] - ref;
adj_rd += this_rd;
best_filter_rd[i] = MIN(best_filter_rd[i], adj_rd);
@@ -3420,7 +3538,8 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
!is_inter_block(&best_mbmode));
if (!cpi->rc.is_src_frame_alt_ref)
- update_rd_thresh_fact(cpi, bsize, best_mode_index);
+ vp9_update_rd_thresh_fact(tile_data->thresh_freq_fact,
+ sf->adaptive_rd_thresh, bsize, best_mode_index);
// macroblock modes
*mbmi = best_mbmode;
@@ -3460,7 +3579,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
if (!x->skip && !x->select_tx_size) {
int has_high_freq_coeff = 0;
int plane;
- int max_plane = is_inter_block(&xd->mi[0].src_mi->mbmi)
+ int max_plane = is_inter_block(&xd->mi[0]->mbmi)
? MAX_MB_PLANE : 1;
for (plane = 0; plane < max_plane; ++plane) {
x->plane[plane].eobs = ctx->eobs_pbuf[plane][1];
@@ -3475,19 +3594,22 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
best_mode_skippable |= !has_high_freq_coeff;
}
+ assert(best_mode_index >= 0);
+
store_coding_context(x, ctx, best_mode_index, best_pred_diff,
best_tx_diff, best_filter_diff, best_mode_skippable);
}
-void vp9_rd_pick_inter_mode_sb_seg_skip(VP9_COMP *cpi, MACROBLOCK *x,
+void vp9_rd_pick_inter_mode_sb_seg_skip(VP9_COMP *cpi,
+ TileDataEnc *tile_data,
+ MACROBLOCK *x,
RD_COST *rd_cost,
BLOCK_SIZE bsize,
PICK_MODE_CONTEXT *ctx,
int64_t best_rd_so_far) {
VP9_COMMON *const cm = &cpi->common;
- RD_OPT *const rd_opt = &cpi->rd;
MACROBLOCKD *const xd = &x->e_mbd;
- MB_MODE_INFO *const mbmi = &xd->mi[0].src_mi->mbmi;
+ MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
unsigned char segment_id = mbmi->segment_id;
const int comp_pred = 0;
int i;
@@ -3522,12 +3644,6 @@ void vp9_rd_pick_inter_mode_sb_seg_skip(VP9_COMP *cpi, MACROBLOCK *x,
mbmi->mv[0].as_int = 0;
x->skip = 1;
- // Search for best switchable filter by checking the variance of
- // pred error irrespective of whether the filter will be used
- rd_opt->mask_filter = 0;
- for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
- rd_opt->filter_cache[i] = INT64_MAX;
-
if (cm->interp_filter != BILINEAR) {
best_filter = EIGHTTAP;
if (cm->interp_filter == SWITCHABLE &&
@@ -3536,7 +3652,7 @@ void vp9_rd_pick_inter_mode_sb_seg_skip(VP9_COMP *cpi, MACROBLOCK *x,
int best_rs = INT_MAX;
for (i = 0; i < SWITCHABLE_FILTERS; ++i) {
mbmi->interp_filter = i;
- rs = vp9_get_switchable_rate(cpi);
+ rs = vp9_get_switchable_rate(cpi, xd);
if (rs < best_rs) {
best_rs = rs;
best_filter = mbmi->interp_filter;
@@ -3547,7 +3663,7 @@ void vp9_rd_pick_inter_mode_sb_seg_skip(VP9_COMP *cpi, MACROBLOCK *x,
// Set the appropriate filter
if (cm->interp_filter == SWITCHABLE) {
mbmi->interp_filter = best_filter;
- rate2 += vp9_get_switchable_rate(cpi);
+ rate2 += vp9_get_switchable_rate(cpi, xd);
} else {
mbmi->interp_filter = cm->interp_filter;
}
@@ -3573,7 +3689,8 @@ void vp9_rd_pick_inter_mode_sb_seg_skip(VP9_COMP *cpi, MACROBLOCK *x,
assert((cm->interp_filter == SWITCHABLE) ||
(cm->interp_filter == mbmi->interp_filter));
- update_rd_thresh_fact(cpi, bsize, THR_ZEROMV);
+ vp9_update_rd_thresh_fact(tile_data->thresh_freq_fact,
+ cpi->sf.adaptive_rd_thresh, bsize, THR_ZEROMV);
vp9_zero(best_pred_diff);
vp9_zero(best_filter_diff);
@@ -3585,18 +3702,20 @@ void vp9_rd_pick_inter_mode_sb_seg_skip(VP9_COMP *cpi, MACROBLOCK *x,
best_pred_diff, best_tx_diff, best_filter_diff, 0);
}
-void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
- const TileInfo *const tile,
+void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi,
+ TileDataEnc *tile_data,
+ MACROBLOCK *x,
int mi_row, int mi_col,
RD_COST *rd_cost,
BLOCK_SIZE bsize,
PICK_MODE_CONTEXT *ctx,
int64_t best_rd_so_far) {
VP9_COMMON *const cm = &cpi->common;
+ TileInfo *const tile_info = &tile_data->tile_info;
RD_OPT *const rd_opt = &cpi->rd;
SPEED_FEATURES *const sf = &cpi->sf;
MACROBLOCKD *const xd = &x->e_mbd;
- MB_MODE_INFO *const mbmi = &xd->mi[0].src_mi->mbmi;
+ MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
const struct segmentation *const seg = &cm->seg;
MV_REFERENCE_FRAME ref_frame, second_ref_frame;
unsigned char segment_id = mbmi->segment_id;
@@ -3627,11 +3746,16 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
b_mode_info best_bmodes[4];
int best_skip2 = 0;
int ref_frame_skip_mask[2] = { 0 };
+ int64_t mask_filter = 0;
+ int64_t filter_cache[SWITCHABLE_FILTER_CONTEXTS];
x->skip_encode = sf->skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
- vpx_memset(x->zcoeff_blk[TX_4X4], 0, 4);
+ memset(x->zcoeff_blk[TX_4X4], 0, 4);
vp9_zero(best_mbmode);
+ for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
+ filter_cache[i] = INT64_MAX;
+
for (i = 0; i < 4; i++) {
int j;
for (j = 0; j < MAX_REF_FRAMES; j++)
@@ -3651,10 +3775,10 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
if (cpi->ref_frame_flags & flag_list[ref_frame]) {
- setup_buffer_inter(cpi, x, tile,
- ref_frame, bsize, mi_row, mi_col,
- frame_mv[NEARESTMV], frame_mv[NEARMV],
- yv12_mb);
+ setup_buffer_inter(cpi, x, tile_info,
+ ref_frame, bsize, mi_row, mi_col,
+ frame_mv[NEARESTMV], frame_mv[NEARMV],
+ yv12_mb);
} else {
ref_frame_skip_mask[0] |= (1 << ref_frame);
ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
@@ -3705,19 +3829,19 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
}
}
- if (ref_frame_skip_mask[0] & (1 << ref_frame) &&
- ref_frame_skip_mask[1] & (1 << MAX(0, second_ref_frame)))
+ if ((ref_frame_skip_mask[0] & (1 << ref_frame)) &&
+ (ref_frame_skip_mask[1] & (1 << MAX(0, second_ref_frame))))
continue;
// Test best rd so far against threshold for trying this mode.
if (rd_less_than_thresh(best_rd,
rd_opt->threshes[segment_id][bsize][ref_index],
- rd_opt->thresh_freq_fact[bsize][ref_index]))
+ tile_data->thresh_freq_fact[bsize][ref_index]))
continue;
comp_pred = second_ref_frame > INTRA_FRAME;
if (comp_pred) {
- if (!cm->allow_comp_inter_inter)
+ if (!cpi->allow_comp_inter_inter)
continue;
if (!(cpi->ref_frame_flags & flag_list[second_ref_frame]))
continue;
@@ -3791,7 +3915,7 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
distortion2 += distortion_y;
if (rate_uv_intra == INT_MAX) {
- choose_intra_uv_mode(cpi, ctx, bsize, TX_4X4,
+ choose_intra_uv_mode(cpi, x, ctx, bsize, TX_4X4,
&rate_uv_intra,
&rate_uv_tokenonly,
&dist_uv, &skip_uv,
@@ -3824,9 +3948,8 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
rd_opt->threshes[segment_id][bsize][THR_ALTR];
this_rd_thresh = (ref_frame == GOLDEN_FRAME) ?
rd_opt->threshes[segment_id][bsize][THR_GOLD] : this_rd_thresh;
- rd_opt->mask_filter = 0;
for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
- rd_opt->filter_cache[i] = INT64_MAX;
+ filter_cache[i] = INT64_MAX;
if (cm->interp_filter != BILINEAR) {
tmp_best_filter = EIGHTTAP;
@@ -3845,7 +3968,7 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
int newbest, rs;
int64_t rs_rd;
mbmi->interp_filter = switchable_filter_index;
- tmp_rd = rd_pick_best_sub8x8_mode(cpi, x, tile,
+ tmp_rd = rd_pick_best_sub8x8_mode(cpi, x, tile_info,
&mbmi->ref_mvs[ref_frame][0],
second_ref, best_yrd, &rate,
&rate_y, &distortion,
@@ -3856,16 +3979,16 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
if (tmp_rd == INT64_MAX)
continue;
- rs = vp9_get_switchable_rate(cpi);
+ rs = vp9_get_switchable_rate(cpi, xd);
rs_rd = RDCOST(x->rdmult, x->rddiv, rs, 0);
- rd_opt->filter_cache[switchable_filter_index] = tmp_rd;
- rd_opt->filter_cache[SWITCHABLE_FILTERS] =
- MIN(rd_opt->filter_cache[SWITCHABLE_FILTERS],
+ filter_cache[switchable_filter_index] = tmp_rd;
+ filter_cache[SWITCHABLE_FILTERS] =
+ MIN(filter_cache[SWITCHABLE_FILTERS],
tmp_rd + rs_rd);
if (cm->interp_filter == SWITCHABLE)
tmp_rd += rs_rd;
- rd_opt->mask_filter = MAX(rd_opt->mask_filter, tmp_rd);
+ mask_filter = MAX(mask_filter, tmp_rd);
newbest = (tmp_rd < tmp_best_rd);
if (newbest) {
@@ -3883,7 +4006,7 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
tmp_best_skippable = skippable;
tmp_best_mbmode = *mbmi;
for (i = 0; i < 4; i++) {
- tmp_best_bmodes[i] = xd->mi[0].src_mi->bmi[i];
+ tmp_best_bmodes[i] = xd->mi[0]->bmi[i];
x->zcoeff_blk[TX_4X4][i] = !x->plane[0].eobs[i];
}
pred_exists = 1;
@@ -3911,7 +4034,7 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
if (!pred_exists) {
// Handles the special case when a filter that is not in the
// switchable list (bilinear, 6-tap) is indicated at the frame level
- tmp_rd = rd_pick_best_sub8x8_mode(cpi, x, tile,
+ tmp_rd = rd_pick_best_sub8x8_mode(cpi, x, tile_info,
&mbmi->ref_mvs[ref_frame][0],
second_ref, best_yrd, &rate, &rate_y,
&distortion, &skippable, &total_sse,
@@ -3927,14 +4050,14 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
skippable = tmp_best_skippable;
*mbmi = tmp_best_mbmode;
for (i = 0; i < 4; i++)
- xd->mi[0].src_mi->bmi[i] = tmp_best_bmodes[i];
+ xd->mi[0]->bmi[i] = tmp_best_bmodes[i];
}
rate2 += rate;
distortion2 += distortion;
if (cm->interp_filter == SWITCHABLE)
- rate2 += vp9_get_switchable_rate(cpi);
+ rate2 += vp9_get_switchable_rate(cpi, xd);
if (!mode_excluded)
mode_excluded = comp_pred ? cm->reference_mode == SINGLE_REFERENCE
@@ -3951,7 +4074,7 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
// then don't bother looking at UV
vp9_build_inter_predictors_sbuv(&x->e_mbd, mi_row, mi_col,
BLOCK_8X8);
- vpx_memset(x->skip_txfm, 0, sizeof(x->skip_txfm));
+ memset(x->skip_txfm, 0, sizeof(x->skip_txfm));
if (!super_block_uvrd(cpi, x, &rate_uv, &distortion_uv, &uv_skippable,
&uv_sse, BLOCK_8X8, tmp_best_rdu))
continue;
@@ -4032,11 +4155,11 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
best_skip2 = this_skip2;
if (!x->select_tx_size)
swap_block_ptr(x, ctx, 1, 0, 0, max_plane);
- vpx_memcpy(ctx->zcoeff_blk, x->zcoeff_blk[TX_4X4],
- sizeof(uint8_t) * ctx->num_4x4_blk);
+ memcpy(ctx->zcoeff_blk, x->zcoeff_blk[TX_4X4],
+ sizeof(uint8_t) * ctx->num_4x4_blk);
for (i = 0; i < 4; i++)
- best_bmodes[i] = xd->mi[0].src_mi->bmi[i];
+ best_bmodes[i] = xd->mi[0]->bmi[i];
// TODO(debargha): enhance this test with a better distortion prediction
// based on qp, activity mask and history
@@ -4089,20 +4212,20 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
/* keep record of best filter type */
if (!mode_excluded && !disable_skip && ref_frame != INTRA_FRAME &&
cm->interp_filter != BILINEAR) {
- int64_t ref = rd_opt->filter_cache[cm->interp_filter == SWITCHABLE ?
+ int64_t ref = filter_cache[cm->interp_filter == SWITCHABLE ?
SWITCHABLE_FILTERS : cm->interp_filter];
int64_t adj_rd;
for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
if (ref == INT64_MAX)
adj_rd = 0;
- else if (rd_opt->filter_cache[i] == INT64_MAX)
+ else if (filter_cache[i] == INT64_MAX)
// when early termination is triggered, the encoder does not have
// access to the rate-distortion cost. it only knows that the cost
// should be above the maximum valid value. hence it takes the known
// maximum plus an arbitrary constant as the rate-distortion cost.
- adj_rd = rd_opt->mask_filter - ref + 10;
+ adj_rd = mask_filter - ref + 10;
else
- adj_rd = rd_opt->filter_cache[i] - ref;
+ adj_rd = filter_cache[i] - ref;
adj_rd += this_rd;
best_filter_rd[i] = MIN(best_filter_rd[i], adj_rd);
@@ -4146,21 +4269,21 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
(cm->interp_filter == best_mbmode.interp_filter) ||
!is_inter_block(&best_mbmode));
- update_rd_thresh_fact(cpi, bsize, best_ref_index);
+ vp9_update_rd_thresh_fact(tile_data->thresh_freq_fact,
+ sf->adaptive_rd_thresh, bsize, best_ref_index);
// macroblock modes
*mbmi = best_mbmode;
x->skip |= best_skip2;
if (!is_inter_block(&best_mbmode)) {
for (i = 0; i < 4; i++)
- xd->mi[0].src_mi->bmi[i].as_mode = best_bmodes[i].as_mode;
+ xd->mi[0]->bmi[i].as_mode = best_bmodes[i].as_mode;
} else {
for (i = 0; i < 4; ++i)
- vpx_memcpy(&xd->mi[0].src_mi->bmi[i], &best_bmodes[i],
- sizeof(b_mode_info));
+ memcpy(&xd->mi[0]->bmi[i], &best_bmodes[i], sizeof(b_mode_info));
- mbmi->mv[0].as_int = xd->mi[0].src_mi->bmi[3].as_mv[0].as_int;
- mbmi->mv[1].as_int = xd->mi[0].src_mi->bmi[3].as_mv[1].as_int;
+ mbmi->mv[0].as_int = xd->mi[0]->bmi[3].as_mv[0].as_int;
+ mbmi->mv[1].as_int = xd->mi[0]->bmi[3].as_mv[1].as_int;
}
for (i = 0; i < REFERENCE_MODES; ++i) {
@@ -4186,4 +4309,3 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
store_coding_context(x, ctx, best_ref_index,
best_pred_diff, best_tx_diff, best_filter_diff, 0);
}
-
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_rdopt.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_rdopt.h
index ed38ce81a2e..459b0324bcf 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_rdopt.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_rdopt.h
@@ -29,14 +29,25 @@ void vp9_rd_pick_intra_mode_sb(struct VP9_COMP *cpi, struct macroblock *x,
struct RD_COST *rd_cost, BLOCK_SIZE bsize,
PICK_MODE_CONTEXT *ctx, int64_t best_rd);
-void vp9_rd_pick_inter_mode_sb(struct VP9_COMP *cpi, struct macroblock *x,
- const struct TileInfo *const tile,
+unsigned int vp9_get_sby_perpixel_variance(VP9_COMP *cpi,
+ const struct buf_2d *ref,
+ BLOCK_SIZE bs);
+#if CONFIG_VP9_HIGHBITDEPTH
+unsigned int vp9_high_get_sby_perpixel_variance(VP9_COMP *cpi,
+ const struct buf_2d *ref,
+ BLOCK_SIZE bs, int bd);
+#endif
+
+void vp9_rd_pick_inter_mode_sb(struct VP9_COMP *cpi,
+ struct TileDataEnc *tile_data,
+ struct macroblock *x,
int mi_row, int mi_col,
struct RD_COST *rd_cost,
BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
int64_t best_rd_so_far);
void vp9_rd_pick_inter_mode_sb_seg_skip(struct VP9_COMP *cpi,
+ struct TileDataEnc *tile_data,
struct macroblock *x,
struct RD_COST *rd_cost,
BLOCK_SIZE bsize,
@@ -44,8 +55,8 @@ void vp9_rd_pick_inter_mode_sb_seg_skip(struct VP9_COMP *cpi,
int64_t best_rd_so_far);
void vp9_rd_pick_inter_mode_sub8x8(struct VP9_COMP *cpi,
+ struct TileDataEnc *tile_data,
struct macroblock *x,
- const struct TileInfo *const tile,
int mi_row, int mi_col,
struct RD_COST *rd_cost,
BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_resize.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_resize.c
index 3d361d4f267..2ebdff291d6 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_resize.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_resize.c
@@ -516,6 +516,10 @@ void vp9_resize_plane(const uint8_t *const input,
uint8_t *tmpbuf = (uint8_t *)malloc(sizeof(uint8_t) *
(width < height ? height : width));
uint8_t *arrbuf = (uint8_t *)malloc(sizeof(uint8_t) * (height + height2));
+ assert(width > 0);
+ assert(height > 0);
+ assert(width2 > 0);
+ assert(height2 > 0);
for (i = 0; i < height; ++i)
resize_multistep(input + in_stride * i, width,
intbuf + width2 * i, width2, tmpbuf);
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_segmentation.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_segmentation.c
index f1d51770ab3..9b15072e98e 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_segmentation.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_segmentation.c
@@ -36,11 +36,7 @@ void vp9_set_segment_data(struct segmentation *seg,
unsigned char abs_delta) {
seg->abs_delta = abs_delta;
- vpx_memcpy(seg->feature_data, feature_data, sizeof(seg->feature_data));
-
- // TBD ?? Set the feature mask
- // vpx_memcpy(cpi->mb.e_mbd.segment_feature_mask, 0,
- // sizeof(cpi->mb.e_mbd.segment_feature_mask));
+ memcpy(seg->feature_data, feature_data, sizeof(seg->feature_data));
}
void vp9_disable_segfeature(struct segmentation *seg, int segment_id,
SEG_LVL_FEATURES feature_id) {
@@ -111,7 +107,7 @@ static int cost_segmap(int *segcounts, vp9_prob *probs) {
}
static void count_segs(const VP9_COMMON *cm, MACROBLOCKD *xd,
- const TileInfo *tile, MODE_INFO *mi,
+ const TileInfo *tile, MODE_INFO **mi,
int *no_pred_segcounts,
int (*temporal_predictor_count)[2],
int *t_unpred_seg_counts,
@@ -122,7 +118,7 @@ static void count_segs(const VP9_COMMON *cm, MACROBLOCKD *xd,
return;
xd->mi = mi;
- segment_id = xd->mi[0].src_mi->mbmi.segment_id;
+ segment_id = xd->mi[0]->mbmi.segment_id;
set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, cm->mi_rows, cm->mi_cols);
@@ -131,7 +127,7 @@ static void count_segs(const VP9_COMMON *cm, MACROBLOCKD *xd,
// Temporal prediction not allowed on key frames
if (cm->frame_type != KEY_FRAME) {
- const BLOCK_SIZE bsize = xd->mi[0].src_mi->mbmi.sb_type;
+ const BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
// Test to see if the segment id matches the predicted value.
const int pred_segment_id = vp9_get_segment_id(cm, cm->last_frame_seg_map,
bsize, mi_row, mi_col);
@@ -140,7 +136,7 @@ static void count_segs(const VP9_COMMON *cm, MACROBLOCKD *xd,
// Store the prediction status for this mb and update counts
// as appropriate
- xd->mi[0].src_mi->mbmi.seg_id_predicted = pred_flag;
+ xd->mi[0]->mbmi.seg_id_predicted = pred_flag;
temporal_predictor_count[pred_context][pred_flag]++;
// Update the "unpredicted" segment count
@@ -150,7 +146,7 @@ static void count_segs(const VP9_COMMON *cm, MACROBLOCKD *xd,
}
static void count_segs_sb(const VP9_COMMON *cm, MACROBLOCKD *xd,
- const TileInfo *tile, MODE_INFO *mi,
+ const TileInfo *tile, MODE_INFO **mi,
int *no_pred_segcounts,
int (*temporal_predictor_count)[2],
int *t_unpred_seg_counts,
@@ -163,8 +159,8 @@ static void count_segs_sb(const VP9_COMMON *cm, MACROBLOCKD *xd,
if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
return;
- bw = num_8x8_blocks_wide_lookup[mi[0].src_mi->mbmi.sb_type];
- bh = num_8x8_blocks_high_lookup[mi[0].src_mi->mbmi.sb_type];
+ bw = num_8x8_blocks_wide_lookup[mi[0]->mbmi.sb_type];
+ bh = num_8x8_blocks_high_lookup[mi[0]->mbmi.sb_type];
if (bw == bs && bh == bs) {
count_segs(cm, xd, tile, mi, no_pred_segcounts, temporal_predictor_count,
@@ -217,20 +213,20 @@ void vp9_choose_segmap_coding_method(VP9_COMMON *cm, MACROBLOCKD *xd) {
// Set default state for the segment tree probabilities and the
// temporal coding probabilities
- vpx_memset(seg->tree_probs, 255, sizeof(seg->tree_probs));
- vpx_memset(seg->pred_probs, 255, sizeof(seg->pred_probs));
+ memset(seg->tree_probs, 255, sizeof(seg->tree_probs));
+ memset(seg->pred_probs, 255, sizeof(seg->pred_probs));
// First of all generate stats regarding how well the last segment map
// predicts this one
for (tile_col = 0; tile_col < 1 << cm->log2_tile_cols; tile_col++) {
TileInfo tile;
- MODE_INFO *mi_ptr;
+ MODE_INFO **mi_ptr;
vp9_tile_init(&tile, cm, 0, tile_col);
- mi_ptr = cm->mi + tile.mi_col_start;
+ mi_ptr = cm->mi_grid_visible + tile.mi_col_start;
for (mi_row = 0; mi_row < cm->mi_rows;
mi_row += 8, mi_ptr += 8 * cm->mi_stride) {
- MODE_INFO *mi = mi_ptr;
+ MODE_INFO **mi = mi_ptr;
for (mi_col = tile.mi_col_start; mi_col < tile.mi_col_end;
mi_col += 8, mi += 8)
count_segs_sb(cm, xd, &tile, mi, no_pred_segcounts,
@@ -267,11 +263,11 @@ void vp9_choose_segmap_coding_method(VP9_COMMON *cm, MACROBLOCKD *xd) {
// Now choose which coding method to use.
if (t_pred_cost < no_pred_cost) {
seg->temporal_update = 1;
- vpx_memcpy(seg->tree_probs, t_pred_tree, sizeof(t_pred_tree));
- vpx_memcpy(seg->pred_probs, t_nopred_prob, sizeof(t_nopred_prob));
+ memcpy(seg->tree_probs, t_pred_tree, sizeof(t_pred_tree));
+ memcpy(seg->pred_probs, t_nopred_prob, sizeof(t_nopred_prob));
} else {
seg->temporal_update = 0;
- vpx_memcpy(seg->tree_probs, no_pred_tree, sizeof(no_pred_tree));
+ memcpy(seg->tree_probs, no_pred_tree, sizeof(no_pred_tree));
}
}
@@ -280,6 +276,6 @@ void vp9_reset_segment_features(struct segmentation *seg) {
seg->enabled = 0;
seg->update_map = 0;
seg->update_data = 0;
- vpx_memset(seg->tree_probs, 255, sizeof(seg->tree_probs));
+ memset(seg->tree_probs, 255, sizeof(seg->tree_probs));
vp9_clearall_segfeatures(seg);
}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_skin_detection.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_skin_detection.c
new file mode 100644
index 00000000000..1cb0662834e
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_skin_detection.c
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <limits.h>
+#include <math.h>
+
+#include "vp9/common/vp9_blockd.h"
+#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_skin_detection.h"
+
+// Fixed-point skin color model parameters.
+static const int skin_mean[2] = {7463, 9614}; // q6
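+// (About 117 for Cb and 150 for Cr in 8-bit pixel units.)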
+static const int skin_inv_cov[4] = {4107, 1663, 1663, 2157}; // q16
+static const int skin_threshold = 1570636; // q18
+
+// Thresholds on luminance.
+static const int y_low = 20;
+static const int y_high = 220;
+
+// Evaluates the Mahalanobis distance measure for the input CbCr values.
+static int evaluate_skin_color_difference(int cb, int cr) {
+ const int cb_q6 = cb << 6;
+ const int cr_q6 = cr << 6;
+ const int cb_diff_q12 = (cb_q6 - skin_mean[0]) * (cb_q6 - skin_mean[0]);
+ const int cbcr_diff_q12 = (cb_q6 - skin_mean[0]) * (cr_q6 - skin_mean[1]);
+ const int cr_diff_q12 = (cr_q6 - skin_mean[1]) * (cr_q6 - skin_mean[1]);
+ const int cb_diff_q2 = (cb_diff_q12 + (1 << 9)) >> 10;
+ const int cbcr_diff_q2 = (cbcr_diff_q12 + (1 << 9)) >> 10;
+ const int cr_diff_q2 = (cr_diff_q12 + (1 << 9)) >> 10;
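+  // Multiplying the q16 inverse covariance by the q2 differences yields a
+  // q18 value, matching the q18 skin_threshold it is compared against.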
+ const int skin_diff = skin_inv_cov[0] * cb_diff_q2 +
+ skin_inv_cov[1] * cbcr_diff_q2 +
+ skin_inv_cov[2] * cbcr_diff_q2 +
+ skin_inv_cov[3] * cr_diff_q2;
+ return skin_diff;
+}
+
+int vp9_skin_pixel(const uint8_t y, const uint8_t cb, const uint8_t cr) {
+ if (y < y_low || y > y_high)
+ return 0;
+ else
+ return (evaluate_skin_color_difference(cb, cr) < skin_threshold);
+}
+
+#ifdef OUTPUT_YUV_SKINMAP
+// For viewing skin map on input source.
+void vp9_compute_skin_map(VP9_COMP *const cpi, FILE *yuv_skinmap_file) {
+ int i, j, mi_row, mi_col;
+ VP9_COMMON *const cm = &cpi->common;
+ uint8_t *y;
+ const uint8_t *src_y = cpi->Source->y_buffer;
+ const uint8_t *src_u = cpi->Source->u_buffer;
+ const uint8_t *src_v = cpi->Source->v_buffer;
+ const int src_ystride = cpi->Source->y_stride;
+ const int src_uvstride = cpi->Source->uv_stride;
+ YV12_BUFFER_CONFIG skinmap;
+ memset(&skinmap, 0, sizeof(YV12_BUFFER_CONFIG));
+ if (vp9_alloc_frame_buffer(&skinmap, cm->width, cm->height,
+ cm->subsampling_x, cm->subsampling_y,
+ VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment)) {
+ vp9_free_frame_buffer(&skinmap);
+ return;
+ }
+ memset(skinmap.buffer_alloc, 128, skinmap.frame_size);
+ y = skinmap.y_buffer;
+ // Loop through 8x8 blocks and set skin map based on center pixel of block.
+ // Set y to white for skin blocks; otherwise copy the source luma (the map
+ // stays gray-scale).
+ // Ignore rightmost/bottom boundary blocks.
+ for (mi_row = 0; mi_row < cm->mi_rows - 1; ++mi_row) {
+ for (mi_col = 0; mi_col < cm->mi_cols - 1; ++mi_col) {
+ // Use middle pixel for each 8x8 block for skin detection.
+ // If middle pixel is skin, assign whole 8x8 block to skin.
+ const uint8_t ysource = src_y[4 * src_ystride + 4];
+ const uint8_t usource = src_u[2 * src_uvstride + 2];
+ const uint8_t vsource = src_v[2 * src_uvstride + 2];
+ const int is_skin = vp9_skin_pixel(ysource, usource, vsource);
+ for (i = 0; i < 8; i++) {
+ for (j = 0; j < 8; j++) {
+ if (is_skin)
+ y[i * src_ystride + j] = 255;
+ else
+ y[i * src_ystride + j] = src_y[i * src_ystride + j];
+ }
+ }
+ y += 8;
+ src_y += 8;
+ src_u += 4;
+ src_v += 4;
+ }
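+    // Advance to the start of the next 8-pixel block row: move down 8 rows
+    // and rewind the columns walked by the inner loop.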
+ y += (src_ystride << 3) - ((cm->mi_cols - 1) << 3);
+ src_y += (src_ystride << 3) - ((cm->mi_cols - 1) << 3);
+ src_u += (src_uvstride << 2) - ((cm->mi_cols - 1) << 2);
+ src_v += (src_uvstride << 2) - ((cm->mi_cols - 1) << 2);
+ }
+ vp9_write_yuv_frame_420(&skinmap, yuv_skinmap_file);
+ vp9_free_frame_buffer(&skinmap);
+}
+#endif
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_skin_detection.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_skin_detection.h
new file mode 100644
index 00000000000..3d4e7375f76
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_skin_detection.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_ENCODER_VP9_SKIN_MAP_H_
+#define VP9_ENCODER_VP9_SKIN_MAP_H_
+
+#include "vp9/common/vp9_blockd.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct VP9_COMP;
+
+// #define OUTPUT_YUV_SKINMAP
+
+int vp9_skin_pixel(const uint8_t y, const uint8_t cb, const uint8_t cr);
+
+#ifdef OUTPUT_YUV_SKINMAP
+// For viewing skin map on input source.
+void vp9_compute_skin_map(VP9_COMP *const cpi, FILE *yuv_skinmap_file);
+#endif
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VP9_ENCODER_VP9_SKIN_MAP_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_speed_features.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_speed_features.c
index 9e3ee2c94bd..4f93578326b 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_speed_features.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_speed_features.c
@@ -16,12 +16,84 @@
// Intra only frames, golden frames (except alt ref overlays) and
// alt ref frames tend to be coded at a higher than ambient quality
static int frame_is_boosted(const VP9_COMP *cpi) {
- return frame_is_intra_only(&cpi->common) ||
- cpi->refresh_alt_ref_frame ||
- (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref) ||
- vp9_is_upper_layer_key_frame(cpi);
+ return frame_is_kf_gf_arf(cpi) || vp9_is_upper_layer_key_frame(cpi);
}
+static void set_good_speed_feature_framesize_dependent(VP9_COMMON *cm,
+ SPEED_FEATURES *sf,
+ int speed) {
+ if (speed >= 1) {
+ if (MIN(cm->width, cm->height) >= 720) {
+ sf->disable_split_mask = cm->show_frame ? DISABLE_ALL_SPLIT
+ : DISABLE_ALL_INTER_SPLIT;
+ sf->partition_search_breakout_dist_thr = (1 << 23);
+ } else {
+ sf->disable_split_mask = DISABLE_COMPOUND_SPLIT;
+ sf->partition_search_breakout_dist_thr = (1 << 21);
+ }
+ }
+
+ if (speed >= 2) {
+ if (MIN(cm->width, cm->height) >= 720) {
+ sf->disable_split_mask = cm->show_frame ? DISABLE_ALL_SPLIT
+ : DISABLE_ALL_INTER_SPLIT;
+ sf->adaptive_pred_interp_filter = 0;
+ sf->partition_search_breakout_dist_thr = (1 << 24);
+ sf->partition_search_breakout_rate_thr = 120;
+ } else {
+ sf->disable_split_mask = LAST_AND_INTRA_SPLIT_ONLY;
+ sf->partition_search_breakout_dist_thr = (1 << 22);
+ sf->partition_search_breakout_rate_thr = 100;
+ }
+ }
+
+ if (speed >= 3) {
+ if (MIN(cm->width, cm->height) >= 720) {
+ sf->disable_split_mask = DISABLE_ALL_SPLIT;
+ sf->schedule_mode_search = cm->base_qindex < 220 ? 1 : 0;
+ sf->partition_search_breakout_dist_thr = (1 << 25);
+ sf->partition_search_breakout_rate_thr = 200;
+ } else {
+ sf->max_intra_bsize = BLOCK_32X32;
+ sf->disable_split_mask = DISABLE_ALL_INTER_SPLIT;
+ sf->schedule_mode_search = cm->base_qindex < 175 ? 1 : 0;
+ sf->partition_search_breakout_dist_thr = (1 << 23);
+ sf->partition_search_breakout_rate_thr = 120;
+ }
+ }
+
+ if (speed >= 4) {
+ if (MIN(cm->width, cm->height) >= 720) {
+ sf->partition_search_breakout_dist_thr = (1 << 26);
+ } else {
+ sf->partition_search_breakout_dist_thr = (1 << 24);
+ }
+ sf->disable_split_mask = DISABLE_ALL_SPLIT;
+ }
+}
+
+// Sets a partition size down to which the auto partition code will always
+// search (it can go lower), based on the image dimensions. The logic here
+// is that the extent to which ringing artefacts are offensive depends
+// partly on the screen area over which they propagate. Propagation is
+// limited by transform block size, but the screen area taken up by a given
+// block size will be larger for a small image format stretched to full screen.
+static BLOCK_SIZE set_partition_min_limit(VP9_COMP *cpi) {
+ VP9_COMMON *const cm = &cpi->common;
+ unsigned int screen_area = (cm->width * cm->height);
+
+ // Select block size based on image format size.
+ if (screen_area < 1280 * 720) {
+ // Formats smaller in area than 720P
+ return BLOCK_4X4;
+ } else if (screen_area < 1920 * 1080) {
+ // Format >= 720P and < 1080P
+ return BLOCK_8X8;
+ } else {
+ // Formats 1080P and up
+ return BLOCK_16X16;
+ }
+}
static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm,
SPEED_FEATURES *sf, int speed) {
@@ -34,11 +106,6 @@ static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm,
sf->use_square_partition_only = !frame_is_intra_only(cm);
sf->less_rectangular_check = 1;
- if (MIN(cm->width, cm->height) >= 720)
- sf->disable_split_mask = cm->show_frame ? DISABLE_ALL_SPLIT
- : DISABLE_ALL_INTER_SPLIT;
- else
- sf->disable_split_mask = DISABLE_COMPOUND_SPLIT;
sf->use_rd_breakout = 1;
sf->adaptive_motion_search = 1;
sf->mv.auto_mv_step_size = 1;
@@ -54,11 +121,6 @@ static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm,
sf->intra_uv_mode_mask[TX_16X16] = INTRA_DC_H_V;
sf->tx_size_search_breakout = 1;
-
- if (MIN(cm->width, cm->height) >= 720)
- sf->partition_search_breakout_dist_thr = (1 << 23);
- else
- sf->partition_search_breakout_dist_thr = (1 << 21);
sf->partition_search_breakout_rate_thr = 80;
}
@@ -66,45 +128,24 @@ static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm,
sf->tx_size_search_method = frame_is_boosted(cpi) ? USE_FULL_RD
: USE_LARGESTALL;
- if (MIN(cm->width, cm->height) >= 720) {
- sf->disable_split_mask = cm->show_frame ? DISABLE_ALL_SPLIT
- : DISABLE_ALL_INTER_SPLIT;
- sf->adaptive_pred_interp_filter = 0;
- sf->partition_search_breakout_dist_thr = (1 << 24);
- sf->partition_search_breakout_rate_thr = 120;
- } else {
- sf->disable_split_mask = LAST_AND_INTRA_SPLIT_ONLY;
- sf->partition_search_breakout_dist_thr = (1 << 22);
- sf->partition_search_breakout_rate_thr = 100;
- }
+ // Reference masking is not supported in dynamic scaling mode.
+ sf->reference_masking = cpi->oxcf.resize_mode != RESIZE_DYNAMIC ? 1 : 0;
- sf->reference_masking = 1;
- sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH |
+ sf->mode_search_skip_flags = (cm->frame_type == KEY_FRAME) ? 0 :
+ FLAG_SKIP_INTRA_DIRMISMATCH |
FLAG_SKIP_INTRA_BESTINTER |
FLAG_SKIP_COMP_BESTINTRA |
FLAG_SKIP_INTRA_LOWVAR;
sf->disable_filter_search_var_thresh = 100;
sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
- sf->auto_min_max_partition_size = CONSTRAIN_NEIGHBORING_MIN_MAX;
-
+ sf->auto_min_max_partition_size = RELAXED_NEIGHBORING_MIN_MAX;
+ sf->rd_auto_partition_min_limit = set_partition_min_limit(cpi);
sf->allow_partition_search_skip = 1;
}
if (speed >= 3) {
sf->tx_size_search_method = frame_is_intra_only(cm) ? USE_FULL_RD
: USE_LARGESTALL;
- if (MIN(cm->width, cm->height) >= 720) {
- sf->disable_split_mask = DISABLE_ALL_SPLIT;
- sf->schedule_mode_search = cm->base_qindex < 220 ? 1 : 0;
- sf->partition_search_breakout_dist_thr = (1 << 25);
- sf->partition_search_breakout_rate_thr = 200;
- } else {
- sf->max_intra_bsize = BLOCK_32X32;
- sf->disable_split_mask = DISABLE_ALL_INTER_SPLIT;
- sf->schedule_mode_search = cm->base_qindex < 175 ? 1 : 0;
- sf->partition_search_breakout_dist_thr = (1 << 23);
- sf->partition_search_breakout_rate_thr = 120;
- }
sf->mv.subpel_search_method = SUBPEL_TREE_PRUNED;
sf->adaptive_pred_interp_filter = 0;
sf->adaptive_mode_search = 1;
@@ -122,28 +163,21 @@ static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm,
if (speed >= 4) {
sf->use_square_partition_only = 1;
sf->tx_size_search_method = USE_LARGESTALL;
- sf->disable_split_mask = DISABLE_ALL_SPLIT;
sf->mv.search_method = BIGDIA;
sf->mv.subpel_search_method = SUBPEL_TREE_PRUNED_MORE;
sf->adaptive_rd_thresh = 4;
- sf->mode_search_skip_flags |= FLAG_EARLY_TERMINATE;
+ if (cm->frame_type != KEY_FRAME)
+ sf->mode_search_skip_flags |= FLAG_EARLY_TERMINATE;
sf->disable_filter_search_var_thresh = 200;
sf->use_lp32x32fdct = 1;
sf->use_fast_coef_updates = ONE_LOOP_REDUCED;
sf->use_fast_coef_costing = 1;
sf->motion_field_mode_search = !boosted;
-
- if (MIN(cm->width, cm->height) >= 720)
- sf->partition_search_breakout_dist_thr = (1 << 26);
- else
- sf->partition_search_breakout_dist_thr = (1 << 24);
sf->partition_search_breakout_rate_thr = 300;
}
if (speed >= 5) {
int i;
-
- sf->partition_search_type = FIXED_PARTITION;
sf->optimize_coefficients = 0;
sf->mv.search_method = HEX;
sf->disable_filter_search_var_thresh = 500;
@@ -151,12 +185,47 @@ static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm,
sf->intra_y_mode_mask[i] = INTRA_DC;
sf->intra_uv_mode_mask[i] = INTRA_DC;
}
- }
- if (speed >= 6) {
+ sf->partition_search_breakout_rate_thr = 500;
sf->mv.reduce_first_step_size = 1;
}
}
+static void set_rt_speed_feature_framesize_dependent(VP9_COMP *cpi,
+ SPEED_FEATURES *sf, int speed) {
+ VP9_COMMON *const cm = &cpi->common;
+
+ if (speed >= 1) {
+ if (MIN(cm->width, cm->height) >= 720) {
+ sf->disable_split_mask = cm->show_frame ? DISABLE_ALL_SPLIT
+ : DISABLE_ALL_INTER_SPLIT;
+ } else {
+ sf->disable_split_mask = DISABLE_COMPOUND_SPLIT;
+ }
+ }
+
+ if (speed >= 2) {
+ if (MIN(cm->width, cm->height) >= 720) {
+ sf->disable_split_mask = cm->show_frame ? DISABLE_ALL_SPLIT
+ : DISABLE_ALL_INTER_SPLIT;
+ } else {
+ sf->disable_split_mask = LAST_AND_INTRA_SPLIT_ONLY;
+ }
+ }
+
+ if (speed >= 5) {
+ if (MIN(cm->width, cm->height) >= 720) {
+ sf->partition_search_breakout_dist_thr = (1 << 25);
+ } else {
+ sf->partition_search_breakout_dist_thr = (1 << 23);
+ }
+ }
+
+ if (speed >= 7) {
+ sf->encode_breakout_thresh = (MIN(cm->width, cm->height) >= 720) ?
+ 800 : 300;
+ }
+}
+
static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf,
int speed, vp9e_tune_content content) {
VP9_COMMON *const cm = &cpi->common;
@@ -172,12 +241,6 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf,
sf->tx_size_search_method = frame_is_intra_only(cm) ? USE_FULL_RD
: USE_LARGESTALL;
- if (MIN(cm->width, cm->height) >= 720)
- sf->disable_split_mask = cm->show_frame ? DISABLE_ALL_SPLIT
- : DISABLE_ALL_INTER_SPLIT;
- else
- sf->disable_split_mask = DISABLE_COMPOUND_SPLIT;
-
sf->use_rd_breakout = 1;
sf->adaptive_motion_search = 1;
@@ -190,22 +253,19 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf,
}
if (speed >= 2) {
- if (MIN(cm->width, cm->height) >= 720)
- sf->disable_split_mask = cm->show_frame ? DISABLE_ALL_SPLIT
- : DISABLE_ALL_INTER_SPLIT;
- else
- sf->disable_split_mask = LAST_AND_INTRA_SPLIT_ONLY;
-
- sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH |
+ sf->mode_search_skip_flags = (cm->frame_type == KEY_FRAME) ? 0 :
+ FLAG_SKIP_INTRA_DIRMISMATCH |
FLAG_SKIP_INTRA_BESTINTER |
FLAG_SKIP_COMP_BESTINTRA |
FLAG_SKIP_INTRA_LOWVAR;
sf->adaptive_pred_interp_filter = 2;
- sf->reference_masking = 1;
+
+ // Reference masking is not supported in dynamic scaling mode.
+ sf->reference_masking = cpi->oxcf.resize_mode != RESIZE_DYNAMIC ? 1 : 0;
+
sf->disable_filter_search_var_thresh = 50;
sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
sf->auto_min_max_partition_size = RELAXED_NEIGHBORING_MIN_MAX;
- sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_LOW_MOTION;
sf->lf_motion_threshold = LOW_MOTION_THRESHOLD;
sf->adjust_partitioning_from_last_frame = 1;
sf->last_partitioning_redo_frequency = 3;
@@ -217,12 +277,9 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf,
if (speed >= 3) {
sf->use_square_partition_only = 1;
sf->disable_filter_search_var_thresh = 100;
- sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_ALL;
- sf->constrain_copy_partition = 1;
sf->use_uv_intra_rd_estimate = 1;
sf->skip_encode_sb = 1;
sf->mv.subpel_iters_per_step = 1;
- sf->use_fast_coef_updates = ONE_LOOP_REDUCED;
sf->adaptive_rd_thresh = 4;
sf->mode_skip_start = 6;
sf->allow_skip_recode = 0;
@@ -261,10 +318,8 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf,
sf->use_quant_fp = !is_keyframe;
sf->auto_min_max_partition_size = is_keyframe ? RELAXED_NEIGHBORING_MIN_MAX
: STRICT_NEIGHBORING_MIN_MAX;
- sf->max_partition_size = BLOCK_32X32;
- sf->min_partition_size = BLOCK_8X8;
- sf->partition_check =
- (frames_since_key % sf->last_partitioning_redo_frequency == 1);
+ sf->default_max_partition_size = BLOCK_32X32;
+ sf->default_min_partition_size = BLOCK_8X8;
sf->force_frame_boost = is_keyframe ||
(frames_since_key % (sf->last_partitioning_redo_frequency << 1) == 1);
sf->max_delta_qindex = is_keyframe ? 20 : 15;
@@ -275,63 +330,87 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf,
sf->inter_mode_mask[BLOCK_32X64] = INTER_NEAREST_NEW_ZERO;
sf->inter_mode_mask[BLOCK_64X32] = INTER_NEAREST_NEW_ZERO;
sf->inter_mode_mask[BLOCK_64X64] = INTER_NEAREST_NEW_ZERO;
-
- if (MIN(cm->width, cm->height) >= 720)
- sf->partition_search_breakout_dist_thr = (1 << 25);
- else
- sf->partition_search_breakout_dist_thr = (1 << 23);
+ sf->adaptive_rd_thresh = 2;
+ // This feature is only enabled when partition search is disabled.
+ sf->reuse_inter_pred_sby = 1;
sf->partition_search_breakout_rate_thr = 200;
- }
+ sf->coeff_prob_appx_step = 4;
+ sf->use_fast_coef_updates = is_keyframe ? TWO_LOOP : ONE_LOOP_REDUCED;
+ sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH;
- if (speed >= 6) {
- if (content == VP9E_CONTENT_SCREEN) {
+ if (!is_keyframe) {
int i;
- // Allow fancy modes at all sizes since SOURCE_VAR_BASED_PARTITION is used
- for (i = 0; i < BLOCK_SIZES; ++i)
- sf->inter_mode_mask[i] = INTER_NEAREST_NEAR_NEW;
+ if (content == VP9E_CONTENT_SCREEN) {
+ for (i = 0; i < BLOCK_SIZES; ++i)
+ sf->intra_y_mode_bsize_mask[i] = INTRA_DC_TM_H_V;
+ } else {
+ for (i = 0; i < BLOCK_SIZES; ++i)
+ if (i >= BLOCK_16X16)
+ sf->intra_y_mode_bsize_mask[i] = INTRA_DC;
+ else
+          // Use H and V intra mode for block sizes < 16X16.
+ sf->intra_y_mode_bsize_mask[i] = INTRA_DC_H_V;
+ }
}
+ }
+ if (speed >= 6) {
// Adaptively switch between SOURCE_VAR_BASED_PARTITION and FIXED_PARTITION.
sf->partition_search_type = VAR_BASED_PARTITION;
- sf->search_type_check_frequency = 50;
+    // Turn this on to use non-RD key frame coding mode.
+ sf->use_nonrd_pick_mode = 1;
sf->mv.search_method = NSTEP;
-
sf->tx_size_search_method = is_keyframe ? USE_LARGESTALL : USE_TX_8X8;
-
- // This feature is only enabled when partition search is disabled.
- sf->reuse_inter_pred_sby = 1;
-
- // Increase mode checking threshold for NEWMV.
- sf->elevate_newmv_thresh = 1000;
-
sf->mv.reduce_first_step_size = 1;
+ sf->skip_encode_sb = 0;
}
if (speed >= 7) {
+ sf->adaptive_rd_thresh = 3;
sf->mv.search_method = FAST_DIAMOND;
sf->mv.fullpel_search_step_param = 10;
+ }
+ if (speed >= 8) {
+ sf->adaptive_rd_thresh = 4;
+ sf->mv.subpel_force_stop = 2;
sf->lpf_pick = LPF_PICK_MINIMAL_LPF;
- sf->encode_breakout_thresh = (MIN(cm->width, cm->height) >= 720) ?
- 800 : 300;
- sf->elevate_newmv_thresh = 2500;
}
+}
- if (speed >= 12) {
- sf->elevate_newmv_thresh = 4000;
- sf->mv.subpel_force_stop = 2;
+void vp9_set_speed_features_framesize_dependent(VP9_COMP *cpi) {
+ SPEED_FEATURES *const sf = &cpi->sf;
+ VP9_COMMON *const cm = &cpi->common;
+ const VP9EncoderConfig *const oxcf = &cpi->oxcf;
+ RD_OPT *const rd = &cpi->rd;
+ int i;
+
+ if (oxcf->mode == REALTIME) {
+ set_rt_speed_feature_framesize_dependent(cpi, sf, oxcf->speed);
+ } else if (oxcf->mode == GOOD) {
+ set_good_speed_feature_framesize_dependent(cm, sf, oxcf->speed);
}
- if (speed >= 13) {
- int i;
- sf->max_intra_bsize = BLOCK_32X32;
- for (i = 0; i < BLOCK_SIZES; ++i)
- sf->inter_mode_mask[i] = INTER_NEAREST;
+ if (sf->disable_split_mask == DISABLE_ALL_SPLIT) {
+ sf->adaptive_pred_interp_filter = 0;
+ }
+
+ if (cpi->encode_breakout && oxcf->mode == REALTIME &&
+ sf->encode_breakout_thresh > cpi->encode_breakout) {
+ cpi->encode_breakout = sf->encode_breakout_thresh;
+ }
+
+ // Check for masked out split cases.
+ for (i = 0; i < MAX_REFS; ++i) {
+ if (sf->disable_split_mask & (1 << i)) {
+ rd->thresh_mult_sub8x8[i] = INT_MAX;
+ }
}
}
-void vp9_set_speed_features(VP9_COMP *cpi) {
+void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi) {
SPEED_FEATURES *const sf = &cpi->sf;
VP9_COMMON *const cm = &cpi->common;
+ MACROBLOCK *const x = &cpi->td.mb;
const VP9EncoderConfig *const oxcf = &cpi->oxcf;
int i;
@@ -344,11 +423,11 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
sf->mv.subpel_force_stop = 0;
sf->optimize_coefficients = !is_lossless_requested(&cpi->oxcf);
sf->mv.reduce_first_step_size = 0;
+ sf->coeff_prob_appx_step = 1;
sf->mv.auto_mv_step_size = 0;
sf->mv.fullpel_search_step_param = 6;
sf->comp_inter_joint_search_thresh = BLOCK_4X4;
sf->adaptive_rd_thresh = 0;
- sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_OFF;
sf->tx_size_search_method = USE_FULL_RD;
sf->use_lp32x32fdct = 0;
sf->adaptive_motion_search = 0;
@@ -364,11 +443,11 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
sf->less_rectangular_check = 0;
sf->use_square_partition_only = 0;
sf->auto_min_max_partition_size = NOT_IN_USE;
- sf->max_partition_size = BLOCK_64X64;
- sf->min_partition_size = BLOCK_4X4;
+ sf->rd_auto_partition_min_limit = BLOCK_4X4;
+ sf->default_max_partition_size = BLOCK_64X64;
+ sf->default_min_partition_size = BLOCK_4X4;
sf->adjust_partitioning_from_last_frame = 0;
sf->last_partitioning_redo_frequency = 4;
- sf->constrain_copy_partition = 0;
sf->disable_split_mask = 0;
sf->mode_search_skip_flags = 0;
sf->force_frame_boost = 0;
@@ -400,8 +479,7 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
sf->always_this_block_size = BLOCK_16X16;
sf->search_type_check_frequency = 50;
sf->encode_breakout_thresh = 0;
- sf->elevate_newmv_thresh = 0;
- // Recode loop tolerence %.
+ // Recode loop tolerance %.
sf->recode_tolerance = 25;
sf->default_interp_filter = SWITCHABLE;
sf->tx_size_search_breakout = 0;
@@ -416,8 +494,6 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
cpi->full_search_sad = vp9_full_search_sad;
cpi->diamond_search_sad = oxcf->mode == BEST ? vp9_full_range_search
: vp9_diamond_search_sad;
- cpi->refining_search_sad = vp9_refining_search_sad;
-
// Slow quant, dct and trellis not worthwhile for first pass
// so make sure they are always turned off.
@@ -440,16 +516,12 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
cpi->find_fractional_mv_step = vp9_find_best_sub_pixel_tree_pruned_evenmore;
}
- cpi->mb.optimize = sf->optimize_coefficients == 1 && oxcf->pass != 1;
+ x->optimize = sf->optimize_coefficients == 1 && oxcf->pass != 1;
- if (sf->disable_split_mask == DISABLE_ALL_SPLIT)
- sf->adaptive_pred_interp_filter = 0;
+ x->min_partition_size = sf->default_min_partition_size;
+ x->max_partition_size = sf->default_max_partition_size;
if (!cpi->oxcf.frame_periodic_boost) {
sf->max_delta_qindex = 0;
}
-
- if (cpi->encode_breakout && oxcf->mode == REALTIME &&
- sf->encode_breakout_thresh > cpi->encode_breakout)
- cpi->encode_breakout = sf->encode_breakout_thresh;
}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_speed_features.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_speed_features.h
index 951b4af2276..8575638d9a7 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_speed_features.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_speed_features.h
@@ -93,12 +93,6 @@ typedef enum {
} MOTION_THRESHOLD;
typedef enum {
- LAST_FRAME_PARTITION_OFF = 0,
- LAST_FRAME_PARTITION_LOW_MOTION = 1,
- LAST_FRAME_PARTITION_ALL = 2
-} LAST_FRAME_PARTITION_METHOD;
-
-typedef enum {
USE_FULL_RD = 0,
USE_LARGESTALL,
USE_TX_8X8
@@ -107,8 +101,7 @@ typedef enum {
typedef enum {
NOT_IN_USE = 0,
RELAXED_NEIGHBORING_MIN_MAX = 1,
- CONSTRAIN_NEIGHBORING_MIN_MAX = 2,
- STRICT_NEIGHBORING_MIN_MAX = 3
+ STRICT_NEIGHBORING_MIN_MAX = 2
} AUTO_MIN_MAX_MODE;
typedef enum {
@@ -169,12 +162,9 @@ typedef enum {
// before the final run.
TWO_LOOP = 0,
- // No dry run conducted.
- ONE_LOOP = 1,
-
// No dry run, also only half the coef contexts and bands are updated.
// The rest are not updated at all.
- ONE_LOOP_REDUCED = 2
+ ONE_LOOP_REDUCED = 1
} FAST_COEFF_UPDATE;
typedef struct MV_SPEED_FEATURES {
@@ -242,14 +232,8 @@ typedef struct SPEED_FEATURES {
// level within a frame.
int allow_skip_recode;
- // This variable allows us to reuse the last frames partition choices
- // (64x64 v 32x32 etc) for this frame. It can be set to only use the last
- // frame as a starting point in low motion scenes or always use it. If set
- // we use last partitioning_redo frequency to determine how often to redo
- // the partitioning from scratch. Adjust_partitioning_from_last_frame
- // enables us to adjust up or down one partitioning from the last frames
- // partitioning.
- LAST_FRAME_PARTITION_METHOD use_lastframe_partitioning;
+ // Coefficient probability model approximation step size
+ int coeff_prob_appx_step;
  // The threshold used to determine how slow the motion is; it is used when
// use_lastframe_partitioning is set to LAST_FRAME_PARTITION_LOW_MOTION
@@ -264,8 +248,6 @@ typedef struct SPEED_FEATURES {
// precise but significantly faster than the non lp version.
int use_lp32x32fdct;
- // TODO(JBB): remove this as its no longer used.
-
// After looking at the first set of modes (set by index here), skip
// checking modes for reference frames that don't match the reference frame
// of the best so far.
@@ -289,11 +271,14 @@ typedef struct SPEED_FEATURES {
// Sets min and max partition sizes for this 64x64 region based on the
// same 64x64 in last encoded frame, and the left and above neighbor.
AUTO_MIN_MAX_MODE auto_min_max_partition_size;
+ // Ensures the rd based auto partition search will always
+ // go down at least to the specified level.
+ BLOCK_SIZE rd_auto_partition_min_limit;
// Min and max partition size we enable (block_size) as per auto
// min max, but also used by adjust partitioning, and pick_partitioning.
- BLOCK_SIZE min_partition_size;
- BLOCK_SIZE max_partition_size;
+ BLOCK_SIZE default_min_partition_size;
+ BLOCK_SIZE default_max_partition_size;
// Whether or not we allow partitions one smaller or one greater than the last
// frame's partitioning. Only used if use_lastframe_partitioning is set.
@@ -303,12 +288,6 @@ typedef struct SPEED_FEATURES {
// use_lastframe_partitioning is set.
int last_partitioning_redo_frequency;
- // This enables constrained copy partitioning, which, given an input block
- // size bsize, will copy previous partition for partitions less than bsize,
- // otherwise bsize partition is used. bsize is currently set to 16x16.
- // Used for the case where motion is detected in superblock.
- int constrain_copy_partition;
-
// Disables sub 8x8 blocksizes in different scenarios: Choices are to disable
// it always, to allow it for only Last frame and Intra, disable it for all
// inter modes or to enable it always.
@@ -342,10 +321,6 @@ typedef struct SPEED_FEATURES {
// Fast quantization process path
int use_quant_fp;
- // Search through variable block partition types in non-RD mode decision
- // encoding process for RTC.
- int partition_check;
-
// Use finer quantizer in every other few frames that run variable block
// partition type search.
int force_frame_boost;
@@ -367,6 +342,10 @@ typedef struct SPEED_FEATURES {
int intra_y_mode_mask[TX_SIZES];
int intra_uv_mode_mask[TX_SIZES];
+ // These bit masks allow you to enable or disable intra modes for each
+ // prediction block size separately.
+ int intra_y_mode_bsize_mask[BLOCK_SIZES];
+
// This variable enables an early break out of mode testing if the model for
// rd built from the prediction signal indicates a value that's much
// higher than the best rd we've seen so far.
@@ -417,9 +396,6 @@ typedef struct SPEED_FEATURES {
// enabled in real time mode.
int encode_breakout_thresh;
- // In real time encoding, increase the threshold for NEWMV.
- int elevate_newmv_thresh;
-
// default interp filter choice
INTERP_FILTER default_interp_filter;
@@ -443,11 +419,11 @@ typedef struct SPEED_FEATURES {
struct VP9_COMP;
-void vp9_set_speed_features(struct VP9_COMP *cpi);
+void vp9_set_speed_features_framesize_independent(struct VP9_COMP *cpi);
+void vp9_set_speed_features_framesize_dependent(struct VP9_COMP *cpi);
#ifdef __cplusplus
} // extern "C"
#endif
#endif // VP9_ENCODER_VP9_SPEED_FEATURES_H_
-
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ssim.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ssim.c
index 5dbfbf53bbc..88db5dda06d 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ssim.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ssim.c
@@ -8,8 +8,8 @@
* be found in the AUTHORS file in the root of the source tree.
*/
+#include <math.h>
#include "./vp9_rtcd.h"
-
#include "vp9/encoder/vp9_ssim.h"
void vp9_ssim_parms_16x16_c(uint8_t *s, int sp, uint8_t *r,
@@ -201,6 +201,251 @@ double vp9_calc_ssimg(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest,
return ssim_all;
}
+// traditional ssim as per: http://en.wikipedia.org/wiki/Structural_similarity
+//
+// Reworking the math:
+//
+// ssim(x,y) = (2*mean(x)*mean(y) + c1)*(2*cov(x,y)+c2) /
+// ((mean(x)^2+mean(y)^2+c1)*(var(x)+var(y)+c2))
+//
+// mean(x) = sum(x) / n
+//
+// cov(x,y) = (n*sum(xi*yi)-sum(x)*sum(y))/(n*n)
+//
+// var(x) = (n*sum(xi*xi)-sum(xi)*sum(xi))/(n*n)
+//
+// ssim(x,y) =
+// (2*sum(x)*sum(y)/(n*n) + c1)*(2*(n*sum(xi*yi)-sum(x)*sum(y))/(n*n)+c2) /
+// (((sum(x)*sum(x)+sum(y)*sum(y))/(n*n) +c1) *
+// ((n*sum(xi*xi) - sum(xi)*sum(xi))/(n*n)+
+// (n*sum(yi*yi) - sum(yi)*sum(yi))/(n*n)+c2)))
+//
+// factoring out n*n
+//
+// ssim(x,y) =
+// (2*sum(x)*sum(y) + n*n*c1)*(2*(n*sum(xi*yi)-sum(x)*sum(y))+n*n*c2) /
+// (((sum(x)*sum(x)+sum(y)*sum(y)) + n*n*c1) *
+// (n*sum(xi*xi)-sum(xi)*sum(xi)+n*sum(yi*yi)-sum(yi)*sum(yi)+n*n*c2))
+//
+// Replacing c1 with n*n*c1 (and likewise c2) is the final step that leads
+// to this code. The constants are kept scaled up by 12 bits so we don't
+// lose precision in them.
+
+double ssimv_similarity(Ssimv *sv, int64_t n) {
+ // Scale the constants by number of pixels.
+ const int64_t c1 = (cc1 * n * n) >> 12;
+ const int64_t c2 = (cc2 * n * n) >> 12;
+
+ const double l = 1.0 * (2 * sv->sum_s * sv->sum_r + c1) /
+ (sv->sum_s * sv->sum_s + sv->sum_r * sv->sum_r + c1);
+
+ // Since these variables are unsigned sums, convert to double so
+ // math is done in double arithmetic.
+ const double v = (2.0 * n * sv->sum_sxr - 2 * sv->sum_s * sv->sum_r + c2)
+ / (n * sv->sum_sq_s - sv->sum_s * sv->sum_s + n * sv->sum_sq_r
+ - sv->sum_r * sv->sum_r + c2);
+
+ return l * v;
+}
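+
+// Illustrative cross-check (guarded out; not part of the change): the
+// factored form above recomputed directly in doubles. This assumes cc1/cc2
+// are the q12-scaled SSIM constants defined earlier in this file, so
+// dividing by 4096.0 recovers n*n*c1 and n*n*c2.
+#if 0
+static double ssimv_similarity_ref(const Ssimv *sv, int64_t n) {
+  const double c1 = cc1 * n * n / 4096.0;  // n*n*c1
+  const double c2 = cc2 * n * n / 4096.0;  // n*n*c2
+  const double ss = (double)sv->sum_s, sr = (double)sv->sum_r;
+  // Same factoring as the derivation above; agrees with ssimv_similarity()
+  // up to the integer rounding of c1 and c2.
+  return ((2 * ss * sr + c1) *
+          (2 * (n * sv->sum_sxr - ss * sr) + c2)) /
+         ((ss * ss + sr * sr + c1) *
+          (n * sv->sum_sq_s - ss * ss + n * sv->sum_sq_r - sr * sr + c2));
+}
+#endif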
+
+// The first term of the ssim metric is a luminance factor:
+//
+//   (2*mean(x)*mean(y) + c1) / (mean(x)^2 + mean(y)^2 + c1)
+//
+// This luminance factor is highly sensitive on the dark side of the
+// luminance range and almost completely insensitive on the white side.
+// Compare the two pairs (1,3) and (250,252): the term gives
+// 2*1*3/(1+9) = .60 for the first but 2*250*252/(250^2+252^2) => .99999997
+// for the second.
+//
+// As a result, this tweaked version of the calculation instead takes the
+// luminance as a percentage off from the peak possible value:
+//
+//   255 * 255 - ((sum_s - sum_r) / count) * ((sum_s - sum_r) / count)
+//
+double ssimv_similarity2(Ssimv *sv, int64_t n) {
+ // Scale the constants by number of pixels.
+ const int64_t c1 = (cc1 * n * n) >> 12;
+ const int64_t c2 = (cc2 * n * n) >> 12;
+
+ const double mean_diff = (1.0 * sv->sum_s - sv->sum_r) / n;
+ const double l = (255 * 255 - mean_diff * mean_diff + c1) / (255 * 255 + c1);
+
+  // Since these variables are unsigned sums, convert to double so
+ // math is done in double arithmetic.
+ const double v = (2.0 * n * sv->sum_sxr - 2 * sv->sum_s * sv->sum_r + c2)
+ / (n * sv->sum_sq_s - sv->sum_s * sv->sum_s +
+ n * sv->sum_sq_r - sv->sum_r * sv->sum_r + c2);
+
+ return l * v;
+}
+void ssimv_parms(uint8_t *img1, int img1_pitch, uint8_t *img2, int img2_pitch,
+ Ssimv *sv) {
+ vp9_ssim_parms_8x8(img1, img1_pitch, img2, img2_pitch,
+ &sv->sum_s, &sv->sum_r, &sv->sum_sq_s, &sv->sum_sq_r,
+ &sv->sum_sxr);
+}
+
+double vp9_get_ssim_metrics(uint8_t *img1, int img1_pitch,
+ uint8_t *img2, int img2_pitch,
+ int width, int height,
+ Ssimv *sv2, Metrics *m,
+ int do_inconsistency) {
+ double dssim_total = 0;
+ double ssim_total = 0;
+ double ssim2_total = 0;
+ double inconsistency_total = 0;
+ int i, j;
+ int c = 0;
+ double norm;
+ double old_ssim_total = 0;
+ vp9_clear_system_state();
+  // We can sample points as frequently as we like; start with 1 per 4x4.
+ for (i = 0; i < height; i += 4,
+ img1 += img1_pitch * 4, img2 += img2_pitch * 4) {
+ for (j = 0; j < width; j += 4, ++c) {
+ Ssimv sv = {0};
+ double ssim;
+ double ssim2;
+ double dssim;
+ uint32_t var_new;
+ uint32_t var_old;
+ uint32_t mean_new;
+ uint32_t mean_old;
+ double ssim_new;
+ double ssim_old;
+
+      // Not sure there's a great way to handle the edge pixels
+      // in ssim when using a window. It seems biased against edge pixels
+      // however you handle them. This uses only samples that are
+      // fully in the frame.
+ if (j + 8 <= width && i + 8 <= height) {
+ ssimv_parms(img1 + j, img1_pitch, img2 + j, img2_pitch, &sv);
+ }
+
+ ssim = ssimv_similarity(&sv, 64);
+ ssim2 = ssimv_similarity2(&sv, 64);
+
+ sv.ssim = ssim2;
+
+      // dssim is calculated for use as an actual error metric and
+      // is scaled up to the same range as sum square error.
+      // Since we are subsampling every 16th point, maybe this should be
+      // *16 ?
+ dssim = 255 * 255 * (1 - ssim2) / 2;
+
+ // Here I introduce a new error metric: consistency-weighted
+ // SSIM-inconsistency. This metric isolates frames where the
+ // SSIM 'suddenly' changes, e.g. if one frame in every 8 is much
+ // sharper or blurrier than the others. Higher values indicate a
+ // temporally inconsistent SSIM. There are two ideas at work:
+ //
+ // 1) 'SSIM-inconsistency': the total inconsistency value
+ // reflects how much SSIM values are changing between this
+ // source / reference frame pair and the previous pair.
+ //
+ // 2) 'consistency-weighted': weights de-emphasize areas in the
+ // frame where the scene content has changed. Changes in scene
+ // content are detected via changes in local variance and local
+ // mean.
+ //
+ // Thus the overall measure reflects how inconsistent the SSIM
+ // values are, over consistent regions of the frame.
+ //
+ // The metric has three terms:
+ //
+ // term 1 -> uses change in scene Variance to weight error score
+ // 2 * var(Fi)*var(Fi-1) / (var(Fi)^2+var(Fi-1)^2)
+ // larger changes from one frame to the next mean we care
+ // less about consistency.
+ //
+ // term 2 -> uses change in local scene luminance to weight error
+ // 2 * avg(Fi)*avg(Fi-1) / (avg(Fi)^2+avg(Fi-1)^2)
+ // larger changes from one frame to the next mean we care
+ // less about consistency.
+ //
+      // term 3 -> measures inconsistency in ssim scores between frames
+      //           1 - (2 * ssim(Fi)*ssim(Fi-1) / (ssim(Fi)^2 + ssim(Fi-1)^2)).
+ //
+ // This term compares the ssim score for the same location in 2
+ // subsequent frames.
+ var_new = sv.sum_sq_s - sv.sum_s * sv.sum_s / 64;
+ var_old = sv2[c].sum_sq_s - sv2[c].sum_s * sv2[c].sum_s / 64;
+ mean_new = sv.sum_s;
+ mean_old = sv2[c].sum_s;
+ ssim_new = sv.ssim;
+ ssim_old = sv2[c].ssim;
+
+ if (do_inconsistency) {
+ // We do the metric once for every 4x4 block in the image. Since
+ // we are scaling the error to SSE for use in a psnr calculation
+ // 1.0 = 4x4x255x255 the worst error we can possibly have.
+ static const double kScaling = 4. * 4 * 255 * 255;
+
+        // The constants have to be non-zero to avoid potential divide-by-0
+        // issues; other than that they act as a kind of weighting between
+        // the terms. No testing of what the right values should be has
+        // been done.
+ static const double c1 = 1, c2 = 1, c3 = 1;
+
+ // This measures how much consistent variance is in two consecutive
+ // source frames. 1.0 means they have exactly the same variance.
+ const double variance_term = (2.0 * var_old * var_new + c1) /
+ (1.0 * var_old * var_old + 1.0 * var_new * var_new + c1);
+
+        // This measures how consistent the local means are between two
+        // consecutive frames. 1.0 means they have exactly the same mean.
+ const double mean_term = (2.0 * mean_old * mean_new + c2) /
+ (1.0 * mean_old * mean_old + 1.0 * mean_new * mean_new + c2);
+
+        // This measures how consistent the ssims of two
+        // consecutive frames are. 1.0 means they are exactly the same.
+ double ssim_term = pow((2.0 * ssim_old * ssim_new + c3) /
+ (ssim_old * ssim_old + ssim_new * ssim_new + c3),
+ 5);
+
+ double this_inconsistency;
+
+ // Floating point math sometimes makes this > 1 by a tiny bit.
+ // We want the metric to scale between 0 and 1.0 so we can convert
+ // it to an snr scaled value.
+ if (ssim_term > 1)
+ ssim_term = 1;
+
+        // This converts the consistency metric to an inconsistency metric
+        // (so we can scale it like psnr to something like sum square error).
+        // The reason for the variance and mean terms is the assumption that
+        // if there are big changes in the source we should penalize
+        // inconsistency in ssim scores less, as it will be less visible
+        // to the user.
+ this_inconsistency = (1 - ssim_term) * variance_term * mean_term;
+
+ this_inconsistency *= kScaling;
+ inconsistency_total += this_inconsistency;
+ }
+ sv2[c] = sv;
+ ssim_total += ssim;
+ ssim2_total += ssim2;
+ dssim_total += dssim;
+
+ old_ssim_total += ssim_old;
+ }
+ old_ssim_total += 0;
+ }
+
+ norm = 1. / (width / 4) / (height / 4);
+ ssim_total *= norm;
+ ssim2_total *= norm;
+ m->ssim2 = ssim2_total;
+ m->ssim = ssim_total;
+ if (old_ssim_total == 0)
+ inconsistency_total = 0;
+
+ m->ssimc = inconsistency_total;
+
+ m->dssim = dssim_total;
+ return inconsistency_total;
+}
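+
+// Illustrative usage sketch (guarded out; not part of the change). The sv2
+// array carries the previous frame's per-4x4 stats for the inconsistency
+// term, so it must persist across calls and hold one entry per 4x4 sample
+// point. The dimensions below are hypothetical.
+#if 0
+static void ssim_metrics_demo(uint8_t *src, uint8_t *rec, int stride) {
+  enum { kWidth = 640, kHeight = 480 };
+  static Ssimv sv2[(kWidth / 4) * (kHeight / 4)];  // zero-initialized
+  Metrics m;
+  // Returns the accumulated inconsistency score; per-frame ssim, ssim2 and
+  // dssim come back in m.
+  vp9_get_ssim_metrics(src, stride, rec, stride, kWidth, kHeight,
+                       sv2, &m, 1 /* do_inconsistency */);
+}
+#endif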
+
+
#if CONFIG_VP9_HIGHBITDEPTH
double vp9_highbd_calc_ssim(YV12_BUFFER_CONFIG *source,
YV12_BUFFER_CONFIG *dest,
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ssim.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ssim.h
index e75623b2545..10f14c4d268 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ssim.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ssim.h
@@ -17,12 +17,64 @@ extern "C" {
#include "vpx_scale/yv12config.h"
+// metrics used for calculating ssim, ssim2, dssim, and ssimc
+typedef struct {
+  // source sum (over 8x8 region)
+  uint64_t sum_s;
+
+  // reference sum (over 8x8 region)
+  uint64_t sum_r;
+
+  // source sum squared (over 8x8 region)
+  uint64_t sum_sq_s;
+
+  // reference sum squared (over 8x8 region)
+  uint64_t sum_sq_r;
+
+  // sum of source times reference (over 8x8 region)
+ uint64_t sum_sxr;
+
+ // calculated ssim score between source and reference
+ double ssim;
+} Ssimv;
+
+// metrics collected on a frame basis
+typedef struct {
+  // ssim consistency error metric (see code for explanation)
+ double ssimc;
+
+ // standard ssim
+ double ssim;
+
+  // revised ssim (see code for explanation)
+ double ssim2;
+
+ // ssim restated as an error metric like sse
+ double dssim;
+
+ // dssim converted to decibels
+ double dssimd;
+
+ // ssimc converted to decibels
+ double ssimcd;
+} Metrics;
+
+double vp9_get_ssim_metrics(uint8_t *img1, int img1_pitch, uint8_t *img2,
+ int img2_pitch, int width, int height, Ssimv *sv2,
+ Metrics *m, int do_inconsistency);
+
double vp9_calc_ssim(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest,
double *weight);
double vp9_calc_ssimg(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest,
double *ssim_y, double *ssim_u, double *ssim_v);
+double vp9_calc_fastssim(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest,
+ double *ssim_y, double *ssim_u, double *ssim_v);
+
+double vp9_psnrhvs(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest,
+ double *ssim_y, double *ssim_u, double *ssim_v);
+
#if CONFIG_VP9_HIGHBITDEPTH
double vp9_highbd_calc_ssim(YV12_BUFFER_CONFIG *source,
YV12_BUFFER_CONFIG *dest,
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_subexp.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_subexp.c
index 530b5923bed..cfdc90d15fb 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_subexp.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_subexp.c
@@ -140,12 +140,13 @@ int vp9_prob_diff_update_savings_search(const unsigned int *ct,
int vp9_prob_diff_update_savings_search_model(const unsigned int *ct,
const vp9_prob *oldp,
vp9_prob *bestp,
- vp9_prob upd) {
+ vp9_prob upd,
+ int stepsize) {
int i, old_b, new_b, update_b, savings, bestsavings, step;
int newp;
vp9_prob bestnewp, newplist[ENTROPY_NODES], oldplist[ENTROPY_NODES];
vp9_model_to_full_probs(oldp, oldplist);
- vpx_memcpy(newplist, oldp, sizeof(vp9_prob) * UNCONSTRAINED_NODES);
+ memcpy(newplist, oldp, sizeof(vp9_prob) * UNCONSTRAINED_NODES);
for (i = UNCONSTRAINED_NODES, old_b = 0; i < ENTROPY_NODES; ++i)
old_b += cost_branch256(ct + 2 * i, oldplist[i]);
old_b += cost_branch256(ct + 2 * PIVOT_NODE, oldplist[PIVOT_NODE]);
@@ -153,24 +154,44 @@ int vp9_prob_diff_update_savings_search_model(const unsigned int *ct,
bestsavings = 0;
bestnewp = oldp[PIVOT_NODE];
- step = (*bestp > oldp[PIVOT_NODE] ? -1 : 1);
-
- for (newp = *bestp; newp != oldp[PIVOT_NODE]; newp += step) {
- if (newp < 1 || newp > 255)
- continue;
- newplist[PIVOT_NODE] = newp;
- vp9_model_to_full_probs(newplist, newplist);
- for (i = UNCONSTRAINED_NODES, new_b = 0; i < ENTROPY_NODES; ++i)
- new_b += cost_branch256(ct + 2 * i, newplist[i]);
- new_b += cost_branch256(ct + 2 * PIVOT_NODE, newplist[PIVOT_NODE]);
- update_b = prob_diff_update_cost(newp, oldp[PIVOT_NODE]) +
- vp9_cost_upd256;
- savings = old_b - new_b - update_b;
- if (savings > bestsavings) {
- bestsavings = savings;
- bestnewp = newp;
+ if (*bestp > oldp[PIVOT_NODE]) {
+ step = -stepsize;
+ for (newp = *bestp; newp > oldp[PIVOT_NODE]; newp += step) {
+ if (newp < 1 || newp > 255)
+ continue;
+ newplist[PIVOT_NODE] = newp;
+ vp9_model_to_full_probs(newplist, newplist);
+ for (i = UNCONSTRAINED_NODES, new_b = 0; i < ENTROPY_NODES; ++i)
+ new_b += cost_branch256(ct + 2 * i, newplist[i]);
+ new_b += cost_branch256(ct + 2 * PIVOT_NODE, newplist[PIVOT_NODE]);
+ update_b = prob_diff_update_cost(newp, oldp[PIVOT_NODE]) +
+ vp9_cost_upd256;
+ savings = old_b - new_b - update_b;
+ if (savings > bestsavings) {
+ bestsavings = savings;
+ bestnewp = newp;
+ }
+ }
+ } else {
+ step = stepsize;
+ for (newp = *bestp; newp < oldp[PIVOT_NODE]; newp += step) {
+ if (newp < 1 || newp > 255)
+ continue;
+ newplist[PIVOT_NODE] = newp;
+ vp9_model_to_full_probs(newplist, newplist);
+ for (i = UNCONSTRAINED_NODES, new_b = 0; i < ENTROPY_NODES; ++i)
+ new_b += cost_branch256(ct + 2 * i, newplist[i]);
+ new_b += cost_branch256(ct + 2 * PIVOT_NODE, newplist[PIVOT_NODE]);
+ update_b = prob_diff_update_cost(newp, oldp[PIVOT_NODE]) +
+ vp9_cost_upd256;
+ savings = old_b - new_b - update_b;
+ if (savings > bestsavings) {
+ bestsavings = savings;
+ bestnewp = newp;
+ }
}
}
+
*bestp = bestnewp;
return bestsavings;
}
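+
+// Illustrative note (not part of the change): with stepsize > 1 the search
+// visits only every stepsize-th probability between *bestp and the old
+// pivot, trading a slightly coarser optimum for fewer cost evaluations.
+// For example, *bestp = 40, oldp[PIVOT_NODE] = 128 and stepsize = 4 walks
+// newp = 40, 44, ..., 124 (22 evaluations) instead of all 88 values.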
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_subexp.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_subexp.h
index 8e02a1d0d5e..ac54893cf45 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_subexp.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_subexp.h
@@ -30,7 +30,8 @@ int vp9_prob_diff_update_savings_search(const unsigned int *ct,
int vp9_prob_diff_update_savings_search_model(const unsigned int *ct,
const vp9_prob *oldp,
vp9_prob *bestp,
- vp9_prob upd);
+ vp9_prob upd,
+ int stepsize);
#ifdef __cplusplus
} // extern "C"
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_svc_layercontext.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_svc_layercontext.c
index 1573557d471..b3491a27a4a 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_svc_layercontext.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_svc_layercontext.c
@@ -39,13 +39,15 @@ void vp9_init_layer_context(VP9_COMP *const cpi) {
#if CONFIG_VP9_HIGHBITDEPTH
cpi->common.use_highbitdepth,
#endif
- VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL))
+ VP9_ENC_BORDER_IN_PIXELS,
+ cpi->common.byte_alignment,
+ NULL, NULL, NULL))
vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
"Failed to allocate empty frame for multiple frame "
"contexts");
- vpx_memset(cpi->svc.empty_frame.img.buffer_alloc, 0x80,
- cpi->svc.empty_frame.img.buffer_alloc_sz);
+ memset(cpi->svc.empty_frame.img.buffer_alloc, 0x80,
+ cpi->svc.empty_frame.img.buffer_alloc_sz);
cpi->svc.empty_frame_width = cpi->common.width;
cpi->svc.empty_frame_height = cpi->common.height;
}
@@ -77,6 +79,7 @@ void vp9_init_layer_context(VP9_COMP *const cpi) {
lc->target_bandwidth = oxcf->ts_target_bitrate[layer];
lrc->last_q[INTER_FRAME] = oxcf->worst_allowed_q;
lrc->avg_frame_qindex[INTER_FRAME] = oxcf->worst_allowed_q;
+ lrc->avg_frame_qindex[KEY_FRAME] = oxcf->worst_allowed_q;
} else {
lc->target_bandwidth = oxcf->ss_target_bitrate[layer];
lrc->last_q[KEY_FRAME] = oxcf->best_allowed_q;
@@ -85,11 +88,11 @@ void vp9_init_layer_context(VP9_COMP *const cpi) {
oxcf->best_allowed_q) / 2;
lrc->avg_frame_qindex[INTER_FRAME] = (oxcf->worst_allowed_q +
oxcf->best_allowed_q) / 2;
- if (oxcf->ss_play_alternate[layer])
+ if (oxcf->ss_enable_auto_arf[layer])
lc->alt_ref_idx = alt_ref_idx++;
else
- lc->alt_ref_idx = -1;
- lc->gold_ref_idx = -1;
+ lc->alt_ref_idx = INVALID_IDX;
+ lc->gold_ref_idx = INVALID_IDX;
}
lrc->buffer_level = oxcf->starting_buffer_level_ms *
@@ -192,7 +195,7 @@ void vp9_update_spatial_layer_framerate(VP9_COMP *const cpi, double framerate) {
oxcf->two_pass_vbrmin_section / 100);
lrc->max_frame_bandwidth = (int)(((int64_t)lrc->avg_frame_bandwidth *
oxcf->two_pass_vbrmax_section) / 100);
- vp9_rc_set_gf_max_interval(cpi, lrc);
+ vp9_rc_set_gf_interval_range(cpi, lrc);
}
void vp9_restore_layer_context(VP9_COMP *const cpi) {
@@ -276,6 +279,7 @@ static void get_layer_resolution(const int width_org, const int height_org,
int vp9_svc_start_frame(VP9_COMP *const cpi) {
int width = 0, height = 0;
LAYER_CONTEXT *lc;
+ struct lookahead_entry *buf;
int count = 1 << (cpi->svc.number_temporal_layers - 1);
cpi->svc.spatial_layer_id = cpi->svc.spatial_layer_to_encode;
@@ -305,7 +309,7 @@ int vp9_svc_start_frame(VP9_COMP *const cpi) {
cpi->ref_frame_flags &= (~VP9_LAST_FLAG & ~VP9_ALT_FLAG);
}
} else {
- if (cpi->oxcf.ss_play_alternate[cpi->svc.spatial_layer_id]) {
+ if (cpi->oxcf.ss_enable_auto_arf[cpi->svc.spatial_layer_id]) {
cpi->alt_fb_idx = lc->alt_ref_idx;
if (!lc->has_alt_frame)
cpi->ref_frame_flags &= (~VP9_ALT_FLAG);
@@ -317,7 +321,7 @@ int vp9_svc_start_frame(VP9_COMP *const cpi) {
LAYER_CONTEXT *lc_lower =
&cpi->svc.layer_context[cpi->svc.spatial_layer_id - 1];
- if (cpi->oxcf.ss_play_alternate[cpi->svc.spatial_layer_id - 1] &&
+ if (cpi->oxcf.ss_enable_auto_arf[cpi->svc.spatial_layer_id - 1] &&
lc_lower->alt_ref_source != NULL)
cpi->alt_fb_idx = lc_lower->alt_ref_idx;
else if (cpi->svc.spatial_layer_id >= 2)
@@ -336,8 +340,12 @@ int vp9_svc_start_frame(VP9_COMP *const cpi) {
// since its previous frame could be changed during decoding time. The idea is
// we put an empty invisible frame in front of them, then we will not use
// prev_mi when encoding these frames.
+
+ buf = vp9_lookahead_peek(cpi->lookahead, 0);
if (cpi->oxcf.error_resilient_mode == 0 && cpi->oxcf.pass == 2 &&
- cpi->svc.encode_empty_frame_state == NEED_TO_ENCODE) {
+ cpi->svc.encode_empty_frame_state == NEED_TO_ENCODE &&
+ lc->rc.frames_to_key != 0 &&
+ !(buf != NULL && (buf->flags & VPX_EFLAG_FORCE_KF))) {
if ((cpi->svc.number_temporal_layers > 1 &&
cpi->svc.temporal_layer_id < cpi->svc.number_temporal_layers - 1) ||
(cpi->svc.number_spatial_layers > 1 &&
@@ -372,13 +380,14 @@ int vp9_svc_start_frame(VP9_COMP *const cpi) {
}
}
- if (vp9_set_size_literal(cpi, width, height) != 0)
- return VPX_CODEC_INVALID_PARAM;
-
cpi->oxcf.worst_allowed_q = vp9_quantizer_to_qindex(lc->max_q);
cpi->oxcf.best_allowed_q = vp9_quantizer_to_qindex(lc->min_q);
vp9_change_config(cpi, &cpi->oxcf);
+
+ if (vp9_set_size_literal(cpi, width, height) != 0)
+ return VPX_CODEC_INVALID_PARAM;
+
vp9_set_high_precision_mv(cpi, 1);
cpi->alt_ref_source = get_layer_context(cpi)->alt_ref_source;
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_temporal_filter.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_temporal_filter.c
index 5599227ce77..d7979ab53a5 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_temporal_filter.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_temporal_filter.c
@@ -44,7 +44,7 @@ static void temporal_filter_predictors_mb_c(MACROBLOCKD *xd,
const int which_mv = 0;
const MV mv = { mv_row, mv_col };
const InterpKernel *const kernel =
- vp9_get_interp_kernel(xd->mi[0].src_mi->mbmi.interp_filter);
+ vp9_get_interp_kernel(xd->mi[0]->mbmi.interp_filter);
enum mv_precision mv_precision_uv;
int uv_stride;
@@ -213,7 +213,7 @@ static int temporal_filter_find_matching_mb_c(VP9_COMP *cpi,
uint8_t *arf_frame_buf,
uint8_t *frame_ptr_buf,
int stride) {
- MACROBLOCK *const x = &cpi->mb;
+ MACROBLOCK *const x = &cpi->td.mb;
MACROBLOCKD *const xd = &x->e_mbd;
const MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv;
int step_param;
@@ -225,7 +225,7 @@ static int temporal_filter_find_matching_mb_c(VP9_COMP *cpi,
MV best_ref_mv1 = {0, 0};
MV best_ref_mv1_full; /* full-pixel value of best_ref_mv1 */
- MV *ref_mv = &x->e_mbd.mi[0].src_mi->bmi[0].as_mv[0].as_mv;
+ MV *ref_mv = &x->e_mbd.mi[0]->bmi[0].as_mv[0].as_mv;
// Save input state
struct buf_2d src = x->plane[0].src;
@@ -280,17 +280,17 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi,
int mb_rows = (frames[alt_ref_index]->y_crop_height + 15) >> 4;
int mb_y_offset = 0;
int mb_uv_offset = 0;
- DECLARE_ALIGNED_ARRAY(16, unsigned int, accumulator, 16 * 16 * 3);
- DECLARE_ALIGNED_ARRAY(16, uint16_t, count, 16 * 16 * 3);
- MACROBLOCKD *mbd = &cpi->mb.e_mbd;
+ DECLARE_ALIGNED(16, unsigned int, accumulator[16 * 16 * 3]);
+ DECLARE_ALIGNED(16, uint16_t, count[16 * 16 * 3]);
+ MACROBLOCKD *mbd = &cpi->td.mb.e_mbd;
YV12_BUFFER_CONFIG *f = frames[alt_ref_index];
uint8_t *dst1, *dst2;
#if CONFIG_VP9_HIGHBITDEPTH
- DECLARE_ALIGNED_ARRAY(16, uint16_t, predictor16, 16 * 16 * 3);
- DECLARE_ALIGNED_ARRAY(16, uint8_t, predictor8, 16 * 16 * 3);
+ DECLARE_ALIGNED(16, uint16_t, predictor16[16 * 16 * 3]);
+ DECLARE_ALIGNED(16, uint8_t, predictor8[16 * 16 * 3]);
uint8_t *predictor;
#else
- DECLARE_ALIGNED_ARRAY(16, uint8_t, predictor, 16 * 16 * 3);
+ DECLARE_ALIGNED(16, uint8_t, predictor[16 * 16 * 3]);
#endif
const int mb_uv_height = 16 >> mbd->plane[1].subsampling_y;
const int mb_uv_width = 16 >> mbd->plane[1].subsampling_x;
@@ -321,19 +321,19 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi,
// 8 - VP9_INTERP_EXTEND.
// To keep the mv in play for both Y and UV planes the max that it
// can be on a border is therefore 16 - (2*VP9_INTERP_EXTEND+1).
- cpi->mb.mv_row_min = -((mb_row * 16) + (17 - 2 * VP9_INTERP_EXTEND));
- cpi->mb.mv_row_max = ((mb_rows - 1 - mb_row) * 16)
+ cpi->td.mb.mv_row_min = -((mb_row * 16) + (17 - 2 * VP9_INTERP_EXTEND));
+ cpi->td.mb.mv_row_max = ((mb_rows - 1 - mb_row) * 16)
+ (17 - 2 * VP9_INTERP_EXTEND);
for (mb_col = 0; mb_col < mb_cols; mb_col++) {
int i, j, k;
int stride;
- vpx_memset(accumulator, 0, 16 * 16 * 3 * sizeof(accumulator[0]));
- vpx_memset(count, 0, 16 * 16 * 3 * sizeof(count[0]));
+ memset(accumulator, 0, 16 * 16 * 3 * sizeof(accumulator[0]));
+ memset(count, 0, 16 * 16 * 3 * sizeof(count[0]));
- cpi->mb.mv_col_min = -((mb_col * 16) + (17 - 2 * VP9_INTERP_EXTEND));
- cpi->mb.mv_col_max = ((mb_cols - 1 - mb_col) * 16)
+ cpi->td.mb.mv_col_min = -((mb_col * 16) + (17 - 2 * VP9_INTERP_EXTEND));
+ cpi->td.mb.mv_col_max = ((mb_cols - 1 - mb_col) * 16)
+ (17 - 2 * VP9_INTERP_EXTEND);
for (frame = 0; frame < frame_count; frame++) {
@@ -343,8 +343,8 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi,
if (frames[frame] == NULL)
continue;
- mbd->mi[0].src_mi->bmi[0].as_mv[0].as_mv.row = 0;
- mbd->mi[0].src_mi->bmi[0].as_mv[0].as_mv.col = 0;
+ mbd->mi[0]->bmi[0].as_mv[0].as_mv.row = 0;
+ mbd->mi[0]->bmi[0].as_mv[0].as_mv.col = 0;
if (frame == alt_ref_index) {
filter_weight = 2;
@@ -370,8 +370,8 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi,
frames[frame]->v_buffer + mb_uv_offset,
frames[frame]->y_stride,
mb_uv_width, mb_uv_height,
- mbd->mi[0].src_mi->bmi[0].as_mv[0].as_mv.row,
- mbd->mi[0].src_mi->bmi[0].as_mv[0].as_mv.col,
+ mbd->mi[0]->bmi[0].as_mv[0].as_mv.row,
+ mbd->mi[0]->bmi[0].as_mv[0].as_mv.col,
predictor, scale,
mb_col * 16, mb_row * 16);
@@ -653,6 +653,7 @@ static void adjust_arnr_filter(VP9_COMP *cpi,
void vp9_temporal_filter(VP9_COMP *cpi, int distance) {
VP9_COMMON *const cm = &cpi->common;
RATE_CONTROL *const rc = &cpi->rc;
+ MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
int frame;
int frames_to_blur;
int start_frame;
@@ -709,8 +710,9 @@ void vp9_temporal_filter(VP9_COMP *cpi, int distance) {
#if CONFIG_VP9_HIGHBITDEPTH
cm->use_highbitdepth,
#endif
- VP9_ENC_BORDER_IN_PIXELS, NULL, NULL,
- NULL)) {
+ VP9_ENC_BORDER_IN_PIXELS,
+ cm->byte_alignment,
+ NULL, NULL, NULL)) {
vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
"Failed to reallocate alt_ref_buffer");
}
@@ -720,8 +722,8 @@ void vp9_temporal_filter(VP9_COMP *cpi, int distance) {
}
}
cm->mi = cm->mip + cm->mi_stride + 1;
- cpi->mb.e_mbd.mi = cm->mi;
- cpi->mb.e_mbd.mi[0].src_mi = &cpi->mb.e_mbd.mi[0];
+ xd->mi = cm->mi_grid_visible;
+ xd->mi[0] = cm->mi;
} else {
// ARF is produced at the native frame size and resized when coded.
#if CONFIG_VP9_HIGHBITDEPTH
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_tokenize.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_tokenize.c
index adf01bf35ba..862be4d384a 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_tokenize.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_tokenize.c
@@ -23,22 +23,32 @@
#include "vp9/encoder/vp9_encoder.h"
#include "vp9/encoder/vp9_tokenize.h"
-static TOKENVALUE dct_value_tokens[DCT_MAX_VALUE * 2];
-const TOKENVALUE *vp9_dct_value_tokens_ptr;
-static int16_t dct_value_cost[DCT_MAX_VALUE * 2];
-const int16_t *vp9_dct_value_cost_ptr;
-
-#if CONFIG_VP9_HIGHBITDEPTH
-static TOKENVALUE dct_value_tokens_high10[DCT_MAX_VALUE_HIGH10 * 2];
-const TOKENVALUE *vp9_dct_value_tokens_high10_ptr;
-static int16_t dct_value_cost_high10[DCT_MAX_VALUE_HIGH10 * 2];
-const int16_t *vp9_dct_value_cost_high10_ptr;
-
-static TOKENVALUE dct_value_tokens_high12[DCT_MAX_VALUE_HIGH12 * 2];
-const TOKENVALUE *vp9_dct_value_tokens_high12_ptr;
-static int16_t dct_value_cost_high12[DCT_MAX_VALUE_HIGH12 * 2];
-const int16_t *vp9_dct_value_cost_high12_ptr;
-#endif
+static const TOKENVALUE dct_cat_lt_10_value_tokens[] = {
+ {9, 63}, {9, 61}, {9, 59}, {9, 57}, {9, 55}, {9, 53}, {9, 51}, {9, 49},
+ {9, 47}, {9, 45}, {9, 43}, {9, 41}, {9, 39}, {9, 37}, {9, 35}, {9, 33},
+ {9, 31}, {9, 29}, {9, 27}, {9, 25}, {9, 23}, {9, 21}, {9, 19}, {9, 17},
+ {9, 15}, {9, 13}, {9, 11}, {9, 9}, {9, 7}, {9, 5}, {9, 3}, {9, 1},
+ {8, 31}, {8, 29}, {8, 27}, {8, 25}, {8, 23}, {8, 21},
+ {8, 19}, {8, 17}, {8, 15}, {8, 13}, {8, 11}, {8, 9},
+ {8, 7}, {8, 5}, {8, 3}, {8, 1},
+ {7, 15}, {7, 13}, {7, 11}, {7, 9}, {7, 7}, {7, 5}, {7, 3}, {7, 1},
+ {6, 7}, {6, 5}, {6, 3}, {6, 1}, {5, 3}, {5, 1},
+ {4, 1}, {3, 1}, {2, 1}, {1, 1}, {0, 0},
+ {1, 0}, {2, 0}, {3, 0}, {4, 0},
+ {5, 0}, {5, 2}, {6, 0}, {6, 2}, {6, 4}, {6, 6},
+ {7, 0}, {7, 2}, {7, 4}, {7, 6}, {7, 8}, {7, 10}, {7, 12}, {7, 14},
+ {8, 0}, {8, 2}, {8, 4}, {8, 6}, {8, 8}, {8, 10}, {8, 12},
+ {8, 14}, {8, 16}, {8, 18}, {8, 20}, {8, 22}, {8, 24},
+ {8, 26}, {8, 28}, {8, 30}, {9, 0}, {9, 2},
+ {9, 4}, {9, 6}, {9, 8}, {9, 10}, {9, 12}, {9, 14}, {9, 16},
+ {9, 18}, {9, 20}, {9, 22}, {9, 24}, {9, 26}, {9, 28},
+ {9, 30}, {9, 32}, {9, 34}, {9, 36}, {9, 38}, {9, 40},
+ {9, 42}, {9, 44}, {9, 46}, {9, 48}, {9, 50}, {9, 52},
+ {9, 54}, {9, 56}, {9, 58}, {9, 60}, {9, 62}
+};
+const TOKENVALUE *vp9_dct_cat_lt_10_value_tokens = dct_cat_lt_10_value_tokens +
+ (sizeof(dct_cat_lt_10_value_tokens) / sizeof(*dct_cat_lt_10_value_tokens))
+ / 2;
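+
+// Illustrative note (guarded out; not part of the change): offsetting the
+// pointer to the table midpoint lets a small signed coefficient value index
+// it directly; negative values mirror the positive side, with the sign bit
+// folded into the extra field.
+#if 0
+static void token_lookup_demo(void) {
+  const TOKENVALUE pos = vp9_dct_cat_lt_10_value_tokens[6];   // {5, 2}
+  const TOKENVALUE neg = vp9_dct_cat_lt_10_value_tokens[-6];  // {5, 3}
+  // Both are CATEGORY1_TOKEN (token 5, base value 5);
+  // extra == ((|v| - 5) << 1) | sign.
+  (void)pos;
+  (void)neg;
+}
+#endif
+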
// Array indices are identical to previously-existing CONTEXT_NODE indices
const vp9_tree_index vp9_coef_tree[TREE_SIZE(ENTROPY_TOKENS)] = {
@@ -55,204 +65,390 @@ const vp9_tree_index vp9_coef_tree[TREE_SIZE(ENTROPY_TOKENS)] = {
-CATEGORY5_TOKEN, -CATEGORY6_TOKEN // 10 = CAT_FIVE
};
-// Unconstrained Node Tree
-const vp9_tree_index vp9_coef_con_tree[TREE_SIZE(ENTROPY_TOKENS)] = {
- 2, 6, // 0 = LOW_VAL
- -TWO_TOKEN, 4, // 1 = TWO
- -THREE_TOKEN, -FOUR_TOKEN, // 2 = THREE
- 8, 10, // 3 = HIGH_LOW
- -CATEGORY1_TOKEN, -CATEGORY2_TOKEN, // 4 = CAT_ONE
- 12, 14, // 5 = CAT_THREEFOUR
- -CATEGORY3_TOKEN, -CATEGORY4_TOKEN, // 6 = CAT_THREE
- -CATEGORY5_TOKEN, -CATEGORY6_TOKEN // 7 = CAT_FIVE
+static const vp9_tree_index cat1[2] = {0, 0};
+static const vp9_tree_index cat2[4] = {2, 2, 0, 0};
+static const vp9_tree_index cat3[6] = {2, 2, 4, 4, 0, 0};
+static const vp9_tree_index cat4[8] = {2, 2, 4, 4, 6, 6, 0, 0};
+static const vp9_tree_index cat5[10] = {2, 2, 4, 4, 6, 6, 8, 8, 0, 0};
+static const vp9_tree_index cat6[28] = {2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12,
+ 14, 14, 16, 16, 18, 18, 20, 20, 22, 22, 24, 24, 26, 26, 0, 0};
+
+static const int16_t zero_cost[] = {0};
+static const int16_t one_cost[] = {255, 257};
+static const int16_t two_cost[] = {255, 257};
+static const int16_t three_cost[] = {255, 257};
+static const int16_t four_cost[] = {255, 257};
+static const int16_t cat1_cost[] = {429, 431, 616, 618};
+static const int16_t cat2_cost[] = {624, 626, 727, 729, 848, 850, 951, 953};
+static const int16_t cat3_cost[] = {
+ 820, 822, 893, 895, 940, 942, 1013, 1015, 1096, 1098, 1169, 1171, 1216, 1218,
+ 1289, 1291
+};
+static const int16_t cat4_cost[] = {
+ 1032, 1034, 1075, 1077, 1105, 1107, 1148, 1150, 1194, 1196, 1237, 1239,
+ 1267, 1269, 1310, 1312, 1328, 1330, 1371, 1373, 1401, 1403, 1444, 1446,
+ 1490, 1492, 1533, 1535, 1563, 1565, 1606, 1608
+};
+static const int16_t cat5_cost[] = {
+ 1269, 1271, 1283, 1285, 1306, 1308, 1320,
+ 1322, 1347, 1349, 1361, 1363, 1384, 1386, 1398, 1400, 1443, 1445, 1457,
+ 1459, 1480, 1482, 1494, 1496, 1521, 1523, 1535, 1537, 1558, 1560, 1572,
+ 1574, 1592, 1594, 1606, 1608, 1629, 1631, 1643, 1645, 1670, 1672, 1684,
+ 1686, 1707, 1709, 1721, 1723, 1766, 1768, 1780, 1782, 1803, 1805, 1817,
+ 1819, 1844, 1846, 1858, 1860, 1881, 1883, 1895, 1897
+};
+const int16_t vp9_cat6_low_cost[256] = {
+ 1638, 1640, 1646, 1648, 1652, 1654, 1660, 1662,
+ 1670, 1672, 1678, 1680, 1684, 1686, 1692, 1694, 1711, 1713, 1719, 1721,
+ 1725, 1727, 1733, 1735, 1743, 1745, 1751, 1753, 1757, 1759, 1765, 1767,
+ 1787, 1789, 1795, 1797, 1801, 1803, 1809, 1811, 1819, 1821, 1827, 1829,
+ 1833, 1835, 1841, 1843, 1860, 1862, 1868, 1870, 1874, 1876, 1882, 1884,
+ 1892, 1894, 1900, 1902, 1906, 1908, 1914, 1916, 1940, 1942, 1948, 1950,
+ 1954, 1956, 1962, 1964, 1972, 1974, 1980, 1982, 1986, 1988, 1994, 1996,
+ 2013, 2015, 2021, 2023, 2027, 2029, 2035, 2037, 2045, 2047, 2053, 2055,
+ 2059, 2061, 2067, 2069, 2089, 2091, 2097, 2099, 2103, 2105, 2111, 2113,
+ 2121, 2123, 2129, 2131, 2135, 2137, 2143, 2145, 2162, 2164, 2170, 2172,
+ 2176, 2178, 2184, 2186, 2194, 2196, 2202, 2204, 2208, 2210, 2216, 2218,
+ 2082, 2084, 2090, 2092, 2096, 2098, 2104, 2106, 2114, 2116, 2122, 2124,
+ 2128, 2130, 2136, 2138, 2155, 2157, 2163, 2165, 2169, 2171, 2177, 2179,
+ 2187, 2189, 2195, 2197, 2201, 2203, 2209, 2211, 2231, 2233, 2239, 2241,
+ 2245, 2247, 2253, 2255, 2263, 2265, 2271, 2273, 2277, 2279, 2285, 2287,
+ 2304, 2306, 2312, 2314, 2318, 2320, 2326, 2328, 2336, 2338, 2344, 2346,
+ 2350, 2352, 2358, 2360, 2384, 2386, 2392, 2394, 2398, 2400, 2406, 2408,
+ 2416, 2418, 2424, 2426, 2430, 2432, 2438, 2440, 2457, 2459, 2465, 2467,
+ 2471, 2473, 2479, 2481, 2489, 2491, 2497, 2499, 2503, 2505, 2511, 2513,
+ 2533, 2535, 2541, 2543, 2547, 2549, 2555, 2557, 2565, 2567, 2573, 2575,
+ 2579, 2581, 2587, 2589, 2606, 2608, 2614, 2616, 2620, 2622, 2628, 2630,
+ 2638, 2640, 2646, 2648, 2652, 2654, 2660, 2662
+};
+const int16_t vp9_cat6_high_cost[128] = {
+ 72, 892, 1183, 2003, 1448, 2268, 2559, 3379,
+ 1709, 2529, 2820, 3640, 3085, 3905, 4196, 5016, 2118, 2938, 3229, 4049,
+ 3494, 4314, 4605, 5425, 3755, 4575, 4866, 5686, 5131, 5951, 6242, 7062,
+ 2118, 2938, 3229, 4049, 3494, 4314, 4605, 5425, 3755, 4575, 4866, 5686,
+ 5131, 5951, 6242, 7062, 4164, 4984, 5275, 6095, 5540, 6360, 6651, 7471,
+ 5801, 6621, 6912, 7732, 7177, 7997, 8288, 9108, 2118, 2938, 3229, 4049,
+ 3494, 4314, 4605, 5425, 3755, 4575, 4866, 5686, 5131, 5951, 6242, 7062,
+ 4164, 4984, 5275, 6095, 5540, 6360, 6651, 7471, 5801, 6621, 6912, 7732,
+ 7177, 7997, 8288, 9108, 4164, 4984, 5275, 6095, 5540, 6360, 6651, 7471,
+ 5801, 6621, 6912, 7732, 7177, 7997, 8288, 9108, 6210, 7030, 7321, 8141,
+ 7586, 8406, 8697, 9517, 7847, 8667, 8958, 9778, 9223, 10043, 10334, 11154
};
-
-static vp9_tree_index cat1[2], cat2[4], cat3[6], cat4[8], cat5[10], cat6[28];
#if CONFIG_VP9_HIGHBITDEPTH
-static vp9_tree_index cat1_high10[2];
-static vp9_tree_index cat2_high10[4];
-static vp9_tree_index cat3_high10[6];
-static vp9_tree_index cat4_high10[8];
-static vp9_tree_index cat5_high10[10];
-static vp9_tree_index cat6_high10[32];
-static vp9_tree_index cat1_high12[2];
-static vp9_tree_index cat2_high12[4];
-static vp9_tree_index cat3_high12[6];
-static vp9_tree_index cat4_high12[8];
-static vp9_tree_index cat5_high12[10];
-static vp9_tree_index cat6_high12[36];
+const int16_t vp9_cat6_high10_high_cost[512] = {
+ 74, 894, 1185, 2005, 1450, 2270, 2561,
+ 3381, 1711, 2531, 2822, 3642, 3087, 3907, 4198, 5018, 2120, 2940, 3231,
+ 4051, 3496, 4316, 4607, 5427, 3757, 4577, 4868, 5688, 5133, 5953, 6244,
+ 7064, 2120, 2940, 3231, 4051, 3496, 4316, 4607, 5427, 3757, 4577, 4868,
+ 5688, 5133, 5953, 6244, 7064, 4166, 4986, 5277, 6097, 5542, 6362, 6653,
+ 7473, 5803, 6623, 6914, 7734, 7179, 7999, 8290, 9110, 2120, 2940, 3231,
+ 4051, 3496, 4316, 4607, 5427, 3757, 4577, 4868, 5688, 5133, 5953, 6244,
+ 7064, 4166, 4986, 5277, 6097, 5542, 6362, 6653, 7473, 5803, 6623, 6914,
+ 7734, 7179, 7999, 8290, 9110, 4166, 4986, 5277, 6097, 5542, 6362, 6653,
+ 7473, 5803, 6623, 6914, 7734, 7179, 7999, 8290, 9110, 6212, 7032, 7323,
+ 8143, 7588, 8408, 8699, 9519, 7849, 8669, 8960, 9780, 9225, 10045, 10336,
+ 11156, 2120, 2940, 3231, 4051, 3496, 4316, 4607, 5427, 3757, 4577, 4868,
+ 5688, 5133, 5953, 6244, 7064, 4166, 4986, 5277, 6097, 5542, 6362, 6653,
+ 7473, 5803, 6623, 6914, 7734, 7179, 7999, 8290, 9110, 4166, 4986, 5277,
+ 6097, 5542, 6362, 6653, 7473, 5803, 6623, 6914, 7734, 7179, 7999, 8290,
+ 9110, 6212, 7032, 7323, 8143, 7588, 8408, 8699, 9519, 7849, 8669, 8960,
+ 9780, 9225, 10045, 10336, 11156, 4166, 4986, 5277, 6097, 5542, 6362, 6653,
+ 7473, 5803, 6623, 6914, 7734, 7179, 7999, 8290, 9110, 6212, 7032, 7323,
+ 8143, 7588, 8408, 8699, 9519, 7849, 8669, 8960, 9780, 9225, 10045, 10336,
+ 11156, 6212, 7032, 7323, 8143, 7588, 8408, 8699, 9519, 7849, 8669, 8960,
+ 9780, 9225, 10045, 10336, 11156, 8258, 9078, 9369, 10189, 9634, 10454,
+ 10745, 11565, 9895, 10715, 11006, 11826, 11271, 12091, 12382, 13202, 2120,
+ 2940, 3231, 4051, 3496, 4316, 4607, 5427, 3757, 4577, 4868, 5688, 5133,
+ 5953, 6244, 7064, 4166, 4986, 5277, 6097, 5542, 6362, 6653, 7473, 5803,
+ 6623, 6914, 7734, 7179, 7999, 8290, 9110, 4166, 4986, 5277, 6097, 5542,
+ 6362, 6653, 7473, 5803, 6623, 6914, 7734, 7179, 7999, 8290, 9110, 6212,
+ 7032, 7323, 8143, 7588, 8408, 8699, 9519, 7849, 8669, 8960, 9780, 9225,
+ 10045, 10336, 11156, 4166, 4986, 5277, 6097, 5542, 6362, 6653, 7473, 5803,
+ 6623, 6914, 7734, 7179, 7999, 8290, 9110, 6212, 7032, 7323, 8143, 7588,
+ 8408, 8699, 9519, 7849, 8669, 8960, 9780, 9225, 10045, 10336, 11156, 6212,
+ 7032, 7323, 8143, 7588, 8408, 8699, 9519, 7849, 8669, 8960, 9780, 9225,
+ 10045, 10336, 11156, 8258, 9078, 9369, 10189, 9634, 10454, 10745, 11565,
+ 9895, 10715, 11006, 11826, 11271, 12091, 12382, 13202, 4166, 4986, 5277,
+ 6097, 5542, 6362, 6653, 7473, 5803, 6623, 6914, 7734, 7179, 7999, 8290,
+ 9110, 6212, 7032, 7323, 8143, 7588, 8408, 8699, 9519, 7849, 8669, 8960,
+ 9780, 9225, 10045, 10336, 11156, 6212, 7032, 7323, 8143, 7588, 8408, 8699,
+ 9519, 7849, 8669, 8960, 9780, 9225, 10045, 10336, 11156, 8258, 9078, 9369,
+ 10189, 9634, 10454, 10745, 11565, 9895, 10715, 11006, 11826, 11271, 12091,
+ 12382, 13202, 6212, 7032, 7323, 8143, 7588, 8408, 8699, 9519, 7849, 8669,
+ 8960, 9780, 9225, 10045, 10336, 11156, 8258, 9078, 9369, 10189, 9634, 10454,
+ 10745, 11565, 9895, 10715, 11006, 11826, 11271, 12091, 12382, 13202, 8258,
+ 9078, 9369, 10189, 9634, 10454, 10745, 11565, 9895, 10715, 11006, 11826,
+ 11271, 12091, 12382, 13202, 10304, 11124, 11415, 12235, 11680, 12500, 12791,
+ 13611, 11941, 12761, 13052, 13872, 13317, 14137, 14428, 15248,
+};
+const int16_t vp9_cat6_high12_high_cost[2048] = {
+ 76, 896, 1187, 2007, 1452, 2272, 2563,
+ 3383, 1713, 2533, 2824, 3644, 3089, 3909, 4200, 5020, 2122, 2942, 3233,
+ 4053, 3498, 4318, 4609, 5429, 3759, 4579, 4870, 5690, 5135, 5955, 6246,
+ 7066, 2122, 2942, 3233, 4053, 3498, 4318, 4609, 5429, 3759, 4579, 4870,
+ 5690, 5135, 5955, 6246, 7066, 4168, 4988, 5279, 6099, 5544, 6364, 6655,
+ 7475, 5805, 6625, 6916, 7736, 7181, 8001, 8292, 9112, 2122, 2942, 3233,
+ 4053, 3498, 4318, 4609, 5429, 3759, 4579, 4870, 5690, 5135, 5955, 6246,
+ 7066, 4168, 4988, 5279, 6099, 5544, 6364, 6655, 7475, 5805, 6625, 6916,
+ 7736, 7181, 8001, 8292, 9112, 4168, 4988, 5279, 6099, 5544, 6364, 6655,
+ 7475, 5805, 6625, 6916, 7736, 7181, 8001, 8292, 9112, 6214, 7034, 7325,
+ 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338,
+ 11158, 2122, 2942, 3233, 4053, 3498, 4318, 4609, 5429, 3759, 4579, 4870,
+ 5690, 5135, 5955, 6246, 7066, 4168, 4988, 5279, 6099, 5544, 6364, 6655,
+ 7475, 5805, 6625, 6916, 7736, 7181, 8001, 8292, 9112, 4168, 4988, 5279,
+ 6099, 5544, 6364, 6655, 7475, 5805, 6625, 6916, 7736, 7181, 8001, 8292,
+ 9112, 6214, 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962,
+ 9782, 9227, 10047, 10338, 11158, 4168, 4988, 5279, 6099, 5544, 6364, 6655,
+ 7475, 5805, 6625, 6916, 7736, 7181, 8001, 8292, 9112, 6214, 7034, 7325,
+ 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338,
+ 11158, 6214, 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962,
+ 9782, 9227, 10047, 10338, 11158, 8260, 9080, 9371, 10191, 9636, 10456,
+ 10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, 2122,
+ 2942, 3233, 4053, 3498, 4318, 4609, 5429, 3759, 4579, 4870, 5690, 5135,
+ 5955, 6246, 7066, 4168, 4988, 5279, 6099, 5544, 6364, 6655, 7475, 5805,
+ 6625, 6916, 7736, 7181, 8001, 8292, 9112, 4168, 4988, 5279, 6099, 5544,
+ 6364, 6655, 7475, 5805, 6625, 6916, 7736, 7181, 8001, 8292, 9112, 6214,
+ 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227,
+ 10047, 10338, 11158, 4168, 4988, 5279, 6099, 5544, 6364, 6655, 7475, 5805,
+ 6625, 6916, 7736, 7181, 8001, 8292, 9112, 6214, 7034, 7325, 8145, 7590,
+ 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158, 6214,
+ 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227,
+ 10047, 10338, 11158, 8260, 9080, 9371, 10191, 9636, 10456, 10747, 11567,
+ 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, 4168, 4988, 5279,
+ 6099, 5544, 6364, 6655, 7475, 5805, 6625, 6916, 7736, 7181, 8001, 8292,
+ 9112, 6214, 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962,
+ 9782, 9227, 10047, 10338, 11158, 6214, 7034, 7325, 8145, 7590, 8410, 8701,
+ 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158, 8260, 9080, 9371,
+ 10191, 9636, 10456, 10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093,
+ 12384, 13204, 6214, 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671,
+ 8962, 9782, 9227, 10047, 10338, 11158, 8260, 9080, 9371, 10191, 9636, 10456,
+ 10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, 8260,
+ 9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897, 10717, 11008, 11828,
+ 11273, 12093, 12384, 13204, 10306, 11126, 11417, 12237, 11682, 12502, 12793,
+ 13613, 11943, 12763, 13054, 13874, 13319, 14139, 14430, 15250, 2122, 2942,
+ 3233, 4053, 3498, 4318, 4609, 5429, 3759, 4579, 4870, 5690, 5135, 5955,
+ 6246, 7066, 4168, 4988, 5279, 6099, 5544, 6364, 6655, 7475, 5805, 6625,
+ 6916, 7736, 7181, 8001, 8292, 9112, 4168, 4988, 5279, 6099, 5544, 6364,
+ 6655, 7475, 5805, 6625, 6916, 7736, 7181, 8001, 8292, 9112, 6214, 7034,
+ 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047,
+ 10338, 11158, 4168, 4988, 5279, 6099, 5544, 6364, 6655, 7475, 5805, 6625,
+ 6916, 7736, 7181, 8001, 8292, 9112, 6214, 7034, 7325, 8145, 7590, 8410,
+ 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158, 6214, 7034,
+ 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047,
+ 10338, 11158, 8260, 9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897,
+ 10717, 11008, 11828, 11273, 12093, 12384, 13204, 4168, 4988, 5279, 6099,
+ 5544, 6364, 6655, 7475, 5805, 6625, 6916, 7736, 7181, 8001, 8292, 9112,
+ 6214, 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782,
+ 9227, 10047, 10338, 11158, 6214, 7034, 7325, 8145, 7590, 8410, 8701, 9521,
+ 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158, 8260, 9080, 9371, 10191,
+ 9636, 10456, 10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384,
+ 13204, 6214, 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962,
+ 9782, 9227, 10047, 10338, 11158, 8260, 9080, 9371, 10191, 9636, 10456,
+ 10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, 8260,
+ 9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897, 10717, 11008, 11828,
+ 11273, 12093, 12384, 13204, 10306, 11126, 11417, 12237, 11682, 12502, 12793,
+ 13613, 11943, 12763, 13054, 13874, 13319, 14139, 14430, 15250, 4168, 4988,
+ 5279, 6099, 5544, 6364, 6655, 7475, 5805, 6625, 6916, 7736, 7181, 8001,
+ 8292, 9112, 6214, 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671,
+ 8962, 9782, 9227, 10047, 10338, 11158, 6214, 7034, 7325, 8145, 7590, 8410,
+ 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158, 8260, 9080,
+ 9371, 10191, 9636, 10456, 10747, 11567, 9897, 10717, 11008, 11828, 11273,
+ 12093, 12384, 13204, 6214, 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851,
+ 8671, 8962, 9782, 9227, 10047, 10338, 11158, 8260, 9080, 9371, 10191, 9636,
+ 10456, 10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204,
+ 8260, 9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897, 10717, 11008,
+ 11828, 11273, 12093, 12384, 13204, 10306, 11126, 11417, 12237, 11682, 12502,
+ 12793, 13613, 11943, 12763, 13054, 13874, 13319, 14139, 14430, 15250, 6214,
+ 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227,
+ 10047, 10338, 11158, 8260, 9080, 9371, 10191, 9636, 10456, 10747, 11567,
+ 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, 8260, 9080, 9371,
+ 10191, 9636, 10456, 10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093,
+ 12384, 13204, 10306, 11126, 11417, 12237, 11682, 12502, 12793, 13613, 11943,
+ 12763, 13054, 13874, 13319, 14139, 14430, 15250, 8260, 9080, 9371, 10191,
+ 9636, 10456, 10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384,
+ 13204, 10306, 11126, 11417, 12237, 11682, 12502, 12793, 13613, 11943, 12763,
+ 13054, 13874, 13319, 14139, 14430, 15250, 10306, 11126, 11417, 12237, 11682,
+ 12502, 12793, 13613, 11943, 12763, 13054, 13874, 13319, 14139, 14430, 15250,
+ 12352, 13172, 13463, 14283, 13728, 14548, 14839, 15659, 13989, 14809, 15100,
+ 15920, 15365, 16185, 16476, 17296, 2122, 2942, 3233, 4053, 3498, 4318, 4609,
+ 5429, 3759, 4579, 4870, 5690, 5135, 5955, 6246, 7066, 4168, 4988, 5279,
+ 6099, 5544, 6364, 6655, 7475, 5805, 6625, 6916, 7736, 7181, 8001, 8292,
+ 9112, 4168, 4988, 5279, 6099, 5544, 6364, 6655, 7475, 5805, 6625, 6916,
+ 7736, 7181, 8001, 8292, 9112, 6214, 7034, 7325, 8145, 7590, 8410, 8701,
+ 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158, 4168, 4988, 5279,
+ 6099, 5544, 6364, 6655, 7475, 5805, 6625, 6916, 7736, 7181, 8001, 8292,
+ 9112, 6214, 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962,
+ 9782, 9227, 10047, 10338, 11158, 6214, 7034, 7325, 8145, 7590, 8410, 8701,
+ 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158, 8260, 9080, 9371,
+ 10191, 9636, 10456, 10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093,
+ 12384, 13204, 4168, 4988, 5279, 6099, 5544, 6364, 6655, 7475, 5805, 6625,
+ 6916, 7736, 7181, 8001, 8292, 9112, 6214, 7034, 7325, 8145, 7590, 8410,
+ 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158, 6214, 7034,
+ 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047,
+ 10338, 11158, 8260, 9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897,
+ 10717, 11008, 11828, 11273, 12093, 12384, 13204, 6214, 7034, 7325, 8145,
+ 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158,
+ 8260, 9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897, 10717, 11008,
+ 11828, 11273, 12093, 12384, 13204, 8260, 9080, 9371, 10191, 9636, 10456,
+ 10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, 10306,
+ 11126, 11417, 12237, 11682, 12502, 12793, 13613, 11943, 12763, 13054, 13874,
+ 13319, 14139, 14430, 15250, 4168, 4988, 5279, 6099, 5544, 6364, 6655, 7475,
+ 5805, 6625, 6916, 7736, 7181, 8001, 8292, 9112, 6214, 7034, 7325, 8145,
+ 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158,
+ 6214, 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782,
+ 9227, 10047, 10338, 11158, 8260, 9080, 9371, 10191, 9636, 10456, 10747,
+ 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, 6214, 7034,
+ 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047,
+ 10338, 11158, 8260, 9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897,
+ 10717, 11008, 11828, 11273, 12093, 12384, 13204, 8260, 9080, 9371, 10191,
+ 9636, 10456, 10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384,
+ 13204, 10306, 11126, 11417, 12237, 11682, 12502, 12793, 13613, 11943, 12763,
+ 13054, 13874, 13319, 14139, 14430, 15250, 6214, 7034, 7325, 8145, 7590,
+ 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158, 8260,
+ 9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897, 10717, 11008, 11828,
+ 11273, 12093, 12384, 13204, 8260, 9080, 9371, 10191, 9636, 10456, 10747,
+ 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, 10306, 11126,
+ 11417, 12237, 11682, 12502, 12793, 13613, 11943, 12763, 13054, 13874, 13319,
+ 14139, 14430, 15250, 8260, 9080, 9371, 10191, 9636, 10456, 10747, 11567,
+ 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, 10306, 11126, 11417,
+ 12237, 11682, 12502, 12793, 13613, 11943, 12763, 13054, 13874, 13319, 14139,
+ 14430, 15250, 10306, 11126, 11417, 12237, 11682, 12502, 12793, 13613, 11943,
+ 12763, 13054, 13874, 13319, 14139, 14430, 15250, 12352, 13172, 13463, 14283,
+ 13728, 14548, 14839, 15659, 13989, 14809, 15100, 15920, 15365, 16185, 16476,
+ 17296, 4168, 4988, 5279, 6099, 5544, 6364, 6655, 7475, 5805, 6625, 6916,
+ 7736, 7181, 8001, 8292, 9112, 6214, 7034, 7325, 8145, 7590, 8410, 8701,
+ 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158, 6214, 7034, 7325,
+ 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338,
+ 11158, 8260, 9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897, 10717,
+ 11008, 11828, 11273, 12093, 12384, 13204, 6214, 7034, 7325, 8145, 7590,
+ 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158, 8260,
+ 9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897, 10717, 11008, 11828,
+ 11273, 12093, 12384, 13204, 8260, 9080, 9371, 10191, 9636, 10456, 10747,
+ 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, 10306, 11126,
+ 11417, 12237, 11682, 12502, 12793, 13613, 11943, 12763, 13054, 13874, 13319,
+ 14139, 14430, 15250, 6214, 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851,
+ 8671, 8962, 9782, 9227, 10047, 10338, 11158, 8260, 9080, 9371, 10191, 9636,
+ 10456, 10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204,
+ 8260, 9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897, 10717, 11008,
+ 11828, 11273, 12093, 12384, 13204, 10306, 11126, 11417, 12237, 11682, 12502,
+ 12793, 13613, 11943, 12763, 13054, 13874, 13319, 14139, 14430, 15250, 8260,
+ 9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897, 10717, 11008, 11828,
+ 11273, 12093, 12384, 13204, 10306, 11126, 11417, 12237, 11682, 12502, 12793,
+ 13613, 11943, 12763, 13054, 13874, 13319, 14139, 14430, 15250, 10306, 11126,
+ 11417, 12237, 11682, 12502, 12793, 13613, 11943, 12763, 13054, 13874, 13319,
+ 14139, 14430, 15250, 12352, 13172, 13463, 14283, 13728, 14548, 14839, 15659,
+ 13989, 14809, 15100, 15920, 15365, 16185, 16476, 17296, 6214, 7034, 7325,
+ 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338,
+ 11158, 8260, 9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897, 10717,
+ 11008, 11828, 11273, 12093, 12384, 13204, 8260, 9080, 9371, 10191, 9636,
+ 10456, 10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204,
+ 10306, 11126, 11417, 12237, 11682, 12502, 12793, 13613, 11943, 12763, 13054,
+ 13874, 13319, 14139, 14430, 15250, 8260, 9080, 9371, 10191, 9636, 10456,
+ 10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, 10306,
+ 11126, 11417, 12237, 11682, 12502, 12793, 13613, 11943, 12763, 13054, 13874,
+ 13319, 14139, 14430, 15250, 10306, 11126, 11417, 12237, 11682, 12502, 12793,
+ 13613, 11943, 12763, 13054, 13874, 13319, 14139, 14430, 15250, 12352, 13172,
+ 13463, 14283, 13728, 14548, 14839, 15659, 13989, 14809, 15100, 15920, 15365,
+ 16185, 16476, 17296, 8260, 9080, 9371, 10191, 9636, 10456, 10747, 11567,
+ 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, 10306, 11126, 11417,
+ 12237, 11682, 12502, 12793, 13613, 11943, 12763, 13054, 13874, 13319, 14139,
+ 14430, 15250, 10306, 11126, 11417, 12237, 11682, 12502, 12793, 13613, 11943,
+ 12763, 13054, 13874, 13319, 14139, 14430, 15250, 12352, 13172, 13463, 14283,
+ 13728, 14548, 14839, 15659, 13989, 14809, 15100, 15920, 15365, 16185, 16476,
+ 17296, 10306, 11126, 11417, 12237, 11682, 12502, 12793, 13613, 11943, 12763,
+ 13054, 13874, 13319, 14139, 14430, 15250, 12352, 13172, 13463, 14283, 13728,
+ 14548, 14839, 15659, 13989, 14809, 15100, 15920, 15365, 16185, 16476, 17296,
+ 12352, 13172, 13463, 14283, 13728, 14548, 14839, 15659, 13989, 14809, 15100,
+ 15920, 15365, 16185, 16476, 17296, 14398, 15218, 15509, 16329, 15774, 16594,
+ 16885, 17705, 16035, 16855, 17146, 17966, 17411, 18231, 18522, 19342
+};
#endif
-static void init_bit_tree(vp9_tree_index *p, int n) {
- int i = 0;
-
- while (++i < n) {
- p[0] = p[1] = i << 1;
- p += 2;
- }
-
- p[0] = p[1] = 0;
-}
-
-static void init_bit_trees() {
- init_bit_tree(cat1, 1);
- init_bit_tree(cat2, 2);
- init_bit_tree(cat3, 3);
- init_bit_tree(cat4, 4);
- init_bit_tree(cat5, 5);
- init_bit_tree(cat6, 14);
#if CONFIG_VP9_HIGHBITDEPTH
- init_bit_tree(cat1_high10, 1);
- init_bit_tree(cat2_high10, 2);
- init_bit_tree(cat3_high10, 3);
- init_bit_tree(cat4_high10, 4);
- init_bit_tree(cat5_high10, 5);
- init_bit_tree(cat6_high10, 16);
- init_bit_tree(cat1_high12, 1);
- init_bit_tree(cat2_high12, 2);
- init_bit_tree(cat3_high12, 3);
- init_bit_tree(cat4_high12, 4);
- init_bit_tree(cat5_high12, 5);
- init_bit_tree(cat6_high12, 18);
+static const vp9_tree_index cat1_high10[2] = {0, 0};
+static const vp9_tree_index cat2_high10[4] = {2, 2, 0, 0};
+static const vp9_tree_index cat3_high10[6] = {2, 2, 4, 4, 0, 0};
+static const vp9_tree_index cat4_high10[8] = {2, 2, 4, 4, 6, 6, 0, 0};
+static const vp9_tree_index cat5_high10[10] = {2, 2, 4, 4, 6, 6, 8, 8, 0, 0};
+static const vp9_tree_index cat6_high10[32] = {2, 2, 4, 4, 6, 6, 8, 8, 10, 10,
+ 12, 12, 14, 14, 16, 16, 18, 18, 20, 20, 22, 22, 24, 24, 26, 26, 28, 28,
+ 30, 30, 0, 0};
+static const vp9_tree_index cat1_high12[2] = {0, 0};
+static const vp9_tree_index cat2_high12[4] = {2, 2, 0, 0};
+static const vp9_tree_index cat3_high12[6] = {2, 2, 4, 4, 0, 0};
+static const vp9_tree_index cat4_high12[8] = {2, 2, 4, 4, 6, 6, 0, 0};
+static const vp9_tree_index cat5_high12[10] = {2, 2, 4, 4, 6, 6, 8, 8, 0, 0};
+static const vp9_tree_index cat6_high12[36] = {2, 2, 4, 4, 6, 6, 8, 8, 10, 10,
+ 12, 12, 14, 14, 16, 16, 18, 18, 20, 20, 22, 22, 24, 24, 26, 26, 28, 28,
+ 30, 30, 32, 32, 34, 34, 0, 0};
#endif
-}
const vp9_extra_bit vp9_extra_bits[ENTROPY_TOKENS] = {
- {0, 0, 0, 0}, // ZERO_TOKEN
- {0, 0, 0, 1}, // ONE_TOKEN
- {0, 0, 0, 2}, // TWO_TOKEN
- {0, 0, 0, 3}, // THREE_TOKEN
- {0, 0, 0, 4}, // FOUR_TOKEN
- {cat1, vp9_cat1_prob, 1, CAT1_MIN_VAL}, // CATEGORY1_TOKEN
- {cat2, vp9_cat2_prob, 2, CAT2_MIN_VAL}, // CATEGORY2_TOKEN
- {cat3, vp9_cat3_prob, 3, CAT3_MIN_VAL}, // CATEGORY3_TOKEN
- {cat4, vp9_cat4_prob, 4, CAT4_MIN_VAL}, // CATEGORY4_TOKEN
- {cat5, vp9_cat5_prob, 5, CAT5_MIN_VAL}, // CATEGORY5_TOKEN
- {cat6, vp9_cat6_prob, 14, CAT6_MIN_VAL}, // CATEGORY6_TOKEN
- {0, 0, 0, 0} // EOB_TOKEN
+ {0, 0, 0, 0, zero_cost}, // ZERO_TOKEN
+ {0, 0, 0, 1, one_cost}, // ONE_TOKEN
+ {0, 0, 0, 2, two_cost}, // TWO_TOKEN
+ {0, 0, 0, 3, three_cost}, // THREE_TOKEN
+ {0, 0, 0, 4, four_cost}, // FOUR_TOKEN
+ {cat1, vp9_cat1_prob, 1, CAT1_MIN_VAL, cat1_cost}, // CATEGORY1_TOKEN
+ {cat2, vp9_cat2_prob, 2, CAT2_MIN_VAL, cat2_cost}, // CATEGORY2_TOKEN
+ {cat3, vp9_cat3_prob, 3, CAT3_MIN_VAL, cat3_cost}, // CATEGORY3_TOKEN
+ {cat4, vp9_cat4_prob, 4, CAT4_MIN_VAL, cat4_cost}, // CATEGORY4_TOKEN
+ {cat5, vp9_cat5_prob, 5, CAT5_MIN_VAL, cat5_cost}, // CATEGORY5_TOKEN
+ {cat6, vp9_cat6_prob, 14, CAT6_MIN_VAL, 0}, // CATEGORY6_TOKEN
+ {0, 0, 0, 0, zero_cost} // EOB_TOKEN
};
#if CONFIG_VP9_HIGHBITDEPTH
const vp9_extra_bit vp9_extra_bits_high10[ENTROPY_TOKENS] = {
- {0, 0, 0, 0}, // ZERO_TOKEN
- {0, 0, 0, 1}, // ONE_TOKEN
- {0, 0, 0, 2}, // TWO_TOKEN
- {0, 0, 0, 3}, // THREE_TOKEN
- {0, 0, 0, 4}, // FOUR_TOKEN
- {cat1_high10, vp9_cat1_prob_high10, 1, CAT1_MIN_VAL}, // CATEGORY1_TOKEN
- {cat2_high10, vp9_cat2_prob_high10, 2, CAT2_MIN_VAL}, // CATEGORY2_TOKEN
- {cat3_high10, vp9_cat3_prob_high10, 3, CAT3_MIN_VAL}, // CATEGORY3_TOKEN
- {cat4_high10, vp9_cat4_prob_high10, 4, CAT4_MIN_VAL}, // CATEGORY4_TOKEN
- {cat5_high10, vp9_cat5_prob_high10, 5, CAT5_MIN_VAL}, // CATEGORY5_TOKEN
- {cat6_high10, vp9_cat6_prob_high10, 16, CAT6_MIN_VAL}, // CATEGORY6_TOKEN
- {0, 0, 0, 0} // EOB_TOKEN
+ {0, 0, 0, 0, zero_cost}, // ZERO
+ {0, 0, 0, 1, one_cost}, // ONE
+ {0, 0, 0, 2, two_cost}, // TWO
+ {0, 0, 0, 3, three_cost}, // THREE
+ {0, 0, 0, 4, four_cost}, // FOUR
+ {cat1_high10, vp9_cat1_prob_high10, 1, CAT1_MIN_VAL, cat1_cost}, // CAT1
+ {cat2_high10, vp9_cat2_prob_high10, 2, CAT2_MIN_VAL, cat2_cost}, // CAT2
+ {cat3_high10, vp9_cat3_prob_high10, 3, CAT3_MIN_VAL, cat3_cost}, // CAT3
+ {cat4_high10, vp9_cat4_prob_high10, 4, CAT4_MIN_VAL, cat4_cost}, // CAT4
+ {cat5_high10, vp9_cat5_prob_high10, 5, CAT5_MIN_VAL, cat5_cost}, // CAT5
+ {cat6_high10, vp9_cat6_prob_high10, 16, CAT6_MIN_VAL, 0}, // CAT6
+ {0, 0, 0, 0, zero_cost} // EOB
};
const vp9_extra_bit vp9_extra_bits_high12[ENTROPY_TOKENS] = {
- {0, 0, 0, 0}, // ZERO_TOKEN
- {0, 0, 0, 1}, // ONE_TOKEN
- {0, 0, 0, 2}, // TWO_TOKEN
- {0, 0, 0, 3}, // THREE_TOKEN
- {0, 0, 0, 4}, // FOUR_TOKEN
- {cat1_high12, vp9_cat1_prob_high12, 1, CAT1_MIN_VAL}, // CATEGORY1_TOKEN
- {cat2_high12, vp9_cat2_prob_high12, 2, CAT2_MIN_VAL}, // CATEGORY2_TOKEN
- {cat3_high12, vp9_cat3_prob_high12, 3, CAT3_MIN_VAL}, // CATEGORY3_TOKEN
- {cat4_high12, vp9_cat4_prob_high12, 4, CAT4_MIN_VAL}, // CATEGORY4_TOKEN
- {cat5_high12, vp9_cat5_prob_high12, 5, CAT5_MIN_VAL}, // CATEGORY5_TOKEN
- {cat6_high12, vp9_cat6_prob_high12, 18, CAT6_MIN_VAL}, // CATEGORY6_TOKEN
- {0, 0, 0, 0} // EOB_TOKEN
+ {0, 0, 0, 0, zero_cost}, // ZERO
+ {0, 0, 0, 1, one_cost}, // ONE
+ {0, 0, 0, 2, two_cost}, // TWO
+ {0, 0, 0, 3, three_cost}, // THREE
+ {0, 0, 0, 4, four_cost}, // FOUR
+ {cat1_high12, vp9_cat1_prob_high12, 1, CAT1_MIN_VAL, cat1_cost}, // CAT1
+ {cat2_high12, vp9_cat2_prob_high12, 2, CAT2_MIN_VAL, cat2_cost}, // CAT2
+ {cat3_high12, vp9_cat3_prob_high12, 3, CAT3_MIN_VAL, cat3_cost}, // CAT3
+ {cat4_high12, vp9_cat4_prob_high12, 4, CAT4_MIN_VAL, cat4_cost}, // CAT4
+ {cat5_high12, vp9_cat5_prob_high12, 5, CAT5_MIN_VAL, cat5_cost}, // CAT5
+ {cat6_high12, vp9_cat6_prob_high12, 18, CAT6_MIN_VAL, 0}, // CAT6
+ {0, 0, 0, 0, zero_cost} // EOB
};
#endif
-struct vp9_token vp9_coef_encodings[ENTROPY_TOKENS];
-
-void vp9_coef_tree_initialize() {
- init_bit_trees();
- vp9_tokens_from_tree(vp9_coef_encodings, vp9_coef_tree);
-}
-
-static void tokenize_init_one(TOKENVALUE *t, const vp9_extra_bit *const e,
- int16_t *value_cost, int max_value) {
- int i = -max_value;
- int sign = 1;
-
- do {
- if (!i)
- sign = 0;
-
- {
- const int a = sign ? -i : i;
- int eb = sign;
-
- if (a > 4) {
- int j = 4;
-
- while (++j < 11 && e[j].base_val <= a) {}
-
- t[i].token = --j;
- eb |= (a - e[j].base_val) << 1;
- } else {
- t[i].token = a;
- }
- t[i].extra = eb;
- }
-
- // Initialize the cost for extra bits for all possible coefficient values.
- {
- int cost = 0;
- const vp9_extra_bit *p = &e[t[i].token];
-
- if (p->base_val) {
- const int extra = t[i].extra;
- const int length = p->len;
-
- if (length)
- cost += treed_cost(p->tree, p->prob, extra >> 1, length);
-
- cost += vp9_cost_bit(vp9_prob_half, extra & 1); /* sign */
- value_cost[i] = cost;
- }
- }
- } while (++i < max_value);
-}
-
-void vp9_tokenize_initialize() {
- vp9_dct_value_tokens_ptr = dct_value_tokens + DCT_MAX_VALUE;
- vp9_dct_value_cost_ptr = dct_value_cost + DCT_MAX_VALUE;
+const struct vp9_token vp9_coef_encodings[ENTROPY_TOKENS] = {
+ {2, 2}, {6, 3}, {28, 5}, {58, 6}, {59, 6}, {60, 6}, {61, 6}, {124, 7},
+ {125, 7}, {126, 7}, {127, 7}, {0, 1}
+};
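The {value, len} pairs above are the codewords a walk of vp9_coef_tree yields, previously built at runtime by vp9_tokens_from_tree() in the removed vp9_coef_tree_initialize(). A hedged sketch of such a tree walk (modeled on libvpx's generic routine; the upstream signature may differ):

struct tok { int value; int len; };
typedef signed char tree_index;  /* stand-in for vp9_tree_index */

/* Appends one bit per branch and records the accumulated path at each
 * leaf; leaves are encoded as non-positive entries, -j being the token
 * index. A sketch, not the upstream implementation. */
static void tree_to_tokens(struct tok *tokens, const tree_index *tree,
                           tree_index i, int v, int l) {
  v += v;  /* append a 0 bit */
  ++l;
  do {
    const tree_index j = tree[i++];
    if (j <= 0) {
      tokens[-j].value = v;
      tokens[-j].len = l;
    } else {
      tree_to_tokens(tokens, tree, j, v, l);
    }
  } while (++v & 1);  /* second pass with the 1 bit appended */
}
/* usage sketch: tree_to_tokens(tokens, vp9_coef_tree, 0, 0, 0); */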
- tokenize_init_one(dct_value_tokens + DCT_MAX_VALUE, vp9_extra_bits,
- dct_value_cost + DCT_MAX_VALUE, DCT_MAX_VALUE);
-#if CONFIG_VP9_HIGHBITDEPTH
- vp9_dct_value_tokens_high10_ptr = dct_value_tokens_high10 +
- DCT_MAX_VALUE_HIGH10;
- vp9_dct_value_cost_high10_ptr = dct_value_cost_high10 + DCT_MAX_VALUE_HIGH10;
-
- tokenize_init_one(dct_value_tokens_high10 + DCT_MAX_VALUE_HIGH10,
- vp9_extra_bits_high10,
- dct_value_cost_high10 + DCT_MAX_VALUE_HIGH10,
- DCT_MAX_VALUE_HIGH10);
- vp9_dct_value_tokens_high12_ptr = dct_value_tokens_high12 +
- DCT_MAX_VALUE_HIGH12;
- vp9_dct_value_cost_high12_ptr = dct_value_cost_high12 + DCT_MAX_VALUE_HIGH12;
-
- tokenize_init_one(dct_value_tokens_high12 + DCT_MAX_VALUE_HIGH12,
- vp9_extra_bits_high12,
- dct_value_cost_high12 + DCT_MAX_VALUE_HIGH12,
- DCT_MAX_VALUE_HIGH12);
-#endif
-}
struct tokenize_b_args {
VP9_COMP *cpi;
- MACROBLOCKD *xd;
+ ThreadData *td;
TOKENEXTRA **tp;
};
static void set_entropy_context_b(int plane, int block, BLOCK_SIZE plane_bsize,
TX_SIZE tx_size, void *arg) {
struct tokenize_b_args* const args = arg;
- MACROBLOCKD *const xd = args->xd;
- struct macroblock_plane *p = &args->cpi->mb.plane[plane];
+ ThreadData *const td = args->td;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ struct macroblock_plane *p = &x->plane[plane];
struct macroblockd_plane *pd = &xd->plane[plane];
int aoff, loff;
txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &aoff, &loff);
@@ -294,12 +490,14 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE plane_bsize,
TX_SIZE tx_size, void *arg) {
struct tokenize_b_args* const args = arg;
VP9_COMP *cpi = args->cpi;
- MACROBLOCKD *xd = args->xd;
+ ThreadData *const td = args->td;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
TOKENEXTRA **tp = args->tp;
uint8_t token_cache[32 * 32];
- struct macroblock_plane *p = &cpi->mb.plane[plane];
+ struct macroblock_plane *p = &x->plane[plane];
struct macroblockd_plane *pd = &xd->plane[plane];
- MB_MODE_INFO *mbmi = &xd->mi[0].src_mi->mbmi;
+ MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
int pt; /* near block/prev token context index */
int c;
TOKENEXTRA *t = *tp; /* store tokens starting here */
@@ -311,15 +509,15 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE plane_bsize,
const scan_order *so;
const int ref = is_inter_block(mbmi);
unsigned int (*const counts)[COEFF_CONTEXTS][ENTROPY_TOKENS] =
- cpi->coef_counts[tx_size][type][ref];
+ td->rd_counts.coef_counts[tx_size][type][ref];
vp9_prob (*const coef_probs)[COEFF_CONTEXTS][UNCONSTRAINED_NODES] =
- cpi->common.fc.coef_probs[tx_size][type][ref];
+ cpi->common.fc->coef_probs[tx_size][type][ref];
unsigned int (*const eob_branch)[COEFF_CONTEXTS] =
- cpi->common.counts.eob_branch[tx_size][type][ref];
+ td->counts->eob_branch[tx_size][type][ref];
const uint8_t *const band = get_band_translate(tx_size);
const int seg_eob = get_tx_eob(&cpi->common.seg, segment_id, tx_size);
- const TOKENVALUE *dct_value_tokens;
-
+ int16_t token;
+ EXTRABIT extra;
int aoff, loff;
txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &aoff, &loff);
@@ -329,17 +527,6 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE plane_bsize,
scan = so->scan;
nb = so->neighbors;
c = 0;
-#if CONFIG_VP9_HIGHBITDEPTH
- if (cpi->common.profile >= PROFILE_2) {
- dct_value_tokens = (cpi->common.bit_depth == VPX_BITS_10 ?
- vp9_dct_value_tokens_high10_ptr :
- vp9_dct_value_tokens_high12_ptr);
- } else {
- dct_value_tokens = vp9_dct_value_tokens_ptr;
- }
-#else
- dct_value_tokens = vp9_dct_value_tokens_ptr;
-#endif
while (c < eob) {
int v = 0;
@@ -358,14 +545,13 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE plane_bsize,
v = qcoeff[scan[c]];
}
- add_token(&t, coef_probs[band[c]][pt],
- dct_value_tokens[v].extra,
- (uint8_t)dct_value_tokens[v].token,
- (uint8_t)skip_eob,
- counts[band[c]][pt]);
+ vp9_get_token_extra(v, &token, &extra);
+
+ add_token(&t, coef_probs[band[c]][pt], extra, (uint8_t)token,
+ (uint8_t)skip_eob, counts[band[c]][pt]);
eob_branch[band[c]][pt] += !skip_eob;
- token_cache[scan[c]] = vp9_pt_energy_class[dct_value_tokens[v].token];
+ token_cache[scan[c]] = vp9_pt_energy_class[token];
++c;
pt = get_coef_context(nb, token_cache, c);
}
@@ -421,30 +607,27 @@ int vp9_has_high_freq_in_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) {
return result;
}
-void vp9_tokenize_sb(VP9_COMP *cpi, TOKENEXTRA **t, int dry_run,
- BLOCK_SIZE bsize) {
+void vp9_tokenize_sb(VP9_COMP *cpi, ThreadData *td, TOKENEXTRA **t,
+ int dry_run, BLOCK_SIZE bsize) {
VP9_COMMON *const cm = &cpi->common;
- MACROBLOCKD *const xd = &cpi->mb.e_mbd;
- MB_MODE_INFO *const mbmi = &xd->mi[0].src_mi->mbmi;
- TOKENEXTRA *t_backup = *t;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
const int ctx = vp9_get_skip_context(xd);
const int skip_inc = !vp9_segfeature_active(&cm->seg, mbmi->segment_id,
SEG_LVL_SKIP);
- struct tokenize_b_args arg = {cpi, xd, t};
+ struct tokenize_b_args arg = {cpi, td, t};
if (mbmi->skip) {
if (!dry_run)
- cm->counts.skip[ctx][1] += skip_inc;
+ td->counts->skip[ctx][1] += skip_inc;
reset_skip_context(xd, bsize);
- if (dry_run)
- *t = t_backup;
return;
}
if (!dry_run) {
- cm->counts.skip[ctx][0] += skip_inc;
+ td->counts->skip[ctx][0] += skip_inc;
vp9_foreach_transformed_block(xd, bsize, tokenize_b, &arg);
} else {
vp9_foreach_transformed_block(xd, bsize, set_entropy_context_b, &arg);
- *t = t_backup;
}
}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_tokenize.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_tokenize.h
index 825252bac22..81cc2e13f99 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_tokenize.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_tokenize.h
@@ -20,41 +20,39 @@
extern "C" {
#endif
-void vp9_tokenize_initialize();
-
#define EOSB_TOKEN 127 // Not signalled, encoder only
-typedef struct {
- int16_t token;
#if CONFIG_VP9_HIGHBITDEPTH
- int32_t extra;
+ typedef int32_t EXTRABIT;
#else
- int16_t extra;
+ typedef int16_t EXTRABIT;
#endif
+
+
+typedef struct {
+ int16_t token;
+ EXTRABIT extra;
} TOKENVALUE;
typedef struct {
const vp9_prob *context_tree;
-#if CONFIG_VP9_HIGHBITDEPTH
- int32_t extra;
-#else
- int16_t extra;
-#endif
- uint8_t token;
- uint8_t skip_eob_node;
+ EXTRABIT extra;
+ uint8_t token;
+ uint8_t skip_eob_node;
} TOKENEXTRA;
extern const vp9_tree_index vp9_coef_tree[];
extern const vp9_tree_index vp9_coef_con_tree[];
-extern struct vp9_token vp9_coef_encodings[];
+extern const struct vp9_token vp9_coef_encodings[];
int vp9_is_skippable_in_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane);
int vp9_has_high_freq_in_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane);
struct VP9_COMP;
+struct ThreadData;
-void vp9_tokenize_sb(struct VP9_COMP *cpi, TOKENEXTRA **t, int dry_run,
- BLOCK_SIZE bsize);
+void vp9_tokenize_sb(struct VP9_COMP *cpi, struct ThreadData *td,
+ TOKENEXTRA **t, int dry_run, BLOCK_SIZE bsize);
extern const int16_t *vp9_dct_value_cost_ptr;
/* TODO: The Token field should be broken out into a separate char array to
@@ -62,13 +60,51 @@ extern const int16_t *vp9_dct_value_cost_ptr;
* fields are not.
*/
extern const TOKENVALUE *vp9_dct_value_tokens_ptr;
+extern const TOKENVALUE *vp9_dct_cat_lt_10_value_tokens;
+extern const int16_t vp9_cat6_low_cost[256];
+extern const int16_t vp9_cat6_high_cost[128];
+extern const int16_t vp9_cat6_high10_high_cost[512];
+extern const int16_t vp9_cat6_high12_high_cost[2048];
+static INLINE int16_t vp9_get_cost(int16_t token, EXTRABIT extrabits,
+ const int16_t *cat6_high_table) {
+ if (token != CATEGORY6_TOKEN)
+ return vp9_extra_bits[token].cost[extrabits];
+ return vp9_cat6_low_cost[extrabits & 0xff]
+ + cat6_high_table[extrabits >> 8];
+}
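vp9_get_cost() above splits a CATEGORY6 extra-bits value between two precomputed tables: the low byte indexes the 256-entry vp9_cat6_low_cost, and the remaining bits index the bit-depth specific high table returned by vp9_get_high_cost_table(). A minimal sketch of that split (table pointers are parameters here, not the real symbols):

/* low_table has 256 entries; high_table is sized by bit depth
 * (128, 512 or 2048 entries, per the externs above). */
static int cat6_cost_split(int extrabits,
                           const short *low_table,
                           const short *high_table) {
  return low_table[extrabits & 0xff] + high_table[extrabits >> 8];
}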
+
#if CONFIG_VP9_HIGHBITDEPTH
-extern const int16_t *vp9_dct_value_cost_high10_ptr;
-extern const TOKENVALUE *vp9_dct_value_tokens_high10_ptr;
-extern const int16_t *vp9_dct_value_cost_high12_ptr;
-extern const TOKENVALUE *vp9_dct_value_tokens_high12_ptr;
+static INLINE const int16_t* vp9_get_high_cost_table(int bit_depth) {
+ return bit_depth == 8 ? vp9_cat6_high_cost
+ : (bit_depth == 10 ? vp9_cat6_high10_high_cost :
+ vp9_cat6_high12_high_cost);
+}
+#else
+static INLINE const int16_t* vp9_get_high_cost_table(int bit_depth) {
+ (void) bit_depth;
+ return vp9_cat6_high_cost;
+}
#endif // CONFIG_VP9_HIGHBITDEPTH
+static INLINE void vp9_get_token_extra(int v, int16_t *token, EXTRABIT *extra) {
+ if (v >= CAT6_MIN_VAL || v <= -CAT6_MIN_VAL) {
+ *token = CATEGORY6_TOKEN;
+ if (v >= CAT6_MIN_VAL)
+ *extra = 2 * v - 2 * CAT6_MIN_VAL;
+ else
+ *extra = -2 * v - 2 * CAT6_MIN_VAL + 1;
+ return;
+ }
+ *token = vp9_dct_cat_lt_10_value_tokens[v].token;
+ *extra = vp9_dct_cat_lt_10_value_tokens[v].extra;
+}
+static INLINE int16_t vp9_get_token(int v) {
+ if (v >= CAT6_MIN_VAL || v <= -CAT6_MIN_VAL)
+ return 10;
+ return vp9_dct_cat_lt_10_value_tokens[v].token;
+}
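The CATEGORY6 branch of vp9_get_token_extra() above packs 2 * (|v| - CAT6_MIN_VAL) into the extra field with the sign of v in bit 0; the adjacent even/odd entries throughout the cost tables in the vp9_tokenize.c hunk above (e.g. 1638, 1640) are the two settings of that sign bit. A minimal sketch of the packing, with min_val as a stand-in for the real CAT6_MIN_VAL macro:

/* bit 0 carries the sign (0 for positive v, 1 for negative),
 * matching both branches of vp9_get_token_extra() above */
static int pack_cat6_extra(int v, int min_val) {
  return v >= 0 ? 2 * (v - min_val) : 2 * (-v - min_val) + 1;
}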
+
+
#ifdef __cplusplus
} // extern "C"
#endif
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_variance.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_variance.c
index 4555bde1e7e..f38f96d6c26 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_variance.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_variance.c
@@ -145,7 +145,7 @@ unsigned int vp9_sub_pixel_avg_variance##W##x##H##_c( \
const uint8_t *second_pred) { \
uint16_t fdata3[(H + 1) * W]; \
uint8_t temp2[H * W]; \
- DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, H * W); \
+ DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \
\
var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, W, \
BILINEAR_FILTERS_2TAP(xoffset)); \
@@ -298,8 +298,8 @@ void highbd_variance(const uint8_t *a8, int a_stride,
uint64_t sse_long = 0;
uint64_t sum_long = 0;
highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
- *sse = sse_long;
- *sum = sum_long;
+ *sse = (unsigned int)sse_long;
+ *sum = (int)sum_long;
}
void highbd_10_variance(const uint8_t *a8, int a_stride,
@@ -309,8 +309,8 @@ void highbd_10_variance(const uint8_t *a8, int a_stride,
uint64_t sse_long = 0;
uint64_t sum_long = 0;
highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
- *sum = ROUND_POWER_OF_TWO(sum_long, 2);
- *sse = ROUND_POWER_OF_TWO(sse_long, 4);
+ *sum = (int)ROUND_POWER_OF_TWO(sum_long, 2);
+ *sse = (unsigned int)ROUND_POWER_OF_TWO(sse_long, 4);
}
void highbd_12_variance(const uint8_t *a8, int a_stride,
@@ -320,8 +320,8 @@ void highbd_12_variance(const uint8_t *a8, int a_stride,
uint64_t sse_long = 0;
uint64_t sum_long = 0;
highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
- *sum = ROUND_POWER_OF_TWO(sum_long, 4);
- *sse = ROUND_POWER_OF_TWO(sse_long, 8);
+ *sum = (int)ROUND_POWER_OF_TWO(sum_long, 4);
+ *sse = (unsigned int)ROUND_POWER_OF_TWO(sse_long, 8);
}
static void highbd_var_filter_block2d_bil_first_pass(
@@ -464,7 +464,7 @@ unsigned int vp9_highbd_sub_pixel_avg_variance##W##x##H##_c( \
const uint8_t *second_pred) { \
uint16_t fdata3[(H + 1) * W]; \
uint16_t temp2[H * W]; \
- DECLARE_ALIGNED_ARRAY(16, uint16_t, temp3, H * W); \
+ DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
\
highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
W, BILINEAR_FILTERS_2TAP(xoffset)); \
@@ -486,7 +486,7 @@ unsigned int vp9_highbd_10_sub_pixel_avg_variance##W##x##H##_c( \
const uint8_t *second_pred) { \
uint16_t fdata3[(H + 1) * W]; \
uint16_t temp2[H * W]; \
- DECLARE_ALIGNED_ARRAY(16, uint16_t, temp3, H * W); \
+ DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
\
highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
W, BILINEAR_FILTERS_2TAP(xoffset)); \
@@ -508,7 +508,7 @@ unsigned int vp9_highbd_12_sub_pixel_avg_variance##W##x##H##_c( \
const uint8_t *second_pred) { \
uint16_t fdata3[(H + 1) * W]; \
uint16_t temp2[H * W]; \
- DECLARE_ALIGNED_ARRAY(16, uint16_t, temp3, H * W); \
+ DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
\
highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
W, BILINEAR_FILTERS_2TAP(xoffset)); \
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_avg_intrin_sse2.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_avg_intrin_sse2.c
index ca6cf1ac916..4672aa6b8cf 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_avg_intrin_sse2.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_avg_intrin_sse2.c
@@ -11,6 +11,83 @@
#include <emmintrin.h>
#include "vpx_ports/mem.h"
+void vp9_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp,
+ int *min, int *max) {
+ __m128i u0, s0, d0, diff, maxabsdiff, minabsdiff, negdiff, absdiff0, absdiff;
+ u0 = _mm_setzero_si128();
+ // Row 0
+ s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0);
+ d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d)), u0);
+ diff = _mm_subs_epi16(s0, d0);
+ negdiff = _mm_subs_epi16(u0, diff);
+ absdiff0 = _mm_max_epi16(diff, negdiff);
+ // Row 1
+ s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0);
+ d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + dp)), u0);
+ diff = _mm_subs_epi16(s0, d0);
+ negdiff = _mm_subs_epi16(u0, diff);
+ absdiff = _mm_max_epi16(diff, negdiff);
+ maxabsdiff = _mm_max_epi16(absdiff0, absdiff);
+ minabsdiff = _mm_min_epi16(absdiff0, absdiff);
+ // Row 2
+ s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0);
+ d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 2 * dp)), u0);
+ diff = _mm_subs_epi16(s0, d0);
+ negdiff = _mm_subs_epi16(u0, diff);
+ absdiff = _mm_max_epi16(diff, negdiff);
+ maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
+ minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
+ // Row 3
+ s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0);
+ d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 3 * dp)), u0);
+ diff = _mm_subs_epi16(s0, d0);
+ negdiff = _mm_subs_epi16(u0, diff);
+ absdiff = _mm_max_epi16(diff, negdiff);
+ maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
+ minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
+ // Row 4
+ s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 4 * p)), u0);
+ d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 4 * dp)), u0);
+ diff = _mm_subs_epi16(s0, d0);
+ negdiff = _mm_subs_epi16(u0, diff);
+ absdiff = _mm_max_epi16(diff, negdiff);
+ maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
+ minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
+ // Row 5
+ s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 5 * p)), u0);
+ d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 5 * dp)), u0);
+ diff = _mm_subs_epi16(s0, d0);
+ negdiff = _mm_subs_epi16(u0, diff);
+ absdiff = _mm_max_epi16(diff, negdiff);
+ maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
+ minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
+ // Row 6
+ s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 6 * p)), u0);
+ d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 6 * dp)), u0);
+ diff = _mm_subs_epi16(s0, d0);
+ negdiff = _mm_subs_epi16(u0, diff);
+ absdiff = _mm_max_epi16(diff, negdiff);
+ maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
+ minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
+ // Row 7
+ s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 7 * p)), u0);
+ d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 7 * dp)), u0);
+ diff = _mm_subs_epi16(s0, d0);
+ negdiff = _mm_subs_epi16(u0, diff);
+ absdiff = _mm_max_epi16(diff, negdiff);
+ maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
+ minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
+
+ maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_si128(maxabsdiff, 8));
+ maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_epi64(maxabsdiff, 32));
+ maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_epi64(maxabsdiff, 16));
+ *max = _mm_extract_epi16(maxabsdiff, 0);
+
+ minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_si128(minabsdiff, 8));
+ minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_epi64(minabsdiff, 32));
+ minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_epi64(minabsdiff, 16));
+ *min = _mm_extract_epi16(minabsdiff, 0);
+}
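A scalar sketch of what the SSE2 kernel above computes — the minimum and maximum absolute source/prediction difference over an 8x8 block — usable as a cross-check harness (a sketch, not the upstream C fallback):

#include <stdlib.h>  /* abs() */

static void minmax_8x8_c_sketch(const unsigned char *s, int p,
                                const unsigned char *d, int dp,
                                int *min, int *max) {
  int r, c;
  *min = 255;
  *max = 0;
  for (r = 0; r < 8; ++r) {
    for (c = 0; c < 8; ++c) {
      const int diff = abs(s[r * p + c] - d[r * dp + c]);
      if (diff < *min) *min = diff;
      if (diff > *max) *max = diff;
    }
  }
}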
unsigned int vp9_avg_8x8_sse2(const uint8_t *s, int p) {
__m128i s0, s1, u0;
@@ -38,3 +115,307 @@ unsigned int vp9_avg_8x8_sse2(const uint8_t *s, int p) {
avg = _mm_extract_epi16(s0, 0);
return (avg + 32) >> 6;
}
+
+unsigned int vp9_avg_4x4_sse2(const uint8_t *s, int p) {
+ __m128i s0, s1, u0;
+ unsigned int avg = 0;
+ u0 = _mm_setzero_si128();
+ s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0);
+ s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0);
+ s0 = _mm_adds_epu16(s0, s1);
+ s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0);
+ s0 = _mm_adds_epu16(s0, s1);
+ s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0);
+ s0 = _mm_adds_epu16(s0, s1);
+
+ s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 4));
+ s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 16));
+ avg = _mm_extract_epi16(s0, 0);
+ return (avg + 8) >> 4;
+}
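The 4x4 average reduces to a 16-pixel sum with round-to-nearest on the divide by 16; a scalar sketch of the kernel above:

static unsigned int avg_4x4_c_sketch(const unsigned char *s, int p) {
  int r, c, sum = 0;
  for (r = 0; r < 4; ++r)
    for (c = 0; c < 4; ++c)
      sum += s[r * p + c];
  return (sum + 8) >> 4;  /* matches the (avg + 8) >> 4 rounding above */
}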
+
+static void hadamard_col8_sse2(__m128i *in, int iter) {
+ __m128i a0 = in[0];
+ __m128i a1 = in[1];
+ __m128i a2 = in[2];
+ __m128i a3 = in[3];
+ __m128i a4 = in[4];
+ __m128i a5 = in[5];
+ __m128i a6 = in[6];
+ __m128i a7 = in[7];
+
+ __m128i b0 = _mm_add_epi16(a0, a1);
+ __m128i b1 = _mm_sub_epi16(a0, a1);
+ __m128i b2 = _mm_add_epi16(a2, a3);
+ __m128i b3 = _mm_sub_epi16(a2, a3);
+ __m128i b4 = _mm_add_epi16(a4, a5);
+ __m128i b5 = _mm_sub_epi16(a4, a5);
+ __m128i b6 = _mm_add_epi16(a6, a7);
+ __m128i b7 = _mm_sub_epi16(a6, a7);
+
+ a0 = _mm_add_epi16(b0, b2);
+ a1 = _mm_add_epi16(b1, b3);
+ a2 = _mm_sub_epi16(b0, b2);
+ a3 = _mm_sub_epi16(b1, b3);
+ a4 = _mm_add_epi16(b4, b6);
+ a5 = _mm_add_epi16(b5, b7);
+ a6 = _mm_sub_epi16(b4, b6);
+ a7 = _mm_sub_epi16(b5, b7);
+
+ if (iter == 0) {
+ b0 = _mm_add_epi16(a0, a4);
+ b7 = _mm_add_epi16(a1, a5);
+ b3 = _mm_add_epi16(a2, a6);
+ b4 = _mm_add_epi16(a3, a7);
+ b2 = _mm_sub_epi16(a0, a4);
+ b6 = _mm_sub_epi16(a1, a5);
+ b1 = _mm_sub_epi16(a2, a6);
+ b5 = _mm_sub_epi16(a3, a7);
+
+ a0 = _mm_unpacklo_epi16(b0, b1);
+ a1 = _mm_unpacklo_epi16(b2, b3);
+ a2 = _mm_unpackhi_epi16(b0, b1);
+ a3 = _mm_unpackhi_epi16(b2, b3);
+ a4 = _mm_unpacklo_epi16(b4, b5);
+ a5 = _mm_unpacklo_epi16(b6, b7);
+ a6 = _mm_unpackhi_epi16(b4, b5);
+ a7 = _mm_unpackhi_epi16(b6, b7);
+
+ b0 = _mm_unpacklo_epi32(a0, a1);
+ b1 = _mm_unpacklo_epi32(a4, a5);
+ b2 = _mm_unpackhi_epi32(a0, a1);
+ b3 = _mm_unpackhi_epi32(a4, a5);
+ b4 = _mm_unpacklo_epi32(a2, a3);
+ b5 = _mm_unpacklo_epi32(a6, a7);
+ b6 = _mm_unpackhi_epi32(a2, a3);
+ b7 = _mm_unpackhi_epi32(a6, a7);
+
+ in[0] = _mm_unpacklo_epi64(b0, b1);
+ in[1] = _mm_unpackhi_epi64(b0, b1);
+ in[2] = _mm_unpacklo_epi64(b2, b3);
+ in[3] = _mm_unpackhi_epi64(b2, b3);
+ in[4] = _mm_unpacklo_epi64(b4, b5);
+ in[5] = _mm_unpackhi_epi64(b4, b5);
+ in[6] = _mm_unpacklo_epi64(b6, b7);
+ in[7] = _mm_unpackhi_epi64(b6, b7);
+ } else {
+ in[0] = _mm_add_epi16(a0, a4);
+ in[7] = _mm_add_epi16(a1, a5);
+ in[3] = _mm_add_epi16(a2, a6);
+ in[4] = _mm_add_epi16(a3, a7);
+ in[2] = _mm_sub_epi16(a0, a4);
+ in[6] = _mm_sub_epi16(a1, a5);
+ in[1] = _mm_sub_epi16(a2, a6);
+ in[5] = _mm_sub_epi16(a3, a7);
+ }
+}
+
+void vp9_hadamard_8x8_sse2(int16_t const *src_diff, int src_stride,
+ int16_t *coeff) {
+ __m128i src[8];
+ src[0] = _mm_load_si128((const __m128i *)src_diff);
+ src[1] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+ src[2] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+ src[3] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+ src[4] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+ src[5] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+ src[6] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+ src[7] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+
+ hadamard_col8_sse2(src, 0);
+ hadamard_col8_sse2(src, 1);
+
+ _mm_store_si128((__m128i *)coeff, src[0]);
+ coeff += 8;
+ _mm_store_si128((__m128i *)coeff, src[1]);
+ coeff += 8;
+ _mm_store_si128((__m128i *)coeff, src[2]);
+ coeff += 8;
+ _mm_store_si128((__m128i *)coeff, src[3]);
+ coeff += 8;
+ _mm_store_si128((__m128i *)coeff, src[4]);
+ coeff += 8;
+ _mm_store_si128((__m128i *)coeff, src[5]);
+ coeff += 8;
+ _mm_store_si128((__m128i *)coeff, src[6]);
+ coeff += 8;
+ _mm_store_si128((__m128i *)coeff, src[7]);
+}
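hadamard_col8_sse2() applies three add/subtract butterfly stages per 16-bit lane; a scalar sketch of the same 8-point transform, including the permuted output order of its second pass (iter == 1) and, like the SSE2 path, no normalization:

static void hadamard8_1d_sketch(const short *in, short *out) {
  /* stage 1 */
  const short b0 = in[0] + in[1], b1 = in[0] - in[1];
  const short b2 = in[2] + in[3], b3 = in[2] - in[3];
  const short b4 = in[4] + in[5], b5 = in[4] - in[5];
  const short b6 = in[6] + in[7], b7 = in[6] - in[7];
  /* stage 2 */
  const short a0 = b0 + b2, a1 = b1 + b3, a2 = b0 - b2, a3 = b1 - b3;
  const short a4 = b4 + b6, a5 = b5 + b7, a6 = b4 - b6, a7 = b5 - b7;
  /* stage 3, with the output permutation of the iter == 1 branch above */
  out[0] = a0 + a4;  out[7] = a1 + a5;
  out[3] = a2 + a6;  out[4] = a3 + a7;
  out[2] = a0 - a4;  out[6] = a1 - a5;
  out[1] = a2 - a6;  out[5] = a3 - a7;
}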
+
+void vp9_hadamard_16x16_sse2(int16_t const *src_diff, int src_stride,
+ int16_t *coeff) {
+ int idx;
+ for (idx = 0; idx < 4; ++idx) {
+ int16_t const *src_ptr = src_diff + (idx >> 1) * 8 * src_stride
+ + (idx & 0x01) * 8;
+ vp9_hadamard_8x8_sse2(src_ptr, src_stride, coeff + idx * 64);
+ }
+
+ for (idx = 0; idx < 64; idx += 8) {
+ __m128i coeff0 = _mm_load_si128((const __m128i *)coeff);
+ __m128i coeff1 = _mm_load_si128((const __m128i *)(coeff + 64));
+ __m128i coeff2 = _mm_load_si128((const __m128i *)(coeff + 128));
+ __m128i coeff3 = _mm_load_si128((const __m128i *)(coeff + 192));
+
+ __m128i b0 = _mm_add_epi16(coeff0, coeff1);
+ __m128i b1 = _mm_sub_epi16(coeff0, coeff1);
+ __m128i b2 = _mm_add_epi16(coeff2, coeff3);
+ __m128i b3 = _mm_sub_epi16(coeff2, coeff3);
+
+ coeff0 = _mm_add_epi16(b0, b2);
+ coeff1 = _mm_add_epi16(b1, b3);
+ coeff0 = _mm_srai_epi16(coeff0, 1);
+ coeff1 = _mm_srai_epi16(coeff1, 1);
+ _mm_store_si128((__m128i *)coeff, coeff0);
+ _mm_store_si128((__m128i *)(coeff + 64), coeff1);
+
+ coeff2 = _mm_sub_epi16(b0, b2);
+ coeff3 = _mm_sub_epi16(b1, b3);
+ coeff2 = _mm_srai_epi16(coeff2, 1);
+ coeff3 = _mm_srai_epi16(coeff3, 1);
+ _mm_store_si128((__m128i *)(coeff + 128), coeff2);
+ _mm_store_si128((__m128i *)(coeff + 192), coeff3);
+
+ coeff += 8;
+ }
+}
+
+int16_t vp9_satd_sse2(const int16_t *coeff, int length) {
+ int i;
+ __m128i sum = _mm_load_si128((const __m128i *)coeff);
+ __m128i sign = _mm_srai_epi16(sum, 15);
+ __m128i val = _mm_xor_si128(sum, sign);
+ sum = _mm_sub_epi16(val, sign);
+ coeff += 8;
+
+ for (i = 8; i < length; i += 8) {
+ __m128i src_line = _mm_load_si128((const __m128i *)coeff);
+ sign = _mm_srai_epi16(src_line, 15);
+ val = _mm_xor_si128(src_line, sign);
+ val = _mm_sub_epi16(val, sign);
+ sum = _mm_add_epi16(sum, val);
+ coeff += 8;
+ }
+
+ val = _mm_srli_si128(sum, 8);
+ sum = _mm_add_epi16(sum, val);
+ val = _mm_srli_epi64(sum, 32);
+ sum = _mm_add_epi16(sum, val);
+ val = _mm_srli_epi32(sum, 16);
+ sum = _mm_add_epi16(sum, val);
+
+ return _mm_extract_epi16(sum, 0);
+}
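vp9_satd_sse2() sums absolute coefficient values, building |x| branchlessly as (x ^ (x >> 15)) - (x >> 15) and accumulating with 16-bit adds; a scalar sketch of the same quantity:

#include <stdlib.h>  /* abs() */

/* sum of absolute values over `length` coefficients (length is a
 * multiple of 8 in the SSE2 path above) */
static int satd_c_sketch(const short *coeff, int length) {
  int i, sum = 0;
  for (i = 0; i < length; ++i)
    sum += abs(coeff[i]);
  return sum;
}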
+
+void vp9_int_pro_row_sse2(int16_t *hbuf, uint8_t const*ref,
+ const int ref_stride, const int height) {
+ int idx;
+ __m128i zero = _mm_setzero_si128();
+ __m128i src_line = _mm_loadu_si128((const __m128i *)ref);
+ __m128i s0 = _mm_unpacklo_epi8(src_line, zero);
+ __m128i s1 = _mm_unpackhi_epi8(src_line, zero);
+ __m128i t0, t1;
+ int height_1 = height - 1;
+ ref += ref_stride;
+
+ for (idx = 1; idx < height_1; idx += 2) {
+ src_line = _mm_loadu_si128((const __m128i *)ref);
+ t0 = _mm_unpacklo_epi8(src_line, zero);
+ t1 = _mm_unpackhi_epi8(src_line, zero);
+ s0 = _mm_adds_epu16(s0, t0);
+ s1 = _mm_adds_epu16(s1, t1);
+ ref += ref_stride;
+
+ src_line = _mm_loadu_si128((const __m128i *)ref);
+ t0 = _mm_unpacklo_epi8(src_line, zero);
+ t1 = _mm_unpackhi_epi8(src_line, zero);
+ s0 = _mm_adds_epu16(s0, t0);
+ s1 = _mm_adds_epu16(s1, t1);
+ ref += ref_stride;
+ }
+
+ src_line = _mm_loadu_si128((const __m128i *)ref);
+ t0 = _mm_unpacklo_epi8(src_line, zero);
+ t1 = _mm_unpackhi_epi8(src_line, zero);
+ s0 = _mm_adds_epu16(s0, t0);
+ s1 = _mm_adds_epu16(s1, t1);
+
+ if (height == 64) {
+ s0 = _mm_srai_epi16(s0, 5);
+ s1 = _mm_srai_epi16(s1, 5);
+ } else if (height == 32) {
+ s0 = _mm_srai_epi16(s0, 4);
+ s1 = _mm_srai_epi16(s1, 4);
+ } else {
+ s0 = _mm_srai_epi16(s0, 3);
+ s1 = _mm_srai_epi16(s1, 3);
+ }
+
+ _mm_storeu_si128((__m128i *)hbuf, s0);
+ hbuf += 8;
+ _mm_storeu_si128((__m128i *)hbuf, s1);
+}
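vp9_int_pro_row_sse2() produces, for each of 16 columns, the column sum over `height` rows scaled by a height-dependent shift (5 for height 64, 4 for 32, 3 otherwise); a scalar sketch that ignores the saturating accumulation of the SSE2 path:

static void int_pro_row_c_sketch(short *hbuf, const unsigned char *ref,
                                 int ref_stride, int height) {
  const int shift = height == 64 ? 5 : (height == 32 ? 4 : 3);
  int r, c;
  for (c = 0; c < 16; ++c) {  /* the SSE2 version covers 16 columns */
    int sum = 0;
    for (r = 0; r < height; ++r)
      sum += ref[r * ref_stride + c];
    hbuf[c] = (short)(sum >> shift);
  }
}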
+
+int16_t vp9_int_pro_col_sse2(uint8_t const *ref, const int width) {
+ __m128i zero = _mm_setzero_si128();
+ __m128i src_line = _mm_load_si128((const __m128i *)ref);
+ __m128i s0 = _mm_sad_epu8(src_line, zero);
+ __m128i s1;
+ int i;
+
+ for (i = 16; i < width; i += 16) {
+ ref += 16;
+ src_line = _mm_load_si128((const __m128i *)ref);
+ s1 = _mm_sad_epu8(src_line, zero);
+ s0 = _mm_adds_epu16(s0, s1);
+ }
+
+ s1 = _mm_srli_si128(s0, 8);
+ s0 = _mm_adds_epu16(s0, s1);
+
+ return _mm_extract_epi16(s0, 0);
+}
+
+int vp9_vector_var_sse2(int16_t const *ref, int16_t const *src,
+ const int bwl) {
+ int idx;
+ int width = 4 << bwl;
+ int16_t mean;
+ __m128i v0 = _mm_loadu_si128((const __m128i *)ref);
+ __m128i v1 = _mm_load_si128((const __m128i *)src);
+ __m128i diff = _mm_subs_epi16(v0, v1);
+ __m128i sum = diff;
+ __m128i sse = _mm_madd_epi16(diff, diff);
+
+ ref += 8;
+ src += 8;
+
+ for (idx = 8; idx < width; idx += 8) {
+ v0 = _mm_loadu_si128((const __m128i *)ref);
+ v1 = _mm_load_si128((const __m128i *)src);
+ diff = _mm_subs_epi16(v0, v1);
+
+ sum = _mm_add_epi16(sum, diff);
+ v0 = _mm_madd_epi16(diff, diff);
+ sse = _mm_add_epi32(sse, v0);
+
+ ref += 8;
+ src += 8;
+ }
+
+ v0 = _mm_srli_si128(sum, 8);
+ sum = _mm_add_epi16(sum, v0);
+ v0 = _mm_srli_epi64(sum, 32);
+ sum = _mm_add_epi16(sum, v0);
+ v0 = _mm_srli_epi32(sum, 16);
+ sum = _mm_add_epi16(sum, v0);
+
+ v1 = _mm_srli_si128(sse, 8);
+ sse = _mm_add_epi32(sse, v1);
+ v1 = _mm_srli_epi64(sse, 32);
+ sse = _mm_add_epi32(sse, v1);
+
+ mean = _mm_extract_epi16(sum, 0);
+
+ return _mm_cvtsi128_si32(sse) - ((mean * mean) >> (bwl + 2));
+}
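vp9_vector_var_sse2() accumulates the ref/src difference and its square over width = 4 << bwl elements, then returns sse minus the squared mean, with the divide by width expressed as >> (bwl + 2); a scalar sketch:

static int vector_var_c_sketch(const short *ref, const short *src, int bwl) {
  const int width = 4 << bwl;
  int i, mean = 0, sse = 0;
  for (i = 0; i < width; ++i) {
    const int diff = ref[i] - src[i];
    mean += diff;
    sse += diff * diff;
  }
  /* mean * mean / width, since width == 1 << (bwl + 2) */
  return sse - ((mean * mean) >> (bwl + 2));
}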
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct32x32_avx2.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct32x32_avx2.c
index 9ea22fed2b7..66827ad8037 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct32x32_avx2.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct32x32_avx2.c
@@ -13,13 +13,14 @@
#include "vpx_ports/mem.h"
#define pair256_set_epi16(a, b) \
- _mm256_set_epi16(b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a)
+ _mm256_set_epi16((int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
+ (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
+ (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \
+ (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a))
#define pair256_set_epi32(a, b) \
- _mm256_set_epi32(b, a, b, a, b, a, b, a)
-
-
-
+ _mm256_set_epi32((int)(b), (int)(a), (int)(b), (int)(a), \
+ (int)(b), (int)(a), (int)(b), (int)(a))
#if FDCT32x32_HIGH_PRECISION
static INLINE __m256i k_madd_epi32_avx2(__m256i a, __m256i b) {
@@ -50,7 +51,7 @@ void FDCT32x32_2D_AVX2(const int16_t *input,
// When we use them, in one case, they are all the same. In all others
// it's a pair of them that we need to repeat four times. This is done
// by constructing the 32 bit constant corresponding to that pair.
- const __m256i k__cospi_p16_p16 = _mm256_set1_epi16(+cospi_16_64);
+ const __m256i k__cospi_p16_p16 = _mm256_set1_epi16((int16_t)cospi_16_64);
const __m256i k__cospi_p16_m16 = pair256_set_epi16(+cospi_16_64, -cospi_16_64);
const __m256i k__cospi_m08_p24 = pair256_set_epi16(-cospi_8_64, cospi_24_64);
const __m256i k__cospi_m24_m08 = pair256_set_epi16(-cospi_24_64, -cospi_8_64);
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct32x32_sse2.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct32x32_sse2.c
index 42fdbbdc5ce..099993aa6a0 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct32x32_sse2.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct32x32_sse2.c
@@ -10,30 +10,50 @@
#include <emmintrin.h> // SSE2
#include "vp9/common/vp9_idct.h" // for cospi constants
+#include "vp9/encoder/x86/vp9_dct_sse2.h"
+#include "vp9/encoder/vp9_dct.h"
#include "vpx_ports/mem.h"
-#define pair_set_epi32(a, b) \
- _mm_set_epi32(b, a, b, a)
-
+#if DCT_HIGH_BIT_DEPTH
+#define ADD_EPI16 _mm_adds_epi16
+#define SUB_EPI16 _mm_subs_epi16
#if FDCT32x32_HIGH_PRECISION
-static INLINE __m128i k_madd_epi32(__m128i a, __m128i b) {
- __m128i buf0, buf1;
- buf0 = _mm_mul_epu32(a, b);
- a = _mm_srli_epi64(a, 32);
- b = _mm_srli_epi64(b, 32);
- buf1 = _mm_mul_epu32(a, b);
- return _mm_add_epi64(buf0, buf1);
+void vp9_fdct32x32_rows_c(const int16_t *intermediate, tran_low_t *out) {
+ int i, j;
+ for (i = 0; i < 32; ++i) {
+ tran_high_t temp_in[32], temp_out[32];
+ for (j = 0; j < 32; ++j)
+ temp_in[j] = intermediate[j * 32 + i];
+ vp9_fdct32(temp_in, temp_out, 0);
+ for (j = 0; j < 32; ++j)
+ out[j + i * 32] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
+ }
}
-
-static INLINE __m128i k_packs_epi64(__m128i a, __m128i b) {
- __m128i buf0 = _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 2, 0));
- __m128i buf1 = _mm_shuffle_epi32(b, _MM_SHUFFLE(0, 0, 2, 0));
- return _mm_unpacklo_epi64(buf0, buf1);
+ #define HIGH_FDCT32x32_2D_C vp9_highbd_fdct32x32_c
+ #define HIGH_FDCT32x32_2D_ROWS_C vp9_fdct32x32_rows_c
+#else
+void vp9_fdct32x32_rd_rows_c(const int16_t *intermediate, tran_low_t *out) {
+ int i, j;
+ for (i = 0; i < 32; ++i) {
+ tran_high_t temp_in[32], temp_out[32];
+ for (j = 0; j < 32; ++j)
+ temp_in[j] = intermediate[j * 32 + i];
+ vp9_fdct32(temp_in, temp_out, 1);
+ for (j = 0; j < 32; ++j)
+ out[j + i * 32] = temp_out[j];
+ }
}
-#endif
+ #define HIGH_FDCT32x32_2D_C vp9_highbd_fdct32x32_rd_c
+ #define HIGH_FDCT32x32_2D_ROWS_C vp9_fdct32x32_rd_rows_c
+#endif // FDCT32x32_HIGH_PRECISION
+#else
+#define ADD_EPI16 _mm_add_epi16
+#define SUB_EPI16 _mm_sub_epi16
+#endif // DCT_HIGH_BIT_DEPTH
+
void FDCT32x32_2D(const int16_t *input,
- int16_t *output_org, int stride) {
+ tran_low_t *output_org, int stride) {
// Calculate pre-multiplied strides
const int str1 = stride;
const int str2 = 2 * stride;
@@ -44,7 +64,7 @@ void FDCT32x32_2D(const int16_t *input,
// When we use them, in one case, they are all the same. In all others
// it's a pair of them that we need to repeat four times. This is done
// by constructing the 32 bit constant corresponding to that pair.
- const __m128i k__cospi_p16_p16 = _mm_set1_epi16(+cospi_16_64);
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
const __m128i k__cospi_p16_m16 = pair_set_epi16(+cospi_16_64, -cospi_16_64);
const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
@@ -84,6 +104,9 @@ void FDCT32x32_2D(const int16_t *input,
const __m128i kOne = _mm_set1_epi16(1);
// Do the two transform/transpose passes
int pass;
+#if DCT_HIGH_BIT_DEPTH
+ int overflow;
+#endif
for (pass = 0; pass < 2; ++pass) {
// We process eight columns (transposed rows in second pass) at a time.
int column_start;
@@ -237,14 +260,23 @@ void FDCT32x32_2D(const int16_t *input,
__m128i in29 = _mm_loadu_si128((const __m128i *)(in + 29 * 32));
__m128i in30 = _mm_loadu_si128((const __m128i *)(in + 30 * 32));
__m128i in31 = _mm_loadu_si128((const __m128i *)(in + 31 * 32));
- step1[ 0] = _mm_add_epi16(in00, in31);
- step1[ 1] = _mm_add_epi16(in01, in30);
- step1[ 2] = _mm_add_epi16(in02, in29);
- step1[ 3] = _mm_add_epi16(in03, in28);
- step1[28] = _mm_sub_epi16(in03, in28);
- step1[29] = _mm_sub_epi16(in02, in29);
- step1[30] = _mm_sub_epi16(in01, in30);
- step1[31] = _mm_sub_epi16(in00, in31);
+ step1[0] = ADD_EPI16(in00, in31);
+ step1[1] = ADD_EPI16(in01, in30);
+ step1[2] = ADD_EPI16(in02, in29);
+ step1[3] = ADD_EPI16(in03, in28);
+ step1[28] = SUB_EPI16(in03, in28);
+ step1[29] = SUB_EPI16(in02, in29);
+ step1[30] = SUB_EPI16(in01, in30);
+ step1[31] = SUB_EPI16(in00, in31);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x8(&step1[0], &step1[1], &step1[2],
+ &step1[3], &step1[28], &step1[29],
+ &step1[30], &step1[31]);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
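+        // These stage-1 loads use a fixed stride of 32, i.e. they read the
+        // transposed intermediate of the second pass (the first pass has
+        // its own load/shift path), so on overflow it suffices to redo just
+        // the row pass in C; stages shared by both passes select the C
+        // fallback based on `pass` instead.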
}
{
__m128i in04 = _mm_loadu_si128((const __m128i *)(in + 4 * 32));
@@ -255,14 +287,23 @@ void FDCT32x32_2D(const int16_t *input,
__m128i in25 = _mm_loadu_si128((const __m128i *)(in + 25 * 32));
__m128i in26 = _mm_loadu_si128((const __m128i *)(in + 26 * 32));
__m128i in27 = _mm_loadu_si128((const __m128i *)(in + 27 * 32));
- step1[ 4] = _mm_add_epi16(in04, in27);
- step1[ 5] = _mm_add_epi16(in05, in26);
- step1[ 6] = _mm_add_epi16(in06, in25);
- step1[ 7] = _mm_add_epi16(in07, in24);
- step1[24] = _mm_sub_epi16(in07, in24);
- step1[25] = _mm_sub_epi16(in06, in25);
- step1[26] = _mm_sub_epi16(in05, in26);
- step1[27] = _mm_sub_epi16(in04, in27);
+ step1[4] = ADD_EPI16(in04, in27);
+ step1[5] = ADD_EPI16(in05, in26);
+ step1[6] = ADD_EPI16(in06, in25);
+ step1[7] = ADD_EPI16(in07, in24);
+ step1[24] = SUB_EPI16(in07, in24);
+ step1[25] = SUB_EPI16(in06, in25);
+ step1[26] = SUB_EPI16(in05, in26);
+ step1[27] = SUB_EPI16(in04, in27);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x8(&step1[4], &step1[5], &step1[6],
+ &step1[7], &step1[24], &step1[25],
+ &step1[26], &step1[27]);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
}
{
__m128i in08 = _mm_loadu_si128((const __m128i *)(in + 8 * 32));
@@ -273,14 +314,23 @@ void FDCT32x32_2D(const int16_t *input,
__m128i in21 = _mm_loadu_si128((const __m128i *)(in + 21 * 32));
__m128i in22 = _mm_loadu_si128((const __m128i *)(in + 22 * 32));
__m128i in23 = _mm_loadu_si128((const __m128i *)(in + 23 * 32));
- step1[ 8] = _mm_add_epi16(in08, in23);
- step1[ 9] = _mm_add_epi16(in09, in22);
- step1[10] = _mm_add_epi16(in10, in21);
- step1[11] = _mm_add_epi16(in11, in20);
- step1[20] = _mm_sub_epi16(in11, in20);
- step1[21] = _mm_sub_epi16(in10, in21);
- step1[22] = _mm_sub_epi16(in09, in22);
- step1[23] = _mm_sub_epi16(in08, in23);
+ step1[8] = ADD_EPI16(in08, in23);
+ step1[9] = ADD_EPI16(in09, in22);
+ step1[10] = ADD_EPI16(in10, in21);
+ step1[11] = ADD_EPI16(in11, in20);
+ step1[20] = SUB_EPI16(in11, in20);
+ step1[21] = SUB_EPI16(in10, in21);
+ step1[22] = SUB_EPI16(in09, in22);
+ step1[23] = SUB_EPI16(in08, in23);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x8(&step1[8], &step1[9], &step1[10],
+ &step1[11], &step1[20], &step1[21],
+ &step1[22], &step1[23]);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
}
{
__m128i in12 = _mm_loadu_si128((const __m128i *)(in + 12 * 32));
@@ -291,34 +341,57 @@ void FDCT32x32_2D(const int16_t *input,
__m128i in17 = _mm_loadu_si128((const __m128i *)(in + 17 * 32));
__m128i in18 = _mm_loadu_si128((const __m128i *)(in + 18 * 32));
__m128i in19 = _mm_loadu_si128((const __m128i *)(in + 19 * 32));
- step1[12] = _mm_add_epi16(in12, in19);
- step1[13] = _mm_add_epi16(in13, in18);
- step1[14] = _mm_add_epi16(in14, in17);
- step1[15] = _mm_add_epi16(in15, in16);
- step1[16] = _mm_sub_epi16(in15, in16);
- step1[17] = _mm_sub_epi16(in14, in17);
- step1[18] = _mm_sub_epi16(in13, in18);
- step1[19] = _mm_sub_epi16(in12, in19);
+ step1[12] = ADD_EPI16(in12, in19);
+ step1[13] = ADD_EPI16(in13, in18);
+ step1[14] = ADD_EPI16(in14, in17);
+ step1[15] = ADD_EPI16(in15, in16);
+ step1[16] = SUB_EPI16(in15, in16);
+ step1[17] = SUB_EPI16(in14, in17);
+ step1[18] = SUB_EPI16(in13, in18);
+ step1[19] = SUB_EPI16(in12, in19);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x8(&step1[12], &step1[13], &step1[14],
+ &step1[15], &step1[16], &step1[17],
+ &step1[18], &step1[19]);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
}
}
// Stage 2
{
- step2[ 0] = _mm_add_epi16(step1[0], step1[15]);
- step2[ 1] = _mm_add_epi16(step1[1], step1[14]);
- step2[ 2] = _mm_add_epi16(step1[2], step1[13]);
- step2[ 3] = _mm_add_epi16(step1[3], step1[12]);
- step2[ 4] = _mm_add_epi16(step1[4], step1[11]);
- step2[ 5] = _mm_add_epi16(step1[5], step1[10]);
- step2[ 6] = _mm_add_epi16(step1[6], step1[ 9]);
- step2[ 7] = _mm_add_epi16(step1[7], step1[ 8]);
- step2[ 8] = _mm_sub_epi16(step1[7], step1[ 8]);
- step2[ 9] = _mm_sub_epi16(step1[6], step1[ 9]);
- step2[10] = _mm_sub_epi16(step1[5], step1[10]);
- step2[11] = _mm_sub_epi16(step1[4], step1[11]);
- step2[12] = _mm_sub_epi16(step1[3], step1[12]);
- step2[13] = _mm_sub_epi16(step1[2], step1[13]);
- step2[14] = _mm_sub_epi16(step1[1], step1[14]);
- step2[15] = _mm_sub_epi16(step1[0], step1[15]);
+ step2[0] = ADD_EPI16(step1[0], step1[15]);
+ step2[1] = ADD_EPI16(step1[1], step1[14]);
+ step2[2] = ADD_EPI16(step1[2], step1[13]);
+ step2[3] = ADD_EPI16(step1[3], step1[12]);
+ step2[4] = ADD_EPI16(step1[4], step1[11]);
+ step2[5] = ADD_EPI16(step1[5], step1[10]);
+ step2[6] = ADD_EPI16(step1[6], step1[ 9]);
+ step2[7] = ADD_EPI16(step1[7], step1[ 8]);
+ step2[8] = SUB_EPI16(step1[7], step1[ 8]);
+ step2[9] = SUB_EPI16(step1[6], step1[ 9]);
+ step2[10] = SUB_EPI16(step1[5], step1[10]);
+ step2[11] = SUB_EPI16(step1[4], step1[11]);
+ step2[12] = SUB_EPI16(step1[3], step1[12]);
+ step2[13] = SUB_EPI16(step1[2], step1[13]);
+ step2[14] = SUB_EPI16(step1[1], step1[14]);
+ step2[15] = SUB_EPI16(step1[0], step1[15]);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x16(
+ &step2[0], &step2[1], &step2[2], &step2[3],
+ &step2[4], &step2[5], &step2[6], &step2[7],
+ &step2[8], &step2[9], &step2[10], &step2[11],
+ &step2[12], &step2[13], &step2[14], &step2[15]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
}
{
const __m128i s2_20_0 = _mm_unpacklo_epi16(step1[27], step1[20]);
@@ -387,6 +460,18 @@ void FDCT32x32_2D(const int16_t *input,
step2[25] = _mm_packs_epi32(s2_25_6, s2_25_7);
step2[26] = _mm_packs_epi32(s2_26_6, s2_26_7);
step2[27] = _mm_packs_epi32(s2_27_6, s2_27_7);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x8(&step2[20], &step2[21], &step2[22],
+ &step2[23], &step2[24], &step2[25],
+ &step2[26], &step2[27]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
}
#if !FDCT32x32_HIGH_PRECISION
@@ -426,49 +511,63 @@ void FDCT32x32_2D(const int16_t *input,
__m128i s3_30_0 = _mm_cmplt_epi16(step1[30], kZero);
__m128i s3_31_0 = _mm_cmplt_epi16(step1[31], kZero);
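        // Each compare mask is 0 or -1 per lane; subtracting a mask adds 1
        // to the negative lanes, and combined with the +kOne and >> 2 below
        // this is the vector form of the scalar rounding
        // (x + 1 + (x < 0)) >> 2.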
- step2[ 0] = _mm_sub_epi16(step2[ 0], s3_00_0);
- step2[ 1] = _mm_sub_epi16(step2[ 1], s3_01_0);
- step2[ 2] = _mm_sub_epi16(step2[ 2], s3_02_0);
- step2[ 3] = _mm_sub_epi16(step2[ 3], s3_03_0);
- step2[ 4] = _mm_sub_epi16(step2[ 4], s3_04_0);
- step2[ 5] = _mm_sub_epi16(step2[ 5], s3_05_0);
- step2[ 6] = _mm_sub_epi16(step2[ 6], s3_06_0);
- step2[ 7] = _mm_sub_epi16(step2[ 7], s3_07_0);
- step2[ 8] = _mm_sub_epi16(step2[ 8], s2_08_0);
- step2[ 9] = _mm_sub_epi16(step2[ 9], s2_09_0);
- step2[10] = _mm_sub_epi16(step2[10], s3_10_0);
- step2[11] = _mm_sub_epi16(step2[11], s3_11_0);
- step2[12] = _mm_sub_epi16(step2[12], s3_12_0);
- step2[13] = _mm_sub_epi16(step2[13], s3_13_0);
- step2[14] = _mm_sub_epi16(step2[14], s2_14_0);
- step2[15] = _mm_sub_epi16(step2[15], s2_15_0);
- step1[16] = _mm_sub_epi16(step1[16], s3_16_0);
- step1[17] = _mm_sub_epi16(step1[17], s3_17_0);
- step1[18] = _mm_sub_epi16(step1[18], s3_18_0);
- step1[19] = _mm_sub_epi16(step1[19], s3_19_0);
- step2[20] = _mm_sub_epi16(step2[20], s3_20_0);
- step2[21] = _mm_sub_epi16(step2[21], s3_21_0);
- step2[22] = _mm_sub_epi16(step2[22], s3_22_0);
- step2[23] = _mm_sub_epi16(step2[23], s3_23_0);
- step2[24] = _mm_sub_epi16(step2[24], s3_24_0);
- step2[25] = _mm_sub_epi16(step2[25], s3_25_0);
- step2[26] = _mm_sub_epi16(step2[26], s3_26_0);
- step2[27] = _mm_sub_epi16(step2[27], s3_27_0);
- step1[28] = _mm_sub_epi16(step1[28], s3_28_0);
- step1[29] = _mm_sub_epi16(step1[29], s3_29_0);
- step1[30] = _mm_sub_epi16(step1[30], s3_30_0);
- step1[31] = _mm_sub_epi16(step1[31], s3_31_0);
-
- step2[ 0] = _mm_add_epi16(step2[ 0], kOne);
- step2[ 1] = _mm_add_epi16(step2[ 1], kOne);
- step2[ 2] = _mm_add_epi16(step2[ 2], kOne);
- step2[ 3] = _mm_add_epi16(step2[ 3], kOne);
- step2[ 4] = _mm_add_epi16(step2[ 4], kOne);
- step2[ 5] = _mm_add_epi16(step2[ 5], kOne);
- step2[ 6] = _mm_add_epi16(step2[ 6], kOne);
- step2[ 7] = _mm_add_epi16(step2[ 7], kOne);
- step2[ 8] = _mm_add_epi16(step2[ 8], kOne);
- step2[ 9] = _mm_add_epi16(step2[ 9], kOne);
+ step2[0] = SUB_EPI16(step2[ 0], s3_00_0);
+ step2[1] = SUB_EPI16(step2[ 1], s3_01_0);
+ step2[2] = SUB_EPI16(step2[ 2], s3_02_0);
+ step2[3] = SUB_EPI16(step2[ 3], s3_03_0);
+ step2[4] = SUB_EPI16(step2[ 4], s3_04_0);
+ step2[5] = SUB_EPI16(step2[ 5], s3_05_0);
+ step2[6] = SUB_EPI16(step2[ 6], s3_06_0);
+ step2[7] = SUB_EPI16(step2[ 7], s3_07_0);
+ step2[8] = SUB_EPI16(step2[ 8], s2_08_0);
+ step2[9] = SUB_EPI16(step2[ 9], s2_09_0);
+ step2[10] = SUB_EPI16(step2[10], s3_10_0);
+ step2[11] = SUB_EPI16(step2[11], s3_11_0);
+ step2[12] = SUB_EPI16(step2[12], s3_12_0);
+ step2[13] = SUB_EPI16(step2[13], s3_13_0);
+ step2[14] = SUB_EPI16(step2[14], s2_14_0);
+ step2[15] = SUB_EPI16(step2[15], s2_15_0);
+ step1[16] = SUB_EPI16(step1[16], s3_16_0);
+ step1[17] = SUB_EPI16(step1[17], s3_17_0);
+ step1[18] = SUB_EPI16(step1[18], s3_18_0);
+ step1[19] = SUB_EPI16(step1[19], s3_19_0);
+ step2[20] = SUB_EPI16(step2[20], s3_20_0);
+ step2[21] = SUB_EPI16(step2[21], s3_21_0);
+ step2[22] = SUB_EPI16(step2[22], s3_22_0);
+ step2[23] = SUB_EPI16(step2[23], s3_23_0);
+ step2[24] = SUB_EPI16(step2[24], s3_24_0);
+ step2[25] = SUB_EPI16(step2[25], s3_25_0);
+ step2[26] = SUB_EPI16(step2[26], s3_26_0);
+ step2[27] = SUB_EPI16(step2[27], s3_27_0);
+ step1[28] = SUB_EPI16(step1[28], s3_28_0);
+ step1[29] = SUB_EPI16(step1[29], s3_29_0);
+ step1[30] = SUB_EPI16(step1[30], s3_30_0);
+ step1[31] = SUB_EPI16(step1[31], s3_31_0);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x32(
+ &step2[0], &step2[1], &step2[2], &step2[3],
+ &step2[4], &step2[5], &step2[6], &step2[7],
+ &step2[8], &step2[9], &step2[10], &step2[11],
+ &step2[12], &step2[13], &step2[14], &step2[15],
+ &step1[16], &step1[17], &step1[18], &step1[19],
+ &step2[20], &step2[21], &step2[22], &step2[23],
+ &step2[24], &step2[25], &step2[26], &step2[27],
+ &step1[28], &step1[29], &step1[30], &step1[31]);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ step2[0] = _mm_add_epi16(step2[ 0], kOne);
+ step2[1] = _mm_add_epi16(step2[ 1], kOne);
+ step2[2] = _mm_add_epi16(step2[ 2], kOne);
+ step2[3] = _mm_add_epi16(step2[ 3], kOne);
+ step2[4] = _mm_add_epi16(step2[ 4], kOne);
+ step2[5] = _mm_add_epi16(step2[ 5], kOne);
+ step2[6] = _mm_add_epi16(step2[ 6], kOne);
+ step2[7] = _mm_add_epi16(step2[ 7], kOne);
+ step2[8] = _mm_add_epi16(step2[ 8], kOne);
+ step2[9] = _mm_add_epi16(step2[ 9], kOne);
step2[10] = _mm_add_epi16(step2[10], kOne);
step2[11] = _mm_add_epi16(step2[11], kOne);
step2[12] = _mm_add_epi16(step2[12], kOne);
@@ -492,16 +591,16 @@ void FDCT32x32_2D(const int16_t *input,
step1[30] = _mm_add_epi16(step1[30], kOne);
step1[31] = _mm_add_epi16(step1[31], kOne);
- step2[ 0] = _mm_srai_epi16(step2[ 0], 2);
- step2[ 1] = _mm_srai_epi16(step2[ 1], 2);
- step2[ 2] = _mm_srai_epi16(step2[ 2], 2);
- step2[ 3] = _mm_srai_epi16(step2[ 3], 2);
- step2[ 4] = _mm_srai_epi16(step2[ 4], 2);
- step2[ 5] = _mm_srai_epi16(step2[ 5], 2);
- step2[ 6] = _mm_srai_epi16(step2[ 6], 2);
- step2[ 7] = _mm_srai_epi16(step2[ 7], 2);
- step2[ 8] = _mm_srai_epi16(step2[ 8], 2);
- step2[ 9] = _mm_srai_epi16(step2[ 9], 2);
+ step2[0] = _mm_srai_epi16(step2[ 0], 2);
+ step2[1] = _mm_srai_epi16(step2[ 1], 2);
+ step2[2] = _mm_srai_epi16(step2[ 2], 2);
+ step2[3] = _mm_srai_epi16(step2[ 3], 2);
+ step2[4] = _mm_srai_epi16(step2[ 4], 2);
+ step2[5] = _mm_srai_epi16(step2[ 5], 2);
+ step2[6] = _mm_srai_epi16(step2[ 6], 2);
+ step2[7] = _mm_srai_epi16(step2[ 7], 2);
+ step2[8] = _mm_srai_epi16(step2[ 8], 2);
+ step2[9] = _mm_srai_epi16(step2[ 9], 2);
step2[10] = _mm_srai_epi16(step2[10], 2);
step2[11] = _mm_srai_epi16(step2[11], 2);
step2[12] = _mm_srai_epi16(step2[12], 2);
@@ -525,21 +624,33 @@ void FDCT32x32_2D(const int16_t *input,
step1[30] = _mm_srai_epi16(step1[30], 2);
step1[31] = _mm_srai_epi16(step1[31], 2);
}
-#endif
+#endif // !FDCT32x32_HIGH_PRECISION
#if FDCT32x32_HIGH_PRECISION
if (pass == 0) {
#endif
// Stage 3
{
- step3[0] = _mm_add_epi16(step2[(8 - 1)], step2[0]);
- step3[1] = _mm_add_epi16(step2[(8 - 2)], step2[1]);
- step3[2] = _mm_add_epi16(step2[(8 - 3)], step2[2]);
- step3[3] = _mm_add_epi16(step2[(8 - 4)], step2[3]);
- step3[4] = _mm_sub_epi16(step2[(8 - 5)], step2[4]);
- step3[5] = _mm_sub_epi16(step2[(8 - 6)], step2[5]);
- step3[6] = _mm_sub_epi16(step2[(8 - 7)], step2[6]);
- step3[7] = _mm_sub_epi16(step2[(8 - 8)], step2[7]);
+ step3[0] = ADD_EPI16(step2[(8 - 1)], step2[0]);
+ step3[1] = ADD_EPI16(step2[(8 - 2)], step2[1]);
+ step3[2] = ADD_EPI16(step2[(8 - 3)], step2[2]);
+ step3[3] = ADD_EPI16(step2[(8 - 4)], step2[3]);
+ step3[4] = SUB_EPI16(step2[(8 - 5)], step2[4]);
+ step3[5] = SUB_EPI16(step2[(8 - 6)], step2[5]);
+ step3[6] = SUB_EPI16(step2[(8 - 7)], step2[6]);
+ step3[7] = SUB_EPI16(step2[(8 - 8)], step2[7]);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x8(&step3[0], &step3[1], &step3[2],
+ &step3[3], &step3[4], &step3[5],
+ &step3[6], &step3[7]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
}
{
const __m128i s3_10_0 = _mm_unpacklo_epi16(step2[13], step2[10]);
@@ -576,40 +687,79 @@ void FDCT32x32_2D(const int16_t *input,
step3[11] = _mm_packs_epi32(s3_11_6, s3_11_7);
step3[12] = _mm_packs_epi32(s3_12_6, s3_12_7);
step3[13] = _mm_packs_epi32(s3_13_6, s3_13_7);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x4(&step3[10], &step3[11],
+ &step3[12], &step3[13]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
}
{
- step3[16] = _mm_add_epi16(step2[23], step1[16]);
- step3[17] = _mm_add_epi16(step2[22], step1[17]);
- step3[18] = _mm_add_epi16(step2[21], step1[18]);
- step3[19] = _mm_add_epi16(step2[20], step1[19]);
- step3[20] = _mm_sub_epi16(step1[19], step2[20]);
- step3[21] = _mm_sub_epi16(step1[18], step2[21]);
- step3[22] = _mm_sub_epi16(step1[17], step2[22]);
- step3[23] = _mm_sub_epi16(step1[16], step2[23]);
- step3[24] = _mm_sub_epi16(step1[31], step2[24]);
- step3[25] = _mm_sub_epi16(step1[30], step2[25]);
- step3[26] = _mm_sub_epi16(step1[29], step2[26]);
- step3[27] = _mm_sub_epi16(step1[28], step2[27]);
- step3[28] = _mm_add_epi16(step2[27], step1[28]);
- step3[29] = _mm_add_epi16(step2[26], step1[29]);
- step3[30] = _mm_add_epi16(step2[25], step1[30]);
- step3[31] = _mm_add_epi16(step2[24], step1[31]);
+ step3[16] = ADD_EPI16(step2[23], step1[16]);
+ step3[17] = ADD_EPI16(step2[22], step1[17]);
+ step3[18] = ADD_EPI16(step2[21], step1[18]);
+ step3[19] = ADD_EPI16(step2[20], step1[19]);
+ step3[20] = SUB_EPI16(step1[19], step2[20]);
+ step3[21] = SUB_EPI16(step1[18], step2[21]);
+ step3[22] = SUB_EPI16(step1[17], step2[22]);
+ step3[23] = SUB_EPI16(step1[16], step2[23]);
+ step3[24] = SUB_EPI16(step1[31], step2[24]);
+ step3[25] = SUB_EPI16(step1[30], step2[25]);
+ step3[26] = SUB_EPI16(step1[29], step2[26]);
+ step3[27] = SUB_EPI16(step1[28], step2[27]);
+ step3[28] = ADD_EPI16(step2[27], step1[28]);
+ step3[29] = ADD_EPI16(step2[26], step1[29]);
+ step3[30] = ADD_EPI16(step2[25], step1[30]);
+ step3[31] = ADD_EPI16(step2[24], step1[31]);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x16(
+ &step3[16], &step3[17], &step3[18], &step3[19],
+ &step3[20], &step3[21], &step3[22], &step3[23],
+ &step3[24], &step3[25], &step3[26], &step3[27],
+ &step3[28], &step3[29], &step3[30], &step3[31]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
}
// Stage 4
{
- step1[ 0] = _mm_add_epi16(step3[ 3], step3[ 0]);
- step1[ 1] = _mm_add_epi16(step3[ 2], step3[ 1]);
- step1[ 2] = _mm_sub_epi16(step3[ 1], step3[ 2]);
- step1[ 3] = _mm_sub_epi16(step3[ 0], step3[ 3]);
- step1[ 8] = _mm_add_epi16(step3[11], step2[ 8]);
- step1[ 9] = _mm_add_epi16(step3[10], step2[ 9]);
- step1[10] = _mm_sub_epi16(step2[ 9], step3[10]);
- step1[11] = _mm_sub_epi16(step2[ 8], step3[11]);
- step1[12] = _mm_sub_epi16(step2[15], step3[12]);
- step1[13] = _mm_sub_epi16(step2[14], step3[13]);
- step1[14] = _mm_add_epi16(step3[13], step2[14]);
- step1[15] = _mm_add_epi16(step3[12], step2[15]);
+ step1[0] = ADD_EPI16(step3[ 3], step3[ 0]);
+ step1[1] = ADD_EPI16(step3[ 2], step3[ 1]);
+ step1[2] = SUB_EPI16(step3[ 1], step3[ 2]);
+ step1[3] = SUB_EPI16(step3[ 0], step3[ 3]);
+ step1[8] = ADD_EPI16(step3[11], step2[ 8]);
+ step1[9] = ADD_EPI16(step3[10], step2[ 9]);
+ step1[10] = SUB_EPI16(step2[ 9], step3[10]);
+ step1[11] = SUB_EPI16(step2[ 8], step3[11]);
+ step1[12] = SUB_EPI16(step2[15], step3[12]);
+ step1[13] = SUB_EPI16(step2[14], step3[13]);
+ step1[14] = ADD_EPI16(step3[13], step2[14]);
+ step1[15] = ADD_EPI16(step3[12], step2[15]);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x16(
+ &step1[0], &step1[1], &step1[2], &step1[3],
+ &step1[4], &step1[5], &step1[6], &step1[7],
+ &step1[8], &step1[9], &step1[10], &step1[11],
+ &step1[12], &step1[13], &step1[14], &step1[15]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
}
{
const __m128i s1_05_0 = _mm_unpacklo_epi16(step3[6], step3[5]);
@@ -630,6 +780,16 @@ void FDCT32x32_2D(const int16_t *input,
// Combine
step1[5] = _mm_packs_epi32(s1_05_6, s1_05_7);
step1[6] = _mm_packs_epi32(s1_06_6, s1_06_7);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x2(&step1[5], &step1[6]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
}
{
const __m128i s1_18_0 = _mm_unpacklo_epi16(step3[18], step3[29]);
@@ -698,13 +858,36 @@ void FDCT32x32_2D(const int16_t *input,
step1[27] = _mm_packs_epi32(s1_27_6, s1_27_7);
step1[28] = _mm_packs_epi32(s1_28_6, s1_28_7);
step1[29] = _mm_packs_epi32(s1_29_6, s1_29_7);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x8(&step1[18], &step1[19], &step1[20],
+ &step1[21], &step1[26], &step1[27],
+ &step1[28], &step1[29]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
}
// Stage 5
{
- step2[4] = _mm_add_epi16(step1[5], step3[4]);
- step2[5] = _mm_sub_epi16(step3[4], step1[5]);
- step2[6] = _mm_sub_epi16(step3[7], step1[6]);
- step2[7] = _mm_add_epi16(step1[6], step3[7]);
+ step2[4] = ADD_EPI16(step1[5], step3[4]);
+ step2[5] = SUB_EPI16(step3[4], step1[5]);
+ step2[6] = SUB_EPI16(step3[7], step1[6]);
+ step2[7] = ADD_EPI16(step1[6], step3[7]);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x4(&step2[4], &step2[5],
+ &step2[6], &step2[7]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
}
{
const __m128i out_00_0 = _mm_unpacklo_epi16(step1[0], step1[1]);
@@ -741,6 +924,17 @@ void FDCT32x32_2D(const int16_t *input,
out[16] = _mm_packs_epi32(out_16_6, out_16_7);
out[ 8] = _mm_packs_epi32(out_08_6, out_08_7);
out[24] = _mm_packs_epi32(out_24_6, out_24_7);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x4(&out[0], &out[16],
+ &out[8], &out[24]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
}
{
const __m128i s2_09_0 = _mm_unpacklo_epi16(step1[ 9], step1[14]);
@@ -777,24 +971,49 @@ void FDCT32x32_2D(const int16_t *input,
step2[10] = _mm_packs_epi32(s2_10_6, s2_10_7);
step2[13] = _mm_packs_epi32(s2_13_6, s2_13_7);
step2[14] = _mm_packs_epi32(s2_14_6, s2_14_7);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x4(&step2[9], &step2[10],
+ &step2[13], &step2[14]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
}
{
- step2[16] = _mm_add_epi16(step1[19], step3[16]);
- step2[17] = _mm_add_epi16(step1[18], step3[17]);
- step2[18] = _mm_sub_epi16(step3[17], step1[18]);
- step2[19] = _mm_sub_epi16(step3[16], step1[19]);
- step2[20] = _mm_sub_epi16(step3[23], step1[20]);
- step2[21] = _mm_sub_epi16(step3[22], step1[21]);
- step2[22] = _mm_add_epi16(step1[21], step3[22]);
- step2[23] = _mm_add_epi16(step1[20], step3[23]);
- step2[24] = _mm_add_epi16(step1[27], step3[24]);
- step2[25] = _mm_add_epi16(step1[26], step3[25]);
- step2[26] = _mm_sub_epi16(step3[25], step1[26]);
- step2[27] = _mm_sub_epi16(step3[24], step1[27]);
- step2[28] = _mm_sub_epi16(step3[31], step1[28]);
- step2[29] = _mm_sub_epi16(step3[30], step1[29]);
- step2[30] = _mm_add_epi16(step1[29], step3[30]);
- step2[31] = _mm_add_epi16(step1[28], step3[31]);
+ step2[16] = ADD_EPI16(step1[19], step3[16]);
+ step2[17] = ADD_EPI16(step1[18], step3[17]);
+ step2[18] = SUB_EPI16(step3[17], step1[18]);
+ step2[19] = SUB_EPI16(step3[16], step1[19]);
+ step2[20] = SUB_EPI16(step3[23], step1[20]);
+ step2[21] = SUB_EPI16(step3[22], step1[21]);
+ step2[22] = ADD_EPI16(step1[21], step3[22]);
+ step2[23] = ADD_EPI16(step1[20], step3[23]);
+ step2[24] = ADD_EPI16(step1[27], step3[24]);
+ step2[25] = ADD_EPI16(step1[26], step3[25]);
+ step2[26] = SUB_EPI16(step3[25], step1[26]);
+ step2[27] = SUB_EPI16(step3[24], step1[27]);
+ step2[28] = SUB_EPI16(step3[31], step1[28]);
+ step2[29] = SUB_EPI16(step3[30], step1[29]);
+ step2[30] = ADD_EPI16(step1[29], step3[30]);
+ step2[31] = ADD_EPI16(step1[28], step3[31]);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x16(
+ &step2[16], &step2[17], &step2[18], &step2[19],
+ &step2[20], &step2[21], &step2[22], &step2[23],
+ &step2[24], &step2[25], &step2[26], &step2[27],
+ &step2[28], &step2[29], &step2[30], &step2[31]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
}
// Stage 6
{
@@ -832,20 +1051,43 @@ void FDCT32x32_2D(const int16_t *input,
const __m128i out_28_6 = _mm_srai_epi32(out_28_4, DCT_CONST_BITS);
const __m128i out_28_7 = _mm_srai_epi32(out_28_5, DCT_CONST_BITS);
// Combine
- out[ 4] = _mm_packs_epi32(out_04_6, out_04_7);
+ out[4] = _mm_packs_epi32(out_04_6, out_04_7);
out[20] = _mm_packs_epi32(out_20_6, out_20_7);
out[12] = _mm_packs_epi32(out_12_6, out_12_7);
out[28] = _mm_packs_epi32(out_28_6, out_28_7);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x4(&out[4], &out[20],
+ &out[12], &out[28]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
}
{
- step3[ 8] = _mm_add_epi16(step2[ 9], step1[ 8]);
- step3[ 9] = _mm_sub_epi16(step1[ 8], step2[ 9]);
- step3[10] = _mm_sub_epi16(step1[11], step2[10]);
- step3[11] = _mm_add_epi16(step2[10], step1[11]);
- step3[12] = _mm_add_epi16(step2[13], step1[12]);
- step3[13] = _mm_sub_epi16(step1[12], step2[13]);
- step3[14] = _mm_sub_epi16(step1[15], step2[14]);
- step3[15] = _mm_add_epi16(step2[14], step1[15]);
+ step3[8] = ADD_EPI16(step2[ 9], step1[ 8]);
+ step3[9] = SUB_EPI16(step1[ 8], step2[ 9]);
+ step3[10] = SUB_EPI16(step1[11], step2[10]);
+ step3[11] = ADD_EPI16(step2[10], step1[11]);
+ step3[12] = ADD_EPI16(step2[13], step1[12]);
+ step3[13] = SUB_EPI16(step1[12], step2[13]);
+ step3[14] = SUB_EPI16(step1[15], step2[14]);
+ step3[15] = ADD_EPI16(step2[14], step1[15]);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x8(&step3[8], &step3[9], &step3[10],
+ &step3[11], &step3[12], &step3[13],
+ &step3[14], &step3[15]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
}
{
const __m128i s3_17_0 = _mm_unpacklo_epi16(step2[17], step2[30]);
@@ -915,6 +1157,18 @@ void FDCT32x32_2D(const int16_t *input,
step3[26] = _mm_packs_epi32(s3_26_6, s3_26_7);
step3[29] = _mm_packs_epi32(s3_29_6, s3_29_7);
step3[30] = _mm_packs_epi32(s3_30_6, s3_30_7);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x8(&step3[17], &step3[18], &step3[21],
+ &step3[22], &step3[25], &step3[26],
+ &step3[29], &step3[30]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
}
// Stage 7
{
@@ -984,24 +1238,50 @@ void FDCT32x32_2D(const int16_t *input,
out[22] = _mm_packs_epi32(out_22_6, out_22_7);
out[14] = _mm_packs_epi32(out_14_6, out_14_7);
out[30] = _mm_packs_epi32(out_30_6, out_30_7);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x8(&out[2], &out[18], &out[10],
+ &out[26], &out[6], &out[22],
+ &out[14], &out[30]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
}
{
- step1[16] = _mm_add_epi16(step3[17], step2[16]);
- step1[17] = _mm_sub_epi16(step2[16], step3[17]);
- step1[18] = _mm_sub_epi16(step2[19], step3[18]);
- step1[19] = _mm_add_epi16(step3[18], step2[19]);
- step1[20] = _mm_add_epi16(step3[21], step2[20]);
- step1[21] = _mm_sub_epi16(step2[20], step3[21]);
- step1[22] = _mm_sub_epi16(step2[23], step3[22]);
- step1[23] = _mm_add_epi16(step3[22], step2[23]);
- step1[24] = _mm_add_epi16(step3[25], step2[24]);
- step1[25] = _mm_sub_epi16(step2[24], step3[25]);
- step1[26] = _mm_sub_epi16(step2[27], step3[26]);
- step1[27] = _mm_add_epi16(step3[26], step2[27]);
- step1[28] = _mm_add_epi16(step3[29], step2[28]);
- step1[29] = _mm_sub_epi16(step2[28], step3[29]);
- step1[30] = _mm_sub_epi16(step2[31], step3[30]);
- step1[31] = _mm_add_epi16(step3[30], step2[31]);
+ step1[16] = ADD_EPI16(step3[17], step2[16]);
+ step1[17] = SUB_EPI16(step2[16], step3[17]);
+ step1[18] = SUB_EPI16(step2[19], step3[18]);
+ step1[19] = ADD_EPI16(step3[18], step2[19]);
+ step1[20] = ADD_EPI16(step3[21], step2[20]);
+ step1[21] = SUB_EPI16(step2[20], step3[21]);
+ step1[22] = SUB_EPI16(step2[23], step3[22]);
+ step1[23] = ADD_EPI16(step3[22], step2[23]);
+ step1[24] = ADD_EPI16(step3[25], step2[24]);
+ step1[25] = SUB_EPI16(step2[24], step3[25]);
+ step1[26] = SUB_EPI16(step2[27], step3[26]);
+ step1[27] = ADD_EPI16(step3[26], step2[27]);
+ step1[28] = ADD_EPI16(step3[29], step2[28]);
+ step1[29] = SUB_EPI16(step2[28], step3[29]);
+ step1[30] = SUB_EPI16(step2[31], step3[30]);
+ step1[31] = ADD_EPI16(step3[30], step2[31]);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x16(
+ &step1[16], &step1[17], &step1[18], &step1[19],
+ &step1[20], &step1[21], &step1[22], &step1[23],
+ &step1[24], &step1[25], &step1[26], &step1[27],
+ &step1[28], &step1[29], &step1[30], &step1[31]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
}
// Final stage --- outputs indices are bit-reversed.
{
@@ -1071,6 +1351,18 @@ void FDCT32x32_2D(const int16_t *input,
out[23] = _mm_packs_epi32(out_23_6, out_23_7);
out[15] = _mm_packs_epi32(out_15_6, out_15_7);
out[31] = _mm_packs_epi32(out_31_6, out_31_7);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x8(&out[1], &out[17], &out[9],
+ &out[25], &out[7], &out[23],
+ &out[15], &out[31]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
}
{
const __m128i out_05_0 = _mm_unpacklo_epi16(step1[20], step1[27]);
@@ -1139,6 +1431,18 @@ void FDCT32x32_2D(const int16_t *input,
out[19] = _mm_packs_epi32(out_19_6, out_19_7);
out[11] = _mm_packs_epi32(out_11_6, out_11_7);
out[27] = _mm_packs_epi32(out_27_6, out_27_7);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x8(&out[5], &out[21], &out[13],
+ &out[29], &out[3], &out[19],
+ &out[11], &out[27]);
+ if (overflow) {
+ if (pass == 0)
+ HIGH_FDCT32x32_2D_C(input, output_org, stride);
+ else
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
}
#if FDCT32x32_HIGH_PRECISION
} else {
@@ -1390,15 +1694,22 @@ void FDCT32x32_2D(const int16_t *input,
// TODO(jingning): manually inline k_madd_epi32_ to further hide
// instruction latency.
- v[ 0] = k_madd_epi32(u[0], k32_p16_m16);
- v[ 1] = k_madd_epi32(u[1], k32_p16_m16);
- v[ 2] = k_madd_epi32(u[2], k32_p16_m16);
- v[ 3] = k_madd_epi32(u[3], k32_p16_m16);
- v[ 4] = k_madd_epi32(u[0], k32_p16_p16);
- v[ 5] = k_madd_epi32(u[1], k32_p16_p16);
- v[ 6] = k_madd_epi32(u[2], k32_p16_p16);
- v[ 7] = k_madd_epi32(u[3], k32_p16_p16);
-
+ v[0] = k_madd_epi32(u[0], k32_p16_m16);
+ v[1] = k_madd_epi32(u[1], k32_p16_m16);
+ v[2] = k_madd_epi32(u[2], k32_p16_m16);
+ v[3] = k_madd_epi32(u[3], k32_p16_m16);
+ v[4] = k_madd_epi32(u[0], k32_p16_p16);
+ v[5] = k_madd_epi32(u[1], k32_p16_p16);
+ v[6] = k_madd_epi32(u[2], k32_p16_p16);
+ v[7] = k_madd_epi32(u[3], k32_p16_p16);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = k_check_epi32_overflow_8(&v[0], &v[1], &v[2], &v[3],
+ &v[4], &v[5], &v[6], &v[7], &kZero);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
u[0] = k_packs_epi64(v[0], v[1]);
u[1] = k_packs_epi64(v[2], v[3]);
u[2] = k_packs_epi64(v[4], v[5]);
@@ -1469,6 +1780,18 @@ void FDCT32x32_2D(const int16_t *input,
v[30] = k_madd_epi32(u[ 2], k32_p24_p08);
v[31] = k_madd_epi32(u[ 3], k32_p24_p08);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = k_check_epi32_overflow_32(
+ &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7],
+ &v[8], &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15],
+ &v[16], &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23],
+ &v[24], &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31],
+ &kZero);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
u[ 0] = k_packs_epi64(v[ 0], v[ 1]);
u[ 1] = k_packs_epi64(v[ 2], v[ 3]);
u[ 2] = k_packs_epi64(v[ 4], v[ 5]);
@@ -1565,6 +1888,16 @@ void FDCT32x32_2D(const int16_t *input,
v[14] = k_madd_epi32(u[6], k32_m08_p24);
v[15] = k_madd_epi32(u[7], k32_m08_p24);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = k_check_epi32_overflow_16(
+ &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7],
+ &v[8], &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15],
+ &kZero);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
u[0] = k_packs_epi64(v[0], v[1]);
u[1] = k_packs_epi64(v[2], v[3]);
u[2] = k_packs_epi64(v[4], v[5]);
@@ -1633,6 +1966,14 @@ void FDCT32x32_2D(const int16_t *input,
out[16] = _mm_packs_epi32(u[2], u[3]);
out[ 8] = _mm_packs_epi32(u[4], u[5]);
out[24] = _mm_packs_epi32(u[6], u[7]);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x4(&out[0], &out[16],
+ &out[8], &out[24]);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
}
{
const __m128i k32_m08_p24 = pair_set_epi32(-cospi_8_64, cospi_24_64);
@@ -1665,6 +2006,16 @@ void FDCT32x32_2D(const int16_t *input,
v[14] = k_madd_epi32(u[2], k32_p24_p08);
v[15] = k_madd_epi32(u[3], k32_p24_p08);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = k_check_epi32_overflow_16(
+ &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7],
+ &v[8], &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15],
+ &kZero);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
u[0] = k_packs_epi64(v[0], v[1]);
u[1] = k_packs_epi64(v[2], v[3]);
u[2] = k_packs_epi64(v[4], v[5]);
@@ -1767,6 +2118,16 @@ void FDCT32x32_2D(const int16_t *input,
v[14] = k_madd_epi32(u[14], k32_m04_p28);
v[15] = k_madd_epi32(u[15], k32_m04_p28);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = k_check_epi32_overflow_16(
+ &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7],
+ &v[8], &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15],
+ &kZero);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
u[0] = k_packs_epi64(v[0], v[1]);
u[1] = k_packs_epi64(v[2], v[3]);
u[2] = k_packs_epi64(v[4], v[5]);
@@ -1834,6 +2195,14 @@ void FDCT32x32_2D(const int16_t *input,
out[20] = _mm_packs_epi32(u[2], u[3]);
out[12] = _mm_packs_epi32(u[4], u[5]);
out[28] = _mm_packs_epi32(u[6], u[7]);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x4(&out[4], &out[20],
+ &out[12], &out[28]);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
}
{
lstep3[16] = _mm_add_epi32(lstep2[18], lstep1[16]);
@@ -1912,6 +2281,18 @@ void FDCT32x32_2D(const int16_t *input,
v[30] = k_madd_epi32(u[ 2], k32_p28_p04);
v[31] = k_madd_epi32(u[ 3], k32_p28_p04);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = k_check_epi32_overflow_32(
+ &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7],
+ &v[8], &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15],
+ &v[16], &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23],
+ &v[24], &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31],
+ &kZero);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
u[ 0] = k_packs_epi64(v[ 0], v[ 1]);
u[ 1] = k_packs_epi64(v[ 2], v[ 3]);
u[ 2] = k_packs_epi64(v[ 4], v[ 5]);
@@ -2024,6 +2405,18 @@ void FDCT32x32_2D(const int16_t *input,
v[30] = k_madd_epi32(u[ 2], k32_m02_p30);
v[31] = k_madd_epi32(u[ 3], k32_m02_p30);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = k_check_epi32_overflow_32(
+ &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7],
+ &v[8], &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15],
+ &v[16], &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23],
+ &v[24], &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31],
+ &kZero);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
u[ 0] = k_packs_epi64(v[ 0], v[ 1]);
u[ 1] = k_packs_epi64(v[ 2], v[ 3]);
u[ 2] = k_packs_epi64(v[ 4], v[ 5]);
@@ -2151,6 +2544,15 @@ void FDCT32x32_2D(const int16_t *input,
out[22] = _mm_packs_epi32(u[10], u[11]);
out[14] = _mm_packs_epi32(u[12], u[13]);
out[30] = _mm_packs_epi32(u[14], u[15]);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x8(&out[2], &out[18], &out[10],
+ &out[26], &out[6], &out[22],
+ &out[14], &out[30]);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
}
{
lstep1[32] = _mm_add_epi32(lstep3[34], lstep2[32]);
@@ -2247,6 +2649,18 @@ void FDCT32x32_2D(const int16_t *input,
v[30] = k_madd_epi32(u[ 2], k32_m01_p31);
v[31] = k_madd_epi32(u[ 3], k32_m01_p31);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = k_check_epi32_overflow_32(
+ &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7],
+ &v[8], &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15],
+ &v[16], &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23],
+ &v[24], &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31],
+ &kZero);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
u[ 0] = k_packs_epi64(v[ 0], v[ 1]);
u[ 1] = k_packs_epi64(v[ 2], v[ 3]);
u[ 2] = k_packs_epi64(v[ 4], v[ 5]);
@@ -2374,6 +2788,15 @@ void FDCT32x32_2D(const int16_t *input,
out[23] = _mm_packs_epi32(u[10], u[11]);
out[15] = _mm_packs_epi32(u[12], u[13]);
out[31] = _mm_packs_epi32(u[14], u[15]);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x8(&out[1], &out[17], &out[9],
+ &out[25], &out[7], &out[23],
+ &out[15], &out[31]);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
}
{
const __m128i k32_p27_p05 = pair_set_epi32(cospi_27_64, cospi_5_64);
@@ -2435,6 +2858,18 @@ void FDCT32x32_2D(const int16_t *input,
v[30] = k_madd_epi32(u[ 2], k32_m05_p27);
v[31] = k_madd_epi32(u[ 3], k32_m05_p27);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = k_check_epi32_overflow_32(
+ &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7],
+ &v[8], &v[9], &v[10], &v[11], &v[12], &v[13], &v[14], &v[15],
+ &v[16], &v[17], &v[18], &v[19], &v[20], &v[21], &v[22], &v[23],
+ &v[24], &v[25], &v[26], &v[27], &v[28], &v[29], &v[30], &v[31],
+ &kZero);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
u[ 0] = k_packs_epi64(v[ 0], v[ 1]);
u[ 1] = k_packs_epi64(v[ 2], v[ 3]);
u[ 2] = k_packs_epi64(v[ 4], v[ 5]);
@@ -2562,18 +2997,23 @@ void FDCT32x32_2D(const int16_t *input,
out[19] = _mm_packs_epi32(u[10], u[11]);
out[11] = _mm_packs_epi32(u[12], u[13]);
out[27] = _mm_packs_epi32(u[14], u[15]);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x8(&out[5], &out[21], &out[13],
+ &out[29], &out[3], &out[19],
+ &out[11], &out[27]);
+ if (overflow) {
+ HIGH_FDCT32x32_2D_ROWS_C(intermediate, output_org);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
}
}
-#endif
+#endif // FDCT32x32_HIGH_PRECISION
// Transpose the results, do it as four 8x8 transposes.
{
int transpose_block;
- int16_t *output;
- if (0 == pass) {
- output = &intermediate[column_start * 32];
- } else {
- output = &output_org[column_start * 32];
- }
+ int16_t *output0 = &intermediate[column_start * 32];
+ tran_low_t *output1 = &output_org[column_start * 32];
for (transpose_block = 0; transpose_block < 4; ++transpose_block) {
__m128i *this_out = &out[8 * transpose_block];
// 00 01 02 03 04 05 06 07
@@ -2674,18 +3114,36 @@ void FDCT32x32_2D(const int16_t *input,
}
      // Note: even though all these stores are aligned, using the aligned
      // intrinsic makes the code slightly slower.
- _mm_storeu_si128((__m128i *)(output + 0 * 32), tr2_0);
- _mm_storeu_si128((__m128i *)(output + 1 * 32), tr2_1);
- _mm_storeu_si128((__m128i *)(output + 2 * 32), tr2_2);
- _mm_storeu_si128((__m128i *)(output + 3 * 32), tr2_3);
- _mm_storeu_si128((__m128i *)(output + 4 * 32), tr2_4);
- _mm_storeu_si128((__m128i *)(output + 5 * 32), tr2_5);
- _mm_storeu_si128((__m128i *)(output + 6 * 32), tr2_6);
- _mm_storeu_si128((__m128i *)(output + 7 * 32), tr2_7);
- // Process next 8x8
- output += 8;
+ if (pass == 0) {
+ _mm_storeu_si128((__m128i *)(output0 + 0 * 32), tr2_0);
+ _mm_storeu_si128((__m128i *)(output0 + 1 * 32), tr2_1);
+ _mm_storeu_si128((__m128i *)(output0 + 2 * 32), tr2_2);
+ _mm_storeu_si128((__m128i *)(output0 + 3 * 32), tr2_3);
+ _mm_storeu_si128((__m128i *)(output0 + 4 * 32), tr2_4);
+ _mm_storeu_si128((__m128i *)(output0 + 5 * 32), tr2_5);
+ _mm_storeu_si128((__m128i *)(output0 + 6 * 32), tr2_6);
+ _mm_storeu_si128((__m128i *)(output0 + 7 * 32), tr2_7);
+ // Process next 8x8
+ output0 += 8;
+ } else {
+ storeu_output(&tr2_0, (output1 + 0 * 32));
+ storeu_output(&tr2_1, (output1 + 1 * 32));
+ storeu_output(&tr2_2, (output1 + 2 * 32));
+ storeu_output(&tr2_3, (output1 + 3 * 32));
+ storeu_output(&tr2_4, (output1 + 4 * 32));
+ storeu_output(&tr2_5, (output1 + 5 * 32));
+ storeu_output(&tr2_6, (output1 + 6 * 32));
+ storeu_output(&tr2_7, (output1 + 7 * 32));
+ // Process next 8x8
+ output1 += 8;
+ }
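+        // (storeu_output, per vp9_dct_sse2.h, is assumed to widen the eight
+        // int16 lanes to 32 bits when CONFIG_VP9_HIGHBITDEPTH makes
+        // tran_low_t 32-bit, and to fall back to a plain _mm_storeu_si128
+        // otherwise.)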
}
}
}
}
} // NOLINT
+
+#undef ADD_EPI16
+#undef SUB_EPI16
+#undef HIGH_FDCT32x32_2D_C
+#undef HIGH_FDCT32x32_2D_ROWS_C
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct_impl_sse2.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct_impl_sse2.c
new file mode 100644
index 00000000000..e03a76d2e89
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct_impl_sse2.c
@@ -0,0 +1,1022 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <emmintrin.h> // SSE2
+#include "vp9/common/vp9_idct.h" // for cospi constants
+#include "vp9/encoder/vp9_dct.h"
+#include "vp9/encoder/x86/vp9_dct_sse2.h"
+#include "vpx_ports/mem.h"
+
+#if DCT_HIGH_BIT_DEPTH
+#define ADD_EPI16 _mm_adds_epi16
+#define SUB_EPI16 _mm_subs_epi16
+
+#else
+#define ADD_EPI16 _mm_add_epi16
+#define SUB_EPI16 _mm_sub_epi16
+#endif
+
+void FDCT4x4_2D(const int16_t *input, tran_low_t *output, int stride) {
+ // This 2D transform implements 4 vertical 1D transforms followed
+ // by 4 horizontal 1D transforms. The multiplies and adds are as given
+ // by Chen, Smith and Fralick ('77). The commands for moving the data
+ // around have been minimized by hand.
+  // For the purposes of the comments, the 16 inputs are referred to as i0
+  // through iF (in raster order), intermediate variables are a0, b0, c0
+  // through f, and correspond to the in-place computations mapped to input
+  // locations. The outputs, o0 through oF, are labeled according to the
+  // output locations.
+
+ // Constants
+ // These are the coefficients used for the multiplies.
+ // In the comments, pN means cos(N pi /64) and mN is -cos(N pi /64),
+ // where cospi_N_64 = cos(N pi /64)
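+  // (Concretely, cospi_N_64 is the Q14 fixed-point value
+  // round(16384 * cos(N pi / 64)); the >> DCT_CONST_BITS (== 14) after
+  // each multiply stage undoes that scale.)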
+ const __m128i k__cospi_A = _mm_setr_epi16(cospi_16_64, cospi_16_64,
+ cospi_16_64, cospi_16_64,
+ cospi_16_64, -cospi_16_64,
+ cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_B = _mm_setr_epi16(cospi_16_64, -cospi_16_64,
+ cospi_16_64, -cospi_16_64,
+ cospi_16_64, cospi_16_64,
+ cospi_16_64, cospi_16_64);
+ const __m128i k__cospi_C = _mm_setr_epi16(cospi_8_64, cospi_24_64,
+ cospi_8_64, cospi_24_64,
+ cospi_24_64, -cospi_8_64,
+ cospi_24_64, -cospi_8_64);
+ const __m128i k__cospi_D = _mm_setr_epi16(cospi_24_64, -cospi_8_64,
+ cospi_24_64, -cospi_8_64,
+ cospi_8_64, cospi_24_64,
+ cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_E = _mm_setr_epi16(cospi_16_64, cospi_16_64,
+ cospi_16_64, cospi_16_64,
+ cospi_16_64, cospi_16_64,
+ cospi_16_64, cospi_16_64);
+ const __m128i k__cospi_F = _mm_setr_epi16(cospi_16_64, -cospi_16_64,
+ cospi_16_64, -cospi_16_64,
+ cospi_16_64, -cospi_16_64,
+ cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_G = _mm_setr_epi16(cospi_8_64, cospi_24_64,
+ cospi_8_64, cospi_24_64,
+ -cospi_8_64, -cospi_24_64,
+ -cospi_8_64, -cospi_24_64);
+ const __m128i k__cospi_H = _mm_setr_epi16(cospi_24_64, -cospi_8_64,
+ cospi_24_64, -cospi_8_64,
+ -cospi_24_64, cospi_8_64,
+ -cospi_24_64, cospi_8_64);
+
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ // This second rounding constant saves doing some extra adds at the end
+ const __m128i k__DCT_CONST_ROUNDING2 = _mm_set1_epi32(DCT_CONST_ROUNDING
+ +(DCT_CONST_ROUNDING << 1));
+ const int DCT_CONST_BITS2 = DCT_CONST_BITS + 2;
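+  // ROUNDING2 equals 3 * DCT_CONST_ROUNDING, i.e. DCT_CONST_ROUNDING +
+  // (1 << DCT_CONST_BITS): the usual bias for the >> DCT_CONST_BITS step
+  // plus the +1 bias of the final (x + 1) >> 2 post-scale, so both shifts
+  // fold into a single shift by DCT_CONST_BITS2.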
+ const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
+ const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
+ __m128i in0, in1;
+#if DCT_HIGH_BIT_DEPTH
+ __m128i cmp0, cmp1;
+ int test, overflow;
+#endif
+
+ // Load inputs.
+ in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
+ in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
+ in1 = _mm_unpacklo_epi64(in1, _mm_loadl_epi64((const __m128i *)
+ (input + 2 * stride)));
+ in0 = _mm_unpacklo_epi64(in0, _mm_loadl_epi64((const __m128i *)
+ (input + 3 * stride)));
+ // in0 = [i0 i1 i2 i3 iC iD iE iF]
+ // in1 = [i4 i5 i6 i7 i8 i9 iA iB]
+#if DCT_HIGH_BIT_DEPTH
+  // Check that the inputs are small enough to use the optimised code path.
+ cmp0 = _mm_xor_si128(_mm_cmpgt_epi16(in0, _mm_set1_epi16(0x3ff)),
+ _mm_cmplt_epi16(in0, _mm_set1_epi16(0xfc00)));
+ cmp1 = _mm_xor_si128(_mm_cmpgt_epi16(in1, _mm_set1_epi16(0x3ff)),
+ _mm_cmplt_epi16(in1, _mm_set1_epi16(0xfc00)));
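+  // 0x3ff is +1023 and 0xfc00 is -1024 as int16, so `test` goes non-zero
+  // whenever any input lies outside [-1024, 1023]; larger magnitudes could
+  // exceed 16-bit range in the shifted arithmetic below, hence the
+  // fallback to the C implementation.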
+ test = _mm_movemask_epi8(_mm_or_si128(cmp0, cmp1));
+ if (test) {
+ vp9_highbd_fdct4x4_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+
+ // multiply by 16 to give some extra precision
+ in0 = _mm_slli_epi16(in0, 4);
+ in1 = _mm_slli_epi16(in1, 4);
+ // if (i == 0 && input[0]) input[0] += 1;
+ // add 1 to the upper left pixel if it is non-zero, which helps reduce
+ // the round-trip error
+ {
+ // The mask will only contain whether the first value is zero, all
+ // other comparison will fail as something shifted by 4 (above << 4)
+ // can never be equal to one. To increment in the non-zero case, we
+ // add the mask and one for the first element:
+ // - if zero, mask = -1, v = v - 1 + 1 = v
+ // - if non-zero, mask = 0, v = v + 0 + 1 = v + 1
+ __m128i mask = _mm_cmpeq_epi16(in0, k__nonzero_bias_a);
+ in0 = _mm_add_epi16(in0, mask);
+ in0 = _mm_add_epi16(in0, k__nonzero_bias_b);
+ }
+  // There are 4 total stages, alternating between an add/subtract stage
+  // and a multiply-and-add stage.
+ {
+ // Stage 1: Add/subtract
+
+ // in0 = [i0 i1 i2 i3 iC iD iE iF]
+ // in1 = [i4 i5 i6 i7 i8 i9 iA iB]
+ const __m128i r0 = _mm_unpacklo_epi16(in0, in1);
+ const __m128i r1 = _mm_unpackhi_epi16(in0, in1);
+ // r0 = [i0 i4 i1 i5 i2 i6 i3 i7]
+ // r1 = [iC i8 iD i9 iE iA iF iB]
+ const __m128i r2 = _mm_shuffle_epi32(r0, 0xB4);
+ const __m128i r3 = _mm_shuffle_epi32(r1, 0xB4);
+ // r2 = [i0 i4 i1 i5 i3 i7 i2 i6]
+ // r3 = [iC i8 iD i9 iF iB iE iA]
+
+ const __m128i t0 = _mm_add_epi16(r2, r3);
+ const __m128i t1 = _mm_sub_epi16(r2, r3);
+ // t0 = [a0 a4 a1 a5 a3 a7 a2 a6]
+ // t1 = [aC a8 aD a9 aF aB aE aA]
+
+ // Stage 2: multiply by constants (which gets us into 32 bits).
+ // The constants needed here are:
+ // k__cospi_A = [p16 p16 p16 p16 p16 m16 p16 m16]
+ // k__cospi_B = [p16 m16 p16 m16 p16 p16 p16 p16]
+ // k__cospi_C = [p08 p24 p08 p24 p24 m08 p24 m08]
+ // k__cospi_D = [p24 m08 p24 m08 p08 p24 p08 p24]
+ const __m128i u0 = _mm_madd_epi16(t0, k__cospi_A);
+ const __m128i u2 = _mm_madd_epi16(t0, k__cospi_B);
+ const __m128i u1 = _mm_madd_epi16(t1, k__cospi_C);
+ const __m128i u3 = _mm_madd_epi16(t1, k__cospi_D);
+ // Then add and right-shift to get back to 16-bit range
+ const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+ const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+ const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+ const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+ const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ // w0 = [b0 b1 b7 b6]
+ // w1 = [b8 b9 bF bE]
+ // w2 = [b4 b5 b3 b2]
+ // w3 = [bC bD bB bA]
+ const __m128i x0 = _mm_packs_epi32(w0, w1);
+ const __m128i x1 = _mm_packs_epi32(w2, w3);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x2(&x0, &x1);
+ if (overflow) {
+ vp9_highbd_fdct4x4_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ // x0 = [b0 b1 b7 b6 b8 b9 bF bE]
+ // x1 = [b4 b5 b3 b2 bC bD bB bA]
+ in0 = _mm_shuffle_epi32(x0, 0xD8);
+ in1 = _mm_shuffle_epi32(x1, 0x8D);
+ // in0 = [b0 b1 b8 b9 b7 b6 bF bE]
+ // in1 = [b3 b2 bB bA b4 b5 bC bD]
+ }
+ {
+ // vertical DCTs finished. Now we do the horizontal DCTs.
+ // Stage 3: Add/subtract
+
+ const __m128i t0 = ADD_EPI16(in0, in1);
+ const __m128i t1 = SUB_EPI16(in0, in1);
+ // t0 = [c0 c1 c8 c9 c4 c5 cC cD]
+ // t1 = [c3 c2 cB cA -c7 -c6 -cF -cE]
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x2(&t0, &t1);
+ if (overflow) {
+ vp9_highbd_fdct4x4_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+
+ // Stage 4: multiply by constants (which gets us into 32 bits).
+ {
+ // The constants needed here are:
+ // k__cospi_E = [p16 p16 p16 p16 p16 p16 p16 p16]
+ // k__cospi_F = [p16 m16 p16 m16 p16 m16 p16 m16]
+ // k__cospi_G = [p08 p24 p08 p24 m08 m24 m08 m24]
+ // k__cospi_H = [p24 m08 p24 m08 m24 p08 m24 p08]
+ const __m128i u0 = _mm_madd_epi16(t0, k__cospi_E);
+ const __m128i u1 = _mm_madd_epi16(t0, k__cospi_F);
+ const __m128i u2 = _mm_madd_epi16(t1, k__cospi_G);
+ const __m128i u3 = _mm_madd_epi16(t1, k__cospi_H);
+      // Then add and right-shift to get back to 16-bit range, but this
+      // also folds in the final right-shift to save operations. This
+      // unusual rounding operation maintains bit-accurate compatibility
+      // with the C version of this function, which has two rounding
+      // steps in a row.
+ const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING2);
+ const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING2);
+ const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING2);
+ const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING2);
+ const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS2);
+ const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS2);
+ const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS2);
+ const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS2);
+ // w0 = [o0 o4 o8 oC]
+ // w1 = [o2 o6 oA oE]
+ // w2 = [o1 o5 o9 oD]
+ // w3 = [o3 o7 oB oF]
+ // remember the o's are numbered according to the correct output location
+ const __m128i x0 = _mm_packs_epi32(w0, w1);
+ const __m128i x1 = _mm_packs_epi32(w2, w3);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x2(&x0, &x1);
+ if (overflow) {
+ vp9_highbd_fdct4x4_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ {
+ // x0 = [o0 o4 o8 oC o2 o6 oA oE]
+ // x1 = [o1 o5 o9 oD o3 o7 oB oF]
+ const __m128i y0 = _mm_unpacklo_epi16(x0, x1);
+ const __m128i y1 = _mm_unpackhi_epi16(x0, x1);
+ // y0 = [o0 o1 o4 o5 o8 o9 oC oD]
+ // y1 = [o2 o3 o6 o7 oA oB oE oF]
+ in0 = _mm_unpacklo_epi32(y0, y1);
+ // in0 = [o0 o1 o2 o3 o4 o5 o6 o7]
+ in1 = _mm_unpackhi_epi32(y0, y1);
+ // in1 = [o8 o9 oA oB oC oD oE oF]
+ }
+ }
+ }
+ // Post-condition (v + 1) >> 2 is now incorporated into previous
+ // add and right-shift commands. Only 2 store instructions needed
+ // because we are using the fact that 1/3 are stored just after 0/2.
+ storeu_output(&in0, output + 0 * 4);
+ storeu_output(&in1, output + 2 * 4);
+}
+
+
+void FDCT8x8_2D(const int16_t *input, tran_low_t *output, int stride) {
+ int pass;
+ // Constants
+ // When we use them, in one case, they are all the same. In all others
+ // it's a pair of them that we need to repeat four times. This is done
+ // by constructing the 32 bit constant corresponding to that pair.
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+ const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
+ const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
+ const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+ const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
+ const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+#if DCT_HIGH_BIT_DEPTH
+ int overflow;
+#endif
+ // Load input
+ __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
+ __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
+ __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
+ __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride));
+ __m128i in4 = _mm_load_si128((const __m128i *)(input + 4 * stride));
+ __m128i in5 = _mm_load_si128((const __m128i *)(input + 5 * stride));
+ __m128i in6 = _mm_load_si128((const __m128i *)(input + 6 * stride));
+ __m128i in7 = _mm_load_si128((const __m128i *)(input + 7 * stride));
+ // Pre-condition input (shift by two)
+ in0 = _mm_slli_epi16(in0, 2);
+ in1 = _mm_slli_epi16(in1, 2);
+ in2 = _mm_slli_epi16(in2, 2);
+ in3 = _mm_slli_epi16(in3, 2);
+ in4 = _mm_slli_epi16(in4, 2);
+ in5 = _mm_slli_epi16(in5, 2);
+ in6 = _mm_slli_epi16(in6, 2);
+ in7 = _mm_slli_epi16(in7, 2);
+
+ // We do two passes, first the columns, then the rows. The results of the
+ // first pass are transposed so that the same column code can be reused. The
+ // results of the second pass are also transposed so that the rows (processed
+ // as columns) are put back in row positions.
+ for (pass = 0; pass < 2; pass++) {
+ // To store results of each pass before the transpose.
+ __m128i res0, res1, res2, res3, res4, res5, res6, res7;
+ // Add/subtract
+ const __m128i q0 = ADD_EPI16(in0, in7);
+ const __m128i q1 = ADD_EPI16(in1, in6);
+ const __m128i q2 = ADD_EPI16(in2, in5);
+ const __m128i q3 = ADD_EPI16(in3, in4);
+ const __m128i q4 = SUB_EPI16(in3, in4);
+ const __m128i q5 = SUB_EPI16(in2, in5);
+ const __m128i q6 = SUB_EPI16(in1, in6);
+ const __m128i q7 = SUB_EPI16(in0, in7);
+#if DCT_HIGH_BIT_DEPTH
+ if (pass == 1) {
+ overflow = check_epi16_overflow_x8(&q0, &q1, &q2, &q3,
+ &q4, &q5, &q6, &q7);
+ if (overflow) {
+ vp9_highbd_fdct8x8_c(input, output, stride);
+ return;
+ }
+ }
+#endif // DCT_HIGH_BIT_DEPTH
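+    // The check above is gated on pass == 1: the first-pass inputs were
+    // only shifted left by 2, which evidently cannot push this first
+    // add/subtract out of int16 range, whereas second-pass intermediates
+    // can.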
+ // Work on first four results
+ {
+ // Add/subtract
+ const __m128i r0 = ADD_EPI16(q0, q3);
+ const __m128i r1 = ADD_EPI16(q1, q2);
+ const __m128i r2 = SUB_EPI16(q1, q2);
+ const __m128i r3 = SUB_EPI16(q0, q3);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x4(&r0, &r1, &r2, &r3);
+ if (overflow) {
+ vp9_highbd_fdct8x8_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+      // Interleave to do the multiply by constants, which gets us into 32 bits.
+ {
+ const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
+ const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
+ const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
+ const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
+ const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
+ const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
+ const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
+ const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16);
+ const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
+ const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08);
+ const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
+ const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24);
+ // dct_const_round_shift
+ const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+ const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+ const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+ const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+ const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
+ const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
+ const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
+ const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
+ const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+ const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+ const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+ const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+ // Combine
+ res0 = _mm_packs_epi32(w0, w1);
+ res4 = _mm_packs_epi32(w2, w3);
+ res2 = _mm_packs_epi32(w4, w5);
+ res6 = _mm_packs_epi32(w6, w7);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x4(&res0, &res4, &res2, &res6);
+ if (overflow) {
+ vp9_highbd_fdct8x8_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ }
+ // Work on next four results
+ {
+      // Interleave to do the multiply by constants, which gets us into 32 bits.
+ const __m128i d0 = _mm_unpacklo_epi16(q6, q5);
+ const __m128i d1 = _mm_unpackhi_epi16(q6, q5);
+ const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16);
+ const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16);
+ const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16);
+ const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16);
+ // dct_const_round_shift
+ const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING);
+ const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING);
+ const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING);
+ const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING);
+ const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS);
+ const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS);
+ const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS);
+ const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS);
+ // Combine
+ const __m128i r0 = _mm_packs_epi32(s0, s1);
+ const __m128i r1 = _mm_packs_epi32(s2, s3);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x2(&r0, &r1);
+ if (overflow) {
+ vp9_highbd_fdct8x8_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ {
+ // Add/subtract
+ const __m128i x0 = ADD_EPI16(q4, r0);
+ const __m128i x1 = SUB_EPI16(q4, r0);
+ const __m128i x2 = SUB_EPI16(q7, r1);
+ const __m128i x3 = ADD_EPI16(q7, r1);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x4(&x0, &x1, &x2, &x3);
+ if (overflow) {
+ vp9_highbd_fdct8x8_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ // Interleave to do the multiply by constants which gets us into 32 bits.
+ {
+ const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
+ const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
+ const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
+ const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
+ const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04);
+ const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04);
+ const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28);
+ const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28);
+ const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20);
+ const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20);
+ const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12);
+ const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12);
+ // dct_const_round_shift
+ const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+ const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+ const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+ const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+ const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
+ const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
+ const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
+ const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
+ const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+ const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+ const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+ const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+ // Combine
+ res1 = _mm_packs_epi32(w0, w1);
+ res7 = _mm_packs_epi32(w2, w3);
+ res5 = _mm_packs_epi32(w4, w5);
+ res3 = _mm_packs_epi32(w6, w7);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x4(&res1, &res7, &res5, &res3);
+ if (overflow) {
+ vp9_highbd_fdct8x8_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ }
+ }
+ // Transpose the 8x8.
+ {
+ // 00 01 02 03 04 05 06 07
+ // 10 11 12 13 14 15 16 17
+ // 20 21 22 23 24 25 26 27
+ // 30 31 32 33 34 35 36 37
+ // 40 41 42 43 44 45 46 47
+ // 50 51 52 53 54 55 56 57
+ // 60 61 62 63 64 65 66 67
+ // 70 71 72 73 74 75 76 77
+ const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1);
+ const __m128i tr0_1 = _mm_unpacklo_epi16(res2, res3);
+ const __m128i tr0_2 = _mm_unpackhi_epi16(res0, res1);
+ const __m128i tr0_3 = _mm_unpackhi_epi16(res2, res3);
+ const __m128i tr0_4 = _mm_unpacklo_epi16(res4, res5);
+ const __m128i tr0_5 = _mm_unpacklo_epi16(res6, res7);
+ const __m128i tr0_6 = _mm_unpackhi_epi16(res4, res5);
+ const __m128i tr0_7 = _mm_unpackhi_epi16(res6, res7);
+ // 00 10 01 11 02 12 03 13
+ // 20 30 21 31 22 32 23 33
+ // 04 14 05 15 06 16 07 17
+ // 24 34 25 35 26 36 27 37
+ // 40 50 41 51 42 52 43 53
+ // 60 70 61 71 62 72 63 73
+ // 44 54 45 55 46 56 47 57
+ // 64 74 65 75 66 76 67 77
+ const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+ const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
+ const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+ const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
+ const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
+ const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
+ const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
+ const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
+ // 00 10 20 30 01 11 21 31
+ // 40 50 60 70 41 51 61 71
+ // 02 12 22 32 03 13 23 33
+ // 42 52 62 72 43 53 63 73
+ // 04 14 24 34 05 15 25 35
+ // 44 54 64 74 45 55 65 75
+ // 06 16 26 36 07 17 27 37
+ // 46 56 66 76 47 57 67 77
+ in0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
+ in1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
+ in2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
+ in3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
+ in4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
+ in5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
+ in6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
+ in7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
+ // 00 10 20 30 40 50 60 70
+ // 01 11 21 31 41 51 61 71
+ // 02 12 22 32 42 52 62 72
+ // 03 13 23 33 43 53 63 73
+ // 04 14 24 34 44 54 64 74
+ // 05 15 25 35 45 55 65 75
+ // 06 16 26 36 46 56 66 76
+ // 07 17 27 37 47 57 67 77
+ }
+ }
+ // Post-condition output and store it
+ {
+ // Post-condition (division by two)
+ // divide each 16-bit signed number by two using shifts:
+ // n / 2 = (n - (n >> 15)) >> 1
+ const __m128i sign_in0 = _mm_srai_epi16(in0, 15);
+ const __m128i sign_in1 = _mm_srai_epi16(in1, 15);
+ const __m128i sign_in2 = _mm_srai_epi16(in2, 15);
+ const __m128i sign_in3 = _mm_srai_epi16(in3, 15);
+ const __m128i sign_in4 = _mm_srai_epi16(in4, 15);
+ const __m128i sign_in5 = _mm_srai_epi16(in5, 15);
+ const __m128i sign_in6 = _mm_srai_epi16(in6, 15);
+ const __m128i sign_in7 = _mm_srai_epi16(in7, 15);
+ in0 = _mm_sub_epi16(in0, sign_in0);
+ in1 = _mm_sub_epi16(in1, sign_in1);
+ in2 = _mm_sub_epi16(in2, sign_in2);
+ in3 = _mm_sub_epi16(in3, sign_in3);
+ in4 = _mm_sub_epi16(in4, sign_in4);
+ in5 = _mm_sub_epi16(in5, sign_in5);
+ in6 = _mm_sub_epi16(in6, sign_in6);
+ in7 = _mm_sub_epi16(in7, sign_in7);
+ in0 = _mm_srai_epi16(in0, 1);
+ in1 = _mm_srai_epi16(in1, 1);
+ in2 = _mm_srai_epi16(in2, 1);
+ in3 = _mm_srai_epi16(in3, 1);
+ in4 = _mm_srai_epi16(in4, 1);
+ in5 = _mm_srai_epi16(in5, 1);
+ in6 = _mm_srai_epi16(in6, 1);
+ in7 = _mm_srai_epi16(in7, 1);
+ // store results
+ store_output(&in0, (output + 0 * 8));
+ store_output(&in1, (output + 1 * 8));
+ store_output(&in2, (output + 2 * 8));
+ store_output(&in3, (output + 3 * 8));
+ store_output(&in4, (output + 4 * 8));
+ store_output(&in5, (output + 5 * 8));
+ store_output(&in6, (output + 6 * 8));
+ store_output(&in7, (output + 7 * 8));
+ }
+}
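
The post-condition above divides every coefficient by two without a real
division instruction. A minimal scalar model of the trick (an illustration,
not part of the patch):

    #include <stdint.h>

    /* Divide a 16-bit signed value by two, rounding toward zero. An
     * arithmetic shift alone would round toward negative infinity;
     * subtracting the sign mask (-1 for negative n, 0 otherwise) first
     * corrects that, giving n / 2 = (n - (n >> 15)) >> 1. */
    static int16_t div2_round_to_zero(int16_t n) {
      return (int16_t)((n - (n >> 15)) >> 1);
    }

For example, -3 >> 1 is -2, but (-3 - (-1)) >> 1 is -1, matching C integer
division.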
+
+void FDCT16x16_2D(const int16_t *input, tran_low_t *output, int stride) {
+ // The 2D transform is done in two very similar passes. The first pass
+ // transforms the columns and transposes the results. The second pass
+ // transforms the rows; because the first pass left its results
+ // transposed, it again operates on columns (the original rows) and
+ // transposes its results back into normal row order.
+ int pass;
+ // We need an intermediate buffer between passes.
+ DECLARE_ALIGNED(16, int16_t, intermediate[256]);
+ const int16_t *in = input;
+ int16_t *out0 = intermediate;
+ tran_low_t *out1 = output;
+ // Constants
+ // In one case all eight lanes share a single constant. In all others a
+ // pair of constants is repeated four times, which is done by
+ // constructing the 32-bit constant corresponding to that pair.
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+ const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
+ const __m128i k__cospi_p08_m24 = pair_set_epi16(cospi_8_64, -cospi_24_64);
+ const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
+ const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+ const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
+ const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+ const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64);
+ const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64);
+ const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
+ const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
+ const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64);
+ const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64);
+ const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
+ const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ const __m128i kOne = _mm_set1_epi16(1);
+ // Do the two transform/transpose passes
+ for (pass = 0; pass < 2; ++pass) {
+ // We process eight columns (transposed rows in second pass) at a time.
+ int column_start;
+#if DCT_HIGH_BIT_DEPTH
+ int overflow;
+#endif
+ for (column_start = 0; column_start < 16; column_start += 8) {
+ __m128i in00, in01, in02, in03, in04, in05, in06, in07;
+ __m128i in08, in09, in10, in11, in12, in13, in14, in15;
+ __m128i input0, input1, input2, input3, input4, input5, input6, input7;
+ __m128i step1_0, step1_1, step1_2, step1_3;
+ __m128i step1_4, step1_5, step1_6, step1_7;
+ __m128i step2_1, step2_2, step2_3, step2_4, step2_5, step2_6;
+ __m128i step3_0, step3_1, step3_2, step3_3;
+ __m128i step3_4, step3_5, step3_6, step3_7;
+ __m128i res00, res01, res02, res03, res04, res05, res06, res07;
+ __m128i res08, res09, res10, res11, res12, res13, res14, res15;
+ // Load and pre-condition input.
+ if (0 == pass) {
+ in00 = _mm_load_si128((const __m128i *)(in + 0 * stride));
+ in01 = _mm_load_si128((const __m128i *)(in + 1 * stride));
+ in02 = _mm_load_si128((const __m128i *)(in + 2 * stride));
+ in03 = _mm_load_si128((const __m128i *)(in + 3 * stride));
+ in04 = _mm_load_si128((const __m128i *)(in + 4 * stride));
+ in05 = _mm_load_si128((const __m128i *)(in + 5 * stride));
+ in06 = _mm_load_si128((const __m128i *)(in + 6 * stride));
+ in07 = _mm_load_si128((const __m128i *)(in + 7 * stride));
+ in08 = _mm_load_si128((const __m128i *)(in + 8 * stride));
+ in09 = _mm_load_si128((const __m128i *)(in + 9 * stride));
+ in10 = _mm_load_si128((const __m128i *)(in + 10 * stride));
+ in11 = _mm_load_si128((const __m128i *)(in + 11 * stride));
+ in12 = _mm_load_si128((const __m128i *)(in + 12 * stride));
+ in13 = _mm_load_si128((const __m128i *)(in + 13 * stride));
+ in14 = _mm_load_si128((const __m128i *)(in + 14 * stride));
+ in15 = _mm_load_si128((const __m128i *)(in + 15 * stride));
+ // x = x << 2
+ in00 = _mm_slli_epi16(in00, 2);
+ in01 = _mm_slli_epi16(in01, 2);
+ in02 = _mm_slli_epi16(in02, 2);
+ in03 = _mm_slli_epi16(in03, 2);
+ in04 = _mm_slli_epi16(in04, 2);
+ in05 = _mm_slli_epi16(in05, 2);
+ in06 = _mm_slli_epi16(in06, 2);
+ in07 = _mm_slli_epi16(in07, 2);
+ in08 = _mm_slli_epi16(in08, 2);
+ in09 = _mm_slli_epi16(in09, 2);
+ in10 = _mm_slli_epi16(in10, 2);
+ in11 = _mm_slli_epi16(in11, 2);
+ in12 = _mm_slli_epi16(in12, 2);
+ in13 = _mm_slli_epi16(in13, 2);
+ in14 = _mm_slli_epi16(in14, 2);
+ in15 = _mm_slli_epi16(in15, 2);
+ } else {
+ in00 = _mm_load_si128((const __m128i *)(in + 0 * 16));
+ in01 = _mm_load_si128((const __m128i *)(in + 1 * 16));
+ in02 = _mm_load_si128((const __m128i *)(in + 2 * 16));
+ in03 = _mm_load_si128((const __m128i *)(in + 3 * 16));
+ in04 = _mm_load_si128((const __m128i *)(in + 4 * 16));
+ in05 = _mm_load_si128((const __m128i *)(in + 5 * 16));
+ in06 = _mm_load_si128((const __m128i *)(in + 6 * 16));
+ in07 = _mm_load_si128((const __m128i *)(in + 7 * 16));
+ in08 = _mm_load_si128((const __m128i *)(in + 8 * 16));
+ in09 = _mm_load_si128((const __m128i *)(in + 9 * 16));
+ in10 = _mm_load_si128((const __m128i *)(in + 10 * 16));
+ in11 = _mm_load_si128((const __m128i *)(in + 11 * 16));
+ in12 = _mm_load_si128((const __m128i *)(in + 12 * 16));
+ in13 = _mm_load_si128((const __m128i *)(in + 13 * 16));
+ in14 = _mm_load_si128((const __m128i *)(in + 14 * 16));
+ in15 = _mm_load_si128((const __m128i *)(in + 15 * 16));
+ // x = (x + 1) >> 2
+ in00 = _mm_add_epi16(in00, kOne);
+ in01 = _mm_add_epi16(in01, kOne);
+ in02 = _mm_add_epi16(in02, kOne);
+ in03 = _mm_add_epi16(in03, kOne);
+ in04 = _mm_add_epi16(in04, kOne);
+ in05 = _mm_add_epi16(in05, kOne);
+ in06 = _mm_add_epi16(in06, kOne);
+ in07 = _mm_add_epi16(in07, kOne);
+ in08 = _mm_add_epi16(in08, kOne);
+ in09 = _mm_add_epi16(in09, kOne);
+ in10 = _mm_add_epi16(in10, kOne);
+ in11 = _mm_add_epi16(in11, kOne);
+ in12 = _mm_add_epi16(in12, kOne);
+ in13 = _mm_add_epi16(in13, kOne);
+ in14 = _mm_add_epi16(in14, kOne);
+ in15 = _mm_add_epi16(in15, kOne);
+ in00 = _mm_srai_epi16(in00, 2);
+ in01 = _mm_srai_epi16(in01, 2);
+ in02 = _mm_srai_epi16(in02, 2);
+ in03 = _mm_srai_epi16(in03, 2);
+ in04 = _mm_srai_epi16(in04, 2);
+ in05 = _mm_srai_epi16(in05, 2);
+ in06 = _mm_srai_epi16(in06, 2);
+ in07 = _mm_srai_epi16(in07, 2);
+ in08 = _mm_srai_epi16(in08, 2);
+ in09 = _mm_srai_epi16(in09, 2);
+ in10 = _mm_srai_epi16(in10, 2);
+ in11 = _mm_srai_epi16(in11, 2);
+ in12 = _mm_srai_epi16(in12, 2);
+ in13 = _mm_srai_epi16(in13, 2);
+ in14 = _mm_srai_epi16(in14, 2);
+ in15 = _mm_srai_epi16(in15, 2);
+ }
+ in += 8;
+ // Calculate input for the first 8 results.
+ {
+ input0 = ADD_EPI16(in00, in15);
+ input1 = ADD_EPI16(in01, in14);
+ input2 = ADD_EPI16(in02, in13);
+ input3 = ADD_EPI16(in03, in12);
+ input4 = ADD_EPI16(in04, in11);
+ input5 = ADD_EPI16(in05, in10);
+ input6 = ADD_EPI16(in06, in09);
+ input7 = ADD_EPI16(in07, in08);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x8(&input0, &input1, &input2, &input3,
+ &input4, &input5, &input6, &input7);
+ if (overflow) {
+ vp9_highbd_fdct16x16_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ // Calculate input for the next 8 results.
+ {
+ step1_0 = SUB_EPI16(in07, in08);
+ step1_1 = SUB_EPI16(in06, in09);
+ step1_2 = SUB_EPI16(in05, in10);
+ step1_3 = SUB_EPI16(in04, in11);
+ step1_4 = SUB_EPI16(in03, in12);
+ step1_5 = SUB_EPI16(in02, in13);
+ step1_6 = SUB_EPI16(in01, in14);
+ step1_7 = SUB_EPI16(in00, in15);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x8(&step1_0, &step1_1,
+ &step1_2, &step1_3,
+ &step1_4, &step1_5,
+ &step1_6, &step1_7);
+ if (overflow) {
+ vp9_highbd_fdct16x16_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ // Work on the first eight values; fdct8(input, even_results);
+ {
+ // Add/subtract
+ const __m128i q0 = ADD_EPI16(input0, input7);
+ const __m128i q1 = ADD_EPI16(input1, input6);
+ const __m128i q2 = ADD_EPI16(input2, input5);
+ const __m128i q3 = ADD_EPI16(input3, input4);
+ const __m128i q4 = SUB_EPI16(input3, input4);
+ const __m128i q5 = SUB_EPI16(input2, input5);
+ const __m128i q6 = SUB_EPI16(input1, input6);
+ const __m128i q7 = SUB_EPI16(input0, input7);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x8(&q0, &q1, &q2, &q3,
+ &q4, &q5, &q6, &q7);
+ if (overflow) {
+ vp9_highbd_fdct16x16_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ // Work on first four results
+ {
+ // Add/subtract
+ const __m128i r0 = ADD_EPI16(q0, q3);
+ const __m128i r1 = ADD_EPI16(q1, q2);
+ const __m128i r2 = SUB_EPI16(q1, q2);
+ const __m128i r3 = SUB_EPI16(q0, q3);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x4(&r0, &r1, &r2, &r3);
+ if (overflow) {
+ vp9_highbd_fdct16x16_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ // Interleave to do the multiply by constants which gets us
+ // into 32 bits.
+ {
+ const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
+ const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
+ const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
+ const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
+ res00 = mult_round_shift(&t0, &t1, &k__cospi_p16_p16,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ res08 = mult_round_shift(&t0, &t1, &k__cospi_p16_m16,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ res04 = mult_round_shift(&t2, &t3, &k__cospi_p24_p08,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ res12 = mult_round_shift(&t2, &t3, &k__cospi_m08_p24,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x4(&res00, &res08, &res04, &res12);
+ if (overflow) {
+ vp9_highbd_fdct16x16_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ }
+ // Work on next four results
+ {
+ // Interleave to do the multiply by constants which gets us
+ // into 32 bits.
+ const __m128i d0 = _mm_unpacklo_epi16(q6, q5);
+ const __m128i d1 = _mm_unpackhi_epi16(q6, q5);
+ const __m128i r0 = mult_round_shift(&d0, &d1, &k__cospi_p16_m16,
+ &k__DCT_CONST_ROUNDING,
+ DCT_CONST_BITS);
+ const __m128i r1 = mult_round_shift(&d0, &d1, &k__cospi_p16_p16,
+ &k__DCT_CONST_ROUNDING,
+ DCT_CONST_BITS);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x2(&r0, &r1);
+ if (overflow) {
+ vp9_highbd_fdct16x16_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ {
+ // Add/subtract
+ const __m128i x0 = ADD_EPI16(q4, r0);
+ const __m128i x1 = SUB_EPI16(q4, r0);
+ const __m128i x2 = SUB_EPI16(q7, r1);
+ const __m128i x3 = ADD_EPI16(q7, r1);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x4(&x0, &x1, &x2, &x3);
+ if (overflow) {
+ vp9_highbd_fdct16x16_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ // Interleave to do the multiply by constants which gets us
+ // into 32 bits.
+ {
+ const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
+ const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
+ const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
+ const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
+ res02 = mult_round_shift(&t0, &t1, &k__cospi_p28_p04,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ res14 = mult_round_shift(&t0, &t1, &k__cospi_m04_p28,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ res10 = mult_round_shift(&t2, &t3, &k__cospi_p12_p20,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ res06 = mult_round_shift(&t2, &t3, &k__cospi_m20_p12,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x4(&res02, &res14,
+ &res10, &res06);
+ if (overflow) {
+ vp9_highbd_fdct16x16_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ }
+ }
+ }
+ // Work on the next eight values; step1 -> odd_results
+ {
+ // step 2
+ {
+ const __m128i t0 = _mm_unpacklo_epi16(step1_5, step1_2);
+ const __m128i t1 = _mm_unpackhi_epi16(step1_5, step1_2);
+ const __m128i t2 = _mm_unpacklo_epi16(step1_4, step1_3);
+ const __m128i t3 = _mm_unpackhi_epi16(step1_4, step1_3);
+ step2_2 = mult_round_shift(&t0, &t1, &k__cospi_p16_m16,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ step2_3 = mult_round_shift(&t2, &t3, &k__cospi_p16_m16,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ step2_5 = mult_round_shift(&t0, &t1, &k__cospi_p16_p16,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ step2_4 = mult_round_shift(&t2, &t3, &k__cospi_p16_p16,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x4(&step2_2, &step2_3, &step2_5,
+ &step2_4);
+ if (overflow) {
+ vp9_highbd_fdct16x16_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ // step 3
+ {
+ step3_0 = ADD_EPI16(step1_0, step2_3);
+ step3_1 = ADD_EPI16(step1_1, step2_2);
+ step3_2 = SUB_EPI16(step1_1, step2_2);
+ step3_3 = SUB_EPI16(step1_0, step2_3);
+ step3_4 = SUB_EPI16(step1_7, step2_4);
+ step3_5 = SUB_EPI16(step1_6, step2_5);
+ step3_6 = ADD_EPI16(step1_6, step2_5);
+ step3_7 = ADD_EPI16(step1_7, step2_4);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x8(&step3_0, &step3_1,
+ &step3_2, &step3_3,
+ &step3_4, &step3_5,
+ &step3_6, &step3_7);
+ if (overflow) {
+ vp9_highbd_fdct16x16_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ // step 4
+ {
+ const __m128i t0 = _mm_unpacklo_epi16(step3_1, step3_6);
+ const __m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6);
+ const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5);
+ const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5);
+ step2_1 = mult_round_shift(&t0, &t1, &k__cospi_m08_p24,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ step2_2 = mult_round_shift(&t2, &t3, &k__cospi_p24_p08,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ step2_6 = mult_round_shift(&t0, &t1, &k__cospi_p24_p08,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ step2_5 = mult_round_shift(&t2, &t3, &k__cospi_p08_m24,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x4(&step2_1, &step2_2, &step2_6,
+ &step2_5);
+ if (overflow) {
+ vp9_highbd_fdct16x16_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ // step 5
+ {
+ step1_0 = ADD_EPI16(step3_0, step2_1);
+ step1_1 = SUB_EPI16(step3_0, step2_1);
+ step1_2 = ADD_EPI16(step3_3, step2_2);
+ step1_3 = SUB_EPI16(step3_3, step2_2);
+ step1_4 = SUB_EPI16(step3_4, step2_5);
+ step1_5 = ADD_EPI16(step3_4, step2_5);
+ step1_6 = SUB_EPI16(step3_7, step2_6);
+ step1_7 = ADD_EPI16(step3_7, step2_6);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x8(&step1_0, &step1_1,
+ &step1_2, &step1_3,
+ &step1_4, &step1_5,
+ &step1_6, &step1_7);
+ if (overflow) {
+ vp9_highbd_fdct16x16_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ // step 6
+ {
+ const __m128i t0 = _mm_unpacklo_epi16(step1_0, step1_7);
+ const __m128i t1 = _mm_unpackhi_epi16(step1_0, step1_7);
+ const __m128i t2 = _mm_unpacklo_epi16(step1_1, step1_6);
+ const __m128i t3 = _mm_unpackhi_epi16(step1_1, step1_6);
+ res01 = mult_round_shift(&t0, &t1, &k__cospi_p30_p02,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ res09 = mult_round_shift(&t2, &t3, &k__cospi_p14_p18,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ res15 = mult_round_shift(&t0, &t1, &k__cospi_m02_p30,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ res07 = mult_round_shift(&t2, &t3, &k__cospi_m18_p14,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x4(&res01, &res09, &res15, &res07);
+ if (overflow) {
+ vp9_highbd_fdct16x16_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ {
+ const __m128i t0 = _mm_unpacklo_epi16(step1_2, step1_5);
+ const __m128i t1 = _mm_unpackhi_epi16(step1_2, step1_5);
+ const __m128i t2 = _mm_unpacklo_epi16(step1_3, step1_4);
+ const __m128i t3 = _mm_unpackhi_epi16(step1_3, step1_4);
+ res05 = mult_round_shift(&t0, &t1, &k__cospi_p22_p10,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ res13 = mult_round_shift(&t2, &t3, &k__cospi_p06_p26,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ res11 = mult_round_shift(&t0, &t1, &k__cospi_m10_p22,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+ res03 = mult_round_shift(&t2, &t3, &k__cospi_m26_p06,
+ &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x4(&res05, &res13, &res11, &res03);
+ if (overflow) {
+ vp9_highbd_fdct16x16_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ }
+ // Transpose the results, do it as two 8x8 transposes.
+ transpose_and_output8x8(&res00, &res01, &res02, &res03,
+ &res04, &res05, &res06, &res07,
+ pass, out0, out1);
+ transpose_and_output8x8(&res08, &res09, &res10, &res11,
+ &res12, &res13, &res14, &res15,
+ pass, out0 + 8, out1 + 8);
+ if (pass == 0) {
+ out0 += 8*16;
+ } else {
+ out1 += 8*16;
+ }
+ }
+ // Set up the input pointer for the next pass.
+ in = intermediate;
+ }
+}
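
Each mult_round_shift call above stands in for the unpack/madd/round/shift
sequence written out long-hand in FDCT8x8_2D. A scalar sketch of one output
lane, assuming the DCT_CONST_BITS/DCT_CONST_ROUNDING definitions from
vp9/common/vp9_idct.h:

    #include <stdint.h>

    #define DCT_CONST_BITS 14
    #define DCT_CONST_ROUNDING (1 << (DCT_CONST_BITS - 1))

    /* One lane of mult_round_shift: multiply a pair of 16-bit inputs by
     * a cosine pair (the two halves of a pair_set_epi16 constant), add
     * the rounding constant, and shift back into 16-bit range. */
    static int16_t mult_round_shift_lane(int16_t a, int16_t b,
                                         int16_t c0, int16_t c1) {
      const int32_t product = a * c0 + b * c1;        /* _mm_madd_epi16 */
      return (int16_t)((product + DCT_CONST_ROUNDING) /* _mm_add_epi32  */
                       >> DCT_CONST_BITS);            /* _mm_srai_epi32 */
    }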
+
+#undef ADD_EPI16
+#undef SUB_EPI16
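
ADD_EPI16 and SUB_EPI16 are undefined here after being used throughout both
transforms. Their definitions sit outside this hunk; a plausible sketch,
assuming the high-bit-depth build switches to saturating arithmetic so that
the check_epi16_overflow_* helpers can observe out-of-range sums:

    #if DCT_HIGH_BIT_DEPTH
    #define ADD_EPI16 _mm_adds_epi16  /* saturating: overflow detectable */
    #define SUB_EPI16 _mm_subs_epi16
    #else
    #define ADD_EPI16 _mm_add_epi16   /* wrapping: 8-bit input range     */
    #define SUB_EPI16 _mm_sub_epi16   /* cannot overflow 16-bit sums     */
    #endif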
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct_mmx.asm b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct_mmx.asm
index f71181c5e91..b41fbc8b3bb 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct_mmx.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct_mmx.asm
@@ -62,9 +62,40 @@ cglobal fwht4x4, 3, 4, 8, input, output, stride
psllw m2, 2
psllw m3, 2
+%if CONFIG_VP9_HIGHBITDEPTH
+ pxor m4, m4
+ pxor m5, m5
+ pcmpgtw m4, m0
+ pcmpgtw m5, m1
+ movq m6, m0
+ movq m7, m1
+ punpcklwd m0, m4
+ punpcklwd m1, m5
+ punpckhwd m6, m4
+ punpckhwd m7, m5
+ movq [outputq], m0
+ movq [outputq + 8], m6
+ movq [outputq + 16], m1
+ movq [outputq + 24], m7
+ pxor m4, m4
+ pxor m5, m5
+ pcmpgtw m4, m2
+ pcmpgtw m5, m3
+ movq m6, m2
+ movq m7, m3
+ punpcklwd m2, m4
+ punpcklwd m3, m5
+ punpckhwd m6, m4
+ punpckhwd m7, m5
+ movq [outputq + 32], m2
+ movq [outputq + 40], m6
+ movq [outputq + 48], m3
+ movq [outputq + 56], m7
+%else
movq [outputq], m0
movq [outputq + 8], m1
movq [outputq + 16], m2
movq [outputq + 24], m3
+%endif
RET
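
The new %if CONFIG_VP9_HIGHBITDEPTH branch widens each 16-bit result to 32
bits before storing, since tran_low_t is 32 bits wide in that build. The
pcmpgtw + punpcklwd/punpckhwd pair is the standard SSE2 sign-extension
idiom; an equivalent intrinsics sketch (an illustration, not the committed
code):

    #include <emmintrin.h>
    #include <stdint.h>

    /* Sign-extend eight 16-bit coefficients to 32 bits and store them,
     * mirroring the pcmpgtw + punpcklwd/punpckhwd sequence above. */
    static void store_widened(const __m128i v, int32_t *out) {
      /* 0xffff in lanes where v < 0, zero elsewhere. */
      const __m128i sign = _mm_cmpgt_epi16(_mm_setzero_si128(), v);
      const __m128i lo = _mm_unpacklo_epi16(v, sign); /* low four lanes  */
      const __m128i hi = _mm_unpackhi_epi16(v, sign); /* high four lanes */
      _mm_storeu_si128((__m128i *)(out + 0), lo);
      _mm_storeu_si128((__m128i *)(out + 4), hi);
    }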
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct_sse2.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct_sse2.c
index e799951c2b3..564b7955e5b 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct_sse2.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct_sse2.c
@@ -8,13 +8,14 @@
* be found in the AUTHORS file in the root of the source tree.
*/
+#include <assert.h>
#include <emmintrin.h> // SSE2
#include "vp9/common/vp9_idct.h" // for cospi constants
+#include "vp9/encoder/vp9_dct.h"
+#include "vp9/encoder/x86/vp9_dct_sse2.h"
#include "vpx_ports/mem.h"
-#include "vp9/common/x86/vp9_idct_intrin_sse2.h"
-
-void vp9_fdct4x4_1_sse2(const int16_t *input, int16_t *output, int stride) {
+void vp9_fdct4x4_1_sse2(const int16_t *input, tran_low_t *output, int stride) {
__m128i in0, in1;
__m128i tmp;
const __m128i zero = _mm_setzero_si128();
@@ -40,209 +41,9 @@ void vp9_fdct4x4_1_sse2(const int16_t *input, int16_t *output, int stride) {
in1 = _mm_add_epi32(tmp, in0);
in0 = _mm_slli_epi32(in1, 1);
- _mm_store_si128((__m128i *)(output), in0);
-}
-
-void vp9_fdct4x4_sse2(const int16_t *input, int16_t *output, int stride) {
- // This 2D transform implements 4 vertical 1D transforms followed
- // by 4 horizontal 1D transforms. The multiplies and adds are as given
- // by Chen, Smith and Fralick ('77). The commands for moving the data
- // around have been minimized by hand.
- // For the purposes of the comments, the 16 inputs are referred to at i0
- // through iF (in raster order), intermediate variables are a0, b0, c0
- // through f, and correspond to the in-place computations mapped to input
- // locations. The outputs, o0 through oF are labeled according to the
- // output locations.
-
- // Constants
- // These are the coefficients used for the multiplies.
- // In the comments, pN means cos(N pi /64) and mN is -cos(N pi /64),
- // where cospi_N_64 = cos(N pi /64)
- const __m128i k__cospi_A = _mm_setr_epi16(cospi_16_64, cospi_16_64,
- cospi_16_64, cospi_16_64,
- cospi_16_64, -cospi_16_64,
- cospi_16_64, -cospi_16_64);
- const __m128i k__cospi_B = _mm_setr_epi16(cospi_16_64, -cospi_16_64,
- cospi_16_64, -cospi_16_64,
- cospi_16_64, cospi_16_64,
- cospi_16_64, cospi_16_64);
- const __m128i k__cospi_C = _mm_setr_epi16(cospi_8_64, cospi_24_64,
- cospi_8_64, cospi_24_64,
- cospi_24_64, -cospi_8_64,
- cospi_24_64, -cospi_8_64);
- const __m128i k__cospi_D = _mm_setr_epi16(cospi_24_64, -cospi_8_64,
- cospi_24_64, -cospi_8_64,
- cospi_8_64, cospi_24_64,
- cospi_8_64, cospi_24_64);
- const __m128i k__cospi_E = _mm_setr_epi16(cospi_16_64, cospi_16_64,
- cospi_16_64, cospi_16_64,
- cospi_16_64, cospi_16_64,
- cospi_16_64, cospi_16_64);
- const __m128i k__cospi_F = _mm_setr_epi16(cospi_16_64, -cospi_16_64,
- cospi_16_64, -cospi_16_64,
- cospi_16_64, -cospi_16_64,
- cospi_16_64, -cospi_16_64);
- const __m128i k__cospi_G = _mm_setr_epi16(cospi_8_64, cospi_24_64,
- cospi_8_64, cospi_24_64,
- -cospi_8_64, -cospi_24_64,
- -cospi_8_64, -cospi_24_64);
- const __m128i k__cospi_H = _mm_setr_epi16(cospi_24_64, -cospi_8_64,
- cospi_24_64, -cospi_8_64,
- -cospi_24_64, cospi_8_64,
- -cospi_24_64, cospi_8_64);
-
- const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
- // This second rounding constant saves doing some extra adds at the end
- const __m128i k__DCT_CONST_ROUNDING2 = _mm_set1_epi32(DCT_CONST_ROUNDING
- +(DCT_CONST_ROUNDING << 1));
- const int DCT_CONST_BITS2 = DCT_CONST_BITS+2;
- const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
- const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
- __m128i in0, in1;
-
- // Load inputs.
- {
- in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
- in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
- in1 = _mm_unpacklo_epi64(in1, _mm_loadl_epi64((const __m128i *)
- (input + 2 * stride)));
- in0 = _mm_unpacklo_epi64(in0, _mm_loadl_epi64((const __m128i *)
- (input + 3 * stride)));
- // in0 = [i0 i1 i2 i3 iC iD iE iF]
- // in1 = [i4 i5 i6 i7 i8 i9 iA iB]
-
-
- // multiply by 16 to give some extra precision
- in0 = _mm_slli_epi16(in0, 4);
- in1 = _mm_slli_epi16(in1, 4);
- // if (i == 0 && input[0]) input[0] += 1;
- // add 1 to the upper left pixel if it is non-zero, which helps reduce
- // the round-trip error
- {
- // The mask will only contain whether the first value is zero, all
- // other comparison will fail as something shifted by 4 (above << 4)
- // can never be equal to one. To increment in the non-zero case, we
- // add the mask and one for the first element:
- // - if zero, mask = -1, v = v - 1 + 1 = v
- // - if non-zero, mask = 0, v = v + 0 + 1 = v + 1
- __m128i mask = _mm_cmpeq_epi16(in0, k__nonzero_bias_a);
- in0 = _mm_add_epi16(in0, mask);
- in0 = _mm_add_epi16(in0, k__nonzero_bias_b);
- }
- }
- // There are 4 total stages, alternating between an add/subtract stage
- // followed by an multiply-and-add stage.
- {
- // Stage 1: Add/subtract
-
- // in0 = [i0 i1 i2 i3 iC iD iE iF]
- // in1 = [i4 i5 i6 i7 i8 i9 iA iB]
- const __m128i r0 = _mm_unpacklo_epi16(in0, in1);
- const __m128i r1 = _mm_unpackhi_epi16(in0, in1);
- // r0 = [i0 i4 i1 i5 i2 i6 i3 i7]
- // r1 = [iC i8 iD i9 iE iA iF iB]
- const __m128i r2 = _mm_shuffle_epi32(r0, 0xB4);
- const __m128i r3 = _mm_shuffle_epi32(r1, 0xB4);
- // r2 = [i0 i4 i1 i5 i3 i7 i2 i6]
- // r3 = [iC i8 iD i9 iF iB iE iA]
-
- const __m128i t0 = _mm_add_epi16(r2, r3);
- const __m128i t1 = _mm_sub_epi16(r2, r3);
- // t0 = [a0 a4 a1 a5 a3 a7 a2 a6]
- // t1 = [aC a8 aD a9 aF aB aE aA]
-
- // Stage 2: multiply by constants (which gets us into 32 bits).
- // The constants needed here are:
- // k__cospi_A = [p16 p16 p16 p16 p16 m16 p16 m16]
- // k__cospi_B = [p16 m16 p16 m16 p16 p16 p16 p16]
- // k__cospi_C = [p08 p24 p08 p24 p24 m08 p24 m08]
- // k__cospi_D = [p24 m08 p24 m08 p08 p24 p08 p24]
- const __m128i u0 = _mm_madd_epi16(t0, k__cospi_A);
- const __m128i u2 = _mm_madd_epi16(t0, k__cospi_B);
- const __m128i u1 = _mm_madd_epi16(t1, k__cospi_C);
- const __m128i u3 = _mm_madd_epi16(t1, k__cospi_D);
- // Then add and right-shift to get back to 16-bit range
- const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
- const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
- const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
- const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
- const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
- const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
- const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
- const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
- // w0 = [b0 b1 b7 b6]
- // w1 = [b8 b9 bF bE]
- // w2 = [b4 b5 b3 b2]
- // w3 = [bC bD bB bA]
- const __m128i x0 = _mm_packs_epi32(w0, w1);
- const __m128i x1 = _mm_packs_epi32(w2, w3);
- // x0 = [b0 b1 b7 b6 b8 b9 bF bE]
- // x1 = [b4 b5 b3 b2 bC bD bB bA]
- in0 = _mm_shuffle_epi32(x0, 0xD8);
- in1 = _mm_shuffle_epi32(x1, 0x8D);
- // in0 = [b0 b1 b8 b9 b7 b6 bF bE]
- // in1 = [b3 b2 bB bA b4 b5 bC bD]
- }
- {
- // vertical DCTs finished. Now we do the horizontal DCTs.
- // Stage 3: Add/subtract
-
- const __m128i t0 = _mm_add_epi16(in0, in1);
- const __m128i t1 = _mm_sub_epi16(in0, in1);
- // t0 = [c0 c1 c8 c9 c4 c5 cC cD]
- // t1 = [c3 c2 cB cA -c7 -c6 -cF -cE]
-
- // Stage 4: multiply by constants (which gets us into 32 bits).
- // The constants needed here are:
- // k__cospi_E = [p16 p16 p16 p16 p16 p16 p16 p16]
- // k__cospi_F = [p16 m16 p16 m16 p16 m16 p16 m16]
- // k__cospi_G = [p08 p24 p08 p24 m08 m24 m08 m24]
- // k__cospi_H = [p24 m08 p24 m08 m24 p08 m24 p08]
- const __m128i u0 = _mm_madd_epi16(t0, k__cospi_E);
- const __m128i u1 = _mm_madd_epi16(t0, k__cospi_F);
- const __m128i u2 = _mm_madd_epi16(t1, k__cospi_G);
- const __m128i u3 = _mm_madd_epi16(t1, k__cospi_H);
- // Then add and right-shift to get back to 16-bit range
- // but this combines the final right-shift as well to save operations
- // This unusual rounding operations is to maintain bit-accurate
- // compatibility with the c version of this function which has two
- // rounding steps in a row.
- const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING2);
- const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING2);
- const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING2);
- const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING2);
- const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS2);
- const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS2);
- const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS2);
- const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS2);
- // w0 = [o0 o4 o8 oC]
- // w1 = [o2 o6 oA oE]
- // w2 = [o1 o5 o9 oD]
- // w3 = [o3 o7 oB oF]
- // remember the o's are numbered according to the correct output location
- const __m128i x0 = _mm_packs_epi32(w0, w1);
- const __m128i x1 = _mm_packs_epi32(w2, w3);
- // x0 = [o0 o4 o8 oC o2 o6 oA oE]
- // x1 = [o1 o5 o9 oD o3 o7 oB oF]
- const __m128i y0 = _mm_unpacklo_epi16(x0, x1);
- const __m128i y1 = _mm_unpackhi_epi16(x0, x1);
- // y0 = [o0 o1 o4 o5 o8 o9 oC oD]
- // y1 = [o2 o3 o6 o7 oA oB oE oF]
- in0 = _mm_unpacklo_epi32(y0, y1);
- // in0 = [o0 o1 o2 o3 o4 o5 o6 o7]
- in1 = _mm_unpackhi_epi32(y0, y1);
- // in1 = [o8 o9 oA oB oC oD oE oF]
- }
- // Post-condition (v + 1) >> 2 is now incorporated into previous
- // add and right-shift commands. Only 2 store instructions needed
- // because we are using the fact that 1/3 are stored just after 0/2.
- {
- _mm_storeu_si128((__m128i *)(output + 0 * 4), in0);
- _mm_storeu_si128((__m128i *)(output + 2 * 4), in1);
- }
+ store_output(&in0, output);
}
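
The hand-written _mm_store_si128 calls throughout this file are replaced by
store_output, declared in the newly included vp9/encoder/x86/vp9_dct_sse2.h.
Its body is not part of this diff; a sketch consistent with its call sites,
assuming tran_low_t widens to 32 bits only under CONFIG_VP9_HIGHBITDEPTH:

    static INLINE void store_output(const __m128i *poutput,
                                    tran_low_t *dst_ptr) {
    #if CONFIG_VP9_HIGHBITDEPTH
      /* tran_low_t is 32-bit: sign-extend the eight 16-bit lanes. */
      const __m128i zero = _mm_setzero_si128();
      const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero);
      const __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits);
      const __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits);
      _mm_store_si128((__m128i *)(dst_ptr), out0);
      _mm_store_si128((__m128i *)(dst_ptr + 4), out1);
    #else
      /* tran_low_t is 16-bit: a single aligned store suffices. */
      _mm_store_si128((__m128i *)(dst_ptr), *poutput);
    #endif
    }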
-
static INLINE void load_buffer_4x4(const int16_t *input, __m128i *in,
int stride) {
const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
@@ -264,7 +65,7 @@ static INLINE void load_buffer_4x4(const int16_t *input, __m128i *in,
in[0] = _mm_add_epi16(in[0], k__nonzero_bias_b);
}
-static INLINE void write_buffer_4x4(int16_t *output, __m128i *res) {
+static INLINE void write_buffer_4x4(tran_low_t *output, __m128i *res) {
const __m128i kOne = _mm_set1_epi16(1);
__m128i in01 = _mm_unpacklo_epi64(res[0], res[1]);
__m128i in23 = _mm_unpacklo_epi64(res[2], res[3]);
@@ -272,8 +73,8 @@ static INLINE void write_buffer_4x4(int16_t *output, __m128i *res) {
__m128i out23 = _mm_add_epi16(in23, kOne);
out01 = _mm_srai_epi16(out01, 2);
out23 = _mm_srai_epi16(out23, 2);
- _mm_store_si128((__m128i *)(output + 0 * 8), out01);
- _mm_store_si128((__m128i *)(output + 1 * 8), out23);
+ store_output(&out01, (output + 0 * 8));
+ store_output(&out23, (output + 1 * 8));
}
static INLINE void transpose_4x4(__m128i *res) {
@@ -296,7 +97,7 @@ static INLINE void transpose_4x4(__m128i *res) {
}
void fdct4_sse2(__m128i *in) {
- const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
@@ -333,7 +134,7 @@ void fadst4_sse2(__m128i *in) {
const __m128i k__sinpi_p04_m01 = pair_set_epi16(sinpi_4_9, -sinpi_1_9);
const __m128i k__sinpi_p03_p04 = pair_set_epi16(sinpi_3_9, sinpi_4_9);
const __m128i k__sinpi_m03_p02 = pair_set_epi16(-sinpi_3_9, sinpi_2_9);
- const __m128i k__sinpi_p03_p03 = _mm_set1_epi16(sinpi_3_9);
+ const __m128i k__sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi_3_9);
const __m128i kZero = _mm_set1_epi16(0);
const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
__m128i u[8], v[8];
@@ -376,7 +177,7 @@ void fadst4_sse2(__m128i *in) {
transpose_4x4(in);
}
-void vp9_fht4x4_sse2(const int16_t *input, int16_t *output,
+void vp9_fht4x4_sse2(const int16_t *input, tran_low_t *output,
int stride, int tx_type) {
__m128i in[4];
@@ -408,7 +209,7 @@ void vp9_fht4x4_sse2(const int16_t *input, int16_t *output,
}
}
-void vp9_fdct8x8_1_sse2(const int16_t *input, int16_t *output, int stride) {
+void vp9_fdct8x8_1_sse2(const int16_t *input, tran_low_t *output, int stride) {
__m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
__m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
__m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
@@ -445,16 +246,25 @@ void vp9_fdct8x8_1_sse2(const int16_t *input, int16_t *output, int stride) {
in0 = _mm_srli_si128(sum, 8);
in1 = _mm_add_epi32(sum, in0);
- _mm_store_si128((__m128i *)(output), in1);
+ store_output(&in1, output);
}
-void vp9_fdct8x8_sse2(const int16_t *input, int16_t *output, int stride) {
+void vp9_fdct8x8_quant_sse2(const int16_t *input, int stride,
+ int16_t* coeff_ptr, intptr_t n_coeffs,
+ int skip_block, const int16_t* zbin_ptr,
+ const int16_t* round_ptr, const int16_t* quant_ptr,
+ const int16_t* quant_shift_ptr, int16_t* qcoeff_ptr,
+ int16_t* dqcoeff_ptr, const int16_t* dequant_ptr,
+ uint16_t* eob_ptr,
+ const int16_t* scan_ptr,
+ const int16_t* iscan_ptr) {
+ __m128i zero;
int pass;
// Constants
// In one case all eight lanes share a single constant. In all others a
// pair of constants is repeated four times, which is done by
// constructing the 32-bit constant corresponding to that pair.
- const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
@@ -472,6 +282,14 @@ void vp9_fdct8x8_sse2(const int16_t *input, int16_t *output, int stride) {
__m128i in5 = _mm_load_si128((const __m128i *)(input + 5 * stride));
__m128i in6 = _mm_load_si128((const __m128i *)(input + 6 * stride));
__m128i in7 = _mm_load_si128((const __m128i *)(input + 7 * stride));
+ __m128i *in[8];
+ int index = 0;
+
+ (void)scan_ptr;
+ (void)zbin_ptr;
+ (void)quant_shift_ptr;
+ (void)coeff_ptr;
+
// Pre-condition input (shift by two)
in0 = _mm_slli_epi16(in0, 2);
in1 = _mm_slli_epi16(in1, 2);
@@ -482,6 +300,15 @@ void vp9_fdct8x8_sse2(const int16_t *input, int16_t *output, int stride) {
in6 = _mm_slli_epi16(in6, 2);
in7 = _mm_slli_epi16(in7, 2);
+ in[0] = &in0;
+ in[1] = &in1;
+ in[2] = &in2;
+ in[3] = &in3;
+ in[4] = &in4;
+ in[5] = &in5;
+ in[6] = &in6;
+ in[7] = &in7;
+
// We do two passes, first the columns, then the rows. The results of the
// first pass are transposed so that the same column code can be reused. The
// results of the second pass are also transposed so that the rows (processed
@@ -692,15 +519,175 @@ void vp9_fdct8x8_sse2(const int16_t *input, int16_t *output, int stride) {
in5 = _mm_srai_epi16(in5, 1);
in6 = _mm_srai_epi16(in6, 1);
in7 = _mm_srai_epi16(in7, 1);
- // store results
- _mm_store_si128((__m128i *)(output + 0 * 8), in0);
- _mm_store_si128((__m128i *)(output + 1 * 8), in1);
- _mm_store_si128((__m128i *)(output + 2 * 8), in2);
- _mm_store_si128((__m128i *)(output + 3 * 8), in3);
- _mm_store_si128((__m128i *)(output + 4 * 8), in4);
- _mm_store_si128((__m128i *)(output + 5 * 8), in5);
- _mm_store_si128((__m128i *)(output + 6 * 8), in6);
- _mm_store_si128((__m128i *)(output + 7 * 8), in7);
+ }
+
+ iscan_ptr += n_coeffs;
+ qcoeff_ptr += n_coeffs;
+ dqcoeff_ptr += n_coeffs;
+ n_coeffs = -n_coeffs;
+ zero = _mm_setzero_si128();
+
+ if (!skip_block) {
+ __m128i eob;
+ __m128i round, quant, dequant;
+ {
+ __m128i coeff0, coeff1;
+
+ // Setup global values
+ {
+ round = _mm_load_si128((const __m128i*)round_ptr);
+ quant = _mm_load_si128((const __m128i*)quant_ptr);
+ dequant = _mm_load_si128((const __m128i*)dequant_ptr);
+ }
+
+ {
+ __m128i coeff0_sign, coeff1_sign;
+ __m128i qcoeff0, qcoeff1;
+ __m128i qtmp0, qtmp1;
+ // Do DC and first 15 AC
+ coeff0 = *in[0];
+ coeff1 = *in[1];
+
+ // Poor man's sign extract
+ coeff0_sign = _mm_srai_epi16(coeff0, 15);
+ coeff1_sign = _mm_srai_epi16(coeff1, 15);
+ qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
+ qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
+ qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+ qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+ qcoeff0 = _mm_adds_epi16(qcoeff0, round);
+ round = _mm_unpackhi_epi64(round, round);
+ qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+ qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
+ quant = _mm_unpackhi_epi64(quant, quant);
+ qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
+
+ // Reinsert signs
+ qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
+ qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
+ qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+ qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+ _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0);
+ _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
+
+ coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);
+ coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+
+ _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0);
+ _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
+ }
+
+ {
+ // Scan for eob
+ __m128i zero_coeff0, zero_coeff1;
+ __m128i nzero_coeff0, nzero_coeff1;
+ __m128i iscan0, iscan1;
+ __m128i eob1;
+ zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
+ zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
+ nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
+ nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
+ iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs));
+ iscan1 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs) + 1);
+ // Add one to convert from indices to counts
+ iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
+ iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
+ eob = _mm_and_si128(iscan0, nzero_coeff0);
+ eob1 = _mm_and_si128(iscan1, nzero_coeff1);
+ eob = _mm_max_epi16(eob, eob1);
+ }
+ n_coeffs += 8 * 2;
+ }
+
+ // AC only loop
+ index = 2;
+ while (n_coeffs < 0) {
+ __m128i coeff0, coeff1;
+ {
+ __m128i coeff0_sign, coeff1_sign;
+ __m128i qcoeff0, qcoeff1;
+ __m128i qtmp0, qtmp1;
+
+ assert(index < (int)(sizeof(in) / sizeof(in[0])) - 1);
+ coeff0 = *in[index];
+ coeff1 = *in[index + 1];
+
+ // Poor man's sign extract
+ coeff0_sign = _mm_srai_epi16(coeff0, 15);
+ coeff1_sign = _mm_srai_epi16(coeff1, 15);
+ qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
+ qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
+ qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+ qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+ qcoeff0 = _mm_adds_epi16(qcoeff0, round);
+ qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+ qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
+ qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
+
+ // Reinsert signs
+ qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
+ qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
+ qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+ qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+ _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0);
+ _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
+
+ coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
+ coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+
+ _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0);
+ _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
+ }
+
+ {
+ // Scan for eob
+ __m128i zero_coeff0, zero_coeff1;
+ __m128i nzero_coeff0, nzero_coeff1;
+ __m128i iscan0, iscan1;
+ __m128i eob0, eob1;
+ zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
+ zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
+ nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
+ nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
+ iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs));
+ iscan1 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs) + 1);
+ // Add one to convert from indices to counts
+ iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
+ iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
+ eob0 = _mm_and_si128(iscan0, nzero_coeff0);
+ eob1 = _mm_and_si128(iscan1, nzero_coeff1);
+ eob0 = _mm_max_epi16(eob0, eob1);
+ eob = _mm_max_epi16(eob, eob0);
+ }
+ n_coeffs += 8 * 2;
+ index += 2;
+ }
+
+ // Accumulate EOB
+ {
+ __m128i eob_shuffled;
+ eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ *eob_ptr = _mm_extract_epi16(eob, 1);
+ }
+ } else {
+ do {
+ _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), zero);
+ _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, zero);
+ _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), zero);
+ _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, zero);
+ n_coeffs += 8 * 2;
+ } while (n_coeffs < 0);
+ *eob_ptr = 0;
}
}
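
The quantization path above leans twice on the "poor man's sign extract":
abs(x) is computed as (x ^ sign) - sign with sign = x >> 15, and the same
xor/subtract pair re-applies the sign after the multiply. A scalar model of
the absolute-value half (illustration only):

    #include <stdint.h>

    /* sign is -1 (all ones) when x is negative and 0 otherwise, so
     * (x ^ sign) - sign is ~x + 1 = -x for negative x and x unchanged
     * otherwise. Applying the identical xor/sub restores the sign. */
    static int16_t abs16(int16_t x) {
      const int16_t sign = (int16_t)(x >> 15);
      return (int16_t)((x ^ sign) - sign);
    }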
@@ -727,9 +714,7 @@ static INLINE void load_buffer_8x8(const int16_t *input, __m128i *in,
}
// right shift and rounding
-static INLINE void right_shift_8x8(__m128i *res, int const bit) {
- const __m128i kOne = _mm_set1_epi16(1);
- const int bit_m02 = bit - 2;
+static INLINE void right_shift_8x8(__m128i *res, const int bit) {
__m128i sign0 = _mm_srai_epi16(res[0], 15);
__m128i sign1 = _mm_srai_epi16(res[1], 15);
__m128i sign2 = _mm_srai_epi16(res[2], 15);
@@ -739,16 +724,16 @@ static INLINE void right_shift_8x8(__m128i *res, int const bit) {
__m128i sign6 = _mm_srai_epi16(res[6], 15);
__m128i sign7 = _mm_srai_epi16(res[7], 15);
- if (bit_m02 >= 0) {
- __m128i k_const_rounding = _mm_slli_epi16(kOne, bit_m02);
- res[0] = _mm_add_epi16(res[0], k_const_rounding);
- res[1] = _mm_add_epi16(res[1], k_const_rounding);
- res[2] = _mm_add_epi16(res[2], k_const_rounding);
- res[3] = _mm_add_epi16(res[3], k_const_rounding);
- res[4] = _mm_add_epi16(res[4], k_const_rounding);
- res[5] = _mm_add_epi16(res[5], k_const_rounding);
- res[6] = _mm_add_epi16(res[6], k_const_rounding);
- res[7] = _mm_add_epi16(res[7], k_const_rounding);
+ if (bit == 2) {
+ const __m128i const_rounding = _mm_set1_epi16(1);
+ res[0] = _mm_add_epi16(res[0], const_rounding);
+ res[1] = _mm_add_epi16(res[1], const_rounding);
+ res[2] = _mm_add_epi16(res[2], const_rounding);
+ res[3] = _mm_add_epi16(res[3], const_rounding);
+ res[4] = _mm_add_epi16(res[4], const_rounding);
+ res[5] = _mm_add_epi16(res[5], const_rounding);
+ res[6] = _mm_add_epi16(res[6], const_rounding);
+ res[7] = _mm_add_epi16(res[7], const_rounding);
}
res[0] = _mm_sub_epi16(res[0], sign0);
@@ -760,31 +745,95 @@ static INLINE void right_shift_8x8(__m128i *res, int const bit) {
res[6] = _mm_sub_epi16(res[6], sign6);
res[7] = _mm_sub_epi16(res[7], sign7);
- res[0] = _mm_srai_epi16(res[0], bit);
- res[1] = _mm_srai_epi16(res[1], bit);
- res[2] = _mm_srai_epi16(res[2], bit);
- res[3] = _mm_srai_epi16(res[3], bit);
- res[4] = _mm_srai_epi16(res[4], bit);
- res[5] = _mm_srai_epi16(res[5], bit);
- res[6] = _mm_srai_epi16(res[6], bit);
- res[7] = _mm_srai_epi16(res[7], bit);
+ if (bit == 1) {
+ res[0] = _mm_srai_epi16(res[0], 1);
+ res[1] = _mm_srai_epi16(res[1], 1);
+ res[2] = _mm_srai_epi16(res[2], 1);
+ res[3] = _mm_srai_epi16(res[3], 1);
+ res[4] = _mm_srai_epi16(res[4], 1);
+ res[5] = _mm_srai_epi16(res[5], 1);
+ res[6] = _mm_srai_epi16(res[6], 1);
+ res[7] = _mm_srai_epi16(res[7], 1);
+ } else {
+ res[0] = _mm_srai_epi16(res[0], 2);
+ res[1] = _mm_srai_epi16(res[1], 2);
+ res[2] = _mm_srai_epi16(res[2], 2);
+ res[3] = _mm_srai_epi16(res[3], 2);
+ res[4] = _mm_srai_epi16(res[4], 2);
+ res[5] = _mm_srai_epi16(res[5], 2);
+ res[6] = _mm_srai_epi16(res[6], 2);
+ res[7] = _mm_srai_epi16(res[7], 2);
+ }
}
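
The rewritten right_shift_8x8 drops the general shift-by-bit code in favor
of the only two cases its callers use (bit == 1 and bit == 2), which lets
the rounding constant become a plain +1 add. A scalar model of one lane,
following the vector step order (sign sampled before the bias is added):

    #include <stdint.h>

    /* One lane of right_shift_8x8: add the +1 rounding bias for bit == 2
     * only, subtract the previously sampled sign mask, then shift. */
    static int16_t right_shift_lane(int16_t n, int bit) {
      const int16_t sign = (int16_t)(n >> 15);
      if (bit == 2) n = (int16_t)(n + 1);
      return (int16_t)((n - sign) >> bit);
    }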
// write 8x8 array
-static INLINE void write_buffer_8x8(int16_t *output, __m128i *res, int stride) {
- _mm_store_si128((__m128i *)(output + 0 * stride), res[0]);
- _mm_store_si128((__m128i *)(output + 1 * stride), res[1]);
- _mm_store_si128((__m128i *)(output + 2 * stride), res[2]);
- _mm_store_si128((__m128i *)(output + 3 * stride), res[3]);
- _mm_store_si128((__m128i *)(output + 4 * stride), res[4]);
- _mm_store_si128((__m128i *)(output + 5 * stride), res[5]);
- _mm_store_si128((__m128i *)(output + 6 * stride), res[6]);
- _mm_store_si128((__m128i *)(output + 7 * stride), res[7]);
+static INLINE void write_buffer_8x8(tran_low_t *output, __m128i *res,
+ int stride) {
+ store_output(&res[0], (output + 0 * stride));
+ store_output(&res[1], (output + 1 * stride));
+ store_output(&res[2], (output + 2 * stride));
+ store_output(&res[3], (output + 3 * stride));
+ store_output(&res[4], (output + 4 * stride));
+ store_output(&res[5], (output + 5 * stride));
+ store_output(&res[6], (output + 6 * stride));
+ store_output(&res[7], (output + 7 * stride));
+}
+
+// perform in-place transpose
+static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) {
+ const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
+ const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
+ const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]);
+ const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]);
+ const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
+ const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
+ const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]);
+ const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]);
+ // 00 10 01 11 02 12 03 13
+ // 20 30 21 31 22 32 23 33
+ // 04 14 05 15 06 16 07 17
+ // 24 34 25 35 26 36 27 37
+ // 40 50 41 51 42 52 43 53
+ // 60 70 61 71 62 72 63 73
+ // 44 54 45 55 46 56 47 57
+ // 64 74 65 75 66 76 67 77
+ const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+ const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5);
+ const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+ const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5);
+ const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3);
+ const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
+ const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3);
+ const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
+ // 00 10 20 30 01 11 21 31
+ // 40 50 60 70 41 51 61 71
+ // 02 12 22 32 03 13 23 33
+ // 42 52 62 72 43 53 63 73
+ // 04 14 24 34 05 15 25 35
+ // 44 54 64 74 45 55 65 75
+ // 06 16 26 36 07 17 27 37
+ // 46 56 66 76 47 57 67 77
+ res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1);
+ res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1);
+ res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3);
+ res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3);
+ res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5);
+ res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5);
+ res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7);
+ res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7);
+ // 00 10 20 30 40 50 60 70
+ // 01 11 21 31 41 51 61 71
+ // 02 12 22 32 42 52 62 72
+ // 03 13 23 33 43 53 63 73
+ // 04 14 24 34 44 54 64 74
+ // 05 15 25 35 45 55 65 75
+ // 06 16 26 36 46 56 66 76
+ // 07 17 27 37 47 57 67 77
}
void fdct8_sse2(__m128i *in) {
// constants
- const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
@@ -936,7 +985,7 @@ void fadst8_sse2(__m128i *in) {
const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
const __m128i k__const_0 = _mm_set1_epi16(0);
const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
@@ -1152,7 +1201,7 @@ void fadst8_sse2(__m128i *in) {
array_transpose_8x8(in, in);
}
-void vp9_fht8x8_sse2(const int16_t *input, int16_t *output,
+void vp9_fht8x8_sse2(const int16_t *input, tran_low_t *output,
int stride, int tx_type) {
__m128i in[8];
@@ -1187,7 +1236,8 @@ void vp9_fht8x8_sse2(const int16_t *input, int16_t *output,
}
}
-void vp9_fdct16x16_1_sse2(const int16_t *input, int16_t *output, int stride) {
+void vp9_fdct16x16_1_sse2(const int16_t *input, tran_low_t *output,
+ int stride) {
__m128i in0, in1, in2, in3;
__m128i u0, u1;
__m128i sum = _mm_setzero_si128();
@@ -1252,632 +1302,7 @@ void vp9_fdct16x16_1_sse2(const int16_t *input, int16_t *output, int stride) {
in1 = _mm_add_epi32(sum, in0);
in1 = _mm_srai_epi32(in1, 1);
- _mm_store_si128((__m128i *)(output), in1);
-}
-
-void vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride) {
- // The 2D transform is done with two passes which are actually pretty
- // similar. In the first one, we transform the columns and transpose
- // the results. In the second one, we transform the rows. To achieve that,
- // as the first pass results are transposed, we transpose the columns (that
- // is the transposed rows) and transpose the results (so that it goes back
- // in normal/row positions).
- int pass;
- // We need an intermediate buffer between passes.
- DECLARE_ALIGNED_ARRAY(16, int16_t, intermediate, 256);
- const int16_t *in = input;
- int16_t *out = intermediate;
- // Constants
- // When we use them, in one case, they are all the same. In all others
- // it's a pair of them that we need to repeat four times. This is done
- // by constructing the 32 bit constant corresponding to that pair.
- const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
- const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
- const __m128i k__cospi_p08_m24 = pair_set_epi16(cospi_8_64, -cospi_24_64);
- const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
- const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
- const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
- const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
- const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
- const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64);
- const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64);
- const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
- const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
- const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64);
- const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64);
- const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
- const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
- const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
- const __m128i kOne = _mm_set1_epi16(1);
- // Do the two transform/transpose passes
- for (pass = 0; pass < 2; ++pass) {
- // We process eight columns (transposed rows in second pass) at a time.
- int column_start;
- for (column_start = 0; column_start < 16; column_start += 8) {
- __m128i in00, in01, in02, in03, in04, in05, in06, in07;
- __m128i in08, in09, in10, in11, in12, in13, in14, in15;
- __m128i input0, input1, input2, input3, input4, input5, input6, input7;
- __m128i step1_0, step1_1, step1_2, step1_3;
- __m128i step1_4, step1_5, step1_6, step1_7;
- __m128i step2_1, step2_2, step2_3, step2_4, step2_5, step2_6;
- __m128i step3_0, step3_1, step3_2, step3_3;
- __m128i step3_4, step3_5, step3_6, step3_7;
- __m128i res00, res01, res02, res03, res04, res05, res06, res07;
- __m128i res08, res09, res10, res11, res12, res13, res14, res15;
- // Load and pre-condition input.
- if (0 == pass) {
- in00 = _mm_load_si128((const __m128i *)(in + 0 * stride));
- in01 = _mm_load_si128((const __m128i *)(in + 1 * stride));
- in02 = _mm_load_si128((const __m128i *)(in + 2 * stride));
- in03 = _mm_load_si128((const __m128i *)(in + 3 * stride));
- in04 = _mm_load_si128((const __m128i *)(in + 4 * stride));
- in05 = _mm_load_si128((const __m128i *)(in + 5 * stride));
- in06 = _mm_load_si128((const __m128i *)(in + 6 * stride));
- in07 = _mm_load_si128((const __m128i *)(in + 7 * stride));
- in08 = _mm_load_si128((const __m128i *)(in + 8 * stride));
- in09 = _mm_load_si128((const __m128i *)(in + 9 * stride));
- in10 = _mm_load_si128((const __m128i *)(in + 10 * stride));
- in11 = _mm_load_si128((const __m128i *)(in + 11 * stride));
- in12 = _mm_load_si128((const __m128i *)(in + 12 * stride));
- in13 = _mm_load_si128((const __m128i *)(in + 13 * stride));
- in14 = _mm_load_si128((const __m128i *)(in + 14 * stride));
- in15 = _mm_load_si128((const __m128i *)(in + 15 * stride));
- // x = x << 2
- in00 = _mm_slli_epi16(in00, 2);
- in01 = _mm_slli_epi16(in01, 2);
- in02 = _mm_slli_epi16(in02, 2);
- in03 = _mm_slli_epi16(in03, 2);
- in04 = _mm_slli_epi16(in04, 2);
- in05 = _mm_slli_epi16(in05, 2);
- in06 = _mm_slli_epi16(in06, 2);
- in07 = _mm_slli_epi16(in07, 2);
- in08 = _mm_slli_epi16(in08, 2);
- in09 = _mm_slli_epi16(in09, 2);
- in10 = _mm_slli_epi16(in10, 2);
- in11 = _mm_slli_epi16(in11, 2);
- in12 = _mm_slli_epi16(in12, 2);
- in13 = _mm_slli_epi16(in13, 2);
- in14 = _mm_slli_epi16(in14, 2);
- in15 = _mm_slli_epi16(in15, 2);
- } else {
- in00 = _mm_load_si128((const __m128i *)(in + 0 * 16));
- in01 = _mm_load_si128((const __m128i *)(in + 1 * 16));
- in02 = _mm_load_si128((const __m128i *)(in + 2 * 16));
- in03 = _mm_load_si128((const __m128i *)(in + 3 * 16));
- in04 = _mm_load_si128((const __m128i *)(in + 4 * 16));
- in05 = _mm_load_si128((const __m128i *)(in + 5 * 16));
- in06 = _mm_load_si128((const __m128i *)(in + 6 * 16));
- in07 = _mm_load_si128((const __m128i *)(in + 7 * 16));
- in08 = _mm_load_si128((const __m128i *)(in + 8 * 16));
- in09 = _mm_load_si128((const __m128i *)(in + 9 * 16));
- in10 = _mm_load_si128((const __m128i *)(in + 10 * 16));
- in11 = _mm_load_si128((const __m128i *)(in + 11 * 16));
- in12 = _mm_load_si128((const __m128i *)(in + 12 * 16));
- in13 = _mm_load_si128((const __m128i *)(in + 13 * 16));
- in14 = _mm_load_si128((const __m128i *)(in + 14 * 16));
- in15 = _mm_load_si128((const __m128i *)(in + 15 * 16));
- // x = (x + 1) >> 2
- in00 = _mm_add_epi16(in00, kOne);
- in01 = _mm_add_epi16(in01, kOne);
- in02 = _mm_add_epi16(in02, kOne);
- in03 = _mm_add_epi16(in03, kOne);
- in04 = _mm_add_epi16(in04, kOne);
- in05 = _mm_add_epi16(in05, kOne);
- in06 = _mm_add_epi16(in06, kOne);
- in07 = _mm_add_epi16(in07, kOne);
- in08 = _mm_add_epi16(in08, kOne);
- in09 = _mm_add_epi16(in09, kOne);
- in10 = _mm_add_epi16(in10, kOne);
- in11 = _mm_add_epi16(in11, kOne);
- in12 = _mm_add_epi16(in12, kOne);
- in13 = _mm_add_epi16(in13, kOne);
- in14 = _mm_add_epi16(in14, kOne);
- in15 = _mm_add_epi16(in15, kOne);
- in00 = _mm_srai_epi16(in00, 2);
- in01 = _mm_srai_epi16(in01, 2);
- in02 = _mm_srai_epi16(in02, 2);
- in03 = _mm_srai_epi16(in03, 2);
- in04 = _mm_srai_epi16(in04, 2);
- in05 = _mm_srai_epi16(in05, 2);
- in06 = _mm_srai_epi16(in06, 2);
- in07 = _mm_srai_epi16(in07, 2);
- in08 = _mm_srai_epi16(in08, 2);
- in09 = _mm_srai_epi16(in09, 2);
- in10 = _mm_srai_epi16(in10, 2);
- in11 = _mm_srai_epi16(in11, 2);
- in12 = _mm_srai_epi16(in12, 2);
- in13 = _mm_srai_epi16(in13, 2);
- in14 = _mm_srai_epi16(in14, 2);
- in15 = _mm_srai_epi16(in15, 2);
- }
- in += 8;
- // Calculate input for the first 8 results.
- {
- input0 = _mm_add_epi16(in00, in15);
- input1 = _mm_add_epi16(in01, in14);
- input2 = _mm_add_epi16(in02, in13);
- input3 = _mm_add_epi16(in03, in12);
- input4 = _mm_add_epi16(in04, in11);
- input5 = _mm_add_epi16(in05, in10);
- input6 = _mm_add_epi16(in06, in09);
- input7 = _mm_add_epi16(in07, in08);
- }
- // Calculate input for the next 8 results.
- {
- step1_0 = _mm_sub_epi16(in07, in08);
- step1_1 = _mm_sub_epi16(in06, in09);
- step1_2 = _mm_sub_epi16(in05, in10);
- step1_3 = _mm_sub_epi16(in04, in11);
- step1_4 = _mm_sub_epi16(in03, in12);
- step1_5 = _mm_sub_epi16(in02, in13);
- step1_6 = _mm_sub_epi16(in01, in14);
- step1_7 = _mm_sub_epi16(in00, in15);
- }
- // Work on the first eight values; fdct8(input, even_results);
- {
- // Add/subtract
- const __m128i q0 = _mm_add_epi16(input0, input7);
- const __m128i q1 = _mm_add_epi16(input1, input6);
- const __m128i q2 = _mm_add_epi16(input2, input5);
- const __m128i q3 = _mm_add_epi16(input3, input4);
- const __m128i q4 = _mm_sub_epi16(input3, input4);
- const __m128i q5 = _mm_sub_epi16(input2, input5);
- const __m128i q6 = _mm_sub_epi16(input1, input6);
- const __m128i q7 = _mm_sub_epi16(input0, input7);
- // Work on first four results
- {
- // Add/subtract
- const __m128i r0 = _mm_add_epi16(q0, q3);
- const __m128i r1 = _mm_add_epi16(q1, q2);
- const __m128i r2 = _mm_sub_epi16(q1, q2);
- const __m128i r3 = _mm_sub_epi16(q0, q3);
- // Interleave to do the multiply by constants which gets us
- // into 32 bits.
- const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
- const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
- const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
- const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
- const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
- const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
- const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
- const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16);
- const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
- const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08);
- const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
- const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24);
- // dct_const_round_shift
- const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
- const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
- const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
- const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
- const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
- const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
- const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
- const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
- const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
- const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
- const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
- const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
- const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
- const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
- const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
- const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
- // Combine
- res00 = _mm_packs_epi32(w0, w1);
- res08 = _mm_packs_epi32(w2, w3);
- res04 = _mm_packs_epi32(w4, w5);
- res12 = _mm_packs_epi32(w6, w7);
- }
- // Work on next four results
- {
- // Interleave to do the multiply by constants which gets us
- // into 32 bits.
- const __m128i d0 = _mm_unpacklo_epi16(q6, q5);
- const __m128i d1 = _mm_unpackhi_epi16(q6, q5);
- const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16);
- const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16);
- const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16);
- const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16);
- // dct_const_round_shift
- const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING);
- const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING);
- const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING);
- const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING);
- const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS);
- const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS);
- const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS);
- const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS);
- // Combine
- const __m128i r0 = _mm_packs_epi32(s0, s1);
- const __m128i r1 = _mm_packs_epi32(s2, s3);
- // Add/subtract
- const __m128i x0 = _mm_add_epi16(q4, r0);
- const __m128i x1 = _mm_sub_epi16(q4, r0);
- const __m128i x2 = _mm_sub_epi16(q7, r1);
- const __m128i x3 = _mm_add_epi16(q7, r1);
- // Interleave to do the multiply by constants which gets us
- // into 32 bits.
- const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
- const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
- const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
- const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
- const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04);
- const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04);
- const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28);
- const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28);
- const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20);
- const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20);
- const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12);
- const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12);
- // dct_const_round_shift
- const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
- const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
- const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
- const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
- const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
- const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
- const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
- const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
- const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
- const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
- const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
- const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
- const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
- const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
- const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
- const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
- // Combine
- res02 = _mm_packs_epi32(w0, w1);
- res14 = _mm_packs_epi32(w2, w3);
- res10 = _mm_packs_epi32(w4, w5);
- res06 = _mm_packs_epi32(w6, w7);
- }
- }
- // Work on the next eight values; step1 -> odd_results
- {
- // step 2
- {
- const __m128i t0 = _mm_unpacklo_epi16(step1_5, step1_2);
- const __m128i t1 = _mm_unpackhi_epi16(step1_5, step1_2);
- const __m128i t2 = _mm_unpacklo_epi16(step1_4, step1_3);
- const __m128i t3 = _mm_unpackhi_epi16(step1_4, step1_3);
- const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_m16);
- const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_m16);
- const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p16_m16);
- const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p16_m16);
- // dct_const_round_shift
- const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
- const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
- const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
- const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
- const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
- const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
- const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
- const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
- // Combine
- step2_2 = _mm_packs_epi32(w0, w1);
- step2_3 = _mm_packs_epi32(w2, w3);
- }
- {
- const __m128i t0 = _mm_unpacklo_epi16(step1_5, step1_2);
- const __m128i t1 = _mm_unpackhi_epi16(step1_5, step1_2);
- const __m128i t2 = _mm_unpacklo_epi16(step1_4, step1_3);
- const __m128i t3 = _mm_unpackhi_epi16(step1_4, step1_3);
- const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
- const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
- const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p16_p16);
- const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p16_p16);
- // dct_const_round_shift
- const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
- const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
- const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
- const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
- const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
- const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
- const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
- const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
- // Combine
- step2_5 = _mm_packs_epi32(w0, w1);
- step2_4 = _mm_packs_epi32(w2, w3);
- }
- // step 3
- {
- step3_0 = _mm_add_epi16(step1_0, step2_3);
- step3_1 = _mm_add_epi16(step1_1, step2_2);
- step3_2 = _mm_sub_epi16(step1_1, step2_2);
- step3_3 = _mm_sub_epi16(step1_0, step2_3);
- step3_4 = _mm_sub_epi16(step1_7, step2_4);
- step3_5 = _mm_sub_epi16(step1_6, step2_5);
- step3_6 = _mm_add_epi16(step1_6, step2_5);
- step3_7 = _mm_add_epi16(step1_7, step2_4);
- }
- // step 4
- {
- const __m128i t0 = _mm_unpacklo_epi16(step3_1, step3_6);
- const __m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6);
- const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5);
- const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5);
- const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m08_p24);
- const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m08_p24);
- const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p24_p08);
- const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p24_p08);
- // dct_const_round_shift
- const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
- const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
- const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
- const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
- const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
- const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
- const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
- const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
- // Combine
- step2_1 = _mm_packs_epi32(w0, w1);
- step2_2 = _mm_packs_epi32(w2, w3);
- }
- {
- const __m128i t0 = _mm_unpacklo_epi16(step3_1, step3_6);
- const __m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6);
- const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5);
- const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5);
- const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p24_p08);
- const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p24_p08);
- const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p08_m24);
- const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p08_m24);
- // dct_const_round_shift
- const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
- const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
- const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
- const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
- const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
- const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
- const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
- const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
- // Combine
- step2_6 = _mm_packs_epi32(w0, w1);
- step2_5 = _mm_packs_epi32(w2, w3);
- }
- // step 5
- {
- step1_0 = _mm_add_epi16(step3_0, step2_1);
- step1_1 = _mm_sub_epi16(step3_0, step2_1);
- step1_2 = _mm_add_epi16(step3_3, step2_2);
- step1_3 = _mm_sub_epi16(step3_3, step2_2);
- step1_4 = _mm_sub_epi16(step3_4, step2_5);
- step1_5 = _mm_add_epi16(step3_4, step2_5);
- step1_6 = _mm_sub_epi16(step3_7, step2_6);
- step1_7 = _mm_add_epi16(step3_7, step2_6);
- }
- // step 6
- {
- const __m128i t0 = _mm_unpacklo_epi16(step1_0, step1_7);
- const __m128i t1 = _mm_unpackhi_epi16(step1_0, step1_7);
- const __m128i t2 = _mm_unpacklo_epi16(step1_1, step1_6);
- const __m128i t3 = _mm_unpackhi_epi16(step1_1, step1_6);
- const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p30_p02);
- const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p30_p02);
- const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p14_p18);
- const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p14_p18);
- // dct_const_round_shift
- const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
- const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
- const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
- const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
- const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
- const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
- const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
- const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
- // Combine
- res01 = _mm_packs_epi32(w0, w1);
- res09 = _mm_packs_epi32(w2, w3);
- }
- {
- const __m128i t0 = _mm_unpacklo_epi16(step1_2, step1_5);
- const __m128i t1 = _mm_unpackhi_epi16(step1_2, step1_5);
- const __m128i t2 = _mm_unpacklo_epi16(step1_3, step1_4);
- const __m128i t3 = _mm_unpackhi_epi16(step1_3, step1_4);
- const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p22_p10);
- const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p22_p10);
- const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p06_p26);
- const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p06_p26);
- // dct_const_round_shift
- const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
- const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
- const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
- const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
- const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
- const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
- const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
- const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
- // Combine
- res05 = _mm_packs_epi32(w0, w1);
- res13 = _mm_packs_epi32(w2, w3);
- }
- {
- const __m128i t0 = _mm_unpacklo_epi16(step1_2, step1_5);
- const __m128i t1 = _mm_unpackhi_epi16(step1_2, step1_5);
- const __m128i t2 = _mm_unpacklo_epi16(step1_3, step1_4);
- const __m128i t3 = _mm_unpackhi_epi16(step1_3, step1_4);
- const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m10_p22);
- const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m10_p22);
- const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m26_p06);
- const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m26_p06);
- // dct_const_round_shift
- const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
- const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
- const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
- const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
- const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
- const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
- const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
- const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
- // Combine
- res11 = _mm_packs_epi32(w0, w1);
- res03 = _mm_packs_epi32(w2, w3);
- }
- {
- const __m128i t0 = _mm_unpacklo_epi16(step1_0, step1_7);
- const __m128i t1 = _mm_unpackhi_epi16(step1_0, step1_7);
- const __m128i t2 = _mm_unpacklo_epi16(step1_1, step1_6);
- const __m128i t3 = _mm_unpackhi_epi16(step1_1, step1_6);
- const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m02_p30);
- const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m02_p30);
- const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m18_p14);
- const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m18_p14);
- // dct_const_round_shift
- const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
- const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
- const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
- const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
- const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
- const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
- const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
- const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
- // Combine
- res15 = _mm_packs_epi32(w0, w1);
- res07 = _mm_packs_epi32(w2, w3);
- }
- }
- // Transpose the results, do it as two 8x8 transposes.
- {
- // 00 01 02 03 04 05 06 07
- // 10 11 12 13 14 15 16 17
- // 20 21 22 23 24 25 26 27
- // 30 31 32 33 34 35 36 37
- // 40 41 42 43 44 45 46 47
- // 50 51 52 53 54 55 56 57
- // 60 61 62 63 64 65 66 67
- // 70 71 72 73 74 75 76 77
- const __m128i tr0_0 = _mm_unpacklo_epi16(res00, res01);
- const __m128i tr0_1 = _mm_unpacklo_epi16(res02, res03);
- const __m128i tr0_2 = _mm_unpackhi_epi16(res00, res01);
- const __m128i tr0_3 = _mm_unpackhi_epi16(res02, res03);
- const __m128i tr0_4 = _mm_unpacklo_epi16(res04, res05);
- const __m128i tr0_5 = _mm_unpacklo_epi16(res06, res07);
- const __m128i tr0_6 = _mm_unpackhi_epi16(res04, res05);
- const __m128i tr0_7 = _mm_unpackhi_epi16(res06, res07);
- // 00 10 01 11 02 12 03 13
- // 20 30 21 31 22 32 23 33
- // 04 14 05 15 06 16 07 17
- // 24 34 25 35 26 36 27 37
- // 40 50 41 51 42 52 43 53
- // 60 70 61 71 62 72 63 73
- // 54 54 55 55 56 56 57 57
- // 64 74 65 75 66 76 67 77
- const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
- const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
- const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
- const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
- const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
- const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
- const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
- const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
- // 00 10 20 30 01 11 21 31
- // 40 50 60 70 41 51 61 71
- // 02 12 22 32 03 13 23 33
- // 42 52 62 72 43 53 63 73
- // 04 14 24 34 05 15 21 36
- // 44 54 64 74 45 55 61 76
- // 06 16 26 36 07 17 27 37
- // 46 56 66 76 47 57 67 77
- const __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
- const __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
- const __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
- const __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
- const __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
- const __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
- const __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
- const __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
- // 00 10 20 30 40 50 60 70
- // 01 11 21 31 41 51 61 71
- // 02 12 22 32 42 52 62 72
- // 03 13 23 33 43 53 63 73
- // 04 14 24 34 44 54 64 74
- // 05 15 25 35 45 55 65 75
- // 06 16 26 36 46 56 66 76
- // 07 17 27 37 47 57 67 77
- _mm_storeu_si128((__m128i *)(out + 0 * 16), tr2_0);
- _mm_storeu_si128((__m128i *)(out + 1 * 16), tr2_1);
- _mm_storeu_si128((__m128i *)(out + 2 * 16), tr2_2);
- _mm_storeu_si128((__m128i *)(out + 3 * 16), tr2_3);
- _mm_storeu_si128((__m128i *)(out + 4 * 16), tr2_4);
- _mm_storeu_si128((__m128i *)(out + 5 * 16), tr2_5);
- _mm_storeu_si128((__m128i *)(out + 6 * 16), tr2_6);
- _mm_storeu_si128((__m128i *)(out + 7 * 16), tr2_7);
- }
- {
- // 00 01 02 03 04 05 06 07
- // 10 11 12 13 14 15 16 17
- // 20 21 22 23 24 25 26 27
- // 30 31 32 33 34 35 36 37
- // 40 41 42 43 44 45 46 47
- // 50 51 52 53 54 55 56 57
- // 60 61 62 63 64 65 66 67
- // 70 71 72 73 74 75 76 77
- const __m128i tr0_0 = _mm_unpacklo_epi16(res08, res09);
- const __m128i tr0_1 = _mm_unpacklo_epi16(res10, res11);
- const __m128i tr0_2 = _mm_unpackhi_epi16(res08, res09);
- const __m128i tr0_3 = _mm_unpackhi_epi16(res10, res11);
- const __m128i tr0_4 = _mm_unpacklo_epi16(res12, res13);
- const __m128i tr0_5 = _mm_unpacklo_epi16(res14, res15);
- const __m128i tr0_6 = _mm_unpackhi_epi16(res12, res13);
- const __m128i tr0_7 = _mm_unpackhi_epi16(res14, res15);
- // 00 10 01 11 02 12 03 13
- // 20 30 21 31 22 32 23 33
- // 04 14 05 15 06 16 07 17
- // 24 34 25 35 26 36 27 37
- // 40 50 41 51 42 52 43 53
- // 60 70 61 71 62 72 63 73
- // 54 54 55 55 56 56 57 57
- // 64 74 65 75 66 76 67 77
- const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
- const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
- const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
- const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
- const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
- const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
- const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
- const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
- // 00 10 20 30 01 11 21 31
- // 40 50 60 70 41 51 61 71
- // 02 12 22 32 03 13 23 33
- // 42 52 62 72 43 53 63 73
- // 04 14 24 34 05 15 21 36
- // 44 54 64 74 45 55 61 76
- // 06 16 26 36 07 17 27 37
- // 46 56 66 76 47 57 67 77
- const __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
- const __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
- const __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
- const __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
- const __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
- const __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
- const __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
- const __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
- // 00 10 20 30 40 50 60 70
- // 01 11 21 31 41 51 61 71
- // 02 12 22 32 42 52 62 72
- // 03 13 23 33 43 53 63 73
- // 04 14 24 34 44 54 64 74
- // 05 15 25 35 45 55 65 75
- // 06 16 26 36 46 56 66 76
- // 07 17 27 37 47 57 67 77
- // Store results
- _mm_store_si128((__m128i *)(out + 8 + 0 * 16), tr2_0);
- _mm_store_si128((__m128i *)(out + 8 + 1 * 16), tr2_1);
- _mm_store_si128((__m128i *)(out + 8 + 2 * 16), tr2_2);
- _mm_store_si128((__m128i *)(out + 8 + 3 * 16), tr2_3);
- _mm_store_si128((__m128i *)(out + 8 + 4 * 16), tr2_4);
- _mm_store_si128((__m128i *)(out + 8 + 5 * 16), tr2_5);
- _mm_store_si128((__m128i *)(out + 8 + 6 * 16), tr2_6);
- _mm_store_si128((__m128i *)(out + 8 + 7 * 16), tr2_7);
- }
- out += 8*16;
- }
- // Setup in/out for next pass.
- in = intermediate;
- out = output;
- }
+ store_output(&in1, output);
}
static INLINE void load_buffer_16x16(const int16_t* input, __m128i *in0,
@@ -1892,7 +1317,7 @@ static INLINE void load_buffer_16x16(const int16_t* input, __m128i *in0,
load_buffer_8x8(input + 8 * stride, in1 + 8, stride);
}
-static INLINE void write_buffer_16x16(int16_t *output, __m128i *in0,
+static INLINE void write_buffer_16x16(tran_low_t *output, __m128i *in0,
__m128i *in1, int stride) {
// write first 8 columns
write_buffer_8x8(output, in0, stride);
@@ -1903,6 +1328,23 @@ static INLINE void write_buffer_16x16(int16_t *output, __m128i *in0,
write_buffer_8x8(output + 8 * stride, in1 + 8, stride);
}
+static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) {
+ __m128i tbuf[8];
+ array_transpose_8x8(res0, res0);
+ array_transpose_8x8(res1, tbuf);
+ array_transpose_8x8(res0 + 8, res1);
+ array_transpose_8x8(res1 + 8, res1 + 8);
+
+ res0[8] = tbuf[0];
+ res0[9] = tbuf[1];
+ res0[10] = tbuf[2];
+ res0[11] = tbuf[3];
+ res0[12] = tbuf[4];
+ res0[13] = tbuf[5];
+ res0[14] = tbuf[6];
+ res0[15] = tbuf[7];
+}
+
static INLINE void right_shift_16x16(__m128i *res0, __m128i *res1) {
// perform rounding operations
right_shift_8x8(res0, 2);
@@ -1914,7 +1356,7 @@ static INLINE void right_shift_16x16(__m128i *res0, __m128i *res1) {
void fdct16_8col(__m128i *in) {
// perform 16x16 1-D DCT for 8 columns
__m128i i[8], s[8], p[8], t[8], u[16], v[16];
- const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
@@ -2261,8 +1703,8 @@ void fadst16_8col(__m128i *in) {
const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
- const __m128i k__cospi_m16_m16 = _mm_set1_epi16(-cospi_16_64);
- const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+ const __m128i k__cospi_m16_m16 = _mm_set1_epi16((int16_t)-cospi_16_64);
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
@@ -2715,7 +2157,7 @@ void fadst16_sse2(__m128i *in0, __m128i *in1) {
array_transpose_16x16(in0, in1);
}
-void vp9_fht16x16_sse2(const int16_t *input, int16_t *output,
+void vp9_fht16x16_sse2(const int16_t *input, tran_low_t *output,
int stride, int tx_type) {
__m128i in0[16], in1[16];
@@ -2750,7 +2192,8 @@ void vp9_fht16x16_sse2(const int16_t *input, int16_t *output,
}
}
-void vp9_fdct32x32_1_sse2(const int16_t *input, int16_t *output, int stride) {
+void vp9_fdct32x32_1_sse2(const int16_t *input, tran_low_t *output,
+ int stride) {
__m128i in0, in1, in2, in3;
__m128i u0, u1;
__m128i sum = _mm_setzero_si128();
@@ -2818,17 +2261,167 @@ void vp9_fdct32x32_1_sse2(const int16_t *input, int16_t *output, int stride) {
in1 = _mm_add_epi32(sum, in0);
in1 = _mm_srai_epi32(in1, 3);
- _mm_store_si128((__m128i *)(output), in1);
+ store_output(&in1, output);
}
+#if CONFIG_VP9_HIGHBITDEPTH
+/* These SSE2 versions of the FHT functions only actually use SSE2 in the
+ * DCT_DCT case; in all other cases they fall back to C code identical to
+ * that used by the plain C versions of these functions.
+ */
+
+void vp9_highbd_fht4x4_sse2(const int16_t *input, tran_low_t *output,
+ int stride, int tx_type) {
+ if (tx_type == DCT_DCT) {
+ vp9_highbd_fdct4x4_sse2(input, output, stride);
+ } else {
+ tran_low_t out[4 * 4];
+ tran_low_t *outptr = &out[0];
+ int i, j;
+ tran_low_t temp_in[4], temp_out[4];
+ const transform_2d ht = FHT_4[tx_type];
+
+ // Columns
+ for (i = 0; i < 4; ++i) {
+ for (j = 0; j < 4; ++j)
+ temp_in[j] = input[j * stride + i] * 16;
+ if (i == 0 && temp_in[0])
+ temp_in[0] += 1;
+ ht.cols(temp_in, temp_out);
+ for (j = 0; j < 4; ++j)
+ outptr[j * 4 + i] = temp_out[j];
+ }
+
+ // Rows
+ for (i = 0; i < 4; ++i) {
+ for (j = 0; j < 4; ++j)
+ temp_in[j] = out[j + i * 4];
+ ht.rows(temp_in, temp_out);
+ for (j = 0; j < 4; ++j)
+ output[j + i * 4] = (temp_out[j] + 1) >> 2;
+ }
+ }
+}
+
+void vp9_highbd_fht8x8_sse2(const int16_t *input, tran_low_t *output,
+ int stride, int tx_type) {
+ if (tx_type == DCT_DCT) {
+ vp9_highbd_fdct8x8_sse2(input, output, stride);
+ } else {
+ tran_low_t out[64];
+ tran_low_t *outptr = &out[0];
+ int i, j;
+ tran_low_t temp_in[8], temp_out[8];
+ const transform_2d ht = FHT_8[tx_type];
+
+ // Columns
+ for (i = 0; i < 8; ++i) {
+ for (j = 0; j < 8; ++j)
+ temp_in[j] = input[j * stride + i] * 4;
+ ht.cols(temp_in, temp_out);
+ for (j = 0; j < 8; ++j)
+ outptr[j * 8 + i] = temp_out[j];
+ }
+
+ // Rows
+ for (i = 0; i < 8; ++i) {
+ for (j = 0; j < 8; ++j)
+ temp_in[j] = out[j + i * 8];
+ ht.rows(temp_in, temp_out);
+ for (j = 0; j < 8; ++j)
+ output[j + i * 8] = (temp_out[j] + (temp_out[j] < 0)) >> 1;
+ }
+ }
+}
+
+void vp9_highbd_fht16x16_sse2(int16_t *input, tran_low_t *output,
+ int stride, int tx_type) {
+ if (tx_type == DCT_DCT) {
+ vp9_highbd_fdct16x16_sse2(input, output, stride);
+ } else {
+ tran_low_t out[256];
+ tran_low_t *outptr = &out[0];
+ int i, j;
+ tran_low_t temp_in[16], temp_out[16];
+ const transform_2d ht = FHT_16[tx_type];
+
+ // Columns
+ for (i = 0; i < 16; ++i) {
+ for (j = 0; j < 16; ++j)
+ temp_in[j] = input[j * stride + i] * 4;
+ ht.cols(temp_in, temp_out);
+ for (j = 0; j < 16; ++j)
+ outptr[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
+ }
+
+ // Rows
+ for (i = 0; i < 16; ++i) {
+ for (j = 0; j < 16; ++j)
+ temp_in[j] = out[j + i * 16];
+ ht.rows(temp_in, temp_out);
+ for (j = 0; j < 16; ++j)
+ output[j + i * 16] = temp_out[j];
+ }
+ }
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+/*
+ * The DCTnxn functions are defined using the macros below. The main code
+ * for them is in the separate files vp9/encoder/x86/vp9_dct_impl_sse2.c and
+ * vp9/encoder/x86/vp9_dct32x32_sse2.c, which are included by both the 8-bit
+ * and the high-bit-depth builds.
+ */
+
+#define DCT_HIGH_BIT_DEPTH 0
+
+#define FDCT4x4_2D vp9_fdct4x4_sse2
+#define FDCT8x8_2D vp9_fdct8x8_sse2
+#define FDCT16x16_2D vp9_fdct16x16_sse2
+#include "vp9/encoder/x86/vp9_dct_impl_sse2.c"
+#undef FDCT4x4_2D
+#undef FDCT8x8_2D
+#undef FDCT16x16_2D
+
#define FDCT32x32_2D vp9_fdct32x32_rd_sse2
#define FDCT32x32_HIGH_PRECISION 0
#include "vp9/encoder/x86/vp9_dct32x32_sse2.c"
-#undef FDCT32x32_HIGH_PRECISION
#undef FDCT32x32_2D
+#undef FDCT32x32_HIGH_PRECISION
#define FDCT32x32_2D vp9_fdct32x32_sse2
#define FDCT32x32_HIGH_PRECISION 1
#include "vp9/encoder/x86/vp9_dct32x32_sse2.c" // NOLINT
+#undef FDCT32x32_2D
#undef FDCT32x32_HIGH_PRECISION
+
+#undef DCT_HIGH_BIT_DEPTH
+
+
+#if CONFIG_VP9_HIGHBITDEPTH
+
+#define DCT_HIGH_BIT_DEPTH 1
+
+#define FDCT4x4_2D vp9_highbd_fdct4x4_sse2
+#define FDCT8x8_2D vp9_highbd_fdct8x8_sse2
+#define FDCT16x16_2D vp9_highbd_fdct16x16_sse2
+#include "vp9/encoder/x86/vp9_dct_impl_sse2.c" // NOLINT
+#undef FDCT4x4_2D
+#undef FDCT8x8_2D
+#undef FDCT16x16_2D
+
+#define FDCT32x32_2D vp9_highbd_fdct32x32_rd_sse2
+#define FDCT32x32_HIGH_PRECISION 0
+#include "vp9/encoder/x86/vp9_dct32x32_sse2.c" // NOLINT
#undef FDCT32x32_2D
+#undef FDCT32x32_HIGH_PRECISION
+
+#define FDCT32x32_2D vp9_highbd_fdct32x32_sse2
+#define FDCT32x32_HIGH_PRECISION 1
+#include "vp9/encoder/x86/vp9_dct32x32_sse2.c" // NOLINT
+#undef FDCT32x32_2D
+#undef FDCT32x32_HIGH_PRECISION
+
+#undef DCT_HIGH_BIT_DEPTH
+
+#endif // CONFIG_VP9_HIGHBITDEPTH
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct_sse2.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct_sse2.h
new file mode 100644
index 00000000000..b99db923ef0
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct_sse2.h
@@ -0,0 +1,464 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_ENCODER_X86_VP9_DCT_SSE2_H_
+#define VP9_ENCODER_X86_VP9_DCT_SSE2_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
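+// Builds the 32-bit lane pattern {a, b, a, b}, lowest lane first
+// (_mm_set_epi32 takes its arguments from the highest lane down).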
+#define pair_set_epi32(a, b) \
+ _mm_set_epi32((int)(b), (int)(a), (int)(b), (int)(a))
+
+void vp9_fdct4x4_sse2(const int16_t *input, tran_low_t *output, int stride);
+void vp9_fdct8x8_sse2(const int16_t *input, tran_low_t *output, int stride);
+void vp9_fdct16x16_sse2(const int16_t *input, tran_low_t *output, int stride);
+void vp9_highbd_fdct4x4_sse2(const int16_t *input, tran_low_t *output,
+ int stride);
+void vp9_highbd_fdct8x8_sse2(const int16_t *input, tran_low_t *output,
+ int stride);
+void vp9_highbd_fdct16x16_sse2(const int16_t *input, tran_low_t *output,
+ int stride);
+
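+// 32-bit multiply-add: returns the two 64-bit lanes
+// { a0 * b0 + a1 * b1, a2 * b2 + a3 * b3 }, treating each 32-bit input lane
+// as unsigned (_mm_mul_epu32 only reads lanes 0 and 2, so the odd lanes are
+// shifted down before the second multiply).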
+static INLINE __m128i k_madd_epi32(__m128i a, __m128i b) {
+ __m128i buf0, buf1;
+ buf0 = _mm_mul_epu32(a, b);
+ a = _mm_srli_epi64(a, 32);
+ b = _mm_srli_epi64(b, 32);
+ buf1 = _mm_mul_epu32(a, b);
+ return _mm_add_epi64(buf0, buf1);
+}
+
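+// Narrows two registers of 64-bit lanes into one register of 32-bit lanes by
+// keeping the low dword of every 64-bit lane (no saturation).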
+static INLINE __m128i k_packs_epi64(__m128i a, __m128i b) {
+ __m128i buf0 = _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 2, 0));
+ __m128i buf1 = _mm_shuffle_epi32(b, _MM_SHUFFLE(0, 0, 2, 0));
+ return _mm_unpacklo_epi64(buf0, buf1);
+}
+
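+// The check_epi16_overflow_* helpers detect possible overflow after 16-bit
+// saturating packs: a lane equal to INT16_MAX (0x7fff) or INT16_MIN (0x8000)
+// may have been clipped, so a nonzero return marks the result as suspect.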
+static INLINE int check_epi16_overflow_x2(const __m128i *preg0,
+ const __m128i *preg1) {
+ const __m128i max_overflow = _mm_set1_epi16(0x7fff);
+ const __m128i min_overflow = _mm_set1_epi16(0x8000);
+ __m128i cmp0 = _mm_or_si128(_mm_cmpeq_epi16(*preg0, max_overflow),
+ _mm_cmpeq_epi16(*preg0, min_overflow));
+ __m128i cmp1 = _mm_or_si128(_mm_cmpeq_epi16(*preg1, max_overflow),
+ _mm_cmpeq_epi16(*preg1, min_overflow));
+ cmp0 = _mm_or_si128(cmp0, cmp1);
+ return _mm_movemask_epi8(cmp0);
+}
+
+static INLINE int check_epi16_overflow_x4(const __m128i *preg0,
+ const __m128i *preg1,
+ const __m128i *preg2,
+ const __m128i *preg3) {
+ const __m128i max_overflow = _mm_set1_epi16(0x7fff);
+ const __m128i min_overflow = _mm_set1_epi16(0x8000);
+ __m128i cmp0 = _mm_or_si128(_mm_cmpeq_epi16(*preg0, max_overflow),
+ _mm_cmpeq_epi16(*preg0, min_overflow));
+ __m128i cmp1 = _mm_or_si128(_mm_cmpeq_epi16(*preg1, max_overflow),
+ _mm_cmpeq_epi16(*preg1, min_overflow));
+ __m128i cmp2 = _mm_or_si128(_mm_cmpeq_epi16(*preg2, max_overflow),
+ _mm_cmpeq_epi16(*preg2, min_overflow));
+ __m128i cmp3 = _mm_or_si128(_mm_cmpeq_epi16(*preg3, max_overflow),
+ _mm_cmpeq_epi16(*preg3, min_overflow));
+ cmp0 = _mm_or_si128(_mm_or_si128(cmp0, cmp1), _mm_or_si128(cmp2, cmp3));
+ return _mm_movemask_epi8(cmp0);
+}
+
+static INLINE int check_epi16_overflow_x8(const __m128i *preg0,
+ const __m128i *preg1,
+ const __m128i *preg2,
+ const __m128i *preg3,
+ const __m128i *preg4,
+ const __m128i *preg5,
+ const __m128i *preg6,
+ const __m128i *preg7) {
+ int res0, res1;
+ res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
+ res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);
+ return res0 + res1;
+}
+
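+// In the wider checks below, later groups are only tested while the running
+// result is still zero: once overflow has been seen the combined return
+// value is already nonzero, so the remaining work can be skipped.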
+static INLINE int check_epi16_overflow_x12(const __m128i *preg0,
+ const __m128i *preg1,
+ const __m128i *preg2,
+ const __m128i *preg3,
+ const __m128i *preg4,
+ const __m128i *preg5,
+ const __m128i *preg6,
+ const __m128i *preg7,
+ const __m128i *preg8,
+ const __m128i *preg9,
+ const __m128i *preg10,
+ const __m128i *preg11) {
+ int res0, res1;
+ res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
+ res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);
+ if (!res0)
+ res0 = check_epi16_overflow_x4(preg8, preg9, preg10, preg11);
+ return res0 + res1;
+}
+
+static INLINE int check_epi16_overflow_x16(const __m128i *preg0,
+ const __m128i *preg1,
+ const __m128i *preg2,
+ const __m128i *preg3,
+ const __m128i *preg4,
+ const __m128i *preg5,
+ const __m128i *preg6,
+ const __m128i *preg7,
+ const __m128i *preg8,
+ const __m128i *preg9,
+ const __m128i *preg10,
+ const __m128i *preg11,
+ const __m128i *preg12,
+ const __m128i *preg13,
+ const __m128i *preg14,
+ const __m128i *preg15) {
+ int res0, res1;
+ res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
+ res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);
+ if (!res0) {
+ res0 = check_epi16_overflow_x4(preg8, preg9, preg10, preg11);
+ if (!res1)
+ res1 = check_epi16_overflow_x4(preg12, preg13, preg14, preg15);
+ }
+ return res0 + res1;
+}
+
+static INLINE int check_epi16_overflow_x32(const __m128i *preg0,
+ const __m128i *preg1,
+ const __m128i *preg2,
+ const __m128i *preg3,
+ const __m128i *preg4,
+ const __m128i *preg5,
+ const __m128i *preg6,
+ const __m128i *preg7,
+ const __m128i *preg8,
+ const __m128i *preg9,
+ const __m128i *preg10,
+ const __m128i *preg11,
+ const __m128i *preg12,
+ const __m128i *preg13,
+ const __m128i *preg14,
+ const __m128i *preg15,
+ const __m128i *preg16,
+ const __m128i *preg17,
+ const __m128i *preg18,
+ const __m128i *preg19,
+ const __m128i *preg20,
+ const __m128i *preg21,
+ const __m128i *preg22,
+ const __m128i *preg23,
+ const __m128i *preg24,
+ const __m128i *preg25,
+ const __m128i *preg26,
+ const __m128i *preg27,
+ const __m128i *preg28,
+ const __m128i *preg29,
+ const __m128i *preg30,
+ const __m128i *preg31) {
+ int res0, res1;
+ res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
+ res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);
+ if (!res0) {
+ res0 = check_epi16_overflow_x4(preg8, preg9, preg10, preg11);
+ if (!res1) {
+ res1 = check_epi16_overflow_x4(preg12, preg13, preg14, preg15);
+ if (!res0) {
+ res0 = check_epi16_overflow_x4(preg16, preg17, preg18, preg19);
+ if (!res1) {
+ res1 = check_epi16_overflow_x4(preg20, preg21, preg22, preg23);
+ if (!res0) {
+ res0 = check_epi16_overflow_x4(preg24, preg25, preg26, preg27);
+ if (!res1)
+ res1 = check_epi16_overflow_x4(preg28, preg29, preg30, preg31);
+ }
+ }
+ }
+ }
+ }
+ return res0 + res1;
+}
+
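+// The k_check_epi32_overflow_* helpers verify that each 64-bit lane of the
+// inputs still fits in a signed 32-bit value: after shifting left by one,
+// the top dword of every lane must be all zeros (non-negative) or all ones
+// (negative); anything else is reported as overflow.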
+static INLINE int k_check_epi32_overflow_4(const __m128i *preg0,
+ const __m128i *preg1,
+ const __m128i *preg2,
+ const __m128i *preg3,
+ const __m128i *zero) {
+ __m128i minus_one = _mm_set1_epi32(-1);
+ // Check for overflows
+ __m128i reg0_shifted = _mm_slli_epi64(*preg0, 1);
+ __m128i reg1_shifted = _mm_slli_epi64(*preg1, 1);
+ __m128i reg2_shifted = _mm_slli_epi64(*preg2, 1);
+ __m128i reg3_shifted = _mm_slli_epi64(*preg3, 1);
+ __m128i reg0_top_dwords = _mm_shuffle_epi32(
+ reg0_shifted, _MM_SHUFFLE(0, 0, 3, 1));
+ __m128i reg1_top_dwords = _mm_shuffle_epi32(
+ reg1_shifted, _MM_SHUFFLE(0, 0, 3, 1));
+ __m128i reg2_top_dwords = _mm_shuffle_epi32(
+ reg2_shifted, _MM_SHUFFLE(0, 0, 3, 1));
+ __m128i reg3_top_dwords = _mm_shuffle_epi32(
+ reg3_shifted, _MM_SHUFFLE(0, 0, 3, 1));
+ __m128i top_dwords_01 = _mm_unpacklo_epi64(reg0_top_dwords, reg1_top_dwords);
+ __m128i top_dwords_23 = _mm_unpacklo_epi64(reg2_top_dwords, reg3_top_dwords);
+  __m128i valid_positive_01 = _mm_cmpeq_epi32(top_dwords_01, *zero);
+  __m128i valid_positive_23 = _mm_cmpeq_epi32(top_dwords_23, *zero);
+  __m128i valid_negative_01 = _mm_cmpeq_epi32(top_dwords_01, minus_one);
+  __m128i valid_negative_23 = _mm_cmpeq_epi32(top_dwords_23, minus_one);
+  int overflow_01 = _mm_movemask_epi8(
+      _mm_cmpeq_epi32(valid_positive_01, valid_negative_01));
+  int overflow_23 = _mm_movemask_epi8(
+      _mm_cmpeq_epi32(valid_positive_23, valid_negative_23));
+ return (overflow_01 + overflow_23);
+}
+
+static INLINE int k_check_epi32_overflow_8(const __m128i *preg0,
+ const __m128i *preg1,
+ const __m128i *preg2,
+ const __m128i *preg3,
+ const __m128i *preg4,
+ const __m128i *preg5,
+ const __m128i *preg6,
+ const __m128i *preg7,
+ const __m128i *zero) {
+ int overflow = k_check_epi32_overflow_4(preg0, preg1, preg2, preg3, zero);
+ if (!overflow) {
+ overflow = k_check_epi32_overflow_4(preg4, preg5, preg6, preg7, zero);
+ }
+ return overflow;
+}
+
+static INLINE int k_check_epi32_overflow_16(const __m128i *preg0,
+ const __m128i *preg1,
+ const __m128i *preg2,
+ const __m128i *preg3,
+ const __m128i *preg4,
+ const __m128i *preg5,
+ const __m128i *preg6,
+ const __m128i *preg7,
+ const __m128i *preg8,
+ const __m128i *preg9,
+ const __m128i *preg10,
+ const __m128i *preg11,
+ const __m128i *preg12,
+ const __m128i *preg13,
+ const __m128i *preg14,
+ const __m128i *preg15,
+ const __m128i *zero) {
+ int overflow = k_check_epi32_overflow_4(preg0, preg1, preg2, preg3, zero);
+ if (!overflow) {
+ overflow = k_check_epi32_overflow_4(preg4, preg5, preg6, preg7, zero);
+ if (!overflow) {
+ overflow = k_check_epi32_overflow_4(preg8, preg9, preg10, preg11,
+ zero);
+ if (!overflow) {
+ overflow = k_check_epi32_overflow_4(preg12, preg13, preg14, preg15,
+ zero);
+ }
+ }
+ }
+ return overflow;
+}
+
+static INLINE int k_check_epi32_overflow_32(const __m128i *preg0,
+ const __m128i *preg1,
+ const __m128i *preg2,
+ const __m128i *preg3,
+ const __m128i *preg4,
+ const __m128i *preg5,
+ const __m128i *preg6,
+ const __m128i *preg7,
+ const __m128i *preg8,
+ const __m128i *preg9,
+ const __m128i *preg10,
+ const __m128i *preg11,
+ const __m128i *preg12,
+ const __m128i *preg13,
+ const __m128i *preg14,
+ const __m128i *preg15,
+ const __m128i *preg16,
+ const __m128i *preg17,
+ const __m128i *preg18,
+ const __m128i *preg19,
+ const __m128i *preg20,
+ const __m128i *preg21,
+ const __m128i *preg22,
+ const __m128i *preg23,
+ const __m128i *preg24,
+ const __m128i *preg25,
+ const __m128i *preg26,
+ const __m128i *preg27,
+ const __m128i *preg28,
+ const __m128i *preg29,
+ const __m128i *preg30,
+ const __m128i *preg31,
+ const __m128i *zero) {
+ int overflow = k_check_epi32_overflow_4(preg0, preg1, preg2, preg3, zero);
+ if (!overflow) {
+ overflow = k_check_epi32_overflow_4(preg4, preg5, preg6, preg7, zero);
+ if (!overflow) {
+ overflow = k_check_epi32_overflow_4(preg8, preg9, preg10, preg11, zero);
+ if (!overflow) {
+ overflow = k_check_epi32_overflow_4(preg12, preg13, preg14, preg15,
+ zero);
+ if (!overflow) {
+ overflow = k_check_epi32_overflow_4(preg16, preg17, preg18, preg19,
+ zero);
+ if (!overflow) {
+ overflow = k_check_epi32_overflow_4(preg20, preg21,
+ preg22, preg23, zero);
+ if (!overflow) {
+ overflow = k_check_epi32_overflow_4(preg24, preg25,
+ preg26, preg27, zero);
+ if (!overflow) {
+ overflow = k_check_epi32_overflow_4(preg28, preg29,
+ preg30, preg31, zero);
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ return overflow;
+}
+
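+// Stores eight 16-bit results to a tran_low_t buffer. In high-bit-depth
+// builds tran_low_t is 32 bits wide, so the values are sign-extended and
+// written as two 128-bit stores; otherwise they are stored directly.
+// storeu_output below is the unaligned variant.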
+static INLINE void store_output(const __m128i *poutput, tran_low_t* dst_ptr) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero);
+ __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits);
+ __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits);
+ _mm_store_si128((__m128i *)(dst_ptr), out0);
+ _mm_store_si128((__m128i *)(dst_ptr + 4), out1);
+#else
+ _mm_store_si128((__m128i *)(dst_ptr), *poutput);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+}
+
+static INLINE void storeu_output(const __m128i *poutput, tran_low_t* dst_ptr) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero);
+ __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits);
+ __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits);
+ _mm_storeu_si128((__m128i *)(dst_ptr), out0);
+ _mm_storeu_si128((__m128i *)(dst_ptr + 4), out1);
+#else
+ _mm_storeu_si128((__m128i *)(dst_ptr), *poutput);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+}
+
+
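+// Multiply/round/shift helper: computes
+//   packs_epi32((madd(in0, mult) + rounding) >> shift,
+//               (madd(in1, mult) + rounding) >> shift),
+// i.e. the dct_const_round_shift pattern on two interleaved registers.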
+static INLINE __m128i mult_round_shift(const __m128i *pin0,
+ const __m128i *pin1,
+ const __m128i *pmultiplier,
+ const __m128i *prounding,
+ const int shift) {
+ const __m128i u0 = _mm_madd_epi16(*pin0, *pmultiplier);
+ const __m128i u1 = _mm_madd_epi16(*pin1, *pmultiplier);
+ const __m128i v0 = _mm_add_epi32(u0, *prounding);
+ const __m128i v1 = _mm_add_epi32(u1, *prounding);
+ const __m128i w0 = _mm_srai_epi32(v0, shift);
+ const __m128i w1 = _mm_srai_epi32(v1, shift);
+ return _mm_packs_epi32(w0, w1);
+}
+
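+// Transposes an 8x8 block held in eight registers and writes it out. Pass 0
+// stores to the 16-bit intermediate buffer; pass 1 goes through
+// storeu_output so the final coefficients are widened to tran_low_t when
+// needed.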
+static INLINE void transpose_and_output8x8(
+ const __m128i *pin00, const __m128i *pin01,
+ const __m128i *pin02, const __m128i *pin03,
+ const __m128i *pin04, const __m128i *pin05,
+ const __m128i *pin06, const __m128i *pin07,
+ const int pass, int16_t* out0_ptr,
+ tran_low_t* out1_ptr) {
+ // 00 01 02 03 04 05 06 07
+ // 10 11 12 13 14 15 16 17
+ // 20 21 22 23 24 25 26 27
+ // 30 31 32 33 34 35 36 37
+ // 40 41 42 43 44 45 46 47
+ // 50 51 52 53 54 55 56 57
+ // 60 61 62 63 64 65 66 67
+ // 70 71 72 73 74 75 76 77
+ const __m128i tr0_0 = _mm_unpacklo_epi16(*pin00, *pin01);
+ const __m128i tr0_1 = _mm_unpacklo_epi16(*pin02, *pin03);
+ const __m128i tr0_2 = _mm_unpackhi_epi16(*pin00, *pin01);
+ const __m128i tr0_3 = _mm_unpackhi_epi16(*pin02, *pin03);
+ const __m128i tr0_4 = _mm_unpacklo_epi16(*pin04, *pin05);
+ const __m128i tr0_5 = _mm_unpacklo_epi16(*pin06, *pin07);
+ const __m128i tr0_6 = _mm_unpackhi_epi16(*pin04, *pin05);
+ const __m128i tr0_7 = _mm_unpackhi_epi16(*pin06, *pin07);
+ // 00 10 01 11 02 12 03 13
+ // 20 30 21 31 22 32 23 33
+ // 04 14 05 15 06 16 07 17
+ // 24 34 25 35 26 36 27 37
+ // 40 50 41 51 42 52 43 53
+ // 60 70 61 71 62 72 63 73
+  // 44 54 45 55 46 56 47 57
+ // 64 74 65 75 66 76 67 77
+ const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+ const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
+ const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+ const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
+ const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
+ const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
+ const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
+ const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
+ // 00 10 20 30 01 11 21 31
+ // 40 50 60 70 41 51 61 71
+ // 02 12 22 32 03 13 23 33
+ // 42 52 62 72 43 53 63 73
+  // 04 14 24 34 05 15 25 35
+  // 44 54 64 74 45 55 65 75
+ // 06 16 26 36 07 17 27 37
+ // 46 56 66 76 47 57 67 77
+ const __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
+ const __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
+ const __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
+ const __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
+ const __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
+ const __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
+ const __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
+ const __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
+ // 00 10 20 30 40 50 60 70
+ // 01 11 21 31 41 51 61 71
+ // 02 12 22 32 42 52 62 72
+ // 03 13 23 33 43 53 63 73
+ // 04 14 24 34 44 54 64 74
+ // 05 15 25 35 45 55 65 75
+ // 06 16 26 36 46 56 66 76
+ // 07 17 27 37 47 57 67 77
+ if (pass == 0) {
+ _mm_storeu_si128((__m128i*)(out0_ptr + 0 * 16), tr2_0);
+ _mm_storeu_si128((__m128i*)(out0_ptr + 1 * 16), tr2_1);
+ _mm_storeu_si128((__m128i*)(out0_ptr + 2 * 16), tr2_2);
+ _mm_storeu_si128((__m128i*)(out0_ptr + 3 * 16), tr2_3);
+ _mm_storeu_si128((__m128i*)(out0_ptr + 4 * 16), tr2_4);
+ _mm_storeu_si128((__m128i*)(out0_ptr + 5 * 16), tr2_5);
+ _mm_storeu_si128((__m128i*)(out0_ptr + 6 * 16), tr2_6);
+ _mm_storeu_si128((__m128i*)(out0_ptr + 7 * 16), tr2_7);
+ } else {
+ storeu_output(&tr2_0, (out1_ptr + 0 * 16));
+ storeu_output(&tr2_1, (out1_ptr + 1 * 16));
+ storeu_output(&tr2_2, (out1_ptr + 2 * 16));
+ storeu_output(&tr2_3, (out1_ptr + 3 * 16));
+ storeu_output(&tr2_4, (out1_ptr + 4 * 16));
+ storeu_output(&tr2_5, (out1_ptr + 5 * 16));
+ storeu_output(&tr2_6, (out1_ptr + 6 * 16));
+ storeu_output(&tr2_7, (out1_ptr + 7 * 16));
+ }
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VP9_ENCODER_X86_VP9_DCT_SSE2_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct_ssse3.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct_ssse3.c
new file mode 100644
index 00000000000..1c1005aeeda
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct_ssse3.c
@@ -0,0 +1,469 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#if defined(_MSC_VER) && _MSC_VER <= 1500
+// Need to include math.h before including tmmintrin.h/intrin.h
+// in certain versions of MSVS.
+#include <math.h>
+#endif
+#include <tmmintrin.h> // SSSE3
+#include "vp9/common/x86/vp9_idct_intrin_sse2.h"
+
+void vp9_fdct8x8_quant_ssse3(const int16_t *input, int stride,
+ int16_t* coeff_ptr, intptr_t n_coeffs,
+ int skip_block, const int16_t* zbin_ptr,
+ const int16_t* round_ptr, const int16_t* quant_ptr,
+ const int16_t* quant_shift_ptr,
+ int16_t* qcoeff_ptr,
+ int16_t* dqcoeff_ptr, const int16_t* dequant_ptr,
+ uint16_t* eob_ptr,
+ const int16_t* scan_ptr,
+ const int16_t* iscan_ptr) {
+ __m128i zero;
+ int pass;
+  // Constants
+  //    In one case all lanes hold the same constant. In every other case a
+  //    pair of constants is repeated four times, which is done by
+  //    constructing the 32-bit constant corresponding to that pair.
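+  // 23170 is 2 * cospi_16_64, so _mm_mulhrs_epi16 against this constant
+  // computes (x * cospi_16_64 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS in a
+  // single 16-bit operation (see the second butterfly block below).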
+ const __m128i k__dual_p16_p16 = dual_set_epi16(23170, 23170);
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
+ const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
+ const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
+ const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+ const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
+ const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ // Load input
+ __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
+ __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
+ __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
+ __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride));
+ __m128i in4 = _mm_load_si128((const __m128i *)(input + 4 * stride));
+ __m128i in5 = _mm_load_si128((const __m128i *)(input + 5 * stride));
+ __m128i in6 = _mm_load_si128((const __m128i *)(input + 6 * stride));
+ __m128i in7 = _mm_load_si128((const __m128i *)(input + 7 * stride));
+ __m128i *in[8];
+ int index = 0;
+
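+  // These parameters are unused by this version; the casts below silence
+  // unused-parameter warnings.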
+ (void)scan_ptr;
+ (void)zbin_ptr;
+ (void)quant_shift_ptr;
+ (void)coeff_ptr;
+
+ // Pre-condition input (shift by two)
+ in0 = _mm_slli_epi16(in0, 2);
+ in1 = _mm_slli_epi16(in1, 2);
+ in2 = _mm_slli_epi16(in2, 2);
+ in3 = _mm_slli_epi16(in3, 2);
+ in4 = _mm_slli_epi16(in4, 2);
+ in5 = _mm_slli_epi16(in5, 2);
+ in6 = _mm_slli_epi16(in6, 2);
+ in7 = _mm_slli_epi16(in7, 2);
+
+ in[0] = &in0;
+ in[1] = &in1;
+ in[2] = &in2;
+ in[3] = &in3;
+ in[4] = &in4;
+ in[5] = &in5;
+ in[6] = &in6;
+ in[7] = &in7;
+
+ // We do two passes, first the columns, then the rows. The results of the
+ // first pass are transposed so that the same column code can be reused. The
+ // results of the second pass are also transposed so that the rows (processed
+ // as columns) are put back in row positions.
+ for (pass = 0; pass < 2; pass++) {
+ // To store results of each pass before the transpose.
+ __m128i res0, res1, res2, res3, res4, res5, res6, res7;
+ // Add/subtract
+ const __m128i q0 = _mm_add_epi16(in0, in7);
+ const __m128i q1 = _mm_add_epi16(in1, in6);
+ const __m128i q2 = _mm_add_epi16(in2, in5);
+ const __m128i q3 = _mm_add_epi16(in3, in4);
+ const __m128i q4 = _mm_sub_epi16(in3, in4);
+ const __m128i q5 = _mm_sub_epi16(in2, in5);
+ const __m128i q6 = _mm_sub_epi16(in1, in6);
+ const __m128i q7 = _mm_sub_epi16(in0, in7);
+ // Work on first four results
+ {
+ // Add/subtract
+ const __m128i r0 = _mm_add_epi16(q0, q3);
+ const __m128i r1 = _mm_add_epi16(q1, q2);
+ const __m128i r2 = _mm_sub_epi16(q1, q2);
+ const __m128i r3 = _mm_sub_epi16(q0, q3);
+      // Interleave to do the multiply by constants which gets us into 32 bits.
+ const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
+ const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
+ const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
+ const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
+
+ const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
+ const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
+ const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
+ const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16);
+
+ const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
+ const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08);
+ const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
+ const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24);
+ // dct_const_round_shift
+
+ const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+ const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+ const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+ const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+
+ const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
+ const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
+ const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
+ const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
+
+ const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+
+ const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+ const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+ const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+ const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+ // Combine
+
+ res0 = _mm_packs_epi32(w0, w1);
+ res4 = _mm_packs_epi32(w2, w3);
+ res2 = _mm_packs_epi32(w4, w5);
+ res6 = _mm_packs_epi32(w6, w7);
+ }
+ // Work on next four results
+ {
+      // No interleave needed here: _mm_mulhrs_epi16 against 2 * cospi_16_64
+      // performs the multiply and the dct_const_round_shift in one 16-bit
+      // step.
+ const __m128i d0 = _mm_sub_epi16(q6, q5);
+ const __m128i d1 = _mm_add_epi16(q6, q5);
+ const __m128i r0 = _mm_mulhrs_epi16(d0, k__dual_p16_p16);
+ const __m128i r1 = _mm_mulhrs_epi16(d1, k__dual_p16_p16);
+
+ // Add/subtract
+ const __m128i x0 = _mm_add_epi16(q4, r0);
+ const __m128i x1 = _mm_sub_epi16(q4, r0);
+ const __m128i x2 = _mm_sub_epi16(q7, r1);
+ const __m128i x3 = _mm_add_epi16(q7, r1);
+      // Interleave to do the multiply by constants which gets us into 32 bits.
+ const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
+ const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
+ const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
+ const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
+ const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04);
+ const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04);
+ const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28);
+ const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28);
+ const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20);
+ const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20);
+ const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12);
+ const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12);
+ // dct_const_round_shift
+ const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+ const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+ const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+ const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+ const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
+ const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
+ const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
+ const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
+ const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+ const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+ const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+ const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+ // Combine
+ res1 = _mm_packs_epi32(w0, w1);
+ res7 = _mm_packs_epi32(w2, w3);
+ res5 = _mm_packs_epi32(w4, w5);
+ res3 = _mm_packs_epi32(w6, w7);
+ }
+ // Transpose the 8x8.
+ {
+ // 00 01 02 03 04 05 06 07
+ // 10 11 12 13 14 15 16 17
+ // 20 21 22 23 24 25 26 27
+ // 30 31 32 33 34 35 36 37
+ // 40 41 42 43 44 45 46 47
+ // 50 51 52 53 54 55 56 57
+ // 60 61 62 63 64 65 66 67
+ // 70 71 72 73 74 75 76 77
+ const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1);
+ const __m128i tr0_1 = _mm_unpacklo_epi16(res2, res3);
+ const __m128i tr0_2 = _mm_unpackhi_epi16(res0, res1);
+ const __m128i tr0_3 = _mm_unpackhi_epi16(res2, res3);
+ const __m128i tr0_4 = _mm_unpacklo_epi16(res4, res5);
+ const __m128i tr0_5 = _mm_unpacklo_epi16(res6, res7);
+ const __m128i tr0_6 = _mm_unpackhi_epi16(res4, res5);
+ const __m128i tr0_7 = _mm_unpackhi_epi16(res6, res7);
+ // 00 10 01 11 02 12 03 13
+ // 20 30 21 31 22 32 23 33
+ // 04 14 05 15 06 16 07 17
+ // 24 34 25 35 26 36 27 37
+ // 40 50 41 51 42 52 43 53
+ // 60 70 61 71 62 72 63 73
+    // 44 54 45 55 46 56 47 57
+ // 64 74 65 75 66 76 67 77
+ const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+ const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
+ const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+ const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
+ const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
+ const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
+ const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
+ const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
+ // 00 10 20 30 01 11 21 31
+ // 40 50 60 70 41 51 61 71
+ // 02 12 22 32 03 13 23 33
+ // 42 52 62 72 43 53 63 73
+    // 04 14 24 34 05 15 25 35
+    // 44 54 64 74 45 55 65 75
+ // 06 16 26 36 07 17 27 37
+ // 46 56 66 76 47 57 67 77
+ in0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
+ in1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
+ in2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
+ in3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
+ in4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
+ in5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
+ in6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
+ in7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
+ // 00 10 20 30 40 50 60 70
+ // 01 11 21 31 41 51 61 71
+ // 02 12 22 32 42 52 62 72
+ // 03 13 23 33 43 53 63 73
+ // 04 14 24 34 44 54 64 74
+ // 05 15 25 35 45 55 65 75
+ // 06 16 26 36 46 56 66 76
+ // 07 17 27 37 47 57 67 77
+ }
+ }
+ // Post-condition output and store it
+ {
+ // Post-condition (division by two)
+ // division of two 16 bits signed numbers using shifts
+ // n / 2 = (n - (n >> 15)) >> 1
+ const __m128i sign_in0 = _mm_srai_epi16(in0, 15);
+ const __m128i sign_in1 = _mm_srai_epi16(in1, 15);
+ const __m128i sign_in2 = _mm_srai_epi16(in2, 15);
+ const __m128i sign_in3 = _mm_srai_epi16(in3, 15);
+ const __m128i sign_in4 = _mm_srai_epi16(in4, 15);
+ const __m128i sign_in5 = _mm_srai_epi16(in5, 15);
+ const __m128i sign_in6 = _mm_srai_epi16(in6, 15);
+ const __m128i sign_in7 = _mm_srai_epi16(in7, 15);
+ in0 = _mm_sub_epi16(in0, sign_in0);
+ in1 = _mm_sub_epi16(in1, sign_in1);
+ in2 = _mm_sub_epi16(in2, sign_in2);
+ in3 = _mm_sub_epi16(in3, sign_in3);
+ in4 = _mm_sub_epi16(in4, sign_in4);
+ in5 = _mm_sub_epi16(in5, sign_in5);
+ in6 = _mm_sub_epi16(in6, sign_in6);
+ in7 = _mm_sub_epi16(in7, sign_in7);
+ in0 = _mm_srai_epi16(in0, 1);
+ in1 = _mm_srai_epi16(in1, 1);
+ in2 = _mm_srai_epi16(in2, 1);
+ in3 = _mm_srai_epi16(in3, 1);
+ in4 = _mm_srai_epi16(in4, 1);
+ in5 = _mm_srai_epi16(in5, 1);
+ in6 = _mm_srai_epi16(in6, 1);
+ in7 = _mm_srai_epi16(in7, 1);
+ }
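+  // Illustrative scalar sketch of the n/2 identity used above (assumes
+  // arithmetic right shift on two's-complement int16_t):
+  //   static int16_t div2_round_to_zero(int16_t n) {
+  //     return (int16_t)((n - (n >> 15)) >> 1);
+  //   }
+  //   div2_round_to_zero(7) == 3 and div2_round_to_zero(-7) == -3:
+  //   n >> 15 is -1 only for negative n, supplying the bias that makes
+  //   the shift round toward zero like integer division.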
+
+ iscan_ptr += n_coeffs;
+ qcoeff_ptr += n_coeffs;
+ dqcoeff_ptr += n_coeffs;
+ n_coeffs = -n_coeffs;
+ zero = _mm_setzero_si128();
+
+ if (!skip_block) {
+ __m128i eob;
+ __m128i round, quant, dequant, thr;
+ int16_t nzflag;
+ {
+ __m128i coeff0, coeff1;
+
+ // Setup global values
+ {
+ round = _mm_load_si128((const __m128i*)round_ptr);
+ quant = _mm_load_si128((const __m128i*)quant_ptr);
+ dequant = _mm_load_si128((const __m128i*)dequant_ptr);
+ }
+
+ {
+ __m128i coeff0_sign, coeff1_sign;
+ __m128i qcoeff0, qcoeff1;
+ __m128i qtmp0, qtmp1;
+ // Do DC and first 15 AC
+ coeff0 = *in[0];
+ coeff1 = *in[1];
+
+ // Poor man's sign extract
+ coeff0_sign = _mm_srai_epi16(coeff0, 15);
+ coeff1_sign = _mm_srai_epi16(coeff1, 15);
+ qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
+ qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
+ qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+ qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
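+      // The xor/sub pair above is a branchless abs(); scalar sketch:
+      // with s = x >> 15 (0 or -1), abs(x) == (x ^ s) - s, since XOR
+      // with -1 is bitwise NOT and subtracting -1 adds one.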
+
+ qcoeff0 = _mm_adds_epi16(qcoeff0, round);
+ round = _mm_unpackhi_epi64(round, round);
+ qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+ qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
+ quant = _mm_unpackhi_epi64(quant, quant);
+ qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
+
+ // Reinsert signs
+ qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
+ qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
+ qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+ qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+ _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0);
+ _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
+
+ coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);
+ coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+
+ _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0);
+ _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
+ }
+
+ {
+ // Scan for eob
+ __m128i zero_coeff0, zero_coeff1;
+ __m128i nzero_coeff0, nzero_coeff1;
+ __m128i iscan0, iscan1;
+ __m128i eob1;
+ zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
+ zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
+ nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
+ nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
+ iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs));
+ iscan1 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs) + 1);
+ // Add one to convert from indices to counts
+ iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
+ iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
+ eob = _mm_and_si128(iscan0, nzero_coeff0);
+ eob1 = _mm_and_si128(iscan1, nzero_coeff1);
+ eob = _mm_max_epi16(eob, eob1);
+ }
+ n_coeffs += 8 * 2;
+ }
+
+ // AC only loop
+ index = 2;
+ thr = _mm_srai_epi16(dequant, 1);
+ while (n_coeffs < 0) {
+ __m128i coeff0, coeff1;
+ {
+ __m128i coeff0_sign, coeff1_sign;
+ __m128i qcoeff0, qcoeff1;
+ __m128i qtmp0, qtmp1;
+
+ assert(index < (int)(sizeof(in) / sizeof(in[0])) - 1);
+ coeff0 = *in[index];
+ coeff1 = *in[index + 1];
+
+ // Poor man's sign extract
+ coeff0_sign = _mm_srai_epi16(coeff0, 15);
+ coeff1_sign = _mm_srai_epi16(coeff1, 15);
+ qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
+ qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
+ qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+ qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+ nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) |
+ _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr));
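+        // If no lane's magnitude exceeds thr (half the dequant step), the
+        // whole group is treated as quantizing to zero and the multiply
+        // and store work below is skipped.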
+
+ if (nzflag) {
+ qcoeff0 = _mm_adds_epi16(qcoeff0, round);
+ qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+ qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
+ qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
+
+ // Reinsert signs
+ qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
+ qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
+ qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+ qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+ _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0);
+ _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
+
+ coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
+ coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+
+ _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0);
+ _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
+ } else {
+ _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), zero);
+ _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, zero);
+
+ _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), zero);
+ _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, zero);
+ }
+ }
+
+ if (nzflag) {
+ // Scan for eob
+ __m128i zero_coeff0, zero_coeff1;
+ __m128i nzero_coeff0, nzero_coeff1;
+ __m128i iscan0, iscan1;
+ __m128i eob0, eob1;
+ zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
+ zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
+ nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
+ nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
+ iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs));
+ iscan1 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs) + 1);
+ // Add one to convert from indices to counts
+ iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
+ iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
+ eob0 = _mm_and_si128(iscan0, nzero_coeff0);
+ eob1 = _mm_and_si128(iscan1, nzero_coeff1);
+ eob0 = _mm_max_epi16(eob0, eob1);
+ eob = _mm_max_epi16(eob, eob0);
+ }
+ n_coeffs += 8 * 2;
+ index += 2;
+ }
+
+ // Accumulate EOB
+ {
+ __m128i eob_shuffled;
+ eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ *eob_ptr = _mm_extract_epi16(eob, 1);
+ }
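+    // The shuffle/max ladder above is a horizontal maximum; scalar sketch,
+    // with lane as a hypothetical int16_t[8] view of eob:
+    //   int16_t m = lane[0];
+    //   for (i = 1; i < 8; ++i) if (lane[i] > m) m = lane[i];
+    //   *eob_ptr = m;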
+ } else {
+ do {
+ _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), zero);
+ _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, zero);
+ _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), zero);
+ _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, zero);
+ n_coeffs += 8 * 2;
+ } while (n_coeffs < 0);
+ *eob_ptr = 0;
+ }
+}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct_ssse3_x86_64.asm b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct_ssse3_x86_64.asm
index 28458dcdd52..3a29aba6f27 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct_ssse3_x86_64.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct_ssse3_x86_64.asm
@@ -179,4 +179,77 @@ cglobal fdct8x8, 3, 5, 13, input, output, stride
mova [outputq + 112], m7
RET
+
+%macro HMD8_1D 0
+ psubw m8, m0, m1
+ psubw m9, m2, m3
+ paddw m0, m1
+ paddw m2, m3
+ SWAP 1, 8
+ SWAP 3, 9
+ psubw m8, m4, m5
+ psubw m9, m6, m7
+ paddw m4, m5
+ paddw m6, m7
+ SWAP 5, 8
+ SWAP 7, 9
+
+ psubw m8, m0, m2
+ psubw m9, m1, m3
+ paddw m0, m2
+ paddw m1, m3
+ SWAP 2, 8
+ SWAP 3, 9
+ psubw m8, m4, m6
+ psubw m9, m5, m7
+ paddw m4, m6
+ paddw m5, m7
+ SWAP 6, 8
+ SWAP 7, 9
+
+ psubw m8, m0, m4
+ psubw m9, m1, m5
+ paddw m0, m4
+ paddw m1, m5
+ SWAP 4, 8
+ SWAP 5, 9
+ psubw m8, m2, m6
+ psubw m9, m3, m7
+ paddw m2, m6
+ paddw m3, m7
+ SWAP 6, 8
+ SWAP 7, 9
+%endmacro
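+; Each SWAP-delimited group above is one butterfly stage of an 8-point
+; Hadamard transform; in scalar terms a stage computes, for each pair
+; (a, b): sum = a + b, diff = a - b. The three stages pair registers at
+; strides 1, 2 and 4 to complete the 1-D transform.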
+
+INIT_XMM ssse3
+cglobal hadamard_8x8, 3, 5, 10, input, stride, output
+ lea r3, [2 * strideq]
+ lea r4, [4 * strideq]
+
+ mova m0, [inputq]
+ mova m1, [inputq + r3]
+ lea inputq, [inputq + r4]
+ mova m2, [inputq]
+ mova m3, [inputq + r3]
+ lea inputq, [inputq + r4]
+ mova m4, [inputq]
+ mova m5, [inputq + r3]
+ lea inputq, [inputq + r4]
+ mova m6, [inputq]
+ mova m7, [inputq + r3]
+
+ HMD8_1D
+ TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
+ HMD8_1D
+
+ mova [outputq + 0], m0
+ mova [outputq + 16], m1
+ mova [outputq + 32], m2
+ mova [outputq + 48], m3
+ mova [outputq + 64], m4
+ mova [outputq + 80], m5
+ mova [outputq + 96], m6
+ mova [outputq + 112], m7
+
+ RET
%endif
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_denoiser_sse2.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_denoiser_sse2.c
index bf5fa889f22..bf7c7af7707 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_denoiser_sse2.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_denoiser_sse2.c
@@ -23,18 +23,17 @@
// Compute the sum of all pixel differences of this MB.
static INLINE int sum_diff_16x1(__m128i acc_diff) {
const __m128i k_1 = _mm_set1_epi16(1);
- const __m128i acc_diff_lo = _mm_srai_epi16(
- _mm_unpacklo_epi8(acc_diff, acc_diff), 8);
- const __m128i acc_diff_hi = _mm_srai_epi16(
- _mm_unpackhi_epi8(acc_diff, acc_diff), 8);
+ const __m128i acc_diff_lo =
+ _mm_srai_epi16(_mm_unpacklo_epi8(acc_diff, acc_diff), 8);
+ const __m128i acc_diff_hi =
+ _mm_srai_epi16(_mm_unpackhi_epi8(acc_diff, acc_diff), 8);
const __m128i acc_diff_16 = _mm_add_epi16(acc_diff_lo, acc_diff_hi);
const __m128i hg_fe_dc_ba = _mm_madd_epi16(acc_diff_16, k_1);
- const __m128i hgfe_dcba = _mm_add_epi32(hg_fe_dc_ba,
- _mm_srli_si128(hg_fe_dc_ba, 8));
- const __m128i hgfedcba = _mm_add_epi32(hgfe_dcba,
- _mm_srli_si128(hgfe_dcba, 4));
- int sum_diff = _mm_cvtsi128_si32(hgfedcba);
- return sum_diff;
+ const __m128i hgfe_dcba =
+ _mm_add_epi32(hg_fe_dc_ba, _mm_srli_si128(hg_fe_dc_ba, 8));
+ const __m128i hgfedcba =
+ _mm_add_epi32(hgfe_dcba, _mm_srli_si128(hgfe_dcba, 4));
+ return _mm_cvtsi128_si32(hgfedcba);
}
// Denoise a 16x1 vector.
@@ -51,8 +50,8 @@ static INLINE __m128i vp9_denoiser_16x1_sse2(const uint8_t *sig,
__m128i acc_diff) {
// Calculate differences
const __m128i v_sig = _mm_loadu_si128((const __m128i *)(&sig[0]));
- const __m128i v_mc_running_avg_y = _mm_loadu_si128(
- (const __m128i *)(&mc_running_avg_y[0]));
+ const __m128i v_mc_running_avg_y =
+ _mm_loadu_si128((const __m128i *)(&mc_running_avg_y[0]));
__m128i v_running_avg_y;
const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg_y, v_sig);
const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg_y);
@@ -60,8 +59,8 @@ static INLINE __m128i vp9_denoiser_16x1_sse2(const uint8_t *sig,
const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, *k_0);
// Clamp absolute difference to 16 to be used to get mask. Doing this
// allows us to use _mm_cmpgt_epi8, which operates on signed byte.
- const __m128i clamped_absdiff = _mm_min_epu8(
- _mm_or_si128(pdiff, ndiff), *k_16);
+ const __m128i clamped_absdiff =
+ _mm_min_epu8(_mm_or_si128(pdiff, ndiff), *k_16);
// Get masks for l2 l1 and l0 adjustments.
const __m128i mask2 = _mm_cmpgt_epi8(*k_16, clamped_absdiff);
const __m128i mask1 = _mm_cmpgt_epi8(*k_8, clamped_absdiff);
@@ -95,24 +94,22 @@ static INLINE __m128i vp9_denoiser_16x1_sse2(const uint8_t *sig,
}
// Denoise a 16x1 vector with a weaker filter.
-static INLINE __m128i vp9_denoiser_adj_16x1_sse2(const uint8_t *sig,
- const uint8_t *mc_running_avg_y,
- uint8_t *running_avg_y,
- const __m128i k_0,
- const __m128i k_delta,
- __m128i acc_diff) {
+static INLINE __m128i vp9_denoiser_adj_16x1_sse2(
+ const uint8_t *sig, const uint8_t *mc_running_avg_y,
+ uint8_t *running_avg_y, const __m128i k_0,
+ const __m128i k_delta, __m128i acc_diff) {
__m128i v_running_avg_y = _mm_loadu_si128((__m128i *)(&running_avg_y[0]));
// Calculate differences.
const __m128i v_sig = _mm_loadu_si128((const __m128i *)(&sig[0]));
const __m128i v_mc_running_avg_y =
- _mm_loadu_si128((const __m128i *)(&mc_running_avg_y[0]));
+ _mm_loadu_si128((const __m128i *)(&mc_running_avg_y[0]));
const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg_y, v_sig);
const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg_y);
// Obtain the sign. FF if diff is negative.
const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, k_0);
// Clamp absolute difference to delta to get the adjustment.
const __m128i adj =
- _mm_min_epu8(_mm_or_si128(pdiff, ndiff), k_delta);
+ _mm_min_epu8(_mm_or_si128(pdiff, ndiff), k_delta);
// Restore the sign and get positive and negative adjustments.
__m128i padj, nadj;
padj = _mm_andnot_si128(diff_sign, adj);
@@ -128,19 +125,16 @@ static INLINE __m128i vp9_denoiser_adj_16x1_sse2(const uint8_t *sig,
return acc_diff;
}
-static int vp9_denoiser_4xM_sse2(const uint8_t *sig, int sig_stride,
- const uint8_t *mc_running_avg_y,
- int mc_avg_y_stride,
- uint8_t *running_avg_y, int avg_y_stride,
- int increase_denoising,
- BLOCK_SIZE bs,
- int motion_magnitude) {
- int sum_diff_thresh;
- int r;
- int shift_inc = (increase_denoising &&
- motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 1 : 0;
- unsigned char sig_buffer[2][16], mc_running_buffer[2][16],
- running_buffer[2][16];
+// Denoiser for 4xM and 8xM blocks.
+static int vp9_denoiser_NxM_sse2_small(
+ const uint8_t *sig, int sig_stride, const uint8_t *mc_running_avg_y,
+ int mc_avg_y_stride, uint8_t *running_avg_y, int avg_y_stride,
+ int increase_denoising, BLOCK_SIZE bs, int motion_magnitude, int width) {
+ int sum_diff_thresh, r, sum_diff = 0;
+ const int shift_inc = (increase_denoising &&
+ motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ?
+ 1 : 0;
+ uint8_t sig_buffer[8][16], mc_running_buffer[8][16], running_buffer[8][16];
__m128i acc_diff = _mm_setzero_si128();
const __m128i k_0 = _mm_setzero_si128();
const __m128i k_4 = _mm_set1_epi8(4 + shift_inc);
@@ -148,145 +142,50 @@ static int vp9_denoiser_4xM_sse2(const uint8_t *sig, int sig_stride,
const __m128i k_16 = _mm_set1_epi8(16);
// Modify each level's adjustment according to motion_magnitude.
const __m128i l3 = _mm_set1_epi8(
- (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ?
- 7 + shift_inc : 6);
+ (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 7 + shift_inc : 6);
// Difference between level 3 and level 2 is 2.
const __m128i l32 = _mm_set1_epi8(2);
// Difference between level 2 and level 1 is 1.
const __m128i l21 = _mm_set1_epi8(1);
- int sum_diff = 0;
-
- for (r = 0; r < ((4 << b_height_log2_lookup[bs]) >> 2); ++r) {
- vpx_memcpy(sig_buffer[r], sig, 4);
- vpx_memcpy(sig_buffer[r] + 4, sig + sig_stride, 4);
- vpx_memcpy(sig_buffer[r] + 8, sig + sig_stride * 2, 4);
- vpx_memcpy(sig_buffer[r] + 12, sig + sig_stride * 3, 4);
- vpx_memcpy(mc_running_buffer[r], mc_running_avg_y, 4);
- vpx_memcpy(mc_running_buffer[r] + 4, mc_running_avg_y +
- mc_avg_y_stride, 4);
- vpx_memcpy(mc_running_buffer[r] + 8, mc_running_avg_y +
- mc_avg_y_stride * 2, 4);
- vpx_memcpy(mc_running_buffer[r] + 12, mc_running_avg_y +
- mc_avg_y_stride * 3, 4);
- vpx_memcpy(running_buffer[r], running_avg_y, 4);
- vpx_memcpy(running_buffer[r] + 4, running_avg_y +
- avg_y_stride, 4);
- vpx_memcpy(running_buffer[r] + 8, running_avg_y +
- avg_y_stride * 2, 4);
- vpx_memcpy(running_buffer[r] + 12, running_avg_y +
- avg_y_stride * 3, 4);
- acc_diff = vp9_denoiser_16x1_sse2(sig_buffer[r],
- mc_running_buffer[r],
- running_buffer[r],
- &k_0, &k_4, &k_8, &k_16,
- &l3, &l32, &l21, acc_diff);
- vpx_memcpy(running_avg_y, running_buffer[r], 4);
- vpx_memcpy(running_avg_y + avg_y_stride, running_buffer[r] + 4, 4);
- vpx_memcpy(running_avg_y + avg_y_stride * 2,
- running_buffer[r] + 8, 4);
- vpx_memcpy(running_avg_y + avg_y_stride * 3,
- running_buffer[r] + 12, 4);
- // Update pointers for next iteration.
- sig += (sig_stride << 2);
- mc_running_avg_y += (mc_avg_y_stride << 2);
- running_avg_y += (avg_y_stride << 2);
- }
-
- {
- sum_diff = sum_diff_16x1(acc_diff);
- sum_diff_thresh = total_adj_strong_thresh(bs, increase_denoising);
- if (abs(sum_diff) > sum_diff_thresh) {
- // Before returning to copy the block (i.e., apply no denoising),
- // checK if we can still apply some (weaker) temporal filtering to
- // this block, that would otherwise not be denoised at all. Simplest
- // is to apply an additional adjustment to running_avg_y to bring it
- // closer to sig. The adjustment is capped by a maximum delta, and
- // chosen such that in most cases the resulting sum_diff will be
- // within the accceptable range given by sum_diff_thresh.
+ const uint8_t shift = (width == 4) ? 2 : 1;
- // The delta is set by the excess of absolute pixel diff over the
- // threshold.
- int delta = ((abs(sum_diff) - sum_diff_thresh)
- >> num_pels_log2_lookup[bs]) + 1;
- // Only apply the adjustment for max delta up to 3.
- if (delta < 4) {
- const __m128i k_delta = _mm_set1_epi8(delta);
- running_avg_y -= avg_y_stride * (4 << b_height_log2_lookup[bs]);
- sum_diff = 0;
- for (r = 0; r < ((4 << b_height_log2_lookup[bs]) >> 2); ++r) {
- acc_diff = vp9_denoiser_adj_16x1_sse2(
- sig_buffer[r], mc_running_buffer[r],
- running_buffer[r], k_0, k_delta,
- acc_diff);
- vpx_memcpy(running_avg_y, running_buffer[r], 4);
- vpx_memcpy(running_avg_y + avg_y_stride, running_buffer[r] + 4, 4);
- vpx_memcpy(running_avg_y + avg_y_stride * 2,
- running_buffer[r] + 8, 4);
- vpx_memcpy(running_avg_y + avg_y_stride * 3,
- running_buffer[r] + 12, 4);
- // Update pointers for next iteration.
- running_avg_y += (avg_y_stride << 2);
- }
- sum_diff = sum_diff_16x1(acc_diff);
- if (abs(sum_diff) > sum_diff_thresh) {
- return COPY_BLOCK;
- }
- } else {
- return COPY_BLOCK;
- }
+ for (r = 0; r < ((4 << b_height_log2_lookup[bs]) >> shift); ++r) {
+ memcpy(sig_buffer[r], sig, width);
+ memcpy(sig_buffer[r] + width, sig + sig_stride, width);
+ memcpy(mc_running_buffer[r], mc_running_avg_y, width);
+ memcpy(mc_running_buffer[r] + width,
+ mc_running_avg_y + mc_avg_y_stride, width);
+ memcpy(running_buffer[r], running_avg_y, width);
+ memcpy(running_buffer[r] + width, running_avg_y + avg_y_stride, width);
+ if (width == 4) {
+ memcpy(sig_buffer[r] + width * 2, sig + sig_stride * 2, width);
+ memcpy(sig_buffer[r] + width * 3, sig + sig_stride * 3, width);
+ memcpy(mc_running_buffer[r] + width * 2,
+ mc_running_avg_y + mc_avg_y_stride * 2, width);
+ memcpy(mc_running_buffer[r] + width * 3,
+ mc_running_avg_y + mc_avg_y_stride * 3, width);
+ memcpy(running_buffer[r] + width * 2,
+ running_avg_y + avg_y_stride * 2, width);
+ memcpy(running_buffer[r] + width * 3,
+ running_avg_y + avg_y_stride * 3, width);
}
- }
- return FILTER_BLOCK;
-}
-
-static int vp9_denoiser_8xM_sse2(const uint8_t *sig, int sig_stride,
- const uint8_t *mc_running_avg_y,
- int mc_avg_y_stride,
- uint8_t *running_avg_y, int avg_y_stride,
- int increase_denoising,
- BLOCK_SIZE bs,
- int motion_magnitude) {
- int sum_diff_thresh;
- int r;
- int shift_inc = (increase_denoising &&
- motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 1 : 0;
- unsigned char sig_buffer[8][16], mc_running_buffer[8][16],
- running_buffer[8][16];
- __m128i acc_diff = _mm_setzero_si128();
- const __m128i k_0 = _mm_setzero_si128();
- const __m128i k_4 = _mm_set1_epi8(4 + shift_inc);
- const __m128i k_8 = _mm_set1_epi8(8);
- const __m128i k_16 = _mm_set1_epi8(16);
- // Modify each level's adjustment according to motion_magnitude.
- const __m128i l3 = _mm_set1_epi8(
- (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ?
- 7 + shift_inc : 6);
- // Difference between level 3 and level 2 is 2.
- const __m128i l32 = _mm_set1_epi8(2);
- // Difference between level 2 and level 1 is 1.
- const __m128i l21 = _mm_set1_epi8(1);
- int sum_diff = 0;
-
- for (r = 0; r < ((4 << b_height_log2_lookup[bs]) >> 1); ++r) {
- vpx_memcpy(sig_buffer[r], sig, 8);
- vpx_memcpy(sig_buffer[r] + 8, sig + sig_stride, 8);
- vpx_memcpy(mc_running_buffer[r], mc_running_avg_y, 8);
- vpx_memcpy(mc_running_buffer[r] + 8, mc_running_avg_y +
- mc_avg_y_stride, 8);
- vpx_memcpy(running_buffer[r], running_avg_y, 8);
- vpx_memcpy(running_buffer[r] + 8, running_avg_y +
- avg_y_stride, 8);
acc_diff = vp9_denoiser_16x1_sse2(sig_buffer[r],
mc_running_buffer[r],
running_buffer[r],
&k_0, &k_4, &k_8, &k_16,
&l3, &l32, &l21, acc_diff);
- vpx_memcpy(running_avg_y, running_buffer[r], 8);
- vpx_memcpy(running_avg_y + avg_y_stride, running_buffer[r] + 8, 8);
+ memcpy(running_avg_y, running_buffer[r], width);
+ memcpy(running_avg_y + avg_y_stride, running_buffer[r] + width, width);
+ if (width == 4) {
+ memcpy(running_avg_y + avg_y_stride * 2,
+ running_buffer[r] + width * 2, width);
+ memcpy(running_avg_y + avg_y_stride * 3,
+ running_buffer[r] + width * 3, width);
+ }
// Update pointers for next iteration.
- sig += (sig_stride << 1);
- mc_running_avg_y += (mc_avg_y_stride << 1);
- running_avg_y += (avg_y_stride << 1);
+ sig += (sig_stride << shift);
+ mc_running_avg_y += (mc_avg_y_stride << shift);
+ running_avg_y += (avg_y_stride << shift);
}
{
@@ -294,54 +193,61 @@ static int vp9_denoiser_8xM_sse2(const uint8_t *sig, int sig_stride,
sum_diff_thresh = total_adj_strong_thresh(bs, increase_denoising);
if (abs(sum_diff) > sum_diff_thresh) {
// Before returning to copy the block (i.e., apply no denoising),
- // checK if we can still apply some (weaker) temporal filtering to
+ // check if we can still apply some (weaker) temporal filtering to
// this block, that would otherwise not be denoised at all. Simplest
// is to apply an additional adjustment to running_avg_y to bring it
// closer to sig. The adjustment is capped by a maximum delta, and
// chosen such that in most cases the resulting sum_diff will be
- // within the accceptable range given by sum_diff_thresh.
+ // within the acceptable range given by sum_diff_thresh.
// The delta is set by the excess of absolute pixel diff over the
// threshold.
- int delta = ((abs(sum_diff) - sum_diff_thresh)
- >> num_pels_log2_lookup[bs]) + 1;
+ const int delta = ((abs(sum_diff) - sum_diff_thresh) >>
+ num_pels_log2_lookup[bs]) + 1;
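+      // Worked example with assumed values: a 16x16 block has
+      // num_pels_log2_lookup[bs] == 8, so an excess of
+      // abs(sum_diff) - sum_diff_thresh == 260 gives
+      // delta == (260 >> 8) + 1 == 2.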
// Only apply the adjustment for max delta up to 3.
if (delta < 4) {
const __m128i k_delta = _mm_set1_epi8(delta);
running_avg_y -= avg_y_stride * (4 << b_height_log2_lookup[bs]);
- for (r = 0; r < ((4 << b_height_log2_lookup[bs]) >> 1); ++r) {
+ for (r = 0; r < ((4 << b_height_log2_lookup[bs]) >> shift); ++r) {
acc_diff = vp9_denoiser_adj_16x1_sse2(
- sig_buffer[r], mc_running_buffer[r],
- running_buffer[r], k_0, k_delta,
- acc_diff);
- vpx_memcpy(running_avg_y, running_buffer[r], 8);
- vpx_memcpy(running_avg_y + avg_y_stride, running_buffer[r] + 8, 8);
+ sig_buffer[r], mc_running_buffer[r], running_buffer[r],
+ k_0, k_delta, acc_diff);
+ memcpy(running_avg_y, running_buffer[r], width);
+ memcpy(running_avg_y + avg_y_stride,
+ running_buffer[r] + width, width);
+ if (width == 4) {
+ memcpy(running_avg_y + avg_y_stride * 2,
+ running_buffer[r] + width * 2, width);
+ memcpy(running_avg_y + avg_y_stride * 3,
+ running_buffer[r] + width * 3, width);
+ }
// Update pointers for next iteration.
- running_avg_y += (avg_y_stride << 1);
+ running_avg_y += (avg_y_stride << shift);
}
sum_diff = sum_diff_16x1(acc_diff);
if (abs(sum_diff) > sum_diff_thresh) {
return COPY_BLOCK;
}
} else {
- return COPY_BLOCK;
+ return COPY_BLOCK;
}
}
}
return FILTER_BLOCK;
}
-static int vp9_denoiser_64_32_16xM_sse2(const uint8_t *sig, int sig_stride,
- const uint8_t *mc_running_avg_y,
- int mc_avg_y_stride,
- uint8_t *running_avg_y,
- int avg_y_stride,
- int increase_denoising, BLOCK_SIZE bs,
- int motion_magnitude) {
- int sum_diff_thresh;
- int r, c;
- int shift_inc = (increase_denoising &&
- motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 1 : 0;
+// Denoiser for 16xM, 32xM and 64xM blocks
+static int vp9_denoiser_NxM_sse2_big(const uint8_t *sig, int sig_stride,
+ const uint8_t *mc_running_avg_y,
+ int mc_avg_y_stride,
+ uint8_t *running_avg_y,
+ int avg_y_stride,
+ int increase_denoising, BLOCK_SIZE bs,
+ int motion_magnitude) {
+ int sum_diff_thresh, r, c, sum_diff = 0;
+ const int shift_inc = (increase_denoising &&
+ motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ?
+ 1 : 0;
__m128i acc_diff[4][4];
const __m128i k_0 = _mm_setzero_si128();
const __m128i k_4 = _mm_set1_epi8(4 + shift_inc);
@@ -349,13 +255,11 @@ static int vp9_denoiser_64_32_16xM_sse2(const uint8_t *sig, int sig_stride,
const __m128i k_16 = _mm_set1_epi8(16);
// Modify each level's adjustment according to motion_magnitude.
const __m128i l3 = _mm_set1_epi8(
- (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ?
- 7 + shift_inc : 6);
+ (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 7 + shift_inc : 6);
// Difference between level 3 and level 2 is 2.
const __m128i l32 = _mm_set1_epi8(2);
// Difference between level 2 and level 1 is 1.
const __m128i l21 = _mm_set1_epi8(1);
- int sum_diff = 0;
for (c = 0; c < 4; ++c) {
for (r = 0; r < 4; ++r) {
@@ -363,13 +267,11 @@ static int vp9_denoiser_64_32_16xM_sse2(const uint8_t *sig, int sig_stride,
}
}
- for (r = 0; r < (4 << b_height_log2_lookup[bs]); r++) {
+ for (r = 0; r < (4 << b_height_log2_lookup[bs]); ++r) {
for (c = 0; c < (4 << b_width_log2_lookup[bs]); c += 16) {
acc_diff[c>>4][r>>4] = vp9_denoiser_16x1_sse2(
- sig, mc_running_avg_y,
- running_avg_y,
- &k_0, &k_4, &k_8, &k_16,
- &l3, &l32, &l21, acc_diff[c>>4][r>>4]);
+ sig, mc_running_avg_y, running_avg_y, &k_0, &k_4,
+ &k_8, &k_16, &l3, &l32, &l21, acc_diff[c>>4][r>>4]);
// Update pointers for next iteration.
sig += 16;
mc_running_avg_y += 16;
@@ -385,8 +287,8 @@ static int vp9_denoiser_64_32_16xM_sse2(const uint8_t *sig, int sig_stride,
// Update pointers for next iteration.
sig = sig - 16 * ((4 << b_width_log2_lookup[bs]) >> 4) + sig_stride;
mc_running_avg_y = mc_running_avg_y -
- 16 * ((4 << b_width_log2_lookup[bs]) >> 4) +
- mc_avg_y_stride;
+ 16 * ((4 << b_width_log2_lookup[bs]) >> 4) +
+ mc_avg_y_stride;
running_avg_y = running_avg_y -
16 * ((4 << b_width_log2_lookup[bs]) >> 4) +
avg_y_stride;
@@ -395,8 +297,8 @@ static int vp9_denoiser_64_32_16xM_sse2(const uint8_t *sig, int sig_stride,
{
sum_diff_thresh = total_adj_strong_thresh(bs, increase_denoising);
if (abs(sum_diff) > sum_diff_thresh) {
- int delta = ((abs(sum_diff) - sum_diff_thresh)
- >> num_pels_log2_lookup[bs]) + 1;
+ const int delta = ((abs(sum_diff) - sum_diff_thresh) >>
+ num_pels_log2_lookup[bs]) + 1;
// Only apply the adjustment for max delta up to 3.
if (delta < 4) {
@@ -408,9 +310,8 @@ static int vp9_denoiser_64_32_16xM_sse2(const uint8_t *sig, int sig_stride,
for (r = 0; r < (4 << b_height_log2_lookup[bs]); ++r) {
for (c = 0; c < (4 << b_width_log2_lookup[bs]); c += 16) {
acc_diff[c>>4][r>>4] = vp9_denoiser_adj_16x1_sse2(
- sig, mc_running_avg_y,
- running_avg_y, k_0,
- k_delta, acc_diff[c>>4][r>>4]);
+ sig, mc_running_avg_y, running_avg_y, k_0,
+ k_delta, acc_diff[c>>4][r>>4]);
// Update pointers for next iteration.
sig += 16;
mc_running_avg_y += 16;
@@ -449,25 +350,25 @@ int vp9_denoiser_filter_sse2(const uint8_t *sig, int sig_stride,
BLOCK_SIZE bs,
int motion_magnitude) {
if (bs == BLOCK_4X4 || bs == BLOCK_4X8) {
- return vp9_denoiser_4xM_sse2(sig, sig_stride,
- mc_avg, mc_avg_stride,
- avg, avg_stride,
- increase_denoising,
- bs, motion_magnitude);
+ return vp9_denoiser_NxM_sse2_small(sig, sig_stride,
+ mc_avg, mc_avg_stride,
+ avg, avg_stride,
+ increase_denoising,
+ bs, motion_magnitude, 4);
} else if (bs == BLOCK_8X4 || bs == BLOCK_8X8 || bs == BLOCK_8X16) {
- return vp9_denoiser_8xM_sse2(sig, sig_stride,
- mc_avg, mc_avg_stride,
- avg, avg_stride,
- increase_denoising,
- bs, motion_magnitude);
+ return vp9_denoiser_NxM_sse2_small(sig, sig_stride,
+ mc_avg, mc_avg_stride,
+ avg, avg_stride,
+ increase_denoising,
+ bs, motion_magnitude, 8);
} else if (bs == BLOCK_16X8 || bs == BLOCK_16X16 || bs == BLOCK_16X32 ||
bs == BLOCK_32X16|| bs == BLOCK_32X32 || bs == BLOCK_32X64 ||
bs == BLOCK_64X32 || bs == BLOCK_64X64) {
- return vp9_denoiser_64_32_16xM_sse2(sig, sig_stride,
- mc_avg, mc_avg_stride,
- avg, avg_stride,
- increase_denoising,
- bs, motion_magnitude);
+ return vp9_denoiser_NxM_sse2_big(sig, sig_stride,
+ mc_avg, mc_avg_stride,
+ avg, avg_stride,
+ increase_denoising,
+ bs, motion_magnitude);
} else {
return COPY_BLOCK;
}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_error_sse2.asm b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_error_sse2.asm
index 1126fdb6164..56373e897c9 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_error_sse2.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_error_sse2.asm
@@ -72,3 +72,49 @@ cglobal block_error, 3, 3, 8, uqc, dqc, size, ssz
movd edx, m5
%endif
RET
+
+; Compute the sum of squared difference between two int16_t vectors.
+; int64_t vp9_block_error_fp(int16_t *coeff, int16_t *dqcoeff,
+; intptr_t block_size)
+
+INIT_XMM sse2
+cglobal block_error_fp, 3, 3, 6, uqc, dqc, size
+ pxor m4, m4 ; sse accumulator
+ pxor m5, m5 ; dedicated zero register
+ lea uqcq, [uqcq+sizeq*2]
+ lea dqcq, [dqcq+sizeq*2]
+ neg sizeq
+.loop:
+ mova m2, [uqcq+sizeq*2]
+ mova m0, [dqcq+sizeq*2]
+ mova m3, [uqcq+sizeq*2+mmsize]
+ mova m1, [dqcq+sizeq*2+mmsize]
+ psubw m0, m2
+ psubw m1, m3
+ ; individual errors are max. 15bit+sign, so squares are 30bit, and
+ ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit)
+ pmaddwd m0, m0
+ pmaddwd m1, m1
+ ; accumulate in 64bit
+ punpckldq m3, m0, m5
+ punpckhdq m0, m5
+ paddq m4, m3
+ punpckldq m3, m1, m5
+ paddq m4, m0
+ punpckhdq m1, m5
+ paddq m4, m3
+ paddq m4, m1
+ add sizeq, mmsize
+ jl .loop
+
+ ; accumulate horizontally and store in return value
+ movhlps m5, m4
+ paddq m4, m5
+%if ARCH_X86_64
+ movq rax, m4
+%else
+ pshufd m5, m4, 0x1
+ movd eax, m4
+ movd edx, m5
+%endif
+ RET
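+; Illustrative scalar reference for the loop above:
+;   int64_t err = 0;
+;   for (i = 0; i < block_size; i++) {
+;     const int d = coeff[i] - dqcoeff[i];
+;     err += (int64_t)d * d;
+;   }
+;   return err;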
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_highbd_block_error_intrin_sse2.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_highbd_block_error_intrin_sse2.c
new file mode 100644
index 00000000000..c245ccafa8a
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_highbd_block_error_intrin_sse2.c
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>
+#include <stdio.h>
+
+#include "vp9/common/vp9_common.h"
+
+int64_t vp9_highbd_block_error_sse2(tran_low_t *coeff, tran_low_t *dqcoeff,
+ intptr_t block_size, int64_t *ssz,
+ int bps) {
+ int i, j, test;
+ uint32_t temp[4];
+ __m128i max, min, cmp0, cmp1, cmp2, cmp3;
+ int64_t error = 0, sqcoeff = 0;
+ const int shift = 2 * (bps - 8);
+ const int rounding = shift > 0 ? 1 << (shift - 1) : 0;
+
+  for (i = 0; i < block_size; i += 8) {
+ // Load the data into xmm registers
+ __m128i mm_coeff = _mm_load_si128((__m128i*) (coeff + i));
+ __m128i mm_coeff2 = _mm_load_si128((__m128i*) (coeff + i + 4));
+ __m128i mm_dqcoeff = _mm_load_si128((__m128i*) (dqcoeff + i));
+ __m128i mm_dqcoeff2 = _mm_load_si128((__m128i*) (dqcoeff + i + 4));
+ // Check if any values require more than 15 bit
+ max = _mm_set1_epi32(0x3fff);
+ min = _mm_set1_epi32(0xffffc000);
+ cmp0 = _mm_xor_si128(_mm_cmpgt_epi32(mm_coeff, max),
+ _mm_cmplt_epi32(mm_coeff, min));
+ cmp1 = _mm_xor_si128(_mm_cmpgt_epi32(mm_coeff2, max),
+ _mm_cmplt_epi32(mm_coeff2, min));
+ cmp2 = _mm_xor_si128(_mm_cmpgt_epi32(mm_dqcoeff, max),
+ _mm_cmplt_epi32(mm_dqcoeff, min));
+ cmp3 = _mm_xor_si128(_mm_cmpgt_epi32(mm_dqcoeff2, max),
+ _mm_cmplt_epi32(mm_dqcoeff2, min));
+ test = _mm_movemask_epi8(_mm_or_si128(_mm_or_si128(cmp0, cmp1),
+ _mm_or_si128(cmp2, cmp3)));
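+    // If any lane falls outside the signed 15-bit range, _mm_packs_epi32
+    // below would saturate, so the exact scalar fallback branch is taken
+    // instead.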
+
+ if (!test) {
+      __m128i mm_diff, error_sse2, sqcoeff_sse2;
+ mm_coeff = _mm_packs_epi32(mm_coeff, mm_coeff2);
+ mm_dqcoeff = _mm_packs_epi32(mm_dqcoeff, mm_dqcoeff2);
+ mm_diff = _mm_sub_epi16(mm_coeff, mm_dqcoeff);
+ error_sse2 = _mm_madd_epi16(mm_diff, mm_diff);
+ sqcoeff_sse2 = _mm_madd_epi16(mm_coeff, mm_coeff);
+ _mm_storeu_si128((__m128i*)temp, error_sse2);
+ error = error + temp[0] + temp[1] + temp[2] + temp[3];
+ _mm_storeu_si128((__m128i*)temp, sqcoeff_sse2);
+ sqcoeff += temp[0] + temp[1] + temp[2] + temp[3];
+ } else {
+ for (j = 0; j < 8; j++) {
+ const int64_t diff = coeff[i + j] - dqcoeff[i + j];
+ error += diff * diff;
+ sqcoeff += (int64_t)coeff[i + j] * (int64_t)coeff[i + j];
+ }
+ }
+ }
+ assert(error >= 0 && sqcoeff >= 0);
+ error = (error + rounding) >> shift;
+ sqcoeff = (sqcoeff + rounding) >> shift;
+
+ *ssz = sqcoeff;
+ return error;
+}
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_highbd_quantize_intrin_sse2.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_highbd_quantize_intrin_sse2.c
new file mode 100644
index 00000000000..ffa43b65a59
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_highbd_quantize_intrin_sse2.c
@@ -0,0 +1,180 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <emmintrin.h>
+
+#include "vp9/common/vp9_common.h"
+
+#if CONFIG_VP9_HIGHBITDEPTH
+// from vp9_idct.h: typedef int32_t tran_low_t;
+void vp9_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr,
+ intptr_t count,
+ int skip_block,
+ const int16_t *zbin_ptr,
+ const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr,
+ uint16_t *eob_ptr,
+ const int16_t *scan,
+ const int16_t *iscan) {
+ int i, j, non_zero_regs = (int)count / 4, eob_i = -1;
+ __m128i zbins[2];
+ __m128i nzbins[2];
+
+ zbins[0] = _mm_set_epi32((int)zbin_ptr[1],
+ (int)zbin_ptr[1],
+ (int)zbin_ptr[1],
+ (int)zbin_ptr[0]);
+ zbins[1] = _mm_set1_epi32((int)zbin_ptr[1]);
+
+ nzbins[0] = _mm_setzero_si128();
+ nzbins[1] = _mm_setzero_si128();
+ nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]);
+ nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]);
+
+ (void)scan;
+
+ memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr));
+
+ if (!skip_block) {
+ // Pre-scan pass
+ for (i = ((int)count / 4) - 1; i >= 0; i--) {
+ __m128i coeffs, cmp1, cmp2;
+ int test;
+ coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
+ cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]);
+ cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]);
+ cmp1 = _mm_and_si128(cmp1, cmp2);
+ test = _mm_movemask_epi8(cmp1);
+ if (test == 0xffff)
+ non_zero_regs--;
+ else
+ break;
+ }
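+    // The backward pre-scan trims trailing 4-coefficient groups that lie
+    // entirely inside the zero bin, so the quantization pass below only
+    // visits the first non_zero_regs groups.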
+
+ // Quantization pass:
+ for (i = 0; i < non_zero_regs; i++) {
+ __m128i coeffs, coeffs_sign, tmp1, tmp2;
+ int test;
+ int abs_coeff[4];
+ int coeff_sign[4];
+
+ coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
+ coeffs_sign = _mm_srai_epi32(coeffs, 31);
+ coeffs = _mm_sub_epi32(
+ _mm_xor_si128(coeffs, coeffs_sign), coeffs_sign);
+ tmp1 = _mm_cmpgt_epi32(coeffs, zbins[i != 0]);
+ tmp2 = _mm_cmpeq_epi32(coeffs, zbins[i != 0]);
+ tmp1 = _mm_or_si128(tmp1, tmp2);
+ test = _mm_movemask_epi8(tmp1);
+ _mm_storeu_si128((__m128i*)abs_coeff, coeffs);
+ _mm_storeu_si128((__m128i*)coeff_sign, coeffs_sign);
+
+ for (j = 0; j < 4; j++) {
+ if (test & (1 << (4 * j))) {
+ int k = 4 * i + j;
+ int64_t tmp = clamp(abs_coeff[j] + round_ptr[k != 0],
+ INT32_MIN, INT32_MAX);
+ tmp = ((((tmp * quant_ptr[k != 0]) >> 16) + tmp) *
+ quant_shift_ptr[k != 0]) >> 16; // quantization
+ qcoeff_ptr[k] = (tmp ^ coeff_sign[j]) - coeff_sign[j];
+ dqcoeff_ptr[k] = qcoeff_ptr[k] * dequant_ptr[k != 0];
+ if (tmp)
+ eob_i = iscan[k] > eob_i ? iscan[k] : eob_i;
+ }
+ }
+ }
+ }
+ *eob_ptr = eob_i + 1;
+}
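+// The per-lane arithmetic above follows the scalar reference quantizer;
+// in outline, using the same names:
+//   tmp = clamp(abs_coeff + round, INT32_MIN, INT32_MAX);
+//   tmp = ((((tmp * quant) >> 16) + tmp) * quant_shift) >> 16;
+//   qcoeff  = (tmp ^ sign) - sign;
+//   dqcoeff = qcoeff * dequant;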
+
+
+void vp9_highbd_quantize_b_32x32_sse2(const tran_low_t *coeff_ptr,
+ intptr_t n_coeffs,
+ int skip_block,
+ const int16_t *zbin_ptr,
+ const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr,
+ uint16_t *eob_ptr,
+ const int16_t *scan,
+ const int16_t *iscan) {
+ __m128i zbins[2];
+ __m128i nzbins[2];
+ int idx = 0;
+ int idx_arr[1024];
+ int i, eob = -1;
+ const int zbin0_tmp = ROUND_POWER_OF_TWO(zbin_ptr[0], 1);
+ const int zbin1_tmp = ROUND_POWER_OF_TWO(zbin_ptr[1], 1);
+ (void)scan;
+ zbins[0] = _mm_set_epi32(zbin1_tmp,
+ zbin1_tmp,
+ zbin1_tmp,
+ zbin0_tmp);
+ zbins[1] = _mm_set1_epi32(zbin1_tmp);
+
+ nzbins[0] = _mm_setzero_si128();
+ nzbins[1] = _mm_setzero_si128();
+ nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]);
+ nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]);
+
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+ if (!skip_block) {
+ // Pre-scan pass
+ for (i = 0; i < n_coeffs / 4; i++) {
+ __m128i coeffs, cmp1, cmp2;
+ int test;
+ coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
+ cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]);
+ cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]);
+ cmp1 = _mm_and_si128(cmp1, cmp2);
+ test = _mm_movemask_epi8(cmp1);
+ if (!(test & 0xf))
+ idx_arr[idx++] = i * 4;
+ if (!(test & 0xf0))
+ idx_arr[idx++] = i * 4 + 1;
+ if (!(test & 0xf00))
+ idx_arr[idx++] = i * 4 + 2;
+ if (!(test & 0xf000))
+ idx_arr[idx++] = i * 4 + 3;
+ }
+
+ // Quantization pass: only process the coefficients selected in
+ // pre-scan pass. Note: idx can be zero.
+ for (i = 0; i < idx; i++) {
+ const int rc = idx_arr[i];
+ const int coeff = coeff_ptr[rc];
+ const int coeff_sign = (coeff >> 31);
+ int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ int64_t tmp = clamp(abs_coeff +
+ ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1),
+ INT32_MIN, INT32_MAX);
+ tmp = ((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) *
+ quant_shift_ptr[rc != 0]) >> 15;
+
+ qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
+ dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
+
+ if (tmp)
+        eob = iscan[rc] > eob ? iscan[rc] : eob;
+ }
+ }
+ *eob_ptr = eob + 1;
+}
+#endif
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_highbd_subpel_variance.asm b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_highbd_subpel_variance.asm
new file mode 100644
index 00000000000..987729f962c
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_highbd_subpel_variance.asm
@@ -0,0 +1,1055 @@
+;
+; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION_RODATA
+pw_8: times 8 dw 8
+bilin_filter_m_sse2: times 8 dw 16
+ times 8 dw 0
+ times 8 dw 15
+ times 8 dw 1
+ times 8 dw 14
+ times 8 dw 2
+ times 8 dw 13
+ times 8 dw 3
+ times 8 dw 12
+ times 8 dw 4
+ times 8 dw 11
+ times 8 dw 5
+ times 8 dw 10
+ times 8 dw 6
+ times 8 dw 9
+ times 8 dw 7
+ times 16 dw 8
+ times 8 dw 7
+ times 8 dw 9
+ times 8 dw 6
+ times 8 dw 10
+ times 8 dw 5
+ times 8 dw 11
+ times 8 dw 4
+ times 8 dw 12
+ times 8 dw 3
+ times 8 dw 13
+ times 8 dw 2
+ times 8 dw 14
+ times 8 dw 1
+ times 8 dw 15
+
+SECTION .text
+
+; int vp9_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride,
+; int x_offset, int y_offset,
+; const uint8_t *dst, ptrdiff_t dst_stride,
+; int height, unsigned int *sse);
+;
+; This function returns the sum of errors (SE) and stores the sum of
+; squared errors (SSE) in the given pointer.
+
+%macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse
+ psubw %3, %4
+ psubw %1, %2
+ mova %4, %3 ; make copies to manipulate to calc sum
+ mova %2, %1 ; use originals for calc sse
+ pmaddwd %3, %3
+ paddw %4, %2
+ pmaddwd %1, %1
+ movhlps %2, %4
+ paddd %6, %3
+ paddw %4, %2
+ pxor %2, %2
+ pcmpgtw %2, %4 ; mask for 0 > %4 (sum)
+ punpcklwd %4, %2 ; sign-extend word to dword
+ paddd %6, %1
+ paddd %5, %4
+
+%endmacro
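+; Per invocation, SUM_SSE folds two pairs of 8-lane registers into the
+; accumulators; scalar sketch for one pair:
+;   for (i = 0; i < 8; i++) { d = src[i] - dst[i]; sum += d; sse += d*d; }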
+
+%macro STORE_AND_RET 0
+%if mmsize == 16
+ ; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit
+ ; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg.
+ ; We have to sign-extend it before adding the words within the register
+  ; and outputting to a dword.
+ movhlps m3, m7
+ movhlps m4, m6
+ paddd m7, m3
+ paddd m6, m4
+ pshufd m3, m7, 0x1
+ pshufd m4, m6, 0x1
+ paddd m7, m3
+ paddd m6, m4
+ mov r1, ssem ; r1 = unsigned int *sse
+ movd [r1], m7 ; store sse
+ movd rax, m6 ; store sum as return value
+%endif
+ RET
+%endmacro
+
+%macro INC_SRC_BY_SRC_STRIDE 0
+%if ARCH_X86=1 && CONFIG_PIC=1
+ lea srcq, [srcq + src_stridemp*2]
+%else
+ lea srcq, [srcq + src_strideq*2]
+%endif
+%endmacro
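+; On 32-bit PIC builds the stride is reloaded from its stack slot (the
+; x86inc "mp"-suffixed operand) because the GOT pointer ties up a register.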
+
+%macro INC_SRC_BY_SRC_2STRIDE 0
+%if ARCH_X86=1 && CONFIG_PIC=1
+ lea srcq, [srcq + src_stridemp*4]
+%else
+ lea srcq, [srcq + src_strideq*4]
+%endif
+%endmacro
+
+%macro SUBPEL_VARIANCE 1-2 0 ; W
+%define bilin_filter_m bilin_filter_m_sse2
+%define filter_idx_shift 5
+
+%ifdef PIC ; 64bit PIC
+ %if %2 == 1 ; avg
+ cglobal highbd_sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
+ x_offset, y_offset, \
+ dst, dst_stride, \
+ sec, sec_stride, height, sse
+ %define sec_str sec_strideq
+ %else
+ cglobal highbd_sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, \
+ y_offset, dst, dst_stride, height, sse
+ %endif
+ %define h heightd
+ %define bilin_filter sseq
+%else
+ %if ARCH_X86=1 && CONFIG_PIC=1
+ %if %2 == 1 ; avg
+ cglobal highbd_sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
+ x_offset, y_offset, \
+ dst, dst_stride, \
+ sec, sec_stride, \
+ height, sse, g_bilin_filter, g_pw_8
+ %define h dword heightm
+ %define sec_str sec_stridemp
+
+ ; Store bilin_filter and pw_8 location in stack
+ GET_GOT eax
+ add esp, 4 ; restore esp
+
+ lea ecx, [GLOBAL(bilin_filter_m)]
+ mov g_bilin_filterm, ecx
+
+ lea ecx, [GLOBAL(pw_8)]
+ mov g_pw_8m, ecx
+
+ LOAD_IF_USED 0, 1 ; load eax, ecx back
+ %else
+ cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
+ x_offset, y_offset, dst, dst_stride, height, \
+ sse, g_bilin_filter, g_pw_8
+ %define h heightd
+
+ ; Store bilin_filter and pw_8 location in stack
+ GET_GOT eax
+ add esp, 4 ; restore esp
+
+ lea ecx, [GLOBAL(bilin_filter_m)]
+ mov g_bilin_filterm, ecx
+
+ lea ecx, [GLOBAL(pw_8)]
+ mov g_pw_8m, ecx
+
+ LOAD_IF_USED 0, 1 ; load eax, ecx back
+ %endif
+ %else
+ %if %2 == 1 ; avg
+ cglobal highbd_sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \
+ 7 + 2 * ARCH_X86_64, 13, src, src_stride, \
+ x_offset, y_offset, \
+ dst, dst_stride, \
+ sec, sec_stride, \
+ height, sse
+ %if ARCH_X86_64
+ %define h heightd
+ %define sec_str sec_strideq
+ %else
+ %define h dword heightm
+ %define sec_str sec_stridemp
+ %endif
+ %else
+ cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
+ x_offset, y_offset, dst, dst_stride, height, sse
+ %define h heightd
+ %endif
+
+ %define bilin_filter bilin_filter_m
+ %endif
+%endif
+
+ ASSERT %1 <= 16 ; m6 overflows if w > 16
+ pxor m6, m6 ; sum
+ pxor m7, m7 ; sse
+
+%if %1 < 16
+ sar h, 1
+%endif
+%if %2 == 1 ; avg
+ shl sec_str, 1
+%endif
+
+ ; FIXME(rbultje) replace by jumptable?
+ test x_offsetd, x_offsetd
+ jnz .x_nonzero
+ ; x_offset == 0
+ test y_offsetd, y_offsetd
+ jnz .x_zero_y_nonzero
+
+ ; x_offset == 0 && y_offset == 0
+.x_zero_y_zero_loop:
+%if %1 == 16
+ movu m0, [srcq]
+ movu m2, [srcq + 16]
+ mova m1, [dstq]
+ mova m3, [dstq + 16]
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ pavgw m2, [secq+16]
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ lea srcq, [srcq + src_strideq*2]
+ lea dstq, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%else ; %1 < 16
+ movu m0, [srcq]
+ movu m2, [srcq + src_strideq*2]
+ mova m1, [dstq]
+ mova m3, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ add secq, sec_str
+ pavgw m2, [secq]
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ lea srcq, [srcq + src_strideq*4]
+ lea dstq, [dstq + dst_strideq*4]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%endif
+ dec h
+ jg .x_zero_y_zero_loop
+ STORE_AND_RET
+
+.x_zero_y_nonzero:
+ cmp y_offsetd, 8
+ jne .x_zero_y_nonhalf
+
+ ; x_offset == 0 && y_offset == 0.5
+.x_zero_y_half_loop:
+%if %1 == 16
+ movu m0, [srcq]
+ movu m1, [srcq+16]
+ movu m4, [srcq+src_strideq*2]
+ movu m5, [srcq+src_strideq*2+16]
+ mova m2, [dstq]
+ mova m3, [dstq+16]
+ pavgw m0, m4
+ pavgw m1, m5
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ pavgw m1, [secq+16]
+%endif
+ SUM_SSE m0, m2, m1, m3, m6, m7
+
+ lea srcq, [srcq + src_strideq*2]
+ lea dstq, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%else ; %1 < 16
+ movu m0, [srcq]
+ movu m1, [srcq+src_strideq*2]
+ movu m5, [srcq+src_strideq*4]
+ mova m2, [dstq]
+ mova m3, [dstq+dst_strideq*2]
+ pavgw m0, m1
+ pavgw m1, m5
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ add secq, sec_str
+ pavgw m1, [secq]
+%endif
+ SUM_SSE m0, m2, m1, m3, m6, m7
+
+ lea srcq, [srcq + src_strideq*4]
+ lea dstq, [dstq + dst_strideq*4]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%endif
+ dec h
+ jg .x_zero_y_half_loop
+ STORE_AND_RET
+
+.x_zero_y_nonhalf:
+ ; x_offset == 0 && y_offset == bilin interpolation
+%ifdef PIC
+ lea bilin_filter, [bilin_filter_m]
+%endif
+ shl y_offsetd, filter_idx_shift
+%if ARCH_X86_64 && mmsize == 16
+ mova m8, [bilin_filter+y_offsetq]
+ mova m9, [bilin_filter+y_offsetq+16]
+ mova m10, [pw_8]
+%define filter_y_a m8
+%define filter_y_b m9
+%define filter_rnd m10
+%else ; x86-32 or mmx
+%if ARCH_X86=1 && CONFIG_PIC=1
+; x_offset == 0, reuse x_offset reg
+%define tempq x_offsetq
+ add y_offsetq, g_bilin_filterm
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+ add y_offsetq, bilin_filter
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+%define filter_rnd [pw_8]
+%endif
+%endif
+
+.x_zero_y_other_loop:
+%if %1 == 16
+ movu m0, [srcq]
+ movu m1, [srcq + 16]
+ movu m4, [srcq+src_strideq*2]
+ movu m5, [srcq+src_strideq*2+16]
+ mova m2, [dstq]
+ mova m3, [dstq+16]
+ ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can
+ ; also do out=in1+(((num-x)*(in2-in1)+rnd)>>log2(num)). Total number of
+ ; instructions is the same (5), but it is 1 mul instead of 2, so might be
+ ; slightly faster because of pmullw latency. It would also cut our rodata
+ ; tables in half for this function, and save 1-2 registers on x86-64.
+ pmullw m1, filter_y_a
+ pmullw m5, filter_y_b
+ paddw m1, filter_rnd
+ pmullw m0, filter_y_a
+ pmullw m4, filter_y_b
+ paddw m0, filter_rnd
+ paddw m1, m5
+ paddw m0, m4
+ psrlw m1, 4
+ psrlw m0, 4
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ pavgw m1, [secq+16]
+%endif
+ SUM_SSE m0, m2, m1, m3, m6, m7
+
+ lea srcq, [srcq + src_strideq*2]
+ lea dstq, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%else ; %1 < 16
+ movu m0, [srcq]
+ movu m1, [srcq+src_strideq*2]
+ movu m5, [srcq+src_strideq*4]
+ mova m4, m1
+ mova m2, [dstq]
+ mova m3, [dstq+dst_strideq*2]
+ pmullw m1, filter_y_a
+ pmullw m5, filter_y_b
+ paddw m1, filter_rnd
+ pmullw m0, filter_y_a
+ pmullw m4, filter_y_b
+ paddw m0, filter_rnd
+ paddw m1, m5
+ paddw m0, m4
+ psrlw m1, 4
+ psrlw m0, 4
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ add secq, sec_str
+ pavgw m1, [secq]
+%endif
+ SUM_SSE m0, m2, m1, m3, m6, m7
+
+ lea srcq, [srcq + src_strideq*4]
+ lea dstq, [dstq + dst_strideq*4]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%endif
+ dec h
+ jg .x_zero_y_other_loop
+%undef filter_y_a
+%undef filter_y_b
+%undef filter_rnd
+ STORE_AND_RET
+
+.x_nonzero:
+ cmp x_offsetd, 8
+ jne .x_nonhalf
+ ; x_offset == 0.5
+ test y_offsetd, y_offsetd
+ jnz .x_half_y_nonzero
+
+ ; x_offset == 0.5 && y_offset == 0
+.x_half_y_zero_loop:
+%if %1 == 16
+ movu m0, [srcq]
+ movu m1, [srcq + 16]
+ movu m4, [srcq + 2]
+ movu m5, [srcq + 18]
+ mova m2, [dstq]
+ mova m3, [dstq + 16]
+ pavgw m0, m4
+ pavgw m1, m5
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ pavgw m1, [secq+16]
+%endif
+ SUM_SSE m0, m2, m1, m3, m6, m7
+
+ lea srcq, [srcq + src_strideq*2]
+ lea dstq, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%else ; %1 < 16
+ movu m0, [srcq]
+ movu m1, [srcq + src_strideq*2]
+ movu m4, [srcq + 2]
+ movu m5, [srcq + src_strideq*2 + 2]
+ mova m2, [dstq]
+ mova m3, [dstq + dst_strideq*2]
+ pavgw m0, m4
+ pavgw m1, m5
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ add secq, sec_str
+ pavgw m1, [secq]
+%endif
+ SUM_SSE m0, m2, m1, m3, m6, m7
+
+ lea srcq, [srcq + src_strideq*4]
+ lea dstq, [dstq + dst_strideq*4]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%endif
+ dec h
+ jg .x_half_y_zero_loop
+ STORE_AND_RET
+
+.x_half_y_nonzero:
+ cmp y_offsetd, 8
+ jne .x_half_y_nonhalf
+
+ ; x_offset == 0.5 && y_offset == 0.5
+%if %1 == 16
+ movu m0, [srcq]
+ movu m1, [srcq+16]
+ movu m2, [srcq+2]
+ movu m3, [srcq+18]
+ lea srcq, [srcq + src_strideq*2]
+ pavgw m0, m2
+ pavgw m1, m3
+.x_half_y_half_loop:
+ movu m2, [srcq]
+ movu m3, [srcq + 16]
+ movu m4, [srcq + 2]
+ movu m5, [srcq + 18]
+ pavgw m2, m4
+ pavgw m3, m5
+ pavgw m0, m2
+ pavgw m1, m3
+ mova m4, [dstq]
+ mova m5, [dstq + 16]
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ pavgw m1, [secq+16]
+%endif
+ SUM_SSE m0, m4, m1, m5, m6, m7
+ mova m0, m2
+ mova m1, m3
+
+ lea srcq, [srcq + src_strideq*2]
+ lea dstq, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%else ; %1 < 16
+ movu m0, [srcq]
+ movu m2, [srcq+2]
+ lea srcq, [srcq + src_strideq*2]
+ pavgw m0, m2
+.x_half_y_half_loop:
+ movu m2, [srcq]
+ movu m3, [srcq + src_strideq*2]
+ movu m4, [srcq + 2]
+ movu m5, [srcq + src_strideq*2 + 2]
+ pavgw m2, m4
+ pavgw m3, m5
+ pavgw m0, m2
+ pavgw m2, m3
+ mova m4, [dstq]
+ mova m5, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ add secq, sec_str
+ pavgw m2, [secq]
+%endif
+ SUM_SSE m0, m4, m2, m5, m6, m7
+ mova m0, m3
+
+ lea srcq, [srcq + src_strideq*4]
+ lea dstq, [dstq + dst_strideq*4]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%endif
+ dec h
+ jg .x_half_y_half_loop
+ STORE_AND_RET
+
+.x_half_y_nonhalf:
+ ; x_offset == 0.5 && y_offset == bilin interpolation
+%ifdef PIC
+ lea bilin_filter, [bilin_filter_m]
+%endif
+ shl y_offsetd, filter_idx_shift
+%if ARCH_X86_64 && mmsize == 16
+ mova m8, [bilin_filter+y_offsetq]
+ mova m9, [bilin_filter+y_offsetq+16]
+ mova m10, [pw_8]
+%define filter_y_a m8
+%define filter_y_b m9
+%define filter_rnd m10
+%else ; x86_32
+%if ARCH_X86=1 && CONFIG_PIC=1
+; x_offset == 0.5. We can reuse x_offset reg
+%define tempq x_offsetq
+ add y_offsetq, g_bilin_filterm
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+ add y_offsetq, bilin_filter
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+%define filter_rnd [pw_8]
+%endif
+%endif
+
+%if %1 == 16
+ movu m0, [srcq]
+ movu m1, [srcq+16]
+ movu m2, [srcq+2]
+ movu m3, [srcq+18]
+ lea srcq, [srcq + src_strideq*2]
+ pavgw m0, m2
+ pavgw m1, m3
+.x_half_y_other_loop:
+ movu m2, [srcq]
+ movu m3, [srcq+16]
+ movu m4, [srcq+2]
+ movu m5, [srcq+18]
+ pavgw m2, m4
+ pavgw m3, m5
+ mova m4, m2
+ mova m5, m3
+ pmullw m1, filter_y_a
+ pmullw m3, filter_y_b
+ paddw m1, filter_rnd
+ paddw m1, m3
+ pmullw m0, filter_y_a
+ pmullw m2, filter_y_b
+ paddw m0, filter_rnd
+ psrlw m1, 4
+ paddw m0, m2
+ mova m2, [dstq]
+ psrlw m0, 4
+ mova m3, [dstq+16]
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ pavgw m1, [secq+16]
+%endif
+ SUM_SSE m0, m2, m1, m3, m6, m7
+ mova m0, m4
+ mova m1, m5
+
+ lea srcq, [srcq + src_strideq*2]
+ lea dstq, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%else ; %1 < 16
+ movu m0, [srcq]
+ movu m2, [srcq+2]
+ lea srcq, [srcq + src_strideq*2]
+ pavgw m0, m2
+.x_half_y_other_loop:
+ movu m2, [srcq]
+ movu m3, [srcq+src_strideq*2]
+ movu m4, [srcq+2]
+ movu m5, [srcq+src_strideq*2+2]
+ pavgw m2, m4
+ pavgw m3, m5
+ mova m4, m2
+ mova m5, m3
+ pmullw m4, filter_y_a
+ pmullw m3, filter_y_b
+ paddw m4, filter_rnd
+ paddw m4, m3
+ pmullw m0, filter_y_a
+ pmullw m2, filter_y_b
+ paddw m0, filter_rnd
+ psrlw m4, 4
+ paddw m0, m2
+ mova m2, [dstq]
+ psrlw m0, 4
+ mova m3, [dstq+dst_strideq*2]
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ add secq, sec_str
+ pavgw m4, [secq]
+%endif
+ SUM_SSE m0, m2, m4, m3, m6, m7
+ mova m0, m5
+
+ lea srcq, [srcq + src_strideq*4]
+ lea dstq, [dstq + dst_strideq*4]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%endif
+ dec h
+ jg .x_half_y_other_loop
+%undef filter_y_a
+%undef filter_y_b
+%undef filter_rnd
+ STORE_AND_RET
+
+.x_nonhalf:
+ test y_offsetd, y_offsetd
+ jnz .x_nonhalf_y_nonzero
+
+ ; x_offset == bilin interpolation && y_offset == 0
+%ifdef PIC
+ lea bilin_filter, [bilin_filter_m]
+%endif
+ shl x_offsetd, filter_idx_shift
+%if ARCH_X86_64 && mmsize == 16
+ mova m8, [bilin_filter+x_offsetq]
+ mova m9, [bilin_filter+x_offsetq+16]
+ mova m10, [pw_8]
+%define filter_x_a m8
+%define filter_x_b m9
+%define filter_rnd m10
+%else ; x86-32
+%if ARCH_X86=1 && CONFIG_PIC=1
+; y_offset == 0. We can reuse y_offset reg.
+%define tempq y_offsetq
+ add x_offsetq, g_bilin_filterm
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+ add x_offsetq, bilin_filter
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_rnd [pw_8]
+%endif
+%endif
+
+.x_other_y_zero_loop:
+%if %1 == 16
+ movu m0, [srcq]
+ movu m1, [srcq+16]
+ movu m2, [srcq+2]
+ movu m3, [srcq+18]
+ mova m4, [dstq]
+ mova m5, [dstq+16]
+ pmullw m1, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m1, filter_rnd
+ pmullw m0, filter_x_a
+ pmullw m2, filter_x_b
+ paddw m0, filter_rnd
+ paddw m1, m3
+ paddw m0, m2
+ psrlw m1, 4
+ psrlw m0, 4
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ pavgw m1, [secq+16]
+%endif
+ SUM_SSE m0, m4, m1, m5, m6, m7
+
+ lea srcq, [srcq+src_strideq*2]
+ lea dstq, [dstq+dst_strideq*2]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%else ; %1 < 16
+ movu m0, [srcq]
+ movu m1, [srcq+src_strideq*2]
+ movu m2, [srcq+2]
+ movu m3, [srcq+src_strideq*2+2]
+ mova m4, [dstq]
+ mova m5, [dstq+dst_strideq*2]
+ pmullw m1, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m1, filter_rnd
+ pmullw m0, filter_x_a
+ pmullw m2, filter_x_b
+ paddw m0, filter_rnd
+ paddw m1, m3
+ paddw m0, m2
+ psrlw m1, 4
+ psrlw m0, 4
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ add secq, sec_str
+ pavgw m1, [secq]
+%endif
+ SUM_SSE m0, m4, m1, m5, m6, m7
+
+ lea srcq, [srcq+src_strideq*4]
+ lea dstq, [dstq+dst_strideq*4]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%endif
+ dec h
+ jg .x_other_y_zero_loop
+%undef filter_x_a
+%undef filter_x_b
+%undef filter_rnd
+ STORE_AND_RET
+
+.x_nonhalf_y_nonzero:
+ cmp y_offsetd, 8
+ jne .x_nonhalf_y_nonhalf
+
+ ; x_offset == bilin interpolation && y_offset == 0.5
+%ifdef PIC
+ lea bilin_filter, [bilin_filter_m]
+%endif
+ shl x_offsetd, filter_idx_shift
+%if ARCH_X86_64 && mmsize == 16
+ mova m8, [bilin_filter+x_offsetq]
+ mova m9, [bilin_filter+x_offsetq+16]
+ mova m10, [pw_8]
+%define filter_x_a m8
+%define filter_x_b m9
+%define filter_rnd m10
+%else ; x86-32
+%if ARCH_X86=1 && CONFIG_PIC=1
+; y_offset == 0.5. We can reuse y_offset reg.
+%define tempq y_offsetq
+ add x_offsetq, g_bilin_filterm
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+ add x_offsetq, bilin_filter
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_rnd [pw_8]
+%endif
+%endif
+
+%if %1 == 16
+ movu m0, [srcq]
+ movu m1, [srcq+16]
+ movu m2, [srcq+2]
+ movu m3, [srcq+18]
+ pmullw m0, filter_x_a
+ pmullw m2, filter_x_b
+ paddw m0, filter_rnd
+ pmullw m1, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m1, filter_rnd
+ paddw m0, m2
+ paddw m1, m3
+ psrlw m0, 4
+ psrlw m1, 4
+ lea srcq, [srcq+src_strideq*2]
+.x_other_y_half_loop:
+ movu m2, [srcq]
+ movu m3, [srcq+16]
+ movu m4, [srcq+2]
+ movu m5, [srcq+18]
+ pmullw m2, filter_x_a
+ pmullw m4, filter_x_b
+ paddw m2, filter_rnd
+ pmullw m3, filter_x_a
+ pmullw m5, filter_x_b
+ paddw m3, filter_rnd
+ paddw m2, m4
+ paddw m3, m5
+ mova m4, [dstq]
+ mova m5, [dstq+16]
+ psrlw m2, 4
+ psrlw m3, 4
+ pavgw m0, m2
+ pavgw m1, m3
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ pavgw m1, [secq+16]
+%endif
+ SUM_SSE m0, m4, m1, m5, m6, m7
+ mova m0, m2
+ mova m1, m3
+
+ lea srcq, [srcq+src_strideq*2]
+ lea dstq, [dstq+dst_strideq*2]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%else ; %1 < 16
+ movu m0, [srcq]
+ movu m2, [srcq+2]
+ pmullw m0, filter_x_a
+ pmullw m2, filter_x_b
+ paddw m0, filter_rnd
+ paddw m0, m2
+ psrlw m0, 4
+ lea srcq, [srcq+src_strideq*2]
+.x_other_y_half_loop:
+ movu m2, [srcq]
+ movu m3, [srcq+src_strideq*2]
+ movu m4, [srcq+2]
+ movu m5, [srcq+src_strideq*2+2]
+ pmullw m2, filter_x_a
+ pmullw m4, filter_x_b
+ paddw m2, filter_rnd
+ pmullw m3, filter_x_a
+ pmullw m5, filter_x_b
+ paddw m3, filter_rnd
+ paddw m2, m4
+ paddw m3, m5
+ mova m4, [dstq]
+ mova m5, [dstq+dst_strideq*2]
+ psrlw m2, 4
+ psrlw m3, 4
+ pavgw m0, m2
+ pavgw m2, m3
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ add secq, sec_str
+ pavgw m2, [secq]
+%endif
+ SUM_SSE m0, m4, m2, m5, m6, m7
+ mova m0, m3
+
+ lea srcq, [srcq+src_strideq*4]
+ lea dstq, [dstq+dst_strideq*4]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%endif
+ dec h
+ jg .x_other_y_half_loop
+%undef filter_x_a
+%undef filter_x_b
+%undef filter_rnd
+ STORE_AND_RET
+
+.x_nonhalf_y_nonhalf:
+; Load the bilinear filters - this is the same as in the 8-bit depth case
+%ifdef PIC
+ lea bilin_filter, [bilin_filter_m]
+%endif
+ shl x_offsetd, filter_idx_shift ; filter_idx_shift = 5
+ shl y_offsetd, filter_idx_shift
+%if ARCH_X86_64 && mmsize == 16
+ mova m8, [bilin_filter+x_offsetq]
+ mova m9, [bilin_filter+x_offsetq+16]
+ mova m10, [bilin_filter+y_offsetq]
+ mova m11, [bilin_filter+y_offsetq+16]
+ mova m12, [pw_8]
+%define filter_x_a m8
+%define filter_x_b m9
+%define filter_y_a m10
+%define filter_y_b m11
+%define filter_rnd m12
+%else ; x86-32
+%if ARCH_X86=1 && CONFIG_PIC=1
+; In this case there is NO unused register. Reuse the src_stride register;
+; src_stride then has to be reloaded from the stack whenever it is needed.
+%define tempq src_strideq
+ mov tempq, g_bilin_filterm
+ add x_offsetq, tempq
+ add y_offsetq, tempq
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+ add x_offsetq, bilin_filter
+ add y_offsetq, bilin_filter
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+%define filter_rnd [pw_8]
+%endif
+%endif
+; end of load filter
+
+ ; x_offset == bilin interpolation && y_offset == bilin interpolation
+%if %1 == 16
+ movu m0, [srcq]
+ movu m2, [srcq+2]
+ movu m1, [srcq+16]
+ movu m3, [srcq+18]
+ pmullw m0, filter_x_a
+ pmullw m2, filter_x_b
+ paddw m0, filter_rnd
+ pmullw m1, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m1, filter_rnd
+ paddw m0, m2
+ paddw m1, m3
+ psrlw m0, 4
+ psrlw m1, 4
+
+ INC_SRC_BY_SRC_STRIDE
+
+.x_other_y_other_loop:
+ movu m2, [srcq]
+ movu m4, [srcq+2]
+ movu m3, [srcq+16]
+ movu m5, [srcq+18]
+ pmullw m2, filter_x_a
+ pmullw m4, filter_x_b
+ paddw m2, filter_rnd
+ pmullw m3, filter_x_a
+ pmullw m5, filter_x_b
+ paddw m3, filter_rnd
+ paddw m2, m4
+ paddw m3, m5
+ psrlw m2, 4
+ psrlw m3, 4
+ mova m4, m2
+ mova m5, m3
+ pmullw m0, filter_y_a
+ pmullw m2, filter_y_b
+ paddw m0, filter_rnd
+ pmullw m1, filter_y_a
+ pmullw m3, filter_y_b
+ paddw m0, m2
+ paddw m1, filter_rnd
+ mova m2, [dstq]
+ paddw m1, m3
+ psrlw m0, 4
+ psrlw m1, 4
+ mova m3, [dstq+16]
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ pavgw m1, [secq+16]
+%endif
+ SUM_SSE m0, m2, m1, m3, m6, m7
+ mova m0, m4
+ mova m1, m5
+
+ INC_SRC_BY_SRC_STRIDE
+ lea dstq, [dstq + dst_strideq * 2]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%else ; %1 < 16
+ movu m0, [srcq]
+ movu m2, [srcq+2]
+ pmullw m0, filter_x_a
+ pmullw m2, filter_x_b
+ paddw m0, filter_rnd
+ paddw m0, m2
+ psrlw m0, 4
+
+ INC_SRC_BY_SRC_STRIDE
+
+.x_other_y_other_loop:
+ movu m2, [srcq]
+ movu m4, [srcq+2]
+ movu m3, [srcq+src_strideq*2]
+ movu m5, [srcq+src_strideq*2+2]
+ pmullw m2, filter_x_a
+ pmullw m4, filter_x_b
+ paddw m2, filter_rnd
+ pmullw m3, filter_x_a
+ pmullw m5, filter_x_b
+ paddw m3, filter_rnd
+ paddw m2, m4
+ paddw m3, m5
+ psrlw m2, 4
+ psrlw m3, 4
+ mova m4, m2
+ mova m5, m3
+ pmullw m0, filter_y_a
+ pmullw m2, filter_y_b
+ paddw m0, filter_rnd
+ pmullw m4, filter_y_a
+ pmullw m3, filter_y_b
+ paddw m0, m2
+ paddw m4, filter_rnd
+ mova m2, [dstq]
+ paddw m4, m3
+ psrlw m0, 4
+ psrlw m4, 4
+ mova m3, [dstq+dst_strideq*2]
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ add secq, sec_str
+ pavgw m4, [secq]
+%endif
+ SUM_SSE m0, m2, m4, m3, m6, m7
+ mova m0, m5
+
+ INC_SRC_BY_SRC_2STRIDE
+ lea dstq, [dstq + dst_strideq * 4]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%endif
+ dec h
+ jg .x_other_y_other_loop
+%undef filter_x_a
+%undef filter_x_b
+%undef filter_y_a
+%undef filter_y_b
+%undef filter_rnd
+ STORE_AND_RET
+%endmacro
+
+INIT_XMM sse2
+SUBPEL_VARIANCE 8
+SUBPEL_VARIANCE 16
+
+INIT_XMM sse2
+SUBPEL_VARIANCE 8, 1
+SUBPEL_VARIANCE 16, 1
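
The instantiations above emit the 8- and 16-wide kernels; the optional second argument selects the "avg" variant, which pavgw-blends the second predictor at [secq] into the sub-pel prediction before the error is accumulated. In every path, SUM_SSE accumulates the running sum of (pred - dst) and of (pred - dst)^2; a scalar sketch of one lane of that accumulation (hypothetical helper, not the libvpx API):

#include <stdint.h>

/* One SUM_SSE step per lane; the asm keeps the two accumulators in m6/m7. */
static void sum_sse_step(uint16_t pred, uint16_t dst,
                         int64_t *sum, uint64_t *sse) {
  const int d = (int)pred - (int)dst;
  *sum += d;
  *sse += (uint64_t)((int64_t)d * d);
}
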
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_highbd_variance_impl_sse2.asm b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_highbd_variance_impl_sse2.asm
new file mode 100644
index 00000000000..821dd0660bc
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_highbd_variance_impl_sse2.asm
@@ -0,0 +1,313 @@
+;
+; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;unsigned int vp9_highbd_calc16x16var_sse2
+;(
+; unsigned char * src_ptr,
+; int source_stride,
+; unsigned char * ref_ptr,
+; int recon_stride,
+; unsigned int * SSE,
+; int * Sum
+;)
+global sym(vp9_highbd_calc16x16var_sse2) PRIVATE
+sym(vp9_highbd_calc16x16var_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;[src_ptr]
+ mov rdi, arg(2) ;[ref_ptr]
+
+ movsxd rax, DWORD PTR arg(1) ;[source_stride]
+ movsxd rdx, DWORD PTR arg(3) ;[recon_stride]
+ add rax, rax ; source stride in bytes
+ add rdx, rdx ; recon stride in bytes
+
+ ; Prefetch data
+ prefetcht0 [rsi]
+ prefetcht0 [rsi+16]
+ prefetcht0 [rsi+rax]
+ prefetcht0 [rsi+rax+16]
+ lea rbx, [rsi+rax*2]
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+16]
+ prefetcht0 [rbx+rax]
+ prefetcht0 [rbx+rax+16]
+
+ prefetcht0 [rdi]
+ prefetcht0 [rdi+16]
+ prefetcht0 [rdi+rdx]
+ prefetcht0 [rdi+rdx+16]
+ lea rbx, [rdi+rdx*2]
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+16]
+ prefetcht0 [rbx+rdx]
+ prefetcht0 [rbx+rdx+16]
+
+ pxor xmm0, xmm0 ; clear xmm0 for unpack
+ pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs
+
+ pxor xmm6, xmm6 ; clear xmm6 for accumulating sse
+ mov rcx, 16
+
+.var16loop:
+ movdqu xmm1, XMMWORD PTR [rsi]
+ movdqu xmm2, XMMWORD PTR [rdi]
+
+ lea rbx, [rsi+rax*2]
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+16]
+ prefetcht0 [rbx+rax]
+ prefetcht0 [rbx+rax+16]
+ lea rbx, [rdi+rdx*2]
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+16]
+ prefetcht0 [rbx+rdx]
+ prefetcht0 [rbx+rdx+16]
+
+ pxor xmm5, xmm5
+
+ psubw xmm1, xmm2
+ movdqu xmm3, XMMWORD PTR [rsi+16]
+ paddw xmm5, xmm1
+ pmaddwd xmm1, xmm1
+ movdqu xmm2, XMMWORD PTR [rdi+16]
+ paddd xmm6, xmm1
+
+ psubw xmm3, xmm2
+ movdqu xmm1, XMMWORD PTR [rsi+rax]
+ paddw xmm5, xmm3
+ pmaddwd xmm3, xmm3
+ movdqu xmm2, XMMWORD PTR [rdi+rdx]
+ paddd xmm6, xmm3
+
+ psubw xmm1, xmm2
+ movdqu xmm3, XMMWORD PTR [rsi+rax+16]
+ paddw xmm5, xmm1
+ pmaddwd xmm1, xmm1
+ movdqu xmm2, XMMWORD PTR [rdi+rdx+16]
+ paddd xmm6, xmm1
+
+ psubw xmm3, xmm2
+ paddw xmm5, xmm3
+ pmaddwd xmm3, xmm3
+ paddd xmm6, xmm3
+
+ movdqa xmm1, xmm5
+ movdqa xmm2, xmm5
+ pcmpgtw xmm1, xmm0
+ pcmpeqw xmm2, xmm0
+ por xmm1, xmm2
+ pcmpeqw xmm1, xmm0
+ movdqa xmm2, xmm5
+ punpcklwd xmm5, xmm1
+ punpckhwd xmm2, xmm1
+ paddd xmm7, xmm5
+ paddd xmm7, xmm2
+
+ lea rsi, [rsi + 2*rax]
+ lea rdi, [rdi + 2*rdx]
+ sub rcx, 2
+ jnz .var16loop
+
+ movdqa xmm4, xmm6
+ punpckldq xmm6, xmm0
+
+ punpckhdq xmm4, xmm0
+ movdqa xmm5, xmm7
+
+ paddd xmm6, xmm4
+ punpckldq xmm7, xmm0
+
+ punpckhdq xmm5, xmm0
+ paddd xmm7, xmm5
+
+ movdqa xmm4, xmm6
+ movdqa xmm5, xmm7
+
+ psrldq xmm4, 8
+ psrldq xmm5, 8
+
+ paddd xmm6, xmm4
+ paddd xmm7, xmm5
+
+ mov rdi, arg(4) ; [SSE]
+ mov rax, arg(5) ; [Sum]
+
+ movd DWORD PTR [rdi], xmm6
+ movd DWORD PTR [rax], xmm7
+
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ pop rbx
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
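
The pcmpgtw/pcmpeqw/por/pcmpeqw sequence in .var16loop above builds, per 16-bit lane, a mask that is 0xFFFF exactly when the signed row-sum word is negative; punpcklwd/punpckhwd then pair each word with its mask, sign-extending the sums to 32 bits without SSE4.1's pmovsxwd. A scalar sketch of one lane (hypothetical name):

#include <stdint.h>

static int32_t widen_signed_word(int16_t w) {
  const uint16_t gt = (w > 0) ? 0xFFFFu : 0u;          /* pcmpgtw vs 0 */
  const uint16_t eq = (w == 0) ? 0xFFFFu : 0u;         /* pcmpeqw vs 0 */
  const uint16_t nonneg = (uint16_t)(gt | eq);         /* por          */
  const uint16_t mask = (nonneg == 0u) ? 0xFFFFu : 0u; /* 0xFFFF iff w < 0 */
  /* punpcklwd: the word becomes the low half, its mask the high half. */
  return (int32_t)(((uint32_t)mask << 16) | (uint16_t)w);
}
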
+
+
+;unsigned int vp9_highbd_calc8x8var_sse2
+;(
+; unsigned char * src_ptr,
+; int source_stride,
+; unsigned char * ref_ptr,
+; int recon_stride,
+; unsigned int * SSE,
+; int * Sum
+;)
+global sym(vp9_highbd_calc8x8var_sse2) PRIVATE
+sym(vp9_highbd_calc8x8var_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;[src_ptr]
+ mov rdi, arg(2) ;[ref_ptr]
+
+ movsxd rax, DWORD PTR arg(1) ;[source_stride]
+ movsxd rdx, DWORD PTR arg(3) ;[recon_stride]
+ add rax, rax ; source stride in bytes
+ add rdx, rdx ; recon stride in bytes
+
+ ; Prefetch data
+ prefetcht0 [rsi]
+ prefetcht0 [rsi+rax]
+ lea rbx, [rsi+rax*2]
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+rax]
+
+ prefetcht0 [rdi]
+ prefetcht0 [rdi+rdx]
+ lea rbx, [rdi+rdx*2]
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+rdx]
+
+ pxor xmm0, xmm0 ; clear xmm0 for unpack
+ pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs
+
+ pxor xmm6, xmm6 ; clear xmm6 for accumulating sse
+ mov rcx, 8
+
+.var8loop:
+ movdqu xmm1, XMMWORD PTR [rsi]
+ movdqu xmm2, XMMWORD PTR [rdi]
+
+ lea rbx, [rsi+rax*4]
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+rax]
+ lea rbx, [rbx+rax*2]
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+rax]
+ lea rbx, [rdi+rdx*4]
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+rdx]
+ lea rbx, [rbx+rdx*2]
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+rdx]
+
+ pxor xmm5, xmm5
+
+ psubw xmm1, xmm2
+ movdqu xmm3, XMMWORD PTR [rsi+rax]
+ paddw xmm5, xmm1
+ pmaddwd xmm1, xmm1
+ movdqu xmm2, XMMWORD PTR [rdi+rdx]
+ paddd xmm6, xmm1
+
+ lea rsi, [rsi + 2*rax]
+ lea rdi, [rdi + 2*rdx]
+
+ psubw xmm3, xmm2
+ movdqu xmm1, XMMWORD PTR [rsi]
+ paddw xmm5, xmm3
+ pmaddwd xmm3, xmm3
+ movdqu xmm2, XMMWORD PTR [rdi]
+ paddd xmm6, xmm3
+
+ psubw xmm1, xmm2
+ movdqu xmm3, XMMWORD PTR [rsi+rax]
+ paddw xmm5, xmm1
+ pmaddwd xmm1, xmm1
+ movdqu xmm2, XMMWORD PTR [rdi+rdx]
+ paddd xmm6, xmm1
+
+ psubw xmm3, xmm2
+ paddw xmm5, xmm3
+ pmaddwd xmm3, xmm3
+ paddd xmm6, xmm3
+
+ movdqa xmm1, xmm5
+ movdqa xmm2, xmm5
+ pcmpgtw xmm1, xmm0
+ pcmpeqw xmm2, xmm0
+ por xmm1, xmm2
+ pcmpeqw xmm1, xmm0
+ movdqa xmm2, xmm5
+ punpcklwd xmm5, xmm1
+ punpckhwd xmm2, xmm1
+ paddd xmm7, xmm5
+ paddd xmm7, xmm2
+
+ lea rsi, [rsi + 2*rax]
+ lea rdi, [rdi + 2*rdx]
+ sub rcx, 4
+ jnz .var8loop
+
+ movdqa xmm4, xmm6
+ punpckldq xmm6, xmm0
+
+ punpckhdq xmm4, xmm0
+ movdqa xmm5, xmm7
+
+ paddd xmm6, xmm4
+ punpckldq xmm7, xmm0
+
+ punpckhdq xmm5, xmm0
+ paddd xmm7, xmm5
+
+ movdqa xmm4, xmm6
+ movdqa xmm5, xmm7
+
+ psrldq xmm4, 8
+ psrldq xmm5, 8
+
+ paddd xmm6, xmm4
+ paddd xmm7, xmm5
+
+ mov rdi, arg(4) ; [SSE]
+ mov rax, arg(5) ; [Sum]
+
+ movd DWORD PTR [rdi], xmm6
+ movd DWORD PTR [rax], xmm7
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ pop rbx
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
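
Both functions share the epilogue above: interleaving each accumulator with zero (punpckldq/punpckhdq), adding, then shifting the upper quadword down (psrldq 8) and adding again reduces the four 32-bit partial sums to the single scalar written through [SSE] and [Sum]. The net effect, as a scalar sketch (hypothetical name):

#include <stdint.h>

static uint32_t hsum_epi32(const uint32_t lanes[4]) {
  return lanes[0] + lanes[1] + lanes[2] + lanes[3];
}
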
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_highbd_variance_sse2.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_highbd_variance_sse2.c
new file mode 100644
index 00000000000..4bc3e7e2d15
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_highbd_variance_sse2.c
@@ -0,0 +1,580 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+#include "./vpx_config.h"
+#include "vp9/common/vp9_common.h"
+
+#include "vp9/encoder/vp9_variance.h"
+#include "vpx_ports/mem.h"
+
+typedef uint32_t (*high_variance_fn_t) (const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ uint32_t *sse, int *sum);
+
+uint32_t vp9_highbd_calc8x8var_sse2(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ uint32_t *sse, int *sum);
+
+uint32_t vp9_highbd_calc16x16var_sse2(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ uint32_t *sse, int *sum);
+
+static void highbd_variance_sse2(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ int w, int h, uint32_t *sse, int *sum,
+ high_variance_fn_t var_fn, int block_size) {
+ int i, j;
+
+ *sse = 0;
+ *sum = 0;
+
+ for (i = 0; i < h; i += block_size) {
+ for (j = 0; j < w; j += block_size) {
+ unsigned int sse0;
+ int sum0;
+ var_fn(src + src_stride * i + j, src_stride,
+ ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
+ *sse += sse0;
+ *sum += sum0;
+ }
+ }
+}
+
+static void highbd_10_variance_sse2(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ int w, int h, uint32_t *sse, int *sum,
+ high_variance_fn_t var_fn, int block_size) {
+ int i, j;
+ uint64_t sse_long = 0;
+ int64_t sum_long = 0;
+
+ for (i = 0; i < h; i += block_size) {
+ for (j = 0; j < w; j += block_size) {
+ unsigned int sse0;
+ int sum0;
+ var_fn(src + src_stride * i + j, src_stride,
+ ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
+ sse_long += sse0;
+ sum_long += sum0;
+ }
+ }
+ *sum = ROUND_POWER_OF_TWO(sum_long, 2);
+ *sse = ROUND_POWER_OF_TWO(sse_long, 4);
+}
+
+static void highbd_12_variance_sse2(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ int w, int h, uint32_t *sse, int *sum,
+ high_variance_fn_t var_fn, int block_size) {
+ int i, j;
+ uint64_t sse_long = 0;
+ int64_t sum_long = 0;
+
+ for (i = 0; i < h; i += block_size) {
+ for (j = 0; j < w; j += block_size) {
+ unsigned int sse0;
+ int sum0;
+ var_fn(src + src_stride * i + j, src_stride,
+ ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
+ sse_long += sse0;
+ sum_long += sum0;
+ }
+ }
+ *sum = ROUND_POWER_OF_TWO(sum_long, 4);
+ *sse = ROUND_POWER_OF_TWO(sse_long, 8);
+}
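
The 10- and 12-bit wrappers above accumulate into 64-bit temporaries and then rescale - the sum by 2 (resp. 4) bits and the SSE by 4 (resp. 8) bits - so the returned values stay in the 8-bit-equivalent range and fit the 32-bit outputs; the plain wrapper does no rescaling. ROUND_POWER_OF_TWO rounds to nearest, equivalent to this sketch:

#include <stdint.h>

/* Round-to-nearest shift, matching ROUND_POWER_OF_TWO's semantics. */
#define ROUND_POWER_OF_TWO_SKETCH(value, n) \
  (((value) + ((int64_t)1 << ((n)-1))) >> (n))
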
+
+
+#define HIGH_GET_VAR(S) \
+void vp9_highbd_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \
+ const uint8_t *ref8, int ref_stride, \
+ uint32_t *sse, int *sum) { \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+ vp9_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \
+ sse, sum); \
+} \
+\
+void vp9_highbd_10_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \
+ const uint8_t *ref8, int ref_stride, \
+ uint32_t *sse, int *sum) { \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+ vp9_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \
+ sse, sum); \
+ *sum = ROUND_POWER_OF_TWO(*sum, 2); \
+ *sse = ROUND_POWER_OF_TWO(*sse, 4); \
+} \
+\
+void vp9_highbd_12_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \
+ const uint8_t *ref8, int ref_stride, \
+ uint32_t *sse, int *sum) { \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+ vp9_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \
+ sse, sum); \
+ *sum = ROUND_POWER_OF_TWO(*sum, 4); \
+ *sse = ROUND_POWER_OF_TWO(*sse, 8); \
+}
+
+HIGH_GET_VAR(16);
+HIGH_GET_VAR(8);
+
+#undef HIGH_GET_VAR
+
+#define VAR_FN(w, h, block_size, shift) \
+uint32_t vp9_highbd_variance##w##x##h##_sse2( \
+ const uint8_t *src8, int src_stride, \
+ const uint8_t *ref8, int ref_stride, uint32_t *sse) { \
+ int sum; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+ highbd_variance_sse2(src, src_stride, ref, ref_stride, w, h, sse, &sum, \
+ vp9_highbd_calc##block_size##x##block_size##var_sse2, \
+ block_size); \
+ return *sse - (((int64_t)sum * sum) >> shift); \
+} \
+\
+uint32_t vp9_highbd_10_variance##w##x##h##_sse2( \
+ const uint8_t *src8, int src_stride, \
+ const uint8_t *ref8, int ref_stride, uint32_t *sse) { \
+ int sum; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+ highbd_10_variance_sse2( \
+ src, src_stride, ref, ref_stride, w, h, sse, &sum, \
+ vp9_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
+ return *sse - (((int64_t)sum * sum) >> shift); \
+} \
+\
+uint32_t vp9_highbd_12_variance##w##x##h##_sse2( \
+ const uint8_t *src8, int src_stride, \
+ const uint8_t *ref8, int ref_stride, uint32_t *sse) { \
+ int sum; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+ highbd_12_variance_sse2( \
+ src, src_stride, ref, ref_stride, w, h, sse, &sum, \
+ vp9_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
+ return *sse - (((int64_t)sum * sum) >> shift); \
+}
+
+VAR_FN(64, 64, 16, 12);
+VAR_FN(64, 32, 16, 11);
+VAR_FN(32, 64, 16, 11);
+VAR_FN(32, 32, 16, 10);
+VAR_FN(32, 16, 16, 9);
+VAR_FN(16, 32, 16, 9);
+VAR_FN(16, 16, 16, 8);
+VAR_FN(16, 8, 8, 7);
+VAR_FN(8, 16, 8, 7);
+VAR_FN(8, 8, 8, 6);
+
+#undef VAR_FN
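
Each VAR_FN(w, h, block_size, shift) above reduces to variance = SSE - sum^2 / (w*h), with the division done as a shift, so shift must equal log2(w*h) - e.g. 12 for 64x64 and 9 for 32x16, as in the instantiations above. A scalar reference (hypothetical name):

#include <stdint.h>

static uint32_t variance_from_sums(uint32_t sse, int sum, int shift) {
  return sse - (uint32_t)(((int64_t)sum * sum) >> shift);
}
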
+
+unsigned int vp9_highbd_mse16x16_sse2(const uint8_t *src8, int src_stride,
+ const uint8_t *ref8, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ highbd_variance_sse2(src, src_stride, ref, ref_stride, 16, 16,
+ sse, &sum, vp9_highbd_calc16x16var_sse2, 16);
+ return *sse;
+}
+
+unsigned int vp9_highbd_10_mse16x16_sse2(const uint8_t *src8, int src_stride,
+ const uint8_t *ref8, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 16, 16,
+ sse, &sum, vp9_highbd_calc16x16var_sse2, 16);
+ return *sse;
+}
+
+unsigned int vp9_highbd_12_mse16x16_sse2(const uint8_t *src8, int src_stride,
+ const uint8_t *ref8, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 16, 16,
+ sse, &sum, vp9_highbd_calc16x16var_sse2, 16);
+ return *sse;
+}
+
+unsigned int vp9_highbd_mse8x8_sse2(const uint8_t *src8, int src_stride,
+ const uint8_t *ref8, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ highbd_variance_sse2(src, src_stride, ref, ref_stride, 8, 8,
+ sse, &sum, vp9_highbd_calc8x8var_sse2, 8);
+ return *sse;
+}
+
+unsigned int vp9_highbd_10_mse8x8_sse2(const uint8_t *src8, int src_stride,
+ const uint8_t *ref8, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 8, 8,
+ sse, &sum, vp9_highbd_calc8x8var_sse2, 8);
+ return *sse;
+}
+
+unsigned int vp9_highbd_12_mse8x8_sse2(const uint8_t *src8, int src_stride,
+ const uint8_t *ref8, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 8, 8,
+ sse, &sum, vp9_highbd_calc8x8var_sse2, 8);
+ return *sse;
+}
+
+#define DECL(w, opt) \
+int vp9_highbd_sub_pixel_variance##w##xh_##opt(const uint16_t *src, \
+ ptrdiff_t src_stride, \
+ int x_offset, int y_offset, \
+ const uint16_t *dst, \
+ ptrdiff_t dst_stride, \
+ int height, unsigned int *sse);
+#define DECLS(opt1, opt2) \
+DECL(8, opt1); \
+DECL(16, opt1)
+
+DECLS(sse2, sse);
+// DECLS(ssse3, ssse3);
+#undef DECLS
+#undef DECL
+
+#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
+uint32_t vp9_highbd_sub_pixel_variance##w##x##h##_##opt(const uint8_t *src8, \
+ int src_stride, \
+ int x_offset, \
+ int y_offset, \
+ const uint8_t *dst8, \
+ int dst_stride, \
+ uint32_t *sse_ptr) { \
+ uint32_t sse; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+ int se = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src, src_stride, \
+ x_offset, y_offset, \
+ dst, dst_stride, h, \
+ &sse); \
+ if (w > wf) { \
+ unsigned int sse2; \
+ int se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src + 16, \
+ src_stride, \
+ x_offset, y_offset, \
+ dst + 16, \
+ dst_stride, \
+ h, &sse2); \
+ se += se2; \
+ sse += sse2; \
+ if (w > wf * 2) { \
+ se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \
+ x_offset, y_offset, \
+ dst + 32, dst_stride, \
+ h, &sse2); \
+ se += se2; \
+ sse += sse2; \
+ se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src + 48, src_stride, x_offset, y_offset, \
+ dst + 48, dst_stride, h, &sse2); \
+ se += se2; \
+ sse += sse2; \
+ } \
+ } \
+ *sse_ptr = sse; \
+ return sse - ((cast se * se) >> (wlog2 + hlog2)); \
+} \
+\
+uint32_t vp9_highbd_10_sub_pixel_variance##w##x##h##_##opt( \
+ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \
+ uint32_t sse; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+ int se = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src, src_stride, \
+ x_offset, y_offset, \
+ dst, dst_stride, \
+ h, &sse); \
+ if (w > wf) { \
+ uint32_t sse2; \
+ int se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src + 16, \
+ src_stride, \
+ x_offset, y_offset, \
+ dst + 16, \
+ dst_stride, \
+ h, &sse2); \
+ se += se2; \
+ sse += sse2; \
+ if (w > wf * 2) { \
+ se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \
+ x_offset, y_offset, \
+ dst + 32, dst_stride, \
+ h, &sse2); \
+ se += se2; \
+ sse += sse2; \
+ se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src + 48, src_stride, \
+ x_offset, y_offset, \
+ dst + 48, dst_stride, \
+ h, &sse2); \
+ se += se2; \
+ sse += sse2; \
+ } \
+ } \
+ se = ROUND_POWER_OF_TWO(se, 2); \
+ sse = ROUND_POWER_OF_TWO(sse, 4); \
+ *sse_ptr = sse; \
+ return sse - ((cast se * se) >> (wlog2 + hlog2)); \
+} \
+\
+uint32_t vp9_highbd_12_sub_pixel_variance##w##x##h##_##opt( \
+ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \
+ int start_row; \
+ uint32_t sse; \
+ int se = 0; \
+ uint64_t long_sse = 0; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+ for (start_row = 0; start_row < h; start_row +=16) { \
+ uint32_t sse2; \
+ int height = h - start_row < 16 ? h - start_row : 16; \
+ int se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src + (start_row * src_stride), src_stride, \
+ x_offset, y_offset, dst + (start_row * dst_stride), \
+ dst_stride, height, &sse2); \
+ se += se2; \
+ long_sse += sse2; \
+ if (w > wf) { \
+ se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src + 16 + (start_row * src_stride), src_stride, \
+ x_offset, y_offset, dst + 16 + (start_row * dst_stride), \
+ dst_stride, height, &sse2); \
+ se += se2; \
+ long_sse += sse2; \
+ if (w > wf * 2) { \
+ se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src + 32 + (start_row * src_stride), src_stride, \
+ x_offset, y_offset, dst + 32 + (start_row * dst_stride), \
+ dst_stride, height, &sse2); \
+ se += se2; \
+ long_sse += sse2; \
+ se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src + 48 + (start_row * src_stride), src_stride, \
+ x_offset, y_offset, dst + 48 + (start_row * dst_stride), \
+ dst_stride, height, &sse2); \
+ se += se2; \
+ long_sse += sse2; \
+ }\
+ } \
+ } \
+ se = ROUND_POWER_OF_TWO(se, 4); \
+ sse = ROUND_POWER_OF_TWO(long_sse, 8); \
+ *sse_ptr = sse; \
+ return sse - ((cast se * se) >> (wlog2 + hlog2)); \
+}
+
+#define FNS(opt1, opt2) \
+FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
+FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
+FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
+FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
+FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
+FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
+FN(16, 16, 16, 4, 4, opt1, (int64_t)); \
+FN(16, 8, 16, 4, 3, opt1, (int64_t)); \
+FN(8, 16, 8, 3, 4, opt1, (int64_t)); \
+FN(8, 8, 8, 3, 3, opt1, (int64_t)); \
+FN(8, 4, 8, 3, 2, opt1, (int64_t));
+
+
+FNS(sse2, sse);
+
+#undef FNS
+#undef FN
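
In the FN() wrappers above, blocks wider than the 16-lane kernel (wf) are covered by 16-wide column strips at offsets 0, 16, 32 and 48, with each strip's sum and SSE accumulated; the 12-bit variant additionally walks the rows in chunks of 16 and keeps SSE in a uint64_t so the per-call 32-bit totals cannot overflow at that depth. A sketch of the column loop, where strip_fn stands in for the 16-wide kernel (hypothetical signature):

#include <stdint.h>

static void wide_block_sketch(int w, int h, int *se, uint32_t *sse,
                              int (*strip_fn)(int col, int h, uint32_t *sse)) {
  int col;
  *se = 0;
  *sse = 0;
  for (col = 0; col < w; col += 16) {
    uint32_t sse_part;
    *se += strip_fn(col, h, &sse_part);
    *sse += sse_part;
  }
}
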
+
+#define DECL(w, opt) \
+int vp9_highbd_sub_pixel_avg_variance##w##xh_##opt(const uint16_t *src, \
+ ptrdiff_t src_stride, \
+ int x_offset, int y_offset, \
+ const uint16_t *dst, \
+ ptrdiff_t dst_stride, \
+ const uint16_t *sec, \
+ ptrdiff_t sec_stride, \
+ int height, \
+ unsigned int *sse);
+#define DECLS(opt1) \
+DECL(16, opt1) \
+DECL(8, opt1)
+
+DECLS(sse2);
+#undef DECL
+#undef DECLS
+
+#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
+uint32_t vp9_highbd_sub_pixel_avg_variance##w##x##h##_##opt( \
+ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \
+ const uint8_t *sec8) { \
+ uint32_t sse; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+ uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
+ int se = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src, src_stride, x_offset, \
+ y_offset, dst, dst_stride, sec, w, h, &sse); \
+ if (w > wf) { \
+ uint32_t sse2; \
+ int se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 16, src_stride, x_offset, y_offset, \
+ dst + 16, dst_stride, sec + 16, w, h, &sse2); \
+ se += se2; \
+ sse += sse2; \
+ if (w > wf * 2) { \
+ se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 32, src_stride, x_offset, y_offset, \
+ dst + 32, dst_stride, sec + 32, w, h, &sse2); \
+ se += se2; \
+ sse += sse2; \
+ se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 48, src_stride, x_offset, y_offset, \
+ dst + 48, dst_stride, sec + 48, w, h, &sse2); \
+ se += se2; \
+ sse += sse2; \
+ } \
+ } \
+ *sse_ptr = sse; \
+ return sse - ((cast se * se) >> (wlog2 + hlog2)); \
+} \
+\
+uint32_t vp9_highbd_10_sub_pixel_avg_variance##w##x##h##_##opt( \
+ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \
+ const uint8_t *sec8) { \
+ uint32_t sse; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+ uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
+ int se = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src, src_stride, x_offset, \
+ y_offset, dst, dst_stride, \
+ sec, w, h, &sse); \
+ if (w > wf) { \
+ uint32_t sse2; \
+ int se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 16, src_stride, \
+ x_offset, y_offset, \
+ dst + 16, dst_stride, \
+ sec + 16, w, h, &sse2); \
+ se += se2; \
+ sse += sse2; \
+ if (w > wf * 2) { \
+ se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 32, src_stride, \
+ x_offset, y_offset, \
+ dst + 32, dst_stride, \
+ sec + 32, w, h, &sse2); \
+ se += se2; \
+ sse += sse2; \
+ se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 48, src_stride, \
+ x_offset, y_offset, \
+ dst + 48, dst_stride, \
+ sec + 48, w, h, &sse2); \
+ se += se2; \
+ sse += sse2; \
+ } \
+ } \
+ se = ROUND_POWER_OF_TWO(se, 2); \
+ sse = ROUND_POWER_OF_TWO(sse, 4); \
+ *sse_ptr = sse; \
+ return sse - ((cast se * se) >> (wlog2 + hlog2)); \
+} \
+\
+uint32_t vp9_highbd_12_sub_pixel_avg_variance##w##x##h##_##opt( \
+ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \
+ const uint8_t *sec8) { \
+ int start_row; \
+ uint32_t sse; \
+ int se = 0; \
+ uint64_t long_sse = 0; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+ uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
+ for (start_row = 0; start_row < h; start_row +=16) { \
+ uint32_t sse2; \
+ int height = h - start_row < 16 ? h - start_row : 16; \
+ int se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + (start_row * src_stride), src_stride, x_offset, \
+ y_offset, dst + (start_row * dst_stride), dst_stride, \
+ sec + (start_row * w), w, height, &sse2); \
+ se += se2; \
+ long_sse += sse2; \
+ if (w > wf) { \
+ se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 16 + (start_row * src_stride), src_stride, \
+ x_offset, y_offset, \
+ dst + 16 + (start_row * dst_stride), dst_stride, \
+ sec + 16 + (start_row * w), w, height, &sse2); \
+ se += se2; \
+ long_sse += sse2; \
+ if (w > wf * 2) { \
+ se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 32 + (start_row * src_stride), src_stride, \
+ x_offset, y_offset, \
+ dst + 32 + (start_row * dst_stride), dst_stride, \
+ sec + 32 + (start_row * w), w, height, &sse2); \
+ se += se2; \
+ long_sse += sse2; \
+ se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 48 + (start_row * src_stride), src_stride, \
+ x_offset, y_offset, \
+ dst + 48 + (start_row * dst_stride), dst_stride, \
+ sec + 48 + (start_row * w), w, height, &sse2); \
+ se += se2; \
+ long_sse += sse2; \
+ } \
+ } \
+ } \
+ se = ROUND_POWER_OF_TWO(se, 4); \
+ sse = ROUND_POWER_OF_TWO(long_sse, 8); \
+ *sse_ptr = sse; \
+ return sse - ((cast se * se) >> (wlog2 + hlog2)); \
+}
+
+
+#define FNS(opt1) \
+FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
+FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
+FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
+FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
+FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
+FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
+FN(16, 16, 16, 4, 4, opt1, (int64_t)); \
+FN(16, 8, 16, 4, 3, opt1, (int64_t)); \
+FN(8, 16, 8, 4, 3, opt1, (int64_t)); \
+FN(8, 8, 8, 3, 3, opt1, (int64_t)); \
+FN(8, 4, 8, 3, 2, opt1, (int64_t));
+
+FNS(sse2);
+
+#undef FNS
+#undef FN
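
The *_avg_ wrappers above additionally blend a second (compound) predictor into the sub-pel prediction before the difference is taken; sec is tightly packed with stride w, which is why the 12-bit row-chunked loop advances it by start_row * w. The blend uses pavgw rounding; a scalar sketch (hypothetical name):

#include <stdint.h>

static uint16_t avg_pred_sketch(uint16_t pred, uint16_t sec) {
  return (uint16_t)((pred + sec + 1) >> 1);
}
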
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c
index 7c1c8843cdf..00abd3c4962 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c
@@ -18,7 +18,7 @@ void vp9_quantize_b_sse2(const int16_t* coeff_ptr, intptr_t n_coeffs,
const int16_t* round_ptr, const int16_t* quant_ptr,
const int16_t* quant_shift_ptr, int16_t* qcoeff_ptr,
int16_t* dqcoeff_ptr, const int16_t* dequant_ptr,
- int zbin_oq_value, uint16_t* eob_ptr,
+ uint16_t* eob_ptr,
const int16_t* scan_ptr,
const int16_t* iscan_ptr) {
__m128i zero;
@@ -39,13 +39,10 @@ void vp9_quantize_b_sse2(const int16_t* coeff_ptr, intptr_t n_coeffs,
// Setup global values
{
- __m128i zbin_oq;
__m128i pw_1;
- zbin_oq = _mm_set1_epi16(zbin_oq_value);
zbin = _mm_load_si128((const __m128i*)zbin_ptr);
round = _mm_load_si128((const __m128i*)round_ptr);
quant = _mm_load_si128((const __m128i*)quant_ptr);
- zbin = _mm_add_epi16(zbin, zbin_oq);
pw_1 = _mm_set1_epi16(1);
zbin = _mm_sub_epi16(zbin, pw_1);
dequant = _mm_load_si128((const __m128i*)dequant_ptr);
@@ -223,3 +220,199 @@ void vp9_quantize_b_sse2(const int16_t* coeff_ptr, intptr_t n_coeffs,
*eob_ptr = 0;
}
}
+
+void vp9_quantize_fp_sse2(const int16_t* coeff_ptr, intptr_t n_coeffs,
+ int skip_block, const int16_t* zbin_ptr,
+ const int16_t* round_ptr, const int16_t* quant_ptr,
+ const int16_t* quant_shift_ptr, int16_t* qcoeff_ptr,
+ int16_t* dqcoeff_ptr, const int16_t* dequant_ptr,
+ uint16_t* eob_ptr,
+ const int16_t* scan_ptr,
+ const int16_t* iscan_ptr) {
+ __m128i zero;
+ __m128i thr;
+ int16_t nzflag;
+ (void)scan_ptr;
+ (void)zbin_ptr;
+ (void)quant_shift_ptr;
+
+ coeff_ptr += n_coeffs;
+ iscan_ptr += n_coeffs;
+ qcoeff_ptr += n_coeffs;
+ dqcoeff_ptr += n_coeffs;
+ n_coeffs = -n_coeffs;
+ zero = _mm_setzero_si128();
+
+ if (!skip_block) {
+ __m128i eob;
+ __m128i round, quant, dequant;
+ {
+ __m128i coeff0, coeff1;
+
+      // Set up global values
+ {
+ round = _mm_load_si128((const __m128i*)round_ptr);
+ quant = _mm_load_si128((const __m128i*)quant_ptr);
+ dequant = _mm_load_si128((const __m128i*)dequant_ptr);
+ }
+
+ {
+ __m128i coeff0_sign, coeff1_sign;
+ __m128i qcoeff0, qcoeff1;
+ __m128i qtmp0, qtmp1;
+ // Do DC and first 15 AC
+ coeff0 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs));
+ coeff1 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs) + 1);
+
+ // Poor man's sign extract
+ coeff0_sign = _mm_srai_epi16(coeff0, 15);
+ coeff1_sign = _mm_srai_epi16(coeff1, 15);
+ qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
+ qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
+ qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+ qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+ qcoeff0 = _mm_adds_epi16(qcoeff0, round);
+ round = _mm_unpackhi_epi64(round, round);
+ qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+ qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
+ quant = _mm_unpackhi_epi64(quant, quant);
+ qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
+
+ // Reinsert signs
+ qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
+ qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
+ qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+ qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+ _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0);
+ _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
+
+ coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);
+ coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+
+ _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0);
+ _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
+ }
+
+ {
+ // Scan for eob
+ __m128i zero_coeff0, zero_coeff1;
+ __m128i nzero_coeff0, nzero_coeff1;
+ __m128i iscan0, iscan1;
+ __m128i eob1;
+ zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
+ zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
+ nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
+ nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
+ iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs));
+ iscan1 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs) + 1);
+ // Add one to convert from indices to counts
+ iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
+ iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
+ eob = _mm_and_si128(iscan0, nzero_coeff0);
+ eob1 = _mm_and_si128(iscan1, nzero_coeff1);
+ eob = _mm_max_epi16(eob, eob1);
+ }
+ n_coeffs += 8 * 2;
+ }
+
+ thr = _mm_srai_epi16(dequant, 1);
+
+ // AC only loop
+ while (n_coeffs < 0) {
+ __m128i coeff0, coeff1;
+ {
+ __m128i coeff0_sign, coeff1_sign;
+ __m128i qcoeff0, qcoeff1;
+ __m128i qtmp0, qtmp1;
+
+ coeff0 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs));
+ coeff1 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs) + 1);
+
+ // Poor man's sign extract
+ coeff0_sign = _mm_srai_epi16(coeff0, 15);
+ coeff1_sign = _mm_srai_epi16(coeff1, 15);
+ qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
+ qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
+ qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+ qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+ nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) |
+ _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr));
+
+ if (nzflag) {
+ qcoeff0 = _mm_adds_epi16(qcoeff0, round);
+ qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+ qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
+ qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
+
+ // Reinsert signs
+ qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
+ qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
+ qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+ qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+ _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0);
+ _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
+
+ coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
+ coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+
+ _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0);
+ _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
+ } else {
+ _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), zero);
+ _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, zero);
+
+ _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), zero);
+ _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, zero);
+ }
+ }
+
+ if (nzflag) {
+ // Scan for eob
+ __m128i zero_coeff0, zero_coeff1;
+ __m128i nzero_coeff0, nzero_coeff1;
+ __m128i iscan0, iscan1;
+ __m128i eob0, eob1;
+ zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
+ zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
+ nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
+ nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
+ iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs));
+ iscan1 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs) + 1);
+ // Add one to convert from indices to counts
+ iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
+ iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
+ eob0 = _mm_and_si128(iscan0, nzero_coeff0);
+ eob1 = _mm_and_si128(iscan1, nzero_coeff1);
+ eob0 = _mm_max_epi16(eob0, eob1);
+ eob = _mm_max_epi16(eob, eob0);
+ }
+ n_coeffs += 8 * 2;
+ }
+
+ // Accumulate EOB
+ {
+ __m128i eob_shuffled;
+ eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ *eob_ptr = _mm_extract_epi16(eob, 1);
+ }
+ } else {
+ do {
+ _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), zero);
+ _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, zero);
+ _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), zero);
+ _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, zero);
+ n_coeffs += 8 * 2;
+ } while (n_coeffs < 0);
+ *eob_ptr = 0;
+ }
+}
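
One lane of vp9_quantize_fp_sse2 above, as a scalar sketch: the "poor man's sign extract" is the branchless 16-bit abs (sign = v >> 15 arithmetic; abs = (v ^ sign) - sign), and the same xor/sub pair reinserts the sign after quantization; the AC loop also skips a whole group of lanes when no abs value exceeds thr = dequant >> 1. Hypothetical helper; in the real code round/quant/dequant carry the DC entry in lane 0, handled by the unpackhi shuffles:

#include <stdint.h>

static int16_t quantize_fp_lane(int16_t c, int16_t round, int16_t quant,
                                int16_t dequant, int16_t *dqcoeff) {
  const int16_t sign = (int16_t)(c >> 15);      /* arithmetic shift        */
  int32_t a = (int16_t)((c ^ sign) - sign);     /* abs(c)                  */
  int16_t q;
  a += round;
  if (a > 32767) a = 32767;                     /* _mm_adds_epi16 saturates */
  q = (int16_t)(((int32_t)a * quant) >> 16);    /* pmulhw                  */
  q = (int16_t)((q ^ sign) - sign);             /* reinsert sign           */
  *dqcoeff = (int16_t)(q * dequant);            /* pmullw (low 16 bits)    */
  return q;
}
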
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm
index 508e1d4f55a..449d52b22e7 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm
@@ -15,9 +15,10 @@ pw_1: times 8 dw 1
SECTION .text
+; TODO(yunqingwang): fix the quantize_b code for the skip=1 case.
%macro QUANTIZE_FN 2
cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
- shift, qcoeff, dqcoeff, dequant, zbin_oq, \
+ shift, qcoeff, dqcoeff, dequant, \
eob, scan, iscan
cmp dword skipm, 0
jne .blank
@@ -29,13 +30,9 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
movifnidn zbinq, zbinmp
movifnidn roundq, roundmp
movifnidn quantq, quantmp
- movd m4, dword zbin_oqm ; m4 = zbin_oq
mova m0, [zbinq] ; m0 = zbin
- punpcklwd m4, m4
mova m1, [roundq] ; m1 = round
- pshufd m4, m4, 0
mova m2, [quantq] ; m2 = quant
- paddw m0, m4 ; m0 = zbin + zbin_oq
%ifidn %1, b_32x32
pcmpeqw m5, m5
psrlw m5, 15
@@ -55,7 +52,7 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
psllw m4, 1
%endif
pxor m5, m5 ; m5 = dedicated zero
- DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, d6, eob
+ DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, eob
lea coeffq, [ coeffq+ncoeffq*2]
lea iscanq, [ iscanq+ncoeffq*2]
lea qcoeffq, [ qcoeffq+ncoeffq*2]
@@ -122,8 +119,8 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin
pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin
%ifidn %1, b_32x32
- pmovmskb r6, m7
- pmovmskb r2, m12
+ pmovmskb r6d, m7
+ pmovmskb r2d, m12
or r6, r2
jz .skip_iter
%endif
@@ -220,7 +217,7 @@ QUANTIZE_FN b_32x32, 7
%macro QUANTIZE_FP 2
cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
- shift, qcoeff, dqcoeff, dequant, zbin_oq, \
+ shift, qcoeff, dqcoeff, dequant, \
eob, scan, iscan
cmp dword skipm, 0
jne .blank
@@ -248,11 +245,11 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
psllw m2, 1
%endif
pxor m5, m5 ; m5 = dedicated zero
- DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, d6, eob
+
lea coeffq, [ coeffq+ncoeffq*2]
- lea iscanq, [ iscanq+ncoeffq*2]
- lea qcoeffq, [ qcoeffq+ncoeffq*2]
- lea dqcoeffq, [dqcoeffq+ncoeffq*2]
+ lea r5q, [ r5q+ncoeffq*2]
+ lea r3q, [ r3q+ncoeffq*2]
+ lea r4q, [r4q+ncoeffq*2]
neg ncoeffq
; get DC and first 15 AC coeffs
@@ -270,28 +267,30 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
pmulhw m13, m11, m2 ; m13 = m11*q>>16
psignw m8, m9 ; m8 = reinsert sign
psignw m13, m10 ; m13 = reinsert sign
- mova [qcoeffq+ncoeffq*2+ 0], m8
- mova [qcoeffq+ncoeffq*2+16], m13
+ mova [r3q+ncoeffq*2+ 0], m8
+ mova [r3q+ncoeffq*2+16], m13
%ifidn %1, fp_32x32
pabsw m8, m8
pabsw m13, m13
%endif
- pmullw m8, m3 ; dqc[i] = qc[i] * q
+ pmullw m8, m3 ; r4[i] = r3[i] * q
punpckhqdq m3, m3
- pmullw m13, m3 ; dqc[i] = qc[i] * q
+ pmullw m13, m3 ; r4[i] = r3[i] * q
%ifidn %1, fp_32x32
psrlw m8, 1
psrlw m13, 1
psignw m8, m9
psignw m13, m10
psrlw m0, m3, 2
+%else
+ psrlw m0, m3, 1
%endif
- mova [dqcoeffq+ncoeffq*2+ 0], m8
- mova [dqcoeffq+ncoeffq*2+16], m13
+ mova [r4q+ncoeffq*2+ 0], m8
+ mova [r4q+ncoeffq*2+16], m13
pcmpeqw m8, m5 ; m8 = c[i] == 0
pcmpeqw m13, m5 ; m13 = c[i] == 0
- mova m6, [ iscanq+ncoeffq*2+ 0] ; m6 = scan[i]
- mova m11, [ iscanq+ncoeffq*2+16] ; m11 = scan[i]
+ mova m6, [ r5q+ncoeffq*2+ 0] ; m6 = scan[i]
+ mova m11, [ r5q+ncoeffq*2+16] ; m11 = scan[i]
psubw m6, m7 ; m6 = scan[i] + 1
psubw m11, m7 ; m11 = scan[i] + 1
pandn m8, m6 ; m8 = max(eob)
@@ -305,15 +304,15 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i]
pabsw m6, m9 ; m6 = abs(m9)
pabsw m11, m10 ; m11 = abs(m10)
-%ifidn %1, fp_32x32
+
pcmpgtw m7, m6, m0
pcmpgtw m12, m11, m0
- pmovmskb r6, m7
- pmovmskb r2, m12
+ pmovmskb r6d, m7
+ pmovmskb r2d, m12
or r6, r2
jz .skip_iter
-%endif
+
pcmpeqw m7, m7
paddsw m6, m1 ; m6 += round
@@ -322,26 +321,26 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
pmulhw m13, m11, m2 ; m13 = m11*q>>16
psignw m14, m9 ; m14 = reinsert sign
psignw m13, m10 ; m13 = reinsert sign
- mova [qcoeffq+ncoeffq*2+ 0], m14
- mova [qcoeffq+ncoeffq*2+16], m13
+ mova [r3q+ncoeffq*2+ 0], m14
+ mova [r3q+ncoeffq*2+16], m13
%ifidn %1, fp_32x32
pabsw m14, m14
pabsw m13, m13
%endif
- pmullw m14, m3 ; dqc[i] = qc[i] * q
- pmullw m13, m3 ; dqc[i] = qc[i] * q
+ pmullw m14, m3 ; r4[i] = r3[i] * q
+ pmullw m13, m3 ; r4[i] = r3[i] * q
%ifidn %1, fp_32x32
psrlw m14, 1
psrlw m13, 1
psignw m14, m9
psignw m13, m10
%endif
- mova [dqcoeffq+ncoeffq*2+ 0], m14
- mova [dqcoeffq+ncoeffq*2+16], m13
+ mova [r4q+ncoeffq*2+ 0], m14
+ mova [r4q+ncoeffq*2+16], m13
pcmpeqw m14, m5 ; m14 = c[i] == 0
pcmpeqw m13, m5 ; m13 = c[i] == 0
- mova m6, [ iscanq+ncoeffq*2+ 0] ; m6 = scan[i]
- mova m11, [ iscanq+ncoeffq*2+16] ; m11 = scan[i]
+ mova m6, [ r5q+ncoeffq*2+ 0] ; m6 = scan[i]
+ mova m11, [ r5q+ncoeffq*2+16] ; m11 = scan[i]
psubw m6, m7 ; m6 = scan[i] + 1
psubw m11, m7 ; m11 = scan[i] + 1
pandn m14, m6 ; m14 = max(eob)
@@ -351,16 +350,14 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
add ncoeffq, mmsize
jl .ac_only_loop
-%ifidn %1, fp_32x32
jmp .accumulate_eob
.skip_iter:
- mova [qcoeffq+ncoeffq*2+ 0], m5
- mova [qcoeffq+ncoeffq*2+16], m5
- mova [dqcoeffq+ncoeffq*2+ 0], m5
- mova [dqcoeffq+ncoeffq*2+16], m5
+ mova [r3q+ncoeffq*2+ 0], m5
+ mova [r3q+ncoeffq*2+16], m5
+ mova [r4q+ncoeffq*2+ 0], m5
+ mova [r4q+ncoeffq*2+16], m5
add ncoeffq, mmsize
jl .ac_only_loop
-%endif
.accumulate_eob:
; horizontally accumulate/max eobs and write into [eob] memory pointer
@@ -372,7 +369,7 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
pshuflw m7, m8, 0x1
pmaxsw m8, m7
pextrw r6, m8, 0
- mov [r2], r6
+ mov [r2], r6
RET
; skip-block, i.e. just write all zeroes
@@ -381,19 +378,19 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
movifnidn ncoeffq, ncoeffmp
mov r2, qcoeffmp
mov r3, eobmp
- DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob
- lea dqcoeffq, [dqcoeffq+ncoeffq*2]
- lea qcoeffq, [ qcoeffq+ncoeffq*2]
+
+ lea r0q, [r0q+ncoeffq*2]
+ lea r2q, [r2q+ncoeffq*2]
neg ncoeffq
pxor m7, m7
.blank_loop:
- mova [dqcoeffq+ncoeffq*2+ 0], m7
- mova [dqcoeffq+ncoeffq*2+16], m7
- mova [qcoeffq+ncoeffq*2+ 0], m7
- mova [qcoeffq+ncoeffq*2+16], m7
+ mova [r0q+ncoeffq*2+ 0], m7
+ mova [r0q+ncoeffq*2+16], m7
+ mova [r2q+ncoeffq*2+ 0], m7
+ mova [r2q+ncoeffq*2+16], m7
add ncoeffq, mmsize
jl .blank_loop
- mov word [eobq], 0
+ mov word [r3q], 0
RET
%endmacro
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_sad_ssse3.asm b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_sad_ssse3.asm
deleted file mode 100644
index 0cb35424ed6..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_sad_ssse3.asm
+++ /dev/null
@@ -1,370 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-%macro PROCESS_16X2X3 1
-%if %1
- movdqa xmm0, XMMWORD PTR [rsi]
- lddqu xmm5, XMMWORD PTR [rdi]
- lddqu xmm6, XMMWORD PTR [rdi+1]
- lddqu xmm7, XMMWORD PTR [rdi+2]
-
- psadbw xmm5, xmm0
- psadbw xmm6, xmm0
- psadbw xmm7, xmm0
-%else
- movdqa xmm0, XMMWORD PTR [rsi]
- lddqu xmm1, XMMWORD PTR [rdi]
- lddqu xmm2, XMMWORD PTR [rdi+1]
- lddqu xmm3, XMMWORD PTR [rdi+2]
-
- psadbw xmm1, xmm0
- psadbw xmm2, xmm0
- psadbw xmm3, xmm0
-
- paddw xmm5, xmm1
- paddw xmm6, xmm2
- paddw xmm7, xmm3
-%endif
- movdqa xmm0, XMMWORD PTR [rsi+rax]
- lddqu xmm1, XMMWORD PTR [rdi+rdx]
- lddqu xmm2, XMMWORD PTR [rdi+rdx+1]
- lddqu xmm3, XMMWORD PTR [rdi+rdx+2]
-
- lea rsi, [rsi+rax*2]
- lea rdi, [rdi+rdx*2]
-
- psadbw xmm1, xmm0
- psadbw xmm2, xmm0
- psadbw xmm3, xmm0
-
- paddw xmm5, xmm1
- paddw xmm6, xmm2
- paddw xmm7, xmm3
-%endmacro
-
-%macro PROCESS_16X2X3_OFFSET 2
-%if %1
- movdqa xmm0, XMMWORD PTR [rsi]
- movdqa xmm4, XMMWORD PTR [rdi]
- movdqa xmm7, XMMWORD PTR [rdi+16]
-
- movdqa xmm5, xmm7
- palignr xmm5, xmm4, %2
-
- movdqa xmm6, xmm7
- palignr xmm6, xmm4, (%2+1)
-
- palignr xmm7, xmm4, (%2+2)
-
- psadbw xmm5, xmm0
- psadbw xmm6, xmm0
- psadbw xmm7, xmm0
-%else
- movdqa xmm0, XMMWORD PTR [rsi]
- movdqa xmm4, XMMWORD PTR [rdi]
- movdqa xmm3, XMMWORD PTR [rdi+16]
-
- movdqa xmm1, xmm3
- palignr xmm1, xmm4, %2
-
- movdqa xmm2, xmm3
- palignr xmm2, xmm4, (%2+1)
-
- palignr xmm3, xmm4, (%2+2)
-
- psadbw xmm1, xmm0
- psadbw xmm2, xmm0
- psadbw xmm3, xmm0
-
- paddw xmm5, xmm1
- paddw xmm6, xmm2
- paddw xmm7, xmm3
-%endif
- movdqa xmm0, XMMWORD PTR [rsi+rax]
- movdqa xmm4, XMMWORD PTR [rdi+rdx]
- movdqa xmm3, XMMWORD PTR [rdi+rdx+16]
-
- movdqa xmm1, xmm3
- palignr xmm1, xmm4, %2
-
- movdqa xmm2, xmm3
- palignr xmm2, xmm4, (%2+1)
-
- palignr xmm3, xmm4, (%2+2)
-
- lea rsi, [rsi+rax*2]
- lea rdi, [rdi+rdx*2]
-
- psadbw xmm1, xmm0
- psadbw xmm2, xmm0
- psadbw xmm3, xmm0
-
- paddw xmm5, xmm1
- paddw xmm6, xmm2
- paddw xmm7, xmm3
-%endmacro
-
-%macro PROCESS_16X16X3_OFFSET 2
-%2_aligned_by_%1:
-
- sub rdi, %1
-
- PROCESS_16X2X3_OFFSET 1, %1
- PROCESS_16X2X3_OFFSET 0, %1
- PROCESS_16X2X3_OFFSET 0, %1
- PROCESS_16X2X3_OFFSET 0, %1
- PROCESS_16X2X3_OFFSET 0, %1
- PROCESS_16X2X3_OFFSET 0, %1
- PROCESS_16X2X3_OFFSET 0, %1
- PROCESS_16X2X3_OFFSET 0, %1
-
- jmp %2_store_off
-
-%endmacro
-
-%macro PROCESS_16X8X3_OFFSET 2
-%2_aligned_by_%1:
-
- sub rdi, %1
-
- PROCESS_16X2X3_OFFSET 1, %1
- PROCESS_16X2X3_OFFSET 0, %1
- PROCESS_16X2X3_OFFSET 0, %1
- PROCESS_16X2X3_OFFSET 0, %1
-
- jmp %2_store_off
-
-%endmacro
-
-;void int vp9_sad16x16x3_ssse3(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride,
-; int *results)
-global sym(vp9_sad16x16x3_ssse3) PRIVATE
-sym(vp9_sad16x16x3_ssse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- SAVE_XMM 7
- push rsi
- push rdi
- push rcx
- ; end prolog
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;ref_ptr
-
- mov rdx, 0xf
- and rdx, rdi
-
- jmp .vp9_sad16x16x3_ssse3_skiptable
-.vp9_sad16x16x3_ssse3_jumptable:
- dd .vp9_sad16x16x3_ssse3_aligned_by_0 - .vp9_sad16x16x3_ssse3_do_jump
- dd .vp9_sad16x16x3_ssse3_aligned_by_1 - .vp9_sad16x16x3_ssse3_do_jump
- dd .vp9_sad16x16x3_ssse3_aligned_by_2 - .vp9_sad16x16x3_ssse3_do_jump
- dd .vp9_sad16x16x3_ssse3_aligned_by_3 - .vp9_sad16x16x3_ssse3_do_jump
- dd .vp9_sad16x16x3_ssse3_aligned_by_4 - .vp9_sad16x16x3_ssse3_do_jump
- dd .vp9_sad16x16x3_ssse3_aligned_by_5 - .vp9_sad16x16x3_ssse3_do_jump
- dd .vp9_sad16x16x3_ssse3_aligned_by_6 - .vp9_sad16x16x3_ssse3_do_jump
- dd .vp9_sad16x16x3_ssse3_aligned_by_7 - .vp9_sad16x16x3_ssse3_do_jump
- dd .vp9_sad16x16x3_ssse3_aligned_by_8 - .vp9_sad16x16x3_ssse3_do_jump
- dd .vp9_sad16x16x3_ssse3_aligned_by_9 - .vp9_sad16x16x3_ssse3_do_jump
- dd .vp9_sad16x16x3_ssse3_aligned_by_10 - .vp9_sad16x16x3_ssse3_do_jump
- dd .vp9_sad16x16x3_ssse3_aligned_by_11 - .vp9_sad16x16x3_ssse3_do_jump
- dd .vp9_sad16x16x3_ssse3_aligned_by_12 - .vp9_sad16x16x3_ssse3_do_jump
- dd .vp9_sad16x16x3_ssse3_aligned_by_13 - .vp9_sad16x16x3_ssse3_do_jump
- dd .vp9_sad16x16x3_ssse3_aligned_by_14 - .vp9_sad16x16x3_ssse3_do_jump
- dd .vp9_sad16x16x3_ssse3_aligned_by_15 - .vp9_sad16x16x3_ssse3_do_jump
-.vp9_sad16x16x3_ssse3_skiptable:
-
- call .vp9_sad16x16x3_ssse3_do_jump
-.vp9_sad16x16x3_ssse3_do_jump:
- pop rcx ; get the address of do_jump
- mov rax, .vp9_sad16x16x3_ssse3_jumptable - .vp9_sad16x16x3_ssse3_do_jump
- add rax, rcx ; get the absolute address of vp9_sad16x16x3_ssse3_jumptable
-
- movsxd rax, dword [rax + 4*rdx] ; get the 32 bit offset from the jumptable
- add rcx, rax
-
- movsxd rax, dword ptr arg(1) ;src_stride
- movsxd rdx, dword ptr arg(3) ;ref_stride
-
- jmp rcx
-
- PROCESS_16X16X3_OFFSET 0, .vp9_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 1, .vp9_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 2, .vp9_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 3, .vp9_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 4, .vp9_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 5, .vp9_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 6, .vp9_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 7, .vp9_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 8, .vp9_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 9, .vp9_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 10, .vp9_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 11, .vp9_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 12, .vp9_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 13, .vp9_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 14, .vp9_sad16x16x3_ssse3
-
-.vp9_sad16x16x3_ssse3_aligned_by_15:
- PROCESS_16X2X3 1
- PROCESS_16X2X3 0
- PROCESS_16X2X3 0
- PROCESS_16X2X3 0
- PROCESS_16X2X3 0
- PROCESS_16X2X3 0
- PROCESS_16X2X3 0
- PROCESS_16X2X3 0
-
-.vp9_sad16x16x3_ssse3_store_off:
- mov rdi, arg(4) ;Results
-
- movq xmm0, xmm5
- psrldq xmm5, 8
-
- paddw xmm0, xmm5
- movd [rdi], xmm0
-;-
- movq xmm0, xmm6
- psrldq xmm6, 8
-
- paddw xmm0, xmm6
- movd [rdi+4], xmm0
-;-
- movq xmm0, xmm7
- psrldq xmm7, 8
-
- paddw xmm0, xmm7
- movd [rdi+8], xmm0
-
- ; begin epilog
- pop rcx
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;void int vp9_sad16x8x3_ssse3(
-; unsigned char *src_ptr,
-; int src_stride,
-; unsigned char *ref_ptr,
-; int ref_stride,
-; int *results)
-global sym(vp9_sad16x8x3_ssse3) PRIVATE
-sym(vp9_sad16x8x3_ssse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- SAVE_XMM 7
- push rsi
- push rdi
- push rcx
- ; end prolog
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;ref_ptr
-
- mov rdx, 0xf
- and rdx, rdi
-
- jmp .vp9_sad16x8x3_ssse3_skiptable
-.vp9_sad16x8x3_ssse3_jumptable:
- dd .vp9_sad16x8x3_ssse3_aligned_by_0 - .vp9_sad16x8x3_ssse3_do_jump
- dd .vp9_sad16x8x3_ssse3_aligned_by_1 - .vp9_sad16x8x3_ssse3_do_jump
- dd .vp9_sad16x8x3_ssse3_aligned_by_2 - .vp9_sad16x8x3_ssse3_do_jump
- dd .vp9_sad16x8x3_ssse3_aligned_by_3 - .vp9_sad16x8x3_ssse3_do_jump
- dd .vp9_sad16x8x3_ssse3_aligned_by_4 - .vp9_sad16x8x3_ssse3_do_jump
- dd .vp9_sad16x8x3_ssse3_aligned_by_5 - .vp9_sad16x8x3_ssse3_do_jump
- dd .vp9_sad16x8x3_ssse3_aligned_by_6 - .vp9_sad16x8x3_ssse3_do_jump
- dd .vp9_sad16x8x3_ssse3_aligned_by_7 - .vp9_sad16x8x3_ssse3_do_jump
- dd .vp9_sad16x8x3_ssse3_aligned_by_8 - .vp9_sad16x8x3_ssse3_do_jump
- dd .vp9_sad16x8x3_ssse3_aligned_by_9 - .vp9_sad16x8x3_ssse3_do_jump
- dd .vp9_sad16x8x3_ssse3_aligned_by_10 - .vp9_sad16x8x3_ssse3_do_jump
- dd .vp9_sad16x8x3_ssse3_aligned_by_11 - .vp9_sad16x8x3_ssse3_do_jump
- dd .vp9_sad16x8x3_ssse3_aligned_by_12 - .vp9_sad16x8x3_ssse3_do_jump
- dd .vp9_sad16x8x3_ssse3_aligned_by_13 - .vp9_sad16x8x3_ssse3_do_jump
- dd .vp9_sad16x8x3_ssse3_aligned_by_14 - .vp9_sad16x8x3_ssse3_do_jump
- dd .vp9_sad16x8x3_ssse3_aligned_by_15 - .vp9_sad16x8x3_ssse3_do_jump
-.vp9_sad16x8x3_ssse3_skiptable:
-
- call .vp9_sad16x8x3_ssse3_do_jump
-.vp9_sad16x8x3_ssse3_do_jump:
- pop rcx ; get the address of do_jump
- mov rax, .vp9_sad16x8x3_ssse3_jumptable - .vp9_sad16x8x3_ssse3_do_jump
- add rax, rcx ; get the absolute address of vp9_sad16x8x3_ssse3_jumptable
-
- movsxd rax, dword [rax + 4*rdx] ; get the 32 bit offset from the jumptable
- add rcx, rax
-
- movsxd rax, dword ptr arg(1) ;src_stride
- movsxd rdx, dword ptr arg(3) ;ref_stride
-
- jmp rcx
-
- PROCESS_16X8X3_OFFSET 0, .vp9_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 1, .vp9_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 2, .vp9_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 3, .vp9_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 4, .vp9_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 5, .vp9_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 6, .vp9_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 7, .vp9_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 8, .vp9_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 9, .vp9_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 10, .vp9_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 11, .vp9_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 12, .vp9_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 13, .vp9_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 14, .vp9_sad16x8x3_ssse3
-
-.vp9_sad16x8x3_ssse3_aligned_by_15:
-
- PROCESS_16X2X3 1
- PROCESS_16X2X3 0
- PROCESS_16X2X3 0
- PROCESS_16X2X3 0
-
-.vp9_sad16x8x3_ssse3_store_off:
- mov rdi, arg(4) ;Results
-
- movq xmm0, xmm5
- psrldq xmm5, 8
-
- paddw xmm0, xmm5
- movd [rdi], xmm0
-;-
- movq xmm0, xmm6
- psrldq xmm6, 8
-
- paddw xmm0, xmm6
- movd [rdi+4], xmm0
-;-
- movq xmm0, xmm7
- psrldq xmm7, 8
-
- paddw xmm0, xmm7
- movd [rdi+8], xmm0
-
- ; begin epilog
- pop rcx
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
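The SAD routines deleted above dispatched on the 16-byte alignment of the reference pointer through a position-independent jump table: a call/pop pair recovers the current address, and a table of 32-bit label offsets selects one of sixteen unrolled paths. A minimal C sketch of the same dispatch idea, with hypothetical helper names:

    #include <stdint.h>

    /* Hypothetical sketch of the alignment dispatch the deleted assembly
     * performed with its PIC jump table: the low four bits of the reference
     * pointer select one of 16 specialized code paths. */
    typedef void (*sad_fn)(const uint8_t *src, int src_stride,
                           const uint8_t *ref, int ref_stride, int *results);

    static void sad_dispatch_by_alignment(const uint8_t *src, int src_stride,
                                          const uint8_t *ref, int ref_stride,
                                          int *results, const sad_fn table[16]) {
      const int misalignment = (int)((uintptr_t)ref & 0xf);  /* and rdx, rdi */
      table[misalignment](src, src_stride, ref, ref_stride, results);
    }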
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_subpel_variance.asm b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_subpel_variance.asm
index 1a9e4e8b6bd..06b8b034a5e 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_subpel_variance.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_subpel_variance.asm
@@ -101,7 +101,7 @@ SECTION .text
pshufd m4, m6, 0x1
movd [r1], m7 ; store sse
paddd m6, m4
- movd rax, m6 ; store sum as return value
+ movd raxd, m6 ; store sum as return value
%else ; mmsize == 8
pshufw m4, m6, 0xe
pshufw m3, m7, 0xe
@@ -113,7 +113,7 @@ SECTION .text
movd [r1], m7 ; store sse
pshufw m4, m6, 0xe
paddd m6, m4
- movd rax, m6 ; store sum as return value
+ movd raxd, m6 ; store sum as return value
%endif
RET
%endmacro
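The two hunks above narrow the final move to the 32-bit register alias (raxd in x86inc spelling), so only the low dword of the xmm register, which holds the reduced sum, lands in the return register. That returned sum (se) is then folded together with the stored sse by the C wrappers in the next file; essentially:

    #include <stdint.h>

    /* Sketch of how the wrappers combine the asm outputs:
     * variance = sse - sum^2 / (w*h), with log2_count = log2(w*h). */
    static unsigned int variance_from_sums(int se, unsigned int sse,
                                           int log2_count) {
      return sse - (unsigned int)(((int64_t)se * se) >> log2_count);
    }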
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_variance_sse2.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_variance_sse2.c
index b4d2b0ac408..8490bbbdc2e 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_variance_sse2.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_variance_sse2.c
@@ -314,13 +314,15 @@ unsigned int vp9_mse16x16_sse2(const uint8_t *src, int src_stride,
return *sse;
}
+// The two unused parameters are placeholders for the PIC-enabled build.
#define DECL(w, opt) \
int vp9_sub_pixel_variance##w##xh_##opt(const uint8_t *src, \
ptrdiff_t src_stride, \
int x_offset, int y_offset, \
const uint8_t *dst, \
ptrdiff_t dst_stride, \
- int height, unsigned int *sse)
+ int height, unsigned int *sse, \
+ void *unused0, void *unused)
#define DECLS(opt1, opt2) \
DECL(4, opt2); \
DECL(8, opt1); \
@@ -342,26 +344,26 @@ unsigned int vp9_sub_pixel_variance##w##x##h##_##opt(const uint8_t *src, \
unsigned int sse; \
int se = vp9_sub_pixel_variance##wf##xh_##opt(src, src_stride, x_offset, \
y_offset, dst, dst_stride, \
- h, &sse); \
+ h, &sse, NULL, NULL); \
if (w > wf) { \
unsigned int sse2; \
int se2 = vp9_sub_pixel_variance##wf##xh_##opt(src + 16, src_stride, \
x_offset, y_offset, \
dst + 16, dst_stride, \
- h, &sse2); \
+ h, &sse2, NULL, NULL); \
se += se2; \
sse += sse2; \
if (w > wf * 2) { \
se2 = vp9_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \
x_offset, y_offset, \
dst + 32, dst_stride, \
- h, &sse2); \
+ h, &sse2, NULL, NULL); \
se += se2; \
sse += sse2; \
se2 = vp9_sub_pixel_variance##wf##xh_##opt(src + 48, src_stride, \
x_offset, y_offset, \
dst + 48, dst_stride, \
- h, &sse2); \
+ h, &sse2, NULL, NULL); \
se += se2; \
sse += sse2; \
} \
@@ -391,6 +393,7 @@ FNS(ssse3, ssse3);
#undef FNS
#undef FN
+// The two unused parameters are placeholders for the PIC-enabled build.
#define DECL(w, opt) \
int vp9_sub_pixel_avg_variance##w##xh_##opt(const uint8_t *src, \
ptrdiff_t src_stride, \
@@ -399,7 +402,8 @@ int vp9_sub_pixel_avg_variance##w##xh_##opt(const uint8_t *src, \
ptrdiff_t dst_stride, \
const uint8_t *sec, \
ptrdiff_t sec_stride, \
- int height, unsigned int *sse)
+ int height, unsigned int *sse, \
+ void *unused0, void *unused)
#define DECLS(opt1, opt2) \
DECL(4, opt2); \
DECL(8, opt1); \
@@ -422,26 +426,30 @@ unsigned int vp9_sub_pixel_avg_variance##w##x##h##_##opt(const uint8_t *src, \
unsigned int sse; \
int se = vp9_sub_pixel_avg_variance##wf##xh_##opt(src, src_stride, x_offset, \
y_offset, dst, dst_stride, \
- sec, w, h, &sse); \
+ sec, w, h, &sse, NULL, \
+ NULL); \
if (w > wf) { \
unsigned int sse2; \
int se2 = vp9_sub_pixel_avg_variance##wf##xh_##opt(src + 16, src_stride, \
x_offset, y_offset, \
dst + 16, dst_stride, \
- sec + 16, w, h, &sse2); \
+ sec + 16, w, h, &sse2, \
+ NULL, NULL); \
se += se2; \
sse += sse2; \
if (w > wf * 2) { \
se2 = vp9_sub_pixel_avg_variance##wf##xh_##opt(src + 32, src_stride, \
x_offset, y_offset, \
dst + 32, dst_stride, \
- sec + 32, w, h, &sse2); \
+ sec + 32, w, h, &sse2, \
+ NULL, NULL); \
se += se2; \
sse += sse2; \
se2 = vp9_sub_pixel_avg_variance##wf##xh_##opt(src + 48, src_stride, \
x_offset, y_offset, \
dst + 48, dst_stride, \
- sec + 48, w, h, &sse2); \
+ sec + 48, w, h, &sse2, \
+ NULL, NULL); \
se += se2; \
sse += sse2; \
} \
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/vp9_common.mk b/chromium/third_party/libvpx/source/libvpx/vp9/vp9_common.mk
index 9414120f64e..c9326eeea69 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/vp9_common.mk
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/vp9_common.mk
@@ -33,6 +33,7 @@ VP9_COMMON_SRCS-yes += common/vp9_entropymv.h
VP9_COMMON_SRCS-yes += common/vp9_enums.h
VP9_COMMON_SRCS-yes += common/vp9_idct.h
VP9_COMMON_SRCS-yes += common/vp9_loopfilter.h
+VP9_COMMON_SRCS-yes += common/vp9_thread_common.h
VP9_COMMON_SRCS-yes += common/vp9_mv.h
VP9_COMMON_SRCS-yes += common/vp9_onyxc_int.h
VP9_COMMON_SRCS-yes += common/vp9_pred_common.h
@@ -56,6 +57,7 @@ VP9_COMMON_SRCS-yes += common/vp9_tile_common.h
VP9_COMMON_SRCS-yes += common/vp9_tile_common.c
VP9_COMMON_SRCS-yes += common/vp9_loopfilter.c
VP9_COMMON_SRCS-yes += common/vp9_loopfilter_filters.c
+VP9_COMMON_SRCS-yes += common/vp9_thread_common.c
VP9_COMMON_SRCS-yes += common/vp9_mvref_common.c
VP9_COMMON_SRCS-yes += common/vp9_mvref_common.h
VP9_COMMON_SRCS-yes += common/vp9_quant_common.c
@@ -72,6 +74,8 @@ VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_loopfilter_intrin_ss
VP9_COMMON_SRCS-$(HAVE_AVX2) += common/x86/vp9_loopfilter_intrin_avx2.c
VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_postproc.h
VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_postproc.c
+VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_mfqe.h
+VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_mfqe.c
VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_loopfilter_mmx.asm
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpixel_8t_sse2.asm
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpixel_bilinear_sse2.asm
@@ -80,6 +84,7 @@ VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_bilinear_ssse3.asm
VP9_COMMON_SRCS-$(HAVE_AVX2) += common/x86/vp9_subpixel_8t_intrin_avx2.c
VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_8t_intrin_ssse3.c
ifeq ($(CONFIG_VP9_POSTPROC),yes)
+VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_mfqe_sse2.asm
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_postproc_sse2.asm
endif
@@ -124,35 +129,76 @@ VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_mbloop_loopfilter_dspr2.
VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_mblpf_horiz_loopfilter_dspr2.c
VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_mblpf_vert_loopfilter_dspr2.c
+# common (msa)
+VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_macros_msa.h
+VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_convolve8_horiz_msa.c
+VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_convolve8_msa.c
+VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_convolve8_vert_msa.c
+VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_convolve_avg_msa.c
+VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_convolve_copy_msa.c
+VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_convolve_msa.h
+VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct16x16_msa.c
+VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct32x32_msa.c
+
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_intrin_sse2.c
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_intrin_sse2.h
-VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_idct_intrin_ssse3.c
ifeq ($(ARCH_X86_64), yes)
VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_idct_ssse3_x86_64.asm
endif
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_convolve_neon.c
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_idct16x16_neon.c
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_loopfilter_16_neon.c
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_convolve8_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_convolve8_avg_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_loopfilter_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_loopfilter_16_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_dc_only_idct_add_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_idct4x4_1_add_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_idct4x4_add_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_idct8x8_1_add_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_idct8x8_add_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_idct16x16_1_add_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_idct16x16_add_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_idct32x32_1_add_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_idct32x32_add_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_iht4x4_add_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_iht8x8_add_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_loopfilter_16_neon_asm$(ASM)
+VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_loopfilter_8_neon_asm$(ASM)
VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_mb_lpf_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_copy_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_avg_neon$(ASM)
VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_save_reg_neon$(ASM)
-VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_reconintra_neon$(ASM)
+
+ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
+VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht4x4_add_neon.c
+VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht8x8_add_neon.c
+endif
+VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_loopfilter_neon.c
+
+# neon with assembly and intrinsics implementations. If both are available
+# prefer assembly.
+ifeq ($(HAVE_NEON_ASM), yes)
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_convolve8_avg_neon_asm$(ASM)
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_convolve8_neon_asm$(ASM)
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_convolve_avg_neon_asm$(ASM)
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_convolve_neon.c
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_copy_neon_asm$(ASM)
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct16x16_1_add_neon_asm$(ASM)
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct16x16_add_neon_asm$(ASM)
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct16x16_neon.c
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct32x32_1_add_neon_asm$(ASM)
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct32x32_add_neon_asm$(ASM)
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct4x4_1_add_neon_asm$(ASM)
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct4x4_add_neon_asm$(ASM)
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct8x8_1_add_neon_asm$(ASM)
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct8x8_add_neon_asm$(ASM)
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_loopfilter_4_neon_asm$(ASM)
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_reconintra_neon_asm$(ASM)
+else
+ifeq ($(HAVE_NEON), yes)
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_convolve8_avg_neon.c
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_convolve8_neon.c
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_convolve_avg_neon.c
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_convolve_neon.c
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_copy_neon.c
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct16x16_1_add_neon.c
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct16x16_add_neon.c
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct16x16_neon.c
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct32x32_1_add_neon.c
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct32x32_add_neon.c
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct4x4_1_add_neon.c
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct4x4_add_neon.c
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct8x8_1_add_neon.c
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct8x8_add_neon.c
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_loopfilter_16_neon.c
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_loopfilter_4_neon.c
+# TODO(johannkoenig): re-enable when the chromium build is fixed
+# https://code.google.com/p/chromium/issues/detail?id=443839
+#VP9_COMMON_SRCS-yes += common/arm/neon/vp9_loopfilter_8_neon.c
+VP9_COMMON_SRCS-yes += common/arm/neon/vp9_reconintra_neon.c
+endif # HAVE_NEON
+endif # HAVE_NEON_ASM
$(eval $(call rtcd_h_template,vp9_rtcd,vp9/common/vp9_rtcd_defs.pl))
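The final line regenerates vp9_rtcd.h from vp9_rtcd_defs.pl, the run-time CPU detection (RTCD) layer that makes per-ISA source lists like the above work: each specialized file provides an implementation, and a one-time setup routine points a function pointer at the best one available. A hedged sketch of that pattern (illustrative names, not the generated ones):

    #include <stdint.h>

    /* Illustrative RTCD-style dispatch; the generated header wires up the
     * real vp9_* entry points the same way. */
    typedef void (*idct_fn)(const int16_t *input, uint8_t *dest, int stride);

    static void idct_c(const int16_t *input, uint8_t *dest, int stride) {
      (void)input; (void)dest; (void)stride;  /* portable fallback */
    }

    static void idct_neon(const int16_t *input, uint8_t *dest, int stride) {
      (void)input; (void)dest; (void)stride;  /* SIMD specialization */
    }

    idct_fn vp9_idct_dispatch = idct_c;

    /* Run once at startup, as the generated vp9_rtcd() does. */
    void rtcd_setup(int has_neon) {
      vp9_idct_dispatch = has_neon ? idct_neon : idct_c;
    }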
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/vp9_cx_iface.c b/chromium/third_party/libvpx/source/libvpx/vp9/vp9_cx_iface.c
index d0ca5242ccb..cba15e693e9 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/vp9_cx_iface.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/vp9_cx_iface.c
@@ -12,7 +12,8 @@
#include <string.h>
#include "./vpx_config.h"
-#include "vpx/vpx_codec.h"
+#include "vpx/vpx_encoder.h"
+#include "vpx_ports/vpx_once.h"
#include "vpx/internal/vpx_codec_internal.h"
#include "./vpx_version.h"
#include "vp9/encoder/vp9_encoder.h"
@@ -33,12 +34,15 @@ struct vp9_extracfg {
vp8e_tuning tuning;
unsigned int cq_level; // constrained quality level
unsigned int rc_max_intra_bitrate_pct;
+ unsigned int rc_max_inter_bitrate_pct;
+ unsigned int gf_cbr_boost_pct;
unsigned int lossless;
unsigned int frame_parallel_decoding_mode;
AQ_MODE aq_mode;
unsigned int frame_periodic_boost;
vpx_bit_depth_t bit_depth;
vp9e_tune_content content;
+ vpx_color_space_t color_space;
};
static struct vp9_extracfg default_extra_cfg = {
@@ -47,19 +51,22 @@ static struct vp9_extracfg default_extra_cfg = {
0, // noise_sensitivity
0, // sharpness
0, // static_thresh
- 0, // tile_columns
+ 6, // tile_columns
0, // tile_rows
7, // arnr_max_frames
5, // arnr_strength
VP8_TUNE_PSNR, // tuning
10, // cq_level
0, // rc_max_intra_bitrate_pct
+ 0, // rc_max_inter_bitrate_pct
+ 0, // gf_cbr_boost_pct
0, // lossless
- 0, // frame_parallel_decoding_mode
+ 1, // frame_parallel_decoding_mode
NO_AQ, // aq_mode
0, // frame_periodic_delta_q
VPX_BITS_8, // Bit depth
- VP9E_CONTENT_DEFAULT // content
+ VP9E_CONTENT_DEFAULT, // content
+ VPX_CS_UNKNOWN, // color space
};
struct vpx_codec_alg_priv {
@@ -76,9 +83,13 @@ struct vpx_codec_alg_priv {
size_t pending_frame_sizes[8];
size_t pending_frame_magnitude;
vpx_image_t preview_img;
+ vpx_enc_frame_flags_t next_frame_flags;
vp8_postproc_cfg_t preview_ppcfg;
vpx_codec_pkt_list_decl(256) pkt_list;
unsigned int fixed_kf_cntr;
+ vpx_codec_priv_output_cx_pkt_cb_pair_t output_cx_pkt_cb;
+ // BufferPool that holds all reference frames.
+ BufferPool *buffer_pool;
};
static VP9_REFFRAME ref_frame_to_vp9_reframe(vpx_ref_frame_type_t frame) {
@@ -147,8 +158,8 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx,
RANGE_CHECK_HI(cfg, g_threads, 64);
RANGE_CHECK_HI(cfg, g_lag_in_frames, MAX_LAG_BUFFERS);
RANGE_CHECK(cfg, rc_end_usage, VPX_VBR, VPX_Q);
- RANGE_CHECK_HI(cfg, rc_undershoot_pct, 1000);
- RANGE_CHECK_HI(cfg, rc_overshoot_pct, 1000);
+ RANGE_CHECK_HI(cfg, rc_undershoot_pct, 100);
+ RANGE_CHECK_HI(cfg, rc_overshoot_pct, 100);
RANGE_CHECK_HI(cfg, rc_2pass_vbr_bias_pct, 100);
RANGE_CHECK(cfg, kf_mode, VPX_KF_DISABLED, VPX_KF_AUTO);
RANGE_CHECK_BOOL(cfg, rc_resize_allowed);
@@ -158,8 +169,8 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx,
RANGE_CHECK(cfg, g_pass, VPX_RC_ONE_PASS, VPX_RC_LAST_PASS);
if (cfg->rc_resize_allowed == 1) {
- RANGE_CHECK(cfg, rc_scaled_width, 1, cfg->g_w);
- RANGE_CHECK(cfg, rc_scaled_height, 1, cfg->g_h);
+ RANGE_CHECK(cfg, rc_scaled_width, 0, cfg->g_w);
+ RANGE_CHECK(cfg, rc_scaled_height, 0, cfg->g_h);
}
RANGE_CHECK(cfg, ss_number_layers, 1, VPX_SS_MAX_LAYERS);
@@ -202,8 +213,8 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx,
ERROR("kf_min_dist not supported in auto mode, use 0 "
"or kf_max_dist instead.");
- RANGE_CHECK_BOOL(extra_cfg, enable_auto_alt_ref);
- RANGE_CHECK(extra_cfg, cpu_used, -16, 16);
+ RANGE_CHECK(extra_cfg, enable_auto_alt_ref, 0, 2);
+ RANGE_CHECK(extra_cfg, cpu_used, -8, 8);
RANGE_CHECK_HI(extra_cfg, noise_sensitivity, 6);
RANGE_CHECK(extra_cfg, tile_columns, 0, 6);
RANGE_CHECK(extra_cfg, tile_rows, 0, 2);
@@ -288,7 +299,7 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx,
cfg->g_bit_depth == VPX_BITS_8) {
ERROR("Codec bit-depth 8 not supported in profile > 1");
}
-
+ RANGE_CHECK(extra_cfg, color_space, VPX_CS_UNKNOWN, VPX_CS_SRGB);
return VPX_CODEC_OK;
}
@@ -345,11 +356,12 @@ static int get_image_bps(const vpx_image_t *img) {
}
static vpx_codec_err_t set_encoder_config(
- VP9EncoderConfig *oxcf,
- const vpx_codec_enc_cfg_t *cfg,
- const struct vp9_extracfg *extra_cfg) {
+ VP9EncoderConfig *oxcf,
+ const vpx_codec_enc_cfg_t *cfg,
+ const struct vp9_extracfg *extra_cfg) {
const int is_vbr = cfg->rc_end_usage == VPX_VBR;
oxcf->profile = cfg->g_profile;
+ oxcf->max_threads = (int)cfg->g_threads;
oxcf->width = cfg->g_w;
oxcf->height = cfg->g_h;
oxcf->bit_depth = cfg->g_bit_depth;
@@ -380,6 +392,8 @@ static vpx_codec_err_t set_encoder_config(
// Convert target bandwidth from Kbit/s to Bit/s
oxcf->target_bandwidth = 1000 * cfg->rc_target_bitrate;
oxcf->rc_max_intra_bitrate_pct = extra_cfg->rc_max_intra_bitrate_pct;
+ oxcf->rc_max_inter_bitrate_pct = extra_cfg->rc_max_inter_bitrate_pct;
+ oxcf->gf_cbr_boost_pct = extra_cfg->gf_cbr_boost_pct;
oxcf->best_allowed_q =
extra_cfg->lossless ? 0 : vp9_quantizer_to_qindex(cfg->rc_min_quantizer);
@@ -391,9 +405,15 @@ static vpx_codec_err_t set_encoder_config(
oxcf->under_shoot_pct = cfg->rc_undershoot_pct;
oxcf->over_shoot_pct = cfg->rc_overshoot_pct;
- oxcf->allow_spatial_resampling = cfg->rc_resize_allowed;
- oxcf->scaled_frame_width = cfg->rc_scaled_width;
- oxcf->scaled_frame_height = cfg->rc_scaled_height;
+ oxcf->scaled_frame_width = cfg->rc_scaled_width;
+ oxcf->scaled_frame_height = cfg->rc_scaled_height;
+ if (cfg->rc_resize_allowed == 1) {
+ oxcf->resize_mode =
+ (oxcf->scaled_frame_width == 0 || oxcf->scaled_frame_height == 0) ?
+ RESIZE_DYNAMIC : RESIZE_FIXED;
+ } else {
+ oxcf->resize_mode = RESIZE_NONE;
+ }
oxcf->maximum_buffer_size_ms = is_vbr ? 240000 : cfg->rc_buf_sz;
oxcf->starting_buffer_level_ms = is_vbr ? 60000 : cfg->rc_buf_initial_sz;
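With this hunk the public rc_resize_allowed / rc_scaled_width / rc_scaled_height fields map onto an explicit resize_mode: scaled dimensions of zero now request encoder-driven (dynamic) resizing rather than a fixed scaled size. A usage sketch under that reading:

    #include "vpx/vpx_encoder.h"

    /* Sketch: request encoder-driven (dynamic) resizing under the new mapping. */
    static vpx_codec_err_t enable_dynamic_resize(vpx_codec_ctx_t *codec,
                                                 vpx_codec_enc_cfg_t *cfg) {
      cfg->rc_resize_allowed = 1;
      cfg->rc_scaled_width = 0;   /* 0 x 0 selects RESIZE_DYNAMIC; a fixed */
      cfg->rc_scaled_height = 0;  /* nonzero size would select RESIZE_FIXED */
      return vpx_codec_enc_config_set(codec, cfg);
    }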
@@ -412,7 +432,7 @@ static vpx_codec_err_t set_encoder_config(
oxcf->speed = abs(extra_cfg->cpu_used);
oxcf->encode_breakout = extra_cfg->static_thresh;
- oxcf->play_alternate = extra_cfg->enable_auto_alt_ref;
+ oxcf->enable_auto_arf = extra_cfg->enable_auto_alt_ref;
oxcf->noise_sensitivity = extra_cfg->noise_sensitivity;
oxcf->sharpness = extra_cfg->sharpness;
@@ -422,6 +442,7 @@ static vpx_codec_err_t set_encoder_config(
oxcf->firstpass_mb_stats_in = cfg->rc_firstpass_mb_stats_in;
#endif
+ oxcf->color_space = extra_cfg->color_space;
oxcf->arnr_max_frames = extra_cfg->arnr_max_frames;
oxcf->arnr_strength = extra_cfg->arnr_strength;
@@ -445,13 +466,13 @@ static vpx_codec_err_t set_encoder_config(
for (i = 0; i < VPX_SS_MAX_LAYERS; ++i) {
oxcf->ss_target_bitrate[i] = 1000 * cfg->ss_target_bitrate[i];
#if CONFIG_SPATIAL_SVC
- oxcf->ss_play_alternate[i] = cfg->ss_enable_auto_alt_ref[i];
+ oxcf->ss_enable_auto_arf[i] = cfg->ss_enable_auto_alt_ref[i];
#endif
}
} else if (oxcf->ss_number_layers == 1) {
oxcf->ss_target_bitrate[0] = (int)oxcf->target_bandwidth;
#if CONFIG_SPATIAL_SVC
- oxcf->ss_play_alternate[0] = extra_cfg->enable_auto_alt_ref;
+ oxcf->ss_enable_auto_arf[0] = extra_cfg->enable_auto_alt_ref;
#endif
}
@@ -493,7 +514,7 @@ static vpx_codec_err_t set_encoder_config(
printf("two_pass_vbrmin_section: %d\n", oxcf->two_pass_vbrmin_section);
printf("two_pass_vbrmax_section: %d\n", oxcf->two_pass_vbrmax_section);
printf("lag_in_frames: %d\n", oxcf->lag_in_frames);
- printf("play_alternate: %d\n", oxcf->play_alternate);
+ printf("enable_auto_arf: %d\n", oxcf->enable_auto_arf);
printf("Version: %d\n", oxcf->Version);
printf("encode_breakout: %d\n", oxcf->encode_breakout);
printf("error resilient: %d\n", oxcf->error_resilient_mode);
@@ -506,9 +527,16 @@ static vpx_codec_err_t set_encoder_config(
static vpx_codec_err_t encoder_set_config(vpx_codec_alg_priv_t *ctx,
const vpx_codec_enc_cfg_t *cfg) {
vpx_codec_err_t res;
-
- if (cfg->g_w != ctx->cfg.g_w || cfg->g_h != ctx->cfg.g_h)
- ERROR("Cannot change width or height after initialization");
+ int force_key = 0;
+
+ if (cfg->g_w != ctx->cfg.g_w || cfg->g_h != ctx->cfg.g_h) {
+ if (cfg->g_lag_in_frames > 1 || cfg->g_pass != VPX_RC_ONE_PASS)
+ ERROR("Cannot change width or height after initialization");
+ if (!valid_ref_frame_size(ctx->cfg.g_w, ctx->cfg.g_h, cfg->g_w, cfg->g_h) ||
+ (ctx->cpi->initial_width && (int)cfg->g_w > ctx->cpi->initial_width) ||
+ (ctx->cpi->initial_height && (int)cfg->g_h > ctx->cpi->initial_height))
+ force_key = 1;
+ }
// Prevent increasing lag_in_frames. This check is stricter than it needs
// to be -- the limit is not increasing past the first lag_in_frames
@@ -522,9 +550,14 @@ static vpx_codec_err_t encoder_set_config(vpx_codec_alg_priv_t *ctx,
if (res == VPX_CODEC_OK) {
ctx->cfg = *cfg;
set_encoder_config(&ctx->oxcf, &ctx->cfg, &ctx->extra_cfg);
+ // On profile change, request a key frame
+ force_key |= ctx->cpi->common.profile != ctx->oxcf.profile;
vp9_change_config(ctx->cpi, &ctx->oxcf);
}
+ if (force_key)
+ ctx->next_frame_flags |= VPX_EFLAG_FORCE_KF;
+
return res;
}
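encoder_set_config now tolerates a mid-stream resolution change in one-pass, low-lag configurations, and forces a key frame (via next_frame_flags) whenever the new size cannot be coded predictively from the old buffers, or on a profile change. A caller-side sketch, assuming a codec initialized for one-pass encoding with g_lag_in_frames <= 1:

    #include "vpx/vpx_encoder.h"

    /* Sketch: change the coded size mid-encode; a key frame is forced
     * automatically when prediction from the old size is impossible. */
    static vpx_codec_err_t change_resolution(vpx_codec_ctx_t *codec,
                                             vpx_codec_enc_cfg_t *cfg,
                                             unsigned int w, unsigned int h) {
      cfg->g_w = w;
      cfg->g_h = h;
      return vpx_codec_enc_config_set(codec, cfg);
    }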
@@ -649,6 +682,22 @@ static vpx_codec_err_t ctrl_set_rc_max_intra_bitrate_pct(
return update_extra_cfg(ctx, &extra_cfg);
}
+static vpx_codec_err_t ctrl_set_rc_max_inter_bitrate_pct(
+ vpx_codec_alg_priv_t *ctx, va_list args) {
+ struct vp9_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.rc_max_inter_bitrate_pct =
+ CAST(VP8E_SET_MAX_INTER_BITRATE_PCT, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static vpx_codec_err_t ctrl_set_rc_gf_cbr_boost_pct(
+ vpx_codec_alg_priv_t *ctx, va_list args) {
+ struct vp9_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.gf_cbr_boost_pct =
+ CAST(VP9E_SET_GF_CBR_BOOST_PCT, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
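These two controls expose the new rate-control fields added to vp9_extracfg above. A usage sketch with illustrative values:

    #include "vpx/vp8cx.h"

    /* Sketch: cap inter frames at 150% of the average frame budget and give
     * CBR golden frames a 25% bitrate boost. Values are illustrative. */
    static void tune_cbr_rate_control(vpx_codec_ctx_t *codec) {
      vpx_codec_control(codec, VP9E_SET_MAX_INTER_BITRATE_PCT, 150);
      vpx_codec_control(codec, VP9E_SET_GF_CBR_BOOST_PCT, 25);
    }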
static vpx_codec_err_t ctrl_set_lossless(vpx_codec_alg_priv_t *ctx,
va_list args) {
struct vp9_extracfg extra_cfg = ctx->extra_cfg;
@@ -691,6 +740,16 @@ static vpx_codec_err_t encoder_init(vpx_codec_ctx_t *ctx,
ctx->priv = (vpx_codec_priv_t *)priv;
ctx->priv->init_flags = ctx->init_flags;
ctx->priv->enc.total_encoders = 1;
+ priv->buffer_pool =
+ (BufferPool *)vpx_calloc(1, sizeof(BufferPool));
+ if (priv->buffer_pool == NULL)
+ return VPX_CODEC_MEM_ERROR;
+
+#if CONFIG_MULTITHREAD
+ if (pthread_mutex_init(&priv->buffer_pool->pool_mutex, NULL)) {
+ return VPX_CODEC_MEM_ERROR;
+ }
+#endif
if (ctx->config.enc) {
// Update the reference to the config structure to an internal copy.
@@ -699,7 +758,7 @@ static vpx_codec_err_t encoder_init(vpx_codec_ctx_t *ctx,
}
priv->extra_cfg = default_extra_cfg;
- vp9_initialize_enc();
+ once(vp9_initialize_enc);
res = validate_config(priv, &priv->cfg, &priv->extra_cfg);
@@ -709,7 +768,7 @@ static vpx_codec_err_t encoder_init(vpx_codec_ctx_t *ctx,
priv->oxcf.use_highbitdepth =
(ctx->init_flags & VPX_CODEC_USE_HIGHBITDEPTH) ? 1 : 0;
#endif
- priv->cpi = vp9_create_compressor(&priv->oxcf);
+ priv->cpi = vp9_create_compressor(&priv->oxcf, priv->buffer_pool);
if (priv->cpi == NULL)
res = VPX_CODEC_MEM_ERROR;
else
@@ -723,6 +782,10 @@ static vpx_codec_err_t encoder_init(vpx_codec_ctx_t *ctx,
static vpx_codec_err_t encoder_destroy(vpx_codec_alg_priv_t *ctx) {
free(ctx->cx_data);
vp9_remove_compressor(ctx->cpi);
+#if CONFIG_MULTITHREAD
+ pthread_mutex_destroy(&ctx->buffer_pool->pool_mutex);
+#endif
+ vpx_free(ctx->buffer_pool);
vpx_free(ctx);
return VPX_CODEC_OK;
}
@@ -861,22 +924,26 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx,
vpx_codec_err_t res = VPX_CODEC_OK;
VP9_COMP *const cpi = ctx->cpi;
const vpx_rational_t *const timebase = &ctx->cfg.g_timebase;
+ size_t data_sz;
if (img != NULL) {
res = validate_img(ctx, img);
// TODO(jzern) the checks related to cpi's validity should be treated as a
// failure condition, encoder setup is done fully in init() currently.
- if (res == VPX_CODEC_OK && cpi != NULL && ctx->cx_data == NULL) {
+ if (res == VPX_CODEC_OK && cpi != NULL) {
// There's no codec control for multiple alt-refs so check the encoder
// instance for its status to determine the compressed data size.
- ctx->cx_data_sz = ctx->cfg.g_w * ctx->cfg.g_h *
- get_image_bps(img) / 8 *
- (cpi->multi_arf_allowed ? 8 : 2);
- if (ctx->cx_data_sz < 4096) ctx->cx_data_sz = 4096;
-
- ctx->cx_data = (unsigned char *)malloc(ctx->cx_data_sz);
- if (ctx->cx_data == NULL) {
- return VPX_CODEC_MEM_ERROR;
+ data_sz = ctx->cfg.g_w * ctx->cfg.g_h * get_image_bps(img) / 8 *
+ (cpi->multi_arf_allowed ? 8 : 2);
+ if (data_sz < 4096)
+ data_sz = 4096;
+ if (ctx->cx_data == NULL || ctx->cx_data_sz < data_sz) {
+ ctx->cx_data_sz = data_sz;
+ free(ctx->cx_data);
+ ctx->cx_data = (unsigned char*)malloc(ctx->cx_data_sz);
+ if (ctx->cx_data == NULL) {
+ return VPX_CODEC_MEM_ERROR;
+ }
}
}
}
@@ -921,10 +988,11 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx,
// Store the original flags in to the frame buffer. Will extract the
// key frame flag when we actually encode this frame.
- if (vp9_receive_raw_frame(cpi, flags,
+ if (vp9_receive_raw_frame(cpi, flags | ctx->next_frame_flags,
&sd, dst_time_stamp, dst_end_time_stamp)) {
res = update_error_state(ctx, &cpi->common.error);
}
+ ctx->next_frame_flags = 0;
}
cx_data = ctx->cx_data;
@@ -972,6 +1040,24 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx,
ctx->pending_frame_magnitude |= size;
cx_data += size;
cx_data_sz -= size;
+
+ if (ctx->output_cx_pkt_cb.output_cx_pkt) {
+ pkt.kind = VPX_CODEC_CX_FRAME_PKT;
+ pkt.data.frame.pts = ticks_to_timebase_units(timebase,
+ dst_time_stamp);
+ pkt.data.frame.duration =
+ (unsigned long)ticks_to_timebase_units(timebase,
+ dst_end_time_stamp - dst_time_stamp);
+ pkt.data.frame.flags = get_frame_pkt_flags(cpi, lib_flags);
+ pkt.data.frame.buf = ctx->pending_cx_data;
+ pkt.data.frame.sz = size;
+ ctx->pending_cx_data = NULL;
+ ctx->pending_cx_data_sz = 0;
+ ctx->pending_frame_count = 0;
+ ctx->pending_frame_magnitude = 0;
+ ctx->output_cx_pkt_cb.output_cx_pkt(
+ &pkt, ctx->output_cx_pkt_cb.user_priv);
+ }
continue;
}
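This block services the new per-frame packet callback: when one is registered, each compressed frame is handed out immediately and no superframe index is written. A hedged sketch of registering such a callback, assuming the pair type from vpx/internal/vpx_codec_internal.h, the ABI-gated VP9E_REGISTER_CX_CALLBACK control from the map below, and a callback signature of (vpx_codec_cx_pkt_t *, void *):

    #include <stddef.h>
    #include "vpx/vp8cx.h"
    #include "vpx/internal/vpx_codec_internal.h"

    /* Receive each compressed frame directly; no superframe index is
     * emitted on this path. Signature is assumed, not confirmed here. */
    static void on_packet(vpx_codec_cx_pkt_t *pkt, void *user_priv) {
      (void)user_priv;
      if (pkt->kind == VPX_CODEC_CX_FRAME_PKT) {
        /* consume pkt->data.frame.buf, pkt->data.frame.sz here */
      }
    }

    static vpx_codec_err_t register_packet_callback(vpx_codec_ctx_t *codec) {
      vpx_codec_priv_output_cx_pkt_cb_pair_t cb = { on_packet, NULL };
      return vpx_codec_control(codec, VP9E_REGISTER_CX_CALLBACK, &cb);
    }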
@@ -987,7 +1073,9 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx,
ctx->pending_frame_sizes[ctx->pending_frame_count++] = size;
ctx->pending_frame_magnitude |= size;
ctx->pending_cx_data_sz += size;
- size += write_superframe_index(ctx);
+ // Write the superframe index only when no output callback is registered.
+ if (!ctx->output_cx_pkt_cb.output_cx_pkt)
+ size += write_superframe_index(ctx);
pkt.data.frame.buf = ctx->pending_cx_data;
pkt.data.frame.sz = ctx->pending_cx_data_sz;
ctx->pending_cx_data = NULL;
@@ -999,11 +1087,16 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx,
pkt.data.frame.sz = size;
}
pkt.data.frame.partition_id = -1;
- vpx_codec_pkt_list_add(&ctx->pkt_list.head, &pkt);
+
+ if (ctx->output_cx_pkt_cb.output_cx_pkt)
+ ctx->output_cx_pkt_cb.output_cx_pkt(&pkt, ctx->output_cx_pkt_cb.user_priv);
+ else
+ vpx_codec_pkt_list_add(&ctx->pkt_list.head, &pkt);
+
cx_data += size;
cx_data_sz -= size;
#if CONFIG_SPATIAL_SVC
- if (is_two_pass_svc(cpi)) {
+ if (is_two_pass_svc(cpi) && !ctx->output_cx_pkt_cb.output_cx_pkt) {
vpx_codec_cx_pkt_t pkt_sizes, pkt_psnr;
int i;
vp9_zero(pkt_sizes);
@@ -1016,7 +1109,9 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx,
pkt_psnr.data.layer_psnr[i] = lc->psnr_pkt;
lc->layer_size = 0;
}
+
vpx_codec_pkt_list_add(&ctx->pkt_list.head, &pkt_sizes);
+
vpx_codec_pkt_list_add(&ctx->pkt_list.head, &pkt_psnr);
}
#endif
@@ -1165,6 +1260,21 @@ static vpx_codec_err_t ctrl_set_active_map(vpx_codec_alg_priv_t *ctx,
}
}
+static vpx_codec_err_t ctrl_get_active_map(vpx_codec_alg_priv_t *ctx,
+ va_list args) {
+ vpx_active_map_t *const map = va_arg(args, vpx_active_map_t *);
+
+ if (map) {
+ if (!vp9_get_active_map(ctx->cpi, map->active_map,
+ (int)map->rows, (int)map->cols))
+ return VPX_CODEC_OK;
+ else
+ return VPX_CODEC_INVALID_PARAM;
+ } else {
+ return VPX_CODEC_INVALID_PARAM;
+ }
+}
+
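The new getter mirrors the active-map setter: the caller supplies a vpx_active_map_t whose dimensions must match the encoder's map. A sketch, assuming the 16x16 macroblock granularity the setter uses:

    #include <stdlib.h>
    #include "vpx/vp8cx.h"
    #include "vpx/vpx_encoder.h"

    /* Sketch: read back the encoder's active map. The 16x16 granularity of
     * rows/cols is an assumption here. */
    static int read_active_map(vpx_codec_ctx_t *codec,
                               unsigned int w, unsigned int h) {
      vpx_active_map_t map;
      int ok;
      map.rows = (h + 15) / 16;
      map.cols = (w + 15) / 16;
      map.active_map = (unsigned char *)malloc(map.rows * map.cols);
      if (map.active_map == NULL) return -1;
      ok = vpx_codec_control(codec, VP9E_GET_ACTIVEMAP, &map) == VPX_CODEC_OK;
      free(map.active_map);
      return ok ? 0 : -1;
    }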
static vpx_codec_err_t ctrl_set_scale_mode(vpx_codec_alg_priv_t *ctx,
va_list args) {
vpx_scaling_mode_t *const mode = va_arg(args, vpx_scaling_mode_t *);
@@ -1203,7 +1313,9 @@ static vpx_codec_err_t ctrl_set_svc_layer_id(vpx_codec_alg_priv_t *ctx,
VP9_COMP *const cpi = (VP9_COMP *)ctx->cpi;
SVC *const svc = &cpi->svc;
+#if VPX_ENCODER_ABI_VERSION > (4 + VPX_CODEC_ABI_VERSION)
svc->spatial_layer_id = data->spatial_layer_id;
+#endif
svc->temporal_layer_id = data->temporal_layer_id;
// Checks on valid layer_id input.
if (svc->temporal_layer_id < 0 ||
@@ -1217,6 +1329,20 @@ static vpx_codec_err_t ctrl_set_svc_layer_id(vpx_codec_alg_priv_t *ctx,
return VPX_CODEC_OK;
}
+static vpx_codec_err_t ctrl_get_svc_layer_id(vpx_codec_alg_priv_t *ctx,
+ va_list args) {
+ vpx_svc_layer_id_t *data = va_arg(args, vpx_svc_layer_id_t *);
+ VP9_COMP *const cpi = (VP9_COMP *)ctx->cpi;
+ SVC *const svc = &cpi->svc;
+
+#if VPX_ENCODER_ABI_VERSION > (4 + VPX_CODEC_ABI_VERSION)
+ data->spatial_layer_id = svc->spatial_layer_id;
+#endif
+ data->temporal_layer_id = svc->temporal_layer_id;
+
+ return VPX_CODEC_OK;
+}
+
static vpx_codec_err_t ctrl_set_svc_parameters(vpx_codec_alg_priv_t *ctx,
va_list args) {
VP9_COMP *const cpi = ctx->cpi;
@@ -1235,6 +1361,16 @@ static vpx_codec_err_t ctrl_set_svc_parameters(vpx_codec_alg_priv_t *ctx,
return VPX_CODEC_OK;
}
+static vpx_codec_err_t ctrl_register_cx_callback(vpx_codec_alg_priv_t *ctx,
+ va_list args) {
+ vpx_codec_priv_output_cx_pkt_cb_pair_t *cbp =
+ (vpx_codec_priv_output_cx_pkt_cb_pair_t *)va_arg(args, void *);
+ ctx->output_cx_pkt_cb.output_cx_pkt = cbp->output_cx_pkt;
+ ctx->output_cx_pkt_cb.user_priv = cbp->user_priv;
+
+ return VPX_CODEC_OK;
+}
+
static vpx_codec_err_t ctrl_set_tune_content(vpx_codec_alg_priv_t *ctx,
va_list args) {
struct vp9_extracfg extra_cfg = ctx->extra_cfg;
@@ -1242,6 +1378,13 @@ static vpx_codec_err_t ctrl_set_tune_content(vpx_codec_alg_priv_t *ctx,
return update_extra_cfg(ctx, &extra_cfg);
}
+static vpx_codec_err_t ctrl_set_color_space(vpx_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct vp9_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.color_space = CAST(VP9E_SET_COLOR_SPACE, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
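The color space set here is validated against the VPX_CS_UNKNOWN..VPX_CS_SRGB range above and written into the bitstream header. A usage sketch, assuming the vpx_color_space_t enumerators in vpx/vpx_image.h (VPX_CS_BT_709 here):

    #include "vpx/vp8cx.h"
    #include "vpx/vpx_image.h"

    /* Sketch: tag the stream as BT.709, a value inside the validated
     * VPX_CS_UNKNOWN..VPX_CS_SRGB range. */
    static vpx_codec_err_t tag_bt709(vpx_codec_ctx_t *codec) {
      return vpx_codec_control(codec, VP9E_SET_COLOR_SPACE, VPX_CS_BT_709);
    }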
static vpx_codec_ctrl_fn_map_t encoder_ctrl_maps[] = {
{VP8_COPY_REFERENCE, ctrl_copy_reference},
{VP8E_UPD_ENTROPY, ctrl_update_entropy},
@@ -1266,20 +1409,30 @@ static vpx_codec_ctrl_fn_map_t encoder_ctrl_maps[] = {
{VP8E_SET_TUNING, ctrl_set_tuning},
{VP8E_SET_CQ_LEVEL, ctrl_set_cq_level},
{VP8E_SET_MAX_INTRA_BITRATE_PCT, ctrl_set_rc_max_intra_bitrate_pct},
+ {VP9E_SET_MAX_INTER_BITRATE_PCT, ctrl_set_rc_max_inter_bitrate_pct},
+ {VP9E_SET_GF_CBR_BOOST_PCT, ctrl_set_rc_gf_cbr_boost_pct},
{VP9E_SET_LOSSLESS, ctrl_set_lossless},
{VP9E_SET_FRAME_PARALLEL_DECODING, ctrl_set_frame_parallel_decoding_mode},
{VP9E_SET_AQ_MODE, ctrl_set_aq_mode},
{VP9E_SET_FRAME_PERIODIC_BOOST, ctrl_set_frame_periodic_boost},
{VP9E_SET_SVC, ctrl_set_svc},
+#if VPX_ENCODER_ABI_VERSION > (4 + VPX_CODEC_ABI_VERSION)
{VP9E_SET_SVC_PARAMETERS, ctrl_set_svc_parameters},
+ {VP9E_REGISTER_CX_CALLBACK, ctrl_register_cx_callback},
+#endif
{VP9E_SET_SVC_LAYER_ID, ctrl_set_svc_layer_id},
{VP9E_SET_TUNE_CONTENT, ctrl_set_tune_content},
+ {VP9E_SET_COLOR_SPACE, ctrl_set_color_space},
{VP9E_SET_NOISE_SENSITIVITY, ctrl_set_noise_sensitivity},
// Getters
{VP8E_GET_LAST_QUANTIZER, ctrl_get_quantizer},
{VP8E_GET_LAST_QUANTIZER_64, ctrl_get_quantizer64},
{VP9_GET_REFERENCE, ctrl_get_reference},
+#if VPX_ENCODER_ABI_VERSION > (4 + VPX_CODEC_ABI_VERSION)
+ {VP9E_GET_SVC_LAYER_ID, ctrl_get_svc_layer_id},
+#endif
+ {VP9E_GET_ACTIVEMAP, ctrl_get_active_map},
{ -1, NULL},
};
@@ -1289,7 +1442,7 @@ static vpx_codec_enc_cfg_map_t encoder_usage_cfg_map[] = {
0,
{ // NOLINT
0, // g_usage
- 0, // g_threads
+ 8, // g_threads
0, // g_profile
320, // g_width
@@ -1307,21 +1460,19 @@ static vpx_codec_enc_cfg_map_t encoder_usage_cfg_map[] = {
0, // rc_dropframe_thresh
0, // rc_resize_allowed
- 1, // rc_scaled_width
- 1, // rc_scaled_height
+ 0, // rc_scaled_width
+ 0, // rc_scaled_height
60, // rc_resize_down_thresold
30, // rc_resize_up_thresold
VPX_VBR, // rc_end_usage
-#if VPX_ENCODER_ABI_VERSION > (1 + VPX_CODEC_ABI_VERSION)
{NULL, 0}, // rc_twopass_stats_in
{NULL, 0}, // rc_firstpass_mb_stats_in
-#endif
256, // rc_target_bandwidth
0, // rc_min_quantizer
63, // rc_max_quantizer
- 100, // rc_undershoot_pct
- 100, // rc_overshoot_pct
+ 25, // rc_undershoot_pct
+ 25, // rc_overshoot_pct
6000, // rc_max_buffer_size
4000, // rc_buffer_initial_size
@@ -1344,9 +1495,6 @@ static vpx_codec_enc_cfg_map_t encoder_usage_cfg_map[] = {
{0}, // ts_rate_decimator
0, // ts_periodicity
{0}, // ts_layer_id
-#if VPX_ENCODER_ABI_VERSION == (1 + VPX_CODEC_ABI_VERSION)
- "vp8.fpf" // first pass filename
-#endif
}
},
};
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/vp9_dx_iface.c b/chromium/third_party/libvpx/source/libvpx/vp9/vp9_dx_iface.c
index 85e32d351e9..ff76204d822 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/vp9_dx_iface.c
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/vp9_dx_iface.c
@@ -11,13 +11,16 @@
#include <stdlib.h>
#include <string.h>
+#include "./vpx_config.h"
#include "./vpx_version.h"
#include "vpx/internal/vpx_codec_internal.h"
#include "vpx/vp8dx.h"
#include "vpx/vpx_decoder.h"
+#include "vp9/common/vp9_alloccommon.h"
#include "vp9/common/vp9_frame_buffers.h"
+#include "vp9/common/vp9_thread.h"
#include "vp9/decoder/vp9_decoder.h"
#include "vp9/decoder/vp9_decodeframe.h"
@@ -29,20 +32,45 @@
typedef vpx_codec_stream_info_t vp9_stream_info_t;
+// This limit follows from the fixed number of frame buffers.
+// TODO(hkuang): Remove this limit after implementing on-demand frame buffers.
+#define FRAME_CACHE_SIZE 6 // Cache maximum 6 decoded frames.
+
+typedef struct cache_frame {
+ int fb_idx;
+ vpx_image_t img;
+} cache_frame;
+
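The cache is a fixed-size ring: decoder_decode advances frame_cache_write modulo FRAME_CACHE_SIZE, decoder_get_frame advances frame_cache_read, and num_cache_frames tracks occupancy. A minimal sketch of that discipline:

    /* Minimal sketch of the ring-buffer discipline the frame cache follows. */
    #define CACHE_SIZE 6

    typedef struct {
      int write, read, count;
      int slots[CACHE_SIZE];
    } ring;

    static int ring_push(ring *r, int v) {
      if (r->count == CACHE_SIZE) return 0;  /* full: caller must drain first */
      r->slots[r->write] = v;
      r->write = (r->write + 1) % CACHE_SIZE;
      ++r->count;
      return 1;
    }

    static int ring_pop(ring *r, int *v) {
      if (r->count == 0) return 0;           /* empty */
      *v = r->slots[r->read];
      r->read = (r->read + 1) % CACHE_SIZE;
      --r->count;
      return 1;
    }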
struct vpx_codec_alg_priv {
vpx_codec_priv_t base;
vpx_codec_dec_cfg_t cfg;
vp9_stream_info_t si;
- struct VP9Decoder *pbi;
int postproc_cfg_set;
vp8_postproc_cfg_t postproc_cfg;
vpx_decrypt_cb decrypt_cb;
- void *decrypt_state;
+ void *decrypt_state;
vpx_image_t img;
int img_avail;
int flushed;
int invert_tile_order;
+ int last_show_frame; // Index of last output frame.
+ int byte_alignment;
+
+ // Frame parallel related.
int frame_parallel_decode; // frame-based threading.
+ VP9Worker *frame_workers;
+ int num_frame_workers;
+ int next_submit_worker_id;
+ int last_submit_worker_id;
+ int next_output_worker_id;
+ int available_threads;
+ cache_frame frame_cache[FRAME_CACHE_SIZE];
+ int frame_cache_write;
+ int frame_cache_read;
+ int num_cache_frames;
+ int need_resync; // wait for key/intra-only frame
+ // BufferPool that holds all reference frames. Shared by all the FrameWorkers.
+ BufferPool *buffer_pool;
// External frame buffer info to save for VP9 common.
void *ext_priv; // Private data associated with the external frame buffers.
@@ -64,13 +92,12 @@ static vpx_codec_err_t decoder_init(vpx_codec_ctx_t *ctx,
ctx->priv = (vpx_codec_priv_t *)priv;
ctx->priv->init_flags = ctx->init_flags;
-
priv->si.sz = sizeof(priv->si);
priv->flushed = 0;
+ // Only do frame parallel decode when threads > 1.
priv->frame_parallel_decode =
- (ctx->init_flags & VPX_CODEC_USE_FRAME_THREADING);
- priv->frame_parallel_decode = 0; // Disable for now
-
+ (ctx->config.dec && (ctx->config.dec->threads > 1) &&
+ (ctx->init_flags & VPX_CODEC_USE_FRAME_THREADING)) ? 1 : 0;
if (ctx->config.dec) {
priv->cfg = *ctx->config.dec;
ctx->config.dec = &priv->cfg;
@@ -81,24 +108,48 @@ static vpx_codec_err_t decoder_init(vpx_codec_ctx_t *ctx,
}
static vpx_codec_err_t decoder_destroy(vpx_codec_alg_priv_t *ctx) {
- if (ctx->pbi) {
- vp9_decoder_remove(ctx->pbi);
- ctx->pbi = NULL;
+ if (ctx->frame_workers != NULL) {
+ int i;
+ for (i = 0; i < ctx->num_frame_workers; ++i) {
+ VP9Worker *const worker = &ctx->frame_workers[i];
+ FrameWorkerData *const frame_worker_data =
+ (FrameWorkerData *)worker->data1;
+ vp9_get_worker_interface()->end(worker);
+ vp9_remove_common(&frame_worker_data->pbi->common);
+#if CONFIG_VP9_POSTPROC
+ vp9_free_postproc_buffers(&frame_worker_data->pbi->common);
+#endif
+ vp9_decoder_remove(frame_worker_data->pbi);
+ vpx_free(frame_worker_data->scratch_buffer);
+#if CONFIG_MULTITHREAD
+ pthread_mutex_destroy(&frame_worker_data->stats_mutex);
+ pthread_cond_destroy(&frame_worker_data->stats_cond);
+#endif
+ vpx_free(frame_worker_data);
+ }
+#if CONFIG_MULTITHREAD
+ pthread_mutex_destroy(&ctx->buffer_pool->pool_mutex);
+#endif
}
- vpx_free(ctx);
+ if (ctx->buffer_pool) {
+ vp9_free_ref_frame_buffers(ctx->buffer_pool);
+ vp9_free_internal_frame_buffers(&ctx->buffer_pool->int_frame_buffers);
+ }
+ vpx_free(ctx->frame_workers);
+ vpx_free(ctx->buffer_pool);
+ vpx_free(ctx);
return VPX_CODEC_OK;
}
static int parse_bitdepth_colorspace_sampling(
BITSTREAM_PROFILE profile, struct vp9_read_bit_buffer *rb) {
- const int sRGB = 7;
- int colorspace;
+ vpx_color_space_t color_space;
if (profile >= PROFILE_2)
rb->bit_offset += 1; // Bit-depth 10 or 12.
- colorspace = vp9_rb_read_literal(rb, 3);
- if (colorspace != sRGB) {
+ color_space = (vpx_color_space_t)vp9_rb_read_literal(rb, 3);
+ if (color_space != VPX_CS_SRGB) {
rb->bit_offset += 1; // [16,235] (including xvycc) vs [0,255] range.
if (profile == PROFILE_1 || profile == PROFILE_3) {
rb->bit_offset += 2; // subsampling x/y.
@@ -146,7 +197,11 @@ static vpx_codec_err_t decoder_peek_si_internal(const uint8_t *data,
if (frame_marker != VP9_FRAME_MARKER)
return VPX_CODEC_UNSUP_BITSTREAM;
- if (profile >= MAX_PROFILES) return VPX_CODEC_UNSUP_BITSTREAM;
+ if (profile >= MAX_PROFILES)
+ return VPX_CODEC_UNSUP_BITSTREAM;
+
+ if ((profile >= 2 && data_sz <= 1) || data_sz < 1)
+ return VPX_CODEC_UNSUP_BITSTREAM;
if (vp9_rb_read_bit(&rb)) { // show an existing frame
vp9_rb_read_literal(&rb, 3); // Frame buffer to show.
@@ -206,32 +261,45 @@ static vpx_codec_err_t decoder_get_si(vpx_codec_alg_priv_t *ctx,
return VPX_CODEC_OK;
}
+static void set_error_detail(vpx_codec_alg_priv_t *ctx,
+ const char *const error) {
+ ctx->base.err_detail = error;
+}
+
static vpx_codec_err_t update_error_state(vpx_codec_alg_priv_t *ctx,
const struct vpx_internal_error_info *error) {
if (error->error_code)
- ctx->base.err_detail = error->has_detail ? error->detail : NULL;
+ set_error_detail(ctx, error->has_detail ? error->detail : NULL);
return error->error_code;
}
static void init_buffer_callbacks(vpx_codec_alg_priv_t *ctx) {
- VP9_COMMON *const cm = &ctx->pbi->common;
+ int i;
- cm->new_fb_idx = -1;
+ for (i = 0; i < ctx->num_frame_workers; ++i) {
+ VP9Worker *const worker = &ctx->frame_workers[i];
+ FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
+ VP9_COMMON *const cm = &frame_worker_data->pbi->common;
+ BufferPool *const pool = cm->buffer_pool;
- if (ctx->get_ext_fb_cb != NULL && ctx->release_ext_fb_cb != NULL) {
- cm->get_fb_cb = ctx->get_ext_fb_cb;
- cm->release_fb_cb = ctx->release_ext_fb_cb;
- cm->cb_priv = ctx->ext_priv;
- } else {
- cm->get_fb_cb = vp9_get_frame_buffer;
- cm->release_fb_cb = vp9_release_frame_buffer;
+ cm->new_fb_idx = INVALID_IDX;
+ cm->byte_alignment = ctx->byte_alignment;
- if (vp9_alloc_internal_frame_buffers(&cm->int_frame_buffers))
- vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
- "Failed to initialize internal frame buffers");
+ if (ctx->get_ext_fb_cb != NULL && ctx->release_ext_fb_cb != NULL) {
+ pool->get_fb_cb = ctx->get_ext_fb_cb;
+ pool->release_fb_cb = ctx->release_ext_fb_cb;
+ pool->cb_priv = ctx->ext_priv;
+ } else {
+ pool->get_fb_cb = vp9_get_frame_buffer;
+ pool->release_fb_cb = vp9_release_frame_buffer;
- cm->cb_priv = &cm->int_frame_buffers;
+ if (vp9_alloc_internal_frame_buffers(&pool->int_frame_buffers))
+ vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+ "Failed to initialize internal frame buffers");
+
+ pool->cb_priv = &pool->int_frame_buffers;
+ }
}
}
@@ -250,14 +318,127 @@ static void set_ppflags(const vpx_codec_alg_priv_t *ctx,
flags->noise_level = ctx->postproc_cfg.noise_level;
}
-static void init_decoder(vpx_codec_alg_priv_t *ctx) {
- ctx->pbi = vp9_decoder_create();
- if (ctx->pbi == NULL)
- return;
+static int frame_worker_hook(void *arg1, void *arg2) {
+ FrameWorkerData *const frame_worker_data = (FrameWorkerData *)arg1;
+ const uint8_t *data = frame_worker_data->data;
+ (void)arg2;
+
+ frame_worker_data->result =
+ vp9_receive_compressed_data(frame_worker_data->pbi,
+ frame_worker_data->data_size,
+ &data);
+ frame_worker_data->data_end = data;
+
+ if (frame_worker_data->pbi->frame_parallel_decode) {
+ // In frame parallel decoding, a worker thread must successfully decode all
+ // the compressed data.
+ if (frame_worker_data->result != 0 ||
+ frame_worker_data->data + frame_worker_data->data_size - 1 > data) {
+ VP9Worker *const worker = frame_worker_data->pbi->frame_worker_owner;
+ BufferPool *const pool = frame_worker_data->pbi->common.buffer_pool;
+ // Signal all the other threads that are waiting for this frame.
+ vp9_frameworker_lock_stats(worker);
+ frame_worker_data->frame_context_ready = 1;
+ lock_buffer_pool(pool);
+ frame_worker_data->pbi->cur_buf->buf.corrupted = 1;
+ unlock_buffer_pool(pool);
+ frame_worker_data->pbi->need_resync = 1;
+ vp9_frameworker_signal_stats(worker);
+ vp9_frameworker_unlock_stats(worker);
+ return 0;
+ }
+ } else if (frame_worker_data->result != 0) {
+ // Check decode result in serial decode.
+ frame_worker_data->pbi->cur_buf->buf.corrupted = 1;
+ frame_worker_data->pbi->need_resync = 1;
+ }
+ return !frame_worker_data->result;
+}
+
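frame_worker_hook follows the VP9Worker contract used throughout this file: the hook runs on the worker thread with data1 as its first argument and returns nonzero on success, and a later sync() returns nonzero iff that hook succeeded. A minimal sketch of submitting a job under that contract:

    #include "vp9/common/vp9_thread.h"

    /* Minimal sketch of the worker contract relied on above. */
    static void submit_job(VP9Worker *worker, void *job) {
      const VP9WorkerInterface *const iface = vp9_get_worker_interface();
      worker->data1 = job;      /* delivered to worker->hook as arg1 */
      worker->had_error = 0;
      iface->launch(worker);    /* execute() would run the hook synchronously */
    }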
+static vpx_codec_err_t init_decoder(vpx_codec_alg_priv_t *ctx) {
+ int i;
+ const VP9WorkerInterface *const winterface = vp9_get_worker_interface();
+
+ ctx->last_show_frame = -1;
+ ctx->next_submit_worker_id = 0;
+ ctx->last_submit_worker_id = 0;
+ ctx->next_output_worker_id = 0;
+ ctx->frame_cache_read = 0;
+ ctx->frame_cache_write = 0;
+ ctx->num_cache_frames = 0;
+ ctx->need_resync = 1;
+ ctx->num_frame_workers =
+ (ctx->frame_parallel_decode == 1) ? ctx->cfg.threads : 1;
+ if (ctx->num_frame_workers > MAX_DECODE_THREADS)
+ ctx->num_frame_workers = MAX_DECODE_THREADS;
+ ctx->available_threads = ctx->num_frame_workers;
+ ctx->flushed = 0;
- ctx->pbi->max_threads = ctx->cfg.threads;
- ctx->pbi->inv_tile_order = ctx->invert_tile_order;
- ctx->pbi->frame_parallel_decode = ctx->frame_parallel_decode;
+ ctx->buffer_pool = (BufferPool *)vpx_calloc(1, sizeof(BufferPool));
+ if (ctx->buffer_pool == NULL)
+ return VPX_CODEC_MEM_ERROR;
+
+#if CONFIG_MULTITHREAD
+ if (pthread_mutex_init(&ctx->buffer_pool->pool_mutex, NULL)) {
+ set_error_detail(ctx, "Failed to allocate buffer pool mutex");
+ return VPX_CODEC_MEM_ERROR;
+ }
+#endif
+
+ ctx->frame_workers = (VP9Worker *)
+ vpx_malloc(ctx->num_frame_workers * sizeof(*ctx->frame_workers));
+ if (ctx->frame_workers == NULL) {
+ set_error_detail(ctx, "Failed to allocate frame_workers");
+ return VPX_CODEC_MEM_ERROR;
+ }
+
+ for (i = 0; i < ctx->num_frame_workers; ++i) {
+ VP9Worker *const worker = &ctx->frame_workers[i];
+ FrameWorkerData *frame_worker_data = NULL;
+ winterface->init(worker);
+ worker->data1 = vpx_memalign(32, sizeof(FrameWorkerData));
+ if (worker->data1 == NULL) {
+ set_error_detail(ctx, "Failed to allocate frame_worker_data");
+ return VPX_CODEC_MEM_ERROR;
+ }
+ frame_worker_data = (FrameWorkerData *)worker->data1;
+ frame_worker_data->pbi = vp9_decoder_create(ctx->buffer_pool);
+ if (frame_worker_data->pbi == NULL) {
+ set_error_detail(ctx, "Failed to allocate frame_worker_data");
+ return VPX_CODEC_MEM_ERROR;
+ }
+ frame_worker_data->pbi->frame_worker_owner = worker;
+ frame_worker_data->worker_id = i;
+ frame_worker_data->scratch_buffer = NULL;
+ frame_worker_data->scratch_buffer_size = 0;
+ frame_worker_data->frame_context_ready = 0;
+ frame_worker_data->received_frame = 0;
+#if CONFIG_MULTITHREAD
+ if (pthread_mutex_init(&frame_worker_data->stats_mutex, NULL)) {
+ set_error_detail(ctx, "Failed to allocate frame_worker_data mutex");
+ return VPX_CODEC_MEM_ERROR;
+ }
+
+ if (pthread_cond_init(&frame_worker_data->stats_cond, NULL)) {
+ set_error_detail(ctx, "Failed to allocate frame_worker_data cond");
+ return VPX_CODEC_MEM_ERROR;
+ }
+#endif
+ // When decoding in serial mode, the FrameWorker thread may create tile
+ // worker threads or a loopfilter thread.
+ frame_worker_data->pbi->max_threads =
+ (ctx->frame_parallel_decode == 0) ? ctx->cfg.threads : 0;
+
+ frame_worker_data->pbi->inv_tile_order = ctx->invert_tile_order;
+ frame_worker_data->pbi->frame_parallel_decode = ctx->frame_parallel_decode;
+ frame_worker_data->pbi->common.frame_parallel_decode =
+ ctx->frame_parallel_decode;
+ worker->hook = (VP9WorkerHook)frame_worker_hook;
+ if (!winterface->reset(worker)) {
+ set_error_detail(ctx, "Frame Worker thread creation failed");
+ return VPX_CODEC_MEM_ERROR;
+ }
+ }
// If postprocessing was enabled by the application and a
// configuration has not been provided, default it.
@@ -266,20 +447,24 @@ static void init_decoder(vpx_codec_alg_priv_t *ctx) {
set_default_ppflags(&ctx->postproc_cfg);
init_buffer_callbacks(ctx);
+
+ return VPX_CODEC_OK;
+}
+
+static INLINE void check_resync(vpx_codec_alg_priv_t *const ctx,
+ const VP9Decoder *const pbi) {
+ // Clear the resync flag if the worker got a key frame or intra-only frame.
+ if (ctx->need_resync == 1 && pbi->need_resync == 0 &&
+ (pbi->common.intra_only || pbi->common.frame_type == KEY_FRAME))
+ ctx->need_resync = 0;
}
static vpx_codec_err_t decode_one(vpx_codec_alg_priv_t *ctx,
const uint8_t **data, unsigned int data_sz,
void *user_priv, int64_t deadline) {
- YV12_BUFFER_CONFIG sd;
- vp9_ppflags_t flags = {0, 0, 0};
- VP9_COMMON *cm = NULL;
-
+ const VP9WorkerInterface *const winterface = vp9_get_worker_interface();
(void)deadline;
- vp9_zero(sd);
- ctx->img_avail = 0;
-
// Determine the stream parameters. Note that we rely on peek_si to
// validate that we have a buffer that does not wrap around the top
// of the heap.
@@ -295,36 +480,104 @@ static vpx_codec_err_t decode_one(vpx_codec_alg_priv_t *ctx,
return VPX_CODEC_ERROR;
}
- // Initialize the decoder instance on the first frame
- if (ctx->pbi == NULL) {
- init_decoder(ctx);
- if (ctx->pbi == NULL)
- return VPX_CODEC_ERROR;
- }
-
- // Set these even if already initialized. The caller may have changed the
- // decrypt config between frames.
- ctx->pbi->decrypt_cb = ctx->decrypt_cb;
- ctx->pbi->decrypt_state = ctx->decrypt_state;
+ if (!ctx->frame_parallel_decode) {
+ VP9Worker *const worker = ctx->frame_workers;
+ FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
+ frame_worker_data->data = *data;
+ frame_worker_data->data_size = data_sz;
+ frame_worker_data->user_priv = user_priv;
+ frame_worker_data->received_frame = 1;
- cm = &ctx->pbi->common;
+ // Set these even if already initialized. The caller may have changed the
+ // decrypt config between frames.
+ frame_worker_data->pbi->decrypt_cb = ctx->decrypt_cb;
+ frame_worker_data->pbi->decrypt_state = ctx->decrypt_state;
- if (vp9_receive_compressed_data(ctx->pbi, data_sz, data))
- return update_error_state(ctx, &cm->error);
+ worker->had_error = 0;
+ winterface->execute(worker);
- if (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC)
- set_ppflags(ctx, &flags);
+ // Update data pointer after decode.
+ *data = frame_worker_data->data_end;
- if (vp9_get_raw_frame(ctx->pbi, &sd, &flags))
- return update_error_state(ctx, &cm->error);
+ if (worker->had_error)
+ return update_error_state(ctx, &frame_worker_data->pbi->common.error);
- yuvconfig2image(&ctx->img, &sd, user_priv);
- ctx->img.fb_priv = cm->frame_bufs[cm->new_fb_idx].raw_frame_buffer.priv;
- ctx->img_avail = 1;
+ check_resync(ctx, frame_worker_data->pbi);
+ } else {
+ VP9Worker *const worker = &ctx->frame_workers[ctx->next_submit_worker_id];
+ FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
+ // Copy context from last worker thread to next worker thread.
+ if (ctx->next_submit_worker_id != ctx->last_submit_worker_id)
+ vp9_frameworker_copy_context(
+ &ctx->frame_workers[ctx->next_submit_worker_id],
+ &ctx->frame_workers[ctx->last_submit_worker_id]);
+
+ frame_worker_data->pbi->ready_for_new_data = 0;
+ // Copy the compressed data into worker's internal buffer.
+ // TODO(hkuang): Would it be better for all workers to allocate a buffer
+ // the size of the first intra frame? That would avoid repeated
+ // deallocation and reallocation.
+ if (frame_worker_data->scratch_buffer_size < data_sz) {
+ frame_worker_data->scratch_buffer =
+ (uint8_t *)vpx_realloc(frame_worker_data->scratch_buffer, data_sz);
+ if (frame_worker_data->scratch_buffer == NULL) {
+ set_error_detail(ctx, "Failed to reallocate scratch buffer");
+ return VPX_CODEC_MEM_ERROR;
+ }
+ frame_worker_data->scratch_buffer_size = data_sz;
+ }
+ frame_worker_data->data_size = data_sz;
+ memcpy(frame_worker_data->scratch_buffer, *data, data_sz);
+
+ frame_worker_data->frame_decoded = 0;
+ frame_worker_data->frame_context_ready = 0;
+ frame_worker_data->received_frame = 1;
+ frame_worker_data->data = frame_worker_data->scratch_buffer;
+ frame_worker_data->user_priv = user_priv;
+
+ if (ctx->next_submit_worker_id != ctx->last_submit_worker_id)
+ ctx->last_submit_worker_id =
+ (ctx->last_submit_worker_id + 1) % ctx->num_frame_workers;
+
+ ctx->next_submit_worker_id =
+ (ctx->next_submit_worker_id + 1) % ctx->num_frame_workers;
+ --ctx->available_threads;
+ worker->had_error = 0;
+ winterface->launch(worker);
+ }
return VPX_CODEC_OK;
}
+static void wait_worker_and_cache_frame(vpx_codec_alg_priv_t *ctx) {
+ YV12_BUFFER_CONFIG sd;
+ vp9_ppflags_t flags = {0, 0, 0};
+ const VP9WorkerInterface *const winterface = vp9_get_worker_interface();
+ VP9Worker *const worker = &ctx->frame_workers[ctx->next_output_worker_id];
+ FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
+ ctx->next_output_worker_id =
+ (ctx->next_output_worker_id + 1) % ctx->num_frame_workers;
+ // TODO(hkuang): Add worker error handling here.
+ winterface->sync(worker);
+ frame_worker_data->received_frame = 0;
+ ++ctx->available_threads;
+
+ check_resync(ctx, frame_worker_data->pbi);
+
+ if (vp9_get_raw_frame(frame_worker_data->pbi, &sd, &flags) == 0) {
+ VP9_COMMON *const cm = &frame_worker_data->pbi->common;
+ RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
+ ctx->frame_cache[ctx->frame_cache_write].fb_idx = cm->new_fb_idx;
+ yuvconfig2image(&ctx->frame_cache[ctx->frame_cache_write].img, &sd,
+ frame_worker_data->user_priv);
+ ctx->frame_cache[ctx->frame_cache_write].img.fb_priv =
+ frame_bufs[cm->new_fb_idx].raw_frame_buffer.priv;
+ ctx->frame_cache_write =
+ (ctx->frame_cache_write + 1) % FRAME_CACHE_SIZE;
+ ++ctx->num_cache_frames;
+ }
+}
+
static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx,
const uint8_t *data, unsigned int data_sz,
void *user_priv, long deadline) {
@@ -342,6 +595,13 @@ static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx,
// Reset flushed when receiving a valid frame.
ctx->flushed = 0;
+ // Initialize the decoder workers on the first frame.
+ if (ctx->frame_workers == NULL) {
+ const vpx_codec_err_t res = init_decoder(ctx);
+ if (res != VPX_CODEC_OK)
+ return res;
+ }
+
res = vp9_parse_superframe_index(data, data_sz, frame_sizes, &frame_count,
ctx->decrypt_cb, ctx->decrypt_state);
if (res != VPX_CODEC_OK)
@@ -358,30 +618,46 @@ static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx,
for (i = 0; i < frame_count; ++i) {
const uint8_t *data_start_copy = data_start;
const uint32_t frame_size = frame_sizes[i];
- vpx_codec_err_t res;
if (data_start < data
|| frame_size > (uint32_t) (data_end - data_start)) {
- ctx->base.err_detail = "Invalid frame size in index";
+ set_error_detail(ctx, "Invalid frame size in index");
return VPX_CODEC_CORRUPT_FRAME;
}
+ if (ctx->available_threads == 0) {
+ // No more threads are free for decoding. Wait until the next output
+ // worker finishes, then copy its decoded frame into the cache.
+ if (ctx->num_cache_frames < FRAME_CACHE_SIZE) {
+ wait_worker_and_cache_frame(ctx);
+ } else {
+ // TODO(hkuang): Add unit test to test this path.
+ set_error_detail(ctx, "Frame output cache is full.");
+ return VPX_CODEC_ERROR;
+ }
+ }
+
res = decode_one(ctx, &data_start_copy, frame_size, user_priv,
deadline);
if (res != VPX_CODEC_OK)
return res;
-
data_start += frame_size;
}
} else {
- res = decode_one(ctx, &data_start, data_sz, user_priv, deadline);
+ if (ctx->available_threads == 0) {
+ // No more threads are free for decoding. Wait until the next output
+ // worker finishes, then copy its decoded frame into the cache.
+ if (ctx->num_cache_frames < FRAME_CACHE_SIZE) {
+ wait_worker_and_cache_frame(ctx);
+ } else {
+ // TODO(hkuang): Add unit test to test this path.
+ set_error_detail(ctx, "Frame output cache is full.");
+ return VPX_CODEC_ERROR;
+ }
+ }
+
+ res = decode_one(ctx, &data, data_sz, user_priv, deadline);
if (res != VPX_CODEC_OK)
return res;
-
- // Extra data detected after the frame.
- if (data_start < data_end - 1) {
- ctx->base.err_detail = "Fail to decode frame in parallel mode";
- return VPX_CODEC_INCAPABLE;
- }
}
} else {
// Decode in serial mode.
@@ -394,7 +670,7 @@ static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx,
vpx_codec_err_t res;
if (data_start < data
|| frame_size > (uint32_t) (data_end - data_start)) {
- ctx->base.err_detail = "Invalid frame size in index";
+ set_error_detail(ctx, "Invalid frame size in index");
return VPX_CODEC_CORRUPT_FRAME;
}
@@ -425,24 +701,89 @@ static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx,
}
}
- return VPX_CODEC_OK;
+ return res;
+}
+
+static void release_last_output_frame(vpx_codec_alg_priv_t *ctx) {
+ RefCntBuffer *const frame_bufs = ctx->buffer_pool->frame_bufs;
+ // Decrease reference count of last output frame in frame parallel mode.
+ if (ctx->frame_parallel_decode && ctx->last_show_frame >= 0) {
+ BufferPool *const pool = ctx->buffer_pool;
+ lock_buffer_pool(pool);
+ decrease_ref_count(ctx->last_show_frame, frame_bufs, pool);
+ unlock_buffer_pool(pool);
+ }
}
static vpx_image_t *decoder_get_frame(vpx_codec_alg_priv_t *ctx,
vpx_codec_iter_t *iter) {
vpx_image_t *img = NULL;
- if (ctx->img_avail) {
- // iter acts as a flip flop, so an image is only returned on the first
- // call to get_frame.
- if (!(*iter)) {
- img = &ctx->img;
- *iter = img;
- }
+ // Only return a frame when all the CPUs are busy, or when the
+ // application has flushed the decoder in frame-parallel decode.
+ if (ctx->frame_parallel_decode && ctx->available_threads > 0 &&
+ !ctx->flushed) {
+ return NULL;
+ }
+
+ // Output the frames in the cache first.
+ if (ctx->num_cache_frames > 0) {
+ release_last_output_frame(ctx);
+ ctx->last_show_frame = ctx->frame_cache[ctx->frame_cache_read].fb_idx;
+ if (ctx->need_resync)
+ return NULL;
+ img = &ctx->frame_cache[ctx->frame_cache_read].img;
+ ctx->frame_cache_read = (ctx->frame_cache_read + 1) % FRAME_CACHE_SIZE;
+ --ctx->num_cache_frames;
+ return img;
}
- ctx->img_avail = 0;
- return img;
+ // iter acts as a flip flop, so an image is only returned on the first
+ // call to get_frame.
+ if (*iter == NULL && ctx->frame_workers != NULL) {
+ do {
+ YV12_BUFFER_CONFIG sd;
+ vp9_ppflags_t flags = {0, 0, 0};
+ const VP9WorkerInterface *const winterface = vp9_get_worker_interface();
+ VP9Worker *const worker =
+ &ctx->frame_workers[ctx->next_output_worker_id];
+ FrameWorkerData *const frame_worker_data =
+ (FrameWorkerData *)worker->data1;
+ ctx->next_output_worker_id =
+ (ctx->next_output_worker_id + 1) % ctx->num_frame_workers;
+ if (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC)
+ set_ppflags(ctx, &flags);
+ // Wait for the frame from the worker thread.
+ if (winterface->sync(worker)) {
+ // Check if the worker has received any frames.
+ if (frame_worker_data->received_frame == 1) {
+ ++ctx->available_threads;
+ frame_worker_data->received_frame = 0;
+ check_resync(ctx, frame_worker_data->pbi);
+ }
+ if (vp9_get_raw_frame(frame_worker_data->pbi, &sd, &flags) == 0) {
+ VP9_COMMON *const cm = &frame_worker_data->pbi->common;
+ RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
+ release_last_output_frame(ctx);
+ ctx->last_show_frame = frame_worker_data->pbi->common.new_fb_idx;
+ if (ctx->need_resync)
+ return NULL;
+ yuvconfig2image(&ctx->img, &sd, frame_worker_data->user_priv);
+ ctx->img.fb_priv = frame_bufs[cm->new_fb_idx].raw_frame_buffer.priv;
+ img = &ctx->img;
+ return img;
+ }
+ } else {
+ // Decoding failed. Release the worker thread.
+ frame_worker_data->received_frame = 0;
+ ++ctx->available_threads;
+ ctx->need_resync = 1;
+ if (ctx->flushed != 1)
+ return NULL;
+ }
+ } while (ctx->next_output_worker_id != ctx->next_submit_worker_id);
+ }
+ return NULL;
}
static vpx_codec_err_t decoder_set_fb_fn(
@@ -451,7 +792,7 @@ static vpx_codec_err_t decoder_set_fb_fn(
vpx_release_frame_buffer_cb_fn_t cb_release, void *cb_priv) {
if (cb_get == NULL || cb_release == NULL) {
return VPX_CODEC_INVALID_PARAM;
- } else if (ctx->pbi == NULL) {
+ } else if (ctx->frame_workers == NULL) {
// If the decoder has already been initialized, do not accept changes to
// the frame buffer functions.
ctx->get_ext_fb_cb = cb_get;
@@ -467,12 +808,19 @@ static vpx_codec_err_t ctrl_set_reference(vpx_codec_alg_priv_t *ctx,
va_list args) {
vpx_ref_frame_t *const data = va_arg(args, vpx_ref_frame_t *);
+ // Only support this function in serial decode.
+ if (ctx->frame_parallel_decode) {
+ set_error_detail(ctx, "Not supported in frame parallel decode");
+ return VPX_CODEC_INCAPABLE;
+ }
+
if (data) {
vpx_ref_frame_t *const frame = (vpx_ref_frame_t *)data;
YV12_BUFFER_CONFIG sd;
-
+ VP9Worker *const worker = ctx->frame_workers;
+ FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
image2yuvconfig(&frame->img, &sd);
- return vp9_set_reference_dec(&ctx->pbi->common,
+ return vp9_set_reference_dec(&frame_worker_data->pbi->common,
(VP9_REFFRAME)frame->frame_type, &sd);
} else {
return VPX_CODEC_INVALID_PARAM;
@@ -483,13 +831,19 @@ static vpx_codec_err_t ctrl_copy_reference(vpx_codec_alg_priv_t *ctx,
va_list args) {
vpx_ref_frame_t *data = va_arg(args, vpx_ref_frame_t *);
+ // Only support this function in serial decode.
+ if (ctx->frame_parallel_decode) {
+ set_error_detail(ctx, "Not supported in frame parallel decode");
+ return VPX_CODEC_INCAPABLE;
+ }
+
if (data) {
- vpx_ref_frame_t *frame = (vpx_ref_frame_t *)data;
+ vpx_ref_frame_t *frame = (vpx_ref_frame_t *) data;
YV12_BUFFER_CONFIG sd;
-
+ VP9Worker *const worker = ctx->frame_workers;
+ FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
image2yuvconfig(&frame->img, &sd);
-
- return vp9_copy_reference_dec(ctx->pbi,
+ return vp9_copy_reference_dec(frame_worker_data->pbi,
(VP9_REFFRAME)frame->frame_type, &sd);
} else {
return VPX_CODEC_INVALID_PARAM;
@@ -500,10 +854,18 @@ static vpx_codec_err_t ctrl_get_reference(vpx_codec_alg_priv_t *ctx,
va_list args) {
vp9_ref_frame_t *data = va_arg(args, vp9_ref_frame_t *);
+ // Only support this function in serial decode.
+ if (ctx->frame_parallel_decode) {
+ set_error_detail(ctx, "Not supported in frame parallel decode");
+ return VPX_CODEC_INCAPABLE;
+ }
+
if (data) {
- YV12_BUFFER_CONFIG* fb = get_ref_frame(&ctx->pbi->common, data->idx);
+ YV12_BUFFER_CONFIG* fb;
+ VP9Worker *const worker = ctx->frame_workers;
+ FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
+ fb = get_ref_frame(&frame_worker_data->pbi->common, data->idx);
if (fb == NULL) return VPX_CODEC_ERROR;
-
yuvconfig2image(&data->img, fb, NULL);
return VPX_CODEC_OK;
} else {
@@ -541,65 +903,122 @@ static vpx_codec_err_t ctrl_get_last_ref_updates(vpx_codec_alg_priv_t *ctx,
va_list args) {
int *const update_info = va_arg(args, int *);
+ // Only support this function in serial decode.
+ if (ctx->frame_parallel_decode) {
+ set_error_detail(ctx, "Not supported in frame parallel decode");
+ return VPX_CODEC_INCAPABLE;
+ }
+
if (update_info) {
- if (ctx->pbi)
- *update_info = ctx->pbi->refresh_frame_flags;
- else
+ if (ctx->frame_workers) {
+ VP9Worker *const worker = ctx->frame_workers;
+ FrameWorkerData *const frame_worker_data =
+ (FrameWorkerData *)worker->data1;
+ *update_info = frame_worker_data->pbi->refresh_frame_flags;
+ return VPX_CODEC_OK;
+ } else {
return VPX_CODEC_ERROR;
- return VPX_CODEC_OK;
- } else {
- return VPX_CODEC_INVALID_PARAM;
+ }
}
-}
+ return VPX_CODEC_INVALID_PARAM;
+}
static vpx_codec_err_t ctrl_get_frame_corrupted(vpx_codec_alg_priv_t *ctx,
va_list args) {
int *corrupted = va_arg(args, int *);
- if (corrupted != NULL && ctx->pbi != NULL) {
- const YV12_BUFFER_CONFIG *const frame = ctx->pbi->common.frame_to_show;
- if (frame == NULL) return VPX_CODEC_ERROR;
- *corrupted = frame->corrupted;
- return VPX_CODEC_OK;
- } else {
- return VPX_CODEC_INVALID_PARAM;
+ if (corrupted) {
+ if (ctx->frame_workers) {
+ VP9Worker *const worker = ctx->frame_workers;
+ FrameWorkerData *const frame_worker_data =
+ (FrameWorkerData *)worker->data1;
+ RefCntBuffer *const frame_bufs =
+ frame_worker_data->pbi->common.buffer_pool->frame_bufs;
+ if (frame_worker_data->pbi->common.frame_to_show == NULL)
+ return VPX_CODEC_ERROR;
+ *corrupted = frame_bufs[ctx->last_show_frame].buf.corrupted;
+ return VPX_CODEC_OK;
+ } else {
+ return VPX_CODEC_ERROR;
+ }
+ }
+
+ return VPX_CODEC_INVALID_PARAM;
+}
+
+static vpx_codec_err_t ctrl_get_frame_size(vpx_codec_alg_priv_t *ctx,
+ va_list args) {
+ int *const frame_size = va_arg(args, int *);
+
+ // Only support this function in serial decode.
+ if (ctx->frame_parallel_decode) {
+ set_error_detail(ctx, "Not supported in frame parallel decode");
+ return VPX_CODEC_INCAPABLE;
+ }
+
+ if (frame_size) {
+ if (ctx->frame_workers) {
+ VP9Worker *const worker = ctx->frame_workers;
+ FrameWorkerData *const frame_worker_data =
+ (FrameWorkerData *)worker->data1;
+ const VP9_COMMON *const cm = &frame_worker_data->pbi->common;
+ frame_size[0] = cm->width;
+ frame_size[1] = cm->height;
+ return VPX_CODEC_OK;
+ } else {
+ return VPX_CODEC_ERROR;
+ }
}
+
+ return VPX_CODEC_INVALID_PARAM;
}
static vpx_codec_err_t ctrl_get_display_size(vpx_codec_alg_priv_t *ctx,
va_list args) {
int *const display_size = va_arg(args, int *);
+ // Only support this function in serial decode.
+ if (ctx->frame_parallel_decode) {
+ set_error_detail(ctx, "Not supported in frame parallel decode");
+ return VPX_CODEC_INCAPABLE;
+ }
+
if (display_size) {
- if (ctx->pbi) {
- const VP9_COMMON *const cm = &ctx->pbi->common;
+ if (ctx->frame_workers) {
+ VP9Worker *const worker = ctx->frame_workers;
+ FrameWorkerData *const frame_worker_data =
+ (FrameWorkerData *)worker->data1;
+ const VP9_COMMON *const cm = &frame_worker_data->pbi->common;
display_size[0] = cm->display_width;
display_size[1] = cm->display_height;
+ return VPX_CODEC_OK;
} else {
return VPX_CODEC_ERROR;
}
- return VPX_CODEC_OK;
- } else {
- return VPX_CODEC_INVALID_PARAM;
}
+
+ return VPX_CODEC_INVALID_PARAM;
}
static vpx_codec_err_t ctrl_get_bit_depth(vpx_codec_alg_priv_t *ctx,
va_list args) {
unsigned int *const bit_depth = va_arg(args, unsigned int *);
+ VP9Worker *const worker = &ctx->frame_workers[ctx->next_output_worker_id];
if (bit_depth) {
- if (ctx->pbi) {
- const VP9_COMMON *const cm = &ctx->pbi->common;
+ if (worker) {
+ FrameWorkerData *const frame_worker_data =
+ (FrameWorkerData *)worker->data1;
+ const VP9_COMMON *const cm = &frame_worker_data->pbi->common;
*bit_depth = cm->bit_depth;
return VPX_CODEC_OK;
} else {
return VPX_CODEC_ERROR;
}
- } else {
- return VPX_CODEC_INVALID_PARAM;
}
+
+ return VPX_CODEC_INVALID_PARAM;
}
static vpx_codec_err_t ctrl_set_invert_tile_order(vpx_codec_alg_priv_t *ctx,
@@ -616,6 +1035,29 @@ static vpx_codec_err_t ctrl_set_decryptor(vpx_codec_alg_priv_t *ctx,
return VPX_CODEC_OK;
}
+static vpx_codec_err_t ctrl_set_byte_alignment(vpx_codec_alg_priv_t *ctx,
+ va_list args) {
+ const int legacy_byte_alignment = 0;
+ const int min_byte_alignment = 32;
+ const int max_byte_alignment = 1024;
+ const int byte_alignment = va_arg(args, int);
+
+ if (byte_alignment != legacy_byte_alignment &&
+ (byte_alignment < min_byte_alignment ||
+ byte_alignment > max_byte_alignment ||
+ (byte_alignment & (byte_alignment - 1)) != 0))
+ return VPX_CODEC_INVALID_PARAM;
+
+ ctx->byte_alignment = byte_alignment;
+ if (ctx->frame_workers) {
+ VP9Worker *const worker = ctx->frame_workers;
+ FrameWorkerData *const frame_worker_data =
+ (FrameWorkerData *)worker->data1;
+ frame_worker_data->pbi->common.byte_alignment = byte_alignment;
+ }
+ return VPX_CODEC_OK;
+}
+
static vpx_codec_ctrl_fn_map_t decoder_ctrl_maps[] = {
{VP8_COPY_REFERENCE, ctrl_copy_reference},
@@ -628,6 +1070,7 @@ static vpx_codec_ctrl_fn_map_t decoder_ctrl_maps[] = {
{VP8_SET_DBG_DISPLAY_MV, ctrl_set_dbg_options},
{VP9_INVERT_TILE_DECODE_ORDER, ctrl_set_invert_tile_order},
{VPXD_SET_DECRYPTOR, ctrl_set_decryptor},
+ {VP9_SET_BYTE_ALIGNMENT, ctrl_set_byte_alignment},
// Getters
{VP8D_GET_LAST_REF_UPDATES, ctrl_get_last_ref_updates},
@@ -635,6 +1078,7 @@ static vpx_codec_ctrl_fn_map_t decoder_ctrl_maps[] = {
{VP9_GET_REFERENCE, ctrl_get_reference},
{VP9D_GET_DISPLAY_SIZE, ctrl_get_display_size},
{VP9D_GET_BIT_DEPTH, ctrl_get_bit_depth},
+ {VP9D_GET_FRAME_SIZE, ctrl_get_frame_size},
{ -1, NULL},
};
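For reference, under the frame-parallel path above vpx_codec_get_frame() may legitimately return NULL while worker threads are still busy; buffered frames only drain once the decoder is flushed. A minimal driving loop consistent with these semantics might look as follows — a sketch only, where read_frame(), write_image() and die() are hypothetical helpers, not libvpx API, and includes are elided:

    vpx_codec_ctx_t codec;   /* assumed initialized via vpx_codec_dec_init() */
    vpx_codec_iter_t iter;
    vpx_image_t *img;
    const uint8_t *buf;
    size_t buf_sz;

    while (read_frame(&buf, &buf_sz)) {
      if (vpx_codec_decode(&codec, buf, (unsigned int)buf_sz, NULL, 0))
        die("vpx_codec_decode failed");
      iter = NULL;
      /* May yield nothing yet; frames surface as workers finish. */
      while ((img = vpx_codec_get_frame(&codec, &iter)) != NULL)
        write_image(img);
    }
    /* A NULL/0 call flushes the decoder so cached frames drain. */
    vpx_codec_decode(&codec, NULL, 0, NULL, 0);
    iter = NULL;
    while ((img = vpx_codec_get_frame(&codec, &iter)) != NULL)
      write_image(img);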
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/vp9_iface_common.h b/chromium/third_party/libvpx/source/libvpx/vp9/vp9_iface_common.h
index 00fbfdd7dbb..e585aa14725 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/vp9_iface_common.h
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/vp9_iface_common.h
@@ -34,6 +34,7 @@ static void yuvconfig2image(vpx_image_t *img, const YV12_BUFFER_CONFIG *yv12,
bps = 12;
}
}
+ img->cs = yv12->color_space;
img->bit_depth = 8;
img->w = yv12->y_stride;
img->h = ALIGN_POWER_OF_TWO(yv12->y_height + 2 * VP9_ENC_BORDER_IN_PIXELS, 3);
@@ -92,6 +93,7 @@ static vpx_codec_err_t image2yuvconfig(const vpx_image_t *img,
yv12->y_stride = img->stride[VPX_PLANE_Y];
yv12->uv_stride = img->stride[VPX_PLANE_U];
+ yv12->color_space = img->cs;
#if CONFIG_VP9_HIGHBITDEPTH
if (img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) {
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/vp9cx.mk b/chromium/third_party/libvpx/source/libvpx/vp9/vp9cx.mk
index e72cb0024f5..7359b2de05d 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/vp9cx.mk
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/vp9cx.mk
@@ -24,13 +24,17 @@ VP9_CX_SRCS-yes += encoder/vp9_context_tree.h
VP9_CX_SRCS-yes += encoder/vp9_cost.h
VP9_CX_SRCS-yes += encoder/vp9_cost.c
VP9_CX_SRCS-yes += encoder/vp9_dct.c
+VP9_CX_SRCS-yes += encoder/vp9_dct.h
VP9_CX_SRCS-$(CONFIG_VP9_TEMPORAL_DENOISING) += encoder/vp9_denoiser.c
VP9_CX_SRCS-$(CONFIG_VP9_TEMPORAL_DENOISING) += encoder/vp9_denoiser.h
VP9_CX_SRCS-yes += encoder/vp9_encodeframe.c
VP9_CX_SRCS-yes += encoder/vp9_encodeframe.h
VP9_CX_SRCS-yes += encoder/vp9_encodemb.c
VP9_CX_SRCS-yes += encoder/vp9_encodemv.c
+VP9_CX_SRCS-yes += encoder/vp9_ethread.h
+VP9_CX_SRCS-yes += encoder/vp9_ethread.c
VP9_CX_SRCS-yes += encoder/vp9_extend.c
+VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += encoder/vp9_fastssim.c
VP9_CX_SRCS-yes += encoder/vp9_firstpass.c
VP9_CX_SRCS-yes += encoder/vp9_block.h
VP9_CX_SRCS-yes += encoder/vp9_writer.h
@@ -59,12 +63,12 @@ VP9_CX_SRCS-yes += encoder/vp9_mcomp.c
VP9_CX_SRCS-yes += encoder/vp9_encoder.c
VP9_CX_SRCS-yes += encoder/vp9_picklpf.c
VP9_CX_SRCS-yes += encoder/vp9_picklpf.h
+VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += encoder/vp9_psnrhvs.c
VP9_CX_SRCS-yes += encoder/vp9_quantize.c
VP9_CX_SRCS-yes += encoder/vp9_ratectrl.c
VP9_CX_SRCS-yes += encoder/vp9_rd.c
VP9_CX_SRCS-yes += encoder/vp9_rdopt.c
VP9_CX_SRCS-yes += encoder/vp9_pickmode.c
-VP9_CX_SRCS-yes += encoder/vp9_sad.c
VP9_CX_SRCS-yes += encoder/vp9_segmentation.c
VP9_CX_SRCS-yes += encoder/vp9_segmentation.h
VP9_CX_SRCS-yes += encoder/vp9_speed_features.c
@@ -76,6 +80,8 @@ VP9_CX_SRCS-yes += encoder/vp9_resize.c
VP9_CX_SRCS-yes += encoder/vp9_resize.h
VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += encoder/vp9_ssim.c
VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += encoder/vp9_ssim.h
+VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += encoder/vp9_blockiness.c
+
VP9_CX_SRCS-yes += encoder/vp9_tokenize.c
VP9_CX_SRCS-yes += encoder/vp9_treewriter.c
VP9_CX_SRCS-yes += encoder/vp9_variance.c
@@ -85,6 +91,8 @@ VP9_CX_SRCS-yes += encoder/vp9_aq_cyclicrefresh.c
VP9_CX_SRCS-yes += encoder/vp9_aq_cyclicrefresh.h
VP9_CX_SRCS-yes += encoder/vp9_aq_complexity.c
VP9_CX_SRCS-yes += encoder/vp9_aq_complexity.h
+VP9_CX_SRCS-yes += encoder/vp9_skin_detection.c
+VP9_CX_SRCS-yes += encoder/vp9_skin_detection.h
ifeq ($(CONFIG_VP9_POSTPROC),yes)
VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/vp9_postproc.h
VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/vp9_postproc.c
@@ -95,34 +103,39 @@ VP9_CX_SRCS-yes += encoder/vp9_mbgraph.c
VP9_CX_SRCS-yes += encoder/vp9_mbgraph.h
VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_variance_impl_intrin_avx2.c
-VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad4d_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_avg_intrin_sse2.c
-VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_sad4d_intrin_avx2.c
VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_temporal_filter_apply_sse2.asm
-VP9_CX_SRCS-$(HAVE_SSE3) += encoder/x86/vp9_sad_sse3.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_quantize_sse2.c
+ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
+VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_variance_impl_sse2.asm
+VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_quantize_intrin_sse2.c
+VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_block_error_intrin_sse2.c
+endif
ifeq ($(CONFIG_USE_X86INC),yes)
VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_dct_mmx.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_error_sse2.asm
-VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subtract_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_variance_sse2.c
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subpel_variance.asm
+ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
+VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_variance_sse2.c
+VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_subpel_variance.asm
+endif
endif
ifeq ($(ARCH_X86_64),yes)
VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_quantize_ssse3_x86_64.asm
VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_dct_ssse3_x86_64.asm
endif
-VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_sad_ssse3.asm
-VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/vp9_sad_sse4.asm
-VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_sad_intrin_avx2.c
VP9_CX_SRCS-$(ARCH_X86_64) += encoder/x86/vp9_ssim_opt_x86_64.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_sse2.c
+VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_dct_ssse3.c
+VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_sse2.h
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct32x32_sse2.c
+VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_impl_sse2.c
ifeq ($(CONFIG_VP9_TEMPORAL_DENOISING),yes)
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_denoiser_sse2.c
@@ -133,10 +146,12 @@ VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_dct_avx2.c
VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_error_intrin_avx2.c
VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_variance_avx2.c
-VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_sad_neon.c
+ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_dct_neon.c
-VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_variance_neon.c
+endif
+VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_avg_neon.c
VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_quantize_neon.c
VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_subtract_neon.c
+VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_variance_neon.c
VP9_CX_SRCS-yes := $(filter-out $(VP9_CX_SRCS_REMOVE-yes),$(VP9_CX_SRCS-yes))
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/vp9dx.mk b/chromium/third_party/libvpx/source/libvpx/vp9/vp9dx.mk
index 1fcb36f668c..c105adb7967 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/vp9dx.mk
+++ b/chromium/third_party/libvpx/source/libvpx/vp9/vp9dx.mk
@@ -21,14 +21,14 @@ VP9_DX_SRCS-yes += decoder/vp9_decodemv.c
VP9_DX_SRCS-yes += decoder/vp9_decodeframe.c
VP9_DX_SRCS-yes += decoder/vp9_decodeframe.h
VP9_DX_SRCS-yes += decoder/vp9_detokenize.c
-VP9_DX_SRCS-yes += decoder/vp9_dthread.c
-VP9_DX_SRCS-yes += decoder/vp9_dthread.h
VP9_DX_SRCS-yes += decoder/vp9_reader.h
VP9_DX_SRCS-yes += decoder/vp9_reader.c
VP9_DX_SRCS-yes += decoder/vp9_read_bit_buffer.c
VP9_DX_SRCS-yes += decoder/vp9_read_bit_buffer.h
VP9_DX_SRCS-yes += decoder/vp9_decodemv.h
VP9_DX_SRCS-yes += decoder/vp9_detokenize.h
+VP9_DX_SRCS-yes += decoder/vp9_dthread.c
+VP9_DX_SRCS-yes += decoder/vp9_dthread.h
VP9_DX_SRCS-yes += decoder/vp9_decoder.c
VP9_DX_SRCS-yes += decoder/vp9_decoder.h
VP9_DX_SRCS-yes += decoder/vp9_dsubexp.c
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx/exports_enc b/chromium/third_party/libvpx/source/libvpx/vpx/exports_enc
index 88859206545..e4707ba1082 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx/exports_enc
+++ b/chromium/third_party/libvpx/source/libvpx/vpx/exports_enc
@@ -1,5 +1,6 @@
text vpx_codec_enc_config_default
text vpx_codec_enc_config_set
+text vpx_codec_enc_init_multi_ver
text vpx_codec_enc_init_ver
text vpx_codec_encode
text vpx_codec_get_cx_data
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx/internal/vpx_codec_internal.h b/chromium/third_party/libvpx/source/libvpx/vpx/internal/vpx_codec_internal.h
index cbfffd0af2a..7380fcc7e24 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx/internal/vpx_codec_internal.h
+++ b/chromium/third_party/libvpx/source/libvpx/vpx/internal/vpx_codec_internal.h
@@ -425,10 +425,18 @@ struct vpx_internal_error_info {
jmp_buf jmp;
};
+#define CLANG_ANALYZER_NORETURN
+#if defined(__has_feature)
+#if __has_feature(attribute_analyzer_noreturn)
+#undef CLANG_ANALYZER_NORETURN
+#define CLANG_ANALYZER_NORETURN __attribute__((analyzer_noreturn))
+#endif
+#endif
+
void vpx_internal_error(struct vpx_internal_error_info *info,
vpx_codec_err_t error,
const char *fmt,
- ...);
+ ...) CLANG_ANALYZER_NORETURN;
#ifdef __cplusplus
} // extern "C"
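The CLANG_ANALYZER_NORETURN addition above follows the usual feature-probe idiom: define the macro to nothing, then redefine it only when the compiler advertises the attribute, so non-clang builds see an empty token. The same shape ports to any __has_feature probe; a sketch with a hypothetical macro and function name:

    /* Expands to the attribute under clang's static analyzer, else to nothing. */
    #define MY_ANALYZER_NORETURN
    #if defined(__has_feature)
    #if __has_feature(attribute_analyzer_noreturn)
    #undef MY_ANALYZER_NORETURN
    #define MY_ANALYZER_NORETURN __attribute__((analyzer_noreturn))
    #endif
    #endif

    void fatal_error(const char *msg) MY_ANALYZER_NORETURN;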
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx/src/svc_encodeframe.c b/chromium/third_party/libvpx/source/libvpx/vpx/src/svc_encodeframe.c
index fa3409c6983..e711cf909ba 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx/src/svc_encodeframe.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx/src/svc_encodeframe.c
@@ -44,8 +44,6 @@ _CRTIMP char *__cdecl strtok_s(char *str, const char *delim, char **context);
#define SVC_REFERENCE_FRAMES 8
#define SUPERFRAME_SLOTS (8)
#define SUPERFRAME_BUFFER_SIZE (SUPERFRAME_SLOTS * sizeof(uint32_t) + 2)
-#define OPTION_BUFFER_SIZE 1024
-#define COMPONENTS 4 // psnr & sse statistics maintained for total, y, u, v
#define MAX_QUANTIZER 63
@@ -81,52 +79,26 @@ typedef struct FrameData {
struct FrameData *next;
} FrameData;
-typedef struct SvcInternal {
- char options[OPTION_BUFFER_SIZE]; // set by vpx_svc_set_options
-
- // values extracted from option, quantizers
- vpx_svc_extra_cfg_t svc_params;
- int enable_auto_alt_ref[VPX_SS_MAX_LAYERS];
- int bitrates[VPX_SS_MAX_LAYERS];
-
- // accumulated statistics
- double psnr_sum[VPX_SS_MAX_LAYERS][COMPONENTS]; // total/Y/U/V
- uint64_t sse_sum[VPX_SS_MAX_LAYERS][COMPONENTS];
- uint32_t bytes_sum[VPX_SS_MAX_LAYERS];
-
- // codec encoding values
- int width; // width of highest layer
- int height; // height of highest layer
- int kf_dist; // distance between keyframes
-
- // state variables
- int psnr_pkt_received;
- int layer;
- int use_multiple_frame_contexts;
-
- char message_buffer[2048];
- vpx_codec_ctx_t *codec_ctx;
-} SvcInternal;
-
-static SvcInternal *get_svc_internal(SvcContext *svc_ctx) {
+static SvcInternal_t *get_svc_internal(SvcContext *svc_ctx) {
if (svc_ctx == NULL) return NULL;
if (svc_ctx->internal == NULL) {
- SvcInternal *const si = (SvcInternal *)malloc(sizeof(*si));
+ SvcInternal_t *const si = (SvcInternal_t *)malloc(sizeof(*si));
if (si != NULL) {
memset(si, 0, sizeof(*si));
}
svc_ctx->internal = si;
}
- return (SvcInternal *)svc_ctx->internal;
+ return (SvcInternal_t *)svc_ctx->internal;
}
-static const SvcInternal *get_const_svc_internal(const SvcContext *svc_ctx) {
+static const SvcInternal_t *get_const_svc_internal(
+ const SvcContext *svc_ctx) {
if (svc_ctx == NULL) return NULL;
- return (const SvcInternal *)svc_ctx->internal;
+ return (const SvcInternal_t *)svc_ctx->internal;
}
static void svc_log_reset(SvcContext *svc_ctx) {
- SvcInternal *const si = (SvcInternal *)svc_ctx->internal;
+ SvcInternal_t *const si = (SvcInternal_t *)svc_ctx->internal;
si->message_buffer[0] = '\0';
}
@@ -135,7 +107,7 @@ static int svc_log(SvcContext *svc_ctx, SVC_LOG_LEVEL level,
char buf[512];
int retval = 0;
va_list ap;
- SvcInternal *const si = get_svc_internal(svc_ctx);
+ SvcInternal_t *const si = get_svc_internal(svc_ctx);
if (level > svc_ctx->log_level) {
return retval;
@@ -233,7 +205,7 @@ static vpx_codec_err_t parse_options(SvcContext *svc_ctx, const char *options) {
char *option_name;
char *option_value;
char *input_ptr;
- SvcInternal *const si = get_svc_internal(svc_ctx);
+ SvcInternal_t *const si = get_svc_internal(svc_ctx);
vpx_codec_err_t res = VPX_CODEC_OK;
int i, alt_ref_enabled = 0;
@@ -315,8 +287,9 @@ static vpx_codec_err_t parse_options(SvcContext *svc_ctx, const char *options) {
return res;
}
-vpx_codec_err_t vpx_svc_set_options(SvcContext *svc_ctx, const char *options) {
- SvcInternal *const si = get_svc_internal(svc_ctx);
+vpx_codec_err_t vpx_svc_set_options(SvcContext *svc_ctx,
+ const char *options) {
+ SvcInternal_t *const si = get_svc_internal(svc_ctx);
if (svc_ctx == NULL || options == NULL || si == NULL) {
return VPX_CODEC_INVALID_PARAM;
}
@@ -328,7 +301,7 @@ vpx_codec_err_t vpx_svc_set_options(SvcContext *svc_ctx, const char *options) {
void assign_layer_bitrates(const SvcContext *svc_ctx,
vpx_codec_enc_cfg_t *const enc_cfg) {
int i;
- const SvcInternal *const si = get_const_svc_internal(svc_ctx);
+ const SvcInternal_t *const si = get_const_svc_internal(svc_ctx);
if (si->bitrates[0] != 0) {
enc_cfg->rc_target_bitrate = 0;
@@ -364,7 +337,7 @@ vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx,
vpx_codec_enc_cfg_t *enc_cfg) {
vpx_codec_err_t res;
int i;
- SvcInternal *const si = get_svc_internal(svc_ctx);
+ SvcInternal_t *const si = get_svc_internal(svc_ctx);
if (svc_ctx == NULL || codec_ctx == NULL || iface == NULL ||
enc_cfg == NULL) {
return VPX_CODEC_INVALID_PARAM;
@@ -454,13 +427,15 @@ vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx,
* Encode a frame into multiple layers
* Create a superframe containing the individual layers
*/
-vpx_codec_err_t vpx_svc_encode(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx,
- struct vpx_image *rawimg, vpx_codec_pts_t pts,
+vpx_codec_err_t vpx_svc_encode(SvcContext *svc_ctx,
+ vpx_codec_ctx_t *codec_ctx,
+ struct vpx_image *rawimg,
+ vpx_codec_pts_t pts,
int64_t duration, int deadline) {
vpx_codec_err_t res;
vpx_codec_iter_t iter;
const vpx_codec_cx_pkt_t *cx_pkt;
- SvcInternal *const si = get_svc_internal(svc_ctx);
+ SvcInternal_t *const si = get_svc_internal(svc_ctx);
if (svc_ctx == NULL || codec_ctx == NULL || si == NULL) {
return VPX_CODEC_INVALID_PARAM;
}
@@ -524,7 +499,7 @@ vpx_codec_err_t vpx_svc_encode(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx,
}
const char *vpx_svc_get_message(const SvcContext *svc_ctx) {
- const SvcInternal *const si = get_const_svc_internal(svc_ctx);
+ const SvcInternal_t *const si = get_const_svc_internal(svc_ctx);
if (svc_ctx == NULL || si == NULL) return NULL;
return si->message_buffer;
}
@@ -544,7 +519,7 @@ const char *vpx_svc_dump_statistics(SvcContext *svc_ctx) {
double mse[COMPONENTS];
double y_scale;
- SvcInternal *const si = get_svc_internal(svc_ctx);
+ SvcInternal_t *const si = get_svc_internal(svc_ctx);
if (svc_ctx == NULL || si == NULL) return NULL;
svc_log_reset(svc_ctx);
@@ -595,11 +570,11 @@ const char *vpx_svc_dump_statistics(SvcContext *svc_ctx) {
}
void vpx_svc_release(SvcContext *svc_ctx) {
- SvcInternal *si;
+ SvcInternal_t *si;
if (svc_ctx == NULL) return;
// do not use get_svc_internal as it will unnecessarily allocate an
- // SvcInternal if it was not already allocated
- si = (SvcInternal *)svc_ctx->internal;
+ // SvcInternal_t if it was not already allocated
+ si = (SvcInternal_t *)svc_ctx->internal;
if (si != NULL) {
free(si);
svc_ctx->internal = NULL;
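The SvcInternal to SvcInternal_t rename leaves the public entry points unchanged. A minimal usage sketch, assuming codec, enc_cfg, raw, pts and duration are already set up, with die() as a hypothetical error helper and an illustrative option string:

    SvcContext svc_ctx;
    memset(&svc_ctx, 0, sizeof(svc_ctx));
    svc_ctx.spatial_layers = 2;
    svc_ctx.log_level = SVC_LOG_INFO;
    vpx_svc_set_options(&svc_ctx, "scale-factors=1/2,1/1");
    if (vpx_svc_init(&svc_ctx, &codec, vpx_codec_vp9_cx(), &enc_cfg) != VPX_CODEC_OK)
      die("vpx_svc_init failed");
    vpx_svc_encode(&svc_ctx, &codec, raw, pts, duration, VPX_DL_GOOD_QUALITY);
    vpx_svc_release(&svc_ctx);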
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx/svc_context.h b/chromium/third_party/libvpx/source/libvpx/vpx/svc_context.h
index 61b5f4ba0cc..cf791bdeb56 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx/svc_context.h
+++ b/chromium/third_party/libvpx/source/libvpx/vpx/svc_context.h
@@ -41,6 +41,36 @@ typedef struct {
void *internal;
} SvcContext;
+#define OPTION_BUFFER_SIZE 1024
+#define COMPONENTS 4 // psnr & sse statistics maintained for total, y, u, v
+
+typedef struct SvcInternal {
+ char options[OPTION_BUFFER_SIZE]; // set by vpx_svc_set_options
+
+ // values extracted from option, quantizers
+ vpx_svc_extra_cfg_t svc_params;
+ int enable_auto_alt_ref[VPX_SS_MAX_LAYERS];
+ int bitrates[VPX_SS_MAX_LAYERS];
+
+ // accumulated statistics
+ double psnr_sum[VPX_SS_MAX_LAYERS][COMPONENTS]; // total/Y/U/V
+ uint64_t sse_sum[VPX_SS_MAX_LAYERS][COMPONENTS];
+ uint32_t bytes_sum[VPX_SS_MAX_LAYERS];
+
+ // codec encoding values
+ int width; // width of highest layer
+ int height; // height of highest layer
+ int kf_dist; // distance between keyframes
+
+ // state variables
+ int psnr_pkt_received;
+ int layer;
+ int use_multiple_frame_contexts;
+
+ char message_buffer[2048];
+ vpx_codec_ctx_t *codec_ctx;
+} SvcInternal_t;
+
/**
* Set SVC options
* options are supplied as a single string separated by spaces
@@ -54,14 +84,17 @@ vpx_codec_err_t vpx_svc_set_options(SvcContext *svc_ctx, const char *options);
/**
* initialize SVC encoding
*/
-vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx,
+vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx,
+ vpx_codec_ctx_t *codec_ctx,
vpx_codec_iface_t *iface,
vpx_codec_enc_cfg_t *cfg);
/**
* encode a frame of video with multiple layers
*/
-vpx_codec_err_t vpx_svc_encode(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx,
- struct vpx_image *rawimg, vpx_codec_pts_t pts,
+vpx_codec_err_t vpx_svc_encode(SvcContext *svc_ctx,
+ vpx_codec_ctx_t *codec_ctx,
+ struct vpx_image *rawimg,
+ vpx_codec_pts_t pts,
int64_t duration, int deadline);
/**
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx/vp8cx.h b/chromium/third_party/libvpx/source/libvpx/vpx/vp8cx.h
index 77d9d6a1c52..0e8adc134c5 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx/vp8cx.h
+++ b/chromium/third_party/libvpx/source/libvpx/vpx/vp8cx.h
@@ -10,15 +10,16 @@
#ifndef VPX_VP8CX_H_
#define VPX_VP8CX_H_
-/*!\defgroup vp8_encoder WebM VP8 Encoder
+/*!\defgroup vp8_encoder WebM VP8/VP9 Encoder
* \ingroup vp8
*
* @{
*/
#include "./vp8.h"
+#include "./vpx_encoder.h"
/*!\file
- * \brief Provides definitions for using the VP8 encoder algorithm within the
+ * \brief Provides definitions for using the VP8 or VP9 encoder algorithm within the
* vpx Codec Interface.
*/
@@ -28,17 +29,20 @@ extern "C" {
/*!\name Algorithm interface for VP8
*
- * This interface provides the capability to encode raw VP8 streams, as would
- * be found in AVI files.
+ * This interface provides the capability to encode raw VP8 streams.
* @{
*/
extern vpx_codec_iface_t vpx_codec_vp8_cx_algo;
extern vpx_codec_iface_t *vpx_codec_vp8_cx(void);
+/*!@} - end algorithm interface member group*/
-/* TODO(jkoleszar): These move to VP9 in a later patch set. */
+/*!\name Algorithm interface for VP9
+ *
+ * This interface provides the capability to encode raw VP9 streams.
+ * @{
+ */
extern vpx_codec_iface_t vpx_codec_vp9_cx_algo;
extern vpx_codec_iface_t *vpx_codec_vp9_cx(void);
-
/*!@} - end algorithm interface member group*/
@@ -121,66 +125,145 @@ extern vpx_codec_iface_t *vpx_codec_vp9_cx(void);
#define VP8_EFLAG_NO_UPD_ENTROPY (1<<20)
-/*!\brief VP8 encoder control functions
+/*!\brief VPx encoder control functions
*
- * This set of macros define the control functions available for the VP8
+ * This set of macros defines the control functions available for the VPx
* encoder interface.
*
* \sa #vpx_codec_control
*/
enum vp8e_enc_control_id {
- VP8E_UPD_ENTROPY = 5, /**< control function to set mode of entropy update in encoder */
- VP8E_UPD_REFERENCE, /**< control function to set reference update mode in encoder */
- VP8E_USE_REFERENCE, /**< control function to set which reference frame encoder can use */
- VP8E_SET_ROI_MAP, /**< control function to pass an ROI map to encoder */
- VP8E_SET_ACTIVEMAP, /**< control function to pass an Active map to encoder */
- VP8E_SET_SCALEMODE = 11, /**< control function to set encoder scaling mode */
- /*!\brief control function to set vp8 encoder cpuused
+ /*!\brief Codec control function to set mode of entropy update in encoder.
+ *
+ * Supported in codecs: VP8, VP9
+ */
+ VP8E_UPD_ENTROPY = 5,
+
+ /*!\brief Codec control function to set reference update mode in encoder.
+ *
+ * Supported in codecs: VP8, VP9
+ */
+ VP8E_UPD_REFERENCE,
+
+ /*!\brief Codec control function to set which reference frame encoder can use.
+ *
+ * Supported in codecs: VP8, VP9
+ */
+ VP8E_USE_REFERENCE,
+
+ /*!\brief Codec control function to pass an ROI map to encoder.
+ *
+ * Supported in codecs: VP8, VP9
+ */
+ VP8E_SET_ROI_MAP,
+
+ /*!\brief Codec control function to pass an Active map to encoder.
+ *
+ * Supported in codecs: VP8, VP9
+ */
+ VP8E_SET_ACTIVEMAP,
+
+ /*!\brief Codec control function to set encoder scaling mode.
+ *
+ * Supported in codecs: VP8, VP9
+ */
+ VP8E_SET_SCALEMODE = 11,
+
+ /*!\brief Codec control function to set encoder internal speed settings.
*
* Changes in this value influences, among others, the encoder's selection
* of motion estimation methods. Values greater than 0 will increase encoder
* speed at the expense of quality.
- * The full set of adjustments can be found in
- * onyx_if.c:vp8_set_speed_features().
- * \todo List highlights of the changes at various levels.
*
- * \note Valid range: -16..16
+ * \note Valid range for VP8: -16..16
+ * \note Valid range for VP9: -8..8
+ *
+ * Supported in codecs: VP8, VP9
*/
VP8E_SET_CPUUSED = 13,
- VP8E_SET_ENABLEAUTOALTREF, /**< control function to enable vp8 to automatic set and use altref frame */
+
+ /*!\brief Codec control function to enable automatic use of altref frames.
+ *
+ * Supported in codecs: VP8, VP9
+ */
+ VP8E_SET_ENABLEAUTOALTREF,
+
/*!\brief control function to set noise sensitivity
*
* 0: off, 1: OnYOnly, 2: OnYUV,
* 3: OnYUVAggressive, 4: Adaptive
+ *
+ * Supported in codecs: VP8
*/
VP8E_SET_NOISE_SENSITIVITY,
- VP8E_SET_SHARPNESS, /**< control function to set sharpness */
- VP8E_SET_STATIC_THRESHOLD, /**< control function to set the threshold for macroblocks treated static */
- VP8E_SET_TOKEN_PARTITIONS, /**< control function to set the number of token partitions */
- VP8E_GET_LAST_QUANTIZER, /**< return the quantizer chosen by the
- encoder for the last frame using the internal
- scale */
- VP8E_GET_LAST_QUANTIZER_64, /**< return the quantizer chosen by the
- encoder for the last frame, using the 0..63
- scale as used by the rc_*_quantizer config
- parameters */
- VP8E_SET_ARNR_MAXFRAMES, /**< control function to set the max number of frames blurred creating arf*/
- VP8E_SET_ARNR_STRENGTH, //!< control function to set the filter
- //!< strength for the arf
-
- /*!\deprecated control function to set the filter type to use for the arf */
+
+ /*!\brief Codec control function to set sharpness.
+ *
+ * Supported in codecs: VP8, VP9
+ */
+ VP8E_SET_SHARPNESS,
+
+ /*!\brief Codec control function to set the threshold for MBs treated as static.
+ *
+ * Supported in codecs: VP8, VP9
+ */
+ VP8E_SET_STATIC_THRESHOLD,
+
+ /*!\brief Codec control function to set the number of token partitions.
+ *
+ * Supported in codecs: VP8
+ */
+ VP8E_SET_TOKEN_PARTITIONS,
+
+ /*!\brief Codec control function to get last quantizer chosen by the encoder.
+ *
+ * Return value uses internal quantizer scale defined by the codec.
+ *
+ * Supported in codecs: VP8, VP9
+ */
+ VP8E_GET_LAST_QUANTIZER,
+
+ /*!\brief Codec control function to get last quantizer chosen by the encoder.
+ *
+ * Return value uses the 0..63 scale as used by the rc_*_quantizer config
+ * parameters.
+ *
+ * Supported in codecs: VP8, VP9
+ */
+ VP8E_GET_LAST_QUANTIZER_64,
+
+ /*!\brief Codec control function to set the maximum number of frames used to create the arf.
+ *
+ * Supported in codecs: VP8, VP9
+ */
+ VP8E_SET_ARNR_MAXFRAMES,
+
+ /*!\brief Codec control function to set the filter strength for the arf.
+ *
+ * Supported in codecs: VP8, VP9
+ */
+ VP8E_SET_ARNR_STRENGTH,
+
+ /*!\deprecated control function to set the filter type to use for the arf. */
VP8E_SET_ARNR_TYPE,
- VP8E_SET_TUNING, /**< control function to set visual tuning */
- /*!\brief control function to set constrained quality level
+ /*!\brief Codec control function to set visual tuning.
+ *
+ * Supported in codecs: VP8, VP9
+ */
+ VP8E_SET_TUNING,
+
+ /*!\brief Codec control function to set constrained quality level.
*
* \attention For this value to be used vpx_codec_enc_cfg_t::g_usage must be
* set to #VPX_CQ.
* \note Valid range: 0..63
+ *
+ * Supported in codecs: VP8, VP9
*/
VP8E_SET_CQ_LEVEL,
- /*!\brief Max data rate for Intra frames
+ /*!\brief Codec control function to set Max data rate for Intra frames.
*
* This value controls additional clamping on the maximum size of a
* keyframe. It is expressed as a percentage of the average
@@ -191,32 +274,246 @@ enum vp8e_enc_control_id {
* For example, to allocate no more than 4.5 frames worth of bitrate
* to a keyframe, set this to 450.
*
+ * Supported in codecs: VP8, VP9
*/
VP8E_SET_MAX_INTRA_BITRATE_PCT,
+ /*!\brief Codec control function to set reference and update frame flags.
+ *
+ * Supported in codecs: VP8
+ */
+ VP8E_SET_FRAME_FLAGS,
+
+ /*!\brief Codec control function to set max data rate for Inter frames.
+ *
+ * This value controls additional clamping on the maximum size of an
+ * inter frame. It is expressed as a percentage of the average
+ * per-frame bitrate, with the special (and default) value 0 meaning
+ * unlimited, or no additional clamping beyond the codec's built-in
+ * algorithm.
+ *
+ * For example, to allow no more than 4.5 frames worth of bitrate
+ * to an inter frame, set this to 450.
+ *
+ * Supported in codecs: VP9
+ */
+ VP9E_SET_MAX_INTER_BITRATE_PCT,
- /* TODO(jkoleszar): Move to vp9cx.h */
+ /*!\brief Boost percentage for Golden Frame in CBR mode.
+ *
+ * This value controls the amount of boost given to Golden Frame in
+ * CBR mode. It is expressed as a percentage of the average
+ * per-frame bitrate, with the special (and default) value 0 meaning
+ * the feature is off, i.e., no golden frame boost in CBR mode and
+ * average bitrate target is used.
+ *
+ * For example, to allow 100% more bits, i.e. 2X, in a golden frame
+ * than in an average frame, set this to 100.
+ *
+ * Supported in codecs: VP9
+ */
+ VP9E_SET_GF_CBR_BOOST_PCT,
+
+ /*!\brief Codec control function to set the temporal layer id.
+ *
+ * For temporal scalability: this control allows the application to set the
+ * layer id for each frame to be encoded. Note that this control must be set
+ * for every frame prior to encoding. The usage of this control function
+ * supersedes the internal temporal pattern counter, which is now deprecated.
+ *
+ * Supported in codecs: VP8
+ */
+ VP8E_SET_TEMPORAL_LAYER_ID,
+
+ /*!\brief Codec control function to set encoder screen content mode.
+ *
+ * Supported in codecs: VP8
+ */
+ VP8E_SET_SCREEN_CONTENT_MODE,
+
+ /*!\brief Codec control function to set lossless encoding mode.
+ *
+ * VP9 can operate in lossless encoding mode, in which the bitstream
+ * produced can be decoded to reconstruct a perfect copy of the input
+ * source. This control function provides a means to switch the encoder
+ * into lossless coding mode (1) or normal, possibly lossy, coding mode (0).
+ * 0 = lossy coding mode
+ * 1 = lossless coding mode
+ *
+ * By default, the encoder operates in normal coding mode (which may be lossy).
+ *
+ * Supported in codecs: VP9
+ */
VP9E_SET_LOSSLESS,
+
+ /*!\brief Codec control function to set number of tile columns.
+ *
+ * In encoding and decoding, VP9 allows an input image frame to be partitioned
+ * into separate vertical tile columns, which can be encoded or decoded
+ * independently. This enables easy implementation of parallel encoding and
+ * decoding. This control requests the encoder to use column tiles in
+ * encoding an input frame, with the number of tile columns (in log2 units)
+ * as the parameter:
+ * 0 = 1 tile column
+ * 1 = 2 tile columns
+ * 2 = 4 tile columns
+ * .....
+ * n = 2**n tile columns
+ * The requested tile columns will be capped by encoder based on image size
+ * limitation (the minimum width of a tile column is 256 pixels, the maximum
+ * is 4096).
+ *
+ * By default, the value is 0, i.e. one single column tile for the entire image.
+ *
+ * Supported in codecs: VP9
+ */
VP9E_SET_TILE_COLUMNS,
+
+ /*!\brief Codec control function to set number of tile rows.
+ *
+ * In encoding and decoding, VP9 allows an input image frame to be partitioned
+ * into separate horizontal tile rows. Tile rows are encoded or decoded
+ * sequentially. Even though encoding/decoding of later tile rows depends on
+ * earlier ones, this allows the encoder to output data packets for tile rows
+ * prior to completely processing all tile rows in a frame, thereby reducing
+ * the latency in processing between input and output. The parameter
+ * for this control describes the number of tile rows, which has a valid
+ * range [0, 2]:
+ * 0 = 1 tile row
+ * 1 = 2 tile rows
+ * 2 = 4 tile rows
+ *
+ * By default, the value is 0, i.e. one single row tile for the entire image.
+ *
+ * Supported in codecs: VP9
+ */
VP9E_SET_TILE_ROWS,
+
+ /*!\brief Codec control function to enable frame parallel decoding feature.
+ *
+ * VP9 has a bitstream feature to reduce decoding dependency between frames
+ * by turning off backward update of probability context used in encoding
+ * and decoding. This allows staged parallel processing of more than one
+ * video frame in the decoder. This control function provides a means to
+ * turn this feature on or off for bitstreams produced by the encoder.
+ *
+ * By default, this feature is off.
+ *
+ * Supported in codecs: VP9
+ */
VP9E_SET_FRAME_PARALLEL_DECODING,
+
+ /*!\brief Codec control function to set adaptive quantization mode.
+ *
+ * VP9 has a segment-based feature that allows the encoder to adaptively
+ * change the quantization parameter for each segment within a frame to
+ * improve the subjective quality. This control makes the encoder operate in
+ * one of several supported AQ modes.
+ *
+ * By default, the encoder operates with AQ mode 0 (adaptive quantization off).
+ *
+ * Supported in codecs: VP9
+ */
VP9E_SET_AQ_MODE,
+
+ /*!\brief Codec control function to enable/disable periodic Q boost.
+ *
+ * One VP9 encoder speed feature is to enable quality boost by lowering
+ * frame-level Q periodically. This control function provides a means to
+ * turn this feature on or off.
+ * 0 = off
+ * 1 = on
+ *
+ * By default, the encoder is allowed to use this feature for appropriate
+ * encoding modes.
+ *
+ * Supported in codecs: VP9
+ */
VP9E_SET_FRAME_PERIODIC_BOOST,
- /*!\brief control function to set noise sensitivity
+
+ /*!\brief Codec control function to set noise sensitivity.
+ *
+ * 0: off, 1: On(YOnly)
*
- * 0: off, 1: OnYOnly
+ * Supported in codecs: VP9
*/
VP9E_SET_NOISE_SENSITIVITY,
+ /*!\brief Codec control function to turn on/off SVC in encoder.
+ * \note Return value is VPX_CODEC_INVALID_PARAM if the encoder does not
+ * support SVC in its current encoding mode
+ * 0: off, 1: on
+ *
+ * Supported in codecs: VP9
+ */
VP9E_SET_SVC,
+
+#if VPX_ENCODER_ABI_VERSION > (4 + VPX_CODEC_ABI_VERSION)
+ /*!\brief Codec control function to set parameters for SVC.
+ * \note Parameters contain min_q, max_q, scaling factor for each of the
+ * SVC layers.
+ *
+ * Supported in codecs: VP9
+ */
VP9E_SET_SVC_PARAMETERS,
- /*!\brief control function to set svc layer for spatial and temporal.
+#endif
+
+ /*!\brief Codec control function to set svc layer for spatial and temporal.
* \note Valid ranges: 0..#vpx_codec_enc_cfg::ss_number_layers for spatial
* layer and 0..#vpx_codec_enc_cfg::ts_number_layers for
* temporal layer.
+ *
+ * Supported in codecs: VP9
*/
VP9E_SET_SVC_LAYER_ID,
- VP9E_SET_TUNE_CONTENT
+
+ /*!\brief Codec control function to set content type.
+ * \note Valid parameter range:
+ * VP9E_CONTENT_DEFAULT = Regular video content (Default)
+ * VP9E_CONTENT_SCREEN = Screen capture content
+ *
+ * Supported in codecs: VP9
+ */
+ VP9E_SET_TUNE_CONTENT,
+
+#if VPX_ENCODER_ABI_VERSION > (4 + VPX_CODEC_ABI_VERSION)
+ /*!\brief Codec control function to get svc layer ID.
+ * \note The layer ID returned is for the data packet from the registered
+ * callback function.
+ *
+ * Supported in codecs: VP9
+ */
+ VP9E_GET_SVC_LAYER_ID,
+
+ /*!\brief Codec control function to register a callback for per-layer packets.
+ * \note Parameter for this control function is a structure with a callback
+ * function and a pointer to private data used by the callback.
+ *
+ * Supported in codecs: VP9
+ */
+ VP9E_REGISTER_CX_CALLBACK,
+#endif
+
+ /*!\brief Codec control function to set color space info.
+ * \note Valid ranges: 0..7, default is "UNKNOWN".
+ * 0 = UNKNOWN,
+ * 1 = BT_601
+ * 2 = BT_709
+ * 3 = SMPTE_170
+ * 4 = SMPTE_240
+ * 5 = BT_2020
+ * 6 = RESERVED
+ * 7 = SRGB
+ *
+ * Supported in codecs: VP9
+ */
+ VP9E_SET_COLOR_SPACE,
+
+ /*!\brief Codec control function to get an Active map back from the encoder.
+ *
+ * Supported in codecs: VP9
+ */
+ VP9E_GET_ACTIVEMAP,
};
/*!\brief vpx 1-D scaling mode
@@ -305,6 +602,7 @@ typedef enum {
VP8_TUNE_SSIM
} vp8e_tuning;
+#if VPX_ENCODER_ABI_VERSION > (4 + VPX_CODEC_ABI_VERSION)
/*!\brief vp9 svc layer parameters
*
* This defines the spatial and temporal layer id numbers for svc encoding.
@@ -316,6 +614,18 @@ typedef struct vpx_svc_layer_id {
int spatial_layer_id; /**< Spatial layer id number. */
int temporal_layer_id; /**< Temporal layer id number. */
} vpx_svc_layer_id_t;
+#else
+/*!\brief vp9 svc layer parameters
+ *
+ * This defines the temporal layer id numbers for svc encoding.
+ * This is used with the #VP9E_SET_SVC_LAYER_ID control to set the
+ * temporal layer id for the current frame.
+ *
+ */
+typedef struct vpx_svc_layer_id {
+ int temporal_layer_id; /**< Temporal layer id number. */
+} vpx_svc_layer_id_t;
+#endif
/*!\brief VP8 encoder control function parameter type
*
@@ -332,12 +642,17 @@ VPX_CTRL_USE_TYPE_DEPRECATED(VP8E_UPD_ENTROPY, int)
VPX_CTRL_USE_TYPE_DEPRECATED(VP8E_UPD_REFERENCE, int)
VPX_CTRL_USE_TYPE_DEPRECATED(VP8E_USE_REFERENCE, int)
+VPX_CTRL_USE_TYPE(VP8E_SET_FRAME_FLAGS, int)
+VPX_CTRL_USE_TYPE(VP8E_SET_TEMPORAL_LAYER_ID, int)
VPX_CTRL_USE_TYPE(VP8E_SET_ROI_MAP, vpx_roi_map_t *)
VPX_CTRL_USE_TYPE(VP8E_SET_ACTIVEMAP, vpx_active_map_t *)
VPX_CTRL_USE_TYPE(VP8E_SET_SCALEMODE, vpx_scaling_mode_t *)
VPX_CTRL_USE_TYPE(VP9E_SET_SVC, int)
+#if VPX_ENCODER_ABI_VERSION > (4 + VPX_CODEC_ABI_VERSION)
VPX_CTRL_USE_TYPE(VP9E_SET_SVC_PARAMETERS, void *)
+VPX_CTRL_USE_TYPE(VP9E_REGISTER_CX_CALLBACK, void *)
+#endif
VPX_CTRL_USE_TYPE(VP9E_SET_SVC_LAYER_ID, vpx_svc_layer_id_t *)
VPX_CTRL_USE_TYPE(VP8E_SET_CPUUSED, int)
@@ -358,8 +673,16 @@ VPX_CTRL_USE_TYPE(VP9E_SET_TILE_ROWS, int)
VPX_CTRL_USE_TYPE(VP8E_GET_LAST_QUANTIZER, int *)
VPX_CTRL_USE_TYPE(VP8E_GET_LAST_QUANTIZER_64, int *)
+#if VPX_ENCODER_ABI_VERSION > (4 + VPX_CODEC_ABI_VERSION)
+VPX_CTRL_USE_TYPE(VP9E_GET_SVC_LAYER_ID, vpx_svc_layer_id_t *)
+#endif
VPX_CTRL_USE_TYPE(VP8E_SET_MAX_INTRA_BITRATE_PCT, unsigned int)
+VPX_CTRL_USE_TYPE(VP8E_SET_MAX_INTER_BITRATE_PCT, unsigned int)
+
+VPX_CTRL_USE_TYPE(VP8E_SET_SCREEN_CONTENT_MODE, unsigned int)
+
+VPX_CTRL_USE_TYPE(VP9E_SET_GF_CBR_BOOST_PCT, unsigned int)
VPX_CTRL_USE_TYPE(VP9E_SET_LOSSLESS, unsigned int)
@@ -372,6 +695,10 @@ VPX_CTRL_USE_TYPE(VP9E_SET_FRAME_PERIODIC_BOOST, unsigned int)
VPX_CTRL_USE_TYPE(VP9E_SET_NOISE_SENSITIVITY, unsigned int)
VPX_CTRL_USE_TYPE(VP9E_SET_TUNE_CONTENT, int) /* vp9e_tune_content */
+
+VPX_CTRL_USE_TYPE(VP9E_SET_COLOR_SPACE, int)
+
+VPX_CTRL_USE_TYPE(VP9E_GET_ACTIVEMAP, vpx_active_map_t *)
/*! @} - end defgroup vp8_encoder */
#ifdef __cplusplus
} // extern "C"
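The expanded control list is exercised through vpx_codec_control(). A short sketch of a few of the VP9 controls documented above, on an already-initialized encoder context (values are illustrative; error handling elided):

    vpx_codec_control(&codec, VP9E_SET_TILE_COLUMNS, 2);            /* 4 tile columns */
    vpx_codec_control(&codec, VP9E_SET_FRAME_PARALLEL_DECODING, 1);
    vpx_codec_control(&codec, VP8E_SET_CPUUSED, 4);                 /* VP9 range: -8..8 */
    vpx_codec_control(&codec, VP9E_SET_COLOR_SPACE, VPX_CS_BT_709);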
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx/vp8dx.h b/chromium/third_party/libvpx/source/libvpx/vpx/vp8dx.h
index 379b3062089..83898bf8496 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx/vp8dx.h
+++ b/chromium/third_party/libvpx/source/libvpx/vpx/vp8dx.h
@@ -9,13 +9,13 @@
*/
-/*!\defgroup vp8_decoder WebM VP8 Decoder
+/*!\defgroup vp8_decoder WebM VP8/VP9 Decoder
* \ingroup vp8
*
* @{
*/
/*!\file
- * \brief Provides definitions for using the VP8 algorithm within the vpx Decoder
+ * \brief Provides definitions for using VP8 or VP9 within the vpx Decoder
* interface.
*/
#ifndef VPX_VP8DX_H_
@@ -30,14 +30,18 @@ extern "C" {
/*!\name Algorithm interface for VP8
*
- * This interface provides the capability to decode raw VP8 streams, as would
- * be found in AVI files and other non-Flash uses.
+ * This interface provides the capability to decode VP8 streams.
* @{
*/
extern vpx_codec_iface_t vpx_codec_vp8_dx_algo;
extern vpx_codec_iface_t *vpx_codec_vp8_dx(void);
+/*!@} - end algorithm interface member group*/
-/* TODO(jkoleszar): These move to VP9 in a later patch set. */
+/*!\name Algorithm interface for VP9
+ *
+ * This interface provides the capability to decode VP9 streams.
+ * @{
+ */
extern vpx_codec_iface_t vpx_codec_vp9_dx_algo;
extern vpx_codec_iface_t *vpx_codec_vp9_dx(void);
/*!@} - end algorithm interface member group*/
@@ -72,13 +76,34 @@ enum vp8_dec_control_id {
VPXD_SET_DECRYPTOR,
VP8D_SET_DECRYPTOR = VPXD_SET_DECRYPTOR,
- /** control function to get the display dimensions for the current frame. */
+ /** control function to get the dimensions that the current frame is decoded
+ * at. This may differ from the intended display size for the frame as
+ * specified in the wrapper or frame header (see VP9D_GET_DISPLAY_SIZE). */
+ VP9D_GET_FRAME_SIZE,
+
+ /** control function to get the current frame's intended display dimensions
+ * (as specified in the wrapper or frame header). This may differ from
+ * the decoded dimensions of this frame (see VP9D_GET_FRAME_SIZE). */
VP9D_GET_DISPLAY_SIZE,
/** control function to get the bit depth of the stream. */
VP9D_GET_BIT_DEPTH,
- /** For testing. */
+ /** control function to set the byte alignment of the planes in the reference
+ * buffers. Valid values are powers of 2, from 32 to 1024. A value of 0 sets
+ * legacy alignment, i.e. the Y plane is aligned to 32 bytes, the U plane
+ * directly follows the Y plane, and the V plane directly follows the U
+ * plane. The default value is 0.
+ */
+ VP9_SET_BYTE_ALIGNMENT,
+
+ /** control function to invert the decoding order to right-to-left. The
+ * function is used in a test to confirm the decoding independence of tile
+ * columns. The function may be used in applications where this order
+ * of decoding is desired.
+ *
+ * TODO(yaowu): Rework the unit test that uses this control, and in a future
+ * release, this test-only control shall be removed.
+ */
VP9_INVERT_TILE_DECODE_ORDER,
VP8_DECODER_CTRL_ID_MAX
@@ -122,6 +147,7 @@ VPX_CTRL_USE_TYPE(VPXD_SET_DECRYPTOR, vpx_decrypt_init *)
VPX_CTRL_USE_TYPE(VP8D_SET_DECRYPTOR, vpx_decrypt_init *)
VPX_CTRL_USE_TYPE(VP9D_GET_DISPLAY_SIZE, int *)
VPX_CTRL_USE_TYPE(VP9D_GET_BIT_DEPTH, unsigned int *)
+VPX_CTRL_USE_TYPE(VP9D_GET_FRAME_SIZE, int *)
VPX_CTRL_USE_TYPE(VP9_INVERT_TILE_DECODE_ORDER, int)
/*! @} - end defgroup vp8_decoder */
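The two size queries above are easy to conflate: VP9D_GET_FRAME_SIZE reports the decoded dimensions, while VP9D_GET_DISPLAY_SIZE reports the intended display dimensions. A sketch on an initialized decoder context — note that the interface changes earlier in this patch reject VP9D_GET_FRAME_SIZE in frame-parallel mode:

    int frame_size[2], display_size[2];
    /* Powers of 2 from 32 to 1024 are valid; 0 restores legacy alignment. */
    vpx_codec_control(&codec, VP9_SET_BYTE_ALIGNMENT, 64);
    /* ... decode a frame ... */
    vpx_codec_control(&codec, VP9D_GET_FRAME_SIZE, frame_size);
    vpx_codec_control(&codec, VP9D_GET_DISPLAY_SIZE, display_size);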
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx/vpx_codec.h b/chromium/third_party/libvpx/source/libvpx/vpx/vpx_codec.h
index b25308ed9a6..b94e17370a2 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx/vpx_codec.h
+++ b/chromium/third_party/libvpx/source/libvpx/vpx/vpx_codec.h
@@ -83,7 +83,7 @@ extern "C" {
* types, removing or reassigning enums, adding/removing/rearranging
* fields to structures
*/
-#define VPX_CODEC_ABI_VERSION (2 + VPX_IMAGE_ABI_VERSION) /**<\hideinitializer*/
+#define VPX_CODEC_ABI_VERSION (3 + VPX_IMAGE_ABI_VERSION) /**<\hideinitializer*/
/*!\brief Algorithm return codes */
typedef enum {
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx/vpx_encoder.h b/chromium/third_party/libvpx/source/libvpx/vpx/vpx_encoder.h
index 044243d6b73..bf75584d589 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx/vpx_encoder.h
+++ b/chromium/third_party/libvpx/source/libvpx/vpx/vpx_encoder.h
@@ -59,7 +59,7 @@ extern "C" {
* types, removing or reassigning enums, adding/removing/rearranging
* fields to structures
*/
-#define VPX_ENCODER_ABI_VERSION (3 + VPX_CODEC_ABI_VERSION) /**<\hideinitializer*/
+#define VPX_ENCODER_ABI_VERSION (4 + VPX_CODEC_ABI_VERSION) /**<\hideinitializer*/
/*! \brief Encoder capabilities bitfield
@@ -161,9 +161,9 @@ extern "C" {
VPX_CODEC_STATS_PKT, /**< Two-pass statistics for this frame */
VPX_CODEC_FPMB_STATS_PKT, /**< first pass mb statistics for this frame */
VPX_CODEC_PSNR_PKT, /**< PSNR statistics for this frame */
- // TODO(minghai): This is for testing purporses. The released library can't
- // depend on vpx_config.h
-#if defined(CONFIG_SPATIAL_SVC) && CONFIG_SPATIAL_SVC
+ // Spatial SVC is still experimental and may be removed before the next ABI
+ // bump.
+#if VPX_ENCODER_ABI_VERSION > (4 + VPX_CODEC_ABI_VERSION)
VPX_CODEC_SPATIAL_SVC_LAYER_SIZES, /**< Sizes for each layer in this frame*/
VPX_CODEC_SPATIAL_SVC_LAYER_PSNR, /**< PSNR for each layer in this frame*/
#endif
@@ -203,9 +203,9 @@ extern "C" {
double psnr[4]; /**< PSNR, total/y/u/v */
} psnr; /**< data for PSNR packet */
vpx_fixed_buf_t raw; /**< data for arbitrary packets */
- // TODO(minghai): This is for testing purporses. The released library
- // can't depend on vpx_config.h
-#if defined(CONFIG_SPATIAL_SVC) && CONFIG_SPATIAL_SVC
+ // Spatial SVC is still experimental and may be removed before the next
+ // ABI bump.
+#if VPX_ENCODER_ABI_VERSION > (4 + VPX_CODEC_ABI_VERSION)
size_t layer_sizes[VPX_SS_MAX_LAYERS];
struct vpx_psnr_pkt layer_psnr[VPX_SS_MAX_LAYERS];
#endif
@@ -220,6 +220,22 @@ extern "C" {
} vpx_codec_cx_pkt_t; /**< alias for struct vpx_codec_cx_pkt */
+ /*!\brief Encoder return output buffer callback
+ *
+ * This callback function, when registered, is invoked with a packet each
+ * time a spatial layer is encoded.
+ */
+ // putting the definitions here for now. (agrange: find if there
+ // is a better place for this)
+ typedef void (* vpx_codec_enc_output_cx_pkt_cb_fn_t)(vpx_codec_cx_pkt_t *pkt,
+ void *user_data);
+
+ /*!\brief Callback function pointer / user data pair storage */
+ typedef struct vpx_codec_enc_output_cx_cb_pair {
+ vpx_codec_enc_output_cx_pkt_cb_fn_t output_cx_pkt; /**< Callback function */
+ void *user_priv; /**< Pointer to private data */
+ } vpx_codec_priv_output_cx_pkt_cb_pair_t;
+
/*!\brief Rational Number
*
* This structure holds a fractional value.
@@ -721,10 +737,10 @@ extern "C" {
*
*/
typedef struct vpx_svc_parameters {
- int max_quantizers[VPX_SS_MAX_LAYERS];
- int min_quantizers[VPX_SS_MAX_LAYERS];
- int scaling_factor_num[VPX_SS_MAX_LAYERS];
- int scaling_factor_den[VPX_SS_MAX_LAYERS];
+ int max_quantizers[VPX_SS_MAX_LAYERS]; /**< Max Q for each layer */
+ int min_quantizers[VPX_SS_MAX_LAYERS]; /**< Min Q for each layer */
+ int scaling_factor_num[VPX_SS_MAX_LAYERS]; /**< Scaling factor-numerator*/
+ int scaling_factor_den[VPX_SS_MAX_LAYERS]; /**< Scaling factor-denominator*/
} vpx_svc_extra_cfg_t;
@@ -811,9 +827,9 @@ extern "C" {
* be called by all applications to initialize the configuration structure
* before specializing the configuration with application specific values.
*
- * \param[in] iface Pointer to the algorithm interface to use.
- * \param[out] cfg Configuration buffer to populate
- * \param[in] usage End usage. Set to 0 or use codec specific values.
+ * \param[in] iface Pointer to the algorithm interface to use.
+ * \param[out] cfg Configuration buffer to populate.
+ * \param[in] reserved Must be set to 0 for VP8 and VP9.
*
* \retval #VPX_CODEC_OK
* The configuration was populated.
@@ -824,7 +840,7 @@ extern "C" {
*/
vpx_codec_err_t vpx_codec_enc_config_default(vpx_codec_iface_t *iface,
vpx_codec_enc_cfg_t *cfg,
- unsigned int usage);
+ unsigned int reserved);
/*!\brief Set or change configuration
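Renaming the third parameter to `reserved` formalizes existing practice: VP8 and VP9 callers already pass 0. Typical initialization is unchanged; a minimal sketch:

    vpx_codec_enc_cfg_t cfg;
    vpx_codec_err_t res = vpx_codec_enc_config_default(vpx_codec_vp9_cx(), &cfg, 0);
    if (res != VPX_CODEC_OK)
      return res;
    cfg.g_w = 640;   /* then specialize further before vpx_codec_enc_init() */
    cfg.g_h = 480;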
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx/vpx_frame_buffer.h b/chromium/third_party/libvpx/source/libvpx/vpx/vpx_frame_buffer.h
index 41038b10df6..9036459af0a 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx/vpx_frame_buffer.h
+++ b/chromium/third_party/libvpx/source/libvpx/vpx/vpx_frame_buffer.h
@@ -22,8 +22,11 @@ extern "C" {
#include "./vpx_integer.h"
/*!\brief The maximum number of work buffers used by libvpx.
+ * Supports a maximum of 4 threads decoding video in parallel.
+ * Each thread will use one work buffer.
+ * TODO(hkuang): Add support to set the number of worker threads dynamically.
*/
-#define VPX_MAXIMUM_WORK_BUFFERS 1
+#define VPX_MAXIMUM_WORK_BUFFERS 8
/*!\brief The maximum number of reference buffers that a VP9 encoder may use.
*/
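Raising VPX_MAXIMUM_WORK_BUFFERS matters to applications that supply external frame buffers: the minimum pool a decoder may request is VP9_MAXIMUM_REF_BUFFERS + VPX_MAXIMUM_WORK_BUFFERS, so pools sized from these constants grow automatically with this change. A sketch, where get_fb/release_fb are hypothetical vpx_get_frame_buffer_cb_fn_t / vpx_release_frame_buffer_cb_fn_t implementations over a pool of that size and die() is a hypothetical helper:

    #define POOL_SIZE (VP9_MAXIMUM_REF_BUFFERS + VPX_MAXIMUM_WORK_BUFFERS)

    if (vpx_codec_set_frame_buffer_functions(&codec, get_fb, release_fb, pool))
      die("failed to set external frame buffer functions");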
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx/vpx_image.h b/chromium/third_party/libvpx/source/libvpx/vpx/vpx_image.h
index 337e4c4bedb..c06d35101cc 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx/vpx_image.h
+++ b/chromium/third_party/libvpx/source/libvpx/vpx/vpx_image.h
@@ -28,7 +28,7 @@ extern "C" {
* types, removing or reassigning enums, adding/removing/rearranging
* fields to structures
*/
-#define VPX_IMAGE_ABI_VERSION (2) /**<\hideinitializer*/
+#define VPX_IMAGE_ABI_VERSION (3) /**<\hideinitializer*/
#define VPX_IMG_FMT_PLANAR 0x100 /**< Image is a planar format. */
@@ -66,9 +66,22 @@ extern "C" {
VPX_IMG_FMT_I44016 = VPX_IMG_FMT_I440 | VPX_IMG_FMT_HIGHBITDEPTH
} vpx_img_fmt_t; /**< alias for enum vpx_img_fmt */
+ /*!\brief List of supported color spaces */
+ typedef enum vpx_color_space {
+ VPX_CS_UNKNOWN = 0, /**< Unknown */
+ VPX_CS_BT_601 = 1, /**< BT.601 */
+ VPX_CS_BT_709 = 2, /**< BT.709 */
+ VPX_CS_SMPTE_170 = 3, /**< SMPTE.170 */
+ VPX_CS_SMPTE_240 = 4, /**< SMPTE.240 */
+ VPX_CS_BT_2020 = 5, /**< BT.2020 */
+ VPX_CS_RESERVED = 6, /**< Reserved */
+ VPX_CS_SRGB = 7 /**< sRGB */
+ } vpx_color_space_t; /**< alias for enum vpx_color_space */
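The new cs field travels with the raw image: encoders pick it up via image2yuvconfig() (see the vp9_iface_common.h hunk above) and decoders report it back through yuvconfig2image(). Setting it on an allocated image is a one-liner; a sketch:

    vpx_image_t *raw = vpx_img_alloc(NULL, VPX_IMG_FMT_I420, 640, 480, 1);
    raw->cs = VPX_CS_BT_709;  /* propagated into YV12_BUFFER_CONFIG::color_space */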
+
/**\brief Image Descriptor */
typedef struct vpx_image {
vpx_img_fmt_t fmt; /**< Image Format */
+ vpx_color_space_t cs; /**< Color Space */
/* Image storage dimensions */
unsigned int w; /**< Stored image width */
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx/vpx_integer.h b/chromium/third_party/libvpx/source/libvpx/vpx/vpx_integer.h
index 500f9b901eb..829c9d132c8 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx/vpx_integer.h
+++ b/chromium/third_party/libvpx/source/libvpx/vpx/vpx_integer.h
@@ -37,6 +37,8 @@ typedef unsigned int uint32_t;
typedef signed __int64 int64_t;
typedef unsigned __int64 uint64_t;
#define INT64_MAX _I64_MAX
+#define INT32_MAX _I32_MAX
+#define INT32_MIN _I32_MIN
#define INT16_MAX _I16_MAX
#define INT16_MIN _I16_MIN
#endif
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/sad4d_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/sad4d_neon.c
new file mode 100644
index 00000000000..c7704dc1be6
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/sad4d_neon.c
@@ -0,0 +1,226 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo,
+ const uint16x8_t vec_hi) {
+ const uint32x4_t vec_l_lo = vaddl_u16(vget_low_u16(vec_lo),
+ vget_high_u16(vec_lo));
+ const uint32x4_t vec_l_hi = vaddl_u16(vget_low_u16(vec_hi),
+ vget_high_u16(vec_hi));
+ const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi);
+ const uint64x2_t b = vpaddlq_u32(a);
+ const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
+ vreinterpret_u32_u64(vget_high_u64(b)));
+ return vget_lane_u32(c, 0);
+}
+
+// Calculate the absolute difference of 64 bytes from vec_src_00, vec_src_16,
+// vec_src_32, vec_src_48 and ref. Accumulate partial sums in vec_sum_ref_lo
+// and vec_sum_ref_hi.
+static void sad_neon_64(const uint8x16_t vec_src_00,
+ const uint8x16_t vec_src_16,
+ const uint8x16_t vec_src_32,
+ const uint8x16_t vec_src_48,
+ const uint8_t *ref,
+ uint16x8_t *vec_sum_ref_lo,
+ uint16x8_t *vec_sum_ref_hi) {
+ const uint8x16_t vec_ref_00 = vld1q_u8(ref);
+ const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16);
+ const uint8x16_t vec_ref_32 = vld1q_u8(ref + 32);
+ const uint8x16_t vec_ref_48 = vld1q_u8(ref + 48);
+
+ *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_00),
+ vget_low_u8(vec_ref_00));
+ *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_00),
+ vget_high_u8(vec_ref_00));
+ *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_16),
+ vget_low_u8(vec_ref_16));
+ *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_16),
+ vget_high_u8(vec_ref_16));
+ *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_32),
+ vget_low_u8(vec_ref_32));
+ *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_32),
+ vget_high_u8(vec_ref_32));
+ *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_48),
+ vget_low_u8(vec_ref_48));
+ *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_48),
+ vget_high_u8(vec_ref_48));
+}
+
+// Calculate the absolute difference of 32 bytes from vec_src_00, vec_src_16,
+// and ref. Accumulate partial sums in vec_sum_ref_lo and vec_sum_ref_hi.
+static void sad_neon_32(const uint8x16_t vec_src_00,
+ const uint8x16_t vec_src_16,
+ const uint8_t *ref,
+ uint16x8_t *vec_sum_ref_lo,
+ uint16x8_t *vec_sum_ref_hi) {
+ const uint8x16_t vec_ref_00 = vld1q_u8(ref);
+ const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16);
+
+ *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_00),
+ vget_low_u8(vec_ref_00));
+ *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_00),
+ vget_high_u8(vec_ref_00));
+ *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_16),
+ vget_low_u8(vec_ref_16));
+ *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_16),
+ vget_high_u8(vec_ref_16));
+}
+
+void vpx_sad64x64x4d_neon(const uint8_t *src, int src_stride,
+ const uint8_t* const ref[4], int ref_stride,
+ uint32_t *res) {
+ int i;
+ uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0);
+ const uint8_t *ref0, *ref1, *ref2, *ref3;
+ ref0 = ref[0];
+ ref1 = ref[1];
+ ref2 = ref[2];
+ ref3 = ref[3];
+
+ for (i = 0; i < 64; ++i) {
+ const uint8x16_t vec_src_00 = vld1q_u8(src);
+ const uint8x16_t vec_src_16 = vld1q_u8(src + 16);
+ const uint8x16_t vec_src_32 = vld1q_u8(src + 32);
+ const uint8x16_t vec_src_48 = vld1q_u8(src + 48);
+
+ sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref0,
+ &vec_sum_ref0_lo, &vec_sum_ref0_hi);
+ sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref1,
+ &vec_sum_ref1_lo, &vec_sum_ref1_hi);
+ sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref2,
+ &vec_sum_ref2_lo, &vec_sum_ref2_hi);
+ sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref3,
+ &vec_sum_ref3_lo, &vec_sum_ref3_hi);
+
+ src += src_stride;
+ ref0 += ref_stride;
+ ref1 += ref_stride;
+ ref2 += ref_stride;
+ ref3 += ref_stride;
+ }
+
+ res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi);
+ res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi);
+ res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi);
+ res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi);
+}
+
+void vpx_sad32x32x4d_neon(const uint8_t *src, int src_stride,
+ const uint8_t* const ref[4], int ref_stride,
+ uint32_t *res) {
+ int i;
+ uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0);
+ const uint8_t *ref0, *ref1, *ref2, *ref3;
+ ref0 = ref[0];
+ ref1 = ref[1];
+ ref2 = ref[2];
+ ref3 = ref[3];
+
+ for (i = 0; i < 32; ++i) {
+ const uint8x16_t vec_src_00 = vld1q_u8(src);
+ const uint8x16_t vec_src_16 = vld1q_u8(src + 16);
+
+ sad_neon_32(vec_src_00, vec_src_16, ref0,
+ &vec_sum_ref0_lo, &vec_sum_ref0_hi);
+ sad_neon_32(vec_src_00, vec_src_16, ref1,
+ &vec_sum_ref1_lo, &vec_sum_ref1_hi);
+ sad_neon_32(vec_src_00, vec_src_16, ref2,
+ &vec_sum_ref2_lo, &vec_sum_ref2_hi);
+ sad_neon_32(vec_src_00, vec_src_16, ref3,
+ &vec_sum_ref3_lo, &vec_sum_ref3_hi);
+
+ src += src_stride;
+ ref0 += ref_stride;
+ ref1 += ref_stride;
+ ref2 += ref_stride;
+ ref3 += ref_stride;
+ }
+
+ res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi);
+ res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi);
+ res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi);
+ res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi);
+}
+
+void vpx_sad16x16x4d_neon(const uint8_t *src, int src_stride,
+ const uint8_t* const ref[4], int ref_stride,
+ uint32_t *res) {
+ int i;
+ uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0);
+ uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0);
+ const uint8_t *ref0, *ref1, *ref2, *ref3;
+ ref0 = ref[0];
+ ref1 = ref[1];
+ ref2 = ref[2];
+ ref3 = ref[3];
+
+ for (i = 0; i < 16; ++i) {
+ const uint8x16_t vec_src = vld1q_u8(src);
+ const uint8x16_t vec_ref0 = vld1q_u8(ref0);
+ const uint8x16_t vec_ref1 = vld1q_u8(ref1);
+ const uint8x16_t vec_ref2 = vld1q_u8(ref2);
+ const uint8x16_t vec_ref3 = vld1q_u8(ref3);
+
+ vec_sum_ref0_lo = vabal_u8(vec_sum_ref0_lo, vget_low_u8(vec_src),
+ vget_low_u8(vec_ref0));
+ vec_sum_ref0_hi = vabal_u8(vec_sum_ref0_hi, vget_high_u8(vec_src),
+ vget_high_u8(vec_ref0));
+ vec_sum_ref1_lo = vabal_u8(vec_sum_ref1_lo, vget_low_u8(vec_src),
+ vget_low_u8(vec_ref1));
+ vec_sum_ref1_hi = vabal_u8(vec_sum_ref1_hi, vget_high_u8(vec_src),
+ vget_high_u8(vec_ref1));
+ vec_sum_ref2_lo = vabal_u8(vec_sum_ref2_lo, vget_low_u8(vec_src),
+ vget_low_u8(vec_ref2));
+ vec_sum_ref2_hi = vabal_u8(vec_sum_ref2_hi, vget_high_u8(vec_src),
+ vget_high_u8(vec_ref2));
+ vec_sum_ref3_lo = vabal_u8(vec_sum_ref3_lo, vget_low_u8(vec_src),
+ vget_low_u8(vec_ref3));
+ vec_sum_ref3_hi = vabal_u8(vec_sum_ref3_hi, vget_high_u8(vec_src),
+ vget_high_u8(vec_ref3));
+
+ src += src_stride;
+ ref0 += ref_stride;
+ ref1 += ref_stride;
+ ref2 += ref_stride;
+ ref3 += ref_stride;
+ }
+
+ res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi);
+ res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi);
+ res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi);
+ res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi);
+}
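All three NEON kernels above compute the same quantity at different block sizes: four SADs of one source block against four candidate reference blocks. A scalar reference for the 64x64 case (illustration only, not part of the patch):

    #include <stdlib.h>
    #include "vpx/vpx_integer.h"

    static void sad64x64x4d_model(const uint8_t *src, int src_stride,
                                  const uint8_t *const ref[4], int ref_stride,
                                  uint32_t res[4]) {
      int k, i, j;
      for (k = 0; k < 4; ++k) {
        const uint8_t *s = src;
        const uint8_t *r = ref[k];
        uint32_t sad = 0;
        for (i = 0; i < 64; ++i) {
          for (j = 0; j < 64; ++j) sad += abs(s[j] - r[j]);
          s += src_stride;
          r += ref_stride;
        }
        res[k] = sad;
      }
    }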
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/armv6/vp8_sad16x16_armv6.asm b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/sad_media.asm
index 1b4f5cf3b0f..aed1d3a22ed 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/arm/armv6/vp8_sad16x16_armv6.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/sad_media.asm
@@ -9,7 +9,7 @@
;
- EXPORT |vp8_sad16x16_armv6|
+ EXPORT |vpx_sad16x16_media|
ARM
REQUIRE8
@@ -21,8 +21,7 @@
; r1 int src_stride
; r2 const unsigned char *ref_ptr
; r3 int ref_stride
-; stack max_sad (not used)
-|vp8_sad16x16_armv6| PROC
+|vpx_sad16x16_media| PROC
stmfd sp!, {r4-r12, lr}
pld [r0, r1, lsl #0]
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/arm/neon/vp9_sad_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/sad_neon.c
index c4cd856804d..173f08ac3c3 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/arm/neon/vp9_sad_neon.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/sad_neon.c
@@ -9,11 +9,113 @@
*/
#include <arm_neon.h>
-#include "./vp9_rtcd.h"
+
#include "./vpx_config.h"
#include "vpx/vpx_integer.h"
+unsigned int vpx_sad8x16_neon(
+ unsigned char *src_ptr,
+ int src_stride,
+ unsigned char *ref_ptr,
+ int ref_stride) {
+ uint8x8_t d0, d8;
+ uint16x8_t q12;
+ uint32x4_t q1;
+ uint64x2_t q3;
+ uint32x2_t d5;
+ int i;
+
+ d0 = vld1_u8(src_ptr);
+ src_ptr += src_stride;
+ d8 = vld1_u8(ref_ptr);
+ ref_ptr += ref_stride;
+ q12 = vabdl_u8(d0, d8);
+
+ for (i = 0; i < 15; i++) {
+ d0 = vld1_u8(src_ptr);
+ src_ptr += src_stride;
+ d8 = vld1_u8(ref_ptr);
+ ref_ptr += ref_stride;
+ q12 = vabal_u8(q12, d0, d8);
+ }
+
+ q1 = vpaddlq_u16(q12);
+ q3 = vpaddlq_u32(q1);
+ d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
+ vreinterpret_u32_u64(vget_high_u64(q3)));
+
+ return vget_lane_u32(d5, 0);
+}
+
+unsigned int vpx_sad4x4_neon(
+ unsigned char *src_ptr,
+ int src_stride,
+ unsigned char *ref_ptr,
+ int ref_stride) {
+ uint8x8_t d0, d8;
+ uint16x8_t q12;
+ uint32x2_t d1;
+ uint64x1_t d3;
+ int i;
+
+ d0 = vld1_u8(src_ptr);
+ src_ptr += src_stride;
+ d8 = vld1_u8(ref_ptr);
+ ref_ptr += ref_stride;
+ q12 = vabdl_u8(d0, d8);
+
+ for (i = 0; i < 3; i++) {
+ d0 = vld1_u8(src_ptr);
+ src_ptr += src_stride;
+ d8 = vld1_u8(ref_ptr);
+ ref_ptr += ref_stride;
+ q12 = vabal_u8(q12, d0, d8);
+ }
+
+ d1 = vpaddl_u16(vget_low_u16(q12));
+ d3 = vpaddl_u32(d1);
+
+ return vget_lane_u32(vreinterpret_u32_u64(d3), 0);
+}
+
+unsigned int vpx_sad16x8_neon(
+ unsigned char *src_ptr,
+ int src_stride,
+ unsigned char *ref_ptr,
+ int ref_stride) {
+ uint8x16_t q0, q4;
+ uint16x8_t q12, q13;
+ uint32x4_t q1;
+ uint64x2_t q3;
+ uint32x2_t d5;
+ int i;
+
+ q0 = vld1q_u8(src_ptr);
+ src_ptr += src_stride;
+ q4 = vld1q_u8(ref_ptr);
+ ref_ptr += ref_stride;
+ q12 = vabdl_u8(vget_low_u8(q0), vget_low_u8(q4));
+ q13 = vabdl_u8(vget_high_u8(q0), vget_high_u8(q4));
+
+ for (i = 0; i < 7; i++) {
+ q0 = vld1q_u8(src_ptr);
+ src_ptr += src_stride;
+ q4 = vld1q_u8(ref_ptr);
+ ref_ptr += ref_stride;
+ q12 = vabal_u8(q12, vget_low_u8(q0), vget_low_u8(q4));
+ q13 = vabal_u8(q13, vget_high_u8(q0), vget_high_u8(q4));
+ }
+
+ q12 = vaddq_u16(q12, q13);
+ q1 = vpaddlq_u16(q12);
+ q3 = vpaddlq_u32(q1);
+ d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
+ vreinterpret_u32_u64(vget_high_u64(q3)));
+
+ return vget_lane_u32(d5, 0);
+}
+
static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo,
const uint16x8_t vec_hi) {
const uint32x4_t vec_l_lo = vaddl_u16(vget_low_u16(vec_lo),
@@ -34,7 +136,7 @@ static INLINE unsigned int horizontal_add_16x8(const uint16x8_t vec_16x8) {
return vget_lane_u32(c, 0);
}
-unsigned int vp9_sad64x64_neon(const uint8_t *src, int src_stride,
+unsigned int vpx_sad64x64_neon(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride) {
int i;
uint16x8_t vec_accum_lo = vdupq_n_u16(0);
@@ -70,7 +172,7 @@ unsigned int vp9_sad64x64_neon(const uint8_t *src, int src_stride,
return horizontal_long_add_16x8(vec_accum_lo, vec_accum_hi);
}
-unsigned int vp9_sad32x32_neon(const uint8_t *src, int src_stride,
+unsigned int vpx_sad32x32_neon(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride) {
int i;
uint16x8_t vec_accum_lo = vdupq_n_u16(0);
@@ -95,7 +197,7 @@ unsigned int vp9_sad32x32_neon(const uint8_t *src, int src_stride,
return horizontal_add_16x8(vaddq_u16(vec_accum_lo, vec_accum_hi));
}
-unsigned int vp9_sad16x16_neon(const uint8_t *src, int src_stride,
+unsigned int vpx_sad16x16_neon(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride) {
int i;
uint16x8_t vec_accum_lo = vdupq_n_u16(0);
@@ -114,7 +216,7 @@ unsigned int vp9_sad16x16_neon(const uint8_t *src, int src_stride,
return horizontal_add_16x8(vaddq_u16(vec_accum_lo, vec_accum_hi));
}
-unsigned int vp9_sad8x8_neon(const uint8_t *src, int src_stride,
+unsigned int vpx_sad8x8_neon(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride) {
int i;
uint16x8_t vec_accum = vdupq_n_u16(0);
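Every kernel in this file follows one shape: the first row initializes the 16-bit lane accumulators with vabdl, each remaining row accumulates with vabal (hence loop counts of 15, 3 and 7), and a pairwise-add chain folds the lanes into a scalar. The 4x4 kernel loads 8 bytes per row and lets the final vget_low_u16 reduction discard the four lanes that fall outside the block. A scalar model of the 8-wide pattern (illustration only):

    #include <stdlib.h>
    #include "vpx/vpx_integer.h"

    static unsigned int sad8x16_model(const uint8_t *src, int src_stride,
                                      const uint8_t *ref, int ref_stride) {
      uint16_t acc[8] = { 0 };  /* one lane per column, like q12 */
      unsigned int sum = 0;
      int i, j;
      for (i = 0; i < 16; ++i) {  /* vabdl once, then vabal 15 times */
        for (j = 0; j < 8; ++j)
          acc[j] = (uint16_t)(acc[j] + abs(src[j] - ref[j]));
        src += src_stride;
        ref += ref_stride;
      }
      for (j = 0; j < 8; ++j) sum += acc[j];  /* vpaddl/vadd reduction */
      return sum;
    }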
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_sad.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/sad.c
index 73134f2f2c0..9db312fbe05 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_sad.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/sad.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
@@ -10,15 +10,19 @@
#include <stdlib.h>
-#include "./vp9_rtcd.h"
#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
+
#if CONFIG_VP9_HIGHBITDEPTH
#include "vp9/common/vp9_common.h"
-#endif
-#include "vp9/encoder/vp9_variance.h"
+#endif // CONFIG_VP9_HIGHBITDEPTH
+// Temporary ...
+#define ROUND_POWER_OF_TWO(value, n) \
+ (((value) + (1 << ((n) - 1))) >> (n))
+/* Sum the difference between every corresponding element of the buffers. */
static INLINE unsigned int sad(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride,
int width, int height) {
@@ -35,35 +39,78 @@ static INLINE unsigned int sad(const uint8_t *a, int a_stride,
return sad;
}
+/* Remove dependency on vp9 variance function by duplicating vp9_comp_avg_pred.
+ * The function averages every corresponding element of the buffers and stores
+ * the value in a third buffer, comp_pred.
+ * pred and comp_pred are assumed to have stride = width
+ * In the usage below comp_pred is a local array.
+ */
+static INLINE void avg_pred(uint8_t *comp_pred, const uint8_t *pred, int width,
+ int height, const uint8_t *ref, int ref_stride) {
+ int i, j;
+
+ for (i = 0; i < height; i++) {
+ for (j = 0; j < width; j++) {
+ const int tmp = pred[j] + ref[j];
+ comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
+ }
+ comp_pred += width;
+ pred += width;
+ ref += ref_stride;
+ }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static INLINE void highbd_avg_pred(uint16_t *comp_pred, const uint8_t *pred8,
+ int width, int height, const uint8_t *ref8,
+ int ref_stride) {
+ int i, j;
+ uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ for (i = 0; i < height; i++) {
+ for (j = 0; j < width; j++) {
+ const int tmp = pred[j] + ref[j];
+ comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
+ }
+ comp_pred += width;
+ pred += width;
+ ref += ref_stride;
+ }
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
#define sadMxN(m, n) \
-unsigned int vp9_sad##m##x##n##_c(const uint8_t *src, int src_stride, \
+unsigned int vpx_sad##m##x##n##_c(const uint8_t *src, int src_stride, \
const uint8_t *ref, int ref_stride) { \
return sad(src, src_stride, ref, ref_stride, m, n); \
} \
-unsigned int vp9_sad##m##x##n##_avg_c(const uint8_t *src, int src_stride, \
+unsigned int vpx_sad##m##x##n##_avg_c(const uint8_t *src, int src_stride, \
const uint8_t *ref, int ref_stride, \
const uint8_t *second_pred) { \
uint8_t comp_pred[m * n]; \
- vp9_comp_avg_pred(comp_pred, second_pred, m, n, ref, ref_stride); \
+ avg_pred(comp_pred, second_pred, m, n, ref, ref_stride); \
return sad(src, src_stride, comp_pred, m, m, n); \
}
+// Depending on call sites, ref_array could be passed as **ref_array to avoid
+// taking its address in the call below, and this could be de-duplicated with
+// the 4D macro below.
#define sadMxNxK(m, n, k) \
-void vp9_sad##m##x##n##x##k##_c(const uint8_t *src, int src_stride, \
- const uint8_t *ref, int ref_stride, \
- unsigned int *sads) { \
+void vpx_sad##m##x##n##x##k##_c(const uint8_t *src, int src_stride, \
+ const uint8_t *ref_array, int ref_stride, \
+ uint32_t *sad_array) { \
int i; \
for (i = 0; i < k; ++i) \
- sads[i] = vp9_sad##m##x##n##_c(src, src_stride, &ref[i], ref_stride); \
+    sad_array[i] = vpx_sad##m##x##n##_c(src, src_stride, &ref_array[i], \
+                                        ref_stride); \
}
+// This appears to be equivalent to the above when k == 4 and refs is const
#define sadMxNx4D(m, n) \
-void vp9_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride, \
- const uint8_t *const refs[], int ref_stride, \
- unsigned int *sads) { \
+void vpx_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride, \
+ const uint8_t *const ref_array[], int ref_stride, \
+ uint32_t *sad_array) { \
int i; \
for (i = 0; i < 4; ++i) \
- sads[i] = vp9_sad##m##x##n##_c(src, src_stride, refs[i], ref_stride); \
+    sad_array[i] = vpx_sad##m##x##n##_c(src, src_stride, ref_array[i], \
+                                        ref_stride); \
}
// 64x64
@@ -169,40 +216,40 @@ static INLINE unsigned int highbd_sadb(const uint8_t *a8, int a_stride,
}
#define highbd_sadMxN(m, n) \
-unsigned int vp9_highbd_sad##m##x##n##_c(const uint8_t *src, int src_stride, \
+unsigned int vpx_highbd_sad##m##x##n##_c(const uint8_t *src, int src_stride, \
const uint8_t *ref, int ref_stride) { \
return highbd_sad(src, src_stride, ref, ref_stride, m, n); \
} \
-unsigned int vp9_highbd_sad##m##x##n##_avg_c(const uint8_t *src, \
+unsigned int vpx_highbd_sad##m##x##n##_avg_c(const uint8_t *src, \
int src_stride, \
const uint8_t *ref, \
int ref_stride, \
const uint8_t *second_pred) { \
uint16_t comp_pred[m * n]; \
- vp9_highbd_comp_avg_pred(comp_pred, second_pred, m, n, ref, ref_stride); \
+ highbd_avg_pred(comp_pred, second_pred, m, n, ref, ref_stride); \
return highbd_sadb(src, src_stride, comp_pred, m, m, n); \
}
#define highbd_sadMxNxK(m, n, k) \
-void vp9_highbd_sad##m##x##n##x##k##_c(const uint8_t *src, int src_stride, \
- const uint8_t *ref, int ref_stride, \
- unsigned int *sads) { \
+void vpx_highbd_sad##m##x##n##x##k##_c(const uint8_t *src, int src_stride, \
+ const uint8_t *ref_array, int ref_stride, \
+ uint32_t *sad_array) { \
int i; \
for (i = 0; i < k; ++i) { \
- sads[i] = vp9_highbd_sad##m##x##n##_c(src, src_stride, &ref[i], \
- ref_stride); \
+ sad_array[i] = vpx_highbd_sad##m##x##n##_c(src, src_stride, &ref_array[i], \
+ ref_stride); \
} \
}
#define highbd_sadMxNx4D(m, n) \
-void vp9_highbd_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride, \
- const uint8_t *const refs[], \
- int ref_stride, unsigned int *sads) { \
+void vpx_highbd_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride, \
+ const uint8_t *const ref_array[], \
+ int ref_stride, uint32_t *sad_array) { \
int i; \
for (i = 0; i < 4; ++i) { \
- sads[i] = vp9_highbd_sad##m##x##n##_c(src, src_stride, refs[i], \
- ref_stride); \
- } \
+ sad_array[i] = vpx_highbd_sad##m##x##n##_c(src, src_stride, ref_array[i], \
+ ref_stride); \
+ } \
}
// 64x64
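ROUND_POWER_OF_TWO(v, 1) is a round-to-nearest halving, (v + 1) >> 1, so avg_pred matches the rounding of the SIMD averaging instructions used by the _avg kernels. A quick check (illustration only):

    #include <assert.h>

    #define ROUND_POWER_OF_TWO(value, n) \
        (((value) + (1 << ((n) - 1))) >> (n))

    int main(void) {
      assert(ROUND_POWER_OF_TWO(3 + 4, 1) == 4);  /* half rounds up */
      assert(ROUND_POWER_OF_TWO(2 + 4, 1) == 3);  /* exact average */
      return 0;
    }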
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp.mk b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp.mk
new file mode 100644
index 00000000000..606515d2c19
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp.mk
@@ -0,0 +1,40 @@
+##
+## Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+##
+## Use of this source code is governed by a BSD-style license
+## that can be found in the LICENSE file in the root of the source
+## tree. An additional intellectual property rights grant can be found
+## in the file PATENTS. All contributing project authors may
+## be found in the AUTHORS file in the root of the source tree.
+##
+
+DSP_SRCS-yes += vpx_dsp.mk
+
+ifeq ($(CONFIG_ENCODERS),yes)
+DSP_SRCS-yes += sad.c
+
+DSP_SRCS-$(HAVE_MEDIA) += arm/sad_media$(ASM)
+DSP_SRCS-$(HAVE_NEON) += arm/sad4d_neon.c
+DSP_SRCS-$(HAVE_NEON) += arm/sad_neon.c
+
+DSP_SRCS-$(HAVE_MMX) += x86/sad_mmx.asm
+DSP_SRCS-$(HAVE_SSE2) += x86/sad4d_sse2.asm
+DSP_SRCS-$(HAVE_SSE2) += x86/sad_sse2.asm
+DSP_SRCS-$(HAVE_SSE3) += x86/sad_sse3.asm
+DSP_SRCS-$(HAVE_SSSE3) += x86/sad_ssse3.asm
+DSP_SRCS-$(HAVE_SSE4_1) += x86/sad_sse4.asm
+DSP_SRCS-$(HAVE_AVX2) += x86/sad4d_avx2.c
+DSP_SRCS-$(HAVE_AVX2) += x86/sad_avx2.c
+
+ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
+DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad4d_sse2.asm
+DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad_sse2.asm
+endif # CONFIG_VP9_HIGHBITDEPTH
+endif # CONFIG_ENCODERS
+
+DSP_SRCS-no += $(DSP_SRCS_REMOVE-yes)
+
+DSP_SRCS-yes += vpx_dsp_rtcd.c
+DSP_SRCS-yes += vpx_dsp_rtcd_defs.pl
+
+$(eval $(call rtcd_h_template,vpx_dsp_rtcd,vpx_dsp/vpx_dsp_rtcd_defs.pl))
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp_rtcd.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp_rtcd.c
new file mode 100644
index 00000000000..5fe27b614bd
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp_rtcd.c
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+#include "./vpx_config.h"
+#define RTCD_C
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_ports/vpx_once.h"
+
+void vpx_dsp_rtcd() {
+ once(setup_rtcd_internal);
+}
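Routing through once() guarantees that setup_rtcd_internal, the generated routine that installs the best per-CPU implementation behind each function pointer, runs exactly one time even when several threads call vpx_dsp_rtcd() concurrently. A portable model of the guard (illustration; the real vpx_ports/vpx_once.h picks a platform-specific mechanism):

    #include <pthread.h>

    static pthread_once_t rtcd_once = PTHREAD_ONCE_INIT;

    /* Run func at most once, process-wide. */
    static void once(void (*func)(void)) {
      pthread_once(&rtcd_once, func);
    }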
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl
new file mode 100644
index 00000000000..ebec9ec0660
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -0,0 +1,395 @@
+sub vpx_dsp_forward_decls() {
+print <<EOF
+/*
+ * DSP
+ */
+
+#include "vpx/vpx_integer.h"
+
+EOF
+}
+forward_decls qw/vpx_dsp_forward_decls/;
+
+# Functions which use x86inc.asm instead of x86_abi_support.asm
+if (vpx_config("CONFIG_USE_X86INC") eq "yes") {
+ $mmx_x86inc = 'mmx';
+ $sse_x86inc = 'sse';
+ $sse2_x86inc = 'sse2';
+ $ssse3_x86inc = 'ssse3';
+ $avx_x86inc = 'avx';
+ $avx2_x86inc = 'avx2';
+} else {
+ $mmx_x86inc = $sse_x86inc = $sse2_x86inc = $ssse3_x86inc =
+ $avx_x86inc = $avx2_x86inc = '';
+}
+
+# Functions which are 64 bit only.
+if ($opts{arch} eq "x86_64") {
+ $mmx_x86_64 = 'mmx';
+ $sse2_x86_64 = 'sse2';
+ $ssse3_x86_64 = 'ssse3';
+ $avx_x86_64 = 'avx';
+ $avx2_x86_64 = 'avx2';
+} else {
+ $mmx_x86_64 = $sse2_x86_64 = $ssse3_x86_64 =
+ $avx_x86_64 = $avx2_x86_64 = '';
+}
+
+if (vpx_config("CONFIG_ENCODERS") eq "yes") {
+#
+# Single block SAD
+#
+add_proto qw/unsigned int vpx_sad64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad64x64 avx2 neon/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vpx_sad64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad64x32 avx2/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vpx_sad32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad32x64 avx2/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vpx_sad32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad32x32 avx2 neon/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vpx_sad32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad32x16 avx2/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vpx_sad16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad16x32/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vpx_sad16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad16x16 mmx media neon/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vpx_sad16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad16x8 mmx neon/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vpx_sad8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad8x16 mmx neon/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vpx_sad8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad8x8 mmx neon/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vpx_sad8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad8x4/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vpx_sad4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad4x8/, "$sse_x86inc";
+
+add_proto qw/unsigned int vpx_sad4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+specialize qw/vpx_sad4x4 mmx neon/, "$sse_x86inc";
+
+#
+# Avg
+#
+add_proto qw/unsigned int vpx_sad64x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+specialize qw/vpx_sad64x64_avg avx2/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vpx_sad64x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+specialize qw/vpx_sad64x32_avg avx2/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vpx_sad32x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+specialize qw/vpx_sad32x64_avg avx2/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vpx_sad32x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+specialize qw/vpx_sad32x32_avg avx2/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vpx_sad32x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+specialize qw/vpx_sad32x16_avg avx2/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vpx_sad16x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+specialize qw/vpx_sad16x32_avg/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vpx_sad16x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+specialize qw/vpx_sad16x16_avg/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vpx_sad16x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+specialize qw/vpx_sad16x8_avg/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vpx_sad8x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+specialize qw/vpx_sad8x16_avg/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vpx_sad8x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+specialize qw/vpx_sad8x8_avg/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vpx_sad8x4_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+specialize qw/vpx_sad8x4_avg/, "$sse2_x86inc";
+
+add_proto qw/unsigned int vpx_sad4x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+specialize qw/vpx_sad4x8_avg/, "$sse_x86inc";
+
+add_proto qw/unsigned int vpx_sad4x4_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+specialize qw/vpx_sad4x4_avg/, "$sse_x86inc";
+
+#
+# Multi-block SAD, comparing a source block to N reference blocks 1 pixel apart horizontally
+#
+# Blocks of 3
+add_proto qw/void vpx_sad64x64x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+
+add_proto qw/void vpx_sad32x32x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+
+add_proto qw/void vpx_sad16x16x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad16x16x3 sse3 ssse3/;
+
+add_proto qw/void vpx_sad16x8x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad16x8x3 sse3 ssse3/;
+
+add_proto qw/void vpx_sad8x16x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad8x16x3 sse3/;
+
+add_proto qw/void vpx_sad8x8x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad8x8x3 sse3/;
+
+add_proto qw/void vpx_sad4x4x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad4x4x3 sse3/;
+
+# Blocks of 8
+add_proto qw/void vpx_sad64x64x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+
+add_proto qw/void vpx_sad32x32x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+
+add_proto qw/void vpx_sad16x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad16x16x8 sse4_1/;
+
+add_proto qw/void vpx_sad16x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad16x8x8 sse4_1/;
+
+add_proto qw/void vpx_sad8x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad8x16x8 sse4_1/;
+
+add_proto qw/void vpx_sad8x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad8x8x8 sse4_1/;
+
+add_proto qw/void vpx_sad8x4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+
+add_proto qw/void vpx_sad4x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+
+add_proto qw/void vpx_sad4x4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad4x4x8 sse4_1/;
+
+#
+# Multi-block SAD, comparing a source block to N independent reference blocks
+#
+add_proto qw/void vpx_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad64x64x4d avx2 neon/, "$sse2_x86inc";
+
+add_proto qw/void vpx_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad64x32x4d/, "$sse2_x86inc";
+
+add_proto qw/void vpx_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad32x64x4d/, "$sse2_x86inc";
+
+add_proto qw/void vpx_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad32x32x4d avx2 neon/, "$sse2_x86inc";
+
+add_proto qw/void vpx_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad32x16x4d/, "$sse2_x86inc";
+
+add_proto qw/void vpx_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad16x32x4d/, "$sse2_x86inc";
+
+add_proto qw/void vpx_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad16x16x4d neon/, "$sse2_x86inc";
+
+add_proto qw/void vpx_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad16x8x4d/, "$sse2_x86inc";
+
+add_proto qw/void vpx_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad8x16x4d/, "$sse2_x86inc";
+
+add_proto qw/void vpx_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad8x8x4d/, "$sse2_x86inc";
+
+add_proto qw/void vpx_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad8x4x4d/, "$sse2_x86inc";
+
+add_proto qw/void vpx_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad4x8x4d/, "$sse_x86inc";
+
+add_proto qw/void vpx_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad4x4x4d/, "$sse_x86inc";
+
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+ #
+ # Single block SAD
+ #
+ add_proto qw/unsigned int vpx_highbd_sad64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+ specialize qw/vpx_highbd_sad64x64/, "$sse2_x86inc";
+
+ add_proto qw/unsigned int vpx_highbd_sad64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+ specialize qw/vpx_highbd_sad64x32/, "$sse2_x86inc";
+
+ add_proto qw/unsigned int vpx_highbd_sad32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+ specialize qw/vpx_highbd_sad32x64/, "$sse2_x86inc";
+
+ add_proto qw/unsigned int vpx_highbd_sad32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+ specialize qw/vpx_highbd_sad32x32/, "$sse2_x86inc";
+
+ add_proto qw/unsigned int vpx_highbd_sad32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+ specialize qw/vpx_highbd_sad32x16/, "$sse2_x86inc";
+
+ add_proto qw/unsigned int vpx_highbd_sad16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+ specialize qw/vpx_highbd_sad16x32/, "$sse2_x86inc";
+
+ add_proto qw/unsigned int vpx_highbd_sad16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+ specialize qw/vpx_highbd_sad16x16/, "$sse2_x86inc";
+
+ add_proto qw/unsigned int vpx_highbd_sad16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+ specialize qw/vpx_highbd_sad16x8/, "$sse2_x86inc";
+
+ add_proto qw/unsigned int vpx_highbd_sad8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+ specialize qw/vpx_highbd_sad8x16/, "$sse2_x86inc";
+
+ add_proto qw/unsigned int vpx_highbd_sad8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+ specialize qw/vpx_highbd_sad8x8/, "$sse2_x86inc";
+
+ add_proto qw/unsigned int vpx_highbd_sad8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+ specialize qw/vpx_highbd_sad8x4/, "$sse2_x86inc";
+
+ add_proto qw/unsigned int vpx_highbd_sad4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+ specialize qw/vpx_highbd_sad4x8/;
+
+ add_proto qw/unsigned int vpx_highbd_sad4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+ specialize qw/vpx_highbd_sad4x4/;
+
+ #
+ # Avg
+ #
+ add_proto qw/unsigned int vpx_highbd_sad64x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_sad64x64_avg/, "$sse2_x86inc";
+
+ add_proto qw/unsigned int vpx_highbd_sad64x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_sad64x32_avg/, "$sse2_x86inc";
+
+ add_proto qw/unsigned int vpx_highbd_sad32x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_sad32x64_avg/, "$sse2_x86inc";
+
+ add_proto qw/unsigned int vpx_highbd_sad32x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_sad32x32_avg/, "$sse2_x86inc";
+
+ add_proto qw/unsigned int vpx_highbd_sad32x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_sad32x16_avg/, "$sse2_x86inc";
+
+ add_proto qw/unsigned int vpx_highbd_sad16x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_sad16x32_avg/, "$sse2_x86inc";
+
+ add_proto qw/unsigned int vpx_highbd_sad16x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_sad16x16_avg/, "$sse2_x86inc";
+
+ add_proto qw/unsigned int vpx_highbd_sad16x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_sad16x8_avg/, "$sse2_x86inc";
+
+ add_proto qw/unsigned int vpx_highbd_sad8x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_sad8x16_avg/, "$sse2_x86inc";
+
+ add_proto qw/unsigned int vpx_highbd_sad8x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_sad8x8_avg/, "$sse2_x86inc";
+
+ add_proto qw/unsigned int vpx_highbd_sad8x4_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_sad8x4_avg/, "$sse2_x86inc";
+
+ add_proto qw/unsigned int vpx_highbd_sad4x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_sad4x8_avg/;
+
+ add_proto qw/unsigned int vpx_highbd_sad4x4_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_sad4x4_avg/;
+
+ #
+  # Multi-block SAD, comparing a source block to N reference blocks 1 pixel apart horizontally
+ #
+ # Blocks of 3
+ add_proto qw/void vpx_highbd_sad64x64x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+ specialize qw/vpx_highbd_sad64x64x3/;
+
+ add_proto qw/void vpx_highbd_sad32x32x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+ specialize qw/vpx_highbd_sad32x32x3/;
+
+ add_proto qw/void vpx_highbd_sad16x16x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+ specialize qw/vpx_highbd_sad16x16x3/;
+
+ add_proto qw/void vpx_highbd_sad16x8x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+ specialize qw/vpx_highbd_sad16x8x3/;
+
+ add_proto qw/void vpx_highbd_sad8x16x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+ specialize qw/vpx_highbd_sad8x16x3/;
+
+ add_proto qw/void vpx_highbd_sad8x8x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+ specialize qw/vpx_highbd_sad8x8x3/;
+
+ add_proto qw/void vpx_highbd_sad4x4x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+ specialize qw/vpx_highbd_sad4x4x3/;
+
+ # Blocks of 8
+ add_proto qw/void vpx_highbd_sad64x64x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+ specialize qw/vpx_highbd_sad64x64x8/;
+
+ add_proto qw/void vpx_highbd_sad32x32x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+ specialize qw/vpx_highbd_sad32x32x8/;
+
+ add_proto qw/void vpx_highbd_sad16x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+ specialize qw/vpx_highbd_sad16x16x8/;
+
+ add_proto qw/void vpx_highbd_sad16x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+ specialize qw/vpx_highbd_sad16x8x8/;
+
+ add_proto qw/void vpx_highbd_sad8x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+ specialize qw/vpx_highbd_sad8x16x8/;
+
+ add_proto qw/void vpx_highbd_sad8x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+ specialize qw/vpx_highbd_sad8x8x8/;
+
+ add_proto qw/void vpx_highbd_sad8x4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+ specialize qw/vpx_highbd_sad8x4x8/;
+
+ add_proto qw/void vpx_highbd_sad4x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+ specialize qw/vpx_highbd_sad4x8x8/;
+
+ add_proto qw/void vpx_highbd_sad4x4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+ specialize qw/vpx_highbd_sad4x4x8/;
+
+ #
+  # Multi-block SAD, comparing a source block to N independent reference blocks
+ #
+ add_proto qw/void vpx_highbd_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
+ specialize qw/vpx_highbd_sad64x64x4d/, "$sse2_x86inc";
+
+ add_proto qw/void vpx_highbd_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
+ specialize qw/vpx_highbd_sad64x32x4d/, "$sse2_x86inc";
+
+ add_proto qw/void vpx_highbd_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
+ specialize qw/vpx_highbd_sad32x64x4d/, "$sse2_x86inc";
+
+ add_proto qw/void vpx_highbd_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
+ specialize qw/vpx_highbd_sad32x32x4d/, "$sse2_x86inc";
+
+ add_proto qw/void vpx_highbd_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
+ specialize qw/vpx_highbd_sad32x16x4d/, "$sse2_x86inc";
+
+ add_proto qw/void vpx_highbd_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
+ specialize qw/vpx_highbd_sad16x32x4d/, "$sse2_x86inc";
+
+ add_proto qw/void vpx_highbd_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
+ specialize qw/vpx_highbd_sad16x16x4d/, "$sse2_x86inc";
+
+ add_proto qw/void vpx_highbd_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
+ specialize qw/vpx_highbd_sad16x8x4d/, "$sse2_x86inc";
+
+ add_proto qw/void vpx_highbd_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
+ specialize qw/vpx_highbd_sad8x16x4d/, "$sse2_x86inc";
+
+ add_proto qw/void vpx_highbd_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
+ specialize qw/vpx_highbd_sad8x8x4d/, "$sse2_x86inc";
+
+ add_proto qw/void vpx_highbd_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
+ specialize qw/vpx_highbd_sad8x4x4d/, "$sse2_x86inc";
+
+ add_proto qw/void vpx_highbd_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
+ specialize qw/vpx_highbd_sad4x8x4d/, "$sse2_x86inc";
+
+ add_proto qw/void vpx_highbd_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
+ specialize qw/vpx_highbd_sad4x4x4d/, "$sse2_x86inc";
+
+} # CONFIG_VP9_HIGHBITDEPTH
+} # CONFIG_ENCODERS
+
+1;
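Each add_proto/specialize pair above becomes a function pointer in the generated vpx_dsp_rtcd.h plus an assignment in setup_rtcd_internal that upgrades the C fallback when the CPU allows. A hand-written sketch for vpx_sad64x64 (the probe and flag names follow vpx_ports/x86.h; the exact generated shape is an assumption):

    #include "vpx/vpx_integer.h"
    #include "vpx_ports/x86.h"  /* x86_simd_caps(), HAS_SSE2, HAS_AVX2 */

    unsigned int vpx_sad64x64_c(const uint8_t *src_ptr, int src_stride,
                                const uint8_t *ref_ptr, int ref_stride);
    unsigned int vpx_sad64x64_sse2(const uint8_t *src_ptr, int src_stride,
                                   const uint8_t *ref_ptr, int ref_stride);
    unsigned int vpx_sad64x64_avx2(const uint8_t *src_ptr, int src_stride,
                                   const uint8_t *ref_ptr, int ref_stride);

    unsigned int (*vpx_sad64x64)(const uint8_t *src_ptr, int src_stride,
                                 const uint8_t *ref_ptr, int ref_stride);

    static void setup_rtcd_internal(void) {
      const int flags = x86_simd_caps();  /* runtime CPU feature probe */
      vpx_sad64x64 = vpx_sad64x64_c;      /* always-available fallback */
      if (flags & HAS_SSE2) vpx_sad64x64 = vpx_sad64x64_sse2;
      if (flags & HAS_AVX2) vpx_sad64x64 = vpx_sad64x64_avx2;
    }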
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_sad4d_sse2.asm b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_sad4d_sse2.asm
new file mode 100644
index 00000000000..95cc4372ec3
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_sad4d_sse2.asm
@@ -0,0 +1,289 @@
+;
+; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+%define program_name vpx
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+; HIGH_PROCESS_4x2x4 first, off_{first,second}_{src,ref}, advance_at_end
+%macro HIGH_PROCESS_4x2x4 5-6 0
+ movh m0, [srcq +%2*2]
+%if %1 == 1
+ movu m4, [ref1q+%3*2]
+ movu m5, [ref2q+%3*2]
+ movu m6, [ref3q+%3*2]
+ movu m7, [ref4q+%3*2]
+ movhps m0, [srcq +%4*2]
+ movhps m4, [ref1q+%5*2]
+ movhps m5, [ref2q+%5*2]
+ movhps m6, [ref3q+%5*2]
+ movhps m7, [ref4q+%5*2]
+ mova m3, m0
+ mova m2, m0
+ psubusw m3, m4
+ psubusw m2, m5
+ psubusw m4, m0
+ psubusw m5, m0
+ por m4, m3
+ por m5, m2
+ pmaddwd m4, m1
+ pmaddwd m5, m1
+ mova m3, m0
+ mova m2, m0
+ psubusw m3, m6
+ psubusw m2, m7
+ psubusw m6, m0
+ psubusw m7, m0
+ por m6, m3
+ por m7, m2
+ pmaddwd m6, m1
+ pmaddwd m7, m1
+%else
+ movu m2, [ref1q+%3*2]
+ movhps m0, [srcq +%4*2]
+ movhps m2, [ref1q+%5*2]
+ mova m3, m0
+ psubusw m3, m2
+ psubusw m2, m0
+ por m2, m3
+ pmaddwd m2, m1
+ paddd m4, m2
+
+ movu m2, [ref2q+%3*2]
+ mova m3, m0
+ movhps m2, [ref2q+%5*2]
+ psubusw m3, m2
+ psubusw m2, m0
+ por m2, m3
+ pmaddwd m2, m1
+ paddd m5, m2
+
+ movu m2, [ref3q+%3*2]
+ mova m3, m0
+ movhps m2, [ref3q+%5*2]
+ psubusw m3, m2
+ psubusw m2, m0
+ por m2, m3
+ pmaddwd m2, m1
+ paddd m6, m2
+
+ movu m2, [ref4q+%3*2]
+ mova m3, m0
+ movhps m2, [ref4q+%5*2]
+ psubusw m3, m2
+ psubusw m2, m0
+ por m2, m3
+ pmaddwd m2, m1
+ paddd m7, m2
+%endif
+%if %6 == 1
+ lea srcq, [srcq +src_strideq*4]
+ lea ref1q, [ref1q+ref_strideq*4]
+ lea ref2q, [ref2q+ref_strideq*4]
+ lea ref3q, [ref3q+ref_strideq*4]
+ lea ref4q, [ref4q+ref_strideq*4]
+%endif
+%endmacro
+
+; HIGH_PROCESS_8x2x4 first, off_{first,second}_{src,ref}, advance_at_end
+%macro HIGH_PROCESS_8x2x4 5-6 0
+ ; 1st 8 px
+ mova m0, [srcq +%2*2]
+%if %1 == 1
+ movu m4, [ref1q+%3*2]
+ movu m5, [ref2q+%3*2]
+ movu m6, [ref3q+%3*2]
+ movu m7, [ref4q+%3*2]
+ mova m3, m0
+ mova m2, m0
+ psubusw m3, m4
+ psubusw m2, m5
+ psubusw m4, m0
+ psubusw m5, m0
+ por m4, m3
+ por m5, m2
+ pmaddwd m4, m1
+ pmaddwd m5, m1
+ mova m3, m0
+ mova m2, m0
+ psubusw m3, m6
+ psubusw m2, m7
+ psubusw m6, m0
+ psubusw m7, m0
+ por m6, m3
+ por m7, m2
+ pmaddwd m6, m1
+ pmaddwd m7, m1
+%else
+ mova m3, m0
+ movu m2, [ref1q+%3*2]
+ psubusw m3, m2
+ psubusw m2, m0
+ por m2, m3
+ mova m3, m0
+ pmaddwd m2, m1
+ paddd m4, m2
+ movu m2, [ref2q+%3*2]
+ psubusw m3, m2
+ psubusw m2, m0
+ por m2, m3
+ mova m3, m0
+ pmaddwd m2, m1
+ paddd m5, m2
+ movu m2, [ref3q+%3*2]
+ psubusw m3, m2
+ psubusw m2, m0
+ por m2, m3
+ mova m3, m0
+ pmaddwd m2, m1
+ paddd m6, m2
+ movu m2, [ref4q+%3*2]
+ psubusw m3, m2
+ psubusw m2, m0
+ por m2, m3
+ pmaddwd m2, m1
+ paddd m7, m2
+%endif
+
+ ; 2nd 8 px
+ mova m0, [srcq +(%4)*2]
+ mova m3, m0
+ movu m2, [ref1q+(%5)*2]
+ psubusw m3, m2
+ psubusw m2, m0
+ por m2, m3
+ mova m3, m0
+ pmaddwd m2, m1
+ paddd m4, m2
+ movu m2, [ref2q+(%5)*2]
+ psubusw m3, m2
+ psubusw m2, m0
+ por m2, m3
+ mova m3, m0
+ pmaddwd m2, m1
+ paddd m5, m2
+ movu m2, [ref3q+(%5)*2]
+ psubusw m3, m2
+ psubusw m2, m0
+ por m2, m3
+ mova m3, m0
+ pmaddwd m2, m1
+ paddd m6, m2
+ movu m2, [ref4q+(%5)*2]
+ psubusw m3, m2
+ psubusw m2, m0
+%if %6 == 1
+ lea srcq, [srcq +src_strideq*4]
+ lea ref1q, [ref1q+ref_strideq*4]
+ lea ref2q, [ref2q+ref_strideq*4]
+ lea ref3q, [ref3q+ref_strideq*4]
+ lea ref4q, [ref4q+ref_strideq*4]
+%endif
+ por m2, m3
+ pmaddwd m2, m1
+ paddd m7, m2
+%endmacro
+
+; HIGH_PROCESS_16x2x4 first, off_{first,second}_{src,ref}, advance_at_end
+%macro HIGH_PROCESS_16x2x4 5-6 0
+ HIGH_PROCESS_8x2x4 %1, %2, %3, (%2 + 8), (%3 + 8)
+ HIGH_PROCESS_8x2x4 0, %4, %5, (%4 + 8), (%5 + 8), %6
+%endmacro
+
+; HIGH_PROCESS_32x2x4 first, off_{first,second}_{src,ref}, advance_at_end
+%macro HIGH_PROCESS_32x2x4 5-6 0
+ HIGH_PROCESS_16x2x4 %1, %2, %3, (%2 + 16), (%3 + 16)
+ HIGH_PROCESS_16x2x4 0, %4, %5, (%4 + 16), (%5 + 16), %6
+%endmacro
+
+; HIGH_PROCESS_64x2x4 first, off_{first,second}_{src,ref}, advance_at_end
+%macro HIGH_PROCESS_64x2x4 5-6 0
+ HIGH_PROCESS_32x2x4 %1, %2, %3, (%2 + 32), (%3 + 32)
+ HIGH_PROCESS_32x2x4 0, %4, %5, (%4 + 32), (%5 + 32), %6
+%endmacro
+
+; void vpx_highbd_sadNxMx4d_sse2(uint8_t *src, int src_stride,
+;                                uint8_t *ref[4], int ref_stride,
+;                                uint32_t res[4]);
+; where NxM is any of the block sizes instantiated below, from 64x64 down to 4x4
+%macro HIGH_SADNXN4D 2
+%if UNIX64
+cglobal highbd_sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \
+ res, ref2, ref3, ref4
+%else
+cglobal highbd_sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
+ ref2, ref3, ref4
+%endif
+
+; set m1 to a vector of word 1s; pmaddwd against m1 pair-sums u16 diffs to u32
+ push srcq
+ mov srcd, 0x00010001
+ movd m1, srcd
+ pshufd m1, m1, 0x0
+ pop srcq
+
+ movsxdifnidn src_strideq, src_strided
+ movsxdifnidn ref_strideq, ref_strided
+ mov ref2q, [ref1q+gprsize*1]
+ mov ref3q, [ref1q+gprsize*2]
+ mov ref4q, [ref1q+gprsize*3]
+ mov ref1q, [ref1q+gprsize*0]
+
+; convert byte pointers to short pointers
+ shl srcq, 1
+ shl ref2q, 1
+ shl ref3q, 1
+ shl ref4q, 1
+ shl ref1q, 1
+
+ HIGH_PROCESS_%1x2x4 1, 0, 0, src_strideq, ref_strideq, 1
+%rep (%2-4)/2
+ HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 1
+%endrep
+ HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0
+  ; N.B. HIGH_PROCESS outputs dwords (32 bits),
+  ; so in high bit depth even the smallest width (4) needs 128 bits, i.e. an XMM register
+ movhlps m0, m4
+ movhlps m1, m5
+ movhlps m2, m6
+ movhlps m3, m7
+ paddd m4, m0
+ paddd m5, m1
+ paddd m6, m2
+ paddd m7, m3
+ punpckldq m4, m5
+ punpckldq m6, m7
+ movhlps m0, m4
+ movhlps m1, m6
+ paddd m4, m0
+ paddd m6, m1
+ punpcklqdq m4, m6
+ movifnidn r4, r4mp
+ movu [r4], m4
+ RET
+%endmacro
+
+
+INIT_XMM sse2
+HIGH_SADNXN4D 64, 64
+HIGH_SADNXN4D 64, 32
+HIGH_SADNXN4D 32, 64
+HIGH_SADNXN4D 32, 32
+HIGH_SADNXN4D 32, 16
+HIGH_SADNXN4D 16, 32
+HIGH_SADNXN4D 16, 16
+HIGH_SADNXN4D 16, 8
+HIGH_SADNXN4D 8, 16
+HIGH_SADNXN4D 8, 8
+HIGH_SADNXN4D 8, 4
+HIGH_SADNXN4D 4, 8
+HIGH_SADNXN4D 4, 4
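The recurring psubusw/psubusw/por triple is a branch-free |a - b| for unsigned words: each saturating subtraction zeroes the wrong-sign direction and the OR keeps the survivor; pmaddwd against the vector of word 1s in m1 then widens and pair-sums the 16-bit results into dwords. A scalar model of one lane (illustration only):

    #include "vpx/vpx_integer.h"

    static uint16_t abs_diff_u16(uint16_t a, uint16_t b) {
      const uint16_t d0 = (uint16_t)(a > b ? a - b : 0);  /* psubusw m3, m2 */
      const uint16_t d1 = (uint16_t)(b > a ? b - a : 0);  /* psubusw m2, m0 */
      return (uint16_t)(d0 | d1);                         /* por m2, m3 */
    }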
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_sad_sse2.asm b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_sad_sse2.asm
new file mode 100644
index 00000000000..4d422dde3af
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_sad_sse2.asm
@@ -0,0 +1,365 @@
+;
+; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+%define program_name vpx
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+%macro HIGH_SAD_FN 4
+%if %4 == 0
+%if %3 == 5
+cglobal highbd_sad%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, n_rows
+%else ; %3 == 7
+cglobal highbd_sad%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, \
+ src_stride3, ref_stride3, n_rows
+%endif ; %3 == 5/7
+%else ; avg
+%if %3 == 5
+cglobal highbd_sad%1x%2_avg, 5, 1 + %3, 7, src, src_stride, ref, ref_stride, \
+ second_pred, n_rows
+%else ; %3 == 7
+cglobal highbd_sad%1x%2_avg, 5, ARCH_X86_64 + %3, 7, src, src_stride, \
+ ref, ref_stride, \
+ second_pred, \
+ src_stride3, ref_stride3
+%if ARCH_X86_64
+%define n_rowsd r7d
+%else ; x86-32
+%define n_rowsd dword r0m
+%endif ; x86-32/64
+%endif ; %3 == 5/7
+%endif ; avg/sad
+ movsxdifnidn src_strideq, src_strided
+ movsxdifnidn ref_strideq, ref_strided
+%if %3 == 7
+ lea src_stride3q, [src_strideq*3]
+ lea ref_stride3q, [ref_strideq*3]
+%endif ; %3 == 7
+; convert src, ref & second_pred to short ptrs (from byte ptrs)
+ shl srcq, 1
+ shl refq, 1
+%if %4 == 1
+ shl second_predq, 1
+%endif
+%endmacro
+
+; unsigned int vpx_highbd_sad64x{32,64}_sse2(uint8_t *src, int src_stride,
+;                                            uint8_t *ref, int ref_stride);
+%macro HIGH_SAD64XN 1-2 0
+ HIGH_SAD_FN 64, %1, 5, %2
+ mov n_rowsd, %1
+ pxor m0, m0
+ pxor m6, m6
+
+.loop:
+ ; first half of each row
+ movu m1, [refq]
+ movu m2, [refq+16]
+ movu m3, [refq+32]
+ movu m4, [refq+48]
+%if %2 == 1
+ pavgw m1, [second_predq+mmsize*0]
+ pavgw m2, [second_predq+mmsize*1]
+ pavgw m3, [second_predq+mmsize*2]
+ pavgw m4, [second_predq+mmsize*3]
+ lea second_predq, [second_predq+mmsize*4]
+%endif
+ mova m5, [srcq]
+ psubusw m5, m1
+ psubusw m1, [srcq]
+ por m1, m5
+ mova m5, [srcq+16]
+ psubusw m5, m2
+ psubusw m2, [srcq+16]
+ por m2, m5
+ mova m5, [srcq+32]
+ psubusw m5, m3
+ psubusw m3, [srcq+32]
+ por m3, m5
+ mova m5, [srcq+48]
+ psubusw m5, m4
+ psubusw m4, [srcq+48]
+ por m4, m5
+ paddw m1, m2
+ paddw m3, m4
+ movhlps m2, m1
+ movhlps m4, m3
+ paddw m1, m2
+ paddw m3, m4
+ punpcklwd m1, m6
+ punpcklwd m3, m6
+ paddd m0, m1
+ paddd m0, m3
+ ; second half of each row
+ movu m1, [refq+64]
+ movu m2, [refq+80]
+ movu m3, [refq+96]
+ movu m4, [refq+112]
+%if %2 == 1
+ pavgw m1, [second_predq+mmsize*0]
+ pavgw m2, [second_predq+mmsize*1]
+ pavgw m3, [second_predq+mmsize*2]
+ pavgw m4, [second_predq+mmsize*3]
+ lea second_predq, [second_predq+mmsize*4]
+%endif
+ mova m5, [srcq+64]
+ psubusw m5, m1
+ psubusw m1, [srcq+64]
+ por m1, m5
+ mova m5, [srcq+80]
+ psubusw m5, m2
+ psubusw m2, [srcq+80]
+ por m2, m5
+ mova m5, [srcq+96]
+ psubusw m5, m3
+ psubusw m3, [srcq+96]
+ por m3, m5
+ mova m5, [srcq+112]
+ psubusw m5, m4
+ psubusw m4, [srcq+112]
+ por m4, m5
+ paddw m1, m2
+ paddw m3, m4
+ movhlps m2, m1
+ movhlps m4, m3
+ paddw m1, m2
+ paddw m3, m4
+ punpcklwd m1, m6
+ punpcklwd m3, m6
+ lea refq, [refq+ref_strideq*2]
+ paddd m0, m1
+ lea srcq, [srcq+src_strideq*2]
+ paddd m0, m3
+
+ dec n_rowsd
+ jg .loop
+
+ movhlps m1, m0
+ paddd m0, m1
+ punpckldq m0, m6
+ movhlps m1, m0
+ paddd m0, m1
+ movd eax, m0
+ RET
+%endmacro
+
+INIT_XMM sse2
+HIGH_SAD64XN 64 ; highbd_sad64x64_sse2
+HIGH_SAD64XN 32 ; highbd_sad64x32_sse2
+HIGH_SAD64XN 64, 1 ; highbd_sad64x64_avg_sse2
+HIGH_SAD64XN 32, 1 ; highbd_sad64x32_avg_sse2
+
+
+; unsigned int vpx_highbd_sad32x{16,32,64}_sse2(uint8_t *src, int src_stride,
+; uint8_t *ref, int ref_stride);
+%macro HIGH_SAD32XN 1-2 0
+ HIGH_SAD_FN 32, %1, 5, %2
+ mov n_rowsd, %1
+ pxor m0, m0
+ pxor m6, m6
+
+.loop:
+ movu m1, [refq]
+ movu m2, [refq+16]
+ movu m3, [refq+32]
+ movu m4, [refq+48]
+%if %2 == 1
+ pavgw m1, [second_predq+mmsize*0]
+ pavgw m2, [second_predq+mmsize*1]
+ pavgw m3, [second_predq+mmsize*2]
+ pavgw m4, [second_predq+mmsize*3]
+ lea second_predq, [second_predq+mmsize*4]
+%endif
+ mova m5, [srcq]
+ psubusw m5, m1
+ psubusw m1, [srcq]
+ por m1, m5
+ mova m5, [srcq+16]
+ psubusw m5, m2
+ psubusw m2, [srcq+16]
+ por m2, m5
+ mova m5, [srcq+32]
+ psubusw m5, m3
+ psubusw m3, [srcq+32]
+ por m3, m5
+ mova m5, [srcq+48]
+ psubusw m5, m4
+ psubusw m4, [srcq+48]
+ por m4, m5
+ paddw m1, m2
+ paddw m3, m4
+ movhlps m2, m1
+ movhlps m4, m3
+ paddw m1, m2
+ paddw m3, m4
+ punpcklwd m1, m6
+ punpcklwd m3, m6
+ lea refq, [refq+ref_strideq*2]
+ paddd m0, m1
+ lea srcq, [srcq+src_strideq*2]
+ paddd m0, m3
+ dec n_rowsd
+ jg .loop
+
+ movhlps m1, m0
+ paddd m0, m1
+ punpckldq m0, m6
+ movhlps m1, m0
+ paddd m0, m1
+ movd eax, m0
+ RET
+%endmacro
+
+INIT_XMM sse2
+HIGH_SAD32XN 64 ; highbd_sad32x64_sse2
+HIGH_SAD32XN 32 ; highbd_sad32x32_sse2
+HIGH_SAD32XN 16 ; highbd_sad32x16_sse2
+HIGH_SAD32XN 64, 1 ; highbd_sad32x64_avg_sse2
+HIGH_SAD32XN 32, 1 ; highbd_sad32x32_avg_sse2
+HIGH_SAD32XN 16, 1 ; highbd_sad32x16_avg_sse2
+
+; unsigned int vpx_highbd_sad16x{8,16,32}_sse2(uint8_t *src, int src_stride,
+; uint8_t *ref, int ref_stride);
+%macro HIGH_SAD16XN 1-2 0
+ HIGH_SAD_FN 16, %1, 5, %2
+ mov n_rowsd, %1/2
+ pxor m0, m0
+ pxor m6, m6
+
+.loop:
+ movu m1, [refq]
+ movu m2, [refq+16]
+ movu m3, [refq+ref_strideq*2]
+ movu m4, [refq+ref_strideq*2+16]
+%if %2 == 1
+ pavgw m1, [second_predq+mmsize*0]
+ pavgw m2, [second_predq+16]
+ pavgw m3, [second_predq+mmsize*2]
+ pavgw m4, [second_predq+mmsize*2+16]
+ lea second_predq, [second_predq+mmsize*4]
+%endif
+ mova m5, [srcq]
+ psubusw m5, m1
+ psubusw m1, [srcq]
+ por m1, m5
+ mova m5, [srcq+16]
+ psubusw m5, m2
+ psubusw m2, [srcq+16]
+ por m2, m5
+ mova m5, [srcq+src_strideq*2]
+ psubusw m5, m3
+ psubusw m3, [srcq+src_strideq*2]
+ por m3, m5
+ mova m5, [srcq+src_strideq*2+16]
+ psubusw m5, m4
+ psubusw m4, [srcq+src_strideq*2+16]
+ por m4, m5
+ paddw m1, m2
+ paddw m3, m4
+ movhlps m2, m1
+ movhlps m4, m3
+ paddw m1, m2
+ paddw m3, m4
+ punpcklwd m1, m6
+ punpcklwd m3, m6
+ lea refq, [refq+ref_strideq*4]
+ paddd m0, m1
+ lea srcq, [srcq+src_strideq*4]
+ paddd m0, m3
+ dec n_rowsd
+ jg .loop
+
+ movhlps m1, m0
+ paddd m0, m1
+ punpckldq m0, m6
+ movhlps m1, m0
+ paddd m0, m1
+ movd eax, m0
+ RET
+%endmacro
+
+INIT_XMM sse2
+HIGH_SAD16XN 32 ; highbd_sad16x32_sse2
+HIGH_SAD16XN 16 ; highbd_sad16x16_sse2
+HIGH_SAD16XN 8 ; highbd_sad16x8_sse2
+HIGH_SAD16XN 32, 1 ; highbd_sad16x32_avg_sse2
+HIGH_SAD16XN 16, 1 ; highbd_sad16x16_avg_sse2
+HIGH_SAD16XN 8, 1 ; highbd_sad16x8_avg_sse2
+
+
+; unsigned int vpx_highbd_sad8x{4,8,16}_sse2(uint8_t *src, int src_stride,
+; uint8_t *ref, int ref_stride);
+%macro HIGH_SAD8XN 1-2 0
+ HIGH_SAD_FN 8, %1, 7, %2
+ mov n_rowsd, %1/4
+ pxor m0, m0
+ pxor m6, m6
+
+.loop:
+ movu m1, [refq]
+ movu m2, [refq+ref_strideq*2]
+ movu m3, [refq+ref_strideq*4]
+ movu m4, [refq+ref_stride3q*2]
+%if %2 == 1
+ pavgw m1, [second_predq+mmsize*0]
+ pavgw m2, [second_predq+mmsize*1]
+ pavgw m3, [second_predq+mmsize*2]
+ pavgw m4, [second_predq+mmsize*3]
+ lea second_predq, [second_predq+mmsize*4]
+%endif
+ mova m5, [srcq]
+ psubusw m5, m1
+ psubusw m1, [srcq]
+ por m1, m5
+ mova m5, [srcq+src_strideq*2]
+ psubusw m5, m2
+ psubusw m2, [srcq+src_strideq*2]
+ por m2, m5
+ mova m5, [srcq+src_strideq*4]
+ psubusw m5, m3
+ psubusw m3, [srcq+src_strideq*4]
+ por m3, m5
+ mova m5, [srcq+src_stride3q*2]
+ psubusw m5, m4
+ psubusw m4, [srcq+src_stride3q*2]
+ por m4, m5
+ paddw m1, m2
+ paddw m3, m4
+ movhlps m2, m1
+ movhlps m4, m3
+ paddw m1, m2
+ paddw m3, m4
+ punpcklwd m1, m6
+ punpcklwd m3, m6
+ lea refq, [refq+ref_strideq*8]
+ paddd m0, m1
+ lea srcq, [srcq+src_strideq*8]
+ paddd m0, m3
+ dec n_rowsd
+ jg .loop
+
+ movhlps m1, m0
+ paddd m0, m1
+ punpckldq m0, m6
+ movhlps m1, m0
+ paddd m0, m1
+ movd eax, m0
+ RET
+%endmacro
+
+INIT_XMM sse2
+HIGH_SAD8XN 16 ; highbd_sad8x16_sse2
+HIGH_SAD8XN 8 ; highbd_sad8x8_sse2
+HIGH_SAD8XN 4 ; highbd_sad8x4_sse2
+HIGH_SAD8XN 16, 1 ; highbd_sad8x16_avg_sse2
+HIGH_SAD8XN 8, 1 ; highbd_sad8x8_avg_sse2
+HIGH_SAD8XN 4, 1 ; highbd_sad8x4_avg_sse2
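
In all of the _avg variants above, the second_pred paths first average the reference against a second predictor with pavgw, giving the SAD used for compound prediction. A scalar equivalent of the lane operation (a sketch, not the kernel):

    #include <stdint.h>

    /* pavgw: rounding average of two 16-bit lanes, applied to ref and
     * second_pred before the absolute difference is taken. */
    static uint16_t avg_round_u16(uint16_t a, uint16_t b) {
      return (uint16_t)(((uint32_t)a + b + 1) >> 1);
    }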
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_sad4d_intrin_avx2.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/sad4d_avx2.c
index 1feed62566b..4128f2ac37c 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_sad4d_intrin_avx2.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/sad4d_avx2.c
@@ -10,11 +10,11 @@
#include <immintrin.h> // AVX2
#include "vpx/vpx_integer.h"
-void vp9_sad32x32x4d_avx2(uint8_t *src,
+void vpx_sad32x32x4d_avx2(uint8_t *src,
int src_stride,
uint8_t *ref[4],
int ref_stride,
- unsigned int res[4]) {
+ uint32_t res[4]) {
__m256i src_reg, ref0_reg, ref1_reg, ref2_reg, ref3_reg;
__m256i sum_ref0, sum_ref1, sum_ref2, sum_ref3;
__m256i sum_mlow, sum_mhigh;
@@ -80,11 +80,11 @@ void vp9_sad32x32x4d_avx2(uint8_t *src,
}
}
-void vp9_sad64x64x4d_avx2(uint8_t *src,
+void vpx_sad64x64x4d_avx2(uint8_t *src,
int src_stride,
uint8_t *ref[4],
int ref_stride,
- unsigned int res[4]) {
+ uint32_t res[4]) {
__m256i src_reg, srcnext_reg, ref0_reg, ref0next_reg;
__m256i ref1_reg, ref1next_reg, ref2_reg, ref2next_reg;
__m256i ref3_reg, ref3next_reg;
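
The x4d kernels renamed in this file score one source block against four candidate references in a single pass, writing four results. A plain-C statement of that contract as implied by the prototype above (the loop structure is an illustrative assumption, not the AVX2 implementation):

    #include <stdint.h>
    #include <stdlib.h>

    static void sad32x32x4d_ref(const uint8_t *src, int src_stride,
                                const uint8_t *ref[4], int ref_stride,
                                uint32_t res[4]) {
      for (int i = 0; i < 4; ++i) {
        const uint8_t *s = src, *r = ref[i];
        uint32_t sad = 0;
        for (int y = 0; y < 32; ++y) {
          for (int x = 0; x < 32; ++x) sad += abs(s[x] - r[x]);
          s += src_stride;
          r += ref_stride;
        }
        res[i] = sad;
      }
    }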
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_sad4d_sse2.asm b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/sad4d_sse2.asm
index b4936281f62..0f7fb93d47c 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_sad4d_sse2.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/sad4d_sse2.asm
@@ -8,6 +8,8 @@
; be found in the AUTHORS file in the root of the source tree.
;
+%define program_name vpx
+
%include "third_party/x86inc/x86inc.asm"
SECTION .text
@@ -167,9 +169,9 @@ SECTION .text
PROCESS_32x2x4 0, %4, %5, %4 + 32, %5 + 32, %6
%endmacro
-; void vp9_sadNxNx4d_sse2(uint8_t *src, int src_stride,
+; void vpx_sadNxNx4d_sse2(uint8_t *src, int src_stride,
; uint8_t *ref[4], int ref_stride,
-; unsigned int res[4]);
+; uint32_t res[4]);
; where NxN = 64x64, 32x32, 16x16, 16x8, 8x16 or 8x8
%macro SADNXN4D 2
%if UNIX64
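
The %define program_name vpx added above is what drives the symbol renames in this file: x86inc.asm's cglobal prefixes every declared function with program_name, so cglobal-declared kernels now emit vpx_* symbols. A rough C preprocessor analogy (helper names hypothetical, not from libvpx):

    #define program_name vpx
    /* Two-level paste so program_name expands before concatenation. */
    #define PASTE2(a, b) a##_##b
    #define PASTE(a, b) PASTE2(a, b)
    #define MAKE_SYM(name) PASTE(program_name, name)
    /* MAKE_SYM(sad64x64_sse2) expands to vpx_sad64x64_sse2 */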
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_sad_intrin_avx2.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/sad_avx2.c
index 113193070e1..78536a47218 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_sad_intrin_avx2.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/sad_avx2.c
@@ -11,7 +11,7 @@
#include "vpx_ports/mem.h"
#define FSAD64_H(h) \
-unsigned int vp9_sad64x##h##_avx2(const uint8_t *src_ptr, \
+unsigned int vpx_sad64x##h##_avx2(const uint8_t *src_ptr, \
int src_stride, \
const uint8_t *ref_ptr, \
int ref_stride) { \
@@ -40,7 +40,7 @@ unsigned int vp9_sad64x##h##_avx2(const uint8_t *src_ptr, \
}
#define FSAD32_H(h) \
-unsigned int vp9_sad32x##h##_avx2(const uint8_t *src_ptr, \
+unsigned int vpx_sad32x##h##_avx2(const uint8_t *src_ptr, \
int src_stride, \
const uint8_t *ref_ptr, \
int ref_stride) { \
@@ -89,7 +89,7 @@ FSAD32;
#undef FSAD32_H
#define FSADAVG64_H(h) \
-unsigned int vp9_sad64x##h##_avg_avx2(const uint8_t *src_ptr, \
+unsigned int vpx_sad64x##h##_avg_avx2(const uint8_t *src_ptr, \
int src_stride, \
const uint8_t *ref_ptr, \
int ref_stride, \
@@ -124,7 +124,7 @@ unsigned int vp9_sad64x##h##_avg_avx2(const uint8_t *src_ptr, \
}
#define FSADAVG32_H(h) \
-unsigned int vp9_sad32x##h##_avg_avx2(const uint8_t *src_ptr, \
+unsigned int vpx_sad32x##h##_avg_avx2(const uint8_t *src_ptr, \
int src_stride, \
const uint8_t *ref_ptr, \
int ref_stride, \
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/x86/sad_mmx.asm b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/sad_mmx.asm
index 592112fa91d..9968992bd13 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/x86/sad_mmx.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/sad_mmx.asm
@@ -11,18 +11,18 @@
%include "vpx_ports/x86_abi_support.asm"
-global sym(vp8_sad16x16_mmx) PRIVATE
-global sym(vp8_sad8x16_mmx) PRIVATE
-global sym(vp8_sad8x8_mmx) PRIVATE
-global sym(vp8_sad4x4_mmx) PRIVATE
-global sym(vp8_sad16x8_mmx) PRIVATE
+global sym(vpx_sad16x16_mmx) PRIVATE
+global sym(vpx_sad8x16_mmx) PRIVATE
+global sym(vpx_sad8x8_mmx) PRIVATE
+global sym(vpx_sad4x4_mmx) PRIVATE
+global sym(vpx_sad16x8_mmx) PRIVATE
-;unsigned int vp8_sad16x16_mmx(
+;unsigned int vpx_sad16x16_mmx(
; unsigned char *src_ptr,
; int src_stride,
; unsigned char *ref_ptr,
; int ref_stride)
-sym(vp8_sad16x16_mmx):
+sym(vpx_sad16x16_mmx):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 4
@@ -109,12 +109,12 @@ sym(vp8_sad16x16_mmx):
ret
-;unsigned int vp8_sad8x16_mmx(
+;unsigned int vpx_sad8x16_mmx(
; unsigned char *src_ptr,
; int src_stride,
; unsigned char *ref_ptr,
; int ref_stride)
-sym(vp8_sad8x16_mmx):
+sym(vpx_sad8x16_mmx):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 4
@@ -181,12 +181,12 @@ sym(vp8_sad8x16_mmx):
ret
-;unsigned int vp8_sad8x8_mmx(
+;unsigned int vpx_sad8x8_mmx(
; unsigned char *src_ptr,
; int src_stride,
; unsigned char *ref_ptr,
; int ref_stride)
-sym(vp8_sad8x8_mmx):
+sym(vpx_sad8x8_mmx):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 4
@@ -251,12 +251,12 @@ sym(vp8_sad8x8_mmx):
ret
-;unsigned int vp8_sad4x4_mmx(
+;unsigned int vpx_sad4x4_mmx(
; unsigned char *src_ptr,
; int src_stride,
; unsigned char *ref_ptr,
; int ref_stride)
-sym(vp8_sad4x4_mmx):
+sym(vpx_sad4x4_mmx):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 4
@@ -340,12 +340,12 @@ sym(vp8_sad4x4_mmx):
ret
-;unsigned int vp8_sad16x8_mmx(
+;unsigned int vpx_sad16x8_mmx(
; unsigned char *src_ptr,
; int src_stride,
; unsigned char *ref_ptr,
; int ref_stride)
-sym(vp8_sad16x8_mmx):
+sym(vpx_sad16x8_mmx):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 4
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_sad_sse2.asm b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/sad_sse2.asm
index c4c5c54f0e4..c6a829dc21e 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_sad_sse2.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/sad_sse2.asm
@@ -8,6 +8,8 @@
; be found in the AUTHORS file in the root of the source tree.
;
+%define program_name vpx
+
%include "third_party/x86inc/x86inc.asm"
SECTION .text
@@ -44,7 +46,7 @@ cglobal sad%1x%2_avg, 5, ARCH_X86_64 + %3, 5, src, src_stride, \
%endif ; %3 == 7
%endmacro
-; unsigned int vp9_sad64x64_sse2(uint8_t *src, int src_stride,
+; unsigned int vpx_sad64x{32,64}_sse2(uint8_t *src, int src_stride,
; uint8_t *ref, int ref_stride);
%macro SAD64XN 1-2 0
SAD_FN 64, %1, 5, %2
@@ -87,7 +89,7 @@ SAD64XN 32 ; sad64x32_sse2
SAD64XN 64, 1 ; sad64x64_avg_sse2
SAD64XN 32, 1 ; sad64x32_avg_sse2
-; unsigned int vp9_sad32x32_sse2(uint8_t *src, int src_stride,
+; unsigned int vpx_sad32x{16,32,64}_sse2(uint8_t *src, int src_stride,
; uint8_t *ref, int ref_stride);
%macro SAD32XN 1-2 0
SAD_FN 32, %1, 5, %2
@@ -132,7 +134,7 @@ SAD32XN 64, 1 ; sad32x64_avg_sse2
SAD32XN 32, 1 ; sad32x32_avg_sse2
SAD32XN 16, 1 ; sad32x16_avg_sse2
-; unsigned int vp9_sad16x{8,16}_sse2(uint8_t *src, int src_stride,
+; unsigned int vpx_sad16x{8,16,32}_sse2(uint8_t *src, int src_stride,
; uint8_t *ref, int ref_stride);
%macro SAD16XN 1-2 0
SAD_FN 16, %1, 7, %2
@@ -178,7 +180,7 @@ SAD16XN 32, 1 ; sad16x32_avg_sse2
SAD16XN 16, 1 ; sad16x16_avg_sse2
SAD16XN 8, 1 ; sad16x8_avg_sse2
-; unsigned int vp9_sad8x{8,16}_sse2(uint8_t *src, int src_stride,
+; unsigned int vpx_sad8x{4,8,16}_sse2(uint8_t *src, int src_stride,
; uint8_t *ref, int ref_stride);
%macro SAD8XN 1-2 0
SAD_FN 8, %1, 7, %2
@@ -222,7 +224,7 @@ SAD8XN 16, 1 ; sad8x16_avg_sse2
SAD8XN 8, 1 ; sad8x8_avg_sse2
SAD8XN 4, 1 ; sad8x4_avg_sse2
-; unsigned int vp9_sad4x{4, 8}_sse(uint8_t *src, int src_stride,
+; unsigned int vpx_sad4x{4,8}_sse(uint8_t *src, int src_stride,
; uint8_t *ref, int ref_stride);
%macro SAD4XN 1-2 0
SAD_FN 4, %1, 7, %2
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_sad_sse3.asm b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/sad_sse3.asm
index 2b90a5d5478..18279bdb9de 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_sad_sse3.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/sad_sse3.asm
@@ -19,7 +19,6 @@
%define end_ptr rcx
%define ret_var rbx
%define result_ptr arg(4)
- %define max_err arg(4)
%define height dword ptr arg(4)
push rbp
mov rbp, rsp
@@ -42,7 +41,6 @@
%define end_ptr r10
%define ret_var r11
%define result_ptr [rsp+xmm_stack_space+8+4*8]
- %define max_err [rsp+xmm_stack_space+8+4*8]
%define height dword ptr [rsp+xmm_stack_space+8+4*8]
%else
%define src_ptr rdi
@@ -52,7 +50,6 @@
%define end_ptr r9
%define ret_var r10
%define result_ptr r8
- %define max_err r8
%define height r8
%endif
%endif
@@ -67,7 +64,6 @@
%define end_ptr
%define ret_var
%define result_ptr
- %define max_err
%define height
%if ABI_IS_32BIT
@@ -169,14 +165,14 @@
paddw mm7, mm3
%endmacro
-;void int vp9_sad16x16x3_sse3(
+;void vpx_sad16x16x3_sse3(
; unsigned char *src_ptr,
; int src_stride,
; unsigned char *ref_ptr,
; int ref_stride,
; int *results)
-global sym(vp9_sad16x16x3_sse3) PRIVATE
-sym(vp9_sad16x16x3_sse3):
+global sym(vpx_sad16x16x3_sse3) PRIVATE
+sym(vpx_sad16x16x3_sse3):
STACK_FRAME_CREATE_X3
@@ -211,14 +207,14 @@ sym(vp9_sad16x16x3_sse3):
STACK_FRAME_DESTROY_X3
-;void int vp9_sad16x8x3_sse3(
+;void vpx_sad16x8x3_sse3(
; unsigned char *src_ptr,
; int src_stride,
; unsigned char *ref_ptr,
; int ref_stride,
; int *results)
-global sym(vp9_sad16x8x3_sse3) PRIVATE
-sym(vp9_sad16x8x3_sse3):
+global sym(vpx_sad16x8x3_sse3) PRIVATE
+sym(vpx_sad16x8x3_sse3):
STACK_FRAME_CREATE_X3
@@ -249,14 +245,14 @@ sym(vp9_sad16x8x3_sse3):
STACK_FRAME_DESTROY_X3
-;void int vp9_sad8x16x3_sse3(
+;void vpx_sad8x16x3_sse3(
; unsigned char *src_ptr,
; int src_stride,
; unsigned char *ref_ptr,
; int ref_stride,
; int *results)
-global sym(vp9_sad8x16x3_sse3) PRIVATE
-sym(vp9_sad8x16x3_sse3):
+global sym(vpx_sad8x16x3_sse3) PRIVATE
+sym(vpx_sad8x16x3_sse3):
STACK_FRAME_CREATE_X3
@@ -278,14 +274,14 @@ sym(vp9_sad8x16x3_sse3):
STACK_FRAME_DESTROY_X3
-;void int vp9_sad8x8x3_sse3(
+;void vpx_sad8x8x3_sse3(
; unsigned char *src_ptr,
; int src_stride,
; unsigned char *ref_ptr,
; int ref_stride,
; int *results)
-global sym(vp9_sad8x8x3_sse3) PRIVATE
-sym(vp9_sad8x8x3_sse3):
+global sym(vpx_sad8x8x3_sse3) PRIVATE
+sym(vpx_sad8x8x3_sse3):
STACK_FRAME_CREATE_X3
@@ -303,14 +299,14 @@ sym(vp9_sad8x8x3_sse3):
STACK_FRAME_DESTROY_X3
-;void int vp9_sad4x4x3_sse3(
+;void vpx_sad4x4x3_sse3(
; unsigned char *src_ptr,
; int src_stride,
; unsigned char *ref_ptr,
; int ref_stride,
; int *results)
-global sym(vp9_sad4x4x3_sse3) PRIVATE
-sym(vp9_sad4x4x3_sse3):
+global sym(vpx_sad4x4x3_sse3) PRIVATE
+sym(vpx_sad4x4x3_sse3):
STACK_FRAME_CREATE_X3
diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_sad_sse4.asm b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/sad_sse4.asm
index faf1768a983..bc674479715 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_sad_sse4.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/sad_sse4.asm
@@ -165,14 +165,14 @@
movdqa [rdi + 16], xmm2
%endmacro
-;void vp9_sad16x16x8_sse4(
+;void vpx_sad16x16x8_sse4_1(
; const unsigned char *src_ptr,
; int src_stride,
; const unsigned char *ref_ptr,
; int ref_stride,
; unsigned short *sad_array);
-global sym(vp9_sad16x16x8_sse4) PRIVATE
-sym(vp9_sad16x16x8_sse4):
+global sym(vpx_sad16x16x8_sse4_1) PRIVATE
+sym(vpx_sad16x16x8_sse4_1):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 5
@@ -205,15 +205,15 @@ sym(vp9_sad16x16x8_sse4):
ret
-;void vp9_sad16x8x8_sse4(
+;void vpx_sad16x8x8_sse4_1(
; const unsigned char *src_ptr,
; int src_stride,
; const unsigned char *ref_ptr,
; int ref_stride,
; unsigned short *sad_array
;);
-global sym(vp9_sad16x8x8_sse4) PRIVATE
-sym(vp9_sad16x8x8_sse4):
+global sym(vpx_sad16x8x8_sse4_1) PRIVATE
+sym(vpx_sad16x8x8_sse4_1):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 5
@@ -242,15 +242,15 @@ sym(vp9_sad16x8x8_sse4):
ret
-;void vp9_sad8x8x8_sse4(
+;void vpx_sad8x8x8_sse4_1(
; const unsigned char *src_ptr,
; int src_stride,
; const unsigned char *ref_ptr,
; int ref_stride,
; unsigned short *sad_array
;);
-global sym(vp9_sad8x8x8_sse4) PRIVATE
-sym(vp9_sad8x8x8_sse4):
+global sym(vpx_sad8x8x8_sse4_1) PRIVATE
+sym(vpx_sad8x8x8_sse4_1):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 5
@@ -279,15 +279,15 @@ sym(vp9_sad8x8x8_sse4):
ret
-;void vp9_sad8x16x8_sse4(
+;void vpx_sad8x16x8_sse4_1(
; const unsigned char *src_ptr,
; int src_stride,
; const unsigned char *ref_ptr,
; int ref_stride,
; unsigned short *sad_array
;);
-global sym(vp9_sad8x16x8_sse4) PRIVATE
-sym(vp9_sad8x16x8_sse4):
+global sym(vpx_sad8x16x8_sse4_1) PRIVATE
+sym(vpx_sad8x16x8_sse4_1):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 5
@@ -320,15 +320,15 @@ sym(vp9_sad8x16x8_sse4):
ret
-;void vp9_sad4x4x8_c(
+;void vpx_sad4x4x8_sse4_1(
; const unsigned char *src_ptr,
; int src_stride,
; const unsigned char *ref_ptr,
; int ref_stride,
; unsigned short *sad_array
;);
-global sym(vp9_sad4x4x8_sse4) PRIVATE
-sym(vp9_sad4x4x8_sse4):
+global sym(vpx_sad4x4x8_sse4_1) PRIVATE
+sym(vpx_sad4x4x8_sse4_1):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 5
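
For orientation: unlike the x4d kernels, the x3 kernels in the surrounding sse3/ssse3 files and the x8 kernels above sweep the reference pointer horizontally, writing the SAD at ref_ptr + i for i = 0..2 (x3, int results) or i = 0..7 (x8, unsigned short results). A reference sketch under that reading, generic over block size (the SIMD versions are specialised per size):

    #include <stdlib.h>

    static void sad_x_offsets_ref(const unsigned char *src_ptr, int src_stride,
                                  const unsigned char *ref_ptr, int ref_stride,
                                  int w, int h, int noffsets, int *results) {
      for (int i = 0; i < noffsets; ++i) {
        const unsigned char *s = src_ptr, *r = ref_ptr + i;
        int sad = 0;
        for (int y = 0; y < h; ++y) {
          for (int x = 0; x < w; ++x) sad += abs(s[x] - r[x]);
          s += src_stride;
          r += ref_stride;
        }
        results[i] = sad;
      }
    }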
diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/x86/sad_ssse3.asm b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/sad_ssse3.asm
index 278fc0640ed..49f204fa04b 100644
--- a/chromium/third_party/libvpx/source/libvpx/vp8/common/x86/sad_ssse3.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/sad_ssse3.asm
@@ -146,14 +146,14 @@
%endmacro
-;void int vp8_sad16x16x3_ssse3(
+;void vpx_sad16x16x3_ssse3(
; unsigned char *src_ptr,
; int src_stride,
; unsigned char *ref_ptr,
; int ref_stride,
; int *results)
-global sym(vp8_sad16x16x3_ssse3) PRIVATE
-sym(vp8_sad16x16x3_ssse3):
+global sym(vpx_sad16x16x3_ssse3) PRIVATE
+sym(vpx_sad16x16x3_ssse3):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 5
@@ -169,31 +169,31 @@ sym(vp8_sad16x16x3_ssse3):
mov rdx, 0xf
and rdx, rdi
- jmp .vp8_sad16x16x3_ssse3_skiptable
-.vp8_sad16x16x3_ssse3_jumptable:
- dd .vp8_sad16x16x3_ssse3_aligned_by_0 - .vp8_sad16x16x3_ssse3_do_jump
- dd .vp8_sad16x16x3_ssse3_aligned_by_1 - .vp8_sad16x16x3_ssse3_do_jump
- dd .vp8_sad16x16x3_ssse3_aligned_by_2 - .vp8_sad16x16x3_ssse3_do_jump
- dd .vp8_sad16x16x3_ssse3_aligned_by_3 - .vp8_sad16x16x3_ssse3_do_jump
- dd .vp8_sad16x16x3_ssse3_aligned_by_4 - .vp8_sad16x16x3_ssse3_do_jump
- dd .vp8_sad16x16x3_ssse3_aligned_by_5 - .vp8_sad16x16x3_ssse3_do_jump
- dd .vp8_sad16x16x3_ssse3_aligned_by_6 - .vp8_sad16x16x3_ssse3_do_jump
- dd .vp8_sad16x16x3_ssse3_aligned_by_7 - .vp8_sad16x16x3_ssse3_do_jump
- dd .vp8_sad16x16x3_ssse3_aligned_by_8 - .vp8_sad16x16x3_ssse3_do_jump
- dd .vp8_sad16x16x3_ssse3_aligned_by_9 - .vp8_sad16x16x3_ssse3_do_jump
- dd .vp8_sad16x16x3_ssse3_aligned_by_10 - .vp8_sad16x16x3_ssse3_do_jump
- dd .vp8_sad16x16x3_ssse3_aligned_by_11 - .vp8_sad16x16x3_ssse3_do_jump
- dd .vp8_sad16x16x3_ssse3_aligned_by_12 - .vp8_sad16x16x3_ssse3_do_jump
- dd .vp8_sad16x16x3_ssse3_aligned_by_13 - .vp8_sad16x16x3_ssse3_do_jump
- dd .vp8_sad16x16x3_ssse3_aligned_by_14 - .vp8_sad16x16x3_ssse3_do_jump
- dd .vp8_sad16x16x3_ssse3_aligned_by_15 - .vp8_sad16x16x3_ssse3_do_jump
-.vp8_sad16x16x3_ssse3_skiptable:
-
- call .vp8_sad16x16x3_ssse3_do_jump
-.vp8_sad16x16x3_ssse3_do_jump:
+ jmp .vpx_sad16x16x3_ssse3_skiptable
+.vpx_sad16x16x3_ssse3_jumptable:
+ dd .vpx_sad16x16x3_ssse3_aligned_by_0 - .vpx_sad16x16x3_ssse3_do_jump
+ dd .vpx_sad16x16x3_ssse3_aligned_by_1 - .vpx_sad16x16x3_ssse3_do_jump
+ dd .vpx_sad16x16x3_ssse3_aligned_by_2 - .vpx_sad16x16x3_ssse3_do_jump
+ dd .vpx_sad16x16x3_ssse3_aligned_by_3 - .vpx_sad16x16x3_ssse3_do_jump
+ dd .vpx_sad16x16x3_ssse3_aligned_by_4 - .vpx_sad16x16x3_ssse3_do_jump
+ dd .vpx_sad16x16x3_ssse3_aligned_by_5 - .vpx_sad16x16x3_ssse3_do_jump
+ dd .vpx_sad16x16x3_ssse3_aligned_by_6 - .vpx_sad16x16x3_ssse3_do_jump
+ dd .vpx_sad16x16x3_ssse3_aligned_by_7 - .vpx_sad16x16x3_ssse3_do_jump
+ dd .vpx_sad16x16x3_ssse3_aligned_by_8 - .vpx_sad16x16x3_ssse3_do_jump
+ dd .vpx_sad16x16x3_ssse3_aligned_by_9 - .vpx_sad16x16x3_ssse3_do_jump
+ dd .vpx_sad16x16x3_ssse3_aligned_by_10 - .vpx_sad16x16x3_ssse3_do_jump
+ dd .vpx_sad16x16x3_ssse3_aligned_by_11 - .vpx_sad16x16x3_ssse3_do_jump
+ dd .vpx_sad16x16x3_ssse3_aligned_by_12 - .vpx_sad16x16x3_ssse3_do_jump
+ dd .vpx_sad16x16x3_ssse3_aligned_by_13 - .vpx_sad16x16x3_ssse3_do_jump
+ dd .vpx_sad16x16x3_ssse3_aligned_by_14 - .vpx_sad16x16x3_ssse3_do_jump
+ dd .vpx_sad16x16x3_ssse3_aligned_by_15 - .vpx_sad16x16x3_ssse3_do_jump
+.vpx_sad16x16x3_ssse3_skiptable:
+
+ call .vpx_sad16x16x3_ssse3_do_jump
+.vpx_sad16x16x3_ssse3_do_jump:
pop rcx ; get the address of do_jump
- mov rax, .vp8_sad16x16x3_ssse3_jumptable - .vp8_sad16x16x3_ssse3_do_jump
- add rax, rcx ; get the absolute address of vp8_sad16x16x3_ssse3_jumptable
+ mov rax, .vpx_sad16x16x3_ssse3_jumptable - .vpx_sad16x16x3_ssse3_do_jump
+ add rax, rcx ; get the absolute address of vpx_sad16x16x3_ssse3_jumptable
movsxd rax, dword [rax + 4*rdx] ; get the 32 bit offset from the jumptable
add rcx, rax
@@ -203,23 +203,23 @@ sym(vp8_sad16x16x3_ssse3):
jmp rcx
- PROCESS_16X16X3_OFFSET 0, .vp8_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 1, .vp8_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 2, .vp8_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 3, .vp8_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 4, .vp8_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 5, .vp8_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 6, .vp8_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 7, .vp8_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 8, .vp8_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 9, .vp8_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 10, .vp8_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 11, .vp8_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 12, .vp8_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 13, .vp8_sad16x16x3_ssse3
- PROCESS_16X16X3_OFFSET 14, .vp8_sad16x16x3_ssse3
-
-.vp8_sad16x16x3_ssse3_aligned_by_15:
+ PROCESS_16X16X3_OFFSET 0, .vpx_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 1, .vpx_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 2, .vpx_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 3, .vpx_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 4, .vpx_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 5, .vpx_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 6, .vpx_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 7, .vpx_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 8, .vpx_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 9, .vpx_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 10, .vpx_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 11, .vpx_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 12, .vpx_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 13, .vpx_sad16x16x3_ssse3
+ PROCESS_16X16X3_OFFSET 14, .vpx_sad16x16x3_ssse3
+
+.vpx_sad16x16x3_ssse3_aligned_by_15:
PROCESS_16X2X3 1
PROCESS_16X2X3 0
PROCESS_16X2X3 0
@@ -229,7 +229,7 @@ sym(vp8_sad16x16x3_ssse3):
PROCESS_16X2X3 0
PROCESS_16X2X3 0
-.vp8_sad16x16x3_ssse3_store_off:
+.vpx_sad16x16x3_ssse3_store_off:
mov rdi, arg(4) ;Results
movq xmm0, xmm5
@@ -259,14 +259,14 @@ sym(vp8_sad16x16x3_ssse3):
pop rbp
ret
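
The call/pop pair above is the classic position-independent-code trick: pop the return address to learn the current instruction pointer, add a 32-bit offset pulled from the jumptable, and jmp to the loop variant matching the low four bits of the reference pointer, each variant hard-wiring a different byte realignment. The C analogue is a handler table indexed by alignment (names hypothetical):

    #include <stdint.h>

    typedef void (*sad_variant_fn)(void);

    /* Mirrors "and rdx, rdi" plus the indexed jmp: slot n holds a loop
     * specialised for a reference pointer that sits n bytes past a
     * 16-byte boundary. */
    static void dispatch_by_alignment(const uint8_t *ref_ptr,
                                      const sad_variant_fn table[16]) {
      table[(uintptr_t)ref_ptr & 15]();
    }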
-;void int vp8_sad16x8x3_ssse3(
+;void vpx_sad16x8x3_ssse3(
; unsigned char *src_ptr,
; int src_stride,
; unsigned char *ref_ptr,
; int ref_stride,
; int *results)
-global sym(vp8_sad16x8x3_ssse3) PRIVATE
-sym(vp8_sad16x8x3_ssse3):
+global sym(vpx_sad16x8x3_ssse3) PRIVATE
+sym(vpx_sad16x8x3_ssse3):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 5
@@ -282,31 +282,31 @@ sym(vp8_sad16x8x3_ssse3):
mov rdx, 0xf
and rdx, rdi
- jmp .vp8_sad16x8x3_ssse3_skiptable
-.vp8_sad16x8x3_ssse3_jumptable:
- dd .vp8_sad16x8x3_ssse3_aligned_by_0 - .vp8_sad16x8x3_ssse3_do_jump
- dd .vp8_sad16x8x3_ssse3_aligned_by_1 - .vp8_sad16x8x3_ssse3_do_jump
- dd .vp8_sad16x8x3_ssse3_aligned_by_2 - .vp8_sad16x8x3_ssse3_do_jump
- dd .vp8_sad16x8x3_ssse3_aligned_by_3 - .vp8_sad16x8x3_ssse3_do_jump
- dd .vp8_sad16x8x3_ssse3_aligned_by_4 - .vp8_sad16x8x3_ssse3_do_jump
- dd .vp8_sad16x8x3_ssse3_aligned_by_5 - .vp8_sad16x8x3_ssse3_do_jump
- dd .vp8_sad16x8x3_ssse3_aligned_by_6 - .vp8_sad16x8x3_ssse3_do_jump
- dd .vp8_sad16x8x3_ssse3_aligned_by_7 - .vp8_sad16x8x3_ssse3_do_jump
- dd .vp8_sad16x8x3_ssse3_aligned_by_8 - .vp8_sad16x8x3_ssse3_do_jump
- dd .vp8_sad16x8x3_ssse3_aligned_by_9 - .vp8_sad16x8x3_ssse3_do_jump
- dd .vp8_sad16x8x3_ssse3_aligned_by_10 - .vp8_sad16x8x3_ssse3_do_jump
- dd .vp8_sad16x8x3_ssse3_aligned_by_11 - .vp8_sad16x8x3_ssse3_do_jump
- dd .vp8_sad16x8x3_ssse3_aligned_by_12 - .vp8_sad16x8x3_ssse3_do_jump
- dd .vp8_sad16x8x3_ssse3_aligned_by_13 - .vp8_sad16x8x3_ssse3_do_jump
- dd .vp8_sad16x8x3_ssse3_aligned_by_14 - .vp8_sad16x8x3_ssse3_do_jump
- dd .vp8_sad16x8x3_ssse3_aligned_by_15 - .vp8_sad16x8x3_ssse3_do_jump
-.vp8_sad16x8x3_ssse3_skiptable:
-
- call .vp8_sad16x8x3_ssse3_do_jump
-.vp8_sad16x8x3_ssse3_do_jump:
+ jmp .vpx_sad16x8x3_ssse3_skiptable
+.vpx_sad16x8x3_ssse3_jumptable:
+ dd .vpx_sad16x8x3_ssse3_aligned_by_0 - .vpx_sad16x8x3_ssse3_do_jump
+ dd .vpx_sad16x8x3_ssse3_aligned_by_1 - .vpx_sad16x8x3_ssse3_do_jump
+ dd .vpx_sad16x8x3_ssse3_aligned_by_2 - .vpx_sad16x8x3_ssse3_do_jump
+ dd .vpx_sad16x8x3_ssse3_aligned_by_3 - .vpx_sad16x8x3_ssse3_do_jump
+ dd .vpx_sad16x8x3_ssse3_aligned_by_4 - .vpx_sad16x8x3_ssse3_do_jump
+ dd .vpx_sad16x8x3_ssse3_aligned_by_5 - .vpx_sad16x8x3_ssse3_do_jump
+ dd .vpx_sad16x8x3_ssse3_aligned_by_6 - .vpx_sad16x8x3_ssse3_do_jump
+ dd .vpx_sad16x8x3_ssse3_aligned_by_7 - .vpx_sad16x8x3_ssse3_do_jump
+ dd .vpx_sad16x8x3_ssse3_aligned_by_8 - .vpx_sad16x8x3_ssse3_do_jump
+ dd .vpx_sad16x8x3_ssse3_aligned_by_9 - .vpx_sad16x8x3_ssse3_do_jump
+ dd .vpx_sad16x8x3_ssse3_aligned_by_10 - .vpx_sad16x8x3_ssse3_do_jump
+ dd .vpx_sad16x8x3_ssse3_aligned_by_11 - .vpx_sad16x8x3_ssse3_do_jump
+ dd .vpx_sad16x8x3_ssse3_aligned_by_12 - .vpx_sad16x8x3_ssse3_do_jump
+ dd .vpx_sad16x8x3_ssse3_aligned_by_13 - .vpx_sad16x8x3_ssse3_do_jump
+ dd .vpx_sad16x8x3_ssse3_aligned_by_14 - .vpx_sad16x8x3_ssse3_do_jump
+ dd .vpx_sad16x8x3_ssse3_aligned_by_15 - .vpx_sad16x8x3_ssse3_do_jump
+.vpx_sad16x8x3_ssse3_skiptable:
+
+ call .vpx_sad16x8x3_ssse3_do_jump
+.vpx_sad16x8x3_ssse3_do_jump:
pop rcx ; get the address of do_jump
- mov rax, .vp8_sad16x8x3_ssse3_jumptable - .vp8_sad16x8x3_ssse3_do_jump
- add rax, rcx ; get the absolute address of vp8_sad16x8x3_ssse3_jumptable
+ mov rax, .vpx_sad16x8x3_ssse3_jumptable - .vpx_sad16x8x3_ssse3_do_jump
+ add rax, rcx ; get the absolute address of vpx_sad16x8x3_ssse3_jumptable
movsxd rax, dword [rax + 4*rdx] ; get the 32 bit offset from the jumptable
add rcx, rax
@@ -316,30 +316,30 @@ sym(vp8_sad16x8x3_ssse3):
jmp rcx
- PROCESS_16X8X3_OFFSET 0, .vp8_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 1, .vp8_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 2, .vp8_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 3, .vp8_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 4, .vp8_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 5, .vp8_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 6, .vp8_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 7, .vp8_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 8, .vp8_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 9, .vp8_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 10, .vp8_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 11, .vp8_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 12, .vp8_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 13, .vp8_sad16x8x3_ssse3
- PROCESS_16X8X3_OFFSET 14, .vp8_sad16x8x3_ssse3
-
-.vp8_sad16x8x3_ssse3_aligned_by_15:
+ PROCESS_16X8X3_OFFSET 0, .vpx_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 1, .vpx_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 2, .vpx_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 3, .vpx_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 4, .vpx_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 5, .vpx_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 6, .vpx_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 7, .vpx_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 8, .vpx_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 9, .vpx_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 10, .vpx_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 11, .vpx_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 12, .vpx_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 13, .vpx_sad16x8x3_ssse3
+ PROCESS_16X8X3_OFFSET 14, .vpx_sad16x8x3_ssse3
+
+.vpx_sad16x8x3_ssse3_aligned_by_15:
PROCESS_16X2X3 1
PROCESS_16X2X3 0
PROCESS_16X2X3 0
PROCESS_16X2X3 0
-.vp8_sad16x8x3_ssse3_store_off:
+.vpx_sad16x8x3_ssse3_store_off:
mov rdi, arg(4) ;Results
movq xmm0, xmm5
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_mem/include/vpx_mem_intrnl.h b/chromium/third_party/libvpx/source/libvpx/vpx_mem/include/vpx_mem_intrnl.h
index 225a3babfe3..c4dd78550f3 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_mem/include/vpx_mem_intrnl.h
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_mem/include/vpx_mem_intrnl.h
@@ -13,35 +13,6 @@
#define VPX_MEM_INCLUDE_VPX_MEM_INTRNL_H_
#include "./vpx_config.h"
-#ifndef CONFIG_MEM_MANAGER
-# if defined(VXWORKS)
-# define CONFIG_MEM_MANAGER 1 /*include heap manager functionality,*/
-/*default: enabled on vxworks*/
-# else
-# define CONFIG_MEM_MANAGER 0 /*include heap manager functionality*/
-# endif
-#endif /*CONFIG_MEM_MANAGER*/
-
-#ifndef CONFIG_MEM_TRACKER
-# define CONFIG_MEM_TRACKER 1 /*include xvpx_* calls in the lib*/
-#endif
-
-#ifndef CONFIG_MEM_CHECKS
-# define CONFIG_MEM_CHECKS 0 /*include some basic safety checks in
-vpx_memcpy, _memset, and _memmove*/
-#endif
-
-#ifndef USE_GLOBAL_FUNCTION_POINTERS
-# define USE_GLOBAL_FUNCTION_POINTERS 0 /*use function pointers instead of compiled functions.*/
-#endif
-
-#if CONFIG_MEM_TRACKER
-# include "vpx_mem_tracker.h"
-# if VPX_MEM_TRACKER_VERSION_CHIEF != 2 || VPX_MEM_TRACKER_VERSION_MAJOR != 5
-# error "vpx_mem requires memory tracker version 2.5 to track memory usage"
-# endif
-#endif
-
#define ADDRESS_STORAGE_SIZE sizeof(size_t)
#ifndef DEFAULT_ALIGNMENT
@@ -54,41 +25,6 @@ than vpx_memalign*/
# endif
#endif
-#if CONFIG_MEM_TRACKER
-# define TRY_BOUNDS_CHECK 1 /*when set to 1 pads each allocation,
-integrity can be checked using
-vpx_memory_tracker_check_integrity
-or on free by defining*/
-/*TRY_BOUNDS_CHECK_ON_FREE*/
-#else
-# define TRY_BOUNDS_CHECK 0
-#endif /*CONFIG_MEM_TRACKER*/
-
-#if TRY_BOUNDS_CHECK
-# define TRY_BOUNDS_CHECK_ON_FREE 0 /*checks mem integrity on every
-free, very expensive*/
-# define BOUNDS_CHECK_VALUE 0xdeadbeef /*value stored before/after ea.
-mem addr for bounds checking*/
-# define BOUNDS_CHECK_PAD_SIZE 32 /*size of the padding before and
-after ea allocation to be filled
-with BOUNDS_CHECK_VALUE.
-this should be a multiple of 4*/
-#else
-# define BOUNDS_CHECK_VALUE 0
-# define BOUNDS_CHECK_PAD_SIZE 0
-#endif /*TRY_BOUNDS_CHECK*/
-
-#ifndef REMOVE_PRINTFS
-# define REMOVE_PRINTFS 0
-#endif
-
-/* Should probably use a vpx_mem logger function. */
-#if REMOVE_PRINTFS
-# define _P(x)
-#else
-# define _P(x) x
-#endif
-
/*returns an addr aligned to the byte boundary specified by align*/
#define align_addr(addr,align) (void*)(((size_t)(addr) + ((align) - 1)) & (size_t)-(align))
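
The surviving align_addr macro rounds an address up to a power-of-two boundary: add align - 1, then mask with -align, whose two's-complement form has all low bits clear. A self-contained check (assumes align is a power of two):

    #include <assert.h>
    #include <stddef.h>

    #define align_addr(addr, align) \
      (void *)(((size_t)(addr) + ((align)-1)) & (size_t)-(align))

    int main(void) {
      /* 0x1003 rounds up to the next 16-byte boundary; an already
       * aligned input is returned unchanged. */
      assert(align_addr((void *)0x1003, 16) == (void *)0x1010);
      assert(align_addr((void *)0x1010, 16) == (void *)0x1010);
      return 0;
    }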
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_mem/include/vpx_mem_tracker.h b/chromium/third_party/libvpx/source/libvpx/vpx_mem/include/vpx_mem_tracker.h
deleted file mode 100644
index 1335e0017b3..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vpx_mem/include/vpx_mem_tracker.h
+++ /dev/null
@@ -1,179 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef VPX_MEM_INCLUDE_VPX_MEM_TRACKER_H_
-#define VPX_MEM_INCLUDE_VPX_MEM_TRACKER_H_
-
-/* vpx_mem_tracker version info */
-#define vpx_mem_tracker_version "2.5.1.1"
-
-#define VPX_MEM_TRACKER_VERSION_CHIEF 2
-#define VPX_MEM_TRACKER_VERSION_MAJOR 5
-#define VPX_MEM_TRACKER_VERSION_MINOR 1
-#define VPX_MEM_TRACKER_VERSION_PATCH 1
-/* END - vpx_mem_tracker version info */
-
-#include <stdarg.h>
-
-struct mem_block {
- size_t addr;
- unsigned int size,
- line;
- char *file;
- struct mem_block *prev,
- * next;
-
- int padded; // This mem_block has padding for integrity checks.
- // As of right now, this should only be 0 if
- // using vpx_mem_alloc to allocate cache memory.
- // 2005-01-11 tjf
-};
-
-#if defined(__cplusplus)
-extern "C" {
-#endif
-
- /*
- vpx_memory_tracker_init(int padding_size, int pad_value)
- padding_size - the size of the padding before and after each mem addr.
- Values > 0 indicate that integrity checks can be performed
- by inspecting these areas.
- pad_value - the initial value within the padding area before and after
- each mem addr.
-
- Initializes the memory tracker interface. Should be called before any
- other calls to the memory tracker.
- */
- int vpx_memory_tracker_init(int padding_size, int pad_value);
-
- /*
- vpx_memory_tracker_destroy()
- Deinitializes the memory tracker interface
- */
- void vpx_memory_tracker_destroy();
-
- /*
- vpx_memory_tracker_add(size_t addr, unsigned int size,
- char * file, unsigned int line)
- addr - memory address to be added to list
- size - size of addr
- file - the file addr was referenced from
- line - the line in file addr was referenced from
- Adds memory address addr, it's size, file and line it came from
- to the memory tracker allocation table
- */
- void vpx_memory_tracker_add(size_t addr, unsigned int size,
- char *file, unsigned int line,
- int padded);
-
- /*
- vpx_memory_tracker_add(size_t addr, unsigned int size, char * file, unsigned int line)
- addr - memory address to be added to be removed
- padded - if 0, disables bounds checking on this memory block even if bounds
- checking is enabled. (for example, when allocating cache memory, we still want
- to check for memory leaks, but we do not waste cache space for bounds check padding)
- Removes the specified address from the memory tracker's allocation
- table
- Return:
- 0: on success
- -1: if memory allocation table's mutex could not be locked
- -2: if the addr was not found in the list
- */
- int vpx_memory_tracker_remove(size_t addr);
-
- /*
- vpx_memory_tracker_find(unsigned int addr)
- addr - address to be found in the memory tracker's
- allocation table
- Return:
- If found, pointer to the memory block that matches addr
- NULL otherwise
- */
- struct mem_block *vpx_memory_tracker_find(size_t addr);
-
- /*
- vpx_memory_tracker_dump()
- Dumps the current contents of the memory
- tracker allocation table
- */
- void vpx_memory_tracker_dump();
-
- /*
- vpx_memory_tracker_check_integrity()
- If a padding_size was provided to vpx_memory_tracker_init()
- This function will verify that the region before and after each
- memory address contains the specified pad_value. Should the check
- fail, the filename and line of the check will be printed out.
- */
- void vpx_memory_tracker_check_integrity(char *file, unsigned int line);
-
- /*
- vpx_memory_tracker_set_log_type
- type - value representing the logging type to use
- option - type specific option. This will be interpreted differently
- based on the type.
- Sets the logging type for the memory tracker.
- Values currently supported:
- 0: if option is NULL, log to stderr, otherwise interpret option as a
- filename and attempt to open it.
- 1: Use output_debug_string (WIN32 only), option ignored
- Return:
- 0: on success
- -1: if the logging type could not be set, because the value was invalid
- or because a file could not be opened
- */
- int vpx_memory_tracker_set_log_type(int type, char *option);
-
- /*
- vpx_memory_tracker_set_log_func
- userdata - ptr to be passed to the supplied logfunc, can be NULL
- logfunc - the logging function to be used to output data from
- vpx_memory_track_dump/check_integrity
- Sets a logging function to be used by the memory tracker.
- Return:
- 0: on success
- -1: if the logging type could not be set because logfunc was NULL
- */
- int vpx_memory_tracker_set_log_func(void *userdata,
- void(*logfunc)(void *userdata,
- const char *fmt, va_list args));
-
- /* Wrappers to standard library functions. */
- typedef void *(* mem_track_malloc_func)(size_t);
- typedef void *(* mem_track_calloc_func)(size_t, size_t);
- typedef void *(* mem_track_realloc_func)(void *, size_t);
- typedef void (* mem_track_free_func)(void *);
- typedef void *(* mem_track_memcpy_func)(void *, const void *, size_t);
- typedef void *(* mem_track_memset_func)(void *, int, size_t);
- typedef void *(* mem_track_memmove_func)(void *, const void *, size_t);
-
- /*
- vpx_memory_tracker_set_functions
-
- Sets the function pointers for the standard library functions.
-
- Return:
- 0: on success
- -1: if the use global function pointers is not set.
- */
- int vpx_memory_tracker_set_functions(mem_track_malloc_func g_malloc_l
-, mem_track_calloc_func g_calloc_l
-, mem_track_realloc_func g_realloc_l
-, mem_track_free_func g_free_l
-, mem_track_memcpy_func g_memcpy_l
-, mem_track_memset_func g_memset_l
-, mem_track_memmove_func g_memmove_l);
-
-#if defined(__cplusplus)
-}
-#endif
-
-#endif // VPX_MEM_INCLUDE_VPX_MEM_TRACKER_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_mem/memory_manager/hmm_alloc.c b/chromium/third_party/libvpx/source/libvpx/vpx_mem/memory_manager/hmm_alloc.c
deleted file mode 100644
index ab3562dfb3d..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vpx_mem/memory_manager/hmm_alloc.c
+++ /dev/null
@@ -1,58 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-/* This code is in the public domain.
-** Version: 1.1 Author: Walt Karas
-*/
-
-#include "hmm_intrnl.h"
-
-void *U(alloc)(U(descriptor) *desc, U(size_aau) n) {
-#ifdef HMM_AUDIT_FAIL
-
- if (desc->avl_tree_root)
- AUDIT_BLOCK(PTR_REC_TO_HEAD(desc->avl_tree_root))
-#endif
-
- if (desc->last_freed) {
-#ifdef HMM_AUDIT_FAIL
- AUDIT_BLOCK(desc->last_freed)
-#endif
-
- U(into_free_collection)(desc, (head_record *)(desc->last_freed));
-
- desc->last_freed = 0;
- }
-
- /* Add space for block header. */
- n += HEAD_AAUS;
-
- /* Convert n from number of address alignment units to block alignment
- ** units. */
- n = DIV_ROUND_UP(n, HMM_BLOCK_ALIGN_UNIT);
-
- if (n < MIN_BLOCK_BAUS)
- n = MIN_BLOCK_BAUS;
-
- {
- /* Search for the first node of the bin containing the smallest
- ** block big enough to satisfy request. */
- ptr_record *ptr_rec_ptr =
- U(avl_search)(
- (U(avl_avl) *) & (desc->avl_tree_root), (U(size_bau)) n,
- AVL_GREATER_EQUAL);
-
- /* If an approprate bin is found, satisfy the allocation request,
- ** otherwise return null pointer. */
- return(ptr_rec_ptr ?
- U(alloc_from_bin)(desc, ptr_rec_ptr, (U(size_bau)) n) : 0);
- }
-}
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_mem/memory_manager/hmm_base.c b/chromium/third_party/libvpx/source/libvpx/vpx_mem/memory_manager/hmm_base.c
deleted file mode 100644
index 0eff59d20e9..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vpx_mem/memory_manager/hmm_base.c
+++ /dev/null
@@ -1,405 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-/* This code is in the public domain.
-** Version: 1.1 Author: Walt Karas
-*/
-
-#include "hmm_intrnl.h"
-
-void U(init)(U(descriptor) *desc) {
- desc->avl_tree_root = 0;
- desc->last_freed = 0;
-}
-
-/* Remove a free block from a bin's doubly-linked list when it is not,
-** the first block in the bin.
-*/
-void U(dll_remove)(
- /* Pointer to pointer record in the block to be removed. */
- ptr_record *to_remove) {
- to_remove->prev->next = to_remove->next;
-
- if (to_remove->next)
- to_remove->next->prev = to_remove->prev;
-}
-
-/* Put a block into the free collection of a heap.
-*/
-void U(into_free_collection)(
- /* Pointer to heap descriptor. */
- U(descriptor) *desc,
- /* Pointer to head record of block. */
- head_record *head_ptr) {
- ptr_record *ptr_rec_ptr = HEAD_TO_PTR_REC(head_ptr);
-
- ptr_record *bin_front_ptr =
- U(avl_insert)((U(avl_avl) *) & (desc->avl_tree_root), ptr_rec_ptr);
-
- if (bin_front_ptr != ptr_rec_ptr) {
- /* The block was not inserted into the AVL tree because there is
- ** already a bin for the size of the block. */
-
- MARK_SUCCESSIVE_BLOCK_IN_FREE_BIN(head_ptr)
- ptr_rec_ptr->self = ptr_rec_ptr;
-
- /* Make the block the new second block in the bin's doubly-linked
- ** list. */
- ptr_rec_ptr->prev = bin_front_ptr;
- ptr_rec_ptr->next = bin_front_ptr->next;
- bin_front_ptr->next = ptr_rec_ptr;
-
- if (ptr_rec_ptr->next)
- ptr_rec_ptr->next->prev = ptr_rec_ptr;
- } else
- /* Block is first block in new bin. */
- ptr_rec_ptr->next = 0;
-}
-
-/* Allocate a block from a given bin. Returns a pointer to the payload
-** of the removed block. The "last freed" pointer must be null prior
-** to calling this function.
-*/
-void *U(alloc_from_bin)(
- /* Pointer to heap descriptor. */
- U(descriptor) *desc,
- /* Pointer to pointer record of first block in bin. */
- ptr_record *bin_front_ptr,
- /* Number of BAUs needed in the allocated block. If the block taken
- ** from the bin is significantly larger than the number of BAUs needed,
- ** the "extra" BAUs are split off to form a new free block. */
- U(size_bau) n_baus) {
- head_record *head_ptr;
- U(size_bau) rem_baus;
-
- if (bin_front_ptr->next) {
- /* There are multiple blocks in this bin. Use the 2nd block in
- ** the bin to avoid needless change to the AVL tree.
- */
-
- ptr_record *ptr_rec_ptr = bin_front_ptr->next;
- head_ptr = PTR_REC_TO_HEAD(ptr_rec_ptr);
-
-#ifdef AUDIT_FAIL
- AUDIT_BLOCK(head_ptr)
-#endif
-
- U(dll_remove)(ptr_rec_ptr);
- } else {
- /* There is only one block in the bin, so it has to be removed
- ** from the AVL tree.
- */
-
- head_ptr = PTR_REC_TO_HEAD(bin_front_ptr);
-
- U(avl_remove)(
- (U(avl_avl) *) & (desc->avl_tree_root), BLOCK_BAUS(head_ptr));
- }
-
- MARK_BLOCK_ALLOCATED(head_ptr)
-
- rem_baus = BLOCK_BAUS(head_ptr) - n_baus;
-
- if (rem_baus >= MIN_BLOCK_BAUS) {
- /* Since there are enough "extra" BAUs, split them off to form
- ** a new free block.
- */
-
- head_record *rem_head_ptr =
- (head_record *) BAUS_FORWARD(head_ptr, n_baus);
-
- /* Change the next block's header to reflect the fact that the
- ** block preceeding it is now smaller.
- */
- SET_PREV_BLOCK_BAUS(
- BAUS_FORWARD(head_ptr, head_ptr->block_size), rem_baus)
-
- head_ptr->block_size = n_baus;
-
- rem_head_ptr->previous_block_size = n_baus;
- rem_head_ptr->block_size = rem_baus;
-
- desc->last_freed = rem_head_ptr;
- }
-
- return(HEAD_TO_PTR_REC(head_ptr));
-}
-
-/* Take a block out of the free collection.
-*/
-void U(out_of_free_collection)(
- /* Descriptor of heap that block is in. */
- U(descriptor) *desc,
- /* Pointer to head of block to take out of free collection. */
- head_record *head_ptr) {
- ptr_record *ptr_rec_ptr = HEAD_TO_PTR_REC(head_ptr);
-
- if (ptr_rec_ptr->self == ptr_rec_ptr)
- /* Block is not the front block in its bin, so all we have to
- ** do is take it out of the bin's doubly-linked list. */
- U(dll_remove)(ptr_rec_ptr);
- else {
- ptr_record *next = ptr_rec_ptr->next;
-
- if (next)
- /* Block is the front block in its bin, and there is at least
- ** one other block in the bin. Substitute the next block for
- ** the front block. */
- U(avl_subst)((U(avl_avl) *) & (desc->avl_tree_root), next);
- else
- /* Block is the front block in its bin, but there is no other
- ** block in the bin. Eliminate the bin. */
- U(avl_remove)(
- (U(avl_avl) *) & (desc->avl_tree_root), BLOCK_BAUS(head_ptr));
- }
-}
-
-void U(free)(U(descriptor) *desc, void *payload_ptr) {
- /* Flags if coalesce with adjacent block. */
- int coalesce;
-
- head_record *fwd_head_ptr;
- head_record *free_head_ptr = PTR_REC_TO_HEAD(payload_ptr);
-
- desc->num_baus_can_shrink = 0;
-
-#ifdef HMM_AUDIT_FAIL
-
- AUDIT_BLOCK(free_head_ptr)
-
- /* Make sure not freeing an already free block. */
- if (!IS_BLOCK_ALLOCATED(free_head_ptr))
- HMM_AUDIT_FAIL
-
- if (desc->avl_tree_root)
- /* Audit root block in AVL tree. */
- AUDIT_BLOCK(PTR_REC_TO_HEAD(desc->avl_tree_root))
-
-#endif
-
- fwd_head_ptr =
- (head_record *) BAUS_FORWARD(free_head_ptr, free_head_ptr->block_size);
-
- if (free_head_ptr->previous_block_size) {
- /* Coalesce with backward block if possible. */
-
- head_record *bkwd_head_ptr =
- (head_record *) BAUS_BACKWARD(
- free_head_ptr, free_head_ptr->previous_block_size);
-
-#ifdef HMM_AUDIT_FAIL
- AUDIT_BLOCK(bkwd_head_ptr)
-#endif
-
- if (bkwd_head_ptr == (head_record *)(desc->last_freed)) {
- desc->last_freed = 0;
- coalesce = 1;
- } else if (IS_BLOCK_ALLOCATED(bkwd_head_ptr))
- coalesce = 0;
- else {
- U(out_of_free_collection)(desc, bkwd_head_ptr);
- coalesce = 1;
- }
-
- if (coalesce) {
- bkwd_head_ptr->block_size += free_head_ptr->block_size;
- SET_PREV_BLOCK_BAUS(fwd_head_ptr, BLOCK_BAUS(bkwd_head_ptr))
- free_head_ptr = bkwd_head_ptr;
- }
- }
-
- if (fwd_head_ptr->block_size == 0) {
- /* Block to be freed is last block before dummy end-of-chunk block. */
- desc->end_of_shrinkable_chunk =
- BAUS_FORWARD(fwd_head_ptr, DUMMY_END_BLOCK_BAUS);
- desc->num_baus_can_shrink = BLOCK_BAUS(free_head_ptr);
-
- if (PREV_BLOCK_BAUS(free_head_ptr) == 0)
- /* Free block is the entire chunk, so shrinking can eliminate
- ** entire chunk including dummy end block. */
- desc->num_baus_can_shrink += DUMMY_END_BLOCK_BAUS;
- } else {
- /* Coalesce with forward block if possible. */
-
-#ifdef HMM_AUDIT_FAIL
- AUDIT_BLOCK(fwd_head_ptr)
-#endif
-
- if (fwd_head_ptr == (head_record *)(desc->last_freed)) {
- desc->last_freed = 0;
- coalesce = 1;
- } else if (IS_BLOCK_ALLOCATED(fwd_head_ptr))
- coalesce = 0;
- else {
- U(out_of_free_collection)(desc, fwd_head_ptr);
- coalesce = 1;
- }
-
- if (coalesce) {
- free_head_ptr->block_size += fwd_head_ptr->block_size;
-
- fwd_head_ptr =
- (head_record *) BAUS_FORWARD(
- fwd_head_ptr, BLOCK_BAUS(fwd_head_ptr));
-
- SET_PREV_BLOCK_BAUS(fwd_head_ptr, BLOCK_BAUS(free_head_ptr))
-
- if (fwd_head_ptr->block_size == 0) {
- /* Coalesced block to be freed is last block before dummy
- ** end-of-chunk block. */
- desc->end_of_shrinkable_chunk =
- BAUS_FORWARD(fwd_head_ptr, DUMMY_END_BLOCK_BAUS);
- desc->num_baus_can_shrink = BLOCK_BAUS(free_head_ptr);
-
- if (PREV_BLOCK_BAUS(free_head_ptr) == 0)
- /* Free block is the entire chunk, so shrinking can
- ** eliminate entire chunk including dummy end block. */
- desc->num_baus_can_shrink += DUMMY_END_BLOCK_BAUS;
- }
- }
- }
-
- if (desc->last_freed) {
- /* There is a last freed block, but it is not adjacent to the
- ** block being freed by this call to free, so put the last
- ** freed block into the free collection.
- */
-
-#ifdef HMM_AUDIT_FAIL
- AUDIT_BLOCK(desc->last_freed)
-#endif
-
- U(into_free_collection)(desc, (head_record *)(desc->last_freed));
- }
-
- desc->last_freed = free_head_ptr;
-}
-
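
Though this allocator is being deleted, U(free) above is textbook boundary-tag coalescing: each header stores its own size and its predecessor's, so both neighbours are reachable in constant time for merging. A sketch of the pointer arithmetic, with the field widths and BAU_BYTES chosen for illustration (the real code parameterises these via U() and HMM_BLOCK_ALIGN_UNIT):

    typedef struct {
      unsigned short previous_block_size; /* BAUs; 0 => first in chunk */
      unsigned short block_size;          /* BAUs; 0 => dummy end block */
    } head_rec;

    #define BAU_BYTES 8 /* stand-in for the block alignment unit size */

    static head_rec *fwd_block(head_rec *h) {  /* cf. BAUS_FORWARD */
      return (head_rec *)((char *)h + h->block_size * BAU_BYTES);
    }
    static head_rec *bkwd_block(head_rec *h) { /* cf. BAUS_BACKWARD */
      return (head_rec *)((char *)h - h->previous_block_size * BAU_BYTES);
    }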
-void U(new_chunk)(U(descriptor) *desc, void *start, U(size_bau) n_baus) {
-#ifdef HMM_AUDIT_FAIL
-
- if (desc->avl_tree_root)
- /* Audit root block in AVL tree. */
- AUDIT_BLOCK(PTR_REC_TO_HEAD(desc->avl_tree_root))
-#endif
-
-#undef HEAD_PTR
-#define HEAD_PTR ((head_record *) start)
-
- /* Make the chunk one big free block followed by a dummy end block.
- */
-
- n_baus -= DUMMY_END_BLOCK_BAUS;
-
- HEAD_PTR->previous_block_size = 0;
- HEAD_PTR->block_size = n_baus;
-
- U(into_free_collection)(desc, HEAD_PTR);
-
- /* Set up the dummy end block. */
- start = BAUS_FORWARD(start, n_baus);
- HEAD_PTR->previous_block_size = n_baus;
- HEAD_PTR->block_size = 0;
-
-#undef HEAD_PTR
-}
-
-#ifdef HMM_AUDIT_FAIL
-
-/* Function that does audit fail actions defined my preprocessor symbol,
-** and returns a dummy integer value.
-*/
-int U(audit_block_fail_dummy_return)(void) {
- HMM_AUDIT_FAIL
-
- /* Dummy return. */
- return(0);
-}
-
-#endif
-
-/* AVL Tree instantiation. */
-
-#ifdef HMM_AUDIT_FAIL
-
-/* The AVL tree generic package passes an ACCESS of 1 when it "touches"
-** a child node for the first time during a particular operation. I use
-** this feature to audit only one time (per operation) the free blocks
-** that are tree nodes. Since the root node is not a child node, it has
-** to be audited directly.
-*/
-
-/* The pain you feel while reading these macros will not be in vain. It
-** will remove all doubt from you mind that C++ inline functions are
-** a very good thing.
-*/
-
-#define AVL_GET_LESS(H, ACCESS) \
- (((ACCESS) ? AUDIT_BLOCK_AS_EXPR(PTR_REC_TO_HEAD(H)) : 0), (H)->self)
-#define AVL_GET_GREATER(H, ACCESS) \
- (((ACCESS) ? AUDIT_BLOCK_AS_EXPR(PTR_REC_TO_HEAD(H)) : 0), (H)->prev)
-
-#else
-
-#define AVL_GET_LESS(H, ACCESS) ((H)->self)
-#define AVL_GET_GREATER(H, ACCESS) ((H)->prev)
-
-#endif
-
-#define AVL_SET_LESS(H, LH) (H)->self = (LH);
-#define AVL_SET_GREATER(H, GH) (H)->prev = (GH);
-
-/* high bit of high bit of
-** block_size previous_block_size balance factor
-** ----------- ------------------- --------------
-** 0 0 n/a (block allocated)
-** 0 1 1
-** 1 0 -1
-** 1 1 0
-*/
-
-#define AVL_GET_BALANCE_FACTOR(H) \
- ((((head_record *) (PTR_REC_TO_HEAD(H)))->block_size & \
- HIGH_BIT_BAU_SIZE) ? \
- (((head_record *) (PTR_REC_TO_HEAD(H)))->previous_block_size & \
- HIGH_BIT_BAU_SIZE ? 0 : -1) : 1)
-
-#define AVL_SET_BALANCE_FACTOR(H, BF) \
- { \
- register head_record *p = \
- (head_record *) PTR_REC_TO_HEAD(H); \
- register int bal_f = (BF); \
- \
- if (bal_f <= 0) \
- p->block_size |= HIGH_BIT_BAU_SIZE; \
- else \
- p->block_size &= ~HIGH_BIT_BAU_SIZE; \
- if (bal_f >= 0) \
- p->previous_block_size |= HIGH_BIT_BAU_SIZE; \
- else \
- p->previous_block_size &= ~HIGH_BIT_BAU_SIZE; \
- }
-
-#define COMPARE_KEY_KEY(K1, K2) ((K1) == (K2) ? 0 : ((K1) > (K2) ? 1 : -1))
-
-#define AVL_COMPARE_KEY_NODE(K, H) \
- COMPARE_KEY_KEY(K, BLOCK_BAUS(PTR_REC_TO_HEAD(H)))
-
-#define AVL_COMPARE_NODE_NODE(H1, H2) \
- COMPARE_KEY_KEY(BLOCK_BAUS(PTR_REC_TO_HEAD(H1)), \
- BLOCK_BAUS(PTR_REC_TO_HEAD(H2)))
-
-#define AVL_NULL ((ptr_record *) 0)
-
-#define AVL_IMPL_MASK \
- ( AVL_IMPL_INSERT | AVL_IMPL_SEARCH | AVL_IMPL_REMOVE | AVL_IMPL_SUBST )
-
-#include "cavl_impl.h"
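
The balance-factor table above packs an AVL node's -1/0/+1 state into the otherwise unused high bits of the two size fields of a free block, with (0,0) reserved to mark allocated blocks. Decoded in C (16-bit fields assumed for the sketch):

    #include <stdint.h>

    #define HIGH_BIT 0x8000u

    static int balance_factor(uint16_t block_size, uint16_t prev_size) {
      if (!(block_size & HIGH_BIT)) return 1;   /* (0,1) => +1 */
      return (prev_size & HIGH_BIT) ? 0 : -1;   /* (1,1) => 0, (1,0) => -1 */
    }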
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_mem/memory_manager/hmm_dflt_abort.c b/chromium/third_party/libvpx/source/libvpx/vpx_mem/memory_manager/hmm_dflt_abort.c
deleted file mode 100644
index 51c3cc27a4a..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vpx_mem/memory_manager/hmm_dflt_abort.c
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-/* This code is in the public domain.
-** Version: 1.1 Author: Walt Karas
-*/
-
-/* The function in this file performs default actions if self-auditing
-** finds heap corruption. Don't rely on this code to handle the
-** case where HMM is being used to implement the malloc and free standard
-** library functions. Rewrite the function if necessary to avoid using
-** I/O and execution termination functions that call malloc or free.
-** In Unix, for example, you would replace the fputs calls with calls
-** to the write system call using file handle number 2.
-*/
-#include "hmm_intrnl.h"
-#include <stdio.h>
-#include <stdlib.h>
-
-static int entered = 0;
-
-/* Print abort message, file and line. Terminate execution.
-*/
-void hmm_dflt_abort(const char *file, const char *line) {
- /* Avoid use of printf(), which is more likely to use heap. */
-
- if (entered)
-
- /* The standard I/O functions called a heap function and caused
- ** an indirect recursive call to this function. So we'll have
- ** to just exit without printing a message. */
- while (1);
-
- entered = 1;
-
- fputs("\n_abort - Heap corruption\n" "File: ", stderr);
- fputs(file, stderr);
- fputs(" Line: ", stderr);
- fputs(line, stderr);
- fputs("\n\n", stderr);
- fputs("hmm_dflt_abort: while(1)!!!\n", stderr);
- fflush(stderr);
-
- while (1);
-}
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_mem/memory_manager/hmm_grow.c b/chromium/third_party/libvpx/source/libvpx/vpx_mem/memory_manager/hmm_grow.c
deleted file mode 100644
index 0e86373748a..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vpx_mem/memory_manager/hmm_grow.c
+++ /dev/null
@@ -1,49 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-/* This code is in the public domain.
-** Version: 1.1 Author: Walt Karas
-*/
-
-#include "hmm_intrnl.h"
-
-void U(grow_chunk)(U(descriptor) *desc, void *end, U(size_bau) n_baus) {
-#undef HEAD_PTR
-#define HEAD_PTR ((head_record *) end)
-
- end = BAUS_BACKWARD(end, DUMMY_END_BLOCK_BAUS);
-
-#ifdef HMM_AUDIT_FAIL
-
- if (HEAD_PTR->block_size != 0)
- /* Chunk does not have valid dummy end block. */
- HMM_AUDIT_FAIL
-
-#endif
-
- /* Create a new block that absorbs the old dummy end block. */
- HEAD_PTR->block_size = n_baus;
-
- /* Set up the new dummy end block. */
- {
- head_record *dummy = (head_record *) BAUS_FORWARD(end, n_baus);
- dummy->previous_block_size = n_baus;
- dummy->block_size = 0;
- }
-
- /* Simply free the new block, allowing it to coalesce with any
- ** free block at that was the last block in the chunk prior to
- ** growth.
- */
- U(free)(desc, HEAD_TO_PTR_REC(end));
-
-#undef HEAD_PTR
-}
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_mem/memory_manager/hmm_largest.c b/chromium/third_party/libvpx/source/libvpx/vpx_mem/memory_manager/hmm_largest.c
deleted file mode 100644
index 192758df909..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vpx_mem/memory_manager/hmm_largest.c
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-/* This code is in the public domain.
-** Version: 1.1 Author: Walt Karas
-*/
-
-#include "hmm_intrnl.h"
-
-U(size_aau) U(largest_available)(U(descriptor) *desc) {
- U(size_bau) largest;
-
- if (!(desc->avl_tree_root))
- largest = 0;
- else {
-#ifdef HMM_AUDIT_FAIL
- /* Audit root block in AVL tree. */
- AUDIT_BLOCK(PTR_REC_TO_HEAD(desc->avl_tree_root))
-#endif
-
- largest =
- BLOCK_BAUS(
- PTR_REC_TO_HEAD(
- U(avl_search)(
- (U(avl_avl) *) & (desc->avl_tree_root),
- (U(size_bau)) ~(U(size_bau)) 0, AVL_LESS)));
- }
-
- if (desc->last_freed) {
- /* Size of last freed block. */
- register U(size_bau) lf_size;
-
-#ifdef HMM_AUDIT_FAIL
- AUDIT_BLOCK(desc->last_freed)
-#endif
-
- lf_size = BLOCK_BAUS(desc->last_freed);
-
- if (lf_size > largest)
- largest = lf_size;
- }
-
- /* Convert largest size to AAUs and subract head size leaving payload
- ** size.
- */
- return(largest ?
- ((largest * ((U(size_aau)) HMM_BLOCK_ALIGN_UNIT)) - HEAD_AAUS) :
- 0);
-}
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_mem/memory_manager/hmm_resize.c b/chromium/third_party/libvpx/source/libvpx/vpx_mem/memory_manager/hmm_resize.c
deleted file mode 100644
index baa5a8f9eda..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vpx_mem/memory_manager/hmm_resize.c
+++ /dev/null
@@ -1,114 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-/* This code is in the public domain.
-** Version: 1.1 Author: Walt Karas
-*/
-
-#include "hmm_intrnl.h"
-
-int U(resize)(U(descriptor) *desc, void *mem, U(size_aau) n) {
- U(size_aau) i;
- head_record *next_head_ptr;
- head_record *head_ptr = PTR_REC_TO_HEAD(mem);
-
- /* Flag. */
- int next_block_free;
-
- /* Convert n from desired block size in AAUs to BAUs. */
- n += HEAD_AAUS;
- n = DIV_ROUND_UP(n, HMM_BLOCK_ALIGN_UNIT);
-
- if (n < MIN_BLOCK_BAUS)
- n = MIN_BLOCK_BAUS;
-
-#ifdef HMM_AUDIT_FAIL
-
- AUDIT_BLOCK(head_ptr)
-
- if (!IS_BLOCK_ALLOCATED(head_ptr))
- HMM_AUDIT_FAIL
-
- if (desc->avl_tree_root)
- AUDIT_BLOCK(PTR_REC_TO_HEAD(desc->avl_tree_root))
-
-#endif
-
- i = head_ptr->block_size;
-
- next_head_ptr =
- (head_record *) BAUS_FORWARD(head_ptr, head_ptr->block_size);
-
- next_block_free =
- (next_head_ptr == desc->last_freed) ||
- !IS_BLOCK_ALLOCATED(next_head_ptr);
-
- if (next_block_free)
- /* Block can expand into next free block. */
- i += BLOCK_BAUS(next_head_ptr);
-
- if (n > i)
- /* Not enough room for block to expand. */
- return(-1);
-
- if (next_block_free) {
-#ifdef HMM_AUDIT_FAIL
- AUDIT_BLOCK(next_head_ptr)
-#endif
-
- if (next_head_ptr == desc->last_freed)
- desc->last_freed = 0;
- else
- U(out_of_free_collection)(desc, next_head_ptr);
-
- next_head_ptr =
- (head_record *) BAUS_FORWARD(head_ptr, (U(size_bau)) i);
- }
-
- /* Set i to number of "extra" BAUs. */
- i -= n;
-
- if (i < MIN_BLOCK_BAUS)
- /* Not enough extra BAUs to be a block on their own, so just keep them
- ** in the block being resized.
- */
- {
- n += i;
- i = n;
- } else {
- /* There are enough "leftover" BAUs in the next block to
- ** form a remainder block. */
-
- head_record *rem_head_ptr;
-
- rem_head_ptr = (head_record *) BAUS_FORWARD(head_ptr, n);
-
- rem_head_ptr->previous_block_size = (U(size_bau)) n;
- rem_head_ptr->block_size = (U(size_bau)) i;
-
- if (desc->last_freed) {
-#ifdef HMM_AUDIT_FAIL
- AUDIT_BLOCK(desc->last_freed)
-#endif
-
- U(into_free_collection)(desc, (head_record *)(desc->last_freed));
-
- desc->last_freed = 0;
- }
-
- desc->last_freed = rem_head_ptr;
- }
-
- head_ptr->block_size = (U(size_bau)) n;
- next_head_ptr->previous_block_size = (U(size_bau)) i;
-
- return(0);
-}
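A worked sketch of the split decision above, with made-up sizes: once the adjacent free block has been absorbed, the leftover BAUs either stay in the resized block or become a remainder block (which resize() records as last_freed). MIN_BLOCK_BAUS is assumed to be 2 here:

/* Sketch with illustrative numbers; MIN_BLOCK_BAUS is assumed. */
#include <stdio.h>

#define MIN_BLOCK_BAUS 2UL

int main(void) {
  unsigned long i = 10; /* BAUs available: block plus adjacent free block */
  unsigned long n = 7;  /* BAUs requested, after rounding up */
  unsigned long extra = i - n;

  if (extra < MIN_BLOCK_BAUS) {
    n += extra; /* too small to stand alone: absorb into the block */
    printf("absorbed: block keeps %lu BAUs\n", n);
  } else {
    /* big enough: carve a remainder block of 'extra' BAUs at offset n */
    printf("split: block %lu BAUs, remainder %lu BAUs\n", n, extra);
  }
  return 0;
}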
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_mem/memory_manager/hmm_shrink.c b/chromium/third_party/libvpx/source/libvpx/vpx_mem/memory_manager/hmm_shrink.c
deleted file mode 100644
index f80aeead7a0..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vpx_mem/memory_manager/hmm_shrink.c
+++ /dev/null
@@ -1,103 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-/* This code is in the public domain.
-** Version: 1.1 Author: Walt Karas
-*/
-
-#include "hmm_intrnl.h"
-
-void U(shrink_chunk)(U(descriptor) *desc, U(size_bau) n_baus_to_shrink) {
- head_record *dummy_end_block = (head_record *)
- BAUS_BACKWARD(desc->end_of_shrinkable_chunk, DUMMY_END_BLOCK_BAUS);
-
-#ifdef HMM_AUDIT_FAIL
-
- if (dummy_end_block->block_size != 0)
- /* Chunk does not have valid dummy end block. */
- HMM_AUDIT_FAIL
-
-#endif
-
- if (n_baus_to_shrink) {
- head_record *last_block = (head_record *)
- BAUS_BACKWARD(
- dummy_end_block, dummy_end_block->previous_block_size);
-
-#ifdef HMM_AUDIT_FAIL
- AUDIT_BLOCK(last_block)
-#endif
-
- if (last_block == desc->last_freed) {
- U(size_bau) bs = BLOCK_BAUS(last_block);
-
- /* Chunk will not be shrunk out of existence if
- ** 1. There is at least one allocated block in the chunk
- ** and the amount to shrink is exactly the size of the
- ** last block, OR
- ** 2. After the last block is shrunk, there will be enough
- ** BAUs left in it to form a minimal size block. */
- int chunk_will_survive =
- (PREV_BLOCK_BAUS(last_block) && (n_baus_to_shrink == bs)) ||
- (n_baus_to_shrink <= (U(size_bau))(bs - MIN_BLOCK_BAUS));
-
- if (chunk_will_survive ||
- (!PREV_BLOCK_BAUS(last_block) &&
- (n_baus_to_shrink ==
- (U(size_bau))(bs + DUMMY_END_BLOCK_BAUS)))) {
- desc->last_freed = 0;
-
- if (chunk_will_survive) {
- bs -= n_baus_to_shrink;
-
- if (bs) {
- /* The last (non-dummy) block was not completely
- ** eliminated by the shrink. */
-
- last_block->block_size = bs;
-
- /* Create new dummy end record.
- */
- dummy_end_block =
- (head_record *) BAUS_FORWARD(last_block, bs);
- dummy_end_block->previous_block_size = bs;
- dummy_end_block->block_size = 0;
-
-#ifdef HMM_AUDIT_FAIL
-
- if (desc->avl_tree_root)
- AUDIT_BLOCK(PTR_REC_TO_HEAD(desc->avl_tree_root))
-#endif
-
- U(into_free_collection)(desc, last_block);
- } else {
- /* The last (non-dummy) block was completely
- ** eliminated by the shrink. Make its head
- ** the new dummy end block.
- */
- last_block->block_size = 0;
- last_block->previous_block_size &= ~HIGH_BIT_BAU_SIZE;
- }
- }
- }
-
-#ifdef HMM_AUDIT_FAIL
- else
- HMM_AUDIT_FAIL
-#endif
- }
-
-#ifdef HMM_AUDIT_FAIL
- else
- HMM_AUDIT_FAIL
-#endif
- }
-}
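The chunk-survival conditions above are easier to see with numbers. A sketch, assuming MIN_BLOCK_BAUS of 2 and DUMMY_END_BLOCK_BAUS of 1 (both are configuration-dependent):

#include <stdio.h>

#define MIN_BLOCK_BAUS 2UL       /* assumed */
#define DUMMY_END_BLOCK_BAUS 1UL /* assumed */

int main(void) {
  unsigned long bs = 8;        /* BAUs in the last (free) block */
  unsigned long prev_baus = 4; /* nonzero: another block precedes it */
  unsigned long shrink = 8;    /* BAUs to shrink */

  int survives = (prev_baus && shrink == bs) ||
                 (shrink <= bs - MIN_BLOCK_BAUS);
  int vanishes = !prev_baus && shrink == bs + DUMMY_END_BLOCK_BAUS;

  printf("survives=%d vanishes=%d\n", survives, vanishes); /* prints 1 0 */
  return 0;
}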
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_mem/memory_manager/hmm_true.c b/chromium/third_party/libvpx/source/libvpx/vpx_mem/memory_manager/hmm_true.c
deleted file mode 100644
index 4428c3e34a2..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vpx_mem/memory_manager/hmm_true.c
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-/* This code is in the public domain.
-** Version: 1.1 Author: Walt Karas
-*/
-
-#include "hmm_intrnl.h"
-
-U(size_aau) U(true_size)(void *payload_ptr) {
- register head_record *head_ptr = PTR_REC_TO_HEAD(payload_ptr);
-
-#ifdef HMM_AUDIT_FAIL
- AUDIT_BLOCK(head_ptr)
-#endif
-
- /* Convert block size from BAUs to AAUs. Subtract head size, leaving
- ** payload size.
- */
- return(
- (BLOCK_BAUS(head_ptr) * ((U(size_aau)) HMM_BLOCK_ALIGN_UNIT)) -
- HEAD_AAUS);
-}
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_mem/memory_manager/include/cavl_if.h b/chromium/third_party/libvpx/source/libvpx/vpx_mem/memory_manager/include/cavl_if.h
deleted file mode 100644
index a5ced8bb7b1..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vpx_mem/memory_manager/include/cavl_if.h
+++ /dev/null
@@ -1,228 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef VPX_MEM_MEMORY_MANAGER_INCLUDE_CAVL_IF_H_
-#define VPX_MEM_MEMORY_MANAGER_INCLUDE_CAVL_IF_H_
-
-/* Abstract AVL Tree Generic C Package.
-** Interface generation header file.
-**
-** This code is in the public domain. See cavl_tree.html for interface
-** documentation.
-**
-** Version: 1.5 Author: Walt Karas
-*/
-
-/* This header contains the definition of CHAR_BIT (number of bits in a
-** char). */
-#include <limits.h>
-
-#undef L_
-#undef L_EST_LONG_BIT
-#undef L_SIZE
-#undef L_SC
-#undef L_LONG_BIT
-#undef L_BIT_ARR_DEFN
-
-#ifndef AVL_SEARCH_TYPE_DEFINED_
-#define AVL_SEARCH_TYPE_DEFINED_
-
-typedef enum {
- AVL_EQUAL = 1,
- AVL_LESS = 2,
- AVL_GREATER = 4,
- AVL_LESS_EQUAL = AVL_EQUAL | AVL_LESS,
- AVL_GREATER_EQUAL = AVL_EQUAL | AVL_GREATER
-}
-avl_search_type;
-
-#endif
-
-#ifdef AVL_UNIQUE
-
-#define L_ AVL_UNIQUE
-
-#else
-
-#define L_(X) X
-
-#endif
-
-/* Determine storage class for function prototypes. */
-#ifdef AVL_PRIVATE
-
-#define L_SC static
-
-#else
-
-#define L_SC extern
-
-#endif
-
-#ifdef AVL_SIZE
-
-#define L_SIZE AVL_SIZE
-
-#else
-
-#define L_SIZE unsigned long
-
-#endif
-
-typedef struct {
-#ifdef AVL_INSIDE_STRUCT
-
- AVL_INSIDE_STRUCT
-
-#endif
-
- AVL_HANDLE root;
-}
-L_(avl);
-
-/* Function prototypes. */
-
-L_SC void L_(init)(L_(avl) *tree);
-
-L_SC int L_(is_empty)(L_(avl) *tree);
-
-L_SC AVL_HANDLE L_(insert)(L_(avl) *tree, AVL_HANDLE h);
-
-L_SC AVL_HANDLE L_(search)(L_(avl) *tree, AVL_KEY k, avl_search_type st);
-
-L_SC AVL_HANDLE L_(search_least)(L_(avl) *tree);
-
-L_SC AVL_HANDLE L_(search_greatest)(L_(avl) *tree);
-
-L_SC AVL_HANDLE L_(remove)(L_(avl) *tree, AVL_KEY k);
-
-L_SC AVL_HANDLE L_(subst)(L_(avl) *tree, AVL_HANDLE new_node);
-
-#ifdef AVL_BUILD_ITER_TYPE
-
-L_SC int L_(build)(
- L_(avl) *tree, AVL_BUILD_ITER_TYPE p, L_SIZE num_nodes);
-
-#endif
-
-/* ANSI C/ISO C++ require that a long have at least 32 bits. Set
-** L_EST_LONG_BIT to be the greatest multiple of 8 in the range
-** 32 - 64 (inclusive) that is less than or equal to the number of
-** bits in a long.
-*/
-
-#if (((LONG_MAX >> 31) >> 7) == 0)
-
-#define L_EST_LONG_BIT 32
-
-#elif (((LONG_MAX >> 31) >> 15) == 0)
-
-#define L_EST_LONG_BIT 40
-
-#elif (((LONG_MAX >> 31) >> 23) == 0)
-
-#define L_EST_LONG_BIT 48
-
-#elif (((LONG_MAX >> 31) >> 31) == 0)
-
-#define L_EST_LONG_BIT 56
-
-#else
-
-#define L_EST_LONG_BIT 64
-
-#endif
-
-/* Number of bits in a long. */
-#define L_LONG_BIT (sizeof(long) * CHAR_BIT)
-
-/* The macro L_BIT_ARR_DEFN defines a bit array whose index is a (0-based)
-** node depth. The definition depends on whether the maximum depth is more
-** or less than the number of bits in a single long.
-*/
-
-#if ((AVL_MAX_DEPTH) > L_EST_LONG_BIT)
-
-/* Maximum depth may be more than number of bits in a long. */
-
-#define L_BIT_ARR_DEFN(NAME) \
- unsigned long NAME[((AVL_MAX_DEPTH) + L_LONG_BIT - 1) / L_LONG_BIT];
-
-#else
-
-/* Maximum depth is definitely no greater than the number of bits in a long. */
-
-#define L_BIT_ARR_DEFN(NAME) unsigned long NAME;
-
-#endif
-
-/* Iterator structure. */
-typedef struct {
- /* Tree being iterated over. */
- L_(avl) *tree_;
-
- /* Records a path into the tree. If bit n is true, indicates
- ** take greater branch from the nth node in the path, otherwise
- ** take the less branch. bit 0 gives branch from root, and
- ** so on. */
- L_BIT_ARR_DEFN(branch)
-
- /* Zero-based depth of path into tree. */
- unsigned depth;
-
- /* Handles of nodes in path from root to current node (returned by *). */
- AVL_HANDLE path_h[(AVL_MAX_DEPTH) - 1];
-}
-L_(iter);
-
-/* Iterator function prototypes. */
-
-L_SC void L_(start_iter)(
- L_(avl) *tree, L_(iter) *iter, AVL_KEY k, avl_search_type st);
-
-L_SC void L_(start_iter_least)(L_(avl) *tree, L_(iter) *iter);
-
-L_SC void L_(start_iter_greatest)(L_(avl) *tree, L_(iter) *iter);
-
-L_SC AVL_HANDLE L_(get_iter)(L_(iter) *iter);
-
-L_SC void L_(incr_iter)(L_(iter) *iter);
-
-L_SC void L_(decr_iter)(L_(iter) *iter);
-
-L_SC void L_(init_iter)(L_(iter) *iter);
-
-#define AVL_IMPL_INIT 1
-#define AVL_IMPL_IS_EMPTY (1 << 1)
-#define AVL_IMPL_INSERT (1 << 2)
-#define AVL_IMPL_SEARCH (1 << 3)
-#define AVL_IMPL_SEARCH_LEAST (1 << 4)
-#define AVL_IMPL_SEARCH_GREATEST (1 << 5)
-#define AVL_IMPL_REMOVE (1 << 6)
-#define AVL_IMPL_BUILD (1 << 7)
-#define AVL_IMPL_START_ITER (1 << 8)
-#define AVL_IMPL_START_ITER_LEAST (1 << 9)
-#define AVL_IMPL_START_ITER_GREATEST (1 << 10)
-#define AVL_IMPL_GET_ITER (1 << 11)
-#define AVL_IMPL_INCR_ITER (1 << 12)
-#define AVL_IMPL_DECR_ITER (1 << 13)
-#define AVL_IMPL_INIT_ITER (1 << 14)
-#define AVL_IMPL_SUBST (1 << 15)
-
-#define AVL_IMPL_ALL (~0)
-
-#undef L_
-#undef L_EST_LONG_BIT
-#undef L_SIZE
-#undef L_SC
-#undef L_LONG_BIT
-#undef L_BIT_ARR_DEFN
-
-#endif // VPX_MEM_MEMORY_MANAGER_INCLUDE_CAVL_IF_H_
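A client instantiates this generic package by defining the adapter macros before including the header, exactly as hmm_intrnl.h does further below. A minimal sketch with hypothetical names (my_node, my_avl_); the node accessor macros (AVL_GET_LESS, AVL_SET_LESS, AVL_COMPARE_KEY_NODE, and so on) would additionally be needed before including cavl_impl.h:

/* Hypothetical instantiation; none of these names come from libvpx. */
typedef struct my_node {
  struct my_node *less, *greater;
  int bf;   /* balance factor */
  long key;
} my_node;

#define AVL_UNIQUE(BASE) my_avl_ ## BASE
#define AVL_HANDLE my_node *
#define AVL_KEY long
#define AVL_MAX_DEPTH 32
#define AVL_NULL ((my_node *) 0)

#include "cavl_if.h" /* generates my_avl_avl, my_avl_insert(), ... */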
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_mem/memory_manager/include/cavl_impl.h b/chromium/third_party/libvpx/source/libvpx/vpx_mem/memory_manager/include/cavl_impl.h
deleted file mode 100644
index 8b9ae27a8c9..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vpx_mem/memory_manager/include/cavl_impl.h
+++ /dev/null
@@ -1,1152 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef VPX_MEM_MEMORY_MANAGER_INCLUDE_CAVL_IMPL_H_
-#define VPX_MEM_MEMORY_MANAGER_INCLUDE_CAVL_IMPL_H_
-
-/* Abstract AVL Tree Generic C Package.
-** Implementation generation header file.
-**
-** This code is in the public domain. See cavl_tree.html for interface
-** documentation.
-**
-** Version: 1.5 Author: Walt Karas
-*/
-
-#undef L_
-#undef L_EST_LONG_BIT
-#undef L_SIZE
-#undef l_tree
-#undef L_MASK_HIGH_BIT
-#undef L_LONG_BIT
-#undef L_BIT_ARR_DEFN
-#undef L_BIT_ARR_VAL
-#undef L_BIT_ARR_0
-#undef L_BIT_ARR_1
-#undef L_BIT_ARR_ALL
-#undef L_BIT_ARR_LONGS
-#undef L_IMPL_MASK
-#undef L_CHECK_READ_ERROR
-#undef L_CHECK_READ_ERROR_INV_DEPTH
-#undef L_SC
-#undef L_BALANCE_PARAM_PREFIX
-
-#ifdef AVL_UNIQUE
-
-#define L_ AVL_UNIQUE
-
-#else
-
-#define L_(X) X
-
-#endif
-
-/* Determine correct storage class for functions */
-#ifdef AVL_PRIVATE
-
-#define L_SC static
-
-#else
-
-#define L_SC
-
-#endif
-
-#ifdef AVL_SIZE
-
-#define L_SIZE AVL_SIZE
-
-#else
-
-#define L_SIZE unsigned long
-
-#endif
-
-#define L_MASK_HIGH_BIT ((int) ~ ((~ (unsigned) 0) >> 1))
-
-/* ANSI C/ISO C++ require that a long have at least 32 bits. Set
-** L_EST_LONG_BIT to be the greatest multiple of 8 in the range
-** 32 - 64 (inclusive) that is less than or equal to the number of
-** bits in a long.
-*/
-
-#if (((LONG_MAX >> 31) >> 7) == 0)
-
-#define L_EST_LONG_BIT 32
-
-#elif (((LONG_MAX >> 31) >> 15) == 0)
-
-#define L_EST_LONG_BIT 40
-
-#elif (((LONG_MAX >> 31) >> 23) == 0)
-
-#define L_EST_LONG_BIT 48
-
-#elif (((LONG_MAX >> 31) >> 31) == 0)
-
-#define L_EST_LONG_BIT 56
-
-#else
-
-#define L_EST_LONG_BIT 64
-
-#endif
-
-#define L_LONG_BIT (sizeof(long) * CHAR_BIT)
-
-#if ((AVL_MAX_DEPTH) > L_EST_LONG_BIT)
-
-/* The maximum depth may be greater than the number of bits in a long,
-** so multiple longs are needed to hold a bit array indexed by node
-** depth. */
-
-#define L_BIT_ARR_LONGS (((AVL_MAX_DEPTH) + L_LONG_BIT - 1) / L_LONG_BIT)
-
-#define L_BIT_ARR_DEFN(NAME) unsigned long NAME[L_BIT_ARR_LONGS];
-
-#define L_BIT_ARR_VAL(BIT_ARR, BIT_NUM) \
- ((BIT_ARR)[(BIT_NUM) / L_LONG_BIT] & (1L << ((BIT_NUM) % L_LONG_BIT)))
-
-#define L_BIT_ARR_0(BIT_ARR, BIT_NUM) \
- (BIT_ARR)[(BIT_NUM) / L_LONG_BIT] &= ~(1L << ((BIT_NUM) % L_LONG_BIT));
-
-#define L_BIT_ARR_1(BIT_ARR, BIT_NUM) \
- (BIT_ARR)[(BIT_NUM) / L_LONG_BIT] |= 1L << ((BIT_NUM) % L_LONG_BIT);
-
-#define L_BIT_ARR_ALL(BIT_ARR, BIT_VAL) \
- { int i = L_BIT_ARR_LONGS; do (BIT_ARR)[--i] = 0L - (BIT_VAL); while(i); }
-
-#else /* The bit array can definitely fit in one long */
-
-#define L_BIT_ARR_DEFN(NAME) unsigned long NAME;
-
-#define L_BIT_ARR_VAL(BIT_ARR, BIT_NUM) ((BIT_ARR) & (1L << (BIT_NUM)))
-
-#define L_BIT_ARR_0(BIT_ARR, BIT_NUM) (BIT_ARR) &= ~(1L << (BIT_NUM));
-
-#define L_BIT_ARR_1(BIT_ARR, BIT_NUM) (BIT_ARR) |= 1L << (BIT_NUM);
-
-#define L_BIT_ARR_ALL(BIT_ARR, BIT_VAL) (BIT_ARR) = 0L - (BIT_VAL);
-
-#endif
-
-#ifdef AVL_READ_ERRORS_HAPPEN
-
-#define L_CHECK_READ_ERROR(ERROR_RETURN) \
- { if (AVL_READ_ERROR) return(ERROR_RETURN); }
-
-#else
-
-#define L_CHECK_READ_ERROR(ERROR_RETURN)
-
-#endif
-
-/* The presumed reason that an instantiation places additional fields
-** inside the AVL tree structure is that the SET_ and GET_ macros
-** need these fields. The "balance" function does not explicitly use
-** any fields in the AVL tree structure, so only pass an AVL tree
-** structure pointer to "balance" if it has instantiation-specific
-** fields that are (presumably) needed by the SET_/GET_ calls within
-** "balance".
-*/
-#ifdef AVL_INSIDE_STRUCT
-
-#define L_BALANCE_PARAM_CALL_PREFIX l_tree,
-#define L_BALANCE_PARAM_DECL_PREFIX L_(avl) *l_tree,
-
-#else
-
-#define L_BALANCE_PARAM_CALL_PREFIX
-#define L_BALANCE_PARAM_DECL_PREFIX
-
-#endif
-
-#ifdef AVL_IMPL_MASK
-
-#define L_IMPL_MASK (AVL_IMPL_MASK)
-
-#else
-
-/* Define all functions. */
-#define L_IMPL_MASK AVL_IMPL_ALL
-
-#endif
-
-#if (L_IMPL_MASK & AVL_IMPL_INIT)
-
-L_SC void L_(init)(L_(avl) *l_tree) {
- l_tree->root = AVL_NULL;
-}
-
-#endif
-
-#if (L_IMPL_MASK & AVL_IMPL_IS_EMPTY)
-
-L_SC int L_(is_empty)(L_(avl) *l_tree) {
- return(l_tree->root == AVL_NULL);
-}
-
-#endif
-
-/* Put the private balance function in the same compilation module as
-** the insert function. */
-#if (L_IMPL_MASK & AVL_IMPL_INSERT)
-
-/* Balances subtree, returns handle of root node of subtree after balancing.
-*/
-L_SC AVL_HANDLE L_(balance)(L_BALANCE_PARAM_DECL_PREFIX AVL_HANDLE bal_h) {
- AVL_HANDLE deep_h;
-
- /* Either the "greater than" or the "less than" subtree of
- ** this node has to be 2 levels deeper (or else it wouldn't
- ** need balancing).
- */
- if (AVL_GET_BALANCE_FACTOR(bal_h) > 0) {
- /* "Greater than" subtree is deeper. */
-
- deep_h = AVL_GET_GREATER(bal_h, 1);
-
- L_CHECK_READ_ERROR(AVL_NULL)
-
- if (AVL_GET_BALANCE_FACTOR(deep_h) < 0) {
- int bf;
-
- AVL_HANDLE old_h = bal_h;
- bal_h = AVL_GET_LESS(deep_h, 1);
- L_CHECK_READ_ERROR(AVL_NULL)
- AVL_SET_GREATER(old_h, AVL_GET_LESS(bal_h, 1))
- AVL_SET_LESS(deep_h, AVL_GET_GREATER(bal_h, 1))
- AVL_SET_LESS(bal_h, old_h)
- AVL_SET_GREATER(bal_h, deep_h)
-
- bf = AVL_GET_BALANCE_FACTOR(bal_h);
-
- if (bf != 0) {
- if (bf > 0) {
- AVL_SET_BALANCE_FACTOR(old_h, -1)
- AVL_SET_BALANCE_FACTOR(deep_h, 0)
- } else {
- AVL_SET_BALANCE_FACTOR(deep_h, 1)
- AVL_SET_BALANCE_FACTOR(old_h, 0)
- }
-
- AVL_SET_BALANCE_FACTOR(bal_h, 0)
- } else {
- AVL_SET_BALANCE_FACTOR(old_h, 0)
- AVL_SET_BALANCE_FACTOR(deep_h, 0)
- }
- } else {
- AVL_SET_GREATER(bal_h, AVL_GET_LESS(deep_h, 0))
- AVL_SET_LESS(deep_h, bal_h)
-
- if (AVL_GET_BALANCE_FACTOR(deep_h) == 0) {
- AVL_SET_BALANCE_FACTOR(deep_h, -1)
- AVL_SET_BALANCE_FACTOR(bal_h, 1)
- } else {
- AVL_SET_BALANCE_FACTOR(deep_h, 0)
- AVL_SET_BALANCE_FACTOR(bal_h, 0)
- }
-
- bal_h = deep_h;
- }
- } else {
- /* "Less than" subtree is deeper. */
-
- deep_h = AVL_GET_LESS(bal_h, 1);
- L_CHECK_READ_ERROR(AVL_NULL)
-
- if (AVL_GET_BALANCE_FACTOR(deep_h) > 0) {
- int bf;
- AVL_HANDLE old_h = bal_h;
- bal_h = AVL_GET_GREATER(deep_h, 1);
- L_CHECK_READ_ERROR(AVL_NULL)
- AVL_SET_LESS(old_h, AVL_GET_GREATER(bal_h, 0))
- AVL_SET_GREATER(deep_h, AVL_GET_LESS(bal_h, 0))
- AVL_SET_GREATER(bal_h, old_h)
- AVL_SET_LESS(bal_h, deep_h)
-
- bf = AVL_GET_BALANCE_FACTOR(bal_h);
-
- if (bf != 0) {
- if (bf < 0) {
- AVL_SET_BALANCE_FACTOR(old_h, 1)
- AVL_SET_BALANCE_FACTOR(deep_h, 0)
- } else {
- AVL_SET_BALANCE_FACTOR(deep_h, -1)
- AVL_SET_BALANCE_FACTOR(old_h, 0)
- }
-
- AVL_SET_BALANCE_FACTOR(bal_h, 0)
- } else {
- AVL_SET_BALANCE_FACTOR(old_h, 0)
- AVL_SET_BALANCE_FACTOR(deep_h, 0)
- }
- } else {
- AVL_SET_LESS(bal_h, AVL_GET_GREATER(deep_h, 0))
- AVL_SET_GREATER(deep_h, bal_h)
-
- if (AVL_GET_BALANCE_FACTOR(deep_h) == 0) {
- AVL_SET_BALANCE_FACTOR(deep_h, 1)
- AVL_SET_BALANCE_FACTOR(bal_h, -1)
- } else {
- AVL_SET_BALANCE_FACTOR(deep_h, 0)
- AVL_SET_BALANCE_FACTOR(bal_h, 0)
- }
-
- bal_h = deep_h;
- }
- }
-
- return(bal_h);
-}
-
-L_SC AVL_HANDLE L_(insert)(L_(avl) *l_tree, AVL_HANDLE h) {
- AVL_SET_LESS(h, AVL_NULL)
- AVL_SET_GREATER(h, AVL_NULL)
- AVL_SET_BALANCE_FACTOR(h, 0)
-
- if (l_tree->root == AVL_NULL)
- l_tree->root = h;
- else {
- /* Last unbalanced node encountered in search for insertion point. */
- AVL_HANDLE unbal = AVL_NULL;
- /* Parent of last unbalanced node. */
- AVL_HANDLE parent_unbal = AVL_NULL;
- /* Balance factor of last unbalanced node. */
- int unbal_bf;
-
- /* Zero-based depth in tree. */
- unsigned depth = 0, unbal_depth = 0;
-
- /* Records a path into the tree. If bit n is true, indicates
- ** take greater branch from the nth node in the path, otherwise
- ** take the less branch. bit 0 gives branch from root, and
- ** so on. */
- L_BIT_ARR_DEFN(branch)
-
- AVL_HANDLE hh = l_tree->root;
- AVL_HANDLE parent = AVL_NULL;
- int cmp;
-
- do {
- if (AVL_GET_BALANCE_FACTOR(hh) != 0) {
- unbal = hh;
- parent_unbal = parent;
- unbal_depth = depth;
- }
-
- cmp = AVL_COMPARE_NODE_NODE(h, hh);
-
- if (cmp == 0)
- /* Duplicate key. */
- return(hh);
-
- parent = hh;
-
- if (cmp > 0) {
- hh = AVL_GET_GREATER(hh, 1);
- L_BIT_ARR_1(branch, depth)
- } else {
- hh = AVL_GET_LESS(hh, 1);
- L_BIT_ARR_0(branch, depth)
- }
-
- L_CHECK_READ_ERROR(AVL_NULL)
- depth++;
- } while (hh != AVL_NULL);
-
- /* Add node to insert as leaf of tree. */
- if (cmp < 0)
- AVL_SET_LESS(parent, h)
- else
- AVL_SET_GREATER(parent, h)
-
- depth = unbal_depth;
-
- if (unbal == AVL_NULL)
- hh = l_tree->root;
- else {
- cmp = L_BIT_ARR_VAL(branch, depth) ? 1 : -1;
- depth++;
- unbal_bf = AVL_GET_BALANCE_FACTOR(unbal);
-
- if (cmp < 0)
- unbal_bf--;
- else /* cmp > 0 */
- unbal_bf++;
-
- hh = cmp < 0 ? AVL_GET_LESS(unbal, 1) : AVL_GET_GREATER(unbal, 1);
- L_CHECK_READ_ERROR(AVL_NULL)
-
- if ((unbal_bf != -2) && (unbal_bf != 2)) {
- /* No rebalancing of tree is necessary. */
- AVL_SET_BALANCE_FACTOR(unbal, unbal_bf)
- unbal = AVL_NULL;
- }
- }
-
- if (hh != AVL_NULL)
- while (h != hh) {
- cmp = L_BIT_ARR_VAL(branch, depth) ? 1 : -1;
- depth++;
-
- if (cmp < 0) {
- AVL_SET_BALANCE_FACTOR(hh, -1)
- hh = AVL_GET_LESS(hh, 1);
- } else { /* cmp > 0 */
- AVL_SET_BALANCE_FACTOR(hh, 1)
- hh = AVL_GET_GREATER(hh, 1);
- }
-
- L_CHECK_READ_ERROR(AVL_NULL)
- }
-
- if (unbal != AVL_NULL) {
- unbal = L_(balance)(L_BALANCE_PARAM_CALL_PREFIX unbal);
- L_CHECK_READ_ERROR(AVL_NULL)
-
- if (parent_unbal == AVL_NULL)
- l_tree->root = unbal;
- else {
- depth = unbal_depth - 1;
- cmp = L_BIT_ARR_VAL(branch, depth) ? 1 : -1;
-
- if (cmp < 0)
- AVL_SET_LESS(parent_unbal, unbal)
- else /* cmp > 0 */
- AVL_SET_GREATER(parent_unbal, unbal)
- }
- }
-
- }
-
- return(h);
-}
-
-#endif
-
-#if (L_IMPL_MASK & AVL_IMPL_SEARCH)
-
-L_SC AVL_HANDLE L_(search)(L_(avl) *l_tree, AVL_KEY k, avl_search_type st) {
- int cmp, target_cmp;
- AVL_HANDLE match_h = AVL_NULL;
- AVL_HANDLE h = l_tree->root;
-
- if (st & AVL_LESS)
- target_cmp = 1;
- else if (st & AVL_GREATER)
- target_cmp = -1;
- else
- target_cmp = 0;
-
- while (h != AVL_NULL) {
- cmp = AVL_COMPARE_KEY_NODE(k, h);
-
- if (cmp == 0) {
- if (st & AVL_EQUAL) {
- match_h = h;
- break;
- }
-
- cmp = -target_cmp;
- } else if (target_cmp != 0)
- if (!((cmp ^ target_cmp) & L_MASK_HIGH_BIT))
- /* cmp and target_cmp are both positive or both negative. */
- match_h = h;
-
- h = cmp < 0 ? AVL_GET_LESS(h, 1) : AVL_GET_GREATER(h, 1);
- L_CHECK_READ_ERROR(AVL_NULL)
- }
-
- return(match_h);
-}
-
-#endif
-
-#if (L_IMPL_MASK & AVL_IMPL_SEARCH_LEAST)
-
-L_SC AVL_HANDLE L_(search_least)(L_(avl) *l_tree) {
- AVL_HANDLE h = l_tree->root;
- AVL_HANDLE parent = AVL_NULL;
-
- while (h != AVL_NULL) {
- parent = h;
- h = AVL_GET_LESS(h, 1);
- L_CHECK_READ_ERROR(AVL_NULL)
- }
-
- return(parent);
-}
-
-#endif
-
-#if (L_IMPL_MASK & AVL_IMPL_SEARCH_GREATEST)
-
-L_SC AVL_HANDLE L_(search_greatest)(L_(avl) *l_tree) {
- AVL_HANDLE h = l_tree->root;
- AVL_HANDLE parent = AVL_NULL;
-
- while (h != AVL_NULL) {
- parent = h;
- h = AVL_GET_GREATER(h, 1);
- L_CHECK_READ_ERROR(AVL_NULL)
- }
-
- return(parent);
-}
-
-#endif
-
-#if (L_IMPL_MASK & AVL_IMPL_REMOVE)
-
-/* Prototype of balance function (called by remove) in case not in
-** same compilation unit.
-*/
-L_SC AVL_HANDLE L_(balance)(L_BALANCE_PARAM_DECL_PREFIX AVL_HANDLE bal_h);
-
-L_SC AVL_HANDLE L_(remove)(L_(avl) *l_tree, AVL_KEY k) {
- /* Zero-based depth in tree. */
- unsigned depth = 0, rm_depth;
-
- /* Records a path into the tree. If bit n is true, indicates
- ** take greater branch from the nth node in the path, otherwise
- ** take the less branch. bit 0 gives branch from root, and
- ** so on. */
- L_BIT_ARR_DEFN(branch)
-
- AVL_HANDLE h = l_tree->root;
- AVL_HANDLE parent = AVL_NULL;
- AVL_HANDLE child;
- AVL_HANDLE path;
- int cmp, cmp_shortened_sub_with_path;
- int reduced_depth;
- int bf;
- AVL_HANDLE rm;
- AVL_HANDLE parent_rm;
-
- for (;;) {
- if (h == AVL_NULL)
- /* No node in tree with given key. */
- return(AVL_NULL);
-
- cmp = AVL_COMPARE_KEY_NODE(k, h);
-
- if (cmp == 0)
- /* Found node to remove. */
- break;
-
- parent = h;
-
- if (cmp > 0) {
- h = AVL_GET_GREATER(h, 1);
- L_BIT_ARR_1(branch, depth)
- } else {
- h = AVL_GET_LESS(h, 1);
- L_BIT_ARR_0(branch, depth)
- }
-
- L_CHECK_READ_ERROR(AVL_NULL)
- depth++;
- cmp_shortened_sub_with_path = cmp;
- }
-
- rm = h;
- parent_rm = parent;
- rm_depth = depth;
-
- /* If the node to remove is not a leaf node, we need to get a
- ** leaf node, or a node with a single leaf as its child, to put
- ** in the place of the node to remove. We will get the greatest
- ** node in the less subtree (of the node to remove), or the least
- ** node in the greater subtree. We take the leaf node from the
- ** deeper subtree, if there is one. */
-
- if (AVL_GET_BALANCE_FACTOR(h) < 0) {
- child = AVL_GET_LESS(h, 1);
- L_BIT_ARR_0(branch, depth)
- cmp = -1;
- } else {
- child = AVL_GET_GREATER(h, 1);
- L_BIT_ARR_1(branch, depth)
- cmp = 1;
- }
-
- L_CHECK_READ_ERROR(AVL_NULL)
- depth++;
-
- if (child != AVL_NULL) {
- cmp = -cmp;
-
- do {
- parent = h;
- h = child;
-
- if (cmp < 0) {
- child = AVL_GET_LESS(h, 1);
- L_BIT_ARR_0(branch, depth)
- } else {
- child = AVL_GET_GREATER(h, 1);
- L_BIT_ARR_1(branch, depth)
- }
-
- L_CHECK_READ_ERROR(AVL_NULL)
- depth++;
- } while (child != AVL_NULL);
-
- if (parent == rm)
-      /* Only went through the do loop once.  The deleted node will be replaced
- ** in the tree structure by one of its immediate children. */
- cmp_shortened_sub_with_path = -cmp;
- else
- cmp_shortened_sub_with_path = cmp;
-
-    /* Get the handle of the opposite child, which might be non-null. */
- child = cmp > 0 ? AVL_GET_LESS(h, 0) : AVL_GET_GREATER(h, 0);
- }
-
- if (parent == AVL_NULL)
- /* There were only 1 or 2 nodes in this tree. */
- l_tree->root = child;
- else if (cmp_shortened_sub_with_path < 0)
- AVL_SET_LESS(parent, child)
- else
- AVL_SET_GREATER(parent, child)
-
- /* "path" is the parent of the subtree being eliminated or reduced
- ** from a depth of 2 to 1. If "path" is the node to be removed, we
- ** set path to the node we're about to poke into the position of the
- ** node to be removed. */
- path = parent == rm ? h : parent;
-
- if (h != rm) {
- /* Poke in the replacement for the node to be removed. */
- AVL_SET_LESS(h, AVL_GET_LESS(rm, 0))
- AVL_SET_GREATER(h, AVL_GET_GREATER(rm, 0))
- AVL_SET_BALANCE_FACTOR(h, AVL_GET_BALANCE_FACTOR(rm))
-
- if (parent_rm == AVL_NULL)
- l_tree->root = h;
- else {
- depth = rm_depth - 1;
-
- if (L_BIT_ARR_VAL(branch, depth))
- AVL_SET_GREATER(parent_rm, h)
- else
- AVL_SET_LESS(parent_rm, h)
- }
- }
-
- if (path != AVL_NULL) {
- /* Create a temporary linked list from the parent of the path node
- ** to the root node. */
- h = l_tree->root;
- parent = AVL_NULL;
- depth = 0;
-
- while (h != path) {
- if (L_BIT_ARR_VAL(branch, depth)) {
- child = AVL_GET_GREATER(h, 1);
- AVL_SET_GREATER(h, parent)
- } else {
- child = AVL_GET_LESS(h, 1);
- AVL_SET_LESS(h, parent)
- }
-
- L_CHECK_READ_ERROR(AVL_NULL)
- depth++;
- parent = h;
- h = child;
- }
-
- /* Climb from the path node to the root node using the linked
- ** list, restoring the tree structure and rebalancing as necessary.
- */
- reduced_depth = 1;
- cmp = cmp_shortened_sub_with_path;
-
- for (;;) {
- if (reduced_depth) {
- bf = AVL_GET_BALANCE_FACTOR(h);
-
- if (cmp < 0)
- bf++;
- else /* cmp > 0 */
- bf--;
-
- if ((bf == -2) || (bf == 2)) {
- h = L_(balance)(L_BALANCE_PARAM_CALL_PREFIX h);
- L_CHECK_READ_ERROR(AVL_NULL)
- bf = AVL_GET_BALANCE_FACTOR(h);
- } else
- AVL_SET_BALANCE_FACTOR(h, bf)
- reduced_depth = (bf == 0);
- }
-
- if (parent == AVL_NULL)
- break;
-
- child = h;
- h = parent;
- depth--;
- cmp = L_BIT_ARR_VAL(branch, depth) ? 1 : -1;
-
- if (cmp < 0) {
- parent = AVL_GET_LESS(h, 1);
- AVL_SET_LESS(h, child)
- } else {
- parent = AVL_GET_GREATER(h, 1);
- AVL_SET_GREATER(h, child)
- }
-
- L_CHECK_READ_ERROR(AVL_NULL)
- }
-
- l_tree->root = h;
- }
-
- return(rm);
-}
-
-#endif
-
-#if (L_IMPL_MASK & AVL_IMPL_SUBST)
-
-L_SC AVL_HANDLE L_(subst)(L_(avl) *l_tree, AVL_HANDLE new_node) {
- AVL_HANDLE h = l_tree->root;
- AVL_HANDLE parent = AVL_NULL;
- int cmp, last_cmp;
-
- /* Search for node already in tree with same key. */
- for (;;) {
- if (h == AVL_NULL)
- /* No node in tree with same key as new node. */
- return(AVL_NULL);
-
- cmp = AVL_COMPARE_NODE_NODE(new_node, h);
-
- if (cmp == 0)
- /* Found the node to substitute new one for. */
- break;
-
- last_cmp = cmp;
- parent = h;
- h = cmp < 0 ? AVL_GET_LESS(h, 1) : AVL_GET_GREATER(h, 1);
- L_CHECK_READ_ERROR(AVL_NULL)
- }
-
- /* Copy tree housekeeping fields from node in tree to new node. */
- AVL_SET_LESS(new_node, AVL_GET_LESS(h, 0))
- AVL_SET_GREATER(new_node, AVL_GET_GREATER(h, 0))
- AVL_SET_BALANCE_FACTOR(new_node, AVL_GET_BALANCE_FACTOR(h))
-
- if (parent == AVL_NULL)
- /* New node is also new root. */
- l_tree->root = new_node;
- else {
- /* Make parent point to new node. */
- if (last_cmp < 0)
- AVL_SET_LESS(parent, new_node)
- else
- AVL_SET_GREATER(parent, new_node)
- }
-
- return(h);
-}
-
-#endif
-
-#ifdef AVL_BUILD_ITER_TYPE
-
-#if (L_IMPL_MASK & AVL_IMPL_BUILD)
-
-L_SC int L_(build)(
- L_(avl) *l_tree, AVL_BUILD_ITER_TYPE p, L_SIZE num_nodes) {
- /* Gives path to subtree being built. If bit n is false, branch
-  ** less from the node at depth n; if true, branch greater. */
- L_BIT_ARR_DEFN(branch)
-
- /* If bit n is true, then for the current subtree at depth n, its
- ** greater subtree has one more node than its less subtree. */
- L_BIT_ARR_DEFN(rem)
-
- /* Depth of root node of current subtree. */
- unsigned depth = 0;
-
- /* Number of nodes in current subtree. */
- L_SIZE num_sub = num_nodes;
-
- /* The algorithm relies on a stack of nodes whose less subtree has
- ** been built, but whose greater subtree has not yet been built.
-  ** The stack is implemented as a linked list.  The nodes are linked
- ** together by having the "greater" handle of a node set to the
- ** next node in the list. "less_parent" is the handle of the first
- ** node in the list. */
- AVL_HANDLE less_parent = AVL_NULL;
-
- /* h is root of current subtree, child is one of its children. */
- AVL_HANDLE h;
- AVL_HANDLE child;
-
- if (num_nodes == 0) {
- l_tree->root = AVL_NULL;
- return(1);
- }
-
- for (;;) {
- while (num_sub > 2) {
- /* Subtract one for root of subtree. */
- num_sub--;
-
- if (num_sub & 1)
- L_BIT_ARR_1(rem, depth)
- else
- L_BIT_ARR_0(rem, depth)
- L_BIT_ARR_0(branch, depth)
- depth++;
-
- num_sub >>= 1;
- }
-
- if (num_sub == 2) {
- /* Build a subtree with two nodes, slanting to greater.
- ** I arbitrarily chose to always have the extra node in the
- ** greater subtree when there is an odd number of nodes to
- ** split between the two subtrees. */
-
- h = AVL_BUILD_ITER_VAL(p);
- L_CHECK_READ_ERROR(0)
- AVL_BUILD_ITER_INCR(p)
- child = AVL_BUILD_ITER_VAL(p);
- L_CHECK_READ_ERROR(0)
- AVL_BUILD_ITER_INCR(p)
- AVL_SET_LESS(child, AVL_NULL)
- AVL_SET_GREATER(child, AVL_NULL)
- AVL_SET_BALANCE_FACTOR(child, 0)
- AVL_SET_GREATER(h, child)
- AVL_SET_LESS(h, AVL_NULL)
- AVL_SET_BALANCE_FACTOR(h, 1)
- } else { /* num_sub == 1 */
- /* Build a subtree with one node. */
-
- h = AVL_BUILD_ITER_VAL(p);
- L_CHECK_READ_ERROR(0)
- AVL_BUILD_ITER_INCR(p)
- AVL_SET_LESS(h, AVL_NULL)
- AVL_SET_GREATER(h, AVL_NULL)
- AVL_SET_BALANCE_FACTOR(h, 0)
- }
-
- while (depth) {
- depth--;
-
- if (!L_BIT_ARR_VAL(branch, depth))
- /* We've completed a less subtree. */
- break;
-
- /* We've completed a greater subtree, so attach it to
- ** its parent (that is less than it). We pop the parent
- ** off the stack of less parents. */
- child = h;
- h = less_parent;
- less_parent = AVL_GET_GREATER(h, 1);
- L_CHECK_READ_ERROR(0)
- AVL_SET_GREATER(h, child)
- /* num_sub = 2 * (num_sub - rem[depth]) + rem[depth] + 1 */
- num_sub <<= 1;
- num_sub += L_BIT_ARR_VAL(rem, depth) ? 0 : 1;
-
- if (num_sub & (num_sub - 1))
- /* num_sub is not a power of 2. */
- AVL_SET_BALANCE_FACTOR(h, 0)
- else
- /* num_sub is a power of 2. */
- AVL_SET_BALANCE_FACTOR(h, 1)
- }
-
- if (num_sub == num_nodes)
- /* We've completed the full tree. */
- break;
-
- /* The subtree we've completed is the less subtree of the
- ** next node in the sequence. */
-
- child = h;
- h = AVL_BUILD_ITER_VAL(p);
- L_CHECK_READ_ERROR(0)
- AVL_BUILD_ITER_INCR(p)
- AVL_SET_LESS(h, child)
-
- /* Put h into stack of less parents. */
- AVL_SET_GREATER(h, less_parent)
- less_parent = h;
-
- /* Proceed to creating greater than subtree of h. */
- L_BIT_ARR_1(branch, depth)
- num_sub += L_BIT_ARR_VAL(rem, depth) ? 1 : 0;
- depth++;
-
- } /* end for (;; ) */
-
- l_tree->root = h;
-
- return(1);
-}
-
-#endif
-
-#endif
-
-#if (L_IMPL_MASK & AVL_IMPL_INIT_ITER)
-
-/* Initialize depth to invalid value, to indicate iterator is
-** invalid.  (Depth is zero-based.)  It's not necessary to initialize
-** iterators prior to passing them to the "start" function.
-*/
-L_SC void L_(init_iter)(L_(iter) *iter) {
- iter->depth = ~0;
-}
-
-#endif
-
-#ifdef AVL_READ_ERRORS_HAPPEN
-
-#define L_CHECK_READ_ERROR_INV_DEPTH \
- { if (AVL_READ_ERROR) { iter->depth = ~0; return; } }
-
-#else
-
-#define L_CHECK_READ_ERROR_INV_DEPTH
-
-#endif
-
-#if (L_IMPL_MASK & AVL_IMPL_START_ITER)
-
-L_SC void L_(start_iter)(
- L_(avl) *l_tree, L_(iter) *iter, AVL_KEY k, avl_search_type st) {
- AVL_HANDLE h = l_tree->root;
- unsigned d = 0;
- int cmp, target_cmp;
-
- /* Save the tree that we're going to iterate through in a
- ** member variable. */
- iter->tree_ = l_tree;
-
- iter->depth = ~0;
-
- if (h == AVL_NULL)
- /* Tree is empty. */
- return;
-
- if (st & AVL_LESS)
- /* Key can be greater than key of starting node. */
- target_cmp = 1;
- else if (st & AVL_GREATER)
- /* Key can be less than key of starting node. */
- target_cmp = -1;
- else
- /* Key must be same as key of starting node. */
- target_cmp = 0;
-
- for (;;) {
- cmp = AVL_COMPARE_KEY_NODE(k, h);
-
- if (cmp == 0) {
- if (st & AVL_EQUAL) {
- /* Equal node was sought and found as starting node. */
- iter->depth = d;
- break;
- }
-
- cmp = -target_cmp;
- } else if (target_cmp != 0)
- if (!((cmp ^ target_cmp) & L_MASK_HIGH_BIT))
- /* cmp and target_cmp are both negative or both positive. */
- iter->depth = d;
-
- h = cmp < 0 ? AVL_GET_LESS(h, 1) : AVL_GET_GREATER(h, 1);
- L_CHECK_READ_ERROR_INV_DEPTH
-
- if (h == AVL_NULL)
- break;
-
- if (cmp > 0)
- L_BIT_ARR_1(iter->branch, d)
- else
- L_BIT_ARR_0(iter->branch, d)
- iter->path_h[d++] = h;
- }
-}
-
-#endif
-
-#if (L_IMPL_MASK & AVL_IMPL_START_ITER_LEAST)
-
-L_SC void L_(start_iter_least)(L_(avl) *l_tree, L_(iter) *iter) {
- AVL_HANDLE h = l_tree->root;
-
- iter->tree_ = l_tree;
-
- iter->depth = ~0;
-
- L_BIT_ARR_ALL(iter->branch, 0)
-
- while (h != AVL_NULL) {
- if (iter->depth != ~0)
- iter->path_h[iter->depth] = h;
-
- iter->depth++;
- h = AVL_GET_LESS(h, 1);
- L_CHECK_READ_ERROR_INV_DEPTH
- }
-}
-
-#endif
-
-#if (L_IMPL_MASK & AVL_IMPL_START_ITER_GREATEST)
-
-L_SC void L_(start_iter_greatest)(L_(avl) *l_tree, L_(iter) *iter) {
- AVL_HANDLE h = l_tree->root;
-
- iter->tree_ = l_tree;
-
- iter->depth = ~0;
-
- L_BIT_ARR_ALL(iter->branch, 1)
-
- while (h != AVL_NULL) {
- if (iter->depth != ~0)
- iter->path_h[iter->depth] = h;
-
- iter->depth++;
- h = AVL_GET_GREATER(h, 1);
- L_CHECK_READ_ERROR_INV_DEPTH
- }
-}
-
-#endif
-
-#if (L_IMPL_MASK & AVL_IMPL_GET_ITER)
-
-L_SC AVL_HANDLE L_(get_iter)(L_(iter) *iter) {
- if (iter->depth == ~0)
- return(AVL_NULL);
-
- return(iter->depth == 0 ?
- iter->tree_->root : iter->path_h[iter->depth - 1]);
-}
-
-#endif
-
-#if (L_IMPL_MASK & AVL_IMPL_INCR_ITER)
-
-L_SC void L_(incr_iter)(L_(iter) *iter) {
-#define l_tree (iter->tree_)
-
- if (iter->depth != ~0) {
- AVL_HANDLE h =
- AVL_GET_GREATER((iter->depth == 0 ?
- iter->tree_->root : iter->path_h[iter->depth - 1]), 1);
- L_CHECK_READ_ERROR_INV_DEPTH
-
- if (h == AVL_NULL)
- do {
- if (iter->depth == 0) {
- iter->depth = ~0;
- break;
- }
-
- iter->depth--;
- } while (L_BIT_ARR_VAL(iter->branch, iter->depth));
- else {
- L_BIT_ARR_1(iter->branch, iter->depth)
- iter->path_h[iter->depth++] = h;
-
- for (;;) {
- h = AVL_GET_LESS(h, 1);
- L_CHECK_READ_ERROR_INV_DEPTH
-
- if (h == AVL_NULL)
- break;
-
- L_BIT_ARR_0(iter->branch, iter->depth)
- iter->path_h[iter->depth++] = h;
- }
- }
- }
-
-#undef l_tree
-}
-
-#endif
-
-#if (L_IMPL_MASK & AVL_IMPL_DECR_ITER)
-
-L_SC void L_(decr_iter)(L_(iter) *iter) {
-#define l_tree (iter->tree_)
-
- if (iter->depth != ~0) {
- AVL_HANDLE h =
- AVL_GET_LESS((iter->depth == 0 ?
- iter->tree_->root : iter->path_h[iter->depth - 1]), 1);
- L_CHECK_READ_ERROR_INV_DEPTH
-
- if (h == AVL_NULL)
- do {
- if (iter->depth == 0) {
- iter->depth = ~0;
- break;
- }
-
- iter->depth--;
- } while (!L_BIT_ARR_VAL(iter->branch, iter->depth));
- else {
- L_BIT_ARR_0(iter->branch, iter->depth)
- iter->path_h[iter->depth++] = h;
-
- for (;;) {
- h = AVL_GET_GREATER(h, 1);
- L_CHECK_READ_ERROR_INV_DEPTH
-
- if (h == AVL_NULL)
- break;
-
- L_BIT_ARR_1(iter->branch, iter->depth)
- iter->path_h[iter->depth++] = h;
- }
- }
- }
-
-#undef l_tree
-}
-
-#endif
-
-/* Tidy up the preprocessor symbol name space. */
-#undef L_
-#undef L_EST_LONG_BIT
-#undef L_SIZE
-#undef L_MASK_HIGH_BIT
-#undef L_LONG_BIT
-#undef L_BIT_ARR_DEFN
-#undef L_BIT_ARR_VAL
-#undef L_BIT_ARR_0
-#undef L_BIT_ARR_1
-#undef L_BIT_ARR_ALL
-#undef L_CHECK_READ_ERROR
-#undef L_CHECK_READ_ERROR_INV_DEPTH
-#undef L_BIT_ARR_LONGS
-#undef L_IMPL_MASK
-#undef L_CHECK_READ_ERROR
-#undef L_CHECK_READ_ERROR_INV_DEPTH
-#undef L_SC
-#undef L_BALANCE_PARAM_CALL_PREFIX
-#undef L_BALANCE_PARAM_DECL_PREFIX
-
-#endif // VPX_MEM_MEMORY_MANAGER_INCLUDE_CAVL_IMPL_H_
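Continuing the hypothetical my_avl_ instantiation sketched after the interface header above, an ascending walk over a tree would look roughly like this:

/* Sketch only; my_avl_* and my_node are the hypothetical names above. */
static void walk_ascending(my_avl_avl *tree) {
  my_avl_iter it;
  my_node *h;

  my_avl_start_iter_least(tree, &it);
  while ((h = my_avl_get_iter(&it)) != AVL_NULL) {
    /* visit h->key here */
    my_avl_incr_iter(&it);
  }
}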
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_mem/memory_manager/include/heapmm.h b/chromium/third_party/libvpx/source/libvpx/vpx_mem/memory_manager/include/heapmm.h
deleted file mode 100644
index d584b1951c5..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vpx_mem/memory_manager/include/heapmm.h
+++ /dev/null
@@ -1,155 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef VPX_MEM_MEMORY_MANAGER_INCLUDE_HEAPMM_H_
-#define VPX_MEM_MEMORY_MANAGER_INCLUDE_HEAPMM_H_
-
-/* This code is in the public domain.
-** Version: 1.1 Author: Walt Karas
-*/
-
-/* External header file for Heap Memory Manager. See documentation in
-** heapmm.html.
-*/
-
-#undef HMM_PROCESS
-
-/* Include once per configuration in a particular translation unit. */
-
-#ifndef HMM_CNFG_NUM
-
-/* Default configuration. */
-
-#ifndef HMM_INC_CNFG_DFLT
-#define HMM_INC_CNFG_DFLT
-#define HMM_PROCESS
-#endif
-
-#elif HMM_CNFG_NUM == 0
-
-/* Test configuration. */
-
-#ifndef HMM_INC_CNFG_0
-#define HMM_INC_CNFG_0
-#define HMM_PROCESS
-#endif
-
-#elif HMM_CNFG_NUM == 1
-
-#ifndef HMM_INC_CNFG_1
-#define HMM_INC_CNFG_1
-#define HMM_PROCESS
-#endif
-
-#elif HMM_CNFG_NUM == 2
-
-#ifndef HMM_INC_CNFG_2
-#define HMM_INC_CNFG_2
-#define HMM_PROCESS
-#endif
-
-#elif HMM_CNFG_NUM == 3
-
-#ifndef HMM_INC_CNFG_3
-#define HMM_INC_CNFG_3
-#define HMM_PROCESS
-#endif
-
-#elif HMM_CNFG_NUM == 4
-
-#ifndef HMM_INC_CNFG_4
-#define HMM_INC_CNFG_4
-#define HMM_PROCESS
-#endif
-
-#elif HMM_CNFG_NUM == 5
-
-#ifndef HMM_INC_CNFG_5
-#define HMM_INC_CNFG_5
-#define HMM_PROCESS
-#endif
-
-#endif
-
-#ifdef HMM_PROCESS
-
-#include "hmm_cnfg.h"
-
-/* Heap descriptor. */
-typedef struct HMM_UNIQUE(structure) {
- /* private: */
-
- /* Pointer to (payload of) root node in AVL tree. This field should
- ** really be the AVL tree descriptor (type avl_avl). But (in the
- ** instantiation of the AVL tree generic package used in package) the
- ** AVL tree descriptor simply contains a pointer to the root. So,
- ** whenever a pointer to the AVL tree descriptor is needed, I use the
- ** cast:
- **
- ** (avl_avl *) &(heap_desc->avl_tree_root)
- **
- ** (where heap_desc is a pointer to a heap descriptor). This trick
- ** allows me to avoid including cavl_if.h in this external header. */
- void *avl_tree_root;
-
- /* Pointer to first byte of last block freed, after any coalescing. */
- void *last_freed;
-
- /* public: */
-
- HMM_UNIQUE(size_bau) num_baus_can_shrink;
- void *end_of_shrinkable_chunk;
-}
-HMM_UNIQUE(descriptor);
-
-/* Prototypes for externally-callable functions. */
-
-void HMM_UNIQUE(init)(HMM_UNIQUE(descriptor) *desc);
-
-void *HMM_UNIQUE(alloc)(
- HMM_UNIQUE(descriptor) *desc, HMM_UNIQUE(size_aau) num_addr_align_units);
-
-/* NOT YET IMPLEMENTED */
-void *HMM_UNIQUE(greedy_alloc)(
- HMM_UNIQUE(descriptor) *desc, HMM_UNIQUE(size_aau) needed_addr_align_units,
- HMM_UNIQUE(size_aau) coveted_addr_align_units);
-
-int HMM_UNIQUE(resize)(
- HMM_UNIQUE(descriptor) *desc, void *mem,
- HMM_UNIQUE(size_aau) num_addr_align_units);
-
-/* NOT YET IMPLEMENTED */
-int HMM_UNIQUE(greedy_resize)(
- HMM_UNIQUE(descriptor) *desc, void *mem,
- HMM_UNIQUE(size_aau) needed_addr_align_units,
- HMM_UNIQUE(size_aau) coveted_addr_align_units);
-
-void HMM_UNIQUE(free)(HMM_UNIQUE(descriptor) *desc, void *mem);
-
-HMM_UNIQUE(size_aau) HMM_UNIQUE(true_size)(void *mem);
-
-HMM_UNIQUE(size_aau) HMM_UNIQUE(largest_available)(
- HMM_UNIQUE(descriptor) *desc);
-
-void HMM_UNIQUE(new_chunk)(
- HMM_UNIQUE(descriptor) *desc, void *start_of_chunk,
- HMM_UNIQUE(size_bau) num_block_align_units);
-
-void HMM_UNIQUE(grow_chunk)(
- HMM_UNIQUE(descriptor) *desc, void *end_of_chunk,
- HMM_UNIQUE(size_bau) num_block_align_units);
-
-void HMM_UNIQUE(shrink_chunk)(
- HMM_UNIQUE(descriptor) *desc,
- HMM_UNIQUE(size_bau) num_block_align_units);
-
-#endif /* defined HMM_PROCESS */
-#endif // VPX_MEM_MEMORY_MANAGER_INCLUDE_HEAPMM_H_
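The typical lifecycle of this API, as the pre-removal vpx_mem.c below drives it: initialize the descriptor, donate a chunk of backing storage, then allocate and free. A minimal sketch under the default hmm_ configuration; the arena size is illustrative and its alignment is assumed adequate for HMM_ADDR_ALIGN_UNIT:

#include "heapmm.h"

static hmm_descriptor desc;
static char arena[1 << 20]; /* backing store; alignment assumed adequate */

static void example(void) {
  hmm_init(&desc);

  /* Chunk size is given in BAUs:
     bytes / (HMM_ADDR_ALIGN_UNIT * HMM_BLOCK_ALIGN_UNIT). */
  hmm_new_chunk(&desc, arena,
                sizeof(arena) / (HMM_ADDR_ALIGN_UNIT * HMM_BLOCK_ALIGN_UNIT));

  {
    void *p = hmm_alloc(&desc, 4); /* request 4 AAUs of payload */
    if (p)
      hmm_free(&desc, p);
  }
}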
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_mem/memory_manager/include/hmm_cnfg.h b/chromium/third_party/libvpx/source/libvpx/vpx_mem/memory_manager/include/hmm_cnfg.h
deleted file mode 100644
index caa8713cfc8..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vpx_mem/memory_manager/include/hmm_cnfg.h
+++ /dev/null
@@ -1,120 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef VPX_MEM_MEMORY_MANAGER_INCLUDE_HMM_CNFG_H_
-#define VPX_MEM_MEMORY_MANAGER_INCLUDE_HMM_CNFG_H_
-
-/* This code is in the public domain.
-** Version: 1.1 Author: Walt Karas
-*/
-
-/* Configure Heap Memory Manager for processor architecture, compiler,
-** and desired performance characteristics. This file is included
-** by heapmm.h, so these definitions can be used by code external to
-** HMM. You can change the default configuration, and/or create alternate
-** configuration(s).
-*/
-
-/* To allow for multiple configurations of HMM to be used in the same
-** compilation unit, undefine all preprocessor symbols that will be
-** defined below.
-*/
-#undef HMM_ADDR_ALIGN_UNIT
-#undef HMM_BLOCK_ALIGN_UNIT
-#undef HMM_UNIQUE
-#undef HMM_DESC_PARAM
-#undef HMM_SYM_TO_STRING
-#undef HMM_SYM_TO_STRING_
-#undef HMM_AUDIT_FAIL
-
-/* Turn X into a string after one macro expansion pass of X. This trick
-** works with both GCC and Visual C++. */
-#define HMM_SYM_TO_STRING(X) HMM_SYM_TO_STRING_(X)
-#define HMM_SYM_TO_STRING_(X) #X
-
-#ifndef HMM_CNFG_NUM
-
-/* Default configuration. */
-
-/* Use hmm_ prefix to avoid identifier conflicts. */
-#define HMM_UNIQUE(BASE) hmm_ ## BASE
-
-/* Number of bytes in an Address Alignment Unit (AAU). */
-// fwg
-// #define HMM_ADDR_ALIGN_UNIT sizeof(int)
-#define HMM_ADDR_ALIGN_UNIT 32
-
-/* Number of AAUs in a Block Alignment Unit (BAU). */
-#define HMM_BLOCK_ALIGN_UNIT 1
-
-/* Type of unsigned integer big enough to hold the size of a Block in AAUs. */
-typedef unsigned long HMM_UNIQUE(size_aau);
-
-/* Type of unsigned integer big enough to hold the size of a Block/Chunk
-** in BAUs. The high bit will be robbed. */
-typedef unsigned long HMM_UNIQUE(size_bau);
-
-void hmm_dflt_abort(const char *, const char *);
-
-/* Actions upon a self-audit failure. Must expand to a single complete
-** statement. If you remove the definition of this macro, no self-auditing
-** will be performed. */
-#define HMM_AUDIT_FAIL \
- hmm_dflt_abort(__FILE__, HMM_SYM_TO_STRING(__LINE__));
-
-#elif HMM_CNFG_NUM == 0
-
-/* Definitions for testing. */
-
-#define HMM_UNIQUE(BASE) thmm_ ## BASE
-
-#define HMM_ADDR_ALIGN_UNIT sizeof(int)
-
-#define HMM_BLOCK_ALIGN_UNIT 3
-
-typedef unsigned HMM_UNIQUE(size_aau);
-
-typedef unsigned short HMM_UNIQUE(size_bau);
-
-/* Under this test setup, a long jump is done if there is a self-audit
-** failure.
-*/
-
-extern jmp_buf HMM_UNIQUE(jmp_buf);
-extern const char *HMM_UNIQUE(fail_file);
-extern unsigned HMM_UNIQUE(fail_line);
-
-#define HMM_AUDIT_FAIL \
- { HMM_UNIQUE(fail_file) = __FILE__; HMM_UNIQUE(fail_line) = __LINE__; \
- longjmp(HMM_UNIQUE(jmp_buf), 1); }
-
-#elif HMM_CNFG_NUM == 1
-
-/* Put configuration 1 definitions here (if there is a configuration 1). */
-
-#elif HMM_CNFG_NUM == 2
-
-/* Put configuration 2 definitions here. */
-
-#elif HMM_CNFG_NUM == 3
-
-/* Put configuration 3 definitions here. */
-
-#elif HMM_CNFG_NUM == 4
-
-/* Put configuration 4 definitions here. */
-
-#elif HMM_CNFG_NUM == 5
-
-/* Put configuration 5 definitions here. */
-
-#endif
-
-#endif // VPX_MEM_MEMORY_MANAGER_INCLUDE_HMM_CNFG_H_
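A sketch of what a filled-in alternate configuration (say, configuration 1) might contain. Every value here is hypothetical, chosen only to show the required pieces: the name prefix, the two alignment units, the two size types, and the audit-failure statement:

#include <stdlib.h> /* for abort() */

#define HMM_UNIQUE(BASE) hmm1_ ## BASE

#define HMM_ADDR_ALIGN_UNIT sizeof(void *)
#define HMM_BLOCK_ALIGN_UNIT 1

typedef unsigned long HMM_UNIQUE(size_aau);
typedef unsigned long HMM_UNIQUE(size_bau);

/* Must expand to a single complete statement. */
#define HMM_AUDIT_FAIL abort();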
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_mem/memory_manager/include/hmm_intrnl.h b/chromium/third_party/libvpx/source/libvpx/vpx_mem/memory_manager/include/hmm_intrnl.h
deleted file mode 100644
index 7302aa28c24..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vpx_mem/memory_manager/include/hmm_intrnl.h
+++ /dev/null
@@ -1,159 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-/* This code is in the public domain.
-** Version: 1.1 Author: Walt Karas
-*/
-
-#ifndef VPX_MEM_MEMORY_MANAGER_INCLUDE_HMM_INTRNL_H_
-#define VPX_MEM_MEMORY_MANAGER_INCLUDE_HMM_INTRNL_H_
-
-#ifdef __uClinux__
-# include <lddk.h>
-#endif
-
-#include "heapmm.h"
-
-#define U(BASE) HMM_UNIQUE(BASE)
-
-/* Mask of high bit of variable of size_bau type. */
-#define HIGH_BIT_BAU_SIZE \
- ((U(size_bau)) ~ (((U(size_bau)) ~ (U(size_bau)) 0) >> 1))
-
-/* Add a given number of AAUs to a pointer. */
-#define AAUS_FORWARD(PTR, AAU_OFFSET) \
- (((char *) (PTR)) + ((AAU_OFFSET) * ((U(size_aau)) HMM_ADDR_ALIGN_UNIT)))
-
-/* Subtract a given number of AAUs from a pointer. */
-#define AAUS_BACKWARD(PTR, AAU_OFFSET) \
- (((char *) (PTR)) - ((AAU_OFFSET) * ((U(size_aau)) HMM_ADDR_ALIGN_UNIT)))
-
-/* Add a given number of BAUs to a pointer. */
-#define BAUS_FORWARD(PTR, BAU_OFFSET) \
- AAUS_FORWARD((PTR), (BAU_OFFSET) * ((U(size_aau)) HMM_BLOCK_ALIGN_UNIT))
-
-/* Subtract a given number of BAUs from a pointer. */
-#define BAUS_BACKWARD(PTR, BAU_OFFSET) \
- AAUS_BACKWARD((PTR), (BAU_OFFSET) * ((U(size_aau)) HMM_BLOCK_ALIGN_UNIT))
-
-typedef struct head_struct {
- /* Sizes in Block Alignment Units. */
- HMM_UNIQUE(size_bau) previous_block_size, block_size;
-}
-head_record;
-
-typedef struct ptr_struct {
- struct ptr_struct *self, *prev, *next;
-}
-ptr_record;
-
-/* Divide and round up any fraction to the next whole number. */
-#define DIV_ROUND_UP(NUMER, DENOM) (((NUMER) + (DENOM) - 1) / (DENOM))
-
-/* Number of AAUs in a block head. */
-#define HEAD_AAUS DIV_ROUND_UP(sizeof(head_record), HMM_ADDR_ALIGN_UNIT)
-
-/* Number of AAUs in a block pointer record. */
-#define PTR_RECORD_AAUS DIV_ROUND_UP(sizeof(ptr_record), HMM_ADDR_ALIGN_UNIT)
-
-/* Number of BAUs in a dummy end record (at end of chunk). */
-#define DUMMY_END_BLOCK_BAUS DIV_ROUND_UP(HEAD_AAUS, HMM_BLOCK_ALIGN_UNIT)
-
-/* Minimum number of BAUs in a block (allowing room for the pointer record). */
-#define MIN_BLOCK_BAUS \
- DIV_ROUND_UP(HEAD_AAUS + PTR_RECORD_AAUS, HMM_BLOCK_ALIGN_UNIT)
-
-/* Return number of BAUs in block (masking off high bit containing block
-** status). */
-#define BLOCK_BAUS(HEAD_PTR) \
- (((head_record *) (HEAD_PTR))->block_size & ~HIGH_BIT_BAU_SIZE)
-
-/* Return number of BAUs in previous block (masking off high bit containing
-** block status). */
-#define PREV_BLOCK_BAUS(HEAD_PTR) \
- (((head_record *) (HEAD_PTR))->previous_block_size & ~HIGH_BIT_BAU_SIZE)
-
-/* Set number of BAUs in previous block, preserving high bit containing
-** block status. */
-#define SET_PREV_BLOCK_BAUS(HEAD_PTR, N_BAUS) \
- { register head_record *h_ptr = (head_record *) (HEAD_PTR); \
- h_ptr->previous_block_size &= HIGH_BIT_BAU_SIZE; \
- h_ptr->previous_block_size |= (N_BAUS); }
-
-/* Convert pointer to pointer record of block to pointer to block's head
-** record. */
-#define PTR_REC_TO_HEAD(PTR_REC_PTR) \
- ((head_record *) AAUS_BACKWARD(PTR_REC_PTR, HEAD_AAUS))
-
-/* Convert pointer to block head to pointer to block's pointer record. */
-#define HEAD_TO_PTR_REC(HEAD_PTR) \
- ((ptr_record *) AAUS_FORWARD(HEAD_PTR, HEAD_AAUS))
-
-/* Returns non-zero if block is allocated. */
-#define IS_BLOCK_ALLOCATED(HEAD_PTR) \
- (((((head_record *) (HEAD_PTR))->block_size | \
- ((head_record *) (HEAD_PTR))->previous_block_size) & \
- HIGH_BIT_BAU_SIZE) == 0)
-
-#define MARK_BLOCK_ALLOCATED(HEAD_PTR) \
- { register head_record *h_ptr = (head_record *) (HEAD_PTR); \
- h_ptr->block_size &= ~HIGH_BIT_BAU_SIZE; \
- h_ptr->previous_block_size &= ~HIGH_BIT_BAU_SIZE; }
-
-/* Mark a block as free when it is not the first block in a bin (and
-** therefore not a node in the AVL tree). */
-#define MARK_SUCCESSIVE_BLOCK_IN_FREE_BIN(HEAD_PTR) \
- { register head_record *h_ptr = (head_record *) (HEAD_PTR); \
- h_ptr->block_size |= HIGH_BIT_BAU_SIZE; }
-
-/* Prototypes for internal functions implemented in one file and called in
-** another.
-*/
-
-void U(into_free_collection)(U(descriptor) *desc, head_record *head_ptr);
-
-void U(out_of_free_collection)(U(descriptor) *desc, head_record *head_ptr);
-
-void *U(alloc_from_bin)(
- U(descriptor) *desc, ptr_record *bin_front_ptr, U(size_bau) n_baus);
-
-#ifdef HMM_AUDIT_FAIL
-
-/* Simply contains a reference to the HMM_AUDIT_FAIL macro and a
-** dummy return. */
-int U(audit_block_fail_dummy_return)(void);
-
-
-/* Auditing a block consists of checking that the size in its head
-** matches the previous block size in the head of the next block. */
-#define AUDIT_BLOCK_AS_EXPR(HEAD_PTR) \
- ((BLOCK_BAUS(HEAD_PTR) == \
- PREV_BLOCK_BAUS(BAUS_FORWARD(HEAD_PTR, BLOCK_BAUS(HEAD_PTR)))) ? \
- 0 : U(audit_block_fail_dummy_return)())
-
-#define AUDIT_BLOCK(HEAD_PTR) \
- { void *h_ptr = (HEAD_PTR); AUDIT_BLOCK_AS_EXPR(h_ptr); }
-
-#endif
-
-/* Interface to AVL tree generic package instantiation. */
-
-#define AVL_UNIQUE(BASE) U(avl_ ## BASE)
-
-#define AVL_HANDLE ptr_record *
-
-#define AVL_KEY U(size_bau)
-
-#define AVL_MAX_DEPTH 64
-
-#include "cavl_if.h"
-
-#endif // VPX_MEM_MEMORY_MANAGER_INCLUDE_HMM_INTRNL_H_
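The status encoding these macros implement: a block's free/allocated state rides in the high bit of the stored sizes, and BLOCK_BAUS/PREV_BLOCK_BAUS mask it back off when reading. A self-contained sketch using a 16-bit size type for readability (the real size_bau type is configuration-dependent):

#include <stdio.h>

typedef unsigned short bau_size; /* stand-in for HMM_UNIQUE(size_bau) */
#define HIGH_BIT ((bau_size) ~(((bau_size) ~(bau_size) 0) >> 1)) /* 0x8000 */

int main(void) {
  bau_size block_size = 40;

  block_size |= HIGH_BIT; /* mark free (successive block in a free bin) */

  printf("stored=0x%x size=%u\n",
         (unsigned) block_size,
         (unsigned) (block_size & (bau_size) ~HIGH_BIT)); /* 0x8028, 40 */
  return 0;
}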
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_mem/vpx_mem.c b/chromium/third_party/libvpx/source/libvpx/vpx_mem/vpx_mem.c
index da616425c69..b60d7319cc3 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_mem/vpx_mem.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_mem/vpx_mem.c
@@ -18,113 +18,11 @@
#include "include/vpx_mem_intrnl.h"
#include "vpx/vpx_integer.h"
-#if CONFIG_MEM_TRACKER
-#ifndef VPX_NO_GLOBALS
-static unsigned long g_alloc_count = 0;
-#else
-#include "vpx_global_handling.h"
-#define g_alloc_count vpxglobalm(vpxmem,g_alloc_count)
-#endif
-#endif
-
-#if CONFIG_MEM_MANAGER
-# include "heapmm.h"
-# include "hmm_intrnl.h"
-
-# define SHIFT_HMM_ADDR_ALIGN_UNIT 5
-# define TOTAL_MEMORY_TO_ALLOCATE 20971520 /* 20 * 1024 * 1024 */
-
-# define MM_DYNAMIC_MEMORY 1
-# if MM_DYNAMIC_MEMORY
-static unsigned char *g_p_mng_memory_raw = NULL;
-static unsigned char *g_p_mng_memory = NULL;
-# else
-static unsigned char g_p_mng_memory[TOTAL_MEMORY_TO_ALLOCATE];
-# endif
-
-static size_t g_mm_memory_size = TOTAL_MEMORY_TO_ALLOCATE;
-
-static hmm_descriptor hmm_d;
-static int g_mng_memory_allocated = 0;
-
-static int vpx_mm_create_heap_memory();
-static void *vpx_mm_realloc(void *memblk, size_t size);
-#endif /*CONFIG_MEM_MANAGER*/
-
-#if USE_GLOBAL_FUNCTION_POINTERS
-struct GLOBAL_FUNC_POINTERS {
- g_malloc_func g_malloc;
- g_calloc_func g_calloc;
- g_realloc_func g_realloc;
- g_free_func g_free;
- g_memcpy_func g_memcpy;
- g_memset_func g_memset;
- g_memmove_func g_memmove;
-} *g_func = NULL;
-
-# define VPX_MALLOC_L g_func->g_malloc
-# define VPX_REALLOC_L g_func->g_realloc
-# define VPX_FREE_L g_func->g_free
-# define VPX_MEMCPY_L g_func->g_memcpy
-# define VPX_MEMSET_L g_func->g_memset
-# define VPX_MEMMOVE_L g_func->g_memmove
-#else
-# define VPX_MALLOC_L malloc
-# define VPX_REALLOC_L realloc
-# define VPX_FREE_L free
-# define VPX_MEMCPY_L memcpy
-# define VPX_MEMSET_L memset
-# define VPX_MEMMOVE_L memmove
-#endif /* USE_GLOBAL_FUNCTION_POINTERS */
-
-unsigned int vpx_mem_get_version() {
- unsigned int ver = ((unsigned int)(unsigned char)VPX_MEM_VERSION_CHIEF << 24 |
- (unsigned int)(unsigned char)VPX_MEM_VERSION_MAJOR << 16 |
- (unsigned int)(unsigned char)VPX_MEM_VERSION_MINOR << 8 |
- (unsigned int)(unsigned char)VPX_MEM_VERSION_PATCH);
- return ver;
-}
-
-int vpx_mem_set_heap_size(size_t size) {
- int ret = -1;
-
-#if CONFIG_MEM_MANAGER
-#if MM_DYNAMIC_MEMORY
-
- if (!g_mng_memory_allocated && size) {
- g_mm_memory_size = size;
- ret = 0;
- } else
- ret = -3;
-
-#else
- ret = -2;
-#endif
-#else
- (void)size;
-#endif
-
- return ret;
-}
-
void *vpx_memalign(size_t align, size_t size) {
void *addr,
* x = NULL;
-#if CONFIG_MEM_MANAGER
- int number_aau;
-
- if (vpx_mm_create_heap_memory() < 0) {
- _P(printf("[vpx][mm] ERROR vpx_memalign() Couldn't create memory for Heap.\n");)
- }
-
- number_aau = ((size + align - 1 + ADDRESS_STORAGE_SIZE) >>
- SHIFT_HMM_ADDR_ALIGN_UNIT) + 1;
-
- addr = hmm_alloc(&hmm_d, number_aau);
-#else
- addr = VPX_MALLOC_L(size + align - 1 + ADDRESS_STORAGE_SIZE);
-#endif /*CONFIG_MEM_MANAGER*/
+ addr = malloc(size + align - 1 + ADDRESS_STORAGE_SIZE);
if (addr) {
x = align_addr((unsigned char *)addr + ADDRESS_STORAGE_SIZE, (int)align);
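The pattern in play here: over-allocate, align past a pointer-sized slot, and stash the raw malloc address just below the returned pointer so vpx_free() can recover it via ((size_t *)memblk)[-1]. A standalone sketch; align is assumed to be a power of two, and align_addr/ADDRESS_STORAGE_SIZE are approximated rather than quoted:

#include <stdint.h>
#include <stdlib.h>

static void *sketch_memalign(size_t align, size_t size) {
  /* payload + worst-case alignment slack + one stored pointer */
  void *raw = malloc(size + align - 1 + sizeof(size_t));
  uintptr_t p;

  if (!raw) return NULL;
  /* Round up past the storage slot to the next align boundary. */
  p = ((uintptr_t) raw + sizeof(size_t) + align - 1) & ~(uintptr_t)(align - 1);
  ((size_t *) p)[-1] = (size_t) raw; /* what the free path reads back */
  return (void *) p;
}

static void sketch_free(void *mem) {
  if (mem) free((void *) ((size_t *) mem)[-1]);
}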
@@ -145,7 +43,7 @@ void *vpx_calloc(size_t num, size_t size) {
x = vpx_memalign(DEFAULT_ALIGNMENT, num * size);
if (x)
- VPX_MEMSET_L(x, 0, num * size);
+ memset(x, 0, num * size);
return x;
}
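Note that num * size above is multiplied unchecked; a caller-side guard would look like the following sketch, which is not part of the original code:

#include <stddef.h>

/* Returns nonzero if num * size would wrap around size_t. */
static int mul_would_overflow(size_t num, size_t size) {
  return size != 0 && num > (size_t) -1 / size;
}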
@@ -171,11 +69,7 @@ void *vpx_realloc(void *memblk, size_t size) {
addr = (void *)(((size_t *)memblk)[-1]);
memblk = NULL;
-#if CONFIG_MEM_MANAGER
- new_addr = vpx_mm_realloc(addr, size + align + ADDRESS_STORAGE_SIZE);
-#else
- new_addr = VPX_REALLOC_L(addr, size + align + ADDRESS_STORAGE_SIZE);
-#endif
+ new_addr = realloc(addr, size + align + ADDRESS_STORAGE_SIZE);
if (new_addr) {
addr = new_addr;
@@ -193,280 +87,12 @@ void *vpx_realloc(void *memblk, size_t size) {
void vpx_free(void *memblk) {
if (memblk) {
void *addr = (void *)(((size_t *)memblk)[-1]);
-#if CONFIG_MEM_MANAGER
- hmm_free(&hmm_d, addr);
-#else
- VPX_FREE_L(addr);
-#endif
- }
-}
-
-#if CONFIG_MEM_TRACKER
-void *xvpx_memalign(size_t align, size_t size, char *file, int line) {
-#if TRY_BOUNDS_CHECK
- unsigned char *x_bounds;
-#endif
-
- void *x;
-
- if (g_alloc_count == 0) {
-#if TRY_BOUNDS_CHECK
- int i_rv = vpx_memory_tracker_init(BOUNDS_CHECK_PAD_SIZE, BOUNDS_CHECK_VALUE);
-#else
- int i_rv = vpx_memory_tracker_init(0, 0);
-#endif
-
- if (i_rv < 0) {
- _P(printf("ERROR xvpx_malloc MEM_TRACK_USAGE error vpx_memory_tracker_init().\n");)
- }
- }
-
-#if TRY_BOUNDS_CHECK
- {
- int i;
- unsigned int tempme = BOUNDS_CHECK_VALUE;
-
- x_bounds = vpx_memalign(align, size + (BOUNDS_CHECK_PAD_SIZE * 2));
-
- if (x_bounds) {
- /*we're aligning the address twice here but to keep things
- consistent we want to have the padding come before the stored
- address so no matter what free function gets called we will
- attempt to free the correct address*/
- x_bounds = (unsigned char *)(((size_t *)x_bounds)[-1]);
- x = align_addr(x_bounds + BOUNDS_CHECK_PAD_SIZE + ADDRESS_STORAGE_SIZE,
- (int)align);
- /* save the actual malloc address */
- ((size_t *)x)[-1] = (size_t)x_bounds;
-
- for (i = 0; i < BOUNDS_CHECK_PAD_SIZE; i += sizeof(unsigned int)) {
- VPX_MEMCPY_L(x_bounds + i, &tempme, sizeof(unsigned int));
- VPX_MEMCPY_L((unsigned char *)x + size + i,
- &tempme, sizeof(unsigned int));
- }
- } else
- x = NULL;
- }
-#else
- x = vpx_memalign(align, size);
-#endif /*TRY_BOUNDS_CHECK*/
-
- g_alloc_count++;
-
- vpx_memory_tracker_add((size_t)x, (unsigned int)size, file, line, 1);
-
- return x;
-}
-
-void *xvpx_malloc(size_t size, char *file, int line) {
- return xvpx_memalign(DEFAULT_ALIGNMENT, size, file, line);
-}
-
-void *xvpx_calloc(size_t num, size_t size, char *file, int line) {
- void *x = xvpx_memalign(DEFAULT_ALIGNMENT, num * size, file, line);
-
- if (x)
- VPX_MEMSET_L(x, 0, num * size);
-
- return x;
-}
-
-void *xvpx_realloc(void *memblk, size_t size, char *file, int line) {
- struct mem_block *p = NULL;
- int orig_size = 0,
- orig_line = 0;
- char *orig_file = NULL;
-
-#if TRY_BOUNDS_CHECK
- unsigned char *x_bounds = memblk ?
- (unsigned char *)(((size_t *)memblk)[-1]) :
- NULL;
-#endif
-
- void *x;
-
- if (g_alloc_count == 0) {
-#if TRY_BOUNDS_CHECK
-
- if (!vpx_memory_tracker_init(BOUNDS_CHECK_PAD_SIZE, BOUNDS_CHECK_VALUE))
-#else
- if (!vpx_memory_tracker_init(0, 0))
-#endif
- {
- _P(printf("ERROR xvpx_malloc MEM_TRACK_USAGE error vpx_memory_tracker_init().\n");)
- }
- }
-
- if ((p = vpx_memory_tracker_find((size_t)memblk))) {
- orig_size = p->size;
- orig_file = p->file;
- orig_line = p->line;
- }
-
-#if TRY_BOUNDS_CHECK_ON_FREE
- vpx_memory_tracker_check_integrity(file, line);
-#endif
-
- /* have to do this regardless of success, because
- * the memory that does get realloc'd may change
- * the bounds values of this block
- */
- vpx_memory_tracker_remove((size_t)memblk);
-
-#if TRY_BOUNDS_CHECK
- {
- int i;
- unsigned int tempme = BOUNDS_CHECK_VALUE;
-
- x_bounds = vpx_realloc(memblk, size + (BOUNDS_CHECK_PAD_SIZE * 2));
-
- if (x_bounds) {
- x_bounds = (unsigned char *)(((size_t *)x_bounds)[-1]);
- x = align_addr(x_bounds + BOUNDS_CHECK_PAD_SIZE + ADDRESS_STORAGE_SIZE,
- (int)DEFAULT_ALIGNMENT);
- /* save the actual malloc address */
- ((size_t *)x)[-1] = (size_t)x_bounds;
-
- for (i = 0; i < BOUNDS_CHECK_PAD_SIZE; i += sizeof(unsigned int)) {
- VPX_MEMCPY_L(x_bounds + i, &tempme, sizeof(unsigned int));
- VPX_MEMCPY_L((unsigned char *)x + size + i,
- &tempme, sizeof(unsigned int));
- }
- } else
- x = NULL;
+ free(addr);
}
-#else
- x = vpx_realloc(memblk, size);
-#endif /*TRY_BOUNDS_CHECK*/
-
- if (!memblk) ++g_alloc_count;
-
- if (x)
- vpx_memory_tracker_add((size_t)x, (unsigned int)size, file, line, 1);
- else
- vpx_memory_tracker_add((size_t)memblk, orig_size, orig_file, orig_line, 1);
-
- return x;
-}
-
-void xvpx_free(void *p_address, char *file, int line) {
-#if TRY_BOUNDS_CHECK
- unsigned char *p_bounds_address = (unsigned char *)p_address;
- /*p_bounds_address -= BOUNDS_CHECK_PAD_SIZE;*/
-#endif
-
-#if !TRY_BOUNDS_CHECK_ON_FREE
- (void)file;
- (void)line;
-#endif
-
- if (p_address) {
-#if TRY_BOUNDS_CHECK_ON_FREE
- vpx_memory_tracker_check_integrity(file, line);
-#endif
-
- /* if the addr isn't found in the list, assume it was allocated via
- * vpx_ calls not xvpx_, therefore it does not contain any padding
- */
- if (vpx_memory_tracker_remove((size_t)p_address) == -2) {
- p_bounds_address = p_address;
- _P(fprintf(stderr, "[vpx_mem][xvpx_free] addr: %p not found in"
- " list; freed from file:%s"
- " line:%d\n", p_address, file, line));
- } else
- --g_alloc_count;
-
-#if TRY_BOUNDS_CHECK
- vpx_free(p_bounds_address);
-#else
- vpx_free(p_address);
-#endif
-
- if (!g_alloc_count)
- vpx_memory_tracker_destroy();
- }
-}
-
-#endif /*CONFIG_MEM_TRACKER*/
-
-#if CONFIG_MEM_CHECKS
-#if defined(VXWORKS)
-#include <task_lib.h> /*for task_delay()*/
-/* This function is only used to get a stack trace of the player
-object so we can see where we are having a problem. */
-static int get_my_tt(int task) {
- tt(task);
-
- return 0;
-}
-
-static void vx_sleep(int msec) {
- int ticks_to_sleep = 0;
-
- if (msec) {
- int msec_per_tick = 1000 / sys_clk_rate_get();
-
- if (msec < msec_per_tick)
- ticks_to_sleep++;
- else
- ticks_to_sleep = msec / msec_per_tick;
- }
-
- task_delay(ticks_to_sleep);
-}
-#endif
-#endif
-
-void *vpx_memcpy(void *dest, const void *source, size_t length) {
-#if CONFIG_MEM_CHECKS
-
- if (((int)dest < 0x4000) || ((int)source < 0x4000)) {
- _P(printf("WARNING: vpx_memcpy dest:0x%x source:0x%x len:%d\n", (int)dest, (int)source, length);)
-
-#if defined(VXWORKS)
- sp(get_my_tt, task_id_self(), 0, 0, 0, 0, 0, 0, 0, 0);
-
- vx_sleep(10000);
-#endif
- }
-
-#endif
-
- return VPX_MEMCPY_L(dest, source, length);
-}
-
-void *vpx_memset(void *dest, int val, size_t length) {
-#if CONFIG_MEM_CHECKS
-
- if ((int)dest < 0x4000) {
- _P(printf("WARNING: vpx_memset dest:0x%x val:%d len:%d\n", (int)dest, val, length);)
-
-#if defined(VXWORKS)
- sp(get_my_tt, task_id_self(), 0, 0, 0, 0, 0, 0, 0, 0);
-
- vx_sleep(10000);
-#endif
- }
-
-#endif
-
- return VPX_MEMSET_L(dest, val, length);
}
#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
void *vpx_memset16(void *dest, int val, size_t length) {
-#if CONFIG_MEM_CHECKS
- if ((int)dest < 0x4000) {
- _P(printf("WARNING: vpx_memset dest:0x%x val:%d len:%d\n",
- (int)dest, val, length);)
-
-#if defined(VXWORKS)
- sp(get_my_tt, task_id_self(), 0, 0, 0, 0, 0, 0, 0, 0);
-
- vx_sleep(10000);
-#endif
- }
-#endif
int i;
void *orig = dest;
uint16_t *dest16 = dest;
@@ -475,207 +101,3 @@ void *vpx_memset16(void *dest, int val, size_t length) {
return orig;
}
#endif // CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
-
-void *vpx_memmove(void *dest, const void *src, size_t count) {
-#if CONFIG_MEM_CHECKS
-
- if (((int)dest < 0x4000) || ((int)src < 0x4000)) {
- _P(printf("WARNING: vpx_memmove dest:0x%x src:0x%x count:%d\n", (int)dest, (int)src, count);)
-
-#if defined(VXWORKS)
- sp(get_my_tt, task_id_self(), 0, 0, 0, 0, 0, 0, 0, 0);
-
- vx_sleep(10000);
-#endif
- }
-
-#endif
-
- return VPX_MEMMOVE_L(dest, src, count);
-}
-
-#if CONFIG_MEM_MANAGER
-
-static int vpx_mm_create_heap_memory() {
- int i_rv = 0;
-
- if (!g_mng_memory_allocated) {
-#if MM_DYNAMIC_MEMORY
- g_p_mng_memory_raw =
- (unsigned char *)malloc(g_mm_memory_size + HMM_ADDR_ALIGN_UNIT);
-
- if (g_p_mng_memory_raw) {
- g_p_mng_memory = (unsigned char *)((((unsigned int)g_p_mng_memory_raw) +
- HMM_ADDR_ALIGN_UNIT - 1) &
- -(int)HMM_ADDR_ALIGN_UNIT);
-
- _P(printf("[vpx][mm] total memory size:%d g_p_mng_memory_raw:0x%x g_p_mng_memory:0x%x\n"
-, g_mm_memory_size + HMM_ADDR_ALIGN_UNIT
-, (unsigned int)g_p_mng_memory_raw
-, (unsigned int)g_p_mng_memory);)
- } else {
- _P(printf("[vpx][mm] Couldn't allocate memory:%d for vpx memory manager.\n"
-, g_mm_memory_size);)
-
- i_rv = -1;
- }
-
- if (g_p_mng_memory)
-#endif
- {
- int chunk_size = 0;
-
- g_mng_memory_allocated = 1;
-
- hmm_init(&hmm_d);
-
- chunk_size = g_mm_memory_size >> SHIFT_HMM_ADDR_ALIGN_UNIT;
-
- chunk_size -= DUMMY_END_BLOCK_BAUS;
-
- _P(printf("[vpx][mm] memory size:%d for vpx memory manager. g_p_mng_memory:0x%x chunk_size:%d\n"
-, g_mm_memory_size
-, (unsigned int)g_p_mng_memory
-, chunk_size);)
-
- hmm_new_chunk(&hmm_d, (void *)g_p_mng_memory, chunk_size);
- }
-
-#if MM_DYNAMIC_MEMORY
- else {
- _P(printf("[vpx][mm] Couldn't allocate memory:%d for vpx memory manager.\n"
-, g_mm_memory_size);)
-
- i_rv = -1;
- }
-
-#endif
- }
-
- return i_rv;
-}
-
-static void *vpx_mm_realloc(void *memblk, size_t size) {
- void *p_ret = NULL;
-
- if (vpx_mm_create_heap_memory() < 0) {
- _P(printf("[vpx][mm] ERROR vpx_mm_realloc() Couldn't create memory for Heap.\n");)
- } else {
- int i_rv = 0;
- int old_num_aaus;
- int new_num_aaus;
-
- old_num_aaus = hmm_true_size(memblk);
- new_num_aaus = (size >> SHIFT_HMM_ADDR_ALIGN_UNIT) + 1;
-
- if (old_num_aaus == new_num_aaus) {
- p_ret = memblk;
- } else {
- i_rv = hmm_resize(&hmm_d, memblk, new_num_aaus);
-
- if (i_rv == 0) {
- p_ret = memblk;
- } else {
- /* Error. Try to malloc and then copy data. */
- void *p_from_malloc;
-
- new_num_aaus = (size >> SHIFT_HMM_ADDR_ALIGN_UNIT) + 1;
- p_from_malloc = hmm_alloc(&hmm_d, new_num_aaus);
-
- if (p_from_malloc) {
- vpx_memcpy(p_from_malloc, memblk, size);
- hmm_free(&hmm_d, memblk);
-
- p_ret = p_from_malloc;
- }
- }
- }
- }
-
- return p_ret;
-}
-#endif /*CONFIG_MEM_MANAGER*/
-
-#if USE_GLOBAL_FUNCTION_POINTERS
-# if CONFIG_MEM_TRACKER
-extern int vpx_memory_tracker_set_functions(g_malloc_func g_malloc_l
-, g_calloc_func g_calloc_l
-, g_realloc_func g_realloc_l
-, g_free_func g_free_l
-, g_memcpy_func g_memcpy_l
-, g_memset_func g_memset_l
-, g_memmove_func g_memmove_l);
-# endif
-#endif /*USE_GLOBAL_FUNCTION_POINTERS*/
-int vpx_mem_set_functions(g_malloc_func g_malloc_l
-, g_calloc_func g_calloc_l
-, g_realloc_func g_realloc_l
-, g_free_func g_free_l
-, g_memcpy_func g_memcpy_l
-, g_memset_func g_memset_l
-, g_memmove_func g_memmove_l) {
-#if USE_GLOBAL_FUNCTION_POINTERS
-
-  /* If global function pointers are in use, the
-     application must set them before it does anything
-     else or vpx_mem will have unpredictable results. */
- if (!g_func) {
- g_func = (struct GLOBAL_FUNC_POINTERS *)
- g_malloc_l(sizeof(struct GLOBAL_FUNC_POINTERS));
-
- if (!g_func) {
- return -1;
- }
- }
-
-#if CONFIG_MEM_TRACKER
- {
- int rv = 0;
- rv = vpx_memory_tracker_set_functions(g_malloc_l
-, g_calloc_l
-, g_realloc_l
-, g_free_l
-, g_memcpy_l
-, g_memset_l
-, g_memmove_l);
-
- if (rv < 0) {
- return rv;
- }
- }
-#endif
-
- g_func->g_malloc = g_malloc_l;
- g_func->g_calloc = g_calloc_l;
- g_func->g_realloc = g_realloc_l;
- g_func->g_free = g_free_l;
- g_func->g_memcpy = g_memcpy_l;
- g_func->g_memset = g_memset_l;
- g_func->g_memmove = g_memmove_l;
-
- return 0;
-#else
- (void)g_malloc_l;
- (void)g_calloc_l;
- (void)g_realloc_l;
- (void)g_free_l;
- (void)g_memcpy_l;
- (void)g_memset_l;
- (void)g_memmove_l;
- return -1;
-#endif
-}
-
-int vpx_mem_unset_functions() {
-#if USE_GLOBAL_FUNCTION_POINTERS
-
- if (g_func) {
- g_free_func temp_free = g_func->g_free;
- temp_free(g_func);
- g_func = NULL;
- }
-
-#endif
- return 0;
-}
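
Both before and after this cleanup, vpx_memalign over-allocates and stores the
raw malloc address one slot below the pointer it returns, which is what
vpx_free's ((size_t *)memblk)[-1] recovers. A minimal sketch of that scheme,
assuming align is a power of two and the storage slot is sizeof(size_t) bytes
(the sketch_* names are illustrative, not the library's):

    #include <stdlib.h>
    #include <stdint.h>

    static void *sketch_memalign(size_t align, size_t size) {
      /* Over-allocate: the storage slot plus worst-case alignment slack. */
      unsigned char *raw = malloc(size + align - 1 + sizeof(size_t));
      uintptr_t aligned;
      if (!raw) return NULL;
      /* Round the address past the slot up to the next multiple of align. */
      aligned = ((uintptr_t)(raw + sizeof(size_t)) + align - 1) &
                ~(uintptr_t)(align - 1);
      /* Stash the raw malloc address just below the returned pointer. */
      ((size_t *)aligned)[-1] = (size_t)raw;
      return (void *)aligned;
    }

    static void sketch_free(void *p) {
      if (p) free((void *)((size_t *)p)[-1]); /* recover the raw address */
    }
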
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_mem/vpx_mem.h b/chromium/third_party/libvpx/source/libvpx/vpx_mem/vpx_mem.h
index e2391f49629..a027714a01b 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_mem/vpx_mem.h
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_mem/vpx_mem.h
@@ -17,27 +17,6 @@
# include <lddk.h>
#endif
-/* vpx_mem version info */
-#define vpx_mem_version "2.2.1.5"
-
-#define VPX_MEM_VERSION_CHIEF 2
-#define VPX_MEM_VERSION_MAJOR 2
-#define VPX_MEM_VERSION_MINOR 1
-#define VPX_MEM_VERSION_PATCH 5
-/* end - vpx_mem version info */
-
-#ifndef VPX_TRACK_MEM_USAGE
-# define VPX_TRACK_MEM_USAGE 0 /* enable memory tracking/integrity checks */
-#endif
-#ifndef VPX_CHECK_MEM_FUNCTIONS
-# define VPX_CHECK_MEM_FUNCTIONS 0 /* enable basic safety checks in _memcpy,
-_memset, and _memmove */
-#endif
-#ifndef REPLACE_BUILTIN_FUNCTIONS
-# define REPLACE_BUILTIN_FUNCTIONS 0 /* replace builtin functions with their
-vpx_ equivalents */
-#endif
-
#include <stdlib.h>
#include <stddef.h>
@@ -45,125 +24,17 @@ vpx_ equivalents */
extern "C" {
#endif
- /*
- vpx_mem_get_version()
-    Provided for runtime version checking. Returns an unsigned int of the form
-    CHIEF | MAJOR | MINOR | PATCH, where the chief version number is the
-    high-order byte.
- */
- unsigned int vpx_mem_get_version(void);
-
- /*
- vpx_mem_set_heap_size(size_t size)
- size - size in bytes for the memory manager to allocate for its heap
- Sets the memory manager's initial heap size
- Return:
- 0: on success
- -1: if memory manager calls have not been included in the vpx_mem lib
- -2: if the memory manager has been compiled to use static memory
- -3: if the memory manager has already allocated its heap
- */
- int vpx_mem_set_heap_size(size_t size);
-
void *vpx_memalign(size_t align, size_t size);
void *vpx_malloc(size_t size);
void *vpx_calloc(size_t num, size_t size);
void *vpx_realloc(void *memblk, size_t size);
void vpx_free(void *memblk);
- void *vpx_memcpy(void *dest, const void *src, size_t length);
- void *vpx_memset(void *dest, int val, size_t length);
#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
void *vpx_memset16(void *dest, int val, size_t length);
#endif
- void *vpx_memmove(void *dest, const void *src, size_t count);
-
- /* special memory functions */
- void *vpx_mem_alloc(int id, size_t size, size_t align);
- void vpx_mem_free(int id, void *mem, size_t size);
-
- /* Wrappers to standard library functions. */
- typedef void *(* g_malloc_func)(size_t);
- typedef void *(* g_calloc_func)(size_t, size_t);
- typedef void *(* g_realloc_func)(void *, size_t);
- typedef void (* g_free_func)(void *);
- typedef void *(* g_memcpy_func)(void *, const void *, size_t);
- typedef void *(* g_memset_func)(void *, int, size_t);
- typedef void *(* g_memmove_func)(void *, const void *, size_t);
-
- int vpx_mem_set_functions(g_malloc_func g_malloc_l
-, g_calloc_func g_calloc_l
-, g_realloc_func g_realloc_l
-, g_free_func g_free_l
-, g_memcpy_func g_memcpy_l
-, g_memset_func g_memset_l
-, g_memmove_func g_memmove_l);
- int vpx_mem_unset_functions(void);
-
- /* some defines for backward compatibility */
-#define DMEM_GENERAL 0
-
-// (*)<
-
-#if REPLACE_BUILTIN_FUNCTIONS
-# ifndef __VPX_MEM_C__
-# define memalign vpx_memalign
-# define malloc vpx_malloc
-# define calloc vpx_calloc
-# define realloc vpx_realloc
-# define free vpx_free
-# define memcpy vpx_memcpy
-# define memmove vpx_memmove
-# define memset vpx_memset
-# endif
-#endif
-
-#if CONFIG_MEM_TRACKER
-#include <stdarg.h>
- /*from vpx_mem/vpx_mem_tracker.c*/
- extern void vpx_memory_tracker_dump();
- extern void vpx_memory_tracker_check_integrity(char *file, unsigned int line);
- extern int vpx_memory_tracker_set_log_type(int type, char *option);
- extern int vpx_memory_tracker_set_log_func(void *userdata,
- void(*logfunc)(void *userdata,
- const char *fmt, va_list args));
-# ifndef __VPX_MEM_C__
-# define vpx_memalign(align, size) xvpx_memalign((align), (size), __FILE__, __LINE__)
-# define vpx_malloc(size) xvpx_malloc((size), __FILE__, __LINE__)
-# define vpx_calloc(num, size) xvpx_calloc(num, size, __FILE__, __LINE__)
-# define vpx_realloc(addr, size) xvpx_realloc(addr, size, __FILE__, __LINE__)
-# define vpx_free(addr) xvpx_free(addr, __FILE__, __LINE__)
-# define vpx_memory_tracker_check_integrity() vpx_memory_tracker_check_integrity(__FILE__, __LINE__)
-# define vpx_mem_alloc(id,size,align) xvpx_mem_alloc(id, size, align, __FILE__, __LINE__)
-# define vpx_mem_free(id,mem,size) xvpx_mem_free(id, mem, size, __FILE__, __LINE__)
-# endif
-
- void *xvpx_memalign(size_t align, size_t size, char *file, int line);
- void *xvpx_malloc(size_t size, char *file, int line);
- void *xvpx_calloc(size_t num, size_t size, char *file, int line);
- void *xvpx_realloc(void *memblk, size_t size, char *file, int line);
- void xvpx_free(void *memblk, char *file, int line);
- void *xvpx_mem_alloc(int id, size_t size, size_t align, char *file, int line);
- void xvpx_mem_free(int id, void *mem, size_t size, char *file, int line);
-
-#else
-# ifndef __VPX_MEM_C__
-# define vpx_memory_tracker_dump()
-# define vpx_memory_tracker_check_integrity()
-# define vpx_memory_tracker_set_log_type(t,o) 0
-# define vpx_memory_tracker_set_log_func(u,f) 0
-# endif
-#endif
-
-#if !VPX_CHECK_MEM_FUNCTIONS
-# ifndef __VPX_MEM_C__
-# include <string.h>
-# define vpx_memcpy memcpy
-# define vpx_memset memset
-# define vpx_memmove memmove
-# endif
-#endif
+#include <string.h>
#ifdef VPX_MEM_PLTFRM
# include VPX_MEM_PLTFRM
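
The CONFIG_MEM_TRACKER block removed above worked by redefining each vpx_*
allocator as an xvpx_* variant that captures __FILE__ and __LINE__ at the call
site. A stripped-down sketch of that macro pattern, using illustrative
my_*/xmy_* names rather than the library's:

    #include <stdio.h>
    #include <stdlib.h>

    static void *xmy_malloc(size_t size, const char *file, int line) {
      void *p = malloc(size);
      /* A real tracker would record this in a list; logging stands in here. */
      fprintf(stderr, "alloc %zu bytes at %p (%s:%d)\n", size, p, file, line);
      return p;
    }

    /* Call sites keep writing my_malloc(n); the macro records where each
     * allocation happened. */
    #define my_malloc(size) xmy_malloc((size), __FILE__, __LINE__)
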
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_mem/vpx_mem.mk b/chromium/third_party/libvpx/source/libvpx/vpx_mem/vpx_mem.mk
index 4663c5a91f0..7f275eabf92 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_mem/vpx_mem.mk
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_mem/vpx_mem.mk
@@ -2,21 +2,3 @@ MEM_SRCS-yes += vpx_mem.mk
MEM_SRCS-yes += vpx_mem.c
MEM_SRCS-yes += vpx_mem.h
MEM_SRCS-yes += include/vpx_mem_intrnl.h
-
-MEM_SRCS-$(CONFIG_MEM_TRACKER) += vpx_mem_tracker.c
-MEM_SRCS-$(CONFIG_MEM_TRACKER) += include/vpx_mem_tracker.h
-
-MEM_SRCS-$(CONFIG_MEM_MANAGER) += memory_manager/hmm_true.c
-MEM_SRCS-$(CONFIG_MEM_MANAGER) += memory_manager/hmm_resize.c
-MEM_SRCS-$(CONFIG_MEM_MANAGER) += memory_manager/hmm_shrink.c
-MEM_SRCS-$(CONFIG_MEM_MANAGER) += memory_manager/hmm_largest.c
-MEM_SRCS-$(CONFIG_MEM_MANAGER) += memory_manager/hmm_dflt_abort.c
-MEM_SRCS-$(CONFIG_MEM_MANAGER) += memory_manager/hmm_base.c
-MEM_SRCS-$(CONFIG_MEM_MANAGER) += memory_manager/include
-MEM_SRCS-$(CONFIG_MEM_MANAGER) += memory_manager/include/hmm_intrnl.h
-MEM_SRCS-$(CONFIG_MEM_MANAGER) += memory_manager/include/cavl_if.h
-MEM_SRCS-$(CONFIG_MEM_MANAGER) += memory_manager/include/hmm_cnfg.h
-MEM_SRCS-$(CONFIG_MEM_MANAGER) += memory_manager/include/heapmm.h
-MEM_SRCS-$(CONFIG_MEM_MANAGER) += memory_manager/include/cavl_impl.h
-MEM_SRCS-$(CONFIG_MEM_MANAGER) += memory_manager/hmm_grow.c
-MEM_SRCS-$(CONFIG_MEM_MANAGER) += memory_manager/hmm_alloc.c
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_mem/vpx_mem_tracker.c b/chromium/third_party/libvpx/source/libvpx/vpx_mem/vpx_mem_tracker.c
deleted file mode 100644
index 613e8a16b0e..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vpx_mem/vpx_mem_tracker.c
+++ /dev/null
@@ -1,740 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-/*
- vpx_mem_tracker.c
-
- jwz 2003-09-30:
-  Stores a list of addresses, their size, and the file and line they came from.
-  All exposed lib functions are prefixed with vpx_ and keep the global list
-  thread safe.
- Current supported platforms are:
- Linux, Win32, win_ce and vx_works
- Further support can be added by defining the platform specific mutex
- in the memory_tracker struct as well as calls to create/destroy/lock/unlock
- the mutex in vpx_memory_tracker_init/Destroy and memory_tracker_lock_mutex/unlock_mutex
-*/
-#include "./vpx_config.h"
-
-#if defined(__uClinux__)
-# include <lddk.h>
-#endif
-
-#if HAVE_PTHREAD_H
-# include <pthread.h>
-#elif defined(WIN32) || defined(_WIN32_WCE)
-# define WIN32_LEAN_AND_MEAN
-# include <windows.h>
-# include <winbase.h>
-#elif defined(VXWORKS)
-# include <sem_lib.h>
-#endif
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>  // VXWORKS doesn't have a malloc/memory.h file;
-                     // this should pull in malloc, free, etc.
-#include <stdarg.h>
-
-#include "include/vpx_mem_tracker.h"
-
-#undef vpx_malloc // undefine any vpx_mem macros that may affect calls to
-#undef vpx_free // memory functions in this file
-#undef vpx_memcpy
-#undef vpx_memset
-
-
-#ifndef USE_GLOBAL_FUNCTION_POINTERS
-# define USE_GLOBAL_FUNCTION_POINTERS 0 // use function pointers instead of compiled functions.
-#endif
-
-#if USE_GLOBAL_FUNCTION_POINTERS
-static mem_track_malloc_func g_malloc = malloc;
-static mem_track_calloc_func g_calloc = calloc;
-static mem_track_realloc_func g_realloc = realloc;
-static mem_track_free_func g_free = free;
-static mem_track_memcpy_func g_memcpy = memcpy;
-static mem_track_memset_func g_memset = memset;
-static mem_track_memmove_func g_memmove = memmove;
-# define MEM_TRACK_MALLOC g_malloc
-# define MEM_TRACK_FREE g_free
-# define MEM_TRACK_MEMCPY g_memcpy
-# define MEM_TRACK_MEMSET g_memset
-#else
-# define MEM_TRACK_MALLOC vpx_malloc
-# define MEM_TRACK_FREE vpx_free
-# define MEM_TRACK_MEMCPY vpx_memcpy
-# define MEM_TRACK_MEMSET vpx_memset
-#endif // USE_GLOBAL_FUNCTION_POINTERS
-
-/* prototypes for internal library functions */
-static void memtrack_log(const char *fmt, ...);
-static void memory_tracker_dump();
-static void memory_tracker_check_integrity(char *file, unsigned int line);
-static void memory_tracker_add(size_t addr, unsigned int size,
- char *file, unsigned int line,
- int padded);
-static int memory_tracker_remove(size_t addr);
-static struct mem_block *memory_tracker_find(size_t addr);
-
-#if defined(NO_MUTEX)
-# define memory_tracker_lock_mutex() (!g_b_mem_tracker_inited)
-# define memory_tracker_unlock_mutex()
-#else
-static int memory_tracker_lock_mutex();
-static int memory_tracker_unlock_mutex();
-#endif
-
-#ifndef VPX_NO_GLOBALS
-struct memory_tracker {
- struct mem_block *head,
- * tail;
- int len,
- totalsize;
- unsigned int current_allocated,
- max_allocated;
-
-#if HAVE_PTHREAD_H
- pthread_mutex_t mutex;
-#elif defined(WIN32) || defined(_WIN32_WCE)
- HANDLE mutex;
-#elif defined(VXWORKS)
- SEM_ID mutex;
-#elif defined(NO_MUTEX)
-#else
-#error "No mutex type defined for this platform!"
-#endif
-
- int padding_size,
- pad_value;
-};
-
-static struct memory_tracker memtrack; // our global memory allocation list
-static int g_b_mem_tracker_inited = 0; // indicates whether the global list has
-// been initialized (1:yes/0:no)
-static struct {
- FILE *file;
- int type;
- void (*func)(void *userdata, const char *fmt, va_list args);
- void *userdata;
-} g_logging = {NULL, 0, NULL, NULL};
-#else
-# include "vpx_global_handling.h"
-#define g_b_mem_tracker_inited vpxglobalm(vpxmem,g_b_mem_tracker_inited)
-#define g_logging vpxglobalm(vpxmem,g_logging)
-#define memtrack vpxglobalm(vpxmem,memtrack)
-#endif // #ifndef VPX_NO_GLOBALS
-
-extern void *vpx_malloc(size_t size);
-extern void vpx_free(void *memblk);
-extern void *vpx_memcpy(void *dest, const void *src, size_t length);
-extern void *vpx_memset(void *dest, int val, size_t length);
-
-/*
- *
- * Exposed library functions
- *
-*/
-
-/*
- vpx_memory_tracker_init(int padding_size, int pad_value)
- padding_size - the size of the padding before and after each mem addr.
- Values > 0 indicate that integrity checks can be performed
- by inspecting these areas.
- pad_value - the initial value within the padding area before and after
- each mem addr.
-
- Initializes global memory tracker structure
- Allocates the head of the list
-*/
-int vpx_memory_tracker_init(int padding_size, int pad_value) {
- if (!g_b_mem_tracker_inited) {
- if ((memtrack.head = (struct mem_block *)
- MEM_TRACK_MALLOC(sizeof(struct mem_block)))) {
- int ret;
-
- MEM_TRACK_MEMSET(memtrack.head, 0, sizeof(struct mem_block));
-
- memtrack.tail = memtrack.head;
-
- memtrack.current_allocated = 0;
- memtrack.max_allocated = 0;
-
- memtrack.padding_size = padding_size;
- memtrack.pad_value = pad_value;
-
-#if HAVE_PTHREAD_H
- ret = pthread_mutex_init(&memtrack.mutex,
- NULL); /*mutex attributes (NULL=default)*/
-#elif defined(WIN32) || defined(_WIN32_WCE)
- memtrack.mutex = CreateMutex(NULL, /*security attributes*/
- FALSE, /*we don't want initial ownership*/
- NULL); /*mutex name*/
- ret = !memtrack.mutex;
-#elif defined(VXWORKS)
- memtrack.mutex = sem_bcreate(SEM_Q_FIFO, /*SEM_Q_FIFO non-priority based mutex*/
- SEM_FULL); /*SEM_FULL initial state is unlocked*/
- ret = !memtrack.mutex;
-#elif defined(NO_MUTEX)
- ret = 0;
-#endif
-
- if (ret) {
- memtrack_log("vpx_memory_tracker_init: Error creating mutex!\n");
-
- MEM_TRACK_FREE(memtrack.head);
- memtrack.head = NULL;
- } else {
- memtrack_log("Memory Tracker init'd, v."vpx_mem_tracker_version" pad_size:%d pad_val:0x%x %d\n"
-, padding_size
-, pad_value
-, pad_value);
- g_b_mem_tracker_inited = 1;
- }
- }
- }
-
- return g_b_mem_tracker_inited;
-}
-
-/*
- vpx_memory_tracker_destroy()
-  If our global struct was initialized, zeros out all its members,
-  frees memory and destroys its mutex
-*/
-void vpx_memory_tracker_destroy() {
- if (!memory_tracker_lock_mutex()) {
- struct mem_block *p = memtrack.head,
- * p2 = memtrack.head;
-
- memory_tracker_dump();
-
- while (p) {
- p2 = p;
- p = p->next;
-
- MEM_TRACK_FREE(p2);
- }
-
- memtrack.head = NULL;
- memtrack.tail = NULL;
- memtrack.len = 0;
- memtrack.current_allocated = 0;
- memtrack.max_allocated = 0;
-
- if (!g_logging.type && g_logging.file && g_logging.file != stderr) {
- fclose(g_logging.file);
- g_logging.file = NULL;
- }
-
- memory_tracker_unlock_mutex();
-
- g_b_mem_tracker_inited = 0;
- }
-}
-
-/*
- vpx_memory_tracker_add(size_t addr, unsigned int size,
- char * file, unsigned int line)
- addr - memory address to be added to list
- size - size of addr
- file - the file addr was referenced from
- line - the line in file addr was referenced from
-  Adds memory address addr, its size, and the file and line it came from
-  to the global list via the thread-safe internal library function
-*/
-void vpx_memory_tracker_add(size_t addr, unsigned int size,
- char *file, unsigned int line,
- int padded) {
- memory_tracker_add(addr, size, file, line, padded);
-}
-
-/*
- vpx_memory_tracker_remove(size_t addr)
- addr - memory address to be removed from list
-  Removes addr from the global list via the thread-safe
-  internal remove function
- Return:
- Same as described for memory_tracker_remove
-*/
-int vpx_memory_tracker_remove(size_t addr) {
- return memory_tracker_remove(addr);
-}
-
-/*
- vpx_memory_tracker_find(size_t addr)
- addr - address to be found in list
- Return:
- If found, pointer to the memory block that matches addr
- NULL otherwise
-*/
-struct mem_block *vpx_memory_tracker_find(size_t addr) {
- struct mem_block *p = NULL;
-
- if (!memory_tracker_lock_mutex()) {
- p = memory_tracker_find(addr);
- memory_tracker_unlock_mutex();
- }
-
- return p;
-}
-
-/*
- vpx_memory_tracker_dump()
- Locks the memory tracker's mutex and calls the internal
- library function to dump the current contents of the
- global memory allocation list
-*/
-void vpx_memory_tracker_dump() {
- if (!memory_tracker_lock_mutex()) {
- memory_tracker_dump();
- memory_tracker_unlock_mutex();
- }
-}
-
-/*
- vpx_memory_tracker_check_integrity(char* file, unsigned int line)
- file - The file name where the check was placed
- line - The line in file where the check was placed
- Locks the memory tracker's mutex and calls the internal
- integrity check function to inspect every address in the global
- memory allocation list
-*/
-void vpx_memory_tracker_check_integrity(char *file, unsigned int line) {
- if (!memory_tracker_lock_mutex()) {
- memory_tracker_check_integrity(file, line);
- memory_tracker_unlock_mutex();
- }
-}
-
-/*
- vpx_memory_tracker_set_log_type
- Sets the logging type for the memory tracker. Based on the value it will
- direct its output to the appropriate place.
- Return:
- 0: on success
- -1: if the logging type could not be set, because the value was invalid
- or because a file could not be opened
-*/
-int vpx_memory_tracker_set_log_type(int type, char *option) {
- int ret = -1;
-
- switch (type) {
- case 0:
- g_logging.type = 0;
-
- if (!option) {
- g_logging.file = stderr;
- ret = 0;
- } else {
- if ((g_logging.file = fopen((char *)option, "w")))
- ret = 0;
- }
-
- break;
-#if defined(WIN32) && !defined(_WIN32_WCE)
- case 1:
- g_logging.type = type;
- ret = 0;
- break;
-#endif
- default:
- break;
- }
-
- // output the version to the new logging destination
- if (!ret)
- memtrack_log("Memory Tracker logging initialized, "
- "Memory Tracker v."vpx_mem_tracker_version"\n");
-
- return ret;
-}
-
-/*
- vpx_memory_tracker_set_log_func
- Sets a logging function to be used by the memory tracker.
- Return:
- 0: on success
- -1: if the logging type could not be set because logfunc was NULL
-*/
-int vpx_memory_tracker_set_log_func(void *userdata,
- void(*logfunc)(void *userdata,
- const char *fmt, va_list args)) {
- int ret = -1;
-
- if (logfunc) {
- g_logging.type = -1;
- g_logging.userdata = userdata;
- g_logging.func = logfunc;
- ret = 0;
- }
-
- // output the version to the new logging destination
- if (!ret)
- memtrack_log("Memory Tracker logging initialized, "
- "Memory Tracker v."vpx_mem_tracker_version"\n");
-
- return ret;
-}
-
-/*
- *
- * END - Exposed library functions
- *
-*/
-
-
-/*
- *
- * Internal library functions
- *
-*/
-
-static void memtrack_log(const char *fmt, ...) {
- va_list list;
-
- va_start(list, fmt);
-
- switch (g_logging.type) {
- case -1:
-
- if (g_logging.func)
- g_logging.func(g_logging.userdata, fmt, list);
-
- break;
- case 0:
-
- if (g_logging.file) {
- vfprintf(g_logging.file, fmt, list);
- fflush(g_logging.file);
- }
-
- break;
-#if defined(WIN32) && !defined(_WIN32_WCE)
- case 1: {
- char temp[1024];
- _vsnprintf(temp, sizeof(temp) / sizeof(char) - 1, fmt, list);
- OutputDebugString(temp);
- }
- break;
-#endif
- default:
- break;
- }
-
- va_end(list);
-}
-
-/*
- memory_tracker_dump()
- Dumps the current contents of the global memory allocation list
-*/
-static void memory_tracker_dump() {
- int i = 0;
- struct mem_block *p = (memtrack.head ? memtrack.head->next : NULL);
-
- memtrack_log("\n_currently Allocated= %d; Max allocated= %d\n",
- memtrack.current_allocated, memtrack.max_allocated);
-
- while (p) {
-#if defined(WIN32) && !defined(_WIN32_WCE)
-
-    /*when using OutputDebugString, output filenames so they
-      can be clicked to open in Visual Studio*/
- if (g_logging.type == 1)
- memtrack_log("memblocks[%d].addr= 0x%.8x, memblocks[%d].size= %d, file:\n"
- " %s(%d):\n", i,
- p->addr, i, p->size,
- p->file, p->line);
- else
-#endif
- memtrack_log("memblocks[%d].addr= 0x%.8x, memblocks[%d].size= %d, file: %s, line: %d\n", i,
- p->addr, i, p->size,
- p->file, p->line);
-
- p = p->next;
- ++i;
- }
-
- memtrack_log("\n");
-}
-
-/*
-  memory_tracker_check_integrity(char* file, unsigned int line)
-  file - the file name where the check was placed
-  line - the line in file where the check was placed
-  If a padding_size was supplied to vpx_memory_tracker_init()
-  this function will check each addr in the list, verifying that
-  addr-padding_size and addr+padding_size are filled with pad_value
-*/
-static void memory_tracker_check_integrity(char *file, unsigned int line) {
- if (memtrack.padding_size) {
- int i,
- index = 0;
- unsigned char *p_show_me,
- * p_show_me2;
- unsigned int tempme = memtrack.pad_value,
- dead1,
- dead2;
- unsigned char *x_bounds;
- struct mem_block *p = memtrack.head->next;
-
- while (p) {
- // x_bounds = (unsigned char*)p->addr;
- // back up VPX_BYTE_ALIGNMENT
- // x_bounds -= memtrack.padding_size;
-
- if (p->padded) { // can the bounds be checked?
- /*yes, move to the address that was actually allocated
- by the vpx_* calls*/
- x_bounds = (unsigned char *)(((size_t *)p->addr)[-1]);
-
- for (i = 0; i < memtrack.padding_size; i += sizeof(unsigned int)) {
- p_show_me = (x_bounds + i);
- p_show_me2 = (unsigned char *)(p->addr + p->size + i);
-
- MEM_TRACK_MEMCPY(&dead1, p_show_me, sizeof(unsigned int));
- MEM_TRACK_MEMCPY(&dead2, p_show_me2, sizeof(unsigned int));
-
- if ((dead1 != tempme) || (dead2 != tempme)) {
- memtrack_log("\n[vpx_mem integrity check failed]:\n"
- " index[%d,%d] {%s:%d} addr=0x%x, size=%d,"
- " file: %s, line: %d c0:0x%x c1:0x%x\n",
- index, i, file, line, p->addr, p->size, p->file,
- p->line, dead1, dead2);
- }
- }
- }
-
- ++index;
- p = p->next;
- }
- }
-}
-
-/*
- memory_tracker_add(size_t addr, unsigned int size,
- char * file, unsigned int line)
-  Adds an address (addr), its size, and file and line number to our list.
- Adjusts the total bytes allocated and max bytes allocated if necessary.
- If memory cannot be allocated the list will be destroyed.
-*/
-void memory_tracker_add(size_t addr, unsigned int size,
- char *file, unsigned int line,
- int padded) {
- if (!memory_tracker_lock_mutex()) {
- struct mem_block *p;
-
- p = MEM_TRACK_MALLOC(sizeof(struct mem_block));
-
- if (p) {
- p->prev = memtrack.tail;
- p->prev->next = p;
- p->addr = addr;
- p->size = size;
- p->line = line;
- p->file = file;
- p->padded = padded;
- p->next = NULL;
-
- memtrack.tail = p;
-
- memtrack.current_allocated += size;
-
- if (memtrack.current_allocated > memtrack.max_allocated)
- memtrack.max_allocated = memtrack.current_allocated;
-
- // memtrack_log("memory_tracker_add: added addr=0x%.8x\n", addr);
-
- memory_tracker_unlock_mutex();
- } else {
- memtrack_log("memory_tracker_add: error allocating memory!\n");
- memory_tracker_unlock_mutex();
- vpx_memory_tracker_destroy();
- }
- }
-}
-
-/*
- memory_tracker_remove(size_t addr)
- Removes an address and its corresponding size (if they exist)
- from the memory tracker list and adjusts the current number
- of bytes allocated.
- Return:
- 0: on success
- -1: if the mutex could not be locked
- -2: if the addr was not found in the list
-*/
-int memory_tracker_remove(size_t addr) {
- int ret = -1;
-
- if (!memory_tracker_lock_mutex()) {
- struct mem_block *p;
-
- if ((p = memory_tracker_find(addr))) {
- memtrack.current_allocated -= p->size;
-
- p->prev->next = p->next;
-
- if (p->next)
- p->next->prev = p->prev;
- else
- memtrack.tail = p->prev;
-
- ret = 0;
- MEM_TRACK_FREE(p);
- } else {
- if (addr)
- memtrack_log("memory_tracker_remove(): addr not found in list,"
- " 0x%.8x\n", addr);
-
- ret = -2;
- }
-
- memory_tracker_unlock_mutex();
- }
-
- return ret;
-}
-
-/*
- memory_tracker_find(size_t addr)
- Finds an address in our addrs list
- NOTE: the mutex MUST be locked in the other internal
- functions before calling this one. This avoids
- the need for repeated locking and unlocking as in Remove
- Returns: pointer to the mem block if found, NULL otherwise
-*/
-static struct mem_block *memory_tracker_find(size_t addr) {
- struct mem_block *p = NULL;
-
- if (memtrack.head) {
- p = memtrack.head->next;
-
- while (p && (p->addr != addr))
- p = p->next;
- }
-
- return p;
-}
-
-
-#if !defined(NO_MUTEX)
-/*
- memory_tracker_lock_mutex()
- Locks the memory tracker mutex with a platform specific call
- Returns:
- 0: Success
- <0: Failure, either the mutex was not initialized
- or the call to lock the mutex failed
-*/
-static int memory_tracker_lock_mutex() {
- int ret = -1;
-
- if (g_b_mem_tracker_inited) {
-
-#if HAVE_PTHREAD_H
- ret = pthread_mutex_lock(&memtrack.mutex);
-#elif defined(WIN32) || defined(_WIN32_WCE)
- ret = WaitForSingleObject(memtrack.mutex, INFINITE);
-#elif defined(VXWORKS)
- ret = sem_take(memtrack.mutex, WAIT_FOREVER);
-#endif
-
- if (ret) {
- memtrack_log("memory_tracker_lock_mutex: mutex lock failed\n");
- }
- }
-
- return ret;
-}
-
-/*
- memory_tracker_unlock_mutex()
- Unlocks the memory tracker mutex with a platform specific call
- Returns:
- 0: Success
- <0: Failure, either the mutex was not initialized
- or the call to unlock the mutex failed
-*/
-static int memory_tracker_unlock_mutex() {
- int ret = -1;
-
- if (g_b_mem_tracker_inited) {
-
-#if HAVE_PTHREAD_H
- ret = pthread_mutex_unlock(&memtrack.mutex);
-#elif defined(WIN32) || defined(_WIN32_WCE)
- ret = !ReleaseMutex(memtrack.mutex);
-#elif defined(VXWORKS)
- ret = sem_give(memtrack.mutex);
-#endif
-
- if (ret) {
- memtrack_log("memory_tracker_unlock_mutex: mutex unlock failed\n");
- }
- }
-
- return ret;
-}
-#endif
-
-/*
- vpx_memory_tracker_set_functions
-
- Sets the function pointers for the standard library functions.
-
- Return:
- 0: on success
-    -1: if USE_GLOBAL_FUNCTION_POINTERS is not enabled.
-*/
-int vpx_memory_tracker_set_functions(mem_track_malloc_func g_malloc_l
-, mem_track_calloc_func g_calloc_l
-, mem_track_realloc_func g_realloc_l
-, mem_track_free_func g_free_l
-, mem_track_memcpy_func g_memcpy_l
-, mem_track_memset_func g_memset_l
-, mem_track_memmove_func g_memmove_l) {
-#if USE_GLOBAL_FUNCTION_POINTERS
-
- if (g_malloc_l)
- g_malloc = g_malloc_l;
-
- if (g_calloc_l)
- g_calloc = g_calloc_l;
-
- if (g_realloc_l)
- g_realloc = g_realloc_l;
-
- if (g_free_l)
- g_free = g_free_l;
-
- if (g_memcpy_l)
- g_memcpy = g_memcpy_l;
-
- if (g_memset_l)
- g_memset = g_memset_l;
-
- if (g_memmove_l)
- g_memmove = g_memmove_l;
-
- return 0;
-#else
- (void)g_malloc_l;
- (void)g_calloc_l;
- (void)g_realloc_l;
- (void)g_free_l;
- (void)g_memcpy_l;
- (void)g_memset_l;
- (void)g_memmove_l;
- return -1;
-#endif
-}
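
The deleted tracker's integrity check reduces to a simple idea: surround each
allocation with pads filled with a sentinel byte, then verify the pads later
to catch out-of-bounds writes. A self-contained sketch, with illustrative PAD
and SENTINEL values:

    #include <stdlib.h>
    #include <string.h>

    #define PAD 32
    #define SENTINEL 0xDE

    static void *padded_alloc(size_t size) {
      unsigned char *raw = malloc(size + 2 * PAD);
      if (!raw) return NULL;
      memset(raw, SENTINEL, PAD);              /* leading pad */
      memset(raw + PAD + size, SENTINEL, PAD); /* trailing pad */
      return raw + PAD;
    }

    /* Returns 0 if both pads are intact, -1 if either was overwritten. */
    static int padded_check(const void *p, size_t size) {
      const unsigned char *raw = (const unsigned char *)p - PAD;
      size_t i;
      for (i = 0; i < PAD; ++i)
        if (raw[i] != SENTINEL || raw[PAD + size + i] != SENTINEL)
          return -1;
      return 0;
    }
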
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_ports/arm_cpudetect.c b/chromium/third_party/libvpx/source/libvpx/vpx_ports/arm_cpudetect.c
index f03feffbc24..8a4b8af964f 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_ports/arm_cpudetect.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_ports/arm_cpudetect.c
@@ -49,9 +49,6 @@ int arm_cpu_caps(void) {
return flags;
}
mask = arm_cpu_env_mask();
-#if HAVE_EDSP
- flags |= HAS_EDSP;
-#endif /* HAVE_EDSP */
#if HAVE_MEDIA
flags |= HAS_MEDIA;
#endif /* HAVE_MEDIA */
@@ -78,17 +75,6 @@ int arm_cpu_caps(void) {
* instructions via their assembled hex code.
* All of these instructions should be essentially nops.
*/
-#if HAVE_EDSP
- if (mask & HAS_EDSP) {
- __try {
- /*PLD [r13]*/
- __emit(0xF5DDF000);
- flags |= HAS_EDSP;
- } __except (GetExceptionCode() == EXCEPTION_ILLEGAL_INSTRUCTION) {
- /*Ignore exception.*/
- }
- }
-#endif /* HAVE_EDSP */
#if HAVE_MEDIA
if (mask & HAS_MEDIA)
__try {
@@ -127,9 +113,6 @@ int arm_cpu_caps(void) {
mask = arm_cpu_env_mask();
features = android_getCpuFeatures();
-#if HAVE_EDSP
- flags |= HAS_EDSP;
-#endif /* HAVE_EDSP */
#if HAVE_MEDIA
flags |= HAS_MEDIA;
#endif /* HAVE_MEDIA */
@@ -163,23 +146,15 @@ int arm_cpu_caps(void) {
*/
char buf[512];
while (fgets(buf, 511, fin) != NULL) {
-#if HAVE_EDSP || HAVE_NEON || HAVE_NEON_ASM
+#if HAVE_NEON || HAVE_NEON_ASM
if (memcmp(buf, "Features", 8) == 0) {
char *p;
-#if HAVE_EDSP
- p = strstr(buf, " edsp");
- if (p != NULL && (p[5] == ' ' || p[5] == '\n')) {
- flags |= HAS_EDSP;
- }
-#endif /* HAVE_EDSP */
-#if HAVE_NEON || HAVE_NEON_ASM
p = strstr(buf, " neon");
if (p != NULL && (p[5] == ' ' || p[5] == '\n')) {
flags |= HAS_NEON;
}
-#endif /* HAVE_NEON || HAVE_NEON_ASM */
}
-#endif /* HAVE_EDSP || HAVE_NEON || HAVE_NEON_ASM */
+#endif /* HAVE_NEON || HAVE_NEON_ASM */
#if HAVE_MEDIA
if (memcmp(buf, "CPU architecture:", 17) == 0) {
int version;
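
The Linux path above parses /proc/cpuinfo and looks for a whole-word flag on
the "Features" line; the delimiter check is what keeps "neon" from matching
inside a longer token. A generic sketch of the same probe (cpuinfo_has_flag is
an illustrative helper, not part of libvpx):

    #include <stdio.h>
    #include <string.h>

    static int cpuinfo_has_flag(const char *flag) {
      char buf[512], pat[64];
      int found = 0;
      FILE *fin = fopen("/proc/cpuinfo", "r");
      if (!fin) return 0;
      snprintf(pat, sizeof(pat), " %s", flag); /* e.g. " neon" */
      while (fgets(buf, sizeof(buf), fin) != NULL) {
        if (memcmp(buf, "Features", 8) == 0) {
          const char *p = strstr(buf, pat);
          /* Require a delimiter after the match so "neon" != "neonx". */
          if (p != NULL &&
              (p[strlen(pat)] == ' ' || p[strlen(pat)] == '\n'))
            found = 1;
        }
      }
      fclose(fin);
      return found;
    }
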
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_ports/asm_offsets.h b/chromium/third_party/libvpx/source/libvpx/vpx_ports/asm_offsets.h
deleted file mode 100644
index 317bbedcb13..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vpx_ports/asm_offsets.h
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * Copyright (c) 2011 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef VPX_PORTS_ASM_OFFSETS_H_
-#define VPX_PORTS_ASM_OFFSETS_H_
-
-#include <stddef.h>
-
-#define ct_assert(name,cond) \
- static void assert_##name(void) UNUSED;\
- static void assert_##name(void) {switch(0){case 0:case !!(cond):;}}
-
-#if INLINE_ASM
-#define DEFINE(sym, val) asm("\n" #sym " EQU %0" : : "i" (val))
-#define BEGIN int main(void) {
-#define END return 0; }
-#else
-#define DEFINE(sym, val) const int sym = val
-#define BEGIN
-#define END
-#endif
-
-#endif // VPX_PORTS_ASM_OFFSETS_H_
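
The removed ct_assert macro turns a false condition into a compile error: a
switch statement may not repeat a case label, so case !!(cond) collides with
case 0 exactly when cond is false. A usage sketch, assuming a GCC-style UNUSED
attribute since the header relied on one defined elsewhere:

    #define UNUSED __attribute__((unused)) /* assumption: GCC/Clang */
    #define ct_assert(name, cond) \
      static void assert_##name(void) UNUSED; \
      static void assert_##name(void) { switch (0) { case 0: case !!(cond):; } }

    ct_assert(int_is_4_bytes, sizeof(int) == 4) /* compiles only if true */
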
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_ports/mem.h b/chromium/third_party/libvpx/source/libvpx/vpx_ports/mem.h
index 1cb8c8cd9af..0106a45d6e4 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_ports/mem.h
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_ports/mem.h
@@ -24,17 +24,6 @@
#define DECLARE_ALIGNED(n,typ,val) typ val
#endif
-
-/* Declare an aligned array on the stack, for situations where the stack
- * pointer may not have the alignment we expect. Creates an array with a
- * modified name, then defines val to be a pointer, and aligns that pointer
- * within the array.
- */
-#define DECLARE_ALIGNED_ARRAY(a,typ,val,n)\
- typ val##_[(n)+(a)/sizeof(typ)+1];\
- typ *val = (typ*)((((intptr_t)val##_)+(a)-1)&((intptr_t)-(a)))
-
-
/* Indicates that the usage of the specified variable has been audited to assure
* that it's safe to use uninitialized. Silences 'may be used uninitialized'
* warnings on gcc.
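
The removed DECLARE_ALIGNED_ARRAY macro solved stack alignment portably:
declare an over-sized local array, then round a pointer up to the requested
alignment inside it. Expanded by hand below for a 16-byte-aligned array of 64
int16_t (demo is an illustrative function name):

    #include <stdint.h>

    void demo(void) {
      /* Extra elements cover the worst-case shift plus the rounding. */
      int16_t buf_[64 + 16 / sizeof(int16_t) + 1];
      /* Round the array's address up to the next multiple of 16. */
      int16_t *buf = (int16_t *)(((intptr_t)buf_ + 15) & (intptr_t)-16);
      buf[0] = 0; /* always use buf, never buf_ */
    }
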
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_ports/vpx_once.h b/chromium/third_party/libvpx/source/libvpx/vpx_ports/vpx_once.h
index bd9eebd643b..f1df3943457 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_ports/vpx_once.h
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_ports/vpx_once.h
@@ -110,7 +110,7 @@ static void once(void (*func)(void))
#else
-/* No-op version that performs no synchronization. vp8_rtcd() is idempotent,
+/* No-op version that performs no synchronization. *_rtcd() is idempotent,
* so as long as your platform provides atomic loads/stores of pointers
* no synchronization is strictly necessary.
*/
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_ports/vpx_ports.mk b/chromium/third_party/libvpx/source/libvpx/vpx_ports/vpx_ports.mk
index 869a204fbc7..a7275431fe9 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_ports/vpx_ports.mk
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_ports/vpx_ports.mk
@@ -11,7 +11,6 @@
PORTS_SRCS-yes += vpx_ports.mk
-PORTS_SRCS-$(BUILD_LIBVPX) += asm_offsets.h
PORTS_SRCS-$(BUILD_LIBVPX) += mem.h
PORTS_SRCS-$(BUILD_LIBVPX) += vpx_timer.h
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_ports/x86.h b/chromium/third_party/libvpx/source/libvpx/vpx_ports/x86.h
index 81c2b8b873f..7d93710c4b0 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_ports/x86.h
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_ports/x86.h
@@ -13,6 +13,7 @@
#define VPX_PORTS_X86_H_
#include <stdlib.h>
#include "vpx_config.h"
+#include "vpx/vpx_integer.h"
#ifdef __cplusplus
extern "C" {
@@ -104,6 +105,37 @@ void __cpuid(int CPUInfo[4], int info_type);
#endif
#endif /* end others */
+// NaCl has no support for xgetbv or the raw opcode.
+#if !defined(__native_client__) && (defined(__i386__) || defined(__x86_64__))
+static INLINE uint64_t xgetbv(void) {
+ const uint32_t ecx = 0;
+ uint32_t eax, edx;
+ // Use the raw opcode for xgetbv for compatibility with older toolchains.
+ __asm__ volatile (
+ ".byte 0x0f, 0x01, 0xd0\n"
+ : "=a"(eax), "=d"(edx) : "c" (ecx));
+ return ((uint64_t)edx << 32) | eax;
+}
+#elif (defined(_M_X64) || defined(_M_IX86)) && \
+ defined(_MSC_FULL_VER) && _MSC_FULL_VER >= 160040219 // >= VS2010 SP1
+#include <immintrin.h>
+#define xgetbv() _xgetbv(0)
+#elif defined(_MSC_VER) && defined(_M_IX86)
+static INLINE uint64_t xgetbv(void) {
+ uint32_t eax_, edx_;
+ __asm {
+ xor ecx, ecx // ecx = 0
+ // Use the raw opcode for xgetbv for compatibility with older toolchains.
+ __asm _emit 0x0f __asm _emit 0x01 __asm _emit 0xd0
+ mov eax_, eax
+ mov edx_, edx
+ }
+ return ((uint64_t)edx_ << 32) | eax_;
+}
+#else
+#define xgetbv() 0U // no AVX for older x64 or unrecognized toolchains.
+#endif
+
#define HAS_MMX 0x01
#define HAS_SSE 0x02
#define HAS_SSE2 0x04
@@ -120,7 +152,7 @@ static INLINE int
x86_simd_caps(void) {
unsigned int flags = 0;
unsigned int mask = ~0;
- unsigned int reg_eax, reg_ebx, reg_ecx, reg_edx;
+ unsigned int max_cpuid_val, reg_eax, reg_ebx, reg_ecx, reg_edx;
char *env;
(void)reg_ebx;
@@ -136,9 +168,9 @@ x86_simd_caps(void) {
mask = strtol(env, NULL, 0);
/* Ensure that the CPUID instruction supports extended features */
- cpuid(0, 0, reg_eax, reg_ebx, reg_ecx, reg_edx);
+ cpuid(0, 0, max_cpuid_val, reg_ebx, reg_ecx, reg_edx);
- if (reg_eax < 1)
+ if (max_cpuid_val < 1)
return 0;
/* Get the standard feature flags */
@@ -156,14 +188,19 @@ x86_simd_caps(void) {
if (reg_ecx & BIT(19)) flags |= HAS_SSE4_1;
- if (reg_ecx & BIT(28)) flags |= HAS_AVX;
+ // bits 27 (OSXSAVE) & 28 (256-bit AVX)
+ if ((reg_ecx & (BIT(27) | BIT(28))) == (BIT(27) | BIT(28))) {
+ if ((xgetbv() & 0x6) == 0x6) {
+ flags |= HAS_AVX;
- /* Get the leaf 7 feature flags. Needed to check for AVX2 support */
- reg_eax = 7;
- reg_ecx = 0;
- cpuid(7, 0, reg_eax, reg_ebx, reg_ecx, reg_edx);
+ if (max_cpuid_val >= 7) {
+ /* Get the leaf 7 feature flags. Needed to check for AVX2 support */
+ cpuid(7, 0, reg_eax, reg_ebx, reg_ecx, reg_edx);
- if (reg_ebx & BIT(5)) flags |= HAS_AVX2;
+ if (reg_ebx & BIT(5)) flags |= HAS_AVX2;
+ }
+ }
+ }
return flags & mask;
}
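
The new AVX gate checks three things in order: the CPU advertises AVX
(CPUID.1:ECX bit 28), the OS has enabled XSAVE (bit 27, OSXSAVE), and XCR0
confirms the OS actually saves both SSE and AVX register state (bits 1 and 2).
A condensed, self-contained sketch of the same logic for GCC/Clang on x86
(my_xgetbv and avx_usable are illustrative names):

    #include <stdint.h>

    #define BIT(n) (1u << (n))

    /* xgetbv via the raw opcode, as in the patch above. */
    static uint64_t my_xgetbv(void) {
      uint32_t eax, edx;
      __asm__ volatile(".byte 0x0f, 0x01, 0xd0"
                       : "=a"(eax), "=d"(edx) : "c"(0));
      return ((uint64_t)edx << 32) | eax;
    }

    static int avx_usable(uint32_t cpuid1_ecx) {
      /* Check OSXSAVE first: executing xgetbv without it faults. */
      if ((cpuid1_ecx & (BIT(27) | BIT(28))) != (BIT(27) | BIT(28)))
        return 0;
      /* XCR0 bits 1 (SSE state) and 2 (AVX state) must both be set. */
      return (my_xgetbv() & 0x6) == 0x6;
    }
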
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_ports/x86_abi_support.asm b/chromium/third_party/libvpx/source/libvpx/vpx_ports/x86_abi_support.asm
index 3814ef44380..c94b76a0606 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_ports/x86_abi_support.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_ports/x86_abi_support.asm
@@ -395,7 +395,7 @@ section .text
 ; On Android platforms use lrand48 when building postproc routines. Prior to
 ; Android L, rand() was not available.
-%if CONFIG_POSTPROC=1
+%if CONFIG_POSTPROC=1 || CONFIG_VP9_POSTPROC=1
%ifdef __ANDROID__
extern sym(lrand48)
%define LIBVPX_RAND lrand48
@@ -403,4 +403,4 @@ extern sym(lrand48)
extern sym(rand)
%define LIBVPX_RAND rand
%endif
-%endif ; CONFIG_POSTPROC
+%endif ; CONFIG_POSTPROC || CONFIG_VP9_POSTPROC
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_scale/generic/gen_scalers.c b/chromium/third_party/libvpx/source/libvpx/vpx_scale/generic/gen_scalers.c
index 5f355c5a6bb..995c45b6ab6 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_scale/generic/gen_scalers.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_scale/generic/gen_scalers.c
@@ -215,7 +215,7 @@ void vp8_vertical_band_2_1_scale_c(unsigned char *source,
unsigned int dest_width) {
(void) dest_pitch;
(void) src_pitch;
- vpx_memcpy(dest, source, dest_width);
+ memcpy(dest, source, dest_width);
}
void vp8_vertical_band_2_1_scale_i_c(unsigned char *source,
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_scale/generic/vpx_scale.c b/chromium/third_party/libvpx/source/libvpx/vpx_scale/generic/vpx_scale.c
index 8044d2ad776..089e673757c 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_scale/generic/vpx_scale.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_scale/generic/vpx_scale.c
@@ -379,7 +379,7 @@ void Scale2D
vert_band_scale(temp_area + dest_pitch, dest_pitch, dest, dest_pitch, dest_width);
if (interpolation)
- vpx_memcpy(temp_area, temp_area + source_band_height * dest_pitch, dest_width);
+ memcpy(temp_area, temp_area + source_band_height * dest_pitch, dest_width);
/* Next band... */
source += (unsigned long) source_band_height * source_pitch;
@@ -432,7 +432,7 @@ void Scale2D
temp_area + i * dest_pitch, 1, hratio, dest_width);
} else { /* Duplicate the last row */
/* copy temp_area row 0 over from last row in the past */
- vpx_memcpy(temp_area + i * dest_pitch, temp_area + (i - 1)*dest_pitch, dest_pitch);
+ memcpy(temp_area + i * dest_pitch, temp_area + (i - 1)*dest_pitch, dest_pitch);
}
}
@@ -443,7 +443,7 @@ void Scale2D
}
/* copy temp_area row 0 over from last row in the past */
- vpx_memcpy(temp_area, temp_area + source_band_height * dest_pitch, dest_pitch);
+ memcpy(temp_area, temp_area + source_band_height * dest_pitch, dest_pitch);
/* move to the next band */
source += source_band_height * source_pitch;
@@ -498,11 +498,11 @@ void vpx_scale_frame
if (dw < (int)dst->y_width)
for (i = 0; i < dh; i++)
- vpx_memset(dst->y_buffer + i * dst->y_stride + dw - 1, dst->y_buffer[i * dst->y_stride + dw - 2], dst->y_width - dw + 1);
+ memset(dst->y_buffer + i * dst->y_stride + dw - 1, dst->y_buffer[i * dst->y_stride + dw - 2], dst->y_width - dw + 1);
if (dh < (int)dst->y_height)
for (i = dh - 1; i < (int)dst->y_height; i++)
- vpx_memcpy(dst->y_buffer + i * dst->y_stride, dst->y_buffer + (dh - 2) * dst->y_stride, dst->y_width + 1);
+ memcpy(dst->y_buffer + i * dst->y_stride, dst->y_buffer + (dh - 2) * dst->y_stride, dst->y_width + 1);
Scale2D((unsigned char *) src->u_buffer, src->uv_stride, src->uv_width, src->uv_height,
(unsigned char *) dst->u_buffer, dst->uv_stride, dw / 2, dh / 2,
@@ -510,11 +510,11 @@ void vpx_scale_frame
if (dw / 2 < (int)dst->uv_width)
for (i = 0; i < dst->uv_height; i++)
- vpx_memset(dst->u_buffer + i * dst->uv_stride + dw / 2 - 1, dst->u_buffer[i * dst->uv_stride + dw / 2 - 2], dst->uv_width - dw / 2 + 1);
+ memset(dst->u_buffer + i * dst->uv_stride + dw / 2 - 1, dst->u_buffer[i * dst->uv_stride + dw / 2 - 2], dst->uv_width - dw / 2 + 1);
if (dh / 2 < (int)dst->uv_height)
for (i = dh / 2 - 1; i < (int)dst->y_height / 2; i++)
- vpx_memcpy(dst->u_buffer + i * dst->uv_stride, dst->u_buffer + (dh / 2 - 2)*dst->uv_stride, dst->uv_width);
+ memcpy(dst->u_buffer + i * dst->uv_stride, dst->u_buffer + (dh / 2 - 2)*dst->uv_stride, dst->uv_width);
Scale2D((unsigned char *) src->v_buffer, src->uv_stride, src->uv_width, src->uv_height,
(unsigned char *) dst->v_buffer, dst->uv_stride, dw / 2, dh / 2,
@@ -522,9 +522,9 @@ void vpx_scale_frame
if (dw / 2 < (int)dst->uv_width)
for (i = 0; i < dst->uv_height; i++)
- vpx_memset(dst->v_buffer + i * dst->uv_stride + dw / 2 - 1, dst->v_buffer[i * dst->uv_stride + dw / 2 - 2], dst->uv_width - dw / 2 + 1);
+ memset(dst->v_buffer + i * dst->uv_stride + dw / 2 - 1, dst->v_buffer[i * dst->uv_stride + dw / 2 - 2], dst->uv_width - dw / 2 + 1);
if (dh / 2 < (int) dst->uv_height)
for (i = dh / 2 - 1; i < (int)dst->y_height / 2; i++)
- vpx_memcpy(dst->v_buffer + i * dst->uv_stride, dst->v_buffer + (dh / 2 - 2)*dst->uv_stride, dst->uv_width);
+ memcpy(dst->v_buffer + i * dst->uv_stride, dst->v_buffer + (dh / 2 - 2)*dst->uv_stride, dst->uv_width);
}
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_scale/generic/yv12config.c b/chromium/third_party/libvpx/source/libvpx/vpx_scale/generic/yv12config.c
index 00a8c163a6a..169c2ab2d73 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_scale/generic/yv12config.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_scale/generic/yv12config.c
@@ -38,7 +38,7 @@ vp8_yv12_de_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf) {
/* buffer_alloc isn't accessed by most functions. Rather y_buffer,
u_buffer and v_buffer point to buffer_alloc and are used. Clear out
all of this so that a freed pointer isn't inadvertently used */
- vpx_memset(ybf, 0, sizeof(YV12_BUFFER_CONFIG));
+ memset(ybf, 0, sizeof(YV12_BUFFER_CONFIG));
} else {
return -1;
}
@@ -128,7 +128,7 @@ int vp9_free_frame_buffer(YV12_BUFFER_CONFIG *ybf) {
/* buffer_alloc isn't accessed by most functions. Rather y_buffer,
u_buffer and v_buffer point to buffer_alloc and are used. Clear out
all of this so that a freed pointer isn't inadvertently used */
- vpx_memset(ybf, 0, sizeof(YV12_BUFFER_CONFIG));
+ memset(ybf, 0, sizeof(YV12_BUFFER_CONFIG));
} else {
return -1;
}
@@ -143,22 +143,25 @@ int vp9_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf,
int use_highbitdepth,
#endif
int border,
+ int byte_alignment,
vpx_codec_frame_buffer_t *fb,
vpx_get_frame_buffer_cb_fn_t cb,
void *cb_priv) {
if (ybf) {
+ const int vp9_byte_align = (byte_alignment == 0) ? 1 : byte_alignment;
const int aligned_width = (width + 7) & ~7;
const int aligned_height = (height + 7) & ~7;
const int y_stride = ((aligned_width + 2 * border) + 31) & ~31;
const uint64_t yplane_size = (aligned_height + 2 * border) *
- (uint64_t)y_stride;
+ (uint64_t)y_stride + byte_alignment;
const int uv_width = aligned_width >> ss_x;
const int uv_height = aligned_height >> ss_y;
const int uv_stride = y_stride >> ss_x;
const int uv_border_w = border >> ss_x;
const int uv_border_h = border >> ss_y;
const uint64_t uvplane_size = (uv_height + 2 * uv_border_h) *
- (uint64_t)uv_stride;
+ (uint64_t)uv_stride + byte_alignment;
+
#if CONFIG_ALPHA
const int alpha_width = aligned_width;
const int alpha_height = aligned_height;
@@ -166,7 +169,7 @@ int vp9_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf,
const int alpha_border_w = border;
const int alpha_border_h = border;
const uint64_t alpha_plane_size = (alpha_height + 2 * alpha_border_h) *
- (uint64_t)alpha_stride;
+ (uint64_t)alpha_stride + byte_alignment;
#if CONFIG_VP9_HIGHBITDEPTH
const uint64_t frame_size = (1 + use_highbitdepth) *
(yplane_size + 2 * uvplane_size + alpha_plane_size);
@@ -182,6 +185,9 @@ int vp9_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf,
const uint64_t frame_size = yplane_size + 2 * uvplane_size;
#endif // CONFIG_VP9_HIGHBITDEPTH
#endif // CONFIG_ALPHA
+
+ uint8_t *buf = NULL;
+
if (cb != NULL) {
const int align_addr_extra_size = 31;
const uint64_t external_frame_size = frame_size + align_addr_extra_size;
@@ -216,7 +222,7 @@ int vp9_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf,
// This memset is needed for fixing valgrind error from C loop filter
// due to access uninitialized memory in frame border. It could be
// removed if border is totally removed.
- vpx_memset(ybf->buffer_alloc, 0, ybf->buffer_alloc_sz);
+ memset(ybf->buffer_alloc, 0, ybf->buffer_alloc_sz);
}
/* Only support allocating buffers that have a border that's a multiple
@@ -244,38 +250,33 @@ int vp9_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf,
ybf->subsampling_x = ss_x;
ybf->subsampling_y = ss_y;
+ buf = ybf->buffer_alloc;
#if CONFIG_VP9_HIGHBITDEPTH
if (use_highbitdepth) {
// Store uint16 addresses when using 16bit framebuffers
- uint8_t *p = CONVERT_TO_BYTEPTR(ybf->buffer_alloc);
- ybf->y_buffer = p + (border * y_stride) + border;
- ybf->u_buffer = p + yplane_size +
- (uv_border_h * uv_stride) + uv_border_w;
- ybf->v_buffer = p + yplane_size + uvplane_size +
- (uv_border_h * uv_stride) + uv_border_w;
+ buf = CONVERT_TO_BYTEPTR(ybf->buffer_alloc);
ybf->flags = YV12_FLAG_HIGHBITDEPTH;
} else {
- ybf->y_buffer = ybf->buffer_alloc + (border * y_stride) + border;
- ybf->u_buffer = ybf->buffer_alloc + yplane_size +
- (uv_border_h * uv_stride) + uv_border_w;
- ybf->v_buffer = ybf->buffer_alloc + yplane_size + uvplane_size +
- (uv_border_h * uv_stride) + uv_border_w;
ybf->flags = 0;
}
-#else
- ybf->y_buffer = ybf->buffer_alloc + (border * y_stride) + border;
- ybf->u_buffer = ybf->buffer_alloc + yplane_size +
- (uv_border_h * uv_stride) + uv_border_w;
- ybf->v_buffer = ybf->buffer_alloc + yplane_size + uvplane_size +
- (uv_border_h * uv_stride) + uv_border_w;
#endif // CONFIG_VP9_HIGHBITDEPTH
+ ybf->y_buffer = (uint8_t *)yv12_align_addr(
+ buf + (border * y_stride) + border, vp9_byte_align);
+ ybf->u_buffer = (uint8_t *)yv12_align_addr(
+ buf + yplane_size + (uv_border_h * uv_stride) + uv_border_w,
+ vp9_byte_align);
+ ybf->v_buffer = (uint8_t *)yv12_align_addr(
+ buf + yplane_size + uvplane_size + (uv_border_h * uv_stride) +
+ uv_border_w, vp9_byte_align);
+
#if CONFIG_ALPHA
ybf->alpha_width = alpha_width;
ybf->alpha_height = alpha_height;
ybf->alpha_stride = alpha_stride;
- ybf->alpha_buffer = ybf->buffer_alloc + yplane_size + 2 * uvplane_size +
- (alpha_border_h * alpha_stride) + alpha_border_w;
+ ybf->alpha_buffer = (uint8_t *)yv12_align_addr(
+ buf + yplane_size + 2 * uvplane_size +
+ (alpha_border_h * alpha_stride) + alpha_border_w, vp9_byte_align);
#endif
ybf->corrupted = 0; /* assume not corrupted by errors */
return 0;
@@ -289,14 +290,15 @@ int vp9_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf,
#if CONFIG_VP9_HIGHBITDEPTH
int use_highbitdepth,
#endif
- int border) {
+ int border,
+ int byte_alignment) {
if (ybf) {
vp9_free_frame_buffer(ybf);
return vp9_realloc_frame_buffer(ybf, width, height, ss_x, ss_y,
#if CONFIG_VP9_HIGHBITDEPTH
use_highbitdepth,
#endif
- border, NULL, NULL, NULL);
+ border, byte_alignment, NULL, NULL, NULL);
}
return -2;
}
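
The byte_alignment change above has two halves: each plane's size is padded by
the alignment so there is slack to round into, and each plane pointer is then
rounded up to the alignment. A sketch of the rounding step, assuming the
alignment is a power of two (align_up stands in for the patch's
yv12_align_addr):

    #include <stdint.h>

    /* Round p up to the next multiple of align (align a power of two). */
    static uint8_t *align_up(uint8_t *p, int align) {
      return (uint8_t *)(((uintptr_t)p + align - 1) &
                         ~(uintptr_t)(align - 1));
    }

    /* e.g. the y plane: skip the border first, then align. The
     * byte_alignment slack added to yplane_size guarantees the rounded-up
     * pointer still fits inside the plane's allocation. */
    static uint8_t *y_plane_start(uint8_t *buf, int border, int y_stride,
                                  int align) {
      return align_up(buf + border * y_stride + border, align);
    }
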
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_scale/generic/yv12extend.c b/chromium/third_party/libvpx/source/libvpx/vpx_scale/generic/yv12extend.c
index 0485452aec3..6214a12189c 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_scale/generic/yv12extend.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_scale/generic/yv12extend.c
@@ -31,8 +31,8 @@ static void extend_plane(uint8_t *const src, int src_stride,
uint8_t *dst_ptr2 = src + width;
for (i = 0; i < height; ++i) {
- vpx_memset(dst_ptr1, src_ptr1[0], extend_left);
- vpx_memset(dst_ptr2, src_ptr2[0], extend_right);
+ memset(dst_ptr1, src_ptr1[0], extend_left);
+ memset(dst_ptr2, src_ptr2[0], extend_right);
src_ptr1 += src_stride;
src_ptr2 += src_stride;
dst_ptr1 += src_stride;
@@ -48,12 +48,12 @@ static void extend_plane(uint8_t *const src, int src_stride,
dst_ptr2 = src + src_stride * height - extend_left;
for (i = 0; i < extend_top; ++i) {
- vpx_memcpy(dst_ptr1, src_ptr1, linesize);
+ memcpy(dst_ptr1, src_ptr1, linesize);
dst_ptr1 += src_stride;
}
for (i = 0; i < extend_bottom; ++i) {
- vpx_memcpy(dst_ptr2, src_ptr2, linesize);
+ memcpy(dst_ptr2, src_ptr2, linesize);
dst_ptr2 += src_stride;
}
}
@@ -91,12 +91,12 @@ static void extend_plane_high(uint8_t *const src8, int src_stride,
dst_ptr2 = src + src_stride * height - extend_left;
for (i = 0; i < extend_top; ++i) {
- vpx_memcpy(dst_ptr1, src_ptr1, linesize * sizeof(uint16_t));
+ memcpy(dst_ptr1, src_ptr1, linesize * sizeof(uint16_t));
dst_ptr1 += src_stride;
}
for (i = 0; i < extend_bottom; ++i) {
- vpx_memcpy(dst_ptr2, src_ptr2, linesize * sizeof(uint16_t));
+ memcpy(dst_ptr2, src_ptr2, linesize * sizeof(uint16_t));
dst_ptr2 += src_stride;
}
}
@@ -122,17 +122,17 @@ void vp8_yv12_extend_frame_borders_c(YV12_BUFFER_CONFIG *ybf) {
extend_plane_high(
ybf->u_buffer, ybf->uv_stride,
- (ybf->y_crop_width + 1) / 2, (ybf->y_crop_height + 1) / 2,
- ybf->border / 2, ybf->border / 2,
- (ybf->border + ybf->y_height - ybf->y_crop_height + 1) / 2,
- (ybf->border + ybf->y_width - ybf->y_crop_width + 1) / 2);
+ ybf->uv_crop_width, ybf->uv_crop_height,
+ uv_border, uv_border,
+ uv_border + ybf->uv_height - ybf->uv_crop_height,
+ uv_border + ybf->uv_width - ybf->uv_crop_width);
extend_plane_high(
ybf->v_buffer, ybf->uv_stride,
- (ybf->y_crop_width + 1) / 2, (ybf->y_crop_height + 1) / 2,
- ybf->border / 2, ybf->border / 2,
- (ybf->border + ybf->y_height - ybf->y_crop_height + 1) / 2,
- (ybf->border + ybf->y_width - ybf->y_crop_width + 1) / 2);
+ ybf->uv_crop_width, ybf->uv_crop_height,
+ uv_border, uv_border,
+ uv_border + ybf->uv_height - ybf->uv_crop_height,
+ uv_border + ybf->uv_width - ybf->uv_crop_width);
return;
}
#endif
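
The rewritten calls above read the uv_crop_* fields cached in YV12_BUFFER_CONFIG instead of re-deriving them from the Y plane; for 4:2:0 content the two forms agree. A sketch of the assumed derivation, using the same rounding as the old inline expressions:

    /* 4:2:0 chroma dimensions: the Y dimensions halved, rounded up. */
    static void derive_uv_dims(int y_crop_width, int y_crop_height,
                               int *uv_crop_width, int *uv_crop_height) {
      *uv_crop_width = (y_crop_width + 1) / 2;
      *uv_crop_height = (y_crop_height + 1) / 2;
    }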
@@ -212,7 +212,7 @@ void vp9_extend_frame_inner_borders_c(YV12_BUFFER_CONFIG *ybf) {
void memcpy_short_addr(uint8_t *dst8, const uint8_t *src8, int num) {
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
uint16_t *src = CONVERT_TO_SHORTPTR(src8);
- vpx_memcpy(dst, src, num * sizeof(uint16_t));
+ memcpy(dst, src, num * sizeof(uint16_t));
}
#endif // CONFIG_VP9_HIGHBITDEPTH
#endif // CONFIG_VP9
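
For context: in CONFIG_VP9_HIGHBITDEPTH builds an 8-bit plane pointer doubles as a tagged alias for a 16-bit sample pointer, which is why memcpy_short_addr copies num elements of sizeof(uint16_t). A sketch of the assumed conversion, mirroring libvpx's CONVERT_TO_SHORTPTR (treat the exact tagging as an assumption):

    #include <stdint.h>

    /* Assumed tagging scheme: the uint8_t * stores half the real address,
     * so shifting left by one bit recovers the uint16_t * sample pointer. */
    #define SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))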
@@ -269,7 +269,7 @@ void vp8_yv12_copy_frame_c(const YV12_BUFFER_CONFIG *src_ybc,
#endif
for (row = 0; row < src_ybc->y_height; ++row) {
- vpx_memcpy(dst, src, src_ybc->y_width);
+ memcpy(dst, src, src_ybc->y_width);
src += src_ybc->y_stride;
dst += dst_ybc->y_stride;
}
@@ -278,7 +278,7 @@ void vp8_yv12_copy_frame_c(const YV12_BUFFER_CONFIG *src_ybc,
dst = dst_ybc->u_buffer;
for (row = 0; row < src_ybc->uv_height; ++row) {
- vpx_memcpy(dst, src, src_ybc->uv_width);
+ memcpy(dst, src, src_ybc->uv_width);
src += src_ybc->uv_stride;
dst += dst_ybc->uv_stride;
}
@@ -287,7 +287,7 @@ void vp8_yv12_copy_frame_c(const YV12_BUFFER_CONFIG *src_ybc,
dst = dst_ybc->v_buffer;
for (row = 0; row < src_ybc->uv_height; ++row) {
- vpx_memcpy(dst, src, src_ybc->uv_width);
+ memcpy(dst, src, src_ybc->uv_width);
src += src_ybc->uv_stride;
dst += dst_ybc->uv_stride;
}
@@ -306,7 +306,7 @@ void vpx_yv12_copy_y_c(const YV12_BUFFER_CONFIG *src_ybc,
const uint16_t *src16 = CONVERT_TO_SHORTPTR(src);
uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst);
for (row = 0; row < src_ybc->y_height; ++row) {
- vpx_memcpy(dst16, src16, src_ybc->y_width * sizeof(uint16_t));
+ memcpy(dst16, src16, src_ybc->y_width * sizeof(uint16_t));
src16 += src_ybc->y_stride;
dst16 += dst_ybc->y_stride;
}
@@ -315,7 +315,7 @@ void vpx_yv12_copy_y_c(const YV12_BUFFER_CONFIG *src_ybc,
#endif
for (row = 0; row < src_ybc->y_height; ++row) {
- vpx_memcpy(dst, src, src_ybc->y_width);
+ memcpy(dst, src, src_ybc->y_width);
src += src_ybc->y_stride;
dst += dst_ybc->y_stride;
}
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_scale/mips/dspr2/yv12extend_dspr2.c b/chromium/third_party/libvpx/source/libvpx/vpx_scale/mips/dspr2/yv12extend_dspr2.c
index 0dfc47cc874..19f84cb1960 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_scale/mips/dspr2/yv12extend_dspr2.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_scale/mips/dspr2/yv12extend_dspr2.c
@@ -94,12 +94,12 @@ static void extend_plane(uint8_t *const src, int src_stride,
linesize = extend_left + extend_right + width;
for (i = 0; i < extend_top; i++) {
- vpx_memcpy(top_dst, top_src, linesize);
+ memcpy(top_dst, top_src, linesize);
top_dst += src_stride;
}
for (i = 0; i < extend_bottom; i++) {
- vpx_memcpy(bot_dst, bot_src, linesize);
+ memcpy(bot_dst, bot_src, linesize);
bot_dst += src_stride;
}
}
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_scale/vpx_scale.mk b/chromium/third_party/libvpx/source/libvpx/vpx_scale/vpx_scale.mk
index 0a1594bd8f5..a49abf3b4b7 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_scale/vpx_scale.mk
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_scale/vpx_scale.mk
@@ -1,11 +1,10 @@
SCALE_SRCS-yes += vpx_scale.mk
SCALE_SRCS-yes += yv12config.h
-SCALE_SRCS-yes += vpx_scale.h
-SCALE_SRCS-yes += generic/vpx_scale.c
+SCALE_SRCS-$(CONFIG_SPATIAL_RESAMPLING) += vpx_scale.h
+SCALE_SRCS-$(CONFIG_SPATIAL_RESAMPLING) += generic/vpx_scale.c
SCALE_SRCS-yes += generic/yv12config.c
SCALE_SRCS-yes += generic/yv12extend.c
SCALE_SRCS-$(CONFIG_SPATIAL_RESAMPLING) += generic/gen_scalers.c
-SCALE_SRCS-yes += vpx_scale_asm_offsets.c
SCALE_SRCS-yes += vpx_scale_rtcd.c
SCALE_SRCS-yes += vpx_scale_rtcd.pl
@@ -14,7 +13,4 @@ SCALE_SRCS-$(HAVE_DSPR2) += mips/dspr2/yv12extend_dspr2.c
SCALE_SRCS-no += $(SCALE_SRCS_REMOVE-yes)
-$(eval $(call asm_offsets_template,\
- vpx_scale_asm_offsets.asm, vpx_scale/vpx_scale_asm_offsets.c))
-
$(eval $(call rtcd_h_template,vpx_scale_rtcd,vpx_scale/vpx_scale_rtcd.pl))
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_scale/vpx_scale_asm_offsets.c b/chromium/third_party/libvpx/source/libvpx/vpx_scale/vpx_scale_asm_offsets.c
deleted file mode 100644
index caa9e80ffcb..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vpx_scale/vpx_scale_asm_offsets.c
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
- * Copyright (c) 2011 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "./vpx_config.h"
-#include "vpx/vpx_codec.h"
-#include "vpx_ports/asm_offsets.h"
-#include "vpx_scale/yv12config.h"
-
-BEGIN
-
-/* vpx_scale */
-DEFINE(yv12_buffer_config_y_width, offsetof(YV12_BUFFER_CONFIG, y_width));
-DEFINE(yv12_buffer_config_y_height, offsetof(YV12_BUFFER_CONFIG, y_height));
-DEFINE(yv12_buffer_config_y_stride, offsetof(YV12_BUFFER_CONFIG, y_stride));
-DEFINE(yv12_buffer_config_uv_width, offsetof(YV12_BUFFER_CONFIG, uv_width));
-DEFINE(yv12_buffer_config_uv_height, offsetof(YV12_BUFFER_CONFIG, uv_height));
-DEFINE(yv12_buffer_config_uv_stride, offsetof(YV12_BUFFER_CONFIG, uv_stride));
-DEFINE(yv12_buffer_config_y_buffer, offsetof(YV12_BUFFER_CONFIG, y_buffer));
-DEFINE(yv12_buffer_config_u_buffer, offsetof(YV12_BUFFER_CONFIG, u_buffer));
-DEFINE(yv12_buffer_config_v_buffer, offsetof(YV12_BUFFER_CONFIG, v_buffer));
-DEFINE(yv12_buffer_config_border, offsetof(YV12_BUFFER_CONFIG, border));
-DEFINE(VP8BORDERINPIXELS_VAL, VP8BORDERINPIXELS);
-
-END
-
-/* add asserts for any offset that is not supported by assembly code */
-/* add asserts for any size that is not supported by assembly code */
-
-#if HAVE_NEON
-/* vp8_yv12_extend_frame_borders_neon makes several assumptions based on this */
-ct_assert(VP8BORDERINPIXELS_VAL, VP8BORDERINPIXELS == 32)
-#endif
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_scale/vpx_scale_rtcd.c b/chromium/third_party/libvpx/source/libvpx/vpx_scale/vpx_scale_rtcd.c
index 656a22f5240..bea603fd104 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_scale/vpx_scale_rtcd.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_scale/vpx_scale_rtcd.c
@@ -7,9 +7,9 @@
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
-#include "vpx_config.h"
+#include "./vpx_config.h"
#define RTCD_C
-#include "vpx_scale_rtcd.h"
+#include "./vpx_scale_rtcd.h"
#include "vpx_ports/vpx_once.h"
void vpx_scale_rtcd()
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_scale/win32/scaleopt.c b/chromium/third_party/libvpx/source/libvpx/vpx_scale/win32/scaleopt.c
deleted file mode 100644
index 4336ecea35d..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vpx_scale/win32/scaleopt.c
+++ /dev/null
@@ -1,525 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-/****************************************************************************
-*
-* Module Title : scaleopt.c
-*
-* Description : Optimized scaling functions
-*
-****************************************************************************/
-#include "pragmas.h"
-
-/****************************************************************************
-* Module Statics
-****************************************************************************/
-__declspec(align(16)) const static unsigned short round_values[] = { 128, 128, 128, 128 };
-
-#include "vpx_scale/vpx_scale.h"
-#include "vpx_mem/vpx_mem.h"
-
-__declspec(align(16)) const static unsigned short const54_2[] = { 0, 64, 128, 192 };
-__declspec(align(16)) const static unsigned short const54_1[] = {256, 192, 128, 64 };
-
-
-/****************************************************************************
- *
- * ROUTINE : horizontal_line_5_4_scale_mmx
- *
- * INPUTS : const unsigned char *source : Pointer to source data.
- * unsigned int source_width : Width of the source line.
- * unsigned char *dest : Pointer to destination data.
- * unsigned int dest_width : Stride of destination (NOT USED).
- *
- * OUTPUTS : None.
- *
- * RETURNS : void
- *
- * FUNCTION : Copies horizontal line of pixels from source to
- * destination scaling down from 5 to 4.
- *
- * SPECIAL NOTES : None.
- *
- ****************************************************************************/
-static
-void horizontal_line_5_4_scale_mmx
-(
- const unsigned char *source,
- unsigned int source_width,
- unsigned char *dest,
- unsigned int dest_width
-) {
- /*
- unsigned i;
- unsigned int a, b, c, d, e;
- unsigned char *des = dest;
- const unsigned char *src = source;
-
- (void) dest_width;
-
- for ( i=0; i<source_width; i+=5 )
- {
- a = src[0];
- b = src[1];
- c = src[2];
- d = src[3];
- e = src[4];
-
- des[0] = a;
- des[1] = ((b*192 + c* 64 + 128)>>8);
- des[2] = ((c*128 + d*128 + 128)>>8);
- des[3] = ((d* 64 + e*192 + 128)>>8);
-
- src += 5;
- des += 4;
- }
- */
- (void) dest_width;
-
- __asm {
-
- mov esi, source;
- mov edi, dest;
-
- mov ecx, source_width;
- movq mm5, const54_1;
-
- pxor mm7, mm7;
- movq mm6, const54_2;
-
- movq mm4, round_values;
- lea edx, [esi+ecx];
- horizontal_line_5_4_loop:
-
- movq mm0, QWORD PTR [esi]; // 00 01 02 03 04 05 06 07
- movq mm1, mm0; // 00 01 02 03 04 05 06 07
-
- psrlq mm0, 8; // 01 02 03 04 05 06 07 xx
- punpcklbw mm1, mm7; // xx 00 xx 01 xx 02 xx 03
-
- punpcklbw mm0, mm7; // xx 01 xx 02 xx 03 xx 04
- pmullw mm1, mm5
-
- pmullw mm0, mm6
- add esi, 5
-
- add edi, 4
- paddw mm1, mm0
-
- paddw mm1, mm4
- psrlw mm1, 8
-
- cmp esi, edx
- packuswb mm1, mm7
-
- movd DWORD PTR [edi-4], mm1
-
- jl horizontal_line_5_4_loop
-
- }
-
-}
-__declspec(align(16)) const static unsigned short one_fourths[] = { 64, 64, 64, 64 };
-__declspec(align(16)) const static unsigned short two_fourths[] = { 128, 128, 128, 128 };
-__declspec(align(16)) const static unsigned short three_fourths[] = { 192, 192, 192, 192 };
-
-static
-void vertical_band_5_4_scale_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width) {
-
- __asm {
- push ebx
-
- mov esi, source // Get the source pointer
- mov ecx, src_pitch // Get the source pitch
-
- mov edi, dest // Get the destination pointer
- pxor mm7, mm7 // clear out mm7
-
- mov edx, dest_pitch // Get the destination pitch
- mov ebx, dest_width
-
- vs_5_4_loop:
-
- movd mm0, DWORD ptr [esi] // src[0];
- movd mm1, DWORD ptr [esi+ecx] // src[1];
-
- movd mm2, DWORD ptr [esi+ecx*2]
- lea eax, [esi+ecx*2] //
-
- punpcklbw mm1, mm7
- punpcklbw mm2, mm7
-
- movq mm3, mm2
- pmullw mm1, three_fourths
-
- pmullw mm2, one_fourths
- movd mm4, [eax+ecx]
-
- pmullw mm3, two_fourths
- punpcklbw mm4, mm7
-
- movq mm5, mm4
- pmullw mm4, two_fourths
-
- paddw mm1, mm2
- movd mm6, [eax+ecx*2]
-
- pmullw mm5, one_fourths
- paddw mm1, round_values;
-
- paddw mm3, mm4
- psrlw mm1, 8
-
- punpcklbw mm6, mm7
- paddw mm3, round_values
-
- pmullw mm6, three_fourths
- psrlw mm3, 8
-
- packuswb mm1, mm7
- packuswb mm3, mm7
-
- movd DWORD PTR [edi], mm0
- movd DWORD PTR [edi+edx], mm1
-
-
- paddw mm5, mm6
- movd DWORD PTR [edi+edx*2], mm3
-
- lea eax, [edi+edx*2]
- paddw mm5, round_values
-
- psrlw mm5, 8
- add edi, 4
-
- packuswb mm5, mm7
- movd DWORD PTR [eax+edx], mm5
-
- add esi, 4
- sub ebx, 4
-
- jg vs_5_4_loop
-
- pop ebx
- }
-}
-
-
-__declspec(align(16)) const static unsigned short const53_1[] = { 0, 85, 171, 0 };
-__declspec(align(16)) const static unsigned short const53_2[] = {256, 171, 85, 0 };
-
-
-static
-void horizontal_line_5_3_scale_mmx
-(
- const unsigned char *source,
- unsigned int source_width,
- unsigned char *dest,
- unsigned int dest_width
-) {
-
- (void) dest_width;
- __asm {
-
- mov esi, source;
- mov edi, dest;
-
- mov ecx, source_width;
- movq mm5, const53_1;
-
- pxor mm7, mm7;
- movq mm6, const53_2;
-
- movq mm4, round_values;
- lea edx, [esi+ecx-5];
- horizontal_line_5_3_loop:
-
- movq mm0, QWORD PTR [esi]; // 00 01 02 03 04 05 06 07
- movq mm1, mm0; // 00 01 02 03 04 05 06 07
-
- psllw mm0, 8; // xx 00 xx 02 xx 04 xx 06
- psrlw mm1, 8; // 01 xx 03 xx 05 xx 07 xx
-
- psrlw mm0, 8; // 00 xx 02 xx 04 xx 06 xx
- psllq mm1, 16; // xx xx 01 xx 03 xx 05 xx
-
- pmullw mm0, mm6
-
- pmullw mm1, mm5
- add esi, 5
-
- add edi, 3
- paddw mm1, mm0
-
- paddw mm1, mm4
- psrlw mm1, 8
-
- cmp esi, edx
- packuswb mm1, mm7
-
- movd DWORD PTR [edi-3], mm1
- jl horizontal_line_5_3_loop
-
-// exit condition
- movq mm0, QWORD PTR [esi]; // 00 01 02 03 04 05 06 07
- movq mm1, mm0; // 00 01 02 03 04 05 06 07
-
- psllw mm0, 8; // xx 00 xx 02 xx 04 xx 06
- psrlw mm1, 8; // 01 xx 03 xx 05 xx 07 xx
-
- psrlw mm0, 8; // 00 xx 02 xx 04 xx 06 xx
- psllq mm1, 16; // xx xx 01 xx 03 xx 05 xx
-
- pmullw mm0, mm6
-
- pmullw mm1, mm5
- paddw mm1, mm0
-
- paddw mm1, mm4
- psrlw mm1, 8
-
- packuswb mm1, mm7
- movd eax, mm1
-
- mov edx, eax
- shr edx, 16
-
- mov WORD PTR[edi], ax
- mov BYTE PTR[edi+2], dl
-
- }
-
-}
-
-__declspec(align(16)) const static unsigned short one_thirds[] = { 85, 85, 85, 85 };
-__declspec(align(16)) const static unsigned short two_thirds[] = { 171, 171, 171, 171 };
-
-static
-void vertical_band_5_3_scale_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width) {
-
- __asm {
- push ebx
-
- mov esi, source // Get the source pointer
- mov ecx, src_pitch // Get the source pitch
-
- mov edi, dest // Get the destination pointer
- pxor mm7, mm7 // clear out mm7
-
- mov edx, dest_pitch // Get the destination pitch
- movq mm5, one_thirds
-
- movq mm6, two_thirds
- mov ebx, dest_width;
-
- vs_5_3_loop:
-
- movd mm0, DWORD ptr [esi] // src[0];
- movd mm1, DWORD ptr [esi+ecx] // src[1];
-
- movd mm2, DWORD ptr [esi+ecx*2]
- lea eax, [esi+ecx*2] //
-
- punpcklbw mm1, mm7
- punpcklbw mm2, mm7
-
- pmullw mm1, mm5
- pmullw mm2, mm6
-
- movd mm3, DWORD ptr [eax+ecx]
- movd mm4, DWORD ptr [eax+ecx*2]
-
- punpcklbw mm3, mm7
- punpcklbw mm4, mm7
-
- pmullw mm3, mm6
- pmullw mm4, mm5
-
-
- movd DWORD PTR [edi], mm0
- paddw mm1, mm2
-
- paddw mm1, round_values
- psrlw mm1, 8
-
- packuswb mm1, mm7
- paddw mm3, mm4
-
- paddw mm3, round_values
- movd DWORD PTR [edi+edx], mm1
-
- psrlw mm3, 8
- packuswb mm3, mm7
-
- movd DWORD PTR [edi+edx*2], mm3
-
-
- add edi, 4
- add esi, 4
-
- sub ebx, 4
- jg vs_5_3_loop
-
- pop ebx
- }
-}
-
-
-
-
-/****************************************************************************
- *
- * ROUTINE : horizontal_line_2_1_scale_mmx
- *
- * INPUTS : const unsigned char *source : Pointer to source data.
- * unsigned int source_width : Width of source line (NOT USED).
- * unsigned char *dest : Pointer to destination data.
- * unsigned int dest_width : Width of destination line.
- *
- * OUTPUTS : None.
- *
- * RETURNS : void
- *
- * FUNCTION : 2 to 1 down-scaling of a horizontal line of pixels.
- *
- * SPECIAL NOTES : None.
- *
- ****************************************************************************/
-static
-void horizontal_line_2_1_scale_mmx
-(
- const unsigned char *source,
- unsigned int source_width,
- unsigned char *dest,
- unsigned int dest_width
-) {
- (void) dest_width;
- (void) source_width;
- __asm {
- mov esi, source
- mov edi, dest
-
- pxor mm7, mm7
- mov ecx, dest_width
-
- xor edx, edx
- hs_2_1_loop:
-
- movq mm0, [esi+edx*2]
- psllw mm0, 8
-
- psrlw mm0, 8
- packuswb mm0, mm7
-
- movd DWORD Ptr [edi+edx], mm0;
- add edx, 4
-
- cmp edx, ecx
- jl hs_2_1_loop
-
- }
-}
-
-
-
-static
-void vertical_band_2_1_scale_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width) {
- (void) dest_pitch;
- (void) src_pitch;
- vpx_memcpy(dest, source, dest_width);
-}
-
-
-__declspec(align(16)) const static unsigned short three_sixteenths[] = { 48, 48, 48, 48 };
-__declspec(align(16)) const static unsigned short ten_sixteenths[] = { 160, 160, 160, 160 };
-
-static
-void vertical_band_2_1_scale_i_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width) {
-
- (void) dest_pitch;
- __asm {
- mov esi, source
- mov edi, dest
-
- mov eax, src_pitch
- mov edx, dest_width
-
- pxor mm7, mm7
- sub esi, eax // back one line
-
-
- lea ecx, [esi+edx];
- movq mm6, round_values;
-
- movq mm5, three_sixteenths;
- movq mm4, ten_sixteenths;
-
- vs_2_1_i_loop:
- movd mm0, [esi] //
- movd mm1, [esi+eax] //
-
- movd mm2, [esi+eax*2] //
- punpcklbw mm0, mm7
-
- pmullw mm0, mm5
- punpcklbw mm1, mm7
-
- pmullw mm1, mm4
- punpcklbw mm2, mm7
-
- pmullw mm2, mm5
- paddw mm0, round_values
-
- paddw mm1, mm2
- paddw mm0, mm1
-
- psrlw mm0, 8
- packuswb mm0, mm7
-
- movd DWORD PTR [edi], mm0
- add esi, 4
-
- add edi, 4;
- cmp esi, ecx
- jl vs_2_1_i_loop
-
- }
-}
-
-
-
-void
-register_mmxscalers(void) {
- vp8_vertical_band_5_4_scale = vertical_band_5_4_scale_mmx;
- vp8_vertical_band_5_3_scale = vertical_band_5_3_scale_mmx;
- vp8_vertical_band_2_1_scale = vertical_band_2_1_scale_mmx;
- vp8_vertical_band_2_1_scale_i = vertical_band_2_1_scale_i_mmx;
- vp8_horizontal_line_2_1_scale = horizontal_line_2_1_scale_mmx;
- vp8_horizontal_line_5_3_scale = horizontal_line_5_3_scale_mmx;
- vp8_horizontal_line_5_4_scale = horizontal_line_5_4_scale_mmx;
-}
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_scale/yv12config.h b/chromium/third_party/libvpx/source/libvpx/vpx_scale/yv12config.h
index b9f13fd8918..76cf771c74a 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_scale/yv12config.h
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_scale/yv12config.h
@@ -55,12 +55,13 @@ typedef struct yv12_buffer_config {
int subsampling_x;
int subsampling_y;
unsigned int bit_depth;
+ vpx_color_space_t color_space;
int corrupted;
int flags;
} YV12_BUFFER_CONFIG;
-#define YV12_FLAG_HIGHBITDEPTH 1
+#define YV12_FLAG_HIGHBITDEPTH 8
int vp8_yv12_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf,
int width, int height, int border);
@@ -73,9 +74,10 @@ int vp9_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf,
#if CONFIG_VP9_HIGHBITDEPTH
int use_highbitdepth,
#endif
- int border);
+ int border, int byte_alignment);
-// Updates the yv12 buffer config with the frame buffer. If cb is not
+// Updates the yv12 buffer config with the frame buffer. |byte_alignment| must
+// be a power of 2, from 32 to 1024. 0 sets legacy alignment. If cb is not
// NULL, then libvpx is using the frame buffer callbacks to handle memory.
// If cb is not NULL, libvpx will call cb with minimum size in bytes needed
// to decode the current frame. If cb is NULL, libvpx will allocate memory
@@ -87,6 +89,7 @@ int vp9_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf,
int use_highbitdepth,
#endif
int border,
+ int byte_alignment,
vpx_codec_frame_buffer_t *fb,
vpx_get_frame_buffer_cb_fn_t cb,
void *cb_priv);
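
An illustrative call with the new parameter (hedged: the dimensions, border, and helper name below are arbitrary; per the comment above, byte_alignment must be a power of 2 in [32, 1024], and 0 keeps the legacy alignment):

    #include <string.h>
    #include "vpx_scale/yv12config.h"

    /* Allocate a 1080p 4:2:0 frame whose plane addresses are 32-byte
     * aligned. Returns 0 on success, negative on failure. */
    static int alloc_aligned_frame(YV12_BUFFER_CONFIG *buf) {
      memset(buf, 0, sizeof(*buf));
      return vp9_realloc_frame_buffer(buf, 1920, 1080, /*ss_x=*/1, /*ss_y=*/1,
    #if CONFIG_VP9_HIGHBITDEPTH
                                      /*use_highbitdepth=*/0,
    #endif
                                      /*border=*/32, /*byte_alignment=*/32,
                                      NULL, NULL, NULL);
    }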
diff --git a/chromium/third_party/libvpx/source/libvpx/vpxdec.c b/chromium/third_party/libvpx/source/libvpx/vpxdec.c
index 2afdb715b82..8c938df8de6 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpxdec.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpxdec.c
@@ -75,6 +75,8 @@ static const arg_def_t outputfile = ARG_DEF(
"o", "output", 1, "Output file name pattern (see below)");
static const arg_def_t threadsarg = ARG_DEF(
"t", "threads", 1, "Max threads to use");
+static const arg_def_t frameparallelarg = ARG_DEF(
+ NULL, "frame-parallel", 0, "Frame parallel decode");
static const arg_def_t verbosearg = ARG_DEF(
"v", "verbose", 0, "Show version string");
static const arg_def_t error_concealment = ARG_DEF(
@@ -95,7 +97,7 @@ static const arg_def_t outbitdeptharg = ARG_DEF(
static const arg_def_t *all_args[] = {
&codecarg, &use_yv12, &use_i420, &flipuvarg, &rawvideo, &noblitarg,
&progressarg, &limitarg, &skiparg, &postprocarg, &summaryarg, &outputfile,
- &threadsarg, &verbosearg, &scalearg, &fb_arg,
+ &threadsarg, &frameparallelarg, &verbosearg, &scalearg, &fb_arg,
&md5arg, &error_concealment, &continuearg,
#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
&outbitdeptharg,
@@ -131,7 +133,7 @@ static const arg_def_t *vp8_pp_args[] = {
#endif
#if CONFIG_LIBYUV
-static INLINE int vpx_image_scale(vpx_image_t *src, vpx_image_t *dst,
+static INLINE int libyuv_scale(vpx_image_t *src, vpx_image_t *dst,
FilterModeEnum mode) {
#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
if (src->fmt == VPX_IMG_FMT_I42016) {
@@ -276,7 +278,8 @@ static void update_image_md5(const vpx_image_t *img, const int planes[3],
const int plane = planes[i];
const unsigned char *buf = img->planes[plane];
const int stride = img->stride[plane];
- const int w = vpx_img_plane_width(img, plane);
+ const int w = vpx_img_plane_width(img, plane) *
+ ((img->fmt & VPX_IMG_FMT_HIGHBITDEPTH) ? 2 : 1);
const int h = vpx_img_plane_height(img, plane);
for (y = 0; y < h; ++y) {
@@ -518,7 +521,7 @@ static FILE *open_outfile(const char *name) {
} else {
FILE *file = fopen(name, "wb");
if (!file)
- fatal("Failed to output file %s", name);
+ fatal("Failed to open output file '%s'", name);
return file;
}
}
@@ -541,7 +544,7 @@ int main_loop(int argc, const char **argv_) {
size_t bytes_in_buffer = 0, buffer_size = 0;
FILE *infile;
int frame_in = 0, frame_out = 0, flipuv = 0, noblit = 0;
- int do_md5 = 0, progress = 0;
+ int do_md5 = 0, progress = 0, frame_parallel = 0;
int stop_after = 0, postproc = 0, summary = 0, quiet = 1;
int arg_skip = 0;
int ec_enabled = 0;
@@ -574,7 +577,7 @@ int main_loop(int argc, const char **argv_) {
#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
vpx_image_t *img_shifted = NULL;
#endif
- int frame_avail, got_data;
+ int frame_avail, got_data, flush_decoder = 0;
int num_external_frame_buffers = 0;
struct ExternalFrameBufferList ext_fb_list = {0, NULL};
@@ -641,6 +644,10 @@ int main_loop(int argc, const char **argv_) {
summary = 1;
else if (arg_match(&arg, &threadsarg, argi))
cfg.threads = arg_parse_uint(&arg);
+#if CONFIG_VP9_DECODER
+ else if (arg_match(&arg, &frameparallelarg, argi))
+ frame_parallel = 1;
+#endif
else if (arg_match(&arg, &verbosearg, argi))
quiet = 0;
else if (arg_match(&arg, &scalearg, argi))
@@ -717,15 +724,15 @@ int main_loop(int argc, const char **argv_) {
/* Handle non-option arguments */
fn = argv[0];
- if (!fn)
+ if (!fn) {
+ free(argv);
usage_exit();
-
+ }
/* Open file */
infile = strcmp(fn, "-") ? fopen(fn, "rb") : set_binary_mode(stdin);
if (!infile) {
- fprintf(stderr, "Failed to open file '%s'", strcmp(fn, "-") ? fn : "stdin");
- return EXIT_FAILURE;
+ fatal("Failed to open input file '%s'", strcmp(fn, "-") ? fn : "stdin");
}
#if CONFIG_OS_SUPPORT
/* Make sure we don't dump to the terminal, unless forced to with -o - */
@@ -793,7 +800,8 @@ int main_loop(int argc, const char **argv_) {
interface = get_vpx_decoder_by_index(0);
dec_flags = (postproc ? VPX_CODEC_USE_POSTPROC : 0) |
- (ec_enabled ? VPX_CODEC_USE_ERROR_CONCEALMENT : 0);
+ (ec_enabled ? VPX_CODEC_USE_ERROR_CONCEALMENT : 0) |
+ (frame_parallel ? VPX_CODEC_USE_FRAME_THREADING : 0);
if (vpx_codec_dec_init(&decoder, interface->codec_interface(),
&cfg, dec_flags)) {
fprintf(stderr, "Failed to initialize decoder: %s\n",
@@ -867,7 +875,7 @@ int main_loop(int argc, const char **argv_) {
vpx_codec_iter_t iter = NULL;
vpx_image_t *img;
struct vpx_usec_timer timer;
- int corrupted;
+ int corrupted = 0;
frame_avail = 0;
if (!stop_after || frame_in < stop_after) {
@@ -891,11 +899,22 @@ int main_loop(int argc, const char **argv_) {
vpx_usec_timer_mark(&timer);
dx_time += vpx_usec_timer_elapsed(&timer);
+ } else {
+ flush_decoder = 1;
}
+ } else {
+ flush_decoder = 1;
}
vpx_usec_timer_start(&timer);
+ if (flush_decoder) {
+ // Flush the decoder in frame parallel decode.
+ if (vpx_codec_decode(&decoder, NULL, 0, NULL, 0)) {
+ warn("Failed to flush decoder: %s", vpx_codec_error(&decoder));
+ }
+ }
+
got_data = 0;
if ((img = vpx_codec_get_frame(&decoder, &iter))) {
++frame_out;
@@ -905,9 +924,11 @@ int main_loop(int argc, const char **argv_) {
vpx_usec_timer_mark(&timer);
dx_time += (unsigned int)vpx_usec_timer_elapsed(&timer);
- if (vpx_codec_control(&decoder, VP8D_GET_FRAME_CORRUPTED, &corrupted)) {
+ if (!frame_parallel &&
+ vpx_codec_control(&decoder, VP8D_GET_FRAME_CORRUPTED, &corrupted)) {
warn("Failed VP8_GET_FRAME_CORRUPTED: %s", vpx_codec_error(&decoder));
- goto fail;
+ if (!keep_going)
+ goto fail;
}
frames_corrupted += corrupted;
@@ -947,7 +968,7 @@ int main_loop(int argc, const char **argv_) {
if (img->d_w != scaled_img->d_w || img->d_h != scaled_img->d_h) {
#if CONFIG_LIBYUV
- vpx_image_scale(img, scaled_img, kFilterBox);
+ libyuv_scale(img, scaled_img, kFilterBox);
img = scaled_img;
#else
fprintf(stderr, "Failed to scale output frame: %s.\n"
@@ -1059,9 +1080,6 @@ int main_loop(int argc, const char **argv_) {
}
}
}
-
- if (stop_after && frame_in >= stop_after)
- break;
}
if (summary || progress) {
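
The flush path added above follows libvpx's drain convention: a NULL-data call to vpx_codec_decode() signals end of stream, after which any frames a frame-parallel decoder is still holding come out of vpx_codec_get_frame(). A minimal sketch under that convention:

    #include <stdio.h>
    #include "vpx/vpx_decoder.h"

    /* Signal EOS, then drain whatever frames the decoder still buffers. */
    static void flush_and_drain(vpx_codec_ctx_t *decoder) {
      vpx_codec_iter_t iter = NULL;
      vpx_image_t *img;
      if (vpx_codec_decode(decoder, NULL, 0, NULL, 0))
        fprintf(stderr, "flush failed: %s\n", vpx_codec_error(decoder));
      while ((img = vpx_codec_get_frame(decoder, &iter)) != NULL) {
        /* write or display img */
      }
    }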
diff --git a/chromium/third_party/libvpx/source/libvpx/vpxenc.c b/chromium/third_party/libvpx/source/libvpx/vpxenc.c
index 0a0c0718bc5..851d43291cd 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpxenc.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpxenc.c
@@ -183,8 +183,10 @@ static const arg_def_t recontest = ARG_DEF_ENUM(
NULL, "test-decode", 1, "Test encode/decode mismatch", test_decode_enum);
static const arg_def_t framerate = ARG_DEF(
NULL, "fps", 1, "Stream frame rate (rate/scale)");
+static const arg_def_t use_webm = ARG_DEF(
+ NULL, "webm", 0, "Output WebM (default when WebM IO is enabled)");
static const arg_def_t use_ivf = ARG_DEF(
- NULL, "ivf", 0, "Output IVF (default is WebM if WebM IO is enabled)");
+ NULL, "ivf", 0, "Output IVF");
static const arg_def_t out_part = ARG_DEF(
"P", "output-partitions", 0,
"Makes encoder output partitions. Requires IVF output!");
@@ -208,7 +210,7 @@ static const arg_def_t *main_args[] = {
&debugmode,
&outputfile, &codecarg, &passes, &pass_arg, &fpf_name, &limit, &skip,
&deadline, &best_dl, &good_dl, &rt_dl,
- &quietarg, &verbosearg, &psnrarg, &use_ivf, &out_part, &q_hist_n,
+ &quietarg, &verbosearg, &psnrarg, &use_webm, &use_ivf, &out_part, &q_hist_n,
&rate_hist_n, &disable_warnings, &disable_warning_prompt,
NULL
};
@@ -328,8 +330,10 @@ static const arg_def_t sharpness = ARG_DEF(
NULL, "sharpness", 1, "Loop filter sharpness (0..7)");
static const arg_def_t static_thresh = ARG_DEF(
NULL, "static-thresh", 1, "Motion detection threshold");
-static const arg_def_t cpu_used = ARG_DEF(
+static const arg_def_t cpu_used_vp8 = ARG_DEF(
NULL, "cpu-used", 1, "CPU Used (-16..16)");
+static const arg_def_t cpu_used_vp9 = ARG_DEF(
+ NULL, "cpu-used", 1, "CPU Used (-8..8)");
static const arg_def_t auto_altref = ARG_DEF(
NULL, "auto-alt-ref", 1, "Enable automatic alt reference frames");
static const arg_def_t arnr_maxframes = ARG_DEF(
@@ -349,14 +353,21 @@ static const arg_def_t cq_level = ARG_DEF(
NULL, "cq-level", 1, "Constant/Constrained Quality level");
static const arg_def_t max_intra_rate_pct = ARG_DEF(
NULL, "max-intra-rate", 1, "Max I-frame bitrate (pct)");
+static const arg_def_t max_inter_rate_pct = ARG_DEF(
+ NULL, "max-inter-rate", 1, "Max P-frame bitrate (pct)");
+static const arg_def_t gf_cbr_boost_pct = ARG_DEF(
+ NULL, "gf-cbr-boost", 1, "Boost for Golden Frame in CBR mode (pct)");
+
+static const arg_def_t screen_content_mode = ARG_DEF(
+ NULL, "screen-content-mode", 1, "Screen content mode");
#if CONFIG_VP8_ENCODER
static const arg_def_t token_parts = ARG_DEF(
NULL, "token-parts", 1, "Number of token partitions to use, log2");
static const arg_def_t *vp8_args[] = {
- &cpu_used, &auto_altref, &noise_sens, &sharpness, &static_thresh,
+ &cpu_used_vp8, &auto_altref, &noise_sens, &sharpness, &static_thresh,
&token_parts, &arnr_maxframes, &arnr_strength, &arnr_type,
- &tune_ssim, &cq_level, &max_intra_rate_pct,
+ &tune_ssim, &cq_level, &max_intra_rate_pct, &screen_content_mode,
NULL
};
static const int vp8_arg_ctrl_map[] = {
@@ -365,6 +376,7 @@ static const int vp8_arg_ctrl_map[] = {
VP8E_SET_TOKEN_PARTITIONS,
VP8E_SET_ARNR_MAXFRAMES, VP8E_SET_ARNR_STRENGTH, VP8E_SET_ARNR_TYPE,
VP8E_SET_TUNING, VP8E_SET_CQ_LEVEL, VP8E_SET_MAX_INTRA_BITRATE_PCT,
+ VP8E_SET_SCREEN_CONTENT_MODE,
0
};
#endif
@@ -386,6 +398,22 @@ static const arg_def_t frame_periodic_boost = ARG_DEF(
NULL, "frame-boost", 1,
"Enable frame periodic boost (0: off (default), 1: on)");
+static const struct arg_enum_list color_space_enum[] = {
+ { "unknown", VPX_CS_UNKNOWN },
+ { "bt601", VPX_CS_BT_601 },
+ { "bt709", VPX_CS_BT_709 },
+ { "smpte170", VPX_CS_SMPTE_170 },
+ { "smpte240", VPX_CS_SMPTE_240 },
+ { "bt2020", VPX_CS_BT_2020 },
+ { "reserved", VPX_CS_RESERVED },
+ { "sRGB", VPX_CS_SRGB },
+ { NULL, 0 }
+};
+
+static const arg_def_t input_color_space = ARG_DEF_ENUM(
+ NULL, "color-space", 1,
+ "The color space of input content:", color_space_enum);
+
#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
static const struct arg_enum_list bitdepth_enum[] = {
{"8", VPX_BITS_8},
@@ -412,11 +440,12 @@ static const arg_def_t tune_content = ARG_DEF_ENUM(
NULL, "tune-content", 1, "Tune content type", tune_content_enum);
static const arg_def_t *vp9_args[] = {
- &cpu_used, &auto_altref, &sharpness, &static_thresh,
+ &cpu_used_vp9, &auto_altref, &sharpness, &static_thresh,
&tile_cols, &tile_rows, &arnr_maxframes, &arnr_strength, &arnr_type,
- &tune_ssim, &cq_level, &max_intra_rate_pct, &lossless,
+ &tune_ssim, &cq_level, &max_intra_rate_pct, &max_inter_rate_pct,
+ &gf_cbr_boost_pct, &lossless,
&frame_parallel_decoding, &aq_mode, &frame_periodic_boost,
- &noise_sens, &tune_content,
+ &noise_sens, &tune_content, &input_color_space,
#if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
&bitdeptharg, &inbitdeptharg,
#endif
@@ -428,9 +457,10 @@ static const int vp9_arg_ctrl_map[] = {
VP9E_SET_TILE_COLUMNS, VP9E_SET_TILE_ROWS,
VP8E_SET_ARNR_MAXFRAMES, VP8E_SET_ARNR_STRENGTH, VP8E_SET_ARNR_TYPE,
VP8E_SET_TUNING, VP8E_SET_CQ_LEVEL, VP8E_SET_MAX_INTRA_BITRATE_PCT,
+ VP9E_SET_MAX_INTER_BITRATE_PCT, VP9E_SET_GF_CBR_BOOST_PCT,
VP9E_SET_LOSSLESS, VP9E_SET_FRAME_PARALLEL_DECODING, VP9E_SET_AQ_MODE,
VP9E_SET_FRAME_PERIODIC_BOOST, VP9E_SET_NOISE_SENSITIVITY,
- VP9E_SET_TUNE_CONTENT,
+ VP9E_SET_TUNE_CONTENT, VP9E_SET_COLOR_SPACE,
0
};
#endif
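
Each new entry in the control map pairs a command-line flag with the codec control it drives; for example, --color-space=bt709 ultimately becomes a call like the following (illustrative, assuming an initialized encoder context):

    vpx_codec_control(&encoder, VP9E_SET_COLOR_SPACE, VPX_CS_BT_709);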
@@ -1038,8 +1068,7 @@ static int parse_stream_params(struct VpxEncoderConfig *global,
continue;
}
- if (0) {
- } else if (arg_match(&arg, &outputfile, argi)) {
+ if (arg_match(&arg, &outputfile, argi)) {
config->out_fn = arg.val;
} else if (arg_match(&arg, &fpf_name, argi)) {
config->stats_fn = arg.val;
@@ -1047,6 +1076,12 @@ static int parse_stream_params(struct VpxEncoderConfig *global,
} else if (arg_match(&arg, &fpmbf_name, argi)) {
config->fpmb_stats_fn = arg.val;
#endif
+ } else if (arg_match(&arg, &use_webm, argi)) {
+#if CONFIG_WEBM_IO
+ config->write_webm = 1;
+#else
+ die("Error: --webm specified but webm is disabled.");
+#endif
} else if (arg_match(&arg, &use_ivf, argi)) {
config->write_webm = 0;
} else if (arg_match(&arg, &threads, argi)) {
@@ -1187,6 +1222,7 @@ static int parse_stream_params(struct VpxEncoderConfig *global,
static void validate_stream_config(const struct stream_state *stream,
const struct VpxEncoderConfig *global) {
const struct stream_state *streami;
+ (void)global;
if (!stream->config.cfg.g_w || !stream->config.cfg.g_h)
fatal("Stream %d: Specify stream dimensions with --width (-w) "
diff --git a/chromium/third_party/libvpx/source/libvpx/vpxstats.c b/chromium/third_party/libvpx/source/libvpx/vpxstats.c
index 5f88f8d3597..172d8937cdc 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpxstats.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpxstats.c
@@ -41,6 +41,9 @@ int stats_open_file(stats_io_t *stats, const char *fpf, int pass) {
stats->file = fopen(fpf, "rb");
+ if (stats->file == NULL)
+ fatal("First-pass stats file does not exist!");
+
if (fseek(stats->file, 0, SEEK_END))
fatal("First-pass stats file must be seekable!");
diff --git a/chromium/third_party/libvpx/source/libvpx/webmdec.cc b/chromium/third_party/libvpx/source/libvpx/webmdec.cc
index 4383e8efd8d..e152f5ee03a 100644
--- a/chromium/third_party/libvpx/source/libvpx/webmdec.cc
+++ b/chromium/third_party/libvpx/source/libvpx/webmdec.cc
@@ -41,6 +41,7 @@ void reset(struct WebmInputContext *const webm_ctx) {
webm_ctx->block_frame_index = 0;
webm_ctx->video_track_index = 0;
webm_ctx->timestamp_ns = 0;
+ webm_ctx->is_key_frame = false;
}
void get_first_cluster(struct WebmInputContext *const webm_ctx) {
@@ -62,6 +63,7 @@ int file_is_webm(struct WebmInputContext *webm_ctx,
struct VpxInputContext *vpx_ctx) {
mkvparser::MkvReader *const reader = new mkvparser::MkvReader(vpx_ctx->file);
webm_ctx->reader = reader;
+ webm_ctx->reached_eos = 0;
mkvparser::EBMLHeader header;
long long pos = 0;
@@ -120,6 +122,11 @@ int webm_read_frame(struct WebmInputContext *webm_ctx,
uint8_t **buffer,
size_t *bytes_in_buffer,
size_t *buffer_size) {
+ // This check is needed for frame parallel decoding, in which case this
+ // function could be called even after it has reached end of input stream.
+ if (webm_ctx->reached_eos) {
+ return 1;
+ }
mkvparser::Segment *const segment =
reinterpret_cast<mkvparser::Segment*>(webm_ctx->segment);
const mkvparser::Cluster* cluster =
@@ -139,6 +146,7 @@ int webm_read_frame(struct WebmInputContext *webm_ctx,
cluster = segment->GetNext(cluster);
if (cluster == NULL || cluster->EOS()) {
*bytes_in_buffer = 0;
+ webm_ctx->reached_eos = 1;
return 1;
}
status = cluster->GetFirst(block_entry);
@@ -182,6 +190,7 @@ int webm_read_frame(struct WebmInputContext *webm_ctx,
}
*bytes_in_buffer = frame.len;
webm_ctx->timestamp_ns = block->GetTime(cluster);
+ webm_ctx->is_key_frame = block->IsKey();
mkvparser::MkvReader *const reader =
reinterpret_cast<mkvparser::MkvReader*>(webm_ctx->reader);
@@ -210,6 +219,7 @@ int webm_guess_framerate(struct WebmInputContext *webm_ctx,
webm_ctx->block_entry = NULL;
webm_ctx->block_frame_index = 0;
webm_ctx->timestamp_ns = 0;
+ webm_ctx->reached_eos = 0;
return 0;
}
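
With reached_eos tracked, a caller can keep the usual read loop and safely call webm_read_frame() again after end of stream during frame-parallel decode. A hedged sketch of the consuming loop (drain_webm is a hypothetical helper, and the NULL initial buffer follows vpxdec's usage):

    #include <stddef.h>
    #include <stdint.h>
    #include "webmdec.h"

    /* webm_read_frame() returns nonzero at EOS, and reached_eos keeps any
     * post-EOS calls returning nonzero instead of touching the parser. */
    static void drain_webm(struct WebmInputContext *webm_ctx) {
      uint8_t *buf = NULL;
      size_t bytes_in_buffer = 0, buffer_size = 0;
      while (!webm_read_frame(webm_ctx, &buf, &bytes_in_buffer, &buffer_size)) {
        /* feed buf / bytes_in_buffer to the decoder */
      }
    }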
diff --git a/chromium/third_party/libvpx/source/libvpx/webmdec.h b/chromium/third_party/libvpx/source/libvpx/webmdec.h
index 29b815da125..7d163803552 100644
--- a/chromium/third_party/libvpx/source/libvpx/webmdec.h
+++ b/chromium/third_party/libvpx/source/libvpx/webmdec.h
@@ -28,6 +28,8 @@ struct WebmInputContext {
int block_frame_index;
int video_track_index;
uint64_t timestamp_ns;
+ int is_key_frame;
+ int reached_eos;
};
// Checks if the input is a WebM file. If so, initializes WebMInputContext so