author     Andras Becsi <andras.becsi@digia.com>              2014-03-18 13:16:26 +0100
committer  Frederik Gladhorn <frederik.gladhorn@digia.com>    2014-03-20 15:55:39 +0100
commit     3f0f86b0caed75241fa71c95a5d73bc0164348c5 (patch)
tree       92b9fb00f2e9e90b0be2262093876d4f43b6cd13 /chromium/third_party/libyuv
parent     e90d7c4b152c56919d963987e2503f9909a666d2 (diff)
download   qtwebengine-chromium-3f0f86b0caed75241fa71c95a5d73bc0164348c5.tar.gz
Update to new stable branch 1750
This also includes updates to Ninja and to the Chromium dependencies
needed on Windows.
Change-Id: Icd597d80ed3fa4425933c9f1334c3c2e31291c42
Reviewed-by: Zoltan Arvai <zarvai@inf.u-szeged.hu>
Reviewed-by: Zeno Albisser <zeno.albisser@digia.com>
Diffstat (limited to 'chromium/third_party/libyuv')
66 files changed, 12329 insertions, 7506 deletions
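
Among the API-visible changes in the diff below, include/libyuv/cpu_id.h gains a kCpuHasFMA3 flag, moves the MIPS flags to higher bits, and changes the CpuId() signature. A minimal caller-side sketch of runtime feature detection against the updated header, assuming libyuv's public TestCpuFlag() entry point (declared in cpu_id.h but not shown in full in this diff):

    // Sketch only: feature checks against the flags added/renumbered in this
    // update; assumes the libyuv 911 headers imported by this commit.
    #include "libyuv/cpu_id.h"

    bool CanUseFma3() {
      // TestCpuFlag() performs lazy CPU detection on first use.
      return libyuv::TestCpuFlag(libyuv::kCpuHasFMA3) != 0;
    }

    bool CanUseMipsDspR2() {
      // The MIPS flags moved from 0x1000..0x4000 to 0x10000..0x40000 in this
      // update, so callers should use the symbolic constants, not raw bits.
      return libyuv::TestCpuFlag(libyuv::kCpuHasMIPS_DSPR2) != 0;
    }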
diff --git a/chromium/third_party/libyuv/Android.mk b/chromium/third_party/libyuv/Android.mk
index 513a1961b5c..3d8ba49a318 100644
--- a/chromium/third_party/libyuv/Android.mk
+++ b/chromium/third_party/libyuv/Android.mk
@@ -27,7 +27,9 @@ LOCAL_SRC_FILES := \
     source/row_posix.cc \
     source/scale.cc \
     source/scale_argb.cc \
+    source/scale_common.cc \
     source/scale_mips.cc \
+    source/scale_posix.cc \
     source/video_common.cc
 
 # TODO(fbarchard): Enable mjpeg encoder.
@@ -41,7 +43,6 @@ ifeq ($(TARGET_ARCH_ABI),armeabi-v7a)
     source/compare_neon.cc.neon \
     source/rotate_neon.cc.neon \
     source/row_neon.cc.neon \
-    source/scale_argb_neon.cc.neon \
     source/scale_neon.cc.neon
 endif
 
diff --git a/chromium/third_party/libyuv/DEPS b/chromium/third_party/libyuv/DEPS
index eafc459c3f3..7e866873c45 100644
--- a/chromium/third_party/libyuv/DEPS
+++ b/chromium/third_party/libyuv/DEPS
@@ -13,7 +13,7 @@ vars = {
   "googlecode_url": "http://%s.googlecode.com/svn",
   "chromium_trunk" : "http://src.chromium.org/svn/trunk",
   # chrome://version/ for revision of canary Chrome.
-  "chromium_revision": "202548",
+  "chromium_revision": "232627",
 }
 
 # NOTE: Prefer revision numbers to tags for svn deps. Use http rather than
@@ -78,6 +78,26 @@ deps_os = {
     "third_party/gold":
         From("chromium_deps", "src/third_party/gold"),
   },
+  "android": {
+    "third_party/android_tools":
+        From("chromium_deps", "src/third_party/android_tools"),
+  },
+  "ios": {
+    # NSS, for SSLClientSocketNSS.
+    "third_party/nss":
+        From("chromium_deps", "src/third_party/nss"),
+
+    "net/third_party/nss":
+        Var("chromium_trunk") + "/src/net/third_party/nss@" + Var("chromium_revision"),
+
+    # class-dump utility to generate header files for undocumented SDKs.
+    "testing/iossim/third_party/class-dump":
+        From("chromium_deps", "src/testing/iossim/third_party/class-dump"),
+
+    # Helper for running under the simulator.
+    "testing/iossim":
+        Var("chromium_trunk") + "/src/testing/iossim@" + Var("chromium_revision"),
+  },
 }
 
 hooks = [
@@ -92,7 +112,7 @@ hooks = [
     # A change to a .gyp, .gypi, or to GYP itself should run the generator.
     "pattern": ".",
     "action": ["python", Var("root_dir") + "/build/gyp_chromium",
-               "--depth=" + Var("root_dir"), Var("root_dir") + "/libyuv_test.gyp",
+               "--depth=" + Var("root_dir"), Var("root_dir") + "/all.gyp",
                Var("extra_gyp_flag")],
   },
   {
diff --git a/chromium/third_party/libyuv/OWNERS b/chromium/third_party/libyuv/OWNERS
index cbe985ecfdd..df673dfa5e5 100644
--- a/chromium/third_party/libyuv/OWNERS
+++ b/chromium/third_party/libyuv/OWNERS
@@ -1,3 +1,2 @@
 fbarchard@chromium.org
 mflodman@chromium.org
-
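
The Android.mk hunk above folds the ARGB scaler into the shared scale sources (scale_common.cc, scale_posix.cc) and drops scale_argb_neon.cc; the matching API change, in the scale.h hunk further down, inserts kFilterLinear into FilterMode and renumbers kFilterBilinear and kFilterBox. A minimal caller-side sketch, with ScalePlane()'s signature assumed from libyuv's public scale.h (only partially shown in this diff):

    // Sketch only: downscale a luma plane by 2x using the revised FilterMode
    // values; kFilterBilinear is now 2, so use the enumerator, not a literal.
    #include "libyuv/scale.h"

    void HalveLumaPlane(const uint8* src_y, int src_stride, int src_w, int src_h,
                        uint8* dst_y, int dst_stride) {
      libyuv::ScalePlane(src_y, src_stride, src_w, src_h,
                         dst_y, dst_stride, src_w / 2, src_h / 2,
                         libyuv::kFilterBilinear);
    }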
diff --git a/chromium/third_party/libyuv/README.chromium b/chromium/third_party/libyuv/README.chromium
index edc5d82ba88..f11363cc425 100644
--- a/chromium/third_party/libyuv/README.chromium
+++ b/chromium/third_party/libyuv/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 723
+Version: 911
 License: BSD
 License File: LICENSE
 
diff --git a/chromium/third_party/libyuv/all.gyp b/chromium/third_party/libyuv/all.gyp
new file mode 100644
index 00000000000..cc72d9d6fc3
--- /dev/null
+++ b/chromium/third_party/libyuv/all.gyp
@@ -0,0 +1,21 @@
+# Copyright 2013 The LibYuv Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS. All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+
+# all.gyp and All target are for benefit of android gyp build.
+{
+  'targets': [
+    {
+      'target_name': 'All',
+      'type': 'none',
+      'dependencies': [
+        'libyuv.gyp:*',
+        'libyuv_test.gyp:*',
+      ],
+    },
+  ],
+}
diff --git a/chromium/third_party/libyuv/include/libyuv/convert_from_argb.h b/chromium/third_party/libyuv/include/libyuv/convert_from_argb.h
index be3bba44433..f0343a77d3e 100644
--- a/chromium/third_party/libyuv/include/libyuv/convert_from_argb.h
+++ b/chromium/third_party/libyuv/include/libyuv/convert_from_argb.h
@@ -1,168 +1,168 @@
-/*
- * Copyright 2012 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_ // NOLINT
-#define INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_
-
-#include "libyuv/basic_types.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// Copy ARGB to ARGB.
-#define ARGBToARGB ARGBCopy
-LIBYUV_API
-int ARGBCopy(const uint8* src_argb, int src_stride_argb,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height);
-
-// Convert ARGB To BGRA. (alias)
-#define ARGBToBGRA BGRAToARGB
-LIBYUV_API
-int BGRAToARGB(const uint8* src_frame, int src_stride_frame,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height);
-
-// Convert ARGB To ABGR. (alias)
-#define ARGBToABGR ABGRToARGB
-LIBYUV_API
-int ABGRToARGB(const uint8* src_frame, int src_stride_frame,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height);
-
-// Convert ARGB To RGBA.
-LIBYUV_API
-int ARGBToRGBA(const uint8* src_frame, int src_stride_frame,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height);
-
-// Convert ARGB To RGB24.
-LIBYUV_API
-int ARGBToRGB24(const uint8* src_argb, int src_stride_argb,
- uint8* dst_rgb24, int dst_stride_rgb24,
- int width, int height);
-
-// Convert ARGB To RAW.
-LIBYUV_API
-int ARGBToRAW(const uint8* src_argb, int src_stride_argb,
- uint8* dst_rgb, int dst_stride_rgb,
- int width, int height);
-
-// Convert ARGB To RGB565.
-LIBYUV_API
-int ARGBToRGB565(const uint8* src_argb, int src_stride_argb,
- uint8* dst_rgb565, int dst_stride_rgb565,
- int width, int height);
-
-// Convert ARGB To ARGB1555.
-LIBYUV_API
-int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb,
- uint8* dst_argb1555, int dst_stride_argb1555,
- int width, int height);
-
-// Convert ARGB To ARGB4444.
-LIBYUV_API
-int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb,
- uint8* dst_argb4444, int dst_stride_argb4444,
- int width, int height);
-
-// Convert ARGB To I444.
-LIBYUV_API
-int ARGBToI444(const uint8* src_argb, int src_stride_argb,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height);
-
-// Convert ARGB To I422.
-LIBYUV_API
-int ARGBToI422(const uint8* src_argb, int src_stride_argb,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height);
-
-// Convert ARGB To I420. (also in convert.h)
-LIBYUV_API
-int ARGBToI420(const uint8* src_argb, int src_stride_argb,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height);
-
-// Convert ARGB to J420. (JPeg full range I420).
-LIBYUV_API
-int ARGBToJ420(const uint8* src_argb, int src_stride_argb,
- uint8* dst_yj, int dst_stride_yj,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height);
-
-// Convert ARGB To I411.
-LIBYUV_API
-int ARGBToI411(const uint8* src_argb, int src_stride_argb,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height);
-
-// Convert ARGB to J400. (JPeg full range).
-LIBYUV_API
-int ARGBToJ400(const uint8* src_argb, int src_stride_argb,
- uint8* dst_yj, int dst_stride_yj,
- int width, int height);
-
-// Convert ARGB to I400.
-LIBYUV_API
-int ARGBToI400(const uint8* src_argb, int src_stride_argb,
- uint8* dst_y, int dst_stride_y,
- int width, int height);
-
-// Convert ARGB To NV12.
-LIBYUV_API
-int ARGBToNV12(const uint8* src_argb, int src_stride_argb,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_uv, int dst_stride_uv,
- int width, int height);
-
-// Convert ARGB To NV21.
-LIBYUV_API
-int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_vu, int dst_stride_vu,
- int width, int height);
-
-// Convert ARGB To NV21.
-LIBYUV_API
-int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_vu, int dst_stride_vu,
- int width, int height);
-
-// Convert ARGB To YUY2.
-LIBYUV_API
-int ARGBToYUY2(const uint8* src_argb, int src_stride_argb,
- uint8* dst_yuy2, int dst_stride_yuy2,
- int width, int height);
-
-// Convert ARGB To UYVY.
-LIBYUV_API
-int ARGBToUYVY(const uint8* src_argb, int src_stride_argb,
- uint8* dst_uyvy, int dst_stride_uyvy,
- int width, int height);
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
-
-#endif // INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_ NOLINT
+/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_ // NOLINT +#define INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_ + +#include "libyuv/basic_types.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// Copy ARGB to ARGB. +#define ARGBToARGB ARGBCopy +LIBYUV_API +int ARGBCopy(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert ARGB To BGRA. (alias) +#define ARGBToBGRA BGRAToARGB +LIBYUV_API +int BGRAToARGB(const uint8* src_frame, int src_stride_frame, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert ARGB To ABGR. (alias) +#define ARGBToABGR ABGRToARGB +LIBYUV_API +int ABGRToARGB(const uint8* src_frame, int src_stride_frame, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert ARGB To RGBA. +LIBYUV_API +int ARGBToRGBA(const uint8* src_frame, int src_stride_frame, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert ARGB To RGB24. +LIBYUV_API +int ARGBToRGB24(const uint8* src_argb, int src_stride_argb, + uint8* dst_rgb24, int dst_stride_rgb24, + int width, int height); + +// Convert ARGB To RAW. +LIBYUV_API +int ARGBToRAW(const uint8* src_argb, int src_stride_argb, + uint8* dst_rgb, int dst_stride_rgb, + int width, int height); + +// Convert ARGB To RGB565. +LIBYUV_API +int ARGBToRGB565(const uint8* src_argb, int src_stride_argb, + uint8* dst_rgb565, int dst_stride_rgb565, + int width, int height); + +// Convert ARGB To ARGB1555. +LIBYUV_API +int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb1555, int dst_stride_argb1555, + int width, int height); + +// Convert ARGB To ARGB4444. +LIBYUV_API +int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb4444, int dst_stride_argb4444, + int width, int height); + +// Convert ARGB To I444. +LIBYUV_API +int ARGBToI444(const uint8* src_argb, int src_stride_argb, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Convert ARGB To I422. +LIBYUV_API +int ARGBToI422(const uint8* src_argb, int src_stride_argb, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Convert ARGB To I420. (also in convert.h) +LIBYUV_API +int ARGBToI420(const uint8* src_argb, int src_stride_argb, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Convert ARGB to J420. (JPeg full range I420). +LIBYUV_API +int ARGBToJ420(const uint8* src_argb, int src_stride_argb, + uint8* dst_yj, int dst_stride_yj, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Convert ARGB To I411. +LIBYUV_API +int ARGBToI411(const uint8* src_argb, int src_stride_argb, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Convert ARGB to J400. (JPeg full range). 
+LIBYUV_API +int ARGBToJ400(const uint8* src_argb, int src_stride_argb, + uint8* dst_yj, int dst_stride_yj, + int width, int height); + +// Convert ARGB to I400. +LIBYUV_API +int ARGBToI400(const uint8* src_argb, int src_stride_argb, + uint8* dst_y, int dst_stride_y, + int width, int height); + +// Convert ARGB To NV12. +LIBYUV_API +int ARGBToNV12(const uint8* src_argb, int src_stride_argb, + uint8* dst_y, int dst_stride_y, + uint8* dst_uv, int dst_stride_uv, + int width, int height); + +// Convert ARGB To NV21. +LIBYUV_API +int ARGBToNV21(const uint8* src_argb, int src_stride_argb, + uint8* dst_y, int dst_stride_y, + uint8* dst_vu, int dst_stride_vu, + int width, int height); + +// Convert ARGB To NV21. +LIBYUV_API +int ARGBToNV21(const uint8* src_argb, int src_stride_argb, + uint8* dst_y, int dst_stride_y, + uint8* dst_vu, int dst_stride_vu, + int width, int height); + +// Convert ARGB To YUY2. +LIBYUV_API +int ARGBToYUY2(const uint8* src_argb, int src_stride_argb, + uint8* dst_yuy2, int dst_stride_yuy2, + int width, int height); + +// Convert ARGB To UYVY. +LIBYUV_API +int ARGBToUYVY(const uint8* src_argb, int src_stride_argb, + uint8* dst_uyvy, int dst_stride_uyvy, + int width, int height); + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_ NOLINT diff --git a/chromium/third_party/libyuv/include/libyuv/cpu_id.h b/chromium/third_party/libyuv/include/libyuv/cpu_id.h index 8b6d043222b..79da994c744 100644 --- a/chromium/third_party/libyuv/include/libyuv/cpu_id.h +++ b/chromium/third_party/libyuv/include/libyuv/cpu_id.h @@ -18,6 +18,7 @@ namespace libyuv { extern "C" { #endif +// TODO(fbarchard): Consider overlapping bits for different architectures. // Internal flag to indicate cpuid requires initialization. static const int kCpuInit = 0x1; @@ -35,11 +36,13 @@ static const int kCpuHasSSE42 = 0x100; static const int kCpuHasAVX = 0x200; static const int kCpuHasAVX2 = 0x400; static const int kCpuHasERMS = 0x800; +static const int kCpuHasFMA3 = 0x1000; +// 0x2000, 0x4000, 0x8000 reserved for future X86 flags. // These flags are only valid on MIPS processors. -static const int kCpuHasMIPS = 0x1000; -static const int kCpuHasMIPS_DSP = 0x2000; -static const int kCpuHasMIPS_DSPR2 = 0x4000; +static const int kCpuHasMIPS = 0x10000; +static const int kCpuHasMIPS_DSP = 0x20000; +static const int kCpuHasMIPS_DSPR2 = 0x40000; // Internal function used to auto-init. LIBYUV_API @@ -65,8 +68,10 @@ LIBYUV_API void MaskCpuFlags(int enable_flags); // Low level cpuid for X86. Returns zeros on other CPUs. +// eax is the info type that you want. +// ecx is typically the cpu number, and should normally be zero. LIBYUV_API -void CpuId(int cpu_info[4], int info_type); +void CpuId(uint32 eax, uint32 ecx, uint32* cpu_info); #ifdef __cplusplus } // extern "C" diff --git a/chromium/third_party/libyuv/include/libyuv/mjpeg_decoder.h b/chromium/third_party/libyuv/include/libyuv/mjpeg_decoder.h index e53c1fe1e2e..7bb82fce146 100644 --- a/chromium/third_party/libyuv/include/libyuv/mjpeg_decoder.h +++ b/chromium/third_party/libyuv/include/libyuv/mjpeg_decoder.h @@ -45,7 +45,7 @@ struct SetJmpErrorMgr; // MJPEG frames. 
// // See http://tools.ietf.org/html/rfc2435 -class MJpegDecoder { +class LIBYUV_API MJpegDecoder { public: typedef void (*CallbackFunction)(void* opaque, const uint8* const* data, diff --git a/chromium/third_party/libyuv/include/libyuv/planar_functions.h b/chromium/third_party/libyuv/include/libyuv/planar_functions.h index cb14678a8b3..1d54ddec147 100644 --- a/chromium/third_party/libyuv/include/libyuv/planar_functions.h +++ b/chromium/third_party/libyuv/include/libyuv/planar_functions.h @@ -72,6 +72,7 @@ int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2, int width, int height); // Convert UYVY to I422. +LIBYUV_API int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy, uint8* dst_y, int dst_stride_y, uint8* dst_u, int dst_stride_u, @@ -187,14 +188,27 @@ int ARGBSepia(uint8* dst_argb, int dst_stride_argb, int x, int y, int width, int height); // Apply a matrix rotation to each ARGB pixel. +// matrix_argb is 4 signed ARGB values. -128 to 127 representing -2 to 2. +// The first 4 coefficients apply to B, G, R, A and produce B of the output. +// The next 4 coefficients apply to B, G, R, A and produce G of the output. +// The next 4 coefficients apply to B, G, R, A and produce R of the output. +// The last 4 coefficients apply to B, G, R, A and produce A of the output. +LIBYUV_API +int ARGBColorMatrix(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + const int8* matrix_argb, + int width, int height); + +// Deprecated. Use ARGBColorMatrix instead. +// Apply a matrix rotation to each ARGB pixel. // matrix_argb is 3 signed ARGB values. -128 to 127 representing -1 to 1. // The first 4 coefficients apply to B, G, R, A and produce B of the output. // The next 4 coefficients apply to B, G, R, A and produce G of the output. // The last 4 coefficients apply to B, G, R, A and produce R of the output. LIBYUV_API -int ARGBColorMatrix(uint8* dst_argb, int dst_stride_argb, - const int8* matrix_argb, - int x, int y, int width, int height); +int RGBColorMatrix(uint8* dst_argb, int dst_stride_argb, + const int8* matrix_rgb, + int x, int y, int width, int height); // Apply a color table each ARGB pixel. // Table contains 256 ARGB values. @@ -203,6 +217,36 @@ int ARGBColorTable(uint8* dst_argb, int dst_stride_argb, const uint8* table_argb, int x, int y, int width, int height); +// Apply a color table each ARGB pixel but preserve destination alpha. +// Table contains 256 ARGB values. +LIBYUV_API +int RGBColorTable(uint8* dst_argb, int dst_stride_argb, + const uint8* table_argb, + int x, int y, int width, int height); + +// Apply a luma/color table each ARGB pixel but preserve destination alpha. +// Table contains 32768 values indexed by [Y][C] where 7 it 7 bit luma from +// RGB (YJ style) and C is an 8 bit color component (R, G or B). +LIBYUV_API +int ARGBLumaColorTable(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + const uint8* luma_rgb_table, + int width, int height); + +// Apply a 3 term polynomial to ARGB values. +// poly points to a 4x4 matrix. The first row is constants. The 2nd row is +// coefficients for b, g, r and a. The 3rd row is coefficients for b squared, +// g squared, r squared and a squared. The 4rd row is coefficients for b to +// the 3, g to the 3, r to the 3 and a to the 3. The values are summed and +// result clamped to 0 to 255. +// A polynomial approximation can be dirived using software such as 'R'. 
+ +LIBYUV_API +int ARGBPolynomial(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + const float* poly, + int width, int height); + // Quantize a rectangle of ARGB. Alpha unaffected. // scale is a 16 bit fractional fixed point scaler between 0 and 65535. // interval_size should be a value between 1 and 255. @@ -218,6 +262,18 @@ int ARGBCopy(const uint8* src_argb, int src_stride_argb, uint8* dst_argb, int dst_stride_argb, int width, int height); +// Copy ARGB to ARGB. +LIBYUV_API +int ARGBCopyAlpha(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Copy ARGB to ARGB. +LIBYUV_API +int ARGBCopyYToAlpha(const uint8* src_y, int src_stride_y, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + typedef void (*ARGBBlendRow)(const uint8* src_argb0, const uint8* src_argb1, uint8* dst_argb, int width); @@ -288,6 +344,7 @@ int MJPGToARGB(const uint8* sample, size_t sample_size, uint8* argb, int argb_stride, int w, int h, int dw, int dh); +// Internal function - do not call directly. // Computes table of cumulative sum for image where the value is the sum // of all values above and to the left of the entry. Used by ARGBBlur. LIBYUV_API @@ -296,8 +353,11 @@ int ARGBComputeCumulativeSum(const uint8* src_argb, int src_stride_argb, int width, int height); // Blur ARGB image. -// Caller should allocate dst_cumsum table of width * height * 16 bytes aligned -// to 16 byte boundary. +// dst_cumsum table of width * (height + 1) * 16 bytes aligned to +// 16 byte boundary. +// dst_stride32_cumsum is number of ints in a row (width * 4). +// radius is number of pixels around the center. e.g. 1 = 3x3. 2=5x5. +// Blur is optimized for radius of 5 (11x11) or less. LIBYUV_API int ARGBBlur(const uint8* src_argb, int src_stride_argb, uint8* dst_argb, int dst_stride_argb, @@ -347,6 +407,12 @@ int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra, uint8* dst_argb, int dst_stride_argb, const uint8* shuffler, int width, int height); +// Sobel ARGB effect with planar output. +LIBYUV_API +int ARGBSobelToPlane(const uint8* src_argb, int src_stride_argb, + uint8* dst_y, int dst_stride_y, + int width, int height); + // Sobel ARGB effect. LIBYUV_API int ARGBSobel(const uint8* src_argb, int src_stride_argb, diff --git a/chromium/third_party/libyuv/include/libyuv/row.h b/chromium/third_party/libyuv/include/libyuv/row.h index 3416661742f..b6056fdca9b 100644 --- a/chromium/third_party/libyuv/include/libyuv/row.h +++ b/chromium/third_party/libyuv/include/libyuv/row.h @@ -38,32 +38,66 @@ extern "C" { // The following are available on all x86 platforms: #if !defined(LIBYUV_DISABLE_X86) && \ (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) -// Conversions. 
+// Effects: +#define HAS_ARGBADDROW_SSE2 +#define HAS_ARGBAFFINEROW_SSE2 +#define HAS_ARGBATTENUATEROW_SSSE3 +#define HAS_ARGBBLENDROW_SSSE3 +#define HAS_ARGBCOLORMATRIXROW_SSSE3 +#define HAS_ARGBCOLORTABLEROW_X86 +#define HAS_ARGBCOPYALPHAROW_SSE2 +#define HAS_ARGBCOPYYTOALPHAROW_SSE2 +#define HAS_ARGBGRAYROW_SSSE3 +#define HAS_ARGBLUMACOLORTABLEROW_SSSE3 +#define HAS_ARGBMIRRORROW_SSSE3 +#define HAS_ARGBMULTIPLYROW_SSE2 +#define HAS_ARGBPOLYNOMIALROW_SSE2 +#define HAS_ARGBQUANTIZEROW_SSE2 +#define HAS_ARGBSEPIAROW_SSSE3 +#define HAS_ARGBSHADEROW_SSE2 +#define HAS_ARGBSUBTRACTROW_SSE2 +#define HAS_ARGBTOUVROW_SSSE3 +#define HAS_ARGBUNATTENUATEROW_SSE2 +#define HAS_COMPUTECUMULATIVESUMROW_SSE2 +#define HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 +#define HAS_INTERPOLATEROW_SSE2 +#define HAS_INTERPOLATEROW_SSSE3 +#define HAS_RGBCOLORTABLEROW_X86 +#define HAS_SOBELROW_SSE2 +#define HAS_SOBELTOPLANEROW_SSE2 +#define HAS_SOBELXROW_SSE2 +#define HAS_SOBELXYROW_SSE2 +#define HAS_SOBELYROW_SSE2 + +// Conversions: #define HAS_ABGRTOUVROW_SSSE3 #define HAS_ABGRTOYROW_SSSE3 #define HAS_ARGB1555TOARGBROW_SSE2 #define HAS_ARGB4444TOARGBROW_SSE2 +#define HAS_ARGBSHUFFLEROW_SSE2 #define HAS_ARGBSHUFFLEROW_SSSE3 #define HAS_ARGBTOARGB1555ROW_SSE2 #define HAS_ARGBTOARGB4444ROW_SSE2 +#define HAS_ARGBTOBAYERGGROW_SSE2 #define HAS_ARGBTOBAYERROW_SSSE3 #define HAS_ARGBTORAWROW_SSSE3 #define HAS_ARGBTORGB24ROW_SSSE3 #define HAS_ARGBTORGB565ROW_SSE2 #define HAS_ARGBTOUV422ROW_SSSE3 #define HAS_ARGBTOUV444ROW_SSSE3 -#define HAS_ARGBTOUVROW_SSSE3 #define HAS_ARGBTOUVJROW_SSSE3 -#define HAS_ARGBTOYROW_SSSE3 #define HAS_ARGBTOYJROW_SSSE3 +#define HAS_ARGBTOYROW_SSSE3 #define HAS_BGRATOUVROW_SSSE3 #define HAS_BGRATOYROW_SSSE3 +#define HAS_COPYROW_ERMS #define HAS_COPYROW_SSE2 #define HAS_COPYROW_X86 -#define HAS_COPYROW_ERMS +#define HAS_FIXEDDIV_X86 #define HAS_HALFROW_SSE2 #define HAS_I400TOARGBROW_SSE2 #define HAS_I411TOARGBROW_SSSE3 +#define HAS_I422TOARGB1555ROW_SSSE3 #define HAS_I422TOABGRROW_SSSE3 #define HAS_I422TOARGB1555ROW_SSSE3 #define HAS_I422TOARGB4444ROW_SSSE3 @@ -77,7 +111,9 @@ extern "C" { #define HAS_I422TOYUY2ROW_SSE2 #define HAS_I444TOARGBROW_SSSE3 #define HAS_MERGEUVROW_SSE2 +#define HAS_MIRRORROW_SSE2 #define HAS_MIRRORROW_SSSE3 +#define HAS_MIRRORROW_UV_SSSE3 #define HAS_MIRRORUVROW_SSSE3 #define HAS_NV12TOARGBROW_SSSE3 #define HAS_NV12TORGB565ROW_SSSE3 @@ -101,43 +137,48 @@ extern "C" { #define HAS_YUY2TOUV422ROW_SSE2 #define HAS_YUY2TOUVROW_SSE2 #define HAS_YUY2TOYROW_SSE2 - -// Effects -#define HAS_ARGBADDROW_SSE2 -#define HAS_ARGBAFFINEROW_SSE2 -#define HAS_ARGBATTENUATEROW_SSSE3 -#define HAS_ARGBBLENDROW_SSSE3 -#define HAS_ARGBCOLORMATRIXROW_SSSE3 -#define HAS_ARGBGRAYROW_SSSE3 -#define HAS_ARGBMIRRORROW_SSSE3 -#define HAS_ARGBMULTIPLYROW_SSE2 -#define HAS_ARGBQUANTIZEROW_SSE2 -#define HAS_ARGBSEPIAROW_SSSE3 -#define HAS_ARGBSHADEROW_SSE2 -#define HAS_ARGBSUBTRACTROW_SSE2 -#define HAS_ARGBUNATTENUATEROW_SSE2 -#define HAS_COMPUTECUMULATIVESUMROW_SSE2 -#define HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 -#define HAS_INTERPOLATEROW_SSE2 -#define HAS_INTERPOLATEROW_SSSE3 -#define HAS_SOBELROW_SSE2 -#define HAS_SOBELXROW_SSSE3 -#define HAS_SOBELXYROW_SSE2 -#define HAS_SOBELYROW_SSSE3 #endif -// The following are Windows only. -// TODO(fbarchard): Port to gcc. -#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) -#define HAS_ARGBCOLORTABLEROW_X86 +// GCC >= 4.7.0 required for AVX2. 
+#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__)) +#if (__GNUC__ > 4) || (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7)) +#define GCC_HAS_AVX2 1 +#endif // GNUC >= 4.7 +#endif // __GNUC__ + +// clang >= 3.4.0 required for AVX2. +#if defined(__clang__) && (defined(__x86_64__) || defined(__i386__)) +#if (__clang_major__ > 3) || (__clang_major__ == 3 && (__clang_minor__ >= 4)) +#define CLANG_HAS_AVX2 1 +#endif // clang >= 3.4 +#endif // __clang__ + // Visual C 2012 required for AVX2. -#if _MSC_VER >= 1700 +#if defined(_M_IX86) && defined(_MSC_VER) && _MSC_VER >= 1700 +#define VISUALC_HAS_AVX2 1 +#endif // VisualStudio >= 2012 + +// The following are available on all x86 platforms, but +// require VS2012, clang 3.4 or gcc 4.7. +// The code supports NaCL but requires a new compiler and validator. +#if !defined(LIBYUV_DISABLE_X86) && (defined(VISUALC_HAS_AVX2) || \ + defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2)) +// Effects: +#define HAS_ARGBPOLYNOMIALROW_AVX2 #define HAS_ARGBSHUFFLEROW_AVX2 +#define HAS_ARGBCOPYALPHAROW_AVX2 +#define HAS_ARGBCOPYYTOALPHAROW_AVX2 +#endif + +// The following are require VS2012. +// TODO(fbarchard): Port to gcc. +#if !defined(LIBYUV_DISABLE_X86) && defined(VISUALC_HAS_AVX2) #define HAS_ARGBTOUVROW_AVX2 #define HAS_ARGBTOYJROW_AVX2 #define HAS_ARGBTOYROW_AVX2 #define HAS_HALFROW_AVX2 #define HAS_I422TOARGBROW_AVX2 +#define HAS_INTERPOLATEROW_AVX2 #define HAS_MERGEUVROW_AVX2 #define HAS_MIRRORROW_AVX2 #define HAS_SPLITUVROW_AVX2 @@ -148,17 +189,16 @@ extern "C" { #define HAS_YUY2TOUVROW_AVX2 #define HAS_YUY2TOYROW_AVX2 -// Effects +// Effects: #define HAS_ARGBADDROW_AVX2 #define HAS_ARGBATTENUATEROW_AVX2 #define HAS_ARGBMIRRORROW_AVX2 #define HAS_ARGBMULTIPLYROW_AVX2 #define HAS_ARGBSUBTRACTROW_AVX2 #define HAS_ARGBUNATTENUATEROW_AVX2 -#endif -#endif +#endif // defined(VISUALC_HAS_AVX2) -// The following are Yasm x86 only. +// The following are Yasm x86 only: // TODO(fbarchard): Port AVX2 to inline. #if !defined(LIBYUV_DISABLE_X86) && defined(HAVE_YASM) (defined(_M_IX86) || defined(_M_X64) || \ @@ -177,12 +217,12 @@ extern "C" { #if !defined(LIBYUV_DISABLE_X86) && \ (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) && \ !defined(LIBYUV_SSSE3_ONLY) -#define HAS_ARGBATTENUATEROW_SSE2 #define HAS_ARGBBLENDROW_SSE2 +#define HAS_ARGBATTENUATEROW_SSE2 #define HAS_MIRRORROW_SSE2 #endif -// The following are available on Neon platforms +// The following are available on Neon platforms: #if !defined(LIBYUV_DISABLE_NEON) && \ (defined(__ARM_NEON__) || defined(LIBYUV_NEON)) #define HAS_ABGRTOUVROW_NEON @@ -255,7 +295,7 @@ extern "C" { #define HAS_YUY2TOUVROW_NEON #define HAS_YUY2TOYROW_NEON -// Effects +// Effects: #define HAS_ARGBADDROW_NEON #define HAS_ARGBATTENUATEROW_NEON #define HAS_ARGBBLENDROW_NEON @@ -268,13 +308,14 @@ extern "C" { #define HAS_ARGBSHADEROW_NEON #define HAS_ARGBSUBTRACTROW_NEON #define HAS_SOBELROW_NEON +#define HAS_SOBELTOPLANEROW_NEON #define HAS_SOBELXYROW_NEON #define HAS_SOBELXROW_NEON #define HAS_SOBELYROW_NEON #define HAS_INTERPOLATEROW_NEON #endif -// The following are available on Mips platforms +// The following are available on Mips platforms: #if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips__) #define HAS_COPYROW_MIPS #if defined(__mips_dsp) && (__mips_dsp_rev >= 2) @@ -304,6 +345,7 @@ typedef __declspec(align(32)) uint32 ulvec32[8]; typedef __declspec(align(32)) uint8 ulvec8[32]; #elif defined(__GNUC__) +// Caveat GCC 4.2 to 4.7 have a known issue using vectors with const. 
#define SIMD_ALIGNED(var) var __attribute__((aligned(16))) typedef int16 __attribute__((vector_size(16))) vec16; typedef int32 __attribute__((vector_size(16))) vec32; @@ -327,6 +369,14 @@ typedef uint8 uvec8[16]; #define OMITFP __attribute__((optimize("omit-frame-pointer"))) #endif +// For functions that use rowbuffer and have runtime checks for overflow, +// use SAFEBUFFERS to avoid additional check. +#if defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219) +#define SAFEBUFFERS __declspec(safebuffers) +#else +#define SAFEBUFFERS +#endif + void I444ToARGBRow_NEON(const uint8* src_y, const uint8* src_u, const uint8* src_v, @@ -655,6 +705,14 @@ void CopyRow_NEON(const uint8* src, uint8* dst, int count); void CopyRow_MIPS(const uint8* src, uint8* dst, int count); void CopyRow_C(const uint8* src, uint8* dst, int count); +void ARGBCopyAlphaRow_C(const uint8* src_argb, uint8* dst_argb, int width); +void ARGBCopyAlphaRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width); +void ARGBCopyAlphaRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width); + +void ARGBCopyYToAlphaRow_C(const uint8* src_y, uint8* dst_argb, int width); +void ARGBCopyYToAlphaRow_SSE2(const uint8* src_y, uint8* dst_argb, int width); +void ARGBCopyYToAlphaRow_AVX2(const uint8* src_y, uint8* dst_argb, int width); + void SetRow_X86(uint8* dst, uint32 v32, int count); void ARGBSetRows_X86(uint8* dst, uint32 v32, int width, int dst_stride, int height); @@ -668,6 +726,8 @@ void ARGBSetRows_C(uint8* dst, uint32 v32, int width, int dst_stride, // ARGBShufflers for BGRAToARGB etc. void ARGBShuffleRow_C(const uint8* src_argb, uint8* dst_argb, const uint8* shuffler, int pix); +void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb, + const uint8* shuffler, int pix); void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb, const uint8* shuffler, int pix); void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb, @@ -676,6 +736,8 @@ void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb, const uint8* shuffler, int pix); void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb, const uint8* shuffler, int pix); +void ARGBShuffleRow_Any_SSE2(const uint8* src_argb, uint8* dst_argb, + const uint8* shuffler, int pix); void ARGBShuffleRow_Any_SSSE3(const uint8* src_argb, uint8* dst_argb, const uint8* shuffler, int pix); void ARGBShuffleRow_Any_AVX2(const uint8* src_argb, uint8* dst_argb, @@ -1338,8 +1400,16 @@ void ARGBToBayerRow_Any_SSSE3(const uint8* src_argb, uint8* dst_bayer, uint32 selector, int pix); void ARGBToBayerRow_Any_NEON(const uint8* src_argb, uint8* dst_bayer, uint32 selector, int pix); +void ARGBToBayerGGRow_C(const uint8* src_argb, uint8* dst_bayer, + uint32 /* selector */, int pix); +void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer, + uint32 /* selector */, int pix); void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer, uint32 /* selector */, int pix); +void ARGBToBayerGGRow_Any_SSE2(const uint8* src_argb, uint8* dst_bayer, + uint32 /* selector */, int pix); +void ARGBToBayerGGRow_Any_NEON(const uint8* src_argb, uint8* dst_bayer, + uint32 /* selector */, int pix); void I422ToYUY2Row_C(const uint8* src_y, const uint8* src_u, @@ -1398,7 +1468,7 @@ void ARGBAttenuateRow_Any_NEON(const uint8* src_argb, uint8* dst_argb, int width); // Inverse table for unattenuate, shared by C and SSE2. 
-extern uint32 fixed_invtbl8[256]; +extern const uint32 fixed_invtbl8[256]; void ARGBUnattenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width); void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width); void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width); @@ -1415,15 +1485,19 @@ void ARGBSepiaRow_C(uint8* dst_argb, int width); void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width); void ARGBSepiaRow_NEON(uint8* dst_argb, int width); -void ARGBColorMatrixRow_C(uint8* dst_argb, const int8* matrix_argb, int width); -void ARGBColorMatrixRow_SSSE3(uint8* dst_argb, const int8* matrix_argb, - int width); -void ARGBColorMatrixRow_NEON(uint8* dst_argb, const int8* matrix_argb, - int width); +void ARGBColorMatrixRow_C(const uint8* src_argb, uint8* dst_argb, + const int8* matrix_argb, int width); +void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb, + const int8* matrix_argb, int width); +void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb, + const int8* matrix_argb, int width); void ARGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width); void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width); +void RGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width); +void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width); + void ARGBQuantizeRow_C(uint8* dst_argb, int scale, int interval_size, int interval_offset, int width); void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size, @@ -1466,6 +1540,9 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr, void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, ptrdiff_t src_stride_ptr, int width, int source_y_fraction); +void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride_ptr, int width, + int source_y_fraction); void InterpolateRow_NEON(uint8* dst_ptr, const uint8* src_ptr, ptrdiff_t src_stride_ptr, int width, int source_y_fraction); @@ -1487,6 +1564,9 @@ void InterpolateRow_Any_SSE2(uint8* dst_ptr, const uint8* src_ptr, void InterpolateRow_Any_SSSE3(uint8* dst_ptr, const uint8* src_ptr, ptrdiff_t src_stride_ptr, int width, int source_y_fraction); +void InterpolateRow_Any_AVX2(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride_ptr, int width, + int source_y_fraction); void InterpolateRows_Any_MIPS_DSPR2(uint8* dst_ptr, const uint8* src_ptr, ptrdiff_t src_stride_ptr, int width, int source_y_fraction); @@ -1494,14 +1574,14 @@ void InterpolateRows_Any_MIPS_DSPR2(uint8* dst_ptr, const uint8* src_ptr, // Sobel images. 
void SobelXRow_C(const uint8* src_y0, const uint8* src_y1, const uint8* src_y2, uint8* dst_sobelx, int width); -void SobelXRow_SSSE3(const uint8* src_y0, const uint8* src_y1, - const uint8* src_y2, uint8* dst_sobelx, int width); +void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1, + const uint8* src_y2, uint8* dst_sobelx, int width); void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1, const uint8* src_y2, uint8* dst_sobelx, int width); void SobelYRow_C(const uint8* src_y0, const uint8* src_y1, uint8* dst_sobely, int width); -void SobelYRow_SSSE3(const uint8* src_y0, const uint8* src_y1, - uint8* dst_sobely, int width); +void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1, + uint8* dst_sobely, int width); void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1, uint8* dst_sobely, int width); void SobelRow_C(const uint8* src_sobelx, const uint8* src_sobely, @@ -1510,6 +1590,12 @@ void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, uint8* dst_argb, int width); void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, uint8* dst_argb, int width); +void SobelToPlaneRow_C(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_y, int width); +void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_y, int width); +void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_y, int width); void SobelXYRow_C(const uint8* src_sobelx, const uint8* src_sobely, uint8* dst_argb, int width); void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, @@ -1517,6 +1603,31 @@ void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, uint8* dst_argb, int width); +void ARGBPolynomialRow_C(const uint8* src_argb, + uint8* dst_argb, const float* poly, + int width); +void ARGBPolynomialRow_SSE2(const uint8* src_argb, + uint8* dst_argb, const float* poly, + int width); +void ARGBPolynomialRow_AVX2(const uint8* src_argb, + uint8* dst_argb, const float* poly, + int width); + +void ARGBLumaColorTableRow_C(const uint8* src_argb, uint8* dst_argb, int width, + const uint8* luma, const uint32 lumacoeff); +void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb, + int width, const uint8* luma, + const uint32 lumacoeff); + +// Divide num by div and return as 16.16 fixed point result. +int FixedDiv_C(int num, int div); +int FixedDiv_X86(int num, int div); +#ifdef HAS_FIXEDDIV_X86 +#define FixedDiv FixedDiv_X86 +#else +#define FixedDiv FixedDiv_C +#endif + #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/chromium/third_party/libyuv/include/libyuv/scale.h b/chromium/third_party/libyuv/include/libyuv/scale.h index b1efc95d2fd..b672dbfcee8 100644 --- a/chromium/third_party/libyuv/include/libyuv/scale.h +++ b/chromium/third_party/libyuv/include/libyuv/scale.h @@ -18,11 +18,12 @@ namespace libyuv { extern "C" { #endif -// Supported filtering +// Supported filtering. enum FilterMode { kFilterNone = 0, // Point sample; Fastest. - kFilterBilinear = 1, // Faster than box, but lower quality scaling down. - kFilterBox = 2 // Highest quality. + kFilterLinear = 1, // Filter horizontally only. + kFilterBilinear = 2, // Faster than box, but lower quality scaling down. + kFilterBox = 3 // Highest quality. }; // Scale a YUV plane. 
diff --git a/chromium/third_party/libyuv/include/libyuv/scale_argb.h b/chromium/third_party/libyuv/include/libyuv/scale_argb.h index b6f510522e7..0c9b3625757 100644 --- a/chromium/third_party/libyuv/include/libyuv/scale_argb.h +++ b/chromium/third_party/libyuv/include/libyuv/scale_argb.h @@ -35,6 +35,20 @@ int ARGBScaleClip(const uint8* src_argb, int src_stride_argb, int clip_x, int clip_y, int clip_width, int clip_height, enum FilterMode filtering); +// TODO(fbarchard): Implement this. +// Scale with YUV conversion to ARGB and clipping. +LIBYUV_API +int YUVToARGBScaleClip(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint32 src_fourcc, + int src_width, int src_height, + uint8* dst_argb, int dst_stride_argb, + uint32 dst_fourcc, + int dst_width, int dst_height, + int clip_x, int clip_y, int clip_width, int clip_height, + enum FilterMode filtering); + #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/chromium/third_party/libyuv/include/libyuv/scale_row.h b/chromium/third_party/libyuv/include/libyuv/scale_row.h new file mode 100644 index 00000000000..23c4e90791f --- /dev/null +++ b/chromium/third_party/libyuv/include/libyuv/scale_row.h @@ -0,0 +1,273 @@ +/* + * Copyright 2013 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef INCLUDE_LIBYUV_SCALE_ROW_H_ // NOLINT +#define INCLUDE_LIBYUV_SCALE_ROW_H_ + +#include "libyuv/basic_types.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// The following are available on all x86 platforms: +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) +#define HAS_SCALEROWDOWN2_SSE2 +#define HAS_SCALEROWDOWN4_SSE2 +#define HAS_SCALEROWDOWN34_SSSE3 +#define HAS_SCALEROWDOWN38_SSSE3 +#define HAS_SCALEADDROWS_SSE2 +#define HAS_SCALEFILTERCOLS_SSSE3 +#define HAS_SCALECOLSUP2_SSE2 +#define HAS_SCALEARGBROWDOWN2_SSE2 +#define HAS_SCALEARGBROWDOWNEVEN_SSE2 +#define HAS_SCALEARGBCOLS_SSE2 +#define HAS_SCALEARGBFILTERCOLS_SSSE3 +#define HAS_SCALEARGBCOLSUP2_SSE2 +#endif + +// The following are available on Neon platforms: +#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \ + (defined(__ARM_NEON__) || defined(LIBYUV_NEON)) +#define HAS_SCALEROWDOWN2_NEON +#define HAS_SCALEROWDOWN4_NEON +#define HAS_SCALEROWDOWN34_NEON +#define HAS_SCALEROWDOWN38_NEON +#define HAS_SCALEARGBROWDOWNEVEN_NEON +#define HAS_SCALEARGBROWDOWN2_NEON +#endif + +// The following are available on Mips platforms: +#if !defined(LIBYUV_DISABLE_MIPS) && !defined(__native_client__) && \ + defined(__mips__) +#define HAS_SCALEROWDOWN2_MIPS_DSPR2 +#define HAS_SCALEROWDOWN4_MIPS_DSPR2 +#define HAS_SCALEROWDOWN34_MIPS_DSPR2 +#define HAS_SCALEROWDOWN38_MIPS_DSPR2 +#endif + +// Scale ARGB vertically with bilinear interpolation. +void ScalePlaneVertical(int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_argb, uint8* dst_argb, + int x, int y, int dy, + int bpp, FilterMode filtering); + +// Simplify the filtering based on scale factors. 
+FilterMode ScaleFilterReduce(int src_width, int src_height, + int dst_width, int dst_height, + FilterMode filtering); + +// Compute slope values for stepping. +void ScaleSlope(int src_width, int src_height, + int dst_width, int dst_height, + FilterMode filtering, + int* x, int* y, int* dx, int* dy); + +void ScaleRowDown2_C(const uint8* src_ptr, ptrdiff_t /* src_stride */, + uint8* dst, int dst_width); +void ScaleRowDown2Linear_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); +void ScaleRowDown2Box_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); +void ScaleRowDown4_C(const uint8* src_ptr, ptrdiff_t /* src_stride */, + uint8* dst, int dst_width); +void ScaleRowDown4Box_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); +void ScaleRowDown34_C(const uint8* src_ptr, ptrdiff_t /* src_stride */, + uint8* dst, int dst_width); +void ScaleRowDown34_0_Box_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* d, int dst_width); +void ScaleRowDown34_1_Box_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* d, int dst_width); +void ScaleCols_C(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int x, int dx); +void ScaleColsUp2_C(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int, int); +void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int x, int dx); +void ScaleRowDown38_C(const uint8* src_ptr, ptrdiff_t /* src_stride */, + uint8* dst, int dst_width); +void ScaleRowDown38_3_Box_C(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown38_2_Box_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleAddRows_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint16* dst_ptr, int src_width, int src_height); +void ScaleARGBRowDown2_C(const uint8* src_argb, + ptrdiff_t /* src_stride */, + uint8* dst_argb, int dst_width); +void ScaleARGBRowDown2Linear_C(const uint8* src_argb, + ptrdiff_t /* src_stride */, + uint8* dst_argb, int dst_width); +void ScaleARGBRowDown2Box_C(const uint8* src_argb, ptrdiff_t src_stride, + uint8* dst_argb, int dst_width); +void ScaleARGBRowDownEven_C(const uint8* src_argb, ptrdiff_t /* src_stride */, + int src_stepx, + uint8* dst_argb, int dst_width); +void ScaleARGBRowDownEvenBox_C(const uint8* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8* dst_argb, int dst_width); +void ScaleARGBCols_C(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx); +void ScaleARGBColsUp2_C(uint8* dst_argb, const uint8* src_argb, + int dst_width, int, int); +void ScaleARGBFilterCols_C(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx); + +void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr, ptrdiff_t, + uint8* dst_ptr, int dst_width); +void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + 
uint8* dst_ptr, int dst_width); +void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint16* dst_ptr, int src_width, + int src_height); +void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int x, int dx); +void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int /* x */, int /* dx */); +void ScaleARGBRowDown2_SSE2(const uint8* src_argb, + ptrdiff_t /* src_stride */, + uint8* dst_argb, int dst_width); +void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb, + ptrdiff_t /* src_stride */, + uint8* dst_argb, int dst_width); +void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb, + ptrdiff_t src_stride, + uint8* dst_argb, int dst_width); +void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride, + int src_stepx, + uint8* dst_argb, int dst_width); +void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8* dst_argb, int dst_width); +void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx); +void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx); +void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb, + int dst_width, int /* x */, int /* dx */); +// Row functions. +void ScaleARGBRowDownEven_NEON(const uint8* src_argb, int src_stride, + int src_stepx, + uint8* dst_argb, int dst_width); +void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, int src_stride, + int src_stepx, + uint8* dst_argb, int dst_width); +void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */, + uint8* dst, int dst_width); +void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); + +// ScaleRowDown2Box also used by planar functions +// NEON downscalers with interpolation. + +// Note - not static due to reuse in convert for 444 to 420. +void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */, + uint8* dst, int dst_width); + +void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); + +void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */, + uint8* dst_ptr, int dst_width); +void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); + +// Down scale from 4 to 3 pixels. Use the neon multilane read/write +// to load up the every 4th pixel into a 4 different registers. +// Point samples 32 pixels to 24 pixels. 
+void ScaleRowDown34_NEON(const uint8* src_ptr, + ptrdiff_t /* src_stride */, + uint8* dst_ptr, int dst_width); +void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); + +// 32 -> 12 +void ScaleRowDown38_NEON(const uint8* src_ptr, + ptrdiff_t /* src_stride */, + uint8* dst_ptr, int dst_width); +// 32x3 -> 12x1 +void ScaleRowDown38_3_Box_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +// 32x2 -> 12x1 +void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); + +void ScaleRowDown2_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */, + uint8* dst, int dst_width); +void ScaleRowDown2Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); +void ScaleRowDown4_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */, + uint8* dst, int dst_width); +void ScaleRowDown4Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); +void ScaleRowDown34_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */, + uint8* dst, int dst_width); +void ScaleRowDown34_0_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* d, int dst_width); +void ScaleRowDown34_1_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* d, int dst_width); +void ScaleRowDown38_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */, + uint8* dst, int dst_width); +void ScaleRowDown38_2_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown38_3_Box_MIPS_DSPR2(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // INCLUDE_LIBYUV_SCALE_ROW_H_ NOLINT diff --git a/chromium/third_party/libyuv/include/libyuv/version.h b/chromium/third_party/libyuv/include/libyuv/version.h index 31cf78fc591..3bb834f9448 100644 --- a/chromium/third_party/libyuv/include/libyuv/version.h +++ b/chromium/third_party/libyuv/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 723 +#define LIBYUV_VERSION 911 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/chromium/third_party/libyuv/libyuv.gyp b/chromium/third_party/libyuv/libyuv.gyp index ad6b78b5c3e..4130bd0d3f0 100644 --- a/chromium/third_party/libyuv/libyuv.gyp +++ b/chromium/third_party/libyuv/libyuv.gyp @@ -74,6 +74,7 @@ 'include/libyuv/row.h', 'include/libyuv/scale.h', 'include/libyuv/scale_argb.h', + 'include/libyuv/scale_row.h', 'include/libyuv/version.h', 'include/libyuv/video_common.h', @@ -107,9 +108,11 @@ 'source/row_win.cc', 'source/scale.cc', 'source/scale_argb.cc', - 'source/scale_argb_neon.cc', + 'source/scale_common.cc', 'source/scale_mips.cc', 'source/scale_neon.cc', + 'source/scale_posix.cc', + 'source/scale_win.cc', 'source/video_common.cc', ], }, diff --git a/chromium/third_party/libyuv/libyuv_test.gyp b/chromium/third_party/libyuv/libyuv_test.gyp index 447881a4480..906fc5f8b0d 100644 --- a/chromium/third_party/libyuv/libyuv_test.gyp +++ b/chromium/third_party/libyuv/libyuv_test.gyp @@ -35,6 +35,7 @@ 'unit_test/compare_test.cc', 'unit_test/convert_test.cc', 'unit_test/cpu_test.cc', + 'unit_test/math_test.cc', 'unit_test/planar_test.cc', 'unit_test/rotate_argb_test.cc', 
'unit_test/rotate_test.cc', diff --git a/chromium/third_party/libyuv/linux.mk b/chromium/third_party/libyuv/linux.mk new file mode 100644 index 00000000000..5d12135a85d --- /dev/null +++ b/chromium/third_party/libyuv/linux.mk @@ -0,0 +1,48 @@ +# This is a generic makefile for libyuv for gcc. +# make -f linux.mk CC=clang++ + +CC=g++ +CCFLAGS=-O2 -fomit-frame-pointer -Iinclude/ + +LOCAL_OBJ_FILES := \ + source/compare.o \ + source/compare_common.o \ + source/compare_posix.o \ + source/convert.o \ + source/convert_argb.o \ + source/convert_from.o \ + source/convert_from_argb.o \ + source/convert_to_argb.o \ + source/convert_to_i420.o \ + source/cpu_id.o \ + source/format_conversion.o \ + source/planar_functions.o \ + source/rotate.o \ + source/rotate_argb.o \ + source/rotate_mips.o \ + source/row_any.o \ + source/row_common.o \ + source/row_mips.o \ + source/row_posix.o \ + source/scale.o \ + source/scale_argb.o \ + source/scale_common.o \ + source/scale_mips.o \ + source/scale_posix.o \ + source/video_common.o + +.cc.o: + $(CC) -c $(CCFLAGS) $*.cc -o $*.o + +all: libyuv.a convert linux.mk + +libyuv.a: $(LOCAL_OBJ_FILES) linux.mk + $(AR) $(ARFLAGS) -o $@ $(LOCAL_OBJ_FILES) + +# A test utility that uses libyuv conversion. +convert: util/convert.cc linux.mk + $(CC) $(CCFLAGS) -Iutil/ -o $@ util/convert.cc libyuv.a + +clean: + /bin/rm -f source/*.o *.ii *.s libyuv.a convert + diff --git a/chromium/third_party/libyuv/source/compare.cc b/chromium/third_party/libyuv/source/compare.cc index f8b358309e5..7d844ee08a6 100644 --- a/chromium/third_party/libyuv/source/compare.cc +++ b/chromium/third_party/libyuv/source/compare.cc @@ -30,12 +30,17 @@ extern "C" { uint32 HashDjb2_C(const uint8* src, int count, uint32 seed); // This module is for Visual C x86 -#if !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || \ +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(_M_IX86) || \ (defined(__x86_64__) || (defined(__i386__) && !defined(__pic__)))) #define HAS_HASHDJB2_SSE41 - uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed); +#if _MSC_VER >= 1700 +#define HAS_HASHDJB2_AVX2 +uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed); +#endif + #endif // HAS_HASHDJB2_SSE41 // hash seed of 5381 recommended. @@ -47,6 +52,11 @@ uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed) { HashDjb2_SSE = HashDjb2_SSE41; } #endif +#if defined(HAS_HASHDJB2_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + HashDjb2_SSE = HashDjb2_AVX2; + } +#endif const int kBlockSize = 1 << 15; // 32768; while (count >= static_cast<uint64>(kBlockSize)) { @@ -73,8 +83,8 @@ uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count); #define HAS_SUMSQUAREERROR_NEON uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count); #endif -#if !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || \ - defined(__x86_64__) || defined(__i386__)) +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) #define HAS_SUMSQUAREERROR_SSE2 uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count); #endif @@ -138,7 +148,9 @@ LIBYUV_API uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a, const uint8* src_b, int stride_b, int width, int height) { - if (stride_a == width && stride_b == width) { + // Coalesce rows. 
+ if (stride_a == width && + stride_b == width) { return ComputeSumSquareError(src_a, src_b, width * height); } uint32 (*SumSquareError)(const uint8* src_a, const uint8* src_b, int count) = diff --git a/chromium/third_party/libyuv/source/compare_common.cc b/chromium/third_party/libyuv/source/compare_common.cc index ab587d08171..3e4c77a67fe 100644 --- a/chromium/third_party/libyuv/source/compare_common.cc +++ b/chromium/third_party/libyuv/source/compare_common.cc @@ -1,40 +1,40 @@ -/*
- * Copyright 2012 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/basic_types.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count) {
-  uint32 sse = 0u;
-  for (int i = 0; i < count; ++i) {
-    int diff = src_a[i] - src_b[i];
-    sse += static_cast<uint32>(diff * diff);
-  }
-  return sse;
-}
-
-// hash seed of 5381 recommended.
-// Internal C version of HashDjb2 with int sized count for efficiency.
-uint32 HashDjb2_C(const uint8* src, int count, uint32 seed) {
-  uint32 hash = seed;
-  for (int i = 0; i < count; ++i) {
-    hash += (hash << 5) + src[i];
-  }
-  return hash;
-}
-
-#ifdef __cplusplus
-}  // extern "C"
-}  // namespace libyuv
-#endif
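HashDjb2_C above is the classic djb2 recurrence (hash = hash * 33 + byte, seed 5381). The SSE4.1 and AVX2 kernels later in this commit consume 16 bytes per step by multiplying the running hash by 33^16 (the kHash16x33 constant) and weighting the bytes with descending powers of 33 (kHashMul0..kHashMul3). The following is a standalone C++ sketch, not libyuv code; HashDjb2Scalar and HashDjb2Block16 are illustrative names showing that the two formulations agree when count is a multiple of 16.

// Standalone illustration (not libyuv code): the scalar djb2 recurrence and a
// 16-bytes-per-step formulation like the one the SIMD kernels implement.
#include <cstdint>
#include <cstdio>

static uint32_t HashDjb2Scalar(const uint8_t* src, int count, uint32_t seed) {
  uint32_t hash = seed;                      // seed of 5381 recommended.
  for (int i = 0; i < count; ++i) {
    hash = hash * 33u + src[i];              // same as (hash << 5) + hash + src[i].
  }
  return hash;
}

static uint32_t HashDjb2Block16(const uint8_t* src, int count, uint32_t seed) {
  // pow33[j] = 33^(15 - j) mod 2^32; the vector code keeps these in kHashMul0..3.
  uint32_t pow33[16];
  pow33[15] = 1u;
  for (int j = 14; j >= 0; --j) pow33[j] = pow33[j + 1] * 33u;
  const uint32_t k33p16 = pow33[0] * 33u;    // 33^16, i.e. the kHash16x33 constant.
  uint32_t hash = seed;
  for (int i = 0; i + 16 <= count; i += 16) {
    hash *= k33p16;                          // advance the running hash 16 steps.
    for (int j = 0; j < 16; ++j) hash += src[i + j] * pow33[j];
  }
  return hash;                               // tail bytes omitted for brevity.
}

int main() {
  uint8_t buf[64];
  for (int i = 0; i < 64; ++i) buf[i] = static_cast<uint8_t>(i * 7 + 3);
  std::printf("%u %u\n",
              static_cast<unsigned>(HashDjb2Scalar(buf, 64, 5381u)),
              static_cast<unsigned>(HashDjb2Block16(buf, 64, 5381u)));
  return 0;
}

Both calls print the same value, which is why the runtime dispatch in compare.cc can swap HashDjb2_C for HashDjb2_SSE41 or HashDjb2_AVX2 without changing results.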
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count) {
+  uint32 sse = 0u;
+  for (int i = 0; i < count; ++i) {
+    int diff = src_a[i] - src_b[i];
+    sse += static_cast<uint32>(diff * diff);
+  }
+  return sse;
+}
+
+// hash seed of 5381 recommended.
+// Internal C version of HashDjb2 with int sized count for efficiency.
+uint32 HashDjb2_C(const uint8* src, int count, uint32 seed) {
+  uint32 hash = seed;
+  for (int i = 0; i < count; ++i) {
+    hash += (hash << 5) + src[i];
+  }
+  return hash;
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/chromium/third_party/libyuv/source/compare_neon.cc b/chromium/third_party/libyuv/source/compare_neon.cc
index a4e77750631..c377c163474 100644
--- a/chromium/third_party/libyuv/source/compare_neon.cc
+++ b/chromium/third_party/libyuv/source/compare_neon.cc
@@ -1,61 +1,61 @@
-/*
- * Copyright 2012 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/basic_types.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)
-
-uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
- volatile uint32 sse;
- asm volatile (
- "vmov.u8 q8, #0 \n"
- "vmov.u8 q10, #0 \n"
- "vmov.u8 q9, #0 \n"
- "vmov.u8 q11, #0 \n"
-
- ".p2align 2 \n"
- "1: \n"
- "vld1.8 {q0}, [%0]! \n"
- "vld1.8 {q1}, [%1]! \n"
- "subs %2, %2, #16 \n"
- "vsubl.u8 q2, d0, d2 \n"
- "vsubl.u8 q3, d1, d3 \n"
- "vmlal.s16 q8, d4, d4 \n"
- "vmlal.s16 q9, d6, d6 \n"
- "vmlal.s16 q10, d5, d5 \n"
- "vmlal.s16 q11, d7, d7 \n"
- "bgt 1b \n"
-
- "vadd.u32 q8, q8, q9 \n"
- "vadd.u32 q10, q10, q11 \n"
- "vadd.u32 q11, q8, q10 \n"
- "vpaddl.u32 q1, q11 \n"
- "vadd.u64 d0, d2, d3 \n"
- "vmov.32 %3, d0[0] \n"
- : "+r"(src_a),
- "+r"(src_b),
- "+r"(count),
- "=r"(sse)
- :
- : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
- return sse;
-}
-
-#endif // __ARM_NEON__
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
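Every SumSquareError variant (the C version above, this NEON kernel, and the SSE2/AVX2 kernels below) performs the same reduction: the sum of squared byte differences between two buffers, which libyuv's MSE/PSNR helpers then normalize. A standalone sketch, not libyuv code; the 255 peak value and the PSNR formula are the usual 8-bit convention and are not part of this diff.

// Standalone sketch (not libyuv code): the reduction the SumSquareError
// kernels compute, and how a mean-squared-error / PSNR figure follows from it.
#include <cmath>
#include <cstdint>
#include <cstdio>

static uint64_t SumSquareError(const uint8_t* a, const uint8_t* b, int count) {
  uint64_t sse = 0;
  for (int i = 0; i < count; ++i) {
    const int diff = a[i] - b[i];
    sse += static_cast<uint64_t>(diff * diff);
  }
  return sse;
}

int main() {
  uint8_t x[256], y[256];
  for (int i = 0; i < 256; ++i) {
    x[i] = static_cast<uint8_t>(i);
    y[i] = static_cast<uint8_t>(i ^ 1);            // differ by +/-1 everywhere.
  }
  const uint64_t sse = SumSquareError(x, y, 256);
  const double mse = static_cast<double>(sse) / 256.0;
  const double psnr = 10.0 * std::log10(255.0 * 255.0 / mse);  // dB, mse > 0 here.
  std::printf("sse=%llu psnr=%.2f dB\n",
              static_cast<unsigned long long>(sse), psnr);
  return 0;
}

Note that the NEON and SSE2 kernels consume 16 bytes per iteration (32 for AVX2), so callers pass counts that are multiples of the vector width and let the dispatch code fall back to the C loop otherwise.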
+/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/basic_types.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) + +uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) { + volatile uint32 sse; + asm volatile ( + "vmov.u8 q8, #0 \n" + "vmov.u8 q10, #0 \n" + "vmov.u8 q9, #0 \n" + "vmov.u8 q11, #0 \n" + + ".p2align 2 \n" + "1: \n" + "vld1.8 {q0}, [%0]! \n" + "vld1.8 {q1}, [%1]! \n" + "subs %2, %2, #16 \n" + "vsubl.u8 q2, d0, d2 \n" + "vsubl.u8 q3, d1, d3 \n" + "vmlal.s16 q8, d4, d4 \n" + "vmlal.s16 q9, d6, d6 \n" + "vmlal.s16 q10, d5, d5 \n" + "vmlal.s16 q11, d7, d7 \n" + "bgt 1b \n" + + "vadd.u32 q8, q8, q9 \n" + "vadd.u32 q10, q10, q11 \n" + "vadd.u32 q11, q8, q10 \n" + "vpaddl.u32 q1, q11 \n" + "vadd.u64 d0, d2, d3 \n" + "vmov.32 %3, d0[0] \n" + : "+r"(src_a), + "+r"(src_b), + "+r"(count), + "=r"(sse) + : + : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"); + return sse; +} + +#endif // __ARM_NEON__ + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/chromium/third_party/libyuv/source/compare_posix.cc b/chromium/third_party/libyuv/source/compare_posix.cc index f24835d7714..1e0ba8fe156 100644 --- a/chromium/third_party/libyuv/source/compare_posix.cc +++ b/chromium/third_party/libyuv/source/compare_posix.cc @@ -1,164 +1,166 @@ -/*
- * Copyright 2012 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/basic_types.h"
-#include "libyuv/row.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
-
-uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
- uint32 sse;
- asm volatile (
- "pxor %%xmm0,%%xmm0 \n"
- "pxor %%xmm5,%%xmm5 \n"
- "sub %0,%1 \n"
- ".p2align 4 \n"
- "1: \n"
- "movdqa (%0),%%xmm1 \n"
- "movdqa (%0,%1,1),%%xmm2 \n"
- "lea 0x10(%0),%0 \n"
- "sub $0x10,%2 \n"
- "movdqa %%xmm1,%%xmm3 \n"
- "psubusb %%xmm2,%%xmm1 \n"
- "psubusb %%xmm3,%%xmm2 \n"
- "por %%xmm2,%%xmm1 \n"
- "movdqa %%xmm1,%%xmm2 \n"
- "punpcklbw %%xmm5,%%xmm1 \n"
- "punpckhbw %%xmm5,%%xmm2 \n"
- "pmaddwd %%xmm1,%%xmm1 \n"
- "pmaddwd %%xmm2,%%xmm2 \n"
- "paddd %%xmm1,%%xmm0 \n"
- "paddd %%xmm2,%%xmm0 \n"
- "jg 1b \n"
-
- "pshufd $0xee,%%xmm0,%%xmm1 \n"
- "paddd %%xmm1,%%xmm0 \n"
- "pshufd $0x1,%%xmm0,%%xmm1 \n"
- "paddd %%xmm1,%%xmm0 \n"
- "movd %%xmm0,%3 \n"
-
- : "+r"(src_a), // %0
- "+r"(src_b), // %1
- "+r"(count), // %2
- "=g"(sse) // %3
- :
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-#endif
- );
- return sse;
-}
-
-#endif // defined(__x86_64__) || defined(__i386__)
-
-#if !defined(LIBYUV_DISABLE_X86) && \
- (defined(__x86_64__) || (defined(__i386__) && !defined(__pic__)))
-// GCC 4.2 on OSX has link error when passing static or const to inline.
-// TODO(fbarchard): Use static const when gcc 4.2 support is dropped.
-#ifdef __APPLE__
-#define CONST
-#else
-#define CONST static const
-#endif
-#define HAS_HASHDJB2_SSE41
-CONST uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 }; // 33 ^ 16
-CONST uvec32 kHashMul0 = {
- 0x0c3525e1, // 33 ^ 15
- 0xa3476dc1, // 33 ^ 14
- 0x3b4039a1, // 33 ^ 13
- 0x4f5f0981, // 33 ^ 12
-};
-CONST uvec32 kHashMul1 = {
- 0x30f35d61, // 33 ^ 11
- 0x855cb541, // 33 ^ 10
- 0x040a9121, // 33 ^ 9
- 0x747c7101, // 33 ^ 8
-};
-CONST uvec32 kHashMul2 = {
- 0xec41d4e1, // 33 ^ 7
- 0x4cfa3cc1, // 33 ^ 6
- 0x025528a1, // 33 ^ 5
- 0x00121881, // 33 ^ 4
-};
-CONST uvec32 kHashMul3 = {
- 0x00008c61, // 33 ^ 3
- 0x00000441, // 33 ^ 2
- 0x00000021, // 33 ^ 1
- 0x00000001, // 33 ^ 0
-};
-
-uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
- uint32 hash;
- asm volatile (
- "movd %2,%%xmm0 \n"
- "pxor %%xmm7,%%xmm7 \n"
- "movdqa %4,%%xmm6 \n"
- ".p2align 4 \n"
- "1: \n"
- "movdqu (%0),%%xmm1 \n"
- "lea 0x10(%0),%0 \n"
- "pmulld %%xmm6,%%xmm0 \n"
- "movdqa %5,%%xmm5 \n"
- "movdqa %%xmm1,%%xmm2 \n"
- "punpcklbw %%xmm7,%%xmm2 \n"
- "movdqa %%xmm2,%%xmm3 \n"
- "punpcklwd %%xmm7,%%xmm3 \n"
- "pmulld %%xmm5,%%xmm3 \n"
- "movdqa %6,%%xmm5 \n"
- "movdqa %%xmm2,%%xmm4 \n"
- "punpckhwd %%xmm7,%%xmm4 \n"
- "pmulld %%xmm5,%%xmm4 \n"
- "movdqa %7,%%xmm5 \n"
- "punpckhbw %%xmm7,%%xmm1 \n"
- "movdqa %%xmm1,%%xmm2 \n"
- "punpcklwd %%xmm7,%%xmm2 \n"
- "pmulld %%xmm5,%%xmm2 \n"
- "movdqa %8,%%xmm5 \n"
- "punpckhwd %%xmm7,%%xmm1 \n"
- "pmulld %%xmm5,%%xmm1 \n"
- "paddd %%xmm4,%%xmm3 \n"
- "paddd %%xmm2,%%xmm1 \n"
- "sub $0x10,%1 \n"
- "paddd %%xmm3,%%xmm1 \n"
- "pshufd $0xe,%%xmm1,%%xmm2 \n"
- "paddd %%xmm2,%%xmm1 \n"
- "pshufd $0x1,%%xmm1,%%xmm2 \n"
- "paddd %%xmm2,%%xmm1 \n"
- "paddd %%xmm1,%%xmm0 \n"
- "jg 1b \n"
- "movd %%xmm0,%3 \n"
- : "+r"(src), // %0
- "+r"(count), // %1
- "+rm"(seed), // %2
- "=g"(hash) // %3
- : "m"(kHash16x33), // %4
- "m"(kHashMul0), // %5
- "m"(kHashMul1), // %6
- "m"(kHashMul2), // %7
- "m"(kHashMul3) // %8
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-#endif
- );
- return hash;
-}
-#endif // defined(__x86_64__) || (defined(__i386__) && !defined(__pic__)))
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
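The rewritten compare_posix.cc in the hunk that follows introduces MEMACCESS and MEMLEA string-paste macros so the same inline-asm templates can emit Native Client's sandboxed addressing on x86-64 NaCl builds and plain addressing everywhere else. A standalone sketch, not libyuv code, that only prints what the assembled operand strings look like; the macro bodies are copied from the hunk below.

// Standalone sketch (not libyuv code): what the MEMACCESS/MEMLEA macros paste
// into the asm templates, e.g. "movdqa " MEMACCESS(0) ",%%xmm1 \n".
#include <cstdio>

#if defined(__native_client__) && defined(__x86_64__)
#define MEMACCESS(base) "%%nacl:(%%r15,%q" #base ")"
#define MEMLEA(offset, base) #offset "(%q" #base ")"
#else
#define MEMACCESS(base) "(%" #base ")"
#define MEMLEA(offset, base) #offset "(%" #base ")"
#endif

int main() {
  std::printf("MEMACCESS(0)    -> \"%s\"\n", MEMACCESS(0));
  std::printf("MEMLEA(0x10, 0) -> \"%s\"\n", MEMLEA(0x10, 0));
  return 0;
}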
+/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/basic_types.h" +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) + +#if defined(__native_client__) && defined(__x86_64__) +#define MEMACCESS(base) "%%nacl:(%%r15,%q" #base ")" +#define MEMLEA(offset, base) #offset "(%q" #base ")" +#else +#define MEMACCESS(base) "(%" #base ")" +#define MEMLEA(offset, base) #offset "(%" #base ")" +#endif + +uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) { + uint32 sse; + asm volatile ( // NOLINT + "pxor %%xmm0,%%xmm0 \n" + "pxor %%xmm5,%%xmm5 \n" + ".p2align 2 \n" + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm1 \n" + "lea " MEMLEA(0x10, 0) ",%0 \n" + "movdqa " MEMACCESS(1) ",%%xmm2 \n" + "lea " MEMLEA(0x10, 1) ",%1 \n" + "sub $0x10,%2 \n" + "movdqa %%xmm1,%%xmm3 \n" + "psubusb %%xmm2,%%xmm1 \n" + "psubusb %%xmm3,%%xmm2 \n" + "por %%xmm2,%%xmm1 \n" + "movdqa %%xmm1,%%xmm2 \n" + "punpcklbw %%xmm5,%%xmm1 \n" + "punpckhbw %%xmm5,%%xmm2 \n" + "pmaddwd %%xmm1,%%xmm1 \n" + "pmaddwd %%xmm2,%%xmm2 \n" + "paddd %%xmm1,%%xmm0 \n" + "paddd %%xmm2,%%xmm0 \n" + "jg 1b \n" + + "pshufd $0xee,%%xmm0,%%xmm1 \n" + "paddd %%xmm1,%%xmm0 \n" + "pshufd $0x1,%%xmm0,%%xmm1 \n" + "paddd %%xmm1,%%xmm0 \n" + "movd %%xmm0,%3 \n" + + : "+r"(src_a), // %0 + "+r"(src_b), // %1 + "+r"(count), // %2 + "=g"(sse) // %3 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" +#endif + ); // NOLINT + return sse; +} + +#endif // defined(__x86_64__) || defined(__i386__) + +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(__x86_64__) || (defined(__i386__) && !defined(__pic__))) +#define HAS_HASHDJB2_SSE41 +static uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 }; // 33 ^ 16 +static uvec32 kHashMul0 = { + 0x0c3525e1, // 33 ^ 15 + 0xa3476dc1, // 33 ^ 14 + 0x3b4039a1, // 33 ^ 13 + 0x4f5f0981, // 33 ^ 12 +}; +static uvec32 kHashMul1 = { + 0x30f35d61, // 33 ^ 11 + 0x855cb541, // 33 ^ 10 + 0x040a9121, // 33 ^ 9 + 0x747c7101, // 33 ^ 8 +}; +static uvec32 kHashMul2 = { + 0xec41d4e1, // 33 ^ 7 + 0x4cfa3cc1, // 33 ^ 6 + 0x025528a1, // 33 ^ 5 + 0x00121881, // 33 ^ 4 +}; +static uvec32 kHashMul3 = { + 0x00008c61, // 33 ^ 3 + 0x00000441, // 33 ^ 2 + 0x00000021, // 33 ^ 1 + 0x00000001, // 33 ^ 0 +}; + +uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) { + uint32 hash; + asm volatile ( // NOLINT + "movd %2,%%xmm0 \n" + "pxor %%xmm7,%%xmm7 \n" + "movdqa %4,%%xmm6 \n" + ".p2align 2 \n" + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm1 \n" + "lea " MEMLEA(0x10, 0) ",%0 \n" + "pmulld %%xmm6,%%xmm0 \n" + "movdqa %5,%%xmm5 \n" + "movdqa %%xmm1,%%xmm2 \n" + "punpcklbw %%xmm7,%%xmm2 \n" + "movdqa %%xmm2,%%xmm3 \n" + "punpcklwd %%xmm7,%%xmm3 \n" + "pmulld %%xmm5,%%xmm3 \n" + "movdqa %6,%%xmm5 \n" + "movdqa %%xmm2,%%xmm4 \n" + "punpckhwd %%xmm7,%%xmm4 \n" + "pmulld %%xmm5,%%xmm4 \n" + "movdqa %7,%%xmm5 \n" + "punpckhbw %%xmm7,%%xmm1 \n" + "movdqa %%xmm1,%%xmm2 \n" + "punpcklwd %%xmm7,%%xmm2 \n" + "pmulld %%xmm5,%%xmm2 \n" + "movdqa %8,%%xmm5 \n" + "punpckhwd %%xmm7,%%xmm1 \n" + "pmulld %%xmm5,%%xmm1 \n" + "paddd 
%%xmm4,%%xmm3 \n" + "paddd %%xmm2,%%xmm1 \n" + "sub $0x10,%1 \n" + "paddd %%xmm3,%%xmm1 \n" + "pshufd $0xe,%%xmm1,%%xmm2 \n" + "paddd %%xmm2,%%xmm1 \n" + "pshufd $0x1,%%xmm1,%%xmm2 \n" + "paddd %%xmm2,%%xmm1 \n" + "paddd %%xmm1,%%xmm0 \n" + "jg 1b \n" + "movd %%xmm0,%3 \n" + : "+r"(src), // %0 + "+r"(count), // %1 + "+rm"(seed), // %2 + "=g"(hash) // %3 + : "m"(kHash16x33), // %4 + "m"(kHashMul0), // %5 + "m"(kHashMul1), // %6 + "m"(kHashMul2), // %7 + "m"(kHashMul3) // %8 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" +#endif + ); // NOLINT + return hash; +} +#endif // defined(__x86_64__) || (defined(__i386__) && !defined(__pic__))) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + diff --git a/chromium/third_party/libyuv/source/compare_win.cc b/chromium/third_party/libyuv/source/compare_win.cc index e576e85c192..99831651f5f 100644 --- a/chromium/third_party/libyuv/source/compare_win.cc +++ b/chromium/third_party/libyuv/source/compare_win.cc @@ -1,192 +1,232 @@ -/*
- * Copyright 2012 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/basic_types.h"
-#include "libyuv/row.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
-
-__declspec(naked) __declspec(align(16))
-uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
- __asm {
- mov eax, [esp + 4] // src_a
- mov edx, [esp + 8] // src_b
- mov ecx, [esp + 12] // count
- pxor xmm0, xmm0
- pxor xmm5, xmm5
- sub edx, eax
-
- align 16
- wloop:
- movdqa xmm1, [eax]
- movdqa xmm2, [eax + edx]
- lea eax, [eax + 16]
- sub ecx, 16
- movdqa xmm3, xmm1 // abs trick
- psubusb xmm1, xmm2
- psubusb xmm2, xmm3
- por xmm1, xmm2
- movdqa xmm2, xmm1
- punpcklbw xmm1, xmm5
- punpckhbw xmm2, xmm5
- pmaddwd xmm1, xmm1
- pmaddwd xmm2, xmm2
- paddd xmm0, xmm1
- paddd xmm0, xmm2
- jg wloop
-
- pshufd xmm1, xmm0, 0xee
- paddd xmm0, xmm1
- pshufd xmm1, xmm0, 0x01
- paddd xmm0, xmm1
- movd eax, xmm0
- ret
- }
-}
-
-// Visual C 2012 required for AVX2.
-#if _MSC_VER >= 1700
-// C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX.
-#pragma warning(disable: 4752)
-__declspec(naked) __declspec(align(16))
-uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) {
- __asm {
- mov eax, [esp + 4] // src_a
- mov edx, [esp + 8] // src_b
- mov ecx, [esp + 12] // count
- vpxor ymm0, ymm0, ymm0 // sum
- vpxor ymm5, ymm5, ymm5 // constant 0 for unpck
- sub edx, eax
-
- align 16
- wloop:
- vmovdqu ymm1, [eax]
- vmovdqu ymm2, [eax + edx]
- lea eax, [eax + 32]
- sub ecx, 32
- vpsubusb ymm3, ymm1, ymm2 // abs difference trick
- vpsubusb ymm2, ymm2, ymm1
- vpor ymm1, ymm2, ymm3
- vpunpcklbw ymm2, ymm1, ymm5 // u16. mutates order.
- vpunpckhbw ymm1, ymm1, ymm5
- vpmaddwd ymm2, ymm2, ymm2 // square + hadd to u32.
- vpmaddwd ymm1, ymm1, ymm1
- vpaddd ymm0, ymm0, ymm1
- vpaddd ymm0, ymm0, ymm2
- jg wloop
-
- vpshufd ymm1, ymm0, 0xee // 3, 2 + 1, 0 both lanes.
- vpaddd ymm0, ymm0, ymm1
- vpshufd ymm1, ymm0, 0x01 // 1 + 0 both lanes.
- vpaddd ymm0, ymm0, ymm1
- vpermq ymm1, ymm0, 0x02 // high + low lane.
- vpaddd ymm0, ymm0, ymm1
- vmovd eax, xmm0
- vzeroupper
- ret
- }
-}
-#endif // _MSC_VER >= 1700
-
-#define HAS_HASHDJB2_SSE41
-static const uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 }; // 33 ^ 16
-static const uvec32 kHashMul0 = {
- 0x0c3525e1, // 33 ^ 15
- 0xa3476dc1, // 33 ^ 14
- 0x3b4039a1, // 33 ^ 13
- 0x4f5f0981, // 33 ^ 12
-};
-static const uvec32 kHashMul1 = {
- 0x30f35d61, // 33 ^ 11
- 0x855cb541, // 33 ^ 10
- 0x040a9121, // 33 ^ 9
- 0x747c7101, // 33 ^ 8
-};
-static const uvec32 kHashMul2 = {
- 0xec41d4e1, // 33 ^ 7
- 0x4cfa3cc1, // 33 ^ 6
- 0x025528a1, // 33 ^ 5
- 0x00121881, // 33 ^ 4
-};
-static const uvec32 kHashMul3 = {
- 0x00008c61, // 33 ^ 3
- 0x00000441, // 33 ^ 2
- 0x00000021, // 33 ^ 1
- 0x00000001, // 33 ^ 0
-};
-
-// 27: 66 0F 38 40 C6 pmulld xmm0,xmm6
-// 44: 66 0F 38 40 DD pmulld xmm3,xmm5
-// 59: 66 0F 38 40 E5 pmulld xmm4,xmm5
-// 72: 66 0F 38 40 D5 pmulld xmm2,xmm5
-// 83: 66 0F 38 40 CD pmulld xmm1,xmm5
-#define pmulld(reg) _asm _emit 0x66 _asm _emit 0x0F _asm _emit 0x38 \
- _asm _emit 0x40 _asm _emit reg
-
-__declspec(naked) __declspec(align(16))
-uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
- __asm {
- mov eax, [esp + 4] // src
- mov ecx, [esp + 8] // count
- movd xmm0, [esp + 12] // seed
-
- pxor xmm7, xmm7 // constant 0 for unpck
- movdqa xmm6, kHash16x33
-
- align 16
- wloop:
- movdqu xmm1, [eax] // src[0-15]
- lea eax, [eax + 16]
- pmulld(0xc6) // pmulld xmm0,xmm6 hash *= 33 ^ 16
- movdqa xmm5, kHashMul0
- movdqa xmm2, xmm1
- punpcklbw xmm2, xmm7 // src[0-7]
- movdqa xmm3, xmm2
- punpcklwd xmm3, xmm7 // src[0-3]
- pmulld(0xdd) // pmulld xmm3, xmm5
- movdqa xmm5, kHashMul1
- movdqa xmm4, xmm2
- punpckhwd xmm4, xmm7 // src[4-7]
- pmulld(0xe5) // pmulld xmm4, xmm5
- movdqa xmm5, kHashMul2
- punpckhbw xmm1, xmm7 // src[8-15]
- movdqa xmm2, xmm1
- punpcklwd xmm2, xmm7 // src[8-11]
- pmulld(0xd5) // pmulld xmm2, xmm5
- movdqa xmm5, kHashMul3
- punpckhwd xmm1, xmm7 // src[12-15]
- pmulld(0xcd) // pmulld xmm1, xmm5
- paddd xmm3, xmm4 // add 16 results
- paddd xmm1, xmm2
- sub ecx, 16
- paddd xmm1, xmm3
-
- pshufd xmm2, xmm1, 0x0e // upper 2 dwords
- paddd xmm1, xmm2
- pshufd xmm2, xmm1, 0x01
- paddd xmm1, xmm2
- paddd xmm0, xmm1
- jg wloop
-
- movd eax, xmm0 // return hash
- ret
- }
-}
-#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
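The SumSquareError_SSE2/AVX2 kernels above compute the per-byte absolute difference with two saturating subtractions OR'd together (psubusb/vpsubusb followed by por, the "abs trick" noted in the comments), because SSE2 has no unsigned byte absolute-difference that feeds pmaddwd directly. A standalone scalar sketch, not libyuv code, of the same identity:

// Standalone sketch (not libyuv code): unsigned saturating subtract clamps at
// zero, so max(a - b, 0) | max(b - a, 0) == |a - b| for bytes.
#include <cstdint>
#include <cstdio>

static uint8_t SatSub(uint8_t a, uint8_t b) {      // what psubusb does per byte.
  return a > b ? static_cast<uint8_t>(a - b) : 0;
}

static uint8_t AbsDiff(uint8_t a, uint8_t b) {
  return static_cast<uint8_t>(SatSub(a, b) | SatSub(b, a));
}

int main() {
  std::printf("%d %d %d\n", AbsDiff(10, 250), AbsDiff(250, 10), AbsDiff(7, 7));  // 240 240 0
  return 0;
}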
+/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/basic_types.h" +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) + +__declspec(naked) __declspec(align(16)) +uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) { + __asm { + mov eax, [esp + 4] // src_a + mov edx, [esp + 8] // src_b + mov ecx, [esp + 12] // count + pxor xmm0, xmm0 + pxor xmm5, xmm5 + + align 4 + wloop: + movdqa xmm1, [eax] + lea eax, [eax + 16] + movdqa xmm2, [edx] + lea edx, [edx + 16] + sub ecx, 16 + movdqa xmm3, xmm1 // abs trick + psubusb xmm1, xmm2 + psubusb xmm2, xmm3 + por xmm1, xmm2 + movdqa xmm2, xmm1 + punpcklbw xmm1, xmm5 + punpckhbw xmm2, xmm5 + pmaddwd xmm1, xmm1 + pmaddwd xmm2, xmm2 + paddd xmm0, xmm1 + paddd xmm0, xmm2 + jg wloop + + pshufd xmm1, xmm0, 0xee + paddd xmm0, xmm1 + pshufd xmm1, xmm0, 0x01 + paddd xmm0, xmm1 + movd eax, xmm0 + ret + } +} + +// Visual C 2012 required for AVX2. +#if _MSC_VER >= 1700 +// C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX. +#pragma warning(disable: 4752) +__declspec(naked) __declspec(align(16)) +uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) { + __asm { + mov eax, [esp + 4] // src_a + mov edx, [esp + 8] // src_b + mov ecx, [esp + 12] // count + vpxor ymm0, ymm0, ymm0 // sum + vpxor ymm5, ymm5, ymm5 // constant 0 for unpck + sub edx, eax + + align 4 + wloop: + vmovdqu ymm1, [eax] + vmovdqu ymm2, [eax + edx] + lea eax, [eax + 32] + sub ecx, 32 + vpsubusb ymm3, ymm1, ymm2 // abs difference trick + vpsubusb ymm2, ymm2, ymm1 + vpor ymm1, ymm2, ymm3 + vpunpcklbw ymm2, ymm1, ymm5 // u16. mutates order. + vpunpckhbw ymm1, ymm1, ymm5 + vpmaddwd ymm2, ymm2, ymm2 // square + hadd to u32. + vpmaddwd ymm1, ymm1, ymm1 + vpaddd ymm0, ymm0, ymm1 + vpaddd ymm0, ymm0, ymm2 + jg wloop + + vpshufd ymm1, ymm0, 0xee // 3, 2 + 1, 0 both lanes. + vpaddd ymm0, ymm0, ymm1 + vpshufd ymm1, ymm0, 0x01 // 1 + 0 both lanes. + vpaddd ymm0, ymm0, ymm1 + vpermq ymm1, ymm0, 0x02 // high + low lane. 
+ vpaddd ymm0, ymm0, ymm1 + vmovd eax, xmm0 + vzeroupper + ret + } +} +#endif // _MSC_VER >= 1700 + +#define HAS_HASHDJB2_SSE41 +static uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 }; // 33 ^ 16 +static uvec32 kHashMul0 = { + 0x0c3525e1, // 33 ^ 15 + 0xa3476dc1, // 33 ^ 14 + 0x3b4039a1, // 33 ^ 13 + 0x4f5f0981, // 33 ^ 12 +}; +static uvec32 kHashMul1 = { + 0x30f35d61, // 33 ^ 11 + 0x855cb541, // 33 ^ 10 + 0x040a9121, // 33 ^ 9 + 0x747c7101, // 33 ^ 8 +}; +static uvec32 kHashMul2 = { + 0xec41d4e1, // 33 ^ 7 + 0x4cfa3cc1, // 33 ^ 6 + 0x025528a1, // 33 ^ 5 + 0x00121881, // 33 ^ 4 +}; +static uvec32 kHashMul3 = { + 0x00008c61, // 33 ^ 3 + 0x00000441, // 33 ^ 2 + 0x00000021, // 33 ^ 1 + 0x00000001, // 33 ^ 0 +}; + +// 27: 66 0F 38 40 C6 pmulld xmm0,xmm6 +// 44: 66 0F 38 40 DD pmulld xmm3,xmm5 +// 59: 66 0F 38 40 E5 pmulld xmm4,xmm5 +// 72: 66 0F 38 40 D5 pmulld xmm2,xmm5 +// 83: 66 0F 38 40 CD pmulld xmm1,xmm5 +#define pmulld(reg) _asm _emit 0x66 _asm _emit 0x0F _asm _emit 0x38 \ + _asm _emit 0x40 _asm _emit reg + +__declspec(naked) __declspec(align(16)) +uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) { + __asm { + mov eax, [esp + 4] // src + mov ecx, [esp + 8] // count + movd xmm0, [esp + 12] // seed + + pxor xmm7, xmm7 // constant 0 for unpck + movdqa xmm6, kHash16x33 + + align 4 + wloop: + movdqu xmm1, [eax] // src[0-15] + lea eax, [eax + 16] + pmulld(0xc6) // pmulld xmm0,xmm6 hash *= 33 ^ 16 + movdqa xmm5, kHashMul0 + movdqa xmm2, xmm1 + punpcklbw xmm2, xmm7 // src[0-7] + movdqa xmm3, xmm2 + punpcklwd xmm3, xmm7 // src[0-3] + pmulld(0xdd) // pmulld xmm3, xmm5 + movdqa xmm5, kHashMul1 + movdqa xmm4, xmm2 + punpckhwd xmm4, xmm7 // src[4-7] + pmulld(0xe5) // pmulld xmm4, xmm5 + movdqa xmm5, kHashMul2 + punpckhbw xmm1, xmm7 // src[8-15] + movdqa xmm2, xmm1 + punpcklwd xmm2, xmm7 // src[8-11] + pmulld(0xd5) // pmulld xmm2, xmm5 + movdqa xmm5, kHashMul3 + punpckhwd xmm1, xmm7 // src[12-15] + pmulld(0xcd) // pmulld xmm1, xmm5 + paddd xmm3, xmm4 // add 16 results + paddd xmm1, xmm2 + sub ecx, 16 + paddd xmm1, xmm3 + + pshufd xmm2, xmm1, 0x0e // upper 2 dwords + paddd xmm1, xmm2 + pshufd xmm2, xmm1, 0x01 + paddd xmm1, xmm2 + paddd xmm0, xmm1 + jg wloop + + movd eax, xmm0 // return hash + ret + } +} + +// Visual C 2012 required for AVX2. 
+#if _MSC_VER >= 1700 +__declspec(naked) __declspec(align(16)) +uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) { + __asm { + mov eax, [esp + 4] // src + mov ecx, [esp + 8] // count + movd xmm0, [esp + 12] // seed + movdqa xmm6, kHash16x33 + + align 4 + wloop: + vpmovzxbd xmm3, dword ptr [eax] // src[0-3] + pmulld xmm0, xmm6 // hash *= 33 ^ 16 + vpmovzxbd xmm4, dword ptr [eax + 4] // src[4-7] + pmulld xmm3, kHashMul0 + vpmovzxbd xmm2, dword ptr [eax + 8] // src[8-11] + pmulld xmm4, kHashMul1 + vpmovzxbd xmm1, dword ptr [eax + 12] // src[12-15] + pmulld xmm2, kHashMul2 + lea eax, [eax + 16] + pmulld xmm1, kHashMul3 + paddd xmm3, xmm4 // add 16 results + paddd xmm1, xmm2 + sub ecx, 16 + paddd xmm1, xmm3 + pshufd xmm2, xmm1, 0x0e // upper 2 dwords + paddd xmm1, xmm2 + pshufd xmm2, xmm1, 0x01 + paddd xmm1, xmm2 + paddd xmm0, xmm1 + jg wloop + + movd eax, xmm0 // return hash + ret + } +} +#endif // _MSC_VER >= 1700 + +#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/chromium/third_party/libyuv/source/convert.cc b/chromium/third_party/libyuv/source/convert.cc index 980df7edd5e..9ec71058ce9 100644 --- a/chromium/third_party/libyuv/source/convert.cc +++ b/chromium/third_party/libyuv/source/convert.cc @@ -22,7 +22,43 @@ namespace libyuv { extern "C" { #endif +#define SUBSAMPLE(v, a, s) (v < 0) ? (-((-v + a) >> s)) : ((v + a) >> s) +static __inline int Abs(int v) { + return v >= 0 ? v : -v; +} + +// Any I4xx To I420 format with mirroring. +static int I4xxToI420(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int src_y_width, int src_y_height, + int src_uv_width, int src_uv_height) { + if (src_y_width == 0 || src_y_height == 0 || + src_uv_width == 0 || src_uv_height == 0) { + return -1; + } + const int dst_y_width = Abs(src_y_width); + const int dst_y_height = Abs(src_y_height); + const int dst_uv_width = SUBSAMPLE(dst_y_width, 1, 1); + const int dst_uv_height = SUBSAMPLE(dst_y_height, 1, 1); + ScalePlane(src_y, src_stride_y, src_y_width, src_y_height, + dst_y, dst_stride_y, dst_y_width, dst_y_height, + kFilterBilinear); + ScalePlane(src_u, src_stride_u, src_uv_width, src_uv_height, + dst_u, dst_stride_u, dst_uv_width, dst_uv_height, + kFilterBilinear); + ScalePlane(src_v, src_stride_v, src_uv_width, src_uv_height, + dst_v, dst_stride_v, dst_uv_width, dst_uv_height, + kFilterBilinear); + return 0; +} + // Copy I420 with optional flipping +// TODO(fbarchard): Use Scale plane which supports mirroring, but ensure +// is does row coalescing. LIBYUV_API int I420Copy(const uint8* src_y, int src_stride_y, const uint8* src_u, int src_stride_u, @@ -39,7 +75,7 @@ int I420Copy(const uint8* src_y, int src_stride_y, // Negative height means invert the image. if (height < 0) { height = -height; - int halfheight = (height + 1) >> 1; + const int halfheight = (height + 1) >> 1; src_y = src_y + (height - 1) * src_stride_y; src_u = src_u + (halfheight - 1) * src_stride_u; src_v = src_v + (halfheight - 1) * src_stride_v; @@ -48,16 +84,19 @@ int I420Copy(const uint8* src_y, int src_stride_y, src_stride_v = -src_stride_v; } - int halfwidth = (width + 1) >> 1; - int halfheight = (height + 1) >> 1; if (dst_y) { CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); } + // Copy UV planes. 
+ const int halfwidth = (width + 1) >> 1; + const int halfheight = (height + 1) >> 1; CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight); CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight); return 0; } +// 422 chroma is 1/2 width, 1x height +// 420 chroma is 1/2 width, 1/2 height LIBYUV_API int I422ToI420(const uint8* src_y, int src_stride_y, const uint8* src_u, int src_stride_u, @@ -66,88 +105,19 @@ int I422ToI420(const uint8* src_y, int src_stride_y, uint8* dst_u, int dst_stride_u, uint8* dst_v, int dst_stride_v, int width, int height) { - if (!src_y || !src_u || !src_v || - !dst_y || !dst_u || !dst_v || - width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_y = src_y + (height - 1) * src_stride_y; - src_u = src_u + (height - 1) * src_stride_u; - src_v = src_v + (height - 1) * src_stride_v; - src_stride_y = -src_stride_y; - src_stride_u = -src_stride_u; - src_stride_v = -src_stride_v; - } - int halfwidth = (width + 1) >> 1; - void (*HalfRow)(const uint8* src_uv, int src_uv_stride, - uint8* dst_uv, int pix) = HalfRow_C; -#if defined(HAS_HALFROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(halfwidth, 16) && - IS_ALIGNED(src_u, 16) && IS_ALIGNED(src_stride_u, 16) && - IS_ALIGNED(src_v, 16) && IS_ALIGNED(src_stride_v, 16) && - IS_ALIGNED(dst_u, 16) && IS_ALIGNED(dst_stride_u, 16) && - IS_ALIGNED(dst_v, 16) && IS_ALIGNED(dst_stride_v, 16)) { - HalfRow = HalfRow_SSE2; - } -#endif -#if defined(HAS_HALFROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(halfwidth, 32)) { - HalfRow = HalfRow_AVX2; - } -#endif -#if defined(HAS_HALFROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(halfwidth, 16)) { - HalfRow = HalfRow_NEON; - } -#endif - - // Copy Y plane - if (dst_y) { - CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); - } - - // SubSample U plane. - int y; - for (y = 0; y < height - 1; y += 2) { - HalfRow(src_u, src_stride_u, dst_u, halfwidth); - src_u += src_stride_u * 2; - dst_u += dst_stride_u; - } - if (height & 1) { - HalfRow(src_u, 0, dst_u, halfwidth); - } - - // SubSample V plane. 
- for (y = 0; y < height - 1; y += 2) { - HalfRow(src_v, src_stride_v, dst_v, halfwidth); - src_v += src_stride_v * 2; - dst_v += dst_stride_v; - } - if (height & 1) { - HalfRow(src_v, 0, dst_v, halfwidth); - } - return 0; + const int src_uv_width = SUBSAMPLE(width, 1, 1); + return I4xxToI420(src_y, src_stride_y, + src_u, src_stride_u, + src_v, src_stride_v, + dst_y, dst_stride_y, + dst_u, dst_stride_u, + dst_v, dst_stride_v, + width, height, + src_uv_width, height); } -// Blends 32x2 pixels to 16x1 -// source in scale.cc -#if !defined(LIBYUV_DISABLE_NEON) && \ - (defined(__ARM_NEON__) || defined(LIBYUV_NEON)) -#define HAS_SCALEROWDOWN2_NEON -void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); -#elif !defined(LIBYUV_DISABLE_X86) && \ - (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) - -void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -#endif -void ScaleRowDown2Box_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); - +// 444 chroma is 1x width, 1x height +// 420 chroma is 1/2 width, 1/2 height LIBYUV_API int I444ToI420(const uint8* src_y, int src_stride_y, const uint8* src_u, int src_stride_u, @@ -156,69 +126,16 @@ int I444ToI420(const uint8* src_y, int src_stride_y, uint8* dst_u, int dst_stride_u, uint8* dst_v, int dst_stride_v, int width, int height) { - if (!src_y || !src_u || !src_v || - !dst_y || !dst_u || !dst_v || - width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_y = src_y + (height - 1) * src_stride_y; - src_u = src_u + (height - 1) * src_stride_u; - src_v = src_v + (height - 1) * src_stride_v; - src_stride_y = -src_stride_y; - src_stride_u = -src_stride_u; - src_stride_v = -src_stride_v; - } - int halfwidth = (width + 1) >> 1; - void (*ScaleRowDown2)(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) = ScaleRowDown2Box_C; -#if defined(HAS_SCALEROWDOWN2_NEON) - if (TestCpuFlag(kCpuHasNEON) && - IS_ALIGNED(halfwidth, 16)) { - ScaleRowDown2 = ScaleRowDown2Box_NEON; - } -#elif defined(HAS_SCALEROWDOWN2_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && - IS_ALIGNED(halfwidth, 16) && - IS_ALIGNED(src_u, 16) && IS_ALIGNED(src_stride_u, 16) && - IS_ALIGNED(src_v, 16) && IS_ALIGNED(src_stride_v, 16) && - IS_ALIGNED(dst_u, 16) && IS_ALIGNED(dst_stride_u, 16) && - IS_ALIGNED(dst_v, 16) && IS_ALIGNED(dst_stride_v, 16)) { - ScaleRowDown2 = ScaleRowDown2Box_SSE2; - } -#endif - - // Copy Y plane - if (dst_y) { - CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); - } - - // SubSample U plane. - int y; - for (y = 0; y < height - 1; y += 2) { - ScaleRowDown2(src_u, src_stride_u, dst_u, halfwidth); - src_u += src_stride_u * 2; - dst_u += dst_stride_u; - } - if (height & 1) { - ScaleRowDown2(src_u, 0, dst_u, halfwidth); - } - - // SubSample V plane. - for (y = 0; y < height - 1; y += 2) { - ScaleRowDown2(src_v, src_stride_v, dst_v, halfwidth); - src_v += src_stride_v * 2; - dst_v += dst_stride_v; - } - if (height & 1) { - ScaleRowDown2(src_v, 0, dst_v, halfwidth); - } - return 0; + return I4xxToI420(src_y, src_stride_y, + src_u, src_stride_u, + src_v, src_stride_v, + dst_y, dst_stride_y, + dst_u, dst_stride_u, + dst_v, dst_stride_v, + width, height, + width, height); } -// TODO(fbarchard): Enable bilinear when fast enough or specialized upsampler. 
// 411 chroma is 1/4 width, 1x height // 420 chroma is 1/2 width, 1/2 height LIBYUV_API @@ -229,41 +146,15 @@ int I411ToI420(const uint8* src_y, int src_stride_y, uint8* dst_u, int dst_stride_u, uint8* dst_v, int dst_stride_v, int width, int height) { - if (!src_y || !src_u || !src_v || - !dst_y || !dst_u || !dst_v || - width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_y = src_y + (height - 1) * src_stride_y; - src_u = src_u + (height - 1) * src_stride_u; - src_v = src_v + (height - 1) * src_stride_v; - src_stride_y = -src_stride_y; - src_stride_u = -src_stride_u; - src_stride_v = -src_stride_v; - } - - // Copy Y plane - if (dst_y) { - CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); - } - - int halfwidth = (width + 1) >> 1; - int halfheight = (height + 1) >> 1; - int quarterwidth = (width + 3) >> 2; - - // Resample U plane from 1/4 width, 1x height to 1/2 width, 1/2 height. - ScalePlane(src_u, src_stride_u, quarterwidth, height, - dst_u, dst_stride_u, halfwidth, halfheight, - kFilterNone); - - // Resample V plane. - ScalePlane(src_v, src_stride_v, quarterwidth, height, - dst_v, dst_stride_v, halfwidth, halfheight, - kFilterNone); - return 0; + const int src_uv_width = SUBSAMPLE(width, 3, 2); + return I4xxToI420(src_y, src_stride_y, + src_u, src_stride_u, + src_v, src_stride_v, + dst_y, dst_stride_y, + dst_u, dst_stride_u, + dst_v, dst_stride_v, + width, height, + src_uv_width, height); } // I400 is greyscale typically used in MJPG @@ -309,7 +200,6 @@ static void CopyPlane2(const uint8* src, int src_stride_0, int src_stride_1, } #endif #if defined(HAS_COPYROW_ERMS) - // TODO(fbarchard): Detect Fast String support. if (TestCpuFlag(kCpuHasERMS)) { CopyRow = CopyRow_ERMS; } @@ -369,20 +259,23 @@ static int X420ToI420(const uint8* src_y, dst_stride_u = -dst_stride_u; dst_stride_v = -dst_stride_v; } - // Coalesce contiguous rows. + // Coalesce rows. int halfwidth = (width + 1) >> 1; int halfheight = (height + 1) >> 1; if (src_stride_y0 == width && src_stride_y1 == width && dst_stride_y == width) { - width = width * height; + width *= height; height = 1; + src_stride_y0 = src_stride_y1 = dst_stride_y = 0; } - if (src_stride_uv == width && - dst_stride_u * 2 == width && - dst_stride_v * 2 == width) { - halfwidth = halfwidth * halfheight; + // Coalesce rows. + if (src_stride_uv == halfwidth * 2 && + dst_stride_u == halfwidth && + dst_stride_v == halfwidth) { + halfwidth *= halfheight; halfheight = 1; + src_stride_uv = dst_stride_u = dst_stride_v = 0; } void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) = SplitUVRow_C; @@ -782,7 +675,7 @@ int ARGBToI420(const uint8* src_argb, int src_stride_argb, uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = ARGBToYRow_C; -#if defined(HAS_ARGBTOYROW_SSSE3) +#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { ARGBToUVRow = ARGBToUVRow_Any_SSSE3; ARGBToYRow = ARGBToYRow_Any_SSSE3; @@ -798,7 +691,7 @@ int ARGBToI420(const uint8* src_argb, int src_stride_argb, } } #endif -#if defined(HAS_ARGBTOYROW_AVX2) +#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2) if (TestCpuFlag(kCpuHasAVX2) && width >= 32) { ARGBToUVRow = ARGBToUVRow_Any_AVX2; ARGBToYRow = ARGBToYRow_Any_AVX2; @@ -1044,7 +937,7 @@ int RGBAToI420(const uint8* src_rgba, int src_stride_rgba, } // Convert RGB24 to I420. 
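The I4xxToI420 helper above (and its I420ToI4xx counterpart later in this commit) reduces these format conversions to three ScalePlane calls, with the SUBSAMPLE macro deriving chroma plane sizes: it rounds the magnitude up ((v + add) >> shift) while preserving a negative sign, which libyuv uses to request a vertical flip. A standalone sketch, not libyuv code, of the resulting plane geometry; the sample width and height are arbitrary.

// Standalone sketch (not libyuv code): the SUBSAMPLE rounding used for chroma
// plane dimensions, with a negative height carried through to signal a flip.
#include <cstdio>

static int Subsample(int v, int add, int shift) {
  return (v < 0) ? -((-v + add) >> shift) : (v + add) >> shift;
}

int main() {
  const int width = 175, height = -99;       // negative height means "flip".
  // I420: chroma is 1/2 width, 1/2 height.
  std::printf("I420 chroma: %d x %d\n", Subsample(width, 1, 1), Subsample(height, 1, 1));
  // I422: chroma is 1/2 width, full height.
  std::printf("I422 chroma: %d x %d\n", Subsample(width, 1, 1), height);
  // I411: chroma is 1/4 width, full height.
  std::printf("I411 chroma: %d x %d\n", Subsample(width, 3, 2), height);
  return 0;
}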
-LIBYUV_API +LIBYUV_API SAFEBUFFERS int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24, uint8* dst_y, int dst_stride_y, uint8* dst_u, int dst_stride_u, @@ -1147,7 +1040,7 @@ int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24, } // Convert RAW to I420. -LIBYUV_API +LIBYUV_API SAFEBUFFERS int RAWToI420(const uint8* src_raw, int src_stride_raw, uint8* dst_y, int dst_stride_y, uint8* dst_u, int dst_stride_u, @@ -1250,7 +1143,7 @@ int RAWToI420(const uint8* src_raw, int src_stride_raw, } // Convert RGB565 to I420. -LIBYUV_API +LIBYUV_API SAFEBUFFERS int RGB565ToI420(const uint8* src_rgb565, int src_stride_rgb565, uint8* dst_y, int dst_stride_y, uint8* dst_u, int dst_stride_u, @@ -1353,7 +1246,7 @@ int RGB565ToI420(const uint8* src_rgb565, int src_stride_rgb565, } // Convert ARGB1555 to I420. -LIBYUV_API +LIBYUV_API SAFEBUFFERS int ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555, uint8* dst_y, int dst_stride_y, uint8* dst_u, int dst_stride_u, @@ -1458,7 +1351,7 @@ int ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555, } // Convert ARGB4444 to I420. -LIBYUV_API +LIBYUV_API SAFEBUFFERS int ARGB4444ToI420(const uint8* src_argb4444, int src_stride_argb4444, uint8* dst_y, int dst_stride_y, uint8* dst_u, int dst_stride_u, diff --git a/chromium/third_party/libyuv/source/convert_argb.cc b/chromium/third_party/libyuv/source/convert_argb.cc index 55d4d6904ce..0a503361d8b 100644 --- a/chromium/third_party/libyuv/source/convert_argb.cc +++ b/chromium/third_party/libyuv/source/convert_argb.cc @@ -63,16 +63,14 @@ int I444ToARGB(const uint8* src_y, int src_stride_y, dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } - // Coalesce contiguous rows. + // Coalesce rows. if (src_stride_y == width && src_stride_u == width && src_stride_v == width && dst_stride_argb == width * 4) { - return I444ToARGB(src_y, 0, - src_u, 0, - src_v, 0, - dst_argb, 0, - width * height, 1); + width *= height; + height = 1; + src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0; } void (*I444ToARGBRow)(const uint8* y_buf, const uint8* u_buf, @@ -126,16 +124,14 @@ int I422ToARGB(const uint8* src_y, int src_stride_y, dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } - // Coalesce contiguous rows. + // Coalesce rows. if (src_stride_y == width && src_stride_u * 2 == width && src_stride_v * 2 == width && dst_stride_argb == width * 4) { - return I422ToARGB(src_y, 0, - src_u, 0, - src_v, 0, - dst_argb, 0, - width * height, 1); + width *= height; + height = 1; + src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0; } void (*I422ToARGBRow)(const uint8* y_buf, const uint8* u_buf, @@ -207,16 +203,14 @@ int I411ToARGB(const uint8* src_y, int src_stride_y, dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } - // Coalesce contiguous rows. + // Coalesce rows. if (src_stride_y == width && src_stride_u * 4 == width && src_stride_v * 4 == width && dst_stride_argb == width * 4) { - return I411ToARGB(src_y, 0, - src_u, 0, - src_v, 0, - dst_argb, 0, - width * height, 1); + width *= height; + height = 1; + src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0; } void (*I411ToARGBRow)(const uint8* y_buf, const uint8* u_buf, @@ -267,12 +261,12 @@ int I400ToARGB_Reference(const uint8* src_y, int src_stride_y, dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } - // Coalesce contiguous rows. + // Coalesce rows. 
if (src_stride_y == width && dst_stride_argb == width * 4) { - return I400ToARGB_Reference(src_y, 0, - dst_argb, 0, - width * height, 1); + width *= height; + height = 1; + src_stride_y = dst_stride_argb = 0; } void (*YToARGBRow)(const uint8* y_buf, uint8* rgb_buf, @@ -317,12 +311,12 @@ int I400ToARGB(const uint8* src_y, int src_stride_y, src_y = src_y + (height - 1) * src_stride_y; src_stride_y = -src_stride_y; } - // Coalesce contiguous rows. + // Coalesce rows. if (src_stride_y == width && dst_stride_argb == width * 4) { - return I400ToARGB(src_y, 0, - dst_argb, 0, - width * height, 1); + width *= height; + height = 1; + src_stride_y = dst_stride_argb = 0; } void (*I400ToARGBRow)(const uint8* src_y, uint8* dst_argb, int pix) = I400ToARGBRow_C; @@ -353,17 +347,17 @@ int I400ToARGB(const uint8* src_y, int src_stride_y, } // Shuffle table for converting BGRA to ARGB. -static const uvec8 kShuffleMaskBGRAToARGB = { +static uvec8 kShuffleMaskBGRAToARGB = { 3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u }; // Shuffle table for converting ABGR to ARGB. -static const uvec8 kShuffleMaskABGRToARGB = { +static uvec8 kShuffleMaskABGRToARGB = { 2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u }; // Shuffle table for converting RGBA to ARGB. -static const uvec8 kShuffleMaskRGBAToARGB = { +static uvec8 kShuffleMaskRGBAToARGB = { 1u, 2u, 3u, 0u, 5u, 6u, 7u, 4u, 9u, 10u, 11u, 8u, 13u, 14u, 15u, 12u }; @@ -415,12 +409,12 @@ int RGB24ToARGB(const uint8* src_rgb24, int src_stride_rgb24, src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24; src_stride_rgb24 = -src_stride_rgb24; } - // Coalesce contiguous rows. + // Coalesce rows. if (src_stride_rgb24 == width * 3 && dst_stride_argb == width * 4) { - return RGB24ToARGB(src_rgb24, 0, - dst_argb, 0, - width * height, 1); + width *= height; + height = 1; + src_stride_rgb24 = dst_stride_argb = 0; } void (*RGB24ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) = RGB24ToARGBRow_C; @@ -464,12 +458,12 @@ int RAWToARGB(const uint8* src_raw, int src_stride_raw, src_raw = src_raw + (height - 1) * src_stride_raw; src_stride_raw = -src_stride_raw; } - // Coalesce contiguous rows. + // Coalesce rows. if (src_stride_raw == width * 3 && dst_stride_argb == width * 4) { - return RAWToARGB(src_raw, 0, - dst_argb, 0, - width * height, 1); + width *= height; + height = 1; + src_stride_raw = dst_stride_argb = 0; } void (*RAWToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) = RAWToARGBRow_C; @@ -513,12 +507,12 @@ int RGB565ToARGB(const uint8* src_rgb565, int src_stride_rgb565, src_rgb565 = src_rgb565 + (height - 1) * src_stride_rgb565; src_stride_rgb565 = -src_stride_rgb565; } - // Coalesce contiguous rows. + // Coalesce rows. if (src_stride_rgb565 == width * 2 && dst_stride_argb == width * 4) { - return RGB565ToARGB(src_rgb565, 0, - dst_argb, 0, - width * height, 1); + width *= height; + height = 1; + src_stride_rgb565 = dst_stride_argb = 0; } void (*RGB565ToARGBRow)(const uint8* src_rgb565, uint8* dst_argb, int pix) = RGB565ToARGBRow_C; @@ -562,12 +556,12 @@ int ARGB1555ToARGB(const uint8* src_argb1555, int src_stride_argb1555, src_argb1555 = src_argb1555 + (height - 1) * src_stride_argb1555; src_stride_argb1555 = -src_stride_argb1555; } - // Coalesce contiguous rows. + // Coalesce rows. 
if (src_stride_argb1555 == width * 2 && dst_stride_argb == width * 4) { - return ARGB1555ToARGB(src_argb1555, 0, - dst_argb, 0, - width * height, 1); + width *= height; + height = 1; + src_stride_argb1555 = dst_stride_argb = 0; } void (*ARGB1555ToARGBRow)(const uint8* src_argb1555, uint8* dst_argb, int pix) = ARGB1555ToARGBRow_C; @@ -611,12 +605,12 @@ int ARGB4444ToARGB(const uint8* src_argb4444, int src_stride_argb4444, src_argb4444 = src_argb4444 + (height - 1) * src_stride_argb4444; src_stride_argb4444 = -src_stride_argb4444; } - // Coalesce contiguous rows. + // Coalesce rows. if (src_stride_argb4444 == width * 2 && dst_stride_argb == width * 4) { - return ARGB4444ToARGB(src_argb4444, 0, - dst_argb, 0, - width * height, 1); + width *= height; + height = 1; + src_stride_argb4444 = dst_stride_argb = 0; } void (*ARGB4444ToARGBRow)(const uint8* src_argb4444, uint8* dst_argb, int pix) = ARGB4444ToARGBRow_C; @@ -812,13 +806,13 @@ int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2, src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2; src_stride_yuy2 = -src_stride_yuy2; } - // Coalesce contiguous rows. + // Coalesce rows. if (width * height <= kMaxStride && src_stride_yuy2 == width * 2 && dst_stride_argb == width * 4) { - return YUY2ToARGB(src_yuy2, 0, - dst_argb, 0, - width * height, 1); + width *= height; + height = 1; + src_stride_yuy2 = dst_stride_argb = 0; } void (*YUY2ToARGBRow)(const uint8* src_yuy2, uint8* dst_argb, int pix) = YUY2ToARGBRow_C; @@ -865,13 +859,13 @@ int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy, src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy; src_stride_uyvy = -src_stride_uyvy; } - // Coalesce contiguous rows. + // Coalesce rows. if (width * height <= kMaxStride && src_stride_uyvy == width * 2 && dst_stride_argb == width * 4) { - return UYVYToARGB(src_uyvy, 0, - dst_argb, 0, - width * height, 1); + width *= height; + height = 1; + src_stride_uyvy = dst_stride_argb = 0; } void (*UYVYToARGBRow)(const uint8* src_uyvy, uint8* dst_argb, int pix) = UYVYToARGBRow_C; diff --git a/chromium/third_party/libyuv/source/convert_from.cc b/chromium/third_party/libyuv/source/convert_from.cc index 87f9b5cb726..dc708de5e0b 100644 --- a/chromium/third_party/libyuv/source/convert_from.cc +++ b/chromium/third_party/libyuv/source/convert_from.cc @@ -25,6 +25,42 @@ namespace libyuv { extern "C" { #endif +#define SUBSAMPLE(v, a, s) (v < 0) ? (-((-v + a) >> s)) : ((v + a) >> s) +static __inline int Abs(int v) { + return v >= 0 ? v : -v; +} + +// I420 To any I4xx YUV format with mirroring. 
+static int I420ToI4xx(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int src_y_width, int src_y_height, + int dst_uv_width, int dst_uv_height) { + if (src_y_width == 0 || src_y_height == 0 || + dst_uv_width <= 0 || dst_uv_height <= 0) { + return -1; + } + const int dst_y_width = Abs(src_y_width); + const int dst_y_height = Abs(src_y_height); + const int src_uv_width = SUBSAMPLE(src_y_width, 1, 1); + const int src_uv_height = SUBSAMPLE(src_y_height, 1, 1); + ScalePlane(src_y, src_stride_y, src_y_width, src_y_height, + dst_y, dst_stride_y, dst_y_width, dst_y_height, + kFilterBilinear); + ScalePlane(src_u, src_stride_u, src_uv_width, src_uv_height, + dst_u, dst_stride_u, dst_uv_width, dst_uv_height, + kFilterBilinear); + ScalePlane(src_v, src_stride_v, src_uv_width, src_uv_height, + dst_v, dst_stride_v, dst_uv_width, dst_uv_height, + kFilterBilinear); + return 0; +} + +// 420 chroma is 1/2 width, 1/2 height +// 422 chroma is 1/2 width, 1x height LIBYUV_API int I420ToI422(const uint8* src_y, int src_stride_y, const uint8* src_u, int src_stride_u, @@ -33,84 +69,20 @@ int I420ToI422(const uint8* src_y, int src_stride_y, uint8* dst_u, int dst_stride_u, uint8* dst_v, int dst_stride_v, int width, int height) { - if (!src_y || !src_u || !src_v || - !dst_y || !dst_u || !dst_v || - width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_y = dst_y + (height - 1) * dst_stride_y; - dst_u = dst_u + (height - 1) * dst_stride_u; - dst_v = dst_v + (height - 1) * dst_stride_v; - dst_stride_y = -dst_stride_y; - dst_stride_u = -dst_stride_u; - dst_stride_v = -dst_stride_v; - } - int halfwidth = (width + 1) >> 1; - void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C; -#if defined(HAS_COPYROW_X86) - if (IS_ALIGNED(halfwidth, 4)) { - CopyRow = CopyRow_X86; - } -#endif -#if defined(HAS_COPYROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(halfwidth, 32) && - IS_ALIGNED(src_u, 16) && IS_ALIGNED(src_stride_u, 16) && - IS_ALIGNED(src_v, 16) && IS_ALIGNED(src_stride_v, 16) && - IS_ALIGNED(dst_u, 16) && IS_ALIGNED(dst_stride_u, 16) && - IS_ALIGNED(dst_v, 16) && IS_ALIGNED(dst_stride_v, 16)) { - CopyRow = CopyRow_SSE2; - } -#endif -#if defined(HAS_COPYROW_ERMS) - if (TestCpuFlag(kCpuHasERMS)) { - CopyRow = CopyRow_ERMS; - } -#endif -#if defined(HAS_COPYROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(halfwidth, 32)) { - CopyRow = CopyRow_NEON; - } -#endif -#if defined(HAS_COPYROW_MIPS) - if (TestCpuFlag(kCpuHasMIPS)) { - CopyRow = CopyRow_MIPS; - } -#endif - - // Copy Y plane - if (dst_y) { - CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); - } - - // UpSample U plane. - int y; - for (y = 0; y < height - 1; y += 2) { - CopyRow(src_u, dst_u, halfwidth); - CopyRow(src_u, dst_u + dst_stride_u, halfwidth); - src_u += src_stride_u; - dst_u += dst_stride_u * 2; - } - if (height & 1) { - CopyRow(src_u, dst_u, halfwidth); - } - - // UpSample V plane. 
- for (y = 0; y < height - 1; y += 2) { - CopyRow(src_v, dst_v, halfwidth); - CopyRow(src_v, dst_v + dst_stride_v, halfwidth); - src_v += src_stride_v; - dst_v += dst_stride_v * 2; - } - if (height & 1) { - CopyRow(src_v, dst_v, halfwidth); - } - return 0; + const int dst_uv_width = (Abs(width) + 1) >> 1; + const int dst_uv_height = Abs(height); + return I420ToI4xx(src_y, src_stride_y, + src_u, src_stride_u, + src_v, src_stride_v, + dst_y, dst_stride_y, + dst_u, dst_stride_u, + dst_v, dst_stride_v, + width, height, + dst_uv_width, dst_uv_height); } -// TODO(fbarchard): Enable bilinear when fast enough or specialized upsampler. +// 420 chroma is 1/2 width, 1/2 height +// 444 chroma is 1x width, 1x height LIBYUV_API int I420ToI444(const uint8* src_y, int src_stride_y, const uint8* src_u, int src_stride_u, @@ -119,40 +91,16 @@ int I420ToI444(const uint8* src_y, int src_stride_y, uint8* dst_u, int dst_stride_u, uint8* dst_v, int dst_stride_v, int width, int height) { - if (!src_y || !src_u|| !src_v || - !dst_y || !dst_u || !dst_v || - width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_y = dst_y + (height - 1) * dst_stride_y; - dst_u = dst_u + (height - 1) * dst_stride_u; - dst_v = dst_v + (height - 1) * dst_stride_v; - dst_stride_y = -dst_stride_y; - dst_stride_u = -dst_stride_u; - dst_stride_v = -dst_stride_v; - } - - // Copy Y plane - if (dst_y) { - CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); - } - - int halfwidth = (width + 1) >> 1; - int halfheight = (height + 1) >> 1; - - // Upsample U plane from from 1/2 width, 1/2 height to 1x width, 1x height. - ScalePlane(src_u, src_stride_u, halfwidth, halfheight, - dst_u, dst_stride_u, width, height, - kFilterNone); - - // Upsample V plane. - ScalePlane(src_v, src_stride_v, halfwidth, halfheight, - dst_v, dst_stride_v, width, height, - kFilterNone); - return 0; + const int dst_uv_width = Abs(width); + const int dst_uv_height = Abs(height); + return I420ToI4xx(src_y, src_stride_y, + src_u, src_stride_u, + src_v, src_stride_v, + dst_y, dst_stride_y, + dst_u, dst_stride_u, + dst_v, dst_stride_v, + width, height, + dst_uv_width, dst_uv_height); } // 420 chroma is 1/2 width, 1/2 height @@ -165,41 +113,16 @@ int I420ToI411(const uint8* src_y, int src_stride_y, uint8* dst_u, int dst_stride_u, uint8* dst_v, int dst_stride_v, int width, int height) { - if (!src_y || !src_u || !src_v || - !dst_y || !dst_u || !dst_v || - width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_y = dst_y + (height - 1) * dst_stride_y; - dst_u = dst_u + (height - 1) * dst_stride_u; - dst_v = dst_v + (height - 1) * dst_stride_v; - dst_stride_y = -dst_stride_y; - dst_stride_u = -dst_stride_u; - dst_stride_v = -dst_stride_v; - } - - // Copy Y plane - if (dst_y) { - CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); - } - - int halfwidth = (width + 1) >> 1; - int halfheight = (height + 1) >> 1; - int quarterwidth = (width + 3) >> 2; - - // Resample U plane from 1/2 width, 1/2 height to 1/4 width, 1x height - ScalePlane(src_u, src_stride_u, halfwidth, halfheight, - dst_u, dst_stride_u, quarterwidth, height, - kFilterNone); - - // Resample V plane. 
- ScalePlane(src_v, src_stride_v, halfwidth, halfheight, - dst_v, dst_stride_v, quarterwidth, height, - kFilterNone); - return 0; + const int dst_uv_width = (Abs(width) + 3) >> 2; + const int dst_uv_height = Abs(height); + return I420ToI4xx(src_y, src_stride_y, + src_u, src_stride_u, + src_v, src_stride_v, + dst_y, dst_stride_y, + dst_u, dst_stride_u, + dst_v, dst_stride_v, + width, height, + dst_uv_width, dst_uv_height); } // Copy to I400. Source can be I420,422,444,400,NV12,NV21 @@ -237,16 +160,14 @@ int I422ToYUY2(const uint8* src_y, int src_stride_y, dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2; dst_stride_yuy2 = -dst_stride_yuy2; } - // Coalesce contiguous rows. + // Coalesce rows. if (src_stride_y == width && src_stride_u * 2 == width && src_stride_v * 2 == width && dst_stride_yuy2 == width * 2) { - return I422ToYUY2(src_y, 0, - src_u, 0, - src_v, 0, - dst_yuy2, 0, - width * height, 1); + width *= height; + height = 1; + src_stride_y = src_stride_u = src_stride_v = dst_stride_yuy2 = 0; } void (*I422ToYUY2Row)(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_yuy2, int width) = @@ -343,16 +264,14 @@ int I422ToUYVY(const uint8* src_y, int src_stride_y, dst_uyvy = dst_uyvy + (height - 1) * dst_stride_uyvy; dst_stride_uyvy = -dst_stride_uyvy; } - // Coalesce contiguous rows. + // Coalesce rows. if (src_stride_y == width && src_stride_u * 2 == width && src_stride_v * 2 == width && dst_stride_uyvy == width * 2) { - return I422ToUYVY(src_y, 0, - src_u, 0, - src_v, 0, - dst_uyvy, 0, - width * height, 1); + width *= height; + height = 1; + src_stride_y = src_stride_u = src_stride_v = dst_stride_uyvy = 0; } void (*I422ToUYVYRow)(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_uyvy, int width) = @@ -453,19 +372,22 @@ int I420ToNV12(const uint8* src_y, int src_stride_y, dst_stride_y = -dst_stride_y; dst_stride_uv = -dst_stride_uv; } - // Coalesce contiguous rows. + // Coalesce rows. int halfwidth = (width + 1) >> 1; int halfheight = (height + 1) >> 1; if (src_stride_y == width && dst_stride_y == width) { - width = width * height; + width *= height; height = 1; + src_stride_y = dst_stride_y = 0; } - if (src_stride_u * 2 == width && - src_stride_v * 2 == width && - dst_stride_uv == width) { - halfwidth = halfwidth * halfheight; + // Coalesce rows. + if (src_stride_u == halfwidth && + src_stride_v == halfwidth && + dst_stride_uv == halfwidth * 2) { + halfwidth *= halfheight; halfheight = 1; + src_stride_u = src_stride_v = dst_stride_uv = 0; } void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv, int width) = MergeUVRow_C; diff --git a/chromium/third_party/libyuv/source/convert_from_argb.cc b/chromium/third_party/libyuv/source/convert_from_argb.cc index 418f44d0cf5..9d5752cbb09 100644 --- a/chromium/third_party/libyuv/source/convert_from_argb.cc +++ b/chromium/third_party/libyuv/source/convert_from_argb.cc @@ -36,32 +36,30 @@ int ARGBToI444(const uint8* src_argb, int src_stride_argb, src_argb = src_argb + (height - 1) * src_stride_argb; src_stride_argb = -src_stride_argb; } - // Coalesce contiguous rows. + // Coalesce rows. 
if (src_stride_argb == width * 4 && dst_stride_y == width && dst_stride_u == width && dst_stride_v == width) { - return ARGBToI444(src_argb, 0, - dst_y, 0, - dst_u, 0, - dst_v, 0, - width * height, 1); + width *= height; + height = 1; + src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0; } void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = ARGBToYRow_C; void (*ARGBToUV444Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v, int pix) = ARGBToUV444Row_C; -#if defined(HAS_ARGBTOUV444ROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
- ARGBToUV444Row = ARGBToUV444Row_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- ARGBToUV444Row = ARGBToUV444Row_Unaligned_SSSE3;
- if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
- ARGBToUV444Row = ARGBToUV444Row_SSSE3;
- }
- }
- }
-#endif
+#if defined(HAS_ARGBTOUV444ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + ARGBToUV444Row = ARGBToUV444Row_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUV444Row = ARGBToUV444Row_Unaligned_SSSE3; + if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) { + ARGBToUV444Row = ARGBToUV444Row_SSSE3; + } + } + } +#endif #if defined(HAS_ARGBTOYROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { ARGBToYRow = ARGBToYRow_Any_SSSE3; @@ -111,16 +109,14 @@ int ARGBToI422(const uint8* src_argb, int src_stride_argb, src_argb = src_argb + (height - 1) * src_stride_argb; src_stride_argb = -src_stride_argb; } - // Coalesce contiguous rows. + // Coalesce rows. if (src_stride_argb == width * 4 && dst_stride_y == width && dst_stride_u * 2 == width && dst_stride_v * 2 == width) { - return ARGBToI422(src_argb, 0, - dst_y, 0, - dst_u, 0, - dst_v, 0, - width * height, 1); + width *= height; + height = 1; + src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0; } void (*ARGBToUV422Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v, int pix) = ARGBToUV422Row_C; @@ -190,16 +186,14 @@ int ARGBToI411(const uint8* src_argb, int src_stride_argb, src_argb = src_argb + (height - 1) * src_stride_argb; src_stride_argb = -src_stride_argb; } - // Coalesce contiguous rows. + // Coalesce rows. if (src_stride_argb == width * 4 && dst_stride_y == width && dst_stride_u * 4 == width && dst_stride_v * 4 == width) { - return ARGBToI411(src_argb, 0, - dst_y, 0, - dst_u, 0, - dst_v, 0, - width * height, 1); + width *= height; + height = 1; + src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0; } void (*ARGBToUV411Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v, int pix) = ARGBToUV411Row_C; @@ -251,7 +245,7 @@ int ARGBToI411(const uint8* src_argb, int src_stride_argb, return 0; } -LIBYUV_API +LIBYUV_API SAFEBUFFERS int ARGBToNV12(const uint8* src_argb, int src_stride_argb, uint8* dst_y, int dst_stride_y, uint8* dst_uv, int dst_stride_uv, @@ -272,7 +266,7 @@ int ARGBToNV12(const uint8* src_argb, int src_stride_argb, uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = ARGBToYRow_C; -#if defined(HAS_ARGBTOYROW_SSSE3) +#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { ARGBToUVRow = ARGBToUVRow_Any_SSSE3; ARGBToYRow = ARGBToYRow_Any_SSSE3; @@ -353,7 +347,7 @@ int ARGBToNV12(const uint8* src_argb, int src_stride_argb, } // Same as NV12 but U and V swapped. -LIBYUV_API +LIBYUV_API SAFEBUFFERS int ARGBToNV21(const uint8* src_argb, int src_stride_argb, uint8* dst_y, int dst_stride_y, uint8* dst_uv, int dst_stride_uv, @@ -374,7 +368,7 @@ int ARGBToNV21(const uint8* src_argb, int src_stride_argb, uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = ARGBToYRow_C; -#if defined(HAS_ARGBTOYROW_SSSE3) +#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { ARGBToUVRow = ARGBToUVRow_Any_SSSE3; ARGBToYRow = ARGBToYRow_Any_SSSE3; @@ -455,7 +449,7 @@ int ARGBToNV21(const uint8* src_argb, int src_stride_argb, } // Convert ARGB to YUY2. 
-LIBYUV_API +LIBYUV_API SAFEBUFFERS int ARGBToYUY2(const uint8* src_argb, int src_stride_argb, uint8* dst_yuy2, int dst_stride_yuy2, int width, int height) { @@ -470,13 +464,13 @@ int ARGBToYUY2(const uint8* src_argb, int src_stride_argb, dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2; dst_stride_yuy2 = -dst_stride_yuy2; } - // Coalesce contiguous rows. + // Coalesce rows. if (width * height <= kMaxStride && src_stride_argb == width * 4 && dst_stride_yuy2 == width * 2) { - return ARGBToYUY2(src_argb, 0, - dst_yuy2, 0, - width * height, 1); + width *= height; + height = 1; + src_stride_argb = dst_stride_yuy2 = 0; } void (*ARGBToUV422Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v, int pix) = ARGBToUV422Row_C; @@ -551,7 +545,7 @@ int ARGBToYUY2(const uint8* src_argb, int src_stride_argb, } // Convert ARGB to UYVY. -LIBYUV_API +LIBYUV_API SAFEBUFFERS int ARGBToUYVY(const uint8* src_argb, int src_stride_argb, uint8* dst_uyvy, int dst_stride_uyvy, int width, int height) { @@ -566,13 +560,13 @@ int ARGBToUYVY(const uint8* src_argb, int src_stride_argb, dst_uyvy = dst_uyvy + (height - 1) * dst_stride_uyvy; dst_stride_uyvy = -dst_stride_uyvy; } - // Coalesce contiguous rows. + // Coalesce rows. if (width * height <= kMaxStride && src_stride_argb == width * 4 && dst_stride_uyvy == width * 2) { - return ARGBToUYVY(src_argb, 0, - dst_uyvy, 0, - width * height, 1); + width *= height; + height = 1; + src_stride_argb = dst_stride_uyvy = 0; } void (*ARGBToUV422Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v, int pix) = ARGBToUV422Row_C; @@ -659,12 +653,12 @@ int ARGBToI400(const uint8* src_argb, int src_stride_argb, src_argb = src_argb + (height - 1) * src_stride_argb; src_stride_argb = -src_stride_argb; } - // Coalesce contiguous rows. + // Coalesce rows. if (src_stride_argb == width * 4 && dst_stride_y == width) { - return ARGBToI400(src_argb, 0, - dst_y, 0, - width * height, 1); + width *= height; + height = 1; + src_stride_argb = dst_stride_y = 0; } void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = ARGBToYRow_C; @@ -706,7 +700,7 @@ int ARGBToI400(const uint8* src_argb, int src_stride_argb, } // Shuffle table for converting ARGB to RGBA. -static const uvec8 kShuffleMaskARGBToRGBA = { +static uvec8 kShuffleMaskARGBToRGBA = { 3u, 0u, 1u, 2u, 7u, 4u, 5u, 6u, 11u, 8u, 9u, 10u, 15u, 12u, 13u, 14u }; @@ -734,19 +728,17 @@ int ARGBToRGB24(const uint8* src_argb, int src_stride_argb, src_argb = src_argb + (height - 1) * src_stride_argb; src_stride_argb = -src_stride_argb; } - // Coalesce contiguous rows. + // Coalesce rows. if (src_stride_argb == width * 4 && dst_stride_rgb24 == width * 3) { - return ARGBToRGB24(src_argb, 0, - dst_rgb24, 0, - width * height, 1); + width *= height; + height = 1; + src_stride_argb = dst_stride_rgb24 = 0; } void (*ARGBToRGB24Row)(const uint8* src_argb, uint8* dst_rgb, int pix) = ARGBToRGB24Row_C; #if defined(HAS_ARGBTORGB24ROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && width >= 16 && - IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) && - IS_ALIGNED(dst_rgb24, 16) && IS_ALIGNED(dst_stride_rgb24, 16)) { + if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { ARGBToRGB24Row = ARGBToRGB24Row_Any_SSSE3; if (IS_ALIGNED(width, 16)) { ARGBToRGB24Row = ARGBToRGB24Row_SSSE3; @@ -782,19 +774,17 @@ int ARGBToRAW(const uint8* src_argb, int src_stride_argb, src_argb = src_argb + (height - 1) * src_stride_argb; src_stride_argb = -src_stride_argb; } - // Coalesce contiguous rows. + // Coalesce rows. 
if (src_stride_argb == width * 4 && dst_stride_raw == width * 3) { - return ARGBToRAW(src_argb, 0, - dst_raw, 0, - width * height, 1); + width *= height; + height = 1; + src_stride_argb = dst_stride_raw = 0; } void (*ARGBToRAWRow)(const uint8* src_argb, uint8* dst_rgb, int pix) = ARGBToRAWRow_C; #if defined(HAS_ARGBTORAWROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && width >= 16 && - IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) && - IS_ALIGNED(dst_raw, 16) && IS_ALIGNED(dst_stride_raw, 16)) { + if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { ARGBToRAWRow = ARGBToRAWRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { ARGBToRAWRow = ARGBToRAWRow_SSSE3; @@ -830,12 +820,12 @@ int ARGBToRGB565(const uint8* src_argb, int src_stride_argb, src_argb = src_argb + (height - 1) * src_stride_argb; src_stride_argb = -src_stride_argb; } - // Coalesce contiguous rows. + // Coalesce rows. if (src_stride_argb == width * 4 && dst_stride_rgb565 == width * 2) { - return ARGBToRGB565(src_argb, 0, - dst_rgb565, 0, - width * height, 1); + width *= height; + height = 1; + src_stride_argb = dst_stride_rgb565 = 0; } void (*ARGBToRGB565Row)(const uint8* src_argb, uint8* dst_rgb, int pix) = ARGBToRGB565Row_C; @@ -877,12 +867,12 @@ int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb, src_argb = src_argb + (height - 1) * src_stride_argb; src_stride_argb = -src_stride_argb; } - // Coalesce contiguous rows. + // Coalesce rows. if (src_stride_argb == width * 4 && dst_stride_argb1555 == width * 2) { - return ARGBToARGB1555(src_argb, 0, - dst_argb1555, 0, - width * height, 1); + width *= height; + height = 1; + src_stride_argb = dst_stride_argb1555 = 0; } void (*ARGBToARGB1555Row)(const uint8* src_argb, uint8* dst_rgb, int pix) = ARGBToARGB1555Row_C; @@ -924,12 +914,12 @@ int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb, src_argb = src_argb + (height - 1) * src_stride_argb; src_stride_argb = -src_stride_argb; } - // Coalesce contiguous rows. + // Coalesce rows. if (src_stride_argb == width * 4 && dst_stride_argb4444 == width * 2) { - return ARGBToARGB4444(src_argb, 0, - dst_argb4444, 0, - width * height, 1); + width *= height; + height = 1; + src_stride_argb = dst_stride_argb4444 = 0; } void (*ARGBToARGB4444Row)(const uint8* src_argb, uint8* dst_rgb, int pix) = ARGBToARGB4444Row_C; @@ -980,7 +970,7 @@ int ARGBToJ420(const uint8* src_argb, int src_stride_argb, uint8* dst_u, uint8* dst_v, int width) = ARGBToUVJRow_C; void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int pix) = ARGBToYJRow_C; -#if defined(HAS_ARGBTOYJROW_SSSE3) +#if defined(HAS_ARGBTOYJROW_SSSE3) && defined(HAS_ARGBTOUVJROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3; ARGBToYJRow = ARGBToYJRow_Any_SSSE3; @@ -996,7 +986,7 @@ int ARGBToJ420(const uint8* src_argb, int src_stride_argb, } } #endif -#if defined(HAS_ARGBTOYJROW_AVX2) +#if defined(HAS_ARGBTOYJROW_AVX2) && defined(HAS_ARGBTOUVJROW_AVX2) if (TestCpuFlag(kCpuHasAVX2) && width >= 32) { ARGBToYJRow = ARGBToYJRow_Any_AVX2; if (IS_ALIGNED(width, 32)) { @@ -1048,12 +1038,12 @@ int ARGBToJ400(const uint8* src_argb, int src_stride_argb, src_argb = src_argb + (height - 1) * src_stride_argb; src_stride_argb = -src_stride_argb; } - // Coalesce contiguous rows. + // Coalesce rows. 
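Several hunks above (ARGBToRGB24, ARGBToRAW and others) drop the 16-byte pointer and stride alignment preconditions from the SSSE3 dispatch, keeping only the width checks and relying on an _Any_ variant for ragged widths. A hedged sketch of that dispatch shape; the row functions and the CPU check below are stand-ins (the "SIMD" variants simply forward to the C path), not the real libyuv kernels:

#include <cstdint>

// BT.601-style integer luma, for illustration only.
static void ToYRow_C(const uint8_t* argb, uint8_t* y, int pix) {
  for (int i = 0; i < pix; ++i) {
    y[i] = static_cast<uint8_t>((66 * argb[i * 4 + 2] + 129 * argb[i * 4 + 1] +
                                 25 * argb[i * 4 + 0] + 128) / 256 + 16);
  }
}
// Stand-ins for the generated kernels; real code would be SSSE3 intrinsics/asm.
static void ToYRow_Any_SSSE3(const uint8_t* a, uint8_t* y, int p) { ToYRow_C(a, y, p); }
static void ToYRow_SSSE3(const uint8_t* a, uint8_t* y, int p) { ToYRow_C(a, y, p); }
static bool CpuHasSSSE3() { return true; }  // stand-in for TestCpuFlag(kCpuHasSSSE3)

using RowFn = void (*)(const uint8_t*, uint8_t*, int);

// Tiered selection: portable C fallback, an _Any_ variant that copes with
// ragged widths, and the full-width kernel when width is a multiple of 16.
RowFn PickToYRow(int width) {
  RowFn fn = ToYRow_C;
  if (CpuHasSSSE3() && width >= 16) {
    fn = ToYRow_Any_SSSE3;
    if (width % 16 == 0) fn = ToYRow_SSSE3;
  }
  return fn;
}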
if (src_stride_argb == width * 4 && dst_stride_yj == width) { - return ARGBToJ400(src_argb, 0, - dst_yj, 0, - width * height, 1); + width *= height; + height = 1; + src_stride_argb = dst_stride_yj = 0; } void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int pix) = ARGBToYJRow_C; diff --git a/chromium/third_party/libyuv/source/convert_to_argb.cc b/chromium/third_party/libyuv/source/convert_to_argb.cc index 95b6386d719..aa6185661cd 100644 --- a/chromium/third_party/libyuv/source/convert_to_argb.cc +++ b/chromium/third_party/libyuv/source/convert_to_argb.cc @@ -61,15 +61,15 @@ int ConvertToARGB(const uint8* sample, size_t sample_size, bool need_buf = (rotation && format != FOURCC_ARGB) || dst_argb == sample; uint8* tmp_argb = dst_argb; int tmp_argb_stride = argb_stride; - uint8* buf = NULL; + uint8* rotate_buffer = NULL; int abs_dst_height = (dst_height < 0) ? -dst_height : dst_height; if (need_buf) { int argb_size = dst_width * abs_dst_height * 4; - buf = new uint8[argb_size]; - if (!buf) { + rotate_buffer = new uint8[argb_size]; + if (!rotate_buffer) { return 1; // Out of memory runtime error. } - dst_argb = buf; + dst_argb = rotate_buffer; argb_stride = dst_width; } @@ -312,7 +312,7 @@ int ConvertToARGB(const uint8* sample, size_t sample_size, tmp_argb, tmp_argb_stride, dst_width, abs_dst_height, rotation); } - delete buf; + delete [] rotate_buffer; } return r; diff --git a/chromium/third_party/libyuv/source/convert_to_i420.cc b/chromium/third_party/libyuv/source/convert_to_i420.cc index 763eb50920e..5683ffe43ab 100644 --- a/chromium/third_party/libyuv/source/convert_to_i420.cc +++ b/chromium/third_party/libyuv/source/convert_to_i420.cc @@ -68,16 +68,16 @@ int ConvertToI420(const uint8* sample, int tmp_y_stride = y_stride; int tmp_u_stride = u_stride; int tmp_v_stride = v_stride; - uint8* buf = NULL; + uint8* rotate_buffer = NULL; int abs_dst_height = (dst_height < 0) ? -dst_height : dst_height; if (need_buf) { int y_size = dst_width * abs_dst_height; int uv_size = ((dst_width + 1) / 2) * ((abs_dst_height + 1) / 2); - buf = new uint8[y_size + uv_size * 2]; - if (!buf) { + rotate_buffer = new uint8[y_size + uv_size * 2]; + if (!rotate_buffer) { return 1; // Out of memory runtime error. } - y = buf; + y = rotate_buffer; u = y + y_size; v = u + uv_size; y_stride = dst_width; @@ -372,7 +372,7 @@ int ConvertToI420(const uint8* sample, tmp_v, tmp_v_stride, dst_width, abs_dst_height, rotation); } - delete buf; + delete [] rotate_buffer; } return r; diff --git a/chromium/third_party/libyuv/source/cpu_id.cc b/chromium/third_party/libyuv/source/cpu_id.cc index b4c993a2740..c4f840abb14 100644 --- a/chromium/third_party/libyuv/source/cpu_id.cc +++ b/chromium/third_party/libyuv/source/cpu_id.cc @@ -11,14 +11,16 @@ #include "libyuv/cpu_id.h" #ifdef _MSC_VER -#include <intrin.h> // For __cpuid() +#include <intrin.h> // For __cpuidex() #endif -#if !defined(__CLR_VER) && defined(_M_X64) && \ +#if !defined(__CLR_VER) && !defined(__native_client__) && defined(_M_X64) && \ defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219) #include <immintrin.h> // For _xgetbv() #endif +#if !defined(__native_client__) #include <stdlib.h> // For getenv() +#endif // For ArmCpuCaps() but unittested on all platforms #include <stdio.h> @@ -26,92 +28,102 @@ #include "libyuv/basic_types.h" // For CPU_X86 -// TODO(fbarchard): Consider cpu functionality for breakpoints, timer and cache. -// arm - bkpt vs intel int 3 - -// TODO(fbarchard): Use cpuid.h when gcc 4.4 is used on OSX and Linux. 
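In the convert_to_argb.cc and convert_to_i420.cc hunks above, the temporary rotation buffer is renamed and, more importantly, released with delete [] instead of delete, matching its new uint8[...] allocation. A minimal sketch of the corrected ownership pattern; the function name is ours:

#include <cstdint>

void ConvertWithRotateScratch(int dst_width, int dst_height, bool need_buf) {
  uint8_t* rotate_buffer = nullptr;
  if (need_buf) {
    rotate_buffer = new uint8_t[dst_width * dst_height * 4];
    // ... convert into rotate_buffer, then rotate into the caller's plane ...
  }
  delete [] rotate_buffer;  // array new must be paired with array delete.
}

A std::unique_ptr<uint8_t[]> would make the pairing automatic; the patch keeps the manual form.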
-#if (defined(__pic__) || defined(__APPLE__)) && defined(__i386__) -static __inline void __cpuid(int cpu_info[4], int info_type) { - asm volatile ( // NOLINT - "mov %%ebx, %%edi \n" - "cpuid \n" - "xchg %%edi, %%ebx \n" - : "=a"(cpu_info[0]), "=D"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3]) - : "a"(info_type)); -} -#elif defined(__i386__) || defined(__x86_64__) -static __inline void __cpuid(int cpu_info[4], int info_type) { - asm volatile ( // NOLINT - "cpuid \n" - : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3]) - : "a"(info_type)); -} -#endif - #ifdef __cplusplus namespace libyuv { extern "C" { #endif +// For functions that use rowbuffer and have runtime checks for overflow, +// use SAFEBUFFERS to avoid additional check. +#if defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219) +#define SAFEBUFFERS __declspec(safebuffers) +#else +#define SAFEBUFFERS +#endif + // Low level cpuid for X86. Returns zeros on other CPUs. #if !defined(__CLR_VER) && (defined(_M_IX86) || defined(_M_X64) || \ defined(__i386__) || defined(__x86_64__)) LIBYUV_API -void CpuId(int cpu_info[4], int info_type) { - __cpuid(cpu_info, info_type); -} +void CpuId(uint32 info_eax, uint32 info_ecx, uint32* cpu_info) { +#if defined(_MSC_VER) +#if (_MSC_FULL_VER >= 160040219) + __cpuidex(reinterpret_cast<int*>(cpu_info), info_eax, info_ecx); +#elif defined(_M_IX86) + __asm { + mov eax, info_eax + mov ecx, info_ecx + mov edi, cpu_info + cpuid + mov [edi], eax + mov [edi + 4], ebx + mov [edi + 8], ecx + mov [edi + 12], edx + } #else -LIBYUV_API -void CpuId(int cpu_info[4], int) { - cpu_info[0] = cpu_info[1] = cpu_info[2] = cpu_info[3] = 0; -} + if (info_ecx == 0) { + __cpuid(reinterpret_cast<int*>(cpu_info), info_eax); + } else { + cpu_info[3] = cpu_info[2] = cpu_info[1] = cpu_info[0] = 0; + } #endif - -// X86 CPUs have xgetbv to detect OS saves high parts of ymm registers. -#if !defined(__CLR_VER) && defined(_M_X64) && \ - defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219) -#define HAS_XGETBV -static uint32 XGetBV(unsigned int xcr) { - return static_cast<uint32>(_xgetbv(xcr)); +#else // defined(_MSC_VER) + uint32 info_ebx, info_edx; + asm volatile ( // NOLINT +#if defined( __i386__) && defined(__PIC__) + // Preserve ebx for fpic 32 bit. + "mov %%ebx, %%edi \n" + "cpuid \n" + "xchg %%edi, %%ebx \n" + : "=D" (info_ebx), +#else + "cpuid \n" + : "=b" (info_ebx), +#endif // defined( __i386__) && defined(__PIC__) + "+a" (info_eax), "+c" (info_ecx), "=d" (info_edx)); + cpu_info[0] = info_eax; + cpu_info[1] = info_ebx; + cpu_info[2] = info_ecx; + cpu_info[3] = info_edx; +#endif // defined(_MSC_VER) } -#elif !defined(__CLR_VER) && defined(_M_IX86) && defined(_MSC_VER) + +#if !defined(__native_client__) #define HAS_XGETBV -__declspec(naked) __declspec(align(16)) -static uint32 XGetBV(unsigned int xcr) { +// X86 CPUs have xgetbv to detect OS saves high parts of ymm registers. +int TestOsSaveYmm() { + uint32 xcr0 = 0u; +#if defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219) + xcr0 = static_cast<uint32>(_xgetbv(0)); // VS2010 SP1 required. +#elif defined(_M_IX86) __asm { - mov ecx, [esp + 4] // xcr - push edx - _asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0 // xgetbv for vs2005. - pop edx - ret + xor ecx, ecx // xcr 0 + _asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0 // For VS2010 and earlier. 
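The rewritten CpuId above takes an explicit leaf (info_eax) and sub-leaf (info_ecx), so leaf 7 (structured extended features, which reports AVX2 and ERMS) can be queried correctly; the old __cpuid wrapper could not set ECX. A hedged sketch using GCC/Clang's <cpuid.h> helper instead of raw inline assembly; this is an illustration, not the patched libyuv CpuId():

#include <cstdint>
#include <cpuid.h>  // GCC/Clang helper; MSVC would use __cpuidex from <intrin.h>.

// Query one cpuid leaf/sub-leaf into {eax, ebx, ecx, edx}. Returns false if
// the leaf is not supported by the CPU.
static bool QueryCpuid(unsigned leaf, unsigned subleaf, uint32_t cpu_info[4]) {
  unsigned a, b, c, d;
  if (!__get_cpuid_count(leaf, subleaf, &a, &b, &c, &d)) return false;
  cpu_info[0] = a; cpu_info[1] = b; cpu_info[2] = c; cpu_info[3] = d;
  return true;
}

For example, AVX2 is reported in leaf 7, sub-leaf 0, EBX bit 5 (0x20), which matches the (cpu_info7[1] & 0x00000020) test added in InitCpuFlags below.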
+ mov xcr0, eax } -} #elif defined(__i386__) || defined(__x86_64__) -#define HAS_XGETBV -static uint32 XGetBV(unsigned int xcr) { - uint32 xcr_feature_mask; - asm volatile ( // NOLINT - ".byte 0x0f, 0x01, 0xd0\n" - : "=a"(xcr_feature_mask) - : "c"(xcr) - : "memory", "cc", "edx"); // edx unused. - return xcr_feature_mask; + asm(".byte 0x0f, 0x01, 0xd0" : "=a" (xcr0) : "c" (0) : "%edx"); +#endif // defined(_MSC_VER) + return((xcr0 & 6) == 6); // Is ymm saved? +} +#endif // !defined(__native_client__) +#else +LIBYUV_API +void CpuId(uint32 eax, uint32 ecx, uint32* cpu_info) { + cpu_info[0] = cpu_info[1] = cpu_info[2] = cpu_info[3] = 0; } -#endif -#ifdef HAS_XGETBV -static const int kXCR_XFEATURE_ENABLED_MASK = 0; #endif // based on libvpx arm_cpudetect.c // For Arm, but public to allow testing on any CPU -LIBYUV_API +LIBYUV_API SAFEBUFFERS int ArmCpuCaps(const char* cpuinfo_name) { FILE* f = fopen(cpuinfo_name, "r"); if (f) { - char buf[512]; - while (fgets(buf, 511, f)) { - if (memcmp(buf, "Features", 8) == 0) { - char* p = strstr(buf, " neon"); + char cpuinfo_line[512]; + while (fgets(cpuinfo_line, sizeof(cpuinfo_line) - 1, f)) { + if (memcmp(cpuinfo_line, "Features", 8) == 0) { + char* p = strstr(cpuinfo_line, " neon"); if (p && (p[5] == ' ' || p[5] == '\n')) { fclose(f); return kCpuHasNEON; @@ -129,7 +141,7 @@ static int MipsCpuCaps(const char* search_string) { char cpuinfo_line[256]; FILE* f = NULL; if ((f = fopen(file_name, "r")) != NULL) { - while (fgets(cpuinfo_line, sizeof(cpuinfo_line), f) != NULL) { + while (fgets(cpuinfo_line, sizeof(cpuinfo_line) - 1, f) != NULL) { if (strstr(cpuinfo_line, search_string) != NULL) { fclose(f); return kCpuHasMIPS_DSP; @@ -148,6 +160,8 @@ int cpu_info_ = kCpuInit; // cpu_info is not initialized yet. // Test environment variable for disabling CPU features. Any non-zero value // to disable. Zero ignored to make it easy to set the variable on/off. +#if !defined(__native_client__) && !defined(_M_ARM) + static bool TestEnv(const char* name) { const char* var = getenv(name); if (var) { @@ -157,23 +171,29 @@ static bool TestEnv(const char* name) { } return false; } +#else // nacl does not support getenv(). +static bool TestEnv(const char*) { + return false; +} +#endif -LIBYUV_API +LIBYUV_API SAFEBUFFERS int InitCpuFlags(void) { #if !defined(__CLR_VER) && defined(CPU_X86) - int cpu_info1[4] = { 0, 0, 0, 0 }; - int cpu_info7[4] = { 0, 0, 0, 0 }; - __cpuid(cpu_info1, 1); - __cpuid(cpu_info7, 7); + uint32 cpu_info1[4] = { 0, 0, 0, 0 }; + uint32 cpu_info7[4] = { 0, 0, 0, 0 }; + CpuId(1, 0, cpu_info1); + CpuId(7, 0, cpu_info7); cpu_info_ = ((cpu_info1[3] & 0x04000000) ? kCpuHasSSE2 : 0) | ((cpu_info1[2] & 0x00000200) ? kCpuHasSSSE3 : 0) | ((cpu_info1[2] & 0x00080000) ? kCpuHasSSE41 : 0) | ((cpu_info1[2] & 0x00100000) ? kCpuHasSSE42 : 0) | ((cpu_info7[1] & 0x00000200) ? kCpuHasERMS : 0) | + ((cpu_info1[2] & 0x00001000) ? kCpuHasFMA3 : 0) | kCpuHasX86; #ifdef HAS_XGETBV if ((cpu_info1[2] & 0x18000000) == 0x18000000 && // AVX and OSSave - (XGetBV(kXCR_XFEATURE_ENABLED_MASK) & 0x06) == 0x06) { // Saves YMM. + TestOsSaveYmm()) { // Saves YMM. cpu_info_ |= ((cpu_info7[1] & 0x00000020) ? kCpuHasAVX2 : 0) | kCpuHasAVX; } @@ -203,6 +223,9 @@ int InitCpuFlags(void) { if (TestEnv("LIBYUV_DISABLE_ERMS")) { cpu_info_ &= ~kCpuHasERMS; } + if (TestEnv("LIBYUV_DISABLE_FMA3")) { + cpu_info_ &= ~kCpuHasFMA3; + } #elif defined(__mips__) && defined(__linux__) // Linux mips parse text file for dsp detect. cpu_info_ = MipsCpuCaps("dsp"); // set kCpuHasMIPS_DSP. 
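TestOsSaveYmm above replaces the old XGetBV helpers: AVX/AVX2 paths are enabled only when CPUID leaf 1 reports both OSXSAVE and AVX (ECX bits 27 and 28) and XGETBV(0) shows the OS saving XMM and YMM state (XCR0 bits 1 and 2). A small x86-only sketch of that decision, mirroring the byte-encoded xgetbv the patch uses for GCC/Clang:

#include <cstdint>

// Read XCR0 via the raw xgetbv encoding (x86 GCC/Clang inline asm only).
static uint32_t ReadXcr0() {
  uint32_t xcr0;
  asm(".byte 0x0f, 0x01, 0xd0" : "=a"(xcr0) : "c"(0) : "edx");
  return xcr0;
}

// leaf1_ecx is ECX from CPUID leaf 1.
static bool OsAndCpuSupportAvx(uint32_t leaf1_ecx) {
  const uint32_t kAvxAndOsxsave = 0x18000000u;  // bits 27 (OSXSAVE) + 28 (AVX)
  return (leaf1_ecx & kAvxAndOsxsave) == kAvxAndOsxsave &&
         (ReadXcr0() & 6u) == 6u;               // XMM + YMM state saved by OS
}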
@@ -221,10 +244,11 @@ int InitCpuFlags(void) { cpu_info_ &= ~kCpuHasMIPS_DSPR2; } #elif defined(__arm__) -#if defined(__linux__) && (defined(__ARM_NEON__) || defined(LIBYUV_NEON)) +#if defined(__linux__) && (defined(__ARM_NEON__) || defined(LIBYUV_NEON)) && \ + !defined(__native_client__) // Linux arm parse text file for neon detect. cpu_info_ = ArmCpuCaps("/proc/cpuinfo"); -#elif defined(__ARM_NEON__) +#elif defined(__ARM_NEON__) || defined(__native_client__) // gcc -mfpu=neon defines __ARM_NEON__ // Enable Neon if you want support for Neon and Arm, and use MaskCpuFlags // to disable Neon on devices that do not have it. diff --git a/chromium/third_party/libyuv/source/format_conversion.cc b/chromium/third_party/libyuv/source/format_conversion.cc index 5b931b58773..cf7d6ea3af9 100644 --- a/chromium/third_party/libyuv/source/format_conversion.cc +++ b/chromium/third_party/libyuv/source/format_conversion.cc @@ -32,7 +32,7 @@ static int MakeSelectors(const int blue_index, const int green_index, const int red_index, uint32 dst_fourcc_bayer, - uint32 *index_map) { + uint32* index_map) { // Now build a lookup table containing the indices for the four pixels in each // 2x2 Bayer grid. switch (dst_fourcc_bayer) { @@ -280,7 +280,7 @@ int BayerToARGB(const uint8* src_bayer, int src_stride_bayer, } // Converts any Bayer RGB format to ARGB. -LIBYUV_API +LIBYUV_API SAFEBUFFERS int BayerToI420(const uint8* src_bayer, int src_stride_bayer, uint8* dst_y, int dst_stride_y, uint8* dst_u, int dst_stride_u, @@ -310,7 +310,7 @@ int BayerToI420(const uint8* src_bayer, int src_stride_bayer, uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = ARGBToYRow_C; -#if defined(HAS_ARGBTOYROW_SSSE3) +#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { ARGBToUVRow = ARGBToUVRow_Any_SSSE3; ARGBToYRow = ARGBToYRow_Any_SSSE3; @@ -380,7 +380,7 @@ int BayerToI420(const uint8* src_bayer, int src_stride_bayer, } // Convert I420 to Bayer. -LIBYUV_API +LIBYUV_API SAFEBUFFERS int I420ToBayer(const uint8* src_y, int src_stride_y, const uint8* src_u, int src_stride_u, const uint8* src_v, int src_stride_v, diff --git a/chromium/third_party/libyuv/source/mjpeg_decoder.cc b/chromium/third_party/libyuv/source/mjpeg_decoder.cc index 5d7296d7e73..bd423200531 100644 --- a/chromium/third_party/libyuv/source/mjpeg_decoder.cc +++ b/chromium/third_party/libyuv/source/mjpeg_decoder.cc @@ -420,9 +420,12 @@ void MJpegDecoder::ErrorHandler(j_common_ptr cinfo) { // recover from errors we use setjmp() as shown in their example. setjmp() is // C's implementation for the "call with current continuation" functionality // seen in some functional programming languages. + // A formatted message can be output, but is unsafe for release. 
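The NEON path above still detects the feature by parsing /proc/cpuinfo on Linux (now skipped under Native Client). A minimal sketch of that kind of scan, using the bounded fgets the patch also adopts in ArmCpuCaps and MipsCpuCaps; the function name is ours:

#include <cstdio>
#include <cstring>

// Return true if a "Features" line in cpuinfo_name mentions the word "neon".
static bool CpuinfoHasNeon(const char* cpuinfo_name) {
  FILE* f = fopen(cpuinfo_name, "r");
  if (!f) return false;
  char line[512];
  bool found = false;
  // sizeof(line) - 1 mirrors the patch's defensive bound; fgets itself
  // already NUL-terminates within the given size.
  while (fgets(line, sizeof(line) - 1, f)) {
    if (memcmp(line, "Features", 8) == 0) {
      const char* p = strstr(line, " neon");
      if (p && (p[5] == ' ' || p[5] == '\n')) { found = true; break; }
    }
  }
  fclose(f);
  return found;
}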
+#ifdef DEBUG char buf[JMSG_LENGTH_MAX]; (*cinfo->err->format_message)(cinfo, buf); // ERROR: Error in jpeglib: buf +#endif SetJmpErrorMgr* mgr = reinterpret_cast<SetJmpErrorMgr*>(cinfo->err); // This rewinds the call stack to the point of the corresponding setjmp() diff --git a/chromium/third_party/libyuv/source/planar_functions.cc b/chromium/third_party/libyuv/source/planar_functions.cc index 2f70331327c..114faaef627 100644 --- a/chromium/third_party/libyuv/source/planar_functions.cc +++ b/chromium/third_party/libyuv/source/planar_functions.cc @@ -28,13 +28,12 @@ LIBYUV_API void CopyPlane(const uint8* src_y, int src_stride_y, uint8* dst_y, int dst_stride_y, int width, int height) { - // Coalesce contiguous rows. + // Coalesce rows. if (src_stride_y == width && dst_stride_y == width) { - CopyPlane(src_y, 0, - dst_y, 0, - width * height, 1); - return; + width *= height; + height = 1; + src_stride_y = dst_stride_y = 0; } void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C; #if defined(HAS_COPYROW_X86) @@ -173,10 +172,16 @@ int I420ToI400(const uint8* src_y, int src_stride_y, return 0; } -// Mirror a plane of data +// Mirror a plane of data. void MirrorPlane(const uint8* src_y, int src_stride_y, uint8* dst_y, int dst_stride_y, int width, int height) { + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_stride_y = -src_stride_y; + } void (*MirrorRow)(const uint8* src, uint8* dst, int width) = MirrorRow_C; #if defined(HAS_MIRRORROW_NEON) if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) { @@ -222,16 +227,14 @@ int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2, src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2; src_stride_yuy2 = -src_stride_yuy2; } - // Coalesce contiguous rows. + // Coalesce rows. if (src_stride_yuy2 == width * 2 && dst_stride_y == width && dst_stride_u * 2 == width && dst_stride_v * 2 == width) { - return YUY2ToI422(src_yuy2, 0, - dst_y, 0, - dst_u, 0, - dst_v, 0, - width * height, 1); + width *= height; + height = 1; + src_stride_yuy2 = dst_stride_y = dst_stride_u = dst_stride_v = 0; } void (*YUY2ToUV422Row)(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v, int pix); @@ -302,16 +305,14 @@ int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy, src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy; src_stride_uyvy = -src_stride_uyvy; } - // Coalesce contiguous rows. + // Coalesce rows. if (src_stride_uyvy == width * 2 && dst_stride_y == width && dst_stride_u * 2 == width && dst_stride_v * 2 == width) { - return UYVYToI422(src_uyvy, 0, - dst_y, 0, - dst_u, 0, - dst_v, 0, - width * height, 1); + width *= height; + height = 1; + src_stride_uyvy = dst_stride_y = dst_stride_u = dst_stride_v = 0; } void (*UYVYToUV422Row)(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v, int pix); @@ -509,14 +510,13 @@ int ARGBBlend(const uint8* src_argb0, int src_stride_argb0, dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } - // Coalesce contiguous rows. + // Coalesce rows. 
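MirrorPlane above gains the negative-height convention the other planar routines already use: a negative height means the image is processed bottom-up, implemented by pointing at the last row and negating the stride. A small sketch of that normalization step, with standalone names:

#include <cstdint>

// Normalize a (pointer, stride, height) triple so the processing loop can
// always walk top-to-bottom. A negative height means "invert vertically".
static void NormalizeVerticalFlip(const uint8_t*& src, int& stride, int& height) {
  if (height < 0) {
    height = -height;
    src = src + (height - 1) * stride;  // start at the last row...
    stride = -stride;                   // ...and step backwards.
  }
}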
if (src_stride_argb0 == width * 4 && src_stride_argb1 == width * 4 && dst_stride_argb == width * 4) { - return ARGBBlend(src_argb0, 0, - src_argb1, 0, - dst_argb, 0, - width * height, 1); + width *= height; + height = 1; + src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0; } void (*ARGBBlendRow)(const uint8* src_argb, const uint8* src_argb1, uint8* dst_argb, int width) = GetARGBBlend(); @@ -545,16 +545,14 @@ int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0, dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } - // Coalesce contiguous rows. + // Coalesce rows. if (src_stride_argb0 == width * 4 && src_stride_argb1 == width * 4 && dst_stride_argb == width * 4) { - return ARGBMultiply(src_argb0, 0, - src_argb1, 0, - dst_argb, 0, - width * height, 1); + width *= height; + height = 1; + src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0; } - void (*ARGBMultiplyRow)(const uint8* src0, const uint8* src1, uint8* dst, int width) = ARGBMultiplyRow_C; #if defined(HAS_ARGBMULTIPLYROW_SSE2) @@ -607,16 +605,14 @@ int ARGBAdd(const uint8* src_argb0, int src_stride_argb0, dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } - // Coalesce contiguous rows. + // Coalesce rows. if (src_stride_argb0 == width * 4 && src_stride_argb1 == width * 4 && dst_stride_argb == width * 4) { - return ARGBAdd(src_argb0, 0, - src_argb1, 0, - dst_argb, 0, - width * height, 1); + width *= height; + height = 1; + src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0; } - void (*ARGBAddRow)(const uint8* src0, const uint8* src1, uint8* dst, int width) = ARGBAddRow_C; #if defined(HAS_ARGBADDROW_SSE2) && defined(_MSC_VER) @@ -674,16 +670,14 @@ int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0, dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } - // Coalesce contiguous rows. + // Coalesce rows. if (src_stride_argb0 == width * 4 && src_stride_argb1 == width * 4 && dst_stride_argb == width * 4) { - return ARGBSubtract(src_argb0, 0, - src_argb1, 0, - dst_argb, 0, - width * height, 1); + width *= height; + height = 1; + src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0; } - void (*ARGBSubtractRow)(const uint8* src0, const uint8* src1, uint8* dst, int width) = ARGBSubtractRow_C; #if defined(HAS_ARGBSUBTRACTROW_SSE2) @@ -739,16 +733,14 @@ int I422ToBGRA(const uint8* src_y, int src_stride_y, dst_bgra = dst_bgra + (height - 1) * dst_stride_bgra; dst_stride_bgra = -dst_stride_bgra; } - // Coalesce contiguous rows. + // Coalesce rows. if (src_stride_y == width && src_stride_u * 2 == width && src_stride_v * 2 == width && dst_stride_bgra == width * 4) { - return I422ToBGRA(src_y, 0, - src_u, 0, - src_v, 0, - dst_bgra, 0, - width * height, 1); + width *= height; + height = 1; + src_stride_y = src_stride_u = src_stride_v = dst_stride_bgra = 0; } void (*I422ToBGRARow)(const uint8* y_buf, const uint8* u_buf, @@ -810,16 +802,14 @@ int I422ToABGR(const uint8* src_y, int src_stride_y, dst_abgr = dst_abgr + (height - 1) * dst_stride_abgr; dst_stride_abgr = -dst_stride_abgr; } - // Coalesce contiguous rows. + // Coalesce rows. 
if (src_stride_y == width && src_stride_u * 2 == width && src_stride_v * 2 == width && dst_stride_abgr == width * 4) { - return I422ToABGR(src_y, 0, - src_u, 0, - src_v, 0, - dst_abgr, 0, - width * height, 1); + width *= height; + height = 1; + src_stride_y = src_stride_u = src_stride_v = dst_stride_abgr = 0; } void (*I422ToABGRRow)(const uint8* y_buf, const uint8* u_buf, @@ -873,16 +863,14 @@ int I422ToRGBA(const uint8* src_y, int src_stride_y, dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba; dst_stride_rgba = -dst_stride_rgba; } - // Coalesce contiguous rows. + // Coalesce rows. if (src_stride_y == width && src_stride_u * 2 == width && src_stride_v * 2 == width && dst_stride_rgba == width * 4) { - return I422ToRGBA(src_y, 0, - src_u, 0, - src_v, 0, - dst_rgba, 0, - width * height, 1); + width *= height; + height = 1; + src_stride_y = src_stride_u = src_stride_v = dst_stride_rgba = 0; } void (*I422ToRGBARow)(const uint8* y_buf, const uint8* u_buf, @@ -1016,12 +1004,11 @@ LIBYUV_API void SetPlane(uint8* dst_y, int dst_stride_y, int width, int height, uint32 value) { - // Coalesce contiguous rows. + // Coalesce rows. if (dst_stride_y == width) { - SetPlane(dst_y, 0, - width * height, 1, - value); - return; + width *= height; + height = 1; + dst_stride_y = 0; } void (*SetRow)(uint8* dst, uint32 value, int pix) = SetRow_C; #if defined(HAS_SETROW_NEON) @@ -1084,27 +1071,27 @@ int ARGBRect(uint8* dst_argb, int dst_stride_argb, dst_x < 0 || dst_y < 0) { return -1; } - // Coalesce contiguous rows. + dst_argb += dst_y * dst_stride_argb + dst_x * 4; + // Coalesce rows. if (dst_stride_argb == width * 4) { - return ARGBRect(dst_argb, dst_stride_argb, - dst_x, dst_y, - width * height, 1, value); + width *= height; + height = 1; + dst_stride_argb = 0; } - uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; #if defined(HAS_SETROW_NEON) if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16) && - IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride_argb, 16)) { - ARGBSetRows_NEON(dst, value, width, dst_stride_argb, height); + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + ARGBSetRows_NEON(dst_argb, value, width, dst_stride_argb, height); return 0; } #endif #if defined(HAS_SETROW_X86) if (TestCpuFlag(kCpuHasX86)) { - ARGBSetRows_X86(dst, value, width, dst_stride_argb, height); + ARGBSetRows_X86(dst_argb, value, width, dst_stride_argb, height); return 0; } #endif - ARGBSetRows_C(dst, value, width, dst_stride_argb, height); + ARGBSetRows_C(dst_argb, value, width, dst_stride_argb, height); return 0; } @@ -1133,12 +1120,12 @@ int ARGBAttenuate(const uint8* src_argb, int src_stride_argb, src_argb = src_argb + (height - 1) * src_stride_argb; src_stride_argb = -src_stride_argb; } - // Coalesce contiguous rows. + // Coalesce rows. 
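In the ARGBRect hunk above, the destination pointer is now offset by dst_x/dst_y before the coalescing test, so the "treat as one row" shortcut fills the requested sub-rectangle rather than starting at the plane origin. A byte-plane sketch of that ordering (the real function fills 4-byte ARGB pixels):

#include <cstdint>
#include <cstring>

// Fill a width x height sub-rectangle of a byte plane with `value`.
static void FillRect8(uint8_t* dst, int dst_stride,
                      int dst_x, int dst_y, int width, int height,
                      uint8_t value) {
  dst += dst_y * dst_stride + dst_x;  // move to the rectangle first,
  if (dst_stride == width) {          // only then consider coalescing rows.
    width *= height;
    height = 1;
    dst_stride = 0;
  }
  for (int y = 0; y < height; ++y) {
    memset(dst, value, width);
    dst += dst_stride;
  }
}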
if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) { - return ARGBAttenuate(src_argb, 0, - dst_argb, 0, - width * height, 1); + width *= height; + height = 1; + src_stride_argb = dst_stride_argb = 0; } void (*ARGBAttenuateRow)(const uint8* src_argb, uint8* dst_argb, int width) = ARGBAttenuateRow_C; @@ -1153,9 +1140,7 @@ int ARGBAttenuate(const uint8* src_argb, int src_stride_argb, } #endif #if defined(HAS_ARGBATTENUATEROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && width >= 4 && - IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + if (TestCpuFlag(kCpuHasSSSE3) && width >= 4) { ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3; if (IS_ALIGNED(width, 4)) { ARGBAttenuateRow = ARGBAttenuateRow_SSSE3; @@ -1200,19 +1185,17 @@ int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb, src_argb = src_argb + (height - 1) * src_stride_argb; src_stride_argb = -src_stride_argb; } - // Coalesce contiguous rows. + // Coalesce rows. if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) { - return ARGBUnattenuate(src_argb, 0, - dst_argb, 0, - width * height, 1); + width *= height; + height = 1; + src_stride_argb = dst_stride_argb = 0; } void (*ARGBUnattenuateRow)(const uint8* src_argb, uint8* dst_argb, int width) = ARGBUnattenuateRow_C; #if defined(HAS_ARGBUNATTENUATEROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && width >= 4 && - IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + if (TestCpuFlag(kCpuHasSSE2) && width >= 4) { ARGBUnattenuateRow = ARGBUnattenuateRow_Any_SSE2; if (IS_ALIGNED(width, 4)) { ARGBUnattenuateRow = ARGBUnattenuateRow_SSE2; @@ -1250,12 +1233,12 @@ int ARGBGrayTo(const uint8* src_argb, int src_stride_argb, src_argb = src_argb + (height - 1) * src_stride_argb; src_stride_argb = -src_stride_argb; } - // Coalesce contiguous rows. + // Coalesce rows. if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) { - return ARGBGrayTo(src_argb, 0, - dst_argb, 0, - width * height, 1); + width *= height; + height = 1; + src_stride_argb = dst_stride_argb = 0; } void (*ARGBGrayRow)(const uint8* src_argb, uint8* dst_argb, int width) = ARGBGrayRow_C; @@ -1287,11 +1270,11 @@ int ARGBGray(uint8* dst_argb, int dst_stride_argb, if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0) { return -1; } - // Coalesce contiguous rows. + // Coalesce rows. if (dst_stride_argb == width * 4) { - return ARGBGray(dst_argb, dst_stride_argb, - dst_x, dst_y, - width * height, 1); + width *= height; + height = 1; + dst_stride_argb = 0; } void (*ARGBGrayRow)(const uint8* src_argb, uint8* dst_argb, int width) = ARGBGrayRow_C; @@ -1320,11 +1303,11 @@ int ARGBSepia(uint8* dst_argb, int dst_stride_argb, if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0) { return -1; } - // Coalesce contiguous rows. + // Coalesce rows. if (dst_stride_argb == width * 4) { - return ARGBSepia(dst_argb, dst_stride_argb, - dst_x, dst_y, - width * height, 1); + width *= height; + height = 1; + dst_stride_argb = 0; } void (*ARGBSepiaRow)(uint8* dst_argb, int width) = ARGBSepiaRow_C; #if defined(HAS_ARGBSEPIAROW_SSSE3) @@ -1345,24 +1328,30 @@ int ARGBSepia(uint8* dst_argb, int dst_stride_argb, return 0; } -// Apply a 4x3 matrix rotation to each ARGB pixel. +// Apply a 4x4 matrix to each ARGB pixel. +// Note: Normally for shading, but can be used to swizzle or invert. 
LIBYUV_API -int ARGBColorMatrix(uint8* dst_argb, int dst_stride_argb, +int ARGBColorMatrix(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, const int8* matrix_argb, - int dst_x, int dst_y, int width, int height) { - if (!dst_argb || !matrix_argb || width <= 0 || height <= 0 || - dst_x < 0 || dst_y < 0) { + int width, int height) { + if (!src_argb || !dst_argb || !matrix_argb || width <= 0 || height == 0) { return -1; } - // Coalesce contiguous rows. - if (dst_stride_argb == width * 4) { - return ARGBColorMatrix(dst_argb, dst_stride_argb, - matrix_argb, - dst_x, dst_y, - width * height, 1); + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_argb = dst_stride_argb = 0; } - void (*ARGBColorMatrixRow)(uint8* dst_argb, const int8* matrix_argb, - int width) = ARGBColorMatrixRow_C; + void (*ARGBColorMatrixRow)(const uint8* src_argb, uint8* dst_argb, + const int8* matrix_argb, int width) = ARGBColorMatrixRow_C; #if defined(HAS_ARGBCOLORMATRIXROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8) && IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { @@ -1373,14 +1362,48 @@ int ARGBColorMatrix(uint8* dst_argb, int dst_stride_argb, ARGBColorMatrixRow = ARGBColorMatrixRow_NEON; } #endif - uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; for (int y = 0; y < height; ++y) { - ARGBColorMatrixRow(dst, matrix_argb, width); - dst += dst_stride_argb; + ARGBColorMatrixRow(src_argb, dst_argb, matrix_argb, width); + src_argb += src_stride_argb; + dst_argb += dst_stride_argb; } return 0; } +// Apply a 4x3 matrix to each ARGB pixel. +// Deprecated. +LIBYUV_API SAFEBUFFERS +int RGBColorMatrix(uint8* dst_argb, int dst_stride_argb, + const int8* matrix_rgb, + int dst_x, int dst_y, int width, int height) { + if (!dst_argb || !matrix_rgb || width <= 0 || height <= 0 || + dst_x < 0 || dst_y < 0) { + return -1; + } + + // Convert 4x3 7 bit matrix to 4x4 6 bit matrix. + SIMD_ALIGNED(int8 matrix_argb[16]); + matrix_argb[0] = matrix_rgb[0] / 2; + matrix_argb[1] = matrix_rgb[1] / 2; + matrix_argb[2] = matrix_rgb[2] / 2; + matrix_argb[3] = matrix_rgb[3] / 2; + matrix_argb[4] = matrix_rgb[4] / 2; + matrix_argb[5] = matrix_rgb[5] / 2; + matrix_argb[6] = matrix_rgb[6] / 2; + matrix_argb[7] = matrix_rgb[7] / 2; + matrix_argb[8] = matrix_rgb[8] / 2; + matrix_argb[9] = matrix_rgb[9] / 2; + matrix_argb[10] = matrix_rgb[10] / 2; + matrix_argb[11] = matrix_rgb[11] / 2; + matrix_argb[14] = matrix_argb[13] = matrix_argb[12] = 0; + matrix_argb[15] = 64; // 1.0 + + uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; + return ARGBColorMatrix(const_cast<const uint8*>(dst), dst_stride_argb, + dst, dst_stride_argb, + &matrix_argb[0], width, height); +} + // Apply a color table each ARGB pixel. // Table contains 256 ARGB values. LIBYUV_API @@ -1391,12 +1414,11 @@ int ARGBColorTable(uint8* dst_argb, int dst_stride_argb, dst_x < 0 || dst_y < 0) { return -1; } - // Coalesce contiguous rows. + // Coalesce rows. 
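ARGBColorMatrix above now takes separate source and destination pointers and a 4x4 matrix of signed 6-bit fixed-point coefficients; the deprecated RGBColorMatrix wrapper converts the old 4x3, 7-bit matrix by halving each coefficient and appending an identity alpha row (64 representing 1.0). A one-pixel reference of how such a matrix would be applied under that convention; the channel ordering and rounding are our reading, not the exact libyuv kernel:

#include <cstdint>
#include <algorithm>

// Apply a 4x4 fixed-point color matrix (64 == 1.0, so results scale back
// with >> 6) to one 4-byte pixel, clamping each channel to 0..255.
static void ColorMatrixPixel(const uint8_t src[4], uint8_t dst[4],
                             const int8_t matrix[16]) {
  for (int out = 0; out < 4; ++out) {
    int sum = 0;
    for (int in = 0; in < 4; ++in) {
      sum += matrix[out * 4 + in] * src[in];
    }
    dst[out] = static_cast<uint8_t>(std::min(255, std::max(0, sum >> 6)));
  }
}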
if (dst_stride_argb == width * 4) { - return ARGBColorTable(dst_argb, dst_stride_argb, - table_argb, - dst_x, dst_y, - width * height, 1); + width *= height; + height = 1; + dst_stride_argb = 0; } void (*ARGBColorTableRow)(uint8* dst_argb, const uint8* table_argb, int width) = ARGBColorTableRow_C; @@ -1413,6 +1435,37 @@ int ARGBColorTable(uint8* dst_argb, int dst_stride_argb, return 0; } +// Apply a color table each ARGB pixel but preserve destination alpha. +// Table contains 256 ARGB values. +LIBYUV_API +int RGBColorTable(uint8* dst_argb, int dst_stride_argb, + const uint8* table_argb, + int dst_x, int dst_y, int width, int height) { + if (!dst_argb || !table_argb || width <= 0 || height <= 0 || + dst_x < 0 || dst_y < 0) { + return -1; + } + // Coalesce rows. + if (dst_stride_argb == width * 4) { + width *= height; + height = 1; + dst_stride_argb = 0; + } + void (*RGBColorTableRow)(uint8* dst_argb, const uint8* table_argb, + int width) = RGBColorTableRow_C; +#if defined(HAS_RGBCOLORTABLEROW_X86) + if (TestCpuFlag(kCpuHasX86)) { + RGBColorTableRow = RGBColorTableRow_X86; + } +#endif + uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; + for (int y = 0; y < height; ++y) { + RGBColorTableRow(dst, table_argb, width); + dst += dst_stride_argb; + } + return 0; +} + // ARGBQuantize is used to posterize art. // e.g. rgb / qvalue * qvalue + qvalue / 2 // But the low levels implement efficiently with 3 parameters, and could be @@ -1430,12 +1483,11 @@ int ARGBQuantize(uint8* dst_argb, int dst_stride_argb, interval_size < 1 || interval_size > 255) { return -1; } - // Coalesce contiguous rows. + // Coalesce rows. if (dst_stride_argb == width * 4) { - return ARGBQuantize(dst_argb, dst_stride_argb, - scale, interval_size, interval_offset, - dst_x, dst_y, - width * height, 1); + width *= height; + height = 1; + dst_stride_argb = 0; } void (*ARGBQuantizeRow)(uint8* dst_argb, int scale, int interval_size, int interval_offset, int width) = ARGBQuantizeRow_C; @@ -1496,14 +1548,28 @@ int ARGBBlur(const uint8* src_argb, int src_stride_argb, if (!src_argb || !dst_argb || width <= 0 || height == 0) { return -1; } - void (*ComputeCumulativeSumRow)(const uint8* row, int32* cumsum, + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + if (radius > height) { + radius = height; + } + if (radius > (width / 2 - 1)) { + radius = width / 2 - 1; + } + if (radius <= 0) { + return -1; + } + void (*ComputeCumulativeSumRow)(const uint8 *row, int32 *cumsum, const int32* previous_cumsum, int width) = ComputeCumulativeSumRow_C; - void (*CUMULATIVESUMTOAVERAGEROW)(const int32* topleft, const int32* botleft, + void (*CumulativeSumToAverageRow)(const int32* topleft, const int32* botleft, int width, int area, uint8* dst, int count) = CumulativeSumToAverageRow_C; #if defined(HAS_CUMULATIVESUMTOAVERAGEROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { ComputeCumulativeSumRow = ComputeCumulativeSumRow_SSE2; - CUMULATIVESUMTOAVERAGEROW = CumulativeSumToAverageRow_SSE2; + CumulativeSumToAverageRow = CumulativeSumToAverageRow_SSE2; } #endif // Compute enough CumulativeSum for first row to be blurred. 
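The ARGBQuantize hunk above keeps the existing posterize math, rgb / q * q + q / 2, expressed through the scale, interval_size and interval_offset parameters. A tiny worked form of the simple version (the parameterized row kernel is more general than this):

#include <cstdint>

// Posterize one channel value to q-sized levels: v / q * q + q / 2.
// e.g. q = 64 maps 0..255 onto {32, 96, 160, 224}.
static uint8_t Posterize(uint8_t v, int q) {
  return static_cast<uint8_t>(v / q * q + q / 2);
}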
After this @@ -1548,24 +1614,24 @@ int ARGBBlur(const uint8* src_argb, int src_stride_argb, int boxwidth = radius * 4; int x; for (x = 0; x < radius + 1; ++x) { - CUMULATIVESUMTOAVERAGEROW(cumsum_top_row, cumsum_bot_row, - boxwidth, area, &dst_argb[x * 4], 1); + CumulativeSumToAverageRow(cumsum_top_row, cumsum_bot_row, + boxwidth, area, &dst_argb[x * 4], 1); area += (bot_y - top_y); boxwidth += 4; } // Middle unclipped. int n = (width - 1) - radius - x + 1; - CUMULATIVESUMTOAVERAGEROW(cumsum_top_row, cumsum_bot_row, - boxwidth, area, &dst_argb[x * 4], n); + CumulativeSumToAverageRow(cumsum_top_row, cumsum_bot_row, + boxwidth, area, &dst_argb[x * 4], n); // Right clipped. for (x += n; x <= width - 1; ++x) { area -= (bot_y - top_y); boxwidth -= 4; - CUMULATIVESUMTOAVERAGEROW(cumsum_top_row + (x - radius - 1) * 4, - cumsum_bot_row + (x - radius - 1) * 4, - boxwidth, area, &dst_argb[x * 4], 1); + CumulativeSumToAverageRow(cumsum_top_row + (x - radius - 1) * 4, + cumsum_bot_row + (x - radius - 1) * 4, + boxwidth, area, &dst_argb[x * 4], 1); } dst_argb += dst_stride_argb; } @@ -1585,13 +1651,12 @@ int ARGBShade(const uint8* src_argb, int src_stride_argb, src_argb = src_argb + (height - 1) * src_stride_argb; src_stride_argb = -src_stride_argb; } - // Coalesce contiguous rows. + // Coalesce rows. if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) { - return ARGBShade(src_argb, 0, - dst_argb, 0, - width * height, 1, - value); + width *= height; + height = 1; + src_stride_argb = dst_stride_argb = 0; } void (*ARGBShadeRow)(const uint8* src_argb, uint8* dst_argb, int width, uint32 value) = ARGBShadeRow_C; @@ -1616,8 +1681,6 @@ int ARGBShade(const uint8* src_argb, int src_stride_argb, } // Interpolate 2 ARGB images by specified amount (0 to 255). -// TODO(fbarchard): Consider selecting a specialization for interpolation so -// row function doesn't need to check interpolation on each row. LIBYUV_API int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0, const uint8* src_argb1, int src_stride_argb1, @@ -1632,15 +1695,13 @@ int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0, dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } - // Coalesce contiguous rows. + // Coalesce rows. if (src_stride_argb0 == width * 4 && src_stride_argb1 == width * 4 && dst_stride_argb == width * 4) { - return ARGBInterpolate(src_argb0, 0, - src_argb1, 0, - dst_argb, 0, - width * height, 1, - interpolation); + width *= height; + height = 1; + src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0; } void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr, ptrdiff_t src_stride, int dst_width, @@ -1671,6 +1732,14 @@ int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0, } } #endif +#if defined(HAS_INTERPOLATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && width >= 8) { + InterpolateRow = InterpolateRow_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + InterpolateRow = InterpolateRow_AVX2; + } + } +#endif #if defined(HAS_INTERPOLATEROW_NEON) if (TestCpuFlag(kCpuHasNEON) && width >= 4) { InterpolateRow = InterpolateRow_Any_NEON; @@ -1713,16 +1782,23 @@ int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra, src_bgra = src_bgra + (height - 1) * src_stride_bgra; src_stride_bgra = -src_stride_bgra; } - // Coalesce contiguous rows. + // Coalesce rows. 
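ARGBInterpolate above gains an AVX2 row path; the operation itself is a per-byte blend of two images by a fraction in 0..255. A scalar reference of that blend; the rounding is illustrative and the SIMD kernels may differ in the low bit:

#include <cstdint>

// Blend src0 and src1 into dst: fraction 0 returns src0, 255 is (nearly) src1.
static void InterpolateRowScalar(uint8_t* dst, const uint8_t* src0,
                                 const uint8_t* src1, int count,
                                 int fraction /* 0..255 */) {
  for (int i = 0; i < count; ++i) {
    dst[i] = static_cast<uint8_t>(
        (src0[i] * (256 - fraction) + src1[i] * fraction) >> 8);
  }
}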
if (src_stride_bgra == width * 4 && dst_stride_argb == width * 4) { - return ARGBShuffle(src_bgra, 0, - dst_argb, 0, - shuffler, - width * height, 1); + width *= height; + height = 1; + src_stride_bgra = dst_stride_argb = 0; } void (*ARGBShuffleRow)(const uint8* src_bgra, uint8* dst_argb, const uint8* shuffler, int pix) = ARGBShuffleRow_C; +#if defined(HAS_ARGBSHUFFLEROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && width >= 4) { + ARGBShuffleRow = ARGBShuffleRow_Any_SSE2; + if (IS_ALIGNED(width, 4)) { + ARGBShuffleRow = ARGBShuffleRow_SSE2; + } + } +#endif #if defined(HAS_ARGBSHUFFLEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { ARGBShuffleRow = ARGBShuffleRow_Any_SSSE3; @@ -1761,12 +1837,17 @@ int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra, } // Sobel ARGB effect. -LIBYUV_API -int ARGBSobel(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { +static SAFEBUFFERS +int ARGBSobelize(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int width, int height, + void (*SobelRow)(const uint8* src_sobelx, + const uint8* src_sobely, + uint8* dst, int width)) { + const int kMaxRow = kMaxStride / 4; + const int kEdge = 16; // Extra pixels at start of row for extrude/align. if (!src_argb || !dst_argb || - width <= 0 || height == 0 || width > (kMaxStride / 4)) { + width <= 0 || height == 0 || width > (kMaxRow - kEdge)) { return -1; } // Negative height means invert the image. @@ -1777,7 +1858,16 @@ int ARGBSobel(const uint8* src_argb, int src_stride_argb, } // ARGBToBayer used to select G channel from ARGB. void (*ARGBToBayerRow)(const uint8* src_argb, uint8* dst_bayer, - uint32 selector, int pix) = ARGBToBayerRow_C; + uint32 selector, int pix) = ARGBToBayerGGRow_C; +#if defined(HAS_ARGBTOBAYERGGROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && width >= 8 && + IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) { + ARGBToBayerRow = ARGBToBayerGGRow_Any_SSE2; + if (IS_ALIGNED(width, 8)) { + ARGBToBayerRow = ARGBToBayerGGRow_SSE2; + } + } +#endif #if defined(HAS_ARGBTOBAYERROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && width >= 8 && IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) { @@ -1786,19 +1876,20 @@ int ARGBSobel(const uint8* src_argb, int src_stride_argb, ARGBToBayerRow = ARGBToBayerRow_SSSE3; } } -#elif defined(HAS_ARGBTOBAYERROW_NEON) +#endif +#if defined(HAS_ARGBTOBAYERGGROW_NEON) if (TestCpuFlag(kCpuHasNEON) && width >= 8) { - ARGBToBayerRow = ARGBToBayerRow_Any_NEON; + ARGBToBayerRow = ARGBToBayerGGRow_Any_NEON; if (IS_ALIGNED(width, 8)) { - ARGBToBayerRow = ARGBToBayerRow_NEON; + ARGBToBayerRow = ARGBToBayerGGRow_NEON; } } #endif void (*SobelYRow)(const uint8* src_y0, const uint8* src_y1, uint8* dst_sobely, int width) = SobelYRow_C; -#if defined(HAS_SOBELYROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - SobelYRow = SobelYRow_SSSE3; +#if defined(HAS_SOBELYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + SobelYRow = SobelYRow_SSE2; } #endif #if defined(HAS_SOBELYROW_NEON) @@ -1809,9 +1900,9 @@ int ARGBSobel(const uint8* src_argb, int src_stride_argb, void (*SobelXRow)(const uint8* src_y0, const uint8* src_y1, const uint8* src_y2, uint8* dst_sobely, int width) = SobelXRow_C; -#if defined(HAS_SOBELXROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - SobelXRow = SobelXRow_SSSE3; +#if defined(HAS_SOBELXROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + SobelXRow = SobelXRow_SSE2; } #endif #if defined(HAS_SOBELXROW_NEON) @@ -1819,35 +1910,22 @@ int ARGBSobel(const uint8* src_argb, int 
src_stride_argb, SobelXRow = SobelXRow_NEON; } #endif - void (*SobelRow)(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_argb, int width) = SobelRow_C; -#if defined(HAS_SOBELROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { - SobelRow = SobelRow_SSE2; - } -#endif -#if defined(HAS_SOBELROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { - SobelRow = SobelRow_NEON; - } -#endif - - const int kEdge = 16; // Extra pixels at start of row for extrude/align. - SIMD_ALIGNED(uint8 row_y[(kMaxStride / 4 + kEdge) * 3 + kEdge]); - SIMD_ALIGNED(uint8 row_sobelx[kMaxStride / 4]); - SIMD_ALIGNED(uint8 row_sobely[kMaxStride / 4]); + // 3 rows with edges before/after. + SIMD_ALIGNED(uint8 row_y[kEdge + kMaxRow * 3]); + SIMD_ALIGNED(uint8 row_sobelx[kMaxRow]); + SIMD_ALIGNED(uint8 row_sobely[kMaxRow]); // Convert first row. uint8* row_y0 = row_y + kEdge; - uint8* row_y1 = row_y0 + kMaxStride / 4; - uint8* row_y2 = row_y1 + kMaxStride / 4; + uint8* row_y1 = row_y0 + kMaxRow; + uint8* row_y2 = row_y1 + kMaxRow; ARGBToBayerRow(src_argb, row_y0, 0x0d090501, width); row_y0[-1] = row_y0[0]; - row_y0[width] = row_y0[width - 1]; + memset(row_y0 + width, row_y0[width - 1], 16); // extrude 16 pixels. ARGBToBayerRow(src_argb, row_y1, 0x0d090501, width); row_y1[-1] = row_y1[0]; - row_y1[width] = row_y1[width - 1]; + memset(row_y1 + width, row_y1[width - 1], 16); + memset(row_y2 + width, 0, 16); for (int y = 0; y < height; ++y) { // Convert next row of ARGB to Y. @@ -1873,14 +1951,80 @@ int ARGBSobel(const uint8* src_argb, int src_stride_argb, return 0; } +// Sobel ARGB effect. +LIBYUV_API +int ARGBSobel(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + void (*SobelRow)(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_argb, int width) = SobelRow_C; +#if defined(HAS_SOBELROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + SobelRow = SobelRow_SSE2; + } +#endif +#if defined(HAS_SOBELROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { + SobelRow = SobelRow_NEON; + } +#endif + return ARGBSobelize(src_argb, src_stride_argb, dst_argb, dst_stride_argb, + width, height, SobelRow); +} + +// Sobel ARGB effect with planar output. +LIBYUV_API +int ARGBSobelToPlane(const uint8* src_argb, int src_stride_argb, + uint8* dst_y, int dst_stride_y, + int width, int height) { + void (*SobelToPlaneRow)(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_, int width) = SobelToPlaneRow_C; +#if defined(HAS_SOBELTOPLANEROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) && + IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + SobelToPlaneRow = SobelToPlaneRow_SSE2; + } +#endif +#if defined(HAS_SOBELTOPLANEROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) { + SobelToPlaneRow = SobelToPlaneRow_NEON; + } +#endif + return ARGBSobelize(src_argb, src_stride_argb, dst_y, dst_stride_y, + width, height, SobelToPlaneRow); +} + // SobelXY ARGB effect. // Similar to Sobel, but also stores Sobel X in R and Sobel Y in B. G = Sobel. 
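The Sobel routines above are refactored so ARGBSobel, the new ARGBSobelToPlane and ARGBSobelXY share one ARGBSobelize driver and differ only in the row function that combines the X and Y gradients. A scalar sketch of the combining step for the new planar output, as we read it (sum of the gradient magnitudes, clamped to 255); the real SobelToPlaneRow kernels may differ in detail:

#include <cstdint>
#include <algorithm>

// Combine per-pixel X and Y gradient magnitudes into one Sobel value.
static void SobelToPlaneRowScalar(const uint8_t* sobelx, const uint8_t* sobely,
                                  uint8_t* dst, int width) {
  for (int i = 0; i < width; ++i) {
    dst[i] = static_cast<uint8_t>(std::min(255, sobelx[i] + sobely[i]));
  }
}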
LIBYUV_API int ARGBSobelXY(const uint8* src_argb, int src_stride_argb, uint8* dst_argb, int dst_stride_argb, int width, int height) { - if (!src_argb || !dst_argb || - width <= 0 || height == 0 || width > kMaxStride / 4) { + void (*SobelXYRow)(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_argb, int width) = SobelXYRow_C; +#if defined(HAS_SOBELXYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + SobelXYRow = SobelXYRow_SSE2; + } +#endif +#if defined(HAS_SOBELXYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { + SobelXYRow = SobelXYRow_NEON; + } +#endif + return ARGBSobelize(src_argb, src_stride_argb, dst_argb, dst_stride_argb, + width, height, SobelXYRow); +} + +// Apply a 4x4 polynomial to each ARGB pixel. +LIBYUV_API +int ARGBPolynomial(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + const float* poly, + int width, int height) { + if (!src_argb || !dst_argb || !poly || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -1889,99 +2033,156 @@ int ARGBSobelXY(const uint8* src_argb, int src_stride_argb, src_argb = src_argb + (height - 1) * src_stride_argb; src_stride_argb = -src_stride_argb; } - // ARGBToBayer used to select G channel from ARGB. - void (*ARGBToBayerRow)(const uint8* src_argb, uint8* dst_bayer, - uint32 selector, int pix) = ARGBToBayerRow_C; -#if defined(HAS_ARGBTOBAYERROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && width >= 8 && - IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) { - ARGBToBayerRow = ARGBToBayerRow_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - ARGBToBayerRow = ARGBToBayerRow_SSSE3; - } + // Coalesce rows. + if (src_stride_argb == width * 4 && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_argb = dst_stride_argb = 0; } -#elif defined(HAS_ARGBTOBAYERROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && width >= 8) { - ARGBToBayerRow = ARGBToBayerRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGBToBayerRow = ARGBToBayerRow_NEON; - } + void (*ARGBPolynomialRow)(const uint8* src_argb, + uint8* dst_argb, const float* poly, + int width) = ARGBPolynomialRow_C; +#if defined(HAS_ARGBPOLYNOMIALROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 2)) { + ARGBPolynomialRow = ARGBPolynomialRow_SSE2; } #endif - void (*SobelYRow)(const uint8* src_y0, const uint8* src_y1, - uint8* dst_sobely, int width) = SobelYRow_C; -#if defined(HAS_SOBELYROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - SobelYRow = SobelYRow_SSSE3; +#if defined(HAS_ARGBPOLYNOMIALROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && TestCpuFlag(kCpuHasFMA3) && + IS_ALIGNED(width, 2)) { + ARGBPolynomialRow = ARGBPolynomialRow_AVX2; } #endif -#if defined(HAS_SOBELYROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - SobelYRow = SobelYRow_NEON; + for (int y = 0; y < height; ++y) { + ARGBPolynomialRow(src_argb, dst_argb, poly, width); + src_argb += src_stride_argb; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Apply a lumacolortable to each ARGB pixel. +LIBYUV_API +int ARGBLumaColorTable(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + const uint8* luma, + int width, int height) { + if (!src_argb || !dst_argb || !luma || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
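The new ARGBPolynomial above applies a per-channel cubic polynomial, with an SSE2 path and an AVX2 path gated on the new FMA3 flag. A scalar reference of the per-channel evaluation; the coefficient layout (four floats per channel) is an assumption drawn from the row function's signature, not confirmed by the diff:

#include <cstdint>
#include <algorithm>

// Per-channel cubic: v' = c0 + c1*v + c2*v^2 + c3*v^3, clamped to 0..255.
static uint8_t PolyChannel(uint8_t v, const float c[4]) {
  float x = static_cast<float>(v);
  float r = c[0] + c[1] * x + c[2] * x * x + c[3] * x * x * x;
  return static_cast<uint8_t>(std::min(255.0f, std::max(0.0f, r)));
}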
+ if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_argb = dst_stride_argb = 0; + } + void (*ARGBLumaColorTableRow)(const uint8* src_argb, uint8* dst_argb, + int width, const uint8* luma, const uint32 lumacoeff) = + ARGBLumaColorTableRow_C; +#if defined(HAS_ARGBLUMACOLORTABLEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4)) { + ARGBLumaColorTableRow = ARGBLumaColorTableRow_SSSE3; } #endif - void (*SobelXRow)(const uint8* src_y0, const uint8* src_y1, - const uint8* src_y2, uint8* dst_sobely, int width) = - SobelXRow_C; -#if defined(HAS_SOBELXROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - SobelXRow = SobelXRow_SSSE3; + for (int y = 0; y < height; ++y) { + ARGBLumaColorTableRow(src_argb, dst_argb, width, luma, 0x00264b0f); + src_argb += src_stride_argb; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Copy Alpha from one ARGB image to another. +LIBYUV_API +int ARGBCopyAlpha(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + if (!src_argb || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_argb = dst_stride_argb = 0; + } + void (*ARGBCopyAlphaRow)(const uint8* src_argb, uint8* dst_argb, int width) = + ARGBCopyAlphaRow_C; +#if defined(HAS_ARGBCOPYALPHAROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && + IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16) && + IS_ALIGNED(width, 8)) { + ARGBCopyAlphaRow = ARGBCopyAlphaRow_SSE2; } #endif -#if defined(HAS_SOBELXROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - SobelXRow = SobelXRow_NEON; +#if defined(HAS_ARGBCOPYALPHAROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 16)) { + ARGBCopyAlphaRow = ARGBCopyAlphaRow_AVX2; } #endif - void (*SobelXYRow)(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_argb, int width) = SobelXYRow_C; -#if defined(HAS_SOBELXYROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { - SobelXYRow = SobelXYRow_SSE2; + for (int y = 0; y < height; ++y) { + ARGBCopyAlphaRow(src_argb, dst_argb, width); + src_argb += src_stride_argb; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Copy a planar Y channel to the alpha channel of a destination ARGB image. +LIBYUV_API +int ARGBCopyYToAlpha(const uint8* src_y, int src_stride_y, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + if (!src_y || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_stride_y = -src_stride_y; + } + // Coalesce rows. 
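The new ARGBCopyAlpha and ARGBCopyYToAlpha above copy only the alpha bytes between images (or from a Y plane into the alpha channel), with SSE2/AVX2 row paths. The scalar form is a strided byte copy over every fourth byte; this sketch assumes libyuv's in-memory ARGB layout with alpha at byte offset 3 of each pixel:

#include <cstdint>

// Copy the alpha byte of each ARGB pixel from src to dst.
static void CopyAlphaRowScalar(const uint8_t* src_argb, uint8_t* dst_argb,
                               int width) {
  for (int i = 0; i < width; ++i) {
    dst_argb[i * 4 + 3] = src_argb[i * 4 + 3];
  }
}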
+ if (src_stride_y == width && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_y = dst_stride_argb = 0; + } + void (*ARGBCopyYToAlphaRow)(const uint8* src_y, uint8* dst_argb, int width) = + ARGBCopyYToAlphaRow_C; +#if defined(HAS_ARGBCOPYYTOALPHAROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && + IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16) && + IS_ALIGNED(width, 8)) { + ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_SSE2; } #endif -#if defined(HAS_SOBELXYROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { - SobelXYRow = SobelXYRow_NEON; +#if defined(HAS_ARGBCOPYYTOALPHAROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 16)) { + ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_AVX2; } #endif - - const int kEdge = 16; // Extra pixels at start of row for extrude/align. - SIMD_ALIGNED(uint8 row_y[(kMaxStride / 4 + kEdge) * 3 + kEdge]); - SIMD_ALIGNED(uint8 row_sobelx[kMaxStride / 4]); - SIMD_ALIGNED(uint8 row_sobely[kMaxStride / 4]); - - // Convert first row. - uint8* row_y0 = row_y + kEdge; - uint8* row_y1 = row_y0 + kMaxStride / 4; - uint8* row_y2 = row_y1 + kMaxStride / 4; - ARGBToBayerRow(src_argb, row_y0, 0x0d090501, width); - row_y0[-1] = row_y0[0]; - row_y0[width] = row_y0[width - 1]; - ARGBToBayerRow(src_argb, row_y1, 0x0d090501, width); - row_y1[-1] = row_y1[0]; - row_y1[width] = row_y1[width - 1]; - for (int y = 0; y < height; ++y) { - // Convert next row of ARGB to Y. - if (y < (height - 1)) { - src_argb += src_stride_argb; - } - ARGBToBayerRow(src_argb, row_y2, 0x0d090501, width); - row_y2[-1] = row_y2[0]; - row_y2[width] = row_y2[width - 1]; - - SobelXRow(row_y0 - 1, row_y1 - 1, row_y2 - 1, row_sobelx, width); - SobelYRow(row_y0 - 1, row_y2 - 1, row_sobely, width); - SobelXYRow(row_sobelx, row_sobely, dst_argb, width); - - // Cycle thru circular queue of 3 row_y buffers. 
- uint8* row_yt = row_y0; - row_y0 = row_y1; - row_y1 = row_y2; - row_y2 = row_yt; - + ARGBCopyYToAlphaRow(src_y, dst_argb, width); + src_y += src_stride_y; dst_argb += dst_stride_argb; } return 0; diff --git a/chromium/third_party/libyuv/source/rotate.cc b/chromium/third_party/libyuv/source/rotate.cc index c46650b4458..b99cde10891 100644 --- a/chromium/third_party/libyuv/source/rotate.cc +++ b/chromium/third_party/libyuv/source/rotate.cc @@ -41,7 +41,7 @@ extern "C" { #endif #endif -#if !defined(LIBYUV_DISABLE_NEON) && \ +#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \ (defined(__ARM_NEON__) || defined(LIBYUV_NEON)) #define HAS_MIRRORROW_NEON void MirrorRow_NEON(const uint8* src, uint8* dst, int width); @@ -57,7 +57,8 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride, int width); #endif // defined(__ARM_NEON__) -#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips__) && \ +#if !defined(LIBYUV_DISABLE_MIPS) && !defined(__native_client__) && \ + defined(__mips__) && \ defined(__mips_dsp) && (__mips_dsp_rev >= 2) #define HAS_TRANSPOSE_WX8_MIPS_DSPR2 void TransposeWx8_MIPS_DSPR2(const uint8* src, int src_stride, @@ -72,7 +73,8 @@ void TransposeUVWx8_MIPS_DSPR2(const uint8* src, int src_stride, int width); #endif // defined(__mips__) -#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) +#if !defined(LIBYUV_DISABLE_X86) && \ + defined(_M_IX86) && defined(_MSC_VER) #define HAS_TRANSPOSE_WX8_SSSE3 __declspec(naked) __declspec(align(16)) static void TransposeWx8_SSSE3(const uint8* src, int src_stride, @@ -89,7 +91,7 @@ static void TransposeWx8_SSSE3(const uint8* src, int src_stride, // Read in the data from the source pointer. // First round of bit swap. - align 16 + align 4 convertloop: movq xmm0, qword ptr [eax] lea ebp, [eax + 8] @@ -188,7 +190,7 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride, mov [esp + 16], ecx mov ecx, [ecx + 16 + 28] // w - align 16 + align 4 convertloop: // Read in the data from the source pointer. // First round of bit swap. @@ -294,14 +296,15 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride, ret } } -#elif !defined(LIBYUV_DISABLE_X86) && (defined(__i386__) || defined(__x86_64__)) +#elif !defined(LIBYUV_DISABLE_X86) && \ + (defined(__i386__) || (defined(__x86_64__) && !defined(__native_client__))) #define HAS_TRANSPOSE_WX8_SSSE3 static void TransposeWx8_SSSE3(const uint8* src, int src_stride, uint8* dst, int dst_stride, int width) { asm volatile ( // Read in the data from the source pointer. // First round of bit swap. - ".p2align 4 \n" + ".p2align 2 \n" "1: \n" "movq (%0),%%xmm0 \n" "movq (%0,%3),%%xmm1 \n" @@ -383,7 +386,7 @@ static void TransposeWx8_SSSE3(const uint8* src, int src_stride, ); } -#if !defined(LIBYUV_DISABLE_X86) && defined (__i386__) +#if !defined(LIBYUV_DISABLE_X86) && defined(__i386__) #define HAS_TRANSPOSE_UVWX8_SSE2 extern "C" void TransposeUVWx8_SSE2(const uint8* src, int src_stride, uint8* dst_a, int dst_stride_a, @@ -503,9 +506,16 @@ extern "C" void TransposeUVWx8_SSE2(const uint8* src, int src_stride, "pop %edi \n" "pop %esi \n" "pop %ebx \n" +#if defined(__native_client__) + "pop %ecx \n" + "and $0xffffffe0,%ecx \n" + "jmp *%ecx \n" +#else "ret \n" +#endif ); -#elif !defined(LIBYUV_DISABLE_X86) && defined(__x86_64__) +#elif !defined(LIBYUV_DISABLE_X86) && !defined(__native_client__) && \ + defined(__x86_64__) // 64 bit version has enough registers to do 16x8 to 8x16 at a time. 
#define HAS_TRANSPOSE_WX8_FAST_SSSE3 static void TransposeWx8_FAST_SSSE3(const uint8* src, int src_stride, @@ -513,7 +523,7 @@ static void TransposeWx8_FAST_SSSE3(const uint8* src, int src_stride, asm volatile ( // Read in the data from the source pointer. // First round of bit swap. - ".p2align 4 \n" + ".p2align 2 \n" "1: \n" "movdqa (%0),%%xmm0 \n" "movdqa (%0,%3),%%xmm1 \n" @@ -654,7 +664,7 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride, asm volatile ( // Read in the data from the source pointer. // First round of bit swap. - ".p2align 4 \n" + ".p2align 2 \n" "1: \n" "movdqa (%0),%%xmm0 \n" "movdqa (%0,%4),%%xmm1 \n" @@ -857,7 +867,7 @@ void RotatePlane270(const uint8* src, int src_stride, TransposePlane(src, src_stride, dst, dst_stride, width, height); } -LIBYUV_API +LIBYUV_API SAFEBUFFERS void RotatePlane180(const uint8* src, int src_stride, uint8* dst, int dst_stride, int width, int height) { diff --git a/chromium/third_party/libyuv/source/rotate_argb.cc b/chromium/third_party/libyuv/source/rotate_argb.cc index 5fa0d7ea798..b95512783a0 100644 --- a/chromium/third_party/libyuv/source/rotate_argb.cc +++ b/chromium/third_party/libyuv/source/rotate_argb.cc @@ -22,14 +22,15 @@ extern "C" { // ARGBScale has a function to copy pixels to a row, striding each source // pixel by a constant. -#if !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || \ - defined(__x86_64__) || defined(__i386__)) +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(_M_IX86) || \ + (defined(__x86_64__) && !defined(__native_client__)) || defined(__i386__)) #define HAS_SCALEARGBROWDOWNEVEN_SSE2 void ScaleARGBRowDownEven_SSE2(const uint8* src_ptr, int src_stride, int src_stepx, uint8* dst_ptr, int dst_width); #endif -#if !defined(LIBYUV_DISABLE_NEON) && \ +#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \ (defined(__ARM_NEON__) || defined(LIBYUV_NEON)) #define HAS_SCALEARGBROWDOWNEVEN_NEON void ScaleARGBRowDownEven_NEON(const uint8* src_ptr, int src_stride, @@ -88,6 +89,7 @@ void ARGBRotate270(const uint8* src, int src_stride, ARGBTranspose(src, src_stride, dst, dst_stride, width, height); } +SAFEBUFFERS void ARGBRotate180(const uint8* src, int src_stride, uint8* dst, int dst_stride, int width, int height) { diff --git a/chromium/third_party/libyuv/source/rotate_neon.cc b/chromium/third_party/libyuv/source/rotate_neon.cc index ab07c169703..a59c4d5fde6 100644 --- a/chromium/third_party/libyuv/source/rotate_neon.cc +++ b/chromium/third_party/libyuv/source/rotate_neon.cc @@ -18,7 +18,7 @@ extern "C" { #endif #if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) -static const uvec8 kVTbl4x4Transpose = +static uvec8 kVTbl4x4Transpose = { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 }; void TransposeWx8_NEON(const uint8* src, int src_stride, @@ -31,7 +31,7 @@ void TransposeWx8_NEON(const uint8* src, int src_stride, "sub %4, #8 \n" // handle 8x8 blocks. this should be the majority of the plane - ".p2align 4 \n" + ".p2align 2 \n" "1: \n" "mov r9, %0 \n" @@ -184,7 +184,7 @@ void TransposeWx8_NEON(const uint8* src, int src_stride, ); } -static const uvec8 kVTbl4x4TransposeDi = +static uvec8 kVTbl4x4TransposeDi = { 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 }; void TransposeUVWx8_NEON(const uint8* src, int src_stride, @@ -198,7 +198,7 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride, "sub %6, #8 \n" // handle 8x8 blocks. 
this should be the majority of the plane - ".p2align 4 \n" + ".p2align 2 \n" "1: \n" "mov r9, %0 \n" diff --git a/chromium/third_party/libyuv/source/row_any.cc b/chromium/third_party/libyuv/source/row_any.cc index 72100d90e9d..90c6a3ff5f8 100644 --- a/chromium/third_party/libyuv/source/row_any.cc +++ b/chromium/third_party/libyuv/source/row_any.cc @@ -137,8 +137,12 @@ RGBANY(ARGBToARGB1555Row_Any_SSE2, ARGBToARGB1555Row_SSE2, ARGBToARGB1555Row_C, 3, 4, 2) RGBANY(ARGBToARGB4444Row_Any_SSE2, ARGBToARGB4444Row_SSE2, ARGBToARGB4444Row_C, 3, 4, 2) +#endif +#if defined(HAS_I400TOARGBROW_SSE2) RGBANY(I400ToARGBRow_Any_SSE2, I400ToARGBRow_Unaligned_SSE2, I400ToARGBRow_C, 7, 1, 4) +#endif +#if defined(HAS_YTOARGBROW_SSE2) RGBANY(YToARGBRow_Any_SSE2, YToARGBRow_SSE2, YToARGBRow_C, 7, 1, 4) RGBANY(YUY2ToARGBRow_Any_SSSE3, YUY2ToARGBRow_Unaligned_SSSE3, YUY2ToARGBRow_C, @@ -195,6 +199,15 @@ BAYERANY(ARGBToBayerRow_Any_SSSE3, ARGBToBayerRow_SSSE3, ARGBToBayerRow_C, BAYERANY(ARGBToBayerRow_Any_NEON, ARGBToBayerRow_NEON, ARGBToBayerRow_C, 7, 4, 1) #endif +#if defined(HAS_ARGBTOBAYERGGROW_SSE2) +BAYERANY(ARGBToBayerGGRow_Any_SSE2, ARGBToBayerGGRow_SSE2, ARGBToBayerGGRow_C, + 7, 4, 1) +#endif +#if defined(HAS_ARGBTOBAYERGGROW_NEON) +BAYERANY(ARGBToBayerGGRow_Any_NEON, ARGBToBayerGGRow_NEON, ARGBToBayerGGRow_C, + 7, 4, 1) +#endif + #undef BAYERANY // RGB/YUV to Y does multiple of 16 with SIMD and last 16 with SIMD. @@ -213,6 +226,8 @@ YANY(UYVYToYRow_Any_AVX2, UYVYToYRow_AVX2, 2, 1, 32) #endif #ifdef HAS_ARGBTOYROW_SSSE3 YANY(ARGBToYRow_Any_SSSE3, ARGBToYRow_Unaligned_SSSE3, 4, 1, 16) +#endif +#ifdef HAS_BGRATOYROW_SSSE3 YANY(BGRAToYRow_Any_SSSE3, BGRAToYRow_Unaligned_SSSE3, 4, 1, 16) YANY(ABGRToYRow_Any_SSSE3, ABGRToYRow_Unaligned_SSSE3, 4, 1, 16) YANY(RGBAToYRow_Any_SSSE3, RGBAToYRow_Unaligned_SSSE3, 4, 1, 16) @@ -290,7 +305,7 @@ YANY(ARGBAttenuateRow_Any_NEON, ARGBAttenuateRow_NEON, ARGBAttenuateRow_C, width & MASK); \ } -#ifdef HAS_ARGBTOYROW_AVX2 +#ifdef HAS_ARGBTOUVROW_AVX2 UVANY(ARGBToUVRow_Any_AVX2, ARGBToUVRow_AVX2, ARGBToUVRow_C, 4, 31) UVANY(YUY2ToUVRow_Any_AVX2, YUY2ToUVRow_AVX2, YUY2ToUVRow_C, 2, 31) UVANY(UYVYToUVRow_Any_AVX2, UYVYToUVRow_AVX2, UYVYToUVRow_C, 2, 31) @@ -468,6 +483,10 @@ MATHROW_ANY(ARGBSubtractRow_Any_NEON, ARGBSubtractRow_NEON, ARGBSubtractRow_C, dst_argb + n * BPP, shuffler, width & MASK); \ } +#ifdef HAS_ARGBSHUFFLEROW_SSE2 +YANY(ARGBShuffleRow_Any_SSE2, ARGBShuffleRow_SSE2, + ARGBShuffleRow_C, 4, 4, 3) +#endif #ifdef HAS_ARGBSHUFFLEROW_SSSE3 YANY(ARGBShuffleRow_Any_SSSE3, ARGBShuffleRow_Unaligned_SSSE3, ARGBShuffleRow_C, 4, 4, 7) @@ -495,6 +514,10 @@ YANY(ARGBShuffleRow_Any_NEON, ARGBShuffleRow_NEON, width & MASK, source_y_fraction); \ } +#ifdef HAS_INTERPOLATEROW_AVX2 +NANY(InterpolateRow_Any_AVX2, InterpolateRow_AVX2, + InterpolateRow_C, 1, 1, 32) +#endif #ifdef HAS_INTERPOLATEROW_SSSE3 NANY(InterpolateRow_Any_SSSE3, InterpolateRow_Unaligned_SSSE3, InterpolateRow_C, 1, 1, 15) diff --git a/chromium/third_party/libyuv/source/row_common.cc b/chromium/third_party/libyuv/source/row_common.cc index badea440582..f961696f008 100644 --- a/chromium/third_party/libyuv/source/row_common.cc +++ b/chromium/third_party/libyuv/source/row_common.cc @@ -59,6 +59,11 @@ static __inline uint32 Abs(int32 v) { } #endif // USE_BRANCHLESS +// Divide num by div and return as 16.16 fixed point result. 
+int FixedDiv_C(int num, int div) { + return static_cast<int>((static_cast<int64>(num) << 16) / div); +} + #ifdef LIBYUV_LITTLE_ENDIAN #define WRITEWORD(p, v) *reinterpret_cast<uint32*>(p) = v #else @@ -649,21 +654,27 @@ void ARGBSepiaRow_C(uint8* dst_argb, int width) { } // Apply color matrix to a row of image. Matrix is signed. -void ARGBColorMatrixRow_C(uint8* dst_argb, const int8* matrix_argb, int width) { +// TODO(fbarchard): Consider adding rounding (+32). +void ARGBColorMatrixRow_C(const uint8* src_argb, uint8* dst_argb, + const int8* matrix_argb, int width) { for (int x = 0; x < width; ++x) { - int b = dst_argb[0]; - int g = dst_argb[1]; - int r = dst_argb[2]; - int a = dst_argb[3]; + int b = src_argb[0]; + int g = src_argb[1]; + int r = src_argb[2]; + int a = src_argb[3]; int sb = (b * matrix_argb[0] + g * matrix_argb[1] + - r * matrix_argb[2] + a * matrix_argb[3]) >> 7; + r * matrix_argb[2] + a * matrix_argb[3]) >> 6; int sg = (b * matrix_argb[4] + g * matrix_argb[5] + - r * matrix_argb[6] + a * matrix_argb[7]) >> 7; + r * matrix_argb[6] + a * matrix_argb[7]) >> 6; int sr = (b * matrix_argb[8] + g * matrix_argb[9] + - r * matrix_argb[10] + a * matrix_argb[11]) >> 7; + r * matrix_argb[10] + a * matrix_argb[11]) >> 6; + int sa = (b * matrix_argb[12] + g * matrix_argb[13] + + r * matrix_argb[14] + a * matrix_argb[15]) >> 6; dst_argb[0] = Clamp(sb); dst_argb[1] = Clamp(sg); dst_argb[2] = Clamp(sr); + dst_argb[3] = Clamp(sa); + src_argb += 4; dst_argb += 4; } } @@ -683,6 +694,19 @@ void ARGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width) { } } +// Apply color table to a row of image. +void RGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width) { + for (int x = 0; x < width; ++x) { + int b = dst_argb[0]; + int g = dst_argb[1]; + int r = dst_argb[2]; + dst_argb[0] = table_argb[b * 4 + 0]; + dst_argb[1] = table_argb[g * 4 + 1]; + dst_argb[2] = table_argb[r * 4 + 2]; + dst_argb += 4; + } +} + void ARGBQuantizeRow_C(uint8* dst_argb, int scale, int interval_size, int interval_offset, int width) { for (int x = 0; x < width; ++x) { @@ -845,6 +869,16 @@ void SobelRow_C(const uint8* src_sobelx, const uint8* src_sobely, } } +void SobelToPlaneRow_C(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_y, int width) { + for (int i = 0; i < width; ++i) { + int r = src_sobelx[i]; + int b = src_sobely[i]; + int s = clamp255(r + b); + dst_y[i] = static_cast<uint8>(s); + } +} + void SobelXYRow_C(const uint8* src_sobelx, const uint8* src_sobely, uint8* dst_argb, int width) { for (int i = 0; i < width; ++i) { @@ -1670,7 +1704,7 @@ void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) { // Reciprocal method is off by 1 on some values. ie 125 // 8.8 fixed point inverse table with 1.0 in upper short and 1 / a in lower. #define T(a) 0x01000000 + (0x10000 / a) -uint32 fixed_invtbl8[256] = { +const uint32 fixed_invtbl8[256] = { 0x01000000, 0x0100ffff, T(0x02), T(0x03), T(0x04), T(0x05), T(0x06), T(0x07), T(0x08), T(0x09), T(0x0a), T(0x0b), T(0x0c), T(0x0d), T(0x0e), T(0x0f), T(0x10), T(0x11), T(0x12), T(0x13), T(0x14), T(0x15), T(0x16), T(0x17), @@ -1774,10 +1808,26 @@ void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride, } } +// Blend 2 rows into 1 for conversions such as I422ToI420. +void HalfRow_C(const uint8* src_uv, int src_uv_stride, + uint8* dst_uv, int pix) { + for (int x = 0; x < pix; ++x) { + dst_uv[x] = (src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1; + } +} + // C version 2x2 -> 2x1. 
void InterpolateRow_C(uint8* dst_ptr, const uint8* src_ptr, ptrdiff_t src_stride, int width, int source_y_fraction) { + if (source_y_fraction == 0) { + memcpy(dst_ptr, src_ptr, width); + return; + } + if (source_y_fraction == 128) { + HalfRow_C(src_ptr, static_cast<int>(src_stride), dst_ptr, width); + return; + } int y1_fraction = source_y_fraction; int y0_fraction = 256 - y1_fraction; const uint8* src_ptr1 = src_ptr + src_stride; @@ -1794,14 +1844,6 @@ void InterpolateRow_C(uint8* dst_ptr, const uint8* src_ptr, } } -// Blend 2 rows into 1 for conversions such as I422ToI420. -void HalfRow_C(const uint8* src_uv, int src_uv_stride, - uint8* dst_uv, int pix) { - for (int x = 0; x < pix; ++x) { - dst_uv[x] = (src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1; - } -} - // Select 2 channels from ARGB on alternating pixels. e.g. BGBGBGBG void ARGBToBayerRow_C(const uint8* src_argb, uint8* dst_bayer, uint32 selector, int pix) { @@ -1819,6 +1861,21 @@ void ARGBToBayerRow_C(const uint8* src_argb, } } +// Select G channel from ARGB. e.g. GGGGGGGG +void ARGBToBayerGGRow_C(const uint8* src_argb, + uint8* dst_bayer, uint32 /*selector*/, int pix) { + // Copy a row of G. + for (int x = 0; x < pix - 1; x += 2) { + dst_bayer[0] = src_argb[1]; + dst_bayer[1] = src_argb[5]; + src_argb += 8; + dst_bayer += 2; + } + if (pix & 1) { + dst_bayer[0] = src_argb[1]; + } +} + // Use first 4 shuffler values to reorder ARGB channels. void ARGBShuffleRow_C(const uint8* src_argb, uint8* dst_argb, const uint8* shuffler, int pix) { @@ -1886,10 +1943,19 @@ void I422ToUYVYRow_C(const uint8* src_y, } } -#if !defined(LIBYUV_DISABLE_X86) +// TODO(fbarchard): Ensure these are stack safe. +#ifdef DEBUG +#define MAYBE_SAFEBUFFERS +#else +#define MAYBE_SAFEBUFFERS SAFEBUFFERS +#endif + + +#if !defined(LIBYUV_DISABLE_X86) && defined(HAS_I422TOARGBROW_SSSE3) // row_win.cc has asm version, but GCC uses 2 step wrapper. 5% slower. // TODO(fbarchard): Handle width > kMaxStride here instead of calling code. 
#if defined(__x86_64__) || defined(__i386__) +MAYBE_SAFEBUFFERS void I422ToRGB565Row_SSSE3(const uint8* src_y, const uint8* src_u, const uint8* src_v, @@ -1902,6 +1968,7 @@ void I422ToRGB565Row_SSSE3(const uint8* src_y, #endif // defined(__x86_64__) || defined(__i386__) #if defined(_M_IX86) || defined(__x86_64__) || defined(__i386__) +MAYBE_SAFEBUFFERS void I422ToARGB1555Row_SSSE3(const uint8* src_y, const uint8* src_u, const uint8* src_v, @@ -1912,6 +1979,7 @@ void I422ToARGB1555Row_SSSE3(const uint8* src_y, ARGBToARGB1555Row_SSE2(row, rgb_buf, width); } +MAYBE_SAFEBUFFERS void I422ToARGB4444Row_SSSE3(const uint8* src_y, const uint8* src_u, const uint8* src_v, @@ -1922,6 +1990,7 @@ void I422ToARGB4444Row_SSSE3(const uint8* src_y, ARGBToARGB4444Row_SSE2(row, rgb_buf, width); } +MAYBE_SAFEBUFFERS void NV12ToRGB565Row_SSSE3(const uint8* src_y, const uint8* src_uv, uint8* dst_rgb565, @@ -1931,6 +2000,7 @@ void NV12ToRGB565Row_SSSE3(const uint8* src_y, ARGBToRGB565Row_SSE2(row, dst_rgb565, width); } +MAYBE_SAFEBUFFERS void NV21ToRGB565Row_SSSE3(const uint8* src_y, const uint8* src_vu, uint8* dst_rgb565, @@ -1940,6 +2010,7 @@ void NV21ToRGB565Row_SSSE3(const uint8* src_y, ARGBToRGB565Row_SSE2(row, dst_rgb565, width); } +MAYBE_SAFEBUFFERS void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2, uint8* dst_argb, int width) { @@ -1951,6 +2022,7 @@ void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2, I422ToARGBRow_SSSE3(row_y, row_u, row_v, dst_argb, width); } +MAYBE_SAFEBUFFERS void YUY2ToARGBRow_Unaligned_SSSE3(const uint8* src_yuy2, uint8* dst_argb, int width) { @@ -1962,6 +2034,7 @@ void YUY2ToARGBRow_Unaligned_SSSE3(const uint8* src_yuy2, I422ToARGBRow_Unaligned_SSSE3(row_y, row_u, row_v, dst_argb, width); } +MAYBE_SAFEBUFFERS void UYVYToARGBRow_SSSE3(const uint8* src_uyvy, uint8* dst_argb, int width) { @@ -1973,6 +2046,7 @@ void UYVYToARGBRow_SSSE3(const uint8* src_uyvy, I422ToARGBRow_SSSE3(row_y, row_u, row_v, dst_argb, width); } +MAYBE_SAFEBUFFERS void UYVYToARGBRow_Unaligned_SSSE3(const uint8* src_uyvy, uint8* dst_argb, int width) { @@ -1986,8 +2060,102 @@ void UYVYToARGBRow_Unaligned_SSSE3(const uint8* src_uyvy, #endif // defined(_M_IX86) || defined(__x86_64__) || defined(__i386__) #endif // !defined(LIBYUV_DISABLE_X86) -#undef clamp0 -#undef clamp255 + +void ARGBPolynomialRow_C(const uint8* src_argb, + uint8* dst_argb, const float* poly, + int width) { + for (int i = 0; i < width; ++i) { + float b = static_cast<float>(src_argb[0]); + float g = static_cast<float>(src_argb[1]); + float r = static_cast<float>(src_argb[2]); + float a = static_cast<float>(src_argb[3]); + float b2 = b * b; + float g2 = g * g; + float r2 = r * r; + float a2 = a * a; + float db = poly[0] + poly[4] * b; + float dg = poly[1] + poly[5] * g; + float dr = poly[2] + poly[6] * r; + float da = poly[3] + poly[7] * a; + db += poly[8] * b2; + dg += poly[9] * g2; + dr += poly[10] * r2; + da += poly[11] * a2; + float b3 = b2 * b; + float g3 = g2 * g; + float r3 = r2 * r; + float a3 = a2 * a; + db += poly[12] * b3; + dg += poly[13] * g3; + dr += poly[14] * r3; + da += poly[15] * a3; + + dst_argb[0] = Clamp(static_cast<int32>(db)); + dst_argb[1] = Clamp(static_cast<int32>(dg)); + dst_argb[2] = Clamp(static_cast<int32>(dr)); + dst_argb[3] = Clamp(static_cast<int32>(da)); + src_argb += 4; + dst_argb += 4; + } +} + +void ARGBLumaColorTableRow_C(const uint8* src_argb, uint8* dst_argb, int width, + const uint8* luma, const uint32 lumacoeff) { + uint32 bc = lumacoeff & 0xff; + uint32 gc = (lumacoeff >> 8) & 0xff; + uint32 rc = (lumacoeff >> 16) & 
0xff; + + for (int i = 0; i < width - 1; i += 2) { + // Luminance in rows, color values in columns. + const uint8* luma0 = ((src_argb[0] * bc + src_argb[1] * gc + + src_argb[2] * rc) & 0x7F00u) + luma; + dst_argb[0] = luma0[src_argb[0]]; + dst_argb[1] = luma0[src_argb[1]]; + dst_argb[2] = luma0[src_argb[2]]; + dst_argb[3] = src_argb[3]; + const uint8* luma1 = ((src_argb[4] * bc + src_argb[5] * gc + + src_argb[6] * rc) & 0x7F00u) + luma; + dst_argb[4] = luma1[src_argb[4]]; + dst_argb[5] = luma1[src_argb[5]]; + dst_argb[6] = luma1[src_argb[6]]; + dst_argb[7] = src_argb[7]; + src_argb += 8; + dst_argb += 8; + } + if (width & 1) { + // Luminance in rows, color values in columns. + const uint8* luma0 = ((src_argb[0] * bc + src_argb[1] * gc + + src_argb[2] * rc) & 0x7F00u) + luma; + dst_argb[0] = luma0[src_argb[0]]; + dst_argb[1] = luma0[src_argb[1]]; + dst_argb[2] = luma0[src_argb[2]]; + dst_argb[3] = src_argb[3]; + } +} + +void ARGBCopyAlphaRow_C(const uint8* src, uint8* dst, int width) { + for (int i = 0; i < width - 1; i += 2) { + dst[3] = src[3]; + dst[7] = src[7]; + dst += 8; + src += 8; + } + if (width & 1) { + dst[3] = src[3]; + } +} + +void ARGBCopyYToAlphaRow_C(const uint8* src, uint8* dst, int width) { + for (int i = 0; i < width - 1; i += 2) { + dst[3] = src[0]; + dst[7] = src[1]; + dst += 8; + src += 2; + } + if (width & 1) { + dst[3] = src[0]; + } +} #ifdef __cplusplus } // extern "C" diff --git a/chromium/third_party/libyuv/source/row_mips.cc b/chromium/third_party/libyuv/source/row_mips.cc index 69677aa2d5b..4435c55c5ce 100644 --- a/chromium/third_party/libyuv/source/row_mips.cc +++ b/chromium/third_party/libyuv/source/row_mips.cc @@ -15,6 +15,9 @@ namespace libyuv { extern "C" { #endif +// The following are available on Mips platforms: +#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips__) + #ifdef HAS_COPYROW_MIPS void CopyRow_MIPS(const uint8* src, uint8* dst, int count) { __asm__ __volatile__ ( @@ -383,6 +386,7 @@ void SplitUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, "blez $t4, 2f \n" " andi %[width], %[width], 0xf \n" // residual + ".p2align 2 \n" "1: \n" "addiu $t4, $t4, -1 \n" "lw $t0, 0(%[src_uv]) \n" // V1 | U1 | V0 | U0 @@ -449,6 +453,7 @@ void SplitUVRow_Unaligned_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, "blez $t4, 2f \n" " andi %[width], %[width], 0xf \n" // residual + ".p2align 2 \n" "1: \n" "addiu $t4, $t4, -1 \n" "lwr $t0, 0(%[src_uv]) \n" @@ -532,7 +537,8 @@ void MirrorRow_MIPS_DSPR2(const uint8* src, uint8* dst, int width) { "blez $t4, 2f \n" " addu %[src], %[src], %[width] \n" // src += width - "1: \n" + ".p2align 2 \n" + "1: \n" "lw $t0, -16(%[src]) \n" // |3|2|1|0| "lw $t1, -12(%[src]) \n" // |7|6|5|4| "lw $t2, -8(%[src]) \n" // |11|10|9|8| @@ -556,7 +562,7 @@ void MirrorRow_MIPS_DSPR2(const uint8* src, uint8* dst, int width) { "beqz $t5, 3f \n" " nop \n" - "2: \n" + "2: \n" "lbu $t0, -1(%[src]) \n" "addiu $t5, $t5, -1 \n" "addiu %[src], %[src], -1 \n" @@ -564,7 +570,7 @@ void MirrorRow_MIPS_DSPR2(const uint8* src, uint8* dst, int width) { "bgez $t5, 2b \n" " addiu %[dst], %[dst], 1 \n" - "3: \n" + "3: \n" ".set pop \n" : [src] "+r" (src), [dst] "+r" (dst) : [width] "r" (width) @@ -586,7 +592,8 @@ void MirrorUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, "blez %[x], 2f \n" " addu %[src_uv], %[src_uv], $t4 \n" - "1: \n" + ".p2align 2 \n" + "1: \n" "lw $t0, -32(%[src_uv]) \n" // |3|2|1|0| "lw $t1, -28(%[src_uv]) \n" // |7|6|5|4| "lw $t2, -24(%[src_uv]) \n" // |11|10|9|8| @@ -638,7 +645,7 @@ void 
MirrorUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, "b 2f \n" " nop \n" - "2: \n" + "2: \n" "lbu $t0, -2(%[src_uv]) \n" "lbu $t1, -1(%[src_uv]) \n" "addiu %[src_uv], %[src_uv], -2 \n" @@ -649,7 +656,7 @@ void MirrorUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, "bgtz %[y], 2b \n" " addiu %[dst_v], %[dst_v], 1 \n" - "3: \n" + "3: \n" ".set pop \n" : [src_uv] "+r" (src_uv), [dst_u] "+r" (dst_u), @@ -670,62 +677,62 @@ void MirrorUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, // t2 = | 0 | R0 | 0 | r0 | // t1 = | 0 | R1 | 0 | r1 | #define I422ToTransientMipsRGB \ - "lw $t0, 0(%[y_buf]) \n" \ - "lhu $t1, 0(%[u_buf]) \n" \ - "lhu $t2, 0(%[v_buf]) \n" \ - "preceu.ph.qbr $t1, $t1 \n" \ - "preceu.ph.qbr $t2, $t2 \n" \ - "preceu.ph.qbra $t3, $t0 \n" \ - "preceu.ph.qbla $t0, $t0 \n" \ - "subu.ph $t1, $t1, $s5 \n" \ - "subu.ph $t2, $t2, $s5 \n" \ - "subu.ph $t3, $t3, $s4 \n" \ - "subu.ph $t0, $t0, $s4 \n" \ - "mul.ph $t3, $t3, $s0 \n" \ - "mul.ph $t0, $t0, $s0 \n" \ - "shll.ph $t4, $t1, 0x7 \n" \ - "subu.ph $t4, $t4, $t1 \n" \ - "mul.ph $t6, $t1, $s1 \n" \ - "mul.ph $t1, $t2, $s2 \n" \ - "addq_s.ph $t5, $t4, $t3 \n" \ - "addq_s.ph $t4, $t4, $t0 \n" \ - "shra.ph $t5, $t5, 6 \n" \ - "shra.ph $t4, $t4, 6 \n" \ - "addiu %[u_buf], 2 \n" \ - "addiu %[v_buf], 2 \n" \ - "addu.ph $t6, $t6, $t1 \n" \ - "mul.ph $t1, $t2, $s3 \n" \ - "addu.ph $t9, $t6, $t3 \n" \ - "addu.ph $t8, $t6, $t0 \n" \ - "shra.ph $t9, $t9, 6 \n" \ - "shra.ph $t8, $t8, 6 \n" \ - "addu.ph $t2, $t1, $t3 \n" \ - "addu.ph $t1, $t1, $t0 \n" \ - "shra.ph $t2, $t2, 6 \n" \ - "shra.ph $t1, $t1, 6 \n" \ - "subu.ph $t5, $t5, $s5 \n" \ - "subu.ph $t4, $t4, $s5 \n" \ - "subu.ph $t9, $t9, $s5 \n" \ - "subu.ph $t8, $t8, $s5 \n" \ - "subu.ph $t2, $t2, $s5 \n" \ - "subu.ph $t1, $t1, $s5 \n" \ - "shll_s.ph $t5, $t5, 8 \n" \ - "shll_s.ph $t4, $t4, 8 \n" \ - "shll_s.ph $t9, $t9, 8 \n" \ - "shll_s.ph $t8, $t8, 8 \n" \ - "shll_s.ph $t2, $t2, 8 \n" \ - "shll_s.ph $t1, $t1, 8 \n" \ - "shra.ph $t5, $t5, 8 \n" \ - "shra.ph $t4, $t4, 8 \n" \ - "shra.ph $t9, $t9, 8 \n" \ - "shra.ph $t8, $t8, 8 \n" \ - "shra.ph $t2, $t2, 8 \n" \ - "shra.ph $t1, $t1, 8 \n" \ - "addu.ph $t5, $t5, $s5 \n" \ - "addu.ph $t4, $t4, $s5 \n" \ - "addu.ph $t9, $t9, $s5 \n" \ - "addu.ph $t8, $t8, $s5 \n" \ - "addu.ph $t2, $t2, $s5 \n" \ + "lw $t0, 0(%[y_buf]) \n" \ + "lhu $t1, 0(%[u_buf]) \n" \ + "lhu $t2, 0(%[v_buf]) \n" \ + "preceu.ph.qbr $t1, $t1 \n" \ + "preceu.ph.qbr $t2, $t2 \n" \ + "preceu.ph.qbra $t3, $t0 \n" \ + "preceu.ph.qbla $t0, $t0 \n" \ + "subu.ph $t1, $t1, $s5 \n" \ + "subu.ph $t2, $t2, $s5 \n" \ + "subu.ph $t3, $t3, $s4 \n" \ + "subu.ph $t0, $t0, $s4 \n" \ + "mul.ph $t3, $t3, $s0 \n" \ + "mul.ph $t0, $t0, $s0 \n" \ + "shll.ph $t4, $t1, 0x7 \n" \ + "subu.ph $t4, $t4, $t1 \n" \ + "mul.ph $t6, $t1, $s1 \n" \ + "mul.ph $t1, $t2, $s2 \n" \ + "addq_s.ph $t5, $t4, $t3 \n" \ + "addq_s.ph $t4, $t4, $t0 \n" \ + "shra.ph $t5, $t5, 6 \n" \ + "shra.ph $t4, $t4, 6 \n" \ + "addiu %[u_buf], 2 \n" \ + "addiu %[v_buf], 2 \n" \ + "addu.ph $t6, $t6, $t1 \n" \ + "mul.ph $t1, $t2, $s3 \n" \ + "addu.ph $t9, $t6, $t3 \n" \ + "addu.ph $t8, $t6, $t0 \n" \ + "shra.ph $t9, $t9, 6 \n" \ + "shra.ph $t8, $t8, 6 \n" \ + "addu.ph $t2, $t1, $t3 \n" \ + "addu.ph $t1, $t1, $t0 \n" \ + "shra.ph $t2, $t2, 6 \n" \ + "shra.ph $t1, $t1, 6 \n" \ + "subu.ph $t5, $t5, $s5 \n" \ + "subu.ph $t4, $t4, $s5 \n" \ + "subu.ph $t9, $t9, $s5 \n" \ + "subu.ph $t8, $t8, $s5 \n" \ + "subu.ph $t2, $t2, $s5 \n" \ + "subu.ph $t1, $t1, $s5 \n" \ + "shll_s.ph $t5, $t5, 8 \n" \ + 
"shll_s.ph $t4, $t4, 8 \n" \ + "shll_s.ph $t9, $t9, 8 \n" \ + "shll_s.ph $t8, $t8, 8 \n" \ + "shll_s.ph $t2, $t2, 8 \n" \ + "shll_s.ph $t1, $t1, 8 \n" \ + "shra.ph $t5, $t5, 8 \n" \ + "shra.ph $t4, $t4, 8 \n" \ + "shra.ph $t9, $t9, 8 \n" \ + "shra.ph $t8, $t8, 8 \n" \ + "shra.ph $t2, $t2, 8 \n" \ + "shra.ph $t1, $t1, 8 \n" \ + "addu.ph $t5, $t5, $s5 \n" \ + "addu.ph $t4, $t4, $s5 \n" \ + "addu.ph $t9, $t9, $s5 \n" \ + "addu.ph $t8, $t8, $s5 \n" \ + "addu.ph $t2, $t2, $s5 \n" \ "addu.ph $t1, $t1, $s5 \n" void I422ToARGBRow_MIPS_DSPR2(const uint8* y_buf, @@ -745,7 +752,9 @@ void I422ToARGBRow_MIPS_DSPR2(const uint8* y_buf, "repl.ph $s5, 128 \n" // |128|128| // clipping "lui $s6, 0xff00 \n" "ori $s6, 0xff00 \n" // |ff|00|ff|00|ff| - "1: \n" + + ".p2align 2 \n" + "1: \n" I422ToTransientMipsRGB // Arranging into argb format "precr.qb.ph $t4, $t8, $t4 \n" // |G1|g1|B1|b1| @@ -773,7 +782,7 @@ void I422ToARGBRow_MIPS_DSPR2(const uint8* y_buf, "sw $t3, 12(%[rgb_buf]) \n" "bnez %[width], 1b \n" " addiu %[rgb_buf], 16 \n" - "2: \n" + "2: \n" ".set pop \n" :[y_buf] "+r" (y_buf), [u_buf] "+r" (u_buf), @@ -794,47 +803,49 @@ void I422ToABGRRow_MIPS_DSPR2(const uint8* y_buf, uint8* rgb_buf, int width) { __asm__ __volatile__ ( - ".set push \n\t" - ".set noreorder \n\t" - "beqz %[width], 2f \n\t" - " repl.ph $s0, 74 \n\t" // |YG|YG| = |74|74| - "repl.ph $s1, -25 \n\t" // |UG|UG| = |-25|-25| - "repl.ph $s2, -52 \n\t" // |VG|VG| = |-52|-52| - "repl.ph $s3, 102 \n\t" // |VR|VR| = |102|102| - "repl.ph $s4, 16 \n\t" // |0|16|0|16| - "repl.ph $s5, 128 \n\t" // |128|128| - "lui $s6, 0xff00 \n\t" - "ori $s6, 0xff00 \n\t" // |ff|00|ff|00| - "1: \n" + ".set push \n" + ".set noreorder \n" + "beqz %[width], 2f \n" + " repl.ph $s0, 74 \n" // |YG|YG| = |74|74| + "repl.ph $s1, -25 \n" // |UG|UG| = |-25|-25| + "repl.ph $s2, -52 \n" // |VG|VG| = |-52|-52| + "repl.ph $s3, 102 \n" // |VR|VR| = |102|102| + "repl.ph $s4, 16 \n" // |0|16|0|16| + "repl.ph $s5, 128 \n" // |128|128| + "lui $s6, 0xff00 \n" + "ori $s6, 0xff00 \n" // |ff|00|ff|00| + + ".p2align 2 \n" + "1: \n" I422ToTransientMipsRGB // Arranging into abgr format - "precr.qb.ph $t0, $t8, $t1 \n\t" // |G1|g1|R1|r1| - "precr.qb.ph $t3, $t9, $t2 \n\t" // |G0|g0|R0|r0| - "precrq.qb.ph $t8, $t0, $t3 \n\t" // |G1|R1|G0|R0| - "precr.qb.ph $t9, $t0, $t3 \n\t" // |g1|r1|g0|r0| - - "precr.qb.ph $t2, $t4, $t5 \n\t" // |B1|b1|B0|b0| - "addiu %[width], -4 \n\t" - "addiu %[y_buf], 4 \n\t" - "preceu.ph.qbla $t1, $t2 \n\t" // |0 |B1|0 |B0| - "preceu.ph.qbra $t2, $t2 \n\t" // |0 |b1|0 |b0| - "or $t1, $t1, $s6 \n\t" // |ff|B1|ff|B0| - "or $t2, $t2, $s6 \n\t" // |ff|b1|ff|b0| - "precrq.ph.w $t0, $t2, $t9 \n\t" // |ff|b1|g1|r1| - "precrq.ph.w $t3, $t1, $t8 \n\t" // |ff|B1|G1|R1| - "sll $t9, $t9, 16 \n\t" - "sll $t8, $t8, 16 \n\t" - "packrl.ph $t2, $t2, $t9 \n\t" // |ff|b0|g0|r0| - "packrl.ph $t1, $t1, $t8 \n\t" // |ff|B0|G0|R0| + "precr.qb.ph $t0, $t8, $t1 \n" // |G1|g1|R1|r1| + "precr.qb.ph $t3, $t9, $t2 \n" // |G0|g0|R0|r0| + "precrq.qb.ph $t8, $t0, $t3 \n" // |G1|R1|G0|R0| + "precr.qb.ph $t9, $t0, $t3 \n" // |g1|r1|g0|r0| + + "precr.qb.ph $t2, $t4, $t5 \n" // |B1|b1|B0|b0| + "addiu %[width], -4 \n" + "addiu %[y_buf], 4 \n" + "preceu.ph.qbla $t1, $t2 \n" // |0 |B1|0 |B0| + "preceu.ph.qbra $t2, $t2 \n" // |0 |b1|0 |b0| + "or $t1, $t1, $s6 \n" // |ff|B1|ff|B0| + "or $t2, $t2, $s6 \n" // |ff|b1|ff|b0| + "precrq.ph.w $t0, $t2, $t9 \n" // |ff|b1|g1|r1| + "precrq.ph.w $t3, $t1, $t8 \n" // |ff|B1|G1|R1| + "sll $t9, $t9, 16 \n" + "sll $t8, $t8, 16 \n" + "packrl.ph $t2, $t2, $t9 \n" // 
|ff|b0|g0|r0| + "packrl.ph $t1, $t1, $t8 \n" // |ff|B0|G0|R0| // Store results. - "sw $t2, 0(%[rgb_buf]) \n\t" - "sw $t0, 4(%[rgb_buf]) \n\t" - "sw $t1, 8(%[rgb_buf]) \n\t" - "sw $t3, 12(%[rgb_buf]) \n\t" - "bnez %[width], 1b \n\t" - " addiu %[rgb_buf], 16 \n\t" - "2: \n\t" - ".set pop \n\t" + "sw $t2, 0(%[rgb_buf]) \n" + "sw $t0, 4(%[rgb_buf]) \n" + "sw $t1, 8(%[rgb_buf]) \n" + "sw $t3, 12(%[rgb_buf]) \n" + "bnez %[width], 1b \n" + " addiu %[rgb_buf], 16 \n" + "2: \n" + ".set pop \n" :[y_buf] "+r" (y_buf), [u_buf] "+r" (u_buf), [v_buf] "+r" (v_buf), @@ -865,13 +876,15 @@ void I422ToBGRARow_MIPS_DSPR2(const uint8* y_buf, "repl.ph $s5, 128 \n" // |128|128| "lui $s6, 0xff \n" "ori $s6, 0xff \n" // |00|ff|00|ff| - "1: \n" + + ".p2align 2 \n" + "1: \n" I422ToTransientMipsRGB // Arranging into bgra format - "precr.qb.ph $t4, $t4, $t8 \n" // |B1|b1|G1|g1| - "precr.qb.ph $t5, $t5, $t9 \n" // |B0|b0|G0|g0| - "precrq.qb.ph $t8, $t4, $t5 \n" // |B1|G1|B0|G0| - "precr.qb.ph $t9, $t4, $t5 \n" // |b1|g1|b0|g0| + "precr.qb.ph $t4, $t4, $t8 \n" // |B1|b1|G1|g1| + "precr.qb.ph $t5, $t5, $t9 \n" // |B0|b0|G0|g0| + "precrq.qb.ph $t8, $t4, $t5 \n" // |B1|G1|B0|G0| + "precr.qb.ph $t9, $t4, $t5 \n" // |b1|g1|b0|g0| "precr.qb.ph $t2, $t1, $t2 \n" // |R1|r1|R0|r0| "addiu %[width], -4 \n" @@ -895,7 +908,7 @@ void I422ToBGRARow_MIPS_DSPR2(const uint8* y_buf, "sw $t3, 12(%[rgb_buf]) \n" "bnez %[width], 1b \n" " addiu %[rgb_buf], 16 \n" - "2: \n" + "2: \n" ".set pop \n" :[y_buf] "+r" (y_buf), [u_buf] "+r" (u_buf), @@ -923,6 +936,8 @@ void InterpolateRows_MIPS_DSPR2(uint8* dst_ptr, const uint8* src_ptr, "replv.ph $t0, %[y0_fraction] \n" "replv.ph $t1, %[source_y_fraction] \n" + + ".p2align 2 \n" "1: \n" "lw $t2, 0(%[src_ptr]) \n" "lw $t3, 0(%[src_ptr1]) \n" @@ -968,6 +983,8 @@ void InterpolateRows_MIPS_DSPR2(uint8* dst_ptr, const uint8* src_ptr, } #endif // __mips_dsp_rev >= 2 +#endif // defined(__mips__) + #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/chromium/third_party/libyuv/source/row_neon.cc b/chromium/third_party/libyuv/source/row_neon.cc index 0bb55e717be..5e802194b2b 100644 --- a/chromium/third_party/libyuv/source/row_neon.cc +++ b/chromium/third_party/libyuv/source/row_neon.cc @@ -102,10 +102,10 @@ extern "C" { "vtrn.u8 d16, d17 \n" \ "vmov.u8 d21, d16 \n" -static const vec8 kUVToRB = { 127, 127, 127, 127, 102, 102, 102, 102, - 0, 0, 0, 0, 0, 0, 0, 0 }; -static const vec8 kUVToG = { -25, -25, -25, -25, -52, -52, -52, -52, - 0, 0, 0, 0, 0, 0, 0, 0 }; +static vec8 kUVToRB = { 127, 127, 127, 127, 102, 102, 102, 102, + 0, 0, 0, 0, 0, 0, 0, 0 }; +static vec8 kUVToG = { -25, -25, -25, -25, -52, -52, -52, -52, + 0, 0, 0, 0, 0, 0, 0, 0 }; void I444ToARGBRow_NEON(const uint8* src_y, const uint8* src_u, @@ -118,7 +118,7 @@ void I444ToARGBRow_NEON(const uint8* src_y, "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" "vmov.u16 q15, #16 \n" - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" READYUV444 YUV422TORGB @@ -149,7 +149,7 @@ void I422ToARGBRow_NEON(const uint8* src_y, "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" "vmov.u16 q15, #16 \n" - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" READYUV422 YUV422TORGB @@ -180,7 +180,7 @@ void I411ToARGBRow_NEON(const uint8* src_y, "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" "vmov.u16 q15, #16 \n" - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" READYUV411 YUV422TORGB @@ -211,7 +211,7 @@ void I422ToBGRARow_NEON(const uint8* src_y, "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" "vmov.u16 q15, #16 \n" - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" READYUV422 YUV422TORGB @@ 
-243,7 +243,7 @@ void I422ToABGRRow_NEON(const uint8* src_y, "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" "vmov.u16 q15, #16 \n" - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" READYUV422 YUV422TORGB @@ -275,7 +275,7 @@ void I422ToRGBARow_NEON(const uint8* src_y, "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" "vmov.u16 q15, #16 \n" - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" READYUV422 YUV422TORGB @@ -306,7 +306,7 @@ void I422ToRGB24Row_NEON(const uint8* src_y, "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" "vmov.u16 q15, #16 \n" - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" READYUV422 YUV422TORGB @@ -336,7 +336,7 @@ void I422ToRAWRow_NEON(const uint8* src_y, "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" "vmov.u16 q15, #16 \n" - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" READYUV422 YUV422TORGB @@ -379,7 +379,7 @@ void I422ToRGB565Row_NEON(const uint8* src_y, "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" "vmov.u16 q15, #16 \n" - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" READYUV422 YUV422TORGB @@ -425,7 +425,7 @@ void I422ToARGB1555Row_NEON(const uint8* src_y, "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" "vmov.u16 q15, #16 \n" - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" READYUV422 YUV422TORGB @@ -467,7 +467,7 @@ void I422ToARGB4444Row_NEON(const uint8* src_y, "vmov.u16 q14, #74 \n" "vmov.u16 q15, #16 \n" "vmov.u8 d4, #0x0f \n" // bits to clear with vbic. - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" READYUV422 YUV422TORGB @@ -497,7 +497,7 @@ void YToARGBRow_NEON(const uint8* src_y, "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" "vmov.u16 q15, #16 \n" - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" READYUV400 YUV422TORGB @@ -519,7 +519,7 @@ void I400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) { asm volatile ( - ".p2align 2 \n" + ".p2align 2 \n" "vmov.u8 d23, #255 \n" "1: \n" "vld1.8 {d20}, [%0]! \n" @@ -546,7 +546,7 @@ void NV12ToARGBRow_NEON(const uint8* src_y, "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" "vmov.u16 q15, #16 \n" - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" READNV12 YUV422TORGB @@ -575,7 +575,7 @@ void NV21ToARGBRow_NEON(const uint8* src_y, "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" "vmov.u16 q15, #16 \n" - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" READNV21 YUV422TORGB @@ -604,7 +604,7 @@ void NV12ToRGB565Row_NEON(const uint8* src_y, "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" "vmov.u16 q15, #16 \n" - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" READNV12 YUV422TORGB @@ -633,7 +633,7 @@ void NV21ToRGB565Row_NEON(const uint8* src_y, "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" "vmov.u16 q15, #16 \n" - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" READNV21 YUV422TORGB @@ -661,7 +661,7 @@ void YUY2ToARGBRow_NEON(const uint8* src_yuy2, "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" "vmov.u16 q15, #16 \n" - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" READYUY2 YUV422TORGB @@ -688,7 +688,7 @@ void UYVYToARGBRow_NEON(const uint8* src_uyvy, "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" "vmov.u16 q15, #16 \n" - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" READUYVY YUV422TORGB @@ -710,7 +710,7 @@ void UYVYToARGBRow_NEON(const uint8* src_uyvy, void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) { asm volatile ( - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld2.8 {q0, q1}, [%0]! 
\n" // load 16 pairs of UV "subs %3, %3, #16 \n" // 16 processed per loop @@ -730,7 +730,7 @@ void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv, int width) { asm volatile ( - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld1.8 {q0}, [%0]! \n" // load U "vld1.8 {q1}, [%1]! \n" // load V @@ -750,7 +750,7 @@ void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv, // Copy multiple of 32. vld4.8 allow unaligned and is fastest on a15. void CopyRow_NEON(const uint8* src, uint8* dst, int count) { asm volatile ( - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld1.8 {d0, d1, d2, d3}, [%0]! \n" // load 32 "subs %2, %2, #32 \n" // 32 processed per loop @@ -796,7 +796,7 @@ void MirrorRow_NEON(const uint8* src, uint8* dst, int width) { "add %0, %0, %2 \n" "sub %0, #16 \n" - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld1.8 {q0}, [%0], r3 \n" // src -= 16 "subs %2, #16 \n" // 16 pixels per loop. @@ -820,7 +820,7 @@ void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, "add %0, %0, %3, lsl #1 \n" "sub %0, #16 \n" - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16 "subs %3, #8 \n" // 8 pixels per loop. @@ -844,7 +844,7 @@ void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) { "add %0, %0, %2, lsl #2 \n" "sub %0, #16 \n" - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld1.8 {q0}, [%0], r3 \n" // src -= 16 "subs %2, #4 \n" // 4 pixels per loop. @@ -863,7 +863,7 @@ void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) { void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) { asm volatile ( "vmov.u8 d4, #255 \n" // Alpha - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RGB24. "subs %2, %2, #8 \n" // 8 processed per loop. @@ -880,7 +880,7 @@ void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) { void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) { asm volatile ( "vmov.u8 d4, #255 \n" // Alpha - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW. "subs %2, %2, #8 \n" // 8 processed per loop. @@ -910,7 +910,7 @@ void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) { void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int pix) { asm volatile ( "vmov.u8 d3, #255 \n" // Alpha - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. "subs %2, %2, #8 \n" // 8 processed per loop. @@ -956,7 +956,7 @@ void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb, int pix) { asm volatile ( "vmov.u8 d3, #255 \n" // Alpha - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. "subs %2, %2, #8 \n" // 8 processed per loop. @@ -985,7 +985,7 @@ void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb, int pix) { asm volatile ( "vmov.u8 d3, #255 \n" // Alpha - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. "subs %2, %2, #8 \n" // 8 processed per loop. @@ -1002,7 +1002,7 @@ void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb, void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) { asm volatile ( - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB. "subs %2, %2, #8 \n" // 8 processed per loop. 
@@ -1018,7 +1018,7 @@ void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) { void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) { asm volatile ( - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB. "subs %2, %2, #8 \n" // 8 processed per loop. @@ -1035,7 +1035,7 @@ void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) { void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) { asm volatile ( - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2. "subs %2, %2, #16 \n" // 16 processed per loop. @@ -1051,7 +1051,7 @@ void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) { void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) { asm volatile ( - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY. "subs %2, %2, #16 \n" // 16 processed per loop. @@ -1068,7 +1068,7 @@ void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) { void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v, int pix) { asm volatile ( - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2. "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. @@ -1087,7 +1087,7 @@ void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v, void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v, int pix) { asm volatile ( - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. @@ -1107,7 +1107,7 @@ void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2, uint8* dst_u, uint8* dst_v, int pix) { asm volatile ( "add %1, %0, %1 \n" // stride + src_yuy2 - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2. "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. @@ -1131,7 +1131,7 @@ void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy, uint8* dst_u, uint8* dst_v, int pix) { asm volatile ( "add %1, %0, %1 \n" // stride + src_uyvy - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. @@ -1193,6 +1193,23 @@ void ARGBToBayerRow_NEON(const uint8* src_argb, uint8* dst_bayer, ); } +// Select G channels from ARGB. e.g. GGGGGGGG +void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer, + uint32 /*selector*/, int pix) { + asm volatile ( + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load row 8 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop + "vst1.8 {d1}, [%1]! \n" // store 8 G's. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_bayer), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "q0", "q1" // Clobber List + ); +} + // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb, const uint8* shuffler, int pix) { @@ -1218,7 +1235,7 @@ void I422ToYUY2Row_NEON(const uint8* src_y, const uint8* src_v, uint8* dst_yuy2, int width) { asm volatile ( - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld2.8 {d0, d2}, [%0]! \n" // load 16 Ys "vld1.8 {d1}, [%1]! \n" // load 8 Us @@ -1241,7 +1258,7 @@ void I422ToUYVYRow_NEON(const uint8* src_y, const uint8* src_v, uint8* dst_uyvy, int width) { asm volatile ( - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld2.8 {d1, d3}, [%0]! \n" // load 16 Ys "vld1.8 {d0}, [%1]! 
\n" // load 8 Us @@ -1261,7 +1278,7 @@ void I422ToUYVYRow_NEON(const uint8* src_y, void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) { asm volatile ( - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. "subs %2, %2, #8 \n" // 8 processed per loop. @@ -1279,7 +1296,7 @@ void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) { void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555, int pix) { asm volatile ( - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. "subs %2, %2, #8 \n" // 8 processed per loop. @@ -1298,7 +1315,7 @@ void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444, int pix) { asm volatile ( "vmov.u8 d4, #0x0f \n" // bits to clear with vbic. - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. "subs %2, %2, #8 \n" // 8 processed per loop. @@ -1319,7 +1336,7 @@ void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient "vmov.u8 d27, #16 \n" // Add 16 constant - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. "subs %2, %2, #8 \n" // 8 processed per loop. @@ -1343,7 +1360,7 @@ void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { "vmov.u8 d24, #15 \n" // B * 0.11400 coefficient "vmov.u8 d25, #75 \n" // G * 0.58700 coefficient "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. "subs %2, %2, #8 \n" // 8 processed per loop. @@ -1371,7 +1388,7 @@ void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, "vmov.u8 d27, #18 \n" // VB -0.1406 coefficient "vmov.u8 d28, #94 \n" // VG -0.7344 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. "subs %3, %3, #8 \n" // 8 processed per loop. @@ -1410,7 +1427,7 @@ void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. @@ -1456,7 +1473,7 @@ void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, "vmov.s16 q13, #18 / 4 \n" // VB -0.1406 coefficient "vmov.s16 q14, #94 / 4 \n" // VG -0.7344 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. @@ -1521,7 +1538,7 @@ void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb, "vmov.s16 q13, #18 / 4 \n" // VB -0.1406 coefficient "vmov.s16 q14, #94 / 4 \n" // VG -0.7344 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. 
@@ -1560,7 +1577,7 @@ void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb, "vmov.s16 q13, #20 / 4 \n" // VB -0.08131 coefficient "vmov.s16 q14, #107 / 4 \n" // VG -0.41869 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. @@ -1598,7 +1615,7 @@ void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra, "vmov.s16 q13, #18 / 4 \n" // VB -0.1406 coefficient "vmov.s16 q14, #94 / 4 \n" // VG -0.7344 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 BGRA pixels. "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 BGRA pixels. @@ -1636,7 +1653,7 @@ void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr, "vmov.s16 q13, #18 / 4 \n" // VB -0.1406 coefficient "vmov.s16 q14, #94 / 4 \n" // VG -0.7344 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ABGR pixels. "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ABGR pixels. @@ -1674,7 +1691,7 @@ void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba, "vmov.s16 q13, #18 / 4 \n" // VB -0.1406 coefficient "vmov.s16 q14, #94 / 4 \n" // VG -0.7344 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 RGBA pixels. "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 RGBA pixels. @@ -1712,7 +1729,7 @@ void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24, "vmov.s16 q13, #18 / 4 \n" // VB -0.1406 coefficient "vmov.s16 q14, #94 / 4 \n" // VG -0.7344 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB24 pixels. "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RGB24 pixels. @@ -1750,7 +1767,7 @@ void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw, "vmov.s16 q13, #18 / 4 \n" // VB -0.1406 coefficient "vmov.s16 q14, #94 / 4 \n" // VG -0.7344 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RAW pixels. "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RAW pixels. @@ -1789,7 +1806,7 @@ void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565, "vmov.s16 q13, #18 / 4 \n" // VB -0.1406 coefficient "vmov.s16 q14, #94 / 4 \n" // VG -0.7344 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. RGB565TOARGB @@ -1849,7 +1866,7 @@ void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555, "vmov.s16 q13, #18 / 4 \n" // VB -0.1406 coefficient "vmov.s16 q14, #94 / 4 \n" // VG -0.7344 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. RGB555TOARGB @@ -1909,7 +1926,7 @@ void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444, "vmov.s16 q13, #18 / 4 \n" // VB -0.1406 coefficient "vmov.s16 q14, #94 / 4 \n" // VG -0.7344 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. 
ARGB4444TOARGB @@ -1964,7 +1981,7 @@ void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix) { "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient "vmov.u8 d27, #16 \n" // Add 16 constant - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. "subs %2, %2, #8 \n" // 8 processed per loop. @@ -1990,7 +2007,7 @@ void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix) { "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient "vmov.u8 d27, #16 \n" // Add 16 constant - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. "subs %2, %2, #8 \n" // 8 processed per loop. @@ -2016,7 +2033,7 @@ void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix) { "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient "vmov.u8 d27, #16 \n" // Add 16 constant - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. "subs %2, %2, #8 \n" // 8 processed per loop. @@ -2042,7 +2059,7 @@ void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix) { "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient "vmov.u8 d7, #16 \n" // Add 16 constant - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of BGRA. "subs %2, %2, #8 \n" // 8 processed per loop. @@ -2067,7 +2084,7 @@ void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix) { "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient "vmov.u8 d7, #16 \n" // Add 16 constant - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ABGR. "subs %2, %2, #8 \n" // 8 processed per loop. @@ -2092,7 +2109,7 @@ void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix) { "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient "vmov.u8 d6, #33 \n" // R * 0.2578 coefficient "vmov.u8 d7, #16 \n" // Add 16 constant - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of RGBA. "subs %2, %2, #8 \n" // 8 processed per loop. @@ -2117,7 +2134,7 @@ void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix) { "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient "vmov.u8 d6, #33 \n" // R * 0.2578 coefficient "vmov.u8 d7, #16 \n" // Add 16 constant - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RGB24. "subs %2, %2, #8 \n" // 8 processed per loop. @@ -2142,7 +2159,7 @@ void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) { "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient "vmov.u8 d7, #16 \n" // Add 16 constant - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RAW. "subs %2, %2, #8 \n" // 8 processed per loop. @@ -2412,7 +2429,7 @@ void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { "vmov.u8 d24, #15 \n" // B * 0.11400 coefficient "vmov.u8 d25, #75 \n" // G * 0.58700 coefficient "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. "subs %2, %2, #8 \n" // 8 processed per loop. 
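The *ToYRow_NEON hunks above only touch whitespace, but their coefficient comments spell out the 7-bit fixed-point luma weights: 13/128 ~= 0.1016 for B, 65/128 ~= 0.5078 for G, 33/128 ~= 0.2578 for R, plus the +16 video-range offset. The exact SIMD rounding is not visible in these hunks, so the sketch below (hypothetical ApproxARGBLuma, truncating shift) only shows the rough scalar arithmetic behind those comments:

#include <stdint.h>

static uint8_t ApproxARGBLuma(uint8_t b, uint8_t g, uint8_t r) {
  // (13 + 65 + 33) * 255 = 28305, so the weighted sum fits in an int and the
  // result stays below 255 even before any clamping.
  return (uint8_t)(((13 * b + 65 * g + 33 * r) >> 7) + 16);
}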
@@ -2447,7 +2464,7 @@ void ARGBSepiaRow_NEON(uint8* dst_argb, int width) { "vmov.u8 d28, #24 \n" // BB coefficient "vmov.u8 d29, #98 \n" // BG coefficient "vmov.u8 d30, #50 \n" // BR coefficient - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0] \n" // load 8 ARGB pixels. "subs %1, %1, #8 \n" // 8 processed per loop. @@ -2474,18 +2491,19 @@ void ARGBSepiaRow_NEON(uint8* dst_argb, int width) { } // Tranform 8 ARGB pixels (32 bytes) with color matrix. -// Same as Sepia except matrix is provided. -void ARGBColorMatrixRow_NEON(uint8* dst_argb, const int8* matrix_argb, - int width) { +// TODO(fbarchard): Was same as Sepia except matrix is provided. This function +// needs to saturate. Consider doing a non-saturating version. +void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb, + const int8* matrix_argb, int width) { asm volatile ( - "vld1.8 {q2}, [%2] \n" // load 3 ARGB vectors. + "vld1.8 {q2}, [%3] \n" // load 3 ARGB vectors. "vmovl.s8 q0, d4 \n" // B,G coefficients s16. - "vmovl.s8 q1, d5 \n" // R coefficients s16. + "vmovl.s8 q1, d5 \n" // R,A coefficients s16. - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" - "vld4.8 {d16, d18, d20, d22}, [%0] \n" // load 8 ARGB pixels. - "subs %1, %1, #8 \n" // 8 processed per loop. + "vld4.8 {d16, d18, d20, d22}, [%0]! \n" // load 8 ARGB pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. "vmovl.u8 q8, d16 \n" // b (0 .. 255) 16 bit "vmovl.u8 q9, d18 \n" // g "vmovl.u8 q10, d20 \n" // r @@ -2493,33 +2511,42 @@ void ARGBColorMatrixRow_NEON(uint8* dst_argb, const int8* matrix_argb, "vmul.s16 q12, q8, d0[0] \n" // B = B * Matrix B "vmul.s16 q13, q8, d1[0] \n" // G = B * Matrix G "vmul.s16 q14, q8, d2[0] \n" // R = B * Matrix R + "vmul.s16 q15, q8, d3[0] \n" // A = B * Matrix A "vmul.s16 q4, q9, d0[1] \n" // B += G * Matrix B "vmul.s16 q5, q9, d1[1] \n" // G += G * Matrix G "vmul.s16 q6, q9, d2[1] \n" // R += G * Matrix R + "vmul.s16 q7, q9, d3[1] \n" // A += G * Matrix A "vqadd.s16 q12, q12, q4 \n" // Accumulate B "vqadd.s16 q13, q13, q5 \n" // Accumulate G "vqadd.s16 q14, q14, q6 \n" // Accumulate R + "vqadd.s16 q15, q15, q7 \n" // Accumulate A "vmul.s16 q4, q10, d0[2] \n" // B += R * Matrix B "vmul.s16 q5, q10, d1[2] \n" // G += R * Matrix G "vmul.s16 q6, q10, d2[2] \n" // R += R * Matrix R + "vmul.s16 q7, q10, d3[2] \n" // A += R * Matrix A "vqadd.s16 q12, q12, q4 \n" // Accumulate B "vqadd.s16 q13, q13, q5 \n" // Accumulate G "vqadd.s16 q14, q14, q6 \n" // Accumulate R + "vqadd.s16 q15, q15, q7 \n" // Accumulate A "vmul.s16 q4, q15, d0[3] \n" // B += A * Matrix B "vmul.s16 q5, q15, d1[3] \n" // G += A * Matrix G "vmul.s16 q6, q15, d2[3] \n" // R += A * Matrix R + "vmul.s16 q7, q15, d3[3] \n" // A += A * Matrix A "vqadd.s16 q12, q12, q4 \n" // Accumulate B "vqadd.s16 q13, q13, q5 \n" // Accumulate G "vqadd.s16 q14, q14, q6 \n" // Accumulate R - "vqshrun.s16 d16, q12, #7 \n" // 16 bit to 8 bit B - "vqshrun.s16 d18, q13, #7 \n" // 16 bit to 8 bit G - "vqshrun.s16 d20, q14, #7 \n" // 16 bit to 8 bit R - "vst4.8 {d16, d18, d20, d22}, [%0]! \n" // store 8 ARGB pixels. - "bgt 1b \n" - : "+r"(dst_argb), // %0 - "+r"(width) // %1 - : "r"(matrix_argb) // %2 - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q8", "q9", + "vqadd.s16 q15, q15, q7 \n" // Accumulate A + "vqshrun.s16 d16, q12, #6 \n" // 16 bit to 8 bit B + "vqshrun.s16 d18, q13, #6 \n" // 16 bit to 8 bit G + "vqshrun.s16 d20, q14, #6 \n" // 16 bit to 8 bit R + "vqshrun.s16 d22, q15, #6 \n" // 16 bit to 8 bit A + "vst4.8 {d16, d18, d20, d22}, [%1]! 
\n" // store 8 ARGB pixels. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(matrix_argb) // %3 + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" ); } @@ -2531,7 +2558,7 @@ void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1, uint8* dst_argb, int width) { asm volatile ( // 8 pixel loop. - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. "vld4.8 {d1, d3, d5, d7}, [%1]! \n" // load 8 more ARGB pixels. @@ -2562,7 +2589,7 @@ void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1, uint8* dst_argb, int width) { asm volatile ( // 8 pixel loop. - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB pixels. @@ -2586,7 +2613,7 @@ void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1, uint8* dst_argb, int width) { asm volatile ( // 8 pixel loop. - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB pixels. @@ -2615,7 +2642,7 @@ void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, asm volatile ( "vmov.u8 d3, #255 \n" // alpha // 8 pixel loop. - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld1.8 {d0}, [%0]! \n" // load 8 sobelx. "vld1.8 {d1}, [%1]! \n" // load 8 sobely. @@ -2627,8 +2654,30 @@ void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, "bgt 1b \n" : "+r"(src_sobelx), // %0 "+r"(src_sobely), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1" + ); +} + +// Adds Sobel X and Sobel Y and stores Sobel into plane. +void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_y, int width) { + asm volatile ( + // 16 pixel loop. + ".p2align 2 \n" + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load 16 sobelx. + "vld1.8 {q1}, [%1]! \n" // load 16 sobely. + "subs %3, %3, #16 \n" // 16 processed per loop. + "vqadd.u8 q0, q0, q1 \n" // add + "vst1.8 {q0}, [%2]! \n" // store 16 pixels. + "bgt 1b \n" + : "+r"(src_sobelx), // %0 + "+r"(src_sobely), // %1 + "+r"(dst_y), // %2 + "+r"(width) // %3 : : "cc", "memory", "q0", "q1" ); @@ -2644,7 +2693,7 @@ void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, asm volatile ( "vmov.u8 d3, #255 \n" // alpha // 8 pixel loop. - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld1.8 {d2}, [%0]! \n" // load 8 sobelx. "vld1.8 {d0}, [%1]! \n" // load 8 sobely. 
@@ -2668,7 +2717,7 @@ void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1, const uint8* src_y2, uint8* dst_sobelx, int width) { asm volatile ( - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld1.8 {d0}, [%0],%5 \n" // top "vld1.8 {d1}, [%0],%6 \n" @@ -2705,7 +2754,7 @@ void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1, void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1, uint8* dst_sobely, int width) { asm volatile ( - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld1.8 {d0}, [%0],%4 \n" // left "vld1.8 {d1}, [%1],%4 \n" diff --git a/chromium/third_party/libyuv/source/row_posix.cc b/chromium/third_party/libyuv/source/row_posix.cc index b92a9f5c13b..539d871535b 100644 --- a/chromium/third_party/libyuv/source/row_posix.cc +++ b/chromium/third_party/libyuv/source/row_posix.cc @@ -10,155 +10,292 @@ #include "libyuv/row.h" -#include "libyuv/basic_types.h" - #ifdef __cplusplus namespace libyuv { extern "C" { #endif -// This module is for GCC x86 and x64 +// This module is for GCC x86 and x64. #if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) -// GCC 4.2 on OSX has link error when passing static or const to inline. -// TODO(fbarchard): Use static const when gcc 4.2 support is dropped. -#ifdef __APPLE__ -#define CONST +// TODO(nfullagar): For Native Client: When new toolchain becomes available, +// take advantage of bundle lock / unlock feature. This will reduce the amount +// of manual bundle alignment done below, and bundle alignment could even be +// moved into each macro that doesn't use %%nacl: such as MEMOPREG. Consider +// unmunging functions to reduce complex addressing modes. + +#if defined(__native_client__) && defined(__x86_64__) +#define MEMACCESS(base) "%%nacl:(%%r15,%q" #base ")" +#define MEMACCESS2(offset, base) "%%nacl:" #offset "(%%r15,%q" #base ")" +#define MEMLEA(offset, base) #offset "(%q" #base ")" +#define MEMLEA4(offset, base, index, scale) \ + #offset "(%q" #base ",%q" #index "," #scale ")" +#define MEMMOVESTRING(s, d) "%%nacl:(%q" #s "),%%nacl:(%q" #d "), %%r15" +#define MEMSTORESTRING(reg, d) "%%" #reg ",%%nacl:(%q" #d "), %%r15" +#define MEMOPREG(opcode, offset, base, index, scale, reg) \ + "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \ + #opcode " (%%r15,%%r14),%%" #reg "\n" +#define MEMOPMEM(opcode, reg, offset, base, index, scale) \ + "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \ + #opcode " %%" #reg ",(%%r15,%%r14)\n" +#define MEMOP(opcode, offset, base, index, scale) \ + "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \ + #opcode " (%%r15,%%r14)" +#define BUNDLEALIGN ".p2align 5\n" #else -#define CONST static const +#define MEMACCESS(base) "(%" #base ")" +#define MEMACCESS2(offset, base) #offset "(%" #base ")" +#define MEMLEA(offset, base) #offset "(%" #base ")" +#define MEMLEA4(offset, base, index, scale) \ + #offset "(%" #base ",%" #index "," #scale ")" +#define MEMMOVESTRING(s, d) +#define MEMSTORESTRING(reg, d) +#define MEMOPREG(opcode, offset, base, index, scale, reg) \ + #opcode " " #offset "(%" #base ",%" #index "," #scale "),%%" #reg "\n" +#define MEMOPMEM(opcode, reg, offset, base, index, scale) \ + #opcode " %%" #reg ","#offset "(%" #base ",%" #index "," #scale ")\n" +#define MEMOP(opcode, offset, base, index, scale) \ + #opcode " " #offset "(%" #base ",%" #index "," #scale ")" +#define BUNDLEALIGN #endif -#ifdef HAS_ARGBTOYROW_SSSE3 +#if defined(HAS_ARGBTOYROW_SSSE3) || 
defined(HAS_ARGBGRAYROW_SSSE3) // Constants for ARGB -CONST vec8 kARGBToY = { +static vec8 kARGBToY = { 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0 }; // JPeg full range. -CONST vec8 kARGBToYJ = { +static vec8 kARGBToYJ = { 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0 }; +#endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3) + +#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3) -CONST vec8 kARGBToU = { +static vec8 kARGBToU = { 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0 }; -CONST vec8 kARGBToUJ = { +static vec8 kARGBToUJ = { 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0 }; -CONST vec8 kARGBToV = { +static vec8 kARGBToV = { -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, }; -CONST vec8 kARGBToVJ = { +static vec8 kARGBToVJ = { -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0 }; // Constants for BGRA -CONST vec8 kBGRAToY = { +static vec8 kBGRAToY = { 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13 }; -CONST vec8 kBGRAToU = { +static vec8 kBGRAToU = { 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112 }; -CONST vec8 kBGRAToV = { +static vec8 kBGRAToV = { 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18 }; // Constants for ABGR -CONST vec8 kABGRToY = { +static vec8 kABGRToY = { 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0 }; -CONST vec8 kABGRToU = { +static vec8 kABGRToU = { -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0 }; -CONST vec8 kABGRToV = { +static vec8 kABGRToV = { 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0 }; // Constants for RGBA. -CONST vec8 kRGBAToY = { +static vec8 kRGBAToY = { 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33 }; -CONST vec8 kRGBAToU = { +static vec8 kRGBAToU = { 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38 }; -CONST vec8 kRGBAToV = { +static vec8 kRGBAToV = { 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112 }; -CONST uvec8 kAddY16 = { +static uvec8 kAddY16 = { 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u }; -CONST vec16 kAddYJ64 = { +static vec16 kAddYJ64 = { 64, 64, 64, 64, 64, 64, 64, 64 }; -CONST uvec8 kAddUV128 = { +static uvec8 kAddUV128 = { 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u }; -CONST uvec16 kAddUVJ128 = { +static uvec16 kAddUVJ128 = { 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u }; +#endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3) + +#ifdef HAS_RGB24TOARGBROW_SSSE3 // Shuffle table for converting RGB24 to ARGB. -CONST uvec8 kShuffleMaskRGB24ToARGB = { +static uvec8 kShuffleMaskRGB24ToARGB = { 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u }; // Shuffle table for converting RAW to ARGB. -CONST uvec8 kShuffleMaskRAWToARGB = { +static uvec8 kShuffleMaskRAWToARGB = { 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u }; // Shuffle table for converting ARGB to RGB24. -CONST uvec8 kShuffleMaskARGBToRGB24 = { +static uvec8 kShuffleMaskARGBToRGB24 = { 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u }; // Shuffle table for converting ARGB to RAW. 
-CONST uvec8 kShuffleMaskARGBToRAW = { +static uvec8 kShuffleMaskARGBToRAW = { 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u }; // Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4 -CONST uvec8 kShuffleMaskARGBToRGB24_0 = { +static uvec8 kShuffleMaskARGBToRGB24_0 = { 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u }; // Shuffle table for converting ARGB to RAW. -CONST uvec8 kShuffleMaskARGBToRAW_0 = { +static uvec8 kShuffleMaskARGBToRAW_0 = { 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u }; +#endif // HAS_RGB24TOARGBROW_SSSE3 + +#if defined(TESTING) && defined(__x86_64__) +void TestRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { + asm volatile ( + ".p2align 5 \n" + "mov %%eax,%%eax \n" + "mov %%ebx,%%ebx \n" + "mov %%ecx,%%ecx \n" + "mov %%edx,%%edx \n" + "mov %%esi,%%esi \n" + "mov %%edi,%%edi \n" + "mov %%ebp,%%ebp \n" + "mov %%esp,%%esp \n" + ".p2align 5 \n" + "mov %%r8d,%%r8d \n" + "mov %%r9d,%%r9d \n" + "mov %%r10d,%%r10d \n" + "mov %%r11d,%%r11d \n" + "mov %%r12d,%%r12d \n" + "mov %%r13d,%%r13d \n" + "mov %%r14d,%%r14d \n" + "mov %%r15d,%%r15d \n" + ".p2align 5 \n" + "lea (%%rax),%%eax \n" + "lea (%%rbx),%%ebx \n" + "lea (%%rcx),%%ecx \n" + "lea (%%rdx),%%edx \n" + "lea (%%rsi),%%esi \n" + "lea (%%rdi),%%edi \n" + "lea (%%rbp),%%ebp \n" + "lea (%%rsp),%%esp \n" + ".p2align 5 \n" + "lea (%%r8),%%r8d \n" + "lea (%%r9),%%r9d \n" + "lea (%%r10),%%r10d \n" + "lea (%%r11),%%r11d \n" + "lea (%%r12),%%r12d \n" + "lea (%%r13),%%r13d \n" + "lea (%%r14),%%r14d \n" + "lea (%%r15),%%r15d \n" + + ".p2align 5 \n" + "lea 0x10(%%rax),%%eax \n" + "lea 0x10(%%rbx),%%ebx \n" + "lea 0x10(%%rcx),%%ecx \n" + "lea 0x10(%%rdx),%%edx \n" + "lea 0x10(%%rsi),%%esi \n" + "lea 0x10(%%rdi),%%edi \n" + "lea 0x10(%%rbp),%%ebp \n" + "lea 0x10(%%rsp),%%esp \n" + ".p2align 5 \n" + "lea 0x10(%%r8),%%r8d \n" + "lea 0x10(%%r9),%%r9d \n" + "lea 0x10(%%r10),%%r10d \n" + "lea 0x10(%%r11),%%r11d \n" + "lea 0x10(%%r12),%%r12d \n" + "lea 0x10(%%r13),%%r13d \n" + "lea 0x10(%%r14),%%r14d \n" + "lea 0x10(%%r15),%%r15d \n" + + ".p2align 5 \n" + "add 0x10,%%eax \n" + "add 0x10,%%ebx \n" + "add 0x10,%%ecx \n" + "add 0x10,%%edx \n" + "add 0x10,%%esi \n" + "add 0x10,%%edi \n" + "add 0x10,%%ebp \n" + "add 0x10,%%esp \n" + ".p2align 5 \n" + "add 0x10,%%r8d \n" + "add 0x10,%%r9d \n" + "add 0x10,%%r10d \n" + "add 0x10,%%r11d \n" + "add 0x10,%%r12d \n" + "add 0x10,%%r13d \n" + "add 0x10,%%r14d \n" + "add 0x10,%%r15d \n" + + ".p2align 2 \n" + "1: \n" + "movq " MEMACCESS(0) ",%%xmm0 \n" + "lea " MEMLEA(0x8,0) ",%0 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x20,1) ",%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_y), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm5" +#endif + ); +} +#endif // TESTING +#ifdef HAS_I400TOARGBROW_SSE2 void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { asm volatile ( "pcmpeqb %%xmm5,%%xmm5 \n" "pslld $0x18,%%xmm5 \n" - ".p2align 4 \n" + ".p2align 2 \n" "1: \n" - "movq (%0),%%xmm0 \n" - "lea 0x8(%0),%0 \n" + "movq " MEMACCESS(0) ",%%xmm0 \n" + "lea " MEMLEA(0x8,0) ",%0 \n" "punpcklbw %%xmm0,%%xmm0 \n" "movdqa %%xmm0,%%xmm1 \n" "punpcklwd %%xmm0,%%xmm0 \n" "punpckhwd %%xmm1,%%xmm1 \n" "por %%xmm5,%%xmm0 \n" "por %%xmm5,%%xmm1 \n" - "movdqa %%xmm0,(%1) \n" - "movdqa %%xmm1,0x10(%1) \n" - "lea 0x20(%1),%1 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n" + 
"lea " MEMLEA(0x20,1) ",%1 \n" "sub $0x8,%2 \n" "jg 1b \n" : "+r"(src_y), // %0 @@ -177,19 +314,19 @@ void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb, asm volatile ( "pcmpeqb %%xmm5,%%xmm5 \n" "pslld $0x18,%%xmm5 \n" - ".p2align 4 \n" + ".p2align 2 \n" "1: \n" - "movq (%0),%%xmm0 \n" - "lea 0x8(%0),%0 \n" + "movq " MEMACCESS(0) ",%%xmm0 \n" + "lea " MEMLEA(0x8,0) ",%0 \n" "punpcklbw %%xmm0,%%xmm0 \n" "movdqa %%xmm0,%%xmm1 \n" "punpcklwd %%xmm0,%%xmm0 \n" "punpckhwd %%xmm1,%%xmm1 \n" "por %%xmm5,%%xmm0 \n" "por %%xmm5,%%xmm1 \n" - "movdqu %%xmm0,(%1) \n" - "movdqu %%xmm1,0x10(%1) \n" - "lea 0x20(%1),%1 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" + "lea " MEMLEA(0x20,1) ",%1 \n" "sub $0x8,%2 \n" "jg 1b \n" : "+r"(src_y), // %0 @@ -202,36 +339,39 @@ void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb, #endif ); } +#endif // HAS_I400TOARGBROW_SSE2 +#ifdef HAS_RGB24TOARGBROW_SSSE3 void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) { asm volatile ( "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000 "pslld $0x18,%%xmm5 \n" "movdqa %3,%%xmm4 \n" - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm3 \n" - "lea 0x30(%0),%0 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm3 \n" + "lea " MEMLEA(0x30,0) ",%0 \n" "movdqa %%xmm3,%%xmm2 \n" "palignr $0x8,%%xmm1,%%xmm2 \n" "pshufb %%xmm4,%%xmm2 \n" "por %%xmm5,%%xmm2 \n" "palignr $0xc,%%xmm0,%%xmm1 \n" "pshufb %%xmm4,%%xmm0 \n" - "movdqa %%xmm2,0x20(%1) \n" + "movdqa %%xmm2," MEMACCESS2(0x20,1) " \n" "por %%xmm5,%%xmm0 \n" "pshufb %%xmm4,%%xmm1 \n" - "movdqa %%xmm0,(%1) \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" "por %%xmm5,%%xmm1 \n" "palignr $0x4,%%xmm3,%%xmm3 \n" "pshufb %%xmm4,%%xmm3 \n" - "movdqa %%xmm1,0x10(%1) \n" + "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n" "por %%xmm5,%%xmm3 \n" "sub $0x10,%2 \n" - "movdqa %%xmm3,0x30(%1) \n" - "lea 0x40(%1),%1 \n" + "movdqa %%xmm3," MEMACCESS2(0x30,1) " \n" + "lea " MEMLEA(0x40,1) ",%1 \n" "jg 1b \n" : "+r"(src_rgb24), // %0 "+r"(dst_argb), // %1 @@ -249,30 +389,31 @@ void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) { "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000 "pslld $0x18,%%xmm5 \n" "movdqa %3,%%xmm4 \n" - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm3 \n" - "lea 0x30(%0),%0 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm3 \n" + "lea " MEMLEA(0x30,0) ",%0 \n" "movdqa %%xmm3,%%xmm2 \n" "palignr $0x8,%%xmm1,%%xmm2 \n" "pshufb %%xmm4,%%xmm2 \n" "por %%xmm5,%%xmm2 \n" "palignr $0xc,%%xmm0,%%xmm1 \n" "pshufb %%xmm4,%%xmm0 \n" - "movdqa %%xmm2,0x20(%1) \n" + "movdqa %%xmm2," MEMACCESS2(0x20,1) " \n" "por %%xmm5,%%xmm0 \n" "pshufb %%xmm4,%%xmm1 \n" - "movdqa %%xmm0,(%1) \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" "por %%xmm5,%%xmm1 \n" "palignr $0x4,%%xmm3,%%xmm3 \n" "pshufb %%xmm4,%%xmm3 \n" - "movdqa %%xmm1,0x10(%1) \n" + "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n" "por %%xmm5,%%xmm3 \n" "sub $0x10,%2 \n" - "movdqa %%xmm3,0x30(%1) \n" - "lea 0x40(%1),%1 \n" + "movdqa %%xmm3," MEMACCESS2(0x30,1) " \n" + "lea " MEMLEA(0x40,1) ",%1 \n" "jg 1b \n" : "+r"(src_raw), // %0 "+r"(dst_argb), // %1 @@ -302,9 +443,10 @@ void RGB565ToARGBRow_SSE2(const uint8* src, 
uint8* dst, int pix) { "psllw $0x8,%%xmm7 \n" "sub %0,%1 \n" "sub %0,%1 \n" - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" "movdqa %%xmm0,%%xmm1 \n" "movdqa %%xmm0,%%xmm2 \n" "pand %%xmm3,%%xmm1 \n" @@ -319,9 +461,10 @@ void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) { "movdqa %%xmm1,%%xmm2 \n" "punpcklbw %%xmm0,%%xmm1 \n" "punpckhbw %%xmm0,%%xmm2 \n" - "movdqa %%xmm1,(%1,%0,2) \n" - "movdqa %%xmm2,0x10(%1,%0,2) \n" - "lea 0x10(%0),%0 \n" + BUNDLEALIGN + MEMOPMEM(movdqa,xmm1,0x00,1,0,2) // movdqa %%xmm1,(%1,%0,2) + MEMOPMEM(movdqa,xmm2,0x10,1,0,2) // movdqa %%xmm2,0x10(%1,%0,2) + "lea " MEMLEA(0x10,0) ",%0 \n" "sub $0x8,%2 \n" "jg 1b \n" : "+r"(src), // %0 @@ -329,6 +472,9 @@ void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) { "+r"(pix) // %2 : : "memory", "cc", "eax" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" #endif @@ -351,9 +497,10 @@ void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) { "psllw $0x8,%%xmm7 \n" "sub %0,%1 \n" "sub %0,%1 \n" - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" "movdqa %%xmm0,%%xmm1 \n" "movdqa %%xmm0,%%xmm2 \n" "psllw $0x1,%%xmm1 \n" @@ -372,9 +519,10 @@ void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) { "movdqa %%xmm1,%%xmm2 \n" "punpcklbw %%xmm0,%%xmm1 \n" "punpckhbw %%xmm0,%%xmm2 \n" - "movdqa %%xmm1,(%1,%0,2) \n" - "movdqa %%xmm2,0x10(%1,%0,2) \n" - "lea 0x10(%0),%0 \n" + BUNDLEALIGN + MEMOPMEM(movdqa,xmm1,0x00,1,0,2) // movdqa %%xmm1,(%1,%0,2) + MEMOPMEM(movdqa,xmm2,0x10,1,0,2) // movdqa %%xmm2,0x10(%1,%0,2) + "lea " MEMLEA(0x10,0) ",%0 \n" "sub $0x8,%2 \n" "jg 1b \n" : "+r"(src), // %0 @@ -382,6 +530,9 @@ void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) { "+r"(pix) // %2 : : "memory", "cc", "eax" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" #endif @@ -397,9 +548,10 @@ void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) { "pslld $0x4,%%xmm5 \n" "sub %0,%1 \n" "sub %0,%1 \n" - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" "movdqa %%xmm0,%%xmm2 \n" "pand %%xmm4,%%xmm0 \n" "pand %%xmm5,%%xmm2 \n" @@ -412,9 +564,10 @@ void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) { "movdqa %%xmm0,%%xmm1 \n" "punpcklbw %%xmm2,%%xmm0 \n" "punpckhbw %%xmm2,%%xmm1 \n" - "movdqa %%xmm0,(%1,%0,2) \n" - "movdqa %%xmm1,0x10(%1,%0,2) \n" - "lea 0x10(%0),%0 \n" + BUNDLEALIGN + MEMOPMEM(movdqa,xmm0,0x00,1,0,2) // movdqa %%xmm0,(%1,%0,2) + MEMOPMEM(movdqa,xmm1,0x10,1,0,2) // movdqa %%xmm1,0x10(%1,%0,2) + "lea " MEMLEA(0x10,0) ",%0 \n" "sub $0x8,%2 \n" "jg 1b \n" : "+r"(src), // %0 @@ -422,6 +575,9 @@ void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) { "+r"(pix) // %2 : : "memory", "cc", "eax" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" #endif @@ -431,13 +587,14 @@ void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) { void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) { asm volatile ( "movdqa %3,%%xmm6 \n" - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movdqa 
(%0),%%xmm0 \n" - "movdqa 0x10(%0),%%xmm1 \n" - "movdqa 0x20(%0),%%xmm2 \n" - "movdqa 0x30(%0),%%xmm3 \n" - "lea 0x40(%0),%0 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" "pshufb %%xmm6,%%xmm0 \n" "pshufb %%xmm6,%%xmm1 \n" "pshufb %%xmm6,%%xmm2 \n" @@ -448,14 +605,14 @@ void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) { "movdqa %%xmm2,%%xmm5 \n" "por %%xmm4,%%xmm0 \n" "pslldq $0x8,%%xmm5 \n" - "movdqa %%xmm0,(%1) \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" "por %%xmm5,%%xmm1 \n" "psrldq $0x8,%%xmm2 \n" "pslldq $0x4,%%xmm3 \n" "por %%xmm3,%%xmm2 \n" - "movdqa %%xmm1,0x10(%1) \n" - "movdqa %%xmm2,0x20(%1) \n" - "lea 0x30(%1),%1 \n" + "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" + "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n" + "lea " MEMLEA(0x30,1) ",%1 \n" "sub $0x10,%2 \n" "jg 1b \n" : "+r"(src), // %0 @@ -472,13 +629,14 @@ void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) { void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int pix) { asm volatile ( "movdqa %3,%%xmm6 \n" - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movdqa (%0),%%xmm0 \n" - "movdqa 0x10(%0),%%xmm1 \n" - "movdqa 0x20(%0),%%xmm2 \n" - "movdqa 0x30(%0),%%xmm3 \n" - "lea 0x40(%0),%0 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" "pshufb %%xmm6,%%xmm0 \n" "pshufb %%xmm6,%%xmm1 \n" "pshufb %%xmm6,%%xmm2 \n" @@ -489,14 +647,14 @@ void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int pix) { "movdqa %%xmm2,%%xmm5 \n" "por %%xmm4,%%xmm0 \n" "pslldq $0x8,%%xmm5 \n" - "movdqa %%xmm0,(%1) \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" "por %%xmm5,%%xmm1 \n" "psrldq $0x8,%%xmm2 \n" "pslldq $0x4,%%xmm3 \n" "por %%xmm3,%%xmm2 \n" - "movdqa %%xmm1,0x10(%1) \n" - "movdqa %%xmm2,0x20(%1) \n" - "lea 0x30(%1),%1 \n" + "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" + "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n" + "lea " MEMLEA(0x30,1) ",%1 \n" "sub $0x10,%2 \n" "jg 1b \n" : "+r"(src), // %0 @@ -519,9 +677,10 @@ void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int pix) { "pslld $0x5,%%xmm4 \n" "pcmpeqb %%xmm5,%%xmm5 \n" "pslld $0xb,%%xmm5 \n" - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movdqa (%0),%%xmm0 \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" "movdqa %%xmm0,%%xmm1 \n" "movdqa %%xmm0,%%xmm2 \n" "pslld $0x8,%%xmm0 \n" @@ -534,9 +693,9 @@ void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int pix) { "por %%xmm2,%%xmm1 \n" "por %%xmm1,%%xmm0 \n" "packssdw %%xmm0,%%xmm0 \n" - "lea 0x10(%0),%0 \n" - "movq %%xmm0,(%1) \n" - "lea 0x8(%1),%1 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" + "movq %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x8,1) ",%1 \n" "sub $0x4,%2 \n" "jg 1b \n" : "+r"(src), // %0 @@ -560,9 +719,10 @@ void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int pix) { "pslld $0xa,%%xmm6 \n" "pcmpeqb %%xmm7,%%xmm7 \n" "pslld $0xf,%%xmm7 \n" - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movdqa (%0),%%xmm0 \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" "movdqa %%xmm0,%%xmm1 \n" "movdqa %%xmm0,%%xmm2 \n" "movdqa %%xmm0,%%xmm3 \n" @@ -578,9 +738,9 @@ void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int pix) { "por %%xmm3,%%xmm2 \n" "por %%xmm2,%%xmm0 \n" "packssdw %%xmm0,%%xmm0 \n" - "lea 0x10(%0),%0 \n" - "movq %%xmm0,(%1) \n" - "lea 0x8(%1),%1 \n" + 
"lea " MEMLEA(0x10,0) ",%0 \n" + "movq %%xmm0," MEMACCESS(1) " \n" + "lea " MEMACCESS2(0x8,1) ",%1 \n" "sub $0x4,%2 \n" "jg 1b \n" : "+r"(src), // %0 @@ -600,9 +760,10 @@ void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) { "psllw $0xc,%%xmm4 \n" "movdqa %%xmm4,%%xmm3 \n" "psrlw $0x8,%%xmm3 \n" - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movdqa (%0),%%xmm0 \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" "movdqa %%xmm0,%%xmm1 \n" "pand %%xmm3,%%xmm0 \n" "pand %%xmm4,%%xmm1 \n" @@ -610,9 +771,9 @@ void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) { "psrlq $0x8,%%xmm1 \n" "por %%xmm1,%%xmm0 \n" "packuswb %%xmm0,%%xmm0 \n" - "lea 0x10(%0),%0 \n" - "movq %%xmm0,(%1) \n" - "lea 0x8(%1),%1 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" + "movq %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x8,1) ",%1 \n" "sub $0x4,%2 \n" "jg 1b \n" : "+r"(src), // %0 @@ -625,22 +786,24 @@ void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) { #endif ); } +#endif // HAS_RGB24TOARGBROW_SSSE3 +#ifdef HAS_ARGBTOYROW_SSSE3 void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { asm volatile ( "movdqa %4,%%xmm5 \n" "movdqa %3,%%xmm4 \n" - ".p2align 4 \n" + ".p2align 2 \n" "1: \n" - "movdqa (%0),%%xmm0 \n" - "movdqa 0x10(%0),%%xmm1 \n" - "movdqa 0x20(%0),%%xmm2 \n" - "movdqa 0x30(%0),%%xmm3 \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n" "pmaddubsw %%xmm4,%%xmm0 \n" "pmaddubsw %%xmm4,%%xmm1 \n" "pmaddubsw %%xmm4,%%xmm2 \n" "pmaddubsw %%xmm4,%%xmm3 \n" - "lea 0x40(%0),%0 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" "phaddw %%xmm1,%%xmm0 \n" "phaddw %%xmm3,%%xmm2 \n" "psrlw $0x7,%%xmm0 \n" @@ -648,8 +811,8 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { "packuswb %%xmm2,%%xmm0 \n" "paddb %%xmm5,%%xmm0 \n" "sub $0x10,%2 \n" - "movdqa %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_y), // %1 @@ -663,74 +826,76 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { ); } -void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { +void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { asm volatile ( - "movdqa %3,%%xmm4 \n" "movdqa %4,%%xmm5 \n" - ".p2align 4 \n" + "movdqa %3,%%xmm4 \n" + ".p2align 2 \n" "1: \n" - "movdqa (%0),%%xmm0 \n" - "movdqa 0x10(%0),%%xmm1 \n" - "movdqa 0x20(%0),%%xmm2 \n" - "movdqa 0x30(%0),%%xmm3 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" "pmaddubsw %%xmm4,%%xmm0 \n" "pmaddubsw %%xmm4,%%xmm1 \n" "pmaddubsw %%xmm4,%%xmm2 \n" "pmaddubsw %%xmm4,%%xmm3 \n" - "lea 0x40(%0),%0 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" "phaddw %%xmm1,%%xmm0 \n" "phaddw %%xmm3,%%xmm2 \n" - "paddw %%xmm5,%%xmm0 \n" - "paddw %%xmm5,%%xmm2 \n" "psrlw $0x7,%%xmm0 \n" "psrlw $0x7,%%xmm2 \n" "packuswb %%xmm2,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" "sub $0x10,%2 \n" - "movdqa %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_y), // %1 "+r"(pix) // %2 - : "m"(kARGBToYJ), // %3 - "m"(kAddYJ64) // %4 + : "m"(kARGBToY), // %3 + "m"(kAddY16) // %4 : "memory", "cc" #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" #endif ); } +#endif // 
HAS_ARGBTOYROW_SSSE3 -void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { +#ifdef HAS_ARGBTOYJROW_SSSE3 +void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { asm volatile ( - "movdqa %4,%%xmm5 \n" "movdqa %3,%%xmm4 \n" - ".p2align 4 \n" + "movdqa %4,%%xmm5 \n" + ".p2align 2 \n" "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x30(%0),%%xmm3 \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n" "pmaddubsw %%xmm4,%%xmm0 \n" "pmaddubsw %%xmm4,%%xmm1 \n" "pmaddubsw %%xmm4,%%xmm2 \n" "pmaddubsw %%xmm4,%%xmm3 \n" - "lea 0x40(%0),%0 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" "phaddw %%xmm1,%%xmm0 \n" "phaddw %%xmm3,%%xmm2 \n" + "paddw %%xmm5,%%xmm0 \n" + "paddw %%xmm5,%%xmm2 \n" "psrlw $0x7,%%xmm0 \n" "psrlw $0x7,%%xmm2 \n" "packuswb %%xmm2,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" "sub $0x10,%2 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_y), // %1 "+r"(pix) // %2 - : "m"(kARGBToY), // %3 - "m"(kAddY16) // %4 + : "m"(kARGBToYJ), // %3 + "m"(kAddYJ64) // %4 : "memory", "cc" #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" @@ -742,17 +907,17 @@ void ARGBToYJRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { asm volatile ( "movdqa %3,%%xmm4 \n" "movdqa %4,%%xmm5 \n" - ".p2align 4 \n" + ".p2align 2 \n" "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x30(%0),%%xmm3 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" "pmaddubsw %%xmm4,%%xmm0 \n" "pmaddubsw %%xmm4,%%xmm1 \n" "pmaddubsw %%xmm4,%%xmm2 \n" "pmaddubsw %%xmm4,%%xmm3 \n" - "lea 0x40(%0),%0 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" "phaddw %%xmm1,%%xmm0 \n" "phaddw %%xmm3,%%xmm2 \n" "paddw %%xmm5,%%xmm0 \n" @@ -761,8 +926,8 @@ void ARGBToYJRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { "psrlw $0x7,%%xmm2 \n" "packuswb %%xmm2,%%xmm0 \n" "sub $0x10,%2 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_y), // %1 @@ -775,7 +940,9 @@ void ARGBToYJRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { #endif ); } +#endif // HAS_ARGBTOYJROW_SSSE3 +#ifdef HAS_ARGBTOUVROW_SSSE3 // TODO(fbarchard): pass xmm constants to single block of assembly. // fpic on GCC 4.2 for OSX runs out of GPR registers. "m" effectively takes // 3 registers - ebx, ebp and eax. 
"m" can be passed with 3 normal registers, @@ -794,17 +961,19 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, ); asm volatile ( "sub %1,%2 \n" - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movdqa (%0),%%xmm0 \n" - "movdqa 0x10(%0),%%xmm1 \n" - "movdqa 0x20(%0),%%xmm2 \n" - "movdqa 0x30(%0),%%xmm6 \n" - "pavgb (%0,%4,1),%%xmm0 \n" - "pavgb 0x10(%0,%4,1),%%xmm1 \n" - "pavgb 0x20(%0,%4,1),%%xmm2 \n" - "pavgb 0x30(%0,%4,1),%%xmm6 \n" - "lea 0x40(%0),%0 \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n" + BUNDLEALIGN + MEMOPREG(pavgb,0x00,0,4,1,xmm0) // pavgb (%0,%4,1),%%xmm0 + MEMOPREG(pavgb,0x10,0,4,1,xmm1) // pavgb 0x10(%0,%4,1),%%xmm1 + MEMOPREG(pavgb,0x20,0,4,1,xmm2) // pavgb 0x20(%0,%4,1),%%xmm2 + MEMOPREG(pavgb,0x30,0,4,1,xmm6) // pavgb 0x30(%0,%4,1),%%xmm6 + "lea " MEMLEA(0x40,0) ",%0 \n" "movdqa %%xmm0,%%xmm7 \n" "shufps $0x88,%%xmm1,%%xmm0 \n" "shufps $0xdd,%%xmm1,%%xmm7 \n" @@ -826,16 +995,20 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, "packsswb %%xmm1,%%xmm0 \n" "paddb %%xmm5,%%xmm0 \n" "sub $0x10,%3 \n" - "movlps %%xmm0,(%1) \n" - "movhps %%xmm0,(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" + "movlps %%xmm0," MEMACCESS(1) " \n" + BUNDLEALIGN + MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) + "lea " MEMLEA(0x8,1) ",%1 \n" "jg 1b \n" : "+r"(src_argb0), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 "+rm"(width) // %3 - : "r"(static_cast<intptr_t>(src_stride_argb)) + : "r"(static_cast<intptr_t>(src_stride_argb)) // %4 : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" #endif @@ -856,17 +1029,19 @@ void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb, ); asm volatile ( "sub %1,%2 \n" - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movdqa (%0),%%xmm0 \n" - "movdqa 0x10(%0),%%xmm1 \n" - "movdqa 0x20(%0),%%xmm2 \n" - "movdqa 0x30(%0),%%xmm6 \n" - "pavgb (%0,%4,1),%%xmm0 \n" - "pavgb 0x10(%0,%4,1),%%xmm1 \n" - "pavgb 0x20(%0,%4,1),%%xmm2 \n" - "pavgb 0x30(%0,%4,1),%%xmm6 \n" - "lea 0x40(%0),%0 \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n" + BUNDLEALIGN + MEMOPREG(pavgb,0x00,0,4,1,xmm0) // pavgb (%0,%4,1),%%xmm0 + MEMOPREG(pavgb,0x10,0,4,1,xmm1) // pavgb 0x10(%0,%4,1),%%xmm1 + MEMOPREG(pavgb,0x20,0,4,1,xmm2) // pavgb 0x20(%0,%4,1),%%xmm2 + MEMOPREG(pavgb,0x30,0,4,1,xmm6) // pavgb 0x30(%0,%4,1),%%xmm6 + "lea " MEMLEA(0x40,0) ",%0 \n" "movdqa %%xmm0,%%xmm7 \n" "shufps $0x88,%%xmm1,%%xmm0 \n" "shufps $0xdd,%%xmm1,%%xmm7 \n" @@ -889,16 +1064,20 @@ void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb, "psraw $0x8,%%xmm1 \n" "packsswb %%xmm1,%%xmm0 \n" "sub $0x10,%3 \n" - "movlps %%xmm0,(%1) \n" - "movhps %%xmm0,(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" + "movlps %%xmm0," MEMACCESS(1) " \n" + BUNDLEALIGN + MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) + "lea " MEMLEA(0x8,1) ",%1 \n" "jg 1b \n" : "+r"(src_argb0), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 "+rm"(width) // %3 - : "r"(static_cast<intptr_t>(src_stride_argb)) + : "r"(static_cast<intptr_t>(src_stride_argb)) // %4 : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" #endif @@ 
-918,21 +1097,23 @@ void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb, ); asm volatile ( "sub %1,%2 \n" - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x30(%0),%%xmm6 \n" - "movdqu (%0,%4,1),%%xmm7 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" + BUNDLEALIGN + MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 "pavgb %%xmm7,%%xmm0 \n" - "movdqu 0x10(%0,%4,1),%%xmm7 \n" + MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7 "pavgb %%xmm7,%%xmm1 \n" - "movdqu 0x20(%0,%4,1),%%xmm7 \n" + MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7 "pavgb %%xmm7,%%xmm2 \n" - "movdqu 0x30(%0,%4,1),%%xmm7 \n" + MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7 "pavgb %%xmm7,%%xmm6 \n" - "lea 0x40(%0),%0 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" "movdqa %%xmm0,%%xmm7 \n" "shufps $0x88,%%xmm1,%%xmm0 \n" "shufps $0xdd,%%xmm1,%%xmm7 \n" @@ -954,16 +1135,20 @@ void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb, "packsswb %%xmm1,%%xmm0 \n" "paddb %%xmm5,%%xmm0 \n" "sub $0x10,%3 \n" - "movlps %%xmm0,(%1) \n" - "movhps %%xmm0,(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" + "movlps %%xmm0," MEMACCESS(1) " \n" + BUNDLEALIGN + MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) + "lea " MEMLEA(0x8,1) ",%1 \n" "jg 1b \n" : "+r"(src_argb0), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 "+rm"(width) // %3 - : "r"(static_cast<intptr_t>(src_stride_argb)) + : "r"(static_cast<intptr_t>(src_stride_argb)) // %4 : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" #endif @@ -983,21 +1168,23 @@ void ARGBToUVJRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb, ); asm volatile ( "sub %1,%2 \n" - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x30(%0),%%xmm6 \n" - "movdqu (%0,%4,1),%%xmm7 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" + BUNDLEALIGN + MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 "pavgb %%xmm7,%%xmm0 \n" - "movdqu 0x10(%0,%4,1),%%xmm7 \n" + MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7 "pavgb %%xmm7,%%xmm1 \n" - "movdqu 0x20(%0,%4,1),%%xmm7 \n" + MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7 "pavgb %%xmm7,%%xmm2 \n" - "movdqu 0x30(%0,%4,1),%%xmm7 \n" + MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7 "pavgb %%xmm7,%%xmm6 \n" - "lea 0x40(%0),%0 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" "movdqa %%xmm0,%%xmm7 \n" "shufps $0x88,%%xmm1,%%xmm0 \n" "shufps $0xdd,%%xmm1,%%xmm7 \n" @@ -1020,9 +1207,10 @@ void ARGBToUVJRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb, "psraw $0x8,%%xmm1 \n" "packsswb %%xmm1,%%xmm0 \n" "sub $0x10,%3 \n" - "movlps %%xmm0,(%1) \n" - "movhps %%xmm0,(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" + "movlps %%xmm0," MEMACCESS(1) " \n" + BUNDLEALIGN + MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) + "lea " MEMLEA(0x8,1) ",%1 \n" "jg 1b \n" : "+r"(src_argb0), // %0 "+r"(dst_u), // %1 @@ -1030,6 +1218,9 @@ void ARGBToUVJRow_Unaligned_SSSE3(const uint8* src_argb0, int 
src_stride_argb, "+rm"(width) // %3 : "r"(static_cast<intptr_t>(src_stride_argb)) : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" #endif @@ -1049,12 +1240,13 @@ void ARGBToUV444Row_SSSE3(const uint8* src_argb, uint8* dst_u, uint8* dst_v, ); asm volatile ( "sub %1,%2 \n" - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movdqa (%0),%%xmm0 \n" - "movdqa 0x10(%0),%%xmm1 \n" - "movdqa 0x20(%0),%%xmm2 \n" - "movdqa 0x30(%0),%%xmm6 \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n" "pmaddubsw %%xmm4,%%xmm0 \n" "pmaddubsw %%xmm4,%%xmm1 \n" "pmaddubsw %%xmm4,%%xmm2 \n" @@ -1066,11 +1258,11 @@ void ARGBToUV444Row_SSSE3(const uint8* src_argb, uint8* dst_u, uint8* dst_v, "packsswb %%xmm2,%%xmm0 \n" "paddb %%xmm5,%%xmm0 \n" "sub $0x10,%3 \n" - "movdqa %%xmm0,(%1) \n" - "movdqa (%0),%%xmm0 \n" - "movdqa 0x10(%0),%%xmm1 \n" - "movdqa 0x20(%0),%%xmm2 \n" - "movdqa 0x30(%0),%%xmm6 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n" "pmaddubsw %%xmm3,%%xmm0 \n" "pmaddubsw %%xmm3,%%xmm1 \n" "pmaddubsw %%xmm3,%%xmm2 \n" @@ -1081,9 +1273,10 @@ void ARGBToUV444Row_SSSE3(const uint8* src_argb, uint8* dst_u, uint8* dst_v, "psraw $0x8,%%xmm2 \n" "packsswb %%xmm2,%%xmm0 \n" "paddb %%xmm5,%%xmm0 \n" - "lea 0x40(%0),%0 \n" - "movdqa %%xmm0,(%1,%2,1) \n" - "lea 0x10(%1),%1 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" + BUNDLEALIGN + MEMOPMEM(movdqa,xmm0,0x00,1,2,1) // movdqa %%xmm0,(%1,%2,1) + "lea " MEMLEA(0x10,1) ",%1 \n" "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_u), // %1 @@ -1091,6 +1284,9 @@ void ARGBToUV444Row_SSSE3(const uint8* src_argb, uint8* dst_u, uint8* dst_v, "+rm"(width) // %3 : : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm6" #endif @@ -1110,12 +1306,13 @@ void ARGBToUV444Row_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_u, ); asm volatile ( "sub %1,%2 \n" - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x30(%0),%%xmm6 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" "pmaddubsw %%xmm4,%%xmm0 \n" "pmaddubsw %%xmm4,%%xmm1 \n" "pmaddubsw %%xmm4,%%xmm2 \n" @@ -1127,11 +1324,11 @@ void ARGBToUV444Row_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_u, "packsswb %%xmm2,%%xmm0 \n" "paddb %%xmm5,%%xmm0 \n" "sub $0x10,%3 \n" - "movdqu %%xmm0,(%1) \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x30(%0),%%xmm6 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" "pmaddubsw %%xmm3,%%xmm0 \n" "pmaddubsw %%xmm3,%%xmm1 \n" "pmaddubsw %%xmm3,%%xmm2 \n" @@ -1142,9 +1339,10 @@ void ARGBToUV444Row_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_u, "psraw $0x8,%%xmm2 \n" "packsswb %%xmm2,%%xmm0 \n" "paddb %%xmm5,%%xmm0 \n" - "lea 0x40(%0),%0 \n" - "movdqu %%xmm0,(%1,%2,1) \n" - "lea 0x10(%1),%1 \n" + 
"lea " MEMLEA(0x40,0) ",%0 \n" + BUNDLEALIGN + MEMOPMEM(movdqu,xmm0,0x00,1,2,1) // movdqu %%xmm0,(%1,%2,1) + "lea " MEMLEA(0x10,1) ",%1 \n" "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_u), // %1 @@ -1152,6 +1350,9 @@ void ARGBToUV444Row_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_u, "+rm"(width) // %3 : : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm6" #endif @@ -1171,13 +1372,14 @@ void ARGBToUV422Row_SSSE3(const uint8* src_argb0, ); asm volatile ( "sub %1,%2 \n" - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movdqa (%0),%%xmm0 \n" - "movdqa 0x10(%0),%%xmm1 \n" - "movdqa 0x20(%0),%%xmm2 \n" - "movdqa 0x30(%0),%%xmm6 \n" - "lea 0x40(%0),%0 \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" "movdqa %%xmm0,%%xmm7 \n" "shufps $0x88,%%xmm1,%%xmm0 \n" "shufps $0xdd,%%xmm1,%%xmm7 \n" @@ -1199,9 +1401,10 @@ void ARGBToUV422Row_SSSE3(const uint8* src_argb0, "packsswb %%xmm1,%%xmm0 \n" "paddb %%xmm5,%%xmm0 \n" "sub $0x10,%3 \n" - "movlps %%xmm0,(%1) \n" - "movhps %%xmm0,(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" + "movlps %%xmm0," MEMACCESS(1) " \n" + BUNDLEALIGN + MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) + "lea " MEMLEA(0x8,1) ",%1 \n" "jg 1b \n" : "+r"(src_argb0), // %0 "+r"(dst_u), // %1 @@ -1209,6 +1412,9 @@ void ARGBToUV422Row_SSSE3(const uint8* src_argb0, "+rm"(width) // %3 : : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" #endif @@ -1228,13 +1434,14 @@ void ARGBToUV422Row_Unaligned_SSSE3(const uint8* src_argb0, ); asm volatile ( "sub %1,%2 \n" - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x30(%0),%%xmm6 \n" - "lea 0x40(%0),%0 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" "movdqa %%xmm0,%%xmm7 \n" "shufps $0x88,%%xmm1,%%xmm0 \n" "shufps $0xdd,%%xmm1,%%xmm7 \n" @@ -1256,9 +1463,10 @@ void ARGBToUV422Row_Unaligned_SSSE3(const uint8* src_argb0, "packsswb %%xmm1,%%xmm0 \n" "paddb %%xmm5,%%xmm0 \n" "sub $0x10,%3 \n" - "movlps %%xmm0,(%1) \n" - "movhps %%xmm0,(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" + "movlps %%xmm0," MEMACCESS(1) " \n" + BUNDLEALIGN + MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) + "lea " MEMLEA(0x8,1) ",%1 \n" "jg 1b \n" : "+r"(src_argb0), // %0 "+r"(dst_u), // %1 @@ -1266,6 +1474,9 @@ void ARGBToUV422Row_Unaligned_SSSE3(const uint8* src_argb0, "+rm"(width) // %3 : : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" #endif @@ -1276,17 +1487,18 @@ void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) { asm volatile ( "movdqa %4,%%xmm5 \n" "movdqa %3,%%xmm4 \n" - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movdqa (%0),%%xmm0 \n" - "movdqa 0x10(%0),%%xmm1 \n" - "movdqa 0x20(%0),%%xmm2 \n" - "movdqa 0x30(%0),%%xmm3 \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n" "pmaddubsw 
%%xmm4,%%xmm0 \n" "pmaddubsw %%xmm4,%%xmm1 \n" "pmaddubsw %%xmm4,%%xmm2 \n" "pmaddubsw %%xmm4,%%xmm3 \n" - "lea 0x40(%0),%0 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" "phaddw %%xmm1,%%xmm0 \n" "phaddw %%xmm3,%%xmm2 \n" "psrlw $0x7,%%xmm0 \n" @@ -1294,8 +1506,8 @@ void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) { "packuswb %%xmm2,%%xmm0 \n" "paddb %%xmm5,%%xmm0 \n" "sub $0x10,%2 \n" - "movdqa %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" "jg 1b \n" : "+r"(src_bgra), // %0 "+r"(dst_y), // %1 @@ -1313,17 +1525,18 @@ void BGRAToYRow_Unaligned_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) { asm volatile ( "movdqa %4,%%xmm5 \n" "movdqa %3,%%xmm4 \n" - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x30(%0),%%xmm3 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" "pmaddubsw %%xmm4,%%xmm0 \n" "pmaddubsw %%xmm4,%%xmm1 \n" "pmaddubsw %%xmm4,%%xmm2 \n" "pmaddubsw %%xmm4,%%xmm3 \n" - "lea 0x40(%0),%0 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" "phaddw %%xmm1,%%xmm0 \n" "phaddw %%xmm3,%%xmm2 \n" "psrlw $0x7,%%xmm0 \n" @@ -1331,8 +1544,8 @@ void BGRAToYRow_Unaligned_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) { "packuswb %%xmm2,%%xmm0 \n" "paddb %%xmm5,%%xmm0 \n" "sub $0x10,%2 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" "jg 1b \n" : "+r"(src_bgra), // %0 "+r"(dst_y), // %1 @@ -1359,17 +1572,19 @@ void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra, ); asm volatile ( "sub %1,%2 \n" - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movdqa (%0),%%xmm0 \n" - "movdqa 0x10(%0),%%xmm1 \n" - "movdqa 0x20(%0),%%xmm2 \n" - "movdqa 0x30(%0),%%xmm6 \n" - "pavgb (%0,%4,1),%%xmm0 \n" - "pavgb 0x10(%0,%4,1),%%xmm1 \n" - "pavgb 0x20(%0,%4,1),%%xmm2 \n" - "pavgb 0x30(%0,%4,1),%%xmm6 \n" - "lea 0x40(%0),%0 \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n" + BUNDLEALIGN + MEMOPREG(pavgb,0x00,0,4,1,xmm0) // pavgb (%0,%4,1),%%xmm0 + MEMOPREG(pavgb,0x10,0,4,1,xmm1) // pavgb 0x10(%0,%4,1),%%xmm1 + MEMOPREG(pavgb,0x20,0,4,1,xmm2) // pavgb 0x20(%0,%4,1),%%xmm2 + MEMOPREG(pavgb,0x30,0,4,1,xmm6) // pavgb 0x30(%0,%4,1),%%xmm6 + "lea " MEMLEA(0x40,0) ",%0 \n" "movdqa %%xmm0,%%xmm7 \n" "shufps $0x88,%%xmm1,%%xmm0 \n" "shufps $0xdd,%%xmm1,%%xmm7 \n" @@ -1391,16 +1606,20 @@ void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra, "packsswb %%xmm1,%%xmm0 \n" "paddb %%xmm5,%%xmm0 \n" "sub $0x10,%3 \n" - "movlps %%xmm0,(%1) \n" - "movhps %%xmm0,(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" + "movlps %%xmm0," MEMACCESS(1) " \n" + BUNDLEALIGN + MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) + "lea " MEMLEA(0x8,1) ",%1 \n" "jg 1b \n" : "+r"(src_bgra0), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 "+rm"(width) // %3 - : "r"(static_cast<intptr_t>(src_stride_bgra)) + : "r"(static_cast<intptr_t>(src_stride_bgra)) // %4 : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" #endif @@ -1420,21 +1639,23 @@ void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_bgra0, int src_stride_bgra, ); asm volatile ( "sub 
%1,%2 \n" - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x30(%0),%%xmm6 \n" - "movdqu (%0,%4,1),%%xmm7 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" + BUNDLEALIGN + MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 "pavgb %%xmm7,%%xmm0 \n" - "movdqu 0x10(%0,%4,1),%%xmm7 \n" + MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7 "pavgb %%xmm7,%%xmm1 \n" - "movdqu 0x20(%0,%4,1),%%xmm7 \n" + MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7 "pavgb %%xmm7,%%xmm2 \n" - "movdqu 0x30(%0,%4,1),%%xmm7 \n" + MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7 "pavgb %%xmm7,%%xmm6 \n" - "lea 0x40(%0),%0 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" "movdqa %%xmm0,%%xmm7 \n" "shufps $0x88,%%xmm1,%%xmm0 \n" "shufps $0xdd,%%xmm1,%%xmm7 \n" @@ -1456,16 +1677,20 @@ void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_bgra0, int src_stride_bgra, "packsswb %%xmm1,%%xmm0 \n" "paddb %%xmm5,%%xmm0 \n" "sub $0x10,%3 \n" - "movlps %%xmm0,(%1) \n" - "movhps %%xmm0,(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" + "movlps %%xmm0," MEMACCESS(1) " \n" + BUNDLEALIGN + MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) + "lea " MEMLEA(0x8,1) ",%1 \n" "jg 1b \n" : "+r"(src_bgra0), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 "+rm"(width) // %3 - : "r"(static_cast<intptr_t>(src_stride_bgra)) + : "r"(static_cast<intptr_t>(src_stride_bgra)) // %4 : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" #endif @@ -1476,17 +1701,18 @@ void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) { asm volatile ( "movdqa %4,%%xmm5 \n" "movdqa %3,%%xmm4 \n" - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movdqa (%0),%%xmm0 \n" - "movdqa 0x10(%0),%%xmm1 \n" - "movdqa 0x20(%0),%%xmm2 \n" - "movdqa 0x30(%0),%%xmm3 \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n" "pmaddubsw %%xmm4,%%xmm0 \n" "pmaddubsw %%xmm4,%%xmm1 \n" "pmaddubsw %%xmm4,%%xmm2 \n" "pmaddubsw %%xmm4,%%xmm3 \n" - "lea 0x40(%0),%0 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" "phaddw %%xmm1,%%xmm0 \n" "phaddw %%xmm3,%%xmm2 \n" "psrlw $0x7,%%xmm0 \n" @@ -1494,8 +1720,8 @@ void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) { "packuswb %%xmm2,%%xmm0 \n" "paddb %%xmm5,%%xmm0 \n" "sub $0x10,%2 \n" - "movdqa %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" "jg 1b \n" : "+r"(src_abgr), // %0 "+r"(dst_y), // %1 @@ -1513,17 +1739,18 @@ void ABGRToYRow_Unaligned_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) { asm volatile ( "movdqa %4,%%xmm5 \n" "movdqa %3,%%xmm4 \n" - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x30(%0),%%xmm3 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" "pmaddubsw %%xmm4,%%xmm0 \n" "pmaddubsw %%xmm4,%%xmm1 \n" "pmaddubsw %%xmm4,%%xmm2 \n" "pmaddubsw %%xmm4,%%xmm3 \n" - "lea 0x40(%0),%0 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" "phaddw %%xmm1,%%xmm0 \n" 
"phaddw %%xmm3,%%xmm2 \n" "psrlw $0x7,%%xmm0 \n" @@ -1531,8 +1758,8 @@ void ABGRToYRow_Unaligned_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) { "packuswb %%xmm2,%%xmm0 \n" "paddb %%xmm5,%%xmm0 \n" "sub $0x10,%2 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" "jg 1b \n" : "+r"(src_abgr), // %0 "+r"(dst_y), // %1 @@ -1550,17 +1777,18 @@ void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) { asm volatile ( "movdqa %4,%%xmm5 \n" "movdqa %3,%%xmm4 \n" - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movdqa (%0),%%xmm0 \n" - "movdqa 0x10(%0),%%xmm1 \n" - "movdqa 0x20(%0),%%xmm2 \n" - "movdqa 0x30(%0),%%xmm3 \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n" "pmaddubsw %%xmm4,%%xmm0 \n" "pmaddubsw %%xmm4,%%xmm1 \n" "pmaddubsw %%xmm4,%%xmm2 \n" "pmaddubsw %%xmm4,%%xmm3 \n" - "lea 0x40(%0),%0 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" "phaddw %%xmm1,%%xmm0 \n" "phaddw %%xmm3,%%xmm2 \n" "psrlw $0x7,%%xmm0 \n" @@ -1568,8 +1796,8 @@ void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) { "packuswb %%xmm2,%%xmm0 \n" "paddb %%xmm5,%%xmm0 \n" "sub $0x10,%2 \n" - "movdqa %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" "jg 1b \n" : "+r"(src_rgba), // %0 "+r"(dst_y), // %1 @@ -1587,17 +1815,18 @@ void RGBAToYRow_Unaligned_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) { asm volatile ( "movdqa %4,%%xmm5 \n" "movdqa %3,%%xmm4 \n" - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x30(%0),%%xmm3 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" "pmaddubsw %%xmm4,%%xmm0 \n" "pmaddubsw %%xmm4,%%xmm1 \n" "pmaddubsw %%xmm4,%%xmm2 \n" "pmaddubsw %%xmm4,%%xmm3 \n" - "lea 0x40(%0),%0 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" "phaddw %%xmm1,%%xmm0 \n" "phaddw %%xmm3,%%xmm2 \n" "psrlw $0x7,%%xmm0 \n" @@ -1605,8 +1834,8 @@ void RGBAToYRow_Unaligned_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) { "packuswb %%xmm2,%%xmm0 \n" "paddb %%xmm5,%%xmm0 \n" "sub $0x10,%2 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" "jg 1b \n" : "+r"(src_rgba), // %0 "+r"(dst_y), // %1 @@ -1633,17 +1862,19 @@ void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr, ); asm volatile ( "sub %1,%2 \n" - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movdqa (%0),%%xmm0 \n" - "movdqa 0x10(%0),%%xmm1 \n" - "movdqa 0x20(%0),%%xmm2 \n" - "movdqa 0x30(%0),%%xmm6 \n" - "pavgb (%0,%4,1),%%xmm0 \n" - "pavgb 0x10(%0,%4,1),%%xmm1 \n" - "pavgb 0x20(%0,%4,1),%%xmm2 \n" - "pavgb 0x30(%0,%4,1),%%xmm6 \n" - "lea 0x40(%0),%0 \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n" + BUNDLEALIGN + MEMOPREG(pavgb,0x00,0,4,1,xmm0) // pavgb (%0,%4,1),%%xmm0 + MEMOPREG(pavgb,0x10,0,4,1,xmm1) // pavgb 0x10(%0,%4,1),%%xmm1 + MEMOPREG(pavgb,0x20,0,4,1,xmm2) // pavgb 0x20(%0,%4,1),%%xmm2 + MEMOPREG(pavgb,0x30,0,4,1,xmm6) // pavgb 0x30(%0,%4,1),%%xmm6 + "lea " MEMLEA(0x40,0) ",%0 \n" "movdqa %%xmm0,%%xmm7 \n" "shufps $0x88,%%xmm1,%%xmm0 
\n" "shufps $0xdd,%%xmm1,%%xmm7 \n" @@ -1665,16 +1896,20 @@ void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr, "packsswb %%xmm1,%%xmm0 \n" "paddb %%xmm5,%%xmm0 \n" "sub $0x10,%3 \n" - "movlps %%xmm0,(%1) \n" - "movhps %%xmm0,(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" + "movlps %%xmm0," MEMACCESS(1) " \n" + BUNDLEALIGN + MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) + "lea " MEMLEA(0x8,1) ",%1 \n" "jg 1b \n" : "+r"(src_abgr0), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 "+rm"(width) // %3 - : "r"(static_cast<intptr_t>(src_stride_abgr)) + : "r"(static_cast<intptr_t>(src_stride_abgr)) // %4 : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" #endif @@ -1694,21 +1929,23 @@ void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_abgr0, int src_stride_abgr, ); asm volatile ( "sub %1,%2 \n" - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x30(%0),%%xmm6 \n" - "movdqu (%0,%4,1),%%xmm7 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" + BUNDLEALIGN + MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 "pavgb %%xmm7,%%xmm0 \n" - "movdqu 0x10(%0,%4,1),%%xmm7 \n" + MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7 "pavgb %%xmm7,%%xmm1 \n" - "movdqu 0x20(%0,%4,1),%%xmm7 \n" + MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7 "pavgb %%xmm7,%%xmm2 \n" - "movdqu 0x30(%0,%4,1),%%xmm7 \n" + MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7 "pavgb %%xmm7,%%xmm6 \n" - "lea 0x40(%0),%0 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" "movdqa %%xmm0,%%xmm7 \n" "shufps $0x88,%%xmm1,%%xmm0 \n" "shufps $0xdd,%%xmm1,%%xmm7 \n" @@ -1730,16 +1967,20 @@ void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_abgr0, int src_stride_abgr, "packsswb %%xmm1,%%xmm0 \n" "paddb %%xmm5,%%xmm0 \n" "sub $0x10,%3 \n" - "movlps %%xmm0,(%1) \n" - "movhps %%xmm0,(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" + "movlps %%xmm0," MEMACCESS(1) " \n" + BUNDLEALIGN + MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) + "lea " MEMLEA(0x8,1) ",%1 \n" "jg 1b \n" : "+r"(src_abgr0), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 "+rm"(width) // %3 - : "r"(static_cast<intptr_t>(src_stride_abgr)) + : "r"(static_cast<intptr_t>(src_stride_abgr)) // %4 : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" #endif @@ -1759,17 +2000,19 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba, ); asm volatile ( "sub %1,%2 \n" - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movdqa (%0),%%xmm0 \n" - "movdqa 0x10(%0),%%xmm1 \n" - "movdqa 0x20(%0),%%xmm2 \n" - "movdqa 0x30(%0),%%xmm6 \n" - "pavgb (%0,%4,1),%%xmm0 \n" - "pavgb 0x10(%0,%4,1),%%xmm1 \n" - "pavgb 0x20(%0,%4,1),%%xmm2 \n" - "pavgb 0x30(%0,%4,1),%%xmm6 \n" - "lea 0x40(%0),%0 \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n" + BUNDLEALIGN + MEMOPREG(pavgb,0x00,0,4,1,xmm0) // pavgb (%0,%4,1),%%xmm0 + MEMOPREG(pavgb,0x10,0,4,1,xmm1) // pavgb 0x10(%0,%4,1),%%xmm1 + MEMOPREG(pavgb,0x20,0,4,1,xmm2) // pavgb 0x20(%0,%4,1),%%xmm2 + MEMOPREG(pavgb,0x30,0,4,1,xmm6) // pavgb 
0x30(%0,%4,1),%%xmm6 + "lea " MEMLEA(0x40,0) ",%0 \n" "movdqa %%xmm0,%%xmm7 \n" "shufps $0x88,%%xmm1,%%xmm0 \n" "shufps $0xdd,%%xmm1,%%xmm7 \n" @@ -1791,9 +2034,10 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba, "packsswb %%xmm1,%%xmm0 \n" "paddb %%xmm5,%%xmm0 \n" "sub $0x10,%3 \n" - "movlps %%xmm0,(%1) \n" - "movhps %%xmm0,(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" + "movlps %%xmm0," MEMACCESS(1) " \n" + BUNDLEALIGN + MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) + "lea " MEMLEA(0x8,1) ",%1 \n" "jg 1b \n" : "+r"(src_rgba0), // %0 "+r"(dst_u), // %1 @@ -1801,6 +2045,9 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba, "+rm"(width) // %3 : "r"(static_cast<intptr_t>(src_stride_rgba)) : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" #endif @@ -1820,21 +2067,23 @@ void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_rgba0, int src_stride_rgba, ); asm volatile ( "sub %1,%2 \n" - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x30(%0),%%xmm6 \n" - "movdqu (%0,%4,1),%%xmm7 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" + BUNDLEALIGN + MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 "pavgb %%xmm7,%%xmm0 \n" - "movdqu 0x10(%0,%4,1),%%xmm7 \n" + MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7 "pavgb %%xmm7,%%xmm1 \n" - "movdqu 0x20(%0,%4,1),%%xmm7 \n" + MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7 "pavgb %%xmm7,%%xmm2 \n" - "movdqu 0x30(%0,%4,1),%%xmm7 \n" + MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7 "pavgb %%xmm7,%%xmm6 \n" - "lea 0x40(%0),%0 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" "movdqa %%xmm0,%%xmm7 \n" "shufps $0x88,%%xmm1,%%xmm0 \n" "shufps $0xdd,%%xmm1,%%xmm7 \n" @@ -1856,22 +2105,26 @@ void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_rgba0, int src_stride_rgba, "packsswb %%xmm1,%%xmm0 \n" "paddb %%xmm5,%%xmm0 \n" "sub $0x10,%3 \n" - "movlps %%xmm0,(%1) \n" - "movhps %%xmm0,(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" + "movlps %%xmm0," MEMACCESS(1) " \n" + BUNDLEALIGN + MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) + "lea " MEMLEA(0x8,1) ",%1 \n" "jg 1b \n" : "+r"(src_rgba0), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 "+rm"(width) // %3 - : "r"(static_cast<intptr_t>(src_stride_rgba)) + : "r"(static_cast<intptr_t>(src_stride_rgba)) // %4 : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" #endif ); } -#endif // HAS_ARGBTOYROW_SSSE3 +#endif // HAS_ARGBTOUVROW_SSSE3 #ifdef HAS_I422TOARGBROW_SSSE3 #define UB 127 /* min(63,static_cast<int8>(2.018 * 64)) */ @@ -1901,7 +2154,7 @@ struct { vec8 kVUToB; // 128 vec8 kVUToG; // 144 vec8 kVUToR; // 160 -} CONST SIMD_ALIGNED(kYuvConstants) = { +} static SIMD_ALIGNED(kYuvConstants) = { { UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB }, { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG }, { UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR }, @@ -1918,83 +2171,86 @@ struct { // Read 8 UV from 411 #define READYUV444 \ - "movq (%[u_buf]),%%xmm0 \n" \ - "movq (%[u_buf],%[v_buf],1),%%xmm1 \n" \ - "lea 0x8(%[u_buf]),%[u_buf] \n" \ - "punpcklbw %%xmm1,%%xmm0 \n" \ + "movq 
" MEMACCESS([u_buf]) ",%%xmm0 \n" \ + BUNDLEALIGN \ + MEMOPREG(movq, 0x00, [u_buf], [v_buf], 1, xmm1) \ + "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \ + "punpcklbw %%xmm1,%%xmm0 \n" // Read 4 UV from 422, upsample to 8 UV #define READYUV422 \ - "movd (%[u_buf]),%%xmm0 \n" \ - "movd (%[u_buf],%[v_buf],1),%%xmm1 \n" \ - "lea 0x4(%[u_buf]),%[u_buf] \n" \ - "punpcklbw %%xmm1,%%xmm0 \n" \ - "punpcklwd %%xmm0,%%xmm0 \n" \ + "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \ + BUNDLEALIGN \ + MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \ + "lea " MEMLEA(0x4, [u_buf]) ",%[u_buf] \n" \ + "punpcklbw %%xmm1,%%xmm0 \n" \ + "punpcklwd %%xmm0,%%xmm0 \n" // Read 2 UV from 411, upsample to 8 UV #define READYUV411 \ - "movd (%[u_buf]),%%xmm0 \n" \ - "movd (%[u_buf],%[v_buf],1),%%xmm1 \n" \ - "lea 0x2(%[u_buf]),%[u_buf] \n" \ - "punpcklbw %%xmm1,%%xmm0 \n" \ - "punpcklwd %%xmm0,%%xmm0 \n" \ - "punpckldq %%xmm0,%%xmm0 \n" \ + "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \ + BUNDLEALIGN \ + MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \ + "lea " MEMLEA(0x2, [u_buf]) ",%[u_buf] \n" \ + "punpcklbw %%xmm1,%%xmm0 \n" \ + "punpcklwd %%xmm0,%%xmm0 \n" \ + "punpckldq %%xmm0,%%xmm0 \n" // Read 4 UV from NV12, upsample to 8 UV #define READNV12 \ - "movq (%[uv_buf]),%%xmm0 \n" \ - "lea 0x8(%[uv_buf]),%[uv_buf] \n" \ - "punpcklwd %%xmm0,%%xmm0 \n" \ + "movq " MEMACCESS([uv_buf]) ",%%xmm0 \n" \ + "lea " MEMLEA(0x8, [uv_buf]) ",%[uv_buf] \n" \ + "punpcklwd %%xmm0,%%xmm0 \n" // Convert 8 pixels: 8 UV and 8 Y #define YUVTORGB \ - "movdqa %%xmm0,%%xmm1 \n" \ - "movdqa %%xmm0,%%xmm2 \n" \ - "pmaddubsw (%[kYuvConstants]),%%xmm0 \n" \ - "pmaddubsw 16(%[kYuvConstants]),%%xmm1 \n" \ - "pmaddubsw 32(%[kYuvConstants]),%%xmm2 \n" \ - "psubw 48(%[kYuvConstants]),%%xmm0 \n" \ - "psubw 64(%[kYuvConstants]),%%xmm1 \n" \ - "psubw 80(%[kYuvConstants]),%%xmm2 \n" \ - "movq (%[y_buf]),%%xmm3 \n" \ - "lea 0x8(%[y_buf]),%[y_buf] \n" \ - "punpcklbw %%xmm4,%%xmm3 \n" \ - "psubsw 96(%[kYuvConstants]),%%xmm3 \n" \ - "pmullw 112(%[kYuvConstants]),%%xmm3 \n" \ - "paddsw %%xmm3,%%xmm0 \n" \ - "paddsw %%xmm3,%%xmm1 \n" \ - "paddsw %%xmm3,%%xmm2 \n" \ - "psraw $0x6,%%xmm0 \n" \ - "psraw $0x6,%%xmm1 \n" \ - "psraw $0x6,%%xmm2 \n" \ - "packuswb %%xmm0,%%xmm0 \n" \ - "packuswb %%xmm1,%%xmm1 \n" \ - "packuswb %%xmm2,%%xmm2 \n" \ + "movdqa %%xmm0,%%xmm1 \n" \ + "movdqa %%xmm0,%%xmm2 \n" \ + "pmaddubsw " MEMACCESS([kYuvConstants]) ",%%xmm0 \n" \ + "pmaddubsw " MEMACCESS2(16, [kYuvConstants]) ",%%xmm1 \n" \ + "pmaddubsw " MEMACCESS2(32, [kYuvConstants]) ",%%xmm2 \n" \ + "psubw " MEMACCESS2(48, [kYuvConstants]) ",%%xmm0 \n" \ + "psubw " MEMACCESS2(64, [kYuvConstants]) ",%%xmm1 \n" \ + "psubw " MEMACCESS2(80, [kYuvConstants]) ",%%xmm2 \n" \ + "movq " MEMACCESS([y_buf]) ",%%xmm3 \n" \ + "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" \ + "punpcklbw %%xmm4,%%xmm3 \n" \ + "psubsw " MEMACCESS2(96, [kYuvConstants]) ",%%xmm3 \n" \ + "pmullw " MEMACCESS2(112, [kYuvConstants]) ",%%xmm3 \n" \ + "paddsw %%xmm3,%%xmm0 \n" \ + "paddsw %%xmm3,%%xmm1 \n" \ + "paddsw %%xmm3,%%xmm2 \n" \ + "psraw $0x6,%%xmm0 \n" \ + "psraw $0x6,%%xmm1 \n" \ + "psraw $0x6,%%xmm2 \n" \ + "packuswb %%xmm0,%%xmm0 \n" \ + "packuswb %%xmm1,%%xmm1 \n" \ + "packuswb %%xmm2,%%xmm2 \n" // Convert 8 pixels: 8 VU and 8 Y #define YVUTORGB \ - "movdqa %%xmm0,%%xmm1 \n" \ - "movdqa %%xmm0,%%xmm2 \n" \ - "pmaddubsw 128(%[kYuvConstants]),%%xmm0 \n" \ - "pmaddubsw 144(%[kYuvConstants]),%%xmm1 \n" \ - "pmaddubsw 160(%[kYuvConstants]),%%xmm2 \n" \ - "psubw 48(%[kYuvConstants]),%%xmm0 \n" \ - "psubw 
64(%[kYuvConstants]),%%xmm1 \n" \ - "psubw 80(%[kYuvConstants]),%%xmm2 \n" \ - "movq (%[y_buf]),%%xmm3 \n" \ - "lea 0x8(%[y_buf]),%[y_buf] \n" \ - "punpcklbw %%xmm4,%%xmm3 \n" \ - "psubsw 96(%[kYuvConstants]),%%xmm3 \n" \ - "pmullw 112(%[kYuvConstants]),%%xmm3 \n" \ - "paddsw %%xmm3,%%xmm0 \n" \ - "paddsw %%xmm3,%%xmm1 \n" \ - "paddsw %%xmm3,%%xmm2 \n" \ - "psraw $0x6,%%xmm0 \n" \ - "psraw $0x6,%%xmm1 \n" \ - "psraw $0x6,%%xmm2 \n" \ - "packuswb %%xmm0,%%xmm0 \n" \ - "packuswb %%xmm1,%%xmm1 \n" \ - "packuswb %%xmm2,%%xmm2 \n" \ + "movdqa %%xmm0,%%xmm1 \n" \ + "movdqa %%xmm0,%%xmm2 \n" \ + "pmaddubsw " MEMACCESS2(128, [kYuvConstants]) ",%%xmm0 \n" \ + "pmaddubsw " MEMACCESS2(144, [kYuvConstants]) ",%%xmm1 \n" \ + "pmaddubsw " MEMACCESS2(160, [kYuvConstants]) ",%%xmm2 \n" \ + "psubw " MEMACCESS2(48, [kYuvConstants]) ",%%xmm0 \n" \ + "psubw " MEMACCESS2(64, [kYuvConstants]) ",%%xmm1 \n" \ + "psubw " MEMACCESS2(80, [kYuvConstants]) ",%%xmm2 \n" \ + "movq " MEMACCESS([y_buf]) ",%%xmm3 \n" \ + "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" \ + "punpcklbw %%xmm4,%%xmm3 \n" \ + "psubsw " MEMACCESS2(96, [kYuvConstants]) ",%%xmm3 \n" \ + "pmullw " MEMACCESS2(112, [kYuvConstants]) ",%%xmm3 \n" \ + "paddsw %%xmm3,%%xmm0 \n" \ + "paddsw %%xmm3,%%xmm1 \n" \ + "paddsw %%xmm3,%%xmm2 \n" \ + "psraw $0x6,%%xmm0 \n" \ + "psraw $0x6,%%xmm1 \n" \ + "psraw $0x6,%%xmm2 \n" \ + "packuswb %%xmm0,%%xmm0 \n" \ + "packuswb %%xmm1,%%xmm1 \n" \ + "packuswb %%xmm2,%%xmm2 \n" void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf, const uint8* u_buf, @@ -2005,7 +2261,7 @@ void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf, "sub %[u_buf],%[v_buf] \n" "pcmpeqb %%xmm5,%%xmm5 \n" "pxor %%xmm4,%%xmm4 \n" - ".p2align 4 \n" + ".p2align 2 \n" "1: \n" READYUV444 YUVTORGB @@ -2014,9 +2270,9 @@ void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf, "movdqa %%xmm0,%%xmm1 \n" "punpcklwd %%xmm2,%%xmm0 \n" "punpckhwd %%xmm2,%%xmm1 \n" - "movdqa %%xmm0,(%[dst_argb]) \n" - "movdqa %%xmm1,0x10(%[dst_argb]) \n" - "lea 0x20(%[dst_argb]),%[dst_argb] \n" + "movdqa %%xmm0," MEMACCESS([dst_argb]) " \n" + "movdqa %%xmm1," MEMACCESS2(0x10,[dst_argb]) " \n" + "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" "sub $0x8,%[width] \n" "jg 1b \n" : [y_buf]"+r"(y_buf), // %[y_buf] @@ -2026,6 +2282,9 @@ void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf, [width]"+rm"(width) // %[width] : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" #endif @@ -2053,7 +2312,7 @@ void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf, #endif "sub %[u_buf],%[v_buf] \n" "pxor %%xmm4,%%xmm4 \n" - ".p2align 4 \n" + ".p2align 2 \n" "1: \n" READYUV422 YUVTORGB @@ -2065,9 +2324,9 @@ void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf, "pshufb %%xmm5,%%xmm0 \n" "pshufb %%xmm6,%%xmm1 \n" "palignr $0xc,%%xmm0,%%xmm1 \n" - "movq %%xmm0,(%[dst_rgb24]) \n" - "movdqu %%xmm1,0x8(%[dst_rgb24]) \n" - "lea 0x18(%[dst_rgb24]),%[dst_rgb24] \n" + "movq %%xmm0," MEMACCESS([dst_rgb24]) "\n" + "movdqu %%xmm1," MEMACCESS2(0x8,[dst_rgb24]) "\n" + "lea " MEMLEA(0x18,[dst_rgb24]) ",%[dst_rgb24] \n" "sub $0x8,%[width] \n" "jg 1b \n" : [y_buf]"+r"(y_buf), // %[y_buf] @@ -2081,6 +2340,9 @@ void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf, [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24) #endif : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm3", 
"xmm4", "xmm5", "xmm6" #endif @@ -2108,7 +2370,7 @@ void OMITFP I422ToRAWRow_SSSE3(const uint8* y_buf, #endif "sub %[u_buf],%[v_buf] \n" "pxor %%xmm4,%%xmm4 \n" - ".p2align 4 \n" + ".p2align 2 \n" "1: \n" READYUV422 YUVTORGB @@ -2120,9 +2382,9 @@ void OMITFP I422ToRAWRow_SSSE3(const uint8* y_buf, "pshufb %%xmm5,%%xmm0 \n" "pshufb %%xmm6,%%xmm1 \n" "palignr $0xc,%%xmm0,%%xmm1 \n" - "movq %%xmm0,(%[dst_raw]) \n" - "movdqu %%xmm1,0x8(%[dst_raw]) \n" - "lea 0x18(%[dst_raw]),%[dst_raw] \n" + "movq %%xmm0," MEMACCESS([dst_raw]) " \n" + "movdqu %%xmm1," MEMACCESS2(0x8,[dst_raw]) "\n" + "lea " MEMLEA(0x18,[dst_raw]) ",%[dst_raw] \n" "sub $0x8,%[width] \n" "jg 1b \n" : [y_buf]"+r"(y_buf), // %[y_buf] @@ -2136,6 +2398,9 @@ void OMITFP I422ToRAWRow_SSSE3(const uint8* y_buf, [kShuffleMaskARGBToRAW]"m"(kShuffleMaskARGBToRAW) #endif : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" #endif @@ -2151,7 +2416,7 @@ void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf, "sub %[u_buf],%[v_buf] \n" "pcmpeqb %%xmm5,%%xmm5 \n" "pxor %%xmm4,%%xmm4 \n" - ".p2align 4 \n" + ".p2align 2 \n" "1: \n" READYUV422 YUVTORGB @@ -2160,9 +2425,9 @@ void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf, "movdqa %%xmm0,%%xmm1 \n" "punpcklwd %%xmm2,%%xmm0 \n" "punpckhwd %%xmm2,%%xmm1 \n" - "movdqa %%xmm0,(%[dst_argb]) \n" - "movdqa %%xmm1,0x10(%[dst_argb]) \n" - "lea 0x20(%[dst_argb]),%[dst_argb] \n" + "movdqa %%xmm0," MEMACCESS([dst_argb]) "\n" + "movdqa %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n" + "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" "sub $0x8,%[width] \n" "jg 1b \n" : [y_buf]"+r"(y_buf), // %[y_buf] @@ -2172,6 +2437,9 @@ void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf, [width]"+rm"(width) // %[width] : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" #endif @@ -2187,7 +2455,7 @@ void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf, "sub %[u_buf],%[v_buf] \n" "pcmpeqb %%xmm5,%%xmm5 \n" "pxor %%xmm4,%%xmm4 \n" - ".p2align 4 \n" + ".p2align 2 \n" "1: \n" READYUV411 YUVTORGB @@ -2196,9 +2464,9 @@ void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf, "movdqa %%xmm0,%%xmm1 \n" "punpcklwd %%xmm2,%%xmm0 \n" "punpckhwd %%xmm2,%%xmm1 \n" - "movdqa %%xmm0,(%[dst_argb]) \n" - "movdqa %%xmm1,0x10(%[dst_argb]) \n" - "lea 0x20(%[dst_argb]),%[dst_argb] \n" + "movdqa %%xmm0," MEMACCESS([dst_argb]) "\n" + "movdqa %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n" + "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" "sub $0x8,%[width] \n" "jg 1b \n" : [y_buf]"+r"(y_buf), // %[y_buf] @@ -2208,6 +2476,9 @@ void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf, [width]"+rm"(width) // %[width] : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" #endif @@ -2221,7 +2492,7 @@ void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf, asm volatile ( "pcmpeqb %%xmm5,%%xmm5 \n" "pxor %%xmm4,%%xmm4 \n" - ".p2align 4 \n" + ".p2align 2 \n" "1: \n" READNV12 YUVTORGB @@ -2230,9 +2501,9 @@ void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf, "movdqa %%xmm0,%%xmm1 \n" "punpcklwd %%xmm2,%%xmm0 \n" "punpckhwd %%xmm2,%%xmm1 \n" - "movdqa %%xmm0,(%[dst_argb]) \n" - "movdqa %%xmm1,0x10(%[dst_argb]) \n" - "lea 
0x20(%[dst_argb]),%[dst_argb] \n" + "movdqa %%xmm0," MEMACCESS([dst_argb]) "\n" + "movdqa %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n" + "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" "sub $0x8,%[width] \n" "jg 1b \n" : [y_buf]"+r"(y_buf), // %[y_buf] @@ -2241,6 +2512,7 @@ void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf, [width]"+rm"(width) // %[width] : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] : "memory", "cc" + // Does not use r14. #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" #endif @@ -2254,7 +2526,7 @@ void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf, asm volatile ( "pcmpeqb %%xmm5,%%xmm5 \n" "pxor %%xmm4,%%xmm4 \n" - ".p2align 4 \n" + ".p2align 2 \n" "1: \n" READNV12 YVUTORGB @@ -2263,9 +2535,9 @@ void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf, "movdqa %%xmm0,%%xmm1 \n" "punpcklwd %%xmm2,%%xmm0 \n" "punpckhwd %%xmm2,%%xmm1 \n" - "movdqa %%xmm0,(%[dst_argb]) \n" - "movdqa %%xmm1,0x10(%[dst_argb]) \n" - "lea 0x20(%[dst_argb]),%[dst_argb] \n" + "movdqa %%xmm0," MEMACCESS([dst_argb]) "\n" + "movdqa %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n" + "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" "sub $0x8,%[width] \n" "jg 1b \n" : [y_buf]"+r"(y_buf), // %[y_buf] @@ -2274,6 +2546,7 @@ void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf, [width]"+rm"(width) // %[width] : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] : "memory", "cc" + // Does not use r14. #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" #endif @@ -2289,7 +2562,7 @@ void OMITFP I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, "sub %[u_buf],%[v_buf] \n" "pcmpeqb %%xmm5,%%xmm5 \n" "pxor %%xmm4,%%xmm4 \n" - ".p2align 4 \n" + ".p2align 2 \n" "1: \n" READYUV444 YUVTORGB @@ -2298,9 +2571,9 @@ void OMITFP I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, "movdqa %%xmm0,%%xmm1 \n" "punpcklwd %%xmm2,%%xmm0 \n" "punpckhwd %%xmm2,%%xmm1 \n" - "movdqu %%xmm0,(%[dst_argb]) \n" - "movdqu %%xmm1,0x10(%[dst_argb]) \n" - "lea 0x20(%[dst_argb]),%[dst_argb] \n" + "movdqu %%xmm0," MEMACCESS([dst_argb]) "\n" + "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n" + "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" "sub $0x8,%[width] \n" "jg 1b \n" : [y_buf]"+r"(y_buf), // %[y_buf] @@ -2310,6 +2583,9 @@ void OMITFP I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, [width]"+rm"(width) // %[width] : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" #endif @@ -2325,7 +2601,7 @@ void OMITFP I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, "sub %[u_buf],%[v_buf] \n" "pcmpeqb %%xmm5,%%xmm5 \n" "pxor %%xmm4,%%xmm4 \n" - ".p2align 4 \n" + ".p2align 2 \n" "1: \n" READYUV422 YUVTORGB @@ -2334,9 +2610,9 @@ void OMITFP I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, "movdqa %%xmm0,%%xmm1 \n" "punpcklwd %%xmm2,%%xmm0 \n" "punpckhwd %%xmm2,%%xmm1 \n" - "movdqu %%xmm0,(%[dst_argb]) \n" - "movdqu %%xmm1,0x10(%[dst_argb]) \n" - "lea 0x20(%[dst_argb]),%[dst_argb] \n" + "movdqu %%xmm0," MEMACCESS([dst_argb]) "\n" + "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n" + "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" "sub $0x8,%[width] \n" "jg 1b \n" : [y_buf]"+r"(y_buf), // %[y_buf] @@ -2346,6 +2622,9 @@ void OMITFP I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, [width]"+rm"(width) // %[width] : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] : "memory", "cc" +#if defined(__native_client__) && 
defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" #endif @@ -2361,7 +2640,7 @@ void OMITFP I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, "sub %[u_buf],%[v_buf] \n" "pcmpeqb %%xmm5,%%xmm5 \n" "pxor %%xmm4,%%xmm4 \n" - ".p2align 4 \n" + ".p2align 2 \n" "1: \n" READYUV411 YUVTORGB @@ -2370,9 +2649,9 @@ void OMITFP I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, "movdqa %%xmm0,%%xmm1 \n" "punpcklwd %%xmm2,%%xmm0 \n" "punpckhwd %%xmm2,%%xmm1 \n" - "movdqu %%xmm0,(%[dst_argb]) \n" - "movdqu %%xmm1,0x10(%[dst_argb]) \n" - "lea 0x20(%[dst_argb]),%[dst_argb] \n" + "movdqu %%xmm0," MEMACCESS([dst_argb]) "\n" + "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n" + "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" "sub $0x8,%[width] \n" "jg 1b \n" : [y_buf]"+r"(y_buf), // %[y_buf] @@ -2382,6 +2661,9 @@ void OMITFP I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, [width]"+rm"(width) // %[width] : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" #endif @@ -2395,7 +2677,7 @@ void OMITFP NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, asm volatile ( "pcmpeqb %%xmm5,%%xmm5 \n" "pxor %%xmm4,%%xmm4 \n" - ".p2align 4 \n" + ".p2align 2 \n" "1: \n" READNV12 YUVTORGB @@ -2404,9 +2686,9 @@ void OMITFP NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, "movdqa %%xmm0,%%xmm1 \n" "punpcklwd %%xmm2,%%xmm0 \n" "punpckhwd %%xmm2,%%xmm1 \n" - "movdqu %%xmm0,(%[dst_argb]) \n" - "movdqu %%xmm1,0x10(%[dst_argb]) \n" - "lea 0x20(%[dst_argb]),%[dst_argb] \n" + "movdqu %%xmm0," MEMACCESS([dst_argb]) "\n" + "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n" + "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" "sub $0x8,%[width] \n" "jg 1b \n" : [y_buf]"+r"(y_buf), // %[y_buf] @@ -2415,6 +2697,7 @@ void OMITFP NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, [width]"+rm"(width) // %[width] : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] : "memory", "cc" + // Does not use r14. #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" #endif @@ -2428,7 +2711,7 @@ void OMITFP NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, asm volatile ( "pcmpeqb %%xmm5,%%xmm5 \n" "pxor %%xmm4,%%xmm4 \n" - ".p2align 4 \n" + ".p2align 2 \n" "1: \n" READNV12 YVUTORGB @@ -2437,9 +2720,9 @@ void OMITFP NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, "movdqa %%xmm0,%%xmm1 \n" "punpcklwd %%xmm2,%%xmm0 \n" "punpckhwd %%xmm2,%%xmm1 \n" - "movdqu %%xmm0,(%[dst_argb]) \n" - "movdqu %%xmm1,0x10(%[dst_argb]) \n" - "lea 0x20(%[dst_argb]),%[dst_argb] \n" + "movdqu %%xmm0," MEMACCESS([dst_argb]) "\n" + "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n" + "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" "sub $0x8,%[width] \n" "jg 1b \n" : [y_buf]"+r"(y_buf), // %[y_buf] @@ -2448,6 +2731,7 @@ void OMITFP NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, [width]"+rm"(width) // %[width] : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] : "memory", "cc" + // Does not use r14. 
#if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" #endif @@ -2463,7 +2747,7 @@ void OMITFP I422ToBGRARow_SSSE3(const uint8* y_buf, "sub %[u_buf],%[v_buf] \n" "pcmpeqb %%xmm5,%%xmm5 \n" "pxor %%xmm4,%%xmm4 \n" - ".p2align 4 \n" + ".p2align 2 \n" "1: \n" READYUV422 YUVTORGB @@ -2473,9 +2757,9 @@ void OMITFP I422ToBGRARow_SSSE3(const uint8* y_buf, "movdqa %%xmm5,%%xmm0 \n" "punpcklwd %%xmm1,%%xmm5 \n" "punpckhwd %%xmm1,%%xmm0 \n" - "movdqa %%xmm5,(%[dst_bgra]) \n" - "movdqa %%xmm0,0x10(%[dst_bgra]) \n" - "lea 0x20(%[dst_bgra]),%[dst_bgra] \n" + "movdqa %%xmm5," MEMACCESS([dst_bgra]) "\n" + "movdqa %%xmm0," MEMACCESS2(0x10,[dst_bgra]) "\n" + "lea " MEMLEA(0x20,[dst_bgra]) ",%[dst_bgra] \n" "sub $0x8,%[width] \n" "jg 1b \n" : [y_buf]"+r"(y_buf), // %[y_buf] @@ -2485,6 +2769,9 @@ void OMITFP I422ToBGRARow_SSSE3(const uint8* y_buf, [width]"+rm"(width) // %[width] : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" #endif @@ -2500,7 +2787,7 @@ void OMITFP I422ToABGRRow_SSSE3(const uint8* y_buf, "sub %[u_buf],%[v_buf] \n" "pcmpeqb %%xmm5,%%xmm5 \n" "pxor %%xmm4,%%xmm4 \n" - ".p2align 4 \n" + ".p2align 2 \n" "1: \n" READYUV422 YUVTORGB @@ -2509,9 +2796,9 @@ void OMITFP I422ToABGRRow_SSSE3(const uint8* y_buf, "movdqa %%xmm2,%%xmm1 \n" "punpcklwd %%xmm0,%%xmm2 \n" "punpckhwd %%xmm0,%%xmm1 \n" - "movdqa %%xmm2,(%[dst_abgr]) \n" - "movdqa %%xmm1,0x10(%[dst_abgr]) \n" - "lea 0x20(%[dst_abgr]),%[dst_abgr] \n" + "movdqa %%xmm2," MEMACCESS([dst_abgr]) "\n" + "movdqa %%xmm1," MEMACCESS2(0x10,[dst_abgr]) "\n" + "lea " MEMLEA(0x20,[dst_abgr]) ",%[dst_abgr] \n" "sub $0x8,%[width] \n" "jg 1b \n" : [y_buf]"+r"(y_buf), // %[y_buf] @@ -2521,6 +2808,9 @@ void OMITFP I422ToABGRRow_SSSE3(const uint8* y_buf, [width]"+rm"(width) // %[width] : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" #endif @@ -2536,7 +2826,7 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf, "sub %[u_buf],%[v_buf] \n" "pcmpeqb %%xmm5,%%xmm5 \n" "pxor %%xmm4,%%xmm4 \n" - ".p2align 4 \n" + ".p2align 2 \n" "1: \n" READYUV422 YUVTORGB @@ -2546,9 +2836,9 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf, "movdqa %%xmm5,%%xmm0 \n" "punpcklwd %%xmm1,%%xmm5 \n" "punpckhwd %%xmm1,%%xmm0 \n" - "movdqa %%xmm5,(%[dst_rgba]) \n" - "movdqa %%xmm0,0x10(%[dst_rgba]) \n" - "lea 0x20(%[dst_rgba]),%[dst_rgba] \n" + "movdqa %%xmm5," MEMACCESS([dst_rgba]) "\n" + "movdqa %%xmm0," MEMACCESS2(0x10,[dst_rgba]) "\n" + "lea " MEMLEA(0x20,[dst_rgba]) ",%[dst_rgba] \n" "sub $0x8,%[width] \n" "jg 1b \n" : [y_buf]"+r"(y_buf), // %[y_buf] @@ -2558,6 +2848,9 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf, [width]"+rm"(width) // %[width] : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" #endif @@ -2573,7 +2866,7 @@ void OMITFP I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf, "sub %[u_buf],%[v_buf] \n" "pcmpeqb %%xmm5,%%xmm5 \n" "pxor %%xmm4,%%xmm4 \n" - ".p2align 4 \n" + ".p2align 2 \n" "1: \n" READYUV422 YUVTORGB @@ -2583,9 +2876,9 @@ void OMITFP I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf, "movdqa %%xmm5,%%xmm0 \n" 
"punpcklwd %%xmm1,%%xmm5 \n" "punpckhwd %%xmm1,%%xmm0 \n" - "movdqu %%xmm5,(%[dst_bgra]) \n" - "movdqu %%xmm0,0x10(%[dst_bgra]) \n" - "lea 0x20(%[dst_bgra]),%[dst_bgra] \n" + "movdqu %%xmm5," MEMACCESS([dst_bgra]) "\n" + "movdqu %%xmm0," MEMACCESS2(0x10,[dst_bgra]) "\n" + "lea " MEMLEA(0x20,[dst_bgra]) ",%[dst_bgra] \n" "sub $0x8,%[width] \n" "jg 1b \n" : [y_buf]"+r"(y_buf), // %[y_buf] @@ -2595,6 +2888,9 @@ void OMITFP I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf, [width]"+rm"(width) // %[width] : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" #endif @@ -2610,7 +2906,7 @@ void OMITFP I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf, "sub %[u_buf],%[v_buf] \n" "pcmpeqb %%xmm5,%%xmm5 \n" "pxor %%xmm4,%%xmm4 \n" - ".p2align 4 \n" + ".p2align 2 \n" "1: \n" READYUV422 YUVTORGB @@ -2619,9 +2915,9 @@ void OMITFP I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf, "movdqa %%xmm2,%%xmm1 \n" "punpcklwd %%xmm0,%%xmm2 \n" "punpckhwd %%xmm0,%%xmm1 \n" - "movdqu %%xmm2,(%[dst_abgr]) \n" - "movdqu %%xmm1,0x10(%[dst_abgr]) \n" - "lea 0x20(%[dst_abgr]),%[dst_abgr] \n" + "movdqu %%xmm2," MEMACCESS([dst_abgr]) "\n" + "movdqu %%xmm1," MEMACCESS2(0x10,[dst_abgr]) "\n" + "lea " MEMLEA(0x20,[dst_abgr]) ",%[dst_abgr] \n" "sub $0x8,%[width] \n" "jg 1b \n" : [y_buf]"+r"(y_buf), // %[y_buf] @@ -2631,6 +2927,9 @@ void OMITFP I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf, [width]"+rm"(width) // %[width] : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" #endif @@ -2646,7 +2945,7 @@ void OMITFP I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf, "sub %[u_buf],%[v_buf] \n" "pcmpeqb %%xmm5,%%xmm5 \n" "pxor %%xmm4,%%xmm4 \n" - ".p2align 4 \n" + ".p2align 2 \n" "1: \n" READYUV422 YUVTORGB @@ -2656,9 +2955,9 @@ void OMITFP I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf, "movdqa %%xmm5,%%xmm0 \n" "punpcklwd %%xmm1,%%xmm5 \n" "punpckhwd %%xmm1,%%xmm0 \n" - "movdqa %%xmm5,(%[dst_rgba]) \n" - "movdqa %%xmm0,0x10(%[dst_rgba]) \n" - "lea 0x20(%[dst_rgba]),%[dst_rgba] \n" + "movdqu %%xmm5," MEMACCESS([dst_rgba]) "\n" + "movdqu %%xmm0," MEMACCESS2(0x10,[dst_rgba]) "\n" + "lea " MEMLEA(0x20,[dst_rgba]) ",%[dst_rgba] \n" "sub $0x8,%[width] \n" "jg 1b \n" : [y_buf]"+r"(y_buf), // %[y_buf] @@ -2668,6 +2967,9 @@ void OMITFP I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf, [width]"+rm"(width) // %[width] : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" #endif @@ -2690,11 +2992,12 @@ void YToARGBRow_SSE2(const uint8* y_buf, "mov $0x004a004a,%%eax \n" "movd %%eax,%%xmm2 \n" "pshufd $0x0,%%xmm2,%%xmm2 \n" - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" // Step 1: Scale Y contribution to 8 G values. 
G = (y - 16) * 1.164 - "movq (%0),%%xmm0 \n" - "lea 0x8(%0),%0 \n" + "movq " MEMACCESS(0) ",%%xmm0 \n" + "lea " MEMLEA(0x8,0) ",%0 \n" "punpcklbw %%xmm5,%%xmm0 \n" "psubusw %%xmm3,%%xmm0 \n" "pmullw %%xmm2,%%xmm0 \n" @@ -2708,9 +3011,9 @@ void YToARGBRow_SSE2(const uint8* y_buf, "punpckhwd %%xmm1,%%xmm1 \n" "por %%xmm4,%%xmm0 \n" "por %%xmm4,%%xmm1 \n" - "movdqa %%xmm0,(%1) \n" - "movdqa %%xmm1,16(%1) \n" - "lea 32(%1),%1 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n" + "lea " MEMLEA(0x20,1) ",%1 \n" "sub $0x8,%2 \n" "jg 1b \n" @@ -2728,7 +3031,7 @@ void YToARGBRow_SSE2(const uint8* y_buf, #ifdef HAS_MIRRORROW_SSSE3 // Shuffle table for reversing the bytes. -CONST uvec8 kShuffleMirror = { +static uvec8 kShuffleMirror = { 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u }; @@ -2736,20 +3039,24 @@ void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { intptr_t temp_width = static_cast<intptr_t>(width); asm volatile ( "movdqa %3,%%xmm5 \n" - "lea -0x10(%0),%0 \n" - ".p2align 4 \n" + "lea " MEMLEA(-0x10,0) ",%0 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movdqa (%0,%2),%%xmm0 \n" + MEMOPREG(movdqa,0x00,0,2,1,xmm0) // movdqa (%0,%2),%%xmm0 "pshufb %%xmm5,%%xmm0 \n" "sub $0x10,%2 \n" - "movdqa %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" "jg 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(temp_width) // %2 : "m"(kShuffleMirror) // %3 : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm5" #endif @@ -2761,10 +3068,11 @@ void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) { intptr_t temp_width = static_cast<intptr_t>(width); asm volatile ( - "lea -0x10(%0),%0 \n" - ".p2align 4 \n" + "lea " MEMLEA(-0x10,0) ",%0 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movdqu (%0,%2),%%xmm0 \n" + MEMOPREG(movdqu,0x00,0,2,1,xmm0) // movdqu (%0,%2),%%xmm0 "movdqa %%xmm0,%%xmm1 \n" "psllw $0x8,%%xmm0 \n" "psrlw $0x8,%%xmm1 \n" @@ -2773,14 +3081,17 @@ void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) { "pshufhw $0x1b,%%xmm0,%%xmm0 \n" "pshufd $0x4e,%%xmm0,%%xmm0 \n" "sub $0x10,%2 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1)",%1 \n" "jg 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(temp_width) // %2 : : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1" #endif @@ -2790,7 +3101,7 @@ void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) { #ifdef HAS_MIRRORROW_UV_SSSE3 // Shuffle table for reversing the bytes of UV channels. 
-CONST uvec8 kShuffleMirrorUV = { +static uvec8 kShuffleMirrorUV = { 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u }; void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, @@ -2798,17 +3109,19 @@ void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, intptr_t temp_width = static_cast<intptr_t>(width); asm volatile ( "movdqa %4,%%xmm1 \n" - "lea -16(%0,%3,2),%0 \n" + "lea " MEMLEA4(-0x10,0,3,2) ",%0 \n" "sub %1,%2 \n" - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movdqa (%0),%%xmm0 \n" - "lea -16(%0),%0 \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "lea " MEMLEA(-0x10,0) ",%0 \n" "pshufb %%xmm1,%%xmm0 \n" "sub $8,%3 \n" - "movlpd %%xmm0,(%1) \n" - "movhpd %%xmm0,(%1,%2) \n" - "lea 8(%1),%1 \n" + "movlpd %%xmm0," MEMACCESS(1) " \n" + BUNDLEALIGN + MEMOPMEM(movhpd,xmm0,0x00,1,2,1) // movhpd %%xmm0,(%1,%2) + "lea " MEMLEA(0x8,1) ",%1 \n" "jg 1b \n" : "+r"(src), // %0 "+r"(dst_u), // %1 @@ -2816,6 +3129,9 @@ void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, "+r"(temp_width) // %3 : "m"(kShuffleMirrorUV) // %4 : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1" #endif @@ -2825,22 +3141,23 @@ void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, #ifdef HAS_ARGBMIRRORROW_SSSE3 // Shuffle table for reversing the bytes. -CONST uvec8 kARGBShuffleMirror = { +static uvec8 kARGBShuffleMirror = { 12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u, 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u }; void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { intptr_t temp_width = static_cast<intptr_t>(width); asm volatile ( + "lea " MEMLEA4(-0x10,0,2,4) ",%0 \n" "movdqa %3,%%xmm5 \n" - "lea -0x10(%0),%0 \n" - ".p2align 4 \n" + ".p2align 2 \n" "1: \n" - "movdqa (%0,%2,4),%%xmm0 \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" "pshufb %%xmm5,%%xmm0 \n" + "lea " MEMLEA(-0x10,0) ",%0 \n" "sub $0x4,%2 \n" - "movdqa %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" "jg 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 @@ -2860,11 +3177,12 @@ void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { "pcmpeqb %%xmm5,%%xmm5 \n" "psrlw $0x8,%%xmm5 \n" "sub %1,%2 \n" - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movdqa (%0),%%xmm0 \n" - "movdqa 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" "movdqa %%xmm0,%%xmm2 \n" "movdqa %%xmm1,%%xmm3 \n" "pand %%xmm5,%%xmm0 \n" @@ -2873,9 +3191,9 @@ void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { "psrlw $0x8,%%xmm2 \n" "psrlw $0x8,%%xmm3 \n" "packuswb %%xmm3,%%xmm2 \n" - "movdqa %%xmm0,(%1) \n" - "movdqa %%xmm2,(%1,%2) \n" - "lea 0x10(%1),%1 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + MEMOPMEM(movdqa,xmm2,0x00,1,2,1) // movdqa %%xmm2,(%1,%2) + "lea " MEMLEA(0x10,1) ",%1 \n" "sub $0x10,%3 \n" "jg 1b \n" : "+r"(src_uv), // %0 @@ -2884,6 +3202,9 @@ void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { "+r"(pix) // %3 : : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" #endif @@ -2896,11 +3217,12 @@ void SplitUVRow_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, "pcmpeqb %%xmm5,%%xmm5 \n" "psrlw $0x8,%%xmm5 \n" "sub %1,%2 \n" - ".p2align 4 \n" + ".p2align 2 \n" + 
BUNDLEALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" "movdqa %%xmm0,%%xmm2 \n" "movdqa %%xmm1,%%xmm3 \n" "pand %%xmm5,%%xmm0 \n" @@ -2909,9 +3231,9 @@ void SplitUVRow_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, "psrlw $0x8,%%xmm2 \n" "psrlw $0x8,%%xmm3 \n" "packuswb %%xmm3,%%xmm2 \n" - "movdqu %%xmm0,(%1) \n" - "movdqu %%xmm2,(%1,%2) \n" - "lea 0x10(%1),%1 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + MEMOPMEM(movdqu,xmm2,0x00,1,2,1) // movdqu %%xmm2,(%1,%2) + "lea " MEMLEA(0x10,1) ",%1 \n" "sub $0x10,%3 \n" "jg 1b \n" : "+r"(src_uv), // %0 @@ -2920,6 +3242,9 @@ void SplitUVRow_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, "+r"(pix) // %3 : : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" #endif @@ -2932,17 +3257,18 @@ void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, int width) { asm volatile ( "sub %0,%1 \n" - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movdqa (%0),%%xmm0 \n" - "movdqa (%0,%1,1),%%xmm1 \n" - "lea 0x10(%0),%0 \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + MEMOPREG(movdqa,0x00,0,1,1,xmm1) // movdqa (%0,%1,1),%%xmm1 + "lea " MEMLEA(0x10,0) ",%0 \n" "movdqa %%xmm0,%%xmm2 \n" "punpcklbw %%xmm1,%%xmm0 \n" "punpckhbw %%xmm1,%%xmm2 \n" - "movdqa %%xmm0,(%2) \n" - "movdqa %%xmm2,0x10(%2) \n" - "lea 0x20(%2),%2 \n" + "movdqa %%xmm0," MEMACCESS(2) " \n" + "movdqa %%xmm2," MEMACCESS2(0x10,2) " \n" + "lea " MEMLEA(0x20,2) ",%2 \n" "sub $0x10,%3 \n" "jg 1b \n" : "+r"(src_u), // %0 @@ -2951,6 +3277,9 @@ void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, "+r"(width) // %3 : : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2" #endif @@ -2961,17 +3290,18 @@ void MergeUVRow_Unaligned_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, int width) { asm volatile ( "sub %0,%1 \n" - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu (%0,%1,1),%%xmm1 \n" - "lea 0x10(%0),%0 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1 + "lea " MEMLEA(0x10,0) ",%0 \n" "movdqa %%xmm0,%%xmm2 \n" "punpcklbw %%xmm1,%%xmm0 \n" "punpckhbw %%xmm1,%%xmm2 \n" - "movdqu %%xmm0,(%2) \n" - "movdqu %%xmm2,0x10(%2) \n" - "lea 0x20(%2),%2 \n" + "movdqu %%xmm0," MEMACCESS(2) " \n" + "movdqu %%xmm2," MEMACCESS2(0x10,2) " \n" + "lea " MEMLEA(0x20,2) ",%2 \n" "sub $0x10,%3 \n" "jg 1b \n" : "+r"(src_u), // %0 @@ -2980,6 +3310,9 @@ void MergeUVRow_Unaligned_SSE2(const uint8* src_u, const uint8* src_v, "+r"(width) // %3 : : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2" #endif @@ -2990,14 +3323,14 @@ void MergeUVRow_Unaligned_SSE2(const uint8* src_u, const uint8* src_v, #ifdef HAS_COPYROW_SSE2 void CopyRow_SSE2(const uint8* src, uint8* dst, int count) { asm volatile ( - "sub %0,%1 \n" - ".p2align 4 \n" + ".p2align 2 \n" "1: \n" - "movdqa (%0),%%xmm0 \n" - "movdqa 0x10(%0),%%xmm1 \n" - "movdqa %%xmm0,(%0,%1) \n" - "movdqa %%xmm1,0x10(%0,%1) \n" - "lea 0x20(%0),%0 \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "movdqa %%xmm0," 
MEMACCESS(1) " \n" + "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n" + "lea " MEMLEA(0x20,1) ",%1 \n" "sub $0x20,%2 \n" "jg 1b \n" : "+r"(src), // %0 @@ -3017,7 +3350,7 @@ void CopyRow_X86(const uint8* src, uint8* dst, int width) { size_t width_tmp = static_cast<size_t>(width); asm volatile ( "shr $0x2,%2 \n" - "rep movsl \n" + "rep movsl " MEMMOVESTRING(0,1) " \n" : "+S"(src), // %0 "+D"(dst), // %1 "+c"(width_tmp) // %2 @@ -3027,11 +3360,12 @@ void CopyRow_X86(const uint8* src, uint8* dst, int width) { } #endif // HAS_COPYROW_X86 +#ifdef HAS_COPYROW_ERMS // Unaligned Multiple of 1. void CopyRow_ERMS(const uint8* src, uint8* dst, int width) { size_t width_tmp = static_cast<size_t>(width); asm volatile ( - "rep movsb \n" + "rep movsb " MEMMOVESTRING(0,1) " \n" : "+S"(src), // %0 "+D"(dst), // %1 "+c"(width_tmp) // %2 @@ -3039,13 +3373,156 @@ void CopyRow_ERMS(const uint8* src, uint8* dst, int width) { : "memory", "cc" ); } +#endif // HAS_COPYROW_ERMS + +#ifdef HAS_ARGBCOPYALPHAROW_SSE2 +// width in pixels +void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { + asm volatile ( + "pcmpeqb %%xmm0,%%xmm0 \n" + "pslld $0x18,%%xmm0 \n" + "pcmpeqb %%xmm1,%%xmm1 \n" + "psrld $0x8,%%xmm1 \n" + ".p2align 2 \n" + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm2 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm3 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "movdqa " MEMACCESS(1) ",%%xmm4 \n" + "movdqa " MEMACCESS2(0x10,1) ",%%xmm5 \n" + "pand %%xmm0,%%xmm2 \n" + "pand %%xmm0,%%xmm3 \n" + "pand %%xmm1,%%xmm4 \n" + "pand %%xmm1,%%xmm5 \n" + "por %%xmm4,%%xmm2 \n" + "por %%xmm5,%%xmm3 \n" + "movdqa %%xmm2," MEMACCESS(1) " \n" + "movdqa %%xmm3," MEMACCESS2(0x10,1) " \n" + "lea " MEMLEA(0x20,1) ",%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} +#endif // HAS_ARGBCOPYALPHAROW_SSE2 + +#ifdef HAS_ARGBCOPYALPHAROW_AVX2 +// width in pixels +void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { + asm volatile ( + "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n" + "vpsrld $0x8,%%ymm0,%%ymm0 \n" + ".p2align 2 \n" + "1: \n" + "vmovdqu " MEMACCESS(0) ",%%ymm1 \n" + "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm2 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" + "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1 \n" + "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2 \n" + "vmovdqu %%ymm1," MEMACCESS(1) " \n" + "vmovdqu %%ymm2," MEMACCESS2(0x20,1) " \n" + "lea " MEMLEA(0x40,1) ",%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2" +#endif + ); +} +#endif // HAS_ARGBCOPYALPHAROW_AVX2 + +#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2 +// width in pixels +void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { + asm volatile ( + "pcmpeqb %%xmm0,%%xmm0 \n" + "pslld $0x18,%%xmm0 \n" + "pcmpeqb %%xmm1,%%xmm1 \n" + "psrld $0x8,%%xmm1 \n" + ".p2align 2 \n" + "1: \n" + "movq " MEMACCESS(0) ",%%xmm2 \n" + "lea " MEMLEA(0x8,0) ",%0 \n" + "punpcklbw %%xmm2,%%xmm2 \n" + "punpckhwd %%xmm2,%%xmm3 \n" + "punpcklwd %%xmm2,%%xmm2 \n" + "movdqa " MEMACCESS(1) ",%%xmm4 \n" + "movdqa " MEMACCESS2(0x10,1) ",%%xmm5 \n" + "pand %%xmm0,%%xmm2 \n" + "pand %%xmm0,%%xmm3 \n" + "pand %%xmm1,%%xmm4 \n" + "pand %%xmm1,%%xmm5 \n" + "por %%xmm4,%%xmm2 \n" + "por %%xmm5,%%xmm3 \n" + "movdqa %%xmm2," MEMACCESS(1) " \n" + "movdqa %%xmm3," MEMACCESS2(0x10,1) " \n" + "lea " 
MEMLEA(0x20,1) ",%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} +#endif // HAS_ARGBCOPYYTOALPHAROW_SSE2 + +#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2 +// width in pixels +void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { + asm volatile ( + "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n" + "vpsrld $0x8,%%ymm0,%%ymm0 \n" + ".p2align 2 \n" + "1: \n" + "vpmovzxbd " MEMACCESS(0) ",%%ymm1 \n" + "vpmovzxbd " MEMACCESS2(0x8,0) ",%%ymm2 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" + "vpslld $0x18,%%ymm1,%%ymm1 \n" + "vpslld $0x18,%%ymm2,%%ymm2 \n" + "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1 \n" + "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2 \n" + "vmovdqu %%ymm1," MEMACCESS(1) " \n" + "vmovdqu %%ymm2," MEMACCESS2(0x20,1) " \n" + "lea " MEMLEA(0x40,1) ",%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2" +#endif + ); +} +#endif // HAS_ARGBCOPYYTOALPHAROW_AVX2 #ifdef HAS_SETROW_X86 void SetRow_X86(uint8* dst, uint32 v32, int width) { size_t width_tmp = static_cast<size_t>(width); asm volatile ( "shr $0x2,%1 \n" - "rep stosl \n" + "rep stosl " MEMSTORESTRING(eax,0) " \n" : "+D"(dst), // %0 "+c"(width_tmp) // %1 : "a"(v32) // %2 @@ -3058,7 +3535,7 @@ void ARGBSetRows_X86(uint8* dst, uint32 v32, int width, size_t width_tmp = static_cast<size_t>(width); uint32* d = reinterpret_cast<uint32*>(dst); asm volatile ( - "rep stosl \n" + "rep stosl " MEMSTORESTRING(eax,0) " \n" : "+D"(d), // %0 "+c"(width_tmp) // %1 : "a"(v32) // %2 @@ -3073,16 +3550,17 @@ void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) { asm volatile ( "pcmpeqb %%xmm5,%%xmm5 \n" "psrlw $0x8,%%xmm5 \n" - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movdqa (%0),%%xmm0 \n" - "movdqa 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" "pand %%xmm5,%%xmm0 \n" "pand %%xmm5,%%xmm1 \n" "packuswb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" "sub $0x10,%2 \n" "jg 1b \n" : "+r"(src_yuy2), // %0 @@ -3102,13 +3580,15 @@ void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, "pcmpeqb %%xmm5,%%xmm5 \n" "psrlw $0x8,%%xmm5 \n" "sub %1,%2 \n" - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movdqa (%0),%%xmm0 \n" - "movdqa 0x10(%0),%%xmm1 \n" - "movdqa (%0,%4,1),%%xmm2 \n" - "movdqa 0x10(%0,%4,1),%%xmm3 \n" - "lea 0x20(%0),%0 \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + BUNDLEALIGN + MEMOPREG(movdqa,0x00,0,4,1,xmm2) // movdqa (%0,%4,1),%%xmm2 + MEMOPREG(movdqa,0x10,0,4,1,xmm3) // movdqa 0x10(%0,%4,1),%%xmm3 + "lea " MEMLEA(0x20,0) ",%0 \n" "pavgb %%xmm2,%%xmm0 \n" "pavgb %%xmm3,%%xmm1 \n" "psrlw $0x8,%%xmm0 \n" @@ -3119,9 +3599,10 @@ void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, "packuswb %%xmm0,%%xmm0 \n" "psrlw $0x8,%%xmm1 \n" "packuswb %%xmm1,%%xmm1 \n" - "movq %%xmm0,(%1) \n" - "movq %%xmm1,(%1,%2) \n" - "lea 0x8(%1),%1 \n" + "movq %%xmm0," MEMACCESS(1) " \n" + BUNDLEALIGN + MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2) + "lea " MEMLEA(0x8,1) ",%1 \n" "sub $0x10,%3 \n" "jg 1b \n" : "+r"(src_yuy2), // %0 @@ -3130,6 +3611,9 @@ void 
YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, "+r"(pix) // %3 : "r"(static_cast<intptr_t>(stride_yuy2)) // %4 : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" #endif @@ -3142,11 +3626,12 @@ void YUY2ToUV422Row_SSE2(const uint8* src_yuy2, "pcmpeqb %%xmm5,%%xmm5 \n" "psrlw $0x8,%%xmm5 \n" "sub %1,%2 \n" - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movdqa (%0),%%xmm0 \n" - "movdqa 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" "psrlw $0x8,%%xmm0 \n" "psrlw $0x8,%%xmm1 \n" "packuswb %%xmm1,%%xmm0 \n" @@ -3155,9 +3640,10 @@ void YUY2ToUV422Row_SSE2(const uint8* src_yuy2, "packuswb %%xmm0,%%xmm0 \n" "psrlw $0x8,%%xmm1 \n" "packuswb %%xmm1,%%xmm1 \n" - "movq %%xmm0,(%1) \n" - "movq %%xmm1,(%1,%2) \n" - "lea 0x8(%1),%1 \n" + "movq %%xmm0," MEMACCESS(1) " \n" + BUNDLEALIGN + MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2) + "lea " MEMLEA(0x8,1) ",%1 \n" "sub $0x10,%3 \n" "jg 1b \n" : "+r"(src_yuy2), // %0 @@ -3166,6 +3652,9 @@ void YUY2ToUV422Row_SSE2(const uint8* src_yuy2, "+r"(pix) // %3 : : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1", "xmm5" #endif @@ -3177,17 +3666,18 @@ void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2, asm volatile ( "pcmpeqb %%xmm5,%%xmm5 \n" "psrlw $0x8,%%xmm5 \n" - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" "pand %%xmm5,%%xmm0 \n" "pand %%xmm5,%%xmm1 \n" "packuswb %%xmm1,%%xmm0 \n" "sub $0x10,%2 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" "jg 1b \n" : "+r"(src_yuy2), // %0 "+r"(dst_y), // %1 @@ -3207,13 +3697,15 @@ void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, "pcmpeqb %%xmm5,%%xmm5 \n" "psrlw $0x8,%%xmm5 \n" "sub %1,%2 \n" - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu (%0,%4,1),%%xmm2 \n" - "movdqu 0x10(%0,%4,1),%%xmm3 \n" - "lea 0x20(%0),%0 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + BUNDLEALIGN + MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2 + MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3 + "lea " MEMLEA(0x20,0) ",%0 \n" "pavgb %%xmm2,%%xmm0 \n" "pavgb %%xmm3,%%xmm1 \n" "psrlw $0x8,%%xmm0 \n" @@ -3224,9 +3716,10 @@ void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, "packuswb %%xmm0,%%xmm0 \n" "psrlw $0x8,%%xmm1 \n" "packuswb %%xmm1,%%xmm1 \n" - "movq %%xmm0,(%1) \n" - "movq %%xmm1,(%1,%2) \n" - "lea 0x8(%1),%1 \n" + "movq %%xmm0," MEMACCESS(1) " \n" + BUNDLEALIGN + MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2) + "lea " MEMLEA(0x8,1) ",%1 \n" "sub $0x10,%3 \n" "jg 1b \n" : "+r"(src_yuy2), // %0 @@ -3235,6 +3728,9 @@ void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, "+r"(pix) // %3 : "r"(static_cast<intptr_t>(stride_yuy2)) // %4 : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" #endif @@ -3247,11 +3743,12 @@ void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2, "pcmpeqb 
%%xmm5,%%xmm5 \n" "psrlw $0x8,%%xmm5 \n" "sub %1,%2 \n" - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" "psrlw $0x8,%%xmm0 \n" "psrlw $0x8,%%xmm1 \n" "packuswb %%xmm1,%%xmm0 \n" @@ -3260,9 +3757,10 @@ void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2, "packuswb %%xmm0,%%xmm0 \n" "psrlw $0x8,%%xmm1 \n" "packuswb %%xmm1,%%xmm1 \n" - "movq %%xmm0,(%1) \n" - "movq %%xmm1,(%1,%2) \n" - "lea 0x8(%1),%1 \n" + "movq %%xmm0," MEMACCESS(1) " \n" + BUNDLEALIGN + MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2) + "lea " MEMLEA(0x8,1) ",%1 \n" "sub $0x10,%3 \n" "jg 1b \n" : "+r"(src_yuy2), // %0 @@ -3271,6 +3769,9 @@ void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2, "+r"(pix) // %3 : : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1", "xmm5" #endif @@ -3279,17 +3780,18 @@ void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2, void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) { asm volatile ( - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movdqa (%0),%%xmm0 \n" - "movdqa 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" "psrlw $0x8,%%xmm0 \n" "psrlw $0x8,%%xmm1 \n" "packuswb %%xmm1,%%xmm0 \n" "sub $0x10,%2 \n" - "movdqa %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" "jg 1b \n" : "+r"(src_uyvy), // %0 "+r"(dst_y), // %1 @@ -3308,13 +3810,15 @@ void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, "pcmpeqb %%xmm5,%%xmm5 \n" "psrlw $0x8,%%xmm5 \n" "sub %1,%2 \n" - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movdqa (%0),%%xmm0 \n" - "movdqa 0x10(%0),%%xmm1 \n" - "movdqa (%0,%4,1),%%xmm2 \n" - "movdqa 0x10(%0,%4,1),%%xmm3 \n" - "lea 0x20(%0),%0 \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + BUNDLEALIGN + MEMOPREG(movdqa,0x00,0,4,1,xmm2) // movdqa (%0,%4,1),%%xmm2 + MEMOPREG(movdqa,0x10,0,4,1,xmm3) // movdqa 0x10(%0,%4,1),%%xmm3 + "lea " MEMLEA(0x20,0) ",%0 \n" "pavgb %%xmm2,%%xmm0 \n" "pavgb %%xmm3,%%xmm1 \n" "pand %%xmm5,%%xmm0 \n" @@ -3325,9 +3829,10 @@ void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, "packuswb %%xmm0,%%xmm0 \n" "psrlw $0x8,%%xmm1 \n" "packuswb %%xmm1,%%xmm1 \n" - "movq %%xmm0,(%1) \n" - "movq %%xmm1,(%1,%2) \n" - "lea 0x8(%1),%1 \n" + "movq %%xmm0," MEMACCESS(1) " \n" + BUNDLEALIGN + MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2) + "lea " MEMLEA(0x8,1) ",%1 \n" "sub $0x10,%3 \n" "jg 1b \n" : "+r"(src_uyvy), // %0 @@ -3336,6 +3841,9 @@ void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, "+r"(pix) // %3 : "r"(static_cast<intptr_t>(stride_uyvy)) // %4 : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" #endif @@ -3348,11 +3856,12 @@ void UYVYToUV422Row_SSE2(const uint8* src_uyvy, "pcmpeqb %%xmm5,%%xmm5 \n" "psrlw $0x8,%%xmm5 \n" "sub %1,%2 \n" - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movdqa (%0),%%xmm0 \n" - "movdqa 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" "pand 
%%xmm5,%%xmm0 \n" "pand %%xmm5,%%xmm1 \n" "packuswb %%xmm1,%%xmm0 \n" @@ -3361,9 +3870,10 @@ void UYVYToUV422Row_SSE2(const uint8* src_uyvy, "packuswb %%xmm0,%%xmm0 \n" "psrlw $0x8,%%xmm1 \n" "packuswb %%xmm1,%%xmm1 \n" - "movq %%xmm0,(%1) \n" - "movq %%xmm1,(%1,%2) \n" - "lea 0x8(%1),%1 \n" + "movq %%xmm0," MEMACCESS(1) " \n" + BUNDLEALIGN + MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2) + "lea " MEMLEA(0x8,1) ",%1 \n" "sub $0x10,%3 \n" "jg 1b \n" : "+r"(src_uyvy), // %0 @@ -3372,6 +3882,9 @@ void UYVYToUV422Row_SSE2(const uint8* src_uyvy, "+r"(pix) // %3 : : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1", "xmm5" #endif @@ -3381,17 +3894,18 @@ void UYVYToUV422Row_SSE2(const uint8* src_uyvy, void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) { asm volatile ( - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" "psrlw $0x8,%%xmm0 \n" "psrlw $0x8,%%xmm1 \n" "packuswb %%xmm1,%%xmm0 \n" "sub $0x10,%2 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" "jg 1b \n" : "+r"(src_uyvy), // %0 "+r"(dst_y), // %1 @@ -3410,13 +3924,15 @@ void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy, "pcmpeqb %%xmm5,%%xmm5 \n" "psrlw $0x8,%%xmm5 \n" "sub %1,%2 \n" - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu (%0,%4,1),%%xmm2 \n" - "movdqu 0x10(%0,%4,1),%%xmm3 \n" - "lea 0x20(%0),%0 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + BUNDLEALIGN + MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2 + MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3 + "lea " MEMLEA(0x20,0) ",%0 \n" "pavgb %%xmm2,%%xmm0 \n" "pavgb %%xmm3,%%xmm1 \n" "pand %%xmm5,%%xmm0 \n" @@ -3427,9 +3943,10 @@ void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy, "packuswb %%xmm0,%%xmm0 \n" "psrlw $0x8,%%xmm1 \n" "packuswb %%xmm1,%%xmm1 \n" - "movq %%xmm0,(%1) \n" - "movq %%xmm1,(%1,%2) \n" - "lea 0x8(%1),%1 \n" + "movq %%xmm0," MEMACCESS(1) " \n" + BUNDLEALIGN + MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2) + "lea " MEMLEA(0x8,1) ",%1 \n" "sub $0x10,%3 \n" "jg 1b \n" : "+r"(src_uyvy), // %0 @@ -3438,6 +3955,9 @@ void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy, "+r"(pix) // %3 : "r"(static_cast<intptr_t>(stride_uyvy)) // %4 : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" #endif @@ -3450,11 +3970,12 @@ void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy, "pcmpeqb %%xmm5,%%xmm5 \n" "psrlw $0x8,%%xmm5 \n" "sub %1,%2 \n" - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" "pand %%xmm5,%%xmm0 \n" "pand %%xmm5,%%xmm1 \n" "packuswb %%xmm1,%%xmm0 \n" @@ -3463,9 +3984,10 @@ void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy, "packuswb %%xmm0,%%xmm0 \n" "psrlw $0x8,%%xmm1 \n" "packuswb %%xmm1,%%xmm1 \n" - "movq %%xmm0,(%1) \n" - "movq %%xmm1,(%1,%2) \n" - "lea 0x8(%1),%1 
\n" + "movq %%xmm0," MEMACCESS(1) " \n" + BUNDLEALIGN + MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2) + "lea " MEMLEA(0x8,1) ",%1 \n" "sub $0x10,%3 \n" "jg 1b \n" : "+r"(src_uyvy), // %0 @@ -3474,6 +3996,9 @@ void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy, "+r"(pix) // %3 : : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1", "xmm5" #endif @@ -3502,19 +4027,19 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, "10: \n" "test $0xf,%2 \n" "je 19f \n" - "movd (%0),%%xmm3 \n" - "lea 0x4(%0),%0 \n" + "movd " MEMACCESS(0) ",%%xmm3 \n" + "lea " MEMLEA(0x4,0) ",%0 \n" "movdqa %%xmm3,%%xmm0 \n" "pxor %%xmm4,%%xmm3 \n" - "movd (%1),%%xmm2 \n" + "movd " MEMACCESS(1) ",%%xmm2 \n" "psrlw $0x8,%%xmm3 \n" "pshufhw $0xf5,%%xmm3,%%xmm3 \n" "pshuflw $0xf5,%%xmm3,%%xmm3 \n" "pand %%xmm6,%%xmm2 \n" "paddw %%xmm7,%%xmm3 \n" "pmullw %%xmm3,%%xmm2 \n" - "movd (%1),%%xmm1 \n" - "lea 0x4(%1),%1 \n" + "movd " MEMACCESS(1) ",%%xmm1 \n" + "lea " MEMLEA(0x4,1) ",%1 \n" "psrlw $0x8,%%xmm1 \n" "por %%xmm4,%%xmm0 \n" "pmullw %%xmm3,%%xmm1 \n" @@ -3523,8 +4048,8 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, "pand %%xmm5,%%xmm1 \n" "paddusb %%xmm1,%%xmm0 \n" "sub $0x1,%3 \n" - "movd %%xmm0,(%2) \n" - "lea 0x4(%2),%2 \n" + "movd %%xmm0," MEMACCESS(2) " \n" + "lea " MEMLEA(0x4,2) ",%2 \n" "jge 10b \n" "19: \n" @@ -3534,19 +4059,19 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, // 4 pixel loop. ".p2align 2 \n" "41: \n" - "movdqu (%0),%%xmm3 \n" - "lea 0x10(%0),%0 \n" + "movdqu " MEMACCESS(0) ",%%xmm3 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" "movdqa %%xmm3,%%xmm0 \n" "pxor %%xmm4,%%xmm3 \n" - "movdqu (%1),%%xmm2 \n" + "movdqu " MEMACCESS(1) ",%%xmm2 \n" "psrlw $0x8,%%xmm3 \n" "pshufhw $0xf5,%%xmm3,%%xmm3 \n" "pshuflw $0xf5,%%xmm3,%%xmm3 \n" "pand %%xmm6,%%xmm2 \n" "paddw %%xmm7,%%xmm3 \n" "pmullw %%xmm3,%%xmm2 \n" - "movdqu (%1),%%xmm1 \n" - "lea 0x10(%1),%1 \n" + "movdqu " MEMACCESS(1) ",%%xmm1 \n" + "lea " MEMLEA(0x10,1) ",%1 \n" "psrlw $0x8,%%xmm1 \n" "por %%xmm4,%%xmm0 \n" "pmullw %%xmm3,%%xmm1 \n" @@ -3555,8 +4080,8 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, "pand %%xmm5,%%xmm1 \n" "paddusb %%xmm1,%%xmm0 \n" "sub $0x4,%3 \n" - "movdqa %%xmm0,(%2) \n" - "lea 0x10(%2),%2 \n" + "movdqa %%xmm0," MEMACCESS(2) " \n" + "lea " MEMLEA(0x10,2) ",%2 \n" "jge 41b \n" "49: \n" @@ -3565,19 +4090,19 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, // 1 pixel loop. 
"91: \n" - "movd (%0),%%xmm3 \n" - "lea 0x4(%0),%0 \n" + "movd " MEMACCESS(0) ",%%xmm3 \n" + "lea " MEMLEA(0x4,0) ",%0 \n" "movdqa %%xmm3,%%xmm0 \n" "pxor %%xmm4,%%xmm3 \n" - "movd (%1),%%xmm2 \n" + "movd " MEMACCESS(1) ",%%xmm2 \n" "psrlw $0x8,%%xmm3 \n" "pshufhw $0xf5,%%xmm3,%%xmm3 \n" "pshuflw $0xf5,%%xmm3,%%xmm3 \n" "pand %%xmm6,%%xmm2 \n" "paddw %%xmm7,%%xmm3 \n" "pmullw %%xmm3,%%xmm2 \n" - "movd (%1),%%xmm1 \n" - "lea 0x4(%1),%1 \n" + "movd " MEMACCESS(1) ",%%xmm1 \n" + "lea " MEMLEA(0x4,1) ",%1 \n" "psrlw $0x8,%%xmm1 \n" "por %%xmm4,%%xmm0 \n" "pmullw %%xmm3,%%xmm1 \n" @@ -3586,8 +4111,8 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, "pand %%xmm5,%%xmm1 \n" "paddusb %%xmm1,%%xmm0 \n" "sub $0x1,%3 \n" - "movd %%xmm0,(%2) \n" - "lea 0x4(%2),%2 \n" + "movd %%xmm0," MEMACCESS(2) " \n" + "lea " MEMLEA(0x4,2) ",%2 \n" "jge 91b \n" "99: \n" : "+r"(src_argb0), // %0 @@ -3605,7 +4130,7 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, #ifdef HAS_ARGBBLENDROW_SSSE3 // Shuffle table for isolating alpha. -CONST uvec8 kShuffleAlpha = { +static uvec8 kShuffleAlpha = { 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80, 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80 }; @@ -3639,17 +4164,17 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, "10: \n" "test $0xf,%2 \n" "je 19f \n" - "movd (%0),%%xmm3 \n" - "lea 0x4(%0),%0 \n" + "movd " MEMACCESS(0) ",%%xmm3 \n" + "lea " MEMLEA(0x4,0) ",%0 \n" "movdqa %%xmm3,%%xmm0 \n" "pxor %%xmm4,%%xmm3 \n" - "movd (%1),%%xmm2 \n" + "movd " MEMACCESS(1) ",%%xmm2 \n" "pshufb %4,%%xmm3 \n" "pand %%xmm6,%%xmm2 \n" "paddw %%xmm7,%%xmm3 \n" "pmullw %%xmm3,%%xmm2 \n" - "movd (%1),%%xmm1 \n" - "lea 0x4(%1),%1 \n" + "movd " MEMACCESS(1) ",%%xmm1 \n" + "lea " MEMLEA(0x4,1) ",%1 \n" "psrlw $0x8,%%xmm1 \n" "por %%xmm4,%%xmm0 \n" "pmullw %%xmm3,%%xmm1 \n" @@ -3658,8 +4183,8 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, "pand %%xmm5,%%xmm1 \n" "paddusb %%xmm1,%%xmm0 \n" "sub $0x1,%3 \n" - "movd %%xmm0,(%2) \n" - "lea 0x4(%2),%2 \n" + "movd %%xmm0," MEMACCESS(2) " \n" + "lea " MEMLEA(0x4,2) ",%2 \n" "jge 10b \n" "19: \n" @@ -3673,17 +4198,17 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, // 4 pixel loop. ".p2align 2 \n" "40: \n" - "movdqa (%0),%%xmm3 \n" - "lea 0x10(%0),%0 \n" + "movdqa " MEMACCESS(0) ",%%xmm3 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" "movdqa %%xmm3,%%xmm0 \n" "pxor %%xmm4,%%xmm3 \n" - "movdqa (%1),%%xmm2 \n" + "movdqa " MEMACCESS(1) ",%%xmm2 \n" "pshufb %4,%%xmm3 \n" "pand %%xmm6,%%xmm2 \n" "paddw %%xmm7,%%xmm3 \n" "pmullw %%xmm3,%%xmm2 \n" - "movdqa (%1),%%xmm1 \n" - "lea 0x10(%1),%1 \n" + "movdqa " MEMACCESS(1) ",%%xmm1 \n" + "lea " MEMLEA(0x10,1) ",%1 \n" "psrlw $0x8,%%xmm1 \n" "por %%xmm4,%%xmm0 \n" "pmullw %%xmm3,%%xmm1 \n" @@ -3692,25 +4217,25 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, "pand %%xmm5,%%xmm1 \n" "paddusb %%xmm1,%%xmm0 \n" "sub $0x4,%3 \n" - "movdqa %%xmm0,(%2) \n" - "lea 0x10(%2),%2 \n" + "movdqa %%xmm0," MEMACCESS(2) " \n" + "lea " MEMLEA(0x10,2) ",%2 \n" "jge 40b \n" "jmp 49f \n" // 4 pixel unaligned loop. 
".p2align 2 \n" "41: \n" - "movdqu (%0),%%xmm3 \n" - "lea 0x10(%0),%0 \n" + "movdqu " MEMACCESS(0) ",%%xmm3 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" "movdqa %%xmm3,%%xmm0 \n" "pxor %%xmm4,%%xmm3 \n" - "movdqu (%1),%%xmm2 \n" + "movdqu " MEMACCESS(1) ",%%xmm2 \n" "pshufb %4,%%xmm3 \n" "pand %%xmm6,%%xmm2 \n" "paddw %%xmm7,%%xmm3 \n" "pmullw %%xmm3,%%xmm2 \n" - "movdqu (%1),%%xmm1 \n" - "lea 0x10(%1),%1 \n" + "movdqu " MEMACCESS(1) ",%%xmm1 \n" + "lea " MEMLEA(0x10,1) ",%1 \n" "psrlw $0x8,%%xmm1 \n" "por %%xmm4,%%xmm0 \n" "pmullw %%xmm3,%%xmm1 \n" @@ -3719,8 +4244,8 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, "pand %%xmm5,%%xmm1 \n" "paddusb %%xmm1,%%xmm0 \n" "sub $0x4,%3 \n" - "movdqa %%xmm0,(%2) \n" - "lea 0x10(%2),%2 \n" + "movdqa %%xmm0," MEMACCESS(2) " \n" + "lea " MEMLEA(0x10,2) ",%2 \n" "jge 41b \n" "49: \n" @@ -3729,17 +4254,17 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, // 1 pixel loop. "91: \n" - "movd (%0),%%xmm3 \n" - "lea 0x4(%0),%0 \n" + "movd " MEMACCESS(0) ",%%xmm3 \n" + "lea " MEMLEA(0x4,0) ",%0 \n" "movdqa %%xmm3,%%xmm0 \n" "pxor %%xmm4,%%xmm3 \n" - "movd (%1),%%xmm2 \n" + "movd " MEMACCESS(1) ",%%xmm2 \n" "pshufb %4,%%xmm3 \n" "pand %%xmm6,%%xmm2 \n" "paddw %%xmm7,%%xmm3 \n" "pmullw %%xmm3,%%xmm2 \n" - "movd (%1),%%xmm1 \n" - "lea 0x4(%1),%1 \n" + "movd " MEMACCESS(1) ",%%xmm1 \n" + "lea " MEMLEA(0x4,1) ",%1 \n" "psrlw $0x8,%%xmm1 \n" "por %%xmm4,%%xmm0 \n" "pmullw %%xmm3,%%xmm1 \n" @@ -3748,8 +4273,8 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, "pand %%xmm5,%%xmm1 \n" "paddusb %%xmm1,%%xmm0 \n" "sub $0x1,%3 \n" - "movd %%xmm0,(%2) \n" - "lea 0x4(%2),%2 \n" + "movd %%xmm0," MEMACCESS(2) " \n" + "lea " MEMLEA(0x4,2) ",%2 \n" "jge 91b \n" "99: \n" : "+r"(src_argb0), // %0 @@ -3770,26 +4295,26 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, // aligned to 16 bytes void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { asm volatile ( - "sub %0,%1 \n" "pcmpeqb %%xmm4,%%xmm4 \n" "pslld $0x18,%%xmm4 \n" "pcmpeqb %%xmm5,%%xmm5 \n" "psrld $0x8,%%xmm5 \n" // 4 pixel loop. 
- ".p2align 4 \n" + ".p2align 2 \n" "1: \n" - "movdqa (%0),%%xmm0 \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" "punpcklbw %%xmm0,%%xmm0 \n" "pshufhw $0xff,%%xmm0,%%xmm2 \n" "pshuflw $0xff,%%xmm2,%%xmm2 \n" "pmulhuw %%xmm2,%%xmm0 \n" - "movdqa (%0),%%xmm1 \n" + "movdqa " MEMACCESS(0) ",%%xmm1 \n" "punpckhbw %%xmm1,%%xmm1 \n" "pshufhw $0xff,%%xmm1,%%xmm2 \n" "pshuflw $0xff,%%xmm2,%%xmm2 \n" "pmulhuw %%xmm2,%%xmm1 \n" - "movdqa (%0),%%xmm2 \n" + "movdqa " MEMACCESS(0) ",%%xmm2 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" "psrlw $0x8,%%xmm0 \n" "pand %%xmm4,%%xmm2 \n" "psrlw $0x8,%%xmm1 \n" @@ -3797,8 +4322,8 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { "pand %%xmm5,%%xmm0 \n" "por %%xmm2,%%xmm0 \n" "sub $0x4,%2 \n" - "movdqa %%xmm0,(%0,%1,1) \n" - "lea 0x10(%0),%0 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 @@ -3814,10 +4339,10 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { #ifdef HAS_ARGBATTENUATEROW_SSSE3 // Shuffle table duplicating alpha -CONST uvec8 kShuffleAlpha0 = { +static uvec8 kShuffleAlpha0 = { 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u, }; -CONST uvec8 kShuffleAlpha1 = { +static uvec8 kShuffleAlpha1 = { 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u, 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u, }; @@ -3825,34 +4350,34 @@ CONST uvec8 kShuffleAlpha1 = { // aligned to 16 bytes void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { asm volatile ( - "sub %0,%1 \n" "pcmpeqb %%xmm3,%%xmm3 \n" "pslld $0x18,%%xmm3 \n" "movdqa %3,%%xmm4 \n" "movdqa %4,%%xmm5 \n" // 4 pixel loop. - ".p2align 4 \n" + ".p2align 2 \n" "1: \n" - "movdqa (%0),%%xmm0 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" "pshufb %%xmm4,%%xmm0 \n" - "movdqa (%0),%%xmm1 \n" + "movdqu " MEMACCESS(0) ",%%xmm1 \n" "punpcklbw %%xmm1,%%xmm1 \n" "pmulhuw %%xmm1,%%xmm0 \n" - "movdqa (%0),%%xmm1 \n" + "movdqu " MEMACCESS(0) ",%%xmm1 \n" "pshufb %%xmm5,%%xmm1 \n" - "movdqa (%0),%%xmm2 \n" + "movdqu " MEMACCESS(0) ",%%xmm2 \n" "punpckhbw %%xmm2,%%xmm2 \n" "pmulhuw %%xmm2,%%xmm1 \n" - "movdqa (%0),%%xmm2 \n" + "movdqu " MEMACCESS(0) ",%%xmm2 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" "pand %%xmm3,%%xmm2 \n" "psrlw $0x8,%%xmm0 \n" "psrlw $0x8,%%xmm1 \n" "packuswb %%xmm1,%%xmm0 \n" "por %%xmm2,%%xmm0 \n" "sub $0x4,%2 \n" - "movdqa %%xmm0,(%0,%1,1) \n" - "lea 0x10(%0),%0 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 @@ -3874,35 +4399,34 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { uintptr_t alpha = 0; asm volatile ( - "sub %0,%1 \n" - // 4 pixel loop. 
- ".p2align 4 \n" + ".p2align 2 \n" "1: \n" - "movdqa (%0),%%xmm0 \n" - "movzb 0x3(%0),%3 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movzb " MEMACCESS2(0x03,0) ",%3 \n" "punpcklbw %%xmm0,%%xmm0 \n" - "movd 0x0(%4,%3,4),%%xmm2 \n" - "movzb 0x7(%0),%3 \n" - "movd 0x0(%4,%3,4),%%xmm3 \n" + MEMOPREG(movd,0x00,4,3,4,xmm2) // movd 0x0(%4,%3,4),%%xmm2 + "movzb " MEMACCESS2(0x07,0) ",%3 \n" + MEMOPREG(movd,0x00,4,3,4,xmm3) // movd 0x0(%4,%3,4),%%xmm3 "pshuflw $0x40,%%xmm2,%%xmm2 \n" "pshuflw $0x40,%%xmm3,%%xmm3 \n" "movlhps %%xmm3,%%xmm2 \n" "pmulhuw %%xmm2,%%xmm0 \n" - "movdqa (%0),%%xmm1 \n" - "movzb 0xb(%0),%3 \n" + "movdqu " MEMACCESS(0) ",%%xmm1 \n" + "movzb " MEMACCESS2(0x0b,0) ",%3 \n" "punpckhbw %%xmm1,%%xmm1 \n" - "movd 0x0(%4,%3,4),%%xmm2 \n" - "movzb 0xf(%0),%3 \n" - "movd 0x0(%4,%3,4),%%xmm3 \n" + MEMOPREG(movd,0x00,4,3,4,xmm2) // movd 0x0(%4,%3,4),%%xmm2 + "movzb " MEMACCESS2(0x0f,0) ",%3 \n" + MEMOPREG(movd,0x00,4,3,4,xmm3) // movd 0x0(%4,%3,4),%%xmm3 "pshuflw $0x40,%%xmm2,%%xmm2 \n" "pshuflw $0x40,%%xmm3,%%xmm3 \n" "movlhps %%xmm3,%%xmm2 \n" "pmulhuw %%xmm2,%%xmm1 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" "packuswb %%xmm1,%%xmm0 \n" "sub $0x4,%2 \n" - "movdqa %%xmm0,(%0,%1,1) \n" - "lea 0x10(%0),%0 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 @@ -3910,6 +4434,9 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, "+r"(alpha) // %3 : "r"(fixed_invtbl8) // %4 : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" #endif @@ -3923,21 +4450,21 @@ void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { asm volatile ( "movdqa %3,%%xmm4 \n" "movdqa %4,%%xmm5 \n" - "sub %0,%1 \n" // 8 pixel loop. 
- ".p2align 4 \n" + ".p2align 2 \n" "1: \n" - "movdqa (%0),%%xmm0 \n" - "movdqa 0x10(%0),%%xmm1 \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" "pmaddubsw %%xmm4,%%xmm0 \n" "pmaddubsw %%xmm4,%%xmm1 \n" "phaddw %%xmm1,%%xmm0 \n" "paddw %%xmm5,%%xmm0 \n" "psrlw $0x7,%%xmm0 \n" "packuswb %%xmm0,%%xmm0 \n" - "movdqa (%0),%%xmm2 \n" - "movdqa 0x10(%0),%%xmm3 \n" + "movdqa " MEMACCESS(0) ",%%xmm2 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm3 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" "psrld $0x18,%%xmm2 \n" "psrld $0x18,%%xmm3 \n" "packuswb %%xmm3,%%xmm2 \n" @@ -3949,9 +4476,9 @@ void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { "punpcklwd %%xmm3,%%xmm0 \n" "punpckhwd %%xmm3,%%xmm1 \n" "sub $0x8,%2 \n" - "movdqa %%xmm0,(%0,%1,1) \n" - "movdqa %%xmm1,0x10(%0,%1,1) \n" - "lea 0x20(%0),%0 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n" + "lea " MEMLEA(0x20,1) ",%1 \n" "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 @@ -3971,15 +4498,15 @@ void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { // g = (r * 45 + g * 88 + b * 22) >> 7 // r = (r * 50 + g * 98 + b * 24) >> 7 // Constant for ARGB color to sepia tone -CONST vec8 kARGBToSepiaB = { +static vec8 kARGBToSepiaB = { 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0 }; -CONST vec8 kARGBToSepiaG = { +static vec8 kARGBToSepiaG = { 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0 }; -CONST vec8 kARGBToSepiaR = { +static vec8 kARGBToSepiaR = { 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0 }; @@ -3991,32 +4518,32 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) { "movdqa %4,%%xmm4 \n" // 8 pixel loop. - ".p2align 4 \n" + ".p2align 2 \n" "1: \n" - "movdqa (%0),%%xmm0 \n" - "movdqa 0x10(%0),%%xmm6 \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm6 \n" "pmaddubsw %%xmm2,%%xmm0 \n" "pmaddubsw %%xmm2,%%xmm6 \n" "phaddw %%xmm6,%%xmm0 \n" "psrlw $0x7,%%xmm0 \n" "packuswb %%xmm0,%%xmm0 \n" - "movdqa (%0),%%xmm5 \n" - "movdqa 0x10(%0),%%xmm1 \n" + "movdqa " MEMACCESS(0) ",%%xmm5 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" "pmaddubsw %%xmm3,%%xmm5 \n" "pmaddubsw %%xmm3,%%xmm1 \n" "phaddw %%xmm1,%%xmm5 \n" "psrlw $0x7,%%xmm5 \n" "packuswb %%xmm5,%%xmm5 \n" "punpcklbw %%xmm5,%%xmm0 \n" - "movdqa (%0),%%xmm5 \n" - "movdqa 0x10(%0),%%xmm1 \n" + "movdqa " MEMACCESS(0) ",%%xmm5 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" "pmaddubsw %%xmm4,%%xmm5 \n" "pmaddubsw %%xmm4,%%xmm1 \n" "phaddw %%xmm1,%%xmm5 \n" "psrlw $0x7,%%xmm5 \n" "packuswb %%xmm5,%%xmm5 \n" - "movdqa (%0),%%xmm6 \n" - "movdqa 0x10(%0),%%xmm1 \n" + "movdqa " MEMACCESS(0) ",%%xmm6 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" "psrld $0x18,%%xmm6 \n" "psrld $0x18,%%xmm1 \n" "packuswb %%xmm1,%%xmm6 \n" @@ -4026,9 +4553,9 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) { "punpcklwd %%xmm5,%%xmm0 \n" "punpckhwd %%xmm5,%%xmm1 \n" "sub $0x8,%1 \n" - "movdqa %%xmm0,(%0) \n" - "movdqa %%xmm1,0x10(%0) \n" - "lea 0x20(%0),%0 \n" + "movdqa %%xmm0," MEMACCESS(0) " \n" + "movdqa %%xmm1," MEMACCESS2(0x10,0) " \n" + "lea " MEMLEA(0x20,0) ",%0 \n" "jg 1b \n" : "+r"(dst_argb), // %0 "+r"(width) // %1 @@ -4046,62 +4573,64 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) { #ifdef HAS_ARGBCOLORMATRIXROW_SSSE3 // Tranform 8 ARGB pixels (32 bytes) with color matrix. // Same as Sepia except matrix is provided. 
-void ARGBColorMatrixRow_SSSE3(uint8* dst_argb, const int8* matrix_argb, - int width) { +void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb, + const int8* matrix_argb, int width) { asm volatile ( - "movd (%2),%%xmm2 \n" - "movd 0x4(%2),%%xmm3 \n" - "movd 0x8(%2),%%xmm4 \n" - "pshufd $0x0,%%xmm2,%%xmm2 \n" - "pshufd $0x0,%%xmm3,%%xmm3 \n" - "pshufd $0x0,%%xmm4,%%xmm4 \n" + "movdqu " MEMACCESS(3) ",%%xmm5 \n" + "pshufd $0x00,%%xmm5,%%xmm2 \n" + "pshufd $0x55,%%xmm5,%%xmm3 \n" + "pshufd $0xaa,%%xmm5,%%xmm4 \n" + "pshufd $0xff,%%xmm5,%%xmm5 \n" // 8 pixel loop. - ".p2align 4 \n" + ".p2align 2 \n" "1: \n" - "movdqa (%0),%%xmm0 \n" - "movdqa 0x10(%0),%%xmm6 \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm7 \n" "pmaddubsw %%xmm2,%%xmm0 \n" - "pmaddubsw %%xmm2,%%xmm6 \n" - "movdqa (%0),%%xmm5 \n" - "movdqa 0x10(%0),%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm5 \n" + "pmaddubsw %%xmm2,%%xmm7 \n" + "movdqa " MEMACCESS(0) ",%%xmm6 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" "pmaddubsw %%xmm3,%%xmm1 \n" - "phaddsw %%xmm6,%%xmm0 \n" - "phaddsw %%xmm1,%%xmm5 \n" - "psraw $0x7,%%xmm0 \n" - "psraw $0x7,%%xmm5 \n" + "phaddsw %%xmm7,%%xmm0 \n" + "phaddsw %%xmm1,%%xmm6 \n" + "psraw $0x6,%%xmm0 \n" + "psraw $0x6,%%xmm6 \n" "packuswb %%xmm0,%%xmm0 \n" - "packuswb %%xmm5,%%xmm5 \n" - "punpcklbw %%xmm5,%%xmm0 \n" - "movdqa (%0),%%xmm5 \n" - "movdqa 0x10(%0),%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm5 \n" + "packuswb %%xmm6,%%xmm6 \n" + "punpcklbw %%xmm6,%%xmm0 \n" + "movdqa " MEMACCESS(0) ",%%xmm1 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm7 \n" "pmaddubsw %%xmm4,%%xmm1 \n" - "phaddsw %%xmm1,%%xmm5 \n" - "psraw $0x7,%%xmm5 \n" - "packuswb %%xmm5,%%xmm5 \n" - "movdqa (%0),%%xmm6 \n" - "movdqa 0x10(%0),%%xmm1 \n" - "psrld $0x18,%%xmm6 \n" - "psrld $0x18,%%xmm1 \n" - "packuswb %%xmm1,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm7 \n" + "phaddsw %%xmm7,%%xmm1 \n" + "movdqa " MEMACCESS(0) ",%%xmm6 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm7 \n" + "pmaddubsw %%xmm5,%%xmm6 \n" + "pmaddubsw %%xmm5,%%xmm7 \n" + "phaddsw %%xmm7,%%xmm6 \n" + "psraw $0x6,%%xmm1 \n" + "psraw $0x6,%%xmm6 \n" + "packuswb %%xmm1,%%xmm1 \n" "packuswb %%xmm6,%%xmm6 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm6,%%xmm5 \n" - "punpcklwd %%xmm5,%%xmm0 \n" - "punpckhwd %%xmm5,%%xmm1 \n" - "sub $0x8,%1 \n" - "movdqa %%xmm0,(%0) \n" - "movdqa %%xmm1,0x10(%0) \n" - "lea 0x20(%0),%0 \n" + "punpcklbw %%xmm6,%%xmm1 \n" + "movdqa %%xmm0,%%xmm6 \n" + "punpcklwd %%xmm1,%%xmm0 \n" + "punpckhwd %%xmm1,%%xmm6 \n" + "sub $0x8,%2 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "movdqa %%xmm6," MEMACCESS2(0x10,1) " \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "lea " MEMLEA(0x20,1) ",%1 \n" "jg 1b \n" - : "+r"(dst_argb), // %0 - "+r"(width) // %1 - : "r"(matrix_argb) // %2 + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(matrix_argb) // %3 : "memory", "cc" #if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" #endif ); } @@ -4129,14 +4658,14 @@ void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size, // 4 pixel loop. 
".p2align 2 \n" "1: \n" - "movdqa (%0),%%xmm0 \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" "punpcklbw %%xmm5,%%xmm0 \n" "pmulhuw %%xmm2,%%xmm0 \n" - "movdqa (%0),%%xmm1 \n" + "movdqa " MEMACCESS(0) ",%%xmm1 \n" "punpckhbw %%xmm5,%%xmm1 \n" "pmulhuw %%xmm2,%%xmm1 \n" "pmullw %%xmm3,%%xmm0 \n" - "movdqa (%0),%%xmm7 \n" + "movdqa " MEMACCESS(0) ",%%xmm7 \n" "pmullw %%xmm3,%%xmm1 \n" "pand %%xmm6,%%xmm7 \n" "paddw %%xmm4,%%xmm0 \n" @@ -4144,8 +4673,8 @@ void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size, "packuswb %%xmm1,%%xmm0 \n" "por %%xmm7,%%xmm0 \n" "sub $0x4,%1 \n" - "movdqa %%xmm0,(%0) \n" - "lea 0x10(%0),%0 \n" + "movdqa %%xmm0," MEMACCESS(0) " \n" + "lea " MEMLEA(0x10,0) ",%0 \n" "jg 1b \n" : "+r"(dst_argb), // %0 "+r"(width) // %1 @@ -4167,14 +4696,14 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width, uint32 value) { asm volatile ( "movd %3,%%xmm2 \n" - "sub %0,%1 \n" "punpcklbw %%xmm2,%%xmm2 \n" "punpcklqdq %%xmm2,%%xmm2 \n" // 4 pixel loop. ".p2align 2 \n" "1: \n" - "movdqa (%0),%%xmm0 \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" "movdqa %%xmm0,%%xmm1 \n" "punpcklbw %%xmm0,%%xmm0 \n" "punpckhbw %%xmm1,%%xmm1 \n" @@ -4184,8 +4713,8 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width, "psrlw $0x8,%%xmm1 \n" "packuswb %%xmm1,%%xmm0 \n" "sub $0x4,%2 \n" - "movdqa %%xmm0,(%0,%1,1) \n" - "lea 0x10(%0),%0 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 @@ -4205,14 +4734,14 @@ void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, uint8* dst_argb, int width) { asm volatile ( "pxor %%xmm5,%%xmm5 \n" - "sub %0,%1 \n" - "sub %0,%2 \n" // 4 pixel loop. - ".p2align 4 \n" + ".p2align 2 \n" "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu (%0,%1),%%xmm2 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" + "movdqu " MEMACCESS(1) ",%%xmm2 \n" + "lea " MEMLEA(0x10,1) ",%1 \n" "movdqu %%xmm0,%%xmm1 \n" "movdqu %%xmm2,%%xmm3 \n" "punpcklbw %%xmm0,%%xmm0 \n" @@ -4223,8 +4752,8 @@ void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, "pmulhuw %%xmm3,%%xmm1 \n" "packuswb %%xmm1,%%xmm0 \n" "sub $0x4,%3 \n" - "movdqu %%xmm0,(%0,%2,1) \n" - "lea 0x10(%0),%0 \n" + "movdqu %%xmm0," MEMACCESS(2) " \n" + "lea " MEMLEA(0x10,2) ",%2 \n" "jg 1b \n" : "+r"(src_argb0), // %0 "+r"(src_argb1), // %1 @@ -4244,18 +4773,17 @@ void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, uint8* dst_argb, int width) { asm volatile ( - "sub %0,%1 \n" - "sub %0,%2 \n" - // 4 pixel loop. - ".p2align 4 \n" + ".p2align 2 \n" "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu (%0,%1),%%xmm1 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" + "movdqu " MEMACCESS(1) ",%%xmm1 \n" + "lea " MEMLEA(0x10,1) ",%1 \n" "paddusb %%xmm1,%%xmm0 \n" "sub $0x4,%3 \n" - "movdqu %%xmm0,(%0,%2,1) \n" - "lea 0x10(%0),%0 \n" + "movdqu %%xmm0," MEMACCESS(2) " \n" + "lea " MEMLEA(0x10,2) ",%2 \n" "jg 1b \n" : "+r"(src_argb0), // %0 "+r"(src_argb1), // %1 @@ -4275,18 +4803,17 @@ void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, uint8* dst_argb, int width) { asm volatile ( - "sub %0,%1 \n" - "sub %0,%2 \n" - // 4 pixel loop. 
- ".p2align 4 \n" + ".p2align 2 \n" "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu (%0,%1),%%xmm1 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" + "movdqu " MEMACCESS(1) ",%%xmm1 \n" + "lea " MEMLEA(0x10,1) ",%1 \n" "psubusb %%xmm1,%%xmm0 \n" "sub $0x4,%3 \n" - "movdqu %%xmm0,(%0,%2,1) \n" - "lea 0x10(%0),%0 \n" + "movdqu %%xmm0," MEMACCESS(2) " \n" + "lea " MEMLEA(0x10,2) ",%2 \n" "jg 1b \n" : "+r"(src_argb0), // %0 "+r"(src_argb1), // %1 @@ -4301,13 +4828,13 @@ void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, } #endif // HAS_ARGBSUBTRACTROW_SSE2 -#ifdef HAS_SOBELXROW_SSSE3 +#ifdef HAS_SOBELXROW_SSE2 // SobelX as a matrix is // -1 0 1 // -2 0 2 // -1 0 1 -void SobelXRow_SSSE3(const uint8* src_y0, const uint8* src_y1, - const uint8* src_y2, uint8* dst_sobelx, int width) { +void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1, + const uint8* src_y2, uint8* dst_sobelx, int width) { asm volatile ( "sub %0,%1 \n" "sub %0,%2 \n" @@ -4315,31 +4842,37 @@ void SobelXRow_SSSE3(const uint8* src_y0, const uint8* src_y1, "pxor %%xmm5,%%xmm5 \n" // 8 pixel loop. - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movq (%0),%%xmm0 \n" - "movq 0x2(%0),%%xmm1 \n" + "movq " MEMACCESS(0) ",%%xmm0 \n" + "movq " MEMACCESS2(0x2,0) ",%%xmm1 \n" "punpcklbw %%xmm5,%%xmm0 \n" "punpcklbw %%xmm5,%%xmm1 \n" "psubw %%xmm1,%%xmm0 \n" - "movq (%0,%1,1),%%xmm1 \n" - "movq 0x2(%0,%1,1),%%xmm2 \n" + BUNDLEALIGN + MEMOPREG(movq,0x00,0,1,1,xmm1) // movq (%0,%1,1),%%xmm1 + MEMOPREG(movq,0x02,0,1,1,xmm2) // movq 0x2(%0,%1,1),%%xmm2 "punpcklbw %%xmm5,%%xmm1 \n" "punpcklbw %%xmm5,%%xmm2 \n" "psubw %%xmm2,%%xmm1 \n" - "movq (%0,%2,1),%%xmm2 \n" - "movq 0x2(%0,%2,1),%%xmm3 \n" + BUNDLEALIGN + MEMOPREG(movq,0x00,0,2,1,xmm2) // movq (%0,%2,1),%%xmm2 + MEMOPREG(movq,0x02,0,2,1,xmm3) // movq 0x2(%0,%2,1),%%xmm3 "punpcklbw %%xmm5,%%xmm2 \n" "punpcklbw %%xmm5,%%xmm3 \n" "psubw %%xmm3,%%xmm2 \n" "paddw %%xmm2,%%xmm0 \n" "paddw %%xmm1,%%xmm0 \n" "paddw %%xmm1,%%xmm0 \n" - "pabsw %%xmm0,%%xmm0 \n" + "pxor %%xmm1,%%xmm1 \n" + "psubw %%xmm0,%%xmm1 \n" + "pmaxsw %%xmm1,%%xmm0 \n" "packuswb %%xmm0,%%xmm0 \n" "sub $0x8,%4 \n" - "movq %%xmm0,(%0,%3,1) \n" - "lea 0x8(%0),%0 \n" + BUNDLEALIGN + MEMOPMEM(movq,xmm0,0x00,0,3,1) // movq %%xmm0,(%0,%3,1) + "lea " MEMLEA(0x8,0) ",%0 \n" "jg 1b \n" : "+r"(src_y0), // %0 "+r"(src_y1), // %1 @@ -4348,51 +4881,60 @@ void SobelXRow_SSSE3(const uint8* src_y0, const uint8* src_y1, "+r"(width) // %4 : : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" #endif ); } -#endif // HAS_SOBELXROW_SSSE3 +#endif // HAS_SOBELXROW_SSE2 -#ifdef HAS_SOBELYROW_SSSE3 +#ifdef HAS_SOBELYROW_SSE2 // SobelY as a matrix is // -1 -2 -1 // 0 0 0 // 1 2 1 -void SobelYRow_SSSE3(const uint8* src_y0, const uint8* src_y1, - uint8* dst_sobely, int width) { +void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1, + uint8* dst_sobely, int width) { asm volatile ( "sub %0,%1 \n" "sub %0,%2 \n" "pxor %%xmm5,%%xmm5 \n" // 8 pixel loop. 
- ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movq (%0),%%xmm0 \n" - "movq (%0,%1,1),%%xmm1 \n" + "movq " MEMACCESS(0) ",%%xmm0 \n" + MEMOPREG(movq,0x00,0,1,1,xmm1) // movq (%0,%1,1),%%xmm1 "punpcklbw %%xmm5,%%xmm0 \n" "punpcklbw %%xmm5,%%xmm1 \n" "psubw %%xmm1,%%xmm0 \n" - "movq 0x1(%0),%%xmm1 \n" - "movq 0x1(%0,%1,1),%%xmm2 \n" + BUNDLEALIGN + "movq " MEMACCESS2(0x1,0) ",%%xmm1 \n" + MEMOPREG(movq,0x01,0,1,1,xmm2) // movq 0x1(%0,%1,1),%%xmm2 "punpcklbw %%xmm5,%%xmm1 \n" "punpcklbw %%xmm5,%%xmm2 \n" "psubw %%xmm2,%%xmm1 \n" - "movq 0x2(%0),%%xmm2 \n" - "movq 0x2(%0,%1,1),%%xmm3 \n" + BUNDLEALIGN + "movq " MEMACCESS2(0x2,0) ",%%xmm2 \n" + MEMOPREG(movq,0x02,0,1,1,xmm3) // movq 0x2(%0,%1,1),%%xmm3 "punpcklbw %%xmm5,%%xmm2 \n" "punpcklbw %%xmm5,%%xmm3 \n" "psubw %%xmm3,%%xmm2 \n" "paddw %%xmm2,%%xmm0 \n" "paddw %%xmm1,%%xmm0 \n" "paddw %%xmm1,%%xmm0 \n" - "pabsw %%xmm0,%%xmm0 \n" + "pxor %%xmm1,%%xmm1 \n" + "psubw %%xmm0,%%xmm1 \n" + "pmaxsw %%xmm1,%%xmm0 \n" "packuswb %%xmm0,%%xmm0 \n" "sub $0x8,%3 \n" - "movq %%xmm0,(%0,%2,1) \n" - "lea 0x8(%0),%0 \n" + BUNDLEALIGN + MEMOPMEM(movq,xmm0,0x00,0,2,1) // movq %%xmm0,(%0,%2,1) + "lea " MEMLEA(0x8,0) ",%0 \n" "jg 1b \n" : "+r"(src_y0), // %0 "+r"(src_y1), // %1 @@ -4400,12 +4942,15 @@ void SobelYRow_SSSE3(const uint8* src_y0, const uint8* src_y1, "+r"(width) // %3 : : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" #endif ); } -#endif // HAS_SOBELYROW_SSSE3 +#endif // HAS_SOBELYROW_SSE2 #ifdef HAS_SOBELROW_SSE2 // Adds Sobel X and Sobel Y and stores Sobel into ARGB. @@ -4414,18 +4959,19 @@ void SobelYRow_SSSE3(const uint8* src_y0, const uint8* src_y1, // G = Sobel // B = Sobel void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_argb, int width) { + uint8* dst_argb, int width) { asm volatile ( "sub %0,%1 \n" "pcmpeqb %%xmm5,%%xmm5 \n" "pslld $0x18,%%xmm5 \n" // 8 pixel loop. - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movdqa (%0),%%xmm0 \n" - "movdqa (%0,%1,1),%%xmm1 \n" - "lea 0x10(%0),%0 \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + MEMOPREG(movdqa,0x00,0,1,1,xmm1) // movdqa (%0,%1,1),%%xmm1 + "lea " MEMLEA(0x10,0) ",%0 \n" "paddusb %%xmm1,%%xmm0 \n" "movdqa %%xmm0,%%xmm2 \n" "punpcklbw %%xmm0,%%xmm2 \n" @@ -4441,11 +4987,11 @@ void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, "por %%xmm5,%%xmm3 \n" "por %%xmm5,%%xmm0 \n" "sub $0x10,%3 \n" - "movdqa %%xmm1,(%2) \n" - "movdqa %%xmm2,0x10(%2) \n" - "movdqa %%xmm3,0x20(%2) \n" - "movdqa %%xmm0,0x30(%2) \n" - "lea 0x40(%2),%2 \n" + "movdqa %%xmm1," MEMACCESS(2) " \n" + "movdqa %%xmm2," MEMACCESS2(0x10,2) " \n" + "movdqa %%xmm3," MEMACCESS2(0x20,2) " \n" + "movdqa %%xmm0," MEMACCESS2(0x30,2) " \n" + "lea " MEMLEA(0x40,2) ",%2 \n" "jg 1b \n" : "+r"(src_sobelx), // %0 "+r"(src_sobely), // %1 @@ -4453,6 +4999,9 @@ void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, "+r"(width) // %3 : : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" #endif @@ -4460,6 +5009,43 @@ void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, } #endif // HAS_SOBELROW_SSE2 +#ifdef HAS_SOBELTOPLANEROW_SSE2 +// Adds Sobel X and Sobel Y and stores Sobel into a plane. 
+void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_y, int width) { + asm volatile ( + "sub %0,%1 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "pslld $0x18,%%xmm5 \n" + + // 8 pixel loop. + ".p2align 2 \n" + BUNDLEALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + MEMOPREG(movdqa,0x00,0,1,1,xmm1) // movdqa (%0,%1,1),%%xmm1 + "lea " MEMLEA(0x10,0) ",%0 \n" + "paddusb %%xmm1,%%xmm0 \n" + "sub $0x10,%3 \n" + "movdqa %%xmm0," MEMACCESS(2) " \n" + "lea " MEMLEA(0x10,2) ",%2 \n" + "jg 1b \n" + : "+r"(src_sobelx), // %0 + "+r"(src_sobely), // %1 + "+r"(dst_y), // %2 + "+r"(width) // %3 + : + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1" +#endif + ); +} +#endif // HAS_SOBELTOPLANEROW_SSE2 + #ifdef HAS_SOBELXYROW_SSE2 // Mixes Sobel X, Sobel Y and Sobel into ARGB. // A = 255 @@ -4473,11 +5059,12 @@ void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, "pcmpeqb %%xmm5,%%xmm5 \n" // 8 pixel loop. - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movdqa (%0),%%xmm0 \n" - "movdqa (%0,%1,1),%%xmm1 \n" - "lea 0x10(%0),%0 \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + MEMOPREG(movdqa,0x00,0,1,1,xmm1) // movdqa (%0,%1,1),%%xmm1 + "lea " MEMLEA(0x10,0) ",%0 \n" "movdqa %%xmm0,%%xmm2 \n" "paddusb %%xmm1,%%xmm2 \n" "movdqa %%xmm0,%%xmm3 \n" @@ -4493,11 +5080,11 @@ void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, "punpcklwd %%xmm0,%%xmm7 \n" "punpckhwd %%xmm0,%%xmm1 \n" "sub $0x10,%3 \n" - "movdqa %%xmm6,(%2) \n" - "movdqa %%xmm4,0x10(%2) \n" - "movdqa %%xmm7,0x20(%2) \n" - "movdqa %%xmm1,0x30(%2) \n" - "lea 0x40(%2),%2 \n" + "movdqa %%xmm6," MEMACCESS(2) " \n" + "movdqa %%xmm4," MEMACCESS2(0x10,2) " \n" + "movdqa %%xmm7," MEMACCESS2(0x20,2) " \n" + "movdqa %%xmm1," MEMACCESS2(0x30,2) " \n" + "lea " MEMLEA(0x40,2) ",%2 \n" "jg 1b \n" : "+r"(src_sobelx), // %0 "+r"(src_sobely), // %1 @@ -4505,6 +5092,9 @@ void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, "+r"(width) // %3 : : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" #endif @@ -4518,7 +5108,6 @@ void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum, const int32* previous_cumsum, int width) { asm volatile ( - "sub %1,%2 \n" "pxor %%xmm0,%%xmm0 \n" "pxor %%xmm1,%%xmm1 \n" "sub $0x4,%3 \n" @@ -4529,8 +5118,8 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum, // 4 pixel loop \n" ".p2align 2 \n" "40: \n" - "movdqu (%0),%%xmm2 \n" - "lea 0x10(%0),%0 \n" + "movdqu " MEMACCESS(0) ",%%xmm2 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" "movdqa %%xmm2,%%xmm4 \n" "punpcklbw %%xmm1,%%xmm2 \n" "movdqa %%xmm2,%%xmm3 \n" @@ -4541,22 +5130,23 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum, "punpcklwd %%xmm1,%%xmm4 \n" "punpckhwd %%xmm1,%%xmm5 \n" "paddd %%xmm2,%%xmm0 \n" - "movdqa (%1,%2,1),%%xmm2 \n" + "movdqa " MEMACCESS(2) ",%%xmm2 \n" "paddd %%xmm0,%%xmm2 \n" "paddd %%xmm3,%%xmm0 \n" - "movdqa 0x10(%1,%2,1),%%xmm3 \n" + "movdqa " MEMACCESS2(0x10,2) ",%%xmm3 \n" "paddd %%xmm0,%%xmm3 \n" "paddd %%xmm4,%%xmm0 \n" - "movdqa 0x20(%1,%2,1),%%xmm4 \n" + "movdqa " MEMACCESS2(0x20,2) ",%%xmm4 \n" "paddd %%xmm0,%%xmm4 \n" "paddd %%xmm5,%%xmm0 \n" - "movdqa 0x30(%1,%2,1),%%xmm5 \n" + "movdqa " MEMACCESS2(0x30,2) ",%%xmm5 \n" + "lea " MEMLEA(0x40,2) ",%2 \n" 
"paddd %%xmm0,%%xmm5 \n" - "movdqa %%xmm2,(%1) \n" - "movdqa %%xmm3,0x10(%1) \n" - "movdqa %%xmm4,0x20(%1) \n" - "movdqa %%xmm5,0x30(%1) \n" - "lea 0x40(%1),%1 \n" + "movdqa %%xmm2," MEMACCESS(1) " \n" + "movdqa %%xmm3," MEMACCESS2(0x10,1) " \n" + "movdqa %%xmm4," MEMACCESS2(0x20,1) " \n" + "movdqa %%xmm5," MEMACCESS2(0x30,1) " \n" + "lea " MEMLEA(0x40,1) ",%1 \n" "sub $0x4,%3 \n" "jge 40b \n" @@ -4567,15 +5157,16 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum, // 1 pixel loop \n" ".p2align 2 \n" "10: \n" - "movd (%0),%%xmm2 \n" - "lea 0x4(%0),%0 \n" + "movd " MEMACCESS(0) ",%%xmm2 \n" + "lea " MEMLEA(0x4,0) ",%0 \n" "punpcklbw %%xmm1,%%xmm2 \n" "punpcklwd %%xmm1,%%xmm2 \n" "paddd %%xmm2,%%xmm0 \n" - "movdqu (%1,%2,1),%%xmm2 \n" + "movdqu " MEMACCESS(2) ",%%xmm2 \n" + "lea " MEMLEA(0x10,2) ",%2 \n" "paddd %%xmm0,%%xmm2 \n" - "movdqu %%xmm2,(%1) \n" - "lea 0x10(%1),%1 \n" + "movdqu %%xmm2," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" "sub $0x1,%3 \n" "jge 10b \n" @@ -4598,34 +5189,83 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft, int width, int area, uint8* dst, int count) { asm volatile ( - "movd %5,%%xmm4 \n" - "cvtdq2ps %%xmm4,%%xmm4 \n" - "rcpss %%xmm4,%%xmm4 \n" + "movd %5,%%xmm5 \n" + "cvtdq2ps %%xmm5,%%xmm5 \n" + "rcpss %%xmm5,%%xmm4 \n" "pshufd $0x0,%%xmm4,%%xmm4 \n" "sub $0x4,%3 \n" "jl 49f \n" + "cmpl $0x80,%5 \n" + "ja 40f \n" + + "pshufd $0x0,%%xmm5,%%xmm5 \n" + "pcmpeqb %%xmm6,%%xmm6 \n" + "psrld $0x10,%%xmm6 \n" + "cvtdq2ps %%xmm6,%%xmm6 \n" + "addps %%xmm6,%%xmm5 \n" + "mulps %%xmm4,%%xmm5 \n" + "cvtps2dq %%xmm5,%%xmm5 \n" + "packssdw %%xmm5,%%xmm5 \n" + + // 4 pixel small loop \n" + ".p2align 2 \n" + BUNDLEALIGN + "4: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n" + BUNDLEALIGN + MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0 + MEMOPREG(psubd,0x10,0,4,4,xmm1) // psubd 0x10(%0,%4,4),%%xmm1 + MEMOPREG(psubd,0x20,0,4,4,xmm2) // psubd 0x20(%0,%4,4),%%xmm2 + MEMOPREG(psubd,0x30,0,4,4,xmm3) // psubd 0x30(%0,%4,4),%%xmm3 + "lea " MEMLEA(0x40,0) ",%0 \n" + "psubd " MEMACCESS(1) ",%%xmm0 \n" + "psubd " MEMACCESS2(0x10,1) ",%%xmm1 \n" + "psubd " MEMACCESS2(0x20,1) ",%%xmm2 \n" + "psubd " MEMACCESS2(0x30,1) ",%%xmm3 \n" + BUNDLEALIGN + MEMOPREG(paddd,0x00,1,4,4,xmm0) // paddd 0x00(%1,%4,4),%%xmm0 + MEMOPREG(paddd,0x10,1,4,4,xmm1) // paddd 0x10(%1,%4,4),%%xmm1 + MEMOPREG(paddd,0x20,1,4,4,xmm2) // paddd 0x20(%1,%4,4),%%xmm2 + MEMOPREG(paddd,0x30,1,4,4,xmm3) // paddd 0x30(%1,%4,4),%%xmm3 + "lea " MEMLEA(0x40,1) ",%1 \n" + "packssdw %%xmm1,%%xmm0 \n" + "packssdw %%xmm3,%%xmm2 \n" + "pmulhuw %%xmm5,%%xmm0 \n" + "pmulhuw %%xmm5,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "movdqu %%xmm0," MEMACCESS(2) " \n" + "lea " MEMLEA(0x10,2) ",%2 \n" + "sub $0x4,%3 \n" + "jge 4b \n" + "jmp 49f \n" // 4 pixel loop \n" ".p2align 2 \n" + BUNDLEALIGN "40: \n" - "movdqa (%0),%%xmm0 \n" - "movdqa 0x10(%0),%%xmm1 \n" - "movdqa 0x20(%0),%%xmm2 \n" - "movdqa 0x30(%0),%%xmm3 \n" - "psubd (%0,%4,4),%%xmm0 \n" - "psubd 0x10(%0,%4,4),%%xmm1 \n" - "psubd 0x20(%0,%4,4),%%xmm2 \n" - "psubd 0x30(%0,%4,4),%%xmm3 \n" - "lea 0x40(%0),%0 \n" - "psubd (%1),%%xmm0 \n" - "psubd 0x10(%1),%%xmm1 \n" - "psubd 0x20(%1),%%xmm2 \n" - "psubd 0x30(%1),%%xmm3 \n" - "paddd (%1,%4,4),%%xmm0 \n" - "paddd 0x10(%1,%4,4),%%xmm1 \n" - "paddd 0x20(%1,%4,4),%%xmm2 \n" - "paddd 0x30(%1,%4,4),%%xmm3 \n" - "lea 0x40(%1),%1 \n" + "movdqa " 
MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n" + BUNDLEALIGN + MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0 + MEMOPREG(psubd,0x10,0,4,4,xmm1) // psubd 0x10(%0,%4,4),%%xmm1 + MEMOPREG(psubd,0x20,0,4,4,xmm2) // psubd 0x20(%0,%4,4),%%xmm2 + MEMOPREG(psubd,0x30,0,4,4,xmm3) // psubd 0x30(%0,%4,4),%%xmm3 + "lea " MEMLEA(0x40,0) ",%0 \n" + "psubd " MEMACCESS(1) ",%%xmm0 \n" + "psubd " MEMACCESS2(0x10,1) ",%%xmm1 \n" + "psubd " MEMACCESS2(0x20,1) ",%%xmm2 \n" + "psubd " MEMACCESS2(0x30,1) ",%%xmm3 \n" + BUNDLEALIGN + MEMOPREG(paddd,0x00,1,4,4,xmm0) // paddd 0x00(%1,%4,4),%%xmm0 + MEMOPREG(paddd,0x10,1,4,4,xmm1) // paddd 0x10(%1,%4,4),%%xmm1 + MEMOPREG(paddd,0x20,1,4,4,xmm2) // paddd 0x20(%1,%4,4),%%xmm2 + MEMOPREG(paddd,0x30,1,4,4,xmm3) // paddd 0x30(%1,%4,4),%%xmm3 + "lea " MEMLEA(0x40,1) ",%1 \n" "cvtdq2ps %%xmm0,%%xmm0 \n" "cvtdq2ps %%xmm1,%%xmm1 \n" "mulps %%xmm4,%%xmm0 \n" @@ -4641,8 +5281,8 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft, "packssdw %%xmm1,%%xmm0 \n" "packssdw %%xmm3,%%xmm2 \n" "packuswb %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,(%2) \n" - "lea 0x10(%2),%2 \n" + "movdqu %%xmm0," MEMACCESS(2) " \n" + "lea " MEMLEA(0x10,2) ",%2 \n" "sub $0x4,%3 \n" "jge 40b \n" @@ -4652,20 +5292,22 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft, // 1 pixel loop \n" ".p2align 2 \n" + BUNDLEALIGN "10: \n" - "movdqa (%0),%%xmm0 \n" - "psubd (%0,%4,4),%%xmm0 \n" - "lea 0x10(%0),%0 \n" - "psubd (%1),%%xmm0 \n" - "paddd (%1,%4,4),%%xmm0 \n" - "lea 0x10(%1),%1 \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0 + "lea " MEMLEA(0x10,0) ",%0 \n" + "psubd " MEMACCESS(1) ",%%xmm0 \n" + BUNDLEALIGN + MEMOPREG(paddd,0x00,1,4,4,xmm0) // paddd 0x00(%1,%4,4),%%xmm0 + "lea " MEMLEA(0x10,1) ",%1 \n" "cvtdq2ps %%xmm0,%%xmm0 \n" "mulps %%xmm4,%%xmm0 \n" "cvtps2dq %%xmm0,%%xmm0 \n" "packssdw %%xmm0,%%xmm0 \n" "packuswb %%xmm0,%%xmm0 \n" - "movd %%xmm0,(%2) \n" - "lea 0x4(%2),%2 \n" + "movd %%xmm0," MEMACCESS(2) " \n" + "lea " MEMLEA(0x4,2) ",%2 \n" "sub $0x1,%3 \n" "jge 10b \n" "19: \n" @@ -4676,27 +5318,26 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft, : "r"(static_cast<intptr_t>(width)), // %4 "rm"(area) // %5 : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" #endif ); } #endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 #ifdef HAS_ARGBAFFINEROW_SSE2 -// TODO(fbarchard): Find 64 bit way to avoid masking. // Copy ARGB pixels from source image with slope to a row of destination. -// Caveat - in 64 bit, movd is used with 64 bit gpr due to Mac gcc producing -// an error if movq is used. 
movd %%xmm0,%1 - LIBYUV_API void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, uint8* dst_argb, const float* src_dudv, int width) { intptr_t src_argb_stride_temp = src_argb_stride; intptr_t temp = 0; asm volatile ( - "movq (%3),%%xmm2 \n" - "movq 0x8(%3),%%xmm7 \n" + "movq " MEMACCESS(3) ",%%xmm2 \n" + "movq " MEMACCESS2(0x08,3) ",%%xmm7 \n" "shl $0x10,%1 \n" "add $0x4,%1 \n" "movd %1,%%xmm5 \n" @@ -4715,46 +5356,31 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, "addps %%xmm4,%%xmm4 \n" // 4 pixel loop \n" - ".p2align 4 \n" + ".p2align 2 \n" "40: \n" - "cvttps2dq %%xmm2,%%xmm0 \n" - "cvttps2dq %%xmm3,%%xmm1 \n" - "packssdw %%xmm1,%%xmm0 \n" - "pmaddwd %%xmm5,%%xmm0 \n" -#if defined(__x86_64__) - "movd %%xmm0,%1 \n" - "mov %1,%5 \n" - "and $0x0fffffff,%1 \n" - "shr $32,%5 \n" - "pshufd $0xEE,%%xmm0,%%xmm0 \n" -#else - "movd %%xmm0,%1 \n" + "cvttps2dq %%xmm2,%%xmm0 \n" // x, y float to int first 2 + "cvttps2dq %%xmm3,%%xmm1 \n" // x, y float to int next 2 + "packssdw %%xmm1,%%xmm0 \n" // x, y as 8 shorts + "pmaddwd %%xmm5,%%xmm0 \n" // off = x * 4 + y * stride + "movd %%xmm0,%k1 \n" "pshufd $0x39,%%xmm0,%%xmm0 \n" - "movd %%xmm0,%5 \n" + "movd %%xmm0,%k5 \n" "pshufd $0x39,%%xmm0,%%xmm0 \n" -#endif - "movd (%0,%1,1),%%xmm1 \n" - "movd (%0,%5,1),%%xmm6 \n" + MEMOPREG(movd,0x00,0,1,1,xmm1) // movd (%0,%1,1),%%xmm1 + MEMOPREG(movd,0x00,0,5,1,xmm6) // movd (%0,%5,1),%%xmm6 "punpckldq %%xmm6,%%xmm1 \n" "addps %%xmm4,%%xmm2 \n" - "movq %%xmm1,(%2) \n" -#if defined(__x86_64__) - "movd %%xmm0,%1 \n" - "mov %1,%5 \n" - "and $0x0fffffff,%1 \n" - "shr $32,%5 \n" -#else - "movd %%xmm0,%1 \n" + "movq %%xmm1," MEMACCESS(2) " \n" + "movd %%xmm0,%k1 \n" "pshufd $0x39,%%xmm0,%%xmm0 \n" - "movd %%xmm0,%5 \n" -#endif - "movd (%0,%1,1),%%xmm0 \n" - "movd (%0,%5,1),%%xmm6 \n" + "movd %%xmm0,%k5 \n" + MEMOPREG(movd,0x00,0,1,1,xmm0) // movd (%0,%1,1),%%xmm0 + MEMOPREG(movd,0x00,0,5,1,xmm6) // movd (%0,%5,1),%%xmm6 "punpckldq %%xmm6,%%xmm0 \n" "addps %%xmm4,%%xmm3 \n" "sub $0x4,%4 \n" - "movq %%xmm0,0x08(%2) \n" - "lea 0x10(%2),%2 \n" + "movq %%xmm0," MEMACCESS2(0x08,2) " \n" + "lea " MEMLEA(0x10,2) ",%2 \n" "jge 40b \n" "49: \n" @@ -4762,20 +5388,18 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, "jl 19f \n" // 1 pixel loop \n" - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "10: \n" "cvttps2dq %%xmm2,%%xmm0 \n" "packssdw %%xmm0,%%xmm0 \n" "pmaddwd %%xmm5,%%xmm0 \n" "addps %%xmm7,%%xmm2 \n" - "movd %%xmm0,%1 \n" -#if defined(__x86_64__) - "and $0x0fffffff,%1 \n" -#endif - "movd (%0,%1,1),%%xmm0 \n" + "movd %%xmm0,%k1 \n" + MEMOPREG(movd,0x00,0,1,1,xmm0) // movd (%0,%1,1),%%xmm0 "sub $0x1,%4 \n" - "movd %%xmm0,(%2) \n" - "lea 0x4(%2),%2 \n" + "movd %%xmm0," MEMACCESS(2) " \n" + "lea " MEMLEA(0x04,2) ",%2 \n" "jge 10b \n" "19: \n" : "+r"(src_argb), // %0 @@ -4786,6 +5410,9 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, "+r"(temp) // %5 : : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" #endif @@ -4793,6 +5420,7 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, } #endif // HAS_ARGBAFFINEROW_SSE2 +#ifdef HAS_INTERPOLATEROW_SSSE3 // Bilinear filter 16x2 -> 16x1 void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, ptrdiff_t src_stride, int dst_width, @@ -4818,10 +5446,11 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, "pshufd $0x0,%%xmm5,%%xmm5 \n" // 
General purpose row blend. - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movdqa (%1),%%xmm0 \n" - "movdqa (%1,%4,1),%%xmm2 \n" + "movdqa " MEMACCESS(1) ",%%xmm0 \n" + MEMOPREG(movdqa,0x00,1,4,1,xmm2) "movdqa %%xmm0,%%xmm1 \n" "punpcklbw %%xmm2,%%xmm0 \n" "punpckhbw %%xmm2,%%xmm1 \n" @@ -4831,56 +5460,64 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, "psrlw $0x7,%%xmm1 \n" "packuswb %%xmm1,%%xmm0 \n" "sub $0x10,%2 \n" - "movdqa %%xmm0,(%1,%0,1) \n" - "lea 0x10(%1),%1 \n" + BUNDLEALIGN + MEMOPMEM(movdqa,xmm0,0x00,1,0,1) + "lea " MEMLEA(0x10,1) ",%1 \n" "jg 1b \n" "jmp 99f \n" // Blend 25 / 75. - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "25: \n" - "movdqa (%1),%%xmm0 \n" - "movdqa (%1,%4,1),%%xmm1 \n" + "movdqa " MEMACCESS(1) ",%%xmm0 \n" + MEMOPREG(movdqa,0x00,1,4,1,xmm1) "pavgb %%xmm1,%%xmm0 \n" "pavgb %%xmm1,%%xmm0 \n" "sub $0x10,%2 \n" - "movdqa %%xmm0,(%1,%0,1) \n" - "lea 0x10(%1),%1 \n" + BUNDLEALIGN + MEMOPMEM(movdqa,xmm0,0x00,1,0,1) + "lea " MEMLEA(0x10,1) ",%1 \n" "jg 25b \n" "jmp 99f \n" // Blend 50 / 50. - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "50: \n" - "movdqa (%1),%%xmm0 \n" - "movdqa (%1,%4,1),%%xmm1 \n" + "movdqa " MEMACCESS(1) ",%%xmm0 \n" + MEMOPREG(movdqa,0x00,1,4,1,xmm1) "pavgb %%xmm1,%%xmm0 \n" "sub $0x10,%2 \n" - "movdqa %%xmm0,(%1,%0,1) \n" - "lea 0x10(%1),%1 \n" + BUNDLEALIGN + MEMOPMEM(movdqa,xmm0,0x00,1,0,1) + "lea " MEMLEA(0x10,1) ",%1 \n" "jg 50b \n" "jmp 99f \n" // Blend 75 / 25. - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "75: \n" - "movdqa (%1),%%xmm1 \n" - "movdqa (%1,%4,1),%%xmm0 \n" + "movdqa " MEMACCESS(1) ",%%xmm1 \n" + MEMOPREG(movdqa,0x00,1,4,1,xmm0) "pavgb %%xmm1,%%xmm0 \n" "pavgb %%xmm1,%%xmm0 \n" "sub $0x10,%2 \n" - "movdqa %%xmm0,(%1,%0,1) \n" - "lea 0x10(%1),%1 \n" + BUNDLEALIGN + MEMOPMEM(movdqa,xmm0,0x00,1,0,1) + "lea " MEMLEA(0x10,1) ",%1 \n" "jg 75b \n" "jmp 99f \n" // Blend 100 / 0 - Copy row unchanged. - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "100: \n" - "movdqa (%1),%%xmm0 \n" + "movdqa " MEMACCESS(1) ",%%xmm0 \n" "sub $0x10,%2 \n" - "movdqa %%xmm0,(%1,%0,1) \n" - "lea 0x10(%1),%1 \n" + MEMOPMEM(movdqa,xmm0,0x00,1,0,1) + "lea " MEMLEA(0x10,1) ",%1 \n" "jg 100b \n" "99: \n" @@ -4890,11 +5527,15 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, "+r"(source_y_fraction) // %3 : "r"(static_cast<intptr_t>(src_stride)) // %4 : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm5" #endif ); } +#endif // HAS_INTERPOLATEROW_SSSE3 #ifdef HAS_INTERPOLATEROW_SSE2 // Bilinear filter 16x2 -> 16x1 @@ -4923,10 +5564,11 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr, "pxor %%xmm4,%%xmm4 \n" // General purpose row blend. - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movdqa (%1),%%xmm0 \n" - "movdqa (%1,%4,1),%%xmm2 \n" + "movdqa " MEMACCESS(1) ",%%xmm0 \n" + MEMOPREG(movdqa,0x00,1,4,1,xmm2) // movdqa (%1,%4,1),%%xmm2 "movdqa %%xmm0,%%xmm1 \n" "movdqa %%xmm2,%%xmm3 \n" "punpcklbw %%xmm4,%%xmm2 \n" @@ -4943,56 +5585,64 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr, "paddw %%xmm3,%%xmm1 \n" "packuswb %%xmm1,%%xmm0 \n" "sub $0x10,%2 \n" - "movdqa %%xmm0,(%1,%0,1) \n" - "lea 0x10(%1),%1 \n" + BUNDLEALIGN + MEMOPMEM(movdqa,xmm0,0x00,1,0,1) // movdqa %%xmm0,(%1,%0,1) + "lea " MEMLEA(0x10,1) ",%1 \n" "jg 1b \n" "jmp 99f \n" // Blend 25 / 75. 
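// Editor's sketch (not libyuv code): scalar form of the SSSE3 general row
// blend above (the pmaddubsw / psrlw $0x7 path).  Assumption: the two
// fractions packed into xmm5 sum to 128, which is what the final >> 7
// implies.
#include <stdint.h>
static uint8_t BlendRows(uint8_t row0, uint8_t row1, int y1_fraction) {
  int f1 = y1_fraction;        // weight of the second row, 0..128 (assumed)
  int f0 = 128 - f1;           // weight of the first row
  return (uint8_t)((row0 * f0 + row1 * f1) >> 7);
}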
- ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "25: \n" - "movdqa (%1),%%xmm0 \n" - "movdqa (%1,%4,1),%%xmm1 \n" + "movdqa " MEMACCESS(1) ",%%xmm0 \n" + MEMOPREG(movdqa,0x00,1,4,1,xmm1) // movdqa (%1,%4,1),%%xmm1 "pavgb %%xmm1,%%xmm0 \n" "pavgb %%xmm1,%%xmm0 \n" "sub $0x10,%2 \n" - "movdqa %%xmm0,(%1,%0,1) \n" - "lea 0x10(%1),%1 \n" + BUNDLEALIGN + MEMOPMEM(movdqa,xmm0,0x00,1,0,1) // movdqa %%xmm0,(%1,%0,1) + "lea " MEMLEA(0x10,1) ",%1 \n" "jg 25b \n" "jmp 99f \n" // Blend 50 / 50. - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "50: \n" - "movdqa (%1),%%xmm0 \n" - "movdqa (%1,%4,1),%%xmm1 \n" + "movdqa " MEMACCESS(1) ",%%xmm0 \n" + MEMOPREG(movdqa,0x00,1,4,1,xmm1) // movdqa (%1,%4,1),%%xmm1 "pavgb %%xmm1,%%xmm0 \n" "sub $0x10,%2 \n" - "movdqa %%xmm0,(%1,%0,1) \n" - "lea 0x10(%1),%1 \n" + BUNDLEALIGN + MEMOPMEM(movdqa,xmm0,0x00,1,0,1) // movdqa %%xmm0,(%1,%0,1) + "lea " MEMLEA(0x10,1) ",%1 \n" "jg 50b \n" "jmp 99f \n" // Blend 75 / 25. - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "75: \n" - "movdqa (%1),%%xmm1 \n" - "movdqa (%1,%4,1),%%xmm0 \n" + "movdqa " MEMACCESS(1) ",%%xmm1 \n" + MEMOPREG(movdqa,0x00,1,4,1,xmm0) // movdqa (%1,%4,1),%%xmm0 "pavgb %%xmm1,%%xmm0 \n" "pavgb %%xmm1,%%xmm0 \n" "sub $0x10,%2 \n" - "movdqa %%xmm0,(%1,%0,1) \n" - "lea 0x10(%1),%1 \n" + BUNDLEALIGN + MEMOPMEM(movdqa,xmm0,0x00,1,0,1) // movdqa %%xmm0,(%1,%0,1) + "lea " MEMLEA(0x10,1) ",%1 \n" "jg 75b \n" "jmp 99f \n" // Blend 100 / 0 - Copy row unchanged. - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "100: \n" - "movdqa (%1),%%xmm0 \n" + "movdqa " MEMACCESS(1) ",%%xmm0 \n" "sub $0x10,%2 \n" - "movdqa %%xmm0,(%1,%0,1) \n" - "lea 0x10(%1),%1 \n" + MEMOPMEM(movdqa,xmm0,0x00,1,0,1) // movdqa %%xmm0,(%1,%0,1) + "lea " MEMLEA(0x10,1) ",%1 \n" "jg 100b \n" "99: \n" @@ -5002,6 +5652,9 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr, "+r"(source_y_fraction) // %3 : "r"(static_cast<intptr_t>(src_stride)) // %4 : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" #endif @@ -5009,6 +5662,7 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr, } #endif // HAS_INTERPOLATEROW_SSE2 +#ifdef HAS_INTERPOLATEROW_SSSE3 // Bilinear filter 16x2 -> 16x1 void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr, ptrdiff_t src_stride, int dst_width, @@ -5034,10 +5688,11 @@ void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr, "pshufd $0x0,%%xmm5,%%xmm5 \n" // General purpose row blend. - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movdqu (%1),%%xmm0 \n" - "movdqu (%1,%4,1),%%xmm2 \n" + "movdqu " MEMACCESS(1) ",%%xmm0 \n" + MEMOPREG(movdqu,0x00,1,4,1,xmm2) "movdqu %%xmm0,%%xmm1 \n" "punpcklbw %%xmm2,%%xmm0 \n" "punpckhbw %%xmm2,%%xmm1 \n" @@ -5047,56 +5702,64 @@ void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr, "psrlw $0x7,%%xmm1 \n" "packuswb %%xmm1,%%xmm0 \n" "sub $0x10,%2 \n" - "movdqu %%xmm0,(%1,%0,1) \n" - "lea 0x10(%1),%1 \n" + BUNDLEALIGN + MEMOPMEM(movdqu,xmm0,0x00,1,0,1) + "lea " MEMLEA(0x10,1) ",%1 \n" "jg 1b \n" "jmp 99f \n" // Blend 25 / 75. 
- ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "25: \n" - "movdqu (%1),%%xmm0 \n" - "movdqu (%1,%4,1),%%xmm1 \n" + "movdqu " MEMACCESS(1) ",%%xmm0 \n" + MEMOPREG(movdqu,0x00,1,4,1,xmm1) "pavgb %%xmm1,%%xmm0 \n" "pavgb %%xmm1,%%xmm0 \n" "sub $0x10,%2 \n" - "movdqu %%xmm0,(%1,%0,1) \n" - "lea 0x10(%1),%1 \n" + BUNDLEALIGN + MEMOPMEM(movdqu,xmm0,0x00,1,0,1) + "lea " MEMLEA(0x10,1) ",%1 \n" "jg 25b \n" "jmp 99f \n" // Blend 50 / 50. - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "50: \n" - "movdqu (%1),%%xmm0 \n" - "movdqu (%1,%4,1),%%xmm1 \n" + "movdqu " MEMACCESS(1) ",%%xmm0 \n" + MEMOPREG(movdqu,0x00,1,4,1,xmm1) "pavgb %%xmm1,%%xmm0 \n" "sub $0x10,%2 \n" - "movdqu %%xmm0,(%1,%0,1) \n" - "lea 0x10(%1),%1 \n" + BUNDLEALIGN + MEMOPMEM(movdqu,xmm0,0x00,1,0,1) + "lea " MEMLEA(0x10,1) ",%1 \n" "jg 50b \n" "jmp 99f \n" // Blend 75 / 25. - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "75: \n" - "movdqu (%1),%%xmm1 \n" - "movdqu (%1,%4,1),%%xmm0 \n" + "movdqu " MEMACCESS(1) ",%%xmm1 \n" + MEMOPREG(movdqu,0x00,1,4,1,xmm0) "pavgb %%xmm1,%%xmm0 \n" "pavgb %%xmm1,%%xmm0 \n" "sub $0x10,%2 \n" - "movdqu %%xmm0,(%1,%0,1) \n" - "lea 0x10(%1),%1 \n" + BUNDLEALIGN + MEMOPMEM(movdqu,xmm0,0x00,1,0,1) + "lea " MEMLEA(0x10,1) ",%1 \n" "jg 75b \n" "jmp 99f \n" // Blend 100 / 0 - Copy row unchanged. - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "100: \n" - "movdqu (%1),%%xmm0 \n" + "movdqu " MEMACCESS(1) ",%%xmm0 \n" "sub $0x10,%2 \n" - "movdqu %%xmm0,(%1,%0,1) \n" - "lea 0x10(%1),%1 \n" + MEMOPMEM(movdqu,xmm0,0x00,1,0,1) + "lea " MEMLEA(0x10,1) ",%1 \n" "jg 100b \n" "99: \n" @@ -5106,11 +5769,15 @@ void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr, "+r"(source_y_fraction) // %3 : "r"(static_cast<intptr_t>(src_stride)) // %4 : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm5" #endif ); } +#endif // HAS_INTERPOLATEROW_SSSE3 #ifdef HAS_INTERPOLATEROW_SSE2 // Bilinear filter 16x2 -> 16x1 @@ -5139,10 +5806,11 @@ void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr, "pxor %%xmm4,%%xmm4 \n" // General purpose row blend. - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movdqu (%1),%%xmm0 \n" - "movdqu (%1,%4,1),%%xmm2 \n" + "movdqu " MEMACCESS(1) ",%%xmm0 \n" + MEMOPREG(movdqu,0x00,1,4,1,xmm2) // movdqu (%1,%4,1),%%xmm2 "movdqu %%xmm0,%%xmm1 \n" "movdqu %%xmm2,%%xmm3 \n" "punpcklbw %%xmm4,%%xmm2 \n" @@ -5159,56 +5827,64 @@ void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr, "paddw %%xmm3,%%xmm1 \n" "packuswb %%xmm1,%%xmm0 \n" "sub $0x10,%2 \n" - "movdqu %%xmm0,(%1,%0,1) \n" - "lea 0x10(%1),%1 \n" + BUNDLEALIGN + MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1) + "lea " MEMLEA(0x10,1) ",%1 \n" "jg 1b \n" "jmp 99f \n" // Blend 25 / 75. - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "25: \n" - "movdqu (%1),%%xmm0 \n" - "movdqu (%1,%4,1),%%xmm1 \n" + "movdqu " MEMACCESS(1) ",%%xmm0 \n" + MEMOPREG(movdqu,0x00,1,4,1,xmm1) // movdqu (%1,%4,1),%%xmm1 "pavgb %%xmm1,%%xmm0 \n" "pavgb %%xmm1,%%xmm0 \n" "sub $0x10,%2 \n" - "movdqu %%xmm0,(%1,%0,1) \n" - "lea 0x10(%1),%1 \n" + BUNDLEALIGN + MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1) + "lea " MEMLEA(0x10,1) ",%1 \n" "jg 25b \n" "jmp 99f \n" // Blend 50 / 50. 
- ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "50: \n" - "movdqu (%1),%%xmm0 \n" - "movdqu (%1,%4,1),%%xmm1 \n" + "movdqu " MEMACCESS(1) ",%%xmm0 \n" + MEMOPREG(movdqu,0x00,1,4,1,xmm1) // movdqu (%1,%4,1),%%xmm1 "pavgb %%xmm1,%%xmm0 \n" "sub $0x10,%2 \n" - "movdqu %%xmm0,(%1,%0,1) \n" - "lea 0x10(%1),%1 \n" + BUNDLEALIGN + MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1) + "lea " MEMLEA(0x10,1) ",%1 \n" "jg 50b \n" "jmp 99f \n" // Blend 75 / 25. - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "75: \n" - "movdqu (%1),%%xmm1 \n" - "movdqu (%1,%4,1),%%xmm0 \n" + "movdqu " MEMACCESS(1) ",%%xmm1 \n" + MEMOPREG(movdqu,0x00,1,4,1,xmm0) // movdqu (%1,%4,1),%%xmm0 "pavgb %%xmm1,%%xmm0 \n" "pavgb %%xmm1,%%xmm0 \n" "sub $0x10,%2 \n" - "movdqu %%xmm0,(%1,%0,1) \n" - "lea 0x10(%1),%1 \n" + BUNDLEALIGN + MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1) + "lea " MEMLEA(0x10,1) ",%1 \n" "jg 75b \n" "jmp 99f \n" // Blend 100 / 0 - Copy row unchanged. - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "100: \n" - "movdqu (%1),%%xmm0 \n" + "movdqu " MEMACCESS(1) ",%%xmm0 \n" "sub $0x10,%2 \n" - "movdqu %%xmm0,(%1,%0,1) \n" - "lea 0x10(%1),%1 \n" + MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1) + "lea " MEMLEA(0x10,1) ",%1 \n" "jg 100b \n" "99: \n" @@ -5218,6 +5894,9 @@ void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr, "+r"(source_y_fraction) // %3 : "r"(static_cast<intptr_t>(src_stride)) // %4 : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" #endif @@ -5225,17 +5904,19 @@ void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr, } #endif // HAS_INTERPOLATEROW_SSE2 +#ifdef HAS_HALFROW_SSE2 void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride, uint8* dst_uv, int pix) { asm volatile ( "sub %0,%1 \n" - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movdqa (%0),%%xmm0 \n" - "pavgb (%0,%3),%%xmm0 \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + MEMOPREG(pavgb,0x00,0,3,1,xmm0) // pavgb (%0,%3),%%xmm0 "sub $0x10,%2 \n" - "movdqa %%xmm0,(%0,%1) \n" - "lea 0x10(%0),%0 \n" + MEMOPMEM(movdqa,xmm0,0x00,0,1,1) // movdqa %%xmm0,(%0,%1) + "lea " MEMLEA(0x10,0) ",%0 \n" "jg 1b \n" : "+r"(src_uv), // %0 "+r"(dst_uv), // %1 @@ -5247,23 +5928,26 @@ void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride, #endif ); } +#endif // HAS_HALFROW_SSE2 +#ifdef HAS_ARGBTOBAYERROW_SSSE3 void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer, uint32 selector, int pix) { asm volatile ( + // NaCL caveat - assumes movd is from GPR "movd %3,%%xmm5 \n" "pshufd $0x0,%%xmm5,%%xmm5 \n" - ".p2align 4 \n" + ".p2align 2 \n" "1: \n" - "movdqa (%0),%%xmm0 \n" - "movdqa 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" "pshufb %%xmm5,%%xmm0 \n" "pshufb %%xmm5,%%xmm1 \n" "punpckldq %%xmm1,%%xmm0 \n" "sub $0x8,%2 \n" - "movq %%xmm0,(%1) \n" - "lea 0x8(%1),%1 \n" + "movq %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x8,1) ",%1 \n" "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_bayer), // %1 @@ -5275,23 +5959,58 @@ void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer, #endif ); } +#endif // HAS_ARGBTOBAYERROW_SSSE3 + +#ifdef HAS_ARGBTOBAYERGGROW_SSE2 +void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer, + uint32 selector, int pix) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrld $0x18,%%xmm5 \n" + ".p2align 2 
\n" + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "psrld $0x8,%%xmm0 \n" + "psrld $0x8,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packssdw %%xmm1,%%xmm0 \n" + "packuswb %%xmm1,%%xmm0 \n" + "sub $0x8,%2 \n" + "movq %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x8,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_bayer), // %1 + "+r"(pix) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm5" +#endif + ); +} +#endif // HAS_ARGBTOBAYERGGROW_SSE2 +#ifdef HAS_ARGBSHUFFLEROW_SSSE3 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb, const uint8* shuffler, int pix) { asm volatile ( - "movdqa (%3),%%xmm5 \n" - ".p2align 4 \n" + "movdqa " MEMACCESS(3) ",%%xmm5 \n" + ".p2align 2 \n" "1: \n" - "movdqa (%0),%%xmm0 \n" - "movdqa 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" "pshufb %%xmm5,%%xmm0 \n" "pshufb %%xmm5,%%xmm1 \n" "sub $0x8,%2 \n" - "movdqa %%xmm0,(%1) \n" - "movdqa %%xmm1,0x10(%1) \n" - "lea 0x20(%1),%1 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n" + "lea " MEMLEA(0x20,1) ",%1 \n" "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 @@ -5307,18 +6026,48 @@ void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb, void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb, const uint8* shuffler, int pix) { asm volatile ( - "movdqa (%3),%%xmm5 \n" - ".p2align 4 \n" + "movdqa " MEMACCESS(3) ",%%xmm5 \n" + ".p2align 2 \n" "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" "pshufb %%xmm5,%%xmm0 \n" "pshufb %%xmm5,%%xmm1 \n" "sub $0x8,%2 \n" - "movdqu %%xmm0,(%1) \n" - "movdqu %%xmm1,0x10(%1) \n" - "lea 0x20(%1),%1 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" + "lea " MEMLEA(0x20,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : "r"(shuffler) // %3 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm5" +#endif + ); +} +#endif // HAS_ARGBSHUFFLEROW_SSSE3 + +#ifdef HAS_ARGBSHUFFLEROW_AVX2 +// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. +void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb, + const uint8* shuffler, int pix) { + asm volatile ( + "vbroadcastf128 " MEMACCESS(3) ",%%ymm5 \n" + ".p2align 2 \n" + "1: \n" + "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" + "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" + "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" + "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" + "sub $0x10,%2 \n" + "vmovdqu %%ymm0," MEMACCESS(1) " \n" + "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n" + "lea " MEMLEA(0x40,1) ",%1 \n" "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 @@ -5330,27 +6079,156 @@ void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb, #endif ); } +#endif // HAS_ARGBSHUFFLEROW_AVX2 +#ifdef HAS_ARGBSHUFFLEROW_SSE2 +// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. 
+void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb, + const uint8* shuffler, int pix) { + uintptr_t pixel_temp = 0u; + asm volatile ( + "pxor %%xmm5,%%xmm5 \n" + "mov " MEMACCESS(4) ",%k2 \n" + "cmp $0x3000102,%k2 \n" + "je 3012f \n" + "cmp $0x10203,%k2 \n" + "je 123f \n" + "cmp $0x30201,%k2 \n" + "je 321f \n" + "cmp $0x2010003,%k2 \n" + "je 2103f \n" + + BUNDLEALIGN + "1: \n" + "movzb " MEMACCESS(4) ",%2 \n" + MEMOP(movzb,0x00,0,2,1) ",%2 \n" // movzb (%0,%2,1),%2 + "mov %b2," MEMACCESS(1) " \n" + "movzb " MEMACCESS2(0x1,4) ",%2 \n" + MEMOP(movzb,0x00,0,2,1) ",%2 \n" // movzb (%0,%2,1),%2 + "mov %b2," MEMACCESS2(0x1,1) " \n" + BUNDLEALIGN + "movzb " MEMACCESS2(0x2,4) ",%2 \n" + MEMOP(movzb,0x00,0,2,1) ",%2 \n" // movzb (%0,%2,1),%2 + "mov %b2," MEMACCESS2(0x2,1) " \n" + "movzb " MEMACCESS2(0x3,4) ",%2 \n" + MEMOP(movzb,0x00,0,2,1) ",%2 \n" // movzb (%0,%2,1),%2 + "mov %b2," MEMACCESS2(0x3,1) " \n" + "lea " MEMLEA(0x4,0) ",%0 \n" + "lea " MEMLEA(0x4,1) ",%1 \n" + "sub $0x1,%3 \n" + "jg 1b \n" + "jmp 99f \n" + + ".p2align 2 \n" + "123: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "punpckhbw %%xmm5,%%xmm1 \n" + "pshufhw $0x1b,%%xmm0,%%xmm0 \n" + "pshuflw $0x1b,%%xmm0,%%xmm0 \n" + "pshufhw $0x1b,%%xmm1,%%xmm1 \n" + "pshuflw $0x1b,%%xmm1,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "sub $0x4,%3 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 123b \n" + "jmp 99f \n" + + ".p2align 2 \n" + "321: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "punpckhbw %%xmm5,%%xmm1 \n" + "pshufhw $0x39,%%xmm0,%%xmm0 \n" + "pshuflw $0x39,%%xmm0,%%xmm0 \n" + "pshufhw $0x39,%%xmm1,%%xmm1 \n" + "pshuflw $0x39,%%xmm1,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "sub $0x4,%3 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 321b \n" + "jmp 99f \n" + + ".p2align 2 \n" + "2103: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "punpckhbw %%xmm5,%%xmm1 \n" + "pshufhw $0x93,%%xmm0,%%xmm0 \n" + "pshuflw $0x93,%%xmm0,%%xmm0 \n" + "pshufhw $0x93,%%xmm1,%%xmm1 \n" + "pshuflw $0x93,%%xmm1,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "sub $0x4,%3 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 2103b \n" + "jmp 99f \n" + + ".p2align 2 \n" + "3012: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "punpckhbw %%xmm5,%%xmm1 \n" + "pshufhw $0xc6,%%xmm0,%%xmm0 \n" + "pshuflw $0xc6,%%xmm0,%%xmm0 \n" + "pshufhw $0xc6,%%xmm1,%%xmm1 \n" + "pshuflw $0xc6,%%xmm1,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "sub $0x4,%3 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 3012b \n" + + "99: \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+d"(pixel_temp), // %2 + "+r"(pix) // %3 + : "r"(shuffler) // %4 + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm5" +#endif + ); +} +#endif // HAS_ARGBSHUFFLEROW_SSE2 + +#ifdef HAS_I422TOYUY2ROW_SSE2 void I422ToYUY2Row_SSE2(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_frame, int width) { asm volatile ( "sub %1,%2 \n" - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movq (%1),%%xmm2 \n" - "movq 
(%1,%2,1),%%xmm3 \n" - "lea 0x8(%1),%1 \n" + "movq " MEMACCESS(1) ",%%xmm2 \n" + MEMOPREG(movq,0x00,1,2,1,xmm3) // movq (%1,%2,1),%%xmm3 + "lea " MEMLEA(0x8,1) ",%1 \n" "punpcklbw %%xmm3,%%xmm2 \n" - "movdqu (%0),%%xmm0 \n" - "lea 0x10(%0),%0 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" "movdqa %%xmm0,%%xmm1 \n" "punpcklbw %%xmm2,%%xmm0 \n" "punpckhbw %%xmm2,%%xmm1 \n" - "movdqu %%xmm0,(%3) \n" - "movdqu %%xmm1,0x10(%3) \n" - "lea 0x20(%3),%3 \n" + "movdqu %%xmm0," MEMACCESS(3) " \n" + "movdqu %%xmm1," MEMACCESS2(0x10,3) " \n" + "lea " MEMLEA(0x20,3) ",%3 \n" "sub $0x10,%4 \n" "jg 1b \n" : "+r"(src_y), // %0 @@ -5360,32 +6238,38 @@ void I422ToYUY2Row_SSE2(const uint8* src_y, "+rm"(width) // %4 : : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm3" #endif ); } +#endif // HAS_I422TOYUY2ROW_SSE2 +#ifdef HAS_I422TOUYVYROW_SSE2 void I422ToUYVYRow_SSE2(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_frame, int width) { asm volatile ( "sub %1,%2 \n" - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movq (%1),%%xmm2 \n" - "movq (%1,%2,1),%%xmm3 \n" - "lea 0x8(%1),%1 \n" + "movq " MEMACCESS(1) ",%%xmm2 \n" + MEMOPREG(movq,0x00,1,2,1,xmm3) // movq (%1,%2,1),%%xmm3 + "lea " MEMLEA(0x8,1) ",%1 \n" "punpcklbw %%xmm3,%%xmm2 \n" - "movdqu (%0),%%xmm0 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" "movdqa %%xmm2,%%xmm1 \n" - "lea 0x10(%0),%0 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" "punpcklbw %%xmm0,%%xmm1 \n" "punpckhbw %%xmm0,%%xmm2 \n" - "movdqu %%xmm1,(%3) \n" - "movdqu %%xmm2,0x10(%3) \n" - "lea 0x20(%3),%3 \n" + "movdqu %%xmm1," MEMACCESS(3) " \n" + "movdqu %%xmm2," MEMACCESS2(0x10,3) " \n" + "lea " MEMLEA(0x20,3) ",%3 \n" "sub $0x10,%4 \n" "jg 1b \n" : "+r"(src_y), // %0 @@ -5395,11 +6279,300 @@ void I422ToUYVYRow_SSE2(const uint8* src_y, "+rm"(width) // %4 : : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm3" #endif ); } +#endif // HAS_I422TOUYVYROW_SSE2 + +#ifdef HAS_FIXEDDIV_X86 +// Divide num by div and return as 16.16 fixed point result. +int FixedDiv_X86(int num, int div) { + asm volatile ( + "cdq \n" + "shld $0x10,%%eax,%%edx \n" + "shl $0x10,%%eax \n" + "idiv %1 \n" + "mov %0, %%eax \n" + : "+a"(num) // %0 + : "c"(div) // %1 + : "memory", "cc", "edx" + ); + return num; +} +#endif // HAS_FIXEDDIV_X86 + +#ifdef HAS_ARGBPOLYNOMIALROW_SSE2 +void ARGBPolynomialRow_SSE2(const uint8* src_argb, + uint8* dst_argb, const float* poly, + int width) { + asm volatile ( + "pxor %%xmm3,%%xmm3 \n" + + // 2 pixel loop. 
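The FixedDiv_X86 helper above is the scalar identity (num << 16) / div evaluated with a 64-bit dividend. The 2 pixel loop that follows evaluates a cubic polynomial per channel from four groups of four float coefficients; a scalar sketch of that mapping (assuming the C0..C3-per-channel layout implied by the 0x00/0x10/0x20/0x30 offsets; not part of the patch) is:

#include <stdint.h>

// result = C0 + C1*x + C2*x^2 + C3*x^3 per channel, then clamp to [0, 255].
// poly holds 16 floats: C0 for B,G,R,A, then C1, C2 and C3 in the same order.
static uint8_t PolyChannel(uint8_t v, const float* poly, int ch) {
  float x = (float)v;
  float r = poly[ch] + poly[ch + 4] * x + poly[ch + 8] * x * x +
            poly[ch + 12] * x * x * x;
  if (r < 0.0f) r = 0.0f;    // packuswb in the SIMD path clamps the same way
  if (r > 255.0f) r = 255.0f;
  return (uint8_t)r;         // cvttps2dq truncates; so does this cast
}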
+ ".p2align 2 \n" + "1: \n" + "movq " MEMACCESS(0) ",%%xmm0 \n" + "lea " MEMLEA(0x8,0) ",%0 \n" + "punpcklbw %%xmm3,%%xmm0 \n" + "movdqa %%xmm0,%%xmm4 \n" + "punpcklwd %%xmm3,%%xmm0 \n" + "punpckhwd %%xmm3,%%xmm4 \n" + "cvtdq2ps %%xmm0,%%xmm0 \n" + "cvtdq2ps %%xmm4,%%xmm4 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm4,%%xmm5 \n" + "mulps " MEMACCESS2(0x10,3) ",%%xmm0 \n" + "mulps " MEMACCESS2(0x10,3) ",%%xmm4 \n" + "addps " MEMACCESS(3) ",%%xmm0 \n" + "addps " MEMACCESS(3) ",%%xmm4 \n" + "movdqa %%xmm1,%%xmm2 \n" + "movdqa %%xmm5,%%xmm6 \n" + "mulps %%xmm1,%%xmm2 \n" + "mulps %%xmm5,%%xmm6 \n" + "mulps %%xmm2,%%xmm1 \n" + "mulps %%xmm6,%%xmm5 \n" + "mulps " MEMACCESS2(0x20,3) ",%%xmm2 \n" + "mulps " MEMACCESS2(0x20,3) ",%%xmm6 \n" + "mulps " MEMACCESS2(0x30,3) ",%%xmm1 \n" + "mulps " MEMACCESS2(0x30,3) ",%%xmm5 \n" + "addps %%xmm2,%%xmm0 \n" + "addps %%xmm6,%%xmm4 \n" + "addps %%xmm1,%%xmm0 \n" + "addps %%xmm5,%%xmm4 \n" + "cvttps2dq %%xmm0,%%xmm0 \n" + "cvttps2dq %%xmm4,%%xmm4 \n" + "packuswb %%xmm4,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "sub $0x2,%2 \n" + "movq %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x8,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(poly) // %3 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" +#endif + ); +} +#endif // HAS_ARGBPOLYNOMIALROW_SSE2 + +#ifdef HAS_ARGBPOLYNOMIALROW_AVX2 +void ARGBPolynomialRow_AVX2(const uint8* src_argb, + uint8* dst_argb, const float* poly, + int width) { + asm volatile ( + "vbroadcastf128 " MEMACCESS(3) ",%%ymm4 \n" + "vbroadcastf128 " MEMACCESS2(0x10,3) ",%%ymm5 \n" + "vbroadcastf128 " MEMACCESS2(0x20,3) ",%%ymm6 \n" + "vbroadcastf128 " MEMACCESS2(0x30,3) ",%%ymm7 \n" + + // 2 pixel loop. + ".p2align 2 \n" + "1: \n" + "vpmovzxbd " MEMACCESS(0) ",%%ymm0 \n" // 2 ARGB pixels + "lea " MEMLEA(0x8,0) ",%0 \n" + "vcvtdq2ps %%ymm0,%%ymm0 \n" // X 8 floats + "vmulps %%ymm0,%%ymm0,%%ymm2 \n" // X * X + "vmulps %%ymm7,%%ymm0,%%ymm3 \n" // C3 * X + "vfmadd132ps %%ymm5,%%ymm4,%%ymm0 \n" // result = C0 + C1 * X + "vfmadd231ps %%ymm6,%%ymm2,%%ymm0 \n" // result += C2 * X * X + "vfmadd231ps %%ymm3,%%ymm2,%%ymm0 \n" // result += C3 * X * X * X + "vcvttps2dq %%ymm0,%%ymm0 \n" + "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpackuswb %%xmm0,%%xmm0,%%xmm0 \n" + "sub $0x2,%2 \n" + "vmovq %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x8,1) ",%1 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(poly) // %3 + : "memory", "cc" +#if defined(__SSE2__) +// TODO(fbarchard): declare ymm usage when applicable. + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" +#endif + ); +} +#endif // HAS_ARGBPOLYNOMIALROW_AVX2 + +#ifdef HAS_ARGBCOLORTABLEROW_X86 +// Tranform ARGB pixels with color table. +void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, + int width) { + uintptr_t pixel_temp = 0u; + asm volatile ( + // 1 pixel loop. 
+ ".p2align 2 \n" + BUNDLEALIGN + "1: \n" + "movzb " MEMACCESS(0) ",%1 \n" + "lea " MEMLEA(0x4,0) ",%0 \n" + MEMOP(movzb,0x00,3,1,4) ",%1 \n" // movzb (%3,%1,4),%1 + "mov %b1," MEMACCESS2(-0x4,0) " \n" + "movzb " MEMACCESS2(-0x3,0) ",%1 \n" + MEMOP(movzb,0x01,3,1,4) ",%1 \n" // movzb 0x1(%3,%1,4),%1 + "mov %b1," MEMACCESS2(-0x3,0) " \n" + "movzb " MEMACCESS2(-0x2,0) ",%1 \n" + MEMOP(movzb,0x02,3,1,4) ",%1 \n" // movzb 0x2(%3,%1,4),%1 + "mov %b1," MEMACCESS2(-0x2,0) " \n" + "movzb " MEMACCESS2(-0x1,0) ",%1 \n" + MEMOP(movzb,0x03,3,1,4) ",%1 \n" // movzb 0x3(%3,%1,4),%1 + "mov %b1," MEMACCESS2(-0x1,0) " \n" + "dec %2 \n" + "jg 1b \n" + : "+r"(dst_argb), // %0 + "+d"(pixel_temp), // %1 + "+r"(width) // %2 + : "r"(table_argb) // %3 + : "memory", "cc"); +} +#endif // HAS_ARGBCOLORTABLEROW_X86 + +#ifdef HAS_RGBCOLORTABLEROW_X86 +// Tranform RGB pixels with color table. +void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) { + uintptr_t pixel_temp = 0u; + asm volatile ( + // 1 pixel loop. + ".p2align 2 \n" + BUNDLEALIGN + "1: \n" + "movzb " MEMACCESS(0) ",%1 \n" + "lea " MEMLEA(0x4,0) ",%0 \n" + MEMOP(movzb,0x00,3,1,4) ",%1 \n" // movzb (%3,%1,4),%1 + "mov %b1," MEMACCESS2(-0x4,0) " \n" + "movzb " MEMACCESS2(-0x3,0) ",%1 \n" + MEMOP(movzb,0x01,3,1,4) ",%1 \n" // movzb 0x1(%3,%1,4),%1 + "mov %b1," MEMACCESS2(-0x3,0) " \n" + "movzb " MEMACCESS2(-0x2,0) ",%1 \n" + MEMOP(movzb,0x02,3,1,4) ",%1 \n" // movzb 0x2(%3,%1,4),%1 + "mov %b1," MEMACCESS2(-0x2,0) " \n" + "dec %2 \n" + "jg 1b \n" + : "+r"(dst_argb), // %0 + "+d"(pixel_temp), // %1 + "+r"(width) // %2 + : "r"(table_argb) // %3 + : "memory", "cc"); +} +#endif // HAS_RGBCOLORTABLEROW_X86 + +#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3 +// Tranform RGB pixels with luma table. +void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb, + int width, + const uint8* luma, uint32 lumacoeff) { + uintptr_t pixel_temp = 0u; + uintptr_t table_temp = 0u; + asm volatile ( + "movd %6,%%xmm3 \n" + "pshufd $0x0,%%xmm3,%%xmm3 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "psllw $0x8,%%xmm4 \n" + "pxor %%xmm5,%%xmm5 \n" + + // 4 pixel loop. 
+ ".p2align 2 \n" + BUNDLEALIGN + "1: \n" + "movdqu " MEMACCESS(2) ",%%xmm0 \n" + "pmaddubsw %%xmm3,%%xmm0 \n" + "phaddw %%xmm0,%%xmm0 \n" + "pand %%xmm4,%%xmm0 \n" + "punpcklwd %%xmm5,%%xmm0 \n" + "movd %%xmm0,%k1 \n" // 32 bit offset + "add %5,%1 \n" + "pshufd $0x39,%%xmm0,%%xmm0 \n" + + "movzb " MEMACCESS(2) ",%0 \n" + MEMOP(movzb,0x00,1,0,1) ",%0 \n" // movzb (%1,%0,1),%0 + "mov %b0," MEMACCESS(3) " \n" + "movzb " MEMACCESS2(0x1,2) ",%0 \n" + MEMOP(movzb,0x00,1,0,1) ",%0 \n" // movzb (%1,%0,1),%0 + "mov %b0," MEMACCESS2(0x1,3) " \n" + "movzb " MEMACCESS2(0x2,2) ",%0 \n" + MEMOP(movzb,0x00,1,0,1) ",%0 \n" // movzb (%1,%0,1),%0 + "mov %b0," MEMACCESS2(0x2,3) " \n" + "movzb " MEMACCESS2(0x3,2) ",%0 \n" + "mov %b0," MEMACCESS2(0x3,3) " \n" + + "movd %%xmm0,%k1 \n" // 32 bit offset + "add %5,%1 \n" + "pshufd $0x39,%%xmm0,%%xmm0 \n" + + "movzb " MEMACCESS2(0x4,2) ",%0 \n" + MEMOP(movzb,0x00,1,0,1) ",%0 \n" // movzb (%1,%0,1),%0 + "mov %b0," MEMACCESS2(0x4,3) " \n" + BUNDLEALIGN + "movzb " MEMACCESS2(0x5,2) ",%0 \n" + MEMOP(movzb,0x00,1,0,1) ",%0 \n" // movzb (%1,%0,1),%0 + "mov %b0," MEMACCESS2(0x5,3) " \n" + "movzb " MEMACCESS2(0x6,2) ",%0 \n" + MEMOP(movzb,0x00,1,0,1) ",%0 \n" // movzb (%1,%0,1),%0 + "mov %b0," MEMACCESS2(0x6,3) " \n" + "movzb " MEMACCESS2(0x7,2) ",%0 \n" + "mov %b0," MEMACCESS2(0x7,3) " \n" + + "movd %%xmm0,%k1 \n" // 32 bit offset + "add %5,%1 \n" + "pshufd $0x39,%%xmm0,%%xmm0 \n" + + "movzb " MEMACCESS2(0x8,2) ",%0 \n" + MEMOP(movzb,0x00,1,0,1) ",%0 \n" // movzb (%1,%0,1),%0 + "mov %b0," MEMACCESS2(0x8,3) " \n" + "movzb " MEMACCESS2(0x9,2) ",%0 \n" + MEMOP(movzb,0x00,1,0,1) ",%0 \n" // movzb (%1,%0,1),%0 + "mov %b0," MEMACCESS2(0x9,3) " \n" + "movzb " MEMACCESS2(0xa,2) ",%0 \n" + MEMOP(movzb,0x00,1,0,1) ",%0 \n" // movzb (%1,%0,1),%0 + "mov %b0," MEMACCESS2(0xa,3) " \n" + "movzb " MEMACCESS2(0xb,2) ",%0 \n" + "mov %b0," MEMACCESS2(0xb,3) " \n" + + "movd %%xmm0,%k1 \n" // 32 bit offset + "add %5,%1 \n" + + "movzb " MEMACCESS2(0xc,2) ",%0 \n" + MEMOP(movzb,0x00,1,0,1) ",%0 \n" // movzb (%1,%0,1),%0 + "mov %b0," MEMACCESS2(0xc,3) " \n" + "movzb " MEMACCESS2(0xd,2) ",%0 \n" + MEMOP(movzb,0x00,1,0,1) ",%0 \n" // movzb (%1,%0,1),%0 + "mov %b0," MEMACCESS2(0xd,3) " \n" + "movzb " MEMACCESS2(0xe,2) ",%0 \n" + MEMOP(movzb,0x00,1,0,1) ",%0 \n" // movzb (%1,%0,1),%0 + "mov %b0," MEMACCESS2(0xe,3) " \n" + "movzb " MEMACCESS2(0xf,2) ",%0 \n" + "mov %b0," MEMACCESS2(0xf,3) " \n" + "sub $0x4,%4 \n" + "lea " MEMLEA(0x10,2) ",%2 \n" + "lea " MEMLEA(0x10,3) ",%3 \n" + "jg 1b \n" + : "+d"(pixel_temp), // %0 + "+a"(table_temp), // %1 + "+r"(src_argb), // %2 + "+r"(dst_argb), // %3 + "+rm"(width) // %4 + : "r"(luma), // %5 + "rm"(lumacoeff) // %6 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm3", "xmm4", "xmm5" +#endif + ); +} +#endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 #endif // defined(__x86_64__) || defined(__i386__) diff --git a/chromium/third_party/libyuv/source/row_win.cc b/chromium/third_party/libyuv/source/row_win.cc index 4ea06923def..502d25cea4f 100644 --- a/chromium/third_party/libyuv/source/row_win.cc +++ b/chromium/third_party/libyuv/source/row_win.cc @@ -30,16 +30,6 @@ static const vec8 kARGBToYJ = { 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0 }; -static const lvec8 kARGBToY_AVX = { - 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, - 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0 -}; - -static const lvec8 kARGBToYJ_AVX = { - 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, - 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 
0, 15, 75, 38, 0 -}; - static const vec8 kARGBToU = { 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0 }; @@ -48,12 +38,6 @@ static const vec8 kARGBToUJ = { 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0 }; -// TODO(fbarchard): Rename kARGBToU_AVX to kARGBToU and use for SSSE3 version. -static const lvec8 kARGBToU_AVX = { - 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, - 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0 -}; - static const vec8 kARGBToV = { -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, }; @@ -62,13 +46,8 @@ static const vec8 kARGBToVJ = { -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0 }; -static const lvec8 kARGBToV_AVX = { - -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, - -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0 -}; - // vpermd for vphaddw + vpackuswb vpermd. -static const lvec32 kShufARGBToY_AVX = { +static const lvec32 kPermdARGBToY_AVX = { 0, 4, 1, 5, 2, 6, 3, 7 }; @@ -124,16 +103,6 @@ static const uvec8 kAddY16 = { static const vec16 kAddYJ64 = { 64, 64, 64, 64, 64, 64, 64, 64 }; -static const lvec16 kAddYJ64_AVX = { - 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 -}; - -static const ulvec8 kAddY16_AVX = { - 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, - 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, - 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, - 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u -}; static const uvec8 kAddUV128 = { 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, @@ -144,13 +113,6 @@ static const uvec16 kAddUVJ128 = { 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u }; -static const ulvec8 kAddUV128_AVX = { - 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, - 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, - 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, - 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u -}; - // Shuffle table for converting RGB24 to ARGB. 
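For orientation in the Windows file, the kShuffleMaskRGB24ToARGB table defined just below drives the pshufb-based RGB24-to-ARGB expansion; in scalar terms the conversion is simply (sketch, not part of the patch):

#include <stdint.h>

// Expand packed 3-byte B,G,R into 4-byte B,G,R,A with opaque alpha.
static void RGB24ToARGBRow_Sketch(const uint8_t* src_rgb24, uint8_t* dst_argb,
                                  int pix) {
  for (int i = 0; i < pix; ++i) {
    dst_argb[0] = src_rgb24[0];  // B
    dst_argb[1] = src_rgb24[1];  // G
    dst_argb[2] = src_rgb24[2];  // R
    dst_argb[3] = 255u;          // A
    src_rgb24 += 3;
    dst_argb += 4;
  }
}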
static const uvec8 kShuffleMaskRGB24ToARGB = { 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u @@ -191,7 +153,7 @@ void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { pcmpeqb xmm5, xmm5 // generate mask 0xff000000 pslld xmm5, 24 - align 16 + align 4 convertloop: movq xmm0, qword ptr [eax] lea eax, [eax + 8] @@ -220,7 +182,7 @@ void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb, pcmpeqb xmm5, xmm5 // generate mask 0xff000000 pslld xmm5, 24 - align 16 + align 4 convertloop: movq xmm0, qword ptr [eax] lea eax, [eax + 8] @@ -249,7 +211,7 @@ void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) { pslld xmm5, 24 movdqa xmm4, kShuffleMaskRGB24ToARGB - align 16 + align 4 convertloop: movdqu xmm0, [eax] movdqu xmm1, [eax + 16] @@ -289,7 +251,7 @@ void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, pslld xmm5, 24 movdqa xmm4, kShuffleMaskRAWToARGB - align 16 + align 4 convertloop: movdqu xmm0, [eax] movdqu xmm1, [eax + 16] @@ -349,7 +311,7 @@ void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb, sub edx, eax sub edx, eax - align 16 + align 4 convertloop: movdqu xmm0, [eax] // fetch 8 pixels of bgr565 movdqa xmm1, xmm0 @@ -399,7 +361,7 @@ void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb, sub edx, eax sub edx, eax - align 16 + align 4 convertloop: movdqu xmm0, [eax] // fetch 8 pixels of 1555 movdqa xmm1, xmm0 @@ -445,7 +407,7 @@ void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb, sub edx, eax sub edx, eax - align 16 + align 4 convertloop: movdqu xmm0, [eax] // fetch 8 pixels of bgra4444 movdqa xmm2, xmm0 @@ -477,12 +439,12 @@ void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) { mov ecx, [esp + 12] // pix movdqa xmm6, kShuffleMaskARGBToRGB24 - align 16 + align 4 convertloop: - movdqa xmm0, [eax] // fetch 16 pixels of argb - movdqa xmm1, [eax + 16] - movdqa xmm2, [eax + 32] - movdqa xmm3, [eax + 48] + movdqu xmm0, [eax] // fetch 16 pixels of argb + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] lea eax, [eax + 64] pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB pshufb xmm1, xmm6 @@ -494,13 +456,13 @@ void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) { movdqa xmm5, xmm2 // 8 bytes from 2 for 1 por xmm0, xmm4 // 4 bytes from 1 for 0 pslldq xmm5, 8 // 8 bytes from 2 for 1 - movdqa [edx], xmm0 // store 0 + movdqu [edx], xmm0 // store 0 por xmm1, xmm5 // 8 bytes from 2 for 1 psrldq xmm2, 8 // 4 bytes from 2 pslldq xmm3, 4 // 12 bytes from 3 for 2 por xmm2, xmm3 // 12 bytes from 3 for 2 - movdqa [edx + 16], xmm1 // store 1 - movdqa [edx + 32], xmm2 // store 2 + movdqu [edx + 16], xmm1 // store 1 + movdqu [edx + 32], xmm2 // store 2 lea edx, [edx + 48] sub ecx, 16 jg convertloop @@ -516,12 +478,12 @@ void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) { mov ecx, [esp + 12] // pix movdqa xmm6, kShuffleMaskARGBToRAW - align 16 + align 4 convertloop: - movdqa xmm0, [eax] // fetch 16 pixels of argb - movdqa xmm1, [eax + 16] - movdqa xmm2, [eax + 32] - movdqa xmm3, [eax + 48] + movdqu xmm0, [eax] // fetch 16 pixels of argb + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] lea eax, [eax + 64] pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB pshufb xmm1, xmm6 @@ -533,13 +495,13 @@ void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) { movdqa xmm5, xmm2 // 8 bytes from 2 for 1 por xmm0, xmm4 // 4 bytes 
from 1 for 0 pslldq xmm5, 8 // 8 bytes from 2 for 1 - movdqa [edx], xmm0 // store 0 + movdqu [edx], xmm0 // store 0 por xmm1, xmm5 // 8 bytes from 2 for 1 psrldq xmm2, 8 // 4 bytes from 2 pslldq xmm3, 4 // 12 bytes from 3 for 2 por xmm2, xmm3 // 12 bytes from 3 for 2 - movdqa [edx + 16], xmm1 // store 1 - movdqa [edx + 32], xmm2 // store 2 + movdqu [edx + 16], xmm1 // store 1 + movdqu [edx + 32], xmm2 // store 2 lea edx, [edx + 48] sub ecx, 16 jg convertloop @@ -561,7 +523,7 @@ void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { pcmpeqb xmm5, xmm5 // generate mask 0xfffff800 pslld xmm5, 11 - align 16 + align 4 convertloop: movdqa xmm0, [eax] // fetch 4 pixels of argb movdqa xmm1, xmm0 // B @@ -601,7 +563,7 @@ void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { pcmpeqb xmm7, xmm7 // generate mask 0xffff8000 pslld xmm7, 15 - align 16 + align 4 convertloop: movdqa xmm0, [eax] // fetch 4 pixels of argb movdqa xmm1, xmm0 // B @@ -639,7 +601,7 @@ void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { movdqa xmm3, xmm4 // generate mask 0x00f000f0 psrlw xmm3, 8 - align 16 + align 4 convertloop: movdqa xmm0, [eax] // fetch 4 pixels of argb movdqa xmm1, xmm0 @@ -668,7 +630,7 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { movdqa xmm5, kAddY16 movdqa xmm4, kARGBToY - align 16 + align 4 convertloop: movdqa xmm0, [eax] movdqa xmm1, [eax + 16] @@ -703,7 +665,7 @@ void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { movdqa xmm4, kARGBToYJ movdqa xmm5, kAddYJ64 - align 16 + align 4 convertloop: movdqa xmm0, [eax] movdqa xmm1, [eax + 16] @@ -737,11 +699,11 @@ void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) { mov eax, [esp + 4] /* src_argb */ mov edx, [esp + 8] /* dst_y */ mov ecx, [esp + 12] /* pix */ - vmovdqa ymm6, kShufARGBToY_AVX - vmovdqa ymm5, kAddY16_AVX - vmovdqa ymm4, kARGBToY_AVX + vbroadcastf128 ymm4, kARGBToY + vbroadcastf128 ymm5, kAddY16 + vmovdqa ymm6, kPermdARGBToY_AVX - align 16 + align 4 convertloop: vmovdqu ymm0, [eax] vmovdqu ymm1, [eax + 32] @@ -777,11 +739,11 @@ void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) { mov eax, [esp + 4] /* src_argb */ mov edx, [esp + 8] /* dst_y */ mov ecx, [esp + 12] /* pix */ - vmovdqa ymm4, kARGBToYJ_AVX - vmovdqa ymm5, kAddYJ64_AVX - vmovdqa ymm6, kShufARGBToY_AVX + vbroadcastf128 ymm4, kARGBToYJ + vbroadcastf128 ymm5, kAddYJ64 + vmovdqa ymm6, kPermdARGBToY_AVX - align 16 + align 4 convertloop: vmovdqu ymm0, [eax] vmovdqu ymm1, [eax + 32] @@ -820,7 +782,7 @@ void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { movdqa xmm5, kAddY16 movdqa xmm4, kARGBToY - align 16 + align 4 convertloop: movdqu xmm0, [eax] movdqu xmm1, [eax + 16] @@ -854,7 +816,7 @@ void ARGBToYJRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { movdqa xmm4, kARGBToYJ movdqa xmm5, kAddYJ64 - align 16 + align 4 convertloop: movdqu xmm0, [eax] movdqu xmm1, [eax + 16] @@ -889,7 +851,7 @@ void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { movdqa xmm5, kAddY16 movdqa xmm4, kBGRAToY - align 16 + align 4 convertloop: movdqa xmm0, [eax] movdqa xmm1, [eax + 16] @@ -923,7 +885,7 @@ void BGRAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { movdqa xmm5, kAddY16 movdqa xmm4, kBGRAToY - align 16 + align 4 convertloop: movdqu xmm0, [eax] movdqu xmm1, [eax + 16] @@ -957,7 +919,7 @@ void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { movdqa xmm5, 
kAddY16 movdqa xmm4, kABGRToY - align 16 + align 4 convertloop: movdqa xmm0, [eax] movdqa xmm1, [eax + 16] @@ -991,7 +953,7 @@ void ABGRToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { movdqa xmm5, kAddY16 movdqa xmm4, kABGRToY - align 16 + align 4 convertloop: movdqu xmm0, [eax] movdqu xmm1, [eax + 16] @@ -1025,7 +987,7 @@ void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { movdqa xmm5, kAddY16 movdqa xmm4, kRGBAToY - align 16 + align 4 convertloop: movdqa xmm0, [eax] movdqa xmm1, [eax + 16] @@ -1059,7 +1021,7 @@ void RGBAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { movdqa xmm5, kAddY16 movdqa xmm4, kRGBAToY - align 16 + align 4 convertloop: movdqu xmm0, [eax] movdqu xmm1, [eax + 16] @@ -1100,7 +1062,7 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, movdqa xmm5, kAddUV128 sub edi, edx // stride from u to v - align 16 + align 4 convertloop: /* step 1 - subsample 16x2 argb pixels to 8x1 */ movdqa xmm0, [eax] @@ -1166,7 +1128,7 @@ void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb, movdqa xmm5, kAddUVJ128 sub edi, edx // stride from u to v - align 16 + align 4 convertloop: /* step 1 - subsample 16x2 argb pixels to 8x1 */ movdqa xmm0, [eax] @@ -1229,12 +1191,12 @@ void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb, mov edx, [esp + 8 + 12] // dst_u mov edi, [esp + 8 + 16] // dst_v mov ecx, [esp + 8 + 20] // pix - vmovdqa ymm7, kARGBToU_AVX - vmovdqa ymm6, kARGBToV_AVX - vmovdqa ymm5, kAddUV128_AVX + vbroadcastf128 ymm5, kAddUV128 + vbroadcastf128 ymm6, kARGBToV + vbroadcastf128 ymm7, kARGBToU sub edi, edx // stride from u to v - align 16 + align 4 convertloop: /* step 1 - subsample 32x2 argb pixels to 16x1 */ vmovdqu ymm0, [eax] @@ -1300,7 +1262,7 @@ void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb, movdqa xmm5, kAddUV128 sub edi, edx // stride from u to v - align 16 + align 4 convertloop: /* step 1 - subsample 16x2 argb pixels to 8x1 */ movdqu xmm0, [eax] @@ -1370,7 +1332,7 @@ void ARGBToUVJRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb, movdqa xmm5, kAddUVJ128 sub edi, edx // stride from u to v - align 16 + align 4 convertloop: /* step 1 - subsample 16x2 argb pixels to 8x1 */ movdqu xmm0, [eax] @@ -1439,7 +1401,7 @@ void ARGBToUV444Row_SSSE3(const uint8* src_argb0, movdqa xmm5, kAddUV128 sub edi, edx // stride from u to v - align 16 + align 4 convertloop: /* convert to U and V */ movdqa xmm0, [eax] // U @@ -1497,7 +1459,7 @@ void ARGBToUV444Row_Unaligned_SSSE3(const uint8* src_argb0, movdqa xmm5, kAddUV128 sub edi, edx // stride from u to v - align 16 + align 4 convertloop: /* convert to U and V */ movdqu xmm0, [eax] // U @@ -1555,7 +1517,7 @@ void ARGBToUV422Row_SSSE3(const uint8* src_argb0, movdqa xmm5, kAddUV128 sub edi, edx // stride from u to v - align 16 + align 4 convertloop: /* step 1 - subsample 16x2 argb pixels to 8x1 */ movdqa xmm0, [eax] @@ -1614,7 +1576,7 @@ void ARGBToUV422Row_Unaligned_SSSE3(const uint8* src_argb0, movdqa xmm5, kAddUV128 sub edi, edx // stride from u to v - align 16 + align 4 convertloop: /* step 1 - subsample 16x2 argb pixels to 8x1 */ movdqu xmm0, [eax] @@ -1675,7 +1637,7 @@ void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, movdqa xmm5, kAddUV128 sub edi, edx // stride from u to v - align 16 + align 4 convertloop: /* step 1 - subsample 16x2 argb pixels to 8x1 */ movdqa xmm0, [eax] @@ -1741,7 +1703,7 @@ void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int 
src_stride_argb, movdqa xmm5, kAddUV128 sub edi, edx // stride from u to v - align 16 + align 4 convertloop: /* step 1 - subsample 16x2 argb pixels to 8x1 */ movdqu xmm0, [eax] @@ -1811,7 +1773,7 @@ void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, movdqa xmm5, kAddUV128 sub edi, edx // stride from u to v - align 16 + align 4 convertloop: /* step 1 - subsample 16x2 argb pixels to 8x1 */ movdqa xmm0, [eax] @@ -1877,7 +1839,7 @@ void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb, movdqa xmm5, kAddUV128 sub edi, edx // stride from u to v - align 16 + align 4 convertloop: /* step 1 - subsample 16x2 argb pixels to 8x1 */ movdqu xmm0, [eax] @@ -1947,7 +1909,7 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, movdqa xmm5, kAddUV128 sub edi, edx // stride from u to v - align 16 + align 4 convertloop: /* step 1 - subsample 16x2 argb pixels to 8x1 */ movdqa xmm0, [eax] @@ -2013,7 +1975,7 @@ void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb, movdqa xmm5, kAddUV128 sub edi, edx // stride from u to v - align 16 + align 4 convertloop: /* step 1 - subsample 16x2 argb pixels to 8x1 */ movdqu xmm0, [eax] @@ -2133,7 +2095,7 @@ void I422ToARGBRow_AVX2(const uint8* y_buf, vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha vpxor ymm4, ymm4, ymm4 - align 16 + align 4 convertloop: vmovq xmm0, qword ptr [esi] // U vmovq xmm1, qword ptr [esi + edi] // V @@ -2220,7 +2182,7 @@ static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR }; // TODO(fbarchard): Read that does half size on Y and treats 420 as 444. -// Read 8 UV from 411. +// Read 8 UV from 444. #define READYUV444 __asm { \ __asm movq xmm0, qword ptr [esi] /* U */ /* NOLINT */ \ __asm movq xmm1, qword ptr [esi + edi] /* V */ /* NOLINT */ \ @@ -2239,8 +2201,10 @@ static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR }; // Read 2 UV from 411, upsample to 8 UV. 
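The READYUV411 change below switches from a 4-byte movd load to two zero-extended word loads (hence the extra ebx push and the "modifies EBX" notes in the callers), so only the two U bytes and two V bytes that are needed get read. Conceptually the macro upsamples those samples as follows (scalar sketch, hypothetical helper name):

#include <stdint.h>

// In 4:1:1, one U/V pair covers 4 horizontal pixels, so 2 U bytes and
// 2 V bytes expand to the 8 interleaved UV pairs consumed by YUVTORGB.
static void ExpandUV411(const uint8_t u[2], const uint8_t v[2],
                        uint8_t uv_out[16]) {
  for (int i = 0; i < 8; ++i) {
    uv_out[2 * i + 0] = u[i / 4];
    uv_out[2 * i + 1] = v[i / 4];
  }
}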
#define READYUV411 __asm { \ - __asm movd xmm0, [esi] /* U */ \ - __asm movd xmm1, [esi + edi] /* V */ \ + __asm movzx ebx, word ptr [esi] /* U */ /* NOLINT */ \ + __asm movd xmm0, ebx \ + __asm movzx ebx, word ptr [esi + edi] /* V */ /* NOLINT */ \ + __asm movd xmm1, ebx \ __asm lea esi, [esi + 2] \ __asm punpcklbw xmm0, xmm1 /* UV */ \ __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ @@ -2330,7 +2294,7 @@ void I444ToARGBRow_SSSE3(const uint8* y_buf, pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha pxor xmm4, xmm4 - align 16 + align 4 convertloop: READYUV444 YUVTORGB @@ -2374,7 +2338,7 @@ void I422ToRGB24Row_SSSE3(const uint8* y_buf, movdqa xmm5, kShuffleMaskARGBToRGB24_0 movdqa xmm6, kShuffleMaskARGBToRGB24 - align 16 + align 4 convertloop: READYUV422 YUVTORGB @@ -2421,7 +2385,7 @@ void I422ToRAWRow_SSSE3(const uint8* y_buf, movdqa xmm5, kShuffleMaskARGBToRAW_0 movdqa xmm6, kShuffleMaskARGBToRAW - align 16 + align 4 convertloop: READYUV422 YUVTORGB @@ -2473,7 +2437,7 @@ void I422ToRGB565Row_SSSE3(const uint8* y_buf, pcmpeqb xmm7, xmm7 // generate mask 0xfffff800 pslld xmm7, 11 - align 16 + align 4 convertloop: READYUV422 YUVTORGB @@ -2540,7 +2504,7 @@ void I422ToARGBRow_SSSE3(const uint8* y_buf, pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha pxor xmm4, xmm4 - align 16 + align 4 convertloop: READYUV422 YUVTORGB @@ -2573,20 +2537,21 @@ void I411ToARGBRow_SSSE3(const uint8* y_buf, uint8* dst_argb, int width) { __asm { + push ebx push esi push edi - mov eax, [esp + 8 + 4] // Y - mov esi, [esp + 8 + 8] // U - mov edi, [esp + 8 + 12] // V - mov edx, [esp + 8 + 16] // argb - mov ecx, [esp + 8 + 20] // width + mov eax, [esp + 12 + 4] // Y + mov esi, [esp + 12 + 8] // U + mov edi, [esp + 12 + 12] // V + mov edx, [esp + 12 + 16] // argb + mov ecx, [esp + 12 + 20] // width sub edi, esi pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha pxor xmm4, xmm4 - align 16 + align 4 convertloop: - READYUV411 + READYUV411 // modifies EBX YUVTORGB // Step 3: Weave into ARGB @@ -2603,6 +2568,7 @@ void I411ToARGBRow_SSSE3(const uint8* y_buf, pop edi pop esi + pop ebx ret } } @@ -2623,7 +2589,7 @@ void NV12ToARGBRow_SSSE3(const uint8* y_buf, pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha pxor xmm4, xmm4 - align 16 + align 4 convertloop: READNV12 YUVTORGB @@ -2661,7 +2627,7 @@ void NV21ToARGBRow_SSSE3(const uint8* y_buf, pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha pxor xmm4, xmm4 - align 16 + align 4 convertloop: READNV12 YVUTORGB @@ -2703,7 +2669,7 @@ void I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha pxor xmm4, xmm4 - align 16 + align 4 convertloop: READYUV444 YUVTORGB @@ -2746,7 +2712,7 @@ void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha pxor xmm4, xmm4 - align 16 + align 4 convertloop: READYUV422 YUVTORGB @@ -2779,20 +2745,21 @@ void I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, uint8* dst_argb, int width) { __asm { + push ebx push esi push edi - mov eax, [esp + 8 + 4] // Y - mov esi, [esp + 8 + 8] // U - mov edi, [esp + 8 + 12] // V - mov edx, [esp + 8 + 16] // argb - mov ecx, [esp + 8 + 20] // width + mov eax, [esp + 12 + 4] // Y + mov esi, [esp + 12 + 8] // U + mov edi, [esp + 12 + 12] // V + mov edx, [esp + 12 + 16] // argb + mov ecx, [esp + 12 + 20] // width sub edi, esi pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha pxor xmm4, xmm4 - align 16 + align 4 convertloop: - READYUV411 + READYUV411 // modifies EBX YUVTORGB // Step 3: Weave into ARGB @@ -2809,6 +2776,7 
@@ void I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, pop edi pop esi + pop ebx ret } } @@ -2829,7 +2797,7 @@ void NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha pxor xmm4, xmm4 - align 16 + align 4 convertloop: READNV12 YUVTORGB @@ -2867,7 +2835,7 @@ void NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha pxor xmm4, xmm4 - align 16 + align 4 convertloop: READNV12 YVUTORGB @@ -2906,7 +2874,7 @@ void I422ToBGRARow_SSSE3(const uint8* y_buf, sub edi, esi pxor xmm4, xmm4 - align 16 + align 4 convertloop: READYUV422 YUVTORGB @@ -2947,7 +2915,7 @@ void I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf, sub edi, esi pxor xmm4, xmm4 - align 16 + align 4 convertloop: READYUV422 YUVTORGB @@ -2989,7 +2957,7 @@ void I422ToABGRRow_SSSE3(const uint8* y_buf, pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha pxor xmm4, xmm4 - align 16 + align 4 convertloop: READYUV422 YUVTORGB @@ -3030,7 +2998,7 @@ void I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf, pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha pxor xmm4, xmm4 - align 16 + align 4 convertloop: READYUV422 YUVTORGB @@ -3070,7 +3038,7 @@ void I422ToRGBARow_SSSE3(const uint8* y_buf, sub edi, esi pxor xmm4, xmm4 - align 16 + align 4 convertloop: READYUV422 YUVTORGB @@ -3111,7 +3079,7 @@ void I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf, sub edi, esi pxor xmm4, xmm4 - align 16 + align 4 convertloop: READYUV422 YUVTORGB @@ -3156,7 +3124,7 @@ void YToARGBRow_SSE2(const uint8* y_buf, mov edx, [esp + 8] // rgb mov ecx, [esp + 12] // width - align 16 + align 4 convertloop: // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164 movq xmm0, qword ptr [eax] @@ -3200,7 +3168,7 @@ void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { movdqa xmm5, kShuffleMirror lea eax, [eax - 16] - align 16 + align 4 convertloop: movdqa xmm0, [eax + ecx] pshufb xmm0, xmm5 @@ -3229,7 +3197,7 @@ void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) { vmovdqa ymm5, kShuffleMirror_AVX2 lea eax, [eax - 32] - align 16 + align 4 convertloop: vmovdqu ymm0, [eax + ecx] vpshufb ymm0, ymm0, ymm5 @@ -3255,7 +3223,7 @@ void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) { mov ecx, [esp + 12] // width lea eax, [eax - 16] - align 16 + align 4 convertloop: movdqu xmm0, [eax + ecx] movdqa xmm1, xmm0 // swap bytes @@ -3293,7 +3261,7 @@ void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, lea eax, [eax + ecx * 2 - 16] sub edi, edx - align 16 + align 4 convertloop: movdqa xmm0, [eax] lea eax, [eax - 16] @@ -3322,12 +3290,13 @@ void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { mov eax, [esp + 4] // src mov edx, [esp + 8] // dst mov ecx, [esp + 12] // width + lea eax, [eax - 16 + ecx * 4] // last 4 pixels. 
movdqa xmm5, kARGBShuffleMirror - lea eax, [eax - 16] - align 16 + align 4 convertloop: - movdqa xmm0, [eax + ecx * 4] + movdqa xmm0, [eax] + lea eax, [eax - 16] pshufb xmm0, xmm5 sub ecx, 4 movdqa [edx], xmm0 @@ -3353,7 +3322,7 @@ void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) { lea eax, [eax - 32] vmovdqa ymm5, kARGBShuffleMirror_AVX2 - align 16 + align 4 convertloop: vpermd ymm0, ymm5, [eax + ecx * 4] // permute dword order sub ecx, 8 @@ -3379,7 +3348,7 @@ void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { psrlw xmm5, 8 sub edi, edx - align 16 + align 4 convertloop: movdqa xmm0, [eax] movdqa xmm1, [eax + 16] @@ -3416,7 +3385,7 @@ void SplitUVRow_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, psrlw xmm5, 8 sub edi, edx - align 16 + align 4 convertloop: movdqu xmm0, [eax] movdqu xmm1, [eax + 16] @@ -3454,7 +3423,7 @@ void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { vpsrlw ymm5, ymm5, 8 sub edi, edx - align 16 + align 4 convertloop: vmovdqu ymm0, [eax] vmovdqu ymm1, [eax + 32] @@ -3492,7 +3461,7 @@ void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, mov ecx, [esp + 4 + 16] // width sub edx, eax - align 16 + align 4 convertloop: movdqa xmm0, [eax] // read 16 U's movdqa xmm1, [eax + edx] // and 16 V's @@ -3522,7 +3491,7 @@ void MergeUVRow_Unaligned_SSE2(const uint8* src_u, const uint8* src_v, mov ecx, [esp + 4 + 16] // width sub edx, eax - align 16 + align 4 convertloop: movdqu xmm0, [eax] // read 16 U's movdqu xmm1, [eax + edx] // and 16 V's @@ -3554,7 +3523,7 @@ void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, mov ecx, [esp + 4 + 16] // width sub edx, eax - align 16 + align 4 convertloop: vmovdqu ymm0, [eax] // read 32 U's vmovdqu ymm1, [eax + edx] // and 32 V's @@ -3584,15 +3553,15 @@ void CopyRow_SSE2(const uint8* src, uint8* dst, int count) { mov eax, [esp + 4] // src mov edx, [esp + 8] // dst mov ecx, [esp + 12] // count - sub edx, eax - align 16 + align 4 convertloop: movdqa xmm0, [eax] movdqa xmm1, [eax + 16] - movdqa [eax + edx], xmm0 - movdqa [eax + edx + 16], xmm1 lea eax, [eax + 32] + movdqa [edx], xmm0 + movdqa [edx + 16], xmm1 + lea edx, [edx + 32] sub ecx, 32 jg convertloop ret @@ -3634,6 +3603,144 @@ void CopyRow_X86(const uint8* src, uint8* dst, int count) { } #endif // HAS_COPYROW_X86 +#ifdef HAS_ARGBCOPYALPHAROW_SSE2 +// width in pixels +__declspec(naked) __declspec(align(16)) +void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { + __asm { + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst + mov ecx, [esp + 12] // count + pcmpeqb xmm0, xmm0 // generate mask 0xff000000 + pslld xmm0, 24 + pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff + psrld xmm1, 8 + + align 4 + convertloop: + movdqa xmm2, [eax] + movdqa xmm3, [eax + 16] + lea eax, [eax + 32] + movdqa xmm4, [edx] + movdqa xmm5, [edx + 16] + pand xmm2, xmm0 + pand xmm3, xmm0 + pand xmm4, xmm1 + pand xmm5, xmm1 + por xmm2, xmm4 + por xmm3, xmm5 + movdqa [edx], xmm2 + movdqa [edx + 16], xmm3 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + + ret + } +} +#endif // HAS_ARGBCOPYALPHAROW_SSE2 + +#ifdef HAS_ARGBCOPYALPHAROW_AVX2 +// width in pixels +__declspec(naked) __declspec(align(16)) +void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { + __asm { + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst + mov ecx, [esp + 12] // count + vpcmpeqb ymm0, ymm0, ymm0 + vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff + + align 4 + convertloop: + 
vmovdqu ymm1, [eax] + vmovdqu ymm2, [eax + 32] + lea eax, [eax + 64] + vpblendvb ymm1, ymm1, [edx], ymm0 + vpblendvb ymm2, ymm2, [edx + 32], ymm0 + vmovdqu [edx], ymm1 + vmovdqu [edx + 32], ymm2 + lea edx, [edx + 64] + sub ecx, 16 + jg convertloop + + vzeroupper + ret + } +} +#endif // HAS_ARGBCOPYALPHAROW_AVX2 + +#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2 +// width in pixels +__declspec(naked) __declspec(align(16)) +void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { + __asm { + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst + mov ecx, [esp + 12] // count + pcmpeqb xmm0, xmm0 // generate mask 0xff000000 + pslld xmm0, 24 + pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff + psrld xmm1, 8 + + align 4 + convertloop: + movq xmm2, qword ptr [eax] // 8 Y's + lea eax, [eax + 8] + punpcklbw xmm2, xmm2 + punpckhwd xmm3, xmm2 + punpcklwd xmm2, xmm2 + movdqa xmm4, [edx] + movdqa xmm5, [edx + 16] + pand xmm2, xmm0 + pand xmm3, xmm0 + pand xmm4, xmm1 + pand xmm5, xmm1 + por xmm2, xmm4 + por xmm3, xmm5 + movdqa [edx], xmm2 + movdqa [edx + 16], xmm3 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + + ret + } +} +#endif // HAS_ARGBCOPYYTOALPHAROW_SSE2 + +#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2 +// width in pixels +__declspec(naked) __declspec(align(16)) +void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { + __asm { + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst + mov ecx, [esp + 12] // count + vpcmpeqb ymm0, ymm0, ymm0 + vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff + + align 4 + convertloop: + vpmovzxbd ymm1, qword ptr [eax] + vpmovzxbd ymm2, qword ptr [eax + 8] + lea eax, [eax + 16] + vpslld ymm1, ymm1, 24 + vpslld ymm2, ymm2, 24 + vpblendvb ymm1, ymm1, [edx], ymm0 + vpblendvb ymm2, ymm2, [edx + 32], ymm0 + vmovdqu [edx], ymm1 + vmovdqu [edx + 32], ymm2 + lea edx, [edx + 64] + sub ecx, 16 + jg convertloop + + vzeroupper + ret + } +} +#endif // HAS_ARGBCOPYYTOALPHAROW_AVX2 + #ifdef HAS_SETROW_X86 // SetRow8 writes 'count' bytes using a 32 bit value repeated. 
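Before the SetRow8 code, a scalar view of the ARGBCopyAlphaRow / ARGBCopyYToAlphaRow kernels that end just above: both overwrite only the alpha byte of the destination ARGB pixels, sourcing it from the source pixel's alpha or from a Y plane respectively (sketch, not part of the patch):

#include <stdint.h>

// dst keeps its B, G, R bytes; only the A byte is replaced.
static void ARGBCopyAlphaRow_Sketch(const uint8_t* src_argb, uint8_t* dst_argb,
                                    int width) {
  for (int i = 0; i < width; ++i) {
    dst_argb[4 * i + 3] = src_argb[4 * i + 3];
  }
}

static void ARGBCopyYToAlphaRow_Sketch(const uint8_t* src_y, uint8_t* dst_argb,
                                       int width) {
  for (int i = 0; i < width; ++i) {
    dst_argb[4 * i + 3] = src_y[i];
  }
}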
__declspec(naked) __declspec(align(16)) @@ -3666,7 +3773,7 @@ void ARGBSetRows_X86(uint8* dst, uint32 v32, int width, lea ecx, [ebp * 4] sub edx, ecx // stride - width * 4 - align 16 + align 4 convertloop: mov ecx, ebp rep stosd @@ -3693,7 +3800,7 @@ void YUY2ToYRow_AVX2(const uint8* src_yuy2, vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff vpsrlw ymm5, ymm5, 8 - align 16 + align 4 convertloop: vmovdqu ymm0, [eax] vmovdqu ymm1, [eax + 32] @@ -3726,7 +3833,7 @@ void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2, vpsrlw ymm5, ymm5, 8 sub edi, edx - align 16 + align 4 convertloop: vmovdqu ymm0, [eax] vmovdqu ymm1, [eax + 32] @@ -3769,7 +3876,7 @@ void YUY2ToUV422Row_AVX2(const uint8* src_yuy2, vpsrlw ymm5, ymm5, 8 sub edi, edx - align 16 + align 4 convertloop: vmovdqu ymm0, [eax] vmovdqu ymm1, [eax + 32] @@ -3804,7 +3911,7 @@ void UYVYToYRow_AVX2(const uint8* src_uyvy, mov edx, [esp + 8] // dst_y mov ecx, [esp + 12] // pix - align 16 + align 4 convertloop: vmovdqu ymm0, [eax] vmovdqu ymm1, [eax + 32] @@ -3837,7 +3944,7 @@ void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy, vpsrlw ymm5, ymm5, 8 sub edi, edx - align 16 + align 4 convertloop: vmovdqu ymm0, [eax] vmovdqu ymm1, [eax + 32] @@ -3880,7 +3987,7 @@ void UYVYToUV422Row_AVX2(const uint8* src_uyvy, vpsrlw ymm5, ymm5, 8 sub edi, edx - align 16 + align 4 convertloop: vmovdqu ymm0, [eax] vmovdqu ymm1, [eax + 32] @@ -3919,7 +4026,7 @@ void YUY2ToYRow_SSE2(const uint8* src_yuy2, pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff psrlw xmm5, 8 - align 16 + align 4 convertloop: movdqa xmm0, [eax] movdqa xmm1, [eax + 16] @@ -3950,7 +4057,7 @@ void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, psrlw xmm5, 8 sub edi, edx - align 16 + align 4 convertloop: movdqa xmm0, [eax] movdqa xmm1, [eax + 16] @@ -3992,7 +4099,7 @@ void YUY2ToUV422Row_SSE2(const uint8* src_yuy2, psrlw xmm5, 8 sub edi, edx - align 16 + align 4 convertloop: movdqa xmm0, [eax] movdqa xmm1, [eax + 16] @@ -4026,7 +4133,7 @@ void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2, pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff psrlw xmm5, 8 - align 16 + align 4 convertloop: movdqu xmm0, [eax] movdqu xmm1, [eax + 16] @@ -4057,7 +4164,7 @@ void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, int stride_yuy2, psrlw xmm5, 8 sub edi, edx - align 16 + align 4 convertloop: movdqu xmm0, [eax] movdqu xmm1, [eax + 16] @@ -4099,7 +4206,7 @@ void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2, psrlw xmm5, 8 sub edi, edx - align 16 + align 4 convertloop: movdqu xmm0, [eax] movdqu xmm1, [eax + 16] @@ -4131,7 +4238,7 @@ void UYVYToYRow_SSE2(const uint8* src_uyvy, mov edx, [esp + 8] // dst_y mov ecx, [esp + 12] // pix - align 16 + align 4 convertloop: movdqa xmm0, [eax] movdqa xmm1, [eax + 16] @@ -4162,7 +4269,7 @@ void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, psrlw xmm5, 8 sub edi, edx - align 16 + align 4 convertloop: movdqa xmm0, [eax] movdqa xmm1, [eax + 16] @@ -4204,7 +4311,7 @@ void UYVYToUV422Row_SSE2(const uint8* src_uyvy, psrlw xmm5, 8 sub edi, edx - align 16 + align 4 convertloop: movdqa xmm0, [eax] movdqa xmm1, [eax + 16] @@ -4236,7 +4343,7 @@ void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy, mov edx, [esp + 8] // dst_y mov ecx, [esp + 12] // pix - align 16 + align 4 convertloop: movdqu xmm0, [eax] movdqu xmm1, [eax + 16] @@ -4267,7 +4374,7 @@ void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy, psrlw xmm5, 8 sub edi, edx - align 16 + align 4 convertloop: movdqu xmm0, [eax] movdqu xmm1, [eax + 16] @@ -4309,7 
+4416,7 @@ void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy, psrlw xmm5, 8 sub edi, edx - align 16 + align 4 convertloop: movdqu xmm0, [eax] movdqu xmm1, [eax + 16] @@ -4479,7 +4586,7 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, mov esi, [esp + 4 + 8] // src_argb1 mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // width - pcmpeqb xmm7, xmm7 // generate constant 1 + pcmpeqb xmm7, xmm7 // generate constant 0x0001 psrlw xmm7, 15 pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff psrlw xmm6, 8 @@ -4624,13 +4731,12 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { mov eax, [esp + 4] // src_argb0 mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // width - sub edx, eax pcmpeqb xmm4, xmm4 // generate mask 0xff000000 pslld xmm4, 24 pcmpeqb xmm5, xmm5 // generate mask 0x00ffffff psrld xmm5, 8 - align 16 + align 4 convertloop: movdqa xmm0, [eax] // read 4 pixels punpcklbw xmm0, xmm0 // first 2 @@ -4643,6 +4749,7 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { pshuflw xmm2, xmm2, 0FFh pmulhuw xmm1, xmm2 // rgb * a movdqa xmm2, [eax] // alphas + lea eax, [eax + 16] psrlw xmm0, 8 pand xmm2, xmm4 psrlw xmm1, 8 @@ -4650,8 +4757,8 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { pand xmm0, xmm5 // keep original alphas por xmm0, xmm2 sub ecx, 4 - movdqa [eax + edx], xmm0 - lea eax, [eax + 16] + movdqa [edx], xmm0 + lea edx, [edx + 16] jg convertloop ret @@ -4674,33 +4781,33 @@ void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { mov eax, [esp + 4] // src_argb0 mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // width - sub edx, eax pcmpeqb xmm3, xmm3 // generate mask 0xff000000 pslld xmm3, 24 movdqa xmm4, kShuffleAlpha0 movdqa xmm5, kShuffleAlpha1 - align 16 + align 4 convertloop: - movdqa xmm0, [eax] // read 4 pixels + movdqu xmm0, [eax] // read 4 pixels pshufb xmm0, xmm4 // isolate first 2 alphas - movdqa xmm1, [eax] // read 4 pixels + movdqu xmm1, [eax] // read 4 pixels punpcklbw xmm1, xmm1 // first 2 pixel rgbs pmulhuw xmm0, xmm1 // rgb * a - movdqa xmm1, [eax] // read 4 pixels + movdqu xmm1, [eax] // read 4 pixels pshufb xmm1, xmm5 // isolate next 2 alphas - movdqa xmm2, [eax] // read 4 pixels + movdqu xmm2, [eax] // read 4 pixels punpckhbw xmm2, xmm2 // next 2 pixel rgbs pmulhuw xmm1, xmm2 // rgb * a - movdqa xmm2, [eax] // mask original alpha + movdqu xmm2, [eax] // mask original alpha + lea eax, [eax + 16] pand xmm2, xmm3 psrlw xmm0, 8 psrlw xmm1, 8 packuswb xmm0, xmm1 por xmm0, xmm2 // copy original alpha sub ecx, 4 - movdqa [eax + edx], xmm0 - lea eax, [eax + 16] + movdqu [edx], xmm0 + lea edx, [edx + 16] jg convertloop ret @@ -4727,7 +4834,7 @@ void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) { vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000 vpslld ymm5, ymm5, 24 - align 16 + align 4 convertloop: vmovdqu ymm6, [eax] // read 8 pixels. vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated. 
@@ -4764,11 +4871,10 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, mov eax, [esp + 8 + 4] // src_argb0 mov edx, [esp + 8 + 8] // dst_argb mov ecx, [esp + 8 + 12] // width - sub edx, eax - align 16 + align 4 convertloop: - movdqa xmm0, [eax] // read 4 pixels + movdqu xmm0, [eax] // read 4 pixels movzx esi, byte ptr [eax + 3] // first alpha movzx edi, byte ptr [eax + 7] // second alpha punpcklbw xmm0, xmm0 // first 2 @@ -4779,7 +4885,7 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, movlhps xmm2, xmm3 pmulhuw xmm0, xmm2 // rgb * a - movdqa xmm1, [eax] // read 4 pixels + movdqu xmm1, [eax] // read 4 pixels movzx esi, byte ptr [eax + 11] // third alpha movzx edi, byte ptr [eax + 15] // forth alpha punpckhbw xmm1, xmm1 // next 2 @@ -4789,11 +4895,12 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words movlhps xmm2, xmm3 pmulhuw xmm1, xmm2 // rgb * a + lea eax, [eax + 16] packuswb xmm0, xmm1 sub ecx, 4 - movdqa [eax + edx], xmm0 - lea eax, [eax + 16] + movdqu [edx], xmm0 + lea edx, [edx + 16] jg convertloop pop edi pop esi @@ -4821,7 +4928,7 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, sub edx, eax vmovdqa ymm4, kUnattenShuffleAlpha_AVX2 - align 16 + align 4 convertloop: vmovdqu ymm6, [eax] // read 8 pixels. vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xffffffff for gather. @@ -4860,7 +4967,7 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, push esi push edi - align 16 + align 4 convertloop: // replace VPGATHER movzx esi, byte ptr [eax + 3] // alpha0 @@ -4922,9 +5029,8 @@ void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { mov ecx, [esp + 12] /* width */ movdqa xmm4, kARGBToYJ movdqa xmm5, kAddYJ64 - sub edx, eax - align 16 + align 4 convertloop: movdqa xmm0, [eax] // G movdqa xmm1, [eax + 16] @@ -4936,6 +5042,7 @@ void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { packuswb xmm0, xmm0 // 8 G bytes movdqa xmm2, [eax] // A movdqa xmm3, [eax + 16] + lea eax, [eax + 32] psrld xmm2, 24 psrld xmm3, 24 packuswb xmm2, xmm3 @@ -4947,9 +5054,9 @@ void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { punpcklwd xmm0, xmm3 // GGGA first 4 punpckhwd xmm1, xmm3 // GGGA next 4 sub ecx, 8 - movdqa [eax + edx], xmm0 - movdqa [eax + edx + 16], xmm1 - lea eax, [eax + 32] + movdqa [edx], xmm0 + movdqa [edx + 16], xmm1 + lea edx, [edx + 32] jg convertloop ret } @@ -4983,7 +5090,7 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) { movdqa xmm3, kARGBToSepiaG movdqa xmm4, kARGBToSepiaR - align 16 + align 4 convertloop: movdqa xmm0, [eax] // B movdqa xmm6, [eax + 16] @@ -5033,111 +5140,65 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) { // TODO(fbarchard): packuswbs only use half of the reg. To make RGBA, combine R // and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd. 
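The rewrite of ARGBColorMatrixRow_SSSE3 below adds a separate source pointer, feeds alpha through the matrix as well, and scales with psraw 6 instead of 7. A scalar sketch of the new behaviour (illustrative; the coefficient layout is the row-per-output-channel order implied by the pshufd broadcasts):

#include <stdint.h>

static uint8_t Clamp255(int v) {
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

// Each output channel is a signed 8-bit weighted sum of the four input
// channels, scaled by 1/64 (psraw 6) and saturated to a byte.
static void ARGBColorMatrixRow_Sketch(const uint8_t* src_argb,
                                      uint8_t* dst_argb,
                                      const int8_t* matrix_argb, int width) {
  for (int i = 0; i < width; ++i) {
    for (int c = 0; c < 4; ++c) {
      int sum = src_argb[0] * matrix_argb[c * 4 + 0] +
                src_argb[1] * matrix_argb[c * 4 + 1] +
                src_argb[2] * matrix_argb[c * 4 + 2] +
                src_argb[3] * matrix_argb[c * 4 + 3];
      dst_argb[c] = Clamp255(sum >> 6);
    }
    src_argb += 4;
    dst_argb += 4;
  }
}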
__declspec(naked) __declspec(align(16)) -void ARGBColorMatrixRow_SSSE3(uint8* dst_argb, const int8* matrix_argb, - int width) { +void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb, + const int8* matrix_argb, int width) { __asm { - mov eax, [esp + 4] /* dst_argb */ - mov edx, [esp + 8] /* matrix_argb */ - mov ecx, [esp + 12] /* width */ - movd xmm2, [edx] - movd xmm3, [edx + 4] - movd xmm4, [edx + 8] - pshufd xmm2, xmm2, 0 - pshufd xmm3, xmm3, 0 - pshufd xmm4, xmm4, 0 + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_argb */ + mov ecx, [esp + 12] /* matrix_argb */ + movdqu xmm5, [ecx] + pshufd xmm2, xmm5, 0x00 + pshufd xmm3, xmm5, 0x55 + pshufd xmm4, xmm5, 0xaa + pshufd xmm5, xmm5, 0xff + mov ecx, [esp + 16] /* width */ - align 16 + align 4 convertloop: movdqa xmm0, [eax] // B - movdqa xmm6, [eax + 16] + movdqa xmm7, [eax + 16] pmaddubsw xmm0, xmm2 - pmaddubsw xmm6, xmm2 - movdqa xmm5, [eax] // G + pmaddubsw xmm7, xmm2 + movdqa xmm6, [eax] // G movdqa xmm1, [eax + 16] - pmaddubsw xmm5, xmm3 + pmaddubsw xmm6, xmm3 pmaddubsw xmm1, xmm3 - phaddsw xmm0, xmm6 // B - phaddsw xmm5, xmm1 // G - psraw xmm0, 7 // B - psraw xmm5, 7 // G + phaddsw xmm0, xmm7 // B + phaddsw xmm6, xmm1 // G + psraw xmm0, 6 // B + psraw xmm6, 6 // G packuswb xmm0, xmm0 // 8 B values - packuswb xmm5, xmm5 // 8 G values - punpcklbw xmm0, xmm5 // 8 BG values - movdqa xmm5, [eax] // R - movdqa xmm1, [eax + 16] - pmaddubsw xmm5, xmm4 + packuswb xmm6, xmm6 // 8 G values + punpcklbw xmm0, xmm6 // 8 BG values + movdqa xmm1, [eax] // R + movdqa xmm7, [eax + 16] pmaddubsw xmm1, xmm4 - phaddsw xmm5, xmm1 - psraw xmm5, 7 - packuswb xmm5, xmm5 // 8 R values + pmaddubsw xmm7, xmm4 + phaddsw xmm1, xmm7 // R movdqa xmm6, [eax] // A - movdqa xmm1, [eax + 16] - psrld xmm6, 24 - psrld xmm1, 24 - packuswb xmm6, xmm1 + movdqa xmm7, [eax + 16] + pmaddubsw xmm6, xmm5 + pmaddubsw xmm7, xmm5 + phaddsw xmm6, xmm7 // A + psraw xmm1, 6 // R + psraw xmm6, 6 // A + packuswb xmm1, xmm1 // 8 R values packuswb xmm6, xmm6 // 8 A values - movdqa xmm1, xmm0 // Weave BG, RA together - punpcklbw xmm5, xmm6 // 8 RA values - punpcklwd xmm0, xmm5 // BGRA first 4 - punpckhwd xmm1, xmm5 // BGRA next 4 + punpcklbw xmm1, xmm6 // 8 RA values + movdqa xmm6, xmm0 // Weave BG, RA together + punpcklwd xmm0, xmm1 // BGRA first 4 + punpckhwd xmm6, xmm1 // BGRA next 4 sub ecx, 8 - movdqa [eax], xmm0 - movdqa [eax + 16], xmm1 + movdqa [edx], xmm0 + movdqa [edx + 16], xmm6 lea eax, [eax + 32] + lea edx, [edx + 32] jg convertloop ret } } #endif // HAS_ARGBCOLORMATRIXROW_SSSE3 -#ifdef HAS_ARGBCOLORTABLEROW_X86 -// Tranform ARGB pixels with color table. 
-__declspec(naked) __declspec(align(16)) -void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, - int width) { - __asm { - push ebx - push esi - push edi - push ebp - mov eax, [esp + 16 + 4] /* dst_argb */ - mov edi, [esp + 16 + 8] /* table_argb */ - mov ecx, [esp + 16 + 12] /* width */ - xor ebx, ebx - xor edx, edx - - align 16 - convertloop: - mov ebp, dword ptr [eax] // BGRA - mov esi, ebp - and ebp, 255 - shr esi, 8 - and esi, 255 - mov bl, [edi + ebp * 4 + 0] // B - mov dl, [edi + esi * 4 + 1] // G - mov ebp, dword ptr [eax] // BGRA - mov esi, ebp - shr ebp, 16 - shr esi, 24 - and ebp, 255 - mov [eax], bl - mov [eax + 1], dl - mov bl, [edi + ebp * 4 + 2] // R - mov dl, [edi + esi * 4 + 3] // A - mov [eax + 2], bl - mov [eax + 3], dl - lea eax, [eax + 4] - sub ecx, 1 - jg convertloop - pop ebp - pop edi - pop esi - pop ebx - ret - } -} -#endif // HAS_ARGBCOLORTABLEROW_X86 - #ifdef HAS_ARGBQUANTIZEROW_SSE2 // Quantize 4 ARGB pixels (16 bytes). // Aligned to 16 bytes. @@ -5160,7 +5221,7 @@ void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size, pcmpeqb xmm6, xmm6 // generate mask 0xff000000 pslld xmm6, 24 - align 16 + align 4 convertloop: movdqa xmm0, [eax] // read 4 pixels punpcklbw xmm0, xmm5 // first 2 pixels @@ -5196,13 +5257,13 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width, mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // width movd xmm2, [esp + 16] // value - sub edx, eax punpcklbw xmm2, xmm2 punpcklqdq xmm2, xmm2 - align 16 + align 4 convertloop: movdqa xmm0, [eax] // read 4 pixels + lea eax, [eax + 16] movdqa xmm1, xmm0 punpcklbw xmm0, xmm0 // first 2 punpckhbw xmm1, xmm1 // next 2 @@ -5212,8 +5273,8 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width, psrlw xmm1, 8 packuswb xmm0, xmm1 sub ecx, 4 - movdqa [eax + edx], xmm0 - lea eax, [eax + 16] + movdqa [edx], xmm0 + lea edx, [edx + 16] jg convertloop ret @@ -5233,25 +5294,25 @@ void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // width pxor xmm5, xmm5 // constant 0 - sub esi, eax - sub edx, eax - align 16 + align 4 convertloop: movdqu xmm0, [eax] // read 4 pixels from src_argb0 - movdqu xmm2, [eax + esi] // read 4 pixels from src_argb1 + movdqu xmm2, [esi] // read 4 pixels from src_argb1 movdqu xmm1, xmm0 movdqu xmm3, xmm2 - punpcklbw xmm0, xmm0 // first 2 - punpckhbw xmm1, xmm1 // next 2 - punpcklbw xmm2, xmm5 // first 2 - punpckhbw xmm3, xmm5 // next 2 - pmulhuw xmm0, xmm2 // src_argb0 * src_argb1 first 2 - pmulhuw xmm1, xmm3 // src_argb0 * src_argb1 next 2 + punpcklbw xmm0, xmm0 // first 2 + punpckhbw xmm1, xmm1 // next 2 + punpcklbw xmm2, xmm5 // first 2 + punpckhbw xmm3, xmm5 // next 2 + pmulhuw xmm0, xmm2 // src_argb0 * src_argb1 first 2 + pmulhuw xmm1, xmm3 // src_argb0 * src_argb1 next 2 + lea eax, [eax + 16] + lea esi, [esi + 16] packuswb xmm0, xmm1 sub ecx, 4 - movdqu [eax + edx], xmm0 - lea eax, [eax + 16] + movdqu [edx], xmm0 + lea edx, [edx + 16] jg convertloop pop esi @@ -5272,20 +5333,20 @@ void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, mov esi, [esp + 4 + 8] // src_argb1 mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // width - sub esi, eax - sub edx, eax sub ecx, 4 jl convertloop49 - align 16 + align 4 convertloop4: movdqu xmm0, [eax] // read 4 pixels from src_argb0 - movdqu xmm1, [eax + esi] // read 4 pixels from src_argb1 + lea eax, [eax + 16] + movdqu xmm1, [esi] // read 4 pixels from src_argb1 + lea 
esi, [esi + 16] paddusb xmm0, xmm1 // src_argb0 + src_argb1 sub ecx, 4 - movdqu [eax + edx], xmm0 - lea eax, [eax + 16] + movdqu [edx], xmm0 + lea edx, [edx + 16] jge convertloop4 convertloop49: @@ -5294,11 +5355,13 @@ void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, convertloop1: movd xmm0, [eax] // read 1 pixels from src_argb0 - movd xmm1, [eax + esi] // read 1 pixels from src_argb1 + lea eax, [eax + 4] + movd xmm1, [esi] // read 1 pixels from src_argb1 + lea esi, [esi + 4] paddusb xmm0, xmm1 // src_argb0 + src_argb1 sub ecx, 1 - movd [eax + edx], xmm0 - lea eax, [eax + 4] + movd [edx], xmm0 + lea edx, [edx + 4] jge convertloop1 convertloop19: @@ -5319,17 +5382,17 @@ void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, mov esi, [esp + 4 + 8] // src_argb1 mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // width - sub esi, eax - sub edx, eax - align 16 + align 4 convertloop: movdqu xmm0, [eax] // read 4 pixels from src_argb0 - movdqu xmm1, [eax + esi] // read 4 pixels from src_argb1 + lea eax, [eax + 16] + movdqu xmm1, [esi] // read 4 pixels from src_argb1 + lea esi, [esi + 16] psubusb xmm0, xmm1 // src_argb0 - src_argb1 sub ecx, 4 - movdqu [eax + edx], xmm0 - lea eax, [eax + 16] + movdqu [edx], xmm0 + lea edx, [edx + 16] jg convertloop pop esi @@ -5349,14 +5412,14 @@ void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, mov esi, [esp + 4 + 8] // src_argb1 mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // width - vpxor ymm5, ymm5, ymm5 // constant 0 - sub esi, eax - sub edx, eax + vpxor ymm5, ymm5, ymm5 // constant 0 - align 16 + align 4 convertloop: vmovdqu ymm1, [eax] // read 8 pixels from src_argb0 - vmovdqu ymm3, [eax + esi] // read 8 pixels from src_argb1 + lea eax, [eax + 32] + vmovdqu ymm3, [esi] // read 8 pixels from src_argb1 + lea esi, [esi + 32] vpunpcklbw ymm0, ymm1, ymm1 // low 4 vpunpckhbw ymm1, ymm1, ymm1 // high 4 vpunpcklbw ymm2, ymm3, ymm5 // low 4 @@ -5364,8 +5427,8 @@ void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, vpmulhuw ymm0, ymm0, ymm2 // src_argb0 * src_argb1 low 4 vpmulhuw ymm1, ymm1, ymm3 // src_argb0 * src_argb1 high 4 vpackuswb ymm0, ymm0, ymm1 - vmovdqu [eax + edx], ymm0 - lea eax, [eax + 32] + vmovdqu [edx], ymm0 + lea edx, [edx + 32] sub ecx, 8 jg convertloop @@ -5387,15 +5450,15 @@ void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, mov esi, [esp + 4 + 8] // src_argb1 mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // width - sub esi, eax - sub edx, eax - align 16 + align 4 convertloop: vmovdqu ymm0, [eax] // read 8 pixels from src_argb0 - vpaddusb ymm0, ymm0, [eax + esi] // add 8 pixels from src_argb1 - vmovdqu [eax + edx], ymm0 lea eax, [eax + 32] + vpaddusb ymm0, ymm0, [esi] // add 8 pixels from src_argb1 + lea esi, [esi + 32] + vmovdqu [edx], ymm0 + lea edx, [edx + 32] sub ecx, 8 jg convertloop @@ -5417,15 +5480,15 @@ void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, mov esi, [esp + 4 + 8] // src_argb1 mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // width - sub esi, eax - sub edx, eax - align 16 + align 4 convertloop: vmovdqu ymm0, [eax] // read 8 pixels from src_argb0 - vpsubusb ymm0, ymm0, [eax + esi] // src_argb0 - src_argb1 - vmovdqu [eax + edx], ymm0 lea eax, [eax + 32] + vpsubusb ymm0, ymm0, [esi] // src_argb0 - src_argb1 + lea esi, [esi + 32] + vmovdqu [edx], ymm0 + lea edx, [edx + 32] sub ecx, 8 jg convertloop @@ -5436,14 +5499,14 @@ void ARGBSubtractRow_AVX2(const 
uint8* src_argb0, const uint8* src_argb1, } #endif // HAS_ARGBSUBTRACTROW_AVX2 -#ifdef HAS_SOBELXROW_SSSE3 +#ifdef HAS_SOBELXROW_SSE2 // SobelX as a matrix is // -1 0 1 // -2 0 2 // -1 0 1 __declspec(naked) __declspec(align(16)) -void SobelXRow_SSSE3(const uint8* src_y0, const uint8* src_y1, - const uint8* src_y2, uint8* dst_sobelx, int width) { +void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1, + const uint8* src_y2, uint8* dst_sobelx, int width) { __asm { push esi push edi @@ -5457,7 +5520,7 @@ void SobelXRow_SSSE3(const uint8* src_y0, const uint8* src_y1, sub edx, eax pxor xmm5, xmm5 // constant 0 - align 16 + align 4 convertloop: movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0] movq xmm1, qword ptr [eax + 2] // read 8 pixels from src_y0[2] @@ -5477,7 +5540,9 @@ void SobelXRow_SSSE3(const uint8* src_y0, const uint8* src_y1, paddw xmm0, xmm2 paddw xmm0, xmm1 paddw xmm0, xmm1 - pabsw xmm0, xmm0 // SSSE3. Could use SSE2 psubusw twice instead. + pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). SSSE3 could use pabsw + psubw xmm1, xmm0 + pmaxsw xmm0, xmm1 packuswb xmm0, xmm0 sub ecx, 8 movq qword ptr [eax + edx], xmm0 @@ -5489,16 +5554,16 @@ void SobelXRow_SSSE3(const uint8* src_y0, const uint8* src_y1, ret } } -#endif // HAS_SOBELXROW_SSSE3 +#endif // HAS_SOBELXROW_SSE2 -#ifdef HAS_SOBELYROW_SSSE3 +#ifdef HAS_SOBELYROW_SSE2 // SobelY as a matrix is // -1 -2 -1 // 0 0 0 // 1 2 1 __declspec(naked) __declspec(align(16)) -void SobelYRow_SSSE3(const uint8* src_y0, const uint8* src_y1, - uint8* dst_sobely, int width) { +void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1, + uint8* dst_sobely, int width) { __asm { push esi mov eax, [esp + 4 + 4] // src_y0 @@ -5509,7 +5574,7 @@ void SobelYRow_SSSE3(const uint8* src_y0, const uint8* src_y1, sub edx, eax pxor xmm5, xmm5 // constant 0 - align 16 + align 4 convertloop: movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0] movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0] @@ -5529,7 +5594,9 @@ void SobelYRow_SSSE3(const uint8* src_y0, const uint8* src_y1, paddw xmm0, xmm2 paddw xmm0, xmm1 paddw xmm0, xmm1 - pabsw xmm0, xmm0 // SSSE3. Could use SSE2 psubusw twice instead. + pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). SSSE3 could use pabsw + psubw xmm1, xmm0 + pmaxsw xmm0, xmm1 packuswb xmm0, xmm0 sub ecx, 8 movq qword ptr [eax + edx], xmm0 @@ -5540,7 +5607,7 @@ void SobelYRow_SSSE3(const uint8* src_y0, const uint8* src_y1, ret } } -#endif // HAS_SOBELYROW_SSSE3 +#endif // HAS_SOBELYROW_SSE2 #ifdef HAS_SOBELROW_SSE2 // Adds Sobel X and Sobel Y and stores Sobel into ARGB. @@ -5550,7 +5617,7 @@ void SobelYRow_SSSE3(const uint8* src_y0, const uint8* src_y1, // B = Sobel __declspec(naked) __declspec(align(16)) void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_argb, int width) { + uint8* dst_argb, int width) { __asm { push esi mov eax, [esp + 4 + 4] // src_sobelx @@ -5561,7 +5628,7 @@ void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, pcmpeqb xmm5, xmm5 // alpha 255 pslld xmm5, 24 // 0xff000000 - align 16 + align 4 convertloop: movdqa xmm0, [eax] // read 16 pixels src_sobelx movdqa xmm1, [eax + esi] // read 16 pixels src_sobely @@ -5594,6 +5661,36 @@ void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, } #endif // HAS_SOBELROW_SSE2 +#ifdef HAS_SOBELTOPLANEROW_SSE2 +// Adds Sobel X and Sobel Y and stores Sobel into a plane. 
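The SobelXRow/SobelYRow conversions above trade the SSSE3 pabsw for an SSE2 absolute value (abs(x) = max(x, -x) built from psubw and pmaxsw), and the comment just above introduces SobelToPlaneRow_SSE2, which saturate-adds the two gradients into a single plane. A scalar sketch of the row math the assembly implies; the names are illustrative and edge handling is left to the callers, as in the real code.

#include <stdint.h>
#include <stdlib.h>

static inline uint8_t Clamp255u(int v) { return (uint8_t)(v > 255 ? 255 : v); }

void SobelXRow_Sketch(const uint8_t* y0, const uint8_t* y1, const uint8_t* y2,
                      uint8_t* dst_sobelx, int width) {
  for (int i = 0; i < width; ++i) {
    int a = y0[i] - y0[i + 2];
    int b = y1[i] - y1[i + 2];   // middle row weighted by 2
    int c = y2[i] - y2[i + 2];
    dst_sobelx[i] = Clamp255u(abs(a + b + b + c));
  }
}

void SobelYRow_Sketch(const uint8_t* y0, const uint8_t* y1,
                      uint8_t* dst_sobely, int width) {
  for (int i = 0; i < width; ++i) {
    int a = y0[i]     - y1[i];
    int b = y0[i + 1] - y1[i + 1];  // middle column weighted by 2
    int c = y0[i + 2] - y1[i + 2];
    dst_sobely[i] = Clamp255u(abs(a + b + b + c));
  }
}

// SobelToPlaneRow: saturating add of the two gradients, one byte per pixel.
void SobelToPlaneRow_Sketch(const uint8_t* sobelx, const uint8_t* sobely,
                            uint8_t* dst_y, int width) {
  for (int i = 0; i < width; ++i)
    dst_y[i] = Clamp255u(sobelx[i] + sobely[i]);
}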
+__declspec(naked) __declspec(align(16)) +void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_y, int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_sobelx + mov esi, [esp + 4 + 8] // src_sobely + mov edx, [esp + 4 + 12] // dst_argb + mov ecx, [esp + 4 + 16] // width + sub esi, eax + + align 4 + convertloop: + movdqa xmm0, [eax] // read 16 pixels src_sobelx + movdqa xmm1, [eax + esi] // read 16 pixels src_sobely + lea eax, [eax + 16] + paddusb xmm0, xmm1 // sobel = sobelx + sobely + sub ecx, 16 + movdqa [edx], xmm0 + lea edx, [edx + 16] + jg convertloop + + pop esi + ret + } +} +#endif // HAS_SOBELTOPLANEROW_SSE2 + #ifdef HAS_SOBELXYROW_SSE2 // Mixes Sobel X, Sobel Y and Sobel into ARGB. // A = 255 @@ -5610,9 +5707,9 @@ void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // width sub esi, eax - pcmpeqb xmm5, xmm5 // alpha 255 + pcmpeqb xmm5, xmm5 // alpha 255 - align 16 + align 4 convertloop: movdqa xmm0, [eax] // read 16 pixels src_sobelx movdqa xmm1, [eax + esi] // read 16 pixels src_sobely @@ -5666,15 +5763,70 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft, mov eax, topleft // eax topleft mov esi, botleft // esi botleft mov edx, width - movd xmm4, area + movd xmm5, area mov edi, dst mov ecx, count - cvtdq2ps xmm4, xmm4 - rcpss xmm4, xmm4 // 1.0f / area + cvtdq2ps xmm5, xmm5 + rcpss xmm4, xmm5 // 1.0f / area pshufd xmm4, xmm4, 0 sub ecx, 4 jl l4b + cmp area, 128 // 128 pixels will not overflow 15 bits. + ja l4 + + pshufd xmm5, xmm5, 0 // area + pcmpeqb xmm6, xmm6 // constant of 65536.0 - 1 = 65535.0 + psrld xmm6, 16 + cvtdq2ps xmm6, xmm6 + addps xmm5, xmm6 // (65536.0 + area - 1) + mulps xmm5, xmm4 // (65536.0 + area - 1) * 1 / area + cvtps2dq xmm5, xmm5 // 0.16 fixed point + packssdw xmm5, xmm5 // 16 bit shorts + + // 4 pixel loop small blocks. + align 4 + s4: + // top left + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + movdqa xmm2, [eax + 32] + movdqa xmm3, [eax + 48] + + // - top right + psubd xmm0, [eax + edx * 4] + psubd xmm1, [eax + edx * 4 + 16] + psubd xmm2, [eax + edx * 4 + 32] + psubd xmm3, [eax + edx * 4 + 48] + lea eax, [eax + 64] + + // - bottom left + psubd xmm0, [esi] + psubd xmm1, [esi + 16] + psubd xmm2, [esi + 32] + psubd xmm3, [esi + 48] + + // + bottom right + paddd xmm0, [esi + edx * 4] + paddd xmm1, [esi + edx * 4 + 16] + paddd xmm2, [esi + edx * 4 + 32] + paddd xmm3, [esi + edx * 4 + 48] + lea esi, [esi + 64] + + packssdw xmm0, xmm1 // pack 4 pixels into 2 registers + packssdw xmm2, xmm3 + + pmulhuw xmm0, xmm5 + pmulhuw xmm2, xmm5 + + packuswb xmm0, xmm2 + movdqu [edi], xmm0 + lea edi, [edi + 16] + sub ecx, 4 + jge s4 + + jmp l4b + // 4 pixel loop align 4 l4: @@ -5761,7 +5913,6 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum, mov edx, cumsum mov esi, previous_cumsum mov ecx, width - sub esi, edx pxor xmm0, xmm0 pxor xmm1, xmm1 @@ -5788,19 +5939,20 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum, punpckhwd xmm5, xmm1 paddd xmm0, xmm2 - movdqa xmm2, [edx + esi] // previous row above. + movdqa xmm2, [esi] // previous row above. 
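The CumulativeSumToAverageRow_SSE2 change earlier in this hunk adds a fast path for boxes of up to 128 pixels: the per-channel box sum taken from the integral image (top-left - top-right - bottom-left + bottom-right) still fits in 15 bits, so the divide by area becomes a 0.16 fixed-point multiply by (65536 + area - 1) / area followed by pmulhuw. A scalar sketch, with the four box corners passed explicitly to sidestep the pointer arithmetic; the name and signature are illustrative only.

#include <stdint.h>

void BoxAverageRow_Sketch(const int32_t* tl, const int32_t* tr,
                          const int32_t* bl, const int32_t* br,
                          int area, uint8_t* dst, int count) {
  // Rounded-up reciprocal in 0.16 fixed point, like the rcpss-based setup.
  const uint32_t scale = (65536u + (uint32_t)area - 1) / (uint32_t)area;
  for (int x = 0; x < count * 4; ++x) {   // 4 channels per pixel
    int32_t sum = tl[x] - tr[x] - bl[x] + br[x];
    dst[x] = (uint8_t)((uint32_t)sum * scale >> 16);  // pmulhuw equivalent
  }
}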
paddd xmm2, xmm0 paddd xmm0, xmm3 - movdqa xmm3, [edx + esi + 16] + movdqa xmm3, [esi + 16] paddd xmm3, xmm0 paddd xmm0, xmm4 - movdqa xmm4, [edx + esi + 32] + movdqa xmm4, [esi + 32] paddd xmm4, xmm0 paddd xmm0, xmm5 - movdqa xmm5, [edx + esi + 48] + movdqa xmm5, [esi + 48] + lea esi, [esi + 64] paddd xmm5, xmm0 movdqa [edx], xmm2 @@ -5824,7 +5976,8 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum, punpcklbw xmm2, xmm1 punpcklwd xmm2, xmm1 paddd xmm0, xmm2 - movdqu xmm2, [edx + esi] + movdqu xmm2, [esi] + lea esi, [esi + 16] paddd xmm2, xmm0 movdqu [edx], xmm2 lea edx, [edx + 16] @@ -5845,7 +5998,7 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, __asm { push esi push edi - mov eax, [esp + 12] // src_argb + mov eax, [esp + 12] // src_argb mov esi, [esp + 16] // stride mov edx, [esp + 20] // dst_argb mov ecx, [esp + 24] // pointer to uv_dudv @@ -5923,6 +6076,108 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, } #endif // HAS_ARGBAFFINEROW_SSE2 +#ifdef HAS_INTERPOLATEROW_AVX2 +// Bilinear filter 16x2 -> 16x1 +__declspec(naked) __declspec(align(16)) +void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) { + __asm { + push esi + push edi + mov edi, [esp + 8 + 4] // dst_ptr + mov esi, [esp + 8 + 8] // src_ptr + mov edx, [esp + 8 + 12] // src_stride + mov ecx, [esp + 8 + 16] // dst_width + mov eax, [esp + 8 + 20] // source_y_fraction (0..255) + shr eax, 1 + // Dispatch to specialized filters if applicable. + cmp eax, 0 + je xloop100 // 0 / 128. Blend 100 / 0. + sub edi, esi + cmp eax, 32 + je xloop75 // 32 / 128 is 0.25. Blend 75 / 25. + cmp eax, 64 + je xloop50 // 64 / 128 is 0.50. Blend 50 / 50. + cmp eax, 96 + je xloop25 // 96 / 128 is 0.75. Blend 25 / 75. + + vmovd xmm0, eax // high fraction 0..127 + neg eax + add eax, 128 + vmovd xmm5, eax // low fraction 128..1 + vpunpcklbw xmm5, xmm5, xmm0 + vpunpcklwd xmm5, xmm5, xmm5 + vpxor ymm0, ymm0, ymm0 + vpermd ymm5, ymm0, ymm5 + + align 4 + xloop: + vmovdqu ymm0, [esi] + vmovdqu ymm2, [esi + edx] + vpunpckhbw ymm1, ymm0, ymm2 // mutates + vpunpcklbw ymm0, ymm0, ymm2 // mutates + vpmaddubsw ymm0, ymm0, ymm5 + vpmaddubsw ymm1, ymm1, ymm5 + vpsrlw ymm0, ymm0, 7 + vpsrlw ymm1, ymm1, 7 + vpackuswb ymm0, ymm0, ymm1 // unmutates + sub ecx, 32 + vmovdqu [esi + edi], ymm0 + lea esi, [esi + 32] + jg xloop + jmp xloop99 + + // Blend 25 / 75. + align 4 + xloop25: + vmovdqu ymm0, [esi] + vpavgb ymm0, ymm0, [esi + edx] + vpavgb ymm0, ymm0, [esi + edx] + sub ecx, 32 + vmovdqu [esi + edi], ymm0 + lea esi, [esi + 32] + jg xloop25 + jmp xloop99 + + // Blend 50 / 50. + align 4 + xloop50: + vmovdqu ymm0, [esi] + vpavgb ymm0, ymm0, [esi + edx] + sub ecx, 32 + vmovdqu [esi + edi], ymm0 + lea esi, [esi + 32] + jg xloop50 + jmp xloop99 + + // Blend 75 / 25. + align 4 + xloop75: + vmovdqu ymm0, [esi + edx] + vpavgb ymm0, ymm0, [esi] + vpavgb ymm0, ymm0, [esi] + sub ecx, 32 + vmovdqu [esi + edi], ymm0 + lea esi, [esi + 32] + jg xloop75 + jmp xloop99 + + // Blend 100 / 0 - Copy row unchanged. 
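The new InterpolateRow_AVX2 above (and the SSSE3/SSE2 variants that follow) blends two source rows with an 8-bit source_y_fraction, halving it to a 7-bit weight and dispatching to cheap pavgb paths for the 25/50/75 percent blends; a fraction of 0 simply copies the first row. A scalar sketch of the general path, with an illustrative name and approximate rounding.

#include <stdint.h>
#include <stddef.h>
#include <string.h>

void InterpolateRow_Sketch(uint8_t* dst, const uint8_t* src,
                           ptrdiff_t src_stride, int width,
                           int source_y_fraction) {
  const int f1 = source_y_fraction >> 1;   // 0..127, like "shr eax, 1" above
  const int f0 = 128 - f1;
  if (f1 == 0) {                           // Blend 100 / 0 - copy unchanged.
    memcpy(dst, src, (size_t)width);
    return;
  }
  const uint8_t* src1 = src + src_stride;
  // The 25/50/75 special cases collapse to one or two rounding averages in
  // the SIMD code; this general formula covers them as well.
  for (int x = 0; x < width; ++x)
    dst[x] = (uint8_t)((src[x] * f0 + src1[x] * f1) >> 7);
}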
+ align 4 + xloop100: + rep movsb + + xloop99: + pop edi + pop esi + vzeroupper + ret + } +} +#endif // HAS_INTERPOLATEROW_AVX2 + +#ifdef HAS_INTERPOLATEROW_SSSE3 // Bilinear filter 16x2 -> 16x1 __declspec(naked) __declspec(align(16)) void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, @@ -5956,7 +6211,7 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, punpcklwd xmm5, xmm5 pshufd xmm5, xmm5, 0 - align 16 + align 4 xloop: movdqa xmm0, [esi] movdqa xmm2, [esi + edx] @@ -5975,7 +6230,7 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, jmp xloop99 // Blend 25 / 75. - align 16 + align 4 xloop25: movdqa xmm0, [esi] movdqa xmm1, [esi + edx] @@ -5988,7 +6243,7 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, jmp xloop99 // Blend 50 / 50. - align 16 + align 4 xloop50: movdqa xmm0, [esi] movdqa xmm1, [esi + edx] @@ -6000,7 +6255,7 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, jmp xloop99 // Blend 75 / 25. - align 16 + align 4 xloop75: movdqa xmm1, [esi] movdqa xmm0, [esi + edx] @@ -6013,7 +6268,7 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, jmp xloop99 // Blend 100 / 0 - Copy row unchanged. - align 16 + align 4 xloop100: movdqa xmm0, [esi] sub ecx, 16 @@ -6027,6 +6282,7 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, ret } } +#endif // HAS_INTERPOLATEROW_SSSE3 #ifdef HAS_INTERPOLATEROW_SSE2 // Bilinear filter 16x2 -> 16x1 @@ -6061,7 +6317,7 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr, punpcklqdq xmm5, xmm5 pxor xmm4, xmm4 - align 16 + align 4 xloop: movdqa xmm0, [esi] // row0 movdqa xmm2, [esi + edx] // row1 @@ -6087,7 +6343,7 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr, jmp xloop99 // Blend 25 / 75. - align 16 + align 4 xloop25: movdqa xmm0, [esi] movdqa xmm1, [esi + edx] @@ -6100,7 +6356,7 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr, jmp xloop99 // Blend 50 / 50. - align 16 + align 4 xloop50: movdqa xmm0, [esi] movdqa xmm1, [esi + edx] @@ -6112,7 +6368,7 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr, jmp xloop99 // Blend 75 / 25. - align 16 + align 4 xloop75: movdqa xmm1, [esi] movdqa xmm0, [esi + edx] @@ -6125,7 +6381,7 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr, jmp xloop99 // Blend 100 / 0 - Copy row unchanged. - align 16 + align 4 xloop100: movdqa xmm0, [esi] sub ecx, 16 @@ -6174,7 +6430,7 @@ void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr, punpcklwd xmm5, xmm5 pshufd xmm5, xmm5, 0 - align 16 + align 4 xloop: movdqu xmm0, [esi] movdqu xmm2, [esi + edx] @@ -6193,7 +6449,7 @@ void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr, jmp xloop99 // Blend 25 / 75. - align 16 + align 4 xloop25: movdqu xmm0, [esi] movdqu xmm1, [esi + edx] @@ -6206,7 +6462,7 @@ void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr, jmp xloop99 // Blend 50 / 50. - align 16 + align 4 xloop50: movdqu xmm0, [esi] movdqu xmm1, [esi + edx] @@ -6218,7 +6474,7 @@ void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr, jmp xloop99 // Blend 75 / 25. - align 16 + align 4 xloop75: movdqu xmm1, [esi] movdqu xmm0, [esi + edx] @@ -6231,7 +6487,7 @@ void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr, jmp xloop99 // Blend 100 / 0 - Copy row unchanged. 
- align 16 + align 4 xloop100: movdqu xmm0, [esi] sub ecx, 16 @@ -6279,7 +6535,7 @@ void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr, punpcklqdq xmm5, xmm5 pxor xmm4, xmm4 - align 16 + align 4 xloop: movdqu xmm0, [esi] // row0 movdqu xmm2, [esi + edx] // row1 @@ -6305,7 +6561,7 @@ void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr, jmp xloop99 // Blend 25 / 75. - align 16 + align 4 xloop25: movdqu xmm0, [esi] movdqu xmm1, [esi + edx] @@ -6318,7 +6574,7 @@ void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr, jmp xloop99 // Blend 50 / 50. - align 16 + align 4 xloop50: movdqu xmm0, [esi] movdqu xmm1, [esi + edx] @@ -6330,7 +6586,7 @@ void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr, jmp xloop99 // Blend 75 / 25. - align 16 + align 4 xloop75: movdqu xmm1, [esi] movdqu xmm0, [esi + edx] @@ -6343,7 +6599,7 @@ void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr, jmp xloop99 // Blend 100 / 0 - Copy row unchanged. - align 16 + align 4 xloop100: movdqu xmm0, [esi] sub ecx, 16 @@ -6370,7 +6626,7 @@ void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride, mov ecx, [esp + 4 + 16] // pix sub edi, eax - align 16 + align 4 convertloop: movdqa xmm0, [eax] pavgb xmm0, [eax + edx] @@ -6395,7 +6651,7 @@ void HalfRow_AVX2(const uint8* src_uv, int src_uv_stride, mov ecx, [esp + 4 + 16] // pix sub edi, eax - align 16 + align 4 convertloop: vmovdqu ymm0, [eax] vpavgb ymm0, ymm0, [eax + edx] @@ -6421,7 +6677,7 @@ void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer, mov ecx, [esp + 16] // pix pshufd xmm5, xmm5, 0 - align 16 + align 4 wloop: movdqa xmm0, [eax] movdqa xmm1, [eax + 16] @@ -6437,18 +6693,49 @@ void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer, } } +// Specialized ARGB to Bayer that just isolates G channel. +__declspec(naked) __declspec(align(16)) +void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer, + uint32 selector, int pix) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_bayer + // selector + mov ecx, [esp + 16] // pix + pcmpeqb xmm5, xmm5 // generate mask 0x000000ff + psrld xmm5, 24 + + align 4 + wloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + lea eax, [eax + 32] + psrld xmm0, 8 // Move green to bottom. + psrld xmm1, 8 + pand xmm0, xmm5 + pand xmm1, xmm5 + packssdw xmm0, xmm1 + packuswb xmm0, xmm1 + sub ecx, 8 + movq qword ptr [edx], xmm0 + lea edx, [edx + 8] + jg wloop + ret + } +} + // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. 
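The comment above introduces ARGBShuffleRow, which reorders the four bytes of each pixel according to a shuffler table; the new SSE2 version added further below special-cases the common BGRA/RGBA/ABGR orders and otherwise falls back to a byte-at-a-time loop. A scalar sketch of that fallback, with an illustrative name; shuffler entries are expected to be 0..3 within a pixel.

#include <stdint.h>

void ARGBShuffleRow_Sketch(const uint8_t* src_argb, uint8_t* dst_argb,
                           const uint8_t* shuffler, int width) {
  for (int x = 0; x < width; ++x) {
    // Each destination byte comes from the source byte the shuffler selects
    // within the same 4-byte pixel.
    dst_argb[0] = src_argb[shuffler[0]];
    dst_argb[1] = src_argb[shuffler[1]];
    dst_argb[2] = src_argb[shuffler[2]];
    dst_argb[3] = src_argb[shuffler[3]];
    src_argb += 4;
    dst_argb += 4;
  }
}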
__declspec(naked) __declspec(align(16)) void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb, const uint8* shuffler, int pix) { __asm { mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_bayer + mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // shuffler movdqa xmm5, [ecx] mov ecx, [esp + 16] // pix - align 16 + align 4 wloop: movdqa xmm0, [eax] movdqa xmm1, [eax + 16] @@ -6469,12 +6756,12 @@ void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb, const uint8* shuffler, int pix) { __asm { mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_bayer + mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // shuffler movdqa xmm5, [ecx] mov ecx, [esp + 16] // pix - align 16 + align 4 wloop: movdqu xmm0, [eax] movdqu xmm1, [eax + 16] @@ -6496,13 +6783,12 @@ void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb, const uint8* shuffler, int pix) { __asm { mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_bayer + mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // shuffler - vmovdqa xmm5, [ecx] - vpermq ymm5, ymm5, 0x44 // same shuffle in high as low. + vbroadcastf128 ymm5, [ecx] // same shuffle in high as low. mov ecx, [esp + 16] // pix - align 16 + align 4 wloop: vmovdqu ymm0, [eax] vmovdqu ymm1, [eax + 32] @@ -6519,7 +6805,127 @@ void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb, ret } } -#endif +#endif // HAS_ARGBSHUFFLEROW_AVX2 + +__declspec(naked) __declspec(align(16)) +void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb, + const uint8* shuffler, int pix) { + __asm { + push ebx + push esi + mov eax, [esp + 8 + 4] // src_argb + mov edx, [esp + 8 + 8] // dst_argb + mov esi, [esp + 8 + 12] // shuffler + mov ecx, [esp + 8 + 16] // pix + pxor xmm5, xmm5 + + mov ebx, [esi] // shuffler + cmp ebx, 0x03000102 + je shuf_3012 + cmp ebx, 0x00010203 + je shuf_0123 + cmp ebx, 0x00030201 + je shuf_0321 + cmp ebx, 0x02010003 + je shuf_2103 + + // TODO(fbarchard): Use one source pointer and 3 offsets. 
+ shuf_any1: + movzx ebx, byte ptr [esi] + movzx ebx, byte ptr [eax + ebx] + mov [edx], bl + movzx ebx, byte ptr [esi + 1] + movzx ebx, byte ptr [eax + ebx] + mov [edx + 1], bl + movzx ebx, byte ptr [esi + 2] + movzx ebx, byte ptr [eax + ebx] + mov [edx + 2], bl + movzx ebx, byte ptr [esi + 3] + movzx ebx, byte ptr [eax + ebx] + mov [edx + 3], bl + lea eax, [eax + 4] + lea edx, [edx + 4] + sub ecx, 1 + jg shuf_any1 + jmp shuf99 + + align 4 + shuf_0123: + movdqu xmm0, [eax] + lea eax, [eax + 16] + movdqa xmm1, xmm0 + punpcklbw xmm0, xmm5 + punpckhbw xmm1, xmm5 + pshufhw xmm0, xmm0, 01Bh // 1B = 00011011 = 0x0123 = BGRAToARGB + pshuflw xmm0, xmm0, 01Bh + pshufhw xmm1, xmm1, 01Bh + pshuflw xmm1, xmm1, 01Bh + packuswb xmm0, xmm1 + sub ecx, 4 + movdqu [edx], xmm0 + lea edx, [edx + 16] + jg shuf_0123 + jmp shuf99 + + align 4 + shuf_0321: + movdqu xmm0, [eax] + lea eax, [eax + 16] + movdqa xmm1, xmm0 + punpcklbw xmm0, xmm5 + punpckhbw xmm1, xmm5 + pshufhw xmm0, xmm0, 039h // 39 = 00111001 = 0x0321 = RGBAToARGB + pshuflw xmm0, xmm0, 039h + pshufhw xmm1, xmm1, 039h + pshuflw xmm1, xmm1, 039h + packuswb xmm0, xmm1 + sub ecx, 4 + movdqu [edx], xmm0 + lea edx, [edx + 16] + jg shuf_0321 + jmp shuf99 + + align 4 + shuf_2103: + movdqu xmm0, [eax] + lea eax, [eax + 16] + movdqa xmm1, xmm0 + punpcklbw xmm0, xmm5 + punpckhbw xmm1, xmm5 + pshufhw xmm0, xmm0, 093h // 93 = 10010011 = 0x2103 = ARGBToRGBA + pshuflw xmm0, xmm0, 093h + pshufhw xmm1, xmm1, 093h + pshuflw xmm1, xmm1, 093h + packuswb xmm0, xmm1 + sub ecx, 4 + movdqu [edx], xmm0 + lea edx, [edx + 16] + jg shuf_2103 + jmp shuf99 + + align 4 + shuf_3012: + movdqu xmm0, [eax] + lea eax, [eax + 16] + movdqa xmm1, xmm0 + punpcklbw xmm0, xmm5 + punpckhbw xmm1, xmm5 + pshufhw xmm0, xmm0, 0C6h // C6 = 11000110 = 0x3012 = ABGRToARGB + pshuflw xmm0, xmm0, 0C6h + pshufhw xmm1, xmm1, 0C6h + pshuflw xmm1, xmm1, 0C6h + packuswb xmm0, xmm1 + sub ecx, 4 + movdqu [edx], xmm0 + lea edx, [edx + 16] + jg shuf_3012 + + shuf99: + pop esi + pop ebx + ret + } +} // YUY2 - Macro-pixel = 2 image pixels // Y0U0Y1V0....Y2U2Y3V2...Y4U4Y5V4.... @@ -6542,7 +6948,7 @@ void I422ToYUY2Row_SSE2(const uint8* src_y, mov ecx, [esp + 8 + 20] // width sub edx, esi - align 16 + align 4 convertloop: movq xmm2, qword ptr [esi] // U movq xmm3, qword ptr [esi + edx] // V @@ -6580,7 +6986,7 @@ void I422ToUYVYRow_SSE2(const uint8* src_y, mov ecx, [esp + 8 + 20] // width sub edx, esi - align 16 + align 4 convertloop: movq xmm2, qword ptr [esi] // U movq xmm3, qword ptr [esi + edx] // V @@ -6602,6 +7008,289 @@ void I422ToUYVYRow_SSE2(const uint8* src_y, ret } } + +#ifdef HAS_FIXEDDIV_X86 +// Divide num by div and return as 16.16 fixed point result. +__declspec(naked) __declspec(align(16)) +int FixedDiv_X86(int num, int div) { + __asm { + mov eax, [esp + 4] // num + cdq // extend num to 64 bits + shld edx, eax, 16 // 32.16 + shl eax, 16 + idiv dword ptr [esp + 8] + ret + } +} +#endif // HAS_FIXEDDIV_X86 + +#ifdef HAS_ARGBPOLYNOMIALROW_SSE2 +__declspec(naked) __declspec(align(16)) +void ARGBPolynomialRow_SSE2(const uint8* src_argb, + uint8* dst_argb, const float* poly, + int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] /* src_argb */ + mov edx, [esp + 4 + 8] /* dst_argb */ + mov esi, [esp + 4 + 12] /* poly */ + mov ecx, [esp + 4 + 16] /* width */ + pxor xmm3, xmm3 // 0 constant for zero extending bytes to ints. + + // 2 pixel loop. 
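FixedDiv_X86 above returns num / div as a 16.16 fixed-point value, and the ARGBPolynomialRow routines being added here evaluate a per-channel cubic, result = C0 + C1*X + C2*X^2 + C3*X^3, with the coefficients laid out as four groups of four floats (one value per channel), matching the [esi], [esi + 16], ... loads. Minimal scalar sketches follow; the names are illustrative and a plain clamp stands in for the packing instructions.

#include <stdint.h>

static inline int FixedDiv_Sketch(int num, int div) {
  return (int)(((int64_t)num << 16) / div);   // 16.16 fixed-point quotient
}

void ARGBPolynomialRow_Sketch(const uint8_t* src_argb, uint8_t* dst_argb,
                              const float* poly, int width) {
  for (int x = 0; x < width; ++x) {
    for (int c = 0; c < 4; ++c) {
      float X = (float)src_argb[c];
      float v = poly[c] + poly[c + 4] * X + poly[c + 8] * X * X +
                poly[c + 12] * X * X * X;
      int i = (int)v;                          // cvttps2dq truncates
      dst_argb[c] = (uint8_t)(i < 0 ? 0 : (i > 255 ? 255 : i));
    }
    src_argb += 4;
    dst_argb += 4;
  }
}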
+ align 4 + convertloop: +// pmovzxbd xmm0, dword ptr [eax] // BGRA pixel +// pmovzxbd xmm4, dword ptr [eax + 4] // BGRA pixel + movq xmm0, qword ptr [eax] // BGRABGRA + lea eax, [eax + 8] + punpcklbw xmm0, xmm3 + movdqa xmm4, xmm0 + punpcklwd xmm0, xmm3 // pixel 0 + punpckhwd xmm4, xmm3 // pixel 1 + cvtdq2ps xmm0, xmm0 // 4 floats + cvtdq2ps xmm4, xmm4 + movdqa xmm1, xmm0 // X + movdqa xmm5, xmm4 + mulps xmm0, [esi + 16] // C1 * X + mulps xmm4, [esi + 16] + addps xmm0, [esi] // result = C0 + C1 * X + addps xmm4, [esi] + movdqa xmm2, xmm1 + movdqa xmm6, xmm5 + mulps xmm2, xmm1 // X * X + mulps xmm6, xmm5 + mulps xmm1, xmm2 // X * X * X + mulps xmm5, xmm6 + mulps xmm2, [esi + 32] // C2 * X * X + mulps xmm6, [esi + 32] + mulps xmm1, [esi + 48] // C3 * X * X * X + mulps xmm5, [esi + 48] + addps xmm0, xmm2 // result += C2 * X * X + addps xmm4, xmm6 + addps xmm0, xmm1 // result += C3 * X * X * X + addps xmm4, xmm5 + cvttps2dq xmm0, xmm0 + cvttps2dq xmm4, xmm4 + packuswb xmm0, xmm4 + packuswb xmm0, xmm0 + sub ecx, 2 + movq qword ptr [edx], xmm0 + lea edx, [edx + 8] + jg convertloop + pop esi + ret + } +} +#endif // HAS_ARGBPOLYNOMIALROW_SSE2 + +#ifdef HAS_ARGBPOLYNOMIALROW_AVX2 +__declspec(naked) __declspec(align(16)) +void ARGBPolynomialRow_AVX2(const uint8* src_argb, + uint8* dst_argb, const float* poly, + int width) { + __asm { + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_argb */ + mov ecx, [esp + 12] /* poly */ + vbroadcastf128 ymm4, [ecx] // C0 + vbroadcastf128 ymm5, [ecx + 16] // C1 + vbroadcastf128 ymm6, [ecx + 32] // C2 + vbroadcastf128 ymm7, [ecx + 48] // C3 + mov ecx, [esp + 16] /* width */ + + // 2 pixel loop. + align 4 + convertloop: + vpmovzxbd ymm0, qword ptr [eax] // 2 BGRA pixels + lea eax, [eax + 8] + vcvtdq2ps ymm0, ymm0 // X 8 floats + vmulps ymm2, ymm0, ymm0 // X * X + vmulps ymm3, ymm0, ymm7 // C3 * X + vfmadd132ps ymm0, ymm4, ymm5 // result = C0 + C1 * X + vfmadd231ps ymm0, ymm2, ymm6 // result += C2 * X * X + vfmadd231ps ymm0, ymm2, ymm3 // result += C3 * X * X * X + vcvttps2dq ymm0, ymm0 + vpackusdw ymm0, ymm0, ymm0 // b0g0r0a0_00000000_b0g0r0a0_00000000 + vpermq ymm0, ymm0, 0xd8 // b0g0r0a0_b0g0r0a0_00000000_00000000 + vpackuswb xmm0, xmm0, xmm0 // bgrabgra_00000000_00000000_00000000 + sub ecx, 2 + vmovq qword ptr [edx], xmm0 + lea edx, [edx + 8] + jg convertloop + vzeroupper + ret + } +} +#endif // HAS_ARGBPOLYNOMIALROW_AVX2 + +#ifdef HAS_ARGBCOLORTABLEROW_X86 +// Tranform ARGB pixels with color table. +__declspec(naked) __declspec(align(16)) +void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, + int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] /* dst_argb */ + mov esi, [esp + 4 + 8] /* table_argb */ + mov ecx, [esp + 4 + 12] /* width */ + + // 1 pixel loop. + align 4 + convertloop: + movzx edx, byte ptr [eax] + lea eax, [eax + 4] + movzx edx, byte ptr [esi + edx * 4] + mov byte ptr [eax - 4], dl + movzx edx, byte ptr [eax - 4 + 1] + movzx edx, byte ptr [esi + edx * 4 + 1] + mov byte ptr [eax - 4 + 1], dl + movzx edx, byte ptr [eax - 4 + 2] + movzx edx, byte ptr [esi + edx * 4 + 2] + mov byte ptr [eax - 4 + 2], dl + movzx edx, byte ptr [eax - 4 + 3] + movzx edx, byte ptr [esi + edx * 4 + 3] + mov byte ptr [eax - 4 + 3], dl + dec ecx + jg convertloop + pop esi + ret + } +} +#endif // HAS_ARGBCOLORTABLEROW_X86 + +#ifdef HAS_RGBCOLORTABLEROW_X86 +// Tranform RGB pixels with color table. 
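The comment above introduces RGBColorTableRow_X86, which, like the rewritten ARGBColorTableRow_X86 just before it, remaps each channel in place through a 256-entry-per-channel table (value * 4 + channel indexing); the RGB variant leaves alpha untouched. A scalar sketch with an illustrative name.

#include <stdint.h>

void ColorTableRow_Sketch(uint8_t* dst_argb, const uint8_t* table_argb,
                          int width, bool include_alpha) {
  const int channels = include_alpha ? 4 : 3;   // RGB variant skips alpha
  for (int x = 0; x < width; ++x) {
    for (int c = 0; c < channels; ++c)
      dst_argb[c] = table_argb[dst_argb[c] * 4 + c];
    dst_argb += 4;
  }
}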
+__declspec(naked) __declspec(align(16)) +void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] /* dst_argb */ + mov esi, [esp + 4 + 8] /* table_argb */ + mov ecx, [esp + 4 + 12] /* width */ + + // 1 pixel loop. + align 4 + convertloop: + movzx edx, byte ptr [eax] + lea eax, [eax + 4] + movzx edx, byte ptr [esi + edx * 4] + mov byte ptr [eax - 4], dl + movzx edx, byte ptr [eax - 4 + 1] + movzx edx, byte ptr [esi + edx * 4 + 1] + mov byte ptr [eax - 4 + 1], dl + movzx edx, byte ptr [eax - 4 + 2] + movzx edx, byte ptr [esi + edx * 4 + 2] + mov byte ptr [eax - 4 + 2], dl + dec ecx + jg convertloop + + pop esi + ret + } +} +#endif // HAS_RGBCOLORTABLEROW_X86 + +#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3 +// Tranform RGB pixels with luma table. +__declspec(naked) __declspec(align(16)) +void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb, + int width, + const uint8* luma, uint32 lumacoeff) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] /* src_argb */ + mov edi, [esp + 8 + 8] /* dst_argb */ + mov ecx, [esp + 8 + 12] /* width */ + movd xmm2, dword ptr [esp + 8 + 16] // luma table + movd xmm3, dword ptr [esp + 8 + 20] // lumacoeff + pshufd xmm2, xmm2, 0 + pshufd xmm3, xmm3, 0 + pcmpeqb xmm4, xmm4 // generate mask 0xff00ff00 + psllw xmm4, 8 + pxor xmm5, xmm5 + + // 4 pixel loop. + align 4 + convertloop: + movdqu xmm0, qword ptr [eax] // generate luma ptr + pmaddubsw xmm0, xmm3 + phaddw xmm0, xmm0 + pand xmm0, xmm4 // mask out low bits + punpcklwd xmm0, xmm5 + paddd xmm0, xmm2 // add table base + movd esi, xmm0 + pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32 + + movzx edx, byte ptr [eax] + movzx edx, byte ptr [esi + edx] + mov byte ptr [edi], dl + movzx edx, byte ptr [eax + 1] + movzx edx, byte ptr [esi + edx] + mov byte ptr [edi + 1], dl + movzx edx, byte ptr [eax + 2] + movzx edx, byte ptr [esi + edx] + mov byte ptr [edi + 2], dl + movzx edx, byte ptr [eax + 3] // copy alpha. + mov byte ptr [edi + 3], dl + + movd esi, xmm0 + pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32 + + movzx edx, byte ptr [eax + 4] + movzx edx, byte ptr [esi + edx] + mov byte ptr [edi + 4], dl + movzx edx, byte ptr [eax + 5] + movzx edx, byte ptr [esi + edx] + mov byte ptr [edi + 5], dl + movzx edx, byte ptr [eax + 6] + movzx edx, byte ptr [esi + edx] + mov byte ptr [edi + 6], dl + movzx edx, byte ptr [eax + 7] // copy alpha. + mov byte ptr [edi + 7], dl + + movd esi, xmm0 + pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32 + + movzx edx, byte ptr [eax + 8] + movzx edx, byte ptr [esi + edx] + mov byte ptr [edi + 8], dl + movzx edx, byte ptr [eax + 9] + movzx edx, byte ptr [esi + edx] + mov byte ptr [edi + 9], dl + movzx edx, byte ptr [eax + 10] + movzx edx, byte ptr [esi + edx] + mov byte ptr [edi + 10], dl + movzx edx, byte ptr [eax + 11] // copy alpha. + mov byte ptr [edi + 11], dl + + movd esi, xmm0 + + movzx edx, byte ptr [eax + 12] + movzx edx, byte ptr [esi + edx] + mov byte ptr [edi + 12], dl + movzx edx, byte ptr [eax + 13] + movzx edx, byte ptr [esi + edx] + mov byte ptr [edi + 13], dl + movzx edx, byte ptr [eax + 14] + movzx edx, byte ptr [esi + edx] + mov byte ptr [edi + 14], dl + movzx edx, byte ptr [eax + 15] // copy alpha. 
+ mov byte ptr [edi + 15], dl + + sub ecx, 4 + lea eax, [eax + 16] + lea edi, [edi + 16] + jg convertloop + + pop edi + pop esi + ret + } +} +#endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 + #endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) #ifdef __cplusplus diff --git a/chromium/third_party/libyuv/source/row_x86.asm b/chromium/third_party/libyuv/source/row_x86.asm index 80a9716bae2..0cb326f8e58 100644 --- a/chromium/third_party/libyuv/source/row_x86.asm +++ b/chromium/third_party/libyuv/source/row_x86.asm @@ -28,7 +28,7 @@ cglobal %1ToYRow%3, 3, 3, 3, src_yuy2, dst_y, pix psrlw m2, m2, 8 %endif - ALIGN 16 + ALIGN 4 .convertloop: mov%2 m0, [src_yuy2q] mov%2 m1, [src_yuy2q + mmsize] @@ -74,7 +74,7 @@ cglobal SplitUVRow%2, 4, 4, 5, src_uv, dst_u, dst_v, pix psrlw m4, m4, 8 sub dst_vq, dst_uq - ALIGN 16 + ALIGN 4 .convertloop: mov%1 m0, [src_uvq] mov%1 m1, [src_uvq + mmsize] @@ -113,7 +113,7 @@ SplitUVRow a, cglobal MergeUVRow_%2, 4, 4, 3, src_u, src_v, dst_uv, pix sub src_vq, src_uq - ALIGN 16 + ALIGN 4 .convertloop: mov%1 m0, [src_uq] mov%1 m1, [src_vq] diff --git a/chromium/third_party/libyuv/source/scale.cc b/chromium/third_party/libyuv/source/scale.cc index 77af420b3f3..4f19742a205 100644 --- a/chromium/third_party/libyuv/source/scale.cc +++ b/chromium/third_party/libyuv/source/scale.cc @@ -16,1657 +16,21 @@ #include "libyuv/cpu_id.h" #include "libyuv/planar_functions.h" // For CopyPlane #include "libyuv/row.h" +#include "libyuv/scale_row.h" #ifdef __cplusplus namespace libyuv { extern "C" { #endif +// Remove this macro if OVERREAD is safe. +#define AVOID_OVERREAD 1 + static __inline int Abs(int v) { return v >= 0 ? v : -v; } -static __inline int Half(int v) { - return v >= 0 ? ((v + 1) >> 1) : -((-v + 1) >> 1); -} - -// Note: Some SSE2 reference manuals -// cpuvol1.pdf agner_instruction_tables.pdf 253666.pdf 253667.pdf - -// Set the following flag to true to revert to only -// using the reference implementation ScalePlaneBox(), and -// NOT the optimized versions. Useful for debugging and -// when comparing the quality of the resulting YUV planes -// as produced by the optimized and non-optimized versions. -static bool use_reference_impl_ = false; - -LIBYUV_API -void SetUseReferenceImpl(bool use) { - use_reference_impl_ = use; -} - -// ScaleRowDown2Int also used by planar functions -// NEON downscalers with interpolation. - -#if !defined(LIBYUV_DISABLE_NEON) && \ - (defined(__ARM_NEON__) || defined(LIBYUV_NEON)) -#define HAS_SCALEROWDOWN2_NEON -// Note - not static due to reuse in convert for 444 to 420. -void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */, - uint8* dst, int dst_width); - -void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); - -#define HAS_SCALEROWDOWN4_NEON -void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */, - uint8* dst_ptr, int dst_width); -void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); - -#define HAS_SCALEROWDOWN34_NEON -// Down scale from 4 to 3 pixels. Use the neon multilane read/write -// to load up the every 4th pixel into a 4 different registers. -// Point samples 32 pixels to 24 pixels. 
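The block of declarations being removed here (relocated out of scale.cc by this change) ends with the 4-to-3 point sampler; judging by the kShuf0/kShuf1/kShuf2 constants that follow, it keeps source pixels 0, 1 and 3 of every group of 4. A scalar sketch with an illustrative name, assuming dst_width is a multiple of 3.

#include <stdint.h>

void ScaleRowDown34_Sketch(const uint8_t* src_ptr, uint8_t* dst_ptr,
                           int dst_width) {
  for (int x = 0; x < dst_width; x += 3) {
    dst_ptr[0] = src_ptr[0];
    dst_ptr[1] = src_ptr[1];
    dst_ptr[2] = src_ptr[3];   // pixel 2 of each group of 4 is dropped
    dst_ptr += 3;
    src_ptr += 4;
  }
}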
-void ScaleRowDown34_NEON(const uint8* src_ptr, - ptrdiff_t /* src_stride */, - uint8* dst_ptr, int dst_width); -void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr, - ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr, - ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); - -#define HAS_SCALEROWDOWN38_NEON -// 32 -> 12 -void ScaleRowDown38_NEON(const uint8* src_ptr, - ptrdiff_t /* src_stride */, - uint8* dst_ptr, int dst_width); -// 32x3 -> 12x1 -void ScaleRowDown38_3_Box_NEON(const uint8* src_ptr, - ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -// 32x2 -> 12x1 -void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr, - ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); - -// SSE2 downscalers with interpolation. -// Constants for SSSE3 code -#elif !defined(LIBYUV_DISABLE_X86) && \ - (defined(_M_IX86) || defined(__i386__) || defined(__x86_64__)) -// GCC 4.2 on OSX has link error when passing static or const to inline. -// TODO(fbarchard): Use static const when gcc 4.2 support is dropped. -#ifdef __APPLE__ -#define CONST -#else -#define CONST static const -#endif - -// Offsets for source bytes 0 to 9 -CONST uvec8 kShuf0 = - { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 }; - -// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12. -CONST uvec8 kShuf1 = - { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 }; - -// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. -CONST uvec8 kShuf2 = - { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 }; - -// Offsets for source bytes 0 to 10 -CONST uvec8 kShuf01 = - { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 }; - -// Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13. -CONST uvec8 kShuf11 = - { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 }; - -// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. 
-CONST uvec8 kShuf21 = - { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 }; - -// Coefficients for source bytes 0 to 10 -CONST uvec8 kMadd01 = - { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 }; - -// Coefficients for source bytes 10 to 21 -CONST uvec8 kMadd11 = - { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 }; - -// Coefficients for source bytes 21 to 31 -CONST uvec8 kMadd21 = - { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 }; - -// Coefficients for source bytes 21 to 31 -CONST vec16 kRound34 = - { 2, 2, 2, 2, 2, 2, 2, 2 }; - -CONST uvec8 kShuf38a = - { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }; - -CONST uvec8 kShuf38b = - { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 }; - -// Arrange words 0,3,6 into 0,1,2 -CONST uvec8 kShufAc = - { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }; - -// Arrange words 0,3,6 into 3,4,5 -CONST uvec8 kShufAc3 = - { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 }; - -// Scaling values for boxes of 3x3 and 2x3 -CONST uvec16 kScaleAc33 = - { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 }; - -// Arrange first value for pixels 0,1,2,3,4,5 -CONST uvec8 kShufAb0 = - { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 }; - -// Arrange second value for pixels 0,1,2,3,4,5 -CONST uvec8 kShufAb1 = - { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 }; - -// Arrange third value for pixels 0,1,2,3,4,5 -CONST uvec8 kShufAb2 = - { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 }; - -// Scaling values for boxes of 3x2 and 2x2 -CONST uvec16 kScaleAb2 = - { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 }; -#endif - -#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) -#define HAS_SCALEROWDOWN2_SSE2 -// Reads 32 pixels, throws half away and writes 16 pixels. -// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. -__declspec(naked) __declspec(align(16)) -static void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - __asm { - mov eax, [esp + 4] // src_ptr - // src_stride ignored - mov edx, [esp + 12] // dst_ptr - mov ecx, [esp + 16] // dst_width - - align 16 - wloop: - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - lea eax, [eax + 32] - psrlw xmm0, 8 // isolate odd pixels. - psrlw xmm1, 8 - packuswb xmm0, xmm1 - sub ecx, 16 - movdqa [edx], xmm0 - lea edx, [edx + 16] - jg wloop - - ret - } -} - -// Blends 32x2 rectangle to 16x1. -// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. 
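The comment above describes the 2x2 box downscale ("blends 32x2 rectangle to 16x1"): each output pixel averages a 2x2 block, which the SSE2 code approximates with cascaded pavgb/pavgw rounding averages (the 4x1 variant further down cascades the same idea over four rows). A scalar sketch of the exact rounded average, which may differ from the SIMD result by one LSB; the name is illustrative.

#include <stdint.h>
#include <stddef.h>

void ScaleRowDown2Box_Sketch(const uint8_t* src_ptr, ptrdiff_t src_stride,
                             uint8_t* dst_ptr, int dst_width) {
  const uint8_t* s = src_ptr;               // top row
  const uint8_t* t = src_ptr + src_stride;  // bottom row
  for (int x = 0; x < dst_width; ++x) {
    dst_ptr[x] = (uint8_t)((s[0] + s[1] + t[0] + t[1] + 2) >> 2);
    s += 2;
    t += 2;
  }
}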
-__declspec(naked) __declspec(align(16)) -void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // src_ptr - mov esi, [esp + 4 + 8] // src_stride - mov edx, [esp + 4 + 12] // dst_ptr - mov ecx, [esp + 4 + 16] // dst_width - pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff - psrlw xmm5, 8 - - align 16 - wloop: - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - movdqa xmm2, [eax + esi] - movdqa xmm3, [eax + esi + 16] - lea eax, [eax + 32] - pavgb xmm0, xmm2 // average rows - pavgb xmm1, xmm3 - - movdqa xmm2, xmm0 // average columns (32 to 16 pixels) - psrlw xmm0, 8 - movdqa xmm3, xmm1 - psrlw xmm1, 8 - pand xmm2, xmm5 - pand xmm3, xmm5 - pavgw xmm0, xmm2 - pavgw xmm1, xmm3 - packuswb xmm0, xmm1 - - sub ecx, 16 - movdqa [edx], xmm0 - lea edx, [edx + 16] - jg wloop - - pop esi - ret - } -} - -// Reads 32 pixels, throws half away and writes 16 pixels. -// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. -__declspec(naked) __declspec(align(16)) -static void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr, - ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - __asm { - mov eax, [esp + 4] // src_ptr - // src_stride ignored - mov edx, [esp + 12] // dst_ptr - mov ecx, [esp + 16] // dst_width - - align 16 - wloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - lea eax, [eax + 32] - psrlw xmm0, 8 // isolate odd pixels. - psrlw xmm1, 8 - packuswb xmm0, xmm1 - sub ecx, 16 - movdqu [edx], xmm0 - lea edx, [edx + 16] - jg wloop - - ret - } -} - -// Blends 32x2 rectangle to 16x1. -// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. -__declspec(naked) __declspec(align(16)) -static void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr, - ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // src_ptr - mov esi, [esp + 4 + 8] // src_stride - mov edx, [esp + 4 + 12] // dst_ptr - mov ecx, [esp + 4 + 16] // dst_width - pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff - psrlw xmm5, 8 - - align 16 - wloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - movdqu xmm2, [eax + esi] - movdqu xmm3, [eax + esi + 16] - lea eax, [eax + 32] - pavgb xmm0, xmm2 // average rows - pavgb xmm1, xmm3 - - movdqa xmm2, xmm0 // average columns (32 to 16 pixels) - psrlw xmm0, 8 - movdqa xmm3, xmm1 - psrlw xmm1, 8 - pand xmm2, xmm5 - pand xmm3, xmm5 - pavgw xmm0, xmm2 - pavgw xmm1, xmm3 - packuswb xmm0, xmm1 - - sub ecx, 16 - movdqu [edx], xmm0 - lea edx, [edx + 16] - jg wloop - - pop esi - ret - } -} - -#define HAS_SCALEROWDOWN4_SSE2 -// Point samples 32 pixels to 8 pixels. -// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. -__declspec(naked) __declspec(align(16)) -static void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - __asm { - mov eax, [esp + 4] // src_ptr - // src_stride ignored - mov edx, [esp + 12] // dst_ptr - mov ecx, [esp + 16] // dst_width - pcmpeqb xmm5, xmm5 // generate mask 0x00ff0000 - psrld xmm5, 24 - pslld xmm5, 16 - - align 16 - wloop: - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - lea eax, [eax + 32] - pand xmm0, xmm5 - pand xmm1, xmm5 - packuswb xmm0, xmm1 - psrlw xmm0, 8 - packuswb xmm0, xmm0 - sub ecx, 8 - movq qword ptr [edx], xmm0 - lea edx, [edx + 8] - jg wloop - - ret - } -} - -// Blends 32x4 rectangle to 8x1. -// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. 
-__declspec(naked) __declspec(align(16)) -static void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - __asm { - push esi - push edi - mov eax, [esp + 8 + 4] // src_ptr - mov esi, [esp + 8 + 8] // src_stride - mov edx, [esp + 8 + 12] // dst_ptr - mov ecx, [esp + 8 + 16] // dst_width - lea edi, [esi + esi * 2] // src_stride * 3 - pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff - psrlw xmm7, 8 - - align 16 - wloop: - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - movdqa xmm2, [eax + esi] - movdqa xmm3, [eax + esi + 16] - pavgb xmm0, xmm2 // average rows - pavgb xmm1, xmm3 - movdqa xmm2, [eax + esi * 2] - movdqa xmm3, [eax + esi * 2 + 16] - movdqa xmm4, [eax + edi] - movdqa xmm5, [eax + edi + 16] - lea eax, [eax + 32] - pavgb xmm2, xmm4 - pavgb xmm3, xmm5 - pavgb xmm0, xmm2 - pavgb xmm1, xmm3 - - movdqa xmm2, xmm0 // average columns (32 to 16 pixels) - psrlw xmm0, 8 - movdqa xmm3, xmm1 - psrlw xmm1, 8 - pand xmm2, xmm7 - pand xmm3, xmm7 - pavgw xmm0, xmm2 - pavgw xmm1, xmm3 - packuswb xmm0, xmm1 - - movdqa xmm2, xmm0 // average columns (16 to 8 pixels) - psrlw xmm0, 8 - pand xmm2, xmm7 - pavgw xmm0, xmm2 - packuswb xmm0, xmm0 - - sub ecx, 8 - movq qword ptr [edx], xmm0 - lea edx, [edx + 8] - jg wloop - - pop edi - pop esi - ret - } -} - -#define HAS_SCALEROWDOWN34_SSSE3 -// Point samples 32 pixels to 24 pixels. -// Produces three 8 byte values. For each 8 bytes, 16 bytes are read. -// Then shuffled to do the scaling. - -// Note that movdqa+palign may be better than movdqu. -// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. -__declspec(naked) __declspec(align(16)) -static void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - __asm { - mov eax, [esp + 4] // src_ptr - // src_stride ignored - mov edx, [esp + 12] // dst_ptr - mov ecx, [esp + 16] // dst_width - movdqa xmm3, kShuf0 - movdqa xmm4, kShuf1 - movdqa xmm5, kShuf2 - - align 16 - wloop: - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - lea eax, [eax + 32] - movdqa xmm2, xmm1 - palignr xmm1, xmm0, 8 - pshufb xmm0, xmm3 - pshufb xmm1, xmm4 - pshufb xmm2, xmm5 - movq qword ptr [edx], xmm0 - movq qword ptr [edx + 8], xmm1 - movq qword ptr [edx + 16], xmm2 - lea edx, [edx + 24] - sub ecx, 24 - jg wloop - - ret - } -} - -// Blends 32x2 rectangle to 24x1 -// Produces three 8 byte values. For each 8 bytes, 16 bytes are read. -// Then shuffled to do the scaling. - -// Register usage: -// xmm0 src_row 0 -// xmm1 src_row 1 -// xmm2 shuf 0 -// xmm3 shuf 1 -// xmm4 shuf 2 -// xmm5 madd 0 -// xmm6 madd 1 -// xmm7 kRound34 - -// Note that movdqa+palign may be better than movdqu. -// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. 
-__declspec(naked) __declspec(align(16)) -static void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr, - ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // src_ptr - mov esi, [esp + 4 + 8] // src_stride - mov edx, [esp + 4 + 12] // dst_ptr - mov ecx, [esp + 4 + 16] // dst_width - movdqa xmm2, kShuf01 - movdqa xmm3, kShuf11 - movdqa xmm4, kShuf21 - movdqa xmm5, kMadd01 - movdqa xmm6, kMadd11 - movdqa xmm7, kRound34 - - align 16 - wloop: - movdqa xmm0, [eax] // pixels 0..7 - movdqa xmm1, [eax + esi] - pavgb xmm0, xmm1 - pshufb xmm0, xmm2 - pmaddubsw xmm0, xmm5 - paddsw xmm0, xmm7 - psrlw xmm0, 2 - packuswb xmm0, xmm0 - movq qword ptr [edx], xmm0 - movdqu xmm0, [eax + 8] // pixels 8..15 - movdqu xmm1, [eax + esi + 8] - pavgb xmm0, xmm1 - pshufb xmm0, xmm3 - pmaddubsw xmm0, xmm6 - paddsw xmm0, xmm7 - psrlw xmm0, 2 - packuswb xmm0, xmm0 - movq qword ptr [edx + 8], xmm0 - movdqa xmm0, [eax + 16] // pixels 16..23 - movdqa xmm1, [eax + esi + 16] - lea eax, [eax + 32] - pavgb xmm0, xmm1 - pshufb xmm0, xmm4 - movdqa xmm1, kMadd21 - pmaddubsw xmm0, xmm1 - paddsw xmm0, xmm7 - psrlw xmm0, 2 - packuswb xmm0, xmm0 - sub ecx, 24 - movq qword ptr [edx + 16], xmm0 - lea edx, [edx + 24] - jg wloop - - pop esi - ret - } -} - -// Note that movdqa+palign may be better than movdqu. -// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. -__declspec(naked) __declspec(align(16)) -static void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr, - ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // src_ptr - mov esi, [esp + 4 + 8] // src_stride - mov edx, [esp + 4 + 12] // dst_ptr - mov ecx, [esp + 4 + 16] // dst_width - movdqa xmm2, kShuf01 - movdqa xmm3, kShuf11 - movdqa xmm4, kShuf21 - movdqa xmm5, kMadd01 - movdqa xmm6, kMadd11 - movdqa xmm7, kRound34 - - align 16 - wloop: - movdqa xmm0, [eax] // pixels 0..7 - movdqa xmm1, [eax + esi] - pavgb xmm1, xmm0 - pavgb xmm0, xmm1 - pshufb xmm0, xmm2 - pmaddubsw xmm0, xmm5 - paddsw xmm0, xmm7 - psrlw xmm0, 2 - packuswb xmm0, xmm0 - movq qword ptr [edx], xmm0 - movdqu xmm0, [eax + 8] // pixels 8..15 - movdqu xmm1, [eax + esi + 8] - pavgb xmm1, xmm0 - pavgb xmm0, xmm1 - pshufb xmm0, xmm3 - pmaddubsw xmm0, xmm6 - paddsw xmm0, xmm7 - psrlw xmm0, 2 - packuswb xmm0, xmm0 - movq qword ptr [edx + 8], xmm0 - movdqa xmm0, [eax + 16] // pixels 16..23 - movdqa xmm1, [eax + esi + 16] - lea eax, [eax + 32] - pavgb xmm1, xmm0 - pavgb xmm0, xmm1 - pshufb xmm0, xmm4 - movdqa xmm1, kMadd21 - pmaddubsw xmm0, xmm1 - paddsw xmm0, xmm7 - psrlw xmm0, 2 - packuswb xmm0, xmm0 - sub ecx, 24 - movq qword ptr [edx + 16], xmm0 - lea edx, [edx+24] - jg wloop - - pop esi - ret - } -} - -#define HAS_SCALEROWDOWN38_SSSE3 -// 3/8 point sampler - -// Scale 32 pixels to 12 -__declspec(naked) __declspec(align(16)) -static void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - __asm { - mov eax, [esp + 4] // src_ptr - // src_stride ignored - mov edx, [esp + 12] // dst_ptr - mov ecx, [esp + 16] // dst_width - movdqa xmm4, kShuf38a - movdqa xmm5, kShuf38b - - align 16 - xloop: - movdqa xmm0, [eax] // 16 pixels -> 0,1,2,3,4,5 - movdqa xmm1, [eax + 16] // 16 pixels -> 6,7,8,9,10,11 - lea eax, [eax + 32] - pshufb xmm0, xmm4 - pshufb xmm1, xmm5 - paddusb xmm0, xmm1 - - sub ecx, 12 - movq qword ptr [edx], xmm0 // write 12 pixels - movhlps xmm1, xmm0 - movd [edx + 8], xmm1 - lea edx, [edx + 12] - jg xloop - - ret - } -} - -// Scale 16x3 
pixels to 6x1 with interpolation -__declspec(naked) __declspec(align(16)) -static void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr, - ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // src_ptr - mov esi, [esp + 4 + 8] // src_stride - mov edx, [esp + 4 + 12] // dst_ptr - mov ecx, [esp + 4 + 16] // dst_width - movdqa xmm2, kShufAc - movdqa xmm3, kShufAc3 - movdqa xmm4, kScaleAc33 - pxor xmm5, xmm5 - - align 16 - xloop: - movdqa xmm0, [eax] // sum up 3 rows into xmm0/1 - movdqa xmm6, [eax + esi] - movhlps xmm1, xmm0 - movhlps xmm7, xmm6 - punpcklbw xmm0, xmm5 - punpcklbw xmm1, xmm5 - punpcklbw xmm6, xmm5 - punpcklbw xmm7, xmm5 - paddusw xmm0, xmm6 - paddusw xmm1, xmm7 - movdqa xmm6, [eax + esi * 2] - lea eax, [eax + 16] - movhlps xmm7, xmm6 - punpcklbw xmm6, xmm5 - punpcklbw xmm7, xmm5 - paddusw xmm0, xmm6 - paddusw xmm1, xmm7 - - movdqa xmm6, xmm0 // 8 pixels -> 0,1,2 of xmm6 - psrldq xmm0, 2 - paddusw xmm6, xmm0 - psrldq xmm0, 2 - paddusw xmm6, xmm0 - pshufb xmm6, xmm2 - - movdqa xmm7, xmm1 // 8 pixels -> 3,4,5 of xmm6 - psrldq xmm1, 2 - paddusw xmm7, xmm1 - psrldq xmm1, 2 - paddusw xmm7, xmm1 - pshufb xmm7, xmm3 - paddusw xmm6, xmm7 - - pmulhuw xmm6, xmm4 // divide by 9,9,6, 9,9,6 - packuswb xmm6, xmm6 - - sub ecx, 6 - movd [edx], xmm6 // write 6 pixels - psrlq xmm6, 16 - movd [edx + 2], xmm6 - lea edx, [edx + 6] - jg xloop - - pop esi - ret - } -} - -// Scale 16x2 pixels to 6x1 with interpolation -__declspec(naked) __declspec(align(16)) -static void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr, - ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // src_ptr - mov esi, [esp + 4 + 8] // src_stride - mov edx, [esp + 4 + 12] // dst_ptr - mov ecx, [esp + 4 + 16] // dst_width - movdqa xmm2, kShufAb0 - movdqa xmm3, kShufAb1 - movdqa xmm4, kShufAb2 - movdqa xmm5, kScaleAb2 - - align 16 - xloop: - movdqa xmm0, [eax] // average 2 rows into xmm0 - pavgb xmm0, [eax + esi] - lea eax, [eax + 16] - - movdqa xmm1, xmm0 // 16 pixels -> 0,1,2,3,4,5 of xmm1 - pshufb xmm1, xmm2 - movdqa xmm6, xmm0 - pshufb xmm6, xmm3 - paddusw xmm1, xmm6 - pshufb xmm0, xmm4 - paddusw xmm1, xmm0 - - pmulhuw xmm1, xmm5 // divide by 3,3,2, 3,3,2 - packuswb xmm1, xmm1 - - sub ecx, 6 - movd [edx], xmm1 // write 6 pixels - psrlq xmm1, 16 - movd [edx + 2], xmm1 - lea edx, [edx + 6] - jg xloop - - pop esi - ret - } -} - -#define HAS_SCALEADDROWS_SSE2 - -// Reads 16xN bytes and produces 16 shorts at a time. 
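The trailing comment describes ScaleAddRows_SSE2, which the general box filter uses to sum src_height rows column-wise into 16-bit accumulators (paddusw, so the sums saturate). A scalar sketch with an illustrative name.

#include <stdint.h>
#include <stddef.h>

void ScaleAddRows_Sketch(const uint8_t* src_ptr, ptrdiff_t src_stride,
                         uint16_t* dst_ptr, int src_width, int src_height) {
  for (int x = 0; x < src_width; ++x) {
    uint32_t sum = 0;
    const uint8_t* s = src_ptr + x;
    for (int y = 0; y < src_height; ++y) {
      sum += s[0];
      s += src_stride;
    }
    dst_ptr[x] = (uint16_t)(sum > 65535 ? 65535 : sum);  // paddusw saturates
  }
}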
-__declspec(naked) __declspec(align(16)) -static void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, - uint16* dst_ptr, int src_width, - int src_height) { - __asm { - push esi - push edi - push ebx - push ebp - mov esi, [esp + 16 + 4] // src_ptr - mov edx, [esp + 16 + 8] // src_stride - mov edi, [esp + 16 + 12] // dst_ptr - mov ecx, [esp + 16 + 16] // dst_width - mov ebx, [esp + 16 + 20] // height - pxor xmm4, xmm4 - dec ebx - - align 16 - xloop: - // first row - movdqa xmm0, [esi] - lea eax, [esi + edx] - movdqa xmm1, xmm0 - punpcklbw xmm0, xmm4 - punpckhbw xmm1, xmm4 - lea esi, [esi + 16] - mov ebp, ebx - test ebp, ebp - je ydone - - // sum remaining rows - align 16 - yloop: - movdqa xmm2, [eax] // read 16 pixels - lea eax, [eax + edx] // advance to next row - movdqa xmm3, xmm2 - punpcklbw xmm2, xmm4 - punpckhbw xmm3, xmm4 - paddusw xmm0, xmm2 // sum 16 words - paddusw xmm1, xmm3 - sub ebp, 1 - jg yloop - ydone: - movdqa [edi], xmm0 - movdqa [edi + 16], xmm1 - lea edi, [edi + 32] - - sub ecx, 16 - jg xloop - - pop ebp - pop ebx - pop edi - pop esi - ret - } -} - -#elif !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) -// GCC versions of row functions are verbatim conversions from Visual C. -// Generated using gcc disassembly on Visual C object file: -// objdump -D yuvscaler.obj >yuvscaler.txt -#define HAS_SCALEROWDOWN2_SSE2 -static void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - ".p2align 4 \n" - "1: \n" - "movdqa (%0),%%xmm0 \n" - "movdqa 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "psrlw $0x8,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1" -#endif - ); -} - -void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" - ".p2align 4 \n" - "1: \n" - "movdqa (%0),%%xmm0 \n" - "movdqa 0x10(%0),%%xmm1 \n" - "movdqa (%0,%3,1),%%xmm2 \n" - "movdqa 0x10(%0,%3,1),%%xmm3 \n" - "lea 0x20(%0),%0 \n" - "pavgb %%xmm2,%%xmm0 \n" - "pavgb %%xmm3,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "psrlw $0x8,%%xmm0 \n" - "movdqa %%xmm1,%%xmm3 \n" - "psrlw $0x8,%%xmm1 \n" - "pand %%xmm5,%%xmm2 \n" - "pand %%xmm5,%%xmm3 \n" - "pavgw %%xmm2,%%xmm0 \n" - "pavgw %%xmm3,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(static_cast<intptr_t>(src_stride)) // %3 - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" -#endif - ); -} - -static void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr, - ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - ".p2align 4 \n" - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "psrlw $0x8,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1" -#endif - ); -} - -static void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr, - ptrdiff_t src_stride, - 
uint8* dst_ptr, int dst_width) { - asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" - ".p2align 4 \n" - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu (%0,%3,1),%%xmm2 \n" - "movdqu 0x10(%0,%3,1),%%xmm3 \n" - "lea 0x20(%0),%0 \n" - "pavgb %%xmm2,%%xmm0 \n" - "pavgb %%xmm3,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "psrlw $0x8,%%xmm0 \n" - "movdqa %%xmm1,%%xmm3 \n" - "psrlw $0x8,%%xmm1 \n" - "pand %%xmm5,%%xmm2 \n" - "pand %%xmm5,%%xmm3 \n" - "pavgw %%xmm2,%%xmm0 \n" - "pavgw %%xmm3,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(static_cast<intptr_t>(src_stride)) // %3 - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" -#endif - ); -} - -#define HAS_SCALEROWDOWN4_SSE2 -static void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrld $0x18,%%xmm5 \n" - "pslld $0x10,%%xmm5 \n" - ".p2align 4 \n" - "1: \n" - "movdqa (%0),%%xmm0 \n" - "movdqa 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "pand %%xmm5,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "psrlw $0x8,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movq %%xmm0,(%1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm5" -#endif - ); -} - -static void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - intptr_t stridex3 = 0; - asm volatile ( - "pcmpeqb %%xmm7,%%xmm7 \n" - "psrlw $0x8,%%xmm7 \n" - "lea (%4,%4,2),%3 \n" - ".p2align 4 \n" - "1: \n" - "movdqa (%0),%%xmm0 \n" - "movdqa 0x10(%0),%%xmm1 \n" - "movdqa (%0,%4,1),%%xmm2 \n" - "movdqa 0x10(%0,%4,1),%%xmm3 \n" - "pavgb %%xmm2,%%xmm0 \n" - "pavgb %%xmm3,%%xmm1 \n" - "movdqa (%0,%4,2),%%xmm2 \n" - "movdqa 0x10(%0,%4,2),%%xmm3 \n" - "movdqa (%0,%3,1),%%xmm4 \n" - "movdqa 0x10(%0,%3,1),%%xmm5 \n" - "lea 0x20(%0),%0 \n" - "pavgb %%xmm4,%%xmm2 \n" - "pavgb %%xmm2,%%xmm0 \n" - "pavgb %%xmm5,%%xmm3 \n" - "pavgb %%xmm3,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "psrlw $0x8,%%xmm0 \n" - "movdqa %%xmm1,%%xmm3 \n" - "psrlw $0x8,%%xmm1 \n" - "pand %%xmm7,%%xmm2 \n" - "pand %%xmm7,%%xmm3 \n" - "pavgw %%xmm2,%%xmm0 \n" - "pavgw %%xmm3,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0,%%xmm2 \n" - "psrlw $0x8,%%xmm0 \n" - "pand %%xmm7,%%xmm2 \n" - "pavgw %%xmm2,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movq %%xmm0,(%1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width), // %2 - "+r"(stridex3) // %3 - : "r"(static_cast<intptr_t>(src_stride)) // %4 - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm7" -#endif - ); -} - -#define HAS_SCALEROWDOWN34_SSSE3 -static void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "movdqa %0,%%xmm3 \n" - "movdqa %1,%%xmm4 \n" - "movdqa %2,%%xmm5 \n" - : - : "m"(kShuf0), // %0 - "m"(kShuf1), // %1 - "m"(kShuf2) // %2 - ); - asm volatile ( - ".p2align 4 \n" - "1: \n" - "movdqa (%0),%%xmm0 \n" - "movdqa 0x10(%0),%%xmm2 \n" - "lea 0x20(%0),%0 \n" - "movdqa %%xmm2,%%xmm1 \n" - "palignr $0x8,%%xmm0,%%xmm1 \n" - "pshufb %%xmm3,%%xmm0 \n" - 
"pshufb %%xmm4,%%xmm1 \n" - "pshufb %%xmm5,%%xmm2 \n" - "movq %%xmm0,(%1) \n" - "movq %%xmm1,0x8(%1) \n" - "movq %%xmm2,0x10(%1) \n" - "lea 0x18(%1),%1 \n" - "sub $0x18,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" -#endif - ); -} - -static void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr, - ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "movdqa %0,%%xmm2 \n" // kShuf01 - "movdqa %1,%%xmm3 \n" // kShuf11 - "movdqa %2,%%xmm4 \n" // kShuf21 - : - : "m"(kShuf01), // %0 - "m"(kShuf11), // %1 - "m"(kShuf21) // %2 - ); - asm volatile ( - "movdqa %0,%%xmm5 \n" // kMadd01 - "movdqa %1,%%xmm0 \n" // kMadd11 - "movdqa %2,%%xmm1 \n" // kRound34 - : - : "m"(kMadd01), // %0 - "m"(kMadd11), // %1 - "m"(kRound34) // %2 - ); - asm volatile ( - ".p2align 4 \n" - "1: \n" - "movdqa (%0),%%xmm6 \n" - "movdqa (%0,%3),%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - "pshufb %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm5,%%xmm6 \n" - "paddsw %%xmm1,%%xmm6 \n" - "psrlw $0x2,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movq %%xmm6,(%1) \n" - "movdqu 0x8(%0),%%xmm6 \n" - "movdqu 0x8(%0,%3),%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - "pshufb %%xmm3,%%xmm6 \n" - "pmaddubsw %%xmm0,%%xmm6 \n" - "paddsw %%xmm1,%%xmm6 \n" - "psrlw $0x2,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movq %%xmm6,0x8(%1) \n" - "movdqa 0x10(%0),%%xmm6 \n" - "movdqa 0x10(%0,%3),%%xmm7 \n" - "lea 0x20(%0),%0 \n" - "pavgb %%xmm7,%%xmm6 \n" - "pshufb %%xmm4,%%xmm6 \n" - "pmaddubsw %4,%%xmm6 \n" - "paddsw %%xmm1,%%xmm6 \n" - "psrlw $0x2,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movq %%xmm6,0x10(%1) \n" - "lea 0x18(%1),%1 \n" - "sub $0x18,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(static_cast<intptr_t>(src_stride)), // %3 - "m"(kMadd21) // %4 - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" -#endif - ); -} - -static void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr, - ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "movdqa %0,%%xmm2 \n" // kShuf01 - "movdqa %1,%%xmm3 \n" // kShuf11 - "movdqa %2,%%xmm4 \n" // kShuf21 - : - : "m"(kShuf01), // %0 - "m"(kShuf11), // %1 - "m"(kShuf21) // %2 - ); - asm volatile ( - "movdqa %0,%%xmm5 \n" // kMadd01 - "movdqa %1,%%xmm0 \n" // kMadd11 - "movdqa %2,%%xmm1 \n" // kRound34 - : - : "m"(kMadd01), // %0 - "m"(kMadd11), // %1 - "m"(kRound34) // %2 - ); - - asm volatile ( - ".p2align 4 \n" - "1: \n" - "movdqa (%0),%%xmm6 \n" - "movdqa (%0,%3,1),%%xmm7 \n" - "pavgb %%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - "pshufb %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm5,%%xmm6 \n" - "paddsw %%xmm1,%%xmm6 \n" - "psrlw $0x2,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movq %%xmm6,(%1) \n" - "movdqu 0x8(%0),%%xmm6 \n" - "movdqu 0x8(%0,%3,1),%%xmm7 \n" - "pavgb %%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - "pshufb %%xmm3,%%xmm6 \n" - "pmaddubsw %%xmm0,%%xmm6 \n" - "paddsw %%xmm1,%%xmm6 \n" - "psrlw $0x2,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movq %%xmm6,0x8(%1) \n" - "movdqa 0x10(%0),%%xmm6 \n" - "movdqa 0x10(%0,%3,1),%%xmm7 \n" - "lea 0x20(%0),%0 \n" - "pavgb %%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - "pshufb %%xmm4,%%xmm6 \n" - "pmaddubsw %4,%%xmm6 \n" - "paddsw %%xmm1,%%xmm6 \n" - "psrlw $0x2,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movq %%xmm6,0x10(%1) \n" - "lea 0x18(%1),%1 \n" - "sub $0x18,%2 \n" - "jg 1b \n" - : 
"+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(static_cast<intptr_t>(src_stride)), // %3 - "m"(kMadd21) // %4 - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" -#endif - ); -} - -#define HAS_SCALEROWDOWN38_SSSE3 -static void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "movdqa %3,%%xmm4 \n" - "movdqa %4,%%xmm5 \n" - ".p2align 4 \n" - "1: \n" - "movdqa (%0),%%xmm0 \n" - "movdqa 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "pshufb %%xmm4,%%xmm0 \n" - "pshufb %%xmm5,%%xmm1 \n" - "paddusb %%xmm1,%%xmm0 \n" - "movq %%xmm0,(%1) \n" - "movhlps %%xmm0,%%xmm1 \n" - "movd %%xmm1,0x8(%1) \n" - "lea 0xc(%1),%1 \n" - "sub $0xc,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "m"(kShuf38a), // %3 - "m"(kShuf38b) // %4 - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm4", "xmm5" -#endif - ); -} - -static void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr, - ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "movdqa %0,%%xmm2 \n" - "movdqa %1,%%xmm3 \n" - "movdqa %2,%%xmm4 \n" - "movdqa %3,%%xmm5 \n" - : - : "m"(kShufAb0), // %0 - "m"(kShufAb1), // %1 - "m"(kShufAb2), // %2 - "m"(kScaleAb2) // %3 - ); - asm volatile ( - ".p2align 4 \n" - "1: \n" - "movdqa (%0),%%xmm0 \n" - "pavgb (%0,%3,1),%%xmm0 \n" - "lea 0x10(%0),%0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "pshufb %%xmm2,%%xmm1 \n" - "movdqa %%xmm0,%%xmm6 \n" - "pshufb %%xmm3,%%xmm6 \n" - "paddusw %%xmm6,%%xmm1 \n" - "pshufb %%xmm4,%%xmm0 \n" - "paddusw %%xmm0,%%xmm1 \n" - "pmulhuw %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm1 \n" - "sub $0x6,%2 \n" - "movd %%xmm1,(%1) \n" - "psrlq $0x10,%%xmm1 \n" - "movd %%xmm1,0x2(%1) \n" - "lea 0x6(%1),%1 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(static_cast<intptr_t>(src_stride)) // %3 - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" -#endif - ); -} - -static void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr, - ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "movdqa %0,%%xmm2 \n" - "movdqa %1,%%xmm3 \n" - "movdqa %2,%%xmm4 \n" - "pxor %%xmm5,%%xmm5 \n" - : - : "m"(kShufAc), // %0 - "m"(kShufAc3), // %1 - "m"(kScaleAc33) // %2 - ); - asm volatile ( - ".p2align 4 \n" - "1: \n" - "movdqa (%0),%%xmm0 \n" - "movdqa (%0,%3,1),%%xmm6 \n" - "movhlps %%xmm0,%%xmm1 \n" - "movhlps %%xmm6,%%xmm7 \n" - "punpcklbw %%xmm5,%%xmm0 \n" - "punpcklbw %%xmm5,%%xmm1 \n" - "punpcklbw %%xmm5,%%xmm6 \n" - "punpcklbw %%xmm5,%%xmm7 \n" - "paddusw %%xmm6,%%xmm0 \n" - "paddusw %%xmm7,%%xmm1 \n" - "movdqa (%0,%3,2),%%xmm6 \n" - "lea 0x10(%0),%0 \n" - "movhlps %%xmm6,%%xmm7 \n" - "punpcklbw %%xmm5,%%xmm6 \n" - "punpcklbw %%xmm5,%%xmm7 \n" - "paddusw %%xmm6,%%xmm0 \n" - "paddusw %%xmm7,%%xmm1 \n" - "movdqa %%xmm0,%%xmm6 \n" - "psrldq $0x2,%%xmm0 \n" - "paddusw %%xmm0,%%xmm6 \n" - "psrldq $0x2,%%xmm0 \n" - "paddusw %%xmm0,%%xmm6 \n" - "pshufb %%xmm2,%%xmm6 \n" - "movdqa %%xmm1,%%xmm7 \n" - "psrldq $0x2,%%xmm1 \n" - "paddusw %%xmm1,%%xmm7 \n" - "psrldq $0x2,%%xmm1 \n" - "paddusw %%xmm1,%%xmm7 \n" - "pshufb %%xmm3,%%xmm7 \n" - "paddusw %%xmm7,%%xmm6 \n" - "pmulhuw %%xmm4,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "sub $0x6,%2 \n" - "movd %%xmm6,(%1) \n" - "psrlq $0x10,%%xmm6 \n" - "movd %%xmm6,0x2(%1) \n" - "lea 0x6(%1),%1 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - 
"+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(static_cast<intptr_t>(src_stride)) // %3 - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" -#endif - ); -} - -#define HAS_SCALEADDROWS_SSE2 -static void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, - uint16* dst_ptr, int src_width, int src_height) { - int tmp_height = 0; - intptr_t tmp_src = 0; - asm volatile ( - "pxor %%xmm4,%%xmm4 \n" - "sub $0x1,%5 \n" - ".p2align 4 \n" - "1: \n" - "movdqa (%0),%%xmm0 \n" - "mov %0,%3 \n" - "add %6,%0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm4,%%xmm0 \n" - "punpckhbw %%xmm4,%%xmm1 \n" - "mov %5,%2 \n" - "test %2,%2 \n" - "je 3f \n" - "2: \n" - "movdqa (%0),%%xmm2 \n" - "add %6,%0 \n" - "movdqa %%xmm2,%%xmm3 \n" - "punpcklbw %%xmm4,%%xmm2 \n" - "punpckhbw %%xmm4,%%xmm3 \n" - "paddusw %%xmm2,%%xmm0 \n" - "paddusw %%xmm3,%%xmm1 \n" - "sub $0x1,%2 \n" - "jg 2b \n" - "3: \n" - "movdqa %%xmm0,(%1) \n" - "movdqa %%xmm1,0x10(%1) \n" - "lea 0x10(%3),%0 \n" - "lea 0x20(%1),%1 \n" - "sub $0x10,%4 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(tmp_height), // %2 - "+r"(tmp_src), // %3 - "+r"(src_width), // %4 - "+rm"(src_height) // %5 - : "rm"(static_cast<intptr_t>(src_stride)) // %6 - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" -#endif - ); -} - -#endif // defined(__x86_64__) || defined(__i386__) - -#if !defined(LIBYUV_DISABLE_MIPS) && \ - defined(__mips_dsp) && (__mips_dsp_rev >= 2) -#define HAS_SCALEROWDOWN2_MIPS_DSPR2 -void ScaleRowDown2_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */, - uint8* dst, int dst_width); -void ScaleRowDown2Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); -#define HAS_SCALEROWDOWN4_MIPS_DSPR2 -void ScaleRowDown4_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */, - uint8* dst, int dst_width); -void ScaleRowDown4Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); -#define HAS_SCALEROWDOWN34_MIPS_DSPR2 -void ScaleRowDown34_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */, - uint8* dst, int dst_width); -void ScaleRowDown34_0_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* d, int dst_width); -void ScaleRowDown34_1_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* d, int dst_width); -#define HAS_SCALEROWDOWN38_MIPS_DSPR2 -void ScaleRowDown38_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */, - uint8* dst, int dst_width); -void ScaleRowDown38_2_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown38_3_Box_MIPS_DSPR2(const uint8* src_ptr, - ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -#endif // defined(__mips_dsp) && (__mips_dsp_rev >= 2) - -// CPU agnostic row functions -static void ScaleRowDown2_C(const uint8* src_ptr, ptrdiff_t /* src_stride */, - uint8* dst, int dst_width) { - uint8* dend = dst + dst_width - 1; - do { - dst[0] = src_ptr[1]; - dst[1] = src_ptr[3]; - dst += 2; - src_ptr += 4; - } while (dst < dend); - if (dst_width & 1) { - dst[0] = src_ptr[1]; - } -} - -void ScaleRowDown2Box_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { - const uint8* s = src_ptr; - const uint8* t = src_ptr + src_stride; - uint8* dend = dst + dst_width - 1; - do { - dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2; - dst[1] = (s[2] + s[3] + t[2] + t[3] + 2) >> 2; - dst += 2; - s += 4; - t += 4; - } while 
(dst < dend); - if (dst_width & 1) { - dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2; - } -} - -static void ScaleRowDown4_C(const uint8* src_ptr, ptrdiff_t /* src_stride */, - uint8* dst, int dst_width) { - uint8* dend = dst + dst_width - 1; - do { - dst[0] = src_ptr[2]; - dst[1] = src_ptr[6]; - dst += 2; - src_ptr += 8; - } while (dst < dend); - if (dst_width & 1) { - dst[0] = src_ptr[2]; - } -} - -static void ScaleRowDown4Box_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { - intptr_t stride = src_stride; - uint8* dend = dst + dst_width - 1; - do { - dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] + - src_ptr[stride + 0] + src_ptr[stride + 1] + - src_ptr[stride + 2] + src_ptr[stride + 3] + - src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] + - src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] + - src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] + - src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] + - 8) >> 4; - dst[1] = (src_ptr[4] + src_ptr[5] + src_ptr[6] + src_ptr[7] + - src_ptr[stride + 4] + src_ptr[stride + 5] + - src_ptr[stride + 6] + src_ptr[stride + 7] + - src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5] + - src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7] + - src_ptr[stride * 3 + 4] + src_ptr[stride * 3 + 5] + - src_ptr[stride * 3 + 6] + src_ptr[stride * 3 + 7] + - 8) >> 4; - dst += 2; - src_ptr += 8; - } while (dst < dend); - if (dst_width & 1) { - dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] + - src_ptr[stride + 0] + src_ptr[stride + 1] + - src_ptr[stride + 2] + src_ptr[stride + 3] + - src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] + - src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] + - src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] + - src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] + - 8) >> 4; - } -} - -static void ScaleRowDown34_C(const uint8* src_ptr, ptrdiff_t /* src_stride */, - uint8* dst, int dst_width) { - assert((dst_width % 3 == 0) && (dst_width > 0)); - uint8* dend = dst + dst_width; - do { - dst[0] = src_ptr[0]; - dst[1] = src_ptr[1]; - dst[2] = src_ptr[3]; - dst += 3; - src_ptr += 4; - } while (dst < dend); -} - -// Filter rows 0 and 1 together, 3 : 1 -static void ScaleRowDown34_0_Box_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* d, int dst_width) { - assert((dst_width % 3 == 0) && (dst_width > 0)); - const uint8* s = src_ptr; - const uint8* t = src_ptr + src_stride; - uint8* dend = d + dst_width; - do { - uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; - uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; - uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; - uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; - uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; - uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; - d[0] = (a0 * 3 + b0 + 2) >> 2; - d[1] = (a1 * 3 + b1 + 2) >> 2; - d[2] = (a2 * 3 + b2 + 2) >> 2; - d += 3; - s += 4; - t += 4; - } while (d < dend); -} - -// Filter rows 1 and 2 together, 1 : 1 -static void ScaleRowDown34_1_Box_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* d, int dst_width) { - assert((dst_width % 3 == 0) && (dst_width > 0)); - const uint8* s = src_ptr; - const uint8* t = src_ptr + src_stride; - uint8* dend = d + dst_width; - do { - uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; - uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; - uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; - uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; - uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; - uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; - d[0] = (a0 + b0 + 1) >> 1; - d[1] = (a1 + b1 + 1) >> 1; - d[2] = (a2 + b2 + 1) >> 1; - d += 3; - 
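// Editorial note (not part of the patch): in the 3/4 kernels above every
// weighted sum uses weights that total the divisor (3+1 over >> 2, 1+1 over
// >> 1) plus a half-divisor rounding bias, so flat areas pass through the
// filter unchanged. A tiny self-check of that property over all 8-bit values:

#include <assert.h>
#include <stdint.h>

static void CheckRowDown34FlatField(void) {
  for (int v = 0; v <= 255; ++v) {
    uint8_t a0 = (uint8_t)((v * 3 + v * 1 + 2) >> 2);  // horizontal 3:1 tap
    uint8_t a1 = (uint8_t)((v * 1 + v * 1 + 1) >> 1);  // horizontal 1:1 tap
    uint8_t d0 = (uint8_t)((a0 * 3 + a0 + 2) >> 2);    // vertical 3:1 blend
    uint8_t d1 = (uint8_t)((a1 + a1 + 1) >> 1);        // vertical 1:1 blend
    assert(a0 == v && a1 == v && d0 == v && d1 == v);
  }
}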
s += 4; - t += 4; - } while (d < dend); -} - -// (1-f)a + fb can be replaced with a + f(b-a) -#define BLENDER(a, b, f) (static_cast<int>(a) + \ - ((f) * (static_cast<int>(b) - static_cast<int>(a)) >> 16)) - -static void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr, - int dst_width, int x, int dx) { - for (int j = 0; j < dst_width - 1; j += 2) { - int xi = x >> 16; - int a = src_ptr[xi]; - int b = src_ptr[xi + 1]; - dst_ptr[0] = BLENDER(a, b, x & 0xffff); - x += dx; - xi = x >> 16; - a = src_ptr[xi]; - b = src_ptr[xi + 1]; - dst_ptr[1] = BLENDER(a, b, x & 0xffff); - x += dx; - dst_ptr += 2; - } - if (dst_width & 1) { - int xi = x >> 16; - int a = src_ptr[xi]; - int b = src_ptr[xi + 1]; - dst_ptr[0] = BLENDER(a, b, x & 0xffff); - } -} - -static void ScaleRowDown38_C(const uint8* src_ptr, ptrdiff_t /* src_stride */, - uint8* dst, int dst_width) { - assert(dst_width % 3 == 0); - for (int x = 0; x < dst_width; x += 3) { - dst[0] = src_ptr[0]; - dst[1] = src_ptr[3]; - dst[2] = src_ptr[6]; - dst += 3; - src_ptr += 8; - } -} - -// 8x3 -> 3x1 -static void ScaleRowDown38_3_Box_C(const uint8* src_ptr, - ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - assert((dst_width % 3 == 0) && (dst_width > 0)); - intptr_t stride = src_stride; - for (int i = 0; i < dst_width; i += 3) { - dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + - src_ptr[stride + 0] + src_ptr[stride + 1] + - src_ptr[stride + 2] + src_ptr[stride * 2 + 0] + - src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) * - (65536 / 9) >> 16; - dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + - src_ptr[stride + 3] + src_ptr[stride + 4] + - src_ptr[stride + 5] + src_ptr[stride * 2 + 3] + - src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) * - (65536 / 9) >> 16; - dst_ptr[2] = (src_ptr[6] + src_ptr[7] + - src_ptr[stride + 6] + src_ptr[stride + 7] + - src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) * - (65536 / 6) >> 16; - src_ptr += 8; - dst_ptr += 3; - } -} - -// 8x2 -> 3x1 -static void ScaleRowDown38_2_Box_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - assert((dst_width % 3 == 0) && (dst_width > 0)); - intptr_t stride = src_stride; - for (int i = 0; i < dst_width; i += 3) { - dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + - src_ptr[stride + 0] + src_ptr[stride + 1] + - src_ptr[stride + 2]) * (65536 / 6) >> 16; - dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + - src_ptr[stride + 3] + src_ptr[stride + 4] + - src_ptr[stride + 5]) * (65536 / 6) >> 16; - dst_ptr[2] = (src_ptr[6] + src_ptr[7] + - src_ptr[stride + 6] + src_ptr[stride + 7]) * - (65536 / 4) >> 16; - src_ptr += 8; - dst_ptr += 3; - } -} - -void ScaleAddRows_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint16* dst_ptr, int src_width, int src_height) { - assert(src_width > 0); - assert(src_height > 0); - for (int x = 0; x < src_width; ++x) { - const uint8* s = src_ptr + x; - int sum = 0; - for (int y = 0; y < src_height; ++y) { - sum += s[0]; - s += src_stride; - } - dst_ptr[x] = sum; - } -} +#define SUBSAMPLE(v, a, s) (v < 0) ? (-((-v + a) >> s)) : ((v + a) >> s) // Scale plane, 1/2 // This is an optimized version for scaling down a plane to 1/2 of @@ -1679,7 +43,9 @@ static void ScalePlaneDown2(int /* src_width */, int /* src_height */, FilterMode filtering) { void (*ScaleRowDown2)(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) = - filtering ? ScaleRowDown2Box_C : ScaleRowDown2_C; + filtering == kFilterNone ? ScaleRowDown2_C : + (filtering == kFilterLinear ? 
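// Editorial note (not part of the patch): the new SUBSAMPLE macro above
// computes a subsampled dimension as a rounded-up shift, and mirrors that
// rounding for negative (inverted) sizes instead of letting >> round toward
// negative infinity. A small illustration of the values it produces for the
// chroma half-size case used by I420Scale further below (re-stated under a
// different name, with extra parentheses, so it does not clash):

#include <assert.h>

#define SUBSAMPLE_DEMO(v, a, s) \
  ((v) < 0 ? -((-(v) + (a)) >> (s)) : ((v) + (a)) >> (s))

static void CheckSubsample(void) {
  assert(SUBSAMPLE_DEMO(4, 1, 1) == 2);    // even width: exactly half
  assert(SUBSAMPLE_DEMO(5, 1, 1) == 3);    // odd width: rounds up
  assert(SUBSAMPLE_DEMO(-5, 1, 1) == -3);  // inverted image: same magnitude
}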
ScaleRowDown2Linear_C : + ScaleRowDown2Box_C); int row_stride = src_stride << 1; if (!filtering) { src_ptr += src_stride; // Point to odd rows. @@ -1692,12 +58,15 @@ static void ScalePlaneDown2(int /* src_width */, int /* src_height */, } #elif defined(HAS_SCALEROWDOWN2_SSE2) if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 16)) { - ScaleRowDown2 = filtering ? ScaleRowDown2Box_Unaligned_SSE2 : - ScaleRowDown2_Unaligned_SSE2; + ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Unaligned_SSE2 : + (filtering == kFilterLinear ? ScaleRowDown2Linear_Unaligned_SSE2 : + ScaleRowDown2Box_Unaligned_SSE2); if (IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) && IS_ALIGNED(row_stride, 16) && IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) { - ScaleRowDown2 = filtering ? ScaleRowDown2Box_SSE2 : ScaleRowDown2_SSE2; + ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_SSE2 : + (filtering == kFilterLinear ? ScaleRowDown2Linear_SSE2 : + ScaleRowDown2Box_SSE2); } } #elif defined(HAS_SCALEROWDOWN2_MIPS_DSPR2) @@ -1709,6 +78,9 @@ static void ScalePlaneDown2(int /* src_width */, int /* src_height */, } #endif + if (filtering == kFilterLinear) { + src_stride = 0; + } // TODO(fbarchard): Loop through source height to allow odd height. for (int y = 0; y < dst_height; ++y) { ScaleRowDown2(src_ptr, src_stride, dst_ptr, dst_width); @@ -1753,6 +125,9 @@ static void ScalePlaneDown4(int /* src_width */, int /* src_height */, } #endif + if (filtering == kFilterLinear) { + src_stride = 0; + } for (int y = 0; y < dst_height; ++y) { ScaleRowDown4(src_ptr, src_stride, dst_ptr, dst_width); src_ptr += row_stride; @@ -1816,14 +191,15 @@ static void ScalePlaneDown34(int /* src_width */, int /* src_height */, } #endif + const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride; for (int y = 0; y < dst_height - 2; y += 3) { - ScaleRowDown34_0(src_ptr, src_stride, dst_ptr, dst_width); + ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width); src_ptr += src_stride; dst_ptr += dst_stride; - ScaleRowDown34_1(src_ptr, src_stride, dst_ptr, dst_width); + ScaleRowDown34_1(src_ptr, filter_stride, dst_ptr, dst_width); src_ptr += src_stride; dst_ptr += dst_stride; - ScaleRowDown34_0(src_ptr + src_stride, -src_stride, + ScaleRowDown34_0(src_ptr + src_stride, -filter_stride, dst_ptr, dst_width); src_ptr += src_stride * 2; dst_ptr += dst_stride; @@ -1831,7 +207,7 @@ static void ScalePlaneDown34(int /* src_width */, int /* src_height */, // Remainder 1 or 2 rows with last row vertically unfiltered if ((dst_height % 3) == 2) { - ScaleRowDown34_0(src_ptr, src_stride, dst_ptr, dst_width); + ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width); src_ptr += src_stride; dst_ptr += dst_stride; ScaleRowDown34_1(src_ptr, 0, dst_ptr, dst_width); @@ -1908,21 +284,22 @@ static void ScalePlaneDown38(int /* src_width */, int /* src_height */, } #endif + const int filter_stride = (filtering == kFilterLinear) ? 
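// Editorial note (not part of the patch): the new kFilterLinear handling
// above zeroes the stride handed to the row kernels. For the two-row "Box"
// kernels that makes them blend a row with itself, which degrades them to a
// purely horizontal filter: (s[0] + s[1] + t[0] + t[1] + 2) >> 2 with t == s
// is just (s[0] + s[1] + 1) >> 1. A tiny scalar check of that identity:

#include <assert.h>
#include <stdint.h>

static void CheckLinearViaZeroStride(const uint8_t* s) {
  const uint8_t* t = s + 0;  // stride 0: the "second row" aliases the first
  int box = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;  // two-row box filter
  int linear = (s[0] + s[1] + 1) >> 1;             // horizontal-only average
  assert(box == linear);
}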
0 : src_stride; for (int y = 0; y < dst_height - 2; y += 3) { - ScaleRowDown38_3(src_ptr, src_stride, dst_ptr, dst_width); + ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width); src_ptr += src_stride * 3; dst_ptr += dst_stride; - ScaleRowDown38_3(src_ptr, src_stride, dst_ptr, dst_width); + ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width); src_ptr += src_stride * 3; dst_ptr += dst_stride; - ScaleRowDown38_2(src_ptr, src_stride, dst_ptr, dst_width); + ScaleRowDown38_2(src_ptr, filter_stride, dst_ptr, dst_width); src_ptr += src_stride * 2; dst_ptr += dst_stride; } // Remainder 1 or 2 rows with last row vertically unfiltered if ((dst_height % 3) == 2) { - ScaleRowDown38_3(src_ptr, src_stride, dst_ptr, dst_width); + ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width); src_ptr += src_stride * 3; dst_ptr += dst_stride; ScaleRowDown38_3(src_ptr, 0, dst_ptr, dst_width); @@ -1998,24 +375,22 @@ static void ScaleAddCols1_C(int dst_width, int boxheight, int x, int dx, // one pixel of destination using fixed point (16.16) to step // through source, sampling a box of pixel with simple // averaging. - +SAFEBUFFERS static void ScalePlaneBox(int src_width, int src_height, int dst_width, int dst_height, int src_stride, int dst_stride, const uint8* src_ptr, uint8* dst_ptr) { assert(dst_width > 0); assert(dst_height > 0); - int dx = (Abs(src_width) << 16) / dst_width; - int dy = (src_height << 16) / dst_height; + + // Initial source x/y coordinate and step values as 16.16 fixed point. int x = 0; int y = 0; - // Negative src_width means horizontally mirror. - if (src_width < 0) { - x += (dst_width - 1) * dx; - dx = -dx; - src_width = -src_width; - } - int maxy = (src_height << 16); + int dx = 0; + int dy = 0; + ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox, + &x, &y, &dx, &dy); + const int max_y = (src_height << 16); if (!IS_ALIGNED(src_width, 16) || (src_width > kMaxStride) || dst_height * 2 > src_height) { uint8* dst = dst_ptr; @@ -2023,8 +398,8 @@ static void ScalePlaneBox(int src_width, int src_height, int iy = y >> 16; const uint8* src = src_ptr + iy * src_stride; y += dy; - if (y > maxy) { - y = maxy; + if (y > max_y) { + y = max_y; } int boxheight = (y >> 16) - iy; ScalePlaneBoxRow_C(dst_width, boxheight, @@ -2046,6 +421,9 @@ static void ScalePlaneBox(int src_width, int src_height, } #if defined(HAS_SCALEADDROWS_SSE2) if (TestCpuFlag(kCpuHasSSE2) && +#ifdef AVOID_OVERREAD + IS_ALIGNED(src_width, 16) && +#endif IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) { ScaleAddRows = ScaleAddRows_SSE2; } @@ -2066,16 +444,25 @@ static void ScalePlaneBox(int src_width, int src_height, } } -// Scale plane to/from any dimensions, with bilinear interpolation. - -void ScalePlaneBilinear(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint8* src_ptr, uint8* dst_ptr) { +// Scale plane down with bilinear interpolation. +SAFEBUFFERS +void ScalePlaneBilinearDown(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_ptr, uint8* dst_ptr, + FilterMode filtering) { assert(dst_width > 0); assert(dst_height > 0); assert(Abs(src_width) <= kMaxStride); + // Initial source x/y coordinate and step values as 16.16 fixed point. 
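// Editorial note (not part of the patch): these planar scalers step through
// the source with 16.16 fixed point: the upper 16 bits of x are the integer
// source column, the lower 16 bits the fraction, and dx is the per-output
// step (roughly (src_width << 16) / dst_width; the exact starting offsets now
// come from ScaleSlope). A minimal nearest-neighbor column pass in that
// style, with names of our own choosing:

#include <stdint.h>

static void NearestCols16Dot16(uint8_t* dst, const uint8_t* src,
                               int dst_width, int x, int dx) {
  for (int i = 0; i < dst_width; ++i) {
    dst[i] = src[x >> 16];  // integer part selects the source pixel
    x += dx;                // fraction accumulates until it carries
  }
}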
+ int x = 0; + int y = 0; + int dx = 0; + int dy = 0; + ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, + &x, &y, &dx, &dy); + SIMD_ALIGNED(uint8 row[kMaxStride + 16]); void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr, @@ -2103,6 +490,14 @@ void ScalePlaneBilinear(int src_width, int src_height, } } #endif +#if defined(HAS_INTERPOLATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && src_width >= 32) { + InterpolateRow = InterpolateRow_Any_AVX2; + if (IS_ALIGNED(src_width, 32)) { + InterpolateRow = InterpolateRow_AVX2; + } + } +#endif #if defined(HAS_INTERPOLATEROW_NEON) if (TestCpuFlag(kCpuHasNEON) && src_width >= 16) { InterpolateRow = InterpolateRow_Any_NEON; @@ -2119,44 +514,170 @@ void ScalePlaneBilinear(int src_width, int src_height, } } #endif - int dx = 0; - int dy = 0; + + void (*ScaleFilterCols)(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int x, int dx) = ScaleFilterCols_C; +#if defined(HAS_SCALEFILTERCOLS_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ScaleFilterCols = ScaleFilterCols_SSSE3; + } +#endif + + const int max_y = (src_height - 1) << 16; + for (int j = 0; j < dst_height; ++j) { + if (y > max_y) { + y = max_y; + } + int yi = y >> 16; + const uint8* src = src_ptr + yi * src_stride; + if (filtering == kFilterLinear) { + ScaleFilterCols(dst_ptr, src, dst_width, x, dx); + } else { + int yf = (y >> 8) & 255; + InterpolateRow(row, src, src_stride, src_width, yf); + ScaleFilterCols(dst_ptr, row, dst_width, x, dx); + } + dst_ptr += dst_stride; + y += dy; + } +} + +// Scale up down with bilinear interpolation. +SAFEBUFFERS +void ScalePlaneBilinearUp(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_ptr, uint8* dst_ptr, + FilterMode filtering) { + assert(src_width != 0); + assert(src_height != 0); + assert(dst_width > 0); + assert(dst_height > 0); + assert(Abs(dst_width) <= kMaxStride); + + // Initial source x/y coordinate and step values as 16.16 fixed point. int x = 0; int y = 0; - if (dst_width <= Abs(src_width)) { - dx = (Abs(src_width) << 16) / dst_width; - x = (dx >> 1) - 32768; - } else if (dst_width > 1) { - dx = ((Abs(src_width) - 1) << 16) / (dst_width - 1); + int dx = 0; + int dy = 0; + ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, + &x, &y, &dx, &dy); + + void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride, int dst_width, int source_y_fraction) = + InterpolateRow_C; +#if defined(HAS_INTERPOLATEROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && dst_width >= 16) { + InterpolateRow = InterpolateRow_Any_SSE2; + if (IS_ALIGNED(dst_width, 16)) { + InterpolateRow = InterpolateRow_Unaligned_SSE2; + if (IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) { + InterpolateRow = InterpolateRow_SSE2; + } + } } - // Negative src_width means horizontally mirror. 
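// Editorial note (not part of the patch): the bilinear paths reduce the 16.16
// vertical fraction to 8 bits (yf = (y >> 8) & 255) and hand it to
// InterpolateRow, which blends a row with the row one stride further down.
// A scalar model of that blend, assuming the usual (256 - f) / f weighting;
// the in-tree InterpolateRow_C may differ in rounding details. The up-scaler
// below additionally keeps its two column-scaled rows in a small ping-pong
// buffer (rowptr / rowstride) so each source row is column-scaled only once.

#include <stddef.h>
#include <stdint.h>

static void BlendRows(uint8_t* dst, const uint8_t* row0, ptrdiff_t src_stride,
                      int width, int yf /* 0..255 */) {
  const uint8_t* row1 = row0 + src_stride;
  for (int i = 0; i < width; ++i) {
    dst[i] = (uint8_t)((row0[i] * (256 - yf) + row1[i] * yf) >> 8);
  }
}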
- if (src_width < 0) { - x += (dst_width - 1) * dx; - dx = -dx; - src_width = -src_width; +#endif +#if defined(HAS_INTERPOLATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && dst_width >= 16) { + InterpolateRow = InterpolateRow_Any_SSSE3; + if (IS_ALIGNED(dst_width, 16)) { + InterpolateRow = InterpolateRow_Unaligned_SSSE3; + if (IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) { + InterpolateRow = InterpolateRow_SSSE3; + } + } } - if (dst_height <= src_height) { - dy = (src_height << 16) / dst_height; - y = (dy >> 1) - 32768; - } else if (dst_height > 1) { - dy = ((src_height - 1) << 16) / (dst_height - 1); +#endif +#if defined(HAS_INTERPOLATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && dst_width >= 32) { + InterpolateRow = InterpolateRow_Any_AVX2; + if (IS_ALIGNED(dst_width, 32)) { + InterpolateRow = InterpolateRow_AVX2; + } } - int maxy = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0; +#endif +#if defined(HAS_INTERPOLATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && dst_width >= 16) { + InterpolateRow = InterpolateRow_Any_NEON; + if (IS_ALIGNED(dst_width, 16)) { + InterpolateRow = InterpolateRow_NEON; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && dst_width >= 4) { + InterpolateRow = InterpolateRow_Any_MIPS_DSPR2; + if (IS_ALIGNED(dst_width, 4)) { + InterpolateRow = InterpolateRow_MIPS_DSPR2; + } + } +#endif + + void (*ScaleFilterCols)(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int x, int dx) = + filtering ? ScaleFilterCols_C : ScaleCols_C; +#if defined(HAS_SCALEFILTERCOLS_SSSE3) + if (filtering && TestCpuFlag(kCpuHasSSSE3)) { + ScaleFilterCols = ScaleFilterCols_SSSE3; + } +#endif + if (!filtering && src_width * 2 == dst_width && x < 0x8000) { + ScaleFilterCols = ScaleColsUp2_C; +#if defined(HAS_SCALECOLS_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) && + IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) && + IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) { + ScaleFilterCols = ScaleColsUp2_SSE2; + } +#endif + } + + const int max_y = (src_height - 1) << 16; + if (y > max_y) { + y = max_y; + } + int yi = y >> 16; + const uint8* src = src_ptr + yi * src_stride; + SIMD_ALIGNED(uint8 row[2 * kMaxStride]); + uint8* rowptr = row; + int rowstride = kMaxStride; + int lasty = yi; + + ScaleFilterCols(rowptr, src, dst_width, x, dx); + if (src_height > 1) { + src += src_stride; + } + ScaleFilterCols(rowptr + rowstride, src, dst_width, x, dx); + src += src_stride; + for (int j = 0; j < dst_height; ++j) { - if (y > maxy) { - y = maxy; + yi = y >> 16; + if (yi != lasty) { + if (y > max_y) { + y = max_y; + yi = y >> 16; + src = src_ptr + yi * src_stride; + } + if (yi != lasty) { + ScaleFilterCols(rowptr, src, dst_width, x, dx); + rowptr += rowstride; + rowstride = -rowstride; + lasty = yi; + src += src_stride; + } + } + if (filtering == kFilterLinear) { + InterpolateRow(dst_ptr, rowptr, 0, dst_width, 0); + } else { + int yf = (y >> 8) & 255; + InterpolateRow(dst_ptr, rowptr, rowstride, dst_width, yf); } - int yi = y >> 16; - int yf = (y >> 8) & 255; - const uint8* src = src_ptr + yi * src_stride; - InterpolateRow(row, src, src_stride, src_width, yf); - ScaleFilterCols_C(dst_ptr, row, dst_width, x, dx); dst_ptr += dst_stride; y += dy; } } -// Scale plane to/from any dimensions, without interpolation. +// Scale Plane to/from any dimensions, without interpolation. 
// Fixed point math is used for performance: The upper 16 bits // of x and dx is the integer part of the source position and // the lower 16 bits are the fixed decimal part. @@ -2165,74 +686,37 @@ static void ScalePlaneSimple(int src_width, int src_height, int dst_width, int dst_height, int src_stride, int dst_stride, const uint8* src_ptr, uint8* dst_ptr) { - int dx = (Abs(src_width) << 16) / dst_width; - int dy = (src_height << 16) / dst_height; - int x = dx >> 1; - int y = dy >> 1; - // Negative src_width means horizontally mirror. - if (src_width < 0) { - x += (dst_width - 1) * dx; - dx = -dx; - src_width = -src_width; + // Initial source x/y coordinate and step values as 16.16 fixed point. + int x = 0; + int y = 0; + int dx = 0; + int dy = 0; + ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterNone, + &x, &y, &dx, &dy); + + void (*ScaleCols)(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int x, int dx) = ScaleCols_C; + if (src_width * 2 == dst_width && x < 0x8000) { + ScaleCols = ScaleColsUp2_C; +#if defined(HAS_SCALECOLS_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) && + IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) && + IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) { + ScaleCols = ScaleColsUp2_SSE2; + } +#endif } - for (int j = 0; j < dst_height; ++j) { - int xs = x; - int yi = y >> 16; - const uint8* src = src_ptr + yi * src_stride; - uint8* dst = dst_ptr; - for (int i = 0; i < dst_width; ++i) { - *dst++ = src[xs >> 16]; - xs += dx; - } + for (int i = 0; i < dst_height; ++i) { + ScaleCols(dst_ptr, src_ptr + (y >> 16) * src_stride, + dst_width, x, dx); dst_ptr += dst_stride; y += dy; } } -// Scale plane to/from any dimensions. - -static void ScalePlaneAnySize(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint8* src_ptr, uint8* dst_ptr, - FilterMode filtering) { - if (!filtering || src_width > kMaxStride) { - ScalePlaneSimple(src_width, src_height, dst_width, dst_height, - src_stride, dst_stride, src_ptr, dst_ptr); - } else { - ScalePlaneBilinear(src_width, src_height, dst_width, dst_height, - src_stride, dst_stride, src_ptr, dst_ptr); - } -} - -// Scale plane down, any size -// -// This is an optimized version for scaling down a plane to any size. -// The current implementation is ~10 times faster compared to the -// reference implementation for e.g. XGA->LowResPAL - -static void ScalePlaneDown(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint8* src_ptr, uint8* dst_ptr, - FilterMode filtering) { - if (!filtering || src_width > kMaxStride) { - ScalePlaneSimple(src_width, src_height, dst_width, dst_height, - src_stride, dst_stride, src_ptr, dst_ptr); - } else if (filtering == kFilterBilinear || dst_height * 2 > src_height) { - // between 1/2x and 1x use bilinear - ScalePlaneBilinear(src_width, src_height, dst_width, dst_height, - src_stride, dst_stride, src_ptr, dst_ptr); - } else { - ScalePlaneBox(src_width, src_height, dst_width, dst_height, - src_stride, dst_stride, src_ptr, dst_ptr); - } -} - // Scale a plane. -// This function in turn calls a scaling function suitable for handling -// the desired resolutions. +// This function dispatches to a specialized scaler based on scale factor. 
LIBYUV_API void ScalePlane(const uint8* src, int src_stride, @@ -2240,53 +724,87 @@ void ScalePlane(const uint8* src, int src_stride, uint8* dst, int dst_stride, int dst_width, int dst_height, FilterMode filtering) { + // Simplify filtering when possible. + filtering = ScaleFilterReduce(src_width, src_height, + dst_width, dst_height, + filtering); + + // Negative height means invert the image. + if (src_height < 0) { + src_height = -src_height; + src = src + (src_height - 1) * src_stride; + src_stride = -src_stride; + } + // Use specialized scales to improve performance for common resolutions. // For example, all the 1/2 scalings will use ScalePlaneDown2() if (dst_width == src_width && dst_height == src_height) { // Straight copy. CopyPlane(src, src_stride, dst, dst_stride, dst_width, dst_height); - } else if (dst_width <= Abs(src_width) && dst_height <= src_height) { + return; + } + if (dst_width == src_width) { + int dy = FixedDiv(src_height, dst_height); + // Arbitrary scale vertically, but unscaled vertically. + ScalePlaneVertical(src_height, + dst_width, dst_height, + src_stride, dst_stride, src, dst, + 0, 0, dy, 1, filtering); + return; + } + if (dst_width <= Abs(src_width) && dst_height <= src_height) { // Scale down. - if (use_reference_impl_) { - // For testing, allow the optimized versions to be disabled. - ScalePlaneDown(src_width, src_height, dst_width, dst_height, - src_stride, dst_stride, src, dst, filtering); - } else if (4 * dst_width == 3 * src_width && - 4 * dst_height == 3 * src_height) { + if (4 * dst_width == 3 * src_width && + 4 * dst_height == 3 * src_height) { // optimized, 3/4 ScalePlaneDown34(src_width, src_height, dst_width, dst_height, src_stride, dst_stride, src, dst, filtering); - } else if (2 * dst_width == src_width && 2 * dst_height == src_height) { + return; + } + if (2 * dst_width == src_width && 2 * dst_height == src_height) { // optimized, 1/2 ScalePlaneDown2(src_width, src_height, dst_width, dst_height, src_stride, dst_stride, src, dst, filtering); + return; + } // 3/8 rounded up for odd sized chroma height. - } else if (8 * dst_width == 3 * src_width && - dst_height == ((src_height * 3 + 7) / 8)) { + if (8 * dst_width == 3 * src_width && + dst_height == ((src_height * 3 + 7) / 8)) { // optimized, 3/8 ScalePlaneDown38(src_width, src_height, dst_width, dst_height, src_stride, dst_stride, src, dst, filtering); - } else if (4 * dst_width == src_width && 4 * dst_height == src_height && + return; + } + if (4 * dst_width == src_width && 4 * dst_height == src_height && filtering != kFilterBilinear) { // optimized, 1/4 ScalePlaneDown4(src_width, src_height, dst_width, dst_height, src_stride, dst_stride, src, dst, filtering); - } else { - // Arbitrary downsample - ScalePlaneDown(src_width, src_height, dst_width, dst_height, - src_stride, dst_stride, src, dst, filtering); + return; } - } else { - // Arbitrary scale up and/or down. 
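// Editorial note (not part of the patch): with the rewritten dispatch here, a
// single ScalePlane call selects the specialized path from the requested
// sizes and filter (straight copy, vertical-only, 3/4, 1/2, 3/8 and 1/4
// above, box/bilinear/simple fallbacks just below). An illustrative caller
// that halves a Y plane, assuming the declarations in include/libyuv/scale.h
// and caller-managed buffers:

#include <stdint.h>
#include "libyuv/scale.h"

void HalveYPlane(const uint8_t* src_y, int src_stride_y,
                 int src_width, int src_height,
                 uint8_t* dst_y, int dst_stride_y) {
  libyuv::ScalePlane(src_y, src_stride_y, src_width, src_height,
                     dst_y, dst_stride_y, src_width / 2, src_height / 2,
                     libyuv::kFilterBox);
}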
- ScalePlaneAnySize(src_width, src_height, dst_width, dst_height, - src_stride, dst_stride, src, dst, filtering); } + if (filtering == kFilterBox && src_width <= kMaxStride && + dst_height * 2 < src_height ) { + ScalePlaneBox(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst); + return; + } + if (filtering && dst_height > src_height && dst_width <= kMaxStride) { + ScalePlaneBilinearUp(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); + return; + } + if (filtering && src_width <= kMaxStride) { + ScalePlaneBilinearDown(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); + return; + } + ScalePlaneSimple(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst); } // Scale an I420 image. // This function in turn calls a scaling function for each plane. -// TODO(fbarchard): Disable UNDER_ALLOCATED_HACK -#define UNDER_ALLOCATED_HACK 1 LIBYUV_API int I420Scale(const uint8* src_y, int src_stride_y, @@ -2299,47 +817,13 @@ int I420Scale(const uint8* src_y, int src_stride_y, int dst_width, int dst_height, FilterMode filtering) { if (!src_y || !src_u || !src_v || src_width == 0 || src_height == 0 || - !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0 || - src_width > 32767 || src_height > 32767) { + !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) { return -1; } - // Negative height means invert the image. - if (src_height < 0) { - src_height = -src_height; - int halfheight = Half(src_height); - src_y = src_y + (src_height - 1) * src_stride_y; - src_u = src_u + (halfheight - 1) * src_stride_u; - src_v = src_v + (halfheight - 1) * src_stride_v; - src_stride_y = -src_stride_y; - src_stride_u = -src_stride_u; - src_stride_v = -src_stride_v; - } - int src_halfwidth = Half(src_width); - int src_halfheight = Half(src_height); - int dst_halfwidth = Half(dst_width); - int dst_halfheight = Half(dst_height); - -#ifdef UNDER_ALLOCATED_HACK - // If caller passed width / 2 for stride, adjust halfwidth to match. - if ((src_width & 1) && src_stride_u && src_halfwidth > Abs(src_stride_u)) { - src_halfwidth = src_width >> 1; - } - if ((dst_width & 1) && dst_stride_u && dst_halfwidth > Abs(dst_stride_u)) { - dst_halfwidth = dst_width >> 1; - } - // If caller used height / 2 when computing src_v, it will point into what - // should be the src_u plane. Detect this and reduce halfheight to match. 
- int uv_src_plane_size = src_halfwidth * src_halfheight; - if ((src_height & 1) && - (src_v > src_u) && (src_v < (src_u + uv_src_plane_size))) { - src_halfheight = src_height >> 1; - } - int uv_dst_plane_size = dst_halfwidth * dst_halfheight; - if ((dst_height & 1) && - (dst_v > dst_u) && (dst_v < (dst_u + uv_dst_plane_size))) { - dst_halfheight = dst_height >> 1; - } -#endif + int src_halfwidth = SUBSAMPLE(src_width, 1, 1); + int src_halfheight = SUBSAMPLE(src_height, 1, 1); + int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1); + int dst_halfheight = SUBSAMPLE(dst_height, 1, 1); ScalePlane(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y, dst_width, dst_height, @@ -2362,60 +846,15 @@ int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v, int dst_stride_y, int dst_stride_u, int dst_stride_v, int dst_width, int dst_height, bool interpolate) { - if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 || - !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0 || - src_width > 32767 || src_height > 32767) { - return -1; - } - // Negative height means invert the image. - if (src_height < 0) { - src_height = -src_height; - int halfheight = Half(src_height); - src_y = src_y + (src_height - 1) * src_stride_y; - src_u = src_u + (halfheight - 1) * src_stride_u; - src_v = src_v + (halfheight - 1) * src_stride_v; - src_stride_y = -src_stride_y; - src_stride_u = -src_stride_u; - src_stride_v = -src_stride_v; - } - int src_halfwidth = Half(src_width); - int src_halfheight = Half(src_height); - int dst_halfwidth = Half(dst_width); - int dst_halfheight = Half(dst_height); - FilterMode filtering = interpolate ? kFilterBox : kFilterNone; - -#ifdef UNDER_ALLOCATED_HACK - // If caller passed width / 2 for stride, adjust halfwidth to match. - if ((src_width & 1) && src_stride_u && src_halfwidth > Abs(src_stride_u)) { - src_halfwidth = src_width >> 1; - } - if ((dst_width & 1) && dst_stride_u && dst_halfwidth > Abs(dst_stride_u)) { - dst_halfwidth = dst_width >> 1; - } - // If caller used height / 2 when computing src_v, it will point into what - // should be the src_u plane. Detect this and reduce halfheight to match. - int uv_src_plane_size = src_halfwidth * src_halfheight; - if ((src_height & 1) && - (src_v > src_u) && (src_v < (src_u + uv_src_plane_size))) { - src_halfheight = src_height >> 1; - } - int uv_dst_plane_size = dst_halfwidth * dst_halfheight; - if ((dst_height & 1) && - (dst_v > dst_u) && (dst_v < (dst_u + uv_dst_plane_size))) { - dst_halfheight = dst_height >> 1; - } -#endif - - ScalePlane(src_y, src_stride_y, src_width, src_height, - dst_y, dst_stride_y, dst_width, dst_height, - filtering); - ScalePlane(src_u, src_stride_u, src_halfwidth, src_halfheight, - dst_u, dst_stride_u, dst_halfwidth, dst_halfheight, - filtering); - ScalePlane(src_v, src_stride_v, src_halfwidth, src_halfheight, - dst_v, dst_stride_v, dst_halfwidth, dst_halfheight, - filtering); - return 0; + return I420Scale(src_y, src_stride_y, + src_u, src_stride_u, + src_v, src_stride_v, + src_width, src_height, + dst_y, dst_stride_y, + dst_u, dst_stride_u, + dst_v, dst_stride_v, + dst_width, dst_height, + interpolate ? 
kFilterBox : kFilterNone); } // Deprecated api @@ -2425,15 +864,14 @@ int ScaleOffset(const uint8* src, int src_width, int src_height, bool interpolate) { if (!src || src_width <= 0 || src_height <= 0 || !dst || dst_width <= 0 || dst_height <= 0 || dst_yoffset < 0 || - src_width > 32767 || src_height > 32767 || dst_yoffset >= dst_height) { return -1; } dst_yoffset = dst_yoffset & ~1; // chroma requires offset to multiple of 2. - int src_halfwidth = Half(src_width); - int src_halfheight = Half(src_height); - int dst_halfwidth = Half(dst_width); - int dst_halfheight = Half(dst_height); + int src_halfwidth = SUBSAMPLE(src_width, 1, 1); + int src_halfheight = SUBSAMPLE(src_height, 1, 1); + int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1); + int dst_halfheight = SUBSAMPLE(dst_height, 1, 1); int aheight = dst_height - dst_yoffset * 2; // actual output height const uint8* src_y = src; const uint8* src_u = src + src_width * src_height; @@ -2444,9 +882,15 @@ int ScaleOffset(const uint8* src, int src_width, int src_height, (dst_yoffset >> 1) * dst_halfwidth; uint8* dst_v = dst + dst_width * dst_height + dst_halfwidth * dst_halfheight + (dst_yoffset >> 1) * dst_halfwidth; - return Scale(src_y, src_u, src_v, src_width, src_halfwidth, src_halfwidth, - src_width, src_height, dst_y, dst_u, dst_v, dst_width, - dst_halfwidth, dst_halfwidth, dst_width, aheight, interpolate); + return I420Scale(src_y, src_width, + src_u, src_halfwidth, + src_v, src_halfwidth, + src_width, src_height, + dst_y, dst_width, + dst_u, dst_halfwidth, + dst_v, dst_halfwidth, + dst_width, aheight, + interpolate ? kFilterBox : kFilterNone); } #ifdef __cplusplus diff --git a/chromium/third_party/libyuv/source/scale_argb.cc b/chromium/third_party/libyuv/source/scale_argb.cc index 5cf14d949ef..fb2222e2a23 100644 --- a/chromium/third_party/libyuv/source/scale_argb.cc +++ b/chromium/third_party/libyuv/source/scale_argb.cc @@ -16,6 +16,7 @@ #include "libyuv/cpu_id.h" #include "libyuv/planar_functions.h" // For CopyARGB #include "libyuv/row.h" +#include "libyuv/scale_row.h" #ifdef __cplusplus namespace libyuv { @@ -26,715 +27,9 @@ static __inline int Abs(int v) { return v >= 0 ? v : -v; } -// ARGB scaling uses bilinear or point, but not box filter. -#if !defined(LIBYUV_DISABLE_NEON) && \ - (defined(__ARM_NEON__) || defined(LIBYUV_NEON)) -#define HAS_SCALEARGBROWDOWNEVEN_NEON -#define HAS_SCALEARGBROWDOWN2_NEON -void ScaleARGBRowDownEven_NEON(const uint8* src_argb, int src_stride, - int src_stepx, - uint8* dst_argb, int dst_width); -void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, int src_stride, - int src_stepx, - uint8* dst_argb, int dst_width); -void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */, - uint8* dst, int dst_width); -void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); -#endif - -#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) -#define HAS_SCALEARGBROWDOWN2_SSE2 -// Reads 8 pixels, throws half away and writes 4 even pixels (0, 2, 4, 6) -// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. 
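// Editorial note (not part of the patch): for ARGB every pixel is 4 bytes, so
// the point-sampling row functions can move whole 32-bit words instead of
// individual channels; the C reference versions further down reinterpret the
// byte pointers as uint32 for exactly that reason. A minimal scalar sketch of
// 1/2 ARGB point sampling in that style (the function name is ours):

#include <stdint.h>

static void HalveARGBRowPointSample(const uint8_t* src_argb,
                                    uint8_t* dst_argb, int dst_width) {
  const uint32_t* src = (const uint32_t*)src_argb;
  uint32_t* dst = (uint32_t*)dst_argb;
  for (int x = 0; x < dst_width; ++x) {
    dst[x] = src[2 * x + 1];  // keep one pixel of each pair, copied as a word
  }
}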
-__declspec(naked) __declspec(align(16)) -static void ScaleARGBRowDown2_SSE2(const uint8* src_argb, - ptrdiff_t /* src_stride */, - uint8* dst_argb, int dst_width) { - __asm { - mov eax, [esp + 4] // src_argb - // src_stride ignored - mov edx, [esp + 12] // dst_argb - mov ecx, [esp + 16] // dst_width - - align 16 - wloop: - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - lea eax, [eax + 32] - shufps xmm0, xmm1, 0xdd - sub ecx, 4 - movdqa [edx], xmm0 - lea edx, [edx + 16] - jg wloop - - ret - } -} - -// Blends 8x2 rectangle to 4x1. -// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. -__declspec(naked) __declspec(align(16)) -static void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb, - ptrdiff_t src_stride, - uint8* dst_argb, int dst_width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // src_argb - mov esi, [esp + 4 + 8] // src_stride - mov edx, [esp + 4 + 12] // dst_argb - mov ecx, [esp + 4 + 16] // dst_width - - align 16 - wloop: - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - movdqa xmm2, [eax + esi] - movdqa xmm3, [eax + esi + 16] - lea eax, [eax + 32] - pavgb xmm0, xmm2 // average rows - pavgb xmm1, xmm3 - movdqa xmm2, xmm0 // average columns (8 to 4 pixels) - shufps xmm0, xmm1, 0x88 // even pixels - shufps xmm2, xmm1, 0xdd // odd pixels - pavgb xmm0, xmm2 - sub ecx, 4 - movdqa [edx], xmm0 - lea edx, [edx + 16] - jg wloop - - pop esi - ret - } -} - -#define HAS_SCALEARGBROWDOWNEVEN_SSE2 -// Reads 4 pixels at a time. -// Alignment requirement: dst_argb 16 byte aligned. -__declspec(naked) __declspec(align(16)) -void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride, - int src_stepx, - uint8* dst_argb, int dst_width) { - __asm { - push ebx - push edi - mov eax, [esp + 8 + 4] // src_argb - // src_stride ignored - mov ebx, [esp + 8 + 12] // src_stepx - mov edx, [esp + 8 + 16] // dst_argb - mov ecx, [esp + 8 + 20] // dst_width - lea ebx, [ebx * 4] - lea edi, [ebx + ebx * 2] - - align 16 - wloop: - movd xmm0, [eax] - movd xmm1, [eax + ebx] - punpckldq xmm0, xmm1 - movd xmm2, [eax + ebx * 2] - movd xmm3, [eax + edi] - lea eax, [eax + ebx * 4] - punpckldq xmm2, xmm3 - punpcklqdq xmm0, xmm2 - sub ecx, 4 - movdqa [edx], xmm0 - lea edx, [edx + 16] - jg wloop - - pop edi - pop ebx - ret - } -} - -// Blends four 2x2 to 4x1. -// Alignment requirement: dst_argb 16 byte aligned. 
-__declspec(naked) __declspec(align(16)) -static void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb, - ptrdiff_t src_stride, - int src_stepx, - uint8* dst_argb, int dst_width) { - __asm { - push ebx - push esi - push edi - mov eax, [esp + 12 + 4] // src_argb - mov esi, [esp + 12 + 8] // src_stride - mov ebx, [esp + 12 + 12] // src_stepx - mov edx, [esp + 12 + 16] // dst_argb - mov ecx, [esp + 12 + 20] // dst_width - lea esi, [eax + esi] // row1 pointer - lea ebx, [ebx * 4] - lea edi, [ebx + ebx * 2] - - align 16 - wloop: - movq xmm0, qword ptr [eax] // row0 4 pairs - movhps xmm0, qword ptr [eax + ebx] - movq xmm1, qword ptr [eax + ebx * 2] - movhps xmm1, qword ptr [eax + edi] - lea eax, [eax + ebx * 4] - movq xmm2, qword ptr [esi] // row1 4 pairs - movhps xmm2, qword ptr [esi + ebx] - movq xmm3, qword ptr [esi + ebx * 2] - movhps xmm3, qword ptr [esi + edi] - lea esi, [esi + ebx * 4] - pavgb xmm0, xmm2 // average rows - pavgb xmm1, xmm3 - movdqa xmm2, xmm0 // average columns (8 to 4 pixels) - shufps xmm0, xmm1, 0x88 // even pixels - shufps xmm2, xmm1, 0xdd // odd pixels - pavgb xmm0, xmm2 - sub ecx, 4 - movdqa [edx], xmm0 - lea edx, [edx + 16] - jg wloop - - pop edi - pop esi - pop ebx - ret - } -} - -// Column scaling unfiltered. SSSE3 version. -// TODO(fbarchard): Port to Neon - -#define HAS_SCALEARGBCOLS_SSE2 -__declspec(naked) __declspec(align(16)) -static void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx) { - __asm { - push esi - push edi - mov edi, [esp + 8 + 4] // dst_argb - mov esi, [esp + 8 + 8] // src_argb - mov ecx, [esp + 8 + 12] // dst_width - movd xmm2, [esp + 8 + 16] // x - movd xmm3, [esp + 8 + 20] // dx - pextrw eax, xmm2, 1 // get x0 integer. preroll - sub ecx, 2 - jl xloop29 - - movdqa xmm0, xmm2 // x1 = x0 + dx - paddd xmm0, xmm3 - punpckldq xmm2, xmm0 // x0 x1 - punpckldq xmm3, xmm3 // dx dx - paddd xmm3, xmm3 // dx * 2, dx * 2 - pextrw edx, xmm2, 3 // get x1 integer. preroll - - // 2 Pixel loop. - align 16 - xloop2: - paddd xmm2, xmm3 // x += dx - movd xmm0, qword ptr [esi + eax * 4] // 1 source x0 pixels - movd xmm1, qword ptr [esi + edx * 4] // 1 source x1 pixels - punpckldq xmm0, xmm1 // x0 x1 - pextrw eax, xmm2, 1 // get x0 integer. next iteration. - pextrw edx, xmm2, 3 // get x1 integer. next iteration. - movq qword ptr [edi], xmm0 - lea edi, [edi + 8] - sub ecx, 2 // 2 pixels - jge xloop2 - xloop29: - - add ecx, 2 - 1 - jl xloop99 - - // 1 pixel remainder - movd xmm0, qword ptr [esi + eax * 4] // 1 source x0 pixels - movd [edi], xmm0 - xloop99: - - pop edi - pop esi - ret - } -} - -// Bilinear row filtering combines 2x1 -> 1x1. SSSE3 version. 
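// Editorial note (not part of the patch): ScaleARGBCols_SSE2 above (and the
// filtered SSSE3 variant that follows) processes two destination pixels per
// iteration by packing the fixed-point positions x0 and x0 + dx into one
// register, stepping both by 2 * dx, and pulling the integer parts out with
// pextrw. A scalar model of that pairing (the >> 16 recovers the integer
// source column):

#include <stdint.h>

static void NearestARGBColsPairwise(uint32_t* dst, const uint32_t* src,
                                    int dst_width, int x, int dx) {
  int x0 = x;
  int x1 = x + dx;
  int i = 0;
  for (; i < dst_width - 1; i += 2) {
    dst[i] = src[x0 >> 16];
    dst[i + 1] = src[x1 >> 16];
    x0 += 2 * dx;  // both lanes advance by two output pixels
    x1 += 2 * dx;
  }
  if (dst_width & 1) {
    dst[i] = src[x0 >> 16];
  }
}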
-// TODO(fbarchard): Port to Neon - -// Shuffle table for arranging 2 pixels into pairs for pmaddubsw -static const uvec8 kShuffleColARGB = { - 0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel - 8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel -}; - -// Shuffle table for duplicating 2 fractions into 8 bytes each -static const uvec8 kShuffleFractions = { - 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, -}; - -#define HAS_SCALEARGBFILTERCOLS_SSSE3 -__declspec(naked) __declspec(align(16)) -static void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx) { - __asm { - push esi - push edi - mov edi, [esp + 8 + 4] // dst_argb - mov esi, [esp + 8 + 8] // src_argb - mov ecx, [esp + 8 + 12] // dst_width - movd xmm2, [esp + 8 + 16] // x - movd xmm3, [esp + 8 + 20] // dx - movdqa xmm4, kShuffleColARGB - movdqa xmm5, kShuffleFractions - pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction. - psrlw xmm6, 9 - pextrw eax, xmm2, 1 // get x0 integer. preroll - sub ecx, 2 - jl xloop29 - - movdqa xmm0, xmm2 // x1 = x0 + dx - paddd xmm0, xmm3 - punpckldq xmm2, xmm0 // x0 x1 - punpckldq xmm3, xmm3 // dx dx - paddd xmm3, xmm3 // dx * 2, dx * 2 - pextrw edx, xmm2, 3 // get x1 integer. preroll - - // 2 Pixel loop. - align 16 - xloop2: - movdqa xmm1, xmm2 // x0, x1 fractions. - paddd xmm2, xmm3 // x += dx - movq xmm0, qword ptr [esi + eax * 4] // 2 source x0 pixels - psrlw xmm1, 9 // 7 bit fractions. - movhps xmm0, qword ptr [esi + edx * 4] // 2 source x1 pixels - pshufb xmm1, xmm5 // 0000000011111111 - pshufb xmm0, xmm4 // arrange pixels into pairs - pxor xmm1, xmm6 // 0..7f and 7f..0 - pmaddubsw xmm0, xmm1 // argb_argb 16 bit, 2 pixels. - psrlw xmm0, 7 // argb 8.7 fixed point to low 8 bits. - pextrw eax, xmm2, 1 // get x0 integer. next iteration. - pextrw edx, xmm2, 3 // get x1 integer. next iteration. - packuswb xmm0, xmm0 // argb_argb 8 bits, 2 pixels. - movq qword ptr [edi], xmm0 - lea edi, [edi + 8] - sub ecx, 2 // 2 pixels - jge xloop2 - xloop29: - - add ecx, 2 - 1 - jl xloop99 - - // 1 pixel remainder - psrlw xmm2, 9 // 7 bit fractions. - movq xmm0, qword ptr [esi + eax * 4] // 2 source x0 pixels - pshufb xmm2, xmm5 // 00000000 - pshufb xmm0, xmm4 // arrange pixels into pairs - pxor xmm2, xmm6 // 0..7f and 7f..0 - pmaddubsw xmm0, xmm2 // argb 16 bit, 1 pixel. - psrlw xmm0, 7 - packuswb xmm0, xmm0 // argb 8 bits, 1 pixel. - movd [edi], xmm0 - xloop99: - - pop edi - pop esi - ret - } -} - -#elif !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) -// GCC versions of row functions are verbatim conversions from Visual C. 
-// Generated using gcc disassembly on Visual C object file: -// objdump -D yuvscaler.obj >yuvscaler.txt -#define HAS_SCALEARGBROWDOWN2_SSE2 -static void ScaleARGBRowDown2_SSE2(const uint8* src_argb, - ptrdiff_t /* src_stride */, - uint8* dst_argb, int dst_width) { - asm volatile ( - ".p2align 4 \n" - "1: \n" - "movdqa (%0),%%xmm0 \n" - "movdqa 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "shufps $0xdd,%%xmm1,%%xmm0 \n" - "sub $0x4,%2 \n" - "movdqa %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(dst_width) // %2 - : - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1" -#endif - ); -} - -static void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb, - ptrdiff_t src_stride, - uint8* dst_argb, int dst_width) { - asm volatile ( - ".p2align 4 \n" - "1: \n" - "movdqa (%0),%%xmm0 \n" - "movdqa 0x10(%0),%%xmm1 \n" - "movdqa (%0,%3,1),%%xmm2 \n" - "movdqa 0x10(%0,%3,1),%%xmm3 \n" - "lea 0x20(%0),%0 \n" - "pavgb %%xmm2,%%xmm0 \n" - "pavgb %%xmm3,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm2 \n" - "pavgb %%xmm2,%%xmm0 \n" - "sub $0x4,%2 \n" - "movdqa %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(dst_width) // %2 - : "r"(static_cast<intptr_t>(src_stride)) // %3 - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3" -#endif - ); -} - -#define HAS_SCALEARGBROWDOWNEVEN_SSE2 -// Reads 4 pixels at a time. -// Alignment requirement: dst_argb 16 byte aligned. -void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride, - int src_stepx, - uint8* dst_argb, int dst_width) { - intptr_t src_stepx_x4 = static_cast<intptr_t>(src_stepx); - intptr_t src_stepx_x12 = 0; - asm volatile ( - "lea 0x0(,%1,4),%1 \n" - "lea (%1,%1,2),%4 \n" - ".p2align 4 \n" - "1: \n" - "movd (%0),%%xmm0 \n" - "movd (%0,%1,1),%%xmm1 \n" - "punpckldq %%xmm1,%%xmm0 \n" - "movd (%0,%1,2),%%xmm2 \n" - "movd (%0,%4,1),%%xmm3 \n" - "lea (%0,%1,4),%0 \n" - "punpckldq %%xmm3,%%xmm2 \n" - "punpcklqdq %%xmm2,%%xmm0 \n" - "sub $0x4,%3 \n" - "movdqa %%xmm0,(%2) \n" - "lea 0x10(%2),%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(src_stepx_x4), // %1 - "+r"(dst_argb), // %2 - "+r"(dst_width), // %3 - "+r"(src_stepx_x12) // %4 - : - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3" -#endif - ); -} - -// Blends four 2x2 to 4x1. -// Alignment requirement: dst_argb 16 byte aligned. 
-static void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb, - ptrdiff_t src_stride, int src_stepx, - uint8* dst_argb, int dst_width) { - intptr_t src_stepx_x4 = static_cast<intptr_t>(src_stepx); - intptr_t src_stepx_x12 = 0; - intptr_t row1 = static_cast<intptr_t>(src_stride); - asm volatile ( - "lea 0x0(,%1,4),%1 \n" - "lea (%1,%1,2),%4 \n" - "lea (%0,%5,1),%5 \n" - ".p2align 4 \n" - "1: \n" - "movq (%0),%%xmm0 \n" - "movhps (%0,%1,1),%%xmm0 \n" - "movq (%0,%1,2),%%xmm1 \n" - "movhps (%0,%4,1),%%xmm1 \n" - "lea (%0,%1,4),%0 \n" - "movq (%5),%%xmm2 \n" - "movhps (%5,%1,1),%%xmm2 \n" - "movq (%5,%1,2),%%xmm3 \n" - "movhps (%5,%4,1),%%xmm3 \n" - "lea (%5,%1,4),%5 \n" - "pavgb %%xmm2,%%xmm0 \n" - "pavgb %%xmm3,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm2 \n" - "pavgb %%xmm2,%%xmm0 \n" - "sub $0x4,%3 \n" - "movdqa %%xmm0,(%2) \n" - "lea 0x10(%2),%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(src_stepx_x4), // %1 - "+r"(dst_argb), // %2 - "+rm"(dst_width), // %3 - "+r"(src_stepx_x12), // %4 - "+r"(row1) // %5 - : - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3" -#endif - ); -} - -#define HAS_SCALEARGBCOLS_SSE2 -static void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx) { - intptr_t x0 = 0, x1 = 0; - asm volatile ( - "movd %5,%%xmm2 \n" - "movd %6,%%xmm3 \n" - "pextrw $0x1,%%xmm2,%k3 \n" - "sub $0x2,%2 \n" - "jl 29f \n" - "movdqa %%xmm2,%%xmm0 \n" - "paddd %%xmm3,%%xmm0 \n" - "punpckldq %%xmm0,%%xmm2 \n" - "punpckldq %%xmm3,%%xmm3 \n" - "paddd %%xmm3,%%xmm3 \n" - "pextrw $0x3,%%xmm2,%k4 \n" - - ".p2align 4 \n" - "2: \n" - "paddd %%xmm3,%%xmm2 \n" - "movd (%1,%3,4),%%xmm0 \n" - "movd (%1,%4,4),%%xmm1 \n" - "punpckldq %%xmm1,%%xmm0 \n" - "pextrw $0x1,%%xmm2,%k3 \n" - "pextrw $0x3,%%xmm2,%k4 \n" - "movq %%xmm0,(%0) \n" - "lea 0x8(%0),%0 \n" - "sub $0x2,%2 \n" - "jge 2b \n" - - "29: \n" - "add $0x1,%2 \n" - "jl 99f \n" - "movd (%1,%3,4),%%xmm0 \n" - "movd %%xmm0,(%0) \n" - "99: \n" - : "+r"(dst_argb), // %0 - "+r"(src_argb), // %1 - "+rm"(dst_width), // %2 - "+r"(x0), // %3 - "+r"(x1) // %4 - : "rm"(x), // %5 - "rm"(dx) // %6 - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3" -#endif - ); -} - -#ifdef __APPLE__ -#define CONST -#else -#define CONST static const -#endif - -// Shuffle table for arranging 2 pixels into pairs for pmaddubsw -CONST uvec8 kShuffleColARGB = { - 0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel - 8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel -}; - -// Shuffle table for duplicating 2 fractions into 8 bytes each -CONST uvec8 kShuffleFractions = { - 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, -}; - -// Bilinear row filtering combines 4x2 -> 4x1. 
SSSE3 version -#define HAS_SCALEARGBFILTERCOLS_SSSE3 -static void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx) { - intptr_t x0 = 0, x1 = 0; - asm volatile ( - "movdqa %0,%%xmm4 \n" - "movdqa %1,%%xmm5 \n" - : - : "m"(kShuffleColARGB), // %0 - "m"(kShuffleFractions) // %1 - ); - - asm volatile ( - "movd %5,%%xmm2 \n" - "movd %6,%%xmm3 \n" - "pcmpeqb %%xmm6,%%xmm6 \n" - "psrlw $0x9,%%xmm6 \n" - "pextrw $0x1,%%xmm2,%k3 \n" - "sub $0x2,%2 \n" - "jl 29f \n" - "movdqa %%xmm2,%%xmm0 \n" - "paddd %%xmm3,%%xmm0 \n" - "punpckldq %%xmm0,%%xmm2 \n" - "punpckldq %%xmm3,%%xmm3 \n" - "paddd %%xmm3,%%xmm3 \n" - "pextrw $0x3,%%xmm2,%k4 \n" - - ".p2align 4 \n" - "2: \n" - "movdqa %%xmm2,%%xmm1 \n" - "paddd %%xmm3,%%xmm2 \n" - "movq (%1,%3,4),%%xmm0 \n" - "psrlw $0x9,%%xmm1 \n" - "movhps (%1,%4,4),%%xmm0 \n" - "pshufb %%xmm5,%%xmm1 \n" - "pshufb %%xmm4,%%xmm0 \n" - "pxor %%xmm6,%%xmm1 \n" - "pmaddubsw %%xmm1,%%xmm0 \n" - "psrlw $0x7,%%xmm0 \n" - "pextrw $0x1,%%xmm2,%k3 \n" - "pextrw $0x3,%%xmm2,%k4 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movq %%xmm0,(%0) \n" - "lea 0x8(%0),%0 \n" - "sub $0x2,%2 \n" - "jge 2b \n" - - "29: \n" - "add $0x1,%2 \n" - "jl 99f \n" - "psrlw $0x9,%%xmm2 \n" - "movq (%1,%3,4),%%xmm0 \n" - "pshufb %%xmm5,%%xmm2 \n" - "pshufb %%xmm4,%%xmm0 \n" - "pxor %%xmm6,%%xmm2 \n" - "pmaddubsw %%xmm2,%%xmm0 \n" - "psrlw $0x7,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movd %%xmm0,(%0) \n" - "99: \n" - : "+r"(dst_argb), // %0 - "+r"(src_argb), // %1 - "+rm"(dst_width), // %2 - "+r"(x0), // %3 - "+r"(x1) // %4 - : "rm"(x), // %5 - "rm"(dx) // %6 - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" -#endif - ); -} -#endif // defined(__x86_64__) || defined(__i386__) - -static void ScaleARGBRowDown2_C(const uint8* src_argb, - ptrdiff_t /* src_stride */, - uint8* dst_argb, int dst_width) { - const uint32* src = reinterpret_cast<const uint32*>(src_argb); - uint32* dst = reinterpret_cast<uint32*>(dst_argb); - - for (int x = 0; x < dst_width - 1; x += 2) { - dst[0] = src[1]; - dst[1] = src[3]; - src += 4; - dst += 2; - } - if (dst_width & 1) { - dst[0] = src[1]; - } -} - -static void ScaleARGBRowDown2Box_C(const uint8* src_argb, ptrdiff_t src_stride, - uint8* dst_argb, int dst_width) { - for (int x = 0; x < dst_width; ++x) { - dst_argb[0] = (src_argb[0] + src_argb[4] + - src_argb[src_stride] + src_argb[src_stride + 4] + 2) >> 2; - dst_argb[1] = (src_argb[1] + src_argb[5] + - src_argb[src_stride + 1] + src_argb[src_stride + 5] + 2) >> 2; - dst_argb[2] = (src_argb[2] + src_argb[6] + - src_argb[src_stride + 2] + src_argb[src_stride + 6] + 2) >> 2; - dst_argb[3] = (src_argb[3] + src_argb[7] + - src_argb[src_stride + 3] + src_argb[src_stride + 7] + 2) >> 2; - src_argb += 8; - dst_argb += 4; - } -} - -void ScaleARGBRowDownEven_C(const uint8* src_argb, ptrdiff_t /* src_stride */, - int src_stepx, - uint8* dst_argb, int dst_width) { - const uint32* src = reinterpret_cast<const uint32*>(src_argb); - uint32* dst = reinterpret_cast<uint32*>(dst_argb); - - for (int x = 0; x < dst_width - 1; x += 2) { - dst[0] = src[0]; - dst[1] = src[src_stepx]; - src += src_stepx * 2; - dst += 2; - } - if (dst_width & 1) { - dst[0] = src[0]; - } -} - -static void ScaleARGBRowDownEvenBox_C(const uint8* src_argb, - ptrdiff_t src_stride, - int src_stepx, - uint8* dst_argb, int dst_width) { - for (int x = 0; x < dst_width; ++x) { - dst_argb[0] = (src_argb[0] + src_argb[4] + - src_argb[src_stride] + src_argb[src_stride + 4] + 2) >> 2; - 
dst_argb[1] = (src_argb[1] + src_argb[5] + - src_argb[src_stride + 1] + src_argb[src_stride + 5] + 2) >> 2; - dst_argb[2] = (src_argb[2] + src_argb[6] + - src_argb[src_stride + 2] + src_argb[src_stride + 6] + 2) >> 2; - dst_argb[3] = (src_argb[3] + src_argb[7] + - src_argb[src_stride + 3] + src_argb[src_stride + 7] + 2) >> 2; - src_argb += src_stepx * 4; - dst_argb += 4; - } -} - -// Mimics SSSE3 blender -#define BLENDER1(a, b, f) ((a) * (0x7f ^ f) + (b) * f) >> 7 -#define BLENDERC(a, b, f, s) static_cast<uint32>( \ - BLENDER1(((a) >> s) & 255, ((b) >> s) & 255, f) << s) -#define BLENDER(a, b, f) \ - BLENDERC(a, b, f, 24) | BLENDERC(a, b, f, 16) | \ - BLENDERC(a, b, f, 8) | BLENDERC(a, b, f, 0) - -static void ScaleARGBFilterCols_C(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx) { - const uint32* src = reinterpret_cast<const uint32*>(src_argb); - uint32* dst = reinterpret_cast<uint32*>(dst_argb); - for (int j = 0; j < dst_width - 1; j += 2) { - int xi = x >> 16; - int xf = (x >> 9) & 0x7f; - uint32 a = src[xi]; - uint32 b = src[xi + 1]; - dst[0] = BLENDER(a, b, xf); - x += dx; - xi = x >> 16; - xf = (x >> 9) & 0x7f; - a = src[xi]; - b = src[xi + 1]; - dst[1] = BLENDER(a, b, xf); - x += dx; - dst += 2; - } - if (dst_width & 1) { - int xi = x >> 16; - int xf = (x >> 9) & 0x7f; - uint32 a = src[xi]; - uint32 b = src[xi + 1]; - dst[0] = BLENDER(a, b, xf); - } -} - // ScaleARGB ARGB, 1/2 // This is an optimized version for scaling down a ARGB to 1/2 of // its original size. - static void ScaleARGBDown2(int /* src_width */, int /* src_height */, int dst_width, int dst_height, int src_stride, int dst_stride, @@ -743,8 +38,8 @@ static void ScaleARGBDown2(int /* src_width */, int /* src_height */, FilterMode filtering) { assert(dx == 65536 * 2); // Test scale factor of 2. assert((dy & 0x1ffff) == 0); // Test vertical scale is multiple of 2. - // Advance to odd row / even column. - if (filtering) { + // Advance to odd row, even column. + if (filtering == kFilterBilinear) { src_argb += (y >> 16) * src_stride + (x >> 16) * 4; } else { src_argb += (y >> 16) * src_stride + ((x >> 16) - 1) * 4; @@ -752,13 +47,16 @@ static void ScaleARGBDown2(int /* src_width */, int /* src_height */, int row_stride = src_stride * (dy >> 16); void (*ScaleARGBRowDown2)(const uint8* src_argb, ptrdiff_t src_stride, uint8* dst_argb, int dst_width) = - filtering ? ScaleARGBRowDown2Box_C : ScaleARGBRowDown2_C; + filtering == kFilterNone ? ScaleARGBRowDown2_C : + (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_C : + ScaleARGBRowDown2Box_C); #if defined(HAS_SCALEARGBROWDOWN2_SSE2) if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4) && IS_ALIGNED(src_argb, 16) && IS_ALIGNED(row_stride, 16) && IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) { - ScaleARGBRowDown2 = filtering ? ScaleARGBRowDown2Box_SSE2 : - ScaleARGBRowDown2_SSE2; + ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_SSE2 : + (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_SSE2 : + ScaleARGBRowDown2Box_SSE2); } #elif defined(HAS_SCALEARGBROWDOWN2_NEON) if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8) && @@ -768,7 +66,9 @@ static void ScaleARGBDown2(int /* src_width */, int /* src_height */, } #endif - // TODO(fbarchard): Loop through source height to allow odd height. 
+ if (filtering == kFilterLinear) { + src_stride = 0; + } for (int y = 0; y < dst_height; ++y) { ScaleARGBRowDown2(src_argb, src_stride, dst_argb, dst_width); src_argb += row_stride; @@ -776,6 +76,49 @@ static void ScaleARGBDown2(int /* src_width */, int /* src_height */, } } +// ScaleARGB ARGB, 1/4 +// This is an optimized version for scaling down a ARGB to 1/4 of +// its original size. +static void ScaleARGBDown4Box(int /* src_width */, int /* src_height */, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_argb, uint8* dst_argb, + int x, int dx, int y, int dy) { + assert(dx == 65536 * 4); // Test scale factor of 4. + assert((dy & 0x3ffff) == 0); // Test vertical scale is multiple of 4. + + assert(dst_width * 2 <= kMaxStride); + // TODO(fbarchard): Remove clip_src_width alignment checks. + SIMD_ALIGNED(uint8 row[kMaxStride * 2 + 16]); + + // Advance to odd row, even column. + src_argb += (y >> 16) * src_stride + (x >> 16) * 4; + int row_stride = src_stride * (dy >> 16); + void (*ScaleARGBRowDown2)(const uint8* src_argb, ptrdiff_t src_stride, + uint8* dst_argb, int dst_width) = ScaleARGBRowDown2Box_C; +#if defined(HAS_SCALEARGBROWDOWN2_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4) && + IS_ALIGNED(src_argb, 16) && IS_ALIGNED(row_stride, 16) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) { + ScaleARGBRowDown2 = ScaleARGBRowDown2Box_SSE2; + } +#elif defined(HAS_SCALEARGBROWDOWN2_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8) && + IS_ALIGNED(src_argb, 4) && IS_ALIGNED(row_stride, 4)) { + ScaleARGBRowDown2 = ScaleARGBRowDown2Box_NEON; + } +#endif + + for (int y = 0; y < dst_height; ++y) { + ScaleARGBRowDown2(src_argb, src_stride, row, dst_width * 2); + ScaleARGBRowDown2(src_argb + src_stride * 2, src_stride, + row + kMaxStride, dst_width * 2); + ScaleARGBRowDown2(row, kMaxStride, dst_argb, dst_width); + src_argb += row_stride; + dst_argb += dst_stride; + } +} + // ScaleARGB ARGB Even // This is an optimized version for scaling down a ARGB to even // multiple of its original size. @@ -807,6 +150,9 @@ static void ScaleARGBDownEven(int src_width, int src_height, } #endif + if (filtering == kFilterLinear) { + src_stride = 0; + } for (int y = 0; y < dst_height; ++y) { ScaleARGBRowDownEven(src_argb, src_stride, col_step, dst_argb, dst_width); src_argb += row_stride; @@ -815,11 +161,13 @@ static void ScaleARGBDownEven(int src_width, int src_height, } // Scale ARGB down with bilinear interpolation. +SAFEBUFFERS static void ScaleARGBBilinearDown(int src_height, int dst_width, int dst_height, int src_stride, int dst_stride, const uint8* src_argb, uint8* dst_argb, - int x, int dx, int y, int dy) { + int x, int dx, int y, int dy, + FilterMode filtering) { assert(src_height > 0); assert(dst_width > 0); assert(dst_height > 0); @@ -859,6 +207,14 @@ static void ScaleARGBBilinearDown(int src_height, } } #endif +#if defined(HAS_INTERPOLATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && clip_src_width >= 32) { + InterpolateRow = InterpolateRow_Any_AVX2; + if (IS_ALIGNED(clip_src_width, 32)) { + InterpolateRow = InterpolateRow_AVX2; + } + } +#endif #if defined(HAS_INTERPOLATEROW_NEON) if (TestCpuFlag(kCpuHasNEON) && clip_src_width >= 16) { InterpolateRow = InterpolateRow_Any_NEON; @@ -883,27 +239,33 @@ static void ScaleARGBBilinearDown(int src_height, ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3; } #endif - int maxy = (src_height > 1) ? 
((src_height - 1) << 16) - 1 : 0; + const int max_y = (src_height - 1) << 16; for (int j = 0; j < dst_height; ++j) { - if (y > maxy) { - y = maxy; + if (y > max_y) { + y = max_y; } int yi = y >> 16; - int yf = (y >> 8) & 255; const uint8* src = src_argb + yi * src_stride; - InterpolateRow(row, src, src_stride, clip_src_width, yf); - ScaleARGBFilterCols(dst_argb, row, dst_width, x, dx); + if (filtering == kFilterLinear) { + ScaleARGBFilterCols(dst_argb, src, dst_width, x, dx); + } else { + int yf = (y >> 8) & 255; + InterpolateRow(row, src, src_stride, clip_src_width, yf); + ScaleARGBFilterCols(dst_argb, row, dst_width, x, dx); + } dst_argb += dst_stride; y += dy; } } // Scale ARGB up with bilinear interpolation. +SAFEBUFFERS static void ScaleARGBBilinearUp(int src_width, int src_height, int dst_width, int dst_height, int src_stride, int dst_stride, const uint8* src_argb, uint8* dst_argb, - int x, int dx, int y, int dy) { + int x, int dx, int y, int dy, + FilterMode filtering) { assert(src_width > 0); assert(src_height > 0); assert(dst_width > 0); @@ -934,6 +296,14 @@ static void ScaleARGBBilinearUp(int src_width, int src_height, } } #endif +#if defined(HAS_INTERPOLATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && dst_width >= 8) { + InterpolateRow = InterpolateRow_Any_AVX2; + if (IS_ALIGNED(dst_width, 8)) { + InterpolateRow = InterpolateRow_AVX2; + } + } +#endif #if defined(HAS_INTERPOLATEROW_NEON) if (TestCpuFlag(kCpuHasNEON) && dst_width >= 4) { InterpolateRow = InterpolateRow_Any_NEON; @@ -949,15 +319,32 @@ static void ScaleARGBBilinearUp(int src_width, int src_height, } #endif void (*ScaleARGBFilterCols)(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx) = ScaleARGBFilterCols_C; + int dst_width, int x, int dx) = + filtering ? ScaleARGBFilterCols_C : ScaleARGBCols_C; #if defined(HAS_SCALEARGBFILTERCOLS_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { + if (filtering && TestCpuFlag(kCpuHasSSSE3)) { ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3; } #endif - int maxy = (src_height > 1) ? 
((src_height - 1) << 16) - 1 : 0; - if (y > maxy) { - y = maxy; +#if defined(HAS_SCALEARGBCOLS_SSE2) + if (!filtering && TestCpuFlag(kCpuHasSSE2)) { + ScaleARGBFilterCols = ScaleARGBCols_SSE2; + } +#endif + if (!filtering && src_width * 2 == dst_width && x < 0x8000) { + ScaleARGBFilterCols = ScaleARGBColsUp2_C; +#if defined(HAS_SCALEARGBCOLSUP2_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) && + IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) { + ScaleARGBFilterCols = ScaleARGBColsUp2_SSE2; + } +#endif + } + + const int max_y = (src_height - 1) << 16; + if (y > max_y) { + y = max_y; } int yi = y >> 16; const uint8* src = src_argb + yi * src_stride; @@ -976,7 +363,12 @@ static void ScaleARGBBilinearUp(int src_width, int src_height, for (int j = 0; j < dst_height; ++j) { yi = y >> 16; if (yi != lasty) { - if (y <= maxy) { + if (y > max_y) { + y = max_y; + yi = y >> 16; + src = src_argb + yi * src_stride; + } + if (yi != lasty) { ScaleARGBFilterCols(rowptr, src, dst_width, x, dx); rowptr += rowstride; rowstride = -rowstride; @@ -984,33 +376,205 @@ static void ScaleARGBBilinearUp(int src_width, int src_height, src += src_stride; } } - int yf = (y >> 8) & 255; - InterpolateRow(dst_argb, rowptr, rowstride, dst_width * 4, yf); + if (filtering == kFilterLinear) { + InterpolateRow(dst_argb, rowptr, 0, dst_width * 4, 0); + } else { + int yf = (y >> 8) & 255; + InterpolateRow(dst_argb, rowptr, rowstride, dst_width * 4, yf); + } dst_argb += dst_stride; y += dy; } } -// Scales a single row of pixels using point sampling. -// Code is adapted from libyuv bilinear yuv scaling, but with bilinear -// interpolation off, and argb pixels instead of yuv. -static void ScaleARGBCols_C(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx) { - const uint32* src = reinterpret_cast<const uint32*>(src_argb); - uint32* dst = reinterpret_cast<uint32*>(dst_argb); - for (int j = 0; j < dst_width - 1; j += 2) { - dst[0] = src[x >> 16]; - x += dx; - dst[1] = src[x >> 16]; - x += dx; - dst += 2; +#ifdef YUVSCALEUP +// Scale YUV to ARGB up with bilinear interpolation. 
+SAFEBUFFERS +static void ScaleYUVToARGBBilinearUp(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride_y, + int src_stride_u, + int src_stride_v, + int dst_stride_argb, + const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int x, int dx, int y, int dy, + FilterMode filtering) { + assert(src_width > 0); + assert(src_height > 0); + assert(dst_width > 0); + assert(dst_height > 0); + assert(dst_width * 4 <= kMaxStride); + + void (*I422ToARGBRow)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) = I422ToARGBRow_C; +#if defined(HAS_I422TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && src_width >= 8) { + I422ToARGBRow = I422ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(src_width, 8)) { + I422ToARGBRow = I422ToARGBRow_Unaligned_SSSE3; + if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + I422ToARGBRow = I422ToARGBRow_SSSE3; + } + } + } +#endif +#if defined(HAS_I422TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && src_width >= 16) { + I422ToARGBRow = I422ToARGBRow_Any_AVX2; + if (IS_ALIGNED(src_width, 16)) { + I422ToARGBRow = I422ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_I422TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && src_width >= 8) { + I422ToARGBRow = I422ToARGBRow_Any_NEON; + if (IS_ALIGNED(src_width, 8)) { + I422ToARGBRow = I422ToARGBRow_NEON; + } + } +#endif +#if defined(HAS_I422TOARGBROW_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(src_width, 4) && + IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) && + IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) && + IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) && + IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) { + I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2; + } +#endif + + void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb, + ptrdiff_t src_stride, int dst_width, int source_y_fraction) = + InterpolateRow_C; +#if defined(HAS_INTERPOLATEROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && dst_width >= 4) { + InterpolateRow = InterpolateRow_Any_SSE2; + if (IS_ALIGNED(dst_width, 4)) { + InterpolateRow = InterpolateRow_Unaligned_SSE2; + if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + InterpolateRow = InterpolateRow_SSE2; + } + } + } +#endif +#if defined(HAS_INTERPOLATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && dst_width >= 4) { + InterpolateRow = InterpolateRow_Any_SSSE3; + if (IS_ALIGNED(dst_width, 4)) { + InterpolateRow = InterpolateRow_Unaligned_SSSE3; + if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + InterpolateRow = InterpolateRow_SSSE3; + } + } + } +#endif +#if defined(HAS_INTERPOLATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && dst_width >= 8) { + InterpolateRow = InterpolateRow_Any_AVX2; + if (IS_ALIGNED(dst_width, 8)) { + InterpolateRow = InterpolateRow_AVX2; + } } - if (dst_width & 1) { - dst[0] = src[x >> 16]; +#endif +#if defined(HAS_INTERPOLATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && dst_width >= 4) { + InterpolateRow = InterpolateRow_Any_NEON; + if (IS_ALIGNED(dst_width, 4)) { + InterpolateRow = InterpolateRow_NEON; + } + } +#endif +#if defined(HAS_INTERPOLATEROWS_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && dst_width >= 1 && + IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) { + InterpolateRow = InterpolateRow_MIPS_DSPR2; + } +#endif + void (*ScaleARGBFilterCols)(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx) = ScaleARGBFilterCols_C; +#if defined(HAS_SCALEARGBFILTERCOLS_SSSE3) + if 
(TestCpuFlag(kCpuHasSSSE3)) { + ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3; + } +#endif + const int max_y = (src_height - 1) << 16; + if (y > max_y) { + y = max_y; + } + const int kYShift = 1; // Shift Y by 1 to convert Y plane to UV coordinate. + int yi = y >> 16; + int uv_yi = yi >> kYShift; + const uint8* src_row_y = src_y + yi * src_stride_y; + const uint8* src_row_u = src_u + uv_yi * src_stride_u; + const uint8* src_row_v = src_v + uv_yi * src_stride_v; + SIMD_ALIGNED(uint8 row[2 * kMaxStride]); + SIMD_ALIGNED(uint8 argb_row[kMaxStride * 4]); + uint8* rowptr = row; + int rowstride = kMaxStride; + int lasty = yi; + + ScaleARGBFilterCols(rowptr, src_row_y, dst_width, x, dx); + if (src_height > 1) { + src_row_y += src_stride_y; + if (yi & 1) { + src_row_u += src_stride_u; + src_row_v += src_stride_v; + } + } + ScaleARGBFilterCols(rowptr + rowstride, src_row_y, dst_width, x, dx); + if (src_height > 2) { + src_row_y += src_stride_y; + if (!(yi & 1)) { + src_row_u += src_stride_u; + src_row_v += src_stride_v; + } + } + + for (int j = 0; j < dst_height; ++j) { + yi = y >> 16; + if (yi != lasty) { + if (y > max_y) { + y = max_y; + yi = y >> 16; + uv_yi = yi >> kYShift; + src_row_y = src_y + yi * src_stride_y; + src_row_u = src_u + uv_yi * src_stride_u; + src_row_v = src_v + uv_yi * src_stride_v; + } + if (yi != lasty) { + // TODO(fbarchard): Convert the clipped region of row. + I422ToARGBRow(src_row_y, src_row_u, src_row_v, argb_row, src_width); + ScaleARGBFilterCols(rowptr, argb_row, dst_width, x, dx); + rowptr += rowstride; + rowstride = -rowstride; + lasty = yi; + src_row_y += src_stride_y; + if (yi & 1) { + src_row_u += src_stride_u; + src_row_v += src_stride_v; + } + } + } + if (filtering == kFilterLinear) { + InterpolateRow(dst_argb, rowptr, 0, dst_width * 4, 0); + } else { + int yf = (y >> 8) & 255; + InterpolateRow(dst_argb, rowptr, rowstride, dst_width * 4, yf); + } + dst_argb += dst_stride_argb; + y += dy; } } +#endif -// ScaleARGB ARGB to/from any dimensions, without interpolation. +// Scale ARGB to/from any dimensions, without interpolation. // Fixed point math is used for performance: The upper 16 bits // of x and dx is the integer part of the source position and // the lower 16 bits are the fixed decimal part. @@ -1027,6 +591,16 @@ static void ScaleARGBSimple(int src_width, int src_height, ScaleARGBCols = ScaleARGBCols_SSE2; } #endif + if (src_width * 2 == dst_width && x < 0x8000) { + ScaleARGBCols = ScaleARGBColsUp2_C; +#if defined(HAS_SCALEARGBCOLSUP2_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) && + IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) { + ScaleARGBCols = ScaleARGBColsUp2_SSE2; + } +#endif + } for (int i = 0; i < dst_height; ++i) { ScaleARGBCols(dst_argb, src_argb + (y >> 16) * src_stride, @@ -1036,33 +610,6 @@ static void ScaleARGBSimple(int src_width, int src_height, } } -// ScaleARGB ARGB to/from any dimensions. 
-static void ScaleARGBAnySize(int src_width, int src_height, - int dst_width, int dst_height, - int clip_width, int clip_height, - int src_stride, int dst_stride, - const uint8* src_argb, uint8* dst_argb, - int x, int dx, int y, int dy, - FilterMode filtering) { - if (filtering && dy < 65536 && dst_width * 4 <= kMaxStride) { - ScaleARGBBilinearUp(src_width, src_height, - clip_width, clip_height, - src_stride, dst_stride, src_argb, dst_argb, - x, dx, y, dy); - return; - } - if (filtering && src_width * 4 < kMaxStride) { - ScaleARGBBilinearDown(src_height, - clip_width, clip_height, - src_stride, dst_stride, src_argb, dst_argb, - x, dx, y, dy); - return; - } - ScaleARGBSimple(src_width, src_height, clip_width, clip_height, - src_stride, dst_stride, src_argb, dst_argb, - x, dx, y, dy); -} - // ScaleARGB a ARGB. // This function in turn calls a scaling function // suitable for handling the desired resolutions. @@ -1072,6 +619,12 @@ static void ScaleARGB(const uint8* src, int src_stride, int dst_width, int dst_height, int clip_x, int clip_y, int clip_width, int clip_height, FilterMode filtering) { + // ARGB does not support box filter yet, but allow the user to pass it. + // Simplify filtering when possible. + filtering = ScaleFilterReduce(src_width, src_height, + dst_width, dst_height, + filtering); + // Negative src_height means invert the image. if (src_height < 0) { src_height = -src_height; @@ -1079,37 +632,12 @@ static void ScaleARGB(const uint8* src, int src_stride, src_stride = -src_stride; } // Initial source x/y coordinate and step values as 16.16 fixed point. - int dx = 0; - int dy = 0; int x = 0; int y = 0; - if (filtering) { - // Scale step for bilinear sampling renders last pixel once for upsample. - if (dst_width <= Abs(src_width)) { - dx = (Abs(src_width) << 16) / dst_width; - x = (dx >> 1) - 32768; - } else if (dst_width > 1) { - dx = ((Abs(src_width) - 1) << 16) / (dst_width - 1); - } - if (dst_height <= src_height) { - dy = (src_height << 16) / dst_height; - y = (dy >> 1) - 32768; - } else if (dst_height > 1) { - dy = ((src_height - 1) << 16) / (dst_height - 1); - } - } else { - // Scale step for point sampling duplicates all pixels equally. - dx = (Abs(src_width) << 16) / dst_width; - dy = (src_height << 16) / dst_height; - x = dx >> 1; - y = dy >> 1; - } - // Negative src_width means horizontally mirror. - if (src_width < 0) { - x += (dst_width - 1) * dx; - dx = -dx; - src_width = -src_width; - } + int dx = 0; + int dy = 0; + ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, + &x, &y, &dx, &dy); if (clip_x) { x += clip_x * dx; dst += clip_x * 4; @@ -1121,19 +649,29 @@ static void ScaleARGB(const uint8* src, int src_stride, // Special case for integer step values. if (((dx | dy) & 0xffff) == 0) { - if (!dx || !dy) { + if (!dx || !dy) { // 1 pixel wide and/or tall. filtering = kFilterNone; } else { // Optimized even scale down. ie 2, 4, 6, 8, 10x. if (!(dx & 0x10000) && !(dy & 0x10000)) { - if ((dx >> 16) == 2) { - // Optimized 1/2 horizontal. - ScaleARGBDown2(src_width, src_height, clip_width, clip_height, + if (dx == 0x20000) { + // Optimized 1/2 downsample. + ScaleARGBDown2(src_width, src_height, + clip_width, clip_height, src_stride, dst_stride, src, dst, x, dx, y, dy, filtering); return; } - ScaleARGBDownEven(src_width, src_height, clip_width, clip_height, + if (dx == 0x40000 && filtering == kFilterBox) { + // Optimized 1/4 box downsample. 
+ ScaleARGBDown4Box(src_width, src_height, + clip_width, clip_height, + src_stride, dst_stride, src, dst, + x, dx, y, dy); + return; + } + ScaleARGBDownEven(src_width, src_height, + clip_width, clip_height, src_stride, dst_stride, src, dst, x, dx, y, dy, filtering); return; @@ -1141,7 +679,7 @@ static void ScaleARGB(const uint8* src, int src_stride, // Optimized odd scale down. ie 3, 5, 7, 9x. if ((dx & 0x10000) && (dy & 0x10000)) { filtering = kFilterNone; - if (dst_width == src_width && dst_height == src_height) { + if (dx == 0x10000 && dy == 0x10000) { // Straight copy. ARGBCopy(src + (y >> 16) * src_stride + (x >> 16) * 4, src_stride, dst, dst_stride, clip_width, clip_height); @@ -1150,11 +688,31 @@ static void ScaleARGB(const uint8* src, int src_stride, } } } - // Arbitrary scale up and/or down. - ScaleARGBAnySize(src_width, src_height, - dst_width, dst_height, - clip_width, clip_height, - src_stride, dst_stride, src, dst, x, dx, y, dy, filtering); + if (dx == 0x10000 && (x & 0xffff) == 0) { + // Arbitrary scale vertically, but unscaled vertically. + ScalePlaneVertical(src_height, + clip_width, clip_height, + src_stride, dst_stride, src, dst, + x, y, dy, 4, filtering); + return; + } + if (filtering && dy < 65536 && dst_width * 4 <= kMaxStride) { + ScaleARGBBilinearUp(src_width, src_height, + clip_width, clip_height, + src_stride, dst_stride, src, dst, + x, dx, y, dy, filtering); + return; + } + if (filtering && src_width * 4 < kMaxStride) { + ScaleARGBBilinearDown(src_height, + clip_width, clip_height, + src_stride, dst_stride, src, dst, + x, dx, y, dy, filtering); + return; + } + ScaleARGBSimple(src_width, src_height, clip_width, clip_height, + src_stride, dst_stride, src, dst, + x, dx, y, dy); } LIBYUV_API @@ -1167,7 +725,6 @@ int ARGBScaleClip(const uint8* src_argb, int src_stride_argb, if (!src_argb || src_width == 0 || src_height == 0 || !dst_argb || dst_width <= 0 || dst_height <= 0 || clip_x < 0 || clip_y < 0 || - src_width > 32767 || src_height > 32767 || (clip_x + clip_width) > dst_width || (clip_y + clip_height) > dst_height) { return -1; @@ -1186,8 +743,7 @@ int ARGBScale(const uint8* src_argb, int src_stride_argb, int dst_width, int dst_height, FilterMode filtering) { if (!src_argb || src_width == 0 || src_height == 0 || - !dst_argb || dst_width <= 0 || dst_height <= 0 || - src_width > 32767 || src_height > 32767) { + !dst_argb || dst_width <= 0 || dst_height <= 0) { return -1; } ScaleARGB(src_argb, src_stride_argb, src_width, src_height, diff --git a/chromium/third_party/libyuv/source/scale_argb_neon.cc b/chromium/third_party/libyuv/source/scale_argb_neon.cc deleted file mode 100644 index 51b00872441..00000000000 --- a/chromium/third_party/libyuv/source/scale_argb_neon.cc +++ /dev/null @@ -1,142 +0,0 @@ -/* - * Copyright 2012 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#include "libyuv/basic_types.h" -#include "libyuv/row.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// This module is for GCC Neon -#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) - -void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */, - uint8* dst, int dst_width) { - asm volatile ( - "1: \n" - // load even pixels into q0, odd into q1 - "vld2.32 {q0, q1}, [%0]! \n" - "vld2.32 {q2, q3}, [%0]! \n" - "subs %2, %2, #8 \n" // 8 processed per loop - "vst1.8 {q1}, [%1]! \n" // store odd pixels - "vst1.8 {q3}, [%1]! \n" - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst), // %1 - "+r"(dst_width) // %2 - : - : "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List - ); -} - -void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { - asm volatile ( - // change the stride to row 2 pointer - "add %1, %1, %0 \n" - "1: \n" - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. - "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. - "subs %3, %3, #8 \n" // 8 processed per loop. - "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. - "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. - "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. - "vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts. - "vld4.8 {d16, d18, d20, d22}, [%1]! \n" // load 8 more ARGB pixels. - "vld4.8 {d17, d19, d21, d23}, [%1]! \n" // load last 8 ARGB pixels. - "vpadal.u8 q0, q8 \n" // B 16 bytes -> 8 shorts. - "vpadal.u8 q1, q9 \n" // G 16 bytes -> 8 shorts. - "vpadal.u8 q2, q10 \n" // R 16 bytes -> 8 shorts. - "vpadal.u8 q3, q11 \n" // A 16 bytes -> 8 shorts. - "vrshrn.u16 d0, q0, #2 \n" // downshift, round and pack - "vrshrn.u16 d1, q1, #2 \n" - "vrshrn.u16 d2, q2, #2 \n" - "vrshrn.u16 d3, q3, #2 \n" - "vst4.8 {d0, d1, d2, d3}, [%2]! \n" - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(src_stride), // %1 - "+r"(dst), // %2 - "+r"(dst_width) // %3 - : - : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11" - ); -} - -// Reads 4 pixels at a time. -// Alignment requirement: src_argb 4 byte aligned. -void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t, int src_stepx, - uint8* dst_argb, int dst_width) { - asm volatile ( - "mov r12, %3, lsl #2 \n" - ".p2align 2 \n" - "1: \n" - "vld1.32 {d0[0]}, [%0], r12 \n" - "vld1.32 {d0[1]}, [%0], r12 \n" - "vld1.32 {d1[0]}, [%0], r12 \n" - "vld1.32 {d1[1]}, [%0], r12 \n" - "subs %2, %2, #4 \n" // 4 pixels per loop. - "vst1.8 {q0}, [%1]! \n" - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(dst_width) // %2 - : "r"(src_stepx) // %3 - : "memory", "cc", "r12", "q0" - ); -} - -// Reads 4 pixels at a time. -// Alignment requirement: src_argb 4 byte aligned. 
-void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride, - int src_stepx, - uint8* dst_argb, int dst_width) { - asm volatile ( - "mov r12, %4, lsl #2 \n" - "add %1, %1, %0 \n" - ".p2align 2 \n" - "1: \n" - "vld1.8 {d0}, [%0], r12 \n" // Read 4 2x2 blocks -> 2x1 - "vld1.8 {d1}, [%1], r12 \n" - "vld1.8 {d2}, [%0], r12 \n" - "vld1.8 {d3}, [%1], r12 \n" - "vld1.8 {d4}, [%0], r12 \n" - "vld1.8 {d5}, [%1], r12 \n" - "vld1.8 {d6}, [%0], r12 \n" - "vld1.8 {d7}, [%1], r12 \n" - "vaddl.u8 q0, d0, d1 \n" - "vaddl.u8 q1, d2, d3 \n" - "vaddl.u8 q2, d4, d5 \n" - "vaddl.u8 q3, d6, d7 \n" - "vswp.8 d1, d2 \n" // ab_cd -> ac_bd - "vswp.8 d5, d6 \n" // ef_gh -> eg_fh - "vadd.u16 q0, q0, q1 \n" // (a+b)_(c+d) - "vadd.u16 q2, q2, q3 \n" // (e+f)_(g+h) - "vrshrn.u16 d0, q0, #2 \n" // first 2 pixels. - "vrshrn.u16 d1, q2, #2 \n" // next 2 pixels. - "subs %3, %3, #4 \n" // 4 pixels per loop. - "vst1.8 {q0}, [%2]! \n" - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(src_stride), // %1 - "+r"(dst_argb), // %2 - "+r"(dst_width) // %3 - : "r"(src_stepx) // %4 - : "memory", "cc", "r12", "q0", "q1", "q2", "q3" - ); -} -#endif // __ARM_NEON__ - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/chromium/third_party/libyuv/source/scale_common.cc b/chromium/third_party/libyuv/source/scale_common.cc new file mode 100644 index 00000000000..ee6a336292c --- /dev/null +++ b/chromium/third_party/libyuv/source/scale_common.cc @@ -0,0 +1,657 @@ +/* + * Copyright 2013 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/scale.h" + +#include <assert.h> +#include <string.h> + +#include "libyuv/cpu_id.h" +#include "libyuv/planar_functions.h" // For CopyARGB +#include "libyuv/row.h" +#include "libyuv/scale_row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +static __inline int Abs(int v) { + return v >= 0 ? 
v : -v; +} + +// CPU agnostic row functions +void ScaleRowDown2_C(const uint8* src_ptr, ptrdiff_t /* src_stride */, + uint8* dst, int dst_width) { + for (int x = 0; x < dst_width - 1; x += 2) { + dst[0] = src_ptr[1]; + dst[1] = src_ptr[3]; + dst += 2; + src_ptr += 4; + } + if (dst_width & 1) { + dst[0] = src_ptr[1]; + } +} + +void ScaleRowDown2Linear_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + const uint8* s = src_ptr; + for (int x = 0; x < dst_width - 1; x += 2) { + dst[0] = (s[0] + s[1] + 1) >> 1; + dst[1] = (s[2] + s[3] + 1) >> 1; + dst += 2; + s += 4; + } + if (dst_width & 1) { + dst[0] = (s[0] + s[1] + 1) >> 1; + } +} + +void ScaleRowDown2Box_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + const uint8* s = src_ptr; + const uint8* t = src_ptr + src_stride; + for (int x = 0; x < dst_width - 1; x += 2) { + dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2; + dst[1] = (s[2] + s[3] + t[2] + t[3] + 2) >> 2; + dst += 2; + s += 4; + t += 4; + } + if (dst_width & 1) { + dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2; + } +} + +void ScaleRowDown4_C(const uint8* src_ptr, ptrdiff_t /* src_stride */, + uint8* dst, int dst_width) { + for (int x = 0; x < dst_width - 1; x += 2) { + dst[0] = src_ptr[2]; + dst[1] = src_ptr[6]; + dst += 2; + src_ptr += 8; + } + if (dst_width & 1) { + dst[0] = src_ptr[2]; + } +} + +void ScaleRowDown4Box_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + intptr_t stride = src_stride; + for (int x = 0; x < dst_width - 1; x += 2) { + dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] + + src_ptr[stride + 0] + src_ptr[stride + 1] + + src_ptr[stride + 2] + src_ptr[stride + 3] + + src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] + + src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] + + src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] + + src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] + + 8) >> 4; + dst[1] = (src_ptr[4] + src_ptr[5] + src_ptr[6] + src_ptr[7] + + src_ptr[stride + 4] + src_ptr[stride + 5] + + src_ptr[stride + 6] + src_ptr[stride + 7] + + src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5] + + src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7] + + src_ptr[stride * 3 + 4] + src_ptr[stride * 3 + 5] + + src_ptr[stride * 3 + 6] + src_ptr[stride * 3 + 7] + + 8) >> 4; + dst += 2; + src_ptr += 8; + } + if (dst_width & 1) { + dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] + + src_ptr[stride + 0] + src_ptr[stride + 1] + + src_ptr[stride + 2] + src_ptr[stride + 3] + + src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] + + src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] + + src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] + + src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] + + 8) >> 4; + } +} + +void ScaleRowDown34_C(const uint8* src_ptr, ptrdiff_t /* src_stride */, + uint8* dst, int dst_width) { + assert((dst_width % 3 == 0) && (dst_width > 0)); + for (int x = 0; x < dst_width; x += 3) { + dst[0] = src_ptr[0]; + dst[1] = src_ptr[1]; + dst[2] = src_ptr[3]; + dst += 3; + src_ptr += 4; + } +} + +// Filter rows 0 and 1 together, 3 : 1 +void ScaleRowDown34_0_Box_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* d, int dst_width) { + assert((dst_width % 3 == 0) && (dst_width > 0)); + const uint8* s = src_ptr; + const uint8* t = src_ptr + src_stride; + for (int x = 0; x < dst_width; x += 3) { + uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; + uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; + uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; + uint8 b0 = (t[0] * 3 + t[1] * 1 + 
2) >> 2; + uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; + uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; + d[0] = (a0 * 3 + b0 + 2) >> 2; + d[1] = (a1 * 3 + b1 + 2) >> 2; + d[2] = (a2 * 3 + b2 + 2) >> 2; + d += 3; + s += 4; + t += 4; + } +} + +// Filter rows 1 and 2 together, 1 : 1 +void ScaleRowDown34_1_Box_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* d, int dst_width) { + assert((dst_width % 3 == 0) && (dst_width > 0)); + const uint8* s = src_ptr; + const uint8* t = src_ptr + src_stride; + for (int x = 0; x < dst_width; x += 3) { + uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; + uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; + uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; + uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; + uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; + uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; + d[0] = (a0 + b0 + 1) >> 1; + d[1] = (a1 + b1 + 1) >> 1; + d[2] = (a2 + b2 + 1) >> 1; + d += 3; + s += 4; + t += 4; + } +} + +// Scales a single row of pixels using point sampling. +void ScaleCols_C(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int x, int dx) { + for (int j = 0; j < dst_width - 1; j += 2) { + dst_ptr[0] = src_ptr[x >> 16]; + x += dx; + dst_ptr[1] = src_ptr[x >> 16]; + x += dx; + dst_ptr += 2; + } + if (dst_width & 1) { + dst_ptr[0] = src_ptr[x >> 16]; + } +} + +// Scales a single row of pixels up by 2x using point sampling. +void ScaleColsUp2_C(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int, int) { + for (int j = 0; j < dst_width - 1; j += 2) { + dst_ptr[1] = dst_ptr[0] = src_ptr[0]; + src_ptr += 1; + dst_ptr += 2; + } + if (dst_width & 1) { + dst_ptr[0] = src_ptr[0]; + } +} + +// (1-f)a + fb can be replaced with a + f(b-a) +#define BLENDER(a, b, f) (static_cast<int>(a) + \ + ((f) * (static_cast<int>(b) - static_cast<int>(a)) >> 16)) + +void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int x, int dx) { + for (int j = 0; j < dst_width - 1; j += 2) { + int xi = x >> 16; + int a = src_ptr[xi]; + int b = src_ptr[xi + 1]; + dst_ptr[0] = BLENDER(a, b, x & 0xffff); + x += dx; + xi = x >> 16; + a = src_ptr[xi]; + b = src_ptr[xi + 1]; + dst_ptr[1] = BLENDER(a, b, x & 0xffff); + x += dx; + dst_ptr += 2; + } + if (dst_width & 1) { + int xi = x >> 16; + int a = src_ptr[xi]; + int b = src_ptr[xi + 1]; + dst_ptr[0] = BLENDER(a, b, x & 0xffff); + } +} +#undef BLENDER + +void ScaleRowDown38_C(const uint8* src_ptr, ptrdiff_t /* src_stride */, + uint8* dst, int dst_width) { + assert(dst_width % 3 == 0); + for (int x = 0; x < dst_width; x += 3) { + dst[0] = src_ptr[0]; + dst[1] = src_ptr[3]; + dst[2] = src_ptr[6]; + dst += 3; + src_ptr += 8; + } +} + +// 8x3 -> 3x1 +void ScaleRowDown38_3_Box_C(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + assert((dst_width % 3 == 0) && (dst_width > 0)); + intptr_t stride = src_stride; + for (int i = 0; i < dst_width; i += 3) { + dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + + src_ptr[stride + 0] + src_ptr[stride + 1] + + src_ptr[stride + 2] + src_ptr[stride * 2 + 0] + + src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) * + (65536 / 9) >> 16; + dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + + src_ptr[stride + 3] + src_ptr[stride + 4] + + src_ptr[stride + 5] + src_ptr[stride * 2 + 3] + + src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) * + (65536 / 9) >> 16; + dst_ptr[2] = (src_ptr[6] + src_ptr[7] + + src_ptr[stride + 6] + src_ptr[stride + 7] + + src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) * + (65536 / 6) >> 16; + src_ptr += 8; + dst_ptr += 3; + } +} + +// 
8x2 -> 3x1 +void ScaleRowDown38_2_Box_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + assert((dst_width % 3 == 0) && (dst_width > 0)); + intptr_t stride = src_stride; + for (int i = 0; i < dst_width; i += 3) { + dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + + src_ptr[stride + 0] + src_ptr[stride + 1] + + src_ptr[stride + 2]) * (65536 / 6) >> 16; + dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + + src_ptr[stride + 3] + src_ptr[stride + 4] + + src_ptr[stride + 5]) * (65536 / 6) >> 16; + dst_ptr[2] = (src_ptr[6] + src_ptr[7] + + src_ptr[stride + 6] + src_ptr[stride + 7]) * + (65536 / 4) >> 16; + src_ptr += 8; + dst_ptr += 3; + } +} + +void ScaleAddRows_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint16* dst_ptr, int src_width, int src_height) { + assert(src_width > 0); + assert(src_height > 0); + for (int x = 0; x < src_width; ++x) { + const uint8* s = src_ptr + x; + unsigned int sum = 0u; + for (int y = 0; y < src_height; ++y) { + sum += s[0]; + s += src_stride; + } + // TODO(fbarchard): Consider limitting height to 256 to avoid overflow. + dst_ptr[x] = sum < 65535u ? sum : 65535u; + } +} + +void ScaleARGBRowDown2_C(const uint8* src_argb, + ptrdiff_t /* src_stride */, + uint8* dst_argb, int dst_width) { + const uint32* src = reinterpret_cast<const uint32*>(src_argb); + uint32* dst = reinterpret_cast<uint32*>(dst_argb); + + for (int x = 0; x < dst_width - 1; x += 2) { + dst[0] = src[1]; + dst[1] = src[3]; + src += 4; + dst += 2; + } + if (dst_width & 1) { + dst[0] = src[1]; + } +} + +void ScaleARGBRowDown2Linear_C(const uint8* src_argb, + ptrdiff_t /* src_stride */, + uint8* dst_argb, int dst_width) { + for (int x = 0; x < dst_width; ++x) { + dst_argb[0] = (src_argb[0] + src_argb[4] + 1) >> 1; + dst_argb[1] = (src_argb[1] + src_argb[5] + 1) >> 1; + dst_argb[2] = (src_argb[2] + src_argb[6] + 1) >> 1; + dst_argb[3] = (src_argb[3] + src_argb[7] + 1) >> 1; + src_argb += 8; + dst_argb += 4; + } +} + +void ScaleARGBRowDown2Box_C(const uint8* src_argb, ptrdiff_t src_stride, + uint8* dst_argb, int dst_width) { + for (int x = 0; x < dst_width; ++x) { + dst_argb[0] = (src_argb[0] + src_argb[4] + + src_argb[src_stride] + src_argb[src_stride + 4] + 2) >> 2; + dst_argb[1] = (src_argb[1] + src_argb[5] + + src_argb[src_stride + 1] + src_argb[src_stride + 5] + 2) >> 2; + dst_argb[2] = (src_argb[2] + src_argb[6] + + src_argb[src_stride + 2] + src_argb[src_stride + 6] + 2) >> 2; + dst_argb[3] = (src_argb[3] + src_argb[7] + + src_argb[src_stride + 3] + src_argb[src_stride + 7] + 2) >> 2; + src_argb += 8; + dst_argb += 4; + } +} + +void ScaleARGBRowDownEven_C(const uint8* src_argb, ptrdiff_t /* src_stride */, + int src_stepx, + uint8* dst_argb, int dst_width) { + const uint32* src = reinterpret_cast<const uint32*>(src_argb); + uint32* dst = reinterpret_cast<uint32*>(dst_argb); + + for (int x = 0; x < dst_width - 1; x += 2) { + dst[0] = src[0]; + dst[1] = src[src_stepx]; + src += src_stepx * 2; + dst += 2; + } + if (dst_width & 1) { + dst[0] = src[0]; + } +} + +void ScaleARGBRowDownEvenBox_C(const uint8* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8* dst_argb, int dst_width) { + for (int x = 0; x < dst_width; ++x) { + dst_argb[0] = (src_argb[0] + src_argb[4] + + src_argb[src_stride] + src_argb[src_stride + 4] + 2) >> 2; + dst_argb[1] = (src_argb[1] + src_argb[5] + + src_argb[src_stride + 1] + src_argb[src_stride + 5] + 2) >> 2; + dst_argb[2] = (src_argb[2] + src_argb[6] + + src_argb[src_stride + 2] + src_argb[src_stride + 6] + 2) >> 2; + dst_argb[3] 
= (src_argb[3] + src_argb[7] + + src_argb[src_stride + 3] + src_argb[src_stride + 7] + 2) >> 2; + src_argb += src_stepx * 4; + dst_argb += 4; + } +} + +// Scales a single row of pixels using point sampling. +void ScaleARGBCols_C(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx) { + const uint32* src = reinterpret_cast<const uint32*>(src_argb); + uint32* dst = reinterpret_cast<uint32*>(dst_argb); + for (int j = 0; j < dst_width - 1; j += 2) { + dst[0] = src[x >> 16]; + x += dx; + dst[1] = src[x >> 16]; + x += dx; + dst += 2; + } + if (dst_width & 1) { + dst[0] = src[x >> 16]; + } +} + +// Scales a single row of pixels up by 2x using point sampling. +void ScaleARGBColsUp2_C(uint8* dst_argb, const uint8* src_argb, + int dst_width, int, int) { + const uint32* src = reinterpret_cast<const uint32*>(src_argb); + uint32* dst = reinterpret_cast<uint32*>(dst_argb); + for (int j = 0; j < dst_width - 1; j += 2) { + dst[1] = dst[0] = src[0]; + src += 1; + dst += 2; + } + if (dst_width & 1) { + dst[0] = src[0]; + } +} + +// Mimics SSSE3 blender +#define BLENDER1(a, b, f) ((a) * (0x7f ^ f) + (b) * f) >> 7 +#define BLENDERC(a, b, f, s) static_cast<uint32>( \ + BLENDER1(((a) >> s) & 255, ((b) >> s) & 255, f) << s) +#define BLENDER(a, b, f) \ + BLENDERC(a, b, f, 24) | BLENDERC(a, b, f, 16) | \ + BLENDERC(a, b, f, 8) | BLENDERC(a, b, f, 0) + +void ScaleARGBFilterCols_C(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx) { + const uint32* src = reinterpret_cast<const uint32*>(src_argb); + uint32* dst = reinterpret_cast<uint32*>(dst_argb); + for (int j = 0; j < dst_width - 1; j += 2) { + int xi = x >> 16; + int xf = (x >> 9) & 0x7f; + uint32 a = src[xi]; + uint32 b = src[xi + 1]; + dst[0] = BLENDER(a, b, xf); + x += dx; + xi = x >> 16; + xf = (x >> 9) & 0x7f; + a = src[xi]; + b = src[xi + 1]; + dst[1] = BLENDER(a, b, xf); + x += dx; + dst += 2; + } + if (dst_width & 1) { + int xi = x >> 16; + int xf = (x >> 9) & 0x7f; + uint32 a = src[xi]; + uint32 b = src[xi + 1]; + dst[0] = BLENDER(a, b, xf); + } +} +#undef BLENDER1 +#undef BLENDERC +#undef BLENDER + +// Scale plane vertically with bilinear interpolation. +void ScalePlaneVertical(int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_argb, uint8* dst_argb, + int x, int y, int dy, + int bpp, FilterMode filtering) { + // TODO(fbarchard): Allow higher bpp. 
+ assert(bpp >= 1 && bpp <= 4); + assert(src_height != 0); + assert(dst_width > 0); + assert(dst_height > 0); + int dst_width_bytes = dst_width * bpp; + src_argb += (x >> 16) * bpp; + void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb, + ptrdiff_t src_stride, int dst_width, int source_y_fraction) = + InterpolateRow_C; +#if defined(HAS_INTERPOLATEROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && dst_width_bytes >= 16) { + InterpolateRow = InterpolateRow_Any_SSE2; + if (IS_ALIGNED(dst_width_bytes, 16)) { + InterpolateRow = InterpolateRow_Unaligned_SSE2; + if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) { + InterpolateRow = InterpolateRow_SSE2; + } + } + } +#endif +#if defined(HAS_INTERPOLATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && dst_width_bytes >= 16) { + InterpolateRow = InterpolateRow_Any_SSSE3; + if (IS_ALIGNED(dst_width_bytes, 16)) { + InterpolateRow = InterpolateRow_Unaligned_SSSE3; + if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) { + InterpolateRow = InterpolateRow_SSSE3; + } + } + } +#endif +#if defined(HAS_INTERPOLATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && dst_width_bytes >= 32) { + InterpolateRow = InterpolateRow_Any_AVX2; + if (IS_ALIGNED(dst_width_bytes, 32)) { + InterpolateRow = InterpolateRow_AVX2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && dst_width_bytes >= 16) { + InterpolateRow = InterpolateRow_Any_NEON; + if (IS_ALIGNED(dst_width_bytes, 16)) { + InterpolateRow = InterpolateRow_NEON; + } + } +#endif +#if defined(HAS_INTERPOLATEROWS_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && dst_width_bytes >= 4 && + IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4) && + IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) { + InterpolateRow = InterpolateRow_Any_MIPS_DSPR2; + if (IS_ALIGNED(dst_width_bytes, 4)) { + InterpolateRow = InterpolateRow_MIPS_DSPR2; + } + } +#endif + const int max_y = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0; + for (int j = 0; j < dst_height; ++j) { + if (y > max_y) { + y = max_y; + } + int yi = y >> 16; + int yf = filtering ? ((y >> 8) & 255) : 0; + const uint8* src = src_argb + yi * src_stride; + InterpolateRow(dst_argb, src, src_stride, dst_width_bytes, yf); + dst_argb += dst_stride; + y += dy; + } +} + +// Simplify the filtering based on scale factors. +FilterMode ScaleFilterReduce(int src_width, int src_height, + int dst_width, int dst_height, + FilterMode filtering) { + if (src_width < 0) { + src_width = -src_width; + } + if (src_height < 0) { + src_height = -src_height; + } + if (filtering == kFilterBox) { + // If scaling both axis to 0.5 or larger, switch from Box to Bilinear. + if (dst_width * 2 >= src_width && dst_height * 2 >= src_height) { + filtering = kFilterBilinear; + } + // If scaling to larger, switch from Box to Bilinear. + if (dst_width >= src_width || dst_height >= src_height) { + filtering = kFilterBilinear; + } + } + if (filtering == kFilterBilinear) { + if (src_height == 1) { + filtering = kFilterLinear; + } + // TODO(fbarchard): Detect any odd scale factor and reduce to Linear. + if (dst_height == src_height || dst_height * 3 == src_height) { + filtering = kFilterLinear; + } + // TODO(fbarchard): Remove 1 pixel wide filter restriction, which is to + // avoid reading 2 pixels horizontally that causes memory exception. 
+ if (src_width == 1) { + filtering = kFilterNone; + } + } + if (filtering == kFilterLinear) { + if (src_width == 1) { + filtering = kFilterNone; + } + // TODO(fbarchard): Detect any odd scale factor and reduce to None. + if (dst_width == src_width || dst_width * 3 == src_width) { + filtering = kFilterNone; + } + } + return filtering; +} + +#define CENTERSTART(dx, s) (dx < 0) ? -((-dx >> 1) + s) : ((dx >> 1) + s) +#define FIXEDDIV1(src, dst) FixedDiv((src << 16) - 0x00010001, \ + (dst << 16) - 0x00010000); + +// Compute slope values for stepping. +void ScaleSlope(int src_width, int src_height, + int dst_width, int dst_height, + FilterMode filtering, + int* x, int* y, int* dx, int* dy) { + assert(x != NULL); + assert(y != NULL); + assert(dx != NULL); + assert(dy != NULL); + assert(src_width != 0); + assert(src_height != 0); + assert(dst_width > 0); + assert(dst_height > 0); + if (filtering == kFilterBox) { + // Scale step for point sampling duplicates all pixels equally. + *dx = FixedDiv(Abs(src_width), dst_width); + *dy = FixedDiv(src_height, dst_height); + *x = 0; + *y = 0; + } else if (filtering == kFilterBilinear) { + // Scale step for bilinear sampling renders last pixel once for upsample. + if (dst_width <= Abs(src_width)) { + *dx = FixedDiv(Abs(src_width), dst_width); + *x = CENTERSTART(*dx, -32768); // Subtract 0.5 (32768) to center filter. + } else if (dst_width > 1) { + *dx = FIXEDDIV1(Abs(src_width), dst_width); + *x = 0; + } + if (dst_height <= src_height) { + *dy = FixedDiv(src_height, dst_height); + *y = CENTERSTART(*dy, -32768); // Subtract 0.5 (32768) to center filter. + } else if (dst_height > 1) { + *dy = FIXEDDIV1(src_height, dst_height); + *y = 0; + } + } else if (filtering == kFilterLinear) { + // Scale step for bilinear sampling renders last pixel once for upsample. + if (dst_width <= Abs(src_width)) { + *dx = FixedDiv(Abs(src_width), dst_width); + *x = CENTERSTART(*dx, -32768); // Subtract 0.5 (32768) to center filter. + } else if (dst_width > 1) { + *dx = FIXEDDIV1(Abs(src_width), dst_width); + *x = 0; + } + *dy = FixedDiv(src_height, dst_height); + *y = *dy >> 1; + } else { + // Scale step for point sampling duplicates all pixels equally. + *dx = FixedDiv(Abs(src_width), dst_width); + *dy = FixedDiv(src_height, dst_height); + *x = CENTERSTART(*dx, 0); + *y = CENTERSTART(*dy, 0); + } + // Negative src_width means horizontally mirror. 
+ if (src_width < 0) { + *x += (dst_width - 1) * *dx; + *dx = -*dx; + src_width = -src_width; + } +} +#undef CENTERSTART +#undef FIXEDDIV1 + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/chromium/third_party/libyuv/source/scale_mips.cc b/chromium/third_party/libyuv/source/scale_mips.cc index cfd48b5b053..de94560959e 100644 --- a/chromium/third_party/libyuv/source/scale_mips.cc +++ b/chromium/third_party/libyuv/source/scale_mips.cc @@ -30,6 +30,7 @@ void ScaleRowDown2_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */, "beqz $t9, 2f \n" " nop \n" + ".p2align 2 \n" "1: \n" "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0| "lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4| @@ -88,6 +89,7 @@ void ScaleRowDown2Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, "bltz $t9, 2f \n" " nop \n" + ".p2align 2 \n" "1: \n" "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0| "lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4| @@ -176,7 +178,7 @@ void ScaleRowDown2Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, } void ScaleRowDown4_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */, - uint8* dst, int dst_width) { + uint8* dst, int dst_width) { __asm__ __volatile__ ( ".set push \n" ".set noreorder \n" @@ -185,6 +187,7 @@ void ScaleRowDown4_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */, "beqz $t9, 2f \n" " nop \n" + ".p2align 2 \n" "1: \n" "lw $t1, 0(%[src_ptr]) \n" // |3|2|1|0| "lw $t2, 4(%[src_ptr]) \n" // |7|6|5|4| @@ -231,7 +234,7 @@ void ScaleRowDown4_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */, } void ScaleRowDown4Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { + uint8* dst, int dst_width) { intptr_t stride = src_stride; const uint8* s1 = src_ptr + stride; const uint8* s2 = s1 + stride; @@ -244,6 +247,7 @@ void ScaleRowDown4Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, "srl $t9, %[dst_width], 1 \n" "andi $t8, %[dst_width], 1 \n" + ".p2align 2 \n" "1: \n" "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0| "lw $t1, 0(%[s1]) \n" // |7|6|5|4| @@ -314,6 +318,7 @@ void ScaleRowDown34_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */, __asm__ __volatile__ ( ".set push \n" ".set noreorder \n" + ".p2align 2 \n" "1: \n" "lw $t1, 0(%[src_ptr]) \n" // |3|2|1|0| "lw $t2, 4(%[src_ptr]) \n" // |7|6|5|4| @@ -360,7 +365,9 @@ void ScaleRowDown34_0_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, __asm__ __volatile__ ( ".set push \n" ".set noreorder \n" - "repl.ph $t3, 3 \n" // 0x00030003 + "repl.ph $t3, 3 \n" // 0x00030003 + + ".p2align 2 \n" "1: \n" "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0| "lwx $t1, %[src_stride](%[src_ptr]) \n" // |T3|T2|T1|T0| @@ -416,6 +423,8 @@ void ScaleRowDown34_1_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, ".set push \n" ".set noreorder \n" "repl.ph $t2, 3 \n" // 0x00030003 + + ".p2align 2 \n" "1: \n" "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0| "lwx $t1, %[src_stride](%[src_ptr]) \n" // |T3|T2|T1|T0| @@ -466,6 +475,8 @@ void ScaleRowDown38_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */, __asm__ __volatile__ ( ".set push \n" ".set noreorder \n" + + ".p2align 2 \n" "1: \n" "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0| "lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4| @@ -515,6 +526,8 @@ void ScaleRowDown38_2_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, __asm__ __volatile__ ( ".set push \n" ".set noreorder \n" + + ".p2align 2 \n" "1: \n" "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0| "lw $t1, 4(%[src_ptr]) \n" // |S7|S6|S5|S4| @@ -571,6 +584,8 @@ void 
ScaleRowDown38_3_Box_MIPS_DSPR2(const uint8* src_ptr, __asm__ __volatile__ ( ".set push \n" ".set noreorder \n" + + ".p2align 2 \n" "1: \n" "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0| "lw $t1, 4(%[src_ptr]) \n" // |S7|S6|S5|S4| diff --git a/chromium/third_party/libyuv/source/scale_neon.cc b/chromium/third_party/libyuv/source/scale_neon.cc index a370349a72f..c9c6b2cdf88 100644 --- a/chromium/third_party/libyuv/source/scale_neon.cc +++ b/chromium/third_party/libyuv/source/scale_neon.cc @@ -8,7 +8,6 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include "libyuv/basic_types.h" #include "libyuv/row.h" #ifdef __cplusplus @@ -16,7 +15,7 @@ namespace libyuv { extern "C" { #endif -// This module is for GCC Neon +// This module is for GCC Neon. #if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) // NEON downscalers with interpolation. @@ -25,6 +24,7 @@ extern "C" { void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */, uint8* dst, int dst_width) { asm volatile ( + ".p2align 2 \n" "1: \n" // load even pixels into q0, odd into q1 "vld2.8 {q0, q1}, [%0]! \n" @@ -44,6 +44,7 @@ void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, asm volatile ( // change the stride to row 2 pointer "add %1, %0 \n" + ".p2align 2 \n" "1: \n" "vld1.8 {q0, q1}, [%0]! \n" // load row 1 and post inc "vld1.8 {q2, q3}, [%1]! \n" // load row 2 and post inc @@ -68,11 +69,12 @@ void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */, uint8* dst_ptr, int dst_width) { asm volatile ( + ".p2align 2 \n" "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 - "subs %2, %2, #8 \n" // 8 processed per loop - "vst1.8 {d2}, [%1]! \n" - "bgt 1b \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 + "subs %2, %2, #8 \n" // 8 processed per loop + "vst1.8 {d2}, [%1]! \n" + "bgt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 @@ -87,6 +89,7 @@ void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, "add r4, %0, %3 \n" "add r5, r4, %3 \n" "add %3, r5, %3 \n" + ".p2align 2 \n" "1: \n" "vld1.8 {q0}, [%0]! \n" // load up 16x4 "vld1.8 {q1}, [r4]! \n" @@ -117,12 +120,13 @@ void ScaleRowDown34_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */, uint8* dst_ptr, int dst_width) { asm volatile ( + ".p2align 2 \n" "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 - "subs %2, %2, #24 \n" - "vmov d2, d3 \n" // order d0, d1, d2 - "vst3.8 {d0, d1, d2}, [%1]! \n" - "bgt 1b \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 + "subs %2, %2, #24 \n" + "vmov d2, d3 \n" // order d0, d1, d2 + "vst3.8 {d0, d1, d2}, [%1]! \n" + "bgt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 @@ -135,8 +139,9 @@ void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { asm volatile ( - "vmov.u8 d24, #3 \n" - "add %3, %0 \n" + "vmov.u8 d24, #3 \n" + "add %3, %0 \n" + ".p2align 2 \n" "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1 @@ -191,8 +196,9 @@ void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { asm volatile ( - "vmov.u8 d24, #3 \n" - "add %3, %0 \n" + "vmov.u8 d24, #3 \n" + "add %3, %0 \n" + ".p2align 2 \n" "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 "vld4.8 {d4, d5, d6, d7}, [%3]! 
\n" // src line 1 @@ -226,14 +232,14 @@ void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr, } #define HAS_SCALEROWDOWN38_NEON -const uvec8 kShuf38 = +static uvec8 kShuf38 = { 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 }; -const uvec8 kShuf38_2 = +static uvec8 kShuf38_2 = { 0, 8, 16, 2, 10, 17, 4, 12, 18, 6, 14, 19, 0, 0, 0, 0 }; -const vec16 kMult38_Div6 = +static vec16 kMult38_Div6 = { 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 }; -const vec16 kMult38_Div9 = +static vec16 kMult38_Div9 = { 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 }; @@ -242,15 +248,16 @@ void ScaleRowDown38_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */, uint8* dst_ptr, int dst_width) { asm volatile ( - "vld1.8 {q3}, [%3] \n" + "vld1.8 {q3}, [%3] \n" + ".p2align 2 \n" "1: \n" - "vld1.8 {d0, d1, d2, d3}, [%0]! \n" - "subs %2, %2, #12 \n" - "vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n" - "vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n" - "vst1.8 {d4}, [%1]! \n" - "vst1.32 {d5[0]}, [%1]! \n" - "bgt 1b \n" + "vld1.8 {d0, d1, d2, d3}, [%0]! \n" + "subs %2, %2, #12 \n" + "vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n" + "vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n" + "vst1.8 {d4}, [%1]! \n" + "vst1.32 {d5[0]}, [%1]! \n" + "bgt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 @@ -264,11 +271,12 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { asm volatile ( - "vld1.16 {q13}, [%4] \n" - "vld1.8 {q14}, [%5] \n" - "vld1.8 {q15}, [%6] \n" - "add r4, %0, %3, lsl #1 \n" - "add %3, %0 \n" + "vld1.16 {q13}, [%4] \n" + "vld1.8 {q14}, [%5] \n" + "vld1.8 {q15}, [%6] \n" + "add r4, %0, %3, lsl #1 \n" + "add %3, %0 \n" + ".p2align 2 \n" "1: \n" // d0 = 00 40 01 41 02 42 03 43 @@ -374,9 +382,10 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { asm volatile ( - "vld1.16 {q13}, [%4] \n" - "vld1.8 {q14}, [%5] \n" - "add %3, %0 \n" + "vld1.16 {q13}, [%4] \n" + "vld1.8 {q14}, [%5] \n" + "add %3, %0 \n" + ".p2align 2 \n" "1: \n" // d0 = 00 40 01 41 02 42 03 43 @@ -546,6 +555,125 @@ void ScaleFilterRows_NEON(uint8* dst_ptr, : "q0", "q1", "d4", "d5", "q13", "q14", "memory", "cc" ); } + +void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */, + uint8* dst, int dst_width) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + // load even pixels into q0, odd into q1 + "vld2.32 {q0, q1}, [%0]! \n" + "vld2.32 {q2, q3}, [%0]! \n" + "subs %2, %2, #8 \n" // 8 processed per loop + "vst1.8 {q1}, [%1]! \n" // store odd pixels + "vst1.8 {q3}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List + ); +} + +void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + asm volatile ( + // change the stride to row 2 pointer + "add %1, %1, %0 \n" + ".p2align 2 \n" + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. + "subs %3, %3, #8 \n" // 8 processed per loop. + "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. + "vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts. + "vld4.8 {d16, d18, d20, d22}, [%1]! \n" // load 8 more ARGB pixels. + "vld4.8 {d17, d19, d21, d23}, [%1]! 
\n" // load last 8 ARGB pixels. + "vpadal.u8 q0, q8 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q9 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q2, q10 \n" // R 16 bytes -> 8 shorts. + "vpadal.u8 q3, q11 \n" // A 16 bytes -> 8 shorts. + "vrshrn.u16 d0, q0, #2 \n" // downshift, round and pack + "vrshrn.u16 d1, q1, #2 \n" + "vrshrn.u16 d2, q2, #2 \n" + "vrshrn.u16 d3, q3, #2 \n" + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_stride), // %1 + "+r"(dst), // %2 + "+r"(dst_width) // %3 + : + : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11" + ); +} + +// Reads 4 pixels at a time. +// Alignment requirement: src_argb 4 byte aligned. +void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t, int src_stepx, + uint8* dst_argb, int dst_width) { + asm volatile ( + "mov r12, %3, lsl #2 \n" + ".p2align 2 \n" + "1: \n" + "vld1.32 {d0[0]}, [%0], r12 \n" + "vld1.32 {d0[1]}, [%0], r12 \n" + "vld1.32 {d1[0]}, [%0], r12 \n" + "vld1.32 {d1[1]}, [%0], r12 \n" + "subs %2, %2, #4 \n" // 4 pixels per loop. + "vst1.8 {q0}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + : "r"(src_stepx) // %3 + : "memory", "cc", "r12", "q0" + ); +} + +// Reads 4 pixels at a time. +// Alignment requirement: src_argb 4 byte aligned. +void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride, + int src_stepx, + uint8* dst_argb, int dst_width) { + asm volatile ( + "mov r12, %4, lsl #2 \n" + "add %1, %1, %0 \n" + ".p2align 2 \n" + "1: \n" + "vld1.8 {d0}, [%0], r12 \n" // Read 4 2x2 blocks -> 2x1 + "vld1.8 {d1}, [%1], r12 \n" + "vld1.8 {d2}, [%0], r12 \n" + "vld1.8 {d3}, [%1], r12 \n" + "vld1.8 {d4}, [%0], r12 \n" + "vld1.8 {d5}, [%1], r12 \n" + "vld1.8 {d6}, [%0], r12 \n" + "vld1.8 {d7}, [%1], r12 \n" + "vaddl.u8 q0, d0, d1 \n" + "vaddl.u8 q1, d2, d3 \n" + "vaddl.u8 q2, d4, d5 \n" + "vaddl.u8 q3, d6, d7 \n" + "vswp.8 d1, d2 \n" // ab_cd -> ac_bd + "vswp.8 d5, d6 \n" // ef_gh -> eg_fh + "vadd.u16 q0, q0, q1 \n" // (a+b)_(c+d) + "vadd.u16 q2, q2, q3 \n" // (e+f)_(g+h) + "vrshrn.u16 d0, q0, #2 \n" // first 2 pixels. + "vrshrn.u16 d1, q2, #2 \n" // next 2 pixels. + "subs %3, %3, #4 \n" // 4 pixels per loop. + "vst1.8 {q0}, [%2]! \n" + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(src_stride), // %1 + "+r"(dst_argb), // %2 + "+r"(dst_width) // %3 + : "r"(src_stepx) // %4 + : "memory", "cc", "r12", "q0", "q1", "q2", "q3" + ); +} + #endif // __ARM_NEON__ #ifdef __cplusplus diff --git a/chromium/third_party/libyuv/source/scale_posix.cc b/chromium/third_party/libyuv/source/scale_posix.cc new file mode 100644 index 00000000000..a777bfde1cc --- /dev/null +++ b/chromium/third_party/libyuv/source/scale_posix.cc @@ -0,0 +1,1337 @@ +/* + * Copyright 2013 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// This module is for GCC x86 and x64. 
+#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) + +// Offsets for source bytes 0 to 9 +static uvec8 kShuf0 = + { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 }; + +// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12. +static uvec8 kShuf1 = + { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 }; + +// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. +static uvec8 kShuf2 = + { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 }; + +// Offsets for source bytes 0 to 10 +static uvec8 kShuf01 = + { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 }; + +// Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13. +static uvec8 kShuf11 = + { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 }; + +// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. +static uvec8 kShuf21 = + { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 }; + +// Coefficients for source bytes 0 to 10 +static uvec8 kMadd01 = + { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 }; + +// Coefficients for source bytes 10 to 21 +static uvec8 kMadd11 = + { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 }; + +// Coefficients for source bytes 21 to 31 +static uvec8 kMadd21 = + { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 }; + +// Coefficients for source bytes 21 to 31 +static vec16 kRound34 = + { 2, 2, 2, 2, 2, 2, 2, 2 }; + +static uvec8 kShuf38a = + { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }; + +static uvec8 kShuf38b = + { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 }; + +// Arrange words 0,3,6 into 0,1,2 +static uvec8 kShufAc = + { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }; + +// Arrange words 0,3,6 into 3,4,5 +static uvec8 kShufAc3 = + { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 }; + +// Scaling values for boxes of 3x3 and 2x3 +static uvec16 kScaleAc33 = + { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 }; + +// Arrange first value for pixels 0,1,2,3,4,5 +static uvec8 kShufAb0 = + { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 }; + +// Arrange second value for pixels 0,1,2,3,4,5 +static uvec8 kShufAb1 = + { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 }; + +// Arrange third value for pixels 0,1,2,3,4,5 +static uvec8 kShufAb2 = + { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 }; + +// Scaling values for boxes of 3x2 and 2x2 +static uvec16 kScaleAb2 = + { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 }; + +// TODO(nfullagar): For Native Client: When new toolchain becomes available, +// take advantage of bundle lock / unlock feature. This will reduce the amount +// of manual bundle alignment done below, and bundle alignment could even be +// moved into each macro that doesn't use %%nacl: such as MEMOPREG. 
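The TODO above concerns the Native Client sandbox: indexed memory operands have to be routed through %r15 and the loop bodies kept on 32-byte bundle boundaries, which appears to be what the MEMACCESS / MEMOPREG / BUNDLEALIGN macros defined immediately below encode. As a rough stand-alone sketch (an editor's illustration, not part of the patch), this shows the strings the plain, non-NaCl variants of those macros produce for asm operand %0; under NaCl x86-64 the same operand is instead emitted as "%%nacl:(%%r15,%q0)" and BUNDLEALIGN becomes ".p2align 5" (32-byte alignment).

  // Hypothetical, stand-alone illustration only; mirrors the non-NaCl
  // macro definitions that follow in scale_posix.cc.
  #include <stdio.h>
  #define MEMACCESS(base) "(%" #base ")"
  #define MEMLEA(offset, base) #offset "(%" #base ")"
  int main() {
    puts(MEMACCESS(0));     // prints: (%0)
    puts(MEMLEA(0x20, 0));  // prints: 0x20(%0)
    return 0;
  }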
+ +#if defined(__native_client__) && defined(__x86_64__) +#define MEMACCESS(base) "%%nacl:(%%r15,%q" #base ")" +#define MEMACCESS2(offset, base) "%%nacl:" #offset "(%%r15,%q" #base ")" +#define MEMLEA(offset, base) #offset "(%q" #base ")" +#define MEMLEA3(offset, index, scale) \ + #offset "(,%q" #index "," #scale ")" +#define MEMLEA4(offset, base, index, scale) \ + #offset "(%q" #base ",%q" #index "," #scale ")" +#define MEMOPREG(opcode, offset, base, index, scale, reg) \ + "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \ + #opcode " (%%r15,%%r14),%%" #reg "\n" +#define MEMOPMEM(opcode, reg, offset, base, index, scale) \ + "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \ + #opcode " %%" #reg ",(%%r15,%%r14)\n" +#define MEMOP(opcode, offset, base, index, scale) \ + "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \ + #opcode " (%%r15,%%r14)" +#define BUNDLEALIGN ".p2align 5\n" +#else +#define MEMACCESS(base) "(%" #base ")" +#define MEMACCESS2(offset, base) #offset "(%" #base ")" +#define MEMLEA(offset, base) #offset "(%" #base ")" +#define MEMLEA3(offset, index, scale) \ + #offset "(,%" #index "," #scale ")" +#define MEMLEA4(offset, base, index, scale) \ + #offset "(%" #base ",%" #index "," #scale ")" +#define MEMOPREG(opcode, offset, base, index, scale, reg) \ + #opcode " " #offset "(%" #base ",%" #index "," #scale "),%%" #reg "\n" +#define MEMOPMEM(opcode, reg, offset, base, index, scale) \ + #opcode " %%" #reg ","#offset "(%" #base ",%" #index "," #scale ")\n" +#define MEMOP(opcode, offset, base, index, scale) \ + #opcode " " #offset "(%" #base ",%" #index "," #scale ")" +#define BUNDLEALIGN +#endif + +// GCC versions of row functions are verbatim conversions from Visual C. +// Generated using gcc disassembly on Visual C object file: +// objdump -D yuvscaler.obj >yuvscaler.txt + +void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + ".p2align 2 \n" + BUNDLEALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1" +#endif + ); +} + +void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + ".p2align 2 \n" + BUNDLEALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10, 0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "psrlw $0x8,%%xmm0 \n" + "movdqa %%xmm1,%%xmm3 \n" + "psrlw $0x8,%%xmm1 \n" + "pand %%xmm5,%%xmm2 \n" + "pand %%xmm5,%%xmm3 \n" + "pavgw %%xmm2,%%xmm0 \n" + "pavgw %%xmm3,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm5" +#endif + ); +} + +void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + ".p2align 2 \n" + BUNDLEALIGN + "1: \n" + "movdqa " 
MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + MEMOPREG(movdqa,0x00,0,3,1,xmm2) // movdqa (%0,%3,1),%%xmm2 + BUNDLEALIGN + MEMOPREG(movdqa,0x10,0,3,1,xmm3) // movdqa 0x10(%0,%3,1),%%xmm3 + "lea " MEMLEA(0x20,0) ",%0 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "psrlw $0x8,%%xmm0 \n" + "movdqa %%xmm1,%%xmm3 \n" + "psrlw $0x8,%%xmm1 \n" + "pand %%xmm5,%%xmm2 \n" + "pand %%xmm5,%%xmm3 \n" + "pavgw %%xmm2,%%xmm0 \n" + "pavgw %%xmm3,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"(static_cast<intptr_t>(src_stride)) // %3 + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" +#endif + ); +} + +void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + ".p2align 2 \n" + BUNDLEALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1" +#endif + ); +} + +void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr, ptrdiff_t, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + ".p2align 2 \n" + BUNDLEALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "psrlw $0x8,%%xmm0 \n" + "movdqa %%xmm1,%%xmm3 \n" + "psrlw $0x8,%%xmm1 \n" + "pand %%xmm5,%%xmm2 \n" + "pand %%xmm5,%%xmm3 \n" + "pavgw %%xmm2,%%xmm0 \n" + "pavgw %%xmm3,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm5" +#endif + ); +} + +void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + ".p2align 2 \n" + BUNDLEALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + MEMOPREG(movdqu,0x00,0,3,1,xmm2) // movdqu (%0,%3,1),%%xmm2 + BUNDLEALIGN + MEMOPREG(movdqu,0x10,0,3,1,xmm3) // movdqu 0x10(%0,%3,1),%%xmm3 + "lea " MEMLEA(0x20,0) ",%0 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "psrlw $0x8,%%xmm0 \n" + "movdqa %%xmm1,%%xmm3 \n" + "psrlw $0x8,%%xmm1 \n" + "pand %%xmm5,%%xmm2 \n" + "pand %%xmm5,%%xmm3 \n" + "pavgw %%xmm2,%%xmm0 \n" + "pavgw %%xmm3,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"(static_cast<intptr_t>(src_stride)) // %3 + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", 
"xmm1", "xmm2", "xmm3", "xmm5" +#endif + ); +} + +void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrld $0x18,%%xmm5 \n" + "pslld $0x10,%%xmm5 \n" + ".p2align 2 \n" + BUNDLEALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "psrlw $0x8,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x8,1) ",%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm5" +#endif + ); +} + +void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + intptr_t stridex3 = 0; + asm volatile ( + "pcmpeqb %%xmm7,%%xmm7 \n" + "psrlw $0x8,%%xmm7 \n" + "lea " MEMLEA4(0x00,4,4,2) ",%3 \n" + ".p2align 2 \n" + BUNDLEALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + MEMOPREG(movdqa,0x00,0,4,1,xmm2) // movdqa (%0,%4,1),%%xmm2 + BUNDLEALIGN + MEMOPREG(movdqa,0x10,0,4,1,xmm3) // movdqa 0x10(%0,%4,1),%%xmm3 + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + MEMOPREG(movdqa,0x00,0,4,2,xmm2) // movdqa (%0,%4,2),%%xmm2 + BUNDLEALIGN + MEMOPREG(movdqa,0x10,0,4,2,xmm3) // movdqa 0x10(%0,%4,2),%%xmm3 + MEMOPREG(movdqa,0x00,0,3,1,xmm4) // movdqa (%0,%3,1),%%xmm4 + MEMOPREG(movdqa,0x10,0,3,1,xmm5) // movdqa 0x10(%0,%3,1),%%xmm5 + "lea " MEMLEA(0x20,0) ",%0 \n" + "pavgb %%xmm4,%%xmm2 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm5,%%xmm3 \n" + "pavgb %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "psrlw $0x8,%%xmm0 \n" + "movdqa %%xmm1,%%xmm3 \n" + "psrlw $0x8,%%xmm1 \n" + "pand %%xmm7,%%xmm2 \n" + "pand %%xmm7,%%xmm3 \n" + "pavgw %%xmm2,%%xmm0 \n" + "pavgw %%xmm3,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "psrlw $0x8,%%xmm0 \n" + "pand %%xmm7,%%xmm2 \n" + "pavgw %%xmm2,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x8,1) ",%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(stridex3) // %3 + : "r"(static_cast<intptr_t>(src_stride)) // %4 + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm7" +#endif + ); +} + +void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "movdqa %0,%%xmm3 \n" + "movdqa %1,%%xmm4 \n" + "movdqa %2,%%xmm5 \n" + : + : "m"(kShuf0), // %0 + "m"(kShuf1), // %1 + "m"(kShuf2) // %2 + ); + asm volatile ( + ".p2align 2 \n" + BUNDLEALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm2 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "movdqa %%xmm2,%%xmm1 \n" + "palignr $0x8,%%xmm0,%%xmm1 \n" + "pshufb %%xmm3,%%xmm0 \n" + "pshufb %%xmm4,%%xmm1 \n" + "pshufb %%xmm5,%%xmm2 \n" + "movq %%xmm0," MEMACCESS(1) " \n" + "movq %%xmm1," MEMACCESS2(0x8,1) " \n" + "movq %%xmm2," MEMACCESS2(0x10,1) " \n" + "lea " MEMLEA(0x18,1) ",%1 \n" + "sub $0x18,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} + 
+void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "movdqa %0,%%xmm2 \n" // kShuf01 + "movdqa %1,%%xmm3 \n" // kShuf11 + "movdqa %2,%%xmm4 \n" // kShuf21 + : + : "m"(kShuf01), // %0 + "m"(kShuf11), // %1 + "m"(kShuf21) // %2 + ); + asm volatile ( + "movdqa %0,%%xmm5 \n" // kMadd01 + "movdqa %1,%%xmm0 \n" // kMadd11 + "movdqa %2,%%xmm1 \n" // kRound34 + : + : "m"(kMadd01), // %0 + "m"(kMadd11), // %1 + "m"(kRound34) // %2 + ); + asm volatile ( + ".p2align 2 \n" + BUNDLEALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm6 \n" + MEMOPREG(movdqa,0x00,0,3,1,xmm7) // movdqa (%0,%3),%%xmm7 + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm5,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6," MEMACCESS(1) " \n" + "movdqu " MEMACCESS2(0x8,0) ",%%xmm6 \n" + MEMOPREG(movdqu,0x8,0,3,1,xmm7) // movdqu 0x8(%0,%3),%%xmm7 + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm3,%%xmm6 \n" + "pmaddubsw %%xmm0,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6," MEMACCESS2(0x8,1) " \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm6 \n" + BUNDLEALIGN + MEMOPREG(movdqa,0x10,0,3,1,xmm7) // movdqa 0x10(%0,%3),%%xmm7 + "lea " MEMLEA(0x20,0) ",%0 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm4,%%xmm6 \n" + "pmaddubsw %4,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6," MEMACCESS2(0x10,1) " \n" + "lea " MEMLEA(0x18,1) ",%1 \n" + "sub $0x18,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"(static_cast<intptr_t>(src_stride)), // %3 + "m"(kMadd21) // %4 + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" +#endif + ); +} + +void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "movdqa %0,%%xmm2 \n" // kShuf01 + "movdqa %1,%%xmm3 \n" // kShuf11 + "movdqa %2,%%xmm4 \n" // kShuf21 + : + : "m"(kShuf01), // %0 + "m"(kShuf11), // %1 + "m"(kShuf21) // %2 + ); + asm volatile ( + "movdqa %0,%%xmm5 \n" // kMadd01 + "movdqa %1,%%xmm0 \n" // kMadd11 + "movdqa %2,%%xmm1 \n" // kRound34 + : + : "m"(kMadd01), // %0 + "m"(kMadd11), // %1 + "m"(kRound34) // %2 + ); + + asm volatile ( + ".p2align 2 \n" + BUNDLEALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm6 \n" + MEMOPREG(movdqa,0x00,0,3,1,xmm7) // movdqa (%0,%3,1),%%xmm7 + "pavgb %%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm5,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6," MEMACCESS(1) " \n" + "movdqu " MEMACCESS2(0x8,0) ",%%xmm6 \n" + MEMOPREG(movdqu,0x8,0,3,1,xmm7) // movdqu 0x8(%0,%3,1),%%xmm7 + "pavgb %%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm3,%%xmm6 \n" + "pmaddubsw %%xmm0,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6," MEMACCESS2(0x8,1) " \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm6 \n" + MEMOPREG(movdqa,0x10,0,3,1,xmm7) // movdqa 0x10(%0,%3,1),%%xmm7 + "lea " MEMLEA(0x20,0) ",%0 \n" + "pavgb %%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm4,%%xmm6 \n" + "pmaddubsw %4,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq 
%%xmm6," MEMACCESS2(0x10,1) " \n" + "lea " MEMLEA(0x18,1) ",%1 \n" + "sub $0x18,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"(static_cast<intptr_t>(src_stride)), // %3 + "m"(kMadd21) // %4 + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" +#endif + ); +} + +void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" + ".p2align 2 \n" + BUNDLEALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "pshufb %%xmm4,%%xmm0 \n" + "pshufb %%xmm5,%%xmm1 \n" + "paddusb %%xmm1,%%xmm0 \n" + "movq %%xmm0," MEMACCESS(1) " \n" + "movhlps %%xmm0,%%xmm1 \n" + "movd %%xmm1," MEMACCESS2(0x8,1) " \n" + "lea " MEMLEA(0xc,1) ",%1 \n" + "sub $0xc,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "m"(kShuf38a), // %3 + "m"(kShuf38b) // %4 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm4", "xmm5" +#endif + ); +} + +void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "movdqa %0,%%xmm2 \n" + "movdqa %1,%%xmm3 \n" + "movdqa %2,%%xmm4 \n" + "movdqa %3,%%xmm5 \n" + : + : "m"(kShufAb0), // %0 + "m"(kShufAb1), // %1 + "m"(kShufAb2), // %2 + "m"(kScaleAb2) // %3 + ); + asm volatile ( + ".p2align 2 \n" + BUNDLEALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + MEMOPREG(pavgb,0x00,0,3,1,xmm0) // pavgb (%0,%3,1),%%xmm0 + "lea " MEMLEA(0x10,0) ",%0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pshufb %%xmm2,%%xmm1 \n" + "movdqa %%xmm0,%%xmm6 \n" + "pshufb %%xmm3,%%xmm6 \n" + "paddusw %%xmm6,%%xmm1 \n" + "pshufb %%xmm4,%%xmm0 \n" + "paddusw %%xmm0,%%xmm1 \n" + "pmulhuw %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "sub $0x6,%2 \n" + "movd %%xmm1," MEMACCESS(1) " \n" + "psrlq $0x10,%%xmm1 \n" + "movd %%xmm1," MEMACCESS2(0x2,1) " \n" + "lea " MEMLEA(0x6,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"(static_cast<intptr_t>(src_stride)) // %3 + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" +#endif + ); +} + +void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "movdqa %0,%%xmm2 \n" + "movdqa %1,%%xmm3 \n" + "movdqa %2,%%xmm4 \n" + "pxor %%xmm5,%%xmm5 \n" + : + : "m"(kShufAc), // %0 + "m"(kShufAc3), // %1 + "m"(kScaleAc33) // %2 + ); + asm volatile ( + ".p2align 2 \n" + BUNDLEALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + MEMOPREG(movdqa,0x00,0,3,1,xmm6) // movdqa (%0,%3,1),%%xmm6 + "movhlps %%xmm0,%%xmm1 \n" + "movhlps %%xmm6,%%xmm7 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "punpcklbw %%xmm5,%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm6 \n" + "punpcklbw %%xmm5,%%xmm7 \n" + "paddusw %%xmm6,%%xmm0 \n" + "paddusw %%xmm7,%%xmm1 \n" + MEMOPREG(movdqa,0x00,0,3,2,xmm6) // movdqa (%0,%3,2),%%xmm6 + "lea " MEMLEA(0x10,0) ",%0 \n" + "movhlps %%xmm6,%%xmm7 \n" + "punpcklbw %%xmm5,%%xmm6 \n" + "punpcklbw %%xmm5,%%xmm7 \n" + "paddusw %%xmm6,%%xmm0 \n" + "paddusw %%xmm7,%%xmm1 \n" + "movdqa %%xmm0,%%xmm6 \n" + "psrldq $0x2,%%xmm0 \n" + "paddusw %%xmm0,%%xmm6 \n" + "psrldq $0x2,%%xmm0 
\n" + "paddusw %%xmm0,%%xmm6 \n" + "pshufb %%xmm2,%%xmm6 \n" + "movdqa %%xmm1,%%xmm7 \n" + "psrldq $0x2,%%xmm1 \n" + "paddusw %%xmm1,%%xmm7 \n" + "psrldq $0x2,%%xmm1 \n" + "paddusw %%xmm1,%%xmm7 \n" + "pshufb %%xmm3,%%xmm7 \n" + "paddusw %%xmm7,%%xmm6 \n" + "pmulhuw %%xmm4,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "sub $0x6,%2 \n" + "movd %%xmm6," MEMACCESS(1) " \n" + "psrlq $0x10,%%xmm6 \n" + "movd %%xmm6," MEMACCESS2(0x2,1) " \n" + "lea " MEMLEA(0x6,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"(static_cast<intptr_t>(src_stride)) // %3 + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" +#endif + ); +} + +void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint16* dst_ptr, int src_width, int src_height) { + int tmp_height = 0; + intptr_t tmp_src = 0; + asm volatile ( + "pxor %%xmm4,%%xmm4 \n" + "sub $0x1,%5 \n" + ".p2align 2 \n" + BUNDLEALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "mov %0,%3 \n" + "add %6,%0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm4,%%xmm0 \n" + "punpckhbw %%xmm4,%%xmm1 \n" + "mov %5,%2 \n" + "test %2,%2 \n" + "je 3f \n" + ".p2align 2 \n" + BUNDLEALIGN + "2: \n" + "movdqa " MEMACCESS(0) ",%%xmm2 \n" + "add %6,%0 \n" + "movdqa %%xmm2,%%xmm3 \n" + "punpcklbw %%xmm4,%%xmm2 \n" + "punpckhbw %%xmm4,%%xmm3 \n" + "paddusw %%xmm2,%%xmm0 \n" + "paddusw %%xmm3,%%xmm1 \n" + "sub $0x1,%2 \n" + "jg 2b \n" + ".p2align 2 \n" + "3: \n" + BUNDLEALIGN + "movdqa %%xmm0," MEMACCESS(1) " \n" + "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n" + "lea " MEMLEA(0x10,3) ",%0 \n" + "lea " MEMLEA(0x20,1) ",%1 \n" + "sub $0x10,%4 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(tmp_height), // %2 + "+r"(tmp_src), // %3 + "+r"(src_width), // %4 + "+rm"(src_height) // %5 + : "rm"(static_cast<intptr_t>(src_stride)) // %6 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" +#endif + ); +} + +// Bilinear column filtering. SSSE3 version. 
+void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int x, int dx) { + intptr_t x0 = 0, x1 = 0, temp_pixel = 0; + asm volatile ( + "movd %6,%%xmm2 \n" + "movd %7,%%xmm3 \n" + "movl $0x04040000,%k2 \n" + "movd %k2,%%xmm5 \n" + "pcmpeqb %%xmm6,%%xmm6 \n" + "psrlw $0x9,%%xmm6 \n" + "pextrw $0x1,%%xmm2,%k3 \n" + "subl $0x2,%5 \n" + "jl 29f \n" + "movdqa %%xmm2,%%xmm0 \n" + "paddd %%xmm3,%%xmm0 \n" + "punpckldq %%xmm0,%%xmm2 \n" + "punpckldq %%xmm3,%%xmm3 \n" + "paddd %%xmm3,%%xmm3 \n" + "pextrw $0x3,%%xmm2,%k4 \n" + ".p2align 2 \n" + BUNDLEALIGN + "2: \n" + "movdqa %%xmm2,%%xmm1 \n" + "paddd %%xmm3,%%xmm2 \n" + MEMOP(movzwl,0x00,1,3,1) ",%k2 \n" // movzwl (%1,%3,1),%k2 + "movd %k2,%%xmm0 \n" + "psrlw $0x9,%%xmm1 \n" + BUNDLEALIGN + MEMOP(movzwl,0x00,1,4,1) ",%k2 \n" // movzwl (%1,%4,1),%k2 + "movd %k2,%%xmm4 \n" + "pshufb %%xmm5,%%xmm1 \n" + "punpcklwd %%xmm4,%%xmm0 \n" + "pxor %%xmm6,%%xmm1 \n" + "pmaddubsw %%xmm1,%%xmm0 \n" + "pextrw $0x1,%%xmm2,%k3 \n" + "pextrw $0x3,%%xmm2,%k4 \n" + "psrlw $0x7,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movd %%xmm0,%k2 \n" + "mov %w2," MEMACCESS(0) " \n" + "lea " MEMLEA(0x2,0) ",%0 \n" + "sub $0x2,%5 \n" + "jge 2b \n" + ".p2align 2 \n" + BUNDLEALIGN + "29: \n" + "addl $0x1,%5 \n" + "jl 99f \n" + MEMOP(movzwl,0x00,1,3,1) ",%k2 \n" // movzwl (%1,%3,1),%k2 + "movd %k2,%%xmm0 \n" + "psrlw $0x9,%%xmm2 \n" + "pshufb %%xmm5,%%xmm2 \n" + "pxor %%xmm6,%%xmm2 \n" + "pmaddubsw %%xmm2,%%xmm0 \n" + "psrlw $0x7,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movd %%xmm0,%k2 \n" + "mov %b2," MEMACCESS(0) " \n" + "99: \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+a"(temp_pixel), // %2 + "+r"(x0), // %3 + "+r"(x1), // %4 + "+rm"(dst_width) // %5 + : "rm"(x), // %6 + "rm"(dx) // %7 + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" +#endif + ); +} + +// Reads 4 pixels, duplicates them and writes 8 pixels. +// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. 
+void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int /* x */, int /* dx */) { + asm volatile ( + ".p2align 2 \n" + BUNDLEALIGN + "1: \n" + "movdqa " MEMACCESS(1) ",%%xmm0 \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm1 \n" + "sub $0x20,%2 \n" + "movdqa %%xmm0," MEMACCESS(0) " \n" + "movdqa %%xmm1," MEMACCESS2(0x10,0) " \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "jg 1b \n" + + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1" +#endif + ); +} + +void ScaleARGBRowDown2_SSE2(const uint8* src_argb, + ptrdiff_t /* src_stride */, + uint8* dst_argb, int dst_width) { + asm volatile ( + ".p2align 2 \n" + BUNDLEALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "shufps $0xdd,%%xmm1,%%xmm0 \n" + "sub $0x4,%2 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1" +#endif + ); +} + +void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb, + ptrdiff_t /* src_stride */, + uint8* dst_argb, int dst_width) { + asm volatile ( + ".p2align 2 \n" + BUNDLEALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm2 \n" + "pavgb %%xmm2,%%xmm0 \n" + "sub $0x4,%2 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1" +#endif + ); +} + +void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb, + ptrdiff_t src_stride, + uint8* dst_argb, int dst_width) { + asm volatile ( + ".p2align 2 \n" + BUNDLEALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + BUNDLEALIGN + MEMOPREG(movdqa,0x00,0,3,1,xmm2) // movdqa (%0,%3,1),%%xmm2 + MEMOPREG(movdqa,0x10,0,3,1,xmm3) // movdqa 0x10(%0,%3,1),%%xmm3 + "lea " MEMLEA(0x20,0) ",%0 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm2 \n" + "pavgb %%xmm2,%%xmm0 \n" + "sub $0x4,%2 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + : "r"(static_cast<intptr_t>(src_stride)) // %3 + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3" +#endif + ); +} + +// Reads 4 pixels at a time. +// Alignment requirement: dst_argb 16 byte aligned. 
+void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride, + int src_stepx, + uint8* dst_argb, int dst_width) { + intptr_t src_stepx_x4 = static_cast<intptr_t>(src_stepx); + intptr_t src_stepx_x12 = 0; + asm volatile ( + "lea " MEMLEA3(0x00,1,4) ",%1 \n" + "lea " MEMLEA4(0x00,1,1,2) ",%4 \n" + ".p2align 2 \n" + BUNDLEALIGN + "1: \n" + "movd " MEMACCESS(0) ",%%xmm0 \n" + MEMOPREG(movd,0x00,0,1,1,xmm1) // movd (%0,%1,1),%%xmm1 + "punpckldq %%xmm1,%%xmm0 \n" + BUNDLEALIGN + MEMOPREG(movd,0x00,0,1,2,xmm2) // movd (%0,%1,2),%%xmm2 + MEMOPREG(movd,0x00,0,4,1,xmm3) // movd (%0,%4,1),%%xmm3 + "lea " MEMLEA4(0x00,0,1,4) ",%0 \n" + "punpckldq %%xmm3,%%xmm2 \n" + "punpcklqdq %%xmm2,%%xmm0 \n" + "sub $0x4,%3 \n" + "movdqa %%xmm0," MEMACCESS(2) " \n" + "lea " MEMLEA(0x10,2) ",%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(src_stepx_x4), // %1 + "+r"(dst_argb), // %2 + "+r"(dst_width), // %3 + "+r"(src_stepx_x12) // %4 + : + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3" +#endif + ); +} + +// Blends four 2x2 to 4x1. +// Alignment requirement: dst_argb 16 byte aligned. +void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb, + ptrdiff_t src_stride, int src_stepx, + uint8* dst_argb, int dst_width) { + intptr_t src_stepx_x4 = static_cast<intptr_t>(src_stepx); + intptr_t src_stepx_x12 = 0; + intptr_t row1 = static_cast<intptr_t>(src_stride); + asm volatile ( + "lea " MEMLEA3(0x00,1,4) ",%1 \n" + "lea " MEMLEA4(0x00,1,1,2) ",%4 \n" + "lea " MEMLEA4(0x00,0,5,1) ",%5 \n" + ".p2align 2 \n" + BUNDLEALIGN + "1: \n" + "movq " MEMACCESS(0) ",%%xmm0 \n" + MEMOPREG(movhps,0x00,0,1,1,xmm0) // movhps (%0,%1,1),%%xmm0 + MEMOPREG(movq,0x00,0,1,2,xmm1) // movq (%0,%1,2),%%xmm1 + BUNDLEALIGN + MEMOPREG(movhps,0x00,0,4,1,xmm1) // movhps (%0,%4,1),%%xmm1 + "lea " MEMLEA4(0x00,0,1,4) ",%0 \n" + "movq " MEMACCESS(5) ",%%xmm2 \n" + BUNDLEALIGN + MEMOPREG(movhps,0x00,5,1,1,xmm2) // movhps (%5,%1,1),%%xmm2 + MEMOPREG(movq,0x00,5,1,2,xmm3) // movq (%5,%1,2),%%xmm3 + MEMOPREG(movhps,0x00,5,4,1,xmm3) // movhps (%5,%4,1),%%xmm3 + "lea " MEMLEA4(0x00,5,1,4) ",%5 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm2 \n" + "pavgb %%xmm2,%%xmm0 \n" + "sub $0x4,%3 \n" + "movdqa %%xmm0," MEMACCESS(2) " \n" + "lea " MEMLEA(0x10,2) ",%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(src_stepx_x4), // %1 + "+r"(dst_argb), // %2 + "+rm"(dst_width), // %3 + "+r"(src_stepx_x12), // %4 + "+r"(row1) // %5 + : + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3" +#endif + ); +} + +void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx) { + intptr_t x0 = 0, x1 = 0; + asm volatile ( + "movd %5,%%xmm2 \n" + "movd %6,%%xmm3 \n" + "pshufd $0x0,%%xmm2,%%xmm2 \n" + "pshufd $0x11,%%xmm3,%%xmm0 \n" + "paddd %%xmm0,%%xmm2 \n" + "paddd %%xmm3,%%xmm3 \n" + "pshufd $0x5,%%xmm3,%%xmm0 \n" + "paddd %%xmm0,%%xmm2 \n" + "paddd %%xmm3,%%xmm3 \n" + "pshufd $0x0,%%xmm3,%%xmm3 \n" + "pextrw $0x1,%%xmm2,%k0 \n" + "pextrw $0x3,%%xmm2,%k1 \n" + "cmp $0x0,%4 \n" + "jl 99f \n" + "sub $0x4,%4 \n" + "jl 49f \n" + ".p2align 2 \n" + BUNDLEALIGN + "40: \n" + MEMOPREG(movd,0x00,3,0,4,xmm0) // movd (%3,%0,4),%%xmm0 + MEMOPREG(movd,0x00,3,1,4,xmm1) // movd (%3,%1,4),%%xmm1 + "pextrw $0x5,%%xmm2,%k0 \n" + "pextrw 
$0x7,%%xmm2,%k1 \n" + "paddd %%xmm3,%%xmm2 \n" + "punpckldq %%xmm1,%%xmm0 \n" + MEMOPREG(movd,0x00,3,0,4,xmm1) // movd (%3,%0,4),%%xmm1 + MEMOPREG(movd,0x00,3,1,4,xmm4) // movd (%3,%1,4),%%xmm4 + "pextrw $0x1,%%xmm2,%k0 \n" + "pextrw $0x3,%%xmm2,%k1 \n" + "punpckldq %%xmm4,%%xmm1 \n" + "punpcklqdq %%xmm1,%%xmm0 \n" + "sub $0x4,%4 \n" + "movdqu %%xmm0," MEMACCESS(2) " \n" + "lea " MEMLEA(0x10,2) ",%2 \n" + "jge 40b \n" + + "49: \n" + "test $0x2,%4 \n" + "je 29f \n" + BUNDLEALIGN + MEMOPREG(movd,0x00,3,0,4,xmm0) // movd (%3,%0,4),%%xmm0 + MEMOPREG(movd,0x00,3,1,4,xmm1) // movd (%3,%1,4),%%xmm1 + "pextrw $0x5,%%xmm2,%k0 \n" + "punpckldq %%xmm1,%%xmm0 \n" + "movq %%xmm0," MEMACCESS(2) " \n" + "lea " MEMLEA(0x8,2) ",%2 \n" + "29: \n" + "test $0x1,%4 \n" + "je 99f \n" + MEMOPREG(movd,0x00,3,0,4,xmm0) // movd (%3,%0,4),%%xmm0 + "movd %%xmm0," MEMACCESS(2) " \n" + "99: \n" + : "+a"(x0), // %0 + "+d"(x1), // %1 + "+r"(dst_argb), // %2 + "+r"(src_argb), // %3 + "+r"(dst_width) // %4 + : "rm"(x), // %5 + "rm"(dx) // %6 + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" +#endif + ); +} + +// Reads 4 pixels, duplicates them and writes 8 pixels. +// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. +void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb, + int dst_width, int /* x */, int /* dx */) { + asm volatile ( + ".p2align 2 \n" + BUNDLEALIGN + "1: \n" + "movdqa " MEMACCESS(1) ",%%xmm0 \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpckldq %%xmm0,%%xmm0 \n" + "punpckhdq %%xmm1,%%xmm1 \n" + "sub $0x8,%2 \n" + "movdqa %%xmm0," MEMACCESS(0) " \n" + "movdqa %%xmm1," MEMACCESS2(0x10,0) " \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "jg 1b \n" + + : "+r"(dst_argb), // %0 + "+r"(src_argb), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1" +#endif + ); +} + +// Shuffle table for arranging 2 pixels into pairs for pmaddubsw +static uvec8 kShuffleColARGB = { + 0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel + 8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel +}; + +// Shuffle table for duplicating 2 fractions into 8 bytes each +static uvec8 kShuffleFractions = { + 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, +}; + +// Bilinear row filtering combines 4x2 -> 4x1. 
SSSE3 version +void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx) { + intptr_t x0 = 0, x1 = 0; + asm volatile ( + "movdqa %0,%%xmm4 \n" + "movdqa %1,%%xmm5 \n" + : + : "m"(kShuffleColARGB), // %0 + "m"(kShuffleFractions) // %1 + ); + + asm volatile ( + "movd %5,%%xmm2 \n" + "movd %6,%%xmm3 \n" + "pcmpeqb %%xmm6,%%xmm6 \n" + "psrlw $0x9,%%xmm6 \n" + "pextrw $0x1,%%xmm2,%k3 \n" + "sub $0x2,%2 \n" + "jl 29f \n" + "movdqa %%xmm2,%%xmm0 \n" + "paddd %%xmm3,%%xmm0 \n" + "punpckldq %%xmm0,%%xmm2 \n" + "punpckldq %%xmm3,%%xmm3 \n" + "paddd %%xmm3,%%xmm3 \n" + "pextrw $0x3,%%xmm2,%k4 \n" + + ".p2align 2 \n" + BUNDLEALIGN + "2: \n" + "movdqa %%xmm2,%%xmm1 \n" + "paddd %%xmm3,%%xmm2 \n" + MEMOPREG(movq,0x00,1,3,4,xmm0) // movq (%1,%3,4),%%xmm0 + "psrlw $0x9,%%xmm1 \n" + BUNDLEALIGN + MEMOPREG(movhps,0x00,1,4,4,xmm0) // movhps (%1,%4,4),%%xmm0 + "pshufb %%xmm5,%%xmm1 \n" + "pshufb %%xmm4,%%xmm0 \n" + "pxor %%xmm6,%%xmm1 \n" + "pmaddubsw %%xmm1,%%xmm0 \n" + "psrlw $0x7,%%xmm0 \n" + "pextrw $0x1,%%xmm2,%k3 \n" + "pextrw $0x3,%%xmm2,%k4 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0," MEMACCESS(0) " \n" + "lea " MEMLEA(0x8,0) ",%0 \n" + "sub $0x2,%2 \n" + "jge 2b \n" + + ".p2align 2 \n" + BUNDLEALIGN + "29: \n" + "add $0x1,%2 \n" + "jl 99f \n" + "psrlw $0x9,%%xmm2 \n" + BUNDLEALIGN + MEMOPREG(movq,0x00,1,3,4,xmm0) // movq (%1,%3,4),%%xmm0 + "pshufb %%xmm5,%%xmm2 \n" + "pshufb %%xmm4,%%xmm0 \n" + "pxor %%xmm6,%%xmm2 \n" + "pmaddubsw %%xmm2,%%xmm0 \n" + "psrlw $0x7,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movd %%xmm0," MEMACCESS(0) " \n" + + ".p2align 2 \n" + "99: \n" + : "+r"(dst_argb), // %0 + "+r"(src_argb), // %1 + "+rm"(dst_width), // %2 + "+r"(x0), // %3 + "+r"(x1) // %4 + : "rm"(x), // %5 + "rm"(dx) // %6 + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" +#endif + ); +} + +#endif // defined(__x86_64__) || defined(__i386__) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/chromium/third_party/libyuv/source/scale_win.cc b/chromium/third_party/libyuv/source/scale_win.cc new file mode 100644 index 00000000000..76f5f4b4b4f --- /dev/null +++ b/chromium/third_party/libyuv/source/scale_win.cc @@ -0,0 +1,1289 @@ +/* + * Copyright 2013 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// This module is for Visual C x86. +#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) + +// Offsets for source bytes 0 to 9 +static uvec8 kShuf0 = + { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 }; + +// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12. +static uvec8 kShuf1 = + { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 }; + +// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. 
+static uvec8 kShuf2 = + { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 }; + +// Offsets for source bytes 0 to 10 +static uvec8 kShuf01 = + { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 }; + +// Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13. +static uvec8 kShuf11 = + { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 }; + +// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. +static uvec8 kShuf21 = + { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 }; + +// Coefficients for source bytes 0 to 10 +static uvec8 kMadd01 = + { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 }; + +// Coefficients for source bytes 10 to 21 +static uvec8 kMadd11 = + { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 }; + +// Coefficients for source bytes 21 to 31 +static uvec8 kMadd21 = + { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 }; + +// Coefficients for source bytes 21 to 31 +static vec16 kRound34 = + { 2, 2, 2, 2, 2, 2, 2, 2 }; + +static uvec8 kShuf38a = + { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }; + +static uvec8 kShuf38b = + { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 }; + +// Arrange words 0,3,6 into 0,1,2 +static uvec8 kShufAc = + { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }; + +// Arrange words 0,3,6 into 3,4,5 +static uvec8 kShufAc3 = + { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 }; + +// Scaling values for boxes of 3x3 and 2x3 +static uvec16 kScaleAc33 = + { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 }; + +// Arrange first value for pixels 0,1,2,3,4,5 +static uvec8 kShufAb0 = + { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 }; + +// Arrange second value for pixels 0,1,2,3,4,5 +static uvec8 kShufAb1 = + { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 }; + +// Arrange third value for pixels 0,1,2,3,4,5 +static uvec8 kShufAb2 = + { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 }; + +// Scaling values for boxes of 3x2 and 2x2 +static uvec16 kScaleAb2 = + { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 }; + +// Reads 32 pixels, throws half away and writes 16 pixels. +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. +__declspec(naked) __declspec(align(16)) +void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + mov eax, [esp + 4] // src_ptr + // src_stride ignored + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width + + align 4 + wloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + lea eax, [eax + 32] + psrlw xmm0, 8 // isolate odd pixels. + psrlw xmm1, 8 + packuswb xmm0, xmm1 + sub ecx, 16 + movdqa [edx], xmm0 + lea edx, [edx + 16] + jg wloop + + ret + } +} + +// Blends 32x1 rectangle to 16x1. +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. 
+__declspec(naked) __declspec(align(16)) +void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + mov eax, [esp + 4] // src_ptr + // src_stride + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 + + align 4 + wloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + lea eax, [eax + 32] + + movdqa xmm2, xmm0 // average columns (32 to 16 pixels) + psrlw xmm0, 8 + movdqa xmm3, xmm1 + psrlw xmm1, 8 + pand xmm2, xmm5 + pand xmm3, xmm5 + pavgw xmm0, xmm2 + pavgw xmm1, xmm3 + packuswb xmm0, xmm1 + + sub ecx, 16 + movdqa [edx], xmm0 + lea edx, [edx + 16] + jg wloop + + ret + } +} + +// Blends 32x2 rectangle to 16x1. +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. +__declspec(naked) __declspec(align(16)) +void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_ptr + mov esi, [esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_ptr + mov ecx, [esp + 4 + 16] // dst_width + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 + + align 4 + wloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + movdqa xmm2, [eax + esi] + movdqa xmm3, [eax + esi + 16] + lea eax, [eax + 32] + pavgb xmm0, xmm2 // average rows + pavgb xmm1, xmm3 + + movdqa xmm2, xmm0 // average columns (32 to 16 pixels) + psrlw xmm0, 8 + movdqa xmm3, xmm1 + psrlw xmm1, 8 + pand xmm2, xmm5 + pand xmm3, xmm5 + pavgw xmm0, xmm2 + pavgw xmm1, xmm3 + packuswb xmm0, xmm1 + + sub ecx, 16 + movdqa [edx], xmm0 + lea edx, [edx + 16] + jg wloop + + pop esi + ret + } +} + +// Reads 32 pixels, throws half away and writes 16 pixels. +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. +__declspec(naked) __declspec(align(16)) +void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + mov eax, [esp + 4] // src_ptr + // src_stride ignored + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width + + align 4 + wloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + lea eax, [eax + 32] + psrlw xmm0, 8 // isolate odd pixels. + psrlw xmm1, 8 + packuswb xmm0, xmm1 + sub ecx, 16 + movdqu [edx], xmm0 + lea edx, [edx + 16] + jg wloop + + ret + } +} + +// Blends 32x1 rectangle to 16x1. +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. +__declspec(naked) __declspec(align(16)) +void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr, ptrdiff_t, + uint8* dst_ptr, int dst_width) { + __asm { + mov eax, [esp + 4] // src_ptr + // src_stride + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 + + align 4 + wloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + lea eax, [eax + 32] + + movdqa xmm2, xmm0 // average columns (32 to 16 pixels) + psrlw xmm0, 8 + movdqa xmm3, xmm1 + psrlw xmm1, 8 + pand xmm2, xmm5 + pand xmm3, xmm5 + pavgw xmm0, xmm2 + pavgw xmm1, xmm3 + packuswb xmm0, xmm1 + + sub ecx, 16 + movdqu [edx], xmm0 + lea edx, [edx + 16] + jg wloop + + ret + } +} + +// Blends 32x2 rectangle to 16x1. +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. 
+__declspec(naked) __declspec(align(16)) +void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_ptr + mov esi, [esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_ptr + mov ecx, [esp + 4 + 16] // dst_width + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 + + align 4 + wloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + esi] + movdqu xmm3, [eax + esi + 16] + lea eax, [eax + 32] + pavgb xmm0, xmm2 // average rows + pavgb xmm1, xmm3 + + movdqa xmm2, xmm0 // average columns (32 to 16 pixels) + psrlw xmm0, 8 + movdqa xmm3, xmm1 + psrlw xmm1, 8 + pand xmm2, xmm5 + pand xmm3, xmm5 + pavgw xmm0, xmm2 + pavgw xmm1, xmm3 + packuswb xmm0, xmm1 + + sub ecx, 16 + movdqu [edx], xmm0 + lea edx, [edx + 16] + jg wloop + + pop esi + ret + } +} + +// Point samples 32 pixels to 8 pixels. +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. +__declspec(naked) __declspec(align(16)) +void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + mov eax, [esp + 4] // src_ptr + // src_stride ignored + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width + pcmpeqb xmm5, xmm5 // generate mask 0x00ff0000 + psrld xmm5, 24 + pslld xmm5, 16 + + align 4 + wloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + lea eax, [eax + 32] + pand xmm0, xmm5 + pand xmm1, xmm5 + packuswb xmm0, xmm1 + psrlw xmm0, 8 + packuswb xmm0, xmm0 + sub ecx, 8 + movq qword ptr [edx], xmm0 + lea edx, [edx + 8] + jg wloop + + ret + } +} + +// Blends 32x4 rectangle to 8x1. +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. +__declspec(naked) __declspec(align(16)) +void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_ptr + mov esi, [esp + 8 + 8] // src_stride + mov edx, [esp + 8 + 12] // dst_ptr + mov ecx, [esp + 8 + 16] // dst_width + lea edi, [esi + esi * 2] // src_stride * 3 + pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff + psrlw xmm7, 8 + + align 4 + wloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + movdqa xmm2, [eax + esi] + movdqa xmm3, [eax + esi + 16] + pavgb xmm0, xmm2 // average rows + pavgb xmm1, xmm3 + movdqa xmm2, [eax + esi * 2] + movdqa xmm3, [eax + esi * 2 + 16] + movdqa xmm4, [eax + edi] + movdqa xmm5, [eax + edi + 16] + lea eax, [eax + 32] + pavgb xmm2, xmm4 + pavgb xmm3, xmm5 + pavgb xmm0, xmm2 + pavgb xmm1, xmm3 + + movdqa xmm2, xmm0 // average columns (32 to 16 pixels) + psrlw xmm0, 8 + movdqa xmm3, xmm1 + psrlw xmm1, 8 + pand xmm2, xmm7 + pand xmm3, xmm7 + pavgw xmm0, xmm2 + pavgw xmm1, xmm3 + packuswb xmm0, xmm1 + + movdqa xmm2, xmm0 // average columns (16 to 8 pixels) + psrlw xmm0, 8 + pand xmm2, xmm7 + pavgw xmm0, xmm2 + packuswb xmm0, xmm0 + + sub ecx, 8 + movq qword ptr [edx], xmm0 + lea edx, [edx + 8] + jg wloop + + pop edi + pop esi + ret + } +} + +// Point samples 32 pixels to 24 pixels. +// Produces three 8 byte values. For each 8 bytes, 16 bytes are read. +// Then shuffled to do the scaling. + +// Note that movdqa+palign may be better than movdqu. +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. 
+__declspec(naked) __declspec(align(16)) +void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + mov eax, [esp + 4] // src_ptr + // src_stride ignored + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width + movdqa xmm3, kShuf0 + movdqa xmm4, kShuf1 + movdqa xmm5, kShuf2 + + align 4 + wloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + lea eax, [eax + 32] + movdqa xmm2, xmm1 + palignr xmm1, xmm0, 8 + pshufb xmm0, xmm3 + pshufb xmm1, xmm4 + pshufb xmm2, xmm5 + movq qword ptr [edx], xmm0 + movq qword ptr [edx + 8], xmm1 + movq qword ptr [edx + 16], xmm2 + lea edx, [edx + 24] + sub ecx, 24 + jg wloop + + ret + } +} + +// Blends 32x2 rectangle to 24x1 +// Produces three 8 byte values. For each 8 bytes, 16 bytes are read. +// Then shuffled to do the scaling. + +// Register usage: +// xmm0 src_row 0 +// xmm1 src_row 1 +// xmm2 shuf 0 +// xmm3 shuf 1 +// xmm4 shuf 2 +// xmm5 madd 0 +// xmm6 madd 1 +// xmm7 kRound34 + +// Note that movdqa+palign may be better than movdqu. +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. +__declspec(naked) __declspec(align(16)) +void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_ptr + mov esi, [esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_ptr + mov ecx, [esp + 4 + 16] // dst_width + movdqa xmm2, kShuf01 + movdqa xmm3, kShuf11 + movdqa xmm4, kShuf21 + movdqa xmm5, kMadd01 + movdqa xmm6, kMadd11 + movdqa xmm7, kRound34 + + align 4 + wloop: + movdqa xmm0, [eax] // pixels 0..7 + movdqa xmm1, [eax + esi] + pavgb xmm0, xmm1 + pshufb xmm0, xmm2 + pmaddubsw xmm0, xmm5 + paddsw xmm0, xmm7 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + movq qword ptr [edx], xmm0 + movdqu xmm0, [eax + 8] // pixels 8..15 + movdqu xmm1, [eax + esi + 8] + pavgb xmm0, xmm1 + pshufb xmm0, xmm3 + pmaddubsw xmm0, xmm6 + paddsw xmm0, xmm7 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + movq qword ptr [edx + 8], xmm0 + movdqa xmm0, [eax + 16] // pixels 16..23 + movdqa xmm1, [eax + esi + 16] + lea eax, [eax + 32] + pavgb xmm0, xmm1 + pshufb xmm0, xmm4 + movdqa xmm1, kMadd21 + pmaddubsw xmm0, xmm1 + paddsw xmm0, xmm7 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + sub ecx, 24 + movq qword ptr [edx + 16], xmm0 + lea edx, [edx + 24] + jg wloop + + pop esi + ret + } +} + +// Note that movdqa+palign may be better than movdqu. +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. 
+__declspec(naked) __declspec(align(16)) +void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_ptr + mov esi, [esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_ptr + mov ecx, [esp + 4 + 16] // dst_width + movdqa xmm2, kShuf01 + movdqa xmm3, kShuf11 + movdqa xmm4, kShuf21 + movdqa xmm5, kMadd01 + movdqa xmm6, kMadd11 + movdqa xmm7, kRound34 + + align 4 + wloop: + movdqa xmm0, [eax] // pixels 0..7 + movdqa xmm1, [eax + esi] + pavgb xmm1, xmm0 + pavgb xmm0, xmm1 + pshufb xmm0, xmm2 + pmaddubsw xmm0, xmm5 + paddsw xmm0, xmm7 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + movq qword ptr [edx], xmm0 + movdqu xmm0, [eax + 8] // pixels 8..15 + movdqu xmm1, [eax + esi + 8] + pavgb xmm1, xmm0 + pavgb xmm0, xmm1 + pshufb xmm0, xmm3 + pmaddubsw xmm0, xmm6 + paddsw xmm0, xmm7 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + movq qword ptr [edx + 8], xmm0 + movdqa xmm0, [eax + 16] // pixels 16..23 + movdqa xmm1, [eax + esi + 16] + lea eax, [eax + 32] + pavgb xmm1, xmm0 + pavgb xmm0, xmm1 + pshufb xmm0, xmm4 + movdqa xmm1, kMadd21 + pmaddubsw xmm0, xmm1 + paddsw xmm0, xmm7 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + sub ecx, 24 + movq qword ptr [edx + 16], xmm0 + lea edx, [edx+24] + jg wloop + + pop esi + ret + } +} + +// 3/8 point sampler + +// Scale 32 pixels to 12 +__declspec(naked) __declspec(align(16)) +void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + mov eax, [esp + 4] // src_ptr + // src_stride ignored + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width + movdqa xmm4, kShuf38a + movdqa xmm5, kShuf38b + + align 4 + xloop: + movdqa xmm0, [eax] // 16 pixels -> 0,1,2,3,4,5 + movdqa xmm1, [eax + 16] // 16 pixels -> 6,7,8,9,10,11 + lea eax, [eax + 32] + pshufb xmm0, xmm4 + pshufb xmm1, xmm5 + paddusb xmm0, xmm1 + + sub ecx, 12 + movq qword ptr [edx], xmm0 // write 12 pixels + movhlps xmm1, xmm0 + movd [edx + 8], xmm1 + lea edx, [edx + 12] + jg xloop + + ret + } +} + +// Scale 16x3 pixels to 6x1 with interpolation +__declspec(naked) __declspec(align(16)) +void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_ptr + mov esi, [esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_ptr + mov ecx, [esp + 4 + 16] // dst_width + movdqa xmm2, kShufAc + movdqa xmm3, kShufAc3 + movdqa xmm4, kScaleAc33 + pxor xmm5, xmm5 + + align 4 + xloop: + movdqa xmm0, [eax] // sum up 3 rows into xmm0/1 + movdqa xmm6, [eax + esi] + movhlps xmm1, xmm0 + movhlps xmm7, xmm6 + punpcklbw xmm0, xmm5 + punpcklbw xmm1, xmm5 + punpcklbw xmm6, xmm5 + punpcklbw xmm7, xmm5 + paddusw xmm0, xmm6 + paddusw xmm1, xmm7 + movdqa xmm6, [eax + esi * 2] + lea eax, [eax + 16] + movhlps xmm7, xmm6 + punpcklbw xmm6, xmm5 + punpcklbw xmm7, xmm5 + paddusw xmm0, xmm6 + paddusw xmm1, xmm7 + + movdqa xmm6, xmm0 // 8 pixels -> 0,1,2 of xmm6 + psrldq xmm0, 2 + paddusw xmm6, xmm0 + psrldq xmm0, 2 + paddusw xmm6, xmm0 + pshufb xmm6, xmm2 + + movdqa xmm7, xmm1 // 8 pixels -> 3,4,5 of xmm6 + psrldq xmm1, 2 + paddusw xmm7, xmm1 + psrldq xmm1, 2 + paddusw xmm7, xmm1 + pshufb xmm7, xmm3 + paddusw xmm6, xmm7 + + pmulhuw xmm6, xmm4 // divide by 9,9,6, 9,9,6 + packuswb xmm6, xmm6 + + sub ecx, 6 + movd [edx], xmm6 // write 6 pixels + psrlq xmm6, 16 + movd [edx + 2], xmm6 + lea edx, [edx + 6] + jg xloop + + pop esi + ret + } +} + +// Scale 16x2 pixels to 6x1 with 
interpolation +__declspec(naked) __declspec(align(16)) +void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_ptr + mov esi, [esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_ptr + mov ecx, [esp + 4 + 16] // dst_width + movdqa xmm2, kShufAb0 + movdqa xmm3, kShufAb1 + movdqa xmm4, kShufAb2 + movdqa xmm5, kScaleAb2 + + align 4 + xloop: + movdqa xmm0, [eax] // average 2 rows into xmm0 + pavgb xmm0, [eax + esi] + lea eax, [eax + 16] + + movdqa xmm1, xmm0 // 16 pixels -> 0,1,2,3,4,5 of xmm1 + pshufb xmm1, xmm2 + movdqa xmm6, xmm0 + pshufb xmm6, xmm3 + paddusw xmm1, xmm6 + pshufb xmm0, xmm4 + paddusw xmm1, xmm0 + + pmulhuw xmm1, xmm5 // divide by 3,3,2, 3,3,2 + packuswb xmm1, xmm1 + + sub ecx, 6 + movd [edx], xmm1 // write 6 pixels + psrlq xmm1, 16 + movd [edx + 2], xmm1 + lea edx, [edx + 6] + jg xloop + + pop esi + ret + } +} + +// Reads 16xN bytes and produces 16 shorts at a time. +// TODO(fbarchard): Make this handle 4xN bytes for any width ARGB. +__declspec(naked) __declspec(align(16)) +void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint16* dst_ptr, int src_width, + int src_height) { + __asm { + push esi + push edi + push ebx + push ebp + mov esi, [esp + 16 + 4] // src_ptr + mov edx, [esp + 16 + 8] // src_stride + mov edi, [esp + 16 + 12] // dst_ptr + mov ecx, [esp + 16 + 16] // dst_width + mov ebx, [esp + 16 + 20] // height + pxor xmm4, xmm4 + dec ebx + + align 4 + xloop: + // first row + movdqa xmm0, [esi] + lea eax, [esi + edx] + movdqa xmm1, xmm0 + punpcklbw xmm0, xmm4 + punpckhbw xmm1, xmm4 + lea esi, [esi + 16] + mov ebp, ebx + test ebp, ebp + je ydone + + // sum remaining rows + align 4 + yloop: + movdqa xmm2, [eax] // read 16 pixels + lea eax, [eax + edx] // advance to next row + movdqa xmm3, xmm2 + punpcklbw xmm2, xmm4 + punpckhbw xmm3, xmm4 + paddusw xmm0, xmm2 // sum 16 words + paddusw xmm1, xmm3 + sub ebp, 1 + jg yloop + + align 4 + ydone: + movdqa [edi], xmm0 + movdqa [edi + 16], xmm1 + lea edi, [edi + 32] + + sub ecx, 16 + jg xloop + + pop ebp + pop ebx + pop edi + pop esi + ret + } +} + +// Bilinear column filtering. SSSE3 version. +// TODO(fbarchard): Port to Neon +// TODO(fbarchard): Switch the following: +// xor ebx, ebx +// mov bx, word ptr [esi + eax] // 2 source x0 pixels +// To +// movzx ebx, word ptr [esi + eax] // 2 source x0 pixels +// when drmemory bug fixed. +// https://code.google.com/p/drmemory/issues/detail?id=1396 + +__declspec(naked) __declspec(align(16)) +void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int x, int dx) { + __asm { + push ebx + push esi + push edi + mov edi, [esp + 12 + 4] // dst_ptr + mov esi, [esp + 12 + 8] // src_ptr + mov ecx, [esp + 12 + 12] // dst_width + movd xmm2, [esp + 12 + 16] // x + movd xmm3, [esp + 12 + 20] // dx + mov eax, 0x04040000 // shuffle to line up fractions with pixel. + movd xmm5, eax + pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction. + psrlw xmm6, 9 + pextrw eax, xmm2, 1 // get x0 integer. preroll + sub ecx, 2 + jl xloop29 + + movdqa xmm0, xmm2 // x1 = x0 + dx + paddd xmm0, xmm3 + punpckldq xmm2, xmm0 // x0 x1 + punpckldq xmm3, xmm3 // dx dx + paddd xmm3, xmm3 // dx * 2, dx * 2 + pextrw edx, xmm2, 3 // get x1 integer. preroll + + // 2 Pixel loop. + align 4 + xloop2: + movdqa xmm1, xmm2 // x0, x1 fractions. 
+ paddd xmm2, xmm3 // x += dx + movzx ebx, word ptr [esi + eax] // 2 source x0 pixels + movd xmm0, ebx + psrlw xmm1, 9 // 7 bit fractions. + movzx ebx, word ptr [esi + edx] // 2 source x1 pixels + movd xmm4, ebx + pshufb xmm1, xmm5 // 0011 + punpcklwd xmm0, xmm4 + pxor xmm1, xmm6 // 0..7f and 7f..0 + pmaddubsw xmm0, xmm1 // 16 bit, 2 pixels. + pextrw eax, xmm2, 1 // get x0 integer. next iteration. + pextrw edx, xmm2, 3 // get x1 integer. next iteration. + psrlw xmm0, 7 // 8.7 fixed point to low 8 bits. + packuswb xmm0, xmm0 // 8 bits, 2 pixels. + movd ebx, xmm0 + mov [edi], bx + lea edi, [edi + 2] + sub ecx, 2 // 2 pixels + jge xloop2 + + align 4 + xloop29: + + add ecx, 2 - 1 + jl xloop99 + + // 1 pixel remainder + movzx ebx, word ptr [esi + eax] // 2 source x0 pixels + movd xmm0, ebx + psrlw xmm2, 9 // 7 bit fractions. + pshufb xmm2, xmm5 // 0011 + pxor xmm2, xmm6 // 0..7f and 7f..0 + pmaddubsw xmm0, xmm2 // 16 bit + psrlw xmm0, 7 // 8.7 fixed point to low 8 bits. + packuswb xmm0, xmm0 // 8 bits + movd ebx, xmm0 + mov [edi], bl + + align 4 + xloop99: + + pop edi + pop esi + pop ebx + ret + } +} + +// Reads 16 pixels, duplicates them and writes 32 pixels. +// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. +__declspec(naked) __declspec(align(16)) +void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int /* x */, int /* dx */) { + __asm { + mov edx, [esp + 4] // dst_ptr + mov eax, [esp + 8] // src_ptr + mov ecx, [esp + 12] // dst_width + + align 4 + wloop: + movdqa xmm0, [eax] + lea eax, [eax + 16] + movdqa xmm1, xmm0 + punpcklbw xmm0, xmm0 + punpckhbw xmm1, xmm1 + sub ecx, 32 + movdqa [edx], xmm0 + movdqa [edx + 16], xmm1 + lea edx, [edx + 32] + jg wloop + + ret + } +} + +// Reads 8 pixels, throws half away and writes 4 even pixels (0, 2, 4, 6) +// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. +__declspec(naked) __declspec(align(16)) +void ScaleARGBRowDown2_SSE2(const uint8* src_argb, + ptrdiff_t /* src_stride */, + uint8* dst_argb, int dst_width) { + __asm { + mov eax, [esp + 4] // src_argb + // src_stride ignored + mov edx, [esp + 12] // dst_argb + mov ecx, [esp + 16] // dst_width + + align 4 + wloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + lea eax, [eax + 32] + shufps xmm0, xmm1, 0xdd + sub ecx, 4 + movdqa [edx], xmm0 + lea edx, [edx + 16] + jg wloop + + ret + } +} + +// Blends 8x1 rectangle to 4x1. +// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. +__declspec(naked) __declspec(align(16)) +void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb, + ptrdiff_t /* src_stride */, + uint8* dst_argb, int dst_width) { + __asm { + mov eax, [esp + 4] // src_argb + // src_stride ignored + mov edx, [esp + 12] // dst_argb + mov ecx, [esp + 16] // dst_width + + align 4 + wloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + lea eax, [eax + 32] + movdqa xmm2, xmm0 + shufps xmm0, xmm1, 0x88 // even pixels + shufps xmm2, xmm1, 0xdd // odd pixels + pavgb xmm0, xmm2 + sub ecx, 4 + movdqa [edx], xmm0 + lea edx, [edx + 16] + jg wloop + + ret + } +} + +// Blends 8x2 rectangle to 4x1. +// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. 
+__declspec(naked) __declspec(align(16)) +void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb, + ptrdiff_t src_stride, + uint8* dst_argb, int dst_width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_argb + mov esi, [esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_argb + mov ecx, [esp + 4 + 16] // dst_width + + align 4 + wloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + movdqa xmm2, [eax + esi] + movdqa xmm3, [eax + esi + 16] + lea eax, [eax + 32] + pavgb xmm0, xmm2 // average rows + pavgb xmm1, xmm3 + movdqa xmm2, xmm0 // average columns (8 to 4 pixels) + shufps xmm0, xmm1, 0x88 // even pixels + shufps xmm2, xmm1, 0xdd // odd pixels + pavgb xmm0, xmm2 + sub ecx, 4 + movdqa [edx], xmm0 + lea edx, [edx + 16] + jg wloop + + pop esi + ret + } +} + +// Reads 4 pixels at a time. +// Alignment requirement: dst_argb 16 byte aligned. +__declspec(naked) __declspec(align(16)) +void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride, + int src_stepx, + uint8* dst_argb, int dst_width) { + __asm { + push ebx + push edi + mov eax, [esp + 8 + 4] // src_argb + // src_stride ignored + mov ebx, [esp + 8 + 12] // src_stepx + mov edx, [esp + 8 + 16] // dst_argb + mov ecx, [esp + 8 + 20] // dst_width + lea ebx, [ebx * 4] + lea edi, [ebx + ebx * 2] + + align 4 + wloop: + movd xmm0, [eax] + movd xmm1, [eax + ebx] + punpckldq xmm0, xmm1 + movd xmm2, [eax + ebx * 2] + movd xmm3, [eax + edi] + lea eax, [eax + ebx * 4] + punpckldq xmm2, xmm3 + punpcklqdq xmm0, xmm2 + sub ecx, 4 + movdqa [edx], xmm0 + lea edx, [edx + 16] + jg wloop + + pop edi + pop ebx + ret + } +} + +// Blends four 2x2 to 4x1. +// Alignment requirement: dst_argb 16 byte aligned. +__declspec(naked) __declspec(align(16)) +void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8* dst_argb, int dst_width) { + __asm { + push ebx + push esi + push edi + mov eax, [esp + 12 + 4] // src_argb + mov esi, [esp + 12 + 8] // src_stride + mov ebx, [esp + 12 + 12] // src_stepx + mov edx, [esp + 12 + 16] // dst_argb + mov ecx, [esp + 12 + 20] // dst_width + lea esi, [eax + esi] // row1 pointer + lea ebx, [ebx * 4] + lea edi, [ebx + ebx * 2] + + align 4 + wloop: + movq xmm0, qword ptr [eax] // row0 4 pairs + movhps xmm0, qword ptr [eax + ebx] + movq xmm1, qword ptr [eax + ebx * 2] + movhps xmm1, qword ptr [eax + edi] + lea eax, [eax + ebx * 4] + movq xmm2, qword ptr [esi] // row1 4 pairs + movhps xmm2, qword ptr [esi + ebx] + movq xmm3, qword ptr [esi + ebx * 2] + movhps xmm3, qword ptr [esi + edi] + lea esi, [esi + ebx * 4] + pavgb xmm0, xmm2 // average rows + pavgb xmm1, xmm3 + movdqa xmm2, xmm0 // average columns (8 to 4 pixels) + shufps xmm0, xmm1, 0x88 // even pixels + shufps xmm2, xmm1, 0xdd // odd pixels + pavgb xmm0, xmm2 + sub ecx, 4 + movdqa [edx], xmm0 + lea edx, [edx + 16] + jg wloop + + pop edi + pop esi + pop ebx + ret + } +} + +// Column scaling unfiltered. SSE2 version. 
+__declspec(naked) __declspec(align(16)) +void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx) { + __asm { + push edi + push esi + mov edi, [esp + 8 + 4] // dst_argb + mov esi, [esp + 8 + 8] // src_argb + mov ecx, [esp + 8 + 12] // dst_width + movd xmm2, [esp + 8 + 16] // x + movd xmm3, [esp + 8 + 20] // dx + + pshufd xmm2, xmm2, 0 // x0 x0 x0 x0 + pshufd xmm0, xmm3, 0x11 // dx 0 dx 0 + paddd xmm2, xmm0 + paddd xmm3, xmm3 // 0, 0, 0, dx * 2 + pshufd xmm0, xmm3, 0x05 // dx * 2, dx * 2, 0, 0 + paddd xmm2, xmm0 // x3 x2 x1 x0 + paddd xmm3, xmm3 // 0, 0, 0, dx * 4 + pshufd xmm3, xmm3, 0 // dx * 4, dx * 4, dx * 4, dx * 4 + + pextrw eax, xmm2, 1 // get x0 integer. + pextrw edx, xmm2, 3 // get x1 integer. + + cmp ecx, 0 + jle xloop99 + sub ecx, 4 + jl xloop49 + + // 4 Pixel loop. + align 4 + xloop4: + movd xmm0, [esi + eax * 4] // 1 source x0 pixels + movd xmm1, [esi + edx * 4] // 1 source x1 pixels + pextrw eax, xmm2, 5 // get x2 integer. + pextrw edx, xmm2, 7 // get x3 integer. + paddd xmm2, xmm3 // x += dx + punpckldq xmm0, xmm1 // x0 x1 + + movd xmm1, [esi + eax * 4] // 1 source x2 pixels + movd xmm4, [esi + edx * 4] // 1 source x3 pixels + pextrw eax, xmm2, 1 // get x0 integer. next iteration. + pextrw edx, xmm2, 3 // get x1 integer. next iteration. + punpckldq xmm1, xmm4 // x2 x3 + punpcklqdq xmm0, xmm1 // x0 x1 x2 x3 + sub ecx, 4 // 4 pixels + movdqu [edi], xmm0 + lea edi, [edi + 16] + jge xloop4 + + align 4 + xloop49: + test ecx, 2 + je xloop29 + + // 2 Pixels. + movd xmm0, [esi + eax * 4] // 1 source x0 pixels + movd xmm1, [esi + edx * 4] // 1 source x1 pixels + pextrw eax, xmm2, 5 // get x2 integer. + punpckldq xmm0, xmm1 // x0 x1 + + movq qword ptr [edi], xmm0 + lea edi, [edi + 8] + + xloop29: + test ecx, 1 + je xloop99 + + // 1 Pixels. + movd xmm0, [esi + eax * 4] // 1 source x2 pixels + movd dword ptr [edi], xmm0 + align 4 + xloop99: + + pop esi + pop edi + ret + } +} + +// Bilinear row filtering combines 2x1 -> 1x1. SSSE3 version. +// TODO(fbarchard): Port to Neon + +// Shuffle table for arranging 2 pixels into pairs for pmaddubsw +static uvec8 kShuffleColARGB = { + 0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel + 8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel +}; + +// Shuffle table for duplicating 2 fractions into 8 bytes each +static uvec8 kShuffleFractions = { + 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, +}; + +__declspec(naked) __declspec(align(16)) +void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx) { + __asm { + push esi + push edi + mov edi, [esp + 8 + 4] // dst_argb + mov esi, [esp + 8 + 8] // src_argb + mov ecx, [esp + 8 + 12] // dst_width + movd xmm2, [esp + 8 + 16] // x + movd xmm3, [esp + 8 + 20] // dx + movdqa xmm4, kShuffleColARGB + movdqa xmm5, kShuffleFractions + pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction. + psrlw xmm6, 9 + pextrw eax, xmm2, 1 // get x0 integer. preroll + sub ecx, 2 + jl xloop29 + + movdqa xmm0, xmm2 // x1 = x0 + dx + paddd xmm0, xmm3 + punpckldq xmm2, xmm0 // x0 x1 + punpckldq xmm3, xmm3 // dx dx + paddd xmm3, xmm3 // dx * 2, dx * 2 + pextrw edx, xmm2, 3 // get x1 integer. preroll + + // 2 Pixel loop. + align 4 + xloop2: + movdqa xmm1, xmm2 // x0, x1 fractions. + paddd xmm2, xmm3 // x += dx + movq xmm0, qword ptr [esi + eax * 4] // 2 source x0 pixels + psrlw xmm1, 9 // 7 bit fractions. 
+ movhps xmm0, qword ptr [esi + edx * 4] // 2 source x1 pixels + pshufb xmm1, xmm5 // 0000000011111111 + pshufb xmm0, xmm4 // arrange pixels into pairs + pxor xmm1, xmm6 // 0..7f and 7f..0 + pmaddubsw xmm0, xmm1 // argb_argb 16 bit, 2 pixels. + pextrw eax, xmm2, 1 // get x0 integer. next iteration. + pextrw edx, xmm2, 3 // get x1 integer. next iteration. + psrlw xmm0, 7 // argb 8.7 fixed point to low 8 bits. + packuswb xmm0, xmm0 // argb_argb 8 bits, 2 pixels. + movq qword ptr [edi], xmm0 + lea edi, [edi + 8] + sub ecx, 2 // 2 pixels + jge xloop2 + + align 4 + xloop29: + + add ecx, 2 - 1 + jl xloop99 + + // 1 pixel remainder + psrlw xmm2, 9 // 7 bit fractions. + movq xmm0, qword ptr [esi + eax * 4] // 2 source x0 pixels + pshufb xmm2, xmm5 // 00000000 + pshufb xmm0, xmm4 // arrange pixels into pairs + pxor xmm2, xmm6 // 0..7f and 7f..0 + pmaddubsw xmm0, xmm2 // argb 16 bit, 1 pixel. + psrlw xmm0, 7 + packuswb xmm0, xmm0 // argb 8 bits, 1 pixel. + movd [edi], xmm0 + + align 4 + xloop99: + + pop edi + pop esi + ret + } +} + +// Reads 4 pixels, duplicates them and writes 8 pixels. +// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. +__declspec(naked) __declspec(align(16)) +void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb, + int dst_width, int /* x */, int /* dx */) { + __asm { + mov edx, [esp + 4] // dst_argb + mov eax, [esp + 8] // src_argb + mov ecx, [esp + 12] // dst_width + + align 4 + wloop: + movdqa xmm0, [eax] + lea eax, [eax + 16] + movdqa xmm1, xmm0 + punpckldq xmm0, xmm0 + punpckhdq xmm1, xmm1 + sub ecx, 8 + movdqa [edx], xmm0 + movdqa [edx + 16], xmm1 + lea edx, [edx + 32] + jg wloop + + ret + } +} + +#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/chromium/third_party/libyuv/tools/valgrind-libyuv/libyuv_tests.py b/chromium/third_party/libyuv/tools/valgrind-libyuv/libyuv_tests.py index f93e97bb71f..1b912b8ba82 100755 --- a/chromium/third_party/libyuv/tools/valgrind-libyuv/libyuv_tests.py +++ b/chromium/third_party/libyuv/tools/valgrind-libyuv/libyuv_tests.py @@ -73,9 +73,10 @@ class LibyuvTest(chrome_tests.ChromeTests): def main(_): parser = optparse.OptionParser('usage: %prog -b <dir> -t <test> <test args>') parser.disable_interspersed_args() - parser.add_option('-b', '--build_dir', + parser.add_option('-b', '--build-dir', help=('Location of the compiler output. Can only be used ' 'when the test argument does not contain this path.')) + parser.add_option("--target", help="Debug or Release") parser.add_option('-t', '--test', help='Test to run.') parser.add_option('', '--baseline', action='store_true', default=False, help='Generate baseline data instead of validating') @@ -104,6 +105,11 @@ def main(_): if not options.test: parser.error('--test not specified') + # Support build dir both with and without the target. + if (options.target and options.build_dir and + not options.build_dir.endswith(options.target)): + options.build_dir = os.path.join(options.build_dir, options.target) + # If --build_dir is provided, prepend it to the test executable if needed. 
test_executable = options.test if options.build_dir and not test_executable.startswith(options.build_dir): diff --git a/chromium/third_party/libyuv/unit_test/compare_test.cc b/chromium/third_party/libyuv/unit_test/compare_test.cc index 7fe6c3b0b19..efc2e39e68f 100644 --- a/chromium/third_party/libyuv/unit_test/compare_test.cc +++ b/chromium/third_party/libyuv/unit_test/compare_test.cc @@ -39,7 +39,7 @@ TEST_F(libyuvTest, Djb2_Test) { " and feels as if he were in the seventh heaven of typography" " together with Hermann Zapf"; uint32 foxhash = HashDjb2(reinterpret_cast<const uint8*>(fox), 131, 5381); - const uint32 kExpectedFoxHash = 2611006483; + const uint32 kExpectedFoxHash = 2611006483u; EXPECT_EQ(kExpectedFoxHash, foxhash); for (int i = 0; i < kMaxTest; ++i) { @@ -286,9 +286,9 @@ TEST_F(libyuvTest, Psnr) { src_b + kSrcStride * b + b, kSrcStride, kSrcWidth, kSrcHeight); - EXPECT_GT(err, 4.0); + EXPECT_GT(err, 2.0); if (kSrcWidth * kSrcHeight >= 256) { - EXPECT_LT(err, 5.0); + EXPECT_LT(err, 6.0); } srandom(time(NULL)); @@ -322,7 +322,7 @@ TEST_F(libyuvTest, Psnr) { free_aligned_buffer_64(src_b) } -TEST_F(libyuvTest, BenchmarkSsim_Opt) { +TEST_F(libyuvTest, DISABLED_BenchmarkSsim_Opt) { align_buffer_64(src_a, benchmark_width_ * benchmark_height_) align_buffer_64(src_b, benchmark_width_ * benchmark_height_) for (int i = 0; i < benchmark_width_ * benchmark_height_; ++i) { diff --git a/chromium/third_party/libyuv/unit_test/convert_test.cc b/chromium/third_party/libyuv/unit_test/convert_test.cc index 7e96c63a4d5..d5eaca0569b 100644 --- a/chromium/third_party/libyuv/unit_test/convert_test.cc +++ b/chromium/third_party/libyuv/unit_test/convert_test.cc @@ -1,990 +1,997 @@ -/*
- * Copyright 2011 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <stdlib.h>
-#include <time.h>
-
-#include "libyuv/compare.h"
-#include "libyuv/convert.h"
-#include "libyuv/convert_argb.h"
-#include "libyuv/convert_from.h"
-#include "libyuv/convert_from_argb.h"
-#include "libyuv/cpu_id.h"
-#include "libyuv/format_conversion.h"
-#ifdef HAVE_JPEG
-#include "libyuv/mjpeg_decoder.h"
-#endif
-#include "libyuv/planar_functions.h"
-#include "libyuv/rotate.h"
-#include "../unit_test/unit_test.h"
-
-#if defined(_MSC_VER)
-#define SIMD_ALIGNED(var) __declspec(align(16)) var
-#else // __GNUC__
-#define SIMD_ALIGNED(var) var __attribute__((aligned(16)))
-#endif
-
-namespace libyuv {
-
-#define SUBSAMPLE(v, a) ((((v) + (a) - 1)) / (a))
-
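The SUBSAMPLE macro above is a ceiling division used throughout these tests to size the chroma planes, so odd widths and heights still get a final (partial) chroma sample instead of being truncated. A minimal standalone sketch of the same arithmetic (the helper name is illustrative, not from libyuv):

#include <assert.h>

// Ceiling division, matching SUBSAMPLE(v, a) = (v + a - 1) / a.
static int Subsample(int v, int a) {
  return (v + a - 1) / a;
}

int main() {
  // I420 chroma planes are subsampled by 2 in both directions.
  assert(Subsample(1280, 2) == 640);  // even width: exact half
  assert(Subsample(1279, 2) == 640);  // odd width: rounded up, not truncated
  // I411 subsamples horizontally by 4.
  assert(Subsample(1279, 4) == 320);
  return 0;
}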
-#define TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
- FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG, OFF) \
-TEST_F(libyuvTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \
- const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
- const int kHeight = benchmark_height_; \
- align_buffer_64(src_y, kWidth * kHeight + OFF); \
- align_buffer_64(src_u, \
- SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + OFF); \
- align_buffer_64(src_v, \
- SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + OFF); \
- align_buffer_64(dst_y_c, kWidth * kHeight); \
- align_buffer_64(dst_u_c, \
- SUBSAMPLE(kWidth, SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- align_buffer_64(dst_v_c, \
- SUBSAMPLE(kWidth, SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- align_buffer_64(dst_y_opt, kWidth * kHeight); \
- align_buffer_64(dst_u_opt, \
- SUBSAMPLE(kWidth, SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- align_buffer_64(dst_v_opt, \
- SUBSAMPLE(kWidth, SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- srandom(time(NULL)); \
- for (int i = 0; i < kHeight; ++i) \
- for (int j = 0; j < kWidth; ++j) \
- src_y[(i * kWidth) + j + OFF] = (random() & 0xff); \
- for (int i = 0; i < SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); ++i) { \
- for (int j = 0; j < SUBSAMPLE(kWidth, SRC_SUBSAMP_X); ++j) { \
- src_u[(i * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) + j + OFF] = \
- (random() & 0xff); \
- src_v[(i * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) + j + OFF] = \
- (random() & 0xff); \
- } \
- } \
- MaskCpuFlags(0); \
- SRC_FMT_PLANAR##To##FMT_PLANAR(src_y + OFF, kWidth, \
- src_u + OFF, \
- SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
- src_v + OFF, \
- SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
- dst_y_c, kWidth, \
- dst_u_c, SUBSAMPLE(kWidth, SUBSAMP_X), \
- dst_v_c, SUBSAMPLE(kWidth, SUBSAMP_X), \
- kWidth, NEG kHeight); \
- MaskCpuFlags(-1); \
- for (int i = 0; i < benchmark_iterations_; ++i) { \
- SRC_FMT_PLANAR##To##FMT_PLANAR(src_y + OFF, kWidth, \
- src_u + OFF, \
- SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
- src_v + OFF, \
- SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
- dst_y_opt, kWidth, \
- dst_u_opt, SUBSAMPLE(kWidth, SUBSAMP_X), \
- dst_v_opt, SUBSAMPLE(kWidth, SUBSAMP_X), \
- kWidth, NEG kHeight); \
- } \
- int max_diff = 0; \
- for (int i = 0; i < kHeight; ++i) { \
- for (int j = 0; j < kWidth; ++j) { \
- int abs_diff = \
- abs(static_cast<int>(dst_y_c[i * kWidth + j]) - \
- static_cast<int>(dst_y_opt[i * kWidth + j])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
- } \
- } \
- EXPECT_LE(max_diff, 1); \
- for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
- for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \
- int abs_diff = \
- abs(static_cast<int>(dst_u_c[i * \
- SUBSAMPLE(kWidth, SUBSAMP_X) + j]) - \
- static_cast<int>(dst_u_opt[i * \
- SUBSAMPLE(kWidth, SUBSAMP_X) + j])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
- } \
- } \
- EXPECT_LE(max_diff, 1); \
- for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
- for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \
- int abs_diff = \
- abs(static_cast<int>(dst_v_c[i * \
- SUBSAMPLE(kWidth, SUBSAMP_X) + j]) - \
- static_cast<int>(dst_v_opt[i * \
- SUBSAMPLE(kWidth, SUBSAMP_X) + j])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
- } \
- } \
- EXPECT_LE(max_diff, 1); \
- free_aligned_buffer_64(dst_y_c) \
- free_aligned_buffer_64(dst_u_c) \
- free_aligned_buffer_64(dst_v_c) \
- free_aligned_buffer_64(dst_y_opt) \
- free_aligned_buffer_64(dst_u_opt) \
- free_aligned_buffer_64(dst_v_opt) \
- free_aligned_buffer_64(src_y) \
- free_aligned_buffer_64(src_u) \
- free_aligned_buffer_64(src_v) \
-}
-
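The TESTPLANARTOPI macro above follows one fixed pattern: run the conversion once with SIMD masked off (MaskCpuFlags(0)) to produce a C reference, restore all CPU paths (MaskCpuFlags(-1)), run the optimized conversion in the benchmark loop, then compare the two outputs plane by plane using the maximum absolute byte difference. A minimal sketch of that comparison step in isolation; the Convert() call and buffer names in the usage comment are placeholders, not libyuv APIs:

#include <cstdint>
#include <cstdlib>

// Maximum absolute per-byte difference between the C-path and the
// optimized-path output, as the macro computes it inline.
static int MaxAbsDiff(const uint8_t* c_out, const uint8_t* opt_out, int len) {
  int max_diff = 0;
  for (int i = 0; i < len; ++i) {
    int abs_diff = abs(static_cast<int>(c_out[i]) - static_cast<int>(opt_out[i]));
    if (abs_diff > max_diff) {
      max_diff = abs_diff;
    }
  }
  return max_diff;
}

// Usage inside a test, roughly:
//   MaskCpuFlags(0);               // force the C path
//   Convert(..., dst_c, ...);      // placeholder for the generated call
//   MaskCpuFlags(-1);              // re-enable all SIMD paths
//   Convert(..., dst_opt, ...);
//   EXPECT_LE(MaxAbsDiff(dst_c, dst_opt, size), tolerance);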
-#define TESTPLANARTOP(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
- FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \
- TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
- FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- benchmark_width_ - 4, _Any, +, 0) \
- TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
- FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- benchmark_width_, _Unaligned, +, 1) \
- TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
- FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- benchmark_width_, _Invert, -, 0) \
- TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
- FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- benchmark_width_, _Opt, +, 0)
-
-TESTPLANARTOP(I420, 2, 2, I420, 2, 2)
-TESTPLANARTOP(I422, 2, 1, I420, 2, 2)
-TESTPLANARTOP(I444, 1, 1, I420, 2, 2)
-TESTPLANARTOP(I411, 4, 1, I420, 2, 2)
-TESTPLANARTOP(I420, 2, 2, I422, 2, 1)
-TESTPLANARTOP(I420, 2, 2, I444, 1, 1)
-TESTPLANARTOP(I420, 2, 2, I411, 4, 1)
-TESTPLANARTOP(I420, 2, 2, I420Mirror, 2, 2)
-TESTPLANARTOP(I422, 2, 1, I422, 2, 1)
-TESTPLANARTOP(I444, 1, 1, I444, 1, 1)
-
-#define TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
- FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG, OFF) \
-TEST_F(libyuvTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \
- const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
- const int kHeight = benchmark_height_; \
- align_buffer_64(src_y, kWidth * kHeight + OFF); \
- align_buffer_64(src_u, \
- SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + OFF); \
- align_buffer_64(src_v, \
- SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + OFF); \
- align_buffer_64(dst_y_c, kWidth * kHeight); \
- align_buffer_64(dst_uv_c, SUBSAMPLE(kWidth * 2, SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- align_buffer_64(dst_y_opt, kWidth * kHeight); \
- align_buffer_64(dst_uv_opt, SUBSAMPLE(kWidth * 2, SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- srandom(time(NULL)); \
- for (int i = 0; i < kHeight; ++i) \
- for (int j = 0; j < kWidth; ++j) \
- src_y[(i * kWidth) + j + OFF] = (random() & 0xff); \
- for (int i = 0; i < SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); ++i) { \
- for (int j = 0; j < SUBSAMPLE(kWidth, SRC_SUBSAMP_X); ++j) { \
- src_u[(i * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) + j + OFF] = \
- (random() & 0xff); \
- src_v[(i * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) + j + OFF] = \
- (random() & 0xff); \
- } \
- } \
- MaskCpuFlags(0); \
- SRC_FMT_PLANAR##To##FMT_PLANAR(src_y + OFF, kWidth, \
- src_u + OFF, \
- SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
- src_v + OFF, \
- SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
- dst_y_c, kWidth, \
- dst_uv_c, SUBSAMPLE(kWidth * 2, SUBSAMP_X), \
- kWidth, NEG kHeight); \
- MaskCpuFlags(-1); \
- for (int i = 0; i < benchmark_iterations_; ++i) { \
- SRC_FMT_PLANAR##To##FMT_PLANAR(src_y + OFF, kWidth, \
- src_u + OFF, \
- SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
- src_v + OFF, \
- SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
- dst_y_opt, kWidth, \
- dst_uv_opt, \
- SUBSAMPLE(kWidth * 2, SUBSAMP_X), \
- kWidth, NEG kHeight); \
- } \
- int max_diff = 0; \
- for (int i = 0; i < kHeight; ++i) { \
- for (int j = 0; j < kWidth; ++j) { \
- int abs_diff = \
- abs(static_cast<int>(dst_y_c[i * kWidth + j]) - \
- static_cast<int>(dst_y_opt[i * kWidth + j])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
- } \
- } \
- EXPECT_LE(max_diff, 1); \
- for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
- for (int j = 0; j < SUBSAMPLE(kWidth * 2, SUBSAMP_X); ++j) { \
- int abs_diff = \
- abs(static_cast<int>(dst_uv_c[i * \
- SUBSAMPLE(kWidth * 2, SUBSAMP_X) + j]) - \
- static_cast<int>(dst_uv_opt[i * \
- SUBSAMPLE(kWidth * 2, SUBSAMP_X) + j])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
- } \
- } \
- EXPECT_LE(max_diff, 1); \
- free_aligned_buffer_64(dst_y_c) \
- free_aligned_buffer_64(dst_uv_c) \
- free_aligned_buffer_64(dst_y_opt) \
- free_aligned_buffer_64(dst_uv_opt) \
- free_aligned_buffer_64(src_y) \
- free_aligned_buffer_64(src_u) \
- free_aligned_buffer_64(src_v) \
-}
-
-#define TESTPLANARTOBP(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
- FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \
- TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
- FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- benchmark_width_ - 4, _Any, +, 0) \
- TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
- FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- benchmark_width_, _Unaligned, +, 1) \
- TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
- FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- benchmark_width_, _Invert, -, 0) \
- TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
- FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- benchmark_width_, _Opt, +, 0)
-
-TESTPLANARTOBP(I420, 2, 2, NV12, 2, 2)
-TESTPLANARTOBP(I420, 2, 2, NV21, 2, 2)
-
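NV12 and NV21 are the "biplanar" targets here: a full-resolution Y plane plus a single half-resolution plane of interleaved chroma (UV order for NV12, VU order for NV21). A small sketch of that interleaving step, which is the chroma-plane work behind I420ToNV12/I420ToNV21; the helper name is illustrative and strides are ignored for brevity:

#include <cstdint>

// Interleave separate U and V planes into one UV (NV12) or VU (NV21) plane.
static void MergeChromaPlanes(const uint8_t* src_u, const uint8_t* src_v,
                              uint8_t* dst_uv, int chroma_width,
                              int chroma_height, bool v_first /* NV21 */) {
  for (int y = 0; y < chroma_height; ++y) {
    for (int x = 0; x < chroma_width; ++x) {
      const int i = y * chroma_width + x;
      dst_uv[2 * i + 0] = v_first ? src_v[i] : src_u[i];
      dst_uv[2 * i + 1] = v_first ? src_u[i] : src_v[i];
    }
  }
}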
-#define TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
- FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG, OFF) \
-TEST_F(libyuvTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \
- const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
- const int kHeight = benchmark_height_; \
- align_buffer_64(src_y, kWidth * kHeight + OFF); \
- align_buffer_64(src_uv, 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + OFF); \
- align_buffer_64(dst_y_c, kWidth * kHeight); \
- align_buffer_64(dst_u_c, \
- SUBSAMPLE(kWidth, SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- align_buffer_64(dst_v_c, \
- SUBSAMPLE(kWidth, SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- align_buffer_64(dst_y_opt, kWidth * kHeight); \
- align_buffer_64(dst_u_opt, \
- SUBSAMPLE(kWidth, SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- align_buffer_64(dst_v_opt, \
- SUBSAMPLE(kWidth, SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- srandom(time(NULL)); \
- for (int i = 0; i < kHeight; ++i) \
- for (int j = 0; j < kWidth; ++j) \
- src_y[(i * kWidth) + j + OFF] = (random() & 0xff); \
- for (int i = 0; i < SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); ++i) { \
- for (int j = 0; j < 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X); ++j) { \
- src_uv[(i * 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) + j + OFF] = \
- (random() & 0xff); \
- } \
- } \
- MaskCpuFlags(0); \
- SRC_FMT_PLANAR##To##FMT_PLANAR(src_y + OFF, kWidth, \
- src_uv + OFF, \
- 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
- dst_y_c, kWidth, \
- dst_u_c, SUBSAMPLE(kWidth, SUBSAMP_X), \
- dst_v_c, SUBSAMPLE(kWidth, SUBSAMP_X), \
- kWidth, NEG kHeight); \
- MaskCpuFlags(-1); \
- for (int i = 0; i < benchmark_iterations_; ++i) { \
- SRC_FMT_PLANAR##To##FMT_PLANAR(src_y + OFF, kWidth, \
- src_uv + OFF, \
- 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
- dst_y_opt, kWidth, \
- dst_u_opt, SUBSAMPLE(kWidth, SUBSAMP_X), \
- dst_v_opt, SUBSAMPLE(kWidth, SUBSAMP_X), \
- kWidth, NEG kHeight); \
- } \
- int max_diff = 0; \
- for (int i = 0; i < kHeight; ++i) { \
- for (int j = 0; j < kWidth; ++j) { \
- int abs_diff = \
- abs(static_cast<int>(dst_y_c[i * kWidth + j]) - \
- static_cast<int>(dst_y_opt[i * kWidth + j])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
- } \
- } \
- EXPECT_LE(max_diff, 1); \
- for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
- for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \
- int abs_diff = \
- abs(static_cast<int>(dst_u_c[i * \
- SUBSAMPLE(kWidth, SUBSAMP_X) + j]) - \
- static_cast<int>(dst_u_opt[i * \
- SUBSAMPLE(kWidth, SUBSAMP_X) + j])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
- } \
- } \
- EXPECT_LE(max_diff, 1); \
- for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
- for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \
- int abs_diff = \
- abs(static_cast<int>(dst_v_c[i * \
- SUBSAMPLE(kWidth, SUBSAMP_X) + j]) - \
- static_cast<int>(dst_v_opt[i * \
- SUBSAMPLE(kWidth, SUBSAMP_X) + j])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
- } \
- } \
- EXPECT_LE(max_diff, 1); \
- free_aligned_buffer_64(dst_y_c) \
- free_aligned_buffer_64(dst_u_c) \
- free_aligned_buffer_64(dst_v_c) \
- free_aligned_buffer_64(dst_y_opt) \
- free_aligned_buffer_64(dst_u_opt) \
- free_aligned_buffer_64(dst_v_opt) \
- free_aligned_buffer_64(src_y) \
- free_aligned_buffer_64(src_uv) \
-}
-
-#define TESTBIPLANARTOP(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
- FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \
- TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
- FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- benchmark_width_ - 4, _Any, +, 0) \
- TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
- FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- benchmark_width_, _Unaligned, +, 1) \
- TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
- FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- benchmark_width_, _Invert, -, 0) \
- TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
- FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- benchmark_width_, _Opt, +, 0)
-
-TESTBIPLANARTOP(NV12, 2, 2, I420, 2, 2)
-TESTBIPLANARTOP(NV21, 2, 2, I420, 2, 2)
-
-#define TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- W1280, DIFF, N, NEG, OFF, FMT_C, BPP_C) \
-TEST_F(libyuvTest, FMT_PLANAR##To##FMT_B##N) { \
- const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
- const int kHeight = benchmark_height_; \
- const int kStrideB = ((kWidth * BPP_B + ALIGN - 1) / ALIGN) * ALIGN; \
- const int kSizeUV = \
- SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y); \
- align_buffer_64(src_y, kWidth * kHeight + OFF); \
- align_buffer_64(src_u, kSizeUV + OFF); \
- align_buffer_64(src_v, kSizeUV + OFF); \
- align_buffer_64(dst_argb_c, kStrideB * kHeight); \
- align_buffer_64(dst_argb_opt, kStrideB * kHeight); \
- memset(dst_argb_c, 0, kStrideB * kHeight); \
- memset(dst_argb_opt, 0, kStrideB * kHeight); \
- srandom(time(NULL)); \
- for (int i = 0; i < kWidth * kHeight; ++i) { \
- src_y[i + OFF] = (random() & 0xff); \
- } \
- for (int i = 0; i < kSizeUV; ++i) { \
- src_u[i + OFF] = (random() & 0xff); \
- src_v[i + OFF] = (random() & 0xff); \
- } \
- MaskCpuFlags(0); \
- FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, \
- src_u + OFF, SUBSAMPLE(kWidth, SUBSAMP_X), \
- src_v + OFF, SUBSAMPLE(kWidth, SUBSAMP_X), \
- dst_argb_c, kStrideB, \
- kWidth, NEG kHeight); \
- MaskCpuFlags(-1); \
- for (int i = 0; i < benchmark_iterations_; ++i) { \
- FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, \
- src_u + OFF, SUBSAMPLE(kWidth, SUBSAMP_X), \
- src_v + OFF, SUBSAMPLE(kWidth, SUBSAMP_X), \
- dst_argb_opt, kStrideB, \
- kWidth, NEG kHeight); \
- } \
- int max_diff = 0; \
- /* Convert to ARGB so 565 is expanded to bytes that can be compared. */ \
- align_buffer_64(dst_argb32_c, kWidth * BPP_C * kHeight); \
- align_buffer_64(dst_argb32_opt, kWidth * BPP_C * kHeight); \
- memset(dst_argb32_c, 0, kWidth * BPP_C * kHeight); \
- memset(dst_argb32_opt, 0, kWidth * BPP_C * kHeight); \
- FMT_B##To##FMT_C(dst_argb_c, kStrideB, \
- dst_argb32_c, kWidth * BPP_C , \
- kWidth, kHeight); \
- FMT_B##To##FMT_C(dst_argb_opt, kStrideB, \
- dst_argb32_opt, kWidth * BPP_C , \
- kWidth, kHeight); \
- for (int i = 0; i < kWidth * BPP_C * kHeight; ++i) { \
- int abs_diff = \
- abs(static_cast<int>(dst_argb32_c[i]) - \
- static_cast<int>(dst_argb32_opt[i])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
- } \
- EXPECT_LE(max_diff, DIFF); \
- free_aligned_buffer_64(src_y) \
- free_aligned_buffer_64(src_u) \
- free_aligned_buffer_64(src_v) \
- free_aligned_buffer_64(dst_argb_c) \
- free_aligned_buffer_64(dst_argb_opt) \
- free_aligned_buffer_64(dst_argb32_c) \
- free_aligned_buffer_64(dst_argb32_opt) \
-}
-
-#define TESTPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- DIFF, FMT_C, BPP_C) \
- TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- benchmark_width_ - 4, DIFF, _Any, +, 0, FMT_C, BPP_C) \
- TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- benchmark_width_, DIFF, _Unaligned, +, 1, FMT_C, BPP_C) \
- TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- benchmark_width_, DIFF, _Invert, -, 0, FMT_C, BPP_C) \
- TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- benchmark_width_, DIFF, _Opt, +, 0, FMT_C, BPP_C)
-
-TESTPLANARTOB(I420, 2, 2, ARGB, 4, 4, 2, ARGB, 4)
-TESTPLANARTOB(I420, 2, 2, BGRA, 4, 4, 2, ARGB, 4)
-TESTPLANARTOB(I420, 2, 2, ABGR, 4, 4, 2, ARGB, 4)
-TESTPLANARTOB(I420, 2, 2, RGBA, 4, 4, 2, ARGB, 4)
-TESTPLANARTOB(I420, 2, 2, RAW, 3, 3, 2, ARGB, 4)
-TESTPLANARTOB(I420, 2, 2, RGB24, 3, 3, 2, ARGB, 4)
-TESTPLANARTOB(I420, 2, 2, RGB565, 2, 2, 9, ARGB, 4)
-TESTPLANARTOB(I420, 2, 2, ARGB1555, 2, 2, 9, ARGB, 4)
-TESTPLANARTOB(I420, 2, 2, ARGB4444, 2, 2, 17, ARGB, 4)
-TESTPLANARTOB(I422, 2, 1, ARGB, 4, 4, 2, ARGB, 4)
-TESTPLANARTOB(I422, 2, 1, BGRA, 4, 4, 2, ARGB, 4)
-TESTPLANARTOB(I422, 2, 1, ABGR, 4, 4, 2, ARGB, 4)
-TESTPLANARTOB(I422, 2, 1, RGBA, 4, 4, 2, ARGB, 4)
-TESTPLANARTOB(I411, 4, 1, ARGB, 4, 4, 2, ARGB, 4)
-TESTPLANARTOB(I444, 1, 1, ARGB, 4, 4, 2, ARGB, 4)
-TESTPLANARTOB(I420, 2, 2, YUY2, 2, 4, 1, ARGB, 4)
-TESTPLANARTOB(I420, 2, 2, UYVY, 2, 4, 1, ARGB, 4)
-TESTPLANARTOB(I422, 2, 1, YUY2, 2, 4, 0, ARGB, 4)
-TESTPLANARTOB(I422, 2, 1, UYVY, 2, 4, 0, ARGB, 4)
-TESTPLANARTOB(I420, 2, 2, I400, 1, 1, 0, ARGB, 4)
-TESTPLANARTOB(I420, 2, 2, BayerBGGR, 1, 1, 2, ARGB, 4)
-TESTPLANARTOB(I420, 2, 2, BayerRGGB, 1, 1, 2, ARGB, 4)
-TESTPLANARTOB(I420, 2, 2, BayerGBRG, 1, 1, 2, ARGB, 4)
-TESTPLANARTOB(I420, 2, 2, BayerGRBG, 1, 1, 2, ARGB, 4)
-
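Note the DIFF argument in the list above: the 16-bit targets (RGB565, ARGB1555, ARGB4444) are compared only after converting both results back to ARGB, and with a wider tolerance, because packing 8-bit channels into 5/6/4 bits and expanding them again is lossy. A minimal sketch of that round trip for RGB565; bit-replication on expansion is an assumption about the expected behaviour, not code copied from libyuv:

#include <cstdint>

// Pack 8-bit RGB into RGB565, then expand back by bit replication.
static void Rgb565RoundTrip(uint8_t r, uint8_t g, uint8_t b,
                            uint8_t* r_out, uint8_t* g_out, uint8_t* b_out) {
  const uint16_t packed = static_cast<uint16_t>(
      ((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3));
  const uint8_t r5 = (packed >> 11) & 0x1f;
  const uint8_t g6 = (packed >> 5) & 0x3f;
  const uint8_t b5 = packed & 0x1f;
  *r_out = (r5 << 3) | (r5 >> 2);  // may differ from r by up to 7
  *g_out = (g6 << 2) | (g6 >> 4);  // may differ from g by up to 3
  *b_out = (b5 << 3) | (b5 >> 2);  // may differ from b by up to 7
}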
-#define TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \
- W1280, DIFF, N, NEG, OFF) \
-TEST_F(libyuvTest, FMT_PLANAR##To##FMT_B##N) { \
- const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
- const int kHeight = benchmark_height_; \
- const int kStrideB = kWidth * BPP_B; \
- align_buffer_64(src_y, kWidth * kHeight + OFF); \
- align_buffer_64(src_uv, \
- SUBSAMPLE(kWidth, SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SUBSAMP_Y) * 2 + OFF); \
- align_buffer_64(dst_argb_c, kStrideB * kHeight); \
- align_buffer_64(dst_argb_opt, kStrideB * kHeight); \
- srandom(time(NULL)); \
- for (int i = 0; i < kHeight; ++i) \
- for (int j = 0; j < kWidth; ++j) \
- src_y[(i * kWidth) + j + OFF] = (random() & 0xff); \
- for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) \
- for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X) * 2; ++j) { \
- src_uv[(i * SUBSAMPLE(kWidth, SUBSAMP_X)) * 2 + j + OFF] = \
- (random() & 0xff); \
- } \
- MaskCpuFlags(0); \
- FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, \
- src_uv + OFF, SUBSAMPLE(kWidth, SUBSAMP_X) * 2, \
- dst_argb_c, kWidth * BPP_B, \
- kWidth, NEG kHeight); \
- MaskCpuFlags(-1); \
- for (int i = 0; i < benchmark_iterations_; ++i) { \
- FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, \
- src_uv + OFF, SUBSAMPLE(kWidth, SUBSAMP_X) * 2, \
- dst_argb_opt, kWidth * BPP_B, \
- kWidth, NEG kHeight); \
- } \
- /* Convert to ARGB so 565 is expanded to bytes that can be compared. */ \
- align_buffer_64(dst_argb32_c, kWidth * 4 * kHeight); \
- align_buffer_64(dst_argb32_opt, kWidth * 4 * kHeight); \
- memset(dst_argb32_c, 1, kWidth * 4 * kHeight); \
- memset(dst_argb32_opt, 2, kWidth * 4 * kHeight); \
- FMT_B##ToARGB(dst_argb_c, kStrideB, \
- dst_argb32_c, kWidth * 4, \
- kWidth, kHeight); \
- FMT_B##ToARGB(dst_argb_opt, kStrideB, \
- dst_argb32_opt, kWidth * 4, \
- kWidth, kHeight); \
- int max_diff = 0; \
- for (int i = 0; i < kHeight; ++i) { \
- for (int j = 0; j < kWidth * 4; ++j) { \
- int abs_diff = \
- abs(static_cast<int>(dst_argb32_c[i * kWidth * 4 + j]) - \
- static_cast<int>(dst_argb32_opt[i * kWidth * 4 + j])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
- } \
- } \
- EXPECT_LE(max_diff, DIFF); \
- free_aligned_buffer_64(src_y) \
- free_aligned_buffer_64(src_uv) \
- free_aligned_buffer_64(dst_argb_c) \
- free_aligned_buffer_64(dst_argb_opt) \
- free_aligned_buffer_64(dst_argb32_c) \
- free_aligned_buffer_64(dst_argb32_opt) \
-}
-
-#define TESTBIPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, DIFF) \
- TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \
- benchmark_width_ - 4, DIFF, _Any, +, 0) \
- TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \
- benchmark_width_, DIFF, _Unaligned, +, 1) \
- TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \
- benchmark_width_, DIFF, _Invert, -, 0) \
- TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \
- benchmark_width_, DIFF, _Opt, +, 0)
-
-TESTBIPLANARTOB(NV12, 2, 2, ARGB, 4, 2)
-TESTBIPLANARTOB(NV21, 2, 2, ARGB, 4, 2)
-TESTBIPLANARTOB(NV12, 2, 2, RGB565, 2, 9)
-TESTBIPLANARTOB(NV21, 2, 2, RGB565, 2, 9)
-
-#define TESTATOPLANARI(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- W1280, DIFF, N, NEG, OFF) \
-TEST_F(libyuvTest, FMT_A##To##FMT_PLANAR##N) { \
- const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
- const int kHeight = benchmark_height_; \
- const int kStride = (kWidth * 8 * BPP_A + 7) / 8; \
- align_buffer_64(src_argb, kStride * kHeight + OFF); \
- align_buffer_64(dst_y_c, kWidth * kHeight); \
- align_buffer_64(dst_u_c, \
- SUBSAMPLE(kWidth, SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- align_buffer_64(dst_v_c, \
- SUBSAMPLE(kWidth, SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- align_buffer_64(dst_y_opt, kWidth * kHeight); \
- align_buffer_64(dst_u_opt, \
- SUBSAMPLE(kWidth, SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- align_buffer_64(dst_v_opt, \
- SUBSAMPLE(kWidth, SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- memset(dst_y_c, 1, kWidth * kHeight); \
- memset(dst_u_c, 0, \
- SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- memset(dst_v_c, 0, \
- SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- memset(dst_y_opt, 2, kWidth * kHeight); \
- memset(dst_u_opt, 0, \
- SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- memset(dst_v_opt, 0, \
- SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- srandom(time(NULL)); \
- for (int i = 0; i < kHeight; ++i) \
- for (int j = 0; j < kStride; ++j) \
- src_argb[(i * kStride) + j + OFF] = (random() & 0xff); \
- MaskCpuFlags(0); \
- FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, \
- dst_y_c, kWidth, \
- dst_u_c, SUBSAMPLE(kWidth, SUBSAMP_X), \
- dst_v_c, SUBSAMPLE(kWidth, SUBSAMP_X), \
- kWidth, NEG kHeight); \
- MaskCpuFlags(-1); \
- for (int i = 0; i < benchmark_iterations_; ++i) { \
- FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, \
- dst_y_opt, kWidth, \
- dst_u_opt, SUBSAMPLE(kWidth, SUBSAMP_X), \
- dst_v_opt, SUBSAMPLE(kWidth, SUBSAMP_X), \
- kWidth, NEG kHeight); \
- } \
- int max_diff = 0; \
- for (int i = 0; i < kHeight; ++i) { \
- for (int j = 0; j < kWidth; ++j) { \
- int abs_diff = \
- abs(static_cast<int>(dst_y_c[i * kWidth + j]) - \
- static_cast<int>(dst_y_opt[i * kWidth + j])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
- } \
- } \
- EXPECT_LE(max_diff, DIFF); \
- for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
- for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \
- int abs_diff = \
- abs(static_cast<int>(dst_u_c[i * \
- SUBSAMPLE(kWidth, SUBSAMP_X) + j]) - \
- static_cast<int>(dst_u_opt[i * \
- SUBSAMPLE(kWidth, SUBSAMP_X) + j])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
- } \
- } \
- EXPECT_LE(max_diff, DIFF); \
- for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
- for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \
- int abs_diff = \
- abs(static_cast<int>(dst_v_c[i * \
- SUBSAMPLE(kWidth, SUBSAMP_X) + j]) - \
- static_cast<int>(dst_v_opt[i * \
- SUBSAMPLE(kWidth, SUBSAMP_X) + j])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
- } \
- } \
- EXPECT_LE(max_diff, DIFF); \
- free_aligned_buffer_64(dst_y_c) \
- free_aligned_buffer_64(dst_u_c) \
- free_aligned_buffer_64(dst_v_c) \
- free_aligned_buffer_64(dst_y_opt) \
- free_aligned_buffer_64(dst_u_opt) \
- free_aligned_buffer_64(dst_v_opt) \
- free_aligned_buffer_64(src_argb) \
-}
-
-#define TESTATOPLANAR(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, DIFF) \
- TESTATOPLANARI(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- benchmark_width_ - 4, DIFF, _Any, +, 0) \
- TESTATOPLANARI(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- benchmark_width_, DIFF, _Unaligned, +, 1) \
- TESTATOPLANARI(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- benchmark_width_, DIFF, _Invert, -, 0) \
- TESTATOPLANARI(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- benchmark_width_, DIFF, _Opt, +, 0)
-
-TESTATOPLANAR(ARGB, 4, I420, 2, 2, 4)
-#ifdef __arm__
-TESTATOPLANAR(ARGB, 4, J420, 2, 2, 4)
-#else
-TESTATOPLANAR(ARGB, 4, J420, 2, 2, 0)
-#endif
-TESTATOPLANAR(BGRA, 4, I420, 2, 2, 4)
-TESTATOPLANAR(ABGR, 4, I420, 2, 2, 4)
-TESTATOPLANAR(RGBA, 4, I420, 2, 2, 4)
-TESTATOPLANAR(RAW, 3, I420, 2, 2, 4)
-TESTATOPLANAR(RGB24, 3, I420, 2, 2, 4)
-TESTATOPLANAR(RGB565, 2, I420, 2, 2, 5)
-// TODO(fbarchard): Make 1555 neon work same as C code, reduce to diff 9.
-TESTATOPLANAR(ARGB1555, 2, I420, 2, 2, 15)
-TESTATOPLANAR(ARGB4444, 2, I420, 2, 2, 17)
-TESTATOPLANAR(ARGB, 4, I411, 4, 1, 4)
-TESTATOPLANAR(ARGB, 4, I422, 2, 1, 2)
-TESTATOPLANAR(ARGB, 4, I444, 1, 1, 2)
-TESTATOPLANAR(YUY2, 2, I420, 2, 2, 2)
-TESTATOPLANAR(UYVY, 2, I420, 2, 2, 2)
-TESTATOPLANAR(YUY2, 2, I422, 2, 1, 2)
-TESTATOPLANAR(UYVY, 2, I422, 2, 1, 2)
-TESTATOPLANAR(I400, 1, I420, 2, 2, 2)
-TESTATOPLANAR(BayerBGGR, 1, I420, 2, 2, 4)
-TESTATOPLANAR(BayerRGGB, 1, I420, 2, 2, 4)
-TESTATOPLANAR(BayerGBRG, 1, I420, 2, 2, 4)
-TESTATOPLANAR(BayerGRBG, 1, I420, 2, 2, 4)
-
-#define TESTATOBIPLANARI(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- W1280, N, NEG, OFF) \
-TEST_F(libyuvTest, FMT_A##To##FMT_PLANAR##N) { \
- const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
- const int kHeight = benchmark_height_; \
- const int kStride = (kWidth * 8 * BPP_A + 7) / 8; \
- align_buffer_64(src_argb, kStride * kHeight + OFF); \
- align_buffer_64(dst_y_c, kWidth * kHeight); \
- align_buffer_64(dst_uv_c, \
- SUBSAMPLE(kWidth, SUBSAMP_X) * 2 * \
- SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- align_buffer_64(dst_y_opt, kWidth * kHeight); \
- align_buffer_64(dst_uv_opt, \
- SUBSAMPLE(kWidth, SUBSAMP_X) * 2 * \
- SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- srandom(time(NULL)); \
- for (int i = 0; i < kHeight; ++i) \
- for (int j = 0; j < kStride; ++j) \
- src_argb[(i * kStride) + j + OFF] = (random() & 0xff); \
- MaskCpuFlags(0); \
- FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, \
- dst_y_c, kWidth, \
- dst_uv_c, SUBSAMPLE(kWidth, SUBSAMP_X) * 2, \
- kWidth, NEG kHeight); \
- MaskCpuFlags(-1); \
- for (int i = 0; i < benchmark_iterations_; ++i) { \
- FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, \
- dst_y_opt, kWidth, \
- dst_uv_opt, SUBSAMPLE(kWidth, SUBSAMP_X) * 2, \
- kWidth, NEG kHeight); \
- } \
- int max_diff = 0; \
- for (int i = 0; i < kHeight; ++i) { \
- for (int j = 0; j < kWidth; ++j) { \
- int abs_diff = \
- abs(static_cast<int>(dst_y_c[i * kWidth + j]) - \
- static_cast<int>(dst_y_opt[i * kWidth + j])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
- } \
- } \
- EXPECT_LE(max_diff, 4); \
- for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
- for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X) * 2; ++j) { \
- int abs_diff = \
- abs(static_cast<int>(dst_uv_c[i * \
- SUBSAMPLE(kWidth, SUBSAMP_X) * 2 + j]) - \
- static_cast<int>(dst_uv_opt[i * \
- SUBSAMPLE(kWidth, SUBSAMP_X) * 2 + j])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
- } \
- } \
- EXPECT_LE(max_diff, 4); \
- free_aligned_buffer_64(dst_y_c) \
- free_aligned_buffer_64(dst_uv_c) \
- free_aligned_buffer_64(dst_y_opt) \
- free_aligned_buffer_64(dst_uv_opt) \
- free_aligned_buffer_64(src_argb) \
-}
-
-#define TESTATOBIPLANAR(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \
- TESTATOBIPLANARI(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- benchmark_width_ - 4, _Any, +, 0) \
- TESTATOBIPLANARI(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- benchmark_width_, _Unaligned, +, 1) \
- TESTATOBIPLANARI(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- benchmark_width_, _Invert, -, 0) \
- TESTATOBIPLANARI(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- benchmark_width_, _Opt, +, 0)
-
-TESTATOBIPLANAR(ARGB, 4, NV12, 2, 2)
-TESTATOBIPLANAR(ARGB, 4, NV21, 2, 2)
-
-#define TESTATOBI(FMT_A, BPP_A, STRIDE_A, \
- FMT_B, BPP_B, STRIDE_B, \
- W1280, DIFF, N, NEG, OFF) \
-TEST_F(libyuvTest, FMT_A##To##FMT_B##N) { \
- const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
- const int kHeight = benchmark_height_; \
- const int kStrideA = (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A; \
- const int kStrideB = (kWidth * BPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B; \
- align_buffer_64(src_argb, kStrideA * kHeight + OFF); \
- align_buffer_64(dst_argb_c, kStrideB * kHeight); \
- align_buffer_64(dst_argb_opt, kStrideB * kHeight); \
- memset(dst_argb_c, 0, kStrideB * kHeight); \
- memset(dst_argb_opt, 0, kStrideB * kHeight); \
- srandom(time(NULL)); \
- for (int i = 0; i < kStrideA * kHeight; ++i) { \
- src_argb[i + OFF] = (random() & 0xff); \
- } \
- MaskCpuFlags(0); \
- FMT_A##To##FMT_B(src_argb + OFF, kStrideA, \
- dst_argb_c, kStrideB, \
- kWidth, NEG kHeight); \
- MaskCpuFlags(-1); \
- for (int i = 0; i < benchmark_iterations_; ++i) { \
- FMT_A##To##FMT_B(src_argb + OFF, kStrideA, \
- dst_argb_opt, kStrideB, \
- kWidth, NEG kHeight); \
- } \
- int max_diff = 0; \
- for (int i = 0; i < kStrideB * kHeight; ++i) { \
- int abs_diff = \
- abs(static_cast<int>(dst_argb_c[i]) - \
- static_cast<int>(dst_argb_opt[i])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
- } \
- EXPECT_LE(max_diff, DIFF); \
- free_aligned_buffer_64(src_argb) \
- free_aligned_buffer_64(dst_argb_c) \
- free_aligned_buffer_64(dst_argb_opt) \
-}
-
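kStrideA and kStrideB in the macro above round the row size up to a multiple of STRIDE_A/STRIDE_B, so formats such as RGB24 (3 bytes per pixel) and the 1-byte Bayer formats (2-byte stride alignment) get legal strides for any test width. The same round-up as a standalone helper with a few worked values (the helper name is hypothetical):

#include <assert.h>

// Round width * bpp up to the next multiple of align, matching
// kStrideA = (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A.
static int RoundedStride(int width, int bpp, int align) {
  return (width * bpp + align - 1) / align * align;
}

int main() {
  assert(RoundedStride(1276, 4, 4) == 5104);  // ARGB: already a multiple of 4
  assert(RoundedStride(1277, 3, 3) == 3831);  // RGB24: 1277 * 3 exactly
  assert(RoundedStride(1277, 1, 2) == 1278);  // Bayer: 1 bpp, stride padded to even
  return 0;
}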
-#define TESTATOBRANDOM(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, \
- FMT_B, BPP_B, STRIDE_B, HEIGHT_B, DIFF) \
-TEST_F(libyuvTest, FMT_A##To##FMT_B##_Random) { \
- srandom(time(NULL)); \
- for (int times = 0; times < benchmark_iterations_; ++times) { \
- const int kWidth = (random() & 63) + 1; \
- const int kHeight = (random() & 31) + 1; \
- const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A; \
- const int kHeightB = (kHeight + HEIGHT_B - 1) / HEIGHT_B * HEIGHT_B; \
- const int kStrideA = (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A;\
- const int kStrideB = (kWidth * BPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B;\
- align_buffer_page_end(src_argb, kStrideA * kHeightA); \
- align_buffer_page_end(dst_argb_c, kStrideB * kHeightB); \
- align_buffer_page_end(dst_argb_opt, kStrideB * kHeightB); \
- memset(dst_argb_c, 0, kStrideB * kHeightB); \
- memset(dst_argb_opt, 0, kStrideB * kHeightB); \
- for (int i = 0; i < kStrideA * kHeightA; ++i) { \
- src_argb[i] = (random() & 0xff); \
- } \
- MaskCpuFlags(0); \
- FMT_A##To##FMT_B(src_argb, kStrideA, \
- dst_argb_c, kStrideB, \
- kWidth, kHeight); \
- MaskCpuFlags(-1); \
- FMT_A##To##FMT_B(src_argb, kStrideA, \
- dst_argb_opt, kStrideB, \
- kWidth, kHeight); \
- int max_diff = 0; \
- for (int i = 0; i < kStrideB * kHeightB; ++i) { \
- int abs_diff = \
- abs(static_cast<int>(dst_argb_c[i]) - \
- static_cast<int>(dst_argb_opt[i])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
- } \
- EXPECT_LE(max_diff, DIFF); \
- free_aligned_buffer_page_end(src_argb) \
- free_aligned_buffer_page_end(dst_argb_c) \
- free_aligned_buffer_page_end(dst_argb_opt) \
- } \
-}
-
-#define TESTATOB(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, \
- FMT_B, BPP_B, STRIDE_B, HEIGHT_B, DIFF) \
- TESTATOBI(FMT_A, BPP_A, STRIDE_A, \
- FMT_B, BPP_B, STRIDE_B, \
- benchmark_width_ - 4, DIFF, _Any, +, 0) \
- TESTATOBI(FMT_A, BPP_A, STRIDE_A, \
- FMT_B, BPP_B, STRIDE_B, \
- benchmark_width_, DIFF, _Unaligned, +, 1) \
- TESTATOBI(FMT_A, BPP_A, STRIDE_A, \
- FMT_B, BPP_B, STRIDE_B, \
- benchmark_width_, DIFF, _Invert, -, 0) \
- TESTATOBI(FMT_A, BPP_A, STRIDE_A, \
- FMT_B, BPP_B, STRIDE_B, \
- benchmark_width_, DIFF, _Opt, +, 0) \
- TESTATOBRANDOM(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, \
- FMT_B, BPP_B, STRIDE_B, HEIGHT_B, DIFF)
-
-TESTATOB(ARGB, 4, 4, 1, ARGB, 4, 4, 1, 0)
-TESTATOB(ARGB, 4, 4, 1, BGRA, 4, 4, 1, 0)
-TESTATOB(ARGB, 4, 4, 1, ABGR, 4, 4, 1, 0)
-TESTATOB(ARGB, 4, 4, 1, RGBA, 4, 4, 1, 0)
-TESTATOB(ARGB, 4, 4, 1, RAW, 3, 3, 1, 0)
-TESTATOB(ARGB, 4, 4, 1, RGB24, 3, 3, 1, 0)
-TESTATOB(ARGB, 4, 4, 1, RGB565, 2, 2, 1, 0)
-TESTATOB(ARGB, 4, 4, 1, ARGB1555, 2, 2, 1, 0)
-TESTATOB(ARGB, 4, 4, 1, ARGB4444, 2, 2, 1, 0)
-TESTATOB(ARGB, 4, 4, 1, BayerBGGR, 1, 2, 2, 0)
-TESTATOB(ARGB, 4, 4, 1, BayerRGGB, 1, 2, 2, 0)
-TESTATOB(ARGB, 4, 4, 1, BayerGBRG, 1, 2, 2, 0)
-TESTATOB(ARGB, 4, 4, 1, BayerGRBG, 1, 2, 2, 0)
-TESTATOB(ARGB, 4, 4, 1, YUY2, 2, 4, 1, 4)
-TESTATOB(ARGB, 4, 4, 1, UYVY, 2, 4, 1, 4)
-TESTATOB(ARGB, 4, 4, 1, I400, 1, 1, 1, 2)
-TESTATOB(ARGB, 4, 4, 1, J400, 1, 1, 1, 2)
-TESTATOB(BGRA, 4, 4, 1, ARGB, 4, 4, 1, 0)
-TESTATOB(ABGR, 4, 4, 1, ARGB, 4, 4, 1, 0)
-TESTATOB(RGBA, 4, 4, 1, ARGB, 4, 4, 1, 0)
-TESTATOB(RAW, 3, 3, 1, ARGB, 4, 4, 1, 0)
-TESTATOB(RGB24, 3, 3, 1, ARGB, 4, 4, 1, 0)
-TESTATOB(RGB565, 2, 2, 1, ARGB, 4, 4, 1, 0)
-TESTATOB(ARGB1555, 2, 2, 1, ARGB, 4, 4, 1, 0)
-TESTATOB(ARGB4444, 2, 2, 1, ARGB, 4, 4, 1, 0)
-TESTATOB(YUY2, 2, 4, 1, ARGB, 4, 4, 1, 4)
-TESTATOB(UYVY, 2, 4, 1, ARGB, 4, 4, 1, 4)
-TESTATOB(BayerBGGR, 1, 2, 2, ARGB, 4, 4, 1, 0)
-TESTATOB(BayerRGGB, 1, 2, 2, ARGB, 4, 4, 1, 0)
-TESTATOB(BayerGBRG, 1, 2, 2, ARGB, 4, 4, 1, 0)
-TESTATOB(BayerGRBG, 1, 2, 2, ARGB, 4, 4, 1, 0)
-TESTATOB(I400, 1, 1, 1, ARGB, 4, 4, 1, 0)
-TESTATOB(I400, 1, 1, 1, I400, 1, 1, 1, 0)
-TESTATOB(I400, 1, 1, 1, I400Mirror, 1, 1, 1, 0)
-TESTATOB(Y, 1, 1, 1, ARGB, 4, 4, 1, 0)
-TESTATOB(ARGB, 4, 4, 1, ARGBMirror, 4, 4, 1, 0)
-
-TEST_F(libyuvTest, Test565) {
- SIMD_ALIGNED(uint8 orig_pixels[256][4]);
- SIMD_ALIGNED(uint8 pixels565[256][2]);
-
- for (int i = 0; i < 256; ++i) {
- for (int j = 0; j < 4; ++j) {
- orig_pixels[i][j] = i;
- }
- }
- ARGBToRGB565(&orig_pixels[0][0], 0, &pixels565[0][0], 0, 256, 1);
- uint32 checksum = HashDjb2(&pixels565[0][0], sizeof(pixels565), 5381);
- EXPECT_EQ(610919429u, checksum);
-}
-
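Test565 above validates the packed output with HashDjb2 rather than a per-pixel comparison. HashDjb2 is the classic Bernstein hash (hash = hash * 33 + byte), conventionally seeded with 5381, which is why that constant appears in the call; a minimal standalone sketch of the hash family the test relies on (whether it matches libyuv's SIMD implementation byte-for-byte is an assumption):

#include <cstddef>
#include <cstdint>

// DJB2: hash = hash * 33 + byte, seeded with 5381 by convention.
static uint32_t HashDjb2Sketch(const uint8_t* data, size_t len, uint32_t seed) {
  uint32_t hash = seed;
  for (size_t i = 0; i < len; ++i) {
    hash = hash * 33u + data[i];  // equivalently ((hash << 5) + hash) + data[i]
  }
  return hash;
}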
-#ifdef HAVE_JPEG
-TEST_F(libyuvTest, ValidateJpeg) {
- const int kOff = 10;
- const int kMinJpeg = 64;
- const int kImageSize = benchmark_width_ * benchmark_height_ >= kMinJpeg ?
- benchmark_width_ * benchmark_height_ : kMinJpeg;
- const int kSize = kImageSize + kOff;
- align_buffer_64(orig_pixels, kSize);
-
- // No SOI or EOI. Expect fail.
- memset(orig_pixels, 0, kSize);
-
- // EOI, SOI. Expect pass.
- orig_pixels[0] = 0xff;
- orig_pixels[1] = 0xd8; // SOI.
- orig_pixels[kSize - kOff + 0] = 0xff;
- orig_pixels[kSize - kOff + 1] = 0xd9; // EOI.
- for (int times = 0; times < benchmark_iterations_; ++times) {
- EXPECT_TRUE(ValidateJpeg(orig_pixels, kSize));
- }
- free_aligned_buffer_page_end(orig_pixels);
-}
-
-TEST_F(libyuvTest, InvalidateJpeg) {
- const int kOff = 10;
- const int kMinJpeg = 64;
- const int kImageSize = benchmark_width_ * benchmark_height_ >= kMinJpeg ?
- benchmark_width_ * benchmark_height_ : kMinJpeg;
- const int kSize = kImageSize + kOff;
- align_buffer_64(orig_pixels, kSize);
-
- // No SOI or EOI. Expect fail.
- memset(orig_pixels, 0, kSize);
- EXPECT_FALSE(ValidateJpeg(orig_pixels, kSize));
-
- // SOI but no EOI. Expect fail.
- orig_pixels[0] = 0xff;
- orig_pixels[1] = 0xd8; // SOI.
- for (int times = 0; times < benchmark_iterations_; ++times) {
- EXPECT_FALSE(ValidateJpeg(orig_pixels, kSize));
- }
- // EOI but no SOI. Expect fail.
- orig_pixels[0] = 0;
- orig_pixels[1] = 0;
- orig_pixels[kSize - kOff + 0] = 0xff;
- orig_pixels[kSize - kOff + 1] = 0xd9; // EOI.
- EXPECT_FALSE(ValidateJpeg(orig_pixels, kSize));
-
- free_aligned_buffer_page_end(orig_pixels);
-}
-
-#endif
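The two JPEG tests above exercise ValidateJpeg, which, as the test setup itself encodes, expects the buffer to start with the SOI marker 0xFF 0xD8 and to contain the EOI marker 0xFF 0xD9 near the end. A minimal sketch of that marker check, assuming only what the tests demonstrate; libyuv's real ValidateJpeg does more (it scans a trailing window of its own size and reports errors):

#include <cstddef>
#include <cstdint>

// Rough SOI/EOI check in the spirit of the tests above: the buffer must
// begin with SOI (0xFFD8) and contain EOI (0xFFD9) somewhere in the tail.
static bool LooksLikeJpeg(const uint8_t* buf, size_t len) {
  if (len < 4 || buf[0] != 0xff || buf[1] != 0xd8) {
    return false;  // missing SOI
  }
  const size_t kTail = len < 32 ? len : 32;  // tail window size is a guess
  for (size_t i = len - kTail; i + 1 < len; ++i) {
    if (buf[i] == 0xff && buf[i + 1] == 0xd9) {
      return true;  // found EOI
    }
  }
  return false;
}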
-
-} // namespace libyuv
+/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <stdlib.h> +#include <time.h> + +#include "libyuv/compare.h" +#include "libyuv/convert.h" +#include "libyuv/convert_argb.h" +#include "libyuv/convert_from.h" +#include "libyuv/convert_from_argb.h" +#include "libyuv/cpu_id.h" +#include "libyuv/format_conversion.h" +#ifdef HAVE_JPEG +#include "libyuv/mjpeg_decoder.h" +#endif +#include "libyuv/planar_functions.h" +#include "libyuv/rotate.h" +#include "../unit_test/unit_test.h" + +#if defined(_MSC_VER) +#define SIMD_ALIGNED(var) __declspec(align(16)) var +#else // __GNUC__ +#define SIMD_ALIGNED(var) var __attribute__((aligned(16))) +#endif + +namespace libyuv { + +#define SUBSAMPLE(v, a) ((((v) + (a) - 1)) / (a)) + +#define TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ + FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG, OFF) \ +TEST_F(libyuvTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \ + const int kWidth = ((W1280) > 0) ? (W1280) : 1; \ + const int kHeight = benchmark_height_; \ + align_buffer_64(src_y, kWidth * kHeight + OFF); \ + align_buffer_64(src_u, \ + SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * \ + SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + OFF); \ + align_buffer_64(src_v, \ + SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * \ + SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + OFF); \ + align_buffer_64(dst_y_c, kWidth * kHeight); \ + align_buffer_64(dst_u_c, \ + SUBSAMPLE(kWidth, SUBSAMP_X) * \ + SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + align_buffer_64(dst_v_c, \ + SUBSAMPLE(kWidth, SUBSAMP_X) * \ + SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + align_buffer_64(dst_y_opt, kWidth * kHeight); \ + align_buffer_64(dst_u_opt, \ + SUBSAMPLE(kWidth, SUBSAMP_X) * \ + SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + align_buffer_64(dst_v_opt, \ + SUBSAMPLE(kWidth, SUBSAMP_X) * \ + SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + srandom(time(NULL)); \ + for (int i = 0; i < kHeight; ++i) \ + for (int j = 0; j < kWidth; ++j) \ + src_y[(i * kWidth) + j + OFF] = (random() & 0xff); \ + for (int i = 0; i < SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); ++i) { \ + for (int j = 0; j < SUBSAMPLE(kWidth, SRC_SUBSAMP_X); ++j) { \ + src_u[(i * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) + j + OFF] = \ + (random() & 0xff); \ + src_v[(i * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) + j + OFF] = \ + (random() & 0xff); \ + } \ + } \ + MaskCpuFlags(0); \ + SRC_FMT_PLANAR##To##FMT_PLANAR(src_y + OFF, kWidth, \ + src_u + OFF, \ + SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \ + src_v + OFF, \ + SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \ + dst_y_c, kWidth, \ + dst_u_c, SUBSAMPLE(kWidth, SUBSAMP_X), \ + dst_v_c, SUBSAMPLE(kWidth, SUBSAMP_X), \ + kWidth, NEG kHeight); \ + MaskCpuFlags(-1); \ + for (int i = 0; i < benchmark_iterations_; ++i) { \ + SRC_FMT_PLANAR##To##FMT_PLANAR(src_y + OFF, kWidth, \ + src_u + OFF, \ + SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \ + src_v + OFF, \ + SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \ + dst_y_opt, kWidth, \ + dst_u_opt, SUBSAMPLE(kWidth, SUBSAMP_X), \ + dst_v_opt, SUBSAMPLE(kWidth, SUBSAMP_X), \ + kWidth, NEG kHeight); \ + } \ + int max_diff = 0; \ + for (int i = 0; i < kHeight; ++i) { \ + for (int j = 0; j < kWidth; ++j) { \ + int abs_diff = \ + abs(static_cast<int>(dst_y_c[i * kWidth + j]) - \ + static_cast<int>(dst_y_opt[i * kWidth + j])); \ + 
if (abs_diff > max_diff) { \ + max_diff = abs_diff; \ + } \ + } \ + } \ + EXPECT_LE(max_diff, 0); \ + for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \ + for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \ + int abs_diff = \ + abs(static_cast<int>(dst_u_c[i * \ + SUBSAMPLE(kWidth, SUBSAMP_X) + j]) - \ + static_cast<int>(dst_u_opt[i * \ + SUBSAMPLE(kWidth, SUBSAMP_X) + j])); \ + if (abs_diff > max_diff) { \ + max_diff = abs_diff; \ + } \ + } \ + } \ + EXPECT_LE(max_diff, 3); \ + for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \ + for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \ + int abs_diff = \ + abs(static_cast<int>(dst_v_c[i * \ + SUBSAMPLE(kWidth, SUBSAMP_X) + j]) - \ + static_cast<int>(dst_v_opt[i * \ + SUBSAMPLE(kWidth, SUBSAMP_X) + j])); \ + if (abs_diff > max_diff) { \ + max_diff = abs_diff; \ + } \ + } \ + } \ + EXPECT_LE(max_diff, 3); \ + free_aligned_buffer_64(dst_y_c) \ + free_aligned_buffer_64(dst_u_c) \ + free_aligned_buffer_64(dst_v_c) \ + free_aligned_buffer_64(dst_y_opt) \ + free_aligned_buffer_64(dst_u_opt) \ + free_aligned_buffer_64(dst_v_opt) \ + free_aligned_buffer_64(src_y) \ + free_aligned_buffer_64(src_u) \ + free_aligned_buffer_64(src_v) \ +} + +#define TESTPLANARTOP(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ + FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \ + TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ + FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ + benchmark_width_ - 4, _Any, +, 0) \ + TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ + FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ + benchmark_width_, _Unaligned, +, 1) \ + TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ + FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ + benchmark_width_, _Invert, -, 0) \ + TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ + FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ + benchmark_width_, _Opt, +, 0) + +TESTPLANARTOP(I420, 2, 2, I420, 2, 2) +TESTPLANARTOP(I422, 2, 1, I420, 2, 2) +TESTPLANARTOP(I444, 1, 1, I420, 2, 2) +TESTPLANARTOP(I411, 4, 1, I420, 2, 2) +TESTPLANARTOP(I420, 2, 2, I422, 2, 1) +TESTPLANARTOP(I420, 2, 2, I444, 1, 1) +TESTPLANARTOP(I420, 2, 2, I411, 4, 1) +TESTPLANARTOP(I420, 2, 2, I420Mirror, 2, 2) +TESTPLANARTOP(I422, 2, 1, I422, 2, 1) +TESTPLANARTOP(I444, 1, 1, I444, 1, 1) + +#define TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ + FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG, OFF) \ +TEST_F(libyuvTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \ + const int kWidth = ((W1280) > 0) ? 
(W1280) : 1; \ + const int kHeight = benchmark_height_; \ + align_buffer_64(src_y, kWidth * kHeight + OFF); \ + align_buffer_64(src_u, \ + SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * \ + SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + OFF); \ + align_buffer_64(src_v, \ + SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * \ + SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + OFF); \ + align_buffer_64(dst_y_c, kWidth * kHeight); \ + align_buffer_64(dst_uv_c, SUBSAMPLE(kWidth * 2, SUBSAMP_X) * \ + SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + align_buffer_64(dst_y_opt, kWidth * kHeight); \ + align_buffer_64(dst_uv_opt, SUBSAMPLE(kWidth * 2, SUBSAMP_X) * \ + SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + srandom(time(NULL)); \ + for (int i = 0; i < kHeight; ++i) \ + for (int j = 0; j < kWidth; ++j) \ + src_y[(i * kWidth) + j + OFF] = (random() & 0xff); \ + for (int i = 0; i < SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); ++i) { \ + for (int j = 0; j < SUBSAMPLE(kWidth, SRC_SUBSAMP_X); ++j) { \ + src_u[(i * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) + j + OFF] = \ + (random() & 0xff); \ + src_v[(i * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) + j + OFF] = \ + (random() & 0xff); \ + } \ + } \ + MaskCpuFlags(0); \ + SRC_FMT_PLANAR##To##FMT_PLANAR(src_y + OFF, kWidth, \ + src_u + OFF, \ + SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \ + src_v + OFF, \ + SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \ + dst_y_c, kWidth, \ + dst_uv_c, SUBSAMPLE(kWidth * 2, SUBSAMP_X), \ + kWidth, NEG kHeight); \ + MaskCpuFlags(-1); \ + for (int i = 0; i < benchmark_iterations_; ++i) { \ + SRC_FMT_PLANAR##To##FMT_PLANAR(src_y + OFF, kWidth, \ + src_u + OFF, \ + SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \ + src_v + OFF, \ + SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \ + dst_y_opt, kWidth, \ + dst_uv_opt, \ + SUBSAMPLE(kWidth * 2, SUBSAMP_X), \ + kWidth, NEG kHeight); \ + } \ + int max_diff = 0; \ + for (int i = 0; i < kHeight; ++i) { \ + for (int j = 0; j < kWidth; ++j) { \ + int abs_diff = \ + abs(static_cast<int>(dst_y_c[i * kWidth + j]) - \ + static_cast<int>(dst_y_opt[i * kWidth + j])); \ + if (abs_diff > max_diff) { \ + max_diff = abs_diff; \ + } \ + } \ + } \ + EXPECT_LE(max_diff, 1); \ + for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \ + for (int j = 0; j < SUBSAMPLE(kWidth * 2, SUBSAMP_X); ++j) { \ + int abs_diff = \ + abs(static_cast<int>(dst_uv_c[i * \ + SUBSAMPLE(kWidth * 2, SUBSAMP_X) + j]) - \ + static_cast<int>(dst_uv_opt[i * \ + SUBSAMPLE(kWidth * 2, SUBSAMP_X) + j])); \ + if (abs_diff > max_diff) { \ + max_diff = abs_diff; \ + } \ + } \ + } \ + EXPECT_LE(max_diff, 1); \ + free_aligned_buffer_64(dst_y_c) \ + free_aligned_buffer_64(dst_uv_c) \ + free_aligned_buffer_64(dst_y_opt) \ + free_aligned_buffer_64(dst_uv_opt) \ + free_aligned_buffer_64(src_y) \ + free_aligned_buffer_64(src_u) \ + free_aligned_buffer_64(src_v) \ +} + +#define TESTPLANARTOBP(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ + FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \ + TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ + FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ + benchmark_width_ - 4, _Any, +, 0) \ + TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ + FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ + benchmark_width_, _Unaligned, +, 1) \ + TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ + FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ + benchmark_width_, _Invert, -, 0) \ + TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ + FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ + benchmark_width_, _Opt, +, 0) + +TESTPLANARTOBP(I420, 2, 2, NV12, 2, 2) +TESTPLANARTOBP(I420, 2, 2, NV21, 2, 2) + +#define TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, 
SRC_SUBSAMP_Y, \ + FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG, OFF) \ +TEST_F(libyuvTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \ + const int kWidth = ((W1280) > 0) ? (W1280) : 1; \ + const int kHeight = benchmark_height_; \ + align_buffer_64(src_y, kWidth * kHeight + OFF); \ + align_buffer_64(src_uv, 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * \ + SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + OFF); \ + align_buffer_64(dst_y_c, kWidth * kHeight); \ + align_buffer_64(dst_u_c, \ + SUBSAMPLE(kWidth, SUBSAMP_X) * \ + SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + align_buffer_64(dst_v_c, \ + SUBSAMPLE(kWidth, SUBSAMP_X) * \ + SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + align_buffer_64(dst_y_opt, kWidth * kHeight); \ + align_buffer_64(dst_u_opt, \ + SUBSAMPLE(kWidth, SUBSAMP_X) * \ + SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + align_buffer_64(dst_v_opt, \ + SUBSAMPLE(kWidth, SUBSAMP_X) * \ + SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + srandom(time(NULL)); \ + for (int i = 0; i < kHeight; ++i) \ + for (int j = 0; j < kWidth; ++j) \ + src_y[(i * kWidth) + j + OFF] = (random() & 0xff); \ + for (int i = 0; i < SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); ++i) { \ + for (int j = 0; j < 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X); ++j) { \ + src_uv[(i * 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) + j + OFF] = \ + (random() & 0xff); \ + } \ + } \ + MaskCpuFlags(0); \ + SRC_FMT_PLANAR##To##FMT_PLANAR(src_y + OFF, kWidth, \ + src_uv + OFF, \ + 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \ + dst_y_c, kWidth, \ + dst_u_c, SUBSAMPLE(kWidth, SUBSAMP_X), \ + dst_v_c, SUBSAMPLE(kWidth, SUBSAMP_X), \ + kWidth, NEG kHeight); \ + MaskCpuFlags(-1); \ + for (int i = 0; i < benchmark_iterations_; ++i) { \ + SRC_FMT_PLANAR##To##FMT_PLANAR(src_y + OFF, kWidth, \ + src_uv + OFF, \ + 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \ + dst_y_opt, kWidth, \ + dst_u_opt, SUBSAMPLE(kWidth, SUBSAMP_X), \ + dst_v_opt, SUBSAMPLE(kWidth, SUBSAMP_X), \ + kWidth, NEG kHeight); \ + } \ + int max_diff = 0; \ + for (int i = 0; i < kHeight; ++i) { \ + for (int j = 0; j < kWidth; ++j) { \ + int abs_diff = \ + abs(static_cast<int>(dst_y_c[i * kWidth + j]) - \ + static_cast<int>(dst_y_opt[i * kWidth + j])); \ + if (abs_diff > max_diff) { \ + max_diff = abs_diff; \ + } \ + } \ + } \ + EXPECT_LE(max_diff, 1); \ + for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \ + for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \ + int abs_diff = \ + abs(static_cast<int>(dst_u_c[i * \ + SUBSAMPLE(kWidth, SUBSAMP_X) + j]) - \ + static_cast<int>(dst_u_opt[i * \ + SUBSAMPLE(kWidth, SUBSAMP_X) + j])); \ + if (abs_diff > max_diff) { \ + max_diff = abs_diff; \ + } \ + } \ + } \ + EXPECT_LE(max_diff, 1); \ + for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \ + for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \ + int abs_diff = \ + abs(static_cast<int>(dst_v_c[i * \ + SUBSAMPLE(kWidth, SUBSAMP_X) + j]) - \ + static_cast<int>(dst_v_opt[i * \ + SUBSAMPLE(kWidth, SUBSAMP_X) + j])); \ + if (abs_diff > max_diff) { \ + max_diff = abs_diff; \ + } \ + } \ + } \ + EXPECT_LE(max_diff, 1); \ + free_aligned_buffer_64(dst_y_c) \ + free_aligned_buffer_64(dst_u_c) \ + free_aligned_buffer_64(dst_v_c) \ + free_aligned_buffer_64(dst_y_opt) \ + free_aligned_buffer_64(dst_u_opt) \ + free_aligned_buffer_64(dst_v_opt) \ + free_aligned_buffer_64(src_y) \ + free_aligned_buffer_64(src_uv) \ +} + +#define TESTBIPLANARTOP(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ + FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \ + TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ + FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ + benchmark_width_ - 4, 
_Any, +, 0) \ + TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ + FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ + benchmark_width_, _Unaligned, +, 1) \ + TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ + FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ + benchmark_width_, _Invert, -, 0) \ + TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ + FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ + benchmark_width_, _Opt, +, 0) + +TESTBIPLANARTOP(NV12, 2, 2, I420, 2, 2) +TESTBIPLANARTOP(NV21, 2, 2, I420, 2, 2) + +#define ALIGNINT(V, ALIGN) (((V) + (ALIGN) - 1) / (ALIGN) * (ALIGN)) + +#define TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ + YALIGN, W1280, DIFF, N, NEG, OFF, FMT_C, BPP_C) \ +TEST_F(libyuvTest, FMT_PLANAR##To##FMT_B##N) { \ + const int kWidth = ((W1280) > 0) ? (W1280) : 1; \ + const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \ + const int kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN); \ + const int kSizeUV = \ + SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y); \ + align_buffer_64(src_y, kWidth * kHeight + OFF); \ + align_buffer_64(src_u, kSizeUV + OFF); \ + align_buffer_64(src_v, kSizeUV + OFF); \ + align_buffer_64(dst_argb_c, kStrideB * kHeight); \ + align_buffer_64(dst_argb_opt, kStrideB * kHeight); \ + memset(dst_argb_c, 0, kStrideB * kHeight); \ + memset(dst_argb_opt, 0, kStrideB * kHeight); \ + srandom(time(NULL)); \ + for (int i = 0; i < kWidth * kHeight; ++i) { \ + src_y[i + OFF] = (random() & 0xff); \ + } \ + for (int i = 0; i < kSizeUV; ++i) { \ + src_u[i + OFF] = (random() & 0xff); \ + src_v[i + OFF] = (random() & 0xff); \ + } \ + MaskCpuFlags(0); \ + FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, \ + src_u + OFF, SUBSAMPLE(kWidth, SUBSAMP_X), \ + src_v + OFF, SUBSAMPLE(kWidth, SUBSAMP_X), \ + dst_argb_c, kStrideB, \ + kWidth, NEG kHeight); \ + MaskCpuFlags(-1); \ + for (int i = 0; i < benchmark_iterations_; ++i) { \ + FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, \ + src_u + OFF, SUBSAMPLE(kWidth, SUBSAMP_X), \ + src_v + OFF, SUBSAMPLE(kWidth, SUBSAMP_X), \ + dst_argb_opt, kStrideB, \ + kWidth, NEG kHeight); \ + } \ + int max_diff = 0; \ + /* Convert to ARGB so 565 is expanded to bytes that can be compared. 
*/ \ + align_buffer_64(dst_argb32_c, kWidth * BPP_C * kHeight); \ + align_buffer_64(dst_argb32_opt, kWidth * BPP_C * kHeight); \ + memset(dst_argb32_c, 0, kWidth * BPP_C * kHeight); \ + memset(dst_argb32_opt, 0, kWidth * BPP_C * kHeight); \ + FMT_B##To##FMT_C(dst_argb_c, kStrideB, \ + dst_argb32_c, kWidth * BPP_C , \ + kWidth, kHeight); \ + FMT_B##To##FMT_C(dst_argb_opt, kStrideB, \ + dst_argb32_opt, kWidth * BPP_C , \ + kWidth, kHeight); \ + for (int i = 0; i < kWidth * BPP_C * kHeight; ++i) { \ + int abs_diff = \ + abs(static_cast<int>(dst_argb32_c[i]) - \ + static_cast<int>(dst_argb32_opt[i])); \ + if (abs_diff > max_diff) { \ + max_diff = abs_diff; \ + } \ + } \ + EXPECT_LE(max_diff, DIFF); \ + free_aligned_buffer_64(src_y) \ + free_aligned_buffer_64(src_u) \ + free_aligned_buffer_64(src_v) \ + free_aligned_buffer_64(dst_argb_c) \ + free_aligned_buffer_64(dst_argb_opt) \ + free_aligned_buffer_64(dst_argb32_c) \ + free_aligned_buffer_64(dst_argb32_opt) \ +} + +#define TESTPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ + YALIGN, DIFF, FMT_C, BPP_C) \ + TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ + YALIGN, benchmark_width_ - 4, DIFF, _Any, +, 0, FMT_C, BPP_C) \ + TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ + YALIGN, benchmark_width_, DIFF, _Unaligned, +, 1, FMT_C, BPP_C) \ + TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ + YALIGN, benchmark_width_, DIFF, _Invert, -, 0, FMT_C, BPP_C) \ + TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ + YALIGN, benchmark_width_, DIFF, _Opt, +, 0, FMT_C, BPP_C) + +// TODO(fbarchard): Make vertical alignment unnecessary on bayer. +TESTPLANARTOB(I420, 2, 2, ARGB, 4, 4, 1, 2, ARGB, 4) +TESTPLANARTOB(I420, 2, 2, BGRA, 4, 4, 1, 2, ARGB, 4) +TESTPLANARTOB(I420, 2, 2, ABGR, 4, 4, 1, 2, ARGB, 4) +TESTPLANARTOB(I420, 2, 2, RGBA, 4, 4, 1, 2, ARGB, 4) +TESTPLANARTOB(I420, 2, 2, RAW, 3, 3, 1, 2, ARGB, 4) +TESTPLANARTOB(I420, 2, 2, RGB24, 3, 3, 1, 2, ARGB, 4) +TESTPLANARTOB(I420, 2, 2, RGB565, 2, 2, 1, 9, ARGB, 4) +TESTPLANARTOB(I420, 2, 2, ARGB1555, 2, 2, 1, 9, ARGB, 4) +TESTPLANARTOB(I420, 2, 2, ARGB4444, 2, 2, 1, 17, ARGB, 4) +TESTPLANARTOB(I422, 2, 1, ARGB, 4, 4, 1, 2, ARGB, 4) +TESTPLANARTOB(I422, 2, 1, BGRA, 4, 4, 1, 2, ARGB, 4) +TESTPLANARTOB(I422, 2, 1, ABGR, 4, 4, 1, 2, ARGB, 4) +TESTPLANARTOB(I422, 2, 1, RGBA, 4, 4, 1, 2, ARGB, 4) +TESTPLANARTOB(I411, 4, 1, ARGB, 4, 4, 1, 2, ARGB, 4) +TESTPLANARTOB(I444, 1, 1, ARGB, 4, 4, 1, 2, ARGB, 4) +TESTPLANARTOB(I420, 2, 2, YUY2, 2, 4, 1, 1, ARGB, 4) +TESTPLANARTOB(I420, 2, 2, UYVY, 2, 4, 1, 1, ARGB, 4) +TESTPLANARTOB(I422, 2, 1, YUY2, 2, 4, 1, 0, ARGB, 4) +TESTPLANARTOB(I422, 2, 1, UYVY, 2, 4, 1, 0, ARGB, 4) +TESTPLANARTOB(I420, 2, 2, I400, 1, 1, 1, 0, ARGB, 4) +TESTPLANARTOB(I420, 2, 2, BayerBGGR, 1, 2, 2, 2, ARGB, 4) +TESTPLANARTOB(I420, 2, 2, BayerRGGB, 1, 2, 2, 2, ARGB, 4) +TESTPLANARTOB(I420, 2, 2, BayerGBRG, 1, 2, 2, 2, ARGB, 4) +TESTPLANARTOB(I420, 2, 2, BayerGRBG, 1, 2, 2, 2, ARGB, 4) + +#define TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \ + W1280, DIFF, N, NEG, OFF) \ +TEST_F(libyuvTest, FMT_PLANAR##To##FMT_B##N) { \ + const int kWidth = ((W1280) > 0) ? 
(W1280) : 1; \ + const int kHeight = benchmark_height_; \ + const int kStrideB = kWidth * BPP_B; \ + align_buffer_64(src_y, kWidth * kHeight + OFF); \ + align_buffer_64(src_uv, \ + SUBSAMPLE(kWidth, SUBSAMP_X) * \ + SUBSAMPLE(kHeight, SUBSAMP_Y) * 2 + OFF); \ + align_buffer_64(dst_argb_c, kStrideB * kHeight); \ + align_buffer_64(dst_argb_opt, kStrideB * kHeight); \ + srandom(time(NULL)); \ + for (int i = 0; i < kHeight; ++i) \ + for (int j = 0; j < kWidth; ++j) \ + src_y[(i * kWidth) + j + OFF] = (random() & 0xff); \ + for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) \ + for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X) * 2; ++j) { \ + src_uv[(i * SUBSAMPLE(kWidth, SUBSAMP_X)) * 2 + j + OFF] = \ + (random() & 0xff); \ + } \ + MaskCpuFlags(0); \ + FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, \ + src_uv + OFF, SUBSAMPLE(kWidth, SUBSAMP_X) * 2, \ + dst_argb_c, kWidth * BPP_B, \ + kWidth, NEG kHeight); \ + MaskCpuFlags(-1); \ + for (int i = 0; i < benchmark_iterations_; ++i) { \ + FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, \ + src_uv + OFF, SUBSAMPLE(kWidth, SUBSAMP_X) * 2, \ + dst_argb_opt, kWidth * BPP_B, \ + kWidth, NEG kHeight); \ + } \ + /* Convert to ARGB so 565 is expanded to bytes that can be compared. */ \ + align_buffer_64(dst_argb32_c, kWidth * 4 * kHeight); \ + align_buffer_64(dst_argb32_opt, kWidth * 4 * kHeight); \ + memset(dst_argb32_c, 1, kWidth * 4 * kHeight); \ + memset(dst_argb32_opt, 2, kWidth * 4 * kHeight); \ + FMT_B##ToARGB(dst_argb_c, kStrideB, \ + dst_argb32_c, kWidth * 4, \ + kWidth, kHeight); \ + FMT_B##ToARGB(dst_argb_opt, kStrideB, \ + dst_argb32_opt, kWidth * 4, \ + kWidth, kHeight); \ + int max_diff = 0; \ + for (int i = 0; i < kHeight; ++i) { \ + for (int j = 0; j < kWidth * 4; ++j) { \ + int abs_diff = \ + abs(static_cast<int>(dst_argb32_c[i * kWidth * 4 + j]) - \ + static_cast<int>(dst_argb32_opt[i * kWidth * 4 + j])); \ + if (abs_diff > max_diff) { \ + max_diff = abs_diff; \ + } \ + } \ + } \ + EXPECT_LE(max_diff, DIFF); \ + free_aligned_buffer_64(src_y) \ + free_aligned_buffer_64(src_uv) \ + free_aligned_buffer_64(dst_argb_c) \ + free_aligned_buffer_64(dst_argb_opt) \ + free_aligned_buffer_64(dst_argb32_c) \ + free_aligned_buffer_64(dst_argb32_opt) \ +} + +#define TESTBIPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, DIFF) \ + TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \ + benchmark_width_ - 4, DIFF, _Any, +, 0) \ + TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \ + benchmark_width_, DIFF, _Unaligned, +, 1) \ + TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \ + benchmark_width_, DIFF, _Invert, -, 0) \ + TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \ + benchmark_width_, DIFF, _Opt, +, 0) + +TESTBIPLANARTOB(NV12, 2, 2, ARGB, 4, 2) +TESTBIPLANARTOB(NV21, 2, 2, ARGB, 4, 2) +TESTBIPLANARTOB(NV12, 2, 2, RGB565, 2, 9) +TESTBIPLANARTOB(NV21, 2, 2, RGB565, 2, 9) + +#define TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ + W1280, DIFF, N, NEG, OFF) \ +TEST_F(libyuvTest, FMT_A##To##FMT_PLANAR##N) { \ + const int kWidth = ((W1280) > 0) ? 
(W1280) : 1; \ + const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \ + const int kStride = \ + (SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMP_X * 8 * BPP_A + 7) / 8; \ + align_buffer_64(src_argb, kStride * kHeight + OFF); \ + align_buffer_64(dst_y_c, kWidth * kHeight); \ + align_buffer_64(dst_u_c, \ + SUBSAMPLE(kWidth, SUBSAMP_X) * \ + SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + align_buffer_64(dst_v_c, \ + SUBSAMPLE(kWidth, SUBSAMP_X) * \ + SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + align_buffer_64(dst_y_opt, kWidth * kHeight); \ + align_buffer_64(dst_u_opt, \ + SUBSAMPLE(kWidth, SUBSAMP_X) * \ + SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + align_buffer_64(dst_v_opt, \ + SUBSAMPLE(kWidth, SUBSAMP_X) * \ + SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + memset(dst_y_c, 1, kWidth * kHeight); \ + memset(dst_u_c, 0, \ + SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + memset(dst_v_c, 0, \ + SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + memset(dst_y_opt, 2, kWidth * kHeight); \ + memset(dst_u_opt, 0, \ + SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + memset(dst_v_opt, 0, \ + SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + srandom(time(NULL)); \ + for (int i = 0; i < kHeight; ++i) \ + for (int j = 0; j < kStride; ++j) \ + src_argb[(i * kStride) + j + OFF] = (random() & 0xff); \ + MaskCpuFlags(0); \ + FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, \ + dst_y_c, kWidth, \ + dst_u_c, SUBSAMPLE(kWidth, SUBSAMP_X), \ + dst_v_c, SUBSAMPLE(kWidth, SUBSAMP_X), \ + kWidth, NEG kHeight); \ + MaskCpuFlags(-1); \ + for (int i = 0; i < benchmark_iterations_; ++i) { \ + FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, \ + dst_y_opt, kWidth, \ + dst_u_opt, SUBSAMPLE(kWidth, SUBSAMP_X), \ + dst_v_opt, SUBSAMPLE(kWidth, SUBSAMP_X), \ + kWidth, NEG kHeight); \ + } \ + int max_diff = 0; \ + for (int i = 0; i < kHeight; ++i) { \ + for (int j = 0; j < kWidth; ++j) { \ + int abs_diff = \ + abs(static_cast<int>(dst_y_c[i * kWidth + j]) - \ + static_cast<int>(dst_y_opt[i * kWidth + j])); \ + if (abs_diff > max_diff) { \ + max_diff = abs_diff; \ + } \ + } \ + } \ + EXPECT_LE(max_diff, DIFF); \ + for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \ + for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \ + int abs_diff = \ + abs(static_cast<int>(dst_u_c[i * \ + SUBSAMPLE(kWidth, SUBSAMP_X) + j]) - \ + static_cast<int>(dst_u_opt[i * \ + SUBSAMPLE(kWidth, SUBSAMP_X) + j])); \ + if (abs_diff > max_diff) { \ + max_diff = abs_diff; \ + } \ + } \ + } \ + EXPECT_LE(max_diff, DIFF); \ + for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \ + for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \ + int abs_diff = \ + abs(static_cast<int>(dst_v_c[i * \ + SUBSAMPLE(kWidth, SUBSAMP_X) + j]) - \ + static_cast<int>(dst_v_opt[i * \ + SUBSAMPLE(kWidth, SUBSAMP_X) + j])); \ + if (abs_diff > max_diff) { \ + max_diff = abs_diff; \ + } \ + } \ + } \ + EXPECT_LE(max_diff, DIFF); \ + free_aligned_buffer_64(dst_y_c) \ + free_aligned_buffer_64(dst_u_c) \ + free_aligned_buffer_64(dst_v_c) \ + free_aligned_buffer_64(dst_y_opt) \ + free_aligned_buffer_64(dst_u_opt) \ + free_aligned_buffer_64(dst_v_opt) \ + free_aligned_buffer_64(src_argb) \ +} + +#define TESTATOPLANAR(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ + DIFF) \ + TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ + benchmark_width_ - 4, DIFF, _Any, +, 0) \ + TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ + benchmark_width_, DIFF, _Unaligned, +, 1) \ + 
TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ + benchmark_width_, DIFF, _Invert, -, 0) \ + TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ + benchmark_width_, DIFF, _Opt, +, 0) + +TESTATOPLANAR(ARGB, 4, 1, I420, 2, 2, 4) +#ifdef __arm__ +TESTATOPLANAR(ARGB, 4, 1, J420, 2, 2, 4) +#else +TESTATOPLANAR(ARGB, 4, 1, J420, 2, 2, 0) +#endif +TESTATOPLANAR(BGRA, 4, 1, I420, 2, 2, 4) +TESTATOPLANAR(ABGR, 4, 1, I420, 2, 2, 4) +TESTATOPLANAR(RGBA, 4, 1, I420, 2, 2, 4) +TESTATOPLANAR(RAW, 3, 1, I420, 2, 2, 4) +TESTATOPLANAR(RGB24, 3, 1, I420, 2, 2, 4) +TESTATOPLANAR(RGB565, 2, 1, I420, 2, 2, 5) +// TODO(fbarchard): Make 1555 neon work same as C code, reduce to diff 9. +TESTATOPLANAR(ARGB1555, 2, 1, I420, 2, 2, 15) +TESTATOPLANAR(ARGB4444, 2, 1, I420, 2, 2, 17) +TESTATOPLANAR(ARGB, 4, 1, I411, 4, 1, 4) +TESTATOPLANAR(ARGB, 4, 1, I422, 2, 1, 2) +TESTATOPLANAR(ARGB, 4, 1, I444, 1, 1, 2) +TESTATOPLANAR(YUY2, 2, 1, I420, 2, 2, 2) +TESTATOPLANAR(UYVY, 2, 1, I420, 2, 2, 2) +TESTATOPLANAR(YUY2, 2, 1, I422, 2, 1, 2) +TESTATOPLANAR(UYVY, 2, 1, I422, 2, 1, 2) +TESTATOPLANAR(I400, 1, 1, I420, 2, 2, 2) +TESTATOPLANAR(BayerBGGR, 1, 2, I420, 2, 2, 4) +TESTATOPLANAR(BayerRGGB, 1, 2, I420, 2, 2, 4) +TESTATOPLANAR(BayerGBRG, 1, 2, I420, 2, 2, 4) +TESTATOPLANAR(BayerGRBG, 1, 2, I420, 2, 2, 4) + +#define TESTATOBIPLANARI(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ + W1280, N, NEG, OFF) \ +TEST_F(libyuvTest, FMT_A##To##FMT_PLANAR##N) { \ + const int kWidth = ((W1280) > 0) ? (W1280) : 1; \ + const int kHeight = benchmark_height_; \ + const int kStride = (kWidth * 8 * BPP_A + 7) / 8; \ + align_buffer_64(src_argb, kStride * kHeight + OFF); \ + align_buffer_64(dst_y_c, kWidth * kHeight); \ + align_buffer_64(dst_uv_c, \ + SUBSAMPLE(kWidth, SUBSAMP_X) * 2 * \ + SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + align_buffer_64(dst_y_opt, kWidth * kHeight); \ + align_buffer_64(dst_uv_opt, \ + SUBSAMPLE(kWidth, SUBSAMP_X) * 2 * \ + SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + srandom(time(NULL)); \ + for (int i = 0; i < kHeight; ++i) \ + for (int j = 0; j < kStride; ++j) \ + src_argb[(i * kStride) + j + OFF] = (random() & 0xff); \ + MaskCpuFlags(0); \ + FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, \ + dst_y_c, kWidth, \ + dst_uv_c, SUBSAMPLE(kWidth, SUBSAMP_X) * 2, \ + kWidth, NEG kHeight); \ + MaskCpuFlags(-1); \ + for (int i = 0; i < benchmark_iterations_; ++i) { \ + FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, \ + dst_y_opt, kWidth, \ + dst_uv_opt, SUBSAMPLE(kWidth, SUBSAMP_X) * 2, \ + kWidth, NEG kHeight); \ + } \ + int max_diff = 0; \ + for (int i = 0; i < kHeight; ++i) { \ + for (int j = 0; j < kWidth; ++j) { \ + int abs_diff = \ + abs(static_cast<int>(dst_y_c[i * kWidth + j]) - \ + static_cast<int>(dst_y_opt[i * kWidth + j])); \ + if (abs_diff > max_diff) { \ + max_diff = abs_diff; \ + } \ + } \ + } \ + EXPECT_LE(max_diff, 4); \ + for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \ + for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X) * 2; ++j) { \ + int abs_diff = \ + abs(static_cast<int>(dst_uv_c[i * \ + SUBSAMPLE(kWidth, SUBSAMP_X) * 2 + j]) - \ + static_cast<int>(dst_uv_opt[i * \ + SUBSAMPLE(kWidth, SUBSAMP_X) * 2 + j])); \ + if (abs_diff > max_diff) { \ + max_diff = abs_diff; \ + } \ + } \ + } \ + EXPECT_LE(max_diff, 4); \ + free_aligned_buffer_64(dst_y_c) \ + free_aligned_buffer_64(dst_uv_c) \ + free_aligned_buffer_64(dst_y_opt) \ + free_aligned_buffer_64(dst_uv_opt) \ + free_aligned_buffer_64(src_argb) \ +} + +#define TESTATOBIPLANAR(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, 
SUBSAMP_Y) \ + TESTATOBIPLANARI(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ + benchmark_width_ - 4, _Any, +, 0) \ + TESTATOBIPLANARI(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ + benchmark_width_, _Unaligned, +, 1) \ + TESTATOBIPLANARI(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ + benchmark_width_, _Invert, -, 0) \ + TESTATOBIPLANARI(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ + benchmark_width_, _Opt, +, 0) + +TESTATOBIPLANAR(ARGB, 4, NV12, 2, 2) +TESTATOBIPLANAR(ARGB, 4, NV21, 2, 2) + +#define TESTATOBI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, \ + FMT_B, BPP_B, STRIDE_B, HEIGHT_B, \ + W1280, DIFF, N, NEG, OFF) \ +TEST_F(libyuvTest, FMT_A##To##FMT_B##N) { \ + const int kWidth = ((W1280) > 0) ? (W1280) : 1; \ + const int kHeight = benchmark_height_; \ + const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A; \ + const int kHeightB = (kHeight + HEIGHT_B - 1) / HEIGHT_B * HEIGHT_B; \ + const int kStrideA = (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A; \ + const int kStrideB = (kWidth * BPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B; \ + align_buffer_64(src_argb, kStrideA * kHeightA + OFF); \ + align_buffer_64(dst_argb_c, kStrideB * kHeightB); \ + align_buffer_64(dst_argb_opt, kStrideB * kHeightB); \ + memset(dst_argb_c, 0, kStrideB * kHeightB); \ + memset(dst_argb_opt, 0, kStrideB * kHeightB); \ + srandom(time(NULL)); \ + for (int i = 0; i < kStrideA * kHeightA; ++i) { \ + src_argb[i + OFF] = (random() & 0xff); \ + } \ + MaskCpuFlags(0); \ + FMT_A##To##FMT_B(src_argb + OFF, kStrideA, \ + dst_argb_c, kStrideB, \ + kWidth, NEG kHeight); \ + MaskCpuFlags(-1); \ + for (int i = 0; i < benchmark_iterations_; ++i) { \ + FMT_A##To##FMT_B(src_argb + OFF, kStrideA, \ + dst_argb_opt, kStrideB, \ + kWidth, NEG kHeight); \ + } \ + int max_diff = 0; \ + for (int i = 0; i < kStrideB * kHeightB; ++i) { \ + int abs_diff = \ + abs(static_cast<int>(dst_argb_c[i]) - \ + static_cast<int>(dst_argb_opt[i])); \ + if (abs_diff > max_diff) { \ + max_diff = abs_diff; \ + } \ + } \ + EXPECT_LE(max_diff, DIFF); \ + free_aligned_buffer_64(src_argb) \ + free_aligned_buffer_64(dst_argb_c) \ + free_aligned_buffer_64(dst_argb_opt) \ +} + +#define TESTATOBRANDOM(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, \ + FMT_B, BPP_B, STRIDE_B, HEIGHT_B, DIFF) \ +TEST_F(libyuvTest, FMT_A##To##FMT_B##_Random) { \ + srandom(time(NULL)); \ + for (int times = 0; times < benchmark_iterations_; ++times) { \ + const int kWidth = (random() & 63) + 1; \ + const int kHeight = (random() & 31) + 1; \ + const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A; \ + const int kHeightB = (kHeight + HEIGHT_B - 1) / HEIGHT_B * HEIGHT_B; \ + const int kStrideA = (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A;\ + const int kStrideB = (kWidth * BPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B;\ + align_buffer_page_end(src_argb, kStrideA * kHeightA); \ + align_buffer_page_end(dst_argb_c, kStrideB * kHeightB); \ + align_buffer_page_end(dst_argb_opt, kStrideB * kHeightB); \ + memset(dst_argb_c, 0, kStrideB * kHeightB); \ + memset(dst_argb_opt, 0, kStrideB * kHeightB); \ + for (int i = 0; i < kStrideA * kHeightA; ++i) { \ + src_argb[i] = (random() & 0xff); \ + } \ + MaskCpuFlags(0); \ + FMT_A##To##FMT_B(src_argb, kStrideA, \ + dst_argb_c, kStrideB, \ + kWidth, kHeight); \ + MaskCpuFlags(-1); \ + FMT_A##To##FMT_B(src_argb, kStrideA, \ + dst_argb_opt, kStrideB, \ + kWidth, kHeight); \ + int max_diff = 0; \ + for (int i = 0; i < kStrideB * kHeightB; ++i) { \ + int abs_diff = \ + abs(static_cast<int>(dst_argb_c[i]) - \ + 
static_cast<int>(dst_argb_opt[i])); \ + if (abs_diff > max_diff) { \ + max_diff = abs_diff; \ + } \ + } \ + EXPECT_LE(max_diff, DIFF); \ + free_aligned_buffer_page_end(src_argb) \ + free_aligned_buffer_page_end(dst_argb_c) \ + free_aligned_buffer_page_end(dst_argb_opt) \ + } \ +} + +#define TESTATOB(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, \ + FMT_B, BPP_B, STRIDE_B, HEIGHT_B, DIFF) \ + TESTATOBI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, \ + FMT_B, BPP_B, STRIDE_B, HEIGHT_B, \ + benchmark_width_ - 4, DIFF, _Any, +, 0) \ + TESTATOBI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, \ + FMT_B, BPP_B, STRIDE_B, HEIGHT_B, \ + benchmark_width_, DIFF, _Unaligned, +, 1) \ + TESTATOBI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, \ + FMT_B, BPP_B, STRIDE_B, HEIGHT_B, \ + benchmark_width_, DIFF, _Invert, -, 0) \ + TESTATOBI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, \ + FMT_B, BPP_B, STRIDE_B, HEIGHT_B, \ + benchmark_width_, DIFF, _Opt, +, 0) \ + TESTATOBRANDOM(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, \ + FMT_B, BPP_B, STRIDE_B, HEIGHT_B, DIFF) + +TESTATOB(ARGB, 4, 4, 1, ARGB, 4, 4, 1, 0) +TESTATOB(ARGB, 4, 4, 1, BGRA, 4, 4, 1, 0) +TESTATOB(ARGB, 4, 4, 1, ABGR, 4, 4, 1, 0) +TESTATOB(ARGB, 4, 4, 1, RGBA, 4, 4, 1, 0) +TESTATOB(ARGB, 4, 4, 1, RAW, 3, 3, 1, 0) +TESTATOB(ARGB, 4, 4, 1, RGB24, 3, 3, 1, 0) +TESTATOB(ARGB, 4, 4, 1, RGB565, 2, 2, 1, 0) +TESTATOB(ARGB, 4, 4, 1, ARGB1555, 2, 2, 1, 0) +TESTATOB(ARGB, 4, 4, 1, ARGB4444, 2, 2, 1, 0) +TESTATOB(ARGB, 4, 4, 1, BayerBGGR, 1, 2, 2, 0) +TESTATOB(ARGB, 4, 4, 1, BayerRGGB, 1, 2, 2, 0) +TESTATOB(ARGB, 4, 4, 1, BayerGBRG, 1, 2, 2, 0) +TESTATOB(ARGB, 4, 4, 1, BayerGRBG, 1, 2, 2, 0) +TESTATOB(ARGB, 4, 4, 1, YUY2, 2, 4, 1, 4) +TESTATOB(ARGB, 4, 4, 1, UYVY, 2, 4, 1, 4) +TESTATOB(ARGB, 4, 4, 1, I400, 1, 1, 1, 2) +TESTATOB(ARGB, 4, 4, 1, J400, 1, 1, 1, 2) +TESTATOB(BGRA, 4, 4, 1, ARGB, 4, 4, 1, 0) +TESTATOB(ABGR, 4, 4, 1, ARGB, 4, 4, 1, 0) +TESTATOB(RGBA, 4, 4, 1, ARGB, 4, 4, 1, 0) +TESTATOB(RAW, 3, 3, 1, ARGB, 4, 4, 1, 0) +TESTATOB(RGB24, 3, 3, 1, ARGB, 4, 4, 1, 0) +TESTATOB(RGB565, 2, 2, 1, ARGB, 4, 4, 1, 0) +TESTATOB(ARGB1555, 2, 2, 1, ARGB, 4, 4, 1, 0) +TESTATOB(ARGB4444, 2, 2, 1, ARGB, 4, 4, 1, 0) +TESTATOB(YUY2, 2, 4, 1, ARGB, 4, 4, 1, 4) +TESTATOB(UYVY, 2, 4, 1, ARGB, 4, 4, 1, 4) +TESTATOB(BayerBGGR, 1, 2, 2, ARGB, 4, 4, 1, 0) +TESTATOB(BayerRGGB, 1, 2, 2, ARGB, 4, 4, 1, 0) +TESTATOB(BayerGBRG, 1, 2, 2, ARGB, 4, 4, 1, 0) +TESTATOB(BayerGRBG, 1, 2, 2, ARGB, 4, 4, 1, 0) +TESTATOB(I400, 1, 1, 1, ARGB, 4, 4, 1, 0) +TESTATOB(I400, 1, 1, 1, I400, 1, 1, 1, 0) +TESTATOB(I400, 1, 1, 1, I400Mirror, 1, 1, 1, 0) +TESTATOB(Y, 1, 1, 1, ARGB, 4, 4, 1, 0) +TESTATOB(ARGB, 4, 4, 1, ARGBMirror, 4, 4, 1, 0) + +TEST_F(libyuvTest, Test565) { + SIMD_ALIGNED(uint8 orig_pixels[256][4]); + SIMD_ALIGNED(uint8 pixels565[256][2]); + + for (int i = 0; i < 256; ++i) { + for (int j = 0; j < 4; ++j) { + orig_pixels[i][j] = i; + } + } + ARGBToRGB565(&orig_pixels[0][0], 0, &pixels565[0][0], 0, 256, 1); + uint32 checksum = HashDjb2(&pixels565[0][0], sizeof(pixels565), 5381); + EXPECT_EQ(610919429u, checksum); +} + +#ifdef HAVE_JPEG +TEST_F(libyuvTest, ValidateJpeg) { + const int kOff = 10; + const int kMinJpeg = 64; + const int kImageSize = benchmark_width_ * benchmark_height_ >= kMinJpeg ? + benchmark_width_ * benchmark_height_ : kMinJpeg; + const int kSize = kImageSize + kOff; + align_buffer_64(orig_pixels, kSize); + + // No SOI or EOI. Expect fail. + memset(orig_pixels, 0, kSize); + + // EOI, SOI. Expect pass. + orig_pixels[0] = 0xff; + orig_pixels[1] = 0xd8; // SOI. 
+ orig_pixels[kSize - kOff + 0] = 0xff; + orig_pixels[kSize - kOff + 1] = 0xd9; // EOI. + for (int times = 0; times < benchmark_iterations_; ++times) { + EXPECT_TRUE(ValidateJpeg(orig_pixels, kSize)); + } + free_aligned_buffer_page_end(orig_pixels); +} + +TEST_F(libyuvTest, InvalidateJpeg) { + const int kOff = 10; + const int kMinJpeg = 64; + const int kImageSize = benchmark_width_ * benchmark_height_ >= kMinJpeg ? + benchmark_width_ * benchmark_height_ : kMinJpeg; + const int kSize = kImageSize + kOff; + align_buffer_64(orig_pixels, kSize); + + // No SOI or EOI. Expect fail. + memset(orig_pixels, 0, kSize); + EXPECT_FALSE(ValidateJpeg(orig_pixels, kSize)); + + // SOI but no EOI. Expect fail. + orig_pixels[0] = 0xff; + orig_pixels[1] = 0xd8; // SOI. + for (int times = 0; times < benchmark_iterations_; ++times) { + EXPECT_FALSE(ValidateJpeg(orig_pixels, kSize)); + } + // EOI but no SOI. Expect fail. + orig_pixels[0] = 0; + orig_pixels[1] = 0; + orig_pixels[kSize - kOff + 0] = 0xff; + orig_pixels[kSize - kOff + 1] = 0xd9; // EOI. + EXPECT_FALSE(ValidateJpeg(orig_pixels, kSize)); + + free_aligned_buffer_page_end(orig_pixels); +} + +#endif + +} // namespace libyuv diff --git a/chromium/third_party/libyuv/unit_test/cpu_test.cc b/chromium/third_party/libyuv/unit_test/cpu_test.cc index 67c489cfc93..45579b8913e 100644 --- a/chromium/third_party/libyuv/unit_test/cpu_test.cc +++ b/chromium/third_party/libyuv/unit_test/cpu_test.cc @@ -41,6 +41,8 @@ TEST_F(libyuvTest, TestCpuHas) { printf("Has AVX2 %x\n", has_avx2); int has_erms = TestCpuFlag(kCpuHasERMS); printf("Has ERMS %x\n", has_erms); + int has_fma3 = TestCpuFlag(kCpuHasFMA3); + printf("Has FMA3 %x\n", has_fma3); int has_mips = TestCpuFlag(kCpuHasMIPS); printf("Has MIPS %x\n", has_mips); int has_mips_dsp = TestCpuFlag(kCpuHasMIPS_DSP); @@ -54,7 +56,7 @@ TEST_F(libyuvTest, TestCpuHas) { TEST_F(libyuvTest, TestCpuId) { int has_x86 = TestCpuFlag(kCpuHasX86); if (has_x86) { - int cpu_info[4]; + uint32 cpu_info[4]; // Vendor ID: // AuthenticAMD AMD processor // CentaurHauls Centaur processor @@ -66,7 +68,7 @@ TEST_F(libyuvTest, TestCpuId) { // RiseRiseRise Rise Technology processor // SiS SiS SiS SiS processor // UMC UMC UMC UMC processor - CpuId(cpu_info, 0); + CpuId(0, 0, cpu_info); cpu_info[0] = cpu_info[1]; // Reorder output cpu_info[1] = cpu_info[3]; cpu_info[3] = 0; @@ -81,7 +83,7 @@ TEST_F(libyuvTest, TestCpuId) { // 13:12 - Processor Type // 19:16 - Extended Model // 27:20 - Extended Family - CpuId(cpu_info, 1); + CpuId(1, 0, cpu_info); int family = ((cpu_info[0] >> 8) & 0x0f) | ((cpu_info[0] >> 16) & 0xff0); int model = ((cpu_info[0] >> 4) & 0x0f) | ((cpu_info[0] >> 12) & 0xf0); printf("Cpu Family %d (0x%x), Model %d (0x%x)\n", family, family, @@ -93,10 +95,8 @@ TEST_F(libyuvTest, TestCpuId) { TEST_F(libyuvTest, TestLinuxNeon) { int testdata = ArmCpuCaps("unit_test/testdata/arm_v7.txt"); if (testdata) { - EXPECT_EQ(0, - ArmCpuCaps("unit_test/testdata/arm_v7.txt")); - EXPECT_EQ(kCpuHasNEON, - ArmCpuCaps("unit_test/testdata/tegra3.txt")); + EXPECT_EQ(0, ArmCpuCaps("unit_test/testdata/arm_v7.txt")); + EXPECT_EQ(kCpuHasNEON, ArmCpuCaps("unit_test/testdata/tegra3.txt")); } else { printf("WARNING: unable to load \"unit_test/testdata/arm_v7.txt\"\n"); } diff --git a/chromium/third_party/libyuv/unit_test/math_test.cc b/chromium/third_party/libyuv/unit_test/math_test.cc new file mode 100644 index 00000000000..4095c122eb6 --- /dev/null +++ b/chromium/third_party/libyuv/unit_test/math_test.cc @@ -0,0 +1,114 @@ +/* + * Copyright 2013 The LibYuv Project 
Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <stdlib.h> +#include <string.h> + +#include "libyuv/basic_types.h" +#include "libyuv/cpu_id.h" +#include "libyuv/row.h" +#include "../unit_test/unit_test.h" + +namespace libyuv { + +TEST_F(libyuvTest, TestFixedDiv) { + int num[256]; + int div[256]; + int result_opt[256]; + int result_c[256]; + + EXPECT_EQ(0x20000, libyuv::FixedDiv(640 * 2, 640)); + EXPECT_EQ(0x30000, libyuv::FixedDiv(640 * 3, 640)); + EXPECT_EQ(0x40000, libyuv::FixedDiv(640 * 4, 640)); + EXPECT_EQ(0x50000, libyuv::FixedDiv(640 * 5, 640)); + EXPECT_EQ(0x60000, libyuv::FixedDiv(640 * 6, 640)); + EXPECT_EQ(0x70000, libyuv::FixedDiv(640 * 7, 640)); + EXPECT_EQ(0x80000, libyuv::FixedDiv(640 * 8, 640)); + EXPECT_EQ(0xa0000, libyuv::FixedDiv(640 * 10, 640)); + EXPECT_EQ(0x20000, libyuv::FixedDiv(960 * 2, 960)); + EXPECT_EQ(0x08000, libyuv::FixedDiv(640 / 2, 640)); + EXPECT_EQ(0x04000, libyuv::FixedDiv(640 / 4, 640)); + EXPECT_EQ(0x20000, libyuv::FixedDiv(1080 * 2, 1080)); + EXPECT_EQ(0x20000, libyuv::FixedDiv(200000, 100000)); + EXPECT_EQ(0x18000, libyuv::FixedDiv(150000, 100000)); + EXPECT_EQ(0x20000, libyuv::FixedDiv(40000, 20000)); + EXPECT_EQ(0x20000, libyuv::FixedDiv(-40000, -20000)); + EXPECT_EQ(-0x20000, libyuv::FixedDiv(40000, -20000)); + EXPECT_EQ(-0x20000, libyuv::FixedDiv(-40000, 20000)); + EXPECT_EQ(0x10000, libyuv::FixedDiv(4095, 4095)); + EXPECT_EQ(0x10000, libyuv::FixedDiv(4096, 4096)); + EXPECT_EQ(0x10000, libyuv::FixedDiv(4097, 4097)); + EXPECT_EQ(123 * 65536, libyuv::FixedDiv(123, 1)); + + for (int i = 1; i < 4100; ++i) { + EXPECT_EQ(0x10000, libyuv::FixedDiv(i, i)); + EXPECT_EQ(0x20000, libyuv::FixedDiv(i * 2, i)); + EXPECT_EQ(0x30000, libyuv::FixedDiv(i * 3, i)); + EXPECT_EQ(0x40000, libyuv::FixedDiv(i * 4, i)); + EXPECT_EQ(0x08000, libyuv::FixedDiv(i, i * 2)); + EXPECT_NEAR(16384 * 65536 / i, libyuv::FixedDiv(16384, i), 1); + } + EXPECT_EQ(123 * 65536, libyuv::FixedDiv(123, 1)); + + srandom(time(NULL)); + MemRandomize(reinterpret_cast<uint8*>(&num[0]), sizeof(num)); + MemRandomize(reinterpret_cast<uint8*>(&div[0]), sizeof(div)); + for (int j = 0; j < 256; ++j) { + if (div[j] == 0) { + div[j] = 1280; + } + } + for (int i = 0; i < benchmark_pixels_div256_; ++i) { + for (int j = 0; j < 256; ++j) { + result_opt[j] = libyuv::FixedDiv(num[j], div[j]); + } + } + for (int j = 0; j < 256; ++j) { + result_c[j] = libyuv::FixedDiv_C(num[j], div[j]); + EXPECT_NEAR(result_c[j], result_opt[j], 1); + } +} + +TEST_F(libyuvTest, TestFixedDiv_Opt) { + int num[256]; + int div[256]; + int result_opt[256]; + int result_c[256]; + + srandom(time(NULL)); + MemRandomize(reinterpret_cast<uint8*>(&num[0]), sizeof(num)); + MemRandomize(reinterpret_cast<uint8*>(&div[0]), sizeof(div)); + for (int j = 0; j < 256; ++j) { + num[j] &= 4095; // Make numerator smaller. + div[j] &= 4095; // Make divisor smaller. 
+ if (div[j] == 0) { + div[j] = 1280; + } + } + + int has_x86 = TestCpuFlag(kCpuHasX86); + for (int i = 0; i < benchmark_pixels_div256_; ++i) { + if (has_x86) { + for (int j = 0; j < 256; ++j) { + result_opt[j] = libyuv::FixedDiv(num[j], div[j]); + } + } else { + for (int j = 0; j < 256; ++j) { + result_opt[j] = libyuv::FixedDiv_C(num[j], div[j]); + } + } + } + for (int j = 0; j < 256; ++j) { + result_c[j] = libyuv::FixedDiv_C(num[j], div[j]); + EXPECT_NEAR(result_c[j], result_opt[j], 1); + } +} + +} // namespace libyuv diff --git a/chromium/third_party/libyuv/unit_test/planar_test.cc b/chromium/third_party/libyuv/unit_test/planar_test.cc index 2c9958baae1..7759db406ff 100644 --- a/chromium/third_party/libyuv/unit_test/planar_test.cc +++ b/chromium/third_party/libyuv/unit_test/planar_test.cc @@ -32,77 +32,83 @@ namespace libyuv { TEST_F(libyuvTest, TestAttenuate) { - SIMD_ALIGNED(uint8 orig_pixels[256][4]); - SIMD_ALIGNED(uint8 atten_pixels[256][4]); - SIMD_ALIGNED(uint8 unatten_pixels[256][4]); - SIMD_ALIGNED(uint8 atten2_pixels[256][4]); + const int kSize = 1280 * 4; + align_buffer_64(orig_pixels, kSize); + align_buffer_64(atten_pixels, kSize); + align_buffer_64(unatten_pixels, kSize); + align_buffer_64(atten2_pixels, kSize); // Test unattenuation clamps - orig_pixels[0][0] = 200u; - orig_pixels[0][1] = 129u; - orig_pixels[0][2] = 127u; - orig_pixels[0][3] = 128u; + orig_pixels[0 * 4 + 0] = 200u; + orig_pixels[0 * 4 + 1] = 129u; + orig_pixels[0 * 4 + 2] = 127u; + orig_pixels[0 * 4 + 3] = 128u; // Test unattenuation transparent and opaque are unaffected - orig_pixels[1][0] = 16u; - orig_pixels[1][1] = 64u; - orig_pixels[1][2] = 192u; - orig_pixels[1][3] = 0u; - orig_pixels[2][0] = 16u; - orig_pixels[2][1] = 64u; - orig_pixels[2][2] = 192u; - orig_pixels[2][3] = 255u; - orig_pixels[3][0] = 16u; - orig_pixels[3][1] = 64u; - orig_pixels[3][2] = 192u; - orig_pixels[3][3] = 128u; - ARGBUnattenuate(&orig_pixels[0][0], 0, &unatten_pixels[0][0], 0, 4, 1); - EXPECT_EQ(255u, unatten_pixels[0][0]); - EXPECT_EQ(255u, unatten_pixels[0][1]); - EXPECT_EQ(254u, unatten_pixels[0][2]); - EXPECT_EQ(128u, unatten_pixels[0][3]); - EXPECT_EQ(0u, unatten_pixels[1][0]); - EXPECT_EQ(0u, unatten_pixels[1][1]); - EXPECT_EQ(0u, unatten_pixels[1][2]); - EXPECT_EQ(0u, unatten_pixels[1][3]); - EXPECT_EQ(16u, unatten_pixels[2][0]); - EXPECT_EQ(64u, unatten_pixels[2][1]); - EXPECT_EQ(192u, unatten_pixels[2][2]); - EXPECT_EQ(255u, unatten_pixels[2][3]); - EXPECT_EQ(32u, unatten_pixels[3][0]); - EXPECT_EQ(128u, unatten_pixels[3][1]); - EXPECT_EQ(255u, unatten_pixels[3][2]); - EXPECT_EQ(128u, unatten_pixels[3][3]); - - for (int i = 0; i < 256; ++i) { - orig_pixels[i][0] = i; - orig_pixels[i][1] = i / 2; - orig_pixels[i][2] = i / 3; - orig_pixels[i][3] = i; - } - ARGBAttenuate(&orig_pixels[0][0], 0, &atten_pixels[0][0], 0, 256, 1); - ARGBUnattenuate(&atten_pixels[0][0], 0, &unatten_pixels[0][0], 0, 256, 1); - for (int i = 0; i < benchmark_pixels_div256_; ++i) { - ARGBAttenuate(&unatten_pixels[0][0], 0, &atten2_pixels[0][0], 0, 256, 1); - } - for (int i = 0; i < 256; ++i) { - EXPECT_NEAR(atten_pixels[i][0], atten2_pixels[i][0], 2); - EXPECT_NEAR(atten_pixels[i][1], atten2_pixels[i][1], 2); - EXPECT_NEAR(atten_pixels[i][2], atten2_pixels[i][2], 2); - EXPECT_NEAR(atten_pixels[i][3], atten2_pixels[i][3], 2); + orig_pixels[1 * 4 + 0] = 16u; + orig_pixels[1 * 4 + 1] = 64u; + orig_pixels[1 * 4 + 2] = 192u; + orig_pixels[1 * 4 + 3] = 0u; + orig_pixels[2 * 4 + 0] = 16u; + orig_pixels[2 * 4 + 1] = 64u; + orig_pixels[2 * 4 + 2] = 192u; 
+ orig_pixels[2 * 4 + 3] = 255u; + orig_pixels[3 * 4 + 0] = 16u; + orig_pixels[3 * 4 + 1] = 64u; + orig_pixels[3 * 4 + 2] = 192u; + orig_pixels[3 * 4 + 3] = 128u; + ARGBUnattenuate(orig_pixels, 0, unatten_pixels, 0, 4, 1); + EXPECT_EQ(255u, unatten_pixels[0 * 4 + 0]); + EXPECT_EQ(255u, unatten_pixels[0 * 4 + 1]); + EXPECT_EQ(254u, unatten_pixels[0 * 4 + 2]); + EXPECT_EQ(128u, unatten_pixels[0 * 4 + 3]); + EXPECT_EQ(0u, unatten_pixels[1 * 4 + 0]); + EXPECT_EQ(0u, unatten_pixels[1 * 4 + 1]); + EXPECT_EQ(0u, unatten_pixels[1 * 4 + 2]); + EXPECT_EQ(0u, unatten_pixels[1 * 4 + 3]); + EXPECT_EQ(16u, unatten_pixels[2 * 4 + 0]); + EXPECT_EQ(64u, unatten_pixels[2 * 4 + 1]); + EXPECT_EQ(192u, unatten_pixels[2 * 4 + 2]); + EXPECT_EQ(255u, unatten_pixels[2 * 4 + 3]); + EXPECT_EQ(32u, unatten_pixels[3 * 4 + 0]); + EXPECT_EQ(128u, unatten_pixels[3 * 4 + 1]); + EXPECT_EQ(255u, unatten_pixels[3 * 4 + 2]); + EXPECT_EQ(128u, unatten_pixels[3 * 4 + 3]); + + for (int i = 0; i < 1280; ++i) { + orig_pixels[i * 4 + 0] = i; + orig_pixels[i * 4 + 1] = i / 2; + orig_pixels[i * 4 + 2] = i / 3; + orig_pixels[i * 4 + 3] = i; + } + ARGBAttenuate(orig_pixels, 0, atten_pixels, 0, 1280, 1); + ARGBUnattenuate(atten_pixels, 0, unatten_pixels, 0, 1280, 1); + for (int i = 0; i < benchmark_pixels_div1280_; ++i) { + ARGBAttenuate(unatten_pixels, 0, atten2_pixels, 0, 1280, 1); + } + for (int i = 0; i < 1280; ++i) { + EXPECT_NEAR(atten_pixels[i * 4 + 0], atten2_pixels[i * 4 + 0], 2); + EXPECT_NEAR(atten_pixels[i * 4 + 1], atten2_pixels[i * 4 + 1], 2); + EXPECT_NEAR(atten_pixels[i * 4 + 2], atten2_pixels[i * 4 + 2], 2); + EXPECT_NEAR(atten_pixels[i * 4 + 3], atten2_pixels[i * 4 + 3], 2); } // Make sure transparent, 50% and opaque are fully accurate. - EXPECT_EQ(0, atten_pixels[0][0]); - EXPECT_EQ(0, atten_pixels[0][1]); - EXPECT_EQ(0, atten_pixels[0][2]); - EXPECT_EQ(0, atten_pixels[0][3]); - EXPECT_EQ(64, atten_pixels[128][0]); - EXPECT_EQ(32, atten_pixels[128][1]); - EXPECT_EQ(21, atten_pixels[128][2]); - EXPECT_EQ(128, atten_pixels[128][3]); - EXPECT_NEAR(255, atten_pixels[255][0], 1); - EXPECT_NEAR(127, atten_pixels[255][1], 1); - EXPECT_NEAR(85, atten_pixels[255][2], 1); - EXPECT_EQ(255, atten_pixels[255][3]); + EXPECT_EQ(0, atten_pixels[0 * 4 + 0]); + EXPECT_EQ(0, atten_pixels[0 * 4 + 1]); + EXPECT_EQ(0, atten_pixels[0 * 4 + 2]); + EXPECT_EQ(0, atten_pixels[0 * 4 + 3]); + EXPECT_EQ(64, atten_pixels[128 * 4 + 0]); + EXPECT_EQ(32, atten_pixels[128 * 4 + 1]); + EXPECT_EQ(21, atten_pixels[128 * 4 + 2]); + EXPECT_EQ(128, atten_pixels[128 * 4 + 3]); + EXPECT_NEAR(255, atten_pixels[255 * 4 + 0], 1); + EXPECT_NEAR(127, atten_pixels[255 * 4 + 1], 1); + EXPECT_NEAR(85, atten_pixels[255 * 4 + 2], 1); + EXPECT_EQ(255, atten_pixels[255 * 4 + 3]); + + free_aligned_buffer_64(atten2_pixels) + free_aligned_buffer_64(unatten_pixels) + free_aligned_buffer_64(atten_pixels) + free_aligned_buffer_64(orig_pixels) } static int TestAttenuateI(int width, int height, int benchmark_iterations, @@ -268,7 +274,9 @@ TEST_F(libyuvTest, TestARGBComputeCumulativeSum) { } TEST_F(libyuvTest, TestARGBGray) { - SIMD_ALIGNED(uint8 orig_pixels[256][4]); + SIMD_ALIGNED(uint8 orig_pixels[1280][4]); + memset(orig_pixels, 0, sizeof(orig_pixels)); + // Test blue orig_pixels[0][0] = 255u; orig_pixels[0][1] = 0u; @@ -325,20 +333,22 @@ TEST_F(libyuvTest, TestARGBGray) { EXPECT_EQ(96u, orig_pixels[5][1]); EXPECT_EQ(96u, orig_pixels[5][2]); EXPECT_EQ(224u, orig_pixels[5][3]); - for (int i = 0; i < 256; ++i) { + for (int i = 0; i < 1280; ++i) { orig_pixels[i][0] = i; 
orig_pixels[i][1] = i / 2; orig_pixels[i][2] = i / 3; orig_pixels[i][3] = i; } - for (int i = 0; i < benchmark_pixels_div256_; ++i) { - ARGBGray(&orig_pixels[0][0], 0, 0, 0, 256, 1); + for (int i = 0; i < benchmark_pixels_div1280_; ++i) { + ARGBGray(&orig_pixels[0][0], 0, 0, 0, 1280, 1); } } TEST_F(libyuvTest, TestARGBGrayTo) { - SIMD_ALIGNED(uint8 orig_pixels[256][4]); - SIMD_ALIGNED(uint8 gray_pixels[256][4]); + SIMD_ALIGNED(uint8 orig_pixels[1280][4]); + SIMD_ALIGNED(uint8 gray_pixels[1280][4]); + memset(orig_pixels, 0, sizeof(orig_pixels)); + // Test blue orig_pixels[0][0] = 255u; orig_pixels[0][1] = 0u; @@ -395,19 +405,20 @@ TEST_F(libyuvTest, TestARGBGrayTo) { EXPECT_EQ(96u, gray_pixels[5][1]); EXPECT_EQ(96u, gray_pixels[5][2]); EXPECT_EQ(224u, gray_pixels[5][3]); - for (int i = 0; i < 256; ++i) { + for (int i = 0; i < 1280; ++i) { orig_pixels[i][0] = i; orig_pixels[i][1] = i / 2; orig_pixels[i][2] = i / 3; orig_pixels[i][3] = i; } - for (int i = 0; i < benchmark_pixels_div256_; ++i) { - ARGBGrayTo(&orig_pixels[0][0], 0, &gray_pixels[0][0], 0, 256, 1); + for (int i = 0; i < benchmark_pixels_div1280_; ++i) { + ARGBGrayTo(&orig_pixels[0][0], 0, &gray_pixels[0][0], 0, 1280, 1); } } TEST_F(libyuvTest, TestARGBSepia) { - SIMD_ALIGNED(uint8 orig_pixels[256][4]); + SIMD_ALIGNED(uint8 orig_pixels[1280][4]); + memset(orig_pixels, 0, sizeof(orig_pixels)); // Test blue orig_pixels[0][0] = 255u; @@ -466,27 +477,106 @@ TEST_F(libyuvTest, TestARGBSepia) { EXPECT_EQ(127u, orig_pixels[5][2]); EXPECT_EQ(224u, orig_pixels[5][3]); - for (int i = 0; i < 256; ++i) { + for (int i = 0; i < 1280; ++i) { orig_pixels[i][0] = i; orig_pixels[i][1] = i / 2; orig_pixels[i][2] = i / 3; orig_pixels[i][3] = i; } - for (int i = 0; i < benchmark_pixels_div256_; ++i) { - ARGBSepia(&orig_pixels[0][0], 0, 0, 0, 256, 1); + for (int i = 0; i < benchmark_pixels_div1280_; ++i) { + ARGBSepia(&orig_pixels[0][0], 0, 0, 0, 1280, 1); } } TEST_F(libyuvTest, TestARGBColorMatrix) { - SIMD_ALIGNED(uint8 orig_pixels[256][4]); + SIMD_ALIGNED(uint8 orig_pixels[1280][4]); + SIMD_ALIGNED(uint8 dst_pixels_opt[1280][4]); + SIMD_ALIGNED(uint8 dst_pixels_c[1280][4]); // Matrix for Sepia. - static const int8 kARGBToSepia[] = { + SIMD_ALIGNED(static const int8 kRGBToSepia[]) = { + 17 / 2, 68 / 2, 35 / 2, 0, + 22 / 2, 88 / 2, 45 / 2, 0, + 24 / 2, 98 / 2, 50 / 2, 0, + 0, 0, 0, 64, // Copy alpha. + }; + memset(orig_pixels, 0, sizeof(orig_pixels)); + + // Test blue + orig_pixels[0][0] = 255u; + orig_pixels[0][1] = 0u; + orig_pixels[0][2] = 0u; + orig_pixels[0][3] = 128u; + // Test green + orig_pixels[1][0] = 0u; + orig_pixels[1][1] = 255u; + orig_pixels[1][2] = 0u; + orig_pixels[1][3] = 0u; + // Test red + orig_pixels[2][0] = 0u; + orig_pixels[2][1] = 0u; + orig_pixels[2][2] = 255u; + orig_pixels[2][3] = 255u; + // Test color + orig_pixels[3][0] = 16u; + orig_pixels[3][1] = 64u; + orig_pixels[3][2] = 192u; + orig_pixels[3][3] = 224u; + // Do 16 to test asm version. 
+ ARGBColorMatrix(&orig_pixels[0][0], 0, &dst_pixels_opt[0][0], 0, + &kRGBToSepia[0], 16, 1); + EXPECT_EQ(31u, dst_pixels_opt[0][0]); + EXPECT_EQ(43u, dst_pixels_opt[0][1]); + EXPECT_EQ(47u, dst_pixels_opt[0][2]); + EXPECT_EQ(128u, dst_pixels_opt[0][3]); + EXPECT_EQ(135u, dst_pixels_opt[1][0]); + EXPECT_EQ(175u, dst_pixels_opt[1][1]); + EXPECT_EQ(195u, dst_pixels_opt[1][2]); + EXPECT_EQ(0u, dst_pixels_opt[1][3]); + EXPECT_EQ(67u, dst_pixels_opt[2][0]); + EXPECT_EQ(87u, dst_pixels_opt[2][1]); + EXPECT_EQ(99u, dst_pixels_opt[2][2]); + EXPECT_EQ(255u, dst_pixels_opt[2][3]); + EXPECT_EQ(87u, dst_pixels_opt[3][0]); + EXPECT_EQ(112u, dst_pixels_opt[3][1]); + EXPECT_EQ(127u, dst_pixels_opt[3][2]); + EXPECT_EQ(224u, dst_pixels_opt[3][3]); + + for (int i = 0; i < 1280; ++i) { + orig_pixels[i][0] = i; + orig_pixels[i][1] = i / 2; + orig_pixels[i][2] = i / 3; + orig_pixels[i][3] = i; + } + MaskCpuFlags(0); + ARGBColorMatrix(&orig_pixels[0][0], 0, &dst_pixels_c[0][0], 0, + &kRGBToSepia[0], 1280, 1); + MaskCpuFlags(-1); + + for (int i = 0; i < benchmark_pixels_div1280_; ++i) { + ARGBColorMatrix(&orig_pixels[0][0], 0, &dst_pixels_opt[0][0], 0, + &kRGBToSepia[0], 1280, 1); + } + + for (int i = 0; i < 1280; ++i) { + EXPECT_EQ(dst_pixels_c[i][0], dst_pixels_opt[i][0]); + EXPECT_EQ(dst_pixels_c[i][1], dst_pixels_opt[i][1]); + EXPECT_EQ(dst_pixels_c[i][2], dst_pixels_opt[i][2]); + EXPECT_EQ(dst_pixels_c[i][3], dst_pixels_opt[i][3]); + } +} + +TEST_F(libyuvTest, TestRGBColorMatrix) { + SIMD_ALIGNED(uint8 orig_pixels[1280][4]); + + // Matrix for Sepia. + SIMD_ALIGNED(static const int8 kRGBToSepia[]) = { 17, 68, 35, 0, 22, 88, 45, 0, 24, 98, 50, 0, 0, 0, 0, 0, // Unused but makes matrix 16 bytes. }; + memset(orig_pixels, 0, sizeof(orig_pixels)); // Test blue orig_pixels[0][0] = 255u; @@ -509,8 +599,8 @@ TEST_F(libyuvTest, TestARGBColorMatrix) { orig_pixels[3][2] = 192u; orig_pixels[3][3] = 224u; // Do 16 to test asm version. - ARGBColorMatrix(&orig_pixels[0][0], 0, &kARGBToSepia[0], 0, 0, 16, 1); - EXPECT_EQ(33u, orig_pixels[0][0]); + RGBColorMatrix(&orig_pixels[0][0], 0, &kRGBToSepia[0], 0, 0, 16, 1); + EXPECT_EQ(31u, orig_pixels[0][0]); EXPECT_EQ(43u, orig_pixels[0][1]); EXPECT_EQ(47u, orig_pixels[0][2]); EXPECT_EQ(128u, orig_pixels[0][3]); @@ -518,28 +608,28 @@ TEST_F(libyuvTest, TestARGBColorMatrix) { EXPECT_EQ(175u, orig_pixels[1][1]); EXPECT_EQ(195u, orig_pixels[1][2]); EXPECT_EQ(0u, orig_pixels[1][3]); - EXPECT_EQ(69u, orig_pixels[2][0]); - EXPECT_EQ(89u, orig_pixels[2][1]); + EXPECT_EQ(67u, orig_pixels[2][0]); + EXPECT_EQ(87u, orig_pixels[2][1]); EXPECT_EQ(99u, orig_pixels[2][2]); EXPECT_EQ(255u, orig_pixels[2][3]); - EXPECT_EQ(88u, orig_pixels[3][0]); - EXPECT_EQ(114u, orig_pixels[3][1]); + EXPECT_EQ(87u, orig_pixels[3][0]); + EXPECT_EQ(112u, orig_pixels[3][1]); EXPECT_EQ(127u, orig_pixels[3][2]); EXPECT_EQ(224u, orig_pixels[3][3]); - for (int i = 0; i < 256; ++i) { + for (int i = 0; i < 1280; ++i) { orig_pixels[i][0] = i; orig_pixels[i][1] = i / 2; orig_pixels[i][2] = i / 3; orig_pixels[i][3] = i; } - for (int i = 0; i < benchmark_pixels_div256_; ++i) { - ARGBColorMatrix(&orig_pixels[0][0], 0, &kARGBToSepia[0], 0, 0, 256, 1); + for (int i = 0; i < benchmark_pixels_div1280_; ++i) { + RGBColorMatrix(&orig_pixels[0][0], 0, &kRGBToSepia[0], 0, 0, 1280, 1); } } TEST_F(libyuvTest, TestARGBColorTable) { - SIMD_ALIGNED(uint8 orig_pixels[256][4]); + SIMD_ALIGNED(uint8 orig_pixels[1280][4]); memset(orig_pixels, 0, sizeof(orig_pixels)); // Matrix for Sepia. 
@@ -585,67 +675,127 @@ TEST_F(libyuvTest, TestARGBColorTable) { EXPECT_EQ(11u, orig_pixels[3][2]); EXPECT_EQ(16u, orig_pixels[3][3]); - for (int i = 0; i < 256; ++i) { + for (int i = 0; i < 1280; ++i) { + orig_pixels[i][0] = i; + orig_pixels[i][1] = i / 2; + orig_pixels[i][2] = i / 3; + orig_pixels[i][3] = i; + } + for (int i = 0; i < benchmark_pixels_div1280_; ++i) { + ARGBColorTable(&orig_pixels[0][0], 0, &kARGBTable[0], 0, 0, 1280, 1); + } +} + +// Same as TestARGBColorTable except alpha does not change. +TEST_F(libyuvTest, TestRGBColorTable) { + SIMD_ALIGNED(uint8 orig_pixels[1280][4]); + memset(orig_pixels, 0, sizeof(orig_pixels)); + + // Matrix for Sepia. + static const uint8 kARGBTable[256 * 4] = { + 1u, 2u, 3u, 4u, + 5u, 6u, 7u, 8u, + 9u, 10u, 11u, 12u, + 13u, 14u, 15u, 16u, + }; + + orig_pixels[0][0] = 0u; + orig_pixels[0][1] = 0u; + orig_pixels[0][2] = 0u; + orig_pixels[0][3] = 0u; + orig_pixels[1][0] = 1u; + orig_pixels[1][1] = 1u; + orig_pixels[1][2] = 1u; + orig_pixels[1][3] = 1u; + orig_pixels[2][0] = 2u; + orig_pixels[2][1] = 2u; + orig_pixels[2][2] = 2u; + orig_pixels[2][3] = 2u; + orig_pixels[3][0] = 0u; + orig_pixels[3][1] = 1u; + orig_pixels[3][2] = 2u; + orig_pixels[3][3] = 3u; + // Do 16 to test asm version. + RGBColorTable(&orig_pixels[0][0], 0, &kARGBTable[0], 0, 0, 16, 1); + EXPECT_EQ(1u, orig_pixels[0][0]); + EXPECT_EQ(2u, orig_pixels[0][1]); + EXPECT_EQ(3u, orig_pixels[0][2]); + EXPECT_EQ(0u, orig_pixels[0][3]); // Alpha unchanged. + EXPECT_EQ(5u, orig_pixels[1][0]); + EXPECT_EQ(6u, orig_pixels[1][1]); + EXPECT_EQ(7u, orig_pixels[1][2]); + EXPECT_EQ(1u, orig_pixels[1][3]); // Alpha unchanged. + EXPECT_EQ(9u, orig_pixels[2][0]); + EXPECT_EQ(10u, orig_pixels[2][1]); + EXPECT_EQ(11u, orig_pixels[2][2]); + EXPECT_EQ(2u, orig_pixels[2][3]); // Alpha unchanged. + EXPECT_EQ(1u, orig_pixels[3][0]); + EXPECT_EQ(6u, orig_pixels[3][1]); + EXPECT_EQ(11u, orig_pixels[3][2]); + EXPECT_EQ(3u, orig_pixels[3][3]); // Alpha unchanged. 
+ + for (int i = 0; i < 1280; ++i) { orig_pixels[i][0] = i; orig_pixels[i][1] = i / 2; orig_pixels[i][2] = i / 3; orig_pixels[i][3] = i; } - for (int i = 0; i < benchmark_pixels_div256_; ++i) { - ARGBColorTable(&orig_pixels[0][0], 0, &kARGBTable[0], 0, 0, 256, 1); + for (int i = 0; i < benchmark_pixels_div1280_; ++i) { + RGBColorTable(&orig_pixels[0][0], 0, &kARGBTable[0], 0, 0, 1280, 1); } } TEST_F(libyuvTest, TestARGBQuantize) { - SIMD_ALIGNED(uint8 orig_pixels[256][4]); + SIMD_ALIGNED(uint8 orig_pixels[1280][4]); - for (int i = 0; i < 256; ++i) { + for (int i = 0; i < 1280; ++i) { orig_pixels[i][0] = i; orig_pixels[i][1] = i / 2; orig_pixels[i][2] = i / 3; orig_pixels[i][3] = i; } ARGBQuantize(&orig_pixels[0][0], 0, - (65536 + (8 / 2)) / 8, 8, 8 / 2, 0, 0, 256, 1); + (65536 + (8 / 2)) / 8, 8, 8 / 2, 0, 0, 1280, 1); - for (int i = 0; i < 256; ++i) { - EXPECT_EQ(i / 8 * 8 + 8 / 2, orig_pixels[i][0]); - EXPECT_EQ(i / 2 / 8 * 8 + 8 / 2, orig_pixels[i][1]); - EXPECT_EQ(i / 3 / 8 * 8 + 8 / 2, orig_pixels[i][2]); - EXPECT_EQ(i, orig_pixels[i][3]); + for (int i = 0; i < 1280; ++i) { + EXPECT_EQ((i / 8 * 8 + 8 / 2) & 255, orig_pixels[i][0]); + EXPECT_EQ((i / 2 / 8 * 8 + 8 / 2) & 255, orig_pixels[i][1]); + EXPECT_EQ((i / 3 / 8 * 8 + 8 / 2) & 255, orig_pixels[i][2]); + EXPECT_EQ(i & 255, orig_pixels[i][3]); } - for (int i = 0; i < benchmark_pixels_div256_; ++i) { + for (int i = 0; i < benchmark_pixels_div1280_; ++i) { ARGBQuantize(&orig_pixels[0][0], 0, - (65536 + (8 / 2)) / 8, 8, 8 / 2, 0, 0, 256, 1); + (65536 + (8 / 2)) / 8, 8, 8 / 2, 0, 0, 1280, 1); } } TEST_F(libyuvTest, TestARGBMirror) { - SIMD_ALIGNED(uint8 orig_pixels[256][4]); - SIMD_ALIGNED(uint8 dst_pixels[256][4]); + SIMD_ALIGNED(uint8 orig_pixels[1280][4]); + SIMD_ALIGNED(uint8 dst_pixels[1280][4]); - for (int i = 0; i < 256; ++i) { + for (int i = 0; i < 1280; ++i) { orig_pixels[i][0] = i; orig_pixels[i][1] = i / 2; orig_pixels[i][2] = i / 3; orig_pixels[i][3] = i / 4; } - ARGBMirror(&orig_pixels[0][0], 0, &dst_pixels[0][0], 0, 256, 1); + ARGBMirror(&orig_pixels[0][0], 0, &dst_pixels[0][0], 0, 1280, 1); - for (int i = 0; i < 256; ++i) { - EXPECT_EQ(i, dst_pixels[255 - i][0]); - EXPECT_EQ(i / 2, dst_pixels[255 - i][1]); - EXPECT_EQ(i / 3, dst_pixels[255 - i][2]); - EXPECT_EQ(i / 4, dst_pixels[255 - i][3]); + for (int i = 0; i < 1280; ++i) { + EXPECT_EQ(i & 255, dst_pixels[1280 - 1 - i][0]); + EXPECT_EQ((i / 2) & 255, dst_pixels[1280 - 1 - i][1]); + EXPECT_EQ((i / 3) & 255, dst_pixels[1280 - 1 - i][2]); + EXPECT_EQ((i / 4) & 255, dst_pixels[1280 - 1 - i][3]); } - for (int i = 0; i < benchmark_pixels_div256_; ++i) { - ARGBMirror(&orig_pixels[0][0], 0, &dst_pixels[0][0], 0, 256, 1); + for (int i = 0; i < benchmark_pixels_div1280_; ++i) { + ARGBMirror(&orig_pixels[0][0], 0, &dst_pixels[0][0], 0, 1280, 1); } } TEST_F(libyuvTest, TestShade) { - SIMD_ALIGNED(uint8 orig_pixels[256][4]); - SIMD_ALIGNED(uint8 shade_pixels[256][4]); + SIMD_ALIGNED(uint8 orig_pixels[1280][4]); + SIMD_ALIGNED(uint8 shade_pixels[1280][4]); + memset(orig_pixels, 0, sizeof(orig_pixels)); orig_pixels[0][0] = 10u; orig_pixels[0][1] = 20u; @@ -694,16 +844,18 @@ TEST_F(libyuvTest, TestShade) { EXPECT_EQ(5u, shade_pixels[0][2]); EXPECT_EQ(5u, shade_pixels[0][3]); - for (int i = 0; i < benchmark_pixels_div256_; ++i) { - ARGBShade(&orig_pixels[0][0], 0, &shade_pixels[0][0], 0, 256, 1, + for (int i = 0; i < benchmark_pixels_div1280_; ++i) { + ARGBShade(&orig_pixels[0][0], 0, &shade_pixels[0][0], 0, 1280, 1, 0x80808080); } } TEST_F(libyuvTest, TestInterpolate) { - 
SIMD_ALIGNED(uint8 orig_pixels_0[256][4]); - SIMD_ALIGNED(uint8 orig_pixels_1[256][4]); - SIMD_ALIGNED(uint8 interpolate_pixels[256][4]); + SIMD_ALIGNED(uint8 orig_pixels_0[1280][4]); + SIMD_ALIGNED(uint8 orig_pixels_1[1280][4]); + SIMD_ALIGNED(uint8 interpolate_pixels[1280][4]); + memset(orig_pixels_0, 0, sizeof(orig_pixels_0)); + memset(orig_pixels_1, 0, sizeof(orig_pixels_1)); orig_pixels_0[0][0] = 16u; orig_pixels_0[0][1] = 32u; @@ -773,9 +925,9 @@ TEST_F(libyuvTest, TestInterpolate) { EXPECT_EQ(16u, interpolate_pixels[0][2]); EXPECT_EQ(32u, interpolate_pixels[0][3]); - for (int i = 0; i < benchmark_pixels_div256_; ++i) { + for (int i = 0; i < benchmark_pixels_div1280_; ++i) { ARGBInterpolate(&orig_pixels_0[0][0], 0, &orig_pixels_1[0][0], 0, - &interpolate_pixels[0][0], 0, 256, 1, 128); + &interpolate_pixels[0][0], 0, 1280, 1, 128); } } @@ -841,7 +993,6 @@ TESTINTERPOLATE(64) TESTINTERPOLATE(128) TESTINTERPOLATE(192) TESTINTERPOLATE(255) -TESTINTERPOLATE(85) static int TestBlend(int width, int height, int benchmark_iterations, int invert, int off) { @@ -919,10 +1070,10 @@ TEST_F(libyuvTest, ARGBBlend_Opt) { } TEST_F(libyuvTest, TestAffine) { - SIMD_ALIGNED(uint8 orig_pixels_0[256][4]); - SIMD_ALIGNED(uint8 interpolate_pixels_C[256][4]); + SIMD_ALIGNED(uint8 orig_pixels_0[1280][4]); + SIMD_ALIGNED(uint8 interpolate_pixels_C[1280][4]); - for (int i = 0; i < 256; ++i) { + for (int i = 0; i < 1280; ++i) { for (int j = 0; j < 4; ++j) { orig_pixels_0[i][j] = i; } @@ -931,42 +1082,42 @@ TEST_F(libyuvTest, TestAffine) { float uv_step[4] = { 0.f, 0.f, 0.75f, 0.f }; ARGBAffineRow_C(&orig_pixels_0[0][0], 0, &interpolate_pixels_C[0][0], - uv_step, 256); + uv_step, 1280); EXPECT_EQ(0u, interpolate_pixels_C[0][0]); EXPECT_EQ(96u, interpolate_pixels_C[128][0]); EXPECT_EQ(191u, interpolate_pixels_C[255][3]); #if defined(HAS_ARGBAFFINEROW_SSE2) - SIMD_ALIGNED(uint8 interpolate_pixels_Opt[256][4]); + SIMD_ALIGNED(uint8 interpolate_pixels_Opt[1280][4]); ARGBAffineRow_SSE2(&orig_pixels_0[0][0], 0, &interpolate_pixels_Opt[0][0], - uv_step, 256); - EXPECT_EQ(0, memcmp(interpolate_pixels_Opt, interpolate_pixels_C, 256 * 4)); + uv_step, 1280); + EXPECT_EQ(0, memcmp(interpolate_pixels_Opt, interpolate_pixels_C, 1280 * 4)); int has_sse2 = TestCpuFlag(kCpuHasSSE2); if (has_sse2) { - for (int i = 0; i < benchmark_pixels_div256_; ++i) { + for (int i = 0; i < benchmark_pixels_div1280_; ++i) { ARGBAffineRow_SSE2(&orig_pixels_0[0][0], 0, &interpolate_pixels_Opt[0][0], - uv_step, 256); + uv_step, 1280); } } #endif } TEST_F(libyuvTest, TestSobelX) { - SIMD_ALIGNED(uint8 orig_pixels_0[256 + 2]); - SIMD_ALIGNED(uint8 orig_pixels_1[256 + 2]); - SIMD_ALIGNED(uint8 orig_pixels_2[256 + 2]); - SIMD_ALIGNED(uint8 sobel_pixels_c[256]); - SIMD_ALIGNED(uint8 sobel_pixels_opt[256]); + SIMD_ALIGNED(uint8 orig_pixels_0[1280 + 2]); + SIMD_ALIGNED(uint8 orig_pixels_1[1280 + 2]); + SIMD_ALIGNED(uint8 orig_pixels_2[1280 + 2]); + SIMD_ALIGNED(uint8 sobel_pixels_c[1280]); + SIMD_ALIGNED(uint8 sobel_pixels_opt[1280]); - for (int i = 0; i < 256 + 2; ++i) { + for (int i = 0; i < 1280 + 2; ++i) { orig_pixels_0[i] = i; orig_pixels_1[i] = i * 2; orig_pixels_2[i] = i * 3; } SobelXRow_C(orig_pixels_0, orig_pixels_1, orig_pixels_2, - sobel_pixels_c, 256); + sobel_pixels_c, 1280); EXPECT_EQ(16u, sobel_pixels_c[0]); EXPECT_EQ(16u, sobel_pixels_c[100]); @@ -975,9 +1126,9 @@ TEST_F(libyuvTest, TestSobelX) { void (*SobelXRow)(const uint8* src_y0, const uint8* src_y1, const uint8* src_y2, uint8* dst_sobely, int width) = SobelXRow_C; -#if 
defined(HAS_SOBELXROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - SobelXRow = SobelXRow_SSSE3; +#if defined(HAS_SOBELXROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + SobelXRow = SobelXRow_SSE2; } #endif #if defined(HAS_SOBELXROW_NEON) @@ -985,36 +1136,36 @@ TEST_F(libyuvTest, TestSobelX) { SobelXRow = SobelXRow_NEON; } #endif - for (int i = 0; i < benchmark_pixels_div256_; ++i) { + for (int i = 0; i < benchmark_pixels_div1280_; ++i) { SobelXRow(orig_pixels_0, orig_pixels_1, orig_pixels_2, - sobel_pixels_opt, 256); + sobel_pixels_opt, 1280); } - for (int i = 0; i < 256; ++i) { - EXPECT_EQ(sobel_pixels_opt[i], sobel_pixels_c[i]); + for (int i = 0; i < 1280; ++i) { + EXPECT_EQ(sobel_pixels_c[i], sobel_pixels_opt[i]); } } TEST_F(libyuvTest, TestSobelY) { - SIMD_ALIGNED(uint8 orig_pixels_0[256 + 2]); - SIMD_ALIGNED(uint8 orig_pixels_1[256 + 2]); - SIMD_ALIGNED(uint8 sobel_pixels_c[256]); - SIMD_ALIGNED(uint8 sobel_pixels_opt[256]); + SIMD_ALIGNED(uint8 orig_pixels_0[1280 + 2]); + SIMD_ALIGNED(uint8 orig_pixels_1[1280 + 2]); + SIMD_ALIGNED(uint8 sobel_pixels_c[1280]); + SIMD_ALIGNED(uint8 sobel_pixels_opt[1280]); - for (int i = 0; i < 256 + 2; ++i) { + for (int i = 0; i < 1280 + 2; ++i) { orig_pixels_0[i] = i; orig_pixels_1[i] = i * 2; } - SobelYRow_C(orig_pixels_0, orig_pixels_1, sobel_pixels_c, 256); + SobelYRow_C(orig_pixels_0, orig_pixels_1, sobel_pixels_c, 1280); EXPECT_EQ(4u, sobel_pixels_c[0]); EXPECT_EQ(255u, sobel_pixels_c[100]); EXPECT_EQ(0u, sobel_pixels_c[255]); void (*SobelYRow)(const uint8* src_y0, const uint8* src_y1, uint8* dst_sobely, int width) = SobelYRow_C; -#if defined(HAS_SOBELYROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - SobelYRow = SobelYRow_SSSE3; +#if defined(HAS_SOBELYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + SobelYRow = SobelYRow_SSE2; } #endif #if defined(HAS_SOBELYROW_NEON) @@ -1022,26 +1173,26 @@ TEST_F(libyuvTest, TestSobelY) { SobelYRow = SobelYRow_NEON; } #endif - for (int i = 0; i < benchmark_pixels_div256_; ++i) { - SobelYRow(orig_pixels_0, orig_pixels_1, sobel_pixels_opt, 256); + for (int i = 0; i < benchmark_pixels_div1280_; ++i) { + SobelYRow(orig_pixels_0, orig_pixels_1, sobel_pixels_opt, 1280); } - for (int i = 0; i < 256; ++i) { - EXPECT_EQ(sobel_pixels_opt[i], sobel_pixels_c[i]); + for (int i = 0; i < 1280; ++i) { + EXPECT_EQ(sobel_pixels_c[i], sobel_pixels_opt[i]); } } TEST_F(libyuvTest, TestSobel) { - SIMD_ALIGNED(uint8 orig_sobelx[256]); - SIMD_ALIGNED(uint8 orig_sobely[256]); - SIMD_ALIGNED(uint8 sobel_pixels_c[256 * 4]); - SIMD_ALIGNED(uint8 sobel_pixels_opt[256 * 4]); + SIMD_ALIGNED(uint8 orig_sobelx[1280]); + SIMD_ALIGNED(uint8 orig_sobely[1280]); + SIMD_ALIGNED(uint8 sobel_pixels_c[1280 * 4]); + SIMD_ALIGNED(uint8 sobel_pixels_opt[1280 * 4]); - for (int i = 0; i < 256; ++i) { + for (int i = 0; i < 1280; ++i) { orig_sobelx[i] = i; orig_sobely[i] = i * 2; } - SobelRow_C(orig_sobelx, orig_sobely, sobel_pixels_c, 256); + SobelRow_C(orig_sobelx, orig_sobely, sobel_pixels_c, 1280); EXPECT_EQ(0u, sobel_pixels_c[0]); EXPECT_EQ(3u, sobel_pixels_c[4]); @@ -1066,26 +1217,64 @@ TEST_F(libyuvTest, TestSobel) { SobelRow = SobelRow_NEON; } #endif - for (int i = 0; i < benchmark_pixels_div256_; ++i) { - SobelRow(orig_sobelx, orig_sobely, sobel_pixels_opt, 256); + for (int i = 0; i < benchmark_pixels_div1280_; ++i) { + SobelRow(orig_sobelx, orig_sobely, sobel_pixels_opt, 1280); } - for (int i = 0; i < 16; ++i) { - EXPECT_EQ(sobel_pixels_opt[i], sobel_pixels_c[i]); + for (int i = 0; i < 1280 * 4; ++i) { + EXPECT_EQ(sobel_pixels_c[i], sobel_pixels_opt[i]); + 
} +} + +TEST_F(libyuvTest, TestSobelToPlane) { + SIMD_ALIGNED(uint8 orig_sobelx[1280]); + SIMD_ALIGNED(uint8 orig_sobely[1280]); + SIMD_ALIGNED(uint8 sobel_pixels_c[1280]); + SIMD_ALIGNED(uint8 sobel_pixels_opt[1280]); + + for (int i = 0; i < 1280; ++i) { + orig_sobelx[i] = i; + orig_sobely[i] = i * 2; + } + + SobelToPlaneRow_C(orig_sobelx, orig_sobely, sobel_pixels_c, 1280); + + EXPECT_EQ(0u, sobel_pixels_c[0]); + EXPECT_EQ(3u, sobel_pixels_c[1]); + EXPECT_EQ(6u, sobel_pixels_c[2]); + EXPECT_EQ(99u, sobel_pixels_c[33]); + EXPECT_EQ(255u, sobel_pixels_c[100]); + void (*SobelToPlaneRow)(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_y, int width) = SobelToPlaneRow_C; +#if defined(HAS_SOBELTOPLANEROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + SobelToPlaneRow = SobelToPlaneRow_SSE2; + } +#endif +#if defined(HAS_SOBELTOPLANEROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + SobelToPlaneRow = SobelToPlaneRow_NEON; + } +#endif + for (int i = 0; i < benchmark_pixels_div1280_; ++i) { + SobelToPlaneRow(orig_sobelx, orig_sobely, sobel_pixels_opt, 1280); + } + for (int i = 0; i < 1280; ++i) { + EXPECT_EQ(sobel_pixels_c[i], sobel_pixels_opt[i]); } } TEST_F(libyuvTest, TestSobelXY) { - SIMD_ALIGNED(uint8 orig_sobelx[256]); - SIMD_ALIGNED(uint8 orig_sobely[256]); - SIMD_ALIGNED(uint8 sobel_pixels_c[256 * 4]); - SIMD_ALIGNED(uint8 sobel_pixels_opt[256 * 4]); + SIMD_ALIGNED(uint8 orig_sobelx[1280]); + SIMD_ALIGNED(uint8 orig_sobely[1280]); + SIMD_ALIGNED(uint8 sobel_pixels_c[1280 * 4]); + SIMD_ALIGNED(uint8 sobel_pixels_opt[1280 * 4]); - for (int i = 0; i < 256; ++i) { + for (int i = 0; i < 1280; ++i) { orig_sobelx[i] = i; orig_sobely[i] = i * 2; } - SobelXYRow_C(orig_sobelx, orig_sobely, sobel_pixels_c, 256); + SobelXYRow_C(orig_sobelx, orig_sobely, sobel_pixels_c, 1280); EXPECT_EQ(0u, sobel_pixels_c[0]); EXPECT_EQ(2u, sobel_pixels_c[4]); @@ -1106,11 +1295,11 @@ TEST_F(libyuvTest, TestSobelXY) { SobelXYRow = SobelXYRow_NEON; } #endif - for (int i = 0; i < benchmark_pixels_div256_; ++i) { - SobelXYRow(orig_sobelx, orig_sobely, sobel_pixels_opt, 256); + for (int i = 0; i < benchmark_pixels_div1280_; ++i) { + SobelXYRow(orig_sobelx, orig_sobely, sobel_pixels_opt, 1280); } - for (int i = 0; i < 16; ++i) { - EXPECT_EQ(sobel_pixels_opt[i], sobel_pixels_c[i]); + for (int i = 0; i < 1280 * 4; ++i) { + EXPECT_EQ(sobel_pixels_c[i], sobel_pixels_opt[i]); } } @@ -1165,8 +1354,6 @@ TEST_F(libyuvTest, TestCopyPlane) { CopyPlane(orig_y + y_off, y_st, dst_opt + y_off, stride, yw, yh); } opt_time = (get_time() - opt_time) / benchmark_iterations_; - printf(" %8d us C - %8d us OPT\n", - static_cast<int>(c_time * 1e6), static_cast<int>(opt_time * 1e6)); for (i = 0; i < y_plane_size; ++i) { if (dst_c[i] != dst_opt[i]) @@ -1403,6 +1590,7 @@ static int TestSobel(int width, int height, int benchmark_iterations, align_buffer_64(src_argb_a, kStride * height + off); align_buffer_64(dst_argb_c, kStride * height); align_buffer_64(dst_argb_opt, kStride * height); + memset(src_argb_a, 0, kStride * height + off); srandom(time(NULL)); for (int i = 0; i < kStride * height; ++i) { src_argb_a[i + off] = (random() & 0xff); @@ -1459,6 +1647,75 @@ TEST_F(libyuvTest, ARGBSobel_Opt) { EXPECT_EQ(0, max_diff); } +static int TestSobelToPlane(int width, int height, int benchmark_iterations, + int invert, int off) { + if (width < 1) { + width = 1; + } + const int kSrcBpp = 4; + const int kDstBpp = 1; + const int kSrcStride = (width * kSrcBpp + 15) & ~15; + const int kDstStride = (width * kDstBpp + 15) & ~15; + align_buffer_64(src_argb_a, 
kSrcStride * height + off); + align_buffer_64(dst_argb_c, kDstStride * height); + align_buffer_64(dst_argb_opt, kDstStride * height); + memset(src_argb_a, 0, kSrcStride * height + off); + srandom(time(NULL)); + for (int i = 0; i < kSrcStride * height; ++i) { + src_argb_a[i + off] = (random() & 0xff); + } + memset(dst_argb_c, 0, kDstStride * height); + memset(dst_argb_opt, 0, kDstStride * height); + + MaskCpuFlags(0); + ARGBSobelToPlane(src_argb_a + off, kSrcStride, + dst_argb_c, kDstStride, + width, invert * height); + MaskCpuFlags(-1); + for (int i = 0; i < benchmark_iterations; ++i) { + ARGBSobelToPlane(src_argb_a + off, kSrcStride, + dst_argb_opt, kDstStride, + width, invert * height); + } + int max_diff = 0; + for (int i = 0; i < kDstStride * height; ++i) { + int abs_diff = + abs(static_cast<int>(dst_argb_c[i]) - + static_cast<int>(dst_argb_opt[i])); + if (abs_diff > max_diff) { + max_diff = abs_diff; + } + } + free_aligned_buffer_64(src_argb_a) + free_aligned_buffer_64(dst_argb_c) + free_aligned_buffer_64(dst_argb_opt) + return max_diff; +} + +TEST_F(libyuvTest, ARGBSobelToPlane_Any) { + int max_diff = TestSobelToPlane(benchmark_width_ - 1, benchmark_height_, + benchmark_iterations_, +1, 0); + EXPECT_EQ(0, max_diff); +} + +TEST_F(libyuvTest, ARGBSobelToPlane_Unaligned) { + int max_diff = TestSobelToPlane(benchmark_width_, benchmark_height_, + benchmark_iterations_, +1, 1); + EXPECT_EQ(0, max_diff); +} + +TEST_F(libyuvTest, ARGBSobelToPlane_Invert) { + int max_diff = TestSobelToPlane(benchmark_width_, benchmark_height_, + benchmark_iterations_, -1, 0); + EXPECT_EQ(0, max_diff); +} + +TEST_F(libyuvTest, ARGBSobelToPlane_Opt) { + int max_diff = TestSobelToPlane(benchmark_width_, benchmark_height_, + benchmark_iterations_, +1, 0); + EXPECT_EQ(0, max_diff); +} + static int TestSobelXY(int width, int height, int benchmark_iterations, int invert, int off) { if (width < 1) { @@ -1469,6 +1726,7 @@ static int TestSobelXY(int width, int height, int benchmark_iterations, align_buffer_64(src_argb_a, kStride * height + off); align_buffer_64(dst_argb_c, kStride * height); align_buffer_64(dst_argb_opt, kStride * height); + memset(src_argb_a, 0, kStride * height + off); srandom(time(NULL)); for (int i = 0; i < kStride * height; ++i) { src_argb_a[i + off] = (random() & 0xff); @@ -1525,4 +1783,326 @@ TEST_F(libyuvTest, ARGBSobelXY_Opt) { EXPECT_EQ(0, max_diff); } +static int TestBlur(int width, int height, int benchmark_iterations, + int invert, int off, int radius) { + if (width < 1) { + width = 1; + } + const int kBpp = 4; + const int kStride = (width * kBpp + 15) & ~15; + align_buffer_64(src_argb_a, kStride * height + off); + align_buffer_64(dst_cumsum, width * height * 16); + align_buffer_64(dst_argb_c, kStride * height); + align_buffer_64(dst_argb_opt, kStride * height); + srandom(time(NULL)); + for (int i = 0; i < kStride * height; ++i) { + src_argb_a[i + off] = (random() & 0xff); + } + memset(dst_cumsum, 0, width * height * 16); + memset(dst_argb_c, 0, kStride * height); + memset(dst_argb_opt, 0, kStride * height); + + MaskCpuFlags(0); + ARGBBlur(src_argb_a + off, kStride, + dst_argb_c, kStride, + reinterpret_cast<int32*>(dst_cumsum), width * 4, + width, invert * height, radius); + MaskCpuFlags(-1); + for (int i = 0; i < benchmark_iterations; ++i) { + ARGBBlur(src_argb_a + off, kStride, + dst_argb_opt, kStride, + reinterpret_cast<int32*>(dst_cumsum), width * 4, + width, invert * height, radius); + } + int max_diff = 0; + for (int i = 0; i < kStride * height; ++i) { + int abs_diff = + 
abs(static_cast<int>(dst_argb_c[i]) - + static_cast<int>(dst_argb_opt[i])); + if (abs_diff > max_diff) { + max_diff = abs_diff; + } + } + free_aligned_buffer_64(src_argb_a) + free_aligned_buffer_64(dst_cumsum) + free_aligned_buffer_64(dst_argb_c) + free_aligned_buffer_64(dst_argb_opt) + return max_diff; +} + +static const int kBlurSize = 55; +TEST_F(libyuvTest, ARGBBlur_Any) { + int max_diff = TestBlur(benchmark_width_ - 1, benchmark_height_, + benchmark_iterations_, +1, 0, kBlurSize); + EXPECT_LE(max_diff, 1); +} + +TEST_F(libyuvTest, ARGBBlur_Unaligned) { + int max_diff = TestBlur(benchmark_width_, benchmark_height_, + benchmark_iterations_, +1, 1, kBlurSize); + EXPECT_LE(max_diff, 1); +} + +TEST_F(libyuvTest, ARGBBlur_Invert) { + int max_diff = TestBlur(benchmark_width_, benchmark_height_, + benchmark_iterations_, -1, 0, kBlurSize); + EXPECT_LE(max_diff, 1); +} + +TEST_F(libyuvTest, ARGBBlur_Opt) { + int max_diff = TestBlur(benchmark_width_, benchmark_height_, + benchmark_iterations_, +1, 0, kBlurSize); + EXPECT_LE(max_diff, 1); +} + +static const int kBlurSmallSize = 5; +TEST_F(libyuvTest, ARGBBlurSmall_Any) { + int max_diff = TestBlur(benchmark_width_ - 1, benchmark_height_, + benchmark_iterations_, +1, 0, kBlurSmallSize); + EXPECT_LE(max_diff, 1); +} + +TEST_F(libyuvTest, ARGBBlurSmall_Unaligned) { + int max_diff = TestBlur(benchmark_width_, benchmark_height_, + benchmark_iterations_, +1, 1, kBlurSmallSize); + EXPECT_LE(max_diff, 1); +} + +TEST_F(libyuvTest, ARGBBlurSmall_Invert) { + int max_diff = TestBlur(benchmark_width_, benchmark_height_, + benchmark_iterations_, -1, 0, kBlurSmallSize); + EXPECT_LE(max_diff, 1); +} + +TEST_F(libyuvTest, ARGBBlurSmall_Opt) { + int max_diff = TestBlur(benchmark_width_, benchmark_height_, + benchmark_iterations_, +1, 0, kBlurSmallSize); + EXPECT_LE(max_diff, 1); +} + +TEST_F(libyuvTest, TestARGBPolynomial) { + SIMD_ALIGNED(uint8 orig_pixels[1280][4]); + SIMD_ALIGNED(uint8 dst_pixels_opt[1280][4]); + SIMD_ALIGNED(uint8 dst_pixels_c[1280][4]); + memset(orig_pixels, 0, sizeof(orig_pixels)); + + SIMD_ALIGNED(static const float kWarmifyPolynomial[16]) = { + 0.94230f, -3.03300f, -2.92500f, 0.f, // C0 + 0.584500f, 1.112000f, 1.535000f, 1.f, // C1 x + 0.001313f, -0.002503f, -0.004496f, 0.f, // C2 x * x + 0.0f, 0.000006965f, 0.000008781f, 0.f, // C3 x * x * x + }; + + // Test blue + orig_pixels[0][0] = 255u; + orig_pixels[0][1] = 0u; + orig_pixels[0][2] = 0u; + orig_pixels[0][3] = 128u; + // Test green + orig_pixels[1][0] = 0u; + orig_pixels[1][1] = 255u; + orig_pixels[1][2] = 0u; + orig_pixels[1][3] = 0u; + // Test red + orig_pixels[2][0] = 0u; + orig_pixels[2][1] = 0u; + orig_pixels[2][2] = 255u; + orig_pixels[2][3] = 255u; + // Test white + orig_pixels[3][0] = 255u; + orig_pixels[3][1] = 255u; + orig_pixels[3][2] = 255u; + orig_pixels[3][3] = 255u; + // Test color + orig_pixels[4][0] = 16u; + orig_pixels[4][1] = 64u; + orig_pixels[4][2] = 192u; + orig_pixels[4][3] = 224u; + // Do 16 to test asm version. 
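The expectations that follow can be reproduced by hand from kWarmifyPolynomial if its four rows are read as C0..C3 for the B, G, R and A columns and the result is clamped to [0, 255] and truncated. A small sketch under that assumption; WarmifyChannel is a hypothetical helper, not a libyuv function:

#include <cstdio>

// Hypothetical helper mirroring the per-channel cubic:
//   v' = C0 + C1*v + C2*v*v + C3*v*v*v, clamped to [0, 255], truncated.
static unsigned WarmifyChannel(double v, double c0, double c1,
                               double c2, double c3) {
  double r = c0 + c1 * v + c2 * v * v + c3 * v * v * v;
  if (r < 0.0) r = 0.0;
  if (r > 255.0) r = 255.0;
  return static_cast<unsigned>(r);  // truncation, not rounding
}

int main() {
  // Blue column of kWarmifyPolynomial applied to the pure-blue test
  // pixel (v = 255): prints 235, the value the test expects.
  printf("%u\n", WarmifyChannel(255.0, 0.94230, 0.584500, 0.001313, 0.0));
  return 0;
}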
+ ARGBPolynomial(&orig_pixels[0][0], 0, &dst_pixels_opt[0][0], 0, + &kWarmifyPolynomial[0], 16, 1); + EXPECT_EQ(235u, dst_pixels_opt[0][0]); + EXPECT_EQ(0u, dst_pixels_opt[0][1]); + EXPECT_EQ(0u, dst_pixels_opt[0][2]); + EXPECT_EQ(128u, dst_pixels_opt[0][3]); + EXPECT_EQ(0u, dst_pixels_opt[1][0]); + EXPECT_EQ(233u, dst_pixels_opt[1][1]); + EXPECT_EQ(0u, dst_pixels_opt[1][2]); + EXPECT_EQ(0u, dst_pixels_opt[1][3]); + EXPECT_EQ(0u, dst_pixels_opt[2][0]); + EXPECT_EQ(0u, dst_pixels_opt[2][1]); + EXPECT_EQ(241u, dst_pixels_opt[2][2]); + EXPECT_EQ(255u, dst_pixels_opt[2][3]); + EXPECT_EQ(235u, dst_pixels_opt[3][0]); + EXPECT_EQ(233u, dst_pixels_opt[3][1]); + EXPECT_EQ(241u, dst_pixels_opt[3][2]); + EXPECT_EQ(255u, dst_pixels_opt[3][3]); + EXPECT_EQ(10u, dst_pixels_opt[4][0]); + EXPECT_EQ(59u, dst_pixels_opt[4][1]); + EXPECT_EQ(188u, dst_pixels_opt[4][2]); + EXPECT_EQ(224u, dst_pixels_opt[4][3]); + + for (int i = 0; i < 1280; ++i) { + orig_pixels[i][0] = i; + orig_pixels[i][1] = i / 2; + orig_pixels[i][2] = i / 3; + orig_pixels[i][3] = i; + } + + MaskCpuFlags(0); + ARGBPolynomial(&orig_pixels[0][0], 0, &dst_pixels_c[0][0], 0, + &kWarmifyPolynomial[0], 1280, 1); + MaskCpuFlags(-1); + + for (int i = 0; i < benchmark_pixels_div1280_; ++i) { + ARGBPolynomial(&orig_pixels[0][0], 0, &dst_pixels_opt[0][0], 0, + &kWarmifyPolynomial[0], 1280, 1); + } + + for (int i = 0; i < 1280; ++i) { + EXPECT_EQ(dst_pixels_c[i][0], dst_pixels_opt[i][0]); + EXPECT_EQ(dst_pixels_c[i][1], dst_pixels_opt[i][1]); + EXPECT_EQ(dst_pixels_c[i][2], dst_pixels_opt[i][2]); + EXPECT_EQ(dst_pixels_c[i][3], dst_pixels_opt[i][3]); + } +} + +TEST_F(libyuvTest, TestARGBLumaColorTable) { + SIMD_ALIGNED(uint8 orig_pixels[1280][4]); + SIMD_ALIGNED(uint8 dst_pixels_opt[1280][4]); + SIMD_ALIGNED(uint8 dst_pixels_c[1280][4]); + memset(orig_pixels, 0, sizeof(orig_pixels)); + + align_buffer_64(lumacolortable, 32768); + int v = 0; + for (int i = 0; i < 32768; ++i) { + lumacolortable[i] = v; + v += 3; + } + // Test blue + orig_pixels[0][0] = 255u; + orig_pixels[0][1] = 0u; + orig_pixels[0][2] = 0u; + orig_pixels[0][3] = 128u; + // Test green + orig_pixels[1][0] = 0u; + orig_pixels[1][1] = 255u; + orig_pixels[1][2] = 0u; + orig_pixels[1][3] = 0u; + // Test red + orig_pixels[2][0] = 0u; + orig_pixels[2][1] = 0u; + orig_pixels[2][2] = 255u; + orig_pixels[2][3] = 255u; + // Test color + orig_pixels[3][0] = 16u; + orig_pixels[3][1] = 64u; + orig_pixels[3][2] = 192u; + orig_pixels[3][3] = 224u; + // Do 16 to test asm version. 
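The expected values below (253, 48, 192, 64) follow from the ramp table itself: it stores the low byte of 3 * index, so for any 256-aligned block, entry base + c reduces to (3 * c) & 255 no matter which block the luma selects. The block-based addressing is an inference from these expectations rather than a statement about the libyuv implementation; a standalone check of the table arithmetic:

#include <cassert>
#include <cstdint>

int main() {
  // The test fills a 32768-entry table with the low byte of 3 * index.
  uint8_t table[32768];
  int v = 0;
  for (int i = 0; i < 32768; ++i) {
    table[i] = static_cast<uint8_t>(v);
    v += 3;
  }
  // For any 256-aligned block base, 3 * base is a multiple of 256, so the
  // looked-up byte depends only on the component value c: (3 * c) & 255.
  // That yields 253 for 255, 48 for 16, 192 for 64 and 64 for 192.
  for (int base = 0; base < 32768; base += 256) {
    assert(table[base + 255] == 253);
    assert(table[base + 16] == 48);
    assert(table[base + 64] == 192);
    assert(table[base + 192] == 64);
  }
  return 0;
}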
+ ARGBLumaColorTable(&orig_pixels[0][0], 0, &dst_pixels_opt[0][0], 0, + &lumacolortable[0], 16, 1); + EXPECT_EQ(253u, dst_pixels_opt[0][0]); + EXPECT_EQ(0u, dst_pixels_opt[0][1]); + EXPECT_EQ(0u, dst_pixels_opt[0][2]); + EXPECT_EQ(128u, dst_pixels_opt[0][3]); + EXPECT_EQ(0u, dst_pixels_opt[1][0]); + EXPECT_EQ(253u, dst_pixels_opt[1][1]); + EXPECT_EQ(0u, dst_pixels_opt[1][2]); + EXPECT_EQ(0u, dst_pixels_opt[1][3]); + EXPECT_EQ(0u, dst_pixels_opt[2][0]); + EXPECT_EQ(0u, dst_pixels_opt[2][1]); + EXPECT_EQ(253u, dst_pixels_opt[2][2]); + EXPECT_EQ(255u, dst_pixels_opt[2][3]); + EXPECT_EQ(48u, dst_pixels_opt[3][0]); + EXPECT_EQ(192u, dst_pixels_opt[3][1]); + EXPECT_EQ(64u, dst_pixels_opt[3][2]); + EXPECT_EQ(224u, dst_pixels_opt[3][3]); + + for (int i = 0; i < 1280; ++i) { + orig_pixels[i][0] = i; + orig_pixels[i][1] = i / 2; + orig_pixels[i][2] = i / 3; + orig_pixels[i][3] = i; + } + + MaskCpuFlags(0); + ARGBLumaColorTable(&orig_pixels[0][0], 0, &dst_pixels_c[0][0], 0, + lumacolortable, 1280, 1); + MaskCpuFlags(-1); + + for (int i = 0; i < benchmark_pixels_div1280_; ++i) { + ARGBLumaColorTable(&orig_pixels[0][0], 0, &dst_pixels_opt[0][0], 0, + lumacolortable, 1280, 1); + } + for (int i = 0; i < 1280; ++i) { + EXPECT_EQ(dst_pixels_c[i][0], dst_pixels_opt[i][0]); + EXPECT_EQ(dst_pixels_c[i][1], dst_pixels_opt[i][1]); + EXPECT_EQ(dst_pixels_c[i][2], dst_pixels_opt[i][2]); + EXPECT_EQ(dst_pixels_c[i][3], dst_pixels_opt[i][3]); + } + + free_aligned_buffer_64(lumacolortable); +} + +TEST_F(libyuvTest, TestARGBCopyAlpha) { + const int kSize = benchmark_width_ * benchmark_height_ * 4; + align_buffer_64(orig_pixels, kSize); + align_buffer_64(dst_pixels_opt, kSize); + align_buffer_64(dst_pixels_c, kSize); + + MemRandomize(orig_pixels, kSize); + MemRandomize(dst_pixels_opt, kSize); + memcpy(dst_pixels_c, dst_pixels_opt, kSize); + + MaskCpuFlags(0); + ARGBCopyAlpha(orig_pixels, benchmark_width_ * 4, + dst_pixels_c, benchmark_width_ * 4, + benchmark_width_, benchmark_height_); + MaskCpuFlags(-1); + + for (int i = 0; i < benchmark_iterations_; ++i) { + ARGBCopyAlpha(orig_pixels, benchmark_width_ * 4, + dst_pixels_opt, benchmark_width_ * 4, + benchmark_width_, benchmark_height_); + } + for (int i = 0; i < kSize; ++i) { + EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]); + } + + free_aligned_buffer_64(dst_pixels_c) + free_aligned_buffer_64(dst_pixels_opt) + free_aligned_buffer_64(orig_pixels) +} + +TEST_F(libyuvTest, TestARGBCopyYToAlpha) { + const int kPixels = benchmark_width_ * benchmark_height_; + align_buffer_64(orig_pixels, kPixels); + align_buffer_64(dst_pixels_opt, kPixels * 4); + align_buffer_64(dst_pixels_c, kPixels * 4); + + MemRandomize(orig_pixels, kPixels); + MemRandomize(dst_pixels_opt, kPixels * 4); + memcpy(dst_pixels_c, dst_pixels_opt, kPixels * 4); + + MaskCpuFlags(0); + ARGBCopyYToAlpha(orig_pixels, benchmark_width_, + dst_pixels_c, benchmark_width_ * 4, + benchmark_width_, benchmark_height_); + MaskCpuFlags(-1); + + for (int i = 0; i < benchmark_iterations_; ++i) { + ARGBCopyYToAlpha(orig_pixels, benchmark_width_, + dst_pixels_opt, benchmark_width_ * 4, + benchmark_width_, benchmark_height_); + } + for (int i = 0; i < kPixels * 4; ++i) { + EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]); + } + + free_aligned_buffer_64(dst_pixels_c) + free_aligned_buffer_64(dst_pixels_opt) + free_aligned_buffer_64(orig_pixels) +} + } // namespace libyuv diff --git a/chromium/third_party/libyuv/unit_test/scale_argb_test.cc b/chromium/third_party/libyuv/unit_test/scale_argb_test.cc index 7a4758594a0..ea4d4d14580 
100644 --- a/chromium/third_party/libyuv/unit_test/scale_argb_test.cc +++ b/chromium/third_party/libyuv/unit_test/scale_argb_test.cc @@ -17,10 +17,6 @@ namespace libyuv { -static __inline int Abs(int v) { - return v >= 0 ? v : -v; -} - // Test scaling with C vs Opt and return maximum pixel difference. 0 = exact. static int ARGBTestFilter(int src_width, int src_height, int dst_width, int dst_height, @@ -85,7 +81,7 @@ static int ARGBTestFilter(int src_width, int src_height, int max_diff = 0; for (i = b; i < (dst_height + b); ++i) { for (j = b * 4; j < (dst_width + b) * 4; ++j) { - int abs_diff = abs(dst_argb_c[(i * dst_stride_argb) + j] - + int abs_diff = Abs(dst_argb_c[(i * dst_stride_argb) + j] - dst_argb_opt[(i * dst_stride_argb) + j]); if (abs_diff > max_diff) { max_diff = abs_diff; @@ -99,8 +95,8 @@ static int ARGBTestFilter(int src_width, int src_height, return max_diff; } -static const int kTileX = 16; -static const int kTileY = 16; +static const int kTileX = 8; +static const int kTileY = 8; static int TileARGBScale(const uint8* src_argb, int src_stride_argb, int src_width, int src_height, @@ -184,7 +180,7 @@ static int ARGBClipTestFilter(int src_width, int src_height, int max_diff = 0; for (i = b; i < (dst_height + b); ++i) { for (j = b * 4; j < (dst_width + b) * 4; ++j) { - int abs_diff = abs(dst_argb_c[(i * dst_stride_argb) + j] - + int abs_diff = Abs(dst_argb_c[(i * dst_stride_argb) + j] - dst_argb_opt[(i * dst_stride_argb) + j]); if (abs_diff > max_diff) { max_diff = abs_diff; @@ -198,78 +194,83 @@ static int ARGBClipTestFilter(int src_width, int src_height, return max_diff; } -#define TEST_FACTOR1(name, filter, factor, max_diff) \ +#define TEST_FACTOR1(name, filter, hfactor, vfactor, max_diff) \ TEST_F(libyuvTest, ARGBScaleDownBy##name##_##filter) { \ int diff = ARGBTestFilter(benchmark_width_, benchmark_height_, \ - Abs(benchmark_width_) / factor, \ - Abs(benchmark_height_) / factor, \ + Abs(benchmark_width_) * hfactor, \ + Abs(benchmark_height_) * vfactor, \ kFilter##filter, benchmark_iterations_); \ EXPECT_LE(diff, max_diff); \ } \ TEST_F(libyuvTest, ARGBScaleDownClipBy##name##_##filter) { \ int diff = ARGBClipTestFilter(benchmark_width_, benchmark_height_, \ - Abs(benchmark_width_) / factor, \ - Abs(benchmark_height_) / factor, \ + Abs(benchmark_width_) * hfactor, \ + Abs(benchmark_height_) * vfactor, \ kFilter##filter, benchmark_iterations_); \ EXPECT_LE(diff, max_diff); \ } -// Test a scale factor with all 2 filters. Expect unfiltered to be exact, but +// Test a scale factor with 2 filters. Expect unfiltered to be exact, but // filtering is different fixed point implementations for SSSE3, Neon and C. -#define TEST_FACTOR(name, factor) \ - TEST_FACTOR1(name, None, factor, 0) \ - TEST_FACTOR1(name, Bilinear, factor, 2) +#define TEST_FACTOR(name, hfactor, vfactor) \ + TEST_FACTOR1(name, None, hfactor, vfactor, 2) \ + TEST_FACTOR1(name, Linear, hfactor, vfactor, 2) \ + TEST_FACTOR1(name, Bilinear, hfactor, vfactor, 2) \ + TEST_FACTOR1(name, Box, hfactor, vfactor, 2) // TODO(fbarchard): ScaleDownBy1 should be lossless, but Box has error of 2. 
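The reworked TEST_FACTOR macros multiply by textual factors such as 2 / 3 instead of dividing by an integer factor. This only yields the intended (width * 2) / 3 because the factor tokens are pasted by the preprocessor before evaluation; passed as an ordinary integer argument, 2 / 3 would collapse to 0. A small illustration with hypothetical names (SCALED_WIDTH and ScaledByValue are not libyuv identifiers):

#include <cassert>

// hfactor is written as "2 / 3" and substituted textually, so
// "width * hfactor" becomes "width * 2 / 3", grouping as (width * 2) / 3.
// Evaluating 2 / 3 first, as a function argument would, gives 0.
#define SCALED_WIDTH(width, hfactor) ((width) * hfactor)

static int ScaledByValue(int width, int hfactor) { return width * hfactor; }

int main() {
  assert(SCALED_WIDTH(1280, 2 / 3) == 853);   // (1280 * 2) / 3
  assert(ScaledByValue(1280, 2 / 3) == 0);    // 2 / 3 truncates first
  return 0;
}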
-TEST_FACTOR(1, 1) -TEST_FACTOR(2, 2) -TEST_FACTOR(4, 4) -TEST_FACTOR(5, 5) -TEST_FACTOR(8, 8) -TEST_FACTOR(16, 16) -TEST_FACTOR(2by3, 2 / 3) -TEST_FACTOR(3by4, 3 / 4) -TEST_FACTOR(3by8, 3 / 8) +TEST_FACTOR(1, 1 / 1, 1 / 1) +TEST_FACTOR(2, 1 / 2, 1 / 2) +TEST_FACTOR(4, 1 / 4, 1 / 4) +TEST_FACTOR(8, 1 / 8, 1 / 8) +TEST_FACTOR(16, 1 / 16, 1 / 16) +TEST_FACTOR(2by3, 2 / 3, 2 / 3) +TEST_FACTOR(3by4, 3 / 4, 3 / 4) +TEST_FACTOR(3by8, 3 / 8, 3 / 8) +TEST_FACTOR(Vertical2by3, 1, 2 / 3) #undef TEST_FACTOR1 #undef TEST_FACTOR -#define TEST_SCALETO1(width, height, filter, max_diff) \ - TEST_F(libyuvTest, ARGBScaleTo##width##x##height##_##filter) { \ +#define TEST_SCALETO1(name, width, height, filter, max_diff) \ + TEST_F(libyuvTest, name##To##width##x##height##_##filter) { \ int diff = ARGBTestFilter(benchmark_width_, benchmark_height_, \ width, height, \ kFilter##filter, benchmark_iterations_); \ EXPECT_LE(diff, max_diff); \ } \ - TEST_F(libyuvTest, ARGBScaleFrom##width##x##height##_##filter) { \ + TEST_F(libyuvTest, name##From##width##x##height##_##filter) { \ int diff = ARGBTestFilter(width, height, \ Abs(benchmark_width_), Abs(benchmark_height_), \ kFilter##filter, benchmark_iterations_); \ EXPECT_LE(diff, max_diff); \ } \ - TEST_F(libyuvTest, ARGBScaleClipTo##width##x##height##_##filter) { \ + TEST_F(libyuvTest, name##ClipTo##width##x##height##_##filter) { \ int diff = ARGBClipTestFilter(benchmark_width_, benchmark_height_, \ width, height, \ kFilter##filter, benchmark_iterations_); \ EXPECT_LE(diff, max_diff); \ } \ - TEST_F(libyuvTest, ARGBScaleClipFrom##width##x##height##_##filter) { \ + TEST_F(libyuvTest, name##ClipFrom##width##x##height##_##filter) { \ int diff = ARGBClipTestFilter(width, height, \ Abs(benchmark_width_), Abs(benchmark_height_), \ kFilter##filter, benchmark_iterations_); \ EXPECT_LE(diff, max_diff); \ } -// Test scale to a specified size with all 3 filters. -#define TEST_SCALETO(width, height) \ - TEST_SCALETO1(width, height, None, 0) \ - TEST_SCALETO1(width, height, Bilinear, 2) - -TEST_SCALETO(640, 360) -TEST_SCALETO(853, 480) -TEST_SCALETO(1280, 720) -TEST_SCALETO(1280, 800) -TEST_SCALETO(1366, 768) -TEST_SCALETO(1920, 1080) +/// Test scale to a specified size with all 4 filters. +#define TEST_SCALETO(name, width, height) \ + TEST_SCALETO1(name, width, height, None, 0) \ + TEST_SCALETO1(name, width, height, Linear, 3) \ + TEST_SCALETO1(name, width, height, Bilinear, 3) \ + TEST_SCALETO1(name, width, height, Box, 3) + +TEST_SCALETO(ARGBScale, 1, 1) +TEST_SCALETO(ARGBScale, 320, 240) +TEST_SCALETO(ARGBScale, 352, 288) +TEST_SCALETO(ARGBScale, 640, 360) +TEST_SCALETO(ARGBScale, 853, 480) +TEST_SCALETO(ARGBScale, 1280, 720) +TEST_SCALETO(ARGBScale, 1920, 1080) #undef TEST_SCALETO1 #undef TEST_SCALETO diff --git a/chromium/third_party/libyuv/unit_test/scale_test.cc b/chromium/third_party/libyuv/unit_test/scale_test.cc index 769151aa232..c6f25604608 100644 --- a/chromium/third_party/libyuv/unit_test/scale_test.cc +++ b/chromium/third_party/libyuv/unit_test/scale_test.cc @@ -17,10 +17,6 @@ namespace libyuv { -static __inline int Abs(int v) { - return v >= 0 ? v : -v; -} - // Test scaling with C vs Opt and return maximum pixel difference. 0 = exact. 
static int TestFilter(int src_width, int src_height, int dst_width, int dst_height, @@ -99,7 +95,7 @@ static int TestFilter(int src_width, int src_height, int max_diff = 0; for (i = b; i < (dst_height + b); ++i) { for (j = b; j < (dst_width + b); ++j) { - int abs_diff = abs(dst_y_c[(i * dst_stride_y) + j] - + int abs_diff = Abs(dst_y_c[(i * dst_stride_y) + j] - dst_y_opt[(i * dst_stride_y) + j]); if (abs_diff > max_diff) { max_diff = abs_diff; @@ -109,12 +105,12 @@ static int TestFilter(int src_width, int src_height, for (i = b; i < (dst_height_uv + b); ++i) { for (j = b; j < (dst_width_uv + b); ++j) { - int abs_diff = abs(dst_u_c[(i * dst_stride_uv) + j] - + int abs_diff = Abs(dst_u_c[(i * dst_stride_uv) + j] - dst_u_opt[(i * dst_stride_uv) + j]); if (abs_diff > max_diff) { max_diff = abs_diff; } - abs_diff = abs(dst_v_c[(i * dst_stride_uv) + j] - + abs_diff = Abs(dst_v_c[(i * dst_stride_uv) + j] - dst_v_opt[(i * dst_stride_uv) + j]); if (abs_diff > max_diff) { max_diff = abs_diff; @@ -136,61 +132,64 @@ static int TestFilter(int src_width, int src_height, return max_diff; } -#define TEST_FACTOR1(name, filter, factor, max_diff) \ +#define TEST_FACTOR1(name, filter, hfactor, vfactor, max_diff) \ TEST_F(libyuvTest, ScaleDownBy##name##_##filter) { \ int diff = TestFilter(benchmark_width_, benchmark_height_, \ - Abs(benchmark_width_) / factor, \ - Abs(benchmark_height_) / factor, \ + Abs(benchmark_width_) * hfactor, \ + Abs(benchmark_height_) * vfactor, \ kFilter##filter, benchmark_iterations_); \ EXPECT_LE(diff, max_diff); \ } -// Test a scale factor with all 3 filters. Expect unfiltered to be exact, but +// Test a scale factor with all 4 filters. Expect unfiltered to be exact, but // filtering is different fixed point implementations for SSSE3, Neon and C. -#define TEST_FACTOR(name, factor) \ - TEST_FACTOR1(name, None, factor, 0) \ - TEST_FACTOR1(name, Bilinear, factor, 2) \ - TEST_FACTOR1(name, Box, factor, 2) \ +#define TEST_FACTOR(name, hfactor, vfactor) \ + TEST_FACTOR1(name, None, hfactor, vfactor, 0) \ + TEST_FACTOR1(name, Linear, hfactor, vfactor, 3) \ + TEST_FACTOR1(name, Bilinear, hfactor, vfactor, 3) \ + TEST_FACTOR1(name, Box, hfactor, vfactor, 3) \ // TODO(fbarchard): ScaleDownBy1 should be lossless, but Box has error of 2. -TEST_FACTOR(1, 1) -TEST_FACTOR(2, 2) -TEST_FACTOR(4, 4) -TEST_FACTOR(5, 5) -TEST_FACTOR(8, 8) -TEST_FACTOR(16, 16) -TEST_FACTOR(2by3, 2 / 3) -TEST_FACTOR(3by4, 3 / 4) -TEST_FACTOR(3by8, 3 / 8) +TEST_FACTOR(1, 1 / 1, 1 / 1) +TEST_FACTOR(2, 1 / 2, 1 / 2) +TEST_FACTOR(4, 1 / 4, 1 / 4) +TEST_FACTOR(8, 1 / 8, 1 / 8) +TEST_FACTOR(16, 1 / 16, 1 / 16) +TEST_FACTOR(2by3, 2 / 3, 2 / 3) +TEST_FACTOR(3by4, 3 / 4, 3 / 4) +TEST_FACTOR(3by8, 3 / 8, 3 / 8) +TEST_FACTOR(Vertical2by3, 1, 2 / 3) #undef TEST_FACTOR1 #undef TEST_FACTOR -#define TEST_SCALETO1(width, height, filter, max_diff) \ - TEST_F(libyuvTest, ScaleTo##width##x##height##_##filter) { \ +#define TEST_SCALETO1(name, width, height, filter, max_diff) \ + TEST_F(libyuvTest, name##To##width##x##height##_##filter) { \ int diff = TestFilter(benchmark_width_, benchmark_height_, \ width, height, \ kFilter##filter, benchmark_iterations_); \ EXPECT_LE(diff, max_diff); \ } \ - TEST_F(libyuvTest, ScaleFrom##width##x##height##_##filter) { \ + TEST_F(libyuvTest, name##From##width##x##height##_##filter) { \ int diff = TestFilter(width, height, \ Abs(benchmark_width_), Abs(benchmark_height_), \ kFilter##filter, benchmark_iterations_); \ EXPECT_LE(diff, max_diff); \ } -// Test scale to a specified size with all 3 filters. 
-#define TEST_SCALETO(width, height) \ - TEST_SCALETO1(width, height, None, 0) \ - TEST_SCALETO1(width, height, Bilinear, 2) \ - TEST_SCALETO1(width, height, Box, 2) \ - -TEST_SCALETO(640, 360) -TEST_SCALETO(853, 480) -TEST_SCALETO(1280, 720) -TEST_SCALETO(1280, 800) -TEST_SCALETO(1366, 768) -TEST_SCALETO(1920, 1080) +// Test scale to a specified size with all 4 filters. +#define TEST_SCALETO(name, width, height) \ + TEST_SCALETO1(name, width, height, None, 0) \ + TEST_SCALETO1(name, width, height, Linear, 3) \ + TEST_SCALETO1(name, width, height, Bilinear, 3) \ + TEST_SCALETO1(name, width, height, Box, 3) + +TEST_SCALETO(Scale, 1, 1) +TEST_SCALETO(Scale, 320, 240) +TEST_SCALETO(Scale, 352, 288) +TEST_SCALETO(Scale, 640, 360) +TEST_SCALETO(Scale, 853, 480) +TEST_SCALETO(Scale, 1280, 720) +TEST_SCALETO(Scale, 1920, 1080) #undef TEST_SCALETO1 #undef TEST_SCALETO diff --git a/chromium/third_party/libyuv/unit_test/unit_test.cc b/chromium/third_party/libyuv/unit_test/unit_test.cc index fac70262133..b11bd246313 100644 --- a/chromium/third_party/libyuv/unit_test/unit_test.cc +++ b/chromium/third_party/libyuv/unit_test/unit_test.cc @@ -19,8 +19,8 @@ #define BENCHMARK_ITERATIONS 1 libyuvTest::libyuvTest() : rotate_max_w_(128), rotate_max_h_(128), - benchmark_iterations_(BENCHMARK_ITERATIONS), benchmark_width_(128), - benchmark_height_(72) { + benchmark_iterations_(BENCHMARK_ITERATIONS), benchmark_width_(22), + benchmark_height_(14) { const char* repeat = getenv("LIBYUV_REPEAT"); if (repeat) { benchmark_iterations_ = atoi(repeat); // NOLINT @@ -39,9 +39,14 @@ libyuvTest::libyuvTest() : rotate_max_w_(128), rotate_max_h_(128), if (height) { benchmark_height_ = atoi(height); // NOLINT } - benchmark_pixels_div256_ = static_cast<int>( - (static_cast<double>(benchmark_width_ * - benchmark_height_) * benchmark_iterations_ + 255.0) / 256.0); + benchmark_pixels_div256_ = static_cast<int>(( + static_cast<double>(Abs(benchmark_width_)) * + static_cast<double>(Abs(benchmark_height_)) * + static_cast<double>(benchmark_iterations_) + 255.0) / 256.0); + benchmark_pixels_div1280_ = static_cast<int>(( + static_cast<double>(Abs(benchmark_width_)) * + static_cast<double>(Abs(benchmark_height_)) * + static_cast<double>(benchmark_iterations_) + 1279.0) / 1280.0); } int main(int argc, char** argv) { diff --git a/chromium/third_party/libyuv/unit_test/unit_test.h b/chromium/third_party/libyuv/unit_test/unit_test.h index e81aea30780..89b333bdd59 100644 --- a/chromium/third_party/libyuv/unit_test/unit_test.h +++ b/chromium/third_party/libyuv/unit_test/unit_test.h @@ -11,10 +11,21 @@ #ifndef UNIT_TEST_UNIT_TEST_H_ // NOLINT #define UNIT_TEST_UNIT_TEST_H_ +#ifdef WIN32 +#include <windows.h> +#else +#include <sys/time.h> +#include <sys/resource.h> +#endif + #include <gtest/gtest.h> #include "libyuv/basic_types.h" +static __inline int Abs(int v) { + return v >= 0 ? 
v : -v; +} + #define align_buffer_64(var, size) \ uint8* var; \ uint8* var##_mem; \ @@ -38,7 +49,6 @@ var = 0; #ifdef WIN32 -#include <windows.h> static inline double get_time() { LARGE_INTEGER t, f; QueryPerformanceCounter(&t); @@ -49,10 +59,6 @@ static inline double get_time() { #define random rand #define srandom srand #else - -#include <sys/time.h> -#include <sys/resource.h> - static inline double get_time() { struct timeval t; struct timezone tzp; @@ -63,9 +69,9 @@ static inline double get_time() { static inline void MemRandomize(uint8* dst, int len) { int i; - for (i = 0; i < len - 3; i += 4) { - *reinterpret_cast<uint32*>(dst) = random(); - dst += 4; + for (i = 0; i < len - 1; i += 2) { + *reinterpret_cast<uint16*>(dst) = random(); + dst += 2; } for (; i < len; ++i) { *dst++ = random(); @@ -83,6 +89,7 @@ class libyuvTest : public ::testing::Test { int benchmark_width_; // Default 1280. Use 640 for benchmarking VGA. int benchmark_height_; // Default 720. Use 360 for benchmarking VGA. int benchmark_pixels_div256_; // Total pixels to benchmark / 256. + int benchmark_pixels_div1280_; // Total pixels to benchmark / 1280. }; #endif // UNIT_TEST_UNIT_TEST_H_ NOLINT diff --git a/chromium/third_party/libyuv/util/convert.cc b/chromium/third_party/libyuv/util/convert.cc index 18316ef8efb..5f071416da4 100644 --- a/chromium/third_party/libyuv/util/convert.cc +++ b/chromium/third_party/libyuv/util/convert.cc @@ -155,8 +155,8 @@ void ParseOptions(int argc, const char* argv[]) { } } -static const int kTileX = 12; -static const int kTileY = 8; +static const int kTileX = 32; +static const int kTileY = 32; static int TileARGBScale(const uint8* src_argb, int src_stride_argb, int src_width, int src_height, diff --git a/chromium/third_party/libyuv/util/cpuid.c b/chromium/third_party/libyuv/util/cpuid.c index 8d8529ba7c6..db22871ea50 100644 --- a/chromium/third_party/libyuv/util/cpuid.c +++ b/chromium/third_party/libyuv/util/cpuid.c @@ -25,7 +25,7 @@ int main(int argc, const char* argv[]) { #if defined(__i386__) || defined(__x86_64__) || \ defined(_M_IX86) || defined(_M_X64) if (has_x86) { - int family, model, cpu_info[4]; + uint32 family, model, cpu_info[4]; // Vendor ID: // AuthenticAMD AMD processor // CentaurHauls Centaur processor @@ -37,7 +37,7 @@ int main(int argc, const char* argv[]) { // RiseRiseRise Rise Technology processor // SiS SiS SiS SiS processor // UMC UMC UMC UMC processor - CpuId(cpu_info, 0); + CpuId(0, 0, &cpu_info[0]); cpu_info[0] = cpu_info[1]; // Reorder output cpu_info[1] = cpu_info[3]; cpu_info[3] = 0; @@ -50,7 +50,7 @@ int main(int argc, const char* argv[]) { // 13:12 - Processor Type // 19:16 - Extended Model // 27:20 - Extended Family - CpuId(cpu_info, 1); + CpuId(1, 0, &cpu_info[0]); family = ((cpu_info[0] >> 8) & 0x0f) | ((cpu_info[0] >> 16) & 0xff0); model = ((cpu_info[0] >> 4) & 0x0f) | ((cpu_info[0] >> 12) & 0xf0); printf("Cpu Family %d (0x%x), Model %d (0x%x)\n", family, family, @@ -79,6 +79,7 @@ int main(int argc, const char* argv[]) { int has_avx = TestCpuFlag(kCpuHasAVX); int has_avx2 = TestCpuFlag(kCpuHasAVX2); int has_erms = TestCpuFlag(kCpuHasERMS); + int has_fma3 = TestCpuFlag(kCpuHasFMA3); printf("Has SSE2 %x\n", has_sse2); printf("Has SSSE3 %x\n", has_ssse3); printf("Has SSE4.1 %x\n", has_sse41); @@ -86,6 +87,7 @@ int main(int argc, const char* argv[]) { printf("Has AVX %x\n", has_avx); printf("Has AVX2 %x\n", has_avx2); printf("Has ERMS %x\n", has_erms); + printf("Has FMA3 %x\n", has_fma3); } return 0; } diff --git a/chromium/third_party/libyuv/util/psnr.h 
b/chromium/third_party/libyuv/util/psnr.h index 2cd0b1457ce..370337a75f2 100644 --- a/chromium/third_party/libyuv/util/psnr.h +++ b/chromium/third_party/libyuv/util/psnr.h @@ -10,7 +10,7 @@ // Get PSNR for video sequence. Assuming RAW 4:2:0 Y:Cb:Cr format -#ifndef UTIL_PSNR_H_ +#ifndef UTIL_PSNR_H_ // NOLINT #define UTIL_PSNR_H_ #ifdef __cplusplus @@ -36,4 +36,4 @@ double ComputeSumSquareError(const uint8* org, const uint8* rec, int size); } // extern "C" #endif -#endif // UTIL_PSNR_H_ +#endif // UTIL_PSNR_H_ // NOLINT diff --git a/chromium/third_party/libyuv/util/ssim.cc b/chromium/third_party/libyuv/util/ssim.cc index 277561dd00d..d07889a8ac8 100644 --- a/chromium/third_party/libyuv/util/ssim.cc +++ b/chromium/third_party/libyuv/util/ssim.cc @@ -8,7 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include "./ssim.h" +#include "../util/ssim.h" // NOLINT #include <math.h> #include <string.h> diff --git a/chromium/third_party/libyuv/util/ssim.h b/chromium/third_party/libyuv/util/ssim.h index 0689276addc..40120b4f4e7 100644 --- a/chromium/third_party/libyuv/util/ssim.h +++ b/chromium/third_party/libyuv/util/ssim.h @@ -10,7 +10,7 @@ // Get SSIM for video sequence. Assuming RAW 4:2:0 Y:Cb:Cr format -#ifndef UTIL_SSIM_H_ +#ifndef UTIL_SSIM_H_ // NOLINT #define UTIL_SSIM_H_ #ifdef __cplusplus @@ -32,4 +32,4 @@ double CalcLSSIM(double ssim); } // extern "C" #endif -#endif // UTIL_SSIM_H_ +#endif // UTIL_SSIM_H_ // NOLINT diff --git a/chromium/third_party/libyuv/winarm.mk b/chromium/third_party/libyuv/winarm.mk new file mode 100644 index 00000000000..2638608ebc9 --- /dev/null +++ b/chromium/third_party/libyuv/winarm.mk @@ -0,0 +1,43 @@ +# This is a generic makefile for libyuv for Windows Arm. +# nmake /f winarm.mk +# make -f winarm.mk +# nmake /f winarm.mk clean +# consider /arch:ARMv7VE +CC=cl +CCFLAGS=/Ox /nologo /Iinclude /DWINAPI_FAMILY=WINAPI_FAMILY_PHONE_APP +AR=lib +ARFLAGS=/MACHINE:ARM /NOLOGO /SUBSYSTEM:NATIVE +RM=cmd /c del + +LOCAL_OBJ_FILES = \ + source/compare.o\ + source/compare_common.o\ + source/convert.o\ + source/convert_argb.o\ + source/convert_from.o\ + source/convert_from_argb.o\ + source/convert_to_argb.o\ + source/convert_to_i420.o\ + source/cpu_id.o\ + source/format_conversion.o\ + source/planar_functions.o\ + source/rotate.o\ + source/rotate_argb.o\ + source/row_any.o\ + source/row_common.o\ + source/scale.o\ + source/scale_argb.o\ + source/scale_common.o\ + source/video_common.o + +.cc.o: + $(CC) /c $(CCFLAGS) $*.cc /Fo$@ + +all: libyuv_arm.lib winarm.mk + +libyuv_arm.lib: $(LOCAL_OBJ_FILES) winarm.mk + $(AR) $(ARFLAGS) /OUT:$@ $(LOCAL_OBJ_FILES) + +clean: + $(RM) "source\*.o" libyuv_arm.lib + |
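A closing sketch of the benchmark_pixels_div1280_ field added in unit_test.cc above: it is a ceiling division of width * height * iterations by 1280, so the row-level tests that loop over 1280-pixel rows still cover at least the requested pixel budget. PixelsDiv1280 below is an illustrative stand-in, not a libyuv function:

#include <cassert>
#include <cstdlib>

// Ceiling division of the total benchmark pixel budget by the 1280-pixel
// row width the row-level tests iterate over.
static int PixelsDiv1280(int width, int height, int iterations) {
  return static_cast<int>(
      (static_cast<double>(abs(width)) * static_cast<double>(abs(height)) *
           static_cast<double>(iterations) +
       1279.0) /
      1280.0);
}

int main() {
  assert(PixelsDiv1280(1280, 720, 1) == 720);  // exact multiple of 1280
  assert(PixelsDiv1280(22, 14, 1) == 1);       // tiny default still runs once
  return 0;
}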