author     Zeno Albisser <zeno.albisser@digia.com>   2013-08-15 21:46:11 +0200
committer  Zeno Albisser <zeno.albisser@digia.com>   2013-08-15 21:46:11 +0200
commit     679147eead574d186ebf3069647b4c23e8ccace6 (patch)
tree       fc247a0ac8ff119f7c8550879ebb6d3dd8d1ff69 /chromium/third_party/libyuv
download   qtwebengine-chromium-679147eead574d186ebf3069647b4c23e8ccace6.tar.gz
Initial import.
Diffstat (limited to 'chromium/third_party/libyuv')
-rw-r--r--  chromium/third_party/libyuv/AUTHORS | 4
-rw-r--r--  chromium/third_party/libyuv/Android.mk | 55
-rw-r--r--  chromium/third_party/libyuv/DEPS | 107
-rw-r--r--  chromium/third_party/libyuv/LICENSE | 29
-rw-r--r--  chromium/third_party/libyuv/LICENSE_THIRD_PARTY | 8
-rw-r--r--  chromium/third_party/libyuv/OWNERS | 3
-rw-r--r--  chromium/third_party/libyuv/PATENTS | 24
-rw-r--r--  chromium/third_party/libyuv/README.chromium | 9
-rw-r--r--  chromium/third_party/libyuv/codereview.settings | 11
-rw-r--r--  chromium/third_party/libyuv/include/libyuv.h | 32
-rw-r--r--  chromium/third_party/libyuv/include/libyuv/basic_types.h | 109
-rw-r--r--  chromium/third_party/libyuv/include/libyuv/compare.h | 73
-rw-r--r--  chromium/third_party/libyuv/include/libyuv/convert.h | 254
-rw-r--r--  chromium/third_party/libyuv/include/libyuv/convert_argb.h | 225
-rw-r--r--  chromium/third_party/libyuv/include/libyuv/convert_from.h | 173
-rw-r--r--  chromium/third_party/libyuv/include/libyuv/convert_from_argb.h | 168
-rw-r--r--  chromium/third_party/libyuv/include/libyuv/cpu_id.h | 76
-rw-r--r--  chromium/third_party/libyuv/include/libyuv/format_conversion.h | 168
-rw-r--r--  chromium/third_party/libyuv/include/libyuv/mjpeg_decoder.h | 193
-rw-r--r--  chromium/third_party/libyuv/include/libyuv/planar_functions.h | 367
-rw-r--r--  chromium/third_party/libyuv/include/libyuv/rotate.h | 117
-rw-r--r--  chromium/third_party/libyuv/include/libyuv/rotate_argb.h | 33
-rw-r--r--  chromium/third_party/libyuv/include/libyuv/row.h | 1525
-rw-r--r--  chromium/third_party/libyuv/include/libyuv/scale.h | 84
-rw-r--r--  chromium/third_party/libyuv/include/libyuv/scale_argb.h | 43
-rw-r--r--  chromium/third_party/libyuv/include/libyuv/version.h | 16
-rw-r--r--  chromium/third_party/libyuv/include/libyuv/video_common.h | 179
-rw-r--r--  chromium/third_party/libyuv/libyuv.gyp | 123
-rw-r--r--  chromium/third_party/libyuv/libyuv_test.gyp | 126
-rw-r--r--  chromium/third_party/libyuv/public.mk | 13
-rw-r--r--  chromium/third_party/libyuv/source/compare.cc | 323
-rw-r--r--  chromium/third_party/libyuv/source/compare_common.cc | 40
-rw-r--r--  chromium/third_party/libyuv/source/compare_neon.cc | 61
-rw-r--r--  chromium/third_party/libyuv/source/compare_posix.cc | 164
-rw-r--r--  chromium/third_party/libyuv/source/compare_win.cc | 192
-rw-r--r--  chromium/third_party/libyuv/source/convert.cc | 1568
-rw-r--r--  chromium/third_party/libyuv/source/convert_argb.cc | 909
-rw-r--r--  chromium/third_party/libyuv/source/convert_from.cc | 1278
-rw-r--r--  chromium/third_party/libyuv/source/convert_from_argb.cc | 1100
-rw-r--r--  chromium/third_party/libyuv/source/convert_jpeg.cc | 392
-rw-r--r--  chromium/third_party/libyuv/source/convert_to_argb.cc | 324
-rw-r--r--  chromium/third_party/libyuv/source/convert_to_i420.cc | 384
-rw-r--r--  chromium/third_party/libyuv/source/cpu_id.cc | 252
-rw-r--r--  chromium/third_party/libyuv/source/format_conversion.cc | 540
-rw-r--r--  chromium/third_party/libyuv/source/mjpeg_decoder.cc | 540
-rw-r--r--  chromium/third_party/libyuv/source/mjpeg_validate.cc | 41
-rw-r--r--  chromium/third_party/libyuv/source/planar_functions.cc | 1993
-rw-r--r--  chromium/third_party/libyuv/source/rotate.cc | 1283
-rw-r--r--  chromium/third_party/libyuv/source/rotate_argb.cc | 213
-rw-r--r--  chromium/third_party/libyuv/source/rotate_mips.cc | 486
-rw-r--r--  chromium/third_party/libyuv/source/rotate_neon.cc | 405
-rw-r--r--  chromium/third_party/libyuv/source/row_any.cc | 519
-rw-r--r--  chromium/third_party/libyuv/source/row_common.cc | 1995
-rw-r--r--  chromium/third_party/libyuv/source/row_mips.cc | 974
-rw-r--r--  chromium/third_party/libyuv/source/row_neon.cc | 2741
-rw-r--r--  chromium/third_party/libyuv/source/row_posix.cc | 5409
-rw-r--r--  chromium/third_party/libyuv/source/row_win.cc | 6610
-rw-r--r--  chromium/third_party/libyuv/source/row_x86.asm | 146
-rw-r--r--  chromium/third_party/libyuv/source/scale.cc | 2455
-rw-r--r--  chromium/third_party/libyuv/source/scale_argb.cc | 1202
-rw-r--r--  chromium/third_party/libyuv/source/scale_argb_neon.cc | 142
-rw-r--r--  chromium/third_party/libyuv/source/scale_mips.cc | 638
-rw-r--r--  chromium/third_party/libyuv/source/scale_neon.cc | 554
-rw-r--r--  chromium/third_party/libyuv/source/video_common.cc | 58
-rw-r--r--  chromium/third_party/libyuv/source/x86inc.asm | 1136
-rw-r--r--  chromium/third_party/libyuv/tools/valgrind-libyuv/libyuv_tests.bat | 79
-rwxr-xr-x  chromium/third_party/libyuv/tools/valgrind-libyuv/libyuv_tests.py | 118
-rwxr-xr-x  chromium/third_party/libyuv/tools/valgrind-libyuv/libyuv_tests.sh | 138
-rw-r--r--  chromium/third_party/libyuv/tools/valgrind-libyuv/memcheck/OWNERS | 1
-rw-r--r--  chromium/third_party/libyuv/tools/valgrind-libyuv/memcheck/PRESUBMIT.py | 99
-rw-r--r--  chromium/third_party/libyuv/tools/valgrind-libyuv/memcheck/suppressions.txt | 5
-rw-r--r--  chromium/third_party/libyuv/tools/valgrind-libyuv/memcheck/suppressions_mac.txt | 5
-rw-r--r--  chromium/third_party/libyuv/tools/valgrind-libyuv/memcheck/suppressions_win32.txt | 5
-rw-r--r--  chromium/third_party/libyuv/tools/valgrind-libyuv/tsan/OWNERS | 1
-rw-r--r--  chromium/third_party/libyuv/tools/valgrind-libyuv/tsan/PRESUBMIT.py | 41
-rw-r--r--  chromium/third_party/libyuv/tools/valgrind-libyuv/tsan/suppressions.txt | 5
-rw-r--r--  chromium/third_party/libyuv/tools/valgrind-libyuv/tsan/suppressions_mac.txt | 5
-rw-r--r--  chromium/third_party/libyuv/tools/valgrind-libyuv/tsan/suppressions_win32.txt | 5
-rw-r--r--  chromium/third_party/libyuv/unit_test/basictypes_test.cc | 60
-rw-r--r--  chromium/third_party/libyuv/unit_test/compare_test.cc | 437
-rw-r--r--  chromium/third_party/libyuv/unit_test/convert_test.cc | 990
-rw-r--r--  chromium/third_party/libyuv/unit_test/cpu_test.cc | 108
-rw-r--r--  chromium/third_party/libyuv/unit_test/planar_test.cc | 1528
-rw-r--r--  chromium/third_party/libyuv/unit_test/rotate_argb_test.cc | 201
-rw-r--r--  chromium/third_party/libyuv/unit_test/rotate_test.cc | 243
-rw-r--r--  chromium/third_party/libyuv/unit_test/scale_argb_test.cc | 276
-rw-r--r--  chromium/third_party/libyuv/unit_test/scale_test.cc | 197
-rw-r--r--  chromium/third_party/libyuv/unit_test/testdata/arm_v7.txt | 12
-rw-r--r--  chromium/third_party/libyuv/unit_test/testdata/tegra3.txt | 23
-rw-r--r--  chromium/third_party/libyuv/unit_test/unit_test.cc | 50
-rw-r--r--  chromium/third_party/libyuv/unit_test/unit_test.h | 88
-rw-r--r--  chromium/third_party/libyuv/unit_test/version_test.cc | 42
-rw-r--r--  chromium/third_party/libyuv/unit_test/video_common_test.cc | 108
-rw-r--r--  chromium/third_party/libyuv/util/Makefile | 6
-rw-r--r--  chromium/third_party/libyuv/util/compare.cc | 63
-rw-r--r--  chromium/third_party/libyuv/util/convert.cc | 365
-rw-r--r--  chromium/third_party/libyuv/util/cpuid.c | 92
-rw-r--r--  chromium/third_party/libyuv/util/psnr.cc | 247
-rw-r--r--  chromium/third_party/libyuv/util/psnr.h | 39
-rw-r--r--  chromium/third_party/libyuv/util/psnr_main.cc | 561
-rw-r--r--  chromium/third_party/libyuv/util/ssim.cc | 337
-rw-r--r--  chromium/third_party/libyuv/util/ssim.h | 35
102 files changed, 48029 insertions, 0 deletions
diff --git a/chromium/third_party/libyuv/AUTHORS b/chromium/third_party/libyuv/AUTHORS
new file mode 100644
index 00000000000..9686ac13eb0
--- /dev/null
+++ b/chromium/third_party/libyuv/AUTHORS
@@ -0,0 +1,4 @@
+# Names should be added to this file like so:
+# Name or Organization <email address>
+
+Google Inc.
diff --git a/chromium/third_party/libyuv/Android.mk b/chromium/third_party/libyuv/Android.mk
new file mode 100644
index 00000000000..513a1961b5c
--- /dev/null
+++ b/chromium/third_party/libyuv/Android.mk
@@ -0,0 +1,55 @@
+# This is the Android makefile for libyuv for both platform and NDK.
+LOCAL_PATH:= $(call my-dir)
+
+include $(CLEAR_VARS)
+
+LOCAL_CPP_EXTENSION := .cc
+
+LOCAL_SRC_FILES := \
+ source/compare.cc \
+ source/compare_common.cc \
+ source/compare_posix.cc \
+ source/convert.cc \
+ source/convert_argb.cc \
+ source/convert_from.cc \
+ source/convert_from_argb.cc \
+ source/convert_to_argb.cc \
+ source/convert_to_i420.cc \
+ source/cpu_id.cc \
+ source/format_conversion.cc \
+ source/planar_functions.cc \
+ source/rotate.cc \
+ source/rotate_argb.cc \
+ source/rotate_mips.cc \
+ source/row_any.cc \
+ source/row_common.cc \
+ source/row_mips.cc \
+ source/row_posix.cc \
+ source/scale.cc \
+ source/scale_argb.cc \
+ source/scale_mips.cc \
+ source/video_common.cc
+
+# TODO(fbarchard): Enable mjpeg decoder.
+# source/mjpeg_decoder.cc
+# source/convert_jpeg.cc
+# source/mjpeg_validate.cc
+
+ifeq ($(TARGET_ARCH_ABI),armeabi-v7a)
+ LOCAL_CFLAGS += -DLIBYUV_NEON
+ LOCAL_SRC_FILES += \
+ source/compare_neon.cc.neon \
+ source/rotate_neon.cc.neon \
+ source/row_neon.cc.neon \
+ source/scale_argb_neon.cc.neon \
+ source/scale_neon.cc.neon
+endif
+
+LOCAL_EXPORT_C_INCLUDES := $(LOCAL_PATH)/include
+LOCAL_C_INCLUDES += $(LOCAL_PATH)/include
+
+LOCAL_MODULE := libyuv_static
+LOCAL_MODULE_TAGS := optional
+
+include $(BUILD_STATIC_LIBRARY)
+
diff --git a/chromium/third_party/libyuv/DEPS b/chromium/third_party/libyuv/DEPS
new file mode 100644
index 00000000000..eafc459c3f3
--- /dev/null
+++ b/chromium/third_party/libyuv/DEPS
@@ -0,0 +1,107 @@
+use_relative_paths = True
+
+vars = {
+ "libyuv_trunk" : "https://libyuv.googlecode.com/svn/trunk",
+
+ # Override root_dir in your .gclient's custom_vars to specify a custom root
+ # folder name.
+ "root_dir": "trunk",
+ "extra_gyp_flag": "-Dextra_gyp_flag=0",
+
+ # Use this googlecode_url variable only if there is an internal mirror for it.
+ # If you do not know, use the full path while defining your new deps entry.
+ "googlecode_url": "http://%s.googlecode.com/svn",
+ "chromium_trunk" : "http://src.chromium.org/svn/trunk",
+ # chrome://version/ for revision of canary Chrome.
+ "chromium_revision": "202548",
+}
+
+# NOTE: Prefer revision numbers to tags for svn deps. Use http rather than
+# https; the latter can cause problems for users behind proxies.
+deps = {
+ "../chromium_deps":
+ File(Var("chromium_trunk") + "/src/DEPS@" + Var("chromium_revision")),
+
+ "build":
+ Var("chromium_trunk") + "/src/build@" + Var("chromium_revision"),
+
+ # Needed by common.gypi.
+ "google_apis/build":
+ Var("chromium_trunk") + "/src/google_apis/build@" + Var("chromium_revision"),
+
+ "testing":
+ Var("chromium_trunk") + "/src/testing@" + Var("chromium_revision"),
+
+ "testing/gtest":
+ From("chromium_deps", "src/testing/gtest"),
+
+ "tools/clang":
+ Var("chromium_trunk") + "/src/tools/clang@" + Var("chromium_revision"),
+
+ "tools/gyp":
+ From("chromium_deps", "src/tools/gyp"),
+
+ "tools/python":
+ Var("chromium_trunk") + "/src/tools/python@" + Var("chromium_revision"),
+
+ "tools/valgrind":
+ Var("chromium_trunk") + "/src/tools/valgrind@" + Var("chromium_revision"),
+
+ # Needed by build/common.gypi.
+ "tools/win/supalink":
+ Var("chromium_trunk") + "/src/tools/win/supalink@" + Var("chromium_revision"),
+
+ "third_party/libjpeg_turbo":
+ From("chromium_deps", "src/third_party/libjpeg_turbo"),
+
+ # Yasm assembler required for libjpeg_turbo
+ "third_party/yasm":
+ Var("chromium_trunk") + "/src/third_party/yasm@" + Var("chromium_revision"),
+
+ "third_party/yasm/source/patched-yasm":
+ Var("chromium_trunk") + "/deps/third_party/yasm/patched-yasm@" + Var("chromium_revision"),
+}
+
+deps_os = {
+ "win": {
+ # Use WebRTC's, stripped down, version of Cygwin (required by GYP).
+ "third_party/cygwin":
+ (Var("googlecode_url") % "webrtc") + "/deps/third_party/cygwin@2672",
+
+ # Used by libjpeg-turbo.
+ # TODO(fbarchard): Remove binaries and run yasm from build folder.
+ "third_party/yasm/binaries":
+ Var("chromium_trunk") + "/deps/third_party/yasm/binaries@" + Var("chromium_revision"),
+ "third_party/yasm": None,
+ },
+ "unix": {
+ "third_party/gold":
+ From("chromium_deps", "src/third_party/gold"),
+ },
+}
+
+hooks = [
+ {
+ # Pull clang on mac. If nothing changed, or on non-mac platforms, this takes
+ # zero seconds to run. If something changed, it downloads a prebuilt clang.
+ "pattern": ".",
+ "action": ["python", Var("root_dir") + "/tools/clang/scripts/update.py",
+ "--mac-only"],
+ },
+ {
+ # A change to a .gyp, .gypi, or to GYP itself should run the generator.
+ "pattern": ".",
+ "action": ["python", Var("root_dir") + "/build/gyp_chromium",
+ "--depth=" + Var("root_dir"), Var("root_dir") + "/libyuv_test.gyp",
+ Var("extra_gyp_flag")],
+ },
+ {
+ # Update the cygwin mount on Windows.
+ # This is necessary to get the correct mapping between e.g. /bin and the
+ # cygwin path on Windows. Without it we can't run bash scripts in actions.
+ # Ideally this should be solved in "pylib/gyp/msvs_emulation.py".
+ "pattern": ".",
+ "action": ["python", Var("root_dir") + "/build/win/setup_cygwin_mount.py",
+ "--win-only"],
+ },
+]
diff --git a/chromium/third_party/libyuv/LICENSE b/chromium/third_party/libyuv/LICENSE
new file mode 100644
index 00000000000..c911747a6b5
--- /dev/null
+++ b/chromium/third_party/libyuv/LICENSE
@@ -0,0 +1,29 @@
+Copyright 2011 The LibYuv Project Authors. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+
+ * Neither the name of Google nor the names of its contributors may
+ be used to endorse or promote products derived from this software
+ without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/chromium/third_party/libyuv/LICENSE_THIRD_PARTY b/chromium/third_party/libyuv/LICENSE_THIRD_PARTY
new file mode 100644
index 00000000000..a71591e7710
--- /dev/null
+++ b/chromium/third_party/libyuv/LICENSE_THIRD_PARTY
@@ -0,0 +1,8 @@
+This source tree contains third party source code which is governed by third
+party licenses. This file contains references to files which are under other
+licenses than the one provided in the LICENSE file in the root of the source
+tree.
+
+Files governed by third party licenses:
+source/x86inc.asm
+
diff --git a/chromium/third_party/libyuv/OWNERS b/chromium/third_party/libyuv/OWNERS
new file mode 100644
index 00000000000..cbe985ecfdd
--- /dev/null
+++ b/chromium/third_party/libyuv/OWNERS
@@ -0,0 +1,3 @@
+fbarchard@chromium.org
+mflodman@chromium.org
+
diff --git a/chromium/third_party/libyuv/PATENTS b/chromium/third_party/libyuv/PATENTS
new file mode 100644
index 00000000000..64aa5c90d8f
--- /dev/null
+++ b/chromium/third_party/libyuv/PATENTS
@@ -0,0 +1,24 @@
+Additional IP Rights Grant (Patents)
+
+"This implementation" means the copyrightable works distributed by
+Google as part of the LibYuv code package.
+
+Google hereby grants to you a perpetual, worldwide, non-exclusive,
+no-charge, irrevocable (except as stated in this section) patent
+license to make, have made, use, offer to sell, sell, import,
+transfer, and otherwise run, modify and propagate the contents of this
+implementation of the LibYuv code package, where such license applies
+only to those patent claims, both currently owned by Google and
+acquired in the future, licensable by Google that are necessarily
+infringed by this implementation of the LibYuv code package. This
+grant does not include claims that would be infringed only as a
+consequence of further modification of this implementation. If you or
+your agent or exclusive licensee institute or order or agree to the
+institution of patent litigation against any entity (including a
+cross-claim or counterclaim in a lawsuit) alleging that this
+implementation of the LibYuv code package or any code incorporated
+within this implementation of the LibYuv code package constitutes
+direct or contributory patent infringement, or inducement of patent
+infringement, then any patent rights granted to you under this License
+for this implementation of the LibYuv code package shall terminate as
+of the date such litigation is filed.
\ No newline at end of file
diff --git a/chromium/third_party/libyuv/README.chromium b/chromium/third_party/libyuv/README.chromium
new file mode 100644
index 00000000000..edc5d82ba88
--- /dev/null
+++ b/chromium/third_party/libyuv/README.chromium
@@ -0,0 +1,9 @@
+Name: libyuv
+URL: http://code.google.com/p/libyuv/
+Version: 723
+License: BSD
+License File: LICENSE
+
+Description:
+libyuv is an open source project that includes
+YUV conversion and scaling functionality.
diff --git a/chromium/third_party/libyuv/codereview.settings b/chromium/third_party/libyuv/codereview.settings
new file mode 100644
index 00000000000..f57dc1a888a
--- /dev/null
+++ b/chromium/third_party/libyuv/codereview.settings
@@ -0,0 +1,11 @@
+# This file is used by gcl to get repository specific information.
+# The LibYuv code review is via WebRtc's code review
+CODE_REVIEW_SERVER: webrtc-codereview.appspot.com
+#CC_LIST:
+VIEW_VC: https://code.google.com/p/libyuv/source/detail?r=
+#STATUS:
+TRY_ON_UPLOAD: False
+TRYSERVER_ROOT: src
+TRYSERVER_SVN_URL: svn://svn.chromium.org/chrome-try/try-libyuv
+#GITCL_PREUPLOAD:
+#GITCL_PREDCOMMIT:
diff --git a/chromium/third_party/libyuv/include/libyuv.h b/chromium/third_party/libyuv/include/libyuv.h
new file mode 100644
index 00000000000..c058665a889
--- /dev/null
+++ b/chromium/third_party/libyuv/include/libyuv.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_H_ // NOLINT
+#define INCLUDE_LIBYUV_H_
+
+#include "libyuv/basic_types.h"
+#include "libyuv/compare.h"
+#include "libyuv/convert.h"
+#include "libyuv/convert_argb.h"
+#include "libyuv/convert_from.h"
+#include "libyuv/convert_from_argb.h"
+#include "libyuv/cpu_id.h"
+#include "libyuv/format_conversion.h"
+#include "libyuv/mjpeg_decoder.h"
+#include "libyuv/planar_functions.h"
+#include "libyuv/rotate.h"
+#include "libyuv/rotate_argb.h"
+#include "libyuv/row.h"
+#include "libyuv/scale.h"
+#include "libyuv/scale_argb.h"
+#include "libyuv/version.h"
+#include "libyuv/video_common.h"
+
+#endif // INCLUDE_LIBYUV_H_ NOLINT
diff --git a/chromium/third_party/libyuv/include/libyuv/basic_types.h b/chromium/third_party/libyuv/include/libyuv/basic_types.h
new file mode 100644
index 00000000000..51b15f8763d
--- /dev/null
+++ b/chromium/third_party/libyuv/include/libyuv/basic_types.h
@@ -0,0 +1,109 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_BASIC_TYPES_H_ // NOLINT
+#define INCLUDE_LIBYUV_BASIC_TYPES_H_
+
+#include <stddef.h> // for NULL, size_t
+
+#if defined(__ANDROID__) || (defined(_MSC_VER) && (_MSC_VER < 1600))
+#include <sys/types.h> // for uintptr_t on x86
+#else
+#include <stdint.h> // for uintptr_t
+#endif
+
+#ifndef GG_LONGLONG
+#ifndef INT_TYPES_DEFINED
+#define INT_TYPES_DEFINED
+#ifdef COMPILER_MSVC
+typedef unsigned __int64 uint64;
+typedef __int64 int64;
+#ifndef INT64_C
+#define INT64_C(x) x ## I64
+#endif
+#ifndef UINT64_C
+#define UINT64_C(x) x ## UI64
+#endif
+#define INT64_F "I64"
+#else // COMPILER_MSVC
+#if defined(__LP64__) && !defined(__OpenBSD__) && !defined(__APPLE__)
+typedef unsigned long uint64; // NOLINT
+typedef long int64; // NOLINT
+#ifndef INT64_C
+#define INT64_C(x) x ## L
+#endif
+#ifndef UINT64_C
+#define UINT64_C(x) x ## UL
+#endif
+#define INT64_F "l"
+#else // defined(__LP64__) && !defined(__OpenBSD__) && !defined(__APPLE__)
+typedef unsigned long long uint64; // NOLINT
+typedef long long int64; // NOLINT
+#ifndef INT64_C
+#define INT64_C(x) x ## LL
+#endif
+#ifndef UINT64_C
+#define UINT64_C(x) x ## ULL
+#endif
+#define INT64_F "ll"
+#endif // __LP64__
+#endif // COMPILER_MSVC
+typedef unsigned int uint32;
+typedef int int32;
+typedef unsigned short uint16; // NOLINT
+typedef short int16; // NOLINT
+typedef unsigned char uint8;
+typedef signed char int8;
+#endif // INT_TYPES_DEFINED
+#endif // GG_LONGLONG
+
+// Detect compiler is for x86 or x64.
+#if defined(__x86_64__) || defined(_M_X64) || \
+ defined(__i386__) || defined(_M_IX86)
+#define CPU_X86 1
+#endif
+// Detect compiler is for ARM.
+#if defined(__arm__) || defined(_M_ARM)
+#define CPU_ARM 1
+#endif
+
+#ifndef ALIGNP
+#define ALIGNP(p, t) \
+ (reinterpret_cast<uint8*>(((reinterpret_cast<uintptr_t>(p) + \
+ ((t) - 1)) & ~((t) - 1))))
+#endif
+
+#if !defined(LIBYUV_API)
+#if defined(_WIN32) || defined(__CYGWIN__)
+#if defined(LIBYUV_BUILDING_SHARED_LIBRARY)
+#define LIBYUV_API __declspec(dllexport)
+#elif defined(LIBYUV_USING_SHARED_LIBRARY)
+#define LIBYUV_API __declspec(dllimport)
+#else
+#define LIBYUV_API
+#endif // LIBYUV_BUILDING_SHARED_LIBRARY
+#elif defined(__GNUC__) && (__GNUC__ >= 4) && !defined(__APPLE__) && \
+ (defined(LIBYUV_BUILDING_SHARED_LIBRARY) || \
+ defined(LIBYUV_USING_SHARED_LIBRARY))
+#define LIBYUV_API __attribute__ ((visibility ("default")))
+#else
+#define LIBYUV_API
+#endif // __GNUC__
+#endif // LIBYUV_API
+
+// Visual C x86 or GCC little endian.
+#if defined(__x86_64__) || defined(_M_X64) || \
+ defined(__i386__) || defined(_M_IX86) || \
+ defined(__arm__) || defined(_M_ARM) || \
+ (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
+#define LIBYUV_LITTLE_ENDIAN
+#endif
+
+#endif // INCLUDE_LIBYUV_BASIC_TYPES_H_ NOLINT
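
The ALIGNP macro above rounds a pointer up to the next multiple of a power-of-two alignment. A minimal sketch of how a caller might use it to hand out a 16-byte-aligned buffer; the helper name and the over-allocate-then-align scheme are illustrative, not part of the header:

#include "libyuv/basic_types.h"

// Over-allocate by (alignment - 1) bytes, keep the raw block for delete[],
// and return the aligned pointer produced by ALIGNP.
static uint8* AllocAligned16(size_t size, uint8** raw_block) {
  uint8* block = new uint8[size + 15];
  *raw_block = block;        // caller releases this with delete[]
  return ALIGNP(block, 16);  // 16-byte-aligned view into the same block
}
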
diff --git a/chromium/third_party/libyuv/include/libyuv/compare.h b/chromium/third_party/libyuv/include/libyuv/compare.h
new file mode 100644
index 00000000000..5dfac7c86aa
--- /dev/null
+++ b/chromium/third_party/libyuv/include/libyuv/compare.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_COMPARE_H_ // NOLINT
+#define INCLUDE_LIBYUV_COMPARE_H_
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Compute a hash for specified memory. Seed of 5381 recommended.
+LIBYUV_API
+uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed);
+
+// Sum Square Error - used to compute Mean Square Error or PSNR.
+LIBYUV_API
+uint64 ComputeSumSquareError(const uint8* src_a,
+ const uint8* src_b, int count);
+
+LIBYUV_API
+uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a,
+ const uint8* src_b, int stride_b,
+ int width, int height);
+
+static const int kMaxPsnr = 128;
+
+LIBYUV_API
+double SumSquareErrorToPsnr(uint64 sse, uint64 count);
+
+LIBYUV_API
+double CalcFramePsnr(const uint8* src_a, int stride_a,
+ const uint8* src_b, int stride_b,
+ int width, int height);
+
+LIBYUV_API
+double I420Psnr(const uint8* src_y_a, int stride_y_a,
+ const uint8* src_u_a, int stride_u_a,
+ const uint8* src_v_a, int stride_v_a,
+ const uint8* src_y_b, int stride_y_b,
+ const uint8* src_u_b, int stride_u_b,
+ const uint8* src_v_b, int stride_v_b,
+ int width, int height);
+
+LIBYUV_API
+double CalcFrameSsim(const uint8* src_a, int stride_a,
+ const uint8* src_b, int stride_b,
+ int width, int height);
+
+LIBYUV_API
+double I420Ssim(const uint8* src_y_a, int stride_y_a,
+ const uint8* src_u_a, int stride_u_a,
+ const uint8* src_v_a, int stride_v_a,
+ const uint8* src_y_b, int stride_y_b,
+ const uint8* src_u_b, int stride_u_b,
+ const uint8* src_v_b, int stride_v_b,
+ int width, int height);
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_COMPARE_H_ NOLINT
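
As a small illustration of how the comparison helpers above combine, a hedged sketch that hashes a plane with the recommended seed and computes the PSNR of two equally sized, tightly packed planes (the wrapper names are illustrative):

#include "libyuv/compare.h"

// Hash of a tightly packed 8-bit plane, using the seed the header recommends.
static uint32 PlaneHash(const uint8* plane, int width, int height) {
  return libyuv::HashDjb2(plane, static_cast<uint64>(width) * height, 5381);
}

// PSNR of two tightly packed 8-bit planes with identical dimensions;
// kMaxPsnr above bounds the result when the planes are identical.
static double PlanePsnr(const uint8* a, const uint8* b, int width, int height) {
  uint64 sse = libyuv::ComputeSumSquareErrorPlane(a, width, b, width,
                                                  width, height);
  return libyuv::SumSquareErrorToPsnr(sse, static_cast<uint64>(width) * height);
}
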
diff --git a/chromium/third_party/libyuv/include/libyuv/convert.h b/chromium/third_party/libyuv/include/libyuv/convert.h
new file mode 100644
index 00000000000..dd71cf5bb2e
--- /dev/null
+++ b/chromium/third_party/libyuv/include/libyuv/convert.h
@@ -0,0 +1,254 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_CONVERT_H_ // NOLINT
+#define INCLUDE_LIBYUV_CONVERT_H_
+
+#include "libyuv/basic_types.h"
+// TODO(fbarchard): Remove the following header includes.
+#include "libyuv/convert_from.h"
+#include "libyuv/planar_functions.h"
+#include "libyuv/rotate.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Convert I444 to I420.
+LIBYUV_API
+int I444ToI420(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// Convert I422 to I420.
+LIBYUV_API
+int I422ToI420(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// Convert I411 to I420.
+LIBYUV_API
+int I411ToI420(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// Copy I420 to I420.
+#define I420ToI420 I420Copy
+LIBYUV_API
+int I420Copy(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// Convert I400 (grey) to I420.
+LIBYUV_API
+int I400ToI420(const uint8* src_y, int src_stride_y,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// Convert NV12 to I420.
+LIBYUV_API
+int NV12ToI420(const uint8* src_y, int src_stride_y,
+ const uint8* src_uv, int src_stride_uv,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// Convert NV21 to I420.
+LIBYUV_API
+int NV21ToI420(const uint8* src_y, int src_stride_y,
+ const uint8* src_vu, int src_stride_vu,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// Convert YUY2 to I420.
+LIBYUV_API
+int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// Convert UYVY to I420.
+LIBYUV_API
+int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// Convert M420 to I420.
+LIBYUV_API
+int M420ToI420(const uint8* src_m420, int src_stride_m420,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// Convert Q420 to I420.
+LIBYUV_API
+int Q420ToI420(const uint8* src_y, int src_stride_y,
+ const uint8* src_yuy2, int src_stride_yuy2,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// ARGB little endian (bgra in memory) to I420.
+LIBYUV_API
+int ARGBToI420(const uint8* src_frame, int src_stride_frame,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// BGRA little endian (argb in memory) to I420.
+LIBYUV_API
+int BGRAToI420(const uint8* src_frame, int src_stride_frame,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// ABGR little endian (rgba in memory) to I420.
+LIBYUV_API
+int ABGRToI420(const uint8* src_frame, int src_stride_frame,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// RGBA little endian (abgr in memory) to I420.
+LIBYUV_API
+int RGBAToI420(const uint8* src_frame, int src_stride_frame,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// RGB little endian (bgr in memory) to I420.
+LIBYUV_API
+int RGB24ToI420(const uint8* src_frame, int src_stride_frame,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// RGB big endian (rgb in memory) to I420.
+LIBYUV_API
+int RAWToI420(const uint8* src_frame, int src_stride_frame,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// RGB16 (RGBP fourcc) little endian to I420.
+LIBYUV_API
+int RGB565ToI420(const uint8* src_frame, int src_stride_frame,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// RGB15 (RGBO fourcc) little endian to I420.
+LIBYUV_API
+int ARGB1555ToI420(const uint8* src_frame, int src_stride_frame,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// RGB12 (R444 fourcc) little endian to I420.
+LIBYUV_API
+int ARGB4444ToI420(const uint8* src_frame, int src_stride_frame,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+#ifdef HAVE_JPEG
+// src_width/height provided by capture.
+// dst_width/height for clipping determine final size.
+LIBYUV_API
+int MJPGToI420(const uint8* sample, size_t sample_size,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int src_width, int src_height,
+ int dst_width, int dst_height);
+
+// Query size of MJPG in pixels.
+LIBYUV_API
+int MJPGSize(const uint8* sample, size_t sample_size,
+ int* width, int* height);
+#endif
+
+// Note Bayer formats (BGGR) To I420 are in format_conversion.h
+
+// Convert camera sample to I420 with cropping, rotation and vertical flip.
+// "src_size" is needed to parse MJPG.
+// "dst_stride_y" number of bytes in a row of the dst_y plane.
+// Normally this would be the same as dst_width, with recommended alignment
+// to 16 bytes for better efficiency.
+// If rotation of 90 or 270 is used, stride is affected. The caller should
+// allocate the I420 buffer according to rotation.
+// "dst_stride_u" number of bytes in a row of the dst_u plane.
+// Normally this would be the same as (dst_width + 1) / 2, with
+// recommended alignment to 16 bytes for better efficiency.
+// If rotation of 90 or 270 is used, stride is affected.
+// "crop_x" and "crop_y" are starting position for cropping.
+// To center, crop_x = (src_width - dst_width) / 2
+// crop_y = (src_height - dst_height) / 2
+// "src_width" / "src_height" is size of src_frame in pixels.
+// "src_height" can be negative indicating a vertically flipped image source.
+// "dst_width" / "dst_height" is size of destination to crop to.
+// Must be less than or equal to src_width/src_height
+// Cropping parameters are pre-rotation.
+// "rotation" can be 0, 90, 180 or 270.
+// "format" is a fourcc. ie 'I420', 'YUY2'
+// Returns 0 for success; -1 for an invalid parameter. Non-zero for failure.
+LIBYUV_API
+int ConvertToI420(const uint8* src_frame, size_t src_size,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int crop_x, int crop_y,
+ int src_width, int src_height,
+ int dst_width, int dst_height,
+ enum RotationMode rotation,
+ uint32 format);
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_CONVERT_H_ NOLINT
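
The comment block above documents ConvertToI420's cropping, stride, and rotation parameters. A hedged sketch of one such call, converting a packed YUY2 camera frame to I420 with a centered crop and no rotation; kRotate0 and FOURCC_YUY2 are assumed to come from libyuv/rotate.h and libyuv/video_common.h, and the destination planes are assumed tightly packed:

#include "libyuv/convert.h"
#include "libyuv/video_common.h"  // assumed source of FOURCC_YUY2

static int Yuy2SampleToI420(const uint8* sample, size_t sample_size,
                            int src_width, int src_height,
                            uint8* dst_y, uint8* dst_u, uint8* dst_v,
                            int dst_width, int dst_height) {
  const int half_width = (dst_width + 1) / 2;
  // Center the crop, as recommended above.
  const int crop_x = (src_width - dst_width) / 2;
  const int crop_y = (src_height - dst_height) / 2;
  return libyuv::ConvertToI420(sample, sample_size,
                               dst_y, dst_width,   // dst_stride_y
                               dst_u, half_width,  // dst_stride_u
                               dst_v, half_width,  // dst_stride_v
                               crop_x, crop_y,
                               src_width, src_height,
                               dst_width, dst_height,
                               libyuv::kRotate0,
                               libyuv::FOURCC_YUY2);
}
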
diff --git a/chromium/third_party/libyuv/include/libyuv/convert_argb.h b/chromium/third_party/libyuv/include/libyuv/convert_argb.h
new file mode 100644
index 00000000000..3f029dc417c
--- /dev/null
+++ b/chromium/third_party/libyuv/include/libyuv/convert_argb.h
@@ -0,0 +1,225 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_CONVERT_ARGB_H_ // NOLINT
+#define INCLUDE_LIBYUV_CONVERT_ARGB_H_
+
+#include "libyuv/basic_types.h"
+// TODO(fbarchard): Remove the following header includes
+#include "libyuv/convert_from.h"
+#include "libyuv/planar_functions.h"
+#include "libyuv/rotate.h"
+
+// TODO(fbarchard): This set of functions should exactly match convert.h
+// Add missing Q420.
+// TODO(fbarchard): Add tests. Create random content of right size and convert
+// with C vs Opt and or to I420 and compare.
+// TODO(fbarchard): Some of these functions lack parameter setting.
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Alias.
+#define ARGBToARGB ARGBCopy
+
+// Copy ARGB to ARGB.
+LIBYUV_API
+int ARGBCopy(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Convert I420 to ARGB.
+LIBYUV_API
+int I420ToARGB(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Convert I422 to ARGB.
+LIBYUV_API
+int I422ToARGB(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Convert I444 to ARGB.
+LIBYUV_API
+int I444ToARGB(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Convert I411 to ARGB.
+LIBYUV_API
+int I411ToARGB(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Convert I400 (grey) to ARGB.
+LIBYUV_API
+int I400ToARGB(const uint8* src_y, int src_stride_y,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Alias.
+#define YToARGB I400ToARGB_Reference
+
+// Convert I400 to ARGB. Reverse of ARGBToI400.
+LIBYUV_API
+int I400ToARGB_Reference(const uint8* src_y, int src_stride_y,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Convert NV12 to ARGB.
+LIBYUV_API
+int NV12ToARGB(const uint8* src_y, int src_stride_y,
+ const uint8* src_uv, int src_stride_uv,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Convert NV21 to ARGB.
+LIBYUV_API
+int NV21ToARGB(const uint8* src_y, int src_stride_y,
+ const uint8* src_vu, int src_stride_vu,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Convert M420 to ARGB.
+LIBYUV_API
+int M420ToARGB(const uint8* src_m420, int src_stride_m420,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// TODO(fbarchard): Convert Q420 to ARGB.
+// LIBYUV_API
+// int Q420ToARGB(const uint8* src_y, int src_stride_y,
+// const uint8* src_yuy2, int src_stride_yuy2,
+// uint8* dst_argb, int dst_stride_argb,
+// int width, int height);
+
+// Convert YUY2 to ARGB.
+LIBYUV_API
+int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Convert UYVY to ARGB.
+LIBYUV_API
+int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// BGRA little endian (argb in memory) to ARGB.
+LIBYUV_API
+int BGRAToARGB(const uint8* src_frame, int src_stride_frame,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// ABGR little endian (rgba in memory) to ARGB.
+LIBYUV_API
+int ABGRToARGB(const uint8* src_frame, int src_stride_frame,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// RGBA little endian (abgr in memory) to ARGB.
+LIBYUV_API
+int RGBAToARGB(const uint8* src_frame, int src_stride_frame,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Deprecated function name.
+#define BG24ToARGB RGB24ToARGB
+
+// RGB little endian (bgr in memory) to ARGB.
+LIBYUV_API
+int RGB24ToARGB(const uint8* src_frame, int src_stride_frame,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// RGB big endian (rgb in memory) to ARGB.
+LIBYUV_API
+int RAWToARGB(const uint8* src_frame, int src_stride_frame,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// RGB16 (RGBP fourcc) little endian to ARGB.
+LIBYUV_API
+int RGB565ToARGB(const uint8* src_frame, int src_stride_frame,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// RGB15 (RGBO fourcc) little endian to ARGB.
+LIBYUV_API
+int ARGB1555ToARGB(const uint8* src_frame, int src_stride_frame,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// RGB12 (R444 fourcc) little endian to ARGB.
+LIBYUV_API
+int ARGB4444ToARGB(const uint8* src_frame, int src_stride_frame,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+#ifdef HAVE_JPEG
+// src_width/height provided by capture
+// dst_width/height for clipping determine final size.
+LIBYUV_API
+int MJPGToARGB(const uint8* sample, size_t sample_size,
+ uint8* dst_argb, int dst_stride_argb,
+ int src_width, int src_height,
+ int dst_width, int dst_height);
+#endif
+
+// Note Bayer formats (BGGR) to ARGB are in format_conversion.h.
+
+// Convert camera sample to ARGB with cropping, rotation and vertical flip.
+// "src_size" is needed to parse MJPG.
+// "dst_stride_argb" number of bytes in a row of the dst_argb plane.
+// Normally this would be the same as dst_width, with recommended alignment
+// to 16 bytes for better efficiency.
+// If rotation of 90 or 270 is used, stride is affected. The caller should
+// allocate the I420 buffer according to rotation.
+// "dst_stride_u" number of bytes in a row of the dst_u plane.
+// Normally this would be the same as (dst_width + 1) / 2, with
+// recommended alignment to 16 bytes for better efficiency.
+// If rotation of 90 or 270 is used, stride is affected.
+// "crop_x" and "crop_y" are starting position for cropping.
+// To center, crop_x = (src_width - dst_width) / 2
+// crop_y = (src_height - dst_height) / 2
+// "src_width" / "src_height" is size of src_frame in pixels.
+// "src_height" can be negative indicating a vertically flipped image source.
+// "dst_width" / "dst_height" is size of destination to crop to.
+// Must be less than or equal to src_width/src_height
+// Cropping parameters are pre-rotation.
+// "rotation" can be 0, 90, 180 or 270.
+// "format" is a fourcc. ie 'I420', 'YUY2'
+// Returns 0 for success; -1 for an invalid parameter. Non-zero for failure.
+LIBYUV_API
+int ConvertToARGB(const uint8* src_frame, size_t src_size,
+ uint8* dst_argb, int dst_stride_argb,
+ int crop_x, int crop_y,
+ int src_width, int src_height,
+ int dst_width, int dst_height,
+ enum RotationMode rotation,
+ uint32 format);
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_CONVERT_ARGB_H_ NOLINT
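
A brief sketch of the most common path declared above: expanding an I420 frame into a caller-provided 32-bit ARGB buffer whose rows are width * 4 bytes (the wrapper name and packed-stride assumption are illustrative):

#include "libyuv/convert_argb.h"

static int I420FrameToArgb(const uint8* y, int y_stride,
                           const uint8* u, int u_stride,
                           const uint8* v, int v_stride,
                           uint8* dst_argb, int width, int height) {
  return libyuv::I420ToARGB(y, y_stride,
                            u, u_stride,
                            v, v_stride,
                            dst_argb, width * 4,  // 4 bytes per ARGB pixel
                            width, height);
}
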
diff --git a/chromium/third_party/libyuv/include/libyuv/convert_from.h b/chromium/third_party/libyuv/include/libyuv/convert_from.h
new file mode 100644
index 00000000000..b1cf57f7dc0
--- /dev/null
+++ b/chromium/third_party/libyuv/include/libyuv/convert_from.h
@@ -0,0 +1,173 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_CONVERT_FROM_H_ // NOLINT
+#define INCLUDE_LIBYUV_CONVERT_FROM_H_
+
+#include "libyuv/basic_types.h"
+#include "libyuv/rotate.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// See Also convert.h for conversions from formats to I420.
+
+// I420Copy in convert to I420ToI420.
+
+LIBYUV_API
+int I420ToI422(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+LIBYUV_API
+int I420ToI444(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+LIBYUV_API
+int I420ToI411(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// Copy to I400. Source can be I420, I422, I444, I400, NV12 or NV21.
+LIBYUV_API
+int I400Copy(const uint8* src_y, int src_stride_y,
+ uint8* dst_y, int dst_stride_y,
+ int width, int height);
+
+// TODO(fbarchard): I420ToM420
+// TODO(fbarchard): I420ToQ420
+
+LIBYUV_API
+int I420ToNV12(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_uv, int dst_stride_uv,
+ int width, int height);
+
+LIBYUV_API
+int I420ToNV21(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_vu, int dst_stride_vu,
+ int width, int height);
+
+LIBYUV_API
+int I420ToYUY2(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_frame, int dst_stride_frame,
+ int width, int height);
+
+LIBYUV_API
+int I420ToUYVY(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_frame, int dst_stride_frame,
+ int width, int height);
+
+LIBYUV_API
+int I420ToARGB(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+LIBYUV_API
+int I420ToBGRA(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+LIBYUV_API
+int I420ToABGR(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+LIBYUV_API
+int I420ToRGBA(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_rgba, int dst_stride_rgba,
+ int width, int height);
+
+LIBYUV_API
+int I420ToRGB24(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_frame, int dst_stride_frame,
+ int width, int height);
+
+LIBYUV_API
+int I420ToRAW(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_frame, int dst_stride_frame,
+ int width, int height);
+
+LIBYUV_API
+int I420ToRGB565(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_frame, int dst_stride_frame,
+ int width, int height);
+
+LIBYUV_API
+int I420ToARGB1555(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_frame, int dst_stride_frame,
+ int width, int height);
+
+LIBYUV_API
+int I420ToARGB4444(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_frame, int dst_stride_frame,
+ int width, int height);
+
+// Note Bayer formats (BGGR) To I420 are in format_conversion.h.
+
+// Convert I420 to specified format.
+// "dst_sample_stride" is bytes in a row for the destination. Pass 0 if the
+// buffer has contiguous rows. Can be negative. A multiple of 16 is optimal.
+LIBYUV_API
+int ConvertFromI420(const uint8* y, int y_stride,
+ const uint8* u, int u_stride,
+ const uint8* v, int v_stride,
+ uint8* dst_sample, int dst_sample_stride,
+ int width, int height,
+ uint32 format);
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_CONVERT_FROM_H_ NOLINT
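
ConvertFromI420 above fans out to whatever format the fourcc names; per its comment, passing 0 for dst_sample_stride treats the destination rows as contiguous. A hedged sketch that packs I420 into a contiguous YUY2 buffer, with FOURCC_YUY2 assumed to come from libyuv/video_common.h:

#include "libyuv/convert_from.h"
#include "libyuv/video_common.h"  // assumed source of FOURCC_YUY2

static int I420ToYuy2Buffer(const uint8* y, int y_stride,
                            const uint8* u, int u_stride,
                            const uint8* v, int v_stride,
                            uint8* dst_yuy2, int width, int height) {
  return libyuv::ConvertFromI420(y, y_stride, u, u_stride, v, v_stride,
                                 dst_yuy2, 0,  // 0 = contiguous destination rows
                                 width, height,
                                 libyuv::FOURCC_YUY2);
}
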
diff --git a/chromium/third_party/libyuv/include/libyuv/convert_from_argb.h b/chromium/third_party/libyuv/include/libyuv/convert_from_argb.h
new file mode 100644
index 00000000000..be3bba44433
--- /dev/null
+++ b/chromium/third_party/libyuv/include/libyuv/convert_from_argb.h
@@ -0,0 +1,168 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_ // NOLINT
+#define INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Copy ARGB to ARGB.
+#define ARGBToARGB ARGBCopy
+LIBYUV_API
+int ARGBCopy(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Convert ARGB To BGRA. (alias)
+#define ARGBToBGRA BGRAToARGB
+LIBYUV_API
+int BGRAToARGB(const uint8* src_frame, int src_stride_frame,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Convert ARGB To ABGR. (alias)
+#define ARGBToABGR ABGRToARGB
+LIBYUV_API
+int ABGRToARGB(const uint8* src_frame, int src_stride_frame,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Convert ARGB To RGBA.
+LIBYUV_API
+int ARGBToRGBA(const uint8* src_frame, int src_stride_frame,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Convert ARGB To RGB24.
+LIBYUV_API
+int ARGBToRGB24(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_rgb24, int dst_stride_rgb24,
+ int width, int height);
+
+// Convert ARGB To RAW.
+LIBYUV_API
+int ARGBToRAW(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_rgb, int dst_stride_rgb,
+ int width, int height);
+
+// Convert ARGB To RGB565.
+LIBYUV_API
+int ARGBToRGB565(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_rgb565, int dst_stride_rgb565,
+ int width, int height);
+
+// Convert ARGB To ARGB1555.
+LIBYUV_API
+int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb1555, int dst_stride_argb1555,
+ int width, int height);
+
+// Convert ARGB To ARGB4444.
+LIBYUV_API
+int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb4444, int dst_stride_argb4444,
+ int width, int height);
+
+// Convert ARGB To I444.
+LIBYUV_API
+int ARGBToI444(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// Convert ARGB To I422.
+LIBYUV_API
+int ARGBToI422(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// Convert ARGB To I420. (also in convert.h)
+LIBYUV_API
+int ARGBToI420(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// Convert ARGB to J420. (JPeg full range I420).
+LIBYUV_API
+int ARGBToJ420(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_yj, int dst_stride_yj,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// Convert ARGB To I411.
+LIBYUV_API
+int ARGBToI411(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// Convert ARGB to J400. (JPeg full range).
+LIBYUV_API
+int ARGBToJ400(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_yj, int dst_stride_yj,
+ int width, int height);
+
+// Convert ARGB to I400.
+LIBYUV_API
+int ARGBToI400(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_y, int dst_stride_y,
+ int width, int height);
+
+// Convert ARGB To NV12.
+LIBYUV_API
+int ARGBToNV12(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_uv, int dst_stride_uv,
+ int width, int height);
+
+// Convert ARGB To NV21.
+LIBYUV_API
+int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_vu, int dst_stride_vu,
+ int width, int height);
+
+// Convert ARGB To NV21.
+LIBYUV_API
+int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_vu, int dst_stride_vu,
+ int width, int height);
+
+// Convert ARGB To YUY2.
+LIBYUV_API
+int ARGBToYUY2(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_yuy2, int dst_stride_yuy2,
+ int width, int height);
+
+// Convert ARGB To UYVY.
+LIBYUV_API
+int ARGBToUYVY(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_uyvy, int dst_stride_uyvy,
+ int width, int height);
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_ NOLINT
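
To illustrate the ARGB-to-NV12 declaration above: the destination is a full-resolution Y plane plus an interleaved UV plane at half vertical resolution. The even-width and packed-stride assumptions in this sketch are illustrative:

#include "libyuv/convert_from_argb.h"

static int ArgbFrameToNv12(const uint8* argb, int width, int height,
                           uint8* dst_y, uint8* dst_uv) {
  // For an even width, a packed UV plane also uses 'width' bytes per row
  // (half the columns, two bytes per column).
  return libyuv::ARGBToNV12(argb, width * 4,
                            dst_y, width,
                            dst_uv, width,
                            width, height);
}
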
diff --git a/chromium/third_party/libyuv/include/libyuv/cpu_id.h b/chromium/third_party/libyuv/include/libyuv/cpu_id.h
new file mode 100644
index 00000000000..8b6d043222b
--- /dev/null
+++ b/chromium/third_party/libyuv/include/libyuv/cpu_id.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_CPU_ID_H_ // NOLINT
+#define INCLUDE_LIBYUV_CPU_ID_H_
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Internal flag to indicate cpuid requires initialization.
+static const int kCpuInit = 0x1;
+
+// These flags are only valid on ARM processors.
+static const int kCpuHasARM = 0x2;
+static const int kCpuHasNEON = 0x4;
+// 0x8 reserved for future ARM flag.
+
+// These flags are only valid on x86 processors.
+static const int kCpuHasX86 = 0x10;
+static const int kCpuHasSSE2 = 0x20;
+static const int kCpuHasSSSE3 = 0x40;
+static const int kCpuHasSSE41 = 0x80;
+static const int kCpuHasSSE42 = 0x100;
+static const int kCpuHasAVX = 0x200;
+static const int kCpuHasAVX2 = 0x400;
+static const int kCpuHasERMS = 0x800;
+
+// These flags are only valid on MIPS processors.
+static const int kCpuHasMIPS = 0x1000;
+static const int kCpuHasMIPS_DSP = 0x2000;
+static const int kCpuHasMIPS_DSPR2 = 0x4000;
+
+// Internal function used to auto-init.
+LIBYUV_API
+int InitCpuFlags(void);
+
+// Internal function for parsing /proc/cpuinfo.
+LIBYUV_API
+int ArmCpuCaps(const char* cpuinfo_name);
+
+// Detect whether the CPU has SSE2, etc.
+// The test_flag parameter should be one of the kCpuHas constants above.
+// Returns non-zero if the instruction set is detected.
+static __inline int TestCpuFlag(int test_flag) {
+ LIBYUV_API extern int cpu_info_;
+ return (cpu_info_ == kCpuInit ? InitCpuFlags() : cpu_info_) & test_flag;
+}
+
+// For testing, allow CPU flags to be disabled.
+// ie MaskCpuFlags(~kCpuHasSSSE3) to disable SSSE3.
+// MaskCpuFlags(-1) to enable all cpu specific optimizations.
+// MaskCpuFlags(0) to disable all cpu specific optimizations.
+LIBYUV_API
+void MaskCpuFlags(int enable_flags);
+
+// Low level cpuid for X86. Returns zeros on other CPUs.
+LIBYUV_API
+void CpuId(int cpu_info[4], int info_type);
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_CPU_ID_H_ NOLINT
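
A short sketch of the CPU-feature API above: TestCpuFlag queries a single kCpuHas flag (initializing the flag set on first use), and MaskCpuFlags can disable a specific optimization, as the header's SSSE3 example suggests. The helper names are illustrative:

#include "libyuv/cpu_id.h"

static bool CpuHasSsse3() {
  return libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) != 0;
}

static void DisableSsse3ForBenchmarking() {
  // Re-enable everything afterwards with MaskCpuFlags(-1).
  libyuv::MaskCpuFlags(~libyuv::kCpuHasSSSE3);
}
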
diff --git a/chromium/third_party/libyuv/include/libyuv/format_conversion.h b/chromium/third_party/libyuv/include/libyuv/format_conversion.h
new file mode 100644
index 00000000000..b18bf053438
--- /dev/null
+++ b/chromium/third_party/libyuv/include/libyuv/format_conversion.h
@@ -0,0 +1,168 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_FORMATCONVERSION_H_ // NOLINT
+#define INCLUDE_LIBYUV_FORMATCONVERSION_H_
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Convert Bayer RGB formats to I420.
+LIBYUV_API
+int BayerBGGRToI420(const uint8* src_bayer, int src_stride_bayer,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+LIBYUV_API
+int BayerGBRGToI420(const uint8* src_bayer, int src_stride_bayer,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+LIBYUV_API
+int BayerGRBGToI420(const uint8* src_bayer, int src_stride_bayer,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+LIBYUV_API
+int BayerRGGBToI420(const uint8* src_bayer, int src_stride_bayer,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// Temporary API mapper.
+#define BayerRGBToI420(b, bs, f, y, ys, u, us, v, vs, w, h) \
+ BayerToI420(b, bs, y, ys, u, us, v, vs, w, h, f)
+
+LIBYUV_API
+int BayerToI420(const uint8* src_bayer, int src_stride_bayer,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height,
+ uint32 src_fourcc_bayer);
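As an illustration (editorial, not part of the import), a hedged sketch of driving the Bayer-to-I420 entry points above; the helper name is hypothetical, and the strides assume even dimensions and tightly packed I420 planes.

#include "libyuv/format_conversion.h"

// Convert one BGGR Bayer frame into caller-provided I420 planes.
int BggrFrameToI420(const uint8* bayer, int width, int height,
                    uint8* dst_y, uint8* dst_u, uint8* dst_v) {
  return libyuv::BayerBGGRToI420(bayer, width,      // Bayer data, 1 byte/pixel
                                 dst_y, width,
                                 dst_u, width / 2,  // U/V are quarter size
                                 dst_v, width / 2,
                                 width, height);
}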
+
+// Convert I420 to Bayer RGB formats.
+LIBYUV_API
+int I420ToBayerBGGR(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_frame, int dst_stride_frame,
+ int width, int height);
+
+LIBYUV_API
+int I420ToBayerGBRG(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_frame, int dst_stride_frame,
+ int width, int height);
+
+LIBYUV_API
+int I420ToBayerGRBG(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_frame, int dst_stride_frame,
+ int width, int height);
+
+LIBYUV_API
+int I420ToBayerRGGB(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_frame, int dst_stride_frame,
+ int width, int height);
+
+// Temporary API mapper.
+#define I420ToBayerRGB(y, ys, u, us, v, vs, b, bs, f, w, h) \
+ I420ToBayer(y, ys, u, us, v, vs, b, bs, w, h, f)
+
+LIBYUV_API
+int I420ToBayer(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_frame, int dst_stride_frame,
+ int width, int height,
+ uint32 dst_fourcc_bayer);
+
+// Convert Bayer RGB formats to ARGB.
+LIBYUV_API
+int BayerBGGRToARGB(const uint8* src_bayer, int src_stride_bayer,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+LIBYUV_API
+int BayerGBRGToARGB(const uint8* src_bayer, int src_stride_bayer,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+LIBYUV_API
+int BayerGRBGToARGB(const uint8* src_bayer, int src_stride_bayer,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+LIBYUV_API
+int BayerRGGBToARGB(const uint8* src_bayer, int src_stride_bayer,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Temporary API mapper.
+#define BayerRGBToARGB(b, bs, f, a, as, w, h) BayerToARGB(b, bs, a, as, w, h, f)
+
+LIBYUV_API
+int BayerToARGB(const uint8* src_bayer, int src_stride_bayer,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height,
+ uint32 src_fourcc_bayer);
+
+// Converts ARGB to Bayer RGB formats.
+LIBYUV_API
+int ARGBToBayerBGGR(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_bayer, int dst_stride_bayer,
+ int width, int height);
+
+LIBYUV_API
+int ARGBToBayerGBRG(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_bayer, int dst_stride_bayer,
+ int width, int height);
+
+LIBYUV_API
+int ARGBToBayerGRBG(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_bayer, int dst_stride_bayer,
+ int width, int height);
+
+LIBYUV_API
+int ARGBToBayerRGGB(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_bayer, int dst_stride_bayer,
+ int width, int height);
+
+// Temporary API mapper.
+#define ARGBToBayerRGB(a, as, b, bs, f, w, h) ARGBToBayer(b, bs, a, as, w, h, f)
+
+LIBYUV_API
+int ARGBToBayer(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_bayer, int dst_stride_bayer,
+ int width, int height,
+ uint32 dst_fourcc_bayer);
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_FORMATCONVERSION_H_ NOLINT
diff --git a/chromium/third_party/libyuv/include/libyuv/mjpeg_decoder.h b/chromium/third_party/libyuv/include/libyuv/mjpeg_decoder.h
new file mode 100644
index 00000000000..e53c1fe1e2e
--- /dev/null
+++ b/chromium/third_party/libyuv/include/libyuv/mjpeg_decoder.h
@@ -0,0 +1,193 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_MJPEG_DECODER_H_ // NOLINT
+#define INCLUDE_LIBYUV_MJPEG_DECODER_H_
+
+#ifdef __cplusplus
+
+#include "libyuv/basic_types.h"
+
+// NOTE: For a simplified public API use convert.h MJPGToI420().
+
+struct jpeg_common_struct;
+struct jpeg_decompress_struct;
+struct jpeg_source_mgr;
+
+namespace libyuv {
+
+bool ValidateJpeg(const uint8* sample, size_t sample_size);
+
+static const uint32 kUnknownDataSize = 0xFFFFFFFF;
+
+enum JpegSubsamplingType {
+ kJpegYuv420,
+ kJpegYuv422,
+ kJpegYuv411,
+ kJpegYuv444,
+ kJpegYuv400,
+ kJpegUnknown
+};
+
+struct SetJmpErrorMgr;
+
+// MJPEG ("Motion JPEG") is a pseudo-standard video codec where the frames are
+// simply independent JPEG images with a fixed Huffman table (which is omitted).
+// It is rarely used in video transmission, but is common as a camera capture
+// format, especially in Logitech devices. This class implements a decoder for
+// MJPEG frames.
+//
+// See http://tools.ietf.org/html/rfc2435
+class MJpegDecoder {
+ public:
+ typedef void (*CallbackFunction)(void* opaque,
+ const uint8* const* data,
+ const int* strides,
+ int rows);
+
+ static const int kColorSpaceUnknown;
+ static const int kColorSpaceGrayscale;
+ static const int kColorSpaceRgb;
+ static const int kColorSpaceYCbCr;
+ static const int kColorSpaceCMYK;
+ static const int kColorSpaceYCCK;
+
+ MJpegDecoder();
+ ~MJpegDecoder();
+
+ // Loads a new frame, reads its headers, and determines the uncompressed
+ // image format. Returns true if image looks valid and format is supported.
+ // If return value is true, then the values for all the following getters
+ // are populated.
+ // src_len is the size of the compressed mjpeg frame in bytes.
+ bool LoadFrame(const uint8* src, size_t src_len);
+
+ // Returns width of the last loaded frame in pixels.
+ int GetWidth();
+
+ // Returns height of the last loaded frame in pixels.
+ int GetHeight();
+
+ // Returns format of the last loaded frame. The return value is one of the
+ // kColorSpace* constants.
+ int GetColorSpace();
+
+ // Number of color components in the color space.
+ int GetNumComponents();
+
+ // Sample factors of the n-th component.
+ int GetHorizSampFactor(int component);
+
+ int GetVertSampFactor(int component);
+
+ int GetHorizSubSampFactor(int component);
+
+ int GetVertSubSampFactor(int component);
+
+ // Public for testability.
+ int GetImageScanlinesPerImcuRow();
+
+ // Public for testability.
+ int GetComponentScanlinesPerImcuRow(int component);
+
+ // Width of a component in bytes.
+ int GetComponentWidth(int component);
+
+ // Height of a component.
+ int GetComponentHeight(int component);
+
+ // Width of a component in bytes with padding for DCTSIZE. Public for testing.
+ int GetComponentStride(int component);
+
+ // Size of a component in bytes.
+ int GetComponentSize(int component);
+
+ // Call this after LoadFrame() if you decide you don't want to decode it
+ // after all.
+ bool UnloadFrame();
+
+ // Decodes the entire image into a one-buffer-per-color-component format.
+  // dst_width must match exactly. dst_height must be <= the image height; if
+ // less, the image is cropped. "planes" must have size equal to at least
+ // GetNumComponents() and they must point to non-overlapping buffers of size
+ // at least GetComponentSize(i). The pointers in planes are incremented
+ // to point to after the end of the written data.
+ // TODO(fbarchard): Add dst_x, dst_y to allow specific rect to be decoded.
+ bool DecodeToBuffers(uint8** planes, int dst_width, int dst_height);
+
+ // Decodes the entire image and passes the data via repeated calls to a
+ // callback function. Each call will get the data for a whole number of
+ // image scanlines.
+ // TODO(fbarchard): Add dst_x, dst_y to allow specific rect to be decoded.
+ bool DecodeToCallback(CallbackFunction fn, void* opaque,
+ int dst_width, int dst_height);
+
+ // The helper function which recognizes the jpeg sub-sampling type.
+ static JpegSubsamplingType JpegSubsamplingTypeHelper(
+ int* subsample_x, int* subsample_y, int number_of_components);
+
+ private:
+ struct Buffer {
+ const uint8* data;
+ int len;
+ };
+
+ struct BufferVector {
+ Buffer* buffers;
+ int len;
+ int pos;
+ };
+
+ // Methods that are passed to jpeglib.
+ static int fill_input_buffer(jpeg_decompress_struct* cinfo);
+ static void init_source(jpeg_decompress_struct* cinfo);
+ static void skip_input_data(jpeg_decompress_struct* cinfo,
+ long num_bytes); // NOLINT
+ static void term_source(jpeg_decompress_struct* cinfo);
+
+ static void ErrorHandler(jpeg_common_struct* cinfo);
+
+ void AllocOutputBuffers(int num_outbufs);
+ void DestroyOutputBuffers();
+
+ bool StartDecode();
+ bool FinishDecode();
+
+ void SetScanlinePointers(uint8** data);
+ bool DecodeImcuRow();
+
+ int GetComponentScanlinePadding(int component);
+
+ // A buffer holding the input data for a frame.
+ Buffer buf_;
+ BufferVector buf_vec_;
+
+ jpeg_decompress_struct* decompress_struct_;
+ jpeg_source_mgr* source_mgr_;
+ SetJmpErrorMgr* error_mgr_;
+
+ // true iff at least one component has scanline padding. (i.e.,
+ // GetComponentScanlinePadding() != 0.)
+ bool has_scanline_padding_;
+
+ // Temporaries used to point to scanline outputs.
+ int num_outbufs_; // Outermost size of all arrays below.
+ uint8*** scanlines_;
+ int* scanlines_sizes_;
+ // Temporary buffer used for decoding when we can't decode directly to the
+ // output buffers. Large enough for just one iMCU row.
+ uint8** databuf_;
+ int* databuf_strides_;
+};
+
+} // namespace libyuv
+
+#endif // __cplusplus
+#endif // INCLUDE_LIBYUV_MJPEG_DECODER_H_ NOLINT
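A brief usage sketch for the class above (editorial, not part of the import): ValidateJpeg, LoadFrame, the getters and DecodeToBuffers are as declared in this header; the caller-provided plane sizing and the helper name are assumptions of the sketch.

#include "libyuv/mjpeg_decoder.h"

// Decode one MJPEG frame; planes[i] must hold at least GetComponentSize(i)
// bytes for each of GetNumComponents() components.
bool DecodeMjpegFrame(const uint8* jpg, size_t jpg_size, uint8** planes) {
  if (!libyuv::ValidateJpeg(jpg, jpg_size)) {
    return false;
  }
  libyuv::MJpegDecoder decoder;
  if (!decoder.LoadFrame(jpg, jpg_size)) {
    return false;
  }
  // Note: DecodeToBuffers advances the pointers in |planes| past the data it
  // writes, so pass scratch copies if the originals are still needed.
  return decoder.DecodeToBuffers(planes, decoder.GetWidth(),
                                 decoder.GetHeight());
}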
diff --git a/chromium/third_party/libyuv/include/libyuv/planar_functions.h b/chromium/third_party/libyuv/include/libyuv/planar_functions.h
new file mode 100644
index 00000000000..cb14678a8b3
--- /dev/null
+++ b/chromium/third_party/libyuv/include/libyuv/planar_functions.h
@@ -0,0 +1,367 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_ // NOLINT
+#define INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_
+
+#include "libyuv/basic_types.h"
+
+// TODO(fbarchard): Remove the following headers includes.
+#include "libyuv/convert.h"
+#include "libyuv/convert_argb.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Copy a plane of data.
+LIBYUV_API
+void CopyPlane(const uint8* src_y, int src_stride_y,
+ uint8* dst_y, int dst_stride_y,
+ int width, int height);
+
+// Set a plane of data to a 32 bit value.
+LIBYUV_API
+void SetPlane(uint8* dst_y, int dst_stride_y,
+ int width, int height,
+ uint32 value);
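A small sketch combining the two plane helpers above (editorial; the wrapper is hypothetical): clear a destination Y plane, then copy a source plane into it.

#include "libyuv/planar_functions.h"

void ClearThenCopyPlane(const uint8* src_y, int src_stride_y,
                        uint8* dst_y, int dst_stride_y,
                        int width, int height) {
  libyuv::SetPlane(dst_y, dst_stride_y, width, height, 0);  // black Y plane
  libyuv::CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
}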
+
+// Copy I400. Supports inverting.
+LIBYUV_API
+int I400ToI400(const uint8* src_y, int src_stride_y,
+ uint8* dst_y, int dst_stride_y,
+ int width, int height);
+
+// Copy I422 to I422.
+#define I422ToI422 I422Copy
+LIBYUV_API
+int I422Copy(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// Copy I444 to I444.
+#define I444ToI444 I444Copy
+LIBYUV_API
+int I444Copy(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// Convert YUY2 to I422.
+LIBYUV_API
+int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// Convert UYVY to I422.
+int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// Convert I420 to I400. (calls CopyPlane ignoring u/v).
+LIBYUV_API
+int I420ToI400(const uint8* src_y, int src_stride_y,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// Alias
+#define I420ToI420Mirror I420Mirror
+
+// I420 mirror.
+LIBYUV_API
+int I420Mirror(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// Alias
+#define I400ToI400Mirror I400Mirror
+
+// I400 mirror. A single plane is mirrored horizontally.
+// Pass negative height to achieve 180 degree rotation.
+LIBYUV_API
+int I400Mirror(const uint8* src_y, int src_stride_y,
+ uint8* dst_y, int dst_stride_y,
+ int width, int height);
+
+// Alias
+#define ARGBToARGBMirror ARGBMirror
+
+// ARGB mirror.
+LIBYUV_API
+int ARGBMirror(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Convert NV12 to RGB565.
+LIBYUV_API
+int NV12ToRGB565(const uint8* src_y, int src_stride_y,
+ const uint8* src_uv, int src_stride_uv,
+ uint8* dst_rgb565, int dst_stride_rgb565,
+ int width, int height);
+
+// Convert NV21 to RGB565.
+LIBYUV_API
+int NV21ToRGB565(const uint8* src_y, int src_stride_y,
+ const uint8* src_uv, int src_stride_uv,
+ uint8* dst_rgb565, int dst_stride_rgb565,
+ int width, int height);
+
+// I422ToARGB is in convert_argb.h
+// Convert I422 to BGRA.
+LIBYUV_API
+int I422ToBGRA(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_bgra, int dst_stride_bgra,
+ int width, int height);
+
+// Convert I422 to ABGR.
+LIBYUV_API
+int I422ToABGR(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_abgr, int dst_stride_abgr,
+ int width, int height);
+
+// Convert I422 to RGBA.
+LIBYUV_API
+int I422ToRGBA(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_rgba, int dst_stride_rgba,
+ int width, int height);
+
+// Draw a rectangle into I420.
+LIBYUV_API
+int I420Rect(uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int x, int y, int width, int height,
+ int value_y, int value_u, int value_v);
+
+// Draw a rectangle into ARGB.
+LIBYUV_API
+int ARGBRect(uint8* dst_argb, int dst_stride_argb,
+ int x, int y, int width, int height, uint32 value);
+
+// Convert ARGB to gray scale ARGB.
+LIBYUV_API
+int ARGBGrayTo(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Make a rectangle of ARGB gray scale.
+LIBYUV_API
+int ARGBGray(uint8* dst_argb, int dst_stride_argb,
+ int x, int y, int width, int height);
+
+// Make a rectangle of ARGB Sepia tone.
+LIBYUV_API
+int ARGBSepia(uint8* dst_argb, int dst_stride_argb,
+ int x, int y, int width, int height);
+
+// Apply a color matrix transform to each ARGB pixel.
+// matrix_argb is 3 rows of 4 signed values, each in the range -128 to 127,
+// representing -1 to 1.
+// The first 4 coefficients apply to B, G, R, A and produce B of the output.
+// The next 4 coefficients apply to B, G, R, A and produce G of the output.
+// The last 4 coefficients apply to B, G, R, A and produce R of the output.
+LIBYUV_API
+int ARGBColorMatrix(uint8* dst_argb, int dst_stride_argb,
+ const int8* matrix_argb,
+ int x, int y, int width, int height);
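To make the coefficient layout above concrete, a hedged example (editorial): assuming the -128..127 range acts as a /128 fixed-point weight, the matrix below gives every output channel the same weighted sum of B, G and R, producing a grayscale-looking result. The function name and exact weights are illustrative.

#include "libyuv/planar_functions.h"

void GrayViaColorMatrix(uint8* dst_argb, int dst_stride_argb,
                        int width, int height) {
  // Each row: weights for B, G, R, A; 14 + 76 + 38 = 128 (i.e. ~1.0).
  static const int8 kGrayMatrix[12] = {
    14, 76, 38, 0,  // output B
    14, 76, 38, 0,  // output G
    14, 76, 38, 0,  // output R
  };
  libyuv::ARGBColorMatrix(dst_argb, dst_stride_argb, kGrayMatrix,
                          0, 0, width, height);
}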
+
+// Apply a color table to each ARGB pixel.
+// Table contains 256 ARGB values.
+LIBYUV_API
+int ARGBColorTable(uint8* dst_argb, int dst_stride_argb,
+ const uint8* table_argb,
+ int x, int y, int width, int height);
+
+// Quantize a rectangle of ARGB. Alpha unaffected.
+// scale is a 16 bit fractional fixed point scaler between 0 and 65535.
+// interval_size should be a value between 1 and 255.
+// interval_offset should be a value between 0 and 255.
+LIBYUV_API
+int ARGBQuantize(uint8* dst_argb, int dst_stride_argb,
+ int scale, int interval_size, int interval_offset,
+ int x, int y, int width, int height);
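A worked example for the parameters above (editorial, hedged): assuming the quantization is roughly out = (in * scale >> 16) * interval_size + interval_offset, which is what the parameter names and ranges suggest, the call below posterizes each channel to 8 levels.

#include "libyuv/planar_functions.h"

void PosterizeTo8Levels(uint8* dst_argb, int dst_stride_argb,
                        int width, int height) {
  const int interval_size = 32;                // 256 / 32 = 8 levels
  libyuv::ARGBQuantize(dst_argb, dst_stride_argb,
                       65536 / interval_size,  // scale, 16 bit fixed point
                       interval_size,
                       interval_size / 2,      // center of each interval
                       0, 0, width, height);
}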
+
+// Copy ARGB to ARGB.
+LIBYUV_API
+int ARGBCopy(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+typedef void (*ARGBBlendRow)(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width);
+
+// Get function to Alpha Blend ARGB pixels and store to destination.
+LIBYUV_API
+ARGBBlendRow GetARGBBlend();
+
+// Alpha Blend ARGB images and store to destination.
+// Alpha of destination is set to 255.
+LIBYUV_API
+int ARGBBlend(const uint8* src_argb0, int src_stride_argb0,
+ const uint8* src_argb1, int src_stride_argb1,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
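A usage sketch for the blend entry point above (editorial): the helper name is hypothetical, and the assumption that src_argb0 acts as the foreground whose alpha drives the blend (and should be attenuated first, see ARGBAttenuate below) is inferred rather than stated by this header.

#include "libyuv/planar_functions.h"

void BlendForegroundOverBackground(const uint8* fg_argb, int fg_stride,
                                   const uint8* bg_argb, int bg_stride,
                                   uint8* dst_argb, int dst_stride,
                                   int width, int height) {
  libyuv::ARGBBlend(fg_argb, fg_stride,   // src_argb0: foreground with alpha
                    bg_argb, bg_stride,   // src_argb1: background
                    dst_argb, dst_stride,
                    width, height);
}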
+
+// Multiply ARGB image by ARGB image. Shifted down by 8. Saturates to 255.
+LIBYUV_API
+int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0,
+ const uint8* src_argb1, int src_stride_argb1,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Add ARGB image with ARGB image. Saturates to 255.
+LIBYUV_API
+int ARGBAdd(const uint8* src_argb0, int src_stride_argb0,
+ const uint8* src_argb1, int src_stride_argb1,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Subtract ARGB image (argb1) from ARGB image (argb0). Saturates to 0.
+LIBYUV_API
+int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0,
+ const uint8* src_argb1, int src_stride_argb1,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Convert I422 to YUY2.
+LIBYUV_API
+int I422ToYUY2(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_frame, int dst_stride_frame,
+ int width, int height);
+
+// Convert I422 to UYVY.
+LIBYUV_API
+int I422ToUYVY(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_frame, int dst_stride_frame,
+ int width, int height);
+
+// Convert unattenuated ARGB to preattenuated ARGB.
+LIBYUV_API
+int ARGBAttenuate(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Convert preattenuated ARGB to unattenuated ARGB.
+LIBYUV_API
+int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Convert MJPG to ARGB.
+LIBYUV_API
+int MJPGToARGB(const uint8* sample, size_t sample_size,
+ uint8* argb, int argb_stride,
+ int w, int h, int dw, int dh);
+
+// Computes a table of cumulative sums for the image, where each entry is the
+// sum of all values above and to the left of it. Used by ARGBBlur.
+LIBYUV_API
+int ARGBComputeCumulativeSum(const uint8* src_argb, int src_stride_argb,
+ int32* dst_cumsum, int dst_stride32_cumsum,
+ int width, int height);
+
+// Blur ARGB image.
+// Caller should allocate dst_cumsum table of width * height * 16 bytes aligned
+// to 16 byte boundary.
+LIBYUV_API
+int ARGBBlur(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ int32* dst_cumsum, int dst_stride32_cumsum,
+ int width, int height, int radius);
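A sketch of satisfying the allocation requirement above (editorial): the cumulative-sum table needs width * height * 16 bytes on a 16-byte boundary. The manual over-allocate-and-align trick, and the stride being passed in int32 units (width * 4), are assumptions drawn from the parameter names.

#include <stdint.h>
#include "libyuv/planar_functions.h"

int BlurArgb(const uint8* src_argb, int src_stride,
             uint8* dst_argb, int dst_stride,
             int width, int height, int radius) {
  // 4 int32 (16 bytes) per pixel; allocate 4 extra int32 so we can align up.
  int32* raw = new int32[static_cast<size_t>(width) * height * 4 + 4];
  int32* cumsum = reinterpret_cast<int32*>(
      (reinterpret_cast<uintptr_t>(raw) + 15) & ~static_cast<uintptr_t>(15));
  int result = libyuv::ARGBBlur(src_argb, src_stride, dst_argb, dst_stride,
                                cumsum, width * 4,  // stride in int32 units
                                width, height, radius);
  delete[] raw;
  return result;
}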
+
+// Multiply ARGB image by ARGB value.
+LIBYUV_API
+int ARGBShade(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height, uint32 value);
+
+// Interpolate between two ARGB images using specified amount of interpolation
+// (0 to 255) and store to destination.
+// 'interpolation' is specified as an 8 bit fraction where 0 means 100%
+// src_argb0 and 255 means roughly 1% src_argb0 and 99% src_argb1.
+// Internally uses ARGBScale bilinear filtering.
+// Caveat: This function will write up to 16 bytes beyond the end of dst_argb.
+LIBYUV_API
+int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0,
+ const uint8* src_argb1, int src_stride_argb1,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height, int interpolation);
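A small sketch of the parameter above (editorial; the wrapper name is hypothetical): interpolation = 128 gives an approximately even crossfade of the two frames.

#include "libyuv/planar_functions.h"

void CrossfadeHalf(const uint8* src0, int src0_stride,
                   const uint8* src1, int src1_stride,
                   uint8* dst_argb, int dst_stride,
                   int width, int height) {
  // Remember the caveat above: up to 16 bytes past the end of dst may be
  // written, so the destination allocation needs that slack.
  libyuv::ARGBInterpolate(src0, src0_stride, src1, src1_stride,
                          dst_argb, dst_stride, width, height, 128);
}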
+
+#if defined(__CLR_VER) || defined(COVERAGE_ENABLED) || \
+ defined(TARGET_IPHONE_SIMULATOR)
+#define LIBYUV_DISABLE_X86
+#endif
+// Row functions for copying pixels from a source, along a slope, to a row
+// of destination. Useful for scaling, rotation, mirror, texture mapping.
+LIBYUV_API
+void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride,
+ uint8* dst_argb, const float* uv_dudv, int width);
+// The following are available on all x86 platforms:
+#if !defined(LIBYUV_DISABLE_X86) && \
+ (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
+LIBYUV_API
+void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
+ uint8* dst_argb, const float* uv_dudv, int width);
+#define HAS_ARGBAFFINEROW_SSE2
+#endif // LIBYUV_DISABLE_X86
+
+// Shuffle ARGB channel order. e.g. BGRA to ARGB.
+// shuffler is 16 bytes and must be aligned.
+LIBYUV_API
+int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra,
+ uint8* dst_argb, int dst_stride_argb,
+ const uint8* shuffler, int width, int height);
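To illustrate the 16-byte shuffler above (editorial): each index selects the source byte for the corresponding destination byte within a 4-pixel group, so reversing every 4-byte group converts BGRA byte order to ARGB byte order. The exact index table and the omission of an explicit alignment attribute (the real table must be 16-byte aligned) are assumptions of this sketch.

#include "libyuv/planar_functions.h"

// Must be 16-byte aligned in real use; alignment attribute omitted for brevity.
static const uint8 kShuffleBGRAToARGB[16] = {
  3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
};

void BGRAToARGBViaShuffle(const uint8* src_bgra, int src_stride,
                          uint8* dst_argb, int dst_stride,
                          int width, int height) {
  libyuv::ARGBShuffle(src_bgra, src_stride, dst_argb, dst_stride,
                      kShuffleBGRAToARGB, width, height);
}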
+
+// Sobel ARGB effect.
+LIBYUV_API
+int ARGBSobel(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Sobel ARGB effect with Sobel X, Sobel, and Sobel Y stored in the
+// ARGB channels.
+LIBYUV_API
+int ARGBSobelXY(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_ NOLINT
diff --git a/chromium/third_party/libyuv/include/libyuv/rotate.h b/chromium/third_party/libyuv/include/libyuv/rotate.h
new file mode 100644
index 00000000000..20b7c04c87a
--- /dev/null
+++ b/chromium/third_party/libyuv/include/libyuv/rotate.h
@@ -0,0 +1,117 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_ROTATE_H_ // NOLINT
+#define INCLUDE_LIBYUV_ROTATE_H_
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Supported rotation.
+enum RotationMode {
+ kRotate0 = 0, // No rotation.
+ kRotate90 = 90, // Rotate 90 degrees clockwise.
+ kRotate180 = 180, // Rotate 180 degrees.
+ kRotate270 = 270, // Rotate 270 degrees clockwise.
+
+ // Deprecated.
+ kRotateNone = 0,
+ kRotateClockwise = 90,
+ kRotateCounterClockwise = 270,
+};
+
+// Rotate I420 frame.
+LIBYUV_API
+int I420Rotate(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int src_width, int src_height, enum RotationMode mode);
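A usage sketch for the rotation above (editorial; the helper name is hypothetical): rotating 90 degrees transposes the frame, so the destination planes are src_height wide, and the strides below assume even dimensions and tightly packed planes.

#include "libyuv/rotate.h"

int RotateI420By90(const uint8* src_y, const uint8* src_u, const uint8* src_v,
                   uint8* dst_y, uint8* dst_u, uint8* dst_v,
                   int src_width, int src_height) {
  return libyuv::I420Rotate(src_y, src_width,
                            src_u, src_width / 2,
                            src_v, src_width / 2,
                            dst_y, src_height,      // dst is transposed:
                            dst_u, src_height / 2,  // src_height x src_width
                            dst_v, src_height / 2,
                            src_width, src_height, libyuv::kRotate90);
}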
+
+// Rotate NV12 input and store in I420.
+LIBYUV_API
+int NV12ToI420Rotate(const uint8* src_y, int src_stride_y,
+ const uint8* src_uv, int src_stride_uv,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int src_width, int src_height, enum RotationMode mode);
+
+// Rotate a plane by 0, 90, 180, or 270.
+LIBYUV_API
+int RotatePlane(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride,
+ int src_width, int src_height, enum RotationMode mode);
+
+// Rotate planes by 90, 180, 270. Deprecated.
+LIBYUV_API
+void RotatePlane90(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride,
+ int width, int height);
+
+LIBYUV_API
+void RotatePlane180(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride,
+ int width, int height);
+
+LIBYUV_API
+void RotatePlane270(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride,
+ int width, int height);
+
+// Rotations for when U and V are interleaved.
+// These functions take one input pointer and
+// split the data into two buffers while
+// rotating them. Deprecated.
+LIBYUV_API
+void RotateUV90(const uint8* src, int src_stride,
+                uint8* dst_a, int dst_stride_a,
+                uint8* dst_b, int dst_stride_b,
+                int width, int height);
+
+LIBYUV_API
+void RotateUV180(const uint8* src, int src_stride,
+                 uint8* dst_a, int dst_stride_a,
+                 uint8* dst_b, int dst_stride_b,
+                 int width, int height);
+
+LIBYUV_API
+void RotateUV270(const uint8* src, int src_stride,
+ uint8* dst_a, int dst_stride_a,
+ uint8* dst_b, int dst_stride_b,
+ int width, int height);
+
+// The 90 and 270 functions are based on transposes.
+// Doing a transpose while reversing the read/write
+// order results in a rotation by +/- 90 degrees.
+// Deprecated.
+LIBYUV_API
+void TransposePlane(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride,
+ int width, int height);
+
+LIBYUV_API
+void TransposeUV(const uint8* src, int src_stride,
+ uint8* dst_a, int dst_stride_a,
+ uint8* dst_b, int dst_stride_b,
+ int width, int height);
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_ROTATE_H_ NOLINT
diff --git a/chromium/third_party/libyuv/include/libyuv/rotate_argb.h b/chromium/third_party/libyuv/include/libyuv/rotate_argb.h
new file mode 100644
index 00000000000..660ff5573ec
--- /dev/null
+++ b/chromium/third_party/libyuv/include/libyuv/rotate_argb.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_ROTATE_ARGB_H_ // NOLINT
+#define INCLUDE_LIBYUV_ROTATE_ARGB_H_
+
+#include "libyuv/basic_types.h"
+#include "libyuv/rotate.h" // For RotationMode.
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Rotate ARGB frame
+LIBYUV_API
+int ARGBRotate(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ int src_width, int src_height, enum RotationMode mode);
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_ROTATE_ARGB_H_ NOLINT
diff --git a/chromium/third_party/libyuv/include/libyuv/row.h b/chromium/third_party/libyuv/include/libyuv/row.h
new file mode 100644
index 00000000000..3416661742f
--- /dev/null
+++ b/chromium/third_party/libyuv/include/libyuv/row.h
@@ -0,0 +1,1525 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_ROW_H_ // NOLINT
+#define INCLUDE_LIBYUV_ROW_H_
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// TODO(fbarchard): Remove kMaxStride.
+#ifdef __arm__
+#define kMaxStride (1920 * 4)
+#else
+#define kMaxStride (4096 * 4)
+#endif
+#define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a) - 1)))
+
+#if defined(__CLR_VER) || defined(COVERAGE_ENABLED) || \
+ defined(TARGET_IPHONE_SIMULATOR)
+#define LIBYUV_DISABLE_X86
+#endif
+// True if compiling for SSSE3 as a requirement.
+#if defined(__SSSE3__) || (defined(_M_IX86_FP) && (_M_IX86_FP >= 3))
+#define LIBYUV_SSSE3_ONLY
+#endif
+
+// The following are available on all x86 platforms:
+#if !defined(LIBYUV_DISABLE_X86) && \
+ (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
+// Conversions.
+#define HAS_ABGRTOUVROW_SSSE3
+#define HAS_ABGRTOYROW_SSSE3
+#define HAS_ARGB1555TOARGBROW_SSE2
+#define HAS_ARGB4444TOARGBROW_SSE2
+#define HAS_ARGBSHUFFLEROW_SSSE3
+#define HAS_ARGBTOARGB1555ROW_SSE2
+#define HAS_ARGBTOARGB4444ROW_SSE2
+#define HAS_ARGBTOBAYERROW_SSSE3
+#define HAS_ARGBTORAWROW_SSSE3
+#define HAS_ARGBTORGB24ROW_SSSE3
+#define HAS_ARGBTORGB565ROW_SSE2
+#define HAS_ARGBTOUV422ROW_SSSE3
+#define HAS_ARGBTOUV444ROW_SSSE3
+#define HAS_ARGBTOUVROW_SSSE3
+#define HAS_ARGBTOUVJROW_SSSE3
+#define HAS_ARGBTOYROW_SSSE3
+#define HAS_ARGBTOYJROW_SSSE3
+#define HAS_BGRATOUVROW_SSSE3
+#define HAS_BGRATOYROW_SSSE3
+#define HAS_COPYROW_SSE2
+#define HAS_COPYROW_X86
+#define HAS_COPYROW_ERMS
+#define HAS_HALFROW_SSE2
+#define HAS_I400TOARGBROW_SSE2
+#define HAS_I411TOARGBROW_SSSE3
+#define HAS_I422TOABGRROW_SSSE3
+#define HAS_I422TOARGB1555ROW_SSSE3
+#define HAS_I422TOARGB4444ROW_SSSE3
+#define HAS_I422TOARGBROW_SSSE3
+#define HAS_I422TOBGRAROW_SSSE3
+#define HAS_I422TORAWROW_SSSE3
+#define HAS_I422TORGB24ROW_SSSE3
+#define HAS_I422TORGB565ROW_SSSE3
+#define HAS_I422TORGBAROW_SSSE3
+#define HAS_I422TOUYVYROW_SSE2
+#define HAS_I422TOYUY2ROW_SSE2
+#define HAS_I444TOARGBROW_SSSE3
+#define HAS_MERGEUVROW_SSE2
+#define HAS_MIRRORROW_SSSE3
+#define HAS_MIRRORUVROW_SSSE3
+#define HAS_NV12TOARGBROW_SSSE3
+#define HAS_NV12TORGB565ROW_SSSE3
+#define HAS_NV21TOARGBROW_SSSE3
+#define HAS_NV21TORGB565ROW_SSSE3
+#define HAS_RAWTOARGBROW_SSSE3
+#define HAS_RAWTOYROW_SSSE3
+#define HAS_RGB24TOARGBROW_SSSE3
+#define HAS_RGB24TOYROW_SSSE3
+#define HAS_RGB565TOARGBROW_SSE2
+#define HAS_RGBATOUVROW_SSSE3
+#define HAS_RGBATOYROW_SSSE3
+#define HAS_SETROW_X86
+#define HAS_SPLITUVROW_SSE2
+#define HAS_UYVYTOARGBROW_SSSE3
+#define HAS_UYVYTOUV422ROW_SSE2
+#define HAS_UYVYTOUVROW_SSE2
+#define HAS_UYVYTOYROW_SSE2
+#define HAS_YTOARGBROW_SSE2
+#define HAS_YUY2TOARGBROW_SSSE3
+#define HAS_YUY2TOUV422ROW_SSE2
+#define HAS_YUY2TOUVROW_SSE2
+#define HAS_YUY2TOYROW_SSE2
+
+// Effects
+#define HAS_ARGBADDROW_SSE2
+#define HAS_ARGBAFFINEROW_SSE2
+#define HAS_ARGBATTENUATEROW_SSSE3
+#define HAS_ARGBBLENDROW_SSSE3
+#define HAS_ARGBCOLORMATRIXROW_SSSE3
+#define HAS_ARGBGRAYROW_SSSE3
+#define HAS_ARGBMIRRORROW_SSSE3
+#define HAS_ARGBMULTIPLYROW_SSE2
+#define HAS_ARGBQUANTIZEROW_SSE2
+#define HAS_ARGBSEPIAROW_SSSE3
+#define HAS_ARGBSHADEROW_SSE2
+#define HAS_ARGBSUBTRACTROW_SSE2
+#define HAS_ARGBUNATTENUATEROW_SSE2
+#define HAS_COMPUTECUMULATIVESUMROW_SSE2
+#define HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
+#define HAS_INTERPOLATEROW_SSE2
+#define HAS_INTERPOLATEROW_SSSE3
+#define HAS_SOBELROW_SSE2
+#define HAS_SOBELXROW_SSSE3
+#define HAS_SOBELXYROW_SSE2
+#define HAS_SOBELYROW_SSSE3
+#endif
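These HAS_* defines are compile-time capability switches; the row functions they guard are then selected at runtime. An editorial sketch of the usual dispatch pattern follows, using the C and _Any_SSSE3 variants declared later in this header together with cpu_id.h. The wrapper itself is hypothetical, and the _Any_ variant is used because the plain SSSE3 variant typically has width/alignment requirements.

#include "libyuv/cpu_id.h"
#include "libyuv/row.h"

void ArgbPlaneToYPlane(const uint8* src_argb, int src_stride,
                       uint8* dst_y, int dst_stride, int width, int height) {
  void (*ARGBToYRow)(const uint8*, uint8*, int) = libyuv::ARGBToYRow_C;
#if defined(HAS_ARGBTOYROW_SSSE3)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && width >= 16) {
    ARGBToYRow = libyuv::ARGBToYRow_Any_SSSE3;
  }
#endif
  for (int y = 0; y < height; ++y) {
    ARGBToYRow(src_argb, dst_y, width);
    src_argb += src_stride;
    dst_y += dst_stride;
  }
}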
+
+// The following are Windows only.
+// TODO(fbarchard): Port to gcc.
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
+#define HAS_ARGBCOLORTABLEROW_X86
+// Visual C 2012 required for AVX2.
+#if _MSC_VER >= 1700
+#define HAS_ARGBSHUFFLEROW_AVX2
+#define HAS_ARGBTOUVROW_AVX2
+#define HAS_ARGBTOYJROW_AVX2
+#define HAS_ARGBTOYROW_AVX2
+#define HAS_HALFROW_AVX2
+#define HAS_I422TOARGBROW_AVX2
+#define HAS_MERGEUVROW_AVX2
+#define HAS_MIRRORROW_AVX2
+#define HAS_SPLITUVROW_AVX2
+#define HAS_UYVYTOUV422ROW_AVX2
+#define HAS_UYVYTOUVROW_AVX2
+#define HAS_UYVYTOYROW_AVX2
+#define HAS_YUY2TOUV422ROW_AVX2
+#define HAS_YUY2TOUVROW_AVX2
+#define HAS_YUY2TOYROW_AVX2
+
+// Effects
+#define HAS_ARGBADDROW_AVX2
+#define HAS_ARGBATTENUATEROW_AVX2
+#define HAS_ARGBMIRRORROW_AVX2
+#define HAS_ARGBMULTIPLYROW_AVX2
+#define HAS_ARGBSUBTRACTROW_AVX2
+#define HAS_ARGBUNATTENUATEROW_AVX2
+#endif
+#endif
+
+// The following are Yasm x86 only.
+// TODO(fbarchard): Port AVX2 to inline.
+#if !defined(LIBYUV_DISABLE_X86) && defined(HAVE_YASM) && \
+    (defined(_M_IX86) || defined(_M_X64) || \
+    defined(__x86_64__) || defined(__i386__))
+#define HAS_MERGEUVROW_AVX2
+#define HAS_MERGEUVROW_MMX
+#define HAS_SPLITUVROW_AVX2
+#define HAS_SPLITUVROW_MMX
+#define HAS_UYVYTOYROW_AVX2
+#define HAS_UYVYTOYROW_MMX
+#define HAS_YUY2TOYROW_AVX2
+#define HAS_YUY2TOYROW_MMX
+#endif
+
+// The following are disabled when SSSE3 is available:
+#if !defined(LIBYUV_DISABLE_X86) && \
+ (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) && \
+ !defined(LIBYUV_SSSE3_ONLY)
+#define HAS_ARGBATTENUATEROW_SSE2
+#define HAS_ARGBBLENDROW_SSE2
+#define HAS_MIRRORROW_SSE2
+#endif
+
+// The following are available on Neon platforms
+#if !defined(LIBYUV_DISABLE_NEON) && \
+ (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
+#define HAS_ABGRTOUVROW_NEON
+#define HAS_ABGRTOYROW_NEON
+#define HAS_ARGB1555TOARGBROW_NEON
+#define HAS_ARGB1555TOUVROW_NEON
+#define HAS_ARGB1555TOYROW_NEON
+#define HAS_ARGB4444TOARGBROW_NEON
+#define HAS_ARGB4444TOUVROW_NEON
+#define HAS_ARGB4444TOYROW_NEON
+#define HAS_ARGBTOARGB1555ROW_NEON
+#define HAS_ARGBTOARGB4444ROW_NEON
+#define HAS_ARGBTOBAYERROW_NEON
+#define HAS_ARGBTOBAYERGGROW_NEON
+#define HAS_ARGBTORAWROW_NEON
+#define HAS_ARGBTORGB24ROW_NEON
+#define HAS_ARGBTORGB565ROW_NEON
+#define HAS_ARGBTOUV411ROW_NEON
+#define HAS_ARGBTOUV422ROW_NEON
+#define HAS_ARGBTOUV444ROW_NEON
+#define HAS_ARGBTOUVROW_NEON
+#define HAS_ARGBTOUVJROW_NEON
+#define HAS_ARGBTOYROW_NEON
+#define HAS_ARGBTOYJROW_NEON
+#define HAS_BGRATOUVROW_NEON
+#define HAS_BGRATOYROW_NEON
+#define HAS_COPYROW_NEON
+#define HAS_HALFROW_NEON
+#define HAS_I400TOARGBROW_NEON
+#define HAS_I411TOARGBROW_NEON
+#define HAS_I422TOABGRROW_NEON
+#define HAS_I422TOARGB1555ROW_NEON
+#define HAS_I422TOARGB4444ROW_NEON
+#define HAS_I422TOARGBROW_NEON
+#define HAS_I422TOBGRAROW_NEON
+#define HAS_I422TORAWROW_NEON
+#define HAS_I422TORGB24ROW_NEON
+#define HAS_I422TORGB565ROW_NEON
+#define HAS_I422TORGBAROW_NEON
+#define HAS_I422TOUYVYROW_NEON
+#define HAS_I422TOYUY2ROW_NEON
+#define HAS_I444TOARGBROW_NEON
+#define HAS_MERGEUVROW_NEON
+#define HAS_MIRRORROW_NEON
+#define HAS_MIRRORUVROW_NEON
+#define HAS_NV12TOARGBROW_NEON
+#define HAS_NV12TORGB565ROW_NEON
+#define HAS_NV21TOARGBROW_NEON
+#define HAS_NV21TORGB565ROW_NEON
+#define HAS_RAWTOARGBROW_NEON
+#define HAS_RAWTOUVROW_NEON
+#define HAS_RAWTOYROW_NEON
+#define HAS_RGB24TOARGBROW_NEON
+#define HAS_RGB24TOUVROW_NEON
+#define HAS_RGB24TOYROW_NEON
+#define HAS_RGB565TOARGBROW_NEON
+#define HAS_RGB565TOUVROW_NEON
+#define HAS_RGB565TOYROW_NEON
+#define HAS_RGBATOUVROW_NEON
+#define HAS_RGBATOYROW_NEON
+#define HAS_SETROW_NEON
+#define HAS_SPLITUVROW_NEON
+#define HAS_UYVYTOARGBROW_NEON
+#define HAS_UYVYTOUV422ROW_NEON
+#define HAS_UYVYTOUVROW_NEON
+#define HAS_UYVYTOYROW_NEON
+#define HAS_YTOARGBROW_NEON
+#define HAS_YUY2TOARGBROW_NEON
+#define HAS_YUY2TOUV422ROW_NEON
+#define HAS_YUY2TOUVROW_NEON
+#define HAS_YUY2TOYROW_NEON
+
+// Effects
+#define HAS_ARGBADDROW_NEON
+#define HAS_ARGBATTENUATEROW_NEON
+#define HAS_ARGBBLENDROW_NEON
+#define HAS_ARGBCOLORMATRIXROW_NEON
+#define HAS_ARGBGRAYROW_NEON
+#define HAS_ARGBMIRRORROW_NEON
+#define HAS_ARGBMULTIPLYROW_NEON
+#define HAS_ARGBQUANTIZEROW_NEON
+#define HAS_ARGBSEPIAROW_NEON
+#define HAS_ARGBSHADEROW_NEON
+#define HAS_ARGBSUBTRACTROW_NEON
+#define HAS_SOBELROW_NEON
+#define HAS_SOBELXYROW_NEON
+#define HAS_SOBELXROW_NEON
+#define HAS_SOBELYROW_NEON
+#define HAS_INTERPOLATEROW_NEON
+#endif
+
+// The following are available on Mips platforms
+#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips__)
+#define HAS_COPYROW_MIPS
+#if defined(__mips_dsp) && (__mips_dsp_rev >= 2)
+#define HAS_I422TOABGRROW_MIPS_DSPR2
+#define HAS_I422TOARGBROW_MIPS_DSPR2
+#define HAS_I422TOBGRAROW_MIPS_DSPR2
+#define HAS_INTERPOLATEROWS_MIPS_DSPR2
+#define HAS_MIRRORROW_MIPS_DSPR2
+#define HAS_MIRRORUVROW_MIPS_DSPR2
+#define HAS_SPLITUVROW_MIPS_DSPR2
+#endif
+#endif
+
+#if defined(_MSC_VER) && !defined(__CLR_VER)
+#define SIMD_ALIGNED(var) __declspec(align(16)) var
+typedef __declspec(align(16)) int16 vec16[8];
+typedef __declspec(align(16)) int32 vec32[4];
+typedef __declspec(align(16)) int8 vec8[16];
+typedef __declspec(align(16)) uint16 uvec16[8];
+typedef __declspec(align(16)) uint32 uvec32[4];
+typedef __declspec(align(16)) uint8 uvec8[16];
+typedef __declspec(align(32)) int16 lvec16[16];
+typedef __declspec(align(32)) int32 lvec32[8];
+typedef __declspec(align(32)) int8 lvec8[32];
+typedef __declspec(align(32)) uint16 ulvec16[16];
+typedef __declspec(align(32)) uint32 ulvec32[8];
+typedef __declspec(align(32)) uint8 ulvec8[32];
+
+#elif defined(__GNUC__)
+#define SIMD_ALIGNED(var) var __attribute__((aligned(16)))
+typedef int16 __attribute__((vector_size(16))) vec16;
+typedef int32 __attribute__((vector_size(16))) vec32;
+typedef int8 __attribute__((vector_size(16))) vec8;
+typedef uint16 __attribute__((vector_size(16))) uvec16;
+typedef uint32 __attribute__((vector_size(16))) uvec32;
+typedef uint8 __attribute__((vector_size(16))) uvec8;
+#else
+#define SIMD_ALIGNED(var) var
+typedef int16 vec16[8];
+typedef int32 vec32[4];
+typedef int8 vec8[16];
+typedef uint16 uvec16[8];
+typedef uint32 uvec32[4];
+typedef uint8 uvec8[16];
+#endif
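A one-line illustration of the alignment helper above (editorial): declaring a 16-byte aligned scratch row sized by the kMaxStride constant defined earlier in this header.

#include "libyuv/row.h"

// 16-byte aligned scratch row for use with the aligned row variants.
SIMD_ALIGNED(uint8 scratch_row[kMaxStride]);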
+
+#if defined(__APPLE__) || defined(__x86_64__) || defined(__llvm__)
+#define OMITFP
+#else
+#define OMITFP __attribute__((optimize("omit-frame-pointer")))
+#endif
+
+void I444ToARGBRow_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I422ToARGBRow_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I411ToARGBRow_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I422ToBGRARow_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_bgra,
+ int width);
+void I422ToABGRRow_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_abgr,
+ int width);
+void I422ToRGBARow_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgba,
+ int width);
+void I422ToRGB24Row_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgb24,
+ int width);
+void I422ToRAWRow_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_raw,
+ int width);
+void I422ToRGB565Row_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgb565,
+ int width);
+void I422ToARGB1555Row_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb1555,
+ int width);
+void I422ToARGB4444Row_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb4444,
+ int width);
+void NV12ToARGBRow_NEON(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_argb,
+ int width);
+void NV21ToARGBRow_NEON(const uint8* src_y,
+ const uint8* src_vu,
+ uint8* dst_argb,
+ int width);
+void NV12ToRGB565Row_NEON(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_rgb565,
+ int width);
+void NV21ToRGB565Row_NEON(const uint8* src_y,
+ const uint8* src_vu,
+ uint8* dst_rgb565,
+ int width);
+void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
+ uint8* dst_argb,
+ int width);
+void UYVYToARGBRow_NEON(const uint8* src_uyvy,
+ uint8* dst_argb,
+ int width);
+
+void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix);
+void ARGBToYRow_Any_AVX2(const uint8* src_argb, uint8* dst_y, int pix);
+void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
+void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix);
+void ARGBToYJRow_Any_AVX2(const uint8* src_argb, uint8* dst_y, int pix);
+void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
+void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix);
+void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix);
+void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix);
+void RGB24ToYRow_SSSE3(const uint8* src_rgb24, uint8* dst_y, int pix);
+void RAWToYRow_SSSE3(const uint8* src_raw, uint8* dst_y, int pix);
+void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
+void ARGBToYJRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
+void BGRAToYRow_Unaligned_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix);
+void ABGRToYRow_Unaligned_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix);
+void RGBAToYRow_Unaligned_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix);
+void RGB24ToYRow_Unaligned_SSSE3(const uint8* src_rgb24, uint8* dst_y, int pix);
+void RAWToYRow_Unaligned_SSSE3(const uint8* src_raw, uint8* dst_y, int pix);
+void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix);
+void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix);
+void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+ int pix);
+void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+ int pix);
+void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+ int pix);
+void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int pix);
+void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int pix);
+void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra,
+ uint8* dst_u, uint8* dst_v, int pix);
+void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr,
+ uint8* dst_u, uint8* dst_v, int pix);
+void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba,
+ uint8* dst_u, uint8* dst_v, int pix);
+void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24,
+ uint8* dst_u, uint8* dst_v, int pix);
+void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw,
+ uint8* dst_u, uint8* dst_v, int pix);
+void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565,
+ uint8* dst_u, uint8* dst_v, int pix);
+void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555,
+ uint8* dst_u, uint8* dst_v, int pix);
+void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444,
+ uint8* dst_u, uint8* dst_v, int pix);
+void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix);
+void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix);
+void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix);
+void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix);
+void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix);
+void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix);
+void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix);
+void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix);
+void ARGBToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
+void ARGBToYJRow_C(const uint8* src_argb, uint8* dst_y, int pix);
+void BGRAToYRow_C(const uint8* src_bgra, uint8* dst_y, int pix);
+void ABGRToYRow_C(const uint8* src_abgr, uint8* dst_y, int pix);
+void RGBAToYRow_C(const uint8* src_rgba, uint8* dst_y, int pix);
+void RGB24ToYRow_C(const uint8* src_rgb24, uint8* dst_y, int pix);
+void RAWToYRow_C(const uint8* src_raw, uint8* dst_y, int pix);
+void RGB565ToYRow_C(const uint8* src_rgb565, uint8* dst_y, int pix);
+void ARGB1555ToYRow_C(const uint8* src_argb1555, uint8* dst_y, int pix);
+void ARGB4444ToYRow_C(const uint8* src_argb4444, uint8* dst_y, int pix);
+void ARGBToYRow_Any_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
+void ARGBToYJRow_Any_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
+void BGRAToYRow_Any_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix);
+void ABGRToYRow_Any_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix);
+void RGBAToYRow_Any_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix);
+void RGB24ToYRow_Any_SSSE3(const uint8* src_rgb24, uint8* dst_y, int pix);
+void RAWToYRow_Any_SSSE3(const uint8* src_raw, uint8* dst_y, int pix);
+void ARGBToYRow_Any_NEON(const uint8* src_argb, uint8* dst_y, int pix);
+void ARGBToYJRow_Any_NEON(const uint8* src_argb, uint8* dst_y, int pix);
+void BGRAToYRow_Any_NEON(const uint8* src_bgra, uint8* dst_y, int pix);
+void ABGRToYRow_Any_NEON(const uint8* src_abgr, uint8* dst_y, int pix);
+void RGBAToYRow_Any_NEON(const uint8* src_rgba, uint8* dst_y, int pix);
+void RGB24ToYRow_Any_NEON(const uint8* src_rgb24, uint8* dst_y, int pix);
+void RAWToYRow_Any_NEON(const uint8* src_raw, uint8* dst_y, int pix);
+void RGB565ToYRow_Any_NEON(const uint8* src_rgb565, uint8* dst_y, int pix);
+void ARGB1555ToYRow_Any_NEON(const uint8* src_argb1555, uint8* dst_y, int pix);
+void ARGB4444ToYRow_Any_NEON(const uint8* src_argb4444, uint8* dst_y, int pix);
+
+void ARGBToUVRow_AVX2(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUVRow_Any_AVX2(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUVJRow_SSSE3(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+void BGRAToUVRow_SSSE3(const uint8* src_bgra, int src_stride_bgra,
+ uint8* dst_u, uint8* dst_v, int width);
+void ABGRToUVRow_SSSE3(const uint8* src_abgr, int src_stride_abgr,
+ uint8* dst_u, uint8* dst_v, int width);
+void RGBAToUVRow_SSSE3(const uint8* src_rgba, int src_stride_rgba,
+ uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUVJRow_Unaligned_SSSE3(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_bgra, int src_stride_bgra,
+ uint8* dst_u, uint8* dst_v, int width);
+void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_abgr, int src_stride_abgr,
+ uint8* dst_u, uint8* dst_v, int width);
+void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_rgba, int src_stride_rgba,
+ uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUVRow_Any_SSSE3(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUVJRow_Any_SSSE3(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+void BGRAToUVRow_Any_SSSE3(const uint8* src_bgra, int src_stride_bgra,
+ uint8* dst_u, uint8* dst_v, int width);
+void ABGRToUVRow_Any_SSSE3(const uint8* src_abgr, int src_stride_abgr,
+ uint8* dst_u, uint8* dst_v, int width);
+void RGBAToUVRow_Any_SSSE3(const uint8* src_rgba, int src_stride_rgba,
+ uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUV444Row_Any_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+ int pix);
+void ARGBToUV422Row_Any_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+ int pix);
+void ARGBToUV411Row_Any_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+ int pix);
+void ARGBToUVRow_Any_NEON(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int pix);
+void ARGBToUVJRow_Any_NEON(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int pix);
+void BGRAToUVRow_Any_NEON(const uint8* src_bgra, int src_stride_bgra,
+ uint8* dst_u, uint8* dst_v, int pix);
+void ABGRToUVRow_Any_NEON(const uint8* src_abgr, int src_stride_abgr,
+ uint8* dst_u, uint8* dst_v, int pix);
+void RGBAToUVRow_Any_NEON(const uint8* src_rgba, int src_stride_rgba,
+ uint8* dst_u, uint8* dst_v, int pix);
+void RGB24ToUVRow_Any_NEON(const uint8* src_rgb24, int src_stride_rgb24,
+ uint8* dst_u, uint8* dst_v, int pix);
+void RAWToUVRow_Any_NEON(const uint8* src_raw, int src_stride_raw,
+ uint8* dst_u, uint8* dst_v, int pix);
+void RGB565ToUVRow_Any_NEON(const uint8* src_rgb565, int src_stride_rgb565,
+ uint8* dst_u, uint8* dst_v, int pix);
+void ARGB1555ToUVRow_Any_NEON(const uint8* src_argb1555,
+ int src_stride_argb1555,
+ uint8* dst_u, uint8* dst_v, int pix);
+void ARGB4444ToUVRow_Any_NEON(const uint8* src_argb4444,
+ int src_stride_argb4444,
+ uint8* dst_u, uint8* dst_v, int pix);
+void ARGBToUVRow_C(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUVJRow_C(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+void BGRAToUVRow_C(const uint8* src_bgra, int src_stride_bgra,
+ uint8* dst_u, uint8* dst_v, int width);
+void ABGRToUVRow_C(const uint8* src_abgr, int src_stride_abgr,
+ uint8* dst_u, uint8* dst_v, int width);
+void RGBAToUVRow_C(const uint8* src_rgba, int src_stride_rgba,
+ uint8* dst_u, uint8* dst_v, int width);
+void RGB24ToUVRow_C(const uint8* src_rgb24, int src_stride_rgb24,
+ uint8* dst_u, uint8* dst_v, int width);
+void RAWToUVRow_C(const uint8* src_raw, int src_stride_raw,
+ uint8* dst_u, uint8* dst_v, int width);
+void RGB565ToUVRow_C(const uint8* src_rgb565, int src_stride_rgb565,
+ uint8* dst_u, uint8* dst_v, int width);
+void ARGB1555ToUVRow_C(const uint8* src_argb1555, int src_stride_argb1555,
+ uint8* dst_u, uint8* dst_v, int width);
+void ARGB4444ToUVRow_C(const uint8* src_argb4444, int src_stride_argb4444,
+ uint8* dst_u, uint8* dst_v, int width);
+
+void ARGBToUV444Row_SSSE3(const uint8* src_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUV444Row_Unaligned_SSSE3(const uint8* src_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUV444Row_Any_SSSE3(const uint8* src_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+
+void ARGBToUV422Row_SSSE3(const uint8* src_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUV422Row_Unaligned_SSSE3(const uint8* src_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUV422Row_Any_SSSE3(const uint8* src_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+
+void ARGBToUV444Row_C(const uint8* src_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUV422Row_C(const uint8* src_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUV411Row_C(const uint8* src_argb,
+ uint8* dst_u, uint8* dst_v, int width);
+
+void MirrorRow_AVX2(const uint8* src, uint8* dst, int width);
+void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width);
+void MirrorRow_SSE2(const uint8* src, uint8* dst, int width);
+void MirrorRow_NEON(const uint8* src, uint8* dst, int width);
+void MirrorRow_MIPS_DSPR2(const uint8* src, uint8* dst, int width);
+void MirrorRow_C(const uint8* src, uint8* dst, int width);
+
+void MirrorUVRow_SSSE3(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+ int width);
+void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+ int width);
+void MirrorUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+ int width);
+void MirrorUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+ int width);
+
+void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width);
+void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width);
+void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width);
+void ARGBMirrorRow_C(const uint8* src, uint8* dst, int width);
+
+void SplitUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix);
+void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix);
+void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix);
+void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix);
+void SplitUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+ int pix);
+void SplitUVRow_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+ int pix);
+void SplitUVRow_Unaligned_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u,
+ uint8* dst_v, int pix);
+void SplitUVRow_Any_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+ int pix);
+void SplitUVRow_Any_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+ int pix);
+void SplitUVRow_Any_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+ int pix);
+void SplitUVRow_Any_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+ int pix);
+
+void MergeUVRow_C(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+ int width);
+void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+ int width);
+void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+ int width);
+void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+ int width);
+void MergeUVRow_Unaligned_SSE2(const uint8* src_u, const uint8* src_v,
+ uint8* dst_uv, int width);
+void MergeUVRow_Any_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+ int width);
+void MergeUVRow_Any_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+ int width);
+void MergeUVRow_Any_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+ int width);
+
+void CopyRow_SSE2(const uint8* src, uint8* dst, int count);
+void CopyRow_ERMS(const uint8* src, uint8* dst, int count);
+void CopyRow_X86(const uint8* src, uint8* dst, int count);
+void CopyRow_NEON(const uint8* src, uint8* dst, int count);
+void CopyRow_MIPS(const uint8* src, uint8* dst, int count);
+void CopyRow_C(const uint8* src, uint8* dst, int count);
+
+void SetRow_X86(uint8* dst, uint32 v32, int count);
+void ARGBSetRows_X86(uint8* dst, uint32 v32, int width,
+ int dst_stride, int height);
+void SetRow_NEON(uint8* dst, uint32 v32, int count);
+void ARGBSetRows_NEON(uint8* dst, uint32 v32, int width,
+ int dst_stride, int height);
+void SetRow_C(uint8* dst, uint32 v32, int count);
+void ARGBSetRows_C(uint8* dst, uint32 v32, int width, int dst_stride,
+ int height);
+
+// ARGBShufflers for BGRAToARGB etc.
+void ARGBShuffleRow_C(const uint8* src_argb, uint8* dst_argb,
+ const uint8* shuffler, int pix);
+void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
+ const uint8* shuffler, int pix);
+void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
+ const uint8* shuffler, int pix);
+void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
+ const uint8* shuffler, int pix);
+void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb,
+ const uint8* shuffler, int pix);
+void ARGBShuffleRow_Any_SSSE3(const uint8* src_argb, uint8* dst_argb,
+ const uint8* shuffler, int pix);
+void ARGBShuffleRow_Any_AVX2(const uint8* src_argb, uint8* dst_argb,
+ const uint8* shuffler, int pix);
+void ARGBShuffleRow_Any_NEON(const uint8* src_argb, uint8* dst_argb,
+ const uint8* shuffler, int pix);
+
+void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix);
+void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix);
+void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb, int pix);
+void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
+ int pix);
+void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb,
+ int pix);
+
+void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix);
+void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix);
+void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int pix);
+void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb,
+ int pix);
+void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb,
+ int pix);
+void RGB24ToARGBRow_C(const uint8* src_rgb24, uint8* dst_argb, int pix);
+void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int pix);
+void RGB565ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int pix);
+void ARGB1555ToARGBRow_C(const uint8* src_argb, uint8* dst_argb, int pix);
+void ARGB4444ToARGBRow_C(const uint8* src_argb, uint8* dst_argb, int pix);
+void RGB24ToARGBRow_Any_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix);
+void RAWToARGBRow_Any_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix);
+void RGB565ToARGBRow_Any_SSE2(const uint8* src_rgb565, uint8* dst_argb,
+ int pix);
+void ARGB1555ToARGBRow_Any_SSE2(const uint8* src_argb1555, uint8* dst_argb,
+ int pix);
+void ARGB4444ToARGBRow_Any_SSE2(const uint8* src_argb4444, uint8* dst_argb,
+ int pix);
+void RGB24ToARGBRow_Any_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix);
+void RAWToARGBRow_Any_NEON(const uint8* src_raw, uint8* dst_argb, int pix);
+void RGB565ToARGBRow_Any_NEON(const uint8* src_rgb565, uint8* dst_argb,
+ int pix);
+void ARGB1555ToARGBRow_Any_NEON(const uint8* src_argb1555, uint8* dst_argb,
+ int pix);
+void ARGB4444ToARGBRow_Any_NEON(const uint8* src_argb4444, uint8* dst_argb,
+ int pix);
+
+void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
+
+void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
+
+void ARGBToRGBARow_C(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRGB24Row_C(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRAWRow_C(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int pix);
+
+void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix);
+void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb, int pix);
+void I400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int pix);
+void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix);
+void I400ToARGBRow_Any_SSE2(const uint8* src_y, uint8* dst_argb, int pix);
+void I400ToARGBRow_Any_NEON(const uint8* src_y, uint8* dst_argb, int pix);
+
+void I444ToARGBRow_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I422ToARGBRow_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I411ToARGBRow_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void NV12ToARGBRow_C(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_argb,
+ int width);
+void NV21ToRGB565Row_C(const uint8* src_y,
+ const uint8* src_vu,
+ uint8* dst_argb,
+ int width);
+void NV12ToRGB565Row_C(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_argb,
+ int width);
+void NV21ToARGBRow_C(const uint8* src_y,
+ const uint8* src_vu,
+ uint8* dst_argb,
+ int width);
+void YUY2ToARGBRow_C(const uint8* src_yuy2,
+ uint8* dst_argb,
+ int width);
+void UYVYToARGBRow_C(const uint8* src_uyvy,
+ uint8* dst_argb,
+ int width);
+void I422ToBGRARow_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_bgra,
+ int width);
+void I422ToABGRRow_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_abgr,
+ int width);
+void I422ToRGBARow_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgba,
+ int width);
+void I422ToRGB24Row_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgb24,
+ int width);
+void I422ToRAWRow_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_raw,
+ int width);
+void I422ToARGB4444Row_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb4444,
+ int width);
+void I422ToARGB1555Row_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb1555,
+ int width);
+void I422ToRGB565Row_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgb565,
+ int width);
+void YToARGBRow_C(const uint8* src_y,
+ uint8* dst_argb,
+ int width);
+void I422ToARGBRow_AVX2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I444ToARGBRow_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I422ToARGBRow_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I411ToARGBRow_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void NV12ToARGBRow_SSSE3(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_argb,
+ int width);
+void NV21ToARGBRow_SSSE3(const uint8* src_y,
+ const uint8* src_vu,
+ uint8* dst_argb,
+ int width);
+void NV12ToRGB565Row_SSSE3(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_argb,
+ int width);
+void NV21ToRGB565Row_SSSE3(const uint8* src_y,
+ const uint8* src_vu,
+ uint8* dst_argb,
+ int width);
+void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2,
+ uint8* dst_argb,
+ int width);
+void UYVYToARGBRow_SSSE3(const uint8* src_uyvy,
+ uint8* dst_argb,
+ int width);
+void I422ToBGRARow_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_bgra,
+ int width);
+void I422ToABGRRow_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_abgr,
+ int width);
+void I422ToRGBARow_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgba,
+ int width);
+void I422ToARGB4444Row_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I422ToARGB1555Row_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I422ToRGB565Row_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+// RGB24/RAW are unaligned.
+void I422ToRGB24Row_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgb24,
+ int width);
+void I422ToRAWRow_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_raw,
+ int width);
+
+void I444ToARGBRow_Unaligned_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I422ToARGBRow_Unaligned_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I411ToARGBRow_Unaligned_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void NV12ToARGBRow_Unaligned_SSSE3(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_argb,
+ int width);
+void NV21ToARGBRow_Unaligned_SSSE3(const uint8* src_y,
+ const uint8* src_vu,
+ uint8* dst_argb,
+ int width);
+void YUY2ToARGBRow_Unaligned_SSSE3(const uint8* src_yuy2,
+ uint8* dst_argb,
+ int width);
+void UYVYToARGBRow_Unaligned_SSSE3(const uint8* src_uyvy,
+ uint8* dst_argb,
+ int width);
+void I422ToBGRARow_Unaligned_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_bgra,
+ int width);
+void I422ToABGRRow_Unaligned_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_abgr,
+ int width);
+void I422ToRGBARow_Unaligned_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgba,
+ int width);
+void I422ToARGBRow_Any_AVX2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I444ToARGBRow_Any_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I422ToARGBRow_Any_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I411ToARGBRow_Any_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void NV12ToARGBRow_Any_SSSE3(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_argb,
+ int width);
+void NV21ToARGBRow_Any_SSSE3(const uint8* src_y,
+ const uint8* src_vu,
+ uint8* dst_argb,
+ int width);
+void NV12ToRGB565Row_Any_SSSE3(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_argb,
+ int width);
+void NV21ToRGB565Row_Any_SSSE3(const uint8* src_y,
+ const uint8* src_vu,
+ uint8* dst_argb,
+ int width);
+void YUY2ToARGBRow_Any_SSSE3(const uint8* src_yuy2,
+ uint8* dst_argb,
+ int width);
+void UYVYToARGBRow_Any_SSSE3(const uint8* src_uyvy,
+ uint8* dst_argb,
+ int width);
+void I422ToBGRARow_Any_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_bgra,
+ int width);
+void I422ToABGRRow_Any_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_abgr,
+ int width);
+void I422ToRGBARow_Any_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgba,
+ int width);
+void I422ToARGB4444Row_Any_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgba,
+ int width);
+void I422ToARGB1555Row_Any_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgba,
+ int width);
+void I422ToRGB565Row_Any_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgba,
+ int width);
+// RGB24/RAW are unaligned.
+void I422ToRGB24Row_Any_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I422ToRAWRow_Any_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void YToARGBRow_SSE2(const uint8* src_y,
+ uint8* dst_argb,
+ int width);
+void YToARGBRow_NEON(const uint8* src_y,
+ uint8* dst_argb,
+ int width);
+void YToARGBRow_Any_SSE2(const uint8* src_y,
+ uint8* dst_argb,
+ int width);
+void YToARGBRow_Any_NEON(const uint8* src_y,
+ uint8* dst_argb,
+ int width);
+
+// ARGB preattenuated alpha blend.
+void ARGBBlendRow_SSSE3(const uint8* src_argb, const uint8* src_argb1,
+ uint8* dst_argb, int width);
+void ARGBBlendRow_SSE2(const uint8* src_argb, const uint8* src_argb1,
+ uint8* dst_argb, int width);
+void ARGBBlendRow_NEON(const uint8* src_argb, const uint8* src_argb1,
+ uint8* dst_argb, int width);
+void ARGBBlendRow_C(const uint8* src_argb, const uint8* src_argb1,
+ uint8* dst_argb, int width);
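These rows blend with premultiplied ("preattenuated") source alpha, so the per-channel composite reduces to dst = src + dst * (255 - src_alpha) / 255. A hedged reference sketch follows, assuming libyuv's B,G,R,A byte order; the helper name is mine, and the optimized rows use faster fixed-point rounding and their own destination-alpha policy:

    #include <stdint.h>

    // "Source over destination" with a premultiplied source, one ARGB row.
    static void BlendPremultipliedRowRef(const uint8_t* src,
                                         const uint8_t* dst_in,
                                         uint8_t* dst, int width) {
      for (int i = 0; i < width; ++i) {
        const uint32_t a = src[3];
        for (int c = 0; c < 3; ++c) {
          dst[c] = (uint8_t)(src[c] + (dst_in[c] * (255u - a)) / 255u);
        }
        dst[3] = dst_in[3];  // Destination alpha handling is an assumption here.
        src += 4;
        dst_in += 4;
        dst += 4;
      }
    }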
+
+// ARGB multiply images. Same API as Blend, but these require
+// pointer and width alignment for SSE2.
+void ARGBMultiplyRow_C(const uint8* src_argb, const uint8* src_argb1,
+ uint8* dst_argb, int width);
+void ARGBMultiplyRow_SSE2(const uint8* src_argb, const uint8* src_argb1,
+ uint8* dst_argb, int width);
+void ARGBMultiplyRow_Any_SSE2(const uint8* src_argb, const uint8* src_argb1,
+ uint8* dst_argb, int width);
+void ARGBMultiplyRow_AVX2(const uint8* src_argb, const uint8* src_argb1,
+ uint8* dst_argb, int width);
+void ARGBMultiplyRow_Any_AVX2(const uint8* src_argb, const uint8* src_argb1,
+ uint8* dst_argb, int width);
+void ARGBMultiplyRow_NEON(const uint8* src_argb, const uint8* src_argb1,
+ uint8* dst_argb, int width);
+void ARGBMultiplyRow_Any_NEON(const uint8* src_argb, const uint8* src_argb1,
+ uint8* dst_argb, int width);
+
+// ARGB add images.
+void ARGBAddRow_C(const uint8* src_argb, const uint8* src_argb1,
+ uint8* dst_argb, int width);
+void ARGBAddRow_SSE2(const uint8* src_argb, const uint8* src_argb1,
+ uint8* dst_argb, int width);
+void ARGBAddRow_Any_SSE2(const uint8* src_argb, const uint8* src_argb1,
+ uint8* dst_argb, int width);
+void ARGBAddRow_AVX2(const uint8* src_argb, const uint8* src_argb1,
+ uint8* dst_argb, int width);
+void ARGBAddRow_Any_AVX2(const uint8* src_argb, const uint8* src_argb1,
+ uint8* dst_argb, int width);
+void ARGBAddRow_NEON(const uint8* src_argb, const uint8* src_argb1,
+ uint8* dst_argb, int width);
+void ARGBAddRow_Any_NEON(const uint8* src_argb, const uint8* src_argb1,
+ uint8* dst_argb, int width);
+
+// ARGB subtract images. Same API as Blend, but these require
+// pointer and width alignment for SSE2.
+void ARGBSubtractRow_C(const uint8* src_argb, const uint8* src_argb1,
+ uint8* dst_argb, int width);
+void ARGBSubtractRow_SSE2(const uint8* src_argb, const uint8* src_argb1,
+ uint8* dst_argb, int width);
+void ARGBSubtractRow_Any_SSE2(const uint8* src_argb, const uint8* src_argb1,
+ uint8* dst_argb, int width);
+void ARGBSubtractRow_AVX2(const uint8* src_argb, const uint8* src_argb1,
+ uint8* dst_argb, int width);
+void ARGBSubtractRow_Any_AVX2(const uint8* src_argb, const uint8* src_argb1,
+ uint8* dst_argb, int width);
+void ARGBSubtractRow_NEON(const uint8* src_argb, const uint8* src_argb1,
+ uint8* dst_argb, int width);
+void ARGBSubtractRow_Any_NEON(const uint8* src_argb, const uint8* src_argb1,
+ uint8* dst_argb, int width);
+
+void ARGBToRGB24Row_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRAWRow_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRGB565Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToARGB1555Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToARGB4444Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
+
+void ARGBToRGB24Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRAWRow_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRGB565Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToARGB1555Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToARGB4444Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
+
+void I444ToARGBRow_Any_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I422ToARGBRow_Any_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I411ToARGBRow_Any_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I422ToBGRARow_Any_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I422ToABGRRow_Any_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I422ToRGBARow_Any_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I422ToRGB24Row_Any_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I422ToRAWRow_Any_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I422ToARGB4444Row_Any_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I422ToARGB1555Row_Any_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I422ToRGB565Row_Any_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void NV12ToARGBRow_Any_NEON(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_argb,
+ int width);
+void NV21ToARGBRow_Any_NEON(const uint8* src_y,
+ const uint8* src_vu,
+ uint8* dst_argb,
+ int width);
+void NV12ToRGB565Row_Any_NEON(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_argb,
+ int width);
+void NV21ToRGB565Row_Any_NEON(const uint8* src_y,
+ const uint8* src_vu,
+ uint8* dst_argb,
+ int width);
+void YUY2ToARGBRow_Any_NEON(const uint8* src_yuy2,
+ uint8* dst_argb,
+ int width);
+void UYVYToARGBRow_Any_NEON(const uint8* src_uyvy,
+ uint8* dst_argb,
+ int width);
+void I422ToARGBRow_MIPS_DSPR2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I422ToBGRARow_MIPS_DSPR2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+void I422ToABGRRow_MIPS_DSPR2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width);
+
+void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int pix);
+void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix);
+void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
+ uint8* dst_y, int pix);
+void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, int stride_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix);
+void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToUV422Row_NEON(const uint8* src_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToYRow_C(const uint8* src_yuy2, uint8* dst_y, int pix);
+void YUY2ToUVRow_C(const uint8* src_yuy2, int stride_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToUV422Row_C(const uint8* src_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToYRow_Any_AVX2(const uint8* src_yuy2, uint8* dst_y, int pix);
+void YUY2ToUVRow_Any_AVX2(const uint8* src_yuy2, int stride_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToUV422Row_Any_AVX2(const uint8* src_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToYRow_Any_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix);
+void YUY2ToUVRow_Any_SSE2(const uint8* src_yuy2, int stride_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToUV422Row_Any_SSE2(const uint8* src_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToYRow_Any_NEON(const uint8* src_yuy2, uint8* dst_y, int pix);
+void YUY2ToUVRow_Any_NEON(const uint8* src_yuy2, int stride_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToUV422Row_Any_NEON(const uint8* src_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int pix);
+void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix);
+void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
+ uint8* dst_y, int pix);
+void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix);
+void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToUV422Row_NEON(const uint8* src_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix);
+
+void UYVYToYRow_C(const uint8* src_uyvy, uint8* dst_y, int pix);
+void UYVYToUVRow_C(const uint8* src_uyvy, int stride_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToUV422Row_C(const uint8* src_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToYRow_Any_AVX2(const uint8* src_uyvy, uint8* dst_y, int pix);
+void UYVYToUVRow_Any_AVX2(const uint8* src_uyvy, int stride_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToUV422Row_Any_AVX2(const uint8* src_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToYRow_Any_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix);
+void UYVYToUVRow_Any_SSE2(const uint8* src_uyvy, int stride_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToUV422Row_Any_SSE2(const uint8* src_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToYRow_Any_NEON(const uint8* src_uyvy, uint8* dst_y, int pix);
+void UYVYToUVRow_Any_NEON(const uint8* src_uyvy, int stride_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToUV422Row_Any_NEON(const uint8* src_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix);
+
+void HalfRow_C(const uint8* src_uv, int src_uv_stride,
+ uint8* dst_uv, int pix);
+void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
+ uint8* dst_uv, int pix);
+void HalfRow_AVX2(const uint8* src_uv, int src_uv_stride,
+ uint8* dst_uv, int pix);
+void HalfRow_NEON(const uint8* src_uv, int src_uv_stride,
+ uint8* dst_uv, int pix);
+
+void ARGBToBayerRow_C(const uint8* src_argb, uint8* dst_bayer,
+ uint32 selector, int pix);
+void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
+ uint32 selector, int pix);
+void ARGBToBayerRow_NEON(const uint8* src_argb, uint8* dst_bayer,
+ uint32 selector, int pix);
+void ARGBToBayerRow_Any_SSSE3(const uint8* src_argb, uint8* dst_bayer,
+ uint32 selector, int pix);
+void ARGBToBayerRow_Any_NEON(const uint8* src_argb, uint8* dst_bayer,
+ uint32 selector, int pix);
+void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer,
+ uint32 /* selector */, int pix);
+
+void I422ToYUY2Row_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_yuy2, int width);
+void I422ToUYVYRow_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_uyvy, int width);
+void I422ToYUY2Row_SSE2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_yuy2, int width);
+void I422ToUYVYRow_SSE2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_uyvy, int width);
+void I422ToYUY2Row_Any_SSE2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_yuy2, int width);
+void I422ToUYVYRow_Any_SSE2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_uyvy, int width);
+void I422ToYUY2Row_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_yuy2, int width);
+void I422ToUYVYRow_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_uyvy, int width);
+void I422ToYUY2Row_Any_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_yuy2, int width);
+void I422ToUYVYRow_Any_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_uyvy, int width);
+
+// Effects related row functions.
+void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBAttenuateRow_Any_SSE2(const uint8* src_argb, uint8* dst_argb,
+ int width);
+void ARGBAttenuateRow_Any_SSSE3(const uint8* src_argb, uint8* dst_argb,
+ int width);
+void ARGBAttenuateRow_Any_AVX2(const uint8* src_argb, uint8* dst_argb,
+ int width);
+void ARGBAttenuateRow_Any_NEON(const uint8* src_argb, uint8* dst_argb,
+ int width);
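Attenuation premultiplies each colour channel by the pixel's own alpha; the unattenuate rows below undo it using the fixed_invtbl8 reciprocal table. A simplified sketch with a hypothetical helper name (exact rounding in the optimized rows differs):

    #include <stdint.h>

    // Premultiply B, G and R by A; alpha passes through unchanged.
    static void AttenuateRowRef(const uint8_t* src, uint8_t* dst, int width) {
      for (int i = 0; i < width; ++i) {
        const uint32_t a = src[3];
        dst[0] = (uint8_t)((src[0] * a) / 255u);
        dst[1] = (uint8_t)((src[1] * a) / 255u);
        dst[2] = (uint8_t)((src[2] * a) / 255u);
        dst[3] = (uint8_t)a;
        src += 4;
        dst += 4;
      }
    }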
+
+// Inverse table for unattenuate, shared by C and SSE2.
+extern uint32 fixed_invtbl8[256];
+void ARGBUnattenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBUnattenuateRow_Any_SSE2(const uint8* src_argb, uint8* dst_argb,
+ int width);
+void ARGBUnattenuateRow_Any_AVX2(const uint8* src_argb, uint8* dst_argb,
+ int width);
+
+void ARGBGrayRow_C(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width);
+
+void ARGBSepiaRow_C(uint8* dst_argb, int width);
+void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width);
+void ARGBSepiaRow_NEON(uint8* dst_argb, int width);
+
+void ARGBColorMatrixRow_C(uint8* dst_argb, const int8* matrix_argb, int width);
+void ARGBColorMatrixRow_SSSE3(uint8* dst_argb, const int8* matrix_argb,
+ int width);
+void ARGBColorMatrixRow_NEON(uint8* dst_argb, const int8* matrix_argb,
+ int width);
+
+void ARGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width);
+void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width);
+
+void ARGBQuantizeRow_C(uint8* dst_argb, int scale, int interval_size,
+ int interval_offset, int width);
+void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
+ int interval_offset, int width);
+void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size,
+ int interval_offset, int width);
+
+void ARGBShadeRow_C(const uint8* src_argb, uint8* dst_argb, int width,
+ uint32 value);
+void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
+ uint32 value);
+void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width,
+ uint32 value);
+
+// Used for blur.
+void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
+ int width, int area, uint8* dst, int count);
+void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
+ const int32* previous_cumsum, int width);
+
+void CumulativeSumToAverageRow_C(const int32* topleft, const int32* botleft,
+ int width, int area, uint8* dst, int count);
+void ComputeCumulativeSumRow_C(const uint8* row, int32* cumsum,
+ const int32* previous_cumsum, int width);
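The blur path first builds a summed-area (integral) table row by row with ComputeCumulativeSumRow, then reads each box average from four table corners. Below is a generic single-channel sketch of that second step; the helper name and table layout are assumptions (the library keeps per-channel sums), not a transcription of the row code:

    #include <stdint.h>

    // Box average from a summed-area table. 'top' is the table row just above
    // the box, 'bot' the row at its bottom edge; each entry is the running sum
    // of all pixels above and to the left of that column.
    static void BoxAverageRowRef(const int32_t* top, const int32_t* bot,
                                 int box_width, int area,
                                 uint8_t* dst, int count) {
      for (int x = 0; x < count; ++x) {
        const int32_t sum =
            bot[x + box_width] - bot[x] - top[x + box_width] + top[x];
        dst[x] = (uint8_t)(sum / area);
      }
    }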
+
+LIBYUV_API
+void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride,
+ uint8* dst_argb, const float* uv_dudv, int width);
+LIBYUV_API
+void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
+ uint8* dst_argb, const float* uv_dudv, int width);
+
+// Used for I420Scale, ARGBScale, and ARGBInterpolate.
+void InterpolateRow_C(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride_ptr,
+ int width, int source_y_fraction);
+void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride_ptr, int width,
+ int source_y_fraction);
+void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride_ptr, int width,
+ int source_y_fraction);
+void InterpolateRow_NEON(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride_ptr, int width,
+ int source_y_fraction);
+void InterpolateRows_MIPS_DSPR2(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride_ptr, int width,
+ int source_y_fraction);
+void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride_ptr, int width,
+ int source_y_fraction);
+void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride_ptr, int width,
+ int source_y_fraction);
+void InterpolateRow_Any_NEON(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride_ptr, int width,
+ int source_y_fraction);
+void InterpolateRow_Any_SSE2(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride_ptr, int width,
+ int source_y_fraction);
+void InterpolateRow_Any_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride_ptr, int width,
+ int source_y_fraction);
+void InterpolateRows_Any_MIPS_DSPR2(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride_ptr, int width,
+ int source_y_fraction);
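InterpolateRow produces a weighted blend of a row and the row src_stride_ptr bytes below it, which is how the bilinear scalers and ARGBInterpolate synthesize in-between rows. A reference sketch, assuming source_y_fraction is expressed out of 256 (an assumption for illustration; the helper name is mine):

    #include <stddef.h>
    #include <stdint.h>

    // Linear blend of two rows: fraction 0 copies the upper row; larger
    // fractions weight the lower row more heavily.
    static void InterpolateRowRef(uint8_t* dst, const uint8_t* src,
                                  ptrdiff_t src_stride, int width,
                                  int fraction) {
      const int f1 = fraction;        // weight of the lower row
      const int f0 = 256 - fraction;  // weight of the upper row
      for (int x = 0; x < width; ++x) {
        dst[x] = (uint8_t)((src[x] * f0 + src[x + src_stride] * f1) >> 8);
      }
    }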
+
+// Sobel images.
+void SobelXRow_C(const uint8* src_y0, const uint8* src_y1, const uint8* src_y2,
+ uint8* dst_sobelx, int width);
+void SobelXRow_SSSE3(const uint8* src_y0, const uint8* src_y1,
+ const uint8* src_y2, uint8* dst_sobelx, int width);
+void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
+ const uint8* src_y2, uint8* dst_sobelx, int width);
+void SobelYRow_C(const uint8* src_y0, const uint8* src_y1,
+ uint8* dst_sobely, int width);
+void SobelYRow_SSSE3(const uint8* src_y0, const uint8* src_y1,
+ uint8* dst_sobely, int width);
+void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
+ uint8* dst_sobely, int width);
+void SobelRow_C(const uint8* src_sobelx, const uint8* src_sobely,
+ uint8* dst_argb, int width);
+void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
+ uint8* dst_argb, int width);
+void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
+ uint8* dst_argb, int width);
+void SobelXYRow_C(const uint8* src_sobelx, const uint8* src_sobely,
+ uint8* dst_argb, int width);
+void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
+ uint8* dst_argb, int width);
+void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
+ uint8* dst_argb, int width);
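SobelXRow and SobelYRow compute gradient magnitudes from neighbouring luma rows; SobelRow and SobelXYRow then pack the results into ARGB. A plain 3x3 Sobel-X sketch over three consecutive rows follows; the tap placement, clamping and helper name are assumptions, not a transcription of the row code:

    #include <stdint.h>
    #include <stdlib.h>

    // |Gx| of the standard 3x3 Sobel kernel. Assumes the caller provides two
    // extra readable columns past 'width', since edge handling is omitted.
    static void SobelXRowRef(const uint8_t* row0, const uint8_t* row1,
                             const uint8_t* row2, uint8_t* dst, int width) {
      for (int x = 0; x < width; ++x) {
        const int gx = (row0[x + 2] - row0[x]) +
                       2 * (row1[x + 2] - row1[x]) +
                       (row2[x + 2] - row2[x]);
        const int mag = abs(gx);
        dst[x] = (uint8_t)(mag > 255 ? 255 : mag);
      }
    }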
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_ROW_H_ NOLINT
diff --git a/chromium/third_party/libyuv/include/libyuv/scale.h b/chromium/third_party/libyuv/include/libyuv/scale.h
new file mode 100644
index 00000000000..b1efc95d2fd
--- /dev/null
+++ b/chromium/third_party/libyuv/include/libyuv/scale.h
@@ -0,0 +1,84 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_SCALE_H_ // NOLINT
+#define INCLUDE_LIBYUV_SCALE_H_
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Supported filtering
+enum FilterMode {
+ kFilterNone = 0, // Point sample; Fastest.
+ kFilterBilinear = 1, // Faster than box, but lower quality scaling down.
+ kFilterBox = 2 // Highest quality.
+};
+
+// Scale a YUV plane.
+LIBYUV_API
+void ScalePlane(const uint8* src, int src_stride,
+ int src_width, int src_height,
+ uint8* dst, int dst_stride,
+ int dst_width, int dst_height,
+ enum FilterMode filtering);
+
+// Scales a YUV 4:2:0 image from the src width and height to the
+// dst width and height.
+// If filtering is kFilterNone, a simple nearest-neighbor algorithm is
+// used. This produces basic (blocky) quality at the fastest speed.
+// If filtering is kFilterBilinear, interpolation is used to produce a better
+// quality image, at the expense of speed.
+// If filtering is kFilterBox, averaging is used to produce an even better
+// quality image, at a further expense of speed.
+// Returns 0 if successful.
+
+LIBYUV_API
+int I420Scale(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ int src_width, int src_height,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int dst_width, int dst_height,
+ enum FilterMode filtering);
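A hedged usage sketch: box-filter a 1280x720 I420 frame down to 640x360. The wrapper name is illustrative, the strides assume tightly packed planes (chroma planes are half width and half height), and buffer allocation plus error handling are omitted.

    #include "libyuv/scale.h"

    int HalveFrame(const uint8* src_y, const uint8* src_u, const uint8* src_v,
                   uint8* dst_y, uint8* dst_u, uint8* dst_v) {
      return libyuv::I420Scale(src_y, 1280, src_u, 640, src_v, 640,
                               1280, 720,
                               dst_y, 640, dst_u, 320, dst_v, 320,
                               640, 360,
                               libyuv::kFilterBox);
    }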
+
+#ifdef __cplusplus
+// Legacy API. Deprecated.
+LIBYUV_API
+int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v,
+ int src_stride_y, int src_stride_u, int src_stride_v,
+ int src_width, int src_height,
+ uint8* dst_y, uint8* dst_u, uint8* dst_v,
+ int dst_stride_y, int dst_stride_u, int dst_stride_v,
+ int dst_width, int dst_height,
+ bool interpolate);
+
+// Legacy API. Deprecated.
+LIBYUV_API
+int ScaleOffset(const uint8* src_i420, int src_width, int src_height,
+ uint8* dst_i420, int dst_width, int dst_height, int dst_yoffset,
+ bool interpolate);
+
+// For testing, allow disabling of specialized scalers.
+LIBYUV_API
+void SetUseReferenceImpl(bool use);
+#endif // __cplusplus
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_SCALE_H_ NOLINT
diff --git a/chromium/third_party/libyuv/include/libyuv/scale_argb.h b/chromium/third_party/libyuv/include/libyuv/scale_argb.h
new file mode 100644
index 00000000000..b6f510522e7
--- /dev/null
+++ b/chromium/third_party/libyuv/include/libyuv/scale_argb.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_SCALE_ARGB_H_ // NOLINT
+#define INCLUDE_LIBYUV_SCALE_ARGB_H_
+
+#include "libyuv/basic_types.h"
+#include "libyuv/scale.h" // For FilterMode
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+LIBYUV_API
+int ARGBScale(const uint8* src_argb, int src_stride_argb,
+ int src_width, int src_height,
+ uint8* dst_argb, int dst_stride_argb,
+ int dst_width, int dst_height,
+ enum FilterMode filtering);
+
+// Clipped scale takes destination rectangle coordinates for clip values.
+LIBYUV_API
+int ARGBScaleClip(const uint8* src_argb, int src_stride_argb,
+ int src_width, int src_height,
+ uint8* dst_argb, int dst_stride_argb,
+ int dst_width, int dst_height,
+ int clip_x, int clip_y, int clip_width, int clip_height,
+ enum FilterMode filtering);
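A hedged usage sketch of clipped scaling: scale the whole ARGB frame to the destination size, but only write the pixels of one 128x128 tile of the destination (useful for tiled or partial redraws). The wrapper name is illustrative and the width*4 strides assume tightly packed rows.

    #include "libyuv/scale_argb.h"

    int ScaleIntoTile(const uint8* src_argb, int src_w, int src_h,
                      uint8* dst_argb, int dst_w, int dst_h) {
      // Fill only the tile whose top-left corner is at (256, 0).
      return libyuv::ARGBScaleClip(src_argb, src_w * 4, src_w, src_h,
                                   dst_argb, dst_w * 4, dst_w, dst_h,
                                   256, 0, 128, 128,
                                   libyuv::kFilterBilinear);
    }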
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_SCALE_ARGB_H_ NOLINT
diff --git a/chromium/third_party/libyuv/include/libyuv/version.h b/chromium/third_party/libyuv/include/libyuv/version.h
new file mode 100644
index 00000000000..31cf78fc591
--- /dev/null
+++ b/chromium/third_party/libyuv/include/libyuv/version.h
@@ -0,0 +1,16 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
+#define INCLUDE_LIBYUV_VERSION_H_
+
+#define LIBYUV_VERSION 723
+
+#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
diff --git a/chromium/third_party/libyuv/include/libyuv/video_common.h b/chromium/third_party/libyuv/include/libyuv/video_common.h
new file mode 100644
index 00000000000..d118cacfb06
--- /dev/null
+++ b/chromium/third_party/libyuv/include/libyuv/video_common.h
@@ -0,0 +1,179 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+// Common definitions for video, including fourcc and VideoFormat.
+
+#ifndef INCLUDE_LIBYUV_VIDEO_COMMON_H_ // NOLINT
+#define INCLUDE_LIBYUV_VIDEO_COMMON_H_
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+//////////////////////////////////////////////////////////////////////////////
+// Definition of FourCC codes
+//////////////////////////////////////////////////////////////////////////////
+
+// Convert four characters to a FourCC code.
+// Needs to be a macro otherwise the OS X compiler complains when the kFormat*
+// constants are used in a switch.
+#ifdef __cplusplus
+#define FOURCC(a, b, c, d) ( \
+ (static_cast<uint32>(a)) | (static_cast<uint32>(b) << 8) | \
+ (static_cast<uint32>(c) << 16) | (static_cast<uint32>(d) << 24))
+#else
+#define FOURCC(a, b, c, d) ( \
+ ((uint32)(a)) | ((uint32)(b) << 8) | /* NOLINT */ \
+ ((uint32)(c) << 16) | ((uint32)(d) << 24)) /* NOLINT */
+#endif
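Worked example of the packing: the first character lands in the least significant byte, so the stored value is little-endian.

    // FOURCC('I', '4', '2', '0')
    //   = 0x49 | (0x34 << 8) | (0x32 << 16) | (0x30 << 24)
    //   = 0x30323449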
+
+// Some pages discussing FourCC codes:
+// http://www.fourcc.org/yuv.php
+// http://v4l2spec.bytesex.org/spec/book1.htm
+// http://developer.apple.com/quicktime/icefloe/dispatch020.html
+// http://msdn.microsoft.com/library/windows/desktop/dd206750.aspx#nv12
+// http://people.xiph.org/~xiphmont/containers/nut/nut4cc.txt
+
+// FourCC codes grouped according to implementation efficiency.
+// Primary formats should convert in 1 efficient step.
+// Secondary formats are converted in 2 steps.
+// Auxiliary formats call primary converters.
+enum FourCC {
+ // 9 Primary YUV formats: 5 planar, 2 biplanar, 2 packed.
+ FOURCC_I420 = FOURCC('I', '4', '2', '0'),
+ FOURCC_I422 = FOURCC('I', '4', '2', '2'),
+ FOURCC_I444 = FOURCC('I', '4', '4', '4'),
+ FOURCC_I411 = FOURCC('I', '4', '1', '1'),
+ FOURCC_I400 = FOURCC('I', '4', '0', '0'),
+ FOURCC_NV21 = FOURCC('N', 'V', '2', '1'),
+ FOURCC_NV12 = FOURCC('N', 'V', '1', '2'),
+ FOURCC_YUY2 = FOURCC('Y', 'U', 'Y', '2'),
+ FOURCC_UYVY = FOURCC('U', 'Y', 'V', 'Y'),
+
+ // 2 Secondary YUV formats: row biplanar.
+ FOURCC_M420 = FOURCC('M', '4', '2', '0'),
+ FOURCC_Q420 = FOURCC('Q', '4', '2', '0'),
+
+ // 9 Primary RGB formats: 4 32 bpp, 2 24 bpp, 3 16 bpp.
+ FOURCC_ARGB = FOURCC('A', 'R', 'G', 'B'),
+ FOURCC_BGRA = FOURCC('B', 'G', 'R', 'A'),
+ FOURCC_ABGR = FOURCC('A', 'B', 'G', 'R'),
+ FOURCC_24BG = FOURCC('2', '4', 'B', 'G'),
+ FOURCC_RAW = FOURCC('r', 'a', 'w', ' '),
+ FOURCC_RGBA = FOURCC('R', 'G', 'B', 'A'),
+ FOURCC_RGBP = FOURCC('R', 'G', 'B', 'P'), // bgr565.
+ FOURCC_RGBO = FOURCC('R', 'G', 'B', 'O'), // abgr1555.
+ FOURCC_R444 = FOURCC('R', '4', '4', '4'), // argb4444.
+
+ // 4 Secondary RGB formats: 4 Bayer Patterns.
+ FOURCC_RGGB = FOURCC('R', 'G', 'G', 'B'),
+ FOURCC_BGGR = FOURCC('B', 'G', 'G', 'R'),
+ FOURCC_GRBG = FOURCC('G', 'R', 'B', 'G'),
+ FOURCC_GBRG = FOURCC('G', 'B', 'R', 'G'),
+
+ // 1 Primary Compressed YUV format.
+ FOURCC_MJPG = FOURCC('M', 'J', 'P', 'G'),
+
+ // 6 Auxiliary YUV variations: 3 with U and V planes swapped, 1 alias, 2 JPEG (full-range).
+ FOURCC_YV12 = FOURCC('Y', 'V', '1', '2'),
+ FOURCC_YV16 = FOURCC('Y', 'V', '1', '6'),
+ FOURCC_YV24 = FOURCC('Y', 'V', '2', '4'),
+ FOURCC_YU12 = FOURCC('Y', 'U', '1', '2'), // Linux version of I420.
+ FOURCC_J420 = FOURCC('J', '4', '2', '0'),
+ FOURCC_J400 = FOURCC('J', '4', '0', '0'),
+
+ // 14 Auxiliary aliases. CanonicalFourCC() maps these to canonical fourcc.
+ FOURCC_IYUV = FOURCC('I', 'Y', 'U', 'V'), // Alias for I420.
+ FOURCC_YU16 = FOURCC('Y', 'U', '1', '6'), // Alias for I422.
+ FOURCC_YU24 = FOURCC('Y', 'U', '2', '4'), // Alias for I444.
+ FOURCC_YUYV = FOURCC('Y', 'U', 'Y', 'V'), // Alias for YUY2.
+ FOURCC_YUVS = FOURCC('y', 'u', 'v', 's'), // Alias for YUY2 on Mac.
+ FOURCC_HDYC = FOURCC('H', 'D', 'Y', 'C'), // Alias for UYVY.
+ FOURCC_2VUY = FOURCC('2', 'v', 'u', 'y'), // Alias for UYVY on Mac.
+ FOURCC_JPEG = FOURCC('J', 'P', 'E', 'G'), // Alias for MJPG.
+ FOURCC_DMB1 = FOURCC('d', 'm', 'b', '1'), // Alias for MJPG on Mac.
+ FOURCC_BA81 = FOURCC('B', 'A', '8', '1'), // Alias for BGGR.
+ FOURCC_RGB3 = FOURCC('R', 'G', 'B', '3'), // Alias for RAW.
+ FOURCC_BGR3 = FOURCC('B', 'G', 'R', '3'), // Alias for 24BG.
+ FOURCC_CM32 = FOURCC(0, 0, 0, 32), // Alias for BGRA kCMPixelFormat_32ARGB
+ FOURCC_CM24 = FOURCC(0, 0, 0, 24), // Alias for RAW kCMPixelFormat_24RGB
+
+ // 1 Auxiliary compressed YUV format set aside for capturer.
+ FOURCC_H264 = FOURCC('H', '2', '6', '4'),
+
+ // Match any fourcc.
+ FOURCC_ANY = 0xFFFFFFFF,
+};
+
+enum FourCCBpp {
+ // Canonical fourcc codes used in our code.
+ FOURCC_BPP_I420 = 12,
+ FOURCC_BPP_I422 = 16,
+ FOURCC_BPP_I444 = 24,
+ FOURCC_BPP_I411 = 12,
+ FOURCC_BPP_I400 = 8,
+ FOURCC_BPP_NV21 = 12,
+ FOURCC_BPP_NV12 = 12,
+ FOURCC_BPP_YUY2 = 16,
+ FOURCC_BPP_UYVY = 16,
+ FOURCC_BPP_M420 = 12,
+ FOURCC_BPP_Q420 = 12,
+ FOURCC_BPP_ARGB = 32,
+ FOURCC_BPP_BGRA = 32,
+ FOURCC_BPP_ABGR = 32,
+ FOURCC_BPP_RGBA = 32,
+ FOURCC_BPP_24BG = 24,
+ FOURCC_BPP_RAW = 24,
+ FOURCC_BPP_RGBP = 16,
+ FOURCC_BPP_RGBO = 16,
+ FOURCC_BPP_R444 = 16,
+ FOURCC_BPP_RGGB = 8,
+ FOURCC_BPP_BGGR = 8,
+ FOURCC_BPP_GRBG = 8,
+ FOURCC_BPP_GBRG = 8,
+ FOURCC_BPP_YV12 = 12,
+ FOURCC_BPP_YV16 = 16,
+ FOURCC_BPP_YV24 = 24,
+ FOURCC_BPP_YU12 = 12,
+ FOURCC_BPP_J420 = 12,
+ FOURCC_BPP_J400 = 8,
+ FOURCC_BPP_MJPG = 0, // 0 means unknown.
+ FOURCC_BPP_H264 = 0,
+ FOURCC_BPP_IYUV = 12,
+ FOURCC_BPP_YU16 = 16,
+ FOURCC_BPP_YU24 = 24,
+ FOURCC_BPP_YUYV = 16,
+ FOURCC_BPP_YUVS = 16,
+ FOURCC_BPP_HDYC = 16,
+ FOURCC_BPP_2VUY = 16,
+ FOURCC_BPP_JPEG = 1,
+ FOURCC_BPP_DMB1 = 1,
+ FOURCC_BPP_BA81 = 8,
+ FOURCC_BPP_RGB3 = 24,
+ FOURCC_BPP_BGR3 = 24,
+ FOURCC_BPP_CM32 = 32,
+ FOURCC_BPP_CM24 = 24,
+
+ // Match any fourcc.
+ FOURCC_BPP_ANY = 0, // 0 means unknown.
+};
+
+// Converts fourcc aliases into canonical ones.
+LIBYUV_API uint32 CanonicalFourCC(uint32 fourcc);
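A hedged usage sketch based on the alias table above: a capture pipeline can normalize whatever fourcc a camera reports before branching on format. The helper name is illustrative.

    #include "libyuv/video_common.h"

    bool IsYuy2Like(uint32 fourcc) {
      // FOURCC_YUYV and FOURCC_YUVS are listed above as YUY2 aliases, so both
      // collapse to FOURCC_YUY2 here.
      return libyuv::CanonicalFourCC(fourcc) == libyuv::FOURCC_YUY2;
    }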
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_VIDEO_COMMON_H_ NOLINT
diff --git a/chromium/third_party/libyuv/libyuv.gyp b/chromium/third_party/libyuv/libyuv.gyp
new file mode 100644
index 00000000000..ad6b78b5c3e
--- /dev/null
+++ b/chromium/third_party/libyuv/libyuv.gyp
@@ -0,0 +1,123 @@
+# Copyright 2011 The LibYuv Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS. All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+
+{
+ 'variables': {
+ 'use_system_libjpeg%': 0,
+ },
+ 'targets': [
+ {
+ 'target_name': 'libyuv',
+ # Change type to 'shared_library' to build .so or .dll files.
+ 'type': 'static_library',
+ # Allows building libyuv.a as a redistributable library without external dependencies.
+ 'standalone_static_library': 1,
+ 'conditions': [
+ # TODO(fbarchard): Use gyp define to enable jpeg.
+ [ 'OS != "ios"', {
+ 'defines': [
+ 'HAVE_JPEG'
+ ],
+ 'conditions': [
+ [ 'use_system_libjpeg==0', {
+ 'dependencies': [
+ '<(DEPTH)/third_party/libjpeg_turbo/libjpeg.gyp:libjpeg',
+ ],
+ }, {
+ 'link_settings': {
+ 'libraries': [
+ '-ljpeg',
+ ],
+ },
+ }],
+ ],
+ }],
+ ],
+ 'defines': [
+ # Enable the following 3 macros to turn off assembly for specified CPU.
+ # 'LIBYUV_DISABLE_X86',
+ # 'LIBYUV_DISABLE_NEON',
+ # 'LIBYUV_DISABLE_MIPS',
+ # Enable the following macro to build libyuv as a shared library (dll).
+ # 'LIBYUV_USING_SHARED_LIBRARY',
+ ],
+ 'include_dirs': [
+ 'include',
+ '.',
+ ],
+ 'direct_dependent_settings': {
+ 'include_dirs': [
+ 'include',
+ '.',
+ ],
+ },
+ 'sources': [
+ # includes.
+ 'include/libyuv.h',
+ 'include/libyuv/basic_types.h',
+ 'include/libyuv/compare.h',
+ 'include/libyuv/convert.h',
+ 'include/libyuv/convert_argb.h',
+ 'include/libyuv/convert_from.h',
+ 'include/libyuv/convert_from_argb.h',
+ 'include/libyuv/cpu_id.h',
+ 'include/libyuv/format_conversion.h',
+ 'include/libyuv/mjpeg_decoder.h',
+ 'include/libyuv/planar_functions.h',
+ 'include/libyuv/rotate.h',
+ 'include/libyuv/rotate_argb.h',
+ 'include/libyuv/row.h',
+ 'include/libyuv/scale.h',
+ 'include/libyuv/scale_argb.h',
+ 'include/libyuv/version.h',
+ 'include/libyuv/video_common.h',
+
+ # sources.
+ 'source/compare.cc',
+ 'source/compare_common.cc',
+ 'source/compare_neon.cc',
+ 'source/compare_posix.cc',
+ 'source/compare_win.cc',
+ 'source/convert.cc',
+ 'source/convert_argb.cc',
+ 'source/convert_from.cc',
+ 'source/convert_from_argb.cc',
+ 'source/convert_jpeg.cc',
+ 'source/convert_to_argb.cc',
+ 'source/convert_to_i420.cc',
+ 'source/cpu_id.cc',
+ 'source/format_conversion.cc',
+ 'source/mjpeg_decoder.cc',
+ 'source/mjpeg_validate.cc',
+ 'source/planar_functions.cc',
+ 'source/rotate.cc',
+ 'source/rotate_argb.cc',
+ 'source/rotate_mips.cc',
+ 'source/rotate_neon.cc',
+ 'source/row_any.cc',
+ 'source/row_common.cc',
+ 'source/row_mips.cc',
+ 'source/row_neon.cc',
+ 'source/row_posix.cc',
+ 'source/row_win.cc',
+ 'source/scale.cc',
+ 'source/scale_argb.cc',
+ 'source/scale_argb_neon.cc',
+ 'source/scale_mips.cc',
+ 'source/scale_neon.cc',
+ 'source/video_common.cc',
+ ],
+ },
+ ], # targets.
+}
+
+# Local Variables:
+# tab-width:2
+# indent-tabs-mode:nil
+# End:
+# vim: set expandtab tabstop=2 shiftwidth=2:
diff --git a/chromium/third_party/libyuv/libyuv_test.gyp b/chromium/third_party/libyuv/libyuv_test.gyp
new file mode 100644
index 00000000000..447881a4480
--- /dev/null
+++ b/chromium/third_party/libyuv/libyuv_test.gyp
@@ -0,0 +1,126 @@
+# Copyright 2011 The LibYuv Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS. All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+
+{
+ 'targets': [
+ {
+ 'target_name': 'libyuv_unittest',
+ 'type': 'executable',
+ 'dependencies': [
+ 'libyuv.gyp:libyuv',
+ # The tests are based on gtest
+ 'testing/gtest.gyp:gtest',
+ 'testing/gtest.gyp:gtest_main',
+ ],
+ 'defines': [
+ 'LIBYUV_SVNREVISION="<!(svnversion -n)"',
+ # Enable the following 3 macros to turn off assembly for specified CPU.
+ # 'LIBYUV_DISABLE_X86',
+ # 'LIBYUV_DISABLE_NEON',
+ # 'LIBYUV_DISABLE_MIPS',
+ # Enable the following macro to build libyuv as a shared library (dll).
+ # 'LIBYUV_USING_SHARED_LIBRARY',
+ ],
+ 'sources': [
+ # headers
+ 'unit_test/unit_test.h',
+
+ # sources
+ 'unit_test/basictypes_test.cc',
+ 'unit_test/compare_test.cc',
+ 'unit_test/convert_test.cc',
+ 'unit_test/cpu_test.cc',
+ 'unit_test/planar_test.cc',
+ 'unit_test/rotate_argb_test.cc',
+ 'unit_test/rotate_test.cc',
+ 'unit_test/scale_argb_test.cc',
+ 'unit_test/scale_test.cc',
+ 'unit_test/unit_test.cc',
+ 'unit_test/video_common_test.cc',
+ 'unit_test/version_test.cc',
+ ],
+ 'conditions': [
+ ['OS=="linux"', {
+ 'cflags': [
+ '-fexceptions',
+ ],
+ }],
+ [ 'OS != "ios"', {
+ 'defines': [
+ 'HAVE_JPEG',
+ ],
+ }],
+ ], # conditions
+ },
+
+ {
+ 'target_name': 'compare',
+ 'type': 'executable',
+ 'dependencies': [
+ 'libyuv.gyp:libyuv',
+ ],
+ 'sources': [
+ # sources
+ 'util/compare.cc',
+ ],
+ 'conditions': [
+ ['OS=="linux"', {
+ 'cflags': [
+ '-fexceptions',
+ ],
+ }],
+ ], # conditions
+ },
+ {
+ 'target_name': 'convert',
+ 'type': 'executable',
+ 'dependencies': [
+ 'libyuv.gyp:libyuv',
+ ],
+ 'sources': [
+ # sources
+ 'util/convert.cc',
+ ],
+ 'conditions': [
+ ['OS=="linux"', {
+ 'cflags': [
+ '-fexceptions',
+ ],
+ }],
+ ], # conditions
+ },
+ # TODO(fbarchard): Enable SSE2 and OpenMP for better performance.
+ {
+ 'target_name': 'psnr',
+ 'type': 'executable',
+ 'sources': [
+ # sources
+ 'util/psnr_main.cc',
+ 'util/psnr.cc',
+ 'util/ssim.cc',
+ ],
+ },
+ {
+ 'target_name': 'cpuid',
+ 'type': 'executable',
+ 'sources': [
+ # sources
+ 'util/cpuid.c',
+ ],
+ 'dependencies': [
+ 'libyuv.gyp:libyuv',
+ ],
+ },
+ ], # targets
+}
+
+# Local Variables:
+# tab-width:2
+# indent-tabs-mode:nil
+# End:
+# vim: set expandtab tabstop=2 shiftwidth=2:
diff --git a/chromium/third_party/libyuv/public.mk b/chromium/third_party/libyuv/public.mk
new file mode 100644
index 00000000000..090d8cb659f
--- /dev/null
+++ b/chromium/third_party/libyuv/public.mk
@@ -0,0 +1,13 @@
+# This file contains all the common make variables which are useful for
+# anyone depending on this library.
+# Note that dependencies on the NDK are not listed directly, since the NDK
+# adds them automatically.
+
+LIBYUV_INCLUDES := $(LIBYUV_PATH)/include
+
+LIBYUV_C_FLAGS :=
+
+LIBYUV_CPP_FLAGS :=
+
+LIBYUV_LDLIBS :=
+LIBYUV_DEP_MODULES :=
diff --git a/chromium/third_party/libyuv/source/compare.cc b/chromium/third_party/libyuv/source/compare.cc
new file mode 100644
index 00000000000..f8b358309e5
--- /dev/null
+++ b/chromium/third_party/libyuv/source/compare.cc
@@ -0,0 +1,323 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/compare.h"
+
+#include <float.h>
+#include <math.h>
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+#include "libyuv/basic_types.h"
+#include "libyuv/cpu_id.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// hash seed of 5381 recommended.
+// Internal C version of HashDjb2 with int sized count for efficiency.
+uint32 HashDjb2_C(const uint8* src, int count, uint32 seed);
+
+// This module is for Visual C x86 and GCC x86_64 / non-PIC x86.
+#if !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || \
+ (defined(__x86_64__) || (defined(__i386__) && !defined(__pic__))))
+#define HAS_HASHDJB2_SSE41
+
+uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed);
+
+#endif // HAS_HASHDJB2_SSE41
+
+// hash seed of 5381 recommended.
+LIBYUV_API
+uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed) {
+ uint32 (*HashDjb2_SSE)(const uint8* src, int count, uint32 seed) = HashDjb2_C;
+#if defined(HAS_HASHDJB2_SSE41)
+ if (TestCpuFlag(kCpuHasSSE41)) {
+ HashDjb2_SSE = HashDjb2_SSE41;
+ }
+#endif
+
+ const int kBlockSize = 1 << 15; // 32768;
+ while (count >= static_cast<uint64>(kBlockSize)) {
+ seed = HashDjb2_SSE(src, kBlockSize, seed);
+ src += kBlockSize;
+ count -= kBlockSize;
+ }
+ int remainder = static_cast<int>(count) & ~15;
+ if (remainder) {
+ seed = HashDjb2_SSE(src, remainder, seed);
+ src += remainder;
+ count -= remainder;
+ }
+ remainder = static_cast<int>(count) & 15;
+ if (remainder) {
+ seed = HashDjb2_C(src, remainder, seed);
+ }
+ return seed;
+}
+
+uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count);
+#if !defined(LIBYUV_DISABLE_NEON) && \
+ (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
+#define HAS_SUMSQUAREERROR_NEON
+uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count);
+#endif
+#if !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || \
+ defined(__x86_64__) || defined(__i386__))
+#define HAS_SUMSQUAREERROR_SSE2
+uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count);
+#endif
+// Visual C 2012 required for AVX2.
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && _MSC_VER >= 1700
+#define HAS_SUMSQUAREERROR_AVX2
+uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count);
+#endif
+
+// TODO(fbarchard): Refactor into row function.
+LIBYUV_API
+uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b,
+ int count) {
+ uint32 (*SumSquareError)(const uint8* src_a, const uint8* src_b, int count) =
+ SumSquareError_C;
+#if defined(HAS_SUMSQUAREERROR_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ SumSquareError = SumSquareError_NEON;
+ }
+#endif
+#if defined(HAS_SUMSQUAREERROR_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) &&
+ IS_ALIGNED(src_a, 16) && IS_ALIGNED(src_b, 16)) {
+ // Note only used for multiples of 16 so count is not checked.
+ SumSquareError = SumSquareError_SSE2;
+ }
+#endif
+#if defined(HAS_SUMSQUAREERROR_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ // Note only used for multiples of 32 so count is not checked.
+ SumSquareError = SumSquareError_AVX2;
+ }
+#endif
+ // SumSquareError returns values 0 to 65535 for each squared difference.
+ // Up to 65536 of those can be summed and remain within a uint32.
+ // After each block of 65536 pixels, accumulate into a uint64.
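+ // Worst case per block: 65536 * (255 * 255) = 4,261,478,400, which still
+ // fits in a uint32.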
+ const int kBlockSize = 65536;
+ uint64 sse = 0;
+#ifdef _OPENMP
+#pragma omp parallel for reduction(+: sse)
+#endif
+ for (int i = 0; i < (count - (kBlockSize - 1)); i += kBlockSize) {
+ sse += SumSquareError(src_a + i, src_b + i, kBlockSize);
+ }
+ src_a += count & ~(kBlockSize - 1);
+ src_b += count & ~(kBlockSize - 1);
+ int remainder = count & (kBlockSize - 1) & ~31;
+ if (remainder) {
+ sse += SumSquareError(src_a, src_b, remainder);
+ src_a += remainder;
+ src_b += remainder;
+ }
+ remainder = count & 31;
+ if (remainder) {
+ sse += SumSquareError_C(src_a, src_b, remainder);
+ }
+ return sse;
+}
+
+LIBYUV_API
+uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a,
+ const uint8* src_b, int stride_b,
+ int width, int height) {
+ if (stride_a == width && stride_b == width) {
+ return ComputeSumSquareError(src_a, src_b, width * height);
+ }
+ uint32 (*SumSquareError)(const uint8* src_a, const uint8* src_b, int count) =
+ SumSquareError_C;
+#if defined(HAS_SUMSQUAREERROR_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ SumSquareError = SumSquareError_NEON;
+ }
+#endif
+#if defined(HAS_SUMSQUAREERROR_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) &&
+ IS_ALIGNED(src_a, 16) && IS_ALIGNED(stride_a, 16) &&
+ IS_ALIGNED(src_b, 16) && IS_ALIGNED(stride_b, 16)) {
+ SumSquareError = SumSquareError_SSE2;
+ }
+#endif
+#if defined(HAS_SUMSQUAREERROR_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 32)) {
+ SumSquareError = SumSquareError_AVX2;
+ }
+#endif
+ uint64 sse = 0;
+ for (int h = 0; h < height; ++h) {
+ sse += SumSquareError(src_a, src_b, width);
+ src_a += stride_a;
+ src_b += stride_b;
+ }
+ return sse;
+}
+
+LIBYUV_API
+double SumSquareErrorToPsnr(uint64 sse, uint64 count) {
+ double psnr;
+ if (sse > 0) {
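+ // Note: 'mse' below actually holds count / sse (the reciprocal of the mean
+ // squared error), so the result is 10 * log10(255^2 * count / sse), i.e.
+ // the standard PSNR.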
+ double mse = static_cast<double>(count) / static_cast<double>(sse);
+ psnr = 10.0 * log10(255.0 * 255.0 * mse);
+ } else {
+ psnr = kMaxPsnr; // Limit to prevent divide by 0
+ }
+
+ if (psnr > kMaxPsnr)
+ psnr = kMaxPsnr;
+
+ return psnr;
+}
+
+LIBYUV_API
+double CalcFramePsnr(const uint8* src_a, int stride_a,
+ const uint8* src_b, int stride_b,
+ int width, int height) {
+ const uint64 samples = width * height;
+ const uint64 sse = ComputeSumSquareErrorPlane(src_a, stride_a,
+ src_b, stride_b,
+ width, height);
+ return SumSquareErrorToPsnr(sse, samples);
+}
+
+LIBYUV_API
+double I420Psnr(const uint8* src_y_a, int stride_y_a,
+ const uint8* src_u_a, int stride_u_a,
+ const uint8* src_v_a, int stride_v_a,
+ const uint8* src_y_b, int stride_y_b,
+ const uint8* src_u_b, int stride_u_b,
+ const uint8* src_v_b, int stride_v_b,
+ int width, int height) {
+ const uint64 sse_y = ComputeSumSquareErrorPlane(src_y_a, stride_y_a,
+ src_y_b, stride_y_b,
+ width, height);
+ const int width_uv = (width + 1) >> 1;
+ const int height_uv = (height + 1) >> 1;
+ const uint64 sse_u = ComputeSumSquareErrorPlane(src_u_a, stride_u_a,
+ src_u_b, stride_u_b,
+ width_uv, height_uv);
+ const uint64 sse_v = ComputeSumSquareErrorPlane(src_v_a, stride_v_a,
+ src_v_b, stride_v_b,
+ width_uv, height_uv);
+ const uint64 samples = width * height + 2 * (width_uv * height_uv);
+ const uint64 sse = sse_y + sse_u + sse_v;
+ return SumSquareErrorToPsnr(sse, samples);
+}
+
+static const int64 cc1 = 26634; // (64^2*(.01*255)^2)
+static const int64 cc2 = 239708; // (64^2*(.03*255)^2)
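+// Worked out: cc1 = 4096 * (0.01 * 255)^2 = 4096 * 6.5025 = 26634.24 -> 26634
+// cc2 = 4096 * (0.03 * 255)^2 = 4096 * 58.5225 = 239708.16 -> 239708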
+
+static double Ssim8x8_C(const uint8* src_a, int stride_a,
+ const uint8* src_b, int stride_b) {
+ int64 sum_a = 0;
+ int64 sum_b = 0;
+ int64 sum_sq_a = 0;
+ int64 sum_sq_b = 0;
+ int64 sum_axb = 0;
+
+ for (int i = 0; i < 8; ++i) {
+ for (int j = 0; j < 8; ++j) {
+ sum_a += src_a[j];
+ sum_b += src_b[j];
+ sum_sq_a += src_a[j] * src_a[j];
+ sum_sq_b += src_b[j] * src_b[j];
+ sum_axb += src_a[j] * src_b[j];
+ }
+
+ src_a += stride_a;
+ src_b += stride_b;
+ }
+
+ const int64 count = 64;
+ // Scale the constants by the number of pixels.
+ const int64 c1 = (cc1 * count * count) >> 12;
+ const int64 c2 = (cc2 * count * count) >> 12;
+
+ const int64 sum_a_x_sum_b = sum_a * sum_b;
+
+ const int64 ssim_n = (2 * sum_a_x_sum_b + c1) *
+ (2 * count * sum_axb - 2 * sum_a_x_sum_b + c2);
+
+ const int64 sum_a_sq = sum_a*sum_a;
+ const int64 sum_b_sq = sum_b*sum_b;
+
+ const int64 ssim_d = (sum_a_sq + sum_b_sq + c1) *
+ (count * sum_sq_a - sum_a_sq +
+ count * sum_sq_b - sum_b_sq + c2);
+
+ if (ssim_d == 0.0)
+ return DBL_MAX;
+ return ssim_n * 1.0 / ssim_d;
+}
+
+// We use an 8x8 moving window whose starting locations lie on a 4x4 pixel
+// grid. This arrangement lets the windows overlap block boundaries so that
+// blocking artifacts are penalized.
+LIBYUV_API
+double CalcFrameSsim(const uint8* src_a, int stride_a,
+ const uint8* src_b, int stride_b,
+ int width, int height) {
+ int samples = 0;
+ double ssim_total = 0;
+
+ double (*Ssim8x8)(const uint8* src_a, int stride_a,
+ const uint8* src_b, int stride_b);
+
+ Ssim8x8 = Ssim8x8_C;
+
+ // Sample points start at each 4x4 grid location.
+ for (int i = 0; i < height - 8; i += 4) {
+ for (int j = 0; j < width - 8; j += 4) {
+ ssim_total += Ssim8x8(src_a + j, stride_a, src_b + j, stride_b);
+ samples++;
+ }
+
+ src_a += stride_a * 4;
+ src_b += stride_b * 4;
+ }
+
+ ssim_total /= samples;
+ return ssim_total;
+}
+
+LIBYUV_API
+double I420Ssim(const uint8* src_y_a, int stride_y_a,
+ const uint8* src_u_a, int stride_u_a,
+ const uint8* src_v_a, int stride_v_a,
+ const uint8* src_y_b, int stride_y_b,
+ const uint8* src_u_b, int stride_u_b,
+ const uint8* src_v_b, int stride_v_b,
+ int width, int height) {
+ const double ssim_y = CalcFrameSsim(src_y_a, stride_y_a,
+ src_y_b, stride_y_b, width, height);
+ const int width_uv = (width + 1) >> 1;
+ const int height_uv = (height + 1) >> 1;
+ const double ssim_u = CalcFrameSsim(src_u_a, stride_u_a,
+ src_u_b, stride_u_b,
+ width_uv, height_uv);
+ const double ssim_v = CalcFrameSsim(src_v_a, stride_v_a,
+ src_v_b, stride_v_b,
+ width_uv, height_uv);
+ return ssim_y * 0.8 + 0.1 * (ssim_u + ssim_v);
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/chromium/third_party/libyuv/source/compare_common.cc b/chromium/third_party/libyuv/source/compare_common.cc
new file mode 100644
index 00000000000..ab587d08171
--- /dev/null
+++ b/chromium/third_party/libyuv/source/compare_common.cc
@@ -0,0 +1,40 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count) {
+ uint32 sse = 0u;
+ for (int i = 0; i < count; ++i) {
+ int diff = src_a[i] - src_b[i];
+ sse += static_cast<uint32>(diff * diff);
+ }
+ return sse;
+}
+
+// A hash seed of 5381 is recommended.
+// Internal C version of HashDjb2 with an int-sized count for efficiency.
+uint32 HashDjb2_C(const uint8* src, int count, uint32 seed) {
+ uint32 hash = seed;
+ for (int i = 0; i < count; ++i) {
+ hash += (hash << 5) + src[i];
+ }
+ return hash;
+}
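+
+// Illustrative property of the hash above (a sketch; buffer names are
+// assumed): because the running hash is returned and also accepted as the
+// seed, a buffer can be hashed incrementally in chunks:
+//   uint32 h = HashDjb2_C(data, 64, 5381);    // first chunk
+//   h = HashDjb2_C(data + 64, size - 64, h);  // remaining bytes
+// which gives the same result as hashing the whole buffer in one call.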
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/chromium/third_party/libyuv/source/compare_neon.cc b/chromium/third_party/libyuv/source/compare_neon.cc
new file mode 100644
index 00000000000..a4e77750631
--- /dev/null
+++ b/chromium/third_party/libyuv/source/compare_neon.cc
@@ -0,0 +1,61 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)
+
+uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
+ volatile uint32 sse;
+ asm volatile (
+ "vmov.u8 q8, #0 \n"
+ "vmov.u8 q10, #0 \n"
+ "vmov.u8 q9, #0 \n"
+ "vmov.u8 q11, #0 \n"
+
+ ".p2align 2 \n"
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n"
+ "vld1.8 {q1}, [%1]! \n"
+ "subs %2, %2, #16 \n"
+ "vsubl.u8 q2, d0, d2 \n"
+ "vsubl.u8 q3, d1, d3 \n"
+ "vmlal.s16 q8, d4, d4 \n"
+ "vmlal.s16 q9, d6, d6 \n"
+ "vmlal.s16 q10, d5, d5 \n"
+ "vmlal.s16 q11, d7, d7 \n"
+ "bgt 1b \n"
+
+ "vadd.u32 q8, q8, q9 \n"
+ "vadd.u32 q10, q10, q11 \n"
+ "vadd.u32 q11, q8, q10 \n"
+ "vpaddl.u32 q1, q11 \n"
+ "vadd.u64 d0, d2, d3 \n"
+ "vmov.32 %3, d0[0] \n"
+ : "+r"(src_a),
+ "+r"(src_b),
+ "+r"(count),
+ "=r"(sse)
+ :
+ : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
+ return sse;
+}
+
+#endif // __ARM_NEON__
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/chromium/third_party/libyuv/source/compare_posix.cc b/chromium/third_party/libyuv/source/compare_posix.cc
new file mode 100644
index 00000000000..f24835d7714
--- /dev/null
+++ b/chromium/third_party/libyuv/source/compare_posix.cc
@@ -0,0 +1,164 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/basic_types.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
+
+uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
+ uint32 sse;
+ asm volatile (
+ "pxor %%xmm0,%%xmm0 \n"
+ "pxor %%xmm5,%%xmm5 \n"
+ "sub %0,%1 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm1 \n"
+ "movdqa (%0,%1,1),%%xmm2 \n"
+ "lea 0x10(%0),%0 \n"
+ "sub $0x10,%2 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "psubusb %%xmm2,%%xmm1 \n"
+ "psubusb %%xmm3,%%xmm2 \n"
+ "por %%xmm2,%%xmm1 \n"
+ "movdqa %%xmm1,%%xmm2 \n"
+ "punpcklbw %%xmm5,%%xmm1 \n"
+ "punpckhbw %%xmm5,%%xmm2 \n"
+ "pmaddwd %%xmm1,%%xmm1 \n"
+ "pmaddwd %%xmm2,%%xmm2 \n"
+ "paddd %%xmm1,%%xmm0 \n"
+ "paddd %%xmm2,%%xmm0 \n"
+ "jg 1b \n"
+
+ "pshufd $0xee,%%xmm0,%%xmm1 \n"
+ "paddd %%xmm1,%%xmm0 \n"
+ "pshufd $0x1,%%xmm0,%%xmm1 \n"
+ "paddd %%xmm1,%%xmm0 \n"
+ "movd %%xmm0,%3 \n"
+
+ : "+r"(src_a), // %0
+ "+r"(src_b), // %1
+ "+r"(count), // %2
+ "=g"(sse) // %3
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+#endif
+ );
+ return sse;
+}
+
+#endif // defined(__x86_64__) || defined(__i386__)
+
+#if !defined(LIBYUV_DISABLE_X86) && \
+ (defined(__x86_64__) || (defined(__i386__) && !defined(__pic__)))
+// GCC 4.2 on OSX has a link error when static or const data is passed to
+// inline assembly.
+// TODO(fbarchard): Use static const when gcc 4.2 support is dropped.
+#ifdef __APPLE__
+#define CONST
+#else
+#define CONST static const
+#endif
+#define HAS_HASHDJB2_SSE41
+CONST uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 }; // 33 ^ 16
+CONST uvec32 kHashMul0 = {
+ 0x0c3525e1, // 33 ^ 15
+ 0xa3476dc1, // 33 ^ 14
+ 0x3b4039a1, // 33 ^ 13
+ 0x4f5f0981, // 33 ^ 12
+};
+CONST uvec32 kHashMul1 = {
+ 0x30f35d61, // 33 ^ 11
+ 0x855cb541, // 33 ^ 10
+ 0x040a9121, // 33 ^ 9
+ 0x747c7101, // 33 ^ 8
+};
+CONST uvec32 kHashMul2 = {
+ 0xec41d4e1, // 33 ^ 7
+ 0x4cfa3cc1, // 33 ^ 6
+ 0x025528a1, // 33 ^ 5
+ 0x00121881, // 33 ^ 4
+};
+CONST uvec32 kHashMul3 = {
+ 0x00008c61, // 33 ^ 3
+ 0x00000441, // 33 ^ 2
+ 0x00000021, // 33 ^ 1
+ 0x00000001, // 33 ^ 0
+};
+
+uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
+ uint32 hash;
+ asm volatile (
+ "movd %2,%%xmm0 \n"
+ "pxor %%xmm7,%%xmm7 \n"
+ "movdqa %4,%%xmm6 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqu (%0),%%xmm1 \n"
+ "lea 0x10(%0),%0 \n"
+ "pmulld %%xmm6,%%xmm0 \n"
+ "movdqa %5,%%xmm5 \n"
+ "movdqa %%xmm1,%%xmm2 \n"
+ "punpcklbw %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "punpcklwd %%xmm7,%%xmm3 \n"
+ "pmulld %%xmm5,%%xmm3 \n"
+ "movdqa %6,%%xmm5 \n"
+ "movdqa %%xmm2,%%xmm4 \n"
+ "punpckhwd %%xmm7,%%xmm4 \n"
+ "pmulld %%xmm5,%%xmm4 \n"
+ "movdqa %7,%%xmm5 \n"
+ "punpckhbw %%xmm7,%%xmm1 \n"
+ "movdqa %%xmm1,%%xmm2 \n"
+ "punpcklwd %%xmm7,%%xmm2 \n"
+ "pmulld %%xmm5,%%xmm2 \n"
+ "movdqa %8,%%xmm5 \n"
+ "punpckhwd %%xmm7,%%xmm1 \n"
+ "pmulld %%xmm5,%%xmm1 \n"
+ "paddd %%xmm4,%%xmm3 \n"
+ "paddd %%xmm2,%%xmm1 \n"
+ "sub $0x10,%1 \n"
+ "paddd %%xmm3,%%xmm1 \n"
+ "pshufd $0xe,%%xmm1,%%xmm2 \n"
+ "paddd %%xmm2,%%xmm1 \n"
+ "pshufd $0x1,%%xmm1,%%xmm2 \n"
+ "paddd %%xmm2,%%xmm1 \n"
+ "paddd %%xmm1,%%xmm0 \n"
+ "jg 1b \n"
+ "movd %%xmm0,%3 \n"
+ : "+r"(src), // %0
+ "+r"(count), // %1
+ "+rm"(seed), // %2
+ "=g"(hash) // %3
+ : "m"(kHash16x33), // %4
+ "m"(kHashMul0), // %5
+ "m"(kHashMul1), // %6
+ "m"(kHashMul2), // %7
+ "m"(kHashMul3) // %8
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+#endif
+ );
+ return hash;
+}
+#endif // defined(__x86_64__) || (defined(__i386__) && !defined(__pic__))
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/chromium/third_party/libyuv/source/compare_win.cc b/chromium/third_party/libyuv/source/compare_win.cc
new file mode 100644
index 00000000000..e576e85c192
--- /dev/null
+++ b/chromium/third_party/libyuv/source/compare_win.cc
@@ -0,0 +1,192 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/basic_types.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
+
+__declspec(naked) __declspec(align(16))
+uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
+ __asm {
+ mov eax, [esp + 4] // src_a
+ mov edx, [esp + 8] // src_b
+ mov ecx, [esp + 12] // count
+ pxor xmm0, xmm0
+ pxor xmm5, xmm5
+ sub edx, eax
+
+ align 16
+ wloop:
+ movdqa xmm1, [eax]
+ movdqa xmm2, [eax + edx]
+ lea eax, [eax + 16]
+ sub ecx, 16
+ movdqa xmm3, xmm1 // abs trick
+ psubusb xmm1, xmm2
+ psubusb xmm2, xmm3
+ por xmm1, xmm2
+ movdqa xmm2, xmm1
+ punpcklbw xmm1, xmm5
+ punpckhbw xmm2, xmm5
+ pmaddwd xmm1, xmm1
+ pmaddwd xmm2, xmm2
+ paddd xmm0, xmm1
+ paddd xmm0, xmm2
+ jg wloop
+
+ pshufd xmm1, xmm0, 0xee
+ paddd xmm0, xmm1
+ pshufd xmm1, xmm0, 0x01
+ paddd xmm0, xmm1
+ movd eax, xmm0
+ ret
+ }
+}
+
+// Visual C 2012 required for AVX2.
+#if _MSC_VER >= 1700
+// C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX.
+#pragma warning(disable: 4752)
+__declspec(naked) __declspec(align(16))
+uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) {
+ __asm {
+ mov eax, [esp + 4] // src_a
+ mov edx, [esp + 8] // src_b
+ mov ecx, [esp + 12] // count
+ vpxor ymm0, ymm0, ymm0 // sum
+ vpxor ymm5, ymm5, ymm5 // constant 0 for unpck
+ sub edx, eax
+
+ align 16
+ wloop:
+ vmovdqu ymm1, [eax]
+ vmovdqu ymm2, [eax + edx]
+ lea eax, [eax + 32]
+ sub ecx, 32
+ vpsubusb ymm3, ymm1, ymm2 // abs difference trick
+ vpsubusb ymm2, ymm2, ymm1
+ vpor ymm1, ymm2, ymm3
+ vpunpcklbw ymm2, ymm1, ymm5 // u16. mutates order.
+ vpunpckhbw ymm1, ymm1, ymm5
+ vpmaddwd ymm2, ymm2, ymm2 // square + hadd to u32.
+ vpmaddwd ymm1, ymm1, ymm1
+ vpaddd ymm0, ymm0, ymm1
+ vpaddd ymm0, ymm0, ymm2
+ jg wloop
+
+ vpshufd ymm1, ymm0, 0xee // 3, 2 + 1, 0 both lanes.
+ vpaddd ymm0, ymm0, ymm1
+ vpshufd ymm1, ymm0, 0x01 // 1 + 0 both lanes.
+ vpaddd ymm0, ymm0, ymm1
+ vpermq ymm1, ymm0, 0x02 // high + low lane.
+ vpaddd ymm0, ymm0, ymm1
+ vmovd eax, xmm0
+ vzeroupper
+ ret
+ }
+}
+#endif // _MSC_VER >= 1700
+
+#define HAS_HASHDJB2_SSE41
+static const uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 }; // 33 ^ 16
+static const uvec32 kHashMul0 = {
+ 0x0c3525e1, // 33 ^ 15
+ 0xa3476dc1, // 33 ^ 14
+ 0x3b4039a1, // 33 ^ 13
+ 0x4f5f0981, // 33 ^ 12
+};
+static const uvec32 kHashMul1 = {
+ 0x30f35d61, // 33 ^ 11
+ 0x855cb541, // 33 ^ 10
+ 0x040a9121, // 33 ^ 9
+ 0x747c7101, // 33 ^ 8
+};
+static const uvec32 kHashMul2 = {
+ 0xec41d4e1, // 33 ^ 7
+ 0x4cfa3cc1, // 33 ^ 6
+ 0x025528a1, // 33 ^ 5
+ 0x00121881, // 33 ^ 4
+};
+static const uvec32 kHashMul3 = {
+ 0x00008c61, // 33 ^ 3
+ 0x00000441, // 33 ^ 2
+ 0x00000021, // 33 ^ 1
+ 0x00000001, // 33 ^ 0
+};
+
+// 27: 66 0F 38 40 C6 pmulld xmm0,xmm6
+// 44: 66 0F 38 40 DD pmulld xmm3,xmm5
+// 59: 66 0F 38 40 E5 pmulld xmm4,xmm5
+// 72: 66 0F 38 40 D5 pmulld xmm2,xmm5
+// 83: 66 0F 38 40 CD pmulld xmm1,xmm5
+#define pmulld(reg) _asm _emit 0x66 _asm _emit 0x0F _asm _emit 0x38 \
+ _asm _emit 0x40 _asm _emit reg
+
+__declspec(naked) __declspec(align(16))
+uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
+ __asm {
+ mov eax, [esp + 4] // src
+ mov ecx, [esp + 8] // count
+ movd xmm0, [esp + 12] // seed
+
+ pxor xmm7, xmm7 // constant 0 for unpck
+ movdqa xmm6, kHash16x33
+
+ align 16
+ wloop:
+ movdqu xmm1, [eax] // src[0-15]
+ lea eax, [eax + 16]
+ pmulld(0xc6) // pmulld xmm0,xmm6 hash *= 33 ^ 16
+ movdqa xmm5, kHashMul0
+ movdqa xmm2, xmm1
+ punpcklbw xmm2, xmm7 // src[0-7]
+ movdqa xmm3, xmm2
+ punpcklwd xmm3, xmm7 // src[0-3]
+ pmulld(0xdd) // pmulld xmm3, xmm5
+ movdqa xmm5, kHashMul1
+ movdqa xmm4, xmm2
+ punpckhwd xmm4, xmm7 // src[4-7]
+ pmulld(0xe5) // pmulld xmm4, xmm5
+ movdqa xmm5, kHashMul2
+ punpckhbw xmm1, xmm7 // src[8-15]
+ movdqa xmm2, xmm1
+ punpcklwd xmm2, xmm7 // src[8-11]
+ pmulld(0xd5) // pmulld xmm2, xmm5
+ movdqa xmm5, kHashMul3
+ punpckhwd xmm1, xmm7 // src[12-15]
+ pmulld(0xcd) // pmulld xmm1, xmm5
+ paddd xmm3, xmm4 // add 16 results
+ paddd xmm1, xmm2
+ sub ecx, 16
+ paddd xmm1, xmm3
+
+ pshufd xmm2, xmm1, 0x0e // upper 2 dwords
+ paddd xmm1, xmm2
+ pshufd xmm2, xmm1, 0x01
+ paddd xmm1, xmm2
+ paddd xmm0, xmm1
+ jg wloop
+
+ movd eax, xmm0 // return hash
+ ret
+ }
+}
+#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/chromium/third_party/libyuv/source/convert.cc b/chromium/third_party/libyuv/source/convert.cc
new file mode 100644
index 00000000000..980df7edd5e
--- /dev/null
+++ b/chromium/third_party/libyuv/source/convert.cc
@@ -0,0 +1,1568 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/convert.h"
+
+#include "libyuv/basic_types.h"
+#include "libyuv/cpu_id.h"
+#include "libyuv/planar_functions.h"
+#include "libyuv/rotate.h"
+#include "libyuv/scale.h" // For ScalePlane()
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Copy I420 with optional flipping
+LIBYUV_API
+int I420Copy(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ if (!src_y || !src_u || !src_v ||
+ !dst_y || !dst_u || !dst_v ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ int halfheight = (height + 1) >> 1;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_u = src_u + (halfheight - 1) * src_stride_u;
+ src_v = src_v + (halfheight - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+ if (dst_y) {
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ }
+ CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight);
+ CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight);
+ return 0;
+}
+
+LIBYUV_API
+int I422ToI420(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ if (!src_y || !src_u || !src_v ||
+ !dst_y || !dst_u || !dst_v ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_u = src_u + (height - 1) * src_stride_u;
+ src_v = src_v + (height - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+ int halfwidth = (width + 1) >> 1;
+ void (*HalfRow)(const uint8* src_uv, int src_uv_stride,
+ uint8* dst_uv, int pix) = HalfRow_C;
+#if defined(HAS_HALFROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(halfwidth, 16) &&
+ IS_ALIGNED(src_u, 16) && IS_ALIGNED(src_stride_u, 16) &&
+ IS_ALIGNED(src_v, 16) && IS_ALIGNED(src_stride_v, 16) &&
+ IS_ALIGNED(dst_u, 16) && IS_ALIGNED(dst_stride_u, 16) &&
+ IS_ALIGNED(dst_v, 16) && IS_ALIGNED(dst_stride_v, 16)) {
+ HalfRow = HalfRow_SSE2;
+ }
+#endif
+#if defined(HAS_HALFROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(halfwidth, 32)) {
+ HalfRow = HalfRow_AVX2;
+ }
+#endif
+#if defined(HAS_HALFROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(halfwidth, 16)) {
+ HalfRow = HalfRow_NEON;
+ }
+#endif
+
+ // Copy Y plane
+ if (dst_y) {
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ }
+
+ // SubSample U plane.
+ int y;
+ for (y = 0; y < height - 1; y += 2) {
+ HalfRow(src_u, src_stride_u, dst_u, halfwidth);
+ src_u += src_stride_u * 2;
+ dst_u += dst_stride_u;
+ }
+ if (height & 1) {
+ HalfRow(src_u, 0, dst_u, halfwidth);
+ }
+
+ // SubSample V plane.
+ for (y = 0; y < height - 1; y += 2) {
+ HalfRow(src_v, src_stride_v, dst_v, halfwidth);
+ src_v += src_stride_v * 2;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+ HalfRow(src_v, 0, dst_v, halfwidth);
+ }
+ return 0;
+}
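+
+// Note on I422ToI420: I422 chroma is already half width but full height, so
+// only vertical subsampling is required. Each pass of the loops above feeds
+// two adjacent chroma rows (src and src + stride) into HalfRow and emits one
+// output row; for an odd height the final row is paired with itself
+// (stride 0). Plane-size sketch for a 640x480 frame:
+//   I422 U/V planes: 320x480  ->  I420 U/V planes: 320x240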
+
+// Blends 32x2 pixels to 16x1.
+// Source is in scale.cc.
+#if !defined(LIBYUV_DISABLE_NEON) && \
+ (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
+#define HAS_SCALEROWDOWN2_NEON
+void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width);
+#elif !defined(LIBYUV_DISABLE_X86) && \
+ (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
+
+void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+#endif
+void ScaleRowDown2Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+
+LIBYUV_API
+int I444ToI420(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ if (!src_y || !src_u || !src_v ||
+ !dst_y || !dst_u || !dst_v ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_u = src_u + (height - 1) * src_stride_u;
+ src_v = src_v + (height - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+ int halfwidth = (width + 1) >> 1;
+ void (*ScaleRowDown2)(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) = ScaleRowDown2Box_C;
+#if defined(HAS_SCALEROWDOWN2_NEON)
+ if (TestCpuFlag(kCpuHasNEON) &&
+ IS_ALIGNED(halfwidth, 16)) {
+ ScaleRowDown2 = ScaleRowDown2Box_NEON;
+ }
+#elif defined(HAS_SCALEROWDOWN2_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) &&
+ IS_ALIGNED(halfwidth, 16) &&
+ IS_ALIGNED(src_u, 16) && IS_ALIGNED(src_stride_u, 16) &&
+ IS_ALIGNED(src_v, 16) && IS_ALIGNED(src_stride_v, 16) &&
+ IS_ALIGNED(dst_u, 16) && IS_ALIGNED(dst_stride_u, 16) &&
+ IS_ALIGNED(dst_v, 16) && IS_ALIGNED(dst_stride_v, 16)) {
+ ScaleRowDown2 = ScaleRowDown2Box_SSE2;
+ }
+#endif
+
+ // Copy Y plane
+ if (dst_y) {
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ }
+
+ // SubSample U plane.
+ int y;
+ for (y = 0; y < height - 1; y += 2) {
+ ScaleRowDown2(src_u, src_stride_u, dst_u, halfwidth);
+ src_u += src_stride_u * 2;
+ dst_u += dst_stride_u;
+ }
+ if (height & 1) {
+ ScaleRowDown2(src_u, 0, dst_u, halfwidth);
+ }
+
+ // SubSample V plane.
+ for (y = 0; y < height - 1; y += 2) {
+ ScaleRowDown2(src_v, src_stride_v, dst_v, halfwidth);
+ src_v += src_stride_v * 2;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+ ScaleRowDown2(src_v, 0, dst_v, halfwidth);
+ }
+ return 0;
+}
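+
+// Note on I444ToI420: I444 chroma is full resolution, so both dimensions are
+// halved. ScaleRowDown2Box consumes two input rows per output row (the loops
+// above advance src by 2 * stride), and the "Box" variants blend a 2x2 block
+// of source pixels into each output pixel, which is what the
+// "32x2 pixels to 16x1" comment above describes.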
+
+// TODO(fbarchard): Enable bilinear filtering when it is fast enough, or add a
+// specialized upsampler.
+// 411 chroma is 1/4 width, 1x height.
+// 420 chroma is 1/2 width, 1/2 height.
+LIBYUV_API
+int I411ToI420(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ if (!src_y || !src_u || !src_v ||
+ !dst_y || !dst_u || !dst_v ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_u = src_u + (height - 1) * src_stride_u;
+ src_v = src_v + (height - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+
+ // Copy Y plane
+ if (dst_y) {
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ }
+
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+ int quarterwidth = (width + 3) >> 2;
+
+ // Resample U plane from 1/4 width, 1x height to 1/2 width, 1/2 height.
+ ScalePlane(src_u, src_stride_u, quarterwidth, height,
+ dst_u, dst_stride_u, halfwidth, halfheight,
+ kFilterNone);
+
+ // Resample V plane.
+ ScalePlane(src_v, src_stride_v, quarterwidth, height,
+ dst_v, dst_stride_v, halfwidth, halfheight,
+ kFilterNone);
+ return 0;
+}
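+
+// Note on I411ToI420: the chroma planes change shape in both dimensions
+// (1/4 width, full height -> 1/2 width, 1/2 height), so ScalePlane is used
+// rather than a row helper. Worked example for a 640x480 frame:
+//   I411 U/V planes: 160x480  ->  I420 U/V planes: 320x240
+// i.e. chroma is upsampled 2x horizontally and downsampled 2x vertically,
+// currently with kFilterNone (point sampling), per the TODO above.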
+
+// I400 is greyscale, typically used in MJPG.
+LIBYUV_API
+int I400ToI420(const uint8* src_y, int src_stride_y,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ if (!src_y || !dst_y || !dst_u || !dst_v ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_stride_y = -src_stride_y;
+ }
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ SetPlane(dst_u, dst_stride_u, halfwidth, halfheight, 128);
+ SetPlane(dst_v, dst_stride_v, halfwidth, halfheight, 128);
+ return 0;
+}
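+
+// Note on I400ToI420: 128 is the neutral chroma value, so filling the U and
+// V planes with SetPlane(..., 128) yields a grey (colorless) image whose
+// luma matches the I400 source. Minimal call sketch (buffer names assumed):
+//   int half = (width + 1) >> 1;
+//   I400ToI420(gray, width, y, width, u, half, v, half, width, height);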
+
+static void CopyPlane2(const uint8* src, int src_stride_0, int src_stride_1,
+ uint8* dst, int dst_stride,
+ int width, int height) {
+ void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
+#if defined(HAS_COPYROW_X86)
+ if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) {
+ CopyRow = CopyRow_X86;
+ }
+#endif
+#if defined(HAS_COPYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) &&
+ IS_ALIGNED(src, 16) &&
+ IS_ALIGNED(src_stride_0, 16) && IS_ALIGNED(src_stride_1, 16) &&
+ IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
+ CopyRow = CopyRow_SSE2;
+ }
+#endif
+#if defined(HAS_COPYROW_ERMS)
+ // TODO(fbarchard): Detect Fast String support.
+ if (TestCpuFlag(kCpuHasERMS)) {
+ CopyRow = CopyRow_ERMS;
+ }
+#endif
+#if defined(HAS_COPYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) {
+ CopyRow = CopyRow_NEON;
+ }
+#endif
+#if defined(HAS_COPYROW_MIPS)
+ if (TestCpuFlag(kCpuHasMIPS)) {
+ CopyRow = CopyRow_MIPS;
+ }
+#endif
+
+ // Copy plane
+ for (int y = 0; y < height - 1; y += 2) {
+ CopyRow(src, dst, width);
+ CopyRow(src + src_stride_0, dst + dst_stride, width);
+ src += src_stride_0 + src_stride_1;
+ dst += dst_stride * 2;
+ }
+ if (height & 1) {
+ CopyRow(src, dst, width);
+ }
+}
+
+// Support converting from FOURCC_M420.
+// Useful for bandwidth-constrained transports like USB 1.0 and 2.0, and for
+// easy conversion to I420.
+// M420 format description:
+// M420 is row-biplanar 420: 2 rows of Y followed by 1 row of interleaved UV.
+// Chroma is half width / half height (420).
+// src_stride_m420 is the row stride. Normally this is the width in pixels.
+// The UV row is half width but holds 2 values per pixel, so src_stride_m420
+// applies to it as well as to the two Y rows.
+static int X420ToI420(const uint8* src_y,
+ int src_stride_y0, int src_stride_y1,
+ const uint8* src_uv, int src_stride_uv,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ if (!src_y || !src_uv ||
+ !dst_y || !dst_u || !dst_v ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ int halfheight = (height + 1) >> 1;
+ dst_y = dst_y + (height - 1) * dst_stride_y;
+ dst_u = dst_u + (halfheight - 1) * dst_stride_u;
+ dst_v = dst_v + (halfheight - 1) * dst_stride_v;
+ dst_stride_y = -dst_stride_y;
+ dst_stride_u = -dst_stride_u;
+ dst_stride_v = -dst_stride_v;
+ }
+ // Coalesce contiguous rows: when the strides equal the row width, each
+ // plane can be processed as a single long row.
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+ if (src_stride_y0 == width &&
+ src_stride_y1 == width &&
+ dst_stride_y == width) {
+ width = width * height;
+ height = 1;
+ }
+ if (src_stride_uv == width &&
+ dst_stride_u * 2 == width &&
+ dst_stride_v * 2 == width) {
+ halfwidth = halfwidth * halfheight;
+ halfheight = 1;
+ }
+ void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) =
+ SplitUVRow_C;
+#if defined(HAS_SPLITUVROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && halfwidth >= 16) {
+ SplitUVRow = SplitUVRow_Any_SSE2;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ SplitUVRow = SplitUVRow_Unaligned_SSE2;
+ if (IS_ALIGNED(src_uv, 16) && IS_ALIGNED(src_stride_uv, 16) &&
+ IS_ALIGNED(dst_u, 16) && IS_ALIGNED(dst_stride_u, 16) &&
+ IS_ALIGNED(dst_v, 16) && IS_ALIGNED(dst_stride_v, 16)) {
+ SplitUVRow = SplitUVRow_SSE2;
+ }
+ }
+ }
+#endif
+#if defined(HAS_SPLITUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2) && halfwidth >= 32) {
+ SplitUVRow = SplitUVRow_Any_AVX2;
+ if (IS_ALIGNED(halfwidth, 32)) {
+ SplitUVRow = SplitUVRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_SPLITUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && halfwidth >= 16) {
+ SplitUVRow = SplitUVRow_Any_NEON;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ SplitUVRow = SplitUVRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_SPLITUVROW_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) && halfwidth >= 16) {
+ SplitUVRow = SplitUVRow_Any_MIPS_DSPR2;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ SplitUVRow = SplitUVRow_Unaligned_MIPS_DSPR2;
+ if (IS_ALIGNED(src_uv, 4) && IS_ALIGNED(src_stride_uv, 4) &&
+ IS_ALIGNED(dst_u, 4) && IS_ALIGNED(dst_stride_u, 4) &&
+ IS_ALIGNED(dst_v, 4) && IS_ALIGNED(dst_stride_v, 4)) {
+ SplitUVRow = SplitUVRow_MIPS_DSPR2;
+ }
+ }
+ }
+#endif
+
+ if (dst_y) {
+ if (src_stride_y0 == src_stride_y1) {
+ CopyPlane(src_y, src_stride_y0, dst_y, dst_stride_y, width, height);
+ } else {
+ CopyPlane2(src_y, src_stride_y0, src_stride_y1, dst_y, dst_stride_y,
+ width, height);
+ }
+ }
+
+ for (int y = 0; y < halfheight; ++y) {
+ // Split a row of interleaved UV into separate U and V rows.
+ SplitUVRow(src_uv, dst_u, dst_v, halfwidth);
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ src_uv += src_stride_uv;
+ }
+ return 0;
+}
+
+// Convert NV12 to I420.
+LIBYUV_API
+int NV12ToI420(const uint8* src_y, int src_stride_y,
+ const uint8* src_uv, int src_stride_uv,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ return X420ToI420(src_y, src_stride_y, src_stride_y,
+ src_uv, src_stride_uv,
+ dst_y, dst_stride_y,
+ dst_u, dst_stride_u,
+ dst_v, dst_stride_v,
+ width, height);
+}
+
+// Convert NV21 to I420. Same as NV12 but with the U and V pointers swapped.
+LIBYUV_API
+int NV21ToI420(const uint8* src_y, int src_stride_y,
+ const uint8* src_vu, int src_stride_vu,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ return X420ToI420(src_y, src_stride_y, src_stride_y,
+ src_vu, src_stride_vu,
+ dst_y, dst_stride_y,
+ dst_v, dst_stride_v,
+ dst_u, dst_stride_u,
+ width, height);
+}
+
+// Convert M420 to I420.
+LIBYUV_API
+int M420ToI420(const uint8* src_m420, int src_stride_m420,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ return X420ToI420(src_m420, src_stride_m420, src_stride_m420 * 2,
+ src_m420 + src_stride_m420 * 2, src_stride_m420 * 3,
+ dst_y, dst_stride_y,
+ dst_u, dst_stride_u,
+ dst_v, dst_stride_v,
+ width, height);
+}
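+
+// Layout sketch for the M420ToI420 mapping above (illustrative): with
+// stride s = src_stride_m420, each group of three rows is
+//   row 0: Y  (width pixels)
+//   row 1: Y  (width pixels)
+//   row 2: UV interleaved (width bytes = width/2 U/V pairs)
+// so the Y rows are read with alternating strides s and 2*s (skipping the
+// UV row), and the UV plane starts at src + 2*s with stride 3*s, matching
+// the arguments passed to X420ToI420.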
+
+// Convert Q420 to I420.
+// The format alternates a row of Y-only samples with a row of YUY2.
+LIBYUV_API
+int Q420ToI420(const uint8* src_y, int src_stride_y,
+ const uint8* src_yuy2, int src_stride_yuy2,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ if (!src_y || !src_yuy2 ||
+ !dst_y || !dst_u || !dst_v ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ int halfheight = (height + 1) >> 1;
+ dst_y = dst_y + (height - 1) * dst_stride_y;
+ dst_u = dst_u + (halfheight - 1) * dst_stride_u;
+ dst_v = dst_v + (halfheight - 1) * dst_stride_v;
+ dst_stride_y = -dst_stride_y;
+ dst_stride_u = -dst_stride_u;
+ dst_stride_v = -dst_stride_v;
+ }
+ // CopyRow copies the Y-only rows of Q420 into the Y plane of I420.
+ void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
+#if defined(HAS_COPYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) {
+ CopyRow = CopyRow_NEON;
+ }
+#endif
+#if defined(HAS_COPYROW_X86)
+ if (IS_ALIGNED(width, 4)) {
+ CopyRow = CopyRow_X86;
+ }
+#endif
+#if defined(HAS_COPYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) &&
+ IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
+ IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+ CopyRow = CopyRow_SSE2;
+ }
+#endif
+#if defined(HAS_COPYROW_ERMS)
+ if (TestCpuFlag(kCpuHasERMS)) {
+ CopyRow = CopyRow_ERMS;
+ }
+#endif
+#if defined(HAS_COPYROW_MIPS)
+ if (TestCpuFlag(kCpuHasMIPS)) {
+ CopyRow = CopyRow_MIPS;
+ }
+#endif
+
+ void (*YUY2ToUV422Row)(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
+ int pix) = YUY2ToUV422Row_C;
+ void (*YUY2ToYRow)(const uint8* src_yuy2, uint8* dst_y, int pix) =
+ YUY2ToYRow_C;
+#if defined(HAS_YUY2TOYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
+ YUY2ToUV422Row = YUY2ToUV422Row_Any_SSE2;
+ YUY2ToYRow = YUY2ToYRow_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ YUY2ToUV422Row = YUY2ToUV422Row_Unaligned_SSE2;
+ YUY2ToYRow = YUY2ToYRow_Unaligned_SSE2;
+ if (IS_ALIGNED(src_yuy2, 16) && IS_ALIGNED(src_stride_yuy2, 16)) {
+ YUY2ToUV422Row = YUY2ToUV422Row_SSE2;
+ if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+ YUY2ToYRow = YUY2ToYRow_SSE2;
+ }
+ }
+ }
+ }
+#endif
+#if defined(HAS_YUY2TOYROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
+ YUY2ToUV422Row = YUY2ToUV422Row_Any_AVX2;
+ YUY2ToYRow = YUY2ToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ YUY2ToUV422Row = YUY2ToUV422Row_AVX2;
+ YUY2ToYRow = YUY2ToYRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_YUY2TOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ YUY2ToYRow = YUY2ToYRow_Any_NEON;
+ if (width >= 16) {
+ YUY2ToUV422Row = YUY2ToUV422Row_Any_NEON;
+ }
+ if (IS_ALIGNED(width, 16)) {
+ YUY2ToYRow = YUY2ToYRow_NEON;
+ YUY2ToUV422Row = YUY2ToUV422Row_NEON;
+ }
+ }
+#endif
+
+ for (int y = 0; y < height - 1; y += 2) {
+ CopyRow(src_y, dst_y, width);
+ src_y += src_stride_y;
+ dst_y += dst_stride_y;
+
+ YUY2ToUV422Row(src_yuy2, dst_u, dst_v, width);
+ YUY2ToYRow(src_yuy2, dst_y, width);
+ src_yuy2 += src_stride_yuy2;
+ dst_y += dst_stride_y;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+ CopyRow(src_y, dst_y, width);
+ YUY2ToUV422Row(src_yuy2, dst_u, dst_v, width);
+ }
+ return 0;
+}
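+
+// Layout note for Q420ToI420 (reading of the loop above): each iteration
+// consumes one Y-only row from src_y and one YUY2 row from src_yuy2, so a
+// pair of output Y rows comes from two different sources:
+//   output row 2n    <- plain Y row        (CopyRow)
+//   output row 2n+1  <- Y samples of YUY2  (YUY2ToYRow)
+// while the chroma for both rows is taken from the YUY2 row
+// (YUY2ToUV422Row).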
+
+// Convert YUY2 to I420.
+LIBYUV_API
+int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2;
+ src_stride_yuy2 = -src_stride_yuy2;
+ }
+ void (*YUY2ToUVRow)(const uint8* src_yuy2, int src_stride_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix);
+ void (*YUY2ToYRow)(const uint8* src_yuy2,
+ uint8* dst_y, int pix);
+ YUY2ToYRow = YUY2ToYRow_C;
+ YUY2ToUVRow = YUY2ToUVRow_C;
+#if defined(HAS_YUY2TOYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
+ YUY2ToUVRow = YUY2ToUVRow_Any_SSE2;
+ YUY2ToYRow = YUY2ToYRow_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ YUY2ToUVRow = YUY2ToUVRow_Unaligned_SSE2;
+ YUY2ToYRow = YUY2ToYRow_Unaligned_SSE2;
+ if (IS_ALIGNED(src_yuy2, 16) && IS_ALIGNED(src_stride_yuy2, 16)) {
+ YUY2ToUVRow = YUY2ToUVRow_SSE2;
+ if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+ YUY2ToYRow = YUY2ToYRow_SSE2;
+ }
+ }
+ }
+ }
+#endif
+#if defined(HAS_YUY2TOYROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
+ YUY2ToUVRow = YUY2ToUVRow_Any_AVX2;
+ YUY2ToYRow = YUY2ToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ YUY2ToUVRow = YUY2ToUVRow_AVX2;
+ YUY2ToYRow = YUY2ToYRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_YUY2TOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ YUY2ToYRow = YUY2ToYRow_Any_NEON;
+ if (width >= 16) {
+ YUY2ToUVRow = YUY2ToUVRow_Any_NEON;
+ }
+ if (IS_ALIGNED(width, 16)) {
+ YUY2ToYRow = YUY2ToYRow_NEON;
+ YUY2ToUVRow = YUY2ToUVRow_NEON;
+ }
+ }
+#endif
+
+ for (int y = 0; y < height - 1; y += 2) {
+ YUY2ToUVRow(src_yuy2, src_stride_yuy2, dst_u, dst_v, width);
+ YUY2ToYRow(src_yuy2, dst_y, width);
+ YUY2ToYRow(src_yuy2 + src_stride_yuy2, dst_y + dst_stride_y, width);
+ src_yuy2 += src_stride_yuy2 * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+ YUY2ToUVRow(src_yuy2, 0, dst_u, dst_v, width);
+ YUY2ToYRow(src_yuy2, dst_y, width);
+ }
+ return 0;
+}
+
+// Convert UYVY to I420.
+LIBYUV_API
+int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy;
+ src_stride_uyvy = -src_stride_uyvy;
+ }
+ void (*UYVYToUVRow)(const uint8* src_uyvy, int src_stride_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix);
+ void (*UYVYToYRow)(const uint8* src_uyvy,
+ uint8* dst_y, int pix);
+ UYVYToYRow = UYVYToYRow_C;
+ UYVYToUVRow = UYVYToUVRow_C;
+#if defined(HAS_UYVYTOYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
+ UYVYToUVRow = UYVYToUVRow_Any_SSE2;
+ UYVYToYRow = UYVYToYRow_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ UYVYToUVRow = UYVYToUVRow_Unaligned_SSE2;
+ UYVYToYRow = UYVYToYRow_Unaligned_SSE2;
+ if (IS_ALIGNED(src_uyvy, 16) && IS_ALIGNED(src_stride_uyvy, 16)) {
+ UYVYToUVRow = UYVYToUVRow_SSE2;
+ if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+ UYVYToYRow = UYVYToYRow_SSE2;
+ }
+ }
+ }
+ }
+#endif
+#if defined(HAS_UYVYTOYROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
+ UYVYToUVRow = UYVYToUVRow_Any_AVX2;
+ UYVYToYRow = UYVYToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ UYVYToUVRow = UYVYToUVRow_AVX2;
+ UYVYToYRow = UYVYToYRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_UYVYTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ UYVYToYRow = UYVYToYRow_Any_NEON;
+ if (width >= 16) {
+ UYVYToUVRow = UYVYToUVRow_Any_NEON;
+ }
+ if (IS_ALIGNED(width, 16)) {
+ UYVYToYRow = UYVYToYRow_NEON;
+ UYVYToUVRow = UYVYToUVRow_NEON;
+ }
+ }
+#endif
+
+ for (int y = 0; y < height - 1; y += 2) {
+ UYVYToUVRow(src_uyvy, src_stride_uyvy, dst_u, dst_v, width);
+ UYVYToYRow(src_uyvy, dst_y, width);
+ UYVYToYRow(src_uyvy + src_stride_uyvy, dst_y + dst_stride_y, width);
+ src_uyvy += src_stride_uyvy * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+ UYVYToUVRow(src_uyvy, 0, dst_u, dst_v, width);
+ UYVYToYRow(src_uyvy, dst_y, width);
+ }
+ return 0;
+}
+
+// Convert ARGB to I420.
+LIBYUV_API
+int ARGBToI420(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ if (!src_argb ||
+ !dst_y || !dst_u || !dst_v ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+ void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+ ARGBToYRow_C;
+#if defined(HAS_ARGBTOYROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+ ARGBToYRow = ARGBToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_Unaligned_SSSE3;
+ ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+ ARGBToYRow = ARGBToYRow_SSSE3;
+ }
+ }
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
+ ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+ ARGBToYRow = ARGBToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVRow = ARGBToUVRow_AVX2;
+ ARGBToYRow = ARGBToYRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ ARGBToYRow = ARGBToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYRow = ARGBToYRow_NEON;
+ }
+ if (width >= 16) {
+ ARGBToUVRow = ARGBToUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_NEON;
+ }
+ }
+ }
+#endif
+
+ for (int y = 0; y < height - 1; y += 2) {
+ ARGBToUVRow(src_argb, src_stride_argb, dst_u, dst_v, width);
+ ARGBToYRow(src_argb, dst_y, width);
+ ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width);
+ src_argb += src_stride_argb * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+ ARGBToUVRow(src_argb, 0, dst_u, dst_v, width);
+ ARGBToYRow(src_argb, dst_y, width);
+ }
+ return 0;
+}
+
+// Convert BGRA to I420.
+LIBYUV_API
+int BGRAToI420(const uint8* src_bgra, int src_stride_bgra,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ if (!src_bgra ||
+ !dst_y || !dst_u || !dst_v ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_bgra = src_bgra + (height - 1) * src_stride_bgra;
+ src_stride_bgra = -src_stride_bgra;
+ }
+ void (*BGRAToUVRow)(const uint8* src_bgra0, int src_stride_bgra,
+ uint8* dst_u, uint8* dst_v, int width) = BGRAToUVRow_C;
+ void (*BGRAToYRow)(const uint8* src_bgra, uint8* dst_y, int pix) =
+ BGRAToYRow_C;
+#if defined(HAS_BGRATOYROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ BGRAToUVRow = BGRAToUVRow_Any_SSSE3;
+ BGRAToYRow = BGRAToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ BGRAToUVRow = BGRAToUVRow_Unaligned_SSSE3;
+ BGRAToYRow = BGRAToYRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(src_bgra, 16) && IS_ALIGNED(src_stride_bgra, 16)) {
+ BGRAToUVRow = BGRAToUVRow_SSSE3;
+ if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+ BGRAToYRow = BGRAToYRow_SSSE3;
+ }
+ }
+ }
+ }
+#elif defined(HAS_BGRATOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ BGRAToYRow = BGRAToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ BGRAToYRow = BGRAToYRow_NEON;
+ }
+ if (width >= 16) {
+ BGRAToUVRow = BGRAToUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ BGRAToUVRow = BGRAToUVRow_NEON;
+ }
+ }
+ }
+#endif
+
+ for (int y = 0; y < height - 1; y += 2) {
+ BGRAToUVRow(src_bgra, src_stride_bgra, dst_u, dst_v, width);
+ BGRAToYRow(src_bgra, dst_y, width);
+ BGRAToYRow(src_bgra + src_stride_bgra, dst_y + dst_stride_y, width);
+ src_bgra += src_stride_bgra * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+ BGRAToUVRow(src_bgra, 0, dst_u, dst_v, width);
+ BGRAToYRow(src_bgra, dst_y, width);
+ }
+ return 0;
+}
+
+// Convert ABGR to I420.
+LIBYUV_API
+int ABGRToI420(const uint8* src_abgr, int src_stride_abgr,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ if (!src_abgr ||
+ !dst_y || !dst_u || !dst_v ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_abgr = src_abgr + (height - 1) * src_stride_abgr;
+ src_stride_abgr = -src_stride_abgr;
+ }
+ void (*ABGRToUVRow)(const uint8* src_abgr0, int src_stride_abgr,
+ uint8* dst_u, uint8* dst_v, int width) = ABGRToUVRow_C;
+ void (*ABGRToYRow)(const uint8* src_abgr, uint8* dst_y, int pix) =
+ ABGRToYRow_C;
+#if defined(HAS_ABGRTOYROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ ABGRToUVRow = ABGRToUVRow_Any_SSSE3;
+ ABGRToYRow = ABGRToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ABGRToUVRow = ABGRToUVRow_Unaligned_SSSE3;
+ ABGRToYRow = ABGRToYRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(src_abgr, 16) && IS_ALIGNED(src_stride_abgr, 16)) {
+ ABGRToUVRow = ABGRToUVRow_SSSE3;
+ if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+ ABGRToYRow = ABGRToYRow_SSSE3;
+ }
+ }
+ }
+ }
+#elif defined(HAS_ABGRTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ ABGRToYRow = ABGRToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ABGRToYRow = ABGRToYRow_NEON;
+ }
+ if (width >= 16) {
+ ABGRToUVRow = ABGRToUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ABGRToUVRow = ABGRToUVRow_NEON;
+ }
+ }
+ }
+#endif
+
+ for (int y = 0; y < height - 1; y += 2) {
+ ABGRToUVRow(src_abgr, src_stride_abgr, dst_u, dst_v, width);
+ ABGRToYRow(src_abgr, dst_y, width);
+ ABGRToYRow(src_abgr + src_stride_abgr, dst_y + dst_stride_y, width);
+ src_abgr += src_stride_abgr * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+ ABGRToUVRow(src_abgr, 0, dst_u, dst_v, width);
+ ABGRToYRow(src_abgr, dst_y, width);
+ }
+ return 0;
+}
+
+// Convert RGBA to I420.
+LIBYUV_API
+int RGBAToI420(const uint8* src_rgba, int src_stride_rgba,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ if (!src_rgba ||
+ !dst_y || !dst_u || !dst_v ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_rgba = src_rgba + (height - 1) * src_stride_rgba;
+ src_stride_rgba = -src_stride_rgba;
+ }
+ void (*RGBAToUVRow)(const uint8* src_rgba0, int src_stride_rgba,
+ uint8* dst_u, uint8* dst_v, int width) = RGBAToUVRow_C;
+ void (*RGBAToYRow)(const uint8* src_rgba, uint8* dst_y, int pix) =
+ RGBAToYRow_C;
+#if defined(HAS_RGBATOYROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ RGBAToUVRow = RGBAToUVRow_Any_SSSE3;
+ RGBAToYRow = RGBAToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ RGBAToUVRow = RGBAToUVRow_Unaligned_SSSE3;
+ RGBAToYRow = RGBAToYRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(src_rgba, 16) && IS_ALIGNED(src_stride_rgba, 16)) {
+ RGBAToUVRow = RGBAToUVRow_SSSE3;
+ if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+ RGBAToYRow = RGBAToYRow_SSSE3;
+ }
+ }
+ }
+ }
+#elif defined(HAS_RGBATOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ RGBAToYRow = RGBAToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ RGBAToYRow = RGBAToYRow_NEON;
+ }
+ if (width >= 16) {
+ RGBAToUVRow = RGBAToUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ RGBAToUVRow = RGBAToUVRow_NEON;
+ }
+ }
+ }
+#endif
+
+ for (int y = 0; y < height - 1; y += 2) {
+ RGBAToUVRow(src_rgba, src_stride_rgba, dst_u, dst_v, width);
+ RGBAToYRow(src_rgba, dst_y, width);
+ RGBAToYRow(src_rgba + src_stride_rgba, dst_y + dst_stride_y, width);
+ src_rgba += src_stride_rgba * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+ RGBAToUVRow(src_rgba, 0, dst_u, dst_v, width);
+ RGBAToYRow(src_rgba, dst_y, width);
+ }
+ return 0;
+}
+
+// Convert RGB24 to I420.
+LIBYUV_API
+int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ if (!src_rgb24 || !dst_y || !dst_u || !dst_v ||
+ width <= 0 || height == 0 ||
+ width * 4 > kMaxStride) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24;
+ src_stride_rgb24 = -src_stride_rgb24;
+ }
+
+#if defined(HAS_RGB24TOYROW_NEON)
+ void (*RGB24ToUVRow)(const uint8* src_rgb24, int src_stride_rgb24,
+ uint8* dst_u, uint8* dst_v, int width) = RGB24ToUVRow_C;
+ void (*RGB24ToYRow)(const uint8* src_rgb24, uint8* dst_y, int pix) =
+ RGB24ToYRow_C;
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ RGB24ToYRow = RGB24ToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ RGB24ToYRow = RGB24ToYRow_NEON;
+ }
+ if (width >= 16) {
+ RGB24ToUVRow = RGB24ToUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ RGB24ToUVRow = RGB24ToUVRow_NEON;
+ }
+ }
+ }
+#else // HAS_RGB24TOYROW_NEON
+ SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
+ void (*RGB24ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) =
+ RGB24ToARGBRow_C;
+#if defined(HAS_RGB24TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ RGB24ToARGBRow = RGB24ToARGBRow_SSSE3;
+ }
+ }
+#endif
+ void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+#if defined(HAS_ARGBTOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ }
+ }
+#endif
+ void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+ ARGBToYRow_C;
+#if defined(HAS_ARGBTOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ ARGBToYRow = ARGBToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+ ARGBToYRow = ARGBToYRow_SSSE3;
+ }
+ }
+ }
+#endif // HAS_ARGBTOUVROW_SSSE3
+#endif // HAS_RGB24TOYROW_NEON
+
+ for (int y = 0; y < height - 1; y += 2) {
+#if defined(HAS_RGB24TOYROW_NEON)
+ RGB24ToUVRow(src_rgb24, src_stride_rgb24, dst_u, dst_v, width);
+ RGB24ToYRow(src_rgb24, dst_y, width);
+ RGB24ToYRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width);
+#else
+ RGB24ToARGBRow(src_rgb24, row, width);
+ RGB24ToARGBRow(src_rgb24 + src_stride_rgb24, row + kMaxStride, width);
+ ARGBToUVRow(row, kMaxStride, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+ ARGBToYRow(row + kMaxStride, dst_y + dst_stride_y, width);
+#endif
+ src_rgb24 += src_stride_rgb24 * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+#if defined(HAS_RGB24TOYROW_NEON)
+ RGB24ToUVRow(src_rgb24, 0, dst_u, dst_v, width);
+ RGB24ToYRow(src_rgb24, dst_y, width);
+#else
+ RGB24ToARGBRow(src_rgb24, row, width);
+ ARGBToUVRow(row, 0, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+#endif
+ }
+ return 0;
+}
+
+// Convert RAW to I420.
+LIBYUV_API
+int RAWToI420(const uint8* src_raw, int src_stride_raw,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ if (!src_raw || !dst_y || !dst_u || !dst_v ||
+ width <= 0 || height == 0 ||
+ width * 4 > kMaxStride) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_raw = src_raw + (height - 1) * src_stride_raw;
+ src_stride_raw = -src_stride_raw;
+ }
+
+#if defined(HAS_RAWTOYROW_NEON)
+ void (*RAWToUVRow)(const uint8* src_raw, int src_stride_raw,
+ uint8* dst_u, uint8* dst_v, int width) = RAWToUVRow_C;
+ void (*RAWToYRow)(const uint8* src_raw, uint8* dst_y, int pix) =
+ RAWToYRow_C;
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ RAWToYRow = RAWToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ RAWToYRow = RAWToYRow_NEON;
+ }
+ if (width >= 16) {
+ RAWToUVRow = RAWToUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ RAWToUVRow = RAWToUVRow_NEON;
+ }
+ }
+ }
+#else // HAS_RAWTOYROW_NEON
+ SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
+ void (*RAWToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) =
+ RAWToARGBRow_C;
+#if defined(HAS_RAWTOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ RAWToARGBRow = RAWToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ RAWToARGBRow = RAWToARGBRow_SSSE3;
+ }
+ }
+#endif
+ void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+#if defined(HAS_ARGBTOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ }
+ }
+#endif
+ void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+ ARGBToYRow_C;
+#if defined(HAS_ARGBTOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ ARGBToYRow = ARGBToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+ ARGBToYRow = ARGBToYRow_SSSE3;
+ }
+ }
+ }
+#endif // HAS_ARGBTOUVROW_SSSE3
+#endif // HAS_RAWTOYROW_NEON
+
+ for (int y = 0; y < height - 1; y += 2) {
+#if defined(HAS_RAWTOYROW_NEON)
+ RAWToUVRow(src_raw, src_stride_raw, dst_u, dst_v, width);
+ RAWToYRow(src_raw, dst_y, width);
+ RAWToYRow(src_raw + src_stride_raw, dst_y + dst_stride_y, width);
+#else
+ RAWToARGBRow(src_raw, row, width);
+ RAWToARGBRow(src_raw + src_stride_raw, row + kMaxStride, width);
+ ARGBToUVRow(row, kMaxStride, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+ ARGBToYRow(row + kMaxStride, dst_y + dst_stride_y, width);
+#endif
+ src_raw += src_stride_raw * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+#if defined(HAS_RAWTOYROW_NEON)
+ RAWToUVRow(src_raw, 0, dst_u, dst_v, width);
+ RAWToYRow(src_raw, dst_y, width);
+#else
+ RAWToARGBRow(src_raw, row, width);
+ ARGBToUVRow(row, 0, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+#endif
+ }
+ return 0;
+}
+
+// Convert RGB565 to I420.
+LIBYUV_API
+int RGB565ToI420(const uint8* src_rgb565, int src_stride_rgb565,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ if (!src_rgb565 || !dst_y || !dst_u || !dst_v ||
+ width <= 0 || height == 0 ||
+ width * 4 > kMaxStride) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_rgb565 = src_rgb565 + (height - 1) * src_stride_rgb565;
+ src_stride_rgb565 = -src_stride_rgb565;
+ }
+
+#if defined(HAS_RGB565TOYROW_NEON)
+ void (*RGB565ToUVRow)(const uint8* src_rgb565, int src_stride_rgb565,
+ uint8* dst_u, uint8* dst_v, int width) = RGB565ToUVRow_C;
+ void (*RGB565ToYRow)(const uint8* src_rgb565, uint8* dst_y, int pix) =
+ RGB565ToYRow_C;
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ RGB565ToYRow = RGB565ToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ RGB565ToYRow = RGB565ToYRow_NEON;
+ }
+ if (width >= 16) {
+ RGB565ToUVRow = RGB565ToUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ RGB565ToUVRow = RGB565ToUVRow_NEON;
+ }
+ }
+ }
+#else // HAS_RGB565TOYROW_NEON
+ SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
+ void (*RGB565ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) =
+ RGB565ToARGBRow_C;
+#if defined(HAS_RGB565TOARGBROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && width >= 8) {
+ RGB565ToARGBRow = RGB565ToARGBRow_Any_SSE2;
+ if (IS_ALIGNED(width, 8)) {
+ RGB565ToARGBRow = RGB565ToARGBRow_SSE2;
+ }
+ }
+#endif
+ void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+#if defined(HAS_ARGBTOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ }
+ }
+#endif
+ void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+ ARGBToYRow_C;
+#if defined(HAS_ARGBTOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ ARGBToYRow = ARGBToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+ ARGBToYRow = ARGBToYRow_SSSE3;
+ }
+ }
+ }
+#endif // HAS_ARGBTOUVROW_SSSE3
+#endif // HAS_RGB565TOYROW_NEON
+
+ for (int y = 0; y < height - 1; y += 2) {
+#if defined(HAS_RGB565TOYROW_NEON)
+ RGB565ToUVRow(src_rgb565, src_stride_rgb565, dst_u, dst_v, width);
+ RGB565ToYRow(src_rgb565, dst_y, width);
+ RGB565ToYRow(src_rgb565 + src_stride_rgb565, dst_y + dst_stride_y, width);
+#else
+ RGB565ToARGBRow(src_rgb565, row, width);
+ RGB565ToARGBRow(src_rgb565 + src_stride_rgb565, row + kMaxStride, width);
+ ARGBToUVRow(row, kMaxStride, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+ ARGBToYRow(row + kMaxStride, dst_y + dst_stride_y, width);
+#endif
+ src_rgb565 += src_stride_rgb565 * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+#if defined(HAS_RGB565TOYROW_NEON)
+ RGB565ToUVRow(src_rgb565, 0, dst_u, dst_v, width);
+ RGB565ToYRow(src_rgb565, dst_y, width);
+#else
+ RGB565ToARGBRow(src_rgb565, row, width);
+ ARGBToUVRow(row, 0, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+#endif
+ }
+ return 0;
+}
+
+// Convert ARGB1555 to I420.
+LIBYUV_API
+int ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ if (!src_argb1555 || !dst_y || !dst_u || !dst_v ||
+ width <= 0 || height == 0 ||
+ width * 4 > kMaxStride) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb1555 = src_argb1555 + (height - 1) * src_stride_argb1555;
+ src_stride_argb1555 = -src_stride_argb1555;
+ }
+
+#if defined(HAS_ARGB1555TOYROW_NEON)
+ void (*ARGB1555ToUVRow)(const uint8* src_argb1555, int src_stride_argb1555,
+ uint8* dst_u, uint8* dst_v, int width) = ARGB1555ToUVRow_C;
+ void (*ARGB1555ToYRow)(const uint8* src_argb1555, uint8* dst_y, int pix) =
+ ARGB1555ToYRow_C;
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ ARGB1555ToYRow = ARGB1555ToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGB1555ToYRow = ARGB1555ToYRow_NEON;
+ }
+ if (width >= 16) {
+ ARGB1555ToUVRow = ARGB1555ToUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ARGB1555ToUVRow = ARGB1555ToUVRow_NEON;
+ }
+ }
+ }
+#else // HAS_ARGB1555TOYROW_NEON
+ SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
+ void (*ARGB1555ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) =
+ ARGB1555ToARGBRow_C;
+#if defined(HAS_ARGB1555TOARGBROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && width >= 8) {
+ ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_SSE2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGB1555ToARGBRow = ARGB1555ToARGBRow_SSE2;
+ }
+ }
+#endif
+ void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+#if defined(HAS_ARGBTOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ }
+ }
+#endif
+ void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+ ARGBToYRow_C;
+#if defined(HAS_ARGBTOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ ARGBToYRow = ARGBToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+ ARGBToYRow = ARGBToYRow_SSSE3;
+ }
+ }
+ }
+#endif // HAS_ARGBTOUVROW_SSSE3
+#endif // HAS_ARGB1555TOYROW_NEON
+
+ for (int y = 0; y < height - 1; y += 2) {
+#if defined(HAS_ARGB1555TOYROW_NEON)
+ ARGB1555ToUVRow(src_argb1555, src_stride_argb1555, dst_u, dst_v, width);
+ ARGB1555ToYRow(src_argb1555, dst_y, width);
+ ARGB1555ToYRow(src_argb1555 + src_stride_argb1555, dst_y + dst_stride_y,
+ width);
+#else
+ ARGB1555ToARGBRow(src_argb1555, row, width);
+ ARGB1555ToARGBRow(src_argb1555 + src_stride_argb1555, row + kMaxStride,
+ width);
+ ARGBToUVRow(row, kMaxStride, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+ ARGBToYRow(row + kMaxStride, dst_y + dst_stride_y, width);
+#endif
+ src_argb1555 += src_stride_argb1555 * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+#if defined(HAS_ARGB1555TOYROW_NEON)
+ ARGB1555ToUVRow(src_argb1555, 0, dst_u, dst_v, width);
+ ARGB1555ToYRow(src_argb1555, dst_y, width);
+#else
+ ARGB1555ToARGBRow(src_argb1555, row, width);
+ ARGBToUVRow(row, 0, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+#endif
+ }
+ return 0;
+}
+
+// Convert ARGB4444 to I420.
+LIBYUV_API
+int ARGB4444ToI420(const uint8* src_argb4444, int src_stride_argb4444,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ if (!src_argb4444 || !dst_y || !dst_u || !dst_v ||
+ width <= 0 || height == 0 ||
+ width * 4 > kMaxStride) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb4444 = src_argb4444 + (height - 1) * src_stride_argb4444;
+ src_stride_argb4444 = -src_stride_argb4444;
+ }
+
+#if defined(HAS_ARGB4444TOYROW_NEON)
+ void (*ARGB4444ToUVRow)(const uint8* src_argb4444, int src_stride_argb4444,
+ uint8* dst_u, uint8* dst_v, int width) = ARGB4444ToUVRow_C;
+ void (*ARGB4444ToYRow)(const uint8* src_argb4444, uint8* dst_y, int pix) =
+ ARGB4444ToYRow_C;
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ ARGB4444ToYRow = ARGB4444ToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGB4444ToYRow = ARGB4444ToYRow_NEON;
+ }
+ if (width >= 16) {
+ ARGB4444ToUVRow = ARGB4444ToUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ARGB4444ToUVRow = ARGB4444ToUVRow_NEON;
+ }
+ }
+ }
+#else // HAS_ARGB4444TOYROW_NEON
+ SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
+ void (*ARGB4444ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) =
+ ARGB4444ToARGBRow_C;
+#if defined(HAS_ARGB4444TOARGBROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && width >= 8) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_SSE2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_SSE2;
+ }
+ }
+#endif
+ void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+#if defined(HAS_ARGBTOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ }
+ }
+#endif
+ void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+ ARGBToYRow_C;
+#if defined(HAS_ARGBTOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ ARGBToYRow = ARGBToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+ ARGBToYRow = ARGBToYRow_SSSE3;
+ }
+ }
+ }
+#endif // HAS_ARGBTOUVROW_SSSE3
+#endif // HAS_ARGB4444TOYROW_NEON
+
+ for (int y = 0; y < height - 1; y += 2) {
+#if defined(HAS_ARGB4444TOYROW_NEON)
+ ARGB4444ToUVRow(src_argb4444, src_stride_argb4444, dst_u, dst_v, width);
+ ARGB4444ToYRow(src_argb4444, dst_y, width);
+ ARGB4444ToYRow(src_argb4444 + src_stride_argb4444, dst_y + dst_stride_y,
+ width);
+#else
+ ARGB4444ToARGBRow(src_argb4444, row, width);
+ ARGB4444ToARGBRow(src_argb4444 + src_stride_argb4444, row + kMaxStride,
+ width);
+ ARGBToUVRow(row, kMaxStride, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+ ARGBToYRow(row + kMaxStride, dst_y + dst_stride_y, width);
+#endif
+ src_argb4444 += src_stride_argb4444 * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+#if defined(HAS_ARGB4444TOYROW_NEON)
+ ARGB4444ToUVRow(src_argb4444, 0, dst_u, dst_v, width);
+ ARGB4444ToYRow(src_argb4444, dst_y, width);
+#else
+ ARGB4444ToARGBRow(src_argb4444, row, width);
+ ARGBToUVRow(row, 0, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+#endif
+ }
+ return 0;
+}
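+
+// Note on the two 16-bit-to-I420 converters above: rows are processed in
+// pairs so that one U and one V output row can be computed from two source
+// rows (2x2 chroma subsampling). When no direct NEON row function is
+// available, each pair of source rows is first expanded to ARGB in the stack
+// buffer 'row' and the generic ARGBToUVRow/ARGBToYRow helpers are applied.
+// Illustrative call with hypothetical buffers for a 320x240 ARGB4444 frame
+// (2 bytes per source pixel, half-width chroma planes):
+//   ARGB4444ToI420(src, 320 * 2,
+//                  dst_y, 320, dst_u, 160, dst_v, 160,
+//                  320, 240);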
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/chromium/third_party/libyuv/source/convert_argb.cc b/chromium/third_party/libyuv/source/convert_argb.cc
new file mode 100644
index 00000000000..55d4d6904ce
--- /dev/null
+++ b/chromium/third_party/libyuv/source/convert_argb.cc
@@ -0,0 +1,909 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/convert_argb.h"
+
+#include "libyuv/cpu_id.h"
+#include "libyuv/format_conversion.h"
+#ifdef HAVE_JPEG
+#include "libyuv/mjpeg_decoder.h"
+#endif
+#include "libyuv/rotate_argb.h"
+#include "libyuv/row.h"
+#include "libyuv/video_common.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Copy ARGB with optional flipping
+LIBYUV_API
+int ARGBCopy(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ if (!src_argb || !dst_argb ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+
+ CopyPlane(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
+ width * 4, height);
+ return 0;
+}
+
+// Convert I444 to ARGB.
+LIBYUV_API
+int I444ToARGB(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ if (!src_y || !src_u || !src_v ||
+ !dst_argb ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+ // Coalesce contiguous rows.
+ if (src_stride_y == width &&
+ src_stride_u == width &&
+ src_stride_v == width &&
+ dst_stride_argb == width * 4) {
+ return I444ToARGB(src_y, 0,
+ src_u, 0,
+ src_v, 0,
+ dst_argb, 0,
+ width * height, 1);
+ }
+ void (*I444ToARGBRow)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) = I444ToARGBRow_C;
+#if defined(HAS_I444TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ I444ToARGBRow = I444ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I444ToARGBRow = I444ToARGBRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ I444ToARGBRow = I444ToARGBRow_SSSE3;
+ }
+ }
+ }
+#elif defined(HAS_I444TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ I444ToARGBRow = I444ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I444ToARGBRow = I444ToARGBRow_NEON;
+ }
+ }
+#endif
+
+ for (int y = 0; y < height; ++y) {
+ I444ToARGBRow(src_y, src_u, src_v, dst_argb, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ return 0;
+}
+
+// Convert I422 to ARGB.
+LIBYUV_API
+int I422ToARGB(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ if (!src_y || !src_u || !src_v ||
+ !dst_argb ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+ // Coalesce contiguous rows.
+ if (src_stride_y == width &&
+ src_stride_u * 2 == width &&
+ src_stride_v * 2 == width &&
+ dst_stride_argb == width * 4) {
+ return I422ToARGB(src_y, 0,
+ src_u, 0,
+ src_v, 0,
+ dst_argb, 0,
+ width * height, 1);
+ }
+ void (*I422ToARGBRow)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) = I422ToARGBRow_C;
+#if defined(HAS_I422TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGBRow = I422ToARGBRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ I422ToARGBRow = I422ToARGBRow_SSSE3;
+ }
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2) && width >= 16) {
+ I422ToARGBRow = I422ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToARGBRow = I422ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ I422ToARGBRow = I422ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGBRow = I422ToARGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
+ IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
+ IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
+ IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
+ IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
+ I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2;
+ }
+#endif
+
+ for (int y = 0; y < height; ++y) {
+ I422ToARGBRow(src_y, src_u, src_v, dst_argb, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ return 0;
+}
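+
+// Illustrative call with hypothetical buffers for a 640x480 I422 frame: the
+// U and V planes are half width but full height, so their stride is
+// width / 2 while the destination ARGB stride is width * 4.
+//   I422ToARGB(y, 640, u, 320, v, 320, argb, 640 * 4, 640, 480);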
+
+// Convert I411 to ARGB.
+LIBYUV_API
+int I411ToARGB(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ if (!src_y || !src_u || !src_v ||
+ !dst_argb ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+ // Coalesce contiguous rows.
+ if (src_stride_y == width &&
+ src_stride_u * 4 == width &&
+ src_stride_v * 4 == width &&
+ dst_stride_argb == width * 4) {
+ return I411ToARGB(src_y, 0,
+ src_u, 0,
+ src_v, 0,
+ dst_argb, 0,
+ width * height, 1);
+ }
+ void (*I411ToARGBRow)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) = I411ToARGBRow_C;
+#if defined(HAS_I411TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ I411ToARGBRow = I411ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I411ToARGBRow = I411ToARGBRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ I411ToARGBRow = I411ToARGBRow_SSSE3;
+ }
+ }
+ }
+#elif defined(HAS_I411TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ I411ToARGBRow = I411ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I411ToARGBRow = I411ToARGBRow_NEON;
+ }
+ }
+#endif
+
+ for (int y = 0; y < height; ++y) {
+ I411ToARGBRow(src_y, src_u, src_v, dst_argb, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ return 0;
+}
+
+// Convert I400 to ARGB (reference version).
+LIBYUV_API
+int I400ToARGB_Reference(const uint8* src_y, int src_stride_y,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ if (!src_y || !dst_argb ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+ // Coalesce contiguous rows.
+ if (src_stride_y == width &&
+ dst_stride_argb == width * 4) {
+ return I400ToARGB_Reference(src_y, 0,
+ dst_argb, 0,
+ width * height, 1);
+ }
+ void (*YToARGBRow)(const uint8* y_buf,
+ uint8* rgb_buf,
+ int width) = YToARGBRow_C;
+#if defined(HAS_YTOARGBROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && width >= 8 &&
+ IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ YToARGBRow = YToARGBRow_Any_SSE2;
+ if (IS_ALIGNED(width, 8)) {
+ YToARGBRow = YToARGBRow_SSE2;
+ }
+ }
+#elif defined(HAS_YTOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ YToARGBRow = YToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ YToARGBRow = YToARGBRow_NEON;
+ }
+ }
+#endif
+
+ for (int y = 0; y < height; ++y) {
+ YToARGBRow(src_y, dst_argb, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ }
+ return 0;
+}
+
+// Convert I400 to ARGB.
+LIBYUV_API
+int I400ToARGB(const uint8* src_y, int src_stride_y,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ if (!src_y || !dst_argb ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_stride_y = -src_stride_y;
+ }
+ // Coalesce contiguous rows.
+ if (src_stride_y == width &&
+ dst_stride_argb == width * 4) {
+ return I400ToARGB(src_y, 0,
+ dst_argb, 0,
+ width * height, 1);
+ }
+ void (*I400ToARGBRow)(const uint8* src_y, uint8* dst_argb, int pix) =
+ I400ToARGBRow_C;
+#if defined(HAS_I400TOARGBROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && width >= 8) {
+ I400ToARGBRow = I400ToARGBRow_Any_SSE2;
+ if (IS_ALIGNED(width, 8)) {
+ I400ToARGBRow = I400ToARGBRow_Unaligned_SSE2;
+ if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ I400ToARGBRow = I400ToARGBRow_SSE2;
+ }
+ }
+ }
+#elif defined(HAS_I400TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ I400ToARGBRow = I400ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I400ToARGBRow = I400ToARGBRow_NEON;
+ }
+ }
+#endif
+ for (int y = 0; y < height; ++y) {
+ I400ToARGBRow(src_y, dst_argb, width);
+ src_y += src_stride_y;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Shuffle table for converting BGRA to ARGB.
+static const uvec8 kShuffleMaskBGRAToARGB = {
+ 3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
+};
+
+// Shuffle table for converting ABGR to ARGB.
+static const uvec8 kShuffleMaskABGRToARGB = {
+ 2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u
+};
+
+// Shuffle table for converting RGBA to ARGB.
+static const uvec8 kShuffleMaskRGBAToARGB = {
+ 1u, 2u, 3u, 0u, 5u, 6u, 7u, 4u, 9u, 10u, 11u, 8u, 13u, 14u, 15u, 12u
+};
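+
+// These tables are consumed by ARGBShuffle as byte-index maps: within each
+// 4-byte pixel, output byte i is taken from input byte mask[i]. For example,
+// with kShuffleMaskBGRAToARGB ({3, 2, 1, 0, ...}) the input bytes
+// {b0, b1, b2, b3} become {b3, b2, b1, b0}, reversing the byte order of every
+// pixel, while kShuffleMaskABGRToARGB swaps bytes 0 and 2 and keeps byte 3 in
+// place.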
+
+// Convert BGRA to ARGB.
+LIBYUV_API
+int BGRAToARGB(const uint8* src_bgra, int src_stride_bgra,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ return ARGBShuffle(src_bgra, src_stride_bgra,
+ dst_argb, dst_stride_argb,
+ reinterpret_cast<const uint8*>(&kShuffleMaskBGRAToARGB),
+ width, height);
+}
+
+// Convert ABGR to ARGB.
+LIBYUV_API
+int ABGRToARGB(const uint8* src_abgr, int src_stride_abgr,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ return ARGBShuffle(src_abgr, src_stride_abgr,
+ dst_argb, dst_stride_argb,
+ reinterpret_cast<const uint8*>(&kShuffleMaskABGRToARGB),
+ width, height);
+}
+
+// Convert RGBA to ARGB.
+LIBYUV_API
+int RGBAToARGB(const uint8* src_rgba, int src_stride_rgba,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ return ARGBShuffle(src_rgba, src_stride_rgba,
+ dst_argb, dst_stride_argb,
+ reinterpret_cast<const uint8*>(&kShuffleMaskRGBAToARGB),
+ width, height);
+}
+
+// Convert RGB24 to ARGB.
+LIBYUV_API
+int RGB24ToARGB(const uint8* src_rgb24, int src_stride_rgb24,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ if (!src_rgb24 || !dst_argb ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24;
+ src_stride_rgb24 = -src_stride_rgb24;
+ }
+ // Coalesce contiguous rows.
+ if (src_stride_rgb24 == width * 3 &&
+ dst_stride_argb == width * 4) {
+ return RGB24ToARGB(src_rgb24, 0,
+ dst_argb, 0,
+ width * height, 1);
+ }
+ void (*RGB24ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) =
+ RGB24ToARGBRow_C;
+#if defined(HAS_RGB24TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 16 &&
+ IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ RGB24ToARGBRow = RGB24ToARGBRow_SSSE3;
+ }
+ }
+#elif defined(HAS_RGB24TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ RGB24ToARGBRow = RGB24ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ RGB24ToARGBRow = RGB24ToARGBRow_NEON;
+ }
+ }
+#endif
+
+ for (int y = 0; y < height; ++y) {
+ RGB24ToARGBRow(src_rgb24, dst_argb, width);
+ src_rgb24 += src_stride_rgb24;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Convert RAW to ARGB.
+LIBYUV_API
+int RAWToARGB(const uint8* src_raw, int src_stride_raw,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ if (!src_raw || !dst_argb ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_raw = src_raw + (height - 1) * src_stride_raw;
+ src_stride_raw = -src_stride_raw;
+ }
+ // Coalesce contiguous rows.
+ if (src_stride_raw == width * 3 &&
+ dst_stride_argb == width * 4) {
+ return RAWToARGB(src_raw, 0,
+ dst_argb, 0,
+ width * height, 1);
+ }
+ void (*RAWToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) =
+ RAWToARGBRow_C;
+#if defined(HAS_RAWTOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 16 &&
+ IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ RAWToARGBRow = RAWToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ RAWToARGBRow = RAWToARGBRow_SSSE3;
+ }
+ }
+#elif defined(HAS_RAWTOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ RAWToARGBRow = RAWToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ RAWToARGBRow = RAWToARGBRow_NEON;
+ }
+ }
+#endif
+
+ for (int y = 0; y < height; ++y) {
+ RAWToARGBRow(src_raw, dst_argb, width);
+ src_raw += src_stride_raw;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Convert RGB565 to ARGB.
+LIBYUV_API
+int RGB565ToARGB(const uint8* src_rgb565, int src_stride_rgb565,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ if (!src_rgb565 || !dst_argb ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_rgb565 = src_rgb565 + (height - 1) * src_stride_rgb565;
+ src_stride_rgb565 = -src_stride_rgb565;
+ }
+ // Coalesce contiguous rows.
+ if (src_stride_rgb565 == width * 2 &&
+ dst_stride_argb == width * 4) {
+ return RGB565ToARGB(src_rgb565, 0,
+ dst_argb, 0,
+ width * height, 1);
+ }
+ void (*RGB565ToARGBRow)(const uint8* src_rgb565, uint8* dst_argb, int pix) =
+ RGB565ToARGBRow_C;
+#if defined(HAS_RGB565TOARGBROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && width >= 8 &&
+ IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ RGB565ToARGBRow = RGB565ToARGBRow_Any_SSE2;
+ if (IS_ALIGNED(width, 8)) {
+ RGB565ToARGBRow = RGB565ToARGBRow_SSE2;
+ }
+ }
+#elif defined(HAS_RGB565TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ RGB565ToARGBRow = RGB565ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ RGB565ToARGBRow = RGB565ToARGBRow_NEON;
+ }
+ }
+#endif
+
+ for (int y = 0; y < height; ++y) {
+ RGB565ToARGBRow(src_rgb565, dst_argb, width);
+ src_rgb565 += src_stride_rgb565;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Convert ARGB1555 to ARGB.
+LIBYUV_API
+int ARGB1555ToARGB(const uint8* src_argb1555, int src_stride_argb1555,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ if (!src_argb1555 || !dst_argb ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb1555 = src_argb1555 + (height - 1) * src_stride_argb1555;
+ src_stride_argb1555 = -src_stride_argb1555;
+ }
+ // Coalesce contiguous rows.
+ if (src_stride_argb1555 == width * 2 &&
+ dst_stride_argb == width * 4) {
+ return ARGB1555ToARGB(src_argb1555, 0,
+ dst_argb, 0,
+ width * height, 1);
+ }
+ void (*ARGB1555ToARGBRow)(const uint8* src_argb1555, uint8* dst_argb,
+ int pix) = ARGB1555ToARGBRow_C;
+#if defined(HAS_ARGB1555TOARGBROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && width >= 8 &&
+ IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_SSE2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGB1555ToARGBRow = ARGB1555ToARGBRow_SSE2;
+ }
+ }
+#elif defined(HAS_ARGB1555TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGB1555ToARGBRow = ARGB1555ToARGBRow_NEON;
+ }
+ }
+#endif
+
+ for (int y = 0; y < height; ++y) {
+ ARGB1555ToARGBRow(src_argb1555, dst_argb, width);
+ src_argb1555 += src_stride_argb1555;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Convert ARGB4444 to ARGB.
+LIBYUV_API
+int ARGB4444ToARGB(const uint8* src_argb4444, int src_stride_argb4444,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ if (!src_argb4444 || !dst_argb ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb4444 = src_argb4444 + (height - 1) * src_stride_argb4444;
+ src_stride_argb4444 = -src_stride_argb4444;
+ }
+ // Coalesce contiguous rows.
+ if (src_stride_argb4444 == width * 2 &&
+ dst_stride_argb == width * 4) {
+ return ARGB4444ToARGB(src_argb4444, 0,
+ dst_argb, 0,
+ width * height, 1);
+ }
+ void (*ARGB4444ToARGBRow)(const uint8* src_argb4444, uint8* dst_argb,
+ int pix) = ARGB4444ToARGBRow_C;
+#if defined(HAS_ARGB4444TOARGBROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && width >= 8 &&
+ IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_SSE2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_SSE2;
+ }
+ }
+#elif defined(HAS_ARGB4444TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_NEON;
+ }
+ }
+#endif
+
+ for (int y = 0; y < height; ++y) {
+ ARGB4444ToARGBRow(src_argb4444, dst_argb, width);
+ src_argb4444 += src_stride_argb4444;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Convert NV12 to ARGB.
+LIBYUV_API
+int NV12ToARGB(const uint8* src_y, int src_stride_y,
+ const uint8* src_uv, int src_stride_uv,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ if (!src_y || !src_uv || !dst_argb ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+ void (*NV12ToARGBRow)(const uint8* y_buf,
+ const uint8* uv_buf,
+ uint8* rgb_buf,
+ int width) = NV12ToARGBRow_C;
+#if defined(HAS_NV12TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ NV12ToARGBRow = NV12ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ NV12ToARGBRow = NV12ToARGBRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ NV12ToARGBRow = NV12ToARGBRow_SSSE3;
+ }
+ }
+ }
+#elif defined(HAS_NV12TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ NV12ToARGBRow = NV12ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ NV12ToARGBRow = NV12ToARGBRow_NEON;
+ }
+ }
+#endif
+
+ for (int y = 0; y < height; ++y) {
+ NV12ToARGBRow(src_y, src_uv, dst_argb, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_uv += src_stride_uv;
+ }
+ }
+ return 0;
+}
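+
+// Illustrative call with hypothetical buffers for a 640x480 NV12 frame: the
+// UV plane holds interleaved U,V pairs at half vertical resolution, so
+// src_uv advances only every other output row (the 'y & 1' test above) and
+// its stride equals the Y stride for even widths.
+//   NV12ToARGB(y, 640, uv, 640, argb, 640 * 4, 640, 480);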
+
+// Convert NV21 to ARGB.
+LIBYUV_API
+int NV21ToARGB(const uint8* src_y, int src_stride_y,
+ const uint8* src_uv, int src_stride_uv,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ if (!src_y || !src_uv || !dst_argb ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+ void (*NV21ToARGBRow)(const uint8* y_buf,
+ const uint8* uv_buf,
+ uint8* rgb_buf,
+ int width) = NV21ToARGBRow_C;
+#if defined(HAS_NV21TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ NV21ToARGBRow = NV21ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ NV21ToARGBRow = NV21ToARGBRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ NV21ToARGBRow = NV21ToARGBRow_SSSE3;
+ }
+ }
+ }
+#endif
+#if defined(HAS_NV21TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ NV21ToARGBRow = NV21ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ NV21ToARGBRow = NV21ToARGBRow_NEON;
+ }
+ }
+#endif
+
+ for (int y = 0; y < height; ++y) {
+ NV21ToARGBRow(src_y, src_uv, dst_argb, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_uv += src_stride_uv;
+ }
+ }
+ return 0;
+}
+
+// Convert M420 to ARGB.
+LIBYUV_API
+int M420ToARGB(const uint8* src_m420, int src_stride_m420,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ if (!src_m420 || !dst_argb ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+ void (*NV12ToARGBRow)(const uint8* y_buf,
+ const uint8* uv_buf,
+ uint8* rgb_buf,
+ int width) = NV12ToARGBRow_C;
+#if defined(HAS_NV12TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ NV12ToARGBRow = NV12ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ NV12ToARGBRow = NV12ToARGBRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ NV12ToARGBRow = NV12ToARGBRow_SSSE3;
+ }
+ }
+ }
+#elif defined(HAS_NV12TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ NV12ToARGBRow = NV12ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ NV12ToARGBRow = NV12ToARGBRow_NEON;
+ }
+ }
+#endif
+
+ for (int y = 0; y < height - 1; y += 2) {
+ NV12ToARGBRow(src_m420, src_m420 + src_stride_m420 * 2, dst_argb, width);
+ NV12ToARGBRow(src_m420 + src_stride_m420, src_m420 + src_stride_m420 * 2,
+ dst_argb + dst_stride_argb, width);
+ dst_argb += dst_stride_argb * 2;
+ src_m420 += src_stride_m420 * 3;
+ }
+ if (height & 1) {
+ NV12ToARGBRow(src_m420, src_m420 + src_stride_m420 * 2, dst_argb, width);
+ }
+ return 0;
+}
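+
+// M420 stores two rows of Y followed by one interleaved UV row, repeating
+// every three source rows. That layout is why the UV pointer above is
+// src_m420 + src_stride_m420 * 2 and why the source advances by three
+// strides for every two output rows.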
+
+// Convert YUY2 to ARGB.
+LIBYUV_API
+int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ if (!src_yuy2 || !dst_argb ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2;
+ src_stride_yuy2 = -src_stride_yuy2;
+ }
+ // Coalesce contiguous rows.
+ if (width * height <= kMaxStride &&
+ src_stride_yuy2 == width * 2 &&
+ dst_stride_argb == width * 4) {
+ return YUY2ToARGB(src_yuy2, 0,
+ dst_argb, 0,
+ width * height, 1);
+ }
+ void (*YUY2ToARGBRow)(const uint8* src_yuy2, uint8* dst_argb, int pix) =
+ YUY2ToARGBRow_C;
+#if defined(HAS_YUY2TOARGBROW_SSSE3)
+  // The SSSE3 row function handles 16 pixels per step on POSIX and 8 on
+  // Windows, so require the stricter width here.
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 16 && width <= kMaxStride) {
+ YUY2ToARGBRow = YUY2ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ YUY2ToARGBRow = YUY2ToARGBRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(src_yuy2, 16) && IS_ALIGNED(src_stride_yuy2, 16) &&
+ IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ YUY2ToARGBRow = YUY2ToARGBRow_SSSE3;
+ }
+ }
+ }
+#elif defined(HAS_YUY2TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ YUY2ToARGBRow = YUY2ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ YUY2ToARGBRow = YUY2ToARGBRow_NEON;
+ }
+ }
+#endif
+ for (int y = 0; y < height; ++y) {
+ YUY2ToARGBRow(src_yuy2, dst_argb, width);
+ src_yuy2 += src_stride_yuy2;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
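+
+// YUY2 (and UYVY below) pack two pixels into four bytes, so the source
+// stride is width * 2. Illustrative call with hypothetical buffers for a
+// 640x480 YUY2 frame:
+//   YUY2ToARGB(yuy2, 640 * 2, argb, 640 * 4, 640, 480);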
+
+// Convert UYVY to ARGB.
+LIBYUV_API
+int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ if (!src_uyvy || !dst_argb ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy;
+ src_stride_uyvy = -src_stride_uyvy;
+ }
+ // Coalesce contiguous rows.
+ if (width * height <= kMaxStride &&
+ src_stride_uyvy == width * 2 &&
+ dst_stride_argb == width * 4) {
+ return UYVYToARGB(src_uyvy, 0,
+ dst_argb, 0,
+ width * height, 1);
+ }
+ void (*UYVYToARGBRow)(const uint8* src_uyvy, uint8* dst_argb, int pix) =
+ UYVYToARGBRow_C;
+#if defined(HAS_UYVYTOARGBROW_SSSE3)
+  // The SSSE3 row function handles 16 pixels per step on POSIX and 8 on
+  // Windows, so require the stricter width here.
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 16 && width <= kMaxStride) {
+ UYVYToARGBRow = UYVYToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ UYVYToARGBRow = UYVYToARGBRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(src_uyvy, 16) && IS_ALIGNED(src_stride_uyvy, 16) &&
+ IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ UYVYToARGBRow = UYVYToARGBRow_SSSE3;
+ }
+ }
+ }
+#elif defined(HAS_UYVYTOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ UYVYToARGBRow = UYVYToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ UYVYToARGBRow = UYVYToARGBRow_NEON;
+ }
+ }
+#endif
+ for (int y = 0; y < height; ++y) {
+ UYVYToARGBRow(src_uyvy, dst_argb, width);
+ src_uyvy += src_stride_uyvy;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/chromium/third_party/libyuv/source/convert_from.cc b/chromium/third_party/libyuv/source/convert_from.cc
new file mode 100644
index 00000000000..87f9b5cb726
--- /dev/null
+++ b/chromium/third_party/libyuv/source/convert_from.cc
@@ -0,0 +1,1278 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/convert_from.h"
+
+#include "libyuv/basic_types.h"
+#include "libyuv/convert.h" // For I420Copy
+#include "libyuv/cpu_id.h"
+#include "libyuv/format_conversion.h"
+#include "libyuv/planar_functions.h"
+#include "libyuv/rotate.h"
+#include "libyuv/scale.h" // For ScalePlane()
+#include "libyuv/video_common.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+LIBYUV_API
+int I420ToI422(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ if (!src_y || !src_u || !src_v ||
+ !dst_y || !dst_u || !dst_v ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_y = dst_y + (height - 1) * dst_stride_y;
+ dst_u = dst_u + (height - 1) * dst_stride_u;
+ dst_v = dst_v + (height - 1) * dst_stride_v;
+ dst_stride_y = -dst_stride_y;
+ dst_stride_u = -dst_stride_u;
+ dst_stride_v = -dst_stride_v;
+ }
+ int halfwidth = (width + 1) >> 1;
+ void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
+#if defined(HAS_COPYROW_X86)
+ if (IS_ALIGNED(halfwidth, 4)) {
+ CopyRow = CopyRow_X86;
+ }
+#endif
+#if defined(HAS_COPYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(halfwidth, 32) &&
+ IS_ALIGNED(src_u, 16) && IS_ALIGNED(src_stride_u, 16) &&
+ IS_ALIGNED(src_v, 16) && IS_ALIGNED(src_stride_v, 16) &&
+ IS_ALIGNED(dst_u, 16) && IS_ALIGNED(dst_stride_u, 16) &&
+ IS_ALIGNED(dst_v, 16) && IS_ALIGNED(dst_stride_v, 16)) {
+ CopyRow = CopyRow_SSE2;
+ }
+#endif
+#if defined(HAS_COPYROW_ERMS)
+ if (TestCpuFlag(kCpuHasERMS)) {
+ CopyRow = CopyRow_ERMS;
+ }
+#endif
+#if defined(HAS_COPYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(halfwidth, 32)) {
+ CopyRow = CopyRow_NEON;
+ }
+#endif
+#if defined(HAS_COPYROW_MIPS)
+ if (TestCpuFlag(kCpuHasMIPS)) {
+ CopyRow = CopyRow_MIPS;
+ }
+#endif
+
+ // Copy Y plane
+ if (dst_y) {
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ }
+
+ // UpSample U plane.
+ int y;
+ for (y = 0; y < height - 1; y += 2) {
+ CopyRow(src_u, dst_u, halfwidth);
+ CopyRow(src_u, dst_u + dst_stride_u, halfwidth);
+ src_u += src_stride_u;
+ dst_u += dst_stride_u * 2;
+ }
+ if (height & 1) {
+ CopyRow(src_u, dst_u, halfwidth);
+ }
+
+ // UpSample V plane.
+ for (y = 0; y < height - 1; y += 2) {
+ CopyRow(src_v, dst_v, halfwidth);
+ CopyRow(src_v, dst_v + dst_stride_v, halfwidth);
+ src_v += src_stride_v;
+ dst_v += dst_stride_v * 2;
+ }
+ if (height & 1) {
+ CopyRow(src_v, dst_v, halfwidth);
+ }
+ return 0;
+}
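+
+// The chroma upsampling above is vertical replication: each half-height U
+// and V row of the I420 source is copied into two full-height I422 output
+// rows, while the half-width horizontal resolution is shared by both
+// formats and copied as-is.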
+
+// TODO(fbarchard): Enable bilinear filtering when it is fast enough, or add a
+// specialized upsampler.
+LIBYUV_API
+int I420ToI444(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+  if (!src_y || !src_u || !src_v ||
+ !dst_y || !dst_u || !dst_v ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_y = dst_y + (height - 1) * dst_stride_y;
+ dst_u = dst_u + (height - 1) * dst_stride_u;
+ dst_v = dst_v + (height - 1) * dst_stride_v;
+ dst_stride_y = -dst_stride_y;
+ dst_stride_u = -dst_stride_u;
+ dst_stride_v = -dst_stride_v;
+ }
+
+ // Copy Y plane
+ if (dst_y) {
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ }
+
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+
+  // Upsample U plane from 1/2 width, 1/2 height to 1x width, 1x height.
+ ScalePlane(src_u, src_stride_u, halfwidth, halfheight,
+ dst_u, dst_stride_u, width, height,
+ kFilterNone);
+
+ // Upsample V plane.
+ ScalePlane(src_v, src_stride_v, halfwidth, halfheight,
+ dst_v, dst_stride_v, width, height,
+ kFilterNone);
+ return 0;
+}
+
+// 420 chroma is 1/2 width, 1/2 height
+// 411 chroma is 1/4 width, 1x height
+LIBYUV_API
+int I420ToI411(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ if (!src_y || !src_u || !src_v ||
+ !dst_y || !dst_u || !dst_v ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_y = dst_y + (height - 1) * dst_stride_y;
+ dst_u = dst_u + (height - 1) * dst_stride_u;
+ dst_v = dst_v + (height - 1) * dst_stride_v;
+ dst_stride_y = -dst_stride_y;
+ dst_stride_u = -dst_stride_u;
+ dst_stride_v = -dst_stride_v;
+ }
+
+ // Copy Y plane
+ if (dst_y) {
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ }
+
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+ int quarterwidth = (width + 3) >> 2;
+
+ // Resample U plane from 1/2 width, 1/2 height to 1/4 width, 1x height
+ ScalePlane(src_u, src_stride_u, halfwidth, halfheight,
+ dst_u, dst_stride_u, quarterwidth, height,
+ kFilterNone);
+
+ // Resample V plane.
+ ScalePlane(src_v, src_stride_v, halfwidth, halfheight,
+ dst_v, dst_stride_v, quarterwidth, height,
+ kFilterNone);
+ return 0;
+}
+
+// Copy to I400. Source can be I420, I422, I444, I400, NV12 or NV21.
+LIBYUV_API
+int I400Copy(const uint8* src_y, int src_stride_y,
+ uint8* dst_y, int dst_stride_y,
+ int width, int height) {
+ if (!src_y || !dst_y ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_stride_y = -src_stride_y;
+ }
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ return 0;
+}
+
+LIBYUV_API
+int I422ToYUY2(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_yuy2, int dst_stride_yuy2,
+ int width, int height) {
+ if (!src_y || !src_u || !src_v || !dst_yuy2 ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2;
+ dst_stride_yuy2 = -dst_stride_yuy2;
+ }
+ // Coalesce contiguous rows.
+ if (src_stride_y == width &&
+ src_stride_u * 2 == width &&
+ src_stride_v * 2 == width &&
+ dst_stride_yuy2 == width * 2) {
+ return I422ToYUY2(src_y, 0,
+ src_u, 0,
+ src_v, 0,
+ dst_yuy2, 0,
+ width * height, 1);
+ }
+ void (*I422ToYUY2Row)(const uint8* src_y, const uint8* src_u,
+ const uint8* src_v, uint8* dst_yuy2, int width) =
+ I422ToYUY2Row_C;
+#if defined(HAS_I422TOYUY2ROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
+ I422ToYUY2Row = I422ToYUY2Row_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToYUY2Row = I422ToYUY2Row_SSE2;
+ }
+ }
+#elif defined(HAS_I422TOYUY2ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
+ I422ToYUY2Row = I422ToYUY2Row_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToYUY2Row = I422ToYUY2Row_NEON;
+ }
+ }
+#endif
+
+ for (int y = 0; y < height; ++y) {
+ I422ToYUY2Row(src_y, src_u, src_v, dst_yuy2, width);
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ dst_yuy2 += dst_stride_yuy2;
+ }
+ return 0;
+}
+
+LIBYUV_API
+int I420ToYUY2(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_yuy2, int dst_stride_yuy2,
+ int width, int height) {
+ if (!src_y || !src_u || !src_v || !dst_yuy2 ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2;
+ dst_stride_yuy2 = -dst_stride_yuy2;
+ }
+ void (*I422ToYUY2Row)(const uint8* src_y, const uint8* src_u,
+ const uint8* src_v, uint8* dst_yuy2, int width) =
+ I422ToYUY2Row_C;
+#if defined(HAS_I422TOYUY2ROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
+ I422ToYUY2Row = I422ToYUY2Row_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToYUY2Row = I422ToYUY2Row_SSE2;
+ }
+ }
+#elif defined(HAS_I422TOYUY2ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
+ I422ToYUY2Row = I422ToYUY2Row_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToYUY2Row = I422ToYUY2Row_NEON;
+ }
+ }
+#endif
+
+ for (int y = 0; y < height - 1; y += 2) {
+ I422ToYUY2Row(src_y, src_u, src_v, dst_yuy2, width);
+ I422ToYUY2Row(src_y + src_stride_y, src_u, src_v,
+ dst_yuy2 + dst_stride_yuy2, width);
+ src_y += src_stride_y * 2;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ dst_yuy2 += dst_stride_yuy2 * 2;
+ }
+ if (height & 1) {
+ I422ToYUY2Row(src_y, src_u, src_v, dst_yuy2, width);
+ }
+ return 0;
+}
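+
+// For I420 input each U/V row serves two output YUY2 rows (4:2:0 to 4:2:2
+// vertical chroma replication), which is why the chroma pointers advance
+// only once per pair of rows in the loop above.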
+
+LIBYUV_API
+int I422ToUYVY(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_uyvy, int dst_stride_uyvy,
+ int width, int height) {
+ if (!src_y || !src_u || !src_v || !dst_uyvy ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_uyvy = dst_uyvy + (height - 1) * dst_stride_uyvy;
+ dst_stride_uyvy = -dst_stride_uyvy;
+ }
+ // Coalesce contiguous rows.
+ if (src_stride_y == width &&
+ src_stride_u * 2 == width &&
+ src_stride_v * 2 == width &&
+ dst_stride_uyvy == width * 2) {
+ return I422ToUYVY(src_y, 0,
+ src_u, 0,
+ src_v, 0,
+ dst_uyvy, 0,
+ width * height, 1);
+ }
+ void (*I422ToUYVYRow)(const uint8* src_y, const uint8* src_u,
+ const uint8* src_v, uint8* dst_uyvy, int width) =
+ I422ToUYVYRow_C;
+#if defined(HAS_I422TOUYVYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
+ I422ToUYVYRow = I422ToUYVYRow_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToUYVYRow = I422ToUYVYRow_SSE2;
+ }
+ }
+#elif defined(HAS_I422TOUYVYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
+ I422ToUYVYRow = I422ToUYVYRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToUYVYRow = I422ToUYVYRow_NEON;
+ }
+ }
+#endif
+
+ for (int y = 0; y < height; ++y) {
+ I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width);
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ dst_uyvy += dst_stride_uyvy;
+ }
+ return 0;
+}
+
+LIBYUV_API
+int I420ToUYVY(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_uyvy, int dst_stride_uyvy,
+ int width, int height) {
+ if (!src_y || !src_u || !src_v || !dst_uyvy ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_uyvy = dst_uyvy + (height - 1) * dst_stride_uyvy;
+ dst_stride_uyvy = -dst_stride_uyvy;
+ }
+ void (*I422ToUYVYRow)(const uint8* src_y, const uint8* src_u,
+ const uint8* src_v, uint8* dst_uyvy, int width) =
+ I422ToUYVYRow_C;
+#if defined(HAS_I422TOUYVYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
+ I422ToUYVYRow = I422ToUYVYRow_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToUYVYRow = I422ToUYVYRow_SSE2;
+ }
+ }
+#elif defined(HAS_I422TOUYVYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
+ I422ToUYVYRow = I422ToUYVYRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToUYVYRow = I422ToUYVYRow_NEON;
+ }
+ }
+#endif
+
+ for (int y = 0; y < height - 1; y += 2) {
+ I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width);
+ I422ToUYVYRow(src_y + src_stride_y, src_u, src_v,
+ dst_uyvy + dst_stride_uyvy, width);
+ src_y += src_stride_y * 2;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ dst_uyvy += dst_stride_uyvy * 2;
+ }
+ if (height & 1) {
+ I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width);
+ }
+ return 0;
+}
+
+LIBYUV_API
+int I420ToNV12(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_uv, int dst_stride_uv,
+ int width, int height) {
+ if (!src_y || !src_u || !src_v || !dst_y || !dst_uv ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ int halfheight = (height + 1) >> 1;
+ dst_y = dst_y + (height - 1) * dst_stride_y;
+ dst_uv = dst_uv + (halfheight - 1) * dst_stride_uv;
+ dst_stride_y = -dst_stride_y;
+ dst_stride_uv = -dst_stride_uv;
+ }
+ // Coalesce contiguous rows.
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+ if (src_stride_y == width &&
+ dst_stride_y == width) {
+ width = width * height;
+ height = 1;
+ }
+ if (src_stride_u * 2 == width &&
+ src_stride_v * 2 == width &&
+ dst_stride_uv == width) {
+ halfwidth = halfwidth * halfheight;
+ halfheight = 1;
+ }
+ void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+ int width) = MergeUVRow_C;
+#if defined(HAS_MERGEUVROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && halfwidth >= 16) {
+ MergeUVRow_ = MergeUVRow_Any_SSE2;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow_ = MergeUVRow_Unaligned_SSE2;
+ if (IS_ALIGNED(src_u, 16) && IS_ALIGNED(src_stride_u, 16) &&
+ IS_ALIGNED(src_v, 16) && IS_ALIGNED(src_stride_v, 16) &&
+ IS_ALIGNED(dst_uv, 16) && IS_ALIGNED(dst_stride_uv, 16)) {
+ MergeUVRow_ = MergeUVRow_SSE2;
+ }
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2) && halfwidth >= 32) {
+ MergeUVRow_ = MergeUVRow_Any_AVX2;
+ if (IS_ALIGNED(halfwidth, 32)) {
+ MergeUVRow_ = MergeUVRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && halfwidth >= 16) {
+ MergeUVRow_ = MergeUVRow_Any_NEON;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow_ = MergeUVRow_NEON;
+ }
+ }
+#endif
+
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ for (int y = 0; y < halfheight; ++y) {
+ // Merge a row of U and V into a row of UV.
+ MergeUVRow_(src_u, src_v, dst_uv, halfwidth);
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ dst_uv += dst_stride_uv;
+ }
+ return 0;
+}
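+
+// Illustrative call with hypothetical buffers for a 640x480 I420 frame: the
+// interleaved UV plane holds halfwidth U,V pairs per row, so its stride is
+// the full width for even widths, and the merge loop runs for halfheight
+// rows.
+//   I420ToNV12(y, 640, u, 320, v, 320, dst_y, 640, dst_uv, 640, 640, 480);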
+
+LIBYUV_API
+int I420ToNV21(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_vu, int dst_stride_vu,
+ int width, int height) {
+ return I420ToNV12(src_y, src_stride_y,
+ src_v, src_stride_v,
+ src_u, src_stride_u,
+                    dst_y, dst_stride_y,
+ dst_vu, dst_stride_vu,
+ width, height);
+}
+
+// Convert I420 to ARGB.
+LIBYUV_API
+int I420ToARGB(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ if (!src_y || !src_u || !src_v || !dst_argb ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+ void (*I422ToARGBRow)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) = I422ToARGBRow_C;
+#if defined(HAS_I422TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGBRow = I422ToARGBRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ I422ToARGBRow = I422ToARGBRow_SSSE3;
+ }
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2) && width >= 16) {
+ I422ToARGBRow = I422ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToARGBRow = I422ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ I422ToARGBRow = I422ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGBRow = I422ToARGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
+ IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
+ IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
+ IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
+ IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
+ I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2;
+ }
+#endif
+
+ for (int y = 0; y < height; ++y) {
+ I422ToARGBRow(src_y, src_u, src_v, dst_argb, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
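+
+// Illustrative call with hypothetical buffers for a 640x480 I420 frame:
+//   I420ToARGB(y, 640, u, 320, v, 320, argb, 640 * 4, 640, 480);
+// The chroma pointers advance every other output row (the 'y & 1' test
+// above) because I420 chroma is subsampled both horizontally and vertically.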
+
+// Convert I420 to BGRA.
+LIBYUV_API
+int I420ToBGRA(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_bgra, int dst_stride_bgra,
+ int width, int height) {
+ if (!src_y || !src_u || !src_v || !dst_bgra ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_bgra = dst_bgra + (height - 1) * dst_stride_bgra;
+ dst_stride_bgra = -dst_stride_bgra;
+ }
+ void (*I422ToBGRARow)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) = I422ToBGRARow_C;
+#if defined(HAS_I422TOBGRAROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ I422ToBGRARow = I422ToBGRARow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToBGRARow = I422ToBGRARow_Unaligned_SSSE3;
+ if (IS_ALIGNED(dst_bgra, 16) && IS_ALIGNED(dst_stride_bgra, 16)) {
+ I422ToBGRARow = I422ToBGRARow_SSSE3;
+ }
+ }
+ }
+#elif defined(HAS_I422TOBGRAROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ I422ToBGRARow = I422ToBGRARow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToBGRARow = I422ToBGRARow_NEON;
+ }
+ }
+#elif defined(HAS_I422TOBGRAROW_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
+ IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
+ IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
+ IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
+ IS_ALIGNED(dst_bgra, 4) && IS_ALIGNED(dst_stride_bgra, 4)) {
+ I422ToBGRARow = I422ToBGRARow_MIPS_DSPR2;
+ }
+#endif
+
+ for (int y = 0; y < height; ++y) {
+ I422ToBGRARow(src_y, src_u, src_v, dst_bgra, width);
+ dst_bgra += dst_stride_bgra;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
+// Convert I420 to ABGR.
+LIBYUV_API
+int I420ToABGR(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_abgr, int dst_stride_abgr,
+ int width, int height) {
+ if (!src_y || !src_u || !src_v || !dst_abgr ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_abgr = dst_abgr + (height - 1) * dst_stride_abgr;
+ dst_stride_abgr = -dst_stride_abgr;
+ }
+ void (*I422ToABGRRow)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) = I422ToABGRRow_C;
+#if defined(HAS_I422TOABGRROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ I422ToABGRRow = I422ToABGRRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToABGRRow = I422ToABGRRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(dst_abgr, 16) && IS_ALIGNED(dst_stride_abgr, 16)) {
+ I422ToABGRRow = I422ToABGRRow_SSSE3;
+ }
+ }
+ }
+#elif defined(HAS_I422TOABGRROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ I422ToABGRRow = I422ToABGRRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToABGRRow = I422ToABGRRow_NEON;
+ }
+ }
+#endif
+
+ for (int y = 0; y < height; ++y) {
+ I422ToABGRRow(src_y, src_u, src_v, dst_abgr, width);
+ dst_abgr += dst_stride_abgr;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
+// Convert I420 to RGBA.
+LIBYUV_API
+int I420ToRGBA(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_rgba, int dst_stride_rgba,
+ int width, int height) {
+ if (!src_y || !src_u || !src_v || !dst_rgba ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba;
+ dst_stride_rgba = -dst_stride_rgba;
+ }
+ void (*I422ToRGBARow)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) = I422ToRGBARow_C;
+#if defined(HAS_I422TORGBAROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ I422ToRGBARow = I422ToRGBARow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGBARow = I422ToRGBARow_Unaligned_SSSE3;
+ if (IS_ALIGNED(dst_rgba, 16) && IS_ALIGNED(dst_stride_rgba, 16)) {
+ I422ToRGBARow = I422ToRGBARow_SSSE3;
+ }
+ }
+ }
+#elif defined(HAS_I422TORGBAROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ I422ToRGBARow = I422ToRGBARow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGBARow = I422ToRGBARow_NEON;
+ }
+ }
+#endif
+
+ for (int y = 0; y < height; ++y) {
+ I422ToRGBARow(src_y, src_u, src_v, dst_rgba, width);
+ dst_rgba += dst_stride_rgba;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
+// Convert I420 to RGB24.
+LIBYUV_API
+int I420ToRGB24(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_rgb24, int dst_stride_rgb24,
+ int width, int height) {
+ if (!src_y || !src_u || !src_v || !dst_rgb24 ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24;
+ dst_stride_rgb24 = -dst_stride_rgb24;
+ }
+ void (*I422ToRGB24Row)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) = I422ToRGB24Row_C;
+#if defined(HAS_I422TORGB24ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ I422ToRGB24Row = I422ToRGB24Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGB24Row = I422ToRGB24Row_SSSE3;
+ }
+ }
+#elif defined(HAS_I422TORGB24ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ I422ToRGB24Row = I422ToRGB24Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGB24Row = I422ToRGB24Row_NEON;
+ }
+ }
+#endif
+
+ for (int y = 0; y < height; ++y) {
+ I422ToRGB24Row(src_y, src_u, src_v, dst_rgb24, width);
+ dst_rgb24 += dst_stride_rgb24;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
+// Convert I420 to RAW.
+LIBYUV_API
+int I420ToRAW(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_raw, int dst_stride_raw,
+ int width, int height) {
+ if (!src_y || !src_u || !src_v || !dst_raw ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_raw = dst_raw + (height - 1) * dst_stride_raw;
+ dst_stride_raw = -dst_stride_raw;
+ }
+ void (*I422ToRAWRow)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) = I422ToRAWRow_C;
+#if defined(HAS_I422TORAWROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ I422ToRAWRow = I422ToRAWRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRAWRow = I422ToRAWRow_SSSE3;
+ }
+ }
+#elif defined(HAS_I422TORAWROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ I422ToRAWRow = I422ToRAWRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRAWRow = I422ToRAWRow_NEON;
+ }
+ }
+#endif
+
+ for (int y = 0; y < height; ++y) {
+ I422ToRAWRow(src_y, src_u, src_v, dst_raw, width);
+ dst_raw += dst_stride_raw;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
+// Convert I420 to ARGB1555.
+LIBYUV_API
+int I420ToARGB1555(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_argb1555, int dst_stride_argb1555,
+ int width, int height) {
+ if (!src_y || !src_u || !src_v || !dst_argb1555 ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb1555 = dst_argb1555 + (height - 1) * dst_stride_argb1555;
+ dst_stride_argb1555 = -dst_stride_argb1555;
+ }
+ void (*I422ToARGB1555Row)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) = I422ToARGB1555Row_C;
+#if defined(HAS_I422TOARGB1555ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 8 && width * 4 <= kMaxStride) {
+ I422ToARGB1555Row = I422ToARGB1555Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGB1555Row = I422ToARGB1555Row_SSSE3;
+ }
+ }
+#elif defined(HAS_I422TOARGB1555ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ I422ToARGB1555Row = I422ToARGB1555Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGB1555Row = I422ToARGB1555Row_NEON;
+ }
+ }
+#endif
+
+ for (int y = 0; y < height; ++y) {
+ I422ToARGB1555Row(src_y, src_u, src_v, dst_argb1555, width);
+ dst_argb1555 += dst_stride_argb1555;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
+// Convert I420 to ARGB4444.
+LIBYUV_API
+int I420ToARGB4444(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_argb4444, int dst_stride_argb4444,
+ int width, int height) {
+ if (!src_y || !src_u || !src_v || !dst_argb4444 ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb4444 = dst_argb4444 + (height - 1) * dst_stride_argb4444;
+ dst_stride_argb4444 = -dst_stride_argb4444;
+ }
+ void (*I422ToARGB4444Row)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) = I422ToARGB4444Row_C;
+#if defined(HAS_I422TOARGB4444ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 8 && width * 4 <= kMaxStride) {
+ I422ToARGB4444Row = I422ToARGB4444Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGB4444Row = I422ToARGB4444Row_SSSE3;
+ }
+ }
+#elif defined(HAS_I422TOARGB4444ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ I422ToARGB4444Row = I422ToARGB4444Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGB4444Row = I422ToARGB4444Row_NEON;
+ }
+ }
+#endif
+
+ for (int y = 0; y < height; ++y) {
+ I422ToARGB4444Row(src_y, src_u, src_v, dst_argb4444, width);
+ dst_argb4444 += dst_stride_argb4444;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
+// Convert I420 to RGB565.
+LIBYUV_API
+int I420ToRGB565(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_rgb565, int dst_stride_rgb565,
+ int width, int height) {
+ if (!src_y || !src_u || !src_v || !dst_rgb565 ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
+ dst_stride_rgb565 = -dst_stride_rgb565;
+ }
+ void (*I422ToRGB565Row)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) = I422ToRGB565Row_C;
+#if defined(HAS_I422TORGB565ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 8
+#if defined(__x86_64__) || defined(__i386__)
+ && width * 4 <= kMaxStride
+#endif
+ ) {
+ I422ToRGB565Row = I422ToRGB565Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGB565Row = I422ToRGB565Row_SSSE3;
+ }
+ }
+#elif defined(HAS_I422TORGB565ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ I422ToRGB565Row = I422ToRGB565Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGB565Row = I422ToRGB565Row_NEON;
+ }
+ }
+#endif
+
+ for (int y = 0; y < height; ++y) {
+ I422ToRGB565Row(src_y, src_u, src_v, dst_rgb565, width);
+ dst_rgb565 += dst_stride_rgb565;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
+// Convert I420 to specified format.
+LIBYUV_API
+int ConvertFromI420(const uint8* y, int y_stride,
+ const uint8* u, int u_stride,
+ const uint8* v, int v_stride,
+ uint8* dst_sample, int dst_sample_stride,
+ int width, int height,
+ uint32 fourcc) {
+ uint32 format = CanonicalFourCC(fourcc);
+  if (!y || !u || !v || !dst_sample ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ int r = 0;
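+  // A dst_sample_stride of 0 selects a packed default stride for the chosen
+  // format (e.g. width * 2 for YUY2, width * 3 for RGB24, width * 4 for ARGB).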
+ switch (format) {
+ // Single plane formats
+ case FOURCC_YUY2:
+ r = I420ToYUY2(y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 2,
+ width, height);
+ break;
+ case FOURCC_UYVY:
+ r = I420ToUYVY(y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 2,
+ width, height);
+ break;
+ case FOURCC_RGBP:
+ r = I420ToRGB565(y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 2,
+ width, height);
+ break;
+ case FOURCC_RGBO:
+ r = I420ToARGB1555(y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 2,
+ width, height);
+ break;
+ case FOURCC_R444:
+ r = I420ToARGB4444(y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 2,
+ width, height);
+ break;
+ case FOURCC_24BG:
+ r = I420ToRGB24(y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 3,
+ width, height);
+ break;
+ case FOURCC_RAW:
+ r = I420ToRAW(y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 3,
+ width, height);
+ break;
+ case FOURCC_ARGB:
+ r = I420ToARGB(y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 4,
+ width, height);
+ break;
+ case FOURCC_BGRA:
+ r = I420ToBGRA(y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 4,
+ width, height);
+ break;
+ case FOURCC_ABGR:
+ r = I420ToABGR(y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 4,
+ width, height);
+ break;
+ case FOURCC_RGBA:
+ r = I420ToRGBA(y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 4,
+ width, height);
+ break;
+ case FOURCC_BGGR:
+ r = I420ToBayerBGGR(y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_sample,
+ dst_sample_stride ? dst_sample_stride : width,
+ width, height);
+ break;
+ case FOURCC_GBRG:
+ r = I420ToBayerGBRG(y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_sample,
+ dst_sample_stride ? dst_sample_stride : width,
+ width, height);
+ break;
+ case FOURCC_GRBG:
+ r = I420ToBayerGRBG(y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_sample,
+ dst_sample_stride ? dst_sample_stride : width,
+ width, height);
+ break;
+ case FOURCC_RGGB:
+ r = I420ToBayerRGGB(y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_sample,
+ dst_sample_stride ? dst_sample_stride : width,
+ width, height);
+ break;
+ case FOURCC_I400:
+ r = I400Copy(y, y_stride,
+ dst_sample,
+ dst_sample_stride ? dst_sample_stride : width,
+ width, height);
+ break;
+ case FOURCC_NV12: {
+ uint8* dst_uv = dst_sample + width * height;
+ r = I420ToNV12(y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_sample,
+ dst_sample_stride ? dst_sample_stride : width,
+ dst_uv,
+ dst_sample_stride ? dst_sample_stride : width,
+ width, height);
+ break;
+ }
+ case FOURCC_NV21: {
+ uint8* dst_vu = dst_sample + width * height;
+ r = I420ToNV21(y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_sample,
+ dst_sample_stride ? dst_sample_stride : width,
+ dst_vu,
+ dst_sample_stride ? dst_sample_stride : width,
+ width, height);
+ break;
+ }
+ // TODO(fbarchard): Add M420 and Q420.
+ // Triplanar formats
+ // TODO(fbarchard): halfstride instead of halfwidth
+ case FOURCC_I420:
+ case FOURCC_YU12:
+ case FOURCC_YV12: {
+ int halfwidth = (width + 1) / 2;
+ int halfheight = (height + 1) / 2;
+ uint8* dst_u;
+ uint8* dst_v;
+ if (format == FOURCC_YV12) {
+ dst_v = dst_sample + width * height;
+ dst_u = dst_v + halfwidth * halfheight;
+ } else {
+ dst_u = dst_sample + width * height;
+ dst_v = dst_u + halfwidth * halfheight;
+ }
+ r = I420Copy(y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_sample, width,
+ dst_u, halfwidth,
+ dst_v, halfwidth,
+ width, height);
+ break;
+ }
+ case FOURCC_I422:
+ case FOURCC_YV16: {
+ int halfwidth = (width + 1) / 2;
+ uint8* dst_u;
+ uint8* dst_v;
+ if (format == FOURCC_YV16) {
+ dst_v = dst_sample + width * height;
+ dst_u = dst_v + halfwidth * height;
+ } else {
+ dst_u = dst_sample + width * height;
+ dst_v = dst_u + halfwidth * height;
+ }
+ r = I420ToI422(y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_sample, width,
+ dst_u, halfwidth,
+ dst_v, halfwidth,
+ width, height);
+ break;
+ }
+ case FOURCC_I444:
+ case FOURCC_YV24: {
+ uint8* dst_u;
+ uint8* dst_v;
+ if (format == FOURCC_YV24) {
+ dst_v = dst_sample + width * height;
+ dst_u = dst_v + width * height;
+ } else {
+ dst_u = dst_sample + width * height;
+ dst_v = dst_u + width * height;
+ }
+ r = I420ToI444(y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_sample, width,
+ dst_u, width,
+ dst_v, width,
+ width, height);
+ break;
+ }
+ case FOURCC_I411: {
+ int quarterwidth = (width + 3) / 4;
+ uint8* dst_u = dst_sample + width * height;
+ uint8* dst_v = dst_u + quarterwidth * height;
+ r = I420ToI411(y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_sample, width,
+ dst_u, quarterwidth,
+ dst_v, quarterwidth,
+ width, height);
+ break;
+ }
+
+    // Formats not supported - MJPG, M420, Q420, some rgb formats.
+ default:
+ return -1; // unknown fourcc - return failure code.
+ }
+ return r;
+}
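+
+// Editor's note: illustrative usage sketch, not part of the upstream source;
+// the y/u/v/yuy2 buffer names below are hypothetical.
+//   uint8* yuy2 = new uint8[width * 2 * height];
+//   int rv = ConvertFromI420(y, width,                  // Y plane, packed.
+//                            u, (width + 1) / 2,        // I420 chroma stride.
+//                            v, (width + 1) / 2,
+//                            yuy2, 0,                   // 0 = default stride.
+//                            width, height, FOURCC_YUY2);
+//   // rv is 0 on success and -1 on bad arguments or an unknown fourcc.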
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/chromium/third_party/libyuv/source/convert_from_argb.cc b/chromium/third_party/libyuv/source/convert_from_argb.cc
new file mode 100644
index 00000000000..418f44d0cf5
--- /dev/null
+++ b/chromium/third_party/libyuv/source/convert_from_argb.cc
@@ -0,0 +1,1100 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/convert_from_argb.h"
+
+#include "libyuv/basic_types.h"
+#include "libyuv/cpu_id.h"
+#include "libyuv/format_conversion.h"
+#include "libyuv/planar_functions.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// ARGB little endian (bgra in memory) to I444
+LIBYUV_API
+int ARGBToI444(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
+ return -1;
+ }
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ // Coalesce contiguous rows.
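+  // (With no per-row padding the planes can be treated as a single row of
+  // width * height pixels, so recurse once with height 1.)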
+ if (src_stride_argb == width * 4 &&
+ dst_stride_y == width &&
+ dst_stride_u == width &&
+ dst_stride_v == width) {
+ return ARGBToI444(src_argb, 0,
+ dst_y, 0,
+ dst_u, 0,
+ dst_v, 0,
+ width * height, 1);
+ }
+ void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+ ARGBToYRow_C;
+ void (*ARGBToUV444Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+ int pix) = ARGBToUV444Row_C;
+#if defined(HAS_ARGBTOUV444ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ ARGBToUV444Row = ARGBToUV444Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUV444Row = ARGBToUV444Row_Unaligned_SSSE3;
+ if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
+ ARGBToUV444Row = ARGBToUV444Row_SSSE3;
+ }
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ ARGBToYRow = ARGBToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
+ IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+ ARGBToYRow = ARGBToYRow_SSSE3;
+ }
+ }
+ }
+#elif defined(HAS_ARGBTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ ARGBToYRow = ARGBToYRow_Any_NEON;
+ ARGBToUV444Row = ARGBToUV444Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYRow = ARGBToYRow_NEON;
+ ARGBToUV444Row = ARGBToUV444Row_NEON;
+ }
+ }
+#endif
+
+ for (int y = 0; y < height; ++y) {
+ ARGBToUV444Row(src_argb, dst_u, dst_v, width);
+ ARGBToYRow(src_argb, dst_y, width);
+ src_argb += src_stride_argb;
+ dst_y += dst_stride_y;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ return 0;
+}
+
+// ARGB little endian (bgra in memory) to I422
+LIBYUV_API
+int ARGBToI422(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
+ return -1;
+ }
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ // Coalesce contiguous rows.
+ if (src_stride_argb == width * 4 &&
+ dst_stride_y == width &&
+ dst_stride_u * 2 == width &&
+ dst_stride_v * 2 == width) {
+ return ARGBToI422(src_argb, 0,
+ dst_y, 0,
+ dst_u, 0,
+ dst_v, 0,
+ width * height, 1);
+ }
+ void (*ARGBToUV422Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+ int pix) = ARGBToUV422Row_C;
+#if defined(HAS_ARGBTOUV422ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ ARGBToUV422Row = ARGBToUV422Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUV422Row = ARGBToUV422Row_Unaligned_SSSE3;
+ if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
+ ARGBToUV422Row = ARGBToUV422Row_SSSE3;
+ }
+ }
+ }
+#endif
+
+ void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+ ARGBToYRow_C;
+#if defined(HAS_ARGBTOYROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ ARGBToYRow = ARGBToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
+ IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+ ARGBToYRow = ARGBToYRow_SSSE3;
+ }
+ }
+ }
+#elif defined(HAS_ARGBTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ ARGBToYRow = ARGBToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYRow = ARGBToYRow_NEON;
+ }
+ if (width >= 16) {
+ ARGBToUV422Row = ARGBToUV422Row_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUV422Row = ARGBToUV422Row_NEON;
+ }
+ }
+ }
+#endif
+
+ for (int y = 0; y < height; ++y) {
+ ARGBToUV422Row(src_argb, dst_u, dst_v, width);
+ ARGBToYRow(src_argb, dst_y, width);
+ src_argb += src_stride_argb;
+ dst_y += dst_stride_y;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ return 0;
+}
+
+// ARGB little endian (bgra in memory) to I411
+LIBYUV_API
+int ARGBToI411(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
+ return -1;
+ }
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ // Coalesce contiguous rows.
+ if (src_stride_argb == width * 4 &&
+ dst_stride_y == width &&
+ dst_stride_u * 4 == width &&
+ dst_stride_v * 4 == width) {
+ return ARGBToI411(src_argb, 0,
+ dst_y, 0,
+ dst_u, 0,
+ dst_v, 0,
+ width * height, 1);
+ }
+ void (*ARGBToUV411Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+ int pix) = ARGBToUV411Row_C;
+ void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+ ARGBToYRow_C;
+#if defined(HAS_ARGBTOYROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ ARGBToYRow = ARGBToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
+ IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+ ARGBToYRow = ARGBToYRow_SSSE3;
+ }
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
+ ARGBToYRow = ARGBToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToYRow = ARGBToYRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ ARGBToYRow = ARGBToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYRow = ARGBToYRow_NEON;
+ }
+ if (width >= 32) {
+ ARGBToUV411Row = ARGBToUV411Row_Any_NEON;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUV411Row = ARGBToUV411Row_NEON;
+ }
+ }
+ }
+#endif
+
+ for (int y = 0; y < height; ++y) {
+ ARGBToUV411Row(src_argb, dst_u, dst_v, width);
+ ARGBToYRow(src_argb, dst_y, width);
+ src_argb += src_stride_argb;
+ dst_y += dst_stride_y;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ return 0;
+}
+
+LIBYUV_API
+int ARGBToNV12(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_uv, int dst_stride_uv,
+ int width, int height) {
+ if (!src_argb ||
+ !dst_y || !dst_uv ||
+ width <= 0 || height == 0 ||
+ width > kMaxStride) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+ void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+ ARGBToYRow_C;
+#if defined(HAS_ARGBTOYROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+ ARGBToYRow = ARGBToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_Unaligned_SSSE3;
+ ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+ ARGBToYRow = ARGBToYRow_SSSE3;
+ }
+ }
+ }
+ }
+#elif defined(HAS_ARGBTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ ARGBToYRow = ARGBToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYRow = ARGBToYRow_NEON;
+ }
+ if (width >= 16) {
+ ARGBToUVRow = ARGBToUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_NEON;
+ }
+ }
+ }
+#endif
+ int halfwidth = (width + 1) >> 1;
+ void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+ int width) = MergeUVRow_C;
+#if defined(HAS_MERGEUVROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && halfwidth >= 16) {
+ MergeUVRow_ = MergeUVRow_Any_SSE2;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow_ = MergeUVRow_Unaligned_SSE2;
+ if (IS_ALIGNED(dst_uv, 16) && IS_ALIGNED(dst_stride_uv, 16)) {
+ MergeUVRow_ = MergeUVRow_SSE2;
+ }
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2) && halfwidth >= 32) {
+ MergeUVRow_ = MergeUVRow_Any_AVX2;
+ if (IS_ALIGNED(halfwidth, 32)) {
+ MergeUVRow_ = MergeUVRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && halfwidth >= 16) {
+ MergeUVRow_ = MergeUVRow_Any_NEON;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow_ = MergeUVRow_NEON;
+ }
+ }
+#endif
+
+ SIMD_ALIGNED(uint8 row_u[kMaxStride / 2]);
+ SIMD_ALIGNED(uint8 row_v[kMaxStride / 2]);
+
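+  // Each pass converts two ARGB rows: ARGBToUVRow subsamples the pair into
+  // the temporary row_u/row_v buffers and MergeUVRow_ interleaves them into
+  // one NV12 chroma row; the two luma rows are written directly.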
+ for (int y = 0; y < height - 1; y += 2) {
+ ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width);
+ MergeUVRow_(row_u, row_v, dst_uv, halfwidth);
+ ARGBToYRow(src_argb, dst_y, width);
+ ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width);
+ src_argb += src_stride_argb * 2;
+ dst_y += dst_stride_y * 2;
+ dst_uv += dst_stride_uv;
+ }
+ if (height & 1) {
+ ARGBToUVRow(src_argb, 0, row_u, row_v, width);
+ MergeUVRow_(row_u, row_v, dst_uv, halfwidth);
+ ARGBToYRow(src_argb, dst_y, width);
+ }
+ return 0;
+}
+
+// Same as NV12 but U and V swapped.
+LIBYUV_API
+int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_uv, int dst_stride_uv,
+ int width, int height) {
+ if (!src_argb ||
+ !dst_y || !dst_uv ||
+ width <= 0 || height == 0 ||
+ width > kMaxStride) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+ void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+ ARGBToYRow_C;
+#if defined(HAS_ARGBTOYROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+ ARGBToYRow = ARGBToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_Unaligned_SSSE3;
+ ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+ ARGBToYRow = ARGBToYRow_SSSE3;
+ }
+ }
+ }
+ }
+#elif defined(HAS_ARGBTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ ARGBToYRow = ARGBToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYRow = ARGBToYRow_NEON;
+ }
+ if (width >= 16) {
+ ARGBToUVRow = ARGBToUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_NEON;
+ }
+ }
+ }
+#endif
+ int halfwidth = (width + 1) >> 1;
+ void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+ int width) = MergeUVRow_C;
+#if defined(HAS_MERGEUVROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && halfwidth >= 16) {
+ MergeUVRow_ = MergeUVRow_Any_SSE2;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow_ = MergeUVRow_Unaligned_SSE2;
+ if (IS_ALIGNED(dst_uv, 16) && IS_ALIGNED(dst_stride_uv, 16)) {
+ MergeUVRow_ = MergeUVRow_SSE2;
+ }
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2) && halfwidth >= 32) {
+ MergeUVRow_ = MergeUVRow_Any_AVX2;
+ if (IS_ALIGNED(halfwidth, 32)) {
+ MergeUVRow_ = MergeUVRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && halfwidth >= 16) {
+ MergeUVRow_ = MergeUVRow_Any_NEON;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow_ = MergeUVRow_NEON;
+ }
+ }
+#endif
+
+ SIMD_ALIGNED(uint8 row_u[kMaxStride / 2]);
+ SIMD_ALIGNED(uint8 row_v[kMaxStride / 2]);
+
+ for (int y = 0; y < height - 1; y += 2) {
+ ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width);
+ MergeUVRow_(row_v, row_u, dst_uv, halfwidth);
+ ARGBToYRow(src_argb, dst_y, width);
+ ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width);
+ src_argb += src_stride_argb * 2;
+ dst_y += dst_stride_y * 2;
+ dst_uv += dst_stride_uv;
+ }
+ if (height & 1) {
+ ARGBToUVRow(src_argb, 0, row_u, row_v, width);
+ MergeUVRow_(row_v, row_u, dst_uv, halfwidth);
+ ARGBToYRow(src_argb, dst_y, width);
+ }
+ return 0;
+}
+
+// Convert ARGB to YUY2.
+LIBYUV_API
+int ARGBToYUY2(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_yuy2, int dst_stride_yuy2,
+ int width, int height) {
+ if (!src_argb || !dst_yuy2 ||
+ width <= 0 || height == 0 ||
+ width > kMaxStride) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2;
+ dst_stride_yuy2 = -dst_stride_yuy2;
+ }
+ // Coalesce contiguous rows.
+ if (width * height <= kMaxStride &&
+ src_stride_argb == width * 4 &&
+ dst_stride_yuy2 == width * 2) {
+ return ARGBToYUY2(src_argb, 0,
+ dst_yuy2, 0,
+ width * height, 1);
+ }
+ void (*ARGBToUV422Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+ int pix) = ARGBToUV422Row_C;
+#if defined(HAS_ARGBTOUV422ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ ARGBToUV422Row = ARGBToUV422Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUV422Row = ARGBToUV422Row_Unaligned_SSSE3;
+ if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
+ ARGBToUV422Row = ARGBToUV422Row_SSSE3;
+ }
+ }
+ }
+#endif
+ void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+ ARGBToYRow_C;
+#if defined(HAS_ARGBTOYROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ ARGBToYRow = ARGBToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
+ ARGBToYRow = ARGBToYRow_SSSE3;
+ }
+ }
+ }
+#elif defined(HAS_ARGBTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ ARGBToYRow = ARGBToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYRow = ARGBToYRow_NEON;
+ }
+ if (width >= 16) {
+ ARGBToUV422Row = ARGBToUV422Row_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUV422Row = ARGBToUV422Row_NEON;
+ }
+ }
+ }
+#endif
+
+ void (*I422ToYUY2Row)(const uint8* src_y, const uint8* src_u,
+ const uint8* src_v, uint8* dst_yuy2, int width) =
+ I422ToYUY2Row_C;
+#if defined(HAS_I422TOYUY2ROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
+ I422ToYUY2Row = I422ToYUY2Row_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToYUY2Row = I422ToYUY2Row_SSE2;
+ }
+ }
+#elif defined(HAS_I422TOYUY2ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
+ I422ToYUY2Row = I422ToYUY2Row_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToYUY2Row = I422ToYUY2Row_NEON;
+ }
+ }
+#endif
+ SIMD_ALIGNED(uint8 row_y[kMaxStride]);
+ SIMD_ALIGNED(uint8 row_u[kMaxStride / 2]);
+ SIMD_ALIGNED(uint8 row_v[kMaxStride / 2]);
+
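+  // Each ARGB row is split into temporary planar Y/U/V rows, then packed
+  // into YUY2 by I422ToYUY2Row.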
+ for (int y = 0; y < height; ++y) {
+ ARGBToUV422Row(src_argb, row_u, row_v, width);
+ ARGBToYRow(src_argb, row_y, width);
+ I422ToYUY2Row(row_y, row_u, row_v, dst_yuy2, width);
+ src_argb += src_stride_argb;
+ dst_yuy2 += dst_stride_yuy2;
+ }
+ return 0;
+}
+
+// Convert ARGB to UYVY.
+LIBYUV_API
+int ARGBToUYVY(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_uyvy, int dst_stride_uyvy,
+ int width, int height) {
+ if (!src_argb || !dst_uyvy ||
+ width <= 0 || height == 0 ||
+ width > kMaxStride) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_uyvy = dst_uyvy + (height - 1) * dst_stride_uyvy;
+ dst_stride_uyvy = -dst_stride_uyvy;
+ }
+ // Coalesce contiguous rows.
+ if (width * height <= kMaxStride &&
+ src_stride_argb == width * 4 &&
+ dst_stride_uyvy == width * 2) {
+ return ARGBToUYVY(src_argb, 0,
+ dst_uyvy, 0,
+ width * height, 1);
+ }
+ void (*ARGBToUV422Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+ int pix) = ARGBToUV422Row_C;
+#if defined(HAS_ARGBTOUV422ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ ARGBToUV422Row = ARGBToUV422Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUV422Row = ARGBToUV422Row_Unaligned_SSSE3;
+ if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
+ ARGBToUV422Row = ARGBToUV422Row_SSSE3;
+ }
+ }
+ }
+#endif
+ void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+ ARGBToYRow_C;
+#if defined(HAS_ARGBTOYROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ ARGBToYRow = ARGBToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
+ ARGBToYRow = ARGBToYRow_SSSE3;
+ }
+ }
+ }
+#elif defined(HAS_ARGBTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ ARGBToYRow = ARGBToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYRow = ARGBToYRow_NEON;
+ }
+ if (width >= 16) {
+ ARGBToUV422Row = ARGBToUV422Row_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUV422Row = ARGBToUV422Row_NEON;
+ }
+ }
+ }
+#endif
+
+ void (*I422ToUYVYRow)(const uint8* src_y, const uint8* src_u,
+ const uint8* src_v, uint8* dst_uyvy, int width) =
+ I422ToUYVYRow_C;
+#if defined(HAS_I422TOUYVYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
+ I422ToUYVYRow = I422ToUYVYRow_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToUYVYRow = I422ToUYVYRow_SSE2;
+ }
+ }
+#elif defined(HAS_I422TOUYVYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
+ I422ToUYVYRow = I422ToUYVYRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToUYVYRow = I422ToUYVYRow_NEON;
+ }
+ }
+#endif
+ SIMD_ALIGNED(uint8 row_y[kMaxStride]);
+ SIMD_ALIGNED(uint8 row_u[kMaxStride / 2]);
+ SIMD_ALIGNED(uint8 row_v[kMaxStride / 2]);
+
+ for (int y = 0; y < height; ++y) {
+ ARGBToUV422Row(src_argb, row_u, row_v, width);
+ ARGBToYRow(src_argb, row_y, width);
+ I422ToUYVYRow(row_y, row_u, row_v, dst_uyvy, width);
+ src_argb += src_stride_argb;
+ dst_uyvy += dst_stride_uyvy;
+ }
+ return 0;
+}
+
+// Convert ARGB to I400.
+LIBYUV_API
+int ARGBToI400(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_y, int dst_stride_y,
+ int width, int height) {
+ if (!src_argb || !dst_y || width <= 0 || height == 0) {
+ return -1;
+ }
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ // Coalesce contiguous rows.
+ if (src_stride_argb == width * 4 &&
+ dst_stride_y == width) {
+ return ARGBToI400(src_argb, 0,
+ dst_y, 0,
+ width * height, 1);
+ }
+ void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+ ARGBToYRow_C;
+#if defined(HAS_ARGBTOYROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ ARGBToYRow = ARGBToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
+ IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+ ARGBToYRow = ARGBToYRow_SSSE3;
+ }
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
+ ARGBToYRow = ARGBToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToYRow = ARGBToYRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ ARGBToYRow = ARGBToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYRow = ARGBToYRow_NEON;
+ }
+ }
+#endif
+
+ for (int y = 0; y < height; ++y) {
+ ARGBToYRow(src_argb, dst_y, width);
+ src_argb += src_stride_argb;
+ dst_y += dst_stride_y;
+ }
+ return 0;
+}
+
+// Shuffle table for converting ARGB to RGBA.
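+// (Moves byte 3 of every 4-byte pixel to byte 0, shifting the rest up by one.)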
+static const uvec8 kShuffleMaskARGBToRGBA = {
+ 3u, 0u, 1u, 2u, 7u, 4u, 5u, 6u, 11u, 8u, 9u, 10u, 15u, 12u, 13u, 14u
+};
+
+// Convert ARGB to RGBA.
+LIBYUV_API
+int ARGBToRGBA(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_rgba, int dst_stride_rgba,
+ int width, int height) {
+ return ARGBShuffle(src_argb, src_stride_argb,
+ dst_rgba, dst_stride_rgba,
+ reinterpret_cast<const uint8*>(&kShuffleMaskARGBToRGBA),
+ width, height);
+}
+
+// Convert ARGB To RGB24.
+LIBYUV_API
+int ARGBToRGB24(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_rgb24, int dst_stride_rgb24,
+ int width, int height) {
+ if (!src_argb || !dst_rgb24 || width <= 0 || height == 0) {
+ return -1;
+ }
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ // Coalesce contiguous rows.
+ if (src_stride_argb == width * 4 &&
+ dst_stride_rgb24 == width * 3) {
+ return ARGBToRGB24(src_argb, 0,
+ dst_rgb24, 0,
+ width * height, 1);
+ }
+ void (*ARGBToRGB24Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =
+ ARGBToRGB24Row_C;
+#if defined(HAS_ARGBTORGB24ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 16 &&
+ IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
+ IS_ALIGNED(dst_rgb24, 16) && IS_ALIGNED(dst_stride_rgb24, 16)) {
+ ARGBToRGB24Row = ARGBToRGB24Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToRGB24Row = ARGBToRGB24Row_SSSE3;
+ }
+ }
+#elif defined(HAS_ARGBTORGB24ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ ARGBToRGB24Row = ARGBToRGB24Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToRGB24Row = ARGBToRGB24Row_NEON;
+ }
+ }
+#endif
+
+ for (int y = 0; y < height; ++y) {
+ ARGBToRGB24Row(src_argb, dst_rgb24, width);
+ src_argb += src_stride_argb;
+ dst_rgb24 += dst_stride_rgb24;
+ }
+ return 0;
+}
+
+// Convert ARGB To RAW.
+LIBYUV_API
+int ARGBToRAW(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_raw, int dst_stride_raw,
+ int width, int height) {
+ if (!src_argb || !dst_raw || width <= 0 || height == 0) {
+ return -1;
+ }
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ // Coalesce contiguous rows.
+ if (src_stride_argb == width * 4 &&
+ dst_stride_raw == width * 3) {
+ return ARGBToRAW(src_argb, 0,
+ dst_raw, 0,
+ width * height, 1);
+ }
+ void (*ARGBToRAWRow)(const uint8* src_argb, uint8* dst_rgb, int pix) =
+ ARGBToRAWRow_C;
+#if defined(HAS_ARGBTORAWROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 16 &&
+ IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
+ IS_ALIGNED(dst_raw, 16) && IS_ALIGNED(dst_stride_raw, 16)) {
+ ARGBToRAWRow = ARGBToRAWRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToRAWRow = ARGBToRAWRow_SSSE3;
+ }
+ }
+#elif defined(HAS_ARGBTORAWROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ ARGBToRAWRow = ARGBToRAWRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToRAWRow = ARGBToRAWRow_NEON;
+ }
+ }
+#endif
+
+ for (int y = 0; y < height; ++y) {
+ ARGBToRAWRow(src_argb, dst_raw, width);
+ src_argb += src_stride_argb;
+ dst_raw += dst_stride_raw;
+ }
+ return 0;
+}
+
+// Convert ARGB To RGB565.
+LIBYUV_API
+int ARGBToRGB565(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_rgb565, int dst_stride_rgb565,
+ int width, int height) {
+ if (!src_argb || !dst_rgb565 || width <= 0 || height == 0) {
+ return -1;
+ }
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ // Coalesce contiguous rows.
+ if (src_stride_argb == width * 4 &&
+ dst_stride_rgb565 == width * 2) {
+ return ARGBToRGB565(src_argb, 0,
+ dst_rgb565, 0,
+ width * height, 1);
+ }
+ void (*ARGBToRGB565Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =
+ ARGBToRGB565Row_C;
+#if defined(HAS_ARGBTORGB565ROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && width >= 4 &&
+ IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
+ ARGBToRGB565Row = ARGBToRGB565Row_Any_SSE2;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBToRGB565Row = ARGBToRGB565Row_SSE2;
+ }
+ }
+#elif defined(HAS_ARGBTORGB565ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ ARGBToRGB565Row = ARGBToRGB565Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToRGB565Row = ARGBToRGB565Row_NEON;
+ }
+ }
+#endif
+
+ for (int y = 0; y < height; ++y) {
+ ARGBToRGB565Row(src_argb, dst_rgb565, width);
+ src_argb += src_stride_argb;
+ dst_rgb565 += dst_stride_rgb565;
+ }
+ return 0;
+}
+
+// Convert ARGB To ARGB1555.
+LIBYUV_API
+int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb1555, int dst_stride_argb1555,
+ int width, int height) {
+ if (!src_argb || !dst_argb1555 || width <= 0 || height == 0) {
+ return -1;
+ }
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ // Coalesce contiguous rows.
+ if (src_stride_argb == width * 4 &&
+ dst_stride_argb1555 == width * 2) {
+ return ARGBToARGB1555(src_argb, 0,
+ dst_argb1555, 0,
+ width * height, 1);
+ }
+ void (*ARGBToARGB1555Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =
+ ARGBToARGB1555Row_C;
+#if defined(HAS_ARGBTOARGB1555ROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && width >= 4 &&
+ IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
+ ARGBToARGB1555Row = ARGBToARGB1555Row_Any_SSE2;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBToARGB1555Row = ARGBToARGB1555Row_SSE2;
+ }
+ }
+#elif defined(HAS_ARGBTOARGB1555ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ ARGBToARGB1555Row = ARGBToARGB1555Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToARGB1555Row = ARGBToARGB1555Row_NEON;
+ }
+ }
+#endif
+
+ for (int y = 0; y < height; ++y) {
+ ARGBToARGB1555Row(src_argb, dst_argb1555, width);
+ src_argb += src_stride_argb;
+ dst_argb1555 += dst_stride_argb1555;
+ }
+ return 0;
+}
+
+// Convert ARGB To ARGB4444.
+LIBYUV_API
+int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb4444, int dst_stride_argb4444,
+ int width, int height) {
+ if (!src_argb || !dst_argb4444 || width <= 0 || height == 0) {
+ return -1;
+ }
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ // Coalesce contiguous rows.
+ if (src_stride_argb == width * 4 &&
+ dst_stride_argb4444 == width * 2) {
+ return ARGBToARGB4444(src_argb, 0,
+ dst_argb4444, 0,
+ width * height, 1);
+ }
+ void (*ARGBToARGB4444Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =
+ ARGBToARGB4444Row_C;
+#if defined(HAS_ARGBTOARGB4444ROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && width >= 4 &&
+ IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
+ ARGBToARGB4444Row = ARGBToARGB4444Row_Any_SSE2;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBToARGB4444Row = ARGBToARGB4444Row_SSE2;
+ }
+ }
+#elif defined(HAS_ARGBTOARGB4444ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ ARGBToARGB4444Row = ARGBToARGB4444Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToARGB4444Row = ARGBToARGB4444Row_NEON;
+ }
+ }
+#endif
+
+ for (int y = 0; y < height; ++y) {
+ ARGBToARGB4444Row(src_argb, dst_argb4444, width);
+ src_argb += src_stride_argb;
+ dst_argb4444 += dst_stride_argb4444;
+ }
+ return 0;
+}
+
+// Convert ARGB to J420 (JPEG full-range I420).
+LIBYUV_API
+int ARGBToJ420(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_yj, int dst_stride_yj,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ if (!src_argb ||
+ !dst_yj || !dst_u || !dst_v ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ void (*ARGBToUVJRow)(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) = ARGBToUVJRow_C;
+ void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int pix) =
+ ARGBToYJRow_C;
+#if defined(HAS_ARGBTOYJROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3;
+ ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVJRow = ARGBToUVJRow_Unaligned_SSSE3;
+ ARGBToYJRow = ARGBToYJRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
+ ARGBToUVJRow = ARGBToUVJRow_SSSE3;
+ if (IS_ALIGNED(dst_yj, 16) && IS_ALIGNED(dst_stride_yj, 16)) {
+ ARGBToYJRow = ARGBToYJRow_SSSE3;
+ }
+ }
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYJROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
+ ARGBToYJRow = ARGBToYJRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToYJRow = ARGBToYJRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYJROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ ARGBToYJRow = ARGBToYJRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYJRow = ARGBToYJRow_NEON;
+ }
+ if (width >= 16) {
+ ARGBToUVJRow = ARGBToUVJRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVJRow = ARGBToUVJRow_NEON;
+ }
+ }
+ }
+#endif
+
+ for (int y = 0; y < height - 1; y += 2) {
+ ARGBToUVJRow(src_argb, src_stride_argb, dst_u, dst_v, width);
+ ARGBToYJRow(src_argb, dst_yj, width);
+ ARGBToYJRow(src_argb + src_stride_argb, dst_yj + dst_stride_yj, width);
+ src_argb += src_stride_argb * 2;
+ dst_yj += dst_stride_yj * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+ ARGBToUVJRow(src_argb, 0, dst_u, dst_v, width);
+ ARGBToYJRow(src_argb, dst_yj, width);
+ }
+ return 0;
+}
+
+// Convert ARGB to J400.
+LIBYUV_API
+int ARGBToJ400(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_yj, int dst_stride_yj,
+ int width, int height) {
+ if (!src_argb || !dst_yj || width <= 0 || height == 0) {
+ return -1;
+ }
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ // Coalesce contiguous rows.
+ if (src_stride_argb == width * 4 &&
+ dst_stride_yj == width) {
+ return ARGBToJ400(src_argb, 0,
+ dst_yj, 0,
+ width * height, 1);
+ }
+ void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int pix) =
+ ARGBToYJRow_C;
+#if defined(HAS_ARGBTOYJROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYJRow = ARGBToYJRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
+ IS_ALIGNED(dst_yj, 16) && IS_ALIGNED(dst_stride_yj, 16)) {
+ ARGBToYJRow = ARGBToYJRow_SSSE3;
+ }
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYJROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
+ ARGBToYJRow = ARGBToYJRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToYJRow = ARGBToYJRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYJROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ ARGBToYJRow = ARGBToYJRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYJRow = ARGBToYJRow_NEON;
+ }
+ }
+#endif
+
+ for (int y = 0; y < height; ++y) {
+ ARGBToYJRow(src_argb, dst_yj, width);
+ src_argb += src_stride_argb;
+ dst_yj += dst_stride_yj;
+ }
+ return 0;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/chromium/third_party/libyuv/source/convert_jpeg.cc b/chromium/third_party/libyuv/source/convert_jpeg.cc
new file mode 100644
index 00000000000..3d1ef98cc34
--- /dev/null
+++ b/chromium/third_party/libyuv/source/convert_jpeg.cc
@@ -0,0 +1,392 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/convert.h"
+
+#ifdef HAVE_JPEG
+#include "libyuv/mjpeg_decoder.h"
+#endif
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#ifdef HAVE_JPEG
+struct I420Buffers {
+ uint8* y;
+ int y_stride;
+ uint8* u;
+ int u_stride;
+ uint8* v;
+ int v_stride;
+ int w;
+ int h;
+};
+
+static void JpegCopyI420(void* opaque,
+ const uint8* const* data,
+ const int* strides,
+ int rows) {
+ I420Buffers* dest = static_cast<I420Buffers*>(opaque);
+ I420Copy(data[0], strides[0],
+ data[1], strides[1],
+ data[2], strides[2],
+ dest->y, dest->y_stride,
+ dest->u, dest->u_stride,
+ dest->v, dest->v_stride,
+ dest->w, rows);
+ dest->y += rows * dest->y_stride;
+ dest->u += ((rows + 1) >> 1) * dest->u_stride;
+ dest->v += ((rows + 1) >> 1) * dest->v_stride;
+ dest->h -= rows;
+}
+
+static void JpegI422ToI420(void* opaque,
+ const uint8* const* data,
+ const int* strides,
+ int rows) {
+ I420Buffers* dest = static_cast<I420Buffers*>(opaque);
+ I422ToI420(data[0], strides[0],
+ data[1], strides[1],
+ data[2], strides[2],
+ dest->y, dest->y_stride,
+ dest->u, dest->u_stride,
+ dest->v, dest->v_stride,
+ dest->w, rows);
+ dest->y += rows * dest->y_stride;
+ dest->u += ((rows + 1) >> 1) * dest->u_stride;
+ dest->v += ((rows + 1) >> 1) * dest->v_stride;
+ dest->h -= rows;
+}
+
+static void JpegI444ToI420(void* opaque,
+ const uint8* const* data,
+ const int* strides,
+ int rows) {
+ I420Buffers* dest = static_cast<I420Buffers*>(opaque);
+ I444ToI420(data[0], strides[0],
+ data[1], strides[1],
+ data[2], strides[2],
+ dest->y, dest->y_stride,
+ dest->u, dest->u_stride,
+ dest->v, dest->v_stride,
+ dest->w, rows);
+ dest->y += rows * dest->y_stride;
+ dest->u += ((rows + 1) >> 1) * dest->u_stride;
+ dest->v += ((rows + 1) >> 1) * dest->v_stride;
+ dest->h -= rows;
+}
+
+static void JpegI411ToI420(void* opaque,
+ const uint8* const* data,
+ const int* strides,
+ int rows) {
+ I420Buffers* dest = static_cast<I420Buffers*>(opaque);
+ I411ToI420(data[0], strides[0],
+ data[1], strides[1],
+ data[2], strides[2],
+ dest->y, dest->y_stride,
+ dest->u, dest->u_stride,
+ dest->v, dest->v_stride,
+ dest->w, rows);
+ dest->y += rows * dest->y_stride;
+ dest->u += ((rows + 1) >> 1) * dest->u_stride;
+ dest->v += ((rows + 1) >> 1) * dest->v_stride;
+ dest->h -= rows;
+}
+
+static void JpegI400ToI420(void* opaque,
+ const uint8* const* data,
+ const int* strides,
+ int rows) {
+ I420Buffers* dest = static_cast<I420Buffers*>(opaque);
+ I400ToI420(data[0], strides[0],
+ dest->y, dest->y_stride,
+ dest->u, dest->u_stride,
+ dest->v, dest->v_stride,
+ dest->w, rows);
+ dest->y += rows * dest->y_stride;
+ dest->u += ((rows + 1) >> 1) * dest->u_stride;
+ dest->v += ((rows + 1) >> 1) * dest->v_stride;
+ dest->h -= rows;
+}
+
+// Query size of MJPG in pixels.
+LIBYUV_API
+int MJPGSize(const uint8* sample, size_t sample_size,
+ int* width, int* height) {
+ MJpegDecoder mjpeg_decoder;
+ bool ret = mjpeg_decoder.LoadFrame(sample, sample_size);
+ if (ret) {
+ *width = mjpeg_decoder.GetWidth();
+ *height = mjpeg_decoder.GetHeight();
+ }
+ mjpeg_decoder.UnloadFrame();
+ return ret ? 0 : -1; // -1 for runtime failure.
+}
+
+// MJPG (Motion JPEG) to I420
+// TODO(fbarchard): review w and h requirement. dw and dh may be enough.
+LIBYUV_API
+int MJPGToI420(const uint8* sample,
+ size_t sample_size,
+ uint8* y, int y_stride,
+ uint8* u, int u_stride,
+ uint8* v, int v_stride,
+ int w, int h,
+ int dw, int dh) {
+ if (sample_size == kUnknownDataSize) {
+ // ERROR: MJPEG frame size unknown
+ return -1;
+ }
+
+ // TODO(fbarchard): Port MJpeg to C.
+ MJpegDecoder mjpeg_decoder;
+ bool ret = mjpeg_decoder.LoadFrame(sample, sample_size);
+ if (ret && (mjpeg_decoder.GetWidth() != w ||
+ mjpeg_decoder.GetHeight() != h)) {
+ // ERROR: MJPEG frame has unexpected dimensions
+ mjpeg_decoder.UnloadFrame();
+ return 1; // runtime failure
+ }
+ if (ret) {
+ I420Buffers bufs = { y, y_stride, u, u_stride, v, v_stride, dw, dh };
+ // YUV420
+ if (mjpeg_decoder.GetColorSpace() ==
+ MJpegDecoder::kColorSpaceYCbCr &&
+ mjpeg_decoder.GetNumComponents() == 3 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 2 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
+ mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+ ret = mjpeg_decoder.DecodeToCallback(&JpegCopyI420, &bufs, dw, dh);
+ // YUV422
+ } else if (mjpeg_decoder.GetColorSpace() ==
+ MJpegDecoder::kColorSpaceYCbCr &&
+ mjpeg_decoder.GetNumComponents() == 3 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
+ mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+ ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToI420, &bufs, dw, dh);
+ // YUV444
+ } else if (mjpeg_decoder.GetColorSpace() ==
+ MJpegDecoder::kColorSpaceYCbCr &&
+ mjpeg_decoder.GetNumComponents() == 3 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+ ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToI420, &bufs, dw, dh);
+ // YUV411
+ } else if (mjpeg_decoder.GetColorSpace() ==
+ MJpegDecoder::kColorSpaceYCbCr &&
+ mjpeg_decoder.GetNumComponents() == 3 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 4 &&
+ mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+ ret = mjpeg_decoder.DecodeToCallback(&JpegI411ToI420, &bufs, dw, dh);
+ // YUV400
+ } else if (mjpeg_decoder.GetColorSpace() ==
+ MJpegDecoder::kColorSpaceGrayscale &&
+ mjpeg_decoder.GetNumComponents() == 1 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 1) {
+ ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToI420, &bufs, dw, dh);
+ } else {
+ // TODO(fbarchard): Implement conversion for any other colorspace/sample
+ // factors that occur in practice. 411 is supported by libjpeg
+ // ERROR: Unable to convert MJPEG frame because format is not supported
+ mjpeg_decoder.UnloadFrame();
+ return 1;
+ }
+ }
+ return 0;
+}
+
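+// Editor's note: illustrative usage sketch, not part of the upstream source;
+// jpeg_data/jpeg_size and the destination planes are hypothetical.
+//   int w = 0, h = 0;
+//   if (MJPGSize(jpeg_data, jpeg_size, &w, &h) == 0) {
+//     MJPGToI420(jpeg_data, jpeg_size,
+//                y_plane, w,
+//                u_plane, (w + 1) / 2,
+//                v_plane, (w + 1) / 2,
+//                w, h,    // source dimensions as parsed from the JPEG.
+//                w, h);   // destination dimensions.
+//   }
+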
+#ifdef HAVE_JPEG
+struct ARGBBuffers {
+ uint8* argb;
+ int argb_stride;
+ int w;
+ int h;
+};
+
+static void JpegI420ToARGB(void* opaque,
+ const uint8* const* data,
+ const int* strides,
+ int rows) {
+ ARGBBuffers* dest = static_cast<ARGBBuffers*>(opaque);
+ I420ToARGB(data[0], strides[0],
+ data[1], strides[1],
+ data[2], strides[2],
+ dest->argb, dest->argb_stride,
+ dest->w, rows);
+ dest->argb += rows * dest->argb_stride;
+ dest->h -= rows;
+}
+
+static void JpegI422ToARGB(void* opaque,
+ const uint8* const* data,
+ const int* strides,
+ int rows) {
+ ARGBBuffers* dest = static_cast<ARGBBuffers*>(opaque);
+ I422ToARGB(data[0], strides[0],
+ data[1], strides[1],
+ data[2], strides[2],
+ dest->argb, dest->argb_stride,
+ dest->w, rows);
+ dest->argb += rows * dest->argb_stride;
+ dest->h -= rows;
+}
+
+static void JpegI444ToARGB(void* opaque,
+ const uint8* const* data,
+ const int* strides,
+ int rows) {
+ ARGBBuffers* dest = static_cast<ARGBBuffers*>(opaque);
+ I444ToARGB(data[0], strides[0],
+ data[1], strides[1],
+ data[2], strides[2],
+ dest->argb, dest->argb_stride,
+ dest->w, rows);
+ dest->argb += rows * dest->argb_stride;
+ dest->h -= rows;
+}
+
+static void JpegI411ToARGB(void* opaque,
+ const uint8* const* data,
+ const int* strides,
+ int rows) {
+ ARGBBuffers* dest = static_cast<ARGBBuffers*>(opaque);
+ I411ToARGB(data[0], strides[0],
+ data[1], strides[1],
+ data[2], strides[2],
+ dest->argb, dest->argb_stride,
+ dest->w, rows);
+ dest->argb += rows * dest->argb_stride;
+ dest->h -= rows;
+}
+
+static void JpegI400ToARGB(void* opaque,
+ const uint8* const* data,
+ const int* strides,
+ int rows) {
+ ARGBBuffers* dest = static_cast<ARGBBuffers*>(opaque);
+ I400ToARGB(data[0], strides[0],
+ dest->argb, dest->argb_stride,
+ dest->w, rows);
+ dest->argb += rows * dest->argb_stride;
+ dest->h -= rows;
+}
+
+// MJPG (Motion JPEG) to ARGB
+// TODO(fbarchard): review w and h requirement. dw and dh may be enough.
+LIBYUV_API
+int MJPGToARGB(const uint8* sample,
+ size_t sample_size,
+ uint8* argb, int argb_stride,
+ int w, int h,
+ int dw, int dh) {
+ if (sample_size == kUnknownDataSize) {
+ // ERROR: MJPEG frame size unknown
+ return -1;
+ }
+
+ // TODO(fbarchard): Port MJpeg to C.
+ MJpegDecoder mjpeg_decoder;
+ bool ret = mjpeg_decoder.LoadFrame(sample, sample_size);
+ if (ret && (mjpeg_decoder.GetWidth() != w ||
+ mjpeg_decoder.GetHeight() != h)) {
+ // ERROR: MJPEG frame has unexpected dimensions
+ mjpeg_decoder.UnloadFrame();
+ return 1; // runtime failure
+ }
+ if (ret) {
+ ARGBBuffers bufs = { argb, argb_stride, dw, dh };
+ // YUV420
+ if (mjpeg_decoder.GetColorSpace() ==
+ MJpegDecoder::kColorSpaceYCbCr &&
+ mjpeg_decoder.GetNumComponents() == 3 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 2 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
+ mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+ ret = mjpeg_decoder.DecodeToCallback(&JpegI420ToARGB, &bufs, dw, dh);
+ // YUV422
+ } else if (mjpeg_decoder.GetColorSpace() ==
+ MJpegDecoder::kColorSpaceYCbCr &&
+ mjpeg_decoder.GetNumComponents() == 3 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
+ mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+ ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToARGB, &bufs, dw, dh);
+ // YUV444
+ } else if (mjpeg_decoder.GetColorSpace() ==
+ MJpegDecoder::kColorSpaceYCbCr &&
+ mjpeg_decoder.GetNumComponents() == 3 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+ ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToARGB, &bufs, dw, dh);
+ // YUV411
+ } else if (mjpeg_decoder.GetColorSpace() ==
+ MJpegDecoder::kColorSpaceYCbCr &&
+ mjpeg_decoder.GetNumComponents() == 3 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 4 &&
+ mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+ ret = mjpeg_decoder.DecodeToCallback(&JpegI411ToARGB, &bufs, dw, dh);
+ // YUV400
+ } else if (mjpeg_decoder.GetColorSpace() ==
+ MJpegDecoder::kColorSpaceGrayscale &&
+ mjpeg_decoder.GetNumComponents() == 1 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 1) {
+ ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToARGB, &bufs, dw, dh);
+ } else {
+ // TODO(fbarchard): Implement conversion for any other colorspace/sample
+ // factors that occur in practice. 411 is supported by libjpeg
+ // ERROR: Unable to convert MJPEG frame because format is not supported
+ mjpeg_decoder.UnloadFrame();
+ return 1;
+ }
+ }
+ return 0;
+}
+#endif
+
+#endif
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/chromium/third_party/libyuv/source/convert_to_argb.cc b/chromium/third_party/libyuv/source/convert_to_argb.cc
new file mode 100644
index 00000000000..95b6386d719
--- /dev/null
+++ b/chromium/third_party/libyuv/source/convert_to_argb.cc
@@ -0,0 +1,324 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/convert_argb.h"
+
+#include "libyuv/cpu_id.h"
+#include "libyuv/format_conversion.h"
+#ifdef HAVE_JPEG
+#include "libyuv/mjpeg_decoder.h"
+#endif
+#include "libyuv/rotate_argb.h"
+#include "libyuv/row.h"
+#include "libyuv/video_common.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Convert camera sample to ARGB with cropping, rotation and vertical flip.
+// src_width is used for source stride computation
+// src_height is used to compute location of planes, and indicate inversion
+// sample_size is measured in bytes and is the size of the frame.
+// With MJPEG it is the compressed size of the frame.
+LIBYUV_API
+int ConvertToARGB(const uint8* sample, size_t sample_size,
+ uint8* dst_argb, int argb_stride,
+ int crop_x, int crop_y,
+ int src_width, int src_height,
+ int dst_width, int dst_height,
+ RotationMode rotation,
+ uint32 fourcc) {
+ uint32 format = CanonicalFourCC(fourcc);
+ if (dst_argb == NULL || sample == NULL ||
+ src_width <= 0 || dst_width <= 0 ||
+ src_height == 0 || dst_height == 0) {
+ return -1;
+ }
+ int aligned_src_width = (src_width + 1) & ~1;
+ const uint8* src;
+ const uint8* src_uv;
+ int abs_src_height = (src_height < 0) ? -src_height : src_height;
+ int inv_dst_height = (dst_height < 0) ? -dst_height : dst_height;
+ if (src_height < 0) {
+ inv_dst_height = -inv_dst_height;
+ }
+ int r = 0;
+
+  // One pass rotation is available for some formats. For the rest, convert
+  // to ARGB (with optional vertical flipping) into a temporary ARGB buffer,
+  // and then rotate the ARGB into the final destination buffer.
+  // For in-place conversion, when the destination dst_argb is the same as the
+  // source sample, a temporary buffer is also used.
+ bool need_buf = (rotation && format != FOURCC_ARGB) || dst_argb == sample;
+ uint8* tmp_argb = dst_argb;
+ int tmp_argb_stride = argb_stride;
+ uint8* buf = NULL;
+ int abs_dst_height = (dst_height < 0) ? -dst_height : dst_height;
+ if (need_buf) {
+ int argb_size = dst_width * abs_dst_height * 4;
+ buf = new uint8[argb_size];
+ if (!buf) {
+ return 1; // Out of memory runtime error.
+ }
+ dst_argb = buf;
+    argb_stride = dst_width * 4;
+ }
+
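+  // From this point dst_argb/argb_stride refer either to the caller's buffer
+  // or to the temporary buffer allocated above.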
+ switch (format) {
+ // Single plane formats
+ case FOURCC_YUY2:
+ src = sample + (aligned_src_width * crop_y + crop_x) * 2;
+ r = YUY2ToARGB(src, aligned_src_width * 2,
+ dst_argb, argb_stride,
+ dst_width, inv_dst_height);
+ break;
+ case FOURCC_UYVY:
+ src = sample + (aligned_src_width * crop_y + crop_x) * 2;
+ r = UYVYToARGB(src, aligned_src_width * 2,
+ dst_argb, argb_stride,
+ dst_width, inv_dst_height);
+ break;
+ case FOURCC_24BG:
+ src = sample + (src_width * crop_y + crop_x) * 3;
+ r = RGB24ToARGB(src, src_width * 3,
+ dst_argb, argb_stride,
+ dst_width, inv_dst_height);
+ break;
+ case FOURCC_RAW:
+ src = sample + (src_width * crop_y + crop_x) * 3;
+ r = RAWToARGB(src, src_width * 3,
+ dst_argb, argb_stride,
+ dst_width, inv_dst_height);
+ break;
+ case FOURCC_ARGB:
+ src = sample + (src_width * crop_y + crop_x) * 4;
+ r = ARGBToARGB(src, src_width * 4,
+ dst_argb, argb_stride,
+ dst_width, inv_dst_height);
+ break;
+ case FOURCC_BGRA:
+ src = sample + (src_width * crop_y + crop_x) * 4;
+ r = BGRAToARGB(src, src_width * 4,
+ dst_argb, argb_stride,
+ dst_width, inv_dst_height);
+ break;
+ case FOURCC_ABGR:
+ src = sample + (src_width * crop_y + crop_x) * 4;
+ r = ABGRToARGB(src, src_width * 4,
+ dst_argb, argb_stride,
+ dst_width, inv_dst_height);
+ break;
+ case FOURCC_RGBA:
+ src = sample + (src_width * crop_y + crop_x) * 4;
+ r = RGBAToARGB(src, src_width * 4,
+ dst_argb, argb_stride,
+ dst_width, inv_dst_height);
+ break;
+ case FOURCC_RGBP:
+ src = sample + (src_width * crop_y + crop_x) * 2;
+ r = RGB565ToARGB(src, src_width * 2,
+ dst_argb, argb_stride,
+ dst_width, inv_dst_height);
+ break;
+ case FOURCC_RGBO:
+ src = sample + (src_width * crop_y + crop_x) * 2;
+ r = ARGB1555ToARGB(src, src_width * 2,
+ dst_argb, argb_stride,
+ dst_width, inv_dst_height);
+ break;
+ case FOURCC_R444:
+ src = sample + (src_width * crop_y + crop_x) * 2;
+ r = ARGB4444ToARGB(src, src_width * 2,
+ dst_argb, argb_stride,
+ dst_width, inv_dst_height);
+ break;
+ // TODO(fbarchard): Support cropping Bayer by odd numbers
+ // by adjusting fourcc.
+ case FOURCC_BGGR:
+ src = sample + (src_width * crop_y + crop_x);
+ r = BayerBGGRToARGB(src, src_width,
+ dst_argb, argb_stride,
+ dst_width, inv_dst_height);
+ break;
+
+ case FOURCC_GBRG:
+ src = sample + (src_width * crop_y + crop_x);
+ r = BayerGBRGToARGB(src, src_width,
+ dst_argb, argb_stride,
+ dst_width, inv_dst_height);
+ break;
+
+ case FOURCC_GRBG:
+ src = sample + (src_width * crop_y + crop_x);
+ r = BayerGRBGToARGB(src, src_width,
+ dst_argb, argb_stride,
+ dst_width, inv_dst_height);
+ break;
+
+ case FOURCC_RGGB:
+ src = sample + (src_width * crop_y + crop_x);
+ r = BayerRGGBToARGB(src, src_width,
+ dst_argb, argb_stride,
+ dst_width, inv_dst_height);
+ break;
+
+ case FOURCC_I400:
+ src = sample + src_width * crop_y + crop_x;
+ r = I400ToARGB(src, src_width,
+ dst_argb, argb_stride,
+ dst_width, inv_dst_height);
+ break;
+
+ // Biplanar formats
+ case FOURCC_NV12:
+ src = sample + (src_width * crop_y + crop_x);
+ src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x;
+ r = NV12ToARGB(src, src_width,
+ src_uv, aligned_src_width,
+ dst_argb, argb_stride,
+ dst_width, inv_dst_height);
+ break;
+ case FOURCC_NV21:
+ src = sample + (src_width * crop_y + crop_x);
+ src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x;
+ // Call NV12 but with u and v parameters swapped.
+ r = NV21ToARGB(src, src_width,
+ src_uv, aligned_src_width,
+ dst_argb, argb_stride,
+ dst_width, inv_dst_height);
+ break;
+ case FOURCC_M420:
+ src = sample + (src_width * crop_y) * 12 / 8 + crop_x;
+ r = M420ToARGB(src, src_width,
+ dst_argb, argb_stride,
+ dst_width, inv_dst_height);
+ break;
+// case FOURCC_Q420:
+// src = sample + (src_width + aligned_src_width * 2) * crop_y + crop_x;
+// src_uv = sample + (src_width + aligned_src_width * 2) * crop_y +
+// src_width + crop_x * 2;
+// r = Q420ToARGB(src, src_width * 3,
+// src_uv, src_width * 3,
+// dst_argb, argb_stride,
+// dst_width, inv_dst_height);
+// break;
+ // Triplanar formats
+ case FOURCC_I420:
+ case FOURCC_YU12:
+ case FOURCC_YV12: {
+ const uint8* src_y = sample + (src_width * crop_y + crop_x);
+ const uint8* src_u;
+ const uint8* src_v;
+ int halfwidth = (src_width + 1) / 2;
+ int halfheight = (abs_src_height + 1) / 2;
+ if (format == FOURCC_YV12) {
+ src_v = sample + src_width * abs_src_height +
+ (halfwidth * crop_y + crop_x) / 2;
+ src_u = sample + src_width * abs_src_height +
+ halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
+ } else {
+ src_u = sample + src_width * abs_src_height +
+ (halfwidth * crop_y + crop_x) / 2;
+ src_v = sample + src_width * abs_src_height +
+ halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
+ }
+ r = I420ToARGB(src_y, src_width,
+ src_u, halfwidth,
+ src_v, halfwidth,
+ dst_argb, argb_stride,
+ dst_width, inv_dst_height);
+ break;
+ }
+ case FOURCC_I422:
+ case FOURCC_YV16: {
+ const uint8* src_y = sample + src_width * crop_y + crop_x;
+ const uint8* src_u;
+ const uint8* src_v;
+ int halfwidth = (src_width + 1) / 2;
+ if (format == FOURCC_YV16) {
+ src_v = sample + src_width * abs_src_height +
+ halfwidth * crop_y + crop_x / 2;
+ src_u = sample + src_width * abs_src_height +
+ halfwidth * (abs_src_height + crop_y) + crop_x / 2;
+ } else {
+ src_u = sample + src_width * abs_src_height +
+ halfwidth * crop_y + crop_x / 2;
+ src_v = sample + src_width * abs_src_height +
+ halfwidth * (abs_src_height + crop_y) + crop_x / 2;
+ }
+ r = I422ToARGB(src_y, src_width,
+ src_u, halfwidth,
+ src_v, halfwidth,
+ dst_argb, argb_stride,
+ dst_width, inv_dst_height);
+ break;
+ }
+ case FOURCC_I444:
+ case FOURCC_YV24: {
+ const uint8* src_y = sample + src_width * crop_y + crop_x;
+ const uint8* src_u;
+ const uint8* src_v;
+ if (format == FOURCC_YV24) {
+ src_v = sample + src_width * (abs_src_height + crop_y) + crop_x;
+ src_u = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
+ } else {
+ src_u = sample + src_width * (abs_src_height + crop_y) + crop_x;
+ src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
+ }
+ r = I444ToARGB(src_y, src_width,
+ src_u, src_width,
+ src_v, src_width,
+ dst_argb, argb_stride,
+ dst_width, inv_dst_height);
+ break;
+ }
+ case FOURCC_I411: {
+ int quarterwidth = (src_width + 3) / 4;
+ const uint8* src_y = sample + src_width * crop_y + crop_x;
+ const uint8* src_u = sample + src_width * abs_src_height +
+ quarterwidth * crop_y + crop_x / 4;
+ const uint8* src_v = sample + src_width * abs_src_height +
+ quarterwidth * (abs_src_height + crop_y) + crop_x / 4;
+ r = I411ToARGB(src_y, src_width,
+ src_u, quarterwidth,
+ src_v, quarterwidth,
+ dst_argb, argb_stride,
+ dst_width, inv_dst_height);
+ break;
+ }
+#ifdef HAVE_JPEG
+ case FOURCC_MJPG:
+ r = MJPGToARGB(sample, sample_size,
+ dst_argb, argb_stride,
+ src_width, abs_src_height, dst_width, inv_dst_height);
+ break;
+#endif
+ default:
+ r = -1; // Unknown FourCC: return failure code.
+ }
+
+ if (need_buf) {
+ if (!r) {
+ r = ARGBRotate(dst_argb, argb_stride,
+ tmp_argb, tmp_argb_stride,
+ dst_width, abs_dst_height, rotation);
+ }
+ delete [] buf;
+ }
+
+ return r;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/chromium/third_party/libyuv/source/convert_to_i420.cc b/chromium/third_party/libyuv/source/convert_to_i420.cc
new file mode 100644
index 00000000000..763eb50920e
--- /dev/null
+++ b/chromium/third_party/libyuv/source/convert_to_i420.cc
@@ -0,0 +1,384 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/convert.h"
+
+#include "libyuv/format_conversion.h"
+#include "libyuv/video_common.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Convert camera sample to I420 with cropping, rotation and vertical flip.
+// src_width is used for the source stride computation.
+// src_height is used to compute the location of the planes and to indicate
+// inversion.
+// sample_size is measured in bytes and is the size of the frame.
+// With MJPEG it is the compressed size of the frame.
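+// A minimal usage sketch (illustrative; the buffer names and kWidth/kHeight
+// are assumptions): convert a packed YUY2 capture, without cropping or
+// rotation, into caller-allocated I420 planes.
+//   int half_width = (kWidth + 1) / 2;
+//   ConvertToI420(yuy2_frame, yuy2_size,
+//                 dst_y, kWidth,
+//                 dst_u, half_width,
+//                 dst_v, half_width,
+//                 0, 0,                 // crop_x, crop_y
+//                 kWidth, kHeight,      // source dimensions
+//                 kWidth, kHeight,      // destination dimensions
+//                 kRotate0, FOURCC_YUY2);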
+LIBYUV_API
+int ConvertToI420(const uint8* sample,
+#ifdef HAVE_JPEG
+ size_t sample_size,
+#else
+ size_t /* sample_size */,
+#endif
+ uint8* y, int y_stride,
+ uint8* u, int u_stride,
+ uint8* v, int v_stride,
+ int crop_x, int crop_y,
+ int src_width, int src_height,
+ int dst_width, int dst_height,
+ RotationMode rotation,
+ uint32 fourcc) {
+ uint32 format = CanonicalFourCC(fourcc);
+ if (!y || !u || !v || !sample ||
+ src_width <= 0 || dst_width <= 0 ||
+ src_height == 0 || dst_height == 0) {
+ return -1;
+ }
+ int aligned_src_width = (src_width + 1) & ~1;
+ const uint8* src;
+ const uint8* src_uv;
+ int abs_src_height = (src_height < 0) ? -src_height : src_height;
+ int inv_dst_height = (dst_height < 0) ? -dst_height : dst_height;
+ if (src_height < 0) {
+ inv_dst_height = -inv_dst_height;
+ }
+ int r = 0;
+
+ // One pass rotation is available for some formats. For the rest, convert
+ // to I420 (with optional vertical flipping) into a temporary I420 buffer,
+ // and then rotate the I420 to the final destination buffer.
+ // For in-place conversion, if the destination y is the same as the source
+ // sample, also use the temporary buffer.
+ bool need_buf = (rotation && format != FOURCC_I420 &&
+ format != FOURCC_NV12 && format != FOURCC_NV21 &&
+ format != FOURCC_YU12 && format != FOURCC_YV12) || y == sample;
+ uint8* tmp_y = y;
+ uint8* tmp_u = u;
+ uint8* tmp_v = v;
+ int tmp_y_stride = y_stride;
+ int tmp_u_stride = u_stride;
+ int tmp_v_stride = v_stride;
+ uint8* buf = NULL;
+ int abs_dst_height = (dst_height < 0) ? -dst_height : dst_height;
+ if (need_buf) {
+ int y_size = dst_width * abs_dst_height;
+ int uv_size = ((dst_width + 1) / 2) * ((abs_dst_height + 1) / 2);
+ buf = new uint8[y_size + uv_size * 2];
+ if (!buf) {
+ return 1; // Out of memory runtime error.
+ }
+ y = buf;
+ u = y + y_size;
+ v = u + uv_size;
+ y_stride = dst_width;
+ u_stride = v_stride = ((dst_width + 1) / 2);
+ }
+
+ switch (format) {
+ // Single plane formats
+ case FOURCC_YUY2:
+ src = sample + (aligned_src_width * crop_y + crop_x) * 2;
+ r = YUY2ToI420(src, aligned_src_width * 2,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_width, inv_dst_height);
+ break;
+ case FOURCC_UYVY:
+ src = sample + (aligned_src_width * crop_y + crop_x) * 2;
+ r = UYVYToI420(src, aligned_src_width * 2,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_width, inv_dst_height);
+ break;
+ case FOURCC_RGBP:
+ src = sample + (src_width * crop_y + crop_x) * 2;
+ r = RGB565ToI420(src, src_width * 2,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_width, inv_dst_height);
+ break;
+ case FOURCC_RGBO:
+ src = sample + (src_width * crop_y + crop_x) * 2;
+ r = ARGB1555ToI420(src, src_width * 2,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_width, inv_dst_height);
+ break;
+ case FOURCC_R444:
+ src = sample + (src_width * crop_y + crop_x) * 2;
+ r = ARGB4444ToI420(src, src_width * 2,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_width, inv_dst_height);
+ break;
+ case FOURCC_24BG:
+ src = sample + (src_width * crop_y + crop_x) * 3;
+ r = RGB24ToI420(src, src_width * 3,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_width, inv_dst_height);
+ break;
+ case FOURCC_RAW:
+ src = sample + (src_width * crop_y + crop_x) * 3;
+ r = RAWToI420(src, src_width * 3,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_width, inv_dst_height);
+ break;
+ case FOURCC_ARGB:
+ src = sample + (src_width * crop_y + crop_x) * 4;
+ r = ARGBToI420(src, src_width * 4,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_width, inv_dst_height);
+ break;
+ case FOURCC_BGRA:
+ src = sample + (src_width * crop_y + crop_x) * 4;
+ r = BGRAToI420(src, src_width * 4,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_width, inv_dst_height);
+ break;
+ case FOURCC_ABGR:
+ src = sample + (src_width * crop_y + crop_x) * 4;
+ r = ABGRToI420(src, src_width * 4,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_width, inv_dst_height);
+ break;
+ case FOURCC_RGBA:
+ src = sample + (src_width * crop_y + crop_x) * 4;
+ r = RGBAToI420(src, src_width * 4,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_width, inv_dst_height);
+ break;
+ // TODO(fbarchard): Support cropping Bayer by odd numbers
+ // by adjusting fourcc.
+ case FOURCC_BGGR:
+ src = sample + (src_width * crop_y + crop_x);
+ r = BayerBGGRToI420(src, src_width,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_width, inv_dst_height);
+ break;
+ case FOURCC_GBRG:
+ src = sample + (src_width * crop_y + crop_x);
+ r = BayerGBRGToI420(src, src_width,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_width, inv_dst_height);
+ break;
+ case FOURCC_GRBG:
+ src = sample + (src_width * crop_y + crop_x);
+ r = BayerGRBGToI420(src, src_width,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_width, inv_dst_height);
+ break;
+ case FOURCC_RGGB:
+ src = sample + (src_width * crop_y + crop_x);
+ r = BayerRGGBToI420(src, src_width,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_width, inv_dst_height);
+ break;
+ case FOURCC_I400:
+ src = sample + src_width * crop_y + crop_x;
+ r = I400ToI420(src, src_width,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_width, inv_dst_height);
+ break;
+ // Biplanar formats
+ case FOURCC_NV12:
+ src = sample + (src_width * crop_y + crop_x);
+ src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x;
+ r = NV12ToI420Rotate(src, src_width,
+ src_uv, aligned_src_width,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_width, inv_dst_height, rotation);
+ break;
+ case FOURCC_NV21:
+ src = sample + (src_width * crop_y + crop_x);
+ src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x;
+ // Call NV12 but with u and v parameters swapped.
+ r = NV12ToI420Rotate(src, src_width,
+ src_uv, aligned_src_width,
+ y, y_stride,
+ v, v_stride,
+ u, u_stride,
+ dst_width, inv_dst_height, rotation);
+ break;
+ case FOURCC_M420:
+ src = sample + (src_width * crop_y) * 12 / 8 + crop_x;
+ r = M420ToI420(src, src_width,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_width, inv_dst_height);
+ break;
+ case FOURCC_Q420:
+ src = sample + (src_width + aligned_src_width * 2) * crop_y + crop_x;
+ src_uv = sample + (src_width + aligned_src_width * 2) * crop_y +
+ src_width + crop_x * 2;
+ r = Q420ToI420(src, src_width * 3,
+ src_uv, src_width * 3,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_width, inv_dst_height);
+ break;
+ // Triplanar formats
+ case FOURCC_I420:
+ case FOURCC_YU12:
+ case FOURCC_YV12: {
+ const uint8* src_y = sample + (src_width * crop_y + crop_x);
+ const uint8* src_u;
+ const uint8* src_v;
+ int halfwidth = (src_width + 1) / 2;
+ int halfheight = (abs_src_height + 1) / 2;
+ if (format == FOURCC_YV12) {
+ src_v = sample + src_width * abs_src_height +
+ (halfwidth * crop_y + crop_x) / 2;
+ src_u = sample + src_width * abs_src_height +
+ halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
+ } else {
+ src_u = sample + src_width * abs_src_height +
+ (halfwidth * crop_y + crop_x) / 2;
+ src_v = sample + src_width * abs_src_height +
+ halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
+ }
+ r = I420Rotate(src_y, src_width,
+ src_u, halfwidth,
+ src_v, halfwidth,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_width, inv_dst_height, rotation);
+ break;
+ }
+ case FOURCC_I422:
+ case FOURCC_YV16: {
+ const uint8* src_y = sample + src_width * crop_y + crop_x;
+ const uint8* src_u;
+ const uint8* src_v;
+ int halfwidth = (src_width + 1) / 2;
+ if (format == FOURCC_YV16) {
+ src_v = sample + src_width * abs_src_height +
+ halfwidth * crop_y + crop_x / 2;
+ src_u = sample + src_width * abs_src_height +
+ halfwidth * (abs_src_height + crop_y) + crop_x / 2;
+ } else {
+ src_u = sample + src_width * abs_src_height +
+ halfwidth * crop_y + crop_x / 2;
+ src_v = sample + src_width * abs_src_height +
+ halfwidth * (abs_src_height + crop_y) + crop_x / 2;
+ }
+ r = I422ToI420(src_y, src_width,
+ src_u, halfwidth,
+ src_v, halfwidth,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_width, inv_dst_height);
+ break;
+ }
+ case FOURCC_I444:
+ case FOURCC_YV24: {
+ const uint8* src_y = sample + src_width * crop_y + crop_x;
+ const uint8* src_u;
+ const uint8* src_v;
+ if (format == FOURCC_YV24) {
+ src_v = sample + src_width * (abs_src_height + crop_y) + crop_x;
+ src_u = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
+ } else {
+ src_u = sample + src_width * (abs_src_height + crop_y) + crop_x;
+ src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
+ }
+ r = I444ToI420(src_y, src_width,
+ src_u, src_width,
+ src_v, src_width,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_width, inv_dst_height);
+ break;
+ }
+ case FOURCC_I411: {
+ int quarterwidth = (src_width + 3) / 4;
+ const uint8* src_y = sample + src_width * crop_y + crop_x;
+ const uint8* src_u = sample + src_width * abs_src_height +
+ quarterwidth * crop_y + crop_x / 4;
+ const uint8* src_v = sample + src_width * abs_src_height +
+ quarterwidth * (abs_src_height + crop_y) + crop_x / 4;
+ r = I411ToI420(src_y, src_width,
+ src_u, quarterwidth,
+ src_v, quarterwidth,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_width, inv_dst_height);
+ break;
+ }
+#ifdef HAVE_JPEG
+ case FOURCC_MJPG:
+ r = MJPGToI420(sample, sample_size,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ src_width, abs_src_height, dst_width, inv_dst_height);
+ break;
+#endif
+ default:
+ r = -1; // Unknown FourCC: return failure code.
+ }
+
+ if (need_buf) {
+ if (!r) {
+ r = I420Rotate(y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ tmp_y, tmp_y_stride,
+ tmp_u, tmp_u_stride,
+ tmp_v, tmp_v_stride,
+ dst_width, abs_dst_height, rotation);
+ }
+ delete [] buf;
+ }
+
+ return r;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/chromium/third_party/libyuv/source/cpu_id.cc b/chromium/third_party/libyuv/source/cpu_id.cc
new file mode 100644
index 00000000000..b4c993a2740
--- /dev/null
+++ b/chromium/third_party/libyuv/source/cpu_id.cc
@@ -0,0 +1,252 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/cpu_id.h"
+
+#ifdef _MSC_VER
+#include <intrin.h> // For __cpuid()
+#endif
+#if !defined(__CLR_VER) && defined(_M_X64) && \
+ defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219)
+#include <immintrin.h> // For _xgetbv()
+#endif
+
+#include <stdlib.h> // For getenv()
+
+// For ArmCpuCaps(), but unit tested on all platforms.
+#include <stdio.h>
+#include <string.h>
+
+#include "libyuv/basic_types.h" // For CPU_X86
+
+// TODO(fbarchard): Consider cpu functionality for breakpoints, timer and cache.
+// arm - bkpt vs intel int 3
+
+// TODO(fbarchard): Use cpuid.h when gcc 4.4 is used on OSX and Linux.
+#if (defined(__pic__) || defined(__APPLE__)) && defined(__i386__)
+static __inline void __cpuid(int cpu_info[4], int info_type) {
+ asm volatile ( // NOLINT
+ "mov %%ebx, %%edi \n"
+ "cpuid \n"
+ "xchg %%edi, %%ebx \n"
+ : "=a"(cpu_info[0]), "=D"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
+ : "a"(info_type));
+}
+#elif defined(__i386__) || defined(__x86_64__)
+static __inline void __cpuid(int cpu_info[4], int info_type) {
+ asm volatile ( // NOLINT
+ "cpuid \n"
+ : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
+ : "a"(info_type));
+}
+#endif
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Low level cpuid for X86. Returns zeros on other CPUs.
+#if !defined(__CLR_VER) && (defined(_M_IX86) || defined(_M_X64) || \
+ defined(__i386__) || defined(__x86_64__))
+LIBYUV_API
+void CpuId(int cpu_info[4], int info_type) {
+ __cpuid(cpu_info, info_type);
+}
+#else
+LIBYUV_API
+void CpuId(int cpu_info[4], int) {
+ cpu_info[0] = cpu_info[1] = cpu_info[2] = cpu_info[3] = 0;
+}
+#endif
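+// Example (an illustrative sketch, not part of the library): CPUID leaf 0
+// returns the vendor string in EBX, EDX, ECX, i.e. cpu_info[1], [3] and [2].
+//   int cpu_info[4];
+//   CpuId(cpu_info, 0);
+//   char vendor[13];
+//   memcpy(vendor + 0, &cpu_info[1], 4);  // EBX
+//   memcpy(vendor + 4, &cpu_info[3], 4);  // EDX
+//   memcpy(vendor + 8, &cpu_info[2], 4);  // ECX
+//   vendor[12] = '\0';                    // e.g. "GenuineIntel"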
+
+// X86 CPUs have xgetbv to detect whether the OS saves the high parts of the
+// ymm registers.
+#if !defined(__CLR_VER) && defined(_M_X64) && \
+ defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219)
+#define HAS_XGETBV
+static uint32 XGetBV(unsigned int xcr) {
+ return static_cast<uint32>(_xgetbv(xcr));
+}
+#elif !defined(__CLR_VER) && defined(_M_IX86) && defined(_MSC_VER)
+#define HAS_XGETBV
+__declspec(naked) __declspec(align(16))
+static uint32 XGetBV(unsigned int xcr) {
+ __asm {
+ mov ecx, [esp + 4] // xcr
+ push edx
+ _asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0 // xgetbv for vs2005.
+ pop edx
+ ret
+ }
+}
+#elif defined(__i386__) || defined(__x86_64__)
+#define HAS_XGETBV
+static uint32 XGetBV(unsigned int xcr) {
+ uint32 xcr_feature_mask;
+ asm volatile ( // NOLINT
+ ".byte 0x0f, 0x01, 0xd0\n"
+ : "=a"(xcr_feature_mask)
+ : "c"(xcr)
+ : "memory", "cc", "edx"); // edx unused.
+ return xcr_feature_mask;
+}
+#endif
+#ifdef HAS_XGETBV
+static const int kXCR_XFEATURE_ENABLED_MASK = 0;
+#endif
+
+// Based on libvpx arm_cpudetect.c.
+// For Arm, but public to allow testing on any CPU
+LIBYUV_API
+int ArmCpuCaps(const char* cpuinfo_name) {
+ FILE* f = fopen(cpuinfo_name, "r");
+ if (f) {
+ char buf[512];
+ while (fgets(buf, 511, f)) {
+ if (memcmp(buf, "Features", 8) == 0) {
+ char* p = strstr(buf, " neon");
+ if (p && (p[5] == ' ' || p[5] == '\n')) {
+ fclose(f);
+ return kCpuHasNEON;
+ }
+ }
+ }
+ fclose(f);
+ }
+ return 0;
+}
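+// For example (illustrative), ArmCpuCaps("/proc/cpuinfo") returns kCpuHasNEON
+// when the file contains a line such as
+//   Features        : swp half thumb fastmult vfp edsp neon vfpv3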
+
+#if defined(__mips__) && defined(__linux__)
+static int MipsCpuCaps(const char* search_string) {
+ const char* file_name = "/proc/cpuinfo";
+ char cpuinfo_line[256];
+ FILE* f = NULL;
+ if ((f = fopen(file_name, "r")) != NULL) {
+ while (fgets(cpuinfo_line, sizeof(cpuinfo_line), f) != NULL) {
+ if (strstr(cpuinfo_line, search_string) != NULL) {
+ fclose(f);
+ return kCpuHasMIPS_DSP;
+ }
+ }
+ fclose(f);
+ }
+ /* Did not find string in the proc file, or not Linux ELF. */
+ return 0;
+}
+#endif
+
+// Detected CPU feature flags for SIMD instruction sets.
+LIBYUV_API
+int cpu_info_ = kCpuInit; // cpu_info is not initialized yet.
+
+// Test an environment variable for disabling CPU features. Any non-zero value
+// disables the feature. Zero is ignored so the variable can easily be toggled
+// on or off.
+static bool TestEnv(const char* name) {
+ const char* var = getenv(name);
+ if (var) {
+ if (var[0] != '0') {
+ return true;
+ }
+ }
+ return false;
+}
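+// For example (an illustrative test-only override, assuming a POSIX setenv()):
+//   setenv("LIBYUV_DISABLE_SSSE3", "1", 1);
+//   MaskCpuFlags(-1);  // Re-detect so the override takes effect immediately.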
+
+LIBYUV_API
+int InitCpuFlags(void) {
+#if !defined(__CLR_VER) && defined(CPU_X86)
+ int cpu_info1[4] = { 0, 0, 0, 0 };
+ int cpu_info7[4] = { 0, 0, 0, 0 };
+ __cpuid(cpu_info1, 1);
+ __cpuid(cpu_info7, 7);
+ cpu_info_ = ((cpu_info1[3] & 0x04000000) ? kCpuHasSSE2 : 0) |
+ ((cpu_info1[2] & 0x00000200) ? kCpuHasSSSE3 : 0) |
+ ((cpu_info1[2] & 0x00080000) ? kCpuHasSSE41 : 0) |
+ ((cpu_info1[2] & 0x00100000) ? kCpuHasSSE42 : 0) |
+ ((cpu_info7[1] & 0x00000200) ? kCpuHasERMS : 0) |
+ kCpuHasX86;
+#ifdef HAS_XGETBV
+ if ((cpu_info1[2] & 0x18000000) == 0x18000000 && // AVX and OSSave
+ (XGetBV(kXCR_XFEATURE_ENABLED_MASK) & 0x06) == 0x06) { // Saves YMM.
+ cpu_info_ |= ((cpu_info7[1] & 0x00000020) ? kCpuHasAVX2 : 0) |
+ kCpuHasAVX;
+ }
+#endif
+ // Environment variable overrides for testing.
+ if (TestEnv("LIBYUV_DISABLE_X86")) {
+ cpu_info_ &= ~kCpuHasX86;
+ }
+ if (TestEnv("LIBYUV_DISABLE_SSE2")) {
+ cpu_info_ &= ~kCpuHasSSE2;
+ }
+ if (TestEnv("LIBYUV_DISABLE_SSSE3")) {
+ cpu_info_ &= ~kCpuHasSSSE3;
+ }
+ if (TestEnv("LIBYUV_DISABLE_SSE41")) {
+ cpu_info_ &= ~kCpuHasSSE41;
+ }
+ if (TestEnv("LIBYUV_DISABLE_SSE42")) {
+ cpu_info_ &= ~kCpuHasSSE42;
+ }
+ if (TestEnv("LIBYUV_DISABLE_AVX")) {
+ cpu_info_ &= ~kCpuHasAVX;
+ }
+ if (TestEnv("LIBYUV_DISABLE_AVX2")) {
+ cpu_info_ &= ~kCpuHasAVX2;
+ }
+ if (TestEnv("LIBYUV_DISABLE_ERMS")) {
+ cpu_info_ &= ~kCpuHasERMS;
+ }
+#elif defined(__mips__) && defined(__linux__)
+ // On Linux MIPS, parse /proc/cpuinfo for DSP detection.
+ cpu_info_ = MipsCpuCaps("dsp"); // set kCpuHasMIPS_DSP.
+#if defined(__mips_dspr2)
+ cpu_info_ |= kCpuHasMIPS_DSPR2;
+#endif
+ cpu_info_ |= kCpuHasMIPS;
+
+ if (getenv("LIBYUV_DISABLE_MIPS")) {
+ cpu_info_ &= ~kCpuHasMIPS;
+ }
+ if (getenv("LIBYUV_DISABLE_MIPS_DSP")) {
+ cpu_info_ &= ~kCpuHasMIPS_DSP;
+ }
+ if (getenv("LIBYUV_DISABLE_MIPS_DSPR2")) {
+ cpu_info_ &= ~kCpuHasMIPS_DSPR2;
+ }
+#elif defined(__arm__)
+#if defined(__linux__) && (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
+ // On Linux ARM, parse /proc/cpuinfo for NEON detection.
+ cpu_info_ = ArmCpuCaps("/proc/cpuinfo");
+#elif defined(__ARM_NEON__)
+ // gcc -mfpu=neon defines __ARM_NEON__
+ // Enable Neon if you want support for Neon and Arm, and use MaskCpuFlags
+ // to disable Neon on devices that do not have it.
+ cpu_info_ = kCpuHasNEON;
+#endif
+ cpu_info_ |= kCpuHasARM;
+ if (TestEnv("LIBYUV_DISABLE_NEON")) {
+ cpu_info_ &= ~kCpuHasNEON;
+ }
+#endif // __arm__
+ if (TestEnv("LIBYUV_DISABLE_ASM")) {
+ cpu_info_ = 0;
+ }
+ return cpu_info_;
+}
+
+LIBYUV_API
+void MaskCpuFlags(int enable_flags) {
+ cpu_info_ = InitCpuFlags() & enable_flags;
+}
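+// For example (illustrative), to restrict libyuv to C and SSE2 code paths,
+// e.g. when benchmarking:
+//   MaskCpuFlags(kCpuHasX86 | kCpuHasSSE2);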
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/chromium/third_party/libyuv/source/format_conversion.cc b/chromium/third_party/libyuv/source/format_conversion.cc
new file mode 100644
index 00000000000..5b931b58773
--- /dev/null
+++ b/chromium/third_party/libyuv/source/format_conversion.cc
@@ -0,0 +1,540 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/format_conversion.h"
+
+#include "libyuv/basic_types.h"
+#include "libyuv/cpu_id.h"
+#include "libyuv/video_common.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Generate a selector mask useful for pshufb.
+static uint32 GenerateSelector(int select0, int select1) {
+ return static_cast<uint32>(select0) |
+ static_cast<uint32>((select1 + 4) << 8) |
+ static_cast<uint32>((select0 + 8) << 16) |
+ static_cast<uint32>((select1 + 12) << 24);
+}
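+// For example (illustrative), GenerateSelector(0, 1) with the ARGB byte
+// offsets used below (blue = 0, green = 1) yields 0x0d080500, i.e. byte
+// offsets 0, 5, 8 and 13 within a 16-byte block of four ARGB pixels: B of
+// pixel 0, G of pixel 1, B of pixel 2 and G of pixel 3, a BGBG row pattern.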
+
+static int MakeSelectors(const int blue_index,
+ const int green_index,
+ const int red_index,
+ uint32 dst_fourcc_bayer,
+ uint32 *index_map) {
+ // Now build a lookup table containing the indices for the four pixels in each
+ // 2x2 Bayer grid.
+ switch (dst_fourcc_bayer) {
+ case FOURCC_BGGR:
+ index_map[0] = GenerateSelector(blue_index, green_index);
+ index_map[1] = GenerateSelector(green_index, red_index);
+ break;
+ case FOURCC_GBRG:
+ index_map[0] = GenerateSelector(green_index, blue_index);
+ index_map[1] = GenerateSelector(red_index, green_index);
+ break;
+ case FOURCC_RGGB:
+ index_map[0] = GenerateSelector(red_index, green_index);
+ index_map[1] = GenerateSelector(green_index, blue_index);
+ break;
+ case FOURCC_GRBG:
+ index_map[0] = GenerateSelector(green_index, red_index);
+ index_map[1] = GenerateSelector(blue_index, green_index);
+ break;
+ default:
+ return -1; // Bad FourCC
+ }
+ return 0;
+}
+
+// Converts 32 bit ARGB to Bayer RGB formats.
+LIBYUV_API
+int ARGBToBayer(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_bayer, int dst_stride_bayer,
+ int width, int height,
+ uint32 dst_fourcc_bayer) {
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ void (*ARGBToBayerRow)(const uint8* src_argb, uint8* dst_bayer,
+ uint32 selector, int pix) = ARGBToBayerRow_C;
+#if defined(HAS_ARGBTOBAYERROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 8 &&
+ IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
+ ARGBToBayerRow = ARGBToBayerRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToBayerRow = ARGBToBayerRow_SSSE3;
+ }
+ }
+#elif defined(HAS_ARGBTOBAYERROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ ARGBToBayerRow = ARGBToBayerRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToBayerRow = ARGBToBayerRow_NEON;
+ }
+ }
+#endif
+ const int blue_index = 0; // Offsets for ARGB format
+ const int green_index = 1;
+ const int red_index = 2;
+ uint32 index_map[2];
+ if (MakeSelectors(blue_index, green_index, red_index,
+ dst_fourcc_bayer, index_map)) {
+ return -1; // Bad FourCC
+ }
+
+ for (int y = 0; y < height; ++y) {
+ ARGBToBayerRow(src_argb, dst_bayer, index_map[y & 1], width);
+ src_argb += src_stride_argb;
+ dst_bayer += dst_stride_bayer;
+ }
+ return 0;
+}
+
+#define AVG(a, b) (((a) + (b)) >> 1)
+
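+// The BayerRow* helpers below expand two rows of a Bayer mosaic into one row
+// of ARGB using simple neighbour averaging (an approximation, not a full
+// demosaic). For example, BayerRowBG handles rows laid out as
+//   B G B G ...
+//   G R G R ...
+// taking blue directly and averaging the neighbouring green and red samples.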
+static void BayerRowBG(const uint8* src_bayer0, int src_stride_bayer,
+ uint8* dst_argb, int pix) {
+ const uint8* src_bayer1 = src_bayer0 + src_stride_bayer;
+ uint8 g = src_bayer0[1];
+ uint8 r = src_bayer1[1];
+ for (int x = 0; x < pix - 2; x += 2) {
+ dst_argb[0] = src_bayer0[0];
+ dst_argb[1] = AVG(g, src_bayer0[1]);
+ dst_argb[2] = AVG(r, src_bayer1[1]);
+ dst_argb[3] = 255U;
+ dst_argb[4] = AVG(src_bayer0[0], src_bayer0[2]);
+ dst_argb[5] = src_bayer0[1];
+ dst_argb[6] = src_bayer1[1];
+ dst_argb[7] = 255U;
+ g = src_bayer0[1];
+ r = src_bayer1[1];
+ src_bayer0 += 2;
+ src_bayer1 += 2;
+ dst_argb += 8;
+ }
+ dst_argb[0] = src_bayer0[0];
+ dst_argb[1] = AVG(g, src_bayer0[1]);
+ dst_argb[2] = AVG(r, src_bayer1[1]);
+ dst_argb[3] = 255U;
+ if (!(pix & 1)) {
+ dst_argb[4] = src_bayer0[0];
+ dst_argb[5] = src_bayer0[1];
+ dst_argb[6] = src_bayer1[1];
+ dst_argb[7] = 255U;
+ }
+}
+
+static void BayerRowRG(const uint8* src_bayer0, int src_stride_bayer,
+ uint8* dst_argb, int pix) {
+ const uint8* src_bayer1 = src_bayer0 + src_stride_bayer;
+ uint8 g = src_bayer0[1];
+ uint8 b = src_bayer1[1];
+ for (int x = 0; x < pix - 2; x += 2) {
+ dst_argb[0] = AVG(b, src_bayer1[1]);
+ dst_argb[1] = AVG(g, src_bayer0[1]);
+ dst_argb[2] = src_bayer0[0];
+ dst_argb[3] = 255U;
+ dst_argb[4] = src_bayer1[1];
+ dst_argb[5] = src_bayer0[1];
+ dst_argb[6] = AVG(src_bayer0[0], src_bayer0[2]);
+ dst_argb[7] = 255U;
+ g = src_bayer0[1];
+ b = src_bayer1[1];
+ src_bayer0 += 2;
+ src_bayer1 += 2;
+ dst_argb += 8;
+ }
+ dst_argb[0] = AVG(b, src_bayer1[1]);
+ dst_argb[1] = AVG(g, src_bayer0[1]);
+ dst_argb[2] = src_bayer0[0];
+ dst_argb[3] = 255U;
+ if (!(pix & 1)) {
+ dst_argb[4] = src_bayer1[1];
+ dst_argb[5] = src_bayer0[1];
+ dst_argb[6] = src_bayer0[0];
+ dst_argb[7] = 255U;
+ }
+}
+
+static void BayerRowGB(const uint8* src_bayer0, int src_stride_bayer,
+ uint8* dst_argb, int pix) {
+ const uint8* src_bayer1 = src_bayer0 + src_stride_bayer;
+ uint8 b = src_bayer0[1];
+ for (int x = 0; x < pix - 2; x += 2) {
+ dst_argb[0] = AVG(b, src_bayer0[1]);
+ dst_argb[1] = src_bayer0[0];
+ dst_argb[2] = src_bayer1[0];
+ dst_argb[3] = 255U;
+ dst_argb[4] = src_bayer0[1];
+ dst_argb[5] = AVG(src_bayer0[0], src_bayer0[2]);
+ dst_argb[6] = AVG(src_bayer1[0], src_bayer1[2]);
+ dst_argb[7] = 255U;
+ b = src_bayer0[1];
+ src_bayer0 += 2;
+ src_bayer1 += 2;
+ dst_argb += 8;
+ }
+ dst_argb[0] = AVG(b, src_bayer0[1]);
+ dst_argb[1] = src_bayer0[0];
+ dst_argb[2] = src_bayer1[0];
+ dst_argb[3] = 255U;
+ if (!(pix & 1)) {
+ dst_argb[4] = src_bayer0[1];
+ dst_argb[5] = src_bayer0[0];
+ dst_argb[6] = src_bayer1[0];
+ dst_argb[7] = 255U;
+ }
+}
+
+static void BayerRowGR(const uint8* src_bayer0, int src_stride_bayer,
+ uint8* dst_argb, int pix) {
+ const uint8* src_bayer1 = src_bayer0 + src_stride_bayer;
+ uint8 r = src_bayer0[1];
+ for (int x = 0; x < pix - 2; x += 2) {
+ dst_argb[0] = src_bayer1[0];
+ dst_argb[1] = src_bayer0[0];
+ dst_argb[2] = AVG(r, src_bayer0[1]);
+ dst_argb[3] = 255U;
+ dst_argb[4] = AVG(src_bayer1[0], src_bayer1[2]);
+ dst_argb[5] = AVG(src_bayer0[0], src_bayer0[2]);
+ dst_argb[6] = src_bayer0[1];
+ dst_argb[7] = 255U;
+ r = src_bayer0[1];
+ src_bayer0 += 2;
+ src_bayer1 += 2;
+ dst_argb += 8;
+ }
+ dst_argb[0] = src_bayer1[0];
+ dst_argb[1] = src_bayer0[0];
+ dst_argb[2] = AVG(r, src_bayer0[1]);
+ dst_argb[3] = 255U;
+ if (!(pix & 1)) {
+ dst_argb[4] = src_bayer1[0];
+ dst_argb[5] = src_bayer0[0];
+ dst_argb[6] = src_bayer0[1];
+ dst_argb[7] = 255U;
+ }
+}
+
+// Converts any Bayer RGB format to ARGB.
+LIBYUV_API
+int BayerToARGB(const uint8* src_bayer, int src_stride_bayer,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height,
+ uint32 src_fourcc_bayer) {
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+ void (*BayerRow0)(const uint8* src_bayer, int src_stride_bayer,
+ uint8* dst_argb, int pix);
+ void (*BayerRow1)(const uint8* src_bayer, int src_stride_bayer,
+ uint8* dst_argb, int pix);
+ switch (src_fourcc_bayer) {
+ case FOURCC_BGGR:
+ BayerRow0 = BayerRowBG;
+ BayerRow1 = BayerRowGR;
+ break;
+ case FOURCC_GBRG:
+ BayerRow0 = BayerRowGB;
+ BayerRow1 = BayerRowRG;
+ break;
+ case FOURCC_GRBG:
+ BayerRow0 = BayerRowGR;
+ BayerRow1 = BayerRowBG;
+ break;
+ case FOURCC_RGGB:
+ BayerRow0 = BayerRowRG;
+ BayerRow1 = BayerRowGB;
+ break;
+ default:
+ return -1; // Bad FourCC
+ }
+
+ for (int y = 0; y < height - 1; y += 2) {
+ BayerRow0(src_bayer, src_stride_bayer, dst_argb, width);
+ BayerRow1(src_bayer + src_stride_bayer, -src_stride_bayer,
+ dst_argb + dst_stride_argb, width);
+ src_bayer += src_stride_bayer * 2;
+ dst_argb += dst_stride_argb * 2;
+ }
+ if (height & 1) {
+ BayerRow0(src_bayer, src_stride_bayer, dst_argb, width);
+ }
+ return 0;
+}
+
+// Converts any Bayer RGB format to I420.
+LIBYUV_API
+int BayerToI420(const uint8* src_bayer, int src_stride_bayer,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height,
+ uint32 src_fourcc_bayer) {
+ if (width * 4 > kMaxStride) {
+ return -1; // Size too large for row buffer
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ int halfheight = (height + 1) >> 1;
+ dst_y = dst_y + (height - 1) * dst_stride_y;
+ dst_u = dst_u + (halfheight - 1) * dst_stride_u;
+ dst_v = dst_v + (halfheight - 1) * dst_stride_v;
+ dst_stride_y = -dst_stride_y;
+ dst_stride_u = -dst_stride_u;
+ dst_stride_v = -dst_stride_v;
+ }
+ void (*BayerRow0)(const uint8* src_bayer, int src_stride_bayer,
+ uint8* dst_argb, int pix);
+ void (*BayerRow1)(const uint8* src_bayer, int src_stride_bayer,
+ uint8* dst_argb, int pix);
+
+ void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+ void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+ ARGBToYRow_C;
+#if defined(HAS_ARGBTOYROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+ ARGBToYRow = ARGBToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+ ARGBToYRow = ARGBToYRow_SSSE3;
+ }
+ }
+ }
+#elif defined(HAS_ARGBTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ ARGBToYRow = ARGBToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYRow = ARGBToYRow_NEON;
+ }
+ if (width >= 16) {
+ ARGBToUVRow = ARGBToUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_NEON;
+ }
+ }
+ }
+#endif
+
+ switch (src_fourcc_bayer) {
+ case FOURCC_BGGR:
+ BayerRow0 = BayerRowBG;
+ BayerRow1 = BayerRowGR;
+ break;
+ case FOURCC_GBRG:
+ BayerRow0 = BayerRowGB;
+ BayerRow1 = BayerRowRG;
+ break;
+ case FOURCC_GRBG:
+ BayerRow0 = BayerRowGR;
+ BayerRow1 = BayerRowBG;
+ break;
+ case FOURCC_RGGB:
+ BayerRow0 = BayerRowRG;
+ BayerRow1 = BayerRowGB;
+ break;
+ default:
+ return -1; // Bad FourCC
+ }
+
+ SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
+ for (int y = 0; y < height - 1; y += 2) {
+ BayerRow0(src_bayer, src_stride_bayer, row, width);
+ BayerRow1(src_bayer + src_stride_bayer, -src_stride_bayer,
+ row + kMaxStride, width);
+ ARGBToUVRow(row, kMaxStride, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+ ARGBToYRow(row + kMaxStride, dst_y + dst_stride_y, width);
+ src_bayer += src_stride_bayer * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+ BayerRow0(src_bayer, src_stride_bayer, row, width);
+ ARGBToUVRow(row, 0, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+ }
+ return 0;
+}
+
+// Convert I420 to Bayer.
+LIBYUV_API
+int I420ToBayer(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_bayer, int dst_stride_bayer,
+ int width, int height,
+ uint32 dst_fourcc_bayer) {
+ if (width * 4 > kMaxStride) {
+ return -1; // Size too large for row buffer
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ int halfheight = (height + 1) >> 1;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_u = src_u + (halfheight - 1) * src_stride_u;
+ src_v = src_v + (halfheight - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+ void (*I422ToARGBRow)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) = I422ToARGBRow_C;
+#if defined(HAS_I422TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGBRow = I422ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2) && width >= 16) {
+ I422ToARGBRow = I422ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToARGBRow = I422ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ I422ToARGBRow = I422ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGBRow = I422ToARGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
+ IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
+ IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
+ IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2)) {
+ I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2;
+ }
+#endif
+
+ SIMD_ALIGNED(uint8 row[kMaxStride]);
+ void (*ARGBToBayerRow)(const uint8* src_argb, uint8* dst_bayer,
+ uint32 selector, int pix) = ARGBToBayerRow_C;
+#if defined(HAS_ARGBTOBAYERROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ ARGBToBayerRow = ARGBToBayerRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToBayerRow = ARGBToBayerRow_SSSE3;
+ }
+ }
+#elif defined(HAS_ARGBTOBAYERROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ ARGBToBayerRow = ARGBToBayerRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToBayerRow = ARGBToBayerRow_NEON;
+ }
+ }
+#endif
+
+ const int blue_index = 0; // Offsets for ARGB format
+ const int green_index = 1;
+ const int red_index = 2;
+ uint32 index_map[2];
+ if (MakeSelectors(blue_index, green_index, red_index,
+ dst_fourcc_bayer, index_map)) {
+ return -1; // Bad FourCC
+ }
+
+ for (int y = 0; y < height; ++y) {
+ I422ToARGBRow(src_y, src_u, src_v, row, width);
+ ARGBToBayerRow(row, dst_bayer, index_map[y & 1], width);
+ dst_bayer += dst_stride_bayer;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
+#define MAKEBAYERFOURCC(BAYER) \
+LIBYUV_API \
+int Bayer##BAYER##ToI420(const uint8* src_bayer, int src_stride_bayer, \
+ uint8* dst_y, int dst_stride_y, \
+ uint8* dst_u, int dst_stride_u, \
+ uint8* dst_v, int dst_stride_v, \
+ int width, int height) { \
+ return BayerToI420(src_bayer, src_stride_bayer, \
+ dst_y, dst_stride_y, \
+ dst_u, dst_stride_u, \
+ dst_v, dst_stride_v, \
+ width, height, \
+ FOURCC_##BAYER); \
+} \
+ \
+LIBYUV_API \
+int I420ToBayer##BAYER(const uint8* src_y, int src_stride_y, \
+ const uint8* src_u, int src_stride_u, \
+ const uint8* src_v, int src_stride_v, \
+ uint8* dst_bayer, int dst_stride_bayer, \
+ int width, int height) { \
+ return I420ToBayer(src_y, src_stride_y, \
+ src_u, src_stride_u, \
+ src_v, src_stride_v, \
+ dst_bayer, dst_stride_bayer, \
+ width, height, \
+ FOURCC_##BAYER); \
+} \
+ \
+LIBYUV_API \
+int ARGBToBayer##BAYER(const uint8* src_argb, int src_stride_argb, \
+ uint8* dst_bayer, int dst_stride_bayer, \
+ int width, int height) { \
+ return ARGBToBayer(src_argb, src_stride_argb, \
+ dst_bayer, dst_stride_bayer, \
+ width, height, \
+ FOURCC_##BAYER); \
+} \
+ \
+LIBYUV_API \
+int Bayer##BAYER##ToARGB(const uint8* src_bayer, int src_stride_bayer, \
+ uint8* dst_argb, int dst_stride_argb, \
+ int width, int height) { \
+ return BayerToARGB(src_bayer, src_stride_bayer, \
+ dst_argb, dst_stride_argb, \
+ width, height, \
+ FOURCC_##BAYER); \
+}
+
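+// Each expansion below defines the four FourCC-specific wrappers; for
+// example, MAKEBAYERFOURCC(BGGR) produces BayerBGGRToI420, I420ToBayerBGGR,
+// ARGBToBayerBGGR and BayerBGGRToARGB.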
+MAKEBAYERFOURCC(BGGR)
+MAKEBAYERFOURCC(GBRG)
+MAKEBAYERFOURCC(GRBG)
+MAKEBAYERFOURCC(RGGB)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/chromium/third_party/libyuv/source/mjpeg_decoder.cc b/chromium/third_party/libyuv/source/mjpeg_decoder.cc
new file mode 100644
index 00000000000..5d7296d7e73
--- /dev/null
+++ b/chromium/third_party/libyuv/source/mjpeg_decoder.cc
@@ -0,0 +1,540 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/mjpeg_decoder.h"
+
+#ifdef HAVE_JPEG
+#include <assert.h>
+#ifndef __CLR_VER
+// Must be included before jpeglib.
+#include <setjmp.h>
+#define HAVE_SETJMP
+#endif
+struct FILE; // For jpeglib.h.
+#include <jpeglib.h>
+
+#include "libyuv/planar_functions.h" // For CopyPlane().
+
+namespace libyuv {
+
+#ifdef HAVE_SETJMP
+struct SetJmpErrorMgr {
+ jpeg_error_mgr base; // Must be at the top
+ jmp_buf setjmp_buffer;
+};
+#endif
+
+const int MJpegDecoder::kColorSpaceUnknown = JCS_UNKNOWN;
+const int MJpegDecoder::kColorSpaceGrayscale = JCS_GRAYSCALE;
+const int MJpegDecoder::kColorSpaceRgb = JCS_RGB;
+const int MJpegDecoder::kColorSpaceYCbCr = JCS_YCbCr;
+const int MJpegDecoder::kColorSpaceCMYK = JCS_CMYK;
+const int MJpegDecoder::kColorSpaceYCCK = JCS_YCCK;
+
+MJpegDecoder::MJpegDecoder()
+ : has_scanline_padding_(false),
+ num_outbufs_(0),
+ scanlines_(NULL),
+ scanlines_sizes_(NULL),
+ databuf_(NULL),
+ databuf_strides_(NULL) {
+ decompress_struct_ = new jpeg_decompress_struct;
+ source_mgr_ = new jpeg_source_mgr;
+#ifdef HAVE_SETJMP
+ error_mgr_ = new SetJmpErrorMgr;
+ decompress_struct_->err = jpeg_std_error(&error_mgr_->base);
+ // Override standard exit()-based error handler.
+ error_mgr_->base.error_exit = &ErrorHandler;
+#endif
+ decompress_struct_->client_data = NULL;
+ source_mgr_->init_source = &init_source;
+ source_mgr_->fill_input_buffer = &fill_input_buffer;
+ source_mgr_->skip_input_data = &skip_input_data;
+ source_mgr_->resync_to_restart = &jpeg_resync_to_restart;
+ source_mgr_->term_source = &term_source;
+ jpeg_create_decompress(decompress_struct_);
+ decompress_struct_->src = source_mgr_;
+ buf_vec_.buffers = &buf_;
+ buf_vec_.len = 1;
+}
+
+MJpegDecoder::~MJpegDecoder() {
+ jpeg_destroy_decompress(decompress_struct_);
+ delete decompress_struct_;
+ delete source_mgr_;
+#ifdef HAVE_SETJMP
+ delete error_mgr_;
+#endif
+ DestroyOutputBuffers();
+}
+
+bool MJpegDecoder::LoadFrame(const uint8* src, size_t src_len) {
+ if (!ValidateJpeg(src, src_len)) {
+ return false;
+ }
+
+ buf_.data = src;
+ buf_.len = static_cast<int>(src_len);
+ buf_vec_.pos = 0;
+ decompress_struct_->client_data = &buf_vec_;
+#ifdef HAVE_SETJMP
+ if (setjmp(error_mgr_->setjmp_buffer)) {
+ // We called jpeg_read_header, it experienced an error, and we called
+ // longjmp() and rewound the stack to here. Return error.
+ return false;
+ }
+#endif
+ if (jpeg_read_header(decompress_struct_, TRUE) != JPEG_HEADER_OK) {
+ // ERROR: Bad MJPEG header
+ return false;
+ }
+ AllocOutputBuffers(GetNumComponents());
+ for (int i = 0; i < num_outbufs_; ++i) {
+ int scanlines_size = GetComponentScanlinesPerImcuRow(i);
+ if (scanlines_sizes_[i] != scanlines_size) {
+ if (scanlines_[i]) {
+ delete [] scanlines_[i];
+ }
+ scanlines_[i] = new uint8* [scanlines_size];
+ scanlines_sizes_[i] = scanlines_size;
+ }
+
+ // We allocate padding for the final scanline to pad it up to a multiple of
+ // DCTSIZE bytes to avoid memory errors, since jpeglib only reads full MCU
+ // blocks. For the preceding scanlines, the padding is not needed/wanted
+ // because the following addresses will already be valid (they are the
+ // initial bytes of the next scanline) and will be overwritten when jpeglib
+ // writes out that next scanline.
+ int databuf_stride = GetComponentStride(i);
+ int databuf_size = scanlines_size * databuf_stride;
+ if (databuf_strides_[i] != databuf_stride) {
+ if (databuf_[i]) {
+ delete [] databuf_[i];
+ }
+ databuf_[i] = new uint8[databuf_size];
+ databuf_strides_[i] = databuf_stride;
+ }
+
+ if (GetComponentStride(i) != GetComponentWidth(i)) {
+ has_scanline_padding_ = true;
+ }
+ }
+ return true;
+}
+
+static int DivideAndRoundUp(int numerator, int denominator) {
+ return (numerator + denominator - 1) / denominator;
+}
+
+static int DivideAndRoundDown(int numerator, int denominator) {
+ return numerator / denominator;
+}
+
+// Returns width of the last loaded frame.
+int MJpegDecoder::GetWidth() {
+ return decompress_struct_->image_width;
+}
+
+// Returns height of the last loaded frame.
+int MJpegDecoder::GetHeight() {
+ return decompress_struct_->image_height;
+}
+
+// Returns format of the last loaded frame. The return value is one of the
+// kColorSpace* constants.
+int MJpegDecoder::GetColorSpace() {
+ return decompress_struct_->jpeg_color_space;
+}
+
+// Number of color components in the color space.
+int MJpegDecoder::GetNumComponents() {
+ return decompress_struct_->num_components;
+}
+
+// Sample factors of the n-th component.
+int MJpegDecoder::GetHorizSampFactor(int component) {
+ return decompress_struct_->comp_info[component].h_samp_factor;
+}
+
+int MJpegDecoder::GetVertSampFactor(int component) {
+ return decompress_struct_->comp_info[component].v_samp_factor;
+}
+
+int MJpegDecoder::GetHorizSubSampFactor(int component) {
+ return decompress_struct_->max_h_samp_factor /
+ GetHorizSampFactor(component);
+}
+
+int MJpegDecoder::GetVertSubSampFactor(int component) {
+ return decompress_struct_->max_v_samp_factor /
+ GetVertSampFactor(component);
+}
+
+int MJpegDecoder::GetImageScanlinesPerImcuRow() {
+ return decompress_struct_->max_v_samp_factor * DCTSIZE;
+}
+
+int MJpegDecoder::GetComponentScanlinesPerImcuRow(int component) {
+ int vs = GetVertSubSampFactor(component);
+ return DivideAndRoundUp(GetImageScanlinesPerImcuRow(), vs);
+}
+
+int MJpegDecoder::GetComponentWidth(int component) {
+ int hs = GetHorizSubSampFactor(component);
+ return DivideAndRoundUp(GetWidth(), hs);
+}
+
+int MJpegDecoder::GetComponentHeight(int component) {
+ int vs = GetVertSubSampFactor(component);
+ return DivideAndRoundUp(GetHeight(), vs);
+}
+
+// Get width in bytes padded out to a multiple of DCTSIZE
+int MJpegDecoder::GetComponentStride(int component) {
+ return (GetComponentWidth(component) + DCTSIZE - 1) & ~(DCTSIZE - 1);
+}
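+// For example, with baseline jpeglib (DCTSIZE == 8) a 101-pixel wide
+// component gets a 104-byte stride.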
+
+int MJpegDecoder::GetComponentSize(int component) {
+ return GetComponentWidth(component) * GetComponentHeight(component);
+}
+
+bool MJpegDecoder::UnloadFrame() {
+#ifdef HAVE_SETJMP
+ if (setjmp(error_mgr_->setjmp_buffer)) {
+ // We called jpeg_abort_decompress, it experienced an error, and we called
+ // longjmp() and rewound the stack to here. Return error.
+ return false;
+ }
+#endif
+ jpeg_abort_decompress(decompress_struct_);
+ return true;
+}
+
+// TODO(fbarchard): Allow rectangle to be specified: x, y, width, height.
+bool MJpegDecoder::DecodeToBuffers(
+ uint8** planes, int dst_width, int dst_height) {
+ if (dst_width != GetWidth() ||
+ dst_height > GetHeight()) {
+ // ERROR: Bad dimensions
+ return false;
+ }
+#ifdef HAVE_SETJMP
+ if (setjmp(error_mgr_->setjmp_buffer)) {
+ // We called into jpeglib, it experienced an error sometime during this
+ // function call, and we called longjmp() and rewound the stack to here.
+ // Return error.
+ return false;
+ }
+#endif
+ if (!StartDecode()) {
+ return false;
+ }
+ SetScanlinePointers(databuf_);
+ int lines_left = dst_height;
+ // Compute the number of lines to skip to implement vertical crop.
+ // TODO(fbarchard): Ensure skip is a multiple of the maximum component
+ // subsample, i.e. 2.
+ int skip = (GetHeight() - dst_height) / 2;
+ if (skip > 0) {
+ // There is no API to skip lines in the output data, so we read them
+ // into the temp buffer.
+ while (skip >= GetImageScanlinesPerImcuRow()) {
+ if (!DecodeImcuRow()) {
+ FinishDecode();
+ return false;
+ }
+ skip -= GetImageScanlinesPerImcuRow();
+ }
+ if (skip > 0) {
+ // Have a partial iMCU row left over to skip. Must read it and then
+ // copy the parts we want into the destination.
+ if (!DecodeImcuRow()) {
+ FinishDecode();
+ return false;
+ }
+ for (int i = 0; i < num_outbufs_; ++i) {
+ // TODO(fbarchard): Compute skip to avoid this
+ assert(skip % GetVertSubSampFactor(i) == 0);
+ int rows_to_skip =
+ DivideAndRoundDown(skip, GetVertSubSampFactor(i));
+ int scanlines_to_copy = GetComponentScanlinesPerImcuRow(i) -
+ rows_to_skip;
+ int data_to_skip = rows_to_skip * GetComponentStride(i);
+ CopyPlane(databuf_[i] + data_to_skip, GetComponentStride(i),
+ planes[i], GetComponentWidth(i),
+ GetComponentWidth(i), scanlines_to_copy);
+ planes[i] += scanlines_to_copy * GetComponentWidth(i);
+ }
+ lines_left -= (GetImageScanlinesPerImcuRow() - skip);
+ }
+ }
+
+ // Read full MCUs but cropped horizontally
+ for (; lines_left > GetImageScanlinesPerImcuRow();
+ lines_left -= GetImageScanlinesPerImcuRow()) {
+ if (!DecodeImcuRow()) {
+ FinishDecode();
+ return false;
+ }
+ for (int i = 0; i < num_outbufs_; ++i) {
+ int scanlines_to_copy = GetComponentScanlinesPerImcuRow(i);
+ CopyPlane(databuf_[i], GetComponentStride(i),
+ planes[i], GetComponentWidth(i),
+ GetComponentWidth(i), scanlines_to_copy);
+ planes[i] += scanlines_to_copy * GetComponentWidth(i);
+ }
+ }
+
+ if (lines_left > 0) {
+ // Have a partial iMCU row left over to decode.
+ if (!DecodeImcuRow()) {
+ FinishDecode();
+ return false;
+ }
+ for (int i = 0; i < num_outbufs_; ++i) {
+ int scanlines_to_copy =
+ DivideAndRoundUp(lines_left, GetVertSubSampFactor(i));
+ CopyPlane(databuf_[i], GetComponentStride(i),
+ planes[i], GetComponentWidth(i),
+ GetComponentWidth(i), scanlines_to_copy);
+ planes[i] += scanlines_to_copy * GetComponentWidth(i);
+ }
+ }
+ return FinishDecode();
+}
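+// A minimal usage sketch (illustrative; the buffer names are assumptions and
+// each plane must be sized per GetComponentSize()):
+//   MJpegDecoder decoder;
+//   if (decoder.LoadFrame(jpeg_data, jpeg_size)) {
+//     uint8* planes[3] = { plane_y, plane_u, plane_v };
+//     decoder.DecodeToBuffers(planes, decoder.GetWidth(), decoder.GetHeight());
+//     decoder.UnloadFrame();
+//   }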
+
+bool MJpegDecoder::DecodeToCallback(CallbackFunction fn, void* opaque,
+ int dst_width, int dst_height) {
+ if (dst_width != GetWidth() ||
+ dst_height > GetHeight()) {
+ // ERROR: Bad dimensions
+ return false;
+ }
+#ifdef HAVE_SETJMP
+ if (setjmp(error_mgr_->setjmp_buffer)) {
+ // We called into jpeglib, it experienced an error sometime during this
+ // function call, and we called longjmp() and rewound the stack to here.
+ // Return error.
+ return false;
+ }
+#endif
+ if (!StartDecode()) {
+ return false;
+ }
+ SetScanlinePointers(databuf_);
+ int lines_left = dst_height;
+ // TODO(fbarchard): Compute the number of lines to skip to implement vertical
+ // crop.
+ int skip = (GetHeight() - dst_height) / 2;
+ if (skip > 0) {
+ while (skip >= GetImageScanlinesPerImcuRow()) {
+ if (!DecodeImcuRow()) {
+ FinishDecode();
+ return false;
+ }
+ skip -= GetImageScanlinesPerImcuRow();
+ }
+ if (skip > 0) {
+ // Have a partial iMCU row left over to skip.
+ if (!DecodeImcuRow()) {
+ FinishDecode();
+ return false;
+ }
+ for (int i = 0; i < num_outbufs_; ++i) {
+ // TODO(fbarchard): Compute skip to avoid this
+ assert(skip % GetVertSubSampFactor(i) == 0);
+ int rows_to_skip = DivideAndRoundDown(skip, GetVertSubSampFactor(i));
+ int data_to_skip = rows_to_skip * GetComponentStride(i);
+ // Change our own data buffer pointers so we can pass them to the
+ // callback.
+ databuf_[i] += data_to_skip;
+ }
+ int scanlines_to_copy = GetImageScanlinesPerImcuRow() - skip;
+ (*fn)(opaque, databuf_, databuf_strides_, scanlines_to_copy);
+ // Now change them back.
+ for (int i = 0; i < num_outbufs_; ++i) {
+ int rows_to_skip = DivideAndRoundDown(skip, GetVertSubSampFactor(i));
+ int data_to_skip = rows_to_skip * GetComponentStride(i);
+ databuf_[i] -= data_to_skip;
+ }
+ lines_left -= scanlines_to_copy;
+ }
+ }
+ // Read full MCUs until we get to the crop point.
+ for (; lines_left >= GetImageScanlinesPerImcuRow();
+ lines_left -= GetImageScanlinesPerImcuRow()) {
+ if (!DecodeImcuRow()) {
+ FinishDecode();
+ return false;
+ }
+ (*fn)(opaque, databuf_, databuf_strides_, GetImageScanlinesPerImcuRow());
+ }
+ if (lines_left > 0) {
+ // Have a partial iMCU row left over to decode.
+ if (!DecodeImcuRow()) {
+ FinishDecode();
+ return false;
+ }
+ (*fn)(opaque, databuf_, databuf_strides_, lines_left);
+ }
+ return FinishDecode();
+}
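+// A callback sketch (illustrative; the exact parameter types are given by the
+// CallbackFunction typedef in mjpeg_decoder.h). Matching the call sites above,
+// the callback receives the component buffers, their strides, and the number
+// of rows just decoded:
+//   void OnRows(void* opaque, const uint8* const* data, const int* strides,
+//               int rows) {
+//     // Consume |rows| rows from data[i] with stride strides[i].
+//   }
+//   decoder.DecodeToCallback(&OnRows, user_context,
+//                            decoder.GetWidth(), decoder.GetHeight());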
+
+void MJpegDecoder::init_source(j_decompress_ptr cinfo) {
+ fill_input_buffer(cinfo);
+}
+
+boolean MJpegDecoder::fill_input_buffer(j_decompress_ptr cinfo) {
+ BufferVector* buf_vec = static_cast<BufferVector*>(cinfo->client_data);
+ if (buf_vec->pos >= buf_vec->len) {
+ assert(0 && "No more data");
+ // ERROR: No more data
+ return FALSE;
+ }
+ cinfo->src->next_input_byte = buf_vec->buffers[buf_vec->pos].data;
+ cinfo->src->bytes_in_buffer = buf_vec->buffers[buf_vec->pos].len;
+ ++buf_vec->pos;
+ return TRUE;
+}
+
+void MJpegDecoder::skip_input_data(j_decompress_ptr cinfo,
+ long num_bytes) { // NOLINT
+ cinfo->src->next_input_byte += num_bytes;
+}
+
+void MJpegDecoder::term_source(j_decompress_ptr cinfo) {
+ // Nothing to do.
+}
+
+#ifdef HAVE_SETJMP
+void MJpegDecoder::ErrorHandler(j_common_ptr cinfo) {
+ // This is called when a jpeglib command experiences an error. Unfortunately
+ // jpeglib's error handling model is not very flexible, because it expects the
+ // error handler to not return--i.e., it wants the program to terminate. To
+ // recover from errors we use setjmp() as shown in their example. setjmp() is
+ // C's implementation for the "call with current continuation" functionality
+ // seen in some functional programming languages.
+ char buf[JMSG_LENGTH_MAX];
+ (*cinfo->err->format_message)(cinfo, buf);
+ // ERROR: Error in jpeglib: buf
+
+ SetJmpErrorMgr* mgr = reinterpret_cast<SetJmpErrorMgr*>(cinfo->err);
+ // This rewinds the call stack to the point of the corresponding setjmp()
+ // and causes it to return (for a second time) with value 1.
+ longjmp(mgr->setjmp_buffer, 1);
+}
+#endif
+
+void MJpegDecoder::AllocOutputBuffers(int num_outbufs) {
+ if (num_outbufs != num_outbufs_) {
+ // We could perhaps optimize this case to resize the output buffers without
+ // necessarily having to delete and recreate each one, but it's not worth
+ // it.
+ DestroyOutputBuffers();
+
+ scanlines_ = new uint8** [num_outbufs];
+ scanlines_sizes_ = new int[num_outbufs];
+ databuf_ = new uint8* [num_outbufs];
+ databuf_strides_ = new int[num_outbufs];
+
+ for (int i = 0; i < num_outbufs; ++i) {
+ scanlines_[i] = NULL;
+ scanlines_sizes_[i] = 0;
+ databuf_[i] = NULL;
+ databuf_strides_[i] = 0;
+ }
+
+ num_outbufs_ = num_outbufs;
+ }
+}
+
+void MJpegDecoder::DestroyOutputBuffers() {
+ for (int i = 0; i < num_outbufs_; ++i) {
+ delete [] scanlines_[i];
+ delete [] databuf_[i];
+ }
+ delete [] scanlines_;
+ delete [] databuf_;
+ delete [] scanlines_sizes_;
+ delete [] databuf_strides_;
+ scanlines_ = NULL;
+ databuf_ = NULL;
+ scanlines_sizes_ = NULL;
+ databuf_strides_ = NULL;
+ num_outbufs_ = 0;
+}
+
+// Using JDCT_IFAST and disabling do_block_smoothing improve performance
+// substantially.
+bool MJpegDecoder::StartDecode() {
+ decompress_struct_->raw_data_out = TRUE;
+ decompress_struct_->dct_method = JDCT_IFAST; // JDCT_ISLOW is default
+ decompress_struct_->dither_mode = JDITHER_NONE;
+ decompress_struct_->do_fancy_upsampling = false; // Not applicable to 'raw'
+ decompress_struct_->enable_2pass_quant = false; // Only for buffered mode
+ decompress_struct_->do_block_smoothing = false; // blocky but fast
+
+ if (!jpeg_start_decompress(decompress_struct_)) {
+ // ERROR: Couldn't start JPEG decompressor.
+ return false;
+ }
+ return true;
+}
+
+bool MJpegDecoder::FinishDecode() {
+ // jpeglib considers it an error if we finish without decoding the whole
+ // image, so we call "abort" rather than "finish".
+ jpeg_abort_decompress(decompress_struct_);
+ return true;
+}
+
+void MJpegDecoder::SetScanlinePointers(uint8** data) {
+ for (int i = 0; i < num_outbufs_; ++i) {
+ uint8* data_i = data[i];
+ for (int j = 0; j < scanlines_sizes_[i]; ++j) {
+ scanlines_[i][j] = data_i;
+ data_i += GetComponentStride(i);
+ }
+ }
+}
+
+inline bool MJpegDecoder::DecodeImcuRow() {
+ return static_cast<unsigned int>(GetImageScanlinesPerImcuRow()) ==
+ jpeg_read_raw_data(decompress_struct_,
+ scanlines_,
+ GetImageScanlinesPerImcuRow());
+}
+
+// Helper function that recognizes the JPEG sub-sampling type.
+JpegSubsamplingType MJpegDecoder::JpegSubsamplingTypeHelper(
+ int* subsample_x, int* subsample_y, int number_of_components) {
+ if (number_of_components == 3) { // Color images.
+ if (subsample_x[0] == 1 && subsample_y[0] == 1 &&
+ subsample_x[1] == 2 && subsample_y[1] == 2 &&
+ subsample_x[2] == 2 && subsample_y[2] == 2) {
+ return kJpegYuv420;
+ } else if (subsample_x[0] == 1 && subsample_y[0] == 1 &&
+ subsample_x[1] == 2 && subsample_y[1] == 1 &&
+ subsample_x[2] == 2 && subsample_y[2] == 1) {
+ return kJpegYuv422;
+ } else if (subsample_x[0] == 1 && subsample_y[0] == 1 &&
+ subsample_x[1] == 1 && subsample_y[1] == 1 &&
+ subsample_x[2] == 1 && subsample_y[2] == 1) {
+ return kJpegYuv444;
+ }
+ } else if (number_of_components == 1) { // Grey-scale images.
+ if (subsample_x[0] == 1 && subsample_y[0] == 1) {
+ return kJpegYuv400;
+ }
+ }
+ return kJpegUnknown;
+}
+
+} // namespace libyuv
+#endif // HAVE_JPEG
+
diff --git a/chromium/third_party/libyuv/source/mjpeg_validate.cc b/chromium/third_party/libyuv/source/mjpeg_validate.cc
new file mode 100644
index 00000000000..1d0aeebd1f3
--- /dev/null
+++ b/chromium/third_party/libyuv/source/mjpeg_validate.cc
@@ -0,0 +1,41 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/mjpeg_decoder.h"
+
+namespace libyuv {
+
+// Helper function to validate that the JPEG appears intact.
+// TODO(fbarchard): Optimize case where SOI is found but EOI is not.
+bool ValidateJpeg(const uint8* sample, size_t sample_size) {
+ if (sample_size < 64) {
+ // ERROR: Invalid jpeg size: sample_size
+ return false;
+ }
+ if (sample[0] != 0xff || sample[1] != 0xd8) { // Start Of Image
+ // ERROR: Invalid jpeg initial start code
+ return false;
+ }
+ for (int i = static_cast<int>(sample_size) - 2; i > 1;) {
+ if (sample[i] != 0xd9) {
+ if (sample[i] == 0xff && sample[i + 1] == 0xd9) { // End Of Image
+ return true;
+ }
+ --i;
+ }
+ --i;
+ }
+  // ERROR: Jpeg end code (EOI) not found. Size sample_size
+ return false;
+}
+
+} // namespace libyuv
+
+
diff --git a/chromium/third_party/libyuv/source/planar_functions.cc b/chromium/third_party/libyuv/source/planar_functions.cc
new file mode 100644
index 00000000000..2f70331327c
--- /dev/null
+++ b/chromium/third_party/libyuv/source/planar_functions.cc
@@ -0,0 +1,1993 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/planar_functions.h"
+
+#include <string.h> // for memset()
+
+#include "libyuv/cpu_id.h"
+#ifdef HAVE_JPEG
+#include "libyuv/mjpeg_decoder.h"
+#endif
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Copy a plane of data
+LIBYUV_API
+void CopyPlane(const uint8* src_y, int src_stride_y,
+ uint8* dst_y, int dst_stride_y,
+ int width, int height) {
+ // Coalesce contiguous rows.
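+  // For example, if both strides equal width, the rows are contiguous in
+  // memory and the plane can be copied as a single row of width * height
+  // bytes, which is what the recursive call below does.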
+ if (src_stride_y == width &&
+ dst_stride_y == width) {
+ CopyPlane(src_y, 0,
+ dst_y, 0,
+ width * height, 1);
+ return;
+ }
+ void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
+#if defined(HAS_COPYROW_X86)
+ if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) {
+ CopyRow = CopyRow_X86;
+ }
+#endif
+#if defined(HAS_COPYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) &&
+ IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
+ IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+ CopyRow = CopyRow_SSE2;
+ }
+#endif
+#if defined(HAS_COPYROW_ERMS)
+ if (TestCpuFlag(kCpuHasERMS)) {
+ CopyRow = CopyRow_ERMS;
+ }
+#endif
+#if defined(HAS_COPYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) {
+ CopyRow = CopyRow_NEON;
+ }
+#endif
+#if defined(HAS_COPYROW_MIPS)
+ if (TestCpuFlag(kCpuHasMIPS)) {
+ CopyRow = CopyRow_MIPS;
+ }
+#endif
+
+ // Copy plane
+ for (int y = 0; y < height; ++y) {
+ CopyRow(src_y, dst_y, width);
+ src_y += src_stride_y;
+ dst_y += dst_stride_y;
+ }
+}
+
+// Copy I422.
+LIBYUV_API
+int I422Copy(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ if (!src_y || !src_u || !src_v ||
+ !dst_y || !dst_u || !dst_v ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_u = src_u + (height - 1) * src_stride_u;
+ src_v = src_v + (height - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+ int halfwidth = (width + 1) >> 1;
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, height);
+ CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, height);
+ return 0;
+}
+
+// Copy I444.
+LIBYUV_API
+int I444Copy(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ if (!src_y || !src_u || !src_v ||
+ !dst_y || !dst_u || !dst_v ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_u = src_u + (height - 1) * src_stride_u;
+ src_v = src_v + (height - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, width, height);
+ CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, width, height);
+ return 0;
+}
+
+// Copy I400.
+LIBYUV_API
+int I400ToI400(const uint8* src_y, int src_stride_y,
+ uint8* dst_y, int dst_stride_y,
+ int width, int height) {
+ if (!src_y || !dst_y || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_stride_y = -src_stride_y;
+ }
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ return 0;
+}
+
+// Convert I420 to I400.
+LIBYUV_API
+int I420ToI400(const uint8* src_y, int src_stride_y,
+ uint8*, int, // src_u
+ uint8*, int, // src_v
+ uint8* dst_y, int dst_stride_y,
+ int width, int height) {
+ if (!src_y || !dst_y || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_stride_y = -src_stride_y;
+ }
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ return 0;
+}
+
+// Mirror a plane of data
+void MirrorPlane(const uint8* src_y, int src_stride_y,
+ uint8* dst_y, int dst_stride_y,
+ int width, int height) {
+ void (*MirrorRow)(const uint8* src, uint8* dst, int width) = MirrorRow_C;
+#if defined(HAS_MIRRORROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) {
+ MirrorRow = MirrorRow_NEON;
+ }
+#endif
+#if defined(HAS_MIRRORROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16)) {
+ MirrorRow = MirrorRow_SSE2;
+ }
+#endif
+#if defined(HAS_MIRRORROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16) &&
+ IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
+ IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+ MirrorRow = MirrorRow_SSSE3;
+ }
+#endif
+#if defined(HAS_MIRRORROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 32)) {
+ MirrorRow = MirrorRow_AVX2;
+ }
+#endif
+
+ // Mirror plane
+ for (int y = 0; y < height; ++y) {
+ MirrorRow(src_y, dst_y, width);
+ src_y += src_stride_y;
+ dst_y += dst_stride_y;
+ }
+}
+
+// Convert YUY2 to I422.
+LIBYUV_API
+int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2;
+ src_stride_yuy2 = -src_stride_yuy2;
+ }
+ // Coalesce contiguous rows.
+ if (src_stride_yuy2 == width * 2 &&
+ dst_stride_y == width &&
+ dst_stride_u * 2 == width &&
+ dst_stride_v * 2 == width) {
+ return YUY2ToI422(src_yuy2, 0,
+ dst_y, 0,
+ dst_u, 0,
+ dst_v, 0,
+ width * height, 1);
+ }
+ void (*YUY2ToUV422Row)(const uint8* src_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix);
+ void (*YUY2ToYRow)(const uint8* src_yuy2,
+ uint8* dst_y, int pix);
+ YUY2ToYRow = YUY2ToYRow_C;
+ YUY2ToUV422Row = YUY2ToUV422Row_C;
+#if defined(HAS_YUY2TOYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
+ YUY2ToUV422Row = YUY2ToUV422Row_Any_SSE2;
+ YUY2ToYRow = YUY2ToYRow_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ YUY2ToUV422Row = YUY2ToUV422Row_Unaligned_SSE2;
+ YUY2ToYRow = YUY2ToYRow_Unaligned_SSE2;
+ if (IS_ALIGNED(src_yuy2, 16) && IS_ALIGNED(src_stride_yuy2, 16)) {
+ YUY2ToUV422Row = YUY2ToUV422Row_SSE2;
+ if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+ YUY2ToYRow = YUY2ToYRow_SSE2;
+ }
+ }
+ }
+ }
+#endif
+#if defined(HAS_YUY2TOYROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
+ YUY2ToUV422Row = YUY2ToUV422Row_Any_AVX2;
+ YUY2ToYRow = YUY2ToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ YUY2ToUV422Row = YUY2ToUV422Row_AVX2;
+ YUY2ToYRow = YUY2ToYRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_YUY2TOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ YUY2ToYRow = YUY2ToYRow_Any_NEON;
+ if (width >= 16) {
+ YUY2ToUV422Row = YUY2ToUV422Row_Any_NEON;
+ }
+ if (IS_ALIGNED(width, 16)) {
+ YUY2ToYRow = YUY2ToYRow_NEON;
+ YUY2ToUV422Row = YUY2ToUV422Row_NEON;
+ }
+ }
+#endif
+
+ for (int y = 0; y < height; ++y) {
+ YUY2ToUV422Row(src_yuy2, dst_u, dst_v, width);
+ YUY2ToYRow(src_yuy2, dst_y, width);
+ src_yuy2 += src_stride_yuy2;
+ dst_y += dst_stride_y;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ return 0;
+}
+
+// Convert UYVY to I422.
+LIBYUV_API
+int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy;
+ src_stride_uyvy = -src_stride_uyvy;
+ }
+ // Coalesce contiguous rows.
+ if (src_stride_uyvy == width * 2 &&
+ dst_stride_y == width &&
+ dst_stride_u * 2 == width &&
+ dst_stride_v * 2 == width) {
+ return UYVYToI422(src_uyvy, 0,
+ dst_y, 0,
+ dst_u, 0,
+ dst_v, 0,
+ width * height, 1);
+ }
+ void (*UYVYToUV422Row)(const uint8* src_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix);
+ void (*UYVYToYRow)(const uint8* src_uyvy,
+ uint8* dst_y, int pix);
+ UYVYToYRow = UYVYToYRow_C;
+ UYVYToUV422Row = UYVYToUV422Row_C;
+#if defined(HAS_UYVYTOYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
+ UYVYToUV422Row = UYVYToUV422Row_Any_SSE2;
+ UYVYToYRow = UYVYToYRow_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ UYVYToUV422Row = UYVYToUV422Row_Unaligned_SSE2;
+ UYVYToYRow = UYVYToYRow_Unaligned_SSE2;
+ if (IS_ALIGNED(src_uyvy, 16) && IS_ALIGNED(src_stride_uyvy, 16)) {
+ UYVYToUV422Row = UYVYToUV422Row_SSE2;
+ if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+ UYVYToYRow = UYVYToYRow_SSE2;
+ }
+ }
+ }
+ }
+#endif
+#if defined(HAS_UYVYTOYROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
+ UYVYToUV422Row = UYVYToUV422Row_Any_AVX2;
+ UYVYToYRow = UYVYToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ UYVYToUV422Row = UYVYToUV422Row_AVX2;
+ UYVYToYRow = UYVYToYRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_UYVYTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ UYVYToYRow = UYVYToYRow_Any_NEON;
+ if (width >= 16) {
+ UYVYToUV422Row = UYVYToUV422Row_Any_NEON;
+ }
+ if (IS_ALIGNED(width, 16)) {
+ UYVYToYRow = UYVYToYRow_NEON;
+ UYVYToUV422Row = UYVYToUV422Row_NEON;
+ }
+ }
+#endif
+
+ for (int y = 0; y < height; ++y) {
+ UYVYToUV422Row(src_uyvy, dst_u, dst_v, width);
+ UYVYToYRow(src_uyvy, dst_y, width);
+ src_uyvy += src_stride_uyvy;
+ dst_y += dst_stride_y;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ return 0;
+}
+
+// Mirror I400 with optional flipping
+LIBYUV_API
+int I400Mirror(const uint8* src_y, int src_stride_y,
+ uint8* dst_y, int dst_stride_y,
+ int width, int height) {
+ if (!src_y || !dst_y ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_stride_y = -src_stride_y;
+ }
+
+ MirrorPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ return 0;
+}
+
+// Mirror I420 with optional flipping
+LIBYUV_API
+int I420Mirror(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ if (!src_y || !src_u || !src_v || !dst_y || !dst_u || !dst_v ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ int halfheight = (height + 1) >> 1;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_u = src_u + (halfheight - 1) * src_stride_u;
+ src_v = src_v + (halfheight - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+ if (dst_y) {
+ MirrorPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ }
+ MirrorPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight);
+ MirrorPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight);
+ return 0;
+}
+
+// ARGB mirror.
+LIBYUV_API
+int ARGBMirror(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ if (!src_argb || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+
+ void (*ARGBMirrorRow)(const uint8* src, uint8* dst, int width) =
+ ARGBMirrorRow_C;
+#if defined(HAS_ARGBMIRRORROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4) &&
+ IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
+ IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ ARGBMirrorRow = ARGBMirrorRow_SSSE3;
+ }
+#endif
+#if defined(HAS_ARGBMIRRORROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 8)) {
+ ARGBMirrorRow = ARGBMirrorRow_AVX2;
+ }
+#endif
+#if defined(HAS_ARGBMIRRORROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 4)) {
+ ARGBMirrorRow = ARGBMirrorRow_NEON;
+ }
+#endif
+
+ // Mirror plane
+ for (int y = 0; y < height; ++y) {
+ ARGBMirrorRow(src_argb, dst_argb, width);
+ src_argb += src_stride_argb;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Get a blender that is optimized for the CPU, alignment and pixel count.
+// As there are 6 blenders to choose from, the caller should try to use
+// the same blend function for all pixels if possible.
+LIBYUV_API
+ARGBBlendRow GetARGBBlend() {
+ void (*ARGBBlendRow)(const uint8* src_argb, const uint8* src_argb1,
+ uint8* dst_argb, int width) = ARGBBlendRow_C;
+#if defined(HAS_ARGBBLENDROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ARGBBlendRow = ARGBBlendRow_SSSE3;
+ return ARGBBlendRow;
+ }
+#endif
+#if defined(HAS_ARGBBLENDROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ARGBBlendRow = ARGBBlendRow_SSE2;
+ }
+#endif
+#if defined(HAS_ARGBBLENDROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ ARGBBlendRow = ARGBBlendRow_NEON;
+ }
+#endif
+ return ARGBBlendRow;
+}
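+
+// Usage sketch (illustrative only; 'fg', 'bg', 'dst' and their strides are
+// hypothetical caller-owned ARGB buffers): fetch the row blender once and
+// reuse it for every row, as suggested above.
+//   ARGBBlendRow BlendRow = GetARGBBlend();
+//   for (int y = 0; y < height; ++y) {
+//     BlendRow(fg + y * fg_stride, bg + y * bg_stride,
+//              dst + y * dst_stride, width);
+//   }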
+
+// Alpha Blend 2 ARGB images and store to destination.
+LIBYUV_API
+int ARGBBlend(const uint8* src_argb0, int src_stride_argb0,
+ const uint8* src_argb1, int src_stride_argb1,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+ // Coalesce contiguous rows.
+ if (src_stride_argb0 == width * 4 &&
+ src_stride_argb1 == width * 4 &&
+ dst_stride_argb == width * 4) {
+ return ARGBBlend(src_argb0, 0,
+ src_argb1, 0,
+ dst_argb, 0,
+ width * height, 1);
+ }
+ void (*ARGBBlendRow)(const uint8* src_argb, const uint8* src_argb1,
+ uint8* dst_argb, int width) = GetARGBBlend();
+
+ for (int y = 0; y < height; ++y) {
+ ARGBBlendRow(src_argb0, src_argb1, dst_argb, width);
+ src_argb0 += src_stride_argb0;
+ src_argb1 += src_stride_argb1;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Multiply 2 ARGB images and store to destination.
+LIBYUV_API
+int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0,
+ const uint8* src_argb1, int src_stride_argb1,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+ // Coalesce contiguous rows.
+ if (src_stride_argb0 == width * 4 &&
+ src_stride_argb1 == width * 4 &&
+ dst_stride_argb == width * 4) {
+ return ARGBMultiply(src_argb0, 0,
+ src_argb1, 0,
+ dst_argb, 0,
+ width * height, 1);
+ }
+
+ void (*ARGBMultiplyRow)(const uint8* src0, const uint8* src1, uint8* dst,
+ int width) = ARGBMultiplyRow_C;
+#if defined(HAS_ARGBMULTIPLYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && width >= 4) {
+ ARGBMultiplyRow = ARGBMultiplyRow_Any_SSE2;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBMultiplyRow = ARGBMultiplyRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBMULTIPLYROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2) && width >= 8) {
+ ARGBMultiplyRow = ARGBMultiplyRow_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBMultiplyRow = ARGBMultiplyRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBMULTIPLYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ ARGBMultiplyRow = ARGBMultiplyRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBMultiplyRow = ARGBMultiplyRow_NEON;
+ }
+ }
+#endif
+
+ // Multiply plane
+ for (int y = 0; y < height; ++y) {
+ ARGBMultiplyRow(src_argb0, src_argb1, dst_argb, width);
+ src_argb0 += src_stride_argb0;
+ src_argb1 += src_stride_argb1;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Add 2 ARGB images and store to destination.
+LIBYUV_API
+int ARGBAdd(const uint8* src_argb0, int src_stride_argb0,
+ const uint8* src_argb1, int src_stride_argb1,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+ // Coalesce contiguous rows.
+ if (src_stride_argb0 == width * 4 &&
+ src_stride_argb1 == width * 4 &&
+ dst_stride_argb == width * 4) {
+ return ARGBAdd(src_argb0, 0,
+ src_argb1, 0,
+ dst_argb, 0,
+ width * height, 1);
+ }
+
+ void (*ARGBAddRow)(const uint8* src0, const uint8* src1, uint8* dst,
+ int width) = ARGBAddRow_C;
+#if defined(HAS_ARGBADDROW_SSE2) && defined(_MSC_VER)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ARGBAddRow = ARGBAddRow_SSE2;
+ }
+#endif
+#if defined(HAS_ARGBADDROW_SSE2) && !defined(_MSC_VER)
+ if (TestCpuFlag(kCpuHasSSE2) && width >= 4) {
+ ARGBAddRow = ARGBAddRow_Any_SSE2;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBAddRow = ARGBAddRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBADDROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2) && width >= 8) {
+ ARGBAddRow = ARGBAddRow_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAddRow = ARGBAddRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBADDROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ ARGBAddRow = ARGBAddRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAddRow = ARGBAddRow_NEON;
+ }
+ }
+#endif
+
+ // Add plane
+ for (int y = 0; y < height; ++y) {
+ ARGBAddRow(src_argb0, src_argb1, dst_argb, width);
+ src_argb0 += src_stride_argb0;
+ src_argb1 += src_stride_argb1;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Subtract 2 ARGB images and store to destination.
+LIBYUV_API
+int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0,
+ const uint8* src_argb1, int src_stride_argb1,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+ // Coalesce contiguous rows.
+ if (src_stride_argb0 == width * 4 &&
+ src_stride_argb1 == width * 4 &&
+ dst_stride_argb == width * 4) {
+ return ARGBSubtract(src_argb0, 0,
+ src_argb1, 0,
+ dst_argb, 0,
+ width * height, 1);
+ }
+
+ void (*ARGBSubtractRow)(const uint8* src0, const uint8* src1, uint8* dst,
+ int width) = ARGBSubtractRow_C;
+#if defined(HAS_ARGBSUBTRACTROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && width >= 4) {
+ ARGBSubtractRow = ARGBSubtractRow_Any_SSE2;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBSubtractRow = ARGBSubtractRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBSUBTRACTROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2) && width >= 8) {
+ ARGBSubtractRow = ARGBSubtractRow_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBSubtractRow = ARGBSubtractRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBSUBTRACTROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ ARGBSubtractRow = ARGBSubtractRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBSubtractRow = ARGBSubtractRow_NEON;
+ }
+ }
+#endif
+
+ // Subtract plane
+ for (int y = 0; y < height; ++y) {
+ ARGBSubtractRow(src_argb0, src_argb1, dst_argb, width);
+ src_argb0 += src_stride_argb0;
+ src_argb1 += src_stride_argb1;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Convert I422 to BGRA.
+LIBYUV_API
+int I422ToBGRA(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_bgra, int dst_stride_bgra,
+ int width, int height) {
+ if (!src_y || !src_u || !src_v ||
+ !dst_bgra ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_bgra = dst_bgra + (height - 1) * dst_stride_bgra;
+ dst_stride_bgra = -dst_stride_bgra;
+ }
+ // Coalesce contiguous rows.
+ if (src_stride_y == width &&
+ src_stride_u * 2 == width &&
+ src_stride_v * 2 == width &&
+ dst_stride_bgra == width * 4) {
+ return I422ToBGRA(src_y, 0,
+ src_u, 0,
+ src_v, 0,
+ dst_bgra, 0,
+ width * height, 1);
+ }
+ void (*I422ToBGRARow)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) = I422ToBGRARow_C;
+#if defined(HAS_I422TOBGRAROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToBGRARow = I422ToBGRARow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToBGRARow = I422ToBGRARow_NEON;
+ }
+ }
+#elif defined(HAS_I422TOBGRAROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ I422ToBGRARow = I422ToBGRARow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToBGRARow = I422ToBGRARow_Unaligned_SSSE3;
+ if (IS_ALIGNED(dst_bgra, 16) && IS_ALIGNED(dst_stride_bgra, 16)) {
+ I422ToBGRARow = I422ToBGRARow_SSSE3;
+ }
+ }
+ }
+#elif defined(HAS_I422TOBGRAROW_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
+ IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
+ IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
+ IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
+ IS_ALIGNED(dst_bgra, 4) && IS_ALIGNED(dst_stride_bgra, 4)) {
+ I422ToBGRARow = I422ToBGRARow_MIPS_DSPR2;
+ }
+#endif
+
+ for (int y = 0; y < height; ++y) {
+ I422ToBGRARow(src_y, src_u, src_v, dst_bgra, width);
+ dst_bgra += dst_stride_bgra;
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ return 0;
+}
+
+// Convert I422 to ABGR.
+LIBYUV_API
+int I422ToABGR(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_abgr, int dst_stride_abgr,
+ int width, int height) {
+ if (!src_y || !src_u || !src_v ||
+ !dst_abgr ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_abgr = dst_abgr + (height - 1) * dst_stride_abgr;
+ dst_stride_abgr = -dst_stride_abgr;
+ }
+ // Coalesce contiguous rows.
+ if (src_stride_y == width &&
+ src_stride_u * 2 == width &&
+ src_stride_v * 2 == width &&
+ dst_stride_abgr == width * 4) {
+ return I422ToABGR(src_y, 0,
+ src_u, 0,
+ src_v, 0,
+ dst_abgr, 0,
+ width * height, 1);
+ }
+ void (*I422ToABGRRow)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) = I422ToABGRRow_C;
+#if defined(HAS_I422TOABGRROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToABGRRow = I422ToABGRRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToABGRRow = I422ToABGRRow_NEON;
+ }
+ }
+#elif defined(HAS_I422TOABGRROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ I422ToABGRRow = I422ToABGRRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToABGRRow = I422ToABGRRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(dst_abgr, 16) && IS_ALIGNED(dst_stride_abgr, 16)) {
+ I422ToABGRRow = I422ToABGRRow_SSSE3;
+ }
+ }
+ }
+#endif
+
+ for (int y = 0; y < height; ++y) {
+ I422ToABGRRow(src_y, src_u, src_v, dst_abgr, width);
+ dst_abgr += dst_stride_abgr;
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ return 0;
+}
+
+// Convert I422 to RGBA.
+LIBYUV_API
+int I422ToRGBA(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_rgba, int dst_stride_rgba,
+ int width, int height) {
+ if (!src_y || !src_u || !src_v ||
+ !dst_rgba ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba;
+ dst_stride_rgba = -dst_stride_rgba;
+ }
+ // Coalesce contiguous rows.
+ if (src_stride_y == width &&
+ src_stride_u * 2 == width &&
+ src_stride_v * 2 == width &&
+ dst_stride_rgba == width * 4) {
+ return I422ToRGBA(src_y, 0,
+ src_u, 0,
+ src_v, 0,
+ dst_rgba, 0,
+ width * height, 1);
+ }
+ void (*I422ToRGBARow)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) = I422ToRGBARow_C;
+#if defined(HAS_I422TORGBAROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ I422ToRGBARow = I422ToRGBARow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToRGBARow = I422ToRGBARow_NEON;
+ }
+ }
+#elif defined(HAS_I422TORGBAROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ I422ToRGBARow = I422ToRGBARow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGBARow = I422ToRGBARow_Unaligned_SSSE3;
+ if (IS_ALIGNED(dst_rgba, 16) && IS_ALIGNED(dst_stride_rgba, 16)) {
+ I422ToRGBARow = I422ToRGBARow_SSSE3;
+ }
+ }
+ }
+#endif
+
+ for (int y = 0; y < height; ++y) {
+ I422ToRGBARow(src_y, src_u, src_v, dst_rgba, width);
+ dst_rgba += dst_stride_rgba;
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ return 0;
+}
+
+// Convert NV12 to RGB565.
+LIBYUV_API
+int NV12ToRGB565(const uint8* src_y, int src_stride_y,
+ const uint8* src_uv, int src_stride_uv,
+ uint8* dst_rgb565, int dst_stride_rgb565,
+ int width, int height) {
+ if (!src_y || !src_uv || !dst_rgb565 ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
+ dst_stride_rgb565 = -dst_stride_rgb565;
+ }
+ void (*NV12ToRGB565Row)(const uint8* y_buf,
+ const uint8* uv_buf,
+ uint8* rgb_buf,
+ int width) = NV12ToRGB565Row_C;
+#if defined(HAS_NV12TORGB565ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 8 && width * 4 <= kMaxStride) {
+ NV12ToRGB565Row = NV12ToRGB565Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ NV12ToRGB565Row = NV12ToRGB565Row_SSSE3;
+ }
+ }
+#elif defined(HAS_NV12TORGB565ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ NV12ToRGB565Row = NV12ToRGB565Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ NV12ToRGB565Row = NV12ToRGB565Row_NEON;
+ }
+ }
+#endif
+
+ for (int y = 0; y < height; ++y) {
+ NV12ToRGB565Row(src_y, src_uv, dst_rgb565, width);
+ dst_rgb565 += dst_stride_rgb565;
+ src_y += src_stride_y;
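+    // The UV plane is half height, so advance it only every other row.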
+ if (y & 1) {
+ src_uv += src_stride_uv;
+ }
+ }
+ return 0;
+}
+
+// Convert NV21 to RGB565.
+LIBYUV_API
+int NV21ToRGB565(const uint8* src_y, int src_stride_y,
+ const uint8* src_vu, int src_stride_vu,
+ uint8* dst_rgb565, int dst_stride_rgb565,
+ int width, int height) {
+ if (!src_y || !src_vu || !dst_rgb565 ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
+ dst_stride_rgb565 = -dst_stride_rgb565;
+ }
+ void (*NV21ToRGB565Row)(const uint8* y_buf,
+ const uint8* src_vu,
+ uint8* rgb_buf,
+ int width) = NV21ToRGB565Row_C;
+#if defined(HAS_NV21TORGB565ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 8 && width * 4 <= kMaxStride) {
+ NV21ToRGB565Row = NV21ToRGB565Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ NV21ToRGB565Row = NV21ToRGB565Row_SSSE3;
+ }
+ }
+#elif defined(HAS_NV21TORGB565ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ NV21ToRGB565Row = NV21ToRGB565Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ NV21ToRGB565Row = NV21ToRGB565Row_NEON;
+ }
+ }
+#endif
+
+ for (int y = 0; y < height; ++y) {
+ NV21ToRGB565Row(src_y, src_vu, dst_rgb565, width);
+ dst_rgb565 += dst_stride_rgb565;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_vu += src_stride_vu;
+ }
+ }
+ return 0;
+}
+
+LIBYUV_API
+void SetPlane(uint8* dst_y, int dst_stride_y,
+ int width, int height,
+ uint32 value) {
+ // Coalesce contiguous rows.
+ if (dst_stride_y == width) {
+ SetPlane(dst_y, 0,
+ width * height, 1,
+ value);
+ return;
+ }
+ void (*SetRow)(uint8* dst, uint32 value, int pix) = SetRow_C;
+#if defined(HAS_SETROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) &&
+ IS_ALIGNED(width, 16) &&
+ IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+ SetRow = SetRow_NEON;
+ }
+#endif
+#if defined(HAS_SETROW_X86)
+ if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) {
+ SetRow = SetRow_X86;
+ }
+#endif
+
+ uint32 v32 = value | (value << 8) | (value << 16) | (value << 24);
+ // Set plane
+ for (int y = 0; y < height; ++y) {
+ SetRow(dst_y, v32, width);
+ dst_y += dst_stride_y;
+ }
+}
+
+// Draw a rectangle into I420
+LIBYUV_API
+int I420Rect(uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int x, int y,
+ int width, int height,
+ int value_y, int value_u, int value_v) {
+ if (!dst_y || !dst_u || !dst_v ||
+ width <= 0 || height <= 0 ||
+ x < 0 || y < 0 ||
+ value_y < 0 || value_y > 255 ||
+ value_u < 0 || value_u > 255 ||
+ value_v < 0 || value_v > 255) {
+ return -1;
+ }
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+ uint8* start_y = dst_y + y * dst_stride_y + x;
+ uint8* start_u = dst_u + (y / 2) * dst_stride_u + (x / 2);
+ uint8* start_v = dst_v + (y / 2) * dst_stride_v + (x / 2);
+
+ SetPlane(start_y, dst_stride_y, width, height, value_y);
+ SetPlane(start_u, dst_stride_u, halfwidth, halfheight, value_u);
+ SetPlane(start_v, dst_stride_v, halfwidth, halfheight, value_v);
+ return 0;
+}
+
+// Draw a rectangle into ARGB
+LIBYUV_API
+int ARGBRect(uint8* dst_argb, int dst_stride_argb,
+ int dst_x, int dst_y,
+ int width, int height,
+ uint32 value) {
+ if (!dst_argb ||
+ width <= 0 || height <= 0 ||
+ dst_x < 0 || dst_y < 0) {
+ return -1;
+ }
+ // Coalesce contiguous rows.
+ if (dst_stride_argb == width * 4) {
+ return ARGBRect(dst_argb, dst_stride_argb,
+ dst_x, dst_y,
+ width * height, 1, value);
+ }
+ uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
+#if defined(HAS_SETROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16) &&
+ IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ ARGBSetRows_NEON(dst, value, width, dst_stride_argb, height);
+ return 0;
+ }
+#endif
+#if defined(HAS_SETROW_X86)
+ if (TestCpuFlag(kCpuHasX86)) {
+ ARGBSetRows_X86(dst, value, width, dst_stride_argb, height);
+ return 0;
+ }
+#endif
+ ARGBSetRows_C(dst, value, width, dst_stride_argb, height);
+ return 0;
+}
+
+// Convert unattenuated ARGB to preattenuated ARGB.
+// An unattenuated ARGB alpha blend uses the formula
+// p = a * f + (1 - a) * b
+// where
+// p is output pixel
+// f is foreground pixel
+// b is background pixel
+// a is alpha value from foreground pixel
+// A preattenuated ARGB alpha blend uses the formula
+// p = f + (1 - a) * b
+// where
+// f is foreground pixel premultiplied by alpha
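+//
+// Worked example (illustrative, ignoring rounding): with alpha a = 128/255
+// and a foreground channel f = 200, attenuation stores roughly
+// 200 * 128 / 255 = 100, so later blends can skip the a * f multiply.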
+
+LIBYUV_API
+int ARGBAttenuate(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ if (!src_argb || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ // Coalesce contiguous rows.
+ if (src_stride_argb == width * 4 &&
+ dst_stride_argb == width * 4) {
+ return ARGBAttenuate(src_argb, 0,
+ dst_argb, 0,
+ width * height, 1);
+ }
+ void (*ARGBAttenuateRow)(const uint8* src_argb, uint8* dst_argb,
+ int width) = ARGBAttenuateRow_C;
+#if defined(HAS_ARGBATTENUATEROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && width >= 4 &&
+ IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
+ IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_SSE2;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 4 &&
+ IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
+ IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2) && width >= 8) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBAttenuateRow = ARGBAttenuateRow_NEON;
+ }
+ }
+#endif
+
+ for (int y = 0; y < height; ++y) {
+ ARGBAttenuateRow(src_argb, dst_argb, width);
+ src_argb += src_stride_argb;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Convert preattenuated ARGB to unattenuated ARGB.
+LIBYUV_API
+int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ if (!src_argb || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ // Coalesce contiguous rows.
+ if (src_stride_argb == width * 4 &&
+ dst_stride_argb == width * 4) {
+ return ARGBUnattenuate(src_argb, 0,
+ dst_argb, 0,
+ width * height, 1);
+ }
+ void (*ARGBUnattenuateRow)(const uint8* src_argb, uint8* dst_argb,
+ int width) = ARGBUnattenuateRow_C;
+#if defined(HAS_ARGBUNATTENUATEROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && width >= 4 &&
+ IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
+ IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ ARGBUnattenuateRow = ARGBUnattenuateRow_Any_SSE2;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBUnattenuateRow = ARGBUnattenuateRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBUNATTENUATEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2) && width >= 8) {
+ ARGBUnattenuateRow = ARGBUnattenuateRow_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBUnattenuateRow = ARGBUnattenuateRow_AVX2;
+ }
+ }
+#endif
+// TODO(fbarchard): Neon version.
+
+ for (int y = 0; y < height; ++y) {
+ ARGBUnattenuateRow(src_argb, dst_argb, width);
+ src_argb += src_stride_argb;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Convert ARGB to Grayed ARGB.
+LIBYUV_API
+int ARGBGrayTo(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ if (!src_argb || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ // Coalesce contiguous rows.
+ if (src_stride_argb == width * 4 &&
+ dst_stride_argb == width * 4) {
+ return ARGBGrayTo(src_argb, 0,
+ dst_argb, 0,
+ width * height, 1);
+ }
+ void (*ARGBGrayRow)(const uint8* src_argb, uint8* dst_argb,
+ int width) = ARGBGrayRow_C;
+#if defined(HAS_ARGBGRAYROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8) &&
+ IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
+ IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ ARGBGrayRow = ARGBGrayRow_SSSE3;
+ }
+#elif defined(HAS_ARGBGRAYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
+ ARGBGrayRow = ARGBGrayRow_NEON;
+ }
+#endif
+
+ for (int y = 0; y < height; ++y) {
+ ARGBGrayRow(src_argb, dst_argb, width);
+ src_argb += src_stride_argb;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Make a rectangle of ARGB gray scale.
+LIBYUV_API
+int ARGBGray(uint8* dst_argb, int dst_stride_argb,
+ int dst_x, int dst_y,
+ int width, int height) {
+ if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0) {
+ return -1;
+ }
+ // Coalesce contiguous rows.
+ if (dst_stride_argb == width * 4) {
+ return ARGBGray(dst_argb, dst_stride_argb,
+ dst_x, dst_y,
+ width * height, 1);
+ }
+ void (*ARGBGrayRow)(const uint8* src_argb, uint8* dst_argb,
+ int width) = ARGBGrayRow_C;
+#if defined(HAS_ARGBGRAYROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8) &&
+ IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ ARGBGrayRow = ARGBGrayRow_SSSE3;
+ }
+#elif defined(HAS_ARGBGRAYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
+ ARGBGrayRow = ARGBGrayRow_NEON;
+ }
+#endif
+ uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
+ for (int y = 0; y < height; ++y) {
+ ARGBGrayRow(dst, dst, width);
+ dst += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Make a rectangle of ARGB Sepia tone.
+LIBYUV_API
+int ARGBSepia(uint8* dst_argb, int dst_stride_argb,
+ int dst_x, int dst_y, int width, int height) {
+ if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0) {
+ return -1;
+ }
+ // Coalesce contiguous rows.
+ if (dst_stride_argb == width * 4) {
+ return ARGBSepia(dst_argb, dst_stride_argb,
+ dst_x, dst_y,
+ width * height, 1);
+ }
+ void (*ARGBSepiaRow)(uint8* dst_argb, int width) = ARGBSepiaRow_C;
+#if defined(HAS_ARGBSEPIAROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8) &&
+ IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ ARGBSepiaRow = ARGBSepiaRow_SSSE3;
+ }
+#elif defined(HAS_ARGBSEPIAROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
+ ARGBSepiaRow = ARGBSepiaRow_NEON;
+ }
+#endif
+ uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
+ for (int y = 0; y < height; ++y) {
+ ARGBSepiaRow(dst, width);
+ dst += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Apply a 4x3 color matrix to each ARGB pixel.
+LIBYUV_API
+int ARGBColorMatrix(uint8* dst_argb, int dst_stride_argb,
+ const int8* matrix_argb,
+ int dst_x, int dst_y, int width, int height) {
+ if (!dst_argb || !matrix_argb || width <= 0 || height <= 0 ||
+ dst_x < 0 || dst_y < 0) {
+ return -1;
+ }
+ // Coalesce contiguous rows.
+ if (dst_stride_argb == width * 4) {
+ return ARGBColorMatrix(dst_argb, dst_stride_argb,
+ matrix_argb,
+ dst_x, dst_y,
+ width * height, 1);
+ }
+ void (*ARGBColorMatrixRow)(uint8* dst_argb, const int8* matrix_argb,
+ int width) = ARGBColorMatrixRow_C;
+#if defined(HAS_ARGBCOLORMATRIXROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8) &&
+ IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ ARGBColorMatrixRow = ARGBColorMatrixRow_SSSE3;
+ }
+#elif defined(HAS_ARGBCOLORMATRIXROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
+ ARGBColorMatrixRow = ARGBColorMatrixRow_NEON;
+ }
+#endif
+ uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
+ for (int y = 0; y < height; ++y) {
+ ARGBColorMatrixRow(dst, matrix_argb, width);
+ dst += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Apply a color table to each ARGB pixel.
+// Table contains 256 ARGB values.
+LIBYUV_API
+int ARGBColorTable(uint8* dst_argb, int dst_stride_argb,
+ const uint8* table_argb,
+ int dst_x, int dst_y, int width, int height) {
+ if (!dst_argb || !table_argb || width <= 0 || height <= 0 ||
+ dst_x < 0 || dst_y < 0) {
+ return -1;
+ }
+ // Coalesce contiguous rows.
+ if (dst_stride_argb == width * 4) {
+ return ARGBColorTable(dst_argb, dst_stride_argb,
+ table_argb,
+ dst_x, dst_y,
+ width * height, 1);
+ }
+ void (*ARGBColorTableRow)(uint8* dst_argb, const uint8* table_argb,
+ int width) = ARGBColorTableRow_C;
+#if defined(HAS_ARGBCOLORTABLEROW_X86)
+ if (TestCpuFlag(kCpuHasX86)) {
+ ARGBColorTableRow = ARGBColorTableRow_X86;
+ }
+#endif
+ uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
+ for (int y = 0; y < height; ++y) {
+ ARGBColorTableRow(dst, table_argb, width);
+ dst += dst_stride_argb;
+ }
+ return 0;
+}
+
+// ARGBQuantize is used to posterize art.
+// e.g. rgb / qvalue * qvalue + qvalue / 2
+// But the low-level rows implement it efficiently with 3 parameters, and
+// could be used for other high-level operations.
+// dst_argb[0] = (b * scale >> 16) * interval_size + interval_offset;
+// where scale is 1 / interval_size as a fixed point value.
+// The divide is replaced with a multiply by a reciprocal fixed-point value.
+// Caveat - although SSE2 saturates, the C function does not and should be used
+// with care if doing anything but quantization.
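+//
+// Worked example (illustrative): with interval_size = 32,
+// interval_offset = 16 and scale = 65536 / 32 = 2048, an input channel of
+// 200 becomes (200 * 2048 >> 16) * 32 + 16 = 6 * 32 + 16 = 208.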
+LIBYUV_API
+int ARGBQuantize(uint8* dst_argb, int dst_stride_argb,
+ int scale, int interval_size, int interval_offset,
+ int dst_x, int dst_y, int width, int height) {
+ if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0 ||
+ interval_size < 1 || interval_size > 255) {
+ return -1;
+ }
+ // Coalesce contiguous rows.
+ if (dst_stride_argb == width * 4) {
+ return ARGBQuantize(dst_argb, dst_stride_argb,
+ scale, interval_size, interval_offset,
+ dst_x, dst_y,
+ width * height, 1);
+ }
+ void (*ARGBQuantizeRow)(uint8* dst_argb, int scale, int interval_size,
+ int interval_offset, int width) = ARGBQuantizeRow_C;
+#if defined(HAS_ARGBQUANTIZEROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4) &&
+ IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ ARGBQuantizeRow = ARGBQuantizeRow_SSE2;
+ }
+#elif defined(HAS_ARGBQUANTIZEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
+ ARGBQuantizeRow = ARGBQuantizeRow_NEON;
+ }
+#endif
+ uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
+ for (int y = 0; y < height; ++y) {
+ ARGBQuantizeRow(dst, scale, interval_size, interval_offset, width);
+ dst += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Computes a table of cumulative sums for the image, where each value is the
+// sum of all values above and to the left of the entry. Used by ARGBBlur.
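+// For example (shown single-channel for clarity): for the 2x2 input
+//   1 2
+//   3 4
+// the cumulative sums are
+//   1  3
+//   4 10
+// (the actual table stores 4 sums per pixel, one per ARGB channel).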
+LIBYUV_API
+int ARGBComputeCumulativeSum(const uint8* src_argb, int src_stride_argb,
+ int32* dst_cumsum, int dst_stride32_cumsum,
+ int width, int height) {
+ if (!dst_cumsum || !src_argb || width <= 0 || height <= 0) {
+ return -1;
+ }
+ void (*ComputeCumulativeSumRow)(const uint8* row, int32* cumsum,
+ const int32* previous_cumsum, int width) = ComputeCumulativeSumRow_C;
+#if defined(HAS_CUMULATIVESUMTOAVERAGEROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ComputeCumulativeSumRow = ComputeCumulativeSumRow_SSE2;
+ }
+#endif
+ memset(dst_cumsum, 0, width * sizeof(dst_cumsum[0]) * 4); // 4 int per pixel.
+ int32* previous_cumsum = dst_cumsum;
+ for (int y = 0; y < height; ++y) {
+ ComputeCumulativeSumRow(src_argb, dst_cumsum, previous_cumsum, width);
+ previous_cumsum = dst_cumsum;
+ dst_cumsum += dst_stride32_cumsum;
+ src_argb += src_stride_argb;
+ }
+ return 0;
+}
+
+// Blur ARGB image.
+// The caller should allocate a CumulativeSum table of width * height * 16
+// bytes, aligned to a 16-byte boundary. The height can be radius * 2 + 2 to
+// save memory, as the buffer is treated as circular.
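+//
+// Usage sketch (illustrative only; 'src' and 'dst' are hypothetical
+// caller-owned ARGB buffers, and a real caller must also ensure the
+// allocation is 16-byte aligned):
+//   // 16 bytes (4 x int32) per pixel, radius * 2 + 2 rows.
+//   std::vector<int32> cumsum(width * 4 * (radius * 2 + 2));
+//   ARGBBlur(src, width * 4, dst, width * 4,
+//            &cumsum[0], width * 4, width, height, radius);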
+LIBYUV_API
+int ARGBBlur(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ int32* dst_cumsum, int dst_stride32_cumsum,
+ int width, int height, int radius) {
+ if (!src_argb || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ void (*ComputeCumulativeSumRow)(const uint8* row, int32* cumsum,
+ const int32* previous_cumsum, int width) = ComputeCumulativeSumRow_C;
+  void (*CumulativeSumToAverageRow)(const int32* topleft, const int32* botleft,
+ int width, int area, uint8* dst, int count) = CumulativeSumToAverageRow_C;
+#if defined(HAS_CUMULATIVESUMTOAVERAGEROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ComputeCumulativeSumRow = ComputeCumulativeSumRow_SSE2;
+    CumulativeSumToAverageRow = CumulativeSumToAverageRow_SSE2;
+ }
+#endif
+  // Compute enough CumulativeSum for the first row to be blurred. After this,
+  // one row of CumulativeSum is updated at a time.
+ ARGBComputeCumulativeSum(src_argb, src_stride_argb,
+ dst_cumsum, dst_stride32_cumsum,
+ width, radius);
+
+ src_argb = src_argb + radius * src_stride_argb;
+ int32* cumsum_bot_row = &dst_cumsum[(radius - 1) * dst_stride32_cumsum];
+
+ const int32* max_cumsum_bot_row =
+ &dst_cumsum[(radius * 2 + 2) * dst_stride32_cumsum];
+ const int32* cumsum_top_row = &dst_cumsum[0];
+
+ for (int y = 0; y < height; ++y) {
+ int top_y = ((y - radius - 1) >= 0) ? (y - radius - 1) : 0;
+ int bot_y = ((y + radius) < height) ? (y + radius) : (height - 1);
+ int area = radius * (bot_y - top_y);
+
+ // Increment cumsum_top_row pointer with circular buffer wrap around.
+ if (top_y) {
+ cumsum_top_row += dst_stride32_cumsum;
+ if (cumsum_top_row >= max_cumsum_bot_row) {
+ cumsum_top_row = dst_cumsum;
+ }
+ }
+ // Increment cumsum_bot_row pointer with circular buffer wrap around and
+ // then fill in a row of CumulativeSum.
+ if ((y + radius) < height) {
+ const int32* prev_cumsum_bot_row = cumsum_bot_row;
+ cumsum_bot_row += dst_stride32_cumsum;
+ if (cumsum_bot_row >= max_cumsum_bot_row) {
+ cumsum_bot_row = dst_cumsum;
+ }
+ ComputeCumulativeSumRow(src_argb, cumsum_bot_row, prev_cumsum_bot_row,
+ width);
+ src_argb += src_stride_argb;
+ }
+
+ // Left clipped.
+ int boxwidth = radius * 4;
+ int x;
+ for (x = 0; x < radius + 1; ++x) {
+      CumulativeSumToAverageRow(cumsum_top_row, cumsum_bot_row,
+ boxwidth, area, &dst_argb[x * 4], 1);
+ area += (bot_y - top_y);
+ boxwidth += 4;
+ }
+
+ // Middle unclipped.
+ int n = (width - 1) - radius - x + 1;
+    CumulativeSumToAverageRow(cumsum_top_row, cumsum_bot_row,
+ boxwidth, area, &dst_argb[x * 4], n);
+
+ // Right clipped.
+ for (x += n; x <= width - 1; ++x) {
+ area -= (bot_y - top_y);
+ boxwidth -= 4;
+      CumulativeSumToAverageRow(cumsum_top_row + (x - radius - 1) * 4,
+ cumsum_bot_row + (x - radius - 1) * 4,
+ boxwidth, area, &dst_argb[x * 4], 1);
+ }
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Multiply ARGB image by a specified ARGB value.
+LIBYUV_API
+int ARGBShade(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height, uint32 value) {
+ if (!src_argb || !dst_argb || width <= 0 || height == 0 || value == 0u) {
+ return -1;
+ }
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ // Coalesce contiguous rows.
+ if (src_stride_argb == width * 4 &&
+ dst_stride_argb == width * 4) {
+ return ARGBShade(src_argb, 0,
+ dst_argb, 0,
+ width * height, 1,
+ value);
+ }
+ void (*ARGBShadeRow)(const uint8* src_argb, uint8* dst_argb,
+ int width, uint32 value) = ARGBShadeRow_C;
+#if defined(HAS_ARGBSHADEROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4) &&
+ IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
+ IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ ARGBShadeRow = ARGBShadeRow_SSE2;
+ }
+#elif defined(HAS_ARGBSHADEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
+ ARGBShadeRow = ARGBShadeRow_NEON;
+ }
+#endif
+
+ for (int y = 0; y < height; ++y) {
+ ARGBShadeRow(src_argb, dst_argb, width, value);
+ src_argb += src_stride_argb;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Interpolate 2 ARGB images by a specified amount (0 to 255).
+// TODO(fbarchard): Consider selecting a specialization for interpolation so
+// row function doesn't need to check interpolation on each row.
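+// For example, interpolation = 0 returns src_argb0 unchanged, 255 returns
+// (approximately) src_argb1, and 128 gives a near-equal mix of the two.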
+LIBYUV_API
+int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0,
+ const uint8* src_argb1, int src_stride_argb1,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height, int interpolation) {
+ if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+ // Coalesce contiguous rows.
+ if (src_stride_argb0 == width * 4 &&
+ src_stride_argb1 == width * 4 &&
+ dst_stride_argb == width * 4) {
+ return ARGBInterpolate(src_argb0, 0,
+ src_argb1, 0,
+ dst_argb, 0,
+ width * height, 1,
+ interpolation);
+ }
+ void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) = InterpolateRow_C;
+#if defined(HAS_INTERPOLATEROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && width >= 4) {
+ InterpolateRow = InterpolateRow_Any_SSE2;
+ if (IS_ALIGNED(width, 4)) {
+ InterpolateRow = InterpolateRow_Unaligned_SSE2;
+ if (IS_ALIGNED(src_argb0, 16) && IS_ALIGNED(src_stride_argb0, 16) &&
+ IS_ALIGNED(src_argb1, 16) && IS_ALIGNED(src_stride_argb1, 16) &&
+ IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ InterpolateRow = InterpolateRow_SSE2;
+ }
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 4) {
+ InterpolateRow = InterpolateRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 4)) {
+ InterpolateRow = InterpolateRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(src_argb0, 16) && IS_ALIGNED(src_stride_argb0, 16) &&
+ IS_ALIGNED(src_argb1, 16) && IS_ALIGNED(src_stride_argb1, 16) &&
+ IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ InterpolateRow = InterpolateRow_SSSE3;
+ }
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 4) {
+ InterpolateRow = InterpolateRow_Any_NEON;
+ if (IS_ALIGNED(width, 4)) {
+ InterpolateRow = InterpolateRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROWS_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) && width >= 1 &&
+ IS_ALIGNED(src_argb0, 4) && IS_ALIGNED(src_stride_argb0, 4) &&
+ IS_ALIGNED(src_argb1, 4) && IS_ALIGNED(src_stride_argb1, 4) &&
+ IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
+    InterpolateRow = InterpolateRow_MIPS_DSPR2;
+ }
+#endif
+
+ for (int y = 0; y < height; ++y) {
+ InterpolateRow(dst_argb, src_argb0, src_argb1 - src_argb0,
+ width * 4, interpolation);
+ src_argb0 += src_stride_argb0;
+ src_argb1 += src_stride_argb1;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Shuffle ARGB channel order. e.g. BGRA to ARGB.
+LIBYUV_API
+int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra,
+ uint8* dst_argb, int dst_stride_argb,
+ const uint8* shuffler, int width, int height) {
+ if (!src_bgra || !dst_argb ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_bgra = src_bgra + (height - 1) * src_stride_bgra;
+ src_stride_bgra = -src_stride_bgra;
+ }
+ // Coalesce contiguous rows.
+ if (src_stride_bgra == width * 4 &&
+ dst_stride_argb == width * 4) {
+ return ARGBShuffle(src_bgra, 0,
+ dst_argb, 0,
+ shuffler,
+ width * height, 1);
+ }
+ void (*ARGBShuffleRow)(const uint8* src_bgra, uint8* dst_argb,
+ const uint8* shuffler, int pix) = ARGBShuffleRow_C;
+#if defined(HAS_ARGBSHUFFLEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ ARGBShuffleRow = ARGBShuffleRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBShuffleRow = ARGBShuffleRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(src_bgra, 16) && IS_ALIGNED(src_stride_bgra, 16) &&
+ IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ ARGBShuffleRow = ARGBShuffleRow_SSSE3;
+ }
+ }
+ }
+#endif
+#if defined(HAS_ARGBSHUFFLEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2) && width >= 16) {
+ ARGBShuffleRow = ARGBShuffleRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBShuffleRow = ARGBShuffleRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBSHUFFLEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 4) {
+ ARGBShuffleRow = ARGBShuffleRow_Any_NEON;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBShuffleRow = ARGBShuffleRow_NEON;
+ }
+ }
+#endif
+
+ for (int y = 0; y < height; ++y) {
+ ARGBShuffleRow(src_bgra, dst_argb, shuffler, width);
+ src_bgra += src_stride_bgra;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Sobel ARGB effect.
+LIBYUV_API
+int ARGBSobel(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ if (!src_argb || !dst_argb ||
+ width <= 0 || height == 0 || width > (kMaxStride / 4)) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+  // ARGBToBayer is used to select the G channel from ARGB.
+ void (*ARGBToBayerRow)(const uint8* src_argb, uint8* dst_bayer,
+ uint32 selector, int pix) = ARGBToBayerRow_C;
+#if defined(HAS_ARGBTOBAYERROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 8 &&
+ IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
+ ARGBToBayerRow = ARGBToBayerRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToBayerRow = ARGBToBayerRow_SSSE3;
+ }
+ }
+#elif defined(HAS_ARGBTOBAYERROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ ARGBToBayerRow = ARGBToBayerRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToBayerRow = ARGBToBayerRow_NEON;
+ }
+ }
+#endif
+ void (*SobelYRow)(const uint8* src_y0, const uint8* src_y1,
+ uint8* dst_sobely, int width) = SobelYRow_C;
+#if defined(HAS_SOBELYROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ SobelYRow = SobelYRow_SSSE3;
+ }
+#endif
+#if defined(HAS_SOBELYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ SobelYRow = SobelYRow_NEON;
+ }
+#endif
+ void (*SobelXRow)(const uint8* src_y0, const uint8* src_y1,
+ const uint8* src_y2, uint8* dst_sobely, int width) =
+ SobelXRow_C;
+#if defined(HAS_SOBELXROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ SobelXRow = SobelXRow_SSSE3;
+ }
+#endif
+#if defined(HAS_SOBELXROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ SobelXRow = SobelXRow_NEON;
+ }
+#endif
+ void (*SobelRow)(const uint8* src_sobelx, const uint8* src_sobely,
+ uint8* dst_argb, int width) = SobelRow_C;
+#if defined(HAS_SOBELROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) &&
+ IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ SobelRow = SobelRow_SSE2;
+ }
+#endif
+#if defined(HAS_SOBELROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
+ SobelRow = SobelRow_NEON;
+ }
+#endif
+
+ const int kEdge = 16; // Extra pixels at start of row for extrude/align.
+ SIMD_ALIGNED(uint8 row_y[(kMaxStride / 4 + kEdge) * 3 + kEdge]);
+ SIMD_ALIGNED(uint8 row_sobelx[kMaxStride / 4]);
+ SIMD_ALIGNED(uint8 row_sobely[kMaxStride / 4]);
+
+ // Convert first row.
+ uint8* row_y0 = row_y + kEdge;
+ uint8* row_y1 = row_y0 + kMaxStride / 4;
+ uint8* row_y2 = row_y1 + kMaxStride / 4;
+ ARGBToBayerRow(src_argb, row_y0, 0x0d090501, width);
+ row_y0[-1] = row_y0[0];
+ row_y0[width] = row_y0[width - 1];
+ ARGBToBayerRow(src_argb, row_y1, 0x0d090501, width);
+ row_y1[-1] = row_y1[0];
+ row_y1[width] = row_y1[width - 1];
+
+ for (int y = 0; y < height; ++y) {
+ // Convert next row of ARGB to Y.
+ if (y < (height - 1)) {
+ src_argb += src_stride_argb;
+ }
+ ARGBToBayerRow(src_argb, row_y2, 0x0d090501, width);
+ row_y2[-1] = row_y2[0];
+ row_y2[width] = row_y2[width - 1];
+
+ SobelXRow(row_y0 - 1, row_y1 - 1, row_y2 - 1, row_sobelx, width);
+ SobelYRow(row_y0 - 1, row_y2 - 1, row_sobely, width);
+ SobelRow(row_sobelx, row_sobely, dst_argb, width);
+
+ // Cycle thru circular queue of 3 row_y buffers.
+ uint8* row_yt = row_y0;
+ row_y0 = row_y1;
+ row_y1 = row_y2;
+ row_y2 = row_yt;
+
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// SobelXY ARGB effect.
+// Similar to Sobel, but also stores Sobel X in R and Sobel Y in B. G = Sobel.
+LIBYUV_API
+int ARGBSobelXY(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ if (!src_argb || !dst_argb ||
+ width <= 0 || height == 0 || width > kMaxStride / 4) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ // ARGBToBayer used to select G channel from ARGB.
+ void (*ARGBToBayerRow)(const uint8* src_argb, uint8* dst_bayer,
+ uint32 selector, int pix) = ARGBToBayerRow_C;
+#if defined(HAS_ARGBTOBAYERROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 8 &&
+ IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
+ ARGBToBayerRow = ARGBToBayerRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToBayerRow = ARGBToBayerRow_SSSE3;
+ }
+ }
+#elif defined(HAS_ARGBTOBAYERROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ ARGBToBayerRow = ARGBToBayerRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToBayerRow = ARGBToBayerRow_NEON;
+ }
+ }
+#endif
+ void (*SobelYRow)(const uint8* src_y0, const uint8* src_y1,
+ uint8* dst_sobely, int width) = SobelYRow_C;
+#if defined(HAS_SOBELYROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ SobelYRow = SobelYRow_SSSE3;
+ }
+#endif
+#if defined(HAS_SOBELYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ SobelYRow = SobelYRow_NEON;
+ }
+#endif
+ void (*SobelXRow)(const uint8* src_y0, const uint8* src_y1,
+ const uint8* src_y2, uint8* dst_sobely, int width) =
+ SobelXRow_C;
+#if defined(HAS_SOBELXROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ SobelXRow = SobelXRow_SSSE3;
+ }
+#endif
+#if defined(HAS_SOBELXROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ SobelXRow = SobelXRow_NEON;
+ }
+#endif
+ void (*SobelXYRow)(const uint8* src_sobelx, const uint8* src_sobely,
+ uint8* dst_argb, int width) = SobelXYRow_C;
+#if defined(HAS_SOBELXYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) &&
+ IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ SobelXYRow = SobelXYRow_SSE2;
+ }
+#endif
+#if defined(HAS_SOBELXYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
+ SobelXYRow = SobelXYRow_NEON;
+ }
+#endif
+
+ const int kEdge = 16; // Extra pixels at start of row for extrude/align.
+ SIMD_ALIGNED(uint8 row_y[(kMaxStride / 4 + kEdge) * 3 + kEdge]);
+ SIMD_ALIGNED(uint8 row_sobelx[kMaxStride / 4]);
+ SIMD_ALIGNED(uint8 row_sobely[kMaxStride / 4]);
+
+ // Convert first row.
+ uint8* row_y0 = row_y + kEdge;
+ uint8* row_y1 = row_y0 + kMaxStride / 4;
+ uint8* row_y2 = row_y1 + kMaxStride / 4;
+ ARGBToBayerRow(src_argb, row_y0, 0x0d090501, width);
+ row_y0[-1] = row_y0[0];
+ row_y0[width] = row_y0[width - 1];
+ ARGBToBayerRow(src_argb, row_y1, 0x0d090501, width);
+ row_y1[-1] = row_y1[0];
+ row_y1[width] = row_y1[width - 1];
+
+ for (int y = 0; y < height; ++y) {
+ // Convert next row of ARGB to Y.
+ if (y < (height - 1)) {
+ src_argb += src_stride_argb;
+ }
+ ARGBToBayerRow(src_argb, row_y2, 0x0d090501, width);
+ row_y2[-1] = row_y2[0];
+ row_y2[width] = row_y2[width - 1];
+
+ SobelXRow(row_y0 - 1, row_y1 - 1, row_y2 - 1, row_sobelx, width);
+ SobelYRow(row_y0 - 1, row_y2 - 1, row_sobely, width);
+ SobelXYRow(row_sobelx, row_sobely, dst_argb, width);
+
+    // Cycle through circular queue of 3 row_y buffers.
+ uint8* row_yt = row_y0;
+ row_y0 = row_y1;
+ row_y1 = row_y2;
+ row_y2 = row_yt;
+
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
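+
+// Illustrative sketch (not part of libyuv) of the per-pixel packing the
+// SobelXY effect described above produces: B = Sobel Y, R = Sobel X,
+// A = opaque. The combined G value is assumed to be the saturating sum of
+// the two gradient magnitudes.
+static inline void SobelXYPixel_Sketch(uint8 sobelx, uint8 sobely,
+                                       uint8* dst_argb) {
+  int g = sobelx + sobely;
+  dst_argb[0] = sobely;                                 // B = Sobel Y.
+  dst_argb[1] = static_cast<uint8>(g > 255 ? 255 : g);  // G = combined Sobel.
+  dst_argb[2] = sobelx;                                 // R = Sobel X.
+  dst_argb[3] = 255u;                                   // A = opaque.
+}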
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/chromium/third_party/libyuv/source/rotate.cc b/chromium/third_party/libyuv/source/rotate.cc
new file mode 100644
index 00000000000..c46650b4458
--- /dev/null
+++ b/chromium/third_party/libyuv/source/rotate.cc
@@ -0,0 +1,1283 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/rotate.h"
+
+#include "libyuv/cpu_id.h"
+#include "libyuv/convert.h"
+#include "libyuv/planar_functions.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#if !defined(LIBYUV_DISABLE_X86) && \
+ (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
+#if defined(__APPLE__) && defined(__i386__)
+#define DECLARE_FUNCTION(name) \
+ ".text \n" \
+ ".private_extern _" #name " \n" \
+ ".align 4,0x90 \n" \
+"_" #name ": \n"
+#elif defined(__MINGW32__) || defined(__CYGWIN__) && defined(__i386__)
+#define DECLARE_FUNCTION(name) \
+ ".text \n" \
+ ".align 4,0x90 \n" \
+"_" #name ": \n"
+#else
+#define DECLARE_FUNCTION(name) \
+ ".text \n" \
+ ".align 4,0x90 \n" \
+#name ": \n"
+#endif
+#endif
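+
+// For example, on Mac ia32 DECLARE_FUNCTION(TransposeUVWx8_SSE2) expands to
+// roughly:
+//   .text
+//   .private_extern _TransposeUVWx8_SSE2
+//   .align 4,0x90
+// _TransposeUVWx8_SSE2:
+// so the extern "C" prototype below resolves to the hand-written assembly
+// emitted in the file-scope asm() block.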
+
+#if !defined(LIBYUV_DISABLE_NEON) && \
+ (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
+#define HAS_MIRRORROW_NEON
+void MirrorRow_NEON(const uint8* src, uint8* dst, int width);
+#define HAS_MIRRORROW_UV_NEON
+void MirrorUVRow_NEON(const uint8* src, uint8* dst_a, uint8* dst_b, int width);
+#define HAS_TRANSPOSE_WX8_NEON
+void TransposeWx8_NEON(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride, int width);
+#define HAS_TRANSPOSE_UVWX8_NEON
+void TransposeUVWx8_NEON(const uint8* src, int src_stride,
+ uint8* dst_a, int dst_stride_a,
+ uint8* dst_b, int dst_stride_b,
+ int width);
+#endif // defined(__ARM_NEON__)
+
+#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips__) && \
+ defined(__mips_dsp) && (__mips_dsp_rev >= 2)
+#define HAS_TRANSPOSE_WX8_MIPS_DSPR2
+void TransposeWx8_MIPS_DSPR2(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride, int width);
+
+void TransposeWx8_FAST_MIPS_DSPR2(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride, int width);
+#define HAS_TRANSPOSE_UVWx8_MIPS_DSPR2
+void TransposeUVWx8_MIPS_DSPR2(const uint8* src, int src_stride,
+ uint8* dst_a, int dst_stride_a,
+ uint8* dst_b, int dst_stride_b,
+ int width);
+#endif // defined(__mips__)
+
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
+#define HAS_TRANSPOSE_WX8_SSSE3
+__declspec(naked) __declspec(align(16))
+static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride, int width) {
+ __asm {
+ push edi
+ push esi
+ push ebp
+ mov eax, [esp + 12 + 4] // src
+ mov edi, [esp + 12 + 8] // src_stride
+ mov edx, [esp + 12 + 12] // dst
+ mov esi, [esp + 12 + 16] // dst_stride
+ mov ecx, [esp + 12 + 20] // width
+
+ // Read in the data from the source pointer.
+ // First round of bit swap.
+ align 16
+ convertloop:
+ movq xmm0, qword ptr [eax]
+ lea ebp, [eax + 8]
+ movq xmm1, qword ptr [eax + edi]
+ lea eax, [eax + 2 * edi]
+ punpcklbw xmm0, xmm1
+ movq xmm2, qword ptr [eax]
+ movdqa xmm1, xmm0
+ palignr xmm1, xmm1, 8
+ movq xmm3, qword ptr [eax + edi]
+ lea eax, [eax + 2 * edi]
+ punpcklbw xmm2, xmm3
+ movdqa xmm3, xmm2
+ movq xmm4, qword ptr [eax]
+ palignr xmm3, xmm3, 8
+ movq xmm5, qword ptr [eax + edi]
+ punpcklbw xmm4, xmm5
+ lea eax, [eax + 2 * edi]
+ movdqa xmm5, xmm4
+ movq xmm6, qword ptr [eax]
+ palignr xmm5, xmm5, 8
+ movq xmm7, qword ptr [eax + edi]
+ punpcklbw xmm6, xmm7
+ mov eax, ebp
+ movdqa xmm7, xmm6
+ palignr xmm7, xmm7, 8
+ // Second round of bit swap.
+ punpcklwd xmm0, xmm2
+ punpcklwd xmm1, xmm3
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm1
+ palignr xmm2, xmm2, 8
+ palignr xmm3, xmm3, 8
+ punpcklwd xmm4, xmm6
+ punpcklwd xmm5, xmm7
+ movdqa xmm6, xmm4
+ movdqa xmm7, xmm5
+ palignr xmm6, xmm6, 8
+ palignr xmm7, xmm7, 8
+ // Third round of bit swap.
+ // Write to the destination pointer.
+ punpckldq xmm0, xmm4
+ movq qword ptr [edx], xmm0
+ movdqa xmm4, xmm0
+ palignr xmm4, xmm4, 8
+ movq qword ptr [edx + esi], xmm4
+ lea edx, [edx + 2 * esi]
+ punpckldq xmm2, xmm6
+ movdqa xmm6, xmm2
+ palignr xmm6, xmm6, 8
+ movq qword ptr [edx], xmm2
+ punpckldq xmm1, xmm5
+ movq qword ptr [edx + esi], xmm6
+ lea edx, [edx + 2 * esi]
+ movdqa xmm5, xmm1
+ movq qword ptr [edx], xmm1
+ palignr xmm5, xmm5, 8
+ punpckldq xmm3, xmm7
+ movq qword ptr [edx + esi], xmm5
+ lea edx, [edx + 2 * esi]
+ movq qword ptr [edx], xmm3
+ movdqa xmm7, xmm3
+ palignr xmm7, xmm7, 8
+ sub ecx, 8
+ movq qword ptr [edx + esi], xmm7
+ lea edx, [edx + 2 * esi]
+ jg convertloop
+
+ pop ebp
+ pop esi
+ pop edi
+ ret
+ }
+}
+
+#define HAS_TRANSPOSE_UVWX8_SSE2
+__declspec(naked) __declspec(align(16))
+static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
+ uint8* dst_a, int dst_stride_a,
+ uint8* dst_b, int dst_stride_b,
+ int w) {
+ __asm {
+ push ebx
+ push esi
+ push edi
+ push ebp
+ mov eax, [esp + 16 + 4] // src
+ mov edi, [esp + 16 + 8] // src_stride
+ mov edx, [esp + 16 + 12] // dst_a
+ mov esi, [esp + 16 + 16] // dst_stride_a
+ mov ebx, [esp + 16 + 20] // dst_b
+ mov ebp, [esp + 16 + 24] // dst_stride_b
+ mov ecx, esp
+ sub esp, 4 + 16
+ and esp, ~15
+ mov [esp + 16], ecx
+ mov ecx, [ecx + 16 + 28] // w
+
+ align 16
+ convertloop:
+ // Read in the data from the source pointer.
+ // First round of bit swap.
+ movdqa xmm0, [eax]
+ movdqa xmm1, [eax + edi]
+ lea eax, [eax + 2 * edi]
+ movdqa xmm7, xmm0 // use xmm7 as temp register.
+ punpcklbw xmm0, xmm1
+ punpckhbw xmm7, xmm1
+ movdqa xmm1, xmm7
+ movdqa xmm2, [eax]
+ movdqa xmm3, [eax + edi]
+ lea eax, [eax + 2 * edi]
+ movdqa xmm7, xmm2
+ punpcklbw xmm2, xmm3
+ punpckhbw xmm7, xmm3
+ movdqa xmm3, xmm7
+ movdqa xmm4, [eax]
+ movdqa xmm5, [eax + edi]
+ lea eax, [eax + 2 * edi]
+ movdqa xmm7, xmm4
+ punpcklbw xmm4, xmm5
+ punpckhbw xmm7, xmm5
+ movdqa xmm5, xmm7
+ movdqa xmm6, [eax]
+ movdqa xmm7, [eax + edi]
+ lea eax, [eax + 2 * edi]
+ movdqa [esp], xmm5 // backup xmm5
+ neg edi
+ movdqa xmm5, xmm6 // use xmm5 as temp register.
+ punpcklbw xmm6, xmm7
+ punpckhbw xmm5, xmm7
+ movdqa xmm7, xmm5
+ lea eax, [eax + 8 * edi + 16]
+ neg edi
+ // Second round of bit swap.
+ movdqa xmm5, xmm0
+ punpcklwd xmm0, xmm2
+ punpckhwd xmm5, xmm2
+ movdqa xmm2, xmm5
+ movdqa xmm5, xmm1
+ punpcklwd xmm1, xmm3
+ punpckhwd xmm5, xmm3
+ movdqa xmm3, xmm5
+ movdqa xmm5, xmm4
+ punpcklwd xmm4, xmm6
+ punpckhwd xmm5, xmm6
+ movdqa xmm6, xmm5
+ movdqa xmm5, [esp] // restore xmm5
+ movdqa [esp], xmm6 // backup xmm6
+ movdqa xmm6, xmm5 // use xmm6 as temp register.
+ punpcklwd xmm5, xmm7
+ punpckhwd xmm6, xmm7
+ movdqa xmm7, xmm6
+ // Third round of bit swap.
+ // Write to the destination pointer.
+ movdqa xmm6, xmm0
+ punpckldq xmm0, xmm4
+ punpckhdq xmm6, xmm4
+ movdqa xmm4, xmm6
+ movdqa xmm6, [esp] // restore xmm6
+ movlpd qword ptr [edx], xmm0
+ movhpd qword ptr [ebx], xmm0
+ movlpd qword ptr [edx + esi], xmm4
+ lea edx, [edx + 2 * esi]
+ movhpd qword ptr [ebx + ebp], xmm4
+ lea ebx, [ebx + 2 * ebp]
+ movdqa xmm0, xmm2 // use xmm0 as the temp register.
+ punpckldq xmm2, xmm6
+ movlpd qword ptr [edx], xmm2
+ movhpd qword ptr [ebx], xmm2
+ punpckhdq xmm0, xmm6
+ movlpd qword ptr [edx + esi], xmm0
+ lea edx, [edx + 2 * esi]
+ movhpd qword ptr [ebx + ebp], xmm0
+ lea ebx, [ebx + 2 * ebp]
+ movdqa xmm0, xmm1 // use xmm0 as the temp register.
+ punpckldq xmm1, xmm5
+ movlpd qword ptr [edx], xmm1
+ movhpd qword ptr [ebx], xmm1
+ punpckhdq xmm0, xmm5
+ movlpd qword ptr [edx + esi], xmm0
+ lea edx, [edx + 2 * esi]
+ movhpd qword ptr [ebx + ebp], xmm0
+ lea ebx, [ebx + 2 * ebp]
+ movdqa xmm0, xmm3 // use xmm0 as the temp register.
+ punpckldq xmm3, xmm7
+ movlpd qword ptr [edx], xmm3
+ movhpd qword ptr [ebx], xmm3
+ punpckhdq xmm0, xmm7
+ sub ecx, 8
+ movlpd qword ptr [edx + esi], xmm0
+ lea edx, [edx + 2 * esi]
+ movhpd qword ptr [ebx + ebp], xmm0
+ lea ebx, [ebx + 2 * ebp]
+ jg convertloop
+
+ mov esp, [esp + 16]
+ pop ebp
+ pop edi
+ pop esi
+ pop ebx
+ ret
+ }
+}
+#elif !defined(LIBYUV_DISABLE_X86) && (defined(__i386__) || defined(__x86_64__))
+#define HAS_TRANSPOSE_WX8_SSSE3
+static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride, int width) {
+ asm volatile (
+ // Read in the data from the source pointer.
+ // First round of bit swap.
+ ".p2align 4 \n"
+ "1: \n"
+ "movq (%0),%%xmm0 \n"
+ "movq (%0,%3),%%xmm1 \n"
+ "lea (%0,%3,2),%0 \n"
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "movq (%0),%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "palignr $0x8,%%xmm1,%%xmm1 \n"
+ "movq (%0,%3),%%xmm3 \n"
+ "lea (%0,%3,2),%0 \n"
+ "punpcklbw %%xmm3,%%xmm2 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "movq (%0),%%xmm4 \n"
+ "palignr $0x8,%%xmm3,%%xmm3 \n"
+ "movq (%0,%3),%%xmm5 \n"
+ "lea (%0,%3,2),%0 \n"
+ "punpcklbw %%xmm5,%%xmm4 \n"
+ "movdqa %%xmm4,%%xmm5 \n"
+ "movq (%0),%%xmm6 \n"
+ "palignr $0x8,%%xmm5,%%xmm5 \n"
+ "movq (%0,%3),%%xmm7 \n"
+ "lea (%0,%3,2),%0 \n"
+ "punpcklbw %%xmm7,%%xmm6 \n"
+ "neg %3 \n"
+ "movdqa %%xmm6,%%xmm7 \n"
+ "lea 0x8(%0,%3,8),%0 \n"
+ "palignr $0x8,%%xmm7,%%xmm7 \n"
+ "neg %3 \n"
+ // Second round of bit swap.
+ "punpcklwd %%xmm2,%%xmm0 \n"
+ "punpcklwd %%xmm3,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "palignr $0x8,%%xmm2,%%xmm2 \n"
+ "palignr $0x8,%%xmm3,%%xmm3 \n"
+ "punpcklwd %%xmm6,%%xmm4 \n"
+ "punpcklwd %%xmm7,%%xmm5 \n"
+ "movdqa %%xmm4,%%xmm6 \n"
+ "movdqa %%xmm5,%%xmm7 \n"
+ "palignr $0x8,%%xmm6,%%xmm6 \n"
+ "palignr $0x8,%%xmm7,%%xmm7 \n"
+ // Third round of bit swap.
+ // Write to the destination pointer.
+ "punpckldq %%xmm4,%%xmm0 \n"
+ "movq %%xmm0,(%1) \n"
+ "movdqa %%xmm0,%%xmm4 \n"
+ "palignr $0x8,%%xmm4,%%xmm4 \n"
+ "movq %%xmm4,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm6,%%xmm2 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "movq %%xmm2,(%1) \n"
+ "palignr $0x8,%%xmm6,%%xmm6 \n"
+ "punpckldq %%xmm5,%%xmm1 \n"
+ "movq %%xmm6,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "movdqa %%xmm1,%%xmm5 \n"
+ "movq %%xmm1,(%1) \n"
+ "palignr $0x8,%%xmm5,%%xmm5 \n"
+ "movq %%xmm5,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm7,%%xmm3 \n"
+ "movq %%xmm3,(%1) \n"
+ "movdqa %%xmm3,%%xmm7 \n"
+ "palignr $0x8,%%xmm7,%%xmm7 \n"
+ "sub $0x8,%2 \n"
+ "movq %%xmm7,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "r"(static_cast<intptr_t>(src_stride)), // %3
+ "r"(static_cast<intptr_t>(dst_stride)) // %4
+ : "memory", "cc"
+ #if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ #endif
+ );
+}
+
+#if !defined(LIBYUV_DISABLE_X86) && defined (__i386__)
+#define HAS_TRANSPOSE_UVWX8_SSE2
+extern "C" void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
+ uint8* dst_a, int dst_stride_a,
+ uint8* dst_b, int dst_stride_b,
+ int w);
+ asm (
+ DECLARE_FUNCTION(TransposeUVWx8_SSE2)
+ "push %ebx \n"
+ "push %esi \n"
+ "push %edi \n"
+ "push %ebp \n"
+ "mov 0x14(%esp),%eax \n"
+ "mov 0x18(%esp),%edi \n"
+ "mov 0x1c(%esp),%edx \n"
+ "mov 0x20(%esp),%esi \n"
+ "mov 0x24(%esp),%ebx \n"
+ "mov 0x28(%esp),%ebp \n"
+ "mov %esp,%ecx \n"
+ "sub $0x14,%esp \n"
+ "and $0xfffffff0,%esp \n"
+ "mov %ecx,0x10(%esp) \n"
+ "mov 0x2c(%ecx),%ecx \n"
+
+"1: \n"
+ "movdqa (%eax),%xmm0 \n"
+ "movdqa (%eax,%edi,1),%xmm1 \n"
+ "lea (%eax,%edi,2),%eax \n"
+ "movdqa %xmm0,%xmm7 \n"
+ "punpcklbw %xmm1,%xmm0 \n"
+ "punpckhbw %xmm1,%xmm7 \n"
+ "movdqa %xmm7,%xmm1 \n"
+ "movdqa (%eax),%xmm2 \n"
+ "movdqa (%eax,%edi,1),%xmm3 \n"
+ "lea (%eax,%edi,2),%eax \n"
+ "movdqa %xmm2,%xmm7 \n"
+ "punpcklbw %xmm3,%xmm2 \n"
+ "punpckhbw %xmm3,%xmm7 \n"
+ "movdqa %xmm7,%xmm3 \n"
+ "movdqa (%eax),%xmm4 \n"
+ "movdqa (%eax,%edi,1),%xmm5 \n"
+ "lea (%eax,%edi,2),%eax \n"
+ "movdqa %xmm4,%xmm7 \n"
+ "punpcklbw %xmm5,%xmm4 \n"
+ "punpckhbw %xmm5,%xmm7 \n"
+ "movdqa %xmm7,%xmm5 \n"
+ "movdqa (%eax),%xmm6 \n"
+ "movdqa (%eax,%edi,1),%xmm7 \n"
+ "lea (%eax,%edi,2),%eax \n"
+ "movdqa %xmm5,(%esp) \n"
+ "neg %edi \n"
+ "movdqa %xmm6,%xmm5 \n"
+ "punpcklbw %xmm7,%xmm6 \n"
+ "punpckhbw %xmm7,%xmm5 \n"
+ "movdqa %xmm5,%xmm7 \n"
+ "lea 0x10(%eax,%edi,8),%eax \n"
+ "neg %edi \n"
+ "movdqa %xmm0,%xmm5 \n"
+ "punpcklwd %xmm2,%xmm0 \n"
+ "punpckhwd %xmm2,%xmm5 \n"
+ "movdqa %xmm5,%xmm2 \n"
+ "movdqa %xmm1,%xmm5 \n"
+ "punpcklwd %xmm3,%xmm1 \n"
+ "punpckhwd %xmm3,%xmm5 \n"
+ "movdqa %xmm5,%xmm3 \n"
+ "movdqa %xmm4,%xmm5 \n"
+ "punpcklwd %xmm6,%xmm4 \n"
+ "punpckhwd %xmm6,%xmm5 \n"
+ "movdqa %xmm5,%xmm6 \n"
+ "movdqa (%esp),%xmm5 \n"
+ "movdqa %xmm6,(%esp) \n"
+ "movdqa %xmm5,%xmm6 \n"
+ "punpcklwd %xmm7,%xmm5 \n"
+ "punpckhwd %xmm7,%xmm6 \n"
+ "movdqa %xmm6,%xmm7 \n"
+ "movdqa %xmm0,%xmm6 \n"
+ "punpckldq %xmm4,%xmm0 \n"
+ "punpckhdq %xmm4,%xmm6 \n"
+ "movdqa %xmm6,%xmm4 \n"
+ "movdqa (%esp),%xmm6 \n"
+ "movlpd %xmm0,(%edx) \n"
+ "movhpd %xmm0,(%ebx) \n"
+ "movlpd %xmm4,(%edx,%esi,1) \n"
+ "lea (%edx,%esi,2),%edx \n"
+ "movhpd %xmm4,(%ebx,%ebp,1) \n"
+ "lea (%ebx,%ebp,2),%ebx \n"
+ "movdqa %xmm2,%xmm0 \n"
+ "punpckldq %xmm6,%xmm2 \n"
+ "movlpd %xmm2,(%edx) \n"
+ "movhpd %xmm2,(%ebx) \n"
+ "punpckhdq %xmm6,%xmm0 \n"
+ "movlpd %xmm0,(%edx,%esi,1) \n"
+ "lea (%edx,%esi,2),%edx \n"
+ "movhpd %xmm0,(%ebx,%ebp,1) \n"
+ "lea (%ebx,%ebp,2),%ebx \n"
+ "movdqa %xmm1,%xmm0 \n"
+ "punpckldq %xmm5,%xmm1 \n"
+ "movlpd %xmm1,(%edx) \n"
+ "movhpd %xmm1,(%ebx) \n"
+ "punpckhdq %xmm5,%xmm0 \n"
+ "movlpd %xmm0,(%edx,%esi,1) \n"
+ "lea (%edx,%esi,2),%edx \n"
+ "movhpd %xmm0,(%ebx,%ebp,1) \n"
+ "lea (%ebx,%ebp,2),%ebx \n"
+ "movdqa %xmm3,%xmm0 \n"
+ "punpckldq %xmm7,%xmm3 \n"
+ "movlpd %xmm3,(%edx) \n"
+ "movhpd %xmm3,(%ebx) \n"
+ "punpckhdq %xmm7,%xmm0 \n"
+ "sub $0x8,%ecx \n"
+ "movlpd %xmm0,(%edx,%esi,1) \n"
+ "lea (%edx,%esi,2),%edx \n"
+ "movhpd %xmm0,(%ebx,%ebp,1) \n"
+ "lea (%ebx,%ebp,2),%ebx \n"
+ "jg 1b \n"
+ "mov 0x10(%esp),%esp \n"
+ "pop %ebp \n"
+ "pop %edi \n"
+ "pop %esi \n"
+ "pop %ebx \n"
+ "ret \n"
+);
+#elif !defined(LIBYUV_DISABLE_X86) && defined(__x86_64__)
+// 64 bit version has enough registers to do 16x8 to 8x16 at a time.
+#define HAS_TRANSPOSE_WX8_FAST_SSSE3
+static void TransposeWx8_FAST_SSSE3(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride, int width) {
+ asm volatile (
+ // Read in the data from the source pointer.
+ // First round of bit swap.
+ ".p2align 4 \n"
+"1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa (%0,%3),%%xmm1 \n"
+ "lea (%0,%3,2),%0 \n"
+ "movdqa %%xmm0,%%xmm8 \n"
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "punpckhbw %%xmm1,%%xmm8 \n"
+ "movdqa (%0),%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm8,%%xmm9 \n"
+ "palignr $0x8,%%xmm1,%%xmm1 \n"
+ "palignr $0x8,%%xmm9,%%xmm9 \n"
+ "movdqa (%0,%3),%%xmm3 \n"
+ "lea (%0,%3,2),%0 \n"
+ "movdqa %%xmm2,%%xmm10 \n"
+ "punpcklbw %%xmm3,%%xmm2 \n"
+ "punpckhbw %%xmm3,%%xmm10 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "movdqa %%xmm10,%%xmm11 \n"
+ "movdqa (%0),%%xmm4 \n"
+ "palignr $0x8,%%xmm3,%%xmm3 \n"
+ "palignr $0x8,%%xmm11,%%xmm11 \n"
+ "movdqa (%0,%3),%%xmm5 \n"
+ "lea (%0,%3,2),%0 \n"
+ "movdqa %%xmm4,%%xmm12 \n"
+ "punpcklbw %%xmm5,%%xmm4 \n"
+ "punpckhbw %%xmm5,%%xmm12 \n"
+ "movdqa %%xmm4,%%xmm5 \n"
+ "movdqa %%xmm12,%%xmm13 \n"
+ "movdqa (%0),%%xmm6 \n"
+ "palignr $0x8,%%xmm5,%%xmm5 \n"
+ "palignr $0x8,%%xmm13,%%xmm13 \n"
+ "movdqa (%0,%3),%%xmm7 \n"
+ "lea (%0,%3,2),%0 \n"
+ "movdqa %%xmm6,%%xmm14 \n"
+ "punpcklbw %%xmm7,%%xmm6 \n"
+ "punpckhbw %%xmm7,%%xmm14 \n"
+ "neg %3 \n"
+ "movdqa %%xmm6,%%xmm7 \n"
+ "movdqa %%xmm14,%%xmm15 \n"
+ "lea 0x10(%0,%3,8),%0 \n"
+ "palignr $0x8,%%xmm7,%%xmm7 \n"
+ "palignr $0x8,%%xmm15,%%xmm15 \n"
+ "neg %3 \n"
+ // Second round of bit swap.
+ "punpcklwd %%xmm2,%%xmm0 \n"
+ "punpcklwd %%xmm3,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "palignr $0x8,%%xmm2,%%xmm2 \n"
+ "palignr $0x8,%%xmm3,%%xmm3 \n"
+ "punpcklwd %%xmm6,%%xmm4 \n"
+ "punpcklwd %%xmm7,%%xmm5 \n"
+ "movdqa %%xmm4,%%xmm6 \n"
+ "movdqa %%xmm5,%%xmm7 \n"
+ "palignr $0x8,%%xmm6,%%xmm6 \n"
+ "palignr $0x8,%%xmm7,%%xmm7 \n"
+ "punpcklwd %%xmm10,%%xmm8 \n"
+ "punpcklwd %%xmm11,%%xmm9 \n"
+ "movdqa %%xmm8,%%xmm10 \n"
+ "movdqa %%xmm9,%%xmm11 \n"
+ "palignr $0x8,%%xmm10,%%xmm10 \n"
+ "palignr $0x8,%%xmm11,%%xmm11 \n"
+ "punpcklwd %%xmm14,%%xmm12 \n"
+ "punpcklwd %%xmm15,%%xmm13 \n"
+ "movdqa %%xmm12,%%xmm14 \n"
+ "movdqa %%xmm13,%%xmm15 \n"
+ "palignr $0x8,%%xmm14,%%xmm14 \n"
+ "palignr $0x8,%%xmm15,%%xmm15 \n"
+ // Third round of bit swap.
+ // Write to the destination pointer.
+ "punpckldq %%xmm4,%%xmm0 \n"
+ "movq %%xmm0,(%1) \n"
+ "movdqa %%xmm0,%%xmm4 \n"
+ "palignr $0x8,%%xmm4,%%xmm4 \n"
+ "movq %%xmm4,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm6,%%xmm2 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "movq %%xmm2,(%1) \n"
+ "palignr $0x8,%%xmm6,%%xmm6 \n"
+ "punpckldq %%xmm5,%%xmm1 \n"
+ "movq %%xmm6,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "movdqa %%xmm1,%%xmm5 \n"
+ "movq %%xmm1,(%1) \n"
+ "palignr $0x8,%%xmm5,%%xmm5 \n"
+ "movq %%xmm5,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm7,%%xmm3 \n"
+ "movq %%xmm3,(%1) \n"
+ "movdqa %%xmm3,%%xmm7 \n"
+ "palignr $0x8,%%xmm7,%%xmm7 \n"
+ "movq %%xmm7,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm12,%%xmm8 \n"
+ "movq %%xmm8,(%1) \n"
+ "movdqa %%xmm8,%%xmm12 \n"
+ "palignr $0x8,%%xmm12,%%xmm12 \n"
+ "movq %%xmm12,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm14,%%xmm10 \n"
+ "movdqa %%xmm10,%%xmm14 \n"
+ "movq %%xmm10,(%1) \n"
+ "palignr $0x8,%%xmm14,%%xmm14 \n"
+ "punpckldq %%xmm13,%%xmm9 \n"
+ "movq %%xmm14,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "movdqa %%xmm9,%%xmm13 \n"
+ "movq %%xmm9,(%1) \n"
+ "palignr $0x8,%%xmm13,%%xmm13 \n"
+ "movq %%xmm13,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm15,%%xmm11 \n"
+ "movq %%xmm11,(%1) \n"
+ "movdqa %%xmm11,%%xmm15 \n"
+ "palignr $0x8,%%xmm15,%%xmm15 \n"
+ "sub $0x10,%2 \n"
+ "movq %%xmm15,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "r"(static_cast<intptr_t>(src_stride)), // %3
+ "r"(static_cast<intptr_t>(dst_stride)) // %4
+ : "memory", "cc",
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
+ "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15"
+);
+}
+
+#define HAS_TRANSPOSE_UVWX8_SSE2
+static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
+ uint8* dst_a, int dst_stride_a,
+ uint8* dst_b, int dst_stride_b,
+ int w) {
+ asm volatile (
+ // Read in the data from the source pointer.
+ // First round of bit swap.
+ ".p2align 4 \n"
+"1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa (%0,%4),%%xmm1 \n"
+ "lea (%0,%4,2),%0 \n"
+ "movdqa %%xmm0,%%xmm8 \n"
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "punpckhbw %%xmm1,%%xmm8 \n"
+ "movdqa %%xmm8,%%xmm1 \n"
+ "movdqa (%0),%%xmm2 \n"
+ "movdqa (%0,%4),%%xmm3 \n"
+ "lea (%0,%4,2),%0 \n"
+ "movdqa %%xmm2,%%xmm8 \n"
+ "punpcklbw %%xmm3,%%xmm2 \n"
+ "punpckhbw %%xmm3,%%xmm8 \n"
+ "movdqa %%xmm8,%%xmm3 \n"
+ "movdqa (%0),%%xmm4 \n"
+ "movdqa (%0,%4),%%xmm5 \n"
+ "lea (%0,%4,2),%0 \n"
+ "movdqa %%xmm4,%%xmm8 \n"
+ "punpcklbw %%xmm5,%%xmm4 \n"
+ "punpckhbw %%xmm5,%%xmm8 \n"
+ "movdqa %%xmm8,%%xmm5 \n"
+ "movdqa (%0),%%xmm6 \n"
+ "movdqa (%0,%4),%%xmm7 \n"
+ "lea (%0,%4,2),%0 \n"
+ "movdqa %%xmm6,%%xmm8 \n"
+ "punpcklbw %%xmm7,%%xmm6 \n"
+ "neg %4 \n"
+ "lea 0x10(%0,%4,8),%0 \n"
+ "punpckhbw %%xmm7,%%xmm8 \n"
+ "movdqa %%xmm8,%%xmm7 \n"
+ "neg %4 \n"
+ // Second round of bit swap.
+ "movdqa %%xmm0,%%xmm8 \n"
+ "movdqa %%xmm1,%%xmm9 \n"
+ "punpckhwd %%xmm2,%%xmm8 \n"
+ "punpckhwd %%xmm3,%%xmm9 \n"
+ "punpcklwd %%xmm2,%%xmm0 \n"
+ "punpcklwd %%xmm3,%%xmm1 \n"
+ "movdqa %%xmm8,%%xmm2 \n"
+ "movdqa %%xmm9,%%xmm3 \n"
+ "movdqa %%xmm4,%%xmm8 \n"
+ "movdqa %%xmm5,%%xmm9 \n"
+ "punpckhwd %%xmm6,%%xmm8 \n"
+ "punpckhwd %%xmm7,%%xmm9 \n"
+ "punpcklwd %%xmm6,%%xmm4 \n"
+ "punpcklwd %%xmm7,%%xmm5 \n"
+ "movdqa %%xmm8,%%xmm6 \n"
+ "movdqa %%xmm9,%%xmm7 \n"
+ // Third round of bit swap.
+ // Write to the destination pointer.
+ "movdqa %%xmm0,%%xmm8 \n"
+ "punpckldq %%xmm4,%%xmm0 \n"
+ "movlpd %%xmm0,(%1) \n" // Write back U channel
+ "movhpd %%xmm0,(%2) \n" // Write back V channel
+ "punpckhdq %%xmm4,%%xmm8 \n"
+ "movlpd %%xmm8,(%1,%5) \n"
+ "lea (%1,%5,2),%1 \n"
+ "movhpd %%xmm8,(%2,%6) \n"
+ "lea (%2,%6,2),%2 \n"
+ "movdqa %%xmm2,%%xmm8 \n"
+ "punpckldq %%xmm6,%%xmm2 \n"
+ "movlpd %%xmm2,(%1) \n"
+ "movhpd %%xmm2,(%2) \n"
+ "punpckhdq %%xmm6,%%xmm8 \n"
+ "movlpd %%xmm8,(%1,%5) \n"
+ "lea (%1,%5,2),%1 \n"
+ "movhpd %%xmm8,(%2,%6) \n"
+ "lea (%2,%6,2),%2 \n"
+ "movdqa %%xmm1,%%xmm8 \n"
+ "punpckldq %%xmm5,%%xmm1 \n"
+ "movlpd %%xmm1,(%1) \n"
+ "movhpd %%xmm1,(%2) \n"
+ "punpckhdq %%xmm5,%%xmm8 \n"
+ "movlpd %%xmm8,(%1,%5) \n"
+ "lea (%1,%5,2),%1 \n"
+ "movhpd %%xmm8,(%2,%6) \n"
+ "lea (%2,%6,2),%2 \n"
+ "movdqa %%xmm3,%%xmm8 \n"
+ "punpckldq %%xmm7,%%xmm3 \n"
+ "movlpd %%xmm3,(%1) \n"
+ "movhpd %%xmm3,(%2) \n"
+ "punpckhdq %%xmm7,%%xmm8 \n"
+ "sub $0x8,%3 \n"
+ "movlpd %%xmm8,(%1,%5) \n"
+ "lea (%1,%5,2),%1 \n"
+ "movhpd %%xmm8,(%2,%6) \n"
+ "lea (%2,%6,2),%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst_a), // %1
+ "+r"(dst_b), // %2
+ "+r"(w) // %3
+ : "r"(static_cast<intptr_t>(src_stride)), // %4
+ "r"(static_cast<intptr_t>(dst_stride_a)), // %5
+ "r"(static_cast<intptr_t>(dst_stride_b)) // %6
+ : "memory", "cc",
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
+ "xmm8", "xmm9"
+);
+}
+#endif
+#endif
+
+static void TransposeWx8_C(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride,
+ int width) {
+ for (int i = 0; i < width; ++i) {
+ dst[0] = src[0 * src_stride];
+ dst[1] = src[1 * src_stride];
+ dst[2] = src[2 * src_stride];
+ dst[3] = src[3 * src_stride];
+ dst[4] = src[4 * src_stride];
+ dst[5] = src[5 * src_stride];
+ dst[6] = src[6 * src_stride];
+ dst[7] = src[7 * src_stride];
+ ++src;
+ dst += dst_stride;
+ }
+}
+
+static void TransposeWxH_C(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride,
+ int width, int height) {
+ for (int i = 0; i < width; ++i) {
+ for (int j = 0; j < height; ++j) {
+ dst[i * dst_stride + j] = src[j * src_stride + i];
+ }
+ }
+}
+
+LIBYUV_API
+void TransposePlane(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride,
+ int width, int height) {
+ void (*TransposeWx8)(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride,
+ int width) = TransposeWx8_C;
+#if defined(HAS_TRANSPOSE_WX8_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ TransposeWx8 = TransposeWx8_NEON;
+ }
+#endif
+#if defined(HAS_TRANSPOSE_WX8_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) {
+ TransposeWx8 = TransposeWx8_SSSE3;
+ }
+#endif
+#if defined(HAS_TRANSPOSE_WX8_FAST_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) &&
+ IS_ALIGNED(width, 16) &&
+ IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) {
+ TransposeWx8 = TransposeWx8_FAST_SSSE3;
+ }
+#endif
+#if defined(HAS_TRANSPOSE_WX8_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2)) {
+ if (IS_ALIGNED(width, 4) &&
+ IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
+ TransposeWx8 = TransposeWx8_FAST_MIPS_DSPR2;
+ } else {
+ TransposeWx8 = TransposeWx8_MIPS_DSPR2;
+ }
+ }
+#endif
+
+ // Work across the source in 8x8 tiles
+ int i = height;
+ while (i >= 8) {
+ TransposeWx8(src, src_stride, dst, dst_stride, width);
+ src += 8 * src_stride; // Go down 8 rows.
+ dst += 8; // Move over 8 columns.
+ i -= 8;
+ }
+
+ TransposeWxH_C(src, src_stride, dst, dst_stride, width, i);
+}
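+
+// Note: each TransposeWx8 call transposes an 8-row strip (8 source rows
+// become 8 destination columns across the full width); the remaining
+// height % 8 rows are handled by the scalar TransposeWxH_C fallback.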
+
+LIBYUV_API
+void RotatePlane90(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride,
+ int width, int height) {
+ // Rotate by 90 is a transpose with the source read
+ // from bottom to top. So set the source pointer to the end
+ // of the buffer and flip the sign of the source stride.
+ src += src_stride * (height - 1);
+ src_stride = -src_stride;
+ TransposePlane(src, src_stride, dst, dst_stride, width, height);
+}
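+
+// Worked example (illustration only): rotating a 3x2 plane by 90 degrees.
+//   source rows     read bottom-to-top     transposed = destination
+//   a b c           d e f                  d a
+//   d e f           a b c                  e b
+//                                          f c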
+
+LIBYUV_API
+void RotatePlane270(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride,
+ int width, int height) {
+ // Rotate by 270 is a transpose with the destination written
+ // from bottom to top. So set the destination pointer to the end
+ // of the buffer and flip the sign of the destination stride.
+ dst += dst_stride * (width - 1);
+ dst_stride = -dst_stride;
+ TransposePlane(src, src_stride, dst, dst_stride, width, height);
+}
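+
+// Worked example (illustration only): the same 3x2 plane rotated by 270
+// degrees. The normal transpose (a d / b e / c f) is written bottom-up,
+// giving
+//   c f
+//   b e
+//   a d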
+
+LIBYUV_API
+void RotatePlane180(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride,
+ int width, int height) {
+ void (*MirrorRow)(const uint8* src, uint8* dst, int width) = MirrorRow_C;
+#if defined(HAS_MIRRORROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) {
+ MirrorRow = MirrorRow_NEON;
+ }
+#endif
+#if defined(HAS_MIRRORROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) &&
+ IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
+ IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
+ MirrorRow = MirrorRow_SSE2;
+ }
+#endif
+#if defined(HAS_MIRRORROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16) &&
+ IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
+ IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
+ MirrorRow = MirrorRow_SSSE3;
+ }
+#endif
+#if defined(HAS_MIRRORROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 32)) {
+ MirrorRow = MirrorRow_AVX2;
+ }
+#endif
+#if defined(HAS_MIRRORROW_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
+ IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4) &&
+ IS_ALIGNED(dst, 4) && IS_ALIGNED(dst_stride, 4)) {
+ MirrorRow = MirrorRow_MIPS_DSPR2;
+ }
+#endif
+ void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
+#if defined(HAS_COPYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) {
+ CopyRow = CopyRow_NEON;
+ }
+#endif
+#if defined(HAS_COPYROW_X86)
+ if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) {
+ CopyRow = CopyRow_X86;
+ }
+#endif
+#if defined(HAS_COPYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) &&
+ IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
+ IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
+ CopyRow = CopyRow_SSE2;
+ }
+#endif
+#if defined(HAS_COPYROW_ERMS)
+ if (TestCpuFlag(kCpuHasERMS)) {
+ CopyRow = CopyRow_ERMS;
+ }
+#endif
+#if defined(HAS_COPYROW_MIPS)
+ if (TestCpuFlag(kCpuHasMIPS)) {
+ CopyRow = CopyRow_MIPS;
+ }
+#endif
+ if (width > kMaxStride) {
+ return;
+ }
+ // Swap first and last row and mirror the content. Uses a temporary row.
+ SIMD_ALIGNED(uint8 row[kMaxStride]);
+ const uint8* src_bot = src + src_stride * (height - 1);
+ uint8* dst_bot = dst + dst_stride * (height - 1);
+ int half_height = (height + 1) >> 1;
+ // Odd height will harmlessly mirror the middle row twice.
+ for (int y = 0; y < half_height; ++y) {
+ MirrorRow(src, row, width); // Mirror first row into a buffer
+ src += src_stride;
+ MirrorRow(src_bot, dst, width); // Mirror last row into first row
+ dst += dst_stride;
+ CopyRow(row, dst_bot, width); // Copy first mirrored row into last
+ src_bot -= src_stride;
+ dst_bot -= dst_stride;
+ }
+}
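+
+// Scalar equivalent of the 180 degree rotation above (illustration only):
+// for every destination pixel,
+//   dst[y * dst_stride + x] =
+//       src[(height - 1 - y) * src_stride + (width - 1 - x)];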
+
+static void TransposeUVWx8_C(const uint8* src, int src_stride,
+ uint8* dst_a, int dst_stride_a,
+ uint8* dst_b, int dst_stride_b,
+ int width) {
+ for (int i = 0; i < width; ++i) {
+ dst_a[0] = src[0 * src_stride + 0];
+ dst_b[0] = src[0 * src_stride + 1];
+ dst_a[1] = src[1 * src_stride + 0];
+ dst_b[1] = src[1 * src_stride + 1];
+ dst_a[2] = src[2 * src_stride + 0];
+ dst_b[2] = src[2 * src_stride + 1];
+ dst_a[3] = src[3 * src_stride + 0];
+ dst_b[3] = src[3 * src_stride + 1];
+ dst_a[4] = src[4 * src_stride + 0];
+ dst_b[4] = src[4 * src_stride + 1];
+ dst_a[5] = src[5 * src_stride + 0];
+ dst_b[5] = src[5 * src_stride + 1];
+ dst_a[6] = src[6 * src_stride + 0];
+ dst_b[6] = src[6 * src_stride + 1];
+ dst_a[7] = src[7 * src_stride + 0];
+ dst_b[7] = src[7 * src_stride + 1];
+ src += 2;
+ dst_a += dst_stride_a;
+ dst_b += dst_stride_b;
+ }
+}
+
+static void TransposeUVWxH_C(const uint8* src, int src_stride,
+ uint8* dst_a, int dst_stride_a,
+ uint8* dst_b, int dst_stride_b,
+ int width, int height) {
+ for (int i = 0; i < width * 2; i += 2)
+ for (int j = 0; j < height; ++j) {
+ dst_a[j + ((i >> 1) * dst_stride_a)] = src[i + (j * src_stride)];
+ dst_b[j + ((i >> 1) * dst_stride_b)] = src[i + (j * src_stride) + 1];
+ }
+}
+
+LIBYUV_API
+void TransposeUV(const uint8* src, int src_stride,
+ uint8* dst_a, int dst_stride_a,
+ uint8* dst_b, int dst_stride_b,
+ int width, int height) {
+ void (*TransposeUVWx8)(const uint8* src, int src_stride,
+ uint8* dst_a, int dst_stride_a,
+ uint8* dst_b, int dst_stride_b,
+ int width) = TransposeUVWx8_C;
+#if defined(HAS_TRANSPOSE_UVWX8_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ TransposeUVWx8 = TransposeUVWx8_NEON;
+ }
+#elif defined(HAS_TRANSPOSE_UVWX8_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) &&
+ IS_ALIGNED(width, 8) &&
+ IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) {
+ TransposeUVWx8 = TransposeUVWx8_SSE2;
+ }
+#elif defined(HAS_TRANSPOSE_UVWx8_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 2) &&
+ IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
+ TransposeUVWx8 = TransposeUVWx8_MIPS_DSPR2;
+ }
+#endif
+
+ // Work through the source in 8x8 tiles.
+ int i = height;
+ while (i >= 8) {
+ TransposeUVWx8(src, src_stride,
+ dst_a, dst_stride_a,
+ dst_b, dst_stride_b,
+ width);
+ src += 8 * src_stride; // Go down 8 rows.
+ dst_a += 8; // Move over 8 columns.
+ dst_b += 8; // Move over 8 columns.
+ i -= 8;
+ }
+
+ TransposeUVWxH_C(src, src_stride,
+ dst_a, dst_stride_a,
+ dst_b, dst_stride_b,
+ width, i);
+}
+
+LIBYUV_API
+void RotateUV90(const uint8* src, int src_stride,
+ uint8* dst_a, int dst_stride_a,
+ uint8* dst_b, int dst_stride_b,
+ int width, int height) {
+ src += src_stride * (height - 1);
+ src_stride = -src_stride;
+
+ TransposeUV(src, src_stride,
+ dst_a, dst_stride_a,
+ dst_b, dst_stride_b,
+ width, height);
+}
+
+LIBYUV_API
+void RotateUV270(const uint8* src, int src_stride,
+ uint8* dst_a, int dst_stride_a,
+ uint8* dst_b, int dst_stride_b,
+ int width, int height) {
+ dst_a += dst_stride_a * (width - 1);
+ dst_b += dst_stride_b * (width - 1);
+ dst_stride_a = -dst_stride_a;
+ dst_stride_b = -dst_stride_b;
+
+ TransposeUV(src, src_stride,
+ dst_a, dst_stride_a,
+ dst_b, dst_stride_b,
+ width, height);
+}
+
+// Rotate 180 is a horizontal and vertical flip.
+LIBYUV_API
+void RotateUV180(const uint8* src, int src_stride,
+ uint8* dst_a, int dst_stride_a,
+ uint8* dst_b, int dst_stride_b,
+ int width, int height) {
+ void (*MirrorRowUV)(const uint8* src, uint8* dst_u, uint8* dst_v, int width) =
+ MirrorUVRow_C;
+#if defined(HAS_MIRRORUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
+ MirrorRowUV = MirrorUVRow_NEON;
+ }
+#elif defined(HAS_MIRRORROW_UV_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16) &&
+ IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) {
+ MirrorRowUV = MirrorUVRow_SSSE3;
+ }
+#elif defined(HAS_MIRRORUVROW_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
+ IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
+ MirrorRowUV = MirrorUVRow_MIPS_DSPR2;
+ }
+#endif
+
+ dst_a += dst_stride_a * (height - 1);
+ dst_b += dst_stride_b * (height - 1);
+
+ for (int i = 0; i < height; ++i) {
+ MirrorRowUV(src, dst_a, dst_b, width);
+ src += src_stride;
+ dst_a -= dst_stride_a;
+ dst_b -= dst_stride_b;
+ }
+}
+
+LIBYUV_API
+int RotatePlane(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride,
+ int width, int height,
+ RotationMode mode) {
+ if (!src || width <= 0 || height == 0 || !dst) {
+ return -1;
+ }
+
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src = src + (height - 1) * src_stride;
+ src_stride = -src_stride;
+ }
+
+ switch (mode) {
+ case kRotate0:
+ // copy frame
+ CopyPlane(src, src_stride,
+ dst, dst_stride,
+ width, height);
+ return 0;
+ case kRotate90:
+ RotatePlane90(src, src_stride,
+ dst, dst_stride,
+ width, height);
+ return 0;
+ case kRotate270:
+ RotatePlane270(src, src_stride,
+ dst, dst_stride,
+ width, height);
+ return 0;
+ case kRotate180:
+ RotatePlane180(src, src_stride,
+ dst, dst_stride,
+ width, height);
+ return 0;
+ default:
+ break;
+ }
+ return -1;
+}
+
+LIBYUV_API
+int I420Rotate(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height,
+ RotationMode mode) {
+ if (!src_y || !src_u || !src_v || width <= 0 || height == 0 ||
+ !dst_y || !dst_u || !dst_v) {
+ return -1;
+ }
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ halfheight = (height + 1) >> 1;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_u = src_u + (halfheight - 1) * src_stride_u;
+ src_v = src_v + (halfheight - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+
+ switch (mode) {
+ case kRotate0:
+ // copy frame
+ return I420Copy(src_y, src_stride_y,
+ src_u, src_stride_u,
+ src_v, src_stride_v,
+ dst_y, dst_stride_y,
+ dst_u, dst_stride_u,
+ dst_v, dst_stride_v,
+ width, height);
+ case kRotate90:
+ RotatePlane90(src_y, src_stride_y,
+ dst_y, dst_stride_y,
+ width, height);
+ RotatePlane90(src_u, src_stride_u,
+ dst_u, dst_stride_u,
+ halfwidth, halfheight);
+ RotatePlane90(src_v, src_stride_v,
+ dst_v, dst_stride_v,
+ halfwidth, halfheight);
+ return 0;
+ case kRotate270:
+ RotatePlane270(src_y, src_stride_y,
+ dst_y, dst_stride_y,
+ width, height);
+ RotatePlane270(src_u, src_stride_u,
+ dst_u, dst_stride_u,
+ halfwidth, halfheight);
+ RotatePlane270(src_v, src_stride_v,
+ dst_v, dst_stride_v,
+ halfwidth, halfheight);
+ return 0;
+ case kRotate180:
+ RotatePlane180(src_y, src_stride_y,
+ dst_y, dst_stride_y,
+ width, height);
+ RotatePlane180(src_u, src_stride_u,
+ dst_u, dst_stride_u,
+ halfwidth, halfheight);
+ RotatePlane180(src_v, src_stride_v,
+ dst_v, dst_stride_v,
+ halfwidth, halfheight);
+ return 0;
+ default:
+ break;
+ }
+ return -1;
+}
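+
+// Usage sketch (not part of libyuv; the 640x480 layout below is an assumed
+// example). Rotating by 90 or 270 swaps the destination dimensions, so the
+// destination strides are based on the source height:
+//   uint8* src = ...;  // contiguous 640x480 I420: Y plane, then U, then V.
+//   uint8* dst = ...;  // same byte size, laid out as a 480x640 I420 frame.
+//   I420Rotate(src, 640,
+//              src + 640 * 480, 320,
+//              src + 640 * 480 * 5 / 4, 320,
+//              dst, 480,
+//              dst + 480 * 640, 240,
+//              dst + 480 * 640 * 5 / 4, 240,
+//              640, 480, kRotate90);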
+
+LIBYUV_API
+int NV12ToI420Rotate(const uint8* src_y, int src_stride_y,
+ const uint8* src_uv, int src_stride_uv,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height,
+ RotationMode mode) {
+ if (!src_y || !src_uv || width <= 0 || height == 0 ||
+ !dst_y || !dst_u || !dst_v) {
+ return -1;
+ }
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ halfheight = (height + 1) >> 1;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_uv = src_uv + (halfheight - 1) * src_stride_uv;
+ src_stride_y = -src_stride_y;
+ src_stride_uv = -src_stride_uv;
+ }
+
+ switch (mode) {
+ case kRotate0:
+ // copy frame
+ return NV12ToI420(src_y, src_stride_y,
+ src_uv, src_stride_uv,
+ dst_y, dst_stride_y,
+ dst_u, dst_stride_u,
+ dst_v, dst_stride_v,
+ width, height);
+ case kRotate90:
+ RotatePlane90(src_y, src_stride_y,
+ dst_y, dst_stride_y,
+ width, height);
+ RotateUV90(src_uv, src_stride_uv,
+ dst_u, dst_stride_u,
+ dst_v, dst_stride_v,
+ halfwidth, halfheight);
+ return 0;
+ case kRotate270:
+ RotatePlane270(src_y, src_stride_y,
+ dst_y, dst_stride_y,
+ width, height);
+ RotateUV270(src_uv, src_stride_uv,
+ dst_u, dst_stride_u,
+ dst_v, dst_stride_v,
+ halfwidth, halfheight);
+ return 0;
+ case kRotate180:
+ RotatePlane180(src_y, src_stride_y,
+ dst_y, dst_stride_y,
+ width, height);
+ RotateUV180(src_uv, src_stride_uv,
+ dst_u, dst_stride_u,
+ dst_v, dst_stride_v,
+ halfwidth, halfheight);
+ return 0;
+ default:
+ break;
+ }
+ return -1;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/chromium/third_party/libyuv/source/rotate_argb.cc b/chromium/third_party/libyuv/source/rotate_argb.cc
new file mode 100644
index 00000000000..5fa0d7ea798
--- /dev/null
+++ b/chromium/third_party/libyuv/source/rotate_argb.cc
@@ -0,0 +1,213 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/rotate.h"
+
+#include "libyuv/cpu_id.h"
+#include "libyuv/convert.h"
+#include "libyuv/planar_functions.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// ARGBScale has a function to copy pixels to a row, striding each source
+// pixel by a constant.
+#if !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || \
+ defined(__x86_64__) || defined(__i386__))
+#define HAS_SCALEARGBROWDOWNEVEN_SSE2
+void ScaleARGBRowDownEven_SSE2(const uint8* src_ptr, int src_stride,
+ int src_stepx,
+ uint8* dst_ptr, int dst_width);
+#endif
+#if !defined(LIBYUV_DISABLE_NEON) && \
+ (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
+#define HAS_SCALEARGBROWDOWNEVEN_NEON
+void ScaleARGBRowDownEven_NEON(const uint8* src_ptr, int src_stride,
+ int src_stepx,
+ uint8* dst_ptr, int dst_width);
+#endif
+
+void ScaleARGBRowDownEven_C(const uint8* src_ptr, int,
+ int src_stepx,
+ uint8* dst_ptr, int dst_width);
+
+static void ARGBTranspose(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride,
+ int width, int height) {
+ int src_pixel_step = src_stride >> 2;
+ void (*ScaleARGBRowDownEven)(const uint8* src_ptr, int src_stride,
+ int src_step, uint8* dst_ptr, int dst_width) = ScaleARGBRowDownEven_C;
+#if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(height, 4) && // Width of dest.
+ IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
+ ScaleARGBRowDownEven = ScaleARGBRowDownEven_SSE2;
+ }
+#elif defined(HAS_SCALEARGBROWDOWNEVEN_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(height, 4) && // Width of dest.
+ IS_ALIGNED(src, 4)) {
+ ScaleARGBRowDownEven = ScaleARGBRowDownEven_NEON;
+ }
+#endif
+
+ for (int i = 0; i < width; ++i) { // column of source to row of dest.
+ ScaleARGBRowDownEven(src, 0, src_pixel_step, dst, height);
+ dst += dst_stride;
+ src += 4;
+ }
+}
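+
+// Note: with src_pixel_step equal to src_stride / 4, ScaleARGBRowDownEven
+// reads the ARGB pixels at src, src + src_stride, src + 2 * src_stride, ...
+// so it walks down one source column and writes it out as one packed
+// destination row, which is the transpose of that column. The C fallback is
+// effectively (illustration only):
+//   const uint32* s = reinterpret_cast<const uint32*>(src_ptr);
+//   uint32* d = reinterpret_cast<uint32*>(dst_ptr);
+//   for (int x = 0; x < dst_width; ++x) {
+//     d[x] = s[x * src_stepx];
+//   }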
+
+void ARGBRotate90(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride,
+ int width, int height) {
+  // Rotate by 90 is an ARGBTranspose with the source read
+ // from bottom to top. So set the source pointer to the end
+ // of the buffer and flip the sign of the source stride.
+ src += src_stride * (height - 1);
+ src_stride = -src_stride;
+ ARGBTranspose(src, src_stride, dst, dst_stride, width, height);
+}
+
+void ARGBRotate270(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride,
+ int width, int height) {
+  // Rotate by 270 is an ARGBTranspose with the destination written
+ // from bottom to top. So set the destination pointer to the end
+ // of the buffer and flip the sign of the destination stride.
+ dst += dst_stride * (width - 1);
+ dst_stride = -dst_stride;
+ ARGBTranspose(src, src_stride, dst, dst_stride, width, height);
+}
+
+void ARGBRotate180(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride,
+ int width, int height) {
+ void (*ARGBMirrorRow)(const uint8* src, uint8* dst, int width) =
+ ARGBMirrorRow_C;
+#if defined(HAS_ARGBMIRRORROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4) &&
+ IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
+ IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
+ ARGBMirrorRow = ARGBMirrorRow_SSSE3;
+ }
+#endif
+#if defined(HAS_ARGBMIRRORROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 8)) {
+ ARGBMirrorRow = ARGBMirrorRow_AVX2;
+ }
+#endif
+#if defined(HAS_ARGBMIRRORROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 4)) {
+ ARGBMirrorRow = ARGBMirrorRow_NEON;
+ }
+#endif
+ void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
+#if defined(HAS_COPYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width * 4, 32)) {
+ CopyRow = CopyRow_NEON;
+ }
+#endif
+#if defined(HAS_COPYROW_X86)
+ if (TestCpuFlag(kCpuHasX86)) {
+ CopyRow = CopyRow_X86;
+ }
+#endif
+#if defined(HAS_COPYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width * 4, 32) &&
+ IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
+ IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
+ CopyRow = CopyRow_SSE2;
+ }
+#endif
+#if defined(HAS_COPYROW_ERMS)
+ if (TestCpuFlag(kCpuHasERMS)) {
+ CopyRow = CopyRow_ERMS;
+ }
+#endif
+#if defined(HAS_COPYROW_MIPS)
+ if (TestCpuFlag(kCpuHasMIPS)) {
+ CopyRow = CopyRow_MIPS;
+ }
+#endif
+ bool direct = width * 4 > kMaxStride;
+
+ // Swap first and last row and mirror the content. Uses a temporary row.
+ SIMD_ALIGNED(uint8 row[kMaxStride]);
+ const uint8* src_bot = src + src_stride * (height - 1);
+ uint8* dst_bot = dst + dst_stride * (height - 1);
+ int half_height = (height + 1) >> 1;
+ // Odd height will harmlessly mirror the middle row twice.
+ for (int y = 0; y < half_height; ++y) {
+ if (direct) {
+ ARGBMirrorRow(src, dst_bot, width); // Mirror first row into a buffer
+ if (src != src_bot) {
+ ARGBMirrorRow(src_bot, dst, width); // Mirror last row into first row
+ }
+ } else {
+ ARGBMirrorRow(src, row, width); // Mirror first row into a buffer
+ ARGBMirrorRow(src_bot, dst, width); // Mirror last row into first row
+ CopyRow(row, dst_bot, width * 4); // Copy first mirrored row into last
+ }
+ src += src_stride;
+ dst += dst_stride;
+ src_bot -= src_stride;
+ dst_bot -= dst_stride;
+ }
+}
+
+LIBYUV_API
+int ARGBRotate(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height,
+ RotationMode mode) {
+ if (!src_argb || width <= 0 || height == 0 || !dst_argb) {
+ return -1;
+ }
+
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+
+ switch (mode) {
+ case kRotate0:
+ // copy frame
+ return ARGBCopy(src_argb, src_stride_argb,
+ dst_argb, dst_stride_argb,
+ width, height);
+ case kRotate90:
+ ARGBRotate90(src_argb, src_stride_argb,
+ dst_argb, dst_stride_argb,
+ width, height);
+ return 0;
+ case kRotate270:
+ ARGBRotate270(src_argb, src_stride_argb,
+ dst_argb, dst_stride_argb,
+ width, height);
+ return 0;
+ case kRotate180:
+ ARGBRotate180(src_argb, src_stride_argb,
+ dst_argb, dst_stride_argb,
+ width, height);
+ return 0;
+ default:
+ break;
+ }
+ return -1;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/chromium/third_party/libyuv/source/rotate_mips.cc b/chromium/third_party/libyuv/source/rotate_mips.cc
new file mode 100644
index 00000000000..04d5a663f77
--- /dev/null
+++ b/chromium/third_party/libyuv/source/rotate_mips.cc
@@ -0,0 +1,486 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#if !defined(LIBYUV_DISABLE_MIPS) && \
+ defined(__mips_dsp) && (__mips_dsp_rev >= 2)
+
+void TransposeWx8_MIPS_DSPR2(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride,
+ int width) {
+ __asm__ __volatile__ (
+ ".set push \n"
+ ".set noreorder \n"
+ "sll $t2, %[src_stride], 0x1 \n" // src_stride x 2
+ "sll $t4, %[src_stride], 0x2 \n" // src_stride x 4
+ "sll $t9, %[src_stride], 0x3 \n" // src_stride x 8
+ "addu $t3, $t2, %[src_stride] \n"
+ "addu $t5, $t4, %[src_stride] \n"
+ "addu $t6, $t2, $t4 \n"
+ "andi $t0, %[dst], 0x3 \n"
+ "andi $t1, %[dst_stride], 0x3 \n"
+ "or $t0, $t0, $t1 \n"
+ "bnez $t0, 11f \n"
+ " subu $t7, $t9, %[src_stride] \n"
+//dst + dst_stride word aligned
+ "1: \n"
+ "lbu $t0, 0(%[src]) \n"
+ "lbux $t1, %[src_stride](%[src]) \n"
+ "lbux $t8, $t2(%[src]) \n"
+ "lbux $t9, $t3(%[src]) \n"
+ "sll $t1, $t1, 16 \n"
+ "sll $t9, $t9, 16 \n"
+ "or $t0, $t0, $t1 \n"
+ "or $t8, $t8, $t9 \n"
+ "precr.qb.ph $s0, $t8, $t0 \n"
+ "lbux $t0, $t4(%[src]) \n"
+ "lbux $t1, $t5(%[src]) \n"
+ "lbux $t8, $t6(%[src]) \n"
+ "lbux $t9, $t7(%[src]) \n"
+ "sll $t1, $t1, 16 \n"
+ "sll $t9, $t9, 16 \n"
+ "or $t0, $t0, $t1 \n"
+ "or $t8, $t8, $t9 \n"
+ "precr.qb.ph $s1, $t8, $t0 \n"
+ "sw $s0, 0(%[dst]) \n"
+ "addiu %[width], -1 \n"
+ "addiu %[src], 1 \n"
+ "sw $s1, 4(%[dst]) \n"
+ "bnez %[width], 1b \n"
+ " addu %[dst], %[dst], %[dst_stride] \n"
+ "b 2f \n"
+//dst + dst_stride unaligned
+ "11: \n"
+ "lbu $t0, 0(%[src]) \n"
+ "lbux $t1, %[src_stride](%[src]) \n"
+ "lbux $t8, $t2(%[src]) \n"
+ "lbux $t9, $t3(%[src]) \n"
+ "sll $t1, $t1, 16 \n"
+ "sll $t9, $t9, 16 \n"
+ "or $t0, $t0, $t1 \n"
+ "or $t8, $t8, $t9 \n"
+ "precr.qb.ph $s0, $t8, $t0 \n"
+ "lbux $t0, $t4(%[src]) \n"
+ "lbux $t1, $t5(%[src]) \n"
+ "lbux $t8, $t6(%[src]) \n"
+ "lbux $t9, $t7(%[src]) \n"
+ "sll $t1, $t1, 16 \n"
+ "sll $t9, $t9, 16 \n"
+ "or $t0, $t0, $t1 \n"
+ "or $t8, $t8, $t9 \n"
+ "precr.qb.ph $s1, $t8, $t0 \n"
+ "swr $s0, 0(%[dst]) \n"
+ "swl $s0, 3(%[dst]) \n"
+ "addiu %[width], -1 \n"
+ "addiu %[src], 1 \n"
+ "swr $s1, 4(%[dst]) \n"
+ "swl $s1, 7(%[dst]) \n"
+ "bnez %[width], 11b \n"
+ "addu %[dst], %[dst], %[dst_stride] \n"
+ "2: \n"
+ ".set pop \n"
+ :[src] "+r" (src),
+ [dst] "+r" (dst),
+ [width] "+r" (width)
+ :[src_stride] "r" (src_stride),
+ [dst_stride] "r" (dst_stride)
+ : "t0", "t1", "t2", "t3", "t4", "t5",
+ "t6", "t7", "t8", "t9",
+ "s0", "s1"
+ );
+}
+
+void TransposeWx8_FAST_MIPS_DSPR2(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride,
+ int width) {
+ __asm__ __volatile__ (
+ ".set noat \n"
+ ".set push \n"
+ ".set noreorder \n"
+ "beqz %[width], 2f \n"
+ " sll $t2, %[src_stride], 0x1 \n" // src_stride x 2
+ "sll $t4, %[src_stride], 0x2 \n" // src_stride x 4
+ "sll $t9, %[src_stride], 0x3 \n" // src_stride x 8
+ "addu $t3, $t2, %[src_stride] \n"
+ "addu $t5, $t4, %[src_stride] \n"
+ "addu $t6, $t2, $t4 \n"
+
+ "srl $AT, %[width], 0x2 \n"
+ "andi $t0, %[dst], 0x3 \n"
+ "andi $t1, %[dst_stride], 0x3 \n"
+ "or $t0, $t0, $t1 \n"
+ "bnez $t0, 11f \n"
+ " subu $t7, $t9, %[src_stride] \n"
+//dst + dst_stride word aligned
+ "1: \n"
+ "lw $t0, 0(%[src]) \n"
+ "lwx $t1, %[src_stride](%[src]) \n"
+ "lwx $t8, $t2(%[src]) \n"
+ "lwx $t9, $t3(%[src]) \n"
+
+// t0 = | 30 | 20 | 10 | 00 |
+// t1 = | 31 | 21 | 11 | 01 |
+// t8 = | 32 | 22 | 12 | 02 |
+// t9 = | 33 | 23 | 13 | 03 |
+
+ "precr.qb.ph $s0, $t1, $t0 \n"
+ "precr.qb.ph $s1, $t9, $t8 \n"
+ "precrq.qb.ph $s2, $t1, $t0 \n"
+ "precrq.qb.ph $s3, $t9, $t8 \n"
+
+ // s0 = | 21 | 01 | 20 | 00 |
+ // s1 = | 23 | 03 | 22 | 02 |
+ // s2 = | 31 | 11 | 30 | 10 |
+ // s3 = | 33 | 13 | 32 | 12 |
+
+ "precr.qb.ph $s4, $s1, $s0 \n"
+ "precrq.qb.ph $s5, $s1, $s0 \n"
+ "precr.qb.ph $s6, $s3, $s2 \n"
+ "precrq.qb.ph $s7, $s3, $s2 \n"
+
+ // s4 = | 03 | 02 | 01 | 00 |
+ // s5 = | 23 | 22 | 21 | 20 |
+ // s6 = | 13 | 12 | 11 | 10 |
+ // s7 = | 33 | 32 | 31 | 30 |
+
+ "lwx $t0, $t4(%[src]) \n"
+ "lwx $t1, $t5(%[src]) \n"
+ "lwx $t8, $t6(%[src]) \n"
+ "lwx $t9, $t7(%[src]) \n"
+
+// t0 = | 34 | 24 | 14 | 04 |
+// t1 = | 35 | 25 | 15 | 05 |
+// t8 = | 36 | 26 | 16 | 06 |
+// t9 = | 37 | 27 | 17 | 07 |
+
+ "precr.qb.ph $s0, $t1, $t0 \n"
+ "precr.qb.ph $s1, $t9, $t8 \n"
+ "precrq.qb.ph $s2, $t1, $t0 \n"
+ "precrq.qb.ph $s3, $t9, $t8 \n"
+
+ // s0 = | 25 | 05 | 24 | 04 |
+ // s1 = | 27 | 07 | 26 | 06 |
+ // s2 = | 35 | 15 | 34 | 14 |
+ // s3 = | 37 | 17 | 36 | 16 |
+
+ "precr.qb.ph $t0, $s1, $s0 \n"
+ "precrq.qb.ph $t1, $s1, $s0 \n"
+ "precr.qb.ph $t8, $s3, $s2 \n"
+ "precrq.qb.ph $t9, $s3, $s2 \n"
+
+ // t0 = | 07 | 06 | 05 | 04 |
+ // t1 = | 27 | 26 | 25 | 24 |
+ // t8 = | 17 | 16 | 15 | 14 |
+ // t9 = | 37 | 36 | 35 | 34 |
+
+ "addu $s0, %[dst], %[dst_stride] \n"
+ "addu $s1, $s0, %[dst_stride] \n"
+ "addu $s2, $s1, %[dst_stride] \n"
+
+ "sw $s4, 0(%[dst]) \n"
+ "sw $t0, 4(%[dst]) \n"
+ "sw $s6, 0($s0) \n"
+ "sw $t8, 4($s0) \n"
+ "sw $s5, 0($s1) \n"
+ "sw $t1, 4($s1) \n"
+ "sw $s7, 0($s2) \n"
+ "sw $t9, 4($s2) \n"
+
+ "addiu $AT, -1 \n"
+ "addiu %[src], 4 \n"
+
+ "bnez $AT, 1b \n"
+ " addu %[dst], $s2, %[dst_stride] \n"
+ "b 2f \n"
+//dst + dst_stride unaligned
+ "11: \n"
+ "lw $t0, 0(%[src]) \n"
+ "lwx $t1, %[src_stride](%[src]) \n"
+ "lwx $t8, $t2(%[src]) \n"
+ "lwx $t9, $t3(%[src]) \n"
+
+// t0 = | 30 | 20 | 10 | 00 |
+// t1 = | 31 | 21 | 11 | 01 |
+// t8 = | 32 | 22 | 12 | 02 |
+// t9 = | 33 | 23 | 13 | 03 |
+
+ "precr.qb.ph $s0, $t1, $t0 \n"
+ "precr.qb.ph $s1, $t9, $t8 \n"
+ "precrq.qb.ph $s2, $t1, $t0 \n"
+ "precrq.qb.ph $s3, $t9, $t8 \n"
+
+ // s0 = | 21 | 01 | 20 | 00 |
+ // s1 = | 23 | 03 | 22 | 02 |
+ // s2 = | 31 | 11 | 30 | 10 |
+ // s3 = | 33 | 13 | 32 | 12 |
+
+ "precr.qb.ph $s4, $s1, $s0 \n"
+ "precrq.qb.ph $s5, $s1, $s0 \n"
+ "precr.qb.ph $s6, $s3, $s2 \n"
+ "precrq.qb.ph $s7, $s3, $s2 \n"
+
+ // s4 = | 03 | 02 | 01 | 00 |
+ // s5 = | 23 | 22 | 21 | 20 |
+ // s6 = | 13 | 12 | 11 | 10 |
+ // s7 = | 33 | 32 | 31 | 30 |
+
+ "lwx $t0, $t4(%[src]) \n"
+ "lwx $t1, $t5(%[src]) \n"
+ "lwx $t8, $t6(%[src]) \n"
+ "lwx $t9, $t7(%[src]) \n"
+
+// t0 = | 34 | 24 | 14 | 04 |
+// t1 = | 35 | 25 | 15 | 05 |
+// t8 = | 36 | 26 | 16 | 06 |
+// t9 = | 37 | 27 | 17 | 07 |
+
+ "precr.qb.ph $s0, $t1, $t0 \n"
+ "precr.qb.ph $s1, $t9, $t8 \n"
+ "precrq.qb.ph $s2, $t1, $t0 \n"
+ "precrq.qb.ph $s3, $t9, $t8 \n"
+
+ // s0 = | 25 | 05 | 24 | 04 |
+ // s1 = | 27 | 07 | 26 | 06 |
+ // s2 = | 35 | 15 | 34 | 14 |
+ // s3 = | 37 | 17 | 36 | 16 |
+
+ "precr.qb.ph $t0, $s1, $s0 \n"
+ "precrq.qb.ph $t1, $s1, $s0 \n"
+ "precr.qb.ph $t8, $s3, $s2 \n"
+ "precrq.qb.ph $t9, $s3, $s2 \n"
+
+ // t0 = | 07 | 06 | 05 | 04 |
+ // t1 = | 27 | 26 | 25 | 24 |
+ // t8 = | 17 | 16 | 15 | 14 |
+ // t9 = | 37 | 36 | 35 | 34 |
+
+ "addu $s0, %[dst], %[dst_stride] \n"
+ "addu $s1, $s0, %[dst_stride] \n"
+ "addu $s2, $s1, %[dst_stride] \n"
+
+ "swr $s4, 0(%[dst]) \n"
+ "swl $s4, 3(%[dst]) \n"
+ "swr $t0, 4(%[dst]) \n"
+ "swl $t0, 7(%[dst]) \n"
+ "swr $s6, 0($s0) \n"
+ "swl $s6, 3($s0) \n"
+ "swr $t8, 4($s0) \n"
+ "swl $t8, 7($s0) \n"
+ "swr $s5, 0($s1) \n"
+ "swl $s5, 3($s1) \n"
+ "swr $t1, 4($s1) \n"
+ "swl $t1, 7($s1) \n"
+ "swr $s7, 0($s2) \n"
+ "swl $s7, 3($s2) \n"
+ "swr $t9, 4($s2) \n"
+ "swl $t9, 7($s2) \n"
+
+ "addiu $AT, -1 \n"
+ "addiu %[src], 4 \n"
+
+ "bnez $AT, 11b \n"
+ " addu %[dst], $s2, %[dst_stride] \n"
+ "2: \n"
+ ".set pop \n"
+ ".set at \n"
+ :[src] "+r" (src),
+ [dst] "+r" (dst),
+ [width] "+r" (width)
+ :[src_stride] "r" (src_stride),
+ [dst_stride] "r" (dst_stride)
+ : "t0", "t1", "t2", "t3", "t4", "t5",
+ "t6", "t7", "t8", "t9",
+ "s0", "s1", "s2", "s3", "s4",
+ "s5", "s6", "s7"
+ );
+}
+
+void TransposeUVWx8_MIPS_DSPR2(const uint8* src, int src_stride,
+ uint8* dst_a, int dst_stride_a,
+ uint8* dst_b, int dst_stride_b,
+ int width) {
+ __asm__ __volatile__ (
+ ".set push \n"
+ ".set noreorder \n"
+ "beqz %[width], 2f \n"
+ " sll $t2, %[src_stride], 0x1 \n" // src_stride x 2
+ "sll $t4, %[src_stride], 0x2 \n" // src_stride x 4
+ "sll $t9, %[src_stride], 0x3 \n" // src_stride x 8
+ "addu $t3, $t2, %[src_stride] \n"
+ "addu $t5, $t4, %[src_stride] \n"
+ "addu $t6, $t2, $t4 \n"
+ "subu $t7, $t9, %[src_stride] \n"
+ "srl $t1, %[width], 1 \n"
+
+// check word alignment for dst_a, dst_b, dst_stride_a and dst_stride_b
+ "andi $t0, %[dst_a], 0x3 \n"
+ "andi $t8, %[dst_b], 0x3 \n"
+ "or $t0, $t0, $t8 \n"
+ "andi $t8, %[dst_stride_a], 0x3 \n"
+ "andi $s5, %[dst_stride_b], 0x3 \n"
+ "or $t8, $t8, $s5 \n"
+ "or $t0, $t0, $t8 \n"
+ "bnez $t0, 11f \n"
+ " nop \n"
+// dst + dst_stride word aligned (both, a & b dst addresses)
+ "1: \n"
+ "lw $t0, 0(%[src]) \n" // |B0|A0|b0|a0|
+ "lwx $t8, %[src_stride](%[src]) \n" // |B1|A1|b1|a1|
+ "addu $s5, %[dst_a], %[dst_stride_a] \n"
+ "lwx $t9, $t2(%[src]) \n" // |B2|A2|b2|a2|
+ "lwx $s0, $t3(%[src]) \n" // |B3|A3|b3|a3|
+ "addu $s6, %[dst_b], %[dst_stride_b] \n"
+
+ "precrq.ph.w $s1, $t8, $t0 \n" // |B1|A1|B0|A0|
+ "precrq.ph.w $s2, $s0, $t9 \n" // |B3|A3|B2|A2|
+ "precr.qb.ph $s3, $s2, $s1 \n" // |A3|A2|A1|A0|
+ "precrq.qb.ph $s4, $s2, $s1 \n" // |B3|B2|B1|B0|
+
+ "sll $t0, $t0, 16 \n"
+ "packrl.ph $s1, $t8, $t0 \n" // |b1|a1|b0|a0|
+ "sll $t9, $t9, 16 \n"
+ "packrl.ph $s2, $s0, $t9 \n" // |b3|a3|b2|a2|
+
+ "sw $s3, 0($s5) \n"
+ "sw $s4, 0($s6) \n"
+
+ "precr.qb.ph $s3, $s2, $s1 \n" // |a3|a2|a1|a0|
+ "precrq.qb.ph $s4, $s2, $s1 \n" // |b3|b2|b1|b0|
+
+ "lwx $t0, $t4(%[src]) \n" // |B4|A4|b4|a4|
+ "lwx $t8, $t5(%[src]) \n" // |B5|A5|b5|a5|
+ "lwx $t9, $t6(%[src]) \n" // |B6|A6|b6|a6|
+ "lwx $s0, $t7(%[src]) \n" // |B7|A7|b7|a7|
+ "sw $s3, 0(%[dst_a]) \n"
+ "sw $s4, 0(%[dst_b]) \n"
+
+ "precrq.ph.w $s1, $t8, $t0 \n" // |B5|A5|B4|A4|
+    "precrq.ph.w      $s2, $s0, $t9                  \n"  // |B7|A7|B6|A6|
+ "precr.qb.ph $s3, $s2, $s1 \n" // |A7|A6|A5|A4|
+ "precrq.qb.ph $s4, $s2, $s1 \n" // |B7|B6|B5|B4|
+
+ "sll $t0, $t0, 16 \n"
+ "packrl.ph $s1, $t8, $t0 \n" // |b5|a5|b4|a4|
+ "sll $t9, $t9, 16 \n"
+ "packrl.ph $s2, $s0, $t9 \n" // |b7|a7|b6|a6|
+ "sw $s3, 4($s5) \n"
+ "sw $s4, 4($s6) \n"
+
+ "precr.qb.ph $s3, $s2, $s1 \n" // |a7|a6|a5|a4|
+ "precrq.qb.ph $s4, $s2, $s1 \n" // |b7|b6|b5|b4|
+
+ "addiu %[src], 4 \n"
+ "addiu $t1, -1 \n"
+ "sll $t0, %[dst_stride_a], 1 \n"
+ "sll $t8, %[dst_stride_b], 1 \n"
+ "sw $s3, 4(%[dst_a]) \n"
+ "sw $s4, 4(%[dst_b]) \n"
+ "addu %[dst_a], %[dst_a], $t0 \n"
+ "bnez $t1, 1b \n"
+ " addu %[dst_b], %[dst_b], $t8 \n"
+ "b 2f \n"
+ " nop \n"
+
+// dst_a or dst_b or dst_stride_a or dst_stride_b not word aligned
+ "11: \n"
+ "lw $t0, 0(%[src]) \n" // |B0|A0|b0|a0|
+ "lwx $t8, %[src_stride](%[src]) \n" // |B1|A1|b1|a1|
+ "addu $s5, %[dst_a], %[dst_stride_a] \n"
+ "lwx $t9, $t2(%[src]) \n" // |B2|A2|b2|a2|
+ "lwx $s0, $t3(%[src]) \n" // |B3|A3|b3|a3|
+ "addu $s6, %[dst_b], %[dst_stride_b] \n"
+
+ "precrq.ph.w $s1, $t8, $t0 \n" // |B1|A1|B0|A0|
+ "precrq.ph.w $s2, $s0, $t9 \n" // |B3|A3|B2|A2|
+ "precr.qb.ph $s3, $s2, $s1 \n" // |A3|A2|A1|A0|
+ "precrq.qb.ph $s4, $s2, $s1 \n" // |B3|B2|B1|B0|
+
+ "sll $t0, $t0, 16 \n"
+ "packrl.ph $s1, $t8, $t0 \n" // |b1|a1|b0|a0|
+ "sll $t9, $t9, 16 \n"
+ "packrl.ph $s2, $s0, $t9 \n" // |b3|a3|b2|a2|
+
+ "swr $s3, 0($s5) \n"
+ "swl $s3, 3($s5) \n"
+ "swr $s4, 0($s6) \n"
+ "swl $s4, 3($s6) \n"
+
+ "precr.qb.ph $s3, $s2, $s1 \n" // |a3|a2|a1|a0|
+ "precrq.qb.ph $s4, $s2, $s1 \n" // |b3|b2|b1|b0|
+
+ "lwx $t0, $t4(%[src]) \n" // |B4|A4|b4|a4|
+ "lwx $t8, $t5(%[src]) \n" // |B5|A5|b5|a5|
+ "lwx $t9, $t6(%[src]) \n" // |B6|A6|b6|a6|
+ "lwx $s0, $t7(%[src]) \n" // |B7|A7|b7|a7|
+ "swr $s3, 0(%[dst_a]) \n"
+ "swl $s3, 3(%[dst_a]) \n"
+ "swr $s4, 0(%[dst_b]) \n"
+ "swl $s4, 3(%[dst_b]) \n"
+
+ "precrq.ph.w $s1, $t8, $t0 \n" // |B5|A5|B4|A4|
+ "precrq.ph.w $s2, $s0, $t9 \n" // |B6|A6|B7|A7|
+ "precr.qb.ph $s3, $s2, $s1 \n" // |A7|A6|A5|A4|
+ "precrq.qb.ph $s4, $s2, $s1 \n" // |B7|B6|B5|B4|
+
+ "sll $t0, $t0, 16 \n"
+ "packrl.ph $s1, $t8, $t0 \n" // |b5|a5|b4|a4|
+ "sll $t9, $t9, 16 \n"
+ "packrl.ph $s2, $s0, $t9 \n" // |b7|a7|b6|a6|
+
+ "swr $s3, 4($s5) \n"
+ "swl $s3, 7($s5) \n"
+ "swr $s4, 4($s6) \n"
+ "swl $s4, 7($s6) \n"
+
+ "precr.qb.ph $s3, $s2, $s1 \n" // |a7|a6|a5|a4|
+ "precrq.qb.ph $s4, $s2, $s1 \n" // |b7|b6|b5|b4|
+
+ "addiu %[src], 4 \n"
+ "addiu $t1, -1 \n"
+ "sll $t0, %[dst_stride_a], 1 \n"
+ "sll $t8, %[dst_stride_b], 1 \n"
+ "swr $s3, 4(%[dst_a]) \n"
+ "swl $s3, 7(%[dst_a]) \n"
+ "swr $s4, 4(%[dst_b]) \n"
+ "swl $s4, 7(%[dst_b]) \n"
+ "addu %[dst_a], %[dst_a], $t0 \n"
+ "bnez $t1, 11b \n"
+ " addu %[dst_b], %[dst_b], $t8 \n"
+
+ "2: \n"
+ ".set pop \n"
+ : [src] "+r" (src),
+ [dst_a] "+r" (dst_a),
+ [dst_b] "+r" (dst_b),
+ [width] "+r" (width),
+ [src_stride] "+r" (src_stride)
+ : [dst_stride_a] "r" (dst_stride_a),
+ [dst_stride_b] "r" (dst_stride_b)
+ : "t0", "t1", "t2", "t3", "t4", "t5",
+ "t6", "t7", "t8", "t9",
+ "s0", "s1", "s2", "s3",
+ "s4", "s5", "s6"
+ );
+}
+
+#endif // defined(__mips_dsp) && (__mips_dsp_rev >= 2)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/chromium/third_party/libyuv/source/rotate_neon.cc b/chromium/third_party/libyuv/source/rotate_neon.cc
new file mode 100644
index 00000000000..ab07c169703
--- /dev/null
+++ b/chromium/third_party/libyuv/source/rotate_neon.cc
@@ -0,0 +1,405 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)
+static const uvec8 kVTbl4x4Transpose =
+ { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };
+
+void TransposeWx8_NEON(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride,
+ int width) {
+ asm volatile (
+ // loops are on blocks of 8. loop will stop when
+ // counter gets to or below 0. starting the counter
+    // at w-8 allows for this
+ "sub %4, #8 \n"
+
+ // handle 8x8 blocks. this should be the majority of the plane
+ ".p2align 4 \n"
+ "1: \n"
+ "mov r9, %0 \n"
+
+ "vld1.8 {d0}, [r9], %1 \n"
+ "vld1.8 {d1}, [r9], %1 \n"
+ "vld1.8 {d2}, [r9], %1 \n"
+ "vld1.8 {d3}, [r9], %1 \n"
+ "vld1.8 {d4}, [r9], %1 \n"
+ "vld1.8 {d5}, [r9], %1 \n"
+ "vld1.8 {d6}, [r9], %1 \n"
+ "vld1.8 {d7}, [r9] \n"
+
+ "vtrn.8 d1, d0 \n"
+ "vtrn.8 d3, d2 \n"
+ "vtrn.8 d5, d4 \n"
+ "vtrn.8 d7, d6 \n"
+
+ "vtrn.16 d1, d3 \n"
+ "vtrn.16 d0, d2 \n"
+ "vtrn.16 d5, d7 \n"
+ "vtrn.16 d4, d6 \n"
+
+ "vtrn.32 d1, d5 \n"
+ "vtrn.32 d0, d4 \n"
+ "vtrn.32 d3, d7 \n"
+ "vtrn.32 d2, d6 \n"
+
+ "vrev16.8 q0, q0 \n"
+ "vrev16.8 q1, q1 \n"
+ "vrev16.8 q2, q2 \n"
+ "vrev16.8 q3, q3 \n"
+
+ "mov r9, %2 \n"
+
+ "vst1.8 {d1}, [r9], %3 \n"
+ "vst1.8 {d0}, [r9], %3 \n"
+ "vst1.8 {d3}, [r9], %3 \n"
+ "vst1.8 {d2}, [r9], %3 \n"
+ "vst1.8 {d5}, [r9], %3 \n"
+ "vst1.8 {d4}, [r9], %3 \n"
+ "vst1.8 {d7}, [r9], %3 \n"
+ "vst1.8 {d6}, [r9] \n"
+
+ "add %0, #8 \n" // src += 8
+ "add %2, %2, %3, lsl #3 \n" // dst += 8 * dst_stride
+ "subs %4, #8 \n" // w -= 8
+ "bge 1b \n"
+
+ // add 8 back to counter. if the result is 0 there are
+ // no residuals.
+ "adds %4, #8 \n"
+ "beq 4f \n"
+
+ // some residual, so between 1 and 7 lines left to transpose
+ "cmp %4, #2 \n"
+ "blt 3f \n"
+
+ "cmp %4, #4 \n"
+ "blt 2f \n"
+
+ // 4x8 block
+ "mov r9, %0 \n"
+ "vld1.32 {d0[0]}, [r9], %1 \n"
+ "vld1.32 {d0[1]}, [r9], %1 \n"
+ "vld1.32 {d1[0]}, [r9], %1 \n"
+ "vld1.32 {d1[1]}, [r9], %1 \n"
+ "vld1.32 {d2[0]}, [r9], %1 \n"
+ "vld1.32 {d2[1]}, [r9], %1 \n"
+ "vld1.32 {d3[0]}, [r9], %1 \n"
+ "vld1.32 {d3[1]}, [r9] \n"
+
+ "mov r9, %2 \n"
+
+ "vld1.8 {q3}, [%5] \n"
+
+ "vtbl.8 d4, {d0, d1}, d6 \n"
+ "vtbl.8 d5, {d0, d1}, d7 \n"
+ "vtbl.8 d0, {d2, d3}, d6 \n"
+ "vtbl.8 d1, {d2, d3}, d7 \n"
+
+ // TODO(frkoenig): Rework shuffle above to
+ // write out with 4 instead of 8 writes.
+ "vst1.32 {d4[0]}, [r9], %3 \n"
+ "vst1.32 {d4[1]}, [r9], %3 \n"
+ "vst1.32 {d5[0]}, [r9], %3 \n"
+ "vst1.32 {d5[1]}, [r9] \n"
+
+ "add r9, %2, #4 \n"
+ "vst1.32 {d0[0]}, [r9], %3 \n"
+ "vst1.32 {d0[1]}, [r9], %3 \n"
+ "vst1.32 {d1[0]}, [r9], %3 \n"
+ "vst1.32 {d1[1]}, [r9] \n"
+
+ "add %0, #4 \n" // src += 4
+ "add %2, %2, %3, lsl #2 \n" // dst += 4 * dst_stride
+ "subs %4, #4 \n" // w -= 4
+ "beq 4f \n"
+
+ // some residual, check to see if it includes a 2x8 block,
+ // or less
+ "cmp %4, #2 \n"
+ "blt 3f \n"
+
+ // 2x8 block
+ "2: \n"
+ "mov r9, %0 \n"
+ "vld1.16 {d0[0]}, [r9], %1 \n"
+ "vld1.16 {d1[0]}, [r9], %1 \n"
+ "vld1.16 {d0[1]}, [r9], %1 \n"
+ "vld1.16 {d1[1]}, [r9], %1 \n"
+ "vld1.16 {d0[2]}, [r9], %1 \n"
+ "vld1.16 {d1[2]}, [r9], %1 \n"
+ "vld1.16 {d0[3]}, [r9], %1 \n"
+ "vld1.16 {d1[3]}, [r9] \n"
+
+ "vtrn.8 d0, d1 \n"
+
+ "mov r9, %2 \n"
+
+ "vst1.64 {d0}, [r9], %3 \n"
+ "vst1.64 {d1}, [r9] \n"
+
+ "add %0, #2 \n" // src += 2
+ "add %2, %2, %3, lsl #1 \n" // dst += 2 * dst_stride
+ "subs %4, #2 \n" // w -= 2
+ "beq 4f \n"
+
+ // 1x8 block
+ "3: \n"
+ "vld1.8 {d0[0]}, [%0], %1 \n"
+ "vld1.8 {d0[1]}, [%0], %1 \n"
+ "vld1.8 {d0[2]}, [%0], %1 \n"
+ "vld1.8 {d0[3]}, [%0], %1 \n"
+ "vld1.8 {d0[4]}, [%0], %1 \n"
+ "vld1.8 {d0[5]}, [%0], %1 \n"
+ "vld1.8 {d0[6]}, [%0], %1 \n"
+ "vld1.8 {d0[7]}, [%0] \n"
+
+ "vst1.64 {d0}, [%2] \n"
+
+ "4: \n"
+
+ : "+r"(src), // %0
+ "+r"(src_stride), // %1
+ "+r"(dst), // %2
+ "+r"(dst_stride), // %3
+ "+r"(width) // %4
+ : "r"(&kVTbl4x4Transpose) // %5
+ : "memory", "cc", "r9", "q0", "q1", "q2", "q3"
+ );
+}
+
+static const uvec8 kVTbl4x4TransposeDi =
+ { 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 };
+
+void TransposeUVWx8_NEON(const uint8* src, int src_stride,
+ uint8* dst_a, int dst_stride_a,
+ uint8* dst_b, int dst_stride_b,
+ int width) {
+ asm volatile (
+ // loops are on blocks of 8. loop will stop when
+ // counter gets to or below 0. starting the counter
+    // at w-8 allows for this
+ "sub %6, #8 \n"
+
+ // handle 8x8 blocks. this should be the majority of the plane
+ ".p2align 4 \n"
+ "1: \n"
+ "mov r9, %0 \n"
+
+ "vld2.8 {d0, d1}, [r9], %1 \n"
+ "vld2.8 {d2, d3}, [r9], %1 \n"
+ "vld2.8 {d4, d5}, [r9], %1 \n"
+ "vld2.8 {d6, d7}, [r9], %1 \n"
+ "vld2.8 {d16, d17}, [r9], %1 \n"
+ "vld2.8 {d18, d19}, [r9], %1 \n"
+ "vld2.8 {d20, d21}, [r9], %1 \n"
+ "vld2.8 {d22, d23}, [r9] \n"
+
+ "vtrn.8 q1, q0 \n"
+ "vtrn.8 q3, q2 \n"
+ "vtrn.8 q9, q8 \n"
+ "vtrn.8 q11, q10 \n"
+
+ "vtrn.16 q1, q3 \n"
+ "vtrn.16 q0, q2 \n"
+ "vtrn.16 q9, q11 \n"
+ "vtrn.16 q8, q10 \n"
+
+ "vtrn.32 q1, q9 \n"
+ "vtrn.32 q0, q8 \n"
+ "vtrn.32 q3, q11 \n"
+ "vtrn.32 q2, q10 \n"
+
+ "vrev16.8 q0, q0 \n"
+ "vrev16.8 q1, q1 \n"
+ "vrev16.8 q2, q2 \n"
+ "vrev16.8 q3, q3 \n"
+ "vrev16.8 q8, q8 \n"
+ "vrev16.8 q9, q9 \n"
+ "vrev16.8 q10, q10 \n"
+ "vrev16.8 q11, q11 \n"
+
+ "mov r9, %2 \n"
+
+ "vst1.8 {d2}, [r9], %3 \n"
+ "vst1.8 {d0}, [r9], %3 \n"
+ "vst1.8 {d6}, [r9], %3 \n"
+ "vst1.8 {d4}, [r9], %3 \n"
+ "vst1.8 {d18}, [r9], %3 \n"
+ "vst1.8 {d16}, [r9], %3 \n"
+ "vst1.8 {d22}, [r9], %3 \n"
+ "vst1.8 {d20}, [r9] \n"
+
+ "mov r9, %4 \n"
+
+ "vst1.8 {d3}, [r9], %5 \n"
+ "vst1.8 {d1}, [r9], %5 \n"
+ "vst1.8 {d7}, [r9], %5 \n"
+ "vst1.8 {d5}, [r9], %5 \n"
+ "vst1.8 {d19}, [r9], %5 \n"
+ "vst1.8 {d17}, [r9], %5 \n"
+ "vst1.8 {d23}, [r9], %5 \n"
+ "vst1.8 {d21}, [r9] \n"
+
+ "add %0, #8*2 \n" // src += 8*2
+ "add %2, %2, %3, lsl #3 \n" // dst_a += 8 * dst_stride_a
+ "add %4, %4, %5, lsl #3 \n" // dst_b += 8 * dst_stride_b
+ "subs %6, #8 \n" // w -= 8
+ "bge 1b \n"
+
+ // add 8 back to counter. if the result is 0 there are
+ // no residuals.
+ "adds %6, #8 \n"
+ "beq 4f \n"
+
+ // some residual, so between 1 and 7 lines left to transpose
+ "cmp %6, #2 \n"
+ "blt 3f \n"
+
+ "cmp %6, #4 \n"
+ "blt 2f \n"
+
+    // TODO(frkoenig): Clean this up
+ // 4x8 block
+ "mov r9, %0 \n"
+ "vld1.64 {d0}, [r9], %1 \n"
+ "vld1.64 {d1}, [r9], %1 \n"
+ "vld1.64 {d2}, [r9], %1 \n"
+ "vld1.64 {d3}, [r9], %1 \n"
+ "vld1.64 {d4}, [r9], %1 \n"
+ "vld1.64 {d5}, [r9], %1 \n"
+ "vld1.64 {d6}, [r9], %1 \n"
+ "vld1.64 {d7}, [r9] \n"
+
+ "vld1.8 {q15}, [%7] \n"
+
+ "vtrn.8 q0, q1 \n"
+ "vtrn.8 q2, q3 \n"
+
+ "vtbl.8 d16, {d0, d1}, d30 \n"
+ "vtbl.8 d17, {d0, d1}, d31 \n"
+ "vtbl.8 d18, {d2, d3}, d30 \n"
+ "vtbl.8 d19, {d2, d3}, d31 \n"
+ "vtbl.8 d20, {d4, d5}, d30 \n"
+ "vtbl.8 d21, {d4, d5}, d31 \n"
+ "vtbl.8 d22, {d6, d7}, d30 \n"
+ "vtbl.8 d23, {d6, d7}, d31 \n"
+
+ "mov r9, %2 \n"
+
+ "vst1.32 {d16[0]}, [r9], %3 \n"
+ "vst1.32 {d16[1]}, [r9], %3 \n"
+ "vst1.32 {d17[0]}, [r9], %3 \n"
+ "vst1.32 {d17[1]}, [r9], %3 \n"
+
+ "add r9, %2, #4 \n"
+ "vst1.32 {d20[0]}, [r9], %3 \n"
+ "vst1.32 {d20[1]}, [r9], %3 \n"
+ "vst1.32 {d21[0]}, [r9], %3 \n"
+ "vst1.32 {d21[1]}, [r9] \n"
+
+ "mov r9, %4 \n"
+
+ "vst1.32 {d18[0]}, [r9], %5 \n"
+ "vst1.32 {d18[1]}, [r9], %5 \n"
+ "vst1.32 {d19[0]}, [r9], %5 \n"
+ "vst1.32 {d19[1]}, [r9], %5 \n"
+
+ "add r9, %4, #4 \n"
+ "vst1.32 {d22[0]}, [r9], %5 \n"
+ "vst1.32 {d22[1]}, [r9], %5 \n"
+ "vst1.32 {d23[0]}, [r9], %5 \n"
+ "vst1.32 {d23[1]}, [r9] \n"
+
+ "add %0, #4*2 \n" // src += 4 * 2
+ "add %2, %2, %3, lsl #2 \n" // dst_a += 4 * dst_stride_a
+ "add %4, %4, %5, lsl #2 \n" // dst_b += 4 * dst_stride_b
+ "subs %6, #4 \n" // w -= 4
+ "beq 4f \n"
+
+ // some residual, check to see if it includes a 2x8 block,
+ // or less
+ "cmp %6, #2 \n"
+ "blt 3f \n"
+
+ // 2x8 block
+ "2: \n"
+ "mov r9, %0 \n"
+ "vld2.16 {d0[0], d2[0]}, [r9], %1 \n"
+ "vld2.16 {d1[0], d3[0]}, [r9], %1 \n"
+ "vld2.16 {d0[1], d2[1]}, [r9], %1 \n"
+ "vld2.16 {d1[1], d3[1]}, [r9], %1 \n"
+ "vld2.16 {d0[2], d2[2]}, [r9], %1 \n"
+ "vld2.16 {d1[2], d3[2]}, [r9], %1 \n"
+ "vld2.16 {d0[3], d2[3]}, [r9], %1 \n"
+ "vld2.16 {d1[3], d3[3]}, [r9] \n"
+
+ "vtrn.8 d0, d1 \n"
+ "vtrn.8 d2, d3 \n"
+
+ "mov r9, %2 \n"
+
+ "vst1.64 {d0}, [r9], %3 \n"
+ "vst1.64 {d2}, [r9] \n"
+
+ "mov r9, %4 \n"
+
+ "vst1.64 {d1}, [r9], %5 \n"
+ "vst1.64 {d3}, [r9] \n"
+
+ "add %0, #2*2 \n" // src += 2 * 2
+ "add %2, %2, %3, lsl #1 \n" // dst_a += 2 * dst_stride_a
+ "add %4, %4, %5, lsl #1 \n" // dst_b += 2 * dst_stride_b
+ "subs %6, #2 \n" // w -= 2
+ "beq 4f \n"
+
+ // 1x8 block
+ "3: \n"
+ "vld2.8 {d0[0], d1[0]}, [%0], %1 \n"
+ "vld2.8 {d0[1], d1[1]}, [%0], %1 \n"
+ "vld2.8 {d0[2], d1[2]}, [%0], %1 \n"
+ "vld2.8 {d0[3], d1[3]}, [%0], %1 \n"
+ "vld2.8 {d0[4], d1[4]}, [%0], %1 \n"
+ "vld2.8 {d0[5], d1[5]}, [%0], %1 \n"
+ "vld2.8 {d0[6], d1[6]}, [%0], %1 \n"
+ "vld2.8 {d0[7], d1[7]}, [%0] \n"
+
+ "vst1.64 {d0}, [%2] \n"
+ "vst1.64 {d1}, [%4] \n"
+
+ "4: \n"
+
+ : "+r"(src), // %0
+ "+r"(src_stride), // %1
+ "+r"(dst_a), // %2
+ "+r"(dst_stride_a), // %3
+ "+r"(dst_b), // %4
+ "+r"(dst_stride_b), // %5
+ "+r"(width) // %6
+ : "r"(&kVTbl4x4TransposeDi) // %7
+ : "memory", "cc", "r9",
+ "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
+ );
+}
+#endif
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/chromium/third_party/libyuv/source/row_any.cc b/chromium/third_party/libyuv/source/row_any.cc
new file mode 100644
index 00000000000..72100d90e9d
--- /dev/null
+++ b/chromium/third_party/libyuv/source/row_any.cc
@@ -0,0 +1,519 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// TODO(fbarchard): Consider 'any' functions handling any quantity of pixels.
+// TODO(fbarchard): Consider 'any' functions handling odd alignment.
+// YUV to RGB does multiple of 8 with SIMD and remainder with C.
+#define YANY(NAMEANY, I420TORGB_SIMD, I420TORGB_C, UV_SHIFT, BPP, MASK) \
+ void NAMEANY(const uint8* y_buf, \
+ const uint8* u_buf, \
+ const uint8* v_buf, \
+ uint8* rgb_buf, \
+ int width) { \
+ int n = width & ~MASK; \
+ I420TORGB_SIMD(y_buf, u_buf, v_buf, rgb_buf, n); \
+ I420TORGB_C(y_buf + n, \
+ u_buf + (n >> UV_SHIFT), \
+ v_buf + (n >> UV_SHIFT), \
+ rgb_buf + n * BPP, width & MASK); \
+ }
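+
+// Illustrative expansion of the macro above (a sketch only, kept under #if 0
+// so it does not affect the build; ExampleI422ToARGBRow_Any is a hypothetical
+// name). For MASK = 7, UV_SHIFT = 1 and BPP = 4 the wrapper hands the largest
+// multiple of 8 pixels to the SIMD row and the 0..7 leftover pixels to C:
+#if 0
+static void ExampleI422ToARGBRow_Any(const uint8* y_buf, const uint8* u_buf,
+                                     const uint8* v_buf, uint8* rgb_buf,
+                                     int width) {
+  int n = width & ~7;  // e.g. width 100 -> n = 96 pixels for SIMD.
+  I422ToARGBRow_NEON(y_buf, u_buf, v_buf, rgb_buf, n);
+  I422ToARGBRow_C(y_buf + n, u_buf + (n >> 1), v_buf + (n >> 1),
+                  rgb_buf + n * 4, width & 7);  // e.g. 100 & 7 = 4 pixels in C.
+}
+#endif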
+
+#ifdef HAS_I422TOARGBROW_SSSE3
+YANY(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_Unaligned_SSSE3, I444ToARGBRow_C,
+ 0, 4, 7)
+YANY(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_Unaligned_SSSE3, I422ToARGBRow_C,
+ 1, 4, 7)
+YANY(I411ToARGBRow_Any_SSSE3, I411ToARGBRow_Unaligned_SSSE3, I411ToARGBRow_C,
+ 2, 4, 7)
+YANY(I422ToBGRARow_Any_SSSE3, I422ToBGRARow_Unaligned_SSSE3, I422ToBGRARow_C,
+ 1, 4, 7)
+YANY(I422ToABGRRow_Any_SSSE3, I422ToABGRRow_Unaligned_SSSE3, I422ToABGRRow_C,
+ 1, 4, 7)
+YANY(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_Unaligned_SSSE3, I422ToRGBARow_C,
+ 1, 4, 7)
+// I422ToRGB565Row_SSSE3 is unaligned.
+YANY(I422ToARGB4444Row_Any_SSSE3, I422ToARGB4444Row_SSSE3, I422ToARGB4444Row_C,
+ 1, 2, 7)
+YANY(I422ToARGB1555Row_Any_SSSE3, I422ToARGB1555Row_SSSE3, I422ToARGB1555Row_C,
+ 1, 2, 7)
+YANY(I422ToRGB565Row_Any_SSSE3, I422ToRGB565Row_SSSE3, I422ToRGB565Row_C,
+ 1, 2, 7)
+// I422ToRGB24Row_SSSE3 is unaligned.
+YANY(I422ToRGB24Row_Any_SSSE3, I422ToRGB24Row_SSSE3, I422ToRGB24Row_C, 1, 3, 7)
+YANY(I422ToRAWRow_Any_SSSE3, I422ToRAWRow_SSSE3, I422ToRAWRow_C, 1, 3, 7)
+YANY(I422ToYUY2Row_Any_SSE2, I422ToYUY2Row_SSE2, I422ToYUY2Row_C, 1, 2, 15)
+YANY(I422ToUYVYRow_Any_SSE2, I422ToUYVYRow_SSE2, I422ToUYVYRow_C, 1, 2, 15)
+#endif // HAS_I422TOARGBROW_SSSE3
+#ifdef HAS_I422TOARGBROW_AVX2
+YANY(I422ToARGBRow_Any_AVX2, I422ToARGBRow_AVX2, I422ToARGBRow_C, 1, 4, 15)
+#endif // HAS_I422TOARGBROW_AVX2
+#ifdef HAS_I422TOARGBROW_NEON
+YANY(I444ToARGBRow_Any_NEON, I444ToARGBRow_NEON, I444ToARGBRow_C, 0, 4, 7)
+YANY(I422ToARGBRow_Any_NEON, I422ToARGBRow_NEON, I422ToARGBRow_C, 1, 4, 7)
+YANY(I411ToARGBRow_Any_NEON, I411ToARGBRow_NEON, I411ToARGBRow_C, 2, 4, 7)
+YANY(I422ToBGRARow_Any_NEON, I422ToBGRARow_NEON, I422ToBGRARow_C, 1, 4, 7)
+YANY(I422ToABGRRow_Any_NEON, I422ToABGRRow_NEON, I422ToABGRRow_C, 1, 4, 7)
+YANY(I422ToRGBARow_Any_NEON, I422ToRGBARow_NEON, I422ToRGBARow_C, 1, 4, 7)
+YANY(I422ToRGB24Row_Any_NEON, I422ToRGB24Row_NEON, I422ToRGB24Row_C, 1, 3, 7)
+YANY(I422ToRAWRow_Any_NEON, I422ToRAWRow_NEON, I422ToRAWRow_C, 1, 3, 7)
+YANY(I422ToARGB4444Row_Any_NEON, I422ToARGB4444Row_NEON, I422ToARGB4444Row_C,
+ 1, 2, 7)
+YANY(I422ToARGB1555Row_Any_NEON, I422ToARGB1555Row_NEON, I422ToARGB1555Row_C,
+ 1, 2, 7)
+YANY(I422ToRGB565Row_Any_NEON, I422ToRGB565Row_NEON, I422ToRGB565Row_C, 1, 2, 7)
+YANY(I422ToYUY2Row_Any_NEON, I422ToYUY2Row_NEON, I422ToYUY2Row_C, 1, 2, 15)
+YANY(I422ToUYVYRow_Any_NEON, I422ToUYVYRow_NEON, I422ToUYVYRow_C, 1, 2, 15)
+#endif // HAS_I422TOARGBROW_NEON
+#undef YANY
+
+// Wrappers to handle odd width
+#define NV2NY(NAMEANY, NV12TORGB_SIMD, NV12TORGB_C, UV_SHIFT, BPP) \
+ void NAMEANY(const uint8* y_buf, \
+ const uint8* uv_buf, \
+ uint8* rgb_buf, \
+ int width) { \
+ int n = width & ~7; \
+ NV12TORGB_SIMD(y_buf, uv_buf, rgb_buf, n); \
+ NV12TORGB_C(y_buf + n, \
+ uv_buf + (n >> UV_SHIFT), \
+ rgb_buf + n * BPP, width & 7); \
+ }
+
+#ifdef HAS_NV12TOARGBROW_SSSE3
+NV2NY(NV12ToARGBRow_Any_SSSE3, NV12ToARGBRow_Unaligned_SSSE3, NV12ToARGBRow_C,
+ 0, 4)
+NV2NY(NV21ToARGBRow_Any_SSSE3, NV21ToARGBRow_Unaligned_SSSE3, NV21ToARGBRow_C,
+ 0, 4)
+#endif // HAS_NV12TOARGBROW_SSSE3
+#ifdef HAS_NV12TOARGBROW_NEON
+NV2NY(NV12ToARGBRow_Any_NEON, NV12ToARGBRow_NEON, NV12ToARGBRow_C, 0, 4)
+NV2NY(NV21ToARGBRow_Any_NEON, NV21ToARGBRow_NEON, NV21ToARGBRow_C, 0, 4)
+#endif // HAS_NV12TOARGBROW_NEON
+#ifdef HAS_NV12TORGB565ROW_SSSE3
+NV2NY(NV12ToRGB565Row_Any_SSSE3, NV12ToRGB565Row_SSSE3, NV12ToRGB565Row_C,
+ 0, 2)
+NV2NY(NV21ToRGB565Row_Any_SSSE3, NV21ToRGB565Row_SSSE3, NV21ToRGB565Row_C,
+ 0, 2)
+#endif // HAS_NV12TORGB565ROW_SSSE3
+#ifdef HAS_NV12TORGB565ROW_NEON
+NV2NY(NV12ToRGB565Row_Any_NEON, NV12ToRGB565Row_NEON, NV12ToRGB565Row_C, 0, 2)
+NV2NY(NV21ToRGB565Row_Any_NEON, NV21ToRGB565Row_NEON, NV21ToRGB565Row_C, 0, 2)
+#endif // HAS_NV12TORGB565ROW_NEON
+#undef NV2NY
+
+#define RGBANY(NAMEANY, ARGBTORGB_SIMD, ARGBTORGB_C, MASK, SBPP, BPP) \
+ void NAMEANY(const uint8* src, \
+ uint8* dst, \
+ int width) { \
+ int n = width & ~MASK; \
+ ARGBTORGB_SIMD(src, dst, n); \
+ ARGBTORGB_C(src + n * SBPP, dst + n * BPP, width & MASK); \
+ }
+
+#if defined(HAS_ARGBTORGB24ROW_SSSE3)
+RGBANY(ARGBToRGB24Row_Any_SSSE3, ARGBToRGB24Row_SSSE3, ARGBToRGB24Row_C,
+ 15, 4, 3)
+RGBANY(ARGBToRAWRow_Any_SSSE3, ARGBToRAWRow_SSSE3, ARGBToRAWRow_C,
+ 15, 4, 3)
+RGBANY(ARGBToRGB565Row_Any_SSE2, ARGBToRGB565Row_SSE2, ARGBToRGB565Row_C,
+ 3, 4, 2)
+RGBANY(ARGBToARGB1555Row_Any_SSE2, ARGBToARGB1555Row_SSE2, ARGBToARGB1555Row_C,
+ 3, 4, 2)
+RGBANY(ARGBToARGB4444Row_Any_SSE2, ARGBToARGB4444Row_SSE2, ARGBToARGB4444Row_C,
+ 3, 4, 2)
+RGBANY(I400ToARGBRow_Any_SSE2, I400ToARGBRow_Unaligned_SSE2, I400ToARGBRow_C,
+ 7, 1, 4)
+RGBANY(YToARGBRow_Any_SSE2, YToARGBRow_SSE2, YToARGBRow_C,
+ 7, 1, 4)
+RGBANY(YUY2ToARGBRow_Any_SSSE3, YUY2ToARGBRow_Unaligned_SSSE3, YUY2ToARGBRow_C,
+ 15, 2, 4)
+RGBANY(UYVYToARGBRow_Any_SSSE3, UYVYToARGBRow_Unaligned_SSSE3, UYVYToARGBRow_C,
+ 15, 2, 4)
+// These require alignment on ARGB, so C is used for remainder.
+RGBANY(RGB24ToARGBRow_Any_SSSE3, RGB24ToARGBRow_SSSE3, RGB24ToARGBRow_C,
+ 15, 3, 4)
+RGBANY(RAWToARGBRow_Any_SSSE3, RAWToARGBRow_SSSE3, RAWToARGBRow_C,
+ 15, 3, 4)
+RGBANY(RGB565ToARGBRow_Any_SSE2, RGB565ToARGBRow_SSE2, RGB565ToARGBRow_C,
+ 7, 2, 4)
+RGBANY(ARGB1555ToARGBRow_Any_SSE2, ARGB1555ToARGBRow_SSE2, ARGB1555ToARGBRow_C,
+ 7, 2, 4)
+RGBANY(ARGB4444ToARGBRow_Any_SSE2, ARGB4444ToARGBRow_SSE2, ARGB4444ToARGBRow_C,
+ 7, 2, 4)
+#endif
+#if defined(HAS_ARGBTORGB24ROW_NEON)
+RGBANY(ARGBToRGB24Row_Any_NEON, ARGBToRGB24Row_NEON, ARGBToRGB24Row_C, 7, 4, 3)
+RGBANY(ARGBToRAWRow_Any_NEON, ARGBToRAWRow_NEON, ARGBToRAWRow_C, 7, 4, 3)
+RGBANY(ARGBToRGB565Row_Any_NEON, ARGBToRGB565Row_NEON, ARGBToRGB565Row_C,
+ 7, 4, 2)
+RGBANY(ARGBToARGB1555Row_Any_NEON, ARGBToARGB1555Row_NEON, ARGBToARGB1555Row_C,
+ 7, 4, 2)
+RGBANY(ARGBToARGB4444Row_Any_NEON, ARGBToARGB4444Row_NEON, ARGBToARGB4444Row_C,
+ 7, 4, 2)
+RGBANY(I400ToARGBRow_Any_NEON, I400ToARGBRow_NEON, I400ToARGBRow_C,
+ 7, 1, 4)
+RGBANY(YToARGBRow_Any_NEON, YToARGBRow_NEON, YToARGBRow_C,
+ 7, 1, 4)
+RGBANY(YUY2ToARGBRow_Any_NEON, YUY2ToARGBRow_NEON, YUY2ToARGBRow_C,
+ 7, 2, 4)
+RGBANY(UYVYToARGBRow_Any_NEON, UYVYToARGBRow_NEON, UYVYToARGBRow_C,
+ 7, 2, 4)
+#endif
+#undef RGBANY
+
+// ARGB to Bayer does multiple of 4 pixels, SSSE3 aligned src, unaligned dst.
+#define BAYERANY(NAMEANY, ARGBTORGB_SIMD, ARGBTORGB_C, MASK, SBPP, BPP) \
+ void NAMEANY(const uint8* src, \
+ uint8* dst, uint32 selector, \
+ int width) { \
+ int n = width & ~MASK; \
+ ARGBTORGB_SIMD(src, dst, selector, n); \
+ ARGBTORGB_C(src + n * SBPP, dst + n * BPP, selector, width & MASK); \
+ }
+
+#if defined(HAS_ARGBTOBAYERROW_SSSE3)
+BAYERANY(ARGBToBayerRow_Any_SSSE3, ARGBToBayerRow_SSSE3, ARGBToBayerRow_C,
+ 7, 4, 1)
+#endif
+#if defined(HAS_ARGBTOBAYERROW_NEON)
+BAYERANY(ARGBToBayerRow_Any_NEON, ARGBToBayerRow_NEON, ARGBToBayerRow_C,
+ 7, 4, 1)
+#endif
+#undef BAYERANY
+
+// RGB/YUV to Y does the bulk of the row with SIMD and re-runs SIMD on the
+// last NUM pixels instead of falling back to C.
+#define YANY(NAMEANY, ARGBTOY_SIMD, SBPP, BPP, NUM) \
+ void NAMEANY(const uint8* src_argb, uint8* dst_y, int width) { \
+ ARGBTOY_SIMD(src_argb, dst_y, width - NUM); \
+ ARGBTOY_SIMD(src_argb + (width - NUM) * SBPP, \
+ dst_y + (width - NUM) * BPP, NUM); \
+ }
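+
+// Illustrative expansion of the macro above (a sketch only, kept under #if 0;
+// ExampleARGBToYRow_Any is a hypothetical name). Instead of a C remainder,
+// the same SIMD row is run a second time anchored to the end of the row, so
+// the final NUM pixels are re-covered by SIMD. For width = 100 and NUM = 16
+// the first call is given pixels 0..83 and the second re-does pixels 84..99.
+// This only works when re-processing a pixel is harmless; see the Attenuate
+// and Shuffle notes further down.
+#if 0
+static void ExampleARGBToYRow_Any(const uint8* src_argb, uint8* dst_y,
+                                  int width) {
+  ARGBToYRow_Unaligned_SSSE3(src_argb, dst_y, width - 16);
+  ARGBToYRow_Unaligned_SSSE3(src_argb + (width - 16) * 4,
+                             dst_y + (width - 16), 16);
+}
+#endif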
+
+#ifdef HAS_ARGBTOYROW_AVX2
+YANY(ARGBToYRow_Any_AVX2, ARGBToYRow_AVX2, 4, 1, 32)
+YANY(ARGBToYJRow_Any_AVX2, ARGBToYJRow_AVX2, 4, 1, 32)
+YANY(YUY2ToYRow_Any_AVX2, YUY2ToYRow_AVX2, 2, 1, 32)
+YANY(UYVYToYRow_Any_AVX2, UYVYToYRow_AVX2, 2, 1, 32)
+#endif
+#ifdef HAS_ARGBTOYROW_SSSE3
+YANY(ARGBToYRow_Any_SSSE3, ARGBToYRow_Unaligned_SSSE3, 4, 1, 16)
+YANY(BGRAToYRow_Any_SSSE3, BGRAToYRow_Unaligned_SSSE3, 4, 1, 16)
+YANY(ABGRToYRow_Any_SSSE3, ABGRToYRow_Unaligned_SSSE3, 4, 1, 16)
+YANY(RGBAToYRow_Any_SSSE3, RGBAToYRow_Unaligned_SSSE3, 4, 1, 16)
+YANY(YUY2ToYRow_Any_SSE2, YUY2ToYRow_Unaligned_SSE2, 2, 1, 16)
+YANY(UYVYToYRow_Any_SSE2, UYVYToYRow_Unaligned_SSE2, 2, 1, 16)
+#endif
+#ifdef HAS_ARGBTOYJROW_SSSE3
+YANY(ARGBToYJRow_Any_SSSE3, ARGBToYJRow_Unaligned_SSSE3, 4, 1, 16)
+#endif
+#ifdef HAS_ARGBTOYROW_NEON
+YANY(ARGBToYRow_Any_NEON, ARGBToYRow_NEON, 4, 1, 8)
+YANY(ARGBToYJRow_Any_NEON, ARGBToYJRow_NEON, 4, 1, 8)
+YANY(BGRAToYRow_Any_NEON, BGRAToYRow_NEON, 4, 1, 8)
+YANY(ABGRToYRow_Any_NEON, ABGRToYRow_NEON, 4, 1, 8)
+YANY(RGBAToYRow_Any_NEON, RGBAToYRow_NEON, 4, 1, 8)
+YANY(RGB24ToYRow_Any_NEON, RGB24ToYRow_NEON, 3, 1, 8)
+YANY(RAWToYRow_Any_NEON, RAWToYRow_NEON, 3, 1, 8)
+YANY(RGB565ToYRow_Any_NEON, RGB565ToYRow_NEON, 2, 1, 8)
+YANY(ARGB1555ToYRow_Any_NEON, ARGB1555ToYRow_NEON, 2, 1, 8)
+YANY(ARGB4444ToYRow_Any_NEON, ARGB4444ToYRow_NEON, 2, 1, 8)
+YANY(YUY2ToYRow_Any_NEON, YUY2ToYRow_NEON, 2, 1, 16)
+YANY(UYVYToYRow_Any_NEON, UYVYToYRow_NEON, 2, 1, 16)
+YANY(RGB24ToARGBRow_Any_NEON, RGB24ToARGBRow_NEON, 3, 4, 8)
+YANY(RAWToARGBRow_Any_NEON, RAWToARGBRow_NEON, 3, 4, 8)
+YANY(RGB565ToARGBRow_Any_NEON, RGB565ToARGBRow_NEON, 2, 4, 8)
+YANY(ARGB1555ToARGBRow_Any_NEON, ARGB1555ToARGBRow_NEON, 2, 4, 8)
+YANY(ARGB4444ToARGBRow_Any_NEON, ARGB4444ToARGBRow_NEON, 2, 4, 8)
+#endif
+#undef YANY
+
+#define YANY(NAMEANY, ARGBTOY_SIMD, ARGBTOY_C, SBPP, BPP, MASK) \
+ void NAMEANY(const uint8* src_argb, uint8* dst_y, int width) { \
+ int n = width & ~MASK; \
+ ARGBTOY_SIMD(src_argb, dst_y, n); \
+ ARGBTOY_C(src_argb + n * SBPP, \
+ dst_y + n * BPP, width & MASK); \
+ }
+
+// Attenuate is destructive, so the last-NUM overlap method cannot be used.
+#ifdef HAS_ARGBATTENUATEROW_SSSE3
+YANY(ARGBAttenuateRow_Any_SSSE3, ARGBAttenuateRow_SSSE3, ARGBAttenuateRow_C,
+ 4, 4, 3)
+#endif
+#ifdef HAS_ARGBATTENUATEROW_SSE2
+YANY(ARGBAttenuateRow_Any_SSE2, ARGBAttenuateRow_SSE2, ARGBAttenuateRow_C,
+ 4, 4, 3)
+#endif
+#ifdef HAS_ARGBUNATTENUATEROW_SSE2
+YANY(ARGBUnattenuateRow_Any_SSE2, ARGBUnattenuateRow_SSE2, ARGBUnattenuateRow_C,
+ 4, 4, 3)
+#endif
+#ifdef HAS_ARGBATTENUATEROW_AVX2
+YANY(ARGBAttenuateRow_Any_AVX2, ARGBAttenuateRow_AVX2, ARGBAttenuateRow_C,
+ 4, 4, 7)
+#endif
+#ifdef HAS_ARGBUNATTENUATEROW_AVX2
+YANY(ARGBUnattenuateRow_Any_AVX2, ARGBUnattenuateRow_AVX2, ARGBUnattenuateRow_C,
+ 4, 4, 7)
+#endif
+#ifdef HAS_ARGBATTENUATEROW_NEON
+YANY(ARGBAttenuateRow_Any_NEON, ARGBAttenuateRow_NEON, ARGBAttenuateRow_C,
+ 4, 4, 7)
+#endif
+#undef YANY
+
+// RGB/YUV to UV does multiple of 16 with SIMD and remainder with C.
+#define UVANY(NAMEANY, ANYTOUV_SIMD, ANYTOUV_C, BPP, MASK) \
+ void NAMEANY(const uint8* src_argb, int src_stride_argb, \
+ uint8* dst_u, uint8* dst_v, int width) { \
+ int n = width & ~MASK; \
+ ANYTOUV_SIMD(src_argb, src_stride_argb, dst_u, dst_v, n); \
+ ANYTOUV_C(src_argb + n * BPP, src_stride_argb, \
+ dst_u + (n >> 1), \
+ dst_v + (n >> 1), \
+ width & MASK); \
+ }
+
+#ifdef HAS_ARGBTOYROW_AVX2
+UVANY(ARGBToUVRow_Any_AVX2, ARGBToUVRow_AVX2, ARGBToUVRow_C, 4, 31)
+UVANY(YUY2ToUVRow_Any_AVX2, YUY2ToUVRow_AVX2, YUY2ToUVRow_C, 2, 31)
+UVANY(UYVYToUVRow_Any_AVX2, UYVYToUVRow_AVX2, UYVYToUVRow_C, 2, 31)
+#endif
+#ifdef HAS_ARGBTOUVROW_SSSE3
+UVANY(ARGBToUVRow_Any_SSSE3, ARGBToUVRow_Unaligned_SSSE3, ARGBToUVRow_C, 4, 15)
+UVANY(ARGBToUVJRow_Any_SSSE3, ARGBToUVJRow_Unaligned_SSSE3, ARGBToUVJRow_C,
+ 4, 15)
+UVANY(BGRAToUVRow_Any_SSSE3, BGRAToUVRow_Unaligned_SSSE3, BGRAToUVRow_C, 4, 15)
+UVANY(ABGRToUVRow_Any_SSSE3, ABGRToUVRow_Unaligned_SSSE3, ABGRToUVRow_C, 4, 15)
+UVANY(RGBAToUVRow_Any_SSSE3, RGBAToUVRow_Unaligned_SSSE3, RGBAToUVRow_C, 4, 15)
+UVANY(YUY2ToUVRow_Any_SSE2, YUY2ToUVRow_Unaligned_SSE2, YUY2ToUVRow_C, 2, 15)
+UVANY(UYVYToUVRow_Any_SSE2, UYVYToUVRow_Unaligned_SSE2, UYVYToUVRow_C, 2, 15)
+#endif
+#ifdef HAS_ARGBTOUVROW_NEON
+UVANY(ARGBToUVRow_Any_NEON, ARGBToUVRow_NEON, ARGBToUVRow_C, 4, 15)
+UVANY(ARGBToUVJRow_Any_NEON, ARGBToUVJRow_NEON, ARGBToUVJRow_C, 4, 15)
+UVANY(BGRAToUVRow_Any_NEON, BGRAToUVRow_NEON, BGRAToUVRow_C, 4, 15)
+UVANY(ABGRToUVRow_Any_NEON, ABGRToUVRow_NEON, ABGRToUVRow_C, 4, 15)
+UVANY(RGBAToUVRow_Any_NEON, RGBAToUVRow_NEON, RGBAToUVRow_C, 4, 15)
+UVANY(RGB24ToUVRow_Any_NEON, RGB24ToUVRow_NEON, RGB24ToUVRow_C, 3, 15)
+UVANY(RAWToUVRow_Any_NEON, RAWToUVRow_NEON, RAWToUVRow_C, 3, 15)
+UVANY(RGB565ToUVRow_Any_NEON, RGB565ToUVRow_NEON, RGB565ToUVRow_C, 2, 15)
+UVANY(ARGB1555ToUVRow_Any_NEON, ARGB1555ToUVRow_NEON, ARGB1555ToUVRow_C, 2, 15)
+UVANY(ARGB4444ToUVRow_Any_NEON, ARGB4444ToUVRow_NEON, ARGB4444ToUVRow_C, 2, 15)
+UVANY(YUY2ToUVRow_Any_NEON, YUY2ToUVRow_NEON, YUY2ToUVRow_C, 2, 15)
+UVANY(UYVYToUVRow_Any_NEON, UYVYToUVRow_NEON, UYVYToUVRow_C, 2, 15)
+#endif
+#undef UVANY
+
+#define UV422ANY(NAMEANY, ANYTOUV_SIMD, ANYTOUV_C, BPP, MASK, SHIFT) \
+ void NAMEANY(const uint8* src_uv, \
+ uint8* dst_u, uint8* dst_v, int width) { \
+ int n = width & ~MASK; \
+ ANYTOUV_SIMD(src_uv, dst_u, dst_v, n); \
+ ANYTOUV_C(src_uv + n * BPP, \
+ dst_u + (n >> SHIFT), \
+ dst_v + (n >> SHIFT), \
+ width & MASK); \
+ }
+
+#ifdef HAS_ARGBTOUV444ROW_SSSE3
+UV422ANY(ARGBToUV444Row_Any_SSSE3, ARGBToUV444Row_Unaligned_SSSE3,
+ ARGBToUV444Row_C, 4, 15, 0)
+#endif
+#ifdef HAS_YUY2TOUV422ROW_AVX2
+UV422ANY(YUY2ToUV422Row_Any_AVX2, YUY2ToUV422Row_AVX2,
+ YUY2ToUV422Row_C, 2, 31, 1)
+UV422ANY(UYVYToUV422Row_Any_AVX2, UYVYToUV422Row_AVX2,
+ UYVYToUV422Row_C, 2, 31, 1)
+#endif
+#ifdef HAS_ARGBTOUVROW_SSSE3
+UV422ANY(ARGBToUV422Row_Any_SSSE3, ARGBToUV422Row_Unaligned_SSSE3,
+ ARGBToUV422Row_C, 4, 15, 1)
+UV422ANY(YUY2ToUV422Row_Any_SSE2, YUY2ToUV422Row_Unaligned_SSE2,
+ YUY2ToUV422Row_C, 2, 15, 1)
+UV422ANY(UYVYToUV422Row_Any_SSE2, UYVYToUV422Row_Unaligned_SSE2,
+ UYVYToUV422Row_C, 2, 15, 1)
+#endif
+#ifdef HAS_YUY2TOUV422ROW_NEON
+UV422ANY(ARGBToUV444Row_Any_NEON, ARGBToUV444Row_NEON,
+ ARGBToUV444Row_C, 4, 7, 0)
+UV422ANY(ARGBToUV422Row_Any_NEON, ARGBToUV422Row_NEON,
+ ARGBToUV422Row_C, 4, 15, 1)
+UV422ANY(ARGBToUV411Row_Any_NEON, ARGBToUV411Row_NEON,
+ ARGBToUV411Row_C, 4, 31, 2)
+UV422ANY(YUY2ToUV422Row_Any_NEON, YUY2ToUV422Row_NEON,
+ YUY2ToUV422Row_C, 2, 15, 1)
+UV422ANY(UYVYToUV422Row_Any_NEON, UYVYToUV422Row_NEON,
+ UYVYToUV422Row_C, 2, 15, 1)
+#endif
+#undef UV422ANY
+
+#define SPLITUVROWANY(NAMEANY, ANYTOUV_SIMD, ANYTOUV_C, MASK) \
+ void NAMEANY(const uint8* src_uv, \
+ uint8* dst_u, uint8* dst_v, int width) { \
+ int n = width & ~MASK; \
+ ANYTOUV_SIMD(src_uv, dst_u, dst_v, n); \
+ ANYTOUV_C(src_uv + n * 2, \
+ dst_u + n, \
+ dst_v + n, \
+ width & MASK); \
+ }
+
+#ifdef HAS_SPLITUVROW_SSE2
+SPLITUVROWANY(SplitUVRow_Any_SSE2, SplitUVRow_Unaligned_SSE2, SplitUVRow_C, 15)
+#endif
+#ifdef HAS_SPLITUVROW_AVX2
+SPLITUVROWANY(SplitUVRow_Any_AVX2, SplitUVRow_AVX2, SplitUVRow_C, 31)
+#endif
+#ifdef HAS_SPLITUVROW_NEON
+SPLITUVROWANY(SplitUVRow_Any_NEON, SplitUVRow_NEON, SplitUVRow_C, 15)
+#endif
+#ifdef HAS_SPLITUVROW_MIPS_DSPR2
+SPLITUVROWANY(SplitUVRow_Any_MIPS_DSPR2, SplitUVRow_Unaligned_MIPS_DSPR2,
+ SplitUVRow_C, 15)
+#endif
+#undef SPLITUVROWANY
+
+#define MERGEUVROW_ANY(NAMEANY, ANYTOUV_SIMD, ANYTOUV_C, MASK) \
+ void NAMEANY(const uint8* src_u, const uint8* src_v, \
+ uint8* dst_uv, int width) { \
+ int n = width & ~MASK; \
+ ANYTOUV_SIMD(src_u, src_v, dst_uv, n); \
+ ANYTOUV_C(src_u + n, \
+ src_v + n, \
+ dst_uv + n * 2, \
+ width & MASK); \
+ }
+
+#ifdef HAS_MERGEUVROW_SSE2
+MERGEUVROW_ANY(MergeUVRow_Any_SSE2, MergeUVRow_Unaligned_SSE2, MergeUVRow_C, 15)
+#endif
+#ifdef HAS_MERGEUVROW_AVX2
+MERGEUVROW_ANY(MergeUVRow_Any_AVX2, MergeUVRow_AVX2, MergeUVRow_C, 31)
+#endif
+#ifdef HAS_MERGEUVROW_NEON
+MERGEUVROW_ANY(MergeUVRow_Any_NEON, MergeUVRow_NEON, MergeUVRow_C, 15)
+#endif
+#undef MERGEUVROW_ANY
+
+#define MATHROW_ANY(NAMEANY, ARGBMATH_SIMD, ARGBMATH_C, MASK) \
+ void NAMEANY(const uint8* src_argb0, const uint8* src_argb1, \
+ uint8* dst_argb, int width) { \
+ int n = width & ~MASK; \
+ ARGBMATH_SIMD(src_argb0, src_argb1, dst_argb, n); \
+ ARGBMATH_C(src_argb0 + n * 4, \
+ src_argb1 + n * 4, \
+ dst_argb + n * 4, \
+ width & MASK); \
+ }
+
+#ifdef HAS_ARGBMULTIPLYROW_SSE2
+MATHROW_ANY(ARGBMultiplyRow_Any_SSE2, ARGBMultiplyRow_SSE2, ARGBMultiplyRow_C,
+ 3)
+#endif
+#ifdef HAS_ARGBADDROW_SSE2
+MATHROW_ANY(ARGBAddRow_Any_SSE2, ARGBAddRow_SSE2, ARGBAddRow_C, 3)
+#endif
+#ifdef HAS_ARGBSUBTRACTROW_SSE2
+MATHROW_ANY(ARGBSubtractRow_Any_SSE2, ARGBSubtractRow_SSE2, ARGBSubtractRow_C,
+ 3)
+#endif
+#ifdef HAS_ARGBMULTIPLYROW_AVX2
+MATHROW_ANY(ARGBMultiplyRow_Any_AVX2, ARGBMultiplyRow_AVX2, ARGBMultiplyRow_C,
+ 7)
+#endif
+#ifdef HAS_ARGBADDROW_AVX2
+MATHROW_ANY(ARGBAddRow_Any_AVX2, ARGBAddRow_AVX2, ARGBAddRow_C, 7)
+#endif
+#ifdef HAS_ARGBSUBTRACTROW_AVX2
+MATHROW_ANY(ARGBSubtractRow_Any_AVX2, ARGBSubtractRow_AVX2, ARGBSubtractRow_C,
+ 7)
+#endif
+#ifdef HAS_ARGBMULTIPLYROW_NEON
+MATHROW_ANY(ARGBMultiplyRow_Any_NEON, ARGBMultiplyRow_NEON, ARGBMultiplyRow_C,
+ 7)
+#endif
+#ifdef HAS_ARGBADDROW_NEON
+MATHROW_ANY(ARGBAddRow_Any_NEON, ARGBAddRow_NEON, ARGBAddRow_C, 7)
+#endif
+#ifdef HAS_ARGBSUBTRACTROW_NEON
+MATHROW_ANY(ARGBSubtractRow_Any_NEON, ARGBSubtractRow_NEON, ARGBSubtractRow_C,
+ 7)
+#endif
+#undef MATHROW_ANY
+
+// Shuffle may work in place, so the last-NUM overlap cannot be used.
+#define YANY(NAMEANY, ARGBTOY_SIMD, ARGBTOY_C, SBPP, BPP, MASK) \
+ void NAMEANY(const uint8* src_argb, uint8* dst_argb, \
+ const uint8* shuffler, int width) { \
+ int n = width & ~MASK; \
+ ARGBTOY_SIMD(src_argb, dst_argb, shuffler, n); \
+ ARGBTOY_C(src_argb + n * SBPP, \
+ dst_argb + n * BPP, shuffler, width & MASK); \
+ }
+
+#ifdef HAS_ARGBSHUFFLEROW_SSSE3
+YANY(ARGBShuffleRow_Any_SSSE3, ARGBShuffleRow_Unaligned_SSSE3,
+ ARGBShuffleRow_C, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBSHUFFLEROW_AVX2
+YANY(ARGBShuffleRow_Any_AVX2, ARGBShuffleRow_AVX2,
+ ARGBShuffleRow_C, 4, 4, 15)
+#endif
+#ifdef HAS_ARGBSHUFFLEROW_NEON
+YANY(ARGBShuffleRow_Any_NEON, ARGBShuffleRow_NEON,
+ ARGBShuffleRow_C, 4, 4, 3)
+#endif
+#undef YANY
+
+// Interpolate may work in place, so the last-NUM overlap cannot be used.
+#define NANY(NAMEANY, TERP_SIMD, TERP_C, SBPP, BPP, MASK) \
+ void NAMEANY(uint8* dst_ptr, const uint8* src_ptr, \
+ ptrdiff_t src_stride_ptr, int width, \
+ int source_y_fraction) { \
+ int n = width & ~MASK; \
+ TERP_SIMD(dst_ptr, src_ptr, src_stride_ptr, \
+ n, source_y_fraction); \
+ TERP_C(dst_ptr + n * BPP, \
+ src_ptr + n * SBPP, src_stride_ptr, \
+ width & MASK, source_y_fraction); \
+ }
+
+#ifdef HAS_INTERPOLATEROW_SSSE3
+NANY(InterpolateRow_Any_SSSE3, InterpolateRow_Unaligned_SSSE3,
+ InterpolateRow_C, 1, 1, 15)
+#endif
+#ifdef HAS_INTERPOLATEROW_SSE2
+NANY(InterpolateRow_Any_SSE2, InterpolateRow_Unaligned_SSE2,
+ InterpolateRow_C, 1, 1, 15)
+#endif
+#ifdef HAS_INTERPOLATEROW_NEON
+NANY(InterpolateRow_Any_NEON, InterpolateRow_NEON,
+ InterpolateRow_C, 1, 1, 15)
+#endif
+#ifdef HAS_INTERPOLATEROW_MIPS_DSPR2
+NANY(InterpolateRow_Any_MIPS_DSPR2, InterpolateRow_MIPS_DSPR2,
+ InterpolateRow_C, 1, 1, 3)
+#endif
+#undef NANY
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/chromium/third_party/libyuv/source/row_common.cc b/chromium/third_party/libyuv/source/row_common.cc
new file mode 100644
index 00000000000..badea440582
--- /dev/null
+++ b/chromium/third_party/libyuv/source/row_common.cc
@@ -0,0 +1,1995 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#include <string.h> // For memcpy and memset.
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// llvm x86 generates poor code for the ternary operator, so use branchless
+// min/max.
+
+#define USE_BRANCHLESS 1
+#if USE_BRANCHLESS
+static __inline int32 clamp0(int32 v) {
+ return ((-(v) >> 31) & (v));
+}
+
+static __inline int32 clamp255(int32 v) {
+ return (((255 - (v)) >> 31) | (v)) & 255;
+}
+
+static __inline uint32 Clamp(int32 val) {
+ int v = clamp0(val);
+ return static_cast<uint32>(clamp255(v));
+}
+
+static __inline uint32 Abs(int32 v) {
+ int m = v >> 31;
+ return (v + m) ^ m;
+}
+#else // USE_BRANCHLESS
+static __inline int32 clamp0(int32 v) {
+ return (v < 0) ? 0 : v;
+}
+
+static __inline int32 clamp255(int32 v) {
+ return (v > 255) ? 255 : v;
+}
+
+static __inline uint32 Clamp(int32 val) {
+ int v = clamp0(val);
+ return static_cast<uint32>(clamp255(v));
+}
+
+static __inline uint32 Abs(int32 v) {
+ return (v < 0) ? -v : v;
+}
+#endif // USE_BRANCHLESS
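+
+// Worked example of the branchless forms above (they rely on arithmetic right
+// shift of a negative int32, which these helpers assume throughout):
+//   clamp0(-5):    -(-5) >> 31 = 0,   0 & -5  = 0
+//   clamp0(7):     -(7)  >> 31 = -1,  -1 & 7  = 7
+//   clamp255(300): (255 - 300) >> 31 = -1, (-1 | 300) & 255 = 255
+//   clamp255(100): (255 - 100) >> 31 = 0,  (0 | 100) & 255  = 100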
+
+#ifdef LIBYUV_LITTLE_ENDIAN
+#define WRITEWORD(p, v) *reinterpret_cast<uint32*>(p) = v
+#else
+static inline void WRITEWORD(uint8* p, uint32 v) {
+ p[0] = (uint8)(v & 255);
+ p[1] = (uint8)((v >> 8) & 255);
+ p[2] = (uint8)((v >> 16) & 255);
+ p[3] = (uint8)((v >> 24) & 255);
+}
+#endif
+
+void RGB24ToARGBRow_C(const uint8* src_rgb24, uint8* dst_argb, int width) {
+ for (int x = 0; x < width; ++x) {
+ uint8 b = src_rgb24[0];
+ uint8 g = src_rgb24[1];
+ uint8 r = src_rgb24[2];
+ dst_argb[0] = b;
+ dst_argb[1] = g;
+ dst_argb[2] = r;
+ dst_argb[3] = 255u;
+ dst_argb += 4;
+ src_rgb24 += 3;
+ }
+}
+
+void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int width) {
+ for (int x = 0; x < width; ++x) {
+ uint8 r = src_raw[0];
+ uint8 g = src_raw[1];
+ uint8 b = src_raw[2];
+ dst_argb[0] = b;
+ dst_argb[1] = g;
+ dst_argb[2] = r;
+ dst_argb[3] = 255u;
+ dst_argb += 4;
+ src_raw += 3;
+ }
+}
+
+void RGB565ToARGBRow_C(const uint8* src_rgb565, uint8* dst_argb, int width) {
+ for (int x = 0; x < width; ++x) {
+ uint8 b = src_rgb565[0] & 0x1f;
+ uint8 g = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
+ uint8 r = src_rgb565[1] >> 3;
+ dst_argb[0] = (b << 3) | (b >> 2);
+ dst_argb[1] = (g << 2) | (g >> 4);
+ dst_argb[2] = (r << 3) | (r >> 2);
+ dst_argb[3] = 255u;
+ dst_argb += 4;
+ src_rgb565 += 2;
+ }
+}
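+
+// Worked example of the bit replication above: a 5-bit channel is widened to
+// 8 bits with (c << 3) | (c >> 2) and a 6-bit channel with (c << 2) | (c >> 4),
+// so both endpoints map exactly:
+//   5-bit: 0 -> 0, 31 -> (31 << 3) | (31 >> 2) = 248 | 7 = 255
+//   6-bit: 0 -> 0, 63 -> (63 << 2) | (63 >> 4) = 252 | 3 = 255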
+
+void ARGB1555ToARGBRow_C(const uint8* src_argb1555, uint8* dst_argb,
+ int width) {
+ for (int x = 0; x < width; ++x) {
+ uint8 b = src_argb1555[0] & 0x1f;
+ uint8 g = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
+ uint8 r = (src_argb1555[1] & 0x7c) >> 2;
+ uint8 a = src_argb1555[1] >> 7;
+ dst_argb[0] = (b << 3) | (b >> 2);
+ dst_argb[1] = (g << 3) | (g >> 2);
+ dst_argb[2] = (r << 3) | (r >> 2);
+ dst_argb[3] = -a;
+ dst_argb += 4;
+ src_argb1555 += 2;
+ }
+}
+
+void ARGB4444ToARGBRow_C(const uint8* src_argb4444, uint8* dst_argb,
+ int width) {
+ for (int x = 0; x < width; ++x) {
+ uint8 b = src_argb4444[0] & 0x0f;
+ uint8 g = src_argb4444[0] >> 4;
+ uint8 r = src_argb4444[1] & 0x0f;
+ uint8 a = src_argb4444[1] >> 4;
+ dst_argb[0] = (b << 4) | b;
+ dst_argb[1] = (g << 4) | g;
+ dst_argb[2] = (r << 4) | r;
+ dst_argb[3] = (a << 4) | a;
+ dst_argb += 4;
+ src_argb4444 += 2;
+ }
+}
+
+void ARGBToRGB24Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
+ for (int x = 0; x < width; ++x) {
+ uint8 b = src_argb[0];
+ uint8 g = src_argb[1];
+ uint8 r = src_argb[2];
+ dst_rgb[0] = b;
+ dst_rgb[1] = g;
+ dst_rgb[2] = r;
+ dst_rgb += 3;
+ src_argb += 4;
+ }
+}
+
+void ARGBToRAWRow_C(const uint8* src_argb, uint8* dst_rgb, int width) {
+ for (int x = 0; x < width; ++x) {
+ uint8 b = src_argb[0];
+ uint8 g = src_argb[1];
+ uint8 r = src_argb[2];
+ dst_rgb[0] = r;
+ dst_rgb[1] = g;
+ dst_rgb[2] = b;
+ dst_rgb += 3;
+ src_argb += 4;
+ }
+}
+
+void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
+ for (int x = 0; x < width - 1; x += 2) {
+ uint8 b0 = src_argb[0] >> 3;
+ uint8 g0 = src_argb[1] >> 2;
+ uint8 r0 = src_argb[2] >> 3;
+ uint8 b1 = src_argb[4] >> 3;
+ uint8 g1 = src_argb[5] >> 2;
+ uint8 r1 = src_argb[6] >> 3;
+ WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 11) |
+ (b1 << 16) | (g1 << 21) | (r1 << 27));
+ dst_rgb += 4;
+ src_argb += 8;
+ }
+ if (width & 1) {
+ uint8 b0 = src_argb[0] >> 3;
+ uint8 g0 = src_argb[1] >> 2;
+ uint8 r0 = src_argb[2] >> 3;
+ *reinterpret_cast<uint16*>(dst_rgb) = b0 | (g0 << 5) | (r0 << 11);
+ }
+}
+
+void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
+ for (int x = 0; x < width - 1; x += 2) {
+ uint8 b0 = src_argb[0] >> 3;
+ uint8 g0 = src_argb[1] >> 3;
+ uint8 r0 = src_argb[2] >> 3;
+ uint8 a0 = src_argb[3] >> 7;
+ uint8 b1 = src_argb[4] >> 3;
+ uint8 g1 = src_argb[5] >> 3;
+ uint8 r1 = src_argb[6] >> 3;
+ uint8 a1 = src_argb[7] >> 7;
+ *reinterpret_cast<uint32*>(dst_rgb) =
+ b0 | (g0 << 5) | (r0 << 10) | (a0 << 15) |
+ (b1 << 16) | (g1 << 21) | (r1 << 26) | (a1 << 31);
+ dst_rgb += 4;
+ src_argb += 8;
+ }
+ if (width & 1) {
+ uint8 b0 = src_argb[0] >> 3;
+ uint8 g0 = src_argb[1] >> 3;
+ uint8 r0 = src_argb[2] >> 3;
+ uint8 a0 = src_argb[3] >> 7;
+ *reinterpret_cast<uint16*>(dst_rgb) =
+ b0 | (g0 << 5) | (r0 << 10) | (a0 << 15);
+ }
+}
+
+void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
+ for (int x = 0; x < width - 1; x += 2) {
+ uint8 b0 = src_argb[0] >> 4;
+ uint8 g0 = src_argb[1] >> 4;
+ uint8 r0 = src_argb[2] >> 4;
+ uint8 a0 = src_argb[3] >> 4;
+ uint8 b1 = src_argb[4] >> 4;
+ uint8 g1 = src_argb[5] >> 4;
+ uint8 r1 = src_argb[6] >> 4;
+ uint8 a1 = src_argb[7] >> 4;
+ *reinterpret_cast<uint32*>(dst_rgb) =
+ b0 | (g0 << 4) | (r0 << 8) | (a0 << 12) |
+ (b1 << 16) | (g1 << 20) | (r1 << 24) | (a1 << 28);
+ dst_rgb += 4;
+ src_argb += 8;
+ }
+ if (width & 1) {
+ uint8 b0 = src_argb[0] >> 4;
+ uint8 g0 = src_argb[1] >> 4;
+ uint8 r0 = src_argb[2] >> 4;
+ uint8 a0 = src_argb[3] >> 4;
+ *reinterpret_cast<uint16*>(dst_rgb) =
+ b0 | (g0 << 4) | (r0 << 8) | (a0 << 12);
+ }
+}
+
+static __inline int RGBToY(uint8 r, uint8 g, uint8 b) {
+ return (66 * r + 129 * g + 25 * b + 0x1080) >> 8;
+}
+
+static __inline int RGBToU(uint8 r, uint8 g, uint8 b) {
+ return (112 * b - 74 * g - 38 * r + 0x8080) >> 8;
+}
+static __inline int RGBToV(uint8 r, uint8 g, uint8 b) {
+ return (112 * r - 94 * g - 18 * b + 0x8080) >> 8;
+}
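+
+// Worked example of the fixed-point luma above: the coefficients sum to
+// 66 + 129 + 25 = 220 and the bias 0x1080 is 16 * 256 + 128 (for rounding),
+// so black and white land on the BT.601 studio range:
+//   RGBToY(0, 0, 0)       = 0x1080 >> 8               = 16
+//   RGBToY(255, 255, 255) = (220 * 255 + 0x1080) >> 8 = 235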
+
+#define MAKEROWY(NAME, R, G, B, BPP) \
+void NAME ## ToYRow_C(const uint8* src_argb0, uint8* dst_y, int width) { \
+ for (int x = 0; x < width; ++x) { \
+ dst_y[0] = RGBToY(src_argb0[R], src_argb0[G], src_argb0[B]); \
+ src_argb0 += BPP; \
+ dst_y += 1; \
+ } \
+} \
+void NAME ## ToUVRow_C(const uint8* src_rgb0, int src_stride_rgb, \
+ uint8* dst_u, uint8* dst_v, int width) { \
+ const uint8* src_rgb1 = src_rgb0 + src_stride_rgb; \
+ for (int x = 0; x < width - 1; x += 2) { \
+ uint8 ab = (src_rgb0[B] + src_rgb0[B + BPP] + \
+ src_rgb1[B] + src_rgb1[B + BPP]) >> 2; \
+ uint8 ag = (src_rgb0[G] + src_rgb0[G + BPP] + \
+ src_rgb1[G] + src_rgb1[G + BPP]) >> 2; \
+ uint8 ar = (src_rgb0[R] + src_rgb0[R + BPP] + \
+ src_rgb1[R] + src_rgb1[R + BPP]) >> 2; \
+ dst_u[0] = RGBToU(ar, ag, ab); \
+ dst_v[0] = RGBToV(ar, ag, ab); \
+ src_rgb0 += BPP * 2; \
+ src_rgb1 += BPP * 2; \
+ dst_u += 1; \
+ dst_v += 1; \
+ } \
+ if (width & 1) { \
+ uint8 ab = (src_rgb0[B] + src_rgb1[B]) >> 1; \
+ uint8 ag = (src_rgb0[G] + src_rgb1[G]) >> 1; \
+ uint8 ar = (src_rgb0[R] + src_rgb1[R]) >> 1; \
+ dst_u[0] = RGBToU(ar, ag, ab); \
+ dst_v[0] = RGBToV(ar, ag, ab); \
+ } \
+}
+
+MAKEROWY(ARGB, 2, 1, 0, 4)
+MAKEROWY(BGRA, 1, 2, 3, 4)
+MAKEROWY(ABGR, 0, 1, 2, 4)
+MAKEROWY(RGBA, 3, 2, 1, 4)
+MAKEROWY(RGB24, 2, 1, 0, 3)
+MAKEROWY(RAW, 0, 1, 2, 3)
+#undef MAKEROWY
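+
+// Illustrative note: MAKEROWY(ARGB, 2, 1, 0, 4) above generates ARGBToYRow_C
+// and ARGBToUVRow_C with R, G and B read from byte offsets 2, 1 and 0 of each
+// 4-byte pixel. The UV variant averages a 2x2 block (two columns from each of
+// two rows) with a truncating >> 2; the JPEG variants below use the rounding
+// AVGB instead.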
+
+// JPEG uses a variation on BT.601-1 full range
+// y = 0.29900 * r + 0.58700 * g + 0.11400 * b
+// u = -0.16874 * r - 0.33126 * g + 0.50000 * b + center
+// v = 0.50000 * r - 0.41869 * g - 0.08131 * b + center
+// BT.601 MPEG range uses:
+// b 0.1016 * 255 = 25.908 = 25
+// g 0.5078 * 255 = 129.489 = 129
+// r 0.2578 * 255 = 65.739 = 66
+// JPEG 8 bit Y (not used):
+// b 0.11400 * 256 = 29.184 = 29
+// g 0.58700 * 256 = 150.272 = 150
+// r 0.29900 * 256 = 76.544 = 77
+// JPEG 7 bit Y:
+// b 0.11400 * 128 = 14.592 = 15
+// g 0.58700 * 128 = 75.136 = 75
+// r 0.29900 * 128 = 38.272 = 38
+// JPEG 8 bit U:
+// b 0.50000 * 255 = 127.5 = 127
+// g -0.33126 * 255 = -84.4713 = -84
+// r -0.16874 * 255 = -43.0287 = -43
+// JPEG 8 bit V:
+// b -0.08131 * 255 = -20.73405 = -20
+// g -0.41869 * 255 = -106.76595 = -107
+// r 0.50000 * 255 = 127.5 = 127
+
+static __inline int RGBToYJ(uint8 r, uint8 g, uint8 b) {
+ return (38 * r + 75 * g + 15 * b + 64) >> 7;
+}
+
+static __inline int RGBToUJ(uint8 r, uint8 g, uint8 b) {
+ return (127 * b - 84 * g - 43 * r + 0x8080) >> 8;
+}
+static __inline int RGBToVJ(uint8 r, uint8 g, uint8 b) {
+ return (127 * r - 107 * g - 20 * b + 0x8080) >> 8;
+}
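+
+// Worked example of the full-range (JPEG) luma above: the coefficients sum to
+// 38 + 75 + 15 = 128, so with the rounding bias of 64 the full 0..255 range
+// is preserved:
+//   RGBToYJ(0, 0, 0)       = 64 >> 7               = 0
+//   RGBToYJ(255, 255, 255) = (128 * 255 + 64) >> 7 = 255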
+
+#define AVGB(a, b) (((a) + (b) + 1) >> 1)
+
+#define MAKEROWYJ(NAME, R, G, B, BPP) \
+void NAME ## ToYJRow_C(const uint8* src_argb0, uint8* dst_y, int width) { \
+ for (int x = 0; x < width; ++x) { \
+ dst_y[0] = RGBToYJ(src_argb0[R], src_argb0[G], src_argb0[B]); \
+ src_argb0 += BPP; \
+ dst_y += 1; \
+ } \
+} \
+void NAME ## ToUVJRow_C(const uint8* src_rgb0, int src_stride_rgb, \
+ uint8* dst_u, uint8* dst_v, int width) { \
+ const uint8* src_rgb1 = src_rgb0 + src_stride_rgb; \
+ for (int x = 0; x < width - 1; x += 2) { \
+ uint8 ab = AVGB(AVGB(src_rgb0[B], src_rgb1[B]), \
+ AVGB(src_rgb0[B + BPP], src_rgb1[B + BPP])); \
+ uint8 ag = AVGB(AVGB(src_rgb0[G], src_rgb1[G]), \
+ AVGB(src_rgb0[G + BPP], src_rgb1[G + BPP])); \
+ uint8 ar = AVGB(AVGB(src_rgb0[R], src_rgb1[R]), \
+ AVGB(src_rgb0[R + BPP], src_rgb1[R + BPP])); \
+ dst_u[0] = RGBToUJ(ar, ag, ab); \
+ dst_v[0] = RGBToVJ(ar, ag, ab); \
+ src_rgb0 += BPP * 2; \
+ src_rgb1 += BPP * 2; \
+ dst_u += 1; \
+ dst_v += 1; \
+ } \
+ if (width & 1) { \
+ uint8 ab = AVGB(src_rgb0[B], src_rgb1[B]); \
+ uint8 ag = AVGB(src_rgb0[G], src_rgb1[G]); \
+ uint8 ar = AVGB(src_rgb0[R], src_rgb1[R]); \
+ dst_u[0] = RGBToUJ(ar, ag, ab); \
+ dst_v[0] = RGBToVJ(ar, ag, ab); \
+ } \
+}
+
+MAKEROWYJ(ARGB, 2, 1, 0, 4)
+#undef MAKEROWYJ
+
+void RGB565ToYRow_C(const uint8* src_rgb565, uint8* dst_y, int width) {
+ for (int x = 0; x < width; ++x) {
+ uint8 b = src_rgb565[0] & 0x1f;
+ uint8 g = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
+ uint8 r = src_rgb565[1] >> 3;
+ b = (b << 3) | (b >> 2);
+ g = (g << 2) | (g >> 4);
+ r = (r << 3) | (r >> 2);
+ dst_y[0] = RGBToY(r, g, b);
+ src_rgb565 += 2;
+ dst_y += 1;
+ }
+}
+
+void ARGB1555ToYRow_C(const uint8* src_argb1555, uint8* dst_y, int width) {
+ for (int x = 0; x < width; ++x) {
+ uint8 b = src_argb1555[0] & 0x1f;
+ uint8 g = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
+ uint8 r = (src_argb1555[1] & 0x7c) >> 2;
+ b = (b << 3) | (b >> 2);
+ g = (g << 3) | (g >> 2);
+ r = (r << 3) | (r >> 2);
+ dst_y[0] = RGBToY(r, g, b);
+ src_argb1555 += 2;
+ dst_y += 1;
+ }
+}
+
+void ARGB4444ToYRow_C(const uint8* src_argb4444, uint8* dst_y, int width) {
+ for (int x = 0; x < width; ++x) {
+ uint8 b = src_argb4444[0] & 0x0f;
+ uint8 g = src_argb4444[0] >> 4;
+ uint8 r = src_argb4444[1] & 0x0f;
+ b = (b << 4) | b;
+ g = (g << 4) | g;
+ r = (r << 4) | r;
+ dst_y[0] = RGBToY(r, g, b);
+ src_argb4444 += 2;
+ dst_y += 1;
+ }
+}
+
+void RGB565ToUVRow_C(const uint8* src_rgb565, int src_stride_rgb565,
+ uint8* dst_u, uint8* dst_v, int width) {
+ const uint8* next_rgb565 = src_rgb565 + src_stride_rgb565;
+ for (int x = 0; x < width - 1; x += 2) {
+ uint8 b0 = src_rgb565[0] & 0x1f;
+ uint8 g0 = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
+ uint8 r0 = src_rgb565[1] >> 3;
+ uint8 b1 = src_rgb565[2] & 0x1f;
+ uint8 g1 = (src_rgb565[2] >> 5) | ((src_rgb565[3] & 0x07) << 3);
+ uint8 r1 = src_rgb565[3] >> 3;
+ uint8 b2 = next_rgb565[0] & 0x1f;
+ uint8 g2 = (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3);
+ uint8 r2 = next_rgb565[1] >> 3;
+ uint8 b3 = next_rgb565[2] & 0x1f;
+ uint8 g3 = (next_rgb565[2] >> 5) | ((next_rgb565[3] & 0x07) << 3);
+ uint8 r3 = next_rgb565[3] >> 3;
+ uint8 b = (b0 + b1 + b2 + b3); // 565 * 4 = 787.
+ uint8 g = (g0 + g1 + g2 + g3);
+ uint8 r = (r0 + r1 + r2 + r3);
+ b = (b << 1) | (b >> 6); // 787 -> 888.
+ r = (r << 1) | (r >> 6);
+ dst_u[0] = RGBToU(r, g, b);
+ dst_v[0] = RGBToV(r, g, b);
+ src_rgb565 += 4;
+ next_rgb565 += 4;
+ dst_u += 1;
+ dst_v += 1;
+ }
+ if (width & 1) {
+ uint8 b0 = src_rgb565[0] & 0x1f;
+ uint8 g0 = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
+ uint8 r0 = src_rgb565[1] >> 3;
+ uint8 b2 = next_rgb565[0] & 0x1f;
+ uint8 g2 = (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3);
+ uint8 r2 = next_rgb565[1] >> 3;
+ uint8 b = (b0 + b2); // 565 * 2 = 676.
+ uint8 g = (g0 + g2);
+ uint8 r = (r0 + r2);
+ b = (b << 2) | (b >> 4); // 676 -> 888
+ g = (g << 1) | (g >> 6);
+ r = (r << 2) | (r >> 4);
+ dst_u[0] = RGBToU(r, g, b);
+ dst_v[0] = RGBToV(r, g, b);
+ }
+}
+
+void ARGB1555ToUVRow_C(const uint8* src_argb1555, int src_stride_argb1555,
+ uint8* dst_u, uint8* dst_v, int width) {
+ const uint8* next_argb1555 = src_argb1555 + src_stride_argb1555;
+ for (int x = 0; x < width - 1; x += 2) {
+ uint8 b0 = src_argb1555[0] & 0x1f;
+ uint8 g0 = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
+ uint8 r0 = (src_argb1555[1] & 0x7c) >> 2;
+ uint8 b1 = src_argb1555[2] & 0x1f;
+ uint8 g1 = (src_argb1555[2] >> 5) | ((src_argb1555[3] & 0x03) << 3);
+ uint8 r1 = (src_argb1555[3] & 0x7c) >> 2;
+ uint8 b2 = next_argb1555[0] & 0x1f;
+ uint8 g2 = (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3);
+ uint8 r2 = (next_argb1555[1] & 0x7c) >> 2;
+ uint8 b3 = next_argb1555[2] & 0x1f;
+ uint8 g3 = (next_argb1555[2] >> 5) | ((next_argb1555[3] & 0x03) << 3);
+ uint8 r3 = (next_argb1555[3] & 0x7c) >> 2;
+ uint8 b = (b0 + b1 + b2 + b3); // 555 * 4 = 777.
+ uint8 g = (g0 + g1 + g2 + g3);
+ uint8 r = (r0 + r1 + r2 + r3);
+ b = (b << 1) | (b >> 6); // 777 -> 888.
+ g = (g << 1) | (g >> 6);
+ r = (r << 1) | (r >> 6);
+ dst_u[0] = RGBToU(r, g, b);
+ dst_v[0] = RGBToV(r, g, b);
+ src_argb1555 += 4;
+ next_argb1555 += 4;
+ dst_u += 1;
+ dst_v += 1;
+ }
+ if (width & 1) {
+ uint8 b0 = src_argb1555[0] & 0x1f;
+ uint8 g0 = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
+ uint8 r0 = (src_argb1555[1] & 0x7c) >> 2;
+ uint8 b2 = next_argb1555[0] & 0x1f;
+ uint8 g2 = (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3);
+    uint8 r2 = (next_argb1555[1] & 0x7c) >> 2;  // 5 bits of R, as above.
+ uint8 b = (b0 + b2); // 555 * 2 = 666.
+ uint8 g = (g0 + g2);
+ uint8 r = (r0 + r2);
+ b = (b << 2) | (b >> 4); // 666 -> 888.
+ g = (g << 2) | (g >> 4);
+ r = (r << 2) | (r >> 4);
+ dst_u[0] = RGBToU(r, g, b);
+ dst_v[0] = RGBToV(r, g, b);
+ }
+}
+
+void ARGB4444ToUVRow_C(const uint8* src_argb4444, int src_stride_argb4444,
+ uint8* dst_u, uint8* dst_v, int width) {
+ const uint8* next_argb4444 = src_argb4444 + src_stride_argb4444;
+ for (int x = 0; x < width - 1; x += 2) {
+ uint8 b0 = src_argb4444[0] & 0x0f;
+ uint8 g0 = src_argb4444[0] >> 4;
+ uint8 r0 = src_argb4444[1] & 0x0f;
+ uint8 b1 = src_argb4444[2] & 0x0f;
+ uint8 g1 = src_argb4444[2] >> 4;
+ uint8 r1 = src_argb4444[3] & 0x0f;
+ uint8 b2 = next_argb4444[0] & 0x0f;
+ uint8 g2 = next_argb4444[0] >> 4;
+ uint8 r2 = next_argb4444[1] & 0x0f;
+ uint8 b3 = next_argb4444[2] & 0x0f;
+ uint8 g3 = next_argb4444[2] >> 4;
+ uint8 r3 = next_argb4444[3] & 0x0f;
+ uint8 b = (b0 + b1 + b2 + b3); // 444 * 4 = 666.
+ uint8 g = (g0 + g1 + g2 + g3);
+ uint8 r = (r0 + r1 + r2 + r3);
+ b = (b << 2) | (b >> 4); // 666 -> 888.
+ g = (g << 2) | (g >> 4);
+ r = (r << 2) | (r >> 4);
+ dst_u[0] = RGBToU(r, g, b);
+ dst_v[0] = RGBToV(r, g, b);
+ src_argb4444 += 4;
+ next_argb4444 += 4;
+ dst_u += 1;
+ dst_v += 1;
+ }
+ if (width & 1) {
+ uint8 b0 = src_argb4444[0] & 0x0f;
+ uint8 g0 = src_argb4444[0] >> 4;
+ uint8 r0 = src_argb4444[1] & 0x0f;
+ uint8 b2 = next_argb4444[0] & 0x0f;
+ uint8 g2 = next_argb4444[0] >> 4;
+ uint8 r2 = next_argb4444[1] & 0x0f;
+ uint8 b = (b0 + b2); // 444 * 2 = 555.
+ uint8 g = (g0 + g2);
+ uint8 r = (r0 + r2);
+ b = (b << 3) | (b >> 2); // 555 -> 888.
+ g = (g << 3) | (g >> 2);
+ r = (r << 3) | (r >> 2);
+ dst_u[0] = RGBToU(r, g, b);
+ dst_v[0] = RGBToV(r, g, b);
+ }
+}
+
+void ARGBToUV444Row_C(const uint8* src_argb,
+ uint8* dst_u, uint8* dst_v, int width) {
+ for (int x = 0; x < width; ++x) {
+ uint8 ab = src_argb[0];
+ uint8 ag = src_argb[1];
+ uint8 ar = src_argb[2];
+ dst_u[0] = RGBToU(ar, ag, ab);
+ dst_v[0] = RGBToV(ar, ag, ab);
+ src_argb += 4;
+ dst_u += 1;
+ dst_v += 1;
+ }
+}
+
+void ARGBToUV422Row_C(const uint8* src_argb,
+ uint8* dst_u, uint8* dst_v, int width) {
+ for (int x = 0; x < width - 1; x += 2) {
+ uint8 ab = (src_argb[0] + src_argb[4]) >> 1;
+ uint8 ag = (src_argb[1] + src_argb[5]) >> 1;
+ uint8 ar = (src_argb[2] + src_argb[6]) >> 1;
+ dst_u[0] = RGBToU(ar, ag, ab);
+ dst_v[0] = RGBToV(ar, ag, ab);
+ src_argb += 8;
+ dst_u += 1;
+ dst_v += 1;
+ }
+ if (width & 1) {
+ uint8 ab = src_argb[0];
+ uint8 ag = src_argb[1];
+ uint8 ar = src_argb[2];
+ dst_u[0] = RGBToU(ar, ag, ab);
+ dst_v[0] = RGBToV(ar, ag, ab);
+ }
+}
+
+void ARGBToUV411Row_C(const uint8* src_argb,
+ uint8* dst_u, uint8* dst_v, int width) {
+ for (int x = 0; x < width - 3; x += 4) {
+ uint8 ab = (src_argb[0] + src_argb[4] + src_argb[8] + src_argb[12]) >> 2;
+ uint8 ag = (src_argb[1] + src_argb[5] + src_argb[9] + src_argb[13]) >> 2;
+ uint8 ar = (src_argb[2] + src_argb[6] + src_argb[10] + src_argb[14]) >> 2;
+ dst_u[0] = RGBToU(ar, ag, ab);
+ dst_v[0] = RGBToV(ar, ag, ab);
+ src_argb += 16;
+ dst_u += 1;
+ dst_v += 1;
+ }
+ if ((width & 3) == 3) {
+ uint8 ab = (src_argb[0] + src_argb[4] + src_argb[8]) / 3;
+ uint8 ag = (src_argb[1] + src_argb[5] + src_argb[9]) / 3;
+ uint8 ar = (src_argb[2] + src_argb[6] + src_argb[10]) / 3;
+ dst_u[0] = RGBToU(ar, ag, ab);
+ dst_v[0] = RGBToV(ar, ag, ab);
+ } else if ((width & 3) == 2) {
+ uint8 ab = (src_argb[0] + src_argb[4]) >> 1;
+ uint8 ag = (src_argb[1] + src_argb[5]) >> 1;
+ uint8 ar = (src_argb[2] + src_argb[6]) >> 1;
+ dst_u[0] = RGBToU(ar, ag, ab);
+ dst_v[0] = RGBToV(ar, ag, ab);
+ } else if ((width & 3) == 1) {
+ uint8 ab = src_argb[0];
+ uint8 ag = src_argb[1];
+ uint8 ar = src_argb[2];
+ dst_u[0] = RGBToU(ar, ag, ab);
+ dst_v[0] = RGBToV(ar, ag, ab);
+ }
+}
+
+void ARGBGrayRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
+ for (int x = 0; x < width; ++x) {
+ uint8 y = RGBToYJ(src_argb[2], src_argb[1], src_argb[0]);
+ dst_argb[2] = dst_argb[1] = dst_argb[0] = y;
+ dst_argb[3] = src_argb[3];
+ dst_argb += 4;
+ src_argb += 4;
+ }
+}
+
+// Convert a row of image to Sepia tone.
+void ARGBSepiaRow_C(uint8* dst_argb, int width) {
+ for (int x = 0; x < width; ++x) {
+ int b = dst_argb[0];
+ int g = dst_argb[1];
+ int r = dst_argb[2];
+ int sb = (b * 17 + g * 68 + r * 35) >> 7;
+ int sg = (b * 22 + g * 88 + r * 45) >> 7;
+ int sr = (b * 24 + g * 98 + r * 50) >> 7;
+    // b does not overflow: 17 + 68 + 35 = 120 <= 128, so sb fits in 8 bits;
+    // sg and sr can exceed 255 and are clamped. a is preserved from original.
+ dst_argb[0] = sb;
+ dst_argb[1] = clamp255(sg);
+ dst_argb[2] = clamp255(sr);
+ dst_argb += 4;
+ }
+}
+
+// Apply color matrix to a row of image. Matrix is signed.
+void ARGBColorMatrixRow_C(uint8* dst_argb, const int8* matrix_argb, int width) {
+ for (int x = 0; x < width; ++x) {
+ int b = dst_argb[0];
+ int g = dst_argb[1];
+ int r = dst_argb[2];
+ int a = dst_argb[3];
+ int sb = (b * matrix_argb[0] + g * matrix_argb[1] +
+ r * matrix_argb[2] + a * matrix_argb[3]) >> 7;
+ int sg = (b * matrix_argb[4] + g * matrix_argb[5] +
+ r * matrix_argb[6] + a * matrix_argb[7]) >> 7;
+ int sr = (b * matrix_argb[8] + g * matrix_argb[9] +
+ r * matrix_argb[10] + a * matrix_argb[11]) >> 7;
+ dst_argb[0] = Clamp(sb);
+ dst_argb[1] = Clamp(sg);
+ dst_argb[2] = Clamp(sr);
+ dst_argb += 4;
+ }
+}
+
+// Apply color table to a row of image.
+void ARGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width) {
+ for (int x = 0; x < width; ++x) {
+ int b = dst_argb[0];
+ int g = dst_argb[1];
+ int r = dst_argb[2];
+ int a = dst_argb[3];
+ dst_argb[0] = table_argb[b * 4 + 0];
+ dst_argb[1] = table_argb[g * 4 + 1];
+ dst_argb[2] = table_argb[r * 4 + 2];
+ dst_argb[3] = table_argb[a * 4 + 3];
+ dst_argb += 4;
+ }
+}
+
+void ARGBQuantizeRow_C(uint8* dst_argb, int scale, int interval_size,
+ int interval_offset, int width) {
+ for (int x = 0; x < width; ++x) {
+ int b = dst_argb[0];
+ int g = dst_argb[1];
+ int r = dst_argb[2];
+ dst_argb[0] = (b * scale >> 16) * interval_size + interval_offset;
+ dst_argb[1] = (g * scale >> 16) * interval_size + interval_offset;
+ dst_argb[2] = (r * scale >> 16) * interval_size + interval_offset;
+ dst_argb += 4;
+ }
+}
+
+#define REPEAT8(v) (v) | ((v) << 8)
+#define SHADE(f, v) v * f >> 24
+
+void ARGBShadeRow_C(const uint8* src_argb, uint8* dst_argb, int width,
+ uint32 value) {
+ const uint32 b_scale = REPEAT8(value & 0xff);
+ const uint32 g_scale = REPEAT8((value >> 8) & 0xff);
+ const uint32 r_scale = REPEAT8((value >> 16) & 0xff);
+ const uint32 a_scale = REPEAT8(value >> 24);
+
+ for (int i = 0; i < width; ++i) {
+ const uint32 b = REPEAT8(src_argb[0]);
+ const uint32 g = REPEAT8(src_argb[1]);
+ const uint32 r = REPEAT8(src_argb[2]);
+ const uint32 a = REPEAT8(src_argb[3]);
+ dst_argb[0] = SHADE(b, b_scale);
+ dst_argb[1] = SHADE(g, g_scale);
+ dst_argb[2] = SHADE(r, r_scale);
+ dst_argb[3] = SHADE(a, a_scale);
+ src_argb += 4;
+ dst_argb += 4;
+ }
+}
+#undef REPEAT8
+#undef SHADE
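+
+// Worked example of the fixed-point shade above: REPEAT8 widens an 8-bit
+// value v to v * 257, so shading pixel value 200 by shade component 0x80 gives
+//   (REPEAT8(200) * REPEAT8(0x80)) >> 24 = (51400 * 32896) >> 24 = 100
+// which is roughly 200 * 128 / 255.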
+
+#define REPEAT8(v) (v) | ((v) << 8)
+#define SHADE(f, v) v * f >> 16
+
+void ARGBMultiplyRow_C(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ for (int i = 0; i < width; ++i) {
+ const uint32 b = REPEAT8(src_argb0[0]);
+ const uint32 g = REPEAT8(src_argb0[1]);
+ const uint32 r = REPEAT8(src_argb0[2]);
+ const uint32 a = REPEAT8(src_argb0[3]);
+ const uint32 b_scale = src_argb1[0];
+ const uint32 g_scale = src_argb1[1];
+ const uint32 r_scale = src_argb1[2];
+ const uint32 a_scale = src_argb1[3];
+ dst_argb[0] = SHADE(b, b_scale);
+ dst_argb[1] = SHADE(g, g_scale);
+ dst_argb[2] = SHADE(r, r_scale);
+ dst_argb[3] = SHADE(a, a_scale);
+ src_argb0 += 4;
+ src_argb1 += 4;
+ dst_argb += 4;
+ }
+}
+#undef REPEAT8
+#undef SHADE
+
+#define SHADE(f, v) clamp255(v + f)
+
+void ARGBAddRow_C(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ for (int i = 0; i < width; ++i) {
+ const int b = src_argb0[0];
+ const int g = src_argb0[1];
+ const int r = src_argb0[2];
+ const int a = src_argb0[3];
+ const int b_add = src_argb1[0];
+ const int g_add = src_argb1[1];
+ const int r_add = src_argb1[2];
+ const int a_add = src_argb1[3];
+ dst_argb[0] = SHADE(b, b_add);
+ dst_argb[1] = SHADE(g, g_add);
+ dst_argb[2] = SHADE(r, r_add);
+ dst_argb[3] = SHADE(a, a_add);
+ src_argb0 += 4;
+ src_argb1 += 4;
+ dst_argb += 4;
+ }
+}
+#undef SHADE
+
+#define SHADE(f, v) clamp0(f - v)
+
+void ARGBSubtractRow_C(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ for (int i = 0; i < width; ++i) {
+ const int b = src_argb0[0];
+ const int g = src_argb0[1];
+ const int r = src_argb0[2];
+ const int a = src_argb0[3];
+ const int b_sub = src_argb1[0];
+ const int g_sub = src_argb1[1];
+ const int r_sub = src_argb1[2];
+ const int a_sub = src_argb1[3];
+ dst_argb[0] = SHADE(b, b_sub);
+ dst_argb[1] = SHADE(g, g_sub);
+ dst_argb[2] = SHADE(r, r_sub);
+ dst_argb[3] = SHADE(a, a_sub);
+ src_argb0 += 4;
+ src_argb1 += 4;
+ dst_argb += 4;
+ }
+}
+#undef SHADE
+
+// Sobel functions which mimic the SSSE3 versions.
+void SobelXRow_C(const uint8* src_y0, const uint8* src_y1, const uint8* src_y2,
+ uint8* dst_sobelx, int width) {
+ for (int i = 0; i < width; ++i) {
+ int a = src_y0[i];
+ int b = src_y1[i];
+ int c = src_y2[i];
+ int a_sub = src_y0[i + 2];
+ int b_sub = src_y1[i + 2];
+ int c_sub = src_y2[i + 2];
+ int a_diff = a - a_sub;
+ int b_diff = b - b_sub;
+ int c_diff = c - c_sub;
+ int sobel = Abs(a_diff + b_diff * 2 + c_diff);
+ dst_sobelx[i] = static_cast<uint8>(clamp255(sobel));
+ }
+}
+
+void SobelYRow_C(const uint8* src_y0, const uint8* src_y1,
+ uint8* dst_sobely, int width) {
+ for (int i = 0; i < width; ++i) {
+ int a = src_y0[i + 0];
+ int b = src_y0[i + 1];
+ int c = src_y0[i + 2];
+ int a_sub = src_y1[i + 0];
+ int b_sub = src_y1[i + 1];
+ int c_sub = src_y1[i + 2];
+ int a_diff = a - a_sub;
+ int b_diff = b - b_sub;
+ int c_diff = c - c_sub;
+ int sobel = Abs(a_diff + b_diff * 2 + c_diff);
+ dst_sobely[i] = static_cast<uint8>(clamp255(sobel));
+ }
+}
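+
+// For reference, SobelXRow_C computes per output pixel
+//   |(y0[i] - y0[i+2]) + 2 * (y1[i] - y1[i+2]) + (y2[i] - y2[i+2])|,
+// the classic Sobel X kernel {1,0,-1; 2,0,-2; 1,0,-1} sampled over its three
+// input rows, and SobelYRow_C computes the [1 2 1] weighted difference of its
+// two input rows; both clamp the magnitude to 255.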
+
+void SobelRow_C(const uint8* src_sobelx, const uint8* src_sobely,
+ uint8* dst_argb, int width) {
+ for (int i = 0; i < width; ++i) {
+ int r = src_sobelx[i];
+ int b = src_sobely[i];
+ int s = clamp255(r + b);
+ dst_argb[0] = static_cast<uint8>(s);
+ dst_argb[1] = static_cast<uint8>(s);
+ dst_argb[2] = static_cast<uint8>(s);
+ dst_argb[3] = static_cast<uint8>(255u);
+ dst_argb += 4;
+ }
+}
+
+void SobelXYRow_C(const uint8* src_sobelx, const uint8* src_sobely,
+ uint8* dst_argb, int width) {
+ for (int i = 0; i < width; ++i) {
+ int r = src_sobelx[i];
+ int b = src_sobely[i];
+ int g = clamp255(r + b);
+ dst_argb[0] = static_cast<uint8>(b);
+ dst_argb[1] = static_cast<uint8>(g);
+ dst_argb[2] = static_cast<uint8>(r);
+ dst_argb[3] = static_cast<uint8>(255u);
+ dst_argb += 4;
+ }
+}
+
+void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width) {
+  // Replicate each Y value into B, G and R.
+ for (int x = 0; x < width; ++x) {
+ uint8 y = src_y[0];
+ dst_argb[2] = dst_argb[1] = dst_argb[0] = y;
+ dst_argb[3] = 255u;
+ dst_argb += 4;
+ ++src_y;
+ }
+}
+
+// C reference code that mimics the YUV assembly.
+
+#define YG 74 /* static_cast<int8>(1.164 * 64 + 0.5) */
+
+#define UB 127 /* 2.018 * 64 = 129, clamped to the int8 max of 127 */
+#define UG -25 /* static_cast<int8>(-0.391 * 64 - 0.5) */
+#define UR 0
+
+#define VB 0
+#define VG -52 /* static_cast<int8>(-0.813 * 64 - 0.5) */
+#define VR 102 /* static_cast<int8>(1.596 * 64 + 0.5) */
+
+// Bias
+#define BB UB * 128 + VB * 128
+#define BG UG * 128 + VG * 128
+#define BR UR * 128 + VR * 128
+
+static __inline void YuvPixel(uint8 y, uint8 u, uint8 v,
+ uint8* b, uint8* g, uint8* r) {
+ int32 y1 = (static_cast<int32>(y) - 16) * YG;
+ *b = Clamp(static_cast<int32>((u * UB + v * VB) - (BB) + y1) >> 6);
+ *g = Clamp(static_cast<int32>((u * UG + v * VG) - (BG) + y1) >> 6);
+ *r = Clamp(static_cast<int32>((u * UR + v * VR) - (BR) + y1) >> 6);
+}
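+
+// With the constants above, YuvPixel reduces to (per channel, before Clamp):
+//   B = ((Y - 16) * 74 + 127 * (U - 128)) >> 6
+//   G = ((Y - 16) * 74 -  25 * (U - 128) - 52 * (V - 128)) >> 6
+//   R = ((Y - 16) * 74 + 102 * (V - 128)) >> 6
+// i.e. BT.601 limited-range YUV to RGB in 6-bit fixed point. For example:
+//   YuvPixel(16, 128, 128, &b, &g, &r);   // black: b == g == r == 0
+//   YuvPixel(235, 128, 128, &b, &g, &r);  // white: b == g == r == 253
+// White lands on 253 rather than 255 because 74/64 slightly underestimates
+// the exact 1.164 luma gain.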
+
+#if !defined(LIBYUV_DISABLE_NEON) && \
+ (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
+// C code that mimics the NEON assembly.
+// TODO(fbarchard): Remove subsampling from Neon.
+void I444ToARGBRow_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* rgb_buf,
+ int width) {
+ for (int x = 0; x < width - 1; x += 2) {
+ uint8 u = (src_u[0] + src_u[1] + 1) >> 1;
+ uint8 v = (src_v[0] + src_v[1] + 1) >> 1;
+ YuvPixel(src_y[0], u, v, rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+ rgb_buf[3] = 255;
+ YuvPixel(src_y[1], u, v, rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
+ rgb_buf[7] = 255;
+ src_y += 2;
+ src_u += 2;
+ src_v += 2;
+ rgb_buf += 8; // Advance 2 pixels.
+ }
+ if (width & 1) {
+ YuvPixel(src_y[0], src_u[0], src_v[0],
+ rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+ }
+}
+#else
+void I444ToARGBRow_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* rgb_buf,
+ int width) {
+ for (int x = 0; x < width; ++x) {
+ YuvPixel(src_y[0], src_u[0], src_v[0],
+ rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+ rgb_buf[3] = 255;
+ src_y += 1;
+ src_u += 1;
+ src_v += 1;
+ rgb_buf += 4; // Advance 1 pixel.
+ }
+}
+#endif
+// Also used for 420
+void I422ToARGBRow_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* rgb_buf,
+ int width) {
+ for (int x = 0; x < width - 1; x += 2) {
+ YuvPixel(src_y[0], src_u[0], src_v[0],
+ rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+ rgb_buf[3] = 255;
+ YuvPixel(src_y[1], src_u[0], src_v[0],
+ rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
+ rgb_buf[7] = 255;
+ src_y += 2;
+ src_u += 1;
+ src_v += 1;
+ rgb_buf += 8; // Advance 2 pixels.
+ }
+ if (width & 1) {
+ YuvPixel(src_y[0], src_u[0], src_v[0],
+ rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+ rgb_buf[3] = 255;
+ }
+}
+
+void I422ToRGB24Row_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* rgb_buf,
+ int width) {
+ for (int x = 0; x < width - 1; x += 2) {
+ YuvPixel(src_y[0], src_u[0], src_v[0],
+ rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+ YuvPixel(src_y[1], src_u[0], src_v[0],
+ rgb_buf + 3, rgb_buf + 4, rgb_buf + 5);
+ src_y += 2;
+ src_u += 1;
+ src_v += 1;
+ rgb_buf += 6; // Advance 2 pixels.
+ }
+ if (width & 1) {
+ YuvPixel(src_y[0], src_u[0], src_v[0],
+ rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+ }
+}
+
+void I422ToRAWRow_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* rgb_buf,
+ int width) {
+ for (int x = 0; x < width - 1; x += 2) {
+ YuvPixel(src_y[0], src_u[0], src_v[0],
+ rgb_buf + 2, rgb_buf + 1, rgb_buf + 0);
+ YuvPixel(src_y[1], src_u[0], src_v[0],
+ rgb_buf + 5, rgb_buf + 4, rgb_buf + 3);
+ src_y += 2;
+ src_u += 1;
+ src_v += 1;
+ rgb_buf += 6; // Advance 2 pixels.
+ }
+ if (width & 1) {
+ YuvPixel(src_y[0], src_u[0], src_v[0],
+ rgb_buf + 2, rgb_buf + 1, rgb_buf + 0);
+ }
+}
+
+void I422ToARGB4444Row_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb4444,
+ int width) {
+ uint8 b0;
+ uint8 g0;
+ uint8 r0;
+ uint8 b1;
+ uint8 g1;
+ uint8 r1;
+ for (int x = 0; x < width - 1; x += 2) {
+ YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0);
+ YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1);
+ b0 = b0 >> 4;
+ g0 = g0 >> 4;
+ r0 = r0 >> 4;
+ b1 = b1 >> 4;
+ g1 = g1 >> 4;
+ r1 = r1 >> 4;
+ *reinterpret_cast<uint32*>(dst_argb4444) = b0 | (g0 << 4) | (r0 << 8) |
+ (b1 << 16) | (g1 << 20) | (r1 << 24) | 0xf000f000;
+ src_y += 2;
+ src_u += 1;
+ src_v += 1;
+ dst_argb4444 += 4; // Advance 2 pixels.
+ }
+ if (width & 1) {
+ YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0);
+ b0 = b0 >> 4;
+ g0 = g0 >> 4;
+ r0 = r0 >> 4;
+ *reinterpret_cast<uint16*>(dst_argb4444) = b0 | (g0 << 4) | (r0 << 8) |
+ 0xf000;
+ }
+}
+
+void I422ToARGB1555Row_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb1555,
+ int width) {
+ uint8 b0;
+ uint8 g0;
+ uint8 r0;
+ uint8 b1;
+ uint8 g1;
+ uint8 r1;
+ for (int x = 0; x < width - 1; x += 2) {
+ YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0);
+ YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1);
+ b0 = b0 >> 3;
+ g0 = g0 >> 3;
+ r0 = r0 >> 3;
+ b1 = b1 >> 3;
+ g1 = g1 >> 3;
+ r1 = r1 >> 3;
+ *reinterpret_cast<uint32*>(dst_argb1555) = b0 | (g0 << 5) | (r0 << 10) |
+ (b1 << 16) | (g1 << 21) | (r1 << 26) | 0x80008000;
+ src_y += 2;
+ src_u += 1;
+ src_v += 1;
+ dst_argb1555 += 4; // Advance 2 pixels.
+ }
+ if (width & 1) {
+ YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0);
+ b0 = b0 >> 3;
+ g0 = g0 >> 3;
+ r0 = r0 >> 3;
+ *reinterpret_cast<uint16*>(dst_argb1555) = b0 | (g0 << 5) | (r0 << 10) |
+ 0x8000;
+ }
+}
+
+void I422ToRGB565Row_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgb565,
+ int width) {
+ uint8 b0;
+ uint8 g0;
+ uint8 r0;
+ uint8 b1;
+ uint8 g1;
+ uint8 r1;
+ for (int x = 0; x < width - 1; x += 2) {
+ YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0);
+ YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1);
+ b0 = b0 >> 3;
+ g0 = g0 >> 2;
+ r0 = r0 >> 3;
+ b1 = b1 >> 3;
+ g1 = g1 >> 2;
+ r1 = r1 >> 3;
+ *reinterpret_cast<uint32*>(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11) |
+ (b1 << 16) | (g1 << 21) | (r1 << 27);
+ src_y += 2;
+ src_u += 1;
+ src_v += 1;
+ dst_rgb565 += 4; // Advance 2 pixels.
+ }
+ if (width & 1) {
+ YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0);
+ b0 = b0 >> 3;
+ g0 = g0 >> 2;
+ r0 = r0 >> 3;
+ *reinterpret_cast<uint16*>(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11);
+ }
+}
+
+void I411ToARGBRow_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* rgb_buf,
+ int width) {
+ for (int x = 0; x < width - 3; x += 4) {
+ YuvPixel(src_y[0], src_u[0], src_v[0],
+ rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+ rgb_buf[3] = 255;
+ YuvPixel(src_y[1], src_u[0], src_v[0],
+ rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
+ rgb_buf[7] = 255;
+ YuvPixel(src_y[2], src_u[0], src_v[0],
+ rgb_buf + 8, rgb_buf + 9, rgb_buf + 10);
+ rgb_buf[11] = 255;
+ YuvPixel(src_y[3], src_u[0], src_v[0],
+ rgb_buf + 12, rgb_buf + 13, rgb_buf + 14);
+ rgb_buf[15] = 255;
+ src_y += 4;
+ src_u += 1;
+ src_v += 1;
+ rgb_buf += 16; // Advance 4 pixels.
+ }
+ if (width & 2) {
+ YuvPixel(src_y[0], src_u[0], src_v[0],
+ rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+ rgb_buf[3] = 255;
+ YuvPixel(src_y[1], src_u[0], src_v[0],
+ rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
+ rgb_buf[7] = 255;
+ src_y += 2;
+ rgb_buf += 8; // Advance 2 pixels.
+ }
+ if (width & 1) {
+ YuvPixel(src_y[0], src_u[0], src_v[0],
+ rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+ rgb_buf[3] = 255;
+ }
+}
+
+void NV12ToARGBRow_C(const uint8* src_y,
+ const uint8* usrc_v,
+ uint8* rgb_buf,
+ int width) {
+ for (int x = 0; x < width - 1; x += 2) {
+ YuvPixel(src_y[0], usrc_v[0], usrc_v[1],
+ rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+ rgb_buf[3] = 255;
+ YuvPixel(src_y[1], usrc_v[0], usrc_v[1],
+ rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
+ rgb_buf[7] = 255;
+ src_y += 2;
+ usrc_v += 2;
+ rgb_buf += 8; // Advance 2 pixels.
+ }
+ if (width & 1) {
+ YuvPixel(src_y[0], usrc_v[0], usrc_v[1],
+ rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+ rgb_buf[3] = 255;
+ }
+}
+
+void NV21ToARGBRow_C(const uint8* src_y,
+ const uint8* src_vu,
+ uint8* rgb_buf,
+ int width) {
+ for (int x = 0; x < width - 1; x += 2) {
+ YuvPixel(src_y[0], src_vu[1], src_vu[0],
+ rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+ rgb_buf[3] = 255;
+
+ YuvPixel(src_y[1], src_vu[1], src_vu[0],
+ rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
+ rgb_buf[7] = 255;
+
+ src_y += 2;
+ src_vu += 2;
+ rgb_buf += 8; // Advance 2 pixels.
+ }
+ if (width & 1) {
+ YuvPixel(src_y[0], src_vu[1], src_vu[0],
+ rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+ rgb_buf[3] = 255;
+ }
+}
+
+void NV12ToRGB565Row_C(const uint8* src_y,
+ const uint8* usrc_v,
+ uint8* dst_rgb565,
+ int width) {
+ uint8 b0;
+ uint8 g0;
+ uint8 r0;
+ uint8 b1;
+ uint8 g1;
+ uint8 r1;
+ for (int x = 0; x < width - 1; x += 2) {
+ YuvPixel(src_y[0], usrc_v[0], usrc_v[1], &b0, &g0, &r0);
+ YuvPixel(src_y[1], usrc_v[0], usrc_v[1], &b1, &g1, &r1);
+ b0 = b0 >> 3;
+ g0 = g0 >> 2;
+ r0 = r0 >> 3;
+ b1 = b1 >> 3;
+ g1 = g1 >> 2;
+ r1 = r1 >> 3;
+ *reinterpret_cast<uint32*>(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11) |
+ (b1 << 16) | (g1 << 21) | (r1 << 27);
+ src_y += 2;
+ usrc_v += 2;
+ dst_rgb565 += 4; // Advance 2 pixels.
+ }
+ if (width & 1) {
+ YuvPixel(src_y[0], usrc_v[0], usrc_v[1], &b0, &g0, &r0);
+ b0 = b0 >> 3;
+ g0 = g0 >> 2;
+ r0 = r0 >> 3;
+ *reinterpret_cast<uint16*>(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11);
+ }
+}
+
+void NV21ToRGB565Row_C(const uint8* src_y,
+ const uint8* vsrc_u,
+ uint8* dst_rgb565,
+ int width) {
+ uint8 b0;
+ uint8 g0;
+ uint8 r0;
+ uint8 b1;
+ uint8 g1;
+ uint8 r1;
+ for (int x = 0; x < width - 1; x += 2) {
+ YuvPixel(src_y[0], vsrc_u[1], vsrc_u[0], &b0, &g0, &r0);
+ YuvPixel(src_y[1], vsrc_u[1], vsrc_u[0], &b1, &g1, &r1);
+ b0 = b0 >> 3;
+ g0 = g0 >> 2;
+ r0 = r0 >> 3;
+ b1 = b1 >> 3;
+ g1 = g1 >> 2;
+ r1 = r1 >> 3;
+ *reinterpret_cast<uint32*>(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11) |
+ (b1 << 16) | (g1 << 21) | (r1 << 27);
+ src_y += 2;
+ vsrc_u += 2;
+ dst_rgb565 += 4; // Advance 2 pixels.
+ }
+ if (width & 1) {
+ YuvPixel(src_y[0], vsrc_u[1], vsrc_u[0], &b0, &g0, &r0);
+ b0 = b0 >> 3;
+ g0 = g0 >> 2;
+ r0 = r0 >> 3;
+ *reinterpret_cast<uint16*>(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11);
+ }
+}
+
+void YUY2ToARGBRow_C(const uint8* src_yuy2,
+ uint8* rgb_buf,
+ int width) {
+ for (int x = 0; x < width - 1; x += 2) {
+ YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3],
+ rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+ rgb_buf[3] = 255;
+ YuvPixel(src_yuy2[2], src_yuy2[1], src_yuy2[3],
+ rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
+ rgb_buf[7] = 255;
+ src_yuy2 += 4;
+ rgb_buf += 8; // Advance 2 pixels.
+ }
+ if (width & 1) {
+ YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3],
+ rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+ rgb_buf[3] = 255;
+ }
+}
+
+void UYVYToARGBRow_C(const uint8* src_uyvy,
+ uint8* rgb_buf,
+ int width) {
+ for (int x = 0; x < width - 1; x += 2) {
+ YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2],
+ rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+ rgb_buf[3] = 255;
+ YuvPixel(src_uyvy[3], src_uyvy[0], src_uyvy[2],
+ rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
+ rgb_buf[7] = 255;
+ src_uyvy += 4;
+ rgb_buf += 8; // Advance 2 pixels.
+ }
+ if (width & 1) {
+ YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2],
+ rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+ rgb_buf[3] = 255;
+ }
+}
+
+void I422ToBGRARow_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* rgb_buf,
+ int width) {
+ for (int x = 0; x < width - 1; x += 2) {
+ YuvPixel(src_y[0], src_u[0], src_v[0],
+ rgb_buf + 3, rgb_buf + 2, rgb_buf + 1);
+ rgb_buf[0] = 255;
+ YuvPixel(src_y[1], src_u[0], src_v[0],
+ rgb_buf + 7, rgb_buf + 6, rgb_buf + 5);
+ rgb_buf[4] = 255;
+ src_y += 2;
+ src_u += 1;
+ src_v += 1;
+ rgb_buf += 8; // Advance 2 pixels.
+ }
+ if (width & 1) {
+ YuvPixel(src_y[0], src_u[0], src_v[0],
+ rgb_buf + 3, rgb_buf + 2, rgb_buf + 1);
+ rgb_buf[0] = 255;
+ }
+}
+
+void I422ToABGRRow_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* rgb_buf,
+ int width) {
+ for (int x = 0; x < width - 1; x += 2) {
+ YuvPixel(src_y[0], src_u[0], src_v[0],
+ rgb_buf + 2, rgb_buf + 1, rgb_buf + 0);
+ rgb_buf[3] = 255;
+ YuvPixel(src_y[1], src_u[0], src_v[0],
+ rgb_buf + 6, rgb_buf + 5, rgb_buf + 4);
+ rgb_buf[7] = 255;
+ src_y += 2;
+ src_u += 1;
+ src_v += 1;
+ rgb_buf += 8; // Advance 2 pixels.
+ }
+ if (width & 1) {
+ YuvPixel(src_y[0], src_u[0], src_v[0],
+ rgb_buf + 2, rgb_buf + 1, rgb_buf + 0);
+ rgb_buf[3] = 255;
+ }
+}
+
+void I422ToRGBARow_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* rgb_buf,
+ int width) {
+ for (int x = 0; x < width - 1; x += 2) {
+ YuvPixel(src_y[0], src_u[0], src_v[0],
+ rgb_buf + 1, rgb_buf + 2, rgb_buf + 3);
+ rgb_buf[0] = 255;
+ YuvPixel(src_y[1], src_u[0], src_v[0],
+ rgb_buf + 5, rgb_buf + 6, rgb_buf + 7);
+ rgb_buf[4] = 255;
+ src_y += 2;
+ src_u += 1;
+ src_v += 1;
+ rgb_buf += 8; // Advance 2 pixels.
+ }
+ if (width & 1) {
+ YuvPixel(src_y[0], src_u[0], src_v[0],
+ rgb_buf + 1, rgb_buf + 2, rgb_buf + 3);
+ rgb_buf[0] = 255;
+ }
+}
+
+void YToARGBRow_C(const uint8* src_y, uint8* rgb_buf, int width) {
+ for (int x = 0; x < width - 1; x += 2) {
+ YuvPixel(src_y[0], 128, 128,
+ rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+ rgb_buf[3] = 255;
+ YuvPixel(src_y[1], 128, 128,
+ rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
+ rgb_buf[7] = 255;
+ src_y += 2;
+ rgb_buf += 8; // Advance 2 pixels.
+ }
+ if (width & 1) {
+ YuvPixel(src_y[0], 128, 128,
+ rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+ rgb_buf[3] = 255;
+ }
+}
+
+void MirrorRow_C(const uint8* src, uint8* dst, int width) {
+ src += width - 1;
+ for (int x = 0; x < width - 1; x += 2) {
+ dst[x] = src[0];
+ dst[x + 1] = src[-1];
+ src -= 2;
+ }
+ if (width & 1) {
+ dst[width - 1] = src[0];
+ }
+}
+
+void MirrorUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
+ src_uv += (width - 1) << 1;
+ for (int x = 0; x < width - 1; x += 2) {
+ dst_u[x] = src_uv[0];
+ dst_u[x + 1] = src_uv[-2];
+ dst_v[x] = src_uv[1];
+ dst_v[x + 1] = src_uv[-2 + 1];
+ src_uv -= 4;
+ }
+ if (width & 1) {
+ dst_u[width - 1] = src_uv[0];
+ dst_v[width - 1] = src_uv[1];
+ }
+}
+
+void ARGBMirrorRow_C(const uint8* src, uint8* dst, int width) {
+ const uint32* src32 = reinterpret_cast<const uint32*>(src);
+ uint32* dst32 = reinterpret_cast<uint32*>(dst);
+ src32 += width - 1;
+ for (int x = 0; x < width - 1; x += 2) {
+ dst32[x] = src32[0];
+ dst32[x + 1] = src32[-1];
+ src32 -= 2;
+ }
+ if (width & 1) {
+ dst32[width - 1] = src32[0];
+ }
+}
+
+void SplitUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
+ for (int x = 0; x < width - 1; x += 2) {
+ dst_u[x] = src_uv[0];
+ dst_u[x + 1] = src_uv[2];
+ dst_v[x] = src_uv[1];
+ dst_v[x + 1] = src_uv[3];
+ src_uv += 4;
+ }
+ if (width & 1) {
+ dst_u[width - 1] = src_uv[0];
+ dst_v[width - 1] = src_uv[1];
+ }
+}
+
+void MergeUVRow_C(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+ int width) {
+ for (int x = 0; x < width - 1; x += 2) {
+ dst_uv[0] = src_u[x];
+ dst_uv[1] = src_v[x];
+ dst_uv[2] = src_u[x + 1];
+ dst_uv[3] = src_v[x + 1];
+ dst_uv += 4;
+ }
+ if (width & 1) {
+ dst_uv[0] = src_u[width - 1];
+ dst_uv[1] = src_v[width - 1];
+ }
+}
+
+void CopyRow_C(const uint8* src, uint8* dst, int count) {
+ memcpy(dst, src, count);
+}
+
+void SetRow_C(uint8* dst, uint32 v8, int count) {
+#ifdef _MSC_VER
+ // VC will generate rep stosb.
+ for (int x = 0; x < count; ++x) {
+ dst[x] = v8;
+ }
+#else
+ memset(dst, v8, count);
+#endif
+}
+
+void ARGBSetRows_C(uint8* dst, uint32 v32, int width,
+ int dst_stride, int height) {
+ for (int y = 0; y < height; ++y) {
+ uint32* d = reinterpret_cast<uint32*>(dst);
+ for (int x = 0; x < width; ++x) {
+ d[x] = v32;
+ }
+ dst += dst_stride;
+ }
+}
+
+// Filter 2 rows of YUY2 UV's (422) into U and V (420).
+void YUY2ToUVRow_C(const uint8* src_yuy2, int src_stride_yuy2,
+ uint8* dst_u, uint8* dst_v, int width) {
+ // Output a row of UV values, filtering 2 rows of YUY2.
+ for (int x = 0; x < width; x += 2) {
+ dst_u[0] = (src_yuy2[1] + src_yuy2[src_stride_yuy2 + 1] + 1) >> 1;
+ dst_v[0] = (src_yuy2[3] + src_yuy2[src_stride_yuy2 + 3] + 1) >> 1;
+ src_yuy2 += 4;
+ dst_u += 1;
+ dst_v += 1;
+ }
+}
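+
+// Note: YUY2 packs two pixels into 4 bytes as Y0 U0 Y1 V0, so bytes 1 and 3
+// are the shared U and V. The (a + b + 1) >> 1 above averages the chroma of
+// the two rows with rounding, e.g. (100 + 103 + 1) >> 1 = 102.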
+
+// Copy row of YUY2 UV's (422) into U and V (422).
+void YUY2ToUV422Row_C(const uint8* src_yuy2,
+ uint8* dst_u, uint8* dst_v, int width) {
+ // Output a row of UV values.
+ for (int x = 0; x < width; x += 2) {
+ dst_u[0] = src_yuy2[1];
+ dst_v[0] = src_yuy2[3];
+ src_yuy2 += 4;
+ dst_u += 1;
+ dst_v += 1;
+ }
+}
+
+// Copy row of YUY2 Y's (422) into Y (420/422).
+void YUY2ToYRow_C(const uint8* src_yuy2, uint8* dst_y, int width) {
+ // Output a row of Y values.
+ for (int x = 0; x < width - 1; x += 2) {
+ dst_y[x] = src_yuy2[0];
+ dst_y[x + 1] = src_yuy2[2];
+ src_yuy2 += 4;
+ }
+ if (width & 1) {
+ dst_y[width - 1] = src_yuy2[0];
+ }
+}
+
+// Filter 2 rows of UYVY UV's (422) into U and V (420).
+void UYVYToUVRow_C(const uint8* src_uyvy, int src_stride_uyvy,
+ uint8* dst_u, uint8* dst_v, int width) {
+ // Output a row of UV values.
+ for (int x = 0; x < width; x += 2) {
+ dst_u[0] = (src_uyvy[0] + src_uyvy[src_stride_uyvy + 0] + 1) >> 1;
+ dst_v[0] = (src_uyvy[2] + src_uyvy[src_stride_uyvy + 2] + 1) >> 1;
+ src_uyvy += 4;
+ dst_u += 1;
+ dst_v += 1;
+ }
+}
+
+// Copy row of UYVY UV's (422) into U and V (422).
+void UYVYToUV422Row_C(const uint8* src_uyvy,
+ uint8* dst_u, uint8* dst_v, int width) {
+ // Output a row of UV values.
+ for (int x = 0; x < width; x += 2) {
+ dst_u[0] = src_uyvy[0];
+ dst_v[0] = src_uyvy[2];
+ src_uyvy += 4;
+ dst_u += 1;
+ dst_v += 1;
+ }
+}
+
+// Copy row of UYVY Y's (422) into Y (420/422).
+void UYVYToYRow_C(const uint8* src_uyvy, uint8* dst_y, int width) {
+ // Output a row of Y values.
+ for (int x = 0; x < width - 1; x += 2) {
+ dst_y[x] = src_uyvy[1];
+ dst_y[x + 1] = src_uyvy[3];
+ src_uyvy += 4;
+ }
+ if (width & 1) {
+ dst_y[width - 1] = src_uyvy[1];
+ }
+}
+
+#define BLEND(f, b, a) (((256 - a) * b) >> 8) + f
+
+// Blend src_argb0 over src_argb1 and store to dst_argb.
+// dst_argb may be src_argb0 or src_argb1.
+// This code mimics the SSSE3 version for better testability.
+void ARGBBlendRow_C(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ for (int x = 0; x < width - 1; x += 2) {
+ uint32 fb = src_argb0[0];
+ uint32 fg = src_argb0[1];
+ uint32 fr = src_argb0[2];
+ uint32 a = src_argb0[3];
+ uint32 bb = src_argb1[0];
+ uint32 bg = src_argb1[1];
+ uint32 br = src_argb1[2];
+ dst_argb[0] = BLEND(fb, bb, a);
+ dst_argb[1] = BLEND(fg, bg, a);
+ dst_argb[2] = BLEND(fr, br, a);
+ dst_argb[3] = 255u;
+
+ fb = src_argb0[4 + 0];
+ fg = src_argb0[4 + 1];
+ fr = src_argb0[4 + 2];
+ a = src_argb0[4 + 3];
+ bb = src_argb1[4 + 0];
+ bg = src_argb1[4 + 1];
+ br = src_argb1[4 + 2];
+ dst_argb[4 + 0] = BLEND(fb, bb, a);
+ dst_argb[4 + 1] = BLEND(fg, bg, a);
+ dst_argb[4 + 2] = BLEND(fr, br, a);
+ dst_argb[4 + 3] = 255u;
+ src_argb0 += 8;
+ src_argb1 += 8;
+ dst_argb += 8;
+ }
+
+ if (width & 1) {
+ uint32 fb = src_argb0[0];
+ uint32 fg = src_argb0[1];
+ uint32 fr = src_argb0[2];
+ uint32 a = src_argb0[3];
+ uint32 bb = src_argb1[0];
+ uint32 bg = src_argb1[1];
+ uint32 br = src_argb1[2];
+ dst_argb[0] = BLEND(fb, bb, a);
+ dst_argb[1] = BLEND(fg, bg, a);
+ dst_argb[2] = BLEND(fr, br, a);
+ dst_argb[3] = 255u;
+ }
+}
+#undef BLEND
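+
+// The BLEND macro above is the premultiplied-alpha "over" operator,
+//   dst = fg + ((256 - a) * bg >> 8),
+// where 256 - a (rather than 255 - a) lets the divide be a shift. Worked
+// example: fg = 100, bg = 50, a = 128 gives (128 * 50 >> 8) + 100 = 125.
+// With a = 255 the result is exactly fg; with a = 0 it equals bg only when
+// the foreground is already attenuated (fg == 0), which the formula appears
+// to assume.
+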
+#define ATTENUATE(f, a) (a | (a << 8)) * (f | (f << 8)) >> 24
+
+// Multiply source RGB by alpha and store to destination.
+// This code mimics the SSSE3 version for better testability.
+void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
+ for (int i = 0; i < width - 1; i += 2) {
+ uint32 b = src_argb[0];
+ uint32 g = src_argb[1];
+ uint32 r = src_argb[2];
+ uint32 a = src_argb[3];
+ dst_argb[0] = ATTENUATE(b, a);
+ dst_argb[1] = ATTENUATE(g, a);
+ dst_argb[2] = ATTENUATE(r, a);
+ dst_argb[3] = a;
+ b = src_argb[4];
+ g = src_argb[5];
+ r = src_argb[6];
+ a = src_argb[7];
+ dst_argb[4] = ATTENUATE(b, a);
+ dst_argb[5] = ATTENUATE(g, a);
+ dst_argb[6] = ATTENUATE(r, a);
+ dst_argb[7] = a;
+ src_argb += 8;
+ dst_argb += 8;
+ }
+
+ if (width & 1) {
+ const uint32 b = src_argb[0];
+ const uint32 g = src_argb[1];
+ const uint32 r = src_argb[2];
+ const uint32 a = src_argb[3];
+ dst_argb[0] = ATTENUATE(b, a);
+ dst_argb[1] = ATTENUATE(g, a);
+ dst_argb[2] = ATTENUATE(r, a);
+ dst_argb[3] = a;
+ }
+}
+#undef ATTENUATE
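+
+// Worked example of ATTENUATE: it multiplies the 8.8 replications of the
+// channel and the alpha, (f * 257) * (a * 257) >> 24, approximating
+// f * a / 255. E.g. f = 200, a = 128:
+//   (0xC8C8 * 0x8080) >> 24 = 100, vs the exact 200 * 128 / 255 = 100.4.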
+
+// Divide source RGB by alpha and store to destination.
+// b = (b * 255 + (a / 2)) / a;
+// g = (g * 255 + (a / 2)) / a;
+// r = (r * 255 + (a / 2)) / a;
+// The reciprocal method is off by 1 on some values, e.g. 125.
+// 8.8 fixed point inverse table with 1.0 in upper short and 1 / a in lower.
+#define T(a) 0x01000000 + (0x10000 / a)
+uint32 fixed_invtbl8[256] = {
+ 0x01000000, 0x0100ffff, T(0x02), T(0x03), T(0x04), T(0x05), T(0x06), T(0x07),
+ T(0x08), T(0x09), T(0x0a), T(0x0b), T(0x0c), T(0x0d), T(0x0e), T(0x0f),
+ T(0x10), T(0x11), T(0x12), T(0x13), T(0x14), T(0x15), T(0x16), T(0x17),
+ T(0x18), T(0x19), T(0x1a), T(0x1b), T(0x1c), T(0x1d), T(0x1e), T(0x1f),
+ T(0x20), T(0x21), T(0x22), T(0x23), T(0x24), T(0x25), T(0x26), T(0x27),
+ T(0x28), T(0x29), T(0x2a), T(0x2b), T(0x2c), T(0x2d), T(0x2e), T(0x2f),
+ T(0x30), T(0x31), T(0x32), T(0x33), T(0x34), T(0x35), T(0x36), T(0x37),
+ T(0x38), T(0x39), T(0x3a), T(0x3b), T(0x3c), T(0x3d), T(0x3e), T(0x3f),
+ T(0x40), T(0x41), T(0x42), T(0x43), T(0x44), T(0x45), T(0x46), T(0x47),
+ T(0x48), T(0x49), T(0x4a), T(0x4b), T(0x4c), T(0x4d), T(0x4e), T(0x4f),
+ T(0x50), T(0x51), T(0x52), T(0x53), T(0x54), T(0x55), T(0x56), T(0x57),
+ T(0x58), T(0x59), T(0x5a), T(0x5b), T(0x5c), T(0x5d), T(0x5e), T(0x5f),
+ T(0x60), T(0x61), T(0x62), T(0x63), T(0x64), T(0x65), T(0x66), T(0x67),
+ T(0x68), T(0x69), T(0x6a), T(0x6b), T(0x6c), T(0x6d), T(0x6e), T(0x6f),
+ T(0x70), T(0x71), T(0x72), T(0x73), T(0x74), T(0x75), T(0x76), T(0x77),
+ T(0x78), T(0x79), T(0x7a), T(0x7b), T(0x7c), T(0x7d), T(0x7e), T(0x7f),
+ T(0x80), T(0x81), T(0x82), T(0x83), T(0x84), T(0x85), T(0x86), T(0x87),
+ T(0x88), T(0x89), T(0x8a), T(0x8b), T(0x8c), T(0x8d), T(0x8e), T(0x8f),
+ T(0x90), T(0x91), T(0x92), T(0x93), T(0x94), T(0x95), T(0x96), T(0x97),
+ T(0x98), T(0x99), T(0x9a), T(0x9b), T(0x9c), T(0x9d), T(0x9e), T(0x9f),
+ T(0xa0), T(0xa1), T(0xa2), T(0xa3), T(0xa4), T(0xa5), T(0xa6), T(0xa7),
+ T(0xa8), T(0xa9), T(0xaa), T(0xab), T(0xac), T(0xad), T(0xae), T(0xaf),
+ T(0xb0), T(0xb1), T(0xb2), T(0xb3), T(0xb4), T(0xb5), T(0xb6), T(0xb7),
+ T(0xb8), T(0xb9), T(0xba), T(0xbb), T(0xbc), T(0xbd), T(0xbe), T(0xbf),
+ T(0xc0), T(0xc1), T(0xc2), T(0xc3), T(0xc4), T(0xc5), T(0xc6), T(0xc7),
+ T(0xc8), T(0xc9), T(0xca), T(0xcb), T(0xcc), T(0xcd), T(0xce), T(0xcf),
+ T(0xd0), T(0xd1), T(0xd2), T(0xd3), T(0xd4), T(0xd5), T(0xd6), T(0xd7),
+ T(0xd8), T(0xd9), T(0xda), T(0xdb), T(0xdc), T(0xdd), T(0xde), T(0xdf),
+ T(0xe0), T(0xe1), T(0xe2), T(0xe3), T(0xe4), T(0xe5), T(0xe6), T(0xe7),
+ T(0xe8), T(0xe9), T(0xea), T(0xeb), T(0xec), T(0xed), T(0xee), T(0xef),
+ T(0xf0), T(0xf1), T(0xf2), T(0xf3), T(0xf4), T(0xf5), T(0xf6), T(0xf7),
+ T(0xf8), T(0xf9), T(0xfa), T(0xfb), T(0xfc), T(0xfd), T(0xfe), 0x01000100 };
+#undef T
+
+void ARGBUnattenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
+ for (int i = 0; i < width; ++i) {
+ uint32 b = src_argb[0];
+ uint32 g = src_argb[1];
+ uint32 r = src_argb[2];
+ const uint32 a = src_argb[3];
+ const uint32 ia = fixed_invtbl8[a] & 0xffff; // 8.8 fixed point
+ b = (b * ia) >> 8;
+ g = (g * ia) >> 8;
+ r = (r * ia) >> 8;
+ // Clamping should not be necessary but is free in assembly.
+ dst_argb[0] = clamp255(b);
+ dst_argb[1] = clamp255(g);
+ dst_argb[2] = clamp255(r);
+ dst_argb[3] = a;
+ src_argb += 4;
+ dst_argb += 4;
+ }
+}
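+
+// Worked example: for a = 128 the table entry is 0x01000000 + 0x10000 / 128,
+// so ia = 0x200 = 512 (the 8.8 fixed-point form of 256 / 128). An attenuated
+// value of 64 is restored as (64 * 512) >> 8 = 128, undoing the
+// ARGBAttenuateRow_C result for an original value of 128 at alpha 128.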
+
+void ComputeCumulativeSumRow_C(const uint8* row, int32* cumsum,
+ const int32* previous_cumsum, int width) {
+ int32 row_sum[4] = {0, 0, 0, 0};
+ for (int x = 0; x < width; ++x) {
+ row_sum[0] += row[x * 4 + 0];
+ row_sum[1] += row[x * 4 + 1];
+ row_sum[2] += row[x * 4 + 2];
+ row_sum[3] += row[x * 4 + 3];
+ cumsum[x * 4 + 0] = row_sum[0] + previous_cumsum[x * 4 + 0];
+ cumsum[x * 4 + 1] = row_sum[1] + previous_cumsum[x * 4 + 1];
+ cumsum[x * 4 + 2] = row_sum[2] + previous_cumsum[x * 4 + 2];
+ cumsum[x * 4 + 3] = row_sum[3] + previous_cumsum[x * 4 + 3];
+ }
+}
+
+void CumulativeSumToAverageRow_C(const int32* tl, const int32* bl,
+ int w, int area, uint8* dst, int count) {
+ float ooa = 1.0f / area;
+ for (int i = 0; i < count; ++i) {
+ dst[0] = static_cast<uint8>((bl[w + 0] + tl[0] - bl[0] - tl[w + 0]) * ooa);
+ dst[1] = static_cast<uint8>((bl[w + 1] + tl[1] - bl[1] - tl[w + 1]) * ooa);
+ dst[2] = static_cast<uint8>((bl[w + 2] + tl[2] - bl[2] - tl[w + 2]) * ooa);
+ dst[3] = static_cast<uint8>((bl[w + 3] + tl[3] - bl[3] - tl[w + 3]) * ooa);
+ dst += 4;
+ tl += 4;
+ bl += 4;
+ }
+}
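+
+// Together these two routines form a summed-area table: after
+// ComputeCumulativeSumRow_C, cumsum[x] holds, per channel, the sum of every
+// pixel above and to the left (inclusive). CumulativeSumToAverageRow_C then
+// reads the four corners of a box, bl[w] + tl[0] - bl[0] - tl[w], to get the
+// box sum in constant time and multiplies by 1 / area for the mean. E.g. a
+// box of four pixels that are all 10 gives a corner difference of 40 and an
+// average of 40 * 0.25 = 10.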
+
+// Copy pixels from rotated source to destination row with a slope.
+LIBYUV_API
+void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride,
+ uint8* dst_argb, const float* uv_dudv, int width) {
+ // Render a row of pixels from source into a buffer.
+ float uv[2];
+ uv[0] = uv_dudv[0];
+ uv[1] = uv_dudv[1];
+ for (int i = 0; i < width; ++i) {
+ int x = static_cast<int>(uv[0]);
+ int y = static_cast<int>(uv[1]);
+ *reinterpret_cast<uint32*>(dst_argb) =
+ *reinterpret_cast<const uint32*>(src_argb + y * src_argb_stride +
+ x * 4);
+ dst_argb += 4;
+ uv[0] += uv_dudv[2];
+ uv[1] += uv_dudv[3];
+ }
+}
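+
+// uv_dudv holds {u, v, du, dv}: the source coordinate starts at (u, v) and
+// steps by (du, dv) for every destination pixel, copying the truncated
+// nearest source texel. For example {0.f, 0.f, 0.f, 1.f} walks down a source
+// column, which is how a rotated or sheared row can be sampled.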
+
+// C version 2x2 -> 2x1.
+void InterpolateRow_C(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ int width, int source_y_fraction) {
+ int y1_fraction = source_y_fraction;
+ int y0_fraction = 256 - y1_fraction;
+ const uint8* src_ptr1 = src_ptr + src_stride;
+
+ for (int x = 0; x < width - 1; x += 2) {
+ dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8;
+ dst_ptr[1] = (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8;
+ src_ptr += 2;
+ src_ptr1 += 2;
+ dst_ptr += 2;
+ }
+ if (width & 1) {
+ dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8;
+ }
+}
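+
+// source_y_fraction is in [0, 256): 0 copies src_ptr, 128 averages the two
+// rows, and 64 gives a quarter blend toward src_ptr1. Worked example with
+// fraction 64: src = 100, src1 = 200 -> (100 * 192 + 200 * 64) >> 8 = 125.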
+
+// Blend 2 rows into 1 for conversions such as I422ToI420.
+void HalfRow_C(const uint8* src_uv, int src_uv_stride,
+ uint8* dst_uv, int pix) {
+ for (int x = 0; x < pix; ++x) {
+ dst_uv[x] = (src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1;
+ }
+}
+
+// Select 2 channels from ARGB on alternating pixels. e.g. BGBGBGBG
+void ARGBToBayerRow_C(const uint8* src_argb,
+ uint8* dst_bayer, uint32 selector, int pix) {
+ int index0 = selector & 0xff;
+ int index1 = (selector >> 8) & 0xff;
+ // Copy a row of Bayer.
+ for (int x = 0; x < pix - 1; x += 2) {
+ dst_bayer[0] = src_argb[index0];
+ dst_bayer[1] = src_argb[index1];
+ src_argb += 8;
+ dst_bayer += 2;
+ }
+ if (pix & 1) {
+ dst_bayer[0] = src_argb[index0];
+ }
+}
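+
+// The selector's low byte and second byte are indices into each 8-byte
+// (2-pixel) ARGB group. For example, selector 0x0500 copies byte 0 (B of
+// pixel 0) and byte 5 (G of pixel 1), producing the BGBG... pattern noted
+// above.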
+
+// Use first 4 shuffler values to reorder ARGB channels.
+void ARGBShuffleRow_C(const uint8* src_argb, uint8* dst_argb,
+ const uint8* shuffler, int pix) {
+ int index0 = shuffler[0];
+ int index1 = shuffler[1];
+ int index2 = shuffler[2];
+ int index3 = shuffler[3];
+ // Shuffle a row of ARGB.
+ for (int x = 0; x < pix; ++x) {
+ // To support in-place conversion.
+ uint8 b = src_argb[index0];
+ uint8 g = src_argb[index1];
+ uint8 r = src_argb[index2];
+ uint8 a = src_argb[index3];
+ dst_argb[0] = b;
+ dst_argb[1] = g;
+ dst_argb[2] = r;
+ dst_argb[3] = a;
+ src_argb += 4;
+ dst_argb += 4;
+ }
+}
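+
+// Example: a shuffler of {2, 1, 0, 3} writes {src[2], src[1], src[0], src[3]},
+// swapping the B and R bytes of every pixel (ARGB <-> ABGR byte order);
+// {3, 2, 1, 0} reverses all four channels.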
+
+void I422ToYUY2Row_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_frame, int width) {
+ for (int x = 0; x < width - 1; x += 2) {
+ dst_frame[0] = src_y[0];
+ dst_frame[1] = src_u[0];
+ dst_frame[2] = src_y[1];
+ dst_frame[3] = src_v[0];
+ dst_frame += 4;
+ src_y += 2;
+ src_u += 1;
+ src_v += 1;
+ }
+ if (width & 1) {
+ dst_frame[0] = src_y[0];
+ dst_frame[1] = src_u[0];
+ dst_frame[2] = src_y[0]; // duplicate last y
+ dst_frame[3] = src_v[0];
+ }
+}
+
+void I422ToUYVYRow_C(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_frame, int width) {
+ for (int x = 0; x < width - 1; x += 2) {
+ dst_frame[0] = src_u[0];
+ dst_frame[1] = src_y[0];
+ dst_frame[2] = src_v[0];
+ dst_frame[3] = src_y[1];
+ dst_frame += 4;
+ src_y += 2;
+ src_u += 1;
+ src_v += 1;
+ }
+ if (width & 1) {
+ dst_frame[0] = src_u[0];
+ dst_frame[1] = src_y[0];
+ dst_frame[2] = src_v[0];
+ dst_frame[3] = src_y[0]; // duplicate last y
+ }
+}
+
+#if !defined(LIBYUV_DISABLE_X86)
+// row_win.cc has an asm version, but GCC uses a 2-step wrapper; 5% slower.
+// TODO(fbarchard): Handle width > kMaxStride here instead of calling code.
+#if defined(__x86_64__) || defined(__i386__)
+void I422ToRGB565Row_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* rgb_buf,
+ int width) {
+ SIMD_ALIGNED(uint8 row[kMaxStride]);
+ I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, width);
+ ARGBToRGB565Row_SSE2(row, rgb_buf, width);
+}
+#endif // defined(__x86_64__) || defined(__i386__)
+
+#if defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)
+void I422ToARGB1555Row_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* rgb_buf,
+ int width) {
+ SIMD_ALIGNED(uint8 row[kMaxStride]);
+ I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, width);
+ ARGBToARGB1555Row_SSE2(row, rgb_buf, width);
+}
+
+void I422ToARGB4444Row_SSSE3(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* rgb_buf,
+ int width) {
+ SIMD_ALIGNED(uint8 row[kMaxStride]);
+ I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, width);
+ ARGBToARGB4444Row_SSE2(row, rgb_buf, width);
+}
+
+void NV12ToRGB565Row_SSSE3(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_rgb565,
+ int width) {
+ SIMD_ALIGNED(uint8 row[kMaxStride]);
+ NV12ToARGBRow_SSSE3(src_y, src_uv, row, width);
+ ARGBToRGB565Row_SSE2(row, dst_rgb565, width);
+}
+
+void NV21ToRGB565Row_SSSE3(const uint8* src_y,
+ const uint8* src_vu,
+ uint8* dst_rgb565,
+ int width) {
+ SIMD_ALIGNED(uint8 row[kMaxStride]);
+ NV21ToARGBRow_SSSE3(src_y, src_vu, row, width);
+ ARGBToRGB565Row_SSE2(row, dst_rgb565, width);
+}
+
+void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2,
+ uint8* dst_argb,
+ int width) {
+ SIMD_ALIGNED(uint8 row_y[kMaxStride]);
+ SIMD_ALIGNED(uint8 row_u[kMaxStride / 2]);
+ SIMD_ALIGNED(uint8 row_v[kMaxStride / 2]);
+ YUY2ToUV422Row_SSE2(src_yuy2, row_u, row_v, width);
+ YUY2ToYRow_SSE2(src_yuy2, row_y, width);
+ I422ToARGBRow_SSSE3(row_y, row_u, row_v, dst_argb, width);
+}
+
+void YUY2ToARGBRow_Unaligned_SSSE3(const uint8* src_yuy2,
+ uint8* dst_argb,
+ int width) {
+ SIMD_ALIGNED(uint8 row_y[kMaxStride]);
+ SIMD_ALIGNED(uint8 row_u[kMaxStride / 2]);
+ SIMD_ALIGNED(uint8 row_v[kMaxStride / 2]);
+ YUY2ToUV422Row_Unaligned_SSE2(src_yuy2, row_u, row_v, width);
+ YUY2ToYRow_Unaligned_SSE2(src_yuy2, row_y, width);
+ I422ToARGBRow_Unaligned_SSSE3(row_y, row_u, row_v, dst_argb, width);
+}
+
+void UYVYToARGBRow_SSSE3(const uint8* src_uyvy,
+ uint8* dst_argb,
+ int width) {
+ SIMD_ALIGNED(uint8 row_y[kMaxStride]);
+ SIMD_ALIGNED(uint8 row_u[kMaxStride / 2]);
+ SIMD_ALIGNED(uint8 row_v[kMaxStride / 2]);
+ UYVYToUV422Row_SSE2(src_uyvy, row_u, row_v, width);
+ UYVYToYRow_SSE2(src_uyvy, row_y, width);
+ I422ToARGBRow_SSSE3(row_y, row_u, row_v, dst_argb, width);
+}
+
+void UYVYToARGBRow_Unaligned_SSSE3(const uint8* src_uyvy,
+ uint8* dst_argb,
+ int width) {
+ SIMD_ALIGNED(uint8 row_y[kMaxStride]);
+ SIMD_ALIGNED(uint8 row_u[kMaxStride / 2]);
+ SIMD_ALIGNED(uint8 row_v[kMaxStride / 2]);
+ UYVYToUV422Row_Unaligned_SSE2(src_uyvy, row_u, row_v, width);
+ UYVYToYRow_Unaligned_SSE2(src_uyvy, row_y, width);
+ I422ToARGBRow_Unaligned_SSSE3(row_y, row_u, row_v, dst_argb, width);
+}
+
+#endif // defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)
+#endif // !defined(LIBYUV_DISABLE_X86)
+#undef clamp0
+#undef clamp255
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/chromium/third_party/libyuv/source/row_mips.cc b/chromium/third_party/libyuv/source/row_mips.cc
new file mode 100644
index 00000000000..69677aa2d5b
--- /dev/null
+++ b/chromium/third_party/libyuv/source/row_mips.cc
@@ -0,0 +1,974 @@
+/*
+ * Copyright (c) 2012 The LibYuv project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#ifdef HAS_COPYROW_MIPS
+void CopyRow_MIPS(const uint8* src, uint8* dst, int count) {
+ __asm__ __volatile__ (
+ ".set noreorder \n"
+ ".set noat \n"
+ "slti $at, %[count], 8 \n"
+ "bne $at ,$zero, $last8 \n"
+ "xor $t8, %[src], %[dst] \n"
+ "andi $t8, $t8, 0x3 \n"
+
+ "bne $t8, $zero, unaligned \n"
+ "negu $a3, %[dst] \n"
+ // make dst/src aligned
+ "andi $a3, $a3, 0x3 \n"
+ "beq $a3, $zero, $chk16w \n"
+  // word-aligned; now count is the remaining byte count
+ "subu %[count], %[count], $a3 \n"
+
+ "lwr $t8, 0(%[src]) \n"
+ "addu %[src], %[src], $a3 \n"
+ "swr $t8, 0(%[dst]) \n"
+ "addu %[dst], %[dst], $a3 \n"
+
+ // Now the dst/src are mutually word-aligned with word-aligned addresses
+ "$chk16w: \n"
+ "andi $t8, %[count], 0x3f \n" // whole 64-B chunks?
+ // t8 is the byte count after 64-byte chunks
+ "beq %[count], $t8, chk8w \n"
+ // There will be at most 1 32-byte chunk after it
+    "subu    $a3, %[count], $t8                  \n"  // the remainder
+ // Here a3 counts bytes in 16w chunks
+ "addu $a3, %[dst], $a3 \n"
+ // Now a3 is the final dst after 64-byte chunks
+ "addu $t0, %[dst], %[count] \n"
+ // t0 is the "past the end" address
+
+  // When the loop issues "pref 30, x(a1)", a1+x should not go past the
+  // "t0-32" address.
+  // This means: for x=128 the last "safe" a1 address is "t0-160";
+  // alternatively, for x=64 the last "safe" a1 address is "t0-96".
+  // We will use "pref 30, 128(a1)", so "t0-160" is the limit.
+ "subu $t9, $t0, 160 \n"
+ // t9 is the "last safe pref 30,128(a1)" address
+ "pref 0, 0(%[src]) \n" // first line of src
+ "pref 0, 32(%[src]) \n" // second line of src
+ "pref 0, 64(%[src]) \n"
+ "pref 30, 32(%[dst]) \n"
+ // In case the a1 > t9 don't use "pref 30" at all
+ "sgtu $v1, %[dst], $t9 \n"
+ "bgtz $v1, $loop16w \n"
+ "nop \n"
+ // otherwise, start with using pref30
+ "pref 30, 64(%[dst]) \n"
+ "$loop16w: \n"
+ "pref 0, 96(%[src]) \n"
+ "lw $t0, 0(%[src]) \n"
+ "bgtz $v1, $skip_pref30_96 \n" // skip
+ "lw $t1, 4(%[src]) \n"
+ "pref 30, 96(%[dst]) \n" // continue
+ "$skip_pref30_96: \n"
+ "lw $t2, 8(%[src]) \n"
+ "lw $t3, 12(%[src]) \n"
+ "lw $t4, 16(%[src]) \n"
+ "lw $t5, 20(%[src]) \n"
+ "lw $t6, 24(%[src]) \n"
+ "lw $t7, 28(%[src]) \n"
+ "pref 0, 128(%[src]) \n"
+ // bring the next lines of src, addr 128
+ "sw $t0, 0(%[dst]) \n"
+ "sw $t1, 4(%[dst]) \n"
+ "sw $t2, 8(%[dst]) \n"
+ "sw $t3, 12(%[dst]) \n"
+ "sw $t4, 16(%[dst]) \n"
+ "sw $t5, 20(%[dst]) \n"
+ "sw $t6, 24(%[dst]) \n"
+ "sw $t7, 28(%[dst]) \n"
+ "lw $t0, 32(%[src]) \n"
+ "bgtz $v1, $skip_pref30_128 \n" // skip pref 30,128(a1)
+ "lw $t1, 36(%[src]) \n"
+ "pref 30, 128(%[dst]) \n" // set dest, addr 128
+ "$skip_pref30_128: \n"
+ "lw $t2, 40(%[src]) \n"
+ "lw $t3, 44(%[src]) \n"
+ "lw $t4, 48(%[src]) \n"
+ "lw $t5, 52(%[src]) \n"
+ "lw $t6, 56(%[src]) \n"
+ "lw $t7, 60(%[src]) \n"
+ "pref 0, 160(%[src]) \n"
+ // bring the next lines of src, addr 160
+ "sw $t0, 32(%[dst]) \n"
+ "sw $t1, 36(%[dst]) \n"
+ "sw $t2, 40(%[dst]) \n"
+ "sw $t3, 44(%[dst]) \n"
+ "sw $t4, 48(%[dst]) \n"
+ "sw $t5, 52(%[dst]) \n"
+ "sw $t6, 56(%[dst]) \n"
+ "sw $t7, 60(%[dst]) \n"
+
+ "addiu %[dst], %[dst], 64 \n" // adding 64 to dest
+ "sgtu $v1, %[dst], $t9 \n"
+ "bne %[dst], $a3, $loop16w \n"
+ " addiu %[src], %[src], 64 \n" // adding 64 to src
+ "move %[count], $t8 \n"
+
+ // Here we have src and dest word-aligned but less than 64-bytes to go
+
+ "chk8w: \n"
+ "pref 0, 0x0(%[src]) \n"
+ "andi $t8, %[count], 0x1f \n" // 32-byte chunk?
+  // t8 is the remainder count past the 32-byte chunks
+ "beq %[count], $t8, chk1w \n"
+  // if count == t8, there is no 32-byte chunk
+ " nop \n"
+
+ "lw $t0, 0(%[src]) \n"
+ "lw $t1, 4(%[src]) \n"
+ "lw $t2, 8(%[src]) \n"
+ "lw $t3, 12(%[src]) \n"
+ "lw $t4, 16(%[src]) \n"
+ "lw $t5, 20(%[src]) \n"
+ "lw $t6, 24(%[src]) \n"
+ "lw $t7, 28(%[src]) \n"
+ "addiu %[src], %[src], 32 \n"
+
+ "sw $t0, 0(%[dst]) \n"
+ "sw $t1, 4(%[dst]) \n"
+ "sw $t2, 8(%[dst]) \n"
+ "sw $t3, 12(%[dst]) \n"
+ "sw $t4, 16(%[dst]) \n"
+ "sw $t5, 20(%[dst]) \n"
+ "sw $t6, 24(%[dst]) \n"
+ "sw $t7, 28(%[dst]) \n"
+ "addiu %[dst], %[dst], 32 \n"
+
+ "chk1w: \n"
+ "andi %[count], $t8, 0x3 \n"
+  // now count is the remainder past the 1w chunks
+ "beq %[count], $t8, $last8 \n"
+ " subu $a3, $t8, %[count] \n"
+ // a3 is count of bytes in 1w chunks
+ "addu $a3, %[dst], $a3 \n"
+ // now a3 is the dst address past the 1w chunks
+ // copying in words (4-byte chunks)
+ "$wordCopy_loop: \n"
+ "lw $t3, 0(%[src]) \n"
+  // the first t3 may equal t0 ... optimize?
+ "addiu %[src], %[src],4 \n"
+ "addiu %[dst], %[dst],4 \n"
+ "bne %[dst], $a3,$wordCopy_loop \n"
+ " sw $t3, -4(%[dst]) \n"
+
+ // For the last (<8) bytes
+ "$last8: \n"
+ "blez %[count], leave \n"
+    " addu   $a3, %[dst], %[count]               \n"  // a3 = last dst address
+ "$last8loop: \n"
+ "lb $v1, 0(%[src]) \n"
+ "addiu %[src], %[src], 1 \n"
+ "addiu %[dst], %[dst], 1 \n"
+ "bne %[dst], $a3, $last8loop \n"
+ " sb $v1, -1(%[dst]) \n"
+
+ "leave: \n"
+ " j $ra \n"
+ " nop \n"
+
+ //
+ // UNALIGNED case
+ //
+
+ "unaligned: \n"
+ // got here with a3="negu a1"
+ "andi $a3, $a3, 0x3 \n" // a1 is word aligned?
+ "beqz $a3, $ua_chk16w \n"
+ " subu %[count], %[count], $a3 \n"
+ // bytes left after initial a3 bytes
+ "lwr $v1, 0(%[src]) \n"
+ "lwl $v1, 3(%[src]) \n"
+ "addu %[src], %[src], $a3 \n" // a3 may be 1, 2 or 3
+ "swr $v1, 0(%[dst]) \n"
+ "addu %[dst], %[dst], $a3 \n"
+ // below the dst will be word aligned (NOTE1)
+ "$ua_chk16w: \n"
+ "andi $t8, %[count], 0x3f \n" // whole 64-B chunks?
+ // t8 is the byte count after 64-byte chunks
+ "beq %[count], $t8, ua_chk8w \n"
+ // if a2==t8, no 64-byte chunks
+ // There will be at most 1 32-byte chunk after it
+    "subu    $a3, %[count], $t8                  \n"  // the remainder
+ // Here a3 counts bytes in 16w chunks
+ "addu $a3, %[dst], $a3 \n"
+ // Now a3 is the final dst after 64-byte chunks
+ "addu $t0, %[dst], %[count] \n" // t0 "past the end"
+ "subu $t9, $t0, 160 \n"
+ // t9 is the "last safe pref 30,128(a1)" address
+ "pref 0, 0(%[src]) \n" // first line of src
+ "pref 0, 32(%[src]) \n" // second line addr 32
+ "pref 0, 64(%[src]) \n"
+ "pref 30, 32(%[dst]) \n"
+ // safe, as we have at least 64 bytes ahead
+ // In case the a1 > t9 don't use "pref 30" at all
+ "sgtu $v1, %[dst], $t9 \n"
+ "bgtz $v1, $ua_loop16w \n"
+ // skip "pref 30,64(a1)" for too short arrays
+ " nop \n"
+ // otherwise, start with using pref30
+ "pref 30, 64(%[dst]) \n"
+ "$ua_loop16w: \n"
+ "pref 0, 96(%[src]) \n"
+ "lwr $t0, 0(%[src]) \n"
+ "lwl $t0, 3(%[src]) \n"
+ "lwr $t1, 4(%[src]) \n"
+ "bgtz $v1, $ua_skip_pref30_96 \n"
+ " lwl $t1, 7(%[src]) \n"
+ "pref 30, 96(%[dst]) \n"
+ // continue setting up the dest, addr 96
+ "$ua_skip_pref30_96: \n"
+ "lwr $t2, 8(%[src]) \n"
+ "lwl $t2, 11(%[src]) \n"
+ "lwr $t3, 12(%[src]) \n"
+ "lwl $t3, 15(%[src]) \n"
+ "lwr $t4, 16(%[src]) \n"
+ "lwl $t4, 19(%[src]) \n"
+ "lwr $t5, 20(%[src]) \n"
+ "lwl $t5, 23(%[src]) \n"
+ "lwr $t6, 24(%[src]) \n"
+ "lwl $t6, 27(%[src]) \n"
+ "lwr $t7, 28(%[src]) \n"
+ "lwl $t7, 31(%[src]) \n"
+ "pref 0, 128(%[src]) \n"
+ // bring the next lines of src, addr 128
+ "sw $t0, 0(%[dst]) \n"
+ "sw $t1, 4(%[dst]) \n"
+ "sw $t2, 8(%[dst]) \n"
+ "sw $t3, 12(%[dst]) \n"
+ "sw $t4, 16(%[dst]) \n"
+ "sw $t5, 20(%[dst]) \n"
+ "sw $t6, 24(%[dst]) \n"
+ "sw $t7, 28(%[dst]) \n"
+ "lwr $t0, 32(%[src]) \n"
+ "lwl $t0, 35(%[src]) \n"
+ "lwr $t1, 36(%[src]) \n"
+ "bgtz $v1, ua_skip_pref30_128 \n"
+ " lwl $t1, 39(%[src]) \n"
+ "pref 30, 128(%[dst]) \n"
+ // continue setting up the dest, addr 128
+ "ua_skip_pref30_128: \n"
+
+ "lwr $t2, 40(%[src]) \n"
+ "lwl $t2, 43(%[src]) \n"
+ "lwr $t3, 44(%[src]) \n"
+ "lwl $t3, 47(%[src]) \n"
+ "lwr $t4, 48(%[src]) \n"
+ "lwl $t4, 51(%[src]) \n"
+ "lwr $t5, 52(%[src]) \n"
+ "lwl $t5, 55(%[src]) \n"
+ "lwr $t6, 56(%[src]) \n"
+ "lwl $t6, 59(%[src]) \n"
+ "lwr $t7, 60(%[src]) \n"
+ "lwl $t7, 63(%[src]) \n"
+ "pref 0, 160(%[src]) \n"
+ // bring the next lines of src, addr 160
+ "sw $t0, 32(%[dst]) \n"
+ "sw $t1, 36(%[dst]) \n"
+ "sw $t2, 40(%[dst]) \n"
+ "sw $t3, 44(%[dst]) \n"
+ "sw $t4, 48(%[dst]) \n"
+ "sw $t5, 52(%[dst]) \n"
+ "sw $t6, 56(%[dst]) \n"
+ "sw $t7, 60(%[dst]) \n"
+
+ "addiu %[dst],%[dst],64 \n" // adding 64 to dest
+ "sgtu $v1,%[dst],$t9 \n"
+ "bne %[dst],$a3,$ua_loop16w \n"
+ " addiu %[src],%[src],64 \n" // adding 64 to src
+ "move %[count],$t8 \n"
+
+ // Here we have src and dest word-aligned but less than 64-bytes to go
+
+ "ua_chk8w: \n"
+ "pref 0, 0x0(%[src]) \n"
+ "andi $t8, %[count], 0x1f \n" // 32-byte chunk?
+  // t8 is the remainder count
+ "beq %[count], $t8, $ua_chk1w \n"
+ // when count==t8, no 32-byte chunk
+
+ "lwr $t0, 0(%[src]) \n"
+ "lwl $t0, 3(%[src]) \n"
+ "lwr $t1, 4(%[src]) \n"
+ "lwl $t1, 7(%[src]) \n"
+ "lwr $t2, 8(%[src]) \n"
+ "lwl $t2, 11(%[src]) \n"
+ "lwr $t3, 12(%[src]) \n"
+ "lwl $t3, 15(%[src]) \n"
+ "lwr $t4, 16(%[src]) \n"
+ "lwl $t4, 19(%[src]) \n"
+ "lwr $t5, 20(%[src]) \n"
+ "lwl $t5, 23(%[src]) \n"
+ "lwr $t6, 24(%[src]) \n"
+ "lwl $t6, 27(%[src]) \n"
+ "lwr $t7, 28(%[src]) \n"
+ "lwl $t7, 31(%[src]) \n"
+ "addiu %[src], %[src], 32 \n"
+
+ "sw $t0, 0(%[dst]) \n"
+ "sw $t1, 4(%[dst]) \n"
+ "sw $t2, 8(%[dst]) \n"
+ "sw $t3, 12(%[dst]) \n"
+ "sw $t4, 16(%[dst]) \n"
+ "sw $t5, 20(%[dst]) \n"
+ "sw $t6, 24(%[dst]) \n"
+ "sw $t7, 28(%[dst]) \n"
+ "addiu %[dst], %[dst], 32 \n"
+
+ "$ua_chk1w: \n"
+ "andi %[count], $t8, 0x3 \n"
+  // now count is the remainder past the 1w chunks
+ "beq %[count], $t8, ua_smallCopy \n"
+ "subu $a3, $t8, %[count] \n"
+ // a3 is count of bytes in 1w chunks
+ "addu $a3, %[dst], $a3 \n"
+ // now a3 is the dst address past the 1w chunks
+
+ // copying in words (4-byte chunks)
+ "$ua_wordCopy_loop: \n"
+ "lwr $v1, 0(%[src]) \n"
+ "lwl $v1, 3(%[src]) \n"
+ "addiu %[src], %[src], 4 \n"
+ "addiu %[dst], %[dst], 4 \n"
+ // note: dst=a1 is word aligned here, see NOTE1
+ "bne %[dst], $a3, $ua_wordCopy_loop \n"
+ " sw $v1,-4(%[dst]) \n"
+
+ // Now less than 4 bytes (value in count) left to copy
+ "ua_smallCopy: \n"
+ "beqz %[count], leave \n"
+ " addu $a3, %[dst], %[count] \n" // a3 = last dst address
+ "$ua_smallCopy_loop: \n"
+ "lb $v1, 0(%[src]) \n"
+ "addiu %[src], %[src], 1 \n"
+ "addiu %[dst], %[dst], 1 \n"
+ "bne %[dst],$a3,$ua_smallCopy_loop \n"
+ " sb $v1, -1(%[dst]) \n"
+
+ "j $ra \n"
+ " nop \n"
+ ".set at \n"
+ ".set reorder \n"
+  : [dst] "+r" (dst), [src] "+r" (src),
+    [count] "+r" (count)  // count is modified by the asm, so it must be read-write
+  :
+ : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7",
+ "t8", "t9", "a3", "v1", "at"
+ );
+}
+#endif // HAS_COPYROW_MIPS
+
+// MIPS DSPR2 functions
+#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips_dsp) && \
+ (__mips_dsp_rev >= 2)
+void SplitUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+ int width) {
+ __asm__ __volatile__ (
+ ".set push \n"
+ ".set noreorder \n"
+    "srl       $t4, %[width], 4                    \n"  // multiples of 16
+ "blez $t4, 2f \n"
+ " andi %[width], %[width], 0xf \n" // residual
+
+ "1: \n"
+ "addiu $t4, $t4, -1 \n"
+ "lw $t0, 0(%[src_uv]) \n" // V1 | U1 | V0 | U0
+ "lw $t1, 4(%[src_uv]) \n" // V3 | U3 | V2 | U2
+ "lw $t2, 8(%[src_uv]) \n" // V5 | U5 | V4 | U4
+ "lw $t3, 12(%[src_uv]) \n" // V7 | U7 | V6 | U6
+ "lw $t5, 16(%[src_uv]) \n" // V9 | U9 | V8 | U8
+ "lw $t6, 20(%[src_uv]) \n" // V11 | U11 | V10 | U10
+ "lw $t7, 24(%[src_uv]) \n" // V13 | U13 | V12 | U12
+ "lw $t8, 28(%[src_uv]) \n" // V15 | U15 | V14 | U14
+ "addiu %[src_uv], %[src_uv], 32 \n"
+ "precrq.qb.ph $t9, $t1, $t0 \n" // V3 | V2 | V1 | V0
+ "precr.qb.ph $t0, $t1, $t0 \n" // U3 | U2 | U1 | U0
+ "precrq.qb.ph $t1, $t3, $t2 \n" // V7 | V6 | V5 | V4
+ "precr.qb.ph $t2, $t3, $t2 \n" // U7 | U6 | U5 | U4
+ "precrq.qb.ph $t3, $t6, $t5 \n" // V11 | V10 | V9 | V8
+ "precr.qb.ph $t5, $t6, $t5 \n" // U11 | U10 | U9 | U8
+ "precrq.qb.ph $t6, $t8, $t7 \n" // V15 | V14 | V13 | V12
+ "precr.qb.ph $t7, $t8, $t7 \n" // U15 | U14 | U13 | U12
+ "sw $t9, 0(%[dst_v]) \n"
+ "sw $t0, 0(%[dst_u]) \n"
+ "sw $t1, 4(%[dst_v]) \n"
+ "sw $t2, 4(%[dst_u]) \n"
+ "sw $t3, 8(%[dst_v]) \n"
+ "sw $t5, 8(%[dst_u]) \n"
+ "sw $t6, 12(%[dst_v]) \n"
+ "sw $t7, 12(%[dst_u]) \n"
+ "addiu %[dst_v], %[dst_v], 16 \n"
+ "bgtz $t4, 1b \n"
+ " addiu %[dst_u], %[dst_u], 16 \n"
+
+ "beqz %[width], 3f \n"
+ " nop \n"
+
+ "2: \n"
+ "lbu $t0, 0(%[src_uv]) \n"
+ "lbu $t1, 1(%[src_uv]) \n"
+ "addiu %[src_uv], %[src_uv], 2 \n"
+ "addiu %[width], %[width], -1 \n"
+ "sb $t0, 0(%[dst_u]) \n"
+ "sb $t1, 0(%[dst_v]) \n"
+ "addiu %[dst_u], %[dst_u], 1 \n"
+ "bgtz %[width], 2b \n"
+ " addiu %[dst_v], %[dst_v], 1 \n"
+
+ "3: \n"
+ ".set pop \n"
+ : [src_uv] "+r" (src_uv),
+ [width] "+r" (width),
+ [dst_u] "+r" (dst_u),
+ [dst_v] "+r" (dst_v)
+ :
+ : "t0", "t1", "t2", "t3",
+ "t4", "t5", "t6", "t7", "t8", "t9"
+ );
+}
+
+void SplitUVRow_Unaligned_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u,
+ uint8* dst_v, int width) {
+ __asm__ __volatile__ (
+ ".set push \n"
+ ".set noreorder \n"
+    "srl       $t4, %[width], 4                    \n"  // multiples of 16
+ "blez $t4, 2f \n"
+ " andi %[width], %[width], 0xf \n" // residual
+
+ "1: \n"
+ "addiu $t4, $t4, -1 \n"
+ "lwr $t0, 0(%[src_uv]) \n"
+ "lwl $t0, 3(%[src_uv]) \n" // V1 | U1 | V0 | U0
+ "lwr $t1, 4(%[src_uv]) \n"
+ "lwl $t1, 7(%[src_uv]) \n" // V3 | U3 | V2 | U2
+ "lwr $t2, 8(%[src_uv]) \n"
+ "lwl $t2, 11(%[src_uv]) \n" // V5 | U5 | V4 | U4
+ "lwr $t3, 12(%[src_uv]) \n"
+ "lwl $t3, 15(%[src_uv]) \n" // V7 | U7 | V6 | U6
+ "lwr $t5, 16(%[src_uv]) \n"
+ "lwl $t5, 19(%[src_uv]) \n" // V9 | U9 | V8 | U8
+ "lwr $t6, 20(%[src_uv]) \n"
+ "lwl $t6, 23(%[src_uv]) \n" // V11 | U11 | V10 | U10
+ "lwr $t7, 24(%[src_uv]) \n"
+ "lwl $t7, 27(%[src_uv]) \n" // V13 | U13 | V12 | U12
+ "lwr $t8, 28(%[src_uv]) \n"
+ "lwl $t8, 31(%[src_uv]) \n" // V15 | U15 | V14 | U14
+ "precrq.qb.ph $t9, $t1, $t0 \n" // V3 | V2 | V1 | V0
+ "precr.qb.ph $t0, $t1, $t0 \n" // U3 | U2 | U1 | U0
+ "precrq.qb.ph $t1, $t3, $t2 \n" // V7 | V6 | V5 | V4
+ "precr.qb.ph $t2, $t3, $t2 \n" // U7 | U6 | U5 | U4
+ "precrq.qb.ph $t3, $t6, $t5 \n" // V11 | V10 | V9 | V8
+ "precr.qb.ph $t5, $t6, $t5 \n" // U11 | U10 | U9 | U8
+ "precrq.qb.ph $t6, $t8, $t7 \n" // V15 | V14 | V13 | V12
+ "precr.qb.ph $t7, $t8, $t7 \n" // U15 | U14 | U13 | U12
+ "addiu %[src_uv], %[src_uv], 32 \n"
+ "swr $t9, 0(%[dst_v]) \n"
+ "swl $t9, 3(%[dst_v]) \n"
+ "swr $t0, 0(%[dst_u]) \n"
+ "swl $t0, 3(%[dst_u]) \n"
+ "swr $t1, 4(%[dst_v]) \n"
+ "swl $t1, 7(%[dst_v]) \n"
+ "swr $t2, 4(%[dst_u]) \n"
+ "swl $t2, 7(%[dst_u]) \n"
+ "swr $t3, 8(%[dst_v]) \n"
+ "swl $t3, 11(%[dst_v]) \n"
+ "swr $t5, 8(%[dst_u]) \n"
+ "swl $t5, 11(%[dst_u]) \n"
+ "swr $t6, 12(%[dst_v]) \n"
+ "swl $t6, 15(%[dst_v]) \n"
+ "swr $t7, 12(%[dst_u]) \n"
+ "swl $t7, 15(%[dst_u]) \n"
+ "addiu %[dst_u], %[dst_u], 16 \n"
+ "bgtz $t4, 1b \n"
+ " addiu %[dst_v], %[dst_v], 16 \n"
+
+ "beqz %[width], 3f \n"
+ " nop \n"
+
+ "2: \n"
+ "lbu $t0, 0(%[src_uv]) \n"
+ "lbu $t1, 1(%[src_uv]) \n"
+ "addiu %[src_uv], %[src_uv], 2 \n"
+ "addiu %[width], %[width], -1 \n"
+ "sb $t0, 0(%[dst_u]) \n"
+ "sb $t1, 0(%[dst_v]) \n"
+ "addiu %[dst_u], %[dst_u], 1 \n"
+ "bgtz %[width], 2b \n"
+ " addiu %[dst_v], %[dst_v], 1 \n"
+
+ "3: \n"
+ ".set pop \n"
+ : [src_uv] "+r" (src_uv),
+ [width] "+r" (width),
+ [dst_u] "+r" (dst_u),
+ [dst_v] "+r" (dst_v)
+ :
+ : "t0", "t1", "t2", "t3",
+ "t4", "t5", "t6", "t7", "t8", "t9"
+ );
+}
+
+void MirrorRow_MIPS_DSPR2(const uint8* src, uint8* dst, int width) {
+ __asm__ __volatile__ (
+ ".set push \n"
+ ".set noreorder \n"
+
+    "srl       $t4, %[width], 4                    \n"  // multiples of 16
+ "andi $t5, %[width], 0xf \n"
+ "blez $t4, 2f \n"
+ " addu %[src], %[src], %[width] \n" // src += width
+
+ "1: \n"
+ "lw $t0, -16(%[src]) \n" // |3|2|1|0|
+ "lw $t1, -12(%[src]) \n" // |7|6|5|4|
+ "lw $t2, -8(%[src]) \n" // |11|10|9|8|
+ "lw $t3, -4(%[src]) \n" // |15|14|13|12|
+ "wsbh $t0, $t0 \n" // |2|3|0|1|
+ "wsbh $t1, $t1 \n" // |6|7|4|5|
+ "wsbh $t2, $t2 \n" // |10|11|8|9|
+ "wsbh $t3, $t3 \n" // |14|15|12|13|
+ "rotr $t0, $t0, 16 \n" // |0|1|2|3|
+ "rotr $t1, $t1, 16 \n" // |4|5|6|7|
+ "rotr $t2, $t2, 16 \n" // |8|9|10|11|
+ "rotr $t3, $t3, 16 \n" // |12|13|14|15|
+ "addiu %[src], %[src], -16 \n"
+ "addiu $t4, $t4, -1 \n"
+ "sw $t3, 0(%[dst]) \n" // |15|14|13|12|
+ "sw $t2, 4(%[dst]) \n" // |11|10|9|8|
+ "sw $t1, 8(%[dst]) \n" // |7|6|5|4|
+ "sw $t0, 12(%[dst]) \n" // |3|2|1|0|
+ "bgtz $t4, 1b \n"
+ " addiu %[dst], %[dst], 16 \n"
+ "beqz $t5, 3f \n"
+ " nop \n"
+
+ "2: \n"
+ "lbu $t0, -1(%[src]) \n"
+ "addiu $t5, $t5, -1 \n"
+ "addiu %[src], %[src], -1 \n"
+ "sb $t0, 0(%[dst]) \n"
+ "bgez $t5, 2b \n"
+ " addiu %[dst], %[dst], 1 \n"
+
+ "3: \n"
+ ".set pop \n"
+ : [src] "+r" (src), [dst] "+r" (dst)
+ : [width] "r" (width)
+ : "t0", "t1", "t2", "t3", "t4", "t5"
+ );
+}
+
+void MirrorUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+ int width) {
+ int x = 0;
+ int y = 0;
+ __asm__ __volatile__ (
+ ".set push \n"
+ ".set noreorder \n"
+
+ "addu $t4, %[width], %[width] \n"
+ "srl %[x], %[width], 4 \n"
+ "andi %[y], %[width], 0xf \n"
+ "blez %[x], 2f \n"
+ " addu %[src_uv], %[src_uv], $t4 \n"
+
+ "1: \n"
+ "lw $t0, -32(%[src_uv]) \n" // |3|2|1|0|
+ "lw $t1, -28(%[src_uv]) \n" // |7|6|5|4|
+ "lw $t2, -24(%[src_uv]) \n" // |11|10|9|8|
+ "lw $t3, -20(%[src_uv]) \n" // |15|14|13|12|
+ "lw $t4, -16(%[src_uv]) \n" // |19|18|17|16|
+ "lw $t6, -12(%[src_uv]) \n" // |23|22|21|20|
+ "lw $t7, -8(%[src_uv]) \n" // |27|26|25|24|
+ "lw $t8, -4(%[src_uv]) \n" // |31|30|29|28|
+
+ "rotr $t0, $t0, 16 \n" // |1|0|3|2|
+ "rotr $t1, $t1, 16 \n" // |5|4|7|6|
+ "rotr $t2, $t2, 16 \n" // |9|8|11|10|
+ "rotr $t3, $t3, 16 \n" // |13|12|15|14|
+ "rotr $t4, $t4, 16 \n" // |17|16|19|18|
+ "rotr $t6, $t6, 16 \n" // |21|20|23|22|
+ "rotr $t7, $t7, 16 \n" // |25|24|27|26|
+ "rotr $t8, $t8, 16 \n" // |29|28|31|30|
+ "precr.qb.ph $t9, $t0, $t1 \n" // |0|2|4|6|
+ "precrq.qb.ph $t5, $t0, $t1 \n" // |1|3|5|7|
+ "precr.qb.ph $t0, $t2, $t3 \n" // |8|10|12|14|
+ "precrq.qb.ph $t1, $t2, $t3 \n" // |9|11|13|15|
+ "precr.qb.ph $t2, $t4, $t6 \n" // |16|18|20|22|
+ "precrq.qb.ph $t3, $t4, $t6 \n" // |17|19|21|23|
+ "precr.qb.ph $t4, $t7, $t8 \n" // |24|26|28|30|
+ "precrq.qb.ph $t6, $t7, $t8 \n" // |25|27|29|31|
+ "addiu %[src_uv], %[src_uv], -32 \n"
+ "addiu %[x], %[x], -1 \n"
+ "swr $t4, 0(%[dst_u]) \n"
+ "swl $t4, 3(%[dst_u]) \n" // |30|28|26|24|
+ "swr $t6, 0(%[dst_v]) \n"
+ "swl $t6, 3(%[dst_v]) \n" // |31|29|27|25|
+ "swr $t2, 4(%[dst_u]) \n"
+ "swl $t2, 7(%[dst_u]) \n" // |22|20|18|16|
+ "swr $t3, 4(%[dst_v]) \n"
+ "swl $t3, 7(%[dst_v]) \n" // |23|21|19|17|
+ "swr $t0, 8(%[dst_u]) \n"
+ "swl $t0, 11(%[dst_u]) \n" // |14|12|10|8|
+ "swr $t1, 8(%[dst_v]) \n"
+ "swl $t1, 11(%[dst_v]) \n" // |15|13|11|9|
+ "swr $t9, 12(%[dst_u]) \n"
+ "swl $t9, 15(%[dst_u]) \n" // |6|4|2|0|
+ "swr $t5, 12(%[dst_v]) \n"
+ "swl $t5, 15(%[dst_v]) \n" // |7|5|3|1|
+ "addiu %[dst_v], %[dst_v], 16 \n"
+ "bgtz %[x], 1b \n"
+ " addiu %[dst_u], %[dst_u], 16 \n"
+ "beqz %[y], 3f \n"
+ " nop \n"
+ "b 2f \n"
+ " nop \n"
+
+ "2: \n"
+ "lbu $t0, -2(%[src_uv]) \n"
+ "lbu $t1, -1(%[src_uv]) \n"
+ "addiu %[src_uv], %[src_uv], -2 \n"
+ "addiu %[y], %[y], -1 \n"
+ "sb $t0, 0(%[dst_u]) \n"
+ "sb $t1, 0(%[dst_v]) \n"
+ "addiu %[dst_u], %[dst_u], 1 \n"
+ "bgtz %[y], 2b \n"
+ " addiu %[dst_v], %[dst_v], 1 \n"
+
+ "3: \n"
+ ".set pop \n"
+ : [src_uv] "+r" (src_uv),
+ [dst_u] "+r" (dst_u),
+ [dst_v] "+r" (dst_v),
+ [x] "=&r" (x),
+ [y] "+r" (y)
+ : [width] "r" (width)
+      : "t0", "t1", "t2", "t3", "t4",
+        "t5", "t6", "t7", "t8", "t9"
+ );
+}
+
+// Convert 4 Y, 2 U and 2 V values (I422) and arrange the RGB results into
+// t5 = | 0 | B0 | 0 | b0 |
+// t4 = | 0 | B1 | 0 | b1 |
+// t9 = | 0 | G0 | 0 | g0 |
+// t8 = | 0 | G1 | 0 | g1 |
+// t2 = | 0 | R0 | 0 | r0 |
+// t1 = | 0 | R1 | 0 | r1 |
+#define I422ToTransientMipsRGB \
+ "lw $t0, 0(%[y_buf]) \n" \
+ "lhu $t1, 0(%[u_buf]) \n" \
+ "lhu $t2, 0(%[v_buf]) \n" \
+ "preceu.ph.qbr $t1, $t1 \n" \
+ "preceu.ph.qbr $t2, $t2 \n" \
+ "preceu.ph.qbra $t3, $t0 \n" \
+ "preceu.ph.qbla $t0, $t0 \n" \
+ "subu.ph $t1, $t1, $s5 \n" \
+ "subu.ph $t2, $t2, $s5 \n" \
+ "subu.ph $t3, $t3, $s4 \n" \
+ "subu.ph $t0, $t0, $s4 \n" \
+ "mul.ph $t3, $t3, $s0 \n" \
+ "mul.ph $t0, $t0, $s0 \n" \
+ "shll.ph $t4, $t1, 0x7 \n" \
+ "subu.ph $t4, $t4, $t1 \n" \
+ "mul.ph $t6, $t1, $s1 \n" \
+ "mul.ph $t1, $t2, $s2 \n" \
+ "addq_s.ph $t5, $t4, $t3 \n" \
+ "addq_s.ph $t4, $t4, $t0 \n" \
+ "shra.ph $t5, $t5, 6 \n" \
+ "shra.ph $t4, $t4, 6 \n" \
+ "addiu %[u_buf], 2 \n" \
+ "addiu %[v_buf], 2 \n" \
+ "addu.ph $t6, $t6, $t1 \n" \
+ "mul.ph $t1, $t2, $s3 \n" \
+ "addu.ph $t9, $t6, $t3 \n" \
+ "addu.ph $t8, $t6, $t0 \n" \
+ "shra.ph $t9, $t9, 6 \n" \
+ "shra.ph $t8, $t8, 6 \n" \
+ "addu.ph $t2, $t1, $t3 \n" \
+ "addu.ph $t1, $t1, $t0 \n" \
+ "shra.ph $t2, $t2, 6 \n" \
+ "shra.ph $t1, $t1, 6 \n" \
+ "subu.ph $t5, $t5, $s5 \n" \
+ "subu.ph $t4, $t4, $s5 \n" \
+ "subu.ph $t9, $t9, $s5 \n" \
+ "subu.ph $t8, $t8, $s5 \n" \
+ "subu.ph $t2, $t2, $s5 \n" \
+ "subu.ph $t1, $t1, $s5 \n" \
+ "shll_s.ph $t5, $t5, 8 \n" \
+ "shll_s.ph $t4, $t4, 8 \n" \
+ "shll_s.ph $t9, $t9, 8 \n" \
+ "shll_s.ph $t8, $t8, 8 \n" \
+ "shll_s.ph $t2, $t2, 8 \n" \
+ "shll_s.ph $t1, $t1, 8 \n" \
+ "shra.ph $t5, $t5, 8 \n" \
+ "shra.ph $t4, $t4, 8 \n" \
+ "shra.ph $t9, $t9, 8 \n" \
+ "shra.ph $t8, $t8, 8 \n" \
+ "shra.ph $t2, $t2, 8 \n" \
+ "shra.ph $t1, $t1, 8 \n" \
+ "addu.ph $t5, $t5, $s5 \n" \
+ "addu.ph $t4, $t4, $s5 \n" \
+ "addu.ph $t9, $t9, $s5 \n" \
+ "addu.ph $t8, $t8, $s5 \n" \
+ "addu.ph $t2, $t2, $s5 \n" \
+ "addu.ph $t1, $t1, $s5 \n"
+
+void I422ToARGBRow_MIPS_DSPR2(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) {
+ __asm__ __volatile__ (
+ ".set push \n"
+ ".set noreorder \n"
+ "beqz %[width], 2f \n"
+ " repl.ph $s0, 74 \n" // |YG|YG| = |74|74|
+ "repl.ph $s1, -25 \n" // |UG|UG| = |-25|-25|
+ "repl.ph $s2, -52 \n" // |VG|VG| = |-52|-52|
+ "repl.ph $s3, 102 \n" // |VR|VR| = |102|102|
+ "repl.ph $s4, 16 \n" // |0|16|0|16|
+ "repl.ph $s5, 128 \n" // |128|128| // clipping
+ "lui $s6, 0xff00 \n"
+    "ori              $s6, 0xff00                    \n"  // |ff|00|ff|00|
+ "1: \n"
+ I422ToTransientMipsRGB
+// Arranging into argb format
+ "precr.qb.ph $t4, $t8, $t4 \n" // |G1|g1|B1|b1|
+ "precr.qb.ph $t5, $t9, $t5 \n" // |G0|g0|B0|b0|
+ "addiu %[width], -4 \n"
+ "precrq.qb.ph $t8, $t4, $t5 \n" // |G1|B1|G0|B0|
+ "precr.qb.ph $t9, $t4, $t5 \n" // |g1|b1|g0|b0|
+ "precr.qb.ph $t2, $t1, $t2 \n" // |R1|r1|R0|r0|
+
+ "addiu %[y_buf], 4 \n"
+ "preceu.ph.qbla $t1, $t2 \n" // |0 |R1|0 |R0|
+ "preceu.ph.qbra $t2, $t2 \n" // |0 |r1|0 |r0|
+ "or $t1, $t1, $s6 \n" // |ff|R1|ff|R0|
+ "or $t2, $t2, $s6 \n" // |ff|r1|ff|r0|
+ "precrq.ph.w $t0, $t2, $t9 \n" // |ff|r1|g1|b1|
+ "precrq.ph.w $t3, $t1, $t8 \n" // |ff|R1|G1|B1|
+ "sll $t9, $t9, 16 \n"
+ "sll $t8, $t8, 16 \n"
+ "packrl.ph $t2, $t2, $t9 \n" // |ff|r0|g0|b0|
+ "packrl.ph $t1, $t1, $t8 \n" // |ff|R0|G0|B0|
+// Store results.
+ "sw $t2, 0(%[rgb_buf]) \n"
+ "sw $t0, 4(%[rgb_buf]) \n"
+ "sw $t1, 8(%[rgb_buf]) \n"
+ "sw $t3, 12(%[rgb_buf]) \n"
+ "bnez %[width], 1b \n"
+ " addiu %[rgb_buf], 16 \n"
+ "2: \n"
+ ".set pop \n"
+ :[y_buf] "+r" (y_buf),
+ [u_buf] "+r" (u_buf),
+ [v_buf] "+r" (v_buf),
+ [width] "+r" (width),
+ [rgb_buf] "+r" (rgb_buf)
+ :
+ : "t0", "t1", "t2", "t3", "t4", "t5",
+ "t6", "t7", "t8", "t9",
+ "s0", "s1", "s2", "s3",
+ "s4", "s5", "s6"
+ );
+}
+
+void I422ToABGRRow_MIPS_DSPR2(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) {
+ __asm__ __volatile__ (
+ ".set push \n\t"
+ ".set noreorder \n\t"
+ "beqz %[width], 2f \n\t"
+ " repl.ph $s0, 74 \n\t" // |YG|YG| = |74|74|
+ "repl.ph $s1, -25 \n\t" // |UG|UG| = |-25|-25|
+ "repl.ph $s2, -52 \n\t" // |VG|VG| = |-52|-52|
+ "repl.ph $s3, 102 \n\t" // |VR|VR| = |102|102|
+ "repl.ph $s4, 16 \n\t" // |0|16|0|16|
+ "repl.ph $s5, 128 \n\t" // |128|128|
+ "lui $s6, 0xff00 \n\t"
+ "ori $s6, 0xff00 \n\t" // |ff|00|ff|00|
+ "1: \n"
+ I422ToTransientMipsRGB
+// Arranging into abgr format
+ "precr.qb.ph $t0, $t8, $t1 \n\t" // |G1|g1|R1|r1|
+ "precr.qb.ph $t3, $t9, $t2 \n\t" // |G0|g0|R0|r0|
+ "precrq.qb.ph $t8, $t0, $t3 \n\t" // |G1|R1|G0|R0|
+ "precr.qb.ph $t9, $t0, $t3 \n\t" // |g1|r1|g0|r0|
+
+ "precr.qb.ph $t2, $t4, $t5 \n\t" // |B1|b1|B0|b0|
+ "addiu %[width], -4 \n\t"
+ "addiu %[y_buf], 4 \n\t"
+ "preceu.ph.qbla $t1, $t2 \n\t" // |0 |B1|0 |B0|
+ "preceu.ph.qbra $t2, $t2 \n\t" // |0 |b1|0 |b0|
+ "or $t1, $t1, $s6 \n\t" // |ff|B1|ff|B0|
+ "or $t2, $t2, $s6 \n\t" // |ff|b1|ff|b0|
+ "precrq.ph.w $t0, $t2, $t9 \n\t" // |ff|b1|g1|r1|
+ "precrq.ph.w $t3, $t1, $t8 \n\t" // |ff|B1|G1|R1|
+ "sll $t9, $t9, 16 \n\t"
+ "sll $t8, $t8, 16 \n\t"
+ "packrl.ph $t2, $t2, $t9 \n\t" // |ff|b0|g0|r0|
+ "packrl.ph $t1, $t1, $t8 \n\t" // |ff|B0|G0|R0|
+// Store results.
+ "sw $t2, 0(%[rgb_buf]) \n\t"
+ "sw $t0, 4(%[rgb_buf]) \n\t"
+ "sw $t1, 8(%[rgb_buf]) \n\t"
+ "sw $t3, 12(%[rgb_buf]) \n\t"
+ "bnez %[width], 1b \n\t"
+ " addiu %[rgb_buf], 16 \n\t"
+ "2: \n\t"
+ ".set pop \n\t"
+ :[y_buf] "+r" (y_buf),
+ [u_buf] "+r" (u_buf),
+ [v_buf] "+r" (v_buf),
+ [width] "+r" (width),
+ [rgb_buf] "+r" (rgb_buf)
+ :
+ : "t0", "t1", "t2", "t3", "t4", "t5",
+ "t6", "t7", "t8", "t9",
+ "s0", "s1", "s2", "s3",
+ "s4", "s5", "s6"
+ );
+}
+
+void I422ToBGRARow_MIPS_DSPR2(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) {
+ __asm__ __volatile__ (
+ ".set push \n"
+ ".set noreorder \n"
+ "beqz %[width], 2f \n"
+ " repl.ph $s0, 74 \n" // |YG|YG| = |74 |74 |
+ "repl.ph $s1, -25 \n" // |UG|UG| = |-25|-25|
+ "repl.ph $s2, -52 \n" // |VG|VG| = |-52|-52|
+ "repl.ph $s3, 102 \n" // |VR|VR| = |102|102|
+ "repl.ph $s4, 16 \n" // |0|16|0|16|
+ "repl.ph $s5, 128 \n" // |128|128|
+ "lui $s6, 0xff \n"
+ "ori $s6, 0xff \n" // |00|ff|00|ff|
+ "1: \n"
+ I422ToTransientMipsRGB
+ // Arranging into bgra format
+ "precr.qb.ph $t4, $t4, $t8 \n" // |B1|b1|G1|g1|
+ "precr.qb.ph $t5, $t5, $t9 \n" // |B0|b0|G0|g0|
+ "precrq.qb.ph $t8, $t4, $t5 \n" // |B1|G1|B0|G0|
+ "precr.qb.ph $t9, $t4, $t5 \n" // |b1|g1|b0|g0|
+
+ "precr.qb.ph $t2, $t1, $t2 \n" // |R1|r1|R0|r0|
+ "addiu %[width], -4 \n"
+ "addiu %[y_buf], 4 \n"
+ "preceu.ph.qbla $t1, $t2 \n" // |0 |R1|0 |R0|
+ "preceu.ph.qbra $t2, $t2 \n" // |0 |r1|0 |r0|
+ "sll $t1, $t1, 8 \n" // |R1|0 |R0|0 |
+ "sll $t2, $t2, 8 \n" // |r1|0 |r0|0 |
+ "or $t1, $t1, $s6 \n" // |R1|ff|R0|ff|
+ "or $t2, $t2, $s6 \n" // |r1|ff|r0|ff|
+ "precrq.ph.w $t0, $t9, $t2 \n" // |b1|g1|r1|ff|
+ "precrq.ph.w $t3, $t8, $t1 \n" // |B1|G1|R1|ff|
+ "sll $t1, $t1, 16 \n"
+ "sll $t2, $t2, 16 \n"
+ "packrl.ph $t2, $t9, $t2 \n" // |b0|g0|r0|ff|
+ "packrl.ph $t1, $t8, $t1 \n" // |B0|G0|R0|ff|
+// Store results.
+ "sw $t2, 0(%[rgb_buf]) \n"
+ "sw $t0, 4(%[rgb_buf]) \n"
+ "sw $t1, 8(%[rgb_buf]) \n"
+ "sw $t3, 12(%[rgb_buf]) \n"
+ "bnez %[width], 1b \n"
+ " addiu %[rgb_buf], 16 \n"
+ "2: \n"
+ ".set pop \n"
+ :[y_buf] "+r" (y_buf),
+ [u_buf] "+r" (u_buf),
+ [v_buf] "+r" (v_buf),
+ [width] "+r" (width),
+ [rgb_buf] "+r" (rgb_buf)
+ :
+ : "t0", "t1", "t2", "t3", "t4", "t5",
+ "t6", "t7", "t8", "t9",
+ "s0", "s1", "s2", "s3",
+ "s4", "s5", "s6"
+ );
+}
+
+// Bilinear filter 8x2 -> 8x1
+void InterpolateRows_MIPS_DSPR2(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) {
+ int y0_fraction = 256 - source_y_fraction;
+ const uint8* src_ptr1 = src_ptr + src_stride;
+
+ __asm__ __volatile__ (
+ ".set push \n"
+ ".set noreorder \n"
+
+ "replv.ph $t0, %[y0_fraction] \n"
+ "replv.ph $t1, %[source_y_fraction] \n"
+ "1: \n"
+ "lw $t2, 0(%[src_ptr]) \n"
+ "lw $t3, 0(%[src_ptr1]) \n"
+ "lw $t4, 4(%[src_ptr]) \n"
+ "lw $t5, 4(%[src_ptr1]) \n"
+ "muleu_s.ph.qbl $t6, $t2, $t0 \n"
+ "muleu_s.ph.qbr $t7, $t2, $t0 \n"
+ "muleu_s.ph.qbl $t8, $t3, $t1 \n"
+ "muleu_s.ph.qbr $t9, $t3, $t1 \n"
+ "muleu_s.ph.qbl $t2, $t4, $t0 \n"
+ "muleu_s.ph.qbr $t3, $t4, $t0 \n"
+ "muleu_s.ph.qbl $t4, $t5, $t1 \n"
+ "muleu_s.ph.qbr $t5, $t5, $t1 \n"
+ "addq.ph $t6, $t6, $t8 \n"
+ "addq.ph $t7, $t7, $t9 \n"
+ "addq.ph $t2, $t2, $t4 \n"
+ "addq.ph $t3, $t3, $t5 \n"
+ "shra.ph $t6, $t6, 8 \n"
+ "shra.ph $t7, $t7, 8 \n"
+ "shra.ph $t2, $t2, 8 \n"
+ "shra.ph $t3, $t3, 8 \n"
+ "precr.qb.ph $t6, $t6, $t7 \n"
+ "precr.qb.ph $t2, $t2, $t3 \n"
+ "addiu %[src_ptr], %[src_ptr], 8 \n"
+ "addiu %[src_ptr1], %[src_ptr1], 8 \n"
+ "addiu %[dst_width], %[dst_width], -8 \n"
+ "sw $t6, 0(%[dst_ptr]) \n"
+ "sw $t2, 4(%[dst_ptr]) \n"
+ "bgtz %[dst_width], 1b \n"
+ " addiu %[dst_ptr], %[dst_ptr], 8 \n"
+
+ ".set pop \n"
+ : [dst_ptr] "+r" (dst_ptr),
+ [src_ptr1] "+r" (src_ptr1),
+ [src_ptr] "+r" (src_ptr),
+ [dst_width] "+r" (dst_width)
+ : [source_y_fraction] "r" (source_y_fraction),
+ [y0_fraction] "r" (y0_fraction),
+ [src_stride] "r" (src_stride)
+ : "t0", "t1", "t2", "t3", "t4", "t5",
+ "t6", "t7", "t8", "t9"
+ );
+}
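+
+// For reference only: each output byte above is an 8.8 fixed-point blend of
+// the two source rows, roughly equivalent to this plain-C sketch
+// (hypothetical helper name, not used by the assembly path):
+static void InterpolateRow_Sketch(uint8* dst, const uint8* src0,
+                                  const uint8* src1, int width, int fraction) {
+  int y0 = 256 - fraction;
+  for (int i = 0; i < width; ++i) {
+    dst[i] = (uint8)((src0[i] * y0 + src1[i] * fraction) >> 8);
+  }
+}
+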
+#endif // __mips_dsp_rev >= 2
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/chromium/third_party/libyuv/source/row_neon.cc b/chromium/third_party/libyuv/source/row_neon.cc
new file mode 100644
index 00000000000..0bb55e717be
--- /dev/null
+++ b/chromium/third_party/libyuv/source/row_neon.cc
@@ -0,0 +1,2741 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC Neon
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)
+
+// Read 8 Y, 4 U and 4 V from 422
+#define READYUV422 \
+ "vld1.8 {d0}, [%0]! \n" \
+ "vld1.32 {d2[0]}, [%1]! \n" \
+ "vld1.32 {d2[1]}, [%2]! \n"
+
+// Read 8 Y, 2 U and 2 V from 411
+#define READYUV411 \
+ "vld1.8 {d0}, [%0]! \n" \
+ "vld1.16 {d2[0]}, [%1]! \n" \
+ "vld1.16 {d2[1]}, [%2]! \n" \
+ "vmov.u8 d3, d2 \n" \
+ "vzip.u8 d2, d3 \n"
+
+// Read 8 Y, 8 U and 8 V from 444
+#define READYUV444 \
+ "vld1.8 {d0}, [%0]! \n" \
+ "vld1.8 {d2}, [%1]! \n" \
+ "vld1.8 {d3}, [%2]! \n" \
+ "vpaddl.u8 q1, q1 \n" \
+ "vrshrn.u16 d2, q1, #1 \n"
+
+// Read 8 Y, and set 4 U and 4 V to 128
+#define READYUV400 \
+ "vld1.8 {d0}, [%0]! \n" \
+ "vmov.u8 d2, #128 \n"
+
+// Read 8 Y and 4 UV from NV12
+#define READNV12 \
+ "vld1.8 {d0}, [%0]! \n" \
+ "vld1.8 {d2}, [%1]! \n" \
+ "vmov.u8 d3, d2 \n"/* split odd/even uv apart */\
+ "vuzp.u8 d2, d3 \n" \
+ "vtrn.u32 d2, d3 \n"
+
+// Read 8 Y and 4 VU from NV21
+#define READNV21 \
+ "vld1.8 {d0}, [%0]! \n" \
+ "vld1.8 {d2}, [%1]! \n" \
+ "vmov.u8 d3, d2 \n"/* split odd/even uv apart */\
+ "vuzp.u8 d3, d2 \n" \
+ "vtrn.u32 d2, d3 \n"
+
+// Read 8 YUY2
+#define READYUY2 \
+ "vld2.8 {d0, d2}, [%0]! \n" \
+ "vmov.u8 d3, d2 \n" \
+ "vuzp.u8 d2, d3 \n" \
+ "vtrn.u32 d2, d3 \n"
+
+// Read 8 UYVY
+#define READUYVY \
+ "vld2.8 {d2, d3}, [%0]! \n" \
+ "vmov.u8 d0, d3 \n" \
+ "vmov.u8 d3, d2 \n" \
+ "vuzp.u8 d2, d3 \n" \
+ "vtrn.u32 d2, d3 \n"
+
+#define YUV422TORGB \
+ "veor.u8 d2, d26 \n"/*subtract 128 from u and v*/\
+ "vmull.s8 q8, d2, d24 \n"/* u/v B/R component */\
+ "vmull.s8 q9, d2, d25 \n"/* u/v G component */\
+ "vmov.u8 d1, #0 \n"/* split odd/even y apart */\
+ "vtrn.u8 d0, d1 \n" \
+ "vsub.s16 q0, q0, q15 \n"/* offset y */\
+ "vmul.s16 q0, q0, q14 \n" \
+ "vadd.s16 d18, d19 \n" \
+ "vqadd.s16 d20, d0, d16 \n" /* B */ \
+ "vqadd.s16 d21, d1, d16 \n" \
+ "vqadd.s16 d22, d0, d17 \n" /* R */ \
+ "vqadd.s16 d23, d1, d17 \n" \
+ "vqadd.s16 d16, d0, d18 \n" /* G */ \
+ "vqadd.s16 d17, d1, d18 \n" \
+ "vqshrun.s16 d0, q10, #6 \n" /* B */ \
+ "vqshrun.s16 d1, q11, #6 \n" /* G */ \
+ "vqshrun.s16 d2, q8, #6 \n" /* R */ \
+ "vmovl.u8 q10, d0 \n"/* set up for reinterleave*/\
+ "vmovl.u8 q11, d1 \n" \
+ "vmovl.u8 q8, d2 \n" \
+ "vtrn.u8 d20, d21 \n" \
+ "vtrn.u8 d22, d23 \n" \
+ "vtrn.u8 d16, d17 \n" \
+ "vmov.u8 d21, d16 \n"
+
+static const vec8 kUVToRB = { 127, 127, 127, 127, 102, 102, 102, 102,
+ 0, 0, 0, 0, 0, 0, 0, 0 };
+static const vec8 kUVToG = { -25, -25, -25, -25, -52, -52, -52, -52,
+ 0, 0, 0, 0, 0, 0, 0, 0 };
+
+void I444ToARGBRow_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width) {
+ asm volatile (
+ "vld1.8 {d24}, [%5] \n"
+ "vld1.8 {d25}, [%6] \n"
+ "vmov.u8 d26, #128 \n"
+ "vmov.u16 q14, #74 \n"
+ "vmov.u16 q15, #16 \n"
+ ".p2align 2 \n"
+ "1: \n"
+ READYUV444
+ YUV422TORGB
+ "subs %4, %4, #8 \n"
+ "vmov.u8 d23, #255 \n"
+ "vst4.8 {d20, d21, d22, d23}, [%3]! \n"
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_argb), // %3
+ "+r"(width) // %4
+ : "r"(&kUVToRB), // %5
+ "r"(&kUVToG) // %6
+ : "cc", "memory", "q0", "q1", "q2", "q3",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+void I422ToARGBRow_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width) {
+ asm volatile (
+ "vld1.8 {d24}, [%5] \n"
+ "vld1.8 {d25}, [%6] \n"
+ "vmov.u8 d26, #128 \n"
+ "vmov.u16 q14, #74 \n"
+ "vmov.u16 q15, #16 \n"
+ ".p2align 2 \n"
+ "1: \n"
+ READYUV422
+ YUV422TORGB
+ "subs %4, %4, #8 \n"
+ "vmov.u8 d23, #255 \n"
+ "vst4.8 {d20, d21, d22, d23}, [%3]! \n"
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_argb), // %3
+ "+r"(width) // %4
+ : "r"(&kUVToRB), // %5
+ "r"(&kUVToG) // %6
+ : "cc", "memory", "q0", "q1", "q2", "q3",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+void I411ToARGBRow_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width) {
+ asm volatile (
+ "vld1.8 {d24}, [%5] \n"
+ "vld1.8 {d25}, [%6] \n"
+ "vmov.u8 d26, #128 \n"
+ "vmov.u16 q14, #74 \n"
+ "vmov.u16 q15, #16 \n"
+ ".p2align 2 \n"
+ "1: \n"
+ READYUV411
+ YUV422TORGB
+ "subs %4, %4, #8 \n"
+ "vmov.u8 d23, #255 \n"
+ "vst4.8 {d20, d21, d22, d23}, [%3]! \n"
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_argb), // %3
+ "+r"(width) // %4
+ : "r"(&kUVToRB), // %5
+ "r"(&kUVToG) // %6
+ : "cc", "memory", "q0", "q1", "q2", "q3",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+void I422ToBGRARow_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_bgra,
+ int width) {
+ asm volatile (
+ "vld1.8 {d24}, [%5] \n"
+ "vld1.8 {d25}, [%6] \n"
+ "vmov.u8 d26, #128 \n"
+ "vmov.u16 q14, #74 \n"
+ "vmov.u16 q15, #16 \n"
+ ".p2align 2 \n"
+ "1: \n"
+ READYUV422
+ YUV422TORGB
+ "subs %4, %4, #8 \n"
+ "vswp.u8 d20, d22 \n"
+ "vmov.u8 d19, #255 \n"
+ "vst4.8 {d19, d20, d21, d22}, [%3]! \n"
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_bgra), // %3
+ "+r"(width) // %4
+ : "r"(&kUVToRB), // %5
+ "r"(&kUVToG) // %6
+ : "cc", "memory", "q0", "q1", "q2", "q3",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+void I422ToABGRRow_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_abgr,
+ int width) {
+ asm volatile (
+ "vld1.8 {d24}, [%5] \n"
+ "vld1.8 {d25}, [%6] \n"
+ "vmov.u8 d26, #128 \n"
+ "vmov.u16 q14, #74 \n"
+ "vmov.u16 q15, #16 \n"
+ ".p2align 2 \n"
+ "1: \n"
+ READYUV422
+ YUV422TORGB
+ "subs %4, %4, #8 \n"
+ "vswp.u8 d20, d22 \n"
+ "vmov.u8 d23, #255 \n"
+ "vst4.8 {d20, d21, d22, d23}, [%3]! \n"
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_abgr), // %3
+ "+r"(width) // %4
+ : "r"(&kUVToRB), // %5
+ "r"(&kUVToG) // %6
+ : "cc", "memory", "q0", "q1", "q2", "q3",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+void I422ToRGBARow_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgba,
+ int width) {
+ asm volatile (
+ "vld1.8 {d24}, [%5] \n"
+ "vld1.8 {d25}, [%6] \n"
+ "vmov.u8 d26, #128 \n"
+ "vmov.u16 q14, #74 \n"
+ "vmov.u16 q15, #16 \n"
+ ".p2align 2 \n"
+ "1: \n"
+ READYUV422
+ YUV422TORGB
+ "subs %4, %4, #8 \n"
+ "vmov.u8 d19, #255 \n"
+ "vst4.8 {d19, d20, d21, d22}, [%3]! \n"
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_rgba), // %3
+ "+r"(width) // %4
+ : "r"(&kUVToRB), // %5
+ "r"(&kUVToG) // %6
+ : "cc", "memory", "q0", "q1", "q2", "q3",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+void I422ToRGB24Row_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgb24,
+ int width) {
+ asm volatile (
+ "vld1.8 {d24}, [%5] \n"
+ "vld1.8 {d25}, [%6] \n"
+ "vmov.u8 d26, #128 \n"
+ "vmov.u16 q14, #74 \n"
+ "vmov.u16 q15, #16 \n"
+ ".p2align 2 \n"
+ "1: \n"
+ READYUV422
+ YUV422TORGB
+ "subs %4, %4, #8 \n"
+ "vst3.8 {d20, d21, d22}, [%3]! \n"
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_rgb24), // %3
+ "+r"(width) // %4
+ : "r"(&kUVToRB), // %5
+ "r"(&kUVToG) // %6
+ : "cc", "memory", "q0", "q1", "q2", "q3",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+void I422ToRAWRow_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_raw,
+ int width) {
+ asm volatile (
+ "vld1.8 {d24}, [%5] \n"
+ "vld1.8 {d25}, [%6] \n"
+ "vmov.u8 d26, #128 \n"
+ "vmov.u16 q14, #74 \n"
+ "vmov.u16 q15, #16 \n"
+ ".p2align 2 \n"
+ "1: \n"
+ READYUV422
+ YUV422TORGB
+ "subs %4, %4, #8 \n"
+ "vswp.u8 d20, d22 \n"
+ "vst3.8 {d20, d21, d22}, [%3]! \n"
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_raw), // %3
+ "+r"(width) // %4
+ : "r"(&kUVToRB), // %5
+ "r"(&kUVToG) // %6
+ : "cc", "memory", "q0", "q1", "q2", "q3",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+#define ARGBTORGB565 \
+ "vshr.u8 d20, d20, #3 \n" /* B */ \
+ "vshr.u8 d21, d21, #2 \n" /* G */ \
+ "vshr.u8 d22, d22, #3 \n" /* R */ \
+ "vmovl.u8 q8, d20 \n" /* B */ \
+ "vmovl.u8 q9, d21 \n" /* G */ \
+ "vmovl.u8 q10, d22 \n" /* R */ \
+ "vshl.u16 q9, q9, #5 \n" /* G */ \
+ "vshl.u16 q10, q10, #11 \n" /* R */ \
+ "vorr q0, q8, q9 \n" /* BG */ \
+ "vorr q0, q0, q10 \n" /* BGR */
+
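+// For reference only: the packing above corresponds roughly to this plain-C
+// sketch (hypothetical helper, not used by the assembly path):
+static uint16 PackRGB565_Sketch(uint8 b, uint8 g, uint8 r) {
+  return (uint16)((b >> 3) | ((g >> 2) << 5) | ((r >> 3) << 11));
+}
+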
+void I422ToRGB565Row_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgb565,
+ int width) {
+ asm volatile (
+ "vld1.8 {d24}, [%5] \n"
+ "vld1.8 {d25}, [%6] \n"
+ "vmov.u8 d26, #128 \n"
+ "vmov.u16 q14, #74 \n"
+ "vmov.u16 q15, #16 \n"
+ ".p2align 2 \n"
+ "1: \n"
+ READYUV422
+ YUV422TORGB
+ "subs %4, %4, #8 \n"
+ ARGBTORGB565
+ "vst1.8 {q0}, [%3]! \n" // store 8 pixels RGB565.
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_rgb565), // %3
+ "+r"(width) // %4
+ : "r"(&kUVToRB), // %5
+ "r"(&kUVToG) // %6
+ : "cc", "memory", "q0", "q1", "q2", "q3",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+#define ARGBTOARGB1555 \
+ "vshr.u8 q10, q10, #3 \n" /* B */ \
+ "vshr.u8 d22, d22, #3 \n" /* R */ \
+ "vshr.u8 d23, d23, #7 \n" /* A */ \
+ "vmovl.u8 q8, d20 \n" /* B */ \
+ "vmovl.u8 q9, d21 \n" /* G */ \
+ "vmovl.u8 q10, d22 \n" /* R */ \
+ "vmovl.u8 q11, d23 \n" /* A */ \
+ "vshl.u16 q9, q9, #5 \n" /* G */ \
+ "vshl.u16 q10, q10, #10 \n" /* R */ \
+ "vshl.u16 q11, q11, #15 \n" /* A */ \
+ "vorr q0, q8, q9 \n" /* BG */ \
+ "vorr q1, q10, q11 \n" /* RA */ \
+ "vorr q0, q0, q1 \n" /* BGRA */
+
+void I422ToARGB1555Row_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb1555,
+ int width) {
+ asm volatile (
+ "vld1.8 {d24}, [%5] \n"
+ "vld1.8 {d25}, [%6] \n"
+ "vmov.u8 d26, #128 \n"
+ "vmov.u16 q14, #74 \n"
+ "vmov.u16 q15, #16 \n"
+ ".p2align 2 \n"
+ "1: \n"
+ READYUV422
+ YUV422TORGB
+ "subs %4, %4, #8 \n"
+ "vmov.u8 d23, #255 \n"
+ ARGBTOARGB1555
+ "vst1.8 {q0}, [%3]! \n" // store 8 pixels ARGB1555.
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_argb1555), // %3
+ "+r"(width) // %4
+ : "r"(&kUVToRB), // %5
+ "r"(&kUVToG) // %6
+ : "cc", "memory", "q0", "q1", "q2", "q3",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+#define ARGBTOARGB4444 \
+ "vshr.u8 d20, d20, #4 \n" /* B */ \
+ "vbic.32 d21, d21, d4 \n" /* G */ \
+ "vshr.u8 d22, d22, #4 \n" /* R */ \
+ "vbic.32 d23, d23, d4 \n" /* A */ \
+ "vorr d0, d20, d21 \n" /* BG */ \
+ "vorr d1, d22, d23 \n" /* RA */ \
+ "vzip.u8 d0, d1 \n" /* BGRA */
+
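+// For reference only: the nibble packing above corresponds roughly to this
+// plain-C sketch (hypothetical helper, not used by the assembly path):
+static uint16 PackARGB4444_Sketch(uint8 b, uint8 g, uint8 r, uint8 a) {
+  return (uint16)((b >> 4) | (g & 0xf0) | ((r >> 4) << 8) | ((a & 0xf0) << 8));
+}
+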
+void I422ToARGB4444Row_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb4444,
+ int width) {
+ asm volatile (
+ "vld1.8 {d24}, [%5] \n"
+ "vld1.8 {d25}, [%6] \n"
+ "vmov.u8 d26, #128 \n"
+ "vmov.u16 q14, #74 \n"
+ "vmov.u16 q15, #16 \n"
+ "vmov.u8 d4, #0x0f \n" // bits to clear with vbic.
+ ".p2align 2 \n"
+ "1: \n"
+ READYUV422
+ YUV422TORGB
+ "subs %4, %4, #8 \n"
+ "vmov.u8 d23, #255 \n"
+ ARGBTOARGB4444
+ "vst1.8 {q0}, [%3]! \n" // store 8 pixels ARGB4444.
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_argb4444), // %3
+ "+r"(width) // %4
+ : "r"(&kUVToRB), // %5
+ "r"(&kUVToG) // %6
+ : "cc", "memory", "q0", "q1", "q2", "q3",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+void YToARGBRow_NEON(const uint8* src_y,
+ uint8* dst_argb,
+ int width) {
+ asm volatile (
+ "vld1.8 {d24}, [%3] \n"
+ "vld1.8 {d25}, [%4] \n"
+ "vmov.u8 d26, #128 \n"
+ "vmov.u16 q14, #74 \n"
+ "vmov.u16 q15, #16 \n"
+ ".p2align 2 \n"
+ "1: \n"
+ READYUV400
+ YUV422TORGB
+ "subs %2, %2, #8 \n"
+ "vmov.u8 d23, #255 \n"
+ "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(&kUVToRB), // %3
+ "r"(&kUVToG) // %4
+ : "cc", "memory", "q0", "q1", "q2", "q3",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+void I400ToARGBRow_NEON(const uint8* src_y,
+ uint8* dst_argb,
+ int width) {
+ asm volatile (
+ ".p2align 2 \n"
+ "vmov.u8 d23, #255 \n"
+ "1: \n"
+ "vld1.8 {d20}, [%0]! \n"
+ "vmov d21, d20 \n"
+ "vmov d22, d20 \n"
+ "subs %2, %2, #8 \n"
+ "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "d20", "d21", "d22", "d23"
+ );
+}
+
+void NV12ToARGBRow_NEON(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_argb,
+ int width) {
+ asm volatile (
+ "vld1.8 {d24}, [%4] \n"
+ "vld1.8 {d25}, [%5] \n"
+ "vmov.u8 d26, #128 \n"
+ "vmov.u16 q14, #74 \n"
+ "vmov.u16 q15, #16 \n"
+ ".p2align 2 \n"
+ "1: \n"
+ READNV12
+ YUV422TORGB
+ "subs %3, %3, #8 \n"
+ "vmov.u8 d23, #255 \n"
+ "vst4.8 {d20, d21, d22, d23}, [%2]! \n"
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_uv), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ : "r"(&kUVToRB), // %4
+ "r"(&kUVToG) // %5
+ : "cc", "memory", "q0", "q1", "q2", "q3",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+void NV21ToARGBRow_NEON(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_argb,
+ int width) {
+ asm volatile (
+ "vld1.8 {d24}, [%4] \n"
+ "vld1.8 {d25}, [%5] \n"
+ "vmov.u8 d26, #128 \n"
+ "vmov.u16 q14, #74 \n"
+ "vmov.u16 q15, #16 \n"
+ ".p2align 2 \n"
+ "1: \n"
+ READNV21
+ YUV422TORGB
+ "subs %3, %3, #8 \n"
+ "vmov.u8 d23, #255 \n"
+ "vst4.8 {d20, d21, d22, d23}, [%2]! \n"
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_uv), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ : "r"(&kUVToRB), // %4
+ "r"(&kUVToG) // %5
+ : "cc", "memory", "q0", "q1", "q2", "q3",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+void NV12ToRGB565Row_NEON(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_rgb565,
+ int width) {
+ asm volatile (
+ "vld1.8 {d24}, [%4] \n"
+ "vld1.8 {d25}, [%5] \n"
+ "vmov.u8 d26, #128 \n"
+ "vmov.u16 q14, #74 \n"
+ "vmov.u16 q15, #16 \n"
+ ".p2align 2 \n"
+ "1: \n"
+ READNV12
+ YUV422TORGB
+ "subs %3, %3, #8 \n"
+ ARGBTORGB565
+ "vst1.8 {q0}, [%2]! \n" // store 8 pixels RGB565.
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_uv), // %1
+ "+r"(dst_rgb565), // %2
+ "+r"(width) // %3
+ : "r"(&kUVToRB), // %4
+ "r"(&kUVToG) // %5
+ : "cc", "memory", "q0", "q1", "q2", "q3",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+void NV21ToRGB565Row_NEON(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_rgb565,
+ int width) {
+ asm volatile (
+ "vld1.8 {d24}, [%4] \n"
+ "vld1.8 {d25}, [%5] \n"
+ "vmov.u8 d26, #128 \n"
+ "vmov.u16 q14, #74 \n"
+ "vmov.u16 q15, #16 \n"
+ ".p2align 2 \n"
+ "1: \n"
+ READNV21
+ YUV422TORGB
+ "subs %3, %3, #8 \n"
+ ARGBTORGB565
+ "vst1.8 {q0}, [%2]! \n" // store 8 pixels RGB565.
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_uv), // %1
+ "+r"(dst_rgb565), // %2
+ "+r"(width) // %3
+ : "r"(&kUVToRB), // %4
+ "r"(&kUVToG) // %5
+ : "cc", "memory", "q0", "q1", "q2", "q3",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
+ uint8* dst_argb,
+ int width) {
+ asm volatile (
+ "vld1.8 {d24}, [%3] \n"
+ "vld1.8 {d25}, [%4] \n"
+ "vmov.u8 d26, #128 \n"
+ "vmov.u16 q14, #74 \n"
+ "vmov.u16 q15, #16 \n"
+ ".p2align 2 \n"
+ "1: \n"
+ READYUY2
+ YUV422TORGB
+ "subs %2, %2, #8 \n"
+ "vmov.u8 d23, #255 \n"
+ "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(&kUVToRB), // %3
+ "r"(&kUVToG) // %4
+ : "cc", "memory", "q0", "q1", "q2", "q3",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+void UYVYToARGBRow_NEON(const uint8* src_uyvy,
+ uint8* dst_argb,
+ int width) {
+ asm volatile (
+ "vld1.8 {d24}, [%3] \n"
+ "vld1.8 {d25}, [%4] \n"
+ "vmov.u8 d26, #128 \n"
+ "vmov.u16 q14, #74 \n"
+ "vmov.u16 q15, #16 \n"
+ ".p2align 2 \n"
+ "1: \n"
+ READUYVY
+ YUV422TORGB
+ "subs %2, %2, #8 \n"
+ "vmov.u8 d23, #255 \n"
+ "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(&kUVToRB), // %3
+ "r"(&kUVToG) // %4
+ : "cc", "memory", "q0", "q1", "q2", "q3",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+// Reads 16 pairs of UV and writes even values to dst_u and odd to dst_v.
+void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+ int width) {
+ asm volatile (
+ ".p2align 2 \n"
+ "1: \n"
+ "vld2.8 {q0, q1}, [%0]! \n" // load 16 pairs of UV
+ "subs %3, %3, #16 \n" // 16 processed per loop
+ "vst1.8 {q0}, [%1]! \n" // store U
+ "vst1.8 {q1}, [%2]! \n" // store V
+ "bgt 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3 // Output registers
+ : // Input registers
+ : "cc", "memory", "q0", "q1" // Clobber List
+ );
+}
+
+// Reads 16 U's and V's and writes out 16 pairs of UV.
+void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+ int width) {
+ asm volatile (
+ ".p2align 2 \n"
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n" // load U
+ "vld1.8 {q1}, [%1]! \n" // load V
+ "subs %3, %3, #16 \n" // 16 processed per loop
+ "vst2.u8 {q0, q1}, [%2]! \n" // store 16 pairs of UV
+ "bgt 1b \n"
+  : "+r"(src_u),   // %0
+ "+r"(src_v), // %1
+ "+r"(dst_uv), // %2
+ "+r"(width) // %3 // Output registers
+ : // Input registers
+ : "cc", "memory", "q0", "q1" // Clobber List
+ );
+}
+
+// Copy multiples of 32 bytes. vld1.8 allows unaligned access and is fastest on A15.
+void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
+ asm volatile (
+ ".p2align 2 \n"
+ "1: \n"
+ "vld1.8 {d0, d1, d2, d3}, [%0]! \n" // load 32
+ "subs %2, %2, #32 \n" // 32 processed per loop
+ "vst1.8 {d0, d1, d2, d3}, [%1]! \n" // store 32
+ "bgt 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(count) // %2 // Output registers
+ : // Input registers
+ : "cc", "memory", "q0", "q1" // Clobber List
+ );
+}
+
+// SetRow writes 'count' bytes using a 32 bit value repeated.
+void SetRow_NEON(uint8* dst, uint32 v32, int count) {
+ asm volatile (
+ "vdup.u32 q0, %2 \n" // duplicate 4 ints
+ "1: \n"
+ "subs %1, %1, #16 \n" // 16 bytes per loop
+ "vst1.8 {q0}, [%0]! \n" // store
+ "bgt 1b \n"
+ : "+r"(dst), // %0
+ "+r"(count) // %1
+ : "r"(v32) // %2
+ : "cc", "memory", "q0"
+ );
+}
+
+// TODO(fbarchard): Make fully assembler
+// ARGBSetRows writes a 32 bit value to 'width' words on each of 'height' rows.
+void ARGBSetRows_NEON(uint8* dst, uint32 v32, int width,
+ int dst_stride, int height) {
+ for (int y = 0; y < height; ++y) {
+ SetRow_NEON(dst, v32, width << 2);
+ dst += dst_stride;
+ }
+}
+
+void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
+ asm volatile (
+ // Start at end of source row.
+ "mov r3, #-16 \n"
+ "add %0, %0, %2 \n"
+ "sub %0, #16 \n"
+
+ ".p2align 2 \n"
+ "1: \n"
+ "vld1.8 {q0}, [%0], r3 \n" // src -= 16
+ "subs %2, #16 \n" // 16 pixels per loop.
+ "vrev64.8 q0, q0 \n"
+ "vst1.8 {d1}, [%1]! \n" // dst += 16
+ "vst1.8 {d0}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "r3", "q0"
+ );
+}
+
+void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+ int width) {
+ asm volatile (
+ // Start at end of source row.
+ "mov r12, #-16 \n"
+ "add %0, %0, %3, lsl #1 \n"
+ "sub %0, #16 \n"
+
+ ".p2align 2 \n"
+ "1: \n"
+ "vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16
+ "subs %3, #8 \n" // 8 pixels per loop.
+ "vrev64.8 q0, q0 \n"
+ "vst1.8 {d0}, [%1]! \n" // dst += 8
+ "vst1.8 {d1}, [%2]! \n"
+ "bgt 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "r12", "q0"
+ );
+}
+
+void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
+ asm volatile (
+ // Start at end of source row.
+ "mov r3, #-16 \n"
+ "add %0, %0, %2, lsl #2 \n"
+ "sub %0, #16 \n"
+
+ ".p2align 2 \n"
+ "1: \n"
+ "vld1.8 {q0}, [%0], r3 \n" // src -= 16
+ "subs %2, #4 \n" // 4 pixels per loop.
+ "vrev64.32 q0, q0 \n"
+ "vst1.8 {d1}, [%1]! \n" // dst += 16
+ "vst1.8 {d0}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "r3", "q0"
+ );
+}
+
+void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) {
+ asm volatile (
+ "vmov.u8 d4, #255 \n" // Alpha
+ ".p2align 2 \n"
+ "1: \n"
+ "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RGB24.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
+ : "+r"(src_rgb24), // %0
+ "+r"(dst_argb), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List
+ );
+}
+
+void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) {
+ asm volatile (
+ "vmov.u8 d4, #255 \n" // Alpha
+ ".p2align 2 \n"
+ "1: \n"
+ "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vswp.u8 d1, d3 \n" // swap R, B
+ "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
+ : "+r"(src_raw), // %0
+ "+r"(dst_argb), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List
+ );
+}
+
+#define RGB565TOARGB \
+ "vshrn.u16 d6, q0, #5 \n" /* G xxGGGGGG */ \
+ "vuzp.u8 d0, d1 \n" /* d0 xxxBBBBB RRRRRxxx */ \
+ "vshl.u8 d6, d6, #2 \n" /* G GGGGGG00 upper 6 */ \
+ "vshr.u8 d1, d1, #3 \n" /* R 000RRRRR lower 5 */ \
+ "vshl.u8 q0, q0, #3 \n" /* B,R BBBBB000 upper 5 */ \
+ "vshr.u8 q2, q0, #5 \n" /* B,R 00000BBB lower 3 */ \
+ "vorr.u8 d0, d0, d4 \n" /* B */ \
+ "vshr.u8 d4, d6, #6 \n" /* G 000000GG lower 2 */ \
+ "vorr.u8 d2, d1, d5 \n" /* R */ \
+ "vorr.u8 d1, d4, d6 \n" /* G */
+
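+// For reference only: the unpack above widens each channel by shifting its
+// bits to the top of a byte and OR-ing the top bits back into the low bits,
+// roughly this plain-C sketch (hypothetical helper, not used by the assembly
+// path):
+static void UnpackRGB565_Sketch(uint16 rgb565, uint8* b, uint8* g, uint8* r) {
+  uint8 b5 = (uint8)(rgb565 & 0x1f);
+  uint8 g6 = (uint8)((rgb565 >> 5) & 0x3f);
+  uint8 r5 = (uint8)(rgb565 >> 11);
+  *b = (uint8)((b5 << 3) | (b5 >> 2));
+  *g = (uint8)((g6 << 2) | (g6 >> 4));
+  *r = (uint8)((r5 << 3) | (r5 >> 2));
+}
+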
+void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int pix) {
+ asm volatile (
+ "vmov.u8 d3, #255 \n" // Alpha
+ ".p2align 2 \n"
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ RGB565TOARGB
+ "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
+ : "+r"(src_rgb565), // %0
+ "+r"(dst_argb), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List
+ );
+}
+
+#define ARGB1555TOARGB \
+ "vshrn.u16 d7, q0, #8 \n" /* A Arrrrrxx */ \
+ "vshr.u8 d6, d7, #2 \n" /* R xxxRRRRR */ \
+ "vshrn.u16 d5, q0, #5 \n" /* G xxxGGGGG */ \
+ "vmovn.u16 d4, q0 \n" /* B xxxBBBBB */ \
+ "vshr.u8 d7, d7, #7 \n" /* A 0000000A */ \
+ "vneg.s8 d7, d7 \n" /* A AAAAAAAA upper 8 */ \
+ "vshl.u8 d6, d6, #3 \n" /* R RRRRR000 upper 5 */ \
+ "vshr.u8 q1, q3, #5 \n" /* R,A 00000RRR lower 3 */ \
+ "vshl.u8 q0, q2, #3 \n" /* B,G BBBBB000 upper 5 */ \
+ "vshr.u8 q2, q0, #5 \n" /* B,G 00000BBB lower 3 */ \
+ "vorr.u8 q1, q1, q3 \n" /* R,A */ \
+ "vorr.u8 q0, q0, q2 \n" /* B,G */ \
+
+// RGB555TOARGB is the same as ARGB1555TOARGB but ignores alpha.
+#define RGB555TOARGB \
+ "vshrn.u16 d6, q0, #5 \n" /* G xxxGGGGG */ \
+ "vuzp.u8 d0, d1 \n" /* d0 xxxBBBBB xRRRRRxx */ \
+ "vshl.u8 d6, d6, #3 \n" /* G GGGGG000 upper 5 */ \
+ "vshr.u8 d1, d1, #2 \n" /* R 00xRRRRR lower 5 */ \
+ "vshl.u8 q0, q0, #3 \n" /* B,R BBBBB000 upper 5 */ \
+ "vshr.u8 q2, q0, #5 \n" /* B,R 00000BBB lower 3 */ \
+ "vorr.u8 d0, d0, d4 \n" /* B */ \
+ "vshr.u8 d4, d6, #5 \n" /* G 00000GGG lower 3 */ \
+ "vorr.u8 d2, d1, d5 \n" /* R */ \
+ "vorr.u8 d1, d4, d6 \n" /* G */
+
+void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb,
+ int pix) {
+ asm volatile (
+ "vmov.u8 d3, #255 \n" // Alpha
+ ".p2align 2 \n"
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ ARGB1555TOARGB
+ "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
+ : "+r"(src_argb1555), // %0
+ "+r"(dst_argb), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List
+ );
+}
+
+#define ARGB4444TOARGB \
+ "vuzp.u8 d0, d1 \n" /* d0 BG, d1 RA */ \
+ "vshl.u8 q2, q0, #4 \n" /* B,R BBBB0000 */ \
+ "vshr.u8 q1, q0, #4 \n" /* G,A 0000GGGG */ \
+ "vshr.u8 q0, q2, #4 \n" /* B,R 0000BBBB */ \
+ "vorr.u8 q0, q0, q2 \n" /* B,R BBBBBBBB */ \
+ "vshl.u8 q2, q1, #4 \n" /* G,A GGGG0000 */ \
+ "vorr.u8 q1, q1, q2 \n" /* G,A GGGGGGGG */ \
+ "vswp.u8 d1, d2 \n" /* B,R,G,A -> B,G,R,A */
+
+void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb,
+ int pix) {
+ asm volatile (
+ "vmov.u8 d3, #255 \n" // Alpha
+ ".p2align 2 \n"
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ ARGB4444TOARGB
+ "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
+ : "+r"(src_argb4444), // %0
+ "+r"(dst_argb), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2" // Clobber List
+ );
+}
+
+void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) {
+ asm volatile (
+ ".p2align 2 \n"
+ "1: \n"
+ "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RGB24.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_rgb24), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List
+ );
+}
+
+void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) {
+ asm volatile (
+ ".p2align 2 \n"
+ "1: \n"
+ "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vswp.u8 d1, d3 \n" // swap R, B
+ "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RAW.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_raw), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List
+ );
+}
+
+void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) {
+ asm volatile (
+ ".p2align 2 \n"
+ "1: \n"
+ "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2.
+ "subs %2, %2, #16 \n" // 16 processed per loop.
+ "vst1.8 {q0}, [%1]! \n" // store 16 pixels of Y.
+ "bgt 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "q0", "q1" // Clobber List
+ );
+}
+
+void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) {
+ asm volatile (
+ ".p2align 2 \n"
+ "1: \n"
+ "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY.
+ "subs %2, %2, #16 \n" // 16 processed per loop.
+ "vst1.8 {q1}, [%1]! \n" // store 16 pixels of Y.
+ "bgt 1b \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "q0", "q1" // Clobber List
+ );
+}
+
+void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
+ int pix) {
+ asm volatile (
+ ".p2align 2 \n"
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2.
+ "subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
+ "vst1.8 {d1}, [%1]! \n" // store 8 U.
+ "vst1.8 {d3}, [%2]! \n" // store 8 V.
+ "bgt 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(pix) // %3
+ :
+ : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List
+ );
+}
+
+void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
+ int pix) {
+ asm volatile (
+ ".p2align 2 \n"
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY.
+ "subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
+ "vst1.8 {d0}, [%1]! \n" // store 8 U.
+ "vst1.8 {d2}, [%2]! \n" // store 8 V.
+ "bgt 1b \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(pix) // %3
+ :
+ : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List
+ );
+}
+
+void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ asm volatile (
+ "add %1, %0, %1 \n" // stride + src_yuy2
+ ".p2align 2 \n"
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2.
+ "subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
+ "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row YUY2.
+ "vrhadd.u8 d1, d1, d5 \n" // average rows of U
+ "vrhadd.u8 d3, d3, d7 \n" // average rows of V
+ "vst1.8 {d1}, [%2]! \n" // store 8 U.
+ "vst1.8 {d3}, [%3]! \n" // store 8 V.
+ "bgt 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(stride_yuy2), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(pix) // %4
+ :
+ : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" // Clobber List
+ );
+}
+
+void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ asm volatile (
+ "add %1, %0, %1 \n" // stride + src_uyvy
+ ".p2align 2 \n"
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY.
+ "subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
+ "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row UYVY.
+ "vrhadd.u8 d0, d0, d4 \n" // average rows of U
+ "vrhadd.u8 d2, d2, d6 \n" // average rows of V
+ "vst1.8 {d0}, [%2]! \n" // store 8 U.
+ "vst1.8 {d2}, [%3]! \n" // store 8 V.
+ "bgt 1b \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(stride_uyvy), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(pix) // %4
+ :
+ : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" // Clobber List
+ );
+}
+
+void HalfRow_NEON(const uint8* src_uv, int src_uv_stride,
+ uint8* dst_uv, int pix) {
+ asm volatile (
+ // change the stride to row 2 pointer
+ "add %1, %0 \n"
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n" // load row 1 16 pixels.
+ "subs %3, %3, #16 \n" // 16 processed per loop
+ "vld1.8 {q1}, [%1]! \n" // load row 2 16 pixels.
+ "vrhadd.u8 q0, q1 \n" // average row 1 and 2
+ "vst1.8 {q0}, [%2]! \n"
+ "bgt 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(src_uv_stride), // %1
+ "+r"(dst_uv), // %2
+ "+r"(pix) // %3
+ :
+ : "cc", "memory", "q0", "q1" // Clobber List
+ );
+}
+
+// Select 2 channels from ARGB on alternating pixels. e.g. BGBGBGBG
+void ARGBToBayerRow_NEON(const uint8* src_argb, uint8* dst_bayer,
+ uint32 selector, int pix) {
+ asm volatile (
+ "vmov.u32 d6[0], %3 \n" // selector
+ "1: \n"
+ "vld1.8 {q0, q1}, [%0]! \n" // load row 8 pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop
+ "vtbl.8 d4, {d0, d1}, d6 \n" // look up 4 pixels
+ "vtbl.8 d5, {d2, d3}, d6 \n" // look up 4 pixels
+ "vtrn.u32 d4, d5 \n" // combine 8 pixels
+ "vst1.8 {d4}, [%1]! \n" // store 8.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_bayer), // %1
+ "+r"(pix) // %2
+ : "r"(selector) // %3
+ : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List
+ );
+}
+
+// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
+void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
+ const uint8* shuffler, int pix) {
+ asm volatile (
+ "vld1.8 {q2}, [%3] \n" // shuffler
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n" // load 4 pixels.
+ "subs %2, %2, #4 \n" // 4 processed per loop
+ "vtbl.8 d2, {d0, d1}, d4 \n" // look up 2 first pixels
+ "vtbl.8 d3, {d0, d1}, d5 \n" // look up 2 next pixels
+ "vst1.8 {q1}, [%1]! \n" // store 4.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(pix) // %2
+ : "r"(shuffler) // %3
+ : "cc", "memory", "q0", "q1", "q2" // Clobber List
+ );
+}
+
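+// For reference only: assuming a shuffler whose first four entries give the
+// within-pixel byte order (as the masks used with ARGBShuffleRow do), the
+// table lookup above behaves roughly like this plain-C sketch (hypothetical
+// helper, not used by the assembly path):
+static void ARGBShuffle_Sketch(const uint8* src_argb, uint8* dst_argb,
+                               const uint8* shuffler, int pix) {
+  for (int i = 0; i < pix; ++i) {
+    for (int j = 0; j < 4; ++j) {
+      dst_argb[i * 4 + j] = src_argb[i * 4 + shuffler[j]];
+    }
+  }
+}
+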
+void I422ToYUY2Row_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_yuy2, int width) {
+ asm volatile (
+ ".p2align 2 \n"
+ "1: \n"
+ "vld2.8 {d0, d2}, [%0]! \n" // load 16 Ys
+ "vld1.8 {d1}, [%1]! \n" // load 8 Us
+ "vld1.8 {d3}, [%2]! \n" // load 8 Vs
+ "subs %4, %4, #16 \n" // 16 pixels
+ "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 YUY2/16 pixels.
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_yuy2), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "d0", "d1", "d2", "d3"
+ );
+}
+
+void I422ToUYVYRow_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_uyvy, int width) {
+ asm volatile (
+ ".p2align 2 \n"
+ "1: \n"
+ "vld2.8 {d1, d3}, [%0]! \n" // load 16 Ys
+ "vld1.8 {d0}, [%1]! \n" // load 8 Us
+ "vld1.8 {d2}, [%2]! \n" // load 8 Vs
+ "subs %4, %4, #16 \n" // 16 pixels
+ "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 UYVY/16 pixels.
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_uyvy), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "d0", "d1", "d2", "d3"
+ );
+}
+
+void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) {
+ asm volatile (
+ ".p2align 2 \n"
+ "1: \n"
+ "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ ARGBTORGB565
+ "vst1.8 {q0}, [%1]! \n" // store 8 pixels RGB565.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_rgb565), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "q0", "q8", "q9", "q10", "q11"
+ );
+}
+
+void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555,
+ int pix) {
+ asm volatile (
+ ".p2align 2 \n"
+ "1: \n"
+ "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ ARGBTOARGB1555
+ "vst1.8 {q0}, [%1]! \n" // store 8 pixels ARGB1555.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb1555), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "q0", "q8", "q9", "q10", "q11"
+ );
+}
+
+void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444,
+ int pix) {
+ asm volatile (
+ "vmov.u8 d4, #0x0f \n" // bits to clear with vbic.
+ ".p2align 2 \n"
+ "1: \n"
+ "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ ARGBTOARGB4444
+ "vst1.8 {q0}, [%1]! \n" // store 8 pixels ARGB4444.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb4444), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "q0", "q8", "q9", "q10", "q11"
+ );
+}
+
+void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
+ asm volatile (
+ "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient
+ "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient
+ "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient
+ "vmov.u8 d27, #16 \n" // Add 16 constant
+ ".p2align 2 \n"
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q2, d0, d24 \n" // B
+ "vmlal.u8 q2, d1, d25 \n" // G
+ "vmlal.u8 q2, d2, d26 \n" // R
+ "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y
+ "vqadd.u8 d0, d27 \n"
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q12", "q13"
+ );
+}
+
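+// For reference only: with the coefficients above (13, 65, 33, a rounding
+// shift by 7 and a +16 bias), the luma is roughly this plain-C sketch
+// (hypothetical helper, not used by the assembly path):
+static uint8 ARGBPixelToY_Sketch(uint8 b, uint8 g, uint8 r) {
+  return (uint8)(((13 * b + 65 * g + 33 * r + 64) >> 7) + 16);
+}
+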
+void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
+ asm volatile (
+ "vmov.u8 d24, #15 \n" // B * 0.11400 coefficient
+ "vmov.u8 d25, #75 \n" // G * 0.58700 coefficient
+ "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient
+ ".p2align 2 \n"
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q2, d0, d24 \n" // B
+ "vmlal.u8 q2, d1, d25 \n" // G
+ "vmlal.u8 q2, d2, d26 \n" // R
+ "vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit Y
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q12", "q13"
+ );
+}
+
+// 8x1 pixels.
+void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+ int pix) {
+ asm volatile (
+ "vmov.u8 d24, #112 \n" // UB / VR 0.875 coefficient
+ "vmov.u8 d25, #74 \n" // UG -0.5781 coefficient
+ "vmov.u8 d26, #38 \n" // UR -0.2969 coefficient
+ "vmov.u8 d27, #18 \n" // VB -0.1406 coefficient
+ "vmov.u8 d28, #94 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ ".p2align 2 \n"
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vmull.u8 q2, d0, d24 \n" // B
+ "vmlsl.u8 q2, d1, d25 \n" // G
+ "vmlsl.u8 q2, d2, d26 \n" // R
+ "vadd.u16 q2, q2, q15 \n" // +128 -> unsigned
+
+ "vmull.u8 q3, d2, d24 \n" // R
+ "vmlsl.u8 q3, d1, d28 \n" // G
+ "vmlsl.u8 q3, d0, d27 \n" // B
+ "vadd.u16 q3, q3, q15 \n" // +128 -> unsigned
+
+ "vqshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit U
+ "vqshrn.u16 d1, q3, #8 \n" // 16 bit to 8 bit V
+
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%2]! \n" // store 8 pixels V.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(pix) // %3
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q12", "q13", "q14", "q15"
+ );
+}
+
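+// For reference only: the per-pixel U/V math above (full resolution, no
+// subsampling) is roughly this plain-C sketch (hypothetical helper, not used
+// by the assembly path); the 0x8080 bias folds in the +128 offset and the
+// rounding applied before the shift:
+static void ARGBPixelToUV_Sketch(uint8 b, uint8 g, uint8 r,
+                                 uint8* u, uint8* v) {
+  *u = (uint8)((112 * b - 74 * g - 38 * r + 0x8080) >> 8);
+  *v = (uint8)((112 * r - 94 * g - 18 * b + 0x8080) >> 8);
+}
+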
+// 16x1 pixels -> 8x1. pix is number of argb pixels. e.g. 16.
+void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+ int pix) {
+ asm volatile (
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ ".p2align 2 \n"
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
+
+ "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
+
+ "subs %3, %3, #16 \n" // 16 processed per loop.
+ "vmul.s16 q8, q0, q10 \n" // B
+ "vmls.s16 q8, q1, q11 \n" // G
+ "vmls.s16 q8, q2, q12 \n" // R
+ "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned
+
+ "vmul.s16 q9, q2, q10 \n" // R
+ "vmls.s16 q9, q1, q14 \n" // G
+ "vmls.s16 q9, q0, q13 \n" // B
+ "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
+
+ "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
+ "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
+
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%2]! \n" // store 8 pixels V.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(pix) // %3
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+// 32x1 pixels -> 8x1. pix is number of argb pixels. e.g. 32.
+void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+ int pix) {
+ asm volatile (
+ "vmov.s16 q10, #112 / 4 \n" // UB / VR 0.875 coefficient
+ "vmov.s16 q11, #74 / 4 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 4 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 4 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 4 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ ".p2align 2 \n"
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
+ "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
+ "vld4.8 {d8, d10, d12, d14}, [%0]! \n" // load 8 more ARGB pixels.
+ "vld4.8 {d9, d11, d13, d15}, [%0]! \n" // load last 8 ARGB pixels.
+ "vpaddl.u8 q4, q4 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q5, q5 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q6, q6 \n" // R 16 bytes -> 8 shorts.
+ "vpadd.u16 d0, d0, d1 \n" // B 16 shorts -> 8 shorts.
+ "vpadd.u16 d1, d8, d9 \n" // B
+ "vpadd.u16 d2, d2, d3 \n" // G 16 shorts -> 8 shorts.
+ "vpadd.u16 d3, d10, d11 \n" // G
+ "vpadd.u16 d4, d4, d5 \n" // R 16 shorts -> 8 shorts.
+ "vpadd.u16 d5, d12, d13 \n" // R
+ "subs %3, %3, #32 \n" // 32 processed per loop.
+ "vmul.s16 q8, q0, q10 \n" // B
+ "vmls.s16 q8, q1, q11 \n" // G
+ "vmls.s16 q8, q2, q12 \n" // R
+ "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned
+ "vmul.s16 q9, q2, q10 \n" // R
+ "vmls.s16 q9, q1, q14 \n" // G
+ "vmls.s16 q9, q0, q13 \n" // B
+ "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
+ "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
+ "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%2]! \n" // store 8 pixels V.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(pix) // %3
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+// 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16.
+#define RGBTOUV(QB, QG, QR) \
+ "vmul.s16 q8, " #QB ", q10 \n" /* B */ \
+ "vmls.s16 q8, " #QG ", q11 \n" /* G */ \
+ "vmls.s16 q8, " #QR ", q12 \n" /* R */ \
+ "vadd.u16 q8, q8, q15 \n" /* +128 -> unsigned */ \
+ "vmul.s16 q9, " #QR ", q10 \n" /* R */ \
+ "vmls.s16 q9, " #QG ", q14 \n" /* G */ \
+ "vmls.s16 q9, " #QB ", q13 \n" /* B */ \
+ "vadd.u16 q9, q9, q15 \n" /* +128 -> unsigned */ \
+ "vqshrn.u16 d0, q8, #8 \n" /* 16 bit to 8 bit U */ \
+ "vqshrn.u16 d1, q9, #8 \n" /* 16 bit to 8 bit V */
+
+void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ asm volatile (
+ "add %1, %0, %1 \n" // src_stride + src_argb
+ "vmov.s16 q10, #112 / 4 \n" // UB / VR 0.875 coefficient
+ "vmov.s16 q11, #74 / 4 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 4 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 4 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 4 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ ".p2align 2 \n"
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
+ "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
+ "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels.
+ "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels.
+ "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts.
+ "subs %4, %4, #16 \n" // 32 processed per loop.
+ RGBTOUV(q0, q1, q2)
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(src_stride_argb), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(pix) // %4
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+// TODO(fbarchard): Match the C code's subsampling.
+void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ asm volatile (
+ "add %1, %0, %1 \n" // src_stride + src_argb
+ "vmov.s16 q10, #127 / 4 \n" // UB / VR 0.500 coefficient
+ "vmov.s16 q11, #84 / 4 \n" // UG -0.33126 coefficient
+ "vmov.s16 q12, #43 / 4 \n" // UR -0.16874 coefficient
+ "vmov.s16 q13, #20 / 4 \n" // VB -0.08131 coefficient
+ "vmov.s16 q14, #107 / 4 \n" // VG -0.41869 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ ".p2align 2 \n"
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
+ "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
+ "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels.
+ "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels.
+ "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts.
+ "subs %4, %4, #16 \n" // 32 processed per loop.
+ RGBTOUV(q0, q1, q2)
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(src_stride_argb), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(pix) // %4
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ asm volatile (
+ "add %1, %0, %1 \n" // src_stride + src_bgra
+ "vmov.s16 q10, #112 / 4 \n" // UB / VR 0.875 coefficient
+ "vmov.s16 q11, #74 / 4 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 4 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 4 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 4 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ ".p2align 2 \n"
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 BGRA pixels.
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 BGRA pixels.
+ "vpaddl.u8 q3, q3 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q2, q2 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // R 16 bytes -> 8 shorts.
+ "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more BGRA pixels.
+ "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 BGRA pixels.
+ "vpadal.u8 q3, q7 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q2, q6 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q5 \n" // R 16 bytes -> 8 shorts.
+ "subs %4, %4, #16 \n" // 32 processed per loop.
+ RGBTOUV(q3, q2, q1)
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
+ : "+r"(src_bgra), // %0
+ "+r"(src_stride_bgra), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(pix) // %4
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ asm volatile (
+ "add %1, %0, %1 \n" // src_stride + src_abgr
+ "vmov.s16 q10, #112 / 4 \n" // UB / VR 0.875 coefficient
+ "vmov.s16 q11, #74 / 4 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 4 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 4 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 4 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ ".p2align 2 \n"
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ABGR pixels.
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ABGR pixels.
+ "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts.
+ "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ABGR pixels.
+ "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ABGR pixels.
+ "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts.
+ "subs %4, %4, #16 \n" // 32 processed per loop.
+ RGBTOUV(q2, q1, q0)
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
+ : "+r"(src_abgr), // %0
+ "+r"(src_stride_abgr), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(pix) // %4
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ asm volatile (
+ "add %1, %0, %1 \n" // src_stride + src_rgba
+ "vmov.s16 q10, #112 / 4 \n" // UB / VR 0.875 coefficient
+ "vmov.s16 q11, #74 / 4 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 4 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 4 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 4 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ ".p2align 2 \n"
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 RGBA pixels.
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 RGBA pixels.
+ "vpaddl.u8 q0, q1 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q2 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q2, q3 \n" // R 16 bytes -> 8 shorts.
+ "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more RGBA pixels.
+ "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 RGBA pixels.
+ "vpadal.u8 q0, q5 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q6 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q2, q7 \n" // R 16 bytes -> 8 shorts.
+ "subs %4, %4, #16 \n" // 32 processed per loop.
+ RGBTOUV(q0, q1, q2)
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
+ : "+r"(src_rgba), // %0
+ "+r"(src_stride_rgba), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(pix) // %4
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ asm volatile (
+ "add %1, %0, %1 \n" // src_stride + src_rgb24
+ "vmov.s16 q10, #112 / 4 \n" // UB / VR 0.875 coefficient
+ "vmov.s16 q11, #74 / 4 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 4 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 4 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 4 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ ".p2align 2 \n"
+ "1: \n"
+ "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB24 pixels.
+ "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RGB24 pixels.
+ "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
+ "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RGB24 pixels.
+ "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RGB24 pixels.
+ "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts.
+ "subs %4, %4, #16 \n" // 32 processed per loop.
+ RGBTOUV(q0, q1, q2)
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
+ : "+r"(src_rgb24), // %0
+ "+r"(src_stride_rgb24), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(pix) // %4
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ asm volatile (
+ "add %1, %0, %1 \n" // src_stride + src_raw
+ "vmov.s16 q10, #112 / 4 \n" // UB / VR 0.875 coefficient
+ "vmov.s16 q11, #74 / 4 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 4 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 4 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 4 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ ".p2align 2 \n"
+ "1: \n"
+ "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RAW pixels.
+ "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RAW pixels.
+ "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts.
+ "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RAW pixels.
+ "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RAW pixels.
+ "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts.
+ "subs %4, %4, #16 \n" // 32 processed per loop.
+ RGBTOUV(q2, q1, q0)
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
+ : "+r"(src_raw), // %0
+ "+r"(src_stride_raw), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(pix) // %4
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+// 16x2 pixels -> 8x1. pix is number of RGB565 pixels. e.g. 16.
+void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ asm volatile (
+ "add %1, %0, %1 \n" // src_stride + src_argb
+ "vmov.s16 q10, #112 / 4 \n" // UB / VR 0.875 coefficient
+ "vmov.s16 q11, #74 / 4 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 4 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 4 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 4 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ ".p2align 2 \n"
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
+ RGB565TOARGB
+ "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ "vld1.8 {q0}, [%0]! \n" // next 8 RGB565 pixels.
+ RGB565TOARGB
+ "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+
+ "vld1.8 {q0}, [%1]! \n" // load 8 RGB565 pixels.
+ RGB565TOARGB
+ "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ "vld1.8 {q0}, [%1]! \n" // next 8 RGB565 pixels.
+ RGB565TOARGB
+ "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+
+ "subs %4, %4, #16 \n" // 16 processed per loop.
+ "vmul.s16 q8, q4, q10 \n" // B
+ "vmls.s16 q8, q5, q11 \n" // G
+ "vmls.s16 q8, q6, q12 \n" // R
+ "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned
+ "vmul.s16 q9, q6, q10 \n" // R
+ "vmls.s16 q9, q5, q14 \n" // G
+ "vmls.s16 q9, q4, q13 \n" // B
+ "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
+ "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
+ "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
+ : "+r"(src_rgb565), // %0
+ "+r"(src_stride_rgb565), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(pix) // %4
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
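+
+// A scalar sketch of what each UV loop above computes for one 2x2 block
+// (illustrative only; BlockToUV is a hypothetical helper, not a libyuv
+// function). sum_b/sum_g/sum_r are the sums of the four samples in a 2x2
+// block, so the /4 folded into the constants turns them back into averages
+// before the BT.601 weights and the 0x8080 (128.5 * 256) bias are applied.
+static inline void BlockToUV(int sum_b, int sum_g, int sum_r,
+                             uint8* dst_u, uint8* dst_v) {
+  int u = (112 / 4) * sum_b - (74 / 4) * sum_g - (38 / 4) * sum_r + 0x8080;
+  int v = (112 / 4) * sum_r - (94 / 4) * sum_g - (18 / 4) * sum_b + 0x8080;
+  *dst_u = (uint8)(u >> 8);  // vqshrn.u16 #8: keep the high byte.
+  *dst_v = (uint8)(v >> 8);  // In range for 8 bit inputs, so no clamp shown.
+}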
+
+// 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16.
+void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ asm volatile (
+ "add %1, %0, %1 \n" // src_stride + src_argb
+ "vmov.s16 q10, #112 / 4 \n" // UB / VR 0.875 coefficient
+ "vmov.s16 q11, #74 / 4 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 4 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 4 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 4 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ ".p2align 2 \n"
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
+ RGB555TOARGB
+ "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ "vld1.8 {q0}, [%0]! \n" // next 8 ARGB1555 pixels.
+ RGB555TOARGB
+ "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+
+ "vld1.8 {q0}, [%1]! \n" // load 8 ARGB1555 pixels.
+ RGB555TOARGB
+ "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ "vld1.8 {q0}, [%1]! \n" // next 8 ARGB1555 pixels.
+ RGB555TOARGB
+ "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+
+ "subs %4, %4, #16 \n" // 16 processed per loop.
+ "vmul.s16 q8, q4, q10 \n" // B
+ "vmls.s16 q8, q5, q11 \n" // G
+ "vmls.s16 q8, q6, q12 \n" // R
+ "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned
+ "vmul.s16 q9, q6, q10 \n" // R
+ "vmls.s16 q9, q5, q14 \n" // G
+ "vmls.s16 q9, q4, q13 \n" // B
+ "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
+ "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
+ "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
+ : "+r"(src_argb1555), // %0
+ "+r"(src_stride_argb1555), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(pix) // %4
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+// 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16.
+void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ asm volatile (
+ "add %1, %0, %1 \n" // src_stride + src_argb
+ "vmov.s16 q10, #112 / 4 \n" // UB / VR 0.875 coefficient
+ "vmov.s16 q11, #74 / 4 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 4 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 4 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 4 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ ".p2align 2 \n"
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
+ ARGB4444TOARGB
+ "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ "vld1.8 {q0}, [%0]! \n" // next 8 ARGB4444 pixels.
+ ARGB4444TOARGB
+ "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+
+ "vld1.8 {q0}, [%1]! \n" // load 8 ARGB4444 pixels.
+ ARGB4444TOARGB
+ "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ "vld1.8 {q0}, [%1]! \n" // next 8 ARGB4444 pixels.
+ ARGB4444TOARGB
+ "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+
+ "subs %4, %4, #16 \n" // 16 processed per loop.
+ "vmul.s16 q8, q4, q10 \n" // B
+ "vmls.s16 q8, q5, q11 \n" // G
+ "vmls.s16 q8, q6, q12 \n" // R
+ "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned
+ "vmul.s16 q9, q6, q10 \n" // R
+ "vmls.s16 q9, q5, q14 \n" // G
+ "vmls.s16 q9, q4, q13 \n" // B
+ "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
+ "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
+ "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
+ : "+r"(src_argb4444), // %0
+ "+r"(src_stride_argb4444), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(pix) // %4
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+
+void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix) {
+ asm volatile (
+ "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient
+ "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient
+ "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient
+ "vmov.u8 d27, #16 \n" // Add 16 constant
+ ".p2align 2 \n"
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ RGB565TOARGB
+ "vmull.u8 q2, d0, d24 \n" // B
+ "vmlal.u8 q2, d1, d25 \n" // G
+ "vmlal.u8 q2, d2, d26 \n" // R
+ "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y
+ "vqadd.u8 d0, d27 \n"
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
+ : "+r"(src_rgb565), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"
+ );
+}
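+
+// Scalar sketch of the Y loops above (PixelToY is a hypothetical helper, not
+// a libyuv function): 13/65/33 are BT.601 luma weights in 1/128 units, the
+// rounding #7 shift returns to 8 bits, and 16 is the video-range offset.
+static inline uint8 PixelToY(uint8 b, uint8 g, uint8 r) {
+  int y = (13 * b + 65 * g + 33 * r + 64) >> 7;  // vmull/vmlal + vqrshrun #7.
+  y += 16;                                       // vqadd.u8 with d27.
+  return (uint8)(y > 255 ? 255 : y);
+}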
+
+void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix) {
+ asm volatile (
+ "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient
+ "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient
+ "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient
+ "vmov.u8 d27, #16 \n" // Add 16 constant
+ ".p2align 2 \n"
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ ARGB1555TOARGB
+ "vmull.u8 q2, d0, d24 \n" // B
+ "vmlal.u8 q2, d1, d25 \n" // G
+ "vmlal.u8 q2, d2, d26 \n" // R
+ "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y
+ "vqadd.u8 d0, d27 \n"
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
+ : "+r"(src_argb1555), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"
+ );
+}
+
+void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix) {
+ asm volatile (
+ "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient
+ "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient
+ "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient
+ "vmov.u8 d27, #16 \n" // Add 16 constant
+ ".p2align 2 \n"
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ ARGB4444TOARGB
+ "vmull.u8 q2, d0, d24 \n" // B
+ "vmlal.u8 q2, d1, d25 \n" // G
+ "vmlal.u8 q2, d2, d26 \n" // R
+ "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y
+ "vqadd.u8 d0, d27 \n"
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
+ : "+r"(src_argb4444), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"
+ );
+}
+
+void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix) {
+ asm volatile (
+ "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient
+ "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
+ "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient
+ "vmov.u8 d7, #16 \n" // Add 16 constant
+ ".p2align 2 \n"
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of BGRA.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q8, d1, d4 \n" // R
+ "vmlal.u8 q8, d2, d5 \n" // G
+ "vmlal.u8 q8, d3, d6 \n" // B
+ "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
+ "vqadd.u8 d0, d7 \n"
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
+ : "+r"(src_bgra), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
+ );
+}
+
+void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix) {
+ asm volatile (
+ "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient
+ "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
+ "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient
+ "vmov.u8 d7, #16 \n" // Add 16 constant
+ ".p2align 2 \n"
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ABGR.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q8, d0, d4 \n" // R
+ "vmlal.u8 q8, d1, d5 \n" // G
+ "vmlal.u8 q8, d2, d6 \n" // B
+ "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
+ "vqadd.u8 d0, d7 \n"
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
+ : "+r"(src_abgr), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
+ );
+}
+
+void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix) {
+ asm volatile (
+ "vmov.u8 d4, #13 \n" // B * 0.1016 coefficient
+ "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
+ "vmov.u8 d6, #33 \n" // R * 0.2578 coefficient
+ "vmov.u8 d7, #16 \n" // Add 16 constant
+ ".p2align 2 \n"
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of RGBA.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q8, d1, d4 \n" // B
+ "vmlal.u8 q8, d2, d5 \n" // G
+ "vmlal.u8 q8, d3, d6 \n" // R
+ "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
+ "vqadd.u8 d0, d7 \n"
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
+ : "+r"(src_rgba), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
+ );
+}
+
+void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix) {
+ asm volatile (
+ "vmov.u8 d4, #13 \n" // B * 0.1016 coefficient
+ "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
+ "vmov.u8 d6, #33 \n" // R * 0.2578 coefficient
+ "vmov.u8 d7, #16 \n" // Add 16 constant
+ ".p2align 2 \n"
+ "1: \n"
+ "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RGB24.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q8, d0, d4 \n" // B
+ "vmlal.u8 q8, d1, d5 \n" // G
+ "vmlal.u8 q8, d2, d6 \n" // R
+ "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
+ "vqadd.u8 d0, d7 \n"
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
+ : "+r"(src_rgb24), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
+ );
+}
+
+void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) {
+ asm volatile (
+ "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient
+ "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
+ "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient
+ "vmov.u8 d7, #16 \n" // Add 16 constant
+ ".p2align 2 \n"
+ "1: \n"
+ "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RAW.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q8, d0, d4 \n" // B
+ "vmlal.u8 q8, d1, d5 \n" // G
+ "vmlal.u8 q8, d2, d6 \n" // R
+ "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
+ "vqadd.u8 d0, d7 \n"
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
+ : "+r"(src_raw), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
+ );
+}
+
+// Bilinear filter 16x2 -> 16x1
+void InterpolateRow_NEON(uint8* dst_ptr,
+ const uint8* src_ptr, ptrdiff_t src_stride,
+ int dst_width, int source_y_fraction) {
+ asm volatile (
+ "cmp %4, #0 \n"
+ "beq 100f \n"
+ "add %2, %1 \n"
+ "cmp %4, #64 \n"
+ "beq 75f \n"
+ "cmp %4, #128 \n"
+ "beq 50f \n"
+ "cmp %4, #192 \n"
+ "beq 25f \n"
+
+ "vdup.8 d5, %4 \n"
+ "rsb %4, #256 \n"
+ "vdup.8 d4, %4 \n"
+ // General purpose row blend.
+ "1: \n"
+ "vld1.8 {q0}, [%1]! \n"
+ "vld1.8 {q1}, [%2]! \n"
+ "subs %3, %3, #16 \n"
+ "vmull.u8 q13, d0, d4 \n"
+ "vmull.u8 q14, d1, d4 \n"
+ "vmlal.u8 q13, d2, d5 \n"
+ "vmlal.u8 q14, d3, d5 \n"
+ "vrshrn.u16 d0, q13, #8 \n"
+ "vrshrn.u16 d1, q14, #8 \n"
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 1b \n"
+ "b 99f \n"
+
+ // Blend 25 / 75.
+ "25: \n"
+ "vld1.8 {q0}, [%1]! \n"
+ "vld1.8 {q1}, [%2]! \n"
+ "subs %3, %3, #16 \n"
+ "vrhadd.u8 q0, q1 \n"
+ "vrhadd.u8 q0, q1 \n"
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 25b \n"
+ "b 99f \n"
+
+ // Blend 50 / 50.
+ "50: \n"
+ "vld1.8 {q0}, [%1]! \n"
+ "vld1.8 {q1}, [%2]! \n"
+ "subs %3, %3, #16 \n"
+ "vrhadd.u8 q0, q1 \n"
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 50b \n"
+ "b 99f \n"
+
+ // Blend 75 / 25.
+ "75: \n"
+ "vld1.8 {q1}, [%1]! \n"
+ "vld1.8 {q0}, [%2]! \n"
+ "subs %3, %3, #16 \n"
+ "vrhadd.u8 q0, q1 \n"
+ "vrhadd.u8 q0, q1 \n"
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 75b \n"
+ "b 99f \n"
+
+ // Blend 100 / 0 - Copy row unchanged.
+ "100: \n"
+ "vld1.8 {q0}, [%1]! \n"
+ "subs %3, %3, #16 \n"
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 100b \n"
+
+ "99: \n"
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "+r"(src_stride), // %2
+ "+r"(dst_width), // %3
+ "+r"(source_y_fraction) // %4
+ :
+ : "cc", "memory", "q0", "q1", "d4", "d5", "q13", "q14"
+ );
+}
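+
+// Scalar sketch of the general blend above (InterpolatePixel is a
+// hypothetical helper): each output byte mixes the two rows with
+// source_y_fraction in 1/256 units; vrshrn supplies the rounding. The
+// 0/64/128/192 branches are just cheaper approximations of the same blend.
+static inline uint8 InterpolatePixel(uint8 p0, uint8 p1, int y_fraction) {
+  return (uint8)((p0 * (256 - y_fraction) + p1 * y_fraction + 128) >> 8);
+}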
+
+// dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr
+void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ asm volatile (
+ "subs %3, #8 \n"
+ "blt 89f \n"
+ // Blend 8 pixels.
+ "8: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB0.
+ "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 pixels of ARGB1.
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vmull.u8 q10, d4, d3 \n" // db * a
+ "vmull.u8 q11, d5, d3 \n" // dg * a
+ "vmull.u8 q12, d6, d3 \n" // dr * a
+ "vqrshrn.u16 d20, q10, #8 \n" // db >>= 8
+ "vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8
+ "vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8
+ "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256
+ "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256
+ "vqadd.u8 q0, q0, q2 \n" // + sbg
+ "vqadd.u8 d2, d2, d6 \n" // + sr
+ "vmov.u8 d3, #255 \n" // a = 255
+ "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 pixels of ARGB.
+ "bge 8b \n"
+
+ "89: \n"
+ "adds %3, #8-1 \n"
+ "blt 99f \n"
+
+ // Blend 1 pixel.
+ "1: \n"
+ "vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [%0]! \n" // load 1 pixel ARGB0.
+ "vld4.8 {d4[0],d5[0],d6[0],d7[0]}, [%1]! \n" // load 1 pixel ARGB1.
+ "subs %3, %3, #1 \n" // 1 processed per loop.
+ "vmull.u8 q10, d4, d3 \n" // db * a
+ "vmull.u8 q11, d5, d3 \n" // dg * a
+ "vmull.u8 q12, d6, d3 \n" // dr * a
+ "vqrshrn.u16 d20, q10, #8 \n" // db >>= 8
+ "vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8
+ "vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8
+ "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256
+ "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256
+ "vqadd.u8 q0, q0, q2 \n" // + sbg
+ "vqadd.u8 d2, d2, d6 \n" // + sr
+ "vmov.u8 d3, #255 \n" // a = 255
+ "vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [%2]! \n" // store 1 pixel.
+ "bge 1b \n"
+
+ "99: \n"
+
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q10", "q11", "q12"
+ );
+}
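+
+// Scalar sketch of one blended channel from the loop above (BlendChannel is
+// a hypothetical helper, not the libyuv C path): "over" compositing where
+// the destination is scaled by 256 - source alpha using saturating byte math.
+static inline uint8 BlendChannel(uint8 src, uint8 dst, uint8 src_a) {
+  int d = dst - ((dst * src_a + 128) >> 8);  // vmull + vqrshrn #8 + vqsub.u8.
+  int out = src + (d < 0 ? 0 : d);           // vqadd.u8.
+  return (uint8)(out > 255 ? 255 : out);
+}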
+
+// Attenuate 8 pixels at a time.
+void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
+ asm volatile (
+ // Attenuate 8 pixels.
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q10, d0, d3 \n" // b * a
+ "vmull.u8 q11, d1, d3 \n" // g * a
+ "vmull.u8 q12, d2, d3 \n" // r * a
+ "vqrshrn.u16 d0, q10, #8 \n" // b >>= 8
+ "vqrshrn.u16 d1, q11, #8 \n" // g >>= 8
+ "vqrshrn.u16 d2, q12, #8 \n" // r >>= 8
+ "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q10", "q11", "q12"
+ );
+}
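+
+// Scalar sketch of the attenuate step (AttenuateChannel is a hypothetical
+// helper): each color channel is premultiplied by alpha, roughly
+// ch * a / 256 with rounding; alpha itself is stored unchanged.
+static inline uint8 AttenuateChannel(uint8 ch, uint8 a) {
+  return (uint8)((ch * a + 128) >> 8);  // vmull.u8 + vqrshrn.u16 #8.
+}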
+
+// Quantize 8 ARGB pixels (32 bytes).
+// dst = (dst * scale >> 16) * interval_size + interval_offset;
+void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size,
+ int interval_offset, int width) {
+ asm volatile (
+ "vdup.u16 q8, %2 \n"
+ "vshr.u16 q8, q8, #1 \n" // scale >>= 1
+ "vdup.u16 q9, %3 \n" // interval multiply.
+ "vdup.u16 q10, %4 \n" // interval add
+
+ // 8 pixel loop.
+ ".p2align 2 \n"
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0] \n" // load 8 pixels of ARGB.
+ "subs %1, %1, #8 \n" // 8 processed per loop.
+ "vmovl.u8 q0, d0 \n" // b (0 .. 255)
+ "vmovl.u8 q1, d2 \n"
+ "vmovl.u8 q2, d4 \n"
+ "vqdmulh.s16 q0, q0, q8 \n" // b * scale
+ "vqdmulh.s16 q1, q1, q8 \n" // g
+ "vqdmulh.s16 q2, q2, q8 \n" // r
+ "vmul.u16 q0, q0, q9 \n" // b * interval_size
+ "vmul.u16 q1, q1, q9 \n" // g
+ "vmul.u16 q2, q2, q9 \n" // r
+ "vadd.u16 q0, q0, q10 \n" // b + interval_offset
+ "vadd.u16 q1, q1, q10 \n" // g
+ "vadd.u16 q2, q2, q10 \n" // r
+ "vqmovn.u16 d0, q0 \n"
+ "vqmovn.u16 d2, q1 \n"
+ "vqmovn.u16 d4, q2 \n"
+ "vst4.8 {d0, d2, d4, d6}, [%0]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
+ : "+r"(dst_argb), // %0
+ "+r"(width) // %1
+ : "r"(scale), // %2
+ "r"(interval_size), // %3
+ "r"(interval_offset) // %4
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10"
+ );
+}
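+
+// Scalar sketch of the quantize step for one channel (QuantizeChannel is a
+// hypothetical helper): vqdmulh with scale / 2 is how the NEON loop
+// approximates (v * scale) >> 16 before the interval multiply and add.
+// Alpha is stored back unchanged.
+static inline uint8 QuantizeChannel(uint8 v, int scale, int interval_size,
+                                    int interval_offset) {
+  int q = ((v * scale) >> 16) * interval_size + interval_offset;
+  return (uint8)(q < 0 ? 0 : (q > 255 ? 255 : q));  // vqmovn.u16 saturates.
+}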
+
+// Shade 8 pixels at a time by specified value.
+// NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scalar register in d0 to d7.
+// Rounding in vqrdmulh does +1 to high if high bit of low s16 is set.
+void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width,
+ uint32 value) {
+ asm volatile (
+ "vdup.u32 q0, %3 \n" // duplicate scale value.
+ "vzip.u8 d0, d1 \n" // d0 aarrggbb.
+ "vshr.u16 q0, q0, #1 \n" // scale / 2.
+
+ // 8 pixel loop.
+ ".p2align 2 \n"
+ "1: \n"
+ "vld4.8 {d20, d22, d24, d26}, [%0]! \n" // load 8 pixels of ARGB.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmovl.u8 q10, d20 \n" // b (0 .. 255)
+ "vmovl.u8 q11, d22 \n"
+ "vmovl.u8 q12, d24 \n"
+ "vmovl.u8 q13, d26 \n"
+ "vqrdmulh.s16 q10, q10, d0[0] \n" // b * scale * 2
+ "vqrdmulh.s16 q11, q11, d0[1] \n" // g
+ "vqrdmulh.s16 q12, q12, d0[2] \n" // r
+ "vqrdmulh.s16 q13, q13, d0[3] \n" // a
+ "vqmovn.u16 d20, q10 \n"
+ "vqmovn.u16 d22, q11 \n"
+ "vqmovn.u16 d24, q12 \n"
+ "vqmovn.u16 d26, q13 \n"
+ "vst4.8 {d20, d22, d24, d26}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(value) // %3
+ : "cc", "memory", "q0", "q10", "q11", "q12", "q13"
+ );
+}
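+
+// Scalar sketch of the shade math (ShadeChannel is a hypothetical helper).
+// Duplicating each byte of 'value' into a 16 bit lane multiplies it by 257;
+// the >> 1 plus vqrdmulh (double, round, >> 16) then works out to roughly
+// ch * v / 255, so a value byte of 255 leaves the channel unchanged.
+static inline uint8 ShadeChannel(uint8 ch, uint8 v) {
+  int s = (v * 257) >> 1;              // vzip.u8 then vshr.u16 #1.
+  int r = (2 * ch * s + 32768) >> 16;  // vqrdmulh.s16.
+  return (uint8)(r > 255 ? 255 : r);   // vqmovn.u16.
+}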
+
+// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels.
+// Similar to ARGBToYJ but stores ARGB.
+// C code is (15 * b + 75 * g + 38 * r + 64) >> 7;
+void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
+ asm volatile (
+ "vmov.u8 d24, #15 \n" // B * 0.11400 coefficient
+ "vmov.u8 d25, #75 \n" // G * 0.58700 coefficient
+ "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient
+ ".p2align 2 \n"
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q2, d0, d24 \n" // B
+ "vmlal.u8 q2, d1, d25 \n" // G
+ "vmlal.u8 q2, d2, d26 \n" // R
+ "vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit B
+ "vmov d1, d0 \n" // G
+ "vmov d2, d0 \n" // R
+ "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q12", "q13"
+ );
+}
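+
+// Scalar sketch of the gray mapping (GrayValue is a hypothetical helper):
+// each output pixel becomes (y, y, y, a) with y from the formula above.
+static inline uint8 GrayValue(uint8 b, uint8 g, uint8 r) {
+  return (uint8)((15 * b + 75 * g + 38 * r + 64) >> 7);  // vqrshrun #7.
+}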
+
+// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
+// b = (r * 35 + g * 68 + b * 17) >> 7
+// g = (r * 45 + g * 88 + b * 22) >> 7
+// r = (r * 50 + g * 98 + b * 24) >> 7
+void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
+ asm volatile (
+ "vmov.u8 d20, #17 \n" // BB coefficient
+ "vmov.u8 d21, #68 \n" // BG coefficient
+ "vmov.u8 d22, #35 \n" // BR coefficient
+ "vmov.u8 d24, #22 \n" // GB coefficient
+ "vmov.u8 d25, #88 \n" // GG coefficient
+ "vmov.u8 d26, #45 \n" // GR coefficient
+ "vmov.u8 d28, #24 \n" // BB coefficient
+ "vmov.u8 d29, #98 \n" // BG coefficient
+ "vmov.u8 d30, #50 \n" // BR coefficient
+ ".p2align 2 \n"
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0] \n" // load 8 ARGB pixels.
+ "subs %1, %1, #8 \n" // 8 processed per loop.
+ "vmull.u8 q2, d0, d20 \n" // B to Sepia B
+ "vmlal.u8 q2, d1, d21 \n" // G
+ "vmlal.u8 q2, d2, d22 \n" // R
+ "vmull.u8 q3, d0, d24 \n" // B to Sepia G
+ "vmlal.u8 q3, d1, d25 \n" // G
+ "vmlal.u8 q3, d2, d26 \n" // R
+ "vmull.u8 q8, d0, d28 \n" // B to Sepia R
+ "vmlal.u8 q8, d1, d29 \n" // G
+ "vmlal.u8 q8, d2, d30 \n" // R
+ "vqshrn.u16 d0, q2, #7 \n" // 16 bit to 8 bit B
+ "vqshrn.u16 d1, q3, #7 \n" // 16 bit to 8 bit G
+ "vqshrn.u16 d2, q8, #7 \n" // 16 bit to 8 bit R
+ "vst4.8 {d0, d1, d2, d3}, [%0]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
+ : "+r"(dst_argb), // %0
+ "+r"(width) // %1
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3",
+ "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
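+
+// Scalar sketch of the sepia transform (SepiaPixel is a hypothetical
+// helper), applied in place like the NEON loop; vqshrn.u16 #7 provides the
+// final >> 7 with saturation to 255. Alpha is left untouched.
+static inline void SepiaPixel(uint8* b, uint8* g, uint8* r) {
+  int nb = (17 * *b + 68 * *g + 35 * *r) >> 7;
+  int ng = (22 * *b + 88 * *g + 45 * *r) >> 7;
+  int nr = (24 * *b + 98 * *g + 50 * *r) >> 7;
+  *b = (uint8)(nb > 255 ? 255 : nb);
+  *g = (uint8)(ng > 255 ? 255 : ng);
+  *r = (uint8)(nr > 255 ? 255 : nr);
+}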
+
+// Transform 8 ARGB pixels (32 bytes) with color matrix.
+// Same as Sepia except matrix is provided.
+void ARGBColorMatrixRow_NEON(uint8* dst_argb, const int8* matrix_argb,
+ int width) {
+ asm volatile (
+ "vld1.8 {q2}, [%2] \n" // load 3 ARGB vectors.
+ "vmovl.s8 q0, d4 \n" // B,G coefficients s16.
+ "vmovl.s8 q1, d5 \n" // R coefficients s16.
+
+ ".p2align 2 \n"
+ "1: \n"
+ "vld4.8 {d16, d18, d20, d22}, [%0] \n" // load 8 ARGB pixels.
+ "subs %1, %1, #8 \n" // 8 processed per loop.
+ "vmovl.u8 q8, d16 \n" // b (0 .. 255) 16 bit
+ "vmovl.u8 q9, d18 \n" // g
+ "vmovl.u8 q10, d20 \n" // r
+ "vmovl.u8 q15, d22 \n" // a
+ "vmul.s16 q12, q8, d0[0] \n" // B = B * Matrix B
+ "vmul.s16 q13, q8, d1[0] \n" // G = B * Matrix G
+ "vmul.s16 q14, q8, d2[0] \n" // R = B * Matrix R
+ "vmul.s16 q4, q9, d0[1] \n" // B += G * Matrix B
+ "vmul.s16 q5, q9, d1[1] \n" // G += G * Matrix G
+ "vmul.s16 q6, q9, d2[1] \n" // R += G * Matrix R
+ "vqadd.s16 q12, q12, q4 \n" // Accumulate B
+ "vqadd.s16 q13, q13, q5 \n" // Accumulate G
+ "vqadd.s16 q14, q14, q6 \n" // Accumulate R
+ "vmul.s16 q4, q10, d0[2] \n" // B += R * Matrix B
+ "vmul.s16 q5, q10, d1[2] \n" // G += R * Matrix G
+ "vmul.s16 q6, q10, d2[2] \n" // R += R * Matrix R
+ "vqadd.s16 q12, q12, q4 \n" // Accumulate B
+ "vqadd.s16 q13, q13, q5 \n" // Accumulate G
+ "vqadd.s16 q14, q14, q6 \n" // Accumulate R
+ "vmul.s16 q4, q15, d0[3] \n" // B += A * Matrix B
+ "vmul.s16 q5, q15, d1[3] \n" // G += A * Matrix G
+ "vmul.s16 q6, q15, d2[3] \n" // R += A * Matrix R
+ "vqadd.s16 q12, q12, q4 \n" // Accumulate B
+ "vqadd.s16 q13, q13, q5 \n" // Accumulate G
+ "vqadd.s16 q14, q14, q6 \n" // Accumulate R
+ "vqshrun.s16 d16, q12, #7 \n" // 16 bit to 8 bit B
+ "vqshrun.s16 d18, q13, #7 \n" // 16 bit to 8 bit G
+ "vqshrun.s16 d20, q14, #7 \n" // 16 bit to 8 bit R
+ "vst4.8 {d16, d18, d20, d22}, [%0]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
+ : "+r"(dst_argb), // %0
+ "+r"(width) // %1
+ : "r"(matrix_argb) // %2
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q8", "q9",
+ "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
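+
+// Scalar sketch of the matrix product above (MatrixChannel is a hypothetical
+// helper): each of the B, G and R outputs is the dot product of the
+// (b, g, r, a) input with one 4-entry row of matrix_argb (rows 0, 1 and 2;
+// the last 4 entries are loaded but the alpha byte is simply copied through),
+// shifted right by 7 and saturated.
+static inline uint8 MatrixChannel(const int8* row4,
+                                  uint8 b, uint8 g, uint8 r, uint8 a) {
+  int v = b * row4[0] + g * row4[1] + r * row4[2] + a * row4[3];
+  v = v < 0 ? 0 : (v >> 7);           // vqshrun.s16 #7 clamps negatives to 0.
+  return (uint8)(v > 255 ? 255 : v);
+}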
+
+// TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable.
+#ifdef HAS_ARGBMULTIPLYROW_NEON
+// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
+void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ asm volatile (
+ // 8 pixel loop.
+ ".p2align 2 \n"
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ "vld4.8 {d1, d3, d5, d7}, [%1]! \n" // load 8 more ARGB pixels.
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vmull.u8 q0, d0, d1 \n" // multiply B
+ "vmull.u8 q1, d2, d3 \n" // multiply G
+ "vmull.u8 q2, d4, d5 \n" // multiply R
+ "vmull.u8 q3, d6, d7 \n" // multiply A
+ "vrshrn.u16 d0, q0, #8 \n" // 16 bit to 8 bit B
+ "vrshrn.u16 d1, q1, #8 \n" // 16 bit to 8 bit G
+ "vrshrn.u16 d2, q2, #8 \n" // 16 bit to 8 bit R
+ "vrshrn.u16 d3, q3, #8 \n" // 16 bit to 8 bit A
+ "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
+
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3"
+ );
+}
+#endif // HAS_ARGBMULTIPLYROW_NEON
+
+// Add 2 rows of ARGB pixels together, 8 pixels at a time.
+void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ asm volatile (
+ // 8 pixel loop.
+ ".p2align 2 \n"
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
+ "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB pixels.
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vqadd.u8 q0, q0, q2 \n" // add B, G
+ "vqadd.u8 q1, q1, q3 \n" // add R, A
+ "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
+
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3"
+ );
+}
+
+// Subtract 2 rows of ARGB pixels, 8 pixels at a time.
+void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ asm volatile (
+ // 8 pixel loop.
+ ".p2align 2 \n"
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
+ "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB pixels.
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vqsub.u8 q0, q0, q2 \n" // subtract B, G
+ "vqsub.u8 q1, q1, q3 \n" // subtract R, A
+ "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
+
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3"
+ );
+}
+
+// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
+// A = 255
+// R = Sobel
+// G = Sobel
+// B = Sobel
+void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
+ uint8* dst_argb, int width) {
+ asm volatile (
+ "vmov.u8 d3, #255 \n" // alpha
+ // 8 pixel loop.
+ ".p2align 2 \n"
+ "1: \n"
+ "vld1.8 {d0}, [%0]! \n" // load 8 sobelx.
+ "vld1.8 {d1}, [%1]! \n" // load 8 sobely.
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vqadd.u8 d0, d0, d1 \n" // add
+ "vmov.u8 d1, d0 \n"
+ "vmov.u8 d2, d0 \n"
+ "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
+ : "+r"(src_sobelx), // %0
+ "+r"(src_sobely), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "q0", "q1"
+ );
+}
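+
+// Scalar sketch of the packing above (SobelToARGB is a hypothetical helper):
+// the combined magnitude is a saturating add of the X and Y responses,
+// replicated into B, G and R with alpha forced to 255.
+static inline void SobelToARGB(uint8 sx, uint8 sy, uint8* dst_argb) {
+  int s = sx + sy;                           // vqadd.u8.
+  uint8 sobel = (uint8)(s > 255 ? 255 : s);
+  dst_argb[0] = sobel;  // B
+  dst_argb[1] = sobel;  // G
+  dst_argb[2] = sobel;  // R
+  dst_argb[3] = 255;    // A
+}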
+
+// Mixes Sobel X, Sobel Y and Sobel into ARGB.
+// A = 255
+// R = Sobel X
+// G = Sobel
+// B = Sobel Y
+void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
+ uint8* dst_argb, int width) {
+ asm volatile (
+ "vmov.u8 d3, #255 \n" // alpha
+ // 8 pixel loop.
+ ".p2align 2 \n"
+ "1: \n"
+ "vld1.8 {d2}, [%0]! \n" // load 8 sobelx.
+ "vld1.8 {d0}, [%1]! \n" // load 8 sobely.
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vqadd.u8 d1, d0, d2 \n" // add
+ "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
+ : "+r"(src_sobelx), // %0
+ "+r"(src_sobely), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "q0", "q1"
+ );
+}
+
+// SobelX as a matrix is
+// -1 0 1
+// -2 0 2
+// -1 0 1
+void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
+ const uint8* src_y2, uint8* dst_sobelx, int width) {
+ asm volatile (
+ ".p2align 2 \n"
+ "1: \n"
+ "vld1.8 {d0}, [%0],%5 \n" // top
+ "vld1.8 {d1}, [%0],%6 \n"
+ "vsubl.u8 q0, d0, d1 \n"
+ "vld1.8 {d2}, [%1],%5 \n" // center * 2
+ "vld1.8 {d3}, [%1],%6 \n"
+ "vsubl.u8 q1, d2, d3 \n"
+ "vadd.s16 q0, q0, q1 \n"
+ "vadd.s16 q0, q0, q1 \n"
+ "vld1.8 {d2}, [%2],%5 \n" // bottom
+ "vld1.8 {d3}, [%2],%6 \n"
+ "subs %4, %4, #8 \n" // 8 pixels
+ "vsubl.u8 q1, d2, d3 \n"
+ "vadd.s16 q0, q0, q1 \n"
+ "vabs.s16 q0, q0 \n"
+ "vqmovn.u16 d0, q0 \n"
+ "vst1.8 {d0}, [%3]! \n" // store 8 sobelx
+ "bgt 1b \n"
+ : "+r"(src_y0), // %0
+ "+r"(src_y1), // %1
+ "+r"(src_y2), // %2
+ "+r"(dst_sobelx), // %3
+ "+r"(width) // %4
+ : "r"(2), // %5
+ "r"(6) // %6
+ : "cc", "memory", "q0", "q1" // Clobber List
+ );
+}
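+
+// Scalar sketch of one SobelX output (SobelXPixel is a hypothetical helper):
+// each of the three rows contributes the difference between pixels two
+// columns apart, the center row counted twice, then the absolute value is
+// saturated to a byte.
+static inline uint8 SobelXPixel(const uint8* y0, const uint8* y1,
+                                const uint8* y2, int i) {
+  int s = (y0[i] - y0[i + 2]) + 2 * (y1[i] - y1[i + 2]) +
+          (y2[i] - y2[i + 2]);
+  if (s < 0) s = -s;                  // vabs.s16.
+  return (uint8)(s > 255 ? 255 : s);  // vqmovn.u16.
+}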
+
+// SobelY as a matrix is
+// -1 -2 -1
+// 0 0 0
+// 1 2 1
+void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
+ uint8* dst_sobely, int width) {
+ asm volatile (
+ ".p2align 2 \n"
+ "1: \n"
+ "vld1.8 {d0}, [%0],%4 \n" // left
+ "vld1.8 {d1}, [%1],%4 \n"
+ "vsubl.u8 q0, d0, d1 \n"
+ "vld1.8 {d2}, [%0],%4 \n" // center * 2
+ "vld1.8 {d3}, [%1],%4 \n"
+ "vsubl.u8 q1, d2, d3 \n"
+ "vadd.s16 q0, q0, q1 \n"
+ "vadd.s16 q0, q0, q1 \n"
+ "vld1.8 {d2}, [%0],%5 \n" // right
+ "vld1.8 {d3}, [%1],%5 \n"
+ "subs %3, %3, #8 \n" // 8 pixels
+ "vsubl.u8 q1, d2, d3 \n"
+ "vadd.s16 q0, q0, q1 \n"
+ "vabs.s16 q0, q0 \n"
+ "vqmovn.u16 d0, q0 \n"
+ "vst1.8 {d0}, [%2]! \n" // store 8 sobely
+ "bgt 1b \n"
+ : "+r"(src_y0), // %0
+ "+r"(src_y1), // %1
+ "+r"(dst_sobely), // %2
+ "+r"(width) // %3
+ : "r"(1), // %4
+ "r"(6) // %5
+ : "cc", "memory", "q0", "q1" // Clobber List
+ );
+}
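+
+// Scalar sketch of one SobelY output (SobelYPixel is a hypothetical helper):
+// column differences between the top and bottom rows, the middle column
+// counted twice, absolute-valued and saturated to a byte.
+static inline uint8 SobelYPixel(const uint8* y0, const uint8* y1, int i) {
+  int s = (y0[i] - y1[i]) + 2 * (y0[i + 1] - y1[i + 1]) +
+          (y0[i + 2] - y1[i + 2]);
+  if (s < 0) s = -s;                  // vabs.s16.
+  return (uint8)(s > 255 ? 255 : s);  // vqmovn.u16.
+}
+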
+#endif // __ARM_NEON__
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/chromium/third_party/libyuv/source/row_posix.cc b/chromium/third_party/libyuv/source/row_posix.cc
new file mode 100644
index 00000000000..b92a9f5c13b
--- /dev/null
+++ b/chromium/third_party/libyuv/source/row_posix.cc
@@ -0,0 +1,5409 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC x86 and x64
+#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
+
+// GCC 4.2 on OSX has a link error when passing static or const to inline asm.
+// TODO(fbarchard): Use static const when gcc 4.2 support is dropped.
+#ifdef __APPLE__
+#define CONST
+#else
+#define CONST static const
+#endif
+
+#ifdef HAS_ARGBTOYROW_SSSE3
+
+// Constants for ARGB
+CONST vec8 kARGBToY = {
+ 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
+};
+
+// JPEG full range.
+CONST vec8 kARGBToYJ = {
+ 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0
+};
+
+CONST vec8 kARGBToU = {
+ 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
+};
+
+CONST vec8 kARGBToUJ = {
+ 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0
+};
+
+CONST vec8 kARGBToV = {
+ -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
+};
+
+CONST vec8 kARGBToVJ = {
+ -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0
+};
+
+// Constants for BGRA
+CONST vec8 kBGRAToY = {
+ 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
+};
+
+CONST vec8 kBGRAToU = {
+ 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
+};
+
+CONST vec8 kBGRAToV = {
+ 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
+};
+
+// Constants for ABGR
+CONST vec8 kABGRToY = {
+ 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
+};
+
+CONST vec8 kABGRToU = {
+ -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
+};
+
+CONST vec8 kABGRToV = {
+ 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
+};
+
+// Constants for RGBA.
+CONST vec8 kRGBAToY = {
+ 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33
+};
+
+CONST vec8 kRGBAToU = {
+ 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38
+};
+
+CONST vec8 kRGBAToV = {
+ 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112
+};
+
+CONST uvec8 kAddY16 = {
+ 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
+};
+
+CONST vec16 kAddYJ64 = {
+ 64, 64, 64, 64, 64, 64, 64, 64
+};
+
+CONST uvec8 kAddUV128 = {
+ 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
+ 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
+};
+
+CONST uvec16 kAddUVJ128 = {
+ 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u
+};
+
+// Shuffle table for converting RGB24 to ARGB.
+CONST uvec8 kShuffleMaskRGB24ToARGB = {
+ 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
+};
+
+// Shuffle table for converting RAW to ARGB.
+CONST uvec8 kShuffleMaskRAWToARGB = {
+ 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
+};
+
+// Shuffle table for converting ARGB to RGB24.
+CONST uvec8 kShuffleMaskARGBToRGB24 = {
+ 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
+};
+
+// Shuffle table for converting ARGB to RAW.
+CONST uvec8 kShuffleMaskARGBToRAW = {
+ 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
+};
+
+// Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4
+CONST uvec8 kShuffleMaskARGBToRGB24_0 = {
+ 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u
+};
+
+// Shuffle table for converting ARGB to RAW. First 8 + next 4
+CONST uvec8 kShuffleMaskARGBToRAW_0 = {
+ 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u
+};
+
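+// Scalar sketch of how these masks are consumed (ShuffleBytes is a
+// hypothetical helper): pshufb copies source byte mask[i] to output byte i.
+// Indices with the high bit set (the 128u entries in the ARGB-to-RGB24/RAW
+// masks) produce zero lanes that are later filled from a neighboring vector,
+// while the RGB24/RAW-to-ARGB masks leave alpha lanes to be OR'ed with
+// 0xff000000.
+static void ShuffleBytes(const uint8* src, const uint8* mask, uint8* dst) {
+  for (int i = 0; i < 16; ++i) {
+    dst[i] = (mask[i] & 0x80) ? 0 : src[mask[i] & 0x0f];
+  }
+}
+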
+void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pslld $0x18,%%xmm5 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movq (%0),%%xmm0 \n"
+ "lea 0x8(%0),%0 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm0,%%xmm0 \n"
+ "punpckhwd %%xmm1,%%xmm1 \n"
+ "por %%xmm5,%%xmm0 \n"
+ "por %%xmm5,%%xmm1 \n"
+ "movdqa %%xmm0,(%1) \n"
+ "movdqa %%xmm1,0x10(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(dst_argb), // %1
+ "+r"(pix) // %2
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm5"
+#endif
+ );
+}
+
+void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb,
+ int pix) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pslld $0x18,%%xmm5 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movq (%0),%%xmm0 \n"
+ "lea 0x8(%0),%0 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm0,%%xmm0 \n"
+ "punpckhwd %%xmm1,%%xmm1 \n"
+ "por %%xmm5,%%xmm0 \n"
+ "por %%xmm5,%%xmm1 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(dst_argb), // %1
+ "+r"(pix) // %2
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm5"
+#endif
+ );
+}
+
+void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
+ "pslld $0x18,%%xmm5 \n"
+ "movdqa %3,%%xmm4 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm3 \n"
+ "lea 0x30(%0),%0 \n"
+ "movdqa %%xmm3,%%xmm2 \n"
+ "palignr $0x8,%%xmm1,%%xmm2 \n"
+ "pshufb %%xmm4,%%xmm2 \n"
+ "por %%xmm5,%%xmm2 \n"
+ "palignr $0xc,%%xmm0,%%xmm1 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "movdqa %%xmm2,0x20(%1) \n"
+ "por %%xmm5,%%xmm0 \n"
+ "pshufb %%xmm4,%%xmm1 \n"
+ "movdqa %%xmm0,(%1) \n"
+ "por %%xmm5,%%xmm1 \n"
+ "palignr $0x4,%%xmm3,%%xmm3 \n"
+ "pshufb %%xmm4,%%xmm3 \n"
+ "movdqa %%xmm1,0x10(%1) \n"
+ "por %%xmm5,%%xmm3 \n"
+ "sub $0x10,%2 \n"
+ "movdqa %%xmm3,0x30(%1) \n"
+ "lea 0x40(%1),%1 \n"
+ "jg 1b \n"
+ : "+r"(src_rgb24), // %0
+ "+r"(dst_argb), // %1
+ "+r"(pix) // %2
+ : "m"(kShuffleMaskRGB24ToARGB) // %3
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+
+void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
+ "pslld $0x18,%%xmm5 \n"
+ "movdqa %3,%%xmm4 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm3 \n"
+ "lea 0x30(%0),%0 \n"
+ "movdqa %%xmm3,%%xmm2 \n"
+ "palignr $0x8,%%xmm1,%%xmm2 \n"
+ "pshufb %%xmm4,%%xmm2 \n"
+ "por %%xmm5,%%xmm2 \n"
+ "palignr $0xc,%%xmm0,%%xmm1 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "movdqa %%xmm2,0x20(%1) \n"
+ "por %%xmm5,%%xmm0 \n"
+ "pshufb %%xmm4,%%xmm1 \n"
+ "movdqa %%xmm0,(%1) \n"
+ "por %%xmm5,%%xmm1 \n"
+ "palignr $0x4,%%xmm3,%%xmm3 \n"
+ "pshufb %%xmm4,%%xmm3 \n"
+ "movdqa %%xmm1,0x10(%1) \n"
+ "por %%xmm5,%%xmm3 \n"
+ "sub $0x10,%2 \n"
+ "movdqa %%xmm3,0x30(%1) \n"
+ "lea 0x40(%1),%1 \n"
+ "jg 1b \n"
+ : "+r"(src_raw), // %0
+ "+r"(dst_argb), // %1
+ "+r"(pix) // %2
+ : "m"(kShuffleMaskRAWToARGB) // %3
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+
+void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
+ asm volatile (
+ "mov $0x1080108,%%eax \n"
+ "movd %%eax,%%xmm5 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ "mov $0x20802080,%%eax \n"
+ "movd %%eax,%%xmm6 \n"
+ "pshufd $0x0,%%xmm6,%%xmm6 \n"
+ "pcmpeqb %%xmm3,%%xmm3 \n"
+ "psllw $0xb,%%xmm3 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psllw $0xa,%%xmm4 \n"
+ "psrlw $0x5,%%xmm4 \n"
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "psllw $0x8,%%xmm7 \n"
+ "sub %0,%1 \n"
+ "sub %0,%1 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "pand %%xmm3,%%xmm1 \n"
+ "psllw $0xb,%%xmm2 \n"
+ "pmulhuw %%xmm5,%%xmm1 \n"
+ "pmulhuw %%xmm5,%%xmm2 \n"
+ "psllw $0x8,%%xmm1 \n"
+ "por %%xmm2,%%xmm1 \n"
+ "pand %%xmm4,%%xmm0 \n"
+ "pmulhuw %%xmm6,%%xmm0 \n"
+ "por %%xmm7,%%xmm0 \n"
+ "movdqa %%xmm1,%%xmm2 \n"
+ "punpcklbw %%xmm0,%%xmm1 \n"
+ "punpckhbw %%xmm0,%%xmm2 \n"
+ "movdqa %%xmm1,(%1,%0,2) \n"
+ "movdqa %%xmm2,0x10(%1,%0,2) \n"
+ "lea 0x10(%0),%0 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(pix) // %2
+ :
+ : "memory", "cc", "eax"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+#endif
+ );
+}
+
+void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
+ asm volatile (
+ "mov $0x1080108,%%eax \n"
+ "movd %%eax,%%xmm5 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ "mov $0x42004200,%%eax \n"
+ "movd %%eax,%%xmm6 \n"
+ "pshufd $0x0,%%xmm6,%%xmm6 \n"
+ "pcmpeqb %%xmm3,%%xmm3 \n"
+ "psllw $0xb,%%xmm3 \n"
+ "movdqa %%xmm3,%%xmm4 \n"
+ "psrlw $0x6,%%xmm4 \n"
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "psllw $0x8,%%xmm7 \n"
+ "sub %0,%1 \n"
+ "sub %0,%1 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "psllw $0x1,%%xmm1 \n"
+ "psllw $0xb,%%xmm2 \n"
+ "pand %%xmm3,%%xmm1 \n"
+ "pmulhuw %%xmm5,%%xmm2 \n"
+ "pmulhuw %%xmm5,%%xmm1 \n"
+ "psllw $0x8,%%xmm1 \n"
+ "por %%xmm2,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "pand %%xmm4,%%xmm0 \n"
+ "psraw $0x8,%%xmm2 \n"
+ "pmulhuw %%xmm6,%%xmm0 \n"
+ "pand %%xmm7,%%xmm2 \n"
+ "por %%xmm2,%%xmm0 \n"
+ "movdqa %%xmm1,%%xmm2 \n"
+ "punpcklbw %%xmm0,%%xmm1 \n"
+ "punpckhbw %%xmm0,%%xmm2 \n"
+ "movdqa %%xmm1,(%1,%0,2) \n"
+ "movdqa %%xmm2,0x10(%1,%0,2) \n"
+ "lea 0x10(%0),%0 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(pix) // %2
+ :
+ : "memory", "cc", "eax"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+#endif
+ );
+}
+
+void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
+ asm volatile (
+ "mov $0xf0f0f0f,%%eax \n"
+ "movd %%eax,%%xmm4 \n"
+ "pshufd $0x0,%%xmm4,%%xmm4 \n"
+ "movdqa %%xmm4,%%xmm5 \n"
+ "pslld $0x4,%%xmm5 \n"
+ "sub %0,%1 \n"
+ "sub %0,%1 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "pand %%xmm4,%%xmm0 \n"
+ "pand %%xmm5,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "psllw $0x4,%%xmm1 \n"
+ "psrlw $0x4,%%xmm3 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "por %%xmm3,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm2,%%xmm0 \n"
+ "punpckhbw %%xmm2,%%xmm1 \n"
+ "movdqa %%xmm0,(%1,%0,2) \n"
+ "movdqa %%xmm1,0x10(%1,%0,2) \n"
+ "lea 0x10(%0),%0 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(pix) // %2
+ :
+ : "memory", "cc", "eax"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+
+void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) {
+ asm volatile (
+ "movdqa %3,%%xmm6 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "movdqa 0x20(%0),%%xmm2 \n"
+ "movdqa 0x30(%0),%%xmm3 \n"
+ "lea 0x40(%0),%0 \n"
+ "pshufb %%xmm6,%%xmm0 \n"
+ "pshufb %%xmm6,%%xmm1 \n"
+ "pshufb %%xmm6,%%xmm2 \n"
+ "pshufb %%xmm6,%%xmm3 \n"
+ "movdqa %%xmm1,%%xmm4 \n"
+ "psrldq $0x4,%%xmm1 \n"
+ "pslldq $0xc,%%xmm4 \n"
+ "movdqa %%xmm2,%%xmm5 \n"
+ "por %%xmm4,%%xmm0 \n"
+ "pslldq $0x8,%%xmm5 \n"
+ "movdqa %%xmm0,(%1) \n"
+ "por %%xmm5,%%xmm1 \n"
+ "psrldq $0x8,%%xmm2 \n"
+ "pslldq $0x4,%%xmm3 \n"
+ "por %%xmm3,%%xmm2 \n"
+ "movdqa %%xmm1,0x10(%1) \n"
+ "movdqa %%xmm2,0x20(%1) \n"
+ "lea 0x30(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(pix) // %2
+ : "m"(kShuffleMaskARGBToRGB24) // %3
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+#endif
+ );
+}
+
+void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int pix) {
+ asm volatile (
+ "movdqa %3,%%xmm6 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "movdqa 0x20(%0),%%xmm2 \n"
+ "movdqa 0x30(%0),%%xmm3 \n"
+ "lea 0x40(%0),%0 \n"
+ "pshufb %%xmm6,%%xmm0 \n"
+ "pshufb %%xmm6,%%xmm1 \n"
+ "pshufb %%xmm6,%%xmm2 \n"
+ "pshufb %%xmm6,%%xmm3 \n"
+ "movdqa %%xmm1,%%xmm4 \n"
+ "psrldq $0x4,%%xmm1 \n"
+ "pslldq $0xc,%%xmm4 \n"
+ "movdqa %%xmm2,%%xmm5 \n"
+ "por %%xmm4,%%xmm0 \n"
+ "pslldq $0x8,%%xmm5 \n"
+ "movdqa %%xmm0,(%1) \n"
+ "por %%xmm5,%%xmm1 \n"
+ "psrldq $0x8,%%xmm2 \n"
+ "pslldq $0x4,%%xmm3 \n"
+ "por %%xmm3,%%xmm2 \n"
+ "movdqa %%xmm1,0x10(%1) \n"
+ "movdqa %%xmm2,0x20(%1) \n"
+ "lea 0x30(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(pix) // %2
+ : "m"(kShuffleMaskARGBToRAW) // %3
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+#endif
+ );
+}
+
+void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int pix) {
+ asm volatile (
+ "pcmpeqb %%xmm3,%%xmm3 \n"
+ "psrld $0x1b,%%xmm3 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psrld $0x1a,%%xmm4 \n"
+ "pslld $0x5,%%xmm4 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pslld $0xb,%%xmm5 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "pslld $0x8,%%xmm0 \n"
+ "psrld $0x3,%%xmm1 \n"
+ "psrld $0x5,%%xmm2 \n"
+ "psrad $0x10,%%xmm0 \n"
+ "pand %%xmm3,%%xmm1 \n"
+ "pand %%xmm4,%%xmm2 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "por %%xmm2,%%xmm1 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "packssdw %%xmm0,%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "movq %%xmm0,(%1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(pix) // %2
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+
+void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int pix) {
+ asm volatile (
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psrld $0x1b,%%xmm4 \n"
+ "movdqa %%xmm4,%%xmm5 \n"
+ "pslld $0x5,%%xmm5 \n"
+ "movdqa %%xmm4,%%xmm6 \n"
+ "pslld $0xa,%%xmm6 \n"
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "pslld $0xf,%%xmm7 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm3 \n"
+ "psrad $0x10,%%xmm0 \n"
+ "psrld $0x3,%%xmm1 \n"
+ "psrld $0x6,%%xmm2 \n"
+ "psrld $0x9,%%xmm3 \n"
+ "pand %%xmm7,%%xmm0 \n"
+ "pand %%xmm4,%%xmm1 \n"
+ "pand %%xmm5,%%xmm2 \n"
+ "pand %%xmm6,%%xmm3 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "por %%xmm3,%%xmm2 \n"
+ "por %%xmm2,%%xmm0 \n"
+ "packssdw %%xmm0,%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "movq %%xmm0,(%1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(pix) // %2
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+#endif
+ );
+}
+
+void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) {
+ asm volatile (
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psllw $0xc,%%xmm4 \n"
+ "movdqa %%xmm4,%%xmm3 \n"
+ "psrlw $0x8,%%xmm3 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pand %%xmm3,%%xmm0 \n"
+ "pand %%xmm4,%%xmm1 \n"
+ "psrlq $0x4,%%xmm0 \n"
+ "psrlq $0x8,%%xmm1 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "movq %%xmm0,(%1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(pix) // %2
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
+#endif
+ );
+}
+
+void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+ asm volatile (
+ "movdqa %4,%%xmm5 \n"
+ "movdqa %3,%%xmm4 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "movdqa 0x20(%0),%%xmm2 \n"
+ "movdqa 0x30(%0),%%xmm3 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm3 \n"
+ "lea 0x40(%0),%0 \n"
+ "phaddw %%xmm1,%%xmm0 \n"
+ "phaddw %%xmm3,%%xmm2 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "psrlw $0x7,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "sub $0x10,%2 \n"
+ "movdqa %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ : "m"(kARGBToY), // %3
+ "m"(kAddY16) // %4
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+
+void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+ asm volatile (
+ "movdqa %3,%%xmm4 \n"
+ "movdqa %4,%%xmm5 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "movdqa 0x20(%0),%%xmm2 \n"
+ "movdqa 0x30(%0),%%xmm3 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm3 \n"
+ "lea 0x40(%0),%0 \n"
+ "phaddw %%xmm1,%%xmm0 \n"
+ "phaddw %%xmm3,%%xmm2 \n"
+ "paddw %%xmm5,%%xmm0 \n"
+ "paddw %%xmm5,%%xmm2 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "psrlw $0x7,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm0 \n"
+ "sub $0x10,%2 \n"
+ "movdqa %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ : "m"(kARGBToYJ), // %3
+ "m"(kAddYJ64) // %4
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+
+void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+ asm volatile (
+ "movdqa %4,%%xmm5 \n"
+ "movdqa %3,%%xmm4 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm3 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm3 \n"
+ "lea 0x40(%0),%0 \n"
+ "phaddw %%xmm1,%%xmm0 \n"
+ "phaddw %%xmm3,%%xmm2 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "psrlw $0x7,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "sub $0x10,%2 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ : "m"(kARGBToY), // %3
+ "m"(kAddY16) // %4
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+
+void ARGBToYJRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+ asm volatile (
+ "movdqa %3,%%xmm4 \n"
+ "movdqa %4,%%xmm5 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm3 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm3 \n"
+ "lea 0x40(%0),%0 \n"
+ "phaddw %%xmm1,%%xmm0 \n"
+ "phaddw %%xmm3,%%xmm2 \n"
+ "paddw %%xmm5,%%xmm0 \n"
+ "paddw %%xmm5,%%xmm2 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "psrlw $0x7,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm0 \n"
+ "sub $0x10,%2 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ : "m"(kARGBToYJ), // %3
+ "m"(kAddYJ64) // %4
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+
+// TODO(fbarchard): pass xmm constants to single block of assembly.
+// fpic on GCC 4.2 for OSX runs out of GPR registers. "m" effectively takes
+// 3 registers - ebx, ebp and eax. "m" can be passed with 3 normal registers,
+// or 4 if the stack frame is disabled. Doing 2 assembly blocks is a
+// workaround, but it is considered unsafe.
+void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) {
+ asm volatile (
+ "movdqa %0,%%xmm4 \n"
+ "movdqa %1,%%xmm3 \n"
+ "movdqa %2,%%xmm5 \n"
+ :
+ : "m"(kARGBToU), // %0
+ "m"(kARGBToV), // %1
+ "m"(kAddUV128) // %2
+ );
+ asm volatile (
+ "sub %1,%2 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "movdqa 0x20(%0),%%xmm2 \n"
+ "movdqa 0x30(%0),%%xmm6 \n"
+ "pavgb (%0,%4,1),%%xmm0 \n"
+ "pavgb 0x10(%0,%4,1),%%xmm1 \n"
+ "pavgb 0x20(%0,%4,1),%%xmm2 \n"
+ "pavgb 0x30(%0,%4,1),%%xmm6 \n"
+ "lea 0x40(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm7 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm7 \n"
+ "shufps $0x88,%%xmm6,%%xmm2 \n"
+ "shufps $0xdd,%%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm2,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm1 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm1 \n"
+ "packsswb %%xmm1,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "sub $0x10,%3 \n"
+ "movlps %%xmm0,(%1) \n"
+ "movhps %%xmm0,(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "jg 1b \n"
+ : "+r"(src_argb0), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+rm"(width) // %3
+ : "r"(static_cast<intptr_t>(src_stride_argb))
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
+#endif
+ );
+}
+
+// TODO(fbarchard): Share code with ARGBToUVRow_SSSE3.
+void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) {
+ asm volatile (
+ "movdqa %0,%%xmm4 \n"
+ "movdqa %1,%%xmm3 \n"
+ "movdqa %2,%%xmm5 \n"
+ :
+ : "m"(kARGBToUJ), // %0
+ "m"(kARGBToVJ), // %1
+ "m"(kAddUVJ128) // %2
+ );
+ asm volatile (
+ "sub %1,%2 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "movdqa 0x20(%0),%%xmm2 \n"
+ "movdqa 0x30(%0),%%xmm6 \n"
+ "pavgb (%0,%4,1),%%xmm0 \n"
+ "pavgb 0x10(%0,%4,1),%%xmm1 \n"
+ "pavgb 0x20(%0,%4,1),%%xmm2 \n"
+ "pavgb 0x30(%0,%4,1),%%xmm6 \n"
+ "lea 0x40(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm7 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm7 \n"
+ "shufps $0x88,%%xmm6,%%xmm2 \n"
+ "shufps $0xdd,%%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm2,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm1 \n"
+ "paddw %%xmm5,%%xmm0 \n"
+ "paddw %%xmm5,%%xmm1 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm1 \n"
+ "packsswb %%xmm1,%%xmm0 \n"
+ "sub $0x10,%3 \n"
+ "movlps %%xmm0,(%1) \n"
+ "movhps %%xmm0,(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "jg 1b \n"
+ : "+r"(src_argb0), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+rm"(width) // %3
+ : "r"(static_cast<intptr_t>(src_stride_argb))
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
+#endif
+ );
+}
+
+void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) {
+ asm volatile (
+ "movdqa %0,%%xmm4 \n"
+ "movdqa %1,%%xmm3 \n"
+ "movdqa %2,%%xmm5 \n"
+ :
+ : "m"(kARGBToU), // %0
+ "m"(kARGBToV), // %1
+ "m"(kAddUV128) // %2
+ );
+ asm volatile (
+ "sub %1,%2 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm6 \n"
+ "movdqu (%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqu 0x10(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm1 \n"
+ "movdqu 0x20(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqu 0x30(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+ "lea 0x40(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm7 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm7 \n"
+ "shufps $0x88,%%xmm6,%%xmm2 \n"
+ "shufps $0xdd,%%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm2,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm1 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm1 \n"
+ "packsswb %%xmm1,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "sub $0x10,%3 \n"
+ "movlps %%xmm0,(%1) \n"
+ "movhps %%xmm0,(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "jg 1b \n"
+ : "+r"(src_argb0), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+rm"(width) // %3
+ : "r"(static_cast<intptr_t>(src_stride_argb))
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
+#endif
+ );
+}
+
+void ARGBToUVJRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) {
+ asm volatile (
+ "movdqa %0,%%xmm4 \n"
+ "movdqa %1,%%xmm3 \n"
+ "movdqa %2,%%xmm5 \n"
+ :
+ : "m"(kARGBToUJ), // %0
+ "m"(kARGBToVJ), // %1
+ "m"(kAddUVJ128) // %2
+ );
+ asm volatile (
+ "sub %1,%2 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm6 \n"
+ "movdqu (%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqu 0x10(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm1 \n"
+ "movdqu 0x20(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqu 0x30(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+ "lea 0x40(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm7 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm7 \n"
+ "shufps $0x88,%%xmm6,%%xmm2 \n"
+ "shufps $0xdd,%%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm2,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm1 \n"
+ "paddw %%xmm5,%%xmm0 \n"
+ "paddw %%xmm5,%%xmm1 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm1 \n"
+ "packsswb %%xmm1,%%xmm0 \n"
+ "sub $0x10,%3 \n"
+ "movlps %%xmm0,(%1) \n"
+ "movhps %%xmm0,(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "jg 1b \n"
+ : "+r"(src_argb0), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+rm"(width) // %3
+ : "r"(static_cast<intptr_t>(src_stride_argb))
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
+#endif
+ );
+}
+
+void ARGBToUV444Row_SSSE3(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+ int width) {
+ asm volatile (
+ "movdqa %0,%%xmm4 \n"
+ "movdqa %1,%%xmm3 \n"
+ "movdqa %2,%%xmm5 \n"
+ :
+ : "m"(kARGBToU), // %0
+ "m"(kARGBToV), // %1
+ "m"(kAddUV128) // %2
+ );
+ asm volatile (
+ "sub %1,%2 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "movdqa 0x20(%0),%%xmm2 \n"
+ "movdqa 0x30(%0),%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm6 \n"
+ "phaddw %%xmm1,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm2 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm2 \n"
+ "packsswb %%xmm2,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "sub $0x10,%3 \n"
+ "movdqa %%xmm0,(%1) \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "movdqa 0x20(%0),%%xmm2 \n"
+ "movdqa 0x30(%0),%%xmm6 \n"
+ "pmaddubsw %%xmm3,%%xmm0 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm1,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm2 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm2 \n"
+ "packsswb %%xmm2,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "lea 0x40(%0),%0 \n"
+ "movdqa %%xmm0,(%1,%2,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+rm"(width) // %3
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm6"
+#endif
+ );
+}
+
+void ARGBToUV444Row_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_u,
+ uint8* dst_v, int width) {
+ asm volatile (
+ "movdqa %0,%%xmm4 \n"
+ "movdqa %1,%%xmm3 \n"
+ "movdqa %2,%%xmm5 \n"
+ :
+ : "m"(kARGBToU), // %0
+ "m"(kARGBToV), // %1
+ "m"(kAddUV128) // %2
+ );
+ asm volatile (
+ "sub %1,%2 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm6 \n"
+ "phaddw %%xmm1,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm2 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm2 \n"
+ "packsswb %%xmm2,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "sub $0x10,%3 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm6 \n"
+ "pmaddubsw %%xmm3,%%xmm0 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm1,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm2 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm2 \n"
+ "packsswb %%xmm2,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "lea 0x40(%0),%0 \n"
+ "movdqu %%xmm0,(%1,%2,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+rm"(width) // %3
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm6"
+#endif
+ );
+}
+
+void ARGBToUV422Row_SSSE3(const uint8* src_argb0,
+ uint8* dst_u, uint8* dst_v, int width) {
+ asm volatile (
+ "movdqa %0,%%xmm4 \n"
+ "movdqa %1,%%xmm3 \n"
+ "movdqa %2,%%xmm5 \n"
+ :
+ : "m"(kARGBToU), // %0
+ "m"(kARGBToV), // %1
+ "m"(kAddUV128) // %2
+ );
+ asm volatile (
+ "sub %1,%2 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "movdqa 0x20(%0),%%xmm2 \n"
+ "movdqa 0x30(%0),%%xmm6 \n"
+ "lea 0x40(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm7 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm7 \n"
+ "shufps $0x88,%%xmm6,%%xmm2 \n"
+ "shufps $0xdd,%%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm2,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm1 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm1 \n"
+ "packsswb %%xmm1,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "sub $0x10,%3 \n"
+ "movlps %%xmm0,(%1) \n"
+ "movhps %%xmm0,(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "jg 1b \n"
+ : "+r"(src_argb0), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+rm"(width) // %3
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
+#endif
+ );
+}
+
+void ARGBToUV422Row_Unaligned_SSSE3(const uint8* src_argb0,
+ uint8* dst_u, uint8* dst_v, int width) {
+ asm volatile (
+ "movdqa %0,%%xmm4 \n"
+ "movdqa %1,%%xmm3 \n"
+ "movdqa %2,%%xmm5 \n"
+ :
+ : "m"(kARGBToU), // %0
+ "m"(kARGBToV), // %1
+ "m"(kAddUV128) // %2
+ );
+ asm volatile (
+ "sub %1,%2 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm6 \n"
+ "lea 0x40(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm7 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm7 \n"
+ "shufps $0x88,%%xmm6,%%xmm2 \n"
+ "shufps $0xdd,%%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm2,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm1 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm1 \n"
+ "packsswb %%xmm1,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "sub $0x10,%3 \n"
+ "movlps %%xmm0,(%1) \n"
+ "movhps %%xmm0,(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "jg 1b \n"
+ : "+r"(src_argb0), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+rm"(width) // %3
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
+#endif
+ );
+}
+
+void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
+ asm volatile (
+ "movdqa %4,%%xmm5 \n"
+ "movdqa %3,%%xmm4 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "movdqa 0x20(%0),%%xmm2 \n"
+ "movdqa 0x30(%0),%%xmm3 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm3 \n"
+ "lea 0x40(%0),%0 \n"
+ "phaddw %%xmm1,%%xmm0 \n"
+ "phaddw %%xmm3,%%xmm2 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "psrlw $0x7,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "sub $0x10,%2 \n"
+ "movdqa %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "jg 1b \n"
+ : "+r"(src_bgra), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ : "m"(kBGRAToY), // %3
+ "m"(kAddY16) // %4
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+
+void BGRAToYRow_Unaligned_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
+ asm volatile (
+ "movdqa %4,%%xmm5 \n"
+ "movdqa %3,%%xmm4 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm3 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm3 \n"
+ "lea 0x40(%0),%0 \n"
+ "phaddw %%xmm1,%%xmm0 \n"
+ "phaddw %%xmm3,%%xmm2 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "psrlw $0x7,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "sub $0x10,%2 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "jg 1b \n"
+ : "+r"(src_bgra), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ : "m"(kBGRAToY), // %3
+ "m"(kAddY16) // %4
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+
+void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
+ uint8* dst_u, uint8* dst_v, int width) {
+ asm volatile (
+ "movdqa %0,%%xmm4 \n"
+ "movdqa %1,%%xmm3 \n"
+ "movdqa %2,%%xmm5 \n"
+ :
+ : "m"(kBGRAToU), // %0
+ "m"(kBGRAToV), // %1
+ "m"(kAddUV128) // %2
+ );
+ asm volatile (
+ "sub %1,%2 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "movdqa 0x20(%0),%%xmm2 \n"
+ "movdqa 0x30(%0),%%xmm6 \n"
+ "pavgb (%0,%4,1),%%xmm0 \n"
+ "pavgb 0x10(%0,%4,1),%%xmm1 \n"
+ "pavgb 0x20(%0,%4,1),%%xmm2 \n"
+ "pavgb 0x30(%0,%4,1),%%xmm6 \n"
+ "lea 0x40(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm7 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm7 \n"
+ "shufps $0x88,%%xmm6,%%xmm2 \n"
+ "shufps $0xdd,%%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm2,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm1 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm1 \n"
+ "packsswb %%xmm1,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "sub $0x10,%3 \n"
+ "movlps %%xmm0,(%1) \n"
+ "movhps %%xmm0,(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "jg 1b \n"
+ : "+r"(src_bgra0), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+rm"(width) // %3
+ : "r"(static_cast<intptr_t>(src_stride_bgra))
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
+#endif
+ );
+}
+
+void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
+ uint8* dst_u, uint8* dst_v, int width) {
+ asm volatile (
+ "movdqa %0,%%xmm4 \n"
+ "movdqa %1,%%xmm3 \n"
+ "movdqa %2,%%xmm5 \n"
+ :
+ : "m"(kBGRAToU), // %0
+ "m"(kBGRAToV), // %1
+ "m"(kAddUV128) // %2
+ );
+ asm volatile (
+ "sub %1,%2 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm6 \n"
+ "movdqu (%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqu 0x10(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm1 \n"
+ "movdqu 0x20(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqu 0x30(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+ "lea 0x40(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm7 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm7 \n"
+ "shufps $0x88,%%xmm6,%%xmm2 \n"
+ "shufps $0xdd,%%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm2,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm1 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm1 \n"
+ "packsswb %%xmm1,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "sub $0x10,%3 \n"
+ "movlps %%xmm0,(%1) \n"
+ "movhps %%xmm0,(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "jg 1b \n"
+ : "+r"(src_bgra0), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+rm"(width) // %3
+ : "r"(static_cast<intptr_t>(src_stride_bgra))
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
+#endif
+ );
+}
+
+void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
+ asm volatile (
+ "movdqa %4,%%xmm5 \n"
+ "movdqa %3,%%xmm4 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "movdqa 0x20(%0),%%xmm2 \n"
+ "movdqa 0x30(%0),%%xmm3 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm3 \n"
+ "lea 0x40(%0),%0 \n"
+ "phaddw %%xmm1,%%xmm0 \n"
+ "phaddw %%xmm3,%%xmm2 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "psrlw $0x7,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "sub $0x10,%2 \n"
+ "movdqa %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "jg 1b \n"
+ : "+r"(src_abgr), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ : "m"(kABGRToY), // %3
+ "m"(kAddY16) // %4
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+
+void ABGRToYRow_Unaligned_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
+ asm volatile (
+ "movdqa %4,%%xmm5 \n"
+ "movdqa %3,%%xmm4 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm3 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm3 \n"
+ "lea 0x40(%0),%0 \n"
+ "phaddw %%xmm1,%%xmm0 \n"
+ "phaddw %%xmm3,%%xmm2 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "psrlw $0x7,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "sub $0x10,%2 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "jg 1b \n"
+ : "+r"(src_abgr), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ : "m"(kABGRToY), // %3
+ "m"(kAddY16) // %4
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+
+void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) {
+ asm volatile (
+ "movdqa %4,%%xmm5 \n"
+ "movdqa %3,%%xmm4 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "movdqa 0x20(%0),%%xmm2 \n"
+ "movdqa 0x30(%0),%%xmm3 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm3 \n"
+ "lea 0x40(%0),%0 \n"
+ "phaddw %%xmm1,%%xmm0 \n"
+ "phaddw %%xmm3,%%xmm2 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "psrlw $0x7,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "sub $0x10,%2 \n"
+ "movdqa %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "jg 1b \n"
+ : "+r"(src_rgba), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ : "m"(kRGBAToY), // %3
+ "m"(kAddY16) // %4
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+
+void RGBAToYRow_Unaligned_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) {
+ asm volatile (
+ "movdqa %4,%%xmm5 \n"
+ "movdqa %3,%%xmm4 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm3 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm4,%%xmm3 \n"
+ "lea 0x40(%0),%0 \n"
+ "phaddw %%xmm1,%%xmm0 \n"
+ "phaddw %%xmm3,%%xmm2 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "psrlw $0x7,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "sub $0x10,%2 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "jg 1b \n"
+ : "+r"(src_rgba), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ : "m"(kRGBAToY), // %3
+ "m"(kAddY16) // %4
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+
+void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
+ uint8* dst_u, uint8* dst_v, int width) {
+ asm volatile (
+ "movdqa %0,%%xmm4 \n"
+ "movdqa %1,%%xmm3 \n"
+ "movdqa %2,%%xmm5 \n"
+ :
+ : "m"(kABGRToU), // %0
+ "m"(kABGRToV), // %1
+ "m"(kAddUV128) // %2
+ );
+ asm volatile (
+ "sub %1,%2 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "movdqa 0x20(%0),%%xmm2 \n"
+ "movdqa 0x30(%0),%%xmm6 \n"
+ "pavgb (%0,%4,1),%%xmm0 \n"
+ "pavgb 0x10(%0,%4,1),%%xmm1 \n"
+ "pavgb 0x20(%0,%4,1),%%xmm2 \n"
+ "pavgb 0x30(%0,%4,1),%%xmm6 \n"
+ "lea 0x40(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm7 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm7 \n"
+ "shufps $0x88,%%xmm6,%%xmm2 \n"
+ "shufps $0xdd,%%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm2,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm1 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm1 \n"
+ "packsswb %%xmm1,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "sub $0x10,%3 \n"
+ "movlps %%xmm0,(%1) \n"
+ "movhps %%xmm0,(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "jg 1b \n"
+ : "+r"(src_abgr0), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+rm"(width) // %3
+ : "r"(static_cast<intptr_t>(src_stride_abgr))
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
+#endif
+ );
+}
+
+void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
+ uint8* dst_u, uint8* dst_v, int width) {
+ asm volatile (
+ "movdqa %0,%%xmm4 \n"
+ "movdqa %1,%%xmm3 \n"
+ "movdqa %2,%%xmm5 \n"
+ :
+ : "m"(kABGRToU), // %0
+ "m"(kABGRToV), // %1
+ "m"(kAddUV128) // %2
+ );
+ asm volatile (
+ "sub %1,%2 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm6 \n"
+ "movdqu (%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqu 0x10(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm1 \n"
+ "movdqu 0x20(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqu 0x30(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+ "lea 0x40(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm7 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm7 \n"
+ "shufps $0x88,%%xmm6,%%xmm2 \n"
+ "shufps $0xdd,%%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm2,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm1 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm1 \n"
+ "packsswb %%xmm1,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "sub $0x10,%3 \n"
+ "movlps %%xmm0,(%1) \n"
+ "movhps %%xmm0,(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "jg 1b \n"
+ : "+r"(src_abgr0), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+rm"(width) // %3
+ : "r"(static_cast<intptr_t>(src_stride_abgr))
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
+#endif
+ );
+}
+
+void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
+ uint8* dst_u, uint8* dst_v, int width) {
+ asm volatile (
+ "movdqa %0,%%xmm4 \n"
+ "movdqa %1,%%xmm3 \n"
+ "movdqa %2,%%xmm5 \n"
+ :
+ : "m"(kRGBAToU), // %0
+ "m"(kRGBAToV), // %1
+ "m"(kAddUV128) // %2
+ );
+ asm volatile (
+ "sub %1,%2 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "movdqa 0x20(%0),%%xmm2 \n"
+ "movdqa 0x30(%0),%%xmm6 \n"
+ "pavgb (%0,%4,1),%%xmm0 \n"
+ "pavgb 0x10(%0,%4,1),%%xmm1 \n"
+ "pavgb 0x20(%0,%4,1),%%xmm2 \n"
+ "pavgb 0x30(%0,%4,1),%%xmm6 \n"
+ "lea 0x40(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm7 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm7 \n"
+ "shufps $0x88,%%xmm6,%%xmm2 \n"
+ "shufps $0xdd,%%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm2,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm1 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm1 \n"
+ "packsswb %%xmm1,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "sub $0x10,%3 \n"
+ "movlps %%xmm0,(%1) \n"
+ "movhps %%xmm0,(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "jg 1b \n"
+ : "+r"(src_rgba0), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+rm"(width) // %3
+ : "r"(static_cast<intptr_t>(src_stride_rgba))
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
+#endif
+ );
+}
+
+void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
+ uint8* dst_u, uint8* dst_v, int width) {
+ asm volatile (
+ "movdqa %0,%%xmm4 \n"
+ "movdqa %1,%%xmm3 \n"
+ "movdqa %2,%%xmm5 \n"
+ :
+ : "m"(kRGBAToU), // %0
+ "m"(kRGBAToV), // %1
+ "m"(kAddUV128) // %2
+ );
+ asm volatile (
+ "sub %1,%2 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu 0x20(%0),%%xmm2 \n"
+ "movdqu 0x30(%0),%%xmm6 \n"
+ "movdqu (%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqu 0x10(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm1 \n"
+ "movdqu 0x20(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqu 0x30(%0,%4,1),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+ "lea 0x40(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm7 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm7 \n"
+ "shufps $0x88,%%xmm6,%%xmm2 \n"
+ "shufps $0xdd,%%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm2 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
+ "phaddw %%xmm2,%%xmm0 \n"
+ "phaddw %%xmm6,%%xmm1 \n"
+ "psraw $0x8,%%xmm0 \n"
+ "psraw $0x8,%%xmm1 \n"
+ "packsswb %%xmm1,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
+ "sub $0x10,%3 \n"
+ "movlps %%xmm0,(%1) \n"
+ "movhps %%xmm0,(%1,%2,1) \n"
+ "lea 0x8(%1),%1 \n"
+ "jg 1b \n"
+ : "+r"(src_rgba0), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+rm"(width) // %3
+ : "r"(static_cast<intptr_t>(src_stride_rgba))
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
+#endif
+ );
+}
+#endif // HAS_ARGBTOYROW_SSSE3
+
+#ifdef HAS_I422TOARGBROW_SSSE3
+#define UB 127 /* 2.018 * 64 = 129 exceeds int8, saturated to 127 */
+#define UG -25 /* static_cast<int8>(-0.391 * 64 - 0.5) */
+#define UR 0
+
+#define VB 0
+#define VG -52 /* static_cast<int8>(-0.813 * 64 - 0.5) */
+#define VR 102 /* static_cast<int8>(1.596 * 64 + 0.5) */
+
+// Bias
+#define BB UB * 128 + VB * 128
+#define BG UG * 128 + VG * 128
+#define BR UR * 128 + VR * 128
+
+#define YG 74 /* static_cast<int8>(1.164 * 64 + 0.5) */
+
+struct {
+ vec8 kUVToB; // 0
+ vec8 kUVToG; // 16
+ vec8 kUVToR; // 32
+ vec16 kUVBiasB; // 48
+ vec16 kUVBiasG; // 64
+ vec16 kUVBiasR; // 80
+ vec16 kYSub16; // 96
+ vec16 kYToRgb; // 112
+ vec8 kVUToB; // 128
+ vec8 kVUToG; // 144
+ vec8 kVUToR; // 160
+} CONST SIMD_ALIGNED(kYuvConstants) = {
+ { UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB },
+ { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
+ { UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR },
+ { BB, BB, BB, BB, BB, BB, BB, BB },
+ { BG, BG, BG, BG, BG, BG, BG, BG },
+ { BR, BR, BR, BR, BR, BR, BR, BR },
+ { 16, 16, 16, 16, 16, 16, 16, 16 },
+ { YG, YG, YG, YG, YG, YG, YG, YG },
+ { VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB },
+ { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
+ { VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR }
+};
+
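+// Worked example of the fixed-point constants above (a descriptive aside):
+// the gains are scaled by 64, e.g. UG = -0.391 * 64 ~= -25 and
+// YG = 1.164 * 64 ~= 74, while UB saturates at the int8 limit of 127.
+// Each bias folds the -128 centering of U and V into one subtraction,
+// e.g. BB = UB * 128 + VB * 128 = 16256, so the blue channel becomes
+//   B = ((Y - 16) * YG + UB * U + VB * V - BB) >> 6
+// which is the usual (Y - 16) * 1.164 + (U - 128) * 2.018 in 6-bit fixed
+// point (slightly low in blue because of the UB saturation).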
+
+// Read 8 UV from 444
+#define READYUV444 \
+ "movq (%[u_buf]),%%xmm0 \n" \
+ "movq (%[u_buf],%[v_buf],1),%%xmm1 \n" \
+ "lea 0x8(%[u_buf]),%[u_buf] \n" \
+ "punpcklbw %%xmm1,%%xmm0 \n" \
+
+// Read 4 UV from 422, upsample to 8 UV
+#define READYUV422 \
+ "movd (%[u_buf]),%%xmm0 \n" \
+ "movd (%[u_buf],%[v_buf],1),%%xmm1 \n" \
+ "lea 0x4(%[u_buf]),%[u_buf] \n" \
+ "punpcklbw %%xmm1,%%xmm0 \n" \
+ "punpcklwd %%xmm0,%%xmm0 \n" \
+
+// Read 2 UV from 411, upsample to 8 UV
+#define READYUV411 \
+ "movd (%[u_buf]),%%xmm0 \n" \
+ "movd (%[u_buf],%[v_buf],1),%%xmm1 \n" \
+ "lea 0x2(%[u_buf]),%[u_buf] \n" \
+ "punpcklbw %%xmm1,%%xmm0 \n" \
+ "punpcklwd %%xmm0,%%xmm0 \n" \
+ "punpckldq %%xmm0,%%xmm0 \n" \
+
+// Read 4 UV from NV12, upsample to 8 UV
+#define READNV12 \
+ "movq (%[uv_buf]),%%xmm0 \n" \
+ "lea 0x8(%[uv_buf]),%[uv_buf] \n" \
+ "punpcklwd %%xmm0,%%xmm0 \n" \
+
+// Convert 8 pixels: 8 UV and 8 Y
+#define YUVTORGB \
+ "movdqa %%xmm0,%%xmm1 \n" \
+ "movdqa %%xmm0,%%xmm2 \n" \
+ "pmaddubsw (%[kYuvConstants]),%%xmm0 \n" \
+ "pmaddubsw 16(%[kYuvConstants]),%%xmm1 \n" \
+ "pmaddubsw 32(%[kYuvConstants]),%%xmm2 \n" \
+ "psubw 48(%[kYuvConstants]),%%xmm0 \n" \
+ "psubw 64(%[kYuvConstants]),%%xmm1 \n" \
+ "psubw 80(%[kYuvConstants]),%%xmm2 \n" \
+ "movq (%[y_buf]),%%xmm3 \n" \
+ "lea 0x8(%[y_buf]),%[y_buf] \n" \
+ "punpcklbw %%xmm4,%%xmm3 \n" \
+ "psubsw 96(%[kYuvConstants]),%%xmm3 \n" \
+ "pmullw 112(%[kYuvConstants]),%%xmm3 \n" \
+ "paddsw %%xmm3,%%xmm0 \n" \
+ "paddsw %%xmm3,%%xmm1 \n" \
+ "paddsw %%xmm3,%%xmm2 \n" \
+ "psraw $0x6,%%xmm0 \n" \
+ "psraw $0x6,%%xmm1 \n" \
+ "psraw $0x6,%%xmm2 \n" \
+ "packuswb %%xmm0,%%xmm0 \n" \
+ "packuswb %%xmm1,%%xmm1 \n" \
+ "packuswb %%xmm2,%%xmm2 \n" \
+
+// Convert 8 pixels: 8 VU and 8 Y
+#define YVUTORGB \
+ "movdqa %%xmm0,%%xmm1 \n" \
+ "movdqa %%xmm0,%%xmm2 \n" \
+ "pmaddubsw 128(%[kYuvConstants]),%%xmm0 \n" \
+ "pmaddubsw 144(%[kYuvConstants]),%%xmm1 \n" \
+ "pmaddubsw 160(%[kYuvConstants]),%%xmm2 \n" \
+ "psubw 48(%[kYuvConstants]),%%xmm0 \n" \
+ "psubw 64(%[kYuvConstants]),%%xmm1 \n" \
+ "psubw 80(%[kYuvConstants]),%%xmm2 \n" \
+ "movq (%[y_buf]),%%xmm3 \n" \
+ "lea 0x8(%[y_buf]),%[y_buf] \n" \
+ "punpcklbw %%xmm4,%%xmm3 \n" \
+ "psubsw 96(%[kYuvConstants]),%%xmm3 \n" \
+ "pmullw 112(%[kYuvConstants]),%%xmm3 \n" \
+ "paddsw %%xmm3,%%xmm0 \n" \
+ "paddsw %%xmm3,%%xmm1 \n" \
+ "paddsw %%xmm3,%%xmm2 \n" \
+ "psraw $0x6,%%xmm0 \n" \
+ "psraw $0x6,%%xmm1 \n" \
+ "psraw $0x6,%%xmm2 \n" \
+ "packuswb %%xmm0,%%xmm0 \n" \
+ "packuswb %%xmm1,%%xmm1 \n" \
+ "packuswb %%xmm2,%%xmm2 \n" \
+
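+// Minimal scalar sketch of what READYUV* + YUVTORGB compute per pixel, using
+// the constants above (illustration only; these helper names are not part of
+// the library).  The SIMD macros do the same math for 8 pixels at once and
+// clamp with packuswb.
+static inline uint8 ClampToByteSketch(int v) {
+  return static_cast<uint8>(v < 0 ? 0 : (v > 255 ? 255 : v));
+}
+static inline void YuvToRgbPixelSketch(uint8 y, uint8 u, uint8 v,
+                                       uint8* b, uint8* g, uint8* r) {
+  int y1 = (static_cast<int>(y) - 16) * YG;  // luma term, 6-bit fixed point
+  *b = ClampToByteSketch((y1 + UB * (u - 128)) >> 6);
+  *g = ClampToByteSketch((y1 + UG * (u - 128) + VG * (v - 128)) >> 6);
+  *r = ClampToByteSketch((y1 + VR * (v - 128)) >> 6);
+}
+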
+void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_argb,
+ int width) {
+ asm volatile (
+ "sub %[u_buf],%[v_buf] \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pxor %%xmm4,%%xmm4 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ READYUV444
+ YUVTORGB
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm2,%%xmm0 \n"
+ "punpckhwd %%xmm2,%%xmm1 \n"
+ "movdqa %%xmm0,(%[dst_argb]) \n"
+ "movdqa %%xmm1,0x10(%[dst_argb]) \n"
+ "lea 0x20(%[dst_argb]),%[dst_argb] \n"
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+
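+// Descriptive aside on the store step above (a sketch, not library code):
+// after YUVTORGB, xmm0 holds 8 B bytes, xmm1 8 G bytes, xmm2 8 R bytes and
+// xmm5 is 0xff alpha.  The punpcklbw/punpcklwd pairs weave them into B,G,R,A
+// byte order in memory, i.e. little-endian ARGB:
+static inline void StoreARGBPixelSketch(uint8* dst_argb, uint8 b, uint8 g,
+                                        uint8 r) {
+  dst_argb[0] = b;
+  dst_argb[1] = g;
+  dst_argb[2] = r;
+  dst_argb[3] = 0xff;  // alpha from pcmpeqb %%xmm5,%%xmm5
+}
+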
+void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_rgb24,
+ int width) {
+// With fPIC, 32-bit gcc 4.2 on OSX runs out of GPR regs, so load the shuffle masks in a separate asm block.
+#if defined(__i386__)
+ asm volatile (
+ "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
+ "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n"
+ :: [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
+ [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24));
+#endif
+
+ asm volatile (
+#if !defined(__i386__)
+ "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
+ "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n"
+#endif
+ "sub %[u_buf],%[v_buf] \n"
+ "pxor %%xmm4,%%xmm4 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ READYUV422
+ YUVTORGB
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "punpcklbw %%xmm2,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm2,%%xmm0 \n"
+ "punpckhwd %%xmm2,%%xmm1 \n"
+ "pshufb %%xmm5,%%xmm0 \n"
+ "pshufb %%xmm6,%%xmm1 \n"
+ "palignr $0xc,%%xmm0,%%xmm1 \n"
+ "movq %%xmm0,(%[dst_rgb24]) \n"
+ "movdqu %%xmm1,0x8(%[dst_rgb24]) \n"
+ "lea 0x18(%[dst_rgb24]),%[dst_rgb24] \n"
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_rgb24]"+r"(dst_rgb24), // %[dst_rgb24]
+ [width]"+rm"(width) // %[width]
+ : [kYuvConstants]"r"(&kYuvConstants.kUVToB)
+#if !defined(__i386__)
+ , [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
+ [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24)
+#endif
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+#endif
+ );
+}
+
+void OMITFP I422ToRAWRow_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_raw,
+ int width) {
+// With fPIC, 32-bit gcc 4.2 on OSX runs out of GPR regs, so load the shuffle masks in a separate asm block.
+#if defined(__i386__)
+ asm volatile (
+ "movdqa %[kShuffleMaskARGBToRAW_0],%%xmm5 \n"
+ "movdqa %[kShuffleMaskARGBToRAW],%%xmm6 \n"
+ :: [kShuffleMaskARGBToRAW_0]"m"(kShuffleMaskARGBToRAW_0),
+ [kShuffleMaskARGBToRAW]"m"(kShuffleMaskARGBToRAW));
+#endif
+
+ asm volatile (
+#if !defined(__i386__)
+ "movdqa %[kShuffleMaskARGBToRAW_0],%%xmm5 \n"
+ "movdqa %[kShuffleMaskARGBToRAW],%%xmm6 \n"
+#endif
+ "sub %[u_buf],%[v_buf] \n"
+ "pxor %%xmm4,%%xmm4 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ READYUV422
+ YUVTORGB
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "punpcklbw %%xmm2,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm2,%%xmm0 \n"
+ "punpckhwd %%xmm2,%%xmm1 \n"
+ "pshufb %%xmm5,%%xmm0 \n"
+ "pshufb %%xmm6,%%xmm1 \n"
+ "palignr $0xc,%%xmm0,%%xmm1 \n"
+ "movq %%xmm0,(%[dst_raw]) \n"
+ "movdqu %%xmm1,0x8(%[dst_raw]) \n"
+ "lea 0x18(%[dst_raw]),%[dst_raw] \n"
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_raw]"+r"(dst_raw), // %[dst_raw]
+ [width]"+rm"(width) // %[width]
+ : [kYuvConstants]"r"(&kYuvConstants.kUVToB)
+#if !defined(__i386__)
+ , [kShuffleMaskARGBToRAW_0]"m"(kShuffleMaskARGBToRAW_0),
+ [kShuffleMaskARGBToRAW]"m"(kShuffleMaskARGBToRAW)
+#endif
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+#endif
+ );
+}
+
+void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_argb,
+ int width) {
+ asm volatile (
+ "sub %[u_buf],%[v_buf] \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pxor %%xmm4,%%xmm4 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ READYUV422
+ YUVTORGB
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm2,%%xmm0 \n"
+ "punpckhwd %%xmm2,%%xmm1 \n"
+ "movdqa %%xmm0,(%[dst_argb]) \n"
+ "movdqa %%xmm1,0x10(%[dst_argb]) \n"
+ "lea 0x20(%[dst_argb]),%[dst_argb] \n"
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+
+void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_argb,
+ int width) {
+ asm volatile (
+ "sub %[u_buf],%[v_buf] \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pxor %%xmm4,%%xmm4 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ READYUV411
+ YUVTORGB
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm2,%%xmm0 \n"
+ "punpckhwd %%xmm2,%%xmm1 \n"
+ "movdqa %%xmm0,(%[dst_argb]) \n"
+ "movdqa %%xmm1,0x10(%[dst_argb]) \n"
+ "lea 0x20(%[dst_argb]),%[dst_argb] \n"
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+
+void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf,
+ const uint8* uv_buf,
+ uint8* dst_argb,
+ int width) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pxor %%xmm4,%%xmm4 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ READNV12
+ YUVTORGB
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm2,%%xmm0 \n"
+ "punpckhwd %%xmm2,%%xmm1 \n"
+ "movdqa %%xmm0,(%[dst_argb]) \n"
+ "movdqa %%xmm1,0x10(%[dst_argb]) \n"
+ "lea 0x20(%[dst_argb]),%[dst_argb] \n"
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [uv_buf]"+r"(uv_buf), // %[uv_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+
+void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf,
+ const uint8* uv_buf,
+ uint8* dst_argb,
+ int width) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pxor %%xmm4,%%xmm4 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ READNV12
+ YVUTORGB
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm2,%%xmm0 \n"
+ "punpckhwd %%xmm2,%%xmm1 \n"
+ "movdqa %%xmm0,(%[dst_argb]) \n"
+ "movdqa %%xmm1,0x10(%[dst_argb]) \n"
+ "lea 0x20(%[dst_argb]),%[dst_argb] \n"
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [uv_buf]"+r"(uv_buf), // %[uv_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+
+void OMITFP I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_argb,
+ int width) {
+ asm volatile (
+ "sub %[u_buf],%[v_buf] \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pxor %%xmm4,%%xmm4 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ READYUV444
+ YUVTORGB
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm2,%%xmm0 \n"
+ "punpckhwd %%xmm2,%%xmm1 \n"
+ "movdqu %%xmm0,(%[dst_argb]) \n"
+ "movdqu %%xmm1,0x10(%[dst_argb]) \n"
+ "lea 0x20(%[dst_argb]),%[dst_argb] \n"
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+
+void OMITFP I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_argb,
+ int width) {
+ asm volatile (
+ "sub %[u_buf],%[v_buf] \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pxor %%xmm4,%%xmm4 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ READYUV422
+ YUVTORGB
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm2,%%xmm0 \n"
+ "punpckhwd %%xmm2,%%xmm1 \n"
+ "movdqu %%xmm0,(%[dst_argb]) \n"
+ "movdqu %%xmm1,0x10(%[dst_argb]) \n"
+ "lea 0x20(%[dst_argb]),%[dst_argb] \n"
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+
+void OMITFP I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_argb,
+ int width) {
+ asm volatile (
+ "sub %[u_buf],%[v_buf] \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pxor %%xmm4,%%xmm4 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ READYUV411
+ YUVTORGB
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm2,%%xmm0 \n"
+ "punpckhwd %%xmm2,%%xmm1 \n"
+ "movdqu %%xmm0,(%[dst_argb]) \n"
+ "movdqu %%xmm1,0x10(%[dst_argb]) \n"
+ "lea 0x20(%[dst_argb]),%[dst_argb] \n"
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+
+void OMITFP NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
+ const uint8* uv_buf,
+ uint8* dst_argb,
+ int width) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pxor %%xmm4,%%xmm4 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ READNV12
+ YUVTORGB
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm2,%%xmm0 \n"
+ "punpckhwd %%xmm2,%%xmm1 \n"
+ "movdqu %%xmm0,(%[dst_argb]) \n"
+ "movdqu %%xmm1,0x10(%[dst_argb]) \n"
+ "lea 0x20(%[dst_argb]),%[dst_argb] \n"
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [uv_buf]"+r"(uv_buf), // %[uv_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+
+void OMITFP NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
+ const uint8* uv_buf,
+ uint8* dst_argb,
+ int width) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pxor %%xmm4,%%xmm4 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ READNV12
+ YVUTORGB
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm2,%%xmm0 \n"
+ "punpckhwd %%xmm2,%%xmm1 \n"
+ "movdqu %%xmm0,(%[dst_argb]) \n"
+ "movdqu %%xmm1,0x10(%[dst_argb]) \n"
+ "lea 0x20(%[dst_argb]),%[dst_argb] \n"
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [uv_buf]"+r"(uv_buf), // %[uv_buf]
+ [dst_argb]"+r"(dst_argb), // %[dst_argb]
+ [width]"+rm"(width) // %[width]
+ : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+
+void OMITFP I422ToBGRARow_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_bgra,
+ int width) {
+ asm volatile (
+ "sub %[u_buf],%[v_buf] \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pxor %%xmm4,%%xmm4 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ READYUV422
+ YUVTORGB
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "punpcklbw %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm2,%%xmm5 \n"
+ "movdqa %%xmm5,%%xmm0 \n"
+ "punpcklwd %%xmm1,%%xmm5 \n"
+ "punpckhwd %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm5,(%[dst_bgra]) \n"
+ "movdqa %%xmm0,0x10(%[dst_bgra]) \n"
+ "lea 0x20(%[dst_bgra]),%[dst_bgra] \n"
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_bgra]"+r"(dst_bgra), // %[dst_bgra]
+ [width]"+rm"(width) // %[width]
+ : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+
+void OMITFP I422ToABGRRow_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_abgr,
+ int width) {
+ asm volatile (
+ "sub %[u_buf],%[v_buf] \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pxor %%xmm4,%%xmm4 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ READYUV422
+ YUVTORGB
+ "punpcklbw %%xmm1,%%xmm2 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm1 \n"
+ "punpcklwd %%xmm0,%%xmm2 \n"
+ "punpckhwd %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,(%[dst_abgr]) \n"
+ "movdqa %%xmm1,0x10(%[dst_abgr]) \n"
+ "lea 0x20(%[dst_abgr]),%[dst_abgr] \n"
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_abgr]"+r"(dst_abgr), // %[dst_abgr]
+ [width]"+rm"(width) // %[width]
+ : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+
+void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_rgba,
+ int width) {
+ asm volatile (
+ "sub %[u_buf],%[v_buf] \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pxor %%xmm4,%%xmm4 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ READYUV422
+ YUVTORGB
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "punpcklbw %%xmm2,%%xmm1 \n"
+ "punpcklbw %%xmm0,%%xmm5 \n"
+ "movdqa %%xmm5,%%xmm0 \n"
+ "punpcklwd %%xmm1,%%xmm5 \n"
+ "punpckhwd %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm5,(%[dst_rgba]) \n"
+ "movdqa %%xmm0,0x10(%[dst_rgba]) \n"
+ "lea 0x20(%[dst_rgba]),%[dst_rgba] \n"
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_rgba]"+r"(dst_rgba), // %[dst_rgba]
+ [width]"+rm"(width) // %[width]
+ : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+
+void OMITFP I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_bgra,
+ int width) {
+ asm volatile (
+ "sub %[u_buf],%[v_buf] \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pxor %%xmm4,%%xmm4 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ READYUV422
+ YUVTORGB
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "punpcklbw %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm2,%%xmm5 \n"
+ "movdqa %%xmm5,%%xmm0 \n"
+ "punpcklwd %%xmm1,%%xmm5 \n"
+ "punpckhwd %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm5,(%[dst_bgra]) \n"
+ "movdqu %%xmm0,0x10(%[dst_bgra]) \n"
+ "lea 0x20(%[dst_bgra]),%[dst_bgra] \n"
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_bgra]"+r"(dst_bgra), // %[dst_bgra]
+ [width]"+rm"(width) // %[width]
+ : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+
+void OMITFP I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_abgr,
+ int width) {
+ asm volatile (
+ "sub %[u_buf],%[v_buf] \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pxor %%xmm4,%%xmm4 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ READYUV422
+ YUVTORGB
+ "punpcklbw %%xmm1,%%xmm2 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm1 \n"
+ "punpcklwd %%xmm0,%%xmm2 \n"
+ "punpckhwd %%xmm0,%%xmm1 \n"
+ "movdqu %%xmm2,(%[dst_abgr]) \n"
+ "movdqu %%xmm1,0x10(%[dst_abgr]) \n"
+ "lea 0x20(%[dst_abgr]),%[dst_abgr] \n"
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_abgr]"+r"(dst_abgr), // %[dst_abgr]
+ [width]"+rm"(width) // %[width]
+ : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+
+void OMITFP I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_rgba,
+ int width) {
+ asm volatile (
+ "sub %[u_buf],%[v_buf] \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pxor %%xmm4,%%xmm4 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ READYUV422
+ YUVTORGB
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "punpcklbw %%xmm2,%%xmm1 \n"
+ "punpcklbw %%xmm0,%%xmm5 \n"
+ "movdqa %%xmm5,%%xmm0 \n"
+ "punpcklwd %%xmm1,%%xmm5 \n"
+ "punpckhwd %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm5,(%[dst_rgba]) \n"
+ "movdqa %%xmm0,0x10(%[dst_rgba]) \n"
+ "lea 0x20(%[dst_rgba]),%[dst_rgba] \n"
+ "sub $0x8,%[width] \n"
+ "jg 1b \n"
+ : [y_buf]"+r"(y_buf), // %[y_buf]
+ [u_buf]"+r"(u_buf), // %[u_buf]
+ [v_buf]"+r"(v_buf), // %[v_buf]
+ [dst_rgba]"+r"(dst_rgba), // %[dst_rgba]
+ [width]"+rm"(width) // %[width]
+ : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+
+#endif // HAS_I422TOARGBROW_SSSE3
+
+#ifdef HAS_YTOARGBROW_SSE2
+void YToARGBRow_SSE2(const uint8* y_buf,
+ uint8* dst_argb,
+ int width) {
+ asm volatile (
+ "pxor %%xmm5,%%xmm5 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "pslld $0x18,%%xmm4 \n"
+ "mov $0x00100010,%%eax \n"
+ "movd %%eax,%%xmm3 \n"
+ "pshufd $0x0,%%xmm3,%%xmm3 \n"
+ "mov $0x004a004a,%%eax \n"
+ "movd %%eax,%%xmm2 \n"
+ "pshufd $0x0,%%xmm2,%%xmm2 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
+ "movq (%0),%%xmm0 \n"
+ "lea 0x8(%0),%0 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "psubusw %%xmm3,%%xmm0 \n"
+ "pmullw %%xmm2,%%xmm0 \n"
+ "psrlw $6, %%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+
+ // Step 2: Weave into ARGB
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm0,%%xmm0 \n"
+ "punpckhwd %%xmm1,%%xmm1 \n"
+ "por %%xmm4,%%xmm0 \n"
+ "por %%xmm4,%%xmm1 \n"
+ "movdqa %%xmm0,(%1) \n"
+ "movdqa %%xmm1,16(%1) \n"
+ "lea 32(%1),%1 \n"
+
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(y_buf), // %0
+ "+r"(dst_argb), // %1
+ "+rm"(width) // %2
+ :
+ : "memory", "cc", "eax"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
+#endif
+ );
+}
+#endif // HAS_YTOARGBROW_SSE2
+
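+// Scalar sketch of YToARGBRow (illustration only; helper name is made up):
+// each luma sample becomes a grey ARGB pixel, matching the two asm steps
+// above.
+static inline void YToARGBPixelSketch(uint8 y, uint8* dst_argb) {
+  int y1 = y < 16 ? 0 : y - 16;    // psubusw saturates at zero
+  int g = (y1 * 74) >> 6;          // 74 = 0x4a ~= 1.164 * 64
+  if (g > 255) g = 255;            // packuswb saturates at 255
+  dst_argb[0] = dst_argb[1] = dst_argb[2] = static_cast<uint8>(g);
+  dst_argb[3] = 0xff;              // alpha set by the por %%xmm4
+}
+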
+#ifdef HAS_MIRRORROW_SSSE3
+// Shuffle table for reversing the bytes.
+CONST uvec8 kShuffleMirror = {
+ 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
+};
+
+void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
+ intptr_t temp_width = static_cast<intptr_t>(width);
+ asm volatile (
+ "movdqa %3,%%xmm5 \n"
+ "lea -0x10(%0),%0 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0,%2),%%xmm0 \n"
+ "pshufb %%xmm5,%%xmm0 \n"
+ "sub $0x10,%2 \n"
+ "movdqa %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(temp_width) // %2
+ : "m"(kShuffleMirror) // %3
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm5"
+#endif
+ );
+}
+#endif // HAS_MIRRORROW_SSSE3
+
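+// Scalar sketch of MirrorRow (illustration only): reverse the byte order of
+// a row.  The SSSE3 version above walks the source backwards 16 bytes at a
+// time and reverses each block with pshufb and kShuffleMirror.
+static void MirrorRow_ScalarSketch(const uint8* src, uint8* dst, int width) {
+  for (int x = 0; x < width; ++x) {
+    dst[x] = src[width - 1 - x];
+  }
+}
+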
+#ifdef HAS_MIRRORROW_SSE2
+void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
+ intptr_t temp_width = static_cast<intptr_t>(width);
+ asm volatile (
+ "lea -0x10(%0),%0 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqu (%0,%2),%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "psllw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "pshuflw $0x1b,%%xmm0,%%xmm0 \n"
+ "pshufhw $0x1b,%%xmm0,%%xmm0 \n"
+ "pshufd $0x4e,%%xmm0,%%xmm0 \n"
+ "sub $0x10,%2 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(temp_width) // %2
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1"
+#endif
+ );
+}
+#endif // HAS_MIRRORROW_SSE2
+
+#ifdef HAS_MIRRORROW_UV_SSSE3
+// Shuffle table for reversing the bytes of UV channels.
+CONST uvec8 kShuffleMirrorUV = {
+ 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
+};
+void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
+ int width) {
+ intptr_t temp_width = static_cast<intptr_t>(width);
+ asm volatile (
+ "movdqa %4,%%xmm1 \n"
+ "lea -16(%0,%3,2),%0 \n"
+ "sub %1,%2 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "lea -16(%0),%0 \n"
+ "pshufb %%xmm1,%%xmm0 \n"
+ "sub $8,%3 \n"
+ "movlpd %%xmm0,(%1) \n"
+ "movhpd %%xmm0,(%1,%2) \n"
+ "lea 8(%1),%1 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(temp_width) // %3
+ : "m"(kShuffleMirrorUV) // %4
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1"
+#endif
+ );
+}
+#endif // HAS_MIRRORROW_UV_SSSE3
+
+#ifdef HAS_ARGBMIRRORROW_SSSE3
+// Shuffle table for reversing whole ARGB pixels (4 bytes at a time, byte order within each pixel preserved).
+CONST uvec8 kARGBShuffleMirror = {
+ 12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u, 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u
+};
+
+void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
+ intptr_t temp_width = static_cast<intptr_t>(width);
+ asm volatile (
+ "movdqa %3,%%xmm5 \n"
+ "lea -0x10(%0),%0 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0,%2,4),%%xmm0 \n"
+ "pshufb %%xmm5,%%xmm0 \n"
+ "sub $0x4,%2 \n"
+ "movdqa %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(temp_width) // %2
+ : "m"(kARGBShuffleMirror) // %3
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm5"
+#endif
+ );
+}
+#endif // HAS_ARGBMIRRORROW_SSSE3
+
+#ifdef HAS_SPLITUVROW_SSE2
+void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ "sub %1,%2 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "psrlw $0x8,%%xmm2 \n"
+ "psrlw $0x8,%%xmm3 \n"
+ "packuswb %%xmm3,%%xmm2 \n"
+ "movdqa %%xmm0,(%1) \n"
+ "movdqa %%xmm2,(%1,%2) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(pix) // %3
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+#endif
+ );
+}
+
+void SplitUVRow_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+ int pix) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ "sub %1,%2 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "psrlw $0x8,%%xmm2 \n"
+ "psrlw $0x8,%%xmm3 \n"
+ "packuswb %%xmm3,%%xmm2 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "movdqu %%xmm2,(%1,%2) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(pix) // %3
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+#endif
+ );
+}
+#endif // HAS_SPLITUVROW_SSE2
+
+#ifdef HAS_MERGEUVROW_SSE2
+void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+ int width) {
+ asm volatile (
+ "sub %0,%1 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa (%0,%1,1),%%xmm1 \n"
+ "lea 0x10(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "punpckhbw %%xmm1,%%xmm2 \n"
+ "movdqa %%xmm0,(%2) \n"
+ "movdqa %%xmm2,0x10(%2) \n"
+ "lea 0x20(%2),%2 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_u), // %0
+ "+r"(src_v), // %1
+ "+r"(dst_uv), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2"
+#endif
+ );
+}
+
+void MergeUVRow_Unaligned_SSE2(const uint8* src_u, const uint8* src_v,
+ uint8* dst_uv, int width) {
+ asm volatile (
+ "sub %0,%1 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu (%0,%1,1),%%xmm1 \n"
+ "lea 0x10(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "punpckhbw %%xmm1,%%xmm2 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "movdqu %%xmm2,0x10(%2) \n"
+ "lea 0x20(%2),%2 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_u), // %0
+ "+r"(src_v), // %1
+ "+r"(dst_uv), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2"
+#endif
+ );
+}
+#endif // HAS_MERGEUVROW_SSE2
+
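+// Scalar sketches of SplitUVRow / MergeUVRow (illustration only): convert
+// between interleaved UV (as in NV12) and planar U and V.
+static void SplitUVRow_ScalarSketch(const uint8* src_uv, uint8* dst_u,
+                                    uint8* dst_v, int pix) {
+  for (int x = 0; x < pix; ++x) {
+    dst_u[x] = src_uv[2 * x + 0];
+    dst_v[x] = src_uv[2 * x + 1];
+  }
+}
+static void MergeUVRow_ScalarSketch(const uint8* src_u, const uint8* src_v,
+                                    uint8* dst_uv, int width) {
+  for (int x = 0; x < width; ++x) {
+    dst_uv[2 * x + 0] = src_u[x];
+    dst_uv[2 * x + 1] = src_v[x];
+  }
+}
+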
+#ifdef HAS_COPYROW_SSE2
+void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
+ asm volatile (
+ "sub %0,%1 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "movdqa %%xmm0,(%0,%1) \n"
+ "movdqa %%xmm1,0x10(%0,%1) \n"
+ "lea 0x20(%0),%0 \n"
+ "sub $0x20,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(count) // %2
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1"
+#endif
+ );
+}
+#endif // HAS_COPYROW_SSE2
+
+#ifdef HAS_COPYROW_X86
+void CopyRow_X86(const uint8* src, uint8* dst, int width) {
+ size_t width_tmp = static_cast<size_t>(width);
+ asm volatile (
+ "shr $0x2,%2 \n"
+ "rep movsl \n"
+ : "+S"(src), // %0
+ "+D"(dst), // %1
+ "+c"(width_tmp) // %2
+ :
+ : "memory", "cc"
+ );
+}
+#endif // HAS_COPYROW_X86
+
+// Handles unaligned pointers and any width (multiple of 1).
+void CopyRow_ERMS(const uint8* src, uint8* dst, int width) {
+ size_t width_tmp = static_cast<size_t>(width);
+ asm volatile (
+ "rep movsb \n"
+ : "+S"(src), // %0
+ "+D"(dst), // %1
+ "+c"(width_tmp) // %2
+ :
+ : "memory", "cc"
+ );
+}
+
+#ifdef HAS_SETROW_X86
+void SetRow_X86(uint8* dst, uint32 v32, int width) {
+ size_t width_tmp = static_cast<size_t>(width);
+ asm volatile (
+ "shr $0x2,%1 \n"
+ "rep stosl \n"
+ : "+D"(dst), // %0
+ "+c"(width_tmp) // %1
+ : "a"(v32) // %2
+ : "memory", "cc");
+}
+
+void ARGBSetRows_X86(uint8* dst, uint32 v32, int width,
+ int dst_stride, int height) {
+ for (int y = 0; y < height; ++y) {
+ size_t width_tmp = static_cast<size_t>(width);
+ uint32* d = reinterpret_cast<uint32*>(dst);
+ asm volatile (
+ "rep stosl \n"
+ : "+D"(d), // %0
+ "+c"(width_tmp) // %1
+ : "a"(v32) // %2
+ : "memory", "cc");
+ dst += dst_stride;
+ }
+}
+#endif // HAS_SETROW_X86
+
+#ifdef HAS_YUY2TOYROW_SSE2
+void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm5"
+#endif
+ );
+}
+
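+// Scalar sketch of the packed 4:2:2 extraction (illustration only).
+// YUY2 memory order is Y0,U0,Y1,V0,...; UYVY is U0,Y0,V0,Y1,...  The SSE2
+// rows here mask (pand) or shift (psrlw) out the wanted bytes 16 pixels at
+// a time.
+static void YUY2ToYRow_ScalarSketch(const uint8* src_yuy2, uint8* dst_y,
+                                    int pix) {
+  for (int x = 0; x < pix; ++x) {
+    dst_y[x] = src_yuy2[2 * x];  // luma is in the even bytes for YUY2
+  }
+}
+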
+void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ "sub %1,%2 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "movdqa (%0,%4,1),%%xmm2 \n"
+ "movdqa 0x10(%0,%4,1),%%xmm3 \n"
+ "lea 0x20(%0),%0 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "pavgb %%xmm3,%%xmm1 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "movq %%xmm0,(%1) \n"
+ "movq %%xmm1,(%1,%2) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(pix) // %3
+ : "r"(static_cast<intptr_t>(stride_yuy2)) // %4
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+#endif
+ );
+}
+
+void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ "sub %1,%2 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "movq %%xmm0,(%1) \n"
+ "movq %%xmm1,(%1,%2) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(pix) // %3
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm5"
+#endif
+ );
+}
+
+void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
+ uint8* dst_y, int pix) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "sub $0x10,%2 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "jg 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm5"
+#endif
+ );
+}
+
+void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2,
+ int stride_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ "sub %1,%2 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu (%0,%4,1),%%xmm2 \n"
+ "movdqu 0x10(%0,%4,1),%%xmm3 \n"
+ "lea 0x20(%0),%0 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "pavgb %%xmm3,%%xmm1 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "movq %%xmm0,(%1) \n"
+ "movq %%xmm1,(%1,%2) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(pix) // %3
+ : "r"(static_cast<intptr_t>(stride_yuy2)) // %4
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+#endif
+ );
+}
+
+void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ "sub %1,%2 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "movq %%xmm0,(%1) \n"
+ "movq %%xmm1,(%1,%2) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(pix) // %3
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm5"
+#endif
+ );
+}
+
+void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) {
+ asm volatile (
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "sub $0x10,%2 \n"
+ "movdqa %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "jg 1b \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1"
+#endif
+ );
+}
+
+void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ "sub %1,%2 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "movdqa (%0,%4,1),%%xmm2 \n"
+ "movdqa 0x10(%0,%4,1),%%xmm3 \n"
+ "lea 0x20(%0),%0 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "pavgb %%xmm3,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "movq %%xmm0,(%1) \n"
+ "movq %%xmm1,(%1,%2) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(pix) // %3
+ : "r"(static_cast<intptr_t>(stride_uyvy)) // %4
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+#endif
+ );
+}
+
+void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ "sub %1,%2 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "movq %%xmm0,(%1) \n"
+ "movq %%xmm1,(%1,%2) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(pix) // %3
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm5"
+#endif
+ );
+}
+
+void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
+ uint8* dst_y, int pix) {
+ asm volatile (
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "sub $0x10,%2 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "jg 1b \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1"
+#endif
+ );
+}
+
+void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ "sub %1,%2 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu (%0,%4,1),%%xmm2 \n"
+ "movdqu 0x10(%0,%4,1),%%xmm3 \n"
+ "lea 0x20(%0),%0 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "pavgb %%xmm3,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "movq %%xmm0,(%1) \n"
+ "movq %%xmm1,(%1,%2) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(pix) // %3
+ : "r"(static_cast<intptr_t>(stride_uyvy)) // %4
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+#endif
+ );
+}
+
+void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ "sub %1,%2 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "movq %%xmm0,(%1) \n"
+ "movq %%xmm1,(%1,%2) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x10,%3 \n"
+ "jg 1b \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(pix) // %3
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm5"
+#endif
+ );
+}
+#endif // HAS_YUY2TOYROW_SSE2
+
+#ifdef HAS_ARGBBLENDROW_SSE2
+// Blend 4 pixels at a time.
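+// A scalar sketch of the "src over dst" math used below, for reference only
+// (the helper name is illustrative, not a library function; the SIMD loop
+// approximates the divide by 256 with packed shifts and saturating adds):
+static void ARGBBlendRow_Reference_C(const uint8* src_argb0,
+                                     const uint8* src_argb1,
+                                     uint8* dst_argb, int width) {
+  for (int i = 0; i < width; ++i) {
+    const uint32 a = src_argb0[3];
+    for (int c = 0; c < 3; ++c) {
+      // Foreground plus background weighted by the inverse foreground alpha.
+      uint32 v = src_argb0[c] + ((src_argb1[c] * (256 - a)) >> 8);
+      dst_argb[c] = static_cast<uint8>(v > 255 ? 255 : v);
+    }
+    dst_argb[3] = 255;  // Result alpha saturates to opaque.
+    src_argb0 += 4;
+    src_argb1 += 4;
+    dst_argb += 4;
+  }
+}
+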
+void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ asm volatile (
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "psrlw $0xf,%%xmm7 \n"
+ "pcmpeqb %%xmm6,%%xmm6 \n"
+ "psrlw $0x8,%%xmm6 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psllw $0x8,%%xmm5 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "pslld $0x18,%%xmm4 \n"
+ "sub $0x1,%3 \n"
+ "je 91f \n"
+ "jl 99f \n"
+
+ // 1 pixel loop until destination pointer is aligned.
+ "10: \n"
+ "test $0xf,%2 \n"
+ "je 19f \n"
+ "movd (%0),%%xmm3 \n"
+ "lea 0x4(%0),%0 \n"
+ "movdqa %%xmm3,%%xmm0 \n"
+ "pxor %%xmm4,%%xmm3 \n"
+ "movd (%1),%%xmm2 \n"
+ "psrlw $0x8,%%xmm3 \n"
+ "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
+ "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
+ "pand %%xmm6,%%xmm2 \n"
+ "paddw %%xmm7,%%xmm3 \n"
+ "pmullw %%xmm3,%%xmm2 \n"
+ "movd (%1),%%xmm1 \n"
+ "lea 0x4(%1),%1 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "por %%xmm4,%%xmm0 \n"
+ "pmullw %%xmm3,%%xmm1 \n"
+ "psrlw $0x8,%%xmm2 \n"
+ "paddusb %%xmm2,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "sub $0x1,%3 \n"
+ "movd %%xmm0,(%2) \n"
+ "lea 0x4(%2),%2 \n"
+ "jge 10b \n"
+
+ "19: \n"
+ "add $1-4,%3 \n"
+ "jl 49f \n"
+
+ // 4 pixel loop.
+ ".p2align 2 \n"
+ "41: \n"
+ "movdqu (%0),%%xmm3 \n"
+ "lea 0x10(%0),%0 \n"
+ "movdqa %%xmm3,%%xmm0 \n"
+ "pxor %%xmm4,%%xmm3 \n"
+ "movdqu (%1),%%xmm2 \n"
+ "psrlw $0x8,%%xmm3 \n"
+ "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
+ "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
+ "pand %%xmm6,%%xmm2 \n"
+ "paddw %%xmm7,%%xmm3 \n"
+ "pmullw %%xmm3,%%xmm2 \n"
+ "movdqu (%1),%%xmm1 \n"
+ "lea 0x10(%1),%1 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "por %%xmm4,%%xmm0 \n"
+ "pmullw %%xmm3,%%xmm1 \n"
+ "psrlw $0x8,%%xmm2 \n"
+ "paddusb %%xmm2,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "sub $0x4,%3 \n"
+ "movdqa %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "jge 41b \n"
+
+ "49: \n"
+ "add $0x3,%3 \n"
+ "jl 99f \n"
+
+ // 1 pixel loop.
+ "91: \n"
+ "movd (%0),%%xmm3 \n"
+ "lea 0x4(%0),%0 \n"
+ "movdqa %%xmm3,%%xmm0 \n"
+ "pxor %%xmm4,%%xmm3 \n"
+ "movd (%1),%%xmm2 \n"
+ "psrlw $0x8,%%xmm3 \n"
+ "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
+ "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
+ "pand %%xmm6,%%xmm2 \n"
+ "paddw %%xmm7,%%xmm3 \n"
+ "pmullw %%xmm3,%%xmm2 \n"
+ "movd (%1),%%xmm1 \n"
+ "lea 0x4(%1),%1 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "por %%xmm4,%%xmm0 \n"
+ "pmullw %%xmm3,%%xmm1 \n"
+ "psrlw $0x8,%%xmm2 \n"
+ "paddusb %%xmm2,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "sub $0x1,%3 \n"
+ "movd %%xmm0,(%2) \n"
+ "lea 0x4(%2),%2 \n"
+ "jge 91b \n"
+ "99: \n"
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+#endif
+ );
+}
+#endif // HAS_ARGBBLENDROW_SSE2
+
+#ifdef HAS_ARGBBLENDROW_SSSE3
+// Shuffle table for isolating alpha.
+CONST uvec8 kShuffleAlpha = {
+ 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
+ 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
+};
+
+// Blend 4 pixels at a time.
+
+// Same as SSE2, but replaces
+// psrlw xmm3, 8 // alpha
+// pshufhw xmm3, xmm3,0F5h // 8 alpha words
+// pshuflw xmm3, xmm3,0F5h
+// with:
+// pshufb xmm3, kShuffleAlpha // alpha
+
+void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ asm volatile (
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "psrlw $0xf,%%xmm7 \n"
+ "pcmpeqb %%xmm6,%%xmm6 \n"
+ "psrlw $0x8,%%xmm6 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psllw $0x8,%%xmm5 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "pslld $0x18,%%xmm4 \n"
+ "sub $0x1,%3 \n"
+ "je 91f \n"
+ "jl 99f \n"
+
+ // 1 pixel loop until destination pointer is aligned.
+ "10: \n"
+ "test $0xf,%2 \n"
+ "je 19f \n"
+ "movd (%0),%%xmm3 \n"
+ "lea 0x4(%0),%0 \n"
+ "movdqa %%xmm3,%%xmm0 \n"
+ "pxor %%xmm4,%%xmm3 \n"
+ "movd (%1),%%xmm2 \n"
+ "pshufb %4,%%xmm3 \n"
+ "pand %%xmm6,%%xmm2 \n"
+ "paddw %%xmm7,%%xmm3 \n"
+ "pmullw %%xmm3,%%xmm2 \n"
+ "movd (%1),%%xmm1 \n"
+ "lea 0x4(%1),%1 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "por %%xmm4,%%xmm0 \n"
+ "pmullw %%xmm3,%%xmm1 \n"
+ "psrlw $0x8,%%xmm2 \n"
+ "paddusb %%xmm2,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "sub $0x1,%3 \n"
+ "movd %%xmm0,(%2) \n"
+ "lea 0x4(%2),%2 \n"
+ "jge 10b \n"
+
+ "19: \n"
+ "add $1-4,%3 \n"
+ "jl 49f \n"
+ "test $0xf,%0 \n"
+ "jne 41f \n"
+ "test $0xf,%1 \n"
+ "jne 41f \n"
+
+ // 4 pixel loop.
+ ".p2align 2 \n"
+ "40: \n"
+ "movdqa (%0),%%xmm3 \n"
+ "lea 0x10(%0),%0 \n"
+ "movdqa %%xmm3,%%xmm0 \n"
+ "pxor %%xmm4,%%xmm3 \n"
+ "movdqa (%1),%%xmm2 \n"
+ "pshufb %4,%%xmm3 \n"
+ "pand %%xmm6,%%xmm2 \n"
+ "paddw %%xmm7,%%xmm3 \n"
+ "pmullw %%xmm3,%%xmm2 \n"
+ "movdqa (%1),%%xmm1 \n"
+ "lea 0x10(%1),%1 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "por %%xmm4,%%xmm0 \n"
+ "pmullw %%xmm3,%%xmm1 \n"
+ "psrlw $0x8,%%xmm2 \n"
+ "paddusb %%xmm2,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "sub $0x4,%3 \n"
+ "movdqa %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "jge 40b \n"
+ "jmp 49f \n"
+
+ // 4 pixel unaligned loop.
+ ".p2align 2 \n"
+ "41: \n"
+ "movdqu (%0),%%xmm3 \n"
+ "lea 0x10(%0),%0 \n"
+ "movdqa %%xmm3,%%xmm0 \n"
+ "pxor %%xmm4,%%xmm3 \n"
+ "movdqu (%1),%%xmm2 \n"
+ "pshufb %4,%%xmm3 \n"
+ "pand %%xmm6,%%xmm2 \n"
+ "paddw %%xmm7,%%xmm3 \n"
+ "pmullw %%xmm3,%%xmm2 \n"
+ "movdqu (%1),%%xmm1 \n"
+ "lea 0x10(%1),%1 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "por %%xmm4,%%xmm0 \n"
+ "pmullw %%xmm3,%%xmm1 \n"
+ "psrlw $0x8,%%xmm2 \n"
+ "paddusb %%xmm2,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "sub $0x4,%3 \n"
+ "movdqa %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "jge 41b \n"
+
+ "49: \n"
+ "add $0x3,%3 \n"
+ "jl 99f \n"
+
+ // 1 pixel loop.
+ "91: \n"
+ "movd (%0),%%xmm3 \n"
+ "lea 0x4(%0),%0 \n"
+ "movdqa %%xmm3,%%xmm0 \n"
+ "pxor %%xmm4,%%xmm3 \n"
+ "movd (%1),%%xmm2 \n"
+ "pshufb %4,%%xmm3 \n"
+ "pand %%xmm6,%%xmm2 \n"
+ "paddw %%xmm7,%%xmm3 \n"
+ "pmullw %%xmm3,%%xmm2 \n"
+ "movd (%1),%%xmm1 \n"
+ "lea 0x4(%1),%1 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "por %%xmm4,%%xmm0 \n"
+ "pmullw %%xmm3,%%xmm1 \n"
+ "psrlw $0x8,%%xmm2 \n"
+ "paddusb %%xmm2,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "sub $0x1,%3 \n"
+ "movd %%xmm0,(%2) \n"
+ "lea 0x4(%2),%2 \n"
+ "jge 91b \n"
+ "99: \n"
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ : "m"(kShuffleAlpha) // %4
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+#endif
+ );
+}
+#endif // HAS_ARGBBLENDROW_SSSE3
+
+#ifdef HAS_ARGBATTENUATEROW_SSE2
+// Attenuate 4 pixels at a time.
+// aligned to 16 bytes
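+// Scalar sketch of the attenuate (premultiply) step, for reference only
+// (illustrative helper name; the SIMD loop below approximates the divide by
+// 255 with byte replication and shifts, so results may differ by one):
+static void ARGBAttenuateRow_Reference_C(const uint8* src_argb,
+                                         uint8* dst_argb, int width) {
+  for (int i = 0; i < width; ++i) {
+    const uint32 a = src_argb[3];
+    dst_argb[0] = static_cast<uint8>(src_argb[0] * a / 255);  // B
+    dst_argb[1] = static_cast<uint8>(src_argb[1] * a / 255);  // G
+    dst_argb[2] = static_cast<uint8>(src_argb[2] * a / 255);  // R
+    dst_argb[3] = static_cast<uint8>(a);                      // A unchanged.
+    src_argb += 4;
+    dst_argb += 4;
+  }
+}
+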
+void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
+ asm volatile (
+ "sub %0,%1 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "pslld $0x18,%%xmm4 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrld $0x8,%%xmm5 \n"
+
+ // 4 pixel loop.
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "pshufhw $0xff,%%xmm0,%%xmm2 \n"
+ "pshuflw $0xff,%%xmm2,%%xmm2 \n"
+ "pmulhuw %%xmm2,%%xmm0 \n"
+ "movdqa (%0),%%xmm1 \n"
+ "punpckhbw %%xmm1,%%xmm1 \n"
+ "pshufhw $0xff,%%xmm1,%%xmm2 \n"
+ "pshuflw $0xff,%%xmm2,%%xmm2 \n"
+ "pmulhuw %%xmm2,%%xmm1 \n"
+ "movdqa (%0),%%xmm2 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "pand %%xmm4,%%xmm2 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "por %%xmm2,%%xmm0 \n"
+ "sub $0x4,%2 \n"
+ "movdqa %%xmm0,(%0,%1,1) \n"
+ "lea 0x10(%0),%0 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+#endif // HAS_ARGBATTENUATEROW_SSE2
+
+#ifdef HAS_ARGBATTENUATEROW_SSSE3
+// Shuffle table duplicating alpha
+CONST uvec8 kShuffleAlpha0 = {
+ 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
+};
+CONST uvec8 kShuffleAlpha1 = {
+ 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
+ 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
+};
+// Attenuate 4 pixels at a time.
+// aligned to 16 bytes
+void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
+ asm volatile (
+ "sub %0,%1 \n"
+ "pcmpeqb %%xmm3,%%xmm3 \n"
+ "pslld $0x18,%%xmm3 \n"
+ "movdqa %3,%%xmm4 \n"
+ "movdqa %4,%%xmm5 \n"
+
+ // 4 pixel loop.
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "movdqa (%0),%%xmm1 \n"
+ "punpcklbw %%xmm1,%%xmm1 \n"
+ "pmulhuw %%xmm1,%%xmm0 \n"
+ "movdqa (%0),%%xmm1 \n"
+ "pshufb %%xmm5,%%xmm1 \n"
+ "movdqa (%0),%%xmm2 \n"
+ "punpckhbw %%xmm2,%%xmm2 \n"
+ "pmulhuw %%xmm2,%%xmm1 \n"
+ "movdqa (%0),%%xmm2 \n"
+ "pand %%xmm3,%%xmm2 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "por %%xmm2,%%xmm0 \n"
+ "sub $0x4,%2 \n"
+ "movdqa %%xmm0,(%0,%1,1) \n"
+ "lea 0x10(%0),%0 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "m"(kShuffleAlpha0), // %3
+ "m"(kShuffleAlpha1) // %4
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+#endif // HAS_ARGBATTENUATEROW_SSSE3
+
+#ifdef HAS_ARGBUNATTENUATEROW_SSE2
+// Unattenuate 4 pixels at a time.
+// aligned to 16 bytes
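+// Scalar sketch of the unattenuate step, for reference only (illustrative
+// helper name; the SIMD loop below multiplies by entries of the
+// fixed_invtbl8 reciprocal table instead of dividing, so rounding and the
+// a == 0 case may differ slightly from this sketch):
+static void ARGBUnattenuateRow_Reference_C(const uint8* src_argb,
+                                           uint8* dst_argb, int width) {
+  for (int i = 0; i < width; ++i) {
+    const uint32 a = src_argb[3];
+    for (int c = 0; c < 3; ++c) {
+      uint32 v = a ? (src_argb[c] * 255 + a / 2) / a : src_argb[c];
+      dst_argb[c] = static_cast<uint8>(v > 255 ? 255 : v);
+    }
+    dst_argb[3] = static_cast<uint8>(a);
+    src_argb += 4;
+    dst_argb += 4;
+  }
+}
+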
+void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
+ int width) {
+ uintptr_t alpha = 0;
+ asm volatile (
+ "sub %0,%1 \n"
+
+ // 4 pixel loop.
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movzb 0x3(%0),%3 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "movd 0x0(%4,%3,4),%%xmm2 \n"
+ "movzb 0x7(%0),%3 \n"
+ "movd 0x0(%4,%3,4),%%xmm3 \n"
+ "pshuflw $0x40,%%xmm2,%%xmm2 \n"
+ "pshuflw $0x40,%%xmm3,%%xmm3 \n"
+ "movlhps %%xmm3,%%xmm2 \n"
+ "pmulhuw %%xmm2,%%xmm0 \n"
+ "movdqa (%0),%%xmm1 \n"
+ "movzb 0xb(%0),%3 \n"
+ "punpckhbw %%xmm1,%%xmm1 \n"
+ "movd 0x0(%4,%3,4),%%xmm2 \n"
+ "movzb 0xf(%0),%3 \n"
+ "movd 0x0(%4,%3,4),%%xmm3 \n"
+ "pshuflw $0x40,%%xmm2,%%xmm2 \n"
+ "pshuflw $0x40,%%xmm3,%%xmm3 \n"
+ "movlhps %%xmm3,%%xmm2 \n"
+ "pmulhuw %%xmm2,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "sub $0x4,%2 \n"
+ "movdqa %%xmm0,(%0,%1,1) \n"
+ "lea 0x10(%0),%0 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width), // %2
+ "+r"(alpha) // %3
+ : "r"(fixed_invtbl8) // %4
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+#endif // HAS_ARGBUNATTENUATEROW_SSE2
+
+#ifdef HAS_ARGBGRAYROW_SSSE3
+// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels.
+void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
+ asm volatile (
+ "movdqa %3,%%xmm4 \n"
+ "movdqa %4,%%xmm5 \n"
+ "sub %0,%1 \n"
+
+ // 8 pixel loop.
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm0 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "phaddw %%xmm1,%%xmm0 \n"
+ "paddw %%xmm5,%%xmm0 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movdqa (%0),%%xmm2 \n"
+ "movdqa 0x10(%0),%%xmm3 \n"
+ "psrld $0x18,%%xmm2 \n"
+ "psrld $0x18,%%xmm3 \n"
+ "packuswb %%xmm3,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm3 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "punpcklbw %%xmm2,%%xmm3 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm3,%%xmm0 \n"
+ "punpckhwd %%xmm3,%%xmm1 \n"
+ "sub $0x8,%2 \n"
+ "movdqa %%xmm0,(%0,%1,1) \n"
+ "movdqa %%xmm1,0x10(%0,%1,1) \n"
+ "lea 0x20(%0),%0 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "m"(kARGBToYJ), // %3
+ "m"(kAddYJ64) // %4
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+#endif // HAS_ARGBGRAYROW_SSSE3
+
+#ifdef HAS_ARGBSEPIAROW_SSSE3
+// b = (r * 35 + g * 68 + b * 17) >> 7
+// g = (r * 45 + g * 88 + b * 22) >> 7
+// r = (r * 50 + g * 98 + b * 24) >> 7
+// Constant for ARGB color to sepia tone
+CONST vec8 kARGBToSepiaB = {
+ 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
+};
+
+CONST vec8 kARGBToSepiaG = {
+ 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
+};
+
+CONST vec8 kARGBToSepiaR = {
+ 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
+};
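+
+// Scalar sketch of the sepia formulas above, for reference only (illustrative
+// helper name; the SSSE3 loop below evaluates the same weights with
+// pmaddubsw and a 7-bit shift, clamping each channel on store):
+static void ARGBSepiaRow_Reference_C(uint8* dst_argb, int width) {
+  for (int i = 0; i < width; ++i) {
+    const int b = dst_argb[0], g = dst_argb[1], r = dst_argb[2];
+    const int sb = (r * 35 + g * 68 + b * 17) >> 7;
+    const int sg = (r * 45 + g * 88 + b * 22) >> 7;
+    const int sr = (r * 50 + g * 98 + b * 24) >> 7;
+    dst_argb[0] = static_cast<uint8>(sb > 255 ? 255 : sb);
+    dst_argb[1] = static_cast<uint8>(sg > 255 ? 255 : sg);
+    dst_argb[2] = static_cast<uint8>(sr > 255 ? 255 : sr);
+    dst_argb += 4;  // Alpha is left untouched.
+  }
+}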
+
+// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
+void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
+ asm volatile (
+ "movdqa %2,%%xmm2 \n"
+ "movdqa %3,%%xmm3 \n"
+ "movdqa %4,%%xmm4 \n"
+
+ // 8 pixel loop.
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm6 \n"
+ "pmaddubsw %%xmm2,%%xmm0 \n"
+ "pmaddubsw %%xmm2,%%xmm6 \n"
+ "phaddw %%xmm6,%%xmm0 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movdqa (%0),%%xmm5 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm5 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "phaddw %%xmm1,%%xmm5 \n"
+ "psrlw $0x7,%%xmm5 \n"
+ "packuswb %%xmm5,%%xmm5 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "movdqa (%0),%%xmm5 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm5 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "phaddw %%xmm1,%%xmm5 \n"
+ "psrlw $0x7,%%xmm5 \n"
+ "packuswb %%xmm5,%%xmm5 \n"
+ "movdqa (%0),%%xmm6 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "psrld $0x18,%%xmm6 \n"
+ "psrld $0x18,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "punpcklbw %%xmm6,%%xmm5 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklwd %%xmm5,%%xmm0 \n"
+ "punpckhwd %%xmm5,%%xmm1 \n"
+ "sub $0x8,%1 \n"
+ "movdqa %%xmm0,(%0) \n"
+ "movdqa %%xmm1,0x10(%0) \n"
+ "lea 0x20(%0),%0 \n"
+ "jg 1b \n"
+ : "+r"(dst_argb), // %0
+ "+r"(width) // %1
+ : "m"(kARGBToSepiaB), // %2
+ "m"(kARGBToSepiaG), // %3
+ "m"(kARGBToSepiaR) // %4
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+#endif
+ );
+}
+#endif // HAS_ARGBSEPIAROW_SSSE3
+
+#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
+// Transform 8 ARGB pixels (32 bytes) with color matrix.
+// Same as Sepia except matrix is provided.
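+// Per pixel the transform below amounts to (illustrative reading of the
+// assembly; matrix_argb supplies three groups of four signed coefficients):
+//   B' = clamp255((b*m[0] + g*m[1] + r*m[2]  + a*m[3])  >> 7)
+//   G' = clamp255((b*m[4] + g*m[5] + r*m[6]  + a*m[7])  >> 7)
+//   R' = clamp255((b*m[8] + g*m[9] + r*m[10] + a*m[11]) >> 7)
+//   A' = a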
+void ARGBColorMatrixRow_SSSE3(uint8* dst_argb, const int8* matrix_argb,
+ int width) {
+ asm volatile (
+ "movd (%2),%%xmm2 \n"
+ "movd 0x4(%2),%%xmm3 \n"
+ "movd 0x8(%2),%%xmm4 \n"
+ "pshufd $0x0,%%xmm2,%%xmm2 \n"
+ "pshufd $0x0,%%xmm3,%%xmm3 \n"
+ "pshufd $0x0,%%xmm4,%%xmm4 \n"
+
+ // 8 pixel loop.
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm6 \n"
+ "pmaddubsw %%xmm2,%%xmm0 \n"
+ "pmaddubsw %%xmm2,%%xmm6 \n"
+ "movdqa (%0),%%xmm5 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm5 \n"
+ "pmaddubsw %%xmm3,%%xmm1 \n"
+ "phaddsw %%xmm6,%%xmm0 \n"
+ "phaddsw %%xmm1,%%xmm5 \n"
+ "psraw $0x7,%%xmm0 \n"
+ "psraw $0x7,%%xmm5 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "packuswb %%xmm5,%%xmm5 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "movdqa (%0),%%xmm5 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "pmaddubsw %%xmm4,%%xmm5 \n"
+ "pmaddubsw %%xmm4,%%xmm1 \n"
+ "phaddsw %%xmm1,%%xmm5 \n"
+ "psraw $0x7,%%xmm5 \n"
+ "packuswb %%xmm5,%%xmm5 \n"
+ "movdqa (%0),%%xmm6 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "psrld $0x18,%%xmm6 \n"
+ "psrld $0x18,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm6,%%xmm5 \n"
+ "punpcklwd %%xmm5,%%xmm0 \n"
+ "punpckhwd %%xmm5,%%xmm1 \n"
+ "sub $0x8,%1 \n"
+ "movdqa %%xmm0,(%0) \n"
+ "movdqa %%xmm1,0x10(%0) \n"
+ "lea 0x20(%0),%0 \n"
+ "jg 1b \n"
+ : "+r"(dst_argb), // %0
+ "+r"(width) // %1
+ : "r"(matrix_argb) // %2
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+#endif
+ );
+}
+#endif // HAS_ARGBCOLORMATRIXROW_SSSE3
+
+#ifdef HAS_ARGBQUANTIZEROW_SSE2
+// Quantize 4 ARGB pixels (16 bytes).
+// aligned to 16 bytes
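+// Per channel the quantization below amounts to (illustrative reading of the
+// assembly; scale acts as a 16.16 fixed-point multiplier and alpha is kept):
+//   v' = ((v * scale) >> 16) * interval_size + interval_offset, clamped to 255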
+void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
+ int interval_offset, int width) {
+ asm volatile (
+ "movd %2,%%xmm2 \n"
+ "movd %3,%%xmm3 \n"
+ "movd %4,%%xmm4 \n"
+ "pshuflw $0x40,%%xmm2,%%xmm2 \n"
+ "pshufd $0x44,%%xmm2,%%xmm2 \n"
+ "pshuflw $0x40,%%xmm3,%%xmm3 \n"
+ "pshufd $0x44,%%xmm3,%%xmm3 \n"
+ "pshuflw $0x40,%%xmm4,%%xmm4 \n"
+ "pshufd $0x44,%%xmm4,%%xmm4 \n"
+ "pxor %%xmm5,%%xmm5 \n"
+ "pcmpeqb %%xmm6,%%xmm6 \n"
+ "pslld $0x18,%%xmm6 \n"
+
+ // 4 pixel loop.
+ ".p2align 2 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "pmulhuw %%xmm2,%%xmm0 \n"
+ "movdqa (%0),%%xmm1 \n"
+ "punpckhbw %%xmm5,%%xmm1 \n"
+ "pmulhuw %%xmm2,%%xmm1 \n"
+ "pmullw %%xmm3,%%xmm0 \n"
+ "movdqa (%0),%%xmm7 \n"
+ "pmullw %%xmm3,%%xmm1 \n"
+ "pand %%xmm6,%%xmm7 \n"
+ "paddw %%xmm4,%%xmm0 \n"
+ "paddw %%xmm4,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "por %%xmm7,%%xmm0 \n"
+ "sub $0x4,%1 \n"
+ "movdqa %%xmm0,(%0) \n"
+ "lea 0x10(%0),%0 \n"
+ "jg 1b \n"
+ : "+r"(dst_argb), // %0
+ "+r"(width) // %1
+ : "r"(scale), // %2
+ "r"(interval_size), // %3
+ "r"(interval_offset) // %4
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+#endif
+ );
+}
+#endif // HAS_ARGBQUANTIZEROW_SSE2
+
+#ifdef HAS_ARGBSHADEROW_SSE2
+// Shade 4 pixels at a time by specified value.
+// Aligned to 16 bytes.
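+// Per channel the shade below is roughly (illustrative; value is a packed
+// ARGB scale and the byte-replication trick approximates the divide by 255):
+//   dst_c = src_c * value_c / 255, applied to B, G, R and A alike.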
+void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
+ uint32 value) {
+ asm volatile (
+ "movd %3,%%xmm2 \n"
+ "sub %0,%1 \n"
+ "punpcklbw %%xmm2,%%xmm2 \n"
+ "punpcklqdq %%xmm2,%%xmm2 \n"
+
+ // 4 pixel loop.
+ ".p2align 2 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "punpckhbw %%xmm1,%%xmm1 \n"
+ "pmulhuw %%xmm2,%%xmm0 \n"
+ "pmulhuw %%xmm2,%%xmm1 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "sub $0x4,%2 \n"
+ "movdqa %%xmm0,(%0,%1,1) \n"
+ "lea 0x10(%0),%0 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(value) // %3
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2"
+#endif
+ );
+}
+#endif // HAS_ARGBSHADEROW_SSE2
+
+#ifdef HAS_ARGBMULTIPLYROW_SSE2
+// Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
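+// Per channel the multiply below is roughly (illustrative; pmulhuw on
+// replicated bytes approximates the divide by 255, so results may differ by
+// one from an exactly rounded product):
+//   dst_c = src_argb0_c * src_argb1_c / 255, for B, G, R and A.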
+void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ asm volatile (
+ "pxor %%xmm5,%%xmm5 \n"
+ "sub %0,%1 \n"
+ "sub %0,%2 \n"
+
+ // 4 pixel loop.
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu (%0,%1),%%xmm2 \n"
+ "movdqu %%xmm0,%%xmm1 \n"
+ "movdqu %%xmm2,%%xmm3 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "punpckhbw %%xmm1,%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "punpckhbw %%xmm5,%%xmm3 \n"
+ "pmulhuw %%xmm2,%%xmm0 \n"
+ "pmulhuw %%xmm3,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "sub $0x4,%3 \n"
+ "movdqu %%xmm0,(%0,%2,1) \n"
+ "lea 0x10(%0),%0 \n"
+ "jg 1b \n"
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+#endif
+ );
+}
+#endif // HAS_ARGBMULTIPLYROW_SSE2
+
+#ifdef HAS_ARGBADDROW_SSE2
+// Add 2 rows of ARGB pixels together, 4 pixels at a time.
+void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ asm volatile (
+ "sub %0,%1 \n"
+ "sub %0,%2 \n"
+
+ // 4 pixel loop.
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu (%0,%1),%%xmm1 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "sub $0x4,%3 \n"
+ "movdqu %%xmm0,(%0,%2,1) \n"
+ "lea 0x10(%0),%0 \n"
+ "jg 1b \n"
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1"
+#endif
+ );
+}
+#endif // HAS_ARGBADDROW_SSE2
+
+#ifdef HAS_ARGBSUBTRACTROW_SSE2
+// Subtract 2 rows of ARGB pixels, 4 pixels at a time.
+void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ asm volatile (
+ "sub %0,%1 \n"
+ "sub %0,%2 \n"
+
+ // 4 pixel loop.
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu (%0,%1),%%xmm1 \n"
+ "psubusb %%xmm1,%%xmm0 \n"
+ "sub $0x4,%3 \n"
+ "movdqu %%xmm0,(%0,%2,1) \n"
+ "lea 0x10(%0),%0 \n"
+ "jg 1b \n"
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1"
+#endif
+ );
+}
+#endif // HAS_ARGBSUBTRACTROW_SSE2
+
+#ifdef HAS_SOBELXROW_SSSE3
+// SobelX as a matrix is
+// -1 0 1
+// -2 0 2
+// -1 0 1
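+// Scalar sketch of the taps above, for reference only (illustrative helper
+// name; the SSSE3 loop below computes the same sum eight pixels at a time):
+static void SobelXRow_Reference_C(const uint8* src_y0, const uint8* src_y1,
+                                  const uint8* src_y2, uint8* dst_sobelx,
+                                  int width) {
+  for (int i = 0; i < width; ++i) {
+    const int a = src_y0[i] - src_y0[i + 2];
+    const int b = src_y1[i] - src_y1[i + 2];
+    const int c = src_y2[i] - src_y2[i + 2];
+    int sobel = a + b * 2 + c;
+    if (sobel < 0) sobel = -sobel;
+    dst_sobelx[i] = static_cast<uint8>(sobel > 255 ? 255 : sobel);
+  }
+}
+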
+void SobelXRow_SSSE3(const uint8* src_y0, const uint8* src_y1,
+ const uint8* src_y2, uint8* dst_sobelx, int width) {
+ asm volatile (
+ "sub %0,%1 \n"
+ "sub %0,%2 \n"
+ "sub %0,%3 \n"
+ "pxor %%xmm5,%%xmm5 \n"
+
+ // 8 pixel loop.
+ ".p2align 4 \n"
+ "1: \n"
+ "movq (%0),%%xmm0 \n"
+ "movq 0x2(%0),%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "punpcklbw %%xmm5,%%xmm1 \n"
+ "psubw %%xmm1,%%xmm0 \n"
+ "movq (%0,%1,1),%%xmm1 \n"
+ "movq 0x2(%0,%1,1),%%xmm2 \n"
+ "punpcklbw %%xmm5,%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "psubw %%xmm2,%%xmm1 \n"
+ "movq (%0,%2,1),%%xmm2 \n"
+ "movq 0x2(%0,%2,1),%%xmm3 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "punpcklbw %%xmm5,%%xmm3 \n"
+ "psubw %%xmm3,%%xmm2 \n"
+ "paddw %%xmm2,%%xmm0 \n"
+ "paddw %%xmm1,%%xmm0 \n"
+ "paddw %%xmm1,%%xmm0 \n"
+ "pabsw %%xmm0,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "sub $0x8,%4 \n"
+ "movq %%xmm0,(%0,%3,1) \n"
+ "lea 0x8(%0),%0 \n"
+ "jg 1b \n"
+ : "+r"(src_y0), // %0
+ "+r"(src_y1), // %1
+ "+r"(src_y2), // %2
+ "+r"(dst_sobelx), // %3
+ "+r"(width) // %4
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+#endif
+ );
+}
+#endif // HAS_SOBELXROW_SSSE3
+
+#ifdef HAS_SOBELYROW_SSSE3
+// SobelY as a matrix is
+// -1 -2 -1
+// 0 0 0
+// 1 2 1
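+// Scalar reading of the taps above (illustrative; mirrors SobelX but
+// differences the two input rows across three columns):
+//   dst_sobely[i] = clamp255(|(y0[i]   - y1[i]) +
+//                          2*(y0[i+1] - y1[i+1]) +
+//                            (y0[i+2] - y1[i+2])|)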
+void SobelYRow_SSSE3(const uint8* src_y0, const uint8* src_y1,
+ uint8* dst_sobely, int width) {
+ asm volatile (
+ "sub %0,%1 \n"
+ "sub %0,%2 \n"
+ "pxor %%xmm5,%%xmm5 \n"
+
+ // 8 pixel loop.
+ ".p2align 4 \n"
+ "1: \n"
+ "movq (%0),%%xmm0 \n"
+ "movq (%0,%1,1),%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "punpcklbw %%xmm5,%%xmm1 \n"
+ "psubw %%xmm1,%%xmm0 \n"
+ "movq 0x1(%0),%%xmm1 \n"
+ "movq 0x1(%0,%1,1),%%xmm2 \n"
+ "punpcklbw %%xmm5,%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "psubw %%xmm2,%%xmm1 \n"
+ "movq 0x2(%0),%%xmm2 \n"
+ "movq 0x2(%0,%1,1),%%xmm3 \n"
+ "punpcklbw %%xmm5,%%xmm2 \n"
+ "punpcklbw %%xmm5,%%xmm3 \n"
+ "psubw %%xmm3,%%xmm2 \n"
+ "paddw %%xmm2,%%xmm0 \n"
+ "paddw %%xmm1,%%xmm0 \n"
+ "paddw %%xmm1,%%xmm0 \n"
+ "pabsw %%xmm0,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "sub $0x8,%3 \n"
+ "movq %%xmm0,(%0,%2,1) \n"
+ "lea 0x8(%0),%0 \n"
+ "jg 1b \n"
+ : "+r"(src_y0), // %0
+ "+r"(src_y1), // %1
+ "+r"(dst_sobely), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+#endif
+ );
+}
+#endif // HAS_SOBELYROW_SSSE3
+
+#ifdef HAS_SOBELROW_SSE2
+// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
+// A = 255
+// R = Sobel
+// G = Sobel
+// B = Sobel
+void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
+ uint8* dst_argb, int width) {
+ asm volatile (
+ "sub %0,%1 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pslld $0x18,%%xmm5 \n"
+
+ // 8 pixel loop.
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa (%0,%1,1),%%xmm1 \n"
+ "lea 0x10(%0),%0 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "punpcklbw %%xmm0,%%xmm2 \n"
+ "punpckhbw %%xmm0,%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm1 \n"
+ "punpcklwd %%xmm2,%%xmm1 \n"
+ "punpckhwd %%xmm2,%%xmm2 \n"
+ "por %%xmm5,%%xmm1 \n"
+ "por %%xmm5,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm3 \n"
+ "punpcklwd %%xmm0,%%xmm3 \n"
+ "punpckhwd %%xmm0,%%xmm0 \n"
+ "por %%xmm5,%%xmm3 \n"
+ "por %%xmm5,%%xmm0 \n"
+ "sub $0x10,%3 \n"
+ "movdqa %%xmm1,(%2) \n"
+ "movdqa %%xmm2,0x10(%2) \n"
+ "movdqa %%xmm3,0x20(%2) \n"
+ "movdqa %%xmm0,0x30(%2) \n"
+ "lea 0x40(%2),%2 \n"
+ "jg 1b \n"
+ : "+r"(src_sobelx), // %0
+ "+r"(src_sobely), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+#endif
+ );
+}
+#endif // HAS_SOBELROW_SSE2
+
+#ifdef HAS_SOBELXYROW_SSE2
+// Mixes Sobel X, Sobel Y and Sobel into ARGB.
+// A = 255
+// R = Sobel X
+// G = Sobel
+// B = Sobel Y
+void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
+ uint8* dst_argb, int width) {
+ asm volatile (
+ "sub %0,%1 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+
+ // 8 pixel loop.
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa (%0,%1,1),%%xmm1 \n"
+ "lea 0x10(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "paddusb %%xmm1,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm3 \n"
+ "punpcklbw %%xmm5,%%xmm3 \n"
+ "punpckhbw %%xmm5,%%xmm0 \n"
+ "movdqa %%xmm1,%%xmm4 \n"
+ "punpcklbw %%xmm2,%%xmm4 \n"
+ "punpckhbw %%xmm2,%%xmm1 \n"
+ "movdqa %%xmm4,%%xmm6 \n"
+ "punpcklwd %%xmm3,%%xmm6 \n"
+ "punpckhwd %%xmm3,%%xmm4 \n"
+ "movdqa %%xmm1,%%xmm7 \n"
+ "punpcklwd %%xmm0,%%xmm7 \n"
+ "punpckhwd %%xmm0,%%xmm1 \n"
+ "sub $0x10,%3 \n"
+ "movdqa %%xmm6,(%2) \n"
+ "movdqa %%xmm4,0x10(%2) \n"
+ "movdqa %%xmm7,0x20(%2) \n"
+ "movdqa %%xmm1,0x30(%2) \n"
+ "lea 0x40(%2),%2 \n"
+ "jg 1b \n"
+ : "+r"(src_sobelx), // %0
+ "+r"(src_sobely), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+#endif
+ );
+}
+#endif // HAS_SOBELXYROW_SSE2
+
+#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
+// Creates a table of cumulative sums where each value is a sum of all values
+// above and to the left of the value, inclusive of the value.
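+// Scalar sketch of the recurrence, for reference only (illustrative helper
+// name; each pixel contributes four int32 channel sums):
+static void ComputeCumulativeSumRow_Reference_C(const uint8* row,
+                                                int32* cumsum,
+                                                const int32* previous_cumsum,
+                                                int width) {
+  int32 sum[4] = {0, 0, 0, 0};
+  for (int i = 0; i < width; ++i) {
+    for (int c = 0; c < 4; ++c) {
+      sum[c] += row[i * 4 + c];  // Running sum along this row.
+      cumsum[i * 4 + c] = sum[c] + previous_cumsum[i * 4 + c];
+    }
+  }
+}
+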
+void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
+ const int32* previous_cumsum, int width) {
+ asm volatile (
+ "sub %1,%2 \n"
+ "pxor %%xmm0,%%xmm0 \n"
+ "pxor %%xmm1,%%xmm1 \n"
+ "sub $0x4,%3 \n"
+ "jl 49f \n"
+ "test $0xf,%1 \n"
+ "jne 49f \n"
+
+  // 4 pixel loop.
+ ".p2align 2 \n"
+ "40: \n"
+ "movdqu (%0),%%xmm2 \n"
+ "lea 0x10(%0),%0 \n"
+ "movdqa %%xmm2,%%xmm4 \n"
+ "punpcklbw %%xmm1,%%xmm2 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "punpcklwd %%xmm1,%%xmm2 \n"
+ "punpckhwd %%xmm1,%%xmm3 \n"
+ "punpckhbw %%xmm1,%%xmm4 \n"
+ "movdqa %%xmm4,%%xmm5 \n"
+ "punpcklwd %%xmm1,%%xmm4 \n"
+ "punpckhwd %%xmm1,%%xmm5 \n"
+ "paddd %%xmm2,%%xmm0 \n"
+ "movdqa (%1,%2,1),%%xmm2 \n"
+ "paddd %%xmm0,%%xmm2 \n"
+ "paddd %%xmm3,%%xmm0 \n"
+ "movdqa 0x10(%1,%2,1),%%xmm3 \n"
+ "paddd %%xmm0,%%xmm3 \n"
+ "paddd %%xmm4,%%xmm0 \n"
+ "movdqa 0x20(%1,%2,1),%%xmm4 \n"
+ "paddd %%xmm0,%%xmm4 \n"
+ "paddd %%xmm5,%%xmm0 \n"
+ "movdqa 0x30(%1,%2,1),%%xmm5 \n"
+ "paddd %%xmm0,%%xmm5 \n"
+ "movdqa %%xmm2,(%1) \n"
+ "movdqa %%xmm3,0x10(%1) \n"
+ "movdqa %%xmm4,0x20(%1) \n"
+ "movdqa %%xmm5,0x30(%1) \n"
+ "lea 0x40(%1),%1 \n"
+ "sub $0x4,%3 \n"
+ "jge 40b \n"
+
+ "49: \n"
+ "add $0x3,%3 \n"
+ "jl 19f \n"
+
+  // 1 pixel loop.
+ ".p2align 2 \n"
+ "10: \n"
+ "movd (%0),%%xmm2 \n"
+ "lea 0x4(%0),%0 \n"
+ "punpcklbw %%xmm1,%%xmm2 \n"
+ "punpcklwd %%xmm1,%%xmm2 \n"
+ "paddd %%xmm2,%%xmm0 \n"
+ "movdqu (%1,%2,1),%%xmm2 \n"
+ "paddd %%xmm0,%%xmm2 \n"
+ "movdqu %%xmm2,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x1,%3 \n"
+ "jge 10b \n"
+
+ "19: \n"
+ : "+r"(row), // %0
+ "+r"(cumsum), // %1
+ "+r"(previous_cumsum), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+#endif // HAS_COMPUTECUMULATIVESUMROW_SSE2
+
+#ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
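+// Illustrative reading of the assembly below: each output byte is a box
+// average taken from the cumulative-sum table, where 'width' is treated as
+// the int32 offset to the far edge of the box (this interpretation of the
+// parameters is an assumption drawn from the indexed loads, not from the
+// callers):
+//   v      = topleft[j] - topleft[j + width] - botleft[j] + botleft[j + width]
+//   dst[j] = clamp255(v / area)        for j = 0 .. count * 4 - 1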
+void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
+ int width, int area, uint8* dst,
+ int count) {
+ asm volatile (
+ "movd %5,%%xmm4 \n"
+ "cvtdq2ps %%xmm4,%%xmm4 \n"
+ "rcpss %%xmm4,%%xmm4 \n"
+ "pshufd $0x0,%%xmm4,%%xmm4 \n"
+ "sub $0x4,%3 \n"
+ "jl 49f \n"
+
+  // 4 pixel loop.
+ ".p2align 2 \n"
+ "40: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "movdqa 0x20(%0),%%xmm2 \n"
+ "movdqa 0x30(%0),%%xmm3 \n"
+ "psubd (%0,%4,4),%%xmm0 \n"
+ "psubd 0x10(%0,%4,4),%%xmm1 \n"
+ "psubd 0x20(%0,%4,4),%%xmm2 \n"
+ "psubd 0x30(%0,%4,4),%%xmm3 \n"
+ "lea 0x40(%0),%0 \n"
+ "psubd (%1),%%xmm0 \n"
+ "psubd 0x10(%1),%%xmm1 \n"
+ "psubd 0x20(%1),%%xmm2 \n"
+ "psubd 0x30(%1),%%xmm3 \n"
+ "paddd (%1,%4,4),%%xmm0 \n"
+ "paddd 0x10(%1,%4,4),%%xmm1 \n"
+ "paddd 0x20(%1,%4,4),%%xmm2 \n"
+ "paddd 0x30(%1,%4,4),%%xmm3 \n"
+ "lea 0x40(%1),%1 \n"
+ "cvtdq2ps %%xmm0,%%xmm0 \n"
+ "cvtdq2ps %%xmm1,%%xmm1 \n"
+ "mulps %%xmm4,%%xmm0 \n"
+ "mulps %%xmm4,%%xmm1 \n"
+ "cvtdq2ps %%xmm2,%%xmm2 \n"
+ "cvtdq2ps %%xmm3,%%xmm3 \n"
+ "mulps %%xmm4,%%xmm2 \n"
+ "mulps %%xmm4,%%xmm3 \n"
+ "cvtps2dq %%xmm0,%%xmm0 \n"
+ "cvtps2dq %%xmm1,%%xmm1 \n"
+ "cvtps2dq %%xmm2,%%xmm2 \n"
+ "cvtps2dq %%xmm3,%%xmm3 \n"
+ "packssdw %%xmm1,%%xmm0 \n"
+ "packssdw %%xmm3,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "sub $0x4,%3 \n"
+ "jge 40b \n"
+
+ "49: \n"
+ "add $0x3,%3 \n"
+ "jl 19f \n"
+
+  // 1 pixel loop.
+ ".p2align 2 \n"
+ "10: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "psubd (%0,%4,4),%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "psubd (%1),%%xmm0 \n"
+ "paddd (%1,%4,4),%%xmm0 \n"
+ "lea 0x10(%1),%1 \n"
+ "cvtdq2ps %%xmm0,%%xmm0 \n"
+ "mulps %%xmm4,%%xmm0 \n"
+ "cvtps2dq %%xmm0,%%xmm0 \n"
+ "packssdw %%xmm0,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movd %%xmm0,(%2) \n"
+ "lea 0x4(%2),%2 \n"
+ "sub $0x1,%3 \n"
+ "jge 10b \n"
+ "19: \n"
+ : "+r"(topleft), // %0
+ "+r"(botleft), // %1
+ "+r"(dst), // %2
+ "+rm"(count) // %3
+ : "r"(static_cast<intptr_t>(width)), // %4
+ "rm"(area) // %5
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
+#endif
+ );
+}
+#endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
+
+#ifdef HAS_ARGBAFFINEROW_SSE2
+// TODO(fbarchard): Find 64 bit way to avoid masking.
+// Copy ARGB pixels from source image with slope to a row of destination.
+// Caveat: in 64 bit, movd is used with a 64 bit gpr because Mac gcc produces
+// an error if movq is used (movd %%xmm0,%1).
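+// Illustrative scalar reading of the loop below: src_dudv holds {u, v, du,
+// dv} as floats and each destination pixel copies one source pixel from the
+// truncated coordinates.
+//   for each x: dst[x] = *(const uint32*)(src_argb +
+//                            (int)v * src_argb_stride + (int)u * 4);
+//               u += du;  v += dv;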
+
+LIBYUV_API
+void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
+ uint8* dst_argb, const float* src_dudv, int width) {
+ intptr_t src_argb_stride_temp = src_argb_stride;
+ intptr_t temp = 0;
+ asm volatile (
+ "movq (%3),%%xmm2 \n"
+ "movq 0x8(%3),%%xmm7 \n"
+ "shl $0x10,%1 \n"
+ "add $0x4,%1 \n"
+ "movd %1,%%xmm5 \n"
+ "sub $0x4,%4 \n"
+ "jl 49f \n"
+
+ "pshufd $0x44,%%xmm7,%%xmm7 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ "movdqa %%xmm2,%%xmm0 \n"
+ "addps %%xmm7,%%xmm0 \n"
+ "movlhps %%xmm0,%%xmm2 \n"
+ "movdqa %%xmm7,%%xmm4 \n"
+ "addps %%xmm4,%%xmm4 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "addps %%xmm4,%%xmm3 \n"
+ "addps %%xmm4,%%xmm4 \n"
+
+  // 4 pixel loop.
+ ".p2align 4 \n"
+ "40: \n"
+ "cvttps2dq %%xmm2,%%xmm0 \n"
+ "cvttps2dq %%xmm3,%%xmm1 \n"
+ "packssdw %%xmm1,%%xmm0 \n"
+ "pmaddwd %%xmm5,%%xmm0 \n"
+#if defined(__x86_64__)
+ "movd %%xmm0,%1 \n"
+ "mov %1,%5 \n"
+ "and $0x0fffffff,%1 \n"
+ "shr $32,%5 \n"
+ "pshufd $0xEE,%%xmm0,%%xmm0 \n"
+#else
+ "movd %%xmm0,%1 \n"
+ "pshufd $0x39,%%xmm0,%%xmm0 \n"
+ "movd %%xmm0,%5 \n"
+ "pshufd $0x39,%%xmm0,%%xmm0 \n"
+#endif
+ "movd (%0,%1,1),%%xmm1 \n"
+ "movd (%0,%5,1),%%xmm6 \n"
+ "punpckldq %%xmm6,%%xmm1 \n"
+ "addps %%xmm4,%%xmm2 \n"
+ "movq %%xmm1,(%2) \n"
+#if defined(__x86_64__)
+ "movd %%xmm0,%1 \n"
+ "mov %1,%5 \n"
+ "and $0x0fffffff,%1 \n"
+ "shr $32,%5 \n"
+#else
+ "movd %%xmm0,%1 \n"
+ "pshufd $0x39,%%xmm0,%%xmm0 \n"
+ "movd %%xmm0,%5 \n"
+#endif
+ "movd (%0,%1,1),%%xmm0 \n"
+ "movd (%0,%5,1),%%xmm6 \n"
+ "punpckldq %%xmm6,%%xmm0 \n"
+ "addps %%xmm4,%%xmm3 \n"
+ "sub $0x4,%4 \n"
+ "movq %%xmm0,0x08(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "jge 40b \n"
+
+ "49: \n"
+ "add $0x3,%4 \n"
+ "jl 19f \n"
+
+  // 1 pixel loop.
+ ".p2align 4 \n"
+ "10: \n"
+ "cvttps2dq %%xmm2,%%xmm0 \n"
+ "packssdw %%xmm0,%%xmm0 \n"
+ "pmaddwd %%xmm5,%%xmm0 \n"
+ "addps %%xmm7,%%xmm2 \n"
+ "movd %%xmm0,%1 \n"
+#if defined(__x86_64__)
+ "and $0x0fffffff,%1 \n"
+#endif
+ "movd (%0,%1,1),%%xmm0 \n"
+ "sub $0x1,%4 \n"
+ "movd %%xmm0,(%2) \n"
+ "lea 0x4(%2),%2 \n"
+ "jge 10b \n"
+ "19: \n"
+ : "+r"(src_argb), // %0
+ "+r"(src_argb_stride_temp), // %1
+ "+r"(dst_argb), // %2
+ "+r"(src_dudv), // %3
+ "+rm"(width), // %4
+ "+r"(temp) // %5
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+#endif
+ );
+}
+#endif // HAS_ARGBAFFINEROW_SSE2
+
+// Bilinear filter 16x2 -> 16x1
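+// Illustrative scalar reading of the blend below: the fraction is halved to
+// a 0..128 weight, with 0%, 25%, 50% and 75% handled as pavgb special cases.
+//   f      = source_y_fraction >> 1
+//   dst[i] = (src_ptr[i] * (128 - f) + src_ptr[i + src_stride] * f) >> 7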
+void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) {
+ asm volatile (
+ "sub %1,%0 \n"
+ "shr %3 \n"
+ "cmp $0x0,%3 \n"
+ "je 100f \n"
+ "cmp $0x20,%3 \n"
+ "je 75f \n"
+ "cmp $0x40,%3 \n"
+ "je 50f \n"
+ "cmp $0x60,%3 \n"
+ "je 25f \n"
+
+ "movd %3,%%xmm0 \n"
+ "neg %3 \n"
+ "add $0x80,%3 \n"
+ "movd %3,%%xmm5 \n"
+ "punpcklbw %%xmm0,%%xmm5 \n"
+ "punpcklwd %%xmm5,%%xmm5 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+
+ // General purpose row blend.
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%1),%%xmm0 \n"
+ "movdqa (%1,%4,1),%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm2,%%xmm0 \n"
+ "punpckhbw %%xmm2,%%xmm1 \n"
+ "pmaddubsw %%xmm5,%%xmm0 \n"
+ "pmaddubsw %%xmm5,%%xmm1 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "psrlw $0x7,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "sub $0x10,%2 \n"
+ "movdqa %%xmm0,(%1,%0,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "jg 1b \n"
+ "jmp 99f \n"
+
+ // Blend 25 / 75.
+ ".p2align 4 \n"
+ "25: \n"
+ "movdqa (%1),%%xmm0 \n"
+ "movdqa (%1,%4,1),%%xmm1 \n"
+ "pavgb %%xmm1,%%xmm0 \n"
+ "pavgb %%xmm1,%%xmm0 \n"
+ "sub $0x10,%2 \n"
+ "movdqa %%xmm0,(%1,%0,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "jg 25b \n"
+ "jmp 99f \n"
+
+ // Blend 50 / 50.
+ ".p2align 4 \n"
+ "50: \n"
+ "movdqa (%1),%%xmm0 \n"
+ "movdqa (%1,%4,1),%%xmm1 \n"
+ "pavgb %%xmm1,%%xmm0 \n"
+ "sub $0x10,%2 \n"
+ "movdqa %%xmm0,(%1,%0,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "jg 50b \n"
+ "jmp 99f \n"
+
+ // Blend 75 / 25.
+ ".p2align 4 \n"
+ "75: \n"
+ "movdqa (%1),%%xmm1 \n"
+ "movdqa (%1,%4,1),%%xmm0 \n"
+ "pavgb %%xmm1,%%xmm0 \n"
+ "pavgb %%xmm1,%%xmm0 \n"
+ "sub $0x10,%2 \n"
+ "movdqa %%xmm0,(%1,%0,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "jg 75b \n"
+ "jmp 99f \n"
+
+ // Blend 100 / 0 - Copy row unchanged.
+ ".p2align 4 \n"
+ "100: \n"
+ "movdqa (%1),%%xmm0 \n"
+ "sub $0x10,%2 \n"
+ "movdqa %%xmm0,(%1,%0,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "jg 100b \n"
+
+ "99: \n"
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(source_y_fraction) // %3
+ : "r"(static_cast<intptr_t>(src_stride)) // %4
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm5"
+#endif
+ );
+}
+
+#ifdef HAS_INTERPOLATEROW_SSE2
+// Bilinear filter 16x2 -> 16x1
+void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) {
+ asm volatile (
+ "sub %1,%0 \n"
+ "shr %3 \n"
+ "cmp $0x0,%3 \n"
+ "je 100f \n"
+ "cmp $0x20,%3 \n"
+ "je 75f \n"
+ "cmp $0x40,%3 \n"
+ "je 50f \n"
+ "cmp $0x60,%3 \n"
+ "je 25f \n"
+
+ "movd %3,%%xmm0 \n"
+ "neg %3 \n"
+ "add $0x80,%3 \n"
+ "movd %3,%%xmm5 \n"
+ "punpcklbw %%xmm0,%%xmm5 \n"
+ "punpcklwd %%xmm5,%%xmm5 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ "pxor %%xmm4,%%xmm4 \n"
+
+ // General purpose row blend.
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%1),%%xmm0 \n"
+ "movdqa (%1,%4,1),%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "punpcklbw %%xmm4,%%xmm2 \n"
+ "punpckhbw %%xmm4,%%xmm3 \n"
+ "punpcklbw %%xmm4,%%xmm0 \n"
+ "punpckhbw %%xmm4,%%xmm1 \n"
+ "psubw %%xmm0,%%xmm2 \n"
+ "psubw %%xmm1,%%xmm3 \n"
+ "paddw %%xmm2,%%xmm2 \n"
+ "paddw %%xmm3,%%xmm3 \n"
+ "pmulhw %%xmm5,%%xmm2 \n"
+ "pmulhw %%xmm5,%%xmm3 \n"
+ "paddw %%xmm2,%%xmm0 \n"
+ "paddw %%xmm3,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "sub $0x10,%2 \n"
+ "movdqa %%xmm0,(%1,%0,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "jg 1b \n"
+ "jmp 99f \n"
+
+ // Blend 25 / 75.
+ ".p2align 4 \n"
+ "25: \n"
+ "movdqa (%1),%%xmm0 \n"
+ "movdqa (%1,%4,1),%%xmm1 \n"
+ "pavgb %%xmm1,%%xmm0 \n"
+ "pavgb %%xmm1,%%xmm0 \n"
+ "sub $0x10,%2 \n"
+ "movdqa %%xmm0,(%1,%0,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "jg 25b \n"
+ "jmp 99f \n"
+
+ // Blend 50 / 50.
+ ".p2align 4 \n"
+ "50: \n"
+ "movdqa (%1),%%xmm0 \n"
+ "movdqa (%1,%4,1),%%xmm1 \n"
+ "pavgb %%xmm1,%%xmm0 \n"
+ "sub $0x10,%2 \n"
+ "movdqa %%xmm0,(%1,%0,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "jg 50b \n"
+ "jmp 99f \n"
+
+ // Blend 75 / 25.
+ ".p2align 4 \n"
+ "75: \n"
+ "movdqa (%1),%%xmm1 \n"
+ "movdqa (%1,%4,1),%%xmm0 \n"
+ "pavgb %%xmm1,%%xmm0 \n"
+ "pavgb %%xmm1,%%xmm0 \n"
+ "sub $0x10,%2 \n"
+ "movdqa %%xmm0,(%1,%0,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "jg 75b \n"
+ "jmp 99f \n"
+
+ // Blend 100 / 0 - Copy row unchanged.
+ ".p2align 4 \n"
+ "100: \n"
+ "movdqa (%1),%%xmm0 \n"
+ "sub $0x10,%2 \n"
+ "movdqa %%xmm0,(%1,%0,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "jg 100b \n"
+
+ "99: \n"
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(source_y_fraction) // %3
+ : "r"(static_cast<intptr_t>(src_stride)) // %4
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+#endif // HAS_INTERPOLATEROW_SSE2
+
+// Bilinear filter 16x2 -> 16x1
+void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) {
+ asm volatile (
+ "sub %1,%0 \n"
+ "shr %3 \n"
+ "cmp $0x0,%3 \n"
+ "je 100f \n"
+ "cmp $0x20,%3 \n"
+ "je 75f \n"
+ "cmp $0x40,%3 \n"
+ "je 50f \n"
+ "cmp $0x60,%3 \n"
+ "je 25f \n"
+
+ "movd %3,%%xmm0 \n"
+ "neg %3 \n"
+ "add $0x80,%3 \n"
+ "movd %3,%%xmm5 \n"
+ "punpcklbw %%xmm0,%%xmm5 \n"
+ "punpcklwd %%xmm5,%%xmm5 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+
+ // General purpose row blend.
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqu (%1),%%xmm0 \n"
+ "movdqu (%1,%4,1),%%xmm2 \n"
+ "movdqu %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm2,%%xmm0 \n"
+ "punpckhbw %%xmm2,%%xmm1 \n"
+ "pmaddubsw %%xmm5,%%xmm0 \n"
+ "pmaddubsw %%xmm5,%%xmm1 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "psrlw $0x7,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "sub $0x10,%2 \n"
+ "movdqu %%xmm0,(%1,%0,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "jg 1b \n"
+ "jmp 99f \n"
+
+ // Blend 25 / 75.
+ ".p2align 4 \n"
+ "25: \n"
+ "movdqu (%1),%%xmm0 \n"
+ "movdqu (%1,%4,1),%%xmm1 \n"
+ "pavgb %%xmm1,%%xmm0 \n"
+ "pavgb %%xmm1,%%xmm0 \n"
+ "sub $0x10,%2 \n"
+ "movdqu %%xmm0,(%1,%0,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "jg 25b \n"
+ "jmp 99f \n"
+
+ // Blend 50 / 50.
+ ".p2align 4 \n"
+ "50: \n"
+ "movdqu (%1),%%xmm0 \n"
+ "movdqu (%1,%4,1),%%xmm1 \n"
+ "pavgb %%xmm1,%%xmm0 \n"
+ "sub $0x10,%2 \n"
+ "movdqu %%xmm0,(%1,%0,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "jg 50b \n"
+ "jmp 99f \n"
+
+ // Blend 75 / 25.
+ ".p2align 4 \n"
+ "75: \n"
+ "movdqu (%1),%%xmm1 \n"
+ "movdqu (%1,%4,1),%%xmm0 \n"
+ "pavgb %%xmm1,%%xmm0 \n"
+ "pavgb %%xmm1,%%xmm0 \n"
+ "sub $0x10,%2 \n"
+ "movdqu %%xmm0,(%1,%0,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "jg 75b \n"
+ "jmp 99f \n"
+
+ // Blend 100 / 0 - Copy row unchanged.
+ ".p2align 4 \n"
+ "100: \n"
+ "movdqu (%1),%%xmm0 \n"
+ "sub $0x10,%2 \n"
+ "movdqu %%xmm0,(%1,%0,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "jg 100b \n"
+
+ "99: \n"
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(source_y_fraction) // %3
+ : "r"(static_cast<intptr_t>(src_stride)) // %4
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm5"
+#endif
+ );
+}
+
+#ifdef HAS_INTERPOLATEROW_SSE2
+// Bilinear filter 16x2 -> 16x1
+void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) {
+ asm volatile (
+ "sub %1,%0 \n"
+ "shr %3 \n"
+ "cmp $0x0,%3 \n"
+ "je 100f \n"
+ "cmp $0x20,%3 \n"
+ "je 75f \n"
+ "cmp $0x40,%3 \n"
+ "je 50f \n"
+ "cmp $0x60,%3 \n"
+ "je 25f \n"
+
+ "movd %3,%%xmm0 \n"
+ "neg %3 \n"
+ "add $0x80,%3 \n"
+ "movd %3,%%xmm5 \n"
+ "punpcklbw %%xmm0,%%xmm5 \n"
+ "punpcklwd %%xmm5,%%xmm5 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ "pxor %%xmm4,%%xmm4 \n"
+
+ // General purpose row blend.
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqu (%1),%%xmm0 \n"
+ "movdqu (%1,%4,1),%%xmm2 \n"
+ "movdqu %%xmm0,%%xmm1 \n"
+ "movdqu %%xmm2,%%xmm3 \n"
+ "punpcklbw %%xmm4,%%xmm2 \n"
+ "punpckhbw %%xmm4,%%xmm3 \n"
+ "punpcklbw %%xmm4,%%xmm0 \n"
+ "punpckhbw %%xmm4,%%xmm1 \n"
+ "psubw %%xmm0,%%xmm2 \n"
+ "psubw %%xmm1,%%xmm3 \n"
+ "paddw %%xmm2,%%xmm2 \n"
+ "paddw %%xmm3,%%xmm3 \n"
+ "pmulhw %%xmm5,%%xmm2 \n"
+ "pmulhw %%xmm5,%%xmm3 \n"
+ "paddw %%xmm2,%%xmm0 \n"
+ "paddw %%xmm3,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "sub $0x10,%2 \n"
+ "movdqu %%xmm0,(%1,%0,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "jg 1b \n"
+ "jmp 99f \n"
+
+ // Blend 25 / 75.
+ ".p2align 4 \n"
+ "25: \n"
+ "movdqu (%1),%%xmm0 \n"
+ "movdqu (%1,%4,1),%%xmm1 \n"
+ "pavgb %%xmm1,%%xmm0 \n"
+ "pavgb %%xmm1,%%xmm0 \n"
+ "sub $0x10,%2 \n"
+ "movdqu %%xmm0,(%1,%0,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "jg 25b \n"
+ "jmp 99f \n"
+
+ // Blend 50 / 50.
+ ".p2align 4 \n"
+ "50: \n"
+ "movdqu (%1),%%xmm0 \n"
+ "movdqu (%1,%4,1),%%xmm1 \n"
+ "pavgb %%xmm1,%%xmm0 \n"
+ "sub $0x10,%2 \n"
+ "movdqu %%xmm0,(%1,%0,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "jg 50b \n"
+ "jmp 99f \n"
+
+ // Blend 75 / 25.
+ ".p2align 4 \n"
+ "75: \n"
+ "movdqu (%1),%%xmm1 \n"
+ "movdqu (%1,%4,1),%%xmm0 \n"
+ "pavgb %%xmm1,%%xmm0 \n"
+ "pavgb %%xmm1,%%xmm0 \n"
+ "sub $0x10,%2 \n"
+ "movdqu %%xmm0,(%1,%0,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "jg 75b \n"
+ "jmp 99f \n"
+
+ // Blend 100 / 0 - Copy row unchanged.
+ ".p2align 4 \n"
+ "100: \n"
+ "movdqu (%1),%%xmm0 \n"
+ "sub $0x10,%2 \n"
+ "movdqu %%xmm0,(%1,%0,1) \n"
+ "lea 0x10(%1),%1 \n"
+ "jg 100b \n"
+
+ "99: \n"
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(source_y_fraction) // %3
+ : "r"(static_cast<intptr_t>(src_stride)) // %4
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+#endif // HAS_INTERPOLATEROW_SSE2
+
+void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
+ uint8* dst_uv, int pix) {
+ asm volatile (
+ "sub %0,%1 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "pavgb (%0,%3),%%xmm0 \n"
+ "sub $0x10,%2 \n"
+ "movdqa %%xmm0,(%0,%1) \n"
+ "lea 0x10(%0),%0 \n"
+ "jg 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_uv), // %1
+ "+r"(pix) // %2
+ : "r"(static_cast<intptr_t>(src_uv_stride)) // %3
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0"
+#endif
+ );
+}
+
+void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
+ uint32 selector, int pix) {
+ asm volatile (
+ "movd %3,%%xmm5 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "pshufb %%xmm5,%%xmm0 \n"
+ "pshufb %%xmm5,%%xmm1 \n"
+ "punpckldq %%xmm1,%%xmm0 \n"
+ "sub $0x8,%2 \n"
+ "movq %%xmm0,(%1) \n"
+ "lea 0x8(%1),%1 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_bayer), // %1
+ "+r"(pix) // %2
+ : "g"(selector) // %3
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm5"
+#endif
+ );
+}
+
+// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
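+// Illustrative reading of the shuffle below: 'shuffler' is a 16-byte pshufb
+// mask applied to each 16-byte group of four pixels; assuming the masks used
+// by these conversions only permute bytes (no zeroing entries), this is
+//   dst[g + k] = src[g + (shuffler[k] & 15)]   for k = 0..15 in each group g.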
+void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
+ const uint8* shuffler, int pix) {
+ asm volatile (
+ "movdqa (%3),%%xmm5 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "pshufb %%xmm5,%%xmm0 \n"
+ "pshufb %%xmm5,%%xmm1 \n"
+ "sub $0x8,%2 \n"
+ "movdqa %%xmm0,(%1) \n"
+ "movdqa %%xmm1,0x10(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(pix) // %2
+ : "r"(shuffler) // %3
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm5"
+#endif
+ );
+}
+
+void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb,
+ const uint8* shuffler, int pix) {
+ asm volatile (
+ "movdqa (%3),%%xmm5 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "pshufb %%xmm5,%%xmm0 \n"
+ "pshufb %%xmm5,%%xmm1 \n"
+ "sub $0x8,%2 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "movdqu %%xmm1,0x10(%1) \n"
+ "lea 0x20(%1),%1 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(pix) // %2
+ : "r"(shuffler) // %3
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm5"
+#endif
+ );
+}
+
+void I422ToYUY2Row_SSE2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_frame, int width) {
+ asm volatile (
+ "sub %1,%2 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movq (%1),%%xmm2 \n"
+ "movq (%1,%2,1),%%xmm3 \n"
+ "lea 0x8(%1),%1 \n"
+ "punpcklbw %%xmm3,%%xmm2 \n"
+ "movdqu (%0),%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm2,%%xmm0 \n"
+ "punpckhbw %%xmm2,%%xmm1 \n"
+ "movdqu %%xmm0,(%3) \n"
+ "movdqu %%xmm1,0x10(%3) \n"
+ "lea 0x20(%3),%3 \n"
+ "sub $0x10,%4 \n"
+ "jg 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_frame), // %3
+ "+rm"(width) // %4
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3"
+#endif
+ );
+}
+
+void I422ToUYVYRow_SSE2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_frame, int width) {
+ asm volatile (
+ "sub %1,%2 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movq (%1),%%xmm2 \n"
+ "movq (%1,%2,1),%%xmm3 \n"
+ "lea 0x8(%1),%1 \n"
+ "punpcklbw %%xmm3,%%xmm2 \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqa %%xmm2,%%xmm1 \n"
+ "lea 0x10(%0),%0 \n"
+ "punpcklbw %%xmm0,%%xmm1 \n"
+ "punpckhbw %%xmm0,%%xmm2 \n"
+ "movdqu %%xmm1,(%3) \n"
+ "movdqu %%xmm2,0x10(%3) \n"
+ "lea 0x20(%3),%3 \n"
+ "sub $0x10,%4 \n"
+ "jg 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_frame), // %3
+ "+rm"(width) // %4
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3"
+#endif
+ );
+}
+
+#endif // defined(__x86_64__) || defined(__i386__)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/chromium/third_party/libyuv/source/row_win.cc b/chromium/third_party/libyuv/source/row_win.cc
new file mode 100644
index 00000000000..4ea06923def
--- /dev/null
+++ b/chromium/third_party/libyuv/source/row_win.cc
@@ -0,0 +1,6610 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for Visual C x86.
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
+
+#ifdef HAS_ARGBTOYROW_SSSE3
+
+// Constants for ARGB.
+static const vec8 kARGBToY = {
+ 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
+};
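+// Roughly half the usual 8-bit BT.601 weights, so the row code can shift by
+// 7; applied to B,G,R,A byte order this is approximately
+// Y = ((13*B + 65*G + 33*R) >> 7) + 16, with the +16 coming from kAddY16.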
+
+// JPEG full range.
+static const vec8 kARGBToYJ = {
+ 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0
+};
+
+static const lvec8 kARGBToY_AVX = {
+ 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0,
+ 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
+};
+
+static const lvec8 kARGBToYJ_AVX = {
+ 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0,
+ 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0
+};
+
+static const vec8 kARGBToU = {
+ 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
+};
+
+static const vec8 kARGBToUJ = {
+ 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0
+};
+
+// TODO(fbarchard): Rename kARGBToU_AVX to kARGBToU and use for SSSE3 version.
+static const lvec8 kARGBToU_AVX = {
+ 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0,
+ 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
+};
+
+static const vec8 kARGBToV = {
+ -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
+};
+
+static const vec8 kARGBToVJ = {
+ -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0
+};
+
+static const lvec8 kARGBToV_AVX = {
+ -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
+ -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0
+};
+
+// vpermd for vphaddw + vpackuswb.
+static const lvec32 kShufARGBToY_AVX = {
+ 0, 4, 1, 5, 2, 6, 3, 7
+};
+
+// vpshufb for vphaddw + vpackuswb packed to shorts.
+static const lvec8 kShufARGBToUV_AVX = {
+ 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
+ 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
+};
+
+// Constants for BGRA.
+static const vec8 kBGRAToY = {
+ 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
+};
+
+static const vec8 kBGRAToU = {
+ 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
+};
+
+static const vec8 kBGRAToV = {
+ 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
+};
+
+// Constants for ABGR.
+static const vec8 kABGRToY = {
+ 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
+};
+
+static const vec8 kABGRToU = {
+ -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
+};
+
+static const vec8 kABGRToV = {
+ 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
+};
+
+// Constants for RGBA.
+static const vec8 kRGBAToY = {
+ 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33
+};
+
+static const vec8 kRGBAToU = {
+ 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38
+};
+
+static const vec8 kRGBAToV = {
+ 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112
+};
+
+static const uvec8 kAddY16 = {
+ 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
+};
+
+static const vec16 kAddYJ64 = {
+ 64, 64, 64, 64, 64, 64, 64, 64
+};
+static const lvec16 kAddYJ64_AVX = {
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
+};
+
+static const ulvec8 kAddY16_AVX = {
+ 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
+ 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
+ 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
+ 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
+};
+
+static const uvec8 kAddUV128 = {
+ 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
+ 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
+};
+
+static const uvec16 kAddUVJ128 = {
+ 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u
+};
+
+static const ulvec8 kAddUV128_AVX = {
+ 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
+ 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
+ 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
+ 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
+};
+
+// Shuffle table for converting RGB24 to ARGB.
+static const uvec8 kShuffleMaskRGB24ToARGB = {
+ 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
+};
+
+// Shuffle table for converting RAW to ARGB.
+static const uvec8 kShuffleMaskRAWToARGB = {
+ 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
+};
+
+// Shuffle table for converting ARGB to RGB24.
+static const uvec8 kShuffleMaskARGBToRGB24 = {
+ 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
+};
+
+// Shuffle table for converting ARGB to RAW.
+static const uvec8 kShuffleMaskARGBToRAW = {
+ 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
+};
+
+// Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4
+static const uvec8 kShuffleMaskARGBToRGB24_0 = {
+ 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u
+};
+
+// Shuffle table for converting ARGB to RAW for I422ToRAW. First 8 + next 4
+static const uvec8 kShuffleMaskARGBToRAW_0 = {
+ 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u
+};
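+
+// Reference only: how pshufb applies the shuffle tables above. Each output
+// byte i becomes src[mask[i] & 15], or zero when bit 7 of mask[i] is set
+// (the 128u entries). In the RGB24/RAW to ARGB masks the alpha positions pick
+// up don't-care source bytes that the loops below then force to 0xff with
+// por. ScalarPshufb is an illustrative helper, not a libyuv function.
+static void ScalarPshufb(const uint8* src, const uint8* mask, uint8* dst) {
+  for (int i = 0; i < 16; ++i) {
+    dst[i] = (mask[i] & 0x80) ? 0 : src[mask[i] & 15];
+  }
+}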
+
+// Duplicates gray value 3 times and fills in alpha opaque.
+__declspec(naked) __declspec(align(16))
+void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
+ __asm {
+ mov eax, [esp + 4] // src_y
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // pix
+ pcmpeqb xmm5, xmm5 // generate mask 0xff000000
+ pslld xmm5, 24
+
+ align 16
+ convertloop:
+ movq xmm0, qword ptr [eax]
+ lea eax, [eax + 8]
+ punpcklbw xmm0, xmm0
+ movdqa xmm1, xmm0
+ punpcklwd xmm0, xmm0
+ punpckhwd xmm1, xmm1
+ por xmm0, xmm5
+ por xmm1, xmm5
+ movdqa [edx], xmm0
+ movdqa [edx + 16], xmm1
+ lea edx, [edx + 32]
+ sub ecx, 8
+ jg convertloop
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb,
+ int pix) {
+ __asm {
+ mov eax, [esp + 4] // src_y
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // pix
+ pcmpeqb xmm5, xmm5 // generate mask 0xff000000
+ pslld xmm5, 24
+
+ align 16
+ convertloop:
+ movq xmm0, qword ptr [eax]
+ lea eax, [eax + 8]
+ punpcklbw xmm0, xmm0
+ movdqa xmm1, xmm0
+ punpcklwd xmm0, xmm0
+ punpckhwd xmm1, xmm1
+ por xmm0, xmm5
+ por xmm1, xmm5
+ movdqu [edx], xmm0
+ movdqu [edx + 16], xmm1
+ lea edx, [edx + 32]
+ sub ecx, 8
+ jg convertloop
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
+ __asm {
+ mov eax, [esp + 4] // src_rgb24
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // pix
+ pcmpeqb xmm5, xmm5 // generate mask 0xff000000
+ pslld xmm5, 24
+ movdqa xmm4, kShuffleMaskRGB24ToARGB
+
+ align 16
+ convertloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm3, [eax + 32]
+ lea eax, [eax + 48]
+ movdqa xmm2, xmm3
+ palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:7] xmm1[8:15]}
+ pshufb xmm2, xmm4
+ por xmm2, xmm5
+ palignr xmm1, xmm0, 12 // xmm1 = { xmm1[0:11] xmm0[12:15]}
+ pshufb xmm0, xmm4
+ movdqa [edx + 32], xmm2
+ por xmm0, xmm5
+ pshufb xmm1, xmm4
+ movdqa [edx], xmm0
+ por xmm1, xmm5
+ palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]}
+ pshufb xmm3, xmm4
+ movdqa [edx + 16], xmm1
+ por xmm3, xmm5
+ sub ecx, 16
+ movdqa [edx + 48], xmm3
+ lea edx, [edx + 64]
+ jg convertloop
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
+ int pix) {
+ __asm {
+ mov eax, [esp + 4] // src_raw
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // pix
+ pcmpeqb xmm5, xmm5 // generate mask 0xff000000
+ pslld xmm5, 24
+ movdqa xmm4, kShuffleMaskRAWToARGB
+
+ align 16
+ convertloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm3, [eax + 32]
+ lea eax, [eax + 48]
+ movdqa xmm2, xmm3
+ palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:7] xmm1[8:15]}
+ pshufb xmm2, xmm4
+ por xmm2, xmm5
+ palignr xmm1, xmm0, 12 // xmm1 = { xmm1[0:11] xmm0[12:15]}
+ pshufb xmm0, xmm4
+ movdqa [edx + 32], xmm2
+ por xmm0, xmm5
+ pshufb xmm1, xmm4
+ movdqa [edx], xmm0
+ por xmm1, xmm5
+ palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]}
+ pshufb xmm3, xmm4
+ movdqa [edx + 16], xmm1
+ por xmm3, xmm5
+ sub ecx, 16
+ movdqa [edx + 48], xmm3
+ lea edx, [edx + 64]
+ jg convertloop
+ ret
+ }
+}
+
+// pmul method to replicate bits.
+// Math to replicate bits:
+// (v << 8) | (v << 3)
+// v * 256 + v * 8
+// v * (256 + 8)
+// G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
+// 20 instructions.
+__declspec(naked) __declspec(align(16))
+void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb,
+ int pix) {
+ __asm {
+ mov eax, 0x01080108 // generate multiplier to repeat 5 bits
+ movd xmm5, eax
+ pshufd xmm5, xmm5, 0
+ mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits
+ movd xmm6, eax
+ pshufd xmm6, xmm6, 0
+ pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red
+ psllw xmm3, 11
+ pcmpeqb xmm4, xmm4 // generate mask 0x07e007e0 for Green
+ psllw xmm4, 10
+ psrlw xmm4, 5
+ pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha
+ psllw xmm7, 8
+
+ mov eax, [esp + 4] // src_rgb565
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // pix
+ sub edx, eax
+ sub edx, eax
+
+ align 16
+ convertloop:
+ movdqu xmm0, [eax] // fetch 8 pixels of bgr565
+ movdqa xmm1, xmm0
+ movdqa xmm2, xmm0
+ pand xmm1, xmm3 // R in upper 5 bits
+ psllw xmm2, 11 // B in upper 5 bits
+ pmulhuw xmm1, xmm5 // * (256 + 8)
+ pmulhuw xmm2, xmm5 // * (256 + 8)
+ psllw xmm1, 8
+ por xmm1, xmm2 // RB
+ pand xmm0, xmm4 // G in middle 6 bits
+ pmulhuw xmm0, xmm6 // << 5 * (256 + 4)
+ por xmm0, xmm7 // AG
+ movdqa xmm2, xmm1
+ punpcklbw xmm1, xmm0
+ punpckhbw xmm2, xmm0
+ movdqa [eax * 2 + edx], xmm1 // store 4 pixels of ARGB
+ movdqa [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB
+ lea eax, [eax + 16]
+ sub ecx, 8
+ jg convertloop
+ ret
+ }
+}
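+
+// Reference only: the per-channel expansion the pmulhuw trick above computes.
+// With the 5-bit field in the top of a 16-bit word, multiplying by 0x0108
+// (256 + 8) and keeping the high half equals the usual bit replication
+// (v << 3) | (v >> 2); the 6-bit green path uses 0x2080 the same way.
+// Expand5/Expand6 are illustrative helpers, not libyuv functions.
+static __inline uint8 Expand5(uint8 v) {  // v in [0, 31]
+  return (uint8)((v << 3) | (v >> 2));
+}
+static __inline uint8 Expand6(uint8 v) {  // v in [0, 63]
+  return (uint8)((v << 2) | (v >> 4));
+}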
+
+// 24 instructions
+__declspec(naked) __declspec(align(16))
+void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
+ int pix) {
+ __asm {
+ mov eax, 0x01080108 // generate multiplier to repeat 5 bits
+ movd xmm5, eax
+ pshufd xmm5, xmm5, 0
+ mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits
+ movd xmm6, eax
+ pshufd xmm6, xmm6, 0
+ pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red
+ psllw xmm3, 11
+ movdqa xmm4, xmm3 // generate mask 0x03e003e0 for Green
+ psrlw xmm4, 6
+ pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha
+ psllw xmm7, 8
+
+ mov eax, [esp + 4] // src_argb1555
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // pix
+ sub edx, eax
+ sub edx, eax
+
+ align 16
+ convertloop:
+ movdqu xmm0, [eax] // fetch 8 pixels of 1555
+ movdqa xmm1, xmm0
+ movdqa xmm2, xmm0
+ psllw xmm1, 1 // R in upper 5 bits
+ psllw xmm2, 11 // B in upper 5 bits
+ pand xmm1, xmm3
+ pmulhuw xmm2, xmm5 // * (256 + 8)
+ pmulhuw xmm1, xmm5 // * (256 + 8)
+ psllw xmm1, 8
+ por xmm1, xmm2 // RB
+ movdqa xmm2, xmm0
+ pand xmm0, xmm4 // G in middle 5 bits
+ psraw xmm2, 8 // A
+ pmulhuw xmm0, xmm6 // << 6 * (256 + 8)
+ pand xmm2, xmm7
+ por xmm0, xmm2 // AG
+ movdqa xmm2, xmm1
+ punpcklbw xmm1, xmm0
+ punpckhbw xmm2, xmm0
+ movdqa [eax * 2 + edx], xmm1 // store 4 pixels of ARGB
+ movdqa [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB
+ lea eax, [eax + 16]
+ sub ecx, 8
+ jg convertloop
+ ret
+ }
+}
+
+// 18 instructions.
+__declspec(naked) __declspec(align(16))
+void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb,
+ int pix) {
+ __asm {
+ mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f
+ movd xmm4, eax
+ pshufd xmm4, xmm4, 0
+ movdqa xmm5, xmm4 // 0xf0f0f0f0 for high nibbles
+ pslld xmm5, 4
+ mov eax, [esp + 4] // src_argb4444
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // pix
+ sub edx, eax
+ sub edx, eax
+
+ align 16
+ convertloop:
+ movdqu xmm0, [eax] // fetch 8 pixels of bgra4444
+ movdqa xmm2, xmm0
+ pand xmm0, xmm4 // mask low nibbles
+ pand xmm2, xmm5 // mask high nibbles
+ movdqa xmm1, xmm0
+ movdqa xmm3, xmm2
+ psllw xmm1, 4
+ psrlw xmm3, 4
+ por xmm0, xmm1
+ por xmm2, xmm3
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, xmm2
+ punpckhbw xmm1, xmm2
+ movdqa [eax * 2 + edx], xmm0 // store 4 pixels of ARGB
+ movdqa [eax * 2 + edx + 16], xmm1 // store next 4 pixels of ARGB
+ lea eax, [eax + 16]
+ sub ecx, 8
+ jg convertloop
+ ret
+ }
+}
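+
+// Reference only: per-channel math for the 4444 expansion above. Each 4-bit
+// field is copied into both nibbles of the output byte, i.e. n * 0x11.
+// Expand4 is an illustrative helper, not a libyuv function.
+static __inline uint8 Expand4(uint8 n) {  // n in [0, 15]
+  return (uint8)((n << 4) | n);
+}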
+
+__declspec(naked) __declspec(align(16))
+void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
+ __asm {
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_rgb
+ mov ecx, [esp + 12] // pix
+ movdqa xmm6, kShuffleMaskARGBToRGB24
+
+ align 16
+ convertloop:
+ movdqa xmm0, [eax] // fetch 16 pixels of argb
+ movdqa xmm1, [eax + 16]
+ movdqa xmm2, [eax + 32]
+ movdqa xmm3, [eax + 48]
+ lea eax, [eax + 64]
+ pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB
+ pshufb xmm1, xmm6
+ pshufb xmm2, xmm6
+ pshufb xmm3, xmm6
+ movdqa xmm4, xmm1 // 4 bytes from 1 for 0
+ psrldq xmm1, 4 // 8 bytes from 1
+ pslldq xmm4, 12 // 4 bytes from 1 for 0
+ movdqa xmm5, xmm2 // 8 bytes from 2 for 1
+ por xmm0, xmm4 // 4 bytes from 1 for 0
+ pslldq xmm5, 8 // 8 bytes from 2 for 1
+ movdqa [edx], xmm0 // store 0
+ por xmm1, xmm5 // 8 bytes from 2 for 1
+ psrldq xmm2, 8 // 4 bytes from 2
+ pslldq xmm3, 4 // 12 bytes from 3 for 2
+ por xmm2, xmm3 // 12 bytes from 3 for 2
+ movdqa [edx + 16], xmm1 // store 1
+ movdqa [edx + 32], xmm2 // store 2
+ lea edx, [edx + 48]
+ sub ecx, 16
+ jg convertloop
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
+ __asm {
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_rgb
+ mov ecx, [esp + 12] // pix
+ movdqa xmm6, kShuffleMaskARGBToRAW
+
+ align 16
+ convertloop:
+ movdqa xmm0, [eax] // fetch 16 pixels of argb
+ movdqa xmm1, [eax + 16]
+ movdqa xmm2, [eax + 32]
+ movdqa xmm3, [eax + 48]
+ lea eax, [eax + 64]
+ pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB
+ pshufb xmm1, xmm6
+ pshufb xmm2, xmm6
+ pshufb xmm3, xmm6
+ movdqa xmm4, xmm1 // 4 bytes from 1 for 0
+ psrldq xmm1, 4 // 8 bytes from 1
+ pslldq xmm4, 12 // 4 bytes from 1 for 0
+ movdqa xmm5, xmm2 // 8 bytes from 2 for 1
+ por xmm0, xmm4 // 4 bytes from 1 for 0
+ pslldq xmm5, 8 // 8 bytes from 2 for 1
+ movdqa [edx], xmm0 // store 0
+ por xmm1, xmm5 // 8 bytes from 2 for 1
+ psrldq xmm2, 8 // 4 bytes from 2
+ pslldq xmm3, 4 // 12 bytes from 3 for 2
+ por xmm2, xmm3 // 12 bytes from 3 for 2
+ movdqa [edx + 16], xmm1 // store 1
+ movdqa [edx + 32], xmm2 // store 2
+ lea edx, [edx + 48]
+ sub ecx, 16
+ jg convertloop
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
+ __asm {
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_rgb
+ mov ecx, [esp + 12] // pix
+ pcmpeqb xmm3, xmm3 // generate mask 0x0000001f
+ psrld xmm3, 27
+ pcmpeqb xmm4, xmm4 // generate mask 0x000007e0
+ psrld xmm4, 26
+ pslld xmm4, 5
+ pcmpeqb xmm5, xmm5 // generate mask 0xfffff800
+ pslld xmm5, 11
+
+ align 16
+ convertloop:
+ movdqa xmm0, [eax] // fetch 4 pixels of argb
+ movdqa xmm1, xmm0 // B
+ movdqa xmm2, xmm0 // G
+ pslld xmm0, 8 // R
+ psrld xmm1, 3 // B
+ psrld xmm2, 5 // G
+ psrad xmm0, 16 // R
+ pand xmm1, xmm3 // B
+ pand xmm2, xmm4 // G
+ pand xmm0, xmm5 // R
+ por xmm1, xmm2 // BG
+ por xmm0, xmm1 // BGR
+ packssdw xmm0, xmm0
+ lea eax, [eax + 16]
+ movq qword ptr [edx], xmm0 // store 4 pixels of RGB565
+ lea edx, [edx + 8]
+ sub ecx, 4
+ jg convertloop
+ ret
+ }
+}
+
+// TODO(fbarchard): Improve sign extension/packing.
+__declspec(naked) __declspec(align(16))
+void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
+ __asm {
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_rgb
+ mov ecx, [esp + 12] // pix
+ pcmpeqb xmm4, xmm4 // generate mask 0x0000001f
+ psrld xmm4, 27
+ movdqa xmm5, xmm4 // generate mask 0x000003e0
+ pslld xmm5, 5
+ movdqa xmm6, xmm4 // generate mask 0x00007c00
+ pslld xmm6, 10
+ pcmpeqb xmm7, xmm7 // generate mask 0xffff8000
+ pslld xmm7, 15
+
+ align 16
+ convertloop:
+ movdqa xmm0, [eax] // fetch 4 pixels of argb
+ movdqa xmm1, xmm0 // B
+ movdqa xmm2, xmm0 // G
+ movdqa xmm3, xmm0 // R
+ psrad xmm0, 16 // A
+ psrld xmm1, 3 // B
+ psrld xmm2, 6 // G
+ psrld xmm3, 9 // R
+ pand xmm0, xmm7 // A
+ pand xmm1, xmm4 // B
+ pand xmm2, xmm5 // G
+ pand xmm3, xmm6 // R
+ por xmm0, xmm1 // BA
+ por xmm2, xmm3 // GR
+ por xmm0, xmm2 // BGRA
+ packssdw xmm0, xmm0
+ lea eax, [eax + 16]
+ movq qword ptr [edx], xmm0 // store 4 pixels of ARGB1555
+ lea edx, [edx + 8]
+ sub ecx, 4
+ jg convertloop
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
+ __asm {
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_rgb
+ mov ecx, [esp + 12] // pix
+ pcmpeqb xmm4, xmm4 // generate mask 0xf000f000
+ psllw xmm4, 12
+ movdqa xmm3, xmm4 // generate mask 0x00f000f0
+ psrlw xmm3, 8
+
+ align 16
+ convertloop:
+ movdqa xmm0, [eax] // fetch 4 pixels of argb
+ movdqa xmm1, xmm0
+ pand xmm0, xmm3 // low nibble
+ pand xmm1, xmm4 // high nibble
+ psrlw xmm0, 4
+ psrlw xmm1, 8
+ por xmm0, xmm1
+ packuswb xmm0, xmm0
+ lea eax, [eax + 16]
+ movq qword ptr [edx], xmm0 // store 4 pixels of ARGB4444
+ lea edx, [edx + 8]
+ sub ecx, 4
+ jg convertloop
+ ret
+ }
+}
+
+// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
+__declspec(naked) __declspec(align(16))
+void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+ __asm {
+ mov eax, [esp + 4] /* src_argb */
+ mov edx, [esp + 8] /* dst_y */
+ mov ecx, [esp + 12] /* pix */
+ movdqa xmm5, kAddY16
+ movdqa xmm4, kARGBToY
+
+ align 16
+ convertloop:
+ movdqa xmm0, [eax]
+ movdqa xmm1, [eax + 16]
+ movdqa xmm2, [eax + 32]
+ movdqa xmm3, [eax + 48]
+ pmaddubsw xmm0, xmm4
+ pmaddubsw xmm1, xmm4
+ pmaddubsw xmm2, xmm4
+ pmaddubsw xmm3, xmm4
+ lea eax, [eax + 64]
+ phaddw xmm0, xmm1
+ phaddw xmm2, xmm3
+ psrlw xmm0, 7
+ psrlw xmm2, 7
+ packuswb xmm0, xmm2
+ paddb xmm0, xmm5
+ sub ecx, 16
+ movdqa [edx], xmm0
+ lea edx, [edx + 16]
+ jg convertloop
+ ret
+ }
+}
+
+// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
+__declspec(naked) __declspec(align(16))
+void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+ __asm {
+ mov eax, [esp + 4] /* src_argb */
+ mov edx, [esp + 8] /* dst_y */
+ mov ecx, [esp + 12] /* pix */
+ movdqa xmm4, kARGBToYJ
+ movdqa xmm5, kAddYJ64
+
+ align 16
+ convertloop:
+ movdqa xmm0, [eax]
+ movdqa xmm1, [eax + 16]
+ movdqa xmm2, [eax + 32]
+ movdqa xmm3, [eax + 48]
+ pmaddubsw xmm0, xmm4
+ pmaddubsw xmm1, xmm4
+ pmaddubsw xmm2, xmm4
+ pmaddubsw xmm3, xmm4
+ lea eax, [eax + 64]
+ phaddw xmm0, xmm1
+ phaddw xmm2, xmm3
+ paddw xmm0, xmm5 // Add .5 for rounding.
+ paddw xmm2, xmm5
+ psrlw xmm0, 7
+ psrlw xmm2, 7
+ packuswb xmm0, xmm2
+ sub ecx, 16
+ movdqa [edx], xmm0
+ lea edx, [edx + 16]
+ jg convertloop
+ ret
+ }
+}
+
+#ifdef HAS_ARGBTOYROW_AVX2
+// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
+__declspec(naked) __declspec(align(32))
+void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
+ __asm {
+ mov eax, [esp + 4] /* src_argb */
+ mov edx, [esp + 8] /* dst_y */
+ mov ecx, [esp + 12] /* pix */
+ vmovdqa ymm6, kShufARGBToY_AVX
+ vmovdqa ymm5, kAddY16_AVX
+ vmovdqa ymm4, kARGBToY_AVX
+
+ align 16
+ convertloop:
+ vmovdqu ymm0, [eax]
+ vmovdqu ymm1, [eax + 32]
+ vmovdqu ymm2, [eax + 64]
+ vmovdqu ymm3, [eax + 96]
+ vpmaddubsw ymm0, ymm0, ymm4
+ vpmaddubsw ymm1, ymm1, ymm4
+ vpmaddubsw ymm2, ymm2, ymm4
+ vpmaddubsw ymm3, ymm3, ymm4
+ lea eax, [eax + 128]
+ vphaddw ymm0, ymm0, ymm1 // mutates.
+ vphaddw ymm2, ymm2, ymm3
+ vpsrlw ymm0, ymm0, 7
+ vpsrlw ymm2, ymm2, 7
+ vpackuswb ymm0, ymm0, ymm2 // mutates.
+ vpermd ymm0, ymm6, ymm0 // For vphaddw + vpackuswb mutation.
+ vpaddb ymm0, ymm0, ymm5
+ sub ecx, 32
+ vmovdqu [edx], ymm0
+ lea edx, [edx + 32]
+ jg convertloop
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_ARGBTOYROW_AVX2
+
+#ifdef HAS_ARGBTOYROW_AVX2
+// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
+__declspec(naked) __declspec(align(32))
+void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
+ __asm {
+ mov eax, [esp + 4] /* src_argb */
+ mov edx, [esp + 8] /* dst_y */
+ mov ecx, [esp + 12] /* pix */
+ vmovdqa ymm4, kARGBToYJ_AVX
+ vmovdqa ymm5, kAddYJ64_AVX
+ vmovdqa ymm6, kShufARGBToY_AVX
+
+ align 16
+ convertloop:
+ vmovdqu ymm0, [eax]
+ vmovdqu ymm1, [eax + 32]
+ vmovdqu ymm2, [eax + 64]
+ vmovdqu ymm3, [eax + 96]
+ vpmaddubsw ymm0, ymm0, ymm4
+ vpmaddubsw ymm1, ymm1, ymm4
+ vpmaddubsw ymm2, ymm2, ymm4
+ vpmaddubsw ymm3, ymm3, ymm4
+ lea eax, [eax + 128]
+ vphaddw ymm0, ymm0, ymm1 // mutates.
+ vphaddw ymm2, ymm2, ymm3
+ vpaddw ymm0, ymm0, ymm5 // Add .5 for rounding.
+ vpaddw ymm2, ymm2, ymm5
+ vpsrlw ymm0, ymm0, 7
+ vpsrlw ymm2, ymm2, 7
+ vpackuswb ymm0, ymm0, ymm2 // mutates.
+ vpermd ymm0, ymm6, ymm0 // For vphaddw + vpackuswb mutation.
+ sub ecx, 32
+ vmovdqu [edx], ymm0
+ lea edx, [edx + 32]
+ jg convertloop
+
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_ARGBTOYROW_AVX2
+
+__declspec(naked) __declspec(align(16))
+void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+ __asm {
+ mov eax, [esp + 4] /* src_argb */
+ mov edx, [esp + 8] /* dst_y */
+ mov ecx, [esp + 12] /* pix */
+ movdqa xmm5, kAddY16
+ movdqa xmm4, kARGBToY
+
+ align 16
+ convertloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
+ pmaddubsw xmm0, xmm4
+ pmaddubsw xmm1, xmm4
+ pmaddubsw xmm2, xmm4
+ pmaddubsw xmm3, xmm4
+ lea eax, [eax + 64]
+ phaddw xmm0, xmm1
+ phaddw xmm2, xmm3
+ psrlw xmm0, 7
+ psrlw xmm2, 7
+ packuswb xmm0, xmm2
+ paddb xmm0, xmm5
+ sub ecx, 16
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ jg convertloop
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void ARGBToYJRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+ __asm {
+ mov eax, [esp + 4] /* src_argb */
+ mov edx, [esp + 8] /* dst_y */
+ mov ecx, [esp + 12] /* pix */
+ movdqa xmm4, kARGBToYJ
+ movdqa xmm5, kAddYJ64
+
+ align 16
+ convertloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
+ pmaddubsw xmm0, xmm4
+ pmaddubsw xmm1, xmm4
+ pmaddubsw xmm2, xmm4
+ pmaddubsw xmm3, xmm4
+ lea eax, [eax + 64]
+ phaddw xmm0, xmm1
+ phaddw xmm2, xmm3
+ paddw xmm0, xmm5
+ paddw xmm2, xmm5
+ psrlw xmm0, 7
+ psrlw xmm2, 7
+ packuswb xmm0, xmm2
+ sub ecx, 16
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ jg convertloop
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+ __asm {
+ mov eax, [esp + 4] /* src_argb */
+ mov edx, [esp + 8] /* dst_y */
+ mov ecx, [esp + 12] /* pix */
+ movdqa xmm5, kAddY16
+ movdqa xmm4, kBGRAToY
+
+ align 16
+ convertloop:
+ movdqa xmm0, [eax]
+ movdqa xmm1, [eax + 16]
+ movdqa xmm2, [eax + 32]
+ movdqa xmm3, [eax + 48]
+ pmaddubsw xmm0, xmm4
+ pmaddubsw xmm1, xmm4
+ pmaddubsw xmm2, xmm4
+ pmaddubsw xmm3, xmm4
+ lea eax, [eax + 64]
+ phaddw xmm0, xmm1
+ phaddw xmm2, xmm3
+ psrlw xmm0, 7
+ psrlw xmm2, 7
+ packuswb xmm0, xmm2
+ paddb xmm0, xmm5
+ sub ecx, 16
+ movdqa [edx], xmm0
+ lea edx, [edx + 16]
+ jg convertloop
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void BGRAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+ __asm {
+ mov eax, [esp + 4] /* src_argb */
+ mov edx, [esp + 8] /* dst_y */
+ mov ecx, [esp + 12] /* pix */
+ movdqa xmm5, kAddY16
+ movdqa xmm4, kBGRAToY
+
+ align 16
+ convertloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
+ pmaddubsw xmm0, xmm4
+ pmaddubsw xmm1, xmm4
+ pmaddubsw xmm2, xmm4
+ pmaddubsw xmm3, xmm4
+ lea eax, [eax + 64]
+ phaddw xmm0, xmm1
+ phaddw xmm2, xmm3
+ psrlw xmm0, 7
+ psrlw xmm2, 7
+ packuswb xmm0, xmm2
+ paddb xmm0, xmm5
+ sub ecx, 16
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ jg convertloop
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+ __asm {
+ mov eax, [esp + 4] /* src_argb */
+ mov edx, [esp + 8] /* dst_y */
+ mov ecx, [esp + 12] /* pix */
+ movdqa xmm5, kAddY16
+ movdqa xmm4, kABGRToY
+
+ align 16
+ convertloop:
+ movdqa xmm0, [eax]
+ movdqa xmm1, [eax + 16]
+ movdqa xmm2, [eax + 32]
+ movdqa xmm3, [eax + 48]
+ pmaddubsw xmm0, xmm4
+ pmaddubsw xmm1, xmm4
+ pmaddubsw xmm2, xmm4
+ pmaddubsw xmm3, xmm4
+ lea eax, [eax + 64]
+ phaddw xmm0, xmm1
+ phaddw xmm2, xmm3
+ psrlw xmm0, 7
+ psrlw xmm2, 7
+ packuswb xmm0, xmm2
+ paddb xmm0, xmm5
+ sub ecx, 16
+ movdqa [edx], xmm0
+ lea edx, [edx + 16]
+ jg convertloop
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void ABGRToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+ __asm {
+ mov eax, [esp + 4] /* src_argb */
+ mov edx, [esp + 8] /* dst_y */
+ mov ecx, [esp + 12] /* pix */
+ movdqa xmm5, kAddY16
+ movdqa xmm4, kABGRToY
+
+ align 16
+ convertloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
+ pmaddubsw xmm0, xmm4
+ pmaddubsw xmm1, xmm4
+ pmaddubsw xmm2, xmm4
+ pmaddubsw xmm3, xmm4
+ lea eax, [eax + 64]
+ phaddw xmm0, xmm1
+ phaddw xmm2, xmm3
+ psrlw xmm0, 7
+ psrlw xmm2, 7
+ packuswb xmm0, xmm2
+ paddb xmm0, xmm5
+ sub ecx, 16
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ jg convertloop
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+ __asm {
+ mov eax, [esp + 4] /* src_argb */
+ mov edx, [esp + 8] /* dst_y */
+ mov ecx, [esp + 12] /* pix */
+ movdqa xmm5, kAddY16
+ movdqa xmm4, kRGBAToY
+
+ align 16
+ convertloop:
+ movdqa xmm0, [eax]
+ movdqa xmm1, [eax + 16]
+ movdqa xmm2, [eax + 32]
+ movdqa xmm3, [eax + 48]
+ pmaddubsw xmm0, xmm4
+ pmaddubsw xmm1, xmm4
+ pmaddubsw xmm2, xmm4
+ pmaddubsw xmm3, xmm4
+ lea eax, [eax + 64]
+ phaddw xmm0, xmm1
+ phaddw xmm2, xmm3
+ psrlw xmm0, 7
+ psrlw xmm2, 7
+ packuswb xmm0, xmm2
+ paddb xmm0, xmm5
+ sub ecx, 16
+ movdqa [edx], xmm0
+ lea edx, [edx + 16]
+ jg convertloop
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void RGBAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+ __asm {
+ mov eax, [esp + 4] /* src_argb */
+ mov edx, [esp + 8] /* dst_y */
+ mov ecx, [esp + 12] /* pix */
+ movdqa xmm5, kAddY16
+ movdqa xmm4, kRGBAToY
+
+ align 16
+ convertloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
+ pmaddubsw xmm0, xmm4
+ pmaddubsw xmm1, xmm4
+ pmaddubsw xmm2, xmm4
+ pmaddubsw xmm3, xmm4
+ lea eax, [eax + 64]
+ phaddw xmm0, xmm1
+ phaddw xmm2, xmm3
+ psrlw xmm0, 7
+ psrlw xmm2, 7
+ packuswb xmm0, xmm2
+ paddb xmm0, xmm5
+ sub ecx, 16
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ jg convertloop
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // src_argb
+ mov esi, [esp + 8 + 8] // src_stride_argb
+ mov edx, [esp + 8 + 12] // dst_u
+ mov edi, [esp + 8 + 16] // dst_v
+ mov ecx, [esp + 8 + 20] // pix
+ movdqa xmm7, kARGBToU
+ movdqa xmm6, kARGBToV
+ movdqa xmm5, kAddUV128
+ sub edi, edx // stride from u to v
+
+ align 16
+ convertloop:
+ /* step 1 - subsample 16x2 argb pixels to 8x1 */
+ movdqa xmm0, [eax]
+ movdqa xmm1, [eax + 16]
+ movdqa xmm2, [eax + 32]
+ movdqa xmm3, [eax + 48]
+ pavgb xmm0, [eax + esi]
+ pavgb xmm1, [eax + esi + 16]
+ pavgb xmm2, [eax + esi + 32]
+ pavgb xmm3, [eax + esi + 48]
+ lea eax, [eax + 64]
+ movdqa xmm4, xmm0
+ shufps xmm0, xmm1, 0x88
+ shufps xmm4, xmm1, 0xdd
+ pavgb xmm0, xmm4
+ movdqa xmm4, xmm2
+ shufps xmm2, xmm3, 0x88
+ shufps xmm4, xmm3, 0xdd
+ pavgb xmm2, xmm4
+
+ // step 2 - convert to U and V
+ // from here down is very similar to Y code except
+ // instead of 16 different pixels, it's 8 pixels of U and 8 of V
+ movdqa xmm1, xmm0
+ movdqa xmm3, xmm2
+ pmaddubsw xmm0, xmm7 // U
+ pmaddubsw xmm2, xmm7
+ pmaddubsw xmm1, xmm6 // V
+ pmaddubsw xmm3, xmm6
+ phaddw xmm0, xmm2
+ phaddw xmm1, xmm3
+ psraw xmm0, 8
+ psraw xmm1, 8
+ packsswb xmm0, xmm1
+ paddb xmm0, xmm5 // -> unsigned
+
+ // step 3 - store 8 U and 8 V values
+ sub ecx, 16
+ movlps qword ptr [edx], xmm0 // U
+ movhps qword ptr [edx + edi], xmm0 // V
+ lea edx, [edx + 8]
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
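+
+// Reference only: the per-output math of the UV loop above, assuming the same
+// BT.601 constants. A 2x2 block of ARGB is box-averaged (pavgb vertically,
+// then horizontally; pavgb rounds up), and the averaged B/G/R go through the
+// same >>8 +128 formula as the scalar U/V sketch earlier in this file.
+// ScalarARGBToUV2x2 is an illustrative helper, not a libyuv function.
+static void ScalarARGBToUV2x2(const uint8* row0, const uint8* row1,
+                              uint8* dst_u, uint8* dst_v) {
+  // row0/row1 point at two vertically adjacent pairs of ARGB pixels (8 bytes).
+  int b = ((row0[0] + row1[0] + 1) / 2 + (row0[4] + row1[4] + 1) / 2 + 1) / 2;
+  int g = ((row0[1] + row1[1] + 1) / 2 + (row0[5] + row1[5] + 1) / 2 + 1) / 2;
+  int r = ((row0[2] + row1[2] + 1) / 2 + (row0[6] + row1[6] + 1) / 2 + 1) / 2;
+  *dst_u = (uint8)(((112 * b - 74 * g - 38 * r) >> 8) + 128);
+  *dst_v = (uint8)(((112 * r - 94 * g - 18 * b) >> 8) + 128);
+}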
+
+__declspec(naked) __declspec(align(16))
+void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // src_argb
+ mov esi, [esp + 8 + 8] // src_stride_argb
+ mov edx, [esp + 8 + 12] // dst_u
+ mov edi, [esp + 8 + 16] // dst_v
+ mov ecx, [esp + 8 + 20] // pix
+ movdqa xmm7, kARGBToUJ
+ movdqa xmm6, kARGBToVJ
+ movdqa xmm5, kAddUVJ128
+ sub edi, edx // stride from u to v
+
+ align 16
+ convertloop:
+ /* step 1 - subsample 16x2 argb pixels to 8x1 */
+ movdqa xmm0, [eax]
+ movdqa xmm1, [eax + 16]
+ movdqa xmm2, [eax + 32]
+ movdqa xmm3, [eax + 48]
+ pavgb xmm0, [eax + esi]
+ pavgb xmm1, [eax + esi + 16]
+ pavgb xmm2, [eax + esi + 32]
+ pavgb xmm3, [eax + esi + 48]
+ lea eax, [eax + 64]
+ movdqa xmm4, xmm0
+ shufps xmm0, xmm1, 0x88
+ shufps xmm4, xmm1, 0xdd
+ pavgb xmm0, xmm4
+ movdqa xmm4, xmm2
+ shufps xmm2, xmm3, 0x88
+ shufps xmm4, xmm3, 0xdd
+ pavgb xmm2, xmm4
+
+ // step 2 - convert to U and V
+ // from here down is very similar to Y code except
+ // instead of 16 different pixels, it's 8 pixels of U and 8 of V
+ movdqa xmm1, xmm0
+ movdqa xmm3, xmm2
+ pmaddubsw xmm0, xmm7 // U
+ pmaddubsw xmm2, xmm7
+ pmaddubsw xmm1, xmm6 // V
+ pmaddubsw xmm3, xmm6
+ phaddw xmm0, xmm2
+ phaddw xmm1, xmm3
+ paddw xmm0, xmm5 // +.5 rounding -> unsigned
+ paddw xmm1, xmm5
+ psraw xmm0, 8
+ psraw xmm1, 8
+ packsswb xmm0, xmm1
+
+ // step 3 - store 8 U and 8 V values
+ sub ecx, 16
+ movlps qword ptr [edx], xmm0 // U
+ movhps qword ptr [edx + edi], xmm0 // V
+ lea edx, [edx + 8]
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+#ifdef HAS_ARGBTOUVROW_AVX2
+__declspec(naked) __declspec(align(32))
+void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // src_argb
+ mov esi, [esp + 8 + 8] // src_stride_argb
+ mov edx, [esp + 8 + 12] // dst_u
+ mov edi, [esp + 8 + 16] // dst_v
+ mov ecx, [esp + 8 + 20] // pix
+ vmovdqa ymm7, kARGBToU_AVX
+ vmovdqa ymm6, kARGBToV_AVX
+ vmovdqa ymm5, kAddUV128_AVX
+ sub edi, edx // stride from u to v
+
+ align 16
+ convertloop:
+ /* step 1 - subsample 32x2 argb pixels to 16x1 */
+ vmovdqu ymm0, [eax]
+ vmovdqu ymm1, [eax + 32]
+ vmovdqu ymm2, [eax + 64]
+ vmovdqu ymm3, [eax + 96]
+ vpavgb ymm0, ymm0, [eax + esi]
+ vpavgb ymm1, ymm1, [eax + esi + 32]
+ vpavgb ymm2, ymm2, [eax + esi + 64]
+ vpavgb ymm3, ymm3, [eax + esi + 96]
+ lea eax, [eax + 128]
+ vshufps ymm4, ymm0, ymm1, 0x88
+ vshufps ymm0, ymm0, ymm1, 0xdd
+ vpavgb ymm0, ymm0, ymm4 // mutated by vshufps
+ vshufps ymm4, ymm2, ymm3, 0x88
+ vshufps ymm2, ymm2, ymm3, 0xdd
+ vpavgb ymm2, ymm2, ymm4 // mutated by vshufps
+
+ // step 2 - convert to U and V
+ // from here down is very similar to Y code except
+ // instead of 32 different pixels, it's 16 pixels of U and 16 of V
+ vpmaddubsw ymm1, ymm0, ymm7 // U
+ vpmaddubsw ymm3, ymm2, ymm7
+ vpmaddubsw ymm0, ymm0, ymm6 // V
+ vpmaddubsw ymm2, ymm2, ymm6
+ vphaddw ymm1, ymm1, ymm3 // mutates
+ vphaddw ymm0, ymm0, ymm2
+ vpsraw ymm1, ymm1, 8
+ vpsraw ymm0, ymm0, 8
+ vpacksswb ymm0, ymm1, ymm0 // mutates
+ vpermq ymm0, ymm0, 0xd8 // For vpacksswb
+ vpshufb ymm0, ymm0, kShufARGBToUV_AVX // For vshufps + vphaddw
+ vpaddb ymm0, ymm0, ymm5 // -> unsigned
+
+ // step 3 - store 16 U and 16 V values
+ sub ecx, 32
+ vextractf128 [edx], ymm0, 0 // U
+ vextractf128 [edx + edi], ymm0, 1 // V
+ lea edx, [edx + 16]
+ jg convertloop
+
+ pop edi
+ pop esi
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_ARGBTOUVROW_AVX2
+
+__declspec(naked) __declspec(align(16))
+void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // src_argb
+ mov esi, [esp + 8 + 8] // src_stride_argb
+ mov edx, [esp + 8 + 12] // dst_u
+ mov edi, [esp + 8 + 16] // dst_v
+ mov ecx, [esp + 8 + 20] // pix
+ movdqa xmm7, kARGBToU
+ movdqa xmm6, kARGBToV
+ movdqa xmm5, kAddUV128
+ sub edi, edx // stride from u to v
+
+ align 16
+ convertloop:
+ /* step 1 - subsample 16x2 argb pixels to 8x1 */
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
+ movdqu xmm4, [eax + esi]
+ pavgb xmm0, xmm4
+ movdqu xmm4, [eax + esi + 16]
+ pavgb xmm1, xmm4
+ movdqu xmm4, [eax + esi + 32]
+ pavgb xmm2, xmm4
+ movdqu xmm4, [eax + esi + 48]
+ pavgb xmm3, xmm4
+ lea eax, [eax + 64]
+ movdqa xmm4, xmm0
+ shufps xmm0, xmm1, 0x88
+ shufps xmm4, xmm1, 0xdd
+ pavgb xmm0, xmm4
+ movdqa xmm4, xmm2
+ shufps xmm2, xmm3, 0x88
+ shufps xmm4, xmm3, 0xdd
+ pavgb xmm2, xmm4
+
+ // step 2 - convert to U and V
+ // from here down is very similar to Y code except
+ // instead of 16 different pixels, it's 8 pixels of U and 8 of V
+ movdqa xmm1, xmm0
+ movdqa xmm3, xmm2
+ pmaddubsw xmm0, xmm7 // U
+ pmaddubsw xmm2, xmm7
+ pmaddubsw xmm1, xmm6 // V
+ pmaddubsw xmm3, xmm6
+ phaddw xmm0, xmm2
+ phaddw xmm1, xmm3
+ psraw xmm0, 8
+ psraw xmm1, 8
+ packsswb xmm0, xmm1
+ paddb xmm0, xmm5 // -> unsigned
+
+ // step 3 - store 8 U and 8 V values
+ sub ecx, 16
+ movlps qword ptr [edx], xmm0 // U
+ movhps qword ptr [edx + edi], xmm0 // V
+ lea edx, [edx + 8]
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void ARGBToUVJRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // src_argb
+ mov esi, [esp + 8 + 8] // src_stride_argb
+ mov edx, [esp + 8 + 12] // dst_u
+ mov edi, [esp + 8 + 16] // dst_v
+ mov ecx, [esp + 8 + 20] // pix
+ movdqa xmm7, kARGBToUJ
+ movdqa xmm6, kARGBToVJ
+ movdqa xmm5, kAddUVJ128
+ sub edi, edx // stride from u to v
+
+ align 16
+ convertloop:
+ /* step 1 - subsample 16x2 argb pixels to 8x1 */
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
+ movdqu xmm4, [eax + esi]
+ pavgb xmm0, xmm4
+ movdqu xmm4, [eax + esi + 16]
+ pavgb xmm1, xmm4
+ movdqu xmm4, [eax + esi + 32]
+ pavgb xmm2, xmm4
+ movdqu xmm4, [eax + esi + 48]
+ pavgb xmm3, xmm4
+ lea eax, [eax + 64]
+ movdqa xmm4, xmm0
+ shufps xmm0, xmm1, 0x88
+ shufps xmm4, xmm1, 0xdd
+ pavgb xmm0, xmm4
+ movdqa xmm4, xmm2
+ shufps xmm2, xmm3, 0x88
+ shufps xmm4, xmm3, 0xdd
+ pavgb xmm2, xmm4
+
+ // step 2 - convert to U and V
+ // from here down is very similar to Y code except
+ // instead of 16 different pixels, it's 8 pixels of U and 8 of V
+ movdqa xmm1, xmm0
+ movdqa xmm3, xmm2
+ pmaddubsw xmm0, xmm7 // U
+ pmaddubsw xmm2, xmm7
+ pmaddubsw xmm1, xmm6 // V
+ pmaddubsw xmm3, xmm6
+ phaddw xmm0, xmm2
+ phaddw xmm1, xmm3
+ paddw xmm0, xmm5 // +.5 rounding -> unsigned
+ paddw xmm1, xmm5
+ psraw xmm0, 8
+ psraw xmm1, 8
+ packsswb xmm0, xmm1
+
+ // step 3 - store 8 U and 8 V values
+ sub ecx, 16
+ movlps qword ptr [edx], xmm0 // U
+ movhps qword ptr [edx + edi], xmm0 // V
+ lea edx, [edx + 8]
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void ARGBToUV444Row_SSSE3(const uint8* src_argb0,
+ uint8* dst_u, uint8* dst_v, int width) {
+ __asm {
+ push edi
+ mov eax, [esp + 4 + 4] // src_argb
+ mov edx, [esp + 4 + 8] // dst_u
+ mov edi, [esp + 4 + 12] // dst_v
+ mov ecx, [esp + 4 + 16] // pix
+ movdqa xmm7, kARGBToU
+ movdqa xmm6, kARGBToV
+ movdqa xmm5, kAddUV128
+ sub edi, edx // stride from u to v
+
+ align 16
+ convertloop:
+ /* convert to U and V */
+ movdqa xmm0, [eax] // U
+ movdqa xmm1, [eax + 16]
+ movdqa xmm2, [eax + 32]
+ movdqa xmm3, [eax + 48]
+ pmaddubsw xmm0, xmm7
+ pmaddubsw xmm1, xmm7
+ pmaddubsw xmm2, xmm7
+ pmaddubsw xmm3, xmm7
+ phaddw xmm0, xmm1
+ phaddw xmm2, xmm3
+ psraw xmm0, 8
+ psraw xmm2, 8
+ packsswb xmm0, xmm2
+ paddb xmm0, xmm5
+ sub ecx, 16
+ movdqa [edx], xmm0
+
+ movdqa xmm0, [eax] // V
+ movdqa xmm1, [eax + 16]
+ movdqa xmm2, [eax + 32]
+ movdqa xmm3, [eax + 48]
+ pmaddubsw xmm0, xmm6
+ pmaddubsw xmm1, xmm6
+ pmaddubsw xmm2, xmm6
+ pmaddubsw xmm3, xmm6
+ phaddw xmm0, xmm1
+ phaddw xmm2, xmm3
+ psraw xmm0, 8
+ psraw xmm2, 8
+ packsswb xmm0, xmm2
+ paddb xmm0, xmm5
+ lea eax, [eax + 64]
+ movdqa [edx + edi], xmm0
+ lea edx, [edx + 16]
+ jg convertloop
+
+ pop edi
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void ARGBToUV444Row_Unaligned_SSSE3(const uint8* src_argb0,
+ uint8* dst_u, uint8* dst_v, int width) {
+ __asm {
+ push edi
+ mov eax, [esp + 4 + 4] // src_argb
+ mov edx, [esp + 4 + 8] // dst_u
+ mov edi, [esp + 4 + 12] // dst_v
+ mov ecx, [esp + 4 + 16] // pix
+ movdqa xmm7, kARGBToU
+ movdqa xmm6, kARGBToV
+ movdqa xmm5, kAddUV128
+ sub edi, edx // stride from u to v
+
+ align 16
+ convertloop:
+ /* convert to U and V */
+ movdqu xmm0, [eax] // U
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
+ pmaddubsw xmm0, xmm7
+ pmaddubsw xmm1, xmm7
+ pmaddubsw xmm2, xmm7
+ pmaddubsw xmm3, xmm7
+ phaddw xmm0, xmm1
+ phaddw xmm2, xmm3
+ psraw xmm0, 8
+ psraw xmm2, 8
+ packsswb xmm0, xmm2
+ paddb xmm0, xmm5
+ sub ecx, 16
+ movdqu [edx], xmm0
+
+ movdqu xmm0, [eax] // V
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
+ pmaddubsw xmm0, xmm6
+ pmaddubsw xmm1, xmm6
+ pmaddubsw xmm2, xmm6
+ pmaddubsw xmm3, xmm6
+ phaddw xmm0, xmm1
+ phaddw xmm2, xmm3
+ psraw xmm0, 8
+ psraw xmm2, 8
+ packsswb xmm0, xmm2
+ paddb xmm0, xmm5
+ lea eax, [eax + 64]
+ movdqu [edx + edi], xmm0
+ lea edx, [edx + 16]
+ jg convertloop
+
+ pop edi
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void ARGBToUV422Row_SSSE3(const uint8* src_argb0,
+ uint8* dst_u, uint8* dst_v, int width) {
+ __asm {
+ push edi
+ mov eax, [esp + 4 + 4] // src_argb
+ mov edx, [esp + 4 + 8] // dst_u
+ mov edi, [esp + 4 + 12] // dst_v
+ mov ecx, [esp + 4 + 16] // pix
+ movdqa xmm7, kARGBToU
+ movdqa xmm6, kARGBToV
+ movdqa xmm5, kAddUV128
+ sub edi, edx // stride from u to v
+
+ align 16
+ convertloop:
+ /* step 1 - subsample 16x2 argb pixels to 8x1 */
+ movdqa xmm0, [eax]
+ movdqa xmm1, [eax + 16]
+ movdqa xmm2, [eax + 32]
+ movdqa xmm3, [eax + 48]
+ lea eax, [eax + 64]
+ movdqa xmm4, xmm0
+ shufps xmm0, xmm1, 0x88
+ shufps xmm4, xmm1, 0xdd
+ pavgb xmm0, xmm4
+ movdqa xmm4, xmm2
+ shufps xmm2, xmm3, 0x88
+ shufps xmm4, xmm3, 0xdd
+ pavgb xmm2, xmm4
+
+ // step 2 - convert to U and V
+ // from here down is very similar to Y code except
+ // instead of 16 different pixels, it's 8 pixels of U and 8 of V
+ movdqa xmm1, xmm0
+ movdqa xmm3, xmm2
+ pmaddubsw xmm0, xmm7 // U
+ pmaddubsw xmm2, xmm7
+ pmaddubsw xmm1, xmm6 // V
+ pmaddubsw xmm3, xmm6
+ phaddw xmm0, xmm2
+ phaddw xmm1, xmm3
+ psraw xmm0, 8
+ psraw xmm1, 8
+ packsswb xmm0, xmm1
+ paddb xmm0, xmm5 // -> unsigned
+
+ // step 3 - store 8 U and 8 V values
+ sub ecx, 16
+ movlps qword ptr [edx], xmm0 // U
+ movhps qword ptr [edx + edi], xmm0 // V
+ lea edx, [edx + 8]
+ jg convertloop
+
+ pop edi
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void ARGBToUV422Row_Unaligned_SSSE3(const uint8* src_argb0,
+ uint8* dst_u, uint8* dst_v, int width) {
+ __asm {
+ push edi
+ mov eax, [esp + 4 + 4] // src_argb
+ mov edx, [esp + 4 + 8] // dst_u
+ mov edi, [esp + 4 + 12] // dst_v
+ mov ecx, [esp + 4 + 16] // pix
+ movdqa xmm7, kARGBToU
+ movdqa xmm6, kARGBToV
+ movdqa xmm5, kAddUV128
+ sub edi, edx // stride from u to v
+
+ align 16
+ convertloop:
+ /* step 1 - subsample 16x2 argb pixels to 8x1 */
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
+ lea eax, [eax + 64]
+ movdqa xmm4, xmm0
+ shufps xmm0, xmm1, 0x88
+ shufps xmm4, xmm1, 0xdd
+ pavgb xmm0, xmm4
+ movdqa xmm4, xmm2
+ shufps xmm2, xmm3, 0x88
+ shufps xmm4, xmm3, 0xdd
+ pavgb xmm2, xmm4
+
+ // step 2 - convert to U and V
+ // from here down is very similar to Y code except
+ // instead of 16 different pixels, it's 8 pixels of U and 8 of V
+ movdqa xmm1, xmm0
+ movdqa xmm3, xmm2
+ pmaddubsw xmm0, xmm7 // U
+ pmaddubsw xmm2, xmm7
+ pmaddubsw xmm1, xmm6 // V
+ pmaddubsw xmm3, xmm6
+ phaddw xmm0, xmm2
+ phaddw xmm1, xmm3
+ psraw xmm0, 8
+ psraw xmm1, 8
+ packsswb xmm0, xmm1
+ paddb xmm0, xmm5 // -> unsigned
+
+ // step 3 - store 8 U and 8 V values
+ sub ecx, 16
+ movlps qword ptr [edx], xmm0 // U
+ movhps qword ptr [edx + edi], xmm0 // V
+ lea edx, [edx + 8]
+ jg convertloop
+
+ pop edi
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // src_argb
+ mov esi, [esp + 8 + 8] // src_stride_argb
+ mov edx, [esp + 8 + 12] // dst_u
+ mov edi, [esp + 8 + 16] // dst_v
+ mov ecx, [esp + 8 + 20] // pix
+ movdqa xmm7, kBGRAToU
+ movdqa xmm6, kBGRAToV
+ movdqa xmm5, kAddUV128
+ sub edi, edx // stride from u to v
+
+ align 16
+ convertloop:
+ /* step 1 - subsample 16x2 argb pixels to 8x1 */
+ movdqa xmm0, [eax]
+ movdqa xmm1, [eax + 16]
+ movdqa xmm2, [eax + 32]
+ movdqa xmm3, [eax + 48]
+ pavgb xmm0, [eax + esi]
+ pavgb xmm1, [eax + esi + 16]
+ pavgb xmm2, [eax + esi + 32]
+ pavgb xmm3, [eax + esi + 48]
+ lea eax, [eax + 64]
+ movdqa xmm4, xmm0
+ shufps xmm0, xmm1, 0x88
+ shufps xmm4, xmm1, 0xdd
+ pavgb xmm0, xmm4
+ movdqa xmm4, xmm2
+ shufps xmm2, xmm3, 0x88
+ shufps xmm4, xmm3, 0xdd
+ pavgb xmm2, xmm4
+
+ // step 2 - convert to U and V
+ // from here down is very similar to Y code except
+ // instead of 16 different pixels, it's 8 pixels of U and 8 of V
+ movdqa xmm1, xmm0
+ movdqa xmm3, xmm2
+ pmaddubsw xmm0, xmm7 // U
+ pmaddubsw xmm2, xmm7
+ pmaddubsw xmm1, xmm6 // V
+ pmaddubsw xmm3, xmm6
+ phaddw xmm0, xmm2
+ phaddw xmm1, xmm3
+ psraw xmm0, 8
+ psraw xmm1, 8
+ packsswb xmm0, xmm1
+ paddb xmm0, xmm5 // -> unsigned
+
+ // step 3 - store 8 U and 8 V values
+ sub ecx, 16
+ movlps qword ptr [edx], xmm0 // U
+ movhps qword ptr [edx + edi], xmm0 // V
+ lea edx, [edx + 8]
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // src_argb
+ mov esi, [esp + 8 + 8] // src_stride_argb
+ mov edx, [esp + 8 + 12] // dst_u
+ mov edi, [esp + 8 + 16] // dst_v
+ mov ecx, [esp + 8 + 20] // pix
+ movdqa xmm7, kBGRAToU
+ movdqa xmm6, kBGRAToV
+ movdqa xmm5, kAddUV128
+ sub edi, edx // stride from u to v
+
+ align 16
+ convertloop:
+ /* step 1 - subsample 16x2 argb pixels to 8x1 */
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
+ movdqu xmm4, [eax + esi]
+ pavgb xmm0, xmm4
+ movdqu xmm4, [eax + esi + 16]
+ pavgb xmm1, xmm4
+ movdqu xmm4, [eax + esi + 32]
+ pavgb xmm2, xmm4
+ movdqu xmm4, [eax + esi + 48]
+ pavgb xmm3, xmm4
+ lea eax, [eax + 64]
+ movdqa xmm4, xmm0
+ shufps xmm0, xmm1, 0x88
+ shufps xmm4, xmm1, 0xdd
+ pavgb xmm0, xmm4
+ movdqa xmm4, xmm2
+ shufps xmm2, xmm3, 0x88
+ shufps xmm4, xmm3, 0xdd
+ pavgb xmm2, xmm4
+
+ // step 2 - convert to U and V
+ // from here down is very similar to Y code except
+ // instead of 16 different pixels, it's 8 pixels of U and 8 of V
+ movdqa xmm1, xmm0
+ movdqa xmm3, xmm2
+ pmaddubsw xmm0, xmm7 // U
+ pmaddubsw xmm2, xmm7
+ pmaddubsw xmm1, xmm6 // V
+ pmaddubsw xmm3, xmm6
+ phaddw xmm0, xmm2
+ phaddw xmm1, xmm3
+ psraw xmm0, 8
+ psraw xmm1, 8
+ packsswb xmm0, xmm1
+ paddb xmm0, xmm5 // -> unsigned
+
+ // step 3 - store 8 U and 8 V values
+ sub ecx, 16
+ movlps qword ptr [edx], xmm0 // U
+ movhps qword ptr [edx + edi], xmm0 // V
+ lea edx, [edx + 8]
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // src_argb
+ mov esi, [esp + 8 + 8] // src_stride_argb
+ mov edx, [esp + 8 + 12] // dst_u
+ mov edi, [esp + 8 + 16] // dst_v
+ mov ecx, [esp + 8 + 20] // pix
+ movdqa xmm7, kABGRToU
+ movdqa xmm6, kABGRToV
+ movdqa xmm5, kAddUV128
+ sub edi, edx // stride from u to v
+
+ align 16
+ convertloop:
+ /* step 1 - subsample 16x2 argb pixels to 8x1 */
+ movdqa xmm0, [eax]
+ movdqa xmm1, [eax + 16]
+ movdqa xmm2, [eax + 32]
+ movdqa xmm3, [eax + 48]
+ pavgb xmm0, [eax + esi]
+ pavgb xmm1, [eax + esi + 16]
+ pavgb xmm2, [eax + esi + 32]
+ pavgb xmm3, [eax + esi + 48]
+ lea eax, [eax + 64]
+ movdqa xmm4, xmm0
+ shufps xmm0, xmm1, 0x88
+ shufps xmm4, xmm1, 0xdd
+ pavgb xmm0, xmm4
+ movdqa xmm4, xmm2
+ shufps xmm2, xmm3, 0x88
+ shufps xmm4, xmm3, 0xdd
+ pavgb xmm2, xmm4
+
+ // step 2 - convert to U and V
+ // from here down is very similar to Y code except
+ // instead of 16 different pixels, it's 8 pixels of U and 8 of V
+ movdqa xmm1, xmm0
+ movdqa xmm3, xmm2
+ pmaddubsw xmm0, xmm7 // U
+ pmaddubsw xmm2, xmm7
+ pmaddubsw xmm1, xmm6 // V
+ pmaddubsw xmm3, xmm6
+ phaddw xmm0, xmm2
+ phaddw xmm1, xmm3
+ psraw xmm0, 8
+ psraw xmm1, 8
+ packsswb xmm0, xmm1
+ paddb xmm0, xmm5 // -> unsigned
+
+ // step 3 - store 8 U and 8 V values
+ sub ecx, 16
+ movlps qword ptr [edx], xmm0 // U
+ movhps qword ptr [edx + edi], xmm0 // V
+ lea edx, [edx + 8]
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // src_argb
+ mov esi, [esp + 8 + 8] // src_stride_argb
+ mov edx, [esp + 8 + 12] // dst_u
+ mov edi, [esp + 8 + 16] // dst_v
+ mov ecx, [esp + 8 + 20] // pix
+ movdqa xmm7, kABGRToU
+ movdqa xmm6, kABGRToV
+ movdqa xmm5, kAddUV128
+ sub edi, edx // stride from u to v
+
+ align 16
+ convertloop:
+ /* step 1 - subsample 16x2 argb pixels to 8x1 */
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
+ movdqu xmm4, [eax + esi]
+ pavgb xmm0, xmm4
+ movdqu xmm4, [eax + esi + 16]
+ pavgb xmm1, xmm4
+ movdqu xmm4, [eax + esi + 32]
+ pavgb xmm2, xmm4
+ movdqu xmm4, [eax + esi + 48]
+ pavgb xmm3, xmm4
+ lea eax, [eax + 64]
+ movdqa xmm4, xmm0
+ shufps xmm0, xmm1, 0x88
+ shufps xmm4, xmm1, 0xdd
+ pavgb xmm0, xmm4
+ movdqa xmm4, xmm2
+ shufps xmm2, xmm3, 0x88
+ shufps xmm4, xmm3, 0xdd
+ pavgb xmm2, xmm4
+
+ // step 2 - convert to U and V
+ // from here down is very similar to Y code except
+ // instead of 16 different pixels, it's 8 pixels of U and 8 of V
+ movdqa xmm1, xmm0
+ movdqa xmm3, xmm2
+ pmaddubsw xmm0, xmm7 // U
+ pmaddubsw xmm2, xmm7
+ pmaddubsw xmm1, xmm6 // V
+ pmaddubsw xmm3, xmm6
+ phaddw xmm0, xmm2
+ phaddw xmm1, xmm3
+ psraw xmm0, 8
+ psraw xmm1, 8
+ packsswb xmm0, xmm1
+ paddb xmm0, xmm5 // -> unsigned
+
+ // step 3 - store 8 U and 8 V values
+ sub ecx, 16
+ movlps qword ptr [edx], xmm0 // U
+ movhps qword ptr [edx + edi], xmm0 // V
+ lea edx, [edx + 8]
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // src_argb
+ mov esi, [esp + 8 + 8] // src_stride_argb
+ mov edx, [esp + 8 + 12] // dst_u
+ mov edi, [esp + 8 + 16] // dst_v
+ mov ecx, [esp + 8 + 20] // pix
+ movdqa xmm7, kRGBAToU
+ movdqa xmm6, kRGBAToV
+ movdqa xmm5, kAddUV128
+ sub edi, edx // stride from u to v
+
+ align 16
+ convertloop:
+ /* step 1 - subsample 16x2 argb pixels to 8x1 */
+ movdqa xmm0, [eax]
+ movdqa xmm1, [eax + 16]
+ movdqa xmm2, [eax + 32]
+ movdqa xmm3, [eax + 48]
+ pavgb xmm0, [eax + esi]
+ pavgb xmm1, [eax + esi + 16]
+ pavgb xmm2, [eax + esi + 32]
+ pavgb xmm3, [eax + esi + 48]
+ lea eax, [eax + 64]
+ movdqa xmm4, xmm0
+ shufps xmm0, xmm1, 0x88
+ shufps xmm4, xmm1, 0xdd
+ pavgb xmm0, xmm4
+ movdqa xmm4, xmm2
+ shufps xmm2, xmm3, 0x88
+ shufps xmm4, xmm3, 0xdd
+ pavgb xmm2, xmm4
+
+ // step 2 - convert to U and V
+ // from here down is very similar to Y code except
+ // instead of 16 different pixels, it's 8 pixels of U and 8 of V
+ movdqa xmm1, xmm0
+ movdqa xmm3, xmm2
+ pmaddubsw xmm0, xmm7 // U
+ pmaddubsw xmm2, xmm7
+ pmaddubsw xmm1, xmm6 // V
+ pmaddubsw xmm3, xmm6
+ phaddw xmm0, xmm2
+ phaddw xmm1, xmm3
+ psraw xmm0, 8
+ psraw xmm1, 8
+ packsswb xmm0, xmm1
+ paddb xmm0, xmm5 // -> unsigned
+
+ // step 3 - store 8 U and 8 V values
+ sub ecx, 16
+ movlps qword ptr [edx], xmm0 // U
+ movhps qword ptr [edx + edi], xmm0 // V
+ lea edx, [edx + 8]
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // src_argb
+ mov esi, [esp + 8 + 8] // src_stride_argb
+ mov edx, [esp + 8 + 12] // dst_u
+ mov edi, [esp + 8 + 16] // dst_v
+ mov ecx, [esp + 8 + 20] // pix
+ movdqa xmm7, kRGBAToU
+ movdqa xmm6, kRGBAToV
+ movdqa xmm5, kAddUV128
+ sub edi, edx // stride from u to v
+
+ align 16
+ convertloop:
+ /* step 1 - subsample 16x2 argb pixels to 8x1 */
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
+ movdqu xmm4, [eax + esi]
+ pavgb xmm0, xmm4
+ movdqu xmm4, [eax + esi + 16]
+ pavgb xmm1, xmm4
+ movdqu xmm4, [eax + esi + 32]
+ pavgb xmm2, xmm4
+ movdqu xmm4, [eax + esi + 48]
+ pavgb xmm3, xmm4
+ lea eax, [eax + 64]
+ movdqa xmm4, xmm0
+ shufps xmm0, xmm1, 0x88
+ shufps xmm4, xmm1, 0xdd
+ pavgb xmm0, xmm4
+ movdqa xmm4, xmm2
+ shufps xmm2, xmm3, 0x88
+ shufps xmm4, xmm3, 0xdd
+ pavgb xmm2, xmm4
+
+ // step 2 - convert to U and V
+ // from here down is very similar to Y code except
+ // instead of 16 different pixels, it's 8 pixels of U and 8 of V
+ movdqa xmm1, xmm0
+ movdqa xmm3, xmm2
+ pmaddubsw xmm0, xmm7 // U
+ pmaddubsw xmm2, xmm7
+ pmaddubsw xmm1, xmm6 // V
+ pmaddubsw xmm3, xmm6
+ phaddw xmm0, xmm2
+ phaddw xmm1, xmm3
+ psraw xmm0, 8
+ psraw xmm1, 8
+ packsswb xmm0, xmm1
+ paddb xmm0, xmm5 // -> unsigned
+
+ // step 3 - store 8 U and 8 V values
+ sub ecx, 16
+ movlps qword ptr [edx], xmm0 // U
+ movhps qword ptr [edx + edi], xmm0 // V
+ lea edx, [edx + 8]
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+#endif // HAS_ARGBTOYROW_SSSE3
+
+#define YG 74 /* static_cast<int8>(1.164 * 64 + 0.5) */
+
+#define UB 127 /* 2.018 * 64 = 129, saturated to the int8 maximum of 127 */
+#define UG -25 /* static_cast<int8>(-0.391 * 64 - 0.5) */
+#define UR 0
+
+#define VB 0
+#define VG -52 /* static_cast<int8>(-0.813 * 64 - 0.5) */
+#define VR 102 /* static_cast<int8>(1.596 * 64 + 0.5) */
+
+// Bias
+#define BB UB * 128 + VB * 128
+#define BG UG * 128 + VG * 128
+#define BR UR * 128 + VR * 128
+
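+// Reference only: the fixed-point conversion the YUVTORGB paths below perform,
+// written out for one pixel with the Q6 constants defined above. The SIMD code
+// folds the +/-128 chroma centering into the BB/BG/BR biases so it can use
+// pmaddubsw on unsigned bytes; subtracting the bias is the same as the
+// (u - 128) / (v - 128) terms here. Clamp255/ScalarYuvToRgb are illustrative
+// helpers, not libyuv functions.
+static __inline uint8 Clamp255(int v) {
+  return (uint8)(v < 0 ? 0 : (v > 255 ? 255 : v));
+}
+static void ScalarYuvToRgb(uint8 y, uint8 u, uint8 v,
+                           uint8* b, uint8* g, uint8* r) {
+  int y1 = (y - 16) * YG;  // luma contribution in Q6
+  *b = Clamp255((UB * (u - 128) + VB * (v - 128) + y1) >> 6);
+  *g = Clamp255((UG * (u - 128) + VG * (v - 128) + y1) >> 6);
+  *r = Clamp255((UR * (u - 128) + VR * (v - 128) + y1) >> 6);
+}
+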
+#ifdef HAS_I422TOARGBROW_AVX2
+
+static const lvec8 kUVToB_AVX = {
+ UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB,
+ UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB
+};
+static const lvec8 kUVToR_AVX = {
+ UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR,
+ UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR
+};
+static const lvec8 kUVToG_AVX = {
+ UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
+ UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG
+};
+static const lvec16 kYToRgb_AVX = {
+ YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG
+};
+static const lvec16 kYSub16_AVX = {
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
+};
+static const lvec16 kUVBiasB_AVX = {
+ BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB
+};
+static const lvec16 kUVBiasG_AVX = {
+ BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG
+};
+static const lvec16 kUVBiasR_AVX = {
+ BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR
+};
+
+// 16 pixels
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
+__declspec(naked) __declspec(align(16))
+void I422ToARGBRow_AVX2(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_argb,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // Y
+ mov esi, [esp + 8 + 8] // U
+ mov edi, [esp + 8 + 12] // V
+ mov edx, [esp + 8 + 16] // argb
+ mov ecx, [esp + 8 + 20] // width
+ sub edi, esi
+ vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
+ vpxor ymm4, ymm4, ymm4
+
+ align 16
+ convertloop:
+ vmovq xmm0, qword ptr [esi] // U
+ vmovq xmm1, qword ptr [esi + edi] // V
+ lea esi, [esi + 8]
+ vpunpcklbw ymm0, ymm0, ymm1 // UV
+ vpermq ymm0, ymm0, 0xd8
+ vpunpcklwd ymm0, ymm0, ymm0 // UVUV
+ vpmaddubsw ymm2, ymm0, kUVToB_AVX // scale B UV
+ vpmaddubsw ymm1, ymm0, kUVToG_AVX // scale G UV
+ vpmaddubsw ymm0, ymm0, kUVToR_AVX // scale R UV
+ vpsubw ymm2, ymm2, kUVBiasB_AVX // unbias back to signed
+ vpsubw ymm1, ymm1, kUVBiasG_AVX
+ vpsubw ymm0, ymm0, kUVBiasR_AVX
+
+ // Step 2: Find Y contribution to 16 R,G,B values
+ vmovdqu xmm3, [eax] // NOLINT
+ lea eax, [eax + 16]
+ vpermq ymm3, ymm3, 0xd8
+ vpunpcklbw ymm3, ymm3, ymm4
+ vpsubsw ymm3, ymm3, kYSub16_AVX
+ vpmullw ymm3, ymm3, kYToRgb_AVX
+ vpaddsw ymm2, ymm2, ymm3 // B += Y
+ vpaddsw ymm1, ymm1, ymm3 // G += Y
+ vpaddsw ymm0, ymm0, ymm3 // R += Y
+ vpsraw ymm2, ymm2, 6
+ vpsraw ymm1, ymm1, 6
+ vpsraw ymm0, ymm0, 6
+ vpackuswb ymm2, ymm2, ymm2 // B
+ vpackuswb ymm1, ymm1, ymm1 // G
+ vpackuswb ymm0, ymm0, ymm0 // R
+
+ // Step 3: Weave into ARGB
+ vpunpcklbw ymm2, ymm2, ymm1 // BG
+ vpermq ymm2, ymm2, 0xd8
+ vpunpcklbw ymm0, ymm0, ymm5 // RA
+ vpermq ymm0, ymm0, 0xd8
+ vpunpcklwd ymm1, ymm2, ymm0 // BGRA first 8 pixels
+ vpunpckhwd ymm2, ymm2, ymm0 // BGRA next 8 pixels
+ vmovdqu [edx], ymm1
+ vmovdqu [edx + 32], ymm2
+ lea edx, [edx + 64]
+ sub ecx, 16
+ jg convertloop
+ vzeroupper
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+#endif // HAS_I422TOARGBROW_AVX2
+
+#ifdef HAS_I422TOARGBROW_SSSE3
+
+static const vec8 kUVToB = {
+ UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB
+};
+
+static const vec8 kUVToR = {
+ UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR
+};
+
+static const vec8 kUVToG = {
+ UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG
+};
+
+static const vec8 kVUToB = {
+ VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB,
+};
+
+static const vec8 kVUToR = {
+ VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR,
+};
+
+static const vec8 kVUToG = {
+ VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
+};
+
+static const vec16 kYToRgb = { YG, YG, YG, YG, YG, YG, YG, YG };
+static const vec16 kYSub16 = { 16, 16, 16, 16, 16, 16, 16, 16 };
+static const vec16 kUVBiasB = { BB, BB, BB, BB, BB, BB, BB, BB };
+static const vec16 kUVBiasG = { BG, BG, BG, BG, BG, BG, BG, BG };
+static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR };
+
+// TODO(fbarchard): Read that does half size on Y and treats 420 as 444.
+
+// Read 8 UV from 444.
+#define READYUV444 __asm { \
+ __asm movq xmm0, qword ptr [esi] /* U */ /* NOLINT */ \
+ __asm movq xmm1, qword ptr [esi + edi] /* V */ /* NOLINT */ \
+ __asm lea esi, [esi + 8] \
+ __asm punpcklbw xmm0, xmm1 /* UV */ \
+ }
+
+// Read 4 UV from 422, upsample to 8 UV.
+#define READYUV422 __asm { \
+ __asm movd xmm0, [esi] /* U */ \
+ __asm movd xmm1, [esi + edi] /* V */ \
+ __asm lea esi, [esi + 4] \
+ __asm punpcklbw xmm0, xmm1 /* UV */ \
+ __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
+ }
+
+// Read 2 UV from 411, upsample to 8 UV.
+#define READYUV411 __asm { \
+ __asm movd xmm0, [esi] /* U */ \
+ __asm movd xmm1, [esi + edi] /* V */ \
+ __asm lea esi, [esi + 2] \
+ __asm punpcklbw xmm0, xmm1 /* UV */ \
+ __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
+ __asm punpckldq xmm0, xmm0 /* UVUV (upsample) */ \
+ }
+
+// Read 4 UV from NV12, upsample to 8 UV.
+#define READNV12 __asm { \
+ __asm movq xmm0, qword ptr [esi] /* UV */ /* NOLINT */ \
+ __asm lea esi, [esi + 8] \
+ __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
+ }
+
+// Convert 8 pixels: 8 UV and 8 Y.
+#define YUVTORGB __asm { \
+ /* Step 1: Find 4 UV contributions to 8 R,G,B values */ \
+ __asm movdqa xmm1, xmm0 \
+ __asm movdqa xmm2, xmm0 \
+ __asm pmaddubsw xmm0, kUVToB /* scale B UV */ \
+ __asm pmaddubsw xmm1, kUVToG /* scale G UV */ \
+ __asm pmaddubsw xmm2, kUVToR /* scale R UV */ \
+ __asm psubw xmm0, kUVBiasB /* unbias back to signed */ \
+ __asm psubw xmm1, kUVBiasG \
+ __asm psubw xmm2, kUVBiasR \
+ /* Step 2: Find Y contribution to 8 R,G,B values */ \
+ __asm movq xmm3, qword ptr [eax] /* NOLINT */ \
+ __asm lea eax, [eax + 8] \
+ __asm punpcklbw xmm3, xmm4 \
+ __asm psubsw xmm3, kYSub16 \
+ __asm pmullw xmm3, kYToRgb \
+ __asm paddsw xmm0, xmm3 /* B += Y */ \
+ __asm paddsw xmm1, xmm3 /* G += Y */ \
+ __asm paddsw xmm2, xmm3 /* R += Y */ \
+ __asm psraw xmm0, 6 \
+ __asm psraw xmm1, 6 \
+ __asm psraw xmm2, 6 \
+ __asm packuswb xmm0, xmm0 /* B */ \
+ __asm packuswb xmm1, xmm1 /* G */ \
+ __asm packuswb xmm2, xmm2 /* R */ \
+ }
+
+// Convert 8 pixels: 8 VU and 8 Y.
+#define YVUTORGB __asm { \
+ /* Step 1: Find 4 UV contributions to 8 R,G,B values */ \
+ __asm movdqa xmm1, xmm0 \
+ __asm movdqa xmm2, xmm0 \
+ __asm pmaddubsw xmm0, kVUToB /* scale B UV */ \
+ __asm pmaddubsw xmm1, kVUToG /* scale G UV */ \
+ __asm pmaddubsw xmm2, kVUToR /* scale R UV */ \
+ __asm psubw xmm0, kUVBiasB /* unbias back to signed */ \
+ __asm psubw xmm1, kUVBiasG \
+ __asm psubw xmm2, kUVBiasR \
+ /* Step 2: Find Y contribution to 8 R,G,B values */ \
+ __asm movq xmm3, qword ptr [eax] /* NOLINT */ \
+ __asm lea eax, [eax + 8] \
+ __asm punpcklbw xmm3, xmm4 \
+ __asm psubsw xmm3, kYSub16 \
+ __asm pmullw xmm3, kYToRgb \
+ __asm paddsw xmm0, xmm3 /* B += Y */ \
+ __asm paddsw xmm1, xmm3 /* G += Y */ \
+ __asm paddsw xmm2, xmm3 /* R += Y */ \
+ __asm psraw xmm0, 6 \
+ __asm psraw xmm1, 6 \
+ __asm psraw xmm2, 6 \
+ __asm packuswb xmm0, xmm0 /* B */ \
+ __asm packuswb xmm1, xmm1 /* G */ \
+ __asm packuswb xmm2, xmm2 /* R */ \
+ }
+
+// 8 pixels, dest aligned 16.
+// 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
+__declspec(naked) __declspec(align(16))
+void I444ToARGBRow_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_argb,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // Y
+ mov esi, [esp + 8 + 8] // U
+ mov edi, [esp + 8 + 12] // V
+ mov edx, [esp + 8 + 16] // argb
+ mov ecx, [esp + 8 + 20] // width
+ sub edi, esi
+ pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
+ pxor xmm4, xmm4
+
+ align 16
+ convertloop:
+ READYUV444
+ YUVTORGB
+
+ // Step 3: Weave into ARGB
+ punpcklbw xmm0, xmm1 // BG
+ punpcklbw xmm2, xmm5 // RA
+ movdqa xmm1, xmm0
+ punpcklwd xmm0, xmm2 // BGRA first 4 pixels
+ punpckhwd xmm1, xmm2 // BGRA next 4 pixels
+ movdqa [edx], xmm0
+ movdqa [edx + 16], xmm1
+ lea edx, [edx + 32]
+ sub ecx, 8
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+// 8 pixels, dest aligned 16.
+// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
+__declspec(naked) __declspec(align(16))
+void I422ToRGB24Row_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_rgb24,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // Y
+ mov esi, [esp + 8 + 8] // U
+ mov edi, [esp + 8 + 12] // V
+ mov edx, [esp + 8 + 16] // rgb24
+ mov ecx, [esp + 8 + 20] // width
+ sub edi, esi
+ pxor xmm4, xmm4
+ movdqa xmm5, kShuffleMaskARGBToRGB24_0
+ movdqa xmm6, kShuffleMaskARGBToRGB24
+
+ align 16
+ convertloop:
+ READYUV422
+ YUVTORGB
+
+ // Step 3: Weave into RRGB
+ punpcklbw xmm0, xmm1 // BG
+ punpcklbw xmm2, xmm2 // RR
+ movdqa xmm1, xmm0
+ punpcklwd xmm0, xmm2 // BGRR first 4 pixels
+ punpckhwd xmm1, xmm2 // BGRR next 4 pixels
+ pshufb xmm0, xmm5 // Pack into first 8 and last 4 bytes.
+ pshufb xmm1, xmm6 // Pack into first 12 bytes.
+ palignr xmm1, xmm0, 12 // last 4 bytes of xmm0 + 12 from xmm1
+ movq qword ptr [edx], xmm0 // First 8 bytes
+ movdqu [edx + 8], xmm1 // Last 16 bytes; 24 bytes total = 8 RGB24 pixels.
+ lea edx, [edx + 24]
+ sub ecx, 8
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+// 8 pixels, dest aligned 16.
+// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
+__declspec(naked) __declspec(align(16))
+void I422ToRAWRow_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_raw,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // Y
+ mov esi, [esp + 8 + 8] // U
+ mov edi, [esp + 8 + 12] // V
+ mov edx, [esp + 8 + 16] // raw
+ mov ecx, [esp + 8 + 20] // width
+ sub edi, esi
+ pxor xmm4, xmm4
+ movdqa xmm5, kShuffleMaskARGBToRAW_0
+ movdqa xmm6, kShuffleMaskARGBToRAW
+
+ align 16
+ convertloop:
+ READYUV422
+ YUVTORGB
+
+ // Step 3: Weave into RRGB
+ punpcklbw xmm0, xmm1 // BG
+ punpcklbw xmm2, xmm2 // RR
+ movdqa xmm1, xmm0
+ punpcklwd xmm0, xmm2 // BGRR first 4 pixels
+ punpckhwd xmm1, xmm2 // BGRR next 4 pixels
+ pshufb xmm0, xmm5 // Pack into first 8 and last 4 bytes.
+ pshufb xmm1, xmm6 // Pack into first 12 bytes.
+ palignr xmm1, xmm0, 12 // last 4 bytes of xmm0 + 12 from xmm1
+ movq qword ptr [edx], xmm0 // First 8 bytes
+ movdqu [edx + 8], xmm1 // Last 16 bytes; 24 bytes total = 8 RGB24 pixels.
+ lea edx, [edx + 24]
+ sub ecx, 8
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+// 8 pixels, dest unaligned.
+// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
+__declspec(naked) __declspec(align(16))
+void I422ToRGB565Row_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb565_buf,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // Y
+ mov esi, [esp + 8 + 8] // U
+ mov edi, [esp + 8 + 12] // V
+ mov edx, [esp + 8 + 16] // rgb565
+ mov ecx, [esp + 8 + 20] // width
+ sub edi, esi
+ pxor xmm4, xmm4
+ pcmpeqb xmm5, xmm5 // generate mask 0x0000001f
+ psrld xmm5, 27
+ pcmpeqb xmm6, xmm6 // generate mask 0x000007e0
+ psrld xmm6, 26
+ pslld xmm6, 5
+ pcmpeqb xmm7, xmm7 // generate mask 0xfffff800
+ pslld xmm7, 11
+
+ align 16
+ convertloop:
+ READYUV422
+ YUVTORGB
+
+ // Step 3: Weave into RRGB
+ punpcklbw xmm0, xmm1 // BG
+ punpcklbw xmm2, xmm2 // RR
+ movdqa xmm1, xmm0
+ punpcklwd xmm0, xmm2 // BGRR first 4 pixels
+ punpckhwd xmm1, xmm2 // BGRR next 4 pixels
+
+ // Step 3b: RRGB -> RGB565
+ movdqa xmm3, xmm0 // B first 4 pixels of argb
+ movdqa xmm2, xmm0 // G
+ pslld xmm0, 8 // R
+ psrld xmm3, 3 // B
+ psrld xmm2, 5 // G
+ psrad xmm0, 16 // R
+ pand xmm3, xmm5 // B
+ pand xmm2, xmm6 // G
+ pand xmm0, xmm7 // R
+ por xmm3, xmm2 // BG
+ por xmm0, xmm3 // BGR
+ movdqa xmm3, xmm1 // B next 4 pixels of argb
+ movdqa xmm2, xmm1 // G
+ pslld xmm1, 8 // R
+ psrld xmm3, 3 // B
+ psrld xmm2, 5 // G
+ psrad xmm1, 16 // R
+ pand xmm3, xmm5 // B
+ pand xmm2, xmm6 // G
+ pand xmm1, xmm7 // R
+ por xmm3, xmm2 // BG
+ por xmm1, xmm3 // BGR
+ packssdw xmm0, xmm1
+ sub ecx, 8
+ movdqu [edx], xmm0 // store 8 pixels of RGB565
+ lea edx, [edx + 16]
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
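+
+// Illustrative scalar reference for step 3b above: each BGRR dword is packed
+// to RGB565 with B in bits 0-4, G in bits 5-10 and R in bits 11-15, which is
+// what the mask constants built with pcmpeqb above select. The helper name is
+// illustrative; the SIMD code performs the same shifts on four pixels at once.
+static __inline uint16 ScalarPackRGB565(uint8 b, uint8 g, uint8 r) {
+  return (uint16)((b >> 3) | ((g >> 2) << 5) | ((r >> 3) << 11));
+}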
+
+// 8 pixels, dest aligned 16.
+// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
+__declspec(naked) __declspec(align(16))
+void I422ToARGBRow_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_argb,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // Y
+ mov esi, [esp + 8 + 8] // U
+ mov edi, [esp + 8 + 12] // V
+ mov edx, [esp + 8 + 16] // argb
+ mov ecx, [esp + 8 + 20] // width
+ sub edi, esi
+ pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
+ pxor xmm4, xmm4
+
+ align 16
+ convertloop:
+ READYUV422
+ YUVTORGB
+
+ // Step 3: Weave into ARGB
+ punpcklbw xmm0, xmm1 // BG
+ punpcklbw xmm2, xmm5 // RA
+ movdqa xmm1, xmm0
+ punpcklwd xmm0, xmm2 // BGRA first 4 pixels
+ punpckhwd xmm1, xmm2 // BGRA next 4 pixels
+ movdqa [edx], xmm0
+ movdqa [edx + 16], xmm1
+ lea edx, [edx + 32]
+ sub ecx, 8
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+// 8 pixels, dest aligned 16.
+// 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
+// Similar to I420 but duplicate UV once more.
+__declspec(naked) __declspec(align(16))
+void I411ToARGBRow_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_argb,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // Y
+ mov esi, [esp + 8 + 8] // U
+ mov edi, [esp + 8 + 12] // V
+ mov edx, [esp + 8 + 16] // argb
+ mov ecx, [esp + 8 + 20] // width
+ sub edi, esi
+ pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
+ pxor xmm4, xmm4
+
+ align 16
+ convertloop:
+ READYUV411
+ YUVTORGB
+
+ // Step 3: Weave into ARGB
+ punpcklbw xmm0, xmm1 // BG
+ punpcklbw xmm2, xmm5 // RA
+ movdqa xmm1, xmm0
+ punpcklwd xmm0, xmm2 // BGRA first 4 pixels
+ punpckhwd xmm1, xmm2 // BGRA next 4 pixels
+ movdqa [edx], xmm0
+ movdqa [edx + 16], xmm1
+ lea edx, [edx + 32]
+ sub ecx, 8
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+// 8 pixels, dest aligned 16.
+// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
+__declspec(naked) __declspec(align(16))
+void NV12ToARGBRow_SSSE3(const uint8* y_buf,
+ const uint8* uv_buf,
+ uint8* dst_argb,
+ int width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // Y
+ mov esi, [esp + 4 + 8] // UV
+ mov edx, [esp + 4 + 12] // argb
+ mov ecx, [esp + 4 + 16] // width
+ pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
+ pxor xmm4, xmm4
+
+ align 16
+ convertloop:
+ READNV12
+ YUVTORGB
+
+ // Step 3: Weave into ARGB
+ punpcklbw xmm0, xmm1 // BG
+ punpcklbw xmm2, xmm5 // RA
+ movdqa xmm1, xmm0
+ punpcklwd xmm0, xmm2 // BGRA first 4 pixels
+ punpckhwd xmm1, xmm2 // BGRA next 4 pixels
+ movdqa [edx], xmm0
+ movdqa [edx + 16], xmm1
+ lea edx, [edx + 32]
+ sub ecx, 8
+ jg convertloop
+
+ pop esi
+ ret
+ }
+}
+
+// 8 pixels, dest aligned 16.
+// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
+__declspec(naked) __declspec(align(16))
+void NV21ToARGBRow_SSSE3(const uint8* y_buf,
+ const uint8* uv_buf,
+ uint8* dst_argb,
+ int width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // Y
+ mov esi, [esp + 4 + 8] // VU
+ mov edx, [esp + 4 + 12] // argb
+ mov ecx, [esp + 4 + 16] // width
+ pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
+ pxor xmm4, xmm4
+
+ align 16
+ convertloop:
+ READNV12
+ YVUTORGB
+
+ // Step 3: Weave into ARGB
+ punpcklbw xmm0, xmm1 // BG
+ punpcklbw xmm2, xmm5 // RA
+ movdqa xmm1, xmm0
+ punpcklwd xmm0, xmm2 // BGRA first 4 pixels
+ punpckhwd xmm1, xmm2 // BGRA next 4 pixels
+ movdqa [edx], xmm0
+ movdqa [edx + 16], xmm1
+ lea edx, [edx + 32]
+ sub ecx, 8
+ jg convertloop
+
+ pop esi
+ ret
+ }
+}
+
+// 8 pixels, unaligned.
+// 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
+__declspec(naked) __declspec(align(16))
+void I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_argb,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // Y
+ mov esi, [esp + 8 + 8] // U
+ mov edi, [esp + 8 + 12] // V
+ mov edx, [esp + 8 + 16] // argb
+ mov ecx, [esp + 8 + 20] // width
+ sub edi, esi
+ pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
+ pxor xmm4, xmm4
+
+ align 16
+ convertloop:
+ READYUV444
+ YUVTORGB
+
+ // Step 3: Weave into ARGB
+ punpcklbw xmm0, xmm1 // BG
+ punpcklbw xmm2, xmm5 // RA
+ movdqa xmm1, xmm0
+ punpcklwd xmm0, xmm2 // BGRA first 4 pixels
+ punpckhwd xmm1, xmm2 // BGRA next 4 pixels
+ movdqu [edx], xmm0
+ movdqu [edx + 16], xmm1
+ lea edx, [edx + 32]
+ sub ecx, 8
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+// 8 pixels, unaligned.
+// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
+__declspec(naked) __declspec(align(16))
+void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_argb,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // Y
+ mov esi, [esp + 8 + 8] // U
+ mov edi, [esp + 8 + 12] // V
+ mov edx, [esp + 8 + 16] // argb
+ mov ecx, [esp + 8 + 20] // width
+ sub edi, esi
+ pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
+ pxor xmm4, xmm4
+
+ align 16
+ convertloop:
+ READYUV422
+ YUVTORGB
+
+ // Step 3: Weave into ARGB
+ punpcklbw xmm0, xmm1 // BG
+ punpcklbw xmm2, xmm5 // RA
+ movdqa xmm1, xmm0
+ punpcklwd xmm0, xmm2 // BGRA first 4 pixels
+ punpckhwd xmm1, xmm2 // BGRA next 4 pixels
+ movdqu [edx], xmm0
+ movdqu [edx + 16], xmm1
+ lea edx, [edx + 32]
+ sub ecx, 8
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+// 8 pixels, unaligned.
+// 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
+// Similar to I420 but duplicate UV once more.
+__declspec(naked) __declspec(align(16))
+void I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_argb,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // Y
+ mov esi, [esp + 8 + 8] // U
+ mov edi, [esp + 8 + 12] // V
+ mov edx, [esp + 8 + 16] // argb
+ mov ecx, [esp + 8 + 20] // width
+ sub edi, esi
+ pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
+ pxor xmm4, xmm4
+
+ align 16
+ convertloop:
+ READYUV411
+ YUVTORGB
+
+ // Step 3: Weave into ARGB
+ punpcklbw xmm0, xmm1 // BG
+ punpcklbw xmm2, xmm5 // RA
+ movdqa xmm1, xmm0
+ punpcklwd xmm0, xmm2 // BGRA first 4 pixels
+ punpckhwd xmm1, xmm2 // BGRA next 4 pixels
+ movdqu [edx], xmm0
+ movdqu [edx + 16], xmm1
+ lea edx, [edx + 32]
+ sub ecx, 8
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+// 8 pixels, dest unaligned.
+// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
+__declspec(naked) __declspec(align(16))
+void NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
+ const uint8* uv_buf,
+ uint8* dst_argb,
+ int width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // Y
+ mov esi, [esp + 4 + 8] // UV
+ mov edx, [esp + 4 + 12] // argb
+ mov ecx, [esp + 4 + 16] // width
+ pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
+ pxor xmm4, xmm4
+
+ align 16
+ convertloop:
+ READNV12
+ YUVTORGB
+
+ // Step 3: Weave into ARGB
+ punpcklbw xmm0, xmm1 // BG
+ punpcklbw xmm2, xmm5 // RA
+ movdqa xmm1, xmm0
+ punpcklwd xmm0, xmm2 // BGRA first 4 pixels
+ punpckhwd xmm1, xmm2 // BGRA next 4 pixels
+ movdqu [edx], xmm0
+ movdqu [edx + 16], xmm1
+ lea edx, [edx + 32]
+ sub ecx, 8
+ jg convertloop
+
+ pop esi
+ ret
+ }
+}
+
+// 8 pixels, dest unaligned.
+// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
+__declspec(naked) __declspec(align(16))
+void NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
+ const uint8* uv_buf,
+ uint8* dst_argb,
+ int width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // Y
+ mov esi, [esp + 4 + 8] // VU
+ mov edx, [esp + 4 + 12] // argb
+ mov ecx, [esp + 4 + 16] // width
+ pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
+ pxor xmm4, xmm4
+
+ align 16
+ convertloop:
+ READNV12
+ YVUTORGB
+
+ // Step 3: Weave into ARGB
+ punpcklbw xmm0, xmm1 // BG
+ punpcklbw xmm2, xmm5 // RA
+ movdqa xmm1, xmm0
+ punpcklwd xmm0, xmm2 // BGRA first 4 pixels
+ punpckhwd xmm1, xmm2 // BGRA next 4 pixels
+ movdqu [edx], xmm0
+ movdqu [edx + 16], xmm1
+ lea edx, [edx + 32]
+ sub ecx, 8
+ jg convertloop
+
+ pop esi
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void I422ToBGRARow_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_bgra,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // Y
+ mov esi, [esp + 8 + 8] // U
+ mov edi, [esp + 8 + 12] // V
+ mov edx, [esp + 8 + 16] // bgra
+ mov ecx, [esp + 8 + 20] // width
+ sub edi, esi
+ pxor xmm4, xmm4
+
+ align 16
+ convertloop:
+ READYUV422
+ YUVTORGB
+
+ // Step 3: Weave into BGRA
+ pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
+ punpcklbw xmm1, xmm0 // GB
+ punpcklbw xmm5, xmm2 // AR
+ movdqa xmm0, xmm5
+ punpcklwd xmm5, xmm1 // BGRA first 4 pixels
+ punpckhwd xmm0, xmm1 // BGRA next 4 pixels
+ movdqa [edx], xmm5
+ movdqa [edx + 16], xmm0
+ lea edx, [edx + 32]
+ sub ecx, 8
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_bgra,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // Y
+ mov esi, [esp + 8 + 8] // U
+ mov edi, [esp + 8 + 12] // V
+ mov edx, [esp + 8 + 16] // bgra
+ mov ecx, [esp + 8 + 20] // width
+ sub edi, esi
+ pxor xmm4, xmm4
+
+ align 16
+ convertloop:
+ READYUV422
+ YUVTORGB
+
+ // Step 3: Weave into BGRA
+ pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
+ punpcklbw xmm1, xmm0 // GB
+ punpcklbw xmm5, xmm2 // AR
+ movdqa xmm0, xmm5
+ punpcklwd xmm5, xmm1 // BGRA first 4 pixels
+ punpckhwd xmm0, xmm1 // BGRA next 4 pixels
+ movdqu [edx], xmm5
+ movdqu [edx + 16], xmm0
+ lea edx, [edx + 32]
+ sub ecx, 8
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void I422ToABGRRow_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_abgr,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // Y
+ mov esi, [esp + 8 + 8] // U
+ mov edi, [esp + 8 + 12] // V
+ mov edx, [esp + 8 + 16] // abgr
+ mov ecx, [esp + 8 + 20] // width
+ sub edi, esi
+ pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
+ pxor xmm4, xmm4
+
+ align 16
+ convertloop:
+ READYUV422
+ YUVTORGB
+
+ // Step 3: Weave into ABGR
+ punpcklbw xmm2, xmm1 // RG
+ punpcklbw xmm0, xmm5 // BA
+ movdqa xmm1, xmm2
+ punpcklwd xmm2, xmm0 // RGBA first 4 pixels
+ punpckhwd xmm1, xmm0 // RGBA next 4 pixels
+ movdqa [edx], xmm2
+ movdqa [edx + 16], xmm1
+ lea edx, [edx + 32]
+ sub ecx, 8
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_abgr,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // Y
+ mov esi, [esp + 8 + 8] // U
+ mov edi, [esp + 8 + 12] // V
+ mov edx, [esp + 8 + 16] // abgr
+ mov ecx, [esp + 8 + 20] // width
+ sub edi, esi
+ pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
+ pxor xmm4, xmm4
+
+ align 16
+ convertloop:
+ READYUV422
+ YUVTORGB
+
+ // Step 3: Weave into ABGR
+ punpcklbw xmm2, xmm1 // RG
+ punpcklbw xmm0, xmm5 // BA
+ movdqa xmm1, xmm2
+ punpcklwd xmm2, xmm0 // RGBA first 4 pixels
+ punpckhwd xmm1, xmm0 // RGBA next 4 pixels
+ movdqu [edx], xmm2
+ movdqu [edx + 16], xmm1
+ lea edx, [edx + 32]
+ sub ecx, 8
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void I422ToRGBARow_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_rgba,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // Y
+ mov esi, [esp + 8 + 8] // U
+ mov edi, [esp + 8 + 12] // V
+ mov edx, [esp + 8 + 16] // rgba
+ mov ecx, [esp + 8 + 20] // width
+ sub edi, esi
+ pxor xmm4, xmm4
+
+ align 16
+ convertloop:
+ READYUV422
+ YUVTORGB
+
+ // Step 3: Weave into RGBA
+ pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
+ punpcklbw xmm1, xmm2 // GR
+ punpcklbw xmm5, xmm0 // AB
+ movdqa xmm0, xmm5
+ punpcklwd xmm5, xmm1 // RGBA first 4 pixels
+ punpckhwd xmm0, xmm1 // RGBA next 4 pixels
+ movdqa [edx], xmm5
+ movdqa [edx + 16], xmm0
+ lea edx, [edx + 32]
+ sub ecx, 8
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_rgba,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // Y
+ mov esi, [esp + 8 + 8] // U
+ mov edi, [esp + 8 + 12] // V
+ mov edx, [esp + 8 + 16] // rgba
+ mov ecx, [esp + 8 + 20] // width
+ sub edi, esi
+ pxor xmm4, xmm4
+
+ align 16
+ convertloop:
+ READYUV422
+ YUVTORGB
+
+ // Step 3: Weave into RGBA
+ pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
+ punpcklbw xmm1, xmm2 // GR
+ punpcklbw xmm5, xmm0 // AB
+ movdqa xmm0, xmm5
+ punpcklwd xmm5, xmm1 // RGBA first 4 pixels
+ punpckhwd xmm0, xmm1 // RGBA next 4 pixels
+ movdqu [edx], xmm5
+ movdqu [edx + 16], xmm0
+ lea edx, [edx + 32]
+ sub ecx, 8
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+#endif // HAS_I422TOARGBROW_SSSE3
+
+#ifdef HAS_YTOARGBROW_SSE2
+__declspec(naked) __declspec(align(16))
+void YToARGBRow_SSE2(const uint8* y_buf,
+ uint8* rgb_buf,
+ int width) {
+ __asm {
+ pxor xmm5, xmm5
+ pcmpeqb xmm4, xmm4 // generate mask 0xff000000
+ pslld xmm4, 24
+ mov eax, 0x00100010
+ movd xmm3, eax
+ pshufd xmm3, xmm3, 0
+ mov eax, 0x004a004a // 74
+ movd xmm2, eax
+ pshufd xmm2, xmm2,0
+ mov eax, [esp + 4] // Y
+ mov edx, [esp + 8] // rgb
+ mov ecx, [esp + 12] // width
+
+ align 16
+ convertloop:
+ // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
+ movq xmm0, qword ptr [eax]
+ lea eax, [eax + 8]
+ punpcklbw xmm0, xmm5 // 0.Y
+ psubusw xmm0, xmm3
+ pmullw xmm0, xmm2
+ psrlw xmm0, 6
+ packuswb xmm0, xmm0 // G
+
+ // Step 2: Weave into ARGB
+ punpcklbw xmm0, xmm0 // GG
+ movdqa xmm1, xmm0
+ punpcklwd xmm0, xmm0 // BGRA first 4 pixels
+ punpckhwd xmm1, xmm1 // BGRA next 4 pixels
+ por xmm0, xmm4
+ por xmm1, xmm4
+ movdqa [edx], xmm0
+ movdqa [edx + 16], xmm1
+ lea edx, [edx + 32]
+ sub ecx, 8
+ jg convertloop
+
+ ret
+ }
+}
+#endif // HAS_YTOARGBROW_SSE2
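+
+// Illustrative scalar reference for YToARGBRow above: each Y expands to a
+// gray ARGB pixel with value ((Y - 16) * 74) >> 6 (roughly 1.164), saturated
+// to 0..255, and alpha set to 255. The helper name is illustrative.
+static __inline uint32 ScalarYToArgbPixel(uint8 y) {
+  int g = (((int)y - 16) * 74) >> 6;
+  if (g < 0) g = 0;
+  if (g > 255) g = 255;
+  return 0xff000000u | (uint32)g * 0x00010101u;  // A = 255, R = G = B = g.
+}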
+
+#ifdef HAS_MIRRORROW_SSSE3
+// Shuffle table for reversing the bytes.
+static const uvec8 kShuffleMirror = {
+ 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
+};
+
+__declspec(naked) __declspec(align(16))
+void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
+ __asm {
+ mov eax, [esp + 4] // src
+ mov edx, [esp + 8] // dst
+ mov ecx, [esp + 12] // width
+ movdqa xmm5, kShuffleMirror
+ lea eax, [eax - 16]
+
+ align 16
+ convertloop:
+ movdqa xmm0, [eax + ecx]
+ pshufb xmm0, xmm5
+ sub ecx, 16
+ movdqa [edx], xmm0
+ lea edx, [edx + 16]
+ jg convertloop
+ ret
+ }
+}
+#endif // HAS_MIRRORROW_SSSE3
+
+#ifdef HAS_MIRRORROW_AVX2
+// Shuffle table for reversing the bytes.
+static const ulvec8 kShuffleMirror_AVX2 = {
+ 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u,
+ 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
+};
+
+__declspec(naked) __declspec(align(16))
+void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
+ __asm {
+ mov eax, [esp + 4] // src
+ mov edx, [esp + 8] // dst
+ mov ecx, [esp + 12] // width
+ vmovdqa ymm5, kShuffleMirror_AVX2
+ lea eax, [eax - 32]
+
+ align 16
+ convertloop:
+ vmovdqu ymm0, [eax + ecx]
+ vpshufb ymm0, ymm0, ymm5
+ vpermq ymm0, ymm0, 0x4e // swap high and low halves
+ sub ecx, 32
+ vmovdqu [edx], ymm0
+ lea edx, [edx + 32]
+ jg convertloop
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_MIRRORROW_AVX2
+
+#ifdef HAS_MIRRORROW_SSE2
+// The SSE2 version uses movdqu so it can be used on unaligned buffers when
+// the SSSE3 version cannot.
+__declspec(naked) __declspec(align(16))
+void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
+ __asm {
+ mov eax, [esp + 4] // src
+ mov edx, [esp + 8] // dst
+ mov ecx, [esp + 12] // width
+ lea eax, [eax - 16]
+
+ align 16
+ convertloop:
+ movdqu xmm0, [eax + ecx]
+ movdqa xmm1, xmm0 // swap bytes
+ psllw xmm0, 8
+ psrlw xmm1, 8
+ por xmm0, xmm1
+ pshuflw xmm0, xmm0, 0x1b // swap words
+ pshufhw xmm0, xmm0, 0x1b
+ pshufd xmm0, xmm0, 0x4e // swap qwords
+ sub ecx, 16
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ jg convertloop
+ ret
+ }
+}
+#endif // HAS_MIRRORROW_SSE2
+
+#ifdef HAS_MIRRORROW_UV_SSSE3
+// Shuffle table for reversing the bytes of UV channels.
+static const uvec8 kShuffleMirrorUV = {
+ 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
+};
+
+__declspec(naked) __declspec(align(16))
+void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
+ int width) {
+ __asm {
+ push edi
+ mov eax, [esp + 4 + 4] // src
+ mov edx, [esp + 4 + 8] // dst_u
+ mov edi, [esp + 4 + 12] // dst_v
+ mov ecx, [esp + 4 + 16] // width
+ movdqa xmm1, kShuffleMirrorUV
+ lea eax, [eax + ecx * 2 - 16]
+ sub edi, edx
+
+ align 16
+ convertloop:
+ movdqa xmm0, [eax]
+ lea eax, [eax - 16]
+ pshufb xmm0, xmm1
+ sub ecx, 8
+ movlpd qword ptr [edx], xmm0
+ movhpd qword ptr [edx + edi], xmm0
+ lea edx, [edx + 8]
+ jg convertloop
+
+ pop edi
+ ret
+ }
+}
+#endif // HAS_MIRRORROW_UV_SSSE3
+
+#ifdef HAS_ARGBMIRRORROW_SSSE3
+// Shuffle table for reversing the bytes.
+static const uvec8 kARGBShuffleMirror = {
+ 12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u, 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u
+};
+
+__declspec(naked) __declspec(align(16))
+void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
+ __asm {
+ mov eax, [esp + 4] // src
+ mov edx, [esp + 8] // dst
+ mov ecx, [esp + 12] // width
+ movdqa xmm5, kARGBShuffleMirror
+ lea eax, [eax - 16]
+
+ align 16
+ convertloop:
+ movdqa xmm0, [eax + ecx * 4]
+ pshufb xmm0, xmm5
+ sub ecx, 4
+ movdqa [edx], xmm0
+ lea edx, [edx + 16]
+ jg convertloop
+ ret
+ }
+}
+#endif // HAS_ARGBMIRRORROW_SSSE3
+
+#ifdef HAS_ARGBMIRRORROW_AVX2
+// Shuffle table for reversing the bytes.
+static const ulvec32 kARGBShuffleMirror_AVX2 = {
+ 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
+};
+
+__declspec(naked) __declspec(align(16))
+void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
+ __asm {
+ mov eax, [esp + 4] // src
+ mov edx, [esp + 8] // dst
+ mov ecx, [esp + 12] // width
+ lea eax, [eax - 32]
+ vmovdqa ymm5, kARGBShuffleMirror_AVX2
+
+ align 16
+ convertloop:
+ vpermd ymm0, ymm5, [eax + ecx * 4] // permute dword order
+ sub ecx, 8
+ vmovdqu [edx], ymm0
+ lea edx, [edx + 32]
+ jg convertloop
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_ARGBMIRRORROW_AVX2
+
+#ifdef HAS_SPLITUVROW_SSE2
+__declspec(naked) __declspec(align(16))
+void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
+ __asm {
+ push edi
+ mov eax, [esp + 4 + 4] // src_uv
+ mov edx, [esp + 4 + 8] // dst_u
+ mov edi, [esp + 4 + 12] // dst_v
+ mov ecx, [esp + 4 + 16] // pix
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
+ psrlw xmm5, 8
+ sub edi, edx
+
+ align 16
+ convertloop:
+ movdqa xmm0, [eax]
+ movdqa xmm1, [eax + 16]
+ lea eax, [eax + 32]
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm1
+ pand xmm0, xmm5 // even bytes
+ pand xmm1, xmm5
+ packuswb xmm0, xmm1
+ psrlw xmm2, 8 // odd bytes
+ psrlw xmm3, 8
+ packuswb xmm2, xmm3
+ movdqa [edx], xmm0
+ movdqa [edx + edi], xmm2
+ lea edx, [edx + 16]
+ sub ecx, 16
+ jg convertloop
+
+ pop edi
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void SplitUVRow_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+ int pix) {
+ __asm {
+ push edi
+ mov eax, [esp + 4 + 4] // src_uv
+ mov edx, [esp + 4 + 8] // dst_u
+ mov edi, [esp + 4 + 12] // dst_v
+ mov ecx, [esp + 4 + 16] // pix
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
+ psrlw xmm5, 8
+ sub edi, edx
+
+ align 16
+ convertloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ lea eax, [eax + 32]
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm1
+ pand xmm0, xmm5 // even bytes
+ pand xmm1, xmm5
+ packuswb xmm0, xmm1
+ psrlw xmm2, 8 // odd bytes
+ psrlw xmm3, 8
+ packuswb xmm2, xmm3
+ movdqu [edx], xmm0
+ movdqu [edx + edi], xmm2
+ lea edx, [edx + 16]
+ sub ecx, 16
+ jg convertloop
+
+ pop edi
+ ret
+ }
+}
+#endif // HAS_SPLITUVROW_SSE2
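+
+// Illustrative scalar reference for SplitUVRow above: the interleaved UV
+// plane is split into separate U and V planes, even bytes to U and odd bytes
+// to V. The helper name is illustrative.
+static __inline void ScalarSplitUVRow(const uint8* src_uv, uint8* dst_u,
+                                      uint8* dst_v, int pix) {
+  int i;
+  for (i = 0; i < pix; ++i) {
+    dst_u[i] = src_uv[2 * i + 0];  // even bytes are U.
+    dst_v[i] = src_uv[2 * i + 1];  // odd bytes are V.
+  }
+}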
+
+#ifdef HAS_SPLITUVROW_AVX2
+__declspec(naked) __declspec(align(16))
+void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
+ __asm {
+ push edi
+ mov eax, [esp + 4 + 4] // src_uv
+ mov edx, [esp + 4 + 8] // dst_u
+ mov edi, [esp + 4 + 12] // dst_v
+ mov ecx, [esp + 4 + 16] // pix
+ vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
+ vpsrlw ymm5, ymm5, 8
+ sub edi, edx
+
+ align 16
+ convertloop:
+ vmovdqu ymm0, [eax]
+ vmovdqu ymm1, [eax + 32]
+ lea eax, [eax + 64]
+ vpsrlw ymm2, ymm0, 8 // odd bytes
+ vpsrlw ymm3, ymm1, 8
+ vpand ymm0, ymm0, ymm5 // even bytes
+ vpand ymm1, ymm1, ymm5
+ vpackuswb ymm0, ymm0, ymm1
+ vpackuswb ymm2, ymm2, ymm3
+ vpermq ymm0, ymm0, 0xd8
+ vpermq ymm2, ymm2, 0xd8
+ vmovdqu [edx], ymm0
+ vmovdqu [edx + edi], ymm2
+ lea edx, [edx + 32]
+ sub ecx, 32
+ jg convertloop
+
+ pop edi
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_SPLITUVROW_AVX2
+
+#ifdef HAS_MERGEUVROW_SSE2
+__declspec(naked) __declspec(align(16))
+void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+ int width) {
+ __asm {
+ push edi
+ mov eax, [esp + 4 + 4] // src_u
+ mov edx, [esp + 4 + 8] // src_v
+ mov edi, [esp + 4 + 12] // dst_uv
+ mov ecx, [esp + 4 + 16] // width
+ sub edx, eax
+
+ align 16
+ convertloop:
+ movdqa xmm0, [eax] // read 16 U's
+ movdqa xmm1, [eax + edx] // and 16 V's
+ lea eax, [eax + 16]
+ movdqa xmm2, xmm0
+ punpcklbw xmm0, xmm1 // first 8 UV pairs
+ punpckhbw xmm2, xmm1 // next 8 UV pairs
+ movdqa [edi], xmm0
+ movdqa [edi + 16], xmm2
+ lea edi, [edi + 32]
+ sub ecx, 16
+ jg convertloop
+
+ pop edi
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void MergeUVRow_Unaligned_SSE2(const uint8* src_u, const uint8* src_v,
+ uint8* dst_uv, int width) {
+ __asm {
+ push edi
+ mov eax, [esp + 4 + 4] // src_u
+ mov edx, [esp + 4 + 8] // src_v
+ mov edi, [esp + 4 + 12] // dst_uv
+ mov ecx, [esp + 4 + 16] // width
+ sub edx, eax
+
+ align 16
+ convertloop:
+ movdqu xmm0, [eax] // read 16 U's
+ movdqu xmm1, [eax + edx] // and 16 V's
+ lea eax, [eax + 16]
+ movdqa xmm2, xmm0
+ punpcklbw xmm0, xmm1 // first 8 UV pairs
+ punpckhbw xmm2, xmm1 // next 8 UV pairs
+ movdqu [edi], xmm0
+ movdqu [edi + 16], xmm2
+ lea edi, [edi + 32]
+ sub ecx, 16
+ jg convertloop
+
+ pop edi
+ ret
+ }
+}
+#endif // HAS_MERGEUVROW_SSE2
+
+#ifdef HAS_MERGEUVROW_AVX2
+__declspec(naked) __declspec(align(16))
+void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+ int width) {
+ __asm {
+ push edi
+ mov eax, [esp + 4 + 4] // src_u
+ mov edx, [esp + 4 + 8] // src_v
+ mov edi, [esp + 4 + 12] // dst_uv
+ mov ecx, [esp + 4 + 16] // width
+ sub edx, eax
+
+ align 16
+ convertloop:
+ vmovdqu ymm0, [eax] // read 32 U's
+ vmovdqu ymm1, [eax + edx] // and 32 V's
+ lea eax, [eax + 32]
+ vpunpcklbw ymm2, ymm0, ymm1 // low 16 UV pairs. mutated qqword 0,2
+ vpunpckhbw ymm0, ymm0, ymm1 // high 16 UV pairs. mutated qqword 1,3
+ vperm2i128 ymm1, ymm2, ymm0, 0x20 // low 128 of ymm2 and low 128 of ymm0
+ vperm2i128 ymm2, ymm2, ymm0, 0x31 // high 128 of ymm2 and high 128 of ymm0
+ vmovdqu [edi], ymm1
+ vmovdqu [edi + 32], ymm2
+ lea edi, [edi + 64]
+ sub ecx, 32
+ jg convertloop
+
+ pop edi
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_MERGEUVROW_AVX2
+
+#ifdef HAS_COPYROW_SSE2
+// CopyRow copies 'count' bytes using 16 byte loads/stores, 32 bytes at a time.
+__declspec(naked) __declspec(align(16))
+void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
+ __asm {
+ mov eax, [esp + 4] // src
+ mov edx, [esp + 8] // dst
+ mov ecx, [esp + 12] // count
+ sub edx, eax
+
+ align 16
+ convertloop:
+ movdqa xmm0, [eax]
+ movdqa xmm1, [eax + 16]
+ movdqa [eax + edx], xmm0
+ movdqa [eax + edx + 16], xmm1
+ lea eax, [eax + 32]
+ sub ecx, 32
+ jg convertloop
+ ret
+ }
+}
+#endif // HAS_COPYROW_SSE2
+
+// CopyRow for any alignment, any multiple of 1 byte, using rep movsb (ERMS).
+__declspec(naked) __declspec(align(16))
+void CopyRow_ERMS(const uint8* src, uint8* dst, int count) {
+ __asm {
+ mov eax, esi
+ mov edx, edi
+ mov esi, [esp + 4] // src
+ mov edi, [esp + 8] // dst
+ mov ecx, [esp + 12] // count
+ rep movsb
+ mov edi, edx
+ mov esi, eax
+ ret
+ }
+}
+
+#ifdef HAS_COPYROW_X86
+__declspec(naked) __declspec(align(16))
+void CopyRow_X86(const uint8* src, uint8* dst, int count) {
+ __asm {
+ mov eax, esi
+ mov edx, edi
+ mov esi, [esp + 4] // src
+ mov edi, [esp + 8] // dst
+ mov ecx, [esp + 12] // count
+ shr ecx, 2
+ rep movsd
+ mov edi, edx
+ mov esi, eax
+ ret
+ }
+}
+#endif // HAS_COPYROW_X86
+
+#ifdef HAS_SETROW_X86
+// SetRow_X86 writes 'count' bytes using a 32 bit value repeated.
+__declspec(naked) __declspec(align(16))
+void SetRow_X86(uint8* dst, uint32 v32, int count) {
+ __asm {
+ mov edx, edi
+ mov edi, [esp + 4] // dst
+ mov eax, [esp + 8] // v32
+ mov ecx, [esp + 12] // count
+ shr ecx, 2
+ rep stosd
+ mov edi, edx
+ ret
+ }
+}
+
+// ARGBSetRows_X86 writes 'width' 32 bit values per row, for 'height' rows.
+__declspec(naked) __declspec(align(16))
+void ARGBSetRows_X86(uint8* dst, uint32 v32, int width,
+ int dst_stride, int height) {
+ __asm {
+ push esi
+ push edi
+ push ebp
+ mov edi, [esp + 12 + 4] // dst
+ mov eax, [esp + 12 + 8] // v32
+ mov ebp, [esp + 12 + 12] // width
+ mov edx, [esp + 12 + 16] // dst_stride
+ mov esi, [esp + 12 + 20] // height
+ lea ecx, [ebp * 4]
+ sub edx, ecx // stride - width * 4
+
+ align 16
+ convertloop:
+ mov ecx, ebp
+ rep stosd
+ add edi, edx
+ sub esi, 1
+ jg convertloop
+
+ pop ebp
+ pop edi
+ pop esi
+ ret
+ }
+}
+#endif // HAS_SETROW_X86
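+
+// Illustrative scalar reference for ARGBSetRows_X86 above: 'width' 32 bit
+// values are stored per row and the destination advances by dst_stride bytes
+// per row, which is what the "stride - width * 4" adjustment implements. The
+// helper name is illustrative.
+static __inline void ScalarARGBSetRows(uint8* dst, uint32 v32, int width,
+                                       int dst_stride, int height) {
+  int x, y;
+  for (y = 0; y < height; ++y) {
+    uint32* d = (uint32*)(dst + y * dst_stride);
+    for (x = 0; x < width; ++x) {
+      d[x] = v32;
+    }
+  }
+}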
+
+#ifdef HAS_YUY2TOYROW_AVX2
+__declspec(naked) __declspec(align(16))
+void YUY2ToYRow_AVX2(const uint8* src_yuy2,
+ uint8* dst_y, int pix) {
+ __asm {
+ mov eax, [esp + 4] // src_yuy2
+ mov edx, [esp + 8] // dst_y
+ mov ecx, [esp + 12] // pix
+ vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
+ vpsrlw ymm5, ymm5, 8
+
+ align 16
+ convertloop:
+ vmovdqu ymm0, [eax]
+ vmovdqu ymm1, [eax + 32]
+ lea eax, [eax + 64]
+ vpand ymm0, ymm0, ymm5 // even bytes are Y
+ vpand ymm1, ymm1, ymm5
+ vpackuswb ymm0, ymm0, ymm1 // mutates.
+ vpermq ymm0, ymm0, 0xd8
+ sub ecx, 32
+ vmovdqu [edx], ymm0
+ lea edx, [edx + 32]
+ jg convertloop
+ vzeroupper
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // src_yuy2
+ mov esi, [esp + 8 + 8] // stride_yuy2
+ mov edx, [esp + 8 + 12] // dst_u
+ mov edi, [esp + 8 + 16] // dst_v
+ mov ecx, [esp + 8 + 20] // pix
+ vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
+ vpsrlw ymm5, ymm5, 8
+ sub edi, edx
+
+ align 16
+ convertloop:
+ vmovdqu ymm0, [eax]
+ vmovdqu ymm1, [eax + 32]
+ vpavgb ymm0, ymm0, [eax + esi]
+ vpavgb ymm1, ymm1, [eax + esi + 32]
+ lea eax, [eax + 64]
+ vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV
+ vpsrlw ymm1, ymm1, 8
+ vpackuswb ymm0, ymm0, ymm1 // mutates.
+ vpermq ymm0, ymm0, 0xd8
+ vpand ymm1, ymm0, ymm5 // U
+ vpsrlw ymm0, ymm0, 8 // V
+ vpackuswb ymm1, ymm1, ymm1 // mutates.
+ vpackuswb ymm0, ymm0, ymm0 // mutates.
+ vpermq ymm1, ymm1, 0xd8
+ vpermq ymm0, ymm0, 0xd8
+ vextractf128 [edx], ymm1, 0 // U
+ vextractf128 [edx + edi], ymm0, 0 // V
+ lea edx, [edx + 16]
+ sub ecx, 32
+ jg convertloop
+
+ pop edi
+ pop esi
+ vzeroupper
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ __asm {
+ push edi
+ mov eax, [esp + 4 + 4] // src_yuy2
+ mov edx, [esp + 4 + 8] // dst_u
+ mov edi, [esp + 4 + 12] // dst_v
+ mov ecx, [esp + 4 + 16] // pix
+ vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
+ vpsrlw ymm5, ymm5, 8
+ sub edi, edx
+
+ align 16
+ convertloop:
+ vmovdqu ymm0, [eax]
+ vmovdqu ymm1, [eax + 32]
+ lea eax, [eax + 64]
+ vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV
+ vpsrlw ymm1, ymm1, 8
+ vpackuswb ymm0, ymm0, ymm1 // mutates.
+ vpermq ymm0, ymm0, 0xd8
+ vpand ymm1, ymm0, ymm5 // U
+ vpsrlw ymm0, ymm0, 8 // V
+ vpackuswb ymm1, ymm1, ymm1 // mutates.
+ vpackuswb ymm0, ymm0, ymm0 // mutates.
+ vpermq ymm1, ymm1, 0xd8
+ vpermq ymm0, ymm0, 0xd8
+ vextractf128 [edx], ymm1, 0 // U
+ vextractf128 [edx + edi], ymm0, 0 // V
+ lea edx, [edx + 16]
+ sub ecx, 32
+ jg convertloop
+
+ pop edi
+ vzeroupper
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void UYVYToYRow_AVX2(const uint8* src_uyvy,
+ uint8* dst_y, int pix) {
+ __asm {
+ mov eax, [esp + 4] // src_uyvy
+ mov edx, [esp + 8] // dst_y
+ mov ecx, [esp + 12] // pix
+
+ align 16
+ convertloop:
+ vmovdqu ymm0, [eax]
+ vmovdqu ymm1, [eax + 32]
+ lea eax, [eax + 64]
+ vpsrlw ymm0, ymm0, 8 // odd bytes are Y
+ vpsrlw ymm1, ymm1, 8
+ vpackuswb ymm0, ymm0, ymm1 // mutates.
+ vpermq ymm0, ymm0, 0xd8
+ sub ecx, 32
+ vmovdqu [edx], ymm0
+ lea edx, [edx + 32]
+ jg convertloop
+ vzeroupper
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // src_uyvy
+ mov esi, [esp + 8 + 8] // stride_uyvy
+ mov edx, [esp + 8 + 12] // dst_u
+ mov edi, [esp + 8 + 16] // dst_v
+ mov ecx, [esp + 8 + 20] // pix
+ vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
+ vpsrlw ymm5, ymm5, 8
+ sub edi, edx
+
+ align 16
+ convertloop:
+ vmovdqu ymm0, [eax]
+ vmovdqu ymm1, [eax + 32]
+ vpavgb ymm0, ymm0, [eax + esi]
+ vpavgb ymm1, ymm1, [eax + esi + 32]
+ lea eax, [eax + 64]
+ vpand ymm0, ymm0, ymm5 // UYVY -> UVUV
+ vpand ymm1, ymm1, ymm5
+ vpackuswb ymm0, ymm0, ymm1 // mutates.
+ vpermq ymm0, ymm0, 0xd8
+ vpand ymm1, ymm0, ymm5 // U
+ vpsrlw ymm0, ymm0, 8 // V
+ vpackuswb ymm1, ymm1, ymm1 // mutates.
+ vpackuswb ymm0, ymm0, ymm0 // mutates.
+ vpermq ymm1, ymm1, 0xd8
+ vpermq ymm0, ymm0, 0xd8
+ vextractf128 [edx], ymm1, 0 // U
+ vextractf128 [edx + edi], ymm0, 0 // V
+ lea edx, [edx + 16]
+ sub ecx, 32
+ jg convertloop
+
+ pop edi
+ pop esi
+ vzeroupper
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ __asm {
+ push edi
+ mov eax, [esp + 4 + 4] // src_uyvy
+ mov edx, [esp + 4 + 8] // dst_u
+ mov edi, [esp + 4 + 12] // dst_v
+ mov ecx, [esp + 4 + 16] // pix
+ vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
+ vpsrlw ymm5, ymm5, 8
+ sub edi, edx
+
+ align 16
+ convertloop:
+ vmovdqu ymm0, [eax]
+ vmovdqu ymm1, [eax + 32]
+ lea eax, [eax + 64]
+ vpand ymm0, ymm0, ymm5 // UYVY -> UVUV
+ vpand ymm1, ymm1, ymm5
+ vpackuswb ymm0, ymm0, ymm1 // mutates.
+ vpermq ymm0, ymm0, 0xd8
+ vpand ymm1, ymm0, ymm5 // U
+ vpsrlw ymm0, ymm0, 8 // V
+ vpackuswb ymm1, ymm1, ymm1 // mutates.
+ vpackuswb ymm0, ymm0, ymm0 // mutates.
+ vpermq ymm1, ymm1, 0xd8
+ vpermq ymm0, ymm0, 0xd8
+ vextractf128 [edx], ymm1, 0 // U
+ vextractf128 [edx + edi], ymm0, 0 // V
+ lea edx, [edx + 16]
+ sub ecx, 32
+ jg convertloop
+
+ pop edi
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_YUY2TOYROW_AVX2
+
+#ifdef HAS_YUY2TOYROW_SSE2
+__declspec(naked) __declspec(align(16))
+void YUY2ToYRow_SSE2(const uint8* src_yuy2,
+ uint8* dst_y, int pix) {
+ __asm {
+ mov eax, [esp + 4] // src_yuy2
+ mov edx, [esp + 8] // dst_y
+ mov ecx, [esp + 12] // pix
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
+ psrlw xmm5, 8
+
+ align 16
+ convertloop:
+ movdqa xmm0, [eax]
+ movdqa xmm1, [eax + 16]
+ lea eax, [eax + 32]
+ pand xmm0, xmm5 // even bytes are Y
+ pand xmm1, xmm5
+ packuswb xmm0, xmm1
+ sub ecx, 16
+ movdqa [edx], xmm0
+ lea edx, [edx + 16]
+ jg convertloop
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // src_yuy2
+ mov esi, [esp + 8 + 8] // stride_yuy2
+ mov edx, [esp + 8 + 12] // dst_u
+ mov edi, [esp + 8 + 16] // dst_v
+ mov ecx, [esp + 8 + 20] // pix
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
+ psrlw xmm5, 8
+ sub edi, edx
+
+ align 16
+ convertloop:
+ movdqa xmm0, [eax]
+ movdqa xmm1, [eax + 16]
+ movdqa xmm2, [eax + esi]
+ movdqa xmm3, [eax + esi + 16]
+ lea eax, [eax + 32]
+ pavgb xmm0, xmm2
+ pavgb xmm1, xmm3
+ psrlw xmm0, 8 // YUYV -> UVUV
+ psrlw xmm1, 8
+ packuswb xmm0, xmm1
+ movdqa xmm1, xmm0
+ pand xmm0, xmm5 // U
+ packuswb xmm0, xmm0
+ psrlw xmm1, 8 // V
+ packuswb xmm1, xmm1
+ movq qword ptr [edx], xmm0
+ movq qword ptr [edx + edi], xmm1
+ lea edx, [edx + 8]
+ sub ecx, 16
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
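+
+// Illustrative scalar reference for YUY2ToUVRow above: YUY2 stores Y0 U0 Y1 V0
+// for every 2 pixels, so U and V come from the odd bytes, rounded-averaged
+// with the row below (as pavgb does) to halve chroma vertically. The helper
+// name is illustrative.
+static __inline void ScalarYUY2ToUVRow(const uint8* src_yuy2, int stride_yuy2,
+                                       uint8* dst_u, uint8* dst_v, int pix) {
+  int i;
+  for (i = 0; i < pix / 2; ++i) {
+    const uint8* row0 = src_yuy2 + i * 4;
+    const uint8* row1 = row0 + stride_yuy2;
+    dst_u[i] = (uint8)((row0[1] + row1[1] + 1) >> 1);
+    dst_v[i] = (uint8)((row0[3] + row1[3] + 1) >> 1);
+  }
+}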
+
+__declspec(naked) __declspec(align(16))
+void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ __asm {
+ push edi
+ mov eax, [esp + 4 + 4] // src_yuy2
+ mov edx, [esp + 4 + 8] // dst_u
+ mov edi, [esp + 4 + 12] // dst_v
+ mov ecx, [esp + 4 + 16] // pix
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
+ psrlw xmm5, 8
+ sub edi, edx
+
+ align 16
+ convertloop:
+ movdqa xmm0, [eax]
+ movdqa xmm1, [eax + 16]
+ lea eax, [eax + 32]
+ psrlw xmm0, 8 // YUYV -> UVUV
+ psrlw xmm1, 8
+ packuswb xmm0, xmm1
+ movdqa xmm1, xmm0
+ pand xmm0, xmm5 // U
+ packuswb xmm0, xmm0
+ psrlw xmm1, 8 // V
+ packuswb xmm1, xmm1
+ movq qword ptr [edx], xmm0
+ movq qword ptr [edx + edi], xmm1
+ lea edx, [edx + 8]
+ sub ecx, 16
+ jg convertloop
+
+ pop edi
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
+ uint8* dst_y, int pix) {
+ __asm {
+ mov eax, [esp + 4] // src_yuy2
+ mov edx, [esp + 8] // dst_y
+ mov ecx, [esp + 12] // pix
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
+ psrlw xmm5, 8
+
+ align 16
+ convertloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ lea eax, [eax + 32]
+ pand xmm0, xmm5 // even bytes are Y
+ pand xmm1, xmm5
+ packuswb xmm0, xmm1
+ sub ecx, 16
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ jg convertloop
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, int stride_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // src_yuy2
+ mov esi, [esp + 8 + 8] // stride_yuy2
+ mov edx, [esp + 8 + 12] // dst_u
+ mov edi, [esp + 8 + 16] // dst_v
+ mov ecx, [esp + 8 + 20] // pix
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
+ psrlw xmm5, 8
+ sub edi, edx
+
+ align 16
+ convertloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + esi]
+ movdqu xmm3, [eax + esi + 16]
+ lea eax, [eax + 32]
+ pavgb xmm0, xmm2
+ pavgb xmm1, xmm3
+ psrlw xmm0, 8 // YUYV -> UVUV
+ psrlw xmm1, 8
+ packuswb xmm0, xmm1
+ movdqa xmm1, xmm0
+ pand xmm0, xmm5 // U
+ packuswb xmm0, xmm0
+ psrlw xmm1, 8 // V
+ packuswb xmm1, xmm1
+ movq qword ptr [edx], xmm0
+ movq qword ptr [edx + edi], xmm1
+ lea edx, [edx + 8]
+ sub ecx, 16
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ __asm {
+ push edi
+ mov eax, [esp + 4 + 4] // src_yuy2
+ mov edx, [esp + 4 + 8] // dst_u
+ mov edi, [esp + 4 + 12] // dst_v
+ mov ecx, [esp + 4 + 16] // pix
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
+ psrlw xmm5, 8
+ sub edi, edx
+
+ align 16
+ convertloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ lea eax, [eax + 32]
+ psrlw xmm0, 8 // YUYV -> UVUV
+ psrlw xmm1, 8
+ packuswb xmm0, xmm1
+ movdqa xmm1, xmm0
+ pand xmm0, xmm5 // U
+ packuswb xmm0, xmm0
+ psrlw xmm1, 8 // V
+ packuswb xmm1, xmm1
+ movq qword ptr [edx], xmm0
+ movq qword ptr [edx + edi], xmm1
+ lea edx, [edx + 8]
+ sub ecx, 16
+ jg convertloop
+
+ pop edi
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void UYVYToYRow_SSE2(const uint8* src_uyvy,
+ uint8* dst_y, int pix) {
+ __asm {
+ mov eax, [esp + 4] // src_uyvy
+ mov edx, [esp + 8] // dst_y
+ mov ecx, [esp + 12] // pix
+
+ align 16
+ convertloop:
+ movdqa xmm0, [eax]
+ movdqa xmm1, [eax + 16]
+ lea eax, [eax + 32]
+ psrlw xmm0, 8 // odd bytes are Y
+ psrlw xmm1, 8
+ packuswb xmm0, xmm1
+ sub ecx, 16
+ movdqa [edx], xmm0
+ lea edx, [edx + 16]
+ jg convertloop
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // src_uyvy
+ mov esi, [esp + 8 + 8] // stride_uyvy
+ mov edx, [esp + 8 + 12] // dst_u
+ mov edi, [esp + 8 + 16] // dst_v
+ mov ecx, [esp + 8 + 20] // pix
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
+ psrlw xmm5, 8
+ sub edi, edx
+
+ align 16
+ convertloop:
+ movdqa xmm0, [eax]
+ movdqa xmm1, [eax + 16]
+ movdqa xmm2, [eax + esi]
+ movdqa xmm3, [eax + esi + 16]
+ lea eax, [eax + 32]
+ pavgb xmm0, xmm2
+ pavgb xmm1, xmm3
+ pand xmm0, xmm5 // UYVY -> UVUV
+ pand xmm1, xmm5
+ packuswb xmm0, xmm1
+ movdqa xmm1, xmm0
+ pand xmm0, xmm5 // U
+ packuswb xmm0, xmm0
+ psrlw xmm1, 8 // V
+ packuswb xmm1, xmm1
+ movq qword ptr [edx], xmm0
+ movq qword ptr [edx + edi], xmm1
+ lea edx, [edx + 8]
+ sub ecx, 16
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ __asm {
+ push edi
+ mov eax, [esp + 4 + 4] // src_uyvy
+ mov edx, [esp + 4 + 8] // dst_u
+ mov edi, [esp + 4 + 12] // dst_v
+ mov ecx, [esp + 4 + 16] // pix
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
+ psrlw xmm5, 8
+ sub edi, edx
+
+ align 16
+ convertloop:
+ movdqa xmm0, [eax]
+ movdqa xmm1, [eax + 16]
+ lea eax, [eax + 32]
+ pand xmm0, xmm5 // UYVY -> UVUV
+ pand xmm1, xmm5
+ packuswb xmm0, xmm1
+ movdqa xmm1, xmm0
+ pand xmm0, xmm5 // U
+ packuswb xmm0, xmm0
+ psrlw xmm1, 8 // V
+ packuswb xmm1, xmm1
+ movq qword ptr [edx], xmm0
+ movq qword ptr [edx + edi], xmm1
+ lea edx, [edx + 8]
+ sub ecx, 16
+ jg convertloop
+
+ pop edi
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
+ uint8* dst_y, int pix) {
+ __asm {
+ mov eax, [esp + 4] // src_uyvy
+ mov edx, [esp + 8] // dst_y
+ mov ecx, [esp + 12] // pix
+
+ align 16
+ convertloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ lea eax, [eax + 32]
+ psrlw xmm0, 8 // odd bytes are Y
+ psrlw xmm1, 8
+ packuswb xmm0, xmm1
+ sub ecx, 16
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ jg convertloop
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // src_uyvy
+ mov esi, [esp + 8 + 8] // stride_uyvy
+ mov edx, [esp + 8 + 12] // dst_u
+ mov edi, [esp + 8 + 16] // dst_v
+ mov ecx, [esp + 8 + 20] // pix
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
+ psrlw xmm5, 8
+ sub edi, edx
+
+ align 16
+ convertloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + esi]
+ movdqu xmm3, [eax + esi + 16]
+ lea eax, [eax + 32]
+ pavgb xmm0, xmm2
+ pavgb xmm1, xmm3
+ pand xmm0, xmm5 // UYVY -> UVUV
+ pand xmm1, xmm5
+ packuswb xmm0, xmm1
+ movdqa xmm1, xmm0
+ pand xmm0, xmm5 // U
+ packuswb xmm0, xmm0
+ psrlw xmm1, 8 // V
+ packuswb xmm1, xmm1
+ movq qword ptr [edx], xmm0
+ movq qword ptr [edx + edi], xmm1
+ lea edx, [edx + 8]
+ sub ecx, 16
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ __asm {
+ push edi
+ mov eax, [esp + 4 + 4] // src_uyvy
+ mov edx, [esp + 4 + 8] // dst_u
+ mov edi, [esp + 4 + 12] // dst_v
+ mov ecx, [esp + 4 + 16] // pix
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
+ psrlw xmm5, 8
+ sub edi, edx
+
+ align 16
+ convertloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ lea eax, [eax + 32]
+ pand xmm0, xmm5 // UYVY -> UVUV
+ pand xmm1, xmm5
+ packuswb xmm0, xmm1
+ movdqa xmm1, xmm0
+ pand xmm0, xmm5 // U
+ packuswb xmm0, xmm0
+ psrlw xmm1, 8 // V
+ packuswb xmm1, xmm1
+ movq qword ptr [edx], xmm0
+ movq qword ptr [edx + edi], xmm1
+ lea edx, [edx + 8]
+ sub ecx, 16
+ jg convertloop
+
+ pop edi
+ ret
+ }
+}
+#endif // HAS_YUY2TOYROW_SSE2
+
+#ifdef HAS_ARGBBLENDROW_SSE2
+// Blend 4 pixels at a time.
+__declspec(naked) __declspec(align(16))
+void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // src_argb0
+ mov esi, [esp + 4 + 8] // src_argb1
+ mov edx, [esp + 4 + 12] // dst_argb
+ mov ecx, [esp + 4 + 16] // width
+ pcmpeqb xmm7, xmm7 // generate constant 1
+ psrlw xmm7, 15
+ pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff
+ psrlw xmm6, 8
+ pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00
+ psllw xmm5, 8
+ pcmpeqb xmm4, xmm4 // generate mask 0xff000000
+ pslld xmm4, 24
+
+ sub ecx, 1
+ je convertloop1 // only 1 pixel?
+ jl convertloop1b
+
+ // 1 pixel loop until destination pointer is aligned.
+ alignloop1:
+ test edx, 15 // aligned?
+ je alignloop1b
+ movd xmm3, [eax]
+ lea eax, [eax + 4]
+ movdqa xmm0, xmm3 // src argb
+ pxor xmm3, xmm4 // ~alpha
+ movd xmm2, [esi] // _r_b
+ psrlw xmm3, 8 // alpha
+ pshufhw xmm3, xmm3, 0F5h // 8 alpha words
+ pshuflw xmm3, xmm3, 0F5h
+ pand xmm2, xmm6 // _r_b
+ paddw xmm3, xmm7 // 256 - alpha
+ pmullw xmm2, xmm3 // _r_b * alpha
+ movd xmm1, [esi] // _a_g
+ lea esi, [esi + 4]
+ psrlw xmm1, 8 // _a_g
+ por xmm0, xmm4 // set alpha to 255
+ pmullw xmm1, xmm3 // _a_g * alpha
+ psrlw xmm2, 8 // _r_b convert to 8 bits again
+ paddusb xmm0, xmm2 // + src argb
+ pand xmm1, xmm5 // a_g_ convert to 8 bits again
+ paddusb xmm0, xmm1 // + src argb
+ sub ecx, 1
+ movd [edx], xmm0
+ lea edx, [edx + 4]
+ jge alignloop1
+
+ alignloop1b:
+ add ecx, 1 - 4
+ jl convertloop4b
+
+ // 4 pixel loop.
+ convertloop4:
+ movdqu xmm3, [eax] // src argb
+ lea eax, [eax + 16]
+ movdqa xmm0, xmm3 // src argb
+ pxor xmm3, xmm4 // ~alpha
+ movdqu xmm2, [esi] // _r_b
+ psrlw xmm3, 8 // alpha
+ pshufhw xmm3, xmm3, 0F5h // 8 alpha words
+ pshuflw xmm3, xmm3, 0F5h
+ pand xmm2, xmm6 // _r_b
+ paddw xmm3, xmm7 // 256 - alpha
+ pmullw xmm2, xmm3 // _r_b * alpha
+ movdqu xmm1, [esi] // _a_g
+ lea esi, [esi + 16]
+ psrlw xmm1, 8 // _a_g
+ por xmm0, xmm4 // set alpha to 255
+ pmullw xmm1, xmm3 // _a_g * alpha
+ psrlw xmm2, 8 // _r_b convert to 8 bits again
+ paddusb xmm0, xmm2 // + src argb
+ pand xmm1, xmm5 // a_g_ convert to 8 bits again
+ paddusb xmm0, xmm1 // + src argb
+ sub ecx, 4
+ movdqa [edx], xmm0
+ lea edx, [edx + 16]
+ jge convertloop4
+
+ convertloop4b:
+ add ecx, 4 - 1
+ jl convertloop1b
+
+ // 1 pixel loop.
+ convertloop1:
+ movd xmm3, [eax] // src argb
+ lea eax, [eax + 4]
+ movdqa xmm0, xmm3 // src argb
+ pxor xmm3, xmm4 // ~alpha
+ movd xmm2, [esi] // _r_b
+ psrlw xmm3, 8 // alpha
+ pshufhw xmm3, xmm3, 0F5h // 8 alpha words
+ pshuflw xmm3, xmm3, 0F5h
+ pand xmm2, xmm6 // _r_b
+ paddw xmm3, xmm7 // 256 - alpha
+ pmullw xmm2, xmm3 // _r_b * alpha
+ movd xmm1, [esi] // _a_g
+ lea esi, [esi + 4]
+ psrlw xmm1, 8 // _a_g
+ por xmm0, xmm4 // set alpha to 255
+ pmullw xmm1, xmm3 // _a_g * alpha
+ psrlw xmm2, 8 // _r_b convert to 8 bits again
+ paddusb xmm0, xmm2 // + src argb
+ pand xmm1, xmm5 // a_g_ convert to 8 bits again
+ paddusb xmm0, xmm1 // + src argb
+ sub ecx, 1
+ movd [edx], xmm0
+ lea edx, [edx + 4]
+ jge convertloop1
+
+ convertloop1b:
+ pop esi
+ ret
+ }
+}
+#endif // HAS_ARGBBLENDROW_SSE2
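+
+// Illustrative scalar reference for ARGBBlendRow above, assuming src_argb0 is
+// a premultiplied foreground composited over src_argb1:
+// dst = min(255, fg + bg * (256 - fg_alpha) / 256), with alpha forced to 255.
+// The helper name is illustrative.
+static __inline void ScalarARGBBlendPixel(const uint8* fg, const uint8* bg,
+                                          uint8* dst) {
+  int ia = 256 - fg[3];  // inverse alpha, 1..256.
+  int i;
+  for (i = 0; i < 3; ++i) {
+    int v = fg[i] + ((bg[i] * ia) >> 8);
+    dst[i] = (uint8)(v > 255 ? 255 : v);
+  }
+  dst[3] = 255;
+}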
+
+#ifdef HAS_ARGBBLENDROW_SSSE3
+// Shuffle table for isolating alpha.
+static const uvec8 kShuffleAlpha = {
+ 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
+ 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
+};
+// Same as SSE2, but replaces:
+// psrlw xmm3, 8 // alpha
+// pshufhw xmm3, xmm3, 0F5h // 8 alpha words
+// pshuflw xmm3, xmm3, 0F5h
+// with:
+// pshufb xmm3, kShuffleAlpha // alpha
+// Blend 4 pixels at a time.
+
+__declspec(naked) __declspec(align(16))
+void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // src_argb0
+ mov esi, [esp + 4 + 8] // src_argb1
+ mov edx, [esp + 4 + 12] // dst_argb
+ mov ecx, [esp + 4 + 16] // width
+ pcmpeqb xmm7, xmm7 // generate constant 1
+ psrlw xmm7, 15
+ pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff
+ psrlw xmm6, 8
+ pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00
+ psllw xmm5, 8
+ pcmpeqb xmm4, xmm4 // generate mask 0xff000000
+ pslld xmm4, 24
+
+ sub ecx, 1
+ je convertloop1 // only 1 pixel?
+ jl convertloop1b
+
+ // 1 pixel loop until destination pointer is aligned.
+ alignloop1:
+ test edx, 15 // aligned?
+ je alignloop1b
+ movd xmm3, [eax]
+ lea eax, [eax + 4]
+ movdqa xmm0, xmm3 // src argb
+ pxor xmm3, xmm4 // ~alpha
+ movd xmm2, [esi] // _r_b
+ pshufb xmm3, kShuffleAlpha // alpha
+ pand xmm2, xmm6 // _r_b
+ paddw xmm3, xmm7 // 256 - alpha
+ pmullw xmm2, xmm3 // _r_b * alpha
+ movd xmm1, [esi] // _a_g
+ lea esi, [esi + 4]
+ psrlw xmm1, 8 // _a_g
+ por xmm0, xmm4 // set alpha to 255
+ pmullw xmm1, xmm3 // _a_g * alpha
+ psrlw xmm2, 8 // _r_b convert to 8 bits again
+ paddusb xmm0, xmm2 // + src argb
+ pand xmm1, xmm5 // a_g_ convert to 8 bits again
+ paddusb xmm0, xmm1 // + src argb
+ sub ecx, 1
+ movd [edx], xmm0
+ lea edx, [edx + 4]
+ jge alignloop1
+
+ alignloop1b:
+ add ecx, 1 - 4
+ jl convertloop4b
+
+ test eax, 15 // unaligned?
+ jne convertuloop4
+ test esi, 15 // unaligned?
+ jne convertuloop4
+
+ // 4 pixel loop.
+ convertloop4:
+ movdqa xmm3, [eax] // src argb
+ lea eax, [eax + 16]
+ movdqa xmm0, xmm3 // src argb
+ pxor xmm3, xmm4 // ~alpha
+ movdqa xmm2, [esi] // _r_b
+ pshufb xmm3, kShuffleAlpha // alpha
+ pand xmm2, xmm6 // _r_b
+ paddw xmm3, xmm7 // 256 - alpha
+ pmullw xmm2, xmm3 // _r_b * alpha
+ movdqa xmm1, [esi] // _a_g
+ lea esi, [esi + 16]
+ psrlw xmm1, 8 // _a_g
+ por xmm0, xmm4 // set alpha to 255
+ pmullw xmm1, xmm3 // _a_g * alpha
+ psrlw xmm2, 8 // _r_b convert to 8 bits again
+ paddusb xmm0, xmm2 // + src argb
+ pand xmm1, xmm5 // a_g_ convert to 8 bits again
+ paddusb xmm0, xmm1 // + src argb
+ sub ecx, 4
+ movdqa [edx], xmm0
+ lea edx, [edx + 16]
+ jge convertloop4
+ jmp convertloop4b
+
+ // 4 pixel unaligned loop.
+ convertuloop4:
+ movdqu xmm3, [eax] // src argb
+ lea eax, [eax + 16]
+ movdqa xmm0, xmm3 // src argb
+ pxor xmm3, xmm4 // ~alpha
+ movdqu xmm2, [esi] // _r_b
+ pshufb xmm3, kShuffleAlpha // alpha
+ pand xmm2, xmm6 // _r_b
+ paddw xmm3, xmm7 // 256 - alpha
+ pmullw xmm2, xmm3 // _r_b * alpha
+ movdqu xmm1, [esi] // _a_g
+ lea esi, [esi + 16]
+ psrlw xmm1, 8 // _a_g
+ por xmm0, xmm4 // set alpha to 255
+ pmullw xmm1, xmm3 // _a_g * alpha
+ psrlw xmm2, 8 // _r_b convert to 8 bits again
+ paddusb xmm0, xmm2 // + src argb
+ pand xmm1, xmm5 // a_g_ convert to 8 bits again
+ paddusb xmm0, xmm1 // + src argb
+ sub ecx, 4
+ movdqa [edx], xmm0
+ lea edx, [edx + 16]
+ jge convertuloop4
+
+ convertloop4b:
+ add ecx, 4 - 1
+ jl convertloop1b
+
+ // 1 pixel loop.
+ convertloop1:
+ movd xmm3, [eax] // src argb
+ lea eax, [eax + 4]
+ movdqa xmm0, xmm3 // src argb
+ pxor xmm3, xmm4 // ~alpha
+ movd xmm2, [esi] // _r_b
+ pshufb xmm3, kShuffleAlpha // alpha
+ pand xmm2, xmm6 // _r_b
+ paddw xmm3, xmm7 // 256 - alpha
+ pmullw xmm2, xmm3 // _r_b * alpha
+ movd xmm1, [esi] // _a_g
+ lea esi, [esi + 4]
+ psrlw xmm1, 8 // _a_g
+ por xmm0, xmm4 // set alpha to 255
+ pmullw xmm1, xmm3 // _a_g * alpha
+ psrlw xmm2, 8 // _r_b convert to 8 bits again
+ paddusb xmm0, xmm2 // + src argb
+ pand xmm1, xmm5 // a_g_ convert to 8 bits again
+ paddusb xmm0, xmm1 // + src argb
+ sub ecx, 1
+ movd [edx], xmm0
+ lea edx, [edx + 4]
+ jge convertloop1
+
+ convertloop1b:
+ pop esi
+ ret
+ }
+}
+#endif // HAS_ARGBBLENDROW_SSSE3
+
+#ifdef HAS_ARGBATTENUATEROW_SSE2
+// Attenuate 4 pixels at a time.
+// Aligned to 16 bytes.
+__declspec(naked) __declspec(align(16))
+void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
+ __asm {
+ mov eax, [esp + 4] // src_argb0
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // width
+ sub edx, eax
+ pcmpeqb xmm4, xmm4 // generate mask 0xff000000
+ pslld xmm4, 24
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ffffff
+ psrld xmm5, 8
+
+ align 16
+ convertloop:
+ movdqa xmm0, [eax] // read 4 pixels
+ punpcklbw xmm0, xmm0 // first 2
+ pshufhw xmm2, xmm0, 0FFh // 8 alpha words
+ pshuflw xmm2, xmm2, 0FFh
+ pmulhuw xmm0, xmm2 // rgb * a
+ movdqa xmm1, [eax] // read 4 pixels
+ punpckhbw xmm1, xmm1 // next 2 pixels
+ pshufhw xmm2, xmm1, 0FFh // 8 alpha words
+ pshuflw xmm2, xmm2, 0FFh
+ pmulhuw xmm1, xmm2 // rgb * a
+ movdqa xmm2, [eax] // alphas
+ psrlw xmm0, 8
+ pand xmm2, xmm4
+ psrlw xmm1, 8
+ packuswb xmm0, xmm1
+ pand xmm0, xmm5 // keep original alphas
+ por xmm0, xmm2
+ sub ecx, 4
+ movdqa [eax + edx], xmm0
+ lea eax, [eax + 16]
+ jg convertloop
+
+ ret
+ }
+}
+#endif // HAS_ARGBATTENUATEROW_SSE2
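+
+// Illustrative scalar reference for ARGBAttenuateRow above: each color channel
+// is scaled by its own alpha; the SIMD code approximates the divide with a
+// 16 bit fixed point multiply. The helper name is illustrative.
+static __inline void ScalarARGBAttenuatePixel(const uint8* src, uint8* dst) {
+  uint32 a = src[3];
+  dst[0] = (uint8)(src[0] * a / 255);  // B
+  dst[1] = (uint8)(src[1] * a / 255);  // G
+  dst[2] = (uint8)(src[2] * a / 255);  // R
+  dst[3] = (uint8)a;                   // alpha unchanged.
+}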
+
+#ifdef HAS_ARGBATTENUATEROW_SSSE3
+// Shuffle table duplicating alpha.
+static const uvec8 kShuffleAlpha0 = {
+ 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
+};
+static const uvec8 kShuffleAlpha1 = {
+ 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
+ 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
+};
+__declspec(naked) __declspec(align(16))
+void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
+ __asm {
+ mov eax, [esp + 4] // src_argb0
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // width
+ sub edx, eax
+ pcmpeqb xmm3, xmm3 // generate mask 0xff000000
+ pslld xmm3, 24
+ movdqa xmm4, kShuffleAlpha0
+ movdqa xmm5, kShuffleAlpha1
+
+ align 16
+ convertloop:
+ movdqa xmm0, [eax] // read 4 pixels
+ pshufb xmm0, xmm4 // isolate first 2 alphas
+ movdqa xmm1, [eax] // read 4 pixels
+ punpcklbw xmm1, xmm1 // first 2 pixel rgbs
+ pmulhuw xmm0, xmm1 // rgb * a
+ movdqa xmm1, [eax] // read 4 pixels
+ pshufb xmm1, xmm5 // isolate next 2 alphas
+ movdqa xmm2, [eax] // read 4 pixels
+ punpckhbw xmm2, xmm2 // next 2 pixel rgbs
+ pmulhuw xmm1, xmm2 // rgb * a
+ movdqa xmm2, [eax] // mask original alpha
+ pand xmm2, xmm3
+ psrlw xmm0, 8
+ psrlw xmm1, 8
+ packuswb xmm0, xmm1
+ por xmm0, xmm2 // copy original alpha
+ sub ecx, 4
+ movdqa [eax + edx], xmm0
+ lea eax, [eax + 16]
+ jg convertloop
+
+ ret
+ }
+}
+#endif // HAS_ARGBATTENUATEROW_SSSE3
+
+#ifdef HAS_ARGBATTENUATEROW_AVX2
+// Shuffle table duplicating alpha.
+static const ulvec8 kShuffleAlpha_AVX2 = {
+ 6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u,
+ 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u,
+ 6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u,
+ 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u,
+};
+__declspec(naked) __declspec(align(16))
+void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
+ __asm {
+ mov eax, [esp + 4] // src_argb0
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // width
+ sub edx, eax
+ vmovdqa ymm4, kShuffleAlpha_AVX2
+ vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000
+ vpslld ymm5, ymm5, 24
+
+ align 16
+ convertloop:
+ vmovdqu ymm6, [eax] // read 8 pixels.
+ vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated.
+ vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated.
+ vpshufb ymm2, ymm0, ymm4 // low 4 alphas
+ vpshufb ymm3, ymm1, ymm4 // high 4 alphas
+ vpmulhuw ymm0, ymm0, ymm2 // rgb * a
+ vpmulhuw ymm1, ymm1, ymm3 // rgb * a
+ vpand ymm6, ymm6, ymm5 // isolate alpha
+ vpsrlw ymm0, ymm0, 8
+ vpsrlw ymm1, ymm1, 8
+ vpackuswb ymm0, ymm0, ymm1 // unmutated.
+ vpor ymm0, ymm0, ymm6 // copy original alpha
+ sub ecx, 8
+ vmovdqu [eax + edx], ymm0
+ lea eax, [eax + 32]
+ jg convertloop
+
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_ARGBATTENUATEROW_AVX2
+
+#ifdef HAS_ARGBUNATTENUATEROW_SSE2
+// Unattenuate 4 pixels at a time.
+// Aligned to 16 bytes.
+__declspec(naked) __declspec(align(16))
+void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
+ int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // src_argb0
+ mov edx, [esp + 8 + 8] // dst_argb
+ mov ecx, [esp + 8 + 12] // width
+ sub edx, eax
+
+ align 16
+ convertloop:
+ movdqa xmm0, [eax] // read 4 pixels
+ movzx esi, byte ptr [eax + 3] // first alpha
+ movzx edi, byte ptr [eax + 7] // second alpha
+ punpcklbw xmm0, xmm0 // first 2
+ movd xmm2, dword ptr fixed_invtbl8[esi * 4]
+ movd xmm3, dword ptr fixed_invtbl8[edi * 4]
+ pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words. 1, a, a, a
+ pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words
+ movlhps xmm2, xmm3
+ pmulhuw xmm0, xmm2 // rgb * a
+
+ movdqa xmm1, [eax] // read 4 pixels
+ movzx esi, byte ptr [eax + 11] // third alpha
+ movzx edi, byte ptr [eax + 15] // forth alpha
+ punpckhbw xmm1, xmm1 // next 2
+ movd xmm2, dword ptr fixed_invtbl8[esi * 4]
+ movd xmm3, dword ptr fixed_invtbl8[edi * 4]
+ pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words
+ pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words
+ movlhps xmm2, xmm3
+ pmulhuw xmm1, xmm2 // rgb * a
+
+ packuswb xmm0, xmm1
+ sub ecx, 4
+ movdqa [eax + edx], xmm0
+ lea eax, [eax + 16]
+ jg convertloop
+ pop edi
+ pop esi
+ ret
+ }
+}
+#endif // HAS_ARGBUNATTENUATEROW_SSE2
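+
+// Illustrative scalar reference for ARGBUnattenuateRow above: the inverse of
+// attenuate, scaling each channel back up by 255 / alpha (the SIMD code looks
+// the reciprocal up in fixed_invtbl8 rather than dividing). The helper name
+// and the rounding are illustrative.
+static __inline void ScalarARGBUnattenuatePixel(const uint8* src, uint8* dst) {
+  uint32 a = src[3];
+  int i;
+  for (i = 0; i < 3; ++i) {
+    uint32 v = a ? (src[i] * 255u + a / 2) / a : src[i];
+    dst[i] = (uint8)(v > 255 ? 255 : v);
+  }
+  dst[3] = (uint8)a;
+}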
+
+#ifdef HAS_ARGBUNATTENUATEROW_AVX2
+// Shuffle table duplicating alpha.
+static const ulvec8 kUnattenShuffleAlpha_AVX2 = {
+ 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u,
+ 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u,
+};
+// TODO(fbarchard): Enable USE_GATHER for future hardware if faster.
+// USE_GATHER is not on by default, due to being a slow instruction.
+#ifdef USE_GATHER
+__declspec(naked) __declspec(align(16))
+void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
+ int width) {
+ __asm {
+ mov eax, [esp + 4] // src_argb0
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // width
+ sub edx, eax
+ vmovdqa ymm4, kUnattenShuffleAlpha_AVX2
+
+ align 16
+ convertloop:
+ vmovdqu ymm6, [eax] // read 8 pixels.
+ vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xffffffff for gather.
+ vpsrld ymm2, ymm6, 24 // alpha in low 8 bits.
+ vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated.
+ vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated.
+ vpgatherdd ymm3, [ymm2 * 4 + fixed_invtbl8], ymm5 // ymm5 cleared. 1, a
+ vpunpcklwd ymm2, ymm3, ymm3 // low 4 inverted alphas. mutated. 1, 1, a, a
+ vpunpckhwd ymm3, ymm3, ymm3 // high 4 inverted alphas. mutated.
+ vpshufb ymm2, ymm2, ymm4 // replicate low 4 alphas. 1, a, a, a
+ vpshufb ymm3, ymm3, ymm4 // replicate high 4 alphas
+ vpmulhuw ymm0, ymm0, ymm2 // rgb * ia
+ vpmulhuw ymm1, ymm1, ymm3 // rgb * ia
+ vpackuswb ymm0, ymm0, ymm1 // unmutated.
+ sub ecx, 8
+ vmovdqu [eax + edx], ymm0
+ lea eax, [eax + 32]
+ jg convertloop
+
+ vzeroupper
+ ret
+ }
+}
+#else // USE_GATHER
+__declspec(naked) __declspec(align(16))
+void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
+ int width) {
+ __asm {
+
+ mov eax, [esp + 4] // src_argb0
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // width
+ sub edx, eax
+ vmovdqa ymm5, kUnattenShuffleAlpha_AVX2
+
+ push esi
+ push edi
+
+ align 16
+ convertloop:
+ // replace VPGATHER
+ movzx esi, byte ptr [eax + 3] // alpha0
+ movzx edi, byte ptr [eax + 7] // alpha1
+ vmovd xmm0, dword ptr fixed_invtbl8[esi * 4] // [1,a0]
+ vmovd xmm1, dword ptr fixed_invtbl8[edi * 4] // [1,a1]
+ movzx esi, byte ptr [eax + 11] // alpha2
+ movzx edi, byte ptr [eax + 15] // alpha3
+ vpunpckldq xmm6, xmm0, xmm1 // [1,a1,1,a0]
+ vmovd xmm2, dword ptr fixed_invtbl8[esi * 4] // [1,a2]
+ vmovd xmm3, dword ptr fixed_invtbl8[edi * 4] // [1,a3]
+ movzx esi, byte ptr [eax + 19] // alpha4
+ movzx edi, byte ptr [eax + 23] // alpha5
+ vpunpckldq xmm7, xmm2, xmm3 // [1,a3,1,a2]
+ vmovd xmm0, dword ptr fixed_invtbl8[esi * 4] // [1,a4]
+ vmovd xmm1, dword ptr fixed_invtbl8[edi * 4] // [1,a5]
+ movzx esi, byte ptr [eax + 27] // alpha6
+ movzx edi, byte ptr [eax + 31] // alpha7
+ vpunpckldq xmm0, xmm0, xmm1 // [1,a5,1,a4]
+ vmovd xmm2, dword ptr fixed_invtbl8[esi * 4] // [1,a6]
+ vmovd xmm3, dword ptr fixed_invtbl8[edi * 4] // [1,a7]
+ vpunpckldq xmm2, xmm2, xmm3 // [1,a7,1,a6]
+ vpunpcklqdq xmm3, xmm6, xmm7 // [1,a3,1,a2,1,a1,1,a0]
+ vpunpcklqdq xmm0, xmm0, xmm2 // [1,a7,1,a6,1,a5,1,a4]
+ vinserti128 ymm3, ymm3, xmm0, 1 // [1,a7,1,a6,1,a5,1,a4,1,a3,1,a2,1,a1,1,a0]
+ // end of VPGATHER
+
+ vmovdqu ymm6, [eax] // read 8 pixels.
+ vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated.
+ vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated.
+ vpunpcklwd ymm2, ymm3, ymm3 // low 4 inverted alphas. mutated. 1, 1, a, a
+ vpunpckhwd ymm3, ymm3, ymm3 // high 4 inverted alphas. mutated.
+ vpshufb ymm2, ymm2, ymm5 // replicate low 4 alphas. 1, a, a, a
+ vpshufb ymm3, ymm3, ymm5 // replicate high 4 alphas
+ vpmulhuw ymm0, ymm0, ymm2 // rgb * ia
+ vpmulhuw ymm1, ymm1, ymm3 // rgb * ia
+ vpackuswb ymm0, ymm0, ymm1 // unmutated.
+ sub ecx, 8
+ vmovdqu [eax + edx], ymm0
+ lea eax, [eax + 32]
+ jg convertloop
+
+ pop edi
+ pop esi
+ vzeroupper
+ ret
+ }
+}
+#endif // USE_GATHER
+#endif // HAS_ARGBUNATTENUATEROW_AVX2
+
+#ifdef HAS_ARGBGRAYROW_SSSE3
+// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels.
+__declspec(naked) __declspec(align(16))
+void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
+ __asm {
+ mov eax, [esp + 4] /* src_argb */
+ mov edx, [esp + 8] /* dst_argb */
+ mov ecx, [esp + 12] /* width */
+ movdqa xmm4, kARGBToYJ
+ movdqa xmm5, kAddYJ64
+ sub edx, eax
+
+ align 16
+ convertloop:
+ movdqa xmm0, [eax] // G
+ movdqa xmm1, [eax + 16]
+ pmaddubsw xmm0, xmm4
+ pmaddubsw xmm1, xmm4
+ phaddw xmm0, xmm1
+ paddw xmm0, xmm5 // Add .5 for rounding.
+ psrlw xmm0, 7
+ packuswb xmm0, xmm0 // 8 G bytes
+ movdqa xmm2, [eax] // A
+ movdqa xmm3, [eax + 16]
+ psrld xmm2, 24
+ psrld xmm3, 24
+ packuswb xmm2, xmm3
+ packuswb xmm2, xmm2 // 8 A bytes
+ movdqa xmm3, xmm0 // Weave into GG, GA, then GGGA
+ punpcklbw xmm0, xmm0 // 8 GG words
+ punpcklbw xmm3, xmm2 // 8 GA words
+ movdqa xmm1, xmm0
+ punpcklwd xmm0, xmm3 // GGGA first 4
+ punpckhwd xmm1, xmm3 // GGGA next 4
+ sub ecx, 8
+ movdqa [eax + edx], xmm0
+ movdqa [eax + edx + 16], xmm1
+ lea eax, [eax + 32]
+ jg convertloop
+ ret
+ }
+}
+#endif // HAS_ARGBGRAYROW_SSSE3
+
+#ifdef HAS_ARGBSEPIAROW_SSSE3
+// b = (r * 35 + g * 68 + b * 17) >> 7
+// g = (r * 45 + g * 88 + b * 22) >> 7
+// r = (r * 50 + g * 98 + b * 24) >> 7
+// Constant for ARGB color to sepia tone.
+static const vec8 kARGBToSepiaB = {
+ 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
+};
+
+static const vec8 kARGBToSepiaG = {
+ 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
+};
+
+static const vec8 kARGBToSepiaR = {
+ 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
+};
+
+// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
+__declspec(naked) __declspec(align(16))
+void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
+ __asm {
+ mov eax, [esp + 4] /* dst_argb */
+ mov ecx, [esp + 8] /* width */
+ movdqa xmm2, kARGBToSepiaB
+ movdqa xmm3, kARGBToSepiaG
+ movdqa xmm4, kARGBToSepiaR
+
+ align 16
+ convertloop:
+ movdqa xmm0, [eax] // B
+ movdqa xmm6, [eax + 16]
+ pmaddubsw xmm0, xmm2
+ pmaddubsw xmm6, xmm2
+ phaddw xmm0, xmm6
+ psrlw xmm0, 7
+ packuswb xmm0, xmm0 // 8 B values
+ movdqa xmm5, [eax] // G
+ movdqa xmm1, [eax + 16]
+ pmaddubsw xmm5, xmm3
+ pmaddubsw xmm1, xmm3
+ phaddw xmm5, xmm1
+ psrlw xmm5, 7
+ packuswb xmm5, xmm5 // 8 G values
+ punpcklbw xmm0, xmm5 // 8 BG values
+ movdqa xmm5, [eax] // R
+ movdqa xmm1, [eax + 16]
+ pmaddubsw xmm5, xmm4
+ pmaddubsw xmm1, xmm4
+ phaddw xmm5, xmm1
+ psrlw xmm5, 7
+ packuswb xmm5, xmm5 // 8 R values
+ movdqa xmm6, [eax] // A
+ movdqa xmm1, [eax + 16]
+ psrld xmm6, 24
+ psrld xmm1, 24
+ packuswb xmm6, xmm1
+ packuswb xmm6, xmm6 // 8 A values
+ punpcklbw xmm5, xmm6 // 8 RA values
+ movdqa xmm1, xmm0 // Weave BG, RA together
+ punpcklwd xmm0, xmm5 // BGRA first 4
+ punpckhwd xmm1, xmm5 // BGRA next 4
+ sub ecx, 8
+ movdqa [eax], xmm0
+ movdqa [eax + 16], xmm1
+ lea eax, [eax + 32]
+ jg convertloop
+ ret
+ }
+}
+#endif // HAS_ARGBSEPIAROW_SSSE3
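+
+// Scalar sketch (not part of libyuv) of the sepia math documented above.
+// ARGB bytes are ordered B, G, R, A; results are clamped to 255 as the
+// packuswb in the SSSE3 row does.
+static void ARGBSepiaRow_Sketch(uint8* dst_argb, int width) {
+  for (int i = 0; i < width; ++i) {
+    const int b = dst_argb[0];
+    const int g = dst_argb[1];
+    const int r = dst_argb[2];
+    const int sb = (r * 35 + g * 68 + b * 17) >> 7;
+    const int sg = (r * 45 + g * 88 + b * 22) >> 7;
+    const int sr = (r * 50 + g * 98 + b * 24) >> 7;
+    dst_argb[0] = static_cast<uint8>(sb > 255 ? 255 : sb);
+    dst_argb[1] = static_cast<uint8>(sg > 255 ? 255 : sg);
+    dst_argb[2] = static_cast<uint8>(sr > 255 ? 255 : sr);
+    // Alpha (dst_argb[3]) is left unchanged.
+    dst_argb += 4;
+  }
+}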
+
+#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
+// Transform 8 ARGB pixels (32 bytes) with color matrix.
+// Same as Sepia except matrix is provided.
+// TODO(fbarchard): packuswbs only use half of the reg. To make RGBA, combine R
+// and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd.
+__declspec(naked) __declspec(align(16))
+void ARGBColorMatrixRow_SSSE3(uint8* dst_argb, const int8* matrix_argb,
+ int width) {
+ __asm {
+ mov eax, [esp + 4] /* dst_argb */
+ mov edx, [esp + 8] /* matrix_argb */
+ mov ecx, [esp + 12] /* width */
+ movd xmm2, [edx]
+ movd xmm3, [edx + 4]
+ movd xmm4, [edx + 8]
+ pshufd xmm2, xmm2, 0
+ pshufd xmm3, xmm3, 0
+ pshufd xmm4, xmm4, 0
+
+ align 16
+ convertloop:
+ movdqa xmm0, [eax] // B
+ movdqa xmm6, [eax + 16]
+ pmaddubsw xmm0, xmm2
+ pmaddubsw xmm6, xmm2
+ movdqa xmm5, [eax] // G
+ movdqa xmm1, [eax + 16]
+ pmaddubsw xmm5, xmm3
+ pmaddubsw xmm1, xmm3
+ phaddsw xmm0, xmm6 // B
+ phaddsw xmm5, xmm1 // G
+ psraw xmm0, 7 // B
+ psraw xmm5, 7 // G
+ packuswb xmm0, xmm0 // 8 B values
+ packuswb xmm5, xmm5 // 8 G values
+ punpcklbw xmm0, xmm5 // 8 BG values
+ movdqa xmm5, [eax] // R
+ movdqa xmm1, [eax + 16]
+ pmaddubsw xmm5, xmm4
+ pmaddubsw xmm1, xmm4
+ phaddsw xmm5, xmm1
+ psraw xmm5, 7
+ packuswb xmm5, xmm5 // 8 R values
+ movdqa xmm6, [eax] // A
+ movdqa xmm1, [eax + 16]
+ psrld xmm6, 24
+ psrld xmm1, 24
+ packuswb xmm6, xmm1
+ packuswb xmm6, xmm6 // 8 A values
+ movdqa xmm1, xmm0 // Weave BG, RA together
+ punpcklbw xmm5, xmm6 // 8 RA values
+ punpcklwd xmm0, xmm5 // BGRA first 4
+ punpckhwd xmm1, xmm5 // BGRA next 4
+ sub ecx, 8
+ movdqa [eax], xmm0
+ movdqa [eax + 16], xmm1
+ lea eax, [eax + 32]
+ jg convertloop
+ ret
+ }
+}
+#endif // HAS_ARGBCOLORMATRIXROW_SSSE3
+
+#ifdef HAS_ARGBCOLORTABLEROW_X86
+// Transform ARGB pixels with color table.
+__declspec(naked) __declspec(align(16))
+void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
+ int width) {
+ __asm {
+ push ebx
+ push esi
+ push edi
+ push ebp
+ mov eax, [esp + 16 + 4] /* dst_argb */
+ mov edi, [esp + 16 + 8] /* table_argb */
+ mov ecx, [esp + 16 + 12] /* width */
+ xor ebx, ebx
+ xor edx, edx
+
+ align 16
+ convertloop:
+ mov ebp, dword ptr [eax] // BGRA
+ mov esi, ebp
+ and ebp, 255
+ shr esi, 8
+ and esi, 255
+ mov bl, [edi + ebp * 4 + 0] // B
+ mov dl, [edi + esi * 4 + 1] // G
+ mov ebp, dword ptr [eax] // BGRA
+ mov esi, ebp
+ shr ebp, 16
+ shr esi, 24
+ and ebp, 255
+ mov [eax], bl
+ mov [eax + 1], dl
+ mov bl, [edi + ebp * 4 + 2] // R
+ mov dl, [edi + esi * 4 + 3] // A
+ mov [eax + 2], bl
+ mov [eax + 3], dl
+ lea eax, [eax + 4]
+ sub ecx, 1
+ jg convertloop
+ pop ebp
+ pop edi
+ pop esi
+ pop ebx
+ ret
+ }
+}
+#endif // HAS_ARGBCOLORTABLEROW_X86
+
+#ifdef HAS_ARGBQUANTIZEROW_SSE2
+// Quantize 4 ARGB pixels (16 bytes).
+// Aligned to 16 bytes.
+__declspec(naked) __declspec(align(16))
+void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
+ int interval_offset, int width) {
+ __asm {
+ mov eax, [esp + 4] /* dst_argb */
+ movd xmm2, [esp + 8] /* scale */
+ movd xmm3, [esp + 12] /* interval_size */
+ movd xmm4, [esp + 16] /* interval_offset */
+ mov ecx, [esp + 20] /* width */
+ pshuflw xmm2, xmm2, 040h
+ pshufd xmm2, xmm2, 044h
+ pshuflw xmm3, xmm3, 040h
+ pshufd xmm3, xmm3, 044h
+ pshuflw xmm4, xmm4, 040h
+ pshufd xmm4, xmm4, 044h
+ pxor xmm5, xmm5 // constant 0
+ pcmpeqb xmm6, xmm6 // generate mask 0xff000000
+ pslld xmm6, 24
+
+ align 16
+ convertloop:
+ movdqa xmm0, [eax] // read 4 pixels
+ punpcklbw xmm0, xmm5 // first 2 pixels
+ pmulhuw xmm0, xmm2 // pixel * scale >> 16
+ movdqa xmm1, [eax] // read 4 pixels
+ punpckhbw xmm1, xmm5 // next 2 pixels
+ pmulhuw xmm1, xmm2
+ pmullw xmm0, xmm3 // * interval_size
+ movdqa xmm7, [eax] // read 4 pixels
+ pmullw xmm1, xmm3
+ pand xmm7, xmm6 // mask alpha
+ paddw xmm0, xmm4 // + interval_offset
+ paddw xmm1, xmm4
+ packuswb xmm0, xmm1
+ por xmm0, xmm7
+ sub ecx, 4
+ movdqa [eax], xmm0
+ lea eax, [eax + 16]
+ jg convertloop
+ ret
+ }
+}
+#endif // HAS_ARGBQUANTIZEROW_SSE2
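+
+// Scalar sketch (not part of libyuv) of the quantize (posterize) math above:
+// each color channel becomes (v * scale >> 16) * interval_size +
+// interval_offset, alpha is preserved, and results clamp to 255 as packuswb
+// does in the SSE2 row.
+static void ARGBQuantizeRow_Sketch(uint8* dst_argb, int scale,
+                                   int interval_size, int interval_offset,
+                                   int width) {
+  for (int i = 0; i < width; ++i) {
+    for (int c = 0; c < 3; ++c) {  // B, G, R; dst_argb[3] (A) untouched.
+      int v = ((dst_argb[c] * scale) >> 16) * interval_size + interval_offset;
+      dst_argb[c] = static_cast<uint8>(v > 255 ? 255 : v);
+    }
+    dst_argb += 4;
+  }
+}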
+
+#ifdef HAS_ARGBSHADEROW_SSE2
+// Shade 4 pixels at a time by specified value.
+// Aligned to 16 bytes.
+__declspec(naked) __declspec(align(16))
+void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
+ uint32 value) {
+ __asm {
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // width
+ movd xmm2, [esp + 16] // value
+ sub edx, eax
+ punpcklbw xmm2, xmm2
+ punpcklqdq xmm2, xmm2
+
+ align 16
+ convertloop:
+ movdqa xmm0, [eax] // read 4 pixels
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, xmm0 // first 2
+ punpckhbw xmm1, xmm1 // next 2
+ pmulhuw xmm0, xmm2 // argb * value
+ pmulhuw xmm1, xmm2 // argb * value
+ psrlw xmm0, 8
+ psrlw xmm1, 8
+ packuswb xmm0, xmm1
+ sub ecx, 4
+ movdqa [eax + edx], xmm0
+ lea eax, [eax + 16]
+ jg convertloop
+
+ ret
+ }
+}
+#endif // HAS_ARGBSHADEROW_SSE2
+
+#ifdef HAS_ARGBMULTIPLYROW_SSE2
+// Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
+__declspec(naked) __declspec(align(16))
+void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // src_argb0
+ mov esi, [esp + 4 + 8] // src_argb1
+ mov edx, [esp + 4 + 12] // dst_argb
+ mov ecx, [esp + 4 + 16] // width
+ pxor xmm5, xmm5 // constant 0
+ sub esi, eax
+ sub edx, eax
+
+ align 16
+ convertloop:
+ movdqu xmm0, [eax] // read 4 pixels from src_argb0
+ movdqu xmm2, [eax + esi] // read 4 pixels from src_argb1
+ movdqu xmm1, xmm0
+ movdqu xmm3, xmm2
+ punpcklbw xmm0, xmm0 // first 2
+ punpckhbw xmm1, xmm1 // next 2
+ punpcklbw xmm2, xmm5 // first 2
+ punpckhbw xmm3, xmm5 // next 2
+ pmulhuw xmm0, xmm2 // src_argb0 * src_argb1 first 2
+ pmulhuw xmm1, xmm3 // src_argb0 * src_argb1 next 2
+ packuswb xmm0, xmm1
+ sub ecx, 4
+ movdqu [eax + edx], xmm0
+ lea eax, [eax + 16]
+ jg convertloop
+
+ pop esi
+ ret
+ }
+}
+#endif // HAS_ARGBMULTIPLYROW_SSE2
+
+#ifdef HAS_ARGBADDROW_SSE2
+// Add 2 rows of ARGB pixels together, 4 pixels at a time.
+// TODO(fbarchard): Port this to posix, neon and other math functions.
+__declspec(naked) __declspec(align(16))
+void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // src_argb0
+ mov esi, [esp + 4 + 8] // src_argb1
+ mov edx, [esp + 4 + 12] // dst_argb
+ mov ecx, [esp + 4 + 16] // width
+ sub esi, eax
+ sub edx, eax
+
+ sub ecx, 4
+ jl convertloop49
+
+ align 16
+ convertloop4:
+ movdqu xmm0, [eax] // read 4 pixels from src_argb0
+ movdqu xmm1, [eax + esi] // read 4 pixels from src_argb1
+ paddusb xmm0, xmm1 // src_argb0 + src_argb1
+ sub ecx, 4
+ movdqu [eax + edx], xmm0
+ lea eax, [eax + 16]
+ jge convertloop4
+
+ convertloop49:
+ add ecx, 4 - 1
+ jl convertloop19
+
+ convertloop1:
+ movd xmm0, [eax] // read 1 pixel from src_argb0
+ movd xmm1, [eax + esi] // read 1 pixel from src_argb1
+ paddusb xmm0, xmm1 // src_argb0 + src_argb1
+ sub ecx, 1
+ movd [eax + edx], xmm0
+ lea eax, [eax + 4]
+ jge convertloop1
+
+ convertloop19:
+ pop esi
+ ret
+ }
+}
+#endif // HAS_ARGBADDROW_SSE2
+
+#ifdef HAS_ARGBSUBTRACTROW_SSE2
+// Subtract one row of ARGB pixels from another, 4 pixels at a time.
+__declspec(naked) __declspec(align(16))
+void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // src_argb0
+ mov esi, [esp + 4 + 8] // src_argb1
+ mov edx, [esp + 4 + 12] // dst_argb
+ mov ecx, [esp + 4 + 16] // width
+ sub esi, eax
+ sub edx, eax
+
+ align 16
+ convertloop:
+ movdqu xmm0, [eax] // read 4 pixels from src_argb0
+ movdqu xmm1, [eax + esi] // read 4 pixels from src_argb1
+ psubusb xmm0, xmm1 // src_argb0 - src_argb1
+ sub ecx, 4
+ movdqu [eax + edx], xmm0
+ lea eax, [eax + 16]
+ jg convertloop
+
+ pop esi
+ ret
+ }
+}
+#endif // HAS_ARGBSUBTRACTROW_SSE2
+
+#ifdef HAS_ARGBMULTIPLYROW_AVX2
+// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
+__declspec(naked) __declspec(align(16))
+void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // src_argb0
+ mov esi, [esp + 4 + 8] // src_argb1
+ mov edx, [esp + 4 + 12] // dst_argb
+ mov ecx, [esp + 4 + 16] // width
+ vpxor ymm5, ymm5, ymm5 // constant 0
+ sub esi, eax
+ sub edx, eax
+
+ align 16
+ convertloop:
+ vmovdqu ymm1, [eax] // read 8 pixels from src_argb0
+ vmovdqu ymm3, [eax + esi] // read 8 pixels from src_argb1
+ vpunpcklbw ymm0, ymm1, ymm1 // low 4
+ vpunpckhbw ymm1, ymm1, ymm1 // high 4
+ vpunpcklbw ymm2, ymm3, ymm5 // low 4
+ vpunpckhbw ymm3, ymm3, ymm5 // high 4
+ vpmulhuw ymm0, ymm0, ymm2 // src_argb0 * src_argb1 low 4
+ vpmulhuw ymm1, ymm1, ymm3 // src_argb0 * src_argb1 high 4
+ vpackuswb ymm0, ymm0, ymm1
+ vmovdqu [eax + edx], ymm0
+ lea eax, [eax + 32]
+ sub ecx, 8
+ jg convertloop
+
+ pop esi
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_ARGBMULTIPLYROW_AVX2
+
+#ifdef HAS_ARGBADDROW_AVX2
+// Add 2 rows of ARGB pixels together, 8 pixels at a time.
+__declspec(naked) __declspec(align(16))
+void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // src_argb0
+ mov esi, [esp + 4 + 8] // src_argb1
+ mov edx, [esp + 4 + 12] // dst_argb
+ mov ecx, [esp + 4 + 16] // width
+ sub esi, eax
+ sub edx, eax
+
+ align 16
+ convertloop:
+ vmovdqu ymm0, [eax] // read 8 pixels from src_argb0
+ vpaddusb ymm0, ymm0, [eax + esi] // add 8 pixels from src_argb1
+ vmovdqu [eax + edx], ymm0
+ lea eax, [eax + 32]
+ sub ecx, 8
+ jg convertloop
+
+ pop esi
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_ARGBADDROW_AVX2
+
+#ifdef HAS_ARGBSUBTRACTROW_AVX2
+// Subtract one row of ARGB pixels from another, 8 pixels at a time.
+__declspec(naked) __declspec(align(16))
+void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // src_argb0
+ mov esi, [esp + 4 + 8] // src_argb1
+ mov edx, [esp + 4 + 12] // dst_argb
+ mov ecx, [esp + 4 + 16] // width
+ sub esi, eax
+ sub edx, eax
+
+ align 16
+ convertloop:
+ vmovdqu ymm0, [eax] // read 8 pixels from src_argb0
+ vpsubusb ymm0, ymm0, [eax + esi] // src_argb0 - src_argb1
+ vmovdqu [eax + edx], ymm0
+ lea eax, [eax + 32]
+ sub ecx, 8
+ jg convertloop
+
+ pop esi
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_ARGBSUBTRACTROW_AVX2
+
+#ifdef HAS_SOBELXROW_SSSE3
+// SobelX as a matrix is
+// -1 0 1
+// -2 0 2
+// -1 0 1
+__declspec(naked) __declspec(align(16))
+void SobelXRow_SSSE3(const uint8* src_y0, const uint8* src_y1,
+ const uint8* src_y2, uint8* dst_sobelx, int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // src_y0
+ mov esi, [esp + 8 + 8] // src_y1
+ mov edi, [esp + 8 + 12] // src_y2
+ mov edx, [esp + 8 + 16] // dst_sobelx
+ mov ecx, [esp + 8 + 20] // width
+ sub esi, eax
+ sub edi, eax
+ sub edx, eax
+ pxor xmm5, xmm5 // constant 0
+
+ align 16
+ convertloop:
+ movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0]
+ movq xmm1, qword ptr [eax + 2] // read 8 pixels from src_y0[2]
+ punpcklbw xmm0, xmm5
+ punpcklbw xmm1, xmm5
+ psubw xmm0, xmm1
+ movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0]
+ movq xmm2, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2]
+ punpcklbw xmm1, xmm5
+ punpcklbw xmm2, xmm5
+ psubw xmm1, xmm2
+ movq xmm2, qword ptr [eax + edi] // read 8 pixels from src_y2[0]
+ movq xmm3, qword ptr [eax + edi + 2] // read 8 pixels from src_y2[2]
+ punpcklbw xmm2, xmm5
+ punpcklbw xmm3, xmm5
+ psubw xmm2, xmm3
+ paddw xmm0, xmm2
+ paddw xmm0, xmm1
+ paddw xmm0, xmm1
+ pabsw xmm0, xmm0 // SSSE3. Could use SSE2 psubusw twice instead.
+ packuswb xmm0, xmm0
+ sub ecx, 8
+ movq qword ptr [eax + edx], xmm0
+ lea eax, [eax + 8]
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+#endif // HAS_SOBELXROW_SSSE3
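+
+// Scalar sketch (not part of libyuv) of the SobelX filter above: the 3x3
+// convolution reduces to row differences at offsets 0 and 2, with the middle
+// row weighted by 2, followed by absolute value and a clamp to 255.
+static void SobelXRow_Sketch(const uint8* src_y0, const uint8* src_y1,
+                             const uint8* src_y2, uint8* dst_sobelx,
+                             int width) {
+  for (int i = 0; i < width; ++i) {
+    const int a = src_y0[i] - src_y0[i + 2];
+    const int b = src_y1[i] - src_y1[i + 2];
+    const int c = src_y2[i] - src_y2[i + 2];
+    int sobel = a + b * 2 + c;
+    if (sobel < 0) sobel = -sobel;
+    if (sobel > 255) sobel = 255;
+    dst_sobelx[i] = static_cast<uint8>(sobel);
+  }
+}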
+
+#ifdef HAS_SOBELYROW_SSSE3
+// SobelY as a matrix is
+// -1 -2 -1
+// 0 0 0
+// 1 2 1
+__declspec(naked) __declspec(align(16))
+void SobelYRow_SSSE3(const uint8* src_y0, const uint8* src_y1,
+ uint8* dst_sobely, int width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // src_y0
+ mov esi, [esp + 4 + 8] // src_y1
+ mov edx, [esp + 4 + 12] // dst_sobely
+ mov ecx, [esp + 4 + 16] // width
+ sub esi, eax
+ sub edx, eax
+ pxor xmm5, xmm5 // constant 0
+
+ align 16
+ convertloop:
+ movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0]
+ movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0]
+ punpcklbw xmm0, xmm5
+ punpcklbw xmm1, xmm5
+ psubw xmm0, xmm1
+ movq xmm1, qword ptr [eax + 1] // read 8 pixels from src_y0[1]
+ movq xmm2, qword ptr [eax + esi + 1] // read 8 pixels from src_y1[1]
+ punpcklbw xmm1, xmm5
+ punpcklbw xmm2, xmm5
+ psubw xmm1, xmm2
+ movq xmm2, qword ptr [eax + 2] // read 8 pixels from src_y0[2]
+ movq xmm3, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2]
+ punpcklbw xmm2, xmm5
+ punpcklbw xmm3, xmm5
+ psubw xmm2, xmm3
+ paddw xmm0, xmm2
+ paddw xmm0, xmm1
+ paddw xmm0, xmm1
+ pabsw xmm0, xmm0 // SSSE3. Could use SSE2 psubusw twice instead.
+ packuswb xmm0, xmm0
+ sub ecx, 8
+ movq qword ptr [eax + edx], xmm0
+ lea eax, [eax + 8]
+ jg convertloop
+
+ pop esi
+ ret
+ }
+}
+#endif // HAS_SOBELYROW_SSSE3
+
+#ifdef HAS_SOBELROW_SSE2
+// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
+// A = 255
+// R = Sobel
+// G = Sobel
+// B = Sobel
+__declspec(naked) __declspec(align(16))
+void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
+ uint8* dst_argb, int width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // src_sobelx
+ mov esi, [esp + 4 + 8] // src_sobely
+ mov edx, [esp + 4 + 12] // dst_argb
+ mov ecx, [esp + 4 + 16] // width
+ sub esi, eax
+ pcmpeqb xmm5, xmm5 // alpha 255
+ pslld xmm5, 24 // 0xff000000
+
+ align 16
+ convertloop:
+ movdqa xmm0, [eax] // read 16 pixels src_sobelx
+ movdqa xmm1, [eax + esi] // read 16 pixels src_sobely
+ lea eax, [eax + 16]
+ paddusb xmm0, xmm1 // sobel = sobelx + sobely
+ movdqa xmm2, xmm0 // GG
+ punpcklbw xmm2, xmm0 // First 8
+ punpckhbw xmm0, xmm0 // Next 8
+ movdqa xmm1, xmm2 // GGGG
+ punpcklwd xmm1, xmm2 // First 4
+ punpckhwd xmm2, xmm2 // Next 4
+ por xmm1, xmm5 // GGGA
+ por xmm2, xmm5
+ movdqa xmm3, xmm0 // GGGG
+ punpcklwd xmm3, xmm0 // Next 4
+ punpckhwd xmm0, xmm0 // Last 4
+ por xmm3, xmm5 // GGGA
+ por xmm0, xmm5
+ sub ecx, 16
+ movdqa [edx], xmm1
+ movdqa [edx + 16], xmm2
+ movdqa [edx + 32], xmm3
+ movdqa [edx + 48], xmm0
+ lea edx, [edx + 64]
+ jg convertloop
+
+ pop esi
+ ret
+ }
+}
+#endif // HAS_SOBELROW_SSE2
+
+#ifdef HAS_SOBELXYROW_SSE2
+// Mixes Sobel X, Sobel Y and Sobel into ARGB.
+// A = 255
+// R = Sobel X
+// G = Sobel
+// B = Sobel Y
+__declspec(naked) __declspec(align(16))
+void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
+ uint8* dst_argb, int width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // src_sobelx
+ mov esi, [esp + 4 + 8] // src_sobely
+ mov edx, [esp + 4 + 12] // dst_argb
+ mov ecx, [esp + 4 + 16] // width
+ sub esi, eax
+ pcmpeqb xmm5, xmm5 // alpha 255
+
+ align 16
+ convertloop:
+ movdqa xmm0, [eax] // read 16 pixels src_sobelx
+ movdqa xmm1, [eax + esi] // read 16 pixels src_sobely
+ lea eax, [eax + 16]
+ movdqa xmm2, xmm0
+ paddusb xmm2, xmm1 // sobel = sobelx + sobely
+ movdqa xmm3, xmm0 // XA
+ punpcklbw xmm3, xmm5
+ punpckhbw xmm0, xmm5
+ movdqa xmm4, xmm1 // YS
+ punpcklbw xmm4, xmm2
+ punpckhbw xmm1, xmm2
+ movdqa xmm6, xmm4 // YSXA
+ punpcklwd xmm6, xmm3 // First 4
+ punpckhwd xmm4, xmm3 // Next 4
+ movdqa xmm7, xmm1 // YSXA
+ punpcklwd xmm7, xmm0 // Next 4
+ punpckhwd xmm1, xmm0 // Last 4
+ sub ecx, 16
+ movdqa [edx], xmm6
+ movdqa [edx + 16], xmm4
+ movdqa [edx + 32], xmm7
+ movdqa [edx + 48], xmm1
+ lea edx, [edx + 64]
+ jg convertloop
+
+ pop esi
+ ret
+ }
+}
+#endif // HAS_SOBELXYROW_SSE2
+
+#ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
+// Consider float CumulativeSum.
+// Consider calling CumulativeSum one row at a time as needed.
+// Consider circular CumulativeSum buffer of radius * 2 + 1 height.
+// Convert cumulative sum for an area to an average for 1 pixel.
+// topleft is pointer to top left of CumulativeSum buffer for area.
+// botleft is pointer to bottom left of CumulativeSum buffer.
+// width is offset from left to right of area in CumulativeSum buffer measured
+// in number of ints.
+// area is the number of pixels in the area being averaged.
+// dst points to pixel to store result to.
+// count is number of averaged pixels to produce.
+// Does 4 pixels at a time, requires CumulativeSum pointers to be 16 byte
+// aligned.
+void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
+ int width, int area, uint8* dst,
+ int count) {
+ __asm {
+ mov eax, topleft // eax topleft
+ mov esi, botleft // esi botleft
+ mov edx, width
+ movd xmm4, area
+ mov edi, dst
+ mov ecx, count
+ cvtdq2ps xmm4, xmm4
+ rcpss xmm4, xmm4 // 1.0f / area
+ pshufd xmm4, xmm4, 0
+ sub ecx, 4
+ jl l4b
+
+ // 4 pixel loop
+ align 4
+ l4:
+ // top left
+ movdqa xmm0, [eax]
+ movdqa xmm1, [eax + 16]
+ movdqa xmm2, [eax + 32]
+ movdqa xmm3, [eax + 48]
+
+ // - top right
+ psubd xmm0, [eax + edx * 4]
+ psubd xmm1, [eax + edx * 4 + 16]
+ psubd xmm2, [eax + edx * 4 + 32]
+ psubd xmm3, [eax + edx * 4 + 48]
+ lea eax, [eax + 64]
+
+ // - bottom left
+ psubd xmm0, [esi]
+ psubd xmm1, [esi + 16]
+ psubd xmm2, [esi + 32]
+ psubd xmm3, [esi + 48]
+
+ // + bottom right
+ paddd xmm0, [esi + edx * 4]
+ paddd xmm1, [esi + edx * 4 + 16]
+ paddd xmm2, [esi + edx * 4 + 32]
+ paddd xmm3, [esi + edx * 4 + 48]
+ lea esi, [esi + 64]
+
+ cvtdq2ps xmm0, xmm0 // Average = Sum * 1 / Area
+ cvtdq2ps xmm1, xmm1
+ mulps xmm0, xmm4
+ mulps xmm1, xmm4
+ cvtdq2ps xmm2, xmm2
+ cvtdq2ps xmm3, xmm3
+ mulps xmm2, xmm4
+ mulps xmm3, xmm4
+ cvtps2dq xmm0, xmm0
+ cvtps2dq xmm1, xmm1
+ cvtps2dq xmm2, xmm2
+ cvtps2dq xmm3, xmm3
+ packssdw xmm0, xmm1
+ packssdw xmm2, xmm3
+ packuswb xmm0, xmm2
+ movdqu [edi], xmm0
+ lea edi, [edi + 16]
+ sub ecx, 4
+ jge l4
+
+ l4b:
+ add ecx, 4 - 1
+ jl l1b
+
+ // 1 pixel loop
+ align 4
+ l1:
+ movdqa xmm0, [eax]
+ psubd xmm0, [eax + edx * 4]
+ lea eax, [eax + 16]
+ psubd xmm0, [esi]
+ paddd xmm0, [esi + edx * 4]
+ lea esi, [esi + 16]
+ cvtdq2ps xmm0, xmm0
+ mulps xmm0, xmm4
+ cvtps2dq xmm0, xmm0
+ packssdw xmm0, xmm0
+ packuswb xmm0, xmm0
+ movd dword ptr [edi], xmm0
+ lea edi, [edi + 4]
+ sub ecx, 1
+ jge l1
+ l1b:
+ }
+}
+#endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
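+
+// Scalar sketch (not part of libyuv) of the integral-image averaging above:
+// the sum over the rectangle is topleft - topright - botleft + botright per
+// int32 channel, scaled by 1 / area. The SSE2 row uses an approximate float
+// reciprocal (rcpss), so rounding differs slightly from this integer divide.
+static void CumulativeSumToAverageRow_Sketch(const int32* topleft,
+                                             const int32* botleft, int width,
+                                             int area, uint8* dst, int count) {
+  for (int i = 0; i < count; ++i) {
+    for (int c = 0; c < 4; ++c) {
+      int32 sum = topleft[c] - topleft[width + c] -
+                  botleft[c] + botleft[width + c];
+      int32 avg = sum / area;
+      dst[c] = static_cast<uint8>(avg < 0 ? 0 : (avg > 255 ? 255 : avg));
+    }
+    topleft += 4;
+    botleft += 4;
+    dst += 4;
+  }
+}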
+
+#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
+// Creates a table of cumulative sums where each value is a sum of all values
+// above and to the left of the value.
+void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
+ const int32* previous_cumsum, int width) {
+ __asm {
+ mov eax, row
+ mov edx, cumsum
+ mov esi, previous_cumsum
+ mov ecx, width
+ sub esi, edx
+ pxor xmm0, xmm0
+ pxor xmm1, xmm1
+
+ sub ecx, 4
+ jl l4b
+ test edx, 15
+ jne l4b
+
+ // 4 pixel loop
+ align 4
+ l4:
+ movdqu xmm2, [eax] // 4 argb pixels 16 bytes.
+ lea eax, [eax + 16]
+ movdqa xmm4, xmm2
+
+ punpcklbw xmm2, xmm1
+ movdqa xmm3, xmm2
+ punpcklwd xmm2, xmm1
+ punpckhwd xmm3, xmm1
+
+ punpckhbw xmm4, xmm1
+ movdqa xmm5, xmm4
+ punpcklwd xmm4, xmm1
+ punpckhwd xmm5, xmm1
+
+ paddd xmm0, xmm2
+ movdqa xmm2, [edx + esi] // previous row above.
+ paddd xmm2, xmm0
+
+ paddd xmm0, xmm3
+ movdqa xmm3, [edx + esi + 16]
+ paddd xmm3, xmm0
+
+ paddd xmm0, xmm4
+ movdqa xmm4, [edx + esi + 32]
+ paddd xmm4, xmm0
+
+ paddd xmm0, xmm5
+ movdqa xmm5, [edx + esi + 48]
+ paddd xmm5, xmm0
+
+ movdqa [edx], xmm2
+ movdqa [edx + 16], xmm3
+ movdqa [edx + 32], xmm4
+ movdqa [edx + 48], xmm5
+
+ lea edx, [edx + 64]
+ sub ecx, 4
+ jge l4
+
+ l4b:
+ add ecx, 4 - 1
+ jl l1b
+
+ // 1 pixel loop
+ align 4
+ l1:
+ movd xmm2, dword ptr [eax] // 1 argb pixel 4 bytes.
+ lea eax, [eax + 4]
+ punpcklbw xmm2, xmm1
+ punpcklwd xmm2, xmm1
+ paddd xmm0, xmm2
+ movdqu xmm2, [edx + esi]
+ paddd xmm2, xmm0
+ movdqu [edx], xmm2
+ lea edx, [edx + 16]
+ sub ecx, 1
+ jge l1
+
+ l1b:
+ }
+}
+#endif // HAS_COMPUTECUMULATIVESUMROW_SSE2
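+
+// Scalar sketch (not part of libyuv) of the cumulative-sum row above: each
+// output int32 is the running sum of this row's channel values plus the
+// value directly above it from previous_cumsum.
+static void ComputeCumulativeSumRow_Sketch(const uint8* row, int32* cumsum,
+                                           const int32* previous_cumsum,
+                                           int width) {
+  int32 sum[4] = { 0, 0, 0, 0 };
+  for (int x = 0; x < width; ++x) {
+    for (int c = 0; c < 4; ++c) {
+      sum[c] += row[x * 4 + c];
+      cumsum[x * 4 + c] = sum[c] + previous_cumsum[x * 4 + c];
+    }
+  }
+}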
+
+#ifdef HAS_ARGBAFFINEROW_SSE2
+// Copy ARGB pixels from source image with slope to a row of destination.
+__declspec(naked) __declspec(align(16))
+LIBYUV_API
+void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
+ uint8* dst_argb, const float* uv_dudv, int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 12] // src_argb
+ mov esi, [esp + 16] // stride
+ mov edx, [esp + 20] // dst_argb
+ mov ecx, [esp + 24] // pointer to uv_dudv
+ movq xmm2, qword ptr [ecx] // uv
+ movq xmm7, qword ptr [ecx + 8] // dudv
+ mov ecx, [esp + 28] // width
+ shl esi, 16 // 4, stride
+ add esi, 4
+ movd xmm5, esi
+ sub ecx, 4
+ jl l4b
+
+ // setup for 4 pixel loop
+ pshufd xmm7, xmm7, 0x44 // dup dudv
+ pshufd xmm5, xmm5, 0 // dup 4, stride
+ movdqa xmm0, xmm2 // x0, y0, x1, y1
+ addps xmm0, xmm7
+ movlhps xmm2, xmm0
+ movdqa xmm4, xmm7
+ addps xmm4, xmm4 // dudv *= 2
+ movdqa xmm3, xmm2 // x2, y2, x3, y3
+ addps xmm3, xmm4
+ addps xmm4, xmm4 // dudv *= 4
+
+ // 4 pixel loop
+ align 4
+ l4:
+ cvttps2dq xmm0, xmm2 // x, y float to int first 2
+ cvttps2dq xmm1, xmm3 // x, y float to int next 2
+ packssdw xmm0, xmm1 // x, y as 8 shorts
+ pmaddwd xmm0, xmm5 // offsets = x * 4 + y * stride.
+ movd esi, xmm0
+ pshufd xmm0, xmm0, 0x39 // shift right
+ movd edi, xmm0
+ pshufd xmm0, xmm0, 0x39 // shift right
+ movd xmm1, [eax + esi] // read pixel 0
+ movd xmm6, [eax + edi] // read pixel 1
+ punpckldq xmm1, xmm6 // combine pixel 0 and 1
+ addps xmm2, xmm4 // x, y += dx, dy first 2
+ movq qword ptr [edx], xmm1
+ movd esi, xmm0
+ pshufd xmm0, xmm0, 0x39 // shift right
+ movd edi, xmm0
+ movd xmm6, [eax + esi] // read pixel 2
+ movd xmm0, [eax + edi] // read pixel 3
+ punpckldq xmm6, xmm0 // combine pixel 2 and 3
+ addps xmm3, xmm4 // x, y += dx, dy next 2
+ sub ecx, 4
+ movq qword ptr 8[edx], xmm6
+ lea edx, [edx + 16]
+ jge l4
+
+ l4b:
+ add ecx, 4 - 1
+ jl l1b
+
+ // 1 pixel loop
+ align 4
+ l1:
+ cvttps2dq xmm0, xmm2 // x, y float to int
+ packssdw xmm0, xmm0 // x, y as shorts
+ pmaddwd xmm0, xmm5 // offset = x * 4 + y * stride
+ addps xmm2, xmm7 // x, y += dx, dy
+ movd esi, xmm0
+ movd xmm0, [eax + esi] // copy a pixel
+ sub ecx, 1
+ movd [edx], xmm0
+ lea edx, [edx + 4]
+ jge l1
+ l1b:
+ pop edi
+ pop esi
+ ret
+ }
+}
+#endif // HAS_ARGBAFFINEROW_SSE2
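+
+// Scalar sketch (not part of libyuv) of the affine row above: sample the
+// source at truncated (u, v), then step by (du, dv) per destination pixel.
+// uv_dudv holds { u, v, du, dv }.
+static void ARGBAffineRow_Sketch(const uint8* src_argb, int src_argb_stride,
+                                 uint8* dst_argb, const float* uv_dudv,
+                                 int width) {
+  float u = uv_dudv[0];
+  float v = uv_dudv[1];
+  for (int i = 0; i < width; ++i) {
+    const int x = static_cast<int>(u);  // truncation, as cvttps2dq does.
+    const int y = static_cast<int>(v);
+    const uint8* src = src_argb + y * src_argb_stride + x * 4;
+    for (int c = 0; c < 4; ++c) {
+      dst_argb[i * 4 + c] = src[c];
+    }
+    u += uv_dudv[2];
+    v += uv_dudv[3];
+  }
+}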
+
+// Bilinear filter 16x2 -> 16x1
+__declspec(naked) __declspec(align(16))
+void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) {
+ __asm {
+ push esi
+ push edi
+ mov edi, [esp + 8 + 4] // dst_ptr
+ mov esi, [esp + 8 + 8] // src_ptr
+ mov edx, [esp + 8 + 12] // src_stride
+ mov ecx, [esp + 8 + 16] // dst_width
+ mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
+ sub edi, esi
+ shr eax, 1
+ // Dispatch to specialized filters if applicable.
+ cmp eax, 0
+ je xloop100 // 0 / 128. Blend 100 / 0.
+ cmp eax, 32
+ je xloop75 // 32 / 128 is 0.25. Blend 75 / 25.
+ cmp eax, 64
+ je xloop50 // 64 / 128 is 0.50. Blend 50 / 50.
+ cmp eax, 96
+ je xloop25 // 96 / 128 is 0.75. Blend 25 / 75.
+
+ movd xmm0, eax // high fraction 0..127
+ neg eax
+ add eax, 128
+ movd xmm5, eax // low fraction 128..1
+ punpcklbw xmm5, xmm0
+ punpcklwd xmm5, xmm5
+ pshufd xmm5, xmm5, 0
+
+ align 16
+ xloop:
+ movdqa xmm0, [esi]
+ movdqa xmm2, [esi + edx]
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, xmm2
+ punpckhbw xmm1, xmm2
+ pmaddubsw xmm0, xmm5
+ pmaddubsw xmm1, xmm5
+ psrlw xmm0, 7
+ psrlw xmm1, 7
+ packuswb xmm0, xmm1
+ sub ecx, 16
+ movdqa [esi + edi], xmm0
+ lea esi, [esi + 16]
+ jg xloop
+ jmp xloop99
+
+ // Blend 25 / 75.
+ align 16
+ xloop25:
+ movdqa xmm0, [esi]
+ movdqa xmm1, [esi + edx]
+ pavgb xmm0, xmm1
+ pavgb xmm0, xmm1
+ sub ecx, 16
+ movdqa [esi + edi], xmm0
+ lea esi, [esi + 16]
+ jg xloop25
+ jmp xloop99
+
+ // Blend 50 / 50.
+ align 16
+ xloop50:
+ movdqa xmm0, [esi]
+ movdqa xmm1, [esi + edx]
+ pavgb xmm0, xmm1
+ sub ecx, 16
+ movdqa [esi + edi], xmm0
+ lea esi, [esi + 16]
+ jg xloop50
+ jmp xloop99
+
+ // Blend 75 / 25.
+ align 16
+ xloop75:
+ movdqa xmm1, [esi]
+ movdqa xmm0, [esi + edx]
+ pavgb xmm0, xmm1
+ pavgb xmm0, xmm1
+ sub ecx, 16
+ movdqa [esi + edi], xmm0
+ lea esi, [esi + 16]
+ jg xloop75
+ jmp xloop99
+
+ // Blend 100 / 0 - Copy row unchanged.
+ align 16
+ xloop100:
+ movdqa xmm0, [esi]
+ sub ecx, 16
+ movdqa [esi + edi], xmm0
+ lea esi, [esi + 16]
+ jg xloop100
+
+ xloop99:
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+#ifdef HAS_INTERPOLATEROW_SSE2
+// Bilinear filter 16x2 -> 16x1
+__declspec(naked) __declspec(align(16))
+void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) {
+ __asm {
+ push esi
+ push edi
+ mov edi, [esp + 8 + 4] // dst_ptr
+ mov esi, [esp + 8 + 8] // src_ptr
+ mov edx, [esp + 8 + 12] // src_stride
+ mov ecx, [esp + 8 + 16] // dst_width
+ mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
+ sub edi, esi
+ // Dispatch to specialized filters if applicable.
+ cmp eax, 0
+ je xloop100 // 0 / 256. Blend 100 / 0.
+ cmp eax, 64
+ je xloop75 // 64 / 256 is 0.25. Blend 75 / 25.
+ cmp eax, 128
+ je xloop50 // 128 / 256 is 0.50. Blend 50 / 50.
+ cmp eax, 192
+ je xloop25 // 192 / 256 is 0.75. Blend 25 / 75.
+
+ movd xmm5, eax // xmm5 = y fraction
+ punpcklbw xmm5, xmm5
+ psrlw xmm5, 1
+ punpcklwd xmm5, xmm5
+ punpckldq xmm5, xmm5
+ punpcklqdq xmm5, xmm5
+ pxor xmm4, xmm4
+
+ align 16
+ xloop:
+ movdqa xmm0, [esi] // row0
+ movdqa xmm2, [esi + edx] // row1
+ movdqa xmm1, xmm0
+ movdqa xmm3, xmm2
+ punpcklbw xmm2, xmm4
+ punpckhbw xmm3, xmm4
+ punpcklbw xmm0, xmm4
+ punpckhbw xmm1, xmm4
+ psubw xmm2, xmm0 // row1 - row0
+ psubw xmm3, xmm1
+ paddw xmm2, xmm2 // 9 bits * 15 bits = 8.16
+ paddw xmm3, xmm3
+ pmulhw xmm2, xmm5 // scale diff
+ pmulhw xmm3, xmm5
+ paddw xmm0, xmm2 // sum rows
+ paddw xmm1, xmm3
+ packuswb xmm0, xmm1
+ sub ecx, 16
+ movdqa [esi + edi], xmm0
+ lea esi, [esi + 16]
+ jg xloop
+ jmp xloop99
+
+ // Blend 25 / 75.
+ align 16
+ xloop25:
+ movdqa xmm0, [esi]
+ movdqa xmm1, [esi + edx]
+ pavgb xmm0, xmm1
+ pavgb xmm0, xmm1
+ sub ecx, 16
+ movdqa [esi + edi], xmm0
+ lea esi, [esi + 16]
+ jg xloop25
+ jmp xloop99
+
+ // Blend 50 / 50.
+ align 16
+ xloop50:
+ movdqa xmm0, [esi]
+ movdqa xmm1, [esi + edx]
+ pavgb xmm0, xmm1
+ sub ecx, 16
+ movdqa [esi + edi], xmm0
+ lea esi, [esi + 16]
+ jg xloop50
+ jmp xloop99
+
+ // Blend 75 / 25.
+ align 16
+ xloop75:
+ movdqa xmm1, [esi]
+ movdqa xmm0, [esi + edx]
+ pavgb xmm0, xmm1
+ pavgb xmm0, xmm1
+ sub ecx, 16
+ movdqa [esi + edi], xmm0
+ lea esi, [esi + 16]
+ jg xloop75
+ jmp xloop99
+
+ // Blend 100 / 0 - Copy row unchanged.
+ align 16
+ xloop100:
+ movdqa xmm0, [esi]
+ sub ecx, 16
+ movdqa [esi + edi], xmm0
+ lea esi, [esi + 16]
+ jg xloop100
+
+ xloop99:
+ pop edi
+ pop esi
+ ret
+ }
+}
+#endif // HAS_INTERPOLATEROW_SSE2
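+
+// Scalar sketch (not part of libyuv) of the bilinear row blend above: each
+// output byte is row0 * (256 - f) + row1 * f scaled back down, where f is
+// source_y_fraction in 0..255. The SSSE3/SSE2 rows use 7-bit and 15-bit
+// fixed point respectively, so rounding differs slightly.
+static void InterpolateRow_Sketch(uint8* dst_ptr, const uint8* src_ptr,
+                                  ptrdiff_t src_stride, int dst_width,
+                                  int source_y_fraction) {
+  const int f = source_y_fraction;
+  const uint8* src_ptr1 = src_ptr + src_stride;
+  for (int x = 0; x < dst_width; ++x) {
+    dst_ptr[x] = static_cast<uint8>(
+        (src_ptr[x] * (256 - f) + src_ptr1[x] * f) >> 8);
+  }
+}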
+
+// Bilinear filter 16x2 -> 16x1
+__declspec(naked) __declspec(align(16))
+void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) {
+ __asm {
+ push esi
+ push edi
+ mov edi, [esp + 8 + 4] // dst_ptr
+ mov esi, [esp + 8 + 8] // src_ptr
+ mov edx, [esp + 8 + 12] // src_stride
+ mov ecx, [esp + 8 + 16] // dst_width
+ mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
+ sub edi, esi
+ shr eax, 1
+ // Dispatch to specialized filters if applicable.
+ cmp eax, 0
+ je xloop100 // 0 / 128. Blend 100 / 0.
+ cmp eax, 32
+ je xloop75 // 32 / 128 is 0.25. Blend 75 / 25.
+ cmp eax, 64
+ je xloop50 // 64 / 128 is 0.50. Blend 50 / 50.
+ cmp eax, 96
+ je xloop25 // 96 / 128 is 0.75. Blend 25 / 75.
+
+ movd xmm0, eax // high fraction 0..127
+ neg eax
+ add eax, 128
+ movd xmm5, eax // low fraction 128..1
+ punpcklbw xmm5, xmm0
+ punpcklwd xmm5, xmm5
+ pshufd xmm5, xmm5, 0
+
+ align 16
+ xloop:
+ movdqu xmm0, [esi]
+ movdqu xmm2, [esi + edx]
+ movdqu xmm1, xmm0
+ punpcklbw xmm0, xmm2
+ punpckhbw xmm1, xmm2
+ pmaddubsw xmm0, xmm5
+ pmaddubsw xmm1, xmm5
+ psrlw xmm0, 7
+ psrlw xmm1, 7
+ packuswb xmm0, xmm1
+ sub ecx, 16
+ movdqu [esi + edi], xmm0
+ lea esi, [esi + 16]
+ jg xloop
+ jmp xloop99
+
+ // Blend 25 / 75.
+ align 16
+ xloop25:
+ movdqu xmm0, [esi]
+ movdqu xmm1, [esi + edx]
+ pavgb xmm0, xmm1
+ pavgb xmm0, xmm1
+ sub ecx, 16
+ movdqu [esi + edi], xmm0
+ lea esi, [esi + 16]
+ jg xloop25
+ jmp xloop99
+
+ // Blend 50 / 50.
+ align 16
+ xloop50:
+ movdqu xmm0, [esi]
+ movdqu xmm1, [esi + edx]
+ pavgb xmm0, xmm1
+ sub ecx, 16
+ movdqu [esi + edi], xmm0
+ lea esi, [esi + 16]
+ jg xloop50
+ jmp xloop99
+
+ // Blend 75 / 25.
+ align 16
+ xloop75:
+ movdqu xmm1, [esi]
+ movdqu xmm0, [esi + edx]
+ pavgb xmm0, xmm1
+ pavgb xmm0, xmm1
+ sub ecx, 16
+ movdqu [esi + edi], xmm0
+ lea esi, [esi + 16]
+ jg xloop75
+ jmp xloop99
+
+ // Blend 100 / 0 - Copy row unchanged.
+ align 16
+ xloop100:
+ movdqu xmm0, [esi]
+ sub ecx, 16
+ movdqu [esi + edi], xmm0
+ lea esi, [esi + 16]
+ jg xloop100
+
+ xloop99:
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+#ifdef HAS_INTERPOLATEROW_SSE2
+// Bilinear filter 16x2 -> 16x1
+__declspec(naked) __declspec(align(16))
+void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) {
+ __asm {
+ push esi
+ push edi
+ mov edi, [esp + 8 + 4] // dst_ptr
+ mov esi, [esp + 8 + 8] // src_ptr
+ mov edx, [esp + 8 + 12] // src_stride
+ mov ecx, [esp + 8 + 16] // dst_width
+ mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
+ sub edi, esi
+ // Dispatch to specialized filters if applicable.
+ cmp eax, 0
+ je xloop100 // 0 / 256. Blend 100 / 0.
+ cmp eax, 64
+ je xloop75 // 64 / 256 is 0.25. Blend 75 / 25.
+ cmp eax, 128
+ je xloop50 // 128 / 256 is 0.50. Blend 50 / 50.
+ cmp eax, 192
+ je xloop25 // 192 / 256 is 0.75. Blend 25 / 75.
+
+ movd xmm5, eax // xmm5 = y fraction
+ punpcklbw xmm5, xmm5
+ psrlw xmm5, 1
+ punpcklwd xmm5, xmm5
+ punpckldq xmm5, xmm5
+ punpcklqdq xmm5, xmm5
+ pxor xmm4, xmm4
+
+ align 16
+ xloop:
+ movdqu xmm0, [esi] // row0
+ movdqu xmm2, [esi + edx] // row1
+ movdqu xmm1, xmm0
+ movdqu xmm3, xmm2
+ punpcklbw xmm2, xmm4
+ punpckhbw xmm3, xmm4
+ punpcklbw xmm0, xmm4
+ punpckhbw xmm1, xmm4
+ psubw xmm2, xmm0 // row1 - row0
+ psubw xmm3, xmm1
+ paddw xmm2, xmm2 // 9 bits * 15 bits = 8.16
+ paddw xmm3, xmm3
+ pmulhw xmm2, xmm5 // scale diff
+ pmulhw xmm3, xmm5
+ paddw xmm0, xmm2 // sum rows
+ paddw xmm1, xmm3
+ packuswb xmm0, xmm1
+ sub ecx, 16
+ movdqu [esi + edi], xmm0
+ lea esi, [esi + 16]
+ jg xloop
+ jmp xloop99
+
+ // Blend 25 / 75.
+ align 16
+ xloop25:
+ movdqu xmm0, [esi]
+ movdqu xmm1, [esi + edx]
+ pavgb xmm0, xmm1
+ pavgb xmm0, xmm1
+ sub ecx, 16
+ movdqu [esi + edi], xmm0
+ lea esi, [esi + 16]
+ jg xloop25
+ jmp xloop99
+
+ // Blend 50 / 50.
+ align 16
+ xloop50:
+ movdqu xmm0, [esi]
+ movdqu xmm1, [esi + edx]
+ pavgb xmm0, xmm1
+ sub ecx, 16
+ movdqu [esi + edi], xmm0
+ lea esi, [esi + 16]
+ jg xloop50
+ jmp xloop99
+
+ // Blend 75 / 25.
+ align 16
+ xloop75:
+ movdqu xmm1, [esi]
+ movdqu xmm0, [esi + edx]
+ pavgb xmm0, xmm1
+ pavgb xmm0, xmm1
+ sub ecx, 16
+ movdqu [esi + edi], xmm0
+ lea esi, [esi + 16]
+ jg xloop75
+ jmp xloop99
+
+ // Blend 100 / 0 - Copy row unchanged.
+ align 16
+ xloop100:
+ movdqu xmm0, [esi]
+ sub ecx, 16
+ movdqu [esi + edi], xmm0
+ lea esi, [esi + 16]
+ jg xloop100
+
+ xloop99:
+ pop edi
+ pop esi
+ ret
+ }
+}
+#endif // HAS_INTERPOLATEROW_SSE2
+
+__declspec(naked) __declspec(align(16))
+void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
+ uint8* dst_uv, int pix) {
+ __asm {
+ push edi
+ mov eax, [esp + 4 + 4] // src_uv
+ mov edx, [esp + 4 + 8] // src_uv_stride
+ mov edi, [esp + 4 + 12] // dst_uv
+ mov ecx, [esp + 4 + 16] // pix
+ sub edi, eax
+
+ align 16
+ convertloop:
+ movdqa xmm0, [eax]
+ pavgb xmm0, [eax + edx]
+ sub ecx, 16
+ movdqa [eax + edi], xmm0
+ lea eax, [eax + 16]
+ jg convertloop
+ pop edi
+ ret
+ }
+}
+
+#ifdef HAS_HALFROW_AVX2
+__declspec(naked) __declspec(align(16))
+void HalfRow_AVX2(const uint8* src_uv, int src_uv_stride,
+ uint8* dst_uv, int pix) {
+ __asm {
+ push edi
+ mov eax, [esp + 4 + 4] // src_uv
+ mov edx, [esp + 4 + 8] // src_uv_stride
+ mov edi, [esp + 4 + 12] // dst_uv
+ mov ecx, [esp + 4 + 16] // pix
+ sub edi, eax
+
+ align 16
+ convertloop:
+ vmovdqu ymm0, [eax]
+ vpavgb ymm0, ymm0, [eax + edx]
+ sub ecx, 32
+ vmovdqu [eax + edi], ymm0
+ lea eax, [eax + 32]
+ jg convertloop
+
+ pop edi
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_HALFROW_AVX2
+
+__declspec(naked) __declspec(align(16))
+void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
+ uint32 selector, int pix) {
+ __asm {
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_bayer
+ movd xmm5, [esp + 12] // selector
+ mov ecx, [esp + 16] // pix
+ pshufd xmm5, xmm5, 0
+
+ align 16
+ wloop:
+ movdqa xmm0, [eax]
+ movdqa xmm1, [eax + 16]
+ lea eax, [eax + 32]
+ pshufb xmm0, xmm5
+ pshufb xmm1, xmm5
+ punpckldq xmm0, xmm1
+ sub ecx, 8
+ movq qword ptr [edx], xmm0
+ lea edx, [edx + 8]
+ jg wloop
+ ret
+ }
+}
+
+// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
+__declspec(naked) __declspec(align(16))
+void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
+ const uint8* shuffler, int pix) {
+ __asm {
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // shuffler
+ movdqa xmm5, [ecx]
+ mov ecx, [esp + 16] // pix
+
+ align 16
+ wloop:
+ movdqa xmm0, [eax]
+ movdqa xmm1, [eax + 16]
+ lea eax, [eax + 32]
+ pshufb xmm0, xmm5
+ pshufb xmm1, xmm5
+ sub ecx, 8
+ movdqa [edx], xmm0
+ movdqa [edx + 16], xmm1
+ lea edx, [edx + 32]
+ jg wloop
+ ret
+ }
+}
+
+__declspec(naked) __declspec(align(16))
+void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb,
+ const uint8* shuffler, int pix) {
+ __asm {
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // shuffler
+ movdqa xmm5, [ecx]
+ mov ecx, [esp + 16] // pix
+
+ align 16
+ wloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ lea eax, [eax + 32]
+ pshufb xmm0, xmm5
+ pshufb xmm1, xmm5
+ sub ecx, 8
+ movdqu [edx], xmm0
+ movdqu [edx + 16], xmm1
+ lea edx, [edx + 32]
+ jg wloop
+ ret
+ }
+}
+
+#ifdef HAS_ARGBSHUFFLEROW_AVX2
+__declspec(naked) __declspec(align(16))
+void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
+ const uint8* shuffler, int pix) {
+ __asm {
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_argb
+ mov ecx, [esp + 12] // shuffler
+ vmovdqa xmm5, [ecx]
+ vpermq ymm5, ymm5, 0x44 // same shuffle in high as low.
+ mov ecx, [esp + 16] // pix
+
+ align 16
+ wloop:
+ vmovdqu ymm0, [eax]
+ vmovdqu ymm1, [eax + 32]
+ lea eax, [eax + 64]
+ vpshufb ymm0, ymm0, ymm5
+ vpshufb ymm1, ymm1, ymm5
+ sub ecx, 16
+ vmovdqu [edx], ymm0
+ vmovdqu [edx + 32], ymm1
+ lea edx, [edx + 64]
+ jg wloop
+
+ vzeroupper
+ ret
+ }
+}
+#endif
+
+// YUY2 - Macro-pixel = 2 image pixels
+// Y0U0Y1V0....Y2U2Y3V2...Y4U4Y5V4....
+
+// UYVY - Macro-pixel = 2 image pixels
+// U0Y0V0Y1....U2Y2V2Y3...U4Y4V4Y5....
+
+__declspec(naked) __declspec(align(16))
+void I422ToYUY2Row_SSE2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_frame, int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // src_y
+ mov esi, [esp + 8 + 8] // src_u
+ mov edx, [esp + 8 + 12] // src_v
+ mov edi, [esp + 8 + 16] // dst_frame
+ mov ecx, [esp + 8 + 20] // width
+ sub edx, esi
+
+ align 16
+ convertloop:
+ movq xmm2, qword ptr [esi] // U
+ movq xmm3, qword ptr [esi + edx] // V
+ lea esi, [esi + 8]
+ punpcklbw xmm2, xmm3 // UV
+ movdqu xmm0, [eax] // Y
+ lea eax, [eax + 16]
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, xmm2 // YUYV
+ punpckhbw xmm1, xmm2
+ movdqu [edi], xmm0
+ movdqu [edi + 16], xmm1
+ lea edi, [edi + 32]
+ sub ecx, 16
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
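+
+// Scalar sketch (not part of libyuv) of the YUY2 packing above: every pair of
+// Y samples is interleaved with one U and one V sample as Y0 U0 Y1 V0.
+static void I422ToYUY2Row_Sketch(const uint8* src_y, const uint8* src_u,
+                                 const uint8* src_v, uint8* dst_frame,
+                                 int width) {
+  for (int x = 0; x < width - 1; x += 2) {
+    dst_frame[0] = src_y[0];
+    dst_frame[1] = src_u[0];
+    dst_frame[2] = src_y[1];
+    dst_frame[3] = src_v[0];
+    dst_frame += 4;
+    src_y += 2;
+    src_u += 1;
+    src_v += 1;
+  }
+}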
+
+__declspec(naked) __declspec(align(16))
+void I422ToUYVYRow_SSE2(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_frame, int width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // src_y
+ mov esi, [esp + 8 + 8] // src_u
+ mov edx, [esp + 8 + 12] // src_v
+ mov edi, [esp + 8 + 16] // dst_frame
+ mov ecx, [esp + 8 + 20] // width
+ sub edx, esi
+
+ align 16
+ convertloop:
+ movq xmm2, qword ptr [esi] // U
+ movq xmm3, qword ptr [esi + edx] // V
+ lea esi, [esi + 8]
+ punpcklbw xmm2, xmm3 // UV
+ movdqu xmm0, [eax] // Y
+ movdqa xmm1, xmm2
+ lea eax, [eax + 16]
+ punpcklbw xmm1, xmm0 // UYVY
+ punpckhbw xmm2, xmm0
+ movdqu [edi], xmm1
+ movdqu [edi + 16], xmm2
+ lea edi, [edi + 32]
+ sub ecx, 16
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/chromium/third_party/libyuv/source/row_x86.asm b/chromium/third_party/libyuv/source/row_x86.asm
new file mode 100644
index 00000000000..80a9716bae2
--- /dev/null
+++ b/chromium/third_party/libyuv/source/row_x86.asm
@@ -0,0 +1,146 @@
+;
+; Copyright 2012 The LibYuv Project Authors. All rights reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+%ifdef __YASM_VERSION_ID__
+%if __YASM_VERSION_ID__ < 01020000h
+%error AVX2 is supported only by yasm 1.2.0 or later.
+%endif
+%endif
+%include "x86inc.asm"
+
+SECTION .text
+
+; cglobal numeric constants are parameters, gpr regs, mm regs
+
+; void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix)
+
+%macro YUY2TOYROW 2-3
+cglobal %1ToYRow%3, 3, 3, 3, src_yuy2, dst_y, pix
+%ifidn %1,YUY2
+ pcmpeqb m2, m2, m2 ; generate mask 0x00ff00ff
+ psrlw m2, m2, 8
+%endif
+
+ ALIGN 16
+.convertloop:
+ mov%2 m0, [src_yuy2q]
+ mov%2 m1, [src_yuy2q + mmsize]
+ lea src_yuy2q, [src_yuy2q + mmsize * 2]
+%ifidn %1,YUY2
+ pand m0, m0, m2 ; YUY2 even bytes are Y
+ pand m1, m1, m2
+%else
+ psrlw m0, m0, 8 ; UYVY odd bytes are Y
+ psrlw m1, m1, 8
+%endif
+ packuswb m0, m0, m1
+%if cpuflag(AVX2)
+ vpermq m0, m0, 0xd8
+%endif
+ sub pixd, mmsize
+ mov%2 [dst_yq], m0
+ lea dst_yq, [dst_yq + mmsize]
+ jg .convertloop
+ REP_RET
+%endmacro
+
+; TODO(fbarchard): Remove MMX. Add SSSE3 pshufb version.
+INIT_MMX MMX
+YUY2TOYROW YUY2,a,
+YUY2TOYROW YUY2,u,_Unaligned
+YUY2TOYROW UYVY,a,
+YUY2TOYROW UYVY,u,_Unaligned
+INIT_XMM SSE2
+YUY2TOYROW YUY2,a,
+YUY2TOYROW YUY2,u,_Unaligned
+YUY2TOYROW UYVY,a,
+YUY2TOYROW UYVY,u,_Unaligned
+INIT_YMM AVX2
+YUY2TOYROW YUY2,a,
+YUY2TOYROW UYVY,a,
+
+; void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix)
+
+%macro SplitUVRow 1-2
+cglobal SplitUVRow%2, 4, 4, 5, src_uv, dst_u, dst_v, pix
+ pcmpeqb m4, m4, m4 ; generate mask 0x00ff00ff
+ psrlw m4, m4, 8
+ sub dst_vq, dst_uq
+
+ ALIGN 16
+.convertloop:
+ mov%1 m0, [src_uvq]
+ mov%1 m1, [src_uvq + mmsize]
+ lea src_uvq, [src_uvq + mmsize * 2]
+ psrlw m2, m0, 8 ; odd bytes
+ psrlw m3, m1, 8
+ pand m0, m0, m4 ; even bytes
+ pand m1, m1, m4
+ packuswb m0, m0, m1
+ packuswb m2, m2, m3
+%if cpuflag(AVX2)
+ vpermq m0, m0, 0xd8
+ vpermq m2, m2, 0xd8
+%endif
+ mov%1 [dst_uq], m0
+ mov%1 [dst_uq + dst_vq], m2
+ lea dst_uq, [dst_uq + mmsize]
+ sub pixd, mmsize
+ jg .convertloop
+ REP_RET
+%endmacro
+
+INIT_MMX MMX
+SplitUVRow a,
+SplitUVRow u,_Unaligned
+INIT_XMM SSE2
+SplitUVRow a,
+SplitUVRow u,_Unaligned
+INIT_YMM AVX2
+SplitUVRow a,
+
+; void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+; int width);
+
+%macro MergeUVRow_ 1-2
+cglobal MergeUVRow_%2, 4, 4, 3, src_u, src_v, dst_uv, pix
+ sub src_vq, src_uq
+
+ ALIGN 16
+.convertloop:
+ mov%1 m0, [src_uq]
+ mov%1 m1, [src_vq]
+ lea src_uq, [src_uq + mmsize]
+ punpcklbw m2, m0, m1 ; first 8 UV pairs
+ punpckhbw m0, m0, m1 ; next 8 UV pairs
+%if cpuflag(AVX2)
+ vperm2i128 m1, m2, m0, 0x20 ; low 128 of ymm2 and low 128 of ymm0
+ vperm2i128 m2, m2, m0, 0x31 ; high 128 of ymm2 and high 128 of ymm0
+ mov%1 [dst_uvq], m1
+ mov%1 [dst_uvq + mmsize], m2
+%else
+ mov%1 [dst_uvq], m2
+ mov%1 [dst_uvq + mmsize], m0
+%endif
+ lea dst_uvq, [dst_uvq + mmsize * 2]
+ sub pixd, mmsize
+ jg .convertloop
+ REP_RET
+%endmacro
+
+INIT_MMX MMX
+MergeUVRow_ a,
+MergeUVRow_ u,_Unaligned
+INIT_XMM SSE2
+MergeUVRow_ a,
+MergeUVRow_ u,_Unaligned
+INIT_YMM AVX2
+MergeUVRow_ a,
+
diff --git a/chromium/third_party/libyuv/source/scale.cc b/chromium/third_party/libyuv/source/scale.cc
new file mode 100644
index 00000000000..77af420b3f3
--- /dev/null
+++ b/chromium/third_party/libyuv/source/scale.cc
@@ -0,0 +1,2455 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/scale.h"
+
+#include <assert.h>
+#include <string.h>
+
+#include "libyuv/cpu_id.h"
+#include "libyuv/planar_functions.h" // For CopyPlane
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+static __inline int Abs(int v) {
+ return v >= 0 ? v : -v;
+}
+
+static __inline int Half(int v) {
+ return v >= 0 ? ((v + 1) >> 1) : -((-v + 1) >> 1);
+}
+
+// Note: Some SSE2 reference manuals
+// cpuvol1.pdf agner_instruction_tables.pdf 253666.pdf 253667.pdf
+
+// Set the following flag to true to revert to only
+// using the reference implementation ScalePlaneBox(), and
+// NOT the optimized versions. Useful for debugging and
+// when comparing the quality of the resulting YUV planes
+// as produced by the optimized and non-optimized versions.
+static bool use_reference_impl_ = false;
+
+LIBYUV_API
+void SetUseReferenceImpl(bool use) {
+ use_reference_impl_ = use;
+}
+
+// ScaleRowDown2Int also used by planar functions
+// NEON downscalers with interpolation.
+
+#if !defined(LIBYUV_DISABLE_NEON) && \
+ (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
+#define HAS_SCALEROWDOWN2_NEON
+// Note - not static due to reuse in convert for 444 to 420.
+void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */,
+ uint8* dst, int dst_width);
+
+void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width);
+
+#define HAS_SCALEROWDOWN4_NEON
+void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */,
+ uint8* dst_ptr, int dst_width);
+void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+
+#define HAS_SCALEROWDOWN34_NEON
+// Downscale from 4 to 3 pixels. Use the NEON multilane read/write
+// to load every 4th pixel into 4 different registers.
+// Point samples 32 pixels to 24 pixels.
+void ScaleRowDown34_NEON(const uint8* src_ptr,
+ ptrdiff_t /* src_stride */,
+ uint8* dst_ptr, int dst_width);
+void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+
+#define HAS_SCALEROWDOWN38_NEON
+// 32 -> 12
+void ScaleRowDown38_NEON(const uint8* src_ptr,
+ ptrdiff_t /* src_stride */,
+ uint8* dst_ptr, int dst_width);
+// 32x3 -> 12x1
+void ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+// 32x2 -> 12x1
+void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+
+// SSE2 downscalers with interpolation.
+// Constants for SSSE3 code
+#elif !defined(LIBYUV_DISABLE_X86) && \
+ (defined(_M_IX86) || defined(__i386__) || defined(__x86_64__))
+// GCC 4.2 on OSX has link error when passing static or const to inline.
+// TODO(fbarchard): Use static const when gcc 4.2 support is dropped.
+#ifdef __APPLE__
+#define CONST
+#else
+#define CONST static const
+#endif
+
+// Offsets for source bytes 0 to 9
+CONST uvec8 kShuf0 =
+ { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 };
+
+// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
+CONST uvec8 kShuf1 =
+ { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 };
+
+// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
+CONST uvec8 kShuf2 =
+ { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 };
+
+// Offsets for source bytes 0 to 10
+CONST uvec8 kShuf01 =
+ { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 };
+
+// Offsets for source bytes 10 to 21 with 8 subtracted = 2 to 13.
+CONST uvec8 kShuf11 =
+ { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 };
+
+// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
+CONST uvec8 kShuf21 =
+ { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 };
+
+// Coefficients for source bytes 0 to 10
+CONST uvec8 kMadd01 =
+ { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 };
+
+// Coefficients for source bytes 10 to 21
+CONST uvec8 kMadd11 =
+ { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 };
+
+// Coefficients for source bytes 21 to 31
+CONST uvec8 kMadd21 =
+ { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 };
+
+// Coefficients for source bytes 21 to 31
+CONST vec16 kRound34 =
+ { 2, 2, 2, 2, 2, 2, 2, 2 };
+
+CONST uvec8 kShuf38a =
+ { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
+
+CONST uvec8 kShuf38b =
+ { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 };
+
+// Arrange words 0,3,6 into 0,1,2
+CONST uvec8 kShufAc =
+ { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
+
+// Arrange words 0,3,6 into 3,4,5
+CONST uvec8 kShufAc3 =
+ { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 };
+
+// Scaling values for boxes of 3x3 and 2x3
+CONST uvec16 kScaleAc33 =
+ { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 };
+
+// Arrange first value for pixels 0,1,2,3,4,5
+CONST uvec8 kShufAb0 =
+ { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 };
+
+// Arrange second value for pixels 0,1,2,3,4,5
+CONST uvec8 kShufAb1 =
+ { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 };
+
+// Arrange third value for pixels 0,1,2,3,4,5
+CONST uvec8 kShufAb2 =
+ { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };
+
+// Scaling values for boxes of 3x2 and 2x2
+CONST uvec16 kScaleAb2 =
+ { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
+#endif
+
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
+#define HAS_SCALEROWDOWN2_SSE2
+// Reads 32 pixels, throws half away and writes 16 pixels.
+// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
+__declspec(naked) __declspec(align(16))
+static void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ __asm {
+ mov eax, [esp + 4] // src_ptr
+ // src_stride ignored
+ mov edx, [esp + 12] // dst_ptr
+ mov ecx, [esp + 16] // dst_width
+
+ align 16
+ wloop:
+ movdqa xmm0, [eax]
+ movdqa xmm1, [eax + 16]
+ lea eax, [eax + 32]
+ psrlw xmm0, 8 // isolate odd pixels.
+ psrlw xmm1, 8
+ packuswb xmm0, xmm1
+ sub ecx, 16
+ movdqa [edx], xmm0
+ lea edx, [edx + 16]
+ jg wloop
+
+ ret
+ }
+}
+
+// Blends 32x2 rectangle to 16x1.
+// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
+__declspec(naked) __declspec(align(16))
+void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // src_ptr
+ mov esi, [esp + 4 + 8] // src_stride
+ mov edx, [esp + 4 + 12] // dst_ptr
+ mov ecx, [esp + 4 + 16] // dst_width
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
+ psrlw xmm5, 8
+
+ align 16
+ wloop:
+ movdqa xmm0, [eax]
+ movdqa xmm1, [eax + 16]
+ movdqa xmm2, [eax + esi]
+ movdqa xmm3, [eax + esi + 16]
+ lea eax, [eax + 32]
+ pavgb xmm0, xmm2 // average rows
+ pavgb xmm1, xmm3
+
+ movdqa xmm2, xmm0 // average columns (32 to 16 pixels)
+ psrlw xmm0, 8
+ movdqa xmm3, xmm1
+ psrlw xmm1, 8
+ pand xmm2, xmm5
+ pand xmm3, xmm5
+ pavgw xmm0, xmm2
+ pavgw xmm1, xmm3
+ packuswb xmm0, xmm1
+
+ sub ecx, 16
+ movdqa [edx], xmm0
+ lea edx, [edx + 16]
+ jg wloop
+
+ pop esi
+ ret
+ }
+}
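+
+// Scalar sketch (not part of libyuv) of the 2x2 box downscale above: the two
+// source rows are averaged with rounding (as pavgb does), then adjacent
+// columns are averaged with rounding (as pavgw does on the masked words).
+static void ScaleRowDown2Box_Sketch(const uint8* src_ptr, ptrdiff_t src_stride,
+                                    uint8* dst_ptr, int dst_width) {
+  const uint8* s = src_ptr;
+  const uint8* t = src_ptr + src_stride;
+  for (int x = 0; x < dst_width; ++x) {
+    const int a = (s[2 * x] + t[2 * x] + 1) >> 1;          // average rows.
+    const int b = (s[2 * x + 1] + t[2 * x + 1] + 1) >> 1;
+    dst_ptr[x] = static_cast<uint8>((a + b + 1) >> 1);     // average columns.
+  }
+}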
+
+// Reads 32 pixels, throws half away and writes 16 pixels.
+// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
+__declspec(naked) __declspec(align(16))
+static void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ __asm {
+ mov eax, [esp + 4] // src_ptr
+ // src_stride ignored
+ mov edx, [esp + 12] // dst_ptr
+ mov ecx, [esp + 16] // dst_width
+
+ align 16
+ wloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ lea eax, [eax + 32]
+ psrlw xmm0, 8 // isolate odd pixels.
+ psrlw xmm1, 8
+ packuswb xmm0, xmm1
+ sub ecx, 16
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ jg wloop
+
+ ret
+ }
+}
+
+// Blends 32x2 rectangle to 16x1.
+// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
+__declspec(naked) __declspec(align(16))
+static void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // src_ptr
+ mov esi, [esp + 4 + 8] // src_stride
+ mov edx, [esp + 4 + 12] // dst_ptr
+ mov ecx, [esp + 4 + 16] // dst_width
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
+ psrlw xmm5, 8
+
+ align 16
+ wloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + esi]
+ movdqu xmm3, [eax + esi + 16]
+ lea eax, [eax + 32]
+ pavgb xmm0, xmm2 // average rows
+ pavgb xmm1, xmm3
+
+ movdqa xmm2, xmm0 // average columns (32 to 16 pixels)
+ psrlw xmm0, 8
+ movdqa xmm3, xmm1
+ psrlw xmm1, 8
+ pand xmm2, xmm5
+ pand xmm3, xmm5
+ pavgw xmm0, xmm2
+ pavgw xmm1, xmm3
+ packuswb xmm0, xmm1
+
+ sub ecx, 16
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ jg wloop
+
+ pop esi
+ ret
+ }
+}
+
+#define HAS_SCALEROWDOWN4_SSE2
+// Point samples 32 pixels to 8 pixels.
+// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
+__declspec(naked) __declspec(align(16))
+static void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ __asm {
+ mov eax, [esp + 4] // src_ptr
+ // src_stride ignored
+ mov edx, [esp + 12] // dst_ptr
+ mov ecx, [esp + 16] // dst_width
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ff0000
+ psrld xmm5, 24
+ pslld xmm5, 16
+
+ align 16
+ wloop:
+ movdqa xmm0, [eax]
+ movdqa xmm1, [eax + 16]
+ lea eax, [eax + 32]
+ pand xmm0, xmm5
+ pand xmm1, xmm5
+ packuswb xmm0, xmm1
+ psrlw xmm0, 8
+ packuswb xmm0, xmm0
+ sub ecx, 8
+ movq qword ptr [edx], xmm0
+ lea edx, [edx + 8]
+ jg wloop
+
+ ret
+ }
+}
+
+// Blends 32x4 rectangle to 8x1.
+// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
+__declspec(naked) __declspec(align(16))
+static void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // src_ptr
+ mov esi, [esp + 8 + 8] // src_stride
+ mov edx, [esp + 8 + 12] // dst_ptr
+ mov ecx, [esp + 8 + 16] // dst_width
+ lea edi, [esi + esi * 2] // src_stride * 3
+ pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff
+ psrlw xmm7, 8
+
+ align 16
+ wloop:
+ movdqa xmm0, [eax]
+ movdqa xmm1, [eax + 16]
+ movdqa xmm2, [eax + esi]
+ movdqa xmm3, [eax + esi + 16]
+ pavgb xmm0, xmm2 // average rows
+ pavgb xmm1, xmm3
+ movdqa xmm2, [eax + esi * 2]
+ movdqa xmm3, [eax + esi * 2 + 16]
+ movdqa xmm4, [eax + edi]
+ movdqa xmm5, [eax + edi + 16]
+ lea eax, [eax + 32]
+ pavgb xmm2, xmm4
+ pavgb xmm3, xmm5
+ pavgb xmm0, xmm2
+ pavgb xmm1, xmm3
+
+ movdqa xmm2, xmm0 // average columns (32 to 16 pixels)
+ psrlw xmm0, 8
+ movdqa xmm3, xmm1
+ psrlw xmm1, 8
+ pand xmm2, xmm7
+ pand xmm3, xmm7
+ pavgw xmm0, xmm2
+ pavgw xmm1, xmm3
+ packuswb xmm0, xmm1
+
+ movdqa xmm2, xmm0 // average columns (16 to 8 pixels)
+ psrlw xmm0, 8
+ pand xmm2, xmm7
+ pavgw xmm0, xmm2
+ packuswb xmm0, xmm0
+
+ sub ecx, 8
+ movq qword ptr [edx], xmm0
+ lea edx, [edx + 8]
+ jg wloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+#define HAS_SCALEROWDOWN34_SSSE3
+// Point samples 32 pixels to 24 pixels.
+// Produces three 8 byte values; each 8 bytes of output is shuffled out of
+// a 16 byte window of the source.
+
+// Note that movdqa+palign may be better than movdqu.
+// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
+__declspec(naked) __declspec(align(16))
+static void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ __asm {
+ mov eax, [esp + 4] // src_ptr
+ // src_stride ignored
+ mov edx, [esp + 12] // dst_ptr
+ mov ecx, [esp + 16] // dst_width
+ movdqa xmm3, kShuf0
+ movdqa xmm4, kShuf1
+ movdqa xmm5, kShuf2
+
+ align 16
+ wloop:
+ movdqa xmm0, [eax]
+ movdqa xmm1, [eax + 16]
+ lea eax, [eax + 32]
+ movdqa xmm2, xmm1
+ palignr xmm1, xmm0, 8
+ pshufb xmm0, xmm3
+ pshufb xmm1, xmm4
+ pshufb xmm2, xmm5
+ movq qword ptr [edx], xmm0
+ movq qword ptr [edx + 8], xmm1
+ movq qword ptr [edx + 16], xmm2
+ lea edx, [edx + 24]
+ sub ecx, 24
+ jg wloop
+
+ ret
+ }
+}
+
+// Blends 32x2 rectangle to 24x1.
+// Produces three 8 byte values; each 8 bytes of output is shuffled out of
+// a 16 byte window of the two averaged source rows.
+
+// Register usage:
+// xmm0 src_row 0
+// xmm1 src_row 1
+// xmm2 shuf 0
+// xmm3 shuf 1
+// xmm4 shuf 2
+// xmm5 madd 0
+// xmm6 madd 1
+// xmm7 kRound34
+
+// Note that movdqa+palign may be better than movdqu.
+// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
+__declspec(naked) __declspec(align(16))
+static void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // src_ptr
+ mov esi, [esp + 4 + 8] // src_stride
+ mov edx, [esp + 4 + 12] // dst_ptr
+ mov ecx, [esp + 4 + 16] // dst_width
+ movdqa xmm2, kShuf01
+ movdqa xmm3, kShuf11
+ movdqa xmm4, kShuf21
+ movdqa xmm5, kMadd01
+ movdqa xmm6, kMadd11
+ movdqa xmm7, kRound34
+
+ align 16
+ wloop:
+ movdqa xmm0, [eax] // pixels 0..7
+ movdqa xmm1, [eax + esi]
+ pavgb xmm0, xmm1
+ pshufb xmm0, xmm2
+ pmaddubsw xmm0, xmm5
+ paddsw xmm0, xmm7
+ psrlw xmm0, 2
+ packuswb xmm0, xmm0
+ movq qword ptr [edx], xmm0
+ movdqu xmm0, [eax + 8] // pixels 8..15
+ movdqu xmm1, [eax + esi + 8]
+ pavgb xmm0, xmm1
+ pshufb xmm0, xmm3
+ pmaddubsw xmm0, xmm6
+ paddsw xmm0, xmm7
+ psrlw xmm0, 2
+ packuswb xmm0, xmm0
+ movq qword ptr [edx + 8], xmm0
+ movdqa xmm0, [eax + 16] // pixels 16..23
+ movdqa xmm1, [eax + esi + 16]
+ lea eax, [eax + 32]
+ pavgb xmm0, xmm1
+ pshufb xmm0, xmm4
+ movdqa xmm1, kMadd21
+ pmaddubsw xmm0, xmm1
+ paddsw xmm0, xmm7
+ psrlw xmm0, 2
+ packuswb xmm0, xmm0
+ sub ecx, 24
+ movq qword ptr [edx + 16], xmm0
+ lea edx, [edx + 24]
+ jg wloop
+
+ pop esi
+ ret
+ }
+}
+
+// Note that movdqa+palign may be better than movdqu.
+// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
+__declspec(naked) __declspec(align(16))
+static void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // src_ptr
+ mov esi, [esp + 4 + 8] // src_stride
+ mov edx, [esp + 4 + 12] // dst_ptr
+ mov ecx, [esp + 4 + 16] // dst_width
+ movdqa xmm2, kShuf01
+ movdqa xmm3, kShuf11
+ movdqa xmm4, kShuf21
+ movdqa xmm5, kMadd01
+ movdqa xmm6, kMadd11
+ movdqa xmm7, kRound34
+
+ align 16
+ wloop:
+ movdqa xmm0, [eax] // pixels 0..7
+ movdqa xmm1, [eax + esi]
+ pavgb xmm1, xmm0
+ pavgb xmm0, xmm1
+ pshufb xmm0, xmm2
+ pmaddubsw xmm0, xmm5
+ paddsw xmm0, xmm7
+ psrlw xmm0, 2
+ packuswb xmm0, xmm0
+ movq qword ptr [edx], xmm0
+ movdqu xmm0, [eax + 8] // pixels 8..15
+ movdqu xmm1, [eax + esi + 8]
+ pavgb xmm1, xmm0
+ pavgb xmm0, xmm1
+ pshufb xmm0, xmm3
+ pmaddubsw xmm0, xmm6
+ paddsw xmm0, xmm7
+ psrlw xmm0, 2
+ packuswb xmm0, xmm0
+ movq qword ptr [edx + 8], xmm0
+ movdqa xmm0, [eax + 16] // pixels 16..23
+ movdqa xmm1, [eax + esi + 16]
+ lea eax, [eax + 32]
+ pavgb xmm1, xmm0
+ pavgb xmm0, xmm1
+ pshufb xmm0, xmm4
+ movdqa xmm1, kMadd21
+ pmaddubsw xmm0, xmm1
+ paddsw xmm0, xmm7
+ psrlw xmm0, 2
+ packuswb xmm0, xmm0
+ sub ecx, 24
+ movq qword ptr [edx + 16], xmm0
+ lea edx, [edx+24]
+ jg wloop
+
+ pop esi
+ ret
+ }
+}
+
+#define HAS_SCALEROWDOWN38_SSSE3
+// 3/8 point sampler
+
+// Scale 32 pixels to 12
+__declspec(naked) __declspec(align(16))
+static void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ __asm {
+ mov eax, [esp + 4] // src_ptr
+ // src_stride ignored
+ mov edx, [esp + 12] // dst_ptr
+ mov ecx, [esp + 16] // dst_width
+ movdqa xmm4, kShuf38a
+ movdqa xmm5, kShuf38b
+
+ align 16
+ xloop:
+ movdqa xmm0, [eax] // 16 pixels -> 0,1,2,3,4,5
+ movdqa xmm1, [eax + 16] // 16 pixels -> 6,7,8,9,10,11
+ lea eax, [eax + 32]
+ pshufb xmm0, xmm4
+ pshufb xmm1, xmm5
+ paddusb xmm0, xmm1
+
+ sub ecx, 12
+ movq qword ptr [edx], xmm0 // write 12 pixels
+ movhlps xmm1, xmm0
+ movd [edx + 8], xmm1
+ lea edx, [edx + 12]
+ jg xloop
+
+ ret
+ }
+}
+
+// Scale 16x3 pixels to 6x1 with interpolation
+__declspec(naked) __declspec(align(16))
+static void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // src_ptr
+ mov esi, [esp + 4 + 8] // src_stride
+ mov edx, [esp + 4 + 12] // dst_ptr
+ mov ecx, [esp + 4 + 16] // dst_width
+ movdqa xmm2, kShufAc
+ movdqa xmm3, kShufAc3
+ movdqa xmm4, kScaleAc33
+ pxor xmm5, xmm5
+
+ align 16
+ xloop:
+ movdqa xmm0, [eax] // sum up 3 rows into xmm0/1
+ movdqa xmm6, [eax + esi]
+ movhlps xmm1, xmm0
+ movhlps xmm7, xmm6
+ punpcklbw xmm0, xmm5
+ punpcklbw xmm1, xmm5
+ punpcklbw xmm6, xmm5
+ punpcklbw xmm7, xmm5
+ paddusw xmm0, xmm6
+ paddusw xmm1, xmm7
+ movdqa xmm6, [eax + esi * 2]
+ lea eax, [eax + 16]
+ movhlps xmm7, xmm6
+ punpcklbw xmm6, xmm5
+ punpcklbw xmm7, xmm5
+ paddusw xmm0, xmm6
+ paddusw xmm1, xmm7
+
+ movdqa xmm6, xmm0 // 8 pixels -> 0,1,2 of xmm6
+ psrldq xmm0, 2
+ paddusw xmm6, xmm0
+ psrldq xmm0, 2
+ paddusw xmm6, xmm0
+ pshufb xmm6, xmm2
+
+ movdqa xmm7, xmm1 // 8 pixels -> 3,4,5 of xmm6
+ psrldq xmm1, 2
+ paddusw xmm7, xmm1
+ psrldq xmm1, 2
+ paddusw xmm7, xmm1
+ pshufb xmm7, xmm3
+ paddusw xmm6, xmm7
+
+ pmulhuw xmm6, xmm4 // divide by 9,9,6, 9,9,6
+ packuswb xmm6, xmm6
+
+ sub ecx, 6
+ movd [edx], xmm6 // write 6 pixels
+ psrlq xmm6, 16
+ movd [edx + 2], xmm6
+ lea edx, [edx + 6]
+ jg xloop
+
+ pop esi
+ ret
+ }
+}
+
+// Scale 16x2 pixels to 6x1 with interpolation
+__declspec(naked) __declspec(align(16))
+static void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // src_ptr
+ mov esi, [esp + 4 + 8] // src_stride
+ mov edx, [esp + 4 + 12] // dst_ptr
+ mov ecx, [esp + 4 + 16] // dst_width
+ movdqa xmm2, kShufAb0
+ movdqa xmm3, kShufAb1
+ movdqa xmm4, kShufAb2
+ movdqa xmm5, kScaleAb2
+
+ align 16
+ xloop:
+ movdqa xmm0, [eax] // average 2 rows into xmm0
+ pavgb xmm0, [eax + esi]
+ lea eax, [eax + 16]
+
+ movdqa xmm1, xmm0 // 16 pixels -> 0,1,2,3,4,5 of xmm1
+ pshufb xmm1, xmm2
+ movdqa xmm6, xmm0
+ pshufb xmm6, xmm3
+ paddusw xmm1, xmm6
+ pshufb xmm0, xmm4
+ paddusw xmm1, xmm0
+
+ pmulhuw xmm1, xmm5 // divide by 3,3,2, 3,3,2
+ packuswb xmm1, xmm1
+
+ sub ecx, 6
+ movd [edx], xmm1 // write 6 pixels
+ psrlq xmm1, 16
+ movd [edx + 2], xmm1
+ lea edx, [edx + 6]
+ jg xloop
+
+ pop esi
+ ret
+ }
+}
+
+#define HAS_SCALEADDROWS_SSE2
+
+// Reads 16xN bytes and produces 16 shorts at a time.
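+// Sums accumulate in 16-bit lanes with unsigned saturation (paddusw), so a
+// column of up to 257 rows of value 255 fits without clipping.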
+__declspec(naked) __declspec(align(16))
+static void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint16* dst_ptr, int src_width,
+ int src_height) {
+ __asm {
+ push esi
+ push edi
+ push ebx
+ push ebp
+ mov esi, [esp + 16 + 4] // src_ptr
+ mov edx, [esp + 16 + 8] // src_stride
+ mov edi, [esp + 16 + 12] // dst_ptr
+ mov ecx, [esp + 16 + 16] // src_width
+ mov ebx, [esp + 16 + 20] // src_height
+ pxor xmm4, xmm4
+ dec ebx
+
+ align 16
+ xloop:
+ // first row
+ movdqa xmm0, [esi]
+ lea eax, [esi + edx]
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, xmm4
+ punpckhbw xmm1, xmm4
+ lea esi, [esi + 16]
+ mov ebp, ebx
+ test ebp, ebp
+ je ydone
+
+ // sum remaining rows
+ align 16
+ yloop:
+ movdqa xmm2, [eax] // read 16 pixels
+ lea eax, [eax + edx] // advance to next row
+ movdqa xmm3, xmm2
+ punpcklbw xmm2, xmm4
+ punpckhbw xmm3, xmm4
+ paddusw xmm0, xmm2 // sum 16 words
+ paddusw xmm1, xmm3
+ sub ebp, 1
+ jg yloop
+ ydone:
+ movdqa [edi], xmm0
+ movdqa [edi + 16], xmm1
+ lea edi, [edi + 32]
+
+ sub ecx, 16
+ jg xloop
+
+ pop ebp
+ pop ebx
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+#elif !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
+// GCC versions of row functions are verbatim conversions from Visual C.
+// Generated using gcc disassembly on Visual C object file:
+// objdump -D yuvscaler.obj >yuvscaler.txt
+#define HAS_SCALEROWDOWN2_SSE2
+static void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1"
+#endif
+ );
+}
+
+void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "movdqa (%0,%3,1),%%xmm2 \n"
+ "movdqa 0x10(%0,%3,1),%%xmm3 \n"
+ "lea 0x20(%0),%0 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "pavgb %%xmm3,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "pand %%xmm5,%%xmm2 \n"
+ "pand %%xmm5,%%xmm3 \n"
+ "pavgw %%xmm2,%%xmm0 \n"
+ "pavgw %%xmm3,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"(static_cast<intptr_t>(src_stride)) // %3
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+#endif
+ );
+}
+
+static void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1"
+#endif
+ );
+}
+
+static void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqu 0x10(%0),%%xmm1 \n"
+ "movdqu (%0,%3,1),%%xmm2 \n"
+ "movdqu 0x10(%0,%3,1),%%xmm3 \n"
+ "lea 0x20(%0),%0 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "pavgb %%xmm3,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "pand %%xmm5,%%xmm2 \n"
+ "pand %%xmm5,%%xmm3 \n"
+ "pavgw %%xmm2,%%xmm0 \n"
+ "pavgw %%xmm3,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"(static_cast<intptr_t>(src_stride)) // %3
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+#endif
+ );
+}
+
+#define HAS_SCALEROWDOWN4_SSE2
+static void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrld $0x18,%%xmm5 \n"
+ "pslld $0x10,%%xmm5 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movq %%xmm0,(%1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm5"
+#endif
+ );
+}
+
+static void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ intptr_t stridex3 = 0;
+ asm volatile (
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "psrlw $0x8,%%xmm7 \n"
+ "lea (%4,%4,2),%3 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "movdqa (%0,%4,1),%%xmm2 \n"
+ "movdqa 0x10(%0,%4,1),%%xmm3 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "pavgb %%xmm3,%%xmm1 \n"
+ "movdqa (%0,%4,2),%%xmm2 \n"
+ "movdqa 0x10(%0,%4,2),%%xmm3 \n"
+ "movdqa (%0,%3,1),%%xmm4 \n"
+ "movdqa 0x10(%0,%3,1),%%xmm5 \n"
+ "lea 0x20(%0),%0 \n"
+ "pavgb %%xmm4,%%xmm2 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "pavgb %%xmm5,%%xmm3 \n"
+ "pavgb %%xmm3,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "pand %%xmm7,%%xmm2 \n"
+ "pand %%xmm7,%%xmm3 \n"
+ "pavgw %%xmm2,%%xmm0 \n"
+ "pavgw %%xmm3,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "pand %%xmm7,%%xmm2 \n"
+ "pavgw %%xmm2,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movq %%xmm0,(%1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(stridex3) // %3
+ : "r"(static_cast<intptr_t>(src_stride)) // %4
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm7"
+#endif
+ );
+}
+
+#define HAS_SCALEROWDOWN34_SSSE3
+static void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ "movdqa %0,%%xmm3 \n"
+ "movdqa %1,%%xmm4 \n"
+ "movdqa %2,%%xmm5 \n"
+ :
+ : "m"(kShuf0), // %0
+ "m"(kShuf1), // %1
+ "m"(kShuf2) // %2
+ );
+ asm volatile (
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm2 \n"
+ "lea 0x20(%0),%0 \n"
+ "movdqa %%xmm2,%%xmm1 \n"
+ "palignr $0x8,%%xmm0,%%xmm1 \n"
+ "pshufb %%xmm3,%%xmm0 \n"
+ "pshufb %%xmm4,%%xmm1 \n"
+ "pshufb %%xmm5,%%xmm2 \n"
+ "movq %%xmm0,(%1) \n"
+ "movq %%xmm1,0x8(%1) \n"
+ "movq %%xmm2,0x10(%1) \n"
+ "lea 0x18(%1),%1 \n"
+ "sub $0x18,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+
+static void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ "movdqa %0,%%xmm2 \n" // kShuf01
+ "movdqa %1,%%xmm3 \n" // kShuf11
+ "movdqa %2,%%xmm4 \n" // kShuf21
+ :
+ : "m"(kShuf01), // %0
+ "m"(kShuf11), // %1
+ "m"(kShuf21) // %2
+ );
+ asm volatile (
+ "movdqa %0,%%xmm5 \n" // kMadd01
+ "movdqa %1,%%xmm0 \n" // kMadd11
+ "movdqa %2,%%xmm1 \n" // kRound34
+ :
+ : "m"(kMadd01), // %0
+ "m"(kMadd11), // %1
+ "m"(kRound34) // %2
+ );
+ asm volatile (
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm6 \n"
+ "movdqa (%0,%3),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+ "pshufb %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm5,%%xmm6 \n"
+ "paddsw %%xmm1,%%xmm6 \n"
+ "psrlw $0x2,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movq %%xmm6,(%1) \n"
+ "movdqu 0x8(%0),%%xmm6 \n"
+ "movdqu 0x8(%0,%3),%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+ "pshufb %%xmm3,%%xmm6 \n"
+ "pmaddubsw %%xmm0,%%xmm6 \n"
+ "paddsw %%xmm1,%%xmm6 \n"
+ "psrlw $0x2,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movq %%xmm6,0x8(%1) \n"
+ "movdqa 0x10(%0),%%xmm6 \n"
+ "movdqa 0x10(%0,%3),%%xmm7 \n"
+ "lea 0x20(%0),%0 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+ "pshufb %%xmm4,%%xmm6 \n"
+ "pmaddubsw %4,%%xmm6 \n"
+ "paddsw %%xmm1,%%xmm6 \n"
+ "psrlw $0x2,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movq %%xmm6,0x10(%1) \n"
+ "lea 0x18(%1),%1 \n"
+ "sub $0x18,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"(static_cast<intptr_t>(src_stride)), // %3
+ "m"(kMadd21) // %4
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+#endif
+ );
+}
+
+static void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ "movdqa %0,%%xmm2 \n" // kShuf01
+ "movdqa %1,%%xmm3 \n" // kShuf11
+ "movdqa %2,%%xmm4 \n" // kShuf21
+ :
+ : "m"(kShuf01), // %0
+ "m"(kShuf11), // %1
+ "m"(kShuf21) // %2
+ );
+ asm volatile (
+ "movdqa %0,%%xmm5 \n" // kMadd01
+ "movdqa %1,%%xmm0 \n" // kMadd11
+ "movdqa %2,%%xmm1 \n" // kRound34
+ :
+ : "m"(kMadd01), // %0
+ "m"(kMadd11), // %1
+ "m"(kRound34) // %2
+ );
+
+ asm volatile (
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm6 \n"
+ "movdqa (%0,%3,1),%%xmm7 \n"
+ "pavgb %%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+ "pshufb %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm5,%%xmm6 \n"
+ "paddsw %%xmm1,%%xmm6 \n"
+ "psrlw $0x2,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movq %%xmm6,(%1) \n"
+ "movdqu 0x8(%0),%%xmm6 \n"
+ "movdqu 0x8(%0,%3,1),%%xmm7 \n"
+ "pavgb %%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+ "pshufb %%xmm3,%%xmm6 \n"
+ "pmaddubsw %%xmm0,%%xmm6 \n"
+ "paddsw %%xmm1,%%xmm6 \n"
+ "psrlw $0x2,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movq %%xmm6,0x8(%1) \n"
+ "movdqa 0x10(%0),%%xmm6 \n"
+ "movdqa 0x10(%0,%3,1),%%xmm7 \n"
+ "lea 0x20(%0),%0 \n"
+ "pavgb %%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+ "pshufb %%xmm4,%%xmm6 \n"
+ "pmaddubsw %4,%%xmm6 \n"
+ "paddsw %%xmm1,%%xmm6 \n"
+ "psrlw $0x2,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movq %%xmm6,0x10(%1) \n"
+ "lea 0x18(%1),%1 \n"
+ "sub $0x18,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"(static_cast<intptr_t>(src_stride)), // %3
+ "m"(kMadd21) // %4
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+#endif
+ );
+}
+
+#define HAS_SCALEROWDOWN38_SSSE3
+static void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ "movdqa %3,%%xmm4 \n"
+ "movdqa %4,%%xmm5 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "pshufb %%xmm5,%%xmm1 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "movq %%xmm0,(%1) \n"
+ "movhlps %%xmm0,%%xmm1 \n"
+ "movd %%xmm1,0x8(%1) \n"
+ "lea 0xc(%1),%1 \n"
+ "sub $0xc,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "m"(kShuf38a), // %3
+ "m"(kShuf38b) // %4
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm4", "xmm5"
+#endif
+ );
+}
+
+static void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ "movdqa %0,%%xmm2 \n"
+ "movdqa %1,%%xmm3 \n"
+ "movdqa %2,%%xmm4 \n"
+ "movdqa %3,%%xmm5 \n"
+ :
+ : "m"(kShufAb0), // %0
+ "m"(kShufAb1), // %1
+ "m"(kShufAb2), // %2
+ "m"(kScaleAb2) // %3
+ );
+ asm volatile (
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "pavgb (%0,%3,1),%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pshufb %%xmm2,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm6 \n"
+ "pshufb %%xmm3,%%xmm6 \n"
+ "paddusw %%xmm6,%%xmm1 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "paddusw %%xmm0,%%xmm1 \n"
+ "pmulhuw %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "sub $0x6,%2 \n"
+ "movd %%xmm1,(%1) \n"
+ "psrlq $0x10,%%xmm1 \n"
+ "movd %%xmm1,0x2(%1) \n"
+ "lea 0x6(%1),%1 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"(static_cast<intptr_t>(src_stride)) // %3
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+#endif
+ );
+}
+
+static void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ "movdqa %0,%%xmm2 \n"
+ "movdqa %1,%%xmm3 \n"
+ "movdqa %2,%%xmm4 \n"
+ "pxor %%xmm5,%%xmm5 \n"
+ :
+ : "m"(kShufAc), // %0
+ "m"(kShufAc3), // %1
+ "m"(kScaleAc33) // %2
+ );
+ asm volatile (
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa (%0,%3,1),%%xmm6 \n"
+ "movhlps %%xmm0,%%xmm1 \n"
+ "movhlps %%xmm6,%%xmm7 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "punpcklbw %%xmm5,%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm6 \n"
+ "punpcklbw %%xmm5,%%xmm7 \n"
+ "paddusw %%xmm6,%%xmm0 \n"
+ "paddusw %%xmm7,%%xmm1 \n"
+ "movdqa (%0,%3,2),%%xmm6 \n"
+ "lea 0x10(%0),%0 \n"
+ "movhlps %%xmm6,%%xmm7 \n"
+ "punpcklbw %%xmm5,%%xmm6 \n"
+ "punpcklbw %%xmm5,%%xmm7 \n"
+ "paddusw %%xmm6,%%xmm0 \n"
+ "paddusw %%xmm7,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm6 \n"
+ "psrldq $0x2,%%xmm0 \n"
+ "paddusw %%xmm0,%%xmm6 \n"
+ "psrldq $0x2,%%xmm0 \n"
+ "paddusw %%xmm0,%%xmm6 \n"
+ "pshufb %%xmm2,%%xmm6 \n"
+ "movdqa %%xmm1,%%xmm7 \n"
+ "psrldq $0x2,%%xmm1 \n"
+ "paddusw %%xmm1,%%xmm7 \n"
+ "psrldq $0x2,%%xmm1 \n"
+ "paddusw %%xmm1,%%xmm7 \n"
+ "pshufb %%xmm3,%%xmm7 \n"
+ "paddusw %%xmm7,%%xmm6 \n"
+ "pmulhuw %%xmm4,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "sub $0x6,%2 \n"
+ "movd %%xmm6,(%1) \n"
+ "psrlq $0x10,%%xmm6 \n"
+ "movd %%xmm6,0x2(%1) \n"
+ "lea 0x6(%1),%1 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"(static_cast<intptr_t>(src_stride)) // %3
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+#endif
+ );
+}
+
+#define HAS_SCALEADDROWS_SSE2
+static void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint16* dst_ptr, int src_width, int src_height) {
+ int tmp_height = 0;
+ intptr_t tmp_src = 0;
+ asm volatile (
+ "pxor %%xmm4,%%xmm4 \n"
+ "sub $0x1,%5 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "mov %0,%3 \n"
+ "add %6,%0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm4,%%xmm0 \n"
+ "punpckhbw %%xmm4,%%xmm1 \n"
+ "mov %5,%2 \n"
+ "test %2,%2 \n"
+ "je 3f \n"
+ "2: \n"
+ "movdqa (%0),%%xmm2 \n"
+ "add %6,%0 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "punpcklbw %%xmm4,%%xmm2 \n"
+ "punpckhbw %%xmm4,%%xmm3 \n"
+ "paddusw %%xmm2,%%xmm0 \n"
+ "paddusw %%xmm3,%%xmm1 \n"
+ "sub $0x1,%2 \n"
+ "jg 2b \n"
+ "3: \n"
+ "movdqa %%xmm0,(%1) \n"
+ "movdqa %%xmm1,0x10(%1) \n"
+ "lea 0x10(%3),%0 \n"
+ "lea 0x20(%1),%1 \n"
+ "sub $0x10,%4 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(tmp_height), // %2
+ "+r"(tmp_src), // %3
+ "+r"(src_width), // %4
+ "+rm"(src_height) // %5
+ : "rm"(static_cast<intptr_t>(src_stride)) // %6
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
+#endif
+ );
+}
+
+#endif // defined(__x86_64__) || defined(__i386__)
+
+#if !defined(LIBYUV_DISABLE_MIPS) && \
+ defined(__mips_dsp) && (__mips_dsp_rev >= 2)
+#define HAS_SCALEROWDOWN2_MIPS_DSPR2
+void ScaleRowDown2_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */,
+ uint8* dst, int dst_width);
+void ScaleRowDown2Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width);
+#define HAS_SCALEROWDOWN4_MIPS_DSPR2
+void ScaleRowDown4_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */,
+ uint8* dst, int dst_width);
+void ScaleRowDown4Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width);
+#define HAS_SCALEROWDOWN34_MIPS_DSPR2
+void ScaleRowDown34_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */,
+ uint8* dst, int dst_width);
+void ScaleRowDown34_0_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* d, int dst_width);
+void ScaleRowDown34_1_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* d, int dst_width);
+#define HAS_SCALEROWDOWN38_MIPS_DSPR2
+void ScaleRowDown38_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */,
+ uint8* dst, int dst_width);
+void ScaleRowDown38_2_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+void ScaleRowDown38_3_Box_MIPS_DSPR2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+#endif // defined(__mips_dsp) && (__mips_dsp_rev >= 2)
+
+// CPU-agnostic row functions.
+static void ScaleRowDown2_C(const uint8* src_ptr, ptrdiff_t /* src_stride */,
+ uint8* dst, int dst_width) {
+ uint8* dend = dst + dst_width - 1;
+ do {
+ dst[0] = src_ptr[1];
+ dst[1] = src_ptr[3];
+ dst += 2;
+ src_ptr += 4;
+ } while (dst < dend);
+ if (dst_width & 1) {
+ dst[0] = src_ptr[1];
+ }
+}
+
+void ScaleRowDown2Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width) {
+ const uint8* s = src_ptr;
+ const uint8* t = src_ptr + src_stride;
+ uint8* dend = dst + dst_width - 1;
+ do {
+ dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;
+ dst[1] = (s[2] + s[3] + t[2] + t[3] + 2) >> 2;
+ dst += 2;
+ s += 4;
+ t += 4;
+ } while (dst < dend);
+ if (dst_width & 1) {
+ dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;
+ }
+}
+
+static void ScaleRowDown4_C(const uint8* src_ptr, ptrdiff_t /* src_stride */,
+ uint8* dst, int dst_width) {
+ uint8* dend = dst + dst_width - 1;
+ do {
+ dst[0] = src_ptr[2];
+ dst[1] = src_ptr[6];
+ dst += 2;
+ src_ptr += 8;
+ } while (dst < dend);
+ if (dst_width & 1) {
+ dst[0] = src_ptr[2];
+ }
+}
+
+static void ScaleRowDown4Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width) {
+ intptr_t stride = src_stride;
+ uint8* dend = dst + dst_width - 1;
+ do {
+ dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
+ src_ptr[stride + 0] + src_ptr[stride + 1] +
+ src_ptr[stride + 2] + src_ptr[stride + 3] +
+ src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] +
+ src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] +
+ src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] +
+ src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] +
+ 8) >> 4;
+ dst[1] = (src_ptr[4] + src_ptr[5] + src_ptr[6] + src_ptr[7] +
+ src_ptr[stride + 4] + src_ptr[stride + 5] +
+ src_ptr[stride + 6] + src_ptr[stride + 7] +
+ src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5] +
+ src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7] +
+ src_ptr[stride * 3 + 4] + src_ptr[stride * 3 + 5] +
+ src_ptr[stride * 3 + 6] + src_ptr[stride * 3 + 7] +
+ 8) >> 4;
+ dst += 2;
+ src_ptr += 8;
+ } while (dst < dend);
+ if (dst_width & 1) {
+ dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
+ src_ptr[stride + 0] + src_ptr[stride + 1] +
+ src_ptr[stride + 2] + src_ptr[stride + 3] +
+ src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] +
+ src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] +
+ src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] +
+ src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] +
+ 8) >> 4;
+ }
+}
+
+static void ScaleRowDown34_C(const uint8* src_ptr, ptrdiff_t /* src_stride */,
+ uint8* dst, int dst_width) {
+ assert((dst_width % 3 == 0) && (dst_width > 0));
+ uint8* dend = dst + dst_width;
+ do {
+ dst[0] = src_ptr[0];
+ dst[1] = src_ptr[1];
+ dst[2] = src_ptr[3];
+ dst += 3;
+ src_ptr += 4;
+ } while (dst < dend);
+}
+
+// Filter rows 0 and 1 together, 3 : 1
+static void ScaleRowDown34_0_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* d, int dst_width) {
+ assert((dst_width % 3 == 0) && (dst_width > 0));
+ const uint8* s = src_ptr;
+ const uint8* t = src_ptr + src_stride;
+ uint8* dend = d + dst_width;
+ do {
+ uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
+ uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
+ uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;
+ uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
+ uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
+ uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
+ d[0] = (a0 * 3 + b0 + 2) >> 2;
+ d[1] = (a1 * 3 + b1 + 2) >> 2;
+ d[2] = (a2 * 3 + b2 + 2) >> 2;
+ d += 3;
+ s += 4;
+ t += 4;
+ } while (d < dend);
+}
+
+// Filter rows 1 and 2 together, 1 : 1
+static void ScaleRowDown34_1_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* d, int dst_width) {
+ assert((dst_width % 3 == 0) && (dst_width > 0));
+ const uint8* s = src_ptr;
+ const uint8* t = src_ptr + src_stride;
+ uint8* dend = d + dst_width;
+ do {
+ uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
+ uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
+ uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;
+ uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
+ uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
+ uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
+ d[0] = (a0 + b0 + 1) >> 1;
+ d[1] = (a1 + b1 + 1) >> 1;
+ d[2] = (a2 + b2 + 1) >> 1;
+ d += 3;
+ s += 4;
+ t += 4;
+ } while (d < dend);
+}
+
+// (1-f)a + fb can be replaced with a + f(b-a)
+#define BLENDER(a, b, f) (static_cast<int>(a) + \
+ ((f) * (static_cast<int>(b) - static_cast<int>(a)) >> 16))
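+// For example, BLENDER(100, 200, 0x8000) = 100 + ((0x8000 * 100) >> 16) = 150,
+// i.e. a 50/50 blend when the 16.16 fraction f is 0.5.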
+
+static void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr,
+ int dst_width, int x, int dx) {
+ for (int j = 0; j < dst_width - 1; j += 2) {
+ int xi = x >> 16;
+ int a = src_ptr[xi];
+ int b = src_ptr[xi + 1];
+ dst_ptr[0] = BLENDER(a, b, x & 0xffff);
+ x += dx;
+ xi = x >> 16;
+ a = src_ptr[xi];
+ b = src_ptr[xi + 1];
+ dst_ptr[1] = BLENDER(a, b, x & 0xffff);
+ x += dx;
+ dst_ptr += 2;
+ }
+ if (dst_width & 1) {
+ int xi = x >> 16;
+ int a = src_ptr[xi];
+ int b = src_ptr[xi + 1];
+ dst_ptr[0] = BLENDER(a, b, x & 0xffff);
+ }
+}
+
+static void ScaleRowDown38_C(const uint8* src_ptr, ptrdiff_t /* src_stride */,
+ uint8* dst, int dst_width) {
+ assert(dst_width % 3 == 0);
+ for (int x = 0; x < dst_width; x += 3) {
+ dst[0] = src_ptr[0];
+ dst[1] = src_ptr[3];
+ dst[2] = src_ptr[6];
+ dst += 3;
+ src_ptr += 8;
+ }
+}
+
+// 8x3 -> 3x1
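+// The three outputs average a 3x3, 3x3 and 2x3 box respectively, hence the
+// fixed point reciprocals 65536 / 9, 65536 / 9 and 65536 / 6 below.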
+static void ScaleRowDown38_3_Box_C(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ assert((dst_width % 3 == 0) && (dst_width > 0));
+ intptr_t stride = src_stride;
+ for (int i = 0; i < dst_width; i += 3) {
+ dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
+ src_ptr[stride + 0] + src_ptr[stride + 1] +
+ src_ptr[stride + 2] + src_ptr[stride * 2 + 0] +
+ src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) *
+ (65536 / 9) >> 16;
+ dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
+ src_ptr[stride + 3] + src_ptr[stride + 4] +
+ src_ptr[stride + 5] + src_ptr[stride * 2 + 3] +
+ src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) *
+ (65536 / 9) >> 16;
+ dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
+ src_ptr[stride + 6] + src_ptr[stride + 7] +
+ src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) *
+ (65536 / 6) >> 16;
+ src_ptr += 8;
+ dst_ptr += 3;
+ }
+}
+
+// 8x2 -> 3x1
+static void ScaleRowDown38_2_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ assert((dst_width % 3 == 0) && (dst_width > 0));
+ intptr_t stride = src_stride;
+ for (int i = 0; i < dst_width; i += 3) {
+ dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
+ src_ptr[stride + 0] + src_ptr[stride + 1] +
+ src_ptr[stride + 2]) * (65536 / 6) >> 16;
+ dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
+ src_ptr[stride + 3] + src_ptr[stride + 4] +
+ src_ptr[stride + 5]) * (65536 / 6) >> 16;
+ dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
+ src_ptr[stride + 6] + src_ptr[stride + 7]) *
+ (65536 / 4) >> 16;
+ src_ptr += 8;
+ dst_ptr += 3;
+ }
+}
+
+void ScaleAddRows_C(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint16* dst_ptr, int src_width, int src_height) {
+ assert(src_width > 0);
+ assert(src_height > 0);
+ for (int x = 0; x < src_width; ++x) {
+ const uint8* s = src_ptr + x;
+ int sum = 0;
+ for (int y = 0; y < src_height; ++y) {
+ sum += s[0];
+ s += src_stride;
+ }
+ dst_ptr[x] = sum;
+ }
+}
+
+// Scale plane, 1/2
+// This is an optimized version for scaling down a plane to 1/2 of
+// its original size.
+
+static void ScalePlaneDown2(int /* src_width */, int /* src_height */,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint8* src_ptr, uint8* dst_ptr,
+ FilterMode filtering) {
+ void (*ScaleRowDown2)(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) =
+ filtering ? ScaleRowDown2Box_C : ScaleRowDown2_C;
+ int row_stride = src_stride << 1;
+ if (!filtering) {
+ src_ptr += src_stride; // Point to odd rows.
+ src_stride = 0;
+ }
+
+#if defined(HAS_SCALEROWDOWN2_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 16)) {
+ ScaleRowDown2 = filtering ? ScaleRowDown2Box_NEON : ScaleRowDown2_NEON;
+ }
+#elif defined(HAS_SCALEROWDOWN2_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 16)) {
+ ScaleRowDown2 = filtering ? ScaleRowDown2Box_Unaligned_SSE2 :
+ ScaleRowDown2_Unaligned_SSE2;
+ if (IS_ALIGNED(src_ptr, 16) &&
+ IS_ALIGNED(src_stride, 16) && IS_ALIGNED(row_stride, 16) &&
+ IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
+ ScaleRowDown2 = filtering ? ScaleRowDown2Box_SSE2 : ScaleRowDown2_SSE2;
+ }
+ }
+#elif defined(HAS_SCALEROWDOWN2_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(src_ptr, 4) &&
+ IS_ALIGNED(src_stride, 4) && IS_ALIGNED(row_stride, 4) &&
+ IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
+ ScaleRowDown2 = filtering ?
+ ScaleRowDown2Box_MIPS_DSPR2 : ScaleRowDown2_MIPS_DSPR2;
+ }
+#endif
+
+ // TODO(fbarchard): Loop through source height to allow odd height.
+ for (int y = 0; y < dst_height; ++y) {
+ ScaleRowDown2(src_ptr, src_stride, dst_ptr, dst_width);
+ src_ptr += row_stride;
+ dst_ptr += dst_stride;
+ }
+}
+
+// Scale plane, 1/4
+// This is an optimized version for scaling down a plane to 1/4 of
+// its original size.
+
+static void ScalePlaneDown4(int /* src_width */, int /* src_height */,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint8* src_ptr, uint8* dst_ptr,
+ FilterMode filtering) {
+ void (*ScaleRowDown4)(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) =
+ filtering ? ScaleRowDown4Box_C : ScaleRowDown4_C;
+ int row_stride = src_stride << 2;
+ if (!filtering) {
+ src_ptr += src_stride * 2; // Point to row 2.
+ src_stride = 0;
+ }
+#if defined(HAS_SCALEROWDOWN4_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8)) {
+ ScaleRowDown4 = filtering ? ScaleRowDown4Box_NEON : ScaleRowDown4_NEON;
+ }
+#elif defined(HAS_SCALEROWDOWN4_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) &&
+ IS_ALIGNED(dst_width, 8) && IS_ALIGNED(row_stride, 16) &&
+ IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
+ ScaleRowDown4 = filtering ? ScaleRowDown4Box_SSE2 : ScaleRowDown4_SSE2;
+ }
+#elif defined(HAS_SCALEROWDOWN4_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(row_stride, 4) &&
+ IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
+ IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
+ ScaleRowDown4 = filtering ?
+ ScaleRowDown4Box_MIPS_DSPR2 : ScaleRowDown4_MIPS_DSPR2;
+ }
+#endif
+
+ for (int y = 0; y < dst_height; ++y) {
+ ScaleRowDown4(src_ptr, src_stride, dst_ptr, dst_width);
+ src_ptr += row_stride;
+ dst_ptr += dst_stride;
+ }
+}
+
+// Scale plane down, 3/4
+
+static void ScalePlaneDown34(int /* src_width */, int /* src_height */,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint8* src_ptr, uint8* dst_ptr,
+ FilterMode filtering) {
+ assert(dst_width % 3 == 0);
+ void (*ScaleRowDown34_0)(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+ void (*ScaleRowDown34_1)(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+ if (!filtering) {
+ ScaleRowDown34_0 = ScaleRowDown34_C;
+ ScaleRowDown34_1 = ScaleRowDown34_C;
+ } else {
+ ScaleRowDown34_0 = ScaleRowDown34_0_Box_C;
+ ScaleRowDown34_1 = ScaleRowDown34_1_Box_C;
+ }
+#if defined(HAS_SCALEROWDOWN34_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && (dst_width % 24 == 0)) {
+ if (!filtering) {
+ ScaleRowDown34_0 = ScaleRowDown34_NEON;
+ ScaleRowDown34_1 = ScaleRowDown34_NEON;
+ } else {
+ ScaleRowDown34_0 = ScaleRowDown34_0_Box_NEON;
+ ScaleRowDown34_1 = ScaleRowDown34_1_Box_NEON;
+ }
+ }
+#endif
+#if defined(HAS_SCALEROWDOWN34_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0) &&
+ IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
+ if (!filtering) {
+ ScaleRowDown34_0 = ScaleRowDown34_SSSE3;
+ ScaleRowDown34_1 = ScaleRowDown34_SSSE3;
+ } else {
+ ScaleRowDown34_0 = ScaleRowDown34_0_Box_SSSE3;
+ ScaleRowDown34_1 = ScaleRowDown34_1_Box_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_SCALEROWDOWN34_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) && (dst_width % 24 == 0) &&
+ IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
+ IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
+ if (!filtering) {
+ ScaleRowDown34_0 = ScaleRowDown34_MIPS_DSPR2;
+ ScaleRowDown34_1 = ScaleRowDown34_MIPS_DSPR2;
+ } else {
+ ScaleRowDown34_0 = ScaleRowDown34_0_Box_MIPS_DSPR2;
+ ScaleRowDown34_1 = ScaleRowDown34_1_Box_MIPS_DSPR2;
+ }
+ }
+#endif
+
+ for (int y = 0; y < dst_height - 2; y += 3) {
+ ScaleRowDown34_0(src_ptr, src_stride, dst_ptr, dst_width);
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ ScaleRowDown34_1(src_ptr, src_stride, dst_ptr, dst_width);
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ ScaleRowDown34_0(src_ptr + src_stride, -src_stride,
+ dst_ptr, dst_width);
+ src_ptr += src_stride * 2;
+ dst_ptr += dst_stride;
+ }
+
+ // Remaining 1 or 2 rows, with the last row vertically unfiltered.
+ if ((dst_height % 3) == 2) {
+ ScaleRowDown34_0(src_ptr, src_stride, dst_ptr, dst_width);
+ src_ptr += src_stride;
+ dst_ptr += dst_stride;
+ ScaleRowDown34_1(src_ptr, 0, dst_ptr, dst_width);
+ } else if ((dst_height % 3) == 1) {
+ ScaleRowDown34_0(src_ptr, 0, dst_ptr, dst_width);
+ }
+}
+
+
+// Scale plane, 3/8
+// This is an optimized version for scaling down a plane to 3/8
+// of its original size.
+//
+// Uses a box filter arranged like this:
+// aaabbbcc -> abc
+// aaabbbcc def
+// aaabbbcc ghi
+// dddeeeff
+// dddeeeff
+// dddeeeff
+// ggghhhii
+// ggghhhii
+// Boxes are 3x3, 2x3, 3x2 and 2x2
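+// Every 8 source rows produce 3 output rows (two 3-row boxes and one 2-row
+// box), and every 8 source columns produce 3 output columns of widths 3, 3
+// and 2.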
+
+static void ScalePlaneDown38(int /* src_width */, int /* src_height */,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint8* src_ptr, uint8* dst_ptr,
+ FilterMode filtering) {
+ assert(dst_width % 3 == 0);
+ void (*ScaleRowDown38_3)(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+ void (*ScaleRowDown38_2)(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+ if (!filtering) {
+ ScaleRowDown38_3 = ScaleRowDown38_C;
+ ScaleRowDown38_2 = ScaleRowDown38_C;
+ } else {
+ ScaleRowDown38_3 = ScaleRowDown38_3_Box_C;
+ ScaleRowDown38_2 = ScaleRowDown38_2_Box_C;
+ }
+#if defined(HAS_SCALEROWDOWN38_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && (dst_width % 12 == 0)) {
+ if (!filtering) {
+ ScaleRowDown38_3 = ScaleRowDown38_NEON;
+ ScaleRowDown38_2 = ScaleRowDown38_NEON;
+ } else {
+ ScaleRowDown38_3 = ScaleRowDown38_3_Box_NEON;
+ ScaleRowDown38_2 = ScaleRowDown38_2_Box_NEON;
+ }
+ }
+#elif defined(HAS_SCALEROWDOWN38_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0) &&
+ IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
+ if (!filtering) {
+ ScaleRowDown38_3 = ScaleRowDown38_SSSE3;
+ ScaleRowDown38_2 = ScaleRowDown38_SSSE3;
+ } else {
+ ScaleRowDown38_3 = ScaleRowDown38_3_Box_SSSE3;
+ ScaleRowDown38_2 = ScaleRowDown38_2_Box_SSSE3;
+ }
+ }
+#elif defined(HAS_SCALEROWDOWN38_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) && (dst_width % 12 == 0) &&
+ IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
+ IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
+ if (!filtering) {
+ ScaleRowDown38_3 = ScaleRowDown38_MIPS_DSPR2;
+ ScaleRowDown38_2 = ScaleRowDown38_MIPS_DSPR2;
+ } else {
+ ScaleRowDown38_3 = ScaleRowDown38_3_Box_MIPS_DSPR2;
+ ScaleRowDown38_2 = ScaleRowDown38_2_Box_MIPS_DSPR2;
+ }
+ }
+#endif
+
+ for (int y = 0; y < dst_height - 2; y += 3) {
+ ScaleRowDown38_3(src_ptr, src_stride, dst_ptr, dst_width);
+ src_ptr += src_stride * 3;
+ dst_ptr += dst_stride;
+ ScaleRowDown38_3(src_ptr, src_stride, dst_ptr, dst_width);
+ src_ptr += src_stride * 3;
+ dst_ptr += dst_stride;
+ ScaleRowDown38_2(src_ptr, src_stride, dst_ptr, dst_width);
+ src_ptr += src_stride * 2;
+ dst_ptr += dst_stride;
+ }
+
+ // Remaining 1 or 2 rows, with the last row vertically unfiltered.
+ if ((dst_height % 3) == 2) {
+ ScaleRowDown38_3(src_ptr, src_stride, dst_ptr, dst_width);
+ src_ptr += src_stride * 3;
+ dst_ptr += dst_stride;
+ ScaleRowDown38_3(src_ptr, 0, dst_ptr, dst_width);
+ } else if ((dst_height % 3) == 1) {
+ ScaleRowDown38_3(src_ptr, 0, dst_ptr, dst_width);
+ }
+}
+
+static __inline uint32 SumBox(int iboxwidth, int iboxheight,
+ ptrdiff_t src_stride, const uint8* src_ptr) {
+ assert(iboxwidth > 0);
+ assert(iboxheight > 0);
+ uint32 sum = 0u;
+ for (int y = 0; y < iboxheight; ++y) {
+ for (int x = 0; x < iboxwidth; ++x) {
+ sum += src_ptr[x];
+ }
+ src_ptr += src_stride;
+ }
+ return sum;
+}
+
+static void ScalePlaneBoxRow_C(int dst_width, int boxheight,
+ int x, int dx, ptrdiff_t src_stride,
+ const uint8* src_ptr, uint8* dst_ptr) {
+ for (int i = 0; i < dst_width; ++i) {
+ int ix = x >> 16;
+ x += dx;
+ int boxwidth = (x >> 16) - ix;
+ *dst_ptr++ = SumBox(boxwidth, boxheight, src_stride, src_ptr + ix) /
+ (boxwidth * boxheight);
+ }
+}
+
+static __inline uint32 SumPixels(int iboxwidth, const uint16* src_ptr) {
+ assert(iboxwidth > 0);
+ uint32 sum = 0u;
+ for (int x = 0; x < iboxwidth; ++x) {
+ sum += src_ptr[x];
+ }
+ return sum;
+}
+
+static void ScaleAddCols2_C(int dst_width, int boxheight, int x, int dx,
+ const uint16* src_ptr, uint8* dst_ptr) {
+ int scaletbl[2];
+ int minboxwidth = (dx >> 16);
+ scaletbl[0] = 65536 / (minboxwidth * boxheight);
+ scaletbl[1] = 65536 / ((minboxwidth + 1) * boxheight);
+ int* scaleptr = scaletbl - minboxwidth;
+ for (int i = 0; i < dst_width; ++i) {
+ int ix = x >> 16;
+ x += dx;
+ int boxwidth = (x >> 16) - ix;
+ *dst_ptr++ = SumPixels(boxwidth, src_ptr + ix) * scaleptr[boxwidth] >> 16;
+ }
+}
+
+static void ScaleAddCols1_C(int dst_width, int boxheight, int x, int dx,
+ const uint16* src_ptr, uint8* dst_ptr) {
+ int boxwidth = (dx >> 16);
+ int scaleval = 65536 / (boxwidth * boxheight);
+ for (int i = 0; i < dst_width; ++i) {
+ *dst_ptr++ = SumPixels(boxwidth, src_ptr + x) * scaleval >> 16;
+ x += boxwidth;
+ }
+}
+
+// Scale plane down to any dimensions, with interpolation
+// (box filter).
+//
+// Same method as SimpleScale: fixed point (16.16) is used to step
+// through the source, producing one destination pixel per step by
+// averaging a box of source pixels.
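+//
+// For example, scaling 640 columns down to 200 gives dx = 0x33333 (3.2 in
+// 16.16), so each box covers 3 or 4 source pixels and ScaleAddCols2_C picks
+// the matching reciprocal from scaletbl.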
+
+static void ScalePlaneBox(int src_width, int src_height,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint8* src_ptr, uint8* dst_ptr) {
+ assert(dst_width > 0);
+ assert(dst_height > 0);
+ int dx = (Abs(src_width) << 16) / dst_width;
+ int dy = (src_height << 16) / dst_height;
+ int x = 0;
+ int y = 0;
+ // Negative src_width means horizontally mirror.
+ if (src_width < 0) {
+ x += (dst_width - 1) * dx;
+ dx = -dx;
+ src_width = -src_width;
+ }
+ int maxy = (src_height << 16);
+ if (!IS_ALIGNED(src_width, 16) || (src_width > kMaxStride) ||
+ dst_height * 2 > src_height) {
+ uint8* dst = dst_ptr;
+ for (int j = 0; j < dst_height; ++j) {
+ int iy = y >> 16;
+ const uint8* src = src_ptr + iy * src_stride;
+ y += dy;
+ if (y > maxy) {
+ y = maxy;
+ }
+ int boxheight = (y >> 16) - iy;
+ ScalePlaneBoxRow_C(dst_width, boxheight,
+ x, dx, src_stride,
+ src, dst);
+ dst += dst_stride;
+ }
+ } else {
+ SIMD_ALIGNED(uint16 row[kMaxStride]);
+ void (*ScaleAddRows)(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint16* dst_ptr, int src_width, int src_height) =
+ ScaleAddRows_C;
+ void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx,
+ const uint16* src_ptr, uint8* dst_ptr);
+ if (dx & 0xffff) {
+ ScaleAddCols = ScaleAddCols2_C;
+ } else {
+ ScaleAddCols = ScaleAddCols1_C;
+ }
+#if defined(HAS_SCALEADDROWS_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) &&
+ IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
+ ScaleAddRows = ScaleAddRows_SSE2;
+ }
+#endif
+
+ for (int j = 0; j < dst_height; ++j) {
+ int iy = y >> 16;
+ const uint8* src = src_ptr + iy * src_stride;
+ y += dy;
+ if (y > (src_height << 16)) {
+ y = (src_height << 16);
+ }
+ int boxheight = (y >> 16) - iy;
+ ScaleAddRows(src, src_stride, row, src_width, boxheight);
+ ScaleAddCols(dst_width, boxheight, x, dx, row, dst_ptr);
+ dst_ptr += dst_stride;
+ }
+ }
+}
+
+// Scale plane to/from any dimensions, with bilinear interpolation.
+
+void ScalePlaneBilinear(int src_width, int src_height,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint8* src_ptr, uint8* dst_ptr) {
+ assert(dst_width > 0);
+ assert(dst_height > 0);
+ assert(Abs(src_width) <= kMaxStride);
+
+ SIMD_ALIGNED(uint8 row[kMaxStride + 16]);
+
+ void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
+ InterpolateRow_C;
+#if defined(HAS_INTERPOLATEROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && src_width >= 16) {
+ InterpolateRow = InterpolateRow_Any_SSE2;
+ if (IS_ALIGNED(src_width, 16)) {
+ InterpolateRow = InterpolateRow_Unaligned_SSE2;
+ if (IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
+ InterpolateRow = InterpolateRow_SSE2;
+ }
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && src_width >= 16) {
+ InterpolateRow = InterpolateRow_Any_SSSE3;
+ if (IS_ALIGNED(src_width, 16)) {
+ InterpolateRow = InterpolateRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
+ InterpolateRow = InterpolateRow_SSSE3;
+ }
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && src_width >= 16) {
+ InterpolateRow = InterpolateRow_Any_NEON;
+ if (IS_ALIGNED(src_width, 16)) {
+ InterpolateRow = InterpolateRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) && src_width >= 4) {
+ InterpolateRow = InterpolateRow_Any_MIPS_DSPR2;
+ if (IS_ALIGNED(src_width, 4)) {
+ InterpolateRow = InterpolateRow_MIPS_DSPR2;
+ }
+ }
+#endif
+ int dx = 0;
+ int dy = 0;
+ int x = 0;
+ int y = 0;
+ if (dst_width <= Abs(src_width)) {
+ dx = (Abs(src_width) << 16) / dst_width;
+ x = (dx >> 1) - 32768;
+ } else if (dst_width > 1) {
+ dx = ((Abs(src_width) - 1) << 16) / (dst_width - 1);
+ }
+ // Negative src_width means horizontally mirror.
+ if (src_width < 0) {
+ x += (dst_width - 1) * dx;
+ dx = -dx;
+ src_width = -src_width;
+ }
+ if (dst_height <= src_height) {
+ dy = (src_height << 16) / dst_height;
+ y = (dy >> 1) - 32768;
+ } else if (dst_height > 1) {
+ dy = ((src_height - 1) << 16) / (dst_height - 1);
+ }
+ int maxy = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0;
+ for (int j = 0; j < dst_height; ++j) {
+ if (y > maxy) {
+ y = maxy;
+ }
+ int yi = y >> 16;
+ int yf = (y >> 8) & 255;
+ const uint8* src = src_ptr + yi * src_stride;
+ InterpolateRow(row, src, src_stride, src_width, yf);
+ ScaleFilterCols_C(dst_ptr, row, dst_width, x, dx);
+ dst_ptr += dst_stride;
+ y += dy;
+ }
+}
+
+// Scale plane to/from any dimensions, without interpolation.
+// Fixed point math is used for performance: the upper 16 bits
+// of x and dx are the integer part of the source position and
+// the lower 16 bits are the fractional part.
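+// For example, scaling 640 columns to 480 gives dx = (640 << 16) / 480 =
+// 0x15555, so x advances by roughly 1.333 source pixels per output pixel
+// and src[x >> 16] point samples the source pixel at the floor position.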
+
+static void ScalePlaneSimple(int src_width, int src_height,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint8* src_ptr, uint8* dst_ptr) {
+ int dx = (Abs(src_width) << 16) / dst_width;
+ int dy = (src_height << 16) / dst_height;
+ int x = dx >> 1;
+ int y = dy >> 1;
+ // Negative src_width means horizontally mirror.
+ if (src_width < 0) {
+ x += (dst_width - 1) * dx;
+ dx = -dx;
+ src_width = -src_width;
+ }
+
+ for (int j = 0; j < dst_height; ++j) {
+ int xs = x;
+ int yi = y >> 16;
+ const uint8* src = src_ptr + yi * src_stride;
+ uint8* dst = dst_ptr;
+ for (int i = 0; i < dst_width; ++i) {
+ *dst++ = src[xs >> 16];
+ xs += dx;
+ }
+ dst_ptr += dst_stride;
+ y += dy;
+ }
+}
+
+// Scale plane to/from any dimensions.
+
+static void ScalePlaneAnySize(int src_width, int src_height,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint8* src_ptr, uint8* dst_ptr,
+ FilterMode filtering) {
+ if (!filtering || src_width > kMaxStride) {
+ ScalePlaneSimple(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src_ptr, dst_ptr);
+ } else {
+ ScalePlaneBilinear(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src_ptr, dst_ptr);
+ }
+}
+
+// Scale plane down, any size
+//
+// This is an optimized version for scaling down a plane to any size.
+// The current implementation is ~10 times faster than the
+// reference implementation for e.g. XGA->LowResPAL.
+
+static void ScalePlaneDown(int src_width, int src_height,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint8* src_ptr, uint8* dst_ptr,
+ FilterMode filtering) {
+ if (!filtering || src_width > kMaxStride) {
+ ScalePlaneSimple(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src_ptr, dst_ptr);
+ } else if (filtering == kFilterBilinear || dst_height * 2 > src_height) {
+ // between 1/2x and 1x use bilinear
+ ScalePlaneBilinear(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src_ptr, dst_ptr);
+ } else {
+ ScalePlaneBox(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src_ptr, dst_ptr);
+ }
+}
+
+// Scale a plane.
+// This function in turn calls a scaling function suitable for handling
+// the desired resolutions.
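+//
+// A minimal usage sketch for a single, unpadded plane (so stride == width;
+// the src_y/dst_y buffer names are illustrative, not part of the API):
+//   ScalePlane(src_y, src_width, src_width, src_height,
+//              dst_y, dst_width, dst_width, dst_height, kFilterBox);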
+
+LIBYUV_API
+void ScalePlane(const uint8* src, int src_stride,
+ int src_width, int src_height,
+ uint8* dst, int dst_stride,
+ int dst_width, int dst_height,
+ FilterMode filtering) {
+ // Use specialized scales to improve performance for common resolutions.
+ // For example, all the 1/2 scalings will use ScalePlaneDown2()
+ if (dst_width == src_width && dst_height == src_height) {
+ // Straight copy.
+ CopyPlane(src, src_stride, dst, dst_stride, dst_width, dst_height);
+ } else if (dst_width <= Abs(src_width) && dst_height <= src_height) {
+ // Scale down.
+ if (use_reference_impl_) {
+ // For testing, allow the optimized versions to be disabled.
+ ScalePlaneDown(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst, filtering);
+ } else if (4 * dst_width == 3 * src_width &&
+ 4 * dst_height == 3 * src_height) {
+ // optimized, 3/4
+ ScalePlaneDown34(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst, filtering);
+ } else if (2 * dst_width == src_width && 2 * dst_height == src_height) {
+ // optimized, 1/2
+ ScalePlaneDown2(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst, filtering);
+ // 3/8 rounded up for odd sized chroma height.
+ } else if (8 * dst_width == 3 * src_width &&
+ dst_height == ((src_height * 3 + 7) / 8)) {
+ // optimized, 3/8
+ ScalePlaneDown38(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst, filtering);
+ } else if (4 * dst_width == src_width && 4 * dst_height == src_height &&
+ filtering != kFilterBilinear) {
+ // optimized, 1/4
+ ScalePlaneDown4(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst, filtering);
+ } else {
+ // Arbitrary downsample
+ ScalePlaneDown(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst, filtering);
+ }
+ } else {
+ // Arbitrary scale up and/or down.
+ ScalePlaneAnySize(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst, filtering);
+ }
+}
+
+// Scale an I420 image.
+// This function in turn calls a scaling function for each plane.
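+// A minimal usage sketch for tightly packed, even-sized I420 buffers
+// (the sy/su/sv and dy/du/dv names are illustrative, not part of the API):
+//   I420Scale(sy, sw, su, sw / 2, sv, sw / 2, sw, sh,
+//             dy, dw, du, dw / 2, dv, dw / 2, dw, dh, kFilterBilinear);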
+// TODO(fbarchard): Disable UNDER_ALLOCATED_HACK
+#define UNDER_ALLOCATED_HACK 1
+
+LIBYUV_API
+int I420Scale(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ int src_width, int src_height,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int dst_width, int dst_height,
+ FilterMode filtering) {
+ if (!src_y || !src_u || !src_v || src_width == 0 || src_height == 0 ||
+ !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0 ||
+ src_width > 32767 || src_height > 32767) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (src_height < 0) {
+ src_height = -src_height;
+ int halfheight = Half(src_height);
+ src_y = src_y + (src_height - 1) * src_stride_y;
+ src_u = src_u + (halfheight - 1) * src_stride_u;
+ src_v = src_v + (halfheight - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+ int src_halfwidth = Half(src_width);
+ int src_halfheight = Half(src_height);
+ int dst_halfwidth = Half(dst_width);
+ int dst_halfheight = Half(dst_height);
+
+#ifdef UNDER_ALLOCATED_HACK
+ // If caller passed width / 2 for stride, adjust halfwidth to match.
+ if ((src_width & 1) && src_stride_u && src_halfwidth > Abs(src_stride_u)) {
+ src_halfwidth = src_width >> 1;
+ }
+ if ((dst_width & 1) && dst_stride_u && dst_halfwidth > Abs(dst_stride_u)) {
+ dst_halfwidth = dst_width >> 1;
+ }
+ // If caller used height / 2 when computing src_v, it will point into what
+ // should be the src_u plane. Detect this and reduce halfheight to match.
+ int uv_src_plane_size = src_halfwidth * src_halfheight;
+ if ((src_height & 1) &&
+ (src_v > src_u) && (src_v < (src_u + uv_src_plane_size))) {
+ src_halfheight = src_height >> 1;
+ }
+ int uv_dst_plane_size = dst_halfwidth * dst_halfheight;
+ if ((dst_height & 1) &&
+ (dst_v > dst_u) && (dst_v < (dst_u + uv_dst_plane_size))) {
+ dst_halfheight = dst_height >> 1;
+ }
+#endif
+
+ ScalePlane(src_y, src_stride_y, src_width, src_height,
+ dst_y, dst_stride_y, dst_width, dst_height,
+ filtering);
+ ScalePlane(src_u, src_stride_u, src_halfwidth, src_halfheight,
+ dst_u, dst_stride_u, dst_halfwidth, dst_halfheight,
+ filtering);
+ ScalePlane(src_v, src_stride_v, src_halfwidth, src_halfheight,
+ dst_v, dst_stride_v, dst_halfwidth, dst_halfheight,
+ filtering);
+ return 0;
+}
+
+// Deprecated API.
+LIBYUV_API
+int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v,
+ int src_stride_y, int src_stride_u, int src_stride_v,
+ int src_width, int src_height,
+ uint8* dst_y, uint8* dst_u, uint8* dst_v,
+ int dst_stride_y, int dst_stride_u, int dst_stride_v,
+ int dst_width, int dst_height,
+ bool interpolate) {
+ if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
+ !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0 ||
+ src_width > 32767 || src_height > 32767) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (src_height < 0) {
+ src_height = -src_height;
+ int halfheight = Half(src_height);
+ src_y = src_y + (src_height - 1) * src_stride_y;
+ src_u = src_u + (halfheight - 1) * src_stride_u;
+ src_v = src_v + (halfheight - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+ int src_halfwidth = Half(src_width);
+ int src_halfheight = Half(src_height);
+ int dst_halfwidth = Half(dst_width);
+ int dst_halfheight = Half(dst_height);
+ FilterMode filtering = interpolate ? kFilterBox : kFilterNone;
+
+#ifdef UNDER_ALLOCATED_HACK
+ // If caller passed width / 2 for stride, adjust halfwidth to match.
+ if ((src_width & 1) && src_stride_u && src_halfwidth > Abs(src_stride_u)) {
+ src_halfwidth = src_width >> 1;
+ }
+ if ((dst_width & 1) && dst_stride_u && dst_halfwidth > Abs(dst_stride_u)) {
+ dst_halfwidth = dst_width >> 1;
+ }
+ // If caller used height / 2 when computing src_v, it will point into what
+ // should be the src_u plane. Detect this and reduce halfheight to match.
+ int uv_src_plane_size = src_halfwidth * src_halfheight;
+ if ((src_height & 1) &&
+ (src_v > src_u) && (src_v < (src_u + uv_src_plane_size))) {
+ src_halfheight = src_height >> 1;
+ }
+ int uv_dst_plane_size = dst_halfwidth * dst_halfheight;
+ if ((dst_height & 1) &&
+ (dst_v > dst_u) && (dst_v < (dst_u + uv_dst_plane_size))) {
+ dst_halfheight = dst_height >> 1;
+ }
+#endif
+
+ ScalePlane(src_y, src_stride_y, src_width, src_height,
+ dst_y, dst_stride_y, dst_width, dst_height,
+ filtering);
+ ScalePlane(src_u, src_stride_u, src_halfwidth, src_halfheight,
+ dst_u, dst_stride_u, dst_halfwidth, dst_halfheight,
+ filtering);
+ ScalePlane(src_v, src_stride_v, src_halfwidth, src_halfheight,
+ dst_v, dst_stride_v, dst_halfwidth, dst_halfheight,
+ filtering);
+ return 0;
+}
+
+// Deprecated API
+LIBYUV_API
+int ScaleOffset(const uint8* src, int src_width, int src_height,
+ uint8* dst, int dst_width, int dst_height, int dst_yoffset,
+ bool interpolate) {
+ if (!src || src_width <= 0 || src_height <= 0 ||
+ !dst || dst_width <= 0 || dst_height <= 0 || dst_yoffset < 0 ||
+ src_width > 32767 || src_height > 32767 ||
+ dst_yoffset >= dst_height) {
+ return -1;
+ }
+  dst_yoffset = dst_yoffset & ~1;  // Chroma requires a multiple-of-2 offset.
+ int src_halfwidth = Half(src_width);
+ int src_halfheight = Half(src_height);
+ int dst_halfwidth = Half(dst_width);
+ int dst_halfheight = Half(dst_height);
+ int aheight = dst_height - dst_yoffset * 2; // actual output height
+ const uint8* src_y = src;
+ const uint8* src_u = src + src_width * src_height;
+ const uint8* src_v = src + src_width * src_height +
+ src_halfwidth * src_halfheight;
+ uint8* dst_y = dst + dst_yoffset * dst_width;
+ uint8* dst_u = dst + dst_width * dst_height +
+ (dst_yoffset >> 1) * dst_halfwidth;
+ uint8* dst_v = dst + dst_width * dst_height + dst_halfwidth * dst_halfheight +
+ (dst_yoffset >> 1) * dst_halfwidth;
+ return Scale(src_y, src_u, src_v, src_width, src_halfwidth, src_halfwidth,
+ src_width, src_height, dst_y, dst_u, dst_v, dst_width,
+ dst_halfwidth, dst_halfwidth, dst_width, aheight, interpolate);
+}
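+
+// Editorial note: ScaleOffset assumes a single contiguous I420 buffer laid
+// out as the Y plane (width * height bytes), then the U plane, then the V
+// plane (each halfwidth * halfheight bytes). A minimal sketch of the pointer
+// math above, assuming a hypothetical 640x480 frame:
+//   const uint8* y = src;                          // 640 * 480 bytes
+//   const uint8* u = src + 640 * 480;              // 320 * 240 bytes
+//   const uint8* v = src + 640 * 480 + 320 * 240;  // 320 * 240 bytes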
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/chromium/third_party/libyuv/source/scale_argb.cc b/chromium/third_party/libyuv/source/scale_argb.cc
new file mode 100644
index 00000000000..5cf14d949ef
--- /dev/null
+++ b/chromium/third_party/libyuv/source/scale_argb.cc
@@ -0,0 +1,1202 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/scale.h"
+
+#include <assert.h>
+#include <string.h>
+
+#include "libyuv/cpu_id.h"
+#include "libyuv/planar_functions.h" // For CopyARGB
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+static __inline int Abs(int v) {
+ return v >= 0 ? v : -v;
+}
+
+// ARGB scaling uses bilinear or point, but not box filter.
+#if !defined(LIBYUV_DISABLE_NEON) && \
+ (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
+#define HAS_SCALEARGBROWDOWNEVEN_NEON
+#define HAS_SCALEARGBROWDOWN2_NEON
+void ScaleARGBRowDownEven_NEON(const uint8* src_argb, int src_stride,
+ int src_stepx,
+ uint8* dst_argb, int dst_width);
+void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, int src_stride,
+ int src_stepx,
+ uint8* dst_argb, int dst_width);
+void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */,
+ uint8* dst, int dst_width);
+void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width);
+#endif
+
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
+#define HAS_SCALEARGBROWDOWN2_SSE2
+// Reads 8 pixels, throws half away and writes the 4 odd pixels (1, 3, 5, 7)
+// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
+__declspec(naked) __declspec(align(16))
+static void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
+ ptrdiff_t /* src_stride */,
+ uint8* dst_argb, int dst_width) {
+ __asm {
+ mov eax, [esp + 4] // src_argb
+ // src_stride ignored
+ mov edx, [esp + 12] // dst_argb
+ mov ecx, [esp + 16] // dst_width
+
+ align 16
+ wloop:
+ movdqa xmm0, [eax]
+ movdqa xmm1, [eax + 16]
+ lea eax, [eax + 32]
+ shufps xmm0, xmm1, 0xdd
+ sub ecx, 4
+ movdqa [edx], xmm0
+ lea edx, [edx + 16]
+ jg wloop
+
+ ret
+ }
+}
+
+// Blends 8x2 rectangle to 4x1.
+// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
+__declspec(naked) __declspec(align(16))
+static void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ uint8* dst_argb, int dst_width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // src_argb
+ mov esi, [esp + 4 + 8] // src_stride
+ mov edx, [esp + 4 + 12] // dst_argb
+ mov ecx, [esp + 4 + 16] // dst_width
+
+ align 16
+ wloop:
+ movdqa xmm0, [eax]
+ movdqa xmm1, [eax + 16]
+ movdqa xmm2, [eax + esi]
+ movdqa xmm3, [eax + esi + 16]
+ lea eax, [eax + 32]
+ pavgb xmm0, xmm2 // average rows
+ pavgb xmm1, xmm3
+ movdqa xmm2, xmm0 // average columns (8 to 4 pixels)
+ shufps xmm0, xmm1, 0x88 // even pixels
+ shufps xmm2, xmm1, 0xdd // odd pixels
+ pavgb xmm0, xmm2
+ sub ecx, 4
+ movdqa [edx], xmm0
+ lea edx, [edx + 16]
+ jg wloop
+
+ pop esi
+ ret
+ }
+}
+
+#define HAS_SCALEARGBROWDOWNEVEN_SSE2
+// Reads 4 pixels at a time.
+// Alignment requirement: dst_argb 16 byte aligned.
+__declspec(naked) __declspec(align(16))
+void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
+ int src_stepx,
+ uint8* dst_argb, int dst_width) {
+ __asm {
+ push ebx
+ push edi
+ mov eax, [esp + 8 + 4] // src_argb
+ // src_stride ignored
+ mov ebx, [esp + 8 + 12] // src_stepx
+ mov edx, [esp + 8 + 16] // dst_argb
+ mov ecx, [esp + 8 + 20] // dst_width
+ lea ebx, [ebx * 4]
+ lea edi, [ebx + ebx * 2]
+
+ align 16
+ wloop:
+ movd xmm0, [eax]
+ movd xmm1, [eax + ebx]
+ punpckldq xmm0, xmm1
+ movd xmm2, [eax + ebx * 2]
+ movd xmm3, [eax + edi]
+ lea eax, [eax + ebx * 4]
+ punpckldq xmm2, xmm3
+ punpcklqdq xmm0, xmm2
+ sub ecx, 4
+ movdqa [edx], xmm0
+ lea edx, [edx + 16]
+ jg wloop
+
+ pop edi
+ pop ebx
+ ret
+ }
+}
+
+// Blends four 2x2 to 4x1.
+// Alignment requirement: dst_argb 16 byte aligned.
+__declspec(naked) __declspec(align(16))
+static void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8* dst_argb, int dst_width) {
+ __asm {
+ push ebx
+ push esi
+ push edi
+ mov eax, [esp + 12 + 4] // src_argb
+ mov esi, [esp + 12 + 8] // src_stride
+ mov ebx, [esp + 12 + 12] // src_stepx
+ mov edx, [esp + 12 + 16] // dst_argb
+ mov ecx, [esp + 12 + 20] // dst_width
+ lea esi, [eax + esi] // row1 pointer
+ lea ebx, [ebx * 4]
+ lea edi, [ebx + ebx * 2]
+
+ align 16
+ wloop:
+ movq xmm0, qword ptr [eax] // row0 4 pairs
+ movhps xmm0, qword ptr [eax + ebx]
+ movq xmm1, qword ptr [eax + ebx * 2]
+ movhps xmm1, qword ptr [eax + edi]
+ lea eax, [eax + ebx * 4]
+ movq xmm2, qword ptr [esi] // row1 4 pairs
+ movhps xmm2, qword ptr [esi + ebx]
+ movq xmm3, qword ptr [esi + ebx * 2]
+ movhps xmm3, qword ptr [esi + edi]
+ lea esi, [esi + ebx * 4]
+ pavgb xmm0, xmm2 // average rows
+ pavgb xmm1, xmm3
+ movdqa xmm2, xmm0 // average columns (8 to 4 pixels)
+ shufps xmm0, xmm1, 0x88 // even pixels
+ shufps xmm2, xmm1, 0xdd // odd pixels
+ pavgb xmm0, xmm2
+ sub ecx, 4
+ movdqa [edx], xmm0
+ lea edx, [edx + 16]
+ jg wloop
+
+ pop edi
+ pop esi
+ pop ebx
+ ret
+ }
+}
+
+// Column scaling unfiltered. SSE2 version.
+// TODO(fbarchard): Port to Neon
+
+#define HAS_SCALEARGBCOLS_SSE2
+__declspec(naked) __declspec(align(16))
+static void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
+ int dst_width, int x, int dx) {
+ __asm {
+ push esi
+ push edi
+ mov edi, [esp + 8 + 4] // dst_argb
+ mov esi, [esp + 8 + 8] // src_argb
+ mov ecx, [esp + 8 + 12] // dst_width
+ movd xmm2, [esp + 8 + 16] // x
+ movd xmm3, [esp + 8 + 20] // dx
+ pextrw eax, xmm2, 1 // get x0 integer. preroll
+ sub ecx, 2
+ jl xloop29
+
+ movdqa xmm0, xmm2 // x1 = x0 + dx
+ paddd xmm0, xmm3
+ punpckldq xmm2, xmm0 // x0 x1
+ punpckldq xmm3, xmm3 // dx dx
+ paddd xmm3, xmm3 // dx * 2, dx * 2
+ pextrw edx, xmm2, 3 // get x1 integer. preroll
+
+ // 2 Pixel loop.
+ align 16
+ xloop2:
+ paddd xmm2, xmm3 // x += dx
+ movd xmm0, qword ptr [esi + eax * 4] // 1 source x0 pixels
+ movd xmm1, qword ptr [esi + edx * 4] // 1 source x1 pixels
+ punpckldq xmm0, xmm1 // x0 x1
+ pextrw eax, xmm2, 1 // get x0 integer. next iteration.
+ pextrw edx, xmm2, 3 // get x1 integer. next iteration.
+ movq qword ptr [edi], xmm0
+ lea edi, [edi + 8]
+ sub ecx, 2 // 2 pixels
+ jge xloop2
+ xloop29:
+
+ add ecx, 2 - 1
+ jl xloop99
+
+ // 1 pixel remainder
+ movd xmm0, qword ptr [esi + eax * 4] // 1 source x0 pixels
+ movd [edi], xmm0
+ xloop99:
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+// Bilinear column filtering combines 2x1 -> 1x1. SSSE3 version.
+// TODO(fbarchard): Port to Neon
+
+// Shuffle table for arranging 2 pixels into pairs for pmaddubsw
+static const uvec8 kShuffleColARGB = {
+ 0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel
+ 8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel
+};
+
+// Shuffle table for duplicating 2 fractions into 8 bytes each
+static const uvec8 kShuffleFractions = {
+ 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
+};
+
+#define HAS_SCALEARGBFILTERCOLS_SSSE3
+__declspec(naked) __declspec(align(16))
+static void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
+ int dst_width, int x, int dx) {
+ __asm {
+ push esi
+ push edi
+ mov edi, [esp + 8 + 4] // dst_argb
+ mov esi, [esp + 8 + 8] // src_argb
+ mov ecx, [esp + 8 + 12] // dst_width
+ movd xmm2, [esp + 8 + 16] // x
+ movd xmm3, [esp + 8 + 20] // dx
+ movdqa xmm4, kShuffleColARGB
+ movdqa xmm5, kShuffleFractions
+ pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction.
+ psrlw xmm6, 9
+ pextrw eax, xmm2, 1 // get x0 integer. preroll
+ sub ecx, 2
+ jl xloop29
+
+ movdqa xmm0, xmm2 // x1 = x0 + dx
+ paddd xmm0, xmm3
+ punpckldq xmm2, xmm0 // x0 x1
+ punpckldq xmm3, xmm3 // dx dx
+ paddd xmm3, xmm3 // dx * 2, dx * 2
+ pextrw edx, xmm2, 3 // get x1 integer. preroll
+
+ // 2 Pixel loop.
+ align 16
+ xloop2:
+ movdqa xmm1, xmm2 // x0, x1 fractions.
+ paddd xmm2, xmm3 // x += dx
+ movq xmm0, qword ptr [esi + eax * 4] // 2 source x0 pixels
+ psrlw xmm1, 9 // 7 bit fractions.
+ movhps xmm0, qword ptr [esi + edx * 4] // 2 source x1 pixels
+ pshufb xmm1, xmm5 // 0000000011111111
+ pshufb xmm0, xmm4 // arrange pixels into pairs
+ pxor xmm1, xmm6 // 0..7f and 7f..0
+ pmaddubsw xmm0, xmm1 // argb_argb 16 bit, 2 pixels.
+ psrlw xmm0, 7 // argb 8.7 fixed point to low 8 bits.
+ pextrw eax, xmm2, 1 // get x0 integer. next iteration.
+ pextrw edx, xmm2, 3 // get x1 integer. next iteration.
+ packuswb xmm0, xmm0 // argb_argb 8 bits, 2 pixels.
+ movq qword ptr [edi], xmm0
+ lea edi, [edi + 8]
+ sub ecx, 2 // 2 pixels
+ jge xloop2
+ xloop29:
+
+ add ecx, 2 - 1
+ jl xloop99
+
+ // 1 pixel remainder
+ psrlw xmm2, 9 // 7 bit fractions.
+ movq xmm0, qword ptr [esi + eax * 4] // 2 source x0 pixels
+ pshufb xmm2, xmm5 // 00000000
+ pshufb xmm0, xmm4 // arrange pixels into pairs
+ pxor xmm2, xmm6 // 0..7f and 7f..0
+ pmaddubsw xmm0, xmm2 // argb 16 bit, 1 pixel.
+ psrlw xmm0, 7
+ packuswb xmm0, xmm0 // argb 8 bits, 1 pixel.
+ movd [edi], xmm0
+ xloop99:
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+#elif !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
+// GCC versions of row functions are verbatim conversions from Visual C.
+// Generated using gcc disassembly on Visual C object file:
+// objdump -D yuvscaler.obj >yuvscaler.txt
+#define HAS_SCALEARGBROWDOWN2_SSE2
+static void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
+ ptrdiff_t /* src_stride */,
+ uint8* dst_argb, int dst_width) {
+ asm volatile (
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "lea 0x20(%0),%0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm0 \n"
+ "sub $0x4,%2 \n"
+ "movdqa %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(dst_width) // %2
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1"
+#endif
+ );
+}
+
+static void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ uint8* dst_argb, int dst_width) {
+ asm volatile (
+ ".p2align 4 \n"
+ "1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa 0x10(%0),%%xmm1 \n"
+ "movdqa (%0,%3,1),%%xmm2 \n"
+ "movdqa 0x10(%0,%3,1),%%xmm3 \n"
+ "lea 0x20(%0),%0 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "pavgb %%xmm3,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm2 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "sub $0x4,%2 \n"
+ "movdqa %%xmm0,(%1) \n"
+ "lea 0x10(%1),%1 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(dst_width) // %2
+ : "r"(static_cast<intptr_t>(src_stride)) // %3
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3"
+#endif
+ );
+}
+
+#define HAS_SCALEARGBROWDOWNEVEN_SSE2
+// Reads 4 pixels at a time.
+// Alignment requirement: dst_argb 16 byte aligned.
+void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
+ int src_stepx,
+ uint8* dst_argb, int dst_width) {
+ intptr_t src_stepx_x4 = static_cast<intptr_t>(src_stepx);
+ intptr_t src_stepx_x12 = 0;
+ asm volatile (
+ "lea 0x0(,%1,4),%1 \n"
+ "lea (%1,%1,2),%4 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movd (%0),%%xmm0 \n"
+ "movd (%0,%1,1),%%xmm1 \n"
+ "punpckldq %%xmm1,%%xmm0 \n"
+ "movd (%0,%1,2),%%xmm2 \n"
+ "movd (%0,%4,1),%%xmm3 \n"
+ "lea (%0,%1,4),%0 \n"
+ "punpckldq %%xmm3,%%xmm2 \n"
+ "punpcklqdq %%xmm2,%%xmm0 \n"
+ "sub $0x4,%3 \n"
+ "movdqa %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(src_stepx_x4), // %1
+ "+r"(dst_argb), // %2
+ "+r"(dst_width), // %3
+ "+r"(src_stepx_x12) // %4
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3"
+#endif
+ );
+}
+
+// Blends four 2x2 to 4x1.
+// Alignment requirement: dst_argb 16 byte aligned.
+static void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
+ ptrdiff_t src_stride, int src_stepx,
+ uint8* dst_argb, int dst_width) {
+ intptr_t src_stepx_x4 = static_cast<intptr_t>(src_stepx);
+ intptr_t src_stepx_x12 = 0;
+ intptr_t row1 = static_cast<intptr_t>(src_stride);
+ asm volatile (
+ "lea 0x0(,%1,4),%1 \n"
+ "lea (%1,%1,2),%4 \n"
+ "lea (%0,%5,1),%5 \n"
+ ".p2align 4 \n"
+ "1: \n"
+ "movq (%0),%%xmm0 \n"
+ "movhps (%0,%1,1),%%xmm0 \n"
+ "movq (%0,%1,2),%%xmm1 \n"
+ "movhps (%0,%4,1),%%xmm1 \n"
+ "lea (%0,%1,4),%0 \n"
+ "movq (%5),%%xmm2 \n"
+ "movhps (%5,%1,1),%%xmm2 \n"
+ "movq (%5,%1,2),%%xmm3 \n"
+ "movhps (%5,%4,1),%%xmm3 \n"
+ "lea (%5,%1,4),%5 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "pavgb %%xmm3,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm2 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "sub $0x4,%3 \n"
+ "movdqa %%xmm0,(%2) \n"
+ "lea 0x10(%2),%2 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(src_stepx_x4), // %1
+ "+r"(dst_argb), // %2
+ "+rm"(dst_width), // %3
+ "+r"(src_stepx_x12), // %4
+ "+r"(row1) // %5
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3"
+#endif
+ );
+}
+
+#define HAS_SCALEARGBCOLS_SSE2
+static void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
+ int dst_width, int x, int dx) {
+ intptr_t x0 = 0, x1 = 0;
+ asm volatile (
+ "movd %5,%%xmm2 \n"
+ "movd %6,%%xmm3 \n"
+ "pextrw $0x1,%%xmm2,%k3 \n"
+ "sub $0x2,%2 \n"
+ "jl 29f \n"
+ "movdqa %%xmm2,%%xmm0 \n"
+ "paddd %%xmm3,%%xmm0 \n"
+ "punpckldq %%xmm0,%%xmm2 \n"
+ "punpckldq %%xmm3,%%xmm3 \n"
+ "paddd %%xmm3,%%xmm3 \n"
+ "pextrw $0x3,%%xmm2,%k4 \n"
+
+ ".p2align 4 \n"
+ "2: \n"
+ "paddd %%xmm3,%%xmm2 \n"
+ "movd (%1,%3,4),%%xmm0 \n"
+ "movd (%1,%4,4),%%xmm1 \n"
+ "punpckldq %%xmm1,%%xmm0 \n"
+ "pextrw $0x1,%%xmm2,%k3 \n"
+ "pextrw $0x3,%%xmm2,%k4 \n"
+ "movq %%xmm0,(%0) \n"
+ "lea 0x8(%0),%0 \n"
+ "sub $0x2,%2 \n"
+ "jge 2b \n"
+
+ "29: \n"
+ "add $0x1,%2 \n"
+ "jl 99f \n"
+ "movd (%1,%3,4),%%xmm0 \n"
+ "movd %%xmm0,(%0) \n"
+ "99: \n"
+ : "+r"(dst_argb), // %0
+ "+r"(src_argb), // %1
+ "+rm"(dst_width), // %2
+ "+r"(x0), // %3
+ "+r"(x1) // %4
+ : "rm"(x), // %5
+ "rm"(dx) // %6
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3"
+#endif
+ );
+}
+
+#ifdef __APPLE__
+#define CONST
+#else
+#define CONST static const
+#endif
+
+// Shuffle table for arranging 2 pixels into pairs for pmaddubsw
+CONST uvec8 kShuffleColARGB = {
+ 0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel
+ 8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel
+};
+
+// Shuffle table for duplicating 2 fractions into 8 bytes each
+CONST uvec8 kShuffleFractions = {
+ 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
+};
+
+// Bilinear column filtering combines 2x1 -> 1x1. SSSE3 version
+#define HAS_SCALEARGBFILTERCOLS_SSSE3
+static void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
+ int dst_width, int x, int dx) {
+ intptr_t x0 = 0, x1 = 0;
+ asm volatile (
+ "movdqa %0,%%xmm4 \n"
+ "movdqa %1,%%xmm5 \n"
+ :
+ : "m"(kShuffleColARGB), // %0
+ "m"(kShuffleFractions) // %1
+ );
+
+ asm volatile (
+ "movd %5,%%xmm2 \n"
+ "movd %6,%%xmm3 \n"
+ "pcmpeqb %%xmm6,%%xmm6 \n"
+ "psrlw $0x9,%%xmm6 \n"
+ "pextrw $0x1,%%xmm2,%k3 \n"
+ "sub $0x2,%2 \n"
+ "jl 29f \n"
+ "movdqa %%xmm2,%%xmm0 \n"
+ "paddd %%xmm3,%%xmm0 \n"
+ "punpckldq %%xmm0,%%xmm2 \n"
+ "punpckldq %%xmm3,%%xmm3 \n"
+ "paddd %%xmm3,%%xmm3 \n"
+ "pextrw $0x3,%%xmm2,%k4 \n"
+
+ ".p2align 4 \n"
+ "2: \n"
+ "movdqa %%xmm2,%%xmm1 \n"
+ "paddd %%xmm3,%%xmm2 \n"
+ "movq (%1,%3,4),%%xmm0 \n"
+ "psrlw $0x9,%%xmm1 \n"
+ "movhps (%1,%4,4),%%xmm0 \n"
+ "pshufb %%xmm5,%%xmm1 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "pxor %%xmm6,%%xmm1 \n"
+ "pmaddubsw %%xmm1,%%xmm0 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "pextrw $0x1,%%xmm2,%k3 \n"
+ "pextrw $0x3,%%xmm2,%k4 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movq %%xmm0,(%0) \n"
+ "lea 0x8(%0),%0 \n"
+ "sub $0x2,%2 \n"
+ "jge 2b \n"
+
+ "29: \n"
+ "add $0x1,%2 \n"
+ "jl 99f \n"
+ "psrlw $0x9,%%xmm2 \n"
+ "movq (%1,%3,4),%%xmm0 \n"
+ "pshufb %%xmm5,%%xmm2 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "pxor %%xmm6,%%xmm2 \n"
+ "pmaddubsw %%xmm2,%%xmm0 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movd %%xmm0,(%0) \n"
+ "99: \n"
+ : "+r"(dst_argb), // %0
+ "+r"(src_argb), // %1
+ "+rm"(dst_width), // %2
+ "+r"(x0), // %3
+ "+r"(x1) // %4
+ : "rm"(x), // %5
+ "rm"(dx) // %6
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+#endif
+ );
+}
+#endif // defined(__x86_64__) || defined(__i386__)
+
+static void ScaleARGBRowDown2_C(const uint8* src_argb,
+ ptrdiff_t /* src_stride */,
+ uint8* dst_argb, int dst_width) {
+ const uint32* src = reinterpret_cast<const uint32*>(src_argb);
+ uint32* dst = reinterpret_cast<uint32*>(dst_argb);
+
+ for (int x = 0; x < dst_width - 1; x += 2) {
+ dst[0] = src[1];
+ dst[1] = src[3];
+ src += 4;
+ dst += 2;
+ }
+ if (dst_width & 1) {
+ dst[0] = src[1];
+ }
+}
+
+static void ScaleARGBRowDown2Box_C(const uint8* src_argb, ptrdiff_t src_stride,
+ uint8* dst_argb, int dst_width) {
+ for (int x = 0; x < dst_width; ++x) {
+ dst_argb[0] = (src_argb[0] + src_argb[4] +
+ src_argb[src_stride] + src_argb[src_stride + 4] + 2) >> 2;
+ dst_argb[1] = (src_argb[1] + src_argb[5] +
+ src_argb[src_stride + 1] + src_argb[src_stride + 5] + 2) >> 2;
+ dst_argb[2] = (src_argb[2] + src_argb[6] +
+ src_argb[src_stride + 2] + src_argb[src_stride + 6] + 2) >> 2;
+ dst_argb[3] = (src_argb[3] + src_argb[7] +
+ src_argb[src_stride + 3] + src_argb[src_stride + 7] + 2) >> 2;
+ src_argb += 8;
+ dst_argb += 4;
+ }
+}
+
+void ScaleARGBRowDownEven_C(const uint8* src_argb, ptrdiff_t /* src_stride */,
+ int src_stepx,
+ uint8* dst_argb, int dst_width) {
+ const uint32* src = reinterpret_cast<const uint32*>(src_argb);
+ uint32* dst = reinterpret_cast<uint32*>(dst_argb);
+
+ for (int x = 0; x < dst_width - 1; x += 2) {
+ dst[0] = src[0];
+ dst[1] = src[src_stepx];
+ src += src_stepx * 2;
+ dst += 2;
+ }
+ if (dst_width & 1) {
+ dst[0] = src[0];
+ }
+}
+
+static void ScaleARGBRowDownEvenBox_C(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8* dst_argb, int dst_width) {
+ for (int x = 0; x < dst_width; ++x) {
+ dst_argb[0] = (src_argb[0] + src_argb[4] +
+ src_argb[src_stride] + src_argb[src_stride + 4] + 2) >> 2;
+ dst_argb[1] = (src_argb[1] + src_argb[5] +
+ src_argb[src_stride + 1] + src_argb[src_stride + 5] + 2) >> 2;
+ dst_argb[2] = (src_argb[2] + src_argb[6] +
+ src_argb[src_stride + 2] + src_argb[src_stride + 6] + 2) >> 2;
+ dst_argb[3] = (src_argb[3] + src_argb[7] +
+ src_argb[src_stride + 3] + src_argb[src_stride + 7] + 2) >> 2;
+ src_argb += src_stepx * 4;
+ dst_argb += 4;
+ }
+}
+
+// Mimics SSSE3 blender
+#define BLENDER1(a, b, f) ((a) * (0x7f ^ f) + (b) * f) >> 7
+#define BLENDERC(a, b, f, s) static_cast<uint32>( \
+ BLENDER1(((a) >> s) & 255, ((b) >> s) & 255, f) << s)
+#define BLENDER(a, b, f) \
+ BLENDERC(a, b, f, 24) | BLENDERC(a, b, f, 16) | \
+ BLENDERC(a, b, f, 8) | BLENDERC(a, b, f, 0)
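+
+// Worked example (editorial, not from the original source): with a 7 bit
+// fraction f = 0x40 (one half) and channel values a = 100, b = 200,
+//   BLENDER1(100, 200, 0x40) = (100 * 0x3f + 200 * 0x40) >> 7
+//                            = (6300 + 12800) >> 7 = 149,
+// the same result as the pmaddubsw + psrlw 7 sequence in the SSSE3 code.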
+
+static void ScaleARGBFilterCols_C(uint8* dst_argb, const uint8* src_argb,
+ int dst_width, int x, int dx) {
+ const uint32* src = reinterpret_cast<const uint32*>(src_argb);
+ uint32* dst = reinterpret_cast<uint32*>(dst_argb);
+ for (int j = 0; j < dst_width - 1; j += 2) {
+ int xi = x >> 16;
+ int xf = (x >> 9) & 0x7f;
+ uint32 a = src[xi];
+ uint32 b = src[xi + 1];
+ dst[0] = BLENDER(a, b, xf);
+ x += dx;
+ xi = x >> 16;
+ xf = (x >> 9) & 0x7f;
+ a = src[xi];
+ b = src[xi + 1];
+ dst[1] = BLENDER(a, b, xf);
+ x += dx;
+ dst += 2;
+ }
+ if (dst_width & 1) {
+ int xi = x >> 16;
+ int xf = (x >> 9) & 0x7f;
+ uint32 a = src[xi];
+ uint32 b = src[xi + 1];
+ dst[0] = BLENDER(a, b, xf);
+ }
+}
+
+// ScaleARGB ARGB, 1/2
+// This is an optimized version for scaling down an ARGB image to 1/2 of
+// its original size.
+
+static void ScaleARGBDown2(int /* src_width */, int /* src_height */,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint8* src_argb, uint8* dst_argb,
+ int x, int dx, int y, int dy,
+ FilterMode filtering) {
+ assert(dx == 65536 * 2); // Test scale factor of 2.
+ assert((dy & 0x1ffff) == 0); // Test vertical scale is multiple of 2.
+ // Advance to odd row / even column.
+ if (filtering) {
+ src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
+ } else {
+ src_argb += (y >> 16) * src_stride + ((x >> 16) - 1) * 4;
+ }
+ int row_stride = src_stride * (dy >> 16);
+ void (*ScaleARGBRowDown2)(const uint8* src_argb, ptrdiff_t src_stride,
+ uint8* dst_argb, int dst_width) =
+ filtering ? ScaleARGBRowDown2Box_C : ScaleARGBRowDown2_C;
+#if defined(HAS_SCALEARGBROWDOWN2_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4) &&
+ IS_ALIGNED(src_argb, 16) && IS_ALIGNED(row_stride, 16) &&
+ IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
+ ScaleARGBRowDown2 = filtering ? ScaleARGBRowDown2Box_SSE2 :
+ ScaleARGBRowDown2_SSE2;
+ }
+#elif defined(HAS_SCALEARGBROWDOWN2_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8) &&
+ IS_ALIGNED(src_argb, 4) && IS_ALIGNED(row_stride, 4)) {
+ ScaleARGBRowDown2 = filtering ? ScaleARGBRowDown2Box_NEON :
+ ScaleARGBRowDown2_NEON;
+ }
+#endif
+
+ // TODO(fbarchard): Loop through source height to allow odd height.
+ for (int y = 0; y < dst_height; ++y) {
+ ScaleARGBRowDown2(src_argb, src_stride, dst_argb, dst_width);
+ src_argb += row_stride;
+ dst_argb += dst_stride;
+ }
+}
+
+// ScaleARGB ARGB Even
+// This is an optimized version for scaling down an ARGB image by an even
+// factor of its original size.
+static void ScaleARGBDownEven(int src_width, int src_height,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint8* src_argb, uint8* dst_argb,
+ int x, int dx, int y, int dy,
+ FilterMode filtering) {
+ assert(IS_ALIGNED(src_width, 2));
+ assert(IS_ALIGNED(src_height, 2));
+ int col_step = dx >> 16;
+ int row_stride = (dy >> 16) * src_stride;
+ src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
+ void (*ScaleARGBRowDownEven)(const uint8* src_argb, ptrdiff_t src_stride,
+ int src_step, uint8* dst_argb, int dst_width) =
+ filtering ? ScaleARGBRowDownEvenBox_C : ScaleARGBRowDownEven_C;
+#if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4) &&
+ IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
+ ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_SSE2 :
+ ScaleARGBRowDownEven_SSE2;
+ }
+#elif defined(HAS_SCALEARGBROWDOWNEVEN_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 4) &&
+ IS_ALIGNED(src_argb, 4)) {
+ ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_NEON :
+ ScaleARGBRowDownEven_NEON;
+ }
+#endif
+
+ for (int y = 0; y < dst_height; ++y) {
+ ScaleARGBRowDownEven(src_argb, src_stride, col_step, dst_argb, dst_width);
+ src_argb += row_stride;
+ dst_argb += dst_stride;
+ }
+}
+
+// Scale ARGB down with bilinear interpolation.
+static void ScaleARGBBilinearDown(int src_height,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint8* src_argb, uint8* dst_argb,
+ int x, int dx, int y, int dy) {
+ assert(src_height > 0);
+ assert(dst_width > 0);
+ assert(dst_height > 0);
+ int xlast = x + (dst_width - 1) * dx;
+ int xl = (dx >= 0) ? x : xlast;
+ int xr = (dx >= 0) ? xlast : x;
+ xl = (xl >> 16) & ~3; // Left edge aligned.
+  xr = (xr >> 16) + 1;  // Rightmost pixel used.
+ int clip_src_width = (((xr - xl) + 1 + 3) & ~3) * 4; // Width aligned to 4.
+ src_argb += xl * 4;
+ x -= (xl << 16);
+ assert(clip_src_width <= kMaxStride);
+ // TODO(fbarchard): Remove clip_src_width alignment checks.
+ SIMD_ALIGNED(uint8 row[kMaxStride + 16]);
+ void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb,
+ ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
+ InterpolateRow_C;
+#if defined(HAS_INTERPOLATEROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && clip_src_width >= 16) {
+ InterpolateRow = InterpolateRow_Any_SSE2;
+ if (IS_ALIGNED(clip_src_width, 16)) {
+ InterpolateRow = InterpolateRow_Unaligned_SSE2;
+ if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16)) {
+ InterpolateRow = InterpolateRow_SSE2;
+ }
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && clip_src_width >= 16) {
+ InterpolateRow = InterpolateRow_Any_SSSE3;
+ if (IS_ALIGNED(clip_src_width, 16)) {
+ InterpolateRow = InterpolateRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16)) {
+ InterpolateRow = InterpolateRow_SSSE3;
+ }
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && clip_src_width >= 16) {
+ InterpolateRow = InterpolateRow_Any_NEON;
+ if (IS_ALIGNED(clip_src_width, 16)) {
+ InterpolateRow = InterpolateRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROWS_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) && clip_src_width >= 4 &&
+ IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4)) {
+ InterpolateRow = InterpolateRow_Any_MIPS_DSPR2;
+ if (IS_ALIGNED(clip_src_width, 4)) {
+ InterpolateRow = InterpolateRow_MIPS_DSPR2;
+ }
+ }
+#endif
+ void (*ScaleARGBFilterCols)(uint8* dst_argb, const uint8* src_argb,
+ int dst_width, int x, int dx) = ScaleARGBFilterCols_C;
+#if defined(HAS_SCALEARGBFILTERCOLS_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3;
+ }
+#endif
+ int maxy = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0;
+ for (int j = 0; j < dst_height; ++j) {
+ if (y > maxy) {
+ y = maxy;
+ }
+ int yi = y >> 16;
+ int yf = (y >> 8) & 255;
+ const uint8* src = src_argb + yi * src_stride;
+ InterpolateRow(row, src, src_stride, clip_src_width, yf);
+ ScaleARGBFilterCols(dst_argb, row, dst_width, x, dx);
+ dst_argb += dst_stride;
+ y += dy;
+ }
+}
+
+// Scale ARGB up with bilinear interpolation.
+static void ScaleARGBBilinearUp(int src_width, int src_height,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint8* src_argb, uint8* dst_argb,
+ int x, int dx, int y, int dy) {
+ assert(src_width > 0);
+ assert(src_height > 0);
+ assert(dst_width > 0);
+ assert(dst_height > 0);
+ assert(dst_width * 4 <= kMaxStride);
+ void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb,
+ ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
+ InterpolateRow_C;
+#if defined(HAS_INTERPOLATEROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && dst_width >= 4) {
+ InterpolateRow = InterpolateRow_Any_SSE2;
+ if (IS_ALIGNED(dst_width, 4)) {
+ InterpolateRow = InterpolateRow_Unaligned_SSE2;
+ if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
+ InterpolateRow = InterpolateRow_SSE2;
+ }
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && dst_width >= 4) {
+ InterpolateRow = InterpolateRow_Any_SSSE3;
+ if (IS_ALIGNED(dst_width, 4)) {
+ InterpolateRow = InterpolateRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
+ InterpolateRow = InterpolateRow_SSSE3;
+ }
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && dst_width >= 4) {
+ InterpolateRow = InterpolateRow_Any_NEON;
+ if (IS_ALIGNED(dst_width, 4)) {
+ InterpolateRow = InterpolateRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROWS_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) && dst_width >= 1 &&
+ IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) {
+ InterpolateRow = InterpolateRow_MIPS_DSPR2;
+ }
+#endif
+ void (*ScaleARGBFilterCols)(uint8* dst_argb, const uint8* src_argb,
+ int dst_width, int x, int dx) = ScaleARGBFilterCols_C;
+#if defined(HAS_SCALEARGBFILTERCOLS_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3;
+ }
+#endif
+ int maxy = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0;
+ if (y > maxy) {
+ y = maxy;
+ }
+ int yi = y >> 16;
+ const uint8* src = src_argb + yi * src_stride;
+ SIMD_ALIGNED(uint8 row[2 * kMaxStride]);
+ uint8* rowptr = row;
+ int rowstride = kMaxStride;
+ int lasty = yi;
+
+ ScaleARGBFilterCols(rowptr, src, dst_width, x, dx);
+ if (src_height > 1) {
+ src += src_stride;
+ }
+ ScaleARGBFilterCols(rowptr + rowstride, src, dst_width, x, dx);
+ src += src_stride;
+
+ for (int j = 0; j < dst_height; ++j) {
+ yi = y >> 16;
+ if (yi != lasty) {
+ if (y <= maxy) {
+ ScaleARGBFilterCols(rowptr, src, dst_width, x, dx);
+ rowptr += rowstride;
+ rowstride = -rowstride;
+ lasty = yi;
+ src += src_stride;
+ }
+ }
+ int yf = (y >> 8) & 255;
+ InterpolateRow(dst_argb, rowptr, rowstride, dst_width * 4, yf);
+ dst_argb += dst_stride;
+ y += dy;
+ }
+}
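+
+// Editorial note on the loop above: row[] holds two column-filtered source
+// rows. Each time a new source row is needed, rowptr advances by rowstride
+// and rowstride flips sign, so the freshly filtered row and the previously
+// filtered row alternate as the two inputs to InterpolateRow without copying.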
+
+// Scales a single row of pixels using point sampling.
+// Code is adapted from the libyuv bilinear YUV scaling, but with bilinear
+// interpolation disabled and ARGB pixels instead of YUV.
+static void ScaleARGBCols_C(uint8* dst_argb, const uint8* src_argb,
+ int dst_width, int x, int dx) {
+ const uint32* src = reinterpret_cast<const uint32*>(src_argb);
+ uint32* dst = reinterpret_cast<uint32*>(dst_argb);
+ for (int j = 0; j < dst_width - 1; j += 2) {
+ dst[0] = src[x >> 16];
+ x += dx;
+ dst[1] = src[x >> 16];
+ x += dx;
+ dst += 2;
+ }
+ if (dst_width & 1) {
+ dst[0] = src[x >> 16];
+ }
+}
+
+// ScaleARGB ARGB to/from any dimensions, without interpolation.
+// Fixed point math is used for performance: The upper 16 bits
+// of x and dx are the integer part of the source position and
+// the lower 16 bits are the fractional part.
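+// Worked example (editorial): scaling 640 source pixels down to 256 gives
+// dx = (640 << 16) / 256 = 0x00028000, i.e. a step of 2.5 source pixels,
+// and the point-sampling path starts at x = dx >> 1 = 0x00014000 so samples
+// are taken near pixel centers.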
+
+static void ScaleARGBSimple(int src_width, int src_height,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint8* src_argb, uint8* dst_argb,
+ int x, int dx, int y, int dy) {
+ void (*ScaleARGBCols)(uint8* dst_argb, const uint8* src_argb,
+ int dst_width, int x, int dx) = ScaleARGBCols_C;
+#if defined(HAS_SCALEARGBCOLS_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ ScaleARGBCols = ScaleARGBCols_SSE2;
+ }
+#endif
+
+ for (int i = 0; i < dst_height; ++i) {
+ ScaleARGBCols(dst_argb, src_argb + (y >> 16) * src_stride,
+ dst_width, x, dx);
+ dst_argb += dst_stride;
+ y += dy;
+ }
+}
+
+// ScaleARGB ARGB to/from any dimensions.
+static void ScaleARGBAnySize(int src_width, int src_height,
+ int dst_width, int dst_height,
+ int clip_width, int clip_height,
+ int src_stride, int dst_stride,
+ const uint8* src_argb, uint8* dst_argb,
+ int x, int dx, int y, int dy,
+ FilterMode filtering) {
+ if (filtering && dy < 65536 && dst_width * 4 <= kMaxStride) {
+ ScaleARGBBilinearUp(src_width, src_height,
+ clip_width, clip_height,
+ src_stride, dst_stride, src_argb, dst_argb,
+ x, dx, y, dy);
+ return;
+ }
+ if (filtering && src_width * 4 < kMaxStride) {
+ ScaleARGBBilinearDown(src_height,
+ clip_width, clip_height,
+ src_stride, dst_stride, src_argb, dst_argb,
+ x, dx, y, dy);
+ return;
+ }
+ ScaleARGBSimple(src_width, src_height, clip_width, clip_height,
+ src_stride, dst_stride, src_argb, dst_argb,
+ x, dx, y, dy);
+}
+
+// ScaleARGB an ARGB image.
+// This function in turn calls a scaling function
+// suited to the desired resolution.
+static void ScaleARGB(const uint8* src, int src_stride,
+ int src_width, int src_height,
+ uint8* dst, int dst_stride,
+ int dst_width, int dst_height,
+ int clip_x, int clip_y, int clip_width, int clip_height,
+ FilterMode filtering) {
+ // Negative src_height means invert the image.
+ if (src_height < 0) {
+ src_height = -src_height;
+ src = src + (src_height - 1) * src_stride;
+ src_stride = -src_stride;
+ }
+ // Initial source x/y coordinate and step values as 16.16 fixed point.
+ int dx = 0;
+ int dy = 0;
+ int x = 0;
+ int y = 0;
+ if (filtering) {
+ // Scale step for bilinear sampling renders last pixel once for upsample.
+ if (dst_width <= Abs(src_width)) {
+ dx = (Abs(src_width) << 16) / dst_width;
+ x = (dx >> 1) - 32768;
+ } else if (dst_width > 1) {
+ dx = ((Abs(src_width) - 1) << 16) / (dst_width - 1);
+ }
+ if (dst_height <= src_height) {
+ dy = (src_height << 16) / dst_height;
+ y = (dy >> 1) - 32768;
+ } else if (dst_height > 1) {
+ dy = ((src_height - 1) << 16) / (dst_height - 1);
+ }
+ } else {
+ // Scale step for point sampling duplicates all pixels equally.
+ dx = (Abs(src_width) << 16) / dst_width;
+ dy = (src_height << 16) / dst_height;
+ x = dx >> 1;
+ y = dy >> 1;
+ }
+ // Negative src_width means horizontally mirror.
+ if (src_width < 0) {
+ x += (dst_width - 1) * dx;
+ dx = -dx;
+ src_width = -src_width;
+ }
+ if (clip_x) {
+ x += clip_x * dx;
+ dst += clip_x * 4;
+ }
+ if (clip_y) {
+ y += clip_y * dy;
+ dst += clip_y * dst_stride;
+ }
+
+ // Special case for integer step values.
+ if (((dx | dy) & 0xffff) == 0) {
+ if (!dx || !dy) {
+ filtering = kFilterNone;
+ } else {
+      // Optimized even scale down, i.e. 2, 4, 6, 8, 10x.
+ if (!(dx & 0x10000) && !(dy & 0x10000)) {
+ if ((dx >> 16) == 2) {
+ // Optimized 1/2 horizontal.
+ ScaleARGBDown2(src_width, src_height, clip_width, clip_height,
+ src_stride, dst_stride, src, dst,
+ x, dx, y, dy, filtering);
+ return;
+ }
+ ScaleARGBDownEven(src_width, src_height, clip_width, clip_height,
+ src_stride, dst_stride, src, dst,
+ x, dx, y, dy, filtering);
+ return;
+ }
+      // Optimized odd scale down, i.e. 3, 5, 7, 9x.
+ if ((dx & 0x10000) && (dy & 0x10000)) {
+ filtering = kFilterNone;
+ if (dst_width == src_width && dst_height == src_height) {
+ // Straight copy.
+ ARGBCopy(src + (y >> 16) * src_stride + (x >> 16) * 4, src_stride,
+ dst, dst_stride, clip_width, clip_height);
+ return;
+ }
+ }
+ }
+ }
+ // Arbitrary scale up and/or down.
+ ScaleARGBAnySize(src_width, src_height,
+ dst_width, dst_height,
+ clip_width, clip_height,
+ src_stride, dst_stride, src, dst, x, dx, y, dy, filtering);
+}
+
+LIBYUV_API
+int ARGBScaleClip(const uint8* src_argb, int src_stride_argb,
+ int src_width, int src_height,
+ uint8* dst_argb, int dst_stride_argb,
+ int dst_width, int dst_height,
+ int clip_x, int clip_y, int clip_width, int clip_height,
+ enum FilterMode filtering) {
+ if (!src_argb || src_width == 0 || src_height == 0 ||
+ !dst_argb || dst_width <= 0 || dst_height <= 0 ||
+ clip_x < 0 || clip_y < 0 ||
+ src_width > 32767 || src_height > 32767 ||
+ (clip_x + clip_width) > dst_width ||
+ (clip_y + clip_height) > dst_height) {
+ return -1;
+ }
+ ScaleARGB(src_argb, src_stride_argb, src_width, src_height,
+ dst_argb, dst_stride_argb, dst_width, dst_height,
+ clip_x, clip_y, clip_width, clip_height, filtering);
+ return 0;
+}
+
+// Scale an ARGB image.
+LIBYUV_API
+int ARGBScale(const uint8* src_argb, int src_stride_argb,
+ int src_width, int src_height,
+ uint8* dst_argb, int dst_stride_argb,
+ int dst_width, int dst_height,
+ FilterMode filtering) {
+ if (!src_argb || src_width == 0 || src_height == 0 ||
+ !dst_argb || dst_width <= 0 || dst_height <= 0 ||
+ src_width > 32767 || src_height > 32767) {
+ return -1;
+ }
+ ScaleARGB(src_argb, src_stride_argb, src_width, src_height,
+ dst_argb, dst_stride_argb, dst_width, dst_height,
+ 0, 0, dst_width, dst_height, filtering);
+ return 0;
+}
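+
+// Usage sketch (editorial; buffer names are hypothetical and kFilterBilinear
+// is assumed from libyuv/scale.h): halve a 640x480 ARGB frame.
+//   // uint8* src_frame: 640 * 480 * 4 bytes, stride 640 * 4.
+//   // uint8* dst_frame: 320 * 240 * 4 bytes, stride 320 * 4.
+//   ARGBScale(src_frame, 640 * 4, 640, 480,
+//             dst_frame, 320 * 4, 320, 240, kFilterBilinear);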
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/chromium/third_party/libyuv/source/scale_argb_neon.cc b/chromium/third_party/libyuv/source/scale_argb_neon.cc
new file mode 100644
index 00000000000..51b00872441
--- /dev/null
+++ b/chromium/third_party/libyuv/source/scale_argb_neon.cc
@@ -0,0 +1,142 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/basic_types.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC Neon
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)
+
+void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */,
+ uint8* dst, int dst_width) {
+ asm volatile (
+ "1: \n"
+ // load even pixels into q0, odd into q1
+ "vld2.32 {q0, q1}, [%0]! \n"
+ "vld2.32 {q2, q3}, [%0]! \n"
+ "subs %2, %2, #8 \n" // 8 processed per loop
+ "vst1.8 {q1}, [%1]! \n" // store odd pixels
+ "vst1.8 {q3}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst), // %1
+ "+r"(dst_width) // %2
+ :
+ : "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List
+ );
+}
+
+void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width) {
+ asm volatile (
+    // Change src_stride into a pointer to the second row.
+ "add %1, %1, %0 \n"
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
+ "vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts.
+ "vld4.8 {d16, d18, d20, d22}, [%1]! \n" // load 8 more ARGB pixels.
+ "vld4.8 {d17, d19, d21, d23}, [%1]! \n" // load last 8 ARGB pixels.
+ "vpadal.u8 q0, q8 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q9 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q2, q10 \n" // R 16 bytes -> 8 shorts.
+ "vpadal.u8 q3, q11 \n" // A 16 bytes -> 8 shorts.
+ "vrshrn.u16 d0, q0, #2 \n" // downshift, round and pack
+ "vrshrn.u16 d1, q1, #2 \n"
+ "vrshrn.u16 d2, q2, #2 \n"
+ "vrshrn.u16 d3, q3, #2 \n"
+ "vst4.8 {d0, d1, d2, d3}, [%2]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(src_stride), // %1
+ "+r"(dst), // %2
+ "+r"(dst_width) // %3
+ :
+ : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
+ );
+}
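+
+// Editorial note: the sequence above is a rounding 2x2 box filter. vld4.8
+// deinterleaves B, G, R and A into separate registers, vpaddl.u8 adds each
+// channel's adjacent pixel pair within the row, vpadal.u8 accumulates the
+// matching pair from the second row, and vrshrn.u16 #2 divides the 4-sample
+// sums by 4 with rounding while narrowing back to bytes.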
+
+// Reads 4 pixels at a time.
+// Alignment requirement: src_argb 4 byte aligned.
+void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t, int src_stepx,
+ uint8* dst_argb, int dst_width) {
+ asm volatile (
+ "mov r12, %3, lsl #2 \n"
+ ".p2align 2 \n"
+ "1: \n"
+ "vld1.32 {d0[0]}, [%0], r12 \n"
+ "vld1.32 {d0[1]}, [%0], r12 \n"
+ "vld1.32 {d1[0]}, [%0], r12 \n"
+ "vld1.32 {d1[1]}, [%0], r12 \n"
+ "subs %2, %2, #4 \n" // 4 pixels per loop.
+ "vst1.8 {q0}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(dst_width) // %2
+ : "r"(src_stepx) // %3
+ : "memory", "cc", "r12", "q0"
+ );
+}
+
+// Reads 4 pixels at a time.
+// Alignment requirement: src_argb 4 byte aligned.
+void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
+ int src_stepx,
+ uint8* dst_argb, int dst_width) {
+ asm volatile (
+ "mov r12, %4, lsl #2 \n"
+ "add %1, %1, %0 \n"
+ ".p2align 2 \n"
+ "1: \n"
+ "vld1.8 {d0}, [%0], r12 \n" // Read 4 2x2 blocks -> 2x1
+ "vld1.8 {d1}, [%1], r12 \n"
+ "vld1.8 {d2}, [%0], r12 \n"
+ "vld1.8 {d3}, [%1], r12 \n"
+ "vld1.8 {d4}, [%0], r12 \n"
+ "vld1.8 {d5}, [%1], r12 \n"
+ "vld1.8 {d6}, [%0], r12 \n"
+ "vld1.8 {d7}, [%1], r12 \n"
+ "vaddl.u8 q0, d0, d1 \n"
+ "vaddl.u8 q1, d2, d3 \n"
+ "vaddl.u8 q2, d4, d5 \n"
+ "vaddl.u8 q3, d6, d7 \n"
+ "vswp.8 d1, d2 \n" // ab_cd -> ac_bd
+ "vswp.8 d5, d6 \n" // ef_gh -> eg_fh
+ "vadd.u16 q0, q0, q1 \n" // (a+b)_(c+d)
+ "vadd.u16 q2, q2, q3 \n" // (e+f)_(g+h)
+ "vrshrn.u16 d0, q0, #2 \n" // first 2 pixels.
+ "vrshrn.u16 d1, q2, #2 \n" // next 2 pixels.
+ "subs %3, %3, #4 \n" // 4 pixels per loop.
+ "vst1.8 {q0}, [%2]! \n"
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(src_stride), // %1
+ "+r"(dst_argb), // %2
+ "+r"(dst_width) // %3
+ : "r"(src_stepx) // %4
+ : "memory", "cc", "r12", "q0", "q1", "q2", "q3"
+ );
+}
+#endif // __ARM_NEON__
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/chromium/third_party/libyuv/source/scale_mips.cc b/chromium/third_party/libyuv/source/scale_mips.cc
new file mode 100644
index 00000000000..cfd48b5b053
--- /dev/null
+++ b/chromium/third_party/libyuv/source/scale_mips.cc
@@ -0,0 +1,638 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/basic_types.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC MIPS DSPR2
+#if !defined(LIBYUV_DISABLE_MIPS) && \
+ defined(__mips_dsp) && (__mips_dsp_rev >= 2)
+
+void ScaleRowDown2_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */,
+ uint8* dst, int dst_width) {
+ __asm__ __volatile__(
+ ".set push \n"
+ ".set noreorder \n"
+
+ "srl $t9, %[dst_width], 4 \n" // iterations -> by 16
+ "beqz $t9, 2f \n"
+ " nop \n"
+
+ "1: \n"
+ "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
+ "lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4|
+ "lw $t2, 8(%[src_ptr]) \n" // |11|10|9|8|
+ "lw $t3, 12(%[src_ptr]) \n" // |15|14|13|12|
+ "lw $t4, 16(%[src_ptr]) \n" // |19|18|17|16|
+ "lw $t5, 20(%[src_ptr]) \n" // |23|22|21|20|
+ "lw $t6, 24(%[src_ptr]) \n" // |27|26|25|24|
+ "lw $t7, 28(%[src_ptr]) \n" // |31|30|29|28|
+ // TODO(fbarchard): Use odd pixels instead of even.
+ "precr.qb.ph $t8, $t1, $t0 \n" // |6|4|2|0|
+ "precr.qb.ph $t0, $t3, $t2 \n" // |14|12|10|8|
+ "precr.qb.ph $t1, $t5, $t4 \n" // |22|20|18|16|
+ "precr.qb.ph $t2, $t7, $t6 \n" // |30|28|26|24|
+ "addiu %[src_ptr], %[src_ptr], 32 \n"
+ "addiu $t9, $t9, -1 \n"
+ "sw $t8, 0(%[dst]) \n"
+ "sw $t0, 4(%[dst]) \n"
+ "sw $t1, 8(%[dst]) \n"
+ "sw $t2, 12(%[dst]) \n"
+ "bgtz $t9, 1b \n"
+ " addiu %[dst], %[dst], 16 \n"
+
+ "2: \n"
+ "andi $t9, %[dst_width], 0xf \n" // residue
+ "beqz $t9, 3f \n"
+ " nop \n"
+
+ "21: \n"
+ "lbu $t0, 0(%[src_ptr]) \n"
+ "addiu %[src_ptr], %[src_ptr], 2 \n"
+ "addiu $t9, $t9, -1 \n"
+ "sb $t0, 0(%[dst]) \n"
+ "bgtz $t9, 21b \n"
+ " addiu %[dst], %[dst], 1 \n"
+
+ "3: \n"
+ ".set pop \n"
+ : [src_ptr] "+r" (src_ptr),
+ [dst] "+r" (dst)
+ : [dst_width] "r" (dst_width)
+ : "t0", "t1", "t2", "t3", "t4", "t5",
+ "t6", "t7", "t8", "t9"
+ );
+}
+
+void ScaleRowDown2Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width) {
+ const uint8* t = src_ptr + src_stride;
+
+ __asm__ __volatile__ (
+ ".set push \n"
+ ".set noreorder \n"
+
+ "srl $t9, %[dst_width], 3 \n" // iterations -> step 8
+ "bltz $t9, 2f \n"
+ " nop \n"
+
+ "1: \n"
+ "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
+ "lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4|
+ "lw $t2, 8(%[src_ptr]) \n" // |11|10|9|8|
+ "lw $t3, 12(%[src_ptr]) \n" // |15|14|13|12|
+ "lw $t4, 0(%[t]) \n" // |19|18|17|16|
+ "lw $t5, 4(%[t]) \n" // |23|22|21|20|
+ "lw $t6, 8(%[t]) \n" // |27|26|25|24|
+ "lw $t7, 12(%[t]) \n" // |31|30|29|28|
+ "addiu $t9, $t9, -1 \n"
+ "srl $t8, $t0, 16 \n" // |X|X|3|2|
+ "ins $t0, $t4, 16, 16 \n" // |17|16|1|0|
+ "ins $t4, $t8, 0, 16 \n" // |19|18|3|2|
+ "raddu.w.qb $t0, $t0 \n" // |17+16+1+0|
+ "raddu.w.qb $t4, $t4 \n" // |19+18+3+2|
+ "shra_r.w $t0, $t0, 2 \n" // |t0+2|>>2
+ "shra_r.w $t4, $t4, 2 \n" // |t4+2|>>2
+ "srl $t8, $t1, 16 \n" // |X|X|7|6|
+ "ins $t1, $t5, 16, 16 \n" // |21|20|5|4|
+    "ins        $t5, $t8, 0, 16                \n"  // |23|22|7|6|
+ "raddu.w.qb $t1, $t1 \n" // |21+20+5+4|
+ "raddu.w.qb $t5, $t5 \n" // |23+22+7+6|
+ "shra_r.w $t1, $t1, 2 \n" // |t1+2|>>2
+ "shra_r.w $t5, $t5, 2 \n" // |t5+2|>>2
+ "srl $t8, $t2, 16 \n" // |X|X|11|10|
+ "ins $t2, $t6, 16, 16 \n" // |25|24|9|8|
+ "ins $t6, $t8, 0, 16 \n" // |27|26|11|10|
+ "raddu.w.qb $t2, $t2 \n" // |25+24+9+8|
+ "raddu.w.qb $t6, $t6 \n" // |27+26+11+10|
+ "shra_r.w $t2, $t2, 2 \n" // |t2+2|>>2
+    "shra_r.w   $t6, $t6, 2                    \n"  // |t6+2|>>2
+ "srl $t8, $t3, 16 \n" // |X|X|15|14|
+ "ins $t3, $t7, 16, 16 \n" // |29|28|13|12|
+ "ins $t7, $t8, 0, 16 \n" // |31|30|15|14|
+ "raddu.w.qb $t3, $t3 \n" // |29+28+13+12|
+ "raddu.w.qb $t7, $t7 \n" // |31+30+15+14|
+ "shra_r.w $t3, $t3, 2 \n" // |t3+2|>>2
+ "shra_r.w $t7, $t7, 2 \n" // |t7+2|>>2
+ "addiu %[src_ptr], %[src_ptr], 16 \n"
+ "addiu %[t], %[t], 16 \n"
+ "sb $t0, 0(%[dst]) \n"
+ "sb $t4, 1(%[dst]) \n"
+ "sb $t1, 2(%[dst]) \n"
+ "sb $t5, 3(%[dst]) \n"
+ "sb $t2, 4(%[dst]) \n"
+ "sb $t6, 5(%[dst]) \n"
+ "sb $t3, 6(%[dst]) \n"
+ "sb $t7, 7(%[dst]) \n"
+ "bgtz $t9, 1b \n"
+ " addiu %[dst], %[dst], 8 \n"
+
+ "2: \n"
+ "andi $t9, %[dst_width], 0x7 \n" // x = residue
+ "beqz $t9, 3f \n"
+ " nop \n"
+
+ "21: \n"
+ "lwr $t1, 0(%[src_ptr]) \n"
+ "lwl $t1, 3(%[src_ptr]) \n"
+ "lwr $t2, 0(%[t]) \n"
+ "lwl $t2, 3(%[t]) \n"
+ "srl $t8, $t1, 16 \n"
+ "ins $t1, $t2, 16, 16 \n"
+ "ins $t2, $t8, 0, 16 \n"
+ "raddu.w.qb $t1, $t1 \n"
+ "raddu.w.qb $t2, $t2 \n"
+ "shra_r.w $t1, $t1, 2 \n"
+ "shra_r.w $t2, $t2, 2 \n"
+ "sb $t1, 0(%[dst]) \n"
+ "sb $t2, 1(%[dst]) \n"
+ "addiu %[src_ptr], %[src_ptr], 4 \n"
+ "addiu $t9, $t9, -2 \n"
+ "addiu %[t], %[t], 4 \n"
+ "bgtz $t9, 21b \n"
+ " addiu %[dst], %[dst], 2 \n"
+
+ "3: \n"
+ ".set pop \n"
+
+ : [src_ptr] "+r" (src_ptr),
+ [dst] "+r" (dst), [t] "+r" (t)
+ : [dst_width] "r" (dst_width)
+ : "t0", "t1", "t2", "t3", "t4", "t5",
+ "t6", "t7", "t8", "t9"
+ );
+}
+
+void ScaleRowDown4_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */,
+ uint8* dst, int dst_width) {
+ __asm__ __volatile__ (
+ ".set push \n"
+ ".set noreorder \n"
+
+ "srl $t9, %[dst_width], 3 \n"
+ "beqz $t9, 2f \n"
+ " nop \n"
+
+ "1: \n"
+ "lw $t1, 0(%[src_ptr]) \n" // |3|2|1|0|
+ "lw $t2, 4(%[src_ptr]) \n" // |7|6|5|4|
+ "lw $t3, 8(%[src_ptr]) \n" // |11|10|9|8|
+ "lw $t4, 12(%[src_ptr]) \n" // |15|14|13|12|
+ "lw $t5, 16(%[src_ptr]) \n" // |19|18|17|16|
+ "lw $t6, 20(%[src_ptr]) \n" // |23|22|21|20|
+ "lw $t7, 24(%[src_ptr]) \n" // |27|26|25|24|
+ "lw $t8, 28(%[src_ptr]) \n" // |31|30|29|28|
+ "precr.qb.ph $t1, $t2, $t1 \n" // |6|4|2|0|
+ "precr.qb.ph $t2, $t4, $t3 \n" // |14|12|10|8|
+ "precr.qb.ph $t5, $t6, $t5 \n" // |22|20|18|16|
+ "precr.qb.ph $t6, $t8, $t7 \n" // |30|28|26|24|
+ "precr.qb.ph $t1, $t2, $t1 \n" // |12|8|4|0|
+ "precr.qb.ph $t5, $t6, $t5 \n" // |28|24|20|16|
+ "addiu %[src_ptr], %[src_ptr], 32 \n"
+ "addiu $t9, $t9, -1 \n"
+ "sw $t1, 0(%[dst]) \n"
+ "sw $t5, 4(%[dst]) \n"
+ "bgtz $t9, 1b \n"
+ " addiu %[dst], %[dst], 8 \n"
+
+ "2: \n"
+ "andi $t9, %[dst_width], 7 \n" // residue
+ "beqz $t9, 3f \n"
+ " nop \n"
+
+ "21: \n"
+ "lbu $t1, 0(%[src_ptr]) \n"
+ "addiu %[src_ptr], %[src_ptr], 4 \n"
+ "addiu $t9, $t9, -1 \n"
+ "sb $t1, 0(%[dst]) \n"
+ "bgtz $t9, 21b \n"
+ " addiu %[dst], %[dst], 1 \n"
+
+ "3: \n"
+ ".set pop \n"
+ : [src_ptr] "+r" (src_ptr),
+ [dst] "+r" (dst)
+ : [dst_width] "r" (dst_width)
+ : "t1", "t2", "t3", "t4", "t5",
+ "t6", "t7", "t8", "t9"
+ );
+}
+
+void ScaleRowDown4Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width) {
+ intptr_t stride = src_stride;
+ const uint8* s1 = src_ptr + stride;
+ const uint8* s2 = s1 + stride;
+ const uint8* s3 = s2 + stride;
+
+ __asm__ __volatile__ (
+ ".set push \n"
+ ".set noreorder \n"
+
+ "srl $t9, %[dst_width], 1 \n"
+ "andi $t8, %[dst_width], 1 \n"
+
+ "1: \n"
+ "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
+ "lw $t1, 0(%[s1]) \n" // |7|6|5|4|
+ "lw $t2, 0(%[s2]) \n" // |11|10|9|8|
+ "lw $t3, 0(%[s3]) \n" // |15|14|13|12|
+ "lw $t4, 4(%[src_ptr]) \n" // |19|18|17|16|
+ "lw $t5, 4(%[s1]) \n" // |23|22|21|20|
+ "lw $t6, 4(%[s2]) \n" // |27|26|25|24|
+ "lw $t7, 4(%[s3]) \n" // |31|30|29|28|
+ "raddu.w.qb $t0, $t0 \n" // |3 + 2 + 1 + 0|
+ "raddu.w.qb $t1, $t1 \n" // |7 + 6 + 5 + 4|
+ "raddu.w.qb $t2, $t2 \n" // |11 + 10 + 9 + 8|
+ "raddu.w.qb $t3, $t3 \n" // |15 + 14 + 13 + 12|
+ "raddu.w.qb $t4, $t4 \n" // |19 + 18 + 17 + 16|
+ "raddu.w.qb $t5, $t5 \n" // |23 + 22 + 21 + 20|
+ "raddu.w.qb $t6, $t6 \n" // |27 + 26 + 25 + 24|
+ "raddu.w.qb $t7, $t7 \n" // |31 + 30 + 29 + 28|
+ "add $t0, $t0, $t1 \n"
+ "add $t1, $t2, $t3 \n"
+ "add $t0, $t0, $t1 \n"
+ "add $t4, $t4, $t5 \n"
+ "add $t6, $t6, $t7 \n"
+ "add $t4, $t4, $t6 \n"
+ "shra_r.w $t0, $t0, 4 \n"
+ "shra_r.w $t4, $t4, 4 \n"
+ "sb $t0, 0(%[dst]) \n"
+ "sb $t4, 1(%[dst]) \n"
+ "addiu %[src_ptr], %[src_ptr], 8 \n"
+ "addiu %[s1], %[s1], 8 \n"
+ "addiu %[s2], %[s2], 8 \n"
+ "addiu %[s3], %[s3], 8 \n"
+ "addiu $t9, $t9, -1 \n"
+ "bgtz $t9, 1b \n"
+ " addiu %[dst], %[dst], 2 \n"
+ "beqz $t8, 2f \n"
+ " nop \n"
+
+ "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
+ "lw $t1, 0(%[s1]) \n" // |7|6|5|4|
+ "lw $t2, 0(%[s2]) \n" // |11|10|9|8|
+ "lw $t3, 0(%[s3]) \n" // |15|14|13|12|
+ "raddu.w.qb $t0, $t0 \n" // |3 + 2 + 1 + 0|
+ "raddu.w.qb $t1, $t1 \n" // |7 + 6 + 5 + 4|
+ "raddu.w.qb $t2, $t2 \n" // |11 + 10 + 9 + 8|
+ "raddu.w.qb $t3, $t3 \n" // |15 + 14 + 13 + 12|
+ "add $t0, $t0, $t1 \n"
+ "add $t1, $t2, $t3 \n"
+ "add $t0, $t0, $t1 \n"
+ "shra_r.w $t0, $t0, 4 \n"
+ "sb $t0, 0(%[dst]) \n"
+
+ "2: \n"
+ ".set pop \n"
+
+ : [src_ptr] "+r" (src_ptr),
+ [dst] "+r" (dst),
+ [s1] "+r" (s1),
+ [s2] "+r" (s2),
+ [s3] "+r" (s3)
+ : [dst_width] "r" (dst_width)
+ : "t0", "t1", "t2", "t3", "t4", "t5",
+ "t6","t7", "t8", "t9"
+ );
+}
+
+void ScaleRowDown34_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */,
+ uint8* dst, int dst_width) {
+ __asm__ __volatile__ (
+ ".set push \n"
+ ".set noreorder \n"
+ "1: \n"
+ "lw $t1, 0(%[src_ptr]) \n" // |3|2|1|0|
+ "lw $t2, 4(%[src_ptr]) \n" // |7|6|5|4|
+ "lw $t3, 8(%[src_ptr]) \n" // |11|10|9|8|
+ "lw $t4, 12(%[src_ptr]) \n" // |15|14|13|12|
+ "lw $t5, 16(%[src_ptr]) \n" // |19|18|17|16|
+ "lw $t6, 20(%[src_ptr]) \n" // |23|22|21|20|
+ "lw $t7, 24(%[src_ptr]) \n" // |27|26|25|24|
+ "lw $t8, 28(%[src_ptr]) \n" // |31|30|29|28|
+ "precrq.qb.ph $t0, $t2, $t4 \n" // |7|5|15|13|
+    "precrq.qb.ph  $t9, $t6, $t8               \n"  // |23|21|31|29|
+ "addiu %[dst_width], %[dst_width], -24 \n"
+ "ins $t1, $t1, 8, 16 \n" // |3|1|0|X|
+ "ins $t4, $t0, 8, 16 \n" // |X|15|13|12|
+ "ins $t5, $t5, 8, 16 \n" // |19|17|16|X|
+ "ins $t8, $t9, 8, 16 \n" // |X|31|29|28|
+ "addiu %[src_ptr], %[src_ptr], 32 \n"
+ "packrl.ph $t0, $t3, $t0 \n" // |9|8|7|5|
+ "packrl.ph $t9, $t7, $t9 \n" // |25|24|23|21|
+ "prepend $t1, $t2, 8 \n" // |4|3|1|0|
+ "prepend $t3, $t4, 24 \n" // |15|13|12|11|
+ "prepend $t5, $t6, 8 \n" // |20|19|17|16|
+ "prepend $t7, $t8, 24 \n" // |31|29|28|27|
+ "sw $t1, 0(%[dst]) \n"
+ "sw $t0, 4(%[dst]) \n"
+ "sw $t3, 8(%[dst]) \n"
+ "sw $t5, 12(%[dst]) \n"
+ "sw $t9, 16(%[dst]) \n"
+ "sw $t7, 20(%[dst]) \n"
+ "bnez %[dst_width], 1b \n"
+ " addiu %[dst], %[dst], 24 \n"
+ ".set pop \n"
+ : [src_ptr] "+r" (src_ptr),
+ [dst] "+r" (dst),
+ [dst_width] "+r" (dst_width)
+ :
+ : "t0", "t1", "t2", "t3", "t4", "t5",
+ "t6","t7", "t8", "t9"
+ );
+}
+
+void ScaleRowDown34_0_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* d, int dst_width) {
+ __asm__ __volatile__ (
+ ".set push \n"
+ ".set noreorder \n"
+ "repl.ph $t3, 3 \n" // 0x00030003
+ "1: \n"
+ "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0|
+ "lwx $t1, %[src_stride](%[src_ptr]) \n" // |T3|T2|T1|T0|
+ "rotr $t2, $t0, 8 \n" // |S0|S3|S2|S1|
+ "rotr $t6, $t1, 8 \n" // |T0|T3|T2|T1|
+ "muleu_s.ph.qbl $t4, $t2, $t3 \n" // |S0*3|S3*3|
+ "muleu_s.ph.qbl $t5, $t6, $t3 \n" // |T0*3|T3*3|
+ "andi $t0, $t2, 0xFFFF \n" // |0|0|S2|S1|
+ "andi $t1, $t6, 0xFFFF \n" // |0|0|T2|T1|
+ "raddu.w.qb $t0, $t0 \n"
+ "raddu.w.qb $t1, $t1 \n"
+ "shra_r.w $t0, $t0, 1 \n"
+ "shra_r.w $t1, $t1, 1 \n"
+ "preceu.ph.qbr $t2, $t2 \n" // |0|S2|0|S1|
+ "preceu.ph.qbr $t6, $t6 \n" // |0|T2|0|T1|
+ "rotr $t2, $t2, 16 \n" // |0|S1|0|S2|
+ "rotr $t6, $t6, 16 \n" // |0|T1|0|T2|
+ "addu.ph $t2, $t2, $t4 \n"
+ "addu.ph $t6, $t6, $t5 \n"
+ "sll $t5, $t0, 1 \n"
+ "add $t0, $t5, $t0 \n"
+ "shra_r.ph $t2, $t2, 2 \n"
+ "shra_r.ph $t6, $t6, 2 \n"
+ "shll.ph $t4, $t2, 1 \n"
+ "addq.ph $t4, $t4, $t2 \n"
+ "addu $t0, $t0, $t1 \n"
+ "addiu %[src_ptr], %[src_ptr], 4 \n"
+ "shra_r.w $t0, $t0, 2 \n"
+ "addu.ph $t6, $t6, $t4 \n"
+ "shra_r.ph $t6, $t6, 2 \n"
+ "srl $t1, $t6, 16 \n"
+ "addiu %[dst_width], %[dst_width], -3 \n"
+ "sb $t1, 0(%[d]) \n"
+ "sb $t0, 1(%[d]) \n"
+ "sb $t6, 2(%[d]) \n"
+ "bgtz %[dst_width], 1b \n"
+ " addiu %[d], %[d], 3 \n"
+ "3: \n"
+ ".set pop \n"
+ : [src_ptr] "+r" (src_ptr),
+ [src_stride] "+r" (src_stride),
+ [d] "+r" (d),
+ [dst_width] "+r" (dst_width)
+ :
+ : "t0", "t1", "t2", "t3",
+ "t4", "t5", "t6"
+ );
+}
+
+void ScaleRowDown34_1_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* d, int dst_width) {
+ __asm__ __volatile__ (
+ ".set push \n"
+ ".set noreorder \n"
+ "repl.ph $t2, 3 \n" // 0x00030003
+ "1: \n"
+ "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0|
+ "lwx $t1, %[src_stride](%[src_ptr]) \n" // |T3|T2|T1|T0|
+ "rotr $t4, $t0, 8 \n" // |S0|S3|S2|S1|
+ "rotr $t6, $t1, 8 \n" // |T0|T3|T2|T1|
+ "muleu_s.ph.qbl $t3, $t4, $t2 \n" // |S0*3|S3*3|
+ "muleu_s.ph.qbl $t5, $t6, $t2 \n" // |T0*3|T3*3|
+ "andi $t0, $t4, 0xFFFF \n" // |0|0|S2|S1|
+ "andi $t1, $t6, 0xFFFF \n" // |0|0|T2|T1|
+ "raddu.w.qb $t0, $t0 \n"
+ "raddu.w.qb $t1, $t1 \n"
+ "shra_r.w $t0, $t0, 1 \n"
+ "shra_r.w $t1, $t1, 1 \n"
+ "preceu.ph.qbr $t4, $t4 \n" // |0|S2|0|S1|
+ "preceu.ph.qbr $t6, $t6 \n" // |0|T2|0|T1|
+ "rotr $t4, $t4, 16 \n" // |0|S1|0|S2|
+ "rotr $t6, $t6, 16 \n" // |0|T1|0|T2|
+ "addu.ph $t4, $t4, $t3 \n"
+ "addu.ph $t6, $t6, $t5 \n"
+ "shra_r.ph $t6, $t6, 2 \n"
+ "shra_r.ph $t4, $t4, 2 \n"
+ "addu.ph $t6, $t6, $t4 \n"
+ "addiu %[src_ptr], %[src_ptr], 4 \n"
+ "shra_r.ph $t6, $t6, 1 \n"
+ "addu $t0, $t0, $t1 \n"
+ "addiu %[dst_width], %[dst_width], -3 \n"
+ "shra_r.w $t0, $t0, 1 \n"
+ "srl $t1, $t6, 16 \n"
+ "sb $t1, 0(%[d]) \n"
+ "sb $t0, 1(%[d]) \n"
+ "sb $t6, 2(%[d]) \n"
+ "bgtz %[dst_width], 1b \n"
+ " addiu %[d], %[d], 3 \n"
+ "3: \n"
+ ".set pop \n"
+ : [src_ptr] "+r" (src_ptr),
+ [src_stride] "+r" (src_stride),
+ [d] "+r" (d),
+ [dst_width] "+r" (dst_width)
+ :
+ : "t0", "t1", "t2", "t3",
+ "t4", "t5", "t6"
+ );
+}
+
+void ScaleRowDown38_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */,
+ uint8* dst, int dst_width) {
+ __asm__ __volatile__ (
+ ".set push \n"
+ ".set noreorder \n"
+ "1: \n"
+ "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
+ "lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4|
+ "lw $t2, 8(%[src_ptr]) \n" // |11|10|9|8|
+ "lw $t3, 12(%[src_ptr]) \n" // |15|14|13|12|
+ "lw $t4, 16(%[src_ptr]) \n" // |19|18|17|16|
+ "lw $t5, 20(%[src_ptr]) \n" // |23|22|21|20|
+ "lw $t6, 24(%[src_ptr]) \n" // |27|26|25|24|
+ "lw $t7, 28(%[src_ptr]) \n" // |31|30|29|28|
+ "wsbh $t0, $t0 \n" // |2|3|0|1|
+ "wsbh $t6, $t6 \n" // |26|27|24|25|
+ "srl $t0, $t0, 8 \n" // |X|2|3|0|
+ "srl $t3, $t3, 16 \n" // |X|X|15|14|
+ "srl $t5, $t5, 16 \n" // |X|X|23|22|
+ "srl $t7, $t7, 16 \n" // |X|X|31|30|
+ "ins $t1, $t2, 24, 8 \n" // |8|6|5|4|
+ "ins $t6, $t5, 0, 8 \n" // |26|27|24|22|
+ "ins $t1, $t0, 0, 16 \n" // |8|6|3|0|
+ "ins $t6, $t7, 24, 8 \n" // |30|27|24|22|
+ "prepend $t2, $t3, 24 \n" // |X|15|14|11|
+ "ins $t4, $t4, 16, 8 \n" // |19|16|17|X|
+ "ins $t4, $t2, 0, 16 \n" // |19|16|14|11|
+ "addiu %[src_ptr], %[src_ptr], 32 \n"
+ "addiu %[dst_width], %[dst_width], -12 \n"
+ "addiu $t8,%[dst_width], -12 \n"
+ "sw $t1, 0(%[dst]) \n"
+ "sw $t4, 4(%[dst]) \n"
+ "sw $t6, 8(%[dst]) \n"
+ "bgez $t8, 1b \n"
+ " addiu %[dst], %[dst], 12 \n"
+ ".set pop \n"
+ : [src_ptr] "+r" (src_ptr),
+ [dst] "+r" (dst),
+ [dst_width] "+r" (dst_width)
+ :
+ : "t0", "t1", "t2", "t3", "t4",
+ "t5", "t6", "t7", "t8"
+ );
+}
+
+void ScaleRowDown38_2_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ intptr_t stride = src_stride;
+ const uint8* t = src_ptr + stride;
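+  // Editorial note: 0x2AAA is approximately 65536 / 6, so multiplying a
+  // six-sample sum by it and taking the upper 16 bits approximates a
+  // divide-by-6 for the box average.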
+ const int c = 0x2AAA;
+
+ __asm__ __volatile__ (
+ ".set push \n"
+ ".set noreorder \n"
+ "1: \n"
+ "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0|
+ "lw $t1, 4(%[src_ptr]) \n" // |S7|S6|S5|S4|
+ "lw $t2, 0(%[t]) \n" // |T3|T2|T1|T0|
+ "lw $t3, 4(%[t]) \n" // |T7|T6|T5|T4|
+ "rotr $t1, $t1, 16 \n" // |S5|S4|S7|S6|
+ "packrl.ph $t4, $t1, $t3 \n" // |S7|S6|T7|T6|
+ "packrl.ph $t5, $t3, $t1 \n" // |T5|T4|S5|S4|
+ "raddu.w.qb $t4, $t4 \n" // S7+S6+T7+T6
+ "raddu.w.qb $t5, $t5 \n" // T5+T4+S5+S4
+ "precrq.qb.ph $t6, $t0, $t2 \n" // |S3|S1|T3|T1|
+ "precrq.qb.ph $t6, $t6, $t6 \n" // |S3|T3|S3|T3|
+ "srl $t4, $t4, 2 \n" // t4 / 4
+ "srl $t6, $t6, 16 \n" // |0|0|S3|T3|
+ "raddu.w.qb $t6, $t6 \n" // 0+0+S3+T3
+ "addu $t6, $t5, $t6 \n"
+ "mul $t6, $t6, %[c] \n" // t6 * 0x2AAA
+ "sll $t0, $t0, 8 \n" // |S2|S1|S0|0|
+ "sll $t2, $t2, 8 \n" // |T2|T1|T0|0|
+ "raddu.w.qb $t0, $t0 \n" // S2+S1+S0+0
+ "raddu.w.qb $t2, $t2 \n" // T2+T1+T0+0
+ "addu $t0, $t0, $t2 \n"
+ "mul $t0, $t0, %[c] \n" // t0 * 0x2AAA
+ "addiu %[src_ptr], %[src_ptr], 8 \n"
+ "addiu %[t], %[t], 8 \n"
+ "addiu %[dst_width], %[dst_width], -3 \n"
+ "addiu %[dst_ptr], %[dst_ptr], 3 \n"
+ "srl $t6, $t6, 16 \n"
+ "srl $t0, $t0, 16 \n"
+ "sb $t4, -1(%[dst_ptr]) \n"
+ "sb $t6, -2(%[dst_ptr]) \n"
+ "bgtz %[dst_width], 1b \n"
+ " sb $t0, -3(%[dst_ptr]) \n"
+ ".set pop \n"
+ : [src_ptr] "+r" (src_ptr),
+ [dst_ptr] "+r" (dst_ptr),
+ [t] "+r" (t),
+ [dst_width] "+r" (dst_width)
+ : [c] "r" (c)
+ : "t0", "t1", "t2", "t3", "t4", "t5", "t6"
+ );
+}
+
+void ScaleRowDown38_3_Box_MIPS_DSPR2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ intptr_t stride = src_stride;
+ const uint8* s1 = src_ptr + stride;
+ stride += stride;
+ const uint8* s2 = src_ptr + stride;
+ const int c1 = 0x1C71;
+ const int c2 = 0x2AAA;
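+  // 0x1C71 and 0x2AAA are approximately 65536 / 9 and 65536 / 6; the products
+  // are shifted right by 16 below, turning nine- and six-pixel sums into averages.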
+
+ __asm__ __volatile__ (
+ ".set push \n"
+ ".set noreorder \n"
+ "1: \n"
+ "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0|
+ "lw $t1, 4(%[src_ptr]) \n" // |S7|S6|S5|S4|
+ "lw $t2, 0(%[s1]) \n" // |T3|T2|T1|T0|
+ "lw $t3, 4(%[s1]) \n" // |T7|T6|T5|T4|
+ "lw $t4, 0(%[s2]) \n" // |R3|R2|R1|R0|
+ "lw $t5, 4(%[s2]) \n" // |R7|R6|R5|R4|
+ "rotr $t1, $t1, 16 \n" // |S5|S4|S7|S6|
+ "packrl.ph $t6, $t1, $t3 \n" // |S7|S6|T7|T6|
+ "raddu.w.qb $t6, $t6 \n" // S7+S6+T7+T6
+ "packrl.ph $t7, $t3, $t1 \n" // |T5|T4|S5|S4|
+ "raddu.w.qb $t7, $t7 \n" // T5+T4+S5+S4
+ "sll $t8, $t5, 16 \n" // |R5|R4|0|0|
+ "raddu.w.qb $t8, $t8 \n" // R5+R4
+ "addu $t7, $t7, $t8 \n"
+ "srl $t8, $t5, 16 \n" // |0|0|R7|R6|
+ "raddu.w.qb $t8, $t8 \n" // R7 + R6
+ "addu $t6, $t6, $t8 \n"
+ "mul $t6, $t6, %[c2] \n" // t6 * 0x2AAA
+ "precrq.qb.ph $t8, $t0, $t2 \n" // |S3|S1|T3|T1|
+ "precrq.qb.ph $t8, $t8, $t4 \n" // |S3|T3|R3|R1|
+ "srl $t8, $t8, 8 \n" // |0|S3|T3|R3|
+ "raddu.w.qb $t8, $t8 \n" // S3 + T3 + R3
+ "addu $t7, $t7, $t8 \n"
+ "mul $t7, $t7, %[c1] \n" // t7 * 0x1C71
+ "sll $t0, $t0, 8 \n" // |S2|S1|S0|0|
+ "sll $t2, $t2, 8 \n" // |T2|T1|T0|0|
+ "sll $t4, $t4, 8 \n" // |R2|R1|R0|0|
+ "raddu.w.qb $t0, $t0 \n"
+ "raddu.w.qb $t2, $t2 \n"
+ "raddu.w.qb $t4, $t4 \n"
+ "addu $t0, $t0, $t2 \n"
+ "addu $t0, $t0, $t4 \n"
+ "mul $t0, $t0, %[c1] \n" // t0 * 0x1C71
+ "addiu %[src_ptr], %[src_ptr], 8 \n"
+ "addiu %[s1], %[s1], 8 \n"
+ "addiu %[s2], %[s2], 8 \n"
+ "addiu %[dst_width], %[dst_width], -3 \n"
+ "addiu %[dst_ptr], %[dst_ptr], 3 \n"
+ "srl $t6, $t6, 16 \n"
+ "srl $t7, $t7, 16 \n"
+ "srl $t0, $t0, 16 \n"
+ "sb $t6, -1(%[dst_ptr]) \n"
+ "sb $t7, -2(%[dst_ptr]) \n"
+ "bgtz %[dst_width], 1b \n"
+ " sb $t0, -3(%[dst_ptr]) \n"
+ ".set pop \n"
+ : [src_ptr] "+r" (src_ptr),
+ [dst_ptr] "+r" (dst_ptr),
+ [s1] "+r" (s1),
+ [s2] "+r" (s2),
+ [dst_width] "+r" (dst_width)
+ : [c1] "r" (c1), [c2] "r" (c2)
+ : "t0", "t1", "t2", "t3", "t4",
+ "t5", "t6", "t7", "t8"
+ );
+}
+
+#endif // defined(__mips_dsp) && (__mips_dsp_rev >= 2)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
diff --git a/chromium/third_party/libyuv/source/scale_neon.cc b/chromium/third_party/libyuv/source/scale_neon.cc
new file mode 100644
index 00000000000..a370349a72f
--- /dev/null
+++ b/chromium/third_party/libyuv/source/scale_neon.cc
@@ -0,0 +1,554 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/basic_types.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC Neon
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)
+
+// NEON downscalers with interpolation.
+// Provided by Fritz Koenig
+
+void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */,
+ uint8* dst, int dst_width) {
+ asm volatile (
+ "1: \n"
+ // load even pixels into q0, odd into q1
+ "vld2.8 {q0, q1}, [%0]! \n"
+ "subs %2, %2, #16 \n" // 16 processed per loop
+ "vst1.8 {q1}, [%1]! \n" // store odd pixels
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst), // %1
+ "+r"(dst_width) // %2
+ :
+ : "q0", "q1" // Clobber List
+ );
+}
+
+void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width) {
+ asm volatile (
+ // change the stride to row 2 pointer
+ "add %1, %0 \n"
+ "1: \n"
+ "vld1.8 {q0, q1}, [%0]! \n" // load row 1 and post inc
+ "vld1.8 {q2, q3}, [%1]! \n" // load row 2 and post inc
+ "subs %3, %3, #16 \n" // 16 processed per loop
+ "vpaddl.u8 q0, q0 \n" // row 1 add adjacent
+ "vpaddl.u8 q1, q1 \n"
+ "vpadal.u8 q0, q2 \n" // row 2 add adjacent + row1
+ "vpadal.u8 q1, q3 \n"
+ "vrshrn.u16 d0, q0, #2 \n" // downshift, round and pack
+ "vrshrn.u16 d1, q1, #2 \n"
+ "vst1.8 {q0}, [%2]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(src_stride), // %1
+ "+r"(dst), // %2
+ "+r"(dst_width) // %3
+ :
+ : "q0", "q1", "q2", "q3" // Clobber List
+ );
+}
+
+void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
+ "subs %2, %2, #8 \n" // 8 processed per loop
+ "vst1.8 {d2}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ :
+ : "q0", "q1", "memory", "cc"
+ );
+}
+
+void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ "add r4, %0, %3 \n"
+ "add r5, r4, %3 \n"
+ "add %3, r5, %3 \n"
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n" // load up 16x4
+ "vld1.8 {q1}, [r4]! \n"
+ "vld1.8 {q2}, [r5]! \n"
+ "vld1.8 {q3}, [%3]! \n"
+ "subs %2, %2, #4 \n"
+ "vpaddl.u8 q0, q0 \n"
+ "vpadal.u8 q0, q1 \n"
+ "vpadal.u8 q0, q2 \n"
+ "vpadal.u8 q0, q3 \n"
+ "vpaddl.u16 q0, q0 \n"
+ "vrshrn.u32 d0, q0, #4 \n" // divide by 16 w/rounding
+ "vmovn.u16 d0, q0 \n"
+ "vst1.32 {d0[0]}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"(src_stride) // %3
+ : "r4", "r5", "q0", "q1", "q2", "q3", "memory", "cc"
+ );
+}
+
+// Down scale from 4 to 3 pixels. Use the neon multilane read/write
+// to load every 4th pixel into 4 different registers.
+// Point samples 32 pixels to 24 pixels.
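+// Equivalent scalar sampling: for each group of 4 source bytes, keep bytes
+// 0, 1 and 3 (dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[3]).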
+void ScaleRowDown34_NEON(const uint8* src_ptr,
+ ptrdiff_t /* src_stride */,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
+ "subs %2, %2, #24 \n"
+ "vmov d2, d3 \n" // order d0, d1, d2
+ "vst3.8 {d0, d1, d2}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ :
+ : "d0", "d1", "d2", "d3", "memory", "cc"
+ );
+}
+
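+// Down scale 4 to 3 pixels with box filtering: blend the two source rows with
+// a 3:1 weight, then filter each group of 4 blended pixels down to 3 as
+// commented below.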
+void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ "vmov.u8 d24, #3 \n"
+ "add %3, %0 \n"
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
+ "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
+ "subs %2, %2, #24 \n"
+
+ // filter src line 0 with src line 1
+ // expand chars to shorts to allow for room
+ // when adding lines together
+ "vmovl.u8 q8, d4 \n"
+ "vmovl.u8 q9, d5 \n"
+ "vmovl.u8 q10, d6 \n"
+ "vmovl.u8 q11, d7 \n"
+
+ // 3 * line_0 + line_1
+ "vmlal.u8 q8, d0, d24 \n"
+ "vmlal.u8 q9, d1, d24 \n"
+ "vmlal.u8 q10, d2, d24 \n"
+ "vmlal.u8 q11, d3, d24 \n"
+
+ // (3 * line_0 + line_1) >> 2
+ "vqrshrn.u16 d0, q8, #2 \n"
+ "vqrshrn.u16 d1, q9, #2 \n"
+ "vqrshrn.u16 d2, q10, #2 \n"
+ "vqrshrn.u16 d3, q11, #2 \n"
+
+ // a0 = (src[0] * 3 + s[1] * 1) >> 2
+ "vmovl.u8 q8, d1 \n"
+ "vmlal.u8 q8, d0, d24 \n"
+ "vqrshrn.u16 d0, q8, #2 \n"
+
+ // a1 = (src[1] * 1 + s[2] * 1) >> 1
+ "vrhadd.u8 d1, d1, d2 \n"
+
+ // a2 = (src[2] * 1 + s[3] * 3) >> 2
+ "vmovl.u8 q8, d2 \n"
+ "vmlal.u8 q8, d3, d24 \n"
+ "vqrshrn.u16 d2, q8, #2 \n"
+
+ "vst3.8 {d0, d1, d2}, [%1]! \n"
+
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(src_stride) // %3
+ :
+ : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory", "cc"
+ );
+}
+
+void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ "vmov.u8 d24, #3 \n"
+ "add %3, %0 \n"
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
+ "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
+ "subs %2, %2, #24 \n"
+ // average src line 0 with src line 1
+ "vrhadd.u8 q0, q0, q2 \n"
+ "vrhadd.u8 q1, q1, q3 \n"
+
+ // a0 = (src[0] * 3 + s[1] * 1) >> 2
+ "vmovl.u8 q3, d1 \n"
+ "vmlal.u8 q3, d0, d24 \n"
+ "vqrshrn.u16 d0, q3, #2 \n"
+
+ // a1 = (src[1] * 1 + s[2] * 1) >> 1
+ "vrhadd.u8 d1, d1, d2 \n"
+
+ // a2 = (src[2] * 1 + s[3] * 3) >> 2
+ "vmovl.u8 q3, d2 \n"
+ "vmlal.u8 q3, d3, d24 \n"
+ "vqrshrn.u16 d2, q3, #2 \n"
+
+ "vst3.8 {d0, d1, d2}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(src_stride) // %3
+ :
+ : "r4", "q0", "q1", "q2", "q3", "d24", "memory", "cc"
+ );
+}
+
+#define HAS_SCALEROWDOWN38_NEON
+const uvec8 kShuf38 =
+ { 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 };
+const uvec8 kShuf38_2 =
+ { 0, 8, 16, 2, 10, 17, 4, 12, 18, 6, 14, 19, 0, 0, 0, 0 };
+const vec16 kMult38_Div6 =
+ { 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12,
+ 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 };
+const vec16 kMult38_Div9 =
+ { 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18,
+ 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 };
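+// These multipliers are used with vqrdmulh, which returns the high 16 bits of
+// 2*a*b (rounded); 65536/12 therefore divides by ~6 and 65536/18 by ~9.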
+
+// 32 -> 12
+void ScaleRowDown38_NEON(const uint8* src_ptr,
+ ptrdiff_t /* src_stride */,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ "vld1.8 {q3}, [%3] \n"
+ "1: \n"
+ "vld1.8 {d0, d1, d2, d3}, [%0]! \n"
+ "subs %2, %2, #12 \n"
+ "vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n"
+ "vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n"
+ "vst1.8 {d4}, [%1]! \n"
+ "vst1.32 {d5[0]}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"(&kShuf38) // %3
+ : "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc"
+ );
+}
+
+// 32x3 -> 12x1
+void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ "vld1.16 {q13}, [%4] \n"
+ "vld1.8 {q14}, [%5] \n"
+ "vld1.8 {q15}, [%6] \n"
+ "add r4, %0, %3, lsl #1 \n"
+ "add %3, %0 \n"
+ "1: \n"
+
+ // d0 = 00 40 01 41 02 42 03 43
+ // d1 = 10 50 11 51 12 52 13 53
+ // d2 = 20 60 21 61 22 62 23 63
+ // d3 = 30 70 31 71 32 72 33 73
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n"
+ "vld4.8 {d4, d5, d6, d7}, [%3]! \n"
+ "vld4.8 {d16, d17, d18, d19}, [r4]! \n"
+ "subs %2, %2, #12 \n"
+
+    // Shuffle the input data around to align the data
+    // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
+ // d0 = 00 10 01 11 02 12 03 13
+ // d1 = 40 50 41 51 42 52 43 53
+ "vtrn.u8 d0, d1 \n"
+ "vtrn.u8 d4, d5 \n"
+ "vtrn.u8 d16, d17 \n"
+
+ // d2 = 20 30 21 31 22 32 23 33
+ // d3 = 60 70 61 71 62 72 63 73
+ "vtrn.u8 d2, d3 \n"
+ "vtrn.u8 d6, d7 \n"
+ "vtrn.u8 d18, d19 \n"
+
+ // d0 = 00+10 01+11 02+12 03+13
+ // d2 = 40+50 41+51 42+52 43+53
+ "vpaddl.u8 q0, q0 \n"
+ "vpaddl.u8 q2, q2 \n"
+ "vpaddl.u8 q8, q8 \n"
+
+ // d3 = 60+70 61+71 62+72 63+73
+ "vpaddl.u8 d3, d3 \n"
+ "vpaddl.u8 d7, d7 \n"
+ "vpaddl.u8 d19, d19 \n"
+
+ // combine source lines
+ "vadd.u16 q0, q2 \n"
+ "vadd.u16 q0, q8 \n"
+ "vadd.u16 d4, d3, d7 \n"
+ "vadd.u16 d4, d19 \n"
+
+ // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
+ // + s[6 + st * 1] + s[7 + st * 1]
+ // + s[6 + st * 2] + s[7 + st * 2]) / 6
+ "vqrdmulh.s16 q2, q2, q13 \n"
+ "vmovn.u16 d4, q2 \n"
+
+ // Shuffle 2,3 reg around so that 2 can be added to the
+ // 0,1 reg and 3 can be added to the 4,5 reg. This
+ // requires expanding from u8 to u16 as the 0,1 and 4,5
+ // registers are already expanded. Then do transposes
+ // to get aligned.
+ // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
+ "vmovl.u8 q1, d2 \n"
+ "vmovl.u8 q3, d6 \n"
+ "vmovl.u8 q9, d18 \n"
+
+ // combine source lines
+ "vadd.u16 q1, q3 \n"
+ "vadd.u16 q1, q9 \n"
+
+ // d4 = xx 20 xx 30 xx 22 xx 32
+ // d5 = xx 21 xx 31 xx 23 xx 33
+ "vtrn.u32 d2, d3 \n"
+
+ // d4 = xx 20 xx 21 xx 22 xx 23
+ // d5 = xx 30 xx 31 xx 32 xx 33
+ "vtrn.u16 d2, d3 \n"
+
+ // 0+1+2, 3+4+5
+ "vadd.u16 q0, q1 \n"
+
+    // Need to divide, but can't downshift as the value
+    // isn't a power of 2. So multiply by 65536 / n
+ // and take the upper 16 bits.
+ "vqrdmulh.s16 q0, q0, q15 \n"
+
+    // Align for table lookup; vtbl requires registers to
+    // be adjacent.
+ "vmov.u8 d2, d4 \n"
+
+ "vtbl.u8 d3, {d0, d1, d2}, d28 \n"
+ "vtbl.u8 d4, {d0, d1, d2}, d29 \n"
+
+ "vst1.8 {d3}, [%1]! \n"
+ "vst1.32 {d4[0]}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(src_stride) // %3
+ : "r"(&kMult38_Div6), // %4
+ "r"(&kShuf38_2), // %5
+ "r"(&kMult38_Div9) // %6
+ : "r4", "q0", "q1", "q2", "q3", "q8", "q9",
+ "q13", "q14", "q15", "memory", "cc"
+ );
+}
+
+// 32x2 -> 12x1
+void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ "vld1.16 {q13}, [%4] \n"
+ "vld1.8 {q14}, [%5] \n"
+ "add %3, %0 \n"
+ "1: \n"
+
+ // d0 = 00 40 01 41 02 42 03 43
+ // d1 = 10 50 11 51 12 52 13 53
+ // d2 = 20 60 21 61 22 62 23 63
+ // d3 = 30 70 31 71 32 72 33 73
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n"
+ "vld4.8 {d4, d5, d6, d7}, [%3]! \n"
+ "subs %2, %2, #12 \n"
+
+    // Shuffle the input data around to align the data
+    // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
+ // d0 = 00 10 01 11 02 12 03 13
+ // d1 = 40 50 41 51 42 52 43 53
+ "vtrn.u8 d0, d1 \n"
+ "vtrn.u8 d4, d5 \n"
+
+ // d2 = 20 30 21 31 22 32 23 33
+ // d3 = 60 70 61 71 62 72 63 73
+ "vtrn.u8 d2, d3 \n"
+ "vtrn.u8 d6, d7 \n"
+
+ // d0 = 00+10 01+11 02+12 03+13
+ // d2 = 40+50 41+51 42+52 43+53
+ "vpaddl.u8 q0, q0 \n"
+ "vpaddl.u8 q2, q2 \n"
+
+ // d3 = 60+70 61+71 62+72 63+73
+ "vpaddl.u8 d3, d3 \n"
+ "vpaddl.u8 d7, d7 \n"
+
+ // combine source lines
+ "vadd.u16 q0, q2 \n"
+ "vadd.u16 d4, d3, d7 \n"
+
+ // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
+ "vqrshrn.u16 d4, q2, #2 \n"
+
+ // Shuffle 2,3 reg around so that 2 can be added to the
+ // 0,1 reg and 3 can be added to the 4,5 reg. This
+ // requires expanding from u8 to u16 as the 0,1 and 4,5
+ // registers are already expanded. Then do transposes
+ // to get aligned.
+ // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
+ "vmovl.u8 q1, d2 \n"
+ "vmovl.u8 q3, d6 \n"
+
+ // combine source lines
+ "vadd.u16 q1, q3 \n"
+
+ // d4 = xx 20 xx 30 xx 22 xx 32
+ // d5 = xx 21 xx 31 xx 23 xx 33
+ "vtrn.u32 d2, d3 \n"
+
+ // d4 = xx 20 xx 21 xx 22 xx 23
+ // d5 = xx 30 xx 31 xx 32 xx 33
+ "vtrn.u16 d2, d3 \n"
+
+ // 0+1+2, 3+4+5
+ "vadd.u16 q0, q1 \n"
+
+    // Need to divide, but can't downshift as the value
+    // isn't a power of 2. So multiply by 65536 / n
+ // and take the upper 16 bits.
+ "vqrdmulh.s16 q0, q0, q13 \n"
+
+    // Align for table lookup; vtbl requires registers to
+    // be adjacent.
+ "vmov.u8 d2, d4 \n"
+
+ "vtbl.u8 d3, {d0, d1, d2}, d28 \n"
+ "vtbl.u8 d4, {d0, d1, d2}, d29 \n"
+
+ "vst1.8 {d3}, [%1]! \n"
+ "vst1.32 {d4[0]}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(src_stride) // %3
+ : "r"(&kMult38_Div6), // %4
+ "r"(&kShuf38_2) // %5
+ : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc"
+ );
+}
+
+// 16x2 -> 16x1
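+// source_y_fraction is the weight of the second row in units of 1/256: 0 copies
+// the first row, 128 averages the two rows, 64 and 192 hit the 75/25 and 25/75
+// fast paths, and other values use the general blend loop.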
+void ScaleFilterRows_NEON(uint8* dst_ptr,
+ const uint8* src_ptr, ptrdiff_t src_stride,
+ int dst_width, int source_y_fraction) {
+ asm volatile (
+ "cmp %4, #0 \n"
+ "beq 100f \n"
+ "add %2, %1 \n"
+ "cmp %4, #64 \n"
+ "beq 75f \n"
+ "cmp %4, #128 \n"
+ "beq 50f \n"
+ "cmp %4, #192 \n"
+ "beq 25f \n"
+
+ "vdup.8 d5, %4 \n"
+ "rsb %4, #256 \n"
+ "vdup.8 d4, %4 \n"
+ // General purpose row blend.
+ "1: \n"
+ "vld1.8 {q0}, [%1]! \n"
+ "vld1.8 {q1}, [%2]! \n"
+ "subs %3, %3, #16 \n"
+ "vmull.u8 q13, d0, d4 \n"
+ "vmull.u8 q14, d1, d4 \n"
+ "vmlal.u8 q13, d2, d5 \n"
+ "vmlal.u8 q14, d3, d5 \n"
+ "vrshrn.u16 d0, q13, #8 \n"
+ "vrshrn.u16 d1, q14, #8 \n"
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 1b \n"
+ "b 99f \n"
+
+ // Blend 25 / 75.
+ "25: \n"
+ "vld1.8 {q0}, [%1]! \n"
+ "vld1.8 {q1}, [%2]! \n"
+ "subs %3, %3, #16 \n"
+ "vrhadd.u8 q0, q1 \n"
+ "vrhadd.u8 q0, q1 \n"
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 25b \n"
+ "b 99f \n"
+
+ // Blend 50 / 50.
+ "50: \n"
+ "vld1.8 {q0}, [%1]! \n"
+ "vld1.8 {q1}, [%2]! \n"
+ "subs %3, %3, #16 \n"
+ "vrhadd.u8 q0, q1 \n"
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 50b \n"
+ "b 99f \n"
+
+ // Blend 75 / 25.
+ "75: \n"
+ "vld1.8 {q1}, [%1]! \n"
+ "vld1.8 {q0}, [%2]! \n"
+ "subs %3, %3, #16 \n"
+ "vrhadd.u8 q0, q1 \n"
+ "vrhadd.u8 q0, q1 \n"
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 75b \n"
+ "b 99f \n"
+
+ // Blend 100 / 0 - Copy row unchanged.
+ "100: \n"
+ "vld1.8 {q0}, [%1]! \n"
+ "subs %3, %3, #16 \n"
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 100b \n"
+
+ "99: \n"
+ "vst1.8 {d1[7]}, [%0] \n"
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "+r"(src_stride), // %2
+ "+r"(dst_width), // %3
+ "+r"(source_y_fraction) // %4
+ :
+ : "q0", "q1", "d4", "d5", "q13", "q14", "memory", "cc"
+ );
+}
+#endif // __ARM_NEON__
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/chromium/third_party/libyuv/source/video_common.cc b/chromium/third_party/libyuv/source/video_common.cc
new file mode 100644
index 00000000000..8294b913ede
--- /dev/null
+++ b/chromium/third_party/libyuv/source/video_common.cc
@@ -0,0 +1,58 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "libyuv/video_common.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#define ARRAY_SIZE(x) (static_cast<int>((sizeof(x) / sizeof(x[0]))))
+
+struct FourCCAliasEntry {
+ uint32 alias;
+ uint32 canonical;
+};
+
+static const FourCCAliasEntry kFourCCAliases[] = {
+ {FOURCC_IYUV, FOURCC_I420},
+ {FOURCC_YU16, FOURCC_I422},
+ {FOURCC_YU24, FOURCC_I444},
+ {FOURCC_YUYV, FOURCC_YUY2},
+ {FOURCC_YUVS, FOURCC_YUY2},
+ {FOURCC_HDYC, FOURCC_UYVY},
+ {FOURCC_2VUY, FOURCC_UYVY},
+ {FOURCC_JPEG, FOURCC_MJPG}, // Note: JPEG has DHT while MJPG does not.
+ {FOURCC_DMB1, FOURCC_MJPG},
+ {FOURCC_BA81, FOURCC_BGGR},
+ {FOURCC_RGB3, FOURCC_RAW},
+ {FOURCC_BGR3, FOURCC_24BG},
+ {FOURCC_CM32, FOURCC_BGRA},
+ {FOURCC_CM24, FOURCC_RAW},
+};
+
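+// For example, both FOURCC_YUYV and FOURCC_YUVS map to FOURCC_YUY2, so callers
+// only need to handle the canonical codes.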
+LIBYUV_API
+uint32 CanonicalFourCC(uint32 fourcc) {
+ for (int i = 0; i < ARRAY_SIZE(kFourCCAliases); ++i) {
+ if (kFourCCAliases[i].alias == fourcc) {
+ return kFourCCAliases[i].canonical;
+ }
+ }
+ // Not an alias, so return it as-is.
+ return fourcc;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
diff --git a/chromium/third_party/libyuv/source/x86inc.asm b/chromium/third_party/libyuv/source/x86inc.asm
new file mode 100644
index 00000000000..cb5c32df3ad
--- /dev/null
+++ b/chromium/third_party/libyuv/source/x86inc.asm
@@ -0,0 +1,1136 @@
+;*****************************************************************************
+;* x86inc.asm: x264asm abstraction layer
+;*****************************************************************************
+;* Copyright (C) 2005-2012 x264 project
+;*
+;* Authors: Loren Merritt <lorenm@u.washington.edu>
+;* Anton Mitrofanov <BugMaster@narod.ru>
+;* Jason Garrett-Glaser <darkshikari@gmail.com>
+;* Henrik Gramner <hengar-6@student.ltu.se>
+;*
+;* Permission to use, copy, modify, and/or distribute this software for any
+;* purpose with or without fee is hereby granted, provided that the above
+;* copyright notice and this permission notice appear in all copies.
+;*
+;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+;*****************************************************************************
+
+; This is a header file for the x264ASM assembly language, which uses
+; NASM/YASM syntax combined with a large number of macros to provide easy
+; abstraction between different calling conventions (x86_32, win64, linux64).
+; It also has various other useful features to simplify writing the kind of
+; DSP functions that are most often used in x264.
+
+; Unlike the rest of x264, this file is available under an ISC license, as it
+; has significant usefulness outside of x264 and we want it to be available
+; to the largest audience possible. Of course, if you modify it for your own
+; purposes to add a new feature, we strongly encourage contributing a patch
+; as this feature might be useful for others as well. Send patches or ideas
+; to x264-devel@videolan.org .
+
+; Local changes for libyuv:
+; remove %define program_name and references in labels
+; rename cpus to uppercase
+
+%define WIN64 0
+%define UNIX64 0
+%if ARCH_X86_64
+ %ifidn __OUTPUT_FORMAT__,win32
+ %define WIN64 1
+ %elifidn __OUTPUT_FORMAT__,win64
+ %define WIN64 1
+ %else
+ %define UNIX64 1
+ %endif
+%endif
+
+%ifdef PREFIX
+ %define mangle(x) _ %+ x
+%else
+ %define mangle(x) x
+%endif
+
+; Name of the .rodata section.
+; Kludge: Something on OS X fails to align .rodata even given an align attribute,
+; so use a different read-only section.
+%macro SECTION_RODATA 0-1 16
+ %ifidn __OUTPUT_FORMAT__,macho64
+ SECTION .text align=%1
+ %elifidn __OUTPUT_FORMAT__,macho
+ SECTION .text align=%1
+ fakegot:
+ %elifidn __OUTPUT_FORMAT__,aout
+ section .text
+ %else
+ SECTION .rodata align=%1
+ %endif
+%endmacro
+
+; aout does not support align=
+%macro SECTION_TEXT 0-1 16
+ %ifidn __OUTPUT_FORMAT__,aout
+ SECTION .text
+ %else
+ SECTION .text align=%1
+ %endif
+%endmacro
+
+%if WIN64
+ %define PIC
+%elif ARCH_X86_64 == 0
+; x86_32 doesn't require PIC.
+; Some distros prefer shared objects to be PIC, but nothing breaks if
+; the code contains a few textrels, so we'll skip that complexity.
+ %undef PIC
+%endif
+%ifdef PIC
+ default rel
+%endif
+
+; Always use long nops (reduces 0x90 spam in disassembly on x86_32)
+CPU amdnop
+
+; Macros to eliminate most code duplication between x86_32 and x86_64:
+; Currently this works only for leaf functions which load all their arguments
+; into registers at the start, and make no other use of the stack. Luckily that
+; covers most of x264's asm.
+
+; PROLOGUE:
+; %1 = number of arguments. loads them from stack if needed.
+; %2 = number of registers used. pushes callee-saved regs if needed.
+; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed.
+; %4 = list of names to define to registers
+; PROLOGUE can also be invoked by adding the same options to cglobal
+
+; e.g.
+; cglobal foo, 2,3,0, dst, src, tmp
+; declares a function (foo), taking two args (dst and src) and one local variable (tmp)
+
+; TODO Some functions can use some args directly from the stack. If they're the
+; last args then you can just not declare them, but if they're in the middle
+; we need a more flexible macro.
+
+; RET:
+; Pops anything that was pushed by PROLOGUE, and returns.
+
+; REP_RET:
+; Same, but if it doesn't pop anything it becomes a 2-byte ret, for athlons
+; which are slow when a normal ret follows a branch.
+
+; registers:
+; rN and rNq are the native-size register holding function argument N
+; rNd, rNw, rNb are dword, word, and byte size
+; rNh is the high 8 bits of the word size
+; rNm is the original location of arg N (a register or on the stack), dword
+; rNmp is native size
+
+%macro DECLARE_REG 2-3
+ %define r%1q %2
+ %define r%1d %2d
+ %define r%1w %2w
+ %define r%1b %2b
+ %define r%1h %2h
+ %if %0 == 2
+ %define r%1m %2d
+ %define r%1mp %2
+ %elif ARCH_X86_64 ; memory
+ %define r%1m [rsp + stack_offset + %3]
+ %define r%1mp qword r %+ %1m
+ %else
+ %define r%1m [esp + stack_offset + %3]
+ %define r%1mp dword r %+ %1m
+ %endif
+ %define r%1 %2
+%endmacro
+
+%macro DECLARE_REG_SIZE 3
+ %define r%1q r%1
+ %define e%1q r%1
+ %define r%1d e%1
+ %define e%1d e%1
+ %define r%1w %1
+ %define e%1w %1
+ %define r%1h %3
+ %define e%1h %3
+ %define r%1b %2
+ %define e%1b %2
+%if ARCH_X86_64 == 0
+ %define r%1 e%1
+%endif
+%endmacro
+
+DECLARE_REG_SIZE ax, al, ah
+DECLARE_REG_SIZE bx, bl, bh
+DECLARE_REG_SIZE cx, cl, ch
+DECLARE_REG_SIZE dx, dl, dh
+DECLARE_REG_SIZE si, sil, null
+DECLARE_REG_SIZE di, dil, null
+DECLARE_REG_SIZE bp, bpl, null
+
+; t# defines for when per-arch register allocation is more complex than just function arguments
+
+%macro DECLARE_REG_TMP 1-*
+ %assign %%i 0
+ %rep %0
+ CAT_XDEFINE t, %%i, r%1
+ %assign %%i %%i+1
+ %rotate 1
+ %endrep
+%endmacro
+
+%macro DECLARE_REG_TMP_SIZE 0-*
+ %rep %0
+ %define t%1q t%1 %+ q
+ %define t%1d t%1 %+ d
+ %define t%1w t%1 %+ w
+ %define t%1h t%1 %+ h
+ %define t%1b t%1 %+ b
+ %rotate 1
+ %endrep
+%endmacro
+
+DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
+
+%if ARCH_X86_64
+ %define gprsize 8
+%else
+ %define gprsize 4
+%endif
+
+%macro PUSH 1
+ push %1
+ %assign stack_offset stack_offset+gprsize
+%endmacro
+
+%macro POP 1
+ pop %1
+ %assign stack_offset stack_offset-gprsize
+%endmacro
+
+%macro PUSH_IF_USED 1-*
+ %rep %0
+ %if %1 < regs_used
+ PUSH r%1
+ %endif
+ %rotate 1
+ %endrep
+%endmacro
+
+%macro POP_IF_USED 1-*
+ %rep %0
+ %if %1 < regs_used
+ pop r%1
+ %endif
+ %rotate 1
+ %endrep
+%endmacro
+
+%macro LOAD_IF_USED 1-*
+ %rep %0
+ %if %1 < num_args
+ mov r%1, r %+ %1 %+ mp
+ %endif
+ %rotate 1
+ %endrep
+%endmacro
+
+%macro SUB 2
+ sub %1, %2
+ %ifidn %1, rsp
+ %assign stack_offset stack_offset+(%2)
+ %endif
+%endmacro
+
+%macro ADD 2
+ add %1, %2
+ %ifidn %1, rsp
+ %assign stack_offset stack_offset-(%2)
+ %endif
+%endmacro
+
+%macro movifnidn 2
+ %ifnidn %1, %2
+ mov %1, %2
+ %endif
+%endmacro
+
+%macro movsxdifnidn 2
+ %ifnidn %1, %2
+ movsxd %1, %2
+ %endif
+%endmacro
+
+%macro ASSERT 1
+ %if (%1) == 0
+ %error assert failed
+ %endif
+%endmacro
+
+%macro DEFINE_ARGS 0-*
+ %ifdef n_arg_names
+ %assign %%i 0
+ %rep n_arg_names
+ CAT_UNDEF arg_name %+ %%i, q
+ CAT_UNDEF arg_name %+ %%i, d
+ CAT_UNDEF arg_name %+ %%i, w
+ CAT_UNDEF arg_name %+ %%i, h
+ CAT_UNDEF arg_name %+ %%i, b
+ CAT_UNDEF arg_name %+ %%i, m
+ CAT_UNDEF arg_name %+ %%i, mp
+ CAT_UNDEF arg_name, %%i
+ %assign %%i %%i+1
+ %endrep
+ %endif
+
+ %xdefine %%stack_offset stack_offset
+ %undef stack_offset ; so that the current value of stack_offset doesn't get baked in by xdefine
+ %assign %%i 0
+ %rep %0
+ %xdefine %1q r %+ %%i %+ q
+ %xdefine %1d r %+ %%i %+ d
+ %xdefine %1w r %+ %%i %+ w
+ %xdefine %1h r %+ %%i %+ h
+ %xdefine %1b r %+ %%i %+ b
+ %xdefine %1m r %+ %%i %+ m
+ %xdefine %1mp r %+ %%i %+ mp
+ CAT_XDEFINE arg_name, %%i, %1
+ %assign %%i %%i+1
+ %rotate 1
+ %endrep
+ %xdefine stack_offset %%stack_offset
+ %assign n_arg_names %0
+%endmacro
+
+%if WIN64 ; Windows x64 ;=================================================
+
+DECLARE_REG 0, rcx
+DECLARE_REG 1, rdx
+DECLARE_REG 2, R8
+DECLARE_REG 3, R9
+DECLARE_REG 4, R10, 40
+DECLARE_REG 5, R11, 48
+DECLARE_REG 6, rax, 56
+DECLARE_REG 7, rdi, 64
+DECLARE_REG 8, rsi, 72
+DECLARE_REG 9, rbx, 80
+DECLARE_REG 10, rbp, 88
+DECLARE_REG 11, R12, 96
+DECLARE_REG 12, R13, 104
+DECLARE_REG 13, R14, 112
+DECLARE_REG 14, R15, 120
+
+%macro PROLOGUE 2-4+ 0 ; #args, #regs, #xmm_regs, arg_names...
+ %assign num_args %1
+ %assign regs_used %2
+ ASSERT regs_used >= num_args
+ ASSERT regs_used <= 15
+ PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14
+ %if mmsize == 8
+ %assign xmm_regs_used 0
+ %else
+ WIN64_SPILL_XMM %3
+ %endif
+ LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14
+ DEFINE_ARGS %4
+%endmacro
+
+%macro WIN64_SPILL_XMM 1
+ %assign xmm_regs_used %1
+ ASSERT xmm_regs_used <= 16
+ %if xmm_regs_used > 6
+ SUB rsp, (xmm_regs_used-6)*16+16
+ %assign %%i xmm_regs_used
+ %rep (xmm_regs_used-6)
+ %assign %%i %%i-1
+ movdqa [rsp + (%%i-6)*16+(~stack_offset&8)], xmm %+ %%i
+ %endrep
+ %endif
+%endmacro
+
+%macro WIN64_RESTORE_XMM_INTERNAL 1
+ %if xmm_regs_used > 6
+ %assign %%i xmm_regs_used
+ %rep (xmm_regs_used-6)
+ %assign %%i %%i-1
+ movdqa xmm %+ %%i, [%1 + (%%i-6)*16+(~stack_offset&8)]
+ %endrep
+ add %1, (xmm_regs_used-6)*16+16
+ %endif
+%endmacro
+
+%macro WIN64_RESTORE_XMM 1
+ WIN64_RESTORE_XMM_INTERNAL %1
+ %assign stack_offset stack_offset-(xmm_regs_used-6)*16+16
+ %assign xmm_regs_used 0
+%endmacro
+
+%define has_epilogue regs_used > 7 || xmm_regs_used > 6 || mmsize == 32
+
+%macro RET 0
+ WIN64_RESTORE_XMM_INTERNAL rsp
+ POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7
+%if mmsize == 32
+ vzeroupper
+%endif
+ ret
+%endmacro
+
+%elif ARCH_X86_64 ; *nix x64 ;=============================================
+
+DECLARE_REG 0, rdi
+DECLARE_REG 1, rsi
+DECLARE_REG 2, rdx
+DECLARE_REG 3, rcx
+DECLARE_REG 4, R8
+DECLARE_REG 5, R9
+DECLARE_REG 6, rax, 8
+DECLARE_REG 7, R10, 16
+DECLARE_REG 8, R11, 24
+DECLARE_REG 9, rbx, 32
+DECLARE_REG 10, rbp, 40
+DECLARE_REG 11, R12, 48
+DECLARE_REG 12, R13, 56
+DECLARE_REG 13, R14, 64
+DECLARE_REG 14, R15, 72
+
+%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
+ %assign num_args %1
+ %assign regs_used %2
+ ASSERT regs_used >= num_args
+ ASSERT regs_used <= 15
+ PUSH_IF_USED 9, 10, 11, 12, 13, 14
+ LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14
+ DEFINE_ARGS %4
+%endmacro
+
+%define has_epilogue regs_used > 9 || mmsize == 32
+
+%macro RET 0
+ POP_IF_USED 14, 13, 12, 11, 10, 9
+%if mmsize == 32
+ vzeroupper
+%endif
+ ret
+%endmacro
+
+%else ; X86_32 ;==============================================================
+
+DECLARE_REG 0, eax, 4
+DECLARE_REG 1, ecx, 8
+DECLARE_REG 2, edx, 12
+DECLARE_REG 3, ebx, 16
+DECLARE_REG 4, esi, 20
+DECLARE_REG 5, edi, 24
+DECLARE_REG 6, ebp, 28
+%define rsp esp
+
+%macro DECLARE_ARG 1-*
+ %rep %0
+ %define r%1m [esp + stack_offset + 4*%1 + 4]
+ %define r%1mp dword r%1m
+ %rotate 1
+ %endrep
+%endmacro
+
+DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
+
+%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
+ %assign num_args %1
+ %assign regs_used %2
+ %if regs_used > 7
+ %assign regs_used 7
+ %endif
+ ASSERT regs_used >= num_args
+ PUSH_IF_USED 3, 4, 5, 6
+ LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6
+ DEFINE_ARGS %4
+%endmacro
+
+%define has_epilogue regs_used > 3 || mmsize == 32
+
+%macro RET 0
+ POP_IF_USED 6, 5, 4, 3
+%if mmsize == 32
+ vzeroupper
+%endif
+ ret
+%endmacro
+
+%endif ;======================================================================
+
+%if WIN64 == 0
+%macro WIN64_SPILL_XMM 1
+%endmacro
+%macro WIN64_RESTORE_XMM 1
+%endmacro
+%endif
+
+%macro REP_RET 0
+ %if has_epilogue
+ RET
+ %else
+ rep ret
+ %endif
+%endmacro
+
+%macro TAIL_CALL 2 ; callee, is_nonadjacent
+ %if has_epilogue
+ call %1
+ RET
+ %elif %2
+ jmp %1
+ %endif
+%endmacro
+
+;=============================================================================
+; arch-independent part
+;=============================================================================
+
+%assign function_align 16
+
+; Begin a function.
+; Applies any symbol mangling needed for C linkage, and sets up a define such that
+; subsequent uses of the function name automatically refer to the mangled version.
+; Appends cpuflags to the function name if cpuflags has been specified.
+%macro cglobal 1-2+ ; name, [PROLOGUE args]
+%if %0 == 1
+ cglobal_internal %1 %+ SUFFIX
+%else
+ cglobal_internal %1 %+ SUFFIX, %2
+%endif
+%endmacro
+%macro cglobal_internal 1-2+
+ %ifndef cglobaled_%1
+ %xdefine %1 mangle(%1)
+ %xdefine %1.skip_prologue %1 %+ .skip_prologue
+ CAT_XDEFINE cglobaled_, %1, 1
+ %endif
+ %xdefine current_function %1
+ %ifidn __OUTPUT_FORMAT__,elf
+ global %1:function hidden
+ %else
+ global %1
+ %endif
+ align function_align
+ %1:
+ RESET_MM_PERMUTATION ; not really needed, but makes disassembly somewhat nicer
+ %assign stack_offset 0
+ %if %0 > 1
+ PROLOGUE %2
+ %endif
+%endmacro
+
+%macro cextern 1
+ %xdefine %1 mangle(%1)
+ CAT_XDEFINE cglobaled_, %1, 1
+ extern %1
+%endmacro
+
+; like cextern, but without the prefix
+%macro cextern_naked 1
+ %xdefine %1 mangle(%1)
+ CAT_XDEFINE cglobaled_, %1, 1
+ extern %1
+%endmacro
+
+%macro const 2+
+ %xdefine %1 mangle(%1)
+ global %1
+ %1: %2
+%endmacro
+
+; This is needed for ELF, otherwise the GNU linker assumes the stack is
+; executable by default.
+%ifidn __OUTPUT_FORMAT__,elf
+SECTION .note.GNU-stack noalloc noexec nowrite progbits
+%endif
+%ifidn __OUTPUT_FORMAT__,elf32
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
+%ifidn __OUTPUT_FORMAT__,elf64
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
+
+; cpuflags
+
+%assign cpuflags_MMX (1<<0)
+%assign cpuflags_MMX2 (1<<1) | cpuflags_MMX
+%assign cpuflags_3dnow (1<<2) | cpuflags_MMX
+%assign cpuflags_3dnow2 (1<<3) | cpuflags_3dnow
+%assign cpuflags_SSE (1<<4) | cpuflags_MMX2
+%assign cpuflags_SSE2 (1<<5) | cpuflags_SSE
+%assign cpuflags_SSE2slow (1<<6) | cpuflags_SSE2
+%assign cpuflags_SSE3 (1<<7) | cpuflags_SSE2
+%assign cpuflags_SSSE3 (1<<8) | cpuflags_SSE3
+%assign cpuflags_SSE4 (1<<9) | cpuflags_SSSE3
+%assign cpuflags_SSE42 (1<<10)| cpuflags_SSE4
+%assign cpuflags_AVX (1<<11)| cpuflags_SSE42
+%assign cpuflags_xop (1<<12)| cpuflags_AVX
+%assign cpuflags_fma4 (1<<13)| cpuflags_AVX
+%assign cpuflags_AVX2 (1<<14)| cpuflags_AVX
+%assign cpuflags_fma3 (1<<15)| cpuflags_AVX
+
+%assign cpuflags_cache32 (1<<16)
+%assign cpuflags_cache64 (1<<17)
+%assign cpuflags_slowctz (1<<18)
+%assign cpuflags_lzcnt (1<<19)
+%assign cpuflags_misalign (1<<20)
+%assign cpuflags_aligned (1<<21) ; not a cpu feature, but a function variant
+%assign cpuflags_atom (1<<22)
+%assign cpuflags_bmi1 (1<<23)
+%assign cpuflags_bmi2 (1<<24)|cpuflags_bmi1
+%assign cpuflags_tbm (1<<25)|cpuflags_bmi1
+
+%define cpuflag(x) ((cpuflags & (cpuflags_ %+ x)) == (cpuflags_ %+ x))
+%define notcpuflag(x) ((cpuflags & (cpuflags_ %+ x)) != (cpuflags_ %+ x))
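+; e.g. after "INIT_XMM SSE2", cpuflag(SSE) evaluates true because cpuflags_SSE2
+; includes all of the SSE bits.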
+
+; Takes up to 2 cpuflags from the above list.
+; All subsequent functions (up to the next INIT_CPUFLAGS) are built for the specified cpu.
+; You shouldn't need to invoke this macro directly, it's a subroutine for INIT_MMX &co.
+%macro INIT_CPUFLAGS 0-2
+ %if %0 >= 1
+ %xdefine cpuname %1
+ %assign cpuflags cpuflags_%1
+ %if %0 >= 2
+ %xdefine cpuname %1_%2
+ %assign cpuflags cpuflags | cpuflags_%2
+ %endif
+ %xdefine SUFFIX _ %+ cpuname
+ %if cpuflag(AVX)
+ %assign AVX_enabled 1
+ %endif
+ %if mmsize == 16 && notcpuflag(SSE2)
+ %define mova movaps
+ %define movu movups
+ %define movnta movntps
+ %endif
+ %if cpuflag(aligned)
+ %define movu mova
+ %elifidn %1, SSE3
+ %define movu lddqu
+ %endif
+ %else
+ %xdefine SUFFIX
+ %undef cpuname
+ %undef cpuflags
+ %endif
+%endmacro
+
+; merge MMX and SSE*
+
+%macro CAT_XDEFINE 3
+ %xdefine %1%2 %3
+%endmacro
+
+%macro CAT_UNDEF 2
+ %undef %1%2
+%endmacro
+
+%macro INIT_MMX 0-1+
+ %assign AVX_enabled 0
+ %define RESET_MM_PERMUTATION INIT_MMX %1
+ %define mmsize 8
+ %define num_mmregs 8
+ %define mova movq
+ %define movu movq
+ %define movh movd
+ %define movnta movntq
+ %assign %%i 0
+ %rep 8
+ CAT_XDEFINE m, %%i, mm %+ %%i
+ CAT_XDEFINE nmm, %%i, %%i
+ %assign %%i %%i+1
+ %endrep
+ %rep 8
+ CAT_UNDEF m, %%i
+ CAT_UNDEF nmm, %%i
+ %assign %%i %%i+1
+ %endrep
+ INIT_CPUFLAGS %1
+%endmacro
+
+%macro INIT_XMM 0-1+
+ %assign AVX_enabled 0
+ %define RESET_MM_PERMUTATION INIT_XMM %1
+ %define mmsize 16
+ %define num_mmregs 8
+ %if ARCH_X86_64
+ %define num_mmregs 16
+ %endif
+ %define mova movdqa
+ %define movu movdqu
+ %define movh movq
+ %define movnta movntdq
+ %assign %%i 0
+ %rep num_mmregs
+ CAT_XDEFINE m, %%i, xmm %+ %%i
+ CAT_XDEFINE nxmm, %%i, %%i
+ %assign %%i %%i+1
+ %endrep
+ INIT_CPUFLAGS %1
+%endmacro
+
+%macro INIT_YMM 0-1+
+ %assign AVX_enabled 1
+ %define RESET_MM_PERMUTATION INIT_YMM %1
+ %define mmsize 32
+ %define num_mmregs 8
+ %if ARCH_X86_64
+ %define num_mmregs 16
+ %endif
+ %define mova vmovaps
+ %define movu vmovups
+ %undef movh
+ %define movnta vmovntps
+ %assign %%i 0
+ %rep num_mmregs
+ CAT_XDEFINE m, %%i, ymm %+ %%i
+ CAT_XDEFINE nymm, %%i, %%i
+ %assign %%i %%i+1
+ %endrep
+ INIT_CPUFLAGS %1
+%endmacro
+
+INIT_XMM
+
+; I often want to use macros that permute their arguments. e.g. there's no
+; efficient way to implement butterfly or transpose or dct without swapping some
+; arguments.
+;
+; I would like to not have to manually keep track of the permutations:
+; If I insert a permutation in the middle of a function, it should automatically
+; change everything that follows. For more complex macros I may also have multiple
+; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations.
+;
+; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that
+; permutes its arguments. It's equivalent to exchanging the contents of the
+; registers, except that this way you exchange the register names instead, so it
+; doesn't cost any cycles.
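+; e.g. "SWAP 0, 1" makes later uses of m0 refer to the register previously named
+; m1 (and vice versa) without emitting any instructions.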
+
+%macro PERMUTE 2-* ; takes a list of pairs to swap
+%rep %0/2
+ %xdefine tmp%2 m%2
+ %xdefine ntmp%2 nm%2
+ %rotate 2
+%endrep
+%rep %0/2
+ %xdefine m%1 tmp%2
+ %xdefine nm%1 ntmp%2
+ %undef tmp%2
+ %undef ntmp%2
+ %rotate 2
+%endrep
+%endmacro
+
+%macro SWAP 2-* ; swaps a single chain (sometimes more concise than pairs)
+%rep %0-1
+%ifdef m%1
+ %xdefine tmp m%1
+ %xdefine m%1 m%2
+ %xdefine m%2 tmp
+ CAT_XDEFINE n, m%1, %1
+ CAT_XDEFINE n, m%2, %2
+%else
+ ; If we were called as "SWAP m0,m1" rather than "SWAP 0,1" infer the original numbers here.
+ ; Be careful using this mode in nested macros though, as in some cases there may be
+ ; other copies of m# that have already been dereferenced and don't get updated correctly.
+ %xdefine %%n1 n %+ %1
+ %xdefine %%n2 n %+ %2
+ %xdefine tmp m %+ %%n1
+ CAT_XDEFINE m, %%n1, m %+ %%n2
+ CAT_XDEFINE m, %%n2, tmp
+ CAT_XDEFINE n, m %+ %%n1, %%n1
+ CAT_XDEFINE n, m %+ %%n2, %%n2
+%endif
+ %undef tmp
+ %rotate 1
+%endrep
+%endmacro
+
+; If SAVE_MM_PERMUTATION is placed at the end of a function, then any later
+; calls to that function will automatically load the permutation, so values can
+; be returned in mmregs.
+%macro SAVE_MM_PERMUTATION 0-1
+ %if %0
+ %xdefine %%f %1_m
+ %else
+ %xdefine %%f current_function %+ _m
+ %endif
+ %assign %%i 0
+ %rep num_mmregs
+ CAT_XDEFINE %%f, %%i, m %+ %%i
+ %assign %%i %%i+1
+ %endrep
+%endmacro
+
+%macro LOAD_MM_PERMUTATION 1 ; name to load from
+ %ifdef %1_m0
+ %assign %%i 0
+ %rep num_mmregs
+ CAT_XDEFINE m, %%i, %1_m %+ %%i
+ CAT_XDEFINE n, m %+ %%i, %%i
+ %assign %%i %%i+1
+ %endrep
+ %endif
+%endmacro
+
+; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't
+%macro call 1
+ call_internal %1, %1 %+ SUFFIX
+%endmacro
+%macro call_internal 2
+ %xdefine %%i %1
+ %ifndef cglobaled_%1
+ %ifdef cglobaled_%2
+ %xdefine %%i %2
+ %endif
+ %endif
+ call %%i
+ LOAD_MM_PERMUTATION %%i
+%endmacro
+
+; Substitutions that reduce instruction size but are functionally equivalent
+%macro add 2
+ %ifnum %2
+ %if %2==128
+ sub %1, -128
+ %else
+ add %1, %2
+ %endif
+ %else
+ add %1, %2
+ %endif
+%endmacro
+
+%macro sub 2
+ %ifnum %2
+ %if %2==128
+ add %1, -128
+ %else
+ sub %1, %2
+ %endif
+ %else
+ sub %1, %2
+ %endif
+%endmacro
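+; (x86 sign-extends 8-bit immediates, so -128 fits in an imm8 encoding while
+; +128 needs a full 32-bit immediate; swapping add/sub for 128 saves code size.)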
+
+;=============================================================================
+; AVX abstraction layer
+;=============================================================================
+
+%assign i 0
+%rep 16
+ %if i < 8
+ CAT_XDEFINE sizeofmm, i, 8
+ %endif
+ CAT_XDEFINE sizeofxmm, i, 16
+ CAT_XDEFINE sizeofymm, i, 32
+%assign i i+1
+%endrep
+%undef i
+
+%macro CHECK_AVX_INSTR_EMU 3-*
+ %xdefine %%opcode %1
+ %xdefine %%dst %2
+ %rep %0-2
+ %ifidn %%dst, %3
+ %error non-AVX emulation of ``%%opcode'' is not supported
+ %endif
+ %rotate 1
+ %endrep
+%endmacro
+
+;%1 == instruction
+;%2 == 1 if float, 0 if int
+;%3 == 1 if 4-operand (xmm, xmm, xmm, imm), 0 if 2- or 3-operand (xmm, xmm, xmm)
+;%4 == number of operands given
+;%5+: operands
+%macro RUN_AVX_INSTR 6-7+
+ %ifid %6
+ %define %%sizeofreg sizeof%6
+ %elifid %5
+ %define %%sizeofreg sizeof%5
+ %else
+ %define %%sizeofreg mmsize
+ %endif
+ %if %%sizeofreg==32
+ %if %4>=3
+ v%1 %5, %6, %7
+ %else
+ v%1 %5, %6
+ %endif
+ %else
+ %if %%sizeofreg==8
+ %define %%regmov movq
+ %elif %2
+ %define %%regmov movaps
+ %else
+ %define %%regmov movdqa
+ %endif
+
+ %if %4>=3+%3
+ %ifnidn %5, %6
+ %if AVX_enabled && %%sizeofreg==16
+ v%1 %5, %6, %7
+ %else
+ CHECK_AVX_INSTR_EMU {%1 %5, %6, %7}, %5, %7
+ %%regmov %5, %6
+ %1 %5, %7
+ %endif
+ %else
+ %1 %5, %7
+ %endif
+ %elif %4>=3
+ %1 %5, %6, %7
+ %else
+ %1 %5, %6
+ %endif
+ %endif
+%endmacro
+
+; 3arg AVX ops with a memory arg can only have it in src2,
+; whereas SSE emulation of 3arg prefers to have it in src1 (i.e. the mov).
+; So, if the op is symmetric and the wrong one is memory, swap them.
+%macro RUN_AVX_INSTR1 8
+ %assign %%swap 0
+ %if AVX_enabled
+ %ifnid %6
+ %assign %%swap 1
+ %endif
+ %elifnidn %5, %6
+ %ifnid %7
+ %assign %%swap 1
+ %endif
+ %endif
+ %if %%swap && %3 == 0 && %8 == 1
+ RUN_AVX_INSTR %1, %2, %3, %4, %5, %7, %6
+ %else
+ RUN_AVX_INSTR %1, %2, %3, %4, %5, %6, %7
+ %endif
+%endmacro
+
+;%1 == instruction
+;%2 == 1 if float, 0 if int
+;%3 == 1 if 4-operand (xmm, xmm, xmm, imm), 0 if 2- or 3-operand (xmm, xmm, xmm)
+;%4 == 1 if symmetric (i.e. doesn't matter which src arg is which), 0 if not
+%macro AVX_INSTR 4
+ %macro %1 2-9 fnord, fnord, fnord, %1, %2, %3, %4
+ %ifidn %3, fnord
+ RUN_AVX_INSTR %6, %7, %8, 2, %1, %2
+ %elifidn %4, fnord
+ RUN_AVX_INSTR1 %6, %7, %8, 3, %1, %2, %3, %9
+ %elifidn %5, fnord
+ RUN_AVX_INSTR %6, %7, %8, 4, %1, %2, %3, %4
+ %else
+ RUN_AVX_INSTR %6, %7, %8, 5, %1, %2, %3, %4, %5
+ %endif
+ %endmacro
+%endmacro
+
+AVX_INSTR addpd, 1, 0, 1
+AVX_INSTR addps, 1, 0, 1
+AVX_INSTR addsd, 1, 0, 1
+AVX_INSTR addss, 1, 0, 1
+AVX_INSTR addsubpd, 1, 0, 0
+AVX_INSTR addsubps, 1, 0, 0
+AVX_INSTR andpd, 1, 0, 1
+AVX_INSTR andps, 1, 0, 1
+AVX_INSTR andnpd, 1, 0, 0
+AVX_INSTR andnps, 1, 0, 0
+AVX_INSTR blendpd, 1, 0, 0
+AVX_INSTR blendps, 1, 0, 0
+AVX_INSTR blendvpd, 1, 0, 0
+AVX_INSTR blendvps, 1, 0, 0
+AVX_INSTR cmppd, 1, 0, 0
+AVX_INSTR cmpps, 1, 0, 0
+AVX_INSTR cmpsd, 1, 0, 0
+AVX_INSTR cmpss, 1, 0, 0
+AVX_INSTR cvtdq2ps, 1, 0, 0
+AVX_INSTR cvtps2dq, 1, 0, 0
+AVX_INSTR divpd, 1, 0, 0
+AVX_INSTR divps, 1, 0, 0
+AVX_INSTR divsd, 1, 0, 0
+AVX_INSTR divss, 1, 0, 0
+AVX_INSTR dppd, 1, 1, 0
+AVX_INSTR dpps, 1, 1, 0
+AVX_INSTR haddpd, 1, 0, 0
+AVX_INSTR haddps, 1, 0, 0
+AVX_INSTR hsubpd, 1, 0, 0
+AVX_INSTR hsubps, 1, 0, 0
+AVX_INSTR maxpd, 1, 0, 1
+AVX_INSTR maxps, 1, 0, 1
+AVX_INSTR maxsd, 1, 0, 1
+AVX_INSTR maxss, 1, 0, 1
+AVX_INSTR minpd, 1, 0, 1
+AVX_INSTR minps, 1, 0, 1
+AVX_INSTR minsd, 1, 0, 1
+AVX_INSTR minss, 1, 0, 1
+AVX_INSTR movhlps, 1, 0, 0
+AVX_INSTR movlhps, 1, 0, 0
+AVX_INSTR movsd, 1, 0, 0
+AVX_INSTR movss, 1, 0, 0
+AVX_INSTR mpsadbw, 0, 1, 0
+AVX_INSTR mulpd, 1, 0, 1
+AVX_INSTR mulps, 1, 0, 1
+AVX_INSTR mulsd, 1, 0, 1
+AVX_INSTR mulss, 1, 0, 1
+AVX_INSTR orpd, 1, 0, 1
+AVX_INSTR orps, 1, 0, 1
+AVX_INSTR pabsb, 0, 0, 0
+AVX_INSTR pabsw, 0, 0, 0
+AVX_INSTR pabsd, 0, 0, 0
+AVX_INSTR packsswb, 0, 0, 0
+AVX_INSTR packssdw, 0, 0, 0
+AVX_INSTR packuswb, 0, 0, 0
+AVX_INSTR packusdw, 0, 0, 0
+AVX_INSTR paddb, 0, 0, 1
+AVX_INSTR paddw, 0, 0, 1
+AVX_INSTR paddd, 0, 0, 1
+AVX_INSTR paddq, 0, 0, 1
+AVX_INSTR paddsb, 0, 0, 1
+AVX_INSTR paddsw, 0, 0, 1
+AVX_INSTR paddusb, 0, 0, 1
+AVX_INSTR paddusw, 0, 0, 1
+AVX_INSTR palignr, 0, 1, 0
+AVX_INSTR pand, 0, 0, 1
+AVX_INSTR pandn, 0, 0, 0
+AVX_INSTR pavgb, 0, 0, 1
+AVX_INSTR pavgw, 0, 0, 1
+AVX_INSTR pblendvb, 0, 0, 0
+AVX_INSTR pblendw, 0, 1, 0
+AVX_INSTR pcmpestri, 0, 0, 0
+AVX_INSTR pcmpestrm, 0, 0, 0
+AVX_INSTR pcmpistri, 0, 0, 0
+AVX_INSTR pcmpistrm, 0, 0, 0
+AVX_INSTR pcmpeqb, 0, 0, 1
+AVX_INSTR pcmpeqw, 0, 0, 1
+AVX_INSTR pcmpeqd, 0, 0, 1
+AVX_INSTR pcmpeqq, 0, 0, 1
+AVX_INSTR pcmpgtb, 0, 0, 0
+AVX_INSTR pcmpgtw, 0, 0, 0
+AVX_INSTR pcmpgtd, 0, 0, 0
+AVX_INSTR pcmpgtq, 0, 0, 0
+AVX_INSTR phaddw, 0, 0, 0
+AVX_INSTR phaddd, 0, 0, 0
+AVX_INSTR phaddsw, 0, 0, 0
+AVX_INSTR phsubw, 0, 0, 0
+AVX_INSTR phsubd, 0, 0, 0
+AVX_INSTR phsubsw, 0, 0, 0
+AVX_INSTR pmaddwd, 0, 0, 1
+AVX_INSTR pmaddubsw, 0, 0, 0
+AVX_INSTR pmaxsb, 0, 0, 1
+AVX_INSTR pmaxsw, 0, 0, 1
+AVX_INSTR pmaxsd, 0, 0, 1
+AVX_INSTR pmaxub, 0, 0, 1
+AVX_INSTR pmaxuw, 0, 0, 1
+AVX_INSTR pmaxud, 0, 0, 1
+AVX_INSTR pminsb, 0, 0, 1
+AVX_INSTR pminsw, 0, 0, 1
+AVX_INSTR pminsd, 0, 0, 1
+AVX_INSTR pminub, 0, 0, 1
+AVX_INSTR pminuw, 0, 0, 1
+AVX_INSTR pminud, 0, 0, 1
+AVX_INSTR pmovmskb, 0, 0, 0
+AVX_INSTR pmulhuw, 0, 0, 1
+AVX_INSTR pmulhrsw, 0, 0, 1
+AVX_INSTR pmulhw, 0, 0, 1
+AVX_INSTR pmullw, 0, 0, 1
+AVX_INSTR pmulld, 0, 0, 1
+AVX_INSTR pmuludq, 0, 0, 1
+AVX_INSTR pmuldq, 0, 0, 1
+AVX_INSTR por, 0, 0, 1
+AVX_INSTR psadbw, 0, 0, 1
+AVX_INSTR pshufb, 0, 0, 0
+AVX_INSTR pshufd, 0, 1, 0
+AVX_INSTR pshufhw, 0, 1, 0
+AVX_INSTR pshuflw, 0, 1, 0
+AVX_INSTR psignb, 0, 0, 0
+AVX_INSTR psignw, 0, 0, 0
+AVX_INSTR psignd, 0, 0, 0
+AVX_INSTR psllw, 0, 0, 0
+AVX_INSTR pslld, 0, 0, 0
+AVX_INSTR psllq, 0, 0, 0
+AVX_INSTR pslldq, 0, 0, 0
+AVX_INSTR psraw, 0, 0, 0
+AVX_INSTR psrad, 0, 0, 0
+AVX_INSTR psrlw, 0, 0, 0
+AVX_INSTR psrld, 0, 0, 0
+AVX_INSTR psrlq, 0, 0, 0
+AVX_INSTR psrldq, 0, 0, 0
+AVX_INSTR psubb, 0, 0, 0
+AVX_INSTR psubw, 0, 0, 0
+AVX_INSTR psubd, 0, 0, 0
+AVX_INSTR psubq, 0, 0, 0
+AVX_INSTR psubsb, 0, 0, 0
+AVX_INSTR psubsw, 0, 0, 0
+AVX_INSTR psubusb, 0, 0, 0
+AVX_INSTR psubusw, 0, 0, 0
+AVX_INSTR ptest, 0, 0, 0
+AVX_INSTR punpckhbw, 0, 0, 0
+AVX_INSTR punpckhwd, 0, 0, 0
+AVX_INSTR punpckhdq, 0, 0, 0
+AVX_INSTR punpckhqdq, 0, 0, 0
+AVX_INSTR punpcklbw, 0, 0, 0
+AVX_INSTR punpcklwd, 0, 0, 0
+AVX_INSTR punpckldq, 0, 0, 0
+AVX_INSTR punpcklqdq, 0, 0, 0
+AVX_INSTR pxor, 0, 0, 1
+AVX_INSTR shufps, 1, 1, 0
+AVX_INSTR subpd, 1, 0, 0
+AVX_INSTR subps, 1, 0, 0
+AVX_INSTR subsd, 1, 0, 0
+AVX_INSTR subss, 1, 0, 0
+AVX_INSTR unpckhpd, 1, 0, 0
+AVX_INSTR unpckhps, 1, 0, 0
+AVX_INSTR unpcklpd, 1, 0, 0
+AVX_INSTR unpcklps, 1, 0, 0
+AVX_INSTR xorpd, 1, 0, 1
+AVX_INSTR xorps, 1, 0, 1
+
+; 3DNow instructions, for sharing code between AVX, SSE and 3DN
+AVX_INSTR pfadd, 1, 0, 1
+AVX_INSTR pfsub, 1, 0, 0
+AVX_INSTR pfmul, 1, 0, 1
+
+; base-4 constants for shuffles
+%assign i 0
+%rep 256
+ %assign j ((i>>6)&3)*1000 + ((i>>4)&3)*100 + ((i>>2)&3)*10 + (i&3)
+ %if j < 10
+ CAT_XDEFINE q000, j, i
+ %elif j < 100
+ CAT_XDEFINE q00, j, i
+ %elif j < 1000
+ CAT_XDEFINE q0, j, i
+ %else
+ CAT_XDEFINE q, j, i
+ %endif
+%assign i i+1
+%endrep
+%undef i
+%undef j
+
+%macro FMA_INSTR 3
+ %macro %1 4-7 %1, %2, %3
+ %if cpuflag(xop)
+ v%5 %1, %2, %3, %4
+ %else
+ %6 %1, %2, %3
+ %7 %1, %4
+ %endif
+ %endmacro
+%endmacro
+
+FMA_INSTR pmacsdd, pmulld, paddd
+FMA_INSTR pmacsww, pmullw, paddw
+FMA_INSTR pmadcswd, pmaddwd, paddd
+
+; tzcnt is equivalent to "rep bsf" and is backwards-compatible with bsf.
+; This lets us use tzcnt without bumping the yasm version requirement yet.
+%define tzcnt rep bsf
diff --git a/chromium/third_party/libyuv/tools/valgrind-libyuv/libyuv_tests.bat b/chromium/third_party/libyuv/tools/valgrind-libyuv/libyuv_tests.bat
new file mode 100644
index 00000000000..e37f09eb25f
--- /dev/null
+++ b/chromium/third_party/libyuv/tools/valgrind-libyuv/libyuv_tests.bat
@@ -0,0 +1,79 @@
+@echo off
+:: Copyright (c) 2012 The LibYuv Project Authors. All rights reserved.
+::
+:: Use of this source code is governed by a BSD-style license
+:: that can be found in the LICENSE file in the root of the source
+:: tree. An additional intellectual property rights grant can be found
+:: in the file PATENTS. All contributing project authors may
+:: be found in the AUTHORS file in the root of the source tree.
+
+:: This script is a copy of chrome_tests.bat with the following changes:
+:: - Invokes libyuv_tests.py instead of chrome_tests.py
+:: - Chromium's Valgrind scripts directory is added to the PYTHONPATH to make
+:: it possible to execute the Python scripts properly.
+
+:: TODO(timurrrr): batch files 'export' all the variables to the parent shell
+set THISDIR=%~dp0
+set TOOL_NAME="unknown"
+
+:: Get the tool name and put it into TOOL_NAME {{{1
+:: NB: SHIFT command doesn't modify %*
+:PARSE_ARGS_LOOP
+ if %1 == () GOTO:TOOLNAME_NOT_FOUND
+ if %1 == --tool GOTO:TOOLNAME_FOUND
+ SHIFT
+ goto :PARSE_ARGS_LOOP
+
+:TOOLNAME_NOT_FOUND
+echo "Please specify a tool (tsan or drmemory) by using --tool flag"
+exit /B 1
+
+:TOOLNAME_FOUND
+SHIFT
+set TOOL_NAME=%1
+:: }}}
+if "%TOOL_NAME%" == "drmemory" GOTO :SETUP_DRMEMORY
+if "%TOOL_NAME%" == "drmemory_light" GOTO :SETUP_DRMEMORY
+if "%TOOL_NAME%" == "drmemory_full" GOTO :SETUP_DRMEMORY
+if "%TOOL_NAME%" == "drmemory_pattern" GOTO :SETUP_DRMEMORY
+if "%TOOL_NAME%" == "tsan" GOTO :SETUP_TSAN
+echo "Unknown tool: `%TOOL_NAME%`! Only tsan and drmemory are supported."
+exit /B 1
+
+:SETUP_DRMEMORY
+if NOT "%DRMEMORY_COMMAND%"=="" GOTO :RUN_TESTS
+:: Set up DRMEMORY_COMMAND to invoke Dr. Memory {{{1
+set DRMEMORY_PATH=%THISDIR%..\..\third_party\drmemory
+set DRMEMORY_SFX=%DRMEMORY_PATH%\drmemory-windows-sfx.exe
+if EXIST %DRMEMORY_SFX% GOTO DRMEMORY_BINARY_OK
+echo "Can't find Dr. Memory executables."
+echo "See http://www.chromium.org/developers/how-tos/using-valgrind/dr-memory"
+echo "for the instructions on how to get them."
+exit /B 1
+
+:DRMEMORY_BINARY_OK
+%DRMEMORY_SFX% -o%DRMEMORY_PATH%\unpacked -y
+set DRMEMORY_COMMAND=%DRMEMORY_PATH%\unpacked\bin\drmemory.exe
+:: }}}
+goto :RUN_TESTS
+
+:SETUP_TSAN
+:: Set up PIN_COMMAND to invoke TSan {{{1
+set TSAN_PATH=%THISDIR%..\..\third_party\tsan
+set TSAN_SFX=%TSAN_PATH%\tsan-x86-windows-sfx.exe
+if EXIST %TSAN_SFX% GOTO TSAN_BINARY_OK
+echo "Can't find ThreadSanitizer executables."
+echo "See http://www.chromium.org/developers/how-tos/using-valgrind/threadsanitizer/threadsanitizer-on-windows"
+echo "for the instructions on how to get them."
+exit /B 1
+
+:TSAN_BINARY_OK
+%TSAN_SFX% -o%TSAN_PATH%\unpacked -y
+set PIN_COMMAND=%TSAN_PATH%\unpacked\tsan-x86-windows\tsan.bat
+:: }}}
+goto :RUN_TESTS
+
+:RUN_TESTS
+set PYTHONPATH=%THISDIR%..\python\google;%THISDIR%..\valgrind
+set RUNNING_ON_VALGRIND=yes
+python %THISDIR%libyuv_tests.py %*
diff --git a/chromium/third_party/libyuv/tools/valgrind-libyuv/libyuv_tests.py b/chromium/third_party/libyuv/tools/valgrind-libyuv/libyuv_tests.py
new file mode 100755
index 00000000000..f93e97bb71f
--- /dev/null
+++ b/chromium/third_party/libyuv/tools/valgrind-libyuv/libyuv_tests.py
@@ -0,0 +1,118 @@
+#!/usr/bin/env python
+# Copyright (c) 2012 The LibYuv Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS. All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+
+"""Runs various libyuv tests through valgrind_test.py.
+
+This script inherits from chrome_tests.py in Chrome, but allows running any test
+instead of only the hard-coded ones. It uses the -t cmdline flag to do this and
+only supports specifying a single test for each run.
+
+Suppression files:
+The Chrome valgrind directory we use as a DEPS dependency contains the following
+suppression files:
+ valgrind/memcheck/suppressions.txt
+ valgrind/memcheck/suppressions_mac.txt
+ valgrind/tsan/suppressions.txt
+ valgrind/tsan/suppressions_mac.txt
+ valgrind/tsan/suppressions_win32.txt
+Since they're referenced from the chrome_tests.py script, we have similar files
+below the directory of this script. When executing, this script will set up both
+Chrome's suppression files and our own, so we can easily maintain libyuv-specific
+suppressions in our own files.
+"""
+
+import logging
+import optparse
+import os
+import sys
+
+import logging_utils
+import path_utils
+
+import chrome_tests
+
+
+class LibyuvTest(chrome_tests.ChromeTests):
+ """Class that handles setup of suppressions for libyuv.
+
+ Everything else is inherited from chrome_tests.ChromeTests.
+ """
+
+ def _DefaultCommand(self, tool, exe=None, valgrind_test_args=None):
+ """Override command-building method so we can add more suppressions."""
+ cmd = chrome_tests.ChromeTests._DefaultCommand(self, tool, exe,
+ valgrind_test_args)
+    # When ChromeTests._DefaultCommand has executed, it has set up suppression
+    # files based on what's found in the memcheck/ or tsan/ subdirectories of
+    # this script's location. If running on Mac or Windows, additional
+    # platform-specific files have also been added.
+    # Since only the ones located below this directory are added, we must also
+    # add the ones maintained by Chrome, located in ../valgrind.
+
+    # The idea is to look for --suppressions arguments in the cmd list and add a
+    # modified copy of each one, pointing at the corresponding file in
+    # ../valgrind. If we simply replaced 'valgrind-libyuv' with 'valgrind'
+    # we might produce invalid paths if other parts of the path contain that
+    # string. That's why the code below only replaces the end of the path.
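+    # For example, '--suppressions=.../valgrind-libyuv/memcheck/suppressions.txt'
+    # gets a sibling entry '--suppressions=.../valgrind/memcheck/suppressions.txt'.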
+ script_dir = path_utils.ScriptDir()
+ old_base, _ = os.path.split(script_dir)
+ new_dir = os.path.join(old_base, 'valgrind')
+ add_suppressions = []
+ for token in cmd:
+ if '--suppressions' in token:
+ add_suppressions.append(token.replace(script_dir, new_dir))
+ return add_suppressions + cmd
+
+
+def main(_):
+ parser = optparse.OptionParser('usage: %prog -b <dir> -t <test> <test args>')
+ parser.disable_interspersed_args()
+ parser.add_option('-b', '--build_dir',
+ help=('Location of the compiler output. Can only be used '
+ 'when the test argument does not contain this path.'))
+ parser.add_option('-t', '--test', help='Test to run.')
+ parser.add_option('', '--baseline', action='store_true', default=False,
+ help='Generate baseline data instead of validating')
+ parser.add_option('', '--gtest_filter',
+ help='Additional arguments to --gtest_filter')
+ parser.add_option('', '--gtest_repeat',
+ help='Argument for --gtest_repeat')
+ parser.add_option('-v', '--verbose', action='store_true', default=False,
+ help='Verbose output - enable debug log messages')
+ parser.add_option('', '--tool', dest='valgrind_tool', default='memcheck',
+ help='Specify a valgrind tool to run the tests under')
+ parser.add_option('', '--tool_flags', dest='valgrind_tool_flags', default='',
+ help='Specify custom flags for the selected valgrind tool')
+ parser.add_option('', '--keep_logs', action='store_true', default=False,
+ help=('Store memory tool logs in the <tool>.logs directory '
+ 'instead of /tmp.\nThis can be useful for tool '
+ 'developers/maintainers.\nPlease note that the <tool>'
+ '.logs directory will be clobbered on tool startup.'))
+ options, args = parser.parse_args()
+
+ if options.verbose:
+ logging_utils.config_root(logging.DEBUG)
+ else:
+ logging_utils.config_root()
+
+ if not options.test:
+ parser.error('--test not specified')
+
+ # If --build_dir is provided, prepend it to the test executable if needed.
+ test_executable = options.test
+ if options.build_dir and not test_executable.startswith(options.build_dir):
+ test_executable = os.path.join(options.build_dir, test_executable)
+ args = [test_executable] + args
+
+ test = LibyuvTest(options, args, 'cmdline')
+ return test.Run()
+
+if __name__ == '__main__':
+ return_code = main(sys.argv)
+ sys.exit(return_code)
diff --git a/chromium/third_party/libyuv/tools/valgrind-libyuv/libyuv_tests.sh b/chromium/third_party/libyuv/tools/valgrind-libyuv/libyuv_tests.sh
new file mode 100755
index 00000000000..4fee7daed69
--- /dev/null
+++ b/chromium/third_party/libyuv/tools/valgrind-libyuv/libyuv_tests.sh
@@ -0,0 +1,138 @@
+#!/bin/bash
+# Copyright (c) 2012 The LibYuv Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS. All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+
+# Set up some paths and redirect the arguments to libyuv_tests.py.
+
+# This script is a copy of the chrome_tests.sh wrapper script with the following
+# changes:
+# - The locate_valgrind.sh script in Chromium's Valgrind scripts dir is used
+#   to locate the Valgrind framework install.
+# - libyuv_tests.py is invoked instead of chrome_tests.py.
+# - Chromium's Valgrind scripts directory is added to the PYTHONPATH to make it
+# possible to execute the Python scripts properly.
+
+export THISDIR=`dirname $0`
+ARGV_COPY="$@"
+
+# We need to set CHROME_VALGRIND iff using Memcheck or TSan-Valgrind:
+# tools/valgrind-libyuv/libyuv_tests.sh --tool memcheck
+# or
+# tools/valgrind-libyuv/libyuv_tests.sh --tool=memcheck
+# (same for "--tool=tsan")
+tool="memcheck" # Default to memcheck.
+while (( "$#" ))
+do
+ if [[ "$1" == "--tool" ]]
+ then
+ tool="$2"
+ shift
+ elif [[ "$1" =~ --tool=(.*) ]]
+ then
+ tool="${BASH_REMATCH[1]}"
+ fi
+ shift
+done
+
+NEEDS_VALGRIND=0
+NEEDS_DRMEMORY=0
+
+case "$tool" in
+ "memcheck")
+ NEEDS_VALGRIND=1
+ ;;
+ "tsan" | "tsan_rv")
+ if [[ "`uname -s`" == CYGWIN* ]]
+ then
+ NEEDS_PIN=1
+ else
+ NEEDS_VALGRIND=1
+ fi
+ ;;
+ "drmemory" | "drmemory_light" | "drmemory_full" | "drmemory_pattern")
+ NEEDS_DRMEMORY=1
+ ;;
+esac
+
+# For Libyuv, we'll use the locate_valgrind.sh script in Chromium's Valgrind
+# scripts dir to locate the Valgrind framework install
+CHROME_VALGRIND_SCRIPTS=$THISDIR/../valgrind
+
+if [ "$NEEDS_VALGRIND" == "1" ]
+then
+ CHROME_VALGRIND=`sh $CHROME_VALGRIND_SCRIPTS/locate_valgrind.sh`
+ if [ "$CHROME_VALGRIND" = "" ]
+ then
+ # locate_valgrind.sh failed
+ exit 1
+ fi
+ echo "Using valgrind binaries from ${CHROME_VALGRIND}"
+
+ PATH="${CHROME_VALGRIND}/bin:$PATH"
+ # We need to set these variables to override the default lib paths
+ # hard-coded into the Valgrind binary.
+ export VALGRIND_LIB="$CHROME_VALGRIND/lib/valgrind"
+ export VALGRIND_LIB_INNER="$CHROME_VALGRIND/lib/valgrind"
+
+ # Clean up some /tmp directories that might be stale due to interrupted
+ # chrome_tests.py execution.
+ # FYI:
+ # -mtime +1 <- only print files modified more than 24h ago,
+ # -print0/-0 are needed to handle possible newlines in the filenames.
+ echo "Cleanup /tmp from Valgrind stuff"
+ find /tmp -maxdepth 1 \(\
+ -name "vgdb-pipe-*" -or -name "vg_logs_*" -or -name "valgrind.*" \
+ \) -mtime +1 -print0 | xargs -0 rm -rf
+fi
+
+if [ "$NEEDS_DRMEMORY" == "1" ]
+then
+ if [ -z "$DRMEMORY_COMMAND" ]
+ then
+ DRMEMORY_PATH="$THISDIR/../../third_party/drmemory"
+ DRMEMORY_SFX="$DRMEMORY_PATH/drmemory-windows-sfx.exe"
+ if [ ! -f "$DRMEMORY_SFX" ]
+ then
+ echo "Can't find Dr. Memory executables."
+ echo "See http://www.chromium.org/developers/how-tos/using-valgrind/dr-memory"
+ echo "for the instructions on how to get them."
+ exit 1
+ fi
+
+ chmod +x "$DRMEMORY_SFX" # Cygwin won't run it without +x.
+ "$DRMEMORY_SFX" -o"$DRMEMORY_PATH/unpacked" -y
+ export DRMEMORY_COMMAND="$DRMEMORY_PATH/unpacked/bin/drmemory.exe"
+ fi
+fi
+
+if [ "$NEEDS_PIN" == "1" ]
+then
+ if [ -z "$PIN_COMMAND" ]
+ then
+ # Set up PIN_COMMAND to invoke TSan.
+ TSAN_PATH="$THISDIR/../../third_party/tsan"
+ TSAN_SFX="$TSAN_PATH/tsan-x86-windows-sfx.exe"
+ echo "$TSAN_SFX"
+ if [ ! -f "$TSAN_SFX" ]
+ then
+ echo "Can't find ThreadSanitizer executables."
+ echo "See http://www.chromium.org/developers/how-tos/using-valgrind/threadsanitizer/threadsanitizer-on-windows"
+ echo "for the instructions on how to get them."
+ exit 1
+ fi
+
+ chmod +x "$TSAN_SFX" # Cygwin won't run it without +x.
+ "$TSAN_SFX" -o"$TSAN_PATH"/unpacked -y
+ export PIN_COMMAND="$TSAN_PATH/unpacked/tsan-x86-windows/tsan.bat"
+ fi
+fi
+
+# Add Chrome's Valgrind scripts dir to the PYTHONPATH since it contains
+# the scripts that are needed for this script to run.
+PYTHONPATH=$THISDIR/../python/google:$CHROME_VALGRIND_SCRIPTS python \
+ "$THISDIR/libyuv_tests.py" $ARGV_COPY
diff --git a/chromium/third_party/libyuv/tools/valgrind-libyuv/memcheck/OWNERS b/chromium/third_party/libyuv/tools/valgrind-libyuv/memcheck/OWNERS
new file mode 100644
index 00000000000..72e8ffc0db8
--- /dev/null
+++ b/chromium/third_party/libyuv/tools/valgrind-libyuv/memcheck/OWNERS
@@ -0,0 +1 @@
+*
diff --git a/chromium/third_party/libyuv/tools/valgrind-libyuv/memcheck/PRESUBMIT.py b/chromium/third_party/libyuv/tools/valgrind-libyuv/memcheck/PRESUBMIT.py
new file mode 100644
index 00000000000..46ff4cfcf1c
--- /dev/null
+++ b/chromium/third_party/libyuv/tools/valgrind-libyuv/memcheck/PRESUBMIT.py
@@ -0,0 +1,99 @@
+#!/usr/bin/env python
+# Copyright (c) 2012 The LibYuv Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS. All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+
+"""
+Copied from Chrome's src/tools/valgrind/memcheck/PRESUBMIT.py
+
+See http://dev.chromium.org/developers/how-tos/depottools/presubmit-scripts
+for more details on the presubmit API built into gcl.
+"""
+
+import os
+import re
+import sys
+
+def CheckChange(input_api, output_api):
+ """Checks the memcheck suppressions files for bad data."""
+
+ # Add the path to the Chrome valgrind dir to the import path:
+ tools_vg_path = os.path.join(input_api.PresubmitLocalPath(), '..', '..',
+ 'valgrind')
+ sys.path.append(tools_vg_path)
+ import suppressions
+
+ sup_regex = re.compile('suppressions.*\.txt$')
+ suppressions = {}
+ errors = []
+ check_for_memcheck = False
+ # skip_next_line has 3 possible values:
+ # - False: don't skip the next line.
+ # - 'skip_suppression_name': the next line is a suppression name, skip.
+ # - 'skip_param': the next line is a system call parameter error, skip.
+ skip_next_line = False
+ for f in filter(lambda x: sup_regex.search(x.LocalPath()),
+ input_api.AffectedFiles()):
+ for line, line_num in zip(f.NewContents(),
+ xrange(1, len(f.NewContents()) + 1)):
+ line = line.lstrip()
+ if line.startswith('#') or not line:
+ continue
+
+ if skip_next_line:
+ if skip_next_line == 'skip_suppression_name':
+ if 'insert_a_suppression_name_here' in line:
+ errors.append('"insert_a_suppression_name_here" is not a valid '
+ 'suppression name')
+ if suppressions.has_key(line):
+ if f.LocalPath() == suppressions[line][0]:
+ errors.append('suppression with name "%s" at %s line %s '
+ 'has already been defined at line %s' %
+ (line, f.LocalPath(), line_num,
+ suppressions[line][1]))
+ else:
+ errors.append('suppression with name "%s" at %s line %s '
+ 'has already been defined at %s line %s' %
+ (line, f.LocalPath(), line_num,
+ suppressions[line][0], suppressions[line][1]))
+ else:
+ suppressions[line] = (f.LocalPath(), line_num)
+ check_for_memcheck = True
+ skip_next_line = False
+ continue
+ if check_for_memcheck:
+ if not line.startswith('Memcheck:'):
+ errors.append('"%s" should be "Memcheck:..." in %s line %s' %
+ (line, f.LocalPath(), line_num))
+ check_for_memcheck = False
+ if line == '{':
+ skip_next_line = 'skip_suppression_name'
+ continue
+ if line == "Memcheck:Param":
+ skip_next_line = 'skip_param'
+ continue
+
+ if (line.startswith('fun:') or line.startswith('obj:') or
+ line.startswith('Memcheck:') or line == '}' or
+ line == '...'):
+ continue
+ errors.append('"%s" is probably wrong: %s line %s' % (line, f.LocalPath(),
+ line_num))
+ if errors:
+ return [output_api.PresubmitError('\n'.join(errors))]
+ return []
+
+def CheckChangeOnUpload(input_api, output_api):
+ return CheckChange(input_api, output_api)
+
+def CheckChangeOnCommit(input_api, output_api):
+ return CheckChange(input_api, output_api)
+
+def GetPreferredTrySlaves():
+ # We don't have any memcheck slaves yet, so there's no use for this method.
+ # When we do, the slave name(s) should be added to this list.
+ return []
diff --git a/chromium/third_party/libyuv/tools/valgrind-libyuv/memcheck/suppressions.txt b/chromium/third_party/libyuv/tools/valgrind-libyuv/memcheck/suppressions.txt
new file mode 100644
index 00000000000..3ad0c8ccc57
--- /dev/null
+++ b/chromium/third_party/libyuv/tools/valgrind-libyuv/memcheck/suppressions.txt
@@ -0,0 +1,5 @@
+# This file is used in addition to the one already maintained in Chrome.
+# It acts as a placeholder for future additions to this project.
+# It must exist for the Python wrapper script to work properly.
+
+
diff --git a/chromium/third_party/libyuv/tools/valgrind-libyuv/memcheck/suppressions_mac.txt b/chromium/third_party/libyuv/tools/valgrind-libyuv/memcheck/suppressions_mac.txt
new file mode 100644
index 00000000000..3ad0c8ccc57
--- /dev/null
+++ b/chromium/third_party/libyuv/tools/valgrind-libyuv/memcheck/suppressions_mac.txt
@@ -0,0 +1,5 @@
+# This file is used in addition to the one already maintained in Chrome.
+# It acts as a placeholder for future additions to this project.
+# It must exist for the Python wrapper script to work properly.
+
+
diff --git a/chromium/third_party/libyuv/tools/valgrind-libyuv/memcheck/suppressions_win32.txt b/chromium/third_party/libyuv/tools/valgrind-libyuv/memcheck/suppressions_win32.txt
new file mode 100644
index 00000000000..3ad0c8ccc57
--- /dev/null
+++ b/chromium/third_party/libyuv/tools/valgrind-libyuv/memcheck/suppressions_win32.txt
@@ -0,0 +1,5 @@
+# This file is used in addition to the one already maintained in Chrome.
+# It acts as a placeholder for future additions to this project.
+# It must exist for the Python wrapper script to work properly.
+
+
diff --git a/chromium/third_party/libyuv/tools/valgrind-libyuv/tsan/OWNERS b/chromium/third_party/libyuv/tools/valgrind-libyuv/tsan/OWNERS
new file mode 100644
index 00000000000..72e8ffc0db8
--- /dev/null
+++ b/chromium/third_party/libyuv/tools/valgrind-libyuv/tsan/OWNERS
@@ -0,0 +1 @@
+*
diff --git a/chromium/third_party/libyuv/tools/valgrind-libyuv/tsan/PRESUBMIT.py b/chromium/third_party/libyuv/tools/valgrind-libyuv/tsan/PRESUBMIT.py
new file mode 100644
index 00000000000..d25b6ebcee5
--- /dev/null
+++ b/chromium/third_party/libyuv/tools/valgrind-libyuv/tsan/PRESUBMIT.py
@@ -0,0 +1,41 @@
+#!/usr/bin/env python
+# Copyright (c) 2012 The LibYuv Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS. All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+
+"""
+Copied from Chrome's src/tools/valgrind/tsan/PRESUBMIT.py
+
+See http://dev.chromium.org/developers/how-tos/depottools/presubmit-scripts
+for more details on the presubmit API built into gcl.
+"""
+
+import os
+import re
+import sys
+
+def CheckChange(input_api, output_api):
+ """Checks the TSan suppressions files for bad suppressions."""
+
+ # Add the path to the Chrome valgrind dir to the import path:
+ tools_vg_path = os.path.join(input_api.PresubmitLocalPath(), '..', '..',
+ 'valgrind')
+ sys.path.append(tools_vg_path)
+ import suppressions
+
+ return suppressions.PresubmitCheck(input_api, output_api)
+
+def CheckChangeOnUpload(input_api, output_api):
+ return CheckChange(input_api, output_api)
+
+def CheckChangeOnCommit(input_api, output_api):
+ return CheckChange(input_api, output_api)
+
+def GetPreferredTrySlaves():
+ # We don't have any tsan slaves yet, so there's no use for this method.
+ # When we do, the slave name(s) should be added to this list.
+ return []
diff --git a/chromium/third_party/libyuv/tools/valgrind-libyuv/tsan/suppressions.txt b/chromium/third_party/libyuv/tools/valgrind-libyuv/tsan/suppressions.txt
new file mode 100644
index 00000000000..3ad0c8ccc57
--- /dev/null
+++ b/chromium/third_party/libyuv/tools/valgrind-libyuv/tsan/suppressions.txt
@@ -0,0 +1,5 @@
+# This file is used in addition to the one already maintained in Chrome.
+# It acts as a placeholder for future additions to this project.
+# It must exist for the Python wrapper script to work properly.
+
+
diff --git a/chromium/third_party/libyuv/tools/valgrind-libyuv/tsan/suppressions_mac.txt b/chromium/third_party/libyuv/tools/valgrind-libyuv/tsan/suppressions_mac.txt
new file mode 100644
index 00000000000..3ad0c8ccc57
--- /dev/null
+++ b/chromium/third_party/libyuv/tools/valgrind-libyuv/tsan/suppressions_mac.txt
@@ -0,0 +1,5 @@
+# This file is used in addition to the one already maintained in Chrome.
+# It acts as a placeholder for future additions to this project.
+# It must exist for the Python wrapper script to work properly.
+
+
diff --git a/chromium/third_party/libyuv/tools/valgrind-libyuv/tsan/suppressions_win32.txt b/chromium/third_party/libyuv/tools/valgrind-libyuv/tsan/suppressions_win32.txt
new file mode 100644
index 00000000000..3ad0c8ccc57
--- /dev/null
+++ b/chromium/third_party/libyuv/tools/valgrind-libyuv/tsan/suppressions_win32.txt
@@ -0,0 +1,5 @@
+# This file is used in addition to the one already maintained in Chrome.
+# It acts as a placeholder for future additions to this project.
+# It must exist for the Python wrapper script to work properly.
+
+
diff --git a/chromium/third_party/libyuv/unit_test/basictypes_test.cc b/chromium/third_party/libyuv/unit_test/basictypes_test.cc
new file mode 100644
index 00000000000..a805dfa90d1
--- /dev/null
+++ b/chromium/third_party/libyuv/unit_test/basictypes_test.cc
@@ -0,0 +1,60 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "../unit_test/unit_test.h"
+#include "libyuv/basic_types.h"
+
+namespace libyuv {
+
+TEST_F(libyuvTest, Endian) {
+ uint16 v16 = 0x1234u;
+ uint8 first_byte = *reinterpret_cast<uint8*>(&v16);
+#if defined(LIBYUV_LITTLE_ENDIAN)
+ EXPECT_EQ(0x34u, first_byte);
+#else
+ EXPECT_EQ(0x12u, first_byte);
+#endif
+}
+
+TEST_F(libyuvTest, SizeOfTypes) {
+ int8 i8 = -1;
+ uint8 u8 = 1u;
+ int16 i16 = -1;
+ uint16 u16 = 1u;
+ int32 i32 = -1;
+ uint32 u32 = 1u;
+ int64 i64 = -1;
+ uint64 u64 = 1u;
+ EXPECT_EQ(1u, sizeof(i8));
+ EXPECT_EQ(1u, sizeof(u8));
+ EXPECT_EQ(2u, sizeof(i16));
+ EXPECT_EQ(2u, sizeof(u16));
+ EXPECT_EQ(4u, sizeof(i32));
+ EXPECT_EQ(4u, sizeof(u32));
+ EXPECT_EQ(8u, sizeof(i64));
+ EXPECT_EQ(8u, sizeof(u64));
+ EXPECT_GT(0, i8);
+ EXPECT_LT(0u, u8);
+ EXPECT_GT(0, i16);
+ EXPECT_LT(0u, u16);
+ EXPECT_GT(0, i32);
+ EXPECT_LT(0u, u32);
+ EXPECT_GT(0, i64);
+ EXPECT_LT(0u, u64);
+}
+
+TEST_F(libyuvTest, SizeOfConstants) {
+ EXPECT_EQ(8u, sizeof(INT64_C(0)));
+ EXPECT_EQ(8u, sizeof(UINT64_C(0)));
+ EXPECT_EQ(8u, sizeof(INT64_C(0x1234567887654321)));
+ EXPECT_EQ(8u, sizeof(UINT64_C(0x8765432112345678)));
+}
+
+} // namespace libyuv
diff --git a/chromium/third_party/libyuv/unit_test/compare_test.cc b/chromium/third_party/libyuv/unit_test/compare_test.cc
new file mode 100644
index 00000000000..7fe6c3b0b19
--- /dev/null
+++ b/chromium/third_party/libyuv/unit_test/compare_test.cc
@@ -0,0 +1,437 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+
+#include "../unit_test/unit_test.h"
+#include "libyuv/basic_types.h"
+#include "libyuv/compare.h"
+#include "libyuv/cpu_id.h"
+
+namespace libyuv {
+
+// A hash seed of 5381 is recommended for djb2.
+static uint32 ReferenceHashDjb2(const uint8* src, uint64 count, uint32 seed) {
+ uint32 hash = seed;
+ if (count > 0) {
+ do {
+ hash = hash * 33 + *src++;
+ } while (--count);
+ }
+ return hash;
+}
+
+TEST_F(libyuvTest, Djb2_Test) {
+ const int kMaxTest = benchmark_width_ * benchmark_height_;
+ align_buffer_64(src_a, kMaxTest)
+ align_buffer_64(src_b, kMaxTest)
+
+ const char* fox = "The quick brown fox jumps over the lazy dog"
+ " and feels as if he were in the seventh heaven of typography"
+ " together with Hermann Zapf";
+ uint32 foxhash = HashDjb2(reinterpret_cast<const uint8*>(fox), 131, 5381);
+ const uint32 kExpectedFoxHash = 2611006483;
+ EXPECT_EQ(kExpectedFoxHash, foxhash);
+
+ for (int i = 0; i < kMaxTest; ++i) {
+ src_a[i] = (random() & 0xff);
+ src_b[i] = (random() & 0xff);
+ }
+ // Compare different buffers. Expect hash is different.
+ uint32 h1 = HashDjb2(src_a, kMaxTest, 5381);
+ uint32 h2 = HashDjb2(src_b, kMaxTest, 5381);
+ EXPECT_NE(h1, h2);
+
+ // Make last half same. Expect hash is different.
+ memcpy(src_a + kMaxTest / 2, src_b + kMaxTest / 2, kMaxTest / 2);
+ h1 = HashDjb2(src_a, kMaxTest, 5381);
+ h2 = HashDjb2(src_b, kMaxTest, 5381);
+ EXPECT_NE(h1, h2);
+
+ // Make first half same. Expect hash is different.
+ memcpy(src_a + kMaxTest / 2, src_a, kMaxTest / 2);
+ memcpy(src_b + kMaxTest / 2, src_b, kMaxTest / 2);
+ memcpy(src_a, src_b, kMaxTest / 2);
+ h1 = HashDjb2(src_a, kMaxTest, 5381);
+ h2 = HashDjb2(src_b, kMaxTest, 5381);
+ EXPECT_NE(h1, h2);
+
+ // Make same. Expect hash is same.
+ memcpy(src_a, src_b, kMaxTest);
+ h1 = HashDjb2(src_a, kMaxTest, 5381);
+ h2 = HashDjb2(src_b, kMaxTest, 5381);
+ EXPECT_EQ(h1, h2);
+
+ // Make seed different. Expect hash is different.
+ memcpy(src_a, src_b, kMaxTest);
+ h1 = HashDjb2(src_a, kMaxTest, 5381);
+ h2 = HashDjb2(src_b, kMaxTest, 1234);
+ EXPECT_NE(h1, h2);
+
+ // Make one byte different in middle. Expect hash is different.
+ memcpy(src_a, src_b, kMaxTest);
+ ++src_b[kMaxTest / 2];
+ h1 = HashDjb2(src_a, kMaxTest, 5381);
+ h2 = HashDjb2(src_b, kMaxTest, 5381);
+ EXPECT_NE(h1, h2);
+
+ // Make first byte different. Expect hash is different.
+ memcpy(src_a, src_b, kMaxTest);
+ ++src_b[0];
+ h1 = HashDjb2(src_a, kMaxTest, 5381);
+ h2 = HashDjb2(src_b, kMaxTest, 5381);
+ EXPECT_NE(h1, h2);
+
+ // Make last byte different. Expect hash is different.
+ memcpy(src_a, src_b, kMaxTest);
+ ++src_b[kMaxTest - 1];
+ h1 = HashDjb2(src_a, kMaxTest, 5381);
+ h2 = HashDjb2(src_b, kMaxTest, 5381);
+ EXPECT_NE(h1, h2);
+
+ // Make all zeros. Test different lengths. Expect hash is different.
+ memset(src_a, 0, kMaxTest);
+ h1 = HashDjb2(src_a, kMaxTest, 5381);
+ h2 = HashDjb2(src_a, kMaxTest / 2, 5381);
+ EXPECT_NE(h1, h2);
+
+ // Make all zeros and a seed of zero. Test different lengths. Expect hash is same.
+ memset(src_a, 0, kMaxTest);
+ h1 = HashDjb2(src_a, kMaxTest, 0);
+ h2 = HashDjb2(src_a, kMaxTest / 2, 0);
+ EXPECT_EQ(h1, h2);
+
+ free_aligned_buffer_64(src_a)
+ free_aligned_buffer_64(src_b)
+}
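+
+// A minimal illustrative sketch (not from the original libyuv sources) of two
+// properties of the djb2 recurrence that the test above relies on, assuming
+// HashDjb2 matches ReferenceHashDjb2 (which the benchmarks below verify):
+//   1. Chaining: hashing buffer A, then using the result as the seed for
+//      buffer B, equals hashing the concatenation A||B in one call.
+//   2. Zero fixed point: with seed 0, a run of zero bytes hashes to 0 for any
+//      length (0 * 33 + 0 == 0), which is why the "seed of zero" case above
+//      expects equal hashes for different lengths.
+TEST_F(libyuvTest, Djb2_Properties_Sketch) {
+  uint8 data[8] = {1, 2, 3, 4, 5, 6, 7, 8};
+  uint32 whole = HashDjb2(data, 8, 5381);
+  uint32 chained = HashDjb2(data + 4, 4, HashDjb2(data, 4, 5381));
+  EXPECT_EQ(whole, chained);
+
+  uint8 zeros[16];
+  memset(zeros, 0, sizeof(zeros));
+  EXPECT_EQ(0u, HashDjb2(zeros, 16, 0));
+  EXPECT_EQ(HashDjb2(zeros, 8, 0), HashDjb2(zeros, 16, 0));
+}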
+
+TEST_F(libyuvTest, BenchmarkDjb2_Opt) {
+ const int kMaxTest = benchmark_width_ * benchmark_height_;
+ align_buffer_64(src_a, kMaxTest)
+
+ for (int i = 0; i < kMaxTest; ++i) {
+ src_a[i] = i;
+ }
+ uint32 h2 = ReferenceHashDjb2(src_a, kMaxTest, 5381);
+ uint32 h1;
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ h1 = HashDjb2(src_a, kMaxTest, 5381);
+ }
+ EXPECT_EQ(h1, h2);
+ free_aligned_buffer_64(src_a)
+}
+
+TEST_F(libyuvTest, BenchmarkDjb2_Unaligned) {
+ const int kMaxTest = benchmark_width_ * benchmark_height_;
+ align_buffer_64(src_a, kMaxTest + 1)
+ for (int i = 0; i < kMaxTest; ++i) {
+ src_a[i + 1] = i;
+ }
+ uint32 h2 = ReferenceHashDjb2(src_a + 1, kMaxTest, 5381);
+ uint32 h1;
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ h1 = HashDjb2(src_a + 1, kMaxTest, 5381);
+ }
+ EXPECT_EQ(h1, h2);
+ free_aligned_buffer_64(src_a)
+}
+
+TEST_F(libyuvTest, BenchmarkSumSquareError_Opt) {
+ const int kMaxWidth = 4096 * 3;
+ align_buffer_64(src_a, kMaxWidth)
+ align_buffer_64(src_b, kMaxWidth)
+ memset(src_a, 0, kMaxWidth);
+ memset(src_b, 0, kMaxWidth);
+
+ memcpy(src_a, "test0123test4567", 16);
+ memcpy(src_b, "tick0123tock4567", 16);
+ uint64 h1 = ComputeSumSquareError(src_a, src_b, 16);
+ EXPECT_EQ(790u, h1);
+
+ for (int i = 0; i < kMaxWidth; ++i) {
+ src_a[i] = i;
+ src_b[i] = i;
+ }
+ memset(src_a, 0, kMaxWidth);
+ memset(src_b, 0, kMaxWidth);
+
+ int count = benchmark_iterations_ *
+ ((benchmark_width_ * benchmark_height_ + kMaxWidth - 1) / kMaxWidth);
+ for (int i = 0; i < count; ++i) {
+ h1 = ComputeSumSquareError(src_a, src_b, kMaxWidth);
+ }
+
+ EXPECT_EQ(0, h1);
+
+ free_aligned_buffer_64(src_a)
+ free_aligned_buffer_64(src_b)
+}
+
+TEST_F(libyuvTest, SumSquareError) {
+ const int kMaxWidth = 4096 * 3;
+ align_buffer_64(src_a, kMaxWidth)
+ align_buffer_64(src_b, kMaxWidth)
+ memset(src_a, 0, kMaxWidth);
+ memset(src_b, 0, kMaxWidth);
+
+ uint64 err;
+ err = ComputeSumSquareError(src_a, src_b, kMaxWidth);
+
+ EXPECT_EQ(0, err);
+
+ memset(src_a, 1, kMaxWidth);
+ err = ComputeSumSquareError(src_a, src_b, kMaxWidth);
+
+ EXPECT_EQ(err, kMaxWidth);
+
+ memset(src_a, 190, kMaxWidth);
+ memset(src_b, 193, kMaxWidth);
+ err = ComputeSumSquareError(src_a, src_b, kMaxWidth);
+
+ EXPECT_EQ(kMaxWidth * 3 * 3, err);
+
+ srandom(time(NULL));
+
+ for (int i = 0; i < kMaxWidth; ++i) {
+ src_a[i] = (random() & 0xff);
+ src_b[i] = (random() & 0xff);
+ }
+
+ MaskCpuFlags(0);
+ uint64 c_err = ComputeSumSquareError(src_a, src_b, kMaxWidth);
+
+ MaskCpuFlags(-1);
+ uint64 opt_err = ComputeSumSquareError(src_a, src_b, kMaxWidth);
+
+ EXPECT_EQ(c_err, opt_err);
+
+ free_aligned_buffer_64(src_a)
+ free_aligned_buffer_64(src_b)
+}
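+
+// An illustrative scalar sketch (not from the original libyuv sources) of the
+// semantics the expectations above imply for ComputeSumSquareError: a plain
+// sum of squared byte differences, so 190 vs. 193 everywhere contributes
+// 3 * 3 per byte. SumSquareErrorRef is a hypothetical helper for reading the
+// test, not libyuv's actual implementation.
+static uint64 SumSquareErrorRef(const uint8* a, const uint8* b, int count) {
+  uint64 sse = 0;
+  for (int i = 0; i < count; ++i) {
+    int diff = static_cast<int>(a[i]) - static_cast<int>(b[i]);
+    sse += static_cast<uint64>(diff * diff);
+  }
+  return sse;
+}
+
+TEST_F(libyuvTest, SumSquareErrorReferenceSketch) {
+  uint8 a[64];
+  uint8 b[64];
+  memset(a, 190, sizeof(a));
+  memset(b, 193, sizeof(b));
+  EXPECT_EQ(64u * 9u, SumSquareErrorRef(a, b, 64));
+  EXPECT_EQ(SumSquareErrorRef(a, b, 64), ComputeSumSquareError(a, b, 64));
+}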
+
+TEST_F(libyuvTest, BenchmarkPsnr_Opt) {
+ align_buffer_64(src_a, benchmark_width_ * benchmark_height_)
+ align_buffer_64(src_b, benchmark_width_ * benchmark_height_)
+ for (int i = 0; i < benchmark_width_ * benchmark_height_; ++i) {
+ src_a[i] = i;
+ src_b[i] = i;
+ }
+
+ MaskCpuFlags(-1);
+
+ double opt_time = get_time();
+ for (int i = 0; i < benchmark_iterations_; ++i)
+ CalcFramePsnr(src_a, benchmark_width_,
+ src_b, benchmark_width_,
+ benchmark_width_, benchmark_height_);
+
+ opt_time = (get_time() - opt_time) / benchmark_iterations_;
+ printf("BenchmarkPsnr_Opt - %8.2f us opt\n", opt_time * 1e6);
+
+ EXPECT_EQ(0, 0);  // Pass if we get this far.
+
+ free_aligned_buffer_64(src_a)
+ free_aligned_buffer_64(src_b)
+}
+
+TEST_F(libyuvTest, Psnr) {
+ const int kSrcWidth = benchmark_width_;
+ const int kSrcHeight = benchmark_height_;
+ const int b = 128;
+ const int kSrcPlaneSize = (kSrcWidth + b * 2) * (kSrcHeight + b * 2);
+ const int kSrcStride = 2 * b + kSrcWidth;
+ align_buffer_64(src_a, kSrcPlaneSize)
+ align_buffer_64(src_b, kSrcPlaneSize)
+ memset(src_a, 0, kSrcPlaneSize);
+ memset(src_b, 0, kSrcPlaneSize);
+
+ double err;
+ err = CalcFramePsnr(src_a + kSrcStride * b + b, kSrcStride,
+ src_b + kSrcStride * b + b, kSrcStride,
+ kSrcWidth, kSrcHeight);
+
+ EXPECT_EQ(err, kMaxPsnr);
+
+ memset(src_a, 255, kSrcPlaneSize);
+
+ err = CalcFramePsnr(src_a + kSrcStride * b + b, kSrcStride,
+ src_b + kSrcStride * b + b, kSrcStride,
+ kSrcWidth, kSrcHeight);
+
+ EXPECT_EQ(err, 0.0);
+
+ memset(src_a, 1, kSrcPlaneSize);
+
+ err = CalcFramePsnr(src_a + kSrcStride * b + b, kSrcStride,
+ src_b + kSrcStride * b + b, kSrcStride,
+ kSrcWidth, kSrcHeight);
+
+ EXPECT_GT(err, 48.0);
+ EXPECT_LT(err, 49.0);
+
+ for (int i = 0; i < kSrcPlaneSize; ++i) {
+ src_a[i] = i;
+ }
+
+ err = CalcFramePsnr(src_a + kSrcStride * b + b, kSrcStride,
+ src_b + kSrcStride * b + b, kSrcStride,
+ kSrcWidth, kSrcHeight);
+
+ EXPECT_GT(err, 4.0);
+ if (kSrcWidth * kSrcHeight >= 256) {
+ EXPECT_LT(err, 5.0);
+ }
+
+ srandom(time(NULL));
+
+ memset(src_a, 0, kSrcPlaneSize);
+ memset(src_b, 0, kSrcPlaneSize);
+
+ for (int i = b; i < (kSrcHeight + b); ++i) {
+ for (int j = b; j < (kSrcWidth + b); ++j) {
+ src_a[(i * kSrcStride) + j] = (random() & 0xff);
+ src_b[(i * kSrcStride) + j] = (random() & 0xff);
+ }
+ }
+
+ MaskCpuFlags(0);
+ double c_err, opt_err;
+
+ c_err = CalcFramePsnr(src_a + kSrcStride * b + b, kSrcStride,
+ src_b + kSrcStride * b + b, kSrcStride,
+ kSrcWidth, kSrcHeight);
+
+ MaskCpuFlags(-1);
+
+ opt_err = CalcFramePsnr(src_a + kSrcStride * b + b, kSrcStride,
+ src_b + kSrcStride * b + b, kSrcStride,
+ kSrcWidth, kSrcHeight);
+
+ EXPECT_EQ(opt_err, c_err);
+
+ free_aligned_buffer_64(src_a)
+ free_aligned_buffer_64(src_b)
+}
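+
+// For reference, the checks above are consistent with the usual global PSNR
+// definition (the exact formula inside CalcFramePsnr is an assumption here;
+// only the expected values come from this test):
+//
+//   \mathrm{PSNR} = 10 \log_{10}\!\left( \frac{255^2 \cdot w \cdot h}{\mathrm{SSE}} \right)
+//
+// Identical frames give SSE = 0 and the test expects kMaxPsnr. Frames that
+// differ by 255 everywhere give SSE = 255^2 * w * h, i.e. exactly 0 dB.
+// Frames that differ by 1 everywhere give SSE = w * h, i.e.
+// 10 \log_{10}(255^2) = 20 \log_{10} 255 \approx 48.13 dB, which is why the
+// test expects a value strictly between 48.0 and 49.0.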
+
+TEST_F(libyuvTest, BenchmarkSsim_Opt) {
+ align_buffer_64(src_a, benchmark_width_ * benchmark_height_)
+ align_buffer_64(src_b, benchmark_width_ * benchmark_height_)
+ for (int i = 0; i < benchmark_width_ * benchmark_height_; ++i) {
+ src_a[i] = i;
+ src_b[i] = i;
+ }
+
+ MaskCpuFlags(-1);
+
+ double opt_time = get_time();
+ for (int i = 0; i < benchmark_iterations_; ++i)
+ CalcFrameSsim(src_a, benchmark_width_,
+ src_b, benchmark_width_,
+ benchmark_width_, benchmark_height_);
+
+ opt_time = (get_time() - opt_time) / benchmark_iterations_;
+ printf("BenchmarkSsim_Opt - %8.2f us opt\n", opt_time * 1e6);
+
+ EXPECT_EQ(0, 0); // Pass if we get this far.
+
+ free_aligned_buffer_64(src_a)
+ free_aligned_buffer_64(src_b)
+}
+
+TEST_F(libyuvTest, Ssim) {
+ const int kSrcWidth = benchmark_width_;
+ const int kSrcHeight = benchmark_height_;
+ const int b = 128;
+ const int kSrcPlaneSize = (kSrcWidth + b * 2) * (kSrcHeight + b * 2);
+ const int kSrcStride = 2 * b + kSrcWidth;
+ align_buffer_64(src_a, kSrcPlaneSize)
+ align_buffer_64(src_b, kSrcPlaneSize)
+ memset(src_a, 0, kSrcPlaneSize);
+ memset(src_b, 0, kSrcPlaneSize);
+
+ if (kSrcWidth <= 8 || kSrcHeight <= 8) {
+ printf("warning - SSIM size too small; only testing that the function executes.\n");
+ }
+
+ double err;
+ err = CalcFrameSsim(src_a + kSrcStride * b + b, kSrcStride,
+ src_b + kSrcStride * b + b, kSrcStride,
+ kSrcWidth, kSrcHeight);
+
+ if (kSrcWidth > 8 && kSrcHeight > 8) {
+ EXPECT_EQ(err, 1.0);
+ }
+
+ memset(src_a, 255, kSrcPlaneSize);
+
+ err = CalcFrameSsim(src_a + kSrcStride * b + b, kSrcStride,
+ src_b + kSrcStride * b + b, kSrcStride,
+ kSrcWidth, kSrcHeight);
+
+ if (kSrcWidth > 8 && kSrcHeight > 8) {
+ EXPECT_LT(err, 0.0001);
+ }
+
+ memset(src_a, 1, kSrcPlaneSize);
+
+ err = CalcFrameSsim(src_a + kSrcStride * b + b, kSrcStride,
+ src_b + kSrcStride * b + b, kSrcStride,
+ kSrcWidth, kSrcHeight);
+
+ if (kSrcWidth > 8 && kSrcHeight > 8) {
+ EXPECT_GT(err, 0.0001);
+ EXPECT_LT(err, 0.9);
+ }
+
+ for (int i = 0; i < kSrcPlaneSize; ++i) {
+ src_a[i] = i;
+ }
+
+ err = CalcFrameSsim(src_a + kSrcStride * b + b, kSrcStride,
+ src_b + kSrcStride * b + b, kSrcStride,
+ kSrcWidth, kSrcHeight);
+
+ if (kSrcWidth > 8 && kSrcHeight > 8) {
+ EXPECT_GT(err, 0.0);
+ EXPECT_LT(err, 0.01);
+ }
+
+ srandom(time(NULL));
+ for (int i = b; i < (kSrcHeight + b); ++i) {
+ for (int j = b; j < (kSrcWidth + b); ++j) {
+ src_a[(i * kSrcStride) + j] = (random() & 0xff);
+ src_b[(i * kSrcStride) + j] = (random() & 0xff);
+ }
+ }
+
+ MaskCpuFlags(0);
+ double c_err, opt_err;
+
+ c_err = CalcFrameSsim(src_a + kSrcStride * b + b, kSrcStride,
+ src_b + kSrcStride * b + b, kSrcStride,
+ kSrcWidth, kSrcHeight);
+
+ MaskCpuFlags(-1);
+
+ opt_err = CalcFrameSsim(src_a + kSrcStride * b + b, kSrcStride,
+ src_b + kSrcStride * b + b, kSrcStride,
+ kSrcWidth, kSrcHeight);
+
+ if (kSrcWidth > 8 && kSrcHeight > 8) {
+ EXPECT_EQ(opt_err, c_err);
+ }
+
+ free_aligned_buffer_64(src_a)
+ free_aligned_buffer_64(src_b)
+}
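+
+// For reference, the quantity compared above is the structural similarity
+// index. The conventional per-window definition is shown below; the window
+// size and constants actually used by CalcFrameSsim are an assumption here,
+// not read from its implementation:
+//
+//   \mathrm{SSIM}(x, y) =
+//     \frac{(2\mu_x\mu_y + C_1)(2\sigma_{xy} + C_2)}
+//          {(\mu_x^2 + \mu_y^2 + C_1)(\sigma_x^2 + \sigma_y^2 + C_2)},
+//   \quad C_1 = (0.01 \cdot 255)^2,\; C_2 = (0.03 \cdot 255)^2
+//
+// Identical frames give 1.0 (the EXPECT_EQ(err, 1.0) case above), and the
+// all-0 vs. all-255 case collapses to C_1 / (255^2 + C_1) \approx 0.0001,
+// consistent with the EXPECT_LT(err, 0.0001) bound.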
+
+} // namespace libyuv
diff --git a/chromium/third_party/libyuv/unit_test/convert_test.cc b/chromium/third_party/libyuv/unit_test/convert_test.cc
new file mode 100644
index 00000000000..7e96c63a4d5
--- /dev/null
+++ b/chromium/third_party/libyuv/unit_test/convert_test.cc
@@ -0,0 +1,990 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+#include <time.h>
+
+#include "libyuv/compare.h"
+#include "libyuv/convert.h"
+#include "libyuv/convert_argb.h"
+#include "libyuv/convert_from.h"
+#include "libyuv/convert_from_argb.h"
+#include "libyuv/cpu_id.h"
+#include "libyuv/format_conversion.h"
+#ifdef HAVE_JPEG
+#include "libyuv/mjpeg_decoder.h"
+#endif
+#include "libyuv/planar_functions.h"
+#include "libyuv/rotate.h"
+#include "../unit_test/unit_test.h"
+
+#if defined(_MSC_VER)
+#define SIMD_ALIGNED(var) __declspec(align(16)) var
+#else // __GNUC__
+#define SIMD_ALIGNED(var) var __attribute__((aligned(16)))
+#endif
+
+namespace libyuv {
+
+#define SUBSAMPLE(v, a) ((((v) + (a) - 1)) / (a))
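+
+// SUBSAMPLE is a ceiling division, so odd luma dimensions round *up* when
+// sizing chroma planes. A small illustrative check (not from the original
+// libyuv sources) of the values the test macros below depend on:
+TEST_F(libyuvTest, SubsampleMacroSketch) {
+  EXPECT_EQ(2, SUBSAMPLE(4, 2));    // Even width divides exactly.
+  EXPECT_EQ(3, SUBSAMPLE(5, 2));    // Odd width rounds up.
+  EXPECT_EQ(2, SUBSAMPLE(5, 4));    // I411-style 4:1 horizontal subsampling.
+  // For a 1279x721 I420 frame the U and V planes are therefore 640x361.
+  EXPECT_EQ(640, SUBSAMPLE(1279, 2));
+  EXPECT_EQ(361, SUBSAMPLE(721, 2));
+}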
+
+#define TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG, OFF) \
+TEST_F(libyuvTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \
+ const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
+ const int kHeight = benchmark_height_; \
+ align_buffer_64(src_y, kWidth * kHeight + OFF); \
+ align_buffer_64(src_u, \
+ SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * \
+ SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + OFF); \
+ align_buffer_64(src_v, \
+ SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * \
+ SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + OFF); \
+ align_buffer_64(dst_y_c, kWidth * kHeight); \
+ align_buffer_64(dst_u_c, \
+ SUBSAMPLE(kWidth, SUBSAMP_X) * \
+ SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ align_buffer_64(dst_v_c, \
+ SUBSAMPLE(kWidth, SUBSAMP_X) * \
+ SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ align_buffer_64(dst_y_opt, kWidth * kHeight); \
+ align_buffer_64(dst_u_opt, \
+ SUBSAMPLE(kWidth, SUBSAMP_X) * \
+ SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ align_buffer_64(dst_v_opt, \
+ SUBSAMPLE(kWidth, SUBSAMP_X) * \
+ SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ srandom(time(NULL)); \
+ for (int i = 0; i < kHeight; ++i) \
+ for (int j = 0; j < kWidth; ++j) \
+ src_y[(i * kWidth) + j + OFF] = (random() & 0xff); \
+ for (int i = 0; i < SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); ++i) { \
+ for (int j = 0; j < SUBSAMPLE(kWidth, SRC_SUBSAMP_X); ++j) { \
+ src_u[(i * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) + j + OFF] = \
+ (random() & 0xff); \
+ src_v[(i * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) + j + OFF] = \
+ (random() & 0xff); \
+ } \
+ } \
+ MaskCpuFlags(0); \
+ SRC_FMT_PLANAR##To##FMT_PLANAR(src_y + OFF, kWidth, \
+ src_u + OFF, \
+ SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
+ src_v + OFF, \
+ SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
+ dst_y_c, kWidth, \
+ dst_u_c, SUBSAMPLE(kWidth, SUBSAMP_X), \
+ dst_v_c, SUBSAMPLE(kWidth, SUBSAMP_X), \
+ kWidth, NEG kHeight); \
+ MaskCpuFlags(-1); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ SRC_FMT_PLANAR##To##FMT_PLANAR(src_y + OFF, kWidth, \
+ src_u + OFF, \
+ SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
+ src_v + OFF, \
+ SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
+ dst_y_opt, kWidth, \
+ dst_u_opt, SUBSAMPLE(kWidth, SUBSAMP_X), \
+ dst_v_opt, SUBSAMPLE(kWidth, SUBSAMP_X), \
+ kWidth, NEG kHeight); \
+ } \
+ int max_diff = 0; \
+ for (int i = 0; i < kHeight; ++i) { \
+ for (int j = 0; j < kWidth; ++j) { \
+ int abs_diff = \
+ abs(static_cast<int>(dst_y_c[i * kWidth + j]) - \
+ static_cast<int>(dst_y_opt[i * kWidth + j])); \
+ if (abs_diff > max_diff) { \
+ max_diff = abs_diff; \
+ } \
+ } \
+ } \
+ EXPECT_LE(max_diff, 1); \
+ for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
+ for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \
+ int abs_diff = \
+ abs(static_cast<int>(dst_u_c[i * \
+ SUBSAMPLE(kWidth, SUBSAMP_X) + j]) - \
+ static_cast<int>(dst_u_opt[i * \
+ SUBSAMPLE(kWidth, SUBSAMP_X) + j])); \
+ if (abs_diff > max_diff) { \
+ max_diff = abs_diff; \
+ } \
+ } \
+ } \
+ EXPECT_LE(max_diff, 1); \
+ for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
+ for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \
+ int abs_diff = \
+ abs(static_cast<int>(dst_v_c[i * \
+ SUBSAMPLE(kWidth, SUBSAMP_X) + j]) - \
+ static_cast<int>(dst_v_opt[i * \
+ SUBSAMPLE(kWidth, SUBSAMP_X) + j])); \
+ if (abs_diff > max_diff) { \
+ max_diff = abs_diff; \
+ } \
+ } \
+ } \
+ EXPECT_LE(max_diff, 1); \
+ free_aligned_buffer_64(dst_y_c) \
+ free_aligned_buffer_64(dst_u_c) \
+ free_aligned_buffer_64(dst_v_c) \
+ free_aligned_buffer_64(dst_y_opt) \
+ free_aligned_buffer_64(dst_u_opt) \
+ free_aligned_buffer_64(dst_v_opt) \
+ free_aligned_buffer_64(src_y) \
+ free_aligned_buffer_64(src_u) \
+ free_aligned_buffer_64(src_v) \
+}
+
+#define TESTPLANARTOP(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \
+ TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
+ benchmark_width_ - 4, _Any, +, 0) \
+ TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
+ benchmark_width_, _Unaligned, +, 1) \
+ TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
+ benchmark_width_, _Invert, -, 0) \
+ TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
+ benchmark_width_, _Opt, +, 0)
+
+TESTPLANARTOP(I420, 2, 2, I420, 2, 2)
+TESTPLANARTOP(I422, 2, 1, I420, 2, 2)
+TESTPLANARTOP(I444, 1, 1, I420, 2, 2)
+TESTPLANARTOP(I411, 4, 1, I420, 2, 2)
+TESTPLANARTOP(I420, 2, 2, I422, 2, 1)
+TESTPLANARTOP(I420, 2, 2, I444, 1, 1)
+TESTPLANARTOP(I420, 2, 2, I411, 4, 1)
+TESTPLANARTOP(I420, 2, 2, I420Mirror, 2, 2)
+TESTPLANARTOP(I422, 2, 1, I422, 2, 1)
+TESTPLANARTOP(I444, 1, 1, I444, 1, 1)
+
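+// Each TESTPLANARTOP line above expands into four TEST_Fs that exercise the
+// same conversion under different edge conditions (the names come from the
+// token pasting in TESTPLANARTOPI). Schematically, for one of the rows:
+//
+//   TESTPLANARTOP(I422, 2, 1, I420, 2, 2)
+//     -> I422ToI420_Any        width of benchmark_width_ - 4
+//     -> I422ToI420_Unaligned  source pointers offset by +1 byte
+//     -> I422ToI420_Invert     negative height (an inverted conversion)
+//     -> I422ToI420_Opt        full benchmark width, aligned buffers
+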
+#define TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG, OFF) \
+TEST_F(libyuvTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \
+ const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
+ const int kHeight = benchmark_height_; \
+ align_buffer_64(src_y, kWidth * kHeight + OFF); \
+ align_buffer_64(src_u, \
+ SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * \
+ SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + OFF); \
+ align_buffer_64(src_v, \
+ SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * \
+ SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + OFF); \
+ align_buffer_64(dst_y_c, kWidth * kHeight); \
+ align_buffer_64(dst_uv_c, SUBSAMPLE(kWidth * 2, SUBSAMP_X) * \
+ SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ align_buffer_64(dst_y_opt, kWidth * kHeight); \
+ align_buffer_64(dst_uv_opt, SUBSAMPLE(kWidth * 2, SUBSAMP_X) * \
+ SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ srandom(time(NULL)); \
+ for (int i = 0; i < kHeight; ++i) \
+ for (int j = 0; j < kWidth; ++j) \
+ src_y[(i * kWidth) + j + OFF] = (random() & 0xff); \
+ for (int i = 0; i < SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); ++i) { \
+ for (int j = 0; j < SUBSAMPLE(kWidth, SRC_SUBSAMP_X); ++j) { \
+ src_u[(i * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) + j + OFF] = \
+ (random() & 0xff); \
+ src_v[(i * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) + j + OFF] = \
+ (random() & 0xff); \
+ } \
+ } \
+ MaskCpuFlags(0); \
+ SRC_FMT_PLANAR##To##FMT_PLANAR(src_y + OFF, kWidth, \
+ src_u + OFF, \
+ SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
+ src_v + OFF, \
+ SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
+ dst_y_c, kWidth, \
+ dst_uv_c, SUBSAMPLE(kWidth * 2, SUBSAMP_X), \
+ kWidth, NEG kHeight); \
+ MaskCpuFlags(-1); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ SRC_FMT_PLANAR##To##FMT_PLANAR(src_y + OFF, kWidth, \
+ src_u + OFF, \
+ SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
+ src_v + OFF, \
+ SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
+ dst_y_opt, kWidth, \
+ dst_uv_opt, \
+ SUBSAMPLE(kWidth * 2, SUBSAMP_X), \
+ kWidth, NEG kHeight); \
+ } \
+ int max_diff = 0; \
+ for (int i = 0; i < kHeight; ++i) { \
+ for (int j = 0; j < kWidth; ++j) { \
+ int abs_diff = \
+ abs(static_cast<int>(dst_y_c[i * kWidth + j]) - \
+ static_cast<int>(dst_y_opt[i * kWidth + j])); \
+ if (abs_diff > max_diff) { \
+ max_diff = abs_diff; \
+ } \
+ } \
+ } \
+ EXPECT_LE(max_diff, 1); \
+ for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
+ for (int j = 0; j < SUBSAMPLE(kWidth * 2, SUBSAMP_X); ++j) { \
+ int abs_diff = \
+ abs(static_cast<int>(dst_uv_c[i * \
+ SUBSAMPLE(kWidth * 2, SUBSAMP_X) + j]) - \
+ static_cast<int>(dst_uv_opt[i * \
+ SUBSAMPLE(kWidth * 2, SUBSAMP_X) + j])); \
+ if (abs_diff > max_diff) { \
+ max_diff = abs_diff; \
+ } \
+ } \
+ } \
+ EXPECT_LE(max_diff, 1); \
+ free_aligned_buffer_64(dst_y_c) \
+ free_aligned_buffer_64(dst_uv_c) \
+ free_aligned_buffer_64(dst_y_opt) \
+ free_aligned_buffer_64(dst_uv_opt) \
+ free_aligned_buffer_64(src_y) \
+ free_aligned_buffer_64(src_u) \
+ free_aligned_buffer_64(src_v) \
+}
+
+#define TESTPLANARTOBP(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \
+ TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
+ benchmark_width_ - 4, _Any, +, 0) \
+ TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
+ benchmark_width_, _Unaligned, +, 1) \
+ TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
+ benchmark_width_, _Invert, -, 0) \
+ TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
+ benchmark_width_, _Opt, +, 0)
+
+TESTPLANARTOBP(I420, 2, 2, NV12, 2, 2)
+TESTPLANARTOBP(I420, 2, 2, NV21, 2, 2)
+
+#define TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG, OFF) \
+TEST_F(libyuvTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \
+ const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
+ const int kHeight = benchmark_height_; \
+ align_buffer_64(src_y, kWidth * kHeight + OFF); \
+ align_buffer_64(src_uv, 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * \
+ SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + OFF); \
+ align_buffer_64(dst_y_c, kWidth * kHeight); \
+ align_buffer_64(dst_u_c, \
+ SUBSAMPLE(kWidth, SUBSAMP_X) * \
+ SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ align_buffer_64(dst_v_c, \
+ SUBSAMPLE(kWidth, SUBSAMP_X) * \
+ SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ align_buffer_64(dst_y_opt, kWidth * kHeight); \
+ align_buffer_64(dst_u_opt, \
+ SUBSAMPLE(kWidth, SUBSAMP_X) * \
+ SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ align_buffer_64(dst_v_opt, \
+ SUBSAMPLE(kWidth, SUBSAMP_X) * \
+ SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ srandom(time(NULL)); \
+ for (int i = 0; i < kHeight; ++i) \
+ for (int j = 0; j < kWidth; ++j) \
+ src_y[(i * kWidth) + j + OFF] = (random() & 0xff); \
+ for (int i = 0; i < SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); ++i) { \
+ for (int j = 0; j < 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X); ++j) { \
+ src_uv[(i * 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) + j + OFF] = \
+ (random() & 0xff); \
+ } \
+ } \
+ MaskCpuFlags(0); \
+ SRC_FMT_PLANAR##To##FMT_PLANAR(src_y + OFF, kWidth, \
+ src_uv + OFF, \
+ 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
+ dst_y_c, kWidth, \
+ dst_u_c, SUBSAMPLE(kWidth, SUBSAMP_X), \
+ dst_v_c, SUBSAMPLE(kWidth, SUBSAMP_X), \
+ kWidth, NEG kHeight); \
+ MaskCpuFlags(-1); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ SRC_FMT_PLANAR##To##FMT_PLANAR(src_y + OFF, kWidth, \
+ src_uv + OFF, \
+ 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
+ dst_y_opt, kWidth, \
+ dst_u_opt, SUBSAMPLE(kWidth, SUBSAMP_X), \
+ dst_v_opt, SUBSAMPLE(kWidth, SUBSAMP_X), \
+ kWidth, NEG kHeight); \
+ } \
+ int max_diff = 0; \
+ for (int i = 0; i < kHeight; ++i) { \
+ for (int j = 0; j < kWidth; ++j) { \
+ int abs_diff = \
+ abs(static_cast<int>(dst_y_c[i * kWidth + j]) - \
+ static_cast<int>(dst_y_opt[i * kWidth + j])); \
+ if (abs_diff > max_diff) { \
+ max_diff = abs_diff; \
+ } \
+ } \
+ } \
+ EXPECT_LE(max_diff, 1); \
+ for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
+ for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \
+ int abs_diff = \
+ abs(static_cast<int>(dst_u_c[i * \
+ SUBSAMPLE(kWidth, SUBSAMP_X) + j]) - \
+ static_cast<int>(dst_u_opt[i * \
+ SUBSAMPLE(kWidth, SUBSAMP_X) + j])); \
+ if (abs_diff > max_diff) { \
+ max_diff = abs_diff; \
+ } \
+ } \
+ } \
+ EXPECT_LE(max_diff, 1); \
+ for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
+ for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \
+ int abs_diff = \
+ abs(static_cast<int>(dst_v_c[i * \
+ SUBSAMPLE(kWidth, SUBSAMP_X) + j]) - \
+ static_cast<int>(dst_v_opt[i * \
+ SUBSAMPLE(kWidth, SUBSAMP_X) + j])); \
+ if (abs_diff > max_diff) { \
+ max_diff = abs_diff; \
+ } \
+ } \
+ } \
+ EXPECT_LE(max_diff, 1); \
+ free_aligned_buffer_64(dst_y_c) \
+ free_aligned_buffer_64(dst_u_c) \
+ free_aligned_buffer_64(dst_v_c) \
+ free_aligned_buffer_64(dst_y_opt) \
+ free_aligned_buffer_64(dst_u_opt) \
+ free_aligned_buffer_64(dst_v_opt) \
+ free_aligned_buffer_64(src_y) \
+ free_aligned_buffer_64(src_uv) \
+}
+
+#define TESTBIPLANARTOP(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \
+ TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
+ benchmark_width_ - 4, _Any, +, 0) \
+ TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
+ benchmark_width_, _Unaligned, +, 1) \
+ TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
+ benchmark_width_, _Invert, -, 0) \
+ TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
+ benchmark_width_, _Opt, +, 0)
+
+TESTBIPLANARTOP(NV12, 2, 2, I420, 2, 2)
+TESTBIPLANARTOP(NV21, 2, 2, I420, 2, 2)
+
+#define TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ W1280, DIFF, N, NEG, OFF, FMT_C, BPP_C) \
+TEST_F(libyuvTest, FMT_PLANAR##To##FMT_B##N) { \
+ const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
+ const int kHeight = benchmark_height_; \
+ const int kStrideB = ((kWidth * BPP_B + ALIGN - 1) / ALIGN) * ALIGN; \
+ const int kSizeUV = \
+ SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y); \
+ align_buffer_64(src_y, kWidth * kHeight + OFF); \
+ align_buffer_64(src_u, kSizeUV + OFF); \
+ align_buffer_64(src_v, kSizeUV + OFF); \
+ align_buffer_64(dst_argb_c, kStrideB * kHeight); \
+ align_buffer_64(dst_argb_opt, kStrideB * kHeight); \
+ memset(dst_argb_c, 0, kStrideB * kHeight); \
+ memset(dst_argb_opt, 0, kStrideB * kHeight); \
+ srandom(time(NULL)); \
+ for (int i = 0; i < kWidth * kHeight; ++i) { \
+ src_y[i + OFF] = (random() & 0xff); \
+ } \
+ for (int i = 0; i < kSizeUV; ++i) { \
+ src_u[i + OFF] = (random() & 0xff); \
+ src_v[i + OFF] = (random() & 0xff); \
+ } \
+ MaskCpuFlags(0); \
+ FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, \
+ src_u + OFF, SUBSAMPLE(kWidth, SUBSAMP_X), \
+ src_v + OFF, SUBSAMPLE(kWidth, SUBSAMP_X), \
+ dst_argb_c, kStrideB, \
+ kWidth, NEG kHeight); \
+ MaskCpuFlags(-1); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, \
+ src_u + OFF, SUBSAMPLE(kWidth, SUBSAMP_X), \
+ src_v + OFF, SUBSAMPLE(kWidth, SUBSAMP_X), \
+ dst_argb_opt, kStrideB, \
+ kWidth, NEG kHeight); \
+ } \
+ int max_diff = 0; \
+ /* Convert to ARGB so 565 is expanded to bytes that can be compared. */ \
+ align_buffer_64(dst_argb32_c, kWidth * BPP_C * kHeight); \
+ align_buffer_64(dst_argb32_opt, kWidth * BPP_C * kHeight); \
+ memset(dst_argb32_c, 0, kWidth * BPP_C * kHeight); \
+ memset(dst_argb32_opt, 0, kWidth * BPP_C * kHeight); \
+ FMT_B##To##FMT_C(dst_argb_c, kStrideB, \
+ dst_argb32_c, kWidth * BPP_C , \
+ kWidth, kHeight); \
+ FMT_B##To##FMT_C(dst_argb_opt, kStrideB, \
+ dst_argb32_opt, kWidth * BPP_C , \
+ kWidth, kHeight); \
+ for (int i = 0; i < kWidth * BPP_C * kHeight; ++i) { \
+ int abs_diff = \
+ abs(static_cast<int>(dst_argb32_c[i]) - \
+ static_cast<int>(dst_argb32_opt[i])); \
+ if (abs_diff > max_diff) { \
+ max_diff = abs_diff; \
+ } \
+ } \
+ EXPECT_LE(max_diff, DIFF); \
+ free_aligned_buffer_64(src_y) \
+ free_aligned_buffer_64(src_u) \
+ free_aligned_buffer_64(src_v) \
+ free_aligned_buffer_64(dst_argb_c) \
+ free_aligned_buffer_64(dst_argb_opt) \
+ free_aligned_buffer_64(dst_argb32_c) \
+ free_aligned_buffer_64(dst_argb32_opt) \
+}
+
+#define TESTPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ DIFF, FMT_C, BPP_C) \
+ TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ benchmark_width_ - 4, DIFF, _Any, +, 0, FMT_C, BPP_C) \
+ TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ benchmark_width_, DIFF, _Unaligned, +, 1, FMT_C, BPP_C) \
+ TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ benchmark_width_, DIFF, _Invert, -, 0, FMT_C, BPP_C) \
+ TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ benchmark_width_, DIFF, _Opt, +, 0, FMT_C, BPP_C)
+
+TESTPLANARTOB(I420, 2, 2, ARGB, 4, 4, 2, ARGB, 4)
+TESTPLANARTOB(I420, 2, 2, BGRA, 4, 4, 2, ARGB, 4)
+TESTPLANARTOB(I420, 2, 2, ABGR, 4, 4, 2, ARGB, 4)
+TESTPLANARTOB(I420, 2, 2, RGBA, 4, 4, 2, ARGB, 4)
+TESTPLANARTOB(I420, 2, 2, RAW, 3, 3, 2, ARGB, 4)
+TESTPLANARTOB(I420, 2, 2, RGB24, 3, 3, 2, ARGB, 4)
+TESTPLANARTOB(I420, 2, 2, RGB565, 2, 2, 9, ARGB, 4)
+TESTPLANARTOB(I420, 2, 2, ARGB1555, 2, 2, 9, ARGB, 4)
+TESTPLANARTOB(I420, 2, 2, ARGB4444, 2, 2, 17, ARGB, 4)
+TESTPLANARTOB(I422, 2, 1, ARGB, 4, 4, 2, ARGB, 4)
+TESTPLANARTOB(I422, 2, 1, BGRA, 4, 4, 2, ARGB, 4)
+TESTPLANARTOB(I422, 2, 1, ABGR, 4, 4, 2, ARGB, 4)
+TESTPLANARTOB(I422, 2, 1, RGBA, 4, 4, 2, ARGB, 4)
+TESTPLANARTOB(I411, 4, 1, ARGB, 4, 4, 2, ARGB, 4)
+TESTPLANARTOB(I444, 1, 1, ARGB, 4, 4, 2, ARGB, 4)
+TESTPLANARTOB(I420, 2, 2, YUY2, 2, 4, 1, ARGB, 4)
+TESTPLANARTOB(I420, 2, 2, UYVY, 2, 4, 1, ARGB, 4)
+TESTPLANARTOB(I422, 2, 1, YUY2, 2, 4, 0, ARGB, 4)
+TESTPLANARTOB(I422, 2, 1, UYVY, 2, 4, 0, ARGB, 4)
+TESTPLANARTOB(I420, 2, 2, I400, 1, 1, 0, ARGB, 4)
+TESTPLANARTOB(I420, 2, 2, BayerBGGR, 1, 1, 2, ARGB, 4)
+TESTPLANARTOB(I420, 2, 2, BayerRGGB, 1, 1, 2, ARGB, 4)
+TESTPLANARTOB(I420, 2, 2, BayerGBRG, 1, 1, 2, ARGB, 4)
+TESTPLANARTOB(I420, 2, 2, BayerGRBG, 1, 1, 2, ARGB, 4)
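+
+// An illustrative sketch (not from the original libyuv sources) of why the
+// RGB565 and ARGB1555 rows above tolerate DIFF = 9 and the ARGB4444 row
+// tolerates 17, while full 8-bit formats use 2: assuming the packed formats
+// are expanded back to 8 bits by bit replication, one least-significant step
+// in a 5-bit channel becomes a difference of 8 or 9 after expansion, and one
+// step in a 4-bit channel becomes exactly 17. The helpers below are
+// hypothetical and exist only to illustrate that arithmetic.
+static uint8 Expand5To8(uint8 v) {
+  return static_cast<uint8>((v << 3) | (v >> 2));
+}
+
+static uint8 Expand4To8(uint8 v) {
+  return static_cast<uint8>((v << 4) | v);
+}
+
+TEST_F(libyuvTest, ExpandLowBitDepthStepSketch) {
+  EXPECT_EQ(0, Expand5To8(0));
+  EXPECT_EQ(255, Expand5To8(31));
+  EXPECT_EQ(8, Expand5To8(1) - Expand5To8(0));    // Smallest 5-bit step.
+  EXPECT_EQ(9, Expand5To8(16) - Expand5To8(15));  // Largest 5-bit step.
+  EXPECT_EQ(255, Expand4To8(15));
+  EXPECT_EQ(17, Expand4To8(9) - Expand4To8(8));   // Every 4-bit step is 17.
+}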
+
+#define TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \
+ W1280, DIFF, N, NEG, OFF) \
+TEST_F(libyuvTest, FMT_PLANAR##To##FMT_B##N) { \
+ const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
+ const int kHeight = benchmark_height_; \
+ const int kStrideB = kWidth * BPP_B; \
+ align_buffer_64(src_y, kWidth * kHeight + OFF); \
+ align_buffer_64(src_uv, \
+ SUBSAMPLE(kWidth, SUBSAMP_X) * \
+ SUBSAMPLE(kHeight, SUBSAMP_Y) * 2 + OFF); \
+ align_buffer_64(dst_argb_c, kStrideB * kHeight); \
+ align_buffer_64(dst_argb_opt, kStrideB * kHeight); \
+ srandom(time(NULL)); \
+ for (int i = 0; i < kHeight; ++i) \
+ for (int j = 0; j < kWidth; ++j) \
+ src_y[(i * kWidth) + j + OFF] = (random() & 0xff); \
+ for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) \
+ for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X) * 2; ++j) { \
+ src_uv[(i * SUBSAMPLE(kWidth, SUBSAMP_X)) * 2 + j + OFF] = \
+ (random() & 0xff); \
+ } \
+ MaskCpuFlags(0); \
+ FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, \
+ src_uv + OFF, SUBSAMPLE(kWidth, SUBSAMP_X) * 2, \
+ dst_argb_c, kWidth * BPP_B, \
+ kWidth, NEG kHeight); \
+ MaskCpuFlags(-1); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, \
+ src_uv + OFF, SUBSAMPLE(kWidth, SUBSAMP_X) * 2, \
+ dst_argb_opt, kWidth * BPP_B, \
+ kWidth, NEG kHeight); \
+ } \
+ /* Convert to ARGB so 565 is expanded to bytes that can be compared. */ \
+ align_buffer_64(dst_argb32_c, kWidth * 4 * kHeight); \
+ align_buffer_64(dst_argb32_opt, kWidth * 4 * kHeight); \
+ memset(dst_argb32_c, 1, kWidth * 4 * kHeight); \
+ memset(dst_argb32_opt, 2, kWidth * 4 * kHeight); \
+ FMT_B##ToARGB(dst_argb_c, kStrideB, \
+ dst_argb32_c, kWidth * 4, \
+ kWidth, kHeight); \
+ FMT_B##ToARGB(dst_argb_opt, kStrideB, \
+ dst_argb32_opt, kWidth * 4, \
+ kWidth, kHeight); \
+ int max_diff = 0; \
+ for (int i = 0; i < kHeight; ++i) { \
+ for (int j = 0; j < kWidth * 4; ++j) { \
+ int abs_diff = \
+ abs(static_cast<int>(dst_argb32_c[i * kWidth * 4 + j]) - \
+ static_cast<int>(dst_argb32_opt[i * kWidth * 4 + j])); \
+ if (abs_diff > max_diff) { \
+ max_diff = abs_diff; \
+ } \
+ } \
+ } \
+ EXPECT_LE(max_diff, DIFF); \
+ free_aligned_buffer_64(src_y) \
+ free_aligned_buffer_64(src_uv) \
+ free_aligned_buffer_64(dst_argb_c) \
+ free_aligned_buffer_64(dst_argb_opt) \
+ free_aligned_buffer_64(dst_argb32_c) \
+ free_aligned_buffer_64(dst_argb32_opt) \
+}
+
+#define TESTBIPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, DIFF) \
+ TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \
+ benchmark_width_ - 4, DIFF, _Any, +, 0) \
+ TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \
+ benchmark_width_, DIFF, _Unaligned, +, 1) \
+ TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \
+ benchmark_width_, DIFF, _Invert, -, 0) \
+ TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \
+ benchmark_width_, DIFF, _Opt, +, 0)
+
+TESTBIPLANARTOB(NV12, 2, 2, ARGB, 4, 2)
+TESTBIPLANARTOB(NV21, 2, 2, ARGB, 4, 2)
+TESTBIPLANARTOB(NV12, 2, 2, RGB565, 2, 9)
+TESTBIPLANARTOB(NV21, 2, 2, RGB565, 2, 9)
+
+#define TESTATOPLANARI(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
+ W1280, DIFF, N, NEG, OFF) \
+TEST_F(libyuvTest, FMT_A##To##FMT_PLANAR##N) { \
+ const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
+ const int kHeight = benchmark_height_; \
+ const int kStride = (kWidth * 8 * BPP_A + 7) / 8; \
+ align_buffer_64(src_argb, kStride * kHeight + OFF); \
+ align_buffer_64(dst_y_c, kWidth * kHeight); \
+ align_buffer_64(dst_u_c, \
+ SUBSAMPLE(kWidth, SUBSAMP_X) * \
+ SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ align_buffer_64(dst_v_c, \
+ SUBSAMPLE(kWidth, SUBSAMP_X) * \
+ SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ align_buffer_64(dst_y_opt, kWidth * kHeight); \
+ align_buffer_64(dst_u_opt, \
+ SUBSAMPLE(kWidth, SUBSAMP_X) * \
+ SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ align_buffer_64(dst_v_opt, \
+ SUBSAMPLE(kWidth, SUBSAMP_X) * \
+ SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ memset(dst_y_c, 1, kWidth * kHeight); \
+ memset(dst_u_c, 0, \
+ SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ memset(dst_v_c, 0, \
+ SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ memset(dst_y_opt, 2, kWidth * kHeight); \
+ memset(dst_u_opt, 0, \
+ SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ memset(dst_v_opt, 0, \
+ SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ srandom(time(NULL)); \
+ for (int i = 0; i < kHeight; ++i) \
+ for (int j = 0; j < kStride; ++j) \
+ src_argb[(i * kStride) + j + OFF] = (random() & 0xff); \
+ MaskCpuFlags(0); \
+ FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, \
+ dst_y_c, kWidth, \
+ dst_u_c, SUBSAMPLE(kWidth, SUBSAMP_X), \
+ dst_v_c, SUBSAMPLE(kWidth, SUBSAMP_X), \
+ kWidth, NEG kHeight); \
+ MaskCpuFlags(-1); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, \
+ dst_y_opt, kWidth, \
+ dst_u_opt, SUBSAMPLE(kWidth, SUBSAMP_X), \
+ dst_v_opt, SUBSAMPLE(kWidth, SUBSAMP_X), \
+ kWidth, NEG kHeight); \
+ } \
+ int max_diff = 0; \
+ for (int i = 0; i < kHeight; ++i) { \
+ for (int j = 0; j < kWidth; ++j) { \
+ int abs_diff = \
+ abs(static_cast<int>(dst_y_c[i * kWidth + j]) - \
+ static_cast<int>(dst_y_opt[i * kWidth + j])); \
+ if (abs_diff > max_diff) { \
+ max_diff = abs_diff; \
+ } \
+ } \
+ } \
+ EXPECT_LE(max_diff, DIFF); \
+ for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
+ for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \
+ int abs_diff = \
+ abs(static_cast<int>(dst_u_c[i * \
+ SUBSAMPLE(kWidth, SUBSAMP_X) + j]) - \
+ static_cast<int>(dst_u_opt[i * \
+ SUBSAMPLE(kWidth, SUBSAMP_X) + j])); \
+ if (abs_diff > max_diff) { \
+ max_diff = abs_diff; \
+ } \
+ } \
+ } \
+ EXPECT_LE(max_diff, DIFF); \
+ for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
+ for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \
+ int abs_diff = \
+ abs(static_cast<int>(dst_v_c[i * \
+ SUBSAMPLE(kWidth, SUBSAMP_X) + j]) - \
+ static_cast<int>(dst_v_opt[i * \
+ SUBSAMPLE(kWidth, SUBSAMP_X) + j])); \
+ if (abs_diff > max_diff) { \
+ max_diff = abs_diff; \
+ } \
+ } \
+ } \
+ EXPECT_LE(max_diff, DIFF); \
+ free_aligned_buffer_64(dst_y_c) \
+ free_aligned_buffer_64(dst_u_c) \
+ free_aligned_buffer_64(dst_v_c) \
+ free_aligned_buffer_64(dst_y_opt) \
+ free_aligned_buffer_64(dst_u_opt) \
+ free_aligned_buffer_64(dst_v_opt) \
+ free_aligned_buffer_64(src_argb) \
+}
+
+#define TESTATOPLANAR(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, DIFF) \
+ TESTATOPLANARI(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
+ benchmark_width_ - 4, DIFF, _Any, +, 0) \
+ TESTATOPLANARI(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
+ benchmark_width_, DIFF, _Unaligned, +, 1) \
+ TESTATOPLANARI(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
+ benchmark_width_, DIFF, _Invert, -, 0) \
+ TESTATOPLANARI(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
+ benchmark_width_, DIFF, _Opt, +, 0)
+
+TESTATOPLANAR(ARGB, 4, I420, 2, 2, 4)
+#ifdef __arm__
+TESTATOPLANAR(ARGB, 4, J420, 2, 2, 4)
+#else
+TESTATOPLANAR(ARGB, 4, J420, 2, 2, 0)
+#endif
+TESTATOPLANAR(BGRA, 4, I420, 2, 2, 4)
+TESTATOPLANAR(ABGR, 4, I420, 2, 2, 4)
+TESTATOPLANAR(RGBA, 4, I420, 2, 2, 4)
+TESTATOPLANAR(RAW, 3, I420, 2, 2, 4)
+TESTATOPLANAR(RGB24, 3, I420, 2, 2, 4)
+TESTATOPLANAR(RGB565, 2, I420, 2, 2, 5)
+// TODO(fbarchard): Make 1555 neon work same as C code, reduce to diff 9.
+TESTATOPLANAR(ARGB1555, 2, I420, 2, 2, 15)
+TESTATOPLANAR(ARGB4444, 2, I420, 2, 2, 17)
+TESTATOPLANAR(ARGB, 4, I411, 4, 1, 4)
+TESTATOPLANAR(ARGB, 4, I422, 2, 1, 2)
+TESTATOPLANAR(ARGB, 4, I444, 1, 1, 2)
+TESTATOPLANAR(YUY2, 2, I420, 2, 2, 2)
+TESTATOPLANAR(UYVY, 2, I420, 2, 2, 2)
+TESTATOPLANAR(YUY2, 2, I422, 2, 1, 2)
+TESTATOPLANAR(UYVY, 2, I422, 2, 1, 2)
+TESTATOPLANAR(I400, 1, I420, 2, 2, 2)
+TESTATOPLANAR(BayerBGGR, 1, I420, 2, 2, 4)
+TESTATOPLANAR(BayerRGGB, 1, I420, 2, 2, 4)
+TESTATOPLANAR(BayerGBRG, 1, I420, 2, 2, 4)
+TESTATOPLANAR(BayerGRBG, 1, I420, 2, 2, 4)
+
+#define TESTATOBIPLANARI(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
+ W1280, N, NEG, OFF) \
+TEST_F(libyuvTest, FMT_A##To##FMT_PLANAR##N) { \
+ const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
+ const int kHeight = benchmark_height_; \
+ const int kStride = (kWidth * 8 * BPP_A + 7) / 8; \
+ align_buffer_64(src_argb, kStride * kHeight + OFF); \
+ align_buffer_64(dst_y_c, kWidth * kHeight); \
+ align_buffer_64(dst_uv_c, \
+ SUBSAMPLE(kWidth, SUBSAMP_X) * 2 * \
+ SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ align_buffer_64(dst_y_opt, kWidth * kHeight); \
+ align_buffer_64(dst_uv_opt, \
+ SUBSAMPLE(kWidth, SUBSAMP_X) * 2 * \
+ SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ srandom(time(NULL)); \
+ for (int i = 0; i < kHeight; ++i) \
+ for (int j = 0; j < kStride; ++j) \
+ src_argb[(i * kStride) + j + OFF] = (random() & 0xff); \
+ MaskCpuFlags(0); \
+ FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, \
+ dst_y_c, kWidth, \
+ dst_uv_c, SUBSAMPLE(kWidth, SUBSAMP_X) * 2, \
+ kWidth, NEG kHeight); \
+ MaskCpuFlags(-1); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, \
+ dst_y_opt, kWidth, \
+ dst_uv_opt, SUBSAMPLE(kWidth, SUBSAMP_X) * 2, \
+ kWidth, NEG kHeight); \
+ } \
+ int max_diff = 0; \
+ for (int i = 0; i < kHeight; ++i) { \
+ for (int j = 0; j < kWidth; ++j) { \
+ int abs_diff = \
+ abs(static_cast<int>(dst_y_c[i * kWidth + j]) - \
+ static_cast<int>(dst_y_opt[i * kWidth + j])); \
+ if (abs_diff > max_diff) { \
+ max_diff = abs_diff; \
+ } \
+ } \
+ } \
+ EXPECT_LE(max_diff, 4); \
+ for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
+ for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X) * 2; ++j) { \
+ int abs_diff = \
+ abs(static_cast<int>(dst_uv_c[i * \
+ SUBSAMPLE(kWidth, SUBSAMP_X) * 2 + j]) - \
+ static_cast<int>(dst_uv_opt[i * \
+ SUBSAMPLE(kWidth, SUBSAMP_X) * 2 + j])); \
+ if (abs_diff > max_diff) { \
+ max_diff = abs_diff; \
+ } \
+ } \
+ } \
+ EXPECT_LE(max_diff, 4); \
+ free_aligned_buffer_64(dst_y_c) \
+ free_aligned_buffer_64(dst_uv_c) \
+ free_aligned_buffer_64(dst_y_opt) \
+ free_aligned_buffer_64(dst_uv_opt) \
+ free_aligned_buffer_64(src_argb) \
+}
+
+#define TESTATOBIPLANAR(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \
+ TESTATOBIPLANARI(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
+ benchmark_width_ - 4, _Any, +, 0) \
+ TESTATOBIPLANARI(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
+ benchmark_width_, _Unaligned, +, 1) \
+ TESTATOBIPLANARI(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
+ benchmark_width_, _Invert, -, 0) \
+ TESTATOBIPLANARI(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
+ benchmark_width_, _Opt, +, 0)
+
+TESTATOBIPLANAR(ARGB, 4, NV12, 2, 2)
+TESTATOBIPLANAR(ARGB, 4, NV21, 2, 2)
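+
+// The biplanar converters above write a single interleaved UV plane. As
+// expanded from the macro, a direct call with caller-allocated NV12 buffers
+// looks like this (buffer names are illustrative):
+//   ARGBToNV12(src_argb, src_stride_argb,
+//              dst_y, dst_stride_y,
+//              dst_uv, dst_stride_uv,
+//              width, height);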
+
+#define TESTATOBI(FMT_A, BPP_A, STRIDE_A, \
+ FMT_B, BPP_B, STRIDE_B, \
+ W1280, DIFF, N, NEG, OFF) \
+TEST_F(libyuvTest, FMT_A##To##FMT_B##N) { \
+ const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
+ const int kHeight = benchmark_height_; \
+ const int kStrideA = (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A; \
+ const int kStrideB = (kWidth * BPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B; \
+ align_buffer_64(src_argb, kStrideA * kHeight + OFF); \
+ align_buffer_64(dst_argb_c, kStrideB * kHeight); \
+ align_buffer_64(dst_argb_opt, kStrideB * kHeight); \
+ memset(dst_argb_c, 0, kStrideB * kHeight); \
+ memset(dst_argb_opt, 0, kStrideB * kHeight); \
+ srandom(time(NULL)); \
+ for (int i = 0; i < kStrideA * kHeight; ++i) { \
+ src_argb[i + OFF] = (random() & 0xff); \
+ } \
+ MaskCpuFlags(0); \
+ FMT_A##To##FMT_B(src_argb + OFF, kStrideA, \
+ dst_argb_c, kStrideB, \
+ kWidth, NEG kHeight); \
+ MaskCpuFlags(-1); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ FMT_A##To##FMT_B(src_argb + OFF, kStrideA, \
+ dst_argb_opt, kStrideB, \
+ kWidth, NEG kHeight); \
+ } \
+ int max_diff = 0; \
+ for (int i = 0; i < kStrideB * kHeight; ++i) { \
+ int abs_diff = \
+ abs(static_cast<int>(dst_argb_c[i]) - \
+ static_cast<int>(dst_argb_opt[i])); \
+ if (abs_diff > max_diff) { \
+ max_diff = abs_diff; \
+ } \
+ } \
+ EXPECT_LE(max_diff, DIFF); \
+ free_aligned_buffer_64(src_argb) \
+ free_aligned_buffer_64(dst_argb_c) \
+ free_aligned_buffer_64(dst_argb_opt) \
+}
+
+#define TESTATOBRANDOM(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, \
+ FMT_B, BPP_B, STRIDE_B, HEIGHT_B, DIFF) \
+TEST_F(libyuvTest, FMT_A##To##FMT_B##_Random) { \
+ srandom(time(NULL)); \
+ for (int times = 0; times < benchmark_iterations_; ++times) { \
+ const int kWidth = (random() & 63) + 1; \
+ const int kHeight = (random() & 31) + 1; \
+ const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A; \
+ const int kHeightB = (kHeight + HEIGHT_B - 1) / HEIGHT_B * HEIGHT_B; \
+ const int kStrideA = (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A;\
+ const int kStrideB = (kWidth * BPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B;\
+ align_buffer_page_end(src_argb, kStrideA * kHeightA); \
+ align_buffer_page_end(dst_argb_c, kStrideB * kHeightB); \
+ align_buffer_page_end(dst_argb_opt, kStrideB * kHeightB); \
+ memset(dst_argb_c, 0, kStrideB * kHeightB); \
+ memset(dst_argb_opt, 0, kStrideB * kHeightB); \
+ for (int i = 0; i < kStrideA * kHeightA; ++i) { \
+ src_argb[i] = (random() & 0xff); \
+ } \
+ MaskCpuFlags(0); \
+ FMT_A##To##FMT_B(src_argb, kStrideA, \
+ dst_argb_c, kStrideB, \
+ kWidth, kHeight); \
+ MaskCpuFlags(-1); \
+ FMT_A##To##FMT_B(src_argb, kStrideA, \
+ dst_argb_opt, kStrideB, \
+ kWidth, kHeight); \
+ int max_diff = 0; \
+ for (int i = 0; i < kStrideB * kHeightB; ++i) { \
+ int abs_diff = \
+ abs(static_cast<int>(dst_argb_c[i]) - \
+ static_cast<int>(dst_argb_opt[i])); \
+ if (abs_diff > max_diff) { \
+ max_diff = abs_diff; \
+ } \
+ } \
+ EXPECT_LE(max_diff, DIFF); \
+ free_aligned_buffer_page_end(src_argb) \
+ free_aligned_buffer_page_end(dst_argb_c) \
+ free_aligned_buffer_page_end(dst_argb_opt) \
+ } \
+}
+
+#define TESTATOB(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, \
+ FMT_B, BPP_B, STRIDE_B, HEIGHT_B, DIFF) \
+ TESTATOBI(FMT_A, BPP_A, STRIDE_A, \
+ FMT_B, BPP_B, STRIDE_B, \
+ benchmark_width_ - 4, DIFF, _Any, +, 0) \
+ TESTATOBI(FMT_A, BPP_A, STRIDE_A, \
+ FMT_B, BPP_B, STRIDE_B, \
+ benchmark_width_, DIFF, _Unaligned, +, 1) \
+ TESTATOBI(FMT_A, BPP_A, STRIDE_A, \
+ FMT_B, BPP_B, STRIDE_B, \
+ benchmark_width_, DIFF, _Invert, -, 0) \
+ TESTATOBI(FMT_A, BPP_A, STRIDE_A, \
+ FMT_B, BPP_B, STRIDE_B, \
+ benchmark_width_, DIFF, _Opt, +, 0) \
+ TESTATOBRANDOM(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, \
+ FMT_B, BPP_B, STRIDE_B, HEIGHT_B, DIFF)
+
+TESTATOB(ARGB, 4, 4, 1, ARGB, 4, 4, 1, 0)
+TESTATOB(ARGB, 4, 4, 1, BGRA, 4, 4, 1, 0)
+TESTATOB(ARGB, 4, 4, 1, ABGR, 4, 4, 1, 0)
+TESTATOB(ARGB, 4, 4, 1, RGBA, 4, 4, 1, 0)
+TESTATOB(ARGB, 4, 4, 1, RAW, 3, 3, 1, 0)
+TESTATOB(ARGB, 4, 4, 1, RGB24, 3, 3, 1, 0)
+TESTATOB(ARGB, 4, 4, 1, RGB565, 2, 2, 1, 0)
+TESTATOB(ARGB, 4, 4, 1, ARGB1555, 2, 2, 1, 0)
+TESTATOB(ARGB, 4, 4, 1, ARGB4444, 2, 2, 1, 0)
+TESTATOB(ARGB, 4, 4, 1, BayerBGGR, 1, 2, 2, 0)
+TESTATOB(ARGB, 4, 4, 1, BayerRGGB, 1, 2, 2, 0)
+TESTATOB(ARGB, 4, 4, 1, BayerGBRG, 1, 2, 2, 0)
+TESTATOB(ARGB, 4, 4, 1, BayerGRBG, 1, 2, 2, 0)
+TESTATOB(ARGB, 4, 4, 1, YUY2, 2, 4, 1, 4)
+TESTATOB(ARGB, 4, 4, 1, UYVY, 2, 4, 1, 4)
+TESTATOB(ARGB, 4, 4, 1, I400, 1, 1, 1, 2)
+TESTATOB(ARGB, 4, 4, 1, J400, 1, 1, 1, 2)
+TESTATOB(BGRA, 4, 4, 1, ARGB, 4, 4, 1, 0)
+TESTATOB(ABGR, 4, 4, 1, ARGB, 4, 4, 1, 0)
+TESTATOB(RGBA, 4, 4, 1, ARGB, 4, 4, 1, 0)
+TESTATOB(RAW, 3, 3, 1, ARGB, 4, 4, 1, 0)
+TESTATOB(RGB24, 3, 3, 1, ARGB, 4, 4, 1, 0)
+TESTATOB(RGB565, 2, 2, 1, ARGB, 4, 4, 1, 0)
+TESTATOB(ARGB1555, 2, 2, 1, ARGB, 4, 4, 1, 0)
+TESTATOB(ARGB4444, 2, 2, 1, ARGB, 4, 4, 1, 0)
+TESTATOB(YUY2, 2, 4, 1, ARGB, 4, 4, 1, 4)
+TESTATOB(UYVY, 2, 4, 1, ARGB, 4, 4, 1, 4)
+TESTATOB(BayerBGGR, 1, 2, 2, ARGB, 4, 4, 1, 0)
+TESTATOB(BayerRGGB, 1, 2, 2, ARGB, 4, 4, 1, 0)
+TESTATOB(BayerGBRG, 1, 2, 2, ARGB, 4, 4, 1, 0)
+TESTATOB(BayerGRBG, 1, 2, 2, ARGB, 4, 4, 1, 0)
+TESTATOB(I400, 1, 1, 1, ARGB, 4, 4, 1, 0)
+TESTATOB(I400, 1, 1, 1, I400, 1, 1, 1, 0)
+TESTATOB(I400, 1, 1, 1, I400Mirror, 1, 1, 1, 0)
+TESTATOB(Y, 1, 1, 1, ARGB, 4, 4, 1, 0)
+TESTATOB(ARGB, 4, 4, 1, ARGBMirror, 4, 4, 1, 0)
+
+TEST_F(libyuvTest, Test565) {
+ SIMD_ALIGNED(uint8 orig_pixels[256][4]);
+ SIMD_ALIGNED(uint8 pixels565[256][2]);
+
+ for (int i = 0; i < 256; ++i) {
+ for (int j = 0; j < 4; ++j) {
+ orig_pixels[i][j] = i;
+ }
+ }
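+  // ARGBToRGB565 packs each pixel into 16 bits; in libyuv's layout the low
+  // 5 bits hold blue, the middle 6 bits green and the top 5 bits red,
+  // roughly: rgb565 = (b >> 3) | ((g >> 2) << 5) | ((r >> 3) << 11).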
+ ARGBToRGB565(&orig_pixels[0][0], 0, &pixels565[0][0], 0, 256, 1);
+ uint32 checksum = HashDjb2(&pixels565[0][0], sizeof(pixels565), 5381);
+ EXPECT_EQ(610919429u, checksum);
+}
+
+#ifdef HAVE_JPEG
+TEST_F(libyuvTest, ValidateJpeg) {
+ const int kOff = 10;
+ const int kMinJpeg = 64;
+ const int kImageSize = benchmark_width_ * benchmark_height_ >= kMinJpeg ?
+ benchmark_width_ * benchmark_height_ : kMinJpeg;
+ const int kSize = kImageSize + kOff;
+ align_buffer_64(orig_pixels, kSize);
+
+  // Clear the buffer; there are no SOI or EOI markers yet.
+ memset(orig_pixels, 0, kSize);
+
+  // Write SOI at the start and EOI near the end. Expect pass.
+ orig_pixels[0] = 0xff;
+ orig_pixels[1] = 0xd8; // SOI.
+ orig_pixels[kSize - kOff + 0] = 0xff;
+ orig_pixels[kSize - kOff + 1] = 0xd9; // EOI.
+ for (int times = 0; times < benchmark_iterations_; ++times) {
+ EXPECT_TRUE(ValidateJpeg(orig_pixels, kSize));
+ }
+  free_aligned_buffer_64(orig_pixels);
+}
+
+TEST_F(libyuvTest, InvalidateJpeg) {
+ const int kOff = 10;
+ const int kMinJpeg = 64;
+ const int kImageSize = benchmark_width_ * benchmark_height_ >= kMinJpeg ?
+ benchmark_width_ * benchmark_height_ : kMinJpeg;
+ const int kSize = kImageSize + kOff;
+ align_buffer_64(orig_pixels, kSize);
+
+ // No SOI or EOI. Expect fail.
+ memset(orig_pixels, 0, kSize);
+ EXPECT_FALSE(ValidateJpeg(orig_pixels, kSize));
+
+ // SOI but no EOI. Expect fail.
+ orig_pixels[0] = 0xff;
+ orig_pixels[1] = 0xd8; // SOI.
+ for (int times = 0; times < benchmark_iterations_; ++times) {
+ EXPECT_FALSE(ValidateJpeg(orig_pixels, kSize));
+ }
+ // EOI but no SOI. Expect fail.
+ orig_pixels[0] = 0;
+ orig_pixels[1] = 0;
+ orig_pixels[kSize - kOff + 0] = 0xff;
+ orig_pixels[kSize - kOff + 1] = 0xd9; // EOI.
+ EXPECT_FALSE(ValidateJpeg(orig_pixels, kSize));
+
+  free_aligned_buffer_64(orig_pixels);
+}
+
+#endif
+
+} // namespace libyuv
diff --git a/chromium/third_party/libyuv/unit_test/cpu_test.cc b/chromium/third_party/libyuv/unit_test/cpu_test.cc
new file mode 100644
index 00000000000..67c489cfc93
--- /dev/null
+++ b/chromium/third_party/libyuv/unit_test/cpu_test.cc
@@ -0,0 +1,108 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "libyuv/basic_types.h"
+#include "libyuv/cpu_id.h"
+#include "libyuv/version.h"
+#include "../unit_test/unit_test.h"
+
+namespace libyuv {
+
+TEST_F(libyuvTest, TestCpuHas) {
+ int cpu_flags = TestCpuFlag(-1);
+ printf("Cpu Flags %x\n", cpu_flags);
+ int has_arm = TestCpuFlag(kCpuHasARM);
+ printf("Has ARM %x\n", has_arm);
+ int has_neon = TestCpuFlag(kCpuHasNEON);
+ printf("Has NEON %x\n", has_neon);
+ int has_x86 = TestCpuFlag(kCpuHasX86);
+ printf("Has X86 %x\n", has_x86);
+ int has_sse2 = TestCpuFlag(kCpuHasSSE2);
+ printf("Has SSE2 %x\n", has_sse2);
+ int has_ssse3 = TestCpuFlag(kCpuHasSSSE3);
+ printf("Has SSSE3 %x\n", has_ssse3);
+ int has_sse41 = TestCpuFlag(kCpuHasSSE41);
+ printf("Has SSE4.1 %x\n", has_sse41);
+ int has_sse42 = TestCpuFlag(kCpuHasSSE42);
+ printf("Has SSE4.2 %x\n", has_sse42);
+ int has_avx = TestCpuFlag(kCpuHasAVX);
+ printf("Has AVX %x\n", has_avx);
+ int has_avx2 = TestCpuFlag(kCpuHasAVX2);
+ printf("Has AVX2 %x\n", has_avx2);
+ int has_erms = TestCpuFlag(kCpuHasERMS);
+ printf("Has ERMS %x\n", has_erms);
+ int has_mips = TestCpuFlag(kCpuHasMIPS);
+ printf("Has MIPS %x\n", has_mips);
+ int has_mips_dsp = TestCpuFlag(kCpuHasMIPS_DSP);
+ printf("Has MIPS DSP %x\n", has_mips_dsp);
+ int has_mips_dspr2 = TestCpuFlag(kCpuHasMIPS_DSPR2);
+ printf("Has MIPS DSPR2 %x\n", has_mips_dspr2);
+}
+
+#if defined(__i386__) || defined(__x86_64__) || \
+ defined(_M_IX86) || defined(_M_X64)
+TEST_F(libyuvTest, TestCpuId) {
+ int has_x86 = TestCpuFlag(kCpuHasX86);
+ if (has_x86) {
+ int cpu_info[4];
+ // Vendor ID:
+ // AuthenticAMD AMD processor
+ // CentaurHauls Centaur processor
+ // CyrixInstead Cyrix processor
+ // GenuineIntel Intel processor
+ // GenuineTMx86 Transmeta processor
+ // Geode by NSC National Semiconductor processor
+ // NexGenDriven NexGen processor
+ // RiseRiseRise Rise Technology processor
+ // SiS SiS SiS SiS processor
+ // UMC UMC UMC UMC processor
+ CpuId(cpu_info, 0);
+ cpu_info[0] = cpu_info[1]; // Reorder output
+ cpu_info[1] = cpu_info[3];
+ cpu_info[3] = 0;
+ printf("Cpu Vendor: %s %x %x %x\n", reinterpret_cast<char*>(&cpu_info[0]),
+ cpu_info[0], cpu_info[1], cpu_info[2]);
+ EXPECT_EQ(12, strlen(reinterpret_cast<char*>(&cpu_info[0])));
+
+ // CPU Family and Model
+ // 3:0 - Stepping
+ // 7:4 - Model
+ // 11:8 - Family
+ // 13:12 - Processor Type
+ // 19:16 - Extended Model
+ // 27:20 - Extended Family
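+  // For example, a hypothetical eax value of 0x000306c3 decodes below to
+  // family 6 and model 0x3c (60).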
+ CpuId(cpu_info, 1);
+ int family = ((cpu_info[0] >> 8) & 0x0f) | ((cpu_info[0] >> 16) & 0xff0);
+ int model = ((cpu_info[0] >> 4) & 0x0f) | ((cpu_info[0] >> 12) & 0xf0);
+ printf("Cpu Family %d (0x%x), Model %d (0x%x)\n", family, family,
+ model, model);
+ }
+}
+#endif
+
+TEST_F(libyuvTest, TestLinuxNeon) {
+ int testdata = ArmCpuCaps("unit_test/testdata/arm_v7.txt");
+ if (testdata) {
+ EXPECT_EQ(0,
+ ArmCpuCaps("unit_test/testdata/arm_v7.txt"));
+ EXPECT_EQ(kCpuHasNEON,
+ ArmCpuCaps("unit_test/testdata/tegra3.txt"));
+ } else {
+ printf("WARNING: unable to load \"unit_test/testdata/arm_v7.txt\"\n");
+ }
+#if defined(__linux__) && defined(__ARM_NEON__)
+ EXPECT_NE(0, ArmCpuCaps("/proc/cpuinfo"));
+#endif
+}
+
+} // namespace libyuv
diff --git a/chromium/third_party/libyuv/unit_test/planar_test.cc b/chromium/third_party/libyuv/unit_test/planar_test.cc
new file mode 100644
index 00000000000..2c9958baae1
--- /dev/null
+++ b/chromium/third_party/libyuv/unit_test/planar_test.cc
@@ -0,0 +1,1528 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+#include <time.h>
+
+#include "libyuv/compare.h"
+#include "libyuv/convert.h"
+#include "libyuv/convert_argb.h"
+#include "libyuv/convert_from.h"
+#include "libyuv/convert_from_argb.h"
+#include "libyuv/cpu_id.h"
+#include "libyuv/format_conversion.h"
+#include "libyuv/planar_functions.h"
+#include "libyuv/rotate.h"
+#include "libyuv/row.h" // For Sobel
+#include "../unit_test/unit_test.h"
+
+#if defined(_MSC_VER)
+#define SIMD_ALIGNED(var) __declspec(align(16)) var
+#else // __GNUC__
+#define SIMD_ALIGNED(var) var __attribute__((aligned(16)))
+#endif
+
+namespace libyuv {
+
+TEST_F(libyuvTest, TestAttenuate) {
+ SIMD_ALIGNED(uint8 orig_pixels[256][4]);
+ SIMD_ALIGNED(uint8 atten_pixels[256][4]);
+ SIMD_ALIGNED(uint8 unatten_pixels[256][4]);
+ SIMD_ALIGNED(uint8 atten2_pixels[256][4]);
+
+ // Test unattenuation clamps
+ orig_pixels[0][0] = 200u;
+ orig_pixels[0][1] = 129u;
+ orig_pixels[0][2] = 127u;
+ orig_pixels[0][3] = 128u;
+ // Test unattenuation transparent and opaque are unaffected
+ orig_pixels[1][0] = 16u;
+ orig_pixels[1][1] = 64u;
+ orig_pixels[1][2] = 192u;
+ orig_pixels[1][3] = 0u;
+ orig_pixels[2][0] = 16u;
+ orig_pixels[2][1] = 64u;
+ orig_pixels[2][2] = 192u;
+ orig_pixels[2][3] = 255u;
+ orig_pixels[3][0] = 16u;
+ orig_pixels[3][1] = 64u;
+ orig_pixels[3][2] = 192u;
+ orig_pixels[3][3] = 128u;
+ ARGBUnattenuate(&orig_pixels[0][0], 0, &unatten_pixels[0][0], 0, 4, 1);
+ EXPECT_EQ(255u, unatten_pixels[0][0]);
+ EXPECT_EQ(255u, unatten_pixels[0][1]);
+ EXPECT_EQ(254u, unatten_pixels[0][2]);
+ EXPECT_EQ(128u, unatten_pixels[0][3]);
+ EXPECT_EQ(0u, unatten_pixels[1][0]);
+ EXPECT_EQ(0u, unatten_pixels[1][1]);
+ EXPECT_EQ(0u, unatten_pixels[1][2]);
+ EXPECT_EQ(0u, unatten_pixels[1][3]);
+ EXPECT_EQ(16u, unatten_pixels[2][0]);
+ EXPECT_EQ(64u, unatten_pixels[2][1]);
+ EXPECT_EQ(192u, unatten_pixels[2][2]);
+ EXPECT_EQ(255u, unatten_pixels[2][3]);
+ EXPECT_EQ(32u, unatten_pixels[3][0]);
+ EXPECT_EQ(128u, unatten_pixels[3][1]);
+ EXPECT_EQ(255u, unatten_pixels[3][2]);
+ EXPECT_EQ(128u, unatten_pixels[3][3]);
+
+ for (int i = 0; i < 256; ++i) {
+ orig_pixels[i][0] = i;
+ orig_pixels[i][1] = i / 2;
+ orig_pixels[i][2] = i / 3;
+ orig_pixels[i][3] = i;
+ }
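+  // ARGBAttenuate premultiplies B, G and R by alpha (roughly value * a / 255)
+  // and ARGBUnattenuate approximately inverts that, so the round trip below
+  // is only expected to match to within 2.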
+ ARGBAttenuate(&orig_pixels[0][0], 0, &atten_pixels[0][0], 0, 256, 1);
+ ARGBUnattenuate(&atten_pixels[0][0], 0, &unatten_pixels[0][0], 0, 256, 1);
+ for (int i = 0; i < benchmark_pixels_div256_; ++i) {
+ ARGBAttenuate(&unatten_pixels[0][0], 0, &atten2_pixels[0][0], 0, 256, 1);
+ }
+ for (int i = 0; i < 256; ++i) {
+ EXPECT_NEAR(atten_pixels[i][0], atten2_pixels[i][0], 2);
+ EXPECT_NEAR(atten_pixels[i][1], atten2_pixels[i][1], 2);
+ EXPECT_NEAR(atten_pixels[i][2], atten2_pixels[i][2], 2);
+ EXPECT_NEAR(atten_pixels[i][3], atten2_pixels[i][3], 2);
+ }
+ // Make sure transparent, 50% and opaque are fully accurate.
+ EXPECT_EQ(0, atten_pixels[0][0]);
+ EXPECT_EQ(0, atten_pixels[0][1]);
+ EXPECT_EQ(0, atten_pixels[0][2]);
+ EXPECT_EQ(0, atten_pixels[0][3]);
+ EXPECT_EQ(64, atten_pixels[128][0]);
+ EXPECT_EQ(32, atten_pixels[128][1]);
+ EXPECT_EQ(21, atten_pixels[128][2]);
+ EXPECT_EQ(128, atten_pixels[128][3]);
+ EXPECT_NEAR(255, atten_pixels[255][0], 1);
+ EXPECT_NEAR(127, atten_pixels[255][1], 1);
+ EXPECT_NEAR(85, atten_pixels[255][2], 1);
+ EXPECT_EQ(255, atten_pixels[255][3]);
+}
+
+static int TestAttenuateI(int width, int height, int benchmark_iterations,
+ int invert, int off) {
+ if (width < 1) {
+ width = 1;
+ }
+ const int kBpp = 4;
+ const int kStride = (width * kBpp + 15) & ~15;
+ align_buffer_64(src_argb, kStride * height + off);
+ align_buffer_64(dst_argb_c, kStride * height);
+ align_buffer_64(dst_argb_opt, kStride * height);
+ srandom(time(NULL));
+ for (int i = 0; i < kStride * height; ++i) {
+ src_argb[i + off] = (random() & 0xff);
+ }
+ memset(dst_argb_c, 0, kStride * height);
+ memset(dst_argb_opt, 0, kStride * height);
+
+ MaskCpuFlags(0);
+ ARGBAttenuate(src_argb + off, kStride,
+ dst_argb_c, kStride,
+ width, invert * height);
+ MaskCpuFlags(-1);
+ for (int i = 0; i < benchmark_iterations; ++i) {
+ ARGBAttenuate(src_argb + off, kStride,
+ dst_argb_opt, kStride,
+ width, invert * height);
+ }
+ int max_diff = 0;
+ for (int i = 0; i < kStride * height; ++i) {
+ int abs_diff =
+ abs(static_cast<int>(dst_argb_c[i]) -
+ static_cast<int>(dst_argb_opt[i]));
+ if (abs_diff > max_diff) {
+ max_diff = abs_diff;
+ }
+ }
+ free_aligned_buffer_64(src_argb)
+ free_aligned_buffer_64(dst_argb_c)
+ free_aligned_buffer_64(dst_argb_opt)
+ return max_diff;
+}
+
+TEST_F(libyuvTest, ARGBAttenuate_Any) {
+ int max_diff = TestAttenuateI(benchmark_width_ - 1, benchmark_height_,
+ benchmark_iterations_, +1, 0);
+ EXPECT_LE(max_diff, 2);
+}
+
+TEST_F(libyuvTest, ARGBAttenuate_Unaligned) {
+ int max_diff = TestAttenuateI(benchmark_width_, benchmark_height_,
+ benchmark_iterations_, +1, 1);
+ EXPECT_LE(max_diff, 2);
+}
+
+TEST_F(libyuvTest, ARGBAttenuate_Invert) {
+ int max_diff = TestAttenuateI(benchmark_width_, benchmark_height_,
+ benchmark_iterations_, -1, 0);
+ EXPECT_LE(max_diff, 2);
+}
+
+TEST_F(libyuvTest, ARGBAttenuate_Opt) {
+ int max_diff = TestAttenuateI(benchmark_width_, benchmark_height_,
+ benchmark_iterations_, +1, 0);
+ EXPECT_LE(max_diff, 2);
+}
+
+static int TestUnattenuateI(int width, int height, int benchmark_iterations,
+ int invert, int off) {
+ if (width < 1) {
+ width = 1;
+ }
+ const int kBpp = 4;
+ const int kStride = (width * kBpp + 15) & ~15;
+ align_buffer_64(src_argb, kStride * height + off);
+ align_buffer_64(dst_argb_c, kStride * height);
+ align_buffer_64(dst_argb_opt, kStride * height);
+ srandom(time(NULL));
+ for (int i = 0; i < kStride * height; ++i) {
+ src_argb[i + off] = (random() & 0xff);
+ }
+ ARGBAttenuate(src_argb + off, kStride,
+ src_argb + off, kStride,
+ width, height);
+ memset(dst_argb_c, 0, kStride * height);
+ memset(dst_argb_opt, 0, kStride * height);
+
+ MaskCpuFlags(0);
+ ARGBUnattenuate(src_argb + off, kStride,
+ dst_argb_c, kStride,
+ width, invert * height);
+ MaskCpuFlags(-1);
+ for (int i = 0; i < benchmark_iterations; ++i) {
+ ARGBUnattenuate(src_argb + off, kStride,
+ dst_argb_opt, kStride,
+ width, invert * height);
+ }
+ int max_diff = 0;
+ for (int i = 0; i < kStride * height; ++i) {
+ int abs_diff =
+ abs(static_cast<int>(dst_argb_c[i]) -
+ static_cast<int>(dst_argb_opt[i]));
+ if (abs_diff > max_diff) {
+ max_diff = abs_diff;
+ }
+ }
+ free_aligned_buffer_64(src_argb)
+ free_aligned_buffer_64(dst_argb_c)
+ free_aligned_buffer_64(dst_argb_opt)
+ return max_diff;
+}
+
+TEST_F(libyuvTest, ARGBUnattenuate_Any) {
+ int max_diff = TestUnattenuateI(benchmark_width_ - 1, benchmark_height_,
+ benchmark_iterations_, +1, 0);
+ EXPECT_LE(max_diff, 2);
+}
+
+TEST_F(libyuvTest, ARGBUnattenuate_Unaligned) {
+ int max_diff = TestUnattenuateI(benchmark_width_, benchmark_height_,
+ benchmark_iterations_, +1, 1);
+ EXPECT_LE(max_diff, 2);
+}
+
+TEST_F(libyuvTest, ARGBUnattenuate_Invert) {
+ int max_diff = TestUnattenuateI(benchmark_width_, benchmark_height_,
+ benchmark_iterations_, -1, 0);
+ EXPECT_LE(max_diff, 2);
+}
+
+TEST_F(libyuvTest, ARGBUnattenuate_Opt) {
+ int max_diff = TestUnattenuateI(benchmark_width_, benchmark_height_,
+ benchmark_iterations_, +1, 0);
+ EXPECT_LE(max_diff, 2);
+}
+
+TEST_F(libyuvTest, TestARGBComputeCumulativeSum) {
+ SIMD_ALIGNED(uint8 orig_pixels[16][16][4]);
+ SIMD_ALIGNED(int32 added_pixels[16][16][4]);
+
+ for (int y = 0; y < 16; ++y) {
+ for (int x = 0; x < 16; ++x) {
+ orig_pixels[y][x][0] = 1u;
+ orig_pixels[y][x][1] = 2u;
+ orig_pixels[y][x][2] = 3u;
+ orig_pixels[y][x][3] = 255u;
+ }
+ }
+
+ ARGBComputeCumulativeSum(&orig_pixels[0][0][0], 16 * 4,
+ &added_pixels[0][0][0], 16 * 4,
+ 16, 16);
+
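+  // added[y][x][c] is the inclusive prefix sum of orig over rows 0..y and
+  // columns 0..x, so a constant channel value v yields (x + 1) * (y + 1) * v.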
+ for (int y = 0; y < 16; ++y) {
+ for (int x = 0; x < 16; ++x) {
+ EXPECT_EQ((x + 1) * (y + 1), added_pixels[y][x][0]);
+ EXPECT_EQ((x + 1) * (y + 1) * 2, added_pixels[y][x][1]);
+ EXPECT_EQ((x + 1) * (y + 1) * 3, added_pixels[y][x][2]);
+ EXPECT_EQ((x + 1) * (y + 1) * 255, added_pixels[y][x][3]);
+ }
+ }
+}
+
+TEST_F(libyuvTest, TestARGBGray) {
+ SIMD_ALIGNED(uint8 orig_pixels[256][4]);
+ // Test blue
+ orig_pixels[0][0] = 255u;
+ orig_pixels[0][1] = 0u;
+ orig_pixels[0][2] = 0u;
+ orig_pixels[0][3] = 128u;
+ // Test green
+ orig_pixels[1][0] = 0u;
+ orig_pixels[1][1] = 255u;
+ orig_pixels[1][2] = 0u;
+ orig_pixels[1][3] = 0u;
+ // Test red
+ orig_pixels[2][0] = 0u;
+ orig_pixels[2][1] = 0u;
+ orig_pixels[2][2] = 255u;
+ orig_pixels[2][3] = 255u;
+ // Test black
+ orig_pixels[3][0] = 0u;
+ orig_pixels[3][1] = 0u;
+ orig_pixels[3][2] = 0u;
+ orig_pixels[3][3] = 255u;
+ // Test white
+ orig_pixels[4][0] = 255u;
+ orig_pixels[4][1] = 255u;
+ orig_pixels[4][2] = 255u;
+ orig_pixels[4][3] = 255u;
+ // Test color
+ orig_pixels[5][0] = 16u;
+ orig_pixels[5][1] = 64u;
+ orig_pixels[5][2] = 192u;
+ orig_pixels[5][3] = 224u;
+ // Do 16 to test asm version.
+ ARGBGray(&orig_pixels[0][0], 0, 0, 0, 16, 1);
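+  // ARGBGray replaces B, G and R with a single luma value using roughly the
+  // Rec. 601 weights (about 0.30 R + 0.59 G + 0.11 B); alpha is unchanged.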
+ EXPECT_EQ(30u, orig_pixels[0][0]);
+ EXPECT_EQ(30u, orig_pixels[0][1]);
+ EXPECT_EQ(30u, orig_pixels[0][2]);
+ EXPECT_EQ(128u, orig_pixels[0][3]);
+ EXPECT_EQ(149u, orig_pixels[1][0]);
+ EXPECT_EQ(149u, orig_pixels[1][1]);
+ EXPECT_EQ(149u, orig_pixels[1][2]);
+ EXPECT_EQ(0u, orig_pixels[1][3]);
+ EXPECT_EQ(76u, orig_pixels[2][0]);
+ EXPECT_EQ(76u, orig_pixels[2][1]);
+ EXPECT_EQ(76u, orig_pixels[2][2]);
+ EXPECT_EQ(255u, orig_pixels[2][3]);
+ EXPECT_EQ(0u, orig_pixels[3][0]);
+ EXPECT_EQ(0u, orig_pixels[3][1]);
+ EXPECT_EQ(0u, orig_pixels[3][2]);
+ EXPECT_EQ(255u, orig_pixels[3][3]);
+ EXPECT_EQ(255u, orig_pixels[4][0]);
+ EXPECT_EQ(255u, orig_pixels[4][1]);
+ EXPECT_EQ(255u, orig_pixels[4][2]);
+ EXPECT_EQ(255u, orig_pixels[4][3]);
+ EXPECT_EQ(96u, orig_pixels[5][0]);
+ EXPECT_EQ(96u, orig_pixels[5][1]);
+ EXPECT_EQ(96u, orig_pixels[5][2]);
+ EXPECT_EQ(224u, orig_pixels[5][3]);
+ for (int i = 0; i < 256; ++i) {
+ orig_pixels[i][0] = i;
+ orig_pixels[i][1] = i / 2;
+ orig_pixels[i][2] = i / 3;
+ orig_pixels[i][3] = i;
+ }
+ for (int i = 0; i < benchmark_pixels_div256_; ++i) {
+ ARGBGray(&orig_pixels[0][0], 0, 0, 0, 256, 1);
+ }
+}
+
+TEST_F(libyuvTest, TestARGBGrayTo) {
+ SIMD_ALIGNED(uint8 orig_pixels[256][4]);
+ SIMD_ALIGNED(uint8 gray_pixels[256][4]);
+ // Test blue
+ orig_pixels[0][0] = 255u;
+ orig_pixels[0][1] = 0u;
+ orig_pixels[0][2] = 0u;
+ orig_pixels[0][3] = 128u;
+ // Test green
+ orig_pixels[1][0] = 0u;
+ orig_pixels[1][1] = 255u;
+ orig_pixels[1][2] = 0u;
+ orig_pixels[1][3] = 0u;
+ // Test red
+ orig_pixels[2][0] = 0u;
+ orig_pixels[2][1] = 0u;
+ orig_pixels[2][2] = 255u;
+ orig_pixels[2][3] = 255u;
+ // Test black
+ orig_pixels[3][0] = 0u;
+ orig_pixels[3][1] = 0u;
+ orig_pixels[3][2] = 0u;
+ orig_pixels[3][3] = 255u;
+ // Test white
+ orig_pixels[4][0] = 255u;
+ orig_pixels[4][1] = 255u;
+ orig_pixels[4][2] = 255u;
+ orig_pixels[4][3] = 255u;
+ // Test color
+ orig_pixels[5][0] = 16u;
+ orig_pixels[5][1] = 64u;
+ orig_pixels[5][2] = 192u;
+ orig_pixels[5][3] = 224u;
+ // Do 16 to test asm version.
+ ARGBGrayTo(&orig_pixels[0][0], 0, &gray_pixels[0][0], 0, 16, 1);
+ EXPECT_EQ(30u, gray_pixels[0][0]);
+ EXPECT_EQ(30u, gray_pixels[0][1]);
+ EXPECT_EQ(30u, gray_pixels[0][2]);
+ EXPECT_EQ(128u, gray_pixels[0][3]);
+ EXPECT_EQ(149u, gray_pixels[1][0]);
+ EXPECT_EQ(149u, gray_pixels[1][1]);
+ EXPECT_EQ(149u, gray_pixels[1][2]);
+ EXPECT_EQ(0u, gray_pixels[1][3]);
+ EXPECT_EQ(76u, gray_pixels[2][0]);
+ EXPECT_EQ(76u, gray_pixels[2][1]);
+ EXPECT_EQ(76u, gray_pixels[2][2]);
+ EXPECT_EQ(255u, gray_pixels[2][3]);
+ EXPECT_EQ(0u, gray_pixels[3][0]);
+ EXPECT_EQ(0u, gray_pixels[3][1]);
+ EXPECT_EQ(0u, gray_pixels[3][2]);
+ EXPECT_EQ(255u, gray_pixels[3][3]);
+ EXPECT_EQ(255u, gray_pixels[4][0]);
+ EXPECT_EQ(255u, gray_pixels[4][1]);
+ EXPECT_EQ(255u, gray_pixels[4][2]);
+ EXPECT_EQ(255u, gray_pixels[4][3]);
+ EXPECT_EQ(96u, gray_pixels[5][0]);
+ EXPECT_EQ(96u, gray_pixels[5][1]);
+ EXPECT_EQ(96u, gray_pixels[5][2]);
+ EXPECT_EQ(224u, gray_pixels[5][3]);
+ for (int i = 0; i < 256; ++i) {
+ orig_pixels[i][0] = i;
+ orig_pixels[i][1] = i / 2;
+ orig_pixels[i][2] = i / 3;
+ orig_pixels[i][3] = i;
+ }
+ for (int i = 0; i < benchmark_pixels_div256_; ++i) {
+ ARGBGrayTo(&orig_pixels[0][0], 0, &gray_pixels[0][0], 0, 256, 1);
+ }
+}
+
+TEST_F(libyuvTest, TestARGBSepia) {
+ SIMD_ALIGNED(uint8 orig_pixels[256][4]);
+
+ // Test blue
+ orig_pixels[0][0] = 255u;
+ orig_pixels[0][1] = 0u;
+ orig_pixels[0][2] = 0u;
+ orig_pixels[0][3] = 128u;
+ // Test green
+ orig_pixels[1][0] = 0u;
+ orig_pixels[1][1] = 255u;
+ orig_pixels[1][2] = 0u;
+ orig_pixels[1][3] = 0u;
+ // Test red
+ orig_pixels[2][0] = 0u;
+ orig_pixels[2][1] = 0u;
+ orig_pixels[2][2] = 255u;
+ orig_pixels[2][3] = 255u;
+ // Test black
+ orig_pixels[3][0] = 0u;
+ orig_pixels[3][1] = 0u;
+ orig_pixels[3][2] = 0u;
+ orig_pixels[3][3] = 255u;
+ // Test white
+ orig_pixels[4][0] = 255u;
+ orig_pixels[4][1] = 255u;
+ orig_pixels[4][2] = 255u;
+ orig_pixels[4][3] = 255u;
+ // Test color
+ orig_pixels[5][0] = 16u;
+ orig_pixels[5][1] = 64u;
+ orig_pixels[5][2] = 192u;
+ orig_pixels[5][3] = 224u;
+ // Do 16 to test asm version.
+ ARGBSepia(&orig_pixels[0][0], 0, 0, 0, 16, 1);
+ EXPECT_EQ(33u, orig_pixels[0][0]);
+ EXPECT_EQ(43u, orig_pixels[0][1]);
+ EXPECT_EQ(47u, orig_pixels[0][2]);
+ EXPECT_EQ(128u, orig_pixels[0][3]);
+ EXPECT_EQ(135u, orig_pixels[1][0]);
+ EXPECT_EQ(175u, orig_pixels[1][1]);
+ EXPECT_EQ(195u, orig_pixels[1][2]);
+ EXPECT_EQ(0u, orig_pixels[1][3]);
+ EXPECT_EQ(69u, orig_pixels[2][0]);
+ EXPECT_EQ(89u, orig_pixels[2][1]);
+ EXPECT_EQ(99u, orig_pixels[2][2]);
+ EXPECT_EQ(255u, orig_pixels[2][3]);
+ EXPECT_EQ(0u, orig_pixels[3][0]);
+ EXPECT_EQ(0u, orig_pixels[3][1]);
+ EXPECT_EQ(0u, orig_pixels[3][2]);
+ EXPECT_EQ(255u, orig_pixels[3][3]);
+ EXPECT_EQ(239u, orig_pixels[4][0]);
+ EXPECT_EQ(255u, orig_pixels[4][1]);
+ EXPECT_EQ(255u, orig_pixels[4][2]);
+ EXPECT_EQ(255u, orig_pixels[4][3]);
+ EXPECT_EQ(88u, orig_pixels[5][0]);
+ EXPECT_EQ(114u, orig_pixels[5][1]);
+ EXPECT_EQ(127u, orig_pixels[5][2]);
+ EXPECT_EQ(224u, orig_pixels[5][3]);
+
+ for (int i = 0; i < 256; ++i) {
+ orig_pixels[i][0] = i;
+ orig_pixels[i][1] = i / 2;
+ orig_pixels[i][2] = i / 3;
+ orig_pixels[i][3] = i;
+ }
+ for (int i = 0; i < benchmark_pixels_div256_; ++i) {
+ ARGBSepia(&orig_pixels[0][0], 0, 0, 0, 256, 1);
+ }
+}
+
+TEST_F(libyuvTest, TestARGBColorMatrix) {
+ SIMD_ALIGNED(uint8 orig_pixels[256][4]);
+
+ // Matrix for Sepia.
+ static const int8 kARGBToSepia[] = {
+ 17, 68, 35, 0,
+ 22, 88, 45, 0,
+ 24, 98, 50, 0,
+ 0, 0, 0, 0, // Unused but makes matrix 16 bytes.
+ };
+
+ // Test blue
+ orig_pixels[0][0] = 255u;
+ orig_pixels[0][1] = 0u;
+ orig_pixels[0][2] = 0u;
+ orig_pixels[0][3] = 128u;
+ // Test green
+ orig_pixels[1][0] = 0u;
+ orig_pixels[1][1] = 255u;
+ orig_pixels[1][2] = 0u;
+ orig_pixels[1][3] = 0u;
+ // Test red
+ orig_pixels[2][0] = 0u;
+ orig_pixels[2][1] = 0u;
+ orig_pixels[2][2] = 255u;
+ orig_pixels[2][3] = 255u;
+ // Test color
+ orig_pixels[3][0] = 16u;
+ orig_pixels[3][1] = 64u;
+ orig_pixels[3][2] = 192u;
+ orig_pixels[3][3] = 224u;
+ // Do 16 to test asm version.
+ ARGBColorMatrix(&orig_pixels[0][0], 0, &kARGBToSepia[0], 0, 0, 16, 1);
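+  // Each output B, G and R is the dot product of the input (B, G, R, A) with
+  // one matrix row, divided by 128 and clamped; e.g. pure blue gives
+  // 255 * 17 / 128 = 33 in the blue channel. Alpha passes through unchanged.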
+ EXPECT_EQ(33u, orig_pixels[0][0]);
+ EXPECT_EQ(43u, orig_pixels[0][1]);
+ EXPECT_EQ(47u, orig_pixels[0][2]);
+ EXPECT_EQ(128u, orig_pixels[0][3]);
+ EXPECT_EQ(135u, orig_pixels[1][0]);
+ EXPECT_EQ(175u, orig_pixels[1][1]);
+ EXPECT_EQ(195u, orig_pixels[1][2]);
+ EXPECT_EQ(0u, orig_pixels[1][3]);
+ EXPECT_EQ(69u, orig_pixels[2][0]);
+ EXPECT_EQ(89u, orig_pixels[2][1]);
+ EXPECT_EQ(99u, orig_pixels[2][2]);
+ EXPECT_EQ(255u, orig_pixels[2][3]);
+ EXPECT_EQ(88u, orig_pixels[3][0]);
+ EXPECT_EQ(114u, orig_pixels[3][1]);
+ EXPECT_EQ(127u, orig_pixels[3][2]);
+ EXPECT_EQ(224u, orig_pixels[3][3]);
+
+ for (int i = 0; i < 256; ++i) {
+ orig_pixels[i][0] = i;
+ orig_pixels[i][1] = i / 2;
+ orig_pixels[i][2] = i / 3;
+ orig_pixels[i][3] = i;
+ }
+ for (int i = 0; i < benchmark_pixels_div256_; ++i) {
+ ARGBColorMatrix(&orig_pixels[0][0], 0, &kARGBToSepia[0], 0, 0, 256, 1);
+ }
+}
+
+TEST_F(libyuvTest, TestARGBColorTable) {
+ SIMD_ALIGNED(uint8 orig_pixels[256][4]);
+ memset(orig_pixels, 0, sizeof(orig_pixels));
+
+ // Matrix for Sepia.
+ static const uint8 kARGBTable[256 * 4] = {
+ 1u, 2u, 3u, 4u,
+ 5u, 6u, 7u, 8u,
+ 9u, 10u, 11u, 12u,
+ 13u, 14u, 15u, 16u,
+ };
+
+ orig_pixels[0][0] = 0u;
+ orig_pixels[0][1] = 0u;
+ orig_pixels[0][2] = 0u;
+ orig_pixels[0][3] = 0u;
+ orig_pixels[1][0] = 1u;
+ orig_pixels[1][1] = 1u;
+ orig_pixels[1][2] = 1u;
+ orig_pixels[1][3] = 1u;
+ orig_pixels[2][0] = 2u;
+ orig_pixels[2][1] = 2u;
+ orig_pixels[2][2] = 2u;
+ orig_pixels[2][3] = 2u;
+ orig_pixels[3][0] = 0u;
+ orig_pixels[3][1] = 1u;
+ orig_pixels[3][2] = 2u;
+ orig_pixels[3][3] = 3u;
+ // Do 16 to test asm version.
+ ARGBColorTable(&orig_pixels[0][0], 0, &kARGBTable[0], 0, 0, 16, 1);
+ EXPECT_EQ(1u, orig_pixels[0][0]);
+ EXPECT_EQ(2u, orig_pixels[0][1]);
+ EXPECT_EQ(3u, orig_pixels[0][2]);
+ EXPECT_EQ(4u, orig_pixels[0][3]);
+ EXPECT_EQ(5u, orig_pixels[1][0]);
+ EXPECT_EQ(6u, orig_pixels[1][1]);
+ EXPECT_EQ(7u, orig_pixels[1][2]);
+ EXPECT_EQ(8u, orig_pixels[1][3]);
+ EXPECT_EQ(9u, orig_pixels[2][0]);
+ EXPECT_EQ(10u, orig_pixels[2][1]);
+ EXPECT_EQ(11u, orig_pixels[2][2]);
+ EXPECT_EQ(12u, orig_pixels[2][3]);
+ EXPECT_EQ(1u, orig_pixels[3][0]);
+ EXPECT_EQ(6u, orig_pixels[3][1]);
+ EXPECT_EQ(11u, orig_pixels[3][2]);
+ EXPECT_EQ(16u, orig_pixels[3][3]);
+
+ for (int i = 0; i < 256; ++i) {
+ orig_pixels[i][0] = i;
+ orig_pixels[i][1] = i / 2;
+ orig_pixels[i][2] = i / 3;
+ orig_pixels[i][3] = i;
+ }
+ for (int i = 0; i < benchmark_pixels_div256_; ++i) {
+ ARGBColorTable(&orig_pixels[0][0], 0, &kARGBTable[0], 0, 0, 256, 1);
+ }
+}
+
+TEST_F(libyuvTest, TestARGBQuantize) {
+ SIMD_ALIGNED(uint8 orig_pixels[256][4]);
+
+ for (int i = 0; i < 256; ++i) {
+ orig_pixels[i][0] = i;
+ orig_pixels[i][1] = i / 2;
+ orig_pixels[i][2] = i / 3;
+ orig_pixels[i][3] = i;
+ }
+ ARGBQuantize(&orig_pixels[0][0], 0,
+ (65536 + (8 / 2)) / 8, 8, 8 / 2, 0, 0, 256, 1);
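+  // ARGBQuantize maps each color channel to
+  //   (v * scale >> 16) * interval_size + interval_offset,
+  // so scale = 65536 / 8, size = 8 and offset = 4 snap values to the center
+  // of their 8-wide bucket; alpha is left untouched.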
+
+ for (int i = 0; i < 256; ++i) {
+ EXPECT_EQ(i / 8 * 8 + 8 / 2, orig_pixels[i][0]);
+ EXPECT_EQ(i / 2 / 8 * 8 + 8 / 2, orig_pixels[i][1]);
+ EXPECT_EQ(i / 3 / 8 * 8 + 8 / 2, orig_pixels[i][2]);
+ EXPECT_EQ(i, orig_pixels[i][3]);
+ }
+ for (int i = 0; i < benchmark_pixels_div256_; ++i) {
+ ARGBQuantize(&orig_pixels[0][0], 0,
+ (65536 + (8 / 2)) / 8, 8, 8 / 2, 0, 0, 256, 1);
+ }
+}
+
+TEST_F(libyuvTest, TestARGBMirror) {
+ SIMD_ALIGNED(uint8 orig_pixels[256][4]);
+ SIMD_ALIGNED(uint8 dst_pixels[256][4]);
+
+ for (int i = 0; i < 256; ++i) {
+ orig_pixels[i][0] = i;
+ orig_pixels[i][1] = i / 2;
+ orig_pixels[i][2] = i / 3;
+ orig_pixels[i][3] = i / 4;
+ }
+ ARGBMirror(&orig_pixels[0][0], 0, &dst_pixels[0][0], 0, 256, 1);
+
+ for (int i = 0; i < 256; ++i) {
+ EXPECT_EQ(i, dst_pixels[255 - i][0]);
+ EXPECT_EQ(i / 2, dst_pixels[255 - i][1]);
+ EXPECT_EQ(i / 3, dst_pixels[255 - i][2]);
+ EXPECT_EQ(i / 4, dst_pixels[255 - i][3]);
+ }
+ for (int i = 0; i < benchmark_pixels_div256_; ++i) {
+ ARGBMirror(&orig_pixels[0][0], 0, &dst_pixels[0][0], 0, 256, 1);
+ }
+}
+
+TEST_F(libyuvTest, TestShade) {
+ SIMD_ALIGNED(uint8 orig_pixels[256][4]);
+ SIMD_ALIGNED(uint8 shade_pixels[256][4]);
+
+ orig_pixels[0][0] = 10u;
+ orig_pixels[0][1] = 20u;
+ orig_pixels[0][2] = 40u;
+ orig_pixels[0][3] = 80u;
+ orig_pixels[1][0] = 0u;
+ orig_pixels[1][1] = 0u;
+ orig_pixels[1][2] = 0u;
+ orig_pixels[1][3] = 255u;
+ orig_pixels[2][0] = 0u;
+ orig_pixels[2][1] = 0u;
+ orig_pixels[2][2] = 0u;
+ orig_pixels[2][3] = 0u;
+ orig_pixels[3][0] = 0u;
+ orig_pixels[3][1] = 0u;
+ orig_pixels[3][2] = 0u;
+ orig_pixels[3][3] = 0u;
+ // Do 8 pixels to allow opt version to be used.
+ ARGBShade(&orig_pixels[0][0], 0, &shade_pixels[0][0], 0, 8, 1, 0x80ffffff);
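+  // ARGBShade scales each channel by the matching byte of the ARGB value,
+  // where 0xff is roughly 1.0 and 0x80 roughly 0.5; here alpha (0x80) is
+  // halved while B, G and R (0xff) are unchanged.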
+ EXPECT_EQ(10u, shade_pixels[0][0]);
+ EXPECT_EQ(20u, shade_pixels[0][1]);
+ EXPECT_EQ(40u, shade_pixels[0][2]);
+ EXPECT_EQ(40u, shade_pixels[0][3]);
+ EXPECT_EQ(0u, shade_pixels[1][0]);
+ EXPECT_EQ(0u, shade_pixels[1][1]);
+ EXPECT_EQ(0u, shade_pixels[1][2]);
+ EXPECT_EQ(128u, shade_pixels[1][3]);
+ EXPECT_EQ(0u, shade_pixels[2][0]);
+ EXPECT_EQ(0u, shade_pixels[2][1]);
+ EXPECT_EQ(0u, shade_pixels[2][2]);
+ EXPECT_EQ(0u, shade_pixels[2][3]);
+ EXPECT_EQ(0u, shade_pixels[3][0]);
+ EXPECT_EQ(0u, shade_pixels[3][1]);
+ EXPECT_EQ(0u, shade_pixels[3][2]);
+ EXPECT_EQ(0u, shade_pixels[3][3]);
+
+ ARGBShade(&orig_pixels[0][0], 0, &shade_pixels[0][0], 0, 8, 1, 0x80808080);
+ EXPECT_EQ(5u, shade_pixels[0][0]);
+ EXPECT_EQ(10u, shade_pixels[0][1]);
+ EXPECT_EQ(20u, shade_pixels[0][2]);
+ EXPECT_EQ(40u, shade_pixels[0][3]);
+
+ ARGBShade(&orig_pixels[0][0], 0, &shade_pixels[0][0], 0, 8, 1, 0x10204080);
+ EXPECT_EQ(5u, shade_pixels[0][0]);
+ EXPECT_EQ(5u, shade_pixels[0][1]);
+ EXPECT_EQ(5u, shade_pixels[0][2]);
+ EXPECT_EQ(5u, shade_pixels[0][3]);
+
+ for (int i = 0; i < benchmark_pixels_div256_; ++i) {
+ ARGBShade(&orig_pixels[0][0], 0, &shade_pixels[0][0], 0, 256, 1,
+ 0x80808080);
+ }
+}
+
+TEST_F(libyuvTest, TestInterpolate) {
+ SIMD_ALIGNED(uint8 orig_pixels_0[256][4]);
+ SIMD_ALIGNED(uint8 orig_pixels_1[256][4]);
+ SIMD_ALIGNED(uint8 interpolate_pixels[256][4]);
+
+ orig_pixels_0[0][0] = 16u;
+ orig_pixels_0[0][1] = 32u;
+ orig_pixels_0[0][2] = 64u;
+ orig_pixels_0[0][3] = 128u;
+ orig_pixels_0[1][0] = 0u;
+ orig_pixels_0[1][1] = 0u;
+ orig_pixels_0[1][2] = 0u;
+ orig_pixels_0[1][3] = 255u;
+ orig_pixels_0[2][0] = 0u;
+ orig_pixels_0[2][1] = 0u;
+ orig_pixels_0[2][2] = 0u;
+ orig_pixels_0[2][3] = 0u;
+ orig_pixels_0[3][0] = 0u;
+ orig_pixels_0[3][1] = 0u;
+ orig_pixels_0[3][2] = 0u;
+ orig_pixels_0[3][3] = 0u;
+
+ orig_pixels_1[0][0] = 0u;
+ orig_pixels_1[0][1] = 0u;
+ orig_pixels_1[0][2] = 0u;
+ orig_pixels_1[0][3] = 0u;
+ orig_pixels_1[1][0] = 0u;
+ orig_pixels_1[1][1] = 0u;
+ orig_pixels_1[1][2] = 0u;
+ orig_pixels_1[1][3] = 0u;
+ orig_pixels_1[2][0] = 0u;
+ orig_pixels_1[2][1] = 0u;
+ orig_pixels_1[2][2] = 0u;
+ orig_pixels_1[2][3] = 0u;
+ orig_pixels_1[3][0] = 255u;
+ orig_pixels_1[3][1] = 255u;
+ orig_pixels_1[3][2] = 255u;
+ orig_pixels_1[3][3] = 255u;
+
+ ARGBInterpolate(&orig_pixels_0[0][0], 0, &orig_pixels_1[0][0], 0,
+ &interpolate_pixels[0][0], 0, 4, 1, 128);
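+  // ARGBInterpolate blends roughly as
+  //   (src0 * (256 - frac) + src1 * frac) / 256,
+  // so frac 0 returns src0, 128 the midpoint, and 192 one quarter of src0
+  // plus three quarters of src1.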
+ EXPECT_EQ(8u, interpolate_pixels[0][0]);
+ EXPECT_EQ(16u, interpolate_pixels[0][1]);
+ EXPECT_EQ(32u, interpolate_pixels[0][2]);
+ EXPECT_EQ(64u, interpolate_pixels[0][3]);
+ EXPECT_EQ(0u, interpolate_pixels[1][0]);
+ EXPECT_EQ(0u, interpolate_pixels[1][1]);
+ EXPECT_EQ(0u, interpolate_pixels[1][2]);
+ EXPECT_NEAR(128u, interpolate_pixels[1][3], 1); // C = 127, SSE = 128.
+ EXPECT_EQ(0u, interpolate_pixels[2][0]);
+ EXPECT_EQ(0u, interpolate_pixels[2][1]);
+ EXPECT_EQ(0u, interpolate_pixels[2][2]);
+ EXPECT_EQ(0u, interpolate_pixels[2][3]);
+ EXPECT_NEAR(128u, interpolate_pixels[3][0], 1);
+ EXPECT_NEAR(128u, interpolate_pixels[3][1], 1);
+ EXPECT_NEAR(128u, interpolate_pixels[3][2], 1);
+ EXPECT_NEAR(128u, interpolate_pixels[3][3], 1);
+
+ ARGBInterpolate(&orig_pixels_0[0][0], 0, &orig_pixels_1[0][0], 0,
+ &interpolate_pixels[0][0], 0, 4, 1, 0);
+ EXPECT_EQ(16u, interpolate_pixels[0][0]);
+ EXPECT_EQ(32u, interpolate_pixels[0][1]);
+ EXPECT_EQ(64u, interpolate_pixels[0][2]);
+ EXPECT_EQ(128u, interpolate_pixels[0][3]);
+
+ ARGBInterpolate(&orig_pixels_0[0][0], 0, &orig_pixels_1[0][0], 0,
+ &interpolate_pixels[0][0], 0, 4, 1, 192);
+
+ EXPECT_EQ(4u, interpolate_pixels[0][0]);
+ EXPECT_EQ(8u, interpolate_pixels[0][1]);
+ EXPECT_EQ(16u, interpolate_pixels[0][2]);
+ EXPECT_EQ(32u, interpolate_pixels[0][3]);
+
+ for (int i = 0; i < benchmark_pixels_div256_; ++i) {
+ ARGBInterpolate(&orig_pixels_0[0][0], 0, &orig_pixels_1[0][0], 0,
+ &interpolate_pixels[0][0], 0, 256, 1, 128);
+ }
+}
+
+#define TESTTERP(FMT_A, BPP_A, STRIDE_A, \
+ FMT_B, BPP_B, STRIDE_B, \
+ W1280, TERP, DIFF, N, NEG, OFF) \
+TEST_F(libyuvTest, ARGBInterpolate##TERP##N) { \
+ const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
+ const int kHeight = benchmark_height_; \
+ const int kStrideA = (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A; \
+ const int kStrideB = (kWidth * BPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B; \
+ align_buffer_64(src_argb_a, kStrideA * kHeight + OFF); \
+ align_buffer_64(src_argb_b, kStrideA * kHeight + OFF); \
+ align_buffer_64(dst_argb_c, kStrideB * kHeight); \
+ align_buffer_64(dst_argb_opt, kStrideB * kHeight); \
+ srandom(time(NULL)); \
+ for (int i = 0; i < kStrideA * kHeight; ++i) { \
+ src_argb_a[i + OFF] = (random() & 0xff); \
+ src_argb_b[i + OFF] = (random() & 0xff); \
+ } \
+ MaskCpuFlags(0); \
+ ARGBInterpolate(src_argb_a + OFF, kStrideA, \
+ src_argb_b + OFF, kStrideA, \
+ dst_argb_c, kStrideB, \
+ kWidth, NEG kHeight, TERP); \
+ MaskCpuFlags(-1); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ ARGBInterpolate(src_argb_a + OFF, kStrideA, \
+ src_argb_b + OFF, kStrideA, \
+ dst_argb_opt, kStrideB, \
+ kWidth, NEG kHeight, TERP); \
+ } \
+ int max_diff = 0; \
+ for (int i = 0; i < kStrideB * kHeight; ++i) { \
+ int abs_diff = \
+ abs(static_cast<int>(dst_argb_c[i]) - \
+ static_cast<int>(dst_argb_opt[i])); \
+ if (abs_diff > max_diff) { \
+ max_diff = abs_diff; \
+ } \
+ } \
+ EXPECT_LE(max_diff, DIFF); \
+ free_aligned_buffer_64(src_argb_a) \
+ free_aligned_buffer_64(src_argb_b) \
+ free_aligned_buffer_64(dst_argb_c) \
+ free_aligned_buffer_64(dst_argb_opt) \
+}
+
+#define TESTINTERPOLATE(TERP) \
+ TESTTERP(ARGB, 4, 1, ARGB, 4, 1, \
+ benchmark_width_ - 1, TERP, 1, _Any, +, 0) \
+ TESTTERP(ARGB, 4, 1, ARGB, 4, 1, \
+ benchmark_width_, TERP, 1, _Unaligned, +, 1) \
+ TESTTERP(ARGB, 4, 1, ARGB, 4, 1, \
+ benchmark_width_, TERP, 1, _Invert, -, 0) \
+ TESTTERP(ARGB, 4, 1, ARGB, 4, 1, \
+ benchmark_width_, TERP, 1, _Opt, +, 0) \
+ TESTTERP(ARGB, 4, 1, ARGB, 4, 1, \
+ benchmark_width_ - 1, TERP, 1, _Any_Invert, -, 0)
+
+TESTINTERPOLATE(0)
+TESTINTERPOLATE(64)
+TESTINTERPOLATE(128)
+TESTINTERPOLATE(192)
+TESTINTERPOLATE(255)
+TESTINTERPOLATE(85)
+
+static int TestBlend(int width, int height, int benchmark_iterations,
+ int invert, int off) {
+ if (width < 1) {
+ width = 1;
+ }
+ const int kBpp = 4;
+ const int kStride = width * kBpp;
+ align_buffer_64(src_argb_a, kStride * height + off);
+ align_buffer_64(src_argb_b, kStride * height + off);
+ align_buffer_64(dst_argb_c, kStride * height);
+ align_buffer_64(dst_argb_opt, kStride * height);
+ srandom(time(NULL));
+ for (int i = 0; i < kStride * height; ++i) {
+ src_argb_a[i + off] = (random() & 0xff);
+ src_argb_b[i + off] = (random() & 0xff);
+ }
+ ARGBAttenuate(src_argb_a + off, kStride, src_argb_a + off, kStride, width,
+ height);
+ ARGBAttenuate(src_argb_b + off, kStride, src_argb_b + off, kStride, width,
+ height);
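+  // ARGBBlend expects premultiplied-alpha input and applies the "over"
+  // operator, roughly dst = fg + bg * (255 - fg_alpha) / 255, which is why
+  // both sources are attenuated first.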
+ memset(dst_argb_c, 255, kStride * height);
+ memset(dst_argb_opt, 255, kStride * height);
+
+ MaskCpuFlags(0);
+ ARGBBlend(src_argb_a + off, kStride,
+ src_argb_b + off, kStride,
+ dst_argb_c, kStride,
+ width, invert * height);
+ MaskCpuFlags(-1);
+ for (int i = 0; i < benchmark_iterations; ++i) {
+ ARGBBlend(src_argb_a + off, kStride,
+ src_argb_b + off, kStride,
+ dst_argb_opt, kStride,
+ width, invert * height);
+ }
+ int max_diff = 0;
+ for (int i = 0; i < kStride * height; ++i) {
+ int abs_diff =
+ abs(static_cast<int>(dst_argb_c[i]) -
+ static_cast<int>(dst_argb_opt[i]));
+ if (abs_diff > max_diff) {
+ max_diff = abs_diff;
+ }
+ }
+ free_aligned_buffer_64(src_argb_a)
+ free_aligned_buffer_64(src_argb_b)
+ free_aligned_buffer_64(dst_argb_c)
+ free_aligned_buffer_64(dst_argb_opt)
+ return max_diff;
+}
+
+TEST_F(libyuvTest, ARGBBlend_Any) {
+ int max_diff = TestBlend(benchmark_width_ - 4, benchmark_height_,
+ benchmark_iterations_, +1, 0);
+ EXPECT_LE(max_diff, 1);
+}
+
+TEST_F(libyuvTest, ARGBBlend_Unaligned) {
+ int max_diff = TestBlend(benchmark_width_, benchmark_height_,
+ benchmark_iterations_, +1, 1);
+ EXPECT_LE(max_diff, 1);
+}
+
+TEST_F(libyuvTest, ARGBBlend_Invert) {
+ int max_diff = TestBlend(benchmark_width_, benchmark_height_,
+ benchmark_iterations_, -1, 0);
+ EXPECT_LE(max_diff, 1);
+}
+
+TEST_F(libyuvTest, ARGBBlend_Opt) {
+ int max_diff = TestBlend(benchmark_width_, benchmark_height_,
+ benchmark_iterations_, +1, 0);
+ EXPECT_LE(max_diff, 1);
+}
+
+TEST_F(libyuvTest, TestAffine) {
+ SIMD_ALIGNED(uint8 orig_pixels_0[256][4]);
+ SIMD_ALIGNED(uint8 interpolate_pixels_C[256][4]);
+
+ for (int i = 0; i < 256; ++i) {
+ for (int j = 0; j < 4; ++j) {
+ orig_pixels_0[i][j] = i;
+ }
+ }
+
+ float uv_step[4] = { 0.f, 0.f, 0.75f, 0.f };
+
+ ARGBAffineRow_C(&orig_pixels_0[0][0], 0, &interpolate_pixels_C[0][0],
+ uv_step, 256);
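+  // uv_step is {u0, v0, du, dv}: output pixel i samples the source at
+  // (0.75 * i, 0), so pixel 128 reads source pixel 96 and pixel 255
+  // reads pixel 191.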
+ EXPECT_EQ(0u, interpolate_pixels_C[0][0]);
+ EXPECT_EQ(96u, interpolate_pixels_C[128][0]);
+ EXPECT_EQ(191u, interpolate_pixels_C[255][3]);
+
+#if defined(HAS_ARGBAFFINEROW_SSE2)
+ SIMD_ALIGNED(uint8 interpolate_pixels_Opt[256][4]);
+ ARGBAffineRow_SSE2(&orig_pixels_0[0][0], 0, &interpolate_pixels_Opt[0][0],
+ uv_step, 256);
+ EXPECT_EQ(0, memcmp(interpolate_pixels_Opt, interpolate_pixels_C, 256 * 4));
+
+ int has_sse2 = TestCpuFlag(kCpuHasSSE2);
+ if (has_sse2) {
+ for (int i = 0; i < benchmark_pixels_div256_; ++i) {
+ ARGBAffineRow_SSE2(&orig_pixels_0[0][0], 0, &interpolate_pixels_Opt[0][0],
+ uv_step, 256);
+ }
+ }
+#endif
+}
+
+TEST_F(libyuvTest, TestSobelX) {
+ SIMD_ALIGNED(uint8 orig_pixels_0[256 + 2]);
+ SIMD_ALIGNED(uint8 orig_pixels_1[256 + 2]);
+ SIMD_ALIGNED(uint8 orig_pixels_2[256 + 2]);
+ SIMD_ALIGNED(uint8 sobel_pixels_c[256]);
+ SIMD_ALIGNED(uint8 sobel_pixels_opt[256]);
+
+ for (int i = 0; i < 256 + 2; ++i) {
+ orig_pixels_0[i] = i;
+ orig_pixels_1[i] = i * 2;
+ orig_pixels_2[i] = i * 3;
+ }
+
+ SobelXRow_C(orig_pixels_0, orig_pixels_1, orig_pixels_2,
+ sobel_pixels_c, 256);
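+  // SobelXRow computes, per pixel x (clamped to 255),
+  //   |(y0[x] - y0[x+2]) + 2 * (y1[x] - y1[x+2]) + (y2[x] - y2[x+2])|
+  // which is 16 for these ramp inputs until the uint8 ramps wrap near the end.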
+
+ EXPECT_EQ(16u, sobel_pixels_c[0]);
+ EXPECT_EQ(16u, sobel_pixels_c[100]);
+ EXPECT_EQ(255u, sobel_pixels_c[255]);
+
+ void (*SobelXRow)(const uint8* src_y0, const uint8* src_y1,
+ const uint8* src_y2, uint8* dst_sobely, int width) =
+ SobelXRow_C;
+#if defined(HAS_SOBELXROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ SobelXRow = SobelXRow_SSSE3;
+ }
+#endif
+#if defined(HAS_SOBELXROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ SobelXRow = SobelXRow_NEON;
+ }
+#endif
+ for (int i = 0; i < benchmark_pixels_div256_; ++i) {
+ SobelXRow(orig_pixels_0, orig_pixels_1, orig_pixels_2,
+ sobel_pixels_opt, 256);
+ }
+ for (int i = 0; i < 256; ++i) {
+ EXPECT_EQ(sobel_pixels_opt[i], sobel_pixels_c[i]);
+ }
+}
+
+TEST_F(libyuvTest, TestSobelY) {
+ SIMD_ALIGNED(uint8 orig_pixels_0[256 + 2]);
+ SIMD_ALIGNED(uint8 orig_pixels_1[256 + 2]);
+ SIMD_ALIGNED(uint8 sobel_pixels_c[256]);
+ SIMD_ALIGNED(uint8 sobel_pixels_opt[256]);
+
+ for (int i = 0; i < 256 + 2; ++i) {
+ orig_pixels_0[i] = i;
+ orig_pixels_1[i] = i * 2;
+ }
+
+ SobelYRow_C(orig_pixels_0, orig_pixels_1, sobel_pixels_c, 256);
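+  // SobelYRow computes, per pixel x (clamped to 255),
+  //   |(y0[x] + 2 * y0[x+1] + y0[x+2]) - (y1[x] + 2 * y1[x+1] + y1[x+2])|.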
+
+ EXPECT_EQ(4u, sobel_pixels_c[0]);
+ EXPECT_EQ(255u, sobel_pixels_c[100]);
+ EXPECT_EQ(0u, sobel_pixels_c[255]);
+ void (*SobelYRow)(const uint8* src_y0, const uint8* src_y1,
+ uint8* dst_sobely, int width) = SobelYRow_C;
+#if defined(HAS_SOBELYROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ SobelYRow = SobelYRow_SSSE3;
+ }
+#endif
+#if defined(HAS_SOBELYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ SobelYRow = SobelYRow_NEON;
+ }
+#endif
+ for (int i = 0; i < benchmark_pixels_div256_; ++i) {
+ SobelYRow(orig_pixels_0, orig_pixels_1, sobel_pixels_opt, 256);
+ }
+ for (int i = 0; i < 256; ++i) {
+ EXPECT_EQ(sobel_pixels_opt[i], sobel_pixels_c[i]);
+ }
+}
+
+TEST_F(libyuvTest, TestSobel) {
+ SIMD_ALIGNED(uint8 orig_sobelx[256]);
+ SIMD_ALIGNED(uint8 orig_sobely[256]);
+ SIMD_ALIGNED(uint8 sobel_pixels_c[256 * 4]);
+ SIMD_ALIGNED(uint8 sobel_pixels_opt[256 * 4]);
+
+ for (int i = 0; i < 256; ++i) {
+ orig_sobelx[i] = i;
+ orig_sobely[i] = i * 2;
+ }
+
+ SobelRow_C(orig_sobelx, orig_sobely, sobel_pixels_c, 256);
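+  // SobelRow writes min(sobelx + sobely, 255) into B, G and R with alpha 255,
+  // so pixel 1 (sobelx 1, sobely 2) becomes bytes 3, 3, 3, 255.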
+
+ EXPECT_EQ(0u, sobel_pixels_c[0]);
+ EXPECT_EQ(3u, sobel_pixels_c[4]);
+ EXPECT_EQ(3u, sobel_pixels_c[5]);
+ EXPECT_EQ(3u, sobel_pixels_c[6]);
+ EXPECT_EQ(255u, sobel_pixels_c[7]);
+ EXPECT_EQ(6u, sobel_pixels_c[8]);
+ EXPECT_EQ(6u, sobel_pixels_c[9]);
+ EXPECT_EQ(6u, sobel_pixels_c[10]);
+  EXPECT_EQ(255u, sobel_pixels_c[11]);
+ EXPECT_EQ(255u, sobel_pixels_c[100 * 4 + 1]);
+ EXPECT_EQ(255u, sobel_pixels_c[255 * 4 + 1]);
+ void (*SobelRow)(const uint8* src_sobelx, const uint8* src_sobely,
+ uint8* dst_argb, int width) = SobelRow_C;
+#if defined(HAS_SOBELROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ SobelRow = SobelRow_SSE2;
+ }
+#endif
+#if defined(HAS_SOBELROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ SobelRow = SobelRow_NEON;
+ }
+#endif
+ for (int i = 0; i < benchmark_pixels_div256_; ++i) {
+ SobelRow(orig_sobelx, orig_sobely, sobel_pixels_opt, 256);
+ }
+ for (int i = 0; i < 16; ++i) {
+ EXPECT_EQ(sobel_pixels_opt[i], sobel_pixels_c[i]);
+ }
+}
+
+TEST_F(libyuvTest, TestSobelXY) {
+ SIMD_ALIGNED(uint8 orig_sobelx[256]);
+ SIMD_ALIGNED(uint8 orig_sobely[256]);
+ SIMD_ALIGNED(uint8 sobel_pixels_c[256 * 4]);
+ SIMD_ALIGNED(uint8 sobel_pixels_opt[256 * 4]);
+
+ for (int i = 0; i < 256; ++i) {
+ orig_sobelx[i] = i;
+ orig_sobely[i] = i * 2;
+ }
+
+ SobelXYRow_C(orig_sobelx, orig_sobely, sobel_pixels_c, 256);
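+  // SobelXYRow packs sobely into blue, min(sobelx + sobely, 255) into green
+  // and sobelx into red, with alpha 255; pixel 1 becomes bytes 2, 3, 1, 255.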
+
+ EXPECT_EQ(0u, sobel_pixels_c[0]);
+ EXPECT_EQ(2u, sobel_pixels_c[4]);
+ EXPECT_EQ(3u, sobel_pixels_c[5]);
+ EXPECT_EQ(1u, sobel_pixels_c[6]);
+ EXPECT_EQ(255u, sobel_pixels_c[7]);
+ EXPECT_EQ(255u, sobel_pixels_c[100 * 4 + 1]);
+ EXPECT_EQ(255u, sobel_pixels_c[255 * 4 + 1]);
+ void (*SobelXYRow)(const uint8* src_sobelx, const uint8* src_sobely,
+ uint8* dst_argb, int width) = SobelXYRow_C;
+#if defined(HAS_SOBELXYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ SobelXYRow = SobelXYRow_SSE2;
+ }
+#endif
+#if defined(HAS_SOBELXYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ SobelXYRow = SobelXYRow_NEON;
+ }
+#endif
+ for (int i = 0; i < benchmark_pixels_div256_; ++i) {
+ SobelXYRow(orig_sobelx, orig_sobely, sobel_pixels_opt, 256);
+ }
+ for (int i = 0; i < 16; ++i) {
+ EXPECT_EQ(sobel_pixels_opt[i], sobel_pixels_c[i]);
+ }
+}
+
+TEST_F(libyuvTest, TestCopyPlane) {
+ int err = 0;
+ int yw = benchmark_width_;
+ int yh = benchmark_height_;
+ int b = 12;
+ int i, j;
+
+ int y_plane_size = (yw + b * 2) * (yh + b * 2);
+ srandom(time(NULL));
+ align_buffer_64(orig_y, y_plane_size)
+ align_buffer_64(dst_c, y_plane_size)
+ align_buffer_64(dst_opt, y_plane_size);
+
+ memset(orig_y, 0, y_plane_size);
+ memset(dst_c, 0, y_plane_size);
+ memset(dst_opt, 0, y_plane_size);
+
+ // Fill image buffers with random data.
+ for (i = b; i < (yh + b); ++i) {
+ for (j = b; j < (yw + b); ++j) {
+ orig_y[i * (yw + b * 2) + j] = random() & 0xff;
+ }
+ }
+
+ // Fill destination buffers with random data.
+ for (i = 0; i < y_plane_size; ++i) {
+ uint8 random_number = random() & 0x7f;
+ dst_c[i] = random_number;
+ dst_opt[i] = dst_c[i];
+ }
+
+ int y_off = b * (yw + b * 2) + b;
+
+ int y_st = yw + b * 2;
+ int stride = 8;
+
+ // Disable all optimizations.
+ MaskCpuFlags(0);
+ double c_time = get_time();
+ for (j = 0; j < benchmark_iterations_; j++) {
+ CopyPlane(orig_y + y_off, y_st, dst_c + y_off, stride, yw, yh);
+ }
+ c_time = (get_time() - c_time) / benchmark_iterations_;
+
+ // Enable optimizations.
+ MaskCpuFlags(-1);
+ double opt_time = get_time();
+ for (j = 0; j < benchmark_iterations_; j++) {
+ CopyPlane(orig_y + y_off, y_st, dst_opt + y_off, stride, yw, yh);
+ }
+ opt_time = (get_time() - opt_time) / benchmark_iterations_;
+ printf(" %8d us C - %8d us OPT\n",
+ static_cast<int>(c_time * 1e6), static_cast<int>(opt_time * 1e6));
+
+ for (i = 0; i < y_plane_size; ++i) {
+ if (dst_c[i] != dst_opt[i])
+ ++err;
+ }
+
+ free_aligned_buffer_64(orig_y)
+ free_aligned_buffer_64(dst_c)
+ free_aligned_buffer_64(dst_opt)
+
+ EXPECT_EQ(0, err);
+}
+
+static int TestMultiply(int width, int height, int benchmark_iterations,
+ int invert, int off) {
+ if (width < 1) {
+ width = 1;
+ }
+ const int kBpp = 4;
+ const int kStride = (width * kBpp + 15) & ~15;
+ align_buffer_64(src_argb_a, kStride * height + off);
+ align_buffer_64(src_argb_b, kStride * height + off);
+ align_buffer_64(dst_argb_c, kStride * height);
+ align_buffer_64(dst_argb_opt, kStride * height);
+ srandom(time(NULL));
+ for (int i = 0; i < kStride * height; ++i) {
+ src_argb_a[i + off] = (random() & 0xff);
+ src_argb_b[i + off] = (random() & 0xff);
+ }
+ memset(dst_argb_c, 0, kStride * height);
+ memset(dst_argb_opt, 0, kStride * height);
+
+ MaskCpuFlags(0);
+ ARGBMultiply(src_argb_a + off, kStride,
+ src_argb_b + off, kStride,
+ dst_argb_c, kStride,
+ width, invert * height);
+ MaskCpuFlags(-1);
+ for (int i = 0; i < benchmark_iterations; ++i) {
+ ARGBMultiply(src_argb_a + off, kStride,
+ src_argb_b + off, kStride,
+ dst_argb_opt, kStride,
+ width, invert * height);
+ }
+ int max_diff = 0;
+ for (int i = 0; i < kStride * height; ++i) {
+ int abs_diff =
+ abs(static_cast<int>(dst_argb_c[i]) -
+ static_cast<int>(dst_argb_opt[i]));
+ if (abs_diff > max_diff) {
+ max_diff = abs_diff;
+ }
+ }
+ free_aligned_buffer_64(src_argb_a)
+ free_aligned_buffer_64(src_argb_b)
+ free_aligned_buffer_64(dst_argb_c)
+ free_aligned_buffer_64(dst_argb_opt)
+ return max_diff;
+}
+
+TEST_F(libyuvTest, ARGBMultiply_Any) {
+ int max_diff = TestMultiply(benchmark_width_ - 1, benchmark_height_,
+ benchmark_iterations_, +1, 0);
+ EXPECT_LE(max_diff, 1);
+}
+
+TEST_F(libyuvTest, ARGBMultiply_Unaligned) {
+ int max_diff = TestMultiply(benchmark_width_, benchmark_height_,
+ benchmark_iterations_, +1, 1);
+ EXPECT_LE(max_diff, 1);
+}
+
+TEST_F(libyuvTest, ARGBMultiply_Invert) {
+ int max_diff = TestMultiply(benchmark_width_, benchmark_height_,
+ benchmark_iterations_, -1, 0);
+ EXPECT_LE(max_diff, 1);
+}
+
+TEST_F(libyuvTest, ARGBMultiply_Opt) {
+ int max_diff = TestMultiply(benchmark_width_, benchmark_height_,
+ benchmark_iterations_, +1, 0);
+ EXPECT_LE(max_diff, 1);
+}
+
+static int TestAdd(int width, int height, int benchmark_iterations,
+ int invert, int off) {
+ if (width < 1) {
+ width = 1;
+ }
+ const int kBpp = 4;
+ const int kStride = (width * kBpp + 15) & ~15;
+ align_buffer_64(src_argb_a, kStride * height + off);
+ align_buffer_64(src_argb_b, kStride * height + off);
+ align_buffer_64(dst_argb_c, kStride * height);
+ align_buffer_64(dst_argb_opt, kStride * height);
+ srandom(time(NULL));
+ for (int i = 0; i < kStride * height; ++i) {
+ src_argb_a[i + off] = (random() & 0xff);
+ src_argb_b[i + off] = (random() & 0xff);
+ }
+ memset(dst_argb_c, 0, kStride * height);
+ memset(dst_argb_opt, 0, kStride * height);
+
+ MaskCpuFlags(0);
+ ARGBAdd(src_argb_a + off, kStride,
+ src_argb_b + off, kStride,
+ dst_argb_c, kStride,
+ width, invert * height);
+ MaskCpuFlags(-1);
+ for (int i = 0; i < benchmark_iterations; ++i) {
+ ARGBAdd(src_argb_a + off, kStride,
+ src_argb_b + off, kStride,
+ dst_argb_opt, kStride,
+ width, invert * height);
+ }
+ int max_diff = 0;
+ for (int i = 0; i < kStride * height; ++i) {
+ int abs_diff =
+ abs(static_cast<int>(dst_argb_c[i]) -
+ static_cast<int>(dst_argb_opt[i]));
+ if (abs_diff > max_diff) {
+ max_diff = abs_diff;
+ }
+ }
+ free_aligned_buffer_64(src_argb_a)
+ free_aligned_buffer_64(src_argb_b)
+ free_aligned_buffer_64(dst_argb_c)
+ free_aligned_buffer_64(dst_argb_opt)
+ return max_diff;
+}
+
+TEST_F(libyuvTest, ARGBAdd_Any) {
+ int max_diff = TestAdd(benchmark_width_ - 1, benchmark_height_,
+ benchmark_iterations_, +1, 0);
+ EXPECT_LE(max_diff, 1);
+}
+
+TEST_F(libyuvTest, ARGBAdd_Unaligned) {
+ int max_diff = TestAdd(benchmark_width_, benchmark_height_,
+ benchmark_iterations_, +1, 1);
+ EXPECT_LE(max_diff, 1);
+}
+
+TEST_F(libyuvTest, ARGBAdd_Invert) {
+ int max_diff = TestAdd(benchmark_width_, benchmark_height_,
+ benchmark_iterations_, -1, 0);
+ EXPECT_LE(max_diff, 1);
+}
+
+TEST_F(libyuvTest, ARGBAdd_Opt) {
+ int max_diff = TestAdd(benchmark_width_, benchmark_height_,
+ benchmark_iterations_, +1, 0);
+ EXPECT_LE(max_diff, 1);
+}
+
+static int TestSubtract(int width, int height, int benchmark_iterations,
+ int invert, int off) {
+ if (width < 1) {
+ width = 1;
+ }
+ const int kBpp = 4;
+ const int kStride = (width * kBpp + 15) & ~15;
+ align_buffer_64(src_argb_a, kStride * height + off);
+ align_buffer_64(src_argb_b, kStride * height + off);
+ align_buffer_64(dst_argb_c, kStride * height);
+ align_buffer_64(dst_argb_opt, kStride * height);
+ srandom(time(NULL));
+ for (int i = 0; i < kStride * height; ++i) {
+ src_argb_a[i + off] = (random() & 0xff);
+ src_argb_b[i + off] = (random() & 0xff);
+ }
+ memset(dst_argb_c, 0, kStride * height);
+ memset(dst_argb_opt, 0, kStride * height);
+
+ MaskCpuFlags(0);
+ ARGBSubtract(src_argb_a + off, kStride,
+ src_argb_b + off, kStride,
+ dst_argb_c, kStride,
+ width, invert * height);
+ MaskCpuFlags(-1);
+ for (int i = 0; i < benchmark_iterations; ++i) {
+ ARGBSubtract(src_argb_a + off, kStride,
+ src_argb_b + off, kStride,
+ dst_argb_opt, kStride,
+ width, invert * height);
+ }
+ int max_diff = 0;
+ for (int i = 0; i < kStride * height; ++i) {
+ int abs_diff =
+ abs(static_cast<int>(dst_argb_c[i]) -
+ static_cast<int>(dst_argb_opt[i]));
+ if (abs_diff > max_diff) {
+ max_diff = abs_diff;
+ }
+ }
+ free_aligned_buffer_64(src_argb_a)
+ free_aligned_buffer_64(src_argb_b)
+ free_aligned_buffer_64(dst_argb_c)
+ free_aligned_buffer_64(dst_argb_opt)
+ return max_diff;
+}
+
+TEST_F(libyuvTest, ARGBSubtract_Any) {
+ int max_diff = TestSubtract(benchmark_width_ - 1, benchmark_height_,
+ benchmark_iterations_, +1, 0);
+ EXPECT_LE(max_diff, 1);
+}
+
+TEST_F(libyuvTest, ARGBSubtract_Unaligned) {
+ int max_diff = TestSubtract(benchmark_width_, benchmark_height_,
+ benchmark_iterations_, +1, 1);
+ EXPECT_LE(max_diff, 1);
+}
+
+TEST_F(libyuvTest, ARGBSubtract_Invert) {
+ int max_diff = TestSubtract(benchmark_width_, benchmark_height_,
+ benchmark_iterations_, -1, 0);
+ EXPECT_LE(max_diff, 1);
+}
+
+TEST_F(libyuvTest, ARGBSubtract_Opt) {
+ int max_diff = TestSubtract(benchmark_width_, benchmark_height_,
+ benchmark_iterations_, +1, 0);
+ EXPECT_LE(max_diff, 1);
+}
+
+static int TestSobel(int width, int height, int benchmark_iterations,
+ int invert, int off) {
+ if (width < 1) {
+ width = 1;
+ }
+ const int kBpp = 4;
+ const int kStride = (width * kBpp + 15) & ~15;
+ align_buffer_64(src_argb_a, kStride * height + off);
+ align_buffer_64(dst_argb_c, kStride * height);
+ align_buffer_64(dst_argb_opt, kStride * height);
+ srandom(time(NULL));
+ for (int i = 0; i < kStride * height; ++i) {
+ src_argb_a[i + off] = (random() & 0xff);
+ }
+ memset(dst_argb_c, 0, kStride * height);
+ memset(dst_argb_opt, 0, kStride * height);
+
+ MaskCpuFlags(0);
+ ARGBSobel(src_argb_a + off, kStride,
+ dst_argb_c, kStride,
+ width, invert * height);
+ MaskCpuFlags(-1);
+ for (int i = 0; i < benchmark_iterations; ++i) {
+ ARGBSobel(src_argb_a + off, kStride,
+ dst_argb_opt, kStride,
+ width, invert * height);
+ }
+ int max_diff = 0;
+ for (int i = 0; i < kStride * height; ++i) {
+ int abs_diff =
+ abs(static_cast<int>(dst_argb_c[i]) -
+ static_cast<int>(dst_argb_opt[i]));
+ if (abs_diff > max_diff) {
+ max_diff = abs_diff;
+ }
+ }
+ free_aligned_buffer_64(src_argb_a)
+ free_aligned_buffer_64(dst_argb_c)
+ free_aligned_buffer_64(dst_argb_opt)
+ return max_diff;
+}
+
+TEST_F(libyuvTest, ARGBSobel_Any) {
+ int max_diff = TestSobel(benchmark_width_ - 1, benchmark_height_,
+ benchmark_iterations_, +1, 0);
+ EXPECT_EQ(0, max_diff);
+}
+
+TEST_F(libyuvTest, ARGBSobel_Unaligned) {
+ int max_diff = TestSobel(benchmark_width_, benchmark_height_,
+ benchmark_iterations_, +1, 1);
+ EXPECT_EQ(0, max_diff);
+}
+
+TEST_F(libyuvTest, ARGBSobel_Invert) {
+ int max_diff = TestSobel(benchmark_width_, benchmark_height_,
+ benchmark_iterations_, -1, 0);
+ EXPECT_EQ(0, max_diff);
+}
+
+TEST_F(libyuvTest, ARGBSobel_Opt) {
+ int max_diff = TestSobel(benchmark_width_, benchmark_height_,
+ benchmark_iterations_, +1, 0);
+ EXPECT_EQ(0, max_diff);
+}
+
+static int TestSobelXY(int width, int height, int benchmark_iterations,
+ int invert, int off) {
+ if (width < 1) {
+ width = 1;
+ }
+ const int kBpp = 4;
+ const int kStride = (width * kBpp + 15) & ~15;
+ align_buffer_64(src_argb_a, kStride * height + off);
+ align_buffer_64(dst_argb_c, kStride * height);
+ align_buffer_64(dst_argb_opt, kStride * height);
+ srandom(time(NULL));
+ for (int i = 0; i < kStride * height; ++i) {
+ src_argb_a[i + off] = (random() & 0xff);
+ }
+ memset(dst_argb_c, 0, kStride * height);
+ memset(dst_argb_opt, 0, kStride * height);
+
+ MaskCpuFlags(0);
+ ARGBSobelXY(src_argb_a + off, kStride,
+ dst_argb_c, kStride,
+ width, invert * height);
+ MaskCpuFlags(-1);
+ for (int i = 0; i < benchmark_iterations; ++i) {
+ ARGBSobelXY(src_argb_a + off, kStride,
+ dst_argb_opt, kStride,
+ width, invert * height);
+ }
+ int max_diff = 0;
+ for (int i = 0; i < kStride * height; ++i) {
+ int abs_diff =
+ abs(static_cast<int>(dst_argb_c[i]) -
+ static_cast<int>(dst_argb_opt[i]));
+ if (abs_diff > max_diff) {
+ max_diff = abs_diff;
+ }
+ }
+ free_aligned_buffer_64(src_argb_a)
+ free_aligned_buffer_64(dst_argb_c)
+ free_aligned_buffer_64(dst_argb_opt)
+ return max_diff;
+}
+
+TEST_F(libyuvTest, ARGBSobelXY_Any) {
+ int max_diff = TestSobelXY(benchmark_width_ - 1, benchmark_height_,
+ benchmark_iterations_, +1, 0);
+ EXPECT_EQ(0, max_diff);
+}
+
+TEST_F(libyuvTest, ARGBSobelXY_Unaligned) {
+ int max_diff = TestSobelXY(benchmark_width_, benchmark_height_,
+ benchmark_iterations_, +1, 1);
+ EXPECT_EQ(0, max_diff);
+}
+
+TEST_F(libyuvTest, ARGBSobelXY_Invert) {
+ int max_diff = TestSobelXY(benchmark_width_, benchmark_height_,
+ benchmark_iterations_, -1, 0);
+ EXPECT_EQ(0, max_diff);
+}
+
+TEST_F(libyuvTest, ARGBSobelXY_Opt) {
+ int max_diff = TestSobelXY(benchmark_width_, benchmark_height_,
+ benchmark_iterations_, +1, 0);
+ EXPECT_EQ(0, max_diff);
+}
+
+} // namespace libyuv
diff --git a/chromium/third_party/libyuv/unit_test/rotate_argb_test.cc b/chromium/third_party/libyuv/unit_test/rotate_argb_test.cc
new file mode 100644
index 00000000000..f73e078254c
--- /dev/null
+++ b/chromium/third_party/libyuv/unit_test/rotate_argb_test.cc
@@ -0,0 +1,201 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+#include <time.h>
+
+#include "libyuv/cpu_id.h"
+#include "libyuv/rotate_argb.h"
+#include "../unit_test/unit_test.h"
+
+namespace libyuv {
+
+void TestRotateBpp(int src_width, int src_height,
+ int dst_width, int dst_height,
+ libyuv::RotationMode mode,
+ int benchmark_iterations,
+ const int kBpp) {
+ if (src_width < 1) {
+ src_width = 1;
+ }
+ if (src_height < 1) {
+ src_height = 1;
+ }
+ if (dst_width < 1) {
+ dst_width = 1;
+ }
+ if (dst_height < 1) {
+ dst_height = 1;
+ }
+ int src_stride_argb = src_width * kBpp;
+ int src_argb_plane_size = src_stride_argb * src_height;
+ align_buffer_64(src_argb, src_argb_plane_size)
+ for (int i = 0; i < src_argb_plane_size; ++i) {
+ src_argb[i] = random() & 0xff;
+ }
+
+ int dst_stride_argb = dst_width * kBpp;
+ int dst_argb_plane_size = dst_stride_argb * dst_height;
+ align_buffer_64(dst_argb_c, dst_argb_plane_size)
+ align_buffer_64(dst_argb_opt, dst_argb_plane_size)
+ memset(dst_argb_c, 2, dst_argb_plane_size);
+ memset(dst_argb_opt, 3, dst_argb_plane_size);
+
+ if (kBpp == 1) {
+ MaskCpuFlags(0); // Disable all CPU optimization.
+ RotatePlane(src_argb, src_stride_argb,
+ dst_argb_c, dst_stride_argb,
+ src_width, src_height, mode);
+
+ MaskCpuFlags(-1); // Enable all CPU optimization.
+ for (int i = 0; i < benchmark_iterations; ++i) {
+ RotatePlane(src_argb, src_stride_argb,
+ dst_argb_opt, dst_stride_argb,
+ src_width, src_height, mode);
+ }
+ } else if (kBpp == 4) {
+ MaskCpuFlags(0); // Disable all CPU optimization.
+ ARGBRotate(src_argb, src_stride_argb,
+ dst_argb_c, dst_stride_argb,
+ src_width, src_height, mode);
+
+ MaskCpuFlags(-1); // Enable all CPU optimization.
+ for (int i = 0; i < benchmark_iterations; ++i) {
+ ARGBRotate(src_argb, src_stride_argb,
+ dst_argb_opt, dst_stride_argb,
+ src_width, src_height, mode);
+ }
+ }
+
+ // Rotation should be exact.
+ for (int i = 0; i < dst_argb_plane_size; ++i) {
+ EXPECT_EQ(dst_argb_c[i], dst_argb_opt[i]);
+ }
+
+ free_aligned_buffer_64(dst_argb_c)
+ free_aligned_buffer_64(dst_argb_opt)
+ free_aligned_buffer_64(src_argb)
+}
+
+static void ARGBTestRotate(int src_width, int src_height,
+ int dst_width, int dst_height,
+ libyuv::RotationMode mode,
+ int benchmark_iterations) {
+ TestRotateBpp(src_width, src_height,
+ dst_width, dst_height,
+ mode, benchmark_iterations, 4);
+}
+
+TEST_F(libyuvTest, ARGBRotate0) {
+ ARGBTestRotate(benchmark_width_, benchmark_height_,
+ benchmark_width_, benchmark_height_,
+ kRotate0, benchmark_iterations_);
+}
+
+TEST_F(libyuvTest, ARGBRotate90) {
+ ARGBTestRotate(benchmark_width_, benchmark_height_,
+ benchmark_height_, benchmark_width_,
+ kRotate90, benchmark_iterations_);
+}
+
+TEST_F(libyuvTest, ARGBRotate180) {
+ ARGBTestRotate(benchmark_width_, benchmark_height_,
+ benchmark_width_, benchmark_height_,
+ kRotate180, benchmark_iterations_);
+}
+
+TEST_F(libyuvTest, ARGBRotate270) {
+ ARGBTestRotate(benchmark_width_, benchmark_height_,
+ benchmark_height_, benchmark_width_,
+ kRotate270, benchmark_iterations_);
+}
+
+TEST_F(libyuvTest, ARGBRotate0_Odd) {
+ ARGBTestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
+ benchmark_width_ - 3, benchmark_height_ - 1,
+ kRotate0, benchmark_iterations_);
+}
+
+TEST_F(libyuvTest, ARGBRotate90_Odd) {
+ ARGBTestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
+ benchmark_height_ - 1, benchmark_width_ - 3,
+ kRotate90, benchmark_iterations_);
+}
+
+TEST_F(libyuvTest, ARGBRotate180_Odd) {
+ ARGBTestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
+ benchmark_width_ - 3, benchmark_height_ - 1,
+ kRotate180, benchmark_iterations_);
+}
+
+TEST_F(libyuvTest, ARGBRotate270_Odd) {
+ ARGBTestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
+ benchmark_height_ - 1, benchmark_width_ - 3,
+ kRotate270, benchmark_iterations_);
+}
+
+static void TestRotatePlane(int src_width, int src_height,
+ int dst_width, int dst_height,
+ libyuv::RotationMode mode,
+ int benchmark_iterations) {
+ TestRotateBpp(src_width, src_height,
+ dst_width, dst_height,
+ mode, benchmark_iterations, 1);
+}
+
+TEST_F(libyuvTest, RotatePlane0) {
+ TestRotatePlane(benchmark_width_, benchmark_height_,
+ benchmark_width_, benchmark_height_,
+ kRotate0, benchmark_iterations_);
+}
+
+TEST_F(libyuvTest, RotatePlane90) {
+ TestRotatePlane(benchmark_width_, benchmark_height_,
+ benchmark_height_, benchmark_width_,
+ kRotate90, benchmark_iterations_);
+}
+
+TEST_F(libyuvTest, RotatePlane180) {
+ TestRotatePlane(benchmark_width_, benchmark_height_,
+ benchmark_width_, benchmark_height_,
+ kRotate180, benchmark_iterations_);
+}
+
+TEST_F(libyuvTest, RotatePlane270) {
+ TestRotatePlane(benchmark_width_, benchmark_height_,
+ benchmark_height_, benchmark_width_,
+ kRotate270, benchmark_iterations_);
+}
+
+TEST_F(libyuvTest, RotatePlane0_Odd) {
+ TestRotatePlane(benchmark_width_ - 3, benchmark_height_ - 1,
+ benchmark_width_ - 3, benchmark_height_ - 1,
+ kRotate0, benchmark_iterations_);
+}
+
+TEST_F(libyuvTest, RotatePlane90_Odd) {
+ TestRotatePlane(benchmark_width_ - 3, benchmark_height_ - 1,
+ benchmark_height_ - 1, benchmark_width_ - 3,
+ kRotate90, benchmark_iterations_);
+}
+
+TEST_F(libyuvTest, RotatePlane180_Odd) {
+ TestRotatePlane(benchmark_width_ - 3, benchmark_height_ - 1,
+ benchmark_width_ - 3, benchmark_height_ - 1,
+ kRotate180, benchmark_iterations_);
+}
+
+TEST_F(libyuvTest, RotatePlane270_Odd) {
+ TestRotatePlane(benchmark_width_ - 3, benchmark_height_ - 1,
+ benchmark_height_ - 1, benchmark_width_ - 3,
+ kRotate270, benchmark_iterations_);
+}
+
+} // namespace libyuv
diff --git a/chromium/third_party/libyuv/unit_test/rotate_test.cc b/chromium/third_party/libyuv/unit_test/rotate_test.cc
new file mode 100644
index 00000000000..730860d3fed
--- /dev/null
+++ b/chromium/third_party/libyuv/unit_test/rotate_test.cc
@@ -0,0 +1,243 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+#include <time.h>
+
+#include "libyuv/cpu_id.h"
+#include "libyuv/rotate.h"
+#include "../unit_test/unit_test.h"
+
+namespace libyuv {
+
+static void I420TestRotate(int src_width, int src_height,
+ int dst_width, int dst_height,
+ libyuv::RotationMode mode,
+ int benchmark_iterations) {
+ if (src_width < 1) {
+ src_width = 1;
+ }
+ if (src_height < 1) {
+ src_height = 1;
+ }
+ if (dst_width < 1) {
+ dst_width = 1;
+ }
+ if (dst_height < 1) {
+ dst_height = 1;
+ }
+ int src_i420_y_size = src_width * src_height;
+ int src_i420_uv_size = ((src_width + 1) / 2) * ((src_height + 1) / 2);
+ int src_i420_size = src_i420_y_size + src_i420_uv_size * 2;
+ align_buffer_64(src_i420, src_i420_size)
+ for (int i = 0; i < src_i420_size; ++i) {
+ src_i420[i] = random() & 0xff;
+ }
+
+ int dst_i420_y_size = dst_width * dst_height;
+ int dst_i420_uv_size = ((dst_width + 1) / 2) * ((dst_height + 1) / 2);
+ int dst_i420_size = dst_i420_y_size + dst_i420_uv_size * 2;
+ align_buffer_64(dst_i420_c, dst_i420_size)
+ align_buffer_64(dst_i420_opt, dst_i420_size)
+ memset(dst_i420_c, 2, dst_i420_size);
+ memset(dst_i420_opt, 3, dst_i420_size);
+
+ MaskCpuFlags(0); // Disable all CPU optimization.
+ I420Rotate(src_i420, src_width,
+ src_i420 + src_i420_y_size, (src_width + 1) / 2,
+ src_i420 + src_i420_y_size + src_i420_uv_size, (src_width + 1) / 2,
+ dst_i420_c, dst_width,
+ dst_i420_c + dst_i420_y_size, (dst_width + 1) / 2,
+ dst_i420_c + dst_i420_y_size + dst_i420_uv_size,
+ (dst_width + 1) / 2,
+ src_width, src_height, mode);
+
+ MaskCpuFlags(-1); // Enable all CPU optimization.
+ for (int i = 0; i < benchmark_iterations; ++i) {
+ I420Rotate(src_i420, src_width,
+ src_i420 + src_i420_y_size, (src_width + 1) / 2,
+ src_i420 + src_i420_y_size + src_i420_uv_size,
+ (src_width + 1) / 2,
+ dst_i420_opt, dst_width,
+ dst_i420_opt + dst_i420_y_size, (dst_width + 1) / 2,
+ dst_i420_opt + dst_i420_y_size + dst_i420_uv_size,
+ (dst_width + 1) / 2,
+ src_width, src_height, mode);
+ }
+
+ // Rotation should be exact.
+ for (int i = 0; i < dst_i420_size; ++i) {
+ EXPECT_EQ(dst_i420_c[i], dst_i420_opt[i]);
+ }
+
+ free_aligned_buffer_64(dst_i420_c)
+ free_aligned_buffer_64(dst_i420_opt)
+ free_aligned_buffer_64(src_i420)
+}
+
+TEST_F(libyuvTest, I420Rotate0) {
+ I420TestRotate(benchmark_width_, benchmark_height_,
+ benchmark_width_, benchmark_height_,
+ kRotate0, benchmark_iterations_);
+}
+
+TEST_F(libyuvTest, I420Rotate90) {
+ I420TestRotate(benchmark_width_, benchmark_height_,
+ benchmark_height_, benchmark_width_,
+ kRotate90, benchmark_iterations_);
+}
+
+TEST_F(libyuvTest, I420Rotate180) {
+ I420TestRotate(benchmark_width_, benchmark_height_,
+ benchmark_width_, benchmark_height_,
+ kRotate180, benchmark_iterations_);
+}
+
+TEST_F(libyuvTest, I420Rotate270) {
+ I420TestRotate(benchmark_width_, benchmark_height_,
+ benchmark_height_, benchmark_width_,
+ kRotate270, benchmark_iterations_);
+}
+
+TEST_F(libyuvTest, I420Rotate0_Odd) {
+ I420TestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
+ benchmark_width_ - 3, benchmark_height_ - 1,
+ kRotate0, benchmark_iterations_);
+}
+
+TEST_F(libyuvTest, I420Rotate90_Odd) {
+ I420TestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
+ benchmark_height_ - 1, benchmark_width_ - 3,
+ kRotate90, benchmark_iterations_);
+}
+
+TEST_F(libyuvTest, I420Rotate180_Odd) {
+ I420TestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
+ benchmark_width_ - 3, benchmark_height_ - 1,
+ kRotate180, benchmark_iterations_);
+}
+
+TEST_F(libyuvTest, I420Rotate270_Odd) {
+ I420TestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
+ benchmark_height_ - 1, benchmark_width_ - 3,
+ kRotate270, benchmark_iterations_);
+}
+
+static void NV12TestRotate(int src_width, int src_height,
+ int dst_width, int dst_height,
+ libyuv::RotationMode mode,
+ int benchmark_iterations) {
+ if (src_width < 1) {
+ src_width = 1;
+ }
+ if (src_height < 1) {
+ src_height = 1;
+ }
+ if (dst_width < 1) {
+ dst_width = 1;
+ }
+ if (dst_height < 1) {
+ dst_height = 1;
+ }
+ int src_nv12_y_size = src_width * src_height;
+ int src_nv12_uv_size = ((src_width + 1) / 2) * ((src_height + 1) / 2) * 2;
+ int src_nv12_size = src_nv12_y_size + src_nv12_uv_size;
+ align_buffer_64(src_nv12, src_nv12_size)
+ for (int i = 0; i < src_nv12_size; ++i) {
+ src_nv12[i] = random() & 0xff;
+ }
+
+ int dst_i420_y_size = dst_width * dst_height;
+ int dst_i420_uv_size = ((dst_width + 1) / 2) * ((dst_height + 1) / 2);
+ int dst_i420_size = dst_i420_y_size + dst_i420_uv_size * 2;
+ align_buffer_64(dst_i420_c, dst_i420_size)
+ align_buffer_64(dst_i420_opt, dst_i420_size)
+ memset(dst_i420_c, 2, dst_i420_size);
+ memset(dst_i420_opt, 3, dst_i420_size);
+
+ MaskCpuFlags(0); // Disable all CPU optimization.
+ NV12ToI420Rotate(src_nv12, src_width,
+ src_nv12 + src_nv12_y_size, (src_width + 1) & ~1,
+ dst_i420_c, dst_width,
+ dst_i420_c + dst_i420_y_size, (dst_width + 1) / 2,
+ dst_i420_c + dst_i420_y_size + dst_i420_uv_size,
+ (dst_width + 1) / 2,
+ src_width, src_height, mode);
+
+ MaskCpuFlags(-1); // Enable all CPU optimization.
+ for (int i = 0; i < benchmark_iterations; ++i) {
+ NV12ToI420Rotate(src_nv12, src_width,
+ src_nv12 + src_nv12_y_size, (src_width + 1) & ~1,
+ dst_i420_opt, dst_width,
+ dst_i420_opt + dst_i420_y_size, (dst_width + 1) / 2,
+ dst_i420_opt + dst_i420_y_size + dst_i420_uv_size,
+ (dst_width + 1) / 2,
+ src_width, src_height, mode);
+ }
+
+ // Rotation should be exact.
+ for (int i = 0; i < dst_i420_size; ++i) {
+ EXPECT_EQ(dst_i420_c[i], dst_i420_opt[i]);
+ }
+
+ free_aligned_buffer_64(dst_i420_c)
+ free_aligned_buffer_64(dst_i420_opt)
+ free_aligned_buffer_64(src_nv12)
+}
+
+TEST_F(libyuvTest, NV12Rotate0) {
+ NV12TestRotate(benchmark_width_, benchmark_height_,
+ benchmark_width_, benchmark_height_,
+ kRotate0, benchmark_iterations_);
+}
+
+TEST_F(libyuvTest, NV12Rotate90) {
+ NV12TestRotate(benchmark_width_, benchmark_height_,
+ benchmark_height_, benchmark_width_,
+ kRotate90, benchmark_iterations_);
+}
+
+TEST_F(libyuvTest, NV12Rotate180) {
+ NV12TestRotate(benchmark_width_, benchmark_height_,
+ benchmark_width_, benchmark_height_,
+ kRotate180, benchmark_iterations_);
+}
+
+TEST_F(libyuvTest, NV12Rotate270) {
+ NV12TestRotate(benchmark_width_, benchmark_height_,
+ benchmark_height_, benchmark_width_,
+ kRotate270, benchmark_iterations_);
+}
+
+TEST_F(libyuvTest, NV12Rotate0_Odd) {
+ NV12TestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
+ benchmark_width_ - 3, benchmark_height_ - 1,
+ kRotate0, benchmark_iterations_);
+}
+
+TEST_F(libyuvTest, NV12Rotate90_Odd) {
+ NV12TestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
+ benchmark_height_ - 1, benchmark_width_ - 3,
+ kRotate90, benchmark_iterations_);
+}
+
+TEST_F(libyuvTest, NV12Rotate180_Odd) {
+ NV12TestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
+ benchmark_width_ - 3, benchmark_height_ - 1,
+ kRotate180, benchmark_iterations_);
+}
+
+TEST_F(libyuvTest, NV12Rotate270_Odd) {
+ NV12TestRotate(benchmark_width_ - 3, benchmark_height_ - 1,
+ benchmark_height_ - 1, benchmark_width_ - 3,
+ kRotate270, benchmark_iterations_);
+}
+
+} // namespace libyuv
diff --git a/chromium/third_party/libyuv/unit_test/scale_argb_test.cc b/chromium/third_party/libyuv/unit_test/scale_argb_test.cc
new file mode 100644
index 00000000000..7a4758594a0
--- /dev/null
+++ b/chromium/third_party/libyuv/unit_test/scale_argb_test.cc
@@ -0,0 +1,276 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+#include <time.h>
+
+#include "libyuv/cpu_id.h"
+#include "libyuv/scale_argb.h"
+#include "../unit_test/unit_test.h"
+
+namespace libyuv {
+
+static __inline int Abs(int v) {
+ return v >= 0 ? v : -v;
+}
+
+// Test scaling with C vs Opt and return maximum pixel difference. 0 = exact.
+static int ARGBTestFilter(int src_width, int src_height,
+ int dst_width, int dst_height,
+ FilterMode f, int benchmark_iterations) {
+ const int b = 128;
+ int i, j;
+ int src_argb_plane_size = (Abs(src_width) + b * 2) *
+ (Abs(src_height) + b * 2) * 4;
+ int src_stride_argb = (b * 2 + Abs(src_width)) * 4;
+
+ align_buffer_64(src_argb, src_argb_plane_size)
+ srandom(time(NULL));
+ MemRandomize(src_argb, src_argb_plane_size);
+
+ int dst_argb_plane_size = (dst_width + b * 2) * (dst_height + b * 2) * 4;
+ int dst_stride_argb = (b * 2 + dst_width) * 4;
+
+ align_buffer_64(dst_argb_c, dst_argb_plane_size)
+ align_buffer_64(dst_argb_opt, dst_argb_plane_size)
+ memset(dst_argb_c, 2, dst_argb_plane_size);
+ memset(dst_argb_opt, 3, dst_argb_plane_size);
+
+ // Warm up both versions for consistent benchmarks.
+ MaskCpuFlags(0); // Disable all CPU optimization.
+ ARGBScale(src_argb + (src_stride_argb * b) + b * 4, src_stride_argb,
+ src_width, src_height,
+ dst_argb_c + (dst_stride_argb * b) + b * 4, dst_stride_argb,
+ dst_width, dst_height, f);
+ MaskCpuFlags(-1); // Enable all CPU optimization.
+ ARGBScale(src_argb + (src_stride_argb * b) + b * 4, src_stride_argb,
+ src_width, src_height,
+ dst_argb_opt + (dst_stride_argb * b) + b * 4, dst_stride_argb,
+ dst_width, dst_height, f);
+
+ MaskCpuFlags(0); // Disable all CPU optimization.
+ double c_time = get_time();
+ ARGBScale(src_argb + (src_stride_argb * b) + b * 4, src_stride_argb,
+ src_width, src_height,
+ dst_argb_c + (dst_stride_argb * b) + b * 4, dst_stride_argb,
+ dst_width, dst_height, f);
+
+ c_time = (get_time() - c_time);
+
+ MaskCpuFlags(-1); // Enable all CPU optimization.
+ double opt_time = get_time();
+ for (i = 0; i < benchmark_iterations; ++i) {
+ ARGBScale(src_argb + (src_stride_argb * b) + b * 4, src_stride_argb,
+ src_width, src_height,
+ dst_argb_opt + (dst_stride_argb * b) + b * 4, dst_stride_argb,
+ dst_width, dst_height, f);
+ }
+ opt_time = (get_time() - opt_time) / benchmark_iterations;
+
+ // Report performance of C vs OPT
+ printf("filter %d - %8d us C - %8d us OPT\n",
+ f, static_cast<int>(c_time * 1e6), static_cast<int>(opt_time * 1e6));
+
+ // The C version may differ slightly from the optimized version because the
+ // order of operations can introduce rounding differences. Diff the buffers
+ // and check that the maximum difference does not exceed 2.
+ int max_diff = 0;
+ for (i = b; i < (dst_height + b); ++i) {
+ for (j = b * 4; j < (dst_width + b) * 4; ++j) {
+ int abs_diff = abs(dst_argb_c[(i * dst_stride_argb) + j] -
+ dst_argb_opt[(i * dst_stride_argb) + j]);
+ if (abs_diff > max_diff) {
+ max_diff = abs_diff;
+ }
+ }
+ }
+
+ free_aligned_buffer_64(dst_argb_c)
+ free_aligned_buffer_64(dst_argb_opt)
+ free_aligned_buffer_64(src_argb)
+ return max_diff;
+}
+
+static const int kTileX = 16;
+static const int kTileY = 16;
+
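+// Scales to the destination one kTileX x kTileY tile at a time using
+// ARGBScaleClip, so a tiled, clipped scale can be compared against a single
+// full-frame ARGBScale below.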
+static int TileARGBScale(const uint8* src_argb, int src_stride_argb,
+ int src_width, int src_height,
+ uint8* dst_argb, int dst_stride_argb,
+ int dst_width, int dst_height,
+ FilterMode filtering) {
+ for (int y = 0; y < dst_height; y += kTileY) {
+ for (int x = 0; x < dst_width; x += kTileX) {
+ int clip_width = kTileX;
+ if (x + clip_width > dst_width) {
+ clip_width = dst_width - x;
+ }
+ int clip_height = kTileY;
+ if (y + clip_height > dst_height) {
+ clip_height = dst_height - y;
+ }
+ int r = ARGBScaleClip(src_argb, src_stride_argb,
+ src_width, src_height,
+ dst_argb, dst_stride_argb,
+ dst_width, dst_height,
+ x, y, clip_width, clip_height, filtering);
+ if (r) {
+ return r;
+ }
+ }
+ }
+ return 0;
+}
+
+static int ARGBClipTestFilter(int src_width, int src_height,
+ int dst_width, int dst_height,
+ FilterMode f, int benchmark_iterations) {
+ const int b = 128;
+ int src_argb_plane_size = (Abs(src_width) + b * 2) *
+ (Abs(src_height) + b * 2) * 4;
+ int src_stride_argb = (b * 2 + Abs(src_width)) * 4;
+
+ align_buffer_64(src_argb, src_argb_plane_size)
+ memset(src_argb, 1, src_argb_plane_size);
+
+ int dst_argb_plane_size = (dst_width + b * 2) * (dst_height + b * 2) * 4;
+ int dst_stride_argb = (b * 2 + dst_width) * 4;
+
+ srandom(time(NULL));
+
+ int i, j;
+ for (i = b; i < (Abs(src_height) + b); ++i) {
+ for (j = b; j < (Abs(src_width) + b) * 4; ++j) {
+ src_argb[(i * src_stride_argb) + j] = (random() & 0xff);
+ }
+ }
+
+ align_buffer_64(dst_argb_c, dst_argb_plane_size)
+ align_buffer_64(dst_argb_opt, dst_argb_plane_size)
+ memset(dst_argb_c, 2, dst_argb_plane_size);
+ memset(dst_argb_opt, 3, dst_argb_plane_size);
+
+ // Do full image, no clipping.
+ double c_time = get_time();
+ ARGBScale(src_argb + (src_stride_argb * b) + b * 4, src_stride_argb,
+ src_width, src_height,
+ dst_argb_c + (dst_stride_argb * b) + b * 4, dst_stride_argb,
+ dst_width, dst_height, f);
+ c_time = (get_time() - c_time);
+
+ // Do tiled image, clipping scale to a tile at a time.
+ double opt_time = get_time();
+ for (i = 0; i < benchmark_iterations; ++i) {
+ TileARGBScale(src_argb + (src_stride_argb * b) + b * 4, src_stride_argb,
+ src_width, src_height,
+ dst_argb_opt + (dst_stride_argb * b) + b * 4, dst_stride_argb,
+ dst_width, dst_height, f);
+ }
+ opt_time = (get_time() - opt_time) / benchmark_iterations;
+
+ // Report performance of Full vs Tiled.
+ printf("filter %d - %8d us Full - %8d us Tiled\n",
+ f, static_cast<int>(c_time * 1e6), static_cast<int>(opt_time * 1e6));
+
+ // Compare full scaled image vs tiled image.
+ int max_diff = 0;
+ for (i = b; i < (dst_height + b); ++i) {
+ for (j = b * 4; j < (dst_width + b) * 4; ++j) {
+ int abs_diff = abs(dst_argb_c[(i * dst_stride_argb) + j] -
+ dst_argb_opt[(i * dst_stride_argb) + j]);
+ if (abs_diff > max_diff) {
+ max_diff = abs_diff;
+ }
+ }
+ }
+
+ free_aligned_buffer_64(dst_argb_c)
+ free_aligned_buffer_64(dst_argb_opt)
+ free_aligned_buffer_64(src_argb)
+ return max_diff;
+}
+
+#define TEST_FACTOR1(name, filter, factor, max_diff) \
+ TEST_F(libyuvTest, ARGBScaleDownBy##name##_##filter) { \
+ int diff = ARGBTestFilter(benchmark_width_, benchmark_height_, \
+ Abs(benchmark_width_) / factor, \
+ Abs(benchmark_height_) / factor, \
+ kFilter##filter, benchmark_iterations_); \
+ EXPECT_LE(diff, max_diff); \
+ } \
+ TEST_F(libyuvTest, ARGBScaleDownClipBy##name##_##filter) { \
+ int diff = ARGBClipTestFilter(benchmark_width_, benchmark_height_, \
+ Abs(benchmark_width_) / factor, \
+ Abs(benchmark_height_) / factor, \
+ kFilter##filter, benchmark_iterations_); \
+ EXPECT_LE(diff, max_diff); \
+ }
+
+// Test a scale factor with both filters (None and Bilinear). Expect the
+// unfiltered result to be exact; filtered results may differ slightly because
+// SSSE3, NEON and C use different fixed-point implementations.
+#define TEST_FACTOR(name, factor) \
+ TEST_FACTOR1(name, None, factor, 0) \
+ TEST_FACTOR1(name, Bilinear, factor, 2)
+
+// TODO(fbarchard): ScaleDownBy1 should be lossless, but Box has error of 2.
+TEST_FACTOR(1, 1)
+TEST_FACTOR(2, 2)
+TEST_FACTOR(4, 4)
+TEST_FACTOR(5, 5)
+TEST_FACTOR(8, 8)
+TEST_FACTOR(16, 16)
+TEST_FACTOR(2by3, 2 / 3)
+TEST_FACTOR(3by4, 3 / 4)
+TEST_FACTOR(3by8, 3 / 8)
+#undef TEST_FACTOR1
+#undef TEST_FACTOR
+
+#define TEST_SCALETO1(width, height, filter, max_diff) \
+ TEST_F(libyuvTest, ARGBScaleTo##width##x##height##_##filter) { \
+ int diff = ARGBTestFilter(benchmark_width_, benchmark_height_, \
+ width, height, \
+ kFilter##filter, benchmark_iterations_); \
+ EXPECT_LE(diff, max_diff); \
+ } \
+ TEST_F(libyuvTest, ARGBScaleFrom##width##x##height##_##filter) { \
+ int diff = ARGBTestFilter(width, height, \
+ Abs(benchmark_width_), Abs(benchmark_height_), \
+ kFilter##filter, benchmark_iterations_); \
+ EXPECT_LE(diff, max_diff); \
+ } \
+ TEST_F(libyuvTest, ARGBScaleClipTo##width##x##height##_##filter) { \
+ int diff = ARGBClipTestFilter(benchmark_width_, benchmark_height_, \
+ width, height, \
+ kFilter##filter, benchmark_iterations_); \
+ EXPECT_LE(diff, max_diff); \
+ } \
+ TEST_F(libyuvTest, ARGBScaleClipFrom##width##x##height##_##filter) { \
+ int diff = ARGBClipTestFilter(width, height, \
+ Abs(benchmark_width_), Abs(benchmark_height_), \
+ kFilter##filter, benchmark_iterations_); \
+ EXPECT_LE(diff, max_diff); \
+ }
+
+// Test scaling to a specified size with both filters (None and Bilinear).
+#define TEST_SCALETO(width, height) \
+ TEST_SCALETO1(width, height, None, 0) \
+ TEST_SCALETO1(width, height, Bilinear, 2)
+
+TEST_SCALETO(640, 360)
+TEST_SCALETO(853, 480)
+TEST_SCALETO(1280, 720)
+TEST_SCALETO(1280, 800)
+TEST_SCALETO(1366, 768)
+TEST_SCALETO(1920, 1080)
+#undef TEST_SCALETO1
+#undef TEST_SCALETO
+
+} // namespace libyuv
diff --git a/chromium/third_party/libyuv/unit_test/scale_test.cc b/chromium/third_party/libyuv/unit_test/scale_test.cc
new file mode 100644
index 00000000000..769151aa232
--- /dev/null
+++ b/chromium/third_party/libyuv/unit_test/scale_test.cc
@@ -0,0 +1,197 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+#include <time.h>
+
+#include "libyuv/cpu_id.h"
+#include "libyuv/scale.h"
+#include "../unit_test/unit_test.h"
+
+namespace libyuv {
+
+static __inline int Abs(int v) {
+ return v >= 0 ? v : -v;
+}
+
+// Test scaling with C vs Opt and return maximum pixel difference. 0 = exact.
+static int TestFilter(int src_width, int src_height,
+ int dst_width, int dst_height,
+ FilterMode f, int benchmark_iterations) {
+ int i, j;
+ const int b = 128;
+ int src_width_uv = (Abs(src_width) + 1) >> 1;
+ int src_height_uv = (Abs(src_height) + 1) >> 1;
+
+ int src_y_plane_size = (Abs(src_width) + b * 2) * (Abs(src_height) + b * 2);
+ int src_uv_plane_size = (src_width_uv + b * 2) * (src_height_uv + b * 2);
+
+ int src_stride_y = b * 2 + Abs(src_width);
+ int src_stride_uv = b * 2 + src_width_uv;
+
+ align_buffer_page_end(src_y, src_y_plane_size)
+ align_buffer_page_end(src_u, src_uv_plane_size)
+ align_buffer_page_end(src_v, src_uv_plane_size)
+ srandom(time(NULL));
+ MemRandomize(src_y, src_y_plane_size);
+ MemRandomize(src_u, src_uv_plane_size);
+ MemRandomize(src_v, src_uv_plane_size);
+
+ int dst_width_uv = (dst_width + 1) >> 1;
+ int dst_height_uv = (dst_height + 1) >> 1;
+
+ int dst_y_plane_size = (dst_width + b * 2) * (dst_height + b * 2);
+ int dst_uv_plane_size = (dst_width_uv + b * 2) * (dst_height_uv + b * 2);
+
+ int dst_stride_y = b * 2 + dst_width;
+ int dst_stride_uv = b * 2 + dst_width_uv;
+
+ align_buffer_page_end(dst_y_c, dst_y_plane_size)
+ align_buffer_page_end(dst_u_c, dst_uv_plane_size)
+ align_buffer_page_end(dst_v_c, dst_uv_plane_size)
+ align_buffer_page_end(dst_y_opt, dst_y_plane_size)
+ align_buffer_page_end(dst_u_opt, dst_uv_plane_size)
+ align_buffer_page_end(dst_v_opt, dst_uv_plane_size)
+
+
+ MaskCpuFlags(0); // Disable all CPU optimization.
+ double c_time = get_time();
+ I420Scale(src_y + (src_stride_y * b) + b, src_stride_y,
+ src_u + (src_stride_uv * b) + b, src_stride_uv,
+ src_v + (src_stride_uv * b) + b, src_stride_uv,
+ src_width, src_height,
+ dst_y_c + (dst_stride_y * b) + b, dst_stride_y,
+ dst_u_c + (dst_stride_uv * b) + b, dst_stride_uv,
+ dst_v_c + (dst_stride_uv * b) + b, dst_stride_uv,
+ dst_width, dst_height, f);
+ c_time = (get_time() - c_time);
+
+ MaskCpuFlags(-1); // Enable all CPU optimization.
+ double opt_time = get_time();
+ for (i = 0; i < benchmark_iterations; ++i) {
+ I420Scale(src_y + (src_stride_y * b) + b, src_stride_y,
+ src_u + (src_stride_uv * b) + b, src_stride_uv,
+ src_v + (src_stride_uv * b) + b, src_stride_uv,
+ src_width, src_height,
+ dst_y_opt + (dst_stride_y * b) + b, dst_stride_y,
+ dst_u_opt + (dst_stride_uv * b) + b, dst_stride_uv,
+ dst_v_opt + (dst_stride_uv * b) + b, dst_stride_uv,
+ dst_width, dst_height, f);
+ }
+ opt_time = (get_time() - opt_time) / benchmark_iterations;
+ // Report performance of C vs OPT
+ printf("filter %d - %8d us C - %8d us OPT\n",
+ f,
+ static_cast<int>(c_time * 1e6),
+ static_cast<int>(opt_time * 1e6));
+
+ // The C version may differ slightly from the optimized version because the
+ // order of operations can introduce rounding differences. Diff the buffers
+ // and check that the maximum difference does not exceed 2.
+ int max_diff = 0;
+ for (i = b; i < (dst_height + b); ++i) {
+ for (j = b; j < (dst_width + b); ++j) {
+ int abs_diff = abs(dst_y_c[(i * dst_stride_y) + j] -
+ dst_y_opt[(i * dst_stride_y) + j]);
+ if (abs_diff > max_diff) {
+ max_diff = abs_diff;
+ }
+ }
+ }
+
+ for (i = b; i < (dst_height_uv + b); ++i) {
+ for (j = b; j < (dst_width_uv + b); ++j) {
+ int abs_diff = abs(dst_u_c[(i * dst_stride_uv) + j] -
+ dst_u_opt[(i * dst_stride_uv) + j]);
+ if (abs_diff > max_diff) {
+ max_diff = abs_diff;
+ }
+ abs_diff = abs(dst_v_c[(i * dst_stride_uv) + j] -
+ dst_v_opt[(i * dst_stride_uv) + j]);
+ if (abs_diff > max_diff) {
+ max_diff = abs_diff;
+ }
+ }
+ }
+
+ free_aligned_buffer_page_end(dst_y_c)
+ free_aligned_buffer_page_end(dst_u_c)
+ free_aligned_buffer_page_end(dst_v_c)
+ free_aligned_buffer_page_end(dst_y_opt)
+ free_aligned_buffer_page_end(dst_u_opt)
+ free_aligned_buffer_page_end(dst_v_opt)
+
+ free_aligned_buffer_page_end(src_y)
+ free_aligned_buffer_page_end(src_u)
+ free_aligned_buffer_page_end(src_v)
+
+ return max_diff;
+}
+
+#define TEST_FACTOR1(name, filter, factor, max_diff) \
+ TEST_F(libyuvTest, ScaleDownBy##name##_##filter) { \
+ int diff = TestFilter(benchmark_width_, benchmark_height_, \
+ Abs(benchmark_width_) / factor, \
+ Abs(benchmark_height_) / factor, \
+ kFilter##filter, benchmark_iterations_); \
+ EXPECT_LE(diff, max_diff); \
+ }
+
+// Test a scale factor with all 3 filters (None, Bilinear and Box). Expect the
+// unfiltered result to be exact; filtered results may differ slightly because
+// SSSE3, NEON and C use different fixed-point implementations.
+#define TEST_FACTOR(name, factor) \
+ TEST_FACTOR1(name, None, factor, 0) \
+ TEST_FACTOR1(name, Bilinear, factor, 2) \
+ TEST_FACTOR1(name, Box, factor, 2) \
+
+// TODO(fbarchard): ScaleDownBy1 should be lossless, but Box has error of 2.
+TEST_FACTOR(1, 1)
+TEST_FACTOR(2, 2)
+TEST_FACTOR(4, 4)
+TEST_FACTOR(5, 5)
+TEST_FACTOR(8, 8)
+TEST_FACTOR(16, 16)
+TEST_FACTOR(2by3, 2 / 3)
+TEST_FACTOR(3by4, 3 / 4)
+TEST_FACTOR(3by8, 3 / 8)
+#undef TEST_FACTOR1
+#undef TEST_FACTOR
+
+#define TEST_SCALETO1(width, height, filter, max_diff) \
+ TEST_F(libyuvTest, ScaleTo##width##x##height##_##filter) { \
+ int diff = TestFilter(benchmark_width_, benchmark_height_, \
+ width, height, \
+ kFilter##filter, benchmark_iterations_); \
+ EXPECT_LE(diff, max_diff); \
+ } \
+ TEST_F(libyuvTest, ScaleFrom##width##x##height##_##filter) { \
+ int diff = TestFilter(width, height, \
+ Abs(benchmark_width_), Abs(benchmark_height_), \
+ kFilter##filter, benchmark_iterations_); \
+ EXPECT_LE(diff, max_diff); \
+ }
+
+// Test scale to a specified size with all 3 filters.
+#define TEST_SCALETO(width, height) \
+ TEST_SCALETO1(width, height, None, 0) \
+ TEST_SCALETO1(width, height, Bilinear, 2) \
+ TEST_SCALETO1(width, height, Box, 2) \
+
+TEST_SCALETO(640, 360)
+TEST_SCALETO(853, 480)
+TEST_SCALETO(1280, 720)
+TEST_SCALETO(1280, 800)
+TEST_SCALETO(1366, 768)
+TEST_SCALETO(1920, 1080)
+#undef TEST_SCALETO1
+#undef TEST_SCALETO
+
+} // namespace libyuv
diff --git a/chromium/third_party/libyuv/unit_test/testdata/arm_v7.txt b/chromium/third_party/libyuv/unit_test/testdata/arm_v7.txt
new file mode 100644
index 00000000000..5d7dbd04802
--- /dev/null
+++ b/chromium/third_party/libyuv/unit_test/testdata/arm_v7.txt
@@ -0,0 +1,12 @@
+Processor : ARMv7 Processor rev 5 (v7l)
+BogoMIPS : 795.44
+Features : swp half thumb fastmult vfp edsp iwmmxt thumbee vfpv3 vfpv3d16
+CPU implementer : 0x56
+CPU architecture: 7
+CPU variant : 0x0
+CPU part : 0x581
+CPU revision : 5
+
+Hardware : OLPC XO-1.75
+Revision : 0000
+Serial : 0000000000000000
diff --git a/chromium/third_party/libyuv/unit_test/testdata/tegra3.txt b/chromium/third_party/libyuv/unit_test/testdata/tegra3.txt
new file mode 100644
index 00000000000..d1b09f6b771
--- /dev/null
+++ b/chromium/third_party/libyuv/unit_test/testdata/tegra3.txt
@@ -0,0 +1,23 @@
+Processor : ARMv7 Processor rev 9 (v7l)
+processor : 0
+BogoMIPS : 1992.29
+
+processor : 1
+BogoMIPS : 1992.29
+
+processor : 2
+BogoMIPS : 1992.29
+
+processor : 3
+BogoMIPS : 1992.29
+
+Features : swp half thumb fastmult vfp edsp neon vfpv3
+CPU implementer : 0x41
+CPU architecture: 7
+CPU variant : 0x2
+CPU part : 0xc09
+CPU revision : 9
+
+Hardware : cardhu
+Revision : 0000
+
diff --git a/chromium/third_party/libyuv/unit_test/unit_test.cc b/chromium/third_party/libyuv/unit_test/unit_test.cc
new file mode 100644
index 00000000000..fac70262133
--- /dev/null
+++ b/chromium/third_party/libyuv/unit_test/unit_test.cc
@@ -0,0 +1,50 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "../unit_test/unit_test.h"
+
+#include <stdlib.h> // For getenv()
+
+#include <cstring>
+
+// Change this to 1000 for benchmarking.
+// TODO(fbarchard): Add command line parsing to pass this as option.
+#define BENCHMARK_ITERATIONS 1
+
+libyuvTest::libyuvTest() : rotate_max_w_(128), rotate_max_h_(128),
+ benchmark_iterations_(BENCHMARK_ITERATIONS), benchmark_width_(128),
+ benchmark_height_(72) {
+ const char* repeat = getenv("LIBYUV_REPEAT");
+ if (repeat) {
+ benchmark_iterations_ = atoi(repeat); // NOLINT
+ // For quicker unit tests the default size is 128 x 72, but when
+ // benchmarking it defaults to 720p. LIBYUV_WIDTH and LIBYUV_HEIGHT can
+ // override the size.
+ if (benchmark_iterations_ > 1) {
+ benchmark_width_ = 1280;
+ benchmark_height_ = 720;
+ }
+ }
+ const char* width = getenv("LIBYUV_WIDTH");
+ if (width) {
+ benchmark_width_ = atoi(width); // NOLINT
+ }
+ const char* height = getenv("LIBYUV_HEIGHT");
+ if (height) {
+ benchmark_height_ = atoi(height); // NOLINT
+ }
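+ // Total pixels processed across all iterations, divided by 256 and
+ // rounded up.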
+ benchmark_pixels_div256_ = static_cast<int>(
+ (static_cast<double>(benchmark_width_ *
+ benchmark_height_) * benchmark_iterations_ + 255.0) / 256.0);
+}
+
+int main(int argc, char** argv) {
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/chromium/third_party/libyuv/unit_test/unit_test.h b/chromium/third_party/libyuv/unit_test/unit_test.h
new file mode 100644
index 00000000000..e81aea30780
--- /dev/null
+++ b/chromium/third_party/libyuv/unit_test/unit_test.h
@@ -0,0 +1,88 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef UNIT_TEST_UNIT_TEST_H_ // NOLINT
+#define UNIT_TEST_UNIT_TEST_H_
+
+#include <gtest/gtest.h>
+
+#include "libyuv/basic_types.h"
+
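+// Allocates (size) bytes plus slack and rounds the pointer up to a 64-byte
+// boundary. Pair with free_aligned_buffer_64, which frees the underlying
+// allocation.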
+#define align_buffer_64(var, size) \
+ uint8* var; \
+ uint8* var##_mem; \
+ var##_mem = reinterpret_cast<uint8*>(malloc((size) + 63)); \
+ var = reinterpret_cast<uint8*> \
+ ((reinterpret_cast<intptr_t>(var##_mem) + 63) & ~63);
+
+#define free_aligned_buffer_64(var) \
+ free(var##_mem); \
+ var = 0;
+
+
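+// Allocates a whole number of 4096-byte blocks and positions the pointer so
+// that var + size lands at the end of the allocation (so reads past the end
+// run off the allocated block). Pair with free_aligned_buffer_page_end.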
+#define align_buffer_page_end(var, size) \
+ uint8* var; \
+ uint8* var##_mem; \
+ var##_mem = reinterpret_cast<uint8*>(malloc(((size) + 4095) & ~4095)); \
+ var = var##_mem + (-(size) & 4095);
+
+#define free_aligned_buffer_page_end(var) \
+ free(var##_mem); \
+ var = 0;
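+
+// Typical usage of these buffer macros in the tests above (sketch):
+//   align_buffer_64(buf, kSize);
+//   memset(buf, 0, kSize);
+//   ... call the function under test on buf ...
+//   free_aligned_buffer_64(buf)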
+
+#ifdef WIN32
+#include <windows.h>
+static inline double get_time() {
+ LARGE_INTEGER t, f;
+ QueryPerformanceCounter(&t);
+ QueryPerformanceFrequency(&f);
+ return static_cast<double>(t.QuadPart) / static_cast<double>(f.QuadPart);
+}
+
+#define random rand
+#define srandom srand
+#else
+
+#include <sys/time.h>
+#include <sys/resource.h>
+
+static inline double get_time() {
+ struct timeval t;
+ struct timezone tzp;
+ gettimeofday(&t, &tzp);
+ return t.tv_sec + t.tv_usec * 1e-6;
+}
+#endif
+
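+// Fills len bytes with pseudo-random data, four bytes at a time, with a
+// byte-wise tail for lengths that are not a multiple of four.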
+static inline void MemRandomize(uint8* dst, int len) {
+ int i;
+ for (i = 0; i < len - 3; i += 4) {
+ *reinterpret_cast<uint32*>(dst) = random();
+ dst += 4;
+ }
+ for (; i < len; ++i) {
+ *dst++ = random();
+ }
+}
+
+class libyuvTest : public ::testing::Test {
+ protected:
+ libyuvTest();
+
+ const int rotate_max_w_;
+ const int rotate_max_h_;
+
+ int benchmark_iterations_; // Default 1. Use 1000 for benchmarking.
+ int benchmark_width_; // Default 128; 1280 when benchmarking (LIBYUV_WIDTH overrides).
+ int benchmark_height_; // Default 72; 720 when benchmarking (LIBYUV_HEIGHT overrides).
+ int benchmark_pixels_div256_; // Total pixels to benchmark / 256.
+};
+
+#endif // UNIT_TEST_UNIT_TEST_H_ NOLINT
diff --git a/chromium/third_party/libyuv/unit_test/version_test.cc b/chromium/third_party/libyuv/unit_test/version_test.cc
new file mode 100644
index 00000000000..cddc0198e02
--- /dev/null
+++ b/chromium/third_party/libyuv/unit_test/version_test.cc
@@ -0,0 +1,42 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "libyuv/basic_types.h"
+#include "libyuv/version.h"
+#include "../unit_test/unit_test.h"
+
+namespace libyuv {
+
+// Tests the SVN revision against LIBYUV_VERSION in include/libyuv/version.h.
+// The two should roughly match, but a tolerance is allowed because the SVN
+// revision is also bumped by documentation-only changes.
+TEST_F(libyuvTest, TestVersion) {
+ EXPECT_GE(LIBYUV_VERSION, 169); // 169 is the first version to define LIBYUV_VERSION.
+ printf("LIBYUV_VERSION %d\n", LIBYUV_VERSION);
+#ifdef LIBYUV_SVNREVISION
+ const char *ver = strchr(LIBYUV_SVNREVISION, ':');
+ if (ver) {
+ ++ver;
+ } else {
+ ver = LIBYUV_SVNREVISION;
+ }
+ int svn_revision = atoi(ver); // NOLINT
+ printf("LIBYUV_SVNREVISION %d\n", svn_revision);
+ EXPECT_NEAR(LIBYUV_VERSION, svn_revision, 20); // Allow version to be close.
+ if (LIBYUV_VERSION != svn_revision) {
+ printf("WARNING - Versions do not match.\n");
+ }
+#endif
+}
+
+} // namespace libyuv
diff --git a/chromium/third_party/libyuv/unit_test/video_common_test.cc b/chromium/third_party/libyuv/unit_test/video_common_test.cc
new file mode 100644
index 00000000000..379d890130d
--- /dev/null
+++ b/chromium/third_party/libyuv/unit_test/video_common_test.cc
@@ -0,0 +1,108 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "libyuv/video_common.h"
+#include "../unit_test/unit_test.h"
+
+namespace libyuv {
+
+// Tests FourCC codes in video common, which are used for ConvertToI420().
+
+static bool TestValidChar(uint32 onecc) {
+ if ((onecc >= '0' && onecc <= '9') ||
+ (onecc >= 'A' && onecc <= 'Z') ||
+ (onecc >= 'a' && onecc <= 'z') ||
+ (onecc == ' ') || (onecc == 0xff)) {
+ return true;
+ }
+ return false;
+}
+
+static bool TestValidFourCC(uint32 fourcc, int bpp) {
+ if (!TestValidChar(fourcc & 0xff) ||
+ !TestValidChar((fourcc >> 8) & 0xff) ||
+ !TestValidChar((fourcc >> 16) & 0xff) ||
+ !TestValidChar((fourcc >> 24) & 0xff)) {
+ return false;
+ }
+ if (bpp < 0 || bpp > 32) {
+ return false;
+ }
+ return true;
+}
+
+TEST_F(libyuvTest, TestCanonicalFourCC) {
+ EXPECT_EQ(FOURCC_I420, CanonicalFourCC(FOURCC_IYUV));
+ EXPECT_EQ(FOURCC_I422, CanonicalFourCC(FOURCC_YU16));
+ EXPECT_EQ(FOURCC_I444, CanonicalFourCC(FOURCC_YU24));
+ EXPECT_EQ(FOURCC_YUY2, CanonicalFourCC(FOURCC_YUYV));
+ EXPECT_EQ(FOURCC_YUY2, CanonicalFourCC(FOURCC_YUVS));
+ EXPECT_EQ(FOURCC_UYVY, CanonicalFourCC(FOURCC_HDYC));
+ EXPECT_EQ(FOURCC_UYVY, CanonicalFourCC(FOURCC_2VUY));
+ EXPECT_EQ(FOURCC_MJPG, CanonicalFourCC(FOURCC_JPEG));
+ EXPECT_EQ(FOURCC_MJPG, CanonicalFourCC(FOURCC_DMB1));
+ EXPECT_EQ(FOURCC_BGGR, CanonicalFourCC(FOURCC_BA81));
+ EXPECT_EQ(FOURCC_RAW, CanonicalFourCC(FOURCC_RGB3));
+ EXPECT_EQ(FOURCC_24BG, CanonicalFourCC(FOURCC_BGR3));
+ EXPECT_EQ(FOURCC_BGRA, CanonicalFourCC(FOURCC_CM32));
+ EXPECT_EQ(FOURCC_RAW, CanonicalFourCC(FOURCC_CM24));
+}
+
+TEST_F(libyuvTest, TestFourCC) {
+ EXPECT_TRUE(TestValidFourCC(FOURCC_I420, FOURCC_BPP_I420));
+ EXPECT_TRUE(TestValidFourCC(FOURCC_I422, FOURCC_BPP_I422));
+ EXPECT_TRUE(TestValidFourCC(FOURCC_I444, FOURCC_BPP_I444));
+ EXPECT_TRUE(TestValidFourCC(FOURCC_I411, FOURCC_BPP_I411));
+ EXPECT_TRUE(TestValidFourCC(FOURCC_I400, FOURCC_BPP_I400));
+ EXPECT_TRUE(TestValidFourCC(FOURCC_NV21, FOURCC_BPP_NV21));
+ EXPECT_TRUE(TestValidFourCC(FOURCC_NV12, FOURCC_BPP_NV12));
+ EXPECT_TRUE(TestValidFourCC(FOURCC_YUY2, FOURCC_BPP_YUY2));
+ EXPECT_TRUE(TestValidFourCC(FOURCC_UYVY, FOURCC_BPP_UYVY));
+ EXPECT_TRUE(TestValidFourCC(FOURCC_M420, FOURCC_BPP_M420));
+ EXPECT_TRUE(TestValidFourCC(FOURCC_Q420, FOURCC_BPP_Q420));
+ EXPECT_TRUE(TestValidFourCC(FOURCC_ARGB, FOURCC_BPP_ARGB));
+ EXPECT_TRUE(TestValidFourCC(FOURCC_BGRA, FOURCC_BPP_BGRA));
+ EXPECT_TRUE(TestValidFourCC(FOURCC_ABGR, FOURCC_BPP_ABGR));
+ EXPECT_TRUE(TestValidFourCC(FOURCC_24BG, FOURCC_BPP_24BG));
+ EXPECT_TRUE(TestValidFourCC(FOURCC_RAW, FOURCC_BPP_RAW));
+ EXPECT_TRUE(TestValidFourCC(FOURCC_RGBA, FOURCC_BPP_RGBA));
+ EXPECT_TRUE(TestValidFourCC(FOURCC_RGBP, FOURCC_BPP_RGBP));
+ EXPECT_TRUE(TestValidFourCC(FOURCC_RGBO, FOURCC_BPP_RGBO));
+ EXPECT_TRUE(TestValidFourCC(FOURCC_R444, FOURCC_BPP_R444));
+ EXPECT_TRUE(TestValidFourCC(FOURCC_RGGB, FOURCC_BPP_RGGB));
+ EXPECT_TRUE(TestValidFourCC(FOURCC_BGGR, FOURCC_BPP_BGGR));
+ EXPECT_TRUE(TestValidFourCC(FOURCC_GRBG, FOURCC_BPP_GRBG));
+ EXPECT_TRUE(TestValidFourCC(FOURCC_GBRG, FOURCC_BPP_GBRG));
+ EXPECT_TRUE(TestValidFourCC(FOURCC_MJPG, FOURCC_BPP_MJPG));
+ EXPECT_TRUE(TestValidFourCC(FOURCC_YV12, FOURCC_BPP_YV12));
+ EXPECT_TRUE(TestValidFourCC(FOURCC_YV16, FOURCC_BPP_YV16));
+ EXPECT_TRUE(TestValidFourCC(FOURCC_YV24, FOURCC_BPP_YV24));
+ EXPECT_TRUE(TestValidFourCC(FOURCC_YU12, FOURCC_BPP_YU12));
+ EXPECT_TRUE(TestValidFourCC(FOURCC_IYUV, FOURCC_BPP_IYUV));
+ EXPECT_TRUE(TestValidFourCC(FOURCC_YU16, FOURCC_BPP_YU16));
+ EXPECT_TRUE(TestValidFourCC(FOURCC_YU24, FOURCC_BPP_YU24));
+ EXPECT_TRUE(TestValidFourCC(FOURCC_YUYV, FOURCC_BPP_YUYV));
+ EXPECT_TRUE(TestValidFourCC(FOURCC_YUVS, FOURCC_BPP_YUVS));
+ EXPECT_TRUE(TestValidFourCC(FOURCC_HDYC, FOURCC_BPP_HDYC));
+ EXPECT_TRUE(TestValidFourCC(FOURCC_2VUY, FOURCC_BPP_2VUY));
+ EXPECT_TRUE(TestValidFourCC(FOURCC_JPEG, FOURCC_BPP_JPEG));
+ EXPECT_TRUE(TestValidFourCC(FOURCC_DMB1, FOURCC_BPP_DMB1));
+ EXPECT_TRUE(TestValidFourCC(FOURCC_BA81, FOURCC_BPP_BA81));
+ EXPECT_TRUE(TestValidFourCC(FOURCC_RGB3, FOURCC_BPP_RGB3));
+ EXPECT_TRUE(TestValidFourCC(FOURCC_BGR3, FOURCC_BPP_BGR3));
+ EXPECT_TRUE(TestValidFourCC(FOURCC_H264, FOURCC_BPP_H264));
+ EXPECT_TRUE(TestValidFourCC(FOURCC_ANY, FOURCC_BPP_ANY));
+}
+
+} // namespace libyuv
diff --git a/chromium/third_party/libyuv/util/Makefile b/chromium/third_party/libyuv/util/Makefile
new file mode 100644
index 00000000000..be6de3591bd
--- /dev/null
+++ b/chromium/third_party/libyuv/util/Makefile
@@ -0,0 +1,6 @@
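+# Builds the psnr tool. Uses Intel compiler (icl) flags when CXX=icl and
+# gcc/clang style flags otherwise.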
+psnr: psnr.cc ssim.cc psnr_main.cc
+ifeq ($(CXX),icl)
+ $(CXX) /arch:SSE2 /Ox /openmp psnr.cc ssim.cc psnr_main.cc
+else
+ $(CXX) -msse2 -O3 -fopenmp -static -o psnr psnr.cc ssim.cc psnr_main.cc -Wl,--strip-all
+endif
diff --git a/chromium/third_party/libyuv/util/compare.cc b/chromium/third_party/libyuv/util/compare.cc
new file mode 100644
index 00000000000..c36c0fa5f3f
--- /dev/null
+++ b/chromium/third_party/libyuv/util/compare.cc
@@ -0,0 +1,63 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+
+#include "libyuv/basic_types.h"
+#include "libyuv/compare.h"
+#include "libyuv/version.h"
+
+int main(int argc, char** argv) {
+ if (argc < 2) {
+ printf("libyuv compare v%d\n", LIBYUV_VERSION);
+ printf("compare file1.yuv file2.yuv\n");
+ return -1;
+ }
+ char* name1 = argv[1];
+ char* name2 = (argc > 2) ? argv[2] : NULL;
+ FILE* fin1 = fopen(name1, "rb");
+ FILE* fin2 = name2 ? fopen(name2, "rb") : NULL;
+
+ const int kBlockSize = 32768;
+ uint8 buf1[kBlockSize];
+ uint8 buf2[kBlockSize];
+ uint32 hash1 = 5381;
+ uint32 hash2 = 5381;
+ uint64 sum_square_err = 0;
+ uint64 size_min = 0;
+ int amt1 = 0;
+ int amt2 = 0;
+ do {
+ amt1 = static_cast<int>(fread(buf1, 1, kBlockSize, fin1));
+ if (amt1 > 0) hash1 = libyuv::HashDjb2(buf1, amt1, hash1);
+ if (fin2) {
+ amt2 = static_cast<int>(fread(buf2, 1, kBlockSize, fin2));
+ if (amt2 > 0) hash2 = libyuv::HashDjb2(buf2, amt2, hash2);
+ int amt_min = (amt1 < amt2) ? amt1 : amt2;
+ size_min += amt_min;
+ sum_square_err += libyuv::ComputeSumSquareError(buf1, buf2, amt_min);
+ }
+ } while (amt1 > 0 || amt2 > 0);
+
+ printf("hash1 %x", hash1);
+ if (fin2) {
+ printf(", hash2 %x", hash2);
+ double mse = static_cast<double>(sum_square_err) /
+ static_cast<double>(size_min);
+ printf(", mse %.2f", mse);
+ double psnr = libyuv::SumSquareErrorToPsnr(sum_square_err, size_min);
+ printf(", psnr %.2f\n", psnr);
+ fclose(fin2);
+ }
+ fclose(fin1);
+}
diff --git a/chromium/third_party/libyuv/util/convert.cc b/chromium/third_party/libyuv/util/convert.cc
new file mode 100644
index 00000000000..18316ef8efb
--- /dev/null
+++ b/chromium/third_party/libyuv/util/convert.cc
@@ -0,0 +1,365 @@
+/*
+ * Copyright 2013 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+// Convert an ARGB image to YUV.
+// Usage: convert src_argb.raw dst_yuv.raw
+
+#ifndef _CRT_SECURE_NO_WARNINGS
+#define _CRT_SECURE_NO_WARNINGS
+#endif
+
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "libyuv/convert.h"
+#include "libyuv/planar_functions.h"
+#include "libyuv/scale_argb.h"
+
+// options
+bool verbose = false;
+bool attenuate = false;
+bool unattenuate = false;
+int image_width = 0, image_height = 0; // original width and height
+int dst_width = 0, dst_height = 0; // new width and height
+int fileindex_org = 0; // argv argument contains the original file name.
+int fileindex_rec = 0; // argv argument contains the reconstructed file name.
+int num_rec = 0; // Number of reconstructed images.
+int num_skip_org = 0; // Number of frames to skip in original.
+int num_frames = 0; // Number of frames to convert.
+int filter = 1; // Bilinear filter for scaling.
+
+static __inline uint32 Abs(int32 v) {
+ return v >= 0 ? v : -v;
+}
+
+// Parse resolution out of a PYUV-style filename, e.g. name.1920x800_24Hz_P420.yuv.
+bool ExtractResolutionFromFilename(const char* name,
+ int* width_ptr,
+ int* height_ptr) {
+ // Isolate the .width_height. section of the filename by searching for a
+ // dot or underscore followed by a digit.
+ for (int i = 0; name[i]; ++i) {
+ if ((name[i] == '.' || name[i] == '_') &&
+ name[i + 1] >= '0' && name[i + 1] <= '9') {
+ int n = sscanf(name + i + 1, "%dx%d", width_ptr, height_ptr); // NOLINT
+ if (2 == n) {
+ return true;
+ }
+ }
+ }
+ return false;
+}
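+// For example, ExtractResolutionFromFilename("clip.1920x800_24Hz_P420.yuv",
+// &width, &height) is expected to set width to 1920 and height to 800.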
+
+void PrintHelp(const char * program) {
+ printf("%s [-options] src_argb.raw dst_yuv.raw\n", program);
+ printf(" -s <width> <height> .... specify source resolution. "
+ "Optional if name contains\n"
+ " resolution (ie. "
+ "name.1920x800_24Hz_P420.yuv)\n"
+ " Negative value mirrors.\n");
+ printf(" -d <width> <height> .... specify destination resolution.\n");
+ printf(" -f <filter> ............ 0 = point, 1 = bilinear (default).\n");
+ printf(" -skip <src_argb> ....... Number of frame to skip of src_argb\n");
+ printf(" -frames <num> .......... Number of frames to convert\n");
+ printf(" -attenuate ............. Attenuate the ARGB image\n");
+ printf(" -unattenuate ........... Unattenuate the ARGB image\n");
+ printf(" -v ..................... verbose\n");
+ printf(" -h ..................... this help\n");
+ exit(0);
+}
+
+void ParseOptions(int argc, const char* argv[]) {
+ if (argc <= 1) PrintHelp(argv[0]);
+ for (int c = 1; c < argc; ++c) {
+ if (!strcmp(argv[c], "-v")) {
+ verbose = true;
+ } else if (!strcmp(argv[c], "-attenuate")) {
+ attenuate = true;
+ } else if (!strcmp(argv[c], "-unattenuate")) {
+ unattenuate = true;
+ } else if (!strcmp(argv[c], "-h") || !strcmp(argv[c], "-help")) {
+ PrintHelp(argv[0]);
+ } else if (!strcmp(argv[c], "-s") && c + 2 < argc) {
+ image_width = atoi(argv[++c]); // NOLINT
+ image_height = atoi(argv[++c]); // NOLINT
+ } else if (!strcmp(argv[c], "-d") && c + 2 < argc) {
+ dst_width = atoi(argv[++c]); // NOLINT
+ dst_height = atoi(argv[++c]); // NOLINT
+ } else if (!strcmp(argv[c], "-skip") && c + 1 < argc) {
+ num_skip_org = atoi(argv[++c]); // NOLINT
+ } else if (!strcmp(argv[c], "-frames") && c + 1 < argc) {
+ num_frames = atoi(argv[++c]); // NOLINT
+ } else if (!strcmp(argv[c], "-f") && c + 1 < argc) {
+ filter = atoi(argv[++c]); // NOLINT
+ } else if (argv[c][0] == '-') {
+ fprintf(stderr, "Unknown option. %s\n", argv[c]);
+ } else if (fileindex_org == 0) {
+ fileindex_org = c;
+ } else if (fileindex_rec == 0) {
+ fileindex_rec = c;
+ num_rec = 1;
+ } else {
+ ++num_rec;
+ }
+ }
+ if (fileindex_org == 0 || fileindex_rec == 0) {
+ fprintf(stderr, "Missing filenames\n");
+ PrintHelp(argv[0]);
+ }
+ if (num_skip_org < 0) {
+ fprintf(stderr, "Skipped frames incorrect\n");
+ PrintHelp(argv[0]);
+ }
+ if (num_frames < 0) {
+ fprintf(stderr, "Number of frames incorrect\n");
+ PrintHelp(argv[0]);
+ }
+
+ int org_width, org_height;
+ int rec_width, rec_height;
+ bool org_res_avail = ExtractResolutionFromFilename(argv[fileindex_org],
+ &org_width,
+ &org_height);
+ bool rec_res_avail = ExtractResolutionFromFilename(argv[fileindex_rec],
+ &rec_width,
+ &rec_height);
+ if (image_width == 0 || image_height == 0) {
+ if (org_res_avail) {
+ image_width = org_width;
+ image_height = org_height;
+ } else if (rec_res_avail) {
+ image_width = rec_width;
+ image_height = rec_height;
+ } else {
+ fprintf(stderr, "Missing dimensions.\n");
+ PrintHelp(argv[0]);
+ }
+ }
+ if (dst_width == 0 || dst_height == 0) {
+ if (rec_res_avail) {
+ dst_width = rec_width;
+ dst_height = rec_height;
+ } else {
+ dst_width = Abs(image_width);
+ dst_height = Abs(image_height);
+ }
+ }
+}
+
+static const int kTileX = 12;
+static const int kTileY = 8;
+
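+// Scales the source to the destination one kTileX x kTileY tile at a time
+// using ARGBScaleClip.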
+static int TileARGBScale(const uint8* src_argb, int src_stride_argb,
+ int src_width, int src_height,
+ uint8* dst_argb, int dst_stride_argb,
+ int dst_width, int dst_height,
+ libyuv::FilterMode filtering) {
+ for (int y = 0; y < dst_height; y += kTileY) {
+ for (int x = 0; x < dst_width; x += kTileX) {
+ int clip_width = kTileX;
+ if (x + clip_width > dst_width) {
+ clip_width = dst_width - x;
+ }
+ int clip_height = kTileY;
+ if (y + clip_height > dst_height) {
+ clip_height = dst_height - y;
+ }
+ int r = libyuv::ARGBScaleClip(src_argb, src_stride_argb,
+ src_width, src_height,
+ dst_argb, dst_stride_argb,
+ dst_width, dst_height,
+ x, y, clip_width, clip_height, filtering);
+ if (r) {
+ return r;
+ }
+ }
+ }
+ return 0;
+}
+
+int main(int argc, const char* argv[]) {
+ ParseOptions(argc, argv);
+
+ // Open original file (first file argument)
+ FILE* const file_org = fopen(argv[fileindex_org], "rb");
+ if (file_org == NULL) {
+ fprintf(stderr, "Cannot open %s\n", argv[fileindex_org]);
+ exit(1);
+ }
+
+ // Open all files to convert to
+ FILE** file_rec = new FILE* [num_rec];
+ memset(file_rec, 0, num_rec * sizeof(FILE*)); // NOLINT
+ for (int cur_rec = 0; cur_rec < num_rec; ++cur_rec) {
+ file_rec[cur_rec] = fopen(argv[fileindex_rec + cur_rec], "wb");
+ if (file_rec[cur_rec] == NULL) {
+ fprintf(stderr, "Cannot open %s\n", argv[fileindex_rec + cur_rec]);
+ fclose(file_org);
+ for (int i = 0; i < cur_rec; ++i) {
+ fclose(file_rec[i]);
+ }
+ delete[] file_rec;
+ exit(1);
+ }
+ }
+
+ bool org_is_yuv = strstr(argv[fileindex_org], "_P420.") != NULL;
+ bool org_is_argb = strstr(argv[fileindex_org], "_ARGB.") != NULL;
+ if (!org_is_yuv && !org_is_argb) {
+ fprintf(stderr, "Original format unknown %s\n", argv[fileindex_org]);
+ exit(1);
+ }
+ int org_size = Abs(image_width) * Abs(image_height) * 4; // ARGB
+ // Input is YUV
+ if (org_is_yuv) {
+ const int y_size = Abs(image_width) * Abs(image_height);
+ const int uv_size = ((Abs(image_width) + 1) / 2) *
+ ((Abs(image_height) + 1) / 2);
+ org_size = y_size + 2 * uv_size; // YUV original.
+ }
+
+ const int dst_size = dst_width * dst_height * 4; // ARGB scaled
+ const int y_size = dst_width * dst_height;
+ const int uv_size = ((dst_width + 1) / 2) * ((dst_height + 1) / 2);
+ const size_t total_size = y_size + 2 * uv_size;
+#if defined(_MSC_VER)
+ _fseeki64(file_org,
+ static_cast<__int64>(num_skip_org) *
+ static_cast<__int64>(org_size), SEEK_SET);
+#else
+ fseek(file_org, num_skip_org * org_size, SEEK_SET);
+#endif
+
+ uint8* const ch_org = new uint8[org_size];
+ uint8* const ch_dst = new uint8[dst_size];
+ uint8* const ch_rec = new uint8[total_size];
+ if (ch_org == NULL || ch_dst == NULL || ch_rec == NULL) {
+ fprintf(stderr, "No memory available\n");
+ fclose(file_org);
+ for (int i = 0; i < num_rec; ++i) {
+ fclose(file_rec[i]);
+ }
+ delete[] ch_org;
+ delete[] ch_dst;
+ delete[] ch_rec;
+ delete[] file_rec;
+ exit(1);
+ }
+
+ if (verbose) {
+ printf("Size: %dx%d to %dx%d\n", image_width, image_height,
+ dst_width, dst_height);
+ }
+
+ int number_of_frames;
+ for (number_of_frames = 0; ; ++number_of_frames) {
+ if (num_frames && number_of_frames >= num_frames)
+ break;
+
+ // Load original YUV or ARGB frame.
+ size_t bytes_org = fread(ch_org, sizeof(uint8),
+ static_cast<size_t>(org_size), file_org);
+ if (bytes_org < static_cast<size_t>(org_size))
+ break;
+
+ // TODO(fbarchard): Attenuate doesn't need to know dimensions.
+ // ARGB attenuate frame
+ if (org_is_argb && attenuate) {
+ libyuv::ARGBAttenuate(ch_org, 0, ch_org, 0, org_size / 4, 1);
+ }
+ // ARGB unattenuate frame
+ if (org_is_argb && unattenuate) {
+ libyuv::ARGBUnattenuate(ch_org, 0, ch_org, 0, org_size / 4, 1);
+ }
+
+ for (int cur_rec = 0; cur_rec < num_rec; ++cur_rec) {
+ // Scale YUV or ARGB frame.
+ if (org_is_yuv) {
+ int src_width = Abs(image_width);
+ int src_height = Abs(image_height);
+ int half_src_width = (src_width + 1) / 2;
+ int half_src_height = (src_height + 1) / 2;
+ int half_dst_width = (dst_width + 1) / 2;
+ int half_dst_height = (dst_height + 1) / 2;
+ I420Scale(ch_org, src_width,
+ ch_org + src_width * src_height, half_src_width,
+ ch_org + src_width * src_height +
+ half_src_width * half_src_height, half_src_width,
+ image_width, image_height,
+ ch_rec, dst_width,
+ ch_rec + dst_width * dst_height, half_dst_width,
+ ch_rec + dst_width * dst_height +
+ half_dst_width * half_dst_height, half_dst_width,
+ dst_width, dst_height,
+ static_cast<libyuv::FilterMode>(filter));
+ } else {
+ TileARGBScale(ch_org, Abs(image_width) * 4,
+ image_width, image_height,
+ ch_dst, dst_width * 4,
+ dst_width, dst_height,
+ static_cast<libyuv::FilterMode>(filter));
+ }
+ bool rec_is_yuv = strstr(argv[fileindex_rec + cur_rec], "_P420.") != NULL;
+ bool rec_is_argb =
+ strstr(argv[fileindex_rec + cur_rec], "_ARGB.") != NULL;
+ if (!rec_is_yuv && !rec_is_argb) {
+ fprintf(stderr, "Output format unknown %s\n",
+ argv[fileindex_rec + cur_rec]);
+ continue; // Advance to next file.
+ }
+
+ // Convert ARGB to YUV.
+ if (!org_is_yuv && rec_is_yuv) {
+ int half_width = (dst_width + 1) / 2;
+ int half_height = (dst_height + 1) / 2;
+ libyuv::ARGBToI420(ch_dst, dst_width * 4,
+ ch_rec, dst_width,
+ ch_rec + dst_width * dst_height, half_width,
+ ch_rec + dst_width * dst_height +
+ half_width * half_height, half_width,
+ dst_width, dst_height);
+ }
+
+ // Output YUV or ARGB frame.
+ if (rec_is_yuv) {
+ size_t bytes_rec = fwrite(ch_rec, sizeof(uint8),
+ static_cast<size_t>(total_size),
+ file_rec[cur_rec]);
+ if (bytes_rec < static_cast<size_t>(total_size))
+ break;
+ } else {
+ size_t bytes_rec = fwrite(ch_dst, sizeof(uint8),
+ static_cast<size_t>(dst_size),
+ file_rec[cur_rec]);
+ if (bytes_rec < static_cast<size_t>(dst_size))
+ break;
+ }
+      if (verbose) {
+        printf("%5d", number_of_frames);
+        printf("\t%s\n", argv[fileindex_rec + cur_rec]);
+      }
+ }
+ }
+
+ fclose(file_org);
+ for (int cur_rec = 0; cur_rec < num_rec; ++cur_rec) {
+ fclose(file_rec[cur_rec]);
+ }
+ delete[] ch_org;
+ delete[] ch_dst;
+ delete[] ch_rec;
+ delete[] file_rec;
+ return 0;
+}
diff --git a/chromium/third_party/libyuv/util/cpuid.c b/chromium/third_party/libyuv/util/cpuid.c
new file mode 100644
index 00000000000..8d8529ba7c6
--- /dev/null
+++ b/chromium/third_party/libyuv/util/cpuid.c
@@ -0,0 +1,92 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#define INCLUDE_LIBYUV_COMPARE_H_
+#include "libyuv.h"
+#include "./psnr.h"
+#include "./ssim.h"
+
+int main(int argc, const char* argv[]) {
+ int cpu_flags = TestCpuFlag(-1);
+ int has_arm = TestCpuFlag(kCpuHasARM);
+ int has_mips = TestCpuFlag(kCpuHasMIPS);
+ int has_x86 = TestCpuFlag(kCpuHasX86);
+#if defined(__i386__) || defined(__x86_64__) || \
+ defined(_M_IX86) || defined(_M_X64)
+ if (has_x86) {
+ int family, model, cpu_info[4];
+ // Vendor ID:
+ // AuthenticAMD AMD processor
+ // CentaurHauls Centaur processor
+ // CyrixInstead Cyrix processor
+ // GenuineIntel Intel processor
+ // GenuineTMx86 Transmeta processor
+ // Geode by NSC National Semiconductor processor
+ // NexGenDriven NexGen processor
+ // RiseRiseRise Rise Technology processor
+ // SiS SiS SiS SiS processor
+ // UMC UMC UMC UMC processor
+ CpuId(cpu_info, 0);
+    cpu_info[0] = cpu_info[1];  // Reorder to EBX, EDX, ECX for the vendor string.
+ cpu_info[1] = cpu_info[3];
+ cpu_info[3] = 0;
+ printf("Cpu Vendor: %s\n", (char*)(&cpu_info[0]));
+
+ // CPU Family and Model
+ // 3:0 - Stepping
+ // 7:4 - Model
+ // 11:8 - Family
+ // 13:12 - Processor Type
+ // 19:16 - Extended Model
+ // 27:20 - Extended Family
+ CpuId(cpu_info, 1);
+ family = ((cpu_info[0] >> 8) & 0x0f) | ((cpu_info[0] >> 16) & 0xff0);
+ model = ((cpu_info[0] >> 4) & 0x0f) | ((cpu_info[0] >> 12) & 0xf0);
+ printf("Cpu Family %d (0x%x), Model %d (0x%x)\n", family, family,
+ model, model);
+ }
+#endif
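+  // Worked example of the decode above (hypothetical leaf-1 value): with
+  // cpu_info[0] = 0x000306A9, stepping = 9, family = 0x6 | 0x000 = 6 and
+  // model = 0xA | 0x30 = 0x3A, printed as "Cpu Family 6 (0x6), Model 58 (0x3a)".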
+ printf("Cpu Flags %x\n", cpu_flags);
+ printf("Has ARM %x\n", has_arm);
+ printf("Has MIPS %x\n", has_mips);
+ printf("Has X86 %x\n", has_x86);
+ if (has_arm) {
+ int has_neon = TestCpuFlag(kCpuHasNEON);
+ printf("Has NEON %x\n", has_neon);
+ }
+ if (has_mips) {
+ int has_mips_dsp = TestCpuFlag(kCpuHasMIPS_DSP);
+ int has_mips_dspr2 = TestCpuFlag(kCpuHasMIPS_DSPR2);
+ printf("Has MIPS DSP %x\n", has_mips_dsp);
+ printf("Has MIPS DSPR2 %x\n", has_mips_dspr2);
+ }
+ if (has_x86) {
+ int has_sse2 = TestCpuFlag(kCpuHasSSE2);
+ int has_ssse3 = TestCpuFlag(kCpuHasSSSE3);
+ int has_sse41 = TestCpuFlag(kCpuHasSSE41);
+ int has_sse42 = TestCpuFlag(kCpuHasSSE42);
+ int has_avx = TestCpuFlag(kCpuHasAVX);
+ int has_avx2 = TestCpuFlag(kCpuHasAVX2);
+ int has_erms = TestCpuFlag(kCpuHasERMS);
+ printf("Has SSE2 %x\n", has_sse2);
+ printf("Has SSSE3 %x\n", has_ssse3);
+ printf("Has SSE4.1 %x\n", has_sse41);
+ printf("Has SSE4.2 %x\n", has_sse42);
+ printf("Has AVX %x\n", has_avx);
+ printf("Has AVX2 %x\n", has_avx2);
+ printf("Has ERMS %x\n", has_erms);
+ }
+ return 0;
+}
+
diff --git a/chromium/third_party/libyuv/util/psnr.cc b/chromium/third_party/libyuv/util/psnr.cc
new file mode 100644
index 00000000000..e8fd16a3e1f
--- /dev/null
+++ b/chromium/third_party/libyuv/util/psnr.cc
@@ -0,0 +1,247 @@
+/*
+ * Copyright 2013 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./psnr.h" // NOLINT
+
+#include <math.h>
+
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+#ifdef _MSC_VER
+#include <intrin.h> // For __cpuid()
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef unsigned int uint32; // NOLINT
+#ifdef _MSC_VER
+typedef unsigned __int64 uint64;
+#else // COMPILER_MSVC
+#if defined(__LP64__) && !defined(__OpenBSD__) && !defined(__APPLE__)
+typedef unsigned long uint64; // NOLINT
+#else // defined(__LP64__) && !defined(__OpenBSD__) && !defined(__APPLE__)
+typedef unsigned long long uint64; // NOLINT
+#endif // __LP64__
+#endif // _MSC_VER
+
+// PSNR formula: psnr = 10 * log10 (Peak Signal^2 * size / sse)
+double ComputePSNR(double sse, double size) {
+ const double kMINSSE = 255.0 * 255.0 * size / pow(10., kMaxPSNR / 10.);
+ if (sse <= kMINSSE)
+ sse = kMINSSE; // Produces max PSNR of 128
+ return 10.0 * log10(65025.0 * size / sse);
+}
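+// Worked example: for a 100x100 plane (size = 10000.0) an sse of 10000.0 is a
+// mean square error of 1.0 and gives 10 * log10(65025.0), roughly 48.13 dB.
+// An sse at or below kMINSSE is clamped so the result never exceeds kMaxPSNR.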
+
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)
+#define HAS_SUMSQUAREERROR_NEON
+static uint32 SumSquareError_NEON(const uint8* src_a,
+ const uint8* src_b, int count) {
+ volatile uint32 sse;
+ asm volatile ( // NOLINT
+ "vmov.u8 q7, #0 \n"
+ "vmov.u8 q9, #0 \n"
+ "vmov.u8 q8, #0 \n"
+ "vmov.u8 q10, #0 \n"
+
+ "1: \n"
+ "vld1.u8 {q0}, [%0]! \n"
+ "vld1.u8 {q1}, [%1]! \n"
+ "vsubl.u8 q2, d0, d2 \n"
+ "vsubl.u8 q3, d1, d3 \n"
+ "vmlal.s16 q7, d4, d4 \n"
+ "vmlal.s16 q8, d6, d6 \n"
+ "vmlal.s16 q8, d5, d5 \n"
+ "vmlal.s16 q10, d7, d7 \n"
+ "subs %2, %2, #16 \n"
+ "bhi 1b \n"
+
+ "vadd.u32 q7, q7, q8 \n"
+ "vadd.u32 q9, q9, q10 \n"
+ "vadd.u32 q10, q7, q9 \n"
+ "vpaddl.u32 q1, q10 \n"
+ "vadd.u64 d0, d2, d3 \n"
+ "vmov.32 %3, d0[0] \n"
+ : "+r"(src_a),
+ "+r"(src_b),
+ "+r"(count),
+ "=r"(sse)
+ :
+ : "memory", "cc", "q0", "q1", "q2", "q3", "q7", "q8", "q9", "q10");
+ return sse;
+}
+#elif !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
+#define HAS_SUMSQUAREERROR_SSE2
+__declspec(naked)
+static uint32 SumSquareError_SSE2(const uint8* /*src_a*/,
+ const uint8* /*src_b*/, int /*count*/) {
+ __asm {
+ mov eax, [esp + 4] // src_a
+ mov edx, [esp + 8] // src_b
+ mov ecx, [esp + 12] // count
+ pxor xmm0, xmm0
+ pxor xmm5, xmm5
+ sub edx, eax
+
+ wloop:
+ movdqu xmm1, [eax]
+ movdqu xmm2, [eax + edx]
+ lea eax, [eax + 16]
+ movdqu xmm3, xmm1
+ psubusb xmm1, xmm2
+ psubusb xmm2, xmm3
+ por xmm1, xmm2
+ movdqu xmm2, xmm1
+ punpcklbw xmm1, xmm5
+ punpckhbw xmm2, xmm5
+ pmaddwd xmm1, xmm1
+ pmaddwd xmm2, xmm2
+ paddd xmm0, xmm1
+ paddd xmm0, xmm2
+ sub ecx, 16
+ ja wloop
+
+ pshufd xmm1, xmm0, 0EEh
+ paddd xmm0, xmm1
+ pshufd xmm1, xmm0, 01h
+ paddd xmm0, xmm1
+ movd eax, xmm0
+ ret
+ }
+}
+#elif !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
+#define HAS_SUMSQUAREERROR_SSE2
+static uint32 SumSquareError_SSE2(const uint8* src_a,
+ const uint8* src_b, int count) {
+ uint32 sse;
+ asm volatile ( // NOLINT
+ "pxor %%xmm0,%%xmm0 \n"
+ "pxor %%xmm5,%%xmm5 \n"
+ "sub %0,%1 \n"
+
+ "1: \n"
+ "movdqu (%0),%%xmm1 \n"
+ "movdqu (%0,%1,1),%%xmm2 \n"
+ "lea 0x10(%0),%0 \n"
+ "movdqu %%xmm1,%%xmm3 \n"
+ "psubusb %%xmm2,%%xmm1 \n"
+ "psubusb %%xmm3,%%xmm2 \n"
+ "por %%xmm2,%%xmm1 \n"
+ "movdqu %%xmm1,%%xmm2 \n"
+ "punpcklbw %%xmm5,%%xmm1 \n"
+ "punpckhbw %%xmm5,%%xmm2 \n"
+ "pmaddwd %%xmm1,%%xmm1 \n"
+ "pmaddwd %%xmm2,%%xmm2 \n"
+ "paddd %%xmm1,%%xmm0 \n"
+ "paddd %%xmm2,%%xmm0 \n"
+ "sub $0x10,%2 \n"
+ "ja 1b \n"
+
+ "pshufd $0xee,%%xmm0,%%xmm1 \n"
+ "paddd %%xmm1,%%xmm0 \n"
+ "pshufd $0x1,%%xmm0,%%xmm1 \n"
+ "paddd %%xmm1,%%xmm0 \n"
+ "movd %%xmm0,%3 \n"
+
+ : "+r"(src_a), // %0
+ "+r"(src_b), // %1
+ "+r"(count), // %2
+ "=g"(sse) // %3
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+#endif
+ ); // NOLINT
+ return sse;
+}
+#endif // LIBYUV_DISABLE_X86 etc
+
+#if defined(HAS_SUMSQUAREERROR_SSE2)
+#if (defined(__pic__) || defined(__APPLE__)) && defined(__i386__)
+static __inline void __cpuid(int cpu_info[4], int info_type) {
+ asm volatile ( // NOLINT
+ "mov %%ebx, %%edi \n"
+ "cpuid \n"
+ "xchg %%edi, %%ebx \n"
+ : "=a"(cpu_info[0]), "=D"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
+ : "a"(info_type));
+}
+#elif defined(__i386__) || defined(__x86_64__)
+static __inline void __cpuid(int cpu_info[4], int info_type) {
+ asm volatile ( // NOLINT
+ "cpuid \n"
+ : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
+ : "a"(info_type));
+}
+#endif
+
+static int CpuHasSSE2() {
+#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86)
+ int cpu_info[4];
+ __cpuid(cpu_info, 1);
+ if (cpu_info[3] & 0x04000000) {
+ return 1;
+ }
+#endif
+ return 0;
+}
+#endif // HAS_SUMSQUAREERROR_SSE2
+
+static uint32 SumSquareError_C(const uint8* src_a,
+ const uint8* src_b, int count) {
+ uint32 sse = 0u;
+ for (int x = 0; x < count; ++x) {
+ int diff = src_a[x] - src_b[x];
+ sse += static_cast<uint32>(diff * diff);
+ }
+ return sse;
+}
+
+double ComputeSumSquareError(const uint8* src_a,
+ const uint8* src_b, int count) {
+ uint32 (*SumSquareError)(const uint8* src_a,
+ const uint8* src_b, int count) = SumSquareError_C;
+#if defined(HAS_SUMSQUAREERROR_NEON)
+ SumSquareError = SumSquareError_NEON;
+#endif
+#if defined(HAS_SUMSQUAREERROR_SSE2)
+ if (CpuHasSSE2()) {
+ SumSquareError = SumSquareError_SSE2;
+ }
+#endif
+ const int kBlockSize = 1 << 15;
+ uint64 sse = 0;
+#ifdef _OPENMP
+#pragma omp parallel for reduction(+: sse)
+#endif
+ for (int i = 0; i < (count - (kBlockSize - 1)); i += kBlockSize) {
+ sse += SumSquareError(src_a + i, src_b + i, kBlockSize);
+ }
+ src_a += count & ~(kBlockSize - 1);
+ src_b += count & ~(kBlockSize - 1);
+ int remainder = count & (kBlockSize - 1) & ~15;
+ if (remainder) {
+ sse += SumSquareError(src_a, src_b, remainder);
+ src_a += remainder;
+ src_b += remainder;
+ }
+ remainder = count & 15;
+ if (remainder) {
+ sse += SumSquareError_C(src_a, src_b, remainder);
+ }
+ return static_cast<double>(sse);
+}
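+// Worked example of the split above (illustrative count): with count = 70005,
+// the parallel loop covers two full 32768-byte blocks (65536 bytes), the next
+// pass covers 4469 & ~15 = 4464 bytes with the SIMD path, and the last
+// 70005 & 15 = 5 bytes fall through to SumSquareError_C.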
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
diff --git a/chromium/third_party/libyuv/util/psnr.h b/chromium/third_party/libyuv/util/psnr.h
new file mode 100644
index 00000000000..2cd0b1457ce
--- /dev/null
+++ b/chromium/third_party/libyuv/util/psnr.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright 2013 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+// Get PSNR for video sequence. Assuming RAW 4:2:0 Y:Cb:Cr format
+
+#ifndef UTIL_PSNR_H_
+#define UTIL_PSNR_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if !defined(INT_TYPES_DEFINED) && !defined(UINT8_TYPE_DEFINED)
+typedef unsigned char uint8;
+#define UINT8_TYPE_DEFINED
+#endif
+
+static const double kMaxPSNR = 128.0;
+
+// PSNR formula: psnr = 10 * log10 (Peak Signal^2 * size / sse).
+// Returns 128.0 (kMaxPSNR) if sse is 0 (perfect match).
+double ComputePSNR(double sse, double size);
+
+// Compute Sum of Squared Error (SSE).
+// Pass this to ComputePSNR for final result.
+double ComputeSumSquareError(const uint8* org, const uint8* rec, int size);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // UTIL_PSNR_H_
diff --git a/chromium/third_party/libyuv/util/psnr_main.cc b/chromium/third_party/libyuv/util/psnr_main.cc
new file mode 100644
index 00000000000..9cee5f8287a
--- /dev/null
+++ b/chromium/third_party/libyuv/util/psnr_main.cc
@@ -0,0 +1,561 @@
+/*
+ * Copyright 2013 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+// Get PSNR or SSIM for video sequence. Assuming RAW 4:2:0 Y:Cb:Cr format
+// To build: g++ -O3 -o psnr psnr.cc ssim.cc psnr_main.cc
+// or VisualC: cl /Ox psnr.cc ssim.cc psnr_main.cc
+//
+// To enable OpenMP and SSE2
+// gcc: g++ -msse2 -O3 -fopenmp -o psnr psnr.cc ssim.cc psnr_main.cc
+// vc: cl /arch:SSE2 /Ox /openmp psnr.cc ssim.cc psnr_main.cc
+//
+// Usage: psnr org_seq rec_seq -s width height [-skip skip_org skip_rec]
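+//
+// Example invocation (hypothetical file names), comparing 30 frames of two
+// P420 clips and reporting PSNR (the default) plus SSIM:
+//   psnr -ssim -frames 30 org.1280x720_P420.yuv rec.1280x720_P420.yuv
+// The resolution is parsed from the "1280x720" token, so -s may be omitted.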
+
+#ifndef _CRT_SECURE_NO_WARNINGS
+#define _CRT_SECURE_NO_WARNINGS
+#endif
+
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+#include "./psnr.h"
+#include "./ssim.h"
+
+struct metric {
+ double y, u, v, all;
+ double min_y, min_u, min_v, min_all;
+ double global_y, global_u, global_v, global_all;
+ int min_frame;
+};
+
+// options
+bool verbose = false;
+bool quiet = false;
+bool show_name = false;
+bool do_swap_uv = false;
+bool do_psnr = false;
+bool do_ssim = false;
+bool do_mse = false;
+bool do_lssim = false;
+int image_width = 0, image_height = 0;
+int fileindex_org = 0; // argv argument contains the source file name.
+int fileindex_rec = 0; // argv argument contains the destination file name.
+int num_rec = 0;
+int num_skip_org = 0;
+int num_skip_rec = 0;
+int num_frames = 0;
+#ifdef _OPENMP
+int num_threads = 0;
+#endif
+
+// Parse PYUV format, e.g. name.1920x800_24Hz_P420.yuv
+bool ExtractResolutionFromFilename(const char* name,
+ int* width_ptr,
+ int* height_ptr) {
+ // Isolate the .width_height. section of the filename by searching for a
+ // dot or underscore followed by a digit.
+ for (int i = 0; name[i]; ++i) {
+ if ((name[i] == '.' || name[i] == '_') &&
+ name[i + 1] >= '0' && name[i + 1] <= '9') {
+ int n = sscanf(name + i + 1, "%dx%d", width_ptr, height_ptr); // NOLINT
+ if (2 == n) {
+ return true;
+ }
+ }
+ }
+ return false;
+}
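+// Usage sketch (hypothetical name): ExtractResolutionFromFilename(
+//     "clip.640x360_30Hz_P420.yuv", &width, &height) returns true with
+// width = 640 and height = 360; names without a WxH token return false.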
+
+// Scale Y channel from 16..240 to 0..255.
+// This can be useful when comparing codecs that are inconsistent about Y range.
+uint8 ScaleY(uint8 y) {
+ int ny = (y - 16) * 256 / 224;
+ if (ny < 0) ny = 0;
+ if (ny > 255) ny = 255;
+ return static_cast<uint8>(ny);
+}
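+// For example, ScaleY(16) = 0 and ScaleY(128) = 128, while ScaleY(240) maps to
+// 256 and is clamped to 255; values below 16 clamp to 0.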
+
+// MSE = Mean Square Error
+double GetMSE(double sse, double size) {
+ return sse / size;
+}
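+// For 8-bit data MSE relates to PSNR as psnr = 10 * log10(255 * 255 / mse),
+// so an MSE of 1.0 corresponds to roughly 48.13 dB.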
+
+void PrintHelp(const char * program) {
+ printf("%s [-options] org_seq rec_seq [rec_seq2.. etc]\n", program);
+ printf("options:\n");
+ printf(" -s <width> <height> .... specify YUV size, mandatory if none of the "
+ "sequences have the\n");
+ printf(" resolution embedded in their filename (ie. "
+ "name.1920x800_24Hz_P420.yuv)\n");
+ printf(" -psnr .................. compute PSNR (default)\n");
+ printf(" -ssim .................. compute SSIM\n");
+ printf(" -mse ................... compute MSE\n");
+  printf(" -swap .................. Swap U and V planes\n");
+  printf(" -skip <org> <rec> ...... Number of frames to skip in org and rec\n");
+ printf(" -frames <num> .......... Number of frames to compare\n");
+#ifdef _OPENMP
+ printf(" -t <num> ............... Number of threads\n");
+#endif
+ printf(" -n ..................... Show file name\n");
+ printf(" -v ..................... verbose++\n");
+ printf(" -q ..................... quiet\n");
+ printf(" -h ..................... this help\n");
+ exit(0);
+}
+
+void ParseOptions(int argc, const char* argv[]) {
+ if (argc <= 1) PrintHelp(argv[0]);
+ for (int c = 1; c < argc; ++c) {
+ if (!strcmp(argv[c], "-v")) {
+ verbose = true;
+ } else if (!strcmp(argv[c], "-q")) {
+ quiet = true;
+ } else if (!strcmp(argv[c], "-n")) {
+ show_name = true;
+ } else if (!strcmp(argv[c], "-psnr")) {
+ do_psnr = true;
+ } else if (!strcmp(argv[c], "-mse")) {
+ do_mse = true;
+ } else if (!strcmp(argv[c], "-ssim")) {
+ do_ssim = true;
+ } else if (!strcmp(argv[c], "-lssim")) {
+ do_ssim = true;
+ do_lssim = true;
+ } else if (!strcmp(argv[c], "-swap")) {
+ do_swap_uv = true;
+ } else if (!strcmp(argv[c], "-h") || !strcmp(argv[c], "-help")) {
+ PrintHelp(argv[0]);
+ } else if (!strcmp(argv[c], "-s") && c + 2 < argc) {
+ image_width = atoi(argv[++c]); // NOLINT
+ image_height = atoi(argv[++c]); // NOLINT
+ } else if (!strcmp(argv[c], "-skip") && c + 2 < argc) {
+ num_skip_org = atoi(argv[++c]); // NOLINT
+ num_skip_rec = atoi(argv[++c]); // NOLINT
+ } else if (!strcmp(argv[c], "-frames") && c + 1 < argc) {
+ num_frames = atoi(argv[++c]); // NOLINT
+#ifdef _OPENMP
+ } else if (!strcmp(argv[c], "-t") && c + 1 < argc) {
+ num_threads = atoi(argv[++c]); // NOLINT
+#endif
+ } else if (argv[c][0] == '-') {
+ fprintf(stderr, "Unknown option. %s\n", argv[c]);
+ } else if (fileindex_org == 0) {
+ fileindex_org = c;
+ } else if (fileindex_rec == 0) {
+ fileindex_rec = c;
+ num_rec = 1;
+ } else {
+ ++num_rec;
+ }
+ }
+ if (fileindex_org == 0 || fileindex_rec == 0) {
+ fprintf(stderr, "Missing filenames\n");
+ PrintHelp(argv[0]);
+ }
+ if (num_skip_org < 0 || num_skip_rec < 0) {
+ fprintf(stderr, "Skipped frames incorrect\n");
+ PrintHelp(argv[0]);
+ }
+ if (num_frames < 0) {
+ fprintf(stderr, "Number of frames incorrect\n");
+ PrintHelp(argv[0]);
+ }
+ if (image_width == 0 || image_height == 0) {
+ int org_width, org_height;
+ int rec_width, rec_height;
+ bool org_res_avail = ExtractResolutionFromFilename(argv[fileindex_org],
+ &org_width,
+ &org_height);
+ bool rec_res_avail = ExtractResolutionFromFilename(argv[fileindex_rec],
+ &rec_width,
+ &rec_height);
+ if (org_res_avail) {
+ if (rec_res_avail) {
+ if ((org_width == rec_width) && (org_height == rec_height)) {
+ image_width = org_width;
+ image_height = org_height;
+ } else {
+ fprintf(stderr, "Sequences have different resolutions.\n");
+ PrintHelp(argv[0]);
+ }
+ } else {
+ image_width = org_width;
+ image_height = org_height;
+ }
+ } else if (rec_res_avail) {
+ image_width = rec_width;
+ image_height = rec_height;
+ } else {
+ fprintf(stderr, "Missing dimensions.\n");
+ PrintHelp(argv[0]);
+ }
+ }
+}
+
+bool UpdateMetrics(uint8* ch_org, uint8* ch_rec,
+ const int y_size, const int uv_size, const size_t total_size,
+ int number_of_frames,
+ metric* cur_distortion_psnr,
+ metric* distorted_frame, bool do_psnr) {
+ const int uv_offset = (do_swap_uv ? uv_size : 0);
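+  // ch_org and ch_rec each hold one I420 frame laid out as [Y | U | V]. With
+  // -swap, uv_offset redirects u_org below to the original's V plane and
+  // v_org to its U plane, so chroma is swapped for the original sequence only.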
+ const uint8* const u_org = ch_org + y_size + uv_offset;
+ const uint8* const u_rec = ch_rec + y_size;
+ const uint8* const v_org = ch_org + y_size + (uv_size - uv_offset);
+ const uint8* const v_rec = ch_rec + y_size + uv_size;
+ if (do_psnr) {
+ double y_err = ComputeSumSquareError(ch_org, ch_rec, y_size);
+ double u_err = ComputeSumSquareError(u_org, u_rec, uv_size);
+ double v_err = ComputeSumSquareError(v_org, v_rec, uv_size);
+ const double total_err = y_err + u_err + v_err;
+ cur_distortion_psnr->global_y += y_err;
+ cur_distortion_psnr->global_u += u_err;
+ cur_distortion_psnr->global_v += v_err;
+ cur_distortion_psnr->global_all += total_err;
+ distorted_frame->y = ComputePSNR(y_err, static_cast<double>(y_size));
+ distorted_frame->u = ComputePSNR(u_err, static_cast<double>(uv_size));
+ distorted_frame->v = ComputePSNR(v_err, static_cast<double>(uv_size));
+ distorted_frame->all = ComputePSNR(total_err,
+ static_cast<double>(total_size));
+ } else {
+ distorted_frame->y = CalcSSIM(ch_org, ch_rec, image_width, image_height);
+ distorted_frame->u = CalcSSIM(u_org, u_rec, image_width / 2,
+ image_height / 2);
+ distorted_frame->v = CalcSSIM(v_org, v_rec, image_width / 2,
+ image_height / 2);
+ distorted_frame->all =
+ (distorted_frame->y + distorted_frame->u + distorted_frame->v)
+ / total_size;
+ distorted_frame->y /= y_size;
+ distorted_frame->u /= uv_size;
+ distorted_frame->v /= uv_size;
+
+ if (do_lssim) {
+ distorted_frame->all = CalcLSSIM(distorted_frame->all);
+ distorted_frame->y = CalcLSSIM(distorted_frame->y);
+ distorted_frame->u = CalcLSSIM(distorted_frame->u);
+ distorted_frame->v = CalcLSSIM(distorted_frame->v);
+ }
+ }
+
+ cur_distortion_psnr->y += distorted_frame->y;
+ cur_distortion_psnr->u += distorted_frame->u;
+ cur_distortion_psnr->v += distorted_frame->v;
+ cur_distortion_psnr->all += distorted_frame->all;
+
+ bool ismin = false;
+ if (distorted_frame->y < cur_distortion_psnr->min_y)
+ cur_distortion_psnr->min_y = distorted_frame->y;
+ if (distorted_frame->u < cur_distortion_psnr->min_u)
+ cur_distortion_psnr->min_u = distorted_frame->u;
+ if (distorted_frame->v < cur_distortion_psnr->min_v)
+ cur_distortion_psnr->min_v = distorted_frame->v;
+ if (distorted_frame->all < cur_distortion_psnr->min_all) {
+ cur_distortion_psnr->min_all = distorted_frame->all;
+ cur_distortion_psnr->min_frame = number_of_frames;
+ ismin = true;
+ }
+ return ismin;
+}
+
+int main(int argc, const char* argv[]) {
+ ParseOptions(argc, argv);
+ if (!do_psnr && !do_ssim) {
+ do_psnr = true;
+ }
+
+#ifdef _OPENMP
+ if (num_threads) {
+ omp_set_num_threads(num_threads);
+ }
+ if (verbose) {
+ printf("OpenMP %d procs\n", omp_get_num_procs());
+ }
+#endif
+ // Open original file (first file argument)
+ FILE* const file_org = fopen(argv[fileindex_org], "rb");
+ if (file_org == NULL) {
+ fprintf(stderr, "Cannot open %s\n", argv[fileindex_org]);
+ exit(1);
+ }
+
+ // Open all files to compare to
+ FILE** file_rec = new FILE* [num_rec];
+ memset(file_rec, 0, num_rec * sizeof(FILE*)); // NOLINT
+ for (int cur_rec = 0; cur_rec < num_rec; ++cur_rec) {
+ file_rec[cur_rec] = fopen(argv[fileindex_rec + cur_rec], "rb");
+ if (file_rec[cur_rec] == NULL) {
+ fprintf(stderr, "Cannot open %s\n", argv[fileindex_rec + cur_rec]);
+ fclose(file_org);
+ for (int i = 0; i < cur_rec; ++i) {
+ fclose(file_rec[i]);
+ }
+ delete[] file_rec;
+ exit(1);
+ }
+ }
+
+ const int y_size = image_width * image_height;
+ const int uv_size = ((image_width + 1) / 2) * ((image_height + 1) / 2);
+ const size_t total_size = y_size + 2 * uv_size; // NOLINT
+#if defined(_MSC_VER)
+ _fseeki64(file_org,
+ static_cast<__int64>(num_skip_org) *
+ static_cast<__int64>(total_size), SEEK_SET);
+#else
+ fseek(file_org, num_skip_org * total_size, SEEK_SET);
+#endif
+ for (int cur_rec = 0; cur_rec < num_rec; ++cur_rec) {
+#if defined(_MSC_VER)
+ _fseeki64(file_rec[cur_rec],
+ static_cast<__int64>(num_skip_rec) *
+ static_cast<__int64>(total_size),
+ SEEK_SET);
+#else
+ fseek(file_rec[cur_rec], num_skip_rec * total_size, SEEK_SET);
+#endif
+ }
+
+ uint8* const ch_org = new uint8[total_size];
+ uint8* const ch_rec = new uint8[total_size];
+ if (ch_org == NULL || ch_rec == NULL) {
+ fprintf(stderr, "No memory available\n");
+ fclose(file_org);
+ for (int i = 0; i < num_rec; ++i) {
+ fclose(file_rec[i]);
+ }
+ delete[] ch_org;
+ delete[] ch_rec;
+ delete[] file_rec;
+ exit(1);
+ }
+
+ metric* const distortion_psnr = new metric[num_rec];
+ metric* const distortion_ssim = new metric[num_rec];
+ for (int cur_rec = 0; cur_rec < num_rec; ++cur_rec) {
+ metric* cur_distortion_psnr = &distortion_psnr[cur_rec];
+ cur_distortion_psnr->y = 0.0;
+ cur_distortion_psnr->u = 0.0;
+ cur_distortion_psnr->v = 0.0;
+ cur_distortion_psnr->all = 0.0;
+ cur_distortion_psnr->min_y = kMaxPSNR;
+ cur_distortion_psnr->min_u = kMaxPSNR;
+ cur_distortion_psnr->min_v = kMaxPSNR;
+ cur_distortion_psnr->min_all = kMaxPSNR;
+ cur_distortion_psnr->min_frame = 0;
+ cur_distortion_psnr->global_y = 0.0;
+ cur_distortion_psnr->global_u = 0.0;
+ cur_distortion_psnr->global_v = 0.0;
+ cur_distortion_psnr->global_all = 0.0;
+    distortion_ssim[cur_rec] = *cur_distortion_psnr;
+ }
+
+ if (verbose) {
+ printf("Size: %dx%d\n", image_width, image_height);
+ }
+
+ if (!quiet) {
+ printf("Frame");
+ if (do_psnr) {
+ printf("\t PSNR-Y \t PSNR-U \t PSNR-V \t PSNR-All \t Frame");
+ }
+ if (do_ssim) {
+ printf("\t SSIM-Y\t SSIM-U\t SSIM-V\t SSIM-All\t Frame");
+ }
+ if (show_name) {
+ printf("\tName\n");
+ } else {
+ printf("\n");
+ }
+ }
+
+ int number_of_frames;
+ for (number_of_frames = 0; ; ++number_of_frames) {
+ if (num_frames && number_of_frames >= num_frames)
+ break;
+
+ size_t bytes_org = fread(ch_org, sizeof(uint8), total_size, file_org);
+ if (bytes_org < total_size)
+ break;
+
+ for (int cur_rec = 0; cur_rec < num_rec; ++cur_rec) {
+ size_t bytes_rec = fread(ch_rec, sizeof(uint8),
+ total_size, file_rec[cur_rec]);
+ if (bytes_rec < total_size)
+ break;
+
+ if (verbose) {
+ printf("%5d", number_of_frames);
+ }
+ if (do_psnr) {
+ metric distorted_frame;
+ metric* cur_distortion_psnr = &distortion_psnr[cur_rec];
+ bool ismin = UpdateMetrics(ch_org, ch_rec,
+ y_size, uv_size, total_size,
+ number_of_frames,
+ cur_distortion_psnr,
+ &distorted_frame, true);
+ if (verbose) {
+ printf("\t%10.6f", distorted_frame.y);
+ printf("\t%10.6f", distorted_frame.u);
+ printf("\t%10.6f", distorted_frame.v);
+ printf("\t%10.6f", distorted_frame.all);
+ printf("\t%5s", ismin ? "min" : "");
+ }
+ }
+ if (do_ssim) {
+ metric distorted_frame;
+ metric* cur_distortion_ssim = &distortion_ssim[cur_rec];
+ bool ismin = UpdateMetrics(ch_org, ch_rec,
+ y_size, uv_size, total_size,
+ number_of_frames,
+ cur_distortion_ssim,
+ &distorted_frame, false);
+ if (verbose) {
+ printf("\t%10.6f", distorted_frame.y);
+ printf("\t%10.6f", distorted_frame.u);
+ printf("\t%10.6f", distorted_frame.v);
+ printf("\t%10.6f", distorted_frame.all);
+ printf("\t%5s", ismin ? "min" : "");
+ }
+ }
+ if (verbose) {
+ if (show_name) {
+ printf("\t%s", argv[fileindex_rec + cur_rec]);
+ }
+ printf("\n");
+ }
+ }
+ }
+
+ // Final PSNR computation.
+ for (int cur_rec = 0; cur_rec < num_rec; ++cur_rec) {
+ metric* cur_distortion_psnr = &distortion_psnr[cur_rec];
+ metric* cur_distortion_ssim = &distortion_ssim[cur_rec];
+ if (number_of_frames > 0) {
+ const double norm = 1. / static_cast<double>(number_of_frames);
+ cur_distortion_psnr->y *= norm;
+ cur_distortion_psnr->u *= norm;
+ cur_distortion_psnr->v *= norm;
+ cur_distortion_psnr->all *= norm;
+ cur_distortion_ssim->y *= norm;
+ cur_distortion_ssim->u *= norm;
+ cur_distortion_ssim->v *= norm;
+ cur_distortion_ssim->all *= norm;
+ }
+
+ if (do_psnr) {
+ const double global_psnr_y = ComputePSNR(
+ cur_distortion_psnr->global_y,
+ static_cast<double>(y_size) * number_of_frames);
+ const double global_psnr_u = ComputePSNR(
+ cur_distortion_psnr->global_u,
+ static_cast<double>(uv_size) * number_of_frames);
+ const double global_psnr_v = ComputePSNR(
+ cur_distortion_psnr->global_v,
+ static_cast<double>(uv_size) * number_of_frames);
+ const double global_psnr_all = ComputePSNR(
+ cur_distortion_psnr->global_all,
+ static_cast<double>(total_size) * number_of_frames);
+ printf("Global:\t%10.6f\t%10.6f\t%10.6f\t%10.6f\t%5d",
+ global_psnr_y,
+ global_psnr_u,
+ global_psnr_v,
+ global_psnr_all,
+ number_of_frames);
+ if (show_name) {
+ printf("\t%s", argv[fileindex_rec + cur_rec]);
+ }
+ printf("\n");
+ }
+
+ if (!quiet) {
+ printf("Avg:");
+ if (do_psnr) {
+ printf("\t%10.6f\t%10.6f\t%10.6f\t%10.6f\t%5d",
+ cur_distortion_psnr->y,
+ cur_distortion_psnr->u,
+ cur_distortion_psnr->v,
+ cur_distortion_psnr->all,
+ number_of_frames);
+ }
+ if (do_ssim) {
+ printf("\t%10.6f\t%10.6f\t%10.6f\t%10.6f\t%5d",
+ cur_distortion_ssim->y,
+ cur_distortion_ssim->u,
+ cur_distortion_ssim->v,
+ cur_distortion_ssim->all,
+ number_of_frames);
+ }
+ if (show_name) {
+ printf("\t%s", argv[fileindex_rec + cur_rec]);
+ }
+ printf("\n");
+ }
+ if (!quiet) {
+ printf("Min:");
+ if (do_psnr) {
+ printf("\t%10.6f\t%10.6f\t%10.6f\t%10.6f\t%5d",
+ cur_distortion_psnr->min_y,
+ cur_distortion_psnr->min_u,
+ cur_distortion_psnr->min_v,
+ cur_distortion_psnr->min_all,
+ cur_distortion_psnr->min_frame);
+ }
+ if (do_ssim) {
+ printf("\t%10.6f\t%10.6f\t%10.6f\t%10.6f\t%5d",
+ cur_distortion_ssim->min_y,
+ cur_distortion_ssim->min_u,
+ cur_distortion_ssim->min_v,
+ cur_distortion_ssim->min_all,
+ cur_distortion_ssim->min_frame);
+ }
+ if (show_name) {
+ printf("\t%s", argv[fileindex_rec + cur_rec]);
+ }
+ printf("\n");
+ }
+
+ if (do_mse) {
+ double global_mse_y = GetMSE(cur_distortion_psnr->global_y,
+ static_cast<double>(y_size) * number_of_frames);
+ double global_mse_u = GetMSE(cur_distortion_psnr->global_u,
+ static_cast<double>(uv_size) * number_of_frames);
+ double global_mse_v = GetMSE(cur_distortion_psnr->global_v,
+ static_cast<double>(uv_size) * number_of_frames);
+ double global_mse_all = GetMSE(cur_distortion_psnr->global_all,
+ static_cast<double>(total_size) * number_of_frames);
+ printf("MSE:\t%10.6f\t%10.6f\t%10.6f\t%10.6f\t%5d",
+ global_mse_y,
+ global_mse_u,
+ global_mse_v,
+ global_mse_all,
+ number_of_frames);
+ if (show_name) {
+ printf("\t%s", argv[fileindex_rec + cur_rec]);
+ }
+ printf("\n");
+ }
+ }
+ fclose(file_org);
+ for (int cur_rec = 0; cur_rec < num_rec; ++cur_rec) {
+ fclose(file_rec[cur_rec]);
+ }
+ delete[] distortion_psnr;
+ delete[] distortion_ssim;
+ delete[] ch_org;
+ delete[] ch_rec;
+ delete[] file_rec;
+ return 0;
+}
diff --git a/chromium/third_party/libyuv/util/ssim.cc b/chromium/third_party/libyuv/util/ssim.cc
new file mode 100644
index 00000000000..277561dd00d
--- /dev/null
+++ b/chromium/third_party/libyuv/util/ssim.cc
@@ -0,0 +1,337 @@
+/*
+ * Copyright 2013 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./ssim.h"
+
+#include <math.h>
+#include <string.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef unsigned int uint32; // NOLINT
+typedef unsigned short uint16; // NOLINT
+
+#if !defined(LIBYUV_DISABLE_X86) && !defined(__SSE2__) && \
+ (defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP >= 2)))
+#define __SSE2__
+#endif
+#if !defined(LIBYUV_DISABLE_X86) && defined(__SSE2__)
+#include <emmintrin.h>
+#endif
+
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+// SSIM
+enum { KERNEL = 3, KERNEL_SIZE = 2 * KERNEL + 1 };
+
+// Symmetric Gaussian kernel: K[i] = ~11 * exp(-0.3 * i * i)
+// The maximum value (11 x 11) must be less than 128 to avoid sign
+// problems during the calls to _mm_mullo_epi16().
+static const int K[KERNEL_SIZE] = {
+ 1, 3, 7, 11, 7, 3, 1 // ~11 * exp(-0.3 * i * i)
+};
+static const double kiW[KERNEL + 1 + 1] = {
+ 1. / 1089., // 1 / sum(i:0..6, j..6) K[i]*K[j]
+ 1. / 1089., // 1 / sum(i:0..6, j..6) K[i]*K[j]
+ 1. / 1056., // 1 / sum(i:0..5, j..6) K[i]*K[j]
+ 1. / 957., // 1 / sum(i:0..4, j..6) K[i]*K[j]
+ 1. / 726., // 1 / sum(i:0..3, j..6) K[i]*K[j]
+};
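+// Sanity check for the table above: each kernel row sums to
+// 1 + 3 + 7 + 11 + 7 + 3 + 1 = 33, so the full 7x7 weight sum is
+// 33 * 33 = 1089 (kiW[0], kiW[1]); dropping one, two or three columns gives
+// 33 * 32 = 1056, 33 * 29 = 957 and 33 * 22 = 726 for the remaining entries.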
+
+#if !defined(LIBYUV_DISABLE_X86) && defined(__SSE2__)
+
+#define PWEIGHT(A, B) static_cast<uint16>(K[(A)] * K[(B)]) // weight product
+#define MAKE_WEIGHT(L) \
+ { { { PWEIGHT(L, 0), PWEIGHT(L, 1), PWEIGHT(L, 2), PWEIGHT(L, 3), \
+ PWEIGHT(L, 4), PWEIGHT(L, 5), PWEIGHT(L, 6), 0 } } }
+
+// We need this union trick to be able to initialize constant static __m128i
+// values. We can't call _mm_set_epi16() for static compile-time initialization.
+static const struct {
+ union {
+ uint16 i16_[8];
+ __m128i m_;
+ } values_;
+} W0 = MAKE_WEIGHT(0),
+ W1 = MAKE_WEIGHT(1),
+ W2 = MAKE_WEIGHT(2),
+ W3 = MAKE_WEIGHT(3);
+ // ... the rest is symmetric.
+#undef MAKE_WEIGHT
+#undef PWEIGHT
+#endif
+
+// Common final expression for SSIM, once the weighted sums are known.
+static double FinalizeSSIM(double iw, double xm, double ym,
+ double xxm, double xym, double yym) {
+ const double iwx = xm * iw;
+ const double iwy = ym * iw;
+ double sxx = xxm * iw - iwx * iwx;
+ double syy = yym * iw - iwy * iwy;
+  // Small negative values are possible due to rounding; clamp to zero.
+ if (sxx < 0.) sxx = 0.;
+ if (syy < 0.) syy = 0.;
+ const double sxsy = sqrt(sxx * syy);
+ const double sxy = xym * iw - iwx * iwy;
+ static const double C11 = (0.01 * 0.01) * (255 * 255);
+ static const double C22 = (0.03 * 0.03) * (255 * 255);
+ static const double C33 = (0.015 * 0.015) * (255 * 255);
+ const double l = (2. * iwx * iwy + C11) / (iwx * iwx + iwy * iwy + C11);
+ const double c = (2. * sxsy + C22) / (sxx + syy + C22);
+ const double s = (sxy + C33) / (sxsy + C33);
+ return l * c * s;
+}
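+// In the usual SSIM notation the three factors computed above are the
+// luminance, contrast and structure terms:
+//   l = (2 * mu_x * mu_y + C1) / (mu_x^2 + mu_y^2 + C1)
+//   c = (2 * sigma_x * sigma_y + C2) / (sigma_x^2 + sigma_y^2 + C2)
+//   s = (sigma_xy + C3) / (sigma_x * sigma_y + C3)
+// with C1 = (0.01 * 255)^2, C2 = (0.03 * 255)^2 and C3 = (0.015 * 255)^2 here.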
+
+// GetSSIM() clips the kernel at image borders; GetSSIMFullKernel() does not.
+
+// TODO(skal): use summed tables?
+// Note: the worst case of accumulation is a full-kernel weight of
+// 33 * 33 = 1089 (each row sums to 33 = 11 + 2 * (7 + 3 + 1)) with a diff of
+// 255, squared. The maximum accumulator value is thus
+// 1089 * 255 * 255 = 0x4388241, which fits in a 32-bit integer.
+double GetSSIM(const uint8 *org, const uint8 *rec,
+ int xo, int yo, int W, int H, int stride) {
+ uint32 ws = 0, xm = 0, ym = 0, xxm = 0, xym = 0, yym = 0;
+ org += (yo - KERNEL) * stride;
+ org += (xo - KERNEL);
+ rec += (yo - KERNEL) * stride;
+ rec += (xo - KERNEL);
+ for (int y_ = 0; y_ < KERNEL_SIZE; ++y_, org += stride, rec += stride) {
+ if (((yo - KERNEL + y_) < 0) || ((yo - KERNEL + y_) >= H)) continue;
+ const int Wy = K[y_];
+ for (int x_ = 0; x_ < KERNEL_SIZE; ++x_) {
+ const int Wxy = Wy * K[x_];
+ if (((xo - KERNEL + x_) >= 0) && ((xo - KERNEL + x_) < W)) {
+ const int org_x = org[x_];
+ const int rec_x = rec[x_];
+ ws += Wxy;
+ xm += Wxy * org_x;
+ ym += Wxy * rec_x;
+ xxm += Wxy * org_x * org_x;
+ xym += Wxy * org_x * rec_x;
+ yym += Wxy * rec_x * rec_x;
+ }
+ }
+ }
+ return FinalizeSSIM(1. / ws, xm, ym, xxm, xym, yym);
+}
+
+double GetSSIMFullKernel(const uint8 *org, const uint8 *rec,
+ int xo, int yo, int stride,
+ double area_weight) {
+ uint32 xm = 0, ym = 0, xxm = 0, xym = 0, yym = 0;
+
+#if defined(LIBYUV_DISABLE_X86) || !defined(__SSE2__)
+
+ org += yo * stride + xo;
+ rec += yo * stride + xo;
+ for (int y = 1; y <= KERNEL; y++) {
+ const int dy1 = y * stride;
+ const int dy2 = y * stride;
+ const int Wy = K[KERNEL + y];
+
+ for (int x = 1; x <= KERNEL; x++) {
+ // Compute the contributions of upper-left (ul), upper-right (ur)
+ // lower-left (ll) and lower-right (lr) points (see the diagram below).
+ // Symmetric Kernel will have same weight on those points.
+ // - - - - - - -
+ // - ul - - - ur -
+ // - - - - - - -
+ // - - - 0 - - -
+ // - - - - - - -
+ // - ll - - - lr -
+ // - - - - - - -
+ const int Wxy = Wy * K[KERNEL + x];
+ const int ul1 = org[-dy1 - x];
+ const int ur1 = org[-dy1 + x];
+ const int ll1 = org[dy1 - x];
+ const int lr1 = org[dy1 + x];
+
+ const int ul2 = rec[-dy2 - x];
+ const int ur2 = rec[-dy2 + x];
+ const int ll2 = rec[dy2 - x];
+ const int lr2 = rec[dy2 + x];
+
+ xm += Wxy * (ul1 + ur1 + ll1 + lr1);
+ ym += Wxy * (ul2 + ur2 + ll2 + lr2);
+ xxm += Wxy * (ul1 * ul1 + ur1 * ur1 + ll1 * ll1 + lr1 * lr1);
+ xym += Wxy * (ul1 * ul2 + ur1 * ur2 + ll1 * ll2 + lr1 * lr2);
+ yym += Wxy * (ul2 * ul2 + ur2 * ur2 + ll2 * ll2 + lr2 * lr2);
+ }
+
+ // Compute the contributions of up (u), down (d), left (l) and right (r)
+ // points across the main axes (see the diagram below).
+ // Symmetric Kernel will have same weight on those points.
+ // - - - - - - -
+ // - - - u - - -
+ // - - - - - - -
+ // - l - 0 - r -
+ // - - - - - - -
+ // - - - d - - -
+ // - - - - - - -
+ const int Wxy = Wy * K[KERNEL];
+ const int u1 = org[-dy1];
+ const int d1 = org[dy1];
+ const int l1 = org[-y];
+ const int r1 = org[y];
+
+ const int u2 = rec[-dy2];
+ const int d2 = rec[dy2];
+ const int l2 = rec[-y];
+ const int r2 = rec[y];
+
+ xm += Wxy * (u1 + d1 + l1 + r1);
+ ym += Wxy * (u2 + d2 + l2 + r2);
+ xxm += Wxy * (u1 * u1 + d1 * d1 + l1 * l1 + r1 * r1);
+ xym += Wxy * (u1 * u2 + d1 * d2 + l1 * l2 + r1 * r2);
+ yym += Wxy * (u2 * u2 + d2 * d2 + l2 * l2 + r2 * r2);
+ }
+
+ // Lastly the contribution of (x0, y0) point.
+ const int Wxy = K[KERNEL] * K[KERNEL];
+ const int s1 = org[0];
+ const int s2 = rec[0];
+
+ xm += Wxy * s1;
+ ym += Wxy * s2;
+ xxm += Wxy * s1 * s1;
+ xym += Wxy * s1 * s2;
+ yym += Wxy * s2 * s2;
+
+#else // __SSE2__
+
+ org += (yo - KERNEL) * stride + (xo - KERNEL);
+ rec += (yo - KERNEL) * stride + (xo - KERNEL);
+
+ const __m128i zero = _mm_setzero_si128();
+ __m128i x = zero;
+ __m128i y = zero;
+ __m128i xx = zero;
+ __m128i xy = zero;
+ __m128i yy = zero;
+
+// Read 8 pixels at line #L, and convert to 16bit, perform weighting
+// and accumulate.
+#define LOAD_LINE_PAIR(L, WEIGHT) do { \
+ const __m128i v0 = \
+ _mm_loadl_epi64(reinterpret_cast<const __m128i*>(org + (L) * stride)); \
+ const __m128i v1 = \
+ _mm_loadl_epi64(reinterpret_cast<const __m128i*>(rec + (L) * stride)); \
+ const __m128i w0 = _mm_unpacklo_epi8(v0, zero); \
+ const __m128i w1 = _mm_unpacklo_epi8(v1, zero); \
+ const __m128i ww0 = _mm_mullo_epi16(w0, (WEIGHT).values_.m_); \
+ const __m128i ww1 = _mm_mullo_epi16(w1, (WEIGHT).values_.m_); \
+ x = _mm_add_epi32(x, _mm_unpacklo_epi16(ww0, zero)); \
+ y = _mm_add_epi32(y, _mm_unpacklo_epi16(ww1, zero)); \
+ x = _mm_add_epi32(x, _mm_unpackhi_epi16(ww0, zero)); \
+ y = _mm_add_epi32(y, _mm_unpackhi_epi16(ww1, zero)); \
+ xx = _mm_add_epi32(xx, _mm_madd_epi16(ww0, w0)); \
+ xy = _mm_add_epi32(xy, _mm_madd_epi16(ww0, w1)); \
+ yy = _mm_add_epi32(yy, _mm_madd_epi16(ww1, w1)); \
+} while (0)
+
+#define ADD_AND_STORE_FOUR_EPI32(M, OUT) do { \
+ uint32 tmp[4]; \
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(tmp), (M)); \
+ (OUT) = tmp[3] + tmp[2] + tmp[1] + tmp[0]; \
+} while (0)
+
+ LOAD_LINE_PAIR(0, W0);
+ LOAD_LINE_PAIR(1, W1);
+ LOAD_LINE_PAIR(2, W2);
+ LOAD_LINE_PAIR(3, W3);
+ LOAD_LINE_PAIR(4, W2);
+ LOAD_LINE_PAIR(5, W1);
+ LOAD_LINE_PAIR(6, W0);
+
+ ADD_AND_STORE_FOUR_EPI32(x, xm);
+ ADD_AND_STORE_FOUR_EPI32(y, ym);
+ ADD_AND_STORE_FOUR_EPI32(xx, xxm);
+ ADD_AND_STORE_FOUR_EPI32(xy, xym);
+ ADD_AND_STORE_FOUR_EPI32(yy, yym);
+
+#undef LOAD_LINE_PAIR
+#undef ADD_AND_STORE_FOUR_EPI32
+#endif
+
+ return FinalizeSSIM(area_weight, xm, ym, xxm, xym, yym);
+}
+
+static int start_max(int x, int y) { return (x > y) ? x : y; }
+
+double CalcSSIM(const uint8 *org, const uint8 *rec,
+ const int image_width, const int image_height) {
+ double SSIM = 0.;
+ const int KERNEL_Y = (image_height < KERNEL) ? image_height : KERNEL;
+ const int KERNEL_X = (image_width < KERNEL) ? image_width : KERNEL;
+ const int start_x = start_max(image_width - 8 + KERNEL_X, KERNEL_X);
+ const int start_y = start_max(image_height - KERNEL_Y, KERNEL_Y);
+ const int stride = image_width;
+
+ for (int j = 0; j < KERNEL_Y; ++j) {
+ for (int i = 0; i < image_width; ++i) {
+ SSIM += GetSSIM(org, rec, i, j, image_width, image_height, stride);
+ }
+ }
+
+#ifdef _OPENMP
+ #pragma omp parallel for reduction(+: SSIM)
+#endif
+ for (int j = KERNEL_Y; j < image_height - KERNEL_Y; ++j) {
+ for (int i = 0; i < KERNEL_X; ++i) {
+ SSIM += GetSSIM(org, rec, i, j, image_width, image_height, stride);
+ }
+ for (int i = KERNEL_X; i < start_x; ++i) {
+ SSIM += GetSSIMFullKernel(org, rec, i, j, stride, kiW[0]);
+ }
+ if (start_x < image_width) {
+ // GetSSIMFullKernel() needs to be able to read 8 pixels (in SSE2). So we
+ // copy the 8 rightmost pixels on a cache area, and pad this area with
+ // zeros which won't contribute to the overall SSIM value (but we need
+ // to pass the correct normalizing constant!). By using this cache, we can
+ // still call GetSSIMFullKernel() instead of the slower GetSSIM().
+ // NOTE: we could use similar method for the left-most pixels too.
+ const int kScratchWidth = 8;
+ const int kScratchStride = kScratchWidth + KERNEL + 1;
+ uint8 scratch_org[KERNEL_SIZE * kScratchStride] = { 0 };
+ uint8 scratch_rec[KERNEL_SIZE * kScratchStride] = { 0 };
+
+ for (int k = 0; k < KERNEL_SIZE; ++k) {
+ const int offset =
+ (j - KERNEL + k) * stride + image_width - kScratchWidth;
+ memcpy(scratch_org + k * kScratchStride, org + offset, kScratchWidth);
+ memcpy(scratch_rec + k * kScratchStride, rec + offset, kScratchWidth);
+ }
+ for (int k = 0; k <= KERNEL_X + 1; ++k) {
+ SSIM += GetSSIMFullKernel(scratch_org, scratch_rec,
+ KERNEL + k, KERNEL, kScratchStride, kiW[k]);
+ }
+ }
+ }
+
+ for (int j = start_y; j < image_height; ++j) {
+ for (int i = 0; i < image_width; ++i) {
+ SSIM += GetSSIM(org, rec, i, j, image_width, image_height, stride);
+ }
+ }
+ return SSIM;
+}
+
+double CalcLSSIM(double ssim) {
+ return -10.0 * log10(1.0 - ssim);
+}
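+// For example, an average SSIM of 0.99 maps to -10 * log10(0.01) = 20.0 dB and
+// 0.999 maps to 30.0 dB; higher LSSIM means the sequences are closer.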
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
diff --git a/chromium/third_party/libyuv/util/ssim.h b/chromium/third_party/libyuv/util/ssim.h
new file mode 100644
index 00000000000..0689276addc
--- /dev/null
+++ b/chromium/third_party/libyuv/util/ssim.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright 2013 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+// Get SSIM for video sequence. Assuming RAW 4:2:0 Y:Cb:Cr format
+
+#ifndef UTIL_SSIM_H_
+#define UTIL_SSIM_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if !defined(INT_TYPES_DEFINED) && !defined(UINT8_TYPE_DEFINED)
+typedef unsigned char uint8;
+#define UINT8_TYPE_DEFINED
+#endif
+
+double CalcSSIM(const uint8* org, const uint8* rec,
+ const int image_width, const int image_height);
+
+// Returns -10.0 * log10(1.0 - ssim).
+double CalcLSSIM(double ssim);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // UTIL_SSIM_H_