author     Andras Becsi <andras.becsi@digia.com>            2014-03-18 13:16:26 +0100
committer  Frederik Gladhorn <frederik.gladhorn@digia.com>  2014-03-20 15:55:39 +0100
commit     3f0f86b0caed75241fa71c95a5d73bc0164348c5 (patch)
tree       92b9fb00f2e9e90b0be2262093876d4f43b6cd13 /chromium/third_party/libyuv
parent     e90d7c4b152c56919d963987e2503f9909a666d2 (diff)
download   qtwebengine-chromium-3f0f86b0caed75241fa71c95a5d73bc0164348c5.tar.gz
Update to new stable branch 1750
This also includes an updated ninja and chromium dependencies needed on Windows.

Change-Id: Icd597d80ed3fa4425933c9f1334c3c2e31291c42
Reviewed-by: Zoltan Arvai <zarvai@inf.u-szeged.hu>
Reviewed-by: Zeno Albisser <zeno.albisser@digia.com>
Diffstat (limited to 'chromium/third_party/libyuv')
-rw-r--r--  chromium/third_party/libyuv/Android.mk | 3
-rw-r--r--  chromium/third_party/libyuv/DEPS | 24
-rw-r--r--  chromium/third_party/libyuv/OWNERS | 1
-rw-r--r--  chromium/third_party/libyuv/README.chromium | 2
-rw-r--r--  chromium/third_party/libyuv/all.gyp | 21
-rw-r--r--  chromium/third_party/libyuv/include/libyuv/convert_from_argb.h | 336
-rw-r--r--  chromium/third_party/libyuv/include/libyuv/cpu_id.h | 13
-rw-r--r--  chromium/third_party/libyuv/include/libyuv/mjpeg_decoder.h | 2
-rw-r--r--  chromium/third_party/libyuv/include/libyuv/planar_functions.h | 76
-rw-r--r--  chromium/third_party/libyuv/include/libyuv/row.h | 211
-rw-r--r--  chromium/third_party/libyuv/include/libyuv/scale.h | 7
-rw-r--r--  chromium/third_party/libyuv/include/libyuv/scale_argb.h | 14
-rw-r--r--  chromium/third_party/libyuv/include/libyuv/scale_row.h | 273
-rw-r--r--  chromium/third_party/libyuv/include/libyuv/version.h | 2
-rw-r--r--  chromium/third_party/libyuv/libyuv.gyp | 5
-rw-r--r--  chromium/third_party/libyuv/libyuv_test.gyp | 1
-rw-r--r--  chromium/third_party/libyuv/linux.mk | 48
-rw-r--r--  chromium/third_party/libyuv/source/compare.cc | 22
-rw-r--r--  chromium/third_party/libyuv/source/compare_common.cc | 80
-rw-r--r--  chromium/third_party/libyuv/source/compare_neon.cc | 122
-rw-r--r--  chromium/third_party/libyuv/source/compare_posix.cc | 330
-rw-r--r--  chromium/third_party/libyuv/source/compare_win.cc | 424
-rw-r--r--  chromium/third_party/libyuv/source/convert.cc | 279
-rw-r--r--  chromium/third_party/libyuv/source/convert_argb.cc | 108
-rw-r--r--  chromium/third_party/libyuv/source/convert_from.cc | 248
-rw-r--r--  chromium/third_party/libyuv/source/convert_from_argb.cc | 150
-rw-r--r--  chromium/third_party/libyuv/source/convert_to_argb.cc | 10
-rw-r--r--  chromium/third_party/libyuv/source/convert_to_i420.cc | 10
-rw-r--r--  chromium/third_party/libyuv/source/cpu_id.cc | 170
-rw-r--r--  chromium/third_party/libyuv/source/format_conversion.cc | 8
-rw-r--r--  chromium/third_party/libyuv/source/mjpeg_decoder.cc | 3
-rw-r--r--  chromium/third_party/libyuv/source/planar_functions.cc | 739
-rw-r--r--  chromium/third_party/libyuv/source/rotate.cc | 34
-rw-r--r--  chromium/third_party/libyuv/source/rotate_argb.cc | 8
-rw-r--r--  chromium/third_party/libyuv/source/rotate_neon.cc | 8
-rw-r--r--  chromium/third_party/libyuv/source/row_any.cc | 25
-rw-r--r--  chromium/third_party/libyuv/source/row_common.cc | 208
-rw-r--r--  chromium/third_party/libyuv/source/row_mips.cc | 233
-rw-r--r--  chromium/third_party/libyuv/source/row_neon.cc | 245
-rw-r--r--  chromium/third_party/libyuv/source/row_posix.cc | 3469
-rw-r--r--  chromium/third_party/libyuv/source/row_win.cc | 1485
-rw-r--r--  chromium/third_party/libyuv/source/row_x86.asm | 6
-rw-r--r--  chromium/third_party/libyuv/source/scale.cc | 2202
-rw-r--r--  chromium/third_party/libyuv/source/scale_argb.cc | 1184
-rw-r--r--  chromium/third_party/libyuv/source/scale_argb_neon.cc | 142
-rw-r--r--  chromium/third_party/libyuv/source/scale_common.cc | 657
-rw-r--r--  chromium/third_party/libyuv/source/scale_mips.cc | 21
-rw-r--r--  chromium/third_party/libyuv/source/scale_neon.cc | 198
-rw-r--r--  chromium/third_party/libyuv/source/scale_posix.cc | 1337
-rw-r--r--  chromium/third_party/libyuv/source/scale_win.cc | 1289
-rwxr-xr-x  chromium/third_party/libyuv/tools/valgrind-libyuv/libyuv_tests.py | 8
-rw-r--r--  chromium/third_party/libyuv/unit_test/compare_test.cc | 8
-rw-r--r--  chromium/third_party/libyuv/unit_test/convert_test.cc | 1987
-rw-r--r--  chromium/third_party/libyuv/unit_test/cpu_test.cc | 14
-rw-r--r--  chromium/third_party/libyuv/unit_test/math_test.cc | 114
-rw-r--r--  chromium/third_party/libyuv/unit_test/planar_test.cc | 946
-rw-r--r--  chromium/third_party/libyuv/unit_test/scale_argb_test.cc | 85
-rw-r--r--  chromium/third_party/libyuv/unit_test/scale_test.cc | 77
-rw-r--r--  chromium/third_party/libyuv/unit_test/unit_test.cc | 15
-rw-r--r--  chromium/third_party/libyuv/unit_test/unit_test.h | 23
-rw-r--r--  chromium/third_party/libyuv/util/convert.cc | 4
-rw-r--r--  chromium/third_party/libyuv/util/cpuid.c | 8
-rw-r--r--  chromium/third_party/libyuv/util/psnr.h | 4
-rw-r--r--  chromium/third_party/libyuv/util/ssim.cc | 2
-rw-r--r--  chromium/third_party/libyuv/util/ssim.h | 4
-rw-r--r--  chromium/third_party/libyuv/winarm.mk | 43
66 files changed, 12329 insertions, 7506 deletions
diff --git a/chromium/third_party/libyuv/Android.mk b/chromium/third_party/libyuv/Android.mk
index 513a1961b5c..3d8ba49a318 100644
--- a/chromium/third_party/libyuv/Android.mk
+++ b/chromium/third_party/libyuv/Android.mk
@@ -27,7 +27,9 @@ LOCAL_SRC_FILES := \
source/row_posix.cc \
source/scale.cc \
source/scale_argb.cc \
+ source/scale_common.cc \
source/scale_mips.cc \
+ source/scale_posix.cc \
source/video_common.cc
# TODO(fbarchard): Enable mjpeg encoder.
@@ -41,7 +43,6 @@ ifeq ($(TARGET_ARCH_ABI),armeabi-v7a)
source/compare_neon.cc.neon \
source/rotate_neon.cc.neon \
source/row_neon.cc.neon \
- source/scale_argb_neon.cc.neon \
source/scale_neon.cc.neon
endif
diff --git a/chromium/third_party/libyuv/DEPS b/chromium/third_party/libyuv/DEPS
index eafc459c3f3..7e866873c45 100644
--- a/chromium/third_party/libyuv/DEPS
+++ b/chromium/third_party/libyuv/DEPS
@@ -13,7 +13,7 @@ vars = {
"googlecode_url": "http://%s.googlecode.com/svn",
"chromium_trunk" : "http://src.chromium.org/svn/trunk",
# chrome://version/ for revision of canary Chrome.
- "chromium_revision": "202548",
+ "chromium_revision": "232627",
}
# NOTE: Prefer revision numbers to tags for svn deps. Use http rather than
@@ -78,6 +78,26 @@ deps_os = {
"third_party/gold":
From("chromium_deps", "src/third_party/gold"),
},
+ "android": {
+ "third_party/android_tools":
+ From("chromium_deps", "src/third_party/android_tools"),
+ },
+ "ios": {
+ # NSS, for SSLClientSocketNSS.
+ "third_party/nss":
+ From("chromium_deps", "src/third_party/nss"),
+
+ "net/third_party/nss":
+ Var("chromium_trunk") + "/src/net/third_party/nss@" + Var("chromium_revision"),
+
+ # class-dump utility to generate header files for undocumented SDKs.
+ "testing/iossim/third_party/class-dump":
+ From("chromium_deps", "src/testing/iossim/third_party/class-dump"),
+
+ # Helper for running under the simulator.
+ "testing/iossim":
+ Var("chromium_trunk") + "/src/testing/iossim@" + Var("chromium_revision"),
+ },
}
hooks = [
@@ -92,7 +112,7 @@ hooks = [
# A change to a .gyp, .gypi, or to GYP itself should run the generator.
"pattern": ".",
"action": ["python", Var("root_dir") + "/build/gyp_chromium",
- "--depth=" + Var("root_dir"), Var("root_dir") + "/libyuv_test.gyp",
+ "--depth=" + Var("root_dir"), Var("root_dir") + "/all.gyp",
Var("extra_gyp_flag")],
},
{
diff --git a/chromium/third_party/libyuv/OWNERS b/chromium/third_party/libyuv/OWNERS
index cbe985ecfdd..df673dfa5e5 100644
--- a/chromium/third_party/libyuv/OWNERS
+++ b/chromium/third_party/libyuv/OWNERS
@@ -1,3 +1,2 @@
fbarchard@chromium.org
mflodman@chromium.org
-
diff --git a/chromium/third_party/libyuv/README.chromium b/chromium/third_party/libyuv/README.chromium
index edc5d82ba88..f11363cc425 100644
--- a/chromium/third_party/libyuv/README.chromium
+++ b/chromium/third_party/libyuv/README.chromium
@@ -1,6 +1,6 @@
Name: libyuv
URL: http://code.google.com/p/libyuv/
-Version: 723
+Version: 911
License: BSD
License File: LICENSE
diff --git a/chromium/third_party/libyuv/all.gyp b/chromium/third_party/libyuv/all.gyp
new file mode 100644
index 00000000000..cc72d9d6fc3
--- /dev/null
+++ b/chromium/third_party/libyuv/all.gyp
@@ -0,0 +1,21 @@
+# Copyright 2013 The LibYuv Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS. All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+
+# all.gyp and All target are for benefit of android gyp build.
+{
+ 'targets': [
+ {
+ 'target_name': 'All',
+ 'type': 'none',
+ 'dependencies': [
+ 'libyuv.gyp:*',
+ 'libyuv_test.gyp:*',
+ ],
+ },
+ ],
+}
diff --git a/chromium/third_party/libyuv/include/libyuv/convert_from_argb.h b/chromium/third_party/libyuv/include/libyuv/convert_from_argb.h
index be3bba44433..f0343a77d3e 100644
--- a/chromium/third_party/libyuv/include/libyuv/convert_from_argb.h
+++ b/chromium/third_party/libyuv/include/libyuv/convert_from_argb.h
@@ -1,168 +1,168 @@
-/*
- * Copyright 2012 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_ // NOLINT
-#define INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_
-
-#include "libyuv/basic_types.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// Copy ARGB to ARGB.
-#define ARGBToARGB ARGBCopy
-LIBYUV_API
-int ARGBCopy(const uint8* src_argb, int src_stride_argb,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height);
-
-// Convert ARGB To BGRA. (alias)
-#define ARGBToBGRA BGRAToARGB
-LIBYUV_API
-int BGRAToARGB(const uint8* src_frame, int src_stride_frame,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height);
-
-// Convert ARGB To ABGR. (alias)
-#define ARGBToABGR ABGRToARGB
-LIBYUV_API
-int ABGRToARGB(const uint8* src_frame, int src_stride_frame,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height);
-
-// Convert ARGB To RGBA.
-LIBYUV_API
-int ARGBToRGBA(const uint8* src_frame, int src_stride_frame,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height);
-
-// Convert ARGB To RGB24.
-LIBYUV_API
-int ARGBToRGB24(const uint8* src_argb, int src_stride_argb,
- uint8* dst_rgb24, int dst_stride_rgb24,
- int width, int height);
-
-// Convert ARGB To RAW.
-LIBYUV_API
-int ARGBToRAW(const uint8* src_argb, int src_stride_argb,
- uint8* dst_rgb, int dst_stride_rgb,
- int width, int height);
-
-// Convert ARGB To RGB565.
-LIBYUV_API
-int ARGBToRGB565(const uint8* src_argb, int src_stride_argb,
- uint8* dst_rgb565, int dst_stride_rgb565,
- int width, int height);
-
-// Convert ARGB To ARGB1555.
-LIBYUV_API
-int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb,
- uint8* dst_argb1555, int dst_stride_argb1555,
- int width, int height);
-
-// Convert ARGB To ARGB4444.
-LIBYUV_API
-int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb,
- uint8* dst_argb4444, int dst_stride_argb4444,
- int width, int height);
-
-// Convert ARGB To I444.
-LIBYUV_API
-int ARGBToI444(const uint8* src_argb, int src_stride_argb,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height);
-
-// Convert ARGB To I422.
-LIBYUV_API
-int ARGBToI422(const uint8* src_argb, int src_stride_argb,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height);
-
-// Convert ARGB To I420. (also in convert.h)
-LIBYUV_API
-int ARGBToI420(const uint8* src_argb, int src_stride_argb,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height);
-
-// Convert ARGB to J420. (JPeg full range I420).
-LIBYUV_API
-int ARGBToJ420(const uint8* src_argb, int src_stride_argb,
- uint8* dst_yj, int dst_stride_yj,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height);
-
-// Convert ARGB To I411.
-LIBYUV_API
-int ARGBToI411(const uint8* src_argb, int src_stride_argb,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height);
-
-// Convert ARGB to J400. (JPeg full range).
-LIBYUV_API
-int ARGBToJ400(const uint8* src_argb, int src_stride_argb,
- uint8* dst_yj, int dst_stride_yj,
- int width, int height);
-
-// Convert ARGB to I400.
-LIBYUV_API
-int ARGBToI400(const uint8* src_argb, int src_stride_argb,
- uint8* dst_y, int dst_stride_y,
- int width, int height);
-
-// Convert ARGB To NV12.
-LIBYUV_API
-int ARGBToNV12(const uint8* src_argb, int src_stride_argb,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_uv, int dst_stride_uv,
- int width, int height);
-
-// Convert ARGB To NV21.
-LIBYUV_API
-int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_vu, int dst_stride_vu,
- int width, int height);
-
-// Convert ARGB To NV21.
-LIBYUV_API
-int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_vu, int dst_stride_vu,
- int width, int height);
-
-// Convert ARGB To YUY2.
-LIBYUV_API
-int ARGBToYUY2(const uint8* src_argb, int src_stride_argb,
- uint8* dst_yuy2, int dst_stride_yuy2,
- int width, int height);
-
-// Convert ARGB To UYVY.
-LIBYUV_API
-int ARGBToUYVY(const uint8* src_argb, int src_stride_argb,
- uint8* dst_uyvy, int dst_stride_uyvy,
- int width, int height);
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
-
-#endif // INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_ NOLINT
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_ // NOLINT
+#define INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Copy ARGB to ARGB.
+#define ARGBToARGB ARGBCopy
+LIBYUV_API
+int ARGBCopy(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Convert ARGB To BGRA. (alias)
+#define ARGBToBGRA BGRAToARGB
+LIBYUV_API
+int BGRAToARGB(const uint8* src_frame, int src_stride_frame,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Convert ARGB To ABGR. (alias)
+#define ARGBToABGR ABGRToARGB
+LIBYUV_API
+int ABGRToARGB(const uint8* src_frame, int src_stride_frame,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Convert ARGB To RGBA.
+LIBYUV_API
+int ARGBToRGBA(const uint8* src_frame, int src_stride_frame,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Convert ARGB To RGB24.
+LIBYUV_API
+int ARGBToRGB24(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_rgb24, int dst_stride_rgb24,
+ int width, int height);
+
+// Convert ARGB To RAW.
+LIBYUV_API
+int ARGBToRAW(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_rgb, int dst_stride_rgb,
+ int width, int height);
+
+// Convert ARGB To RGB565.
+LIBYUV_API
+int ARGBToRGB565(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_rgb565, int dst_stride_rgb565,
+ int width, int height);
+
+// Convert ARGB To ARGB1555.
+LIBYUV_API
+int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb1555, int dst_stride_argb1555,
+ int width, int height);
+
+// Convert ARGB To ARGB4444.
+LIBYUV_API
+int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb4444, int dst_stride_argb4444,
+ int width, int height);
+
+// Convert ARGB To I444.
+LIBYUV_API
+int ARGBToI444(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// Convert ARGB To I422.
+LIBYUV_API
+int ARGBToI422(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// Convert ARGB To I420. (also in convert.h)
+LIBYUV_API
+int ARGBToI420(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// Convert ARGB to J420. (JPeg full range I420).
+LIBYUV_API
+int ARGBToJ420(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_yj, int dst_stride_yj,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// Convert ARGB To I411.
+LIBYUV_API
+int ARGBToI411(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// Convert ARGB to J400. (JPeg full range).
+LIBYUV_API
+int ARGBToJ400(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_yj, int dst_stride_yj,
+ int width, int height);
+
+// Convert ARGB to I400.
+LIBYUV_API
+int ARGBToI400(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_y, int dst_stride_y,
+ int width, int height);
+
+// Convert ARGB To NV12.
+LIBYUV_API
+int ARGBToNV12(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_uv, int dst_stride_uv,
+ int width, int height);
+
+// Convert ARGB To NV21.
+LIBYUV_API
+int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_vu, int dst_stride_vu,
+ int width, int height);
+
+// Convert ARGB To NV21.
+LIBYUV_API
+int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_vu, int dst_stride_vu,
+ int width, int height);
+
+// Convert ARGB To YUY2.
+LIBYUV_API
+int ARGBToYUY2(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_yuy2, int dst_stride_yuy2,
+ int width, int height);
+
+// Convert ARGB To UYVY.
+LIBYUV_API
+int ARGBToUYVY(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_uyvy, int dst_stride_uyvy,
+ int width, int height);
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_ NOLINT
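The convert_from_argb.h header above declares libyuv's ARGB-to-planar converters. As a hedged illustration of the calling convention — the frame size, packed strides, and helper name below are assumptions, not part of the diff — ARGBToI420 from this header can be used like this:

#include "libyuv/convert_from_argb.h"  // ARGBToI420 declaration shown above.

// Minimal sketch: convert a 64x48 ARGB frame to I420 with tightly packed
// planes. Dimensions and buffer layout are assumptions for illustration.
bool ConvertArgbFrameToI420(const uint8* argb, uint8* y, uint8* u, uint8* v) {
  const int width = 64;
  const int height = 48;
  // ARGB is 4 bytes per pixel; I420 chroma planes are subsampled 2x2.
  int ret = libyuv::ARGBToI420(argb, width * 4,
                               y, width,
                               u, width / 2,
                               v, width / 2,
                               width, height);
  return ret == 0;  // Conversion functions return 0 on success.
}
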
diff --git a/chromium/third_party/libyuv/include/libyuv/cpu_id.h b/chromium/third_party/libyuv/include/libyuv/cpu_id.h
index 8b6d043222b..79da994c744 100644
--- a/chromium/third_party/libyuv/include/libyuv/cpu_id.h
+++ b/chromium/third_party/libyuv/include/libyuv/cpu_id.h
@@ -18,6 +18,7 @@ namespace libyuv {
extern "C" {
#endif
+// TODO(fbarchard): Consider overlapping bits for different architectures.
// Internal flag to indicate cpuid requires initialization.
static const int kCpuInit = 0x1;
@@ -35,11 +36,13 @@ static const int kCpuHasSSE42 = 0x100;
static const int kCpuHasAVX = 0x200;
static const int kCpuHasAVX2 = 0x400;
static const int kCpuHasERMS = 0x800;
+static const int kCpuHasFMA3 = 0x1000;
+// 0x2000, 0x4000, 0x8000 reserved for future X86 flags.
// These flags are only valid on MIPS processors.
-static const int kCpuHasMIPS = 0x1000;
-static const int kCpuHasMIPS_DSP = 0x2000;
-static const int kCpuHasMIPS_DSPR2 = 0x4000;
+static const int kCpuHasMIPS = 0x10000;
+static const int kCpuHasMIPS_DSP = 0x20000;
+static const int kCpuHasMIPS_DSPR2 = 0x40000;
// Internal function used to auto-init.
LIBYUV_API
@@ -65,8 +68,10 @@ LIBYUV_API
void MaskCpuFlags(int enable_flags);
// Low level cpuid for X86. Returns zeros on other CPUs.
+// eax is the info type that you want.
+// ecx is typically the cpu number, and should normally be zero.
LIBYUV_API
-void CpuId(int cpu_info[4], int info_type);
+void CpuId(uint32 eax, uint32 ecx, uint32* cpu_info);
#ifdef __cplusplus
} // extern "C"
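The cpu_id.h change above adds kCpuHasFMA3 and moves the MIPS flags to higher bits. A hedged sketch of runtime feature detection follows; it uses TestCpuFlag, which lives in the same header although it is not shown in this hunk, and the helper name is illustrative only:

#include "libyuv/cpu_id.h"

// Minimal sketch: check which optimized paths libyuv could take on this CPU.
// TestCpuFlag() initializes the flag set lazily on first use.
bool CpuSupportsAvx2AndFma3() {
  bool has_avx2 = libyuv::TestCpuFlag(libyuv::kCpuHasAVX2) != 0;
  bool has_fma3 = libyuv::TestCpuFlag(libyuv::kCpuHasFMA3) != 0;  // New 0x1000 flag above.
  return has_avx2 && has_fma3;
}
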
diff --git a/chromium/third_party/libyuv/include/libyuv/mjpeg_decoder.h b/chromium/third_party/libyuv/include/libyuv/mjpeg_decoder.h
index e53c1fe1e2e..7bb82fce146 100644
--- a/chromium/third_party/libyuv/include/libyuv/mjpeg_decoder.h
+++ b/chromium/third_party/libyuv/include/libyuv/mjpeg_decoder.h
@@ -45,7 +45,7 @@ struct SetJmpErrorMgr;
// MJPEG frames.
//
// See http://tools.ietf.org/html/rfc2435
-class MJpegDecoder {
+class LIBYUV_API MJpegDecoder {
public:
typedef void (*CallbackFunction)(void* opaque,
const uint8* const* data,
diff --git a/chromium/third_party/libyuv/include/libyuv/planar_functions.h b/chromium/third_party/libyuv/include/libyuv/planar_functions.h
index cb14678a8b3..1d54ddec147 100644
--- a/chromium/third_party/libyuv/include/libyuv/planar_functions.h
+++ b/chromium/third_party/libyuv/include/libyuv/planar_functions.h
@@ -72,6 +72,7 @@ int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2,
int width, int height);
// Convert UYVY to I422.
+LIBYUV_API
int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
@@ -187,14 +188,27 @@ int ARGBSepia(uint8* dst_argb, int dst_stride_argb,
int x, int y, int width, int height);
// Apply a matrix rotation to each ARGB pixel.
+// matrix_argb is 4 signed ARGB values. -128 to 127 representing -2 to 2.
+// The first 4 coefficients apply to B, G, R, A and produce B of the output.
+// The next 4 coefficients apply to B, G, R, A and produce G of the output.
+// The next 4 coefficients apply to B, G, R, A and produce R of the output.
+// The last 4 coefficients apply to B, G, R, A and produce A of the output.
+LIBYUV_API
+int ARGBColorMatrix(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ const int8* matrix_argb,
+ int width, int height);
+
+// Deprecated. Use ARGBColorMatrix instead.
+// Apply a matrix rotation to each ARGB pixel.
// matrix_argb is 3 signed ARGB values. -128 to 127 representing -1 to 1.
// The first 4 coefficients apply to B, G, R, A and produce B of the output.
// The next 4 coefficients apply to B, G, R, A and produce G of the output.
// The last 4 coefficients apply to B, G, R, A and produce R of the output.
LIBYUV_API
-int ARGBColorMatrix(uint8* dst_argb, int dst_stride_argb,
- const int8* matrix_argb,
- int x, int y, int width, int height);
+int RGBColorMatrix(uint8* dst_argb, int dst_stride_argb,
+ const int8* matrix_rgb,
+ int x, int y, int width, int height);
// Apply a color table each ARGB pixel.
// Table contains 256 ARGB values.
@@ -203,6 +217,36 @@ int ARGBColorTable(uint8* dst_argb, int dst_stride_argb,
const uint8* table_argb,
int x, int y, int width, int height);
+// Apply a color table each ARGB pixel but preserve destination alpha.
+// Table contains 256 ARGB values.
+LIBYUV_API
+int RGBColorTable(uint8* dst_argb, int dst_stride_argb,
+ const uint8* table_argb,
+ int x, int y, int width, int height);
+
+// Apply a luma/color table each ARGB pixel but preserve destination alpha.
+// Table contains 32768 values indexed by [Y][C] where 7 it 7 bit luma from
+// RGB (YJ style) and C is an 8 bit color component (R, G or B).
+LIBYUV_API
+int ARGBLumaColorTable(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ const uint8* luma_rgb_table,
+ int width, int height);
+
+// Apply a 3 term polynomial to ARGB values.
+// poly points to a 4x4 matrix. The first row is constants. The 2nd row is
+// coefficients for b, g, r and a. The 3rd row is coefficients for b squared,
+// g squared, r squared and a squared. The 4rd row is coefficients for b to
+// the 3, g to the 3, r to the 3 and a to the 3. The values are summed and
+// result clamped to 0 to 255.
+// A polynomial approximation can be dirived using software such as 'R'.
+
+LIBYUV_API
+int ARGBPolynomial(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ const float* poly,
+ int width, int height);
+
// Quantize a rectangle of ARGB. Alpha unaffected.
// scale is a 16 bit fractional fixed point scaler between 0 and 65535.
// interval_size should be a value between 1 and 255.
@@ -218,6 +262,18 @@ int ARGBCopy(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb,
int width, int height);
+// Copy ARGB to ARGB.
+LIBYUV_API
+int ARGBCopyAlpha(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Copy ARGB to ARGB.
+LIBYUV_API
+int ARGBCopyYToAlpha(const uint8* src_y, int src_stride_y,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
typedef void (*ARGBBlendRow)(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width);
@@ -288,6 +344,7 @@ int MJPGToARGB(const uint8* sample, size_t sample_size,
uint8* argb, int argb_stride,
int w, int h, int dw, int dh);
+// Internal function - do not call directly.
// Computes table of cumulative sum for image where the value is the sum
// of all values above and to the left of the entry. Used by ARGBBlur.
LIBYUV_API
@@ -296,8 +353,11 @@ int ARGBComputeCumulativeSum(const uint8* src_argb, int src_stride_argb,
int width, int height);
// Blur ARGB image.
-// Caller should allocate dst_cumsum table of width * height * 16 bytes aligned
-// to 16 byte boundary.
+// dst_cumsum table of width * (height + 1) * 16 bytes aligned to
+// 16 byte boundary.
+// dst_stride32_cumsum is number of ints in a row (width * 4).
+// radius is number of pixels around the center. e.g. 1 = 3x3. 2=5x5.
+// Blur is optimized for radius of 5 (11x11) or less.
LIBYUV_API
int ARGBBlur(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb,
@@ -347,6 +407,12 @@ int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra,
uint8* dst_argb, int dst_stride_argb,
const uint8* shuffler, int width, int height);
+// Sobel ARGB effect with planar output.
+LIBYUV_API
+int ARGBSobelToPlane(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_y, int dst_stride_y,
+ int width, int height);
+
// Sobel ARGB effect.
LIBYUV_API
int ARGBSobel(const uint8* src_argb, int src_stride_argb,
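The planar_functions.h hunk above documents the new ARGBColorMatrix signature: 16 signed coefficients where -128..127 represents roughly -2..2, so 64 stands for 1.0. A hedged sketch of a pass-through call under that reading; the frame layout and helper name are assumptions, not part of the diff:

#include "libyuv/planar_functions.h"

// Hedged sketch: identity ARGBColorMatrix call using the new 16-coefficient
// signature shown above. 64 represents 1.0 per the header comment. Packed
// 4-byte-per-pixel rows are assumed.
int ApplyIdentityColorMatrix(const uint8* src_argb, uint8* dst_argb,
                             int width, int height) {
  static const int8 kIdentity[16] = {
    64, 0, 0, 0,   // output B = 1.0 * B
    0, 64, 0, 0,   // output G = 1.0 * G
    0, 0, 64, 0,   // output R = 1.0 * R
    0, 0, 0, 64,   // output A = 1.0 * A
  };
  return libyuv::ARGBColorMatrix(src_argb, width * 4,
                                 dst_argb, width * 4,
                                 kIdentity, width, height);
}
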
diff --git a/chromium/third_party/libyuv/include/libyuv/row.h b/chromium/third_party/libyuv/include/libyuv/row.h
index 3416661742f..b6056fdca9b 100644
--- a/chromium/third_party/libyuv/include/libyuv/row.h
+++ b/chromium/third_party/libyuv/include/libyuv/row.h
@@ -38,32 +38,66 @@ extern "C" {
// The following are available on all x86 platforms:
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
-// Conversions.
+// Effects:
+#define HAS_ARGBADDROW_SSE2
+#define HAS_ARGBAFFINEROW_SSE2
+#define HAS_ARGBATTENUATEROW_SSSE3
+#define HAS_ARGBBLENDROW_SSSE3
+#define HAS_ARGBCOLORMATRIXROW_SSSE3
+#define HAS_ARGBCOLORTABLEROW_X86
+#define HAS_ARGBCOPYALPHAROW_SSE2
+#define HAS_ARGBCOPYYTOALPHAROW_SSE2
+#define HAS_ARGBGRAYROW_SSSE3
+#define HAS_ARGBLUMACOLORTABLEROW_SSSE3
+#define HAS_ARGBMIRRORROW_SSSE3
+#define HAS_ARGBMULTIPLYROW_SSE2
+#define HAS_ARGBPOLYNOMIALROW_SSE2
+#define HAS_ARGBQUANTIZEROW_SSE2
+#define HAS_ARGBSEPIAROW_SSSE3
+#define HAS_ARGBSHADEROW_SSE2
+#define HAS_ARGBSUBTRACTROW_SSE2
+#define HAS_ARGBTOUVROW_SSSE3
+#define HAS_ARGBUNATTENUATEROW_SSE2
+#define HAS_COMPUTECUMULATIVESUMROW_SSE2
+#define HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
+#define HAS_INTERPOLATEROW_SSE2
+#define HAS_INTERPOLATEROW_SSSE3
+#define HAS_RGBCOLORTABLEROW_X86
+#define HAS_SOBELROW_SSE2
+#define HAS_SOBELTOPLANEROW_SSE2
+#define HAS_SOBELXROW_SSE2
+#define HAS_SOBELXYROW_SSE2
+#define HAS_SOBELYROW_SSE2
+
+// Conversions:
#define HAS_ABGRTOUVROW_SSSE3
#define HAS_ABGRTOYROW_SSSE3
#define HAS_ARGB1555TOARGBROW_SSE2
#define HAS_ARGB4444TOARGBROW_SSE2
+#define HAS_ARGBSHUFFLEROW_SSE2
#define HAS_ARGBSHUFFLEROW_SSSE3
#define HAS_ARGBTOARGB1555ROW_SSE2
#define HAS_ARGBTOARGB4444ROW_SSE2
+#define HAS_ARGBTOBAYERGGROW_SSE2
#define HAS_ARGBTOBAYERROW_SSSE3
#define HAS_ARGBTORAWROW_SSSE3
#define HAS_ARGBTORGB24ROW_SSSE3
#define HAS_ARGBTORGB565ROW_SSE2
#define HAS_ARGBTOUV422ROW_SSSE3
#define HAS_ARGBTOUV444ROW_SSSE3
-#define HAS_ARGBTOUVROW_SSSE3
#define HAS_ARGBTOUVJROW_SSSE3
-#define HAS_ARGBTOYROW_SSSE3
#define HAS_ARGBTOYJROW_SSSE3
+#define HAS_ARGBTOYROW_SSSE3
#define HAS_BGRATOUVROW_SSSE3
#define HAS_BGRATOYROW_SSSE3
+#define HAS_COPYROW_ERMS
#define HAS_COPYROW_SSE2
#define HAS_COPYROW_X86
-#define HAS_COPYROW_ERMS
+#define HAS_FIXEDDIV_X86
#define HAS_HALFROW_SSE2
#define HAS_I400TOARGBROW_SSE2
#define HAS_I411TOARGBROW_SSSE3
+#define HAS_I422TOARGB1555ROW_SSSE3
#define HAS_I422TOABGRROW_SSSE3
#define HAS_I422TOARGB1555ROW_SSSE3
#define HAS_I422TOARGB4444ROW_SSSE3
@@ -77,7 +111,9 @@ extern "C" {
#define HAS_I422TOYUY2ROW_SSE2
#define HAS_I444TOARGBROW_SSSE3
#define HAS_MERGEUVROW_SSE2
+#define HAS_MIRRORROW_SSE2
#define HAS_MIRRORROW_SSSE3
+#define HAS_MIRRORROW_UV_SSSE3
#define HAS_MIRRORUVROW_SSSE3
#define HAS_NV12TOARGBROW_SSSE3
#define HAS_NV12TORGB565ROW_SSSE3
@@ -101,43 +137,48 @@ extern "C" {
#define HAS_YUY2TOUV422ROW_SSE2
#define HAS_YUY2TOUVROW_SSE2
#define HAS_YUY2TOYROW_SSE2
-
-// Effects
-#define HAS_ARGBADDROW_SSE2
-#define HAS_ARGBAFFINEROW_SSE2
-#define HAS_ARGBATTENUATEROW_SSSE3
-#define HAS_ARGBBLENDROW_SSSE3
-#define HAS_ARGBCOLORMATRIXROW_SSSE3
-#define HAS_ARGBGRAYROW_SSSE3
-#define HAS_ARGBMIRRORROW_SSSE3
-#define HAS_ARGBMULTIPLYROW_SSE2
-#define HAS_ARGBQUANTIZEROW_SSE2
-#define HAS_ARGBSEPIAROW_SSSE3
-#define HAS_ARGBSHADEROW_SSE2
-#define HAS_ARGBSUBTRACTROW_SSE2
-#define HAS_ARGBUNATTENUATEROW_SSE2
-#define HAS_COMPUTECUMULATIVESUMROW_SSE2
-#define HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
-#define HAS_INTERPOLATEROW_SSE2
-#define HAS_INTERPOLATEROW_SSSE3
-#define HAS_SOBELROW_SSE2
-#define HAS_SOBELXROW_SSSE3
-#define HAS_SOBELXYROW_SSE2
-#define HAS_SOBELYROW_SSSE3
#endif
-// The following are Windows only.
-// TODO(fbarchard): Port to gcc.
-#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
-#define HAS_ARGBCOLORTABLEROW_X86
+// GCC >= 4.7.0 required for AVX2.
+#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
+#if (__GNUC__ > 4) || (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7))
+#define GCC_HAS_AVX2 1
+#endif // GNUC >= 4.7
+#endif // __GNUC__
+
+// clang >= 3.4.0 required for AVX2.
+#if defined(__clang__) && (defined(__x86_64__) || defined(__i386__))
+#if (__clang_major__ > 3) || (__clang_major__ == 3 && (__clang_minor__ >= 4))
+#define CLANG_HAS_AVX2 1
+#endif // clang >= 3.4
+#endif // __clang__
+
// Visual C 2012 required for AVX2.
-#if _MSC_VER >= 1700
+#if defined(_M_IX86) && defined(_MSC_VER) && _MSC_VER >= 1700
+#define VISUALC_HAS_AVX2 1
+#endif // VisualStudio >= 2012
+
+// The following are available on all x86 platforms, but
+// require VS2012, clang 3.4 or gcc 4.7.
+// The code supports NaCL but requires a new compiler and validator.
+#if !defined(LIBYUV_DISABLE_X86) && (defined(VISUALC_HAS_AVX2) || \
+ defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2))
+// Effects:
+#define HAS_ARGBPOLYNOMIALROW_AVX2
#define HAS_ARGBSHUFFLEROW_AVX2
+#define HAS_ARGBCOPYALPHAROW_AVX2
+#define HAS_ARGBCOPYYTOALPHAROW_AVX2
+#endif
+
+// The following are require VS2012.
+// TODO(fbarchard): Port to gcc.
+#if !defined(LIBYUV_DISABLE_X86) && defined(VISUALC_HAS_AVX2)
#define HAS_ARGBTOUVROW_AVX2
#define HAS_ARGBTOYJROW_AVX2
#define HAS_ARGBTOYROW_AVX2
#define HAS_HALFROW_AVX2
#define HAS_I422TOARGBROW_AVX2
+#define HAS_INTERPOLATEROW_AVX2
#define HAS_MERGEUVROW_AVX2
#define HAS_MIRRORROW_AVX2
#define HAS_SPLITUVROW_AVX2
@@ -148,17 +189,16 @@ extern "C" {
#define HAS_YUY2TOUVROW_AVX2
#define HAS_YUY2TOYROW_AVX2
-// Effects
+// Effects:
#define HAS_ARGBADDROW_AVX2
#define HAS_ARGBATTENUATEROW_AVX2
#define HAS_ARGBMIRRORROW_AVX2
#define HAS_ARGBMULTIPLYROW_AVX2
#define HAS_ARGBSUBTRACTROW_AVX2
#define HAS_ARGBUNATTENUATEROW_AVX2
-#endif
-#endif
+#endif // defined(VISUALC_HAS_AVX2)
-// The following are Yasm x86 only.
+// The following are Yasm x86 only:
// TODO(fbarchard): Port AVX2 to inline.
#if !defined(LIBYUV_DISABLE_X86) && defined(HAVE_YASM)
(defined(_M_IX86) || defined(_M_X64) || \
@@ -177,12 +217,12 @@ extern "C" {
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) && \
!defined(LIBYUV_SSSE3_ONLY)
-#define HAS_ARGBATTENUATEROW_SSE2
#define HAS_ARGBBLENDROW_SSE2
+#define HAS_ARGBATTENUATEROW_SSE2
#define HAS_MIRRORROW_SSE2
#endif
-// The following are available on Neon platforms
+// The following are available on Neon platforms:
#if !defined(LIBYUV_DISABLE_NEON) && \
(defined(__ARM_NEON__) || defined(LIBYUV_NEON))
#define HAS_ABGRTOUVROW_NEON
@@ -255,7 +295,7 @@ extern "C" {
#define HAS_YUY2TOUVROW_NEON
#define HAS_YUY2TOYROW_NEON
-// Effects
+// Effects:
#define HAS_ARGBADDROW_NEON
#define HAS_ARGBATTENUATEROW_NEON
#define HAS_ARGBBLENDROW_NEON
@@ -268,13 +308,14 @@ extern "C" {
#define HAS_ARGBSHADEROW_NEON
#define HAS_ARGBSUBTRACTROW_NEON
#define HAS_SOBELROW_NEON
+#define HAS_SOBELTOPLANEROW_NEON
#define HAS_SOBELXYROW_NEON
#define HAS_SOBELXROW_NEON
#define HAS_SOBELYROW_NEON
#define HAS_INTERPOLATEROW_NEON
#endif
-// The following are available on Mips platforms
+// The following are available on Mips platforms:
#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips__)
#define HAS_COPYROW_MIPS
#if defined(__mips_dsp) && (__mips_dsp_rev >= 2)
@@ -304,6 +345,7 @@ typedef __declspec(align(32)) uint32 ulvec32[8];
typedef __declspec(align(32)) uint8 ulvec8[32];
#elif defined(__GNUC__)
+// Caveat GCC 4.2 to 4.7 have a known issue using vectors with const.
#define SIMD_ALIGNED(var) var __attribute__((aligned(16)))
typedef int16 __attribute__((vector_size(16))) vec16;
typedef int32 __attribute__((vector_size(16))) vec32;
@@ -327,6 +369,14 @@ typedef uint8 uvec8[16];
#define OMITFP __attribute__((optimize("omit-frame-pointer")))
#endif
+// For functions that use rowbuffer and have runtime checks for overflow,
+// use SAFEBUFFERS to avoid additional check.
+#if defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219)
+#define SAFEBUFFERS __declspec(safebuffers)
+#else
+#define SAFEBUFFERS
+#endif
+
void I444ToARGBRow_NEON(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
@@ -655,6 +705,14 @@ void CopyRow_NEON(const uint8* src, uint8* dst, int count);
void CopyRow_MIPS(const uint8* src, uint8* dst, int count);
void CopyRow_C(const uint8* src, uint8* dst, int count);
+void ARGBCopyAlphaRow_C(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBCopyAlphaRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBCopyAlphaRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width);
+
+void ARGBCopyYToAlphaRow_C(const uint8* src_y, uint8* dst_argb, int width);
+void ARGBCopyYToAlphaRow_SSE2(const uint8* src_y, uint8* dst_argb, int width);
+void ARGBCopyYToAlphaRow_AVX2(const uint8* src_y, uint8* dst_argb, int width);
+
void SetRow_X86(uint8* dst, uint32 v32, int count);
void ARGBSetRows_X86(uint8* dst, uint32 v32, int width,
int dst_stride, int height);
@@ -668,6 +726,8 @@ void ARGBSetRows_C(uint8* dst, uint32 v32, int width, int dst_stride,
// ARGBShufflers for BGRAToARGB etc.
void ARGBShuffleRow_C(const uint8* src_argb, uint8* dst_argb,
const uint8* shuffler, int pix);
+void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
+ const uint8* shuffler, int pix);
void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
const uint8* shuffler, int pix);
void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
@@ -676,6 +736,8 @@ void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
const uint8* shuffler, int pix);
void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb,
const uint8* shuffler, int pix);
+void ARGBShuffleRow_Any_SSE2(const uint8* src_argb, uint8* dst_argb,
+ const uint8* shuffler, int pix);
void ARGBShuffleRow_Any_SSSE3(const uint8* src_argb, uint8* dst_argb,
const uint8* shuffler, int pix);
void ARGBShuffleRow_Any_AVX2(const uint8* src_argb, uint8* dst_argb,
@@ -1338,8 +1400,16 @@ void ARGBToBayerRow_Any_SSSE3(const uint8* src_argb, uint8* dst_bayer,
uint32 selector, int pix);
void ARGBToBayerRow_Any_NEON(const uint8* src_argb, uint8* dst_bayer,
uint32 selector, int pix);
+void ARGBToBayerGGRow_C(const uint8* src_argb, uint8* dst_bayer,
+ uint32 /* selector */, int pix);
+void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer,
+ uint32 /* selector */, int pix);
void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer,
uint32 /* selector */, int pix);
+void ARGBToBayerGGRow_Any_SSE2(const uint8* src_argb, uint8* dst_bayer,
+ uint32 /* selector */, int pix);
+void ARGBToBayerGGRow_Any_NEON(const uint8* src_argb, uint8* dst_bayer,
+ uint32 /* selector */, int pix);
void I422ToYUY2Row_C(const uint8* src_y,
const uint8* src_u,
@@ -1398,7 +1468,7 @@ void ARGBAttenuateRow_Any_NEON(const uint8* src_argb, uint8* dst_argb,
int width);
// Inverse table for unattenuate, shared by C and SSE2.
-extern uint32 fixed_invtbl8[256];
+extern const uint32 fixed_invtbl8[256];
void ARGBUnattenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width);
void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width);
void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width);
@@ -1415,15 +1485,19 @@ void ARGBSepiaRow_C(uint8* dst_argb, int width);
void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width);
void ARGBSepiaRow_NEON(uint8* dst_argb, int width);
-void ARGBColorMatrixRow_C(uint8* dst_argb, const int8* matrix_argb, int width);
-void ARGBColorMatrixRow_SSSE3(uint8* dst_argb, const int8* matrix_argb,
- int width);
-void ARGBColorMatrixRow_NEON(uint8* dst_argb, const int8* matrix_argb,
- int width);
+void ARGBColorMatrixRow_C(const uint8* src_argb, uint8* dst_argb,
+ const int8* matrix_argb, int width);
+void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
+ const int8* matrix_argb, int width);
+void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
+ const int8* matrix_argb, int width);
void ARGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width);
void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width);
+void RGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width);
+void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width);
+
void ARGBQuantizeRow_C(uint8* dst_argb, int scale, int interval_size,
int interval_offset, int width);
void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
@@ -1466,6 +1540,9 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
ptrdiff_t src_stride_ptr, int width,
int source_y_fraction);
+void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride_ptr, int width,
+ int source_y_fraction);
void InterpolateRow_NEON(uint8* dst_ptr, const uint8* src_ptr,
ptrdiff_t src_stride_ptr, int width,
int source_y_fraction);
@@ -1487,6 +1564,9 @@ void InterpolateRow_Any_SSE2(uint8* dst_ptr, const uint8* src_ptr,
void InterpolateRow_Any_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
ptrdiff_t src_stride_ptr, int width,
int source_y_fraction);
+void InterpolateRow_Any_AVX2(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride_ptr, int width,
+ int source_y_fraction);
void InterpolateRows_Any_MIPS_DSPR2(uint8* dst_ptr, const uint8* src_ptr,
ptrdiff_t src_stride_ptr, int width,
int source_y_fraction);
@@ -1494,14 +1574,14 @@ void InterpolateRows_Any_MIPS_DSPR2(uint8* dst_ptr, const uint8* src_ptr,
// Sobel images.
void SobelXRow_C(const uint8* src_y0, const uint8* src_y1, const uint8* src_y2,
uint8* dst_sobelx, int width);
-void SobelXRow_SSSE3(const uint8* src_y0, const uint8* src_y1,
- const uint8* src_y2, uint8* dst_sobelx, int width);
+void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
+ const uint8* src_y2, uint8* dst_sobelx, int width);
void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
const uint8* src_y2, uint8* dst_sobelx, int width);
void SobelYRow_C(const uint8* src_y0, const uint8* src_y1,
uint8* dst_sobely, int width);
-void SobelYRow_SSSE3(const uint8* src_y0, const uint8* src_y1,
- uint8* dst_sobely, int width);
+void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
+ uint8* dst_sobely, int width);
void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
uint8* dst_sobely, int width);
void SobelRow_C(const uint8* src_sobelx, const uint8* src_sobely,
@@ -1510,6 +1590,12 @@ void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
uint8* dst_argb, int width);
void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
uint8* dst_argb, int width);
+void SobelToPlaneRow_C(const uint8* src_sobelx, const uint8* src_sobely,
+ uint8* dst_y, int width);
+void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
+ uint8* dst_y, int width);
+void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
+ uint8* dst_y, int width);
void SobelXYRow_C(const uint8* src_sobelx, const uint8* src_sobely,
uint8* dst_argb, int width);
void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
@@ -1517,6 +1603,31 @@ void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
uint8* dst_argb, int width);
+void ARGBPolynomialRow_C(const uint8* src_argb,
+ uint8* dst_argb, const float* poly,
+ int width);
+void ARGBPolynomialRow_SSE2(const uint8* src_argb,
+ uint8* dst_argb, const float* poly,
+ int width);
+void ARGBPolynomialRow_AVX2(const uint8* src_argb,
+ uint8* dst_argb, const float* poly,
+ int width);
+
+void ARGBLumaColorTableRow_C(const uint8* src_argb, uint8* dst_argb, int width,
+ const uint8* luma, const uint32 lumacoeff);
+void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
+ int width, const uint8* luma,
+ const uint32 lumacoeff);
+
+// Divide num by div and return as 16.16 fixed point result.
+int FixedDiv_C(int num, int div);
+int FixedDiv_X86(int num, int div);
+#ifdef HAS_FIXEDDIV_X86
+#define FixedDiv FixedDiv_X86
+#else
+#define FixedDiv FixedDiv_C
+#endif
+
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
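row.h above gains FixedDiv, described as returning num/div in 16.16 fixed point. A hedged sketch of a plain C++ equivalent of that contract follows; the real FixedDiv_C and FixedDiv_X86 bodies are not part of this diff, and the function name below is illustrative:

#include "libyuv/basic_types.h"

// Hedged sketch of a 16.16 fixed-point divide matching the FixedDiv contract
// declared above. A 64-bit intermediate avoids overflow for large numerators.
static int FixedDiv16_16(int num, int div) {
  // Shift the numerator up by 16 bits before dividing so the quotient
  // keeps 16 fractional bits.
  return static_cast<int>((static_cast<int64>(num) << 16) / div);
}

// Example: FixedDiv16_16(3, 4) == 0x0000C000, i.e. 0.75 in 16.16 fixed point.
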
diff --git a/chromium/third_party/libyuv/include/libyuv/scale.h b/chromium/third_party/libyuv/include/libyuv/scale.h
index b1efc95d2fd..b672dbfcee8 100644
--- a/chromium/third_party/libyuv/include/libyuv/scale.h
+++ b/chromium/third_party/libyuv/include/libyuv/scale.h
@@ -18,11 +18,12 @@ namespace libyuv {
extern "C" {
#endif
-// Supported filtering
+// Supported filtering.
enum FilterMode {
kFilterNone = 0, // Point sample; Fastest.
- kFilterBilinear = 1, // Faster than box, but lower quality scaling down.
- kFilterBox = 2 // Highest quality.
+ kFilterLinear = 1, // Filter horizontally only.
+ kFilterBilinear = 2, // Faster than box, but lower quality scaling down.
+ kFilterBox = 3 // Highest quality.
};
// Scale a YUV plane.
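scale.h above inserts kFilterLinear into FilterMode, shifting kFilterBilinear and kFilterBox to 2 and 3, so callers must use the symbolic names rather than stored integer values. A hedged sketch of the enum in use with I420Scale, which is declared later in this header but not shown in the hunk; dimensions, strides, and the helper name are assumptions:

#include "libyuv/scale.h"

// Minimal sketch: downscale an I420 frame to half size with bilinear
// filtering. Packed plane strides are assumed.
int HalveI420(const uint8* src_y, const uint8* src_u, const uint8* src_v,
              uint8* dst_y, uint8* dst_u, uint8* dst_v,
              int src_width, int src_height) {
  int dst_width = src_width / 2;
  int dst_height = src_height / 2;
  return libyuv::I420Scale(src_y, src_width,
                           src_u, src_width / 2,
                           src_v, src_width / 2,
                           src_width, src_height,
                           dst_y, dst_width,
                           dst_u, dst_width / 2,
                           dst_v, dst_width / 2,
                           dst_width, dst_height,
                           libyuv::kFilterBilinear);
}
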
diff --git a/chromium/third_party/libyuv/include/libyuv/scale_argb.h b/chromium/third_party/libyuv/include/libyuv/scale_argb.h
index b6f510522e7..0c9b3625757 100644
--- a/chromium/third_party/libyuv/include/libyuv/scale_argb.h
+++ b/chromium/third_party/libyuv/include/libyuv/scale_argb.h
@@ -35,6 +35,20 @@ int ARGBScaleClip(const uint8* src_argb, int src_stride_argb,
int clip_x, int clip_y, int clip_width, int clip_height,
enum FilterMode filtering);
+// TODO(fbarchard): Implement this.
+// Scale with YUV conversion to ARGB and clipping.
+LIBYUV_API
+int YUVToARGBScaleClip(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint32 src_fourcc,
+ int src_width, int src_height,
+ uint8* dst_argb, int dst_stride_argb,
+ uint32 dst_fourcc,
+ int dst_width, int dst_height,
+ int clip_x, int clip_y, int clip_width, int clip_height,
+ enum FilterMode filtering);
+
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
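scale_argb.h above adds the clipping variant YUVToARGBScaleClip, still marked TODO. For the plain ARGB case, a hedged sketch using ARGBScale — declared elsewhere in this header, not shown in the hunk — with packed rows and the helper name assumed:

#include "libyuv/scale_argb.h"

// Minimal sketch: downscale an ARGB image to half size with box filtering.
// Strides assume 4 bytes per pixel and no row padding.
int HalveArgb(const uint8* src_argb, int src_width, int src_height,
              uint8* dst_argb) {
  int dst_width = src_width / 2;
  int dst_height = src_height / 2;
  return libyuv::ARGBScale(src_argb, src_width * 4, src_width, src_height,
                           dst_argb, dst_width * 4, dst_width, dst_height,
                           libyuv::kFilterBox);
}
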
diff --git a/chromium/third_party/libyuv/include/libyuv/scale_row.h b/chromium/third_party/libyuv/include/libyuv/scale_row.h
new file mode 100644
index 00000000000..23c4e90791f
--- /dev/null
+++ b/chromium/third_party/libyuv/include/libyuv/scale_row.h
@@ -0,0 +1,273 @@
+/*
+ * Copyright 2013 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_SCALE_ROW_H_ // NOLINT
+#define INCLUDE_LIBYUV_SCALE_ROW_H_
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// The following are available on all x86 platforms:
+#if !defined(LIBYUV_DISABLE_X86) && \
+ (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
+#define HAS_SCALEROWDOWN2_SSE2
+#define HAS_SCALEROWDOWN4_SSE2
+#define HAS_SCALEROWDOWN34_SSSE3
+#define HAS_SCALEROWDOWN38_SSSE3
+#define HAS_SCALEADDROWS_SSE2
+#define HAS_SCALEFILTERCOLS_SSSE3
+#define HAS_SCALECOLSUP2_SSE2
+#define HAS_SCALEARGBROWDOWN2_SSE2
+#define HAS_SCALEARGBROWDOWNEVEN_SSE2
+#define HAS_SCALEARGBCOLS_SSE2
+#define HAS_SCALEARGBFILTERCOLS_SSSE3
+#define HAS_SCALEARGBCOLSUP2_SSE2
+#endif
+
+// The following are available on Neon platforms:
+#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
+ (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
+#define HAS_SCALEROWDOWN2_NEON
+#define HAS_SCALEROWDOWN4_NEON
+#define HAS_SCALEROWDOWN34_NEON
+#define HAS_SCALEROWDOWN38_NEON
+#define HAS_SCALEARGBROWDOWNEVEN_NEON
+#define HAS_SCALEARGBROWDOWN2_NEON
+#endif
+
+// The following are available on Mips platforms:
+#if !defined(LIBYUV_DISABLE_MIPS) && !defined(__native_client__) && \
+ defined(__mips__)
+#define HAS_SCALEROWDOWN2_MIPS_DSPR2
+#define HAS_SCALEROWDOWN4_MIPS_DSPR2
+#define HAS_SCALEROWDOWN34_MIPS_DSPR2
+#define HAS_SCALEROWDOWN38_MIPS_DSPR2
+#endif
+
+// Scale ARGB vertically with bilinear interpolation.
+void ScalePlaneVertical(int src_height,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint8* src_argb, uint8* dst_argb,
+ int x, int y, int dy,
+ int bpp, FilterMode filtering);
+
+// Simplify the filtering based on scale factors.
+FilterMode ScaleFilterReduce(int src_width, int src_height,
+ int dst_width, int dst_height,
+ FilterMode filtering);
+
+// Compute slope values for stepping.
+void ScaleSlope(int src_width, int src_height,
+ int dst_width, int dst_height,
+ FilterMode filtering,
+ int* x, int* y, int* dx, int* dy);
+
+void ScaleRowDown2_C(const uint8* src_ptr, ptrdiff_t /* src_stride */,
+ uint8* dst, int dst_width);
+void ScaleRowDown2Linear_C(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width);
+void ScaleRowDown2Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width);
+void ScaleRowDown4_C(const uint8* src_ptr, ptrdiff_t /* src_stride */,
+ uint8* dst, int dst_width);
+void ScaleRowDown4Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width);
+void ScaleRowDown34_C(const uint8* src_ptr, ptrdiff_t /* src_stride */,
+ uint8* dst, int dst_width);
+void ScaleRowDown34_0_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* d, int dst_width);
+void ScaleRowDown34_1_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* d, int dst_width);
+void ScaleCols_C(uint8* dst_ptr, const uint8* src_ptr,
+ int dst_width, int x, int dx);
+void ScaleColsUp2_C(uint8* dst_ptr, const uint8* src_ptr,
+ int dst_width, int, int);
+void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr,
+ int dst_width, int x, int dx);
+void ScaleRowDown38_C(const uint8* src_ptr, ptrdiff_t /* src_stride */,
+ uint8* dst, int dst_width);
+void ScaleRowDown38_3_Box_C(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+void ScaleRowDown38_2_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+void ScaleAddRows_C(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint16* dst_ptr, int src_width, int src_height);
+void ScaleARGBRowDown2_C(const uint8* src_argb,
+ ptrdiff_t /* src_stride */,
+ uint8* dst_argb, int dst_width);
+void ScaleARGBRowDown2Linear_C(const uint8* src_argb,
+ ptrdiff_t /* src_stride */,
+ uint8* dst_argb, int dst_width);
+void ScaleARGBRowDown2Box_C(const uint8* src_argb, ptrdiff_t src_stride,
+ uint8* dst_argb, int dst_width);
+void ScaleARGBRowDownEven_C(const uint8* src_argb, ptrdiff_t /* src_stride */,
+ int src_stepx,
+ uint8* dst_argb, int dst_width);
+void ScaleARGBRowDownEvenBox_C(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8* dst_argb, int dst_width);
+void ScaleARGBCols_C(uint8* dst_argb, const uint8* src_argb,
+ int dst_width, int x, int dx);
+void ScaleARGBColsUp2_C(uint8* dst_argb, const uint8* src_argb,
+ int dst_width, int, int);
+void ScaleARGBFilterCols_C(uint8* dst_argb, const uint8* src_argb,
+ int dst_width, int x, int dx);
+
+void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr, ptrdiff_t,
+ uint8* dst_ptr, int dst_width);
+void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint16* dst_ptr, int src_width,
+ int src_height);
+void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
+ int dst_width, int x, int dx);
+void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
+ int dst_width, int /* x */, int /* dx */);
+void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
+ ptrdiff_t /* src_stride */,
+ uint8* dst_argb, int dst_width);
+void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
+ ptrdiff_t /* src_stride */,
+ uint8* dst_argb, int dst_width);
+void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ uint8* dst_argb, int dst_width);
+void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
+ int src_stepx,
+ uint8* dst_argb, int dst_width);
+void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8* dst_argb, int dst_width);
+void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
+ int dst_width, int x, int dx);
+void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
+ int dst_width, int x, int dx);
+void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
+ int dst_width, int /* x */, int /* dx */);
+// Row functions.
+void ScaleARGBRowDownEven_NEON(const uint8* src_argb, int src_stride,
+ int src_stepx,
+ uint8* dst_argb, int dst_width);
+void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, int src_stride,
+ int src_stepx,
+ uint8* dst_argb, int dst_width);
+void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */,
+ uint8* dst, int dst_width);
+void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width);
+
+// ScaleRowDown2Box also used by planar functions
+// NEON downscalers with interpolation.
+
+// Note - not static due to reuse in convert for 444 to 420.
+void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */,
+ uint8* dst, int dst_width);
+
+void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width);
+
+void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */,
+ uint8* dst_ptr, int dst_width);
+void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+
+// Down scale from 4 to 3 pixels. Use the neon multilane read/write
+// to load up the every 4th pixel into a 4 different registers.
+// Point samples 32 pixels to 24 pixels.
+void ScaleRowDown34_NEON(const uint8* src_ptr,
+ ptrdiff_t /* src_stride */,
+ uint8* dst_ptr, int dst_width);
+void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+
+// 32 -> 12
+void ScaleRowDown38_NEON(const uint8* src_ptr,
+ ptrdiff_t /* src_stride */,
+ uint8* dst_ptr, int dst_width);
+// 32x3 -> 12x1
+void ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+// 32x2 -> 12x1
+void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+
+void ScaleRowDown2_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */,
+ uint8* dst, int dst_width);
+void ScaleRowDown2Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width);
+void ScaleRowDown4_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */,
+ uint8* dst, int dst_width);
+void ScaleRowDown4Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width);
+void ScaleRowDown34_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */,
+ uint8* dst, int dst_width);
+void ScaleRowDown34_0_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* d, int dst_width);
+void ScaleRowDown34_1_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* d, int dst_width);
+void ScaleRowDown38_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */,
+ uint8* dst, int dst_width);
+void ScaleRowDown38_2_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+void ScaleRowDown38_3_Box_MIPS_DSPR2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width);
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_SCALE_ROW_H_ NOLINT
diff --git a/chromium/third_party/libyuv/include/libyuv/version.h b/chromium/third_party/libyuv/include/libyuv/version.h
index 31cf78fc591..3bb834f9448 100644
--- a/chromium/third_party/libyuv/include/libyuv/version.h
+++ b/chromium/third_party/libyuv/include/libyuv/version.h
@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 723
+#define LIBYUV_VERSION 911
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
diff --git a/chromium/third_party/libyuv/libyuv.gyp b/chromium/third_party/libyuv/libyuv.gyp
index ad6b78b5c3e..4130bd0d3f0 100644
--- a/chromium/third_party/libyuv/libyuv.gyp
+++ b/chromium/third_party/libyuv/libyuv.gyp
@@ -74,6 +74,7 @@
'include/libyuv/row.h',
'include/libyuv/scale.h',
'include/libyuv/scale_argb.h',
+ 'include/libyuv/scale_row.h',
'include/libyuv/version.h',
'include/libyuv/video_common.h',
@@ -107,9 +108,11 @@
'source/row_win.cc',
'source/scale.cc',
'source/scale_argb.cc',
- 'source/scale_argb_neon.cc',
+ 'source/scale_common.cc',
'source/scale_mips.cc',
'source/scale_neon.cc',
+ 'source/scale_posix.cc',
+ 'source/scale_win.cc',
'source/video_common.cc',
],
},
diff --git a/chromium/third_party/libyuv/libyuv_test.gyp b/chromium/third_party/libyuv/libyuv_test.gyp
index 447881a4480..906fc5f8b0d 100644
--- a/chromium/third_party/libyuv/libyuv_test.gyp
+++ b/chromium/third_party/libyuv/libyuv_test.gyp
@@ -35,6 +35,7 @@
'unit_test/compare_test.cc',
'unit_test/convert_test.cc',
'unit_test/cpu_test.cc',
+ 'unit_test/math_test.cc',
'unit_test/planar_test.cc',
'unit_test/rotate_argb_test.cc',
'unit_test/rotate_test.cc',
diff --git a/chromium/third_party/libyuv/linux.mk b/chromium/third_party/libyuv/linux.mk
new file mode 100644
index 00000000000..5d12135a85d
--- /dev/null
+++ b/chromium/third_party/libyuv/linux.mk
@@ -0,0 +1,48 @@
+# This is a generic makefile for libyuv for gcc.
+# make -f linux.mk CC=clang++
+
+CC=g++
+CCFLAGS=-O2 -fomit-frame-pointer -Iinclude/
+
+LOCAL_OBJ_FILES := \
+ source/compare.o \
+ source/compare_common.o \
+ source/compare_posix.o \
+ source/convert.o \
+ source/convert_argb.o \
+ source/convert_from.o \
+ source/convert_from_argb.o \
+ source/convert_to_argb.o \
+ source/convert_to_i420.o \
+ source/cpu_id.o \
+ source/format_conversion.o \
+ source/planar_functions.o \
+ source/rotate.o \
+ source/rotate_argb.o \
+ source/rotate_mips.o \
+ source/row_any.o \
+ source/row_common.o \
+ source/row_mips.o \
+ source/row_posix.o \
+ source/scale.o \
+ source/scale_argb.o \
+ source/scale_common.o \
+ source/scale_mips.o \
+ source/scale_posix.o \
+ source/video_common.o
+
+.cc.o:
+ $(CC) -c $(CCFLAGS) $*.cc -o $*.o
+
+all: libyuv.a convert linux.mk
+
+libyuv.a: $(LOCAL_OBJ_FILES) linux.mk
+ $(AR) $(ARFLAGS) -o $@ $(LOCAL_OBJ_FILES)
+
+# A test utility that uses libyuv conversion.
+convert: util/convert.cc linux.mk
+ $(CC) $(CCFLAGS) -Iutil/ -o $@ util/convert.cc libyuv.a
+
+clean:
+ /bin/rm -f source/*.o *.ii *.s libyuv.a convert
+
diff --git a/chromium/third_party/libyuv/source/compare.cc b/chromium/third_party/libyuv/source/compare.cc
index f8b358309e5..7d844ee08a6 100644
--- a/chromium/third_party/libyuv/source/compare.cc
+++ b/chromium/third_party/libyuv/source/compare.cc
@@ -30,12 +30,17 @@ extern "C" {
uint32 HashDjb2_C(const uint8* src, int count, uint32 seed);
// This module is for Visual C x86
-#if !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || \
+#if !defined(LIBYUV_DISABLE_X86) && \
+ (defined(_M_IX86) || \
(defined(__x86_64__) || (defined(__i386__) && !defined(__pic__))))
#define HAS_HASHDJB2_SSE41
-
uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed);
+#if _MSC_VER >= 1700
+#define HAS_HASHDJB2_AVX2
+uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed);
+#endif
+
#endif // HAS_HASHDJB2_SSE41
// hash seed of 5381 recommended.
@@ -47,6 +52,11 @@ uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed) {
HashDjb2_SSE = HashDjb2_SSE41;
}
#endif
+#if defined(HAS_HASHDJB2_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ HashDjb2_SSE = HashDjb2_AVX2;
+ }
+#endif
const int kBlockSize = 1 << 15; // 32768;
while (count >= static_cast<uint64>(kBlockSize)) {
@@ -73,8 +83,8 @@ uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count);
#define HAS_SUMSQUAREERROR_NEON
uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count);
#endif
-#if !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || \
- defined(__x86_64__) || defined(__i386__))
+#if !defined(LIBYUV_DISABLE_X86) && \
+ (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
#define HAS_SUMSQUAREERROR_SSE2
uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count);
#endif
@@ -138,7 +148,9 @@ LIBYUV_API
uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a,
const uint8* src_b, int stride_b,
int width, int height) {
- if (stride_a == width && stride_b == width) {
+ // Coalesce rows.
+ if (stride_a == width &&
+ stride_b == width) {
return ComputeSumSquareError(src_a, src_b, width * height);
}
uint32 (*SumSquareError)(const uint8* src_a, const uint8* src_b, int count) =
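
The "Coalesce rows" check above is a pattern this patch applies throughout: when a plane's stride equals its width, rows are contiguous in memory, so a width x height plane can be processed as one row of width * height pixels and the row function is called once. A minimal standalone sketch (hypothetical name, uint8_t/uint64_t instead of libyuv's typedefs):

#include <stdint.h>

// Reference sum of squared error over a plane.  When both strides equal the
// width, the plane is collapsed into a single long row - the same shortcut
// ComputeSumSquareErrorPlane takes before dispatching to the SIMD row.
static uint64_t SsePlaneRef(const uint8_t* a, int stride_a,
                            const uint8_t* b, int stride_b,
                            int width, int height) {
  if (stride_a == width && stride_b == width) {
    width *= height;  // rows are contiguous: treat the plane as one row
    height = 1;
  }
  uint64_t sse = 0;
  for (int y = 0; y < height; ++y) {
    for (int x = 0; x < width; ++x) {
      const int diff = a[x] - b[x];
      sse += (uint64_t)(diff * diff);
    }
    a += stride_a;
    b += stride_b;
  }
  return sse;
}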
diff --git a/chromium/third_party/libyuv/source/compare_common.cc b/chromium/third_party/libyuv/source/compare_common.cc
index ab587d08171..3e4c77a67fe 100644
--- a/chromium/third_party/libyuv/source/compare_common.cc
+++ b/chromium/third_party/libyuv/source/compare_common.cc
@@ -1,40 +1,40 @@
-/*
- * Copyright 2012 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/basic_types.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count) {
- uint32 sse = 0u;
- for (int i = 0; i < count; ++i) {
- int diff = src_a[i] - src_b[i];
- sse += static_cast<uint32>(diff * diff);
- }
- return sse;
-}
-
-// hash seed of 5381 recommended.
-// Internal C version of HashDjb2 with int sized count for efficiency.
-uint32 HashDjb2_C(const uint8* src, int count, uint32 seed) {
- uint32 hash = seed;
- for (int i = 0; i < count; ++i) {
- hash += (hash << 5) + src[i];
- }
- return hash;
-}
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count) {
+ uint32 sse = 0u;
+ for (int i = 0; i < count; ++i) {
+ int diff = src_a[i] - src_b[i];
+ sse += static_cast<uint32>(diff * diff);
+ }
+ return sse;
+}
+
+// hash seed of 5381 recommended.
+// Internal C version of HashDjb2 with int sized count for efficiency.
+uint32 HashDjb2_C(const uint8* src, int count, uint32 seed) {
+ uint32 hash = seed;
+ for (int i = 0; i < count; ++i) {
+ hash += (hash << 5) + src[i];
+ }
+ return hash;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/chromium/third_party/libyuv/source/compare_neon.cc b/chromium/third_party/libyuv/source/compare_neon.cc
index a4e77750631..c377c163474 100644
--- a/chromium/third_party/libyuv/source/compare_neon.cc
+++ b/chromium/third_party/libyuv/source/compare_neon.cc
@@ -1,61 +1,61 @@
-/*
- * Copyright 2012 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/basic_types.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)
-
-uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
- volatile uint32 sse;
- asm volatile (
- "vmov.u8 q8, #0 \n"
- "vmov.u8 q10, #0 \n"
- "vmov.u8 q9, #0 \n"
- "vmov.u8 q11, #0 \n"
-
- ".p2align 2 \n"
- "1: \n"
- "vld1.8 {q0}, [%0]! \n"
- "vld1.8 {q1}, [%1]! \n"
- "subs %2, %2, #16 \n"
- "vsubl.u8 q2, d0, d2 \n"
- "vsubl.u8 q3, d1, d3 \n"
- "vmlal.s16 q8, d4, d4 \n"
- "vmlal.s16 q9, d6, d6 \n"
- "vmlal.s16 q10, d5, d5 \n"
- "vmlal.s16 q11, d7, d7 \n"
- "bgt 1b \n"
-
- "vadd.u32 q8, q8, q9 \n"
- "vadd.u32 q10, q10, q11 \n"
- "vadd.u32 q11, q8, q10 \n"
- "vpaddl.u32 q1, q11 \n"
- "vadd.u64 d0, d2, d3 \n"
- "vmov.32 %3, d0[0] \n"
- : "+r"(src_a),
- "+r"(src_b),
- "+r"(count),
- "=r"(sse)
- :
- : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
- return sse;
-}
-
-#endif // __ARM_NEON__
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)
+
+uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
+ volatile uint32 sse;
+ asm volatile (
+ "vmov.u8 q8, #0 \n"
+ "vmov.u8 q10, #0 \n"
+ "vmov.u8 q9, #0 \n"
+ "vmov.u8 q11, #0 \n"
+
+ ".p2align 2 \n"
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n"
+ "vld1.8 {q1}, [%1]! \n"
+ "subs %2, %2, #16 \n"
+ "vsubl.u8 q2, d0, d2 \n"
+ "vsubl.u8 q3, d1, d3 \n"
+ "vmlal.s16 q8, d4, d4 \n"
+ "vmlal.s16 q9, d6, d6 \n"
+ "vmlal.s16 q10, d5, d5 \n"
+ "vmlal.s16 q11, d7, d7 \n"
+ "bgt 1b \n"
+
+ "vadd.u32 q8, q8, q9 \n"
+ "vadd.u32 q10, q10, q11 \n"
+ "vadd.u32 q11, q8, q10 \n"
+ "vpaddl.u32 q1, q11 \n"
+ "vadd.u64 d0, d2, d3 \n"
+ "vmov.32 %3, d0[0] \n"
+ : "+r"(src_a),
+ "+r"(src_b),
+ "+r"(count),
+ "=r"(sse)
+ :
+ : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
+ return sse;
+}
+
+#endif // __ARM_NEON__
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/chromium/third_party/libyuv/source/compare_posix.cc b/chromium/third_party/libyuv/source/compare_posix.cc
index f24835d7714..1e0ba8fe156 100644
--- a/chromium/third_party/libyuv/source/compare_posix.cc
+++ b/chromium/third_party/libyuv/source/compare_posix.cc
@@ -1,164 +1,166 @@
-/*
- * Copyright 2012 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/basic_types.h"
-#include "libyuv/row.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
-
-uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
- uint32 sse;
- asm volatile (
- "pxor %%xmm0,%%xmm0 \n"
- "pxor %%xmm5,%%xmm5 \n"
- "sub %0,%1 \n"
- ".p2align 4 \n"
- "1: \n"
- "movdqa (%0),%%xmm1 \n"
- "movdqa (%0,%1,1),%%xmm2 \n"
- "lea 0x10(%0),%0 \n"
- "sub $0x10,%2 \n"
- "movdqa %%xmm1,%%xmm3 \n"
- "psubusb %%xmm2,%%xmm1 \n"
- "psubusb %%xmm3,%%xmm2 \n"
- "por %%xmm2,%%xmm1 \n"
- "movdqa %%xmm1,%%xmm2 \n"
- "punpcklbw %%xmm5,%%xmm1 \n"
- "punpckhbw %%xmm5,%%xmm2 \n"
- "pmaddwd %%xmm1,%%xmm1 \n"
- "pmaddwd %%xmm2,%%xmm2 \n"
- "paddd %%xmm1,%%xmm0 \n"
- "paddd %%xmm2,%%xmm0 \n"
- "jg 1b \n"
-
- "pshufd $0xee,%%xmm0,%%xmm1 \n"
- "paddd %%xmm1,%%xmm0 \n"
- "pshufd $0x1,%%xmm0,%%xmm1 \n"
- "paddd %%xmm1,%%xmm0 \n"
- "movd %%xmm0,%3 \n"
-
- : "+r"(src_a), // %0
- "+r"(src_b), // %1
- "+r"(count), // %2
- "=g"(sse) // %3
- :
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-#endif
- );
- return sse;
-}
-
-#endif // defined(__x86_64__) || defined(__i386__)
-
-#if !defined(LIBYUV_DISABLE_X86) && \
- (defined(__x86_64__) || (defined(__i386__) && !defined(__pic__)))
-// GCC 4.2 on OSX has link error when passing static or const to inline.
-// TODO(fbarchard): Use static const when gcc 4.2 support is dropped.
-#ifdef __APPLE__
-#define CONST
-#else
-#define CONST static const
-#endif
-#define HAS_HASHDJB2_SSE41
-CONST uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 }; // 33 ^ 16
-CONST uvec32 kHashMul0 = {
- 0x0c3525e1, // 33 ^ 15
- 0xa3476dc1, // 33 ^ 14
- 0x3b4039a1, // 33 ^ 13
- 0x4f5f0981, // 33 ^ 12
-};
-CONST uvec32 kHashMul1 = {
- 0x30f35d61, // 33 ^ 11
- 0x855cb541, // 33 ^ 10
- 0x040a9121, // 33 ^ 9
- 0x747c7101, // 33 ^ 8
-};
-CONST uvec32 kHashMul2 = {
- 0xec41d4e1, // 33 ^ 7
- 0x4cfa3cc1, // 33 ^ 6
- 0x025528a1, // 33 ^ 5
- 0x00121881, // 33 ^ 4
-};
-CONST uvec32 kHashMul3 = {
- 0x00008c61, // 33 ^ 3
- 0x00000441, // 33 ^ 2
- 0x00000021, // 33 ^ 1
- 0x00000001, // 33 ^ 0
-};
-
-uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
- uint32 hash;
- asm volatile (
- "movd %2,%%xmm0 \n"
- "pxor %%xmm7,%%xmm7 \n"
- "movdqa %4,%%xmm6 \n"
- ".p2align 4 \n"
- "1: \n"
- "movdqu (%0),%%xmm1 \n"
- "lea 0x10(%0),%0 \n"
- "pmulld %%xmm6,%%xmm0 \n"
- "movdqa %5,%%xmm5 \n"
- "movdqa %%xmm1,%%xmm2 \n"
- "punpcklbw %%xmm7,%%xmm2 \n"
- "movdqa %%xmm2,%%xmm3 \n"
- "punpcklwd %%xmm7,%%xmm3 \n"
- "pmulld %%xmm5,%%xmm3 \n"
- "movdqa %6,%%xmm5 \n"
- "movdqa %%xmm2,%%xmm4 \n"
- "punpckhwd %%xmm7,%%xmm4 \n"
- "pmulld %%xmm5,%%xmm4 \n"
- "movdqa %7,%%xmm5 \n"
- "punpckhbw %%xmm7,%%xmm1 \n"
- "movdqa %%xmm1,%%xmm2 \n"
- "punpcklwd %%xmm7,%%xmm2 \n"
- "pmulld %%xmm5,%%xmm2 \n"
- "movdqa %8,%%xmm5 \n"
- "punpckhwd %%xmm7,%%xmm1 \n"
- "pmulld %%xmm5,%%xmm1 \n"
- "paddd %%xmm4,%%xmm3 \n"
- "paddd %%xmm2,%%xmm1 \n"
- "sub $0x10,%1 \n"
- "paddd %%xmm3,%%xmm1 \n"
- "pshufd $0xe,%%xmm1,%%xmm2 \n"
- "paddd %%xmm2,%%xmm1 \n"
- "pshufd $0x1,%%xmm1,%%xmm2 \n"
- "paddd %%xmm2,%%xmm1 \n"
- "paddd %%xmm1,%%xmm0 \n"
- "jg 1b \n"
- "movd %%xmm0,%3 \n"
- : "+r"(src), // %0
- "+r"(count), // %1
- "+rm"(seed), // %2
- "=g"(hash) // %3
- : "m"(kHash16x33), // %4
- "m"(kHashMul0), // %5
- "m"(kHashMul1), // %6
- "m"(kHashMul2), // %7
- "m"(kHashMul3) // %8
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-#endif
- );
- return hash;
-}
-#endif // defined(__x86_64__) || (defined(__i386__) && !defined(__pic__)))
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/basic_types.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
+
+#if defined(__native_client__) && defined(__x86_64__)
+#define MEMACCESS(base) "%%nacl:(%%r15,%q" #base ")"
+#define MEMLEA(offset, base) #offset "(%q" #base ")"
+#else
+#define MEMACCESS(base) "(%" #base ")"
+#define MEMLEA(offset, base) #offset "(%" #base ")"
+#endif
+
+uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
+ uint32 sse;
+ asm volatile ( // NOLINT
+ "pxor %%xmm0,%%xmm0 \n"
+ "pxor %%xmm5,%%xmm5 \n"
+ ".p2align 2 \n"
+ "1: \n"
+ "movdqa " MEMACCESS(0) ",%%xmm1 \n"
+ "lea " MEMLEA(0x10, 0) ",%0 \n"
+ "movdqa " MEMACCESS(1) ",%%xmm2 \n"
+ "lea " MEMLEA(0x10, 1) ",%1 \n"
+ "sub $0x10,%2 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "psubusb %%xmm2,%%xmm1 \n"
+ "psubusb %%xmm3,%%xmm2 \n"
+ "por %%xmm2,%%xmm1 \n"
+ "movdqa %%xmm1,%%xmm2 \n"
+ "punpcklbw %%xmm5,%%xmm1 \n"
+ "punpckhbw %%xmm5,%%xmm2 \n"
+ "pmaddwd %%xmm1,%%xmm1 \n"
+ "pmaddwd %%xmm2,%%xmm2 \n"
+ "paddd %%xmm1,%%xmm0 \n"
+ "paddd %%xmm2,%%xmm0 \n"
+ "jg 1b \n"
+
+ "pshufd $0xee,%%xmm0,%%xmm1 \n"
+ "paddd %%xmm1,%%xmm0 \n"
+ "pshufd $0x1,%%xmm0,%%xmm1 \n"
+ "paddd %%xmm1,%%xmm0 \n"
+ "movd %%xmm0,%3 \n"
+
+ : "+r"(src_a), // %0
+ "+r"(src_b), // %1
+ "+r"(count), // %2
+ "=g"(sse) // %3
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+#endif
+ ); // NOLINT
+ return sse;
+}
+
+#endif // defined(__x86_64__) || defined(__i386__)
+
+#if !defined(LIBYUV_DISABLE_X86) && \
+ (defined(__x86_64__) || (defined(__i386__) && !defined(__pic__)))
+#define HAS_HASHDJB2_SSE41
+static uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 }; // 33 ^ 16
+static uvec32 kHashMul0 = {
+ 0x0c3525e1, // 33 ^ 15
+ 0xa3476dc1, // 33 ^ 14
+ 0x3b4039a1, // 33 ^ 13
+ 0x4f5f0981, // 33 ^ 12
+};
+static uvec32 kHashMul1 = {
+ 0x30f35d61, // 33 ^ 11
+ 0x855cb541, // 33 ^ 10
+ 0x040a9121, // 33 ^ 9
+ 0x747c7101, // 33 ^ 8
+};
+static uvec32 kHashMul2 = {
+ 0xec41d4e1, // 33 ^ 7
+ 0x4cfa3cc1, // 33 ^ 6
+ 0x025528a1, // 33 ^ 5
+ 0x00121881, // 33 ^ 4
+};
+static uvec32 kHashMul3 = {
+ 0x00008c61, // 33 ^ 3
+ 0x00000441, // 33 ^ 2
+ 0x00000021, // 33 ^ 1
+ 0x00000001, // 33 ^ 0
+};
+
+uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
+ uint32 hash;
+ asm volatile ( // NOLINT
+ "movd %2,%%xmm0 \n"
+ "pxor %%xmm7,%%xmm7 \n"
+ "movdqa %4,%%xmm6 \n"
+ ".p2align 2 \n"
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm1 \n"
+ "lea " MEMLEA(0x10, 0) ",%0 \n"
+ "pmulld %%xmm6,%%xmm0 \n"
+ "movdqa %5,%%xmm5 \n"
+ "movdqa %%xmm1,%%xmm2 \n"
+ "punpcklbw %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "punpcklwd %%xmm7,%%xmm3 \n"
+ "pmulld %%xmm5,%%xmm3 \n"
+ "movdqa %6,%%xmm5 \n"
+ "movdqa %%xmm2,%%xmm4 \n"
+ "punpckhwd %%xmm7,%%xmm4 \n"
+ "pmulld %%xmm5,%%xmm4 \n"
+ "movdqa %7,%%xmm5 \n"
+ "punpckhbw %%xmm7,%%xmm1 \n"
+ "movdqa %%xmm1,%%xmm2 \n"
+ "punpcklwd %%xmm7,%%xmm2 \n"
+ "pmulld %%xmm5,%%xmm2 \n"
+ "movdqa %8,%%xmm5 \n"
+ "punpckhwd %%xmm7,%%xmm1 \n"
+ "pmulld %%xmm5,%%xmm1 \n"
+ "paddd %%xmm4,%%xmm3 \n"
+ "paddd %%xmm2,%%xmm1 \n"
+ "sub $0x10,%1 \n"
+ "paddd %%xmm3,%%xmm1 \n"
+ "pshufd $0xe,%%xmm1,%%xmm2 \n"
+ "paddd %%xmm2,%%xmm1 \n"
+ "pshufd $0x1,%%xmm1,%%xmm2 \n"
+ "paddd %%xmm2,%%xmm1 \n"
+ "paddd %%xmm1,%%xmm0 \n"
+ "jg 1b \n"
+ "movd %%xmm0,%3 \n"
+ : "+r"(src), // %0
+ "+r"(count), // %1
+ "+rm"(seed), // %2
+ "=g"(hash) // %3
+ : "m"(kHash16x33), // %4
+ "m"(kHashMul0), // %5
+ "m"(kHashMul1), // %6
+ "m"(kHashMul2), // %7
+ "m"(kHashMul3) // %8
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+#endif
+ ); // NOLINT
+ return hash;
+}
+#endif // defined(__x86_64__) || (defined(__i386__) && !defined(__pic__)))
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
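
The kHash16x33 and kHashMul0..3 constants above work because 16 scalar djb2 steps fold into one block step, hash' = hash * 33^16 + sum(src[i] * 33^(15 - i)) for i = 0..15, with all arithmetic modulo 2^32. A standalone sketch of that equivalence (hypothetical names, not libyuv code):

#include <stdint.h>
#include <stdio.h>

// Scalar djb2 step: hash = hash * 33 + byte, modulo 2^32.
static uint32_t HashDjb2Scalar(const uint8_t* src, int count, uint32_t seed) {
  uint32_t hash = seed;
  for (int i = 0; i < count; ++i) {
    hash = hash * 33u + src[i];
  }
  return hash;
}

// Block form used by the SIMD paths: 16 bytes per iteration.
// kHash16x33 holds 33^16 and kHashMul0..3 hold 33^15 .. 33^0.
static uint32_t HashDjb2Block16(const uint8_t* src, uint32_t hash) {
  uint32_t mul = 1u;  // 33^0
  uint32_t sum = 0u;
  for (int i = 15; i >= 0; --i) {
    sum += src[i] * mul;  // src[i] * 33^(15 - i)
    mul *= 33u;
  }
  return hash * mul + sum;  // mul is now 33^16
}

int main(void) {
  uint8_t buf[16];
  for (int i = 0; i < 16; ++i) buf[i] = (uint8_t)(i * 7 + 3);
  printf("%u %u\n", (unsigned)HashDjb2Scalar(buf, 16, 5381u),
         (unsigned)HashDjb2Block16(buf, 5381u));  // prints the same value twice
  return 0;
}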
diff --git a/chromium/third_party/libyuv/source/compare_win.cc b/chromium/third_party/libyuv/source/compare_win.cc
index e576e85c192..99831651f5f 100644
--- a/chromium/third_party/libyuv/source/compare_win.cc
+++ b/chromium/third_party/libyuv/source/compare_win.cc
@@ -1,192 +1,232 @@
-/*
- * Copyright 2012 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/basic_types.h"
-#include "libyuv/row.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
-
-__declspec(naked) __declspec(align(16))
-uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
- __asm {
- mov eax, [esp + 4] // src_a
- mov edx, [esp + 8] // src_b
- mov ecx, [esp + 12] // count
- pxor xmm0, xmm0
- pxor xmm5, xmm5
- sub edx, eax
-
- align 16
- wloop:
- movdqa xmm1, [eax]
- movdqa xmm2, [eax + edx]
- lea eax, [eax + 16]
- sub ecx, 16
- movdqa xmm3, xmm1 // abs trick
- psubusb xmm1, xmm2
- psubusb xmm2, xmm3
- por xmm1, xmm2
- movdqa xmm2, xmm1
- punpcklbw xmm1, xmm5
- punpckhbw xmm2, xmm5
- pmaddwd xmm1, xmm1
- pmaddwd xmm2, xmm2
- paddd xmm0, xmm1
- paddd xmm0, xmm2
- jg wloop
-
- pshufd xmm1, xmm0, 0xee
- paddd xmm0, xmm1
- pshufd xmm1, xmm0, 0x01
- paddd xmm0, xmm1
- movd eax, xmm0
- ret
- }
-}
-
-// Visual C 2012 required for AVX2.
-#if _MSC_VER >= 1700
-// C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX.
-#pragma warning(disable: 4752)
-__declspec(naked) __declspec(align(16))
-uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) {
- __asm {
- mov eax, [esp + 4] // src_a
- mov edx, [esp + 8] // src_b
- mov ecx, [esp + 12] // count
- vpxor ymm0, ymm0, ymm0 // sum
- vpxor ymm5, ymm5, ymm5 // constant 0 for unpck
- sub edx, eax
-
- align 16
- wloop:
- vmovdqu ymm1, [eax]
- vmovdqu ymm2, [eax + edx]
- lea eax, [eax + 32]
- sub ecx, 32
- vpsubusb ymm3, ymm1, ymm2 // abs difference trick
- vpsubusb ymm2, ymm2, ymm1
- vpor ymm1, ymm2, ymm3
- vpunpcklbw ymm2, ymm1, ymm5 // u16. mutates order.
- vpunpckhbw ymm1, ymm1, ymm5
- vpmaddwd ymm2, ymm2, ymm2 // square + hadd to u32.
- vpmaddwd ymm1, ymm1, ymm1
- vpaddd ymm0, ymm0, ymm1
- vpaddd ymm0, ymm0, ymm2
- jg wloop
-
- vpshufd ymm1, ymm0, 0xee // 3, 2 + 1, 0 both lanes.
- vpaddd ymm0, ymm0, ymm1
- vpshufd ymm1, ymm0, 0x01 // 1 + 0 both lanes.
- vpaddd ymm0, ymm0, ymm1
- vpermq ymm1, ymm0, 0x02 // high + low lane.
- vpaddd ymm0, ymm0, ymm1
- vmovd eax, xmm0
- vzeroupper
- ret
- }
-}
-#endif // _MSC_VER >= 1700
-
-#define HAS_HASHDJB2_SSE41
-static const uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 }; // 33 ^ 16
-static const uvec32 kHashMul0 = {
- 0x0c3525e1, // 33 ^ 15
- 0xa3476dc1, // 33 ^ 14
- 0x3b4039a1, // 33 ^ 13
- 0x4f5f0981, // 33 ^ 12
-};
-static const uvec32 kHashMul1 = {
- 0x30f35d61, // 33 ^ 11
- 0x855cb541, // 33 ^ 10
- 0x040a9121, // 33 ^ 9
- 0x747c7101, // 33 ^ 8
-};
-static const uvec32 kHashMul2 = {
- 0xec41d4e1, // 33 ^ 7
- 0x4cfa3cc1, // 33 ^ 6
- 0x025528a1, // 33 ^ 5
- 0x00121881, // 33 ^ 4
-};
-static const uvec32 kHashMul3 = {
- 0x00008c61, // 33 ^ 3
- 0x00000441, // 33 ^ 2
- 0x00000021, // 33 ^ 1
- 0x00000001, // 33 ^ 0
-};
-
-// 27: 66 0F 38 40 C6 pmulld xmm0,xmm6
-// 44: 66 0F 38 40 DD pmulld xmm3,xmm5
-// 59: 66 0F 38 40 E5 pmulld xmm4,xmm5
-// 72: 66 0F 38 40 D5 pmulld xmm2,xmm5
-// 83: 66 0F 38 40 CD pmulld xmm1,xmm5
-#define pmulld(reg) _asm _emit 0x66 _asm _emit 0x0F _asm _emit 0x38 \
- _asm _emit 0x40 _asm _emit reg
-
-__declspec(naked) __declspec(align(16))
-uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
- __asm {
- mov eax, [esp + 4] // src
- mov ecx, [esp + 8] // count
- movd xmm0, [esp + 12] // seed
-
- pxor xmm7, xmm7 // constant 0 for unpck
- movdqa xmm6, kHash16x33
-
- align 16
- wloop:
- movdqu xmm1, [eax] // src[0-15]
- lea eax, [eax + 16]
- pmulld(0xc6) // pmulld xmm0,xmm6 hash *= 33 ^ 16
- movdqa xmm5, kHashMul0
- movdqa xmm2, xmm1
- punpcklbw xmm2, xmm7 // src[0-7]
- movdqa xmm3, xmm2
- punpcklwd xmm3, xmm7 // src[0-3]
- pmulld(0xdd) // pmulld xmm3, xmm5
- movdqa xmm5, kHashMul1
- movdqa xmm4, xmm2
- punpckhwd xmm4, xmm7 // src[4-7]
- pmulld(0xe5) // pmulld xmm4, xmm5
- movdqa xmm5, kHashMul2
- punpckhbw xmm1, xmm7 // src[8-15]
- movdqa xmm2, xmm1
- punpcklwd xmm2, xmm7 // src[8-11]
- pmulld(0xd5) // pmulld xmm2, xmm5
- movdqa xmm5, kHashMul3
- punpckhwd xmm1, xmm7 // src[12-15]
- pmulld(0xcd) // pmulld xmm1, xmm5
- paddd xmm3, xmm4 // add 16 results
- paddd xmm1, xmm2
- sub ecx, 16
- paddd xmm1, xmm3
-
- pshufd xmm2, xmm1, 0x0e // upper 2 dwords
- paddd xmm1, xmm2
- pshufd xmm2, xmm1, 0x01
- paddd xmm1, xmm2
- paddd xmm0, xmm1
- jg wloop
-
- movd eax, xmm0 // return hash
- ret
- }
-}
-#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/basic_types.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
+
+__declspec(naked) __declspec(align(16))
+uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
+ __asm {
+ mov eax, [esp + 4] // src_a
+ mov edx, [esp + 8] // src_b
+ mov ecx, [esp + 12] // count
+ pxor xmm0, xmm0
+ pxor xmm5, xmm5
+
+ align 4
+ wloop:
+ movdqa xmm1, [eax]
+ lea eax, [eax + 16]
+ movdqa xmm2, [edx]
+ lea edx, [edx + 16]
+ sub ecx, 16
+ movdqa xmm3, xmm1 // abs trick
+ psubusb xmm1, xmm2
+ psubusb xmm2, xmm3
+ por xmm1, xmm2
+ movdqa xmm2, xmm1
+ punpcklbw xmm1, xmm5
+ punpckhbw xmm2, xmm5
+ pmaddwd xmm1, xmm1
+ pmaddwd xmm2, xmm2
+ paddd xmm0, xmm1
+ paddd xmm0, xmm2
+ jg wloop
+
+ pshufd xmm1, xmm0, 0xee
+ paddd xmm0, xmm1
+ pshufd xmm1, xmm0, 0x01
+ paddd xmm0, xmm1
+ movd eax, xmm0
+ ret
+ }
+}
+
+// Visual C 2012 required for AVX2.
+#if _MSC_VER >= 1700
+// C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX.
+#pragma warning(disable: 4752)
+__declspec(naked) __declspec(align(16))
+uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) {
+ __asm {
+ mov eax, [esp + 4] // src_a
+ mov edx, [esp + 8] // src_b
+ mov ecx, [esp + 12] // count
+ vpxor ymm0, ymm0, ymm0 // sum
+ vpxor ymm5, ymm5, ymm5 // constant 0 for unpck
+ sub edx, eax
+
+ align 4
+ wloop:
+ vmovdqu ymm1, [eax]
+ vmovdqu ymm2, [eax + edx]
+ lea eax, [eax + 32]
+ sub ecx, 32
+ vpsubusb ymm3, ymm1, ymm2 // abs difference trick
+ vpsubusb ymm2, ymm2, ymm1
+ vpor ymm1, ymm2, ymm3
+ vpunpcklbw ymm2, ymm1, ymm5 // u16. mutates order.
+ vpunpckhbw ymm1, ymm1, ymm5
+ vpmaddwd ymm2, ymm2, ymm2 // square + hadd to u32.
+ vpmaddwd ymm1, ymm1, ymm1
+ vpaddd ymm0, ymm0, ymm1
+ vpaddd ymm0, ymm0, ymm2
+ jg wloop
+
+ vpshufd ymm1, ymm0, 0xee // 3, 2 + 1, 0 both lanes.
+ vpaddd ymm0, ymm0, ymm1
+ vpshufd ymm1, ymm0, 0x01 // 1 + 0 both lanes.
+ vpaddd ymm0, ymm0, ymm1
+ vpermq ymm1, ymm0, 0x02 // high + low lane.
+ vpaddd ymm0, ymm0, ymm1
+ vmovd eax, xmm0
+ vzeroupper
+ ret
+ }
+}
+#endif // _MSC_VER >= 1700
+
+#define HAS_HASHDJB2_SSE41
+static uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 }; // 33 ^ 16
+static uvec32 kHashMul0 = {
+ 0x0c3525e1, // 33 ^ 15
+ 0xa3476dc1, // 33 ^ 14
+ 0x3b4039a1, // 33 ^ 13
+ 0x4f5f0981, // 33 ^ 12
+};
+static uvec32 kHashMul1 = {
+ 0x30f35d61, // 33 ^ 11
+ 0x855cb541, // 33 ^ 10
+ 0x040a9121, // 33 ^ 9
+ 0x747c7101, // 33 ^ 8
+};
+static uvec32 kHashMul2 = {
+ 0xec41d4e1, // 33 ^ 7
+ 0x4cfa3cc1, // 33 ^ 6
+ 0x025528a1, // 33 ^ 5
+ 0x00121881, // 33 ^ 4
+};
+static uvec32 kHashMul3 = {
+ 0x00008c61, // 33 ^ 3
+ 0x00000441, // 33 ^ 2
+ 0x00000021, // 33 ^ 1
+ 0x00000001, // 33 ^ 0
+};
+
+// 27: 66 0F 38 40 C6 pmulld xmm0,xmm6
+// 44: 66 0F 38 40 DD pmulld xmm3,xmm5
+// 59: 66 0F 38 40 E5 pmulld xmm4,xmm5
+// 72: 66 0F 38 40 D5 pmulld xmm2,xmm5
+// 83: 66 0F 38 40 CD pmulld xmm1,xmm5
+#define pmulld(reg) _asm _emit 0x66 _asm _emit 0x0F _asm _emit 0x38 \
+ _asm _emit 0x40 _asm _emit reg
+
+__declspec(naked) __declspec(align(16))
+uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
+ __asm {
+ mov eax, [esp + 4] // src
+ mov ecx, [esp + 8] // count
+ movd xmm0, [esp + 12] // seed
+
+ pxor xmm7, xmm7 // constant 0 for unpck
+ movdqa xmm6, kHash16x33
+
+ align 4
+ wloop:
+ movdqu xmm1, [eax] // src[0-15]
+ lea eax, [eax + 16]
+ pmulld(0xc6) // pmulld xmm0,xmm6 hash *= 33 ^ 16
+ movdqa xmm5, kHashMul0
+ movdqa xmm2, xmm1
+ punpcklbw xmm2, xmm7 // src[0-7]
+ movdqa xmm3, xmm2
+ punpcklwd xmm3, xmm7 // src[0-3]
+ pmulld(0xdd) // pmulld xmm3, xmm5
+ movdqa xmm5, kHashMul1
+ movdqa xmm4, xmm2
+ punpckhwd xmm4, xmm7 // src[4-7]
+ pmulld(0xe5) // pmulld xmm4, xmm5
+ movdqa xmm5, kHashMul2
+ punpckhbw xmm1, xmm7 // src[8-15]
+ movdqa xmm2, xmm1
+ punpcklwd xmm2, xmm7 // src[8-11]
+ pmulld(0xd5) // pmulld xmm2, xmm5
+ movdqa xmm5, kHashMul3
+ punpckhwd xmm1, xmm7 // src[12-15]
+ pmulld(0xcd) // pmulld xmm1, xmm5
+ paddd xmm3, xmm4 // add 16 results
+ paddd xmm1, xmm2
+ sub ecx, 16
+ paddd xmm1, xmm3
+
+ pshufd xmm2, xmm1, 0x0e // upper 2 dwords
+ paddd xmm1, xmm2
+ pshufd xmm2, xmm1, 0x01
+ paddd xmm1, xmm2
+ paddd xmm0, xmm1
+ jg wloop
+
+ movd eax, xmm0 // return hash
+ ret
+ }
+}
+
+// Visual C 2012 required for AVX2.
+#if _MSC_VER >= 1700
+__declspec(naked) __declspec(align(16))
+uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) {
+ __asm {
+ mov eax, [esp + 4] // src
+ mov ecx, [esp + 8] // count
+ movd xmm0, [esp + 12] // seed
+ movdqa xmm6, kHash16x33
+
+ align 4
+ wloop:
+ vpmovzxbd xmm3, dword ptr [eax] // src[0-3]
+ pmulld xmm0, xmm6 // hash *= 33 ^ 16
+ vpmovzxbd xmm4, dword ptr [eax + 4] // src[4-7]
+ pmulld xmm3, kHashMul0
+ vpmovzxbd xmm2, dword ptr [eax + 8] // src[8-11]
+ pmulld xmm4, kHashMul1
+ vpmovzxbd xmm1, dword ptr [eax + 12] // src[12-15]
+ pmulld xmm2, kHashMul2
+ lea eax, [eax + 16]
+ pmulld xmm1, kHashMul3
+ paddd xmm3, xmm4 // add 16 results
+ paddd xmm1, xmm2
+ sub ecx, 16
+ paddd xmm1, xmm3
+ pshufd xmm2, xmm1, 0x0e // upper 2 dwords
+ paddd xmm1, xmm2
+ pshufd xmm2, xmm1, 0x01
+ paddd xmm1, xmm2
+ paddd xmm0, xmm1
+ jg wloop
+
+ movd eax, xmm0 // return hash
+ ret
+ }
+}
+#endif // _MSC_VER >= 1700
+
+#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
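
Both the SSE2 and AVX2 SumSquareError loops above rely on the same saturating-subtract "abs trick" to get |a - b| for unsigned bytes without widening: subtracting with saturation in both directions leaves the difference in one operand and zero in the other, and OR-ing them yields the absolute difference. A scalar sketch (hypothetical name):

#include <stdint.h>

// Scalar equivalent of the psubusb / psubusb / por sequence.
static uint8_t AbsDiffU8(uint8_t a, uint8_t b) {
  const uint8_t d0 = (uint8_t)(a > b ? a - b : 0);  // saturating a - b
  const uint8_t d1 = (uint8_t)(b > a ? b - a : 0);  // saturating b - a
  return (uint8_t)(d0 | d1);                        // one of them is zero
}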
diff --git a/chromium/third_party/libyuv/source/convert.cc b/chromium/third_party/libyuv/source/convert.cc
index 980df7edd5e..9ec71058ce9 100644
--- a/chromium/third_party/libyuv/source/convert.cc
+++ b/chromium/third_party/libyuv/source/convert.cc
@@ -22,7 +22,43 @@ namespace libyuv {
extern "C" {
#endif
+#define SUBSAMPLE(v, a, s) (v < 0) ? (-((-v + a) >> s)) : ((v + a) >> s)
+static __inline int Abs(int v) {
+ return v >= 0 ? v : -v;
+}
+
+// Any I4xx To I420 format with mirroring.
+static int I4xxToI420(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int src_y_width, int src_y_height,
+ int src_uv_width, int src_uv_height) {
+ if (src_y_width == 0 || src_y_height == 0 ||
+ src_uv_width == 0 || src_uv_height == 0) {
+ return -1;
+ }
+ const int dst_y_width = Abs(src_y_width);
+ const int dst_y_height = Abs(src_y_height);
+ const int dst_uv_width = SUBSAMPLE(dst_y_width, 1, 1);
+ const int dst_uv_height = SUBSAMPLE(dst_y_height, 1, 1);
+ ScalePlane(src_y, src_stride_y, src_y_width, src_y_height,
+ dst_y, dst_stride_y, dst_y_width, dst_y_height,
+ kFilterBilinear);
+ ScalePlane(src_u, src_stride_u, src_uv_width, src_uv_height,
+ dst_u, dst_stride_u, dst_uv_width, dst_uv_height,
+ kFilterBilinear);
+ ScalePlane(src_v, src_stride_v, src_uv_width, src_uv_height,
+ dst_v, dst_stride_v, dst_uv_width, dst_uv_height,
+ kFilterBilinear);
+ return 0;
+}
+
// Copy I420 with optional flipping
+// TODO(fbarchard): Use Scale plane which supports mirroring, but ensure
+// it does row coalescing.
LIBYUV_API
int I420Copy(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
@@ -39,7 +75,7 @@ int I420Copy(const uint8* src_y, int src_stride_y,
// Negative height means invert the image.
if (height < 0) {
height = -height;
- int halfheight = (height + 1) >> 1;
+ const int halfheight = (height + 1) >> 1;
src_y = src_y + (height - 1) * src_stride_y;
src_u = src_u + (halfheight - 1) * src_stride_u;
src_v = src_v + (halfheight - 1) * src_stride_v;
@@ -48,16 +84,19 @@ int I420Copy(const uint8* src_y, int src_stride_y,
src_stride_v = -src_stride_v;
}
- int halfwidth = (width + 1) >> 1;
- int halfheight = (height + 1) >> 1;
if (dst_y) {
CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
}
+ // Copy UV planes.
+ const int halfwidth = (width + 1) >> 1;
+ const int halfheight = (height + 1) >> 1;
CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight);
CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight);
return 0;
}
+// 422 chroma is 1/2 width, 1x height
+// 420 chroma is 1/2 width, 1/2 height
LIBYUV_API
int I422ToI420(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
@@ -66,88 +105,19 @@ int I422ToI420(const uint8* src_y, int src_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height) {
- if (!src_y || !src_u || !src_v ||
- !dst_y || !dst_u || !dst_v ||
- width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_y = src_y + (height - 1) * src_stride_y;
- src_u = src_u + (height - 1) * src_stride_u;
- src_v = src_v + (height - 1) * src_stride_v;
- src_stride_y = -src_stride_y;
- src_stride_u = -src_stride_u;
- src_stride_v = -src_stride_v;
- }
- int halfwidth = (width + 1) >> 1;
- void (*HalfRow)(const uint8* src_uv, int src_uv_stride,
- uint8* dst_uv, int pix) = HalfRow_C;
-#if defined(HAS_HALFROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(halfwidth, 16) &&
- IS_ALIGNED(src_u, 16) && IS_ALIGNED(src_stride_u, 16) &&
- IS_ALIGNED(src_v, 16) && IS_ALIGNED(src_stride_v, 16) &&
- IS_ALIGNED(dst_u, 16) && IS_ALIGNED(dst_stride_u, 16) &&
- IS_ALIGNED(dst_v, 16) && IS_ALIGNED(dst_stride_v, 16)) {
- HalfRow = HalfRow_SSE2;
- }
-#endif
-#if defined(HAS_HALFROW_AVX2)
- if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(halfwidth, 32)) {
- HalfRow = HalfRow_AVX2;
- }
-#endif
-#if defined(HAS_HALFROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(halfwidth, 16)) {
- HalfRow = HalfRow_NEON;
- }
-#endif
-
- // Copy Y plane
- if (dst_y) {
- CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
- }
-
- // SubSample U plane.
- int y;
- for (y = 0; y < height - 1; y += 2) {
- HalfRow(src_u, src_stride_u, dst_u, halfwidth);
- src_u += src_stride_u * 2;
- dst_u += dst_stride_u;
- }
- if (height & 1) {
- HalfRow(src_u, 0, dst_u, halfwidth);
- }
-
- // SubSample V plane.
- for (y = 0; y < height - 1; y += 2) {
- HalfRow(src_v, src_stride_v, dst_v, halfwidth);
- src_v += src_stride_v * 2;
- dst_v += dst_stride_v;
- }
- if (height & 1) {
- HalfRow(src_v, 0, dst_v, halfwidth);
- }
- return 0;
+ const int src_uv_width = SUBSAMPLE(width, 1, 1);
+ return I4xxToI420(src_y, src_stride_y,
+ src_u, src_stride_u,
+ src_v, src_stride_v,
+ dst_y, dst_stride_y,
+ dst_u, dst_stride_u,
+ dst_v, dst_stride_v,
+ width, height,
+ src_uv_width, height);
}
-// Blends 32x2 pixels to 16x1
-// source in scale.cc
-#if !defined(LIBYUV_DISABLE_NEON) && \
- (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
-#define HAS_SCALEROWDOWN2_NEON
-void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst, int dst_width);
-#elif !defined(LIBYUV_DISABLE_X86) && \
- (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
-
-void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width);
-#endif
-void ScaleRowDown2Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width);
-
+// 444 chroma is 1x width, 1x height
+// 420 chroma is 1/2 width, 1/2 height
LIBYUV_API
int I444ToI420(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
@@ -156,69 +126,16 @@ int I444ToI420(const uint8* src_y, int src_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height) {
- if (!src_y || !src_u || !src_v ||
- !dst_y || !dst_u || !dst_v ||
- width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_y = src_y + (height - 1) * src_stride_y;
- src_u = src_u + (height - 1) * src_stride_u;
- src_v = src_v + (height - 1) * src_stride_v;
- src_stride_y = -src_stride_y;
- src_stride_u = -src_stride_u;
- src_stride_v = -src_stride_v;
- }
- int halfwidth = (width + 1) >> 1;
- void (*ScaleRowDown2)(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) = ScaleRowDown2Box_C;
-#if defined(HAS_SCALEROWDOWN2_NEON)
- if (TestCpuFlag(kCpuHasNEON) &&
- IS_ALIGNED(halfwidth, 16)) {
- ScaleRowDown2 = ScaleRowDown2Box_NEON;
- }
-#elif defined(HAS_SCALEROWDOWN2_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) &&
- IS_ALIGNED(halfwidth, 16) &&
- IS_ALIGNED(src_u, 16) && IS_ALIGNED(src_stride_u, 16) &&
- IS_ALIGNED(src_v, 16) && IS_ALIGNED(src_stride_v, 16) &&
- IS_ALIGNED(dst_u, 16) && IS_ALIGNED(dst_stride_u, 16) &&
- IS_ALIGNED(dst_v, 16) && IS_ALIGNED(dst_stride_v, 16)) {
- ScaleRowDown2 = ScaleRowDown2Box_SSE2;
- }
-#endif
-
- // Copy Y plane
- if (dst_y) {
- CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
- }
-
- // SubSample U plane.
- int y;
- for (y = 0; y < height - 1; y += 2) {
- ScaleRowDown2(src_u, src_stride_u, dst_u, halfwidth);
- src_u += src_stride_u * 2;
- dst_u += dst_stride_u;
- }
- if (height & 1) {
- ScaleRowDown2(src_u, 0, dst_u, halfwidth);
- }
-
- // SubSample V plane.
- for (y = 0; y < height - 1; y += 2) {
- ScaleRowDown2(src_v, src_stride_v, dst_v, halfwidth);
- src_v += src_stride_v * 2;
- dst_v += dst_stride_v;
- }
- if (height & 1) {
- ScaleRowDown2(src_v, 0, dst_v, halfwidth);
- }
- return 0;
+ return I4xxToI420(src_y, src_stride_y,
+ src_u, src_stride_u,
+ src_v, src_stride_v,
+ dst_y, dst_stride_y,
+ dst_u, dst_stride_u,
+ dst_v, dst_stride_v,
+ width, height,
+ width, height);
}
-// TODO(fbarchard): Enable bilinear when fast enough or specialized upsampler.
// 411 chroma is 1/4 width, 1x height
// 420 chroma is 1/2 width, 1/2 height
LIBYUV_API
@@ -229,41 +146,15 @@ int I411ToI420(const uint8* src_y, int src_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height) {
- if (!src_y || !src_u || !src_v ||
- !dst_y || !dst_u || !dst_v ||
- width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- src_y = src_y + (height - 1) * src_stride_y;
- src_u = src_u + (height - 1) * src_stride_u;
- src_v = src_v + (height - 1) * src_stride_v;
- src_stride_y = -src_stride_y;
- src_stride_u = -src_stride_u;
- src_stride_v = -src_stride_v;
- }
-
- // Copy Y plane
- if (dst_y) {
- CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
- }
-
- int halfwidth = (width + 1) >> 1;
- int halfheight = (height + 1) >> 1;
- int quarterwidth = (width + 3) >> 2;
-
- // Resample U plane from 1/4 width, 1x height to 1/2 width, 1/2 height.
- ScalePlane(src_u, src_stride_u, quarterwidth, height,
- dst_u, dst_stride_u, halfwidth, halfheight,
- kFilterNone);
-
- // Resample V plane.
- ScalePlane(src_v, src_stride_v, quarterwidth, height,
- dst_v, dst_stride_v, halfwidth, halfheight,
- kFilterNone);
- return 0;
+ const int src_uv_width = SUBSAMPLE(width, 3, 2);
+ return I4xxToI420(src_y, src_stride_y,
+ src_u, src_stride_u,
+ src_v, src_stride_v,
+ dst_y, dst_stride_y,
+ dst_u, dst_stride_u,
+ dst_v, dst_stride_v,
+ width, height,
+ src_uv_width, height);
}
// I400 is greyscale typically used in MJPG
@@ -309,7 +200,6 @@ static void CopyPlane2(const uint8* src, int src_stride_0, int src_stride_1,
}
#endif
#if defined(HAS_COPYROW_ERMS)
- // TODO(fbarchard): Detect Fast String support.
if (TestCpuFlag(kCpuHasERMS)) {
CopyRow = CopyRow_ERMS;
}
@@ -369,20 +259,23 @@ static int X420ToI420(const uint8* src_y,
dst_stride_u = -dst_stride_u;
dst_stride_v = -dst_stride_v;
}
- // Coalesce contiguous rows.
+ // Coalesce rows.
int halfwidth = (width + 1) >> 1;
int halfheight = (height + 1) >> 1;
if (src_stride_y0 == width &&
src_stride_y1 == width &&
dst_stride_y == width) {
- width = width * height;
+ width *= height;
height = 1;
+ src_stride_y0 = src_stride_y1 = dst_stride_y = 0;
}
- if (src_stride_uv == width &&
- dst_stride_u * 2 == width &&
- dst_stride_v * 2 == width) {
- halfwidth = halfwidth * halfheight;
+ // Coalesce rows.
+ if (src_stride_uv == halfwidth * 2 &&
+ dst_stride_u == halfwidth &&
+ dst_stride_v == halfwidth) {
+ halfwidth *= halfheight;
halfheight = 1;
+ src_stride_uv = dst_stride_u = dst_stride_v = 0;
}
void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) =
SplitUVRow_C;
@@ -782,7 +675,7 @@ int ARGBToI420(const uint8* src_argb, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
ARGBToYRow_C;
-#if defined(HAS_ARGBTOYROW_SSSE3)
+#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
ARGBToYRow = ARGBToYRow_Any_SSSE3;
@@ -798,7 +691,7 @@ int ARGBToI420(const uint8* src_argb, int src_stride_argb,
}
}
#endif
-#if defined(HAS_ARGBTOYROW_AVX2)
+#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
ARGBToUVRow = ARGBToUVRow_Any_AVX2;
ARGBToYRow = ARGBToYRow_Any_AVX2;
@@ -1044,7 +937,7 @@ int RGBAToI420(const uint8* src_rgba, int src_stride_rgba,
}
// Convert RGB24 to I420.
-LIBYUV_API
+LIBYUV_API SAFEBUFFERS
int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
@@ -1147,7 +1040,7 @@ int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24,
}
// Convert RAW to I420.
-LIBYUV_API
+LIBYUV_API SAFEBUFFERS
int RAWToI420(const uint8* src_raw, int src_stride_raw,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
@@ -1250,7 +1143,7 @@ int RAWToI420(const uint8* src_raw, int src_stride_raw,
}
// Convert RGB565 to I420.
-LIBYUV_API
+LIBYUV_API SAFEBUFFERS
int RGB565ToI420(const uint8* src_rgb565, int src_stride_rgb565,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
@@ -1353,7 +1246,7 @@ int RGB565ToI420(const uint8* src_rgb565, int src_stride_rgb565,
}
// Convert ARGB1555 to I420.
-LIBYUV_API
+LIBYUV_API SAFEBUFFERS
int ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
@@ -1458,7 +1351,7 @@ int ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555,
}
// Convert ARGB4444 to I420.
-LIBYUV_API
+LIBYUV_API SAFEBUFFERS
int ARGB4444ToI420(const uint8* src_argb4444, int src_stride_argb4444,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
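
The SUBSAMPLE macro introduced at the top of convert.cc rounds a dimension up after shifting and preserves its sign, so the negative widths and heights used for mirrored or inverted images stay negative and ScalePlane can still flip them. A tiny standalone check:

#include <stdio.h>

#define SUBSAMPLE(v, a, s) (v < 0) ? (-((-v + a) >> s)) : ((v + a) >> s)

int main(void) {
  // Half-width chroma (I422/I444 to I420 paths): rounds up, keeps the sign.
  printf("%d %d\n", SUBSAMPLE(7, 1, 1), SUBSAMPLE(-7, 1, 1));  // 4 -4
  // Quarter-width chroma, as used by I411ToI420.
  printf("%d %d\n", SUBSAMPLE(7, 3, 2), SUBSAMPLE(-7, 3, 2));  // 2 -2
  return 0;
}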
diff --git a/chromium/third_party/libyuv/source/convert_argb.cc b/chromium/third_party/libyuv/source/convert_argb.cc
index 55d4d6904ce..0a503361d8b 100644
--- a/chromium/third_party/libyuv/source/convert_argb.cc
+++ b/chromium/third_party/libyuv/source/convert_argb.cc
@@ -63,16 +63,14 @@ int I444ToARGB(const uint8* src_y, int src_stride_y,
dst_argb = dst_argb + (height - 1) * dst_stride_argb;
dst_stride_argb = -dst_stride_argb;
}
- // Coalesce contiguous rows.
+ // Coalesce rows.
if (src_stride_y == width &&
src_stride_u == width &&
src_stride_v == width &&
dst_stride_argb == width * 4) {
- return I444ToARGB(src_y, 0,
- src_u, 0,
- src_v, 0,
- dst_argb, 0,
- width * height, 1);
+ width *= height;
+ height = 1;
+ src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0;
}
void (*I444ToARGBRow)(const uint8* y_buf,
const uint8* u_buf,
@@ -126,16 +124,14 @@ int I422ToARGB(const uint8* src_y, int src_stride_y,
dst_argb = dst_argb + (height - 1) * dst_stride_argb;
dst_stride_argb = -dst_stride_argb;
}
- // Coalesce contiguous rows.
+ // Coalesce rows.
if (src_stride_y == width &&
src_stride_u * 2 == width &&
src_stride_v * 2 == width &&
dst_stride_argb == width * 4) {
- return I422ToARGB(src_y, 0,
- src_u, 0,
- src_v, 0,
- dst_argb, 0,
- width * height, 1);
+ width *= height;
+ height = 1;
+ src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0;
}
void (*I422ToARGBRow)(const uint8* y_buf,
const uint8* u_buf,
@@ -207,16 +203,14 @@ int I411ToARGB(const uint8* src_y, int src_stride_y,
dst_argb = dst_argb + (height - 1) * dst_stride_argb;
dst_stride_argb = -dst_stride_argb;
}
- // Coalesce contiguous rows.
+ // Coalesce rows.
if (src_stride_y == width &&
src_stride_u * 4 == width &&
src_stride_v * 4 == width &&
dst_stride_argb == width * 4) {
- return I411ToARGB(src_y, 0,
- src_u, 0,
- src_v, 0,
- dst_argb, 0,
- width * height, 1);
+ width *= height;
+ height = 1;
+ src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0;
}
void (*I411ToARGBRow)(const uint8* y_buf,
const uint8* u_buf,
@@ -267,12 +261,12 @@ int I400ToARGB_Reference(const uint8* src_y, int src_stride_y,
dst_argb = dst_argb + (height - 1) * dst_stride_argb;
dst_stride_argb = -dst_stride_argb;
}
- // Coalesce contiguous rows.
+ // Coalesce rows.
if (src_stride_y == width &&
dst_stride_argb == width * 4) {
- return I400ToARGB_Reference(src_y, 0,
- dst_argb, 0,
- width * height, 1);
+ width *= height;
+ height = 1;
+ src_stride_y = dst_stride_argb = 0;
}
void (*YToARGBRow)(const uint8* y_buf,
uint8* rgb_buf,
@@ -317,12 +311,12 @@ int I400ToARGB(const uint8* src_y, int src_stride_y,
src_y = src_y + (height - 1) * src_stride_y;
src_stride_y = -src_stride_y;
}
- // Coalesce contiguous rows.
+ // Coalesce rows.
if (src_stride_y == width &&
dst_stride_argb == width * 4) {
- return I400ToARGB(src_y, 0,
- dst_argb, 0,
- width * height, 1);
+ width *= height;
+ height = 1;
+ src_stride_y = dst_stride_argb = 0;
}
void (*I400ToARGBRow)(const uint8* src_y, uint8* dst_argb, int pix) =
I400ToARGBRow_C;
@@ -353,17 +347,17 @@ int I400ToARGB(const uint8* src_y, int src_stride_y,
}
// Shuffle table for converting BGRA to ARGB.
-static const uvec8 kShuffleMaskBGRAToARGB = {
+static uvec8 kShuffleMaskBGRAToARGB = {
3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
};
// Shuffle table for converting ABGR to ARGB.
-static const uvec8 kShuffleMaskABGRToARGB = {
+static uvec8 kShuffleMaskABGRToARGB = {
2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u
};
// Shuffle table for converting RGBA to ARGB.
-static const uvec8 kShuffleMaskRGBAToARGB = {
+static uvec8 kShuffleMaskRGBAToARGB = {
1u, 2u, 3u, 0u, 5u, 6u, 7u, 4u, 9u, 10u, 11u, 8u, 13u, 14u, 15u, 12u
};
@@ -415,12 +409,12 @@ int RGB24ToARGB(const uint8* src_rgb24, int src_stride_rgb24,
src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24;
src_stride_rgb24 = -src_stride_rgb24;
}
- // Coalesce contiguous rows.
+ // Coalesce rows.
if (src_stride_rgb24 == width * 3 &&
dst_stride_argb == width * 4) {
- return RGB24ToARGB(src_rgb24, 0,
- dst_argb, 0,
- width * height, 1);
+ width *= height;
+ height = 1;
+ src_stride_rgb24 = dst_stride_argb = 0;
}
void (*RGB24ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) =
RGB24ToARGBRow_C;
@@ -464,12 +458,12 @@ int RAWToARGB(const uint8* src_raw, int src_stride_raw,
src_raw = src_raw + (height - 1) * src_stride_raw;
src_stride_raw = -src_stride_raw;
}
- // Coalesce contiguous rows.
+ // Coalesce rows.
if (src_stride_raw == width * 3 &&
dst_stride_argb == width * 4) {
- return RAWToARGB(src_raw, 0,
- dst_argb, 0,
- width * height, 1);
+ width *= height;
+ height = 1;
+ src_stride_raw = dst_stride_argb = 0;
}
void (*RAWToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) =
RAWToARGBRow_C;
@@ -513,12 +507,12 @@ int RGB565ToARGB(const uint8* src_rgb565, int src_stride_rgb565,
src_rgb565 = src_rgb565 + (height - 1) * src_stride_rgb565;
src_stride_rgb565 = -src_stride_rgb565;
}
- // Coalesce contiguous rows.
+ // Coalesce rows.
if (src_stride_rgb565 == width * 2 &&
dst_stride_argb == width * 4) {
- return RGB565ToARGB(src_rgb565, 0,
- dst_argb, 0,
- width * height, 1);
+ width *= height;
+ height = 1;
+ src_stride_rgb565 = dst_stride_argb = 0;
}
void (*RGB565ToARGBRow)(const uint8* src_rgb565, uint8* dst_argb, int pix) =
RGB565ToARGBRow_C;
@@ -562,12 +556,12 @@ int ARGB1555ToARGB(const uint8* src_argb1555, int src_stride_argb1555,
src_argb1555 = src_argb1555 + (height - 1) * src_stride_argb1555;
src_stride_argb1555 = -src_stride_argb1555;
}
- // Coalesce contiguous rows.
+ // Coalesce rows.
if (src_stride_argb1555 == width * 2 &&
dst_stride_argb == width * 4) {
- return ARGB1555ToARGB(src_argb1555, 0,
- dst_argb, 0,
- width * height, 1);
+ width *= height;
+ height = 1;
+ src_stride_argb1555 = dst_stride_argb = 0;
}
void (*ARGB1555ToARGBRow)(const uint8* src_argb1555, uint8* dst_argb,
int pix) = ARGB1555ToARGBRow_C;
@@ -611,12 +605,12 @@ int ARGB4444ToARGB(const uint8* src_argb4444, int src_stride_argb4444,
src_argb4444 = src_argb4444 + (height - 1) * src_stride_argb4444;
src_stride_argb4444 = -src_stride_argb4444;
}
- // Coalesce contiguous rows.
+ // Coalesce rows.
if (src_stride_argb4444 == width * 2 &&
dst_stride_argb == width * 4) {
- return ARGB4444ToARGB(src_argb4444, 0,
- dst_argb, 0,
- width * height, 1);
+ width *= height;
+ height = 1;
+ src_stride_argb4444 = dst_stride_argb = 0;
}
void (*ARGB4444ToARGBRow)(const uint8* src_argb4444, uint8* dst_argb,
int pix) = ARGB4444ToARGBRow_C;
@@ -812,13 +806,13 @@ int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2,
src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2;
src_stride_yuy2 = -src_stride_yuy2;
}
- // Coalesce contiguous rows.
+ // Coalesce rows.
if (width * height <= kMaxStride &&
src_stride_yuy2 == width * 2 &&
dst_stride_argb == width * 4) {
- return YUY2ToARGB(src_yuy2, 0,
- dst_argb, 0,
- width * height, 1);
+ width *= height;
+ height = 1;
+ src_stride_yuy2 = dst_stride_argb = 0;
}
void (*YUY2ToARGBRow)(const uint8* src_yuy2, uint8* dst_argb, int pix) =
YUY2ToARGBRow_C;
@@ -865,13 +859,13 @@ int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy,
src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy;
src_stride_uyvy = -src_stride_uyvy;
}
- // Coalesce contiguous rows.
+ // Coalesce rows.
if (width * height <= kMaxStride &&
src_stride_uyvy == width * 2 &&
dst_stride_argb == width * 4) {
- return UYVYToARGB(src_uyvy, 0,
- dst_argb, 0,
- width * height, 1);
+ width *= height;
+ height = 1;
+ src_stride_uyvy = dst_stride_argb = 0;
}
void (*UYVYToARGBRow)(const uint8* src_uyvy, uint8* dst_argb, int pix) =
UYVYToARGBRow_C;
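
The kShuffleMask* tables above drive a 16-byte shuffle (pshufb on SSSE3): output byte i of each block is input byte mask[i], which is how one table reorders BGRA, ABGR or RGBA bytes into ARGB order. A scalar sketch of that shuffle (hypothetical name):

#include <stdint.h>

// For each 16-byte block: dst[i] = src[mask[i]].
static void ShuffleBlock16(const uint8_t* src, const uint8_t mask[16],
                           uint8_t* dst) {
  for (int i = 0; i < 16; ++i) {
    dst[i] = src[mask[i]];
  }
}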
diff --git a/chromium/third_party/libyuv/source/convert_from.cc b/chromium/third_party/libyuv/source/convert_from.cc
index 87f9b5cb726..dc708de5e0b 100644
--- a/chromium/third_party/libyuv/source/convert_from.cc
+++ b/chromium/third_party/libyuv/source/convert_from.cc
@@ -25,6 +25,42 @@ namespace libyuv {
extern "C" {
#endif
+#define SUBSAMPLE(v, a, s) (v < 0) ? (-((-v + a) >> s)) : ((v + a) >> s)
+static __inline int Abs(int v) {
+ return v >= 0 ? v : -v;
+}
+
+// I420 To any I4xx YUV format with mirroring.
+static int I420ToI4xx(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int src_y_width, int src_y_height,
+ int dst_uv_width, int dst_uv_height) {
+ if (src_y_width == 0 || src_y_height == 0 ||
+ dst_uv_width <= 0 || dst_uv_height <= 0) {
+ return -1;
+ }
+ const int dst_y_width = Abs(src_y_width);
+ const int dst_y_height = Abs(src_y_height);
+ const int src_uv_width = SUBSAMPLE(src_y_width, 1, 1);
+ const int src_uv_height = SUBSAMPLE(src_y_height, 1, 1);
+ ScalePlane(src_y, src_stride_y, src_y_width, src_y_height,
+ dst_y, dst_stride_y, dst_y_width, dst_y_height,
+ kFilterBilinear);
+ ScalePlane(src_u, src_stride_u, src_uv_width, src_uv_height,
+ dst_u, dst_stride_u, dst_uv_width, dst_uv_height,
+ kFilterBilinear);
+ ScalePlane(src_v, src_stride_v, src_uv_width, src_uv_height,
+ dst_v, dst_stride_v, dst_uv_width, dst_uv_height,
+ kFilterBilinear);
+ return 0;
+}
+
+// 420 chroma is 1/2 width, 1/2 height
+// 422 chroma is 1/2 width, 1x height
LIBYUV_API
int I420ToI422(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
@@ -33,84 +69,20 @@ int I420ToI422(const uint8* src_y, int src_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height) {
- if (!src_y || !src_u || !src_v ||
- !dst_y || !dst_u || !dst_v ||
- width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_y = dst_y + (height - 1) * dst_stride_y;
- dst_u = dst_u + (height - 1) * dst_stride_u;
- dst_v = dst_v + (height - 1) * dst_stride_v;
- dst_stride_y = -dst_stride_y;
- dst_stride_u = -dst_stride_u;
- dst_stride_v = -dst_stride_v;
- }
- int halfwidth = (width + 1) >> 1;
- void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
-#if defined(HAS_COPYROW_X86)
- if (IS_ALIGNED(halfwidth, 4)) {
- CopyRow = CopyRow_X86;
- }
-#endif
-#if defined(HAS_COPYROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(halfwidth, 32) &&
- IS_ALIGNED(src_u, 16) && IS_ALIGNED(src_stride_u, 16) &&
- IS_ALIGNED(src_v, 16) && IS_ALIGNED(src_stride_v, 16) &&
- IS_ALIGNED(dst_u, 16) && IS_ALIGNED(dst_stride_u, 16) &&
- IS_ALIGNED(dst_v, 16) && IS_ALIGNED(dst_stride_v, 16)) {
- CopyRow = CopyRow_SSE2;
- }
-#endif
-#if defined(HAS_COPYROW_ERMS)
- if (TestCpuFlag(kCpuHasERMS)) {
- CopyRow = CopyRow_ERMS;
- }
-#endif
-#if defined(HAS_COPYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(halfwidth, 32)) {
- CopyRow = CopyRow_NEON;
- }
-#endif
-#if defined(HAS_COPYROW_MIPS)
- if (TestCpuFlag(kCpuHasMIPS)) {
- CopyRow = CopyRow_MIPS;
- }
-#endif
-
- // Copy Y plane
- if (dst_y) {
- CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
- }
-
- // UpSample U plane.
- int y;
- for (y = 0; y < height - 1; y += 2) {
- CopyRow(src_u, dst_u, halfwidth);
- CopyRow(src_u, dst_u + dst_stride_u, halfwidth);
- src_u += src_stride_u;
- dst_u += dst_stride_u * 2;
- }
- if (height & 1) {
- CopyRow(src_u, dst_u, halfwidth);
- }
-
- // UpSample V plane.
- for (y = 0; y < height - 1; y += 2) {
- CopyRow(src_v, dst_v, halfwidth);
- CopyRow(src_v, dst_v + dst_stride_v, halfwidth);
- src_v += src_stride_v;
- dst_v += dst_stride_v * 2;
- }
- if (height & 1) {
- CopyRow(src_v, dst_v, halfwidth);
- }
- return 0;
+ const int dst_uv_width = (Abs(width) + 1) >> 1;
+ const int dst_uv_height = Abs(height);
+ return I420ToI4xx(src_y, src_stride_y,
+ src_u, src_stride_u,
+ src_v, src_stride_v,
+ dst_y, dst_stride_y,
+ dst_u, dst_stride_u,
+ dst_v, dst_stride_v,
+ width, height,
+ dst_uv_width, dst_uv_height);
}
-// TODO(fbarchard): Enable bilinear when fast enough or specialized upsampler.
+// 420 chroma is 1/2 width, 1/2 height
+// 444 chroma is 1x width, 1x height
LIBYUV_API
int I420ToI444(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
@@ -119,40 +91,16 @@ int I420ToI444(const uint8* src_y, int src_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height) {
- if (!src_y || !src_u|| !src_v ||
- !dst_y || !dst_u || !dst_v ||
- width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_y = dst_y + (height - 1) * dst_stride_y;
- dst_u = dst_u + (height - 1) * dst_stride_u;
- dst_v = dst_v + (height - 1) * dst_stride_v;
- dst_stride_y = -dst_stride_y;
- dst_stride_u = -dst_stride_u;
- dst_stride_v = -dst_stride_v;
- }
-
- // Copy Y plane
- if (dst_y) {
- CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
- }
-
- int halfwidth = (width + 1) >> 1;
- int halfheight = (height + 1) >> 1;
-
- // Upsample U plane from from 1/2 width, 1/2 height to 1x width, 1x height.
- ScalePlane(src_u, src_stride_u, halfwidth, halfheight,
- dst_u, dst_stride_u, width, height,
- kFilterNone);
-
- // Upsample V plane.
- ScalePlane(src_v, src_stride_v, halfwidth, halfheight,
- dst_v, dst_stride_v, width, height,
- kFilterNone);
- return 0;
+ const int dst_uv_width = Abs(width);
+ const int dst_uv_height = Abs(height);
+ return I420ToI4xx(src_y, src_stride_y,
+ src_u, src_stride_u,
+ src_v, src_stride_v,
+ dst_y, dst_stride_y,
+ dst_u, dst_stride_u,
+ dst_v, dst_stride_v,
+ width, height,
+ dst_uv_width, dst_uv_height);
}
// 420 chroma is 1/2 width, 1/2 height
@@ -165,41 +113,16 @@ int I420ToI411(const uint8* src_y, int src_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int width, int height) {
- if (!src_y || !src_u || !src_v ||
- !dst_y || !dst_u || !dst_v ||
- width <= 0 || height == 0) {
- return -1;
- }
- // Negative height means invert the image.
- if (height < 0) {
- height = -height;
- dst_y = dst_y + (height - 1) * dst_stride_y;
- dst_u = dst_u + (height - 1) * dst_stride_u;
- dst_v = dst_v + (height - 1) * dst_stride_v;
- dst_stride_y = -dst_stride_y;
- dst_stride_u = -dst_stride_u;
- dst_stride_v = -dst_stride_v;
- }
-
- // Copy Y plane
- if (dst_y) {
- CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
- }
-
- int halfwidth = (width + 1) >> 1;
- int halfheight = (height + 1) >> 1;
- int quarterwidth = (width + 3) >> 2;
-
- // Resample U plane from 1/2 width, 1/2 height to 1/4 width, 1x height
- ScalePlane(src_u, src_stride_u, halfwidth, halfheight,
- dst_u, dst_stride_u, quarterwidth, height,
- kFilterNone);
-
- // Resample V plane.
- ScalePlane(src_v, src_stride_v, halfwidth, halfheight,
- dst_v, dst_stride_v, quarterwidth, height,
- kFilterNone);
- return 0;
+ const int dst_uv_width = (Abs(width) + 3) >> 2;
+ const int dst_uv_height = Abs(height);
+ return I420ToI4xx(src_y, src_stride_y,
+ src_u, src_stride_u,
+ src_v, src_stride_v,
+ dst_y, dst_stride_y,
+ dst_u, dst_stride_u,
+ dst_v, dst_stride_v,
+ width, height,
+ dst_uv_width, dst_uv_height);
}
// Copy to I400. Source can be I420,422,444,400,NV12,NV21
@@ -237,16 +160,14 @@ int I422ToYUY2(const uint8* src_y, int src_stride_y,
dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2;
dst_stride_yuy2 = -dst_stride_yuy2;
}
- // Coalesce contiguous rows.
+ // Coalesce rows.
if (src_stride_y == width &&
src_stride_u * 2 == width &&
src_stride_v * 2 == width &&
dst_stride_yuy2 == width * 2) {
- return I422ToYUY2(src_y, 0,
- src_u, 0,
- src_v, 0,
- dst_yuy2, 0,
- width * height, 1);
+ width *= height;
+ height = 1;
+ src_stride_y = src_stride_u = src_stride_v = dst_stride_yuy2 = 0;
}
void (*I422ToYUY2Row)(const uint8* src_y, const uint8* src_u,
const uint8* src_v, uint8* dst_yuy2, int width) =
@@ -343,16 +264,14 @@ int I422ToUYVY(const uint8* src_y, int src_stride_y,
dst_uyvy = dst_uyvy + (height - 1) * dst_stride_uyvy;
dst_stride_uyvy = -dst_stride_uyvy;
}
- // Coalesce contiguous rows.
+ // Coalesce rows.
if (src_stride_y == width &&
src_stride_u * 2 == width &&
src_stride_v * 2 == width &&
dst_stride_uyvy == width * 2) {
- return I422ToUYVY(src_y, 0,
- src_u, 0,
- src_v, 0,
- dst_uyvy, 0,
- width * height, 1);
+ width *= height;
+ height = 1;
+ src_stride_y = src_stride_u = src_stride_v = dst_stride_uyvy = 0;
}
void (*I422ToUYVYRow)(const uint8* src_y, const uint8* src_u,
const uint8* src_v, uint8* dst_uyvy, int width) =
@@ -453,19 +372,22 @@ int I420ToNV12(const uint8* src_y, int src_stride_y,
dst_stride_y = -dst_stride_y;
dst_stride_uv = -dst_stride_uv;
}
- // Coalesce contiguous rows.
+ // Coalesce rows.
int halfwidth = (width + 1) >> 1;
int halfheight = (height + 1) >> 1;
if (src_stride_y == width &&
dst_stride_y == width) {
- width = width * height;
+ width *= height;
height = 1;
+ src_stride_y = dst_stride_y = 0;
}
- if (src_stride_u * 2 == width &&
- src_stride_v * 2 == width &&
- dst_stride_uv == width) {
- halfwidth = halfwidth * halfheight;
+ // Coalesce rows.
+ if (src_stride_u == halfwidth &&
+ src_stride_v == halfwidth &&
+ dst_stride_uv == halfwidth * 2) {
+ halfwidth *= halfheight;
halfheight = 1;
+ src_stride_u = src_stride_v = dst_stride_uv = 0;
}
void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
int width) = MergeUVRow_C;
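The hunks above replace the old self-call ("treat the whole image as one row and call again with stride 0") with an in-place coalesce: when a plane is stored with no padding between rows, the whole plane is folded into a single long row and the strides are zeroed. A minimal standalone sketch of that pattern, assuming a plain byte-plane copy rather than any actual libyuv routine (the function name is hypothetical):

#include <cstdint>
#include <cstring>

// Coalesce-rows sketch: when stride == width on both sides, the planes are
// contiguous, so the per-row loop body runs exactly once over width*height
// bytes and the strides no longer matter.
static void CopyPlaneSketch(const uint8_t* src, int src_stride,
                            uint8_t* dst, int dst_stride,
                            int width, int height) {
  if (src_stride == width && dst_stride == width) {
    width *= height;
    height = 1;
    src_stride = dst_stride = 0;  // unused once there is only one row
  }
  for (int y = 0; y < height; ++y) {
    memcpy(dst, src, static_cast<size_t>(width));
    src += src_stride;
    dst += dst_stride;
  }
}

Adjusting width/height/strides in place, instead of recursing with stride 0, keeps the hot loop free of an extra call and lets the row-function selection below run unchanged.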
diff --git a/chromium/third_party/libyuv/source/convert_from_argb.cc b/chromium/third_party/libyuv/source/convert_from_argb.cc
index 418f44d0cf5..9d5752cbb09 100644
--- a/chromium/third_party/libyuv/source/convert_from_argb.cc
+++ b/chromium/third_party/libyuv/source/convert_from_argb.cc
@@ -36,32 +36,30 @@ int ARGBToI444(const uint8* src_argb, int src_stride_argb,
src_argb = src_argb + (height - 1) * src_stride_argb;
src_stride_argb = -src_stride_argb;
}
- // Coalesce contiguous rows.
+ // Coalesce rows.
if (src_stride_argb == width * 4 &&
dst_stride_y == width &&
dst_stride_u == width &&
dst_stride_v == width) {
- return ARGBToI444(src_argb, 0,
- dst_y, 0,
- dst_u, 0,
- dst_v, 0,
- width * height, 1);
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0;
}
void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
ARGBToYRow_C;
void (*ARGBToUV444Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
int pix) = ARGBToUV444Row_C;
-#if defined(HAS_ARGBTOUV444ROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
- ARGBToUV444Row = ARGBToUV444Row_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- ARGBToUV444Row = ARGBToUV444Row_Unaligned_SSSE3;
- if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
- ARGBToUV444Row = ARGBToUV444Row_SSSE3;
- }
- }
- }
-#endif
+#if defined(HAS_ARGBTOUV444ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ ARGBToUV444Row = ARGBToUV444Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUV444Row = ARGBToUV444Row_Unaligned_SSSE3;
+ if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
+ ARGBToUV444Row = ARGBToUV444Row_SSSE3;
+ }
+ }
+ }
+#endif
#if defined(HAS_ARGBTOYROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
ARGBToYRow = ARGBToYRow_Any_SSSE3;
@@ -111,16 +109,14 @@ int ARGBToI422(const uint8* src_argb, int src_stride_argb,
src_argb = src_argb + (height - 1) * src_stride_argb;
src_stride_argb = -src_stride_argb;
}
- // Coalesce contiguous rows.
+ // Coalesce rows.
if (src_stride_argb == width * 4 &&
dst_stride_y == width &&
dst_stride_u * 2 == width &&
dst_stride_v * 2 == width) {
- return ARGBToI422(src_argb, 0,
- dst_y, 0,
- dst_u, 0,
- dst_v, 0,
- width * height, 1);
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0;
}
void (*ARGBToUV422Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
int pix) = ARGBToUV422Row_C;
@@ -190,16 +186,14 @@ int ARGBToI411(const uint8* src_argb, int src_stride_argb,
src_argb = src_argb + (height - 1) * src_stride_argb;
src_stride_argb = -src_stride_argb;
}
- // Coalesce contiguous rows.
+ // Coalesce rows.
if (src_stride_argb == width * 4 &&
dst_stride_y == width &&
dst_stride_u * 4 == width &&
dst_stride_v * 4 == width) {
- return ARGBToI411(src_argb, 0,
- dst_y, 0,
- dst_u, 0,
- dst_v, 0,
- width * height, 1);
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0;
}
void (*ARGBToUV411Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
int pix) = ARGBToUV411Row_C;
@@ -251,7 +245,7 @@ int ARGBToI411(const uint8* src_argb, int src_stride_argb,
return 0;
}
-LIBYUV_API
+LIBYUV_API SAFEBUFFERS
int ARGBToNV12(const uint8* src_argb, int src_stride_argb,
uint8* dst_y, int dst_stride_y,
uint8* dst_uv, int dst_stride_uv,
@@ -272,7 +266,7 @@ int ARGBToNV12(const uint8* src_argb, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
ARGBToYRow_C;
-#if defined(HAS_ARGBTOYROW_SSSE3)
+#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
ARGBToYRow = ARGBToYRow_Any_SSSE3;
@@ -353,7 +347,7 @@ int ARGBToNV12(const uint8* src_argb, int src_stride_argb,
}
// Same as NV12 but U and V swapped.
-LIBYUV_API
+LIBYUV_API SAFEBUFFERS
int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
uint8* dst_y, int dst_stride_y,
uint8* dst_uv, int dst_stride_uv,
@@ -374,7 +368,7 @@ int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
ARGBToYRow_C;
-#if defined(HAS_ARGBTOYROW_SSSE3)
+#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
ARGBToYRow = ARGBToYRow_Any_SSSE3;
@@ -455,7 +449,7 @@ int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
}
// Convert ARGB to YUY2.
-LIBYUV_API
+LIBYUV_API SAFEBUFFERS
int ARGBToYUY2(const uint8* src_argb, int src_stride_argb,
uint8* dst_yuy2, int dst_stride_yuy2,
int width, int height) {
@@ -470,13 +464,13 @@ int ARGBToYUY2(const uint8* src_argb, int src_stride_argb,
dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2;
dst_stride_yuy2 = -dst_stride_yuy2;
}
- // Coalesce contiguous rows.
+ // Coalesce rows.
if (width * height <= kMaxStride &&
src_stride_argb == width * 4 &&
dst_stride_yuy2 == width * 2) {
- return ARGBToYUY2(src_argb, 0,
- dst_yuy2, 0,
- width * height, 1);
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_yuy2 = 0;
}
void (*ARGBToUV422Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
int pix) = ARGBToUV422Row_C;
@@ -551,7 +545,7 @@ int ARGBToYUY2(const uint8* src_argb, int src_stride_argb,
}
// Convert ARGB to UYVY.
-LIBYUV_API
+LIBYUV_API SAFEBUFFERS
int ARGBToUYVY(const uint8* src_argb, int src_stride_argb,
uint8* dst_uyvy, int dst_stride_uyvy,
int width, int height) {
@@ -566,13 +560,13 @@ int ARGBToUYVY(const uint8* src_argb, int src_stride_argb,
dst_uyvy = dst_uyvy + (height - 1) * dst_stride_uyvy;
dst_stride_uyvy = -dst_stride_uyvy;
}
- // Coalesce contiguous rows.
+ // Coalesce rows.
if (width * height <= kMaxStride &&
src_stride_argb == width * 4 &&
dst_stride_uyvy == width * 2) {
- return ARGBToUYVY(src_argb, 0,
- dst_uyvy, 0,
- width * height, 1);
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_uyvy = 0;
}
void (*ARGBToUV422Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
int pix) = ARGBToUV422Row_C;
@@ -659,12 +653,12 @@ int ARGBToI400(const uint8* src_argb, int src_stride_argb,
src_argb = src_argb + (height - 1) * src_stride_argb;
src_stride_argb = -src_stride_argb;
}
- // Coalesce contiguous rows.
+ // Coalesce rows.
if (src_stride_argb == width * 4 &&
dst_stride_y == width) {
- return ARGBToI400(src_argb, 0,
- dst_y, 0,
- width * height, 1);
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_y = 0;
}
void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
ARGBToYRow_C;
@@ -706,7 +700,7 @@ int ARGBToI400(const uint8* src_argb, int src_stride_argb,
}
// Shuffle table for converting ARGB to RGBA.
-static const uvec8 kShuffleMaskARGBToRGBA = {
+static uvec8 kShuffleMaskARGBToRGBA = {
3u, 0u, 1u, 2u, 7u, 4u, 5u, 6u, 11u, 8u, 9u, 10u, 15u, 12u, 13u, 14u
};
@@ -734,19 +728,17 @@ int ARGBToRGB24(const uint8* src_argb, int src_stride_argb,
src_argb = src_argb + (height - 1) * src_stride_argb;
src_stride_argb = -src_stride_argb;
}
- // Coalesce contiguous rows.
+ // Coalesce rows.
if (src_stride_argb == width * 4 &&
dst_stride_rgb24 == width * 3) {
- return ARGBToRGB24(src_argb, 0,
- dst_rgb24, 0,
- width * height, 1);
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_rgb24 = 0;
}
void (*ARGBToRGB24Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =
ARGBToRGB24Row_C;
#if defined(HAS_ARGBTORGB24ROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16 &&
- IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
- IS_ALIGNED(dst_rgb24, 16) && IS_ALIGNED(dst_stride_rgb24, 16)) {
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
ARGBToRGB24Row = ARGBToRGB24Row_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToRGB24Row = ARGBToRGB24Row_SSSE3;
@@ -782,19 +774,17 @@ int ARGBToRAW(const uint8* src_argb, int src_stride_argb,
src_argb = src_argb + (height - 1) * src_stride_argb;
src_stride_argb = -src_stride_argb;
}
- // Coalesce contiguous rows.
+ // Coalesce rows.
if (src_stride_argb == width * 4 &&
dst_stride_raw == width * 3) {
- return ARGBToRAW(src_argb, 0,
- dst_raw, 0,
- width * height, 1);
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_raw = 0;
}
void (*ARGBToRAWRow)(const uint8* src_argb, uint8* dst_rgb, int pix) =
ARGBToRAWRow_C;
#if defined(HAS_ARGBTORAWROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16 &&
- IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
- IS_ALIGNED(dst_raw, 16) && IS_ALIGNED(dst_stride_raw, 16)) {
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
ARGBToRAWRow = ARGBToRAWRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToRAWRow = ARGBToRAWRow_SSSE3;
@@ -830,12 +820,12 @@ int ARGBToRGB565(const uint8* src_argb, int src_stride_argb,
src_argb = src_argb + (height - 1) * src_stride_argb;
src_stride_argb = -src_stride_argb;
}
- // Coalesce contiguous rows.
+ // Coalesce rows.
if (src_stride_argb == width * 4 &&
dst_stride_rgb565 == width * 2) {
- return ARGBToRGB565(src_argb, 0,
- dst_rgb565, 0,
- width * height, 1);
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_rgb565 = 0;
}
void (*ARGBToRGB565Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =
ARGBToRGB565Row_C;
@@ -877,12 +867,12 @@ int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb,
src_argb = src_argb + (height - 1) * src_stride_argb;
src_stride_argb = -src_stride_argb;
}
- // Coalesce contiguous rows.
+ // Coalesce rows.
if (src_stride_argb == width * 4 &&
dst_stride_argb1555 == width * 2) {
- return ARGBToARGB1555(src_argb, 0,
- dst_argb1555, 0,
- width * height, 1);
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_argb1555 = 0;
}
void (*ARGBToARGB1555Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =
ARGBToARGB1555Row_C;
@@ -924,12 +914,12 @@ int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb,
src_argb = src_argb + (height - 1) * src_stride_argb;
src_stride_argb = -src_stride_argb;
}
- // Coalesce contiguous rows.
+ // Coalesce rows.
if (src_stride_argb == width * 4 &&
dst_stride_argb4444 == width * 2) {
- return ARGBToARGB4444(src_argb, 0,
- dst_argb4444, 0,
- width * height, 1);
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_argb4444 = 0;
}
void (*ARGBToARGB4444Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =
ARGBToARGB4444Row_C;
@@ -980,7 +970,7 @@ int ARGBToJ420(const uint8* src_argb, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) = ARGBToUVJRow_C;
void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int pix) =
ARGBToYJRow_C;
-#if defined(HAS_ARGBTOYJROW_SSSE3)
+#if defined(HAS_ARGBTOYJROW_SSSE3) && defined(HAS_ARGBTOUVJROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3;
ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
@@ -996,7 +986,7 @@ int ARGBToJ420(const uint8* src_argb, int src_stride_argb,
}
}
#endif
-#if defined(HAS_ARGBTOYJROW_AVX2)
+#if defined(HAS_ARGBTOYJROW_AVX2) && defined(HAS_ARGBTOUVJROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
ARGBToYJRow = ARGBToYJRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
@@ -1048,12 +1038,12 @@ int ARGBToJ400(const uint8* src_argb, int src_stride_argb,
src_argb = src_argb + (height - 1) * src_stride_argb;
src_stride_argb = -src_stride_argb;
}
- // Coalesce contiguous rows.
+ // Coalesce rows.
if (src_stride_argb == width * 4 &&
dst_stride_yj == width) {
- return ARGBToJ400(src_argb, 0,
- dst_yj, 0,
- width * height, 1);
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_yj = 0;
}
void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int pix) =
ARGBToYJRow_C;
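A recurring shape in these converters is the runtime selection of a row function: start from the portable C implementation, promote to a SIMD "Any" variant when the CPU flag is set and the row is long enough, and promote again to the fully specialized variant when the width is an exact multiple of the SIMD block. A self-contained, simplified sketch of that selection logic (hypothetical stub row functions stand in for libyuv's real ones; the real code also checks pointer alignment for some variants):

#include <cstdint>
#include <cstring>

using RowFn = void (*)(const uint8_t* src, uint8_t* dst, int width);

// Stub row functions: in libyuv these would be the C, "Any", and aligned
// SIMD variants of the same row operation. Here they all just copy bytes.
static void RowFn_C(const uint8_t* s, uint8_t* d, int w)         { memcpy(d, s, static_cast<size_t>(w)); }
static void RowFn_Any_SSSE3(const uint8_t* s, uint8_t* d, int w) { memcpy(d, s, static_cast<size_t>(w)); }
static void RowFn_SSSE3(const uint8_t* s, uint8_t* d, int w)     { memcpy(d, s, static_cast<size_t>(w)); }

// has_ssse3 stands in for TestCpuFlag(kCpuHasSSSE3).
static RowFn ChooseRowFn(bool has_ssse3, int width) {
  RowFn fn = RowFn_C;                 // portable fallback
  if (has_ssse3 && width >= 16) {
    fn = RowFn_Any_SSSE3;             // SIMD body plus C tail for odd widths
    if ((width % 16) == 0) {
      fn = RowFn_SSSE3;               // whole row in 16-pixel SIMD blocks
    }
  }
  return fn;
}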
diff --git a/chromium/third_party/libyuv/source/convert_to_argb.cc b/chromium/third_party/libyuv/source/convert_to_argb.cc
index 95b6386d719..aa6185661cd 100644
--- a/chromium/third_party/libyuv/source/convert_to_argb.cc
+++ b/chromium/third_party/libyuv/source/convert_to_argb.cc
@@ -61,15 +61,15 @@ int ConvertToARGB(const uint8* sample, size_t sample_size,
bool need_buf = (rotation && format != FOURCC_ARGB) || dst_argb == sample;
uint8* tmp_argb = dst_argb;
int tmp_argb_stride = argb_stride;
- uint8* buf = NULL;
+ uint8* rotate_buffer = NULL;
int abs_dst_height = (dst_height < 0) ? -dst_height : dst_height;
if (need_buf) {
int argb_size = dst_width * abs_dst_height * 4;
- buf = new uint8[argb_size];
- if (!buf) {
+ rotate_buffer = new uint8[argb_size];
+ if (!rotate_buffer) {
return 1; // Out of memory runtime error.
}
- dst_argb = buf;
+ dst_argb = rotate_buffer;
argb_stride = dst_width;
}
@@ -312,7 +312,7 @@ int ConvertToARGB(const uint8* sample, size_t sample_size,
tmp_argb, tmp_argb_stride,
dst_width, abs_dst_height, rotation);
}
- delete buf;
+ delete [] rotate_buffer;
}
return r;
diff --git a/chromium/third_party/libyuv/source/convert_to_i420.cc b/chromium/third_party/libyuv/source/convert_to_i420.cc
index 763eb50920e..5683ffe43ab 100644
--- a/chromium/third_party/libyuv/source/convert_to_i420.cc
+++ b/chromium/third_party/libyuv/source/convert_to_i420.cc
@@ -68,16 +68,16 @@ int ConvertToI420(const uint8* sample,
int tmp_y_stride = y_stride;
int tmp_u_stride = u_stride;
int tmp_v_stride = v_stride;
- uint8* buf = NULL;
+ uint8* rotate_buffer = NULL;
int abs_dst_height = (dst_height < 0) ? -dst_height : dst_height;
if (need_buf) {
int y_size = dst_width * abs_dst_height;
int uv_size = ((dst_width + 1) / 2) * ((abs_dst_height + 1) / 2);
- buf = new uint8[y_size + uv_size * 2];
- if (!buf) {
+ rotate_buffer = new uint8[y_size + uv_size * 2];
+ if (!rotate_buffer) {
return 1; // Out of memory runtime error.
}
- y = buf;
+ y = rotate_buffer;
u = y + y_size;
v = u + uv_size;
y_stride = dst_width;
@@ -372,7 +372,7 @@ int ConvertToI420(const uint8* sample,
tmp_v, tmp_v_stride,
dst_width, abs_dst_height, rotation);
}
- delete buf;
+ delete [] rotate_buffer;
}
return r;
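Both ConvertToARGB and ConvertToI420 allocate their temporary rotation buffer with new[] but previously released it with scalar delete, which is undefined behavior; the rename to rotate_buffer and the switch to delete [] fix that. A minimal sketch of the corrected pattern, with hypothetical names and the conversion/rotation steps elided:

#include <cstdint>

// Sketch only: memory obtained with new[] must be released with delete[],
// never with scalar delete.
int ConvertWithRotationSketch(int dst_width, int dst_height, bool need_buf) {
  uint8_t* rotate_buffer = nullptr;
  if (need_buf) {
    rotate_buffer = new uint8_t[dst_width * dst_height * 4];  // ARGB scratch
  }
  // ... convert into rotate_buffer, then rotate into the real destination ...
  delete [] rotate_buffer;  // array form matches new[]; safe on nullptr
  return 0;
}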
diff --git a/chromium/third_party/libyuv/source/cpu_id.cc b/chromium/third_party/libyuv/source/cpu_id.cc
index b4c993a2740..c4f840abb14 100644
--- a/chromium/third_party/libyuv/source/cpu_id.cc
+++ b/chromium/third_party/libyuv/source/cpu_id.cc
@@ -11,14 +11,16 @@
#include "libyuv/cpu_id.h"
#ifdef _MSC_VER
-#include <intrin.h> // For __cpuid()
+#include <intrin.h> // For __cpuidex()
#endif
-#if !defined(__CLR_VER) && defined(_M_X64) && \
+#if !defined(__CLR_VER) && !defined(__native_client__) && defined(_M_X64) && \
defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219)
#include <immintrin.h> // For _xgetbv()
#endif
+#if !defined(__native_client__)
#include <stdlib.h> // For getenv()
+#endif
// For ArmCpuCaps() but unittested on all platforms
#include <stdio.h>
@@ -26,92 +28,102 @@
#include "libyuv/basic_types.h" // For CPU_X86
-// TODO(fbarchard): Consider cpu functionality for breakpoints, timer and cache.
-// arm - bkpt vs intel int 3
-
-// TODO(fbarchard): Use cpuid.h when gcc 4.4 is used on OSX and Linux.
-#if (defined(__pic__) || defined(__APPLE__)) && defined(__i386__)
-static __inline void __cpuid(int cpu_info[4], int info_type) {
- asm volatile ( // NOLINT
- "mov %%ebx, %%edi \n"
- "cpuid \n"
- "xchg %%edi, %%ebx \n"
- : "=a"(cpu_info[0]), "=D"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
- : "a"(info_type));
-}
-#elif defined(__i386__) || defined(__x86_64__)
-static __inline void __cpuid(int cpu_info[4], int info_type) {
- asm volatile ( // NOLINT
- "cpuid \n"
- : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
- : "a"(info_type));
-}
-#endif
-
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
+// For functions that use rowbuffer and have runtime checks for overflow,
+// use SAFEBUFFERS to avoid additional check.
+#if defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219)
+#define SAFEBUFFERS __declspec(safebuffers)
+#else
+#define SAFEBUFFERS
+#endif
+
// Low level cpuid for X86. Returns zeros on other CPUs.
#if !defined(__CLR_VER) && (defined(_M_IX86) || defined(_M_X64) || \
defined(__i386__) || defined(__x86_64__))
LIBYUV_API
-void CpuId(int cpu_info[4], int info_type) {
- __cpuid(cpu_info, info_type);
-}
+void CpuId(uint32 info_eax, uint32 info_ecx, uint32* cpu_info) {
+#if defined(_MSC_VER)
+#if (_MSC_FULL_VER >= 160040219)
+ __cpuidex(reinterpret_cast<int*>(cpu_info), info_eax, info_ecx);
+#elif defined(_M_IX86)
+ __asm {
+ mov eax, info_eax
+ mov ecx, info_ecx
+ mov edi, cpu_info
+ cpuid
+ mov [edi], eax
+ mov [edi + 4], ebx
+ mov [edi + 8], ecx
+ mov [edi + 12], edx
+ }
#else
-LIBYUV_API
-void CpuId(int cpu_info[4], int) {
- cpu_info[0] = cpu_info[1] = cpu_info[2] = cpu_info[3] = 0;
-}
+ if (info_ecx == 0) {
+ __cpuid(reinterpret_cast<int*>(cpu_info), info_eax);
+ } else {
+ cpu_info[3] = cpu_info[2] = cpu_info[1] = cpu_info[0] = 0;
+ }
#endif
-
-// X86 CPUs have xgetbv to detect OS saves high parts of ymm registers.
-#if !defined(__CLR_VER) && defined(_M_X64) && \
- defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219)
-#define HAS_XGETBV
-static uint32 XGetBV(unsigned int xcr) {
- return static_cast<uint32>(_xgetbv(xcr));
+#else // defined(_MSC_VER)
+ uint32 info_ebx, info_edx;
+ asm volatile ( // NOLINT
+#if defined( __i386__) && defined(__PIC__)
+ // Preserve ebx for fpic 32 bit.
+ "mov %%ebx, %%edi \n"
+ "cpuid \n"
+ "xchg %%edi, %%ebx \n"
+ : "=D" (info_ebx),
+#else
+ "cpuid \n"
+ : "=b" (info_ebx),
+#endif // defined( __i386__) && defined(__PIC__)
+ "+a" (info_eax), "+c" (info_ecx), "=d" (info_edx));
+ cpu_info[0] = info_eax;
+ cpu_info[1] = info_ebx;
+ cpu_info[2] = info_ecx;
+ cpu_info[3] = info_edx;
+#endif // defined(_MSC_VER)
}
-#elif !defined(__CLR_VER) && defined(_M_IX86) && defined(_MSC_VER)
+
+#if !defined(__native_client__)
#define HAS_XGETBV
-__declspec(naked) __declspec(align(16))
-static uint32 XGetBV(unsigned int xcr) {
+// X86 CPUs have xgetbv to detect OS saves high parts of ymm registers.
+int TestOsSaveYmm() {
+ uint32 xcr0 = 0u;
+#if defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219)
+ xcr0 = static_cast<uint32>(_xgetbv(0)); // VS2010 SP1 required.
+#elif defined(_M_IX86)
__asm {
- mov ecx, [esp + 4] // xcr
- push edx
- _asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0 // xgetbv for vs2005.
- pop edx
- ret
+ xor ecx, ecx // xcr 0
+ _asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0 // For VS2010 and earlier.
+ mov xcr0, eax
}
-}
#elif defined(__i386__) || defined(__x86_64__)
-#define HAS_XGETBV
-static uint32 XGetBV(unsigned int xcr) {
- uint32 xcr_feature_mask;
- asm volatile ( // NOLINT
- ".byte 0x0f, 0x01, 0xd0\n"
- : "=a"(xcr_feature_mask)
- : "c"(xcr)
- : "memory", "cc", "edx"); // edx unused.
- return xcr_feature_mask;
+ asm(".byte 0x0f, 0x01, 0xd0" : "=a" (xcr0) : "c" (0) : "%edx");
+#endif // defined(_MSC_VER)
+ return((xcr0 & 6) == 6); // Is ymm saved?
+}
+#endif // !defined(__native_client__)
+#else
+LIBYUV_API
+void CpuId(uint32 eax, uint32 ecx, uint32* cpu_info) {
+ cpu_info[0] = cpu_info[1] = cpu_info[2] = cpu_info[3] = 0;
}
-#endif
-#ifdef HAS_XGETBV
-static const int kXCR_XFEATURE_ENABLED_MASK = 0;
#endif
// based on libvpx arm_cpudetect.c
// For Arm, but public to allow testing on any CPU
-LIBYUV_API
+LIBYUV_API SAFEBUFFERS
int ArmCpuCaps(const char* cpuinfo_name) {
FILE* f = fopen(cpuinfo_name, "r");
if (f) {
- char buf[512];
- while (fgets(buf, 511, f)) {
- if (memcmp(buf, "Features", 8) == 0) {
- char* p = strstr(buf, " neon");
+ char cpuinfo_line[512];
+ while (fgets(cpuinfo_line, sizeof(cpuinfo_line) - 1, f)) {
+ if (memcmp(cpuinfo_line, "Features", 8) == 0) {
+ char* p = strstr(cpuinfo_line, " neon");
if (p && (p[5] == ' ' || p[5] == '\n')) {
fclose(f);
return kCpuHasNEON;
@@ -129,7 +141,7 @@ static int MipsCpuCaps(const char* search_string) {
char cpuinfo_line[256];
FILE* f = NULL;
if ((f = fopen(file_name, "r")) != NULL) {
- while (fgets(cpuinfo_line, sizeof(cpuinfo_line), f) != NULL) {
+ while (fgets(cpuinfo_line, sizeof(cpuinfo_line) - 1, f) != NULL) {
if (strstr(cpuinfo_line, search_string) != NULL) {
fclose(f);
return kCpuHasMIPS_DSP;
@@ -148,6 +160,8 @@ int cpu_info_ = kCpuInit; // cpu_info is not initialized yet.
// Test environment variable for disabling CPU features. Any non-zero value
// to disable. Zero ignored to make it easy to set the variable on/off.
+#if !defined(__native_client__) && !defined(_M_ARM)
+
static bool TestEnv(const char* name) {
const char* var = getenv(name);
if (var) {
@@ -157,23 +171,29 @@ static bool TestEnv(const char* name) {
}
return false;
}
+#else // nacl does not support getenv().
+static bool TestEnv(const char*) {
+ return false;
+}
+#endif
-LIBYUV_API
+LIBYUV_API SAFEBUFFERS
int InitCpuFlags(void) {
#if !defined(__CLR_VER) && defined(CPU_X86)
- int cpu_info1[4] = { 0, 0, 0, 0 };
- int cpu_info7[4] = { 0, 0, 0, 0 };
- __cpuid(cpu_info1, 1);
- __cpuid(cpu_info7, 7);
+ uint32 cpu_info1[4] = { 0, 0, 0, 0 };
+ uint32 cpu_info7[4] = { 0, 0, 0, 0 };
+ CpuId(1, 0, cpu_info1);
+ CpuId(7, 0, cpu_info7);
cpu_info_ = ((cpu_info1[3] & 0x04000000) ? kCpuHasSSE2 : 0) |
((cpu_info1[2] & 0x00000200) ? kCpuHasSSSE3 : 0) |
((cpu_info1[2] & 0x00080000) ? kCpuHasSSE41 : 0) |
((cpu_info1[2] & 0x00100000) ? kCpuHasSSE42 : 0) |
((cpu_info7[1] & 0x00000200) ? kCpuHasERMS : 0) |
+ ((cpu_info1[2] & 0x00001000) ? kCpuHasFMA3 : 0) |
kCpuHasX86;
#ifdef HAS_XGETBV
if ((cpu_info1[2] & 0x18000000) == 0x18000000 && // AVX and OSSave
- (XGetBV(kXCR_XFEATURE_ENABLED_MASK) & 0x06) == 0x06) { // Saves YMM.
+ TestOsSaveYmm()) { // Saves YMM.
cpu_info_ |= ((cpu_info7[1] & 0x00000020) ? kCpuHasAVX2 : 0) |
kCpuHasAVX;
}
@@ -203,6 +223,9 @@ int InitCpuFlags(void) {
if (TestEnv("LIBYUV_DISABLE_ERMS")) {
cpu_info_ &= ~kCpuHasERMS;
}
+ if (TestEnv("LIBYUV_DISABLE_FMA3")) {
+ cpu_info_ &= ~kCpuHasFMA3;
+ }
#elif defined(__mips__) && defined(__linux__)
// Linux mips parse text file for dsp detect.
cpu_info_ = MipsCpuCaps("dsp"); // set kCpuHasMIPS_DSP.
@@ -221,10 +244,11 @@ int InitCpuFlags(void) {
cpu_info_ &= ~kCpuHasMIPS_DSPR2;
}
#elif defined(__arm__)
-#if defined(__linux__) && (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
+#if defined(__linux__) && (defined(__ARM_NEON__) || defined(LIBYUV_NEON)) && \
+ !defined(__native_client__)
// Linux arm parse text file for neon detect.
cpu_info_ = ArmCpuCaps("/proc/cpuinfo");
-#elif defined(__ARM_NEON__)
+#elif defined(__ARM_NEON__) || defined(__native_client__)
// gcc -mfpu=neon defines __ARM_NEON__
// Enable Neon if you want support for Neon and Arm, and use MaskCpuFlags
// to disable Neon on devices that do not have it.
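The cpu_id.cc rework gives CpuId an explicit sub-leaf argument (so leaf 7 can be queried for ERMS and AVX2), adds an FMA3 bit, and replaces XGetBV with TestOsSaveYmm, which requires both the XMM and YMM state bits in xcr0 before the AVX/AVX2 feature bits are trusted. A standalone sketch of the same detection flow using the GCC/Clang <cpuid.h> builtins instead of libyuv's wrappers (x86/x86_64 with a reasonably recent GCC or Clang assumed):

#include <cpuid.h>
#include <cstdint>

// xgetbv(0): ask the OS which register state it saves on context switch.
static bool OsSavesYmm() {
  uint32_t xcr0;
  asm(".byte 0x0f, 0x01, 0xd0" : "=a"(xcr0) : "c"(0) : "%edx");
  return (xcr0 & 6) == 6;  // bit 1 = XMM state, bit 2 = YMM state
}

static bool HasAvx2() {
  unsigned eax, ebx, ecx, edx;
  if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx)) {
    return false;                             // cpuid leaf 1 unavailable
  }
  if ((ecx & 0x18000000u) != 0x18000000u) {   // AVX + OSXSAVE bits
    return false;
  }
  if (!OsSavesYmm()) {
    return false;                             // OS does not save YMM state
  }
  if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx)) {
    return false;                             // leaf 7 not supported
  }
  return (ebx & 0x20u) != 0;                  // AVX2 feature bit
}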
diff --git a/chromium/third_party/libyuv/source/format_conversion.cc b/chromium/third_party/libyuv/source/format_conversion.cc
index 5b931b58773..cf7d6ea3af9 100644
--- a/chromium/third_party/libyuv/source/format_conversion.cc
+++ b/chromium/third_party/libyuv/source/format_conversion.cc
@@ -32,7 +32,7 @@ static int MakeSelectors(const int blue_index,
const int green_index,
const int red_index,
uint32 dst_fourcc_bayer,
- uint32 *index_map) {
+ uint32* index_map) {
// Now build a lookup table containing the indices for the four pixels in each
// 2x2 Bayer grid.
switch (dst_fourcc_bayer) {
@@ -280,7 +280,7 @@ int BayerToARGB(const uint8* src_bayer, int src_stride_bayer,
}
// Converts any Bayer RGB format to ARGB.
-LIBYUV_API
+LIBYUV_API SAFEBUFFERS
int BayerToI420(const uint8* src_bayer, int src_stride_bayer,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
@@ -310,7 +310,7 @@ int BayerToI420(const uint8* src_bayer, int src_stride_bayer,
uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
ARGBToYRow_C;
-#if defined(HAS_ARGBTOYROW_SSSE3)
+#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
ARGBToYRow = ARGBToYRow_Any_SSSE3;
@@ -380,7 +380,7 @@ int BayerToI420(const uint8* src_bayer, int src_stride_bayer,
}
// Convert I420 to Bayer.
-LIBYUV_API
+LIBYUV_API SAFEBUFFERS
int I420ToBayer(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
diff --git a/chromium/third_party/libyuv/source/mjpeg_decoder.cc b/chromium/third_party/libyuv/source/mjpeg_decoder.cc
index 5d7296d7e73..bd423200531 100644
--- a/chromium/third_party/libyuv/source/mjpeg_decoder.cc
+++ b/chromium/third_party/libyuv/source/mjpeg_decoder.cc
@@ -420,9 +420,12 @@ void MJpegDecoder::ErrorHandler(j_common_ptr cinfo) {
// recover from errors we use setjmp() as shown in their example. setjmp() is
// C's implementation for the "call with current continuation" functionality
// seen in some functional programming languages.
+ // A formatted message can be output, but is unsafe for release.
+#ifdef DEBUG
char buf[JMSG_LENGTH_MAX];
(*cinfo->err->format_message)(cinfo, buf);
// ERROR: Error in jpeglib: buf
+#endif
SetJmpErrorMgr* mgr = reinterpret_cast<SetJmpErrorMgr*>(cinfo->err);
// This rewinds the call stack to the point of the corresponding setjmp()
diff --git a/chromium/third_party/libyuv/source/planar_functions.cc b/chromium/third_party/libyuv/source/planar_functions.cc
index 2f70331327c..114faaef627 100644
--- a/chromium/third_party/libyuv/source/planar_functions.cc
+++ b/chromium/third_party/libyuv/source/planar_functions.cc
@@ -28,13 +28,12 @@ LIBYUV_API
void CopyPlane(const uint8* src_y, int src_stride_y,
uint8* dst_y, int dst_stride_y,
int width, int height) {
- // Coalesce contiguous rows.
+ // Coalesce rows.
if (src_stride_y == width &&
dst_stride_y == width) {
- CopyPlane(src_y, 0,
- dst_y, 0,
- width * height, 1);
- return;
+ width *= height;
+ height = 1;
+ src_stride_y = dst_stride_y = 0;
}
void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
#if defined(HAS_COPYROW_X86)
@@ -173,10 +172,16 @@ int I420ToI400(const uint8* src_y, int src_stride_y,
return 0;
}
-// Mirror a plane of data
+// Mirror a plane of data.
void MirrorPlane(const uint8* src_y, int src_stride_y,
uint8* dst_y, int dst_stride_y,
int width, int height) {
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_stride_y = -src_stride_y;
+ }
void (*MirrorRow)(const uint8* src, uint8* dst, int width) = MirrorRow_C;
#if defined(HAS_MIRRORROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) {
@@ -222,16 +227,14 @@ int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2,
src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2;
src_stride_yuy2 = -src_stride_yuy2;
}
- // Coalesce contiguous rows.
+ // Coalesce rows.
if (src_stride_yuy2 == width * 2 &&
dst_stride_y == width &&
dst_stride_u * 2 == width &&
dst_stride_v * 2 == width) {
- return YUY2ToI422(src_yuy2, 0,
- dst_y, 0,
- dst_u, 0,
- dst_v, 0,
- width * height, 1);
+ width *= height;
+ height = 1;
+ src_stride_yuy2 = dst_stride_y = dst_stride_u = dst_stride_v = 0;
}
void (*YUY2ToUV422Row)(const uint8* src_yuy2,
uint8* dst_u, uint8* dst_v, int pix);
@@ -302,16 +305,14 @@ int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy,
src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy;
src_stride_uyvy = -src_stride_uyvy;
}
- // Coalesce contiguous rows.
+ // Coalesce rows.
if (src_stride_uyvy == width * 2 &&
dst_stride_y == width &&
dst_stride_u * 2 == width &&
dst_stride_v * 2 == width) {
- return UYVYToI422(src_uyvy, 0,
- dst_y, 0,
- dst_u, 0,
- dst_v, 0,
- width * height, 1);
+ width *= height;
+ height = 1;
+ src_stride_uyvy = dst_stride_y = dst_stride_u = dst_stride_v = 0;
}
void (*UYVYToUV422Row)(const uint8* src_uyvy,
uint8* dst_u, uint8* dst_v, int pix);
@@ -509,14 +510,13 @@ int ARGBBlend(const uint8* src_argb0, int src_stride_argb0,
dst_argb = dst_argb + (height - 1) * dst_stride_argb;
dst_stride_argb = -dst_stride_argb;
}
- // Coalesce contiguous rows.
+ // Coalesce rows.
if (src_stride_argb0 == width * 4 &&
src_stride_argb1 == width * 4 &&
dst_stride_argb == width * 4) {
- return ARGBBlend(src_argb0, 0,
- src_argb1, 0,
- dst_argb, 0,
- width * height, 1);
+ width *= height;
+ height = 1;
+ src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0;
}
void (*ARGBBlendRow)(const uint8* src_argb, const uint8* src_argb1,
uint8* dst_argb, int width) = GetARGBBlend();
@@ -545,16 +545,14 @@ int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0,
dst_argb = dst_argb + (height - 1) * dst_stride_argb;
dst_stride_argb = -dst_stride_argb;
}
- // Coalesce contiguous rows.
+ // Coalesce rows.
if (src_stride_argb0 == width * 4 &&
src_stride_argb1 == width * 4 &&
dst_stride_argb == width * 4) {
- return ARGBMultiply(src_argb0, 0,
- src_argb1, 0,
- dst_argb, 0,
- width * height, 1);
+ width *= height;
+ height = 1;
+ src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0;
}
-
void (*ARGBMultiplyRow)(const uint8* src0, const uint8* src1, uint8* dst,
int width) = ARGBMultiplyRow_C;
#if defined(HAS_ARGBMULTIPLYROW_SSE2)
@@ -607,16 +605,14 @@ int ARGBAdd(const uint8* src_argb0, int src_stride_argb0,
dst_argb = dst_argb + (height - 1) * dst_stride_argb;
dst_stride_argb = -dst_stride_argb;
}
- // Coalesce contiguous rows.
+ // Coalesce rows.
if (src_stride_argb0 == width * 4 &&
src_stride_argb1 == width * 4 &&
dst_stride_argb == width * 4) {
- return ARGBAdd(src_argb0, 0,
- src_argb1, 0,
- dst_argb, 0,
- width * height, 1);
+ width *= height;
+ height = 1;
+ src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0;
}
-
void (*ARGBAddRow)(const uint8* src0, const uint8* src1, uint8* dst,
int width) = ARGBAddRow_C;
#if defined(HAS_ARGBADDROW_SSE2) && defined(_MSC_VER)
@@ -674,16 +670,14 @@ int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0,
dst_argb = dst_argb + (height - 1) * dst_stride_argb;
dst_stride_argb = -dst_stride_argb;
}
- // Coalesce contiguous rows.
+ // Coalesce rows.
if (src_stride_argb0 == width * 4 &&
src_stride_argb1 == width * 4 &&
dst_stride_argb == width * 4) {
- return ARGBSubtract(src_argb0, 0,
- src_argb1, 0,
- dst_argb, 0,
- width * height, 1);
+ width *= height;
+ height = 1;
+ src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0;
}
-
void (*ARGBSubtractRow)(const uint8* src0, const uint8* src1, uint8* dst,
int width) = ARGBSubtractRow_C;
#if defined(HAS_ARGBSUBTRACTROW_SSE2)
@@ -739,16 +733,14 @@ int I422ToBGRA(const uint8* src_y, int src_stride_y,
dst_bgra = dst_bgra + (height - 1) * dst_stride_bgra;
dst_stride_bgra = -dst_stride_bgra;
}
- // Coalesce contiguous rows.
+ // Coalesce rows.
if (src_stride_y == width &&
src_stride_u * 2 == width &&
src_stride_v * 2 == width &&
dst_stride_bgra == width * 4) {
- return I422ToBGRA(src_y, 0,
- src_u, 0,
- src_v, 0,
- dst_bgra, 0,
- width * height, 1);
+ width *= height;
+ height = 1;
+ src_stride_y = src_stride_u = src_stride_v = dst_stride_bgra = 0;
}
void (*I422ToBGRARow)(const uint8* y_buf,
const uint8* u_buf,
@@ -810,16 +802,14 @@ int I422ToABGR(const uint8* src_y, int src_stride_y,
dst_abgr = dst_abgr + (height - 1) * dst_stride_abgr;
dst_stride_abgr = -dst_stride_abgr;
}
- // Coalesce contiguous rows.
+ // Coalesce rows.
if (src_stride_y == width &&
src_stride_u * 2 == width &&
src_stride_v * 2 == width &&
dst_stride_abgr == width * 4) {
- return I422ToABGR(src_y, 0,
- src_u, 0,
- src_v, 0,
- dst_abgr, 0,
- width * height, 1);
+ width *= height;
+ height = 1;
+ src_stride_y = src_stride_u = src_stride_v = dst_stride_abgr = 0;
}
void (*I422ToABGRRow)(const uint8* y_buf,
const uint8* u_buf,
@@ -873,16 +863,14 @@ int I422ToRGBA(const uint8* src_y, int src_stride_y,
dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba;
dst_stride_rgba = -dst_stride_rgba;
}
- // Coalesce contiguous rows.
+ // Coalesce rows.
if (src_stride_y == width &&
src_stride_u * 2 == width &&
src_stride_v * 2 == width &&
dst_stride_rgba == width * 4) {
- return I422ToRGBA(src_y, 0,
- src_u, 0,
- src_v, 0,
- dst_rgba, 0,
- width * height, 1);
+ width *= height;
+ height = 1;
+ src_stride_y = src_stride_u = src_stride_v = dst_stride_rgba = 0;
}
void (*I422ToRGBARow)(const uint8* y_buf,
const uint8* u_buf,
@@ -1016,12 +1004,11 @@ LIBYUV_API
void SetPlane(uint8* dst_y, int dst_stride_y,
int width, int height,
uint32 value) {
- // Coalesce contiguous rows.
+ // Coalesce rows.
if (dst_stride_y == width) {
- SetPlane(dst_y, 0,
- width * height, 1,
- value);
- return;
+ width *= height;
+ height = 1;
+ dst_stride_y = 0;
}
void (*SetRow)(uint8* dst, uint32 value, int pix) = SetRow_C;
#if defined(HAS_SETROW_NEON)
@@ -1084,27 +1071,27 @@ int ARGBRect(uint8* dst_argb, int dst_stride_argb,
dst_x < 0 || dst_y < 0) {
return -1;
}
- // Coalesce contiguous rows.
+ dst_argb += dst_y * dst_stride_argb + dst_x * 4;
+ // Coalesce rows.
if (dst_stride_argb == width * 4) {
- return ARGBRect(dst_argb, dst_stride_argb,
- dst_x, dst_y,
- width * height, 1, value);
+ width *= height;
+ height = 1;
+ dst_stride_argb = 0;
}
- uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
#if defined(HAS_SETROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16) &&
- IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- ARGBSetRows_NEON(dst, value, width, dst_stride_argb, height);
+ IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ ARGBSetRows_NEON(dst_argb, value, width, dst_stride_argb, height);
return 0;
}
#endif
#if defined(HAS_SETROW_X86)
if (TestCpuFlag(kCpuHasX86)) {
- ARGBSetRows_X86(dst, value, width, dst_stride_argb, height);
+ ARGBSetRows_X86(dst_argb, value, width, dst_stride_argb, height);
return 0;
}
#endif
- ARGBSetRows_C(dst, value, width, dst_stride_argb, height);
+ ARGBSetRows_C(dst_argb, value, width, dst_stride_argb, height);
return 0;
}
@@ -1133,12 +1120,12 @@ int ARGBAttenuate(const uint8* src_argb, int src_stride_argb,
src_argb = src_argb + (height - 1) * src_stride_argb;
src_stride_argb = -src_stride_argb;
}
- // Coalesce contiguous rows.
+ // Coalesce rows.
if (src_stride_argb == width * 4 &&
dst_stride_argb == width * 4) {
- return ARGBAttenuate(src_argb, 0,
- dst_argb, 0,
- width * height, 1);
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_argb = 0;
}
void (*ARGBAttenuateRow)(const uint8* src_argb, uint8* dst_argb,
int width) = ARGBAttenuateRow_C;
@@ -1153,9 +1140,7 @@ int ARGBAttenuate(const uint8* src_argb, int src_stride_argb,
}
#endif
#if defined(HAS_ARGBATTENUATEROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 4 &&
- IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 4) {
ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3;
if (IS_ALIGNED(width, 4)) {
ARGBAttenuateRow = ARGBAttenuateRow_SSSE3;
@@ -1200,19 +1185,17 @@ int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb,
src_argb = src_argb + (height - 1) * src_stride_argb;
src_stride_argb = -src_stride_argb;
}
- // Coalesce contiguous rows.
+ // Coalesce rows.
if (src_stride_argb == width * 4 &&
dst_stride_argb == width * 4) {
- return ARGBUnattenuate(src_argb, 0,
- dst_argb, 0,
- width * height, 1);
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_argb = 0;
}
void (*ARGBUnattenuateRow)(const uint8* src_argb, uint8* dst_argb,
int width) = ARGBUnattenuateRow_C;
#if defined(HAS_ARGBUNATTENUATEROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && width >= 4 &&
- IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ if (TestCpuFlag(kCpuHasSSE2) && width >= 4) {
ARGBUnattenuateRow = ARGBUnattenuateRow_Any_SSE2;
if (IS_ALIGNED(width, 4)) {
ARGBUnattenuateRow = ARGBUnattenuateRow_SSE2;
@@ -1250,12 +1233,12 @@ int ARGBGrayTo(const uint8* src_argb, int src_stride_argb,
src_argb = src_argb + (height - 1) * src_stride_argb;
src_stride_argb = -src_stride_argb;
}
- // Coalesce contiguous rows.
+ // Coalesce rows.
if (src_stride_argb == width * 4 &&
dst_stride_argb == width * 4) {
- return ARGBGrayTo(src_argb, 0,
- dst_argb, 0,
- width * height, 1);
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_argb = 0;
}
void (*ARGBGrayRow)(const uint8* src_argb, uint8* dst_argb,
int width) = ARGBGrayRow_C;
@@ -1287,11 +1270,11 @@ int ARGBGray(uint8* dst_argb, int dst_stride_argb,
if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0) {
return -1;
}
- // Coalesce contiguous rows.
+ // Coalesce rows.
if (dst_stride_argb == width * 4) {
- return ARGBGray(dst_argb, dst_stride_argb,
- dst_x, dst_y,
- width * height, 1);
+ width *= height;
+ height = 1;
+ dst_stride_argb = 0;
}
void (*ARGBGrayRow)(const uint8* src_argb, uint8* dst_argb,
int width) = ARGBGrayRow_C;
@@ -1320,11 +1303,11 @@ int ARGBSepia(uint8* dst_argb, int dst_stride_argb,
if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0) {
return -1;
}
- // Coalesce contiguous rows.
+ // Coalesce rows.
if (dst_stride_argb == width * 4) {
- return ARGBSepia(dst_argb, dst_stride_argb,
- dst_x, dst_y,
- width * height, 1);
+ width *= height;
+ height = 1;
+ dst_stride_argb = 0;
}
void (*ARGBSepiaRow)(uint8* dst_argb, int width) = ARGBSepiaRow_C;
#if defined(HAS_ARGBSEPIAROW_SSSE3)
@@ -1345,24 +1328,30 @@ int ARGBSepia(uint8* dst_argb, int dst_stride_argb,
return 0;
}
-// Apply a 4x3 matrix rotation to each ARGB pixel.
+// Apply a 4x4 matrix to each ARGB pixel.
+// Note: Normally for shading, but can be used to swizzle or invert.
LIBYUV_API
-int ARGBColorMatrix(uint8* dst_argb, int dst_stride_argb,
+int ARGBColorMatrix(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
const int8* matrix_argb,
- int dst_x, int dst_y, int width, int height) {
- if (!dst_argb || !matrix_argb || width <= 0 || height <= 0 ||
- dst_x < 0 || dst_y < 0) {
+ int width, int height) {
+ if (!src_argb || !dst_argb || !matrix_argb || width <= 0 || height == 0) {
return -1;
}
- // Coalesce contiguous rows.
- if (dst_stride_argb == width * 4) {
- return ARGBColorMatrix(dst_argb, dst_stride_argb,
- matrix_argb,
- dst_x, dst_y,
- width * height, 1);
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_argb == width * 4 &&
+ dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_argb = 0;
}
- void (*ARGBColorMatrixRow)(uint8* dst_argb, const int8* matrix_argb,
- int width) = ARGBColorMatrixRow_C;
+ void (*ARGBColorMatrixRow)(const uint8* src_argb, uint8* dst_argb,
+ const int8* matrix_argb, int width) = ARGBColorMatrixRow_C;
#if defined(HAS_ARGBCOLORMATRIXROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
@@ -1373,14 +1362,48 @@ int ARGBColorMatrix(uint8* dst_argb, int dst_stride_argb,
ARGBColorMatrixRow = ARGBColorMatrixRow_NEON;
}
#endif
- uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
for (int y = 0; y < height; ++y) {
- ARGBColorMatrixRow(dst, matrix_argb, width);
- dst += dst_stride_argb;
+ ARGBColorMatrixRow(src_argb, dst_argb, matrix_argb, width);
+ src_argb += src_stride_argb;
+ dst_argb += dst_stride_argb;
}
return 0;
}
+// Apply a 4x3 matrix to each ARGB pixel.
+// Deprecated.
+LIBYUV_API SAFEBUFFERS
+int RGBColorMatrix(uint8* dst_argb, int dst_stride_argb,
+ const int8* matrix_rgb,
+ int dst_x, int dst_y, int width, int height) {
+ if (!dst_argb || !matrix_rgb || width <= 0 || height <= 0 ||
+ dst_x < 0 || dst_y < 0) {
+ return -1;
+ }
+
+ // Convert 4x3 7 bit matrix to 4x4 6 bit matrix.
+ SIMD_ALIGNED(int8 matrix_argb[16]);
+ matrix_argb[0] = matrix_rgb[0] / 2;
+ matrix_argb[1] = matrix_rgb[1] / 2;
+ matrix_argb[2] = matrix_rgb[2] / 2;
+ matrix_argb[3] = matrix_rgb[3] / 2;
+ matrix_argb[4] = matrix_rgb[4] / 2;
+ matrix_argb[5] = matrix_rgb[5] / 2;
+ matrix_argb[6] = matrix_rgb[6] / 2;
+ matrix_argb[7] = matrix_rgb[7] / 2;
+ matrix_argb[8] = matrix_rgb[8] / 2;
+ matrix_argb[9] = matrix_rgb[9] / 2;
+ matrix_argb[10] = matrix_rgb[10] / 2;
+ matrix_argb[11] = matrix_rgb[11] / 2;
+ matrix_argb[14] = matrix_argb[13] = matrix_argb[12] = 0;
+ matrix_argb[15] = 64; // 1.0
+
+ uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
+ return ARGBColorMatrix(const_cast<const uint8*>(dst), dst_stride_argb,
+ dst, dst_stride_argb,
+ &matrix_argb[0], width, height);
+}
+
// Apply a color table to each ARGB pixel.
// Table contains 256 ARGB values.
LIBYUV_API
@@ -1391,12 +1414,11 @@ int ARGBColorTable(uint8* dst_argb, int dst_stride_argb,
dst_x < 0 || dst_y < 0) {
return -1;
}
- // Coalesce contiguous rows.
+ // Coalesce rows.
if (dst_stride_argb == width * 4) {
- return ARGBColorTable(dst_argb, dst_stride_argb,
- table_argb,
- dst_x, dst_y,
- width * height, 1);
+ width *= height;
+ height = 1;
+ dst_stride_argb = 0;
}
void (*ARGBColorTableRow)(uint8* dst_argb, const uint8* table_argb,
int width) = ARGBColorTableRow_C;
@@ -1413,6 +1435,37 @@ int ARGBColorTable(uint8* dst_argb, int dst_stride_argb,
return 0;
}
+// Apply a color table to each ARGB pixel but preserve destination alpha.
+// Table contains 256 ARGB values.
+LIBYUV_API
+int RGBColorTable(uint8* dst_argb, int dst_stride_argb,
+ const uint8* table_argb,
+ int dst_x, int dst_y, int width, int height) {
+ if (!dst_argb || !table_argb || width <= 0 || height <= 0 ||
+ dst_x < 0 || dst_y < 0) {
+ return -1;
+ }
+ // Coalesce rows.
+ if (dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ dst_stride_argb = 0;
+ }
+ void (*RGBColorTableRow)(uint8* dst_argb, const uint8* table_argb,
+ int width) = RGBColorTableRow_C;
+#if defined(HAS_RGBCOLORTABLEROW_X86)
+ if (TestCpuFlag(kCpuHasX86)) {
+ RGBColorTableRow = RGBColorTableRow_X86;
+ }
+#endif
+ uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
+ for (int y = 0; y < height; ++y) {
+ RGBColorTableRow(dst, table_argb, width);
+ dst += dst_stride_argb;
+ }
+ return 0;
+}
+
// ARGBQuantize is used to posterize art.
// e.g. rgb / qvalue * qvalue + qvalue / 2
// But the low levels implement efficiently with 3 parameters, and could be
@@ -1430,12 +1483,11 @@ int ARGBQuantize(uint8* dst_argb, int dst_stride_argb,
interval_size < 1 || interval_size > 255) {
return -1;
}
- // Coalesce contiguous rows.
+ // Coalesce rows.
if (dst_stride_argb == width * 4) {
- return ARGBQuantize(dst_argb, dst_stride_argb,
- scale, interval_size, interval_offset,
- dst_x, dst_y,
- width * height, 1);
+ width *= height;
+ height = 1;
+ dst_stride_argb = 0;
}
void (*ARGBQuantizeRow)(uint8* dst_argb, int scale, int interval_size,
int interval_offset, int width) = ARGBQuantizeRow_C;
@@ -1496,14 +1548,28 @@ int ARGBBlur(const uint8* src_argb, int src_stride_argb,
if (!src_argb || !dst_argb || width <= 0 || height == 0) {
return -1;
}
- void (*ComputeCumulativeSumRow)(const uint8* row, int32* cumsum,
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ if (radius > height) {
+ radius = height;
+ }
+ if (radius > (width / 2 - 1)) {
+ radius = width / 2 - 1;
+ }
+ if (radius <= 0) {
+ return -1;
+ }
+ void (*ComputeCumulativeSumRow)(const uint8 *row, int32 *cumsum,
const int32* previous_cumsum, int width) = ComputeCumulativeSumRow_C;
- void (*CUMULATIVESUMTOAVERAGEROW)(const int32* topleft, const int32* botleft,
+ void (*CumulativeSumToAverageRow)(const int32* topleft, const int32* botleft,
int width, int area, uint8* dst, int count) = CumulativeSumToAverageRow_C;
#if defined(HAS_CUMULATIVESUMTOAVERAGEROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
ComputeCumulativeSumRow = ComputeCumulativeSumRow_SSE2;
- CUMULATIVESUMTOAVERAGEROW = CumulativeSumToAverageRow_SSE2;
+ CumulativeSumToAverageRow = CumulativeSumToAverageRow_SSE2;
}
#endif
// Compute enough CumulativeSum for first row to be blurred. After this
@@ -1548,24 +1614,24 @@ int ARGBBlur(const uint8* src_argb, int src_stride_argb,
int boxwidth = radius * 4;
int x;
for (x = 0; x < radius + 1; ++x) {
- CUMULATIVESUMTOAVERAGEROW(cumsum_top_row, cumsum_bot_row,
- boxwidth, area, &dst_argb[x * 4], 1);
+ CumulativeSumToAverageRow(cumsum_top_row, cumsum_bot_row,
+ boxwidth, area, &dst_argb[x * 4], 1);
area += (bot_y - top_y);
boxwidth += 4;
}
// Middle unclipped.
int n = (width - 1) - radius - x + 1;
- CUMULATIVESUMTOAVERAGEROW(cumsum_top_row, cumsum_bot_row,
- boxwidth, area, &dst_argb[x * 4], n);
+ CumulativeSumToAverageRow(cumsum_top_row, cumsum_bot_row,
+ boxwidth, area, &dst_argb[x * 4], n);
// Right clipped.
for (x += n; x <= width - 1; ++x) {
area -= (bot_y - top_y);
boxwidth -= 4;
- CUMULATIVESUMTOAVERAGEROW(cumsum_top_row + (x - radius - 1) * 4,
- cumsum_bot_row + (x - radius - 1) * 4,
- boxwidth, area, &dst_argb[x * 4], 1);
+ CumulativeSumToAverageRow(cumsum_top_row + (x - radius - 1) * 4,
+ cumsum_bot_row + (x - radius - 1) * 4,
+ boxwidth, area, &dst_argb[x * 4], 1);
}
dst_argb += dst_stride_argb;
}
@@ -1585,13 +1651,12 @@ int ARGBShade(const uint8* src_argb, int src_stride_argb,
src_argb = src_argb + (height - 1) * src_stride_argb;
src_stride_argb = -src_stride_argb;
}
- // Coalesce contiguous rows.
+ // Coalesce rows.
if (src_stride_argb == width * 4 &&
dst_stride_argb == width * 4) {
- return ARGBShade(src_argb, 0,
- dst_argb, 0,
- width * height, 1,
- value);
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_argb = 0;
}
void (*ARGBShadeRow)(const uint8* src_argb, uint8* dst_argb,
int width, uint32 value) = ARGBShadeRow_C;
@@ -1616,8 +1681,6 @@ int ARGBShade(const uint8* src_argb, int src_stride_argb,
}
// Interpolate 2 ARGB images by specified amount (0 to 255).
-// TODO(fbarchard): Consider selecting a specialization for interpolation so
-// row function doesn't need to check interpolation on each row.
LIBYUV_API
int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0,
const uint8* src_argb1, int src_stride_argb1,
@@ -1632,15 +1695,13 @@ int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0,
dst_argb = dst_argb + (height - 1) * dst_stride_argb;
dst_stride_argb = -dst_stride_argb;
}
- // Coalesce contiguous rows.
+ // Coalesce rows.
if (src_stride_argb0 == width * 4 &&
src_stride_argb1 == width * 4 &&
dst_stride_argb == width * 4) {
- return ARGBInterpolate(src_argb0, 0,
- src_argb1, 0,
- dst_argb, 0,
- width * height, 1,
- interpolation);
+ width *= height;
+ height = 1;
+ src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0;
}
void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr,
ptrdiff_t src_stride, int dst_width,
@@ -1671,6 +1732,14 @@ int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0,
}
}
#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2) && width >= 8) {
+ InterpolateRow = InterpolateRow_Any_AVX2;
+ if (IS_ALIGNED(width, 8)) {
+ InterpolateRow = InterpolateRow_AVX2;
+ }
+ }
+#endif
#if defined(HAS_INTERPOLATEROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 4) {
InterpolateRow = InterpolateRow_Any_NEON;
@@ -1713,16 +1782,23 @@ int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra,
src_bgra = src_bgra + (height - 1) * src_stride_bgra;
src_stride_bgra = -src_stride_bgra;
}
- // Coalesce contiguous rows.
+ // Coalesce rows.
if (src_stride_bgra == width * 4 &&
dst_stride_argb == width * 4) {
- return ARGBShuffle(src_bgra, 0,
- dst_argb, 0,
- shuffler,
- width * height, 1);
+ width *= height;
+ height = 1;
+ src_stride_bgra = dst_stride_argb = 0;
}
void (*ARGBShuffleRow)(const uint8* src_bgra, uint8* dst_argb,
const uint8* shuffler, int pix) = ARGBShuffleRow_C;
+#if defined(HAS_ARGBSHUFFLEROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && width >= 4) {
+ ARGBShuffleRow = ARGBShuffleRow_Any_SSE2;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBShuffleRow = ARGBShuffleRow_SSE2;
+ }
+ }
+#endif
#if defined(HAS_ARGBSHUFFLEROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
ARGBShuffleRow = ARGBShuffleRow_Any_SSSE3;
@@ -1761,12 +1837,17 @@ int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra,
}
// Sobel ARGB effect.
-LIBYUV_API
-int ARGBSobel(const uint8* src_argb, int src_stride_argb,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height) {
+static SAFEBUFFERS
+int ARGBSobelize(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height,
+ void (*SobelRow)(const uint8* src_sobelx,
+ const uint8* src_sobely,
+ uint8* dst, int width)) {
+ const int kMaxRow = kMaxStride / 4;
+ const int kEdge = 16; // Extra pixels at start of row for extrude/align.
if (!src_argb || !dst_argb ||
- width <= 0 || height == 0 || width > (kMaxStride / 4)) {
+ width <= 0 || height == 0 || width > (kMaxRow - kEdge)) {
return -1;
}
// Negative height means invert the image.
@@ -1777,7 +1858,16 @@ int ARGBSobel(const uint8* src_argb, int src_stride_argb,
}
// ARGBToBayer used to select G channel from ARGB.
void (*ARGBToBayerRow)(const uint8* src_argb, uint8* dst_bayer,
- uint32 selector, int pix) = ARGBToBayerRow_C;
+ uint32 selector, int pix) = ARGBToBayerGGRow_C;
+#if defined(HAS_ARGBTOBAYERGGROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && width >= 8 &&
+ IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
+ ARGBToBayerRow = ARGBToBayerGGRow_Any_SSE2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToBayerRow = ARGBToBayerGGRow_SSE2;
+ }
+ }
+#endif
#if defined(HAS_ARGBTOBAYERROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 8 &&
IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
@@ -1786,19 +1876,20 @@ int ARGBSobel(const uint8* src_argb, int src_stride_argb,
ARGBToBayerRow = ARGBToBayerRow_SSSE3;
}
}
-#elif defined(HAS_ARGBTOBAYERROW_NEON)
+#endif
+#if defined(HAS_ARGBTOBAYERGGROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
- ARGBToBayerRow = ARGBToBayerRow_Any_NEON;
+ ARGBToBayerRow = ARGBToBayerGGRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
- ARGBToBayerRow = ARGBToBayerRow_NEON;
+ ARGBToBayerRow = ARGBToBayerGGRow_NEON;
}
}
#endif
void (*SobelYRow)(const uint8* src_y0, const uint8* src_y1,
uint8* dst_sobely, int width) = SobelYRow_C;
-#if defined(HAS_SOBELYROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- SobelYRow = SobelYRow_SSSE3;
+#if defined(HAS_SOBELYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ SobelYRow = SobelYRow_SSE2;
}
#endif
#if defined(HAS_SOBELYROW_NEON)
@@ -1809,9 +1900,9 @@ int ARGBSobel(const uint8* src_argb, int src_stride_argb,
void (*SobelXRow)(const uint8* src_y0, const uint8* src_y1,
const uint8* src_y2, uint8* dst_sobely, int width) =
SobelXRow_C;
-#if defined(HAS_SOBELXROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- SobelXRow = SobelXRow_SSSE3;
+#if defined(HAS_SOBELXROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ SobelXRow = SobelXRow_SSE2;
}
#endif
#if defined(HAS_SOBELXROW_NEON)
@@ -1819,35 +1910,22 @@ int ARGBSobel(const uint8* src_argb, int src_stride_argb,
SobelXRow = SobelXRow_NEON;
}
#endif
- void (*SobelRow)(const uint8* src_sobelx, const uint8* src_sobely,
- uint8* dst_argb, int width) = SobelRow_C;
-#if defined(HAS_SOBELROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- SobelRow = SobelRow_SSE2;
- }
-#endif
-#if defined(HAS_SOBELROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
- SobelRow = SobelRow_NEON;
- }
-#endif
-
- const int kEdge = 16; // Extra pixels at start of row for extrude/align.
- SIMD_ALIGNED(uint8 row_y[(kMaxStride / 4 + kEdge) * 3 + kEdge]);
- SIMD_ALIGNED(uint8 row_sobelx[kMaxStride / 4]);
- SIMD_ALIGNED(uint8 row_sobely[kMaxStride / 4]);
+ // 3 rows with edges before/after.
+ SIMD_ALIGNED(uint8 row_y[kEdge + kMaxRow * 3]);
+ SIMD_ALIGNED(uint8 row_sobelx[kMaxRow]);
+ SIMD_ALIGNED(uint8 row_sobely[kMaxRow]);
// Convert first row.
uint8* row_y0 = row_y + kEdge;
- uint8* row_y1 = row_y0 + kMaxStride / 4;
- uint8* row_y2 = row_y1 + kMaxStride / 4;
+ uint8* row_y1 = row_y0 + kMaxRow;
+ uint8* row_y2 = row_y1 + kMaxRow;
ARGBToBayerRow(src_argb, row_y0, 0x0d090501, width);
row_y0[-1] = row_y0[0];
- row_y0[width] = row_y0[width - 1];
+ memset(row_y0 + width, row_y0[width - 1], 16); // extrude 16 pixels.
ARGBToBayerRow(src_argb, row_y1, 0x0d090501, width);
row_y1[-1] = row_y1[0];
- row_y1[width] = row_y1[width - 1];
+ memset(row_y1 + width, row_y1[width - 1], 16);
+ memset(row_y2 + width, 0, 16);
for (int y = 0; y < height; ++y) {
// Convert next row of ARGB to Y.
@@ -1873,14 +1951,80 @@ int ARGBSobel(const uint8* src_argb, int src_stride_argb,
return 0;
}
+// Sobel ARGB effect.
+LIBYUV_API
+int ARGBSobel(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ void (*SobelRow)(const uint8* src_sobelx, const uint8* src_sobely,
+ uint8* dst_argb, int width) = SobelRow_C;
+#if defined(HAS_SOBELROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) &&
+ IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ SobelRow = SobelRow_SSE2;
+ }
+#endif
+#if defined(HAS_SOBELROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
+ SobelRow = SobelRow_NEON;
+ }
+#endif
+ return ARGBSobelize(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
+ width, height, SobelRow);
+}
+
+// Sobel ARGB effect with planar output.
+LIBYUV_API
+int ARGBSobelToPlane(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_y, int dst_stride_y,
+ int width, int height) {
+ void (*SobelToPlaneRow)(const uint8* src_sobelx, const uint8* src_sobely,
+ uint8* dst_, int width) = SobelToPlaneRow_C;
+#if defined(HAS_SOBELTOPLANEROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) &&
+ IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+ SobelToPlaneRow = SobelToPlaneRow_SSE2;
+ }
+#endif
+#if defined(HAS_SOBELTOPLANEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) {
+ SobelToPlaneRow = SobelToPlaneRow_NEON;
+ }
+#endif
+ return ARGBSobelize(src_argb, src_stride_argb, dst_y, dst_stride_y,
+ width, height, SobelToPlaneRow);
+}
+
// SobelXY ARGB effect.
// Similar to Sobel, but also stores Sobel X in R and Sobel Y in B. G = Sobel.
LIBYUV_API
int ARGBSobelXY(const uint8* src_argb, int src_stride_argb,
uint8* dst_argb, int dst_stride_argb,
int width, int height) {
- if (!src_argb || !dst_argb ||
- width <= 0 || height == 0 || width > kMaxStride / 4) {
+ void (*SobelXYRow)(const uint8* src_sobelx, const uint8* src_sobely,
+ uint8* dst_argb, int width) = SobelXYRow_C;
+#if defined(HAS_SOBELXYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) &&
+ IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ SobelXYRow = SobelXYRow_SSE2;
+ }
+#endif
+#if defined(HAS_SOBELXYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
+ SobelXYRow = SobelXYRow_NEON;
+ }
+#endif
+ return ARGBSobelize(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
+ width, height, SobelXYRow);
+}
+
+// Apply a 4x4 polynomial to each ARGB pixel.
+LIBYUV_API
+int ARGBPolynomial(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ const float* poly,
+ int width, int height) {
+ if (!src_argb || !dst_argb || !poly || width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
@@ -1889,99 +2033,156 @@ int ARGBSobelXY(const uint8* src_argb, int src_stride_argb,
src_argb = src_argb + (height - 1) * src_stride_argb;
src_stride_argb = -src_stride_argb;
}
- // ARGBToBayer used to select G channel from ARGB.
- void (*ARGBToBayerRow)(const uint8* src_argb, uint8* dst_bayer,
- uint32 selector, int pix) = ARGBToBayerRow_C;
-#if defined(HAS_ARGBTOBAYERROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 8 &&
- IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
- ARGBToBayerRow = ARGBToBayerRow_Any_SSSE3;
- if (IS_ALIGNED(width, 8)) {
- ARGBToBayerRow = ARGBToBayerRow_SSSE3;
- }
+ // Coalesce rows.
+ if (src_stride_argb == width * 4 &&
+ dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_argb = 0;
}
-#elif defined(HAS_ARGBTOBAYERROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
- ARGBToBayerRow = ARGBToBayerRow_Any_NEON;
- if (IS_ALIGNED(width, 8)) {
- ARGBToBayerRow = ARGBToBayerRow_NEON;
- }
+ void (*ARGBPolynomialRow)(const uint8* src_argb,
+ uint8* dst_argb, const float* poly,
+ int width) = ARGBPolynomialRow_C;
+#if defined(HAS_ARGBPOLYNOMIALROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 2)) {
+ ARGBPolynomialRow = ARGBPolynomialRow_SSE2;
}
#endif
- void (*SobelYRow)(const uint8* src_y0, const uint8* src_y1,
- uint8* dst_sobely, int width) = SobelYRow_C;
-#if defined(HAS_SOBELYROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- SobelYRow = SobelYRow_SSSE3;
+#if defined(HAS_ARGBPOLYNOMIALROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2) && TestCpuFlag(kCpuHasFMA3) &&
+ IS_ALIGNED(width, 2)) {
+ ARGBPolynomialRow = ARGBPolynomialRow_AVX2;
}
#endif
-#if defined(HAS_SOBELYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- SobelYRow = SobelYRow_NEON;
+ for (int y = 0; y < height; ++y) {
+ ARGBPolynomialRow(src_argb, dst_argb, poly, width);
+ src_argb += src_stride_argb;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
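// ARGBPolynomialRow_C further down in this diff evaluates the 16 floats as
// four per-channel cubics: poly[0..3] are the constant terms for B, G, R, A,
// poly[4..7] the linear terms, poly[8..11] the squared terms and
// poly[12..15] the cubed terms. A minimal sketch (include path assumed) that
// applies an identity polynomial, leaving every pixel unchanged:
#include "libyuv/planar_functions.h"

static int IdentityPolynomialSketch(const uint8* src_argb, uint8* dst_argb,
                                    int width, int height) {
  static const float kIdentityPoly[16] = {
      0.f, 0.f, 0.f, 0.f,  // constant term: B G R A
      1.f, 1.f, 1.f, 1.f,  // linear term
      0.f, 0.f, 0.f, 0.f,  // squared term
      0.f, 0.f, 0.f, 0.f,  // cubed term
  };
  return libyuv::ARGBPolynomial(src_argb, width * 4, dst_argb, width * 4,
                                kIdentityPoly, width, height);
}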
+// Apply a luma color table to each ARGB pixel.
+LIBYUV_API
+int ARGBLumaColorTable(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ const uint8* luma,
+ int width, int height) {
+ if (!src_argb || !dst_argb || !luma || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_argb == width * 4 &&
+ dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_argb = 0;
+ }
+ void (*ARGBLumaColorTableRow)(const uint8* src_argb, uint8* dst_argb,
+ int width, const uint8* luma, const uint32 lumacoeff) =
+ ARGBLumaColorTableRow_C;
+#if defined(HAS_ARGBLUMACOLORTABLEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4)) {
+ ARGBLumaColorTableRow = ARGBLumaColorTableRow_SSSE3;
}
#endif
- void (*SobelXRow)(const uint8* src_y0, const uint8* src_y1,
- const uint8* src_y2, uint8* dst_sobely, int width) =
- SobelXRow_C;
-#if defined(HAS_SOBELXROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- SobelXRow = SobelXRow_SSSE3;
+ for (int y = 0; y < height; ++y) {
+ ARGBLumaColorTableRow(src_argb, dst_argb, width, luma, 0x00264b0f);
+ src_argb += src_stride_argb;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Copy Alpha from one ARGB image to another.
+LIBYUV_API
+int ARGBCopyAlpha(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ if (!src_argb || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_argb == width * 4 &&
+ dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_argb = 0;
+ }
+ void (*ARGBCopyAlphaRow)(const uint8* src_argb, uint8* dst_argb, int width) =
+ ARGBCopyAlphaRow_C;
+#if defined(HAS_ARGBCOPYALPHAROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) &&
+ IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
+ IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16) &&
+ IS_ALIGNED(width, 8)) {
+ ARGBCopyAlphaRow = ARGBCopyAlphaRow_SSE2;
}
#endif
-#if defined(HAS_SOBELXROW_NEON)
- if (TestCpuFlag(kCpuHasNEON)) {
- SobelXRow = SobelXRow_NEON;
+#if defined(HAS_ARGBCOPYALPHAROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 16)) {
+ ARGBCopyAlphaRow = ARGBCopyAlphaRow_AVX2;
}
#endif
- void (*SobelXYRow)(const uint8* src_sobelx, const uint8* src_sobely,
- uint8* dst_argb, int width) = SobelXYRow_C;
-#if defined(HAS_SOBELXYROW_SSE2)
- if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) &&
- IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
- SobelXYRow = SobelXYRow_SSE2;
+ for (int y = 0; y < height; ++y) {
+ ARGBCopyAlphaRow(src_argb, dst_argb, width);
+ src_argb += src_stride_argb;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Copy a planar Y channel to the alpha channel of a destination ARGB image.
+LIBYUV_API
+int ARGBCopyYToAlpha(const uint8* src_y, int src_stride_y,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ if (!src_y || !dst_argb || width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_stride_y = -src_stride_y;
+ }
+ // Coalesce rows.
+ if (src_stride_y == width &&
+ dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_y = dst_stride_argb = 0;
+ }
+ void (*ARGBCopyYToAlphaRow)(const uint8* src_y, uint8* dst_argb, int width) =
+ ARGBCopyYToAlphaRow_C;
+#if defined(HAS_ARGBCOPYYTOALPHAROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) &&
+ IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
+ IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16) &&
+ IS_ALIGNED(width, 8)) {
+ ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_SSE2;
}
#endif
-#if defined(HAS_SOBELXYROW_NEON)
- if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
- SobelXYRow = SobelXYRow_NEON;
+#if defined(HAS_ARGBCOPYYTOALPHAROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 16)) {
+ ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_AVX2;
}
#endif
-
- const int kEdge = 16; // Extra pixels at start of row for extrude/align.
- SIMD_ALIGNED(uint8 row_y[(kMaxStride / 4 + kEdge) * 3 + kEdge]);
- SIMD_ALIGNED(uint8 row_sobelx[kMaxStride / 4]);
- SIMD_ALIGNED(uint8 row_sobely[kMaxStride / 4]);
-
- // Convert first row.
- uint8* row_y0 = row_y + kEdge;
- uint8* row_y1 = row_y0 + kMaxStride / 4;
- uint8* row_y2 = row_y1 + kMaxStride / 4;
- ARGBToBayerRow(src_argb, row_y0, 0x0d090501, width);
- row_y0[-1] = row_y0[0];
- row_y0[width] = row_y0[width - 1];
- ARGBToBayerRow(src_argb, row_y1, 0x0d090501, width);
- row_y1[-1] = row_y1[0];
- row_y1[width] = row_y1[width - 1];
-
for (int y = 0; y < height; ++y) {
- // Convert next row of ARGB to Y.
- if (y < (height - 1)) {
- src_argb += src_stride_argb;
- }
- ARGBToBayerRow(src_argb, row_y2, 0x0d090501, width);
- row_y2[-1] = row_y2[0];
- row_y2[width] = row_y2[width - 1];
-
- SobelXRow(row_y0 - 1, row_y1 - 1, row_y2 - 1, row_sobelx, width);
- SobelYRow(row_y0 - 1, row_y2 - 1, row_sobely, width);
- SobelXYRow(row_sobelx, row_sobely, dst_argb, width);
-
- // Cycle thru circular queue of 3 row_y buffers.
- uint8* row_yt = row_y0;
- row_y0 = row_y1;
- row_y1 = row_y2;
- row_y2 = row_yt;
-
+ ARGBCopyYToAlphaRow(src_y, dst_argb, width);
+ src_y += src_stride_y;
dst_argb += dst_stride_argb;
}
return 0;
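// A minimal sketch for the two alpha helpers above (include path and buffer
// names assumed): stamp an 8-bit mask plane into the A channel of one ARGB
// frame, then copy just that alpha channel into a second ARGB frame.
#include "libyuv/planar_functions.h"

static int MaskToAlphaSketch(const uint8* mask, uint8* argb0, uint8* argb1,
                             int width, int height) {
  int r = libyuv::ARGBCopyYToAlpha(mask, width, argb0, width * 4,
                                   width, height);
  if (r != 0) {
    return r;
  }
  return libyuv::ARGBCopyAlpha(argb0, width * 4, argb1, width * 4,
                               width, height);
}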
diff --git a/chromium/third_party/libyuv/source/rotate.cc b/chromium/third_party/libyuv/source/rotate.cc
index c46650b4458..b99cde10891 100644
--- a/chromium/third_party/libyuv/source/rotate.cc
+++ b/chromium/third_party/libyuv/source/rotate.cc
@@ -41,7 +41,7 @@ extern "C" {
#endif
#endif
-#if !defined(LIBYUV_DISABLE_NEON) && \
+#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
(defined(__ARM_NEON__) || defined(LIBYUV_NEON))
#define HAS_MIRRORROW_NEON
void MirrorRow_NEON(const uint8* src, uint8* dst, int width);
@@ -57,7 +57,8 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride,
int width);
#endif // defined(__ARM_NEON__)
-#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips__) && \
+#if !defined(LIBYUV_DISABLE_MIPS) && !defined(__native_client__) && \
+ defined(__mips__) && \
defined(__mips_dsp) && (__mips_dsp_rev >= 2)
#define HAS_TRANSPOSE_WX8_MIPS_DSPR2
void TransposeWx8_MIPS_DSPR2(const uint8* src, int src_stride,
@@ -72,7 +73,8 @@ void TransposeUVWx8_MIPS_DSPR2(const uint8* src, int src_stride,
int width);
#endif // defined(__mips__)
-#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
+#if !defined(LIBYUV_DISABLE_X86) && \
+ defined(_M_IX86) && defined(_MSC_VER)
#define HAS_TRANSPOSE_WX8_SSSE3
__declspec(naked) __declspec(align(16))
static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
@@ -89,7 +91,7 @@ static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
// Read in the data from the source pointer.
// First round of bit swap.
- align 16
+ align 4
convertloop:
movq xmm0, qword ptr [eax]
lea ebp, [eax + 8]
@@ -188,7 +190,7 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
mov [esp + 16], ecx
mov ecx, [ecx + 16 + 28] // w
- align 16
+ align 4
convertloop:
// Read in the data from the source pointer.
// First round of bit swap.
@@ -294,14 +296,15 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
ret
}
}
-#elif !defined(LIBYUV_DISABLE_X86) && (defined(__i386__) || defined(__x86_64__))
+#elif !defined(LIBYUV_DISABLE_X86) && \
+ (defined(__i386__) || (defined(__x86_64__) && !defined(__native_client__)))
#define HAS_TRANSPOSE_WX8_SSSE3
static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width) {
asm volatile (
// Read in the data from the source pointer.
// First round of bit swap.
- ".p2align 4 \n"
+ ".p2align 2 \n"
"1: \n"
"movq (%0),%%xmm0 \n"
"movq (%0,%3),%%xmm1 \n"
@@ -383,7 +386,7 @@ static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
);
}
-#if !defined(LIBYUV_DISABLE_X86) && defined (__i386__)
+#if !defined(LIBYUV_DISABLE_X86) && defined(__i386__)
#define HAS_TRANSPOSE_UVWX8_SSE2
extern "C" void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
@@ -503,9 +506,16 @@ extern "C" void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
"pop %edi \n"
"pop %esi \n"
"pop %ebx \n"
+#if defined(__native_client__)
+ "pop %ecx \n"
+ "and $0xffffffe0,%ecx \n"
+ "jmp *%ecx \n"
+#else
"ret \n"
+#endif
);
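// Under Native Client the plain "ret" above is replaced by popping the
// return address, masking it down to a 32-byte bundle boundary and jumping
// through the register: the NaCl sandbox requires indirect branch targets to
// be 32-byte aligned, and a bare ret (an unverifiable indirect branch) is
// rejected by the validator.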
-#elif !defined(LIBYUV_DISABLE_X86) && defined(__x86_64__)
+#elif !defined(LIBYUV_DISABLE_X86) && !defined(__native_client__) && \
+ defined(__x86_64__)
// 64 bit version has enough registers to do 16x8 to 8x16 at a time.
#define HAS_TRANSPOSE_WX8_FAST_SSSE3
static void TransposeWx8_FAST_SSSE3(const uint8* src, int src_stride,
@@ -513,7 +523,7 @@ static void TransposeWx8_FAST_SSSE3(const uint8* src, int src_stride,
asm volatile (
// Read in the data from the source pointer.
// First round of bit swap.
- ".p2align 4 \n"
+ ".p2align 2 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa (%0,%3),%%xmm1 \n"
@@ -654,7 +664,7 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
asm volatile (
// Read in the data from the source pointer.
// First round of bit swap.
- ".p2align 4 \n"
+ ".p2align 2 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa (%0,%4),%%xmm1 \n"
@@ -857,7 +867,7 @@ void RotatePlane270(const uint8* src, int src_stride,
TransposePlane(src, src_stride, dst, dst_stride, width, height);
}
-LIBYUV_API
+LIBYUV_API SAFEBUFFERS
void RotatePlane180(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
int width, int height) {
diff --git a/chromium/third_party/libyuv/source/rotate_argb.cc b/chromium/third_party/libyuv/source/rotate_argb.cc
index 5fa0d7ea798..b95512783a0 100644
--- a/chromium/third_party/libyuv/source/rotate_argb.cc
+++ b/chromium/third_party/libyuv/source/rotate_argb.cc
@@ -22,14 +22,15 @@ extern "C" {
// ARGBScale has a function to copy pixels to a row, striding each source
// pixel by a constant.
-#if !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || \
- defined(__x86_64__) || defined(__i386__))
+#if !defined(LIBYUV_DISABLE_X86) && \
+ (defined(_M_IX86) || \
+ (defined(__x86_64__) && !defined(__native_client__)) || defined(__i386__))
#define HAS_SCALEARGBROWDOWNEVEN_SSE2
void ScaleARGBRowDownEven_SSE2(const uint8* src_ptr, int src_stride,
int src_stepx,
uint8* dst_ptr, int dst_width);
#endif
-#if !defined(LIBYUV_DISABLE_NEON) && \
+#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
(defined(__ARM_NEON__) || defined(LIBYUV_NEON))
#define HAS_SCALEARGBROWDOWNEVEN_NEON
void ScaleARGBRowDownEven_NEON(const uint8* src_ptr, int src_stride,
@@ -88,6 +89,7 @@ void ARGBRotate270(const uint8* src, int src_stride,
ARGBTranspose(src, src_stride, dst, dst_stride, width, height);
}
+SAFEBUFFERS
void ARGBRotate180(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
int width, int height) {
diff --git a/chromium/third_party/libyuv/source/rotate_neon.cc b/chromium/third_party/libyuv/source/rotate_neon.cc
index ab07c169703..a59c4d5fde6 100644
--- a/chromium/third_party/libyuv/source/rotate_neon.cc
+++ b/chromium/third_party/libyuv/source/rotate_neon.cc
@@ -18,7 +18,7 @@ extern "C" {
#endif
#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)
-static const uvec8 kVTbl4x4Transpose =
+static uvec8 kVTbl4x4Transpose =
{ 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };
void TransposeWx8_NEON(const uint8* src, int src_stride,
@@ -31,7 +31,7 @@ void TransposeWx8_NEON(const uint8* src, int src_stride,
"sub %4, #8 \n"
// handle 8x8 blocks. this should be the majority of the plane
- ".p2align 4 \n"
+ ".p2align 2 \n"
"1: \n"
"mov r9, %0 \n"
@@ -184,7 +184,7 @@ void TransposeWx8_NEON(const uint8* src, int src_stride,
);
}
-static const uvec8 kVTbl4x4TransposeDi =
+static uvec8 kVTbl4x4TransposeDi =
{ 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 };
void TransposeUVWx8_NEON(const uint8* src, int src_stride,
@@ -198,7 +198,7 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride,
"sub %6, #8 \n"
// handle 8x8 blocks. this should be the majority of the plane
- ".p2align 4 \n"
+ ".p2align 2 \n"
"1: \n"
"mov r9, %0 \n"
diff --git a/chromium/third_party/libyuv/source/row_any.cc b/chromium/third_party/libyuv/source/row_any.cc
index 72100d90e9d..90c6a3ff5f8 100644
--- a/chromium/third_party/libyuv/source/row_any.cc
+++ b/chromium/third_party/libyuv/source/row_any.cc
@@ -137,8 +137,12 @@ RGBANY(ARGBToARGB1555Row_Any_SSE2, ARGBToARGB1555Row_SSE2, ARGBToARGB1555Row_C,
3, 4, 2)
RGBANY(ARGBToARGB4444Row_Any_SSE2, ARGBToARGB4444Row_SSE2, ARGBToARGB4444Row_C,
3, 4, 2)
+#endif
+#if defined(HAS_I400TOARGBROW_SSE2)
RGBANY(I400ToARGBRow_Any_SSE2, I400ToARGBRow_Unaligned_SSE2, I400ToARGBRow_C,
7, 1, 4)
+#endif
+#if defined(HAS_YTOARGBROW_SSE2)
RGBANY(YToARGBRow_Any_SSE2, YToARGBRow_SSE2, YToARGBRow_C,
7, 1, 4)
RGBANY(YUY2ToARGBRow_Any_SSSE3, YUY2ToARGBRow_Unaligned_SSSE3, YUY2ToARGBRow_C,
@@ -195,6 +199,15 @@ BAYERANY(ARGBToBayerRow_Any_SSSE3, ARGBToBayerRow_SSSE3, ARGBToBayerRow_C,
BAYERANY(ARGBToBayerRow_Any_NEON, ARGBToBayerRow_NEON, ARGBToBayerRow_C,
7, 4, 1)
#endif
+#if defined(HAS_ARGBTOBAYERGGROW_SSE2)
+BAYERANY(ARGBToBayerGGRow_Any_SSE2, ARGBToBayerGGRow_SSE2, ARGBToBayerGGRow_C,
+ 7, 4, 1)
+#endif
+#if defined(HAS_ARGBTOBAYERGGROW_NEON)
+BAYERANY(ARGBToBayerGGRow_Any_NEON, ARGBToBayerGGRow_NEON, ARGBToBayerGGRow_C,
+ 7, 4, 1)
+#endif
+
#undef BAYERANY
// RGB/YUV to Y does multiple of 16 with SIMD and last 16 with SIMD.
@@ -213,6 +226,8 @@ YANY(UYVYToYRow_Any_AVX2, UYVYToYRow_AVX2, 2, 1, 32)
#endif
#ifdef HAS_ARGBTOYROW_SSSE3
YANY(ARGBToYRow_Any_SSSE3, ARGBToYRow_Unaligned_SSSE3, 4, 1, 16)
+#endif
+#ifdef HAS_BGRATOYROW_SSSE3
YANY(BGRAToYRow_Any_SSSE3, BGRAToYRow_Unaligned_SSSE3, 4, 1, 16)
YANY(ABGRToYRow_Any_SSSE3, ABGRToYRow_Unaligned_SSSE3, 4, 1, 16)
YANY(RGBAToYRow_Any_SSSE3, RGBAToYRow_Unaligned_SSSE3, 4, 1, 16)
@@ -290,7 +305,7 @@ YANY(ARGBAttenuateRow_Any_NEON, ARGBAttenuateRow_NEON, ARGBAttenuateRow_C,
width & MASK); \
}
-#ifdef HAS_ARGBTOYROW_AVX2
+#ifdef HAS_ARGBTOUVROW_AVX2
UVANY(ARGBToUVRow_Any_AVX2, ARGBToUVRow_AVX2, ARGBToUVRow_C, 4, 31)
UVANY(YUY2ToUVRow_Any_AVX2, YUY2ToUVRow_AVX2, YUY2ToUVRow_C, 2, 31)
UVANY(UYVYToUVRow_Any_AVX2, UYVYToUVRow_AVX2, UYVYToUVRow_C, 2, 31)
@@ -468,6 +483,10 @@ MATHROW_ANY(ARGBSubtractRow_Any_NEON, ARGBSubtractRow_NEON, ARGBSubtractRow_C,
dst_argb + n * BPP, shuffler, width & MASK); \
}
+#ifdef HAS_ARGBSHUFFLEROW_SSE2
+YANY(ARGBShuffleRow_Any_SSE2, ARGBShuffleRow_SSE2,
+ ARGBShuffleRow_C, 4, 4, 3)
+#endif
#ifdef HAS_ARGBSHUFFLEROW_SSSE3
YANY(ARGBShuffleRow_Any_SSSE3, ARGBShuffleRow_Unaligned_SSSE3,
ARGBShuffleRow_C, 4, 4, 7)
@@ -495,6 +514,10 @@ YANY(ARGBShuffleRow_Any_NEON, ARGBShuffleRow_NEON,
width & MASK, source_y_fraction); \
}
+#ifdef HAS_INTERPOLATEROW_AVX2
+NANY(InterpolateRow_Any_AVX2, InterpolateRow_AVX2,
+ InterpolateRow_C, 1, 1, 32)
+#endif
#ifdef HAS_INTERPOLATEROW_SSSE3
NANY(InterpolateRow_Any_SSSE3, InterpolateRow_Unaligned_SSSE3,
InterpolateRow_C, 1, 1, 15)
diff --git a/chromium/third_party/libyuv/source/row_common.cc b/chromium/third_party/libyuv/source/row_common.cc
index badea440582..f961696f008 100644
--- a/chromium/third_party/libyuv/source/row_common.cc
+++ b/chromium/third_party/libyuv/source/row_common.cc
@@ -59,6 +59,11 @@ static __inline uint32 Abs(int32 v) {
}
#endif // USE_BRANCHLESS
+// Divide num by div and return as 16.16 fixed point result.
+int FixedDiv_C(int num, int div) {
+ return static_cast<int>((static_cast<int64>(num) << 16) / div);
+}
+
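// FixedDiv_C returns num / div as a 16.16 fixed-point value, e.g. for scale
// ratios. Worked examples (values chosen purely for illustration):
//   FixedDiv_C(1, 2)     -> 32768   (0.5 in 16.16)
//   FixedDiv_C(640, 480) -> 87381   (~1.3333 in 16.16)
// Dividing the result by 65536.0 recovers the original ratio.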
#ifdef LIBYUV_LITTLE_ENDIAN
#define WRITEWORD(p, v) *reinterpret_cast<uint32*>(p) = v
#else
@@ -649,21 +654,27 @@ void ARGBSepiaRow_C(uint8* dst_argb, int width) {
}
// Apply color matrix to a row of image. Matrix is signed.
-void ARGBColorMatrixRow_C(uint8* dst_argb, const int8* matrix_argb, int width) {
+// TODO(fbarchard): Consider adding rounding (+32).
+void ARGBColorMatrixRow_C(const uint8* src_argb, uint8* dst_argb,
+ const int8* matrix_argb, int width) {
for (int x = 0; x < width; ++x) {
- int b = dst_argb[0];
- int g = dst_argb[1];
- int r = dst_argb[2];
- int a = dst_argb[3];
+ int b = src_argb[0];
+ int g = src_argb[1];
+ int r = src_argb[2];
+ int a = src_argb[3];
int sb = (b * matrix_argb[0] + g * matrix_argb[1] +
- r * matrix_argb[2] + a * matrix_argb[3]) >> 7;
+ r * matrix_argb[2] + a * matrix_argb[3]) >> 6;
int sg = (b * matrix_argb[4] + g * matrix_argb[5] +
- r * matrix_argb[6] + a * matrix_argb[7]) >> 7;
+ r * matrix_argb[6] + a * matrix_argb[7]) >> 6;
int sr = (b * matrix_argb[8] + g * matrix_argb[9] +
- r * matrix_argb[10] + a * matrix_argb[11]) >> 7;
+ r * matrix_argb[10] + a * matrix_argb[11]) >> 6;
+ int sa = (b * matrix_argb[12] + g * matrix_argb[13] +
+ r * matrix_argb[14] + a * matrix_argb[15]) >> 6;
dst_argb[0] = Clamp(sb);
dst_argb[1] = Clamp(sg);
dst_argb[2] = Clamp(sr);
+ dst_argb[3] = Clamp(sa);
+ src_argb += 4;
dst_argb += 4;
}
}
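// With the shift changed from >> 7 to >> 6 the matrix is 4x4 in signed fixed
// point with 6 fractional bits (64 == 1.0), and the alpha channel is now
// transformed as well: row 0 of the matrix produces B, row 1 G, row 2 R and
// row 3 A. A minimal identity-matrix sketch (int8 is the libyuv typedef for
// signed char) that leaves every pixel unchanged:
static const int8 kIdentityColorMatrix[16] = {
  64,  0,  0,  0,  // B output
   0, 64,  0,  0,  // G output
   0,  0, 64,  0,  // R output
   0,  0,  0, 64,  // A output
};
// ARGBColorMatrixRow_C(src_argb, dst_argb, kIdentityColorMatrix, width);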
@@ -683,6 +694,19 @@ void ARGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width) {
}
}
+// Apply color table to a row of image.
+void RGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width) {
+ for (int x = 0; x < width; ++x) {
+ int b = dst_argb[0];
+ int g = dst_argb[1];
+ int r = dst_argb[2];
+ dst_argb[0] = table_argb[b * 4 + 0];
+ dst_argb[1] = table_argb[g * 4 + 1];
+ dst_argb[2] = table_argb[r * 4 + 2];
+ dst_argb += 4;
+ }
+}
+
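// RGBColorTableRow_C indexes a 256-entry, 4-bytes-per-entry table by channel
// value and channel position (B at +0, G at +1, R at +2), so one table holds
// three independent per-channel lookups while alpha is left untouched. A
// small sketch (identifier assumed) that fills such a table with an invert
// curve:
static void BuildInvertTableSketch(uint8 table_argb[256 * 4]) {
  for (int v = 0; v < 256; ++v) {
    table_argb[v * 4 + 0] = static_cast<uint8>(255 - v);  // B
    table_argb[v * 4 + 1] = static_cast<uint8>(255 - v);  // G
    table_argb[v * 4 + 2] = static_cast<uint8>(255 - v);  // R
    table_argb[v * 4 + 3] = static_cast<uint8>(v);        // A slot, unused here
  }
}
// RGBColorTableRow_C(dst_argb, table_argb, width);  // in-place remap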
void ARGBQuantizeRow_C(uint8* dst_argb, int scale, int interval_size,
int interval_offset, int width) {
for (int x = 0; x < width; ++x) {
@@ -845,6 +869,16 @@ void SobelRow_C(const uint8* src_sobelx, const uint8* src_sobely,
}
}
+void SobelToPlaneRow_C(const uint8* src_sobelx, const uint8* src_sobely,
+ uint8* dst_y, int width) {
+ for (int i = 0; i < width; ++i) {
+ int r = src_sobelx[i];
+ int b = src_sobely[i];
+ int s = clamp255(r + b);
+ dst_y[i] = static_cast<uint8>(s);
+ }
+}
+
void SobelXYRow_C(const uint8* src_sobelx, const uint8* src_sobely,
uint8* dst_argb, int width) {
for (int i = 0; i < width; ++i) {
@@ -1670,7 +1704,7 @@ void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
// Reciprocal method is off by 1 on some values. ie 125
// 8.8 fixed point inverse table with 1.0 in upper short and 1 / a in lower.
#define T(a) 0x01000000 + (0x10000 / a)
-uint32 fixed_invtbl8[256] = {
+const uint32 fixed_invtbl8[256] = {
0x01000000, 0x0100ffff, T(0x02), T(0x03), T(0x04), T(0x05), T(0x06), T(0x07),
T(0x08), T(0x09), T(0x0a), T(0x0b), T(0x0c), T(0x0d), T(0x0e), T(0x0f),
T(0x10), T(0x11), T(0x12), T(0x13), T(0x14), T(0x15), T(0x16), T(0x17),
@@ -1774,10 +1808,26 @@ void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride,
}
}
+// Blend 2 rows into 1 for conversions such as I422ToI420.
+void HalfRow_C(const uint8* src_uv, int src_uv_stride,
+ uint8* dst_uv, int pix) {
+ for (int x = 0; x < pix; ++x) {
+ dst_uv[x] = (src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1;
+ }
+}
+
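// HalfRow_C averages two rows with rounding ((a + b + 1) >> 1).
// InterpolateRow_C just below now short-circuits the two cheapest blends:
// source_y_fraction 0 is a plain copy of the top row, and 128 (the midpoint
// of the 0..256 range) is exactly HalfRow_C. For other fractions the general
// path blends roughly as
//   dst[i] = (src[i] * (256 - f) + src1[i] * f) >> 8
// (the general loop body falls outside this hunk, so the exact rounding is
// not shown here).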
// C version 2x2 -> 2x1.
void InterpolateRow_C(uint8* dst_ptr, const uint8* src_ptr,
ptrdiff_t src_stride,
int width, int source_y_fraction) {
+ if (source_y_fraction == 0) {
+ memcpy(dst_ptr, src_ptr, width);
+ return;
+ }
+ if (source_y_fraction == 128) {
+ HalfRow_C(src_ptr, static_cast<int>(src_stride), dst_ptr, width);
+ return;
+ }
int y1_fraction = source_y_fraction;
int y0_fraction = 256 - y1_fraction;
const uint8* src_ptr1 = src_ptr + src_stride;
@@ -1794,14 +1844,6 @@ void InterpolateRow_C(uint8* dst_ptr, const uint8* src_ptr,
}
}
-// Blend 2 rows into 1 for conversions such as I422ToI420.
-void HalfRow_C(const uint8* src_uv, int src_uv_stride,
- uint8* dst_uv, int pix) {
- for (int x = 0; x < pix; ++x) {
- dst_uv[x] = (src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1;
- }
-}
-
// Select 2 channels from ARGB on alternating pixels. e.g. BGBGBGBG
void ARGBToBayerRow_C(const uint8* src_argb,
uint8* dst_bayer, uint32 selector, int pix) {
@@ -1819,6 +1861,21 @@ void ARGBToBayerRow_C(const uint8* src_argb,
}
}
+// Select G channel from ARGB. e.g. GGGGGGGG
+void ARGBToBayerGGRow_C(const uint8* src_argb,
+ uint8* dst_bayer, uint32 /*selector*/, int pix) {
+ // Copy a row of G.
+ for (int x = 0; x < pix - 1; x += 2) {
+ dst_bayer[0] = src_argb[1];
+ dst_bayer[1] = src_argb[5];
+ src_argb += 8;
+ dst_bayer += 2;
+ }
+ if (pix & 1) {
+ dst_bayer[0] = src_argb[1];
+ }
+}
+
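// ARGBToBayerGGRow_C ignores its selector and always copies byte 1 of each
// 4-byte ARGB pixel, i.e. the G channel, two pixels per iteration plus an
// odd-width tail. The generic ARGBToBayerRow path reaches the same result
// with the 0x0d090501 selector used by the Sobel code earlier in this diff,
// which appears to pack the byte offsets 0x01, 0x05, 0x09, 0x0d (the G bytes
// of four consecutive ARGB pixels); the dedicated GG row skips decoding a
// selector entirely.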
// Use first 4 shuffler values to reorder ARGB channels.
void ARGBShuffleRow_C(const uint8* src_argb, uint8* dst_argb,
const uint8* shuffler, int pix) {
@@ -1886,10 +1943,19 @@ void I422ToUYVYRow_C(const uint8* src_y,
}
}
-#if !defined(LIBYUV_DISABLE_X86)
+// TODO(fbarchard): Ensure these are stack safe.
+#ifdef DEBUG
+#define MAYBE_SAFEBUFFERS
+#else
+#define MAYBE_SAFEBUFFERS SAFEBUFFERS
+#endif
+
+#if !defined(LIBYUV_DISABLE_X86) && defined(HAS_I422TOARGBROW_SSSE3)
// row_win.cc has asm version, but GCC uses 2 step wrapper. 5% slower.
// TODO(fbarchard): Handle width > kMaxStride here instead of calling code.
#if defined(__x86_64__) || defined(__i386__)
+MAYBE_SAFEBUFFERS
void I422ToRGB565Row_SSSE3(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
@@ -1902,6 +1968,7 @@ void I422ToRGB565Row_SSSE3(const uint8* src_y,
#endif // defined(__x86_64__) || defined(__i386__)
#if defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)
+MAYBE_SAFEBUFFERS
void I422ToARGB1555Row_SSSE3(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
@@ -1912,6 +1979,7 @@ void I422ToARGB1555Row_SSSE3(const uint8* src_y,
ARGBToARGB1555Row_SSE2(row, rgb_buf, width);
}
+MAYBE_SAFEBUFFERS
void I422ToARGB4444Row_SSSE3(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
@@ -1922,6 +1990,7 @@ void I422ToARGB4444Row_SSSE3(const uint8* src_y,
ARGBToARGB4444Row_SSE2(row, rgb_buf, width);
}
+MAYBE_SAFEBUFFERS
void NV12ToRGB565Row_SSSE3(const uint8* src_y,
const uint8* src_uv,
uint8* dst_rgb565,
@@ -1931,6 +2000,7 @@ void NV12ToRGB565Row_SSSE3(const uint8* src_y,
ARGBToRGB565Row_SSE2(row, dst_rgb565, width);
}
+MAYBE_SAFEBUFFERS
void NV21ToRGB565Row_SSSE3(const uint8* src_y,
const uint8* src_vu,
uint8* dst_rgb565,
@@ -1940,6 +2010,7 @@ void NV21ToRGB565Row_SSSE3(const uint8* src_y,
ARGBToRGB565Row_SSE2(row, dst_rgb565, width);
}
+MAYBE_SAFEBUFFERS
void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2,
uint8* dst_argb,
int width) {
@@ -1951,6 +2022,7 @@ void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2,
I422ToARGBRow_SSSE3(row_y, row_u, row_v, dst_argb, width);
}
+MAYBE_SAFEBUFFERS
void YUY2ToARGBRow_Unaligned_SSSE3(const uint8* src_yuy2,
uint8* dst_argb,
int width) {
@@ -1962,6 +2034,7 @@ void YUY2ToARGBRow_Unaligned_SSSE3(const uint8* src_yuy2,
I422ToARGBRow_Unaligned_SSSE3(row_y, row_u, row_v, dst_argb, width);
}
+MAYBE_SAFEBUFFERS
void UYVYToARGBRow_SSSE3(const uint8* src_uyvy,
uint8* dst_argb,
int width) {
@@ -1973,6 +2046,7 @@ void UYVYToARGBRow_SSSE3(const uint8* src_uyvy,
I422ToARGBRow_SSSE3(row_y, row_u, row_v, dst_argb, width);
}
+MAYBE_SAFEBUFFERS
void UYVYToARGBRow_Unaligned_SSSE3(const uint8* src_uyvy,
uint8* dst_argb,
int width) {
@@ -1986,8 +2060,102 @@ void UYVYToARGBRow_Unaligned_SSSE3(const uint8* src_uyvy,
#endif // defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)
#endif // !defined(LIBYUV_DISABLE_X86)
-#undef clamp0
-#undef clamp255
+
+void ARGBPolynomialRow_C(const uint8* src_argb,
+ uint8* dst_argb, const float* poly,
+ int width) {
+ for (int i = 0; i < width; ++i) {
+ float b = static_cast<float>(src_argb[0]);
+ float g = static_cast<float>(src_argb[1]);
+ float r = static_cast<float>(src_argb[2]);
+ float a = static_cast<float>(src_argb[3]);
+ float b2 = b * b;
+ float g2 = g * g;
+ float r2 = r * r;
+ float a2 = a * a;
+ float db = poly[0] + poly[4] * b;
+ float dg = poly[1] + poly[5] * g;
+ float dr = poly[2] + poly[6] * r;
+ float da = poly[3] + poly[7] * a;
+ db += poly[8] * b2;
+ dg += poly[9] * g2;
+ dr += poly[10] * r2;
+ da += poly[11] * a2;
+ float b3 = b2 * b;
+ float g3 = g2 * g;
+ float r3 = r2 * r;
+ float a3 = a2 * a;
+ db += poly[12] * b3;
+ dg += poly[13] * g3;
+ dr += poly[14] * r3;
+ da += poly[15] * a3;
+
+ dst_argb[0] = Clamp(static_cast<int32>(db));
+ dst_argb[1] = Clamp(static_cast<int32>(dg));
+ dst_argb[2] = Clamp(static_cast<int32>(dr));
+ dst_argb[3] = Clamp(static_cast<int32>(da));
+ src_argb += 4;
+ dst_argb += 4;
+ }
+}
+
+void ARGBLumaColorTableRow_C(const uint8* src_argb, uint8* dst_argb, int width,
+ const uint8* luma, const uint32 lumacoeff) {
+ uint32 bc = lumacoeff & 0xff;
+ uint32 gc = (lumacoeff >> 8) & 0xff;
+ uint32 rc = (lumacoeff >> 16) & 0xff;
+
+ for (int i = 0; i < width - 1; i += 2) {
+ // Luminance in rows, color values in columns.
+ const uint8* luma0 = ((src_argb[0] * bc + src_argb[1] * gc +
+ src_argb[2] * rc) & 0x7F00u) + luma;
+ dst_argb[0] = luma0[src_argb[0]];
+ dst_argb[1] = luma0[src_argb[1]];
+ dst_argb[2] = luma0[src_argb[2]];
+ dst_argb[3] = src_argb[3];
+ const uint8* luma1 = ((src_argb[4] * bc + src_argb[5] * gc +
+ src_argb[6] * rc) & 0x7F00u) + luma;
+ dst_argb[4] = luma1[src_argb[4]];
+ dst_argb[5] = luma1[src_argb[5]];
+ dst_argb[6] = luma1[src_argb[6]];
+ dst_argb[7] = src_argb[7];
+ src_argb += 8;
+ dst_argb += 8;
+ }
+ if (width & 1) {
+ // Luminance in rows, color values in columns.
+ const uint8* luma0 = ((src_argb[0] * bc + src_argb[1] * gc +
+ src_argb[2] * rc) & 0x7F00u) + luma;
+ dst_argb[0] = luma0[src_argb[0]];
+ dst_argb[1] = luma0[src_argb[1]];
+ dst_argb[2] = luma0[src_argb[2]];
+ dst_argb[3] = src_argb[3];
+ }
+}
+
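// The luma argument is a 32768-byte table: the weighted luma
// (B*15 + G*75 + R*38, the 0x00264b0f coefficients passed by
// ARGBLumaColorTable above, weights summing to 128) is masked with 0x7F00 to
// select one of 128 rows of 256 bytes, and each B/G/R value is remapped
// through that row. A sketch (identifier assumed) that builds an identity
// table, i.e. output equals input in every luma band:
static void BuildIdentityLumaTableSketch(uint8 luma[128 * 256]) {
  for (int band = 0; band < 128; ++band) {
    for (int v = 0; v < 256; ++v) {
      luma[band * 256 + v] = static_cast<uint8>(v);
    }
  }
}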
+void ARGBCopyAlphaRow_C(const uint8* src, uint8* dst, int width) {
+ for (int i = 0; i < width - 1; i += 2) {
+ dst[3] = src[3];
+ dst[7] = src[7];
+ dst += 8;
+ src += 8;
+ }
+ if (width & 1) {
+ dst[3] = src[3];
+ }
+}
+
+void ARGBCopyYToAlphaRow_C(const uint8* src, uint8* dst, int width) {
+ for (int i = 0; i < width - 1; i += 2) {
+ dst[3] = src[0];
+ dst[7] = src[1];
+ dst += 8;
+ src += 2;
+ }
+ if (width & 1) {
+ dst[3] = src[0];
+ }
+}
#ifdef __cplusplus
} // extern "C"
diff --git a/chromium/third_party/libyuv/source/row_mips.cc b/chromium/third_party/libyuv/source/row_mips.cc
index 69677aa2d5b..4435c55c5ce 100644
--- a/chromium/third_party/libyuv/source/row_mips.cc
+++ b/chromium/third_party/libyuv/source/row_mips.cc
@@ -15,6 +15,9 @@ namespace libyuv {
extern "C" {
#endif
+// The following are available on Mips platforms:
+#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips__)
+
#ifdef HAS_COPYROW_MIPS
void CopyRow_MIPS(const uint8* src, uint8* dst, int count) {
__asm__ __volatile__ (
@@ -383,6 +386,7 @@ void SplitUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
"blez $t4, 2f \n"
" andi %[width], %[width], 0xf \n" // residual
+ ".p2align 2 \n"
"1: \n"
"addiu $t4, $t4, -1 \n"
"lw $t0, 0(%[src_uv]) \n" // V1 | U1 | V0 | U0
@@ -449,6 +453,7 @@ void SplitUVRow_Unaligned_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u,
"blez $t4, 2f \n"
" andi %[width], %[width], 0xf \n" // residual
+ ".p2align 2 \n"
"1: \n"
"addiu $t4, $t4, -1 \n"
"lwr $t0, 0(%[src_uv]) \n"
@@ -532,7 +537,8 @@ void MirrorRow_MIPS_DSPR2(const uint8* src, uint8* dst, int width) {
"blez $t4, 2f \n"
" addu %[src], %[src], %[width] \n" // src += width
- "1: \n"
+ ".p2align 2 \n"
+ "1: \n"
"lw $t0, -16(%[src]) \n" // |3|2|1|0|
"lw $t1, -12(%[src]) \n" // |7|6|5|4|
"lw $t2, -8(%[src]) \n" // |11|10|9|8|
@@ -556,7 +562,7 @@ void MirrorRow_MIPS_DSPR2(const uint8* src, uint8* dst, int width) {
"beqz $t5, 3f \n"
" nop \n"
- "2: \n"
+ "2: \n"
"lbu $t0, -1(%[src]) \n"
"addiu $t5, $t5, -1 \n"
"addiu %[src], %[src], -1 \n"
@@ -564,7 +570,7 @@ void MirrorRow_MIPS_DSPR2(const uint8* src, uint8* dst, int width) {
"bgez $t5, 2b \n"
" addiu %[dst], %[dst], 1 \n"
- "3: \n"
+ "3: \n"
".set pop \n"
: [src] "+r" (src), [dst] "+r" (dst)
: [width] "r" (width)
@@ -586,7 +592,8 @@ void MirrorUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
"blez %[x], 2f \n"
" addu %[src_uv], %[src_uv], $t4 \n"
- "1: \n"
+ ".p2align 2 \n"
+ "1: \n"
"lw $t0, -32(%[src_uv]) \n" // |3|2|1|0|
"lw $t1, -28(%[src_uv]) \n" // |7|6|5|4|
"lw $t2, -24(%[src_uv]) \n" // |11|10|9|8|
@@ -638,7 +645,7 @@ void MirrorUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
"b 2f \n"
" nop \n"
- "2: \n"
+ "2: \n"
"lbu $t0, -2(%[src_uv]) \n"
"lbu $t1, -1(%[src_uv]) \n"
"addiu %[src_uv], %[src_uv], -2 \n"
@@ -649,7 +656,7 @@ void MirrorUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
"bgtz %[y], 2b \n"
" addiu %[dst_v], %[dst_v], 1 \n"
- "3: \n"
+ "3: \n"
".set pop \n"
: [src_uv] "+r" (src_uv),
[dst_u] "+r" (dst_u),
@@ -670,62 +677,62 @@ void MirrorUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
// t2 = | 0 | R0 | 0 | r0 |
// t1 = | 0 | R1 | 0 | r1 |
#define I422ToTransientMipsRGB \
- "lw $t0, 0(%[y_buf]) \n" \
- "lhu $t1, 0(%[u_buf]) \n" \
- "lhu $t2, 0(%[v_buf]) \n" \
- "preceu.ph.qbr $t1, $t1 \n" \
- "preceu.ph.qbr $t2, $t2 \n" \
- "preceu.ph.qbra $t3, $t0 \n" \
- "preceu.ph.qbla $t0, $t0 \n" \
- "subu.ph $t1, $t1, $s5 \n" \
- "subu.ph $t2, $t2, $s5 \n" \
- "subu.ph $t3, $t3, $s4 \n" \
- "subu.ph $t0, $t0, $s4 \n" \
- "mul.ph $t3, $t3, $s0 \n" \
- "mul.ph $t0, $t0, $s0 \n" \
- "shll.ph $t4, $t1, 0x7 \n" \
- "subu.ph $t4, $t4, $t1 \n" \
- "mul.ph $t6, $t1, $s1 \n" \
- "mul.ph $t1, $t2, $s2 \n" \
- "addq_s.ph $t5, $t4, $t3 \n" \
- "addq_s.ph $t4, $t4, $t0 \n" \
- "shra.ph $t5, $t5, 6 \n" \
- "shra.ph $t4, $t4, 6 \n" \
- "addiu %[u_buf], 2 \n" \
- "addiu %[v_buf], 2 \n" \
- "addu.ph $t6, $t6, $t1 \n" \
- "mul.ph $t1, $t2, $s3 \n" \
- "addu.ph $t9, $t6, $t3 \n" \
- "addu.ph $t8, $t6, $t0 \n" \
- "shra.ph $t9, $t9, 6 \n" \
- "shra.ph $t8, $t8, 6 \n" \
- "addu.ph $t2, $t1, $t3 \n" \
- "addu.ph $t1, $t1, $t0 \n" \
- "shra.ph $t2, $t2, 6 \n" \
- "shra.ph $t1, $t1, 6 \n" \
- "subu.ph $t5, $t5, $s5 \n" \
- "subu.ph $t4, $t4, $s5 \n" \
- "subu.ph $t9, $t9, $s5 \n" \
- "subu.ph $t8, $t8, $s5 \n" \
- "subu.ph $t2, $t2, $s5 \n" \
- "subu.ph $t1, $t1, $s5 \n" \
- "shll_s.ph $t5, $t5, 8 \n" \
- "shll_s.ph $t4, $t4, 8 \n" \
- "shll_s.ph $t9, $t9, 8 \n" \
- "shll_s.ph $t8, $t8, 8 \n" \
- "shll_s.ph $t2, $t2, 8 \n" \
- "shll_s.ph $t1, $t1, 8 \n" \
- "shra.ph $t5, $t5, 8 \n" \
- "shra.ph $t4, $t4, 8 \n" \
- "shra.ph $t9, $t9, 8 \n" \
- "shra.ph $t8, $t8, 8 \n" \
- "shra.ph $t2, $t2, 8 \n" \
- "shra.ph $t1, $t1, 8 \n" \
- "addu.ph $t5, $t5, $s5 \n" \
- "addu.ph $t4, $t4, $s5 \n" \
- "addu.ph $t9, $t9, $s5 \n" \
- "addu.ph $t8, $t8, $s5 \n" \
- "addu.ph $t2, $t2, $s5 \n" \
+ "lw $t0, 0(%[y_buf]) \n" \
+ "lhu $t1, 0(%[u_buf]) \n" \
+ "lhu $t2, 0(%[v_buf]) \n" \
+ "preceu.ph.qbr $t1, $t1 \n" \
+ "preceu.ph.qbr $t2, $t2 \n" \
+ "preceu.ph.qbra $t3, $t0 \n" \
+ "preceu.ph.qbla $t0, $t0 \n" \
+ "subu.ph $t1, $t1, $s5 \n" \
+ "subu.ph $t2, $t2, $s5 \n" \
+ "subu.ph $t3, $t3, $s4 \n" \
+ "subu.ph $t0, $t0, $s4 \n" \
+ "mul.ph $t3, $t3, $s0 \n" \
+ "mul.ph $t0, $t0, $s0 \n" \
+ "shll.ph $t4, $t1, 0x7 \n" \
+ "subu.ph $t4, $t4, $t1 \n" \
+ "mul.ph $t6, $t1, $s1 \n" \
+ "mul.ph $t1, $t2, $s2 \n" \
+ "addq_s.ph $t5, $t4, $t3 \n" \
+ "addq_s.ph $t4, $t4, $t0 \n" \
+ "shra.ph $t5, $t5, 6 \n" \
+ "shra.ph $t4, $t4, 6 \n" \
+ "addiu %[u_buf], 2 \n" \
+ "addiu %[v_buf], 2 \n" \
+ "addu.ph $t6, $t6, $t1 \n" \
+ "mul.ph $t1, $t2, $s3 \n" \
+ "addu.ph $t9, $t6, $t3 \n" \
+ "addu.ph $t8, $t6, $t0 \n" \
+ "shra.ph $t9, $t9, 6 \n" \
+ "shra.ph $t8, $t8, 6 \n" \
+ "addu.ph $t2, $t1, $t3 \n" \
+ "addu.ph $t1, $t1, $t0 \n" \
+ "shra.ph $t2, $t2, 6 \n" \
+ "shra.ph $t1, $t1, 6 \n" \
+ "subu.ph $t5, $t5, $s5 \n" \
+ "subu.ph $t4, $t4, $s5 \n" \
+ "subu.ph $t9, $t9, $s5 \n" \
+ "subu.ph $t8, $t8, $s5 \n" \
+ "subu.ph $t2, $t2, $s5 \n" \
+ "subu.ph $t1, $t1, $s5 \n" \
+ "shll_s.ph $t5, $t5, 8 \n" \
+ "shll_s.ph $t4, $t4, 8 \n" \
+ "shll_s.ph $t9, $t9, 8 \n" \
+ "shll_s.ph $t8, $t8, 8 \n" \
+ "shll_s.ph $t2, $t2, 8 \n" \
+ "shll_s.ph $t1, $t1, 8 \n" \
+ "shra.ph $t5, $t5, 8 \n" \
+ "shra.ph $t4, $t4, 8 \n" \
+ "shra.ph $t9, $t9, 8 \n" \
+ "shra.ph $t8, $t8, 8 \n" \
+ "shra.ph $t2, $t2, 8 \n" \
+ "shra.ph $t1, $t1, 8 \n" \
+ "addu.ph $t5, $t5, $s5 \n" \
+ "addu.ph $t4, $t4, $s5 \n" \
+ "addu.ph $t9, $t9, $s5 \n" \
+ "addu.ph $t8, $t8, $s5 \n" \
+ "addu.ph $t2, $t2, $s5 \n" \
"addu.ph $t1, $t1, $s5 \n"
void I422ToARGBRow_MIPS_DSPR2(const uint8* y_buf,
@@ -745,7 +752,9 @@ void I422ToARGBRow_MIPS_DSPR2(const uint8* y_buf,
"repl.ph $s5, 128 \n" // |128|128| // clipping
"lui $s6, 0xff00 \n"
"ori $s6, 0xff00 \n" // |ff|00|ff|00|ff|
- "1: \n"
+
+ ".p2align 2 \n"
+ "1: \n"
I422ToTransientMipsRGB
// Arranging into argb format
"precr.qb.ph $t4, $t8, $t4 \n" // |G1|g1|B1|b1|
@@ -773,7 +782,7 @@ void I422ToARGBRow_MIPS_DSPR2(const uint8* y_buf,
"sw $t3, 12(%[rgb_buf]) \n"
"bnez %[width], 1b \n"
" addiu %[rgb_buf], 16 \n"
- "2: \n"
+ "2: \n"
".set pop \n"
:[y_buf] "+r" (y_buf),
[u_buf] "+r" (u_buf),
@@ -794,47 +803,49 @@ void I422ToABGRRow_MIPS_DSPR2(const uint8* y_buf,
uint8* rgb_buf,
int width) {
__asm__ __volatile__ (
- ".set push \n\t"
- ".set noreorder \n\t"
- "beqz %[width], 2f \n\t"
- " repl.ph $s0, 74 \n\t" // |YG|YG| = |74|74|
- "repl.ph $s1, -25 \n\t" // |UG|UG| = |-25|-25|
- "repl.ph $s2, -52 \n\t" // |VG|VG| = |-52|-52|
- "repl.ph $s3, 102 \n\t" // |VR|VR| = |102|102|
- "repl.ph $s4, 16 \n\t" // |0|16|0|16|
- "repl.ph $s5, 128 \n\t" // |128|128|
- "lui $s6, 0xff00 \n\t"
- "ori $s6, 0xff00 \n\t" // |ff|00|ff|00|
- "1: \n"
+ ".set push \n"
+ ".set noreorder \n"
+ "beqz %[width], 2f \n"
+ " repl.ph $s0, 74 \n" // |YG|YG| = |74|74|
+ "repl.ph $s1, -25 \n" // |UG|UG| = |-25|-25|
+ "repl.ph $s2, -52 \n" // |VG|VG| = |-52|-52|
+ "repl.ph $s3, 102 \n" // |VR|VR| = |102|102|
+ "repl.ph $s4, 16 \n" // |0|16|0|16|
+ "repl.ph $s5, 128 \n" // |128|128|
+ "lui $s6, 0xff00 \n"
+ "ori $s6, 0xff00 \n" // |ff|00|ff|00|
+
+ ".p2align 2 \n"
+ "1: \n"
I422ToTransientMipsRGB
// Arranging into abgr format
- "precr.qb.ph $t0, $t8, $t1 \n\t" // |G1|g1|R1|r1|
- "precr.qb.ph $t3, $t9, $t2 \n\t" // |G0|g0|R0|r0|
- "precrq.qb.ph $t8, $t0, $t3 \n\t" // |G1|R1|G0|R0|
- "precr.qb.ph $t9, $t0, $t3 \n\t" // |g1|r1|g0|r0|
-
- "precr.qb.ph $t2, $t4, $t5 \n\t" // |B1|b1|B0|b0|
- "addiu %[width], -4 \n\t"
- "addiu %[y_buf], 4 \n\t"
- "preceu.ph.qbla $t1, $t2 \n\t" // |0 |B1|0 |B0|
- "preceu.ph.qbra $t2, $t2 \n\t" // |0 |b1|0 |b0|
- "or $t1, $t1, $s6 \n\t" // |ff|B1|ff|B0|
- "or $t2, $t2, $s6 \n\t" // |ff|b1|ff|b0|
- "precrq.ph.w $t0, $t2, $t9 \n\t" // |ff|b1|g1|r1|
- "precrq.ph.w $t3, $t1, $t8 \n\t" // |ff|B1|G1|R1|
- "sll $t9, $t9, 16 \n\t"
- "sll $t8, $t8, 16 \n\t"
- "packrl.ph $t2, $t2, $t9 \n\t" // |ff|b0|g0|r0|
- "packrl.ph $t1, $t1, $t8 \n\t" // |ff|B0|G0|R0|
+ "precr.qb.ph $t0, $t8, $t1 \n" // |G1|g1|R1|r1|
+ "precr.qb.ph $t3, $t9, $t2 \n" // |G0|g0|R0|r0|
+ "precrq.qb.ph $t8, $t0, $t3 \n" // |G1|R1|G0|R0|
+ "precr.qb.ph $t9, $t0, $t3 \n" // |g1|r1|g0|r0|
+
+ "precr.qb.ph $t2, $t4, $t5 \n" // |B1|b1|B0|b0|
+ "addiu %[width], -4 \n"
+ "addiu %[y_buf], 4 \n"
+ "preceu.ph.qbla $t1, $t2 \n" // |0 |B1|0 |B0|
+ "preceu.ph.qbra $t2, $t2 \n" // |0 |b1|0 |b0|
+ "or $t1, $t1, $s6 \n" // |ff|B1|ff|B0|
+ "or $t2, $t2, $s6 \n" // |ff|b1|ff|b0|
+ "precrq.ph.w $t0, $t2, $t9 \n" // |ff|b1|g1|r1|
+ "precrq.ph.w $t3, $t1, $t8 \n" // |ff|B1|G1|R1|
+ "sll $t9, $t9, 16 \n"
+ "sll $t8, $t8, 16 \n"
+ "packrl.ph $t2, $t2, $t9 \n" // |ff|b0|g0|r0|
+ "packrl.ph $t1, $t1, $t8 \n" // |ff|B0|G0|R0|
// Store results.
- "sw $t2, 0(%[rgb_buf]) \n\t"
- "sw $t0, 4(%[rgb_buf]) \n\t"
- "sw $t1, 8(%[rgb_buf]) \n\t"
- "sw $t3, 12(%[rgb_buf]) \n\t"
- "bnez %[width], 1b \n\t"
- " addiu %[rgb_buf], 16 \n\t"
- "2: \n\t"
- ".set pop \n\t"
+ "sw $t2, 0(%[rgb_buf]) \n"
+ "sw $t0, 4(%[rgb_buf]) \n"
+ "sw $t1, 8(%[rgb_buf]) \n"
+ "sw $t3, 12(%[rgb_buf]) \n"
+ "bnez %[width], 1b \n"
+ " addiu %[rgb_buf], 16 \n"
+ "2: \n"
+ ".set pop \n"
:[y_buf] "+r" (y_buf),
[u_buf] "+r" (u_buf),
[v_buf] "+r" (v_buf),
@@ -865,13 +876,15 @@ void I422ToBGRARow_MIPS_DSPR2(const uint8* y_buf,
"repl.ph $s5, 128 \n" // |128|128|
"lui $s6, 0xff \n"
"ori $s6, 0xff \n" // |00|ff|00|ff|
- "1: \n"
+
+ ".p2align 2 \n"
+ "1: \n"
I422ToTransientMipsRGB
// Arranging into bgra format
- "precr.qb.ph $t4, $t4, $t8 \n" // |B1|b1|G1|g1|
- "precr.qb.ph $t5, $t5, $t9 \n" // |B0|b0|G0|g0|
- "precrq.qb.ph $t8, $t4, $t5 \n" // |B1|G1|B0|G0|
- "precr.qb.ph $t9, $t4, $t5 \n" // |b1|g1|b0|g0|
+ "precr.qb.ph $t4, $t4, $t8 \n" // |B1|b1|G1|g1|
+ "precr.qb.ph $t5, $t5, $t9 \n" // |B0|b0|G0|g0|
+ "precrq.qb.ph $t8, $t4, $t5 \n" // |B1|G1|B0|G0|
+ "precr.qb.ph $t9, $t4, $t5 \n" // |b1|g1|b0|g0|
"precr.qb.ph $t2, $t1, $t2 \n" // |R1|r1|R0|r0|
"addiu %[width], -4 \n"
@@ -895,7 +908,7 @@ void I422ToBGRARow_MIPS_DSPR2(const uint8* y_buf,
"sw $t3, 12(%[rgb_buf]) \n"
"bnez %[width], 1b \n"
" addiu %[rgb_buf], 16 \n"
- "2: \n"
+ "2: \n"
".set pop \n"
:[y_buf] "+r" (y_buf),
[u_buf] "+r" (u_buf),
@@ -923,6 +936,8 @@ void InterpolateRows_MIPS_DSPR2(uint8* dst_ptr, const uint8* src_ptr,
"replv.ph $t0, %[y0_fraction] \n"
"replv.ph $t1, %[source_y_fraction] \n"
+
+ ".p2align 2 \n"
"1: \n"
"lw $t2, 0(%[src_ptr]) \n"
"lw $t3, 0(%[src_ptr1]) \n"
@@ -968,6 +983,8 @@ void InterpolateRows_MIPS_DSPR2(uint8* dst_ptr, const uint8* src_ptr,
}
#endif // __mips_dsp_rev >= 2
+#endif // defined(__mips__)
+
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
diff --git a/chromium/third_party/libyuv/source/row_neon.cc b/chromium/third_party/libyuv/source/row_neon.cc
index 0bb55e717be..5e802194b2b 100644
--- a/chromium/third_party/libyuv/source/row_neon.cc
+++ b/chromium/third_party/libyuv/source/row_neon.cc
@@ -102,10 +102,10 @@ extern "C" {
"vtrn.u8 d16, d17 \n" \
"vmov.u8 d21, d16 \n"
-static const vec8 kUVToRB = { 127, 127, 127, 127, 102, 102, 102, 102,
- 0, 0, 0, 0, 0, 0, 0, 0 };
-static const vec8 kUVToG = { -25, -25, -25, -25, -52, -52, -52, -52,
- 0, 0, 0, 0, 0, 0, 0, 0 };
+static vec8 kUVToRB = { 127, 127, 127, 127, 102, 102, 102, 102,
+ 0, 0, 0, 0, 0, 0, 0, 0 };
+static vec8 kUVToG = { -25, -25, -25, -25, -52, -52, -52, -52,
+ 0, 0, 0, 0, 0, 0, 0, 0 };
void I444ToARGBRow_NEON(const uint8* src_y,
const uint8* src_u,
@@ -118,7 +118,7 @@ void I444ToARGBRow_NEON(const uint8* src_y,
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
"vmov.u16 q15, #16 \n"
- ".p2align 2 \n"
+ ".p2align 2 \n"
"1: \n"
READYUV444
YUV422TORGB
@@ -149,7 +149,7 @@ void I422ToARGBRow_NEON(const uint8* src_y,
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
"vmov.u16 q15, #16 \n"
- ".p2align 2 \n"
+ ".p2align 2 \n"
"1: \n"
READYUV422
YUV422TORGB
@@ -180,7 +180,7 @@ void I411ToARGBRow_NEON(const uint8* src_y,
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
"vmov.u16 q15, #16 \n"
- ".p2align 2 \n"
+ ".p2align 2 \n"
"1: \n"
READYUV411
YUV422TORGB
@@ -211,7 +211,7 @@ void I422ToBGRARow_NEON(const uint8* src_y,
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
"vmov.u16 q15, #16 \n"
- ".p2align 2 \n"
+ ".p2align 2 \n"
"1: \n"
READYUV422
YUV422TORGB
@@ -243,7 +243,7 @@ void I422ToABGRRow_NEON(const uint8* src_y,
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
"vmov.u16 q15, #16 \n"
- ".p2align 2 \n"
+ ".p2align 2 \n"
"1: \n"
READYUV422
YUV422TORGB
@@ -275,7 +275,7 @@ void I422ToRGBARow_NEON(const uint8* src_y,
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
"vmov.u16 q15, #16 \n"
- ".p2align 2 \n"
+ ".p2align 2 \n"
"1: \n"
READYUV422
YUV422TORGB
@@ -306,7 +306,7 @@ void I422ToRGB24Row_NEON(const uint8* src_y,
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
"vmov.u16 q15, #16 \n"
- ".p2align 2 \n"
+ ".p2align 2 \n"
"1: \n"
READYUV422
YUV422TORGB
@@ -336,7 +336,7 @@ void I422ToRAWRow_NEON(const uint8* src_y,
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
"vmov.u16 q15, #16 \n"
- ".p2align 2 \n"
+ ".p2align 2 \n"
"1: \n"
READYUV422
YUV422TORGB
@@ -379,7 +379,7 @@ void I422ToRGB565Row_NEON(const uint8* src_y,
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
"vmov.u16 q15, #16 \n"
- ".p2align 2 \n"
+ ".p2align 2 \n"
"1: \n"
READYUV422
YUV422TORGB
@@ -425,7 +425,7 @@ void I422ToARGB1555Row_NEON(const uint8* src_y,
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
"vmov.u16 q15, #16 \n"
- ".p2align 2 \n"
+ ".p2align 2 \n"
"1: \n"
READYUV422
YUV422TORGB
@@ -467,7 +467,7 @@ void I422ToARGB4444Row_NEON(const uint8* src_y,
"vmov.u16 q14, #74 \n"
"vmov.u16 q15, #16 \n"
"vmov.u8 d4, #0x0f \n" // bits to clear with vbic.
- ".p2align 2 \n"
+ ".p2align 2 \n"
"1: \n"
READYUV422
YUV422TORGB
@@ -497,7 +497,7 @@ void YToARGBRow_NEON(const uint8* src_y,
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
"vmov.u16 q15, #16 \n"
- ".p2align 2 \n"
+ ".p2align 2 \n"
"1: \n"
READYUV400
YUV422TORGB
@@ -519,7 +519,7 @@ void I400ToARGBRow_NEON(const uint8* src_y,
uint8* dst_argb,
int width) {
asm volatile (
- ".p2align 2 \n"
+ ".p2align 2 \n"
"vmov.u8 d23, #255 \n"
"1: \n"
"vld1.8 {d20}, [%0]! \n"
@@ -546,7 +546,7 @@ void NV12ToARGBRow_NEON(const uint8* src_y,
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
"vmov.u16 q15, #16 \n"
- ".p2align 2 \n"
+ ".p2align 2 \n"
"1: \n"
READNV12
YUV422TORGB
@@ -575,7 +575,7 @@ void NV21ToARGBRow_NEON(const uint8* src_y,
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
"vmov.u16 q15, #16 \n"
- ".p2align 2 \n"
+ ".p2align 2 \n"
"1: \n"
READNV21
YUV422TORGB
@@ -604,7 +604,7 @@ void NV12ToRGB565Row_NEON(const uint8* src_y,
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
"vmov.u16 q15, #16 \n"
- ".p2align 2 \n"
+ ".p2align 2 \n"
"1: \n"
READNV12
YUV422TORGB
@@ -633,7 +633,7 @@ void NV21ToRGB565Row_NEON(const uint8* src_y,
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
"vmov.u16 q15, #16 \n"
- ".p2align 2 \n"
+ ".p2align 2 \n"
"1: \n"
READNV21
YUV422TORGB
@@ -661,7 +661,7 @@ void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
"vmov.u16 q15, #16 \n"
- ".p2align 2 \n"
+ ".p2align 2 \n"
"1: \n"
READYUY2
YUV422TORGB
@@ -688,7 +688,7 @@ void UYVYToARGBRow_NEON(const uint8* src_uyvy,
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
"vmov.u16 q15, #16 \n"
- ".p2align 2 \n"
+ ".p2align 2 \n"
"1: \n"
READUYVY
YUV422TORGB
@@ -710,7 +710,7 @@ void UYVYToARGBRow_NEON(const uint8* src_uyvy,
void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
int width) {
asm volatile (
- ".p2align 2 \n"
+ ".p2align 2 \n"
"1: \n"
"vld2.8 {q0, q1}, [%0]! \n" // load 16 pairs of UV
"subs %3, %3, #16 \n" // 16 processed per loop
@@ -730,7 +730,7 @@ void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
int width) {
asm volatile (
- ".p2align 2 \n"
+ ".p2align 2 \n"
"1: \n"
"vld1.8 {q0}, [%0]! \n" // load U
"vld1.8 {q1}, [%1]! \n" // load V
@@ -750,7 +750,7 @@ void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
// Copy multiple of 32. vld4.8 allow unaligned and is fastest on a15.
void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
asm volatile (
- ".p2align 2 \n"
+ ".p2align 2 \n"
"1: \n"
"vld1.8 {d0, d1, d2, d3}, [%0]! \n" // load 32
"subs %2, %2, #32 \n" // 32 processed per loop
@@ -796,7 +796,7 @@ void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
"add %0, %0, %2 \n"
"sub %0, #16 \n"
- ".p2align 2 \n"
+ ".p2align 2 \n"
"1: \n"
"vld1.8 {q0}, [%0], r3 \n" // src -= 16
"subs %2, #16 \n" // 16 pixels per loop.
@@ -820,7 +820,7 @@ void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
"add %0, %0, %3, lsl #1 \n"
"sub %0, #16 \n"
- ".p2align 2 \n"
+ ".p2align 2 \n"
"1: \n"
"vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16
"subs %3, #8 \n" // 8 pixels per loop.
@@ -844,7 +844,7 @@ void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
"add %0, %0, %2, lsl #2 \n"
"sub %0, #16 \n"
- ".p2align 2 \n"
+ ".p2align 2 \n"
"1: \n"
"vld1.8 {q0}, [%0], r3 \n" // src -= 16
"subs %2, #4 \n" // 4 pixels per loop.
@@ -863,7 +863,7 @@ void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) {
asm volatile (
"vmov.u8 d4, #255 \n" // Alpha
- ".p2align 2 \n"
+ ".p2align 2 \n"
"1: \n"
"vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RGB24.
"subs %2, %2, #8 \n" // 8 processed per loop.
@@ -880,7 +880,7 @@ void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) {
void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) {
asm volatile (
"vmov.u8 d4, #255 \n" // Alpha
- ".p2align 2 \n"
+ ".p2align 2 \n"
"1: \n"
"vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW.
"subs %2, %2, #8 \n" // 8 processed per loop.
@@ -910,7 +910,7 @@ void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) {
void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int pix) {
asm volatile (
"vmov.u8 d3, #255 \n" // Alpha
- ".p2align 2 \n"
+ ".p2align 2 \n"
"1: \n"
"vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
@@ -956,7 +956,7 @@ void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb,
int pix) {
asm volatile (
"vmov.u8 d3, #255 \n" // Alpha
- ".p2align 2 \n"
+ ".p2align 2 \n"
"1: \n"
"vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
@@ -985,7 +985,7 @@ void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb,
int pix) {
asm volatile (
"vmov.u8 d3, #255 \n" // Alpha
- ".p2align 2 \n"
+ ".p2align 2 \n"
"1: \n"
"vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
@@ -1002,7 +1002,7 @@ void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb,
void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) {
asm volatile (
- ".p2align 2 \n"
+ ".p2align 2 \n"
"1: \n"
"vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB.
"subs %2, %2, #8 \n" // 8 processed per loop.
@@ -1018,7 +1018,7 @@ void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) {
void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) {
asm volatile (
- ".p2align 2 \n"
+ ".p2align 2 \n"
"1: \n"
"vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB.
"subs %2, %2, #8 \n" // 8 processed per loop.
@@ -1035,7 +1035,7 @@ void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) {
void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) {
asm volatile (
- ".p2align 2 \n"
+ ".p2align 2 \n"
"1: \n"
"vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2.
"subs %2, %2, #16 \n" // 16 processed per loop.
@@ -1051,7 +1051,7 @@ void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) {
void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) {
asm volatile (
- ".p2align 2 \n"
+ ".p2align 2 \n"
"1: \n"
"vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY.
"subs %2, %2, #16 \n" // 16 processed per loop.
@@ -1068,7 +1068,7 @@ void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) {
void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
int pix) {
asm volatile (
- ".p2align 2 \n"
+ ".p2align 2 \n"
"1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2.
"subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
@@ -1087,7 +1087,7 @@ void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
int pix) {
asm volatile (
- ".p2align 2 \n"
+ ".p2align 2 \n"
"1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY.
"subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
@@ -1107,7 +1107,7 @@ void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
uint8* dst_u, uint8* dst_v, int pix) {
asm volatile (
"add %1, %0, %1 \n" // stride + src_yuy2
- ".p2align 2 \n"
+ ".p2align 2 \n"
"1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2.
"subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
@@ -1131,7 +1131,7 @@ void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
uint8* dst_u, uint8* dst_v, int pix) {
asm volatile (
"add %1, %0, %1 \n" // stride + src_uyvy
- ".p2align 2 \n"
+ ".p2align 2 \n"
"1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY.
"subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
@@ -1193,6 +1193,23 @@ void ARGBToBayerRow_NEON(const uint8* src_argb, uint8* dst_bayer,
);
}
+// Select G channels from ARGB. e.g. GGGGGGGG
+void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer,
+ uint32 /*selector*/, int pix) {
+ asm volatile (
+ "1: \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load row 8 pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop
+ "vst1.8 {d1}, [%1]! \n" // store 8 G's.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_bayer), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "q0", "q1" // Clobber List
+ );
+}
+
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
const uint8* shuffler, int pix) {
@@ -1218,7 +1235,7 @@ void I422ToYUY2Row_NEON(const uint8* src_y,
const uint8* src_v,
uint8* dst_yuy2, int width) {
asm volatile (
- ".p2align 2 \n"
+ ".p2align 2 \n"
"1: \n"
"vld2.8 {d0, d2}, [%0]! \n" // load 16 Ys
"vld1.8 {d1}, [%1]! \n" // load 8 Us
@@ -1241,7 +1258,7 @@ void I422ToUYVYRow_NEON(const uint8* src_y,
const uint8* src_v,
uint8* dst_uyvy, int width) {
asm volatile (
- ".p2align 2 \n"
+ ".p2align 2 \n"
"1: \n"
"vld2.8 {d1, d3}, [%0]! \n" // load 16 Ys
"vld1.8 {d0}, [%1]! \n" // load 8 Us
@@ -1261,7 +1278,7 @@ void I422ToUYVYRow_NEON(const uint8* src_y,
void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) {
asm volatile (
- ".p2align 2 \n"
+ ".p2align 2 \n"
"1: \n"
"vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
"subs %2, %2, #8 \n" // 8 processed per loop.
@@ -1279,7 +1296,7 @@ void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) {
void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555,
int pix) {
asm volatile (
- ".p2align 2 \n"
+ ".p2align 2 \n"
"1: \n"
"vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
"subs %2, %2, #8 \n" // 8 processed per loop.
@@ -1298,7 +1315,7 @@ void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444,
int pix) {
asm volatile (
"vmov.u8 d4, #0x0f \n" // bits to clear with vbic.
- ".p2align 2 \n"
+ ".p2align 2 \n"
"1: \n"
"vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
"subs %2, %2, #8 \n" // 8 processed per loop.
@@ -1319,7 +1336,7 @@ void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
"vmov.u8 d25, #65 \n" // G * 0.5078 coefficient
"vmov.u8 d26, #33 \n" // R * 0.2578 coefficient
"vmov.u8 d27, #16 \n" // Add 16 constant
- ".p2align 2 \n"
+ ".p2align 2 \n"
"1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
@@ -1343,7 +1360,7 @@ void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
"vmov.u8 d24, #15 \n" // B * 0.11400 coefficient
"vmov.u8 d25, #75 \n" // G * 0.58700 coefficient
"vmov.u8 d26, #38 \n" // R * 0.29900 coefficient
- ".p2align 2 \n"
+ ".p2align 2 \n"
"1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
@@ -1371,7 +1388,7 @@ void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
"vmov.u8 d27, #18 \n" // VB -0.1406 coefficient
"vmov.u8 d28, #94 \n" // VG -0.7344 coefficient
"vmov.u16 q15, #0x8080 \n" // 128.5
- ".p2align 2 \n"
+ ".p2align 2 \n"
"1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
"subs %3, %3, #8 \n" // 8 processed per loop.
@@ -1410,7 +1427,7 @@ void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
"vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
"vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
"vmov.u16 q15, #0x8080 \n" // 128.5
- ".p2align 2 \n"
+ ".p2align 2 \n"
"1: \n"
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
@@ -1456,7 +1473,7 @@ void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
"vmov.s16 q13, #18 / 4 \n" // VB -0.1406 coefficient
"vmov.s16 q14, #94 / 4 \n" // VG -0.7344 coefficient
"vmov.u16 q15, #0x8080 \n" // 128.5
- ".p2align 2 \n"
+ ".p2align 2 \n"
"1: \n"
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
@@ -1521,7 +1538,7 @@ void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
"vmov.s16 q13, #18 / 4 \n" // VB -0.1406 coefficient
"vmov.s16 q14, #94 / 4 \n" // VG -0.7344 coefficient
"vmov.u16 q15, #0x8080 \n" // 128.5
- ".p2align 2 \n"
+ ".p2align 2 \n"
"1: \n"
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
@@ -1560,7 +1577,7 @@ void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb,
"vmov.s16 q13, #20 / 4 \n" // VB -0.08131 coefficient
"vmov.s16 q14, #107 / 4 \n" // VG -0.41869 coefficient
"vmov.u16 q15, #0x8080 \n" // 128.5
- ".p2align 2 \n"
+ ".p2align 2 \n"
"1: \n"
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
@@ -1598,7 +1615,7 @@ void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra,
"vmov.s16 q13, #18 / 4 \n" // VB -0.1406 coefficient
"vmov.s16 q14, #94 / 4 \n" // VG -0.7344 coefficient
"vmov.u16 q15, #0x8080 \n" // 128.5
- ".p2align 2 \n"
+ ".p2align 2 \n"
"1: \n"
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 BGRA pixels.
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 BGRA pixels.
@@ -1636,7 +1653,7 @@ void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr,
"vmov.s16 q13, #18 / 4 \n" // VB -0.1406 coefficient
"vmov.s16 q14, #94 / 4 \n" // VG -0.7344 coefficient
"vmov.u16 q15, #0x8080 \n" // 128.5
- ".p2align 2 \n"
+ ".p2align 2 \n"
"1: \n"
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ABGR pixels.
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ABGR pixels.
@@ -1674,7 +1691,7 @@ void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba,
"vmov.s16 q13, #18 / 4 \n" // VB -0.1406 coefficient
"vmov.s16 q14, #94 / 4 \n" // VG -0.7344 coefficient
"vmov.u16 q15, #0x8080 \n" // 128.5
- ".p2align 2 \n"
+ ".p2align 2 \n"
"1: \n"
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 RGBA pixels.
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 RGBA pixels.
@@ -1712,7 +1729,7 @@ void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24,
"vmov.s16 q13, #18 / 4 \n" // VB -0.1406 coefficient
"vmov.s16 q14, #94 / 4 \n" // VG -0.7344 coefficient
"vmov.u16 q15, #0x8080 \n" // 128.5
- ".p2align 2 \n"
+ ".p2align 2 \n"
"1: \n"
"vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB24 pixels.
"vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RGB24 pixels.
@@ -1750,7 +1767,7 @@ void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw,
"vmov.s16 q13, #18 / 4 \n" // VB -0.1406 coefficient
"vmov.s16 q14, #94 / 4 \n" // VG -0.7344 coefficient
"vmov.u16 q15, #0x8080 \n" // 128.5
- ".p2align 2 \n"
+ ".p2align 2 \n"
"1: \n"
"vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RAW pixels.
"vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RAW pixels.
@@ -1789,7 +1806,7 @@ void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565,
"vmov.s16 q13, #18 / 4 \n" // VB -0.1406 coefficient
"vmov.s16 q14, #94 / 4 \n" // VG -0.7344 coefficient
"vmov.u16 q15, #0x8080 \n" // 128.5
- ".p2align 2 \n"
+ ".p2align 2 \n"
"1: \n"
"vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
RGB565TOARGB
@@ -1849,7 +1866,7 @@ void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555,
"vmov.s16 q13, #18 / 4 \n" // VB -0.1406 coefficient
"vmov.s16 q14, #94 / 4 \n" // VG -0.7344 coefficient
"vmov.u16 q15, #0x8080 \n" // 128.5
- ".p2align 2 \n"
+ ".p2align 2 \n"
"1: \n"
"vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
RGB555TOARGB
@@ -1909,7 +1926,7 @@ void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444,
"vmov.s16 q13, #18 / 4 \n" // VB -0.1406 coefficient
"vmov.s16 q14, #94 / 4 \n" // VG -0.7344 coefficient
"vmov.u16 q15, #0x8080 \n" // 128.5
- ".p2align 2 \n"
+ ".p2align 2 \n"
"1: \n"
"vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
ARGB4444TOARGB
@@ -1964,7 +1981,7 @@ void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix) {
"vmov.u8 d25, #65 \n" // G * 0.5078 coefficient
"vmov.u8 d26, #33 \n" // R * 0.2578 coefficient
"vmov.u8 d27, #16 \n" // Add 16 constant
- ".p2align 2 \n"
+ ".p2align 2 \n"
"1: \n"
"vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
@@ -1990,7 +2007,7 @@ void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix) {
"vmov.u8 d25, #65 \n" // G * 0.5078 coefficient
"vmov.u8 d26, #33 \n" // R * 0.2578 coefficient
"vmov.u8 d27, #16 \n" // Add 16 constant
- ".p2align 2 \n"
+ ".p2align 2 \n"
"1: \n"
"vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
@@ -2016,7 +2033,7 @@ void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix) {
"vmov.u8 d25, #65 \n" // G * 0.5078 coefficient
"vmov.u8 d26, #33 \n" // R * 0.2578 coefficient
"vmov.u8 d27, #16 \n" // Add 16 constant
- ".p2align 2 \n"
+ ".p2align 2 \n"
"1: \n"
"vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
@@ -2042,7 +2059,7 @@ void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix) {
"vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
"vmov.u8 d6, #13 \n" // B * 0.1016 coefficient
"vmov.u8 d7, #16 \n" // Add 16 constant
- ".p2align 2 \n"
+ ".p2align 2 \n"
"1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of BGRA.
"subs %2, %2, #8 \n" // 8 processed per loop.
@@ -2067,7 +2084,7 @@ void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix) {
"vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
"vmov.u8 d6, #13 \n" // B * 0.1016 coefficient
"vmov.u8 d7, #16 \n" // Add 16 constant
- ".p2align 2 \n"
+ ".p2align 2 \n"
"1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ABGR.
"subs %2, %2, #8 \n" // 8 processed per loop.
@@ -2092,7 +2109,7 @@ void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix) {
"vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
"vmov.u8 d6, #33 \n" // R * 0.2578 coefficient
"vmov.u8 d7, #16 \n" // Add 16 constant
- ".p2align 2 \n"
+ ".p2align 2 \n"
"1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of RGBA.
"subs %2, %2, #8 \n" // 8 processed per loop.
@@ -2117,7 +2134,7 @@ void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix) {
"vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
"vmov.u8 d6, #33 \n" // R * 0.2578 coefficient
"vmov.u8 d7, #16 \n" // Add 16 constant
- ".p2align 2 \n"
+ ".p2align 2 \n"
"1: \n"
"vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RGB24.
"subs %2, %2, #8 \n" // 8 processed per loop.
@@ -2142,7 +2159,7 @@ void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) {
"vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
"vmov.u8 d6, #13 \n" // B * 0.1016 coefficient
"vmov.u8 d7, #16 \n" // Add 16 constant
- ".p2align 2 \n"
+ ".p2align 2 \n"
"1: \n"
"vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RAW.
"subs %2, %2, #8 \n" // 8 processed per loop.
@@ -2412,7 +2429,7 @@ void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
"vmov.u8 d24, #15 \n" // B * 0.11400 coefficient
"vmov.u8 d25, #75 \n" // G * 0.58700 coefficient
"vmov.u8 d26, #38 \n" // R * 0.29900 coefficient
- ".p2align 2 \n"
+ ".p2align 2 \n"
"1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
@@ -2447,7 +2464,7 @@ void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
"vmov.u8 d28, #24 \n" // BB coefficient
"vmov.u8 d29, #98 \n" // BG coefficient
"vmov.u8 d30, #50 \n" // BR coefficient
- ".p2align 2 \n"
+ ".p2align 2 \n"
"1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0] \n" // load 8 ARGB pixels.
"subs %1, %1, #8 \n" // 8 processed per loop.
@@ -2474,18 +2491,19 @@ void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
}
// Transform 8 ARGB pixels (32 bytes) with color matrix.
-// Same as Sepia except matrix is provided.
-void ARGBColorMatrixRow_NEON(uint8* dst_argb, const int8* matrix_argb,
- int width) {
+// TODO(fbarchard): Was same as Sepia except matrix is provided. This function
+// needs to saturate. Consider doing a non-saturating version.
+void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
+ const int8* matrix_argb, int width) {
asm volatile (
- "vld1.8 {q2}, [%2] \n" // load 3 ARGB vectors.
+ "vld1.8 {q2}, [%3] \n" // load 3 ARGB vectors.
"vmovl.s8 q0, d4 \n" // B,G coefficients s16.
- "vmovl.s8 q1, d5 \n" // R coefficients s16.
+ "vmovl.s8 q1, d5 \n" // R,A coefficients s16.
- ".p2align 2 \n"
+ ".p2align 2 \n"
"1: \n"
- "vld4.8 {d16, d18, d20, d22}, [%0] \n" // load 8 ARGB pixels.
- "subs %1, %1, #8 \n" // 8 processed per loop.
+ "vld4.8 {d16, d18, d20, d22}, [%0]! \n" // load 8 ARGB pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
"vmovl.u8 q8, d16 \n" // b (0 .. 255) 16 bit
"vmovl.u8 q9, d18 \n" // g
"vmovl.u8 q10, d20 \n" // r
@@ -2493,33 +2511,42 @@ void ARGBColorMatrixRow_NEON(uint8* dst_argb, const int8* matrix_argb,
"vmul.s16 q12, q8, d0[0] \n" // B = B * Matrix B
"vmul.s16 q13, q8, d1[0] \n" // G = B * Matrix G
"vmul.s16 q14, q8, d2[0] \n" // R = B * Matrix R
+ "vmul.s16 q15, q8, d3[0] \n" // A = B * Matrix A
"vmul.s16 q4, q9, d0[1] \n" // B += G * Matrix B
"vmul.s16 q5, q9, d1[1] \n" // G += G * Matrix G
"vmul.s16 q6, q9, d2[1] \n" // R += G * Matrix R
+ "vmul.s16 q7, q9, d3[1] \n" // A += G * Matrix A
"vqadd.s16 q12, q12, q4 \n" // Accumulate B
"vqadd.s16 q13, q13, q5 \n" // Accumulate G
"vqadd.s16 q14, q14, q6 \n" // Accumulate R
+ "vqadd.s16 q15, q15, q7 \n" // Accumulate A
"vmul.s16 q4, q10, d0[2] \n" // B += R * Matrix B
"vmul.s16 q5, q10, d1[2] \n" // G += R * Matrix G
"vmul.s16 q6, q10, d2[2] \n" // R += R * Matrix R
+ "vmul.s16 q7, q10, d3[2] \n" // A += R * Matrix A
"vqadd.s16 q12, q12, q4 \n" // Accumulate B
"vqadd.s16 q13, q13, q5 \n" // Accumulate G
"vqadd.s16 q14, q14, q6 \n" // Accumulate R
+ "vqadd.s16 q15, q15, q7 \n" // Accumulate A
"vmul.s16 q4, q15, d0[3] \n" // B += A * Matrix B
"vmul.s16 q5, q15, d1[3] \n" // G += A * Matrix G
"vmul.s16 q6, q15, d2[3] \n" // R += A * Matrix R
+ "vmul.s16 q7, q15, d3[3] \n" // A += A * Matrix A
"vqadd.s16 q12, q12, q4 \n" // Accumulate B
"vqadd.s16 q13, q13, q5 \n" // Accumulate G
"vqadd.s16 q14, q14, q6 \n" // Accumulate R
- "vqshrun.s16 d16, q12, #7 \n" // 16 bit to 8 bit B
- "vqshrun.s16 d18, q13, #7 \n" // 16 bit to 8 bit G
- "vqshrun.s16 d20, q14, #7 \n" // 16 bit to 8 bit R
- "vst4.8 {d16, d18, d20, d22}, [%0]! \n" // store 8 ARGB pixels.
- "bgt 1b \n"
- : "+r"(dst_argb), // %0
- "+r"(width) // %1
- : "r"(matrix_argb) // %2
- : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q8", "q9",
+ "vqadd.s16 q15, q15, q7 \n" // Accumulate A
+ "vqshrun.s16 d16, q12, #6 \n" // 16 bit to 8 bit B
+ "vqshrun.s16 d18, q13, #6 \n" // 16 bit to 8 bit G
+ "vqshrun.s16 d20, q14, #6 \n" // 16 bit to 8 bit R
+ "vqshrun.s16 d22, q15, #6 \n" // 16 bit to 8 bit A
+ "vst4.8 {d16, d18, d20, d22}, [%1]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(matrix_argb) // %3
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9",
"q10", "q11", "q12", "q13", "q14", "q15"
);
}
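For reference, here is a scalar sketch (illustrative only, not part of the patch) of what the rewritten ARGBColorMatrixRow_NEON computes, assuming libyuv's uint8/int8 map to uint8_t/int8_t and that matrix_argb holds four rows of four signed coefficients, one row per output channel, as the d0[..]..d3[..] lane indexing suggests. The vqadd/vqshrun pairs correspond to the saturation and the 6-bit downscale; intermediate 16-bit saturation is ignored here, which only matters for extreme coefficients.

#include <stdint.h>

static uint8_t Clamp255(int v) {
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

// Each output channel is a dot product of the input B,G,R,A with one row of
// the 4x4 signed matrix, scaled down by 6 bits and saturated to 0..255.
void ARGBColorMatrixRow_Sketch(const uint8_t* src_argb, uint8_t* dst_argb,
                               const int8_t* matrix_argb, int width) {
  for (int x = 0; x < width; ++x) {
    int b = src_argb[0], g = src_argb[1], r = src_argb[2], a = src_argb[3];
    for (int c = 0; c < 4; ++c) {  // output B, G, R, A in turn.
      const int8_t* m = matrix_argb + c * 4;
      dst_argb[c] = Clamp255((b * m[0] + g * m[1] + r * m[2] + a * m[3]) >> 6);
    }
    src_argb += 4;
    dst_argb += 4;
  }
}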
@@ -2531,7 +2558,7 @@ void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width) {
asm volatile (
// 8 pixel loop.
- ".p2align 2 \n"
+ ".p2align 2 \n"
"1: \n"
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
"vld4.8 {d1, d3, d5, d7}, [%1]! \n" // load 8 more ARGB pixels.
@@ -2562,7 +2589,7 @@ void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width) {
asm volatile (
// 8 pixel loop.
- ".p2align 2 \n"
+ ".p2align 2 \n"
"1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
"vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB pixels.
@@ -2586,7 +2613,7 @@ void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width) {
asm volatile (
// 8 pixel loop.
- ".p2align 2 \n"
+ ".p2align 2 \n"
"1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
"vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB pixels.
@@ -2615,7 +2642,7 @@ void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
asm volatile (
"vmov.u8 d3, #255 \n" // alpha
// 8 pixel loop.
- ".p2align 2 \n"
+ ".p2align 2 \n"
"1: \n"
"vld1.8 {d0}, [%0]! \n" // load 8 sobelx.
"vld1.8 {d1}, [%1]! \n" // load 8 sobely.
@@ -2627,8 +2654,30 @@ void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
"bgt 1b \n"
: "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1
- "+r"(dst_argb), // %2
- "+r"(width) // %3
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "q0", "q1"
+ );
+}
+
+// Adds Sobel X and Sobel Y and stores Sobel into plane.
+void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
+ uint8* dst_y, int width) {
+ asm volatile (
+ // 16 pixel loop.
+ ".p2align 2 \n"
+ "1: \n"
+ "vld1.8 {q0}, [%0]! \n" // load 16 sobelx.
+ "vld1.8 {q1}, [%1]! \n" // load 16 sobely.
+ "subs %3, %3, #16 \n" // 16 processed per loop.
+ "vqadd.u8 q0, q0, q1 \n" // add
+ "vst1.8 {q0}, [%2]! \n" // store 16 pixels.
+ "bgt 1b \n"
+ : "+r"(src_sobelx), // %0
+ "+r"(src_sobely), // %1
+ "+r"(dst_y), // %2
+ "+r"(width) // %3
:
: "cc", "memory", "q0", "q1"
);
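A scalar sketch of the new SobelToPlaneRow_NEON (assuming uint8_t for libyuv's uint8): the X and Y gradient magnitudes are combined with a saturating add, matching the vqadd.u8, and written out as a single gray plane rather than being replicated into ARGB as SobelRow does.

#include <stdint.h>

void SobelToPlaneRow_Sketch(const uint8_t* src_sobelx,
                            const uint8_t* src_sobely,
                            uint8_t* dst_y, int width) {
  for (int i = 0; i < width; ++i) {
    int s = src_sobelx[i] + src_sobely[i];  // combine gradient magnitudes.
    dst_y[i] = (uint8_t)(s > 255 ? 255 : s);  // saturate like vqadd.u8.
  }
}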
@@ -2644,7 +2693,7 @@ void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
asm volatile (
"vmov.u8 d3, #255 \n" // alpha
// 8 pixel loop.
- ".p2align 2 \n"
+ ".p2align 2 \n"
"1: \n"
"vld1.8 {d2}, [%0]! \n" // load 8 sobelx.
"vld1.8 {d0}, [%1]! \n" // load 8 sobely.
@@ -2668,7 +2717,7 @@ void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
const uint8* src_y2, uint8* dst_sobelx, int width) {
asm volatile (
- ".p2align 2 \n"
+ ".p2align 2 \n"
"1: \n"
"vld1.8 {d0}, [%0],%5 \n" // top
"vld1.8 {d1}, [%0],%6 \n"
@@ -2705,7 +2754,7 @@ void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
uint8* dst_sobely, int width) {
asm volatile (
- ".p2align 2 \n"
+ ".p2align 2 \n"
"1: \n"
"vld1.8 {d0}, [%0],%4 \n" // left
"vld1.8 {d1}, [%1],%4 \n"
diff --git a/chromium/third_party/libyuv/source/row_posix.cc b/chromium/third_party/libyuv/source/row_posix.cc
index b92a9f5c13b..539d871535b 100644
--- a/chromium/third_party/libyuv/source/row_posix.cc
+++ b/chromium/third_party/libyuv/source/row_posix.cc
@@ -10,155 +10,292 @@
#include "libyuv/row.h"
-#include "libyuv/basic_types.h"
-
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
-// This module is for GCC x86 and x64
+// This module is for GCC x86 and x64.
#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
-// GCC 4.2 on OSX has link error when passing static or const to inline.
-// TODO(fbarchard): Use static const when gcc 4.2 support is dropped.
-#ifdef __APPLE__
-#define CONST
+// TODO(nfullagar): For Native Client: When new toolchain becomes available,
+// take advantage of bundle lock / unlock feature. This will reduce the amount
+// of manual bundle alignment done below, and bundle alignment could even be
+// moved into each macro that doesn't use %%nacl: such as MEMOPREG. Consider
+// unmunging functions to reduce complex addressing modes.
+
+#if defined(__native_client__) && defined(__x86_64__)
+#define MEMACCESS(base) "%%nacl:(%%r15,%q" #base ")"
+#define MEMACCESS2(offset, base) "%%nacl:" #offset "(%%r15,%q" #base ")"
+#define MEMLEA(offset, base) #offset "(%q" #base ")"
+#define MEMLEA4(offset, base, index, scale) \
+ #offset "(%q" #base ",%q" #index "," #scale ")"
+#define MEMMOVESTRING(s, d) "%%nacl:(%q" #s "),%%nacl:(%q" #d "), %%r15"
+#define MEMSTORESTRING(reg, d) "%%" #reg ",%%nacl:(%q" #d "), %%r15"
+#define MEMOPREG(opcode, offset, base, index, scale, reg) \
+ "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \
+ #opcode " (%%r15,%%r14),%%" #reg "\n"
+#define MEMOPMEM(opcode, reg, offset, base, index, scale) \
+ "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \
+ #opcode " %%" #reg ",(%%r15,%%r14)\n"
+#define MEMOP(opcode, offset, base, index, scale) \
+ "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \
+ #opcode " (%%r15,%%r14)"
+#define BUNDLEALIGN ".p2align 5\n"
#else
-#define CONST static const
+#define MEMACCESS(base) "(%" #base ")"
+#define MEMACCESS2(offset, base) #offset "(%" #base ")"
+#define MEMLEA(offset, base) #offset "(%" #base ")"
+#define MEMLEA4(offset, base, index, scale) \
+ #offset "(%" #base ",%" #index "," #scale ")"
+#define MEMMOVESTRING(s, d)
+#define MEMSTORESTRING(reg, d)
+#define MEMOPREG(opcode, offset, base, index, scale, reg) \
+ #opcode " " #offset "(%" #base ",%" #index "," #scale "),%%" #reg "\n"
+#define MEMOPMEM(opcode, reg, offset, base, index, scale) \
+ #opcode " %%" #reg ","#offset "(%" #base ",%" #index "," #scale ")\n"
+#define MEMOP(opcode, offset, base, index, scale) \
+ #opcode " " #offset "(%" #base ",%" #index "," #scale ")"
+#define BUNDLEALIGN
#endif
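To make the macro block above concrete, this standalone snippet (not part of the patch) reproduces the non-NaCl macros and prints what a few of the rewritten operands expand to. Under __native_client__ on x86-64 the same macros instead emit %nacl-prefixed accesses based off r15 with r14 as scratch, which is why the converted functions below add "r14" to their clobber lists.

#include <stdio.h>

// Mirrors of the patch's non-sandboxed macros, for illustration only.
#define MEMACCESS(base) "(%" #base ")"
#define MEMACCESS2(offset, base) #offset "(%" #base ")"
#define MEMLEA(offset, base) #offset "(%" #base ")"

int main(void) {
  // String pasting turns the macro calls into ordinary AT&T operands.
  puts("movdqa " MEMACCESS2(0x10,0) ",%%xmm1");  // movdqa 0x10(%0),%%xmm1
  puts("lea " MEMLEA(0x40,0) ",%0");             // lea 0x40(%0),%0
  puts("movq " MEMACCESS(0) ",%%xmm0");          // movq (%0),%%xmm0
  return 0;
}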
-#ifdef HAS_ARGBTOYROW_SSSE3
+#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
// Constants for ARGB
-CONST vec8 kARGBToY = {
+static vec8 kARGBToY = {
13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
};
// JPeg full range.
-CONST vec8 kARGBToYJ = {
+static vec8 kARGBToYJ = {
15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0
};
+#endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
+
+#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
-CONST vec8 kARGBToU = {
+static vec8 kARGBToU = {
112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
};
-CONST vec8 kARGBToUJ = {
+static vec8 kARGBToUJ = {
127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0
};
-CONST vec8 kARGBToV = {
+static vec8 kARGBToV = {
-18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
};
-CONST vec8 kARGBToVJ = {
+static vec8 kARGBToVJ = {
-20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0
};
// Constants for BGRA
-CONST vec8 kBGRAToY = {
+static vec8 kBGRAToY = {
0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
};
-CONST vec8 kBGRAToU = {
+static vec8 kBGRAToU = {
0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
};
-CONST vec8 kBGRAToV = {
+static vec8 kBGRAToV = {
0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
};
// Constants for ABGR
-CONST vec8 kABGRToY = {
+static vec8 kABGRToY = {
33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
};
-CONST vec8 kABGRToU = {
+static vec8 kABGRToU = {
-38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
};
-CONST vec8 kABGRToV = {
+static vec8 kABGRToV = {
112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
};
// Constants for RGBA.
-CONST vec8 kRGBAToY = {
+static vec8 kRGBAToY = {
0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33
};
-CONST vec8 kRGBAToU = {
+static vec8 kRGBAToU = {
0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38
};
-CONST vec8 kRGBAToV = {
+static vec8 kRGBAToV = {
0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112
};
-CONST uvec8 kAddY16 = {
+static uvec8 kAddY16 = {
16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
};
-CONST vec16 kAddYJ64 = {
+static vec16 kAddYJ64 = {
64, 64, 64, 64, 64, 64, 64, 64
};
-CONST uvec8 kAddUV128 = {
+static uvec8 kAddUV128 = {
128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
};
-CONST uvec16 kAddUVJ128 = {
+static uvec16 kAddUVJ128 = {
0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u
};
+#endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
+
+#ifdef HAS_RGB24TOARGBROW_SSSE3
// Shuffle table for converting RGB24 to ARGB.
-CONST uvec8 kShuffleMaskRGB24ToARGB = {
+static uvec8 kShuffleMaskRGB24ToARGB = {
0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
};
// Shuffle table for converting RAW to ARGB.
-CONST uvec8 kShuffleMaskRAWToARGB = {
+static uvec8 kShuffleMaskRAWToARGB = {
2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
};
// Shuffle table for converting ARGB to RGB24.
-CONST uvec8 kShuffleMaskARGBToRGB24 = {
+static uvec8 kShuffleMaskARGBToRGB24 = {
0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
};
// Shuffle table for converting ARGB to RAW.
-CONST uvec8 kShuffleMaskARGBToRAW = {
+static uvec8 kShuffleMaskARGBToRAW = {
2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
};
// Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4
-CONST uvec8 kShuffleMaskARGBToRGB24_0 = {
+static uvec8 kShuffleMaskARGBToRGB24_0 = {
0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u
};
// Shuffle table for converting ARGB to RAW.
-CONST uvec8 kShuffleMaskARGBToRAW_0 = {
+static uvec8 kShuffleMaskARGBToRAW_0 = {
2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u
};
+#endif // HAS_RGB24TOARGBROW_SSSE3
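The shuffle tables above drive pshufb. As a reminder of the semantics they rely on, here is a scalar model (illustration only): each mask byte selects a source byte by its low four bits, and a mask byte with the high bit set (the 128u entries) zeroes that destination byte, which is how the ARGB-to-RGB24/RAW tables blank the unused lanes.

#include <stdint.h>

// Scalar model of the SSSE3 pshufb instruction on one 16-byte register.
void Pshufb128_Sketch(const uint8_t src[16], const uint8_t mask[16],
                      uint8_t dst[16]) {
  for (int i = 0; i < 16; ++i) {
    dst[i] = (mask[i] & 0x80) ? 0 : src[mask[i] & 0x0f];
  }
}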
+
+#if defined(TESTING) && defined(__x86_64__)
+void TestRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
+ asm volatile (
+ ".p2align 5 \n"
+ "mov %%eax,%%eax \n"
+ "mov %%ebx,%%ebx \n"
+ "mov %%ecx,%%ecx \n"
+ "mov %%edx,%%edx \n"
+ "mov %%esi,%%esi \n"
+ "mov %%edi,%%edi \n"
+ "mov %%ebp,%%ebp \n"
+ "mov %%esp,%%esp \n"
+ ".p2align 5 \n"
+ "mov %%r8d,%%r8d \n"
+ "mov %%r9d,%%r9d \n"
+ "mov %%r10d,%%r10d \n"
+ "mov %%r11d,%%r11d \n"
+ "mov %%r12d,%%r12d \n"
+ "mov %%r13d,%%r13d \n"
+ "mov %%r14d,%%r14d \n"
+ "mov %%r15d,%%r15d \n"
+ ".p2align 5 \n"
+ "lea (%%rax),%%eax \n"
+ "lea (%%rbx),%%ebx \n"
+ "lea (%%rcx),%%ecx \n"
+ "lea (%%rdx),%%edx \n"
+ "lea (%%rsi),%%esi \n"
+ "lea (%%rdi),%%edi \n"
+ "lea (%%rbp),%%ebp \n"
+ "lea (%%rsp),%%esp \n"
+ ".p2align 5 \n"
+ "lea (%%r8),%%r8d \n"
+ "lea (%%r9),%%r9d \n"
+ "lea (%%r10),%%r10d \n"
+ "lea (%%r11),%%r11d \n"
+ "lea (%%r12),%%r12d \n"
+ "lea (%%r13),%%r13d \n"
+ "lea (%%r14),%%r14d \n"
+ "lea (%%r15),%%r15d \n"
+
+ ".p2align 5 \n"
+ "lea 0x10(%%rax),%%eax \n"
+ "lea 0x10(%%rbx),%%ebx \n"
+ "lea 0x10(%%rcx),%%ecx \n"
+ "lea 0x10(%%rdx),%%edx \n"
+ "lea 0x10(%%rsi),%%esi \n"
+ "lea 0x10(%%rdi),%%edi \n"
+ "lea 0x10(%%rbp),%%ebp \n"
+ "lea 0x10(%%rsp),%%esp \n"
+ ".p2align 5 \n"
+ "lea 0x10(%%r8),%%r8d \n"
+ "lea 0x10(%%r9),%%r9d \n"
+ "lea 0x10(%%r10),%%r10d \n"
+ "lea 0x10(%%r11),%%r11d \n"
+ "lea 0x10(%%r12),%%r12d \n"
+ "lea 0x10(%%r13),%%r13d \n"
+ "lea 0x10(%%r14),%%r14d \n"
+ "lea 0x10(%%r15),%%r15d \n"
+
+ ".p2align 5 \n"
+ "add 0x10,%%eax \n"
+ "add 0x10,%%ebx \n"
+ "add 0x10,%%ecx \n"
+ "add 0x10,%%edx \n"
+ "add 0x10,%%esi \n"
+ "add 0x10,%%edi \n"
+ "add 0x10,%%ebp \n"
+ "add 0x10,%%esp \n"
+ ".p2align 5 \n"
+ "add 0x10,%%r8d \n"
+ "add 0x10,%%r9d \n"
+ "add 0x10,%%r10d \n"
+ "add 0x10,%%r11d \n"
+ "add 0x10,%%r12d \n"
+ "add 0x10,%%r13d \n"
+ "add 0x10,%%r14d \n"
+ "add 0x10,%%r15d \n"
+
+ ".p2align 2 \n"
+ "1: \n"
+ "movq " MEMACCESS(0) ",%%xmm0 \n"
+ "lea " MEMLEA(0x8,0) ",%0 \n"
+ "movdqa %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(dst_argb), // %1
+ "+r"(pix) // %2
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm5"
+#endif
+ );
+}
+#endif // TESTING
+#ifdef HAS_I400TOARGBROW_SSE2
void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"pslld $0x18,%%xmm5 \n"
- ".p2align 4 \n"
+ ".p2align 2 \n"
"1: \n"
- "movq (%0),%%xmm0 \n"
- "lea 0x8(%0),%0 \n"
+ "movq " MEMACCESS(0) ",%%xmm0 \n"
+ "lea " MEMLEA(0x8,0) ",%0 \n"
"punpcklbw %%xmm0,%%xmm0 \n"
"movdqa %%xmm0,%%xmm1 \n"
"punpcklwd %%xmm0,%%xmm0 \n"
"punpckhwd %%xmm1,%%xmm1 \n"
"por %%xmm5,%%xmm0 \n"
"por %%xmm5,%%xmm1 \n"
- "movdqa %%xmm0,(%1) \n"
- "movdqa %%xmm1,0x10(%1) \n"
- "lea 0x20(%1),%1 \n"
+ "movdqa %%xmm0," MEMACCESS(1) " \n"
+ "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
"sub $0x8,%2 \n"
"jg 1b \n"
: "+r"(src_y), // %0
@@ -177,19 +314,19 @@ void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb,
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"pslld $0x18,%%xmm5 \n"
- ".p2align 4 \n"
+ ".p2align 2 \n"
"1: \n"
- "movq (%0),%%xmm0 \n"
- "lea 0x8(%0),%0 \n"
+ "movq " MEMACCESS(0) ",%%xmm0 \n"
+ "lea " MEMLEA(0x8,0) ",%0 \n"
"punpcklbw %%xmm0,%%xmm0 \n"
"movdqa %%xmm0,%%xmm1 \n"
"punpcklwd %%xmm0,%%xmm0 \n"
"punpckhwd %%xmm1,%%xmm1 \n"
"por %%xmm5,%%xmm0 \n"
"por %%xmm5,%%xmm1 \n"
- "movdqu %%xmm0,(%1) \n"
- "movdqu %%xmm1,0x10(%1) \n"
- "lea 0x20(%1),%1 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
"sub $0x8,%2 \n"
"jg 1b \n"
: "+r"(src_y), // %0
@@ -202,36 +339,39 @@ void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb,
#endif
);
}
+#endif // HAS_I400TOARGBROW_SSE2
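A scalar sketch of what I400ToARGBRow_SSE2 produces (assuming uint8_t for libyuv's uint8): each gray Y sample is replicated into B, G and R and the alpha byte is forced to 0xff. The punpcklbw/punpcklwd unpacks do the replication and the por with xmm5 (0xff000000 per pixel) sets alpha.

#include <stdint.h>

void I400ToARGBRow_Sketch(const uint8_t* src_y, uint8_t* dst_argb, int pix) {
  for (int i = 0; i < pix; ++i) {
    uint8_t y = src_y[i];
    dst_argb[0] = y;     // B
    dst_argb[1] = y;     // G
    dst_argb[2] = y;     // R
    dst_argb[3] = 0xff;  // A
    dst_argb += 4;
  }
}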
+#ifdef HAS_RGB24TOARGBROW_SSSE3
void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
"pslld $0x18,%%xmm5 \n"
"movdqa %3,%%xmm4 \n"
- ".p2align 4 \n"
+ ".p2align 2 \n"
+ BUNDLEALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm3 \n"
- "lea 0x30(%0),%0 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm3 \n"
+ "lea " MEMLEA(0x30,0) ",%0 \n"
"movdqa %%xmm3,%%xmm2 \n"
"palignr $0x8,%%xmm1,%%xmm2 \n"
"pshufb %%xmm4,%%xmm2 \n"
"por %%xmm5,%%xmm2 \n"
"palignr $0xc,%%xmm0,%%xmm1 \n"
"pshufb %%xmm4,%%xmm0 \n"
- "movdqa %%xmm2,0x20(%1) \n"
+ "movdqa %%xmm2," MEMACCESS2(0x20,1) " \n"
"por %%xmm5,%%xmm0 \n"
"pshufb %%xmm4,%%xmm1 \n"
- "movdqa %%xmm0,(%1) \n"
+ "movdqa %%xmm0," MEMACCESS(1) " \n"
"por %%xmm5,%%xmm1 \n"
"palignr $0x4,%%xmm3,%%xmm3 \n"
"pshufb %%xmm4,%%xmm3 \n"
- "movdqa %%xmm1,0x10(%1) \n"
+ "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n"
"por %%xmm5,%%xmm3 \n"
"sub $0x10,%2 \n"
- "movdqa %%xmm3,0x30(%1) \n"
- "lea 0x40(%1),%1 \n"
+ "movdqa %%xmm3," MEMACCESS2(0x30,1) " \n"
+ "lea " MEMLEA(0x40,1) ",%1 \n"
"jg 1b \n"
: "+r"(src_rgb24), // %0
"+r"(dst_argb), // %1
@@ -249,30 +389,31 @@ void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
"pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
"pslld $0x18,%%xmm5 \n"
"movdqa %3,%%xmm4 \n"
- ".p2align 4 \n"
+ ".p2align 2 \n"
+ BUNDLEALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm3 \n"
- "lea 0x30(%0),%0 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm3 \n"
+ "lea " MEMLEA(0x30,0) ",%0 \n"
"movdqa %%xmm3,%%xmm2 \n"
"palignr $0x8,%%xmm1,%%xmm2 \n"
"pshufb %%xmm4,%%xmm2 \n"
"por %%xmm5,%%xmm2 \n"
"palignr $0xc,%%xmm0,%%xmm1 \n"
"pshufb %%xmm4,%%xmm0 \n"
- "movdqa %%xmm2,0x20(%1) \n"
+ "movdqa %%xmm2," MEMACCESS2(0x20,1) " \n"
"por %%xmm5,%%xmm0 \n"
"pshufb %%xmm4,%%xmm1 \n"
- "movdqa %%xmm0,(%1) \n"
+ "movdqa %%xmm0," MEMACCESS(1) " \n"
"por %%xmm5,%%xmm1 \n"
"palignr $0x4,%%xmm3,%%xmm3 \n"
"pshufb %%xmm4,%%xmm3 \n"
- "movdqa %%xmm1,0x10(%1) \n"
+ "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n"
"por %%xmm5,%%xmm3 \n"
"sub $0x10,%2 \n"
- "movdqa %%xmm3,0x30(%1) \n"
- "lea 0x40(%1),%1 \n"
+ "movdqa %%xmm3," MEMACCESS2(0x30,1) " \n"
+ "lea " MEMLEA(0x40,1) ",%1 \n"
"jg 1b \n"
: "+r"(src_raw), // %0
"+r"(dst_argb), // %1
@@ -302,9 +443,10 @@ void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
"psllw $0x8,%%xmm7 \n"
"sub %0,%1 \n"
"sub %0,%1 \n"
- ".p2align 4 \n"
+ ".p2align 2 \n"
+ BUNDLEALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqa %%xmm0,%%xmm1 \n"
"movdqa %%xmm0,%%xmm2 \n"
"pand %%xmm3,%%xmm1 \n"
@@ -319,9 +461,10 @@ void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
"movdqa %%xmm1,%%xmm2 \n"
"punpcklbw %%xmm0,%%xmm1 \n"
"punpckhbw %%xmm0,%%xmm2 \n"
- "movdqa %%xmm1,(%1,%0,2) \n"
- "movdqa %%xmm2,0x10(%1,%0,2) \n"
- "lea 0x10(%0),%0 \n"
+ BUNDLEALIGN
+ MEMOPMEM(movdqa,xmm1,0x00,1,0,2) // movdqa %%xmm1,(%1,%0,2)
+ MEMOPMEM(movdqa,xmm2,0x10,1,0,2) // movdqa %%xmm2,0x10(%1,%0,2)
+ "lea " MEMLEA(0x10,0) ",%0 \n"
"sub $0x8,%2 \n"
"jg 1b \n"
: "+r"(src), // %0
@@ -329,6 +472,9 @@ void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
"+r"(pix) // %2
:
: "memory", "cc", "eax"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14"
+#endif
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
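For reference, RGB565ToARGBRow_SSE2 above effectively expands the 5/6/5-bit fields to 8 bits by replicating each field's high bits into its low bits; a scalar sketch follows (illustrative only, uint8_t/uint16_t assumed for libyuv's uint8/uint16).

#include <stdint.h>

void RGB565ToARGBRow_Sketch(const uint8_t* src, uint8_t* dst_argb, int pix) {
  for (int i = 0; i < pix; ++i) {
    uint16_t p = (uint16_t)(src[0] | (src[1] << 8));  // little-endian 565.
    uint8_t b5 = p & 0x1f;
    uint8_t g6 = (p >> 5) & 0x3f;
    uint8_t r5 = (p >> 11) & 0x1f;
    dst_argb[0] = (uint8_t)((b5 << 3) | (b5 >> 2));  // B: replicate high bits.
    dst_argb[1] = (uint8_t)((g6 << 2) | (g6 >> 4));  // G
    dst_argb[2] = (uint8_t)((r5 << 3) | (r5 >> 2));  // R
    dst_argb[3] = 0xff;                              // A
    src += 2;
    dst_argb += 4;
  }
}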
@@ -351,9 +497,10 @@ void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
"psllw $0x8,%%xmm7 \n"
"sub %0,%1 \n"
"sub %0,%1 \n"
- ".p2align 4 \n"
+ ".p2align 2 \n"
+ BUNDLEALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqa %%xmm0,%%xmm1 \n"
"movdqa %%xmm0,%%xmm2 \n"
"psllw $0x1,%%xmm1 \n"
@@ -372,9 +519,10 @@ void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
"movdqa %%xmm1,%%xmm2 \n"
"punpcklbw %%xmm0,%%xmm1 \n"
"punpckhbw %%xmm0,%%xmm2 \n"
- "movdqa %%xmm1,(%1,%0,2) \n"
- "movdqa %%xmm2,0x10(%1,%0,2) \n"
- "lea 0x10(%0),%0 \n"
+ BUNDLEALIGN
+ MEMOPMEM(movdqa,xmm1,0x00,1,0,2) // movdqa %%xmm1,(%1,%0,2)
+ MEMOPMEM(movdqa,xmm2,0x10,1,0,2) // movdqa %%xmm2,0x10(%1,%0,2)
+ "lea " MEMLEA(0x10,0) ",%0 \n"
"sub $0x8,%2 \n"
"jg 1b \n"
: "+r"(src), // %0
@@ -382,6 +530,9 @@ void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
"+r"(pix) // %2
:
: "memory", "cc", "eax"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14"
+#endif
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
@@ -397,9 +548,10 @@ void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
"pslld $0x4,%%xmm5 \n"
"sub %0,%1 \n"
"sub %0,%1 \n"
- ".p2align 4 \n"
+ ".p2align 2 \n"
+ BUNDLEALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqa %%xmm0,%%xmm2 \n"
"pand %%xmm4,%%xmm0 \n"
"pand %%xmm5,%%xmm2 \n"
@@ -412,9 +564,10 @@ void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
"movdqa %%xmm0,%%xmm1 \n"
"punpcklbw %%xmm2,%%xmm0 \n"
"punpckhbw %%xmm2,%%xmm1 \n"
- "movdqa %%xmm0,(%1,%0,2) \n"
- "movdqa %%xmm1,0x10(%1,%0,2) \n"
- "lea 0x10(%0),%0 \n"
+ BUNDLEALIGN
+ MEMOPMEM(movdqa,xmm0,0x00,1,0,2) // movdqa %%xmm0,(%1,%0,2)
+ MEMOPMEM(movdqa,xmm1,0x10,1,0,2) // movdqa %%xmm1,0x10(%1,%0,2)
+ "lea " MEMLEA(0x10,0) ",%0 \n"
"sub $0x8,%2 \n"
"jg 1b \n"
: "+r"(src), // %0
@@ -422,6 +575,9 @@ void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
"+r"(pix) // %2
:
: "memory", "cc", "eax"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14"
+#endif
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
@@ -431,13 +587,14 @@ void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) {
asm volatile (
"movdqa %3,%%xmm6 \n"
- ".p2align 4 \n"
+ ".p2align 2 \n"
+ BUNDLEALIGN
"1: \n"
- "movdqa (%0),%%xmm0 \n"
- "movdqa 0x10(%0),%%xmm1 \n"
- "movdqa 0x20(%0),%%xmm2 \n"
- "movdqa 0x30(%0),%%xmm3 \n"
- "lea 0x40(%0),%0 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
+ "lea " MEMLEA(0x40,0) ",%0 \n"
"pshufb %%xmm6,%%xmm0 \n"
"pshufb %%xmm6,%%xmm1 \n"
"pshufb %%xmm6,%%xmm2 \n"
@@ -448,14 +605,14 @@ void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) {
"movdqa %%xmm2,%%xmm5 \n"
"por %%xmm4,%%xmm0 \n"
"pslldq $0x8,%%xmm5 \n"
- "movdqa %%xmm0,(%1) \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
"por %%xmm5,%%xmm1 \n"
"psrldq $0x8,%%xmm2 \n"
"pslldq $0x4,%%xmm3 \n"
"por %%xmm3,%%xmm2 \n"
- "movdqa %%xmm1,0x10(%1) \n"
- "movdqa %%xmm2,0x20(%1) \n"
- "lea 0x30(%1),%1 \n"
+ "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
+ "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n"
+ "lea " MEMLEA(0x30,1) ",%1 \n"
"sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src), // %0
@@ -472,13 +629,14 @@ void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) {
void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int pix) {
asm volatile (
"movdqa %3,%%xmm6 \n"
- ".p2align 4 \n"
+ ".p2align 2 \n"
+ BUNDLEALIGN
"1: \n"
- "movdqa (%0),%%xmm0 \n"
- "movdqa 0x10(%0),%%xmm1 \n"
- "movdqa 0x20(%0),%%xmm2 \n"
- "movdqa 0x30(%0),%%xmm3 \n"
- "lea 0x40(%0),%0 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
+ "lea " MEMLEA(0x40,0) ",%0 \n"
"pshufb %%xmm6,%%xmm0 \n"
"pshufb %%xmm6,%%xmm1 \n"
"pshufb %%xmm6,%%xmm2 \n"
@@ -489,14 +647,14 @@ void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int pix) {
"movdqa %%xmm2,%%xmm5 \n"
"por %%xmm4,%%xmm0 \n"
"pslldq $0x8,%%xmm5 \n"
- "movdqa %%xmm0,(%1) \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
"por %%xmm5,%%xmm1 \n"
"psrldq $0x8,%%xmm2 \n"
"pslldq $0x4,%%xmm3 \n"
"por %%xmm3,%%xmm2 \n"
- "movdqa %%xmm1,0x10(%1) \n"
- "movdqa %%xmm2,0x20(%1) \n"
- "lea 0x30(%1),%1 \n"
+ "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
+ "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n"
+ "lea " MEMLEA(0x30,1) ",%1 \n"
"sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src), // %0
@@ -519,9 +677,10 @@ void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int pix) {
"pslld $0x5,%%xmm4 \n"
"pcmpeqb %%xmm5,%%xmm5 \n"
"pslld $0xb,%%xmm5 \n"
- ".p2align 4 \n"
+ ".p2align 2 \n"
+ BUNDLEALIGN
"1: \n"
- "movdqa (%0),%%xmm0 \n"
+ "movdqa " MEMACCESS(0) ",%%xmm0 \n"
"movdqa %%xmm0,%%xmm1 \n"
"movdqa %%xmm0,%%xmm2 \n"
"pslld $0x8,%%xmm0 \n"
@@ -534,9 +693,9 @@ void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int pix) {
"por %%xmm2,%%xmm1 \n"
"por %%xmm1,%%xmm0 \n"
"packssdw %%xmm0,%%xmm0 \n"
- "lea 0x10(%0),%0 \n"
- "movq %%xmm0,(%1) \n"
- "lea 0x8(%1),%1 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "movq %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x8,1) ",%1 \n"
"sub $0x4,%2 \n"
"jg 1b \n"
: "+r"(src), // %0
@@ -560,9 +719,10 @@ void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int pix) {
"pslld $0xa,%%xmm6 \n"
"pcmpeqb %%xmm7,%%xmm7 \n"
"pslld $0xf,%%xmm7 \n"
- ".p2align 4 \n"
+ ".p2align 2 \n"
+ BUNDLEALIGN
"1: \n"
- "movdqa (%0),%%xmm0 \n"
+ "movdqa " MEMACCESS(0) ",%%xmm0 \n"
"movdqa %%xmm0,%%xmm1 \n"
"movdqa %%xmm0,%%xmm2 \n"
"movdqa %%xmm0,%%xmm3 \n"
@@ -578,9 +738,9 @@ void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int pix) {
"por %%xmm3,%%xmm2 \n"
"por %%xmm2,%%xmm0 \n"
"packssdw %%xmm0,%%xmm0 \n"
- "lea 0x10(%0),%0 \n"
- "movq %%xmm0,(%1) \n"
- "lea 0x8(%1),%1 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "movq %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMACCESS2(0x8,1) ",%1 \n"
"sub $0x4,%2 \n"
"jg 1b \n"
: "+r"(src), // %0
@@ -600,9 +760,10 @@ void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) {
"psllw $0xc,%%xmm4 \n"
"movdqa %%xmm4,%%xmm3 \n"
"psrlw $0x8,%%xmm3 \n"
- ".p2align 4 \n"
+ ".p2align 2 \n"
+ BUNDLEALIGN
"1: \n"
- "movdqa (%0),%%xmm0 \n"
+ "movdqa " MEMACCESS(0) ",%%xmm0 \n"
"movdqa %%xmm0,%%xmm1 \n"
"pand %%xmm3,%%xmm0 \n"
"pand %%xmm4,%%xmm1 \n"
@@ -610,9 +771,9 @@ void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) {
"psrlq $0x8,%%xmm1 \n"
"por %%xmm1,%%xmm0 \n"
"packuswb %%xmm0,%%xmm0 \n"
- "lea 0x10(%0),%0 \n"
- "movq %%xmm0,(%1) \n"
- "lea 0x8(%1),%1 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "movq %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x8,1) ",%1 \n"
"sub $0x4,%2 \n"
"jg 1b \n"
: "+r"(src), // %0
@@ -625,22 +786,24 @@ void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) {
#endif
);
}
+#endif // HAS_RGB24TOARGBROW_SSSE3
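The ARGBTo*Row_SSE2 packers above reduce 8-bit ARGB to 16-bit pixels by keeping each channel's high bits. A scalar sketch of two of the packings (illustration only, not the patch's code):

#include <stdint.h>

void ARGBToRGB565_Sketch(const uint8_t* src_argb, uint8_t* dst, int pix) {
  for (int i = 0; i < pix; ++i) {
    uint8_t b = src_argb[0], g = src_argb[1], r = src_argb[2];
    uint16_t p = (uint16_t)((b >> 3) | ((g >> 2) << 5) | ((r >> 3) << 11));
    dst[0] = (uint8_t)p;         // little-endian low byte.
    dst[1] = (uint8_t)(p >> 8);  // high byte.
    src_argb += 4;
    dst += 2;
  }
}

void ARGBToARGB4444_Sketch(const uint8_t* src_argb, uint8_t* dst, int pix) {
  for (int i = 0; i < pix; ++i) {
    uint8_t b = src_argb[0], g = src_argb[1], r = src_argb[2], a = src_argb[3];
    uint16_t p = (uint16_t)((b >> 4) | ((g >> 4) << 4) |
                            ((r >> 4) << 8) | ((a >> 4) << 12));
    dst[0] = (uint8_t)p;
    dst[1] = (uint8_t)(p >> 8);
    src_argb += 4;
    dst += 2;
  }
}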
+#ifdef HAS_ARGBTOYROW_SSSE3
void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
asm volatile (
"movdqa %4,%%xmm5 \n"
"movdqa %3,%%xmm4 \n"
- ".p2align 4 \n"
+ ".p2align 2 \n"
"1: \n"
- "movdqa (%0),%%xmm0 \n"
- "movdqa 0x10(%0),%%xmm1 \n"
- "movdqa 0x20(%0),%%xmm2 \n"
- "movdqa 0x30(%0),%%xmm3 \n"
+ "movdqa " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n"
"pmaddubsw %%xmm4,%%xmm0 \n"
"pmaddubsw %%xmm4,%%xmm1 \n"
"pmaddubsw %%xmm4,%%xmm2 \n"
"pmaddubsw %%xmm4,%%xmm3 \n"
- "lea 0x40(%0),%0 \n"
+ "lea " MEMLEA(0x40,0) ",%0 \n"
"phaddw %%xmm1,%%xmm0 \n"
"phaddw %%xmm3,%%xmm2 \n"
"psrlw $0x7,%%xmm0 \n"
@@ -648,8 +811,8 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
"packuswb %%xmm2,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n"
"sub $0x10,%2 \n"
- "movdqa %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
+ "movdqa %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
@@ -663,74 +826,76 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
);
}
-void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
asm volatile (
- "movdqa %3,%%xmm4 \n"
"movdqa %4,%%xmm5 \n"
- ".p2align 4 \n"
+ "movdqa %3,%%xmm4 \n"
+ ".p2align 2 \n"
"1: \n"
- "movdqa (%0),%%xmm0 \n"
- "movdqa 0x10(%0),%%xmm1 \n"
- "movdqa 0x20(%0),%%xmm2 \n"
- "movdqa 0x30(%0),%%xmm3 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
"pmaddubsw %%xmm4,%%xmm0 \n"
"pmaddubsw %%xmm4,%%xmm1 \n"
"pmaddubsw %%xmm4,%%xmm2 \n"
"pmaddubsw %%xmm4,%%xmm3 \n"
- "lea 0x40(%0),%0 \n"
+ "lea " MEMLEA(0x40,0) ",%0 \n"
"phaddw %%xmm1,%%xmm0 \n"
"phaddw %%xmm3,%%xmm2 \n"
- "paddw %%xmm5,%%xmm0 \n"
- "paddw %%xmm5,%%xmm2 \n"
"psrlw $0x7,%%xmm0 \n"
"psrlw $0x7,%%xmm2 \n"
"packuswb %%xmm2,%%xmm0 \n"
+ "paddb %%xmm5,%%xmm0 \n"
"sub $0x10,%2 \n"
- "movdqa %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
"+r"(pix) // %2
- : "m"(kARGBToYJ), // %3
- "m"(kAddYJ64) // %4
+ : "m"(kARGBToY), // %3
+ "m"(kAddY16) // %4
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
);
}
+#endif // HAS_ARGBTOYROW_SSSE3
-void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+#ifdef HAS_ARGBTOYJROW_SSSE3
+void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
asm volatile (
- "movdqa %4,%%xmm5 \n"
"movdqa %3,%%xmm4 \n"
- ".p2align 4 \n"
+ "movdqa %4,%%xmm5 \n"
+ ".p2align 2 \n"
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm3 \n"
+ "movdqa " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n"
"pmaddubsw %%xmm4,%%xmm0 \n"
"pmaddubsw %%xmm4,%%xmm1 \n"
"pmaddubsw %%xmm4,%%xmm2 \n"
"pmaddubsw %%xmm4,%%xmm3 \n"
- "lea 0x40(%0),%0 \n"
+ "lea " MEMLEA(0x40,0) ",%0 \n"
"phaddw %%xmm1,%%xmm0 \n"
"phaddw %%xmm3,%%xmm2 \n"
+ "paddw %%xmm5,%%xmm0 \n"
+ "paddw %%xmm5,%%xmm2 \n"
"psrlw $0x7,%%xmm0 \n"
"psrlw $0x7,%%xmm2 \n"
"packuswb %%xmm2,%%xmm0 \n"
- "paddb %%xmm5,%%xmm0 \n"
"sub $0x10,%2 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
+ "movdqa %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
"+r"(pix) // %2
- : "m"(kARGBToY), // %3
- "m"(kAddY16) // %4
+ : "m"(kARGBToYJ), // %3
+ "m"(kAddYJ64) // %4
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
@@ -742,17 +907,17 @@ void ARGBToYJRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
asm volatile (
"movdqa %3,%%xmm4 \n"
"movdqa %4,%%xmm5 \n"
- ".p2align 4 \n"
+ ".p2align 2 \n"
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm3 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
"pmaddubsw %%xmm4,%%xmm0 \n"
"pmaddubsw %%xmm4,%%xmm1 \n"
"pmaddubsw %%xmm4,%%xmm2 \n"
"pmaddubsw %%xmm4,%%xmm3 \n"
- "lea 0x40(%0),%0 \n"
+ "lea " MEMLEA(0x40,0) ",%0 \n"
"phaddw %%xmm1,%%xmm0 \n"
"phaddw %%xmm3,%%xmm2 \n"
"paddw %%xmm5,%%xmm0 \n"
@@ -761,8 +926,8 @@ void ARGBToYJRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
"psrlw $0x7,%%xmm2 \n"
"packuswb %%xmm2,%%xmm0 \n"
"sub $0x10,%2 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
@@ -775,7 +940,9 @@ void ARGBToYJRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
#endif
);
}
+#endif // HAS_ARGBTOYJROW_SSSE3
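Both luma paths above compute a fixed-point dot product of B, G, R: the plain ARGBToY path uses the 13/65/33 weights, shifts right by 7 and adds the 16 video-range offset (kAddY16), while the full-range JPeg ARGBToYJ path uses 15/75/38, rounds with kAddYJ64 before the shift and adds no offset. A scalar sketch, with uint8_t assumed for libyuv's uint8:

#include <stdint.h>

// Video-range luma, matching kARGBToY / kAddY16 above.
uint8_t ARGBToY_Sketch(uint8_t b, uint8_t g, uint8_t r) {
  return (uint8_t)(((13 * b + 65 * g + 33 * r) >> 7) + 16);
}

// Full-range (JPeg) luma, matching kARGBToYJ / kAddYJ64 above.
uint8_t ARGBToYJ_Sketch(uint8_t b, uint8_t g, uint8_t r) {
  return (uint8_t)((15 * b + 75 * g + 38 * r + 64) >> 7);
}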
+#ifdef HAS_ARGBTOUVROW_SSSE3
// TODO(fbarchard): pass xmm constants to single block of assembly.
// fpic on GCC 4.2 for OSX runs out of GPR registers. "m" effectively takes
// 3 registers - ebx, ebp and eax. "m" can be passed with 3 normal registers,
@@ -794,17 +961,19 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
);
asm volatile (
"sub %1,%2 \n"
- ".p2align 4 \n"
+ ".p2align 2 \n"
+ BUNDLEALIGN
"1: \n"
- "movdqa (%0),%%xmm0 \n"
- "movdqa 0x10(%0),%%xmm1 \n"
- "movdqa 0x20(%0),%%xmm2 \n"
- "movdqa 0x30(%0),%%xmm6 \n"
- "pavgb (%0,%4,1),%%xmm0 \n"
- "pavgb 0x10(%0,%4,1),%%xmm1 \n"
- "pavgb 0x20(%0,%4,1),%%xmm2 \n"
- "pavgb 0x30(%0,%4,1),%%xmm6 \n"
- "lea 0x40(%0),%0 \n"
+ "movdqa " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n"
+ BUNDLEALIGN
+ MEMOPREG(pavgb,0x00,0,4,1,xmm0) // pavgb (%0,%4,1),%%xmm0
+ MEMOPREG(pavgb,0x10,0,4,1,xmm1) // pavgb 0x10(%0,%4,1),%%xmm1
+ MEMOPREG(pavgb,0x20,0,4,1,xmm2) // pavgb 0x20(%0,%4,1),%%xmm2
+ MEMOPREG(pavgb,0x30,0,4,1,xmm6) // pavgb 0x30(%0,%4,1),%%xmm6
+ "lea " MEMLEA(0x40,0) ",%0 \n"
"movdqa %%xmm0,%%xmm7 \n"
"shufps $0x88,%%xmm1,%%xmm0 \n"
"shufps $0xdd,%%xmm1,%%xmm7 \n"
@@ -826,16 +995,20 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
"packsswb %%xmm1,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n"
"sub $0x10,%3 \n"
- "movlps %%xmm0,(%1) \n"
- "movhps %%xmm0,(%1,%2,1) \n"
- "lea 0x8(%1),%1 \n"
+ "movlps %%xmm0," MEMACCESS(1) " \n"
+ BUNDLEALIGN
+ MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
+ "lea " MEMLEA(0x8,1) ",%1 \n"
"jg 1b \n"
: "+r"(src_argb0), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+rm"(width) // %3
- : "r"(static_cast<intptr_t>(src_stride_argb))
+ : "r"(static_cast<intptr_t>(src_stride_argb)) // %4
: "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14"
+#endif
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
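The ARGBToUVRow variants above average each 2x2 block of pixels (pavgb across the two rows, then across horizontal pairs) and apply the kARGBToU/kARGBToV weights with an arithmetic shift right by 8 (psraw) and a +128 bias (kAddUV128). A scalar sketch of the per-block math, where b, g, r are assumed to already be the block-averaged channel values:

#include <stdint.h>

// Chroma for one 2x2 block, matching kARGBToU / kARGBToV / kAddUV128 above.
// b, g, r: block-averaged blue, green, red (0..255). The >> on a negative
// intermediate is taken as arithmetic, as psraw is in the SIMD code.
void ARGBBlockToUV_Sketch(int b, int g, int r, uint8_t* u, uint8_t* v) {
  *u = (uint8_t)(((112 * b - 74 * g - 38 * r) >> 8) + 128);
  *v = (uint8_t)(((-18 * b - 94 * g + 112 * r) >> 8) + 128);
}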
@@ -856,17 +1029,19 @@ void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
);
asm volatile (
"sub %1,%2 \n"
- ".p2align 4 \n"
+ ".p2align 2 \n"
+ BUNDLEALIGN
"1: \n"
- "movdqa (%0),%%xmm0 \n"
- "movdqa 0x10(%0),%%xmm1 \n"
- "movdqa 0x20(%0),%%xmm2 \n"
- "movdqa 0x30(%0),%%xmm6 \n"
- "pavgb (%0,%4,1),%%xmm0 \n"
- "pavgb 0x10(%0,%4,1),%%xmm1 \n"
- "pavgb 0x20(%0,%4,1),%%xmm2 \n"
- "pavgb 0x30(%0,%4,1),%%xmm6 \n"
- "lea 0x40(%0),%0 \n"
+ "movdqa " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n"
+ BUNDLEALIGN
+ MEMOPREG(pavgb,0x00,0,4,1,xmm0) // pavgb (%0,%4,1),%%xmm0
+ MEMOPREG(pavgb,0x10,0,4,1,xmm1) // pavgb 0x10(%0,%4,1),%%xmm1
+ MEMOPREG(pavgb,0x20,0,4,1,xmm2) // pavgb 0x20(%0,%4,1),%%xmm2
+ MEMOPREG(pavgb,0x30,0,4,1,xmm6) // pavgb 0x30(%0,%4,1),%%xmm6
+ "lea " MEMLEA(0x40,0) ",%0 \n"
"movdqa %%xmm0,%%xmm7 \n"
"shufps $0x88,%%xmm1,%%xmm0 \n"
"shufps $0xdd,%%xmm1,%%xmm7 \n"
@@ -889,16 +1064,20 @@ void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
"psraw $0x8,%%xmm1 \n"
"packsswb %%xmm1,%%xmm0 \n"
"sub $0x10,%3 \n"
- "movlps %%xmm0,(%1) \n"
- "movhps %%xmm0,(%1,%2,1) \n"
- "lea 0x8(%1),%1 \n"
+ "movlps %%xmm0," MEMACCESS(1) " \n"
+ BUNDLEALIGN
+ MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
+ "lea " MEMLEA(0x8,1) ",%1 \n"
"jg 1b \n"
: "+r"(src_argb0), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+rm"(width) // %3
- : "r"(static_cast<intptr_t>(src_stride_argb))
+ : "r"(static_cast<intptr_t>(src_stride_argb)) // %4
: "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14"
+#endif
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
@@ -918,21 +1097,23 @@ void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
);
asm volatile (
"sub %1,%2 \n"
- ".p2align 4 \n"
+ ".p2align 2 \n"
+ BUNDLEALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm6 \n"
- "movdqu (%0,%4,1),%%xmm7 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
+ BUNDLEALIGN
+ MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
"pavgb %%xmm7,%%xmm0 \n"
- "movdqu 0x10(%0,%4,1),%%xmm7 \n"
+ MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
"pavgb %%xmm7,%%xmm1 \n"
- "movdqu 0x20(%0,%4,1),%%xmm7 \n"
+ MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
"pavgb %%xmm7,%%xmm2 \n"
- "movdqu 0x30(%0,%4,1),%%xmm7 \n"
+ MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
"pavgb %%xmm7,%%xmm6 \n"
- "lea 0x40(%0),%0 \n"
+ "lea " MEMLEA(0x40,0) ",%0 \n"
"movdqa %%xmm0,%%xmm7 \n"
"shufps $0x88,%%xmm1,%%xmm0 \n"
"shufps $0xdd,%%xmm1,%%xmm7 \n"
@@ -954,16 +1135,20 @@ void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
"packsswb %%xmm1,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n"
"sub $0x10,%3 \n"
- "movlps %%xmm0,(%1) \n"
- "movhps %%xmm0,(%1,%2,1) \n"
- "lea 0x8(%1),%1 \n"
+ "movlps %%xmm0," MEMACCESS(1) " \n"
+ BUNDLEALIGN
+ MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
+ "lea " MEMLEA(0x8,1) ",%1 \n"
"jg 1b \n"
: "+r"(src_argb0), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+rm"(width) // %3
- : "r"(static_cast<intptr_t>(src_stride_argb))
+ : "r"(static_cast<intptr_t>(src_stride_argb)) // %4
: "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14"
+#endif
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
@@ -983,21 +1168,23 @@ void ARGBToUVJRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
);
asm volatile (
"sub %1,%2 \n"
- ".p2align 4 \n"
+ ".p2align 2 \n"
+ BUNDLEALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm6 \n"
- "movdqu (%0,%4,1),%%xmm7 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
+ BUNDLEALIGN
+ MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
"pavgb %%xmm7,%%xmm0 \n"
- "movdqu 0x10(%0,%4,1),%%xmm7 \n"
+ MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
"pavgb %%xmm7,%%xmm1 \n"
- "movdqu 0x20(%0,%4,1),%%xmm7 \n"
+ MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
"pavgb %%xmm7,%%xmm2 \n"
- "movdqu 0x30(%0,%4,1),%%xmm7 \n"
+ MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
"pavgb %%xmm7,%%xmm6 \n"
- "lea 0x40(%0),%0 \n"
+ "lea " MEMLEA(0x40,0) ",%0 \n"
"movdqa %%xmm0,%%xmm7 \n"
"shufps $0x88,%%xmm1,%%xmm0 \n"
"shufps $0xdd,%%xmm1,%%xmm7 \n"
@@ -1020,9 +1207,10 @@ void ARGBToUVJRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
"psraw $0x8,%%xmm1 \n"
"packsswb %%xmm1,%%xmm0 \n"
"sub $0x10,%3 \n"
- "movlps %%xmm0,(%1) \n"
- "movhps %%xmm0,(%1,%2,1) \n"
- "lea 0x8(%1),%1 \n"
+ "movlps %%xmm0," MEMACCESS(1) " \n"
+ BUNDLEALIGN
+ MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
+ "lea " MEMLEA(0x8,1) ",%1 \n"
"jg 1b \n"
: "+r"(src_argb0), // %0
"+r"(dst_u), // %1
@@ -1030,6 +1218,9 @@ void ARGBToUVJRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
"+rm"(width) // %3
: "r"(static_cast<intptr_t>(src_stride_argb))
: "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14"
+#endif
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
@@ -1049,12 +1240,13 @@ void ARGBToUV444Row_SSSE3(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
);
asm volatile (
"sub %1,%2 \n"
- ".p2align 4 \n"
+ ".p2align 2 \n"
+ BUNDLEALIGN
"1: \n"
- "movdqa (%0),%%xmm0 \n"
- "movdqa 0x10(%0),%%xmm1 \n"
- "movdqa 0x20(%0),%%xmm2 \n"
- "movdqa 0x30(%0),%%xmm6 \n"
+ "movdqa " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n"
"pmaddubsw %%xmm4,%%xmm0 \n"
"pmaddubsw %%xmm4,%%xmm1 \n"
"pmaddubsw %%xmm4,%%xmm2 \n"
@@ -1066,11 +1258,11 @@ void ARGBToUV444Row_SSSE3(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
"packsswb %%xmm2,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n"
"sub $0x10,%3 \n"
- "movdqa %%xmm0,(%1) \n"
- "movdqa (%0),%%xmm0 \n"
- "movdqa 0x10(%0),%%xmm1 \n"
- "movdqa 0x20(%0),%%xmm2 \n"
- "movdqa 0x30(%0),%%xmm6 \n"
+ "movdqa %%xmm0," MEMACCESS(1) " \n"
+ "movdqa " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n"
"pmaddubsw %%xmm3,%%xmm0 \n"
"pmaddubsw %%xmm3,%%xmm1 \n"
"pmaddubsw %%xmm3,%%xmm2 \n"
@@ -1081,9 +1273,10 @@ void ARGBToUV444Row_SSSE3(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
"psraw $0x8,%%xmm2 \n"
"packsswb %%xmm2,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n"
- "lea 0x40(%0),%0 \n"
- "movdqa %%xmm0,(%1,%2,1) \n"
- "lea 0x10(%1),%1 \n"
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ BUNDLEALIGN
+ MEMOPMEM(movdqa,xmm0,0x00,1,2,1) // movdqa %%xmm0,(%1,%2,1)
+ "lea " MEMLEA(0x10,1) ",%1 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_u), // %1
@@ -1091,6 +1284,9 @@ void ARGBToUV444Row_SSSE3(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
"+rm"(width) // %3
:
: "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14"
+#endif
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm6"
#endif
@@ -1110,12 +1306,13 @@ void ARGBToUV444Row_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_u,
);
asm volatile (
"sub %1,%2 \n"
- ".p2align 4 \n"
+ ".p2align 2 \n"
+ BUNDLEALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm6 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
"pmaddubsw %%xmm4,%%xmm0 \n"
"pmaddubsw %%xmm4,%%xmm1 \n"
"pmaddubsw %%xmm4,%%xmm2 \n"
@@ -1127,11 +1324,11 @@ void ARGBToUV444Row_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_u,
"packsswb %%xmm2,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n"
"sub $0x10,%3 \n"
- "movdqu %%xmm0,(%1) \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm6 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
"pmaddubsw %%xmm3,%%xmm0 \n"
"pmaddubsw %%xmm3,%%xmm1 \n"
"pmaddubsw %%xmm3,%%xmm2 \n"
@@ -1142,9 +1339,10 @@ void ARGBToUV444Row_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_u,
"psraw $0x8,%%xmm2 \n"
"packsswb %%xmm2,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n"
- "lea 0x40(%0),%0 \n"
- "movdqu %%xmm0,(%1,%2,1) \n"
- "lea 0x10(%1),%1 \n"
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ BUNDLEALIGN
+ MEMOPMEM(movdqu,xmm0,0x00,1,2,1) // movdqu %%xmm0,(%1,%2,1)
+ "lea " MEMLEA(0x10,1) ",%1 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_u), // %1
@@ -1152,6 +1350,9 @@ void ARGBToUV444Row_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_u,
"+rm"(width) // %3
:
: "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14"
+#endif
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm6"
#endif
@@ -1171,13 +1372,14 @@ void ARGBToUV422Row_SSSE3(const uint8* src_argb0,
);
asm volatile (
"sub %1,%2 \n"
- ".p2align 4 \n"
+ ".p2align 2 \n"
+ BUNDLEALIGN
"1: \n"
- "movdqa (%0),%%xmm0 \n"
- "movdqa 0x10(%0),%%xmm1 \n"
- "movdqa 0x20(%0),%%xmm2 \n"
- "movdqa 0x30(%0),%%xmm6 \n"
- "lea 0x40(%0),%0 \n"
+ "movdqa " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n"
+ "lea " MEMLEA(0x40,0) ",%0 \n"
"movdqa %%xmm0,%%xmm7 \n"
"shufps $0x88,%%xmm1,%%xmm0 \n"
"shufps $0xdd,%%xmm1,%%xmm7 \n"
@@ -1199,9 +1401,10 @@ void ARGBToUV422Row_SSSE3(const uint8* src_argb0,
"packsswb %%xmm1,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n"
"sub $0x10,%3 \n"
- "movlps %%xmm0,(%1) \n"
- "movhps %%xmm0,(%1,%2,1) \n"
- "lea 0x8(%1),%1 \n"
+ "movlps %%xmm0," MEMACCESS(1) " \n"
+ BUNDLEALIGN
+ MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
+ "lea " MEMLEA(0x8,1) ",%1 \n"
"jg 1b \n"
: "+r"(src_argb0), // %0
"+r"(dst_u), // %1
@@ -1209,6 +1412,9 @@ void ARGBToUV422Row_SSSE3(const uint8* src_argb0,
"+rm"(width) // %3
:
: "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14"
+#endif
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
@@ -1228,13 +1434,14 @@ void ARGBToUV422Row_Unaligned_SSSE3(const uint8* src_argb0,
);
asm volatile (
"sub %1,%2 \n"
- ".p2align 4 \n"
+ ".p2align 2 \n"
+ BUNDLEALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm6 \n"
- "lea 0x40(%0),%0 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
+ "lea " MEMLEA(0x40,0) ",%0 \n"
"movdqa %%xmm0,%%xmm7 \n"
"shufps $0x88,%%xmm1,%%xmm0 \n"
"shufps $0xdd,%%xmm1,%%xmm7 \n"
@@ -1256,9 +1463,10 @@ void ARGBToUV422Row_Unaligned_SSSE3(const uint8* src_argb0,
"packsswb %%xmm1,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n"
"sub $0x10,%3 \n"
- "movlps %%xmm0,(%1) \n"
- "movhps %%xmm0,(%1,%2,1) \n"
- "lea 0x8(%1),%1 \n"
+ "movlps %%xmm0," MEMACCESS(1) " \n"
+ BUNDLEALIGN
+ MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
+ "lea " MEMLEA(0x8,1) ",%1 \n"
"jg 1b \n"
: "+r"(src_argb0), // %0
"+r"(dst_u), // %1
@@ -1266,6 +1474,9 @@ void ARGBToUV422Row_Unaligned_SSSE3(const uint8* src_argb0,
"+rm"(width) // %3
:
: "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14"
+#endif
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
@@ -1276,17 +1487,18 @@ void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
asm volatile (
"movdqa %4,%%xmm5 \n"
"movdqa %3,%%xmm4 \n"
- ".p2align 4 \n"
+ ".p2align 2 \n"
+ BUNDLEALIGN
"1: \n"
- "movdqa (%0),%%xmm0 \n"
- "movdqa 0x10(%0),%%xmm1 \n"
- "movdqa 0x20(%0),%%xmm2 \n"
- "movdqa 0x30(%0),%%xmm3 \n"
+ "movdqa " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n"
"pmaddubsw %%xmm4,%%xmm0 \n"
"pmaddubsw %%xmm4,%%xmm1 \n"
"pmaddubsw %%xmm4,%%xmm2 \n"
"pmaddubsw %%xmm4,%%xmm3 \n"
- "lea 0x40(%0),%0 \n"
+ "lea " MEMLEA(0x40,0) ",%0 \n"
"phaddw %%xmm1,%%xmm0 \n"
"phaddw %%xmm3,%%xmm2 \n"
"psrlw $0x7,%%xmm0 \n"
@@ -1294,8 +1506,8 @@ void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
"packuswb %%xmm2,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n"
"sub $0x10,%2 \n"
- "movdqa %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
+ "movdqa %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
"jg 1b \n"
: "+r"(src_bgra), // %0
"+r"(dst_y), // %1
@@ -1313,17 +1525,18 @@ void BGRAToYRow_Unaligned_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
asm volatile (
"movdqa %4,%%xmm5 \n"
"movdqa %3,%%xmm4 \n"
- ".p2align 4 \n"
+ ".p2align 2 \n"
+ BUNDLEALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm3 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
"pmaddubsw %%xmm4,%%xmm0 \n"
"pmaddubsw %%xmm4,%%xmm1 \n"
"pmaddubsw %%xmm4,%%xmm2 \n"
"pmaddubsw %%xmm4,%%xmm3 \n"
- "lea 0x40(%0),%0 \n"
+ "lea " MEMLEA(0x40,0) ",%0 \n"
"phaddw %%xmm1,%%xmm0 \n"
"phaddw %%xmm3,%%xmm2 \n"
"psrlw $0x7,%%xmm0 \n"
@@ -1331,8 +1544,8 @@ void BGRAToYRow_Unaligned_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
"packuswb %%xmm2,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n"
"sub $0x10,%2 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
"jg 1b \n"
: "+r"(src_bgra), // %0
"+r"(dst_y), // %1
@@ -1359,17 +1572,19 @@ void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
);
asm volatile (
"sub %1,%2 \n"
- ".p2align 4 \n"
+ ".p2align 2 \n"
+ BUNDLEALIGN
"1: \n"
- "movdqa (%0),%%xmm0 \n"
- "movdqa 0x10(%0),%%xmm1 \n"
- "movdqa 0x20(%0),%%xmm2 \n"
- "movdqa 0x30(%0),%%xmm6 \n"
- "pavgb (%0,%4,1),%%xmm0 \n"
- "pavgb 0x10(%0,%4,1),%%xmm1 \n"
- "pavgb 0x20(%0,%4,1),%%xmm2 \n"
- "pavgb 0x30(%0,%4,1),%%xmm6 \n"
- "lea 0x40(%0),%0 \n"
+ "movdqa " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n"
+ BUNDLEALIGN
+ MEMOPREG(pavgb,0x00,0,4,1,xmm0) // pavgb (%0,%4,1),%%xmm0
+ MEMOPREG(pavgb,0x10,0,4,1,xmm1) // pavgb 0x10(%0,%4,1),%%xmm1
+ MEMOPREG(pavgb,0x20,0,4,1,xmm2) // pavgb 0x20(%0,%4,1),%%xmm2
+ MEMOPREG(pavgb,0x30,0,4,1,xmm6) // pavgb 0x30(%0,%4,1),%%xmm6
+ "lea " MEMLEA(0x40,0) ",%0 \n"
"movdqa %%xmm0,%%xmm7 \n"
"shufps $0x88,%%xmm1,%%xmm0 \n"
"shufps $0xdd,%%xmm1,%%xmm7 \n"
@@ -1391,16 +1606,20 @@ void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
"packsswb %%xmm1,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n"
"sub $0x10,%3 \n"
- "movlps %%xmm0,(%1) \n"
- "movhps %%xmm0,(%1,%2,1) \n"
- "lea 0x8(%1),%1 \n"
+ "movlps %%xmm0," MEMACCESS(1) " \n"
+ BUNDLEALIGN
+ MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
+ "lea " MEMLEA(0x8,1) ",%1 \n"
"jg 1b \n"
: "+r"(src_bgra0), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+rm"(width) // %3
- : "r"(static_cast<intptr_t>(src_stride_bgra))
+ : "r"(static_cast<intptr_t>(src_stride_bgra)) // %4
: "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14"
+#endif
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
@@ -1420,21 +1639,23 @@ void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
);
asm volatile (
"sub %1,%2 \n"
- ".p2align 4 \n"
+ ".p2align 2 \n"
+ BUNDLEALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm6 \n"
- "movdqu (%0,%4,1),%%xmm7 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
+ BUNDLEALIGN
+ MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
"pavgb %%xmm7,%%xmm0 \n"
- "movdqu 0x10(%0,%4,1),%%xmm7 \n"
+ MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
"pavgb %%xmm7,%%xmm1 \n"
- "movdqu 0x20(%0,%4,1),%%xmm7 \n"
+ MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
"pavgb %%xmm7,%%xmm2 \n"
- "movdqu 0x30(%0,%4,1),%%xmm7 \n"
+ MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
"pavgb %%xmm7,%%xmm6 \n"
- "lea 0x40(%0),%0 \n"
+ "lea " MEMLEA(0x40,0) ",%0 \n"
"movdqa %%xmm0,%%xmm7 \n"
"shufps $0x88,%%xmm1,%%xmm0 \n"
"shufps $0xdd,%%xmm1,%%xmm7 \n"
@@ -1456,16 +1677,20 @@ void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
"packsswb %%xmm1,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n"
"sub $0x10,%3 \n"
- "movlps %%xmm0,(%1) \n"
- "movhps %%xmm0,(%1,%2,1) \n"
- "lea 0x8(%1),%1 \n"
+ "movlps %%xmm0," MEMACCESS(1) " \n"
+ BUNDLEALIGN
+ MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
+ "lea " MEMLEA(0x8,1) ",%1 \n"
"jg 1b \n"
: "+r"(src_bgra0), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+rm"(width) // %3
- : "r"(static_cast<intptr_t>(src_stride_bgra))
+ : "r"(static_cast<intptr_t>(src_stride_bgra)) // %4
: "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14"
+#endif
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
@@ -1476,17 +1701,18 @@ void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
asm volatile (
"movdqa %4,%%xmm5 \n"
"movdqa %3,%%xmm4 \n"
- ".p2align 4 \n"
+ ".p2align 2 \n"
+ BUNDLEALIGN
"1: \n"
- "movdqa (%0),%%xmm0 \n"
- "movdqa 0x10(%0),%%xmm1 \n"
- "movdqa 0x20(%0),%%xmm2 \n"
- "movdqa 0x30(%0),%%xmm3 \n"
+ "movdqa " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n"
"pmaddubsw %%xmm4,%%xmm0 \n"
"pmaddubsw %%xmm4,%%xmm1 \n"
"pmaddubsw %%xmm4,%%xmm2 \n"
"pmaddubsw %%xmm4,%%xmm3 \n"
- "lea 0x40(%0),%0 \n"
+ "lea " MEMLEA(0x40,0) ",%0 \n"
"phaddw %%xmm1,%%xmm0 \n"
"phaddw %%xmm3,%%xmm2 \n"
"psrlw $0x7,%%xmm0 \n"
@@ -1494,8 +1720,8 @@ void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
"packuswb %%xmm2,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n"
"sub $0x10,%2 \n"
- "movdqa %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
+ "movdqa %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
"jg 1b \n"
: "+r"(src_abgr), // %0
"+r"(dst_y), // %1
@@ -1513,17 +1739,18 @@ void ABGRToYRow_Unaligned_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
asm volatile (
"movdqa %4,%%xmm5 \n"
"movdqa %3,%%xmm4 \n"
- ".p2align 4 \n"
+ ".p2align 2 \n"
+ BUNDLEALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm3 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
"pmaddubsw %%xmm4,%%xmm0 \n"
"pmaddubsw %%xmm4,%%xmm1 \n"
"pmaddubsw %%xmm4,%%xmm2 \n"
"pmaddubsw %%xmm4,%%xmm3 \n"
- "lea 0x40(%0),%0 \n"
+ "lea " MEMLEA(0x40,0) ",%0 \n"
"phaddw %%xmm1,%%xmm0 \n"
"phaddw %%xmm3,%%xmm2 \n"
"psrlw $0x7,%%xmm0 \n"
@@ -1531,8 +1758,8 @@ void ABGRToYRow_Unaligned_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
"packuswb %%xmm2,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n"
"sub $0x10,%2 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
"jg 1b \n"
: "+r"(src_abgr), // %0
"+r"(dst_y), // %1
@@ -1550,17 +1777,18 @@ void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) {
asm volatile (
"movdqa %4,%%xmm5 \n"
"movdqa %3,%%xmm4 \n"
- ".p2align 4 \n"
+ ".p2align 2 \n"
+ BUNDLEALIGN
"1: \n"
- "movdqa (%0),%%xmm0 \n"
- "movdqa 0x10(%0),%%xmm1 \n"
- "movdqa 0x20(%0),%%xmm2 \n"
- "movdqa 0x30(%0),%%xmm3 \n"
+ "movdqa " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n"
"pmaddubsw %%xmm4,%%xmm0 \n"
"pmaddubsw %%xmm4,%%xmm1 \n"
"pmaddubsw %%xmm4,%%xmm2 \n"
"pmaddubsw %%xmm4,%%xmm3 \n"
- "lea 0x40(%0),%0 \n"
+ "lea " MEMLEA(0x40,0) ",%0 \n"
"phaddw %%xmm1,%%xmm0 \n"
"phaddw %%xmm3,%%xmm2 \n"
"psrlw $0x7,%%xmm0 \n"
@@ -1568,8 +1796,8 @@ void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) {
"packuswb %%xmm2,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n"
"sub $0x10,%2 \n"
- "movdqa %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
+ "movdqa %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
"jg 1b \n"
: "+r"(src_rgba), // %0
"+r"(dst_y), // %1
@@ -1587,17 +1815,18 @@ void RGBAToYRow_Unaligned_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) {
asm volatile (
"movdqa %4,%%xmm5 \n"
"movdqa %3,%%xmm4 \n"
- ".p2align 4 \n"
+ ".p2align 2 \n"
+ BUNDLEALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm3 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
"pmaddubsw %%xmm4,%%xmm0 \n"
"pmaddubsw %%xmm4,%%xmm1 \n"
"pmaddubsw %%xmm4,%%xmm2 \n"
"pmaddubsw %%xmm4,%%xmm3 \n"
- "lea 0x40(%0),%0 \n"
+ "lea " MEMLEA(0x40,0) ",%0 \n"
"phaddw %%xmm1,%%xmm0 \n"
"phaddw %%xmm3,%%xmm2 \n"
"psrlw $0x7,%%xmm0 \n"
@@ -1605,8 +1834,8 @@ void RGBAToYRow_Unaligned_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) {
"packuswb %%xmm2,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n"
"sub $0x10,%2 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
"jg 1b \n"
: "+r"(src_rgba), // %0
"+r"(dst_y), // %1
@@ -1633,17 +1862,19 @@ void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
);
asm volatile (
"sub %1,%2 \n"
- ".p2align 4 \n"
+ ".p2align 2 \n"
+ BUNDLEALIGN
"1: \n"
- "movdqa (%0),%%xmm0 \n"
- "movdqa 0x10(%0),%%xmm1 \n"
- "movdqa 0x20(%0),%%xmm2 \n"
- "movdqa 0x30(%0),%%xmm6 \n"
- "pavgb (%0,%4,1),%%xmm0 \n"
- "pavgb 0x10(%0,%4,1),%%xmm1 \n"
- "pavgb 0x20(%0,%4,1),%%xmm2 \n"
- "pavgb 0x30(%0,%4,1),%%xmm6 \n"
- "lea 0x40(%0),%0 \n"
+ "movdqa " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n"
+ BUNDLEALIGN
+ MEMOPREG(pavgb,0x00,0,4,1,xmm0) // pavgb (%0,%4,1),%%xmm0
+ MEMOPREG(pavgb,0x10,0,4,1,xmm1) // pavgb 0x10(%0,%4,1),%%xmm1
+ MEMOPREG(pavgb,0x20,0,4,1,xmm2) // pavgb 0x20(%0,%4,1),%%xmm2
+ MEMOPREG(pavgb,0x30,0,4,1,xmm6) // pavgb 0x30(%0,%4,1),%%xmm6
+ "lea " MEMLEA(0x40,0) ",%0 \n"
"movdqa %%xmm0,%%xmm7 \n"
"shufps $0x88,%%xmm1,%%xmm0 \n"
"shufps $0xdd,%%xmm1,%%xmm7 \n"
@@ -1665,16 +1896,20 @@ void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
"packsswb %%xmm1,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n"
"sub $0x10,%3 \n"
- "movlps %%xmm0,(%1) \n"
- "movhps %%xmm0,(%1,%2,1) \n"
- "lea 0x8(%1),%1 \n"
+ "movlps %%xmm0," MEMACCESS(1) " \n"
+ BUNDLEALIGN
+ MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
+ "lea " MEMLEA(0x8,1) ",%1 \n"
"jg 1b \n"
: "+r"(src_abgr0), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+rm"(width) // %3
- : "r"(static_cast<intptr_t>(src_stride_abgr))
+ : "r"(static_cast<intptr_t>(src_stride_abgr)) // %4
: "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14"
+#endif
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
@@ -1694,21 +1929,23 @@ void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
);
asm volatile (
"sub %1,%2 \n"
- ".p2align 4 \n"
+ ".p2align 2 \n"
+ BUNDLEALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm6 \n"
- "movdqu (%0,%4,1),%%xmm7 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
+ BUNDLEALIGN
+ MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
"pavgb %%xmm7,%%xmm0 \n"
- "movdqu 0x10(%0,%4,1),%%xmm7 \n"
+ MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
"pavgb %%xmm7,%%xmm1 \n"
- "movdqu 0x20(%0,%4,1),%%xmm7 \n"
+ MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
"pavgb %%xmm7,%%xmm2 \n"
- "movdqu 0x30(%0,%4,1),%%xmm7 \n"
+ MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
"pavgb %%xmm7,%%xmm6 \n"
- "lea 0x40(%0),%0 \n"
+ "lea " MEMLEA(0x40,0) ",%0 \n"
"movdqa %%xmm0,%%xmm7 \n"
"shufps $0x88,%%xmm1,%%xmm0 \n"
"shufps $0xdd,%%xmm1,%%xmm7 \n"
@@ -1730,16 +1967,20 @@ void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
"packsswb %%xmm1,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n"
"sub $0x10,%3 \n"
- "movlps %%xmm0,(%1) \n"
- "movhps %%xmm0,(%1,%2,1) \n"
- "lea 0x8(%1),%1 \n"
+ "movlps %%xmm0," MEMACCESS(1) " \n"
+ BUNDLEALIGN
+ MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
+ "lea " MEMLEA(0x8,1) ",%1 \n"
"jg 1b \n"
: "+r"(src_abgr0), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+rm"(width) // %3
- : "r"(static_cast<intptr_t>(src_stride_abgr))
+ : "r"(static_cast<intptr_t>(src_stride_abgr)) // %4
: "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14"
+#endif
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
@@ -1759,17 +2000,19 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
);
asm volatile (
"sub %1,%2 \n"
- ".p2align 4 \n"
+ ".p2align 2 \n"
+ BUNDLEALIGN
"1: \n"
- "movdqa (%0),%%xmm0 \n"
- "movdqa 0x10(%0),%%xmm1 \n"
- "movdqa 0x20(%0),%%xmm2 \n"
- "movdqa 0x30(%0),%%xmm6 \n"
- "pavgb (%0,%4,1),%%xmm0 \n"
- "pavgb 0x10(%0,%4,1),%%xmm1 \n"
- "pavgb 0x20(%0,%4,1),%%xmm2 \n"
- "pavgb 0x30(%0,%4,1),%%xmm6 \n"
- "lea 0x40(%0),%0 \n"
+ "movdqa " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n"
+ BUNDLEALIGN
+ MEMOPREG(pavgb,0x00,0,4,1,xmm0) // pavgb (%0,%4,1),%%xmm0
+ MEMOPREG(pavgb,0x10,0,4,1,xmm1) // pavgb 0x10(%0,%4,1),%%xmm1
+ MEMOPREG(pavgb,0x20,0,4,1,xmm2) // pavgb 0x20(%0,%4,1),%%xmm2
+ MEMOPREG(pavgb,0x30,0,4,1,xmm6) // pavgb 0x30(%0,%4,1),%%xmm6
+ "lea " MEMLEA(0x40,0) ",%0 \n"
"movdqa %%xmm0,%%xmm7 \n"
"shufps $0x88,%%xmm1,%%xmm0 \n"
"shufps $0xdd,%%xmm1,%%xmm7 \n"
@@ -1791,9 +2034,10 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
"packsswb %%xmm1,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n"
"sub $0x10,%3 \n"
- "movlps %%xmm0,(%1) \n"
- "movhps %%xmm0,(%1,%2,1) \n"
- "lea 0x8(%1),%1 \n"
+ "movlps %%xmm0," MEMACCESS(1) " \n"
+ BUNDLEALIGN
+ MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
+ "lea " MEMLEA(0x8,1) ",%1 \n"
"jg 1b \n"
: "+r"(src_rgba0), // %0
"+r"(dst_u), // %1
@@ -1801,6 +2045,9 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
"+rm"(width) // %3
: "r"(static_cast<intptr_t>(src_stride_rgba))
: "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14"
+#endif
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
@@ -1820,21 +2067,23 @@ void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
);
asm volatile (
"sub %1,%2 \n"
- ".p2align 4 \n"
+ ".p2align 2 \n"
+ BUNDLEALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu 0x20(%0),%%xmm2 \n"
- "movdqu 0x30(%0),%%xmm6 \n"
- "movdqu (%0,%4,1),%%xmm7 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
+ BUNDLEALIGN
+ MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
"pavgb %%xmm7,%%xmm0 \n"
- "movdqu 0x10(%0,%4,1),%%xmm7 \n"
+ MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
"pavgb %%xmm7,%%xmm1 \n"
- "movdqu 0x20(%0,%4,1),%%xmm7 \n"
+ MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
"pavgb %%xmm7,%%xmm2 \n"
- "movdqu 0x30(%0,%4,1),%%xmm7 \n"
+ MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
"pavgb %%xmm7,%%xmm6 \n"
- "lea 0x40(%0),%0 \n"
+ "lea " MEMLEA(0x40,0) ",%0 \n"
"movdqa %%xmm0,%%xmm7 \n"
"shufps $0x88,%%xmm1,%%xmm0 \n"
"shufps $0xdd,%%xmm1,%%xmm7 \n"
@@ -1856,22 +2105,26 @@ void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
"packsswb %%xmm1,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n"
"sub $0x10,%3 \n"
- "movlps %%xmm0,(%1) \n"
- "movhps %%xmm0,(%1,%2,1) \n"
- "lea 0x8(%1),%1 \n"
+ "movlps %%xmm0," MEMACCESS(1) " \n"
+ BUNDLEALIGN
+ MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
+ "lea " MEMLEA(0x8,1) ",%1 \n"
"jg 1b \n"
: "+r"(src_rgba0), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+rm"(width) // %3
- : "r"(static_cast<intptr_t>(src_stride_rgba))
+ : "r"(static_cast<intptr_t>(src_stride_rgba)) // %4
: "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14"
+#endif
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
);
}
-#endif // HAS_ARGBTOYROW_SSSE3
+#endif // HAS_ARGBTOUVROW_SSSE3
#ifdef HAS_I422TOARGBROW_SSSE3
#define UB 127 /* min(63,static_cast<int8>(2.018 * 64)) */
@@ -1901,7 +2154,7 @@ struct {
vec8 kVUToB; // 128
vec8 kVUToG; // 144
vec8 kVUToR; // 160
-} CONST SIMD_ALIGNED(kYuvConstants) = {
+} static SIMD_ALIGNED(kYuvConstants) = {
{ UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB },
{ UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
{ UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR },
@@ -1918,83 +2171,86 @@ struct {
// Read 8 UV from 444
#define READYUV444 \
- "movq (%[u_buf]),%%xmm0 \n" \
- "movq (%[u_buf],%[v_buf],1),%%xmm1 \n" \
- "lea 0x8(%[u_buf]),%[u_buf] \n" \
- "punpcklbw %%xmm1,%%xmm0 \n" \
+ "movq " MEMACCESS([u_buf]) ",%%xmm0 \n" \
+ BUNDLEALIGN \
+ MEMOPREG(movq, 0x00, [u_buf], [v_buf], 1, xmm1) \
+ "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \
+ "punpcklbw %%xmm1,%%xmm0 \n"
// Read 4 UV from 422, upsample to 8 UV
#define READYUV422 \
- "movd (%[u_buf]),%%xmm0 \n" \
- "movd (%[u_buf],%[v_buf],1),%%xmm1 \n" \
- "lea 0x4(%[u_buf]),%[u_buf] \n" \
- "punpcklbw %%xmm1,%%xmm0 \n" \
- "punpcklwd %%xmm0,%%xmm0 \n" \
+ "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \
+ BUNDLEALIGN \
+ MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \
+ "lea " MEMLEA(0x4, [u_buf]) ",%[u_buf] \n" \
+ "punpcklbw %%xmm1,%%xmm0 \n" \
+ "punpcklwd %%xmm0,%%xmm0 \n"
// Read 2 UV from 411, upsample to 8 UV
#define READYUV411 \
- "movd (%[u_buf]),%%xmm0 \n" \
- "movd (%[u_buf],%[v_buf],1),%%xmm1 \n" \
- "lea 0x2(%[u_buf]),%[u_buf] \n" \
- "punpcklbw %%xmm1,%%xmm0 \n" \
- "punpcklwd %%xmm0,%%xmm0 \n" \
- "punpckldq %%xmm0,%%xmm0 \n" \
+ "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \
+ BUNDLEALIGN \
+ MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \
+ "lea " MEMLEA(0x2, [u_buf]) ",%[u_buf] \n" \
+ "punpcklbw %%xmm1,%%xmm0 \n" \
+ "punpcklwd %%xmm0,%%xmm0 \n" \
+ "punpckldq %%xmm0,%%xmm0 \n"
// Read 4 UV from NV12, upsample to 8 UV
#define READNV12 \
- "movq (%[uv_buf]),%%xmm0 \n" \
- "lea 0x8(%[uv_buf]),%[uv_buf] \n" \
- "punpcklwd %%xmm0,%%xmm0 \n" \
+ "movq " MEMACCESS([uv_buf]) ",%%xmm0 \n" \
+ "lea " MEMLEA(0x8, [uv_buf]) ",%[uv_buf] \n" \
+ "punpcklwd %%xmm0,%%xmm0 \n"
// Convert 8 pixels: 8 UV and 8 Y
#define YUVTORGB \
- "movdqa %%xmm0,%%xmm1 \n" \
- "movdqa %%xmm0,%%xmm2 \n" \
- "pmaddubsw (%[kYuvConstants]),%%xmm0 \n" \
- "pmaddubsw 16(%[kYuvConstants]),%%xmm1 \n" \
- "pmaddubsw 32(%[kYuvConstants]),%%xmm2 \n" \
- "psubw 48(%[kYuvConstants]),%%xmm0 \n" \
- "psubw 64(%[kYuvConstants]),%%xmm1 \n" \
- "psubw 80(%[kYuvConstants]),%%xmm2 \n" \
- "movq (%[y_buf]),%%xmm3 \n" \
- "lea 0x8(%[y_buf]),%[y_buf] \n" \
- "punpcklbw %%xmm4,%%xmm3 \n" \
- "psubsw 96(%[kYuvConstants]),%%xmm3 \n" \
- "pmullw 112(%[kYuvConstants]),%%xmm3 \n" \
- "paddsw %%xmm3,%%xmm0 \n" \
- "paddsw %%xmm3,%%xmm1 \n" \
- "paddsw %%xmm3,%%xmm2 \n" \
- "psraw $0x6,%%xmm0 \n" \
- "psraw $0x6,%%xmm1 \n" \
- "psraw $0x6,%%xmm2 \n" \
- "packuswb %%xmm0,%%xmm0 \n" \
- "packuswb %%xmm1,%%xmm1 \n" \
- "packuswb %%xmm2,%%xmm2 \n" \
+ "movdqa %%xmm0,%%xmm1 \n" \
+ "movdqa %%xmm0,%%xmm2 \n" \
+ "pmaddubsw " MEMACCESS([kYuvConstants]) ",%%xmm0 \n" \
+ "pmaddubsw " MEMACCESS2(16, [kYuvConstants]) ",%%xmm1 \n" \
+ "pmaddubsw " MEMACCESS2(32, [kYuvConstants]) ",%%xmm2 \n" \
+ "psubw " MEMACCESS2(48, [kYuvConstants]) ",%%xmm0 \n" \
+ "psubw " MEMACCESS2(64, [kYuvConstants]) ",%%xmm1 \n" \
+ "psubw " MEMACCESS2(80, [kYuvConstants]) ",%%xmm2 \n" \
+ "movq " MEMACCESS([y_buf]) ",%%xmm3 \n" \
+ "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" \
+ "punpcklbw %%xmm4,%%xmm3 \n" \
+ "psubsw " MEMACCESS2(96, [kYuvConstants]) ",%%xmm3 \n" \
+ "pmullw " MEMACCESS2(112, [kYuvConstants]) ",%%xmm3 \n" \
+ "paddsw %%xmm3,%%xmm0 \n" \
+ "paddsw %%xmm3,%%xmm1 \n" \
+ "paddsw %%xmm3,%%xmm2 \n" \
+ "psraw $0x6,%%xmm0 \n" \
+ "psraw $0x6,%%xmm1 \n" \
+ "psraw $0x6,%%xmm2 \n" \
+ "packuswb %%xmm0,%%xmm0 \n" \
+ "packuswb %%xmm1,%%xmm1 \n" \
+ "packuswb %%xmm2,%%xmm2 \n"
// Convert 8 pixels: 8 VU and 8 Y
#define YVUTORGB \
- "movdqa %%xmm0,%%xmm1 \n" \
- "movdqa %%xmm0,%%xmm2 \n" \
- "pmaddubsw 128(%[kYuvConstants]),%%xmm0 \n" \
- "pmaddubsw 144(%[kYuvConstants]),%%xmm1 \n" \
- "pmaddubsw 160(%[kYuvConstants]),%%xmm2 \n" \
- "psubw 48(%[kYuvConstants]),%%xmm0 \n" \
- "psubw 64(%[kYuvConstants]),%%xmm1 \n" \
- "psubw 80(%[kYuvConstants]),%%xmm2 \n" \
- "movq (%[y_buf]),%%xmm3 \n" \
- "lea 0x8(%[y_buf]),%[y_buf] \n" \
- "punpcklbw %%xmm4,%%xmm3 \n" \
- "psubsw 96(%[kYuvConstants]),%%xmm3 \n" \
- "pmullw 112(%[kYuvConstants]),%%xmm3 \n" \
- "paddsw %%xmm3,%%xmm0 \n" \
- "paddsw %%xmm3,%%xmm1 \n" \
- "paddsw %%xmm3,%%xmm2 \n" \
- "psraw $0x6,%%xmm0 \n" \
- "psraw $0x6,%%xmm1 \n" \
- "psraw $0x6,%%xmm2 \n" \
- "packuswb %%xmm0,%%xmm0 \n" \
- "packuswb %%xmm1,%%xmm1 \n" \
- "packuswb %%xmm2,%%xmm2 \n" \
+ "movdqa %%xmm0,%%xmm1 \n" \
+ "movdqa %%xmm0,%%xmm2 \n" \
+ "pmaddubsw " MEMACCESS2(128, [kYuvConstants]) ",%%xmm0 \n" \
+ "pmaddubsw " MEMACCESS2(144, [kYuvConstants]) ",%%xmm1 \n" \
+ "pmaddubsw " MEMACCESS2(160, [kYuvConstants]) ",%%xmm2 \n" \
+ "psubw " MEMACCESS2(48, [kYuvConstants]) ",%%xmm0 \n" \
+ "psubw " MEMACCESS2(64, [kYuvConstants]) ",%%xmm1 \n" \
+ "psubw " MEMACCESS2(80, [kYuvConstants]) ",%%xmm2 \n" \
+ "movq " MEMACCESS([y_buf]) ",%%xmm3 \n" \
+ "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" \
+ "punpcklbw %%xmm4,%%xmm3 \n" \
+ "psubsw " MEMACCESS2(96, [kYuvConstants]) ",%%xmm3 \n" \
+ "pmullw " MEMACCESS2(112, [kYuvConstants]) ",%%xmm3 \n" \
+ "paddsw %%xmm3,%%xmm0 \n" \
+ "paddsw %%xmm3,%%xmm1 \n" \
+ "paddsw %%xmm3,%%xmm2 \n" \
+ "psraw $0x6,%%xmm0 \n" \
+ "psraw $0x6,%%xmm1 \n" \
+ "psraw $0x6,%%xmm2 \n" \
+ "packuswb %%xmm0,%%xmm0 \n" \
+ "packuswb %%xmm1,%%xmm1 \n" \
+ "packuswb %%xmm2,%%xmm2 \n"
void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
const uint8* u_buf,
@@ -2005,7 +2261,7 @@ void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
"sub %[u_buf],%[v_buf] \n"
"pcmpeqb %%xmm5,%%xmm5 \n"
"pxor %%xmm4,%%xmm4 \n"
- ".p2align 4 \n"
+ ".p2align 2 \n"
"1: \n"
READYUV444
YUVTORGB
@@ -2014,9 +2270,9 @@ void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
"movdqa %%xmm0,%%xmm1 \n"
"punpcklwd %%xmm2,%%xmm0 \n"
"punpckhwd %%xmm2,%%xmm1 \n"
- "movdqa %%xmm0,(%[dst_argb]) \n"
- "movdqa %%xmm1,0x10(%[dst_argb]) \n"
- "lea 0x20(%[dst_argb]),%[dst_argb] \n"
+ "movdqa %%xmm0," MEMACCESS([dst_argb]) " \n"
+ "movdqa %%xmm1," MEMACCESS2(0x10,[dst_argb]) " \n"
+ "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
"sub $0x8,%[width] \n"
"jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
@@ -2026,6 +2282,9 @@ void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
[width]"+rm"(width) // %[width]
: [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
: "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14"
+#endif
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
@@ -2053,7 +2312,7 @@ void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf,
#endif
"sub %[u_buf],%[v_buf] \n"
"pxor %%xmm4,%%xmm4 \n"
- ".p2align 4 \n"
+ ".p2align 2 \n"
"1: \n"
READYUV422
YUVTORGB
@@ -2065,9 +2324,9 @@ void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf,
"pshufb %%xmm5,%%xmm0 \n"
"pshufb %%xmm6,%%xmm1 \n"
"palignr $0xc,%%xmm0,%%xmm1 \n"
- "movq %%xmm0,(%[dst_rgb24]) \n"
- "movdqu %%xmm1,0x8(%[dst_rgb24]) \n"
- "lea 0x18(%[dst_rgb24]),%[dst_rgb24] \n"
+ "movq %%xmm0," MEMACCESS([dst_rgb24]) "\n"
+ "movdqu %%xmm1," MEMACCESS2(0x8,[dst_rgb24]) "\n"
+ "lea " MEMLEA(0x18,[dst_rgb24]) ",%[dst_rgb24] \n"
"sub $0x8,%[width] \n"
"jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
@@ -2081,6 +2340,9 @@ void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf,
[kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24)
#endif
: "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14"
+#endif
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
#endif
@@ -2108,7 +2370,7 @@ void OMITFP I422ToRAWRow_SSSE3(const uint8* y_buf,
#endif
"sub %[u_buf],%[v_buf] \n"
"pxor %%xmm4,%%xmm4 \n"
- ".p2align 4 \n"
+ ".p2align 2 \n"
"1: \n"
READYUV422
YUVTORGB
@@ -2120,9 +2382,9 @@ void OMITFP I422ToRAWRow_SSSE3(const uint8* y_buf,
"pshufb %%xmm5,%%xmm0 \n"
"pshufb %%xmm6,%%xmm1 \n"
"palignr $0xc,%%xmm0,%%xmm1 \n"
- "movq %%xmm0,(%[dst_raw]) \n"
- "movdqu %%xmm1,0x8(%[dst_raw]) \n"
- "lea 0x18(%[dst_raw]),%[dst_raw] \n"
+ "movq %%xmm0," MEMACCESS([dst_raw]) " \n"
+ "movdqu %%xmm1," MEMACCESS2(0x8,[dst_raw]) "\n"
+ "lea " MEMLEA(0x18,[dst_raw]) ",%[dst_raw] \n"
"sub $0x8,%[width] \n"
"jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
@@ -2136,6 +2398,9 @@ void OMITFP I422ToRAWRow_SSSE3(const uint8* y_buf,
[kShuffleMaskARGBToRAW]"m"(kShuffleMaskARGBToRAW)
#endif
: "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14"
+#endif
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
#endif
@@ -2151,7 +2416,7 @@ void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
"sub %[u_buf],%[v_buf] \n"
"pcmpeqb %%xmm5,%%xmm5 \n"
"pxor %%xmm4,%%xmm4 \n"
- ".p2align 4 \n"
+ ".p2align 2 \n"
"1: \n"
READYUV422
YUVTORGB
@@ -2160,9 +2425,9 @@ void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
"movdqa %%xmm0,%%xmm1 \n"
"punpcklwd %%xmm2,%%xmm0 \n"
"punpckhwd %%xmm2,%%xmm1 \n"
- "movdqa %%xmm0,(%[dst_argb]) \n"
- "movdqa %%xmm1,0x10(%[dst_argb]) \n"
- "lea 0x20(%[dst_argb]),%[dst_argb] \n"
+ "movdqa %%xmm0," MEMACCESS([dst_argb]) "\n"
+ "movdqa %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
+ "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
"sub $0x8,%[width] \n"
"jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
@@ -2172,6 +2437,9 @@ void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
[width]"+rm"(width) // %[width]
: [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
: "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14"
+#endif
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
@@ -2187,7 +2455,7 @@ void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf,
"sub %[u_buf],%[v_buf] \n"
"pcmpeqb %%xmm5,%%xmm5 \n"
"pxor %%xmm4,%%xmm4 \n"
- ".p2align 4 \n"
+ ".p2align 2 \n"
"1: \n"
READYUV411
YUVTORGB
@@ -2196,9 +2464,9 @@ void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf,
"movdqa %%xmm0,%%xmm1 \n"
"punpcklwd %%xmm2,%%xmm0 \n"
"punpckhwd %%xmm2,%%xmm1 \n"
- "movdqa %%xmm0,(%[dst_argb]) \n"
- "movdqa %%xmm1,0x10(%[dst_argb]) \n"
- "lea 0x20(%[dst_argb]),%[dst_argb] \n"
+ "movdqa %%xmm0," MEMACCESS([dst_argb]) "\n"
+ "movdqa %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
+ "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
"sub $0x8,%[width] \n"
"jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
@@ -2208,6 +2476,9 @@ void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf,
[width]"+rm"(width) // %[width]
: [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
: "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14"
+#endif
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
@@ -2221,7 +2492,7 @@ void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf,
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"pxor %%xmm4,%%xmm4 \n"
- ".p2align 4 \n"
+ ".p2align 2 \n"
"1: \n"
READNV12
YUVTORGB
@@ -2230,9 +2501,9 @@ void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf,
"movdqa %%xmm0,%%xmm1 \n"
"punpcklwd %%xmm2,%%xmm0 \n"
"punpckhwd %%xmm2,%%xmm1 \n"
- "movdqa %%xmm0,(%[dst_argb]) \n"
- "movdqa %%xmm1,0x10(%[dst_argb]) \n"
- "lea 0x20(%[dst_argb]),%[dst_argb] \n"
+ "movdqa %%xmm0," MEMACCESS([dst_argb]) "\n"
+ "movdqa %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
+ "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
"sub $0x8,%[width] \n"
"jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
@@ -2241,6 +2512,7 @@ void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf,
[width]"+rm"(width) // %[width]
: [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
: "memory", "cc"
+ // Does not use r14.
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
@@ -2254,7 +2526,7 @@ void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf,
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"pxor %%xmm4,%%xmm4 \n"
- ".p2align 4 \n"
+ ".p2align 2 \n"
"1: \n"
READNV12
YVUTORGB
@@ -2263,9 +2535,9 @@ void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf,
"movdqa %%xmm0,%%xmm1 \n"
"punpcklwd %%xmm2,%%xmm0 \n"
"punpckhwd %%xmm2,%%xmm1 \n"
- "movdqa %%xmm0,(%[dst_argb]) \n"
- "movdqa %%xmm1,0x10(%[dst_argb]) \n"
- "lea 0x20(%[dst_argb]),%[dst_argb] \n"
+ "movdqa %%xmm0," MEMACCESS([dst_argb]) "\n"
+ "movdqa %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
+ "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
"sub $0x8,%[width] \n"
"jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
@@ -2274,6 +2546,7 @@ void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf,
[width]"+rm"(width) // %[width]
: [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
: "memory", "cc"
+ // Does not use r14.
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
@@ -2289,7 +2562,7 @@ void OMITFP I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
"sub %[u_buf],%[v_buf] \n"
"pcmpeqb %%xmm5,%%xmm5 \n"
"pxor %%xmm4,%%xmm4 \n"
- ".p2align 4 \n"
+ ".p2align 2 \n"
"1: \n"
READYUV444
YUVTORGB
@@ -2298,9 +2571,9 @@ void OMITFP I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
"movdqa %%xmm0,%%xmm1 \n"
"punpcklwd %%xmm2,%%xmm0 \n"
"punpckhwd %%xmm2,%%xmm1 \n"
- "movdqu %%xmm0,(%[dst_argb]) \n"
- "movdqu %%xmm1,0x10(%[dst_argb]) \n"
- "lea 0x20(%[dst_argb]),%[dst_argb] \n"
+ "movdqu %%xmm0," MEMACCESS([dst_argb]) "\n"
+ "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
+ "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
"sub $0x8,%[width] \n"
"jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
@@ -2310,6 +2583,9 @@ void OMITFP I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
[width]"+rm"(width) // %[width]
: [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
: "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14"
+#endif
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
@@ -2325,7 +2601,7 @@ void OMITFP I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
"sub %[u_buf],%[v_buf] \n"
"pcmpeqb %%xmm5,%%xmm5 \n"
"pxor %%xmm4,%%xmm4 \n"
- ".p2align 4 \n"
+ ".p2align 2 \n"
"1: \n"
READYUV422
YUVTORGB
@@ -2334,9 +2610,9 @@ void OMITFP I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
"movdqa %%xmm0,%%xmm1 \n"
"punpcklwd %%xmm2,%%xmm0 \n"
"punpckhwd %%xmm2,%%xmm1 \n"
- "movdqu %%xmm0,(%[dst_argb]) \n"
- "movdqu %%xmm1,0x10(%[dst_argb]) \n"
- "lea 0x20(%[dst_argb]),%[dst_argb] \n"
+ "movdqu %%xmm0," MEMACCESS([dst_argb]) "\n"
+ "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
+ "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
"sub $0x8,%[width] \n"
"jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
@@ -2346,6 +2622,9 @@ void OMITFP I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
[width]"+rm"(width) // %[width]
: [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
: "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14"
+#endif
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
@@ -2361,7 +2640,7 @@ void OMITFP I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
"sub %[u_buf],%[v_buf] \n"
"pcmpeqb %%xmm5,%%xmm5 \n"
"pxor %%xmm4,%%xmm4 \n"
- ".p2align 4 \n"
+ ".p2align 2 \n"
"1: \n"
READYUV411
YUVTORGB
@@ -2370,9 +2649,9 @@ void OMITFP I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
"movdqa %%xmm0,%%xmm1 \n"
"punpcklwd %%xmm2,%%xmm0 \n"
"punpckhwd %%xmm2,%%xmm1 \n"
- "movdqu %%xmm0,(%[dst_argb]) \n"
- "movdqu %%xmm1,0x10(%[dst_argb]) \n"
- "lea 0x20(%[dst_argb]),%[dst_argb] \n"
+ "movdqu %%xmm0," MEMACCESS([dst_argb]) "\n"
+ "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
+ "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
"sub $0x8,%[width] \n"
"jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
@@ -2382,6 +2661,9 @@ void OMITFP I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
[width]"+rm"(width) // %[width]
: [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
: "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14"
+#endif
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
@@ -2395,7 +2677,7 @@ void OMITFP NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"pxor %%xmm4,%%xmm4 \n"
- ".p2align 4 \n"
+ ".p2align 2 \n"
"1: \n"
READNV12
YUVTORGB
@@ -2404,9 +2686,9 @@ void OMITFP NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
"movdqa %%xmm0,%%xmm1 \n"
"punpcklwd %%xmm2,%%xmm0 \n"
"punpckhwd %%xmm2,%%xmm1 \n"
- "movdqu %%xmm0,(%[dst_argb]) \n"
- "movdqu %%xmm1,0x10(%[dst_argb]) \n"
- "lea 0x20(%[dst_argb]),%[dst_argb] \n"
+ "movdqu %%xmm0," MEMACCESS([dst_argb]) "\n"
+ "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
+ "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
"sub $0x8,%[width] \n"
"jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
@@ -2415,6 +2697,7 @@ void OMITFP NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
[width]"+rm"(width) // %[width]
: [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
: "memory", "cc"
+ // Does not use r14.
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
@@ -2428,7 +2711,7 @@ void OMITFP NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"pxor %%xmm4,%%xmm4 \n"
- ".p2align 4 \n"
+ ".p2align 2 \n"
"1: \n"
READNV12
YVUTORGB
@@ -2437,9 +2720,9 @@ void OMITFP NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
"movdqa %%xmm0,%%xmm1 \n"
"punpcklwd %%xmm2,%%xmm0 \n"
"punpckhwd %%xmm2,%%xmm1 \n"
- "movdqu %%xmm0,(%[dst_argb]) \n"
- "movdqu %%xmm1,0x10(%[dst_argb]) \n"
- "lea 0x20(%[dst_argb]),%[dst_argb] \n"
+ "movdqu %%xmm0," MEMACCESS([dst_argb]) "\n"
+ "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
+ "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
"sub $0x8,%[width] \n"
"jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
@@ -2448,6 +2731,7 @@ void OMITFP NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
[width]"+rm"(width) // %[width]
: [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
: "memory", "cc"
+ // Does not use r14.
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
@@ -2463,7 +2747,7 @@ void OMITFP I422ToBGRARow_SSSE3(const uint8* y_buf,
"sub %[u_buf],%[v_buf] \n"
"pcmpeqb %%xmm5,%%xmm5 \n"
"pxor %%xmm4,%%xmm4 \n"
- ".p2align 4 \n"
+ ".p2align 2 \n"
"1: \n"
READYUV422
YUVTORGB
@@ -2473,9 +2757,9 @@ void OMITFP I422ToBGRARow_SSSE3(const uint8* y_buf,
"movdqa %%xmm5,%%xmm0 \n"
"punpcklwd %%xmm1,%%xmm5 \n"
"punpckhwd %%xmm1,%%xmm0 \n"
- "movdqa %%xmm5,(%[dst_bgra]) \n"
- "movdqa %%xmm0,0x10(%[dst_bgra]) \n"
- "lea 0x20(%[dst_bgra]),%[dst_bgra] \n"
+ "movdqa %%xmm5," MEMACCESS([dst_bgra]) "\n"
+ "movdqa %%xmm0," MEMACCESS2(0x10,[dst_bgra]) "\n"
+ "lea " MEMLEA(0x20,[dst_bgra]) ",%[dst_bgra] \n"
"sub $0x8,%[width] \n"
"jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
@@ -2485,6 +2769,9 @@ void OMITFP I422ToBGRARow_SSSE3(const uint8* y_buf,
[width]"+rm"(width) // %[width]
: [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
: "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14"
+#endif
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
@@ -2500,7 +2787,7 @@ void OMITFP I422ToABGRRow_SSSE3(const uint8* y_buf,
"sub %[u_buf],%[v_buf] \n"
"pcmpeqb %%xmm5,%%xmm5 \n"
"pxor %%xmm4,%%xmm4 \n"
- ".p2align 4 \n"
+ ".p2align 2 \n"
"1: \n"
READYUV422
YUVTORGB
@@ -2509,9 +2796,9 @@ void OMITFP I422ToABGRRow_SSSE3(const uint8* y_buf,
"movdqa %%xmm2,%%xmm1 \n"
"punpcklwd %%xmm0,%%xmm2 \n"
"punpckhwd %%xmm0,%%xmm1 \n"
- "movdqa %%xmm2,(%[dst_abgr]) \n"
- "movdqa %%xmm1,0x10(%[dst_abgr]) \n"
- "lea 0x20(%[dst_abgr]),%[dst_abgr] \n"
+ "movdqa %%xmm2," MEMACCESS([dst_abgr]) "\n"
+ "movdqa %%xmm1," MEMACCESS2(0x10,[dst_abgr]) "\n"
+ "lea " MEMLEA(0x20,[dst_abgr]) ",%[dst_abgr] \n"
"sub $0x8,%[width] \n"
"jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
@@ -2521,6 +2808,9 @@ void OMITFP I422ToABGRRow_SSSE3(const uint8* y_buf,
[width]"+rm"(width) // %[width]
: [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
: "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14"
+#endif
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
@@ -2536,7 +2826,7 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf,
"sub %[u_buf],%[v_buf] \n"
"pcmpeqb %%xmm5,%%xmm5 \n"
"pxor %%xmm4,%%xmm4 \n"
- ".p2align 4 \n"
+ ".p2align 2 \n"
"1: \n"
READYUV422
YUVTORGB
@@ -2546,9 +2836,9 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf,
"movdqa %%xmm5,%%xmm0 \n"
"punpcklwd %%xmm1,%%xmm5 \n"
"punpckhwd %%xmm1,%%xmm0 \n"
- "movdqa %%xmm5,(%[dst_rgba]) \n"
- "movdqa %%xmm0,0x10(%[dst_rgba]) \n"
- "lea 0x20(%[dst_rgba]),%[dst_rgba] \n"
+ "movdqa %%xmm5," MEMACCESS([dst_rgba]) "\n"
+ "movdqa %%xmm0," MEMACCESS2(0x10,[dst_rgba]) "\n"
+ "lea " MEMLEA(0x20,[dst_rgba]) ",%[dst_rgba] \n"
"sub $0x8,%[width] \n"
"jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
@@ -2558,6 +2848,9 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf,
[width]"+rm"(width) // %[width]
: [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
: "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14"
+#endif
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
@@ -2573,7 +2866,7 @@ void OMITFP I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
"sub %[u_buf],%[v_buf] \n"
"pcmpeqb %%xmm5,%%xmm5 \n"
"pxor %%xmm4,%%xmm4 \n"
- ".p2align 4 \n"
+ ".p2align 2 \n"
"1: \n"
READYUV422
YUVTORGB
@@ -2583,9 +2876,9 @@ void OMITFP I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
"movdqa %%xmm5,%%xmm0 \n"
"punpcklwd %%xmm1,%%xmm5 \n"
"punpckhwd %%xmm1,%%xmm0 \n"
- "movdqu %%xmm5,(%[dst_bgra]) \n"
- "movdqu %%xmm0,0x10(%[dst_bgra]) \n"
- "lea 0x20(%[dst_bgra]),%[dst_bgra] \n"
+ "movdqu %%xmm5," MEMACCESS([dst_bgra]) "\n"
+ "movdqu %%xmm0," MEMACCESS2(0x10,[dst_bgra]) "\n"
+ "lea " MEMLEA(0x20,[dst_bgra]) ",%[dst_bgra] \n"
"sub $0x8,%[width] \n"
"jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
@@ -2595,6 +2888,9 @@ void OMITFP I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
[width]"+rm"(width) // %[width]
: [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
: "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14"
+#endif
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
@@ -2610,7 +2906,7 @@ void OMITFP I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
"sub %[u_buf],%[v_buf] \n"
"pcmpeqb %%xmm5,%%xmm5 \n"
"pxor %%xmm4,%%xmm4 \n"
- ".p2align 4 \n"
+ ".p2align 2 \n"
"1: \n"
READYUV422
YUVTORGB
@@ -2619,9 +2915,9 @@ void OMITFP I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
"movdqa %%xmm2,%%xmm1 \n"
"punpcklwd %%xmm0,%%xmm2 \n"
"punpckhwd %%xmm0,%%xmm1 \n"
- "movdqu %%xmm2,(%[dst_abgr]) \n"
- "movdqu %%xmm1,0x10(%[dst_abgr]) \n"
- "lea 0x20(%[dst_abgr]),%[dst_abgr] \n"
+ "movdqu %%xmm2," MEMACCESS([dst_abgr]) "\n"
+ "movdqu %%xmm1," MEMACCESS2(0x10,[dst_abgr]) "\n"
+ "lea " MEMLEA(0x20,[dst_abgr]) ",%[dst_abgr] \n"
"sub $0x8,%[width] \n"
"jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
@@ -2631,6 +2927,9 @@ void OMITFP I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
[width]"+rm"(width) // %[width]
: [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
: "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14"
+#endif
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
@@ -2646,7 +2945,7 @@ void OMITFP I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf,
"sub %[u_buf],%[v_buf] \n"
"pcmpeqb %%xmm5,%%xmm5 \n"
"pxor %%xmm4,%%xmm4 \n"
- ".p2align 4 \n"
+ ".p2align 2 \n"
"1: \n"
READYUV422
YUVTORGB
@@ -2656,9 +2955,9 @@ void OMITFP I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf,
"movdqa %%xmm5,%%xmm0 \n"
"punpcklwd %%xmm1,%%xmm5 \n"
"punpckhwd %%xmm1,%%xmm0 \n"
- "movdqa %%xmm5,(%[dst_rgba]) \n"
- "movdqa %%xmm0,0x10(%[dst_rgba]) \n"
- "lea 0x20(%[dst_rgba]),%[dst_rgba] \n"
+ "movdqu %%xmm5," MEMACCESS([dst_rgba]) "\n"
+ "movdqu %%xmm0," MEMACCESS2(0x10,[dst_rgba]) "\n"
+ "lea " MEMLEA(0x20,[dst_rgba]) ",%[dst_rgba] \n"
"sub $0x8,%[width] \n"
"jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
@@ -2668,6 +2967,9 @@ void OMITFP I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf,
[width]"+rm"(width) // %[width]
: [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
: "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14"
+#endif
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
@@ -2690,11 +2992,12 @@ void YToARGBRow_SSE2(const uint8* y_buf,
"mov $0x004a004a,%%eax \n"
"movd %%eax,%%xmm2 \n"
"pshufd $0x0,%%xmm2,%%xmm2 \n"
- ".p2align 4 \n"
+ ".p2align 2 \n"
+ BUNDLEALIGN
"1: \n"
// Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
- "movq (%0),%%xmm0 \n"
- "lea 0x8(%0),%0 \n"
+ "movq " MEMACCESS(0) ",%%xmm0 \n"
+ "lea " MEMLEA(0x8,0) ",%0 \n"
"punpcklbw %%xmm5,%%xmm0 \n"
"psubusw %%xmm3,%%xmm0 \n"
"pmullw %%xmm2,%%xmm0 \n"
@@ -2708,9 +3011,9 @@ void YToARGBRow_SSE2(const uint8* y_buf,
"punpckhwd %%xmm1,%%xmm1 \n"
"por %%xmm4,%%xmm0 \n"
"por %%xmm4,%%xmm1 \n"
- "movdqa %%xmm0,(%1) \n"
- "movdqa %%xmm1,16(%1) \n"
- "lea 32(%1),%1 \n"
+ "movdqa %%xmm0," MEMACCESS(1) " \n"
+ "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
"sub $0x8,%2 \n"
"jg 1b \n"
@@ -2728,7 +3031,7 @@ void YToARGBRow_SSE2(const uint8* y_buf,
#ifdef HAS_MIRRORROW_SSSE3
// Shuffle table for reversing the bytes.
-CONST uvec8 kShuffleMirror = {
+static uvec8 kShuffleMirror = {
15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
};
@@ -2736,20 +3039,24 @@ void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
intptr_t temp_width = static_cast<intptr_t>(width);
asm volatile (
"movdqa %3,%%xmm5 \n"
- "lea -0x10(%0),%0 \n"
- ".p2align 4 \n"
+ "lea " MEMLEA(-0x10,0) ",%0 \n"
+ ".p2align 2 \n"
+ BUNDLEALIGN
"1: \n"
- "movdqa (%0,%2),%%xmm0 \n"
+ MEMOPREG(movdqa,0x00,0,2,1,xmm0) // movdqa (%0,%2),%%xmm0
"pshufb %%xmm5,%%xmm0 \n"
"sub $0x10,%2 \n"
- "movdqa %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
+ "movdqa %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
"jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(temp_width) // %2
: "m"(kShuffleMirror) // %3
: "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14"
+#endif
#if defined(__SSE2__)
, "xmm0", "xmm5"
#endif
@@ -2761,10 +3068,11 @@ void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
intptr_t temp_width = static_cast<intptr_t>(width);
asm volatile (
- "lea -0x10(%0),%0 \n"
- ".p2align 4 \n"
+ "lea " MEMLEA(-0x10,0) ",%0 \n"
+ ".p2align 2 \n"
+ BUNDLEALIGN
"1: \n"
- "movdqu (%0,%2),%%xmm0 \n"
+ MEMOPREG(movdqu,0x00,0,2,1,xmm0) // movdqu (%0,%2),%%xmm0
"movdqa %%xmm0,%%xmm1 \n"
"psllw $0x8,%%xmm0 \n"
"psrlw $0x8,%%xmm1 \n"
@@ -2773,14 +3081,17 @@ void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
"pshufhw $0x1b,%%xmm0,%%xmm0 \n"
"pshufd $0x4e,%%xmm0,%%xmm0 \n"
"sub $0x10,%2 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1)",%1 \n"
"jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(temp_width) // %2
:
: "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14"
+#endif
#if defined(__SSE2__)
, "xmm0", "xmm1"
#endif
@@ -2790,7 +3101,7 @@ void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
#ifdef HAS_MIRRORROW_UV_SSSE3
// Shuffle table for reversing the bytes of UV channels.
-CONST uvec8 kShuffleMirrorUV = {
+static uvec8 kShuffleMirrorUV = {
14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
};
void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
@@ -2798,17 +3109,19 @@ void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
intptr_t temp_width = static_cast<intptr_t>(width);
asm volatile (
"movdqa %4,%%xmm1 \n"
- "lea -16(%0,%3,2),%0 \n"
+ "lea " MEMLEA4(-0x10,0,3,2) ",%0 \n"
"sub %1,%2 \n"
- ".p2align 4 \n"
+ ".p2align 2 \n"
+ BUNDLEALIGN
"1: \n"
- "movdqa (%0),%%xmm0 \n"
- "lea -16(%0),%0 \n"
+ "movdqa " MEMACCESS(0) ",%%xmm0 \n"
+ "lea " MEMLEA(-0x10,0) ",%0 \n"
"pshufb %%xmm1,%%xmm0 \n"
"sub $8,%3 \n"
- "movlpd %%xmm0,(%1) \n"
- "movhpd %%xmm0,(%1,%2) \n"
- "lea 8(%1),%1 \n"
+ "movlpd %%xmm0," MEMACCESS(1) " \n"
+ BUNDLEALIGN
+ MEMOPMEM(movhpd,xmm0,0x00,1,2,1) // movhpd %%xmm0,(%1,%2)
+ "lea " MEMLEA(0x8,1) ",%1 \n"
"jg 1b \n"
: "+r"(src), // %0
"+r"(dst_u), // %1
@@ -2816,6 +3129,9 @@ void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
"+r"(temp_width) // %3
: "m"(kShuffleMirrorUV) // %4
: "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14"
+#endif
#if defined(__SSE2__)
, "xmm0", "xmm1"
#endif
@@ -2825,22 +3141,23 @@ void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
#ifdef HAS_ARGBMIRRORROW_SSSE3
// Shuffle table for reversing the ARGB pixels, 4 bytes at a time.
-CONST uvec8 kARGBShuffleMirror = {
+static uvec8 kARGBShuffleMirror = {
12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u, 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u
};
void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
intptr_t temp_width = static_cast<intptr_t>(width);
asm volatile (
+ "lea " MEMLEA4(-0x10,0,2,4) ",%0 \n"
"movdqa %3,%%xmm5 \n"
- "lea -0x10(%0),%0 \n"
- ".p2align 4 \n"
+ ".p2align 2 \n"
"1: \n"
- "movdqa (%0,%2,4),%%xmm0 \n"
+ "movdqa " MEMACCESS(0) ",%%xmm0 \n"
"pshufb %%xmm5,%%xmm0 \n"
+ "lea " MEMLEA(-0x10,0) ",%0 \n"
"sub $0x4,%2 \n"
- "movdqa %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
+ "movdqa %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
"jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
@@ -2860,11 +3177,12 @@ void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
"sub %1,%2 \n"
- ".p2align 4 \n"
+ ".p2align 2 \n"
+ BUNDLEALIGN
"1: \n"
- "movdqa (%0),%%xmm0 \n"
- "movdqa 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
+ "movdqa " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
"movdqa %%xmm0,%%xmm2 \n"
"movdqa %%xmm1,%%xmm3 \n"
"pand %%xmm5,%%xmm0 \n"
@@ -2873,9 +3191,9 @@ void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
"psrlw $0x8,%%xmm2 \n"
"psrlw $0x8,%%xmm3 \n"
"packuswb %%xmm3,%%xmm2 \n"
- "movdqa %%xmm0,(%1) \n"
- "movdqa %%xmm2,(%1,%2) \n"
- "lea 0x10(%1),%1 \n"
+ "movdqa %%xmm0," MEMACCESS(1) " \n"
+ MEMOPMEM(movdqa,xmm2,0x00,1,2,1) // movdqa %%xmm2,(%1,%2)
+ "lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x10,%3 \n"
"jg 1b \n"
: "+r"(src_uv), // %0
@@ -2884,6 +3202,9 @@ void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
"+r"(pix) // %3
:
: "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14"
+#endif
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
@@ -2896,11 +3217,12 @@ void SplitUVRow_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
"sub %1,%2 \n"
- ".p2align 4 \n"
+ ".p2align 2 \n"
+ BUNDLEALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
"movdqa %%xmm0,%%xmm2 \n"
"movdqa %%xmm1,%%xmm3 \n"
"pand %%xmm5,%%xmm0 \n"
@@ -2909,9 +3231,9 @@ void SplitUVRow_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
"psrlw $0x8,%%xmm2 \n"
"psrlw $0x8,%%xmm3 \n"
"packuswb %%xmm3,%%xmm2 \n"
- "movdqu %%xmm0,(%1) \n"
- "movdqu %%xmm2,(%1,%2) \n"
- "lea 0x10(%1),%1 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ MEMOPMEM(movdqu,xmm2,0x00,1,2,1) // movdqu %%xmm2,(%1,%2)
+ "lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x10,%3 \n"
"jg 1b \n"
: "+r"(src_uv), // %0
@@ -2920,6 +3242,9 @@ void SplitUVRow_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
"+r"(pix) // %3
:
: "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14"
+#endif
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
@@ -2932,17 +3257,18 @@ void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
int width) {
asm volatile (
"sub %0,%1 \n"
- ".p2align 4 \n"
+ ".p2align 2 \n"
+ BUNDLEALIGN
"1: \n"
- "movdqa (%0),%%xmm0 \n"
- "movdqa (%0,%1,1),%%xmm1 \n"
- "lea 0x10(%0),%0 \n"
+ "movdqa " MEMACCESS(0) ",%%xmm0 \n"
+ MEMOPREG(movdqa,0x00,0,1,1,xmm1) // movdqa (%0,%1,1),%%xmm1
+ "lea " MEMLEA(0x10,0) ",%0 \n"
"movdqa %%xmm0,%%xmm2 \n"
"punpcklbw %%xmm1,%%xmm0 \n"
"punpckhbw %%xmm1,%%xmm2 \n"
- "movdqa %%xmm0,(%2) \n"
- "movdqa %%xmm2,0x10(%2) \n"
- "lea 0x20(%2),%2 \n"
+ "movdqa %%xmm0," MEMACCESS(2) " \n"
+ "movdqa %%xmm2," MEMACCESS2(0x10,2) " \n"
+ "lea " MEMLEA(0x20,2) ",%2 \n"
"sub $0x10,%3 \n"
"jg 1b \n"
: "+r"(src_u), // %0
@@ -2951,6 +3277,9 @@ void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
"+r"(width) // %3
:
: "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14"
+#endif
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2"
#endif
@@ -2961,17 +3290,18 @@ void MergeUVRow_Unaligned_SSE2(const uint8* src_u, const uint8* src_v,
uint8* dst_uv, int width) {
asm volatile (
"sub %0,%1 \n"
- ".p2align 4 \n"
+ ".p2align 2 \n"
+ BUNDLEALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu (%0,%1,1),%%xmm1 \n"
- "lea 0x10(%0),%0 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1
+ "lea " MEMLEA(0x10,0) ",%0 \n"
"movdqa %%xmm0,%%xmm2 \n"
"punpcklbw %%xmm1,%%xmm0 \n"
"punpckhbw %%xmm1,%%xmm2 \n"
- "movdqu %%xmm0,(%2) \n"
- "movdqu %%xmm2,0x10(%2) \n"
- "lea 0x20(%2),%2 \n"
+ "movdqu %%xmm0," MEMACCESS(2) " \n"
+ "movdqu %%xmm2," MEMACCESS2(0x10,2) " \n"
+ "lea " MEMLEA(0x20,2) ",%2 \n"
"sub $0x10,%3 \n"
"jg 1b \n"
: "+r"(src_u), // %0
@@ -2980,6 +3310,9 @@ void MergeUVRow_Unaligned_SSE2(const uint8* src_u, const uint8* src_v,
"+r"(width) // %3
:
: "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14"
+#endif
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2"
#endif
@@ -2990,14 +3323,14 @@ void MergeUVRow_Unaligned_SSE2(const uint8* src_u, const uint8* src_v,
#ifdef HAS_COPYROW_SSE2
void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
asm volatile (
- "sub %0,%1 \n"
- ".p2align 4 \n"
+ ".p2align 2 \n"
"1: \n"
- "movdqa (%0),%%xmm0 \n"
- "movdqa 0x10(%0),%%xmm1 \n"
- "movdqa %%xmm0,(%0,%1) \n"
- "movdqa %%xmm1,0x10(%0,%1) \n"
- "lea 0x20(%0),%0 \n"
+ "movdqa " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "movdqa %%xmm0," MEMACCESS(1) " \n"
+ "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
"sub $0x20,%2 \n"
"jg 1b \n"
: "+r"(src), // %0
@@ -3017,7 +3350,7 @@ void CopyRow_X86(const uint8* src, uint8* dst, int width) {
size_t width_tmp = static_cast<size_t>(width);
asm volatile (
"shr $0x2,%2 \n"
- "rep movsl \n"
+ "rep movsl " MEMMOVESTRING(0,1) " \n"
: "+S"(src), // %0
"+D"(dst), // %1
"+c"(width_tmp) // %2
@@ -3027,11 +3360,12 @@ void CopyRow_X86(const uint8* src, uint8* dst, int width) {
}
#endif // HAS_COPYROW_X86
+#ifdef HAS_COPYROW_ERMS
// Unaligned Multiple of 1.
void CopyRow_ERMS(const uint8* src, uint8* dst, int width) {
size_t width_tmp = static_cast<size_t>(width);
asm volatile (
- "rep movsb \n"
+ "rep movsb " MEMMOVESTRING(0,1) " \n"
: "+S"(src), // %0
"+D"(dst), // %1
"+c"(width_tmp) // %2
@@ -3039,13 +3373,156 @@ void CopyRow_ERMS(const uint8* src, uint8* dst, int width) {
: "memory", "cc"
);
}
+#endif // HAS_COPYROW_ERMS
+
+#ifdef HAS_ARGBCOPYALPHAROW_SSE2
+// width in pixels
+void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
+ asm volatile (
+ "pcmpeqb %%xmm0,%%xmm0 \n"
+ "pslld $0x18,%%xmm0 \n"
+ "pcmpeqb %%xmm1,%%xmm1 \n"
+ "psrld $0x8,%%xmm1 \n"
+ ".p2align 2 \n"
+ "1: \n"
+ "movdqa " MEMACCESS(0) ",%%xmm2 \n"
+ "movdqa " MEMACCESS2(0x10,0) ",%%xmm3 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "movdqa " MEMACCESS(1) ",%%xmm4 \n"
+ "movdqa " MEMACCESS2(0x10,1) ",%%xmm5 \n"
+ "pand %%xmm0,%%xmm2 \n"
+ "pand %%xmm0,%%xmm3 \n"
+ "pand %%xmm1,%%xmm4 \n"
+ "pand %%xmm1,%%xmm5 \n"
+ "por %%xmm4,%%xmm2 \n"
+ "por %%xmm5,%%xmm3 \n"
+ "movdqa %%xmm2," MEMACCESS(1) " \n"
+ "movdqa %%xmm3," MEMACCESS2(0x10,1) " \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+#endif // HAS_ARGBCOPYALPHAROW_SSE2
+
+#ifdef HAS_ARGBCOPYALPHAROW_AVX2
+// width in pixels
+void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
+ asm volatile (
+ "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpsrld $0x8,%%ymm0,%%ymm0 \n"
+ ".p2align 2 \n"
+ "1: \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm1 \n"
+ "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm2 \n"
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1 \n"
+ "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2 \n"
+ "vmovdqu %%ymm1," MEMACCESS(1) " \n"
+ "vmovdqu %%ymm2," MEMACCESS2(0x20,1) " \n"
+ "lea " MEMLEA(0x40,1) ",%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2"
+#endif
+ );
+}
+#endif // HAS_ARGBCOPYALPHAROW_AVX2
+
+#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
+// width in pixels
+void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
+ asm volatile (
+ "pcmpeqb %%xmm0,%%xmm0 \n"
+ "pslld $0x18,%%xmm0 \n"
+ "pcmpeqb %%xmm1,%%xmm1 \n"
+ "psrld $0x8,%%xmm1 \n"
+ ".p2align 2 \n"
+ "1: \n"
+ "movq " MEMACCESS(0) ",%%xmm2 \n"
+ "lea " MEMLEA(0x8,0) ",%0 \n"
+ "punpcklbw %%xmm2,%%xmm2 \n"
+ "punpckhwd %%xmm2,%%xmm3 \n"
+ "punpcklwd %%xmm2,%%xmm2 \n"
+ "movdqa " MEMACCESS(1) ",%%xmm4 \n"
+ "movdqa " MEMACCESS2(0x10,1) ",%%xmm5 \n"
+ "pand %%xmm0,%%xmm2 \n"
+ "pand %%xmm0,%%xmm3 \n"
+ "pand %%xmm1,%%xmm4 \n"
+ "pand %%xmm1,%%xmm5 \n"
+ "por %%xmm4,%%xmm2 \n"
+ "por %%xmm5,%%xmm3 \n"
+ "movdqa %%xmm2," MEMACCESS(1) " \n"
+ "movdqa %%xmm3," MEMACCESS2(0x10,1) " \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+#endif // HAS_ARGBCOPYYTOALPHAROW_SSE2
+
+#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
+// width in pixels
+void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
+ asm volatile (
+ "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpsrld $0x8,%%ymm0,%%ymm0 \n"
+ ".p2align 2 \n"
+ "1: \n"
+ "vpmovzxbd " MEMACCESS(0) ",%%ymm1 \n"
+ "vpmovzxbd " MEMACCESS2(0x8,0) ",%%ymm2 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "vpslld $0x18,%%ymm1,%%ymm1 \n"
+ "vpslld $0x18,%%ymm2,%%ymm2 \n"
+ "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1 \n"
+ "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2 \n"
+ "vmovdqu %%ymm1," MEMACCESS(1) " \n"
+ "vmovdqu %%ymm2," MEMACCESS2(0x20,1) " \n"
+ "lea " MEMLEA(0x40,1) ",%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2"
+#endif
+ );
+}
+#endif // HAS_ARGBCOPYYTOALPHAROW_AVX2
#ifdef HAS_SETROW_X86
void SetRow_X86(uint8* dst, uint32 v32, int width) {
size_t width_tmp = static_cast<size_t>(width);
asm volatile (
"shr $0x2,%1 \n"
- "rep stosl \n"
+ "rep stosl " MEMSTORESTRING(eax,0) " \n"
: "+D"(dst), // %0
"+c"(width_tmp) // %1
: "a"(v32) // %2
@@ -3058,7 +3535,7 @@ void ARGBSetRows_X86(uint8* dst, uint32 v32, int width,
size_t width_tmp = static_cast<size_t>(width);
uint32* d = reinterpret_cast<uint32*>(dst);
asm volatile (
- "rep stosl \n"
+ "rep stosl " MEMSTORESTRING(eax,0) " \n"
: "+D"(d), // %0
"+c"(width_tmp) // %1
: "a"(v32) // %2
@@ -3073,16 +3550,17 @@ void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) {
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
- ".p2align 4 \n"
+ ".p2align 2 \n"
+ BUNDLEALIGN
"1: \n"
- "movdqa (%0),%%xmm0 \n"
- "movdqa 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
+ "movdqa " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
"pand %%xmm5,%%xmm0 \n"
"pand %%xmm5,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
- "movdqa %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
+ "movdqa %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src_yuy2), // %0
@@ -3102,13 +3580,15 @@ void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
"sub %1,%2 \n"
- ".p2align 4 \n"
+ ".p2align 2 \n"
+ BUNDLEALIGN
"1: \n"
- "movdqa (%0),%%xmm0 \n"
- "movdqa 0x10(%0),%%xmm1 \n"
- "movdqa (%0,%4,1),%%xmm2 \n"
- "movdqa 0x10(%0,%4,1),%%xmm3 \n"
- "lea 0x20(%0),%0 \n"
+ "movdqa " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ BUNDLEALIGN
+ MEMOPREG(movdqa,0x00,0,4,1,xmm2) // movdqa (%0,%4,1),%%xmm2
+ MEMOPREG(movdqa,0x10,0,4,1,xmm3) // movdqa 0x10(%0,%4,1),%%xmm3
+ "lea " MEMLEA(0x20,0) ",%0 \n"
"pavgb %%xmm2,%%xmm0 \n"
"pavgb %%xmm3,%%xmm1 \n"
"psrlw $0x8,%%xmm0 \n"
@@ -3119,9 +3599,10 @@ void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
"packuswb %%xmm0,%%xmm0 \n"
"psrlw $0x8,%%xmm1 \n"
"packuswb %%xmm1,%%xmm1 \n"
- "movq %%xmm0,(%1) \n"
- "movq %%xmm1,(%1,%2) \n"
- "lea 0x8(%1),%1 \n"
+ "movq %%xmm0," MEMACCESS(1) " \n"
+ BUNDLEALIGN
+ MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
+ "lea " MEMLEA(0x8,1) ",%1 \n"
"sub $0x10,%3 \n"
"jg 1b \n"
: "+r"(src_yuy2), // %0
@@ -3130,6 +3611,9 @@ void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
"+r"(pix) // %3
: "r"(static_cast<intptr_t>(stride_yuy2)) // %4
: "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14"
+#endif
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
@@ -3142,11 +3626,12 @@ void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
"sub %1,%2 \n"
- ".p2align 4 \n"
+ ".p2align 2 \n"
+ BUNDLEALIGN
"1: \n"
- "movdqa (%0),%%xmm0 \n"
- "movdqa 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
+ "movdqa " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
"psrlw $0x8,%%xmm0 \n"
"psrlw $0x8,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
@@ -3155,9 +3640,10 @@ void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
"packuswb %%xmm0,%%xmm0 \n"
"psrlw $0x8,%%xmm1 \n"
"packuswb %%xmm1,%%xmm1 \n"
- "movq %%xmm0,(%1) \n"
- "movq %%xmm1,(%1,%2) \n"
- "lea 0x8(%1),%1 \n"
+ "movq %%xmm0," MEMACCESS(1) " \n"
+ BUNDLEALIGN
+ MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
+ "lea " MEMLEA(0x8,1) ",%1 \n"
"sub $0x10,%3 \n"
"jg 1b \n"
: "+r"(src_yuy2), // %0
@@ -3166,6 +3652,9 @@ void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
"+r"(pix) // %3
:
: "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14"
+#endif
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm5"
#endif
@@ -3177,17 +3666,18 @@ void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
asm volatile (
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
- ".p2align 4 \n"
+ ".p2align 2 \n"
+ BUNDLEALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
"pand %%xmm5,%%xmm0 \n"
"pand %%xmm5,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
"sub $0x10,%2 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
"jg 1b \n"
: "+r"(src_yuy2), // %0
"+r"(dst_y), // %1
@@ -3207,13 +3697,15 @@ void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2,
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
"sub %1,%2 \n"
- ".p2align 4 \n"
+ ".p2align 2 \n"
+ BUNDLEALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu (%0,%4,1),%%xmm2 \n"
- "movdqu 0x10(%0,%4,1),%%xmm3 \n"
- "lea 0x20(%0),%0 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ BUNDLEALIGN
+ MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2
+ MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3
+ "lea " MEMLEA(0x20,0) ",%0 \n"
"pavgb %%xmm2,%%xmm0 \n"
"pavgb %%xmm3,%%xmm1 \n"
"psrlw $0x8,%%xmm0 \n"
@@ -3224,9 +3716,10 @@ void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2,
"packuswb %%xmm0,%%xmm0 \n"
"psrlw $0x8,%%xmm1 \n"
"packuswb %%xmm1,%%xmm1 \n"
- "movq %%xmm0,(%1) \n"
- "movq %%xmm1,(%1,%2) \n"
- "lea 0x8(%1),%1 \n"
+ "movq %%xmm0," MEMACCESS(1) " \n"
+ BUNDLEALIGN
+ MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
+ "lea " MEMLEA(0x8,1) ",%1 \n"
"sub $0x10,%3 \n"
"jg 1b \n"
: "+r"(src_yuy2), // %0
@@ -3235,6 +3728,9 @@ void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2,
"+r"(pix) // %3
: "r"(static_cast<intptr_t>(stride_yuy2)) // %4
: "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14"
+#endif
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
@@ -3247,11 +3743,12 @@ void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2,
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
"sub %1,%2 \n"
- ".p2align 4 \n"
+ ".p2align 2 \n"
+ BUNDLEALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
"psrlw $0x8,%%xmm0 \n"
"psrlw $0x8,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
@@ -3260,9 +3757,10 @@ void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2,
"packuswb %%xmm0,%%xmm0 \n"
"psrlw $0x8,%%xmm1 \n"
"packuswb %%xmm1,%%xmm1 \n"
- "movq %%xmm0,(%1) \n"
- "movq %%xmm1,(%1,%2) \n"
- "lea 0x8(%1),%1 \n"
+ "movq %%xmm0," MEMACCESS(1) " \n"
+ BUNDLEALIGN
+ MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
+ "lea " MEMLEA(0x8,1) ",%1 \n"
"sub $0x10,%3 \n"
"jg 1b \n"
: "+r"(src_yuy2), // %0
@@ -3271,6 +3769,9 @@ void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2,
"+r"(pix) // %3
:
: "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14"
+#endif
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm5"
#endif
@@ -3279,17 +3780,18 @@ void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2,
void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) {
asm volatile (
- ".p2align 4 \n"
+ ".p2align 2 \n"
+ BUNDLEALIGN
"1: \n"
- "movdqa (%0),%%xmm0 \n"
- "movdqa 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
+ "movdqa " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
"psrlw $0x8,%%xmm0 \n"
"psrlw $0x8,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
"sub $0x10,%2 \n"
- "movdqa %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
+ "movdqa %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
"jg 1b \n"
: "+r"(src_uyvy), // %0
"+r"(dst_y), // %1
@@ -3308,13 +3810,15 @@ void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
"sub %1,%2 \n"
- ".p2align 4 \n"
+ ".p2align 2 \n"
+ BUNDLEALIGN
"1: \n"
- "movdqa (%0),%%xmm0 \n"
- "movdqa 0x10(%0),%%xmm1 \n"
- "movdqa (%0,%4,1),%%xmm2 \n"
- "movdqa 0x10(%0,%4,1),%%xmm3 \n"
- "lea 0x20(%0),%0 \n"
+ "movdqa " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ BUNDLEALIGN
+ MEMOPREG(movdqa,0x00,0,4,1,xmm2) // movdqa (%0,%4,1),%%xmm2
+ MEMOPREG(movdqa,0x10,0,4,1,xmm3) // movdqa 0x10(%0,%4,1),%%xmm3
+ "lea " MEMLEA(0x20,0) ",%0 \n"
"pavgb %%xmm2,%%xmm0 \n"
"pavgb %%xmm3,%%xmm1 \n"
"pand %%xmm5,%%xmm0 \n"
@@ -3325,9 +3829,10 @@ void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
"packuswb %%xmm0,%%xmm0 \n"
"psrlw $0x8,%%xmm1 \n"
"packuswb %%xmm1,%%xmm1 \n"
- "movq %%xmm0,(%1) \n"
- "movq %%xmm1,(%1,%2) \n"
- "lea 0x8(%1),%1 \n"
+ "movq %%xmm0," MEMACCESS(1) " \n"
+ BUNDLEALIGN
+ MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
+ "lea " MEMLEA(0x8,1) ",%1 \n"
"sub $0x10,%3 \n"
"jg 1b \n"
: "+r"(src_uyvy), // %0
@@ -3336,6 +3841,9 @@ void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
"+r"(pix) // %3
: "r"(static_cast<intptr_t>(stride_uyvy)) // %4
: "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14"
+#endif
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
@@ -3348,11 +3856,12 @@ void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
"sub %1,%2 \n"
- ".p2align 4 \n"
+ ".p2align 2 \n"
+ BUNDLEALIGN
"1: \n"
- "movdqa (%0),%%xmm0 \n"
- "movdqa 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
+ "movdqa " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
"pand %%xmm5,%%xmm0 \n"
"pand %%xmm5,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
@@ -3361,9 +3870,10 @@ void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
"packuswb %%xmm0,%%xmm0 \n"
"psrlw $0x8,%%xmm1 \n"
"packuswb %%xmm1,%%xmm1 \n"
- "movq %%xmm0,(%1) \n"
- "movq %%xmm1,(%1,%2) \n"
- "lea 0x8(%1),%1 \n"
+ "movq %%xmm0," MEMACCESS(1) " \n"
+ BUNDLEALIGN
+ MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
+ "lea " MEMLEA(0x8,1) ",%1 \n"
"sub $0x10,%3 \n"
"jg 1b \n"
: "+r"(src_uyvy), // %0
@@ -3372,6 +3882,9 @@ void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
"+r"(pix) // %3
:
: "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14"
+#endif
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm5"
#endif
@@ -3381,17 +3894,18 @@ void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
uint8* dst_y, int pix) {
asm volatile (
- ".p2align 4 \n"
+ ".p2align 2 \n"
+ BUNDLEALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
"psrlw $0x8,%%xmm0 \n"
"psrlw $0x8,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
"sub $0x10,%2 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
"jg 1b \n"
: "+r"(src_uyvy), // %0
"+r"(dst_y), // %1
@@ -3410,13 +3924,15 @@ void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
"sub %1,%2 \n"
- ".p2align 4 \n"
+ ".p2align 2 \n"
+ BUNDLEALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu (%0,%4,1),%%xmm2 \n"
- "movdqu 0x10(%0,%4,1),%%xmm3 \n"
- "lea 0x20(%0),%0 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ BUNDLEALIGN
+ MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2
+ MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3
+ "lea " MEMLEA(0x20,0) ",%0 \n"
"pavgb %%xmm2,%%xmm0 \n"
"pavgb %%xmm3,%%xmm1 \n"
"pand %%xmm5,%%xmm0 \n"
@@ -3427,9 +3943,10 @@ void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
"packuswb %%xmm0,%%xmm0 \n"
"psrlw $0x8,%%xmm1 \n"
"packuswb %%xmm1,%%xmm1 \n"
- "movq %%xmm0,(%1) \n"
- "movq %%xmm1,(%1,%2) \n"
- "lea 0x8(%1),%1 \n"
+ "movq %%xmm0," MEMACCESS(1) " \n"
+ BUNDLEALIGN
+ MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
+ "lea " MEMLEA(0x8,1) ",%1 \n"
"sub $0x10,%3 \n"
"jg 1b \n"
: "+r"(src_uyvy), // %0
@@ -3438,6 +3955,9 @@ void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
"+r"(pix) // %3
: "r"(static_cast<intptr_t>(stride_uyvy)) // %4
: "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14"
+#endif
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
@@ -3450,11 +3970,12 @@ void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy,
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrlw $0x8,%%xmm5 \n"
"sub %1,%2 \n"
- ".p2align 4 \n"
+ ".p2align 2 \n"
+ BUNDLEALIGN
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
"pand %%xmm5,%%xmm0 \n"
"pand %%xmm5,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
@@ -3463,9 +3984,10 @@ void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy,
"packuswb %%xmm0,%%xmm0 \n"
"psrlw $0x8,%%xmm1 \n"
"packuswb %%xmm1,%%xmm1 \n"
- "movq %%xmm0,(%1) \n"
- "movq %%xmm1,(%1,%2) \n"
- "lea 0x8(%1),%1 \n"
+ "movq %%xmm0," MEMACCESS(1) " \n"
+ BUNDLEALIGN
+ MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
+ "lea " MEMLEA(0x8,1) ",%1 \n"
"sub $0x10,%3 \n"
"jg 1b \n"
: "+r"(src_uyvy), // %0
@@ -3474,6 +3996,9 @@ void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy,
"+r"(pix) // %3
:
: "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14"
+#endif
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm5"
#endif
@@ -3502,19 +4027,19 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
"10: \n"
"test $0xf,%2 \n"
"je 19f \n"
- "movd (%0),%%xmm3 \n"
- "lea 0x4(%0),%0 \n"
+ "movd " MEMACCESS(0) ",%%xmm3 \n"
+ "lea " MEMLEA(0x4,0) ",%0 \n"
"movdqa %%xmm3,%%xmm0 \n"
"pxor %%xmm4,%%xmm3 \n"
- "movd (%1),%%xmm2 \n"
+ "movd " MEMACCESS(1) ",%%xmm2 \n"
"psrlw $0x8,%%xmm3 \n"
"pshufhw $0xf5,%%xmm3,%%xmm3 \n"
"pshuflw $0xf5,%%xmm3,%%xmm3 \n"
"pand %%xmm6,%%xmm2 \n"
"paddw %%xmm7,%%xmm3 \n"
"pmullw %%xmm3,%%xmm2 \n"
- "movd (%1),%%xmm1 \n"
- "lea 0x4(%1),%1 \n"
+ "movd " MEMACCESS(1) ",%%xmm1 \n"
+ "lea " MEMLEA(0x4,1) ",%1 \n"
"psrlw $0x8,%%xmm1 \n"
"por %%xmm4,%%xmm0 \n"
"pmullw %%xmm3,%%xmm1 \n"
@@ -3523,8 +4048,8 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
"pand %%xmm5,%%xmm1 \n"
"paddusb %%xmm1,%%xmm0 \n"
"sub $0x1,%3 \n"
- "movd %%xmm0,(%2) \n"
- "lea 0x4(%2),%2 \n"
+ "movd %%xmm0," MEMACCESS(2) " \n"
+ "lea " MEMLEA(0x4,2) ",%2 \n"
"jge 10b \n"
"19: \n"
@@ -3534,19 +4059,19 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
// 4 pixel loop.
".p2align 2 \n"
"41: \n"
- "movdqu (%0),%%xmm3 \n"
- "lea 0x10(%0),%0 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm3 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
"movdqa %%xmm3,%%xmm0 \n"
"pxor %%xmm4,%%xmm3 \n"
- "movdqu (%1),%%xmm2 \n"
+ "movdqu " MEMACCESS(1) ",%%xmm2 \n"
"psrlw $0x8,%%xmm3 \n"
"pshufhw $0xf5,%%xmm3,%%xmm3 \n"
"pshuflw $0xf5,%%xmm3,%%xmm3 \n"
"pand %%xmm6,%%xmm2 \n"
"paddw %%xmm7,%%xmm3 \n"
"pmullw %%xmm3,%%xmm2 \n"
- "movdqu (%1),%%xmm1 \n"
- "lea 0x10(%1),%1 \n"
+ "movdqu " MEMACCESS(1) ",%%xmm1 \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
"psrlw $0x8,%%xmm1 \n"
"por %%xmm4,%%xmm0 \n"
"pmullw %%xmm3,%%xmm1 \n"
@@ -3555,8 +4080,8 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
"pand %%xmm5,%%xmm1 \n"
"paddusb %%xmm1,%%xmm0 \n"
"sub $0x4,%3 \n"
- "movdqa %%xmm0,(%2) \n"
- "lea 0x10(%2),%2 \n"
+ "movdqa %%xmm0," MEMACCESS(2) " \n"
+ "lea " MEMLEA(0x10,2) ",%2 \n"
"jge 41b \n"
"49: \n"
@@ -3565,19 +4090,19 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
// 1 pixel loop.
"91: \n"
- "movd (%0),%%xmm3 \n"
- "lea 0x4(%0),%0 \n"
+ "movd " MEMACCESS(0) ",%%xmm3 \n"
+ "lea " MEMLEA(0x4,0) ",%0 \n"
"movdqa %%xmm3,%%xmm0 \n"
"pxor %%xmm4,%%xmm3 \n"
- "movd (%1),%%xmm2 \n"
+ "movd " MEMACCESS(1) ",%%xmm2 \n"
"psrlw $0x8,%%xmm3 \n"
"pshufhw $0xf5,%%xmm3,%%xmm3 \n"
"pshuflw $0xf5,%%xmm3,%%xmm3 \n"
"pand %%xmm6,%%xmm2 \n"
"paddw %%xmm7,%%xmm3 \n"
"pmullw %%xmm3,%%xmm2 \n"
- "movd (%1),%%xmm1 \n"
- "lea 0x4(%1),%1 \n"
+ "movd " MEMACCESS(1) ",%%xmm1 \n"
+ "lea " MEMLEA(0x4,1) ",%1 \n"
"psrlw $0x8,%%xmm1 \n"
"por %%xmm4,%%xmm0 \n"
"pmullw %%xmm3,%%xmm1 \n"
@@ -3586,8 +4111,8 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
"pand %%xmm5,%%xmm1 \n"
"paddusb %%xmm1,%%xmm0 \n"
"sub $0x1,%3 \n"
- "movd %%xmm0,(%2) \n"
- "lea 0x4(%2),%2 \n"
+ "movd %%xmm0," MEMACCESS(2) " \n"
+ "lea " MEMLEA(0x4,2) ",%2 \n"
"jge 91b \n"
"99: \n"
: "+r"(src_argb0), // %0
@@ -3605,7 +4130,7 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
#ifdef HAS_ARGBBLENDROW_SSSE3
// Shuffle table for isolating alpha.
-CONST uvec8 kShuffleAlpha = {
+static uvec8 kShuffleAlpha = {
3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
};
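
Before the ARGBBlendRow_SSSE3 body that this shuffle table feeds, it is worth pinning down what both blend variants compute. Reading the instruction sequence (the multiplier 256 - alpha built by pxor/paddw, the psrlw $0x8, the paddusb, and the por that forces alpha), the per-channel math is approximately the standard "over" composite. The sketch below is an inference from those instructions, not libyuv's documented rounding; uint8 is the typedef used throughout this file.

    // Approximate scalar model of ARGBBlendRow: f = src_argb0 channel,
    // b = src_argb1 channel, a = src_argb0 alpha; output alpha is forced to 255.
    static inline uint8 BlendChannel(uint8 f, uint8 b, uint8 a) {
      int out = f + ((b * (256 - a)) >> 8);   // pmullw by (256 - a), psrlw $0x8
      return (uint8)(out > 255 ? 255 : out);  // paddusb saturation
    }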
@@ -3639,17 +4164,17 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
"10: \n"
"test $0xf,%2 \n"
"je 19f \n"
- "movd (%0),%%xmm3 \n"
- "lea 0x4(%0),%0 \n"
+ "movd " MEMACCESS(0) ",%%xmm3 \n"
+ "lea " MEMLEA(0x4,0) ",%0 \n"
"movdqa %%xmm3,%%xmm0 \n"
"pxor %%xmm4,%%xmm3 \n"
- "movd (%1),%%xmm2 \n"
+ "movd " MEMACCESS(1) ",%%xmm2 \n"
"pshufb %4,%%xmm3 \n"
"pand %%xmm6,%%xmm2 \n"
"paddw %%xmm7,%%xmm3 \n"
"pmullw %%xmm3,%%xmm2 \n"
- "movd (%1),%%xmm1 \n"
- "lea 0x4(%1),%1 \n"
+ "movd " MEMACCESS(1) ",%%xmm1 \n"
+ "lea " MEMLEA(0x4,1) ",%1 \n"
"psrlw $0x8,%%xmm1 \n"
"por %%xmm4,%%xmm0 \n"
"pmullw %%xmm3,%%xmm1 \n"
@@ -3658,8 +4183,8 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
"pand %%xmm5,%%xmm1 \n"
"paddusb %%xmm1,%%xmm0 \n"
"sub $0x1,%3 \n"
- "movd %%xmm0,(%2) \n"
- "lea 0x4(%2),%2 \n"
+ "movd %%xmm0," MEMACCESS(2) " \n"
+ "lea " MEMLEA(0x4,2) ",%2 \n"
"jge 10b \n"
"19: \n"
@@ -3673,17 +4198,17 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
// 4 pixel loop.
".p2align 2 \n"
"40: \n"
- "movdqa (%0),%%xmm3 \n"
- "lea 0x10(%0),%0 \n"
+ "movdqa " MEMACCESS(0) ",%%xmm3 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
"movdqa %%xmm3,%%xmm0 \n"
"pxor %%xmm4,%%xmm3 \n"
- "movdqa (%1),%%xmm2 \n"
+ "movdqa " MEMACCESS(1) ",%%xmm2 \n"
"pshufb %4,%%xmm3 \n"
"pand %%xmm6,%%xmm2 \n"
"paddw %%xmm7,%%xmm3 \n"
"pmullw %%xmm3,%%xmm2 \n"
- "movdqa (%1),%%xmm1 \n"
- "lea 0x10(%1),%1 \n"
+ "movdqa " MEMACCESS(1) ",%%xmm1 \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
"psrlw $0x8,%%xmm1 \n"
"por %%xmm4,%%xmm0 \n"
"pmullw %%xmm3,%%xmm1 \n"
@@ -3692,25 +4217,25 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
"pand %%xmm5,%%xmm1 \n"
"paddusb %%xmm1,%%xmm0 \n"
"sub $0x4,%3 \n"
- "movdqa %%xmm0,(%2) \n"
- "lea 0x10(%2),%2 \n"
+ "movdqa %%xmm0," MEMACCESS(2) " \n"
+ "lea " MEMLEA(0x10,2) ",%2 \n"
"jge 40b \n"
"jmp 49f \n"
// 4 pixel unaligned loop.
".p2align 2 \n"
"41: \n"
- "movdqu (%0),%%xmm3 \n"
- "lea 0x10(%0),%0 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm3 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
"movdqa %%xmm3,%%xmm0 \n"
"pxor %%xmm4,%%xmm3 \n"
- "movdqu (%1),%%xmm2 \n"
+ "movdqu " MEMACCESS(1) ",%%xmm2 \n"
"pshufb %4,%%xmm3 \n"
"pand %%xmm6,%%xmm2 \n"
"paddw %%xmm7,%%xmm3 \n"
"pmullw %%xmm3,%%xmm2 \n"
- "movdqu (%1),%%xmm1 \n"
- "lea 0x10(%1),%1 \n"
+ "movdqu " MEMACCESS(1) ",%%xmm1 \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
"psrlw $0x8,%%xmm1 \n"
"por %%xmm4,%%xmm0 \n"
"pmullw %%xmm3,%%xmm1 \n"
@@ -3719,8 +4244,8 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
"pand %%xmm5,%%xmm1 \n"
"paddusb %%xmm1,%%xmm0 \n"
"sub $0x4,%3 \n"
- "movdqa %%xmm0,(%2) \n"
- "lea 0x10(%2),%2 \n"
+ "movdqa %%xmm0," MEMACCESS(2) " \n"
+ "lea " MEMLEA(0x10,2) ",%2 \n"
"jge 41b \n"
"49: \n"
@@ -3729,17 +4254,17 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
// 1 pixel loop.
"91: \n"
- "movd (%0),%%xmm3 \n"
- "lea 0x4(%0),%0 \n"
+ "movd " MEMACCESS(0) ",%%xmm3 \n"
+ "lea " MEMLEA(0x4,0) ",%0 \n"
"movdqa %%xmm3,%%xmm0 \n"
"pxor %%xmm4,%%xmm3 \n"
- "movd (%1),%%xmm2 \n"
+ "movd " MEMACCESS(1) ",%%xmm2 \n"
"pshufb %4,%%xmm3 \n"
"pand %%xmm6,%%xmm2 \n"
"paddw %%xmm7,%%xmm3 \n"
"pmullw %%xmm3,%%xmm2 \n"
- "movd (%1),%%xmm1 \n"
- "lea 0x4(%1),%1 \n"
+ "movd " MEMACCESS(1) ",%%xmm1 \n"
+ "lea " MEMLEA(0x4,1) ",%1 \n"
"psrlw $0x8,%%xmm1 \n"
"por %%xmm4,%%xmm0 \n"
"pmullw %%xmm3,%%xmm1 \n"
@@ -3748,8 +4273,8 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
"pand %%xmm5,%%xmm1 \n"
"paddusb %%xmm1,%%xmm0 \n"
"sub $0x1,%3 \n"
- "movd %%xmm0,(%2) \n"
- "lea 0x4(%2),%2 \n"
+ "movd %%xmm0," MEMACCESS(2) " \n"
+ "lea " MEMLEA(0x4,2) ",%2 \n"
"jge 91b \n"
"99: \n"
: "+r"(src_argb0), // %0
@@ -3770,26 +4295,26 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
// aligned to 16 bytes
void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
asm volatile (
- "sub %0,%1 \n"
"pcmpeqb %%xmm4,%%xmm4 \n"
"pslld $0x18,%%xmm4 \n"
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrld $0x8,%%xmm5 \n"
// 4 pixel loop.
- ".p2align 4 \n"
+ ".p2align 2 \n"
"1: \n"
- "movdqa (%0),%%xmm0 \n"
+ "movdqa " MEMACCESS(0) ",%%xmm0 \n"
"punpcklbw %%xmm0,%%xmm0 \n"
"pshufhw $0xff,%%xmm0,%%xmm2 \n"
"pshuflw $0xff,%%xmm2,%%xmm2 \n"
"pmulhuw %%xmm2,%%xmm0 \n"
- "movdqa (%0),%%xmm1 \n"
+ "movdqa " MEMACCESS(0) ",%%xmm1 \n"
"punpckhbw %%xmm1,%%xmm1 \n"
"pshufhw $0xff,%%xmm1,%%xmm2 \n"
"pshuflw $0xff,%%xmm2,%%xmm2 \n"
"pmulhuw %%xmm2,%%xmm1 \n"
- "movdqa (%0),%%xmm2 \n"
+ "movdqa " MEMACCESS(0) ",%%xmm2 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
"psrlw $0x8,%%xmm0 \n"
"pand %%xmm4,%%xmm2 \n"
"psrlw $0x8,%%xmm1 \n"
@@ -3797,8 +4322,8 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
"pand %%xmm5,%%xmm0 \n"
"por %%xmm2,%%xmm0 \n"
"sub $0x4,%2 \n"
- "movdqa %%xmm0,(%0,%1,1) \n"
- "lea 0x10(%0),%0 \n"
+ "movdqa %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
@@ -3814,10 +4339,10 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
#ifdef HAS_ARGBATTENUATEROW_SSSE3
// Shuffle table duplicating alpha
-CONST uvec8 kShuffleAlpha0 = {
+static uvec8 kShuffleAlpha0 = {
3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
};
-CONST uvec8 kShuffleAlpha1 = {
+static uvec8 kShuffleAlpha1 = {
11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
};
@@ -3825,34 +4350,34 @@ CONST uvec8 kShuffleAlpha1 = {
// aligned to 16 bytes
void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
asm volatile (
- "sub %0,%1 \n"
"pcmpeqb %%xmm3,%%xmm3 \n"
"pslld $0x18,%%xmm3 \n"
"movdqa %3,%%xmm4 \n"
"movdqa %4,%%xmm5 \n"
// 4 pixel loop.
- ".p2align 4 \n"
+ ".p2align 2 \n"
"1: \n"
- "movdqa (%0),%%xmm0 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
"pshufb %%xmm4,%%xmm0 \n"
- "movdqa (%0),%%xmm1 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm1 \n"
"punpcklbw %%xmm1,%%xmm1 \n"
"pmulhuw %%xmm1,%%xmm0 \n"
- "movdqa (%0),%%xmm1 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm1 \n"
"pshufb %%xmm5,%%xmm1 \n"
- "movdqa (%0),%%xmm2 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm2 \n"
"punpckhbw %%xmm2,%%xmm2 \n"
"pmulhuw %%xmm2,%%xmm1 \n"
- "movdqa (%0),%%xmm2 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm2 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
"pand %%xmm3,%%xmm2 \n"
"psrlw $0x8,%%xmm0 \n"
"psrlw $0x8,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
"por %%xmm2,%%xmm0 \n"
"sub $0x4,%2 \n"
- "movdqa %%xmm0,(%0,%1,1) \n"
- "lea 0x10(%0),%0 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
@@ -3874,35 +4399,34 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
int width) {
uintptr_t alpha = 0;
asm volatile (
- "sub %0,%1 \n"
-
// 4 pixel loop.
- ".p2align 4 \n"
+ ".p2align 2 \n"
"1: \n"
- "movdqa (%0),%%xmm0 \n"
- "movzb 0x3(%0),%3 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movzb " MEMACCESS2(0x03,0) ",%3 \n"
"punpcklbw %%xmm0,%%xmm0 \n"
- "movd 0x0(%4,%3,4),%%xmm2 \n"
- "movzb 0x7(%0),%3 \n"
- "movd 0x0(%4,%3,4),%%xmm3 \n"
+ MEMOPREG(movd,0x00,4,3,4,xmm2) // movd 0x0(%4,%3,4),%%xmm2
+ "movzb " MEMACCESS2(0x07,0) ",%3 \n"
+ MEMOPREG(movd,0x00,4,3,4,xmm3) // movd 0x0(%4,%3,4),%%xmm3
"pshuflw $0x40,%%xmm2,%%xmm2 \n"
"pshuflw $0x40,%%xmm3,%%xmm3 \n"
"movlhps %%xmm3,%%xmm2 \n"
"pmulhuw %%xmm2,%%xmm0 \n"
- "movdqa (%0),%%xmm1 \n"
- "movzb 0xb(%0),%3 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm1 \n"
+ "movzb " MEMACCESS2(0x0b,0) ",%3 \n"
"punpckhbw %%xmm1,%%xmm1 \n"
- "movd 0x0(%4,%3,4),%%xmm2 \n"
- "movzb 0xf(%0),%3 \n"
- "movd 0x0(%4,%3,4),%%xmm3 \n"
+ MEMOPREG(movd,0x00,4,3,4,xmm2) // movd 0x0(%4,%3,4),%%xmm2
+ "movzb " MEMACCESS2(0x0f,0) ",%3 \n"
+ MEMOPREG(movd,0x00,4,3,4,xmm3) // movd 0x0(%4,%3,4),%%xmm3
"pshuflw $0x40,%%xmm2,%%xmm2 \n"
"pshuflw $0x40,%%xmm3,%%xmm3 \n"
"movlhps %%xmm3,%%xmm2 \n"
"pmulhuw %%xmm2,%%xmm1 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
"packuswb %%xmm1,%%xmm0 \n"
"sub $0x4,%2 \n"
- "movdqa %%xmm0,(%0,%1,1) \n"
- "lea 0x10(%0),%0 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
@@ -3910,6 +4434,9 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
"+r"(alpha) // %3
: "r"(fixed_invtbl8) // %4
: "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14"
+#endif
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
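
The ARGBAttenuateRow_SSSE3 and ARGBUnattenuateRow_SSE2 hunks above also switch from the in-place (%0,%1,1) addressing to separate, independently advanced src/dst pointers and unaligned movdqu loads; the arithmetic itself is untouched. As a rough scalar model, ignoring the exact rounding of pmulhuw and of the fixed_invtbl8 reciprocal table defined in row_common.cc:

    // Attenuate: scale each color channel by its pixel's alpha.
    static inline uint8 Attenuate(uint8 c, uint8 a) {
      return (uint8)((c * a) / 255);
    }
    // Unattenuate: undo that scaling. The SIMD code replaces the divide with a
    // pmulhuw by a per-alpha fixed-point reciprocal fetched via the movzb/MEMOPREG
    // pairs above; the zero-alpha behaviour here is a simplification.
    static inline uint8 Unattenuate(uint8 c, uint8 a) {
      if (a == 0) return 0;
      int out = (c * 255) / a;
      return (uint8)(out > 255 ? 255 : out);
    }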
@@ -3923,21 +4450,21 @@ void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
asm volatile (
"movdqa %3,%%xmm4 \n"
"movdqa %4,%%xmm5 \n"
- "sub %0,%1 \n"
// 8 pixel loop.
- ".p2align 4 \n"
+ ".p2align 2 \n"
"1: \n"
- "movdqa (%0),%%xmm0 \n"
- "movdqa 0x10(%0),%%xmm1 \n"
+ "movdqa " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"pmaddubsw %%xmm4,%%xmm0 \n"
"pmaddubsw %%xmm4,%%xmm1 \n"
"phaddw %%xmm1,%%xmm0 \n"
"paddw %%xmm5,%%xmm0 \n"
"psrlw $0x7,%%xmm0 \n"
"packuswb %%xmm0,%%xmm0 \n"
- "movdqa (%0),%%xmm2 \n"
- "movdqa 0x10(%0),%%xmm3 \n"
+ "movdqa " MEMACCESS(0) ",%%xmm2 \n"
+ "movdqa " MEMACCESS2(0x10,0) ",%%xmm3 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
"psrld $0x18,%%xmm2 \n"
"psrld $0x18,%%xmm3 \n"
"packuswb %%xmm3,%%xmm2 \n"
@@ -3949,9 +4476,9 @@ void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
"punpcklwd %%xmm3,%%xmm0 \n"
"punpckhwd %%xmm3,%%xmm1 \n"
"sub $0x8,%2 \n"
- "movdqa %%xmm0,(%0,%1,1) \n"
- "movdqa %%xmm1,0x10(%0,%1,1) \n"
- "lea 0x20(%0),%0 \n"
+ "movdqa %%xmm0," MEMACCESS(1) " \n"
+ "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
@@ -3971,15 +4498,15 @@ void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
// g = (r * 45 + g * 88 + b * 22) >> 7
// r = (r * 50 + g * 98 + b * 24) >> 7
// Constant for ARGB color to sepia tone
-CONST vec8 kARGBToSepiaB = {
+static vec8 kARGBToSepiaB = {
17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
};
-CONST vec8 kARGBToSepiaG = {
+static vec8 kARGBToSepiaG = {
22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
};
-CONST vec8 kARGBToSepiaR = {
+static vec8 kARGBToSepiaR = {
24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
};
@@ -3991,32 +4518,32 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
"movdqa %4,%%xmm4 \n"
// 8 pixel loop.
- ".p2align 4 \n"
+ ".p2align 2 \n"
"1: \n"
- "movdqa (%0),%%xmm0 \n"
- "movdqa 0x10(%0),%%xmm6 \n"
+ "movdqa " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqa " MEMACCESS2(0x10,0) ",%%xmm6 \n"
"pmaddubsw %%xmm2,%%xmm0 \n"
"pmaddubsw %%xmm2,%%xmm6 \n"
"phaddw %%xmm6,%%xmm0 \n"
"psrlw $0x7,%%xmm0 \n"
"packuswb %%xmm0,%%xmm0 \n"
- "movdqa (%0),%%xmm5 \n"
- "movdqa 0x10(%0),%%xmm1 \n"
+ "movdqa " MEMACCESS(0) ",%%xmm5 \n"
+ "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"pmaddubsw %%xmm3,%%xmm5 \n"
"pmaddubsw %%xmm3,%%xmm1 \n"
"phaddw %%xmm1,%%xmm5 \n"
"psrlw $0x7,%%xmm5 \n"
"packuswb %%xmm5,%%xmm5 \n"
"punpcklbw %%xmm5,%%xmm0 \n"
- "movdqa (%0),%%xmm5 \n"
- "movdqa 0x10(%0),%%xmm1 \n"
+ "movdqa " MEMACCESS(0) ",%%xmm5 \n"
+ "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"pmaddubsw %%xmm4,%%xmm5 \n"
"pmaddubsw %%xmm4,%%xmm1 \n"
"phaddw %%xmm1,%%xmm5 \n"
"psrlw $0x7,%%xmm5 \n"
"packuswb %%xmm5,%%xmm5 \n"
- "movdqa (%0),%%xmm6 \n"
- "movdqa 0x10(%0),%%xmm1 \n"
+ "movdqa " MEMACCESS(0) ",%%xmm6 \n"
+ "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
"psrld $0x18,%%xmm6 \n"
"psrld $0x18,%%xmm1 \n"
"packuswb %%xmm1,%%xmm6 \n"
@@ -4026,9 +4553,9 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
"punpcklwd %%xmm5,%%xmm0 \n"
"punpckhwd %%xmm5,%%xmm1 \n"
"sub $0x8,%1 \n"
- "movdqa %%xmm0,(%0) \n"
- "movdqa %%xmm1,0x10(%0) \n"
- "lea 0x20(%0),%0 \n"
+ "movdqa %%xmm0," MEMACCESS(0) " \n"
+ "movdqa %%xmm1," MEMACCESS2(0x10,0) " \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
"jg 1b \n"
: "+r"(dst_argb), // %0
"+r"(width) // %1
@@ -4046,62 +4573,64 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
// Transform 8 ARGB pixels (32 bytes) with color matrix.
// Same as Sepia except matrix is provided.
-void ARGBColorMatrixRow_SSSE3(uint8* dst_argb, const int8* matrix_argb,
- int width) {
+void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
+ const int8* matrix_argb, int width) {
asm volatile (
- "movd (%2),%%xmm2 \n"
- "movd 0x4(%2),%%xmm3 \n"
- "movd 0x8(%2),%%xmm4 \n"
- "pshufd $0x0,%%xmm2,%%xmm2 \n"
- "pshufd $0x0,%%xmm3,%%xmm3 \n"
- "pshufd $0x0,%%xmm4,%%xmm4 \n"
+ "movdqu " MEMACCESS(3) ",%%xmm5 \n"
+ "pshufd $0x00,%%xmm5,%%xmm2 \n"
+ "pshufd $0x55,%%xmm5,%%xmm3 \n"
+ "pshufd $0xaa,%%xmm5,%%xmm4 \n"
+ "pshufd $0xff,%%xmm5,%%xmm5 \n"
// 8 pixel loop.
- ".p2align 4 \n"
+ ".p2align 2 \n"
"1: \n"
- "movdqa (%0),%%xmm0 \n"
- "movdqa 0x10(%0),%%xmm6 \n"
+ "movdqa " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqa " MEMACCESS2(0x10,0) ",%%xmm7 \n"
"pmaddubsw %%xmm2,%%xmm0 \n"
- "pmaddubsw %%xmm2,%%xmm6 \n"
- "movdqa (%0),%%xmm5 \n"
- "movdqa 0x10(%0),%%xmm1 \n"
- "pmaddubsw %%xmm3,%%xmm5 \n"
+ "pmaddubsw %%xmm2,%%xmm7 \n"
+ "movdqa " MEMACCESS(0) ",%%xmm6 \n"
+ "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "pmaddubsw %%xmm3,%%xmm6 \n"
"pmaddubsw %%xmm3,%%xmm1 \n"
- "phaddsw %%xmm6,%%xmm0 \n"
- "phaddsw %%xmm1,%%xmm5 \n"
- "psraw $0x7,%%xmm0 \n"
- "psraw $0x7,%%xmm5 \n"
+ "phaddsw %%xmm7,%%xmm0 \n"
+ "phaddsw %%xmm1,%%xmm6 \n"
+ "psraw $0x6,%%xmm0 \n"
+ "psraw $0x6,%%xmm6 \n"
"packuswb %%xmm0,%%xmm0 \n"
- "packuswb %%xmm5,%%xmm5 \n"
- "punpcklbw %%xmm5,%%xmm0 \n"
- "movdqa (%0),%%xmm5 \n"
- "movdqa 0x10(%0),%%xmm1 \n"
- "pmaddubsw %%xmm4,%%xmm5 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "punpcklbw %%xmm6,%%xmm0 \n"
+ "movdqa " MEMACCESS(0) ",%%xmm1 \n"
+ "movdqa " MEMACCESS2(0x10,0) ",%%xmm7 \n"
"pmaddubsw %%xmm4,%%xmm1 \n"
- "phaddsw %%xmm1,%%xmm5 \n"
- "psraw $0x7,%%xmm5 \n"
- "packuswb %%xmm5,%%xmm5 \n"
- "movdqa (%0),%%xmm6 \n"
- "movdqa 0x10(%0),%%xmm1 \n"
- "psrld $0x18,%%xmm6 \n"
- "psrld $0x18,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm6 \n"
+ "pmaddubsw %%xmm4,%%xmm7 \n"
+ "phaddsw %%xmm7,%%xmm1 \n"
+ "movdqa " MEMACCESS(0) ",%%xmm6 \n"
+ "movdqa " MEMACCESS2(0x10,0) ",%%xmm7 \n"
+ "pmaddubsw %%xmm5,%%xmm6 \n"
+ "pmaddubsw %%xmm5,%%xmm7 \n"
+ "phaddsw %%xmm7,%%xmm6 \n"
+ "psraw $0x6,%%xmm1 \n"
+ "psraw $0x6,%%xmm6 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
"packuswb %%xmm6,%%xmm6 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklbw %%xmm6,%%xmm5 \n"
- "punpcklwd %%xmm5,%%xmm0 \n"
- "punpckhwd %%xmm5,%%xmm1 \n"
- "sub $0x8,%1 \n"
- "movdqa %%xmm0,(%0) \n"
- "movdqa %%xmm1,0x10(%0) \n"
- "lea 0x20(%0),%0 \n"
+ "punpcklbw %%xmm6,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm6 \n"
+ "punpcklwd %%xmm1,%%xmm0 \n"
+ "punpckhwd %%xmm1,%%xmm6 \n"
+ "sub $0x8,%2 \n"
+ "movdqa %%xmm0," MEMACCESS(1) " \n"
+ "movdqa %%xmm6," MEMACCESS2(0x10,1) " \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
"jg 1b \n"
- : "+r"(dst_argb), // %0
- "+r"(width) // %1
- : "r"(matrix_argb) // %2
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(matrix_argb) // %3
: "memory", "cc"
#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
);
}
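
The ARGBColorMatrixRow_SSSE3 rewrite above changes the contract: it now loads the full 16-byte matrix with one movdqu plus pshufd broadcasts instead of three movd loads, writes to a separate dst_argb, shifts by 6 instead of 7, and runs the fourth matrix row over the alpha channel instead of copying source alpha. A scalar sketch of the per-pixel math, with clamping standing in for the phaddsw/packuswb saturation (uint8/int8 are the file's typedefs):

    // matrix_argb: 4 rows of 4 signed bytes, one row per output channel,
    // each row ordered {B, G, R, A} to match ARGB memory order.
    static inline uint8 Clamp255(int v) {
      return (uint8)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }
    static void ColorMatrixPixel(const uint8* p, uint8* out, const int8* m) {
      for (int i = 0; i < 4; ++i) {
        int sum = p[0] * m[i * 4 + 0] + p[1] * m[i * 4 + 1] +
                  p[2] * m[i * 4 + 2] + p[3] * m[i * 4 + 3];
        out[i] = Clamp255(sum >> 6);   // psraw $0x6 in the rewritten code
      }
    }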
@@ -4129,14 +4658,14 @@ void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
// 4 pixel loop.
".p2align 2 \n"
"1: \n"
- "movdqa (%0),%%xmm0 \n"
+ "movdqa " MEMACCESS(0) ",%%xmm0 \n"
"punpcklbw %%xmm5,%%xmm0 \n"
"pmulhuw %%xmm2,%%xmm0 \n"
- "movdqa (%0),%%xmm1 \n"
+ "movdqa " MEMACCESS(0) ",%%xmm1 \n"
"punpckhbw %%xmm5,%%xmm1 \n"
"pmulhuw %%xmm2,%%xmm1 \n"
"pmullw %%xmm3,%%xmm0 \n"
- "movdqa (%0),%%xmm7 \n"
+ "movdqa " MEMACCESS(0) ",%%xmm7 \n"
"pmullw %%xmm3,%%xmm1 \n"
"pand %%xmm6,%%xmm7 \n"
"paddw %%xmm4,%%xmm0 \n"
@@ -4144,8 +4673,8 @@ void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
"packuswb %%xmm1,%%xmm0 \n"
"por %%xmm7,%%xmm0 \n"
"sub $0x4,%1 \n"
- "movdqa %%xmm0,(%0) \n"
- "lea 0x10(%0),%0 \n"
+ "movdqa %%xmm0," MEMACCESS(0) " \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
"jg 1b \n"
: "+r"(dst_argb), // %0
"+r"(width) // %1
@@ -4167,14 +4696,14 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
uint32 value) {
asm volatile (
"movd %3,%%xmm2 \n"
- "sub %0,%1 \n"
"punpcklbw %%xmm2,%%xmm2 \n"
"punpcklqdq %%xmm2,%%xmm2 \n"
// 4 pixel loop.
".p2align 2 \n"
"1: \n"
- "movdqa (%0),%%xmm0 \n"
+ "movdqa " MEMACCESS(0) ",%%xmm0 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
"movdqa %%xmm0,%%xmm1 \n"
"punpcklbw %%xmm0,%%xmm0 \n"
"punpckhbw %%xmm1,%%xmm1 \n"
@@ -4184,8 +4713,8 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
"psrlw $0x8,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
"sub $0x4,%2 \n"
- "movdqa %%xmm0,(%0,%1,1) \n"
- "lea 0x10(%0),%0 \n"
+ "movdqa %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
@@ -4205,14 +4734,14 @@ void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width) {
asm volatile (
"pxor %%xmm5,%%xmm5 \n"
- "sub %0,%1 \n"
- "sub %0,%2 \n"
// 4 pixel loop.
- ".p2align 4 \n"
+ ".p2align 2 \n"
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu (%0,%1),%%xmm2 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "movdqu " MEMACCESS(1) ",%%xmm2 \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
"movdqu %%xmm0,%%xmm1 \n"
"movdqu %%xmm2,%%xmm3 \n"
"punpcklbw %%xmm0,%%xmm0 \n"
@@ -4223,8 +4752,8 @@ void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
"pmulhuw %%xmm3,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
"sub $0x4,%3 \n"
- "movdqu %%xmm0,(%0,%2,1) \n"
- "lea 0x10(%0),%0 \n"
+ "movdqu %%xmm0," MEMACCESS(2) " \n"
+ "lea " MEMLEA(0x10,2) ",%2 \n"
"jg 1b \n"
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
@@ -4244,18 +4773,17 @@ void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width) {
asm volatile (
- "sub %0,%1 \n"
- "sub %0,%2 \n"
-
// 4 pixel loop.
- ".p2align 4 \n"
+ ".p2align 2 \n"
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu (%0,%1),%%xmm1 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "movdqu " MEMACCESS(1) ",%%xmm1 \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
"paddusb %%xmm1,%%xmm0 \n"
"sub $0x4,%3 \n"
- "movdqu %%xmm0,(%0,%2,1) \n"
- "lea 0x10(%0),%0 \n"
+ "movdqu %%xmm0," MEMACCESS(2) " \n"
+ "lea " MEMLEA(0x10,2) ",%2 \n"
"jg 1b \n"
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
@@ -4275,18 +4803,17 @@ void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width) {
asm volatile (
- "sub %0,%1 \n"
- "sub %0,%2 \n"
-
// 4 pixel loop.
- ".p2align 4 \n"
+ ".p2align 2 \n"
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu (%0,%1),%%xmm1 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "movdqu " MEMACCESS(1) ",%%xmm1 \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
"psubusb %%xmm1,%%xmm0 \n"
"sub $0x4,%3 \n"
- "movdqu %%xmm0,(%0,%2,1) \n"
- "lea 0x10(%0),%0 \n"
+ "movdqu %%xmm0," MEMACCESS(2) " \n"
+ "lea " MEMLEA(0x10,2) ",%2 \n"
"jg 1b \n"
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
@@ -4301,13 +4828,13 @@ void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
}
#endif // HAS_ARGBSUBTRACTROW_SSE2
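
ARGBMultiplyRow_SSE2, ARGBAddRow_SSE2 and ARGBSubtractRow_SSE2 above drop the old "sub %0,%1 / sub %0,%2" pointer-difference trick and simply advance all three pointers with their own lea, which is what the NaCl addressing macros need. The per-channel operations are unchanged; in scalar terms (add and subtract model paddusb/psubusb exactly, the multiply rounding is approximate):

    static inline uint8 AddSat(uint8 a, uint8 b) {   // paddusb
      int s = a + b;
      return (uint8)(s > 255 ? 255 : s);
    }
    static inline uint8 SubSat(uint8 a, uint8 b) {   // psubusb
      int d = a - b;
      return (uint8)(d < 0 ? 0 : d);
    }
    static inline uint8 MulNorm(uint8 a, uint8 b) {  // roughly the pmulhuw path
      return (uint8)((a * b) / 255);
    }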
-#ifdef HAS_SOBELXROW_SSSE3
+#ifdef HAS_SOBELXROW_SSE2
// SobelX as a matrix is
// -1 0 1
// -2 0 2
// -1 0 1
-void SobelXRow_SSSE3(const uint8* src_y0, const uint8* src_y1,
- const uint8* src_y2, uint8* dst_sobelx, int width) {
+void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
+ const uint8* src_y2, uint8* dst_sobelx, int width) {
asm volatile (
"sub %0,%1 \n"
"sub %0,%2 \n"
@@ -4315,31 +4842,37 @@ void SobelXRow_SSSE3(const uint8* src_y0, const uint8* src_y1,
"pxor %%xmm5,%%xmm5 \n"
// 8 pixel loop.
- ".p2align 4 \n"
+ ".p2align 2 \n"
+ BUNDLEALIGN
"1: \n"
- "movq (%0),%%xmm0 \n"
- "movq 0x2(%0),%%xmm1 \n"
+ "movq " MEMACCESS(0) ",%%xmm0 \n"
+ "movq " MEMACCESS2(0x2,0) ",%%xmm1 \n"
"punpcklbw %%xmm5,%%xmm0 \n"
"punpcklbw %%xmm5,%%xmm1 \n"
"psubw %%xmm1,%%xmm0 \n"
- "movq (%0,%1,1),%%xmm1 \n"
- "movq 0x2(%0,%1,1),%%xmm2 \n"
+ BUNDLEALIGN
+ MEMOPREG(movq,0x00,0,1,1,xmm1) // movq (%0,%1,1),%%xmm1
+ MEMOPREG(movq,0x02,0,1,1,xmm2) // movq 0x2(%0,%1,1),%%xmm2
"punpcklbw %%xmm5,%%xmm1 \n"
"punpcklbw %%xmm5,%%xmm2 \n"
"psubw %%xmm2,%%xmm1 \n"
- "movq (%0,%2,1),%%xmm2 \n"
- "movq 0x2(%0,%2,1),%%xmm3 \n"
+ BUNDLEALIGN
+ MEMOPREG(movq,0x00,0,2,1,xmm2) // movq (%0,%2,1),%%xmm2
+ MEMOPREG(movq,0x02,0,2,1,xmm3) // movq 0x2(%0,%2,1),%%xmm3
"punpcklbw %%xmm5,%%xmm2 \n"
"punpcklbw %%xmm5,%%xmm3 \n"
"psubw %%xmm3,%%xmm2 \n"
"paddw %%xmm2,%%xmm0 \n"
"paddw %%xmm1,%%xmm0 \n"
"paddw %%xmm1,%%xmm0 \n"
- "pabsw %%xmm0,%%xmm0 \n"
+ "pxor %%xmm1,%%xmm1 \n"
+ "psubw %%xmm0,%%xmm1 \n"
+ "pmaxsw %%xmm1,%%xmm0 \n"
"packuswb %%xmm0,%%xmm0 \n"
"sub $0x8,%4 \n"
- "movq %%xmm0,(%0,%3,1) \n"
- "lea 0x8(%0),%0 \n"
+ BUNDLEALIGN
+ MEMOPMEM(movq,xmm0,0x00,0,3,1) // movq %%xmm0,(%0,%3,1)
+ "lea " MEMLEA(0x8,0) ",%0 \n"
"jg 1b \n"
: "+r"(src_y0), // %0
"+r"(src_y1), // %1
@@ -4348,51 +4881,60 @@ void SobelXRow_SSSE3(const uint8* src_y0, const uint8* src_y1,
"+r"(width) // %4
:
: "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14"
+#endif
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
);
}
-#endif // HAS_SOBELXROW_SSSE3
+#endif // HAS_SOBELXROW_SSE2
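
SobelXRow and SobelYRow above are renamed from _SSSE3 to _SSE2 because the one SSSE3 instruction they used, pabsw, is replaced by an SSE2-only absolute value: zero a register, subtract to negate, then take the signed 16-bit maximum. In scalar form:

    // abs(x) for int16, matching the new pxor / psubw / pmaxsw sequence.
    static inline short Abs16(short x) {
      short neg = (short)(0 - x);
      return x > neg ? x : neg;      // pmaxsw keeps the non-negative copy
    }

The INT16_MIN corner case never arises here: the weighted Sobel sums of byte differences stay well inside the int16 range.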
-#ifdef HAS_SOBELYROW_SSSE3
+#ifdef HAS_SOBELYROW_SSE2
// SobelY as a matrix is
// -1 -2 -1
// 0 0 0
// 1 2 1
-void SobelYRow_SSSE3(const uint8* src_y0, const uint8* src_y1,
- uint8* dst_sobely, int width) {
+void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
+ uint8* dst_sobely, int width) {
asm volatile (
"sub %0,%1 \n"
"sub %0,%2 \n"
"pxor %%xmm5,%%xmm5 \n"
// 8 pixel loop.
- ".p2align 4 \n"
+ ".p2align 2 \n"
+ BUNDLEALIGN
"1: \n"
- "movq (%0),%%xmm0 \n"
- "movq (%0,%1,1),%%xmm1 \n"
+ "movq " MEMACCESS(0) ",%%xmm0 \n"
+ MEMOPREG(movq,0x00,0,1,1,xmm1) // movq (%0,%1,1),%%xmm1
"punpcklbw %%xmm5,%%xmm0 \n"
"punpcklbw %%xmm5,%%xmm1 \n"
"psubw %%xmm1,%%xmm0 \n"
- "movq 0x1(%0),%%xmm1 \n"
- "movq 0x1(%0,%1,1),%%xmm2 \n"
+ BUNDLEALIGN
+ "movq " MEMACCESS2(0x1,0) ",%%xmm1 \n"
+ MEMOPREG(movq,0x01,0,1,1,xmm2) // movq 0x1(%0,%1,1),%%xmm2
"punpcklbw %%xmm5,%%xmm1 \n"
"punpcklbw %%xmm5,%%xmm2 \n"
"psubw %%xmm2,%%xmm1 \n"
- "movq 0x2(%0),%%xmm2 \n"
- "movq 0x2(%0,%1,1),%%xmm3 \n"
+ BUNDLEALIGN
+ "movq " MEMACCESS2(0x2,0) ",%%xmm2 \n"
+ MEMOPREG(movq,0x02,0,1,1,xmm3) // movq 0x2(%0,%1,1),%%xmm3
"punpcklbw %%xmm5,%%xmm2 \n"
"punpcklbw %%xmm5,%%xmm3 \n"
"psubw %%xmm3,%%xmm2 \n"
"paddw %%xmm2,%%xmm0 \n"
"paddw %%xmm1,%%xmm0 \n"
"paddw %%xmm1,%%xmm0 \n"
- "pabsw %%xmm0,%%xmm0 \n"
+ "pxor %%xmm1,%%xmm1 \n"
+ "psubw %%xmm0,%%xmm1 \n"
+ "pmaxsw %%xmm1,%%xmm0 \n"
"packuswb %%xmm0,%%xmm0 \n"
"sub $0x8,%3 \n"
- "movq %%xmm0,(%0,%2,1) \n"
- "lea 0x8(%0),%0 \n"
+ BUNDLEALIGN
+ MEMOPMEM(movq,xmm0,0x00,0,2,1) // movq %%xmm0,(%0,%2,1)
+ "lea " MEMLEA(0x8,0) ",%0 \n"
"jg 1b \n"
: "+r"(src_y0), // %0
"+r"(src_y1), // %1
@@ -4400,12 +4942,15 @@ void SobelYRow_SSSE3(const uint8* src_y0, const uint8* src_y1,
"+r"(width) // %3
:
: "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14"
+#endif
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
);
}
-#endif // HAS_SOBELYROW_SSSE3
+#endif // HAS_SOBELYROW_SSE2
#ifdef HAS_SOBELROW_SSE2
// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
@@ -4414,18 +4959,19 @@ void SobelYRow_SSSE3(const uint8* src_y0, const uint8* src_y1,
// G = Sobel
// B = Sobel
void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
- uint8* dst_argb, int width) {
+ uint8* dst_argb, int width) {
asm volatile (
"sub %0,%1 \n"
"pcmpeqb %%xmm5,%%xmm5 \n"
"pslld $0x18,%%xmm5 \n"
// 8 pixel loop.
- ".p2align 4 \n"
+ ".p2align 2 \n"
+ BUNDLEALIGN
"1: \n"
- "movdqa (%0),%%xmm0 \n"
- "movdqa (%0,%1,1),%%xmm1 \n"
- "lea 0x10(%0),%0 \n"
+ "movdqa " MEMACCESS(0) ",%%xmm0 \n"
+ MEMOPREG(movdqa,0x00,0,1,1,xmm1) // movdqa (%0,%1,1),%%xmm1
+ "lea " MEMLEA(0x10,0) ",%0 \n"
"paddusb %%xmm1,%%xmm0 \n"
"movdqa %%xmm0,%%xmm2 \n"
"punpcklbw %%xmm0,%%xmm2 \n"
@@ -4441,11 +4987,11 @@ void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
"por %%xmm5,%%xmm3 \n"
"por %%xmm5,%%xmm0 \n"
"sub $0x10,%3 \n"
- "movdqa %%xmm1,(%2) \n"
- "movdqa %%xmm2,0x10(%2) \n"
- "movdqa %%xmm3,0x20(%2) \n"
- "movdqa %%xmm0,0x30(%2) \n"
- "lea 0x40(%2),%2 \n"
+ "movdqa %%xmm1," MEMACCESS(2) " \n"
+ "movdqa %%xmm2," MEMACCESS2(0x10,2) " \n"
+ "movdqa %%xmm3," MEMACCESS2(0x20,2) " \n"
+ "movdqa %%xmm0," MEMACCESS2(0x30,2) " \n"
+ "lea " MEMLEA(0x40,2) ",%2 \n"
"jg 1b \n"
: "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1
@@ -4453,6 +4999,9 @@ void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
"+r"(width) // %3
:
: "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14"
+#endif
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
@@ -4460,6 +5009,43 @@ void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
}
#endif // HAS_SOBELROW_SSE2
+#ifdef HAS_SOBELTOPLANEROW_SSE2
+// Adds Sobel X and Sobel Y and stores Sobel into a plane.
+void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
+ uint8* dst_y, int width) {
+ asm volatile (
+ "sub %0,%1 \n"
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "pslld $0x18,%%xmm5 \n"
+
+ // 8 pixel loop.
+ ".p2align 2 \n"
+ BUNDLEALIGN
+ "1: \n"
+ "movdqa " MEMACCESS(0) ",%%xmm0 \n"
+ MEMOPREG(movdqa,0x00,0,1,1,xmm1) // movdqa (%0,%1,1),%%xmm1
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "sub $0x10,%3 \n"
+ "movdqa %%xmm0," MEMACCESS(2) " \n"
+ "lea " MEMLEA(0x10,2) ",%2 \n"
+ "jg 1b \n"
+ : "+r"(src_sobelx), // %0
+ "+r"(src_sobely), // %1
+ "+r"(dst_y), // %2
+ "+r"(width) // %3
+ :
+ : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14"
+#endif
+#if defined(__SSE2__)
+ , "xmm0", "xmm1"
+#endif
+ );
+}
+#endif // HAS_SOBELTOPLANEROW_SSE2
+
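SobelToPlaneRow_SSE2 is new in this patch: it does the same saturating combine as SobelRow_SSE2 but writes the result as a single grey plane instead of expanding it to ARGB. In scalar form:

    // The paddusb combine shared by both functions.
    static inline uint8 SobelMag(uint8 sx, uint8 sy) {
      int s = sx + sy;
      return (uint8)(s > 255 ? 255 : s);
    }
    // SobelRow_SSE2 additionally packs the magnitude as B = G = R = Sobel, A = 255;
    // SobelToPlaneRow_SSE2 stores SobelMag() directly into dst_y.
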
#ifdef HAS_SOBELXYROW_SSE2
// Mixes Sobel X, Sobel Y and Sobel into ARGB.
// A = 255
@@ -4473,11 +5059,12 @@ void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
"pcmpeqb %%xmm5,%%xmm5 \n"
// 8 pixel loop.
- ".p2align 4 \n"
+ ".p2align 2 \n"
+ BUNDLEALIGN
"1: \n"
- "movdqa (%0),%%xmm0 \n"
- "movdqa (%0,%1,1),%%xmm1 \n"
- "lea 0x10(%0),%0 \n"
+ "movdqa " MEMACCESS(0) ",%%xmm0 \n"
+ MEMOPREG(movdqa,0x00,0,1,1,xmm1) // movdqa (%0,%1,1),%%xmm1
+ "lea " MEMLEA(0x10,0) ",%0 \n"
"movdqa %%xmm0,%%xmm2 \n"
"paddusb %%xmm1,%%xmm2 \n"
"movdqa %%xmm0,%%xmm3 \n"
@@ -4493,11 +5080,11 @@ void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
"punpcklwd %%xmm0,%%xmm7 \n"
"punpckhwd %%xmm0,%%xmm1 \n"
"sub $0x10,%3 \n"
- "movdqa %%xmm6,(%2) \n"
- "movdqa %%xmm4,0x10(%2) \n"
- "movdqa %%xmm7,0x20(%2) \n"
- "movdqa %%xmm1,0x30(%2) \n"
- "lea 0x40(%2),%2 \n"
+ "movdqa %%xmm6," MEMACCESS(2) " \n"
+ "movdqa %%xmm4," MEMACCESS2(0x10,2) " \n"
+ "movdqa %%xmm7," MEMACCESS2(0x20,2) " \n"
+ "movdqa %%xmm1," MEMACCESS2(0x30,2) " \n"
+ "lea " MEMLEA(0x40,2) ",%2 \n"
"jg 1b \n"
: "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1
@@ -4505,6 +5092,9 @@ void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
"+r"(width) // %3
:
: "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14"
+#endif
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
@@ -4518,7 +5108,6 @@ void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
const int32* previous_cumsum, int width) {
asm volatile (
- "sub %1,%2 \n"
"pxor %%xmm0,%%xmm0 \n"
"pxor %%xmm1,%%xmm1 \n"
"sub $0x4,%3 \n"
@@ -4529,8 +5118,8 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
// 4 pixel loop \n"
".p2align 2 \n"
"40: \n"
- "movdqu (%0),%%xmm2 \n"
- "lea 0x10(%0),%0 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm2 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
"movdqa %%xmm2,%%xmm4 \n"
"punpcklbw %%xmm1,%%xmm2 \n"
"movdqa %%xmm2,%%xmm3 \n"
@@ -4541,22 +5130,23 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
"punpcklwd %%xmm1,%%xmm4 \n"
"punpckhwd %%xmm1,%%xmm5 \n"
"paddd %%xmm2,%%xmm0 \n"
- "movdqa (%1,%2,1),%%xmm2 \n"
+ "movdqa " MEMACCESS(2) ",%%xmm2 \n"
"paddd %%xmm0,%%xmm2 \n"
"paddd %%xmm3,%%xmm0 \n"
- "movdqa 0x10(%1,%2,1),%%xmm3 \n"
+ "movdqa " MEMACCESS2(0x10,2) ",%%xmm3 \n"
"paddd %%xmm0,%%xmm3 \n"
"paddd %%xmm4,%%xmm0 \n"
- "movdqa 0x20(%1,%2,1),%%xmm4 \n"
+ "movdqa " MEMACCESS2(0x20,2) ",%%xmm4 \n"
"paddd %%xmm0,%%xmm4 \n"
"paddd %%xmm5,%%xmm0 \n"
- "movdqa 0x30(%1,%2,1),%%xmm5 \n"
+ "movdqa " MEMACCESS2(0x30,2) ",%%xmm5 \n"
+ "lea " MEMLEA(0x40,2) ",%2 \n"
"paddd %%xmm0,%%xmm5 \n"
- "movdqa %%xmm2,(%1) \n"
- "movdqa %%xmm3,0x10(%1) \n"
- "movdqa %%xmm4,0x20(%1) \n"
- "movdqa %%xmm5,0x30(%1) \n"
- "lea 0x40(%1),%1 \n"
+ "movdqa %%xmm2," MEMACCESS(1) " \n"
+ "movdqa %%xmm3," MEMACCESS2(0x10,1) " \n"
+ "movdqa %%xmm4," MEMACCESS2(0x20,1) " \n"
+ "movdqa %%xmm5," MEMACCESS2(0x30,1) " \n"
+ "lea " MEMLEA(0x40,1) ",%1 \n"
"sub $0x4,%3 \n"
"jge 40b \n"
@@ -4567,15 +5157,16 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
// 1 pixel loop \n"
".p2align 2 \n"
"10: \n"
- "movd (%0),%%xmm2 \n"
- "lea 0x4(%0),%0 \n"
+ "movd " MEMACCESS(0) ",%%xmm2 \n"
+ "lea " MEMLEA(0x4,0) ",%0 \n"
"punpcklbw %%xmm1,%%xmm2 \n"
"punpcklwd %%xmm1,%%xmm2 \n"
"paddd %%xmm2,%%xmm0 \n"
- "movdqu (%1,%2,1),%%xmm2 \n"
+ "movdqu " MEMACCESS(2) ",%%xmm2 \n"
+ "lea " MEMLEA(0x10,2) ",%2 \n"
"paddd %%xmm0,%%xmm2 \n"
- "movdqu %%xmm2,(%1) \n"
- "lea 0x10(%1),%1 \n"
+ "movdqu %%xmm2," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
"sub $0x1,%3 \n"
"jge 10b \n"
@@ -4598,34 +5189,83 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
int width, int area, uint8* dst,
int count) {
asm volatile (
- "movd %5,%%xmm4 \n"
- "cvtdq2ps %%xmm4,%%xmm4 \n"
- "rcpss %%xmm4,%%xmm4 \n"
+ "movd %5,%%xmm5 \n"
+ "cvtdq2ps %%xmm5,%%xmm5 \n"
+ "rcpss %%xmm5,%%xmm4 \n"
"pshufd $0x0,%%xmm4,%%xmm4 \n"
"sub $0x4,%3 \n"
"jl 49f \n"
+ "cmpl $0x80,%5 \n"
+ "ja 40f \n"
+
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ "pcmpeqb %%xmm6,%%xmm6 \n"
+ "psrld $0x10,%%xmm6 \n"
+ "cvtdq2ps %%xmm6,%%xmm6 \n"
+ "addps %%xmm6,%%xmm5 \n"
+ "mulps %%xmm4,%%xmm5 \n"
+ "cvtps2dq %%xmm5,%%xmm5 \n"
+ "packssdw %%xmm5,%%xmm5 \n"
+
+ // 4 pixel small loop \n"
+ ".p2align 2 \n"
+ BUNDLEALIGN
+ "4: \n"
+ "movdqa " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n"
+ BUNDLEALIGN
+ MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0
+ MEMOPREG(psubd,0x10,0,4,4,xmm1) // psubd 0x10(%0,%4,4),%%xmm1
+ MEMOPREG(psubd,0x20,0,4,4,xmm2) // psubd 0x20(%0,%4,4),%%xmm2
+ MEMOPREG(psubd,0x30,0,4,4,xmm3) // psubd 0x30(%0,%4,4),%%xmm3
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "psubd " MEMACCESS(1) ",%%xmm0 \n"
+ "psubd " MEMACCESS2(0x10,1) ",%%xmm1 \n"
+ "psubd " MEMACCESS2(0x20,1) ",%%xmm2 \n"
+ "psubd " MEMACCESS2(0x30,1) ",%%xmm3 \n"
+ BUNDLEALIGN
+ MEMOPREG(paddd,0x00,1,4,4,xmm0) // paddd 0x00(%1,%4,4),%%xmm0
+ MEMOPREG(paddd,0x10,1,4,4,xmm1) // paddd 0x10(%1,%4,4),%%xmm1
+ MEMOPREG(paddd,0x20,1,4,4,xmm2) // paddd 0x20(%1,%4,4),%%xmm2
+ MEMOPREG(paddd,0x30,1,4,4,xmm3) // paddd 0x30(%1,%4,4),%%xmm3
+ "lea " MEMLEA(0x40,1) ",%1 \n"
+ "packssdw %%xmm1,%%xmm0 \n"
+ "packssdw %%xmm3,%%xmm2 \n"
+ "pmulhuw %%xmm5,%%xmm0 \n"
+ "pmulhuw %%xmm5,%%xmm2 \n"
+ "packuswb %%xmm2,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(2) " \n"
+ "lea " MEMLEA(0x10,2) ",%2 \n"
+ "sub $0x4,%3 \n"
+ "jge 4b \n"
+ "jmp 49f \n"
// 4 pixel loop \n"
".p2align 2 \n"
+ BUNDLEALIGN
"40: \n"
- "movdqa (%0),%%xmm0 \n"
- "movdqa 0x10(%0),%%xmm1 \n"
- "movdqa 0x20(%0),%%xmm2 \n"
- "movdqa 0x30(%0),%%xmm3 \n"
- "psubd (%0,%4,4),%%xmm0 \n"
- "psubd 0x10(%0,%4,4),%%xmm1 \n"
- "psubd 0x20(%0,%4,4),%%xmm2 \n"
- "psubd 0x30(%0,%4,4),%%xmm3 \n"
- "lea 0x40(%0),%0 \n"
- "psubd (%1),%%xmm0 \n"
- "psubd 0x10(%1),%%xmm1 \n"
- "psubd 0x20(%1),%%xmm2 \n"
- "psubd 0x30(%1),%%xmm3 \n"
- "paddd (%1,%4,4),%%xmm0 \n"
- "paddd 0x10(%1,%4,4),%%xmm1 \n"
- "paddd 0x20(%1,%4,4),%%xmm2 \n"
- "paddd 0x30(%1,%4,4),%%xmm3 \n"
- "lea 0x40(%1),%1 \n"
+ "movdqa " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
+ "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n"
+ BUNDLEALIGN
+ MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0
+ MEMOPREG(psubd,0x10,0,4,4,xmm1) // psubd 0x10(%0,%4,4),%%xmm1
+ MEMOPREG(psubd,0x20,0,4,4,xmm2) // psubd 0x20(%0,%4,4),%%xmm2
+ MEMOPREG(psubd,0x30,0,4,4,xmm3) // psubd 0x30(%0,%4,4),%%xmm3
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "psubd " MEMACCESS(1) ",%%xmm0 \n"
+ "psubd " MEMACCESS2(0x10,1) ",%%xmm1 \n"
+ "psubd " MEMACCESS2(0x20,1) ",%%xmm2 \n"
+ "psubd " MEMACCESS2(0x30,1) ",%%xmm3 \n"
+ BUNDLEALIGN
+ MEMOPREG(paddd,0x00,1,4,4,xmm0) // paddd 0x00(%1,%4,4),%%xmm0
+ MEMOPREG(paddd,0x10,1,4,4,xmm1) // paddd 0x10(%1,%4,4),%%xmm1
+ MEMOPREG(paddd,0x20,1,4,4,xmm2) // paddd 0x20(%1,%4,4),%%xmm2
+ MEMOPREG(paddd,0x30,1,4,4,xmm3) // paddd 0x30(%1,%4,4),%%xmm3
+ "lea " MEMLEA(0x40,1) ",%1 \n"
"cvtdq2ps %%xmm0,%%xmm0 \n"
"cvtdq2ps %%xmm1,%%xmm1 \n"
"mulps %%xmm4,%%xmm0 \n"
@@ -4641,8 +5281,8 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
"packssdw %%xmm1,%%xmm0 \n"
"packssdw %%xmm3,%%xmm2 \n"
"packuswb %%xmm2,%%xmm0 \n"
- "movdqu %%xmm0,(%2) \n"
- "lea 0x10(%2),%2 \n"
+ "movdqu %%xmm0," MEMACCESS(2) " \n"
+ "lea " MEMLEA(0x10,2) ",%2 \n"
"sub $0x4,%3 \n"
"jge 40b \n"
@@ -4652,20 +5292,22 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
// 1 pixel loop \n"
".p2align 2 \n"
+ BUNDLEALIGN
"10: \n"
- "movdqa (%0),%%xmm0 \n"
- "psubd (%0,%4,4),%%xmm0 \n"
- "lea 0x10(%0),%0 \n"
- "psubd (%1),%%xmm0 \n"
- "paddd (%1,%4,4),%%xmm0 \n"
- "lea 0x10(%1),%1 \n"
+ "movdqa " MEMACCESS(0) ",%%xmm0 \n"
+ MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "psubd " MEMACCESS(1) ",%%xmm0 \n"
+ BUNDLEALIGN
+ MEMOPREG(paddd,0x00,1,4,4,xmm0) // paddd 0x00(%1,%4,4),%%xmm0
+ "lea " MEMLEA(0x10,1) ",%1 \n"
"cvtdq2ps %%xmm0,%%xmm0 \n"
"mulps %%xmm4,%%xmm0 \n"
"cvtps2dq %%xmm0,%%xmm0 \n"
"packssdw %%xmm0,%%xmm0 \n"
"packuswb %%xmm0,%%xmm0 \n"
- "movd %%xmm0,(%2) \n"
- "lea 0x4(%2),%2 \n"
+ "movd %%xmm0," MEMACCESS(2) " \n"
+ "lea " MEMLEA(0x4,2) ",%2 \n"
"sub $0x1,%3 \n"
"jge 10b \n"
"19: \n"
@@ -4676,27 +5318,26 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
: "r"(static_cast<intptr_t>(width)), // %4
"rm"(area) // %5
: "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14"
+#endif
#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
#endif
);
}
#endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
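
The interesting change in CumulativeSumToAverageRow_SSE2 is the new fast path taken when area <= 128 ("cmpl $0x80,%5"): rather than converting every box sum to float and back, it builds one 16-bit fixed-point multiplier from the rcpss reciprocal, packs the 32-bit sums to words with packssdw, and scales them with pmulhuw. The 128 limit presumably exists because a box sum of at most 128 * 255 = 32640 still fits in a signed 16-bit word. A worked example of the fixed-point step, treating rcpss as exact for readability:

    // area = 25 (a 5x5 box of pixels with value 100):
    //   mult = (int)((area + 65535.0f) * (1.0f / area))   // the addps/mulps/cvtps2dq above
    //        = (int)(65560.0f / 25.0f) = 2622              // roughly 65536 / area, biased up
    //   sum  = 25 * 100 = 2500
    //   avg  = (sum * mult) >> 16                          // pmulhuw
    //        = (2500 * 2622) >> 16 = 6555000 >> 16 = 100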
#ifdef HAS_ARGBAFFINEROW_SSE2
-// TODO(fbarchard): Find 64 bit way to avoid masking.
// Copy ARGB pixels from source image with slope to a row of destination.
-// Caveat - in 64 bit, movd is used with 64 bit gpr due to Mac gcc producing
-// an error if movq is used. movd %%xmm0,%1
-
LIBYUV_API
void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
uint8* dst_argb, const float* src_dudv, int width) {
intptr_t src_argb_stride_temp = src_argb_stride;
intptr_t temp = 0;
asm volatile (
- "movq (%3),%%xmm2 \n"
- "movq 0x8(%3),%%xmm7 \n"
+ "movq " MEMACCESS(3) ",%%xmm2 \n"
+ "movq " MEMACCESS2(0x08,3) ",%%xmm7 \n"
"shl $0x10,%1 \n"
"add $0x4,%1 \n"
"movd %1,%%xmm5 \n"
@@ -4715,46 +5356,31 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
"addps %%xmm4,%%xmm4 \n"
// 4 pixel loop \n"
- ".p2align 4 \n"
+ ".p2align 2 \n"
"40: \n"
- "cvttps2dq %%xmm2,%%xmm0 \n"
- "cvttps2dq %%xmm3,%%xmm1 \n"
- "packssdw %%xmm1,%%xmm0 \n"
- "pmaddwd %%xmm5,%%xmm0 \n"
-#if defined(__x86_64__)
- "movd %%xmm0,%1 \n"
- "mov %1,%5 \n"
- "and $0x0fffffff,%1 \n"
- "shr $32,%5 \n"
- "pshufd $0xEE,%%xmm0,%%xmm0 \n"
-#else
- "movd %%xmm0,%1 \n"
+ "cvttps2dq %%xmm2,%%xmm0 \n" // x, y float to int first 2
+ "cvttps2dq %%xmm3,%%xmm1 \n" // x, y float to int next 2
+ "packssdw %%xmm1,%%xmm0 \n" // x, y as 8 shorts
+ "pmaddwd %%xmm5,%%xmm0 \n" // off = x * 4 + y * stride
+ "movd %%xmm0,%k1 \n"
"pshufd $0x39,%%xmm0,%%xmm0 \n"
- "movd %%xmm0,%5 \n"
+ "movd %%xmm0,%k5 \n"
"pshufd $0x39,%%xmm0,%%xmm0 \n"
-#endif
- "movd (%0,%1,1),%%xmm1 \n"
- "movd (%0,%5,1),%%xmm6 \n"
+ MEMOPREG(movd,0x00,0,1,1,xmm1) // movd (%0,%1,1),%%xmm1
+ MEMOPREG(movd,0x00,0,5,1,xmm6) // movd (%0,%5,1),%%xmm6
"punpckldq %%xmm6,%%xmm1 \n"
"addps %%xmm4,%%xmm2 \n"
- "movq %%xmm1,(%2) \n"
-#if defined(__x86_64__)
- "movd %%xmm0,%1 \n"
- "mov %1,%5 \n"
- "and $0x0fffffff,%1 \n"
- "shr $32,%5 \n"
-#else
- "movd %%xmm0,%1 \n"
+ "movq %%xmm1," MEMACCESS(2) " \n"
+ "movd %%xmm0,%k1 \n"
"pshufd $0x39,%%xmm0,%%xmm0 \n"
- "movd %%xmm0,%5 \n"
-#endif
- "movd (%0,%1,1),%%xmm0 \n"
- "movd (%0,%5,1),%%xmm6 \n"
+ "movd %%xmm0,%k5 \n"
+ MEMOPREG(movd,0x00,0,1,1,xmm0) // movd (%0,%1,1),%%xmm0
+ MEMOPREG(movd,0x00,0,5,1,xmm6) // movd (%0,%5,1),%%xmm6
"punpckldq %%xmm6,%%xmm0 \n"
"addps %%xmm4,%%xmm3 \n"
"sub $0x4,%4 \n"
- "movq %%xmm0,0x08(%2) \n"
- "lea 0x10(%2),%2 \n"
+ "movq %%xmm0," MEMACCESS2(0x08,2) " \n"
+ "lea " MEMLEA(0x10,2) ",%2 \n"
"jge 40b \n"
"49: \n"
@@ -4762,20 +5388,18 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
"jl 19f \n"
// 1 pixel loop \n"
- ".p2align 4 \n"
+ ".p2align 2 \n"
+ BUNDLEALIGN
"10: \n"
"cvttps2dq %%xmm2,%%xmm0 \n"
"packssdw %%xmm0,%%xmm0 \n"
"pmaddwd %%xmm5,%%xmm0 \n"
"addps %%xmm7,%%xmm2 \n"
- "movd %%xmm0,%1 \n"
-#if defined(__x86_64__)
- "and $0x0fffffff,%1 \n"
-#endif
- "movd (%0,%1,1),%%xmm0 \n"
+ "movd %%xmm0,%k1 \n"
+ MEMOPREG(movd,0x00,0,1,1,xmm0) // movd (%0,%1,1),%%xmm0
"sub $0x1,%4 \n"
- "movd %%xmm0,(%2) \n"
- "lea 0x4(%2),%2 \n"
+ "movd %%xmm0," MEMACCESS(2) " \n"
+ "lea " MEMLEA(0x04,2) ",%2 \n"
"jge 10b \n"
"19: \n"
: "+r"(src_argb), // %0
@@ -4786,6 +5410,9 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
"+r"(temp) // %5
:
: "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14"
+#endif
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
@@ -4793,6 +5420,7 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
}
#endif // HAS_ARGBAFFINEROW_SSE2
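
ARGBAffineRow_SSE2 above finally drops its 64-bit masking workaround (the removed TODO and the "and $0x0fffffff" / "shr $32" dance): the offsets now come out with "movd %%xmm0,%k1", where the k modifier names the 32-bit half of the operand's register, and a 32-bit register write zero-extends on x86-64, so no mask is needed; the indexed loads then go through MEMOPREG for NaCl. The sampling itself is what the new inline comments spell out; per destination pixel, treating src_dudv as {u, v, du, dv}:

    //   int x = (int)u, y = (int)v;                          // cvttps2dq
    //   int offset = x * 4 + y * src_argb_stride;            // pmaddwd with packed (4, stride)
    //   *(uint32*)dst = *(const uint32*)(src_argb + offset);  // movd load, movd/movq store
    //   u += du; v += dv;                                     // addps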
+#ifdef HAS_INTERPOLATEROW_SSSE3
// Bilinear filter 16x2 -> 16x1
void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
ptrdiff_t src_stride, int dst_width,
@@ -4818,10 +5446,11 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
"pshufd $0x0,%%xmm5,%%xmm5 \n"
// General purpose row blend.
- ".p2align 4 \n"
+ ".p2align 2 \n"
+ BUNDLEALIGN
"1: \n"
- "movdqa (%1),%%xmm0 \n"
- "movdqa (%1,%4,1),%%xmm2 \n"
+ "movdqa " MEMACCESS(1) ",%%xmm0 \n"
+ MEMOPREG(movdqa,0x00,1,4,1,xmm2)
"movdqa %%xmm0,%%xmm1 \n"
"punpcklbw %%xmm2,%%xmm0 \n"
"punpckhbw %%xmm2,%%xmm1 \n"
@@ -4831,56 +5460,64 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
"psrlw $0x7,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
"sub $0x10,%2 \n"
- "movdqa %%xmm0,(%1,%0,1) \n"
- "lea 0x10(%1),%1 \n"
+ BUNDLEALIGN
+ MEMOPMEM(movdqa,xmm0,0x00,1,0,1)
+ "lea " MEMLEA(0x10,1) ",%1 \n"
"jg 1b \n"
"jmp 99f \n"
// Blend 25 / 75.
- ".p2align 4 \n"
+ ".p2align 2 \n"
+ BUNDLEALIGN
"25: \n"
- "movdqa (%1),%%xmm0 \n"
- "movdqa (%1,%4,1),%%xmm1 \n"
+ "movdqa " MEMACCESS(1) ",%%xmm0 \n"
+ MEMOPREG(movdqa,0x00,1,4,1,xmm1)
"pavgb %%xmm1,%%xmm0 \n"
"pavgb %%xmm1,%%xmm0 \n"
"sub $0x10,%2 \n"
- "movdqa %%xmm0,(%1,%0,1) \n"
- "lea 0x10(%1),%1 \n"
+ BUNDLEALIGN
+ MEMOPMEM(movdqa,xmm0,0x00,1,0,1)
+ "lea " MEMLEA(0x10,1) ",%1 \n"
"jg 25b \n"
"jmp 99f \n"
// Blend 50 / 50.
- ".p2align 4 \n"
+ ".p2align 2 \n"
+ BUNDLEALIGN
"50: \n"
- "movdqa (%1),%%xmm0 \n"
- "movdqa (%1,%4,1),%%xmm1 \n"
+ "movdqa " MEMACCESS(1) ",%%xmm0 \n"
+ MEMOPREG(movdqa,0x00,1,4,1,xmm1)
"pavgb %%xmm1,%%xmm0 \n"
"sub $0x10,%2 \n"
- "movdqa %%xmm0,(%1,%0,1) \n"
- "lea 0x10(%1),%1 \n"
+ BUNDLEALIGN
+ MEMOPMEM(movdqa,xmm0,0x00,1,0,1)
+ "lea " MEMLEA(0x10,1) ",%1 \n"
"jg 50b \n"
"jmp 99f \n"
// Blend 75 / 25.
- ".p2align 4 \n"
+ ".p2align 2 \n"
+ BUNDLEALIGN
"75: \n"
- "movdqa (%1),%%xmm1 \n"
- "movdqa (%1,%4,1),%%xmm0 \n"
+ "movdqa " MEMACCESS(1) ",%%xmm1 \n"
+ MEMOPREG(movdqa,0x00,1,4,1,xmm0)
"pavgb %%xmm1,%%xmm0 \n"
"pavgb %%xmm1,%%xmm0 \n"
"sub $0x10,%2 \n"
- "movdqa %%xmm0,(%1,%0,1) \n"
- "lea 0x10(%1),%1 \n"
+ BUNDLEALIGN
+ MEMOPMEM(movdqa,xmm0,0x00,1,0,1)
+ "lea " MEMLEA(0x10,1) ",%1 \n"
"jg 75b \n"
"jmp 99f \n"
// Blend 100 / 0 - Copy row unchanged.
- ".p2align 4 \n"
+ ".p2align 2 \n"
+ BUNDLEALIGN
"100: \n"
- "movdqa (%1),%%xmm0 \n"
+ "movdqa " MEMACCESS(1) ",%%xmm0 \n"
"sub $0x10,%2 \n"
- "movdqa %%xmm0,(%1,%0,1) \n"
- "lea 0x10(%1),%1 \n"
+ MEMOPMEM(movdqa,xmm0,0x00,1,0,1)
+ "lea " MEMLEA(0x10,1) ",%1 \n"
"jg 100b \n"
"99: \n"
@@ -4890,11 +5527,15 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
"+r"(source_y_fraction) // %3
: "r"(static_cast<intptr_t>(src_stride)) // %4
: "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14"
+#endif
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm5"
#endif
);
}
+#endif // HAS_INTERPOLATEROW_SSSE3
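
The InterpolateRow family above and below keeps its structure through the conversion: a general weighted blend of two rows selected by source_y_fraction, plus special-cased fractions built entirely out of pavgb, the rounded unsigned byte average avg(a, b) = (a + b + 1) >> 1. From the labelled branches:

    // r0 = row at src_ptr, r1 = row at src_ptr + src_stride.
    //   100 /  0 : out = r0                        // plain copy
    //    50 / 50 : out = avg(r0, r1)               // one pavgb
    //    25 / 75 : out = avg(avg(r0, r1), r1)      // two pavgb, weighted toward r1
    //    75 / 25 : out = avg(avg(r0, r1), r0)      // two pavgb, weighted toward r0
    // Any other fraction takes the general path, which blends the two rows
    // per byte according to source_y_fraction.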
#ifdef HAS_INTERPOLATEROW_SSE2
// Bilinear filter 16x2 -> 16x1
@@ -4923,10 +5564,11 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
"pxor %%xmm4,%%xmm4 \n"
// General purpose row blend.
- ".p2align 4 \n"
+ ".p2align 2 \n"
+ BUNDLEALIGN
"1: \n"
- "movdqa (%1),%%xmm0 \n"
- "movdqa (%1,%4,1),%%xmm2 \n"
+ "movdqa " MEMACCESS(1) ",%%xmm0 \n"
+ MEMOPREG(movdqa,0x00,1,4,1,xmm2) // movdqa (%1,%4,1),%%xmm2
"movdqa %%xmm0,%%xmm1 \n"
"movdqa %%xmm2,%%xmm3 \n"
"punpcklbw %%xmm4,%%xmm2 \n"
@@ -4943,56 +5585,64 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
"paddw %%xmm3,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
"sub $0x10,%2 \n"
- "movdqa %%xmm0,(%1,%0,1) \n"
- "lea 0x10(%1),%1 \n"
+ BUNDLEALIGN
+ MEMOPMEM(movdqa,xmm0,0x00,1,0,1) // movdqa %%xmm0,(%1,%0,1)
+ "lea " MEMLEA(0x10,1) ",%1 \n"
"jg 1b \n"
"jmp 99f \n"
// Blend 25 / 75.
- ".p2align 4 \n"
+ ".p2align 2 \n"
+ BUNDLEALIGN
"25: \n"
- "movdqa (%1),%%xmm0 \n"
- "movdqa (%1,%4,1),%%xmm1 \n"
+ "movdqa " MEMACCESS(1) ",%%xmm0 \n"
+ MEMOPREG(movdqa,0x00,1,4,1,xmm1) // movdqa (%1,%4,1),%%xmm1
"pavgb %%xmm1,%%xmm0 \n"
"pavgb %%xmm1,%%xmm0 \n"
"sub $0x10,%2 \n"
- "movdqa %%xmm0,(%1,%0,1) \n"
- "lea 0x10(%1),%1 \n"
+ BUNDLEALIGN
+ MEMOPMEM(movdqa,xmm0,0x00,1,0,1) // movdqa %%xmm0,(%1,%0,1)
+ "lea " MEMLEA(0x10,1) ",%1 \n"
"jg 25b \n"
"jmp 99f \n"
// Blend 50 / 50.
- ".p2align 4 \n"
+ ".p2align 2 \n"
+ BUNDLEALIGN
"50: \n"
- "movdqa (%1),%%xmm0 \n"
- "movdqa (%1,%4,1),%%xmm1 \n"
+ "movdqa " MEMACCESS(1) ",%%xmm0 \n"
+ MEMOPREG(movdqa,0x00,1,4,1,xmm1) // movdqa (%1,%4,1),%%xmm1
"pavgb %%xmm1,%%xmm0 \n"
"sub $0x10,%2 \n"
- "movdqa %%xmm0,(%1,%0,1) \n"
- "lea 0x10(%1),%1 \n"
+ BUNDLEALIGN
+ MEMOPMEM(movdqa,xmm0,0x00,1,0,1) // movdqa %%xmm0,(%1,%0,1)
+ "lea " MEMLEA(0x10,1) ",%1 \n"
"jg 50b \n"
"jmp 99f \n"
// Blend 75 / 25.
- ".p2align 4 \n"
+ ".p2align 2 \n"
+ BUNDLEALIGN
"75: \n"
- "movdqa (%1),%%xmm1 \n"
- "movdqa (%1,%4,1),%%xmm0 \n"
+ "movdqa " MEMACCESS(1) ",%%xmm1 \n"
+ MEMOPREG(movdqa,0x00,1,4,1,xmm0) // movdqa (%1,%4,1),%%xmm0
"pavgb %%xmm1,%%xmm0 \n"
"pavgb %%xmm1,%%xmm0 \n"
"sub $0x10,%2 \n"
- "movdqa %%xmm0,(%1,%0,1) \n"
- "lea 0x10(%1),%1 \n"
+ BUNDLEALIGN
+ MEMOPMEM(movdqa,xmm0,0x00,1,0,1) // movdqa %%xmm0,(%1,%0,1)
+ "lea " MEMLEA(0x10,1) ",%1 \n"
"jg 75b \n"
"jmp 99f \n"
// Blend 100 / 0 - Copy row unchanged.
- ".p2align 4 \n"
+ ".p2align 2 \n"
+ BUNDLEALIGN
"100: \n"
- "movdqa (%1),%%xmm0 \n"
+ "movdqa " MEMACCESS(1) ",%%xmm0 \n"
"sub $0x10,%2 \n"
- "movdqa %%xmm0,(%1,%0,1) \n"
- "lea 0x10(%1),%1 \n"
+ MEMOPMEM(movdqa,xmm0,0x00,1,0,1) // movdqa %%xmm0,(%1,%0,1)
+ "lea " MEMLEA(0x10,1) ",%1 \n"
"jg 100b \n"
"99: \n"
@@ -5002,6 +5652,9 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
"+r"(source_y_fraction) // %3
: "r"(static_cast<intptr_t>(src_stride)) // %4
: "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14"
+#endif
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
@@ -5009,6 +5662,7 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
}
#endif // HAS_INTERPOLATEROW_SSE2
+#ifdef HAS_INTERPOLATEROW_SSSE3
// Bilinear filter 16x2 -> 16x1
void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
ptrdiff_t src_stride, int dst_width,
@@ -5034,10 +5688,11 @@ void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
"pshufd $0x0,%%xmm5,%%xmm5 \n"
// General purpose row blend.
- ".p2align 4 \n"
+ ".p2align 2 \n"
+ BUNDLEALIGN
"1: \n"
- "movdqu (%1),%%xmm0 \n"
- "movdqu (%1,%4,1),%%xmm2 \n"
+ "movdqu " MEMACCESS(1) ",%%xmm0 \n"
+ MEMOPREG(movdqu,0x00,1,4,1,xmm2)
"movdqu %%xmm0,%%xmm1 \n"
"punpcklbw %%xmm2,%%xmm0 \n"
"punpckhbw %%xmm2,%%xmm1 \n"
@@ -5047,56 +5702,64 @@ void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
"psrlw $0x7,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
"sub $0x10,%2 \n"
- "movdqu %%xmm0,(%1,%0,1) \n"
- "lea 0x10(%1),%1 \n"
+ BUNDLEALIGN
+ MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
+ "lea " MEMLEA(0x10,1) ",%1 \n"
"jg 1b \n"
"jmp 99f \n"
// Blend 25 / 75.
- ".p2align 4 \n"
+ ".p2align 2 \n"
+ BUNDLEALIGN
"25: \n"
- "movdqu (%1),%%xmm0 \n"
- "movdqu (%1,%4,1),%%xmm1 \n"
+ "movdqu " MEMACCESS(1) ",%%xmm0 \n"
+ MEMOPREG(movdqu,0x00,1,4,1,xmm1)
"pavgb %%xmm1,%%xmm0 \n"
"pavgb %%xmm1,%%xmm0 \n"
"sub $0x10,%2 \n"
- "movdqu %%xmm0,(%1,%0,1) \n"
- "lea 0x10(%1),%1 \n"
+ BUNDLEALIGN
+ MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
+ "lea " MEMLEA(0x10,1) ",%1 \n"
"jg 25b \n"
"jmp 99f \n"
// Blend 50 / 50.
- ".p2align 4 \n"
+ ".p2align 2 \n"
+ BUNDLEALIGN
"50: \n"
- "movdqu (%1),%%xmm0 \n"
- "movdqu (%1,%4,1),%%xmm1 \n"
+ "movdqu " MEMACCESS(1) ",%%xmm0 \n"
+ MEMOPREG(movdqu,0x00,1,4,1,xmm1)
"pavgb %%xmm1,%%xmm0 \n"
"sub $0x10,%2 \n"
- "movdqu %%xmm0,(%1,%0,1) \n"
- "lea 0x10(%1),%1 \n"
+ BUNDLEALIGN
+ MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
+ "lea " MEMLEA(0x10,1) ",%1 \n"
"jg 50b \n"
"jmp 99f \n"
// Blend 75 / 25.
- ".p2align 4 \n"
+ ".p2align 2 \n"
+ BUNDLEALIGN
"75: \n"
- "movdqu (%1),%%xmm1 \n"
- "movdqu (%1,%4,1),%%xmm0 \n"
+ "movdqu " MEMACCESS(1) ",%%xmm1 \n"
+ MEMOPREG(movdqu,0x00,1,4,1,xmm0)
"pavgb %%xmm1,%%xmm0 \n"
"pavgb %%xmm1,%%xmm0 \n"
"sub $0x10,%2 \n"
- "movdqu %%xmm0,(%1,%0,1) \n"
- "lea 0x10(%1),%1 \n"
+ BUNDLEALIGN
+ MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
+ "lea " MEMLEA(0x10,1) ",%1 \n"
"jg 75b \n"
"jmp 99f \n"
// Blend 100 / 0 - Copy row unchanged.
- ".p2align 4 \n"
+ ".p2align 2 \n"
+ BUNDLEALIGN
"100: \n"
- "movdqu (%1),%%xmm0 \n"
+ "movdqu " MEMACCESS(1) ",%%xmm0 \n"
"sub $0x10,%2 \n"
- "movdqu %%xmm0,(%1,%0,1) \n"
- "lea 0x10(%1),%1 \n"
+ MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
+ "lea " MEMLEA(0x10,1) ",%1 \n"
"jg 100b \n"
"99: \n"
@@ -5106,11 +5769,15 @@ void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
"+r"(source_y_fraction) // %3
: "r"(static_cast<intptr_t>(src_stride)) // %4
: "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14"
+#endif
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm5"
#endif
);
}
+#endif // HAS_INTERPOLATEROW_SSSE3
#ifdef HAS_INTERPOLATEROW_SSE2
// Bilinear filter 16x2 -> 16x1
@@ -5139,10 +5806,11 @@ void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr,
"pxor %%xmm4,%%xmm4 \n"
// General purpose row blend.
- ".p2align 4 \n"
+ ".p2align 2 \n"
+ BUNDLEALIGN
"1: \n"
- "movdqu (%1),%%xmm0 \n"
- "movdqu (%1,%4,1),%%xmm2 \n"
+ "movdqu " MEMACCESS(1) ",%%xmm0 \n"
+ MEMOPREG(movdqu,0x00,1,4,1,xmm2) // movdqu (%1,%4,1),%%xmm2
"movdqu %%xmm0,%%xmm1 \n"
"movdqu %%xmm2,%%xmm3 \n"
"punpcklbw %%xmm4,%%xmm2 \n"
@@ -5159,56 +5827,64 @@ void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr,
"paddw %%xmm3,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
"sub $0x10,%2 \n"
- "movdqu %%xmm0,(%1,%0,1) \n"
- "lea 0x10(%1),%1 \n"
+ BUNDLEALIGN
+ MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1)
+ "lea " MEMLEA(0x10,1) ",%1 \n"
"jg 1b \n"
"jmp 99f \n"
// Blend 25 / 75.
- ".p2align 4 \n"
+ ".p2align 2 \n"
+ BUNDLEALIGN
"25: \n"
- "movdqu (%1),%%xmm0 \n"
- "movdqu (%1,%4,1),%%xmm1 \n"
+ "movdqu " MEMACCESS(1) ",%%xmm0 \n"
+ MEMOPREG(movdqu,0x00,1,4,1,xmm1) // movdqu (%1,%4,1),%%xmm1
"pavgb %%xmm1,%%xmm0 \n"
"pavgb %%xmm1,%%xmm0 \n"
"sub $0x10,%2 \n"
- "movdqu %%xmm0,(%1,%0,1) \n"
- "lea 0x10(%1),%1 \n"
+ BUNDLEALIGN
+ MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1)
+ "lea " MEMLEA(0x10,1) ",%1 \n"
"jg 25b \n"
"jmp 99f \n"
// Blend 50 / 50.
- ".p2align 4 \n"
+ ".p2align 2 \n"
+ BUNDLEALIGN
"50: \n"
- "movdqu (%1),%%xmm0 \n"
- "movdqu (%1,%4,1),%%xmm1 \n"
+ "movdqu " MEMACCESS(1) ",%%xmm0 \n"
+ MEMOPREG(movdqu,0x00,1,4,1,xmm1) // movdqu (%1,%4,1),%%xmm1
"pavgb %%xmm1,%%xmm0 \n"
"sub $0x10,%2 \n"
- "movdqu %%xmm0,(%1,%0,1) \n"
- "lea 0x10(%1),%1 \n"
+ BUNDLEALIGN
+ MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1)
+ "lea " MEMLEA(0x10,1) ",%1 \n"
"jg 50b \n"
"jmp 99f \n"
// Blend 75 / 25.
- ".p2align 4 \n"
+ ".p2align 2 \n"
+ BUNDLEALIGN
"75: \n"
- "movdqu (%1),%%xmm1 \n"
- "movdqu (%1,%4,1),%%xmm0 \n"
+ "movdqu " MEMACCESS(1) ",%%xmm1 \n"
+ MEMOPREG(movdqu,0x00,1,4,1,xmm0) // movdqu (%1,%4,1),%%xmm0
"pavgb %%xmm1,%%xmm0 \n"
"pavgb %%xmm1,%%xmm0 \n"
"sub $0x10,%2 \n"
- "movdqu %%xmm0,(%1,%0,1) \n"
- "lea 0x10(%1),%1 \n"
+ BUNDLEALIGN
+ MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1)
+ "lea " MEMLEA(0x10,1) ",%1 \n"
"jg 75b \n"
"jmp 99f \n"
// Blend 100 / 0 - Copy row unchanged.
- ".p2align 4 \n"
+ ".p2align 2 \n"
+ BUNDLEALIGN
"100: \n"
- "movdqu (%1),%%xmm0 \n"
+ "movdqu " MEMACCESS(1) ",%%xmm0 \n"
"sub $0x10,%2 \n"
- "movdqu %%xmm0,(%1,%0,1) \n"
- "lea 0x10(%1),%1 \n"
+ MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1)
+ "lea " MEMLEA(0x10,1) ",%1 \n"
"jg 100b \n"
"99: \n"
@@ -5218,6 +5894,9 @@ void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr,
"+r"(source_y_fraction) // %3
: "r"(static_cast<intptr_t>(src_stride)) // %4
: "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14"
+#endif
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
@@ -5225,17 +5904,19 @@ void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr,
}
#endif // HAS_INTERPOLATEROW_SSE2
+#ifdef HAS_HALFROW_SSE2
void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
uint8* dst_uv, int pix) {
asm volatile (
"sub %0,%1 \n"
- ".p2align 4 \n"
+ ".p2align 2 \n"
+ BUNDLEALIGN
"1: \n"
- "movdqa (%0),%%xmm0 \n"
- "pavgb (%0,%3),%%xmm0 \n"
+ "movdqa " MEMACCESS(0) ",%%xmm0 \n"
+ MEMOPREG(pavgb,0x00,0,3,1,xmm0) // pavgb (%0,%3),%%xmm0
"sub $0x10,%2 \n"
- "movdqa %%xmm0,(%0,%1) \n"
- "lea 0x10(%0),%0 \n"
+ MEMOPMEM(movdqa,xmm0,0x00,0,1,1) // movdqa %%xmm0,(%0,%1)
+ "lea " MEMLEA(0x10,0) ",%0 \n"
"jg 1b \n"
: "+r"(src_uv), // %0
"+r"(dst_uv), // %1
@@ -5247,23 +5928,26 @@ void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
#endif
);
}
+#endif // HAS_HALFROW_SSE2
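HalfRow averages each byte with the byte one stride below; a minimal scalar sketch, with an illustrative name:

#include <stdint.h>

// Round-to-nearest average of two rows, matching pavgb semantics.
static void HalfRowScalar(const uint8_t* src_uv, int src_uv_stride,
                          uint8_t* dst_uv, int pix) {
  for (int x = 0; x < pix; ++x) {
    dst_uv[x] = (uint8_t)((src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1);
  }
}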
+#ifdef HAS_ARGBTOBAYERROW_SSSE3
void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
uint32 selector, int pix) {
asm volatile (
+ // NaCL caveat - assumes movd is from GPR
"movd %3,%%xmm5 \n"
"pshufd $0x0,%%xmm5,%%xmm5 \n"
- ".p2align 4 \n"
+ ".p2align 2 \n"
"1: \n"
- "movdqa (%0),%%xmm0 \n"
- "movdqa 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
+ "movdqa " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
"pshufb %%xmm5,%%xmm0 \n"
"pshufb %%xmm5,%%xmm1 \n"
"punpckldq %%xmm1,%%xmm0 \n"
"sub $0x8,%2 \n"
- "movq %%xmm0,(%1) \n"
- "lea 0x8(%1),%1 \n"
+ "movq %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x8,1) ",%1 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_bayer), // %1
@@ -5275,23 +5959,58 @@ void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
#endif
);
}
+#endif // HAS_ARGBTOBAYERROW_SSSE3
+
+#ifdef HAS_ARGBTOBAYERGGROW_SSE2
+void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer,
+ uint32 selector, int pix) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrld $0x18,%%xmm5 \n"
+ ".p2align 2 \n"
+ "1: \n"
+ "movdqa " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "psrld $0x8,%%xmm0 \n"
+ "psrld $0x8,%%xmm1 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "packssdw %%xmm1,%%xmm0 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "sub $0x8,%2 \n"
+ "movq %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x8,1) ",%1 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_bayer), // %1
+ "+r"(pix) // %2
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm5"
+#endif
+ );
+}
+#endif // HAS_ARGBTOBAYERGGROW_SSE2
+#ifdef HAS_ARGBSHUFFLEROW_SSSE3
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
const uint8* shuffler, int pix) {
asm volatile (
- "movdqa (%3),%%xmm5 \n"
- ".p2align 4 \n"
+ "movdqa " MEMACCESS(3) ",%%xmm5 \n"
+ ".p2align 2 \n"
"1: \n"
- "movdqa (%0),%%xmm0 \n"
- "movdqa 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
+ "movdqa " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
"pshufb %%xmm5,%%xmm0 \n"
"pshufb %%xmm5,%%xmm1 \n"
"sub $0x8,%2 \n"
- "movdqa %%xmm0,(%1) \n"
- "movdqa %%xmm1,0x10(%1) \n"
- "lea 0x20(%1),%1 \n"
+ "movdqa %%xmm0," MEMACCESS(1) " \n"
+ "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
@@ -5307,18 +6026,48 @@ void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb,
const uint8* shuffler, int pix) {
asm volatile (
- "movdqa (%3),%%xmm5 \n"
- ".p2align 4 \n"
+ "movdqa " MEMACCESS(3) ",%%xmm5 \n"
+ ".p2align 2 \n"
"1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
"pshufb %%xmm5,%%xmm0 \n"
"pshufb %%xmm5,%%xmm1 \n"
"sub $0x8,%2 \n"
- "movdqu %%xmm0,(%1) \n"
- "movdqu %%xmm1,0x10(%1) \n"
- "lea 0x20(%1),%1 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(pix) // %2
+ : "r"(shuffler) // %3
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm5"
+#endif
+ );
+}
+#endif // HAS_ARGBSHUFFLEROW_SSSE3
+
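The shuffler argument is a byte-index table applied within each 4-byte pixel; a scalar sketch of the same reordering, with an illustrative name:

#include <stdint.h>

// The first four shuffler bytes pick which source byte feeds each output
// byte of a 4-byte pixel (indices are taken modulo 4 here as a hedge).
static void ARGBShuffleRowScalar(const uint8_t* src_argb, uint8_t* dst_argb,
                                 const uint8_t* shuffler, int pix) {
  for (int i = 0; i < pix; ++i) {
    dst_argb[0] = src_argb[shuffler[0] & 3];
    dst_argb[1] = src_argb[shuffler[1] & 3];
    dst_argb[2] = src_argb[shuffler[2] & 3];
    dst_argb[3] = src_argb[shuffler[3] & 3];
    src_argb += 4;
    dst_argb += 4;
  }
}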
+#ifdef HAS_ARGBSHUFFLEROW_AVX2
+// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
+void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
+ const uint8* shuffler, int pix) {
+ asm volatile (
+ "vbroadcastf128 " MEMACCESS(3) ",%%ymm5 \n"
+ ".p2align 2 \n"
+ "1: \n"
+ "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
+ "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
+ "lea " MEMLEA(0x40,0) ",%0 \n"
+ "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
+ "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
+ "sub $0x10,%2 \n"
+ "vmovdqu %%ymm0," MEMACCESS(1) " \n"
+ "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n"
+ "lea " MEMLEA(0x40,1) ",%1 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
@@ -5330,27 +6079,156 @@ void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb,
#endif
);
}
+#endif // HAS_ARGBSHUFFLEROW_AVX2
+#ifdef HAS_ARGBSHUFFLEROW_SSE2
+// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
+void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
+ const uint8* shuffler, int pix) {
+ uintptr_t pixel_temp = 0u;
+ asm volatile (
+ "pxor %%xmm5,%%xmm5 \n"
+ "mov " MEMACCESS(4) ",%k2 \n"
+ "cmp $0x3000102,%k2 \n"
+ "je 3012f \n"
+ "cmp $0x10203,%k2 \n"
+ "je 123f \n"
+ "cmp $0x30201,%k2 \n"
+ "je 321f \n"
+ "cmp $0x2010003,%k2 \n"
+ "je 2103f \n"
+
+ BUNDLEALIGN
+ "1: \n"
+ "movzb " MEMACCESS(4) ",%2 \n"
+ MEMOP(movzb,0x00,0,2,1) ",%2 \n" // movzb (%0,%2,1),%2
+ "mov %b2," MEMACCESS(1) " \n"
+ "movzb " MEMACCESS2(0x1,4) ",%2 \n"
+ MEMOP(movzb,0x00,0,2,1) ",%2 \n" // movzb (%0,%2,1),%2
+ "mov %b2," MEMACCESS2(0x1,1) " \n"
+ BUNDLEALIGN
+ "movzb " MEMACCESS2(0x2,4) ",%2 \n"
+ MEMOP(movzb,0x00,0,2,1) ",%2 \n" // movzb (%0,%2,1),%2
+ "mov %b2," MEMACCESS2(0x2,1) " \n"
+ "movzb " MEMACCESS2(0x3,4) ",%2 \n"
+ MEMOP(movzb,0x00,0,2,1) ",%2 \n" // movzb (%0,%2,1),%2
+ "mov %b2," MEMACCESS2(0x3,1) " \n"
+ "lea " MEMLEA(0x4,0) ",%0 \n"
+ "lea " MEMLEA(0x4,1) ",%1 \n"
+ "sub $0x1,%3 \n"
+ "jg 1b \n"
+ "jmp 99f \n"
+
+ ".p2align 2 \n"
+ "123: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "punpckhbw %%xmm5,%%xmm1 \n"
+ "pshufhw $0x1b,%%xmm0,%%xmm0 \n"
+ "pshuflw $0x1b,%%xmm0,%%xmm0 \n"
+ "pshufhw $0x1b,%%xmm1,%%xmm1 \n"
+ "pshuflw $0x1b,%%xmm1,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "sub $0x4,%3 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "jg 123b \n"
+ "jmp 99f \n"
+
+ ".p2align 2 \n"
+ "321: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "punpckhbw %%xmm5,%%xmm1 \n"
+ "pshufhw $0x39,%%xmm0,%%xmm0 \n"
+ "pshuflw $0x39,%%xmm0,%%xmm0 \n"
+ "pshufhw $0x39,%%xmm1,%%xmm1 \n"
+ "pshuflw $0x39,%%xmm1,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "sub $0x4,%3 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "jg 321b \n"
+ "jmp 99f \n"
+
+ ".p2align 2 \n"
+ "2103: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "punpckhbw %%xmm5,%%xmm1 \n"
+ "pshufhw $0x93,%%xmm0,%%xmm0 \n"
+ "pshuflw $0x93,%%xmm0,%%xmm0 \n"
+ "pshufhw $0x93,%%xmm1,%%xmm1 \n"
+ "pshuflw $0x93,%%xmm1,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "sub $0x4,%3 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "jg 2103b \n"
+ "jmp 99f \n"
+
+ ".p2align 2 \n"
+ "3012: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "punpckhbw %%xmm5,%%xmm1 \n"
+ "pshufhw $0xc6,%%xmm0,%%xmm0 \n"
+ "pshuflw $0xc6,%%xmm0,%%xmm0 \n"
+ "pshufhw $0xc6,%%xmm1,%%xmm1 \n"
+ "pshuflw $0xc6,%%xmm1,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "sub $0x4,%3 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "jg 3012b \n"
+
+ "99: \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+d"(pixel_temp), // %2
+ "+r"(pix) // %3
+ : "r"(shuffler) // %4
+ : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14"
+#endif
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm5"
+#endif
+ );
+}
+#endif // HAS_ARGBSHUFFLEROW_SSE2
+
+#ifdef HAS_I422TOYUY2ROW_SSE2
void I422ToYUY2Row_SSE2(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_frame, int width) {
asm volatile (
"sub %1,%2 \n"
- ".p2align 4 \n"
+ ".p2align 2 \n"
+ BUNDLEALIGN
"1: \n"
- "movq (%1),%%xmm2 \n"
- "movq (%1,%2,1),%%xmm3 \n"
- "lea 0x8(%1),%1 \n"
+ "movq " MEMACCESS(1) ",%%xmm2 \n"
+ MEMOPREG(movq,0x00,1,2,1,xmm3) // movq (%1,%2,1),%%xmm3
+ "lea " MEMLEA(0x8,1) ",%1 \n"
"punpcklbw %%xmm3,%%xmm2 \n"
- "movdqu (%0),%%xmm0 \n"
- "lea 0x10(%0),%0 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
"movdqa %%xmm0,%%xmm1 \n"
"punpcklbw %%xmm2,%%xmm0 \n"
"punpckhbw %%xmm2,%%xmm1 \n"
- "movdqu %%xmm0,(%3) \n"
- "movdqu %%xmm1,0x10(%3) \n"
- "lea 0x20(%3),%3 \n"
+ "movdqu %%xmm0," MEMACCESS(3) " \n"
+ "movdqu %%xmm1," MEMACCESS2(0x10,3) " \n"
+ "lea " MEMLEA(0x20,3) ",%3 \n"
"sub $0x10,%4 \n"
"jg 1b \n"
: "+r"(src_y), // %0
@@ -5360,32 +6238,38 @@ void I422ToYUY2Row_SSE2(const uint8* src_y,
"+rm"(width) // %4
:
: "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14"
+#endif
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3"
#endif
);
}
+#endif // HAS_I422TOYUY2ROW_SSE2
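The YUY2 packer interleaves two Y samples with one U and one V per 4-byte macropixel; a scalar sketch, with an illustrative name:

#include <stdint.h>

// Pack I422 planes into YUY2: each macropixel is Y0 U Y1 V.
static void I422ToYUY2RowScalar(const uint8_t* src_y, const uint8_t* src_u,
                                const uint8_t* src_v, uint8_t* dst_frame,
                                int width) {
  for (int x = 0; x < width - 1; x += 2) {
    dst_frame[0] = src_y[0];
    dst_frame[1] = src_u[0];
    dst_frame[2] = src_y[1];
    dst_frame[3] = src_v[0];
    dst_frame += 4;
    src_y += 2;
    src_u += 1;
    src_v += 1;
  }
}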
+#ifdef HAS_I422TOUYVYROW_SSE2
void I422ToUYVYRow_SSE2(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_frame, int width) {
asm volatile (
"sub %1,%2 \n"
- ".p2align 4 \n"
+ ".p2align 2 \n"
+ BUNDLEALIGN
"1: \n"
- "movq (%1),%%xmm2 \n"
- "movq (%1,%2,1),%%xmm3 \n"
- "lea 0x8(%1),%1 \n"
+ "movq " MEMACCESS(1) ",%%xmm2 \n"
+ MEMOPREG(movq,0x00,1,2,1,xmm3) // movq (%1,%2,1),%%xmm3
+ "lea " MEMLEA(0x8,1) ",%1 \n"
"punpcklbw %%xmm3,%%xmm2 \n"
- "movdqu (%0),%%xmm0 \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
"movdqa %%xmm2,%%xmm1 \n"
- "lea 0x10(%0),%0 \n"
+ "lea " MEMLEA(0x10,0) ",%0 \n"
"punpcklbw %%xmm0,%%xmm1 \n"
"punpckhbw %%xmm0,%%xmm2 \n"
- "movdqu %%xmm1,(%3) \n"
- "movdqu %%xmm2,0x10(%3) \n"
- "lea 0x20(%3),%3 \n"
+ "movdqu %%xmm1," MEMACCESS(3) " \n"
+ "movdqu %%xmm2," MEMACCESS2(0x10,3) " \n"
+ "lea " MEMLEA(0x20,3) ",%3 \n"
"sub $0x10,%4 \n"
"jg 1b \n"
: "+r"(src_y), // %0
@@ -5395,11 +6279,300 @@ void I422ToUYVYRow_SSE2(const uint8* src_y,
"+rm"(width) // %4
:
: "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14"
+#endif
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3"
#endif
);
}
+#endif // HAS_I422TOUYVYROW_SSE2
+
+#ifdef HAS_FIXEDDIV_X86
+// Divide num by div and return as 16.16 fixed point result.
+int FixedDiv_X86(int num, int div) {
+ asm volatile (
+ "cdq \n"
+ "shld $0x10,%%eax,%%edx \n"
+ "shl $0x10,%%eax \n"
+ "idiv %1 \n"
+ "mov %0, %%eax \n"
+ : "+a"(num) // %0
+ : "c"(div) // %1
+ : "memory", "cc", "edx"
+ );
+ return num;
+}
+#endif // HAS_FIXEDDIV_X86
+
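The same 16.16 fixed-point division can be written portably by widening the numerator before the shift; a minimal sketch with an illustrative name, assuming a non-zero divisor:

#include <stdint.h>

// Shift the numerator into 16.16 fixed point, then divide.
static int FixedDivScalar(int num, int div) {
  return (int)(((int64_t)num << 16) / div);
}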
+#ifdef HAS_ARGBPOLYNOMIALROW_SSE2
+void ARGBPolynomialRow_SSE2(const uint8* src_argb,
+ uint8* dst_argb, const float* poly,
+ int width) {
+ asm volatile (
+ "pxor %%xmm3,%%xmm3 \n"
+
+ // 2 pixel loop.
+ ".p2align 2 \n"
+ "1: \n"
+ "movq " MEMACCESS(0) ",%%xmm0 \n"
+ "lea " MEMLEA(0x8,0) ",%0 \n"
+ "punpcklbw %%xmm3,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm4 \n"
+ "punpcklwd %%xmm3,%%xmm0 \n"
+ "punpckhwd %%xmm3,%%xmm4 \n"
+ "cvtdq2ps %%xmm0,%%xmm0 \n"
+ "cvtdq2ps %%xmm4,%%xmm4 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm4,%%xmm5 \n"
+ "mulps " MEMACCESS2(0x10,3) ",%%xmm0 \n"
+ "mulps " MEMACCESS2(0x10,3) ",%%xmm4 \n"
+ "addps " MEMACCESS(3) ",%%xmm0 \n"
+ "addps " MEMACCESS(3) ",%%xmm4 \n"
+ "movdqa %%xmm1,%%xmm2 \n"
+ "movdqa %%xmm5,%%xmm6 \n"
+ "mulps %%xmm1,%%xmm2 \n"
+ "mulps %%xmm5,%%xmm6 \n"
+ "mulps %%xmm2,%%xmm1 \n"
+ "mulps %%xmm6,%%xmm5 \n"
+ "mulps " MEMACCESS2(0x20,3) ",%%xmm2 \n"
+ "mulps " MEMACCESS2(0x20,3) ",%%xmm6 \n"
+ "mulps " MEMACCESS2(0x30,3) ",%%xmm1 \n"
+ "mulps " MEMACCESS2(0x30,3) ",%%xmm5 \n"
+ "addps %%xmm2,%%xmm0 \n"
+ "addps %%xmm6,%%xmm4 \n"
+ "addps %%xmm1,%%xmm0 \n"
+ "addps %%xmm5,%%xmm4 \n"
+ "cvttps2dq %%xmm0,%%xmm0 \n"
+ "cvttps2dq %%xmm4,%%xmm4 \n"
+ "packuswb %%xmm4,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "sub $0x2,%2 \n"
+ "movq %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x8,1) ",%1 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(poly) // %3
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+#endif
+ );
+}
+#endif // HAS_ARGBPOLYNOMIALROW_SSE2
+
+#ifdef HAS_ARGBPOLYNOMIALROW_AVX2
+void ARGBPolynomialRow_AVX2(const uint8* src_argb,
+ uint8* dst_argb, const float* poly,
+ int width) {
+ asm volatile (
+ "vbroadcastf128 " MEMACCESS(3) ",%%ymm4 \n"
+ "vbroadcastf128 " MEMACCESS2(0x10,3) ",%%ymm5 \n"
+ "vbroadcastf128 " MEMACCESS2(0x20,3) ",%%ymm6 \n"
+ "vbroadcastf128 " MEMACCESS2(0x30,3) ",%%ymm7 \n"
+
+ // 2 pixel loop.
+ ".p2align 2 \n"
+ "1: \n"
+ "vpmovzxbd " MEMACCESS(0) ",%%ymm0 \n" // 2 ARGB pixels
+ "lea " MEMLEA(0x8,0) ",%0 \n"
+ "vcvtdq2ps %%ymm0,%%ymm0 \n" // X 8 floats
+ "vmulps %%ymm0,%%ymm0,%%ymm2 \n" // X * X
+ "vmulps %%ymm7,%%ymm0,%%ymm3 \n" // C3 * X
+ "vfmadd132ps %%ymm5,%%ymm4,%%ymm0 \n" // result = C0 + C1 * X
+ "vfmadd231ps %%ymm6,%%ymm2,%%ymm0 \n" // result += C2 * X * X
+ "vfmadd231ps %%ymm3,%%ymm2,%%ymm0 \n" // result += C3 * X * X * X
+ "vcvttps2dq %%ymm0,%%ymm0 \n"
+ "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n"
+ "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+ "vpackuswb %%xmm0,%%xmm0,%%xmm0 \n"
+ "sub $0x2,%2 \n"
+ "vmovq %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x8,1) ",%1 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(poly) // %3
+ : "memory", "cc"
+#if defined(__SSE2__)
+// TODO(fbarchard): declare ymm usage when applicable.
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+#endif
+ );
+}
+#endif // HAS_ARGBPOLYNOMIALROW_AVX2
+
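Both polynomial paths evaluate a per-channel cubic, result = C0 + C1*x + C2*x^2 + C3*x^3, with the four float4 coefficient vectors stored back to back at poly; a scalar sketch with an illustrative name, clamping as the truncate-and-pack instructions do:

#include <stdint.h>

// poly[c], poly[c+4], poly[c+8], poly[c+12] are C0..C3 for channel c.
static void ARGBPolynomialRowScalar(const uint8_t* src_argb, uint8_t* dst_argb,
                                    const float* poly, int width) {
  for (int i = 0; i < width; ++i) {
    for (int c = 0; c < 4; ++c) {
      float x = (float)src_argb[c];
      float v = poly[c] + poly[c + 4] * x + poly[c + 8] * x * x +
                poly[c + 12] * x * x * x;
      if (v < 0.f) v = 0.f;
      if (v > 255.f) v = 255.f;
      dst_argb[c] = (uint8_t)v;
    }
    src_argb += 4;
    dst_argb += 4;
  }
}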
+#ifdef HAS_ARGBCOLORTABLEROW_X86
+// Transform ARGB pixels with color table.
+void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
+ int width) {
+ uintptr_t pixel_temp = 0u;
+ asm volatile (
+ // 1 pixel loop.
+ ".p2align 2 \n"
+ BUNDLEALIGN
+ "1: \n"
+ "movzb " MEMACCESS(0) ",%1 \n"
+ "lea " MEMLEA(0x4,0) ",%0 \n"
+ MEMOP(movzb,0x00,3,1,4) ",%1 \n" // movzb (%3,%1,4),%1
+ "mov %b1," MEMACCESS2(-0x4,0) " \n"
+ "movzb " MEMACCESS2(-0x3,0) ",%1 \n"
+ MEMOP(movzb,0x01,3,1,4) ",%1 \n" // movzb 0x1(%3,%1,4),%1
+ "mov %b1," MEMACCESS2(-0x3,0) " \n"
+ "movzb " MEMACCESS2(-0x2,0) ",%1 \n"
+ MEMOP(movzb,0x02,3,1,4) ",%1 \n" // movzb 0x2(%3,%1,4),%1
+ "mov %b1," MEMACCESS2(-0x2,0) " \n"
+ "movzb " MEMACCESS2(-0x1,0) ",%1 \n"
+ MEMOP(movzb,0x03,3,1,4) ",%1 \n" // movzb 0x3(%3,%1,4),%1
+ "mov %b1," MEMACCESS2(-0x1,0) " \n"
+ "dec %2 \n"
+ "jg 1b \n"
+ : "+r"(dst_argb), // %0
+ "+d"(pixel_temp), // %1
+ "+r"(width) // %2
+ : "r"(table_argb) // %3
+ : "memory", "cc");
+}
+#endif // HAS_ARGBCOLORTABLEROW_X86
+
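The color-table rows replace each channel in place via a 256-entry, 4-byte-wide lookup table; a scalar sketch of the ARGB variant, with an illustrative name:

#include <stdint.h>

// Each channel indexes its own column of the 4-byte-wide table.
static void ARGBColorTableRowScalar(uint8_t* dst_argb,
                                    const uint8_t* table_argb, int width) {
  for (int i = 0; i < width; ++i) {
    dst_argb[0] = table_argb[dst_argb[0] * 4 + 0];
    dst_argb[1] = table_argb[dst_argb[1] * 4 + 1];
    dst_argb[2] = table_argb[dst_argb[2] * 4 + 2];
    dst_argb[3] = table_argb[dst_argb[3] * 4 + 3];
    dst_argb += 4;
  }
}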
+#ifdef HAS_RGBCOLORTABLEROW_X86
+// Transform RGB pixels with color table.
+void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
+ uintptr_t pixel_temp = 0u;
+ asm volatile (
+ // 1 pixel loop.
+ ".p2align 2 \n"
+ BUNDLEALIGN
+ "1: \n"
+ "movzb " MEMACCESS(0) ",%1 \n"
+ "lea " MEMLEA(0x4,0) ",%0 \n"
+ MEMOP(movzb,0x00,3,1,4) ",%1 \n" // movzb (%3,%1,4),%1
+ "mov %b1," MEMACCESS2(-0x4,0) " \n"
+ "movzb " MEMACCESS2(-0x3,0) ",%1 \n"
+ MEMOP(movzb,0x01,3,1,4) ",%1 \n" // movzb 0x1(%3,%1,4),%1
+ "mov %b1," MEMACCESS2(-0x3,0) " \n"
+ "movzb " MEMACCESS2(-0x2,0) ",%1 \n"
+ MEMOP(movzb,0x02,3,1,4) ",%1 \n" // movzb 0x2(%3,%1,4),%1
+ "mov %b1," MEMACCESS2(-0x2,0) " \n"
+ "dec %2 \n"
+ "jg 1b \n"
+ : "+r"(dst_argb), // %0
+ "+d"(pixel_temp), // %1
+ "+r"(width) // %2
+ : "r"(table_argb) // %3
+ : "memory", "cc");
+}
+#endif // HAS_RGBCOLORTABLEROW_X86
+
+#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
+// Transform RGB pixels with luma table.
+void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
+ int width,
+ const uint8* luma, uint32 lumacoeff) {
+ uintptr_t pixel_temp = 0u;
+ uintptr_t table_temp = 0u;
+ asm volatile (
+ "movd %6,%%xmm3 \n"
+ "pshufd $0x0,%%xmm3,%%xmm3 \n"
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psllw $0x8,%%xmm4 \n"
+ "pxor %%xmm5,%%xmm5 \n"
+
+ // 4 pixel loop.
+ ".p2align 2 \n"
+ BUNDLEALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(2) ",%%xmm0 \n"
+ "pmaddubsw %%xmm3,%%xmm0 \n"
+ "phaddw %%xmm0,%%xmm0 \n"
+ "pand %%xmm4,%%xmm0 \n"
+ "punpcklwd %%xmm5,%%xmm0 \n"
+ "movd %%xmm0,%k1 \n" // 32 bit offset
+ "add %5,%1 \n"
+ "pshufd $0x39,%%xmm0,%%xmm0 \n"
+
+ "movzb " MEMACCESS(2) ",%0 \n"
+ MEMOP(movzb,0x00,1,0,1) ",%0 \n" // movzb (%1,%0,1),%0
+ "mov %b0," MEMACCESS(3) " \n"
+ "movzb " MEMACCESS2(0x1,2) ",%0 \n"
+ MEMOP(movzb,0x00,1,0,1) ",%0 \n" // movzb (%1,%0,1),%0
+ "mov %b0," MEMACCESS2(0x1,3) " \n"
+ "movzb " MEMACCESS2(0x2,2) ",%0 \n"
+ MEMOP(movzb,0x00,1,0,1) ",%0 \n" // movzb (%1,%0,1),%0
+ "mov %b0," MEMACCESS2(0x2,3) " \n"
+ "movzb " MEMACCESS2(0x3,2) ",%0 \n"
+ "mov %b0," MEMACCESS2(0x3,3) " \n"
+
+ "movd %%xmm0,%k1 \n" // 32 bit offset
+ "add %5,%1 \n"
+ "pshufd $0x39,%%xmm0,%%xmm0 \n"
+
+ "movzb " MEMACCESS2(0x4,2) ",%0 \n"
+ MEMOP(movzb,0x00,1,0,1) ",%0 \n" // movzb (%1,%0,1),%0
+ "mov %b0," MEMACCESS2(0x4,3) " \n"
+ BUNDLEALIGN
+ "movzb " MEMACCESS2(0x5,2) ",%0 \n"
+ MEMOP(movzb,0x00,1,0,1) ",%0 \n" // movzb (%1,%0,1),%0
+ "mov %b0," MEMACCESS2(0x5,3) " \n"
+ "movzb " MEMACCESS2(0x6,2) ",%0 \n"
+ MEMOP(movzb,0x00,1,0,1) ",%0 \n" // movzb (%1,%0,1),%0
+ "mov %b0," MEMACCESS2(0x6,3) " \n"
+ "movzb " MEMACCESS2(0x7,2) ",%0 \n"
+ "mov %b0," MEMACCESS2(0x7,3) " \n"
+
+ "movd %%xmm0,%k1 \n" // 32 bit offset
+ "add %5,%1 \n"
+ "pshufd $0x39,%%xmm0,%%xmm0 \n"
+
+ "movzb " MEMACCESS2(0x8,2) ",%0 \n"
+ MEMOP(movzb,0x00,1,0,1) ",%0 \n" // movzb (%1,%0,1),%0
+ "mov %b0," MEMACCESS2(0x8,3) " \n"
+ "movzb " MEMACCESS2(0x9,2) ",%0 \n"
+ MEMOP(movzb,0x00,1,0,1) ",%0 \n" // movzb (%1,%0,1),%0
+ "mov %b0," MEMACCESS2(0x9,3) " \n"
+ "movzb " MEMACCESS2(0xa,2) ",%0 \n"
+ MEMOP(movzb,0x00,1,0,1) ",%0 \n" // movzb (%1,%0,1),%0
+ "mov %b0," MEMACCESS2(0xa,3) " \n"
+ "movzb " MEMACCESS2(0xb,2) ",%0 \n"
+ "mov %b0," MEMACCESS2(0xb,3) " \n"
+
+ "movd %%xmm0,%k1 \n" // 32 bit offset
+ "add %5,%1 \n"
+
+ "movzb " MEMACCESS2(0xc,2) ",%0 \n"
+ MEMOP(movzb,0x00,1,0,1) ",%0 \n" // movzb (%1,%0,1),%0
+ "mov %b0," MEMACCESS2(0xc,3) " \n"
+ "movzb " MEMACCESS2(0xd,2) ",%0 \n"
+ MEMOP(movzb,0x00,1,0,1) ",%0 \n" // movzb (%1,%0,1),%0
+ "mov %b0," MEMACCESS2(0xd,3) " \n"
+ "movzb " MEMACCESS2(0xe,2) ",%0 \n"
+ MEMOP(movzb,0x00,1,0,1) ",%0 \n" // movzb (%1,%0,1),%0
+ "mov %b0," MEMACCESS2(0xe,3) " \n"
+ "movzb " MEMACCESS2(0xf,2) ",%0 \n"
+ "mov %b0," MEMACCESS2(0xf,3) " \n"
+ "sub $0x4,%4 \n"
+ "lea " MEMLEA(0x10,2) ",%2 \n"
+ "lea " MEMLEA(0x10,3) ",%3 \n"
+ "jg 1b \n"
+ : "+d"(pixel_temp), // %0
+ "+a"(table_temp), // %1
+ "+r"(src_argb), // %2
+ "+r"(dst_argb), // %3
+ "+rm"(width) // %4
+ : "r"(luma), // %5
+ "rm"(lumacoeff) // %6
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+#endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3
#endif // defined(__x86_64__) || defined(__i386__)
diff --git a/chromium/third_party/libyuv/source/row_win.cc b/chromium/third_party/libyuv/source/row_win.cc
index 4ea06923def..502d25cea4f 100644
--- a/chromium/third_party/libyuv/source/row_win.cc
+++ b/chromium/third_party/libyuv/source/row_win.cc
@@ -30,16 +30,6 @@ static const vec8 kARGBToYJ = {
15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0
};
-static const lvec8 kARGBToY_AVX = {
- 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0,
- 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
-};
-
-static const lvec8 kARGBToYJ_AVX = {
- 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0,
- 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0
-};
-
static const vec8 kARGBToU = {
112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
};
@@ -48,12 +38,6 @@ static const vec8 kARGBToUJ = {
127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0
};
-// TODO(fbarchard): Rename kARGBToU_AVX to kARGBToU and use for SSSE3 version.
-static const lvec8 kARGBToU_AVX = {
- 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0,
- 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
-};
-
static const vec8 kARGBToV = {
-18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
};
@@ -62,13 +46,8 @@ static const vec8 kARGBToVJ = {
-20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0
};
-static const lvec8 kARGBToV_AVX = {
- -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
- -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0
-};
-
// vpermd for vphaddw + vpackuswb vpermd.
-static const lvec32 kShufARGBToY_AVX = {
+static const lvec32 kPermdARGBToY_AVX = {
0, 4, 1, 5, 2, 6, 3, 7
};
@@ -124,16 +103,6 @@ static const uvec8 kAddY16 = {
static const vec16 kAddYJ64 = {
64, 64, 64, 64, 64, 64, 64, 64
};
-static const lvec16 kAddYJ64_AVX = {
- 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
-};
-
-static const ulvec8 kAddY16_AVX = {
- 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
- 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
- 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
- 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
-};
static const uvec8 kAddUV128 = {
128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
@@ -144,13 +113,6 @@ static const uvec16 kAddUVJ128 = {
0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u
};
-static const ulvec8 kAddUV128_AVX = {
- 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
- 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
- 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
- 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
-};
-
// Shuffle table for converting RGB24 to ARGB.
static const uvec8 kShuffleMaskRGB24ToARGB = {
0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
@@ -191,7 +153,7 @@ void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
pcmpeqb xmm5, xmm5 // generate mask 0xff000000
pslld xmm5, 24
- align 16
+ align 4
convertloop:
movq xmm0, qword ptr [eax]
lea eax, [eax + 8]
@@ -220,7 +182,7 @@ void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb,
pcmpeqb xmm5, xmm5 // generate mask 0xff000000
pslld xmm5, 24
- align 16
+ align 4
convertloop:
movq xmm0, qword ptr [eax]
lea eax, [eax + 8]
@@ -249,7 +211,7 @@ void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
pslld xmm5, 24
movdqa xmm4, kShuffleMaskRGB24ToARGB
- align 16
+ align 4
convertloop:
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
@@ -289,7 +251,7 @@ void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
pslld xmm5, 24
movdqa xmm4, kShuffleMaskRAWToARGB
- align 16
+ align 4
convertloop:
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
@@ -349,7 +311,7 @@ void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb,
sub edx, eax
sub edx, eax
- align 16
+ align 4
convertloop:
movdqu xmm0, [eax] // fetch 8 pixels of bgr565
movdqa xmm1, xmm0
@@ -399,7 +361,7 @@ void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
sub edx, eax
sub edx, eax
- align 16
+ align 4
convertloop:
movdqu xmm0, [eax] // fetch 8 pixels of 1555
movdqa xmm1, xmm0
@@ -445,7 +407,7 @@ void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb,
sub edx, eax
sub edx, eax
- align 16
+ align 4
convertloop:
movdqu xmm0, [eax] // fetch 8 pixels of bgra4444
movdqa xmm2, xmm0
@@ -477,12 +439,12 @@ void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
mov ecx, [esp + 12] // pix
movdqa xmm6, kShuffleMaskARGBToRGB24
- align 16
+ align 4
convertloop:
- movdqa xmm0, [eax] // fetch 16 pixels of argb
- movdqa xmm1, [eax + 16]
- movdqa xmm2, [eax + 32]
- movdqa xmm3, [eax + 48]
+ movdqu xmm0, [eax] // fetch 16 pixels of argb
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
lea eax, [eax + 64]
pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB
pshufb xmm1, xmm6
@@ -494,13 +456,13 @@ void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
movdqa xmm5, xmm2 // 8 bytes from 2 for 1
por xmm0, xmm4 // 4 bytes from 1 for 0
pslldq xmm5, 8 // 8 bytes from 2 for 1
- movdqa [edx], xmm0 // store 0
+ movdqu [edx], xmm0 // store 0
por xmm1, xmm5 // 8 bytes from 2 for 1
psrldq xmm2, 8 // 4 bytes from 2
pslldq xmm3, 4 // 12 bytes from 3 for 2
por xmm2, xmm3 // 12 bytes from 3 for 2
- movdqa [edx + 16], xmm1 // store 1
- movdqa [edx + 32], xmm2 // store 2
+ movdqu [edx + 16], xmm1 // store 1
+ movdqu [edx + 32], xmm2 // store 2
lea edx, [edx + 48]
sub ecx, 16
jg convertloop
@@ -516,12 +478,12 @@ void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
mov ecx, [esp + 12] // pix
movdqa xmm6, kShuffleMaskARGBToRAW
- align 16
+ align 4
convertloop:
- movdqa xmm0, [eax] // fetch 16 pixels of argb
- movdqa xmm1, [eax + 16]
- movdqa xmm2, [eax + 32]
- movdqa xmm3, [eax + 48]
+ movdqu xmm0, [eax] // fetch 16 pixels of argb
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + 32]
+ movdqu xmm3, [eax + 48]
lea eax, [eax + 64]
pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB
pshufb xmm1, xmm6
@@ -533,13 +495,13 @@ void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
movdqa xmm5, xmm2 // 8 bytes from 2 for 1
por xmm0, xmm4 // 4 bytes from 1 for 0
pslldq xmm5, 8 // 8 bytes from 2 for 1
- movdqa [edx], xmm0 // store 0
+ movdqu [edx], xmm0 // store 0
por xmm1, xmm5 // 8 bytes from 2 for 1
psrldq xmm2, 8 // 4 bytes from 2
pslldq xmm3, 4 // 12 bytes from 3 for 2
por xmm2, xmm3 // 12 bytes from 3 for 2
- movdqa [edx + 16], xmm1 // store 1
- movdqa [edx + 32], xmm2 // store 2
+ movdqu [edx + 16], xmm1 // store 1
+ movdqu [edx + 32], xmm2 // store 2
lea edx, [edx + 48]
sub ecx, 16
jg convertloop
@@ -561,7 +523,7 @@ void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
pcmpeqb xmm5, xmm5 // generate mask 0xfffff800
pslld xmm5, 11
- align 16
+ align 4
convertloop:
movdqa xmm0, [eax] // fetch 4 pixels of argb
movdqa xmm1, xmm0 // B
@@ -601,7 +563,7 @@ void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
pcmpeqb xmm7, xmm7 // generate mask 0xffff8000
pslld xmm7, 15
- align 16
+ align 4
convertloop:
movdqa xmm0, [eax] // fetch 4 pixels of argb
movdqa xmm1, xmm0 // B
@@ -639,7 +601,7 @@ void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
movdqa xmm3, xmm4 // generate mask 0x00f000f0
psrlw xmm3, 8
- align 16
+ align 4
convertloop:
movdqa xmm0, [eax] // fetch 4 pixels of argb
movdqa xmm1, xmm0
@@ -668,7 +630,7 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
movdqa xmm5, kAddY16
movdqa xmm4, kARGBToY
- align 16
+ align 4
convertloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
@@ -703,7 +665,7 @@ void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
movdqa xmm4, kARGBToYJ
movdqa xmm5, kAddYJ64
- align 16
+ align 4
convertloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
@@ -737,11 +699,11 @@ void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
mov eax, [esp + 4] /* src_argb */
mov edx, [esp + 8] /* dst_y */
mov ecx, [esp + 12] /* pix */
- vmovdqa ymm6, kShufARGBToY_AVX
- vmovdqa ymm5, kAddY16_AVX
- vmovdqa ymm4, kARGBToY_AVX
+ vbroadcastf128 ymm4, kARGBToY
+ vbroadcastf128 ymm5, kAddY16
+ vmovdqa ymm6, kPermdARGBToY_AVX
- align 16
+ align 4
convertloop:
vmovdqu ymm0, [eax]
vmovdqu ymm1, [eax + 32]
@@ -777,11 +739,11 @@ void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
mov eax, [esp + 4] /* src_argb */
mov edx, [esp + 8] /* dst_y */
mov ecx, [esp + 12] /* pix */
- vmovdqa ymm4, kARGBToYJ_AVX
- vmovdqa ymm5, kAddYJ64_AVX
- vmovdqa ymm6, kShufARGBToY_AVX
+ vbroadcastf128 ymm4, kARGBToYJ
+ vbroadcastf128 ymm5, kAddYJ64
+ vmovdqa ymm6, kPermdARGBToY_AVX
- align 16
+ align 4
convertloop:
vmovdqu ymm0, [eax]
vmovdqu ymm1, [eax + 32]
@@ -820,7 +782,7 @@ void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
movdqa xmm5, kAddY16
movdqa xmm4, kARGBToY
- align 16
+ align 4
convertloop:
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
@@ -854,7 +816,7 @@ void ARGBToYJRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
movdqa xmm4, kARGBToYJ
movdqa xmm5, kAddYJ64
- align 16
+ align 4
convertloop:
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
@@ -889,7 +851,7 @@ void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
movdqa xmm5, kAddY16
movdqa xmm4, kBGRAToY
- align 16
+ align 4
convertloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
@@ -923,7 +885,7 @@ void BGRAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
movdqa xmm5, kAddY16
movdqa xmm4, kBGRAToY
- align 16
+ align 4
convertloop:
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
@@ -957,7 +919,7 @@ void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
movdqa xmm5, kAddY16
movdqa xmm4, kABGRToY
- align 16
+ align 4
convertloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
@@ -991,7 +953,7 @@ void ABGRToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
movdqa xmm5, kAddY16
movdqa xmm4, kABGRToY
- align 16
+ align 4
convertloop:
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
@@ -1025,7 +987,7 @@ void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
movdqa xmm5, kAddY16
movdqa xmm4, kRGBAToY
- align 16
+ align 4
convertloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
@@ -1059,7 +1021,7 @@ void RGBAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
movdqa xmm5, kAddY16
movdqa xmm4, kRGBAToY
- align 16
+ align 4
convertloop:
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
@@ -1100,7 +1062,7 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
movdqa xmm5, kAddUV128
sub edi, edx // stride from u to v
- align 16
+ align 4
convertloop:
/* step 1 - subsample 16x2 argb pixels to 8x1 */
movdqa xmm0, [eax]
@@ -1166,7 +1128,7 @@ void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
movdqa xmm5, kAddUVJ128
sub edi, edx // stride from u to v
- align 16
+ align 4
convertloop:
/* step 1 - subsample 16x2 argb pixels to 8x1 */
movdqa xmm0, [eax]
@@ -1229,12 +1191,12 @@ void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // pix
- vmovdqa ymm7, kARGBToU_AVX
- vmovdqa ymm6, kARGBToV_AVX
- vmovdqa ymm5, kAddUV128_AVX
+ vbroadcastf128 ymm5, kAddUV128
+ vbroadcastf128 ymm6, kARGBToV
+ vbroadcastf128 ymm7, kARGBToU
sub edi, edx // stride from u to v
- align 16
+ align 4
convertloop:
/* step 1 - subsample 32x2 argb pixels to 16x1 */
vmovdqu ymm0, [eax]
@@ -1300,7 +1262,7 @@ void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
movdqa xmm5, kAddUV128
sub edi, edx // stride from u to v
- align 16
+ align 4
convertloop:
/* step 1 - subsample 16x2 argb pixels to 8x1 */
movdqu xmm0, [eax]
@@ -1370,7 +1332,7 @@ void ARGBToUVJRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
movdqa xmm5, kAddUVJ128
sub edi, edx // stride from u to v
- align 16
+ align 4
convertloop:
/* step 1 - subsample 16x2 argb pixels to 8x1 */
movdqu xmm0, [eax]
@@ -1439,7 +1401,7 @@ void ARGBToUV444Row_SSSE3(const uint8* src_argb0,
movdqa xmm5, kAddUV128
sub edi, edx // stride from u to v
- align 16
+ align 4
convertloop:
/* convert to U and V */
movdqa xmm0, [eax] // U
@@ -1497,7 +1459,7 @@ void ARGBToUV444Row_Unaligned_SSSE3(const uint8* src_argb0,
movdqa xmm5, kAddUV128
sub edi, edx // stride from u to v
- align 16
+ align 4
convertloop:
/* convert to U and V */
movdqu xmm0, [eax] // U
@@ -1555,7 +1517,7 @@ void ARGBToUV422Row_SSSE3(const uint8* src_argb0,
movdqa xmm5, kAddUV128
sub edi, edx // stride from u to v
- align 16
+ align 4
convertloop:
/* step 1 - subsample 16x2 argb pixels to 8x1 */
movdqa xmm0, [eax]
@@ -1614,7 +1576,7 @@ void ARGBToUV422Row_Unaligned_SSSE3(const uint8* src_argb0,
movdqa xmm5, kAddUV128
sub edi, edx // stride from u to v
- align 16
+ align 4
convertloop:
/* step 1 - subsample 16x2 argb pixels to 8x1 */
movdqu xmm0, [eax]
@@ -1675,7 +1637,7 @@ void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
movdqa xmm5, kAddUV128
sub edi, edx // stride from u to v
- align 16
+ align 4
convertloop:
/* step 1 - subsample 16x2 argb pixels to 8x1 */
movdqa xmm0, [eax]
@@ -1741,7 +1703,7 @@ void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
movdqa xmm5, kAddUV128
sub edi, edx // stride from u to v
- align 16
+ align 4
convertloop:
/* step 1 - subsample 16x2 argb pixels to 8x1 */
movdqu xmm0, [eax]
@@ -1811,7 +1773,7 @@ void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
movdqa xmm5, kAddUV128
sub edi, edx // stride from u to v
- align 16
+ align 4
convertloop:
/* step 1 - subsample 16x2 argb pixels to 8x1 */
movdqa xmm0, [eax]
@@ -1877,7 +1839,7 @@ void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
movdqa xmm5, kAddUV128
sub edi, edx // stride from u to v
- align 16
+ align 4
convertloop:
/* step 1 - subsample 16x2 argb pixels to 8x1 */
movdqu xmm0, [eax]
@@ -1947,7 +1909,7 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
movdqa xmm5, kAddUV128
sub edi, edx // stride from u to v
- align 16
+ align 4
convertloop:
/* step 1 - subsample 16x2 argb pixels to 8x1 */
movdqa xmm0, [eax]
@@ -2013,7 +1975,7 @@ void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
movdqa xmm5, kAddUV128
sub edi, edx // stride from u to v
- align 16
+ align 4
convertloop:
/* step 1 - subsample 16x2 argb pixels to 8x1 */
movdqu xmm0, [eax]
@@ -2133,7 +2095,7 @@ void I422ToARGBRow_AVX2(const uint8* y_buf,
vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
vpxor ymm4, ymm4, ymm4
- align 16
+ align 4
convertloop:
vmovq xmm0, qword ptr [esi] // U
vmovq xmm1, qword ptr [esi + edi] // V
@@ -2220,7 +2182,7 @@ static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR };
// TODO(fbarchard): Read that does half size on Y and treats 420 as 444.
-// Read 8 UV from 411.
+// Read 8 UV from 444.
#define READYUV444 __asm { \
__asm movq xmm0, qword ptr [esi] /* U */ /* NOLINT */ \
__asm movq xmm1, qword ptr [esi + edi] /* V */ /* NOLINT */ \
@@ -2239,8 +2201,10 @@ static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR };
// Read 2 UV from 411, upsample to 8 UV.
#define READYUV411 __asm { \
- __asm movd xmm0, [esi] /* U */ \
- __asm movd xmm1, [esi + edi] /* V */ \
+ __asm movzx ebx, word ptr [esi] /* U */ /* NOLINT */ \
+ __asm movd xmm0, ebx \
+ __asm movzx ebx, word ptr [esi + edi] /* V */ /* NOLINT */ \
+ __asm movd xmm1, ebx \
__asm lea esi, [esi + 2] \
__asm punpcklbw xmm0, xmm1 /* UV */ \
__asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
@@ -2330,7 +2294,7 @@ void I444ToARGBRow_SSSE3(const uint8* y_buf,
pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
pxor xmm4, xmm4
- align 16
+ align 4
convertloop:
READYUV444
YUVTORGB
@@ -2374,7 +2338,7 @@ void I422ToRGB24Row_SSSE3(const uint8* y_buf,
movdqa xmm5, kShuffleMaskARGBToRGB24_0
movdqa xmm6, kShuffleMaskARGBToRGB24
- align 16
+ align 4
convertloop:
READYUV422
YUVTORGB
@@ -2421,7 +2385,7 @@ void I422ToRAWRow_SSSE3(const uint8* y_buf,
movdqa xmm5, kShuffleMaskARGBToRAW_0
movdqa xmm6, kShuffleMaskARGBToRAW
- align 16
+ align 4
convertloop:
READYUV422
YUVTORGB
@@ -2473,7 +2437,7 @@ void I422ToRGB565Row_SSSE3(const uint8* y_buf,
pcmpeqb xmm7, xmm7 // generate mask 0xfffff800
pslld xmm7, 11
- align 16
+ align 4
convertloop:
READYUV422
YUVTORGB
@@ -2540,7 +2504,7 @@ void I422ToARGBRow_SSSE3(const uint8* y_buf,
pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
pxor xmm4, xmm4
- align 16
+ align 4
convertloop:
READYUV422
YUVTORGB
@@ -2573,20 +2537,21 @@ void I411ToARGBRow_SSSE3(const uint8* y_buf,
uint8* dst_argb,
int width) {
__asm {
+ push ebx
push esi
push edi
- mov eax, [esp + 8 + 4] // Y
- mov esi, [esp + 8 + 8] // U
- mov edi, [esp + 8 + 12] // V
- mov edx, [esp + 8 + 16] // argb
- mov ecx, [esp + 8 + 20] // width
+ mov eax, [esp + 12 + 4] // Y
+ mov esi, [esp + 12 + 8] // U
+ mov edi, [esp + 12 + 12] // V
+ mov edx, [esp + 12 + 16] // argb
+ mov ecx, [esp + 12 + 20] // width
sub edi, esi
pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
pxor xmm4, xmm4
- align 16
+ align 4
convertloop:
- READYUV411
+ READYUV411 // modifies EBX
YUVTORGB
// Step 3: Weave into ARGB
@@ -2603,6 +2568,7 @@ void I411ToARGBRow_SSSE3(const uint8* y_buf,
pop edi
pop esi
+ pop ebx
ret
}
}
@@ -2623,7 +2589,7 @@ void NV12ToARGBRow_SSSE3(const uint8* y_buf,
pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
pxor xmm4, xmm4
- align 16
+ align 4
convertloop:
READNV12
YUVTORGB
@@ -2661,7 +2627,7 @@ void NV21ToARGBRow_SSSE3(const uint8* y_buf,
pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
pxor xmm4, xmm4
- align 16
+ align 4
convertloop:
READNV12
YVUTORGB
@@ -2703,7 +2669,7 @@ void I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
pxor xmm4, xmm4
- align 16
+ align 4
convertloop:
READYUV444
YUVTORGB
@@ -2746,7 +2712,7 @@ void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
pxor xmm4, xmm4
- align 16
+ align 4
convertloop:
READYUV422
YUVTORGB
@@ -2779,20 +2745,21 @@ void I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
uint8* dst_argb,
int width) {
__asm {
+ push ebx
push esi
push edi
- mov eax, [esp + 8 + 4] // Y
- mov esi, [esp + 8 + 8] // U
- mov edi, [esp + 8 + 12] // V
- mov edx, [esp + 8 + 16] // argb
- mov ecx, [esp + 8 + 20] // width
+ mov eax, [esp + 12 + 4] // Y
+ mov esi, [esp + 12 + 8] // U
+ mov edi, [esp + 12 + 12] // V
+ mov edx, [esp + 12 + 16] // argb
+ mov ecx, [esp + 12 + 20] // width
sub edi, esi
pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
pxor xmm4, xmm4
- align 16
+ align 4
convertloop:
- READYUV411
+ READYUV411 // modifies EBX
YUVTORGB
// Step 3: Weave into ARGB
@@ -2809,6 +2776,7 @@ void I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
pop edi
pop esi
+ pop ebx
ret
}
}
@@ -2829,7 +2797,7 @@ void NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
pxor xmm4, xmm4
- align 16
+ align 4
convertloop:
READNV12
YUVTORGB
@@ -2867,7 +2835,7 @@ void NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
pxor xmm4, xmm4
- align 16
+ align 4
convertloop:
READNV12
YVUTORGB
@@ -2906,7 +2874,7 @@ void I422ToBGRARow_SSSE3(const uint8* y_buf,
sub edi, esi
pxor xmm4, xmm4
- align 16
+ align 4
convertloop:
READYUV422
YUVTORGB
@@ -2947,7 +2915,7 @@ void I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
sub edi, esi
pxor xmm4, xmm4
- align 16
+ align 4
convertloop:
READYUV422
YUVTORGB
@@ -2989,7 +2957,7 @@ void I422ToABGRRow_SSSE3(const uint8* y_buf,
pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
pxor xmm4, xmm4
- align 16
+ align 4
convertloop:
READYUV422
YUVTORGB
@@ -3030,7 +2998,7 @@ void I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
pxor xmm4, xmm4
- align 16
+ align 4
convertloop:
READYUV422
YUVTORGB
@@ -3070,7 +3038,7 @@ void I422ToRGBARow_SSSE3(const uint8* y_buf,
sub edi, esi
pxor xmm4, xmm4
- align 16
+ align 4
convertloop:
READYUV422
YUVTORGB
@@ -3111,7 +3079,7 @@ void I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf,
sub edi, esi
pxor xmm4, xmm4
- align 16
+ align 4
convertloop:
READYUV422
YUVTORGB
@@ -3156,7 +3124,7 @@ void YToARGBRow_SSE2(const uint8* y_buf,
mov edx, [esp + 8] // rgb
mov ecx, [esp + 12] // width
- align 16
+ align 4
convertloop:
// Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
movq xmm0, qword ptr [eax]
@@ -3200,7 +3168,7 @@ void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
movdqa xmm5, kShuffleMirror
lea eax, [eax - 16]
- align 16
+ align 4
convertloop:
movdqa xmm0, [eax + ecx]
pshufb xmm0, xmm5
@@ -3229,7 +3197,7 @@ void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
vmovdqa ymm5, kShuffleMirror_AVX2
lea eax, [eax - 32]
- align 16
+ align 4
convertloop:
vmovdqu ymm0, [eax + ecx]
vpshufb ymm0, ymm0, ymm5
@@ -3255,7 +3223,7 @@ void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
mov ecx, [esp + 12] // width
lea eax, [eax - 16]
- align 16
+ align 4
convertloop:
movdqu xmm0, [eax + ecx]
movdqa xmm1, xmm0 // swap bytes
@@ -3293,7 +3261,7 @@ void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
lea eax, [eax + ecx * 2 - 16]
sub edi, edx
- align 16
+ align 4
convertloop:
movdqa xmm0, [eax]
lea eax, [eax - 16]
@@ -3322,12 +3290,13 @@ void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
mov eax, [esp + 4] // src
mov edx, [esp + 8] // dst
mov ecx, [esp + 12] // width
+ lea eax, [eax - 16 + ecx * 4] // last 4 pixels.
movdqa xmm5, kARGBShuffleMirror
- lea eax, [eax - 16]
- align 16
+ align 4
convertloop:
- movdqa xmm0, [eax + ecx * 4]
+ movdqa xmm0, [eax]
+ lea eax, [eax - 16]
pshufb xmm0, xmm5
sub ecx, 4
movdqa [edx], xmm0
@@ -3353,7 +3322,7 @@ void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
lea eax, [eax - 32]
vmovdqa ymm5, kARGBShuffleMirror_AVX2
- align 16
+ align 4
convertloop:
vpermd ymm0, ymm5, [eax + ecx * 4] // permute dword order
sub ecx, 8
@@ -3379,7 +3348,7 @@ void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
psrlw xmm5, 8
sub edi, edx
- align 16
+ align 4
convertloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
@@ -3416,7 +3385,7 @@ void SplitUVRow_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
psrlw xmm5, 8
sub edi, edx
- align 16
+ align 4
convertloop:
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
@@ -3454,7 +3423,7 @@ void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
vpsrlw ymm5, ymm5, 8
sub edi, edx
- align 16
+ align 4
convertloop:
vmovdqu ymm0, [eax]
vmovdqu ymm1, [eax + 32]
@@ -3492,7 +3461,7 @@ void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
mov ecx, [esp + 4 + 16] // width
sub edx, eax
- align 16
+ align 4
convertloop:
movdqa xmm0, [eax] // read 16 U's
movdqa xmm1, [eax + edx] // and 16 V's
@@ -3522,7 +3491,7 @@ void MergeUVRow_Unaligned_SSE2(const uint8* src_u, const uint8* src_v,
mov ecx, [esp + 4 + 16] // width
sub edx, eax
- align 16
+ align 4
convertloop:
movdqu xmm0, [eax] // read 16 U's
movdqu xmm1, [eax + edx] // and 16 V's
@@ -3554,7 +3523,7 @@ void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
mov ecx, [esp + 4 + 16] // width
sub edx, eax
- align 16
+ align 4
convertloop:
vmovdqu ymm0, [eax] // read 32 U's
vmovdqu ymm1, [eax + edx] // and 32 V's
@@ -3584,15 +3553,15 @@ void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
mov eax, [esp + 4] // src
mov edx, [esp + 8] // dst
mov ecx, [esp + 12] // count
- sub edx, eax
- align 16
+ align 4
convertloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
- movdqa [eax + edx], xmm0
- movdqa [eax + edx + 16], xmm1
lea eax, [eax + 32]
+ movdqa [edx], xmm0
+ movdqa [edx + 16], xmm1
+ lea edx, [edx + 32]
sub ecx, 32
jg convertloop
ret
@@ -3634,6 +3603,144 @@ void CopyRow_X86(const uint8* src, uint8* dst, int count) {
}
#endif // HAS_COPYROW_X86
+#ifdef HAS_ARGBCOPYALPHAROW_SSE2
+// width in pixels
+__declspec(naked) __declspec(align(16))
+void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
+ __asm {
+ mov eax, [esp + 4] // src
+ mov edx, [esp + 8] // dst
+ mov ecx, [esp + 12] // count
+ pcmpeqb xmm0, xmm0 // generate mask 0xff000000
+ pslld xmm0, 24
+ pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff
+ psrld xmm1, 8
+
+ align 4
+ convertloop:
+ movdqa xmm2, [eax]
+ movdqa xmm3, [eax + 16]
+ lea eax, [eax + 32]
+ movdqa xmm4, [edx]
+ movdqa xmm5, [edx + 16]
+ pand xmm2, xmm0
+ pand xmm3, xmm0
+ pand xmm4, xmm1
+ pand xmm5, xmm1
+ por xmm2, xmm4
+ por xmm3, xmm5
+ movdqa [edx], xmm2
+ movdqa [edx + 16], xmm3
+ lea edx, [edx + 32]
+ sub ecx, 8
+ jg convertloop
+
+ ret
+ }
+}
+#endif // HAS_ARGBCOPYALPHAROW_SSE2
+
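ARGBCopyAlphaRow keeps the destination's BGR bytes and overwrites only the alpha byte from the source, which is what the 0xff000000 / 0x00ffffff masks above implement; a scalar sketch with an illustrative name:

#include <stdint.h>

// Replace only the alpha byte of each destination pixel.
static void ARGBCopyAlphaRowScalar(const uint8_t* src, uint8_t* dst,
                                   int width) {
  for (int i = 0; i < width; ++i) {
    dst[i * 4 + 3] = src[i * 4 + 3];
  }
}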
+#ifdef HAS_ARGBCOPYALPHAROW_AVX2
+// width in pixels
+__declspec(naked) __declspec(align(16))
+void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
+ __asm {
+ mov eax, [esp + 4] // src
+ mov edx, [esp + 8] // dst
+ mov ecx, [esp + 12] // count
+ vpcmpeqb ymm0, ymm0, ymm0
+ vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff
+
+ align 4
+ convertloop:
+ vmovdqu ymm1, [eax]
+ vmovdqu ymm2, [eax + 32]
+ lea eax, [eax + 64]
+ vpblendvb ymm1, ymm1, [edx], ymm0
+ vpblendvb ymm2, ymm2, [edx + 32], ymm0
+ vmovdqu [edx], ymm1
+ vmovdqu [edx + 32], ymm2
+ lea edx, [edx + 64]
+ sub ecx, 16
+ jg convertloop
+
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_ARGBCOPYALPHAROW_AVX2
+
+#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
+// width in pixels
+__declspec(naked) __declspec(align(16))
+void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
+ __asm {
+ mov eax, [esp + 4] // src
+ mov edx, [esp + 8] // dst
+ mov ecx, [esp + 12] // count
+ pcmpeqb xmm0, xmm0 // generate mask 0xff000000
+ pslld xmm0, 24
+ pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff
+ psrld xmm1, 8
+
+ align 4
+ convertloop:
+ movq xmm2, qword ptr [eax] // 8 Y's
+ lea eax, [eax + 8]
+ punpcklbw xmm2, xmm2
+ punpckhwd xmm3, xmm2
+ punpcklwd xmm2, xmm2
+ movdqa xmm4, [edx]
+ movdqa xmm5, [edx + 16]
+ pand xmm2, xmm0
+ pand xmm3, xmm0
+ pand xmm4, xmm1
+ pand xmm5, xmm1
+ por xmm2, xmm4
+ por xmm3, xmm5
+ movdqa [edx], xmm2
+ movdqa [edx + 16], xmm3
+ lea edx, [edx + 32]
+ sub ecx, 8
+ jg convertloop
+
+ ret
+ }
+}
+#endif // HAS_ARGBCOPYYTOALPHAROW_SSE2
+
+#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
+// width in pixels
+__declspec(naked) __declspec(align(16))
+void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
+ __asm {
+ mov eax, [esp + 4] // src
+ mov edx, [esp + 8] // dst
+ mov ecx, [esp + 12] // count
+ vpcmpeqb ymm0, ymm0, ymm0
+ vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff
+
+ align 4
+ convertloop:
+ vpmovzxbd ymm1, qword ptr [eax]
+ vpmovzxbd ymm2, qword ptr [eax + 8]
+ lea eax, [eax + 16]
+ vpslld ymm1, ymm1, 24
+ vpslld ymm2, ymm2, 24
+ vpblendvb ymm1, ymm1, [edx], ymm0
+ vpblendvb ymm2, ymm2, [edx + 32], ymm0
+ vmovdqu [edx], ymm1
+ vmovdqu [edx + 32], ymm2
+ lea edx, [edx + 64]
+ sub ecx, 16
+ jg convertloop
+
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_ARGBCOPYYTOALPHAROW_AVX2
+
#ifdef HAS_SETROW_X86
// SetRow8 writes 'count' bytes using a 32 bit value repeated.
__declspec(naked) __declspec(align(16))
@@ -3666,7 +3773,7 @@ void ARGBSetRows_X86(uint8* dst, uint32 v32, int width,
lea ecx, [ebp * 4]
sub edx, ecx // stride - width * 4
- align 16
+ align 4
convertloop:
mov ecx, ebp
rep stosd
@@ -3693,7 +3800,7 @@ void YUY2ToYRow_AVX2(const uint8* src_yuy2,
vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
vpsrlw ymm5, ymm5, 8
- align 16
+ align 4
convertloop:
vmovdqu ymm0, [eax]
vmovdqu ymm1, [eax + 32]
@@ -3726,7 +3833,7 @@ void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
vpsrlw ymm5, ymm5, 8
sub edi, edx
- align 16
+ align 4
convertloop:
vmovdqu ymm0, [eax]
vmovdqu ymm1, [eax + 32]
@@ -3769,7 +3876,7 @@ void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
vpsrlw ymm5, ymm5, 8
sub edi, edx
- align 16
+ align 4
convertloop:
vmovdqu ymm0, [eax]
vmovdqu ymm1, [eax + 32]
@@ -3804,7 +3911,7 @@ void UYVYToYRow_AVX2(const uint8* src_uyvy,
mov edx, [esp + 8] // dst_y
mov ecx, [esp + 12] // pix
- align 16
+ align 4
convertloop:
vmovdqu ymm0, [eax]
vmovdqu ymm1, [eax + 32]
@@ -3837,7 +3944,7 @@ void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
vpsrlw ymm5, ymm5, 8
sub edi, edx
- align 16
+ align 4
convertloop:
vmovdqu ymm0, [eax]
vmovdqu ymm1, [eax + 32]
@@ -3880,7 +3987,7 @@ void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
vpsrlw ymm5, ymm5, 8
sub edi, edx
- align 16
+ align 4
convertloop:
vmovdqu ymm0, [eax]
vmovdqu ymm1, [eax + 32]
@@ -3919,7 +4026,7 @@ void YUY2ToYRow_SSE2(const uint8* src_yuy2,
pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
psrlw xmm5, 8
- align 16
+ align 4
convertloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
@@ -3950,7 +4057,7 @@ void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
psrlw xmm5, 8
sub edi, edx
- align 16
+ align 4
convertloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
@@ -3992,7 +4099,7 @@ void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
psrlw xmm5, 8
sub edi, edx
- align 16
+ align 4
convertloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
@@ -4026,7 +4133,7 @@ void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
psrlw xmm5, 8
- align 16
+ align 4
convertloop:
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
@@ -4057,7 +4164,7 @@ void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, int stride_yuy2,
psrlw xmm5, 8
sub edi, edx
- align 16
+ align 4
convertloop:
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
@@ -4099,7 +4206,7 @@ void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2,
psrlw xmm5, 8
sub edi, edx
- align 16
+ align 4
convertloop:
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
@@ -4131,7 +4238,7 @@ void UYVYToYRow_SSE2(const uint8* src_uyvy,
mov edx, [esp + 8] // dst_y
mov ecx, [esp + 12] // pix
- align 16
+ align 4
convertloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
@@ -4162,7 +4269,7 @@ void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
psrlw xmm5, 8
sub edi, edx
- align 16
+ align 4
convertloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
@@ -4204,7 +4311,7 @@ void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
psrlw xmm5, 8
sub edi, edx
- align 16
+ align 4
convertloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
@@ -4236,7 +4343,7 @@ void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
mov edx, [esp + 8] // dst_y
mov ecx, [esp + 12] // pix
- align 16
+ align 4
convertloop:
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
@@ -4267,7 +4374,7 @@ void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
psrlw xmm5, 8
sub edi, edx
- align 16
+ align 4
convertloop:
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
@@ -4309,7 +4416,7 @@ void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy,
psrlw xmm5, 8
sub edi, edx
- align 16
+ align 4
convertloop:
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
@@ -4479,7 +4586,7 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
mov esi, [esp + 4 + 8] // src_argb1
mov edx, [esp + 4 + 12] // dst_argb
mov ecx, [esp + 4 + 16] // width
- pcmpeqb xmm7, xmm7 // generate constant 1
+ pcmpeqb xmm7, xmm7 // generate constant 0x0001
psrlw xmm7, 15
pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff
psrlw xmm6, 8
@@ -4624,13 +4731,12 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
mov eax, [esp + 4] // src_argb0
mov edx, [esp + 8] // dst_argb
mov ecx, [esp + 12] // width
- sub edx, eax
pcmpeqb xmm4, xmm4 // generate mask 0xff000000
pslld xmm4, 24
pcmpeqb xmm5, xmm5 // generate mask 0x00ffffff
psrld xmm5, 8
- align 16
+ align 4
convertloop:
movdqa xmm0, [eax] // read 4 pixels
punpcklbw xmm0, xmm0 // first 2
@@ -4643,6 +4749,7 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
pshuflw xmm2, xmm2, 0FFh
pmulhuw xmm1, xmm2 // rgb * a
movdqa xmm2, [eax] // alphas
+ lea eax, [eax + 16]
psrlw xmm0, 8
pand xmm2, xmm4
psrlw xmm1, 8
@@ -4650,8 +4757,8 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
pand xmm0, xmm5 // keep original alphas
por xmm0, xmm2
sub ecx, 4
- movdqa [eax + edx], xmm0
- lea eax, [eax + 16]
+ movdqa [edx], xmm0
+ lea edx, [edx + 16]
jg convertloop
ret
@@ -4674,33 +4781,33 @@ void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
mov eax, [esp + 4] // src_argb0
mov edx, [esp + 8] // dst_argb
mov ecx, [esp + 12] // width
- sub edx, eax
pcmpeqb xmm3, xmm3 // generate mask 0xff000000
pslld xmm3, 24
movdqa xmm4, kShuffleAlpha0
movdqa xmm5, kShuffleAlpha1
- align 16
+ align 4
convertloop:
- movdqa xmm0, [eax] // read 4 pixels
+ movdqu xmm0, [eax] // read 4 pixels
pshufb xmm0, xmm4 // isolate first 2 alphas
- movdqa xmm1, [eax] // read 4 pixels
+ movdqu xmm1, [eax] // read 4 pixels
punpcklbw xmm1, xmm1 // first 2 pixel rgbs
pmulhuw xmm0, xmm1 // rgb * a
- movdqa xmm1, [eax] // read 4 pixels
+ movdqu xmm1, [eax] // read 4 pixels
pshufb xmm1, xmm5 // isolate next 2 alphas
- movdqa xmm2, [eax] // read 4 pixels
+ movdqu xmm2, [eax] // read 4 pixels
punpckhbw xmm2, xmm2 // next 2 pixel rgbs
pmulhuw xmm1, xmm2 // rgb * a
- movdqa xmm2, [eax] // mask original alpha
+ movdqu xmm2, [eax] // mask original alpha
+ lea eax, [eax + 16]
pand xmm2, xmm3
psrlw xmm0, 8
psrlw xmm1, 8
packuswb xmm0, xmm1
por xmm0, xmm2 // copy original alpha
sub ecx, 4
- movdqa [eax + edx], xmm0
- lea eax, [eax + 16]
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
jg convertloop
ret
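As a rough scalar sketch of what the attenuate rows above compute: each color byte and the alpha are widened to c*257 and a*257, and the top byte of the 32-bit product is kept, which approximates c*a/255. The helper below is hypothetical and not part of this patch; it only illustrates the arithmetic.

  static unsigned char AttenuateByte(unsigned char c, unsigned char a) {
    unsigned int cc = (unsigned int)c * 257;  /* punpcklbw with self */
    unsigned int aa = (unsigned int)a * 257;  /* broadcast alpha word */
    return (unsigned char)((cc * aa) >> 24);  /* pmulhuw, then psrlw 8 */
  }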
@@ -4727,7 +4834,7 @@ void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000
vpslld ymm5, ymm5, 24
- align 16
+ align 4
convertloop:
vmovdqu ymm6, [eax] // read 8 pixels.
vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated.
@@ -4764,11 +4871,10 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
mov eax, [esp + 8 + 4] // src_argb0
mov edx, [esp + 8 + 8] // dst_argb
mov ecx, [esp + 8 + 12] // width
- sub edx, eax
- align 16
+ align 4
convertloop:
- movdqa xmm0, [eax] // read 4 pixels
+ movdqu xmm0, [eax] // read 4 pixels
movzx esi, byte ptr [eax + 3] // first alpha
movzx edi, byte ptr [eax + 7] // second alpha
punpcklbw xmm0, xmm0 // first 2
@@ -4779,7 +4885,7 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
movlhps xmm2, xmm3
pmulhuw xmm0, xmm2 // rgb * a
- movdqa xmm1, [eax] // read 4 pixels
+ movdqu xmm1, [eax] // read 4 pixels
movzx esi, byte ptr [eax + 11] // third alpha
movzx edi, byte ptr [eax + 15] // forth alpha
punpckhbw xmm1, xmm1 // next 2
@@ -4789,11 +4895,12 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words
movlhps xmm2, xmm3
pmulhuw xmm1, xmm2 // rgb * a
+ lea eax, [eax + 16]
packuswb xmm0, xmm1
sub ecx, 4
- movdqa [eax + edx], xmm0
- lea eax, [eax + 16]
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
jg convertloop
pop edi
pop esi
@@ -4821,7 +4928,7 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
sub edx, eax
vmovdqa ymm4, kUnattenShuffleAlpha_AVX2
- align 16
+ align 4
convertloop:
vmovdqu ymm6, [eax] // read 8 pixels.
vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xffffffff for gather.
@@ -4860,7 +4967,7 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
push esi
push edi
- align 16
+ align 4
convertloop:
// replace VPGATHER
movzx esi, byte ptr [eax + 3] // alpha0
@@ -4922,9 +5029,8 @@ void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
mov ecx, [esp + 12] /* width */
movdqa xmm4, kARGBToYJ
movdqa xmm5, kAddYJ64
- sub edx, eax
- align 16
+ align 4
convertloop:
movdqa xmm0, [eax] // G
movdqa xmm1, [eax + 16]
@@ -4936,6 +5042,7 @@ void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
packuswb xmm0, xmm0 // 8 G bytes
movdqa xmm2, [eax] // A
movdqa xmm3, [eax + 16]
+ lea eax, [eax + 32]
psrld xmm2, 24
psrld xmm3, 24
packuswb xmm2, xmm3
@@ -4947,9 +5054,9 @@ void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
punpcklwd xmm0, xmm3 // GGGA first 4
punpckhwd xmm1, xmm3 // GGGA next 4
sub ecx, 8
- movdqa [eax + edx], xmm0
- movdqa [eax + edx + 16], xmm1
- lea eax, [eax + 32]
+ movdqa [edx], xmm0
+ movdqa [edx + 16], xmm1
+ lea edx, [edx + 32]
jg convertloop
ret
}
@@ -4983,7 +5090,7 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
movdqa xmm3, kARGBToSepiaG
movdqa xmm4, kARGBToSepiaR
- align 16
+ align 4
convertloop:
movdqa xmm0, [eax] // B
movdqa xmm6, [eax + 16]
@@ -5033,111 +5140,65 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
// TODO(fbarchard): packuswbs only use half of the reg. To make RGBA, combine R
// and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd.
__declspec(naked) __declspec(align(16))
-void ARGBColorMatrixRow_SSSE3(uint8* dst_argb, const int8* matrix_argb,
- int width) {
+void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
+ const int8* matrix_argb, int width) {
__asm {
- mov eax, [esp + 4] /* dst_argb */
- mov edx, [esp + 8] /* matrix_argb */
- mov ecx, [esp + 12] /* width */
- movd xmm2, [edx]
- movd xmm3, [edx + 4]
- movd xmm4, [edx + 8]
- pshufd xmm2, xmm2, 0
- pshufd xmm3, xmm3, 0
- pshufd xmm4, xmm4, 0
+ mov eax, [esp + 4] /* src_argb */
+ mov edx, [esp + 8] /* dst_argb */
+ mov ecx, [esp + 12] /* matrix_argb */
+ movdqu xmm5, [ecx]
+ pshufd xmm2, xmm5, 0x00
+ pshufd xmm3, xmm5, 0x55
+ pshufd xmm4, xmm5, 0xaa
+ pshufd xmm5, xmm5, 0xff
+ mov ecx, [esp + 16] /* width */
- align 16
+ align 4
convertloop:
movdqa xmm0, [eax] // B
- movdqa xmm6, [eax + 16]
+ movdqa xmm7, [eax + 16]
pmaddubsw xmm0, xmm2
- pmaddubsw xmm6, xmm2
- movdqa xmm5, [eax] // G
+ pmaddubsw xmm7, xmm2
+ movdqa xmm6, [eax] // G
movdqa xmm1, [eax + 16]
- pmaddubsw xmm5, xmm3
+ pmaddubsw xmm6, xmm3
pmaddubsw xmm1, xmm3
- phaddsw xmm0, xmm6 // B
- phaddsw xmm5, xmm1 // G
- psraw xmm0, 7 // B
- psraw xmm5, 7 // G
+ phaddsw xmm0, xmm7 // B
+ phaddsw xmm6, xmm1 // G
+ psraw xmm0, 6 // B
+ psraw xmm6, 6 // G
packuswb xmm0, xmm0 // 8 B values
- packuswb xmm5, xmm5 // 8 G values
- punpcklbw xmm0, xmm5 // 8 BG values
- movdqa xmm5, [eax] // R
- movdqa xmm1, [eax + 16]
- pmaddubsw xmm5, xmm4
+ packuswb xmm6, xmm6 // 8 G values
+ punpcklbw xmm0, xmm6 // 8 BG values
+ movdqa xmm1, [eax] // R
+ movdqa xmm7, [eax + 16]
pmaddubsw xmm1, xmm4
- phaddsw xmm5, xmm1
- psraw xmm5, 7
- packuswb xmm5, xmm5 // 8 R values
+ pmaddubsw xmm7, xmm4
+ phaddsw xmm1, xmm7 // R
movdqa xmm6, [eax] // A
- movdqa xmm1, [eax + 16]
- psrld xmm6, 24
- psrld xmm1, 24
- packuswb xmm6, xmm1
+ movdqa xmm7, [eax + 16]
+ pmaddubsw xmm6, xmm5
+ pmaddubsw xmm7, xmm5
+ phaddsw xmm6, xmm7 // A
+ psraw xmm1, 6 // R
+ psraw xmm6, 6 // A
+ packuswb xmm1, xmm1 // 8 R values
packuswb xmm6, xmm6 // 8 A values
- movdqa xmm1, xmm0 // Weave BG, RA together
- punpcklbw xmm5, xmm6 // 8 RA values
- punpcklwd xmm0, xmm5 // BGRA first 4
- punpckhwd xmm1, xmm5 // BGRA next 4
+ punpcklbw xmm1, xmm6 // 8 RA values
+ movdqa xmm6, xmm0 // Weave BG, RA together
+ punpcklwd xmm0, xmm1 // BGRA first 4
+ punpckhwd xmm6, xmm1 // BGRA next 4
sub ecx, 8
- movdqa [eax], xmm0
- movdqa [eax + 16], xmm1
+ movdqa [edx], xmm0
+ movdqa [edx + 16], xmm6
lea eax, [eax + 32]
+ lea edx, [edx + 32]
jg convertloop
ret
}
}
#endif // HAS_ARGBCOLORMATRIXROW_SSSE3
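A scalar sketch of the rewritten color-matrix row, assuming matrix_argb packs four signed 4-coefficient rows (one per output channel B, G, R, A) and the result is shifted right by 6 and saturated, as in the code above. The helper name is hypothetical.

  static unsigned char MatrixChannel(const unsigned char* bgra,
                                     const signed char* m /* 4 coefficients */) {
    int v = (bgra[0] * m[0] + bgra[1] * m[1] +
             bgra[2] * m[2] + bgra[3] * m[3]) >> 6;      /* pmaddubsw + phaddsw + psraw 6 */
    return (unsigned char)(v < 0 ? 0 : v > 255 ? 255 : v);  /* packuswb */
  }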
-#ifdef HAS_ARGBCOLORTABLEROW_X86
-// Tranform ARGB pixels with color table.
-__declspec(naked) __declspec(align(16))
-void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
- int width) {
- __asm {
- push ebx
- push esi
- push edi
- push ebp
- mov eax, [esp + 16 + 4] /* dst_argb */
- mov edi, [esp + 16 + 8] /* table_argb */
- mov ecx, [esp + 16 + 12] /* width */
- xor ebx, ebx
- xor edx, edx
-
- align 16
- convertloop:
- mov ebp, dword ptr [eax] // BGRA
- mov esi, ebp
- and ebp, 255
- shr esi, 8
- and esi, 255
- mov bl, [edi + ebp * 4 + 0] // B
- mov dl, [edi + esi * 4 + 1] // G
- mov ebp, dword ptr [eax] // BGRA
- mov esi, ebp
- shr ebp, 16
- shr esi, 24
- and ebp, 255
- mov [eax], bl
- mov [eax + 1], dl
- mov bl, [edi + ebp * 4 + 2] // R
- mov dl, [edi + esi * 4 + 3] // A
- mov [eax + 2], bl
- mov [eax + 3], dl
- lea eax, [eax + 4]
- sub ecx, 1
- jg convertloop
- pop ebp
- pop edi
- pop esi
- pop ebx
- ret
- }
-}
-#endif // HAS_ARGBCOLORTABLEROW_X86
-
#ifdef HAS_ARGBQUANTIZEROW_SSE2
// Quantize 4 ARGB pixels (16 bytes).
// Aligned to 16 bytes.
@@ -5160,7 +5221,7 @@ void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
pcmpeqb xmm6, xmm6 // generate mask 0xff000000
pslld xmm6, 24
- align 16
+ align 4
convertloop:
movdqa xmm0, [eax] // read 4 pixels
punpcklbw xmm0, xmm5 // first 2 pixels
@@ -5196,13 +5257,13 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
mov edx, [esp + 8] // dst_argb
mov ecx, [esp + 12] // width
movd xmm2, [esp + 16] // value
- sub edx, eax
punpcklbw xmm2, xmm2
punpcklqdq xmm2, xmm2
- align 16
+ align 4
convertloop:
movdqa xmm0, [eax] // read 4 pixels
+ lea eax, [eax + 16]
movdqa xmm1, xmm0
punpcklbw xmm0, xmm0 // first 2
punpckhbw xmm1, xmm1 // next 2
@@ -5212,8 +5273,8 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
psrlw xmm1, 8
packuswb xmm0, xmm1
sub ecx, 4
- movdqa [eax + edx], xmm0
- lea eax, [eax + 16]
+ movdqa [edx], xmm0
+ lea edx, [edx + 16]
jg convertloop
ret
@@ -5233,25 +5294,25 @@ void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
mov edx, [esp + 4 + 12] // dst_argb
mov ecx, [esp + 4 + 16] // width
pxor xmm5, xmm5 // constant 0
- sub esi, eax
- sub edx, eax
- align 16
+ align 4
convertloop:
movdqu xmm0, [eax] // read 4 pixels from src_argb0
- movdqu xmm2, [eax + esi] // read 4 pixels from src_argb1
+ movdqu xmm2, [esi] // read 4 pixels from src_argb1
movdqu xmm1, xmm0
movdqu xmm3, xmm2
- punpcklbw xmm0, xmm0 // first 2
- punpckhbw xmm1, xmm1 // next 2
- punpcklbw xmm2, xmm5 // first 2
- punpckhbw xmm3, xmm5 // next 2
- pmulhuw xmm0, xmm2 // src_argb0 * src_argb1 first 2
- pmulhuw xmm1, xmm3 // src_argb0 * src_argb1 next 2
+ punpcklbw xmm0, xmm0 // first 2
+ punpckhbw xmm1, xmm1 // next 2
+ punpcklbw xmm2, xmm5 // first 2
+ punpckhbw xmm3, xmm5 // next 2
+ pmulhuw xmm0, xmm2 // src_argb0 * src_argb1 first 2
+ pmulhuw xmm1, xmm3 // src_argb0 * src_argb1 next 2
+ lea eax, [eax + 16]
+ lea esi, [esi + 16]
packuswb xmm0, xmm1
sub ecx, 4
- movdqu [eax + edx], xmm0
- lea eax, [eax + 16]
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
jg convertloop
pop esi
@@ -5272,20 +5333,20 @@ void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
mov esi, [esp + 4 + 8] // src_argb1
mov edx, [esp + 4 + 12] // dst_argb
mov ecx, [esp + 4 + 16] // width
- sub esi, eax
- sub edx, eax
sub ecx, 4
jl convertloop49
- align 16
+ align 4
convertloop4:
movdqu xmm0, [eax] // read 4 pixels from src_argb0
- movdqu xmm1, [eax + esi] // read 4 pixels from src_argb1
+ lea eax, [eax + 16]
+ movdqu xmm1, [esi] // read 4 pixels from src_argb1
+ lea esi, [esi + 16]
paddusb xmm0, xmm1 // src_argb0 + src_argb1
sub ecx, 4
- movdqu [eax + edx], xmm0
- lea eax, [eax + 16]
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
jge convertloop4
convertloop49:
@@ -5294,11 +5355,13 @@ void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
convertloop1:
movd xmm0, [eax] // read 1 pixel from src_argb0
- movd xmm1, [eax + esi] // read 1 pixels from src_argb1
+ lea eax, [eax + 4]
+ movd xmm1, [esi] // read 1 pixel from src_argb1
+ lea esi, [esi + 4]
paddusb xmm0, xmm1 // src_argb0 + src_argb1
sub ecx, 1
- movd [eax + edx], xmm0
- lea eax, [eax + 4]
+ movd [edx], xmm0
+ lea edx, [edx + 4]
jge convertloop1
convertloop19:
@@ -5319,17 +5382,17 @@ void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
mov esi, [esp + 4 + 8] // src_argb1
mov edx, [esp + 4 + 12] // dst_argb
mov ecx, [esp + 4 + 16] // width
- sub esi, eax
- sub edx, eax
- align 16
+ align 4
convertloop:
movdqu xmm0, [eax] // read 4 pixels from src_argb0
- movdqu xmm1, [eax + esi] // read 4 pixels from src_argb1
+ lea eax, [eax + 16]
+ movdqu xmm1, [esi] // read 4 pixels from src_argb1
+ lea esi, [esi + 16]
psubusb xmm0, xmm1 // src_argb0 - src_argb1
sub ecx, 4
- movdqu [eax + edx], xmm0
- lea eax, [eax + 16]
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
jg convertloop
pop esi
@@ -5349,14 +5412,14 @@ void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
mov esi, [esp + 4 + 8] // src_argb1
mov edx, [esp + 4 + 12] // dst_argb
mov ecx, [esp + 4 + 16] // width
- vpxor ymm5, ymm5, ymm5 // constant 0
- sub esi, eax
- sub edx, eax
+ vpxor ymm5, ymm5, ymm5 // constant 0
- align 16
+ align 4
convertloop:
vmovdqu ymm1, [eax] // read 8 pixels from src_argb0
- vmovdqu ymm3, [eax + esi] // read 8 pixels from src_argb1
+ lea eax, [eax + 32]
+ vmovdqu ymm3, [esi] // read 8 pixels from src_argb1
+ lea esi, [esi + 32]
vpunpcklbw ymm0, ymm1, ymm1 // low 4
vpunpckhbw ymm1, ymm1, ymm1 // high 4
vpunpcklbw ymm2, ymm3, ymm5 // low 4
@@ -5364,8 +5427,8 @@ void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
vpmulhuw ymm0, ymm0, ymm2 // src_argb0 * src_argb1 low 4
vpmulhuw ymm1, ymm1, ymm3 // src_argb0 * src_argb1 high 4
vpackuswb ymm0, ymm0, ymm1
- vmovdqu [eax + edx], ymm0
- lea eax, [eax + 32]
+ vmovdqu [edx], ymm0
+ lea edx, [edx + 32]
sub ecx, 8
jg convertloop
@@ -5387,15 +5450,15 @@ void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
mov esi, [esp + 4 + 8] // src_argb1
mov edx, [esp + 4 + 12] // dst_argb
mov ecx, [esp + 4 + 16] // width
- sub esi, eax
- sub edx, eax
- align 16
+ align 4
convertloop:
vmovdqu ymm0, [eax] // read 8 pixels from src_argb0
- vpaddusb ymm0, ymm0, [eax + esi] // add 8 pixels from src_argb1
- vmovdqu [eax + edx], ymm0
lea eax, [eax + 32]
+ vpaddusb ymm0, ymm0, [esi] // add 8 pixels from src_argb1
+ lea esi, [esi + 32]
+ vmovdqu [edx], ymm0
+ lea edx, [edx + 32]
sub ecx, 8
jg convertloop
@@ -5417,15 +5480,15 @@ void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
mov esi, [esp + 4 + 8] // src_argb1
mov edx, [esp + 4 + 12] // dst_argb
mov ecx, [esp + 4 + 16] // width
- sub esi, eax
- sub edx, eax
- align 16
+ align 4
convertloop:
vmovdqu ymm0, [eax] // read 8 pixels from src_argb0
- vpsubusb ymm0, ymm0, [eax + esi] // src_argb0 - src_argb1
- vmovdqu [eax + edx], ymm0
lea eax, [eax + 32]
+ vpsubusb ymm0, ymm0, [esi] // src_argb0 - src_argb1
+ lea esi, [esi + 32]
+ vmovdqu [edx], ymm0
+ lea edx, [edx + 32]
sub ecx, 8
jg convertloop
@@ -5436,14 +5499,14 @@ void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
}
#endif // HAS_ARGBSUBTRACTROW_AVX2
-#ifdef HAS_SOBELXROW_SSSE3
+#ifdef HAS_SOBELXROW_SSE2
// SobelX as a matrix is
// -1 0 1
// -2 0 2
// -1 0 1
__declspec(naked) __declspec(align(16))
-void SobelXRow_SSSE3(const uint8* src_y0, const uint8* src_y1,
- const uint8* src_y2, uint8* dst_sobelx, int width) {
+void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
+ const uint8* src_y2, uint8* dst_sobelx, int width) {
__asm {
push esi
push edi
@@ -5457,7 +5520,7 @@ void SobelXRow_SSSE3(const uint8* src_y0, const uint8* src_y1,
sub edx, eax
pxor xmm5, xmm5 // constant 0
- align 16
+ align 4
convertloop:
movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0]
movq xmm1, qword ptr [eax + 2] // read 8 pixels from src_y0[2]
@@ -5477,7 +5540,9 @@ void SobelXRow_SSSE3(const uint8* src_y0, const uint8* src_y1,
paddw xmm0, xmm2
paddw xmm0, xmm1
paddw xmm0, xmm1
- pabsw xmm0, xmm0 // SSSE3. Could use SSE2 psubusw twice instead.
+ pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). SSSE3 could use pabsw
+ psubw xmm1, xmm0
+ pmaxsw xmm0, xmm1
packuswb xmm0, xmm0
sub ecx, 8
movq qword ptr [eax + edx], xmm0
@@ -5489,16 +5554,16 @@ void SobelXRow_SSSE3(const uint8* src_y0, const uint8* src_y1,
ret
}
}
-#endif // HAS_SOBELXROW_SSSE3
+#endif // HAS_SOBELXROW_SSE2
-#ifdef HAS_SOBELYROW_SSSE3
+#ifdef HAS_SOBELYROW_SSE2
// SobelY as a matrix is
// -1 -2 -1
// 0 0 0
// 1 2 1
__declspec(naked) __declspec(align(16))
-void SobelYRow_SSSE3(const uint8* src_y0, const uint8* src_y1,
- uint8* dst_sobely, int width) {
+void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
+ uint8* dst_sobely, int width) {
__asm {
push esi
mov eax, [esp + 4 + 4] // src_y0
@@ -5509,7 +5574,7 @@ void SobelYRow_SSSE3(const uint8* src_y0, const uint8* src_y1,
sub edx, eax
pxor xmm5, xmm5 // constant 0
- align 16
+ align 4
convertloop:
movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0]
movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0]
@@ -5529,7 +5594,9 @@ void SobelYRow_SSSE3(const uint8* src_y0, const uint8* src_y1,
paddw xmm0, xmm2
paddw xmm0, xmm1
paddw xmm0, xmm1
- pabsw xmm0, xmm0 // SSSE3. Could use SSE2 psubusw twice instead.
+ pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). SSSE3 could use pabsw
+ psubw xmm1, xmm0
+ pmaxsw xmm0, xmm1
packuswb xmm0, xmm0
sub ecx, 8
movq qword ptr [eax + edx], xmm0
@@ -5540,7 +5607,7 @@ void SobelYRow_SSSE3(const uint8* src_y0, const uint8* src_y1,
ret
}
}
-#endif // HAS_SOBELYROW_SSSE3
+#endif // HAS_SOBELYROW_SSE2
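The pxor/psubw/pmaxsw sequence introduced above replaces SSSE3 pabsw with an SSE2 absolute value, abs(x) = max(x, -x) on signed 16-bit lanes. A scalar sketch of the same idea (hypothetical helper, illustrative only):

  static short Abs16(short x) {
    short neg = (short)(0 - x);   /* psubw from a zeroed register */
    return x > neg ? x : neg;     /* pmaxsw */
  }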
#ifdef HAS_SOBELROW_SSE2
// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
@@ -5550,7 +5617,7 @@ void SobelYRow_SSSE3(const uint8* src_y0, const uint8* src_y1,
// B = Sobel
__declspec(naked) __declspec(align(16))
void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
- uint8* dst_argb, int width) {
+ uint8* dst_argb, int width) {
__asm {
push esi
mov eax, [esp + 4 + 4] // src_sobelx
@@ -5561,7 +5628,7 @@ void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
pcmpeqb xmm5, xmm5 // alpha 255
pslld xmm5, 24 // 0xff000000
- align 16
+ align 4
convertloop:
movdqa xmm0, [eax] // read 16 pixels src_sobelx
movdqa xmm1, [eax + esi] // read 16 pixels src_sobely
@@ -5594,6 +5661,36 @@ void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
}
#endif // HAS_SOBELROW_SSE2
+#ifdef HAS_SOBELTOPLANEROW_SSE2
+// Adds Sobel X and Sobel Y and stores Sobel into a plane.
+__declspec(naked) __declspec(align(16))
+void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
+ uint8* dst_y, int width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // src_sobelx
+ mov esi, [esp + 4 + 8] // src_sobely
+ mov edx, [esp + 4 + 12] // dst_y
+ mov ecx, [esp + 4 + 16] // width
+ sub esi, eax
+
+ align 4
+ convertloop:
+ movdqa xmm0, [eax] // read 16 pixels src_sobelx
+ movdqa xmm1, [eax + esi] // read 16 pixels src_sobely
+ lea eax, [eax + 16]
+ paddusb xmm0, xmm1 // sobel = sobelx + sobely
+ sub ecx, 16
+ movdqa [edx], xmm0
+ lea edx, [edx + 16]
+ jg convertloop
+
+ pop esi
+ ret
+ }
+}
+#endif // HAS_SOBELTOPLANEROW_SSE2
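The new SobelToPlaneRow_SSE2 is a saturating byte add of the two Sobel planes; a scalar equivalent for one pixel (hypothetical helper, not part of the patch):

  static unsigned char SobelToPlane(unsigned char sx, unsigned char sy) {
    unsigned int s = (unsigned int)sx + sy;      /* paddusb */
    return (unsigned char)(s > 255 ? 255 : s);
  }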
+
#ifdef HAS_SOBELXYROW_SSE2
// Mixes Sobel X, Sobel Y and Sobel into ARGB.
// A = 255
@@ -5610,9 +5707,9 @@ void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
mov edx, [esp + 4 + 12] // dst_argb
mov ecx, [esp + 4 + 16] // width
sub esi, eax
- pcmpeqb xmm5, xmm5 // alpha 255
+ pcmpeqb xmm5, xmm5 // alpha 255
- align 16
+ align 4
convertloop:
movdqa xmm0, [eax] // read 16 pixels src_sobelx
movdqa xmm1, [eax + esi] // read 16 pixels src_sobely
@@ -5666,15 +5763,70 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
mov eax, topleft // eax topleft
mov esi, botleft // esi botleft
mov edx, width
- movd xmm4, area
+ movd xmm5, area
mov edi, dst
mov ecx, count
- cvtdq2ps xmm4, xmm4
- rcpss xmm4, xmm4 // 1.0f / area
+ cvtdq2ps xmm5, xmm5
+ rcpss xmm4, xmm5 // 1.0f / area
pshufd xmm4, xmm4, 0
sub ecx, 4
jl l4b
+ cmp area, 128 // 128 pixels will not overflow 15 bits.
+ ja l4
+
+ pshufd xmm5, xmm5, 0 // area
+ pcmpeqb xmm6, xmm6 // constant of 65536.0 - 1 = 65535.0
+ psrld xmm6, 16
+ cvtdq2ps xmm6, xmm6
+ addps xmm5, xmm6 // (65536.0 + area - 1)
+ mulps xmm5, xmm4 // (65536.0 + area - 1) * 1 / area
+ cvtps2dq xmm5, xmm5 // 0.16 fixed point
+ packssdw xmm5, xmm5 // 16 bit shorts
+
+ // 4 pixel loop small blocks.
+ align 4
+ s4:
+ // top left
+ movdqa xmm0, [eax]
+ movdqa xmm1, [eax + 16]
+ movdqa xmm2, [eax + 32]
+ movdqa xmm3, [eax + 48]
+
+ // - top right
+ psubd xmm0, [eax + edx * 4]
+ psubd xmm1, [eax + edx * 4 + 16]
+ psubd xmm2, [eax + edx * 4 + 32]
+ psubd xmm3, [eax + edx * 4 + 48]
+ lea eax, [eax + 64]
+
+ // - bottom left
+ psubd xmm0, [esi]
+ psubd xmm1, [esi + 16]
+ psubd xmm2, [esi + 32]
+ psubd xmm3, [esi + 48]
+
+ // + bottom right
+ paddd xmm0, [esi + edx * 4]
+ paddd xmm1, [esi + edx * 4 + 16]
+ paddd xmm2, [esi + edx * 4 + 32]
+ paddd xmm3, [esi + edx * 4 + 48]
+ lea esi, [esi + 64]
+
+ packssdw xmm0, xmm1 // pack 4 pixels into 2 registers
+ packssdw xmm2, xmm3
+
+ pmulhuw xmm0, xmm5
+ pmulhuw xmm2, xmm5
+
+ packuswb xmm0, xmm2
+ movdqu [edi], xmm0
+ lea edi, [edi + 16]
+ sub ecx, 4
+ jge s4
+
+ jmp l4b
+
// 4 pixel loop
align 4
l4:
@@ -5761,7 +5913,6 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
mov edx, cumsum
mov esi, previous_cumsum
mov ecx, width
- sub esi, edx
pxor xmm0, xmm0
pxor xmm1, xmm1
@@ -5788,19 +5939,20 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
punpckhwd xmm5, xmm1
paddd xmm0, xmm2
- movdqa xmm2, [edx + esi] // previous row above.
+ movdqa xmm2, [esi] // previous row above.
paddd xmm2, xmm0
paddd xmm0, xmm3
- movdqa xmm3, [edx + esi + 16]
+ movdqa xmm3, [esi + 16]
paddd xmm3, xmm0
paddd xmm0, xmm4
- movdqa xmm4, [edx + esi + 32]
+ movdqa xmm4, [esi + 32]
paddd xmm4, xmm0
paddd xmm0, xmm5
- movdqa xmm5, [edx + esi + 48]
+ movdqa xmm5, [esi + 48]
+ lea esi, [esi + 64]
paddd xmm5, xmm0
movdqa [edx], xmm2
@@ -5824,7 +5976,8 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
punpcklbw xmm2, xmm1
punpcklwd xmm2, xmm1
paddd xmm0, xmm2
- movdqu xmm2, [edx + esi]
+ movdqu xmm2, [esi]
+ lea esi, [esi + 16]
paddd xmm2, xmm0
movdqu [edx], xmm2
lea edx, [edx + 16]
@@ -5845,7 +5998,7 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
__asm {
push esi
push edi
- mov eax, [esp + 12] // src_argb
+ mov eax, [esp + 12] // src_argb
mov esi, [esp + 16] // stride
mov edx, [esp + 20] // dst_argb
mov ecx, [esp + 24] // pointer to uv_dudv
@@ -5923,6 +6076,108 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
}
#endif // HAS_ARGBAFFINEROW_SSE2
+#ifdef HAS_INTERPOLATEROW_AVX2
+// Bilinear filter 16x2 -> 16x1
+__declspec(naked) __declspec(align(16))
+void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride, int dst_width,
+ int source_y_fraction) {
+ __asm {
+ push esi
+ push edi
+ mov edi, [esp + 8 + 4] // dst_ptr
+ mov esi, [esp + 8 + 8] // src_ptr
+ mov edx, [esp + 8 + 12] // src_stride
+ mov ecx, [esp + 8 + 16] // dst_width
+ mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
+ shr eax, 1
+ // Dispatch to specialized filters if applicable.
+ cmp eax, 0
+ je xloop100 // 0 / 128. Blend 100 / 0.
+ sub edi, esi
+ cmp eax, 32
+ je xloop75 // 32 / 128 is 0.25. Blend 75 / 25.
+ cmp eax, 64
+ je xloop50 // 64 / 128 is 0.50. Blend 50 / 50.
+ cmp eax, 96
+ je xloop25 // 96 / 128 is 0.75. Blend 25 / 75.
+
+ vmovd xmm0, eax // high fraction 0..127
+ neg eax
+ add eax, 128
+ vmovd xmm5, eax // low fraction 128..1
+ vpunpcklbw xmm5, xmm5, xmm0
+ vpunpcklwd xmm5, xmm5, xmm5
+ vpxor ymm0, ymm0, ymm0
+ vpermd ymm5, ymm0, ymm5
+
+ align 4
+ xloop:
+ vmovdqu ymm0, [esi]
+ vmovdqu ymm2, [esi + edx]
+ vpunpckhbw ymm1, ymm0, ymm2 // mutates
+ vpunpcklbw ymm0, ymm0, ymm2 // mutates
+ vpmaddubsw ymm0, ymm0, ymm5
+ vpmaddubsw ymm1, ymm1, ymm5
+ vpsrlw ymm0, ymm0, 7
+ vpsrlw ymm1, ymm1, 7
+ vpackuswb ymm0, ymm0, ymm1 // unmutates
+ sub ecx, 32
+ vmovdqu [esi + edi], ymm0
+ lea esi, [esi + 32]
+ jg xloop
+ jmp xloop99
+
+ // Blend 25 / 75.
+ align 4
+ xloop25:
+ vmovdqu ymm0, [esi]
+ vpavgb ymm0, ymm0, [esi + edx]
+ vpavgb ymm0, ymm0, [esi + edx]
+ sub ecx, 32
+ vmovdqu [esi + edi], ymm0
+ lea esi, [esi + 32]
+ jg xloop25
+ jmp xloop99
+
+ // Blend 50 / 50.
+ align 4
+ xloop50:
+ vmovdqu ymm0, [esi]
+ vpavgb ymm0, ymm0, [esi + edx]
+ sub ecx, 32
+ vmovdqu [esi + edi], ymm0
+ lea esi, [esi + 32]
+ jg xloop50
+ jmp xloop99
+
+ // Blend 75 / 25.
+ align 4
+ xloop75:
+ vmovdqu ymm0, [esi + edx]
+ vpavgb ymm0, ymm0, [esi]
+ vpavgb ymm0, ymm0, [esi]
+ sub ecx, 32
+ vmovdqu [esi + edi], ymm0
+ lea esi, [esi + 32]
+ jg xloop75
+ jmp xloop99
+
+ // Blend 100 / 0 - Copy row unchanged.
+ align 4
+ xloop100:
+ rep movsb
+
+ xloop99:
+ pop edi
+ pop esi
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_INTERPOLATEROW_AVX2
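In the general path above, eax holds source_y_fraction / 2 (0..127) and each output byte blends the two rows as row0 * (128 - f) + row1 * f, shifted right by 7. A scalar sketch of that blend (hypothetical helper, illustrative only):

  static unsigned char Blend(unsigned char row0, unsigned char row1, int f) {
    /* f = source_y_fraction >> 1, in 0..127; matches vpmaddubsw + vpsrlw 7 */
    return (unsigned char)((row0 * (128 - f) + row1 * f) >> 7);
  }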
+
+#ifdef HAS_INTERPOLATEROW_SSSE3
// Bilinear filter 16x2 -> 16x1
__declspec(naked) __declspec(align(16))
void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
@@ -5956,7 +6211,7 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
punpcklwd xmm5, xmm5
pshufd xmm5, xmm5, 0
- align 16
+ align 4
xloop:
movdqa xmm0, [esi]
movdqa xmm2, [esi + edx]
@@ -5975,7 +6230,7 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
jmp xloop99
// Blend 25 / 75.
- align 16
+ align 4
xloop25:
movdqa xmm0, [esi]
movdqa xmm1, [esi + edx]
@@ -5988,7 +6243,7 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
jmp xloop99
// Blend 50 / 50.
- align 16
+ align 4
xloop50:
movdqa xmm0, [esi]
movdqa xmm1, [esi + edx]
@@ -6000,7 +6255,7 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
jmp xloop99
// Blend 75 / 25.
- align 16
+ align 4
xloop75:
movdqa xmm1, [esi]
movdqa xmm0, [esi + edx]
@@ -6013,7 +6268,7 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
jmp xloop99
// Blend 100 / 0 - Copy row unchanged.
- align 16
+ align 4
xloop100:
movdqa xmm0, [esi]
sub ecx, 16
@@ -6027,6 +6282,7 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
ret
}
}
+#endif // HAS_INTERPOLATEROW_SSSE3
#ifdef HAS_INTERPOLATEROW_SSE2
// Bilinear filter 16x2 -> 16x1
@@ -6061,7 +6317,7 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
punpcklqdq xmm5, xmm5
pxor xmm4, xmm4
- align 16
+ align 4
xloop:
movdqa xmm0, [esi] // row0
movdqa xmm2, [esi + edx] // row1
@@ -6087,7 +6343,7 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
jmp xloop99
// Blend 25 / 75.
- align 16
+ align 4
xloop25:
movdqa xmm0, [esi]
movdqa xmm1, [esi + edx]
@@ -6100,7 +6356,7 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
jmp xloop99
// Blend 50 / 50.
- align 16
+ align 4
xloop50:
movdqa xmm0, [esi]
movdqa xmm1, [esi + edx]
@@ -6112,7 +6368,7 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
jmp xloop99
// Blend 75 / 25.
- align 16
+ align 4
xloop75:
movdqa xmm1, [esi]
movdqa xmm0, [esi + edx]
@@ -6125,7 +6381,7 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
jmp xloop99
// Blend 100 / 0 - Copy row unchanged.
- align 16
+ align 4
xloop100:
movdqa xmm0, [esi]
sub ecx, 16
@@ -6174,7 +6430,7 @@ void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
punpcklwd xmm5, xmm5
pshufd xmm5, xmm5, 0
- align 16
+ align 4
xloop:
movdqu xmm0, [esi]
movdqu xmm2, [esi + edx]
@@ -6193,7 +6449,7 @@ void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
jmp xloop99
// Blend 25 / 75.
- align 16
+ align 4
xloop25:
movdqu xmm0, [esi]
movdqu xmm1, [esi + edx]
@@ -6206,7 +6462,7 @@ void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
jmp xloop99
// Blend 50 / 50.
- align 16
+ align 4
xloop50:
movdqu xmm0, [esi]
movdqu xmm1, [esi + edx]
@@ -6218,7 +6474,7 @@ void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
jmp xloop99
// Blend 75 / 25.
- align 16
+ align 4
xloop75:
movdqu xmm1, [esi]
movdqu xmm0, [esi + edx]
@@ -6231,7 +6487,7 @@ void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
jmp xloop99
// Blend 100 / 0 - Copy row unchanged.
- align 16
+ align 4
xloop100:
movdqu xmm0, [esi]
sub ecx, 16
@@ -6279,7 +6535,7 @@ void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr,
punpcklqdq xmm5, xmm5
pxor xmm4, xmm4
- align 16
+ align 4
xloop:
movdqu xmm0, [esi] // row0
movdqu xmm2, [esi + edx] // row1
@@ -6305,7 +6561,7 @@ void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr,
jmp xloop99
// Blend 25 / 75.
- align 16
+ align 4
xloop25:
movdqu xmm0, [esi]
movdqu xmm1, [esi + edx]
@@ -6318,7 +6574,7 @@ void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr,
jmp xloop99
// Blend 50 / 50.
- align 16
+ align 4
xloop50:
movdqu xmm0, [esi]
movdqu xmm1, [esi + edx]
@@ -6330,7 +6586,7 @@ void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr,
jmp xloop99
// Blend 75 / 25.
- align 16
+ align 4
xloop75:
movdqu xmm1, [esi]
movdqu xmm0, [esi + edx]
@@ -6343,7 +6599,7 @@ void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr,
jmp xloop99
// Blend 100 / 0 - Copy row unchanged.
- align 16
+ align 4
xloop100:
movdqu xmm0, [esi]
sub ecx, 16
@@ -6370,7 +6626,7 @@ void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
mov ecx, [esp + 4 + 16] // pix
sub edi, eax
- align 16
+ align 4
convertloop:
movdqa xmm0, [eax]
pavgb xmm0, [eax + edx]
@@ -6395,7 +6651,7 @@ void HalfRow_AVX2(const uint8* src_uv, int src_uv_stride,
mov ecx, [esp + 4 + 16] // pix
sub edi, eax
- align 16
+ align 4
convertloop:
vmovdqu ymm0, [eax]
vpavgb ymm0, ymm0, [eax + edx]
@@ -6421,7 +6677,7 @@ void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
mov ecx, [esp + 16] // pix
pshufd xmm5, xmm5, 0
- align 16
+ align 4
wloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
@@ -6437,18 +6693,49 @@ void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
}
}
+// Specialized ARGB to Bayer that just isolates G channel.
+__declspec(naked) __declspec(align(16))
+void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer,
+ uint32 selector, int pix) {
+ __asm {
+ mov eax, [esp + 4] // src_argb
+ mov edx, [esp + 8] // dst_bayer
+ // selector
+ mov ecx, [esp + 16] // pix
+ pcmpeqb xmm5, xmm5 // generate mask 0x000000ff
+ psrld xmm5, 24
+
+ align 4
+ wloop:
+ movdqa xmm0, [eax]
+ movdqa xmm1, [eax + 16]
+ lea eax, [eax + 32]
+ psrld xmm0, 8 // Move green to bottom.
+ psrld xmm1, 8
+ pand xmm0, xmm5
+ pand xmm1, xmm5
+ packssdw xmm0, xmm1
+ packuswb xmm0, xmm1
+ sub ecx, 8
+ movq qword ptr [edx], xmm0
+ lea edx, [edx + 8]
+ jg wloop
+ ret
+ }
+}
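The new ARGBToBayerGGRow_SSE2 ignores the selector and simply isolates the green channel; per little-endian ARGB dword this is a shift and mask (hypothetical helper, illustrative only):

  static unsigned char ExtractGreen(unsigned int argb) {
    return (unsigned char)((argb >> 8) & 0xff);  /* psrld 8 + pand 0x000000ff */
  }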
+
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
__declspec(naked) __declspec(align(16))
void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
const uint8* shuffler, int pix) {
__asm {
mov eax, [esp + 4] // src_argb
- mov edx, [esp + 8] // dst_bayer
+ mov edx, [esp + 8] // dst_argb
mov ecx, [esp + 12] // shuffler
movdqa xmm5, [ecx]
mov ecx, [esp + 16] // pix
- align 16
+ align 4
wloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
@@ -6469,12 +6756,12 @@ void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb,
const uint8* shuffler, int pix) {
__asm {
mov eax, [esp + 4] // src_argb
- mov edx, [esp + 8] // dst_bayer
+ mov edx, [esp + 8] // dst_argb
mov ecx, [esp + 12] // shuffler
movdqa xmm5, [ecx]
mov ecx, [esp + 16] // pix
- align 16
+ align 4
wloop:
movdqu xmm0, [eax]
movdqu xmm1, [eax + 16]
@@ -6496,13 +6783,12 @@ void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
const uint8* shuffler, int pix) {
__asm {
mov eax, [esp + 4] // src_argb
- mov edx, [esp + 8] // dst_bayer
+ mov edx, [esp + 8] // dst_argb
mov ecx, [esp + 12] // shuffler
- vmovdqa xmm5, [ecx]
- vpermq ymm5, ymm5, 0x44 // same shuffle in high as low.
+ vbroadcastf128 ymm5, [ecx] // same shuffle in high as low.
mov ecx, [esp + 16] // pix
- align 16
+ align 4
wloop:
vmovdqu ymm0, [eax]
vmovdqu ymm1, [eax + 32]
@@ -6519,7 +6805,127 @@ void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
ret
}
}
-#endif
+#endif // HAS_ARGBSHUFFLEROW_AVX2
+
+__declspec(naked) __declspec(align(16))
+void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
+ const uint8* shuffler, int pix) {
+ __asm {
+ push ebx
+ push esi
+ mov eax, [esp + 8 + 4] // src_argb
+ mov edx, [esp + 8 + 8] // dst_argb
+ mov esi, [esp + 8 + 12] // shuffler
+ mov ecx, [esp + 8 + 16] // pix
+ pxor xmm5, xmm5
+
+ mov ebx, [esi] // shuffler
+ cmp ebx, 0x03000102
+ je shuf_3012
+ cmp ebx, 0x00010203
+ je shuf_0123
+ cmp ebx, 0x00030201
+ je shuf_0321
+ cmp ebx, 0x02010003
+ je shuf_2103
+
+ // TODO(fbarchard): Use one source pointer and 3 offsets.
+ shuf_any1:
+ movzx ebx, byte ptr [esi]
+ movzx ebx, byte ptr [eax + ebx]
+ mov [edx], bl
+ movzx ebx, byte ptr [esi + 1]
+ movzx ebx, byte ptr [eax + ebx]
+ mov [edx + 1], bl
+ movzx ebx, byte ptr [esi + 2]
+ movzx ebx, byte ptr [eax + ebx]
+ mov [edx + 2], bl
+ movzx ebx, byte ptr [esi + 3]
+ movzx ebx, byte ptr [eax + ebx]
+ mov [edx + 3], bl
+ lea eax, [eax + 4]
+ lea edx, [edx + 4]
+ sub ecx, 1
+ jg shuf_any1
+ jmp shuf99
+
+ align 4
+ shuf_0123:
+ movdqu xmm0, [eax]
+ lea eax, [eax + 16]
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, xmm5
+ punpckhbw xmm1, xmm5
+ pshufhw xmm0, xmm0, 01Bh // 1B = 00011011 = 0x0123 = BGRAToARGB
+ pshuflw xmm0, xmm0, 01Bh
+ pshufhw xmm1, xmm1, 01Bh
+ pshuflw xmm1, xmm1, 01Bh
+ packuswb xmm0, xmm1
+ sub ecx, 4
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ jg shuf_0123
+ jmp shuf99
+
+ align 4
+ shuf_0321:
+ movdqu xmm0, [eax]
+ lea eax, [eax + 16]
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, xmm5
+ punpckhbw xmm1, xmm5
+ pshufhw xmm0, xmm0, 039h // 39 = 00111001 = 0x0321 = RGBAToARGB
+ pshuflw xmm0, xmm0, 039h
+ pshufhw xmm1, xmm1, 039h
+ pshuflw xmm1, xmm1, 039h
+ packuswb xmm0, xmm1
+ sub ecx, 4
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ jg shuf_0321
+ jmp shuf99
+
+ align 4
+ shuf_2103:
+ movdqu xmm0, [eax]
+ lea eax, [eax + 16]
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, xmm5
+ punpckhbw xmm1, xmm5
+ pshufhw xmm0, xmm0, 093h // 93 = 10010011 = 0x2103 = ARGBToRGBA
+ pshuflw xmm0, xmm0, 093h
+ pshufhw xmm1, xmm1, 093h
+ pshuflw xmm1, xmm1, 093h
+ packuswb xmm0, xmm1
+ sub ecx, 4
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ jg shuf_2103
+ jmp shuf99
+
+ align 4
+ shuf_3012:
+ movdqu xmm0, [eax]
+ lea eax, [eax + 16]
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, xmm5
+ punpckhbw xmm1, xmm5
+ pshufhw xmm0, xmm0, 0C6h // C6 = 11000110 = 0x3012 = ABGRToARGB
+ pshuflw xmm0, xmm0, 0C6h
+ pshufhw xmm1, xmm1, 0C6h
+ pshuflw xmm1, xmm1, 0C6h
+ packuswb xmm0, xmm1
+ sub ecx, 4
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ jg shuf_3012
+
+ shuf99:
+ pop esi
+ pop ebx
+ ret
+ }
+}
// YUY2 - Macro-pixel = 2 image pixels
// Y0U0Y1V0....Y2U2Y3V2...Y4U4Y5V4....
@@ -6542,7 +6948,7 @@ void I422ToYUY2Row_SSE2(const uint8* src_y,
mov ecx, [esp + 8 + 20] // width
sub edx, esi
- align 16
+ align 4
convertloop:
movq xmm2, qword ptr [esi] // U
movq xmm3, qword ptr [esi + edx] // V
@@ -6580,7 +6986,7 @@ void I422ToUYVYRow_SSE2(const uint8* src_y,
mov ecx, [esp + 8 + 20] // width
sub edx, esi
- align 16
+ align 4
convertloop:
movq xmm2, qword ptr [esi] // U
movq xmm3, qword ptr [esi + edx] // V
@@ -6602,6 +7008,289 @@ void I422ToUYVYRow_SSE2(const uint8* src_y,
ret
}
}
+
+#ifdef HAS_FIXEDDIV_X86
+// Divide num by div and return as 16.16 fixed point result.
+__declspec(naked) __declspec(align(16))
+int FixedDiv_X86(int num, int div) {
+ __asm {
+ mov eax, [esp + 4] // num
+ cdq // extend num to 64 bits
+ shld edx, eax, 16 // 32.16
+ shl eax, 16
+ idiv dword ptr [esp + 8]
+ ret
+ }
+}
+#endif // HAS_FIXEDDIV_X86
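FixedDiv_X86 above widens num to 64 bits, shifts it left by 16 and divides, i.e. it returns a 16.16 fixed-point quotient. A portable C sketch (hypothetical helper; assumes div != 0 and that the result fits in 32 bits):

  static int FixedDivC(int num, int div) {
    return (int)(((long long)num << 16) / div);  /* cdq / shld,shl 16 / idiv */
  }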
+
+#ifdef HAS_ARGBPOLYNOMIALROW_SSE2
+__declspec(naked) __declspec(align(16))
+void ARGBPolynomialRow_SSE2(const uint8* src_argb,
+ uint8* dst_argb, const float* poly,
+ int width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] /* src_argb */
+ mov edx, [esp + 4 + 8] /* dst_argb */
+ mov esi, [esp + 4 + 12] /* poly */
+ mov ecx, [esp + 4 + 16] /* width */
+ pxor xmm3, xmm3 // 0 constant for zero extending bytes to ints.
+
+ // 2 pixel loop.
+ align 4
+ convertloop:
+// pmovzxbd xmm0, dword ptr [eax] // BGRA pixel
+// pmovzxbd xmm4, dword ptr [eax + 4] // BGRA pixel
+ movq xmm0, qword ptr [eax] // BGRABGRA
+ lea eax, [eax + 8]
+ punpcklbw xmm0, xmm3
+ movdqa xmm4, xmm0
+ punpcklwd xmm0, xmm3 // pixel 0
+ punpckhwd xmm4, xmm3 // pixel 1
+ cvtdq2ps xmm0, xmm0 // 4 floats
+ cvtdq2ps xmm4, xmm4
+ movdqa xmm1, xmm0 // X
+ movdqa xmm5, xmm4
+ mulps xmm0, [esi + 16] // C1 * X
+ mulps xmm4, [esi + 16]
+ addps xmm0, [esi] // result = C0 + C1 * X
+ addps xmm4, [esi]
+ movdqa xmm2, xmm1
+ movdqa xmm6, xmm5
+ mulps xmm2, xmm1 // X * X
+ mulps xmm6, xmm5
+ mulps xmm1, xmm2 // X * X * X
+ mulps xmm5, xmm6
+ mulps xmm2, [esi + 32] // C2 * X * X
+ mulps xmm6, [esi + 32]
+ mulps xmm1, [esi + 48] // C3 * X * X * X
+ mulps xmm5, [esi + 48]
+ addps xmm0, xmm2 // result += C2 * X * X
+ addps xmm4, xmm6
+ addps xmm0, xmm1 // result += C3 * X * X * X
+ addps xmm4, xmm5
+ cvttps2dq xmm0, xmm0
+ cvttps2dq xmm4, xmm4
+ packuswb xmm0, xmm4
+ packuswb xmm0, xmm0
+ sub ecx, 2
+ movq qword ptr [edx], xmm0
+ lea edx, [edx + 8]
+ jg convertloop
+ pop esi
+ ret
+ }
+}
+#endif // HAS_ARGBPOLYNOMIALROW_SSE2
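The polynomial rows treat poly as four 4-float coefficient vectors C0..C3 (one float per channel) and evaluate C0 + C1*x + C2*x^2 + C3*x^3 for each byte, then truncate and saturate. A scalar sketch of one channel (hypothetical helper, illustrative only):

  static unsigned char PolyByte(unsigned char b, const float* poly, int ch) {
    float x = (float)b;
    float v = poly[ch] + poly[4 + ch] * x + poly[8 + ch] * x * x +
              poly[12 + ch] * x * x * x;
    if (v < 0.f) v = 0.f;
    if (v > 255.f) v = 255.f;                    /* cvttps2dq + packuswb */
    return (unsigned char)v;
  }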
+
+#ifdef HAS_ARGBPOLYNOMIALROW_AVX2
+__declspec(naked) __declspec(align(16))
+void ARGBPolynomialRow_AVX2(const uint8* src_argb,
+ uint8* dst_argb, const float* poly,
+ int width) {
+ __asm {
+ mov eax, [esp + 4] /* src_argb */
+ mov edx, [esp + 8] /* dst_argb */
+ mov ecx, [esp + 12] /* poly */
+ vbroadcastf128 ymm4, [ecx] // C0
+ vbroadcastf128 ymm5, [ecx + 16] // C1
+ vbroadcastf128 ymm6, [ecx + 32] // C2
+ vbroadcastf128 ymm7, [ecx + 48] // C3
+ mov ecx, [esp + 16] /* width */
+
+ // 2 pixel loop.
+ align 4
+ convertloop:
+ vpmovzxbd ymm0, qword ptr [eax] // 2 BGRA pixels
+ lea eax, [eax + 8]
+ vcvtdq2ps ymm0, ymm0 // X 8 floats
+ vmulps ymm2, ymm0, ymm0 // X * X
+ vmulps ymm3, ymm0, ymm7 // C3 * X
+ vfmadd132ps ymm0, ymm4, ymm5 // result = C0 + C1 * X
+ vfmadd231ps ymm0, ymm2, ymm6 // result += C2 * X * X
+ vfmadd231ps ymm0, ymm2, ymm3 // result += C3 * X * X * X
+ vcvttps2dq ymm0, ymm0
+ vpackusdw ymm0, ymm0, ymm0 // b0g0r0a0_00000000_b0g0r0a0_00000000
+ vpermq ymm0, ymm0, 0xd8 // b0g0r0a0_b0g0r0a0_00000000_00000000
+ vpackuswb xmm0, xmm0, xmm0 // bgrabgra_00000000_00000000_00000000
+ sub ecx, 2
+ vmovq qword ptr [edx], xmm0
+ lea edx, [edx + 8]
+ jg convertloop
+ vzeroupper
+ ret
+ }
+}
+#endif // HAS_ARGBPOLYNOMIALROW_AVX2
+
+#ifdef HAS_ARGBCOLORTABLEROW_X86
+// Transform ARGB pixels with color table.
+__declspec(naked) __declspec(align(16))
+void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
+ int width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] /* dst_argb */
+ mov esi, [esp + 4 + 8] /* table_argb */
+ mov ecx, [esp + 4 + 12] /* width */
+
+ // 1 pixel loop.
+ align 4
+ convertloop:
+ movzx edx, byte ptr [eax]
+ lea eax, [eax + 4]
+ movzx edx, byte ptr [esi + edx * 4]
+ mov byte ptr [eax - 4], dl
+ movzx edx, byte ptr [eax - 4 + 1]
+ movzx edx, byte ptr [esi + edx * 4 + 1]
+ mov byte ptr [eax - 4 + 1], dl
+ movzx edx, byte ptr [eax - 4 + 2]
+ movzx edx, byte ptr [esi + edx * 4 + 2]
+ mov byte ptr [eax - 4 + 2], dl
+ movzx edx, byte ptr [eax - 4 + 3]
+ movzx edx, byte ptr [esi + edx * 4 + 3]
+ mov byte ptr [eax - 4 + 3], dl
+ dec ecx
+ jg convertloop
+ pop esi
+ ret
+ }
+}
+#endif // HAS_ARGBCOLORTABLEROW_X86
+
+#ifdef HAS_RGBCOLORTABLEROW_X86
+// Transform RGB pixels with color table.
+__declspec(naked) __declspec(align(16))
+void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] /* dst_argb */
+ mov esi, [esp + 4 + 8] /* table_argb */
+ mov ecx, [esp + 4 + 12] /* width */
+
+ // 1 pixel loop.
+ align 4
+ convertloop:
+ movzx edx, byte ptr [eax]
+ lea eax, [eax + 4]
+ movzx edx, byte ptr [esi + edx * 4]
+ mov byte ptr [eax - 4], dl
+ movzx edx, byte ptr [eax - 4 + 1]
+ movzx edx, byte ptr [esi + edx * 4 + 1]
+ mov byte ptr [eax - 4 + 1], dl
+ movzx edx, byte ptr [eax - 4 + 2]
+ movzx edx, byte ptr [esi + edx * 4 + 2]
+ mov byte ptr [eax - 4 + 2], dl
+ dec ecx
+ jg convertloop
+
+ pop esi
+ ret
+ }
+}
+#endif // HAS_RGBCOLORTABLEROW_X86
+
+#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
+// Transform RGB pixels with luma table.
+__declspec(naked) __declspec(align(16))
+void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
+ int width,
+ const uint8* luma, uint32 lumacoeff) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] /* src_argb */
+ mov edi, [esp + 8 + 8] /* dst_argb */
+ mov ecx, [esp + 8 + 12] /* width */
+ movd xmm2, dword ptr [esp + 8 + 16] // luma table
+ movd xmm3, dword ptr [esp + 8 + 20] // lumacoeff
+ pshufd xmm2, xmm2, 0
+ pshufd xmm3, xmm3, 0
+ pcmpeqb xmm4, xmm4 // generate mask 0xff00ff00
+ psllw xmm4, 8
+ pxor xmm5, xmm5
+
+ // 4 pixel loop.
+ align 4
+ convertloop:
+ movdqu xmm0, qword ptr [eax] // generate luma ptr
+ pmaddubsw xmm0, xmm3
+ phaddw xmm0, xmm0
+ pand xmm0, xmm4 // mask out low bits
+ punpcklwd xmm0, xmm5
+ paddd xmm0, xmm2 // add table base
+ movd esi, xmm0
+ pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32
+
+ movzx edx, byte ptr [eax]
+ movzx edx, byte ptr [esi + edx]
+ mov byte ptr [edi], dl
+ movzx edx, byte ptr [eax + 1]
+ movzx edx, byte ptr [esi + edx]
+ mov byte ptr [edi + 1], dl
+ movzx edx, byte ptr [eax + 2]
+ movzx edx, byte ptr [esi + edx]
+ mov byte ptr [edi + 2], dl
+ movzx edx, byte ptr [eax + 3] // copy alpha.
+ mov byte ptr [edi + 3], dl
+
+ movd esi, xmm0
+ pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32
+
+ movzx edx, byte ptr [eax + 4]
+ movzx edx, byte ptr [esi + edx]
+ mov byte ptr [edi + 4], dl
+ movzx edx, byte ptr [eax + 5]
+ movzx edx, byte ptr [esi + edx]
+ mov byte ptr [edi + 5], dl
+ movzx edx, byte ptr [eax + 6]
+ movzx edx, byte ptr [esi + edx]
+ mov byte ptr [edi + 6], dl
+ movzx edx, byte ptr [eax + 7] // copy alpha.
+ mov byte ptr [edi + 7], dl
+
+ movd esi, xmm0
+ pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32
+
+ movzx edx, byte ptr [eax + 8]
+ movzx edx, byte ptr [esi + edx]
+ mov byte ptr [edi + 8], dl
+ movzx edx, byte ptr [eax + 9]
+ movzx edx, byte ptr [esi + edx]
+ mov byte ptr [edi + 9], dl
+ movzx edx, byte ptr [eax + 10]
+ movzx edx, byte ptr [esi + edx]
+ mov byte ptr [edi + 10], dl
+ movzx edx, byte ptr [eax + 11] // copy alpha.
+ mov byte ptr [edi + 11], dl
+
+ movd esi, xmm0
+
+ movzx edx, byte ptr [eax + 12]
+ movzx edx, byte ptr [esi + edx]
+ mov byte ptr [edi + 12], dl
+ movzx edx, byte ptr [eax + 13]
+ movzx edx, byte ptr [esi + edx]
+ mov byte ptr [edi + 13], dl
+ movzx edx, byte ptr [eax + 14]
+ movzx edx, byte ptr [esi + edx]
+ mov byte ptr [edi + 14], dl
+ movzx edx, byte ptr [eax + 15] // copy alpha.
+ mov byte ptr [edi + 15], dl
+
+ sub ecx, 4
+ lea eax, [eax + 16]
+ lea edi, [edi + 16]
+ jg convertloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+#endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3
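ARGBLumaColorTableRow_SSSE3 above weights B, G, R (and A) with the bytes of lumacoeff, keeps the high byte of that sum as a 256-byte row offset into the luma table, and remaps B, G and R through that row while copying alpha. A scalar sketch of one pixel, under the assumption that the table really is laid out as 256-byte rows and the coefficients are small and non-negative (hypothetical helper):

  static void LumaPixel(const unsigned char* bgra, unsigned char* dst,
                        const unsigned char* luma, const unsigned char* coeff) {
    unsigned int sum = (bgra[0] * coeff[0] + bgra[1] * coeff[1] +
                        bgra[2] * coeff[2] + bgra[3] * coeff[3]) & 0xffffu;
    const unsigned char* row = luma + (sum & 0xff00u);  /* table base + masked luma */
    dst[0] = row[bgra[0]];
    dst[1] = row[bgra[1]];
    dst[2] = row[bgra[2]];
    dst[3] = bgra[3];                                   /* alpha copied unchanged */
  }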
+
#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
#ifdef __cplusplus
diff --git a/chromium/third_party/libyuv/source/row_x86.asm b/chromium/third_party/libyuv/source/row_x86.asm
index 80a9716bae2..0cb326f8e58 100644
--- a/chromium/third_party/libyuv/source/row_x86.asm
+++ b/chromium/third_party/libyuv/source/row_x86.asm
@@ -28,7 +28,7 @@ cglobal %1ToYRow%3, 3, 3, 3, src_yuy2, dst_y, pix
psrlw m2, m2, 8
%endif
- ALIGN 16
+ ALIGN 4
.convertloop:
mov%2 m0, [src_yuy2q]
mov%2 m1, [src_yuy2q + mmsize]
@@ -74,7 +74,7 @@ cglobal SplitUVRow%2, 4, 4, 5, src_uv, dst_u, dst_v, pix
psrlw m4, m4, 8
sub dst_vq, dst_uq
- ALIGN 16
+ ALIGN 4
.convertloop:
mov%1 m0, [src_uvq]
mov%1 m1, [src_uvq + mmsize]
@@ -113,7 +113,7 @@ SplitUVRow a,
cglobal MergeUVRow_%2, 4, 4, 3, src_u, src_v, dst_uv, pix
sub src_vq, src_uq
- ALIGN 16
+ ALIGN 4
.convertloop:
mov%1 m0, [src_uq]
mov%1 m1, [src_vq]
diff --git a/chromium/third_party/libyuv/source/scale.cc b/chromium/third_party/libyuv/source/scale.cc
index 77af420b3f3..4f19742a205 100644
--- a/chromium/third_party/libyuv/source/scale.cc
+++ b/chromium/third_party/libyuv/source/scale.cc
@@ -16,1657 +16,21 @@
#include "libyuv/cpu_id.h"
#include "libyuv/planar_functions.h" // For CopyPlane
#include "libyuv/row.h"
+#include "libyuv/scale_row.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
+// Remove this macro if OVERREAD is safe.
+#define AVOID_OVERREAD 1
+
static __inline int Abs(int v) {
return v >= 0 ? v : -v;
}
-static __inline int Half(int v) {
- return v >= 0 ? ((v + 1) >> 1) : -((-v + 1) >> 1);
-}
-
-// Note: Some SSE2 reference manuals
-// cpuvol1.pdf agner_instruction_tables.pdf 253666.pdf 253667.pdf
-
-// Set the following flag to true to revert to only
-// using the reference implementation ScalePlaneBox(), and
-// NOT the optimized versions. Useful for debugging and
-// when comparing the quality of the resulting YUV planes
-// as produced by the optimized and non-optimized versions.
-static bool use_reference_impl_ = false;
-
-LIBYUV_API
-void SetUseReferenceImpl(bool use) {
- use_reference_impl_ = use;
-}
-
-// ScaleRowDown2Int also used by planar functions
-// NEON downscalers with interpolation.
-
-#if !defined(LIBYUV_DISABLE_NEON) && \
- (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
-#define HAS_SCALEROWDOWN2_NEON
-// Note - not static due to reuse in convert for 444 to 420.
-void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */,
- uint8* dst, int dst_width);
-
-void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst, int dst_width);
-
-#define HAS_SCALEROWDOWN4_NEON
-void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */,
- uint8* dst_ptr, int dst_width);
-void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width);
-
-#define HAS_SCALEROWDOWN34_NEON
-// Down scale from 4 to 3 pixels. Use the neon multilane read/write
-// to load up the every 4th pixel into a 4 different registers.
-// Point samples 32 pixels to 24 pixels.
-void ScaleRowDown34_NEON(const uint8* src_ptr,
- ptrdiff_t /* src_stride */,
- uint8* dst_ptr, int dst_width);
-void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width);
-void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width);
-
-#define HAS_SCALEROWDOWN38_NEON
-// 32 -> 12
-void ScaleRowDown38_NEON(const uint8* src_ptr,
- ptrdiff_t /* src_stride */,
- uint8* dst_ptr, int dst_width);
-// 32x3 -> 12x1
-void ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width);
-// 32x2 -> 12x1
-void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width);
-
-// SSE2 downscalers with interpolation.
-// Constants for SSSE3 code
-#elif !defined(LIBYUV_DISABLE_X86) && \
- (defined(_M_IX86) || defined(__i386__) || defined(__x86_64__))
-// GCC 4.2 on OSX has link error when passing static or const to inline.
-// TODO(fbarchard): Use static const when gcc 4.2 support is dropped.
-#ifdef __APPLE__
-#define CONST
-#else
-#define CONST static const
-#endif
-
-// Offsets for source bytes 0 to 9
-CONST uvec8 kShuf0 =
- { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 };
-
-// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
-CONST uvec8 kShuf1 =
- { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 };
-
-// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
-CONST uvec8 kShuf2 =
- { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 };
-
-// Offsets for source bytes 0 to 10
-CONST uvec8 kShuf01 =
- { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 };
-
-// Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13.
-CONST uvec8 kShuf11 =
- { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 };
-
-// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
-CONST uvec8 kShuf21 =
- { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 };
-
-// Coefficients for source bytes 0 to 10
-CONST uvec8 kMadd01 =
- { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 };
-
-// Coefficients for source bytes 10 to 21
-CONST uvec8 kMadd11 =
- { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 };
-
-// Coefficients for source bytes 21 to 31
-CONST uvec8 kMadd21 =
- { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 };
-
-// Coefficients for source bytes 21 to 31
-CONST vec16 kRound34 =
- { 2, 2, 2, 2, 2, 2, 2, 2 };
-
-CONST uvec8 kShuf38a =
- { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
-
-CONST uvec8 kShuf38b =
- { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 };
-
-// Arrange words 0,3,6 into 0,1,2
-CONST uvec8 kShufAc =
- { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
-
-// Arrange words 0,3,6 into 3,4,5
-CONST uvec8 kShufAc3 =
- { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 };
-
-// Scaling values for boxes of 3x3 and 2x3
-CONST uvec16 kScaleAc33 =
- { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 };
-
-// Arrange first value for pixels 0,1,2,3,4,5
-CONST uvec8 kShufAb0 =
- { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 };
-
-// Arrange second value for pixels 0,1,2,3,4,5
-CONST uvec8 kShufAb1 =
- { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 };
-
-// Arrange third value for pixels 0,1,2,3,4,5
-CONST uvec8 kShufAb2 =
- { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };
-
-// Scaling values for boxes of 3x2 and 2x2
-CONST uvec16 kScaleAb2 =
- { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
-#endif
-
-#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
-#define HAS_SCALEROWDOWN2_SSE2
-// Reads 32 pixels, throws half away and writes 16 pixels.
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
-__declspec(naked) __declspec(align(16))
-static void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- __asm {
- mov eax, [esp + 4] // src_ptr
- // src_stride ignored
- mov edx, [esp + 12] // dst_ptr
- mov ecx, [esp + 16] // dst_width
-
- align 16
- wloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- lea eax, [eax + 32]
- psrlw xmm0, 8 // isolate odd pixels.
- psrlw xmm1, 8
- packuswb xmm0, xmm1
- sub ecx, 16
- movdqa [edx], xmm0
- lea edx, [edx + 16]
- jg wloop
-
- ret
- }
-}
-
-// Blends 32x2 rectangle to 16x1.
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
-__declspec(naked) __declspec(align(16))
-void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- __asm {
- push esi
- mov eax, [esp + 4 + 4] // src_ptr
- mov esi, [esp + 4 + 8] // src_stride
- mov edx, [esp + 4 + 12] // dst_ptr
- mov ecx, [esp + 4 + 16] // dst_width
- pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
- psrlw xmm5, 8
-
- align 16
- wloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- movdqa xmm2, [eax + esi]
- movdqa xmm3, [eax + esi + 16]
- lea eax, [eax + 32]
- pavgb xmm0, xmm2 // average rows
- pavgb xmm1, xmm3
-
- movdqa xmm2, xmm0 // average columns (32 to 16 pixels)
- psrlw xmm0, 8
- movdqa xmm3, xmm1
- psrlw xmm1, 8
- pand xmm2, xmm5
- pand xmm3, xmm5
- pavgw xmm0, xmm2
- pavgw xmm1, xmm3
- packuswb xmm0, xmm1
-
- sub ecx, 16
- movdqa [edx], xmm0
- lea edx, [edx + 16]
- jg wloop
-
- pop esi
- ret
- }
-}
-
-// Reads 32 pixels, throws half away and writes 16 pixels.
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
-__declspec(naked) __declspec(align(16))
-static void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- __asm {
- mov eax, [esp + 4] // src_ptr
- // src_stride ignored
- mov edx, [esp + 12] // dst_ptr
- mov ecx, [esp + 16] // dst_width
-
- align 16
- wloop:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- lea eax, [eax + 32]
- psrlw xmm0, 8 // isolate odd pixels.
- psrlw xmm1, 8
- packuswb xmm0, xmm1
- sub ecx, 16
- movdqu [edx], xmm0
- lea edx, [edx + 16]
- jg wloop
-
- ret
- }
-}
-
-// Blends 32x2 rectangle to 16x1.
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
-__declspec(naked) __declspec(align(16))
-static void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- __asm {
- push esi
- mov eax, [esp + 4 + 4] // src_ptr
- mov esi, [esp + 4 + 8] // src_stride
- mov edx, [esp + 4 + 12] // dst_ptr
- mov ecx, [esp + 4 + 16] // dst_width
- pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
- psrlw xmm5, 8
-
- align 16
- wloop:
- movdqu xmm0, [eax]
- movdqu xmm1, [eax + 16]
- movdqu xmm2, [eax + esi]
- movdqu xmm3, [eax + esi + 16]
- lea eax, [eax + 32]
- pavgb xmm0, xmm2 // average rows
- pavgb xmm1, xmm3
-
- movdqa xmm2, xmm0 // average columns (32 to 16 pixels)
- psrlw xmm0, 8
- movdqa xmm3, xmm1
- psrlw xmm1, 8
- pand xmm2, xmm5
- pand xmm3, xmm5
- pavgw xmm0, xmm2
- pavgw xmm1, xmm3
- packuswb xmm0, xmm1
-
- sub ecx, 16
- movdqu [edx], xmm0
- lea edx, [edx + 16]
- jg wloop
-
- pop esi
- ret
- }
-}
-
-#define HAS_SCALEROWDOWN4_SSE2
-// Point samples 32 pixels to 8 pixels.
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
-__declspec(naked) __declspec(align(16))
-static void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- __asm {
- mov eax, [esp + 4] // src_ptr
- // src_stride ignored
- mov edx, [esp + 12] // dst_ptr
- mov ecx, [esp + 16] // dst_width
- pcmpeqb xmm5, xmm5 // generate mask 0x00ff0000
- psrld xmm5, 24
- pslld xmm5, 16
-
- align 16
- wloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- lea eax, [eax + 32]
- pand xmm0, xmm5
- pand xmm1, xmm5
- packuswb xmm0, xmm1
- psrlw xmm0, 8
- packuswb xmm0, xmm0
- sub ecx, 8
- movq qword ptr [edx], xmm0
- lea edx, [edx + 8]
- jg wloop
-
- ret
- }
-}
-
-// Blends 32x4 rectangle to 8x1.
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
-__declspec(naked) __declspec(align(16))
-static void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- __asm {
- push esi
- push edi
- mov eax, [esp + 8 + 4] // src_ptr
- mov esi, [esp + 8 + 8] // src_stride
- mov edx, [esp + 8 + 12] // dst_ptr
- mov ecx, [esp + 8 + 16] // dst_width
- lea edi, [esi + esi * 2] // src_stride * 3
- pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff
- psrlw xmm7, 8
-
- align 16
- wloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- movdqa xmm2, [eax + esi]
- movdqa xmm3, [eax + esi + 16]
- pavgb xmm0, xmm2 // average rows
- pavgb xmm1, xmm3
- movdqa xmm2, [eax + esi * 2]
- movdqa xmm3, [eax + esi * 2 + 16]
- movdqa xmm4, [eax + edi]
- movdqa xmm5, [eax + edi + 16]
- lea eax, [eax + 32]
- pavgb xmm2, xmm4
- pavgb xmm3, xmm5
- pavgb xmm0, xmm2
- pavgb xmm1, xmm3
-
- movdqa xmm2, xmm0 // average columns (32 to 16 pixels)
- psrlw xmm0, 8
- movdqa xmm3, xmm1
- psrlw xmm1, 8
- pand xmm2, xmm7
- pand xmm3, xmm7
- pavgw xmm0, xmm2
- pavgw xmm1, xmm3
- packuswb xmm0, xmm1
-
- movdqa xmm2, xmm0 // average columns (16 to 8 pixels)
- psrlw xmm0, 8
- pand xmm2, xmm7
- pavgw xmm0, xmm2
- packuswb xmm0, xmm0
-
- sub ecx, 8
- movq qword ptr [edx], xmm0
- lea edx, [edx + 8]
- jg wloop
-
- pop edi
- pop esi
- ret
- }
-}
-
-#define HAS_SCALEROWDOWN34_SSSE3
-// Point samples 32 pixels to 24 pixels.
-// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
-// Then shuffled to do the scaling.
-
-// Note that movdqa+palign may be better than movdqu.
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
-__declspec(naked) __declspec(align(16))
-static void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- __asm {
- mov eax, [esp + 4] // src_ptr
- // src_stride ignored
- mov edx, [esp + 12] // dst_ptr
- mov ecx, [esp + 16] // dst_width
- movdqa xmm3, kShuf0
- movdqa xmm4, kShuf1
- movdqa xmm5, kShuf2
-
- align 16
- wloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- lea eax, [eax + 32]
- movdqa xmm2, xmm1
- palignr xmm1, xmm0, 8
- pshufb xmm0, xmm3
- pshufb xmm1, xmm4
- pshufb xmm2, xmm5
- movq qword ptr [edx], xmm0
- movq qword ptr [edx + 8], xmm1
- movq qword ptr [edx + 16], xmm2
- lea edx, [edx + 24]
- sub ecx, 24
- jg wloop
-
- ret
- }
-}
-
-// Blends 32x2 rectangle to 24x1
-// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
-// Then shuffled to do the scaling.
-
-// Register usage:
-// xmm0 src_row 0
-// xmm1 src_row 1
-// xmm2 shuf 0
-// xmm3 shuf 1
-// xmm4 shuf 2
-// xmm5 madd 0
-// xmm6 madd 1
-// xmm7 kRound34
-
-// Note that movdqa+palign may be better than movdqu.
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
-__declspec(naked) __declspec(align(16))
-static void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- __asm {
- push esi
- mov eax, [esp + 4 + 4] // src_ptr
- mov esi, [esp + 4 + 8] // src_stride
- mov edx, [esp + 4 + 12] // dst_ptr
- mov ecx, [esp + 4 + 16] // dst_width
- movdqa xmm2, kShuf01
- movdqa xmm3, kShuf11
- movdqa xmm4, kShuf21
- movdqa xmm5, kMadd01
- movdqa xmm6, kMadd11
- movdqa xmm7, kRound34
-
- align 16
- wloop:
- movdqa xmm0, [eax] // pixels 0..7
- movdqa xmm1, [eax + esi]
- pavgb xmm0, xmm1
- pshufb xmm0, xmm2
- pmaddubsw xmm0, xmm5
- paddsw xmm0, xmm7
- psrlw xmm0, 2
- packuswb xmm0, xmm0
- movq qword ptr [edx], xmm0
- movdqu xmm0, [eax + 8] // pixels 8..15
- movdqu xmm1, [eax + esi + 8]
- pavgb xmm0, xmm1
- pshufb xmm0, xmm3
- pmaddubsw xmm0, xmm6
- paddsw xmm0, xmm7
- psrlw xmm0, 2
- packuswb xmm0, xmm0
- movq qword ptr [edx + 8], xmm0
- movdqa xmm0, [eax + 16] // pixels 16..23
- movdqa xmm1, [eax + esi + 16]
- lea eax, [eax + 32]
- pavgb xmm0, xmm1
- pshufb xmm0, xmm4
- movdqa xmm1, kMadd21
- pmaddubsw xmm0, xmm1
- paddsw xmm0, xmm7
- psrlw xmm0, 2
- packuswb xmm0, xmm0
- sub ecx, 24
- movq qword ptr [edx + 16], xmm0
- lea edx, [edx + 24]
- jg wloop
-
- pop esi
- ret
- }
-}
-
-// Note that movdqa+palign may be better than movdqu.
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
-__declspec(naked) __declspec(align(16))
-static void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- __asm {
- push esi
- mov eax, [esp + 4 + 4] // src_ptr
- mov esi, [esp + 4 + 8] // src_stride
- mov edx, [esp + 4 + 12] // dst_ptr
- mov ecx, [esp + 4 + 16] // dst_width
- movdqa xmm2, kShuf01
- movdqa xmm3, kShuf11
- movdqa xmm4, kShuf21
- movdqa xmm5, kMadd01
- movdqa xmm6, kMadd11
- movdqa xmm7, kRound34
-
- align 16
- wloop:
- movdqa xmm0, [eax] // pixels 0..7
- movdqa xmm1, [eax + esi]
- pavgb xmm1, xmm0
- pavgb xmm0, xmm1
- pshufb xmm0, xmm2
- pmaddubsw xmm0, xmm5
- paddsw xmm0, xmm7
- psrlw xmm0, 2
- packuswb xmm0, xmm0
- movq qword ptr [edx], xmm0
- movdqu xmm0, [eax + 8] // pixels 8..15
- movdqu xmm1, [eax + esi + 8]
- pavgb xmm1, xmm0
- pavgb xmm0, xmm1
- pshufb xmm0, xmm3
- pmaddubsw xmm0, xmm6
- paddsw xmm0, xmm7
- psrlw xmm0, 2
- packuswb xmm0, xmm0
- movq qword ptr [edx + 8], xmm0
- movdqa xmm0, [eax + 16] // pixels 16..23
- movdqa xmm1, [eax + esi + 16]
- lea eax, [eax + 32]
- pavgb xmm1, xmm0
- pavgb xmm0, xmm1
- pshufb xmm0, xmm4
- movdqa xmm1, kMadd21
- pmaddubsw xmm0, xmm1
- paddsw xmm0, xmm7
- psrlw xmm0, 2
- packuswb xmm0, xmm0
- sub ecx, 24
- movq qword ptr [edx + 16], xmm0
- lea edx, [edx+24]
- jg wloop
-
- pop esi
- ret
- }
-}
-
-#define HAS_SCALEROWDOWN38_SSSE3
-// 3/8 point sampler
-
-// Scale 32 pixels to 12
-__declspec(naked) __declspec(align(16))
-static void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- __asm {
- mov eax, [esp + 4] // src_ptr
- // src_stride ignored
- mov edx, [esp + 12] // dst_ptr
- mov ecx, [esp + 16] // dst_width
- movdqa xmm4, kShuf38a
- movdqa xmm5, kShuf38b
-
- align 16
- xloop:
- movdqa xmm0, [eax] // 16 pixels -> 0,1,2,3,4,5
- movdqa xmm1, [eax + 16] // 16 pixels -> 6,7,8,9,10,11
- lea eax, [eax + 32]
- pshufb xmm0, xmm4
- pshufb xmm1, xmm5
- paddusb xmm0, xmm1
-
- sub ecx, 12
- movq qword ptr [edx], xmm0 // write 12 pixels
- movhlps xmm1, xmm0
- movd [edx + 8], xmm1
- lea edx, [edx + 12]
- jg xloop
-
- ret
- }
-}
-
-// Scale 16x3 pixels to 6x1 with interpolation
-__declspec(naked) __declspec(align(16))
-static void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- __asm {
- push esi
- mov eax, [esp + 4 + 4] // src_ptr
- mov esi, [esp + 4 + 8] // src_stride
- mov edx, [esp + 4 + 12] // dst_ptr
- mov ecx, [esp + 4 + 16] // dst_width
- movdqa xmm2, kShufAc
- movdqa xmm3, kShufAc3
- movdqa xmm4, kScaleAc33
- pxor xmm5, xmm5
-
- align 16
- xloop:
- movdqa xmm0, [eax] // sum up 3 rows into xmm0/1
- movdqa xmm6, [eax + esi]
- movhlps xmm1, xmm0
- movhlps xmm7, xmm6
- punpcklbw xmm0, xmm5
- punpcklbw xmm1, xmm5
- punpcklbw xmm6, xmm5
- punpcklbw xmm7, xmm5
- paddusw xmm0, xmm6
- paddusw xmm1, xmm7
- movdqa xmm6, [eax + esi * 2]
- lea eax, [eax + 16]
- movhlps xmm7, xmm6
- punpcklbw xmm6, xmm5
- punpcklbw xmm7, xmm5
- paddusw xmm0, xmm6
- paddusw xmm1, xmm7
-
- movdqa xmm6, xmm0 // 8 pixels -> 0,1,2 of xmm6
- psrldq xmm0, 2
- paddusw xmm6, xmm0
- psrldq xmm0, 2
- paddusw xmm6, xmm0
- pshufb xmm6, xmm2
-
- movdqa xmm7, xmm1 // 8 pixels -> 3,4,5 of xmm6
- psrldq xmm1, 2
- paddusw xmm7, xmm1
- psrldq xmm1, 2
- paddusw xmm7, xmm1
- pshufb xmm7, xmm3
- paddusw xmm6, xmm7
-
- pmulhuw xmm6, xmm4 // divide by 9,9,6, 9,9,6
- packuswb xmm6, xmm6
-
- sub ecx, 6
- movd [edx], xmm6 // write 6 pixels
- psrlq xmm6, 16
- movd [edx + 2], xmm6
- lea edx, [edx + 6]
- jg xloop
-
- pop esi
- ret
- }
-}
-
-// Scale 16x2 pixels to 6x1 with interpolation
-__declspec(naked) __declspec(align(16))
-static void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- __asm {
- push esi
- mov eax, [esp + 4 + 4] // src_ptr
- mov esi, [esp + 4 + 8] // src_stride
- mov edx, [esp + 4 + 12] // dst_ptr
- mov ecx, [esp + 4 + 16] // dst_width
- movdqa xmm2, kShufAb0
- movdqa xmm3, kShufAb1
- movdqa xmm4, kShufAb2
- movdqa xmm5, kScaleAb2
-
- align 16
- xloop:
- movdqa xmm0, [eax] // average 2 rows into xmm0
- pavgb xmm0, [eax + esi]
- lea eax, [eax + 16]
-
- movdqa xmm1, xmm0 // 16 pixels -> 0,1,2,3,4,5 of xmm1
- pshufb xmm1, xmm2
- movdqa xmm6, xmm0
- pshufb xmm6, xmm3
- paddusw xmm1, xmm6
- pshufb xmm0, xmm4
- paddusw xmm1, xmm0
-
- pmulhuw xmm1, xmm5 // divide by 3,3,2, 3,3,2
- packuswb xmm1, xmm1
-
- sub ecx, 6
- movd [edx], xmm1 // write 6 pixels
- psrlq xmm1, 16
- movd [edx + 2], xmm1
- lea edx, [edx + 6]
- jg xloop
-
- pop esi
- ret
- }
-}
-
-#define HAS_SCALEADDROWS_SSE2
-
-// Reads 16xN bytes and produces 16 shorts at a time.
-__declspec(naked) __declspec(align(16))
-static void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint16* dst_ptr, int src_width,
- int src_height) {
- __asm {
- push esi
- push edi
- push ebx
- push ebp
- mov esi, [esp + 16 + 4] // src_ptr
- mov edx, [esp + 16 + 8] // src_stride
- mov edi, [esp + 16 + 12] // dst_ptr
- mov ecx, [esp + 16 + 16] // src_width
- mov ebx, [esp + 16 + 20] // src_height
- pxor xmm4, xmm4
- dec ebx
-
- align 16
- xloop:
- // first row
- movdqa xmm0, [esi]
- lea eax, [esi + edx]
- movdqa xmm1, xmm0
- punpcklbw xmm0, xmm4
- punpckhbw xmm1, xmm4
- lea esi, [esi + 16]
- mov ebp, ebx
- test ebp, ebp
- je ydone
-
- // sum remaining rows
- align 16
- yloop:
- movdqa xmm2, [eax] // read 16 pixels
- lea eax, [eax + edx] // advance to next row
- movdqa xmm3, xmm2
- punpcklbw xmm2, xmm4
- punpckhbw xmm3, xmm4
- paddusw xmm0, xmm2 // sum 16 words
- paddusw xmm1, xmm3
- sub ebp, 1
- jg yloop
- ydone:
- movdqa [edi], xmm0
- movdqa [edi + 16], xmm1
- lea edi, [edi + 32]
-
- sub ecx, 16
- jg xloop
-
- pop ebp
- pop ebx
- pop edi
- pop esi
- ret
- }
-}
-
-#elif !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
-// GCC versions of row functions are verbatim conversions from Visual C.
-// Generated using gcc disassembly on Visual C object file:
-// objdump -D yuvscaler.obj >yuvscaler.txt
-#define HAS_SCALEROWDOWN2_SSE2
-static void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- asm volatile (
- ".p2align 4 \n"
- "1: \n"
- "movdqa (%0),%%xmm0 \n"
- "movdqa 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "psrlw $0x8,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqa %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- :
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1"
-#endif
- );
-}
-
-void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- asm volatile (
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $0x8,%%xmm5 \n"
- ".p2align 4 \n"
- "1: \n"
- "movdqa (%0),%%xmm0 \n"
- "movdqa 0x10(%0),%%xmm1 \n"
- "movdqa (%0,%3,1),%%xmm2 \n"
- "movdqa 0x10(%0,%3,1),%%xmm3 \n"
- "lea 0x20(%0),%0 \n"
- "pavgb %%xmm2,%%xmm0 \n"
- "pavgb %%xmm3,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "psrlw $0x8,%%xmm0 \n"
- "movdqa %%xmm1,%%xmm3 \n"
- "psrlw $0x8,%%xmm1 \n"
- "pand %%xmm5,%%xmm2 \n"
- "pand %%xmm5,%%xmm3 \n"
- "pavgw %%xmm2,%%xmm0 \n"
- "pavgw %%xmm3,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqa %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- : "r"(static_cast<intptr_t>(src_stride)) // %3
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-#endif
- );
-}
-
-static void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- asm volatile (
- ".p2align 4 \n"
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "psrlw $0x8,%%xmm0 \n"
- "psrlw $0x8,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- :
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1"
-#endif
- );
-}
-
-static void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- asm volatile (
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrlw $0x8,%%xmm5 \n"
- ".p2align 4 \n"
- "1: \n"
- "movdqu (%0),%%xmm0 \n"
- "movdqu 0x10(%0),%%xmm1 \n"
- "movdqu (%0,%3,1),%%xmm2 \n"
- "movdqu 0x10(%0,%3,1),%%xmm3 \n"
- "lea 0x20(%0),%0 \n"
- "pavgb %%xmm2,%%xmm0 \n"
- "pavgb %%xmm3,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "psrlw $0x8,%%xmm0 \n"
- "movdqa %%xmm1,%%xmm3 \n"
- "psrlw $0x8,%%xmm1 \n"
- "pand %%xmm5,%%xmm2 \n"
- "pand %%xmm5,%%xmm3 \n"
- "pavgw %%xmm2,%%xmm0 \n"
- "pavgw %%xmm3,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqu %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "sub $0x10,%2 \n"
- "jg 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- : "r"(static_cast<intptr_t>(src_stride)) // %3
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-#endif
- );
-}
-
-#define HAS_SCALEROWDOWN4_SSE2
-static void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- asm volatile (
- "pcmpeqb %%xmm5,%%xmm5 \n"
- "psrld $0x18,%%xmm5 \n"
- "pslld $0x10,%%xmm5 \n"
- ".p2align 4 \n"
- "1: \n"
- "movdqa (%0),%%xmm0 \n"
- "movdqa 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "pand %%xmm5,%%xmm0 \n"
- "pand %%xmm5,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "psrlw $0x8,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movq %%xmm0,(%1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- :
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm5"
-#endif
- );
-}
-
-static void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- intptr_t stridex3 = 0;
- asm volatile (
- "pcmpeqb %%xmm7,%%xmm7 \n"
- "psrlw $0x8,%%xmm7 \n"
- "lea (%4,%4,2),%3 \n"
- ".p2align 4 \n"
- "1: \n"
- "movdqa (%0),%%xmm0 \n"
- "movdqa 0x10(%0),%%xmm1 \n"
- "movdqa (%0,%4,1),%%xmm2 \n"
- "movdqa 0x10(%0,%4,1),%%xmm3 \n"
- "pavgb %%xmm2,%%xmm0 \n"
- "pavgb %%xmm3,%%xmm1 \n"
- "movdqa (%0,%4,2),%%xmm2 \n"
- "movdqa 0x10(%0,%4,2),%%xmm3 \n"
- "movdqa (%0,%3,1),%%xmm4 \n"
- "movdqa 0x10(%0,%3,1),%%xmm5 \n"
- "lea 0x20(%0),%0 \n"
- "pavgb %%xmm4,%%xmm2 \n"
- "pavgb %%xmm2,%%xmm0 \n"
- "pavgb %%xmm5,%%xmm3 \n"
- "pavgb %%xmm3,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "psrlw $0x8,%%xmm0 \n"
- "movdqa %%xmm1,%%xmm3 \n"
- "psrlw $0x8,%%xmm1 \n"
- "pand %%xmm7,%%xmm2 \n"
- "pand %%xmm7,%%xmm3 \n"
- "pavgw %%xmm2,%%xmm0 \n"
- "pavgw %%xmm3,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm0 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "psrlw $0x8,%%xmm0 \n"
- "pand %%xmm7,%%xmm2 \n"
- "pavgw %%xmm2,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movq %%xmm0,(%1) \n"
- "lea 0x8(%1),%1 \n"
- "sub $0x8,%2 \n"
- "jg 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width), // %2
- "+r"(stridex3) // %3
- : "r"(static_cast<intptr_t>(src_stride)) // %4
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm7"
-#endif
- );
-}
-
-#define HAS_SCALEROWDOWN34_SSSE3
-static void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- asm volatile (
- "movdqa %0,%%xmm3 \n"
- "movdqa %1,%%xmm4 \n"
- "movdqa %2,%%xmm5 \n"
- :
- : "m"(kShuf0), // %0
- "m"(kShuf1), // %1
- "m"(kShuf2) // %2
- );
- asm volatile (
- ".p2align 4 \n"
- "1: \n"
- "movdqa (%0),%%xmm0 \n"
- "movdqa 0x10(%0),%%xmm2 \n"
- "lea 0x20(%0),%0 \n"
- "movdqa %%xmm2,%%xmm1 \n"
- "palignr $0x8,%%xmm0,%%xmm1 \n"
- "pshufb %%xmm3,%%xmm0 \n"
- "pshufb %%xmm4,%%xmm1 \n"
- "pshufb %%xmm5,%%xmm2 \n"
- "movq %%xmm0,(%1) \n"
- "movq %%xmm1,0x8(%1) \n"
- "movq %%xmm2,0x10(%1) \n"
- "lea 0x18(%1),%1 \n"
- "sub $0x18,%2 \n"
- "jg 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- :
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-#endif
- );
-}
-
-static void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- asm volatile (
- "movdqa %0,%%xmm2 \n" // kShuf01
- "movdqa %1,%%xmm3 \n" // kShuf11
- "movdqa %2,%%xmm4 \n" // kShuf21
- :
- : "m"(kShuf01), // %0
- "m"(kShuf11), // %1
- "m"(kShuf21) // %2
- );
- asm volatile (
- "movdqa %0,%%xmm5 \n" // kMadd01
- "movdqa %1,%%xmm0 \n" // kMadd11
- "movdqa %2,%%xmm1 \n" // kRound34
- :
- : "m"(kMadd01), // %0
- "m"(kMadd11), // %1
- "m"(kRound34) // %2
- );
- asm volatile (
- ".p2align 4 \n"
- "1: \n"
- "movdqa (%0),%%xmm6 \n"
- "movdqa (%0,%3),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm6 \n"
- "pshufb %%xmm2,%%xmm6 \n"
- "pmaddubsw %%xmm5,%%xmm6 \n"
- "paddsw %%xmm1,%%xmm6 \n"
- "psrlw $0x2,%%xmm6 \n"
- "packuswb %%xmm6,%%xmm6 \n"
- "movq %%xmm6,(%1) \n"
- "movdqu 0x8(%0),%%xmm6 \n"
- "movdqu 0x8(%0,%3),%%xmm7 \n"
- "pavgb %%xmm7,%%xmm6 \n"
- "pshufb %%xmm3,%%xmm6 \n"
- "pmaddubsw %%xmm0,%%xmm6 \n"
- "paddsw %%xmm1,%%xmm6 \n"
- "psrlw $0x2,%%xmm6 \n"
- "packuswb %%xmm6,%%xmm6 \n"
- "movq %%xmm6,0x8(%1) \n"
- "movdqa 0x10(%0),%%xmm6 \n"
- "movdqa 0x10(%0,%3),%%xmm7 \n"
- "lea 0x20(%0),%0 \n"
- "pavgb %%xmm7,%%xmm6 \n"
- "pshufb %%xmm4,%%xmm6 \n"
- "pmaddubsw %4,%%xmm6 \n"
- "paddsw %%xmm1,%%xmm6 \n"
- "psrlw $0x2,%%xmm6 \n"
- "packuswb %%xmm6,%%xmm6 \n"
- "movq %%xmm6,0x10(%1) \n"
- "lea 0x18(%1),%1 \n"
- "sub $0x18,%2 \n"
- "jg 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- : "r"(static_cast<intptr_t>(src_stride)), // %3
- "m"(kMadd21) // %4
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-#endif
- );
-}
-
-static void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- asm volatile (
- "movdqa %0,%%xmm2 \n" // kShuf01
- "movdqa %1,%%xmm3 \n" // kShuf11
- "movdqa %2,%%xmm4 \n" // kShuf21
- :
- : "m"(kShuf01), // %0
- "m"(kShuf11), // %1
- "m"(kShuf21) // %2
- );
- asm volatile (
- "movdqa %0,%%xmm5 \n" // kMadd01
- "movdqa %1,%%xmm0 \n" // kMadd11
- "movdqa %2,%%xmm1 \n" // kRound34
- :
- : "m"(kMadd01), // %0
- "m"(kMadd11), // %1
- "m"(kRound34) // %2
- );
-
- asm volatile (
- ".p2align 4 \n"
- "1: \n"
- "movdqa (%0),%%xmm6 \n"
- "movdqa (%0,%3,1),%%xmm7 \n"
- "pavgb %%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm6 \n"
- "pshufb %%xmm2,%%xmm6 \n"
- "pmaddubsw %%xmm5,%%xmm6 \n"
- "paddsw %%xmm1,%%xmm6 \n"
- "psrlw $0x2,%%xmm6 \n"
- "packuswb %%xmm6,%%xmm6 \n"
- "movq %%xmm6,(%1) \n"
- "movdqu 0x8(%0),%%xmm6 \n"
- "movdqu 0x8(%0,%3,1),%%xmm7 \n"
- "pavgb %%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm6 \n"
- "pshufb %%xmm3,%%xmm6 \n"
- "pmaddubsw %%xmm0,%%xmm6 \n"
- "paddsw %%xmm1,%%xmm6 \n"
- "psrlw $0x2,%%xmm6 \n"
- "packuswb %%xmm6,%%xmm6 \n"
- "movq %%xmm6,0x8(%1) \n"
- "movdqa 0x10(%0),%%xmm6 \n"
- "movdqa 0x10(%0,%3,1),%%xmm7 \n"
- "lea 0x20(%0),%0 \n"
- "pavgb %%xmm6,%%xmm7 \n"
- "pavgb %%xmm7,%%xmm6 \n"
- "pshufb %%xmm4,%%xmm6 \n"
- "pmaddubsw %4,%%xmm6 \n"
- "paddsw %%xmm1,%%xmm6 \n"
- "psrlw $0x2,%%xmm6 \n"
- "packuswb %%xmm6,%%xmm6 \n"
- "movq %%xmm6,0x10(%1) \n"
- "lea 0x18(%1),%1 \n"
- "sub $0x18,%2 \n"
- "jg 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- : "r"(static_cast<intptr_t>(src_stride)), // %3
- "m"(kMadd21) // %4
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-#endif
- );
-}
-
-#define HAS_SCALEROWDOWN38_SSSE3
-static void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- asm volatile (
- "movdqa %3,%%xmm4 \n"
- "movdqa %4,%%xmm5 \n"
- ".p2align 4 \n"
- "1: \n"
- "movdqa (%0),%%xmm0 \n"
- "movdqa 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "pshufb %%xmm4,%%xmm0 \n"
- "pshufb %%xmm5,%%xmm1 \n"
- "paddusb %%xmm1,%%xmm0 \n"
- "movq %%xmm0,(%1) \n"
- "movhlps %%xmm0,%%xmm1 \n"
- "movd %%xmm1,0x8(%1) \n"
- "lea 0xc(%1),%1 \n"
- "sub $0xc,%2 \n"
- "jg 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- : "m"(kShuf38a), // %3
- "m"(kShuf38b) // %4
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm4", "xmm5"
-#endif
- );
-}
-
-static void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- asm volatile (
- "movdqa %0,%%xmm2 \n"
- "movdqa %1,%%xmm3 \n"
- "movdqa %2,%%xmm4 \n"
- "movdqa %3,%%xmm5 \n"
- :
- : "m"(kShufAb0), // %0
- "m"(kShufAb1), // %1
- "m"(kShufAb2), // %2
- "m"(kScaleAb2) // %3
- );
- asm volatile (
- ".p2align 4 \n"
- "1: \n"
- "movdqa (%0),%%xmm0 \n"
- "pavgb (%0,%3,1),%%xmm0 \n"
- "lea 0x10(%0),%0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "pshufb %%xmm2,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm6 \n"
- "pshufb %%xmm3,%%xmm6 \n"
- "paddusw %%xmm6,%%xmm1 \n"
- "pshufb %%xmm4,%%xmm0 \n"
- "paddusw %%xmm0,%%xmm1 \n"
- "pmulhuw %%xmm5,%%xmm1 \n"
- "packuswb %%xmm1,%%xmm1 \n"
- "sub $0x6,%2 \n"
- "movd %%xmm1,(%1) \n"
- "psrlq $0x10,%%xmm1 \n"
- "movd %%xmm1,0x2(%1) \n"
- "lea 0x6(%1),%1 \n"
- "jg 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- : "r"(static_cast<intptr_t>(src_stride)) // %3
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
-#endif
- );
-}
-
-static void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- asm volatile (
- "movdqa %0,%%xmm2 \n"
- "movdqa %1,%%xmm3 \n"
- "movdqa %2,%%xmm4 \n"
- "pxor %%xmm5,%%xmm5 \n"
- :
- : "m"(kShufAc), // %0
- "m"(kShufAc3), // %1
- "m"(kScaleAc33) // %2
- );
- asm volatile (
- ".p2align 4 \n"
- "1: \n"
- "movdqa (%0),%%xmm0 \n"
- "movdqa (%0,%3,1),%%xmm6 \n"
- "movhlps %%xmm0,%%xmm1 \n"
- "movhlps %%xmm6,%%xmm7 \n"
- "punpcklbw %%xmm5,%%xmm0 \n"
- "punpcklbw %%xmm5,%%xmm1 \n"
- "punpcklbw %%xmm5,%%xmm6 \n"
- "punpcklbw %%xmm5,%%xmm7 \n"
- "paddusw %%xmm6,%%xmm0 \n"
- "paddusw %%xmm7,%%xmm1 \n"
- "movdqa (%0,%3,2),%%xmm6 \n"
- "lea 0x10(%0),%0 \n"
- "movhlps %%xmm6,%%xmm7 \n"
- "punpcklbw %%xmm5,%%xmm6 \n"
- "punpcklbw %%xmm5,%%xmm7 \n"
- "paddusw %%xmm6,%%xmm0 \n"
- "paddusw %%xmm7,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm6 \n"
- "psrldq $0x2,%%xmm0 \n"
- "paddusw %%xmm0,%%xmm6 \n"
- "psrldq $0x2,%%xmm0 \n"
- "paddusw %%xmm0,%%xmm6 \n"
- "pshufb %%xmm2,%%xmm6 \n"
- "movdqa %%xmm1,%%xmm7 \n"
- "psrldq $0x2,%%xmm1 \n"
- "paddusw %%xmm1,%%xmm7 \n"
- "psrldq $0x2,%%xmm1 \n"
- "paddusw %%xmm1,%%xmm7 \n"
- "pshufb %%xmm3,%%xmm7 \n"
- "paddusw %%xmm7,%%xmm6 \n"
- "pmulhuw %%xmm4,%%xmm6 \n"
- "packuswb %%xmm6,%%xmm6 \n"
- "sub $0x6,%2 \n"
- "movd %%xmm6,(%1) \n"
- "psrlq $0x10,%%xmm6 \n"
- "movd %%xmm6,0x2(%1) \n"
- "lea 0x6(%1),%1 \n"
- "jg 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- : "r"(static_cast<intptr_t>(src_stride)) // %3
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-#endif
- );
-}
-
-#define HAS_SCALEADDROWS_SSE2
-static void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint16* dst_ptr, int src_width, int src_height) {
- int tmp_height = 0;
- intptr_t tmp_src = 0;
- asm volatile (
- "pxor %%xmm4,%%xmm4 \n"
- "sub $0x1,%5 \n"
- ".p2align 4 \n"
- "1: \n"
- "movdqa (%0),%%xmm0 \n"
- "mov %0,%3 \n"
- "add %6,%0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "punpcklbw %%xmm4,%%xmm0 \n"
- "punpckhbw %%xmm4,%%xmm1 \n"
- "mov %5,%2 \n"
- "test %2,%2 \n"
- "je 3f \n"
- "2: \n"
- "movdqa (%0),%%xmm2 \n"
- "add %6,%0 \n"
- "movdqa %%xmm2,%%xmm3 \n"
- "punpcklbw %%xmm4,%%xmm2 \n"
- "punpckhbw %%xmm4,%%xmm3 \n"
- "paddusw %%xmm2,%%xmm0 \n"
- "paddusw %%xmm3,%%xmm1 \n"
- "sub $0x1,%2 \n"
- "jg 2b \n"
- "3: \n"
- "movdqa %%xmm0,(%1) \n"
- "movdqa %%xmm1,0x10(%1) \n"
- "lea 0x10(%3),%0 \n"
- "lea 0x20(%1),%1 \n"
- "sub $0x10,%4 \n"
- "jg 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(tmp_height), // %2
- "+r"(tmp_src), // %3
- "+r"(src_width), // %4
- "+rm"(src_height) // %5
- : "rm"(static_cast<intptr_t>(src_stride)) // %6
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
-#endif
- );
-}
-
-#endif // defined(__x86_64__) || defined(__i386__)
-
-#if !defined(LIBYUV_DISABLE_MIPS) && \
- defined(__mips_dsp) && (__mips_dsp_rev >= 2)
-#define HAS_SCALEROWDOWN2_MIPS_DSPR2
-void ScaleRowDown2_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */,
- uint8* dst, int dst_width);
-void ScaleRowDown2Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst, int dst_width);
-#define HAS_SCALEROWDOWN4_MIPS_DSPR2
-void ScaleRowDown4_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */,
- uint8* dst, int dst_width);
-void ScaleRowDown4Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst, int dst_width);
-#define HAS_SCALEROWDOWN34_MIPS_DSPR2
-void ScaleRowDown34_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */,
- uint8* dst, int dst_width);
-void ScaleRowDown34_0_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* d, int dst_width);
-void ScaleRowDown34_1_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* d, int dst_width);
-#define HAS_SCALEROWDOWN38_MIPS_DSPR2
-void ScaleRowDown38_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */,
- uint8* dst, int dst_width);
-void ScaleRowDown38_2_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width);
-void ScaleRowDown38_3_Box_MIPS_DSPR2(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width);
-#endif // defined(__mips_dsp) && (__mips_dsp_rev >= 2)
-
-// CPU agnostic row functions
-static void ScaleRowDown2_C(const uint8* src_ptr, ptrdiff_t /* src_stride */,
- uint8* dst, int dst_width) {
- uint8* dend = dst + dst_width - 1;
- do {
- dst[0] = src_ptr[1];
- dst[1] = src_ptr[3];
- dst += 2;
- src_ptr += 4;
- } while (dst < dend);
- if (dst_width & 1) {
- dst[0] = src_ptr[1];
- }
-}
-
-void ScaleRowDown2Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst, int dst_width) {
- const uint8* s = src_ptr;
- const uint8* t = src_ptr + src_stride;
- uint8* dend = dst + dst_width - 1;
- do {
- dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;
- dst[1] = (s[2] + s[3] + t[2] + t[3] + 2) >> 2;
- dst += 2;
- s += 4;
- t += 4;
- } while (dst < dend);
- if (dst_width & 1) {
- dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;
- }
-}
-
-static void ScaleRowDown4_C(const uint8* src_ptr, ptrdiff_t /* src_stride */,
- uint8* dst, int dst_width) {
- uint8* dend = dst + dst_width - 1;
- do {
- dst[0] = src_ptr[2];
- dst[1] = src_ptr[6];
- dst += 2;
- src_ptr += 8;
- } while (dst < dend);
- if (dst_width & 1) {
- dst[0] = src_ptr[2];
- }
-}
-
-static void ScaleRowDown4Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst, int dst_width) {
- intptr_t stride = src_stride;
- uint8* dend = dst + dst_width - 1;
- do {
- dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
- src_ptr[stride + 0] + src_ptr[stride + 1] +
- src_ptr[stride + 2] + src_ptr[stride + 3] +
- src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] +
- src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] +
- src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] +
- src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] +
- 8) >> 4;
- dst[1] = (src_ptr[4] + src_ptr[5] + src_ptr[6] + src_ptr[7] +
- src_ptr[stride + 4] + src_ptr[stride + 5] +
- src_ptr[stride + 6] + src_ptr[stride + 7] +
- src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5] +
- src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7] +
- src_ptr[stride * 3 + 4] + src_ptr[stride * 3 + 5] +
- src_ptr[stride * 3 + 6] + src_ptr[stride * 3 + 7] +
- 8) >> 4;
- dst += 2;
- src_ptr += 8;
- } while (dst < dend);
- if (dst_width & 1) {
- dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
- src_ptr[stride + 0] + src_ptr[stride + 1] +
- src_ptr[stride + 2] + src_ptr[stride + 3] +
- src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] +
- src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] +
- src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] +
- src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] +
- 8) >> 4;
- }
-}
-
-static void ScaleRowDown34_C(const uint8* src_ptr, ptrdiff_t /* src_stride */,
- uint8* dst, int dst_width) {
- assert((dst_width % 3 == 0) && (dst_width > 0));
- uint8* dend = dst + dst_width;
- do {
- dst[0] = src_ptr[0];
- dst[1] = src_ptr[1];
- dst[2] = src_ptr[3];
- dst += 3;
- src_ptr += 4;
- } while (dst < dend);
-}
-
-// Filter rows 0 and 1 together, 3 : 1
-static void ScaleRowDown34_0_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* d, int dst_width) {
- assert((dst_width % 3 == 0) && (dst_width > 0));
- const uint8* s = src_ptr;
- const uint8* t = src_ptr + src_stride;
- uint8* dend = d + dst_width;
- do {
- uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
- uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
- uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;
- uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
- uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
- uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
- d[0] = (a0 * 3 + b0 + 2) >> 2;
- d[1] = (a1 * 3 + b1 + 2) >> 2;
- d[2] = (a2 * 3 + b2 + 2) >> 2;
- d += 3;
- s += 4;
- t += 4;
- } while (d < dend);
-}
-
-// Filter rows 1 and 2 together, 1 : 1
-static void ScaleRowDown34_1_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* d, int dst_width) {
- assert((dst_width % 3 == 0) && (dst_width > 0));
- const uint8* s = src_ptr;
- const uint8* t = src_ptr + src_stride;
- uint8* dend = d + dst_width;
- do {
- uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
- uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
- uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;
- uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
- uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
- uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
- d[0] = (a0 + b0 + 1) >> 1;
- d[1] = (a1 + b1 + 1) >> 1;
- d[2] = (a2 + b2 + 1) >> 1;
- d += 3;
- s += 4;
- t += 4;
- } while (d < dend);
-}
-
-// (1-f)a + fb can be replaced with a + f(b-a)
-#define BLENDER(a, b, f) (static_cast<int>(a) + \
- ((f) * (static_cast<int>(b) - static_cast<int>(a)) >> 16))
-
-static void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr,
- int dst_width, int x, int dx) {
- for (int j = 0; j < dst_width - 1; j += 2) {
- int xi = x >> 16;
- int a = src_ptr[xi];
- int b = src_ptr[xi + 1];
- dst_ptr[0] = BLENDER(a, b, x & 0xffff);
- x += dx;
- xi = x >> 16;
- a = src_ptr[xi];
- b = src_ptr[xi + 1];
- dst_ptr[1] = BLENDER(a, b, x & 0xffff);
- x += dx;
- dst_ptr += 2;
- }
- if (dst_width & 1) {
- int xi = x >> 16;
- int a = src_ptr[xi];
- int b = src_ptr[xi + 1];
- dst_ptr[0] = BLENDER(a, b, x & 0xffff);
- }
-}
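
A minimal standalone sketch (not libyuv code; names are illustrative) of the 16.16 fixed-point blend identity used by BLENDER and ScaleFilterCols_C above, i.e. (1 - f)*a + f*b == a + f*(b - a):

// Demonstrates the 16.16 blend used by the column filter.
#include <cstdio>

static int Blend(int a, int b, int f16) {  // f16: fraction in 16.16 fixed point
  return a + ((f16 * (b - a)) >> 16);
}

int main() {
  // Blend one quarter of the way from 100 to 200; 0x4000 / 65536 == 0.25.
  printf("%d\n", Blend(100, 200, 0x4000));  // prints 125
  return 0;
}
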
-
-static void ScaleRowDown38_C(const uint8* src_ptr, ptrdiff_t /* src_stride */,
- uint8* dst, int dst_width) {
- assert(dst_width % 3 == 0);
- for (int x = 0; x < dst_width; x += 3) {
- dst[0] = src_ptr[0];
- dst[1] = src_ptr[3];
- dst[2] = src_ptr[6];
- dst += 3;
- src_ptr += 8;
- }
-}
-
-// 8x3 -> 3x1
-static void ScaleRowDown38_3_Box_C(const uint8* src_ptr,
- ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- assert((dst_width % 3 == 0) && (dst_width > 0));
- intptr_t stride = src_stride;
- for (int i = 0; i < dst_width; i += 3) {
- dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
- src_ptr[stride + 0] + src_ptr[stride + 1] +
- src_ptr[stride + 2] + src_ptr[stride * 2 + 0] +
- src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) *
- (65536 / 9) >> 16;
- dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
- src_ptr[stride + 3] + src_ptr[stride + 4] +
- src_ptr[stride + 5] + src_ptr[stride * 2 + 3] +
- src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) *
- (65536 / 9) >> 16;
- dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
- src_ptr[stride + 6] + src_ptr[stride + 7] +
- src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) *
- (65536 / 6) >> 16;
- src_ptr += 8;
- dst_ptr += 3;
- }
-}
-
-// 8x2 -> 3x1
-static void ScaleRowDown38_2_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst_ptr, int dst_width) {
- assert((dst_width % 3 == 0) && (dst_width > 0));
- intptr_t stride = src_stride;
- for (int i = 0; i < dst_width; i += 3) {
- dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
- src_ptr[stride + 0] + src_ptr[stride + 1] +
- src_ptr[stride + 2]) * (65536 / 6) >> 16;
- dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
- src_ptr[stride + 3] + src_ptr[stride + 4] +
- src_ptr[stride + 5]) * (65536 / 6) >> 16;
- dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
- src_ptr[stride + 6] + src_ptr[stride + 7]) *
- (65536 / 4) >> 16;
- src_ptr += 8;
- dst_ptr += 3;
- }
-}
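
The (65536 / N) >> 16 expressions above, like the pmulhuw constants in the SSSE3 paths ("divide by 9,9,6" and "divide by 3,3,2"), implement division by a small constant as a reciprocal multiply. A standalone sketch with illustrative values; note the result can be one below exact division near the top of the range because 65536/6 is truncated:

// Reciprocal-multiply division: sum * (65536 / 6) >> 16 approximates sum / 6.
#include <cstdio>

int main() {
  for (int sum = 0; sum <= 1530; sum += 255) {  // six 8-bit samples sum to at most 1530
    int approx = sum * (65536 / 6) >> 16;
    printf("sum=%4d approx=%3d exact=%3d\n", sum, approx, sum / 6);
  }
  return 0;
}
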
-
-void ScaleAddRows_C(const uint8* src_ptr, ptrdiff_t src_stride,
- uint16* dst_ptr, int src_width, int src_height) {
- assert(src_width > 0);
- assert(src_height > 0);
- for (int x = 0; x < src_width; ++x) {
- const uint8* s = src_ptr + x;
- int sum = 0;
- for (int y = 0; y < src_height; ++y) {
- sum += s[0];
- s += src_stride;
- }
- dst_ptr[x] = sum;
- }
-}
+#define SUBSAMPLE(v, a, s) (v < 0) ? (-((-v + a) >> s)) : ((v + a) >> s)
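
A standalone sketch (illustrative values only) of how the SUBSAMPLE macro behaves: with a == 1 and s == 1 it is a rounded-up halving, which is how the half-size chroma dimensions are derived further below.

#include <cstdio>
#define SUBSAMPLE(v, a, s) (v < 0) ? (-((-v + a) >> s)) : ((v + a) >> s)

int main() {
  printf("%d %d %d\n",
         SUBSAMPLE(6, 1, 1),    // 3
         SUBSAMPLE(7, 1, 1),    // 4 : odd sizes round up
         SUBSAMPLE(-7, 1, 1));  // -4: mirrored (negative) sizes round symmetrically
  return 0;
}
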
// Scale plane, 1/2
// This is an optimized version for scaling down a plane to 1/2 of
@@ -1679,7 +43,9 @@ static void ScalePlaneDown2(int /* src_width */, int /* src_height */,
FilterMode filtering) {
void (*ScaleRowDown2)(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) =
- filtering ? ScaleRowDown2Box_C : ScaleRowDown2_C;
+ filtering == kFilterNone ? ScaleRowDown2_C :
+ (filtering == kFilterLinear ? ScaleRowDown2Linear_C :
+ ScaleRowDown2Box_C);
int row_stride = src_stride << 1;
if (!filtering) {
src_ptr += src_stride; // Point to odd rows.
@@ -1692,12 +58,15 @@ static void ScalePlaneDown2(int /* src_width */, int /* src_height */,
}
#elif defined(HAS_SCALEROWDOWN2_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 16)) {
- ScaleRowDown2 = filtering ? ScaleRowDown2Box_Unaligned_SSE2 :
- ScaleRowDown2_Unaligned_SSE2;
+ ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Unaligned_SSE2 :
+ (filtering == kFilterLinear ? ScaleRowDown2Linear_Unaligned_SSE2 :
+ ScaleRowDown2Box_Unaligned_SSE2);
if (IS_ALIGNED(src_ptr, 16) &&
IS_ALIGNED(src_stride, 16) && IS_ALIGNED(row_stride, 16) &&
IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
- ScaleRowDown2 = filtering ? ScaleRowDown2Box_SSE2 : ScaleRowDown2_SSE2;
+ ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_SSE2 :
+ (filtering == kFilterLinear ? ScaleRowDown2Linear_SSE2 :
+ ScaleRowDown2Box_SSE2);
}
}
#elif defined(HAS_SCALEROWDOWN2_MIPS_DSPR2)
@@ -1709,6 +78,9 @@ static void ScalePlaneDown2(int /* src_width */, int /* src_height */,
}
#endif
+ if (filtering == kFilterLinear) {
+ src_stride = 0;
+ }
// TODO(fbarchard): Loop through source height to allow odd height.
for (int y = 0; y < dst_height; ++y) {
ScaleRowDown2(src_ptr, src_stride, dst_ptr, dst_width);
@@ -1753,6 +125,9 @@ static void ScalePlaneDown4(int /* src_width */, int /* src_height */,
}
#endif
+ if (filtering == kFilterLinear) {
+ src_stride = 0;
+ }
for (int y = 0; y < dst_height; ++y) {
ScaleRowDown4(src_ptr, src_stride, dst_ptr, dst_width);
src_ptr += row_stride;
@@ -1816,14 +191,15 @@ static void ScalePlaneDown34(int /* src_width */, int /* src_height */,
}
#endif
+ const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride;
for (int y = 0; y < dst_height - 2; y += 3) {
- ScaleRowDown34_0(src_ptr, src_stride, dst_ptr, dst_width);
+ ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width);
src_ptr += src_stride;
dst_ptr += dst_stride;
- ScaleRowDown34_1(src_ptr, src_stride, dst_ptr, dst_width);
+ ScaleRowDown34_1(src_ptr, filter_stride, dst_ptr, dst_width);
src_ptr += src_stride;
dst_ptr += dst_stride;
- ScaleRowDown34_0(src_ptr + src_stride, -src_stride,
+ ScaleRowDown34_0(src_ptr + src_stride, -filter_stride,
dst_ptr, dst_width);
src_ptr += src_stride * 2;
dst_ptr += dst_stride;
@@ -1831,7 +207,7 @@ static void ScalePlaneDown34(int /* src_width */, int /* src_height */,
// Remainder 1 or 2 rows with last row vertically unfiltered
if ((dst_height % 3) == 2) {
- ScaleRowDown34_0(src_ptr, src_stride, dst_ptr, dst_width);
+ ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width);
src_ptr += src_stride;
dst_ptr += dst_stride;
ScaleRowDown34_1(src_ptr, 0, dst_ptr, dst_width);
@@ -1908,21 +284,22 @@ static void ScalePlaneDown38(int /* src_width */, int /* src_height */,
}
#endif
+ const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride;
for (int y = 0; y < dst_height - 2; y += 3) {
- ScaleRowDown38_3(src_ptr, src_stride, dst_ptr, dst_width);
+ ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width);
src_ptr += src_stride * 3;
dst_ptr += dst_stride;
- ScaleRowDown38_3(src_ptr, src_stride, dst_ptr, dst_width);
+ ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width);
src_ptr += src_stride * 3;
dst_ptr += dst_stride;
- ScaleRowDown38_2(src_ptr, src_stride, dst_ptr, dst_width);
+ ScaleRowDown38_2(src_ptr, filter_stride, dst_ptr, dst_width);
src_ptr += src_stride * 2;
dst_ptr += dst_stride;
}
// Remainder 1 or 2 rows with last row vertically unfiltered
if ((dst_height % 3) == 2) {
- ScaleRowDown38_3(src_ptr, src_stride, dst_ptr, dst_width);
+ ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width);
src_ptr += src_stride * 3;
dst_ptr += dst_stride;
ScaleRowDown38_3(src_ptr, 0, dst_ptr, dst_width);
@@ -1998,24 +375,22 @@ static void ScaleAddCols1_C(int dst_width, int boxheight, int x, int dx,
// one pixel of destination using fixed point (16.16) to step
// through source, sampling a box of pixels with simple
// averaging.
-
+SAFEBUFFERS
static void ScalePlaneBox(int src_width, int src_height,
int dst_width, int dst_height,
int src_stride, int dst_stride,
const uint8* src_ptr, uint8* dst_ptr) {
assert(dst_width > 0);
assert(dst_height > 0);
- int dx = (Abs(src_width) << 16) / dst_width;
- int dy = (src_height << 16) / dst_height;
+
+ // Initial source x/y coordinate and step values as 16.16 fixed point.
int x = 0;
int y = 0;
- // Negative src_width means horizontally mirror.
- if (src_width < 0) {
- x += (dst_width - 1) * dx;
- dx = -dx;
- src_width = -src_width;
- }
- int maxy = (src_height << 16);
+ int dx = 0;
+ int dy = 0;
+ ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox,
+ &x, &y, &dx, &dy);
+ const int max_y = (src_height << 16);
if (!IS_ALIGNED(src_width, 16) || (src_width > kMaxStride) ||
dst_height * 2 > src_height) {
uint8* dst = dst_ptr;
@@ -2023,8 +398,8 @@ static void ScalePlaneBox(int src_width, int src_height,
int iy = y >> 16;
const uint8* src = src_ptr + iy * src_stride;
y += dy;
- if (y > maxy) {
- y = maxy;
+ if (y > max_y) {
+ y = max_y;
}
int boxheight = (y >> 16) - iy;
ScalePlaneBoxRow_C(dst_width, boxheight,
@@ -2046,6 +421,9 @@ static void ScalePlaneBox(int src_width, int src_height,
}
#if defined(HAS_SCALEADDROWS_SSE2)
if (TestCpuFlag(kCpuHasSSE2) &&
+#ifdef AVOID_OVERREAD
+ IS_ALIGNED(src_width, 16) &&
+#endif
IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
ScaleAddRows = ScaleAddRows_SSE2;
}
@@ -2066,16 +444,25 @@ static void ScalePlaneBox(int src_width, int src_height,
}
}
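
A standalone sketch (illustrative sizes, not library code) of the 16.16 vertical stepping used by the box filter above: dy is the number of source rows per destination row, and boxheight is how many whole source rows fall into each output row.

#include <cstdio>

int main() {
  int src_height = 10, dst_height = 4;
  int dy = (src_height << 16) / dst_height;  // 2.5 source rows per output row
  int y = 0;
  for (int j = 0; j < dst_height; ++j) {
    int iy = y >> 16;                  // first source row of this box
    y += dy;
    int boxheight = (y >> 16) - iy;    // rows averaged for this output row
    printf("dst row %d: src rows %d..%d (%d rows)\n",
           j, iy, iy + boxheight - 1, boxheight);
  }
  return 0;
}
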
-// Scale plane to/from any dimensions, with bilinear interpolation.
-
-void ScalePlaneBilinear(int src_width, int src_height,
- int dst_width, int dst_height,
- int src_stride, int dst_stride,
- const uint8* src_ptr, uint8* dst_ptr) {
+// Scale plane down with bilinear interpolation.
+SAFEBUFFERS
+void ScalePlaneBilinearDown(int src_width, int src_height,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint8* src_ptr, uint8* dst_ptr,
+ FilterMode filtering) {
assert(dst_width > 0);
assert(dst_height > 0);
assert(Abs(src_width) <= kMaxStride);
+ // Initial source x/y coordinate and step values as 16.16 fixed point.
+ int x = 0;
+ int y = 0;
+ int dx = 0;
+ int dy = 0;
+ ScaleSlope(src_width, src_height, dst_width, dst_height, filtering,
+ &x, &y, &dx, &dy);
+
SIMD_ALIGNED(uint8 row[kMaxStride + 16]);
void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr,
@@ -2103,6 +490,14 @@ void ScalePlaneBilinear(int src_width, int src_height,
}
}
#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2) && src_width >= 32) {
+ InterpolateRow = InterpolateRow_Any_AVX2;
+ if (IS_ALIGNED(src_width, 32)) {
+ InterpolateRow = InterpolateRow_AVX2;
+ }
+ }
+#endif
#if defined(HAS_INTERPOLATEROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && src_width >= 16) {
InterpolateRow = InterpolateRow_Any_NEON;
@@ -2119,44 +514,170 @@ void ScalePlaneBilinear(int src_width, int src_height,
}
}
#endif
- int dx = 0;
- int dy = 0;
+
+ void (*ScaleFilterCols)(uint8* dst_ptr, const uint8* src_ptr,
+ int dst_width, int x, int dx) = ScaleFilterCols_C;
+#if defined(HAS_SCALEFILTERCOLS_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ScaleFilterCols = ScaleFilterCols_SSSE3;
+ }
+#endif
+
+ const int max_y = (src_height - 1) << 16;
+ for (int j = 0; j < dst_height; ++j) {
+ if (y > max_y) {
+ y = max_y;
+ }
+ int yi = y >> 16;
+ const uint8* src = src_ptr + yi * src_stride;
+ if (filtering == kFilterLinear) {
+ ScaleFilterCols(dst_ptr, src, dst_width, x, dx);
+ } else {
+ int yf = (y >> 8) & 255;
+ InterpolateRow(row, src, src_stride, src_width, yf);
+ ScaleFilterCols(dst_ptr, row, dst_width, x, dx);
+ }
+ dst_ptr += dst_stride;
+ y += dy;
+ }
+}
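
A standalone sketch (illustrative value) of how the vertical fraction passed to InterpolateRow is taken from the 16.16 y coordinate in the loop above:

#include <cstdio>

int main() {
  int y = (5 << 16) + 0xC000;   // source position 5.75 in 16.16 fixed point
  int yi = y >> 16;             // integer source row: 5
  int yf = (y >> 8) & 255;      // fraction rescaled to 0..255: 192 (0.75)
  printf("yi=%d yf=%d\n", yi, yf);
  return 0;
}
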
+
+// Scale plane up with bilinear interpolation.
+SAFEBUFFERS
+void ScalePlaneBilinearUp(int src_width, int src_height,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint8* src_ptr, uint8* dst_ptr,
+ FilterMode filtering) {
+ assert(src_width != 0);
+ assert(src_height != 0);
+ assert(dst_width > 0);
+ assert(dst_height > 0);
+ assert(Abs(dst_width) <= kMaxStride);
+
+ // Initial source x/y coordinate and step values as 16.16 fixed point.
int x = 0;
int y = 0;
- if (dst_width <= Abs(src_width)) {
- dx = (Abs(src_width) << 16) / dst_width;
- x = (dx >> 1) - 32768;
- } else if (dst_width > 1) {
- dx = ((Abs(src_width) - 1) << 16) / (dst_width - 1);
+ int dx = 0;
+ int dy = 0;
+ ScaleSlope(src_width, src_height, dst_width, dst_height, filtering,
+ &x, &y, &dx, &dy);
+
+ void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr,
+ ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
+ InterpolateRow_C;
+#if defined(HAS_INTERPOLATEROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && dst_width >= 16) {
+ InterpolateRow = InterpolateRow_Any_SSE2;
+ if (IS_ALIGNED(dst_width, 16)) {
+ InterpolateRow = InterpolateRow_Unaligned_SSE2;
+ if (IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
+ InterpolateRow = InterpolateRow_SSE2;
+ }
+ }
}
- // Negative src_width means horizontally mirror.
- if (src_width < 0) {
- x += (dst_width - 1) * dx;
- dx = -dx;
- src_width = -src_width;
+#endif
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && dst_width >= 16) {
+ InterpolateRow = InterpolateRow_Any_SSSE3;
+ if (IS_ALIGNED(dst_width, 16)) {
+ InterpolateRow = InterpolateRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
+ InterpolateRow = InterpolateRow_SSSE3;
+ }
+ }
}
- if (dst_height <= src_height) {
- dy = (src_height << 16) / dst_height;
- y = (dy >> 1) - 32768;
- } else if (dst_height > 1) {
- dy = ((src_height - 1) << 16) / (dst_height - 1);
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2) && dst_width >= 32) {
+ InterpolateRow = InterpolateRow_Any_AVX2;
+ if (IS_ALIGNED(dst_width, 32)) {
+ InterpolateRow = InterpolateRow_AVX2;
+ }
}
- int maxy = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0;
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && dst_width >= 16) {
+ InterpolateRow = InterpolateRow_Any_NEON;
+ if (IS_ALIGNED(dst_width, 16)) {
+ InterpolateRow = InterpolateRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) && dst_width >= 4) {
+ InterpolateRow = InterpolateRow_Any_MIPS_DSPR2;
+ if (IS_ALIGNED(dst_width, 4)) {
+ InterpolateRow = InterpolateRow_MIPS_DSPR2;
+ }
+ }
+#endif
+
+ void (*ScaleFilterCols)(uint8* dst_ptr, const uint8* src_ptr,
+ int dst_width, int x, int dx) =
+ filtering ? ScaleFilterCols_C : ScaleCols_C;
+#if defined(HAS_SCALEFILTERCOLS_SSSE3)
+ if (filtering && TestCpuFlag(kCpuHasSSSE3)) {
+ ScaleFilterCols = ScaleFilterCols_SSSE3;
+ }
+#endif
+ if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
+ ScaleFilterCols = ScaleColsUp2_C;
+#if defined(HAS_SCALECOLS_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) &&
+ IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) &&
+ IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
+ ScaleFilterCols = ScaleColsUp2_SSE2;
+ }
+#endif
+ }
+
+ const int max_y = (src_height - 1) << 16;
+ if (y > max_y) {
+ y = max_y;
+ }
+ int yi = y >> 16;
+ const uint8* src = src_ptr + yi * src_stride;
+ SIMD_ALIGNED(uint8 row[2 * kMaxStride]);
+ uint8* rowptr = row;
+ int rowstride = kMaxStride;
+ int lasty = yi;
+
+ ScaleFilterCols(rowptr, src, dst_width, x, dx);
+ if (src_height > 1) {
+ src += src_stride;
+ }
+ ScaleFilterCols(rowptr + rowstride, src, dst_width, x, dx);
+ src += src_stride;
+
for (int j = 0; j < dst_height; ++j) {
- if (y > maxy) {
- y = maxy;
+ yi = y >> 16;
+ if (yi != lasty) {
+ if (y > max_y) {
+ y = max_y;
+ yi = y >> 16;
+ src = src_ptr + yi * src_stride;
+ }
+ if (yi != lasty) {
+ ScaleFilterCols(rowptr, src, dst_width, x, dx);
+ rowptr += rowstride;
+ rowstride = -rowstride;
+ lasty = yi;
+ src += src_stride;
+ }
+ }
+ if (filtering == kFilterLinear) {
+ InterpolateRow(dst_ptr, rowptr, 0, dst_width, 0);
+ } else {
+ int yf = (y >> 8) & 255;
+ InterpolateRow(dst_ptr, rowptr, rowstride, dst_width, yf);
}
- int yi = y >> 16;
- int yf = (y >> 8) & 255;
- const uint8* src = src_ptr + yi * src_stride;
- InterpolateRow(row, src, src_stride, src_width, yf);
- ScaleFilterCols_C(dst_ptr, row, dst_width, x, dx);
dst_ptr += dst_stride;
y += dy;
}
}
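
A toy standalone sketch of the rowptr/rowstride ping-pong used above: the two halves of one buffer alternate roles, so only newly needed source rows are re-sampled (names and sizes are purely illustrative).

#include <cstdio>

int main() {
  char buf[16] = "rowA\0\0\0\0rowB";  // two 8-byte "rows" in one flat buffer
  char* rowptr = buf;
  int rowstride = 8;
  for (int step = 0; step < 4; ++step) {
    printf("lower=%s upper=%s\n", rowptr, rowptr + rowstride);
    rowptr += rowstride;     // the old upper row becomes the new lower row
    rowstride = -rowstride;  // the freed slot will receive the next source row
  }
  return 0;
}
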
-// Scale plane to/from any dimensions, without interpolation.
+// Scale plane to/from any dimensions, without interpolation.
// Fixed point math is used for performance: The upper 16 bits
// of x and dx is the integer part of the source position and
// the lower 16 bits are the fixed decimal part.
@@ -2165,74 +686,37 @@ static void ScalePlaneSimple(int src_width, int src_height,
int dst_width, int dst_height,
int src_stride, int dst_stride,
const uint8* src_ptr, uint8* dst_ptr) {
- int dx = (Abs(src_width) << 16) / dst_width;
- int dy = (src_height << 16) / dst_height;
- int x = dx >> 1;
- int y = dy >> 1;
- // Negative src_width means horizontally mirror.
- if (src_width < 0) {
- x += (dst_width - 1) * dx;
- dx = -dx;
- src_width = -src_width;
+ // Initial source x/y coordinate and step values as 16.16 fixed point.
+ int x = 0;
+ int y = 0;
+ int dx = 0;
+ int dy = 0;
+ ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterNone,
+ &x, &y, &dx, &dy);
+
+ void (*ScaleCols)(uint8* dst_ptr, const uint8* src_ptr,
+ int dst_width, int x, int dx) = ScaleCols_C;
+ if (src_width * 2 == dst_width && x < 0x8000) {
+ ScaleCols = ScaleColsUp2_C;
+#if defined(HAS_SCALECOLS_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) &&
+ IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) &&
+ IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
+ ScaleCols = ScaleColsUp2_SSE2;
+ }
+#endif
}
- for (int j = 0; j < dst_height; ++j) {
- int xs = x;
- int yi = y >> 16;
- const uint8* src = src_ptr + yi * src_stride;
- uint8* dst = dst_ptr;
- for (int i = 0; i < dst_width; ++i) {
- *dst++ = src[xs >> 16];
- xs += dx;
- }
+ for (int i = 0; i < dst_height; ++i) {
+ ScaleCols(dst_ptr, src_ptr + (y >> 16) * src_stride,
+ dst_width, x, dx);
dst_ptr += dst_stride;
y += dy;
}
}
-// Scale plane to/from any dimensions.
-
-static void ScalePlaneAnySize(int src_width, int src_height,
- int dst_width, int dst_height,
- int src_stride, int dst_stride,
- const uint8* src_ptr, uint8* dst_ptr,
- FilterMode filtering) {
- if (!filtering || src_width > kMaxStride) {
- ScalePlaneSimple(src_width, src_height, dst_width, dst_height,
- src_stride, dst_stride, src_ptr, dst_ptr);
- } else {
- ScalePlaneBilinear(src_width, src_height, dst_width, dst_height,
- src_stride, dst_stride, src_ptr, dst_ptr);
- }
-}
-
-// Scale plane down, any size
-//
-// This is an optimized version for scaling down a plane to any size.
-// The current implementation is ~10 times faster compared to the
-// reference implementation for e.g. XGA->LowResPAL
-
-static void ScalePlaneDown(int src_width, int src_height,
- int dst_width, int dst_height,
- int src_stride, int dst_stride,
- const uint8* src_ptr, uint8* dst_ptr,
- FilterMode filtering) {
- if (!filtering || src_width > kMaxStride) {
- ScalePlaneSimple(src_width, src_height, dst_width, dst_height,
- src_stride, dst_stride, src_ptr, dst_ptr);
- } else if (filtering == kFilterBilinear || dst_height * 2 > src_height) {
- // between 1/2x and 1x use bilinear
- ScalePlaneBilinear(src_width, src_height, dst_width, dst_height,
- src_stride, dst_stride, src_ptr, dst_ptr);
- } else {
- ScalePlaneBox(src_width, src_height, dst_width, dst_height,
- src_stride, dst_stride, src_ptr, dst_ptr);
- }
-}
-
// Scale a plane.
-// This function in turn calls a scaling function suitable for handling
-// the desired resolutions.
+// This function dispatches to a specialized scaler based on scale factor.
LIBYUV_API
void ScalePlane(const uint8* src, int src_stride,
@@ -2240,53 +724,87 @@ void ScalePlane(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
int dst_width, int dst_height,
FilterMode filtering) {
+ // Simplify filtering when possible.
+ filtering = ScaleFilterReduce(src_width, src_height,
+ dst_width, dst_height,
+ filtering);
+
+ // Negative height means invert the image.
+ if (src_height < 0) {
+ src_height = -src_height;
+ src = src + (src_height - 1) * src_stride;
+ src_stride = -src_stride;
+ }
+
// Use specialized scales to improve performance for common resolutions.
// For example, all the 1/2 scalings will use ScalePlaneDown2()
if (dst_width == src_width && dst_height == src_height) {
// Straight copy.
CopyPlane(src, src_stride, dst, dst_stride, dst_width, dst_height);
- } else if (dst_width <= Abs(src_width) && dst_height <= src_height) {
+ return;
+ }
+ if (dst_width == src_width) {
+ int dy = FixedDiv(src_height, dst_height);
+ // Arbitrary scale vertically, but unscaled horizontally.
+ ScalePlaneVertical(src_height,
+ dst_width, dst_height,
+ src_stride, dst_stride, src, dst,
+ 0, 0, dy, 1, filtering);
+ return;
+ }
+ if (dst_width <= Abs(src_width) && dst_height <= src_height) {
// Scale down.
- if (use_reference_impl_) {
- // For testing, allow the optimized versions to be disabled.
- ScalePlaneDown(src_width, src_height, dst_width, dst_height,
- src_stride, dst_stride, src, dst, filtering);
- } else if (4 * dst_width == 3 * src_width &&
- 4 * dst_height == 3 * src_height) {
+ if (4 * dst_width == 3 * src_width &&
+ 4 * dst_height == 3 * src_height) {
// optimized, 3/4
ScalePlaneDown34(src_width, src_height, dst_width, dst_height,
src_stride, dst_stride, src, dst, filtering);
- } else if (2 * dst_width == src_width && 2 * dst_height == src_height) {
+ return;
+ }
+ if (2 * dst_width == src_width && 2 * dst_height == src_height) {
// optimized, 1/2
ScalePlaneDown2(src_width, src_height, dst_width, dst_height,
src_stride, dst_stride, src, dst, filtering);
+ return;
+ }
// 3/8 rounded up for odd sized chroma height.
- } else if (8 * dst_width == 3 * src_width &&
- dst_height == ((src_height * 3 + 7) / 8)) {
+ if (8 * dst_width == 3 * src_width &&
+ dst_height == ((src_height * 3 + 7) / 8)) {
// optimized, 3/8
ScalePlaneDown38(src_width, src_height, dst_width, dst_height,
src_stride, dst_stride, src, dst, filtering);
- } else if (4 * dst_width == src_width && 4 * dst_height == src_height &&
+ return;
+ }
+ if (4 * dst_width == src_width && 4 * dst_height == src_height &&
filtering != kFilterBilinear) {
// optimized, 1/4
ScalePlaneDown4(src_width, src_height, dst_width, dst_height,
src_stride, dst_stride, src, dst, filtering);
- } else {
- // Arbitrary downsample
- ScalePlaneDown(src_width, src_height, dst_width, dst_height,
- src_stride, dst_stride, src, dst, filtering);
+ return;
}
- } else {
- // Arbitrary scale up and/or down.
- ScalePlaneAnySize(src_width, src_height, dst_width, dst_height,
- src_stride, dst_stride, src, dst, filtering);
}
+ if (filtering == kFilterBox && src_width <= kMaxStride &&
+ dst_height * 2 < src_height) {
+ ScalePlaneBox(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst);
+ return;
+ }
+ if (filtering && dst_height > src_height && dst_width <= kMaxStride) {
+ ScalePlaneBilinearUp(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst, filtering);
+ return;
+ }
+ if (filtering && src_width <= kMaxStride) {
+ ScalePlaneBilinearDown(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst, filtering);
+ return;
+ }
+ ScalePlaneSimple(src_width, src_height, dst_width, dst_height,
+ src_stride, dst_stride, src, dst);
}
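
A standalone sketch (illustrative sizes) of the exact-ratio tests the dispatch above uses to pick the specialized 1/2, 3/4 and 1/4 paths:

#include <cstdio>

int main() {
  struct Case { int sw, sh, dw, dh; } cases[] = {
    { 1280, 720, 640, 360 },  // exact 1/2 -> ScalePlaneDown2
    { 640, 480, 480, 360 },   // exact 3/4 -> ScalePlaneDown34
    { 1280, 720, 320, 180 },  // exact 1/4 -> ScalePlaneDown4
  };
  for (int i = 0; i < 3; ++i) {
    const Case& c = cases[i];
    printf("%dx%d -> %dx%d: half=%d threequarters=%d quarter=%d\n",
           c.sw, c.sh, c.dw, c.dh,
           2 * c.dw == c.sw && 2 * c.dh == c.sh,
           4 * c.dw == 3 * c.sw && 4 * c.dh == 3 * c.sh,
           4 * c.dw == c.sw && 4 * c.dh == c.sh);
  }
  return 0;
}
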
// Scale an I420 image.
// This function in turn calls a scaling function for each plane.
-// TODO(fbarchard): Disable UNDER_ALLOCATED_HACK
-#define UNDER_ALLOCATED_HACK 1
LIBYUV_API
int I420Scale(const uint8* src_y, int src_stride_y,
@@ -2299,47 +817,13 @@ int I420Scale(const uint8* src_y, int src_stride_y,
int dst_width, int dst_height,
FilterMode filtering) {
if (!src_y || !src_u || !src_v || src_width == 0 || src_height == 0 ||
- !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0 ||
- src_width > 32767 || src_height > 32767) {
+ !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
return -1;
}
- // Negative height means invert the image.
- if (src_height < 0) {
- src_height = -src_height;
- int halfheight = Half(src_height);
- src_y = src_y + (src_height - 1) * src_stride_y;
- src_u = src_u + (halfheight - 1) * src_stride_u;
- src_v = src_v + (halfheight - 1) * src_stride_v;
- src_stride_y = -src_stride_y;
- src_stride_u = -src_stride_u;
- src_stride_v = -src_stride_v;
- }
- int src_halfwidth = Half(src_width);
- int src_halfheight = Half(src_height);
- int dst_halfwidth = Half(dst_width);
- int dst_halfheight = Half(dst_height);
-
-#ifdef UNDER_ALLOCATED_HACK
- // If caller passed width / 2 for stride, adjust halfwidth to match.
- if ((src_width & 1) && src_stride_u && src_halfwidth > Abs(src_stride_u)) {
- src_halfwidth = src_width >> 1;
- }
- if ((dst_width & 1) && dst_stride_u && dst_halfwidth > Abs(dst_stride_u)) {
- dst_halfwidth = dst_width >> 1;
- }
- // If caller used height / 2 when computing src_v, it will point into what
- // should be the src_u plane. Detect this and reduce halfheight to match.
- int uv_src_plane_size = src_halfwidth * src_halfheight;
- if ((src_height & 1) &&
- (src_v > src_u) && (src_v < (src_u + uv_src_plane_size))) {
- src_halfheight = src_height >> 1;
- }
- int uv_dst_plane_size = dst_halfwidth * dst_halfheight;
- if ((dst_height & 1) &&
- (dst_v > dst_u) && (dst_v < (dst_u + uv_dst_plane_size))) {
- dst_halfheight = dst_height >> 1;
- }
-#endif
+ int src_halfwidth = SUBSAMPLE(src_width, 1, 1);
+ int src_halfheight = SUBSAMPLE(src_height, 1, 1);
+ int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
+ int dst_halfheight = SUBSAMPLE(dst_height, 1, 1);
ScalePlane(src_y, src_stride_y, src_width, src_height,
dst_y, dst_stride_y, dst_width, dst_height,
@@ -2362,60 +846,15 @@ int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v,
int dst_stride_y, int dst_stride_u, int dst_stride_v,
int dst_width, int dst_height,
bool interpolate) {
- if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
- !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0 ||
- src_width > 32767 || src_height > 32767) {
- return -1;
- }
- // Negative height means invert the image.
- if (src_height < 0) {
- src_height = -src_height;
- int halfheight = Half(src_height);
- src_y = src_y + (src_height - 1) * src_stride_y;
- src_u = src_u + (halfheight - 1) * src_stride_u;
- src_v = src_v + (halfheight - 1) * src_stride_v;
- src_stride_y = -src_stride_y;
- src_stride_u = -src_stride_u;
- src_stride_v = -src_stride_v;
- }
- int src_halfwidth = Half(src_width);
- int src_halfheight = Half(src_height);
- int dst_halfwidth = Half(dst_width);
- int dst_halfheight = Half(dst_height);
- FilterMode filtering = interpolate ? kFilterBox : kFilterNone;
-
-#ifdef UNDER_ALLOCATED_HACK
- // If caller passed width / 2 for stride, adjust halfwidth to match.
- if ((src_width & 1) && src_stride_u && src_halfwidth > Abs(src_stride_u)) {
- src_halfwidth = src_width >> 1;
- }
- if ((dst_width & 1) && dst_stride_u && dst_halfwidth > Abs(dst_stride_u)) {
- dst_halfwidth = dst_width >> 1;
- }
- // If caller used height / 2 when computing src_v, it will point into what
- // should be the src_u plane. Detect this and reduce halfheight to match.
- int uv_src_plane_size = src_halfwidth * src_halfheight;
- if ((src_height & 1) &&
- (src_v > src_u) && (src_v < (src_u + uv_src_plane_size))) {
- src_halfheight = src_height >> 1;
- }
- int uv_dst_plane_size = dst_halfwidth * dst_halfheight;
- if ((dst_height & 1) &&
- (dst_v > dst_u) && (dst_v < (dst_u + uv_dst_plane_size))) {
- dst_halfheight = dst_height >> 1;
- }
-#endif
-
- ScalePlane(src_y, src_stride_y, src_width, src_height,
- dst_y, dst_stride_y, dst_width, dst_height,
- filtering);
- ScalePlane(src_u, src_stride_u, src_halfwidth, src_halfheight,
- dst_u, dst_stride_u, dst_halfwidth, dst_halfheight,
- filtering);
- ScalePlane(src_v, src_stride_v, src_halfwidth, src_halfheight,
- dst_v, dst_stride_v, dst_halfwidth, dst_halfheight,
- filtering);
- return 0;
+ return I420Scale(src_y, src_stride_y,
+ src_u, src_stride_u,
+ src_v, src_stride_v,
+ src_width, src_height,
+ dst_y, dst_stride_y,
+ dst_u, dst_stride_u,
+ dst_v, dst_stride_v,
+ dst_width, dst_height,
+ interpolate ? kFilterBox : kFilterNone);
}
// Deprecated api
@@ -2425,15 +864,14 @@ int ScaleOffset(const uint8* src, int src_width, int src_height,
bool interpolate) {
if (!src || src_width <= 0 || src_height <= 0 ||
!dst || dst_width <= 0 || dst_height <= 0 || dst_yoffset < 0 ||
- src_width > 32767 || src_height > 32767 ||
dst_yoffset >= dst_height) {
return -1;
}
dst_yoffset = dst_yoffset & ~1; // chroma requires offset to multiple of 2.
- int src_halfwidth = Half(src_width);
- int src_halfheight = Half(src_height);
- int dst_halfwidth = Half(dst_width);
- int dst_halfheight = Half(dst_height);
+ int src_halfwidth = SUBSAMPLE(src_width, 1, 1);
+ int src_halfheight = SUBSAMPLE(src_height, 1, 1);
+ int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
+ int dst_halfheight = SUBSAMPLE(dst_height, 1, 1);
int aheight = dst_height - dst_yoffset * 2; // actual output height
const uint8* src_y = src;
const uint8* src_u = src + src_width * src_height;
@@ -2444,9 +882,15 @@ int ScaleOffset(const uint8* src, int src_width, int src_height,
(dst_yoffset >> 1) * dst_halfwidth;
uint8* dst_v = dst + dst_width * dst_height + dst_halfwidth * dst_halfheight +
(dst_yoffset >> 1) * dst_halfwidth;
- return Scale(src_y, src_u, src_v, src_width, src_halfwidth, src_halfwidth,
- src_width, src_height, dst_y, dst_u, dst_v, dst_width,
- dst_halfwidth, dst_halfwidth, dst_width, aheight, interpolate);
+ return I420Scale(src_y, src_width,
+ src_u, src_halfwidth,
+ src_v, src_halfwidth,
+ src_width, src_height,
+ dst_y, dst_width,
+ dst_u, dst_halfwidth,
+ dst_v, dst_halfwidth,
+ dst_width, aheight,
+ interpolate ? kFilterBox : kFilterNone);
}
#ifdef __cplusplus
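The SUBSAMPLE(v, 1, 1) expressions above round the dimension up before halving, so an odd luma size still gets a full chroma row and column. A minimal illustration of that rounding for non-negative values (an assumption about the macro's intent, not its actual definition, which also handles negative values):

// Illustrative only: round-up halving, i.e. what SUBSAMPLE(v, 1, 1)
// is expected to compute for v >= 0.  e.g. 639 -> 320, 640 -> 320.
static inline int HalfRoundUp(int v) {
  return (v + 1) >> 1;
}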
diff --git a/chromium/third_party/libyuv/source/scale_argb.cc b/chromium/third_party/libyuv/source/scale_argb.cc
index 5cf14d949ef..fb2222e2a23 100644
--- a/chromium/third_party/libyuv/source/scale_argb.cc
+++ b/chromium/third_party/libyuv/source/scale_argb.cc
@@ -16,6 +16,7 @@
#include "libyuv/cpu_id.h"
#include "libyuv/planar_functions.h" // For CopyARGB
#include "libyuv/row.h"
+#include "libyuv/scale_row.h"
#ifdef __cplusplus
namespace libyuv {
@@ -26,715 +27,9 @@ static __inline int Abs(int v) {
return v >= 0 ? v : -v;
}
-// ARGB scaling uses bilinear or point, but not box filter.
-#if !defined(LIBYUV_DISABLE_NEON) && \
- (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
-#define HAS_SCALEARGBROWDOWNEVEN_NEON
-#define HAS_SCALEARGBROWDOWN2_NEON
-void ScaleARGBRowDownEven_NEON(const uint8* src_argb, int src_stride,
- int src_stepx,
- uint8* dst_argb, int dst_width);
-void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, int src_stride,
- int src_stepx,
- uint8* dst_argb, int dst_width);
-void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */,
- uint8* dst, int dst_width);
-void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst, int dst_width);
-#endif
-
-#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
-#define HAS_SCALEARGBROWDOWN2_SSE2
-// Reads 8 pixels, throws half away and writes 4 even pixels (0, 2, 4, 6)
-// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
-__declspec(naked) __declspec(align(16))
-static void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
- ptrdiff_t /* src_stride */,
- uint8* dst_argb, int dst_width) {
- __asm {
- mov eax, [esp + 4] // src_argb
- // src_stride ignored
- mov edx, [esp + 12] // dst_argb
- mov ecx, [esp + 16] // dst_width
-
- align 16
- wloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- lea eax, [eax + 32]
- shufps xmm0, xmm1, 0xdd
- sub ecx, 4
- movdqa [edx], xmm0
- lea edx, [edx + 16]
- jg wloop
-
- ret
- }
-}
-
-// Blends 8x2 rectangle to 4x1.
-// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
-__declspec(naked) __declspec(align(16))
-static void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
- ptrdiff_t src_stride,
- uint8* dst_argb, int dst_width) {
- __asm {
- push esi
- mov eax, [esp + 4 + 4] // src_argb
- mov esi, [esp + 4 + 8] // src_stride
- mov edx, [esp + 4 + 12] // dst_argb
- mov ecx, [esp + 4 + 16] // dst_width
-
- align 16
- wloop:
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 16]
- movdqa xmm2, [eax + esi]
- movdqa xmm3, [eax + esi + 16]
- lea eax, [eax + 32]
- pavgb xmm0, xmm2 // average rows
- pavgb xmm1, xmm3
- movdqa xmm2, xmm0 // average columns (8 to 4 pixels)
- shufps xmm0, xmm1, 0x88 // even pixels
- shufps xmm2, xmm1, 0xdd // odd pixels
- pavgb xmm0, xmm2
- sub ecx, 4
- movdqa [edx], xmm0
- lea edx, [edx + 16]
- jg wloop
-
- pop esi
- ret
- }
-}
-
-#define HAS_SCALEARGBROWDOWNEVEN_SSE2
-// Reads 4 pixels at a time.
-// Alignment requirement: dst_argb 16 byte aligned.
-__declspec(naked) __declspec(align(16))
-void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
- int src_stepx,
- uint8* dst_argb, int dst_width) {
- __asm {
- push ebx
- push edi
- mov eax, [esp + 8 + 4] // src_argb
- // src_stride ignored
- mov ebx, [esp + 8 + 12] // src_stepx
- mov edx, [esp + 8 + 16] // dst_argb
- mov ecx, [esp + 8 + 20] // dst_width
- lea ebx, [ebx * 4]
- lea edi, [ebx + ebx * 2]
-
- align 16
- wloop:
- movd xmm0, [eax]
- movd xmm1, [eax + ebx]
- punpckldq xmm0, xmm1
- movd xmm2, [eax + ebx * 2]
- movd xmm3, [eax + edi]
- lea eax, [eax + ebx * 4]
- punpckldq xmm2, xmm3
- punpcklqdq xmm0, xmm2
- sub ecx, 4
- movdqa [edx], xmm0
- lea edx, [edx + 16]
- jg wloop
-
- pop edi
- pop ebx
- ret
- }
-}
-
-// Blends four 2x2 to 4x1.
-// Alignment requirement: dst_argb 16 byte aligned.
-__declspec(naked) __declspec(align(16))
-static void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
- ptrdiff_t src_stride,
- int src_stepx,
- uint8* dst_argb, int dst_width) {
- __asm {
- push ebx
- push esi
- push edi
- mov eax, [esp + 12 + 4] // src_argb
- mov esi, [esp + 12 + 8] // src_stride
- mov ebx, [esp + 12 + 12] // src_stepx
- mov edx, [esp + 12 + 16] // dst_argb
- mov ecx, [esp + 12 + 20] // dst_width
- lea esi, [eax + esi] // row1 pointer
- lea ebx, [ebx * 4]
- lea edi, [ebx + ebx * 2]
-
- align 16
- wloop:
- movq xmm0, qword ptr [eax] // row0 4 pairs
- movhps xmm0, qword ptr [eax + ebx]
- movq xmm1, qword ptr [eax + ebx * 2]
- movhps xmm1, qword ptr [eax + edi]
- lea eax, [eax + ebx * 4]
- movq xmm2, qword ptr [esi] // row1 4 pairs
- movhps xmm2, qword ptr [esi + ebx]
- movq xmm3, qword ptr [esi + ebx * 2]
- movhps xmm3, qword ptr [esi + edi]
- lea esi, [esi + ebx * 4]
- pavgb xmm0, xmm2 // average rows
- pavgb xmm1, xmm3
- movdqa xmm2, xmm0 // average columns (8 to 4 pixels)
- shufps xmm0, xmm1, 0x88 // even pixels
- shufps xmm2, xmm1, 0xdd // odd pixels
- pavgb xmm0, xmm2
- sub ecx, 4
- movdqa [edx], xmm0
- lea edx, [edx + 16]
- jg wloop
-
- pop edi
- pop esi
- pop ebx
- ret
- }
-}
-
-// Column scaling unfiltered. SSSE3 version.
-// TODO(fbarchard): Port to Neon
-
-#define HAS_SCALEARGBCOLS_SSE2
-__declspec(naked) __declspec(align(16))
-static void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
- int dst_width, int x, int dx) {
- __asm {
- push esi
- push edi
- mov edi, [esp + 8 + 4] // dst_argb
- mov esi, [esp + 8 + 8] // src_argb
- mov ecx, [esp + 8 + 12] // dst_width
- movd xmm2, [esp + 8 + 16] // x
- movd xmm3, [esp + 8 + 20] // dx
- pextrw eax, xmm2, 1 // get x0 integer. preroll
- sub ecx, 2
- jl xloop29
-
- movdqa xmm0, xmm2 // x1 = x0 + dx
- paddd xmm0, xmm3
- punpckldq xmm2, xmm0 // x0 x1
- punpckldq xmm3, xmm3 // dx dx
- paddd xmm3, xmm3 // dx * 2, dx * 2
- pextrw edx, xmm2, 3 // get x1 integer. preroll
-
- // 2 Pixel loop.
- align 16
- xloop2:
- paddd xmm2, xmm3 // x += dx
- movd xmm0, qword ptr [esi + eax * 4] // 1 source x0 pixels
- movd xmm1, qword ptr [esi + edx * 4] // 1 source x1 pixels
- punpckldq xmm0, xmm1 // x0 x1
- pextrw eax, xmm2, 1 // get x0 integer. next iteration.
- pextrw edx, xmm2, 3 // get x1 integer. next iteration.
- movq qword ptr [edi], xmm0
- lea edi, [edi + 8]
- sub ecx, 2 // 2 pixels
- jge xloop2
- xloop29:
-
- add ecx, 2 - 1
- jl xloop99
-
- // 1 pixel remainder
- movd xmm0, qword ptr [esi + eax * 4] // 1 source x0 pixels
- movd [edi], xmm0
- xloop99:
-
- pop edi
- pop esi
- ret
- }
-}
-
-// Bilinear row filtering combines 2x1 -> 1x1. SSSE3 version.
-// TODO(fbarchard): Port to Neon
-
-// Shuffle table for arranging 2 pixels into pairs for pmaddubsw
-static const uvec8 kShuffleColARGB = {
- 0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel
- 8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel
-};
-
-// Shuffle table for duplicating 2 fractions into 8 bytes each
-static const uvec8 kShuffleFractions = {
- 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
-};
-
-#define HAS_SCALEARGBFILTERCOLS_SSSE3
-__declspec(naked) __declspec(align(16))
-static void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
- int dst_width, int x, int dx) {
- __asm {
- push esi
- push edi
- mov edi, [esp + 8 + 4] // dst_argb
- mov esi, [esp + 8 + 8] // src_argb
- mov ecx, [esp + 8 + 12] // dst_width
- movd xmm2, [esp + 8 + 16] // x
- movd xmm3, [esp + 8 + 20] // dx
- movdqa xmm4, kShuffleColARGB
- movdqa xmm5, kShuffleFractions
- pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction.
- psrlw xmm6, 9
- pextrw eax, xmm2, 1 // get x0 integer. preroll
- sub ecx, 2
- jl xloop29
-
- movdqa xmm0, xmm2 // x1 = x0 + dx
- paddd xmm0, xmm3
- punpckldq xmm2, xmm0 // x0 x1
- punpckldq xmm3, xmm3 // dx dx
- paddd xmm3, xmm3 // dx * 2, dx * 2
- pextrw edx, xmm2, 3 // get x1 integer. preroll
-
- // 2 Pixel loop.
- align 16
- xloop2:
- movdqa xmm1, xmm2 // x0, x1 fractions.
- paddd xmm2, xmm3 // x += dx
- movq xmm0, qword ptr [esi + eax * 4] // 2 source x0 pixels
- psrlw xmm1, 9 // 7 bit fractions.
- movhps xmm0, qword ptr [esi + edx * 4] // 2 source x1 pixels
- pshufb xmm1, xmm5 // 0000000011111111
- pshufb xmm0, xmm4 // arrange pixels into pairs
- pxor xmm1, xmm6 // 0..7f and 7f..0
- pmaddubsw xmm0, xmm1 // argb_argb 16 bit, 2 pixels.
- psrlw xmm0, 7 // argb 8.7 fixed point to low 8 bits.
- pextrw eax, xmm2, 1 // get x0 integer. next iteration.
- pextrw edx, xmm2, 3 // get x1 integer. next iteration.
- packuswb xmm0, xmm0 // argb_argb 8 bits, 2 pixels.
- movq qword ptr [edi], xmm0
- lea edi, [edi + 8]
- sub ecx, 2 // 2 pixels
- jge xloop2
- xloop29:
-
- add ecx, 2 - 1
- jl xloop99
-
- // 1 pixel remainder
- psrlw xmm2, 9 // 7 bit fractions.
- movq xmm0, qword ptr [esi + eax * 4] // 2 source x0 pixels
- pshufb xmm2, xmm5 // 00000000
- pshufb xmm0, xmm4 // arrange pixels into pairs
- pxor xmm2, xmm6 // 0..7f and 7f..0
- pmaddubsw xmm0, xmm2 // argb 16 bit, 1 pixel.
- psrlw xmm0, 7
- packuswb xmm0, xmm0 // argb 8 bits, 1 pixel.
- movd [edi], xmm0
- xloop99:
-
- pop edi
- pop esi
- ret
- }
-}
-
-#elif !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
-// GCC versions of row functions are verbatim conversions from Visual C.
-// Generated using gcc disassembly on Visual C object file:
-// objdump -D yuvscaler.obj >yuvscaler.txt
-#define HAS_SCALEARGBROWDOWN2_SSE2
-static void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
- ptrdiff_t /* src_stride */,
- uint8* dst_argb, int dst_width) {
- asm volatile (
- ".p2align 4 \n"
- "1: \n"
- "movdqa (%0),%%xmm0 \n"
- "movdqa 0x10(%0),%%xmm1 \n"
- "lea 0x20(%0),%0 \n"
- "shufps $0xdd,%%xmm1,%%xmm0 \n"
- "sub $0x4,%2 \n"
- "movdqa %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "jg 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(dst_width) // %2
- :
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1"
-#endif
- );
-}
-
-static void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
- ptrdiff_t src_stride,
- uint8* dst_argb, int dst_width) {
- asm volatile (
- ".p2align 4 \n"
- "1: \n"
- "movdqa (%0),%%xmm0 \n"
- "movdqa 0x10(%0),%%xmm1 \n"
- "movdqa (%0,%3,1),%%xmm2 \n"
- "movdqa 0x10(%0,%3,1),%%xmm3 \n"
- "lea 0x20(%0),%0 \n"
- "pavgb %%xmm2,%%xmm0 \n"
- "pavgb %%xmm3,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "shufps $0x88,%%xmm1,%%xmm0 \n"
- "shufps $0xdd,%%xmm1,%%xmm2 \n"
- "pavgb %%xmm2,%%xmm0 \n"
- "sub $0x4,%2 \n"
- "movdqa %%xmm0,(%1) \n"
- "lea 0x10(%1),%1 \n"
- "jg 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(dst_width) // %2
- : "r"(static_cast<intptr_t>(src_stride)) // %3
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3"
-#endif
- );
-}
-
-#define HAS_SCALEARGBROWDOWNEVEN_SSE2
-// Reads 4 pixels at a time.
-// Alignment requirement: dst_argb 16 byte aligned.
-void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
- int src_stepx,
- uint8* dst_argb, int dst_width) {
- intptr_t src_stepx_x4 = static_cast<intptr_t>(src_stepx);
- intptr_t src_stepx_x12 = 0;
- asm volatile (
- "lea 0x0(,%1,4),%1 \n"
- "lea (%1,%1,2),%4 \n"
- ".p2align 4 \n"
- "1: \n"
- "movd (%0),%%xmm0 \n"
- "movd (%0,%1,1),%%xmm1 \n"
- "punpckldq %%xmm1,%%xmm0 \n"
- "movd (%0,%1,2),%%xmm2 \n"
- "movd (%0,%4,1),%%xmm3 \n"
- "lea (%0,%1,4),%0 \n"
- "punpckldq %%xmm3,%%xmm2 \n"
- "punpcklqdq %%xmm2,%%xmm0 \n"
- "sub $0x4,%3 \n"
- "movdqa %%xmm0,(%2) \n"
- "lea 0x10(%2),%2 \n"
- "jg 1b \n"
- : "+r"(src_argb), // %0
- "+r"(src_stepx_x4), // %1
- "+r"(dst_argb), // %2
- "+r"(dst_width), // %3
- "+r"(src_stepx_x12) // %4
- :
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3"
-#endif
- );
-}
-
-// Blends four 2x2 to 4x1.
-// Alignment requirement: dst_argb 16 byte aligned.
-static void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
- ptrdiff_t src_stride, int src_stepx,
- uint8* dst_argb, int dst_width) {
- intptr_t src_stepx_x4 = static_cast<intptr_t>(src_stepx);
- intptr_t src_stepx_x12 = 0;
- intptr_t row1 = static_cast<intptr_t>(src_stride);
- asm volatile (
- "lea 0x0(,%1,4),%1 \n"
- "lea (%1,%1,2),%4 \n"
- "lea (%0,%5,1),%5 \n"
- ".p2align 4 \n"
- "1: \n"
- "movq (%0),%%xmm0 \n"
- "movhps (%0,%1,1),%%xmm0 \n"
- "movq (%0,%1,2),%%xmm1 \n"
- "movhps (%0,%4,1),%%xmm1 \n"
- "lea (%0,%1,4),%0 \n"
- "movq (%5),%%xmm2 \n"
- "movhps (%5,%1,1),%%xmm2 \n"
- "movq (%5,%1,2),%%xmm3 \n"
- "movhps (%5,%4,1),%%xmm3 \n"
- "lea (%5,%1,4),%5 \n"
- "pavgb %%xmm2,%%xmm0 \n"
- "pavgb %%xmm3,%%xmm1 \n"
- "movdqa %%xmm0,%%xmm2 \n"
- "shufps $0x88,%%xmm1,%%xmm0 \n"
- "shufps $0xdd,%%xmm1,%%xmm2 \n"
- "pavgb %%xmm2,%%xmm0 \n"
- "sub $0x4,%3 \n"
- "movdqa %%xmm0,(%2) \n"
- "lea 0x10(%2),%2 \n"
- "jg 1b \n"
- : "+r"(src_argb), // %0
- "+r"(src_stepx_x4), // %1
- "+r"(dst_argb), // %2
- "+rm"(dst_width), // %3
- "+r"(src_stepx_x12), // %4
- "+r"(row1) // %5
- :
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3"
-#endif
- );
-}
-
-#define HAS_SCALEARGBCOLS_SSE2
-static void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
- int dst_width, int x, int dx) {
- intptr_t x0 = 0, x1 = 0;
- asm volatile (
- "movd %5,%%xmm2 \n"
- "movd %6,%%xmm3 \n"
- "pextrw $0x1,%%xmm2,%k3 \n"
- "sub $0x2,%2 \n"
- "jl 29f \n"
- "movdqa %%xmm2,%%xmm0 \n"
- "paddd %%xmm3,%%xmm0 \n"
- "punpckldq %%xmm0,%%xmm2 \n"
- "punpckldq %%xmm3,%%xmm3 \n"
- "paddd %%xmm3,%%xmm3 \n"
- "pextrw $0x3,%%xmm2,%k4 \n"
-
- ".p2align 4 \n"
- "2: \n"
- "paddd %%xmm3,%%xmm2 \n"
- "movd (%1,%3,4),%%xmm0 \n"
- "movd (%1,%4,4),%%xmm1 \n"
- "punpckldq %%xmm1,%%xmm0 \n"
- "pextrw $0x1,%%xmm2,%k3 \n"
- "pextrw $0x3,%%xmm2,%k4 \n"
- "movq %%xmm0,(%0) \n"
- "lea 0x8(%0),%0 \n"
- "sub $0x2,%2 \n"
- "jge 2b \n"
-
- "29: \n"
- "add $0x1,%2 \n"
- "jl 99f \n"
- "movd (%1,%3,4),%%xmm0 \n"
- "movd %%xmm0,(%0) \n"
- "99: \n"
- : "+r"(dst_argb), // %0
- "+r"(src_argb), // %1
- "+rm"(dst_width), // %2
- "+r"(x0), // %3
- "+r"(x1) // %4
- : "rm"(x), // %5
- "rm"(dx) // %6
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3"
-#endif
- );
-}
-
-#ifdef __APPLE__
-#define CONST
-#else
-#define CONST static const
-#endif
-
-// Shuffle table for arranging 2 pixels into pairs for pmaddubsw
-CONST uvec8 kShuffleColARGB = {
- 0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel
- 8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel
-};
-
-// Shuffle table for duplicating 2 fractions into 8 bytes each
-CONST uvec8 kShuffleFractions = {
- 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
-};
-
-// Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version
-#define HAS_SCALEARGBFILTERCOLS_SSSE3
-static void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
- int dst_width, int x, int dx) {
- intptr_t x0 = 0, x1 = 0;
- asm volatile (
- "movdqa %0,%%xmm4 \n"
- "movdqa %1,%%xmm5 \n"
- :
- : "m"(kShuffleColARGB), // %0
- "m"(kShuffleFractions) // %1
- );
-
- asm volatile (
- "movd %5,%%xmm2 \n"
- "movd %6,%%xmm3 \n"
- "pcmpeqb %%xmm6,%%xmm6 \n"
- "psrlw $0x9,%%xmm6 \n"
- "pextrw $0x1,%%xmm2,%k3 \n"
- "sub $0x2,%2 \n"
- "jl 29f \n"
- "movdqa %%xmm2,%%xmm0 \n"
- "paddd %%xmm3,%%xmm0 \n"
- "punpckldq %%xmm0,%%xmm2 \n"
- "punpckldq %%xmm3,%%xmm3 \n"
- "paddd %%xmm3,%%xmm3 \n"
- "pextrw $0x3,%%xmm2,%k4 \n"
-
- ".p2align 4 \n"
- "2: \n"
- "movdqa %%xmm2,%%xmm1 \n"
- "paddd %%xmm3,%%xmm2 \n"
- "movq (%1,%3,4),%%xmm0 \n"
- "psrlw $0x9,%%xmm1 \n"
- "movhps (%1,%4,4),%%xmm0 \n"
- "pshufb %%xmm5,%%xmm1 \n"
- "pshufb %%xmm4,%%xmm0 \n"
- "pxor %%xmm6,%%xmm1 \n"
- "pmaddubsw %%xmm1,%%xmm0 \n"
- "psrlw $0x7,%%xmm0 \n"
- "pextrw $0x1,%%xmm2,%k3 \n"
- "pextrw $0x3,%%xmm2,%k4 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movq %%xmm0,(%0) \n"
- "lea 0x8(%0),%0 \n"
- "sub $0x2,%2 \n"
- "jge 2b \n"
-
- "29: \n"
- "add $0x1,%2 \n"
- "jl 99f \n"
- "psrlw $0x9,%%xmm2 \n"
- "movq (%1,%3,4),%%xmm0 \n"
- "pshufb %%xmm5,%%xmm2 \n"
- "pshufb %%xmm4,%%xmm0 \n"
- "pxor %%xmm6,%%xmm2 \n"
- "pmaddubsw %%xmm2,%%xmm0 \n"
- "psrlw $0x7,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "movd %%xmm0,(%0) \n"
- "99: \n"
- : "+r"(dst_argb), // %0
- "+r"(src_argb), // %1
- "+rm"(dst_width), // %2
- "+r"(x0), // %3
- "+r"(x1) // %4
- : "rm"(x), // %5
- "rm"(dx) // %6
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
-#endif
- );
-}
-#endif // defined(__x86_64__) || defined(__i386__)
-
-static void ScaleARGBRowDown2_C(const uint8* src_argb,
- ptrdiff_t /* src_stride */,
- uint8* dst_argb, int dst_width) {
- const uint32* src = reinterpret_cast<const uint32*>(src_argb);
- uint32* dst = reinterpret_cast<uint32*>(dst_argb);
-
- for (int x = 0; x < dst_width - 1; x += 2) {
- dst[0] = src[1];
- dst[1] = src[3];
- src += 4;
- dst += 2;
- }
- if (dst_width & 1) {
- dst[0] = src[1];
- }
-}
-
-static void ScaleARGBRowDown2Box_C(const uint8* src_argb, ptrdiff_t src_stride,
- uint8* dst_argb, int dst_width) {
- for (int x = 0; x < dst_width; ++x) {
- dst_argb[0] = (src_argb[0] + src_argb[4] +
- src_argb[src_stride] + src_argb[src_stride + 4] + 2) >> 2;
- dst_argb[1] = (src_argb[1] + src_argb[5] +
- src_argb[src_stride + 1] + src_argb[src_stride + 5] + 2) >> 2;
- dst_argb[2] = (src_argb[2] + src_argb[6] +
- src_argb[src_stride + 2] + src_argb[src_stride + 6] + 2) >> 2;
- dst_argb[3] = (src_argb[3] + src_argb[7] +
- src_argb[src_stride + 3] + src_argb[src_stride + 7] + 2) >> 2;
- src_argb += 8;
- dst_argb += 4;
- }
-}
-
-void ScaleARGBRowDownEven_C(const uint8* src_argb, ptrdiff_t /* src_stride */,
- int src_stepx,
- uint8* dst_argb, int dst_width) {
- const uint32* src = reinterpret_cast<const uint32*>(src_argb);
- uint32* dst = reinterpret_cast<uint32*>(dst_argb);
-
- for (int x = 0; x < dst_width - 1; x += 2) {
- dst[0] = src[0];
- dst[1] = src[src_stepx];
- src += src_stepx * 2;
- dst += 2;
- }
- if (dst_width & 1) {
- dst[0] = src[0];
- }
-}
-
-static void ScaleARGBRowDownEvenBox_C(const uint8* src_argb,
- ptrdiff_t src_stride,
- int src_stepx,
- uint8* dst_argb, int dst_width) {
- for (int x = 0; x < dst_width; ++x) {
- dst_argb[0] = (src_argb[0] + src_argb[4] +
- src_argb[src_stride] + src_argb[src_stride + 4] + 2) >> 2;
- dst_argb[1] = (src_argb[1] + src_argb[5] +
- src_argb[src_stride + 1] + src_argb[src_stride + 5] + 2) >> 2;
- dst_argb[2] = (src_argb[2] + src_argb[6] +
- src_argb[src_stride + 2] + src_argb[src_stride + 6] + 2) >> 2;
- dst_argb[3] = (src_argb[3] + src_argb[7] +
- src_argb[src_stride + 3] + src_argb[src_stride + 7] + 2) >> 2;
- src_argb += src_stepx * 4;
- dst_argb += 4;
- }
-}
-
-// Mimics SSSE3 blender
-#define BLENDER1(a, b, f) ((a) * (0x7f ^ f) + (b) * f) >> 7
-#define BLENDERC(a, b, f, s) static_cast<uint32>( \
- BLENDER1(((a) >> s) & 255, ((b) >> s) & 255, f) << s)
-#define BLENDER(a, b, f) \
- BLENDERC(a, b, f, 24) | BLENDERC(a, b, f, 16) | \
- BLENDERC(a, b, f, 8) | BLENDERC(a, b, f, 0)
-
-static void ScaleARGBFilterCols_C(uint8* dst_argb, const uint8* src_argb,
- int dst_width, int x, int dx) {
- const uint32* src = reinterpret_cast<const uint32*>(src_argb);
- uint32* dst = reinterpret_cast<uint32*>(dst_argb);
- for (int j = 0; j < dst_width - 1; j += 2) {
- int xi = x >> 16;
- int xf = (x >> 9) & 0x7f;
- uint32 a = src[xi];
- uint32 b = src[xi + 1];
- dst[0] = BLENDER(a, b, xf);
- x += dx;
- xi = x >> 16;
- xf = (x >> 9) & 0x7f;
- a = src[xi];
- b = src[xi + 1];
- dst[1] = BLENDER(a, b, xf);
- x += dx;
- dst += 2;
- }
- if (dst_width & 1) {
- int xi = x >> 16;
- int xf = (x >> 9) & 0x7f;
- uint32 a = src[xi];
- uint32 b = src[xi + 1];
- dst[0] = BLENDER(a, b, xf);
- }
-}
-
// ScaleARGB ARGB, 1/2
// This is an optimized version for scaling down an ARGB image to 1/2 of
// its original size.
-
static void ScaleARGBDown2(int /* src_width */, int /* src_height */,
int dst_width, int dst_height,
int src_stride, int dst_stride,
@@ -743,8 +38,8 @@ static void ScaleARGBDown2(int /* src_width */, int /* src_height */,
FilterMode filtering) {
assert(dx == 65536 * 2); // Test scale factor of 2.
assert((dy & 0x1ffff) == 0); // Test vertical scale is multiple of 2.
- // Advance to odd row / even column.
- if (filtering) {
+ // Advance to odd row, even column.
+ if (filtering == kFilterBilinear) {
src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
} else {
src_argb += (y >> 16) * src_stride + ((x >> 16) - 1) * 4;
@@ -752,13 +47,16 @@ static void ScaleARGBDown2(int /* src_width */, int /* src_height */,
int row_stride = src_stride * (dy >> 16);
void (*ScaleARGBRowDown2)(const uint8* src_argb, ptrdiff_t src_stride,
uint8* dst_argb, int dst_width) =
- filtering ? ScaleARGBRowDown2Box_C : ScaleARGBRowDown2_C;
+ filtering == kFilterNone ? ScaleARGBRowDown2_C :
+ (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_C :
+ ScaleARGBRowDown2Box_C);
#if defined(HAS_SCALEARGBROWDOWN2_SSE2)
if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4) &&
IS_ALIGNED(src_argb, 16) && IS_ALIGNED(row_stride, 16) &&
IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
- ScaleARGBRowDown2 = filtering ? ScaleARGBRowDown2Box_SSE2 :
- ScaleARGBRowDown2_SSE2;
+ ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_SSE2 :
+ (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_SSE2 :
+ ScaleARGBRowDown2Box_SSE2);
}
#elif defined(HAS_SCALEARGBROWDOWN2_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8) &&
@@ -768,7 +66,9 @@ static void ScaleARGBDown2(int /* src_width */, int /* src_height */,
}
#endif
- // TODO(fbarchard): Loop through source height to allow odd height.
+ if (filtering == kFilterLinear) {
+ src_stride = 0;
+ }
for (int y = 0; y < dst_height; ++y) {
ScaleARGBRowDown2(src_argb, src_stride, dst_argb, dst_width);
src_argb += row_stride;
@@ -776,6 +76,49 @@ static void ScaleARGBDown2(int /* src_width */, int /* src_height */,
}
}
+// ScaleARGB ARGB, 1/4
+// This is an optimized version for scaling down an ARGB image to 1/4 of
+// its original size.
+static void ScaleARGBDown4Box(int /* src_width */, int /* src_height */,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint8* src_argb, uint8* dst_argb,
+ int x, int dx, int y, int dy) {
+ assert(dx == 65536 * 4); // Test scale factor of 4.
+ assert((dy & 0x3ffff) == 0); // Test vertical scale is multiple of 4.
+
+ assert(dst_width * 2 <= kMaxStride);
+ // TODO(fbarchard): Remove clip_src_width alignment checks.
+ SIMD_ALIGNED(uint8 row[kMaxStride * 2 + 16]);
+
+ // Advance to odd row, even column.
+ src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
+ int row_stride = src_stride * (dy >> 16);
+ void (*ScaleARGBRowDown2)(const uint8* src_argb, ptrdiff_t src_stride,
+ uint8* dst_argb, int dst_width) = ScaleARGBRowDown2Box_C;
+#if defined(HAS_SCALEARGBROWDOWN2_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4) &&
+ IS_ALIGNED(src_argb, 16) && IS_ALIGNED(row_stride, 16) &&
+ IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
+ ScaleARGBRowDown2 = ScaleARGBRowDown2Box_SSE2;
+ }
+#elif defined(HAS_SCALEARGBROWDOWN2_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8) &&
+ IS_ALIGNED(src_argb, 4) && IS_ALIGNED(row_stride, 4)) {
+ ScaleARGBRowDown2 = ScaleARGBRowDown2Box_NEON;
+ }
+#endif
+
+ for (int y = 0; y < dst_height; ++y) {
+ ScaleARGBRowDown2(src_argb, src_stride, row, dst_width * 2);
+ ScaleARGBRowDown2(src_argb + src_stride * 2, src_stride,
+ row + kMaxStride, dst_width * 2);
+ ScaleARGBRowDown2(row, kMaxStride, dst_argb, dst_width);
+ src_argb += row_stride;
+ dst_argb += dst_stride;
+ }
+}
+
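ScaleARGBDown4Box builds the 1/4 reduction from the existing 2x2 box kernel: two passes write a half-width, half-height intermediate into the row buffer, and a third pass averages that intermediate again, which approximates a single 4x4 box average (each pass rounds separately). A sketch of the per-channel result it approximates, with an illustrative helper name:

// Illustrative only: the rounded 4x4 mean that the three 2x2 passes
// above approximate for each colour channel.
static uint8 Box4x4(const uint8 p[4][4]) {
  int sum = 0;
  for (int r = 0; r < 4; ++r)
    for (int c = 0; c < 4; ++c)
      sum += p[r][c];
  return static_cast<uint8>((sum + 8) >> 4);  // divide by 16, rounding
}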
// ScaleARGB ARGB Even
// This is an optimized version for scaling down an ARGB image to an even
// multiple of its original size.
@@ -807,6 +150,9 @@ static void ScaleARGBDownEven(int src_width, int src_height,
}
#endif
+ if (filtering == kFilterLinear) {
+ src_stride = 0;
+ }
for (int y = 0; y < dst_height; ++y) {
ScaleARGBRowDownEven(src_argb, src_stride, col_step, dst_argb, dst_width);
src_argb += row_stride;
@@ -815,11 +161,13 @@ static void ScaleARGBDownEven(int src_width, int src_height,
}
// Scale ARGB down with bilinear interpolation.
+SAFEBUFFERS
static void ScaleARGBBilinearDown(int src_height,
int dst_width, int dst_height,
int src_stride, int dst_stride,
const uint8* src_argb, uint8* dst_argb,
- int x, int dx, int y, int dy) {
+ int x, int dx, int y, int dy,
+ FilterMode filtering) {
assert(src_height > 0);
assert(dst_width > 0);
assert(dst_height > 0);
@@ -859,6 +207,14 @@ static void ScaleARGBBilinearDown(int src_height,
}
}
#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2) && clip_src_width >= 32) {
+ InterpolateRow = InterpolateRow_Any_AVX2;
+ if (IS_ALIGNED(clip_src_width, 32)) {
+ InterpolateRow = InterpolateRow_AVX2;
+ }
+ }
+#endif
#if defined(HAS_INTERPOLATEROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && clip_src_width >= 16) {
InterpolateRow = InterpolateRow_Any_NEON;
@@ -883,27 +239,33 @@ static void ScaleARGBBilinearDown(int src_height,
ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3;
}
#endif
- int maxy = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0;
+ const int max_y = (src_height - 1) << 16;
for (int j = 0; j < dst_height; ++j) {
- if (y > maxy) {
- y = maxy;
+ if (y > max_y) {
+ y = max_y;
}
int yi = y >> 16;
- int yf = (y >> 8) & 255;
const uint8* src = src_argb + yi * src_stride;
- InterpolateRow(row, src, src_stride, clip_src_width, yf);
- ScaleARGBFilterCols(dst_argb, row, dst_width, x, dx);
+ if (filtering == kFilterLinear) {
+ ScaleARGBFilterCols(dst_argb, src, dst_width, x, dx);
+ } else {
+ int yf = (y >> 8) & 255;
+ InterpolateRow(row, src, src_stride, clip_src_width, yf);
+ ScaleARGBFilterCols(dst_argb, row, dst_width, x, dx);
+ }
dst_argb += dst_stride;
y += dy;
}
}
// Scale ARGB up with bilinear interpolation.
+SAFEBUFFERS
static void ScaleARGBBilinearUp(int src_width, int src_height,
int dst_width, int dst_height,
int src_stride, int dst_stride,
const uint8* src_argb, uint8* dst_argb,
- int x, int dx, int y, int dy) {
+ int x, int dx, int y, int dy,
+ FilterMode filtering) {
assert(src_width > 0);
assert(src_height > 0);
assert(dst_width > 0);
@@ -934,6 +296,14 @@ static void ScaleARGBBilinearUp(int src_width, int src_height,
}
}
#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2) && dst_width >= 8) {
+ InterpolateRow = InterpolateRow_Any_AVX2;
+ if (IS_ALIGNED(dst_width, 8)) {
+ InterpolateRow = InterpolateRow_AVX2;
+ }
+ }
+#endif
#if defined(HAS_INTERPOLATEROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && dst_width >= 4) {
InterpolateRow = InterpolateRow_Any_NEON;
@@ -949,15 +319,32 @@ static void ScaleARGBBilinearUp(int src_width, int src_height,
}
#endif
void (*ScaleARGBFilterCols)(uint8* dst_argb, const uint8* src_argb,
- int dst_width, int x, int dx) = ScaleARGBFilterCols_C;
+ int dst_width, int x, int dx) =
+ filtering ? ScaleARGBFilterCols_C : ScaleARGBCols_C;
#if defined(HAS_SCALEARGBFILTERCOLS_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
+ if (filtering && TestCpuFlag(kCpuHasSSSE3)) {
ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3;
}
#endif
- int maxy = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0;
- if (y > maxy) {
- y = maxy;
+#if defined(HAS_SCALEARGBCOLS_SSE2)
+ if (!filtering && TestCpuFlag(kCpuHasSSE2)) {
+ ScaleARGBFilterCols = ScaleARGBCols_SSE2;
+ }
+#endif
+ if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
+ ScaleARGBFilterCols = ScaleARGBColsUp2_C;
+#if defined(HAS_SCALEARGBCOLSUP2_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) &&
+ IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) &&
+ IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
+ ScaleARGBFilterCols = ScaleARGBColsUp2_SSE2;
+ }
+#endif
+ }
+
+ const int max_y = (src_height - 1) << 16;
+ if (y > max_y) {
+ y = max_y;
}
int yi = y >> 16;
const uint8* src = src_argb + yi * src_stride;
@@ -976,7 +363,12 @@ static void ScaleARGBBilinearUp(int src_width, int src_height,
for (int j = 0; j < dst_height; ++j) {
yi = y >> 16;
if (yi != lasty) {
- if (y <= maxy) {
+ if (y > max_y) {
+ y = max_y;
+ yi = y >> 16;
+ src = src_argb + yi * src_stride;
+ }
+ if (yi != lasty) {
ScaleARGBFilterCols(rowptr, src, dst_width, x, dx);
rowptr += rowstride;
rowstride = -rowstride;
@@ -984,33 +376,205 @@ static void ScaleARGBBilinearUp(int src_width, int src_height,
src += src_stride;
}
}
- int yf = (y >> 8) & 255;
- InterpolateRow(dst_argb, rowptr, rowstride, dst_width * 4, yf);
+ if (filtering == kFilterLinear) {
+ InterpolateRow(dst_argb, rowptr, 0, dst_width * 4, 0);
+ } else {
+ int yf = (y >> 8) & 255;
+ InterpolateRow(dst_argb, rowptr, rowstride, dst_width * 4, yf);
+ }
dst_argb += dst_stride;
y += dy;
}
}
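The loop above keeps two column-scaled rows in one buffer and flips between them by negating rowstride, so only the row whose integer source index changed is recomputed before InterpolateRow blends the pair. A stripped-down sketch of that ping-pong pattern (names and the fill step are illustrative):

// Illustrative only: alternate between the two halves of a single buffer
// by negating the stride, as ScaleARGBBilinearUp does with rowptr/rowstride.
static void PingPongRows(uint8* two_rows, int row_bytes, int rows_to_fill) {
  uint8* rowptr = two_rows;
  int rowstride = row_bytes;
  for (int i = 0; i < rows_to_fill; ++i) {
    for (int b = 0; b < row_bytes; ++b) {
      rowptr[b] = static_cast<uint8>(i);  // stand-in for ScaleARGBFilterCols
    }
    rowptr += rowstride;     // switch to the other row
    rowstride = -rowstride;  // the next switch goes back again
  }
}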
-// Scales a single row of pixels using point sampling.
-// Code is adapted from libyuv bilinear yuv scaling, but with bilinear
-// interpolation off, and argb pixels instead of yuv.
-static void ScaleARGBCols_C(uint8* dst_argb, const uint8* src_argb,
- int dst_width, int x, int dx) {
- const uint32* src = reinterpret_cast<const uint32*>(src_argb);
- uint32* dst = reinterpret_cast<uint32*>(dst_argb);
- for (int j = 0; j < dst_width - 1; j += 2) {
- dst[0] = src[x >> 16];
- x += dx;
- dst[1] = src[x >> 16];
- x += dx;
- dst += 2;
+#ifdef YUVSCALEUP
+// Scale YUV to ARGB up with bilinear interpolation.
+SAFEBUFFERS
+static void ScaleYUVToARGBBilinearUp(int src_width, int src_height,
+ int dst_width, int dst_height,
+ int src_stride_y,
+ int src_stride_u,
+ int src_stride_v,
+ int dst_stride_argb,
+ const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int x, int dx, int y, int dy,
+ FilterMode filtering) {
+ assert(src_width > 0);
+ assert(src_height > 0);
+ assert(dst_width > 0);
+ assert(dst_height > 0);
+ assert(dst_width * 4 <= kMaxStride);
+
+ void (*I422ToARGBRow)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) = I422ToARGBRow_C;
+#if defined(HAS_I422TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && src_width >= 8) {
+ I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(src_width, 8)) {
+ I422ToARGBRow = I422ToARGBRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ I422ToARGBRow = I422ToARGBRow_SSSE3;
+ }
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2) && src_width >= 16) {
+ I422ToARGBRow = I422ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(src_width, 16)) {
+ I422ToARGBRow = I422ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && src_width >= 8) {
+ I422ToARGBRow = I422ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(src_width, 8)) {
+ I422ToARGBRow = I422ToARGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(src_width, 4) &&
+ IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
+ IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
+ IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
+ IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
+ I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2;
+ }
+#endif
+
+ void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb,
+ ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
+ InterpolateRow_C;
+#if defined(HAS_INTERPOLATEROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && dst_width >= 4) {
+ InterpolateRow = InterpolateRow_Any_SSE2;
+ if (IS_ALIGNED(dst_width, 4)) {
+ InterpolateRow = InterpolateRow_Unaligned_SSE2;
+ if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ InterpolateRow = InterpolateRow_SSE2;
+ }
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && dst_width >= 4) {
+ InterpolateRow = InterpolateRow_Any_SSSE3;
+ if (IS_ALIGNED(dst_width, 4)) {
+ InterpolateRow = InterpolateRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ InterpolateRow = InterpolateRow_SSSE3;
+ }
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2) && dst_width >= 8) {
+ InterpolateRow = InterpolateRow_Any_AVX2;
+ if (IS_ALIGNED(dst_width, 8)) {
+ InterpolateRow = InterpolateRow_AVX2;
+ }
}
- if (dst_width & 1) {
- dst[0] = src[x >> 16];
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && dst_width >= 4) {
+ InterpolateRow = InterpolateRow_Any_NEON;
+ if (IS_ALIGNED(dst_width, 4)) {
+ InterpolateRow = InterpolateRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROWS_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) && dst_width >= 1 &&
+ IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
+ InterpolateRow = InterpolateRow_MIPS_DSPR2;
+ }
+#endif
+ void (*ScaleARGBFilterCols)(uint8* dst_argb, const uint8* src_argb,
+ int dst_width, int x, int dx) = ScaleARGBFilterCols_C;
+#if defined(HAS_SCALEARGBFILTERCOLS_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3;
+ }
+#endif
+ const int max_y = (src_height - 1) << 16;
+ if (y > max_y) {
+ y = max_y;
+ }
+ const int kYShift = 1; // Shift Y by 1 to convert Y plane to UV coordinate.
+ int yi = y >> 16;
+ int uv_yi = yi >> kYShift;
+ const uint8* src_row_y = src_y + yi * src_stride_y;
+ const uint8* src_row_u = src_u + uv_yi * src_stride_u;
+ const uint8* src_row_v = src_v + uv_yi * src_stride_v;
+ SIMD_ALIGNED(uint8 row[2 * kMaxStride]);
+ SIMD_ALIGNED(uint8 argb_row[kMaxStride * 4]);
+ uint8* rowptr = row;
+ int rowstride = kMaxStride;
+ int lasty = yi;
+
+ ScaleARGBFilterCols(rowptr, src_row_y, dst_width, x, dx);
+ if (src_height > 1) {
+ src_row_y += src_stride_y;
+ if (yi & 1) {
+ src_row_u += src_stride_u;
+ src_row_v += src_stride_v;
+ }
+ }
+ ScaleARGBFilterCols(rowptr + rowstride, src_row_y, dst_width, x, dx);
+ if (src_height > 2) {
+ src_row_y += src_stride_y;
+ if (!(yi & 1)) {
+ src_row_u += src_stride_u;
+ src_row_v += src_stride_v;
+ }
+ }
+
+ for (int j = 0; j < dst_height; ++j) {
+ yi = y >> 16;
+ if (yi != lasty) {
+ if (y > max_y) {
+ y = max_y;
+ yi = y >> 16;
+ uv_yi = yi >> kYShift;
+ src_row_y = src_y + yi * src_stride_y;
+ src_row_u = src_u + uv_yi * src_stride_u;
+ src_row_v = src_v + uv_yi * src_stride_v;
+ }
+ if (yi != lasty) {
+ // TODO(fbarchard): Convert the clipped region of row.
+ I422ToARGBRow(src_row_y, src_row_u, src_row_v, argb_row, src_width);
+ ScaleARGBFilterCols(rowptr, argb_row, dst_width, x, dx);
+ rowptr += rowstride;
+ rowstride = -rowstride;
+ lasty = yi;
+ src_row_y += src_stride_y;
+ if (yi & 1) {
+ src_row_u += src_stride_u;
+ src_row_v += src_stride_v;
+ }
+ }
+ }
+ if (filtering == kFilterLinear) {
+ InterpolateRow(dst_argb, rowptr, 0, dst_width * 4, 0);
+ } else {
+ int yf = (y >> 8) & 255;
+ InterpolateRow(dst_argb, rowptr, rowstride, dst_width * 4, yf);
+ }
+ dst_argb += dst_stride_argb;
+ y += dy;
}
}
+#endif
-// ScaleARGB ARGB to/from any dimensions, without interpolation.
+// Scale ARGB to/from any dimensions, without interpolation.
// Fixed point math is used for performance: The upper 16 bits
// of x and dx is the integer part of the source position and
// the lower 16 bits are the fixed decimal part.
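As a concrete example of the 16.16 arithmetic described above: scaling 640 source columns to 480 destination columns gives dx = (640 << 16) / 480 = 0x15555, so stepping x by dx and taking x >> 16 drops roughly every fourth source column (indices 0, 1, 3, 4, 5, 7, ...). A minimal sketch, using the point-sampling start x = dx / 2 shown in the removed block further below:

// Illustrative only: 16.16 fixed-point source stepping for 640 -> 480.
static void FixedPointIndices(int* out, int count) {
  const int dx = (640 << 16) / 480;  // ~1.333 in 16.16 fixed point
  int x = dx >> 1;                   // start at the first pixel centre
  for (int i = 0; i < count; ++i) {
    out[i] = x >> 16;                // integer source column
    x += dx;
  }
}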
@@ -1027,6 +591,16 @@ static void ScaleARGBSimple(int src_width, int src_height,
ScaleARGBCols = ScaleARGBCols_SSE2;
}
#endif
+ if (src_width * 2 == dst_width && x < 0x8000) {
+ ScaleARGBCols = ScaleARGBColsUp2_C;
+#if defined(HAS_SCALEARGBCOLSUP2_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) &&
+ IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) &&
+ IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
+ ScaleARGBCols = ScaleARGBColsUp2_SSE2;
+ }
+#endif
+ }
for (int i = 0; i < dst_height; ++i) {
ScaleARGBCols(dst_argb, src_argb + (y >> 16) * src_stride,
@@ -1036,33 +610,6 @@ static void ScaleARGBSimple(int src_width, int src_height,
}
}
-// ScaleARGB ARGB to/from any dimensions.
-static void ScaleARGBAnySize(int src_width, int src_height,
- int dst_width, int dst_height,
- int clip_width, int clip_height,
- int src_stride, int dst_stride,
- const uint8* src_argb, uint8* dst_argb,
- int x, int dx, int y, int dy,
- FilterMode filtering) {
- if (filtering && dy < 65536 && dst_width * 4 <= kMaxStride) {
- ScaleARGBBilinearUp(src_width, src_height,
- clip_width, clip_height,
- src_stride, dst_stride, src_argb, dst_argb,
- x, dx, y, dy);
- return;
- }
- if (filtering && src_width * 4 < kMaxStride) {
- ScaleARGBBilinearDown(src_height,
- clip_width, clip_height,
- src_stride, dst_stride, src_argb, dst_argb,
- x, dx, y, dy);
- return;
- }
- ScaleARGBSimple(src_width, src_height, clip_width, clip_height,
- src_stride, dst_stride, src_argb, dst_argb,
- x, dx, y, dy);
-}
-
// ScaleARGB a ARGB.
// This function in turn calls a scaling function
// suitable for handling the desired resolutions.
@@ -1072,6 +619,12 @@ static void ScaleARGB(const uint8* src, int src_stride,
int dst_width, int dst_height,
int clip_x, int clip_y, int clip_width, int clip_height,
FilterMode filtering) {
+ // ARGB does not support box filter yet, but allow the user to pass it.
+ // Simplify filtering when possible.
+ filtering = ScaleFilterReduce(src_width, src_height,
+ dst_width, dst_height,
+ filtering);
+
// Negative src_height means invert the image.
if (src_height < 0) {
src_height = -src_height;
@@ -1079,37 +632,12 @@ static void ScaleARGB(const uint8* src, int src_stride,
src_stride = -src_stride;
}
// Initial source x/y coordinate and step values as 16.16 fixed point.
- int dx = 0;
- int dy = 0;
int x = 0;
int y = 0;
- if (filtering) {
- // Scale step for bilinear sampling renders last pixel once for upsample.
- if (dst_width <= Abs(src_width)) {
- dx = (Abs(src_width) << 16) / dst_width;
- x = (dx >> 1) - 32768;
- } else if (dst_width > 1) {
- dx = ((Abs(src_width) - 1) << 16) / (dst_width - 1);
- }
- if (dst_height <= src_height) {
- dy = (src_height << 16) / dst_height;
- y = (dy >> 1) - 32768;
- } else if (dst_height > 1) {
- dy = ((src_height - 1) << 16) / (dst_height - 1);
- }
- } else {
- // Scale step for point sampling duplicates all pixels equally.
- dx = (Abs(src_width) << 16) / dst_width;
- dy = (src_height << 16) / dst_height;
- x = dx >> 1;
- y = dy >> 1;
- }
- // Negative src_width means horizontally mirror.
- if (src_width < 0) {
- x += (dst_width - 1) * dx;
- dx = -dx;
- src_width = -src_width;
- }
+ int dx = 0;
+ int dy = 0;
+ ScaleSlope(src_width, src_height, dst_width, dst_height, filtering,
+ &x, &y, &dx, &dy);
if (clip_x) {
x += clip_x * dx;
dst += clip_x * 4;
@@ -1121,19 +649,29 @@ static void ScaleARGB(const uint8* src, int src_stride,
// Special case for integer step values.
if (((dx | dy) & 0xffff) == 0) {
- if (!dx || !dy) {
+ if (!dx || !dy) { // 1 pixel wide and/or tall.
filtering = kFilterNone;
} else {
// Optimized even scale down. ie 2, 4, 6, 8, 10x.
if (!(dx & 0x10000) && !(dy & 0x10000)) {
- if ((dx >> 16) == 2) {
- // Optimized 1/2 horizontal.
- ScaleARGBDown2(src_width, src_height, clip_width, clip_height,
+ if (dx == 0x20000) {
+ // Optimized 1/2 downsample.
+ ScaleARGBDown2(src_width, src_height,
+ clip_width, clip_height,
src_stride, dst_stride, src, dst,
x, dx, y, dy, filtering);
return;
}
- ScaleARGBDownEven(src_width, src_height, clip_width, clip_height,
+ if (dx == 0x40000 && filtering == kFilterBox) {
+ // Optimized 1/4 box downsample.
+ ScaleARGBDown4Box(src_width, src_height,
+ clip_width, clip_height,
+ src_stride, dst_stride, src, dst,
+ x, dx, y, dy);
+ return;
+ }
+ ScaleARGBDownEven(src_width, src_height,
+ clip_width, clip_height,
src_stride, dst_stride, src, dst,
x, dx, y, dy, filtering);
return;
@@ -1141,7 +679,7 @@ static void ScaleARGB(const uint8* src, int src_stride,
// Optimized odd scale down. ie 3, 5, 7, 9x.
if ((dx & 0x10000) && (dy & 0x10000)) {
filtering = kFilterNone;
- if (dst_width == src_width && dst_height == src_height) {
+ if (dx == 0x10000 && dy == 0x10000) {
// Straight copy.
ARGBCopy(src + (y >> 16) * src_stride + (x >> 16) * 4, src_stride,
dst, dst_stride, clip_width, clip_height);
@@ -1150,11 +688,31 @@ static void ScaleARGB(const uint8* src, int src_stride,
}
}
}
- // Arbitrary scale up and/or down.
- ScaleARGBAnySize(src_width, src_height,
- dst_width, dst_height,
- clip_width, clip_height,
- src_stride, dst_stride, src, dst, x, dx, y, dy, filtering);
+ if (dx == 0x10000 && (x & 0xffff) == 0) {
+ // Arbitrary scale vertically, but unscaled horizontally.
+ ScalePlaneVertical(src_height,
+ clip_width, clip_height,
+ src_stride, dst_stride, src, dst,
+ x, y, dy, 4, filtering);
+ return;
+ }
+ if (filtering && dy < 65536 && dst_width * 4 <= kMaxStride) {
+ ScaleARGBBilinearUp(src_width, src_height,
+ clip_width, clip_height,
+ src_stride, dst_stride, src, dst,
+ x, dx, y, dy, filtering);
+ return;
+ }
+ if (filtering && src_width * 4 < kMaxStride) {
+ ScaleARGBBilinearDown(src_height,
+ clip_width, clip_height,
+ src_stride, dst_stride, src, dst,
+ x, dx, y, dy, filtering);
+ return;
+ }
+ ScaleARGBSimple(src_width, src_height, clip_width, clip_height,
+ src_stride, dst_stride, src, dst,
+ x, dx, y, dy);
}
LIBYUV_API
@@ -1167,7 +725,6 @@ int ARGBScaleClip(const uint8* src_argb, int src_stride_argb,
if (!src_argb || src_width == 0 || src_height == 0 ||
!dst_argb || dst_width <= 0 || dst_height <= 0 ||
clip_x < 0 || clip_y < 0 ||
- src_width > 32767 || src_height > 32767 ||
(clip_x + clip_width) > dst_width ||
(clip_y + clip_height) > dst_height) {
return -1;
@@ -1186,8 +743,7 @@ int ARGBScale(const uint8* src_argb, int src_stride_argb,
int dst_width, int dst_height,
FilterMode filtering) {
if (!src_argb || src_width == 0 || src_height == 0 ||
- !dst_argb || dst_width <= 0 || dst_height <= 0 ||
- src_width > 32767 || src_height > 32767) {
+ !dst_argb || dst_width <= 0 || dst_height <= 0) {
return -1;
}
ScaleARGB(src_argb, src_stride_argb, src_width, src_height,
diff --git a/chromium/third_party/libyuv/source/scale_argb_neon.cc b/chromium/third_party/libyuv/source/scale_argb_neon.cc
deleted file mode 100644
index 51b00872441..00000000000
--- a/chromium/third_party/libyuv/source/scale_argb_neon.cc
+++ /dev/null
@@ -1,142 +0,0 @@
-/*
- * Copyright 2012 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/basic_types.h"
-#include "libyuv/row.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// This module is for GCC Neon
-#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)
-
-void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */,
- uint8* dst, int dst_width) {
- asm volatile (
- "1: \n"
- // load even pixels into q0, odd into q1
- "vld2.32 {q0, q1}, [%0]! \n"
- "vld2.32 {q2, q3}, [%0]! \n"
- "subs %2, %2, #8 \n" // 8 processed per loop
- "vst1.8 {q1}, [%1]! \n" // store odd pixels
- "vst1.8 {q3}, [%1]! \n"
- "bgt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst), // %1
- "+r"(dst_width) // %2
- :
- : "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List
- );
-}
-
-void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst, int dst_width) {
- asm volatile (
- // change the stride to row 2 pointer
- "add %1, %1, %0 \n"
- "1: \n"
- "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
- "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
- "subs %3, %3, #8 \n" // 8 processed per loop.
- "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
- "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
- "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
- "vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts.
- "vld4.8 {d16, d18, d20, d22}, [%1]! \n" // load 8 more ARGB pixels.
- "vld4.8 {d17, d19, d21, d23}, [%1]! \n" // load last 8 ARGB pixels.
- "vpadal.u8 q0, q8 \n" // B 16 bytes -> 8 shorts.
- "vpadal.u8 q1, q9 \n" // G 16 bytes -> 8 shorts.
- "vpadal.u8 q2, q10 \n" // R 16 bytes -> 8 shorts.
- "vpadal.u8 q3, q11 \n" // A 16 bytes -> 8 shorts.
- "vrshrn.u16 d0, q0, #2 \n" // downshift, round and pack
- "vrshrn.u16 d1, q1, #2 \n"
- "vrshrn.u16 d2, q2, #2 \n"
- "vrshrn.u16 d3, q3, #2 \n"
- "vst4.8 {d0, d1, d2, d3}, [%2]! \n"
- "bgt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(src_stride), // %1
- "+r"(dst), // %2
- "+r"(dst_width) // %3
- :
- : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
- );
-}
-
-// Reads 4 pixels at a time.
-// Alignment requirement: src_argb 4 byte aligned.
-void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t, int src_stepx,
- uint8* dst_argb, int dst_width) {
- asm volatile (
- "mov r12, %3, lsl #2 \n"
- ".p2align 2 \n"
- "1: \n"
- "vld1.32 {d0[0]}, [%0], r12 \n"
- "vld1.32 {d0[1]}, [%0], r12 \n"
- "vld1.32 {d1[0]}, [%0], r12 \n"
- "vld1.32 {d1[1]}, [%0], r12 \n"
- "subs %2, %2, #4 \n" // 4 pixels per loop.
- "vst1.8 {q0}, [%1]! \n"
- "bgt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_argb), // %1
- "+r"(dst_width) // %2
- : "r"(src_stepx) // %3
- : "memory", "cc", "r12", "q0"
- );
-}
-
-// Reads 4 pixels at a time.
-// Alignment requirement: src_argb 4 byte aligned.
-void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
- int src_stepx,
- uint8* dst_argb, int dst_width) {
- asm volatile (
- "mov r12, %4, lsl #2 \n"
- "add %1, %1, %0 \n"
- ".p2align 2 \n"
- "1: \n"
- "vld1.8 {d0}, [%0], r12 \n" // Read 4 2x2 blocks -> 2x1
- "vld1.8 {d1}, [%1], r12 \n"
- "vld1.8 {d2}, [%0], r12 \n"
- "vld1.8 {d3}, [%1], r12 \n"
- "vld1.8 {d4}, [%0], r12 \n"
- "vld1.8 {d5}, [%1], r12 \n"
- "vld1.8 {d6}, [%0], r12 \n"
- "vld1.8 {d7}, [%1], r12 \n"
- "vaddl.u8 q0, d0, d1 \n"
- "vaddl.u8 q1, d2, d3 \n"
- "vaddl.u8 q2, d4, d5 \n"
- "vaddl.u8 q3, d6, d7 \n"
- "vswp.8 d1, d2 \n" // ab_cd -> ac_bd
- "vswp.8 d5, d6 \n" // ef_gh -> eg_fh
- "vadd.u16 q0, q0, q1 \n" // (a+b)_(c+d)
- "vadd.u16 q2, q2, q3 \n" // (e+f)_(g+h)
- "vrshrn.u16 d0, q0, #2 \n" // first 2 pixels.
- "vrshrn.u16 d1, q2, #2 \n" // next 2 pixels.
- "subs %3, %3, #4 \n" // 4 pixels per loop.
- "vst1.8 {q0}, [%2]! \n"
- "bgt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(src_stride), // %1
- "+r"(dst_argb), // %2
- "+r"(dst_width) // %3
- : "r"(src_stepx) // %4
- : "memory", "cc", "r12", "q0", "q1", "q2", "q3"
- );
-}
-#endif // __ARM_NEON__
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
diff --git a/chromium/third_party/libyuv/source/scale_common.cc b/chromium/third_party/libyuv/source/scale_common.cc
new file mode 100644
index 00000000000..ee6a336292c
--- /dev/null
+++ b/chromium/third_party/libyuv/source/scale_common.cc
@@ -0,0 +1,657 @@
+/*
+ * Copyright 2013 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/scale.h"
+
+#include <assert.h>
+#include <string.h>
+
+#include "libyuv/cpu_id.h"
+#include "libyuv/planar_functions.h" // For CopyARGB
+#include "libyuv/row.h"
+#include "libyuv/scale_row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+static __inline int Abs(int v) {
+ return v >= 0 ? v : -v;
+}
+
+// CPU agnostic row functions
+void ScaleRowDown2_C(const uint8* src_ptr, ptrdiff_t /* src_stride */,
+ uint8* dst, int dst_width) {
+ for (int x = 0; x < dst_width - 1; x += 2) {
+ dst[0] = src_ptr[1];
+ dst[1] = src_ptr[3];
+ dst += 2;
+ src_ptr += 4;
+ }
+ if (dst_width & 1) {
+ dst[0] = src_ptr[1];
+ }
+}
+
+void ScaleRowDown2Linear_C(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width) {
+ const uint8* s = src_ptr;
+ for (int x = 0; x < dst_width - 1; x += 2) {
+ dst[0] = (s[0] + s[1] + 1) >> 1;
+ dst[1] = (s[2] + s[3] + 1) >> 1;
+ dst += 2;
+ s += 4;
+ }
+ if (dst_width & 1) {
+ dst[0] = (s[0] + s[1] + 1) >> 1;
+ }
+}
+
+void ScaleRowDown2Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width) {
+ const uint8* s = src_ptr;
+ const uint8* t = src_ptr + src_stride;
+ for (int x = 0; x < dst_width - 1; x += 2) {
+ dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;
+ dst[1] = (s[2] + s[3] + t[2] + t[3] + 2) >> 2;
+ dst += 2;
+ s += 4;
+ t += 4;
+ }
+ if (dst_width & 1) {
+ dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;
+ }
+}
+
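ScaleRowDown2Box_C adds 2 before shifting right by 2, i.e. a rounded divide by four: adding half the divisor rounds to nearest rather than truncating. A worked example with an illustrative helper:

// Illustrative only: the rounded 2x2 average used above.
static uint8 Box2x2(uint8 a, uint8 b, uint8 c, uint8 d) {
  return static_cast<uint8>((a + b + c + d + 2) >> 2);
}
// Box2x2(10, 20, 30, 43) == 26; plain truncation (103 >> 2) would give 25.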
+void ScaleRowDown4_C(const uint8* src_ptr, ptrdiff_t /* src_stride */,
+ uint8* dst, int dst_width) {
+ for (int x = 0; x < dst_width - 1; x += 2) {
+ dst[0] = src_ptr[2];
+ dst[1] = src_ptr[6];
+ dst += 2;
+ src_ptr += 8;
+ }
+ if (dst_width & 1) {
+ dst[0] = src_ptr[2];
+ }
+}
+
+void ScaleRowDown4Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width) {
+ intptr_t stride = src_stride;
+ for (int x = 0; x < dst_width - 1; x += 2) {
+ dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
+ src_ptr[stride + 0] + src_ptr[stride + 1] +
+ src_ptr[stride + 2] + src_ptr[stride + 3] +
+ src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] +
+ src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] +
+ src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] +
+ src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] +
+ 8) >> 4;
+ dst[1] = (src_ptr[4] + src_ptr[5] + src_ptr[6] + src_ptr[7] +
+ src_ptr[stride + 4] + src_ptr[stride + 5] +
+ src_ptr[stride + 6] + src_ptr[stride + 7] +
+ src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5] +
+ src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7] +
+ src_ptr[stride * 3 + 4] + src_ptr[stride * 3 + 5] +
+ src_ptr[stride * 3 + 6] + src_ptr[stride * 3 + 7] +
+ 8) >> 4;
+ dst += 2;
+ src_ptr += 8;
+ }
+ if (dst_width & 1) {
+ dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
+ src_ptr[stride + 0] + src_ptr[stride + 1] +
+ src_ptr[stride + 2] + src_ptr[stride + 3] +
+ src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] +
+ src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] +
+ src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] +
+ src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] +
+ 8) >> 4;
+ }
+}
+
+void ScaleRowDown34_C(const uint8* src_ptr, ptrdiff_t /* src_stride */,
+ uint8* dst, int dst_width) {
+ assert((dst_width % 3 == 0) && (dst_width > 0));
+ for (int x = 0; x < dst_width; x += 3) {
+ dst[0] = src_ptr[0];
+ dst[1] = src_ptr[1];
+ dst[2] = src_ptr[3];
+ dst += 3;
+ src_ptr += 4;
+ }
+}
+
+// Filter rows 0 and 1 together, 3 : 1
+void ScaleRowDown34_0_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* d, int dst_width) {
+ assert((dst_width % 3 == 0) && (dst_width > 0));
+ const uint8* s = src_ptr;
+ const uint8* t = src_ptr + src_stride;
+ for (int x = 0; x < dst_width; x += 3) {
+ uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
+ uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
+ uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;
+ uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
+ uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
+ uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
+ d[0] = (a0 * 3 + b0 + 2) >> 2;
+ d[1] = (a1 * 3 + b1 + 2) >> 2;
+ d[2] = (a2 * 3 + b2 + 2) >> 2;
+ d += 3;
+ s += 4;
+ t += 4;
+ }
+}
+
+// Filter rows 1 and 2 together, 1 : 1
+void ScaleRowDown34_1_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* d, int dst_width) {
+ assert((dst_width % 3 == 0) && (dst_width > 0));
+ const uint8* s = src_ptr;
+ const uint8* t = src_ptr + src_stride;
+ for (int x = 0; x < dst_width; x += 3) {
+ uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
+ uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
+ uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;
+ uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
+ uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
+ uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
+ d[0] = (a0 + b0 + 1) >> 1;
+ d[1] = (a1 + b1 + 1) >> 1;
+ d[2] = (a2 + b2 + 1) >> 1;
+ d += 3;
+ s += 4;
+ t += 4;
+ }
+}
+
+// Scales a single row of pixels using point sampling.
+void ScaleCols_C(uint8* dst_ptr, const uint8* src_ptr,
+ int dst_width, int x, int dx) {
+ for (int j = 0; j < dst_width - 1; j += 2) {
+ dst_ptr[0] = src_ptr[x >> 16];
+ x += dx;
+ dst_ptr[1] = src_ptr[x >> 16];
+ x += dx;
+ dst_ptr += 2;
+ }
+ if (dst_width & 1) {
+ dst_ptr[0] = src_ptr[x >> 16];
+ }
+}
+
+// Scales a single row of pixels up by 2x using point sampling.
+void ScaleColsUp2_C(uint8* dst_ptr, const uint8* src_ptr,
+ int dst_width, int, int) {
+ for (int j = 0; j < dst_width - 1; j += 2) {
+ dst_ptr[1] = dst_ptr[0] = src_ptr[0];
+ src_ptr += 1;
+ dst_ptr += 2;
+ }
+ if (dst_width & 1) {
+ dst_ptr[0] = src_ptr[0];
+ }
+}
+
+// (1-f)a + fb can be replaced with a + f(b-a)
+#define BLENDER(a, b, f) (static_cast<int>(a) + \
+ ((f) * (static_cast<int>(b) - static_cast<int>(a)) >> 16))
+
+void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr,
+ int dst_width, int x, int dx) {
+ for (int j = 0; j < dst_width - 1; j += 2) {
+ int xi = x >> 16;
+ int a = src_ptr[xi];
+ int b = src_ptr[xi + 1];
+ dst_ptr[0] = BLENDER(a, b, x & 0xffff);
+ x += dx;
+ xi = x >> 16;
+ a = src_ptr[xi];
+ b = src_ptr[xi + 1];
+ dst_ptr[1] = BLENDER(a, b, x & 0xffff);
+ x += dx;
+ dst_ptr += 2;
+ }
+ if (dst_width & 1) {
+ int xi = x >> 16;
+ int a = src_ptr[xi];
+ int b = src_ptr[xi + 1];
+ dst_ptr[0] = BLENDER(a, b, x & 0xffff);
+ }
+}
+#undef BLENDER
+
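The BLENDER macro above uses the identity (1 - f)*a + f*b == a + f*(b - a), which needs only one multiply per sample when f is a 16.16 fraction. A quick fixed-point check with illustrative values:

// Illustrative only: blend a = 10 and b = 200 at f = 0.5 (0x8000 in 16.16).
static int BlendCheck() {
  const int a = 10, b = 200;
  const int f = 0x8000;              // 0.5
  return a + ((f * (b - a)) >> 16);  // == 105 == (1 - 0.5)*10 + 0.5*200
}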
+void ScaleRowDown38_C(const uint8* src_ptr, ptrdiff_t /* src_stride */,
+ uint8* dst, int dst_width) {
+ assert(dst_width % 3 == 0);
+ for (int x = 0; x < dst_width; x += 3) {
+ dst[0] = src_ptr[0];
+ dst[1] = src_ptr[3];
+ dst[2] = src_ptr[6];
+ dst += 3;
+ src_ptr += 8;
+ }
+}
+
+// 8x3 -> 3x1
+void ScaleRowDown38_3_Box_C(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ assert((dst_width % 3 == 0) && (dst_width > 0));
+ intptr_t stride = src_stride;
+ for (int i = 0; i < dst_width; i += 3) {
+ dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
+ src_ptr[stride + 0] + src_ptr[stride + 1] +
+ src_ptr[stride + 2] + src_ptr[stride * 2 + 0] +
+ src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) *
+ (65536 / 9) >> 16;
+ dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
+ src_ptr[stride + 3] + src_ptr[stride + 4] +
+ src_ptr[stride + 5] + src_ptr[stride * 2 + 3] +
+ src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) *
+ (65536 / 9) >> 16;
+ dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
+ src_ptr[stride + 6] + src_ptr[stride + 7] +
+ src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) *
+ (65536 / 6) >> 16;
+ src_ptr += 8;
+ dst_ptr += 3;
+ }
+}
+
+// 8x2 -> 3x1
+void ScaleRowDown38_2_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ assert((dst_width % 3 == 0) && (dst_width > 0));
+ intptr_t stride = src_stride;
+ for (int i = 0; i < dst_width; i += 3) {
+ dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
+ src_ptr[stride + 0] + src_ptr[stride + 1] +
+ src_ptr[stride + 2]) * (65536 / 6) >> 16;
+ dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
+ src_ptr[stride + 3] + src_ptr[stride + 4] +
+ src_ptr[stride + 5]) * (65536 / 6) >> 16;
+ dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
+ src_ptr[stride + 6] + src_ptr[stride + 7]) *
+ (65536 / 4) >> 16;
+ src_ptr += 8;
+ dst_ptr += 3;
+ }
+}
+
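The multiply by 65536 / 9 (or 65536 / 6, 65536 / 4) followed by >> 16 in the two ScaleRowDown38 box filters is a fixed-point reciprocal multiply that replaces an integer division. Because 65536 / 9 truncates to 7281, the result can come out one below the exact quotient, which is tolerable for 8-bit pixel averages. A worked example with an illustrative helper:

// Illustrative only: approximate sum / 9 with a 16.16 reciprocal multiply.
static int DivideBy9Approx(int sum) {
  return (sum * (65536 / 9)) >> 16;  // 65536 / 9 truncates to 7281
}
// DivideBy9Approx(905) == 100 (exact); DivideBy9Approx(900) == 99 (exact is 100).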
+void ScaleAddRows_C(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint16* dst_ptr, int src_width, int src_height) {
+ assert(src_width > 0);
+ assert(src_height > 0);
+ for (int x = 0; x < src_width; ++x) {
+ const uint8* s = src_ptr + x;
+ unsigned int sum = 0u;
+ for (int y = 0; y < src_height; ++y) {
+ sum += s[0];
+ s += src_stride;
+ }
+ // TODO(fbarchard): Consider limiting height to 256 to avoid overflow.
+ dst_ptr[x] = sum < 65535u ? sum : 65535u;
+ }
+}
+
+void ScaleARGBRowDown2_C(const uint8* src_argb,
+ ptrdiff_t /* src_stride */,
+ uint8* dst_argb, int dst_width) {
+ const uint32* src = reinterpret_cast<const uint32*>(src_argb);
+ uint32* dst = reinterpret_cast<uint32*>(dst_argb);
+
+ for (int x = 0; x < dst_width - 1; x += 2) {
+ dst[0] = src[1];
+ dst[1] = src[3];
+ src += 4;
+ dst += 2;
+ }
+ if (dst_width & 1) {
+ dst[0] = src[1];
+ }
+}
+
+void ScaleARGBRowDown2Linear_C(const uint8* src_argb,
+ ptrdiff_t /* src_stride */,
+ uint8* dst_argb, int dst_width) {
+ for (int x = 0; x < dst_width; ++x) {
+ dst_argb[0] = (src_argb[0] + src_argb[4] + 1) >> 1;
+ dst_argb[1] = (src_argb[1] + src_argb[5] + 1) >> 1;
+ dst_argb[2] = (src_argb[2] + src_argb[6] + 1) >> 1;
+ dst_argb[3] = (src_argb[3] + src_argb[7] + 1) >> 1;
+ src_argb += 8;
+ dst_argb += 4;
+ }
+}
+
+void ScaleARGBRowDown2Box_C(const uint8* src_argb, ptrdiff_t src_stride,
+ uint8* dst_argb, int dst_width) {
+ for (int x = 0; x < dst_width; ++x) {
+ dst_argb[0] = (src_argb[0] + src_argb[4] +
+ src_argb[src_stride] + src_argb[src_stride + 4] + 2) >> 2;
+ dst_argb[1] = (src_argb[1] + src_argb[5] +
+ src_argb[src_stride + 1] + src_argb[src_stride + 5] + 2) >> 2;
+ dst_argb[2] = (src_argb[2] + src_argb[6] +
+ src_argb[src_stride + 2] + src_argb[src_stride + 6] + 2) >> 2;
+ dst_argb[3] = (src_argb[3] + src_argb[7] +
+ src_argb[src_stride + 3] + src_argb[src_stride + 7] + 2) >> 2;
+ src_argb += 8;
+ dst_argb += 4;
+ }
+}
+
+void ScaleARGBRowDownEven_C(const uint8* src_argb, ptrdiff_t /* src_stride */,
+ int src_stepx,
+ uint8* dst_argb, int dst_width) {
+ const uint32* src = reinterpret_cast<const uint32*>(src_argb);
+ uint32* dst = reinterpret_cast<uint32*>(dst_argb);
+
+ for (int x = 0; x < dst_width - 1; x += 2) {
+ dst[0] = src[0];
+ dst[1] = src[src_stepx];
+ src += src_stepx * 2;
+ dst += 2;
+ }
+ if (dst_width & 1) {
+ dst[0] = src[0];
+ }
+}
+
+void ScaleARGBRowDownEvenBox_C(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8* dst_argb, int dst_width) {
+ for (int x = 0; x < dst_width; ++x) {
+ dst_argb[0] = (src_argb[0] + src_argb[4] +
+ src_argb[src_stride] + src_argb[src_stride + 4] + 2) >> 2;
+ dst_argb[1] = (src_argb[1] + src_argb[5] +
+ src_argb[src_stride + 1] + src_argb[src_stride + 5] + 2) >> 2;
+ dst_argb[2] = (src_argb[2] + src_argb[6] +
+ src_argb[src_stride + 2] + src_argb[src_stride + 6] + 2) >> 2;
+ dst_argb[3] = (src_argb[3] + src_argb[7] +
+ src_argb[src_stride + 3] + src_argb[src_stride + 7] + 2) >> 2;
+ src_argb += src_stepx * 4;
+ dst_argb += 4;
+ }
+}
+
+// Scales a single row of pixels using point sampling.
+void ScaleARGBCols_C(uint8* dst_argb, const uint8* src_argb,
+ int dst_width, int x, int dx) {
+ const uint32* src = reinterpret_cast<const uint32*>(src_argb);
+ uint32* dst = reinterpret_cast<uint32*>(dst_argb);
+ for (int j = 0; j < dst_width - 1; j += 2) {
+ dst[0] = src[x >> 16];
+ x += dx;
+ dst[1] = src[x >> 16];
+ x += dx;
+ dst += 2;
+ }
+ if (dst_width & 1) {
+ dst[0] = src[x >> 16];
+ }
+}
+
+// Scales a single row of pixels up by 2x using point sampling.
+void ScaleARGBColsUp2_C(uint8* dst_argb, const uint8* src_argb,
+ int dst_width, int, int) {
+ const uint32* src = reinterpret_cast<const uint32*>(src_argb);
+ uint32* dst = reinterpret_cast<uint32*>(dst_argb);
+ for (int j = 0; j < dst_width - 1; j += 2) {
+ dst[1] = dst[0] = src[0];
+ src += 1;
+ dst += 2;
+ }
+ if (dst_width & 1) {
+ dst[0] = src[0];
+ }
+}
+
+// Mimics SSSE3 blender
+#define BLENDER1(a, b, f) ((a) * (0x7f ^ f) + (b) * f) >> 7
+#define BLENDERC(a, b, f, s) static_cast<uint32>( \
+ BLENDER1(((a) >> s) & 255, ((b) >> s) & 255, f) << s)
+#define BLENDER(a, b, f) \
+ BLENDERC(a, b, f, 24) | BLENDERC(a, b, f, 16) | \
+ BLENDERC(a, b, f, 8) | BLENDERC(a, b, f, 0)
+
+void ScaleARGBFilterCols_C(uint8* dst_argb, const uint8* src_argb,
+ int dst_width, int x, int dx) {
+ const uint32* src = reinterpret_cast<const uint32*>(src_argb);
+ uint32* dst = reinterpret_cast<uint32*>(dst_argb);
+ for (int j = 0; j < dst_width - 1; j += 2) {
+ int xi = x >> 16;
+ int xf = (x >> 9) & 0x7f;
+ uint32 a = src[xi];
+ uint32 b = src[xi + 1];
+ dst[0] = BLENDER(a, b, xf);
+ x += dx;
+ xi = x >> 16;
+ xf = (x >> 9) & 0x7f;
+ a = src[xi];
+ b = src[xi + 1];
+ dst[1] = BLENDER(a, b, xf);
+ x += dx;
+ dst += 2;
+ }
+ if (dst_width & 1) {
+ int xi = x >> 16;
+ int xf = (x >> 9) & 0x7f;
+ uint32 a = src[xi];
+ uint32 b = src[xi + 1];
+ dst[0] = BLENDER(a, b, xf);
+ }
+}
+#undef BLENDER1
+#undef BLENDERC
+#undef BLENDER
+
+// Scale plane vertically with bilinear interpolation.
+void ScalePlaneVertical(int src_height,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint8* src_argb, uint8* dst_argb,
+ int x, int y, int dy,
+ int bpp, FilterMode filtering) {
+ // TODO(fbarchard): Allow higher bpp.
+ assert(bpp >= 1 && bpp <= 4);
+ assert(src_height != 0);
+ assert(dst_width > 0);
+ assert(dst_height > 0);
+ int dst_width_bytes = dst_width * bpp;
+ src_argb += (x >> 16) * bpp;
+ void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb,
+ ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
+ InterpolateRow_C;
+#if defined(HAS_INTERPOLATEROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && dst_width_bytes >= 16) {
+ InterpolateRow = InterpolateRow_Any_SSE2;
+ if (IS_ALIGNED(dst_width_bytes, 16)) {
+ InterpolateRow = InterpolateRow_Unaligned_SSE2;
+ if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) &&
+ IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
+ InterpolateRow = InterpolateRow_SSE2;
+ }
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && dst_width_bytes >= 16) {
+ InterpolateRow = InterpolateRow_Any_SSSE3;
+ if (IS_ALIGNED(dst_width_bytes, 16)) {
+ InterpolateRow = InterpolateRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) &&
+ IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
+ InterpolateRow = InterpolateRow_SSSE3;
+ }
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2) && dst_width_bytes >= 32) {
+ InterpolateRow = InterpolateRow_Any_AVX2;
+ if (IS_ALIGNED(dst_width_bytes, 32)) {
+ InterpolateRow = InterpolateRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && dst_width_bytes >= 16) {
+ InterpolateRow = InterpolateRow_Any_NEON;
+ if (IS_ALIGNED(dst_width_bytes, 16)) {
+ InterpolateRow = InterpolateRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROWS_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) && dst_width_bytes >= 4 &&
+ IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4) &&
+ IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) {
+ InterpolateRow = InterpolateRow_Any_MIPS_DSPR2;
+ if (IS_ALIGNED(dst_width_bytes, 4)) {
+ InterpolateRow = InterpolateRow_MIPS_DSPR2;
+ }
+ }
+#endif
+ const int max_y = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0;
+ for (int j = 0; j < dst_height; ++j) {
+ if (y > max_y) {
+ y = max_y;
+ }
+ int yi = y >> 16;
+ int yf = filtering ? ((y >> 8) & 255) : 0;
+ const uint8* src = src_argb + yi * src_stride;
+ InterpolateRow(dst_argb, src, src_stride, dst_width_bytes, yf);
+ dst_argb += dst_stride;
+ y += dy;
+ }
+}
+
+// Simplify the filtering based on scale factors.
+FilterMode ScaleFilterReduce(int src_width, int src_height,
+ int dst_width, int dst_height,
+ FilterMode filtering) {
+ if (src_width < 0) {
+ src_width = -src_width;
+ }
+ if (src_height < 0) {
+ src_height = -src_height;
+ }
+ if (filtering == kFilterBox) {
+ // If scaling both axes to 0.5 or larger, switch from Box to Bilinear.
+ if (dst_width * 2 >= src_width && dst_height * 2 >= src_height) {
+ filtering = kFilterBilinear;
+ }
+ // If scaling to larger, switch from Box to Bilinear.
+ if (dst_width >= src_width || dst_height >= src_height) {
+ filtering = kFilterBilinear;
+ }
+ }
+ if (filtering == kFilterBilinear) {
+ if (src_height == 1) {
+ filtering = kFilterLinear;
+ }
+ // TODO(fbarchard): Detect any odd scale factor and reduce to Linear.
+ if (dst_height == src_height || dst_height * 3 == src_height) {
+ filtering = kFilterLinear;
+ }
+ // TODO(fbarchard): Remove 1 pixel wide filter restriction, which is to
+ // avoid reading 2 pixels horizontally, which can cause a memory exception.
+ if (src_width == 1) {
+ filtering = kFilterNone;
+ }
+ }
+ if (filtering == kFilterLinear) {
+ if (src_width == 1) {
+ filtering = kFilterNone;
+ }
+ // TODO(fbarchard): Detect any odd scale factor and reduce to None.
+ if (dst_width == src_width || dst_width * 3 == src_width) {
+ filtering = kFilterNone;
+ }
+ }
+ return filtering;
+}
+
+#define CENTERSTART(dx, s) (dx < 0) ? -((-dx >> 1) + s) : ((dx >> 1) + s)
+#define FIXEDDIV1(src, dst) FixedDiv((src << 16) - 0x00010001, \
+ (dst << 16) - 0x00010000);
+
+// Compute slope values for stepping.
+void ScaleSlope(int src_width, int src_height,
+ int dst_width, int dst_height,
+ FilterMode filtering,
+ int* x, int* y, int* dx, int* dy) {
+ assert(x != NULL);
+ assert(y != NULL);
+ assert(dx != NULL);
+ assert(dy != NULL);
+ assert(src_width != 0);
+ assert(src_height != 0);
+ assert(dst_width > 0);
+ assert(dst_height > 0);
+ if (filtering == kFilterBox) {
+ // Scale step for box filter covers all source pixels evenly.
+ *dx = FixedDiv(Abs(src_width), dst_width);
+ *dy = FixedDiv(src_height, dst_height);
+ *x = 0;
+ *y = 0;
+ } else if (filtering == kFilterBilinear) {
+ // Scale step for bilinear sampling renders last pixel once for upsample.
+ if (dst_width <= Abs(src_width)) {
+ *dx = FixedDiv(Abs(src_width), dst_width);
+ *x = CENTERSTART(*dx, -32768); // Subtract 0.5 (32768) to center filter.
+ } else if (dst_width > 1) {
+ *dx = FIXEDDIV1(Abs(src_width), dst_width);
+ *x = 0;
+ }
+ if (dst_height <= src_height) {
+ *dy = FixedDiv(src_height, dst_height);
+ *y = CENTERSTART(*dy, -32768); // Subtract 0.5 (32768) to center filter.
+ } else if (dst_height > 1) {
+ *dy = FIXEDDIV1(src_height, dst_height);
+ *y = 0;
+ }
+ } else if (filtering == kFilterLinear) {
+ // Scale step for bilinear sampling renders last pixel once for upsample.
+ if (dst_width <= Abs(src_width)) {
+ *dx = FixedDiv(Abs(src_width), dst_width);
+ *x = CENTERSTART(*dx, -32768); // Subtract 0.5 (32768) to center filter.
+ } else if (dst_width > 1) {
+ *dx = FIXEDDIV1(Abs(src_width), dst_width);
+ *x = 0;
+ }
+ *dy = FixedDiv(src_height, dst_height);
+ *y = *dy >> 1;
+ } else {
+ // Scale step for point sampling duplicates all pixels equally.
+ *dx = FixedDiv(Abs(src_width), dst_width);
+ *dy = FixedDiv(src_height, dst_height);
+ *x = CENTERSTART(*dx, 0);
+ *y = CENTERSTART(*dy, 0);
+ }
+ // Negative src_width means horizontally mirror.
+ if (src_width < 0) {
+ *x += (dst_width - 1) * *dx;
+ *dx = -*dx;
+ src_width = -src_width;
+ }
+}
+#undef CENTERSTART
+#undef FIXEDDIV1
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
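
[Editorial aside, not part of the patch: the column kernels above (ScaleARGBCols_C, ScaleARGBFilterCols_C) and ScaleSlope all walk the source in 16.16 fixed point -- x holds the current source coordinate, dx the per-destination-pixel step, and (x >> 9) & 0x7f is the 7-bit fraction fed to the blender. A minimal standalone sketch of that stepping, using FixedDivSketch as a stand-in for libyuv's FixedDiv:]

#include <cstdint>
#include <cstdio>

// Stand-in for libyuv's FixedDiv: (num / div) in 16.16 fixed point.
static int FixedDivSketch(int num, int div) {
  return static_cast<int>((static_cast<int64_t>(num) << 16) / div);
}

int main() {
  const int src_width = 640;
  const int dst_width = 256;
  int dx = FixedDivSketch(src_width, dst_width);  // source step per dst pixel
  int x = dx >> 1;  // center the first sample, as CENTERSTART(dx, 0) does
  for (int j = 0; j < 4; ++j) {
    printf("dst %d <- src %d, blend fraction %d/128\n",
           j, x >> 16, (x >> 9) & 0x7f);
    x += dx;
  }
  return 0;
}

[Compiled with g++, this prints the source index and 0..127 blend fraction for the first four destination pixels of a 640 -> 256 downscale.]
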
diff --git a/chromium/third_party/libyuv/source/scale_mips.cc b/chromium/third_party/libyuv/source/scale_mips.cc
index cfd48b5b053..de94560959e 100644
--- a/chromium/third_party/libyuv/source/scale_mips.cc
+++ b/chromium/third_party/libyuv/source/scale_mips.cc
@@ -30,6 +30,7 @@ void ScaleRowDown2_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */,
"beqz $t9, 2f \n"
" nop \n"
+ ".p2align 2 \n"
"1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
"lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4|
@@ -88,6 +89,7 @@ void ScaleRowDown2Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
"bltz $t9, 2f \n"
" nop \n"
+ ".p2align 2 \n"
"1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
"lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4|
@@ -176,7 +178,7 @@ void ScaleRowDown2Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
}
void ScaleRowDown4_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */,
- uint8* dst, int dst_width) {
+ uint8* dst, int dst_width) {
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
@@ -185,6 +187,7 @@ void ScaleRowDown4_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */,
"beqz $t9, 2f \n"
" nop \n"
+ ".p2align 2 \n"
"1: \n"
"lw $t1, 0(%[src_ptr]) \n" // |3|2|1|0|
"lw $t2, 4(%[src_ptr]) \n" // |7|6|5|4|
@@ -231,7 +234,7 @@ void ScaleRowDown4_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */,
}
void ScaleRowDown4Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
- uint8* dst, int dst_width) {
+ uint8* dst, int dst_width) {
intptr_t stride = src_stride;
const uint8* s1 = src_ptr + stride;
const uint8* s2 = s1 + stride;
@@ -244,6 +247,7 @@ void ScaleRowDown4Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
"srl $t9, %[dst_width], 1 \n"
"andi $t8, %[dst_width], 1 \n"
+ ".p2align 2 \n"
"1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
"lw $t1, 0(%[s1]) \n" // |7|6|5|4|
@@ -314,6 +318,7 @@ void ScaleRowDown34_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */,
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
+ ".p2align 2 \n"
"1: \n"
"lw $t1, 0(%[src_ptr]) \n" // |3|2|1|0|
"lw $t2, 4(%[src_ptr]) \n" // |7|6|5|4|
@@ -360,7 +365,9 @@ void ScaleRowDown34_0_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
- "repl.ph $t3, 3 \n" // 0x00030003
+ "repl.ph $t3, 3 \n" // 0x00030003
+
+ ".p2align 2 \n"
"1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0|
"lwx $t1, %[src_stride](%[src_ptr]) \n" // |T3|T2|T1|T0|
@@ -416,6 +423,8 @@ void ScaleRowDown34_1_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
".set push \n"
".set noreorder \n"
"repl.ph $t2, 3 \n" // 0x00030003
+
+ ".p2align 2 \n"
"1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0|
"lwx $t1, %[src_stride](%[src_ptr]) \n" // |T3|T2|T1|T0|
@@ -466,6 +475,8 @@ void ScaleRowDown38_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */,
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
+
+ ".p2align 2 \n"
"1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
"lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4|
@@ -515,6 +526,8 @@ void ScaleRowDown38_2_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
+
+ ".p2align 2 \n"
"1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0|
"lw $t1, 4(%[src_ptr]) \n" // |S7|S6|S5|S4|
@@ -571,6 +584,8 @@ void ScaleRowDown38_3_Box_MIPS_DSPR2(const uint8* src_ptr,
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
+
+ ".p2align 2 \n"
"1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0|
"lw $t1, 4(%[src_ptr]) \n" // |S7|S6|S5|S4|
diff --git a/chromium/third_party/libyuv/source/scale_neon.cc b/chromium/third_party/libyuv/source/scale_neon.cc
index a370349a72f..c9c6b2cdf88 100644
--- a/chromium/third_party/libyuv/source/scale_neon.cc
+++ b/chromium/third_party/libyuv/source/scale_neon.cc
@@ -8,7 +8,6 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#include "libyuv/basic_types.h"
#include "libyuv/row.h"
#ifdef __cplusplus
@@ -16,7 +15,7 @@ namespace libyuv {
extern "C" {
#endif
-// This module is for GCC Neon
+// This module is for GCC Neon.
#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)
// NEON downscalers with interpolation.
@@ -25,6 +24,7 @@ extern "C" {
void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */,
uint8* dst, int dst_width) {
asm volatile (
+ ".p2align 2 \n"
"1: \n"
// load even pixels into q0, odd into q1
"vld2.8 {q0, q1}, [%0]! \n"
@@ -44,6 +44,7 @@ void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
asm volatile (
// change the stride to row 2 pointer
"add %1, %0 \n"
+ ".p2align 2 \n"
"1: \n"
"vld1.8 {q0, q1}, [%0]! \n" // load row 1 and post inc
"vld1.8 {q2, q3}, [%1]! \n" // load row 2 and post inc
@@ -68,11 +69,12 @@ void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */,
uint8* dst_ptr, int dst_width) {
asm volatile (
+ ".p2align 2 \n"
"1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
- "subs %2, %2, #8 \n" // 8 processed per loop
- "vst1.8 {d2}, [%1]! \n"
- "bgt 1b \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
+ "subs %2, %2, #8 \n" // 8 processed per loop
+ "vst1.8 {d2}, [%1]! \n"
+ "bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@@ -87,6 +89,7 @@ void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
"add r4, %0, %3 \n"
"add r5, r4, %3 \n"
"add %3, r5, %3 \n"
+ ".p2align 2 \n"
"1: \n"
"vld1.8 {q0}, [%0]! \n" // load up 16x4
"vld1.8 {q1}, [r4]! \n"
@@ -117,12 +120,13 @@ void ScaleRowDown34_NEON(const uint8* src_ptr,
ptrdiff_t /* src_stride */,
uint8* dst_ptr, int dst_width) {
asm volatile (
+ ".p2align 2 \n"
"1: \n"
- "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
- "subs %2, %2, #24 \n"
- "vmov d2, d3 \n" // order d0, d1, d2
- "vst3.8 {d0, d1, d2}, [%1]! \n"
- "bgt 1b \n"
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
+ "subs %2, %2, #24 \n"
+ "vmov d2, d3 \n" // order d0, d1, d2
+ "vst3.8 {d0, d1, d2}, [%1]! \n"
+ "bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@@ -135,8 +139,9 @@ void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
- "vmov.u8 d24, #3 \n"
- "add %3, %0 \n"
+ "vmov.u8 d24, #3 \n"
+ "add %3, %0 \n"
+ ".p2align 2 \n"
"1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
"vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
@@ -191,8 +196,9 @@ void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
- "vmov.u8 d24, #3 \n"
- "add %3, %0 \n"
+ "vmov.u8 d24, #3 \n"
+ "add %3, %0 \n"
+ ".p2align 2 \n"
"1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
"vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
@@ -226,14 +232,14 @@ void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
}
#define HAS_SCALEROWDOWN38_NEON
-const uvec8 kShuf38 =
+static uvec8 kShuf38 =
{ 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 };
-const uvec8 kShuf38_2 =
+static uvec8 kShuf38_2 =
{ 0, 8, 16, 2, 10, 17, 4, 12, 18, 6, 14, 19, 0, 0, 0, 0 };
-const vec16 kMult38_Div6 =
+static vec16 kMult38_Div6 =
{ 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12,
65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 };
-const vec16 kMult38_Div9 =
+static vec16 kMult38_Div9 =
{ 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18,
65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 };
@@ -242,15 +248,16 @@ void ScaleRowDown38_NEON(const uint8* src_ptr,
ptrdiff_t /* src_stride */,
uint8* dst_ptr, int dst_width) {
asm volatile (
- "vld1.8 {q3}, [%3] \n"
+ "vld1.8 {q3}, [%3] \n"
+ ".p2align 2 \n"
"1: \n"
- "vld1.8 {d0, d1, d2, d3}, [%0]! \n"
- "subs %2, %2, #12 \n"
- "vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n"
- "vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n"
- "vst1.8 {d4}, [%1]! \n"
- "vst1.32 {d5[0]}, [%1]! \n"
- "bgt 1b \n"
+ "vld1.8 {d0, d1, d2, d3}, [%0]! \n"
+ "subs %2, %2, #12 \n"
+ "vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n"
+ "vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n"
+ "vst1.8 {d4}, [%1]! \n"
+ "vst1.32 {d5[0]}, [%1]! \n"
+ "bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@@ -264,11 +271,12 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
- "vld1.16 {q13}, [%4] \n"
- "vld1.8 {q14}, [%5] \n"
- "vld1.8 {q15}, [%6] \n"
- "add r4, %0, %3, lsl #1 \n"
- "add %3, %0 \n"
+ "vld1.16 {q13}, [%4] \n"
+ "vld1.8 {q14}, [%5] \n"
+ "vld1.8 {q15}, [%6] \n"
+ "add r4, %0, %3, lsl #1 \n"
+ "add %3, %0 \n"
+ ".p2align 2 \n"
"1: \n"
// d0 = 00 40 01 41 02 42 03 43
@@ -374,9 +382,10 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
- "vld1.16 {q13}, [%4] \n"
- "vld1.8 {q14}, [%5] \n"
- "add %3, %0 \n"
+ "vld1.16 {q13}, [%4] \n"
+ "vld1.8 {q14}, [%5] \n"
+ "add %3, %0 \n"
+ ".p2align 2 \n"
"1: \n"
// d0 = 00 40 01 41 02 42 03 43
@@ -546,6 +555,125 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
: "q0", "q1", "d4", "d5", "q13", "q14", "memory", "cc"
);
}
+
+void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */,
+ uint8* dst, int dst_width) {
+ asm volatile (
+ ".p2align 2 \n"
+ "1: \n"
+ // load even pixels into q0, odd into q1
+ "vld2.32 {q0, q1}, [%0]! \n"
+ "vld2.32 {q2, q3}, [%0]! \n"
+ "subs %2, %2, #8 \n" // 8 processed per loop
+ "vst1.8 {q1}, [%1]! \n" // store odd pixels
+ "vst1.8 {q3}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst), // %1
+ "+r"(dst_width) // %2
+ :
+ : "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List
+ );
+}
+
+void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst, int dst_width) {
+ asm volatile (
+ // change the stride to row 2 pointer
+ "add %1, %1, %0 \n"
+ ".p2align 2 \n"
+ "1: \n"
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
+ "vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts.
+ "vld4.8 {d16, d18, d20, d22}, [%1]! \n" // load 8 more ARGB pixels.
+ "vld4.8 {d17, d19, d21, d23}, [%1]! \n" // load last 8 ARGB pixels.
+ "vpadal.u8 q0, q8 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q9 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q2, q10 \n" // R 16 bytes -> 8 shorts.
+ "vpadal.u8 q3, q11 \n" // A 16 bytes -> 8 shorts.
+ "vrshrn.u16 d0, q0, #2 \n" // downshift, round and pack
+ "vrshrn.u16 d1, q1, #2 \n"
+ "vrshrn.u16 d2, q2, #2 \n"
+ "vrshrn.u16 d3, q3, #2 \n"
+ "vst4.8 {d0, d1, d2, d3}, [%2]! \n"
+ "bgt 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(src_stride), // %1
+ "+r"(dst), // %2
+ "+r"(dst_width) // %3
+ :
+ : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
+ );
+}
+
+// Reads 4 pixels at a time.
+// Alignment requirement: src_argb 4 byte aligned.
+void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t, int src_stepx,
+ uint8* dst_argb, int dst_width) {
+ asm volatile (
+ "mov r12, %3, lsl #2 \n"
+ ".p2align 2 \n"
+ "1: \n"
+ "vld1.32 {d0[0]}, [%0], r12 \n"
+ "vld1.32 {d0[1]}, [%0], r12 \n"
+ "vld1.32 {d1[0]}, [%0], r12 \n"
+ "vld1.32 {d1[1]}, [%0], r12 \n"
+ "subs %2, %2, #4 \n" // 4 pixels per loop.
+ "vst1.8 {q0}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(dst_width) // %2
+ : "r"(src_stepx) // %3
+ : "memory", "cc", "r12", "q0"
+ );
+}
+
+// Reads 4 pixels at a time.
+// Alignment requirement: src_argb 4 byte aligned.
+void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
+ int src_stepx,
+ uint8* dst_argb, int dst_width) {
+ asm volatile (
+ "mov r12, %4, lsl #2 \n"
+ "add %1, %1, %0 \n"
+ ".p2align 2 \n"
+ "1: \n"
+ "vld1.8 {d0}, [%0], r12 \n" // Read 4 2x2 blocks -> 2x1
+ "vld1.8 {d1}, [%1], r12 \n"
+ "vld1.8 {d2}, [%0], r12 \n"
+ "vld1.8 {d3}, [%1], r12 \n"
+ "vld1.8 {d4}, [%0], r12 \n"
+ "vld1.8 {d5}, [%1], r12 \n"
+ "vld1.8 {d6}, [%0], r12 \n"
+ "vld1.8 {d7}, [%1], r12 \n"
+ "vaddl.u8 q0, d0, d1 \n"
+ "vaddl.u8 q1, d2, d3 \n"
+ "vaddl.u8 q2, d4, d5 \n"
+ "vaddl.u8 q3, d6, d7 \n"
+ "vswp.8 d1, d2 \n" // ab_cd -> ac_bd
+ "vswp.8 d5, d6 \n" // ef_gh -> eg_fh
+ "vadd.u16 q0, q0, q1 \n" // (a+b)_(c+d)
+ "vadd.u16 q2, q2, q3 \n" // (e+f)_(g+h)
+ "vrshrn.u16 d0, q0, #2 \n" // first 2 pixels.
+ "vrshrn.u16 d1, q2, #2 \n" // next 2 pixels.
+ "subs %3, %3, #4 \n" // 4 pixels per loop.
+ "vst1.8 {q0}, [%2]! \n"
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(src_stride), // %1
+ "+r"(dst_argb), // %2
+ "+r"(dst_width) // %3
+ : "r"(src_stepx) // %4
+ : "memory", "cc", "r12", "q0", "q1", "q2", "q3"
+ );
+}
+
#endif // __ARM_NEON__
#ifdef __cplusplus
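
[Editorial sketch, not part of the patch: the ARGB row kernels above (C, SSE2 or NEON) are composed by an outer loop that advances the source two rows per destination row; the real dispatch lives in scale_argb.cc and picks a variant with TestCpuFlag, the way ScalePlaneVertical selects InterpolateRow. HalveARGBPlaneBox below is a hypothetical driver that links against the ScaleARGBRowDown2Box_C kernel added in scale.cc:]

#include <stddef.h>
#include <stdint.h>

typedef uint8_t uint8;

// C kernel added in scale.cc in this patch; each output pixel is the
// rounded average of a 2x2 block of source pixels.
extern "C" void ScaleARGBRowDown2Box_C(const uint8* src_argb,
                                       ptrdiff_t src_stride,
                                       uint8* dst_argb, int dst_width);

// Hypothetical driver: halve an ARGB plane with box filtering.
void HalveARGBPlaneBox(const uint8* src, int src_stride,
                       uint8* dst, int dst_stride,
                       int dst_width, int dst_height) {
  void (*RowDown2Box)(const uint8*, ptrdiff_t, uint8*, int) =
      ScaleARGBRowDown2Box_C;  // a real caller would swap in SSE2/NEON here
  for (int y = 0; y < dst_height; ++y) {
    RowDown2Box(src, src_stride, dst, dst_width);  // reads 2 rows, writes 1
    src += src_stride * 2;
    dst += dst_stride;
  }
}
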
diff --git a/chromium/third_party/libyuv/source/scale_posix.cc b/chromium/third_party/libyuv/source/scale_posix.cc
new file mode 100644
index 00000000000..a777bfde1cc
--- /dev/null
+++ b/chromium/third_party/libyuv/source/scale_posix.cc
@@ -0,0 +1,1337 @@
+/*
+ * Copyright 2013 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC x86 and x64.
+#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
+
+// Offsets for source bytes 0 to 9
+static uvec8 kShuf0 =
+ { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 };
+
+// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
+static uvec8 kShuf1 =
+ { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 };
+
+// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
+static uvec8 kShuf2 =
+ { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 };
+
+// Offsets for source bytes 0 to 10
+static uvec8 kShuf01 =
+ { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 };
+
+// Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13.
+static uvec8 kShuf11 =
+ { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 };
+
+// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
+static uvec8 kShuf21 =
+ { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 };
+
+// Coefficients for source bytes 0 to 10
+static uvec8 kMadd01 =
+ { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 };
+
+// Coefficients for source bytes 10 to 21
+static uvec8 kMadd11 =
+ { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 };
+
+// Coefficients for source bytes 21 to 31
+static uvec8 kMadd21 =
+ { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 };
+
+// Coefficients for source bytes 21 to 31
+static vec16 kRound34 =
+ { 2, 2, 2, 2, 2, 2, 2, 2 };
+
+static uvec8 kShuf38a =
+ { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
+
+static uvec8 kShuf38b =
+ { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 };
+
+// Arrange words 0,3,6 into 0,1,2
+static uvec8 kShufAc =
+ { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
+
+// Arrange words 0,3,6 into 3,4,5
+static uvec8 kShufAc3 =
+ { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 };
+
+// Scaling values for boxes of 3x3 and 2x3
+static uvec16 kScaleAc33 =
+ { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 };
+
+// Arrange first value for pixels 0,1,2,3,4,5
+static uvec8 kShufAb0 =
+ { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 };
+
+// Arrange second value for pixels 0,1,2,3,4,5
+static uvec8 kShufAb1 =
+ { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 };
+
+// Arrange third value for pixels 0,1,2,3,4,5
+static uvec8 kShufAb2 =
+ { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };
+
+// Scaling values for boxes of 3x2 and 2x2
+static uvec16 kScaleAb2 =
+ { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
+
+// TODO(nfullagar): For Native Client: When new toolchain becomes available,
+// take advantage of bundle lock / unlock feature. This will reduce the amount
+// of manual bundle alignment done below, and bundle alignment could even be
+// moved into each macro that doesn't use %%nacl: such as MEMOPREG.
+
+#if defined(__native_client__) && defined(__x86_64__)
+#define MEMACCESS(base) "%%nacl:(%%r15,%q" #base ")"
+#define MEMACCESS2(offset, base) "%%nacl:" #offset "(%%r15,%q" #base ")"
+#define MEMLEA(offset, base) #offset "(%q" #base ")"
+#define MEMLEA3(offset, index, scale) \
+ #offset "(,%q" #index "," #scale ")"
+#define MEMLEA4(offset, base, index, scale) \
+ #offset "(%q" #base ",%q" #index "," #scale ")"
+#define MEMOPREG(opcode, offset, base, index, scale, reg) \
+ "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \
+ #opcode " (%%r15,%%r14),%%" #reg "\n"
+#define MEMOPMEM(opcode, reg, offset, base, index, scale) \
+ "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \
+ #opcode " %%" #reg ",(%%r15,%%r14)\n"
+#define MEMOP(opcode, offset, base, index, scale) \
+ "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \
+ #opcode " (%%r15,%%r14)"
+#define BUNDLEALIGN ".p2align 5\n"
+#else
+#define MEMACCESS(base) "(%" #base ")"
+#define MEMACCESS2(offset, base) #offset "(%" #base ")"
+#define MEMLEA(offset, base) #offset "(%" #base ")"
+#define MEMLEA3(offset, index, scale) \
+ #offset "(,%" #index "," #scale ")"
+#define MEMLEA4(offset, base, index, scale) \
+ #offset "(%" #base ",%" #index "," #scale ")"
+#define MEMOPREG(opcode, offset, base, index, scale, reg) \
+ #opcode " " #offset "(%" #base ",%" #index "," #scale "),%%" #reg "\n"
+#define MEMOPMEM(opcode, reg, offset, base, index, scale) \
+ #opcode " %%" #reg ","#offset "(%" #base ",%" #index "," #scale ")\n"
+#define MEMOP(opcode, offset, base, index, scale) \
+ #opcode " " #offset "(%" #base ",%" #index "," #scale ")"
+#define BUNDLEALIGN
+#endif
+
+// GCC versions of row functions are verbatim conversions from Visual C.
+// Generated using gcc disassembly on Visual C object file:
+// objdump -D yuvscaler.obj >yuvscaler.txt
+
+void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ ".p2align 2 \n"
+ BUNDLEALIGN
+ "1: \n"
+ "movdqa " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1"
+#endif
+ );
+}
+
+void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ ".p2align 2 \n"
+ BUNDLEALIGN
+ "1: \n"
+ "movdqa " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqa " MEMACCESS2(0x10, 0) ",%%xmm1 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "pand %%xmm5,%%xmm2 \n"
+ "pand %%xmm5,%%xmm3 \n"
+ "pavgw %%xmm2,%%xmm0 \n"
+ "pavgw %%xmm3,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm5"
+#endif
+ );
+}
+
+void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ ".p2align 2 \n"
+ BUNDLEALIGN
+ "1: \n"
+ "movdqa " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ MEMOPREG(movdqa,0x00,0,3,1,xmm2) // movdqa (%0,%3,1),%%xmm2
+ BUNDLEALIGN
+ MEMOPREG(movdqa,0x10,0,3,1,xmm3) // movdqa 0x10(%0,%3,1),%%xmm3
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "pavgb %%xmm3,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "pand %%xmm5,%%xmm2 \n"
+ "pand %%xmm5,%%xmm3 \n"
+ "pavgw %%xmm2,%%xmm0 \n"
+ "pavgw %%xmm3,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"(static_cast<intptr_t>(src_stride)) // %3
+ : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14"
+#endif
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+#endif
+ );
+}
+
+void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ ".p2align 2 \n"
+ BUNDLEALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1"
+#endif
+ );
+}
+
+void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr, ptrdiff_t,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ ".p2align 2 \n"
+ BUNDLEALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "pand %%xmm5,%%xmm2 \n"
+ "pand %%xmm5,%%xmm3 \n"
+ "pavgw %%xmm2,%%xmm0 \n"
+ "pavgw %%xmm3,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm5"
+#endif
+ );
+}
+
+void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrlw $0x8,%%xmm5 \n"
+ ".p2align 2 \n"
+ BUNDLEALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ MEMOPREG(movdqu,0x00,0,3,1,xmm2) // movdqu (%0,%3,1),%%xmm2
+ BUNDLEALIGN
+ MEMOPREG(movdqu,0x10,0,3,1,xmm3) // movdqu 0x10(%0,%3,1),%%xmm3
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "pavgb %%xmm3,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "pand %%xmm5,%%xmm2 \n"
+ "pand %%xmm5,%%xmm3 \n"
+ "pavgw %%xmm2,%%xmm0 \n"
+ "pavgw %%xmm3,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqu %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "sub $0x10,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"(static_cast<intptr_t>(src_stride)) // %3
+ : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14"
+#endif
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+#endif
+ );
+}
+
+void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ "pcmpeqb %%xmm5,%%xmm5 \n"
+ "psrld $0x18,%%xmm5 \n"
+ "pslld $0x10,%%xmm5 \n"
+ ".p2align 2 \n"
+ BUNDLEALIGN
+ "1: \n"
+ "movdqa " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "pand %%xmm5,%%xmm0 \n"
+ "pand %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movq %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x8,1) ",%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm5"
+#endif
+ );
+}
+
+void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ intptr_t stridex3 = 0;
+ asm volatile (
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "psrlw $0x8,%%xmm7 \n"
+ "lea " MEMLEA4(0x00,4,4,2) ",%3 \n"
+ ".p2align 2 \n"
+ BUNDLEALIGN
+ "1: \n"
+ "movdqa " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ MEMOPREG(movdqa,0x00,0,4,1,xmm2) // movdqa (%0,%4,1),%%xmm2
+ BUNDLEALIGN
+ MEMOPREG(movdqa,0x10,0,4,1,xmm3) // movdqa 0x10(%0,%4,1),%%xmm3
+ "pavgb %%xmm2,%%xmm0 \n"
+ "pavgb %%xmm3,%%xmm1 \n"
+ MEMOPREG(movdqa,0x00,0,4,2,xmm2) // movdqa (%0,%4,2),%%xmm2
+ BUNDLEALIGN
+ MEMOPREG(movdqa,0x10,0,4,2,xmm3) // movdqa 0x10(%0,%4,2),%%xmm3
+ MEMOPREG(movdqa,0x00,0,3,1,xmm4) // movdqa (%0,%3,1),%%xmm4
+ MEMOPREG(movdqa,0x10,0,3,1,xmm5) // movdqa 0x10(%0,%3,1),%%xmm5
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "pavgb %%xmm4,%%xmm2 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "pavgb %%xmm5,%%xmm3 \n"
+ "pavgb %%xmm3,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "psrlw $0x8,%%xmm1 \n"
+ "pand %%xmm7,%%xmm2 \n"
+ "pand %%xmm7,%%xmm3 \n"
+ "pavgw %%xmm2,%%xmm0 \n"
+ "pavgw %%xmm3,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "psrlw $0x8,%%xmm0 \n"
+ "pand %%xmm7,%%xmm2 \n"
+ "pavgw %%xmm2,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movq %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x8,1) ",%1 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(stridex3) // %3
+ : "r"(static_cast<intptr_t>(src_stride)) // %4
+ : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14"
+#endif
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm7"
+#endif
+ );
+}
+
+void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ "movdqa %0,%%xmm3 \n"
+ "movdqa %1,%%xmm4 \n"
+ "movdqa %2,%%xmm5 \n"
+ :
+ : "m"(kShuf0), // %0
+ "m"(kShuf1), // %1
+ "m"(kShuf2) // %2
+ );
+ asm volatile (
+ ".p2align 2 \n"
+ BUNDLEALIGN
+ "1: \n"
+ "movdqa " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqa " MEMACCESS2(0x10,0) ",%%xmm2 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "movdqa %%xmm2,%%xmm1 \n"
+ "palignr $0x8,%%xmm0,%%xmm1 \n"
+ "pshufb %%xmm3,%%xmm0 \n"
+ "pshufb %%xmm4,%%xmm1 \n"
+ "pshufb %%xmm5,%%xmm2 \n"
+ "movq %%xmm0," MEMACCESS(1) " \n"
+ "movq %%xmm1," MEMACCESS2(0x8,1) " \n"
+ "movq %%xmm2," MEMACCESS2(0x10,1) " \n"
+ "lea " MEMLEA(0x18,1) ",%1 \n"
+ "sub $0x18,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+ );
+}
+
+void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ "movdqa %0,%%xmm2 \n" // kShuf01
+ "movdqa %1,%%xmm3 \n" // kShuf11
+ "movdqa %2,%%xmm4 \n" // kShuf21
+ :
+ : "m"(kShuf01), // %0
+ "m"(kShuf11), // %1
+ "m"(kShuf21) // %2
+ );
+ asm volatile (
+ "movdqa %0,%%xmm5 \n" // kMadd01
+ "movdqa %1,%%xmm0 \n" // kMadd11
+ "movdqa %2,%%xmm1 \n" // kRound34
+ :
+ : "m"(kMadd01), // %0
+ "m"(kMadd11), // %1
+ "m"(kRound34) // %2
+ );
+ asm volatile (
+ ".p2align 2 \n"
+ BUNDLEALIGN
+ "1: \n"
+ "movdqa " MEMACCESS(0) ",%%xmm6 \n"
+ MEMOPREG(movdqa,0x00,0,3,1,xmm7) // movdqa (%0,%3),%%xmm7
+ "pavgb %%xmm7,%%xmm6 \n"
+ "pshufb %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm5,%%xmm6 \n"
+ "paddsw %%xmm1,%%xmm6 \n"
+ "psrlw $0x2,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movq %%xmm6," MEMACCESS(1) " \n"
+ "movdqu " MEMACCESS2(0x8,0) ",%%xmm6 \n"
+ MEMOPREG(movdqu,0x8,0,3,1,xmm7) // movdqu 0x8(%0,%3),%%xmm7
+ "pavgb %%xmm7,%%xmm6 \n"
+ "pshufb %%xmm3,%%xmm6 \n"
+ "pmaddubsw %%xmm0,%%xmm6 \n"
+ "paddsw %%xmm1,%%xmm6 \n"
+ "psrlw $0x2,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movq %%xmm6," MEMACCESS2(0x8,1) " \n"
+ "movdqa " MEMACCESS2(0x10,0) ",%%xmm6 \n"
+ BUNDLEALIGN
+ MEMOPREG(movdqa,0x10,0,3,1,xmm7) // movdqa 0x10(%0,%3),%%xmm7
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+ "pshufb %%xmm4,%%xmm6 \n"
+ "pmaddubsw %4,%%xmm6 \n"
+ "paddsw %%xmm1,%%xmm6 \n"
+ "psrlw $0x2,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movq %%xmm6," MEMACCESS2(0x10,1) " \n"
+ "lea " MEMLEA(0x18,1) ",%1 \n"
+ "sub $0x18,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"(static_cast<intptr_t>(src_stride)), // %3
+ "m"(kMadd21) // %4
+ : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14"
+#endif
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+#endif
+ );
+}
+
+void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ "movdqa %0,%%xmm2 \n" // kShuf01
+ "movdqa %1,%%xmm3 \n" // kShuf11
+ "movdqa %2,%%xmm4 \n" // kShuf21
+ :
+ : "m"(kShuf01), // %0
+ "m"(kShuf11), // %1
+ "m"(kShuf21) // %2
+ );
+ asm volatile (
+ "movdqa %0,%%xmm5 \n" // kMadd01
+ "movdqa %1,%%xmm0 \n" // kMadd11
+ "movdqa %2,%%xmm1 \n" // kRound34
+ :
+ : "m"(kMadd01), // %0
+ "m"(kMadd11), // %1
+ "m"(kRound34) // %2
+ );
+
+ asm volatile (
+ ".p2align 2 \n"
+ BUNDLEALIGN
+ "1: \n"
+ "movdqa " MEMACCESS(0) ",%%xmm6 \n"
+ MEMOPREG(movdqa,0x00,0,3,1,xmm7) // movdqa (%0,%3,1),%%xmm7
+ "pavgb %%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+ "pshufb %%xmm2,%%xmm6 \n"
+ "pmaddubsw %%xmm5,%%xmm6 \n"
+ "paddsw %%xmm1,%%xmm6 \n"
+ "psrlw $0x2,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movq %%xmm6," MEMACCESS(1) " \n"
+ "movdqu " MEMACCESS2(0x8,0) ",%%xmm6 \n"
+ MEMOPREG(movdqu,0x8,0,3,1,xmm7) // movdqu 0x8(%0,%3,1),%%xmm7
+ "pavgb %%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+ "pshufb %%xmm3,%%xmm6 \n"
+ "pmaddubsw %%xmm0,%%xmm6 \n"
+ "paddsw %%xmm1,%%xmm6 \n"
+ "psrlw $0x2,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movq %%xmm6," MEMACCESS2(0x8,1) " \n"
+ "movdqa " MEMACCESS2(0x10,0) ",%%xmm6 \n"
+ MEMOPREG(movdqa,0x10,0,3,1,xmm7) // movdqa 0x10(%0,%3,1),%%xmm7
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "pavgb %%xmm6,%%xmm7 \n"
+ "pavgb %%xmm7,%%xmm6 \n"
+ "pshufb %%xmm4,%%xmm6 \n"
+ "pmaddubsw %4,%%xmm6 \n"
+ "paddsw %%xmm1,%%xmm6 \n"
+ "psrlw $0x2,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "movq %%xmm6," MEMACCESS2(0x10,1) " \n"
+ "lea " MEMLEA(0x18,1) ",%1 \n"
+ "sub $0x18,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"(static_cast<intptr_t>(src_stride)), // %3
+ "m"(kMadd21) // %4
+ : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14"
+#endif
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+#endif
+ );
+}
+
+void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ "movdqa %3,%%xmm4 \n"
+ "movdqa %4,%%xmm5 \n"
+ ".p2align 2 \n"
+ BUNDLEALIGN
+ "1: \n"
+ "movdqa " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "pshufb %%xmm5,%%xmm1 \n"
+ "paddusb %%xmm1,%%xmm0 \n"
+ "movq %%xmm0," MEMACCESS(1) " \n"
+ "movhlps %%xmm0,%%xmm1 \n"
+ "movd %%xmm1," MEMACCESS2(0x8,1) " \n"
+ "lea " MEMLEA(0xc,1) ",%1 \n"
+ "sub $0xc,%2 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "m"(kShuf38a), // %3
+ "m"(kShuf38b) // %4
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm4", "xmm5"
+#endif
+ );
+}
+
+void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ "movdqa %0,%%xmm2 \n"
+ "movdqa %1,%%xmm3 \n"
+ "movdqa %2,%%xmm4 \n"
+ "movdqa %3,%%xmm5 \n"
+ :
+ : "m"(kShufAb0), // %0
+ "m"(kShufAb1), // %1
+ "m"(kShufAb2), // %2
+ "m"(kScaleAb2) // %3
+ );
+ asm volatile (
+ ".p2align 2 \n"
+ BUNDLEALIGN
+ "1: \n"
+ "movdqa " MEMACCESS(0) ",%%xmm0 \n"
+ MEMOPREG(pavgb,0x00,0,3,1,xmm0) // pavgb (%0,%3,1),%%xmm0
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pshufb %%xmm2,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm6 \n"
+ "pshufb %%xmm3,%%xmm6 \n"
+ "paddusw %%xmm6,%%xmm1 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "paddusw %%xmm0,%%xmm1 \n"
+ "pmulhuw %%xmm5,%%xmm1 \n"
+ "packuswb %%xmm1,%%xmm1 \n"
+ "sub $0x6,%2 \n"
+ "movd %%xmm1," MEMACCESS(1) " \n"
+ "psrlq $0x10,%%xmm1 \n"
+ "movd %%xmm1," MEMACCESS2(0x2,1) " \n"
+ "lea " MEMLEA(0x6,1) ",%1 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"(static_cast<intptr_t>(src_stride)) // %3
+ : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14"
+#endif
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+#endif
+ );
+}
+
+void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ asm volatile (
+ "movdqa %0,%%xmm2 \n"
+ "movdqa %1,%%xmm3 \n"
+ "movdqa %2,%%xmm4 \n"
+ "pxor %%xmm5,%%xmm5 \n"
+ :
+ : "m"(kShufAc), // %0
+ "m"(kShufAc3), // %1
+ "m"(kScaleAc33) // %2
+ );
+ asm volatile (
+ ".p2align 2 \n"
+ BUNDLEALIGN
+ "1: \n"
+ "movdqa " MEMACCESS(0) ",%%xmm0 \n"
+ MEMOPREG(movdqa,0x00,0,3,1,xmm6) // movdqa (%0,%3,1),%%xmm6
+ "movhlps %%xmm0,%%xmm1 \n"
+ "movhlps %%xmm6,%%xmm7 \n"
+ "punpcklbw %%xmm5,%%xmm0 \n"
+ "punpcklbw %%xmm5,%%xmm1 \n"
+ "punpcklbw %%xmm5,%%xmm6 \n"
+ "punpcklbw %%xmm5,%%xmm7 \n"
+ "paddusw %%xmm6,%%xmm0 \n"
+ "paddusw %%xmm7,%%xmm1 \n"
+ MEMOPREG(movdqa,0x00,0,3,2,xmm6) // movdqa (%0,%3,2),%%xmm6
+ "lea " MEMLEA(0x10,0) ",%0 \n"
+ "movhlps %%xmm6,%%xmm7 \n"
+ "punpcklbw %%xmm5,%%xmm6 \n"
+ "punpcklbw %%xmm5,%%xmm7 \n"
+ "paddusw %%xmm6,%%xmm0 \n"
+ "paddusw %%xmm7,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm6 \n"
+ "psrldq $0x2,%%xmm0 \n"
+ "paddusw %%xmm0,%%xmm6 \n"
+ "psrldq $0x2,%%xmm0 \n"
+ "paddusw %%xmm0,%%xmm6 \n"
+ "pshufb %%xmm2,%%xmm6 \n"
+ "movdqa %%xmm1,%%xmm7 \n"
+ "psrldq $0x2,%%xmm1 \n"
+ "paddusw %%xmm1,%%xmm7 \n"
+ "psrldq $0x2,%%xmm1 \n"
+ "paddusw %%xmm1,%%xmm7 \n"
+ "pshufb %%xmm3,%%xmm7 \n"
+ "paddusw %%xmm7,%%xmm6 \n"
+ "pmulhuw %%xmm4,%%xmm6 \n"
+ "packuswb %%xmm6,%%xmm6 \n"
+ "sub $0x6,%2 \n"
+ "movd %%xmm6," MEMACCESS(1) " \n"
+ "psrlq $0x10,%%xmm6 \n"
+ "movd %%xmm6," MEMACCESS2(0x2,1) " \n"
+ "lea " MEMLEA(0x6,1) ",%1 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width) // %2
+ : "r"(static_cast<intptr_t>(src_stride)) // %3
+ : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14"
+#endif
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+#endif
+ );
+}
+
+void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint16* dst_ptr, int src_width, int src_height) {
+ int tmp_height = 0;
+ intptr_t tmp_src = 0;
+ asm volatile (
+ "pxor %%xmm4,%%xmm4 \n"
+ "sub $0x1,%5 \n"
+ ".p2align 2 \n"
+ BUNDLEALIGN
+ "1: \n"
+ "movdqa " MEMACCESS(0) ",%%xmm0 \n"
+ "mov %0,%3 \n"
+ "add %6,%0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm4,%%xmm0 \n"
+ "punpckhbw %%xmm4,%%xmm1 \n"
+ "mov %5,%2 \n"
+ "test %2,%2 \n"
+ "je 3f \n"
+ ".p2align 2 \n"
+ BUNDLEALIGN
+ "2: \n"
+ "movdqa " MEMACCESS(0) ",%%xmm2 \n"
+ "add %6,%0 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "punpcklbw %%xmm4,%%xmm2 \n"
+ "punpckhbw %%xmm4,%%xmm3 \n"
+ "paddusw %%xmm2,%%xmm0 \n"
+ "paddusw %%xmm3,%%xmm1 \n"
+ "sub $0x1,%2 \n"
+ "jg 2b \n"
+ ".p2align 2 \n"
+ "3: \n"
+ BUNDLEALIGN
+ "movdqa %%xmm0," MEMACCESS(1) " \n"
+ "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n"
+ "lea " MEMLEA(0x10,3) ",%0 \n"
+ "lea " MEMLEA(0x20,1) ",%1 \n"
+ "sub $0x10,%4 \n"
+ "jg 1b \n"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(tmp_height), // %2
+ "+r"(tmp_src), // %3
+ "+r"(src_width), // %4
+ "+rm"(src_height) // %5
+ : "rm"(static_cast<intptr_t>(src_stride)) // %6
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
+#endif
+ );
+}
+
+// Bilinear column filtering. SSSE3 version.
+void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
+ int dst_width, int x, int dx) {
+ intptr_t x0 = 0, x1 = 0, temp_pixel = 0;
+ asm volatile (
+ "movd %6,%%xmm2 \n"
+ "movd %7,%%xmm3 \n"
+ "movl $0x04040000,%k2 \n"
+ "movd %k2,%%xmm5 \n"
+ "pcmpeqb %%xmm6,%%xmm6 \n"
+ "psrlw $0x9,%%xmm6 \n"
+ "pextrw $0x1,%%xmm2,%k3 \n"
+ "subl $0x2,%5 \n"
+ "jl 29f \n"
+ "movdqa %%xmm2,%%xmm0 \n"
+ "paddd %%xmm3,%%xmm0 \n"
+ "punpckldq %%xmm0,%%xmm2 \n"
+ "punpckldq %%xmm3,%%xmm3 \n"
+ "paddd %%xmm3,%%xmm3 \n"
+ "pextrw $0x3,%%xmm2,%k4 \n"
+ ".p2align 2 \n"
+ BUNDLEALIGN
+ "2: \n"
+ "movdqa %%xmm2,%%xmm1 \n"
+ "paddd %%xmm3,%%xmm2 \n"
+ MEMOP(movzwl,0x00,1,3,1) ",%k2 \n" // movzwl (%1,%3,1),%k2
+ "movd %k2,%%xmm0 \n"
+ "psrlw $0x9,%%xmm1 \n"
+ BUNDLEALIGN
+ MEMOP(movzwl,0x00,1,4,1) ",%k2 \n" // movzwl (%1,%4,1),%k2
+ "movd %k2,%%xmm4 \n"
+ "pshufb %%xmm5,%%xmm1 \n"
+ "punpcklwd %%xmm4,%%xmm0 \n"
+ "pxor %%xmm6,%%xmm1 \n"
+ "pmaddubsw %%xmm1,%%xmm0 \n"
+ "pextrw $0x1,%%xmm2,%k3 \n"
+ "pextrw $0x3,%%xmm2,%k4 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movd %%xmm0,%k2 \n"
+ "mov %w2," MEMACCESS(0) " \n"
+ "lea " MEMLEA(0x2,0) ",%0 \n"
+ "sub $0x2,%5 \n"
+ "jge 2b \n"
+ ".p2align 2 \n"
+ BUNDLEALIGN
+ "29: \n"
+ "addl $0x1,%5 \n"
+ "jl 99f \n"
+ MEMOP(movzwl,0x00,1,3,1) ",%k2 \n" // movzwl (%1,%3,1),%k2
+ "movd %k2,%%xmm0 \n"
+ "psrlw $0x9,%%xmm2 \n"
+ "pshufb %%xmm5,%%xmm2 \n"
+ "pxor %%xmm6,%%xmm2 \n"
+ "pmaddubsw %%xmm2,%%xmm0 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movd %%xmm0,%k2 \n"
+ "mov %b2," MEMACCESS(0) " \n"
+ "99: \n"
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "+a"(temp_pixel), // %2
+ "+r"(x0), // %3
+ "+r"(x1), // %4
+ "+rm"(dst_width) // %5
+ : "rm"(x), // %6
+ "rm"(dx) // %7
+ : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14"
+#endif
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+#endif
+ );
+}
+
+// Reads 4 pixels, duplicates them and writes 8 pixels.
+// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
+void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
+ int dst_width, int /* x */, int /* dx */) {
+ asm volatile (
+ ".p2align 2 \n"
+ BUNDLEALIGN
+ "1: \n"
+ "movdqa " MEMACCESS(1) ",%%xmm0 \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpcklbw %%xmm0,%%xmm0 \n"
+ "punpckhbw %%xmm1,%%xmm1 \n"
+ "sub $0x20,%2 \n"
+ "movdqa %%xmm0," MEMACCESS(0) " \n"
+ "movdqa %%xmm1," MEMACCESS2(0x10,0) " \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "jg 1b \n"
+
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "+r"(dst_width) // %2
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1"
+#endif
+ );
+}
+
+void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
+ ptrdiff_t /* src_stride */,
+ uint8* dst_argb, int dst_width) {
+ asm volatile (
+ ".p2align 2 \n"
+ BUNDLEALIGN
+ "1: \n"
+ "movdqa " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm0 \n"
+ "sub $0x4,%2 \n"
+ "movdqa %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(dst_width) // %2
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1"
+#endif
+ );
+}
+
+void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
+ ptrdiff_t /* src_stride */,
+ uint8* dst_argb, int dst_width) {
+ asm volatile (
+ ".p2align 2 \n"
+ BUNDLEALIGN
+ "1: \n"
+ "movdqa " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm2 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "sub $0x4,%2 \n"
+ "movdqa %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(dst_width) // %2
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1"
+#endif
+ );
+}
+
+void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ uint8* dst_argb, int dst_width) {
+ asm volatile (
+ ".p2align 2 \n"
+ BUNDLEALIGN
+ "1: \n"
+ "movdqa " MEMACCESS(0) ",%%xmm0 \n"
+ "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
+ BUNDLEALIGN
+ MEMOPREG(movdqa,0x00,0,3,1,xmm2) // movdqa (%0,%3,1),%%xmm2
+ MEMOPREG(movdqa,0x10,0,3,1,xmm3) // movdqa 0x10(%0,%3,1),%%xmm3
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "pavgb %%xmm3,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm2 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "sub $0x4,%2 \n"
+ "movdqa %%xmm0," MEMACCESS(1) " \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(dst_width) // %2
+ : "r"(static_cast<intptr_t>(src_stride)) // %3
+ : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14"
+#endif
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3"
+#endif
+ );
+}
+
+// Reads 4 pixels at a time.
+// Alignment requirement: dst_argb 16 byte aligned.
+void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
+ int src_stepx,
+ uint8* dst_argb, int dst_width) {
+ intptr_t src_stepx_x4 = static_cast<intptr_t>(src_stepx);
+ intptr_t src_stepx_x12 = 0;
+ asm volatile (
+ "lea " MEMLEA3(0x00,1,4) ",%1 \n"
+ "lea " MEMLEA4(0x00,1,1,2) ",%4 \n"
+ ".p2align 2 \n"
+ BUNDLEALIGN
+ "1: \n"
+ "movd " MEMACCESS(0) ",%%xmm0 \n"
+ MEMOPREG(movd,0x00,0,1,1,xmm1) // movd (%0,%1,1),%%xmm1
+ "punpckldq %%xmm1,%%xmm0 \n"
+ BUNDLEALIGN
+ MEMOPREG(movd,0x00,0,1,2,xmm2) // movd (%0,%1,2),%%xmm2
+ MEMOPREG(movd,0x00,0,4,1,xmm3) // movd (%0,%4,1),%%xmm3
+ "lea " MEMLEA4(0x00,0,1,4) ",%0 \n"
+ "punpckldq %%xmm3,%%xmm2 \n"
+ "punpcklqdq %%xmm2,%%xmm0 \n"
+ "sub $0x4,%3 \n"
+ "movdqa %%xmm0," MEMACCESS(2) " \n"
+ "lea " MEMLEA(0x10,2) ",%2 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(src_stepx_x4), // %1
+ "+r"(dst_argb), // %2
+ "+r"(dst_width), // %3
+ "+r"(src_stepx_x12) // %4
+ :
+ : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14"
+#endif
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3"
+#endif
+ );
+}
+
+// Blends four 2x2 to 4x1.
+// Alignment requirement: dst_argb 16 byte aligned.
+void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
+ ptrdiff_t src_stride, int src_stepx,
+ uint8* dst_argb, int dst_width) {
+ intptr_t src_stepx_x4 = static_cast<intptr_t>(src_stepx);
+ intptr_t src_stepx_x12 = 0;
+ intptr_t row1 = static_cast<intptr_t>(src_stride);
+ asm volatile (
+ "lea " MEMLEA3(0x00,1,4) ",%1 \n"
+ "lea " MEMLEA4(0x00,1,1,2) ",%4 \n"
+ "lea " MEMLEA4(0x00,0,5,1) ",%5 \n"
+ ".p2align 2 \n"
+ BUNDLEALIGN
+ "1: \n"
+ "movq " MEMACCESS(0) ",%%xmm0 \n"
+ MEMOPREG(movhps,0x00,0,1,1,xmm0) // movhps (%0,%1,1),%%xmm0
+ MEMOPREG(movq,0x00,0,1,2,xmm1) // movq (%0,%1,2),%%xmm1
+ BUNDLEALIGN
+ MEMOPREG(movhps,0x00,0,4,1,xmm1) // movhps (%0,%4,1),%%xmm1
+ "lea " MEMLEA4(0x00,0,1,4) ",%0 \n"
+ "movq " MEMACCESS(5) ",%%xmm2 \n"
+ BUNDLEALIGN
+ MEMOPREG(movhps,0x00,5,1,1,xmm2) // movhps (%5,%1,1),%%xmm2
+ MEMOPREG(movq,0x00,5,1,2,xmm3) // movq (%5,%1,2),%%xmm3
+ MEMOPREG(movhps,0x00,5,4,1,xmm3) // movhps (%5,%4,1),%%xmm3
+ "lea " MEMLEA4(0x00,5,1,4) ",%5 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "pavgb %%xmm3,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "shufps $0x88,%%xmm1,%%xmm0 \n"
+ "shufps $0xdd,%%xmm1,%%xmm2 \n"
+ "pavgb %%xmm2,%%xmm0 \n"
+ "sub $0x4,%3 \n"
+ "movdqa %%xmm0," MEMACCESS(2) " \n"
+ "lea " MEMLEA(0x10,2) ",%2 \n"
+ "jg 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(src_stepx_x4), // %1
+ "+r"(dst_argb), // %2
+ "+rm"(dst_width), // %3
+ "+r"(src_stepx_x12), // %4
+ "+r"(row1) // %5
+ :
+ : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14"
+#endif
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3"
+#endif
+ );
+}
+
+void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
+ int dst_width, int x, int dx) {
+ intptr_t x0 = 0, x1 = 0;
+ asm volatile (
+ "movd %5,%%xmm2 \n"
+ "movd %6,%%xmm3 \n"
+ "pshufd $0x0,%%xmm2,%%xmm2 \n"
+ "pshufd $0x11,%%xmm3,%%xmm0 \n"
+ "paddd %%xmm0,%%xmm2 \n"
+ "paddd %%xmm3,%%xmm3 \n"
+ "pshufd $0x5,%%xmm3,%%xmm0 \n"
+ "paddd %%xmm0,%%xmm2 \n"
+ "paddd %%xmm3,%%xmm3 \n"
+ "pshufd $0x0,%%xmm3,%%xmm3 \n"
+ "pextrw $0x1,%%xmm2,%k0 \n"
+ "pextrw $0x3,%%xmm2,%k1 \n"
+ "cmp $0x0,%4 \n"
+ "jl 99f \n"
+ "sub $0x4,%4 \n"
+ "jl 49f \n"
+ ".p2align 2 \n"
+ BUNDLEALIGN
+ "40: \n"
+ MEMOPREG(movd,0x00,3,0,4,xmm0) // movd (%3,%0,4),%%xmm0
+ MEMOPREG(movd,0x00,3,1,4,xmm1) // movd (%3,%1,4),%%xmm1
+ "pextrw $0x5,%%xmm2,%k0 \n"
+ "pextrw $0x7,%%xmm2,%k1 \n"
+ "paddd %%xmm3,%%xmm2 \n"
+ "punpckldq %%xmm1,%%xmm0 \n"
+ MEMOPREG(movd,0x00,3,0,4,xmm1) // movd (%3,%0,4),%%xmm1
+ MEMOPREG(movd,0x00,3,1,4,xmm4) // movd (%3,%1,4),%%xmm4
+ "pextrw $0x1,%%xmm2,%k0 \n"
+ "pextrw $0x3,%%xmm2,%k1 \n"
+ "punpckldq %%xmm4,%%xmm1 \n"
+ "punpcklqdq %%xmm1,%%xmm0 \n"
+ "sub $0x4,%4 \n"
+ "movdqu %%xmm0," MEMACCESS(2) " \n"
+ "lea " MEMLEA(0x10,2) ",%2 \n"
+ "jge 40b \n"
+
+ "49: \n"
+ "test $0x2,%4 \n"
+ "je 29f \n"
+ BUNDLEALIGN
+ MEMOPREG(movd,0x00,3,0,4,xmm0) // movd (%3,%0,4),%%xmm0
+ MEMOPREG(movd,0x00,3,1,4,xmm1) // movd (%3,%1,4),%%xmm1
+ "pextrw $0x5,%%xmm2,%k0 \n"
+ "punpckldq %%xmm1,%%xmm0 \n"
+ "movq %%xmm0," MEMACCESS(2) " \n"
+ "lea " MEMLEA(0x8,2) ",%2 \n"
+ "29: \n"
+ "test $0x1,%4 \n"
+ "je 99f \n"
+ MEMOPREG(movd,0x00,3,0,4,xmm0) // movd (%3,%0,4),%%xmm0
+ "movd %%xmm0," MEMACCESS(2) " \n"
+ "99: \n"
+ : "+a"(x0), // %0
+ "+d"(x1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(src_argb), // %3
+ "+r"(dst_width) // %4
+ : "rm"(x), // %5
+ "rm"(dx) // %6
+ : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14"
+#endif
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
+#endif
+ );
+}
+
+// Reads 4 pixels, duplicates them and writes 8 pixels.
+// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
+void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
+ int dst_width, int /* x */, int /* dx */) {
+ asm volatile (
+ ".p2align 2 \n"
+ BUNDLEALIGN
+ "1: \n"
+ "movdqa " MEMACCESS(1) ",%%xmm0 \n"
+ "lea " MEMLEA(0x10,1) ",%1 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "punpckldq %%xmm0,%%xmm0 \n"
+ "punpckhdq %%xmm1,%%xmm1 \n"
+ "sub $0x8,%2 \n"
+ "movdqa %%xmm0," MEMACCESS(0) " \n"
+ "movdqa %%xmm1," MEMACCESS2(0x10,0) " \n"
+ "lea " MEMLEA(0x20,0) ",%0 \n"
+ "jg 1b \n"
+
+ : "+r"(dst_argb), // %0
+ "+r"(src_argb), // %1
+ "+r"(dst_width) // %2
+ :
+ : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14"
+#endif
+#if defined(__SSE2__)
+ , "xmm0", "xmm1"
+#endif
+ );
+}
+
+// Shuffle table for arranging 2 pixels into pairs for pmaddubsw
+static uvec8 kShuffleColARGB = {
+ 0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel
+ 8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel
+};
+
+// Shuffle table for duplicating 2 fractions into 8 bytes each
+static uvec8 kShuffleFractions = {
+ 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
+};
+
+// Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version
+void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
+ int dst_width, int x, int dx) {
+ intptr_t x0 = 0, x1 = 0;
+ asm volatile (
+ "movdqa %0,%%xmm4 \n"
+ "movdqa %1,%%xmm5 \n"
+ :
+ : "m"(kShuffleColARGB), // %0
+ "m"(kShuffleFractions) // %1
+ );
+
+ asm volatile (
+ "movd %5,%%xmm2 \n"
+ "movd %6,%%xmm3 \n"
+ "pcmpeqb %%xmm6,%%xmm6 \n"
+ "psrlw $0x9,%%xmm6 \n"
+ "pextrw $0x1,%%xmm2,%k3 \n"
+ "sub $0x2,%2 \n"
+ "jl 29f \n"
+ "movdqa %%xmm2,%%xmm0 \n"
+ "paddd %%xmm3,%%xmm0 \n"
+ "punpckldq %%xmm0,%%xmm2 \n"
+ "punpckldq %%xmm3,%%xmm3 \n"
+ "paddd %%xmm3,%%xmm3 \n"
+ "pextrw $0x3,%%xmm2,%k4 \n"
+
+ ".p2align 2 \n"
+ BUNDLEALIGN
+ "2: \n"
+ "movdqa %%xmm2,%%xmm1 \n"
+ "paddd %%xmm3,%%xmm2 \n"
+ MEMOPREG(movq,0x00,1,3,4,xmm0) // movq (%1,%3,4),%%xmm0
+ "psrlw $0x9,%%xmm1 \n"
+ BUNDLEALIGN
+ MEMOPREG(movhps,0x00,1,4,4,xmm0) // movhps (%1,%4,4),%%xmm0
+ "pshufb %%xmm5,%%xmm1 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "pxor %%xmm6,%%xmm1 \n"
+ "pmaddubsw %%xmm1,%%xmm0 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "pextrw $0x1,%%xmm2,%k3 \n"
+ "pextrw $0x3,%%xmm2,%k4 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movq %%xmm0," MEMACCESS(0) " \n"
+ "lea " MEMLEA(0x8,0) ",%0 \n"
+ "sub $0x2,%2 \n"
+ "jge 2b \n"
+
+ ".p2align 2 \n"
+ BUNDLEALIGN
+ "29: \n"
+ "add $0x1,%2 \n"
+ "jl 99f \n"
+ "psrlw $0x9,%%xmm2 \n"
+ BUNDLEALIGN
+ MEMOPREG(movq,0x00,1,3,4,xmm0) // movq (%1,%3,4),%%xmm0
+ "pshufb %%xmm5,%%xmm2 \n"
+ "pshufb %%xmm4,%%xmm0 \n"
+ "pxor %%xmm6,%%xmm2 \n"
+ "pmaddubsw %%xmm2,%%xmm0 \n"
+ "psrlw $0x7,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "movd %%xmm0," MEMACCESS(0) " \n"
+
+ ".p2align 2 \n"
+ "99: \n"
+ : "+r"(dst_argb), // %0
+ "+r"(src_argb), // %1
+ "+rm"(dst_width), // %2
+ "+r"(x0), // %3
+ "+r"(x1) // %4
+ : "rm"(x), // %5
+ "rm"(dx) // %6
+ : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+ , "r14"
+#endif
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+#endif
+ );
+}
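The SSSE3 filter above blends each pair of neighbouring source pixels with a
7-bit fraction taken from bits 15..9 of x: per channel it effectively computes
(p0 * (127 - f) + p1 * f) >> 7, which is what the pshufb/pxor/pmaddubsw sequence
sets up. A hedged C++ sketch of that blend (names are illustrative, not from the
patch):

#include <stdint.h>

// Per-channel bilinear blend with a 7-bit fraction, as set up by the
// pshufb/pxor/pmaddubsw sequence above (illustrative sketch).
static uint8_t BlendChannelSketch(uint8_t p0, uint8_t p1, int f /* 0..127 */) {
  return (uint8_t)((p0 * (127 - f) + p1 * f) >> 7);
}

static void ScaleARGBFilterColsSketch(uint8_t* dst_argb, const uint8_t* src_argb,
                                      int dst_width, int x, int dx) {
  for (int i = 0; i < dst_width; ++i) {
    int xi = x >> 16;               // integer source index
    int f = (x >> 9) & 0x7f;        // top 7 bits of the fraction
    const uint8_t* p0 = src_argb + xi * 4;
    const uint8_t* p1 = p0 + 4;
    for (int c = 0; c < 4; ++c) {   // blend B, G, R, A independently
      dst_argb[i * 4 + c] = BlendChannelSketch(p0[c], p1[c], f);
    }
    x += dx;
  }
}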
+
+#endif // defined(__x86_64__) || defined(__i386__)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/chromium/third_party/libyuv/source/scale_win.cc b/chromium/third_party/libyuv/source/scale_win.cc
new file mode 100644
index 00000000000..76f5f4b4b4f
--- /dev/null
+++ b/chromium/third_party/libyuv/source/scale_win.cc
@@ -0,0 +1,1289 @@
+/*
+ * Copyright 2013 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for Visual C x86.
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
+
+// Offsets for source bytes 0 to 9
+static uvec8 kShuf0 =
+ { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 };
+
+// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
+static uvec8 kShuf1 =
+ { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 };
+
+// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
+static uvec8 kShuf2 =
+ { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 };
+
+// Offsets for source bytes 0 to 10
+static uvec8 kShuf01 =
+ { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 };
+
+// Offsets for source bytes 10 to 21 with 8 subtracted = 2 to 13.
+static uvec8 kShuf11 =
+ { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 };
+
+// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
+static uvec8 kShuf21 =
+ { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 };
+
+// Coefficients for source bytes 0 to 10
+static uvec8 kMadd01 =
+ { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 };
+
+// Coefficients for source bytes 10 to 21
+static uvec8 kMadd11 =
+ { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 };
+
+// Coefficients for source bytes 21 to 31
+static uvec8 kMadd21 =
+ { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 };
+
+// Rounding constant added before the >> 2 in the 3/4 box filters.
+static vec16 kRound34 =
+ { 2, 2, 2, 2, 2, 2, 2, 2 };
+
+static uvec8 kShuf38a =
+ { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
+
+static uvec8 kShuf38b =
+ { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 };
+
+// Arrange words 0,3,6 into 0,1,2
+static uvec8 kShufAc =
+ { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
+
+// Arrange words 0,3,6 into 3,4,5
+static uvec8 kShufAc3 =
+ { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 };
+
+// Scaling values for boxes of 3x3 and 2x3
+static uvec16 kScaleAc33 =
+ { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 };
+
+// Arrange first value for pixels 0,1,2,3,4,5
+static uvec8 kShufAb0 =
+ { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 };
+
+// Arrange second value for pixels 0,1,2,3,4,5
+static uvec8 kShufAb1 =
+ { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 };
+
+// Arrange third value for pixels 0,1,2,3,4,5
+static uvec8 kShufAb2 =
+ { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };
+
+// Scaling values for boxes of 3x2 and 2x2
+static uvec16 kScaleAb2 =
+ { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
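The kScaleAc33 and kScaleAb2 tables hold 16-bit fixed-point reciprocals: pmulhuw
multiplies each word by 65536/n and keeps the high 16 bits of the product, which
approximates division by n (9, 6, 3 or 2) without an integer divide. A small
stand-alone check of that identity (not part of the patch):

#include <stdint.h>
#include <stdio.h>

int main(void) {
  const uint16_t kRecip9 = 65536 / 9;   // 7281, the same value as in kScaleAc33
  uint16_t sum = 9 * 200;               // nine pixels of value 200 summed
  // pmulhuw keeps the high 16 bits of the 32-bit product: approximately sum / 9.
  uint16_t avg = (uint16_t)(((uint32_t)sum * kRecip9) >> 16);
  printf("%u\n", (unsigned)avg);        // prints 199 (truncating average)
  return 0;
}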
+
+// Reads 32 pixels, throws half away and writes 16 pixels.
+// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
+__declspec(naked) __declspec(align(16))
+void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ __asm {
+ mov eax, [esp + 4] // src_ptr
+ // src_stride ignored
+ mov edx, [esp + 12] // dst_ptr
+ mov ecx, [esp + 16] // dst_width
+
+ align 4
+ wloop:
+ movdqa xmm0, [eax]
+ movdqa xmm1, [eax + 16]
+ lea eax, [eax + 32]
+ psrlw xmm0, 8 // isolate odd pixels.
+ psrlw xmm1, 8
+ packuswb xmm0, xmm1
+ sub ecx, 16
+ movdqa [edx], xmm0
+ lea edx, [edx + 16]
+ jg wloop
+
+ ret
+ }
+}
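The psrlw/packuswb pair above keeps the high byte of every 16-bit lane, i.e.
every second (odd) source byte; the plain C++ equivalent of this down-by-2 point
sampling is simply (sketch, not part of the patch):

#include <stdint.h>
#include <stddef.h>

// Down-by-2 point sampling: keep every second (odd) source byte.
static void ScaleRowDown2Sketch(const uint8_t* src_ptr, ptrdiff_t /* src_stride */,
                                uint8_t* dst_ptr, int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    dst_ptr[x] = src_ptr[2 * x + 1];
  }
}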
+
+// Blends 32x1 rectangle to 16x1.
+// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
+__declspec(naked) __declspec(align(16))
+void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ __asm {
+ mov eax, [esp + 4] // src_ptr
+ // src_stride
+ mov edx, [esp + 12] // dst_ptr
+ mov ecx, [esp + 16] // dst_width
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
+ psrlw xmm5, 8
+
+ align 4
+ wloop:
+ movdqa xmm0, [eax]
+ movdqa xmm1, [eax + 16]
+ lea eax, [eax + 32]
+
+ movdqa xmm2, xmm0 // average columns (32 to 16 pixels)
+ psrlw xmm0, 8
+ movdqa xmm3, xmm1
+ psrlw xmm1, 8
+ pand xmm2, xmm5
+ pand xmm3, xmm5
+ pavgw xmm0, xmm2
+ pavgw xmm1, xmm3
+ packuswb xmm0, xmm1
+
+ sub ecx, 16
+ movdqa [edx], xmm0
+ lea edx, [edx + 16]
+ jg wloop
+
+ ret
+ }
+}
+
+// Blends 32x2 rectangle to 16x1.
+// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
+__declspec(naked) __declspec(align(16))
+void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // src_ptr
+ mov esi, [esp + 4 + 8] // src_stride
+ mov edx, [esp + 4 + 12] // dst_ptr
+ mov ecx, [esp + 4 + 16] // dst_width
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
+ psrlw xmm5, 8
+
+ align 4
+ wloop:
+ movdqa xmm0, [eax]
+ movdqa xmm1, [eax + 16]
+ movdqa xmm2, [eax + esi]
+ movdqa xmm3, [eax + esi + 16]
+ lea eax, [eax + 32]
+ pavgb xmm0, xmm2 // average rows
+ pavgb xmm1, xmm3
+
+ movdqa xmm2, xmm0 // average columns (32 to 16 pixels)
+ psrlw xmm0, 8
+ movdqa xmm3, xmm1
+ psrlw xmm1, 8
+ pand xmm2, xmm5
+ pand xmm3, xmm5
+ pavgw xmm0, xmm2
+ pavgw xmm1, xmm3
+ packuswb xmm0, xmm1
+
+ sub ecx, 16
+ movdqa [edx], xmm0
+ lea edx, [edx + 16]
+ jg wloop
+
+ pop esi
+ ret
+ }
+}
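ScaleRowDown2Box_SSE2 above averages the two rows with pavgb and then averages
odd and even columns with pavgw; up to the rounding of the staged pavg
instructions this is the 2x2 box filter sketched below (names are illustrative,
not from the patch):

#include <stdint.h>
#include <stddef.h>

// 2x2 box filter: each output byte is the rounded average of a 2x2 block
// taken from two consecutive source rows (illustrative sketch).
static void ScaleRowDown2BoxSketch(const uint8_t* src_ptr, ptrdiff_t src_stride,
                                   uint8_t* dst_ptr, int dst_width) {
  const uint8_t* row1 = src_ptr + src_stride;
  for (int x = 0; x < dst_width; ++x) {
    dst_ptr[x] = (uint8_t)((src_ptr[2 * x] + src_ptr[2 * x + 1] +
                            row1[2 * x] + row1[2 * x + 1] + 2) >> 2);
  }
}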
+
+// Reads 32 pixels, throws half away and writes 16 pixels.
+// No alignment requirement: uses movdqu for unaligned src_ptr and dst_ptr.
+__declspec(naked) __declspec(align(16))
+void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ __asm {
+ mov eax, [esp + 4] // src_ptr
+ // src_stride ignored
+ mov edx, [esp + 12] // dst_ptr
+ mov ecx, [esp + 16] // dst_width
+
+ align 4
+ wloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ lea eax, [eax + 32]
+ psrlw xmm0, 8 // isolate odd pixels.
+ psrlw xmm1, 8
+ packuswb xmm0, xmm1
+ sub ecx, 16
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ jg wloop
+
+ ret
+ }
+}
+
+// Blends 32x1 rectangle to 16x1.
+// No alignment requirement: uses movdqu for unaligned src_ptr and dst_ptr.
+__declspec(naked) __declspec(align(16))
+void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr, ptrdiff_t,
+ uint8* dst_ptr, int dst_width) {
+ __asm {
+ mov eax, [esp + 4] // src_ptr
+ // src_stride
+ mov edx, [esp + 12] // dst_ptr
+ mov ecx, [esp + 16] // dst_width
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
+ psrlw xmm5, 8
+
+ align 4
+ wloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ lea eax, [eax + 32]
+
+ movdqa xmm2, xmm0 // average columns (32 to 16 pixels)
+ psrlw xmm0, 8
+ movdqa xmm3, xmm1
+ psrlw xmm1, 8
+ pand xmm2, xmm5
+ pand xmm3, xmm5
+ pavgw xmm0, xmm2
+ pavgw xmm1, xmm3
+ packuswb xmm0, xmm1
+
+ sub ecx, 16
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ jg wloop
+
+ ret
+ }
+}
+
+// Blends 32x2 rectangle to 16x1.
+// No alignment requirement: uses movdqu for unaligned src_ptr and dst_ptr.
+__declspec(naked) __declspec(align(16))
+void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // src_ptr
+ mov esi, [esp + 4 + 8] // src_stride
+ mov edx, [esp + 4 + 12] // dst_ptr
+ mov ecx, [esp + 4 + 16] // dst_width
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
+ psrlw xmm5, 8
+
+ align 4
+ wloop:
+ movdqu xmm0, [eax]
+ movdqu xmm1, [eax + 16]
+ movdqu xmm2, [eax + esi]
+ movdqu xmm3, [eax + esi + 16]
+ lea eax, [eax + 32]
+ pavgb xmm0, xmm2 // average rows
+ pavgb xmm1, xmm3
+
+ movdqa xmm2, xmm0 // average columns (32 to 16 pixels)
+ psrlw xmm0, 8
+ movdqa xmm3, xmm1
+ psrlw xmm1, 8
+ pand xmm2, xmm5
+ pand xmm3, xmm5
+ pavgw xmm0, xmm2
+ pavgw xmm1, xmm3
+ packuswb xmm0, xmm1
+
+ sub ecx, 16
+ movdqu [edx], xmm0
+ lea edx, [edx + 16]
+ jg wloop
+
+ pop esi
+ ret
+ }
+}
+
+// Point samples 32 pixels to 8 pixels.
+// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
+__declspec(naked) __declspec(align(16))
+void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ __asm {
+ mov eax, [esp + 4] // src_ptr
+ // src_stride ignored
+ mov edx, [esp + 12] // dst_ptr
+ mov ecx, [esp + 16] // dst_width
+ pcmpeqb xmm5, xmm5 // generate mask 0x00ff0000
+ psrld xmm5, 24
+ pslld xmm5, 16
+
+ align 4
+ wloop:
+ movdqa xmm0, [eax]
+ movdqa xmm1, [eax + 16]
+ lea eax, [eax + 32]
+ pand xmm0, xmm5
+ pand xmm1, xmm5
+ packuswb xmm0, xmm1
+ psrlw xmm0, 8
+ packuswb xmm0, xmm0
+ sub ecx, 8
+ movq qword ptr [edx], xmm0
+ lea edx, [edx + 8]
+ jg wloop
+
+ ret
+ }
+}
+
+// Blends 32x4 rectangle to 8x1.
+// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
+__declspec(naked) __declspec(align(16))
+void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ __asm {
+ push esi
+ push edi
+ mov eax, [esp + 8 + 4] // src_ptr
+ mov esi, [esp + 8 + 8] // src_stride
+ mov edx, [esp + 8 + 12] // dst_ptr
+ mov ecx, [esp + 8 + 16] // dst_width
+ lea edi, [esi + esi * 2] // src_stride * 3
+ pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff
+ psrlw xmm7, 8
+
+ align 4
+ wloop:
+ movdqa xmm0, [eax]
+ movdqa xmm1, [eax + 16]
+ movdqa xmm2, [eax + esi]
+ movdqa xmm3, [eax + esi + 16]
+ pavgb xmm0, xmm2 // average rows
+ pavgb xmm1, xmm3
+ movdqa xmm2, [eax + esi * 2]
+ movdqa xmm3, [eax + esi * 2 + 16]
+ movdqa xmm4, [eax + edi]
+ movdqa xmm5, [eax + edi + 16]
+ lea eax, [eax + 32]
+ pavgb xmm2, xmm4
+ pavgb xmm3, xmm5
+ pavgb xmm0, xmm2
+ pavgb xmm1, xmm3
+
+ movdqa xmm2, xmm0 // average columns (32 to 16 pixels)
+ psrlw xmm0, 8
+ movdqa xmm3, xmm1
+ psrlw xmm1, 8
+ pand xmm2, xmm7
+ pand xmm3, xmm7
+ pavgw xmm0, xmm2
+ pavgw xmm1, xmm3
+ packuswb xmm0, xmm1
+
+ movdqa xmm2, xmm0 // average columns (16 to 8 pixels)
+ psrlw xmm0, 8
+ pand xmm2, xmm7
+ pavgw xmm0, xmm2
+ packuswb xmm0, xmm0
+
+ sub ecx, 8
+ movq qword ptr [edx], xmm0
+ lea edx, [edx + 8]
+ jg wloop
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+// Point samples 32 pixels to 24 pixels.
+// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
+// Then shuffled to do the scaling.
+
+// Note that movdqa+palign may be better than movdqu.
+// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
+__declspec(naked) __declspec(align(16))
+void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ __asm {
+ mov eax, [esp + 4] // src_ptr
+ // src_stride ignored
+ mov edx, [esp + 12] // dst_ptr
+ mov ecx, [esp + 16] // dst_width
+ movdqa xmm3, kShuf0
+ movdqa xmm4, kShuf1
+ movdqa xmm5, kShuf2
+
+ align 4
+ wloop:
+ movdqa xmm0, [eax]
+ movdqa xmm1, [eax + 16]
+ lea eax, [eax + 32]
+ movdqa xmm2, xmm1
+ palignr xmm1, xmm0, 8
+ pshufb xmm0, xmm3
+ pshufb xmm1, xmm4
+ pshufb xmm2, xmm5
+ movq qword ptr [edx], xmm0
+ movq qword ptr [edx + 8], xmm1
+ movq qword ptr [edx + 16], xmm2
+ lea edx, [edx + 24]
+ sub ecx, 24
+ jg wloop
+
+ ret
+ }
+}
+
+// Blends 32x2 rectangle to 24x1
+// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
+// Then shuffled to do the scaling.
+
+// Register usage:
+// xmm0 src_row 0
+// xmm1 src_row 1
+// xmm2 shuf 0
+// xmm3 shuf 1
+// xmm4 shuf 2
+// xmm5 madd 0
+// xmm6 madd 1
+// xmm7 kRound34
+
+// Note that movdqa+palign may be better than movdqu.
+// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
+__declspec(naked) __declspec(align(16))
+void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // src_ptr
+ mov esi, [esp + 4 + 8] // src_stride
+ mov edx, [esp + 4 + 12] // dst_ptr
+ mov ecx, [esp + 4 + 16] // dst_width
+ movdqa xmm2, kShuf01
+ movdqa xmm3, kShuf11
+ movdqa xmm4, kShuf21
+ movdqa xmm5, kMadd01
+ movdqa xmm6, kMadd11
+ movdqa xmm7, kRound34
+
+ align 4
+ wloop:
+ movdqa xmm0, [eax] // pixels 0..7
+ movdqa xmm1, [eax + esi]
+ pavgb xmm0, xmm1
+ pshufb xmm0, xmm2
+ pmaddubsw xmm0, xmm5
+ paddsw xmm0, xmm7
+ psrlw xmm0, 2
+ packuswb xmm0, xmm0
+ movq qword ptr [edx], xmm0
+ movdqu xmm0, [eax + 8] // pixels 8..15
+ movdqu xmm1, [eax + esi + 8]
+ pavgb xmm0, xmm1
+ pshufb xmm0, xmm3
+ pmaddubsw xmm0, xmm6
+ paddsw xmm0, xmm7
+ psrlw xmm0, 2
+ packuswb xmm0, xmm0
+ movq qword ptr [edx + 8], xmm0
+ movdqa xmm0, [eax + 16] // pixels 16..23
+ movdqa xmm1, [eax + esi + 16]
+ lea eax, [eax + 32]
+ pavgb xmm0, xmm1
+ pshufb xmm0, xmm4
+ movdqa xmm1, kMadd21
+ pmaddubsw xmm0, xmm1
+ paddsw xmm0, xmm7
+ psrlw xmm0, 2
+ packuswb xmm0, xmm0
+ sub ecx, 24
+ movq qword ptr [edx + 16], xmm0
+ lea edx, [edx + 24]
+ jg wloop
+
+ pop esi
+ ret
+ }
+}
+
+// Note that movdqa+palign may be better than movdqu.
+// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
+__declspec(naked) __declspec(align(16))
+void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // src_ptr
+ mov esi, [esp + 4 + 8] // src_stride
+ mov edx, [esp + 4 + 12] // dst_ptr
+ mov ecx, [esp + 4 + 16] // dst_width
+ movdqa xmm2, kShuf01
+ movdqa xmm3, kShuf11
+ movdqa xmm4, kShuf21
+ movdqa xmm5, kMadd01
+ movdqa xmm6, kMadd11
+ movdqa xmm7, kRound34
+
+ align 4
+ wloop:
+ movdqa xmm0, [eax] // pixels 0..7
+ movdqa xmm1, [eax + esi]
+ pavgb xmm1, xmm0
+ pavgb xmm0, xmm1
+ pshufb xmm0, xmm2
+ pmaddubsw xmm0, xmm5
+ paddsw xmm0, xmm7
+ psrlw xmm0, 2
+ packuswb xmm0, xmm0
+ movq qword ptr [edx], xmm0
+ movdqu xmm0, [eax + 8] // pixels 8..15
+ movdqu xmm1, [eax + esi + 8]
+ pavgb xmm1, xmm0
+ pavgb xmm0, xmm1
+ pshufb xmm0, xmm3
+ pmaddubsw xmm0, xmm6
+ paddsw xmm0, xmm7
+ psrlw xmm0, 2
+ packuswb xmm0, xmm0
+ movq qword ptr [edx + 8], xmm0
+ movdqa xmm0, [eax + 16] // pixels 16..23
+ movdqa xmm1, [eax + esi + 16]
+ lea eax, [eax + 32]
+ pavgb xmm1, xmm0
+ pavgb xmm0, xmm1
+ pshufb xmm0, xmm4
+ movdqa xmm1, kMadd21
+ pmaddubsw xmm0, xmm1
+ paddsw xmm0, xmm7
+ psrlw xmm0, 2
+ packuswb xmm0, xmm0
+ sub ecx, 24
+ movq qword ptr [edx + 16], xmm0
+ lea edx, [edx+24]
+ jg wloop
+
+ pop esi
+ ret
+ }
+}
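Horizontally, the two 3/4 box filters above map each group of four source pixels
to three outputs using the 3:1, 2:2 and 1:3 weights held in kMadd01, kMadd11 and
kMadd21, with kRound34 added before the shift. A sketch of just that horizontal
step (the 1/2 or 1/4:3/4 vertical row blend of the _1_ and _0_ variants is
omitted; names are illustrative, not from the patch):

#include <stdint.h>
#include <stddef.h>

// Horizontal 4 -> 3 weighting used by the 3/4 scalers (illustrative sketch;
// dst_width is assumed to be a multiple of 3, as in the SSSE3 code).
static void ScaleRowDown34Sketch(const uint8_t* src_ptr, ptrdiff_t /* src_stride */,
                                 uint8_t* dst_ptr, int dst_width) {
  for (int x = 0; x < dst_width; x += 3) {
    dst_ptr[x + 0] = (uint8_t)((3 * src_ptr[0] + 1 * src_ptr[1] + 2) >> 2);
    dst_ptr[x + 1] = (uint8_t)((2 * src_ptr[1] + 2 * src_ptr[2] + 2) >> 2);
    dst_ptr[x + 2] = (uint8_t)((1 * src_ptr[2] + 3 * src_ptr[3] + 2) >> 2);
    src_ptr += 4;
  }
}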
+
+// 3/8 point sampler
+
+// Scale 32 pixels to 12
+__declspec(naked) __declspec(align(16))
+void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ __asm {
+ mov eax, [esp + 4] // src_ptr
+ // src_stride ignored
+ mov edx, [esp + 12] // dst_ptr
+ mov ecx, [esp + 16] // dst_width
+ movdqa xmm4, kShuf38a
+ movdqa xmm5, kShuf38b
+
+ align 4
+ xloop:
+ movdqa xmm0, [eax] // 16 pixels -> 0,1,2,3,4,5
+ movdqa xmm1, [eax + 16] // 16 pixels -> 6,7,8,9,10,11
+ lea eax, [eax + 32]
+ pshufb xmm0, xmm4
+ pshufb xmm1, xmm5
+ paddusb xmm0, xmm1
+
+ sub ecx, 12
+ movq qword ptr [edx], xmm0 // write 12 pixels
+ movhlps xmm1, xmm0
+ movd [edx + 8], xmm1
+ lea edx, [edx + 12]
+ jg xloop
+
+ ret
+ }
+}
+
+// Scale 16x3 pixels to 6x1 with interpolation
+__declspec(naked) __declspec(align(16))
+void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // src_ptr
+ mov esi, [esp + 4 + 8] // src_stride
+ mov edx, [esp + 4 + 12] // dst_ptr
+ mov ecx, [esp + 4 + 16] // dst_width
+ movdqa xmm2, kShufAc
+ movdqa xmm3, kShufAc3
+ movdqa xmm4, kScaleAc33
+ pxor xmm5, xmm5
+
+ align 4
+ xloop:
+ movdqa xmm0, [eax] // sum up 3 rows into xmm0/1
+ movdqa xmm6, [eax + esi]
+ movhlps xmm1, xmm0
+ movhlps xmm7, xmm6
+ punpcklbw xmm0, xmm5
+ punpcklbw xmm1, xmm5
+ punpcklbw xmm6, xmm5
+ punpcklbw xmm7, xmm5
+ paddusw xmm0, xmm6
+ paddusw xmm1, xmm7
+ movdqa xmm6, [eax + esi * 2]
+ lea eax, [eax + 16]
+ movhlps xmm7, xmm6
+ punpcklbw xmm6, xmm5
+ punpcklbw xmm7, xmm5
+ paddusw xmm0, xmm6
+ paddusw xmm1, xmm7
+
+ movdqa xmm6, xmm0 // 8 pixels -> 0,1,2 of xmm6
+ psrldq xmm0, 2
+ paddusw xmm6, xmm0
+ psrldq xmm0, 2
+ paddusw xmm6, xmm0
+ pshufb xmm6, xmm2
+
+ movdqa xmm7, xmm1 // 8 pixels -> 3,4,5 of xmm6
+ psrldq xmm1, 2
+ paddusw xmm7, xmm1
+ psrldq xmm1, 2
+ paddusw xmm7, xmm1
+ pshufb xmm7, xmm3
+ paddusw xmm6, xmm7
+
+ pmulhuw xmm6, xmm4 // divide by 9,9,6, 9,9,6
+ packuswb xmm6, xmm6
+
+ sub ecx, 6
+ movd [edx], xmm6 // write 6 pixels
+ psrlq xmm6, 16
+ movd [edx + 2], xmm6
+ lea edx, [edx + 6]
+ jg xloop
+
+ pop esi
+ ret
+ }
+}
+
+// Scale 16x2 pixels to 6x1 with interpolation
+__declspec(naked) __declspec(align(16))
+void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
+ ptrdiff_t src_stride,
+ uint8* dst_ptr, int dst_width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // src_ptr
+ mov esi, [esp + 4 + 8] // src_stride
+ mov edx, [esp + 4 + 12] // dst_ptr
+ mov ecx, [esp + 4 + 16] // dst_width
+ movdqa xmm2, kShufAb0
+ movdqa xmm3, kShufAb1
+ movdqa xmm4, kShufAb2
+ movdqa xmm5, kScaleAb2
+
+ align 4
+ xloop:
+ movdqa xmm0, [eax] // average 2 rows into xmm0
+ pavgb xmm0, [eax + esi]
+ lea eax, [eax + 16]
+
+ movdqa xmm1, xmm0 // 16 pixels -> 0,1,2,3,4,5 of xmm1
+ pshufb xmm1, xmm2
+ movdqa xmm6, xmm0
+ pshufb xmm6, xmm3
+ paddusw xmm1, xmm6
+ pshufb xmm0, xmm4
+ paddusw xmm1, xmm0
+
+ pmulhuw xmm1, xmm5 // divide by 3,3,2, 3,3,2
+ packuswb xmm1, xmm1
+
+ sub ecx, 6
+ movd [edx], xmm1 // write 6 pixels
+ psrlq xmm1, 16
+ movd [edx + 2], xmm1
+ lea edx, [edx + 6]
+ jg xloop
+
+ pop esi
+ ret
+ }
+}
+
+// Reads 16xN bytes and produces 16 shorts at a time.
+// TODO(fbarchard): Make this handle 4xN bytes for any width ARGB.
+__declspec(naked) __declspec(align(16))
+void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+ uint16* dst_ptr, int src_width,
+ int src_height) {
+ __asm {
+ push esi
+ push edi
+ push ebx
+ push ebp
+ mov esi, [esp + 16 + 4] // src_ptr
+ mov edx, [esp + 16 + 8] // src_stride
+ mov edi, [esp + 16 + 12] // dst_ptr
+    mov        ecx, [esp + 16 + 16]   // src_width
+ mov ebx, [esp + 16 + 20] // height
+ pxor xmm4, xmm4
+ dec ebx
+
+ align 4
+ xloop:
+ // first row
+ movdqa xmm0, [esi]
+ lea eax, [esi + edx]
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, xmm4
+ punpckhbw xmm1, xmm4
+ lea esi, [esi + 16]
+ mov ebp, ebx
+ test ebp, ebp
+ je ydone
+
+ // sum remaining rows
+ align 4
+ yloop:
+ movdqa xmm2, [eax] // read 16 pixels
+ lea eax, [eax + edx] // advance to next row
+ movdqa xmm3, xmm2
+ punpcklbw xmm2, xmm4
+ punpckhbw xmm3, xmm4
+ paddusw xmm0, xmm2 // sum 16 words
+ paddusw xmm1, xmm3
+ sub ebp, 1
+ jg yloop
+
+ align 4
+ ydone:
+ movdqa [edi], xmm0
+ movdqa [edi + 16], xmm1
+ lea edi, [edi + 32]
+
+ sub ecx, 16
+ jg xloop
+
+ pop ebp
+ pop ebx
+ pop edi
+ pop esi
+ ret
+ }
+}
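ScaleAddRows_SSE2 above widens each byte to 16 bits and accumulates src_height
rows column-wise; the caller later divides by the box area to form the average.
A plain C++ sketch of the same accumulation (the saturation of paddusw is
ignored; names are illustrative, not from the patch):

#include <stdint.h>
#include <stddef.h>

// Column-wise sum of src_height rows into 16-bit accumulators
// (illustrative sketch of the loop structure above).
static void ScaleAddRowsSketch(const uint8_t* src_ptr, ptrdiff_t src_stride,
                               uint16_t* dst_ptr, int src_width, int src_height) {
  for (int x = 0; x < src_width; ++x) {
    uint16_t sum = 0;
    const uint8_t* s = src_ptr + x;
    for (int y = 0; y < src_height; ++y) {
      sum = (uint16_t)(sum + s[0]);
      s += src_stride;
    }
    dst_ptr[x] = sum;
  }
}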
+
+// Bilinear column filtering. SSSE3 version.
+// TODO(fbarchard): Port to Neon
+// TODO(fbarchard): Switch the following:
+// xor ebx, ebx
+// mov bx, word ptr [esi + eax] // 2 source x0 pixels
+// To
+// movzx ebx, word ptr [esi + eax] // 2 source x0 pixels
+// when drmemory bug fixed.
+// https://code.google.com/p/drmemory/issues/detail?id=1396
+
+__declspec(naked) __declspec(align(16))
+void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
+ int dst_width, int x, int dx) {
+ __asm {
+ push ebx
+ push esi
+ push edi
+ mov edi, [esp + 12 + 4] // dst_ptr
+ mov esi, [esp + 12 + 8] // src_ptr
+ mov ecx, [esp + 12 + 12] // dst_width
+ movd xmm2, [esp + 12 + 16] // x
+ movd xmm3, [esp + 12 + 20] // dx
+ mov eax, 0x04040000 // shuffle to line up fractions with pixel.
+ movd xmm5, eax
+ pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction.
+ psrlw xmm6, 9
+ pextrw eax, xmm2, 1 // get x0 integer. preroll
+ sub ecx, 2
+ jl xloop29
+
+ movdqa xmm0, xmm2 // x1 = x0 + dx
+ paddd xmm0, xmm3
+ punpckldq xmm2, xmm0 // x0 x1
+ punpckldq xmm3, xmm3 // dx dx
+ paddd xmm3, xmm3 // dx * 2, dx * 2
+ pextrw edx, xmm2, 3 // get x1 integer. preroll
+
+ // 2 Pixel loop.
+ align 4
+ xloop2:
+ movdqa xmm1, xmm2 // x0, x1 fractions.
+ paddd xmm2, xmm3 // x += dx
+ movzx ebx, word ptr [esi + eax] // 2 source x0 pixels
+ movd xmm0, ebx
+ psrlw xmm1, 9 // 7 bit fractions.
+ movzx ebx, word ptr [esi + edx] // 2 source x1 pixels
+ movd xmm4, ebx
+ pshufb xmm1, xmm5 // 0011
+ punpcklwd xmm0, xmm4
+ pxor xmm1, xmm6 // 0..7f and 7f..0
+ pmaddubsw xmm0, xmm1 // 16 bit, 2 pixels.
+ pextrw eax, xmm2, 1 // get x0 integer. next iteration.
+ pextrw edx, xmm2, 3 // get x1 integer. next iteration.
+ psrlw xmm0, 7 // 8.7 fixed point to low 8 bits.
+ packuswb xmm0, xmm0 // 8 bits, 2 pixels.
+ movd ebx, xmm0
+ mov [edi], bx
+ lea edi, [edi + 2]
+ sub ecx, 2 // 2 pixels
+ jge xloop2
+
+ align 4
+ xloop29:
+
+ add ecx, 2 - 1
+ jl xloop99
+
+ // 1 pixel remainder
+ movzx ebx, word ptr [esi + eax] // 2 source x0 pixels
+ movd xmm0, ebx
+ psrlw xmm2, 9 // 7 bit fractions.
+ pshufb xmm2, xmm5 // 0011
+ pxor xmm2, xmm6 // 0..7f and 7f..0
+ pmaddubsw xmm0, xmm2 // 16 bit
+ psrlw xmm0, 7 // 8.7 fixed point to low 8 bits.
+ packuswb xmm0, xmm0 // 8 bits
+ movd ebx, xmm0
+ mov [edi], bl
+
+ align 4
+ xloop99:
+
+ pop edi
+ pop esi
+ pop ebx
+ ret
+ }
+}
+
+// Reads 16 pixels, duplicates them and writes 32 pixels.
+// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
+__declspec(naked) __declspec(align(16))
+void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
+ int dst_width, int /* x */, int /* dx */) {
+ __asm {
+ mov edx, [esp + 4] // dst_ptr
+ mov eax, [esp + 8] // src_ptr
+ mov ecx, [esp + 12] // dst_width
+
+ align 4
+ wloop:
+ movdqa xmm0, [eax]
+ lea eax, [eax + 16]
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, xmm0
+ punpckhbw xmm1, xmm1
+ sub ecx, 32
+ movdqa [edx], xmm0
+ movdqa [edx + 16], xmm1
+ lea edx, [edx + 32]
+ jg wloop
+
+ ret
+ }
+}
+
+// Reads 8 pixels, throws half away and writes 4 odd pixels (1, 3, 5, 7).
+// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
+__declspec(naked) __declspec(align(16))
+void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
+ ptrdiff_t /* src_stride */,
+ uint8* dst_argb, int dst_width) {
+ __asm {
+ mov eax, [esp + 4] // src_argb
+ // src_stride ignored
+ mov edx, [esp + 12] // dst_argb
+ mov ecx, [esp + 16] // dst_width
+
+ align 4
+ wloop:
+ movdqa xmm0, [eax]
+ movdqa xmm1, [eax + 16]
+ lea eax, [eax + 32]
+ shufps xmm0, xmm1, 0xdd
+ sub ecx, 4
+ movdqa [edx], xmm0
+ lea edx, [edx + 16]
+ jg wloop
+
+ ret
+ }
+}
+
+// Blends 8x1 rectangle to 4x1.
+// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
+__declspec(naked) __declspec(align(16))
+void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
+ ptrdiff_t /* src_stride */,
+ uint8* dst_argb, int dst_width) {
+ __asm {
+ mov eax, [esp + 4] // src_argb
+ // src_stride ignored
+ mov edx, [esp + 12] // dst_argb
+ mov ecx, [esp + 16] // dst_width
+
+ align 4
+ wloop:
+ movdqa xmm0, [eax]
+ movdqa xmm1, [eax + 16]
+ lea eax, [eax + 32]
+ movdqa xmm2, xmm0
+ shufps xmm0, xmm1, 0x88 // even pixels
+ shufps xmm2, xmm1, 0xdd // odd pixels
+ pavgb xmm0, xmm2
+ sub ecx, 4
+ movdqa [edx], xmm0
+ lea edx, [edx + 16]
+ jg wloop
+
+ ret
+ }
+}
+
+// Blends 8x2 rectangle to 4x1.
+// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
+__declspec(naked) __declspec(align(16))
+void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ uint8* dst_argb, int dst_width) {
+ __asm {
+ push esi
+ mov eax, [esp + 4 + 4] // src_argb
+ mov esi, [esp + 4 + 8] // src_stride
+ mov edx, [esp + 4 + 12] // dst_argb
+ mov ecx, [esp + 4 + 16] // dst_width
+
+ align 4
+ wloop:
+ movdqa xmm0, [eax]
+ movdqa xmm1, [eax + 16]
+ movdqa xmm2, [eax + esi]
+ movdqa xmm3, [eax + esi + 16]
+ lea eax, [eax + 32]
+ pavgb xmm0, xmm2 // average rows
+ pavgb xmm1, xmm3
+ movdqa xmm2, xmm0 // average columns (8 to 4 pixels)
+ shufps xmm0, xmm1, 0x88 // even pixels
+ shufps xmm2, xmm1, 0xdd // odd pixels
+ pavgb xmm0, xmm2
+ sub ecx, 4
+ movdqa [edx], xmm0
+ lea edx, [edx + 16]
+ jg wloop
+
+ pop esi
+ ret
+ }
+}
+
+// Reads 4 pixels at a time.
+// Alignment requirement: dst_argb 16 byte aligned.
+__declspec(naked) __declspec(align(16))
+void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
+ int src_stepx,
+ uint8* dst_argb, int dst_width) {
+ __asm {
+ push ebx
+ push edi
+ mov eax, [esp + 8 + 4] // src_argb
+ // src_stride ignored
+ mov ebx, [esp + 8 + 12] // src_stepx
+ mov edx, [esp + 8 + 16] // dst_argb
+ mov ecx, [esp + 8 + 20] // dst_width
+ lea ebx, [ebx * 4]
+ lea edi, [ebx + ebx * 2]
+
+ align 4
+ wloop:
+ movd xmm0, [eax]
+ movd xmm1, [eax + ebx]
+ punpckldq xmm0, xmm1
+ movd xmm2, [eax + ebx * 2]
+ movd xmm3, [eax + edi]
+ lea eax, [eax + ebx * 4]
+ punpckldq xmm2, xmm3
+ punpcklqdq xmm0, xmm2
+ sub ecx, 4
+ movdqa [edx], xmm0
+ lea edx, [edx + 16]
+ jg wloop
+
+ pop edi
+ pop ebx
+ ret
+ }
+}
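ScaleARGBRowDownEven_SSE2 above point-samples every src_stepx-th ARGB pixel,
four at a time; per pixel that is just a strided copy, as sketched below (names
are illustrative, not part of the patch):

#include <stdint.h>
#include <stddef.h>

// Strided ARGB point sampling: copy every src_stepx-th source pixel.
static void ScaleARGBRowDownEvenSketch(const uint8_t* src_argb,
                                       ptrdiff_t /* src_stride */,
                                       int src_stepx,
                                       uint8_t* dst_argb, int dst_width) {
  const uint32_t* src = (const uint32_t*)(const void*)src_argb;
  uint32_t* dst = (uint32_t*)(void*)dst_argb;
  for (int x = 0; x < dst_width; ++x) {
    dst[x] = src[x * src_stepx];
  }
}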
+
+// Blends four 2x2 to 4x1.
+// Alignment requirement: dst_argb 16 byte aligned.
+__declspec(naked) __declspec(align(16))
+void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
+ ptrdiff_t src_stride,
+ int src_stepx,
+ uint8* dst_argb, int dst_width) {
+ __asm {
+ push ebx
+ push esi
+ push edi
+ mov eax, [esp + 12 + 4] // src_argb
+ mov esi, [esp + 12 + 8] // src_stride
+ mov ebx, [esp + 12 + 12] // src_stepx
+ mov edx, [esp + 12 + 16] // dst_argb
+ mov ecx, [esp + 12 + 20] // dst_width
+ lea esi, [eax + esi] // row1 pointer
+ lea ebx, [ebx * 4]
+ lea edi, [ebx + ebx * 2]
+
+ align 4
+ wloop:
+ movq xmm0, qword ptr [eax] // row0 4 pairs
+ movhps xmm0, qword ptr [eax + ebx]
+ movq xmm1, qword ptr [eax + ebx * 2]
+ movhps xmm1, qword ptr [eax + edi]
+ lea eax, [eax + ebx * 4]
+ movq xmm2, qword ptr [esi] // row1 4 pairs
+ movhps xmm2, qword ptr [esi + ebx]
+ movq xmm3, qword ptr [esi + ebx * 2]
+ movhps xmm3, qword ptr [esi + edi]
+ lea esi, [esi + ebx * 4]
+ pavgb xmm0, xmm2 // average rows
+ pavgb xmm1, xmm3
+ movdqa xmm2, xmm0 // average columns (8 to 4 pixels)
+ shufps xmm0, xmm1, 0x88 // even pixels
+ shufps xmm2, xmm1, 0xdd // odd pixels
+ pavgb xmm0, xmm2
+ sub ecx, 4
+ movdqa [edx], xmm0
+ lea edx, [edx + 16]
+ jg wloop
+
+ pop edi
+ pop esi
+ pop ebx
+ ret
+ }
+}
+
+// Column scaling unfiltered. SSE2 version.
+__declspec(naked) __declspec(align(16))
+void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
+ int dst_width, int x, int dx) {
+ __asm {
+ push edi
+ push esi
+ mov edi, [esp + 8 + 4] // dst_argb
+ mov esi, [esp + 8 + 8] // src_argb
+ mov ecx, [esp + 8 + 12] // dst_width
+ movd xmm2, [esp + 8 + 16] // x
+ movd xmm3, [esp + 8 + 20] // dx
+
+ pshufd xmm2, xmm2, 0 // x0 x0 x0 x0
+ pshufd xmm0, xmm3, 0x11 // dx 0 dx 0
+ paddd xmm2, xmm0
+ paddd xmm3, xmm3 // 0, 0, 0, dx * 2
+ pshufd xmm0, xmm3, 0x05 // dx * 2, dx * 2, 0, 0
+ paddd xmm2, xmm0 // x3 x2 x1 x0
+ paddd xmm3, xmm3 // 0, 0, 0, dx * 4
+ pshufd xmm3, xmm3, 0 // dx * 4, dx * 4, dx * 4, dx * 4
+
+ pextrw eax, xmm2, 1 // get x0 integer.
+ pextrw edx, xmm2, 3 // get x1 integer.
+
+ cmp ecx, 0
+ jle xloop99
+ sub ecx, 4
+ jl xloop49
+
+ // 4 Pixel loop.
+ align 4
+ xloop4:
+ movd xmm0, [esi + eax * 4] // 1 source x0 pixels
+ movd xmm1, [esi + edx * 4] // 1 source x1 pixels
+ pextrw eax, xmm2, 5 // get x2 integer.
+ pextrw edx, xmm2, 7 // get x3 integer.
+ paddd xmm2, xmm3 // x += dx
+ punpckldq xmm0, xmm1 // x0 x1
+
+ movd xmm1, [esi + eax * 4] // 1 source x2 pixels
+ movd xmm4, [esi + edx * 4] // 1 source x3 pixels
+ pextrw eax, xmm2, 1 // get x0 integer. next iteration.
+ pextrw edx, xmm2, 3 // get x1 integer. next iteration.
+ punpckldq xmm1, xmm4 // x2 x3
+ punpcklqdq xmm0, xmm1 // x0 x1 x2 x3
+ sub ecx, 4 // 4 pixels
+ movdqu [edi], xmm0
+ lea edi, [edi + 16]
+ jge xloop4
+
+ align 4
+ xloop49:
+ test ecx, 2
+ je xloop29
+
+ // 2 Pixels.
+ movd xmm0, [esi + eax * 4] // 1 source x0 pixels
+ movd xmm1, [esi + edx * 4] // 1 source x1 pixels
+ pextrw eax, xmm2, 5 // get x2 integer.
+ punpckldq xmm0, xmm1 // x0 x1
+
+ movq qword ptr [edi], xmm0
+ lea edi, [edi + 8]
+
+ xloop29:
+ test ecx, 1
+ je xloop99
+
+ // 1 Pixels.
+ movd xmm0, [esi + eax * 4] // 1 source x2 pixels
+ movd dword ptr [edi], xmm0
+ align 4
+ xloop99:
+
+ pop esi
+ pop edi
+ ret
+ }
+}
+
+// Bilinear row filtering combines 2x1 -> 1x1. SSSE3 version.
+// TODO(fbarchard): Port to Neon
+
+// Shuffle table for arranging 2 pixels into pairs for pmaddubsw
+static uvec8 kShuffleColARGB = {
+ 0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel
+ 8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel
+};
+
+// Shuffle table for duplicating 2 fractions into 8 bytes each
+static uvec8 kShuffleFractions = {
+ 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
+};
+
+__declspec(naked) __declspec(align(16))
+void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
+ int dst_width, int x, int dx) {
+ __asm {
+ push esi
+ push edi
+ mov edi, [esp + 8 + 4] // dst_argb
+ mov esi, [esp + 8 + 8] // src_argb
+ mov ecx, [esp + 8 + 12] // dst_width
+ movd xmm2, [esp + 8 + 16] // x
+ movd xmm3, [esp + 8 + 20] // dx
+ movdqa xmm4, kShuffleColARGB
+ movdqa xmm5, kShuffleFractions
+ pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction.
+ psrlw xmm6, 9
+ pextrw eax, xmm2, 1 // get x0 integer. preroll
+ sub ecx, 2
+ jl xloop29
+
+ movdqa xmm0, xmm2 // x1 = x0 + dx
+ paddd xmm0, xmm3
+ punpckldq xmm2, xmm0 // x0 x1
+ punpckldq xmm3, xmm3 // dx dx
+ paddd xmm3, xmm3 // dx * 2, dx * 2
+ pextrw edx, xmm2, 3 // get x1 integer. preroll
+
+ // 2 Pixel loop.
+ align 4
+ xloop2:
+ movdqa xmm1, xmm2 // x0, x1 fractions.
+ paddd xmm2, xmm3 // x += dx
+ movq xmm0, qword ptr [esi + eax * 4] // 2 source x0 pixels
+ psrlw xmm1, 9 // 7 bit fractions.
+ movhps xmm0, qword ptr [esi + edx * 4] // 2 source x1 pixels
+ pshufb xmm1, xmm5 // 0000000011111111
+ pshufb xmm0, xmm4 // arrange pixels into pairs
+ pxor xmm1, xmm6 // 0..7f and 7f..0
+ pmaddubsw xmm0, xmm1 // argb_argb 16 bit, 2 pixels.
+ pextrw eax, xmm2, 1 // get x0 integer. next iteration.
+ pextrw edx, xmm2, 3 // get x1 integer. next iteration.
+ psrlw xmm0, 7 // argb 8.7 fixed point to low 8 bits.
+ packuswb xmm0, xmm0 // argb_argb 8 bits, 2 pixels.
+ movq qword ptr [edi], xmm0
+ lea edi, [edi + 8]
+ sub ecx, 2 // 2 pixels
+ jge xloop2
+
+ align 4
+ xloop29:
+
+ add ecx, 2 - 1
+ jl xloop99
+
+ // 1 pixel remainder
+ psrlw xmm2, 9 // 7 bit fractions.
+ movq xmm0, qword ptr [esi + eax * 4] // 2 source x0 pixels
+ pshufb xmm2, xmm5 // 00000000
+ pshufb xmm0, xmm4 // arrange pixels into pairs
+ pxor xmm2, xmm6 // 0..7f and 7f..0
+ pmaddubsw xmm0, xmm2 // argb 16 bit, 1 pixel.
+ psrlw xmm0, 7
+ packuswb xmm0, xmm0 // argb 8 bits, 1 pixel.
+ movd [edi], xmm0
+
+ align 4
+ xloop99:
+
+ pop edi
+ pop esi
+ ret
+ }
+}
+
+// Reads 4 pixels, duplicates them and writes 8 pixels.
+// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
+__declspec(naked) __declspec(align(16))
+void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
+ int dst_width, int /* x */, int /* dx */) {
+ __asm {
+ mov edx, [esp + 4] // dst_argb
+ mov eax, [esp + 8] // src_argb
+ mov ecx, [esp + 12] // dst_width
+
+ align 4
+ wloop:
+ movdqa xmm0, [eax]
+ lea eax, [eax + 16]
+ movdqa xmm1, xmm0
+ punpckldq xmm0, xmm0
+ punpckhdq xmm1, xmm1
+ sub ecx, 8
+ movdqa [edx], xmm0
+ movdqa [edx + 16], xmm1
+ lea edx, [edx + 32]
+ jg wloop
+
+ ret
+ }
+}
+
+#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/chromium/third_party/libyuv/tools/valgrind-libyuv/libyuv_tests.py b/chromium/third_party/libyuv/tools/valgrind-libyuv/libyuv_tests.py
index f93e97bb71f..1b912b8ba82 100755
--- a/chromium/third_party/libyuv/tools/valgrind-libyuv/libyuv_tests.py
+++ b/chromium/third_party/libyuv/tools/valgrind-libyuv/libyuv_tests.py
@@ -73,9 +73,10 @@ class LibyuvTest(chrome_tests.ChromeTests):
def main(_):
parser = optparse.OptionParser('usage: %prog -b <dir> -t <test> <test args>')
parser.disable_interspersed_args()
- parser.add_option('-b', '--build_dir',
+ parser.add_option('-b', '--build-dir',
help=('Location of the compiler output. Can only be used '
'when the test argument does not contain this path.'))
+ parser.add_option("--target", help="Debug or Release")
parser.add_option('-t', '--test', help='Test to run.')
parser.add_option('', '--baseline', action='store_true', default=False,
help='Generate baseline data instead of validating')
@@ -104,6 +105,11 @@ def main(_):
if not options.test:
parser.error('--test not specified')
+ # Support build dir both with and without the target.
+ if (options.target and options.build_dir and
+ not options.build_dir.endswith(options.target)):
+ options.build_dir = os.path.join(options.build_dir, options.target)
+
# If --build_dir is provided, prepend it to the test executable if needed.
test_executable = options.test
if options.build_dir and not test_executable.startswith(options.build_dir):
diff --git a/chromium/third_party/libyuv/unit_test/compare_test.cc b/chromium/third_party/libyuv/unit_test/compare_test.cc
index 7fe6c3b0b19..efc2e39e68f 100644
--- a/chromium/third_party/libyuv/unit_test/compare_test.cc
+++ b/chromium/third_party/libyuv/unit_test/compare_test.cc
@@ -39,7 +39,7 @@ TEST_F(libyuvTest, Djb2_Test) {
" and feels as if he were in the seventh heaven of typography"
" together with Hermann Zapf";
uint32 foxhash = HashDjb2(reinterpret_cast<const uint8*>(fox), 131, 5381);
- const uint32 kExpectedFoxHash = 2611006483;
+ const uint32 kExpectedFoxHash = 2611006483u;
EXPECT_EQ(kExpectedFoxHash, foxhash);
for (int i = 0; i < kMaxTest; ++i) {
@@ -286,9 +286,9 @@ TEST_F(libyuvTest, Psnr) {
src_b + kSrcStride * b + b, kSrcStride,
kSrcWidth, kSrcHeight);
- EXPECT_GT(err, 4.0);
+ EXPECT_GT(err, 2.0);
if (kSrcWidth * kSrcHeight >= 256) {
- EXPECT_LT(err, 5.0);
+ EXPECT_LT(err, 6.0);
}
srandom(time(NULL));
@@ -322,7 +322,7 @@ TEST_F(libyuvTest, Psnr) {
free_aligned_buffer_64(src_b)
}
-TEST_F(libyuvTest, BenchmarkSsim_Opt) {
+TEST_F(libyuvTest, DISABLED_BenchmarkSsim_Opt) {
align_buffer_64(src_a, benchmark_width_ * benchmark_height_)
align_buffer_64(src_b, benchmark_width_ * benchmark_height_)
for (int i = 0; i < benchmark_width_ * benchmark_height_; ++i) {
diff --git a/chromium/third_party/libyuv/unit_test/convert_test.cc b/chromium/third_party/libyuv/unit_test/convert_test.cc
index 7e96c63a4d5..d5eaca0569b 100644
--- a/chromium/third_party/libyuv/unit_test/convert_test.cc
+++ b/chromium/third_party/libyuv/unit_test/convert_test.cc
@@ -1,990 +1,997 @@
-/*
- * Copyright 2011 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <stdlib.h>
-#include <time.h>
-
-#include "libyuv/compare.h"
-#include "libyuv/convert.h"
-#include "libyuv/convert_argb.h"
-#include "libyuv/convert_from.h"
-#include "libyuv/convert_from_argb.h"
-#include "libyuv/cpu_id.h"
-#include "libyuv/format_conversion.h"
-#ifdef HAVE_JPEG
-#include "libyuv/mjpeg_decoder.h"
-#endif
-#include "libyuv/planar_functions.h"
-#include "libyuv/rotate.h"
-#include "../unit_test/unit_test.h"
-
-#if defined(_MSC_VER)
-#define SIMD_ALIGNED(var) __declspec(align(16)) var
-#else // __GNUC__
-#define SIMD_ALIGNED(var) var __attribute__((aligned(16)))
-#endif
-
-namespace libyuv {
-
-#define SUBSAMPLE(v, a) ((((v) + (a) - 1)) / (a))
-
-#define TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
- FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG, OFF) \
-TEST_F(libyuvTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \
- const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
- const int kHeight = benchmark_height_; \
- align_buffer_64(src_y, kWidth * kHeight + OFF); \
- align_buffer_64(src_u, \
- SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + OFF); \
- align_buffer_64(src_v, \
- SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + OFF); \
- align_buffer_64(dst_y_c, kWidth * kHeight); \
- align_buffer_64(dst_u_c, \
- SUBSAMPLE(kWidth, SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- align_buffer_64(dst_v_c, \
- SUBSAMPLE(kWidth, SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- align_buffer_64(dst_y_opt, kWidth * kHeight); \
- align_buffer_64(dst_u_opt, \
- SUBSAMPLE(kWidth, SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- align_buffer_64(dst_v_opt, \
- SUBSAMPLE(kWidth, SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- srandom(time(NULL)); \
- for (int i = 0; i < kHeight; ++i) \
- for (int j = 0; j < kWidth; ++j) \
- src_y[(i * kWidth) + j + OFF] = (random() & 0xff); \
- for (int i = 0; i < SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); ++i) { \
- for (int j = 0; j < SUBSAMPLE(kWidth, SRC_SUBSAMP_X); ++j) { \
- src_u[(i * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) + j + OFF] = \
- (random() & 0xff); \
- src_v[(i * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) + j + OFF] = \
- (random() & 0xff); \
- } \
- } \
- MaskCpuFlags(0); \
- SRC_FMT_PLANAR##To##FMT_PLANAR(src_y + OFF, kWidth, \
- src_u + OFF, \
- SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
- src_v + OFF, \
- SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
- dst_y_c, kWidth, \
- dst_u_c, SUBSAMPLE(kWidth, SUBSAMP_X), \
- dst_v_c, SUBSAMPLE(kWidth, SUBSAMP_X), \
- kWidth, NEG kHeight); \
- MaskCpuFlags(-1); \
- for (int i = 0; i < benchmark_iterations_; ++i) { \
- SRC_FMT_PLANAR##To##FMT_PLANAR(src_y + OFF, kWidth, \
- src_u + OFF, \
- SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
- src_v + OFF, \
- SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
- dst_y_opt, kWidth, \
- dst_u_opt, SUBSAMPLE(kWidth, SUBSAMP_X), \
- dst_v_opt, SUBSAMPLE(kWidth, SUBSAMP_X), \
- kWidth, NEG kHeight); \
- } \
- int max_diff = 0; \
- for (int i = 0; i < kHeight; ++i) { \
- for (int j = 0; j < kWidth; ++j) { \
- int abs_diff = \
- abs(static_cast<int>(dst_y_c[i * kWidth + j]) - \
- static_cast<int>(dst_y_opt[i * kWidth + j])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
- } \
- } \
- EXPECT_LE(max_diff, 1); \
- for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
- for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \
- int abs_diff = \
- abs(static_cast<int>(dst_u_c[i * \
- SUBSAMPLE(kWidth, SUBSAMP_X) + j]) - \
- static_cast<int>(dst_u_opt[i * \
- SUBSAMPLE(kWidth, SUBSAMP_X) + j])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
- } \
- } \
- EXPECT_LE(max_diff, 1); \
- for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
- for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \
- int abs_diff = \
- abs(static_cast<int>(dst_v_c[i * \
- SUBSAMPLE(kWidth, SUBSAMP_X) + j]) - \
- static_cast<int>(dst_v_opt[i * \
- SUBSAMPLE(kWidth, SUBSAMP_X) + j])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
- } \
- } \
- EXPECT_LE(max_diff, 1); \
- free_aligned_buffer_64(dst_y_c) \
- free_aligned_buffer_64(dst_u_c) \
- free_aligned_buffer_64(dst_v_c) \
- free_aligned_buffer_64(dst_y_opt) \
- free_aligned_buffer_64(dst_u_opt) \
- free_aligned_buffer_64(dst_v_opt) \
- free_aligned_buffer_64(src_y) \
- free_aligned_buffer_64(src_u) \
- free_aligned_buffer_64(src_v) \
-}
-
-#define TESTPLANARTOP(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
- FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \
- TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
- FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- benchmark_width_ - 4, _Any, +, 0) \
- TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
- FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- benchmark_width_, _Unaligned, +, 1) \
- TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
- FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- benchmark_width_, _Invert, -, 0) \
- TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
- FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- benchmark_width_, _Opt, +, 0)
-
-TESTPLANARTOP(I420, 2, 2, I420, 2, 2)
-TESTPLANARTOP(I422, 2, 1, I420, 2, 2)
-TESTPLANARTOP(I444, 1, 1, I420, 2, 2)
-TESTPLANARTOP(I411, 4, 1, I420, 2, 2)
-TESTPLANARTOP(I420, 2, 2, I422, 2, 1)
-TESTPLANARTOP(I420, 2, 2, I444, 1, 1)
-TESTPLANARTOP(I420, 2, 2, I411, 4, 1)
-TESTPLANARTOP(I420, 2, 2, I420Mirror, 2, 2)
-TESTPLANARTOP(I422, 2, 1, I422, 2, 1)
-TESTPLANARTOP(I444, 1, 1, I444, 1, 1)
-
-#define TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
- FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG, OFF) \
-TEST_F(libyuvTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \
- const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
- const int kHeight = benchmark_height_; \
- align_buffer_64(src_y, kWidth * kHeight + OFF); \
- align_buffer_64(src_u, \
- SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + OFF); \
- align_buffer_64(src_v, \
- SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + OFF); \
- align_buffer_64(dst_y_c, kWidth * kHeight); \
- align_buffer_64(dst_uv_c, SUBSAMPLE(kWidth * 2, SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- align_buffer_64(dst_y_opt, kWidth * kHeight); \
- align_buffer_64(dst_uv_opt, SUBSAMPLE(kWidth * 2, SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- srandom(time(NULL)); \
- for (int i = 0; i < kHeight; ++i) \
- for (int j = 0; j < kWidth; ++j) \
- src_y[(i * kWidth) + j + OFF] = (random() & 0xff); \
- for (int i = 0; i < SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); ++i) { \
- for (int j = 0; j < SUBSAMPLE(kWidth, SRC_SUBSAMP_X); ++j) { \
- src_u[(i * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) + j + OFF] = \
- (random() & 0xff); \
- src_v[(i * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) + j + OFF] = \
- (random() & 0xff); \
- } \
- } \
- MaskCpuFlags(0); \
- SRC_FMT_PLANAR##To##FMT_PLANAR(src_y + OFF, kWidth, \
- src_u + OFF, \
- SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
- src_v + OFF, \
- SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
- dst_y_c, kWidth, \
- dst_uv_c, SUBSAMPLE(kWidth * 2, SUBSAMP_X), \
- kWidth, NEG kHeight); \
- MaskCpuFlags(-1); \
- for (int i = 0; i < benchmark_iterations_; ++i) { \
- SRC_FMT_PLANAR##To##FMT_PLANAR(src_y + OFF, kWidth, \
- src_u + OFF, \
- SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
- src_v + OFF, \
- SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
- dst_y_opt, kWidth, \
- dst_uv_opt, \
- SUBSAMPLE(kWidth * 2, SUBSAMP_X), \
- kWidth, NEG kHeight); \
- } \
- int max_diff = 0; \
- for (int i = 0; i < kHeight; ++i) { \
- for (int j = 0; j < kWidth; ++j) { \
- int abs_diff = \
- abs(static_cast<int>(dst_y_c[i * kWidth + j]) - \
- static_cast<int>(dst_y_opt[i * kWidth + j])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
- } \
- } \
- EXPECT_LE(max_diff, 1); \
- for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
- for (int j = 0; j < SUBSAMPLE(kWidth * 2, SUBSAMP_X); ++j) { \
- int abs_diff = \
- abs(static_cast<int>(dst_uv_c[i * \
- SUBSAMPLE(kWidth * 2, SUBSAMP_X) + j]) - \
- static_cast<int>(dst_uv_opt[i * \
- SUBSAMPLE(kWidth * 2, SUBSAMP_X) + j])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
- } \
- } \
- EXPECT_LE(max_diff, 1); \
- free_aligned_buffer_64(dst_y_c) \
- free_aligned_buffer_64(dst_uv_c) \
- free_aligned_buffer_64(dst_y_opt) \
- free_aligned_buffer_64(dst_uv_opt) \
- free_aligned_buffer_64(src_y) \
- free_aligned_buffer_64(src_u) \
- free_aligned_buffer_64(src_v) \
-}
-
-#define TESTPLANARTOBP(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
- FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \
- TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
- FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- benchmark_width_ - 4, _Any, +, 0) \
- TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
- FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- benchmark_width_, _Unaligned, +, 1) \
- TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
- FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- benchmark_width_, _Invert, -, 0) \
- TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
- FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- benchmark_width_, _Opt, +, 0)
-
-TESTPLANARTOBP(I420, 2, 2, NV12, 2, 2)
-TESTPLANARTOBP(I420, 2, 2, NV21, 2, 2)
-
-#define TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
- FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG, OFF) \
-TEST_F(libyuvTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \
- const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
- const int kHeight = benchmark_height_; \
- align_buffer_64(src_y, kWidth * kHeight + OFF); \
- align_buffer_64(src_uv, 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + OFF); \
- align_buffer_64(dst_y_c, kWidth * kHeight); \
- align_buffer_64(dst_u_c, \
- SUBSAMPLE(kWidth, SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- align_buffer_64(dst_v_c, \
- SUBSAMPLE(kWidth, SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- align_buffer_64(dst_y_opt, kWidth * kHeight); \
- align_buffer_64(dst_u_opt, \
- SUBSAMPLE(kWidth, SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- align_buffer_64(dst_v_opt, \
- SUBSAMPLE(kWidth, SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- srandom(time(NULL)); \
- for (int i = 0; i < kHeight; ++i) \
- for (int j = 0; j < kWidth; ++j) \
- src_y[(i * kWidth) + j + OFF] = (random() & 0xff); \
- for (int i = 0; i < SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); ++i) { \
- for (int j = 0; j < 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X); ++j) { \
- src_uv[(i * 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) + j + OFF] = \
- (random() & 0xff); \
- } \
- } \
- MaskCpuFlags(0); \
- SRC_FMT_PLANAR##To##FMT_PLANAR(src_y + OFF, kWidth, \
- src_uv + OFF, \
- 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
- dst_y_c, kWidth, \
- dst_u_c, SUBSAMPLE(kWidth, SUBSAMP_X), \
- dst_v_c, SUBSAMPLE(kWidth, SUBSAMP_X), \
- kWidth, NEG kHeight); \
- MaskCpuFlags(-1); \
- for (int i = 0; i < benchmark_iterations_; ++i) { \
- SRC_FMT_PLANAR##To##FMT_PLANAR(src_y + OFF, kWidth, \
- src_uv + OFF, \
- 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
- dst_y_opt, kWidth, \
- dst_u_opt, SUBSAMPLE(kWidth, SUBSAMP_X), \
- dst_v_opt, SUBSAMPLE(kWidth, SUBSAMP_X), \
- kWidth, NEG kHeight); \
- } \
- int max_diff = 0; \
- for (int i = 0; i < kHeight; ++i) { \
- for (int j = 0; j < kWidth; ++j) { \
- int abs_diff = \
- abs(static_cast<int>(dst_y_c[i * kWidth + j]) - \
- static_cast<int>(dst_y_opt[i * kWidth + j])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
- } \
- } \
- EXPECT_LE(max_diff, 1); \
- for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
- for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \
- int abs_diff = \
- abs(static_cast<int>(dst_u_c[i * \
- SUBSAMPLE(kWidth, SUBSAMP_X) + j]) - \
- static_cast<int>(dst_u_opt[i * \
- SUBSAMPLE(kWidth, SUBSAMP_X) + j])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
- } \
- } \
- EXPECT_LE(max_diff, 1); \
- for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
- for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \
- int abs_diff = \
- abs(static_cast<int>(dst_v_c[i * \
- SUBSAMPLE(kWidth, SUBSAMP_X) + j]) - \
- static_cast<int>(dst_v_opt[i * \
- SUBSAMPLE(kWidth, SUBSAMP_X) + j])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
- } \
- } \
- EXPECT_LE(max_diff, 1); \
- free_aligned_buffer_64(dst_y_c) \
- free_aligned_buffer_64(dst_u_c) \
- free_aligned_buffer_64(dst_v_c) \
- free_aligned_buffer_64(dst_y_opt) \
- free_aligned_buffer_64(dst_u_opt) \
- free_aligned_buffer_64(dst_v_opt) \
- free_aligned_buffer_64(src_y) \
- free_aligned_buffer_64(src_uv) \
-}
-
-#define TESTBIPLANARTOP(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
- FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \
- TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
- FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- benchmark_width_ - 4, _Any, +, 0) \
- TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
- FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- benchmark_width_, _Unaligned, +, 1) \
- TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
- FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- benchmark_width_, _Invert, -, 0) \
- TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
- FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- benchmark_width_, _Opt, +, 0)
-
-TESTBIPLANARTOP(NV12, 2, 2, I420, 2, 2)
-TESTBIPLANARTOP(NV21, 2, 2, I420, 2, 2)
-
-#define TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- W1280, DIFF, N, NEG, OFF, FMT_C, BPP_C) \
-TEST_F(libyuvTest, FMT_PLANAR##To##FMT_B##N) { \
- const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
- const int kHeight = benchmark_height_; \
- const int kStrideB = ((kWidth * BPP_B + ALIGN - 1) / ALIGN) * ALIGN; \
- const int kSizeUV = \
- SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y); \
- align_buffer_64(src_y, kWidth * kHeight + OFF); \
- align_buffer_64(src_u, kSizeUV + OFF); \
- align_buffer_64(src_v, kSizeUV + OFF); \
- align_buffer_64(dst_argb_c, kStrideB * kHeight); \
- align_buffer_64(dst_argb_opt, kStrideB * kHeight); \
- memset(dst_argb_c, 0, kStrideB * kHeight); \
- memset(dst_argb_opt, 0, kStrideB * kHeight); \
- srandom(time(NULL)); \
- for (int i = 0; i < kWidth * kHeight; ++i) { \
- src_y[i + OFF] = (random() & 0xff); \
- } \
- for (int i = 0; i < kSizeUV; ++i) { \
- src_u[i + OFF] = (random() & 0xff); \
- src_v[i + OFF] = (random() & 0xff); \
- } \
- MaskCpuFlags(0); \
- FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, \
- src_u + OFF, SUBSAMPLE(kWidth, SUBSAMP_X), \
- src_v + OFF, SUBSAMPLE(kWidth, SUBSAMP_X), \
- dst_argb_c, kStrideB, \
- kWidth, NEG kHeight); \
- MaskCpuFlags(-1); \
- for (int i = 0; i < benchmark_iterations_; ++i) { \
- FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, \
- src_u + OFF, SUBSAMPLE(kWidth, SUBSAMP_X), \
- src_v + OFF, SUBSAMPLE(kWidth, SUBSAMP_X), \
- dst_argb_opt, kStrideB, \
- kWidth, NEG kHeight); \
- } \
- int max_diff = 0; \
- /* Convert to ARGB so 565 is expanded to bytes that can be compared. */ \
- align_buffer_64(dst_argb32_c, kWidth * BPP_C * kHeight); \
- align_buffer_64(dst_argb32_opt, kWidth * BPP_C * kHeight); \
- memset(dst_argb32_c, 0, kWidth * BPP_C * kHeight); \
- memset(dst_argb32_opt, 0, kWidth * BPP_C * kHeight); \
- FMT_B##To##FMT_C(dst_argb_c, kStrideB, \
- dst_argb32_c, kWidth * BPP_C , \
- kWidth, kHeight); \
- FMT_B##To##FMT_C(dst_argb_opt, kStrideB, \
- dst_argb32_opt, kWidth * BPP_C , \
- kWidth, kHeight); \
- for (int i = 0; i < kWidth * BPP_C * kHeight; ++i) { \
- int abs_diff = \
- abs(static_cast<int>(dst_argb32_c[i]) - \
- static_cast<int>(dst_argb32_opt[i])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
- } \
- EXPECT_LE(max_diff, DIFF); \
- free_aligned_buffer_64(src_y) \
- free_aligned_buffer_64(src_u) \
- free_aligned_buffer_64(src_v) \
- free_aligned_buffer_64(dst_argb_c) \
- free_aligned_buffer_64(dst_argb_opt) \
- free_aligned_buffer_64(dst_argb32_c) \
- free_aligned_buffer_64(dst_argb32_opt) \
-}
-
-#define TESTPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- DIFF, FMT_C, BPP_C) \
- TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- benchmark_width_ - 4, DIFF, _Any, +, 0, FMT_C, BPP_C) \
- TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- benchmark_width_, DIFF, _Unaligned, +, 1, FMT_C, BPP_C) \
- TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- benchmark_width_, DIFF, _Invert, -, 0, FMT_C, BPP_C) \
- TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- benchmark_width_, DIFF, _Opt, +, 0, FMT_C, BPP_C)
-
-TESTPLANARTOB(I420, 2, 2, ARGB, 4, 4, 2, ARGB, 4)
-TESTPLANARTOB(I420, 2, 2, BGRA, 4, 4, 2, ARGB, 4)
-TESTPLANARTOB(I420, 2, 2, ABGR, 4, 4, 2, ARGB, 4)
-TESTPLANARTOB(I420, 2, 2, RGBA, 4, 4, 2, ARGB, 4)
-TESTPLANARTOB(I420, 2, 2, RAW, 3, 3, 2, ARGB, 4)
-TESTPLANARTOB(I420, 2, 2, RGB24, 3, 3, 2, ARGB, 4)
-TESTPLANARTOB(I420, 2, 2, RGB565, 2, 2, 9, ARGB, 4)
-TESTPLANARTOB(I420, 2, 2, ARGB1555, 2, 2, 9, ARGB, 4)
-TESTPLANARTOB(I420, 2, 2, ARGB4444, 2, 2, 17, ARGB, 4)
-TESTPLANARTOB(I422, 2, 1, ARGB, 4, 4, 2, ARGB, 4)
-TESTPLANARTOB(I422, 2, 1, BGRA, 4, 4, 2, ARGB, 4)
-TESTPLANARTOB(I422, 2, 1, ABGR, 4, 4, 2, ARGB, 4)
-TESTPLANARTOB(I422, 2, 1, RGBA, 4, 4, 2, ARGB, 4)
-TESTPLANARTOB(I411, 4, 1, ARGB, 4, 4, 2, ARGB, 4)
-TESTPLANARTOB(I444, 1, 1, ARGB, 4, 4, 2, ARGB, 4)
-TESTPLANARTOB(I420, 2, 2, YUY2, 2, 4, 1, ARGB, 4)
-TESTPLANARTOB(I420, 2, 2, UYVY, 2, 4, 1, ARGB, 4)
-TESTPLANARTOB(I422, 2, 1, YUY2, 2, 4, 0, ARGB, 4)
-TESTPLANARTOB(I422, 2, 1, UYVY, 2, 4, 0, ARGB, 4)
-TESTPLANARTOB(I420, 2, 2, I400, 1, 1, 0, ARGB, 4)
-TESTPLANARTOB(I420, 2, 2, BayerBGGR, 1, 1, 2, ARGB, 4)
-TESTPLANARTOB(I420, 2, 2, BayerRGGB, 1, 1, 2, ARGB, 4)
-TESTPLANARTOB(I420, 2, 2, BayerGBRG, 1, 1, 2, ARGB, 4)
-TESTPLANARTOB(I420, 2, 2, BayerGRBG, 1, 1, 2, ARGB, 4)
-
-#define TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \
- W1280, DIFF, N, NEG, OFF) \
-TEST_F(libyuvTest, FMT_PLANAR##To##FMT_B##N) { \
- const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
- const int kHeight = benchmark_height_; \
- const int kStrideB = kWidth * BPP_B; \
- align_buffer_64(src_y, kWidth * kHeight + OFF); \
- align_buffer_64(src_uv, \
- SUBSAMPLE(kWidth, SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SUBSAMP_Y) * 2 + OFF); \
- align_buffer_64(dst_argb_c, kStrideB * kHeight); \
- align_buffer_64(dst_argb_opt, kStrideB * kHeight); \
- srandom(time(NULL)); \
- for (int i = 0; i < kHeight; ++i) \
- for (int j = 0; j < kWidth; ++j) \
- src_y[(i * kWidth) + j + OFF] = (random() & 0xff); \
- for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) \
- for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X) * 2; ++j) { \
- src_uv[(i * SUBSAMPLE(kWidth, SUBSAMP_X)) * 2 + j + OFF] = \
- (random() & 0xff); \
- } \
- MaskCpuFlags(0); \
- FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, \
- src_uv + OFF, SUBSAMPLE(kWidth, SUBSAMP_X) * 2, \
- dst_argb_c, kWidth * BPP_B, \
- kWidth, NEG kHeight); \
- MaskCpuFlags(-1); \
- for (int i = 0; i < benchmark_iterations_; ++i) { \
- FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, \
- src_uv + OFF, SUBSAMPLE(kWidth, SUBSAMP_X) * 2, \
- dst_argb_opt, kWidth * BPP_B, \
- kWidth, NEG kHeight); \
- } \
- /* Convert to ARGB so 565 is expanded to bytes that can be compared. */ \
- align_buffer_64(dst_argb32_c, kWidth * 4 * kHeight); \
- align_buffer_64(dst_argb32_opt, kWidth * 4 * kHeight); \
- memset(dst_argb32_c, 1, kWidth * 4 * kHeight); \
- memset(dst_argb32_opt, 2, kWidth * 4 * kHeight); \
- FMT_B##ToARGB(dst_argb_c, kStrideB, \
- dst_argb32_c, kWidth * 4, \
- kWidth, kHeight); \
- FMT_B##ToARGB(dst_argb_opt, kStrideB, \
- dst_argb32_opt, kWidth * 4, \
- kWidth, kHeight); \
- int max_diff = 0; \
- for (int i = 0; i < kHeight; ++i) { \
- for (int j = 0; j < kWidth * 4; ++j) { \
- int abs_diff = \
- abs(static_cast<int>(dst_argb32_c[i * kWidth * 4 + j]) - \
- static_cast<int>(dst_argb32_opt[i * kWidth * 4 + j])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
- } \
- } \
- EXPECT_LE(max_diff, DIFF); \
- free_aligned_buffer_64(src_y) \
- free_aligned_buffer_64(src_uv) \
- free_aligned_buffer_64(dst_argb_c) \
- free_aligned_buffer_64(dst_argb_opt) \
- free_aligned_buffer_64(dst_argb32_c) \
- free_aligned_buffer_64(dst_argb32_opt) \
-}
-
-#define TESTBIPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, DIFF) \
- TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \
- benchmark_width_ - 4, DIFF, _Any, +, 0) \
- TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \
- benchmark_width_, DIFF, _Unaligned, +, 1) \
- TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \
- benchmark_width_, DIFF, _Invert, -, 0) \
- TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \
- benchmark_width_, DIFF, _Opt, +, 0)
-
-TESTBIPLANARTOB(NV12, 2, 2, ARGB, 4, 2)
-TESTBIPLANARTOB(NV21, 2, 2, ARGB, 4, 2)
-TESTBIPLANARTOB(NV12, 2, 2, RGB565, 2, 9)
-TESTBIPLANARTOB(NV21, 2, 2, RGB565, 2, 9)
-
-#define TESTATOPLANARI(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- W1280, DIFF, N, NEG, OFF) \
-TEST_F(libyuvTest, FMT_A##To##FMT_PLANAR##N) { \
- const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
- const int kHeight = benchmark_height_; \
- const int kStride = (kWidth * 8 * BPP_A + 7) / 8; \
- align_buffer_64(src_argb, kStride * kHeight + OFF); \
- align_buffer_64(dst_y_c, kWidth * kHeight); \
- align_buffer_64(dst_u_c, \
- SUBSAMPLE(kWidth, SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- align_buffer_64(dst_v_c, \
- SUBSAMPLE(kWidth, SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- align_buffer_64(dst_y_opt, kWidth * kHeight); \
- align_buffer_64(dst_u_opt, \
- SUBSAMPLE(kWidth, SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- align_buffer_64(dst_v_opt, \
- SUBSAMPLE(kWidth, SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- memset(dst_y_c, 1, kWidth * kHeight); \
- memset(dst_u_c, 0, \
- SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- memset(dst_v_c, 0, \
- SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- memset(dst_y_opt, 2, kWidth * kHeight); \
- memset(dst_u_opt, 0, \
- SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- memset(dst_v_opt, 0, \
- SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- srandom(time(NULL)); \
- for (int i = 0; i < kHeight; ++i) \
- for (int j = 0; j < kStride; ++j) \
- src_argb[(i * kStride) + j + OFF] = (random() & 0xff); \
- MaskCpuFlags(0); \
- FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, \
- dst_y_c, kWidth, \
- dst_u_c, SUBSAMPLE(kWidth, SUBSAMP_X), \
- dst_v_c, SUBSAMPLE(kWidth, SUBSAMP_X), \
- kWidth, NEG kHeight); \
- MaskCpuFlags(-1); \
- for (int i = 0; i < benchmark_iterations_; ++i) { \
- FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, \
- dst_y_opt, kWidth, \
- dst_u_opt, SUBSAMPLE(kWidth, SUBSAMP_X), \
- dst_v_opt, SUBSAMPLE(kWidth, SUBSAMP_X), \
- kWidth, NEG kHeight); \
- } \
- int max_diff = 0; \
- for (int i = 0; i < kHeight; ++i) { \
- for (int j = 0; j < kWidth; ++j) { \
- int abs_diff = \
- abs(static_cast<int>(dst_y_c[i * kWidth + j]) - \
- static_cast<int>(dst_y_opt[i * kWidth + j])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
- } \
- } \
- EXPECT_LE(max_diff, DIFF); \
- for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
- for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \
- int abs_diff = \
- abs(static_cast<int>(dst_u_c[i * \
- SUBSAMPLE(kWidth, SUBSAMP_X) + j]) - \
- static_cast<int>(dst_u_opt[i * \
- SUBSAMPLE(kWidth, SUBSAMP_X) + j])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
- } \
- } \
- EXPECT_LE(max_diff, DIFF); \
- for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
- for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \
- int abs_diff = \
- abs(static_cast<int>(dst_v_c[i * \
- SUBSAMPLE(kWidth, SUBSAMP_X) + j]) - \
- static_cast<int>(dst_v_opt[i * \
- SUBSAMPLE(kWidth, SUBSAMP_X) + j])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
- } \
- } \
- EXPECT_LE(max_diff, DIFF); \
- free_aligned_buffer_64(dst_y_c) \
- free_aligned_buffer_64(dst_u_c) \
- free_aligned_buffer_64(dst_v_c) \
- free_aligned_buffer_64(dst_y_opt) \
- free_aligned_buffer_64(dst_u_opt) \
- free_aligned_buffer_64(dst_v_opt) \
- free_aligned_buffer_64(src_argb) \
-}
-
-#define TESTATOPLANAR(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, DIFF) \
- TESTATOPLANARI(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- benchmark_width_ - 4, DIFF, _Any, +, 0) \
- TESTATOPLANARI(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- benchmark_width_, DIFF, _Unaligned, +, 1) \
- TESTATOPLANARI(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- benchmark_width_, DIFF, _Invert, -, 0) \
- TESTATOPLANARI(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- benchmark_width_, DIFF, _Opt, +, 0)
-
-TESTATOPLANAR(ARGB, 4, I420, 2, 2, 4)
-#ifdef __arm__
-TESTATOPLANAR(ARGB, 4, J420, 2, 2, 4)
-#else
-TESTATOPLANAR(ARGB, 4, J420, 2, 2, 0)
-#endif
-TESTATOPLANAR(BGRA, 4, I420, 2, 2, 4)
-TESTATOPLANAR(ABGR, 4, I420, 2, 2, 4)
-TESTATOPLANAR(RGBA, 4, I420, 2, 2, 4)
-TESTATOPLANAR(RAW, 3, I420, 2, 2, 4)
-TESTATOPLANAR(RGB24, 3, I420, 2, 2, 4)
-TESTATOPLANAR(RGB565, 2, I420, 2, 2, 5)
-// TODO(fbarchard): Make 1555 neon work same as C code, reduce to diff 9.
-TESTATOPLANAR(ARGB1555, 2, I420, 2, 2, 15)
-TESTATOPLANAR(ARGB4444, 2, I420, 2, 2, 17)
-TESTATOPLANAR(ARGB, 4, I411, 4, 1, 4)
-TESTATOPLANAR(ARGB, 4, I422, 2, 1, 2)
-TESTATOPLANAR(ARGB, 4, I444, 1, 1, 2)
-TESTATOPLANAR(YUY2, 2, I420, 2, 2, 2)
-TESTATOPLANAR(UYVY, 2, I420, 2, 2, 2)
-TESTATOPLANAR(YUY2, 2, I422, 2, 1, 2)
-TESTATOPLANAR(UYVY, 2, I422, 2, 1, 2)
-TESTATOPLANAR(I400, 1, I420, 2, 2, 2)
-TESTATOPLANAR(BayerBGGR, 1, I420, 2, 2, 4)
-TESTATOPLANAR(BayerRGGB, 1, I420, 2, 2, 4)
-TESTATOPLANAR(BayerGBRG, 1, I420, 2, 2, 4)
-TESTATOPLANAR(BayerGRBG, 1, I420, 2, 2, 4)
-
-#define TESTATOBIPLANARI(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- W1280, N, NEG, OFF) \
-TEST_F(libyuvTest, FMT_A##To##FMT_PLANAR##N) { \
- const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
- const int kHeight = benchmark_height_; \
- const int kStride = (kWidth * 8 * BPP_A + 7) / 8; \
- align_buffer_64(src_argb, kStride * kHeight + OFF); \
- align_buffer_64(dst_y_c, kWidth * kHeight); \
- align_buffer_64(dst_uv_c, \
- SUBSAMPLE(kWidth, SUBSAMP_X) * 2 * \
- SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- align_buffer_64(dst_y_opt, kWidth * kHeight); \
- align_buffer_64(dst_uv_opt, \
- SUBSAMPLE(kWidth, SUBSAMP_X) * 2 * \
- SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- srandom(time(NULL)); \
- for (int i = 0; i < kHeight; ++i) \
- for (int j = 0; j < kStride; ++j) \
- src_argb[(i * kStride) + j + OFF] = (random() & 0xff); \
- MaskCpuFlags(0); \
- FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, \
- dst_y_c, kWidth, \
- dst_uv_c, SUBSAMPLE(kWidth, SUBSAMP_X) * 2, \
- kWidth, NEG kHeight); \
- MaskCpuFlags(-1); \
- for (int i = 0; i < benchmark_iterations_; ++i) { \
- FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, \
- dst_y_opt, kWidth, \
- dst_uv_opt, SUBSAMPLE(kWidth, SUBSAMP_X) * 2, \
- kWidth, NEG kHeight); \
- } \
- int max_diff = 0; \
- for (int i = 0; i < kHeight; ++i) { \
- for (int j = 0; j < kWidth; ++j) { \
- int abs_diff = \
- abs(static_cast<int>(dst_y_c[i * kWidth + j]) - \
- static_cast<int>(dst_y_opt[i * kWidth + j])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
- } \
- } \
- EXPECT_LE(max_diff, 4); \
- for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
- for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X) * 2; ++j) { \
- int abs_diff = \
- abs(static_cast<int>(dst_uv_c[i * \
- SUBSAMPLE(kWidth, SUBSAMP_X) * 2 + j]) - \
- static_cast<int>(dst_uv_opt[i * \
- SUBSAMPLE(kWidth, SUBSAMP_X) * 2 + j])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
- } \
- } \
- EXPECT_LE(max_diff, 4); \
- free_aligned_buffer_64(dst_y_c) \
- free_aligned_buffer_64(dst_uv_c) \
- free_aligned_buffer_64(dst_y_opt) \
- free_aligned_buffer_64(dst_uv_opt) \
- free_aligned_buffer_64(src_argb) \
-}
-
-#define TESTATOBIPLANAR(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \
- TESTATOBIPLANARI(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- benchmark_width_ - 4, _Any, +, 0) \
- TESTATOBIPLANARI(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- benchmark_width_, _Unaligned, +, 1) \
- TESTATOBIPLANARI(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- benchmark_width_, _Invert, -, 0) \
- TESTATOBIPLANARI(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- benchmark_width_, _Opt, +, 0)
-
-TESTATOBIPLANAR(ARGB, 4, NV12, 2, 2)
-TESTATOBIPLANAR(ARGB, 4, NV21, 2, 2)
-
-#define TESTATOBI(FMT_A, BPP_A, STRIDE_A, \
- FMT_B, BPP_B, STRIDE_B, \
- W1280, DIFF, N, NEG, OFF) \
-TEST_F(libyuvTest, FMT_A##To##FMT_B##N) { \
- const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
- const int kHeight = benchmark_height_; \
- const int kStrideA = (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A; \
- const int kStrideB = (kWidth * BPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B; \
- align_buffer_64(src_argb, kStrideA * kHeight + OFF); \
- align_buffer_64(dst_argb_c, kStrideB * kHeight); \
- align_buffer_64(dst_argb_opt, kStrideB * kHeight); \
- memset(dst_argb_c, 0, kStrideB * kHeight); \
- memset(dst_argb_opt, 0, kStrideB * kHeight); \
- srandom(time(NULL)); \
- for (int i = 0; i < kStrideA * kHeight; ++i) { \
- src_argb[i + OFF] = (random() & 0xff); \
- } \
- MaskCpuFlags(0); \
- FMT_A##To##FMT_B(src_argb + OFF, kStrideA, \
- dst_argb_c, kStrideB, \
- kWidth, NEG kHeight); \
- MaskCpuFlags(-1); \
- for (int i = 0; i < benchmark_iterations_; ++i) { \
- FMT_A##To##FMT_B(src_argb + OFF, kStrideA, \
- dst_argb_opt, kStrideB, \
- kWidth, NEG kHeight); \
- } \
- int max_diff = 0; \
- for (int i = 0; i < kStrideB * kHeight; ++i) { \
- int abs_diff = \
- abs(static_cast<int>(dst_argb_c[i]) - \
- static_cast<int>(dst_argb_opt[i])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
- } \
- EXPECT_LE(max_diff, DIFF); \
- free_aligned_buffer_64(src_argb) \
- free_aligned_buffer_64(dst_argb_c) \
- free_aligned_buffer_64(dst_argb_opt) \
-}
-
-#define TESTATOBRANDOM(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, \
- FMT_B, BPP_B, STRIDE_B, HEIGHT_B, DIFF) \
-TEST_F(libyuvTest, FMT_A##To##FMT_B##_Random) { \
- srandom(time(NULL)); \
- for (int times = 0; times < benchmark_iterations_; ++times) { \
- const int kWidth = (random() & 63) + 1; \
- const int kHeight = (random() & 31) + 1; \
- const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A; \
- const int kHeightB = (kHeight + HEIGHT_B - 1) / HEIGHT_B * HEIGHT_B; \
- const int kStrideA = (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A;\
- const int kStrideB = (kWidth * BPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B;\
- align_buffer_page_end(src_argb, kStrideA * kHeightA); \
- align_buffer_page_end(dst_argb_c, kStrideB * kHeightB); \
- align_buffer_page_end(dst_argb_opt, kStrideB * kHeightB); \
- memset(dst_argb_c, 0, kStrideB * kHeightB); \
- memset(dst_argb_opt, 0, kStrideB * kHeightB); \
- for (int i = 0; i < kStrideA * kHeightA; ++i) { \
- src_argb[i] = (random() & 0xff); \
- } \
- MaskCpuFlags(0); \
- FMT_A##To##FMT_B(src_argb, kStrideA, \
- dst_argb_c, kStrideB, \
- kWidth, kHeight); \
- MaskCpuFlags(-1); \
- FMT_A##To##FMT_B(src_argb, kStrideA, \
- dst_argb_opt, kStrideB, \
- kWidth, kHeight); \
- int max_diff = 0; \
- for (int i = 0; i < kStrideB * kHeightB; ++i) { \
- int abs_diff = \
- abs(static_cast<int>(dst_argb_c[i]) - \
- static_cast<int>(dst_argb_opt[i])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
- } \
- EXPECT_LE(max_diff, DIFF); \
- free_aligned_buffer_page_end(src_argb) \
- free_aligned_buffer_page_end(dst_argb_c) \
- free_aligned_buffer_page_end(dst_argb_opt) \
- } \
-}
-
-#define TESTATOB(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, \
- FMT_B, BPP_B, STRIDE_B, HEIGHT_B, DIFF) \
- TESTATOBI(FMT_A, BPP_A, STRIDE_A, \
- FMT_B, BPP_B, STRIDE_B, \
- benchmark_width_ - 4, DIFF, _Any, +, 0) \
- TESTATOBI(FMT_A, BPP_A, STRIDE_A, \
- FMT_B, BPP_B, STRIDE_B, \
- benchmark_width_, DIFF, _Unaligned, +, 1) \
- TESTATOBI(FMT_A, BPP_A, STRIDE_A, \
- FMT_B, BPP_B, STRIDE_B, \
- benchmark_width_, DIFF, _Invert, -, 0) \
- TESTATOBI(FMT_A, BPP_A, STRIDE_A, \
- FMT_B, BPP_B, STRIDE_B, \
- benchmark_width_, DIFF, _Opt, +, 0) \
- TESTATOBRANDOM(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, \
- FMT_B, BPP_B, STRIDE_B, HEIGHT_B, DIFF)
-
-TESTATOB(ARGB, 4, 4, 1, ARGB, 4, 4, 1, 0)
-TESTATOB(ARGB, 4, 4, 1, BGRA, 4, 4, 1, 0)
-TESTATOB(ARGB, 4, 4, 1, ABGR, 4, 4, 1, 0)
-TESTATOB(ARGB, 4, 4, 1, RGBA, 4, 4, 1, 0)
-TESTATOB(ARGB, 4, 4, 1, RAW, 3, 3, 1, 0)
-TESTATOB(ARGB, 4, 4, 1, RGB24, 3, 3, 1, 0)
-TESTATOB(ARGB, 4, 4, 1, RGB565, 2, 2, 1, 0)
-TESTATOB(ARGB, 4, 4, 1, ARGB1555, 2, 2, 1, 0)
-TESTATOB(ARGB, 4, 4, 1, ARGB4444, 2, 2, 1, 0)
-TESTATOB(ARGB, 4, 4, 1, BayerBGGR, 1, 2, 2, 0)
-TESTATOB(ARGB, 4, 4, 1, BayerRGGB, 1, 2, 2, 0)
-TESTATOB(ARGB, 4, 4, 1, BayerGBRG, 1, 2, 2, 0)
-TESTATOB(ARGB, 4, 4, 1, BayerGRBG, 1, 2, 2, 0)
-TESTATOB(ARGB, 4, 4, 1, YUY2, 2, 4, 1, 4)
-TESTATOB(ARGB, 4, 4, 1, UYVY, 2, 4, 1, 4)
-TESTATOB(ARGB, 4, 4, 1, I400, 1, 1, 1, 2)
-TESTATOB(ARGB, 4, 4, 1, J400, 1, 1, 1, 2)
-TESTATOB(BGRA, 4, 4, 1, ARGB, 4, 4, 1, 0)
-TESTATOB(ABGR, 4, 4, 1, ARGB, 4, 4, 1, 0)
-TESTATOB(RGBA, 4, 4, 1, ARGB, 4, 4, 1, 0)
-TESTATOB(RAW, 3, 3, 1, ARGB, 4, 4, 1, 0)
-TESTATOB(RGB24, 3, 3, 1, ARGB, 4, 4, 1, 0)
-TESTATOB(RGB565, 2, 2, 1, ARGB, 4, 4, 1, 0)
-TESTATOB(ARGB1555, 2, 2, 1, ARGB, 4, 4, 1, 0)
-TESTATOB(ARGB4444, 2, 2, 1, ARGB, 4, 4, 1, 0)
-TESTATOB(YUY2, 2, 4, 1, ARGB, 4, 4, 1, 4)
-TESTATOB(UYVY, 2, 4, 1, ARGB, 4, 4, 1, 4)
-TESTATOB(BayerBGGR, 1, 2, 2, ARGB, 4, 4, 1, 0)
-TESTATOB(BayerRGGB, 1, 2, 2, ARGB, 4, 4, 1, 0)
-TESTATOB(BayerGBRG, 1, 2, 2, ARGB, 4, 4, 1, 0)
-TESTATOB(BayerGRBG, 1, 2, 2, ARGB, 4, 4, 1, 0)
-TESTATOB(I400, 1, 1, 1, ARGB, 4, 4, 1, 0)
-TESTATOB(I400, 1, 1, 1, I400, 1, 1, 1, 0)
-TESTATOB(I400, 1, 1, 1, I400Mirror, 1, 1, 1, 0)
-TESTATOB(Y, 1, 1, 1, ARGB, 4, 4, 1, 0)
-TESTATOB(ARGB, 4, 4, 1, ARGBMirror, 4, 4, 1, 0)
-
-TEST_F(libyuvTest, Test565) {
- SIMD_ALIGNED(uint8 orig_pixels[256][4]);
- SIMD_ALIGNED(uint8 pixels565[256][2]);
-
- for (int i = 0; i < 256; ++i) {
- for (int j = 0; j < 4; ++j) {
- orig_pixels[i][j] = i;
- }
- }
- ARGBToRGB565(&orig_pixels[0][0], 0, &pixels565[0][0], 0, 256, 1);
- uint32 checksum = HashDjb2(&pixels565[0][0], sizeof(pixels565), 5381);
- EXPECT_EQ(610919429u, checksum);
-}
-
-#ifdef HAVE_JPEG
-TEST_F(libyuvTest, ValidateJpeg) {
- const int kOff = 10;
- const int kMinJpeg = 64;
- const int kImageSize = benchmark_width_ * benchmark_height_ >= kMinJpeg ?
- benchmark_width_ * benchmark_height_ : kMinJpeg;
- const int kSize = kImageSize + kOff;
- align_buffer_64(orig_pixels, kSize);
-
- // No SOI or EOI. Expect fail.
- memset(orig_pixels, 0, kSize);
-
- // EOI, SOI. Expect pass.
- orig_pixels[0] = 0xff;
- orig_pixels[1] = 0xd8; // SOI.
- orig_pixels[kSize - kOff + 0] = 0xff;
- orig_pixels[kSize - kOff + 1] = 0xd9; // EOI.
- for (int times = 0; times < benchmark_iterations_; ++times) {
- EXPECT_TRUE(ValidateJpeg(orig_pixels, kSize));
- }
- free_aligned_buffer_page_end(orig_pixels);
-}
-
-TEST_F(libyuvTest, InvalidateJpeg) {
- const int kOff = 10;
- const int kMinJpeg = 64;
- const int kImageSize = benchmark_width_ * benchmark_height_ >= kMinJpeg ?
- benchmark_width_ * benchmark_height_ : kMinJpeg;
- const int kSize = kImageSize + kOff;
- align_buffer_64(orig_pixels, kSize);
-
- // No SOI or EOI. Expect fail.
- memset(orig_pixels, 0, kSize);
- EXPECT_FALSE(ValidateJpeg(orig_pixels, kSize));
-
- // SOI but no EOI. Expect fail.
- orig_pixels[0] = 0xff;
- orig_pixels[1] = 0xd8; // SOI.
- for (int times = 0; times < benchmark_iterations_; ++times) {
- EXPECT_FALSE(ValidateJpeg(orig_pixels, kSize));
- }
- // EOI but no SOI. Expect fail.
- orig_pixels[0] = 0;
- orig_pixels[1] = 0;
- orig_pixels[kSize - kOff + 0] = 0xff;
- orig_pixels[kSize - kOff + 1] = 0xd9; // EOI.
- EXPECT_FALSE(ValidateJpeg(orig_pixels, kSize));
-
- free_aligned_buffer_page_end(orig_pixels);
-}
-
-#endif
-
-} // namespace libyuv
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+#include <time.h>
+
+#include "libyuv/compare.h"
+#include "libyuv/convert.h"
+#include "libyuv/convert_argb.h"
+#include "libyuv/convert_from.h"
+#include "libyuv/convert_from_argb.h"
+#include "libyuv/cpu_id.h"
+#include "libyuv/format_conversion.h"
+#ifdef HAVE_JPEG
+#include "libyuv/mjpeg_decoder.h"
+#endif
+#include "libyuv/planar_functions.h"
+#include "libyuv/rotate.h"
+#include "../unit_test/unit_test.h"
+
+#if defined(_MSC_VER)
+#define SIMD_ALIGNED(var) __declspec(align(16)) var
+#else // __GNUC__
+#define SIMD_ALIGNED(var) var __attribute__((aligned(16)))
+#endif
+
+namespace libyuv {
+
+#define SUBSAMPLE(v, a) ((((v) + (a) - 1)) / (a))
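For reference, SUBSAMPLE is round-up (ceiling) division; it sizes chroma planes so that odd widths and heights still get a final partial row or column. A few worked values, shown as C++ comments for illustration only:

// SUBSAMPLE(v, a) == (v + a - 1) / a, i.e. division rounded up.
//   SUBSAMPLE(1280, 2) == 640     SUBSAMPLE(1277, 2) == 639
//   SUBSAMPLE(5, 4)    == 2       (I411 chroma width for a 5-pixel row)
// So an I420 chroma plane for a kWidth x kHeight frame holds
// SUBSAMPLE(kWidth, 2) * SUBSAMPLE(kHeight, 2) bytes.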
+
+#define TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG, OFF) \
+TEST_F(libyuvTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \
+ const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
+ const int kHeight = benchmark_height_; \
+ align_buffer_64(src_y, kWidth * kHeight + OFF); \
+ align_buffer_64(src_u, \
+ SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * \
+ SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + OFF); \
+ align_buffer_64(src_v, \
+ SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * \
+ SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + OFF); \
+ align_buffer_64(dst_y_c, kWidth * kHeight); \
+ align_buffer_64(dst_u_c, \
+ SUBSAMPLE(kWidth, SUBSAMP_X) * \
+ SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ align_buffer_64(dst_v_c, \
+ SUBSAMPLE(kWidth, SUBSAMP_X) * \
+ SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ align_buffer_64(dst_y_opt, kWidth * kHeight); \
+ align_buffer_64(dst_u_opt, \
+ SUBSAMPLE(kWidth, SUBSAMP_X) * \
+ SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ align_buffer_64(dst_v_opt, \
+ SUBSAMPLE(kWidth, SUBSAMP_X) * \
+ SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ srandom(time(NULL)); \
+ for (int i = 0; i < kHeight; ++i) \
+ for (int j = 0; j < kWidth; ++j) \
+ src_y[(i * kWidth) + j + OFF] = (random() & 0xff); \
+ for (int i = 0; i < SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); ++i) { \
+ for (int j = 0; j < SUBSAMPLE(kWidth, SRC_SUBSAMP_X); ++j) { \
+ src_u[(i * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) + j + OFF] = \
+ (random() & 0xff); \
+ src_v[(i * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) + j + OFF] = \
+ (random() & 0xff); \
+ } \
+ } \
+ MaskCpuFlags(0); \
+ SRC_FMT_PLANAR##To##FMT_PLANAR(src_y + OFF, kWidth, \
+ src_u + OFF, \
+ SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
+ src_v + OFF, \
+ SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
+ dst_y_c, kWidth, \
+ dst_u_c, SUBSAMPLE(kWidth, SUBSAMP_X), \
+ dst_v_c, SUBSAMPLE(kWidth, SUBSAMP_X), \
+ kWidth, NEG kHeight); \
+ MaskCpuFlags(-1); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ SRC_FMT_PLANAR##To##FMT_PLANAR(src_y + OFF, kWidth, \
+ src_u + OFF, \
+ SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
+ src_v + OFF, \
+ SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
+ dst_y_opt, kWidth, \
+ dst_u_opt, SUBSAMPLE(kWidth, SUBSAMP_X), \
+ dst_v_opt, SUBSAMPLE(kWidth, SUBSAMP_X), \
+ kWidth, NEG kHeight); \
+ } \
+ int max_diff = 0; \
+ for (int i = 0; i < kHeight; ++i) { \
+ for (int j = 0; j < kWidth; ++j) { \
+ int abs_diff = \
+ abs(static_cast<int>(dst_y_c[i * kWidth + j]) - \
+ static_cast<int>(dst_y_opt[i * kWidth + j])); \
+ if (abs_diff > max_diff) { \
+ max_diff = abs_diff; \
+ } \
+ } \
+ } \
+ EXPECT_LE(max_diff, 0); \
+ for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
+ for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \
+ int abs_diff = \
+ abs(static_cast<int>(dst_u_c[i * \
+ SUBSAMPLE(kWidth, SUBSAMP_X) + j]) - \
+ static_cast<int>(dst_u_opt[i * \
+ SUBSAMPLE(kWidth, SUBSAMP_X) + j])); \
+ if (abs_diff > max_diff) { \
+ max_diff = abs_diff; \
+ } \
+ } \
+ } \
+ EXPECT_LE(max_diff, 3); \
+ for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
+ for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \
+ int abs_diff = \
+ abs(static_cast<int>(dst_v_c[i * \
+ SUBSAMPLE(kWidth, SUBSAMP_X) + j]) - \
+ static_cast<int>(dst_v_opt[i * \
+ SUBSAMPLE(kWidth, SUBSAMP_X) + j])); \
+ if (abs_diff > max_diff) { \
+ max_diff = abs_diff; \
+ } \
+ } \
+ } \
+ EXPECT_LE(max_diff, 3); \
+ free_aligned_buffer_64(dst_y_c) \
+ free_aligned_buffer_64(dst_u_c) \
+ free_aligned_buffer_64(dst_v_c) \
+ free_aligned_buffer_64(dst_y_opt) \
+ free_aligned_buffer_64(dst_u_opt) \
+ free_aligned_buffer_64(dst_v_opt) \
+ free_aligned_buffer_64(src_y) \
+ free_aligned_buffer_64(src_u) \
+ free_aligned_buffer_64(src_v) \
+}
+
+#define TESTPLANARTOP(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \
+ TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
+ benchmark_width_ - 4, _Any, +, 0) \
+ TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
+ benchmark_width_, _Unaligned, +, 1) \
+ TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
+ benchmark_width_, _Invert, -, 0) \
+ TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
+ benchmark_width_, _Opt, +, 0)
+
+TESTPLANARTOP(I420, 2, 2, I420, 2, 2)
+TESTPLANARTOP(I422, 2, 1, I420, 2, 2)
+TESTPLANARTOP(I444, 1, 1, I420, 2, 2)
+TESTPLANARTOP(I411, 4, 1, I420, 2, 2)
+TESTPLANARTOP(I420, 2, 2, I422, 2, 1)
+TESTPLANARTOP(I420, 2, 2, I444, 1, 1)
+TESTPLANARTOP(I420, 2, 2, I411, 4, 1)
+TESTPLANARTOP(I420, 2, 2, I420Mirror, 2, 2)
+TESTPLANARTOP(I422, 2, 1, I422, 2, 1)
+TESTPLANARTOP(I444, 1, 1, I444, 1, 1)
+
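Each TESTPLANARTOP line above is shorthand for four gtest cases generated through TESTPLANARTOPI. The sketch below spells out one expansion by name; the variation each case exercises follows directly from the macro arguments (illustrative only, the macro itself is authoritative):

// TESTPLANARTOP(I422, 2, 1, I420, 2, 2) registers:
//   TEST_F(libyuvTest, I422ToI420_Any)        // width = benchmark_width_ - 4
//   TEST_F(libyuvTest, I422ToI420_Unaligned)  // source pointers offset by 1 byte
//   TEST_F(libyuvTest, I422ToI420_Invert)     // negative height, image flipped
//   TEST_F(libyuvTest, I422ToI420_Opt)        // aligned buffers, full width
// Each case converts once with MaskCpuFlags(0) (C reference) and then
// benchmark_iterations_ times with MaskCpuFlags(-1) (SIMD enabled), and the
// resulting planes are compared element by element.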
+#define TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG, OFF) \
+TEST_F(libyuvTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \
+ const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
+ const int kHeight = benchmark_height_; \
+ align_buffer_64(src_y, kWidth * kHeight + OFF); \
+ align_buffer_64(src_u, \
+ SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * \
+ SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + OFF); \
+ align_buffer_64(src_v, \
+ SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * \
+ SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + OFF); \
+ align_buffer_64(dst_y_c, kWidth * kHeight); \
+ align_buffer_64(dst_uv_c, SUBSAMPLE(kWidth * 2, SUBSAMP_X) * \
+ SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ align_buffer_64(dst_y_opt, kWidth * kHeight); \
+ align_buffer_64(dst_uv_opt, SUBSAMPLE(kWidth * 2, SUBSAMP_X) * \
+ SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ srandom(time(NULL)); \
+ for (int i = 0; i < kHeight; ++i) \
+ for (int j = 0; j < kWidth; ++j) \
+ src_y[(i * kWidth) + j + OFF] = (random() & 0xff); \
+ for (int i = 0; i < SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); ++i) { \
+ for (int j = 0; j < SUBSAMPLE(kWidth, SRC_SUBSAMP_X); ++j) { \
+ src_u[(i * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) + j + OFF] = \
+ (random() & 0xff); \
+ src_v[(i * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) + j + OFF] = \
+ (random() & 0xff); \
+ } \
+ } \
+ MaskCpuFlags(0); \
+ SRC_FMT_PLANAR##To##FMT_PLANAR(src_y + OFF, kWidth, \
+ src_u + OFF, \
+ SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
+ src_v + OFF, \
+ SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
+ dst_y_c, kWidth, \
+ dst_uv_c, SUBSAMPLE(kWidth * 2, SUBSAMP_X), \
+ kWidth, NEG kHeight); \
+ MaskCpuFlags(-1); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ SRC_FMT_PLANAR##To##FMT_PLANAR(src_y + OFF, kWidth, \
+ src_u + OFF, \
+ SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
+ src_v + OFF, \
+ SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
+ dst_y_opt, kWidth, \
+ dst_uv_opt, \
+ SUBSAMPLE(kWidth * 2, SUBSAMP_X), \
+ kWidth, NEG kHeight); \
+ } \
+ int max_diff = 0; \
+ for (int i = 0; i < kHeight; ++i) { \
+ for (int j = 0; j < kWidth; ++j) { \
+ int abs_diff = \
+ abs(static_cast<int>(dst_y_c[i * kWidth + j]) - \
+ static_cast<int>(dst_y_opt[i * kWidth + j])); \
+ if (abs_diff > max_diff) { \
+ max_diff = abs_diff; \
+ } \
+ } \
+ } \
+ EXPECT_LE(max_diff, 1); \
+ for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
+ for (int j = 0; j < SUBSAMPLE(kWidth * 2, SUBSAMP_X); ++j) { \
+ int abs_diff = \
+ abs(static_cast<int>(dst_uv_c[i * \
+ SUBSAMPLE(kWidth * 2, SUBSAMP_X) + j]) - \
+ static_cast<int>(dst_uv_opt[i * \
+ SUBSAMPLE(kWidth * 2, SUBSAMP_X) + j])); \
+ if (abs_diff > max_diff) { \
+ max_diff = abs_diff; \
+ } \
+ } \
+ } \
+ EXPECT_LE(max_diff, 1); \
+ free_aligned_buffer_64(dst_y_c) \
+ free_aligned_buffer_64(dst_uv_c) \
+ free_aligned_buffer_64(dst_y_opt) \
+ free_aligned_buffer_64(dst_uv_opt) \
+ free_aligned_buffer_64(src_y) \
+ free_aligned_buffer_64(src_u) \
+ free_aligned_buffer_64(src_v) \
+}
+
+#define TESTPLANARTOBP(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \
+ TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
+ benchmark_width_ - 4, _Any, +, 0) \
+ TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
+ benchmark_width_, _Unaligned, +, 1) \
+ TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
+ benchmark_width_, _Invert, -, 0) \
+ TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
+ benchmark_width_, _Opt, +, 0)
+
+TESTPLANARTOBP(I420, 2, 2, NV12, 2, 2)
+TESTPLANARTOBP(I420, 2, 2, NV21, 2, 2)
+
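The biplanar destinations above interleave U and V in one plane, so the UV stride is computed on the doubled width. A worked value, assuming the default 1280-wide benchmark frame:

// I420ToNV12 with kWidth == 1280 and SUBSAMP_X == 2:
//   UV stride = SUBSAMPLE(1280 * 2, 2) == 1280 bytes per chroma row,
//   i.e. 640 interleaved U/V byte pairs, one pair per 2x2 block of luma.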
+#define TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG, OFF) \
+TEST_F(libyuvTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \
+ const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
+ const int kHeight = benchmark_height_; \
+ align_buffer_64(src_y, kWidth * kHeight + OFF); \
+ align_buffer_64(src_uv, 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * \
+ SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + OFF); \
+ align_buffer_64(dst_y_c, kWidth * kHeight); \
+ align_buffer_64(dst_u_c, \
+ SUBSAMPLE(kWidth, SUBSAMP_X) * \
+ SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ align_buffer_64(dst_v_c, \
+ SUBSAMPLE(kWidth, SUBSAMP_X) * \
+ SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ align_buffer_64(dst_y_opt, kWidth * kHeight); \
+ align_buffer_64(dst_u_opt, \
+ SUBSAMPLE(kWidth, SUBSAMP_X) * \
+ SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ align_buffer_64(dst_v_opt, \
+ SUBSAMPLE(kWidth, SUBSAMP_X) * \
+ SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ srandom(time(NULL)); \
+ for (int i = 0; i < kHeight; ++i) \
+ for (int j = 0; j < kWidth; ++j) \
+ src_y[(i * kWidth) + j + OFF] = (random() & 0xff); \
+ for (int i = 0; i < SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); ++i) { \
+ for (int j = 0; j < 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X); ++j) { \
+ src_uv[(i * 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) + j + OFF] = \
+ (random() & 0xff); \
+ } \
+ } \
+ MaskCpuFlags(0); \
+ SRC_FMT_PLANAR##To##FMT_PLANAR(src_y + OFF, kWidth, \
+ src_uv + OFF, \
+ 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
+ dst_y_c, kWidth, \
+ dst_u_c, SUBSAMPLE(kWidth, SUBSAMP_X), \
+ dst_v_c, SUBSAMPLE(kWidth, SUBSAMP_X), \
+ kWidth, NEG kHeight); \
+ MaskCpuFlags(-1); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ SRC_FMT_PLANAR##To##FMT_PLANAR(src_y + OFF, kWidth, \
+ src_uv + OFF, \
+ 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
+ dst_y_opt, kWidth, \
+ dst_u_opt, SUBSAMPLE(kWidth, SUBSAMP_X), \
+ dst_v_opt, SUBSAMPLE(kWidth, SUBSAMP_X), \
+ kWidth, NEG kHeight); \
+ } \
+ int max_diff = 0; \
+ for (int i = 0; i < kHeight; ++i) { \
+ for (int j = 0; j < kWidth; ++j) { \
+ int abs_diff = \
+ abs(static_cast<int>(dst_y_c[i * kWidth + j]) - \
+ static_cast<int>(dst_y_opt[i * kWidth + j])); \
+ if (abs_diff > max_diff) { \
+ max_diff = abs_diff; \
+ } \
+ } \
+ } \
+ EXPECT_LE(max_diff, 1); \
+ for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
+ for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \
+ int abs_diff = \
+ abs(static_cast<int>(dst_u_c[i * \
+ SUBSAMPLE(kWidth, SUBSAMP_X) + j]) - \
+ static_cast<int>(dst_u_opt[i * \
+ SUBSAMPLE(kWidth, SUBSAMP_X) + j])); \
+ if (abs_diff > max_diff) { \
+ max_diff = abs_diff; \
+ } \
+ } \
+ } \
+ EXPECT_LE(max_diff, 1); \
+ for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
+ for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \
+ int abs_diff = \
+ abs(static_cast<int>(dst_v_c[i * \
+ SUBSAMPLE(kWidth, SUBSAMP_X) + j]) - \
+ static_cast<int>(dst_v_opt[i * \
+ SUBSAMPLE(kWidth, SUBSAMP_X) + j])); \
+ if (abs_diff > max_diff) { \
+ max_diff = abs_diff; \
+ } \
+ } \
+ } \
+ EXPECT_LE(max_diff, 1); \
+ free_aligned_buffer_64(dst_y_c) \
+ free_aligned_buffer_64(dst_u_c) \
+ free_aligned_buffer_64(dst_v_c) \
+ free_aligned_buffer_64(dst_y_opt) \
+ free_aligned_buffer_64(dst_u_opt) \
+ free_aligned_buffer_64(dst_v_opt) \
+ free_aligned_buffer_64(src_y) \
+ free_aligned_buffer_64(src_uv) \
+}
+
+#define TESTBIPLANARTOP(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \
+ TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
+ benchmark_width_ - 4, _Any, +, 0) \
+ TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
+ benchmark_width_, _Unaligned, +, 1) \
+ TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
+ benchmark_width_, _Invert, -, 0) \
+ TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+ FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
+ benchmark_width_, _Opt, +, 0)
+
+TESTBIPLANARTOP(NV12, 2, 2, I420, 2, 2)
+TESTBIPLANARTOP(NV21, 2, 2, I420, 2, 2)
+
+#define ALIGNINT(V, ALIGN) (((V) + (ALIGN) - 1) / (ALIGN) * (ALIGN))
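ALIGNINT rounds its first argument up to the next multiple of the second. Below it pads byte strides (ALIGN) and, new in this version, rounds heights up (YALIGN) for destinations such as Bayer that currently need an even number of rows. Illustrative values:

// ALIGNINT(V, ALIGN) == (V + ALIGN - 1) / ALIGN * ALIGN:
//   ALIGNINT(720, 2) == 720     ALIGNINT(721, 2) == 722     ALIGNINT(13, 4) == 16
//   kHeight  = ALIGNINT(benchmark_height_, YALIGN);  // even height for Bayer outputs
//   kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN);      // row stride padded to ALIGN bytes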
+
+#define TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, W1280, DIFF, N, NEG, OFF, FMT_C, BPP_C) \
+TEST_F(libyuvTest, FMT_PLANAR##To##FMT_B##N) { \
+ const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
+ const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \
+ const int kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN); \
+ const int kSizeUV = \
+ SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y); \
+ align_buffer_64(src_y, kWidth * kHeight + OFF); \
+ align_buffer_64(src_u, kSizeUV + OFF); \
+ align_buffer_64(src_v, kSizeUV + OFF); \
+ align_buffer_64(dst_argb_c, kStrideB * kHeight); \
+ align_buffer_64(dst_argb_opt, kStrideB * kHeight); \
+ memset(dst_argb_c, 0, kStrideB * kHeight); \
+ memset(dst_argb_opt, 0, kStrideB * kHeight); \
+ srandom(time(NULL)); \
+ for (int i = 0; i < kWidth * kHeight; ++i) { \
+ src_y[i + OFF] = (random() & 0xff); \
+ } \
+ for (int i = 0; i < kSizeUV; ++i) { \
+ src_u[i + OFF] = (random() & 0xff); \
+ src_v[i + OFF] = (random() & 0xff); \
+ } \
+ MaskCpuFlags(0); \
+ FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, \
+ src_u + OFF, SUBSAMPLE(kWidth, SUBSAMP_X), \
+ src_v + OFF, SUBSAMPLE(kWidth, SUBSAMP_X), \
+ dst_argb_c, kStrideB, \
+ kWidth, NEG kHeight); \
+ MaskCpuFlags(-1); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, \
+ src_u + OFF, SUBSAMPLE(kWidth, SUBSAMP_X), \
+ src_v + OFF, SUBSAMPLE(kWidth, SUBSAMP_X), \
+ dst_argb_opt, kStrideB, \
+ kWidth, NEG kHeight); \
+ } \
+ int max_diff = 0; \
+ /* Convert to ARGB so 565 is expanded to bytes that can be compared. */ \
+ align_buffer_64(dst_argb32_c, kWidth * BPP_C * kHeight); \
+ align_buffer_64(dst_argb32_opt, kWidth * BPP_C * kHeight); \
+ memset(dst_argb32_c, 0, kWidth * BPP_C * kHeight); \
+ memset(dst_argb32_opt, 0, kWidth * BPP_C * kHeight); \
+ FMT_B##To##FMT_C(dst_argb_c, kStrideB, \
+                   dst_argb32_c, kWidth * BPP_C, \
+ kWidth, kHeight); \
+ FMT_B##To##FMT_C(dst_argb_opt, kStrideB, \
+                   dst_argb32_opt, kWidth * BPP_C, \
+ kWidth, kHeight); \
+ for (int i = 0; i < kWidth * BPP_C * kHeight; ++i) { \
+ int abs_diff = \
+ abs(static_cast<int>(dst_argb32_c[i]) - \
+ static_cast<int>(dst_argb32_opt[i])); \
+ if (abs_diff > max_diff) { \
+ max_diff = abs_diff; \
+ } \
+ } \
+ EXPECT_LE(max_diff, DIFF); \
+ free_aligned_buffer_64(src_y) \
+ free_aligned_buffer_64(src_u) \
+ free_aligned_buffer_64(src_v) \
+ free_aligned_buffer_64(dst_argb_c) \
+ free_aligned_buffer_64(dst_argb_opt) \
+ free_aligned_buffer_64(dst_argb32_c) \
+ free_aligned_buffer_64(dst_argb32_opt) \
+}
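The ARGB round trip inside this macro exists because packed formats such as RGB565 cannot be compared byte for byte: one quantization step in a 5- or 6-bit field moves several bits of the packed word at once. Expanding both the C and the optimized output to 8 bits per channel first makes the DIFF threshold meaningful. A rough sketch of the bit replication such an expansion performs (rgb565_pixel is a stand-in for one packed pixel; the library's exact rounding may differ slightly):

// Unpack one RGB565 pixel (rrrrrggg gggbbbbb) to 8-bit channels.
uint16 p = rgb565_pixel;
uint8 b = p & 0x1f;          b = (b << 3) | (b >> 2);  // replicate 5 bits into 8
uint8 g = (p >> 5) & 0x3f;   g = (g << 2) | (g >> 4);  // replicate 6 bits into 8
uint8 r = p >> 11;           r = (r << 3) | (r >> 2);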
+
+#define TESTPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, DIFF, FMT_C, BPP_C) \
+ TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, benchmark_width_ - 4, DIFF, _Any, +, 0, FMT_C, BPP_C) \
+ TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, benchmark_width_, DIFF, _Unaligned, +, 1, FMT_C, BPP_C) \
+ TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, benchmark_width_, DIFF, _Invert, -, 0, FMT_C, BPP_C) \
+ TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
+ YALIGN, benchmark_width_, DIFF, _Opt, +, 0, FMT_C, BPP_C)
+
+// TODO(fbarchard): Make vertical alignment unnecessary on bayer.
+TESTPLANARTOB(I420, 2, 2, ARGB, 4, 4, 1, 2, ARGB, 4)
+TESTPLANARTOB(I420, 2, 2, BGRA, 4, 4, 1, 2, ARGB, 4)
+TESTPLANARTOB(I420, 2, 2, ABGR, 4, 4, 1, 2, ARGB, 4)
+TESTPLANARTOB(I420, 2, 2, RGBA, 4, 4, 1, 2, ARGB, 4)
+TESTPLANARTOB(I420, 2, 2, RAW, 3, 3, 1, 2, ARGB, 4)
+TESTPLANARTOB(I420, 2, 2, RGB24, 3, 3, 1, 2, ARGB, 4)
+TESTPLANARTOB(I420, 2, 2, RGB565, 2, 2, 1, 9, ARGB, 4)
+TESTPLANARTOB(I420, 2, 2, ARGB1555, 2, 2, 1, 9, ARGB, 4)
+TESTPLANARTOB(I420, 2, 2, ARGB4444, 2, 2, 1, 17, ARGB, 4)
+TESTPLANARTOB(I422, 2, 1, ARGB, 4, 4, 1, 2, ARGB, 4)
+TESTPLANARTOB(I422, 2, 1, BGRA, 4, 4, 1, 2, ARGB, 4)
+TESTPLANARTOB(I422, 2, 1, ABGR, 4, 4, 1, 2, ARGB, 4)
+TESTPLANARTOB(I422, 2, 1, RGBA, 4, 4, 1, 2, ARGB, 4)
+TESTPLANARTOB(I411, 4, 1, ARGB, 4, 4, 1, 2, ARGB, 4)
+TESTPLANARTOB(I444, 1, 1, ARGB, 4, 4, 1, 2, ARGB, 4)
+TESTPLANARTOB(I420, 2, 2, YUY2, 2, 4, 1, 1, ARGB, 4)
+TESTPLANARTOB(I420, 2, 2, UYVY, 2, 4, 1, 1, ARGB, 4)
+TESTPLANARTOB(I422, 2, 1, YUY2, 2, 4, 1, 0, ARGB, 4)
+TESTPLANARTOB(I422, 2, 1, UYVY, 2, 4, 1, 0, ARGB, 4)
+TESTPLANARTOB(I420, 2, 2, I400, 1, 1, 1, 0, ARGB, 4)
+TESTPLANARTOB(I420, 2, 2, BayerBGGR, 1, 2, 2, 2, ARGB, 4)
+TESTPLANARTOB(I420, 2, 2, BayerRGGB, 1, 2, 2, 2, ARGB, 4)
+TESTPLANARTOB(I420, 2, 2, BayerGBRG, 1, 2, 2, 2, ARGB, 4)
+TESTPLANARTOB(I420, 2, 2, BayerGRBG, 1, 2, 2, 2, ARGB, 4)
+
+#define TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \
+ W1280, DIFF, N, NEG, OFF) \
+TEST_F(libyuvTest, FMT_PLANAR##To##FMT_B##N) { \
+ const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
+ const int kHeight = benchmark_height_; \
+ const int kStrideB = kWidth * BPP_B; \
+ align_buffer_64(src_y, kWidth * kHeight + OFF); \
+ align_buffer_64(src_uv, \
+ SUBSAMPLE(kWidth, SUBSAMP_X) * \
+ SUBSAMPLE(kHeight, SUBSAMP_Y) * 2 + OFF); \
+ align_buffer_64(dst_argb_c, kStrideB * kHeight); \
+ align_buffer_64(dst_argb_opt, kStrideB * kHeight); \
+ srandom(time(NULL)); \
+ for (int i = 0; i < kHeight; ++i) \
+ for (int j = 0; j < kWidth; ++j) \
+ src_y[(i * kWidth) + j + OFF] = (random() & 0xff); \
+ for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) \
+ for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X) * 2; ++j) { \
+ src_uv[(i * SUBSAMPLE(kWidth, SUBSAMP_X)) * 2 + j + OFF] = \
+ (random() & 0xff); \
+ } \
+ MaskCpuFlags(0); \
+ FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, \
+ src_uv + OFF, SUBSAMPLE(kWidth, SUBSAMP_X) * 2, \
+ dst_argb_c, kWidth * BPP_B, \
+ kWidth, NEG kHeight); \
+ MaskCpuFlags(-1); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, \
+ src_uv + OFF, SUBSAMPLE(kWidth, SUBSAMP_X) * 2, \
+ dst_argb_opt, kWidth * BPP_B, \
+ kWidth, NEG kHeight); \
+ } \
+ /* Convert to ARGB so 565 is expanded to bytes that can be compared. */ \
+ align_buffer_64(dst_argb32_c, kWidth * 4 * kHeight); \
+ align_buffer_64(dst_argb32_opt, kWidth * 4 * kHeight); \
+ memset(dst_argb32_c, 1, kWidth * 4 * kHeight); \
+ memset(dst_argb32_opt, 2, kWidth * 4 * kHeight); \
+ FMT_B##ToARGB(dst_argb_c, kStrideB, \
+ dst_argb32_c, kWidth * 4, \
+ kWidth, kHeight); \
+ FMT_B##ToARGB(dst_argb_opt, kStrideB, \
+ dst_argb32_opt, kWidth * 4, \
+ kWidth, kHeight); \
+ int max_diff = 0; \
+ for (int i = 0; i < kHeight; ++i) { \
+ for (int j = 0; j < kWidth * 4; ++j) { \
+ int abs_diff = \
+ abs(static_cast<int>(dst_argb32_c[i * kWidth * 4 + j]) - \
+ static_cast<int>(dst_argb32_opt[i * kWidth * 4 + j])); \
+ if (abs_diff > max_diff) { \
+ max_diff = abs_diff; \
+ } \
+ } \
+ } \
+ EXPECT_LE(max_diff, DIFF); \
+ free_aligned_buffer_64(src_y) \
+ free_aligned_buffer_64(src_uv) \
+ free_aligned_buffer_64(dst_argb_c) \
+ free_aligned_buffer_64(dst_argb_opt) \
+ free_aligned_buffer_64(dst_argb32_c) \
+ free_aligned_buffer_64(dst_argb32_opt) \
+}
+
+#define TESTBIPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, DIFF) \
+ TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \
+ benchmark_width_ - 4, DIFF, _Any, +, 0) \
+ TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \
+ benchmark_width_, DIFF, _Unaligned, +, 1) \
+ TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \
+ benchmark_width_, DIFF, _Invert, -, 0) \
+ TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \
+ benchmark_width_, DIFF, _Opt, +, 0)
+
+TESTBIPLANARTOB(NV12, 2, 2, ARGB, 4, 2)
+TESTBIPLANARTOB(NV21, 2, 2, ARGB, 4, 2)
+TESTBIPLANARTOB(NV12, 2, 2, RGB565, 2, 9)
+TESTBIPLANARTOB(NV21, 2, 2, RGB565, 2, 9)
+
+#define TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
+ W1280, DIFF, N, NEG, OFF) \
+TEST_F(libyuvTest, FMT_A##To##FMT_PLANAR##N) { \
+ const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
+ const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \
+ const int kStride = \
+ (SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMP_X * 8 * BPP_A + 7) / 8; \
+ align_buffer_64(src_argb, kStride * kHeight + OFF); \
+ align_buffer_64(dst_y_c, kWidth * kHeight); \
+ align_buffer_64(dst_u_c, \
+ SUBSAMPLE(kWidth, SUBSAMP_X) * \
+ SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ align_buffer_64(dst_v_c, \
+ SUBSAMPLE(kWidth, SUBSAMP_X) * \
+ SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ align_buffer_64(dst_y_opt, kWidth * kHeight); \
+ align_buffer_64(dst_u_opt, \
+ SUBSAMPLE(kWidth, SUBSAMP_X) * \
+ SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ align_buffer_64(dst_v_opt, \
+ SUBSAMPLE(kWidth, SUBSAMP_X) * \
+ SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ memset(dst_y_c, 1, kWidth * kHeight); \
+ memset(dst_u_c, 0, \
+ SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ memset(dst_v_c, 0, \
+ SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ memset(dst_y_opt, 2, kWidth * kHeight); \
+ memset(dst_u_opt, 0, \
+ SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ memset(dst_v_opt, 0, \
+ SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ srandom(time(NULL)); \
+ for (int i = 0; i < kHeight; ++i) \
+ for (int j = 0; j < kStride; ++j) \
+ src_argb[(i * kStride) + j + OFF] = (random() & 0xff); \
+ MaskCpuFlags(0); \
+ FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, \
+ dst_y_c, kWidth, \
+ dst_u_c, SUBSAMPLE(kWidth, SUBSAMP_X), \
+ dst_v_c, SUBSAMPLE(kWidth, SUBSAMP_X), \
+ kWidth, NEG kHeight); \
+ MaskCpuFlags(-1); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, \
+ dst_y_opt, kWidth, \
+ dst_u_opt, SUBSAMPLE(kWidth, SUBSAMP_X), \
+ dst_v_opt, SUBSAMPLE(kWidth, SUBSAMP_X), \
+ kWidth, NEG kHeight); \
+ } \
+ int max_diff = 0; \
+ for (int i = 0; i < kHeight; ++i) { \
+ for (int j = 0; j < kWidth; ++j) { \
+ int abs_diff = \
+ abs(static_cast<int>(dst_y_c[i * kWidth + j]) - \
+ static_cast<int>(dst_y_opt[i * kWidth + j])); \
+ if (abs_diff > max_diff) { \
+ max_diff = abs_diff; \
+ } \
+ } \
+ } \
+ EXPECT_LE(max_diff, DIFF); \
+ for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
+ for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \
+ int abs_diff = \
+ abs(static_cast<int>(dst_u_c[i * \
+ SUBSAMPLE(kWidth, SUBSAMP_X) + j]) - \
+ static_cast<int>(dst_u_opt[i * \
+ SUBSAMPLE(kWidth, SUBSAMP_X) + j])); \
+ if (abs_diff > max_diff) { \
+ max_diff = abs_diff; \
+ } \
+ } \
+ } \
+ EXPECT_LE(max_diff, DIFF); \
+ for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
+ for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \
+ int abs_diff = \
+ abs(static_cast<int>(dst_v_c[i * \
+ SUBSAMPLE(kWidth, SUBSAMP_X) + j]) - \
+ static_cast<int>(dst_v_opt[i * \
+ SUBSAMPLE(kWidth, SUBSAMP_X) + j])); \
+ if (abs_diff > max_diff) { \
+ max_diff = abs_diff; \
+ } \
+ } \
+ } \
+ EXPECT_LE(max_diff, DIFF); \
+ free_aligned_buffer_64(dst_y_c) \
+ free_aligned_buffer_64(dst_u_c) \
+ free_aligned_buffer_64(dst_v_c) \
+ free_aligned_buffer_64(dst_y_opt) \
+ free_aligned_buffer_64(dst_u_opt) \
+ free_aligned_buffer_64(dst_v_opt) \
+ free_aligned_buffer_64(src_argb) \
+}
+
+#define TESTATOPLANAR(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
+ DIFF) \
+ TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
+ benchmark_width_ - 4, DIFF, _Any, +, 0) \
+ TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
+ benchmark_width_, DIFF, _Unaligned, +, 1) \
+ TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
+ benchmark_width_, DIFF, _Invert, -, 0) \
+ TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
+ benchmark_width_, DIFF, _Opt, +, 0)
+
+TESTATOPLANAR(ARGB, 4, 1, I420, 2, 2, 4)
+#ifdef __arm__
+TESTATOPLANAR(ARGB, 4, 1, J420, 2, 2, 4)
+#else
+TESTATOPLANAR(ARGB, 4, 1, J420, 2, 2, 0)
+#endif
+TESTATOPLANAR(BGRA, 4, 1, I420, 2, 2, 4)
+TESTATOPLANAR(ABGR, 4, 1, I420, 2, 2, 4)
+TESTATOPLANAR(RGBA, 4, 1, I420, 2, 2, 4)
+TESTATOPLANAR(RAW, 3, 1, I420, 2, 2, 4)
+TESTATOPLANAR(RGB24, 3, 1, I420, 2, 2, 4)
+TESTATOPLANAR(RGB565, 2, 1, I420, 2, 2, 5)
+// TODO(fbarchard): Make 1555 neon work same as C code, reduce to diff 9.
+TESTATOPLANAR(ARGB1555, 2, 1, I420, 2, 2, 15)
+TESTATOPLANAR(ARGB4444, 2, 1, I420, 2, 2, 17)
+TESTATOPLANAR(ARGB, 4, 1, I411, 4, 1, 4)
+TESTATOPLANAR(ARGB, 4, 1, I422, 2, 1, 2)
+TESTATOPLANAR(ARGB, 4, 1, I444, 1, 1, 2)
+TESTATOPLANAR(YUY2, 2, 1, I420, 2, 2, 2)
+TESTATOPLANAR(UYVY, 2, 1, I420, 2, 2, 2)
+TESTATOPLANAR(YUY2, 2, 1, I422, 2, 1, 2)
+TESTATOPLANAR(UYVY, 2, 1, I422, 2, 1, 2)
+TESTATOPLANAR(I400, 1, 1, I420, 2, 2, 2)
+TESTATOPLANAR(BayerBGGR, 1, 2, I420, 2, 2, 4)
+TESTATOPLANAR(BayerRGGB, 1, 2, I420, 2, 2, 4)
+TESTATOPLANAR(BayerGBRG, 1, 2, I420, 2, 2, 4)
+TESTATOPLANAR(BayerGRBG, 1, 2, I420, 2, 2, 4)
+
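The kStride expression in TESTATOPLANARI now rounds the width up to a whole subsample group before computing the byte stride, so odd benchmark widths still hand the converters complete 2x1 or 4x1 pixel groups; the trailing * 8 ... + 7) / 8 simply keeps the arithmetic in whole bytes. A worked example with made-up widths:

// ARGB4444 (BPP_A == 2) converted to I420 (SUBSAMP_X == 2):
//   width 638: SUBSAMPLE(638, 2) * 2 == 638, kStride == (638 * 8 * 2 + 7) / 8 == 1276
//   width 639: SUBSAMPLE(639, 2) * 2 == 640, kStride == (640 * 8 * 2 + 7) / 8 == 1280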
+#define TESTATOBIPLANARI(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
+ W1280, N, NEG, OFF) \
+TEST_F(libyuvTest, FMT_A##To##FMT_PLANAR##N) { \
+ const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
+ const int kHeight = benchmark_height_; \
+ const int kStride = (kWidth * 8 * BPP_A + 7) / 8; \
+ align_buffer_64(src_argb, kStride * kHeight + OFF); \
+ align_buffer_64(dst_y_c, kWidth * kHeight); \
+ align_buffer_64(dst_uv_c, \
+ SUBSAMPLE(kWidth, SUBSAMP_X) * 2 * \
+ SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ align_buffer_64(dst_y_opt, kWidth * kHeight); \
+ align_buffer_64(dst_uv_opt, \
+ SUBSAMPLE(kWidth, SUBSAMP_X) * 2 * \
+ SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+ srandom(time(NULL)); \
+ for (int i = 0; i < kHeight; ++i) \
+ for (int j = 0; j < kStride; ++j) \
+ src_argb[(i * kStride) + j + OFF] = (random() & 0xff); \
+ MaskCpuFlags(0); \
+ FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, \
+ dst_y_c, kWidth, \
+ dst_uv_c, SUBSAMPLE(kWidth, SUBSAMP_X) * 2, \
+ kWidth, NEG kHeight); \
+ MaskCpuFlags(-1); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, \
+ dst_y_opt, kWidth, \
+ dst_uv_opt, SUBSAMPLE(kWidth, SUBSAMP_X) * 2, \
+ kWidth, NEG kHeight); \
+ } \
+ int max_diff = 0; \
+ for (int i = 0; i < kHeight; ++i) { \
+ for (int j = 0; j < kWidth; ++j) { \
+ int abs_diff = \
+ abs(static_cast<int>(dst_y_c[i * kWidth + j]) - \
+ static_cast<int>(dst_y_opt[i * kWidth + j])); \
+ if (abs_diff > max_diff) { \
+ max_diff = abs_diff; \
+ } \
+ } \
+ } \
+ EXPECT_LE(max_diff, 4); \
+ for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
+ for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X) * 2; ++j) { \
+ int abs_diff = \
+ abs(static_cast<int>(dst_uv_c[i * \
+ SUBSAMPLE(kWidth, SUBSAMP_X) * 2 + j]) - \
+ static_cast<int>(dst_uv_opt[i * \
+ SUBSAMPLE(kWidth, SUBSAMP_X) * 2 + j])); \
+ if (abs_diff > max_diff) { \
+ max_diff = abs_diff; \
+ } \
+ } \
+ } \
+ EXPECT_LE(max_diff, 4); \
+ free_aligned_buffer_64(dst_y_c) \
+ free_aligned_buffer_64(dst_uv_c) \
+ free_aligned_buffer_64(dst_y_opt) \
+ free_aligned_buffer_64(dst_uv_opt) \
+ free_aligned_buffer_64(src_argb) \
+}
+
+#define TESTATOBIPLANAR(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \
+ TESTATOBIPLANARI(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
+ benchmark_width_ - 4, _Any, +, 0) \
+ TESTATOBIPLANARI(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
+ benchmark_width_, _Unaligned, +, 1) \
+ TESTATOBIPLANARI(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
+ benchmark_width_, _Invert, -, 0) \
+ TESTATOBIPLANARI(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
+ benchmark_width_, _Opt, +, 0)
+
+TESTATOBIPLANAR(ARGB, 4, NV12, 2, 2)
+TESTATOBIPLANAR(ARGB, 4, NV21, 2, 2)
+
+#define TESTATOBI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, \
+ FMT_B, BPP_B, STRIDE_B, HEIGHT_B, \
+ W1280, DIFF, N, NEG, OFF) \
+TEST_F(libyuvTest, FMT_A##To##FMT_B##N) { \
+ const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
+ const int kHeight = benchmark_height_; \
+ const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A; \
+ const int kHeightB = (kHeight + HEIGHT_B - 1) / HEIGHT_B * HEIGHT_B; \
+ const int kStrideA = (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A; \
+ const int kStrideB = (kWidth * BPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B; \
+ align_buffer_64(src_argb, kStrideA * kHeightA + OFF); \
+ align_buffer_64(dst_argb_c, kStrideB * kHeightB); \
+ align_buffer_64(dst_argb_opt, kStrideB * kHeightB); \
+ memset(dst_argb_c, 0, kStrideB * kHeightB); \
+ memset(dst_argb_opt, 0, kStrideB * kHeightB); \
+ srandom(time(NULL)); \
+ for (int i = 0; i < kStrideA * kHeightA; ++i) { \
+ src_argb[i + OFF] = (random() & 0xff); \
+ } \
+ MaskCpuFlags(0); \
+ FMT_A##To##FMT_B(src_argb + OFF, kStrideA, \
+ dst_argb_c, kStrideB, \
+ kWidth, NEG kHeight); \
+ MaskCpuFlags(-1); \
+ for (int i = 0; i < benchmark_iterations_; ++i) { \
+ FMT_A##To##FMT_B(src_argb + OFF, kStrideA, \
+ dst_argb_opt, kStrideB, \
+ kWidth, NEG kHeight); \
+ } \
+ int max_diff = 0; \
+ for (int i = 0; i < kStrideB * kHeightB; ++i) { \
+ int abs_diff = \
+ abs(static_cast<int>(dst_argb_c[i]) - \
+ static_cast<int>(dst_argb_opt[i])); \
+ if (abs_diff > max_diff) { \
+ max_diff = abs_diff; \
+ } \
+ } \
+ EXPECT_LE(max_diff, DIFF); \
+ free_aligned_buffer_64(src_argb) \
+ free_aligned_buffer_64(dst_argb_c) \
+ free_aligned_buffer_64(dst_argb_opt) \
+}
+
+#define TESTATOBRANDOM(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, \
+ FMT_B, BPP_B, STRIDE_B, HEIGHT_B, DIFF) \
+TEST_F(libyuvTest, FMT_A##To##FMT_B##_Random) { \
+ srandom(time(NULL)); \
+ for (int times = 0; times < benchmark_iterations_; ++times) { \
+ const int kWidth = (random() & 63) + 1; \
+ const int kHeight = (random() & 31) + 1; \
+ const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A; \
+ const int kHeightB = (kHeight + HEIGHT_B - 1) / HEIGHT_B * HEIGHT_B; \
+ const int kStrideA = (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A;\
+ const int kStrideB = (kWidth * BPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B;\
+ align_buffer_page_end(src_argb, kStrideA * kHeightA); \
+ align_buffer_page_end(dst_argb_c, kStrideB * kHeightB); \
+ align_buffer_page_end(dst_argb_opt, kStrideB * kHeightB); \
+ memset(dst_argb_c, 0, kStrideB * kHeightB); \
+ memset(dst_argb_opt, 0, kStrideB * kHeightB); \
+ for (int i = 0; i < kStrideA * kHeightA; ++i) { \
+ src_argb[i] = (random() & 0xff); \
+ } \
+ MaskCpuFlags(0); \
+ FMT_A##To##FMT_B(src_argb, kStrideA, \
+ dst_argb_c, kStrideB, \
+ kWidth, kHeight); \
+ MaskCpuFlags(-1); \
+ FMT_A##To##FMT_B(src_argb, kStrideA, \
+ dst_argb_opt, kStrideB, \
+ kWidth, kHeight); \
+ int max_diff = 0; \
+ for (int i = 0; i < kStrideB * kHeightB; ++i) { \
+ int abs_diff = \
+ abs(static_cast<int>(dst_argb_c[i]) - \
+ static_cast<int>(dst_argb_opt[i])); \
+ if (abs_diff > max_diff) { \
+ max_diff = abs_diff; \
+ } \
+ } \
+ EXPECT_LE(max_diff, DIFF); \
+ free_aligned_buffer_page_end(src_argb) \
+ free_aligned_buffer_page_end(dst_argb_c) \
+ free_aligned_buffer_page_end(dst_argb_opt) \
+ } \
+}
+
+#define TESTATOB(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, \
+ FMT_B, BPP_B, STRIDE_B, HEIGHT_B, DIFF) \
+ TESTATOBI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, \
+ FMT_B, BPP_B, STRIDE_B, HEIGHT_B, \
+ benchmark_width_ - 4, DIFF, _Any, +, 0) \
+ TESTATOBI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, \
+ FMT_B, BPP_B, STRIDE_B, HEIGHT_B, \
+ benchmark_width_, DIFF, _Unaligned, +, 1) \
+ TESTATOBI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, \
+ FMT_B, BPP_B, STRIDE_B, HEIGHT_B, \
+ benchmark_width_, DIFF, _Invert, -, 0) \
+ TESTATOBI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, \
+ FMT_B, BPP_B, STRIDE_B, HEIGHT_B, \
+ benchmark_width_, DIFF, _Opt, +, 0) \
+ TESTATOBRANDOM(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, \
+ FMT_B, BPP_B, STRIDE_B, HEIGHT_B, DIFF)
+
+TESTATOB(ARGB, 4, 4, 1, ARGB, 4, 4, 1, 0)
+TESTATOB(ARGB, 4, 4, 1, BGRA, 4, 4, 1, 0)
+TESTATOB(ARGB, 4, 4, 1, ABGR, 4, 4, 1, 0)
+TESTATOB(ARGB, 4, 4, 1, RGBA, 4, 4, 1, 0)
+TESTATOB(ARGB, 4, 4, 1, RAW, 3, 3, 1, 0)
+TESTATOB(ARGB, 4, 4, 1, RGB24, 3, 3, 1, 0)
+TESTATOB(ARGB, 4, 4, 1, RGB565, 2, 2, 1, 0)
+TESTATOB(ARGB, 4, 4, 1, ARGB1555, 2, 2, 1, 0)
+TESTATOB(ARGB, 4, 4, 1, ARGB4444, 2, 2, 1, 0)
+TESTATOB(ARGB, 4, 4, 1, BayerBGGR, 1, 2, 2, 0)
+TESTATOB(ARGB, 4, 4, 1, BayerRGGB, 1, 2, 2, 0)
+TESTATOB(ARGB, 4, 4, 1, BayerGBRG, 1, 2, 2, 0)
+TESTATOB(ARGB, 4, 4, 1, BayerGRBG, 1, 2, 2, 0)
+TESTATOB(ARGB, 4, 4, 1, YUY2, 2, 4, 1, 4)
+TESTATOB(ARGB, 4, 4, 1, UYVY, 2, 4, 1, 4)
+TESTATOB(ARGB, 4, 4, 1, I400, 1, 1, 1, 2)
+TESTATOB(ARGB, 4, 4, 1, J400, 1, 1, 1, 2)
+TESTATOB(BGRA, 4, 4, 1, ARGB, 4, 4, 1, 0)
+TESTATOB(ABGR, 4, 4, 1, ARGB, 4, 4, 1, 0)
+TESTATOB(RGBA, 4, 4, 1, ARGB, 4, 4, 1, 0)
+TESTATOB(RAW, 3, 3, 1, ARGB, 4, 4, 1, 0)
+TESTATOB(RGB24, 3, 3, 1, ARGB, 4, 4, 1, 0)
+TESTATOB(RGB565, 2, 2, 1, ARGB, 4, 4, 1, 0)
+TESTATOB(ARGB1555, 2, 2, 1, ARGB, 4, 4, 1, 0)
+TESTATOB(ARGB4444, 2, 2, 1, ARGB, 4, 4, 1, 0)
+TESTATOB(YUY2, 2, 4, 1, ARGB, 4, 4, 1, 4)
+TESTATOB(UYVY, 2, 4, 1, ARGB, 4, 4, 1, 4)
+TESTATOB(BayerBGGR, 1, 2, 2, ARGB, 4, 4, 1, 0)
+TESTATOB(BayerRGGB, 1, 2, 2, ARGB, 4, 4, 1, 0)
+TESTATOB(BayerGBRG, 1, 2, 2, ARGB, 4, 4, 1, 0)
+TESTATOB(BayerGRBG, 1, 2, 2, ARGB, 4, 4, 1, 0)
+TESTATOB(I400, 1, 1, 1, ARGB, 4, 4, 1, 0)
+TESTATOB(I400, 1, 1, 1, I400, 1, 1, 1, 0)
+TESTATOB(I400, 1, 1, 1, I400Mirror, 1, 1, 1, 0)
+TESTATOB(Y, 1, 1, 1, ARGB, 4, 4, 1, 0)
+TESTATOB(ARGB, 4, 4, 1, ARGBMirror, 4, 4, 1, 0)
+
+TEST_F(libyuvTest, Test565) {
+ SIMD_ALIGNED(uint8 orig_pixels[256][4]);
+ SIMD_ALIGNED(uint8 pixels565[256][2]);
+
+ for (int i = 0; i < 256; ++i) {
+ for (int j = 0; j < 4; ++j) {
+ orig_pixels[i][j] = i;
+ }
+ }
+ ARGBToRGB565(&orig_pixels[0][0], 0, &pixels565[0][0], 0, 256, 1);
+ uint32 checksum = HashDjb2(&pixels565[0][0], sizeof(pixels565), 5381);
+ EXPECT_EQ(610919429u, checksum);
+}
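Test565 pins the RGB565 output to a single djb2 checksum rather than comparing against a reference conversion. A minimal sketch of the hash being matched (HashDjb2Reference is a made-up name; the library's HashDjb2 uses the same 5381 seed):

// djb2: hash = hash * 33 + byte, seeded with 5381.
uint32 HashDjb2Reference(const uint8* src, int count, uint32 seed) {
  uint32 hash = seed;
  for (int i = 0; i < count; ++i) {
    hash = hash * 33 + src[i];
  }
  return hash;
}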
+
+#ifdef HAVE_JPEG
+TEST_F(libyuvTest, ValidateJpeg) {
+ const int kOff = 10;
+ const int kMinJpeg = 64;
+ const int kImageSize = benchmark_width_ * benchmark_height_ >= kMinJpeg ?
+ benchmark_width_ * benchmark_height_ : kMinJpeg;
+ const int kSize = kImageSize + kOff;
+  align_buffer_page_end(orig_pixels, kSize);
+
+  // Start with a zeroed buffer: no SOI or EOI markers present yet.
+  memset(orig_pixels, 0, kSize);
+
+  // SOI at the start and EOI near the end. Expect pass.
+ orig_pixels[0] = 0xff;
+ orig_pixels[1] = 0xd8; // SOI.
+ orig_pixels[kSize - kOff + 0] = 0xff;
+ orig_pixels[kSize - kOff + 1] = 0xd9; // EOI.
+ for (int times = 0; times < benchmark_iterations_; ++times) {
+ EXPECT_TRUE(ValidateJpeg(orig_pixels, kSize));
+ }
+ free_aligned_buffer_page_end(orig_pixels);
+}
+
+TEST_F(libyuvTest, InvalidateJpeg) {
+ const int kOff = 10;
+ const int kMinJpeg = 64;
+ const int kImageSize = benchmark_width_ * benchmark_height_ >= kMinJpeg ?
+ benchmark_width_ * benchmark_height_ : kMinJpeg;
+ const int kSize = kImageSize + kOff;
+  align_buffer_page_end(orig_pixels, kSize);
+
+ // No SOI or EOI. Expect fail.
+ memset(orig_pixels, 0, kSize);
+ EXPECT_FALSE(ValidateJpeg(orig_pixels, kSize));
+
+ // SOI but no EOI. Expect fail.
+ orig_pixels[0] = 0xff;
+ orig_pixels[1] = 0xd8; // SOI.
+ for (int times = 0; times < benchmark_iterations_; ++times) {
+ EXPECT_FALSE(ValidateJpeg(orig_pixels, kSize));
+ }
+ // EOI but no SOI. Expect fail.
+ orig_pixels[0] = 0;
+ orig_pixels[1] = 0;
+ orig_pixels[kSize - kOff + 0] = 0xff;
+ orig_pixels[kSize - kOff + 1] = 0xd9; // EOI.
+ EXPECT_FALSE(ValidateJpeg(orig_pixels, kSize));
+
+ free_aligned_buffer_page_end(orig_pixels);
+}
+
+#endif
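For context, the two JPEG tests above only exercise the marker check: a buffer passes when it begins with the SOI marker 0xFF 0xD8 and an EOI marker 0xFF 0xD9 appears near the end. A simplified sketch of that shape (HasJpegSoiAndEoi is a made-up helper, not the library's implementation, which limits how far back it scans):

bool HasJpegSoiAndEoi(const uint8* buf, int len) {
  if (len < 4 || buf[0] != 0xff || buf[1] != 0xd8) {
    return false;                      // missing SOI at the start.
  }
  for (int i = len - 2; i > 1; --i) {  // search backwards for EOI.
    if (buf[i] == 0xff && buf[i + 1] == 0xd9) {
      return true;
    }
  }
  return false;
}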
+
+} // namespace libyuv
diff --git a/chromium/third_party/libyuv/unit_test/cpu_test.cc b/chromium/third_party/libyuv/unit_test/cpu_test.cc
index 67c489cfc93..45579b8913e 100644
--- a/chromium/third_party/libyuv/unit_test/cpu_test.cc
+++ b/chromium/third_party/libyuv/unit_test/cpu_test.cc
@@ -41,6 +41,8 @@ TEST_F(libyuvTest, TestCpuHas) {
printf("Has AVX2 %x\n", has_avx2);
int has_erms = TestCpuFlag(kCpuHasERMS);
printf("Has ERMS %x\n", has_erms);
+ int has_fma3 = TestCpuFlag(kCpuHasFMA3);
+ printf("Has FMA3 %x\n", has_fma3);
int has_mips = TestCpuFlag(kCpuHasMIPS);
printf("Has MIPS %x\n", has_mips);
int has_mips_dsp = TestCpuFlag(kCpuHasMIPS_DSP);
@@ -54,7 +56,7 @@ TEST_F(libyuvTest, TestCpuHas) {
TEST_F(libyuvTest, TestCpuId) {
int has_x86 = TestCpuFlag(kCpuHasX86);
if (has_x86) {
- int cpu_info[4];
+ uint32 cpu_info[4];
// Vendor ID:
// AuthenticAMD AMD processor
// CentaurHauls Centaur processor
@@ -66,7 +68,7 @@ TEST_F(libyuvTest, TestCpuId) {
// RiseRiseRise Rise Technology processor
// SiS SiS SiS SiS processor
// UMC UMC UMC UMC processor
- CpuId(cpu_info, 0);
+ CpuId(0, 0, cpu_info);
cpu_info[0] = cpu_info[1]; // Reorder output
cpu_info[1] = cpu_info[3];
cpu_info[3] = 0;
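The call sites in this hunk and the next follow the updated CpuId signature from this patch, which takes the leaf and sub-leaf explicitly and writes the four registers into the output array, presumably so that sub-leaf queries such as leaf 7 (AVX2, ERMS) are possible. Usage as exercised here:

uint32 cpu_info[4];      // eax, ebx, ecx, edx on return.
CpuId(0, 0, cpu_info);   // leaf 0: vendor string in ebx/edx/ecx.
CpuId(1, 0, cpu_info);   // leaf 1: family/model in eax, feature bits in ecx/edx.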
@@ -81,7 +83,7 @@ TEST_F(libyuvTest, TestCpuId) {
// 13:12 - Processor Type
// 19:16 - Extended Model
// 27:20 - Extended Family
- CpuId(cpu_info, 1);
+ CpuId(1, 0, cpu_info);
int family = ((cpu_info[0] >> 8) & 0x0f) | ((cpu_info[0] >> 16) & 0xff0);
int model = ((cpu_info[0] >> 4) & 0x0f) | ((cpu_info[0] >> 12) & 0xf0);
printf("Cpu Family %d (0x%x), Model %d (0x%x)\n", family, family,
@@ -93,10 +95,8 @@ TEST_F(libyuvTest, TestCpuId) {
TEST_F(libyuvTest, TestLinuxNeon) {
int testdata = ArmCpuCaps("unit_test/testdata/arm_v7.txt");
if (testdata) {
- EXPECT_EQ(0,
- ArmCpuCaps("unit_test/testdata/arm_v7.txt"));
- EXPECT_EQ(kCpuHasNEON,
- ArmCpuCaps("unit_test/testdata/tegra3.txt"));
+ EXPECT_EQ(0, ArmCpuCaps("unit_test/testdata/arm_v7.txt"));
+ EXPECT_EQ(kCpuHasNEON, ArmCpuCaps("unit_test/testdata/tegra3.txt"));
} else {
printf("WARNING: unable to load \"unit_test/testdata/arm_v7.txt\"\n");
}
diff --git a/chromium/third_party/libyuv/unit_test/math_test.cc b/chromium/third_party/libyuv/unit_test/math_test.cc
new file mode 100644
index 00000000000..4095c122eb6
--- /dev/null
+++ b/chromium/third_party/libyuv/unit_test/math_test.cc
@@ -0,0 +1,114 @@
+/*
+ * Copyright 2013 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "libyuv/basic_types.h"
+#include "libyuv/cpu_id.h"
+#include "libyuv/row.h"
+#include "../unit_test/unit_test.h"
+
+namespace libyuv {
+
+TEST_F(libyuvTest, TestFixedDiv) {
+ int num[256];
+ int div[256];
+ int result_opt[256];
+ int result_c[256];
+
+ EXPECT_EQ(0x20000, libyuv::FixedDiv(640 * 2, 640));
+ EXPECT_EQ(0x30000, libyuv::FixedDiv(640 * 3, 640));
+ EXPECT_EQ(0x40000, libyuv::FixedDiv(640 * 4, 640));
+ EXPECT_EQ(0x50000, libyuv::FixedDiv(640 * 5, 640));
+ EXPECT_EQ(0x60000, libyuv::FixedDiv(640 * 6, 640));
+ EXPECT_EQ(0x70000, libyuv::FixedDiv(640 * 7, 640));
+ EXPECT_EQ(0x80000, libyuv::FixedDiv(640 * 8, 640));
+ EXPECT_EQ(0xa0000, libyuv::FixedDiv(640 * 10, 640));
+ EXPECT_EQ(0x20000, libyuv::FixedDiv(960 * 2, 960));
+ EXPECT_EQ(0x08000, libyuv::FixedDiv(640 / 2, 640));
+ EXPECT_EQ(0x04000, libyuv::FixedDiv(640 / 4, 640));
+ EXPECT_EQ(0x20000, libyuv::FixedDiv(1080 * 2, 1080));
+ EXPECT_EQ(0x20000, libyuv::FixedDiv(200000, 100000));
+ EXPECT_EQ(0x18000, libyuv::FixedDiv(150000, 100000));
+ EXPECT_EQ(0x20000, libyuv::FixedDiv(40000, 20000));
+ EXPECT_EQ(0x20000, libyuv::FixedDiv(-40000, -20000));
+ EXPECT_EQ(-0x20000, libyuv::FixedDiv(40000, -20000));
+ EXPECT_EQ(-0x20000, libyuv::FixedDiv(-40000, 20000));
+ EXPECT_EQ(0x10000, libyuv::FixedDiv(4095, 4095));
+ EXPECT_EQ(0x10000, libyuv::FixedDiv(4096, 4096));
+ EXPECT_EQ(0x10000, libyuv::FixedDiv(4097, 4097));
+ EXPECT_EQ(123 * 65536, libyuv::FixedDiv(123, 1));
+
+ for (int i = 1; i < 4100; ++i) {
+ EXPECT_EQ(0x10000, libyuv::FixedDiv(i, i));
+ EXPECT_EQ(0x20000, libyuv::FixedDiv(i * 2, i));
+ EXPECT_EQ(0x30000, libyuv::FixedDiv(i * 3, i));
+ EXPECT_EQ(0x40000, libyuv::FixedDiv(i * 4, i));
+ EXPECT_EQ(0x08000, libyuv::FixedDiv(i, i * 2));
+ EXPECT_NEAR(16384 * 65536 / i, libyuv::FixedDiv(16384, i), 1);
+ }
+ EXPECT_EQ(123 * 65536, libyuv::FixedDiv(123, 1));
+
+ srandom(time(NULL));
+ MemRandomize(reinterpret_cast<uint8*>(&num[0]), sizeof(num));
+ MemRandomize(reinterpret_cast<uint8*>(&div[0]), sizeof(div));
+ for (int j = 0; j < 256; ++j) {
+ if (div[j] == 0) {
+ div[j] = 1280;
+ }
+ }
+ for (int i = 0; i < benchmark_pixels_div256_; ++i) {
+ for (int j = 0; j < 256; ++j) {
+ result_opt[j] = libyuv::FixedDiv(num[j], div[j]);
+ }
+ }
+ for (int j = 0; j < 256; ++j) {
+ result_c[j] = libyuv::FixedDiv_C(num[j], div[j]);
+ EXPECT_NEAR(result_c[j], result_opt[j], 1);
+ }
+}
+
+TEST_F(libyuvTest, TestFixedDiv_Opt) {
+ int num[256];
+ int div[256];
+ int result_opt[256];
+ int result_c[256];
+
+ srandom(time(NULL));
+ MemRandomize(reinterpret_cast<uint8*>(&num[0]), sizeof(num));
+ MemRandomize(reinterpret_cast<uint8*>(&div[0]), sizeof(div));
+ for (int j = 0; j < 256; ++j) {
+ num[j] &= 4095; // Make numerator smaller.
+ div[j] &= 4095; // Make divisor smaller.
+ if (div[j] == 0) {
+ div[j] = 1280;
+ }
+ }
+
+ int has_x86 = TestCpuFlag(kCpuHasX86);
+ for (int i = 0; i < benchmark_pixels_div256_; ++i) {
+ if (has_x86) {
+ for (int j = 0; j < 256; ++j) {
+ result_opt[j] = libyuv::FixedDiv(num[j], div[j]);
+ }
+ } else {
+ for (int j = 0; j < 256; ++j) {
+ result_opt[j] = libyuv::FixedDiv_C(num[j], div[j]);
+ }
+ }
+ }
+ for (int j = 0; j < 256; ++j) {
+ result_c[j] = libyuv::FixedDiv_C(num[j], div[j]);
+ EXPECT_NEAR(result_c[j], result_opt[j], 1);
+ }
+}
+
+} // namespace libyuv
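Note: the new math test pins FixedDiv down as a 16.16 fixed-point divide: FixedDiv(2 * n, n) is 0x20000, FixedDiv(123, 1) is 123 << 16, and the sign follows ordinary division. A portable sketch consistent with those expectations (the name FixedDiv16 is illustrative; libyuv's optimized path uses x86 assembly):

    #include <stdint.h>

    // (num * 65536) / div in 64-bit arithmetic, truncated toward zero.
    // FixedDiv16(640 * 2, 640) == 0x20000, FixedDiv16(-40000, 20000) == -0x20000.
    static int FixedDiv16(int num, int div) {
      return static_cast<int>(static_cast<int64_t>(num) * 65536 / div);
    }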
diff --git a/chromium/third_party/libyuv/unit_test/planar_test.cc b/chromium/third_party/libyuv/unit_test/planar_test.cc
index 2c9958baae1..7759db406ff 100644
--- a/chromium/third_party/libyuv/unit_test/planar_test.cc
+++ b/chromium/third_party/libyuv/unit_test/planar_test.cc
@@ -32,77 +32,83 @@
namespace libyuv {
TEST_F(libyuvTest, TestAttenuate) {
- SIMD_ALIGNED(uint8 orig_pixels[256][4]);
- SIMD_ALIGNED(uint8 atten_pixels[256][4]);
- SIMD_ALIGNED(uint8 unatten_pixels[256][4]);
- SIMD_ALIGNED(uint8 atten2_pixels[256][4]);
+ const int kSize = 1280 * 4;
+ align_buffer_64(orig_pixels, kSize);
+ align_buffer_64(atten_pixels, kSize);
+ align_buffer_64(unatten_pixels, kSize);
+ align_buffer_64(atten2_pixels, kSize);
// Test unattenuation clamps
- orig_pixels[0][0] = 200u;
- orig_pixels[0][1] = 129u;
- orig_pixels[0][2] = 127u;
- orig_pixels[0][3] = 128u;
+ orig_pixels[0 * 4 + 0] = 200u;
+ orig_pixels[0 * 4 + 1] = 129u;
+ orig_pixels[0 * 4 + 2] = 127u;
+ orig_pixels[0 * 4 + 3] = 128u;
// Test unattenuation transparent and opaque are unaffected
- orig_pixels[1][0] = 16u;
- orig_pixels[1][1] = 64u;
- orig_pixels[1][2] = 192u;
- orig_pixels[1][3] = 0u;
- orig_pixels[2][0] = 16u;
- orig_pixels[2][1] = 64u;
- orig_pixels[2][2] = 192u;
- orig_pixels[2][3] = 255u;
- orig_pixels[3][0] = 16u;
- orig_pixels[3][1] = 64u;
- orig_pixels[3][2] = 192u;
- orig_pixels[3][3] = 128u;
- ARGBUnattenuate(&orig_pixels[0][0], 0, &unatten_pixels[0][0], 0, 4, 1);
- EXPECT_EQ(255u, unatten_pixels[0][0]);
- EXPECT_EQ(255u, unatten_pixels[0][1]);
- EXPECT_EQ(254u, unatten_pixels[0][2]);
- EXPECT_EQ(128u, unatten_pixels[0][3]);
- EXPECT_EQ(0u, unatten_pixels[1][0]);
- EXPECT_EQ(0u, unatten_pixels[1][1]);
- EXPECT_EQ(0u, unatten_pixels[1][2]);
- EXPECT_EQ(0u, unatten_pixels[1][3]);
- EXPECT_EQ(16u, unatten_pixels[2][0]);
- EXPECT_EQ(64u, unatten_pixels[2][1]);
- EXPECT_EQ(192u, unatten_pixels[2][2]);
- EXPECT_EQ(255u, unatten_pixels[2][3]);
- EXPECT_EQ(32u, unatten_pixels[3][0]);
- EXPECT_EQ(128u, unatten_pixels[3][1]);
- EXPECT_EQ(255u, unatten_pixels[3][2]);
- EXPECT_EQ(128u, unatten_pixels[3][3]);
-
- for (int i = 0; i < 256; ++i) {
- orig_pixels[i][0] = i;
- orig_pixels[i][1] = i / 2;
- orig_pixels[i][2] = i / 3;
- orig_pixels[i][3] = i;
- }
- ARGBAttenuate(&orig_pixels[0][0], 0, &atten_pixels[0][0], 0, 256, 1);
- ARGBUnattenuate(&atten_pixels[0][0], 0, &unatten_pixels[0][0], 0, 256, 1);
- for (int i = 0; i < benchmark_pixels_div256_; ++i) {
- ARGBAttenuate(&unatten_pixels[0][0], 0, &atten2_pixels[0][0], 0, 256, 1);
- }
- for (int i = 0; i < 256; ++i) {
- EXPECT_NEAR(atten_pixels[i][0], atten2_pixels[i][0], 2);
- EXPECT_NEAR(atten_pixels[i][1], atten2_pixels[i][1], 2);
- EXPECT_NEAR(atten_pixels[i][2], atten2_pixels[i][2], 2);
- EXPECT_NEAR(atten_pixels[i][3], atten2_pixels[i][3], 2);
+ orig_pixels[1 * 4 + 0] = 16u;
+ orig_pixels[1 * 4 + 1] = 64u;
+ orig_pixels[1 * 4 + 2] = 192u;
+ orig_pixels[1 * 4 + 3] = 0u;
+ orig_pixels[2 * 4 + 0] = 16u;
+ orig_pixels[2 * 4 + 1] = 64u;
+ orig_pixels[2 * 4 + 2] = 192u;
+ orig_pixels[2 * 4 + 3] = 255u;
+ orig_pixels[3 * 4 + 0] = 16u;
+ orig_pixels[3 * 4 + 1] = 64u;
+ orig_pixels[3 * 4 + 2] = 192u;
+ orig_pixels[3 * 4 + 3] = 128u;
+ ARGBUnattenuate(orig_pixels, 0, unatten_pixels, 0, 4, 1);
+ EXPECT_EQ(255u, unatten_pixels[0 * 4 + 0]);
+ EXPECT_EQ(255u, unatten_pixels[0 * 4 + 1]);
+ EXPECT_EQ(254u, unatten_pixels[0 * 4 + 2]);
+ EXPECT_EQ(128u, unatten_pixels[0 * 4 + 3]);
+ EXPECT_EQ(0u, unatten_pixels[1 * 4 + 0]);
+ EXPECT_EQ(0u, unatten_pixels[1 * 4 + 1]);
+ EXPECT_EQ(0u, unatten_pixels[1 * 4 + 2]);
+ EXPECT_EQ(0u, unatten_pixels[1 * 4 + 3]);
+ EXPECT_EQ(16u, unatten_pixels[2 * 4 + 0]);
+ EXPECT_EQ(64u, unatten_pixels[2 * 4 + 1]);
+ EXPECT_EQ(192u, unatten_pixels[2 * 4 + 2]);
+ EXPECT_EQ(255u, unatten_pixels[2 * 4 + 3]);
+ EXPECT_EQ(32u, unatten_pixels[3 * 4 + 0]);
+ EXPECT_EQ(128u, unatten_pixels[3 * 4 + 1]);
+ EXPECT_EQ(255u, unatten_pixels[3 * 4 + 2]);
+ EXPECT_EQ(128u, unatten_pixels[3 * 4 + 3]);
+
+ for (int i = 0; i < 1280; ++i) {
+ orig_pixels[i * 4 + 0] = i;
+ orig_pixels[i * 4 + 1] = i / 2;
+ orig_pixels[i * 4 + 2] = i / 3;
+ orig_pixels[i * 4 + 3] = i;
+ }
+ ARGBAttenuate(orig_pixels, 0, atten_pixels, 0, 1280, 1);
+ ARGBUnattenuate(atten_pixels, 0, unatten_pixels, 0, 1280, 1);
+ for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
+ ARGBAttenuate(unatten_pixels, 0, atten2_pixels, 0, 1280, 1);
+ }
+ for (int i = 0; i < 1280; ++i) {
+ EXPECT_NEAR(atten_pixels[i * 4 + 0], atten2_pixels[i * 4 + 0], 2);
+ EXPECT_NEAR(atten_pixels[i * 4 + 1], atten2_pixels[i * 4 + 1], 2);
+ EXPECT_NEAR(atten_pixels[i * 4 + 2], atten2_pixels[i * 4 + 2], 2);
+ EXPECT_NEAR(atten_pixels[i * 4 + 3], atten2_pixels[i * 4 + 3], 2);
}
// Make sure transparent, 50% and opaque are fully accurate.
- EXPECT_EQ(0, atten_pixels[0][0]);
- EXPECT_EQ(0, atten_pixels[0][1]);
- EXPECT_EQ(0, atten_pixels[0][2]);
- EXPECT_EQ(0, atten_pixels[0][3]);
- EXPECT_EQ(64, atten_pixels[128][0]);
- EXPECT_EQ(32, atten_pixels[128][1]);
- EXPECT_EQ(21, atten_pixels[128][2]);
- EXPECT_EQ(128, atten_pixels[128][3]);
- EXPECT_NEAR(255, atten_pixels[255][0], 1);
- EXPECT_NEAR(127, atten_pixels[255][1], 1);
- EXPECT_NEAR(85, atten_pixels[255][2], 1);
- EXPECT_EQ(255, atten_pixels[255][3]);
+ EXPECT_EQ(0, atten_pixels[0 * 4 + 0]);
+ EXPECT_EQ(0, atten_pixels[0 * 4 + 1]);
+ EXPECT_EQ(0, atten_pixels[0 * 4 + 2]);
+ EXPECT_EQ(0, atten_pixels[0 * 4 + 3]);
+ EXPECT_EQ(64, atten_pixels[128 * 4 + 0]);
+ EXPECT_EQ(32, atten_pixels[128 * 4 + 1]);
+ EXPECT_EQ(21, atten_pixels[128 * 4 + 2]);
+ EXPECT_EQ(128, atten_pixels[128 * 4 + 3]);
+ EXPECT_NEAR(255, atten_pixels[255 * 4 + 0], 1);
+ EXPECT_NEAR(127, atten_pixels[255 * 4 + 1], 1);
+ EXPECT_NEAR(85, atten_pixels[255 * 4 + 2], 1);
+ EXPECT_EQ(255, atten_pixels[255 * 4 + 3]);
+
+ free_aligned_buffer_64(atten2_pixels)
+ free_aligned_buffer_64(unatten_pixels)
+ free_aligned_buffer_64(atten_pixels)
+ free_aligned_buffer_64(orig_pixels)
}
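Note: the expectations above are consistent with attenuation being a straight premultiply, each color channel scaled by alpha / 255 with alpha left alone (128 * 128 / 255 is 64, 42 * 128 / 255 is 21). A per-pixel sketch with an illustrative helper name; libyuv's C and SIMD rows round slightly differently, which is why the round trip uses EXPECT_NEAR:

    #include <stdint.h>

    // Premultiply one BGRA pixel: channel * alpha / 255, alpha unchanged.
    static void AttenuatePixel(const uint8_t in[4], uint8_t out[4]) {
      const uint32_t a = in[3];
      out[0] = static_cast<uint8_t>(in[0] * a / 255);  // B
      out[1] = static_cast<uint8_t>(in[1] * a / 255);  // G
      out[2] = static_cast<uint8_t>(in[2] * a / 255);  // R
      out[3] = in[3];                                  // A
    }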
static int TestAttenuateI(int width, int height, int benchmark_iterations,
@@ -268,7 +274,9 @@ TEST_F(libyuvTest, TestARGBComputeCumulativeSum) {
}
TEST_F(libyuvTest, TestARGBGray) {
- SIMD_ALIGNED(uint8 orig_pixels[256][4]);
+ SIMD_ALIGNED(uint8 orig_pixels[1280][4]);
+ memset(orig_pixels, 0, sizeof(orig_pixels));
+
// Test blue
orig_pixels[0][0] = 255u;
orig_pixels[0][1] = 0u;
@@ -325,20 +333,22 @@ TEST_F(libyuvTest, TestARGBGray) {
EXPECT_EQ(96u, orig_pixels[5][1]);
EXPECT_EQ(96u, orig_pixels[5][2]);
EXPECT_EQ(224u, orig_pixels[5][3]);
- for (int i = 0; i < 256; ++i) {
+ for (int i = 0; i < 1280; ++i) {
orig_pixels[i][0] = i;
orig_pixels[i][1] = i / 2;
orig_pixels[i][2] = i / 3;
orig_pixels[i][3] = i;
}
- for (int i = 0; i < benchmark_pixels_div256_; ++i) {
- ARGBGray(&orig_pixels[0][0], 0, 0, 0, 256, 1);
+ for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
+ ARGBGray(&orig_pixels[0][0], 0, 0, 0, 1280, 1);
}
}
TEST_F(libyuvTest, TestARGBGrayTo) {
- SIMD_ALIGNED(uint8 orig_pixels[256][4]);
- SIMD_ALIGNED(uint8 gray_pixels[256][4]);
+ SIMD_ALIGNED(uint8 orig_pixels[1280][4]);
+ SIMD_ALIGNED(uint8 gray_pixels[1280][4]);
+ memset(orig_pixels, 0, sizeof(orig_pixels));
+
// Test blue
orig_pixels[0][0] = 255u;
orig_pixels[0][1] = 0u;
@@ -395,19 +405,20 @@ TEST_F(libyuvTest, TestARGBGrayTo) {
EXPECT_EQ(96u, gray_pixels[5][1]);
EXPECT_EQ(96u, gray_pixels[5][2]);
EXPECT_EQ(224u, gray_pixels[5][3]);
- for (int i = 0; i < 256; ++i) {
+ for (int i = 0; i < 1280; ++i) {
orig_pixels[i][0] = i;
orig_pixels[i][1] = i / 2;
orig_pixels[i][2] = i / 3;
orig_pixels[i][3] = i;
}
- for (int i = 0; i < benchmark_pixels_div256_; ++i) {
- ARGBGrayTo(&orig_pixels[0][0], 0, &gray_pixels[0][0], 0, 256, 1);
+ for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
+ ARGBGrayTo(&orig_pixels[0][0], 0, &gray_pixels[0][0], 0, 1280, 1);
}
}
TEST_F(libyuvTest, TestARGBSepia) {
- SIMD_ALIGNED(uint8 orig_pixels[256][4]);
+ SIMD_ALIGNED(uint8 orig_pixels[1280][4]);
+ memset(orig_pixels, 0, sizeof(orig_pixels));
// Test blue
orig_pixels[0][0] = 255u;
@@ -466,27 +477,106 @@ TEST_F(libyuvTest, TestARGBSepia) {
EXPECT_EQ(127u, orig_pixels[5][2]);
EXPECT_EQ(224u, orig_pixels[5][3]);
- for (int i = 0; i < 256; ++i) {
+ for (int i = 0; i < 1280; ++i) {
orig_pixels[i][0] = i;
orig_pixels[i][1] = i / 2;
orig_pixels[i][2] = i / 3;
orig_pixels[i][3] = i;
}
- for (int i = 0; i < benchmark_pixels_div256_; ++i) {
- ARGBSepia(&orig_pixels[0][0], 0, 0, 0, 256, 1);
+ for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
+ ARGBSepia(&orig_pixels[0][0], 0, 0, 0, 1280, 1);
}
}
TEST_F(libyuvTest, TestARGBColorMatrix) {
- SIMD_ALIGNED(uint8 orig_pixels[256][4]);
+ SIMD_ALIGNED(uint8 orig_pixels[1280][4]);
+ SIMD_ALIGNED(uint8 dst_pixels_opt[1280][4]);
+ SIMD_ALIGNED(uint8 dst_pixels_c[1280][4]);
// Matrix for Sepia.
- static const int8 kARGBToSepia[] = {
+ SIMD_ALIGNED(static const int8 kRGBToSepia[]) = {
+ 17 / 2, 68 / 2, 35 / 2, 0,
+ 22 / 2, 88 / 2, 45 / 2, 0,
+ 24 / 2, 98 / 2, 50 / 2, 0,
+ 0, 0, 0, 64, // Copy alpha.
+ };
+ memset(orig_pixels, 0, sizeof(orig_pixels));
+
+ // Test blue
+ orig_pixels[0][0] = 255u;
+ orig_pixels[0][1] = 0u;
+ orig_pixels[0][2] = 0u;
+ orig_pixels[0][3] = 128u;
+ // Test green
+ orig_pixels[1][0] = 0u;
+ orig_pixels[1][1] = 255u;
+ orig_pixels[1][2] = 0u;
+ orig_pixels[1][3] = 0u;
+ // Test red
+ orig_pixels[2][0] = 0u;
+ orig_pixels[2][1] = 0u;
+ orig_pixels[2][2] = 255u;
+ orig_pixels[2][3] = 255u;
+ // Test color
+ orig_pixels[3][0] = 16u;
+ orig_pixels[3][1] = 64u;
+ orig_pixels[3][2] = 192u;
+ orig_pixels[3][3] = 224u;
+ // Do 16 to test asm version.
+ ARGBColorMatrix(&orig_pixels[0][0], 0, &dst_pixels_opt[0][0], 0,
+ &kRGBToSepia[0], 16, 1);
+ EXPECT_EQ(31u, dst_pixels_opt[0][0]);
+ EXPECT_EQ(43u, dst_pixels_opt[0][1]);
+ EXPECT_EQ(47u, dst_pixels_opt[0][2]);
+ EXPECT_EQ(128u, dst_pixels_opt[0][3]);
+ EXPECT_EQ(135u, dst_pixels_opt[1][0]);
+ EXPECT_EQ(175u, dst_pixels_opt[1][1]);
+ EXPECT_EQ(195u, dst_pixels_opt[1][2]);
+ EXPECT_EQ(0u, dst_pixels_opt[1][3]);
+ EXPECT_EQ(67u, dst_pixels_opt[2][0]);
+ EXPECT_EQ(87u, dst_pixels_opt[2][1]);
+ EXPECT_EQ(99u, dst_pixels_opt[2][2]);
+ EXPECT_EQ(255u, dst_pixels_opt[2][3]);
+ EXPECT_EQ(87u, dst_pixels_opt[3][0]);
+ EXPECT_EQ(112u, dst_pixels_opt[3][1]);
+ EXPECT_EQ(127u, dst_pixels_opt[3][2]);
+ EXPECT_EQ(224u, dst_pixels_opt[3][3]);
+
+ for (int i = 0; i < 1280; ++i) {
+ orig_pixels[i][0] = i;
+ orig_pixels[i][1] = i / 2;
+ orig_pixels[i][2] = i / 3;
+ orig_pixels[i][3] = i;
+ }
+ MaskCpuFlags(0);
+ ARGBColorMatrix(&orig_pixels[0][0], 0, &dst_pixels_c[0][0], 0,
+ &kRGBToSepia[0], 1280, 1);
+ MaskCpuFlags(-1);
+
+ for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
+ ARGBColorMatrix(&orig_pixels[0][0], 0, &dst_pixels_opt[0][0], 0,
+ &kRGBToSepia[0], 1280, 1);
+ }
+
+ for (int i = 0; i < 1280; ++i) {
+ EXPECT_EQ(dst_pixels_c[i][0], dst_pixels_opt[i][0]);
+ EXPECT_EQ(dst_pixels_c[i][1], dst_pixels_opt[i][1]);
+ EXPECT_EQ(dst_pixels_c[i][2], dst_pixels_opt[i][2]);
+ EXPECT_EQ(dst_pixels_c[i][3], dst_pixels_opt[i][3]);
+ }
+}
+
+TEST_F(libyuvTest, TestRGBColorMatrix) {
+ SIMD_ALIGNED(uint8 orig_pixels[1280][4]);
+
+ // Matrix for Sepia.
+ SIMD_ALIGNED(static const int8 kRGBToSepia[]) = {
17, 68, 35, 0,
22, 88, 45, 0,
24, 98, 50, 0,
0, 0, 0, 0, // Unused but makes matrix 16 bytes.
};
+ memset(orig_pixels, 0, sizeof(orig_pixels));
// Test blue
orig_pixels[0][0] = 255u;
@@ -509,8 +599,8 @@ TEST_F(libyuvTest, TestARGBColorMatrix) {
orig_pixels[3][2] = 192u;
orig_pixels[3][3] = 224u;
// Do 16 to test asm version.
- ARGBColorMatrix(&orig_pixels[0][0], 0, &kARGBToSepia[0], 0, 0, 16, 1);
- EXPECT_EQ(33u, orig_pixels[0][0]);
+ RGBColorMatrix(&orig_pixels[0][0], 0, &kRGBToSepia[0], 0, 0, 16, 1);
+ EXPECT_EQ(31u, orig_pixels[0][0]);
EXPECT_EQ(43u, orig_pixels[0][1]);
EXPECT_EQ(47u, orig_pixels[0][2]);
EXPECT_EQ(128u, orig_pixels[0][3]);
@@ -518,28 +608,28 @@ TEST_F(libyuvTest, TestARGBColorMatrix) {
EXPECT_EQ(175u, orig_pixels[1][1]);
EXPECT_EQ(195u, orig_pixels[1][2]);
EXPECT_EQ(0u, orig_pixels[1][3]);
- EXPECT_EQ(69u, orig_pixels[2][0]);
- EXPECT_EQ(89u, orig_pixels[2][1]);
+ EXPECT_EQ(67u, orig_pixels[2][0]);
+ EXPECT_EQ(87u, orig_pixels[2][1]);
EXPECT_EQ(99u, orig_pixels[2][2]);
EXPECT_EQ(255u, orig_pixels[2][3]);
- EXPECT_EQ(88u, orig_pixels[3][0]);
- EXPECT_EQ(114u, orig_pixels[3][1]);
+ EXPECT_EQ(87u, orig_pixels[3][0]);
+ EXPECT_EQ(112u, orig_pixels[3][1]);
EXPECT_EQ(127u, orig_pixels[3][2]);
EXPECT_EQ(224u, orig_pixels[3][3]);
- for (int i = 0; i < 256; ++i) {
+ for (int i = 0; i < 1280; ++i) {
orig_pixels[i][0] = i;
orig_pixels[i][1] = i / 2;
orig_pixels[i][2] = i / 3;
orig_pixels[i][3] = i;
}
- for (int i = 0; i < benchmark_pixels_div256_; ++i) {
- ARGBColorMatrix(&orig_pixels[0][0], 0, &kARGBToSepia[0], 0, 0, 256, 1);
+ for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
+ RGBColorMatrix(&orig_pixels[0][0], 0, &kRGBToSepia[0], 0, 0, 1280, 1);
}
}
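Note: both color-matrix tests above fit a 6-bit fixed-point convention in which a coefficient of 64 stands for 1.0: each output channel is the dot product of the BGRA pixel with one matrix row, shifted right by 6 and clamped (blue 255 against the halved sepia row {8, 34, 17, 0} gives 2040 >> 6 = 31, matching the expectation). A per-pixel sketch under that assumption, with an illustrative helper name:

    #include <stdint.h>

    static uint8_t Clamp255(int v) {
      return static_cast<uint8_t>(v < 0 ? 0 : (v > 255 ? 255 : v));
    }

    // m is 4 rows of 4 signed coefficients; 64 represents 1.0 (hence >> 6).
    static void ColorMatrixPixel(const uint8_t bgra[4], const int8_t m[16],
                                 uint8_t out[4]) {
      for (int ch = 0; ch < 4; ++ch) {
        const int8_t* row = &m[ch * 4];
        const int sum = bgra[0] * row[0] + bgra[1] * row[1] +
                        bgra[2] * row[2] + bgra[3] * row[3];
        out[ch] = Clamp255(sum >> 6);
      }
    }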
TEST_F(libyuvTest, TestARGBColorTable) {
- SIMD_ALIGNED(uint8 orig_pixels[256][4]);
+ SIMD_ALIGNED(uint8 orig_pixels[1280][4]);
memset(orig_pixels, 0, sizeof(orig_pixels));
// Matrix for Sepia.
@@ -585,67 +675,127 @@ TEST_F(libyuvTest, TestARGBColorTable) {
EXPECT_EQ(11u, orig_pixels[3][2]);
EXPECT_EQ(16u, orig_pixels[3][3]);
- for (int i = 0; i < 256; ++i) {
+ for (int i = 0; i < 1280; ++i) {
+ orig_pixels[i][0] = i;
+ orig_pixels[i][1] = i / 2;
+ orig_pixels[i][2] = i / 3;
+ orig_pixels[i][3] = i;
+ }
+ for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
+ ARGBColorTable(&orig_pixels[0][0], 0, &kARGBTable[0], 0, 0, 1280, 1);
+ }
+}
+
+// Same as TestARGBColorTable except alpha does not change.
+TEST_F(libyuvTest, TestRGBColorTable) {
+ SIMD_ALIGNED(uint8 orig_pixels[1280][4]);
+ memset(orig_pixels, 0, sizeof(orig_pixels));
+
+  // Color table; only the first 16 of the 256 * 4 entries are non-zero.
+ static const uint8 kARGBTable[256 * 4] = {
+ 1u, 2u, 3u, 4u,
+ 5u, 6u, 7u, 8u,
+ 9u, 10u, 11u, 12u,
+ 13u, 14u, 15u, 16u,
+ };
+
+ orig_pixels[0][0] = 0u;
+ orig_pixels[0][1] = 0u;
+ orig_pixels[0][2] = 0u;
+ orig_pixels[0][3] = 0u;
+ orig_pixels[1][0] = 1u;
+ orig_pixels[1][1] = 1u;
+ orig_pixels[1][2] = 1u;
+ orig_pixels[1][3] = 1u;
+ orig_pixels[2][0] = 2u;
+ orig_pixels[2][1] = 2u;
+ orig_pixels[2][2] = 2u;
+ orig_pixels[2][3] = 2u;
+ orig_pixels[3][0] = 0u;
+ orig_pixels[3][1] = 1u;
+ orig_pixels[3][2] = 2u;
+ orig_pixels[3][3] = 3u;
+ // Do 16 to test asm version.
+ RGBColorTable(&orig_pixels[0][0], 0, &kARGBTable[0], 0, 0, 16, 1);
+ EXPECT_EQ(1u, orig_pixels[0][0]);
+ EXPECT_EQ(2u, orig_pixels[0][1]);
+ EXPECT_EQ(3u, orig_pixels[0][2]);
+ EXPECT_EQ(0u, orig_pixels[0][3]); // Alpha unchanged.
+ EXPECT_EQ(5u, orig_pixels[1][0]);
+ EXPECT_EQ(6u, orig_pixels[1][1]);
+ EXPECT_EQ(7u, orig_pixels[1][2]);
+ EXPECT_EQ(1u, orig_pixels[1][3]); // Alpha unchanged.
+ EXPECT_EQ(9u, orig_pixels[2][0]);
+ EXPECT_EQ(10u, orig_pixels[2][1]);
+ EXPECT_EQ(11u, orig_pixels[2][2]);
+ EXPECT_EQ(2u, orig_pixels[2][3]); // Alpha unchanged.
+ EXPECT_EQ(1u, orig_pixels[3][0]);
+ EXPECT_EQ(6u, orig_pixels[3][1]);
+ EXPECT_EQ(11u, orig_pixels[3][2]);
+ EXPECT_EQ(3u, orig_pixels[3][3]); // Alpha unchanged.
+
+ for (int i = 0; i < 1280; ++i) {
orig_pixels[i][0] = i;
orig_pixels[i][1] = i / 2;
orig_pixels[i][2] = i / 3;
orig_pixels[i][3] = i;
}
- for (int i = 0; i < benchmark_pixels_div256_; ++i) {
- ARGBColorTable(&orig_pixels[0][0], 0, &kARGBTable[0], 0, 0, 256, 1);
+ for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
+ RGBColorTable(&orig_pixels[0][0], 0, &kARGBTable[0], 0, 0, 1280, 1);
}
}
TEST_F(libyuvTest, TestARGBQuantize) {
- SIMD_ALIGNED(uint8 orig_pixels[256][4]);
+ SIMD_ALIGNED(uint8 orig_pixels[1280][4]);
- for (int i = 0; i < 256; ++i) {
+ for (int i = 0; i < 1280; ++i) {
orig_pixels[i][0] = i;
orig_pixels[i][1] = i / 2;
orig_pixels[i][2] = i / 3;
orig_pixels[i][3] = i;
}
ARGBQuantize(&orig_pixels[0][0], 0,
- (65536 + (8 / 2)) / 8, 8, 8 / 2, 0, 0, 256, 1);
+ (65536 + (8 / 2)) / 8, 8, 8 / 2, 0, 0, 1280, 1);
- for (int i = 0; i < 256; ++i) {
- EXPECT_EQ(i / 8 * 8 + 8 / 2, orig_pixels[i][0]);
- EXPECT_EQ(i / 2 / 8 * 8 + 8 / 2, orig_pixels[i][1]);
- EXPECT_EQ(i / 3 / 8 * 8 + 8 / 2, orig_pixels[i][2]);
- EXPECT_EQ(i, orig_pixels[i][3]);
+ for (int i = 0; i < 1280; ++i) {
+ EXPECT_EQ((i / 8 * 8 + 8 / 2) & 255, orig_pixels[i][0]);
+ EXPECT_EQ((i / 2 / 8 * 8 + 8 / 2) & 255, orig_pixels[i][1]);
+ EXPECT_EQ((i / 3 / 8 * 8 + 8 / 2) & 255, orig_pixels[i][2]);
+ EXPECT_EQ(i & 255, orig_pixels[i][3]);
}
- for (int i = 0; i < benchmark_pixels_div256_; ++i) {
+ for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
ARGBQuantize(&orig_pixels[0][0], 0,
- (65536 + (8 / 2)) / 8, 8, 8 / 2, 0, 0, 256, 1);
+ (65536 + (8 / 2)) / 8, 8, 8 / 2, 0, 0, 1280, 1);
}
}
TEST_F(libyuvTest, TestARGBMirror) {
- SIMD_ALIGNED(uint8 orig_pixels[256][4]);
- SIMD_ALIGNED(uint8 dst_pixels[256][4]);
+ SIMD_ALIGNED(uint8 orig_pixels[1280][4]);
+ SIMD_ALIGNED(uint8 dst_pixels[1280][4]);
- for (int i = 0; i < 256; ++i) {
+ for (int i = 0; i < 1280; ++i) {
orig_pixels[i][0] = i;
orig_pixels[i][1] = i / 2;
orig_pixels[i][2] = i / 3;
orig_pixels[i][3] = i / 4;
}
- ARGBMirror(&orig_pixels[0][0], 0, &dst_pixels[0][0], 0, 256, 1);
+ ARGBMirror(&orig_pixels[0][0], 0, &dst_pixels[0][0], 0, 1280, 1);
- for (int i = 0; i < 256; ++i) {
- EXPECT_EQ(i, dst_pixels[255 - i][0]);
- EXPECT_EQ(i / 2, dst_pixels[255 - i][1]);
- EXPECT_EQ(i / 3, dst_pixels[255 - i][2]);
- EXPECT_EQ(i / 4, dst_pixels[255 - i][3]);
+ for (int i = 0; i < 1280; ++i) {
+ EXPECT_EQ(i & 255, dst_pixels[1280 - 1 - i][0]);
+ EXPECT_EQ((i / 2) & 255, dst_pixels[1280 - 1 - i][1]);
+ EXPECT_EQ((i / 3) & 255, dst_pixels[1280 - 1 - i][2]);
+ EXPECT_EQ((i / 4) & 255, dst_pixels[1280 - 1 - i][3]);
}
- for (int i = 0; i < benchmark_pixels_div256_; ++i) {
- ARGBMirror(&orig_pixels[0][0], 0, &dst_pixels[0][0], 0, 256, 1);
+ for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
+ ARGBMirror(&orig_pixels[0][0], 0, &dst_pixels[0][0], 0, 1280, 1);
}
}
TEST_F(libyuvTest, TestShade) {
- SIMD_ALIGNED(uint8 orig_pixels[256][4]);
- SIMD_ALIGNED(uint8 shade_pixels[256][4]);
+ SIMD_ALIGNED(uint8 orig_pixels[1280][4]);
+ SIMD_ALIGNED(uint8 shade_pixels[1280][4]);
+ memset(orig_pixels, 0, sizeof(orig_pixels));
orig_pixels[0][0] = 10u;
orig_pixels[0][1] = 20u;
@@ -694,16 +844,18 @@ TEST_F(libyuvTest, TestShade) {
EXPECT_EQ(5u, shade_pixels[0][2]);
EXPECT_EQ(5u, shade_pixels[0][3]);
- for (int i = 0; i < benchmark_pixels_div256_; ++i) {
- ARGBShade(&orig_pixels[0][0], 0, &shade_pixels[0][0], 0, 256, 1,
+ for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
+ ARGBShade(&orig_pixels[0][0], 0, &shade_pixels[0][0], 0, 1280, 1,
0x80808080);
}
}
TEST_F(libyuvTest, TestInterpolate) {
- SIMD_ALIGNED(uint8 orig_pixels_0[256][4]);
- SIMD_ALIGNED(uint8 orig_pixels_1[256][4]);
- SIMD_ALIGNED(uint8 interpolate_pixels[256][4]);
+ SIMD_ALIGNED(uint8 orig_pixels_0[1280][4]);
+ SIMD_ALIGNED(uint8 orig_pixels_1[1280][4]);
+ SIMD_ALIGNED(uint8 interpolate_pixels[1280][4]);
+ memset(orig_pixels_0, 0, sizeof(orig_pixels_0));
+ memset(orig_pixels_1, 0, sizeof(orig_pixels_1));
orig_pixels_0[0][0] = 16u;
orig_pixels_0[0][1] = 32u;
@@ -773,9 +925,9 @@ TEST_F(libyuvTest, TestInterpolate) {
EXPECT_EQ(16u, interpolate_pixels[0][2]);
EXPECT_EQ(32u, interpolate_pixels[0][3]);
- for (int i = 0; i < benchmark_pixels_div256_; ++i) {
+ for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
ARGBInterpolate(&orig_pixels_0[0][0], 0, &orig_pixels_1[0][0], 0,
- &interpolate_pixels[0][0], 0, 256, 1, 128);
+ &interpolate_pixels[0][0], 0, 1280, 1, 128);
}
}
@@ -841,7 +993,6 @@ TESTINTERPOLATE(64)
TESTINTERPOLATE(128)
TESTINTERPOLATE(192)
TESTINTERPOLATE(255)
-TESTINTERPOLATE(85)
static int TestBlend(int width, int height, int benchmark_iterations,
int invert, int off) {
@@ -919,10 +1070,10 @@ TEST_F(libyuvTest, ARGBBlend_Opt) {
}
TEST_F(libyuvTest, TestAffine) {
- SIMD_ALIGNED(uint8 orig_pixels_0[256][4]);
- SIMD_ALIGNED(uint8 interpolate_pixels_C[256][4]);
+ SIMD_ALIGNED(uint8 orig_pixels_0[1280][4]);
+ SIMD_ALIGNED(uint8 interpolate_pixels_C[1280][4]);
- for (int i = 0; i < 256; ++i) {
+ for (int i = 0; i < 1280; ++i) {
for (int j = 0; j < 4; ++j) {
orig_pixels_0[i][j] = i;
}
@@ -931,42 +1082,42 @@ TEST_F(libyuvTest, TestAffine) {
float uv_step[4] = { 0.f, 0.f, 0.75f, 0.f };
ARGBAffineRow_C(&orig_pixels_0[0][0], 0, &interpolate_pixels_C[0][0],
- uv_step, 256);
+ uv_step, 1280);
EXPECT_EQ(0u, interpolate_pixels_C[0][0]);
EXPECT_EQ(96u, interpolate_pixels_C[128][0]);
EXPECT_EQ(191u, interpolate_pixels_C[255][3]);
#if defined(HAS_ARGBAFFINEROW_SSE2)
- SIMD_ALIGNED(uint8 interpolate_pixels_Opt[256][4]);
+ SIMD_ALIGNED(uint8 interpolate_pixels_Opt[1280][4]);
ARGBAffineRow_SSE2(&orig_pixels_0[0][0], 0, &interpolate_pixels_Opt[0][0],
- uv_step, 256);
- EXPECT_EQ(0, memcmp(interpolate_pixels_Opt, interpolate_pixels_C, 256 * 4));
+ uv_step, 1280);
+ EXPECT_EQ(0, memcmp(interpolate_pixels_Opt, interpolate_pixels_C, 1280 * 4));
int has_sse2 = TestCpuFlag(kCpuHasSSE2);
if (has_sse2) {
- for (int i = 0; i < benchmark_pixels_div256_; ++i) {
+ for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
ARGBAffineRow_SSE2(&orig_pixels_0[0][0], 0, &interpolate_pixels_Opt[0][0],
- uv_step, 256);
+ uv_step, 1280);
}
}
#endif
}
TEST_F(libyuvTest, TestSobelX) {
- SIMD_ALIGNED(uint8 orig_pixels_0[256 + 2]);
- SIMD_ALIGNED(uint8 orig_pixels_1[256 + 2]);
- SIMD_ALIGNED(uint8 orig_pixels_2[256 + 2]);
- SIMD_ALIGNED(uint8 sobel_pixels_c[256]);
- SIMD_ALIGNED(uint8 sobel_pixels_opt[256]);
+ SIMD_ALIGNED(uint8 orig_pixels_0[1280 + 2]);
+ SIMD_ALIGNED(uint8 orig_pixels_1[1280 + 2]);
+ SIMD_ALIGNED(uint8 orig_pixels_2[1280 + 2]);
+ SIMD_ALIGNED(uint8 sobel_pixels_c[1280]);
+ SIMD_ALIGNED(uint8 sobel_pixels_opt[1280]);
- for (int i = 0; i < 256 + 2; ++i) {
+ for (int i = 0; i < 1280 + 2; ++i) {
orig_pixels_0[i] = i;
orig_pixels_1[i] = i * 2;
orig_pixels_2[i] = i * 3;
}
SobelXRow_C(orig_pixels_0, orig_pixels_1, orig_pixels_2,
- sobel_pixels_c, 256);
+ sobel_pixels_c, 1280);
EXPECT_EQ(16u, sobel_pixels_c[0]);
EXPECT_EQ(16u, sobel_pixels_c[100]);
@@ -975,9 +1126,9 @@ TEST_F(libyuvTest, TestSobelX) {
void (*SobelXRow)(const uint8* src_y0, const uint8* src_y1,
const uint8* src_y2, uint8* dst_sobely, int width) =
SobelXRow_C;
-#if defined(HAS_SOBELXROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- SobelXRow = SobelXRow_SSSE3;
+#if defined(HAS_SOBELXROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ SobelXRow = SobelXRow_SSE2;
}
#endif
#if defined(HAS_SOBELXROW_NEON)
@@ -985,36 +1136,36 @@ TEST_F(libyuvTest, TestSobelX) {
SobelXRow = SobelXRow_NEON;
}
#endif
- for (int i = 0; i < benchmark_pixels_div256_; ++i) {
+ for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
SobelXRow(orig_pixels_0, orig_pixels_1, orig_pixels_2,
- sobel_pixels_opt, 256);
+ sobel_pixels_opt, 1280);
}
- for (int i = 0; i < 256; ++i) {
- EXPECT_EQ(sobel_pixels_opt[i], sobel_pixels_c[i]);
+ for (int i = 0; i < 1280; ++i) {
+ EXPECT_EQ(sobel_pixels_c[i], sobel_pixels_opt[i]);
}
}
TEST_F(libyuvTest, TestSobelY) {
- SIMD_ALIGNED(uint8 orig_pixels_0[256 + 2]);
- SIMD_ALIGNED(uint8 orig_pixels_1[256 + 2]);
- SIMD_ALIGNED(uint8 sobel_pixels_c[256]);
- SIMD_ALIGNED(uint8 sobel_pixels_opt[256]);
+ SIMD_ALIGNED(uint8 orig_pixels_0[1280 + 2]);
+ SIMD_ALIGNED(uint8 orig_pixels_1[1280 + 2]);
+ SIMD_ALIGNED(uint8 sobel_pixels_c[1280]);
+ SIMD_ALIGNED(uint8 sobel_pixels_opt[1280]);
- for (int i = 0; i < 256 + 2; ++i) {
+ for (int i = 0; i < 1280 + 2; ++i) {
orig_pixels_0[i] = i;
orig_pixels_1[i] = i * 2;
}
- SobelYRow_C(orig_pixels_0, orig_pixels_1, sobel_pixels_c, 256);
+ SobelYRow_C(orig_pixels_0, orig_pixels_1, sobel_pixels_c, 1280);
EXPECT_EQ(4u, sobel_pixels_c[0]);
EXPECT_EQ(255u, sobel_pixels_c[100]);
EXPECT_EQ(0u, sobel_pixels_c[255]);
void (*SobelYRow)(const uint8* src_y0, const uint8* src_y1,
uint8* dst_sobely, int width) = SobelYRow_C;
-#if defined(HAS_SOBELYROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3)) {
- SobelYRow = SobelYRow_SSSE3;
+#if defined(HAS_SOBELYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ SobelYRow = SobelYRow_SSE2;
}
#endif
#if defined(HAS_SOBELYROW_NEON)
@@ -1022,26 +1173,26 @@ TEST_F(libyuvTest, TestSobelY) {
SobelYRow = SobelYRow_NEON;
}
#endif
- for (int i = 0; i < benchmark_pixels_div256_; ++i) {
- SobelYRow(orig_pixels_0, orig_pixels_1, sobel_pixels_opt, 256);
+ for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
+ SobelYRow(orig_pixels_0, orig_pixels_1, sobel_pixels_opt, 1280);
}
- for (int i = 0; i < 256; ++i) {
- EXPECT_EQ(sobel_pixels_opt[i], sobel_pixels_c[i]);
+ for (int i = 0; i < 1280; ++i) {
+ EXPECT_EQ(sobel_pixels_c[i], sobel_pixels_opt[i]);
}
}
TEST_F(libyuvTest, TestSobel) {
- SIMD_ALIGNED(uint8 orig_sobelx[256]);
- SIMD_ALIGNED(uint8 orig_sobely[256]);
- SIMD_ALIGNED(uint8 sobel_pixels_c[256 * 4]);
- SIMD_ALIGNED(uint8 sobel_pixels_opt[256 * 4]);
+ SIMD_ALIGNED(uint8 orig_sobelx[1280]);
+ SIMD_ALIGNED(uint8 orig_sobely[1280]);
+ SIMD_ALIGNED(uint8 sobel_pixels_c[1280 * 4]);
+ SIMD_ALIGNED(uint8 sobel_pixels_opt[1280 * 4]);
- for (int i = 0; i < 256; ++i) {
+ for (int i = 0; i < 1280; ++i) {
orig_sobelx[i] = i;
orig_sobely[i] = i * 2;
}
- SobelRow_C(orig_sobelx, orig_sobely, sobel_pixels_c, 256);
+ SobelRow_C(orig_sobelx, orig_sobely, sobel_pixels_c, 1280);
EXPECT_EQ(0u, sobel_pixels_c[0]);
EXPECT_EQ(3u, sobel_pixels_c[4]);
@@ -1066,26 +1217,64 @@ TEST_F(libyuvTest, TestSobel) {
SobelRow = SobelRow_NEON;
}
#endif
- for (int i = 0; i < benchmark_pixels_div256_; ++i) {
- SobelRow(orig_sobelx, orig_sobely, sobel_pixels_opt, 256);
+ for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
+ SobelRow(orig_sobelx, orig_sobely, sobel_pixels_opt, 1280);
}
- for (int i = 0; i < 16; ++i) {
- EXPECT_EQ(sobel_pixels_opt[i], sobel_pixels_c[i]);
+ for (int i = 0; i < 1280 * 4; ++i) {
+ EXPECT_EQ(sobel_pixels_c[i], sobel_pixels_opt[i]);
+ }
+}
+
+TEST_F(libyuvTest, TestSobelToPlane) {
+ SIMD_ALIGNED(uint8 orig_sobelx[1280]);
+ SIMD_ALIGNED(uint8 orig_sobely[1280]);
+ SIMD_ALIGNED(uint8 sobel_pixels_c[1280]);
+ SIMD_ALIGNED(uint8 sobel_pixels_opt[1280]);
+
+ for (int i = 0; i < 1280; ++i) {
+ orig_sobelx[i] = i;
+ orig_sobely[i] = i * 2;
+ }
+
+ SobelToPlaneRow_C(orig_sobelx, orig_sobely, sobel_pixels_c, 1280);
+
+ EXPECT_EQ(0u, sobel_pixels_c[0]);
+ EXPECT_EQ(3u, sobel_pixels_c[1]);
+ EXPECT_EQ(6u, sobel_pixels_c[2]);
+ EXPECT_EQ(99u, sobel_pixels_c[33]);
+ EXPECT_EQ(255u, sobel_pixels_c[100]);
+ void (*SobelToPlaneRow)(const uint8* src_sobelx, const uint8* src_sobely,
+ uint8* dst_y, int width) = SobelToPlaneRow_C;
+#if defined(HAS_SOBELTOPLANEROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ SobelToPlaneRow = SobelToPlaneRow_SSE2;
+ }
+#endif
+#if defined(HAS_SOBELTOPLANEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ SobelToPlaneRow = SobelToPlaneRow_NEON;
+ }
+#endif
+ for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
+ SobelToPlaneRow(orig_sobelx, orig_sobely, sobel_pixels_opt, 1280);
+ }
+ for (int i = 0; i < 1280; ++i) {
+ EXPECT_EQ(sobel_pixels_c[i], sobel_pixels_opt[i]);
}
}
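Note: the Sobel row expectations above are all consistent with one per-pixel rule: the two gradient planes are added and clamped to 255 (1 + 2 = 3, 33 + 66 = 99, 100 + 200 clamps to 255). SobelRow then replicates that value into B, G and R with alpha 255, and SobelXYRow packs Y gradient, sum and X gradient into B, G and R. A sketch of the plane variant, with an illustrative name:

    #include <stdint.h>

    // dst[i] = min(255, sobelx[i] + sobely[i]) for a single row.
    static void SobelToPlaneRowSketch(const uint8_t* sobelx,
                                      const uint8_t* sobely,
                                      uint8_t* dst, int width) {
      for (int i = 0; i < width; ++i) {
        const int s = sobelx[i] + sobely[i];
        dst[i] = static_cast<uint8_t>(s > 255 ? 255 : s);
      }
    }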
TEST_F(libyuvTest, TestSobelXY) {
- SIMD_ALIGNED(uint8 orig_sobelx[256]);
- SIMD_ALIGNED(uint8 orig_sobely[256]);
- SIMD_ALIGNED(uint8 sobel_pixels_c[256 * 4]);
- SIMD_ALIGNED(uint8 sobel_pixels_opt[256 * 4]);
+ SIMD_ALIGNED(uint8 orig_sobelx[1280]);
+ SIMD_ALIGNED(uint8 orig_sobely[1280]);
+ SIMD_ALIGNED(uint8 sobel_pixels_c[1280 * 4]);
+ SIMD_ALIGNED(uint8 sobel_pixels_opt[1280 * 4]);
- for (int i = 0; i < 256; ++i) {
+ for (int i = 0; i < 1280; ++i) {
orig_sobelx[i] = i;
orig_sobely[i] = i * 2;
}
- SobelXYRow_C(orig_sobelx, orig_sobely, sobel_pixels_c, 256);
+ SobelXYRow_C(orig_sobelx, orig_sobely, sobel_pixels_c, 1280);
EXPECT_EQ(0u, sobel_pixels_c[0]);
EXPECT_EQ(2u, sobel_pixels_c[4]);
@@ -1106,11 +1295,11 @@ TEST_F(libyuvTest, TestSobelXY) {
SobelXYRow = SobelXYRow_NEON;
}
#endif
- for (int i = 0; i < benchmark_pixels_div256_; ++i) {
- SobelXYRow(orig_sobelx, orig_sobely, sobel_pixels_opt, 256);
+ for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
+ SobelXYRow(orig_sobelx, orig_sobely, sobel_pixels_opt, 1280);
}
- for (int i = 0; i < 16; ++i) {
- EXPECT_EQ(sobel_pixels_opt[i], sobel_pixels_c[i]);
+ for (int i = 0; i < 1280 * 4; ++i) {
+ EXPECT_EQ(sobel_pixels_c[i], sobel_pixels_opt[i]);
}
}
@@ -1165,8 +1354,6 @@ TEST_F(libyuvTest, TestCopyPlane) {
CopyPlane(orig_y + y_off, y_st, dst_opt + y_off, stride, yw, yh);
}
opt_time = (get_time() - opt_time) / benchmark_iterations_;
- printf(" %8d us C - %8d us OPT\n",
- static_cast<int>(c_time * 1e6), static_cast<int>(opt_time * 1e6));
for (i = 0; i < y_plane_size; ++i) {
if (dst_c[i] != dst_opt[i])
@@ -1403,6 +1590,7 @@ static int TestSobel(int width, int height, int benchmark_iterations,
align_buffer_64(src_argb_a, kStride * height + off);
align_buffer_64(dst_argb_c, kStride * height);
align_buffer_64(dst_argb_opt, kStride * height);
+ memset(src_argb_a, 0, kStride * height + off);
srandom(time(NULL));
for (int i = 0; i < kStride * height; ++i) {
src_argb_a[i + off] = (random() & 0xff);
@@ -1459,6 +1647,75 @@ TEST_F(libyuvTest, ARGBSobel_Opt) {
EXPECT_EQ(0, max_diff);
}
+static int TestSobelToPlane(int width, int height, int benchmark_iterations,
+ int invert, int off) {
+ if (width < 1) {
+ width = 1;
+ }
+ const int kSrcBpp = 4;
+ const int kDstBpp = 1;
+ const int kSrcStride = (width * kSrcBpp + 15) & ~15;
+ const int kDstStride = (width * kDstBpp + 15) & ~15;
+ align_buffer_64(src_argb_a, kSrcStride * height + off);
+ align_buffer_64(dst_argb_c, kDstStride * height);
+ align_buffer_64(dst_argb_opt, kDstStride * height);
+ memset(src_argb_a, 0, kSrcStride * height + off);
+ srandom(time(NULL));
+ for (int i = 0; i < kSrcStride * height; ++i) {
+ src_argb_a[i + off] = (random() & 0xff);
+ }
+ memset(dst_argb_c, 0, kDstStride * height);
+ memset(dst_argb_opt, 0, kDstStride * height);
+
+ MaskCpuFlags(0);
+ ARGBSobelToPlane(src_argb_a + off, kSrcStride,
+ dst_argb_c, kDstStride,
+ width, invert * height);
+ MaskCpuFlags(-1);
+ for (int i = 0; i < benchmark_iterations; ++i) {
+ ARGBSobelToPlane(src_argb_a + off, kSrcStride,
+ dst_argb_opt, kDstStride,
+ width, invert * height);
+ }
+ int max_diff = 0;
+ for (int i = 0; i < kDstStride * height; ++i) {
+ int abs_diff =
+ abs(static_cast<int>(dst_argb_c[i]) -
+ static_cast<int>(dst_argb_opt[i]));
+ if (abs_diff > max_diff) {
+ max_diff = abs_diff;
+ }
+ }
+ free_aligned_buffer_64(src_argb_a)
+ free_aligned_buffer_64(dst_argb_c)
+ free_aligned_buffer_64(dst_argb_opt)
+ return max_diff;
+}
+
+TEST_F(libyuvTest, ARGBSobelToPlane_Any) {
+ int max_diff = TestSobelToPlane(benchmark_width_ - 1, benchmark_height_,
+ benchmark_iterations_, +1, 0);
+ EXPECT_EQ(0, max_diff);
+}
+
+TEST_F(libyuvTest, ARGBSobelToPlane_Unaligned) {
+ int max_diff = TestSobelToPlane(benchmark_width_, benchmark_height_,
+ benchmark_iterations_, +1, 1);
+ EXPECT_EQ(0, max_diff);
+}
+
+TEST_F(libyuvTest, ARGBSobelToPlane_Invert) {
+ int max_diff = TestSobelToPlane(benchmark_width_, benchmark_height_,
+ benchmark_iterations_, -1, 0);
+ EXPECT_EQ(0, max_diff);
+}
+
+TEST_F(libyuvTest, ARGBSobelToPlane_Opt) {
+ int max_diff = TestSobelToPlane(benchmark_width_, benchmark_height_,
+ benchmark_iterations_, +1, 0);
+ EXPECT_EQ(0, max_diff);
+}
+
static int TestSobelXY(int width, int height, int benchmark_iterations,
int invert, int off) {
if (width < 1) {
@@ -1469,6 +1726,7 @@ static int TestSobelXY(int width, int height, int benchmark_iterations,
align_buffer_64(src_argb_a, kStride * height + off);
align_buffer_64(dst_argb_c, kStride * height);
align_buffer_64(dst_argb_opt, kStride * height);
+ memset(src_argb_a, 0, kStride * height + off);
srandom(time(NULL));
for (int i = 0; i < kStride * height; ++i) {
src_argb_a[i + off] = (random() & 0xff);
@@ -1525,4 +1783,326 @@ TEST_F(libyuvTest, ARGBSobelXY_Opt) {
EXPECT_EQ(0, max_diff);
}
+static int TestBlur(int width, int height, int benchmark_iterations,
+ int invert, int off, int radius) {
+ if (width < 1) {
+ width = 1;
+ }
+ const int kBpp = 4;
+ const int kStride = (width * kBpp + 15) & ~15;
+ align_buffer_64(src_argb_a, kStride * height + off);
+ align_buffer_64(dst_cumsum, width * height * 16);
+ align_buffer_64(dst_argb_c, kStride * height);
+ align_buffer_64(dst_argb_opt, kStride * height);
+ srandom(time(NULL));
+ for (int i = 0; i < kStride * height; ++i) {
+ src_argb_a[i + off] = (random() & 0xff);
+ }
+ memset(dst_cumsum, 0, width * height * 16);
+ memset(dst_argb_c, 0, kStride * height);
+ memset(dst_argb_opt, 0, kStride * height);
+
+ MaskCpuFlags(0);
+ ARGBBlur(src_argb_a + off, kStride,
+ dst_argb_c, kStride,
+ reinterpret_cast<int32*>(dst_cumsum), width * 4,
+ width, invert * height, radius);
+ MaskCpuFlags(-1);
+ for (int i = 0; i < benchmark_iterations; ++i) {
+ ARGBBlur(src_argb_a + off, kStride,
+ dst_argb_opt, kStride,
+ reinterpret_cast<int32*>(dst_cumsum), width * 4,
+ width, invert * height, radius);
+ }
+ int max_diff = 0;
+ for (int i = 0; i < kStride * height; ++i) {
+ int abs_diff =
+ abs(static_cast<int>(dst_argb_c[i]) -
+ static_cast<int>(dst_argb_opt[i]));
+ if (abs_diff > max_diff) {
+ max_diff = abs_diff;
+ }
+ }
+ free_aligned_buffer_64(src_argb_a)
+ free_aligned_buffer_64(dst_cumsum)
+ free_aligned_buffer_64(dst_argb_c)
+ free_aligned_buffer_64(dst_argb_opt)
+ return max_diff;
+}
+
+static const int kBlurSize = 55;
+TEST_F(libyuvTest, ARGBBlur_Any) {
+ int max_diff = TestBlur(benchmark_width_ - 1, benchmark_height_,
+ benchmark_iterations_, +1, 0, kBlurSize);
+ EXPECT_LE(max_diff, 1);
+}
+
+TEST_F(libyuvTest, ARGBBlur_Unaligned) {
+ int max_diff = TestBlur(benchmark_width_, benchmark_height_,
+ benchmark_iterations_, +1, 1, kBlurSize);
+ EXPECT_LE(max_diff, 1);
+}
+
+TEST_F(libyuvTest, ARGBBlur_Invert) {
+ int max_diff = TestBlur(benchmark_width_, benchmark_height_,
+ benchmark_iterations_, -1, 0, kBlurSize);
+ EXPECT_LE(max_diff, 1);
+}
+
+TEST_F(libyuvTest, ARGBBlur_Opt) {
+ int max_diff = TestBlur(benchmark_width_, benchmark_height_,
+ benchmark_iterations_, +1, 0, kBlurSize);
+ EXPECT_LE(max_diff, 1);
+}
+
+static const int kBlurSmallSize = 5;
+TEST_F(libyuvTest, ARGBBlurSmall_Any) {
+ int max_diff = TestBlur(benchmark_width_ - 1, benchmark_height_,
+ benchmark_iterations_, +1, 0, kBlurSmallSize);
+ EXPECT_LE(max_diff, 1);
+}
+
+TEST_F(libyuvTest, ARGBBlurSmall_Unaligned) {
+ int max_diff = TestBlur(benchmark_width_, benchmark_height_,
+ benchmark_iterations_, +1, 1, kBlurSmallSize);
+ EXPECT_LE(max_diff, 1);
+}
+
+TEST_F(libyuvTest, ARGBBlurSmall_Invert) {
+ int max_diff = TestBlur(benchmark_width_, benchmark_height_,
+ benchmark_iterations_, -1, 0, kBlurSmallSize);
+ EXPECT_LE(max_diff, 1);
+}
+
+TEST_F(libyuvTest, ARGBBlurSmall_Opt) {
+ int max_diff = TestBlur(benchmark_width_, benchmark_height_,
+ benchmark_iterations_, +1, 0, kBlurSmallSize);
+ EXPECT_LE(max_diff, 1);
+}
+
+TEST_F(libyuvTest, TestARGBPolynomial) {
+ SIMD_ALIGNED(uint8 orig_pixels[1280][4]);
+ SIMD_ALIGNED(uint8 dst_pixels_opt[1280][4]);
+ SIMD_ALIGNED(uint8 dst_pixels_c[1280][4]);
+ memset(orig_pixels, 0, sizeof(orig_pixels));
+
+ SIMD_ALIGNED(static const float kWarmifyPolynomial[16]) = {
+ 0.94230f, -3.03300f, -2.92500f, 0.f, // C0
+ 0.584500f, 1.112000f, 1.535000f, 1.f, // C1 x
+ 0.001313f, -0.002503f, -0.004496f, 0.f, // C2 x * x
+ 0.0f, 0.000006965f, 0.000008781f, 0.f, // C3 x * x * x
+ };
+
+ // Test blue
+ orig_pixels[0][0] = 255u;
+ orig_pixels[0][1] = 0u;
+ orig_pixels[0][2] = 0u;
+ orig_pixels[0][3] = 128u;
+ // Test green
+ orig_pixels[1][0] = 0u;
+ orig_pixels[1][1] = 255u;
+ orig_pixels[1][2] = 0u;
+ orig_pixels[1][3] = 0u;
+ // Test red
+ orig_pixels[2][0] = 0u;
+ orig_pixels[2][1] = 0u;
+ orig_pixels[2][2] = 255u;
+ orig_pixels[2][3] = 255u;
+ // Test white
+ orig_pixels[3][0] = 255u;
+ orig_pixels[3][1] = 255u;
+ orig_pixels[3][2] = 255u;
+ orig_pixels[3][3] = 255u;
+ // Test color
+ orig_pixels[4][0] = 16u;
+ orig_pixels[4][1] = 64u;
+ orig_pixels[4][2] = 192u;
+ orig_pixels[4][3] = 224u;
+ // Do 16 to test asm version.
+ ARGBPolynomial(&orig_pixels[0][0], 0, &dst_pixels_opt[0][0], 0,
+ &kWarmifyPolynomial[0], 16, 1);
+ EXPECT_EQ(235u, dst_pixels_opt[0][0]);
+ EXPECT_EQ(0u, dst_pixels_opt[0][1]);
+ EXPECT_EQ(0u, dst_pixels_opt[0][2]);
+ EXPECT_EQ(128u, dst_pixels_opt[0][3]);
+ EXPECT_EQ(0u, dst_pixels_opt[1][0]);
+ EXPECT_EQ(233u, dst_pixels_opt[1][1]);
+ EXPECT_EQ(0u, dst_pixels_opt[1][2]);
+ EXPECT_EQ(0u, dst_pixels_opt[1][3]);
+ EXPECT_EQ(0u, dst_pixels_opt[2][0]);
+ EXPECT_EQ(0u, dst_pixels_opt[2][1]);
+ EXPECT_EQ(241u, dst_pixels_opt[2][2]);
+ EXPECT_EQ(255u, dst_pixels_opt[2][3]);
+ EXPECT_EQ(235u, dst_pixels_opt[3][0]);
+ EXPECT_EQ(233u, dst_pixels_opt[3][1]);
+ EXPECT_EQ(241u, dst_pixels_opt[3][2]);
+ EXPECT_EQ(255u, dst_pixels_opt[3][3]);
+ EXPECT_EQ(10u, dst_pixels_opt[4][0]);
+ EXPECT_EQ(59u, dst_pixels_opt[4][1]);
+ EXPECT_EQ(188u, dst_pixels_opt[4][2]);
+ EXPECT_EQ(224u, dst_pixels_opt[4][3]);
+
+ for (int i = 0; i < 1280; ++i) {
+ orig_pixels[i][0] = i;
+ orig_pixels[i][1] = i / 2;
+ orig_pixels[i][2] = i / 3;
+ orig_pixels[i][3] = i;
+ }
+
+ MaskCpuFlags(0);
+ ARGBPolynomial(&orig_pixels[0][0], 0, &dst_pixels_c[0][0], 0,
+ &kWarmifyPolynomial[0], 1280, 1);
+ MaskCpuFlags(-1);
+
+ for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
+ ARGBPolynomial(&orig_pixels[0][0], 0, &dst_pixels_opt[0][0], 0,
+ &kWarmifyPolynomial[0], 1280, 1);
+ }
+
+ for (int i = 0; i < 1280; ++i) {
+ EXPECT_EQ(dst_pixels_c[i][0], dst_pixels_opt[i][0]);
+ EXPECT_EQ(dst_pixels_c[i][1], dst_pixels_opt[i][1]);
+ EXPECT_EQ(dst_pixels_c[i][2], dst_pixels_opt[i][2]);
+ EXPECT_EQ(dst_pixels_c[i][3], dst_pixels_opt[i][3]);
+ }
+}
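Note: the polynomial expectations above are consistent with a per-channel cubic evaluated in float and clamped, out = C0 + C1*x + C2*x^2 + C3*x^3, with the sixteen coefficients laid out one row per power and one column per channel (blue 255: 0.9423 + 0.5845*255 + 0.001313*255^2 is about 235.4, truncating to 235). A single-channel sketch, with an illustrative helper name:

    #include <stdint.h>

    // c = {C0, C1, C2, C3} for one channel, e.g. {poly[0], poly[4], poly[8],
    // poly[12]} for blue with the layout used above.
    static uint8_t PolynomialChannel(const float c[4], uint8_t x) {
      const float xf = static_cast<float>(x);
      float v = c[0] + c[1] * xf + c[2] * xf * xf + c[3] * xf * xf * xf;
      if (v < 0.f) v = 0.f;
      if (v > 255.f) v = 255.f;
      return static_cast<uint8_t>(v);
    }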
+
+TEST_F(libyuvTest, TestARGBLumaColorTable) {
+ SIMD_ALIGNED(uint8 orig_pixels[1280][4]);
+ SIMD_ALIGNED(uint8 dst_pixels_opt[1280][4]);
+ SIMD_ALIGNED(uint8 dst_pixels_c[1280][4]);
+ memset(orig_pixels, 0, sizeof(orig_pixels));
+
+ align_buffer_64(lumacolortable, 32768);
+ int v = 0;
+ for (int i = 0; i < 32768; ++i) {
+ lumacolortable[i] = v;
+ v += 3;
+ }
+ // Test blue
+ orig_pixels[0][0] = 255u;
+ orig_pixels[0][1] = 0u;
+ orig_pixels[0][2] = 0u;
+ orig_pixels[0][3] = 128u;
+ // Test green
+ orig_pixels[1][0] = 0u;
+ orig_pixels[1][1] = 255u;
+ orig_pixels[1][2] = 0u;
+ orig_pixels[1][3] = 0u;
+ // Test red
+ orig_pixels[2][0] = 0u;
+ orig_pixels[2][1] = 0u;
+ orig_pixels[2][2] = 255u;
+ orig_pixels[2][3] = 255u;
+ // Test color
+ orig_pixels[3][0] = 16u;
+ orig_pixels[3][1] = 64u;
+ orig_pixels[3][2] = 192u;
+ orig_pixels[3][3] = 224u;
+ // Do 16 to test asm version.
+ ARGBLumaColorTable(&orig_pixels[0][0], 0, &dst_pixels_opt[0][0], 0,
+ &lumacolortable[0], 16, 1);
+ EXPECT_EQ(253u, dst_pixels_opt[0][0]);
+ EXPECT_EQ(0u, dst_pixels_opt[0][1]);
+ EXPECT_EQ(0u, dst_pixels_opt[0][2]);
+ EXPECT_EQ(128u, dst_pixels_opt[0][3]);
+ EXPECT_EQ(0u, dst_pixels_opt[1][0]);
+ EXPECT_EQ(253u, dst_pixels_opt[1][1]);
+ EXPECT_EQ(0u, dst_pixels_opt[1][2]);
+ EXPECT_EQ(0u, dst_pixels_opt[1][3]);
+ EXPECT_EQ(0u, dst_pixels_opt[2][0]);
+ EXPECT_EQ(0u, dst_pixels_opt[2][1]);
+ EXPECT_EQ(253u, dst_pixels_opt[2][2]);
+ EXPECT_EQ(255u, dst_pixels_opt[2][3]);
+ EXPECT_EQ(48u, dst_pixels_opt[3][0]);
+ EXPECT_EQ(192u, dst_pixels_opt[3][1]);
+ EXPECT_EQ(64u, dst_pixels_opt[3][2]);
+ EXPECT_EQ(224u, dst_pixels_opt[3][3]);
+
+ for (int i = 0; i < 1280; ++i) {
+ orig_pixels[i][0] = i;
+ orig_pixels[i][1] = i / 2;
+ orig_pixels[i][2] = i / 3;
+ orig_pixels[i][3] = i;
+ }
+
+ MaskCpuFlags(0);
+ ARGBLumaColorTable(&orig_pixels[0][0], 0, &dst_pixels_c[0][0], 0,
+ lumacolortable, 1280, 1);
+ MaskCpuFlags(-1);
+
+ for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
+ ARGBLumaColorTable(&orig_pixels[0][0], 0, &dst_pixels_opt[0][0], 0,
+ lumacolortable, 1280, 1);
+ }
+ for (int i = 0; i < 1280; ++i) {
+ EXPECT_EQ(dst_pixels_c[i][0], dst_pixels_opt[i][0]);
+ EXPECT_EQ(dst_pixels_c[i][1], dst_pixels_opt[i][1]);
+ EXPECT_EQ(dst_pixels_c[i][2], dst_pixels_opt[i][2]);
+ EXPECT_EQ(dst_pixels_c[i][3], dst_pixels_opt[i][3]);
+ }
+
+ free_aligned_buffer_64(lumacolortable);
+}
+
+TEST_F(libyuvTest, TestARGBCopyAlpha) {
+ const int kSize = benchmark_width_ * benchmark_height_ * 4;
+ align_buffer_64(orig_pixels, kSize);
+ align_buffer_64(dst_pixels_opt, kSize);
+ align_buffer_64(dst_pixels_c, kSize);
+
+ MemRandomize(orig_pixels, kSize);
+ MemRandomize(dst_pixels_opt, kSize);
+ memcpy(dst_pixels_c, dst_pixels_opt, kSize);
+
+ MaskCpuFlags(0);
+ ARGBCopyAlpha(orig_pixels, benchmark_width_ * 4,
+ dst_pixels_c, benchmark_width_ * 4,
+ benchmark_width_, benchmark_height_);
+ MaskCpuFlags(-1);
+
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ ARGBCopyAlpha(orig_pixels, benchmark_width_ * 4,
+ dst_pixels_opt, benchmark_width_ * 4,
+ benchmark_width_, benchmark_height_);
+ }
+ for (int i = 0; i < kSize; ++i) {
+ EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
+ }
+
+ free_aligned_buffer_64(dst_pixels_c)
+ free_aligned_buffer_64(dst_pixels_opt)
+ free_aligned_buffer_64(orig_pixels)
+}
+
+TEST_F(libyuvTest, TestARGBCopyYToAlpha) {
+ const int kPixels = benchmark_width_ * benchmark_height_;
+ align_buffer_64(orig_pixels, kPixels);
+ align_buffer_64(dst_pixels_opt, kPixels * 4);
+ align_buffer_64(dst_pixels_c, kPixels * 4);
+
+ MemRandomize(orig_pixels, kPixels);
+ MemRandomize(dst_pixels_opt, kPixels * 4);
+ memcpy(dst_pixels_c, dst_pixels_opt, kPixels * 4);
+
+ MaskCpuFlags(0);
+ ARGBCopyYToAlpha(orig_pixels, benchmark_width_,
+ dst_pixels_c, benchmark_width_ * 4,
+ benchmark_width_, benchmark_height_);
+ MaskCpuFlags(-1);
+
+ for (int i = 0; i < benchmark_iterations_; ++i) {
+ ARGBCopyYToAlpha(orig_pixels, benchmark_width_,
+ dst_pixels_opt, benchmark_width_ * 4,
+ benchmark_width_, benchmark_height_);
+ }
+ for (int i = 0; i < kPixels * 4; ++i) {
+ EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
+ }
+
+ free_aligned_buffer_64(dst_pixels_c)
+ free_aligned_buffer_64(dst_pixels_opt)
+ free_aligned_buffer_64(orig_pixels)
+}
+
} // namespace libyuv
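Note: the new tests above all share the same shape: MaskCpuFlags(0) forces libyuv onto its plain C rows to build a reference, MaskCpuFlags(-1) restores full CPU feature detection, and the SIMD output is then compared element by element. A condensed sketch of that harness around ARGBCopyAlpha (RunBothPaths is illustrative, buffer setup and the benchmark loop are omitted, strides assume a packed width * 4 layout, and the header names are assumptions based on this patch):

    #include "libyuv/cpu_id.h"
    #include "libyuv/planar_functions.h"

    // Run the C path into dst_c and the optimized path into dst_opt so the
    // caller can compare the two buffers byte for byte.
    void RunBothPaths(const uint8* src_argb, uint8* dst_c, uint8* dst_opt,
                      int width, int height) {
      MaskCpuFlags(0);   // Report no SIMD features: C reference rows.
      ARGBCopyAlpha(src_argb, width * 4, dst_c, width * 4, width, height);
      MaskCpuFlags(-1);  // Re-enable everything the CPU actually supports.
      ARGBCopyAlpha(src_argb, width * 4, dst_opt, width * 4, width, height);
    }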
diff --git a/chromium/third_party/libyuv/unit_test/scale_argb_test.cc b/chromium/third_party/libyuv/unit_test/scale_argb_test.cc
index 7a4758594a0..ea4d4d14580 100644
--- a/chromium/third_party/libyuv/unit_test/scale_argb_test.cc
+++ b/chromium/third_party/libyuv/unit_test/scale_argb_test.cc
@@ -17,10 +17,6 @@
namespace libyuv {
-static __inline int Abs(int v) {
- return v >= 0 ? v : -v;
-}
-
// Test scaling with C vs Opt and return maximum pixel difference. 0 = exact.
static int ARGBTestFilter(int src_width, int src_height,
int dst_width, int dst_height,
@@ -85,7 +81,7 @@ static int ARGBTestFilter(int src_width, int src_height,
int max_diff = 0;
for (i = b; i < (dst_height + b); ++i) {
for (j = b * 4; j < (dst_width + b) * 4; ++j) {
- int abs_diff = abs(dst_argb_c[(i * dst_stride_argb) + j] -
+ int abs_diff = Abs(dst_argb_c[(i * dst_stride_argb) + j] -
dst_argb_opt[(i * dst_stride_argb) + j]);
if (abs_diff > max_diff) {
max_diff = abs_diff;
@@ -99,8 +95,8 @@ static int ARGBTestFilter(int src_width, int src_height,
return max_diff;
}
-static const int kTileX = 16;
-static const int kTileY = 16;
+static const int kTileX = 8;
+static const int kTileY = 8;
static int TileARGBScale(const uint8* src_argb, int src_stride_argb,
int src_width, int src_height,
@@ -184,7 +180,7 @@ static int ARGBClipTestFilter(int src_width, int src_height,
int max_diff = 0;
for (i = b; i < (dst_height + b); ++i) {
for (j = b * 4; j < (dst_width + b) * 4; ++j) {
- int abs_diff = abs(dst_argb_c[(i * dst_stride_argb) + j] -
+ int abs_diff = Abs(dst_argb_c[(i * dst_stride_argb) + j] -
dst_argb_opt[(i * dst_stride_argb) + j]);
if (abs_diff > max_diff) {
max_diff = abs_diff;
@@ -198,78 +194,83 @@ static int ARGBClipTestFilter(int src_width, int src_height,
return max_diff;
}
-#define TEST_FACTOR1(name, filter, factor, max_diff) \
+#define TEST_FACTOR1(name, filter, hfactor, vfactor, max_diff) \
TEST_F(libyuvTest, ARGBScaleDownBy##name##_##filter) { \
int diff = ARGBTestFilter(benchmark_width_, benchmark_height_, \
- Abs(benchmark_width_) / factor, \
- Abs(benchmark_height_) / factor, \
+ Abs(benchmark_width_) * hfactor, \
+ Abs(benchmark_height_) * vfactor, \
kFilter##filter, benchmark_iterations_); \
EXPECT_LE(diff, max_diff); \
} \
TEST_F(libyuvTest, ARGBScaleDownClipBy##name##_##filter) { \
int diff = ARGBClipTestFilter(benchmark_width_, benchmark_height_, \
- Abs(benchmark_width_) / factor, \
- Abs(benchmark_height_) / factor, \
+ Abs(benchmark_width_) * hfactor, \
+ Abs(benchmark_height_) * vfactor, \
kFilter##filter, benchmark_iterations_); \
EXPECT_LE(diff, max_diff); \
}
-// Test a scale factor with all 2 filters. Expect unfiltered to be exact, but
+// Test a scale factor with all 4 filters. Expect unfiltered to be exact, but
// filtering is different fixed point implementations for SSSE3, Neon and C.
-#define TEST_FACTOR(name, factor) \
- TEST_FACTOR1(name, None, factor, 0) \
- TEST_FACTOR1(name, Bilinear, factor, 2)
+#define TEST_FACTOR(name, hfactor, vfactor) \
+ TEST_FACTOR1(name, None, hfactor, vfactor, 2) \
+ TEST_FACTOR1(name, Linear, hfactor, vfactor, 2) \
+ TEST_FACTOR1(name, Bilinear, hfactor, vfactor, 2) \
+ TEST_FACTOR1(name, Box, hfactor, vfactor, 2)
// TODO(fbarchard): ScaleDownBy1 should be lossless, but Box has error of 2.
-TEST_FACTOR(1, 1)
-TEST_FACTOR(2, 2)
-TEST_FACTOR(4, 4)
-TEST_FACTOR(5, 5)
-TEST_FACTOR(8, 8)
-TEST_FACTOR(16, 16)
-TEST_FACTOR(2by3, 2 / 3)
-TEST_FACTOR(3by4, 3 / 4)
-TEST_FACTOR(3by8, 3 / 8)
+TEST_FACTOR(1, 1 / 1, 1 / 1)
+TEST_FACTOR(2, 1 / 2, 1 / 2)
+TEST_FACTOR(4, 1 / 4, 1 / 4)
+TEST_FACTOR(8, 1 / 8, 1 / 8)
+TEST_FACTOR(16, 1 / 16, 1 / 16)
+TEST_FACTOR(2by3, 2 / 3, 2 / 3)
+TEST_FACTOR(3by4, 3 / 4, 3 / 4)
+TEST_FACTOR(3by8, 3 / 8, 3 / 8)
+TEST_FACTOR(Vertical2by3, 1, 2 / 3)
#undef TEST_FACTOR1
#undef TEST_FACTOR
-#define TEST_SCALETO1(width, height, filter, max_diff) \
- TEST_F(libyuvTest, ARGBScaleTo##width##x##height##_##filter) { \
+#define TEST_SCALETO1(name, width, height, filter, max_diff) \
+ TEST_F(libyuvTest, name##To##width##x##height##_##filter) { \
int diff = ARGBTestFilter(benchmark_width_, benchmark_height_, \
width, height, \
kFilter##filter, benchmark_iterations_); \
EXPECT_LE(diff, max_diff); \
} \
- TEST_F(libyuvTest, ARGBScaleFrom##width##x##height##_##filter) { \
+ TEST_F(libyuvTest, name##From##width##x##height##_##filter) { \
int diff = ARGBTestFilter(width, height, \
Abs(benchmark_width_), Abs(benchmark_height_), \
kFilter##filter, benchmark_iterations_); \
EXPECT_LE(diff, max_diff); \
} \
- TEST_F(libyuvTest, ARGBScaleClipTo##width##x##height##_##filter) { \
+ TEST_F(libyuvTest, name##ClipTo##width##x##height##_##filter) { \
int diff = ARGBClipTestFilter(benchmark_width_, benchmark_height_, \
width, height, \
kFilter##filter, benchmark_iterations_); \
EXPECT_LE(diff, max_diff); \
} \
- TEST_F(libyuvTest, ARGBScaleClipFrom##width##x##height##_##filter) { \
+ TEST_F(libyuvTest, name##ClipFrom##width##x##height##_##filter) { \
int diff = ARGBClipTestFilter(width, height, \
Abs(benchmark_width_), Abs(benchmark_height_), \
kFilter##filter, benchmark_iterations_); \
EXPECT_LE(diff, max_diff); \
}
-// Test scale to a specified size with all 3 filters.
-#define TEST_SCALETO(width, height) \
- TEST_SCALETO1(width, height, None, 0) \
- TEST_SCALETO1(width, height, Bilinear, 2)
-
-TEST_SCALETO(640, 360)
-TEST_SCALETO(853, 480)
-TEST_SCALETO(1280, 720)
-TEST_SCALETO(1280, 800)
-TEST_SCALETO(1366, 768)
-TEST_SCALETO(1920, 1080)
+// Test scale to a specified size with all 4 filters.
+#define TEST_SCALETO(name, width, height) \
+ TEST_SCALETO1(name, width, height, None, 0) \
+ TEST_SCALETO1(name, width, height, Linear, 3) \
+ TEST_SCALETO1(name, width, height, Bilinear, 3) \
+ TEST_SCALETO1(name, width, height, Box, 3)
+
+TEST_SCALETO(ARGBScale, 1, 1)
+TEST_SCALETO(ARGBScale, 320, 240)
+TEST_SCALETO(ARGBScale, 352, 288)
+TEST_SCALETO(ARGBScale, 640, 360)
+TEST_SCALETO(ARGBScale, 853, 480)
+TEST_SCALETO(ARGBScale, 1280, 720)
+TEST_SCALETO(ARGBScale, 1920, 1080)
#undef TEST_SCALETO1
#undef TEST_SCALETO
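Note: the hfactor and vfactor arguments are deliberately spelled as textual fractions. The macro multiplies before dividing, so Abs(benchmark_width_) * hfactor expands to Abs(benchmark_width_) * 3 / 4 for the 3by4 case and evaluates left to right, keeping the precision; evaluating the fraction on its own would truncate to zero in integer arithmetic. The same spelling is used by scale_test.cc below. A two-line illustration:

    #include <stdio.h>

    int main() {
      const int width = 1280;
      printf("%d\n", width * 3 / 4);    // 960: how the macro expansion evaluates.
      printf("%d\n", width * (3 / 4));  // 0: what a pre-evaluated fraction gives.
      return 0;
    }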
diff --git a/chromium/third_party/libyuv/unit_test/scale_test.cc b/chromium/third_party/libyuv/unit_test/scale_test.cc
index 769151aa232..c6f25604608 100644
--- a/chromium/third_party/libyuv/unit_test/scale_test.cc
+++ b/chromium/third_party/libyuv/unit_test/scale_test.cc
@@ -17,10 +17,6 @@
namespace libyuv {
-static __inline int Abs(int v) {
- return v >= 0 ? v : -v;
-}
-
// Test scaling with C vs Opt and return maximum pixel difference. 0 = exact.
static int TestFilter(int src_width, int src_height,
int dst_width, int dst_height,
@@ -99,7 +95,7 @@ static int TestFilter(int src_width, int src_height,
int max_diff = 0;
for (i = b; i < (dst_height + b); ++i) {
for (j = b; j < (dst_width + b); ++j) {
- int abs_diff = abs(dst_y_c[(i * dst_stride_y) + j] -
+ int abs_diff = Abs(dst_y_c[(i * dst_stride_y) + j] -
dst_y_opt[(i * dst_stride_y) + j]);
if (abs_diff > max_diff) {
max_diff = abs_diff;
@@ -109,12 +105,12 @@ static int TestFilter(int src_width, int src_height,
for (i = b; i < (dst_height_uv + b); ++i) {
for (j = b; j < (dst_width_uv + b); ++j) {
- int abs_diff = abs(dst_u_c[(i * dst_stride_uv) + j] -
+ int abs_diff = Abs(dst_u_c[(i * dst_stride_uv) + j] -
dst_u_opt[(i * dst_stride_uv) + j]);
if (abs_diff > max_diff) {
max_diff = abs_diff;
}
- abs_diff = abs(dst_v_c[(i * dst_stride_uv) + j] -
+ abs_diff = Abs(dst_v_c[(i * dst_stride_uv) + j] -
dst_v_opt[(i * dst_stride_uv) + j]);
if (abs_diff > max_diff) {
max_diff = abs_diff;
@@ -136,61 +132,64 @@ static int TestFilter(int src_width, int src_height,
return max_diff;
}
-#define TEST_FACTOR1(name, filter, factor, max_diff) \
+#define TEST_FACTOR1(name, filter, hfactor, vfactor, max_diff) \
TEST_F(libyuvTest, ScaleDownBy##name##_##filter) { \
int diff = TestFilter(benchmark_width_, benchmark_height_, \
- Abs(benchmark_width_) / factor, \
- Abs(benchmark_height_) / factor, \
+ Abs(benchmark_width_) * hfactor, \
+ Abs(benchmark_height_) * vfactor, \
kFilter##filter, benchmark_iterations_); \
EXPECT_LE(diff, max_diff); \
}
-// Test a scale factor with all 3 filters. Expect unfiltered to be exact, but
+// Test a scale factor with all 4 filters. Expect unfiltered to be exact, but
// filtering is different fixed point implementations for SSSE3, Neon and C.
-#define TEST_FACTOR(name, factor) \
- TEST_FACTOR1(name, None, factor, 0) \
- TEST_FACTOR1(name, Bilinear, factor, 2) \
- TEST_FACTOR1(name, Box, factor, 2) \
+#define TEST_FACTOR(name, hfactor, vfactor) \
+ TEST_FACTOR1(name, None, hfactor, vfactor, 0) \
+ TEST_FACTOR1(name, Linear, hfactor, vfactor, 3) \
+ TEST_FACTOR1(name, Bilinear, hfactor, vfactor, 3) \
+ TEST_FACTOR1(name, Box, hfactor, vfactor, 3) \
// TODO(fbarchard): ScaleDownBy1 should be lossless, but Box has error of 2.
-TEST_FACTOR(1, 1)
-TEST_FACTOR(2, 2)
-TEST_FACTOR(4, 4)
-TEST_FACTOR(5, 5)
-TEST_FACTOR(8, 8)
-TEST_FACTOR(16, 16)
-TEST_FACTOR(2by3, 2 / 3)
-TEST_FACTOR(3by4, 3 / 4)
-TEST_FACTOR(3by8, 3 / 8)
+TEST_FACTOR(1, 1 / 1, 1 / 1)
+TEST_FACTOR(2, 1 / 2, 1 / 2)
+TEST_FACTOR(4, 1 / 4, 1 / 4)
+TEST_FACTOR(8, 1 / 8, 1 / 8)
+TEST_FACTOR(16, 1 / 16, 1 / 16)
+TEST_FACTOR(2by3, 2 / 3, 2 / 3)
+TEST_FACTOR(3by4, 3 / 4, 3 / 4)
+TEST_FACTOR(3by8, 3 / 8, 3 / 8)
+TEST_FACTOR(Vertical2by3, 1, 2 / 3)
#undef TEST_FACTOR1
#undef TEST_FACTOR
-#define TEST_SCALETO1(width, height, filter, max_diff) \
- TEST_F(libyuvTest, ScaleTo##width##x##height##_##filter) { \
+#define TEST_SCALETO1(name, width, height, filter, max_diff) \
+ TEST_F(libyuvTest, name##To##width##x##height##_##filter) { \
int diff = TestFilter(benchmark_width_, benchmark_height_, \
width, height, \
kFilter##filter, benchmark_iterations_); \
EXPECT_LE(diff, max_diff); \
} \
- TEST_F(libyuvTest, ScaleFrom##width##x##height##_##filter) { \
+ TEST_F(libyuvTest, name##From##width##x##height##_##filter) { \
int diff = TestFilter(width, height, \
Abs(benchmark_width_), Abs(benchmark_height_), \
kFilter##filter, benchmark_iterations_); \
EXPECT_LE(diff, max_diff); \
}
-// Test scale to a specified size with all 3 filters.
-#define TEST_SCALETO(width, height) \
- TEST_SCALETO1(width, height, None, 0) \
- TEST_SCALETO1(width, height, Bilinear, 2) \
- TEST_SCALETO1(width, height, Box, 2) \
-
-TEST_SCALETO(640, 360)
-TEST_SCALETO(853, 480)
-TEST_SCALETO(1280, 720)
-TEST_SCALETO(1280, 800)
-TEST_SCALETO(1366, 768)
-TEST_SCALETO(1920, 1080)
+// Test scale to a specified size with all 4 filters.
+#define TEST_SCALETO(name, width, height) \
+ TEST_SCALETO1(name, width, height, None, 0) \
+ TEST_SCALETO1(name, width, height, Linear, 3) \
+ TEST_SCALETO1(name, width, height, Bilinear, 3) \
+ TEST_SCALETO1(name, width, height, Box, 3)
+
+TEST_SCALETO(Scale, 1, 1)
+TEST_SCALETO(Scale, 320, 240)
+TEST_SCALETO(Scale, 352, 288)
+TEST_SCALETO(Scale, 640, 360)
+TEST_SCALETO(Scale, 853, 480)
+TEST_SCALETO(Scale, 1280, 720)
+TEST_SCALETO(Scale, 1920, 1080)
#undef TEST_SCALETO1
#undef TEST_SCALETO
diff --git a/chromium/third_party/libyuv/unit_test/unit_test.cc b/chromium/third_party/libyuv/unit_test/unit_test.cc
index fac70262133..b11bd246313 100644
--- a/chromium/third_party/libyuv/unit_test/unit_test.cc
+++ b/chromium/third_party/libyuv/unit_test/unit_test.cc
@@ -19,8 +19,8 @@
#define BENCHMARK_ITERATIONS 1
libyuvTest::libyuvTest() : rotate_max_w_(128), rotate_max_h_(128),
- benchmark_iterations_(BENCHMARK_ITERATIONS), benchmark_width_(128),
- benchmark_height_(72) {
+ benchmark_iterations_(BENCHMARK_ITERATIONS), benchmark_width_(22),
+ benchmark_height_(14) {
const char* repeat = getenv("LIBYUV_REPEAT");
if (repeat) {
benchmark_iterations_ = atoi(repeat); // NOLINT
@@ -39,9 +39,14 @@ libyuvTest::libyuvTest() : rotate_max_w_(128), rotate_max_h_(128),
if (height) {
benchmark_height_ = atoi(height); // NOLINT
}
- benchmark_pixels_div256_ = static_cast<int>(
- (static_cast<double>(benchmark_width_ *
- benchmark_height_) * benchmark_iterations_ + 255.0) / 256.0);
+ benchmark_pixels_div256_ = static_cast<int>((
+ static_cast<double>(Abs(benchmark_width_)) *
+ static_cast<double>(Abs(benchmark_height_)) *
+ static_cast<double>(benchmark_iterations_) + 255.0) / 256.0);
+ benchmark_pixels_div1280_ = static_cast<int>((
+ static_cast<double>(Abs(benchmark_width_)) *
+ static_cast<double>(Abs(benchmark_height_)) *
+ static_cast<double>(benchmark_iterations_) + 1279.0) / 1280.0);
}
int main(int argc, char** argv) {
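Note: benchmark_pixels_div1280_ mirrors the existing div256 counter: it is the ceiling of |width| * |height| * iterations / 1280, so the 1280-pixel row tests above run at least once even for tiny benchmark sizes. The same computation in integer arithmetic (a sketch with an illustrative name; the harness itself uses doubles as shown above):

    #include <stdint.h>

    static int PixelsDiv1280(int width, int height, int iterations) {
      const int64_t w = width < 0 ? -width : width;
      const int64_t h = height < 0 ? -height : height;
      const int64_t total = w * h * iterations;
      return static_cast<int>((total + 1279) / 1280);  // Ceiling division.
    }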
diff --git a/chromium/third_party/libyuv/unit_test/unit_test.h b/chromium/third_party/libyuv/unit_test/unit_test.h
index e81aea30780..89b333bdd59 100644
--- a/chromium/third_party/libyuv/unit_test/unit_test.h
+++ b/chromium/third_party/libyuv/unit_test/unit_test.h
@@ -11,10 +11,21 @@
#ifndef UNIT_TEST_UNIT_TEST_H_ // NOLINT
#define UNIT_TEST_UNIT_TEST_H_
+#ifdef WIN32
+#include <windows.h>
+#else
+#include <sys/time.h>
+#include <sys/resource.h>
+#endif
+
#include <gtest/gtest.h>
#include "libyuv/basic_types.h"
+static __inline int Abs(int v) {
+ return v >= 0 ? v : -v;
+}
+
#define align_buffer_64(var, size) \
uint8* var; \
uint8* var##_mem; \
@@ -38,7 +49,6 @@
var = 0;
#ifdef WIN32
-#include <windows.h>
static inline double get_time() {
LARGE_INTEGER t, f;
QueryPerformanceCounter(&t);
@@ -49,10 +59,6 @@ static inline double get_time() {
#define random rand
#define srandom srand
#else
-
-#include <sys/time.h>
-#include <sys/resource.h>
-
static inline double get_time() {
struct timeval t;
struct timezone tzp;
@@ -63,9 +69,9 @@ static inline double get_time() {
static inline void MemRandomize(uint8* dst, int len) {
int i;
- for (i = 0; i < len - 3; i += 4) {
- *reinterpret_cast<uint32*>(dst) = random();
- dst += 4;
+ for (i = 0; i < len - 1; i += 2) {
+ *reinterpret_cast<uint16*>(dst) = random();
+ dst += 2;
}
for (; i < len; ++i) {
*dst++ = random();
@@ -83,6 +89,7 @@ class libyuvTest : public ::testing::Test {
int benchmark_width_; // Default 1280. Use 640 for benchmarking VGA.
int benchmark_height_; // Default 720. Use 360 for benchmarking VGA.
int benchmark_pixels_div256_; // Total pixels to benchmark / 256.
+ int benchmark_pixels_div1280_; // Total pixels to benchmark / 1280.
};
#endif // UNIT_TEST_UNIT_TEST_H_ NOLINT
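The MemRandomize change fills the buffer two bytes per call instead of four, presumably because random is #defined to rand on Windows, where RAND_MAX is only 0x7fff; a 32-bit store from rand() would leave the upper bytes of every word constant, while a 16-bit store keeps at least 15 of every 16 bits random. A small, self-contained sketch of the same fill pattern (uint16_t/uint8_t stand in for libyuv's uint16/uint8 typedefs):

    #include <cstdint>
    #include <cstdlib>

    // Fill len bytes with pseudo-random data, 16 bits per rand() call,
    // followed by a byte-at-a-time tail for odd lengths.
    static void FillRandom(uint8_t* dst, int len) {
      int i;
      for (i = 0; i < len - 1; i += 2) {
        *reinterpret_cast<uint16_t*>(dst) = static_cast<uint16_t>(rand());
        dst += 2;
      }
      for (; i < len; ++i) {
        *dst++ = static_cast<uint8_t>(rand());
      }
    }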
diff --git a/chromium/third_party/libyuv/util/convert.cc b/chromium/third_party/libyuv/util/convert.cc
index 18316ef8efb..5f071416da4 100644
--- a/chromium/third_party/libyuv/util/convert.cc
+++ b/chromium/third_party/libyuv/util/convert.cc
@@ -155,8 +155,8 @@ void ParseOptions(int argc, const char* argv[]) {
}
}
-static const int kTileX = 12;
-static const int kTileY = 8;
+static const int kTileX = 32;
+static const int kTileY = 32;
static int TileARGBScale(const uint8* src_argb, int src_stride_argb,
int src_width, int src_height,
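The larger 32x32 tile means TileARGBScale issues fewer, bigger per-tile scaling calls. Purely as an illustration (the loop and its scale_tile callback below are hypothetical, not the utility's actual code), tiled scaling typically walks the destination in kTileX x kTileY blocks and clamps the last row and column of tiles at the image edge:

    // Hypothetical tile walk: invoke a per-tile scaler on kTileX x kTileY
    // blocks of the destination, clamping the final partial tiles.
    typedef void (*ScaleTileFn)(int dst_x, int dst_y, int tile_w, int tile_h);

    static void ForEachTile(int dst_width, int dst_height, ScaleTileFn scale_tile) {
      const int kTileX = 32;
      const int kTileY = 32;
      for (int y = 0; y < dst_height; y += kTileY) {
        const int tile_h = (dst_height - y < kTileY) ? dst_height - y : kTileY;
        for (int x = 0; x < dst_width; x += kTileX) {
          const int tile_w = (dst_width - x < kTileX) ? dst_width - x : kTileX;
          scale_tile(x, y, tile_w, tile_h);  // scale just this clip rectangle
        }
      }
    }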
diff --git a/chromium/third_party/libyuv/util/cpuid.c b/chromium/third_party/libyuv/util/cpuid.c
index 8d8529ba7c6..db22871ea50 100644
--- a/chromium/third_party/libyuv/util/cpuid.c
+++ b/chromium/third_party/libyuv/util/cpuid.c
@@ -25,7 +25,7 @@ int main(int argc, const char* argv[]) {
#if defined(__i386__) || defined(__x86_64__) || \
defined(_M_IX86) || defined(_M_X64)
if (has_x86) {
- int family, model, cpu_info[4];
+ uint32 family, model, cpu_info[4];
// Vendor ID:
// AuthenticAMD AMD processor
// CentaurHauls Centaur processor
@@ -37,7 +37,7 @@ int main(int argc, const char* argv[]) {
// RiseRiseRise Rise Technology processor
// SiS SiS SiS SiS processor
// UMC UMC UMC UMC processor
- CpuId(cpu_info, 0);
+ CpuId(0, 0, &cpu_info[0]);
cpu_info[0] = cpu_info[1]; // Reorder output
cpu_info[1] = cpu_info[3];
cpu_info[3] = 0;
@@ -50,7 +50,7 @@ int main(int argc, const char* argv[]) {
// 13:12 - Processor Type
// 19:16 - Extended Model
// 27:20 - Extended Family
- CpuId(cpu_info, 1);
+ CpuId(1, 0, &cpu_info[0]);
family = ((cpu_info[0] >> 8) & 0x0f) | ((cpu_info[0] >> 16) & 0xff0);
model = ((cpu_info[0] >> 4) & 0x0f) | ((cpu_info[0] >> 12) & 0xf0);
printf("Cpu Family %d (0x%x), Model %d (0x%x)\n", family, family,
@@ -79,6 +79,7 @@ int main(int argc, const char* argv[]) {
int has_avx = TestCpuFlag(kCpuHasAVX);
int has_avx2 = TestCpuFlag(kCpuHasAVX2);
int has_erms = TestCpuFlag(kCpuHasERMS);
+ int has_fma3 = TestCpuFlag(kCpuHasFMA3);
printf("Has SSE2 %x\n", has_sse2);
printf("Has SSSE3 %x\n", has_ssse3);
printf("Has SSE4.1 %x\n", has_sse41);
@@ -86,6 +87,7 @@ int main(int argc, const char* argv[]) {
printf("Has AVX %x\n", has_avx);
printf("Has AVX2 %x\n", has_avx2);
printf("Has ERMS %x\n", has_erms);
+ printf("Has FMA3 %x\n", has_fma3);
}
return 0;
}
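The family/model decode above folds the CPUID extended-family and extended-model fields into the base values. A worked example using an illustrative Haswell signature (EAX = 0x000306C3 from CPUID leaf 1; the value is a sample, not output from this tool):

    #include <cstdio>

    int main() {
      // Same bit fiddling as in cpuid.c above, applied to a sample EAX value.
      unsigned eax = 0x000306C3;  // illustrative: family 6, model 0x3C, stepping 3
      unsigned family = ((eax >> 8) & 0x0f) | ((eax >> 16) & 0xff0);
      unsigned model = ((eax >> 4) & 0x0f) | ((eax >> 12) & 0xf0);
      printf("Cpu Family %u (0x%x), Model %u (0x%x)\n", family, family, model, model);
      // Prints: Cpu Family 6 (0x6), Model 60 (0x3c)
      return 0;
    }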
diff --git a/chromium/third_party/libyuv/util/psnr.h b/chromium/third_party/libyuv/util/psnr.h
index 2cd0b1457ce..370337a75f2 100644
--- a/chromium/third_party/libyuv/util/psnr.h
+++ b/chromium/third_party/libyuv/util/psnr.h
@@ -10,7 +10,7 @@
// Get PSNR for video sequence. Assuming RAW 4:2:0 Y:Cb:Cr format
-#ifndef UTIL_PSNR_H_
+#ifndef UTIL_PSNR_H_ // NOLINT
#define UTIL_PSNR_H_
#ifdef __cplusplus
@@ -36,4 +36,4 @@ double ComputeSumSquareError(const uint8* org, const uint8* rec, int size);
} // extern "C"
#endif
-#endif // UTIL_PSNR_H_
+#endif // UTIL_PSNR_H_ // NOLINT
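For context, ComputeSumSquareError feeds the standard PSNR formula, PSNR = 10 * log10(255^2 * size / SSE), with a cap when the frames are identical. A minimal sketch of that final step (the helper name and the 128 dB cap are illustrative choices, not necessarily the tool's own):

    #include <math.h>

    // Standard PSNR from a sum of squared errors over `size` samples.
    static double PsnrFromSse(double sse, double size) {
      if (sse <= 0.0 || size <= 0.0)
        return 128.0;  // identical (or empty) frames: report a fixed maximum
      const double kMaxPixelValue = 255.0;
      return 10.0 * log10(kMaxPixelValue * kMaxPixelValue * size / sse);
    }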
diff --git a/chromium/third_party/libyuv/util/ssim.cc b/chromium/third_party/libyuv/util/ssim.cc
index 277561dd00d..d07889a8ac8 100644
--- a/chromium/third_party/libyuv/util/ssim.cc
+++ b/chromium/third_party/libyuv/util/ssim.cc
@@ -8,7 +8,7 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#include "./ssim.h"
+#include "../util/ssim.h" // NOLINT
#include <math.h>
#include <string.h>
diff --git a/chromium/third_party/libyuv/util/ssim.h b/chromium/third_party/libyuv/util/ssim.h
index 0689276addc..40120b4f4e7 100644
--- a/chromium/third_party/libyuv/util/ssim.h
+++ b/chromium/third_party/libyuv/util/ssim.h
@@ -10,7 +10,7 @@
// Get SSIM for video sequence. Assuming RAW 4:2:0 Y:Cb:Cr format
-#ifndef UTIL_SSIM_H_
+#ifndef UTIL_SSIM_H_ // NOLINT
#define UTIL_SSIM_H_
#ifdef __cplusplus
@@ -32,4 +32,4 @@ double CalcLSSIM(double ssim);
} // extern "C"
#endif
-#endif // UTIL_SSIM_H_
+#endif // UTIL_SSIM_H_ // NOLINT
diff --git a/chromium/third_party/libyuv/winarm.mk b/chromium/third_party/libyuv/winarm.mk
new file mode 100644
index 00000000000..2638608ebc9
--- /dev/null
+++ b/chromium/third_party/libyuv/winarm.mk
@@ -0,0 +1,43 @@
+# This is a generic makefile for libyuv for Windows Arm.
+# nmake /f winarm.mk
+# make -f winarm.mk
+# nmake /f winarm.mk clean
+# consider /arch:ARMv7VE
+CC=cl
+CCFLAGS=/Ox /nologo /Iinclude /DWINAPI_FAMILY=WINAPI_FAMILY_PHONE_APP
+AR=lib
+ARFLAGS=/MACHINE:ARM /NOLOGO /SUBSYSTEM:NATIVE
+RM=cmd /c del
+
+LOCAL_OBJ_FILES = \
+ source/compare.o\
+ source/compare_common.o\
+ source/convert.o\
+ source/convert_argb.o\
+ source/convert_from.o\
+ source/convert_from_argb.o\
+ source/convert_to_argb.o\
+ source/convert_to_i420.o\
+ source/cpu_id.o\
+ source/format_conversion.o\
+ source/planar_functions.o\
+ source/rotate.o\
+ source/rotate_argb.o\
+ source/row_any.o\
+ source/row_common.o\
+ source/scale.o\
+ source/scale_argb.o\
+ source/scale_common.o\
+ source/video_common.o
+
+.cc.o:
+ $(CC) /c $(CCFLAGS) $*.cc /Fo$@
+
+all: libyuv_arm.lib winarm.mk
+
+libyuv_arm.lib: $(LOCAL_OBJ_FILES) winarm.mk
+ $(AR) $(ARFLAGS) /OUT:$@ $(LOCAL_OBJ_FILES)
+
+clean:
+ $(RM) "source\*.o" libyuv_arm.lib
+
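Once libyuv_arm.lib is built, a consumer just compiles against include/ and links the static library. A minimal sketch, assuming the long-standing I420Scale entry point from libyuv/scale.h (check the header in this tree for the exact prototype; the command line is illustrative):

    // cl /Ox /Iinclude app.cc /link libyuv_arm.lib
    #include <vector>
    #include "libyuv/scale.h"

    int main() {
      const int sw = 1280, sh = 720, dw = 640, dh = 360;
      std::vector<uint8> src(sw * sh * 3 / 2);  // I420: Y plane + quarter-size U and V
      std::vector<uint8> dst(dw * dh * 3 / 2);
      libyuv::I420Scale(&src[0], sw,                      // Y
                        &src[sw * sh], sw / 2,            // U
                        &src[sw * sh * 5 / 4], sw / 2,    // V
                        sw, sh,
                        &dst[0], dw,
                        &dst[dw * dh], dw / 2,
                        &dst[dw * dh * 5 / 4], dw / 2,
                        dw, dh,
                        libyuv::kFilterBox);
      return 0;
    }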