author     Andras Becsi <andras.becsi@digia.com>              2014-03-18 13:16:26 +0100
committer  Frederik Gladhorn <frederik.gladhorn@digia.com>    2014-03-20 15:55:39 +0100
commit     3f0f86b0caed75241fa71c95a5d73bc0164348c5 (patch)
tree       92b9fb00f2e9e90b0be2262093876d4f43b6cd13 /chromium/third_party/libyuv
parent     e90d7c4b152c56919d963987e2503f9909a666d2 (diff)
download   qtwebengine-chromium-3f0f86b0caed75241fa71c95a5d73bc0164348c5.tar.gz
Update to new stable branch 1750
This also includes updates to Ninja and to the Chromium dependencies
needed on Windows.
Change-Id: Icd597d80ed3fa4425933c9f1334c3c2e31291c42
Reviewed-by: Zoltan Arvai <zarvai@inf.u-szeged.hu>
Reviewed-by: Zeno Albisser <zeno.albisser@digia.com>
Diffstat (limited to 'chromium/third_party/libyuv')
66 files changed, 12329 insertions, 7506 deletions
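
Among the API-visible changes in the diff below, include/libyuv/cpu_id.h gains a kCpuHasFMA3 flag, moves the MIPS flags to higher bits, and changes the CpuId() signature. A minimal caller-side sketch of runtime feature detection against the updated header, assuming libyuv's public TestCpuFlag() entry point (declared in cpu_id.h but not shown in full in this diff):

    // Sketch only: feature checks against the flags added/renumbered in this
    // update; assumes the libyuv 911 headers imported by this commit.
    #include "libyuv/cpu_id.h"

    bool CanUseFma3() {
      // TestCpuFlag() performs lazy CPU detection on first use.
      return libyuv::TestCpuFlag(libyuv::kCpuHasFMA3) != 0;
    }

    bool CanUseMipsDspR2() {
      // The MIPS flags moved from 0x1000..0x4000 to 0x10000..0x40000 in this
      // update, so callers should use the symbolic constants, not raw bits.
      return libyuv::TestCpuFlag(libyuv::kCpuHasMIPS_DSPR2) != 0;
    }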
diff --git a/chromium/third_party/libyuv/Android.mk b/chromium/third_party/libyuv/Android.mk
index 513a1961b5c..3d8ba49a318 100644
--- a/chromium/third_party/libyuv/Android.mk
+++ b/chromium/third_party/libyuv/Android.mk
@@ -27,7 +27,9 @@ LOCAL_SRC_FILES := \
     source/row_posix.cc \
     source/scale.cc \
     source/scale_argb.cc \
+    source/scale_common.cc \
     source/scale_mips.cc \
+    source/scale_posix.cc \
     source/video_common.cc
 
 # TODO(fbarchard): Enable mjpeg encoder.
@@ -41,7 +43,6 @@ ifeq ($(TARGET_ARCH_ABI),armeabi-v7a)
     source/compare_neon.cc.neon \
     source/rotate_neon.cc.neon \
     source/row_neon.cc.neon \
-    source/scale_argb_neon.cc.neon \
     source/scale_neon.cc.neon
 endif
 
diff --git a/chromium/third_party/libyuv/DEPS b/chromium/third_party/libyuv/DEPS
index eafc459c3f3..7e866873c45 100644
--- a/chromium/third_party/libyuv/DEPS
+++ b/chromium/third_party/libyuv/DEPS
@@ -13,7 +13,7 @@ vars = {
   "googlecode_url": "http://%s.googlecode.com/svn",
   "chromium_trunk" : "http://src.chromium.org/svn/trunk",
   # chrome://version/ for revision of canary Chrome.
-  "chromium_revision": "202548",
+  "chromium_revision": "232627",
 }
 
 # NOTE: Prefer revision numbers to tags for svn deps. Use http rather than
@@ -78,6 +78,26 @@ deps_os = {
     "third_party/gold":
         From("chromium_deps", "src/third_party/gold"),
   },
+  "android": {
+    "third_party/android_tools":
+        From("chromium_deps", "src/third_party/android_tools"),
+  },
+  "ios": {
+    # NSS, for SSLClientSocketNSS.
+    "third_party/nss":
+        From("chromium_deps", "src/third_party/nss"),
+
+    "net/third_party/nss":
+        Var("chromium_trunk") + "/src/net/third_party/nss@" + Var("chromium_revision"),
+
+    # class-dump utility to generate header files for undocumented SDKs.
+    "testing/iossim/third_party/class-dump":
+        From("chromium_deps", "src/testing/iossim/third_party/class-dump"),
+
+    # Helper for running under the simulator.
+    "testing/iossim":
+        Var("chromium_trunk") + "/src/testing/iossim@" + Var("chromium_revision"),
+  },
 }
 
 hooks = [
@@ -92,7 +112,7 @@ hooks = [
     # A change to a .gyp, .gypi, or to GYP itself should run the generator.
     "pattern": ".",
     "action": ["python", Var("root_dir") + "/build/gyp_chromium",
-               "--depth=" + Var("root_dir"), Var("root_dir") + "/libyuv_test.gyp",
+               "--depth=" + Var("root_dir"), Var("root_dir") + "/all.gyp",
                Var("extra_gyp_flag")],
   },
   {
diff --git a/chromium/third_party/libyuv/OWNERS b/chromium/third_party/libyuv/OWNERS
index cbe985ecfdd..df673dfa5e5 100644
--- a/chromium/third_party/libyuv/OWNERS
+++ b/chromium/third_party/libyuv/OWNERS
@@ -1,3 +1,2 @@
 fbarchard@chromium.org
 mflodman@chromium.org
-
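
The Android.mk hunk above folds the ARGB scaler into the shared scale sources (scale_common.cc, scale_posix.cc) and drops scale_argb_neon.cc; the matching API change, in the scale.h hunk further down, inserts kFilterLinear into FilterMode and renumbers kFilterBilinear and kFilterBox. A minimal caller-side sketch, with ScalePlane()'s signature assumed from libyuv's public scale.h (only partially shown in this diff):

    // Sketch only: downscale a luma plane by 2x using the revised FilterMode
    // values; kFilterBilinear is now 2, so use the enumerator, not a literal.
    #include "libyuv/scale.h"

    void HalveLumaPlane(const uint8* src_y, int src_stride, int src_w, int src_h,
                        uint8* dst_y, int dst_stride) {
      libyuv::ScalePlane(src_y, src_stride, src_w, src_h,
                         dst_y, dst_stride, src_w / 2, src_h / 2,
                         libyuv::kFilterBilinear);
    }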
diff --git a/chromium/third_party/libyuv/README.chromium b/chromium/third_party/libyuv/README.chromium
index edc5d82ba88..f11363cc425 100644
--- a/chromium/third_party/libyuv/README.chromium
+++ b/chromium/third_party/libyuv/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 723
+Version: 911
 License: BSD
 License File: LICENSE
 
diff --git a/chromium/third_party/libyuv/all.gyp b/chromium/third_party/libyuv/all.gyp
new file mode 100644
index 00000000000..cc72d9d6fc3
--- /dev/null
+++ b/chromium/third_party/libyuv/all.gyp
@@ -0,0 +1,21 @@
+# Copyright 2013 The LibYuv Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS. All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+
+# all.gyp and All target are for benefit of android gyp build.
+{
+  'targets': [
+    {
+      'target_name': 'All',
+      'type': 'none',
+      'dependencies': [
+        'libyuv.gyp:*',
+        'libyuv_test.gyp:*',
+      ],
+    },
+  ],
+}
diff --git a/chromium/third_party/libyuv/include/libyuv/convert_from_argb.h b/chromium/third_party/libyuv/include/libyuv/convert_from_argb.h
index be3bba44433..f0343a77d3e 100644
--- a/chromium/third_party/libyuv/include/libyuv/convert_from_argb.h
+++ b/chromium/third_party/libyuv/include/libyuv/convert_from_argb.h
@@ -1,168 +1,168 @@
-/*
- * Copyright 2012 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_ // NOLINT
-#define INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_
-
-#include "libyuv/basic_types.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// Copy ARGB to ARGB.
-#define ARGBToARGB ARGBCopy
-LIBYUV_API
-int ARGBCopy(const uint8* src_argb, int src_stride_argb,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height);
-
-// Convert ARGB To BGRA. (alias)
-#define ARGBToBGRA BGRAToARGB
-LIBYUV_API
-int BGRAToARGB(const uint8* src_frame, int src_stride_frame,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height);
-
-// Convert ARGB To ABGR. (alias)
-#define ARGBToABGR ABGRToARGB
-LIBYUV_API
-int ABGRToARGB(const uint8* src_frame, int src_stride_frame,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height);
-
-// Convert ARGB To RGBA.
-LIBYUV_API
-int ARGBToRGBA(const uint8* src_frame, int src_stride_frame,
- uint8* dst_argb, int dst_stride_argb,
- int width, int height);
-
-// Convert ARGB To RGB24.
-LIBYUV_API
-int ARGBToRGB24(const uint8* src_argb, int src_stride_argb,
- uint8* dst_rgb24, int dst_stride_rgb24,
- int width, int height);
-
-// Convert ARGB To RAW.
-LIBYUV_API
-int ARGBToRAW(const uint8* src_argb, int src_stride_argb,
- uint8* dst_rgb, int dst_stride_rgb,
- int width, int height);
-
-// Convert ARGB To RGB565.
-LIBYUV_API
-int ARGBToRGB565(const uint8* src_argb, int src_stride_argb,
- uint8* dst_rgb565, int dst_stride_rgb565,
- int width, int height);
-
-// Convert ARGB To ARGB1555.
-LIBYUV_API
-int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb,
- uint8* dst_argb1555, int dst_stride_argb1555,
- int width, int height);
-
-// Convert ARGB To ARGB4444.
-LIBYUV_API
-int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb,
- uint8* dst_argb4444, int dst_stride_argb4444,
- int width, int height);
-
-// Convert ARGB To I444.
-LIBYUV_API
-int ARGBToI444(const uint8* src_argb, int src_stride_argb,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height);
-
-// Convert ARGB To I422.
-LIBYUV_API
-int ARGBToI422(const uint8* src_argb, int src_stride_argb,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height);
-
-// Convert ARGB To I420. (also in convert.h)
-LIBYUV_API
-int ARGBToI420(const uint8* src_argb, int src_stride_argb,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height);
-
-// Convert ARGB to J420. (JPeg full range I420).
-LIBYUV_API
-int ARGBToJ420(const uint8* src_argb, int src_stride_argb,
- uint8* dst_yj, int dst_stride_yj,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height);
-
-// Convert ARGB To I411.
-LIBYUV_API
-int ARGBToI411(const uint8* src_argb, int src_stride_argb,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_u, int dst_stride_u,
- uint8* dst_v, int dst_stride_v,
- int width, int height);
-
-// Convert ARGB to J400. (JPeg full range).
-LIBYUV_API
-int ARGBToJ400(const uint8* src_argb, int src_stride_argb,
- uint8* dst_yj, int dst_stride_yj,
- int width, int height);
-
-// Convert ARGB to I400.
-LIBYUV_API
-int ARGBToI400(const uint8* src_argb, int src_stride_argb,
- uint8* dst_y, int dst_stride_y,
- int width, int height);
-
-// Convert ARGB To NV12.
-LIBYUV_API
-int ARGBToNV12(const uint8* src_argb, int src_stride_argb,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_uv, int dst_stride_uv,
- int width, int height);
-
-// Convert ARGB To NV21.
-LIBYUV_API
-int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_vu, int dst_stride_vu,
- int width, int height);
-
-// Convert ARGB To NV21.
-LIBYUV_API
-int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
- uint8* dst_y, int dst_stride_y,
- uint8* dst_vu, int dst_stride_vu,
- int width, int height);
-
-// Convert ARGB To YUY2.
-LIBYUV_API
-int ARGBToYUY2(const uint8* src_argb, int src_stride_argb,
- uint8* dst_yuy2, int dst_stride_yuy2,
- int width, int height);
-
-// Convert ARGB To UYVY.
-LIBYUV_API
-int ARGBToUYVY(const uint8* src_argb, int src_stride_argb,
- uint8* dst_uyvy, int dst_stride_uyvy,
- int width, int height);
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
-
-#endif // INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_ NOLINT
+/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_ // NOLINT +#define INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_ + +#include "libyuv/basic_types.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// Copy ARGB to ARGB. +#define ARGBToARGB ARGBCopy +LIBYUV_API +int ARGBCopy(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert ARGB To BGRA. (alias) +#define ARGBToBGRA BGRAToARGB +LIBYUV_API +int BGRAToARGB(const uint8* src_frame, int src_stride_frame, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert ARGB To ABGR. (alias) +#define ARGBToABGR ABGRToARGB +LIBYUV_API +int ABGRToARGB(const uint8* src_frame, int src_stride_frame, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert ARGB To RGBA. +LIBYUV_API +int ARGBToRGBA(const uint8* src_frame, int src_stride_frame, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert ARGB To RGB24. +LIBYUV_API +int ARGBToRGB24(const uint8* src_argb, int src_stride_argb, + uint8* dst_rgb24, int dst_stride_rgb24, + int width, int height); + +// Convert ARGB To RAW. +LIBYUV_API +int ARGBToRAW(const uint8* src_argb, int src_stride_argb, + uint8* dst_rgb, int dst_stride_rgb, + int width, int height); + +// Convert ARGB To RGB565. +LIBYUV_API +int ARGBToRGB565(const uint8* src_argb, int src_stride_argb, + uint8* dst_rgb565, int dst_stride_rgb565, + int width, int height); + +// Convert ARGB To ARGB1555. +LIBYUV_API +int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb1555, int dst_stride_argb1555, + int width, int height); + +// Convert ARGB To ARGB4444. +LIBYUV_API +int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb4444, int dst_stride_argb4444, + int width, int height); + +// Convert ARGB To I444. +LIBYUV_API +int ARGBToI444(const uint8* src_argb, int src_stride_argb, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Convert ARGB To I422. +LIBYUV_API +int ARGBToI422(const uint8* src_argb, int src_stride_argb, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Convert ARGB To I420. (also in convert.h) +LIBYUV_API +int ARGBToI420(const uint8* src_argb, int src_stride_argb, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Convert ARGB to J420. (JPeg full range I420). +LIBYUV_API +int ARGBToJ420(const uint8* src_argb, int src_stride_argb, + uint8* dst_yj, int dst_stride_yj, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Convert ARGB To I411. +LIBYUV_API +int ARGBToI411(const uint8* src_argb, int src_stride_argb, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int width, int height); + +// Convert ARGB to J400. (JPeg full range). 
+LIBYUV_API +int ARGBToJ400(const uint8* src_argb, int src_stride_argb, + uint8* dst_yj, int dst_stride_yj, + int width, int height); + +// Convert ARGB to I400. +LIBYUV_API +int ARGBToI400(const uint8* src_argb, int src_stride_argb, + uint8* dst_y, int dst_stride_y, + int width, int height); + +// Convert ARGB To NV12. +LIBYUV_API +int ARGBToNV12(const uint8* src_argb, int src_stride_argb, + uint8* dst_y, int dst_stride_y, + uint8* dst_uv, int dst_stride_uv, + int width, int height); + +// Convert ARGB To NV21. +LIBYUV_API +int ARGBToNV21(const uint8* src_argb, int src_stride_argb, + uint8* dst_y, int dst_stride_y, + uint8* dst_vu, int dst_stride_vu, + int width, int height); + +// Convert ARGB To NV21. +LIBYUV_API +int ARGBToNV21(const uint8* src_argb, int src_stride_argb, + uint8* dst_y, int dst_stride_y, + uint8* dst_vu, int dst_stride_vu, + int width, int height); + +// Convert ARGB To YUY2. +LIBYUV_API +int ARGBToYUY2(const uint8* src_argb, int src_stride_argb, + uint8* dst_yuy2, int dst_stride_yuy2, + int width, int height); + +// Convert ARGB To UYVY. +LIBYUV_API +int ARGBToUYVY(const uint8* src_argb, int src_stride_argb, + uint8* dst_uyvy, int dst_stride_uyvy, + int width, int height); + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_ NOLINT diff --git a/chromium/third_party/libyuv/include/libyuv/cpu_id.h b/chromium/third_party/libyuv/include/libyuv/cpu_id.h index 8b6d043222b..79da994c744 100644 --- a/chromium/third_party/libyuv/include/libyuv/cpu_id.h +++ b/chromium/third_party/libyuv/include/libyuv/cpu_id.h @@ -18,6 +18,7 @@ namespace libyuv { extern "C" { #endif +// TODO(fbarchard): Consider overlapping bits for different architectures. // Internal flag to indicate cpuid requires initialization. static const int kCpuInit = 0x1; @@ -35,11 +36,13 @@ static const int kCpuHasSSE42 = 0x100; static const int kCpuHasAVX = 0x200; static const int kCpuHasAVX2 = 0x400; static const int kCpuHasERMS = 0x800; +static const int kCpuHasFMA3 = 0x1000; +// 0x2000, 0x4000, 0x8000 reserved for future X86 flags. // These flags are only valid on MIPS processors. -static const int kCpuHasMIPS = 0x1000; -static const int kCpuHasMIPS_DSP = 0x2000; -static const int kCpuHasMIPS_DSPR2 = 0x4000; +static const int kCpuHasMIPS = 0x10000; +static const int kCpuHasMIPS_DSP = 0x20000; +static const int kCpuHasMIPS_DSPR2 = 0x40000; // Internal function used to auto-init. LIBYUV_API @@ -65,8 +68,10 @@ LIBYUV_API void MaskCpuFlags(int enable_flags); // Low level cpuid for X86. Returns zeros on other CPUs. +// eax is the info type that you want. +// ecx is typically the cpu number, and should normally be zero. LIBYUV_API -void CpuId(int cpu_info[4], int info_type); +void CpuId(uint32 eax, uint32 ecx, uint32* cpu_info); #ifdef __cplusplus } // extern "C" diff --git a/chromium/third_party/libyuv/include/libyuv/mjpeg_decoder.h b/chromium/third_party/libyuv/include/libyuv/mjpeg_decoder.h index e53c1fe1e2e..7bb82fce146 100644 --- a/chromium/third_party/libyuv/include/libyuv/mjpeg_decoder.h +++ b/chromium/third_party/libyuv/include/libyuv/mjpeg_decoder.h @@ -45,7 +45,7 @@ struct SetJmpErrorMgr; // MJPEG frames. 
// // See http://tools.ietf.org/html/rfc2435 -class MJpegDecoder { +class LIBYUV_API MJpegDecoder { public: typedef void (*CallbackFunction)(void* opaque, const uint8* const* data, diff --git a/chromium/third_party/libyuv/include/libyuv/planar_functions.h b/chromium/third_party/libyuv/include/libyuv/planar_functions.h index cb14678a8b3..1d54ddec147 100644 --- a/chromium/third_party/libyuv/include/libyuv/planar_functions.h +++ b/chromium/third_party/libyuv/include/libyuv/planar_functions.h @@ -72,6 +72,7 @@ int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2, int width, int height); // Convert UYVY to I422. +LIBYUV_API int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy, uint8* dst_y, int dst_stride_y, uint8* dst_u, int dst_stride_u, @@ -187,14 +188,27 @@ int ARGBSepia(uint8* dst_argb, int dst_stride_argb, int x, int y, int width, int height); // Apply a matrix rotation to each ARGB pixel. +// matrix_argb is 4 signed ARGB values. -128 to 127 representing -2 to 2. +// The first 4 coefficients apply to B, G, R, A and produce B of the output. +// The next 4 coefficients apply to B, G, R, A and produce G of the output. +// The next 4 coefficients apply to B, G, R, A and produce R of the output. +// The last 4 coefficients apply to B, G, R, A and produce A of the output. +LIBYUV_API +int ARGBColorMatrix(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + const int8* matrix_argb, + int width, int height); + +// Deprecated. Use ARGBColorMatrix instead. +// Apply a matrix rotation to each ARGB pixel. // matrix_argb is 3 signed ARGB values. -128 to 127 representing -1 to 1. // The first 4 coefficients apply to B, G, R, A and produce B of the output. // The next 4 coefficients apply to B, G, R, A and produce G of the output. // The last 4 coefficients apply to B, G, R, A and produce R of the output. LIBYUV_API -int ARGBColorMatrix(uint8* dst_argb, int dst_stride_argb, - const int8* matrix_argb, - int x, int y, int width, int height); +int RGBColorMatrix(uint8* dst_argb, int dst_stride_argb, + const int8* matrix_rgb, + int x, int y, int width, int height); // Apply a color table each ARGB pixel. // Table contains 256 ARGB values. @@ -203,6 +217,36 @@ int ARGBColorTable(uint8* dst_argb, int dst_stride_argb, const uint8* table_argb, int x, int y, int width, int height); +// Apply a color table each ARGB pixel but preserve destination alpha. +// Table contains 256 ARGB values. +LIBYUV_API +int RGBColorTable(uint8* dst_argb, int dst_stride_argb, + const uint8* table_argb, + int x, int y, int width, int height); + +// Apply a luma/color table each ARGB pixel but preserve destination alpha. +// Table contains 32768 values indexed by [Y][C] where 7 it 7 bit luma from +// RGB (YJ style) and C is an 8 bit color component (R, G or B). +LIBYUV_API +int ARGBLumaColorTable(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + const uint8* luma_rgb_table, + int width, int height); + +// Apply a 3 term polynomial to ARGB values. +// poly points to a 4x4 matrix. The first row is constants. The 2nd row is +// coefficients for b, g, r and a. The 3rd row is coefficients for b squared, +// g squared, r squared and a squared. The 4rd row is coefficients for b to +// the 3, g to the 3, r to the 3 and a to the 3. The values are summed and +// result clamped to 0 to 255. +// A polynomial approximation can be dirived using software such as 'R'. 
+ +LIBYUV_API +int ARGBPolynomial(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + const float* poly, + int width, int height); + // Quantize a rectangle of ARGB. Alpha unaffected. // scale is a 16 bit fractional fixed point scaler between 0 and 65535. // interval_size should be a value between 1 and 255. @@ -218,6 +262,18 @@ int ARGBCopy(const uint8* src_argb, int src_stride_argb, uint8* dst_argb, int dst_stride_argb, int width, int height); +// Copy ARGB to ARGB. +LIBYUV_API +int ARGBCopyAlpha(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Copy ARGB to ARGB. +LIBYUV_API +int ARGBCopyYToAlpha(const uint8* src_y, int src_stride_y, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + typedef void (*ARGBBlendRow)(const uint8* src_argb0, const uint8* src_argb1, uint8* dst_argb, int width); @@ -288,6 +344,7 @@ int MJPGToARGB(const uint8* sample, size_t sample_size, uint8* argb, int argb_stride, int w, int h, int dw, int dh); +// Internal function - do not call directly. // Computes table of cumulative sum for image where the value is the sum // of all values above and to the left of the entry. Used by ARGBBlur. LIBYUV_API @@ -296,8 +353,11 @@ int ARGBComputeCumulativeSum(const uint8* src_argb, int src_stride_argb, int width, int height); // Blur ARGB image. -// Caller should allocate dst_cumsum table of width * height * 16 bytes aligned -// to 16 byte boundary. +// dst_cumsum table of width * (height + 1) * 16 bytes aligned to +// 16 byte boundary. +// dst_stride32_cumsum is number of ints in a row (width * 4). +// radius is number of pixels around the center. e.g. 1 = 3x3. 2=5x5. +// Blur is optimized for radius of 5 (11x11) or less. LIBYUV_API int ARGBBlur(const uint8* src_argb, int src_stride_argb, uint8* dst_argb, int dst_stride_argb, @@ -347,6 +407,12 @@ int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra, uint8* dst_argb, int dst_stride_argb, const uint8* shuffler, int width, int height); +// Sobel ARGB effect with planar output. +LIBYUV_API +int ARGBSobelToPlane(const uint8* src_argb, int src_stride_argb, + uint8* dst_y, int dst_stride_y, + int width, int height); + // Sobel ARGB effect. LIBYUV_API int ARGBSobel(const uint8* src_argb, int src_stride_argb, diff --git a/chromium/third_party/libyuv/include/libyuv/row.h b/chromium/third_party/libyuv/include/libyuv/row.h index 3416661742f..b6056fdca9b 100644 --- a/chromium/third_party/libyuv/include/libyuv/row.h +++ b/chromium/third_party/libyuv/include/libyuv/row.h @@ -38,32 +38,66 @@ extern "C" { // The following are available on all x86 platforms: #if !defined(LIBYUV_DISABLE_X86) && \ (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) -// Conversions. 
+// Effects: +#define HAS_ARGBADDROW_SSE2 +#define HAS_ARGBAFFINEROW_SSE2 +#define HAS_ARGBATTENUATEROW_SSSE3 +#define HAS_ARGBBLENDROW_SSSE3 +#define HAS_ARGBCOLORMATRIXROW_SSSE3 +#define HAS_ARGBCOLORTABLEROW_X86 +#define HAS_ARGBCOPYALPHAROW_SSE2 +#define HAS_ARGBCOPYYTOALPHAROW_SSE2 +#define HAS_ARGBGRAYROW_SSSE3 +#define HAS_ARGBLUMACOLORTABLEROW_SSSE3 +#define HAS_ARGBMIRRORROW_SSSE3 +#define HAS_ARGBMULTIPLYROW_SSE2 +#define HAS_ARGBPOLYNOMIALROW_SSE2 +#define HAS_ARGBQUANTIZEROW_SSE2 +#define HAS_ARGBSEPIAROW_SSSE3 +#define HAS_ARGBSHADEROW_SSE2 +#define HAS_ARGBSUBTRACTROW_SSE2 +#define HAS_ARGBTOUVROW_SSSE3 +#define HAS_ARGBUNATTENUATEROW_SSE2 +#define HAS_COMPUTECUMULATIVESUMROW_SSE2 +#define HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 +#define HAS_INTERPOLATEROW_SSE2 +#define HAS_INTERPOLATEROW_SSSE3 +#define HAS_RGBCOLORTABLEROW_X86 +#define HAS_SOBELROW_SSE2 +#define HAS_SOBELTOPLANEROW_SSE2 +#define HAS_SOBELXROW_SSE2 +#define HAS_SOBELXYROW_SSE2 +#define HAS_SOBELYROW_SSE2 + +// Conversions: #define HAS_ABGRTOUVROW_SSSE3 #define HAS_ABGRTOYROW_SSSE3 #define HAS_ARGB1555TOARGBROW_SSE2 #define HAS_ARGB4444TOARGBROW_SSE2 +#define HAS_ARGBSHUFFLEROW_SSE2 #define HAS_ARGBSHUFFLEROW_SSSE3 #define HAS_ARGBTOARGB1555ROW_SSE2 #define HAS_ARGBTOARGB4444ROW_SSE2 +#define HAS_ARGBTOBAYERGGROW_SSE2 #define HAS_ARGBTOBAYERROW_SSSE3 #define HAS_ARGBTORAWROW_SSSE3 #define HAS_ARGBTORGB24ROW_SSSE3 #define HAS_ARGBTORGB565ROW_SSE2 #define HAS_ARGBTOUV422ROW_SSSE3 #define HAS_ARGBTOUV444ROW_SSSE3 -#define HAS_ARGBTOUVROW_SSSE3 #define HAS_ARGBTOUVJROW_SSSE3 -#define HAS_ARGBTOYROW_SSSE3 #define HAS_ARGBTOYJROW_SSSE3 +#define HAS_ARGBTOYROW_SSSE3 #define HAS_BGRATOUVROW_SSSE3 #define HAS_BGRATOYROW_SSSE3 +#define HAS_COPYROW_ERMS #define HAS_COPYROW_SSE2 #define HAS_COPYROW_X86 -#define HAS_COPYROW_ERMS +#define HAS_FIXEDDIV_X86 #define HAS_HALFROW_SSE2 #define HAS_I400TOARGBROW_SSE2 #define HAS_I411TOARGBROW_SSSE3 +#define HAS_I422TOARGB1555ROW_SSSE3 #define HAS_I422TOABGRROW_SSSE3 #define HAS_I422TOARGB1555ROW_SSSE3 #define HAS_I422TOARGB4444ROW_SSSE3 @@ -77,7 +111,9 @@ extern "C" { #define HAS_I422TOYUY2ROW_SSE2 #define HAS_I444TOARGBROW_SSSE3 #define HAS_MERGEUVROW_SSE2 +#define HAS_MIRRORROW_SSE2 #define HAS_MIRRORROW_SSSE3 +#define HAS_MIRRORROW_UV_SSSE3 #define HAS_MIRRORUVROW_SSSE3 #define HAS_NV12TOARGBROW_SSSE3 #define HAS_NV12TORGB565ROW_SSSE3 @@ -101,43 +137,48 @@ extern "C" { #define HAS_YUY2TOUV422ROW_SSE2 #define HAS_YUY2TOUVROW_SSE2 #define HAS_YUY2TOYROW_SSE2 - -// Effects -#define HAS_ARGBADDROW_SSE2 -#define HAS_ARGBAFFINEROW_SSE2 -#define HAS_ARGBATTENUATEROW_SSSE3 -#define HAS_ARGBBLENDROW_SSSE3 -#define HAS_ARGBCOLORMATRIXROW_SSSE3 -#define HAS_ARGBGRAYROW_SSSE3 -#define HAS_ARGBMIRRORROW_SSSE3 -#define HAS_ARGBMULTIPLYROW_SSE2 -#define HAS_ARGBQUANTIZEROW_SSE2 -#define HAS_ARGBSEPIAROW_SSSE3 -#define HAS_ARGBSHADEROW_SSE2 -#define HAS_ARGBSUBTRACTROW_SSE2 -#define HAS_ARGBUNATTENUATEROW_SSE2 -#define HAS_COMPUTECUMULATIVESUMROW_SSE2 -#define HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 -#define HAS_INTERPOLATEROW_SSE2 -#define HAS_INTERPOLATEROW_SSSE3 -#define HAS_SOBELROW_SSE2 -#define HAS_SOBELXROW_SSSE3 -#define HAS_SOBELXYROW_SSE2 -#define HAS_SOBELYROW_SSSE3 #endif -// The following are Windows only. -// TODO(fbarchard): Port to gcc. -#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) -#define HAS_ARGBCOLORTABLEROW_X86 +// GCC >= 4.7.0 required for AVX2. 
+#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__)) +#if (__GNUC__ > 4) || (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7)) +#define GCC_HAS_AVX2 1 +#endif // GNUC >= 4.7 +#endif // __GNUC__ + +// clang >= 3.4.0 required for AVX2. +#if defined(__clang__) && (defined(__x86_64__) || defined(__i386__)) +#if (__clang_major__ > 3) || (__clang_major__ == 3 && (__clang_minor__ >= 4)) +#define CLANG_HAS_AVX2 1 +#endif // clang >= 3.4 +#endif // __clang__ + // Visual C 2012 required for AVX2. -#if _MSC_VER >= 1700 +#if defined(_M_IX86) && defined(_MSC_VER) && _MSC_VER >= 1700 +#define VISUALC_HAS_AVX2 1 +#endif // VisualStudio >= 2012 + +// The following are available on all x86 platforms, but +// require VS2012, clang 3.4 or gcc 4.7. +// The code supports NaCL but requires a new compiler and validator. +#if !defined(LIBYUV_DISABLE_X86) && (defined(VISUALC_HAS_AVX2) || \ + defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2)) +// Effects: +#define HAS_ARGBPOLYNOMIALROW_AVX2 #define HAS_ARGBSHUFFLEROW_AVX2 +#define HAS_ARGBCOPYALPHAROW_AVX2 +#define HAS_ARGBCOPYYTOALPHAROW_AVX2 +#endif + +// The following are require VS2012. +// TODO(fbarchard): Port to gcc. +#if !defined(LIBYUV_DISABLE_X86) && defined(VISUALC_HAS_AVX2) #define HAS_ARGBTOUVROW_AVX2 #define HAS_ARGBTOYJROW_AVX2 #define HAS_ARGBTOYROW_AVX2 #define HAS_HALFROW_AVX2 #define HAS_I422TOARGBROW_AVX2 +#define HAS_INTERPOLATEROW_AVX2 #define HAS_MERGEUVROW_AVX2 #define HAS_MIRRORROW_AVX2 #define HAS_SPLITUVROW_AVX2 @@ -148,17 +189,16 @@ extern "C" { #define HAS_YUY2TOUVROW_AVX2 #define HAS_YUY2TOYROW_AVX2 -// Effects +// Effects: #define HAS_ARGBADDROW_AVX2 #define HAS_ARGBATTENUATEROW_AVX2 #define HAS_ARGBMIRRORROW_AVX2 #define HAS_ARGBMULTIPLYROW_AVX2 #define HAS_ARGBSUBTRACTROW_AVX2 #define HAS_ARGBUNATTENUATEROW_AVX2 -#endif -#endif +#endif // defined(VISUALC_HAS_AVX2) -// The following are Yasm x86 only. +// The following are Yasm x86 only: // TODO(fbarchard): Port AVX2 to inline. #if !defined(LIBYUV_DISABLE_X86) && defined(HAVE_YASM) (defined(_M_IX86) || defined(_M_X64) || \ @@ -177,12 +217,12 @@ extern "C" { #if !defined(LIBYUV_DISABLE_X86) && \ (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) && \ !defined(LIBYUV_SSSE3_ONLY) -#define HAS_ARGBATTENUATEROW_SSE2 #define HAS_ARGBBLENDROW_SSE2 +#define HAS_ARGBATTENUATEROW_SSE2 #define HAS_MIRRORROW_SSE2 #endif -// The following are available on Neon platforms +// The following are available on Neon platforms: #if !defined(LIBYUV_DISABLE_NEON) && \ (defined(__ARM_NEON__) || defined(LIBYUV_NEON)) #define HAS_ABGRTOUVROW_NEON @@ -255,7 +295,7 @@ extern "C" { #define HAS_YUY2TOUVROW_NEON #define HAS_YUY2TOYROW_NEON -// Effects +// Effects: #define HAS_ARGBADDROW_NEON #define HAS_ARGBATTENUATEROW_NEON #define HAS_ARGBBLENDROW_NEON @@ -268,13 +308,14 @@ extern "C" { #define HAS_ARGBSHADEROW_NEON #define HAS_ARGBSUBTRACTROW_NEON #define HAS_SOBELROW_NEON +#define HAS_SOBELTOPLANEROW_NEON #define HAS_SOBELXYROW_NEON #define HAS_SOBELXROW_NEON #define HAS_SOBELYROW_NEON #define HAS_INTERPOLATEROW_NEON #endif -// The following are available on Mips platforms +// The following are available on Mips platforms: #if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips__) #define HAS_COPYROW_MIPS #if defined(__mips_dsp) && (__mips_dsp_rev >= 2) @@ -304,6 +345,7 @@ typedef __declspec(align(32)) uint32 ulvec32[8]; typedef __declspec(align(32)) uint8 ulvec8[32]; #elif defined(__GNUC__) +// Caveat GCC 4.2 to 4.7 have a known issue using vectors with const. 
#define SIMD_ALIGNED(var) var __attribute__((aligned(16))) typedef int16 __attribute__((vector_size(16))) vec16; typedef int32 __attribute__((vector_size(16))) vec32; @@ -327,6 +369,14 @@ typedef uint8 uvec8[16]; #define OMITFP __attribute__((optimize("omit-frame-pointer"))) #endif +// For functions that use rowbuffer and have runtime checks for overflow, +// use SAFEBUFFERS to avoid additional check. +#if defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219) +#define SAFEBUFFERS __declspec(safebuffers) +#else +#define SAFEBUFFERS +#endif + void I444ToARGBRow_NEON(const uint8* src_y, const uint8* src_u, const uint8* src_v, @@ -655,6 +705,14 @@ void CopyRow_NEON(const uint8* src, uint8* dst, int count); void CopyRow_MIPS(const uint8* src, uint8* dst, int count); void CopyRow_C(const uint8* src, uint8* dst, int count); +void ARGBCopyAlphaRow_C(const uint8* src_argb, uint8* dst_argb, int width); +void ARGBCopyAlphaRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width); +void ARGBCopyAlphaRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width); + +void ARGBCopyYToAlphaRow_C(const uint8* src_y, uint8* dst_argb, int width); +void ARGBCopyYToAlphaRow_SSE2(const uint8* src_y, uint8* dst_argb, int width); +void ARGBCopyYToAlphaRow_AVX2(const uint8* src_y, uint8* dst_argb, int width); + void SetRow_X86(uint8* dst, uint32 v32, int count); void ARGBSetRows_X86(uint8* dst, uint32 v32, int width, int dst_stride, int height); @@ -668,6 +726,8 @@ void ARGBSetRows_C(uint8* dst, uint32 v32, int width, int dst_stride, // ARGBShufflers for BGRAToARGB etc. void ARGBShuffleRow_C(const uint8* src_argb, uint8* dst_argb, const uint8* shuffler, int pix); +void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb, + const uint8* shuffler, int pix); void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb, const uint8* shuffler, int pix); void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb, @@ -676,6 +736,8 @@ void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb, const uint8* shuffler, int pix); void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb, const uint8* shuffler, int pix); +void ARGBShuffleRow_Any_SSE2(const uint8* src_argb, uint8* dst_argb, + const uint8* shuffler, int pix); void ARGBShuffleRow_Any_SSSE3(const uint8* src_argb, uint8* dst_argb, const uint8* shuffler, int pix); void ARGBShuffleRow_Any_AVX2(const uint8* src_argb, uint8* dst_argb, @@ -1338,8 +1400,16 @@ void ARGBToBayerRow_Any_SSSE3(const uint8* src_argb, uint8* dst_bayer, uint32 selector, int pix); void ARGBToBayerRow_Any_NEON(const uint8* src_argb, uint8* dst_bayer, uint32 selector, int pix); +void ARGBToBayerGGRow_C(const uint8* src_argb, uint8* dst_bayer, + uint32 /* selector */, int pix); +void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer, + uint32 /* selector */, int pix); void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer, uint32 /* selector */, int pix); +void ARGBToBayerGGRow_Any_SSE2(const uint8* src_argb, uint8* dst_bayer, + uint32 /* selector */, int pix); +void ARGBToBayerGGRow_Any_NEON(const uint8* src_argb, uint8* dst_bayer, + uint32 /* selector */, int pix); void I422ToYUY2Row_C(const uint8* src_y, const uint8* src_u, @@ -1398,7 +1468,7 @@ void ARGBAttenuateRow_Any_NEON(const uint8* src_argb, uint8* dst_argb, int width); // Inverse table for unattenuate, shared by C and SSE2. 
-extern uint32 fixed_invtbl8[256]; +extern const uint32 fixed_invtbl8[256]; void ARGBUnattenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width); void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width); void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width); @@ -1415,15 +1485,19 @@ void ARGBSepiaRow_C(uint8* dst_argb, int width); void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width); void ARGBSepiaRow_NEON(uint8* dst_argb, int width); -void ARGBColorMatrixRow_C(uint8* dst_argb, const int8* matrix_argb, int width); -void ARGBColorMatrixRow_SSSE3(uint8* dst_argb, const int8* matrix_argb, - int width); -void ARGBColorMatrixRow_NEON(uint8* dst_argb, const int8* matrix_argb, - int width); +void ARGBColorMatrixRow_C(const uint8* src_argb, uint8* dst_argb, + const int8* matrix_argb, int width); +void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb, + const int8* matrix_argb, int width); +void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb, + const int8* matrix_argb, int width); void ARGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width); void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width); +void RGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width); +void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width); + void ARGBQuantizeRow_C(uint8* dst_argb, int scale, int interval_size, int interval_offset, int width); void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size, @@ -1466,6 +1540,9 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr, void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, ptrdiff_t src_stride_ptr, int width, int source_y_fraction); +void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride_ptr, int width, + int source_y_fraction); void InterpolateRow_NEON(uint8* dst_ptr, const uint8* src_ptr, ptrdiff_t src_stride_ptr, int width, int source_y_fraction); @@ -1487,6 +1564,9 @@ void InterpolateRow_Any_SSE2(uint8* dst_ptr, const uint8* src_ptr, void InterpolateRow_Any_SSSE3(uint8* dst_ptr, const uint8* src_ptr, ptrdiff_t src_stride_ptr, int width, int source_y_fraction); +void InterpolateRow_Any_AVX2(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride_ptr, int width, + int source_y_fraction); void InterpolateRows_Any_MIPS_DSPR2(uint8* dst_ptr, const uint8* src_ptr, ptrdiff_t src_stride_ptr, int width, int source_y_fraction); @@ -1494,14 +1574,14 @@ void InterpolateRows_Any_MIPS_DSPR2(uint8* dst_ptr, const uint8* src_ptr, // Sobel images. 
void SobelXRow_C(const uint8* src_y0, const uint8* src_y1, const uint8* src_y2, uint8* dst_sobelx, int width); -void SobelXRow_SSSE3(const uint8* src_y0, const uint8* src_y1, - const uint8* src_y2, uint8* dst_sobelx, int width); +void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1, + const uint8* src_y2, uint8* dst_sobelx, int width); void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1, const uint8* src_y2, uint8* dst_sobelx, int width); void SobelYRow_C(const uint8* src_y0, const uint8* src_y1, uint8* dst_sobely, int width); -void SobelYRow_SSSE3(const uint8* src_y0, const uint8* src_y1, - uint8* dst_sobely, int width); +void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1, + uint8* dst_sobely, int width); void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1, uint8* dst_sobely, int width); void SobelRow_C(const uint8* src_sobelx, const uint8* src_sobely, @@ -1510,6 +1590,12 @@ void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, uint8* dst_argb, int width); void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, uint8* dst_argb, int width); +void SobelToPlaneRow_C(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_y, int width); +void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_y, int width); +void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_y, int width); void SobelXYRow_C(const uint8* src_sobelx, const uint8* src_sobely, uint8* dst_argb, int width); void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, @@ -1517,6 +1603,31 @@ void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, uint8* dst_argb, int width); +void ARGBPolynomialRow_C(const uint8* src_argb, + uint8* dst_argb, const float* poly, + int width); +void ARGBPolynomialRow_SSE2(const uint8* src_argb, + uint8* dst_argb, const float* poly, + int width); +void ARGBPolynomialRow_AVX2(const uint8* src_argb, + uint8* dst_argb, const float* poly, + int width); + +void ARGBLumaColorTableRow_C(const uint8* src_argb, uint8* dst_argb, int width, + const uint8* luma, const uint32 lumacoeff); +void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb, + int width, const uint8* luma, + const uint32 lumacoeff); + +// Divide num by div and return as 16.16 fixed point result. +int FixedDiv_C(int num, int div); +int FixedDiv_X86(int num, int div); +#ifdef HAS_FIXEDDIV_X86 +#define FixedDiv FixedDiv_X86 +#else +#define FixedDiv FixedDiv_C +#endif + #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/chromium/third_party/libyuv/include/libyuv/scale.h b/chromium/third_party/libyuv/include/libyuv/scale.h index b1efc95d2fd..b672dbfcee8 100644 --- a/chromium/third_party/libyuv/include/libyuv/scale.h +++ b/chromium/third_party/libyuv/include/libyuv/scale.h @@ -18,11 +18,12 @@ namespace libyuv { extern "C" { #endif -// Supported filtering +// Supported filtering. enum FilterMode { kFilterNone = 0, // Point sample; Fastest. - kFilterBilinear = 1, // Faster than box, but lower quality scaling down. - kFilterBox = 2 // Highest quality. + kFilterLinear = 1, // Filter horizontally only. + kFilterBilinear = 2, // Faster than box, but lower quality scaling down. + kFilterBox = 3 // Highest quality. }; // Scale a YUV plane. 
diff --git a/chromium/third_party/libyuv/include/libyuv/scale_argb.h b/chromium/third_party/libyuv/include/libyuv/scale_argb.h index b6f510522e7..0c9b3625757 100644 --- a/chromium/third_party/libyuv/include/libyuv/scale_argb.h +++ b/chromium/third_party/libyuv/include/libyuv/scale_argb.h @@ -35,6 +35,20 @@ int ARGBScaleClip(const uint8* src_argb, int src_stride_argb, int clip_x, int clip_y, int clip_width, int clip_height, enum FilterMode filtering); +// TODO(fbarchard): Implement this. +// Scale with YUV conversion to ARGB and clipping. +LIBYUV_API +int YUVToARGBScaleClip(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint32 src_fourcc, + int src_width, int src_height, + uint8* dst_argb, int dst_stride_argb, + uint32 dst_fourcc, + int dst_width, int dst_height, + int clip_x, int clip_y, int clip_width, int clip_height, + enum FilterMode filtering); + #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/chromium/third_party/libyuv/include/libyuv/scale_row.h b/chromium/third_party/libyuv/include/libyuv/scale_row.h new file mode 100644 index 00000000000..23c4e90791f --- /dev/null +++ b/chromium/third_party/libyuv/include/libyuv/scale_row.h @@ -0,0 +1,273 @@ +/* + * Copyright 2013 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef INCLUDE_LIBYUV_SCALE_ROW_H_ // NOLINT +#define INCLUDE_LIBYUV_SCALE_ROW_H_ + +#include "libyuv/basic_types.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// The following are available on all x86 platforms: +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) +#define HAS_SCALEROWDOWN2_SSE2 +#define HAS_SCALEROWDOWN4_SSE2 +#define HAS_SCALEROWDOWN34_SSSE3 +#define HAS_SCALEROWDOWN38_SSSE3 +#define HAS_SCALEADDROWS_SSE2 +#define HAS_SCALEFILTERCOLS_SSSE3 +#define HAS_SCALECOLSUP2_SSE2 +#define HAS_SCALEARGBROWDOWN2_SSE2 +#define HAS_SCALEARGBROWDOWNEVEN_SSE2 +#define HAS_SCALEARGBCOLS_SSE2 +#define HAS_SCALEARGBFILTERCOLS_SSSE3 +#define HAS_SCALEARGBCOLSUP2_SSE2 +#endif + +// The following are available on Neon platforms: +#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \ + (defined(__ARM_NEON__) || defined(LIBYUV_NEON)) +#define HAS_SCALEROWDOWN2_NEON +#define HAS_SCALEROWDOWN4_NEON +#define HAS_SCALEROWDOWN34_NEON +#define HAS_SCALEROWDOWN38_NEON +#define HAS_SCALEARGBROWDOWNEVEN_NEON +#define HAS_SCALEARGBROWDOWN2_NEON +#endif + +// The following are available on Mips platforms: +#if !defined(LIBYUV_DISABLE_MIPS) && !defined(__native_client__) && \ + defined(__mips__) +#define HAS_SCALEROWDOWN2_MIPS_DSPR2 +#define HAS_SCALEROWDOWN4_MIPS_DSPR2 +#define HAS_SCALEROWDOWN34_MIPS_DSPR2 +#define HAS_SCALEROWDOWN38_MIPS_DSPR2 +#endif + +// Scale ARGB vertically with bilinear interpolation. +void ScalePlaneVertical(int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_argb, uint8* dst_argb, + int x, int y, int dy, + int bpp, FilterMode filtering); + +// Simplify the filtering based on scale factors. 
+FilterMode ScaleFilterReduce(int src_width, int src_height, + int dst_width, int dst_height, + FilterMode filtering); + +// Compute slope values for stepping. +void ScaleSlope(int src_width, int src_height, + int dst_width, int dst_height, + FilterMode filtering, + int* x, int* y, int* dx, int* dy); + +void ScaleRowDown2_C(const uint8* src_ptr, ptrdiff_t /* src_stride */, + uint8* dst, int dst_width); +void ScaleRowDown2Linear_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); +void ScaleRowDown2Box_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); +void ScaleRowDown4_C(const uint8* src_ptr, ptrdiff_t /* src_stride */, + uint8* dst, int dst_width); +void ScaleRowDown4Box_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); +void ScaleRowDown34_C(const uint8* src_ptr, ptrdiff_t /* src_stride */, + uint8* dst, int dst_width); +void ScaleRowDown34_0_Box_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* d, int dst_width); +void ScaleRowDown34_1_Box_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* d, int dst_width); +void ScaleCols_C(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int x, int dx); +void ScaleColsUp2_C(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int, int); +void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int x, int dx); +void ScaleRowDown38_C(const uint8* src_ptr, ptrdiff_t /* src_stride */, + uint8* dst, int dst_width); +void ScaleRowDown38_3_Box_C(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown38_2_Box_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleAddRows_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint16* dst_ptr, int src_width, int src_height); +void ScaleARGBRowDown2_C(const uint8* src_argb, + ptrdiff_t /* src_stride */, + uint8* dst_argb, int dst_width); +void ScaleARGBRowDown2Linear_C(const uint8* src_argb, + ptrdiff_t /* src_stride */, + uint8* dst_argb, int dst_width); +void ScaleARGBRowDown2Box_C(const uint8* src_argb, ptrdiff_t src_stride, + uint8* dst_argb, int dst_width); +void ScaleARGBRowDownEven_C(const uint8* src_argb, ptrdiff_t /* src_stride */, + int src_stepx, + uint8* dst_argb, int dst_width); +void ScaleARGBRowDownEvenBox_C(const uint8* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8* dst_argb, int dst_width); +void ScaleARGBCols_C(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx); +void ScaleARGBColsUp2_C(uint8* dst_argb, const uint8* src_argb, + int dst_width, int, int); +void ScaleARGBFilterCols_C(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx); + +void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr, ptrdiff_t, + uint8* dst_ptr, int dst_width); +void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + 
uint8* dst_ptr, int dst_width); +void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint16* dst_ptr, int src_width, + int src_height); +void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int x, int dx); +void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int /* x */, int /* dx */); +void ScaleARGBRowDown2_SSE2(const uint8* src_argb, + ptrdiff_t /* src_stride */, + uint8* dst_argb, int dst_width); +void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb, + ptrdiff_t /* src_stride */, + uint8* dst_argb, int dst_width); +void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb, + ptrdiff_t src_stride, + uint8* dst_argb, int dst_width); +void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride, + int src_stepx, + uint8* dst_argb, int dst_width); +void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8* dst_argb, int dst_width); +void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx); +void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx); +void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb, + int dst_width, int /* x */, int /* dx */); +// Row functions. +void ScaleARGBRowDownEven_NEON(const uint8* src_argb, int src_stride, + int src_stepx, + uint8* dst_argb, int dst_width); +void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, int src_stride, + int src_stepx, + uint8* dst_argb, int dst_width); +void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */, + uint8* dst, int dst_width); +void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); + +// ScaleRowDown2Box also used by planar functions +// NEON downscalers with interpolation. + +// Note - not static due to reuse in convert for 444 to 420. +void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */, + uint8* dst, int dst_width); + +void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); + +void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */, + uint8* dst_ptr, int dst_width); +void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); + +// Down scale from 4 to 3 pixels. Use the neon multilane read/write +// to load up the every 4th pixel into a 4 different registers. +// Point samples 32 pixels to 24 pixels. 
+void ScaleRowDown34_NEON(const uint8* src_ptr, + ptrdiff_t /* src_stride */, + uint8* dst_ptr, int dst_width); +void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); + +// 32 -> 12 +void ScaleRowDown38_NEON(const uint8* src_ptr, + ptrdiff_t /* src_stride */, + uint8* dst_ptr, int dst_width); +// 32x3 -> 12x1 +void ScaleRowDown38_3_Box_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +// 32x2 -> 12x1 +void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); + +void ScaleRowDown2_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */, + uint8* dst, int dst_width); +void ScaleRowDown2Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); +void ScaleRowDown4_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */, + uint8* dst, int dst_width); +void ScaleRowDown4Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); +void ScaleRowDown34_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */, + uint8* dst, int dst_width); +void ScaleRowDown34_0_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* d, int dst_width); +void ScaleRowDown34_1_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* d, int dst_width); +void ScaleRowDown38_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */, + uint8* dst, int dst_width); +void ScaleRowDown38_2_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown38_3_Box_MIPS_DSPR2(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // INCLUDE_LIBYUV_SCALE_ROW_H_ NOLINT diff --git a/chromium/third_party/libyuv/include/libyuv/version.h b/chromium/third_party/libyuv/include/libyuv/version.h index 31cf78fc591..3bb834f9448 100644 --- a/chromium/third_party/libyuv/include/libyuv/version.h +++ b/chromium/third_party/libyuv/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 723 +#define LIBYUV_VERSION 911 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/chromium/third_party/libyuv/libyuv.gyp b/chromium/third_party/libyuv/libyuv.gyp index ad6b78b5c3e..4130bd0d3f0 100644 --- a/chromium/third_party/libyuv/libyuv.gyp +++ b/chromium/third_party/libyuv/libyuv.gyp @@ -74,6 +74,7 @@ 'include/libyuv/row.h', 'include/libyuv/scale.h', 'include/libyuv/scale_argb.h', + 'include/libyuv/scale_row.h', 'include/libyuv/version.h', 'include/libyuv/video_common.h', @@ -107,9 +108,11 @@ 'source/row_win.cc', 'source/scale.cc', 'source/scale_argb.cc', - 'source/scale_argb_neon.cc', + 'source/scale_common.cc', 'source/scale_mips.cc', 'source/scale_neon.cc', + 'source/scale_posix.cc', + 'source/scale_win.cc', 'source/video_common.cc', ], }, diff --git a/chromium/third_party/libyuv/libyuv_test.gyp b/chromium/third_party/libyuv/libyuv_test.gyp index 447881a4480..906fc5f8b0d 100644 --- a/chromium/third_party/libyuv/libyuv_test.gyp +++ b/chromium/third_party/libyuv/libyuv_test.gyp @@ -35,6 +35,7 @@ 'unit_test/compare_test.cc', 'unit_test/convert_test.cc', 'unit_test/cpu_test.cc', + 'unit_test/math_test.cc', 'unit_test/planar_test.cc', 'unit_test/rotate_argb_test.cc', 
'unit_test/rotate_test.cc', diff --git a/chromium/third_party/libyuv/linux.mk b/chromium/third_party/libyuv/linux.mk new file mode 100644 index 00000000000..5d12135a85d --- /dev/null +++ b/chromium/third_party/libyuv/linux.mk @@ -0,0 +1,48 @@ +# This is a generic makefile for libyuv for gcc. +# make -f linux.mk CC=clang++ + +CC=g++ +CCFLAGS=-O2 -fomit-frame-pointer -Iinclude/ + +LOCAL_OBJ_FILES := \ + source/compare.o \ + source/compare_common.o \ + source/compare_posix.o \ + source/convert.o \ + source/convert_argb.o \ + source/convert_from.o \ + source/convert_from_argb.o \ + source/convert_to_argb.o \ + source/convert_to_i420.o \ + source/cpu_id.o \ + source/format_conversion.o \ + source/planar_functions.o \ + source/rotate.o \ + source/rotate_argb.o \ + source/rotate_mips.o \ + source/row_any.o \ + source/row_common.o \ + source/row_mips.o \ + source/row_posix.o \ + source/scale.o \ + source/scale_argb.o \ + source/scale_common.o \ + source/scale_mips.o \ + source/scale_posix.o \ + source/video_common.o + +.cc.o: + $(CC) -c $(CCFLAGS) $*.cc -o $*.o + +all: libyuv.a convert linux.mk + +libyuv.a: $(LOCAL_OBJ_FILES) linux.mk + $(AR) $(ARFLAGS) -o $@ $(LOCAL_OBJ_FILES) + +# A test utility that uses libyuv conversion. +convert: util/convert.cc linux.mk + $(CC) $(CCFLAGS) -Iutil/ -o $@ util/convert.cc libyuv.a + +clean: + /bin/rm -f source/*.o *.ii *.s libyuv.a convert + diff --git a/chromium/third_party/libyuv/source/compare.cc b/chromium/third_party/libyuv/source/compare.cc index f8b358309e5..7d844ee08a6 100644 --- a/chromium/third_party/libyuv/source/compare.cc +++ b/chromium/third_party/libyuv/source/compare.cc @@ -30,12 +30,17 @@ extern "C" { uint32 HashDjb2_C(const uint8* src, int count, uint32 seed); // This module is for Visual C x86 -#if !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || \ +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(_M_IX86) || \ (defined(__x86_64__) || (defined(__i386__) && !defined(__pic__)))) #define HAS_HASHDJB2_SSE41 - uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed); +#if _MSC_VER >= 1700 +#define HAS_HASHDJB2_AVX2 +uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed); +#endif + #endif // HAS_HASHDJB2_SSE41 // hash seed of 5381 recommended. @@ -47,6 +52,11 @@ uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed) { HashDjb2_SSE = HashDjb2_SSE41; } #endif +#if defined(HAS_HASHDJB2_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + HashDjb2_SSE = HashDjb2_AVX2; + } +#endif const int kBlockSize = 1 << 15; // 32768; while (count >= static_cast<uint64>(kBlockSize)) { @@ -73,8 +83,8 @@ uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count); #define HAS_SUMSQUAREERROR_NEON uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count); #endif -#if !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || \ - defined(__x86_64__) || defined(__i386__)) +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) #define HAS_SUMSQUAREERROR_SSE2 uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count); #endif @@ -138,7 +148,9 @@ LIBYUV_API uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a, const uint8* src_b, int stride_b, int width, int height) { - if (stride_a == width && stride_b == width) { + // Coalesce rows. 
+ if (stride_a == width && + stride_b == width) { return ComputeSumSquareError(src_a, src_b, width * height); } uint32 (*SumSquareError)(const uint8* src_a, const uint8* src_b, int count) = diff --git a/chromium/third_party/libyuv/source/compare_common.cc b/chromium/third_party/libyuv/source/compare_common.cc index ab587d08171..3e4c77a67fe 100644 --- a/chromium/third_party/libyuv/source/compare_common.cc +++ b/chromium/third_party/libyuv/source/compare_common.cc @@ -1,40 +1,40 @@ -/*
- * Copyright 2012 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/basic_types.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count) {
-  uint32 sse = 0u;
-  for (int i = 0; i < count; ++i) {
-    int diff = src_a[i] - src_b[i];
-    sse += static_cast<uint32>(diff * diff);
-  }
-  return sse;
-}
-
-// hash seed of 5381 recommended.
-// Internal C version of HashDjb2 with int sized count for efficiency.
-uint32 HashDjb2_C(const uint8* src, int count, uint32 seed) {
-  uint32 hash = seed;
-  for (int i = 0; i < count; ++i) {
-    hash += (hash << 5) + src[i];
-  }
-  return hash;
-}
-
-#ifdef __cplusplus
-}  // extern "C"
-}  // namespace libyuv
-#endif
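HashDjb2_C above is the classic djb2 recurrence (hash = hash * 33 + byte, seed 5381). The SSE4.1 and AVX2 kernels later in this commit consume 16 bytes per step by multiplying the running hash by 33^16 (the kHash16x33 constant) and weighting the bytes with descending powers of 33 (kHashMul0..kHashMul3). The following is a standalone C++ sketch, not libyuv code; HashDjb2Scalar and HashDjb2Block16 are illustrative names showing that the two formulations agree when count is a multiple of 16.

// Standalone illustration (not libyuv code): the scalar djb2 recurrence and a
// 16-bytes-per-step formulation like the one the SIMD kernels implement.
#include <cstdint>
#include <cstdio>

static uint32_t HashDjb2Scalar(const uint8_t* src, int count, uint32_t seed) {
  uint32_t hash = seed;                      // seed of 5381 recommended.
  for (int i = 0; i < count; ++i) {
    hash = hash * 33u + src[i];              // same as (hash << 5) + hash + src[i].
  }
  return hash;
}

static uint32_t HashDjb2Block16(const uint8_t* src, int count, uint32_t seed) {
  // pow33[j] = 33^(15 - j) mod 2^32; the vector code keeps these in kHashMul0..3.
  uint32_t pow33[16];
  pow33[15] = 1u;
  for (int j = 14; j >= 0; --j) pow33[j] = pow33[j + 1] * 33u;
  const uint32_t k33p16 = pow33[0] * 33u;    // 33^16, i.e. the kHash16x33 constant.
  uint32_t hash = seed;
  for (int i = 0; i + 16 <= count; i += 16) {
    hash *= k33p16;                          // advance the running hash 16 steps.
    for (int j = 0; j < 16; ++j) hash += src[i + j] * pow33[j];
  }
  return hash;                               // tail bytes omitted for brevity.
}

int main() {
  uint8_t buf[64];
  for (int i = 0; i < 64; ++i) buf[i] = static_cast<uint8_t>(i * 7 + 3);
  std::printf("%u %u\n",
              static_cast<unsigned>(HashDjb2Scalar(buf, 64, 5381u)),
              static_cast<unsigned>(HashDjb2Block16(buf, 64, 5381u)));
  return 0;
}

Both calls print the same value, which is why the runtime dispatch in compare.cc can swap HashDjb2_C for HashDjb2_SSE41 or HashDjb2_AVX2 without changing results.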
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count) {
+  uint32 sse = 0u;
+  for (int i = 0; i < count; ++i) {
+    int diff = src_a[i] - src_b[i];
+    sse += static_cast<uint32>(diff * diff);
+  }
+  return sse;
+}
+
+// hash seed of 5381 recommended.
+// Internal C version of HashDjb2 with int sized count for efficiency.
+uint32 HashDjb2_C(const uint8* src, int count, uint32 seed) {
+  uint32 hash = seed;
+  for (int i = 0; i < count; ++i) {
+    hash += (hash << 5) + src[i];
+  }
+  return hash;
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
diff --git a/chromium/third_party/libyuv/source/compare_neon.cc b/chromium/third_party/libyuv/source/compare_neon.cc
index a4e77750631..c377c163474 100644
--- a/chromium/third_party/libyuv/source/compare_neon.cc
+++ b/chromium/third_party/libyuv/source/compare_neon.cc
@@ -1,61 +1,61 @@
-/*
- * Copyright 2012 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/basic_types.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)
-
-uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
- volatile uint32 sse;
- asm volatile (
- "vmov.u8 q8, #0 \n"
- "vmov.u8 q10, #0 \n"
- "vmov.u8 q9, #0 \n"
- "vmov.u8 q11, #0 \n"
-
- ".p2align 2 \n"
- "1: \n"
- "vld1.8 {q0}, [%0]! \n"
- "vld1.8 {q1}, [%1]! \n"
- "subs %2, %2, #16 \n"
- "vsubl.u8 q2, d0, d2 \n"
- "vsubl.u8 q3, d1, d3 \n"
- "vmlal.s16 q8, d4, d4 \n"
- "vmlal.s16 q9, d6, d6 \n"
- "vmlal.s16 q10, d5, d5 \n"
- "vmlal.s16 q11, d7, d7 \n"
- "bgt 1b \n"
-
- "vadd.u32 q8, q8, q9 \n"
- "vadd.u32 q10, q10, q11 \n"
- "vadd.u32 q11, q8, q10 \n"
- "vpaddl.u32 q1, q11 \n"
- "vadd.u64 d0, d2, d3 \n"
- "vmov.32 %3, d0[0] \n"
- : "+r"(src_a),
- "+r"(src_b),
- "+r"(count),
- "=r"(sse)
- :
- : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
- return sse;
-}
-
-#endif // __ARM_NEON__
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
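Every SumSquareError variant (the C version above, this NEON kernel, and the SSE2/AVX2 kernels below) performs the same reduction: the sum of squared byte differences between two buffers, which libyuv's MSE/PSNR helpers then normalize. A standalone sketch, not libyuv code; the 255 peak value and the PSNR formula are the usual 8-bit convention and are not part of this diff.

// Standalone sketch (not libyuv code): the reduction the SumSquareError
// kernels compute, and how a mean-squared-error / PSNR figure follows from it.
#include <cmath>
#include <cstdint>
#include <cstdio>

static uint64_t SumSquareError(const uint8_t* a, const uint8_t* b, int count) {
  uint64_t sse = 0;
  for (int i = 0; i < count; ++i) {
    const int diff = a[i] - b[i];
    sse += static_cast<uint64_t>(diff * diff);
  }
  return sse;
}

int main() {
  uint8_t x[256], y[256];
  for (int i = 0; i < 256; ++i) {
    x[i] = static_cast<uint8_t>(i);
    y[i] = static_cast<uint8_t>(i ^ 1);            // differ by +/-1 everywhere.
  }
  const uint64_t sse = SumSquareError(x, y, 256);
  const double mse = static_cast<double>(sse) / 256.0;
  const double psnr = 10.0 * std::log10(255.0 * 255.0 / mse);  // dB, mse > 0 here.
  std::printf("sse=%llu psnr=%.2f dB\n",
              static_cast<unsigned long long>(sse), psnr);
  return 0;
}

Note that the NEON and SSE2 kernels consume 16 bytes per iteration (32 for AVX2), so callers pass counts that are multiples of the vector width and let the dispatch code fall back to the C loop otherwise.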
+/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/basic_types.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) + +uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) { + volatile uint32 sse; + asm volatile ( + "vmov.u8 q8, #0 \n" + "vmov.u8 q10, #0 \n" + "vmov.u8 q9, #0 \n" + "vmov.u8 q11, #0 \n" + + ".p2align 2 \n" + "1: \n" + "vld1.8 {q0}, [%0]! \n" + "vld1.8 {q1}, [%1]! \n" + "subs %2, %2, #16 \n" + "vsubl.u8 q2, d0, d2 \n" + "vsubl.u8 q3, d1, d3 \n" + "vmlal.s16 q8, d4, d4 \n" + "vmlal.s16 q9, d6, d6 \n" + "vmlal.s16 q10, d5, d5 \n" + "vmlal.s16 q11, d7, d7 \n" + "bgt 1b \n" + + "vadd.u32 q8, q8, q9 \n" + "vadd.u32 q10, q10, q11 \n" + "vadd.u32 q11, q8, q10 \n" + "vpaddl.u32 q1, q11 \n" + "vadd.u64 d0, d2, d3 \n" + "vmov.32 %3, d0[0] \n" + : "+r"(src_a), + "+r"(src_b), + "+r"(count), + "=r"(sse) + : + : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"); + return sse; +} + +#endif // __ARM_NEON__ + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/chromium/third_party/libyuv/source/compare_posix.cc b/chromium/third_party/libyuv/source/compare_posix.cc index f24835d7714..1e0ba8fe156 100644 --- a/chromium/third_party/libyuv/source/compare_posix.cc +++ b/chromium/third_party/libyuv/source/compare_posix.cc @@ -1,164 +1,166 @@ -/*
- * Copyright 2012 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/basic_types.h"
-#include "libyuv/row.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
-
-uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
- uint32 sse;
- asm volatile (
- "pxor %%xmm0,%%xmm0 \n"
- "pxor %%xmm5,%%xmm5 \n"
- "sub %0,%1 \n"
- ".p2align 4 \n"
- "1: \n"
- "movdqa (%0),%%xmm1 \n"
- "movdqa (%0,%1,1),%%xmm2 \n"
- "lea 0x10(%0),%0 \n"
- "sub $0x10,%2 \n"
- "movdqa %%xmm1,%%xmm3 \n"
- "psubusb %%xmm2,%%xmm1 \n"
- "psubusb %%xmm3,%%xmm2 \n"
- "por %%xmm2,%%xmm1 \n"
- "movdqa %%xmm1,%%xmm2 \n"
- "punpcklbw %%xmm5,%%xmm1 \n"
- "punpckhbw %%xmm5,%%xmm2 \n"
- "pmaddwd %%xmm1,%%xmm1 \n"
- "pmaddwd %%xmm2,%%xmm2 \n"
- "paddd %%xmm1,%%xmm0 \n"
- "paddd %%xmm2,%%xmm0 \n"
- "jg 1b \n"
-
- "pshufd $0xee,%%xmm0,%%xmm1 \n"
- "paddd %%xmm1,%%xmm0 \n"
- "pshufd $0x1,%%xmm0,%%xmm1 \n"
- "paddd %%xmm1,%%xmm0 \n"
- "movd %%xmm0,%3 \n"
-
- : "+r"(src_a), // %0
- "+r"(src_b), // %1
- "+r"(count), // %2
- "=g"(sse) // %3
- :
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-#endif
- );
- return sse;
-}
-
-#endif // defined(__x86_64__) || defined(__i386__)
-
-#if !defined(LIBYUV_DISABLE_X86) && \
- (defined(__x86_64__) || (defined(__i386__) && !defined(__pic__)))
-// GCC 4.2 on OSX has link error when passing static or const to inline.
-// TODO(fbarchard): Use static const when gcc 4.2 support is dropped.
-#ifdef __APPLE__
-#define CONST
-#else
-#define CONST static const
-#endif
-#define HAS_HASHDJB2_SSE41
-CONST uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 }; // 33 ^ 16
-CONST uvec32 kHashMul0 = {
- 0x0c3525e1, // 33 ^ 15
- 0xa3476dc1, // 33 ^ 14
- 0x3b4039a1, // 33 ^ 13
- 0x4f5f0981, // 33 ^ 12
-};
-CONST uvec32 kHashMul1 = {
- 0x30f35d61, // 33 ^ 11
- 0x855cb541, // 33 ^ 10
- 0x040a9121, // 33 ^ 9
- 0x747c7101, // 33 ^ 8
-};
-CONST uvec32 kHashMul2 = {
- 0xec41d4e1, // 33 ^ 7
- 0x4cfa3cc1, // 33 ^ 6
- 0x025528a1, // 33 ^ 5
- 0x00121881, // 33 ^ 4
-};
-CONST uvec32 kHashMul3 = {
- 0x00008c61, // 33 ^ 3
- 0x00000441, // 33 ^ 2
- 0x00000021, // 33 ^ 1
- 0x00000001, // 33 ^ 0
-};
-
-uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
- uint32 hash;
- asm volatile (
- "movd %2,%%xmm0 \n"
- "pxor %%xmm7,%%xmm7 \n"
- "movdqa %4,%%xmm6 \n"
- ".p2align 4 \n"
- "1: \n"
- "movdqu (%0),%%xmm1 \n"
- "lea 0x10(%0),%0 \n"
- "pmulld %%xmm6,%%xmm0 \n"
- "movdqa %5,%%xmm5 \n"
- "movdqa %%xmm1,%%xmm2 \n"
- "punpcklbw %%xmm7,%%xmm2 \n"
- "movdqa %%xmm2,%%xmm3 \n"
- "punpcklwd %%xmm7,%%xmm3 \n"
- "pmulld %%xmm5,%%xmm3 \n"
- "movdqa %6,%%xmm5 \n"
- "movdqa %%xmm2,%%xmm4 \n"
- "punpckhwd %%xmm7,%%xmm4 \n"
- "pmulld %%xmm5,%%xmm4 \n"
- "movdqa %7,%%xmm5 \n"
- "punpckhbw %%xmm7,%%xmm1 \n"
- "movdqa %%xmm1,%%xmm2 \n"
- "punpcklwd %%xmm7,%%xmm2 \n"
- "pmulld %%xmm5,%%xmm2 \n"
- "movdqa %8,%%xmm5 \n"
- "punpckhwd %%xmm7,%%xmm1 \n"
- "pmulld %%xmm5,%%xmm1 \n"
- "paddd %%xmm4,%%xmm3 \n"
- "paddd %%xmm2,%%xmm1 \n"
- "sub $0x10,%1 \n"
- "paddd %%xmm3,%%xmm1 \n"
- "pshufd $0xe,%%xmm1,%%xmm2 \n"
- "paddd %%xmm2,%%xmm1 \n"
- "pshufd $0x1,%%xmm1,%%xmm2 \n"
- "paddd %%xmm2,%%xmm1 \n"
- "paddd %%xmm1,%%xmm0 \n"
- "jg 1b \n"
- "movd %%xmm0,%3 \n"
- : "+r"(src), // %0
- "+r"(count), // %1
- "+rm"(seed), // %2
- "=g"(hash) // %3
- : "m"(kHash16x33), // %4
- "m"(kHashMul0), // %5
- "m"(kHashMul1), // %6
- "m"(kHashMul2), // %7
- "m"(kHashMul3) // %8
- : "memory", "cc"
-#if defined(__SSE2__)
- , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-#endif
- );
- return hash;
-}
-#endif // defined(__x86_64__) || (defined(__i386__) && !defined(__pic__)))
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
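The rewritten compare_posix.cc in the hunk that follows introduces MEMACCESS and MEMLEA string-paste macros so the same inline-asm templates can emit Native Client's sandboxed addressing on x86-64 NaCl builds and plain addressing everywhere else. A standalone sketch, not libyuv code, that only prints what the assembled operand strings look like; the macro bodies are copied from the hunk below.

// Standalone sketch (not libyuv code): what the MEMACCESS/MEMLEA macros paste
// into the asm templates, e.g. "movdqa " MEMACCESS(0) ",%%xmm1 \n".
#include <cstdio>

#if defined(__native_client__) && defined(__x86_64__)
#define MEMACCESS(base) "%%nacl:(%%r15,%q" #base ")"
#define MEMLEA(offset, base) #offset "(%q" #base ")"
#else
#define MEMACCESS(base) "(%" #base ")"
#define MEMLEA(offset, base) #offset "(%" #base ")"
#endif

int main() {
  std::printf("MEMACCESS(0)    -> \"%s\"\n", MEMACCESS(0));
  std::printf("MEMLEA(0x10, 0) -> \"%s\"\n", MEMLEA(0x10, 0));
  return 0;
}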
+/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/basic_types.h" +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) + +#if defined(__native_client__) && defined(__x86_64__) +#define MEMACCESS(base) "%%nacl:(%%r15,%q" #base ")" +#define MEMLEA(offset, base) #offset "(%q" #base ")" +#else +#define MEMACCESS(base) "(%" #base ")" +#define MEMLEA(offset, base) #offset "(%" #base ")" +#endif + +uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) { + uint32 sse; + asm volatile ( // NOLINT + "pxor %%xmm0,%%xmm0 \n" + "pxor %%xmm5,%%xmm5 \n" + ".p2align 2 \n" + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm1 \n" + "lea " MEMLEA(0x10, 0) ",%0 \n" + "movdqa " MEMACCESS(1) ",%%xmm2 \n" + "lea " MEMLEA(0x10, 1) ",%1 \n" + "sub $0x10,%2 \n" + "movdqa %%xmm1,%%xmm3 \n" + "psubusb %%xmm2,%%xmm1 \n" + "psubusb %%xmm3,%%xmm2 \n" + "por %%xmm2,%%xmm1 \n" + "movdqa %%xmm1,%%xmm2 \n" + "punpcklbw %%xmm5,%%xmm1 \n" + "punpckhbw %%xmm5,%%xmm2 \n" + "pmaddwd %%xmm1,%%xmm1 \n" + "pmaddwd %%xmm2,%%xmm2 \n" + "paddd %%xmm1,%%xmm0 \n" + "paddd %%xmm2,%%xmm0 \n" + "jg 1b \n" + + "pshufd $0xee,%%xmm0,%%xmm1 \n" + "paddd %%xmm1,%%xmm0 \n" + "pshufd $0x1,%%xmm0,%%xmm1 \n" + "paddd %%xmm1,%%xmm0 \n" + "movd %%xmm0,%3 \n" + + : "+r"(src_a), // %0 + "+r"(src_b), // %1 + "+r"(count), // %2 + "=g"(sse) // %3 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" +#endif + ); // NOLINT + return sse; +} + +#endif // defined(__x86_64__) || defined(__i386__) + +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(__x86_64__) || (defined(__i386__) && !defined(__pic__))) +#define HAS_HASHDJB2_SSE41 +static uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 }; // 33 ^ 16 +static uvec32 kHashMul0 = { + 0x0c3525e1, // 33 ^ 15 + 0xa3476dc1, // 33 ^ 14 + 0x3b4039a1, // 33 ^ 13 + 0x4f5f0981, // 33 ^ 12 +}; +static uvec32 kHashMul1 = { + 0x30f35d61, // 33 ^ 11 + 0x855cb541, // 33 ^ 10 + 0x040a9121, // 33 ^ 9 + 0x747c7101, // 33 ^ 8 +}; +static uvec32 kHashMul2 = { + 0xec41d4e1, // 33 ^ 7 + 0x4cfa3cc1, // 33 ^ 6 + 0x025528a1, // 33 ^ 5 + 0x00121881, // 33 ^ 4 +}; +static uvec32 kHashMul3 = { + 0x00008c61, // 33 ^ 3 + 0x00000441, // 33 ^ 2 + 0x00000021, // 33 ^ 1 + 0x00000001, // 33 ^ 0 +}; + +uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) { + uint32 hash; + asm volatile ( // NOLINT + "movd %2,%%xmm0 \n" + "pxor %%xmm7,%%xmm7 \n" + "movdqa %4,%%xmm6 \n" + ".p2align 2 \n" + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm1 \n" + "lea " MEMLEA(0x10, 0) ",%0 \n" + "pmulld %%xmm6,%%xmm0 \n" + "movdqa %5,%%xmm5 \n" + "movdqa %%xmm1,%%xmm2 \n" + "punpcklbw %%xmm7,%%xmm2 \n" + "movdqa %%xmm2,%%xmm3 \n" + "punpcklwd %%xmm7,%%xmm3 \n" + "pmulld %%xmm5,%%xmm3 \n" + "movdqa %6,%%xmm5 \n" + "movdqa %%xmm2,%%xmm4 \n" + "punpckhwd %%xmm7,%%xmm4 \n" + "pmulld %%xmm5,%%xmm4 \n" + "movdqa %7,%%xmm5 \n" + "punpckhbw %%xmm7,%%xmm1 \n" + "movdqa %%xmm1,%%xmm2 \n" + "punpcklwd %%xmm7,%%xmm2 \n" + "pmulld %%xmm5,%%xmm2 \n" + "movdqa %8,%%xmm5 \n" + "punpckhwd %%xmm7,%%xmm1 \n" + "pmulld %%xmm5,%%xmm1 \n" + "paddd 
%%xmm4,%%xmm3 \n" + "paddd %%xmm2,%%xmm1 \n" + "sub $0x10,%1 \n" + "paddd %%xmm3,%%xmm1 \n" + "pshufd $0xe,%%xmm1,%%xmm2 \n" + "paddd %%xmm2,%%xmm1 \n" + "pshufd $0x1,%%xmm1,%%xmm2 \n" + "paddd %%xmm2,%%xmm1 \n" + "paddd %%xmm1,%%xmm0 \n" + "jg 1b \n" + "movd %%xmm0,%3 \n" + : "+r"(src), // %0 + "+r"(count), // %1 + "+rm"(seed), // %2 + "=g"(hash) // %3 + : "m"(kHash16x33), // %4 + "m"(kHashMul0), // %5 + "m"(kHashMul1), // %6 + "m"(kHashMul2), // %7 + "m"(kHashMul3) // %8 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" +#endif + ); // NOLINT + return hash; +} +#endif // defined(__x86_64__) || (defined(__i386__) && !defined(__pic__))) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + diff --git a/chromium/third_party/libyuv/source/compare_win.cc b/chromium/third_party/libyuv/source/compare_win.cc index e576e85c192..99831651f5f 100644 --- a/chromium/third_party/libyuv/source/compare_win.cc +++ b/chromium/third_party/libyuv/source/compare_win.cc @@ -1,192 +1,232 @@ -/*
- * Copyright 2012 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/basic_types.h"
-#include "libyuv/row.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
-
-__declspec(naked) __declspec(align(16))
-uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
- __asm {
- mov eax, [esp + 4] // src_a
- mov edx, [esp + 8] // src_b
- mov ecx, [esp + 12] // count
- pxor xmm0, xmm0
- pxor xmm5, xmm5
- sub edx, eax
-
- align 16
- wloop:
- movdqa xmm1, [eax]
- movdqa xmm2, [eax + edx]
- lea eax, [eax + 16]
- sub ecx, 16
- movdqa xmm3, xmm1 // abs trick
- psubusb xmm1, xmm2
- psubusb xmm2, xmm3
- por xmm1, xmm2
- movdqa xmm2, xmm1
- punpcklbw xmm1, xmm5
- punpckhbw xmm2, xmm5
- pmaddwd xmm1, xmm1
- pmaddwd xmm2, xmm2
- paddd xmm0, xmm1
- paddd xmm0, xmm2
- jg wloop
-
- pshufd xmm1, xmm0, 0xee
- paddd xmm0, xmm1
- pshufd xmm1, xmm0, 0x01
- paddd xmm0, xmm1
- movd eax, xmm0
- ret
- }
-}
-
-// Visual C 2012 required for AVX2.
-#if _MSC_VER >= 1700
-// C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX.
-#pragma warning(disable: 4752)
-__declspec(naked) __declspec(align(16))
-uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) {
- __asm {
- mov eax, [esp + 4] // src_a
- mov edx, [esp + 8] // src_b
- mov ecx, [esp + 12] // count
- vpxor ymm0, ymm0, ymm0 // sum
- vpxor ymm5, ymm5, ymm5 // constant 0 for unpck
- sub edx, eax
-
- align 16
- wloop:
- vmovdqu ymm1, [eax]
- vmovdqu ymm2, [eax + edx]
- lea eax, [eax + 32]
- sub ecx, 32
- vpsubusb ymm3, ymm1, ymm2 // abs difference trick
- vpsubusb ymm2, ymm2, ymm1
- vpor ymm1, ymm2, ymm3
- vpunpcklbw ymm2, ymm1, ymm5 // u16. mutates order.
- vpunpckhbw ymm1, ymm1, ymm5
- vpmaddwd ymm2, ymm2, ymm2 // square + hadd to u32.
- vpmaddwd ymm1, ymm1, ymm1
- vpaddd ymm0, ymm0, ymm1
- vpaddd ymm0, ymm0, ymm2
- jg wloop
-
- vpshufd ymm1, ymm0, 0xee // 3, 2 + 1, 0 both lanes.
- vpaddd ymm0, ymm0, ymm1
- vpshufd ymm1, ymm0, 0x01 // 1 + 0 both lanes.
- vpaddd ymm0, ymm0, ymm1
- vpermq ymm1, ymm0, 0x02 // high + low lane.
- vpaddd ymm0, ymm0, ymm1
- vmovd eax, xmm0
- vzeroupper
- ret
- }
-}
-#endif // _MSC_VER >= 1700
-
-#define HAS_HASHDJB2_SSE41
-static const uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 }; // 33 ^ 16
-static const uvec32 kHashMul0 = {
- 0x0c3525e1, // 33 ^ 15
- 0xa3476dc1, // 33 ^ 14
- 0x3b4039a1, // 33 ^ 13
- 0x4f5f0981, // 33 ^ 12
-};
-static const uvec32 kHashMul1 = {
- 0x30f35d61, // 33 ^ 11
- 0x855cb541, // 33 ^ 10
- 0x040a9121, // 33 ^ 9
- 0x747c7101, // 33 ^ 8
-};
-static const uvec32 kHashMul2 = {
- 0xec41d4e1, // 33 ^ 7
- 0x4cfa3cc1, // 33 ^ 6
- 0x025528a1, // 33 ^ 5
- 0x00121881, // 33 ^ 4
-};
-static const uvec32 kHashMul3 = {
- 0x00008c61, // 33 ^ 3
- 0x00000441, // 33 ^ 2
- 0x00000021, // 33 ^ 1
- 0x00000001, // 33 ^ 0
-};
-
-// 27: 66 0F 38 40 C6 pmulld xmm0,xmm6
-// 44: 66 0F 38 40 DD pmulld xmm3,xmm5
-// 59: 66 0F 38 40 E5 pmulld xmm4,xmm5
-// 72: 66 0F 38 40 D5 pmulld xmm2,xmm5
-// 83: 66 0F 38 40 CD pmulld xmm1,xmm5
-#define pmulld(reg) _asm _emit 0x66 _asm _emit 0x0F _asm _emit 0x38 \
- _asm _emit 0x40 _asm _emit reg
-
-__declspec(naked) __declspec(align(16))
-uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
- __asm {
- mov eax, [esp + 4] // src
- mov ecx, [esp + 8] // count
- movd xmm0, [esp + 12] // seed
-
- pxor xmm7, xmm7 // constant 0 for unpck
- movdqa xmm6, kHash16x33
-
- align 16
- wloop:
- movdqu xmm1, [eax] // src[0-15]
- lea eax, [eax + 16]
- pmulld(0xc6) // pmulld xmm0,xmm6 hash *= 33 ^ 16
- movdqa xmm5, kHashMul0
- movdqa xmm2, xmm1
- punpcklbw xmm2, xmm7 // src[0-7]
- movdqa xmm3, xmm2
- punpcklwd xmm3, xmm7 // src[0-3]
- pmulld(0xdd) // pmulld xmm3, xmm5
- movdqa xmm5, kHashMul1
- movdqa xmm4, xmm2
- punpckhwd xmm4, xmm7 // src[4-7]
- pmulld(0xe5) // pmulld xmm4, xmm5
- movdqa xmm5, kHashMul2
- punpckhbw xmm1, xmm7 // src[8-15]
- movdqa xmm2, xmm1
- punpcklwd xmm2, xmm7 // src[8-11]
- pmulld(0xd5) // pmulld xmm2, xmm5
- movdqa xmm5, kHashMul3
- punpckhwd xmm1, xmm7 // src[12-15]
- pmulld(0xcd) // pmulld xmm1, xmm5
- paddd xmm3, xmm4 // add 16 results
- paddd xmm1, xmm2
- sub ecx, 16
- paddd xmm1, xmm3
-
- pshufd xmm2, xmm1, 0x0e // upper 2 dwords
- paddd xmm1, xmm2
- pshufd xmm2, xmm1, 0x01
- paddd xmm1, xmm2
- paddd xmm0, xmm1
- jg wloop
-
- movd eax, xmm0 // return hash
- ret
- }
-}
-#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
-
-#ifdef __cplusplus
-} // extern "C"
-} // namespace libyuv
-#endif
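The SumSquareError_SSE2/AVX2 kernels above compute the per-byte absolute difference with two saturating subtractions OR'd together (psubusb/vpsubusb followed by por, the "abs trick" noted in the comments), because SSE2 has no unsigned byte absolute-difference that feeds pmaddwd directly. A standalone scalar sketch, not libyuv code, of the same identity:

// Standalone sketch (not libyuv code): unsigned saturating subtract clamps at
// zero, so max(a - b, 0) | max(b - a, 0) == |a - b| for bytes.
#include <cstdint>
#include <cstdio>

static uint8_t SatSub(uint8_t a, uint8_t b) {      // what psubusb does per byte.
  return a > b ? static_cast<uint8_t>(a - b) : 0;
}

static uint8_t AbsDiff(uint8_t a, uint8_t b) {
  return static_cast<uint8_t>(SatSub(a, b) | SatSub(b, a));
}

int main() {
  std::printf("%d %d %d\n", AbsDiff(10, 250), AbsDiff(250, 10), AbsDiff(7, 7));  // 240 240 0
  return 0;
}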
+/* + * Copyright 2012 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/basic_types.h" +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) + +__declspec(naked) __declspec(align(16)) +uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) { + __asm { + mov eax, [esp + 4] // src_a + mov edx, [esp + 8] // src_b + mov ecx, [esp + 12] // count + pxor xmm0, xmm0 + pxor xmm5, xmm5 + + align 4 + wloop: + movdqa xmm1, [eax] + lea eax, [eax + 16] + movdqa xmm2, [edx] + lea edx, [edx + 16] + sub ecx, 16 + movdqa xmm3, xmm1 // abs trick + psubusb xmm1, xmm2 + psubusb xmm2, xmm3 + por xmm1, xmm2 + movdqa xmm2, xmm1 + punpcklbw xmm1, xmm5 + punpckhbw xmm2, xmm5 + pmaddwd xmm1, xmm1 + pmaddwd xmm2, xmm2 + paddd xmm0, xmm1 + paddd xmm0, xmm2 + jg wloop + + pshufd xmm1, xmm0, 0xee + paddd xmm0, xmm1 + pshufd xmm1, xmm0, 0x01 + paddd xmm0, xmm1 + movd eax, xmm0 + ret + } +} + +// Visual C 2012 required for AVX2. +#if _MSC_VER >= 1700 +// C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX. +#pragma warning(disable: 4752) +__declspec(naked) __declspec(align(16)) +uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) { + __asm { + mov eax, [esp + 4] // src_a + mov edx, [esp + 8] // src_b + mov ecx, [esp + 12] // count + vpxor ymm0, ymm0, ymm0 // sum + vpxor ymm5, ymm5, ymm5 // constant 0 for unpck + sub edx, eax + + align 4 + wloop: + vmovdqu ymm1, [eax] + vmovdqu ymm2, [eax + edx] + lea eax, [eax + 32] + sub ecx, 32 + vpsubusb ymm3, ymm1, ymm2 // abs difference trick + vpsubusb ymm2, ymm2, ymm1 + vpor ymm1, ymm2, ymm3 + vpunpcklbw ymm2, ymm1, ymm5 // u16. mutates order. + vpunpckhbw ymm1, ymm1, ymm5 + vpmaddwd ymm2, ymm2, ymm2 // square + hadd to u32. + vpmaddwd ymm1, ymm1, ymm1 + vpaddd ymm0, ymm0, ymm1 + vpaddd ymm0, ymm0, ymm2 + jg wloop + + vpshufd ymm1, ymm0, 0xee // 3, 2 + 1, 0 both lanes. + vpaddd ymm0, ymm0, ymm1 + vpshufd ymm1, ymm0, 0x01 // 1 + 0 both lanes. + vpaddd ymm0, ymm0, ymm1 + vpermq ymm1, ymm0, 0x02 // high + low lane. 
+ vpaddd ymm0, ymm0, ymm1 + vmovd eax, xmm0 + vzeroupper + ret + } +} +#endif // _MSC_VER >= 1700 + +#define HAS_HASHDJB2_SSE41 +static uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 }; // 33 ^ 16 +static uvec32 kHashMul0 = { + 0x0c3525e1, // 33 ^ 15 + 0xa3476dc1, // 33 ^ 14 + 0x3b4039a1, // 33 ^ 13 + 0x4f5f0981, // 33 ^ 12 +}; +static uvec32 kHashMul1 = { + 0x30f35d61, // 33 ^ 11 + 0x855cb541, // 33 ^ 10 + 0x040a9121, // 33 ^ 9 + 0x747c7101, // 33 ^ 8 +}; +static uvec32 kHashMul2 = { + 0xec41d4e1, // 33 ^ 7 + 0x4cfa3cc1, // 33 ^ 6 + 0x025528a1, // 33 ^ 5 + 0x00121881, // 33 ^ 4 +}; +static uvec32 kHashMul3 = { + 0x00008c61, // 33 ^ 3 + 0x00000441, // 33 ^ 2 + 0x00000021, // 33 ^ 1 + 0x00000001, // 33 ^ 0 +}; + +// 27: 66 0F 38 40 C6 pmulld xmm0,xmm6 +// 44: 66 0F 38 40 DD pmulld xmm3,xmm5 +// 59: 66 0F 38 40 E5 pmulld xmm4,xmm5 +// 72: 66 0F 38 40 D5 pmulld xmm2,xmm5 +// 83: 66 0F 38 40 CD pmulld xmm1,xmm5 +#define pmulld(reg) _asm _emit 0x66 _asm _emit 0x0F _asm _emit 0x38 \ + _asm _emit 0x40 _asm _emit reg + +__declspec(naked) __declspec(align(16)) +uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) { + __asm { + mov eax, [esp + 4] // src + mov ecx, [esp + 8] // count + movd xmm0, [esp + 12] // seed + + pxor xmm7, xmm7 // constant 0 for unpck + movdqa xmm6, kHash16x33 + + align 4 + wloop: + movdqu xmm1, [eax] // src[0-15] + lea eax, [eax + 16] + pmulld(0xc6) // pmulld xmm0,xmm6 hash *= 33 ^ 16 + movdqa xmm5, kHashMul0 + movdqa xmm2, xmm1 + punpcklbw xmm2, xmm7 // src[0-7] + movdqa xmm3, xmm2 + punpcklwd xmm3, xmm7 // src[0-3] + pmulld(0xdd) // pmulld xmm3, xmm5 + movdqa xmm5, kHashMul1 + movdqa xmm4, xmm2 + punpckhwd xmm4, xmm7 // src[4-7] + pmulld(0xe5) // pmulld xmm4, xmm5 + movdqa xmm5, kHashMul2 + punpckhbw xmm1, xmm7 // src[8-15] + movdqa xmm2, xmm1 + punpcklwd xmm2, xmm7 // src[8-11] + pmulld(0xd5) // pmulld xmm2, xmm5 + movdqa xmm5, kHashMul3 + punpckhwd xmm1, xmm7 // src[12-15] + pmulld(0xcd) // pmulld xmm1, xmm5 + paddd xmm3, xmm4 // add 16 results + paddd xmm1, xmm2 + sub ecx, 16 + paddd xmm1, xmm3 + + pshufd xmm2, xmm1, 0x0e // upper 2 dwords + paddd xmm1, xmm2 + pshufd xmm2, xmm1, 0x01 + paddd xmm1, xmm2 + paddd xmm0, xmm1 + jg wloop + + movd eax, xmm0 // return hash + ret + } +} + +// Visual C 2012 required for AVX2. 
+#if _MSC_VER >= 1700 +__declspec(naked) __declspec(align(16)) +uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) { + __asm { + mov eax, [esp + 4] // src + mov ecx, [esp + 8] // count + movd xmm0, [esp + 12] // seed + movdqa xmm6, kHash16x33 + + align 4 + wloop: + vpmovzxbd xmm3, dword ptr [eax] // src[0-3] + pmulld xmm0, xmm6 // hash *= 33 ^ 16 + vpmovzxbd xmm4, dword ptr [eax + 4] // src[4-7] + pmulld xmm3, kHashMul0 + vpmovzxbd xmm2, dword ptr [eax + 8] // src[8-11] + pmulld xmm4, kHashMul1 + vpmovzxbd xmm1, dword ptr [eax + 12] // src[12-15] + pmulld xmm2, kHashMul2 + lea eax, [eax + 16] + pmulld xmm1, kHashMul3 + paddd xmm3, xmm4 // add 16 results + paddd xmm1, xmm2 + sub ecx, 16 + paddd xmm1, xmm3 + pshufd xmm2, xmm1, 0x0e // upper 2 dwords + paddd xmm1, xmm2 + pshufd xmm2, xmm1, 0x01 + paddd xmm1, xmm2 + paddd xmm0, xmm1 + jg wloop + + movd eax, xmm0 // return hash + ret + } +} +#endif // _MSC_VER >= 1700 + +#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/chromium/third_party/libyuv/source/convert.cc b/chromium/third_party/libyuv/source/convert.cc index 980df7edd5e..9ec71058ce9 100644 --- a/chromium/third_party/libyuv/source/convert.cc +++ b/chromium/third_party/libyuv/source/convert.cc @@ -22,7 +22,43 @@ namespace libyuv { extern "C" { #endif +#define SUBSAMPLE(v, a, s) (v < 0) ? (-((-v + a) >> s)) : ((v + a) >> s) +static __inline int Abs(int v) { + return v >= 0 ? v : -v; +} + +// Any I4xx To I420 format with mirroring. +static int I4xxToI420(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int src_y_width, int src_y_height, + int src_uv_width, int src_uv_height) { + if (src_y_width == 0 || src_y_height == 0 || + src_uv_width == 0 || src_uv_height == 0) { + return -1; + } + const int dst_y_width = Abs(src_y_width); + const int dst_y_height = Abs(src_y_height); + const int dst_uv_width = SUBSAMPLE(dst_y_width, 1, 1); + const int dst_uv_height = SUBSAMPLE(dst_y_height, 1, 1); + ScalePlane(src_y, src_stride_y, src_y_width, src_y_height, + dst_y, dst_stride_y, dst_y_width, dst_y_height, + kFilterBilinear); + ScalePlane(src_u, src_stride_u, src_uv_width, src_uv_height, + dst_u, dst_stride_u, dst_uv_width, dst_uv_height, + kFilterBilinear); + ScalePlane(src_v, src_stride_v, src_uv_width, src_uv_height, + dst_v, dst_stride_v, dst_uv_width, dst_uv_height, + kFilterBilinear); + return 0; +} + // Copy I420 with optional flipping +// TODO(fbarchard): Use Scale plane which supports mirroring, but ensure +// is does row coalescing. LIBYUV_API int I420Copy(const uint8* src_y, int src_stride_y, const uint8* src_u, int src_stride_u, @@ -39,7 +75,7 @@ int I420Copy(const uint8* src_y, int src_stride_y, // Negative height means invert the image. if (height < 0) { height = -height; - int halfheight = (height + 1) >> 1; + const int halfheight = (height + 1) >> 1; src_y = src_y + (height - 1) * src_stride_y; src_u = src_u + (halfheight - 1) * src_stride_u; src_v = src_v + (halfheight - 1) * src_stride_v; @@ -48,16 +84,19 @@ int I420Copy(const uint8* src_y, int src_stride_y, src_stride_v = -src_stride_v; } - int halfwidth = (width + 1) >> 1; - int halfheight = (height + 1) >> 1; if (dst_y) { CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); } + // Copy UV planes. 
+ const int halfwidth = (width + 1) >> 1; + const int halfheight = (height + 1) >> 1; CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight); CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight); return 0; } +// 422 chroma is 1/2 width, 1x height +// 420 chroma is 1/2 width, 1/2 height LIBYUV_API int I422ToI420(const uint8* src_y, int src_stride_y, const uint8* src_u, int src_stride_u, @@ -66,88 +105,19 @@ int I422ToI420(const uint8* src_y, int src_stride_y, uint8* dst_u, int dst_stride_u, uint8* dst_v, int dst_stride_v, int width, int height) { - if (!src_y || !src_u || !src_v || - !dst_y || !dst_u || !dst_v || - width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_y = src_y + (height - 1) * src_stride_y; - src_u = src_u + (height - 1) * src_stride_u; - src_v = src_v + (height - 1) * src_stride_v; - src_stride_y = -src_stride_y; - src_stride_u = -src_stride_u; - src_stride_v = -src_stride_v; - } - int halfwidth = (width + 1) >> 1; - void (*HalfRow)(const uint8* src_uv, int src_uv_stride, - uint8* dst_uv, int pix) = HalfRow_C; -#if defined(HAS_HALFROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(halfwidth, 16) && - IS_ALIGNED(src_u, 16) && IS_ALIGNED(src_stride_u, 16) && - IS_ALIGNED(src_v, 16) && IS_ALIGNED(src_stride_v, 16) && - IS_ALIGNED(dst_u, 16) && IS_ALIGNED(dst_stride_u, 16) && - IS_ALIGNED(dst_v, 16) && IS_ALIGNED(dst_stride_v, 16)) { - HalfRow = HalfRow_SSE2; - } -#endif -#if defined(HAS_HALFROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(halfwidth, 32)) { - HalfRow = HalfRow_AVX2; - } -#endif -#if defined(HAS_HALFROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(halfwidth, 16)) { - HalfRow = HalfRow_NEON; - } -#endif - - // Copy Y plane - if (dst_y) { - CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); - } - - // SubSample U plane. - int y; - for (y = 0; y < height - 1; y += 2) { - HalfRow(src_u, src_stride_u, dst_u, halfwidth); - src_u += src_stride_u * 2; - dst_u += dst_stride_u; - } - if (height & 1) { - HalfRow(src_u, 0, dst_u, halfwidth); - } - - // SubSample V plane. 
- for (y = 0; y < height - 1; y += 2) { - HalfRow(src_v, src_stride_v, dst_v, halfwidth); - src_v += src_stride_v * 2; - dst_v += dst_stride_v; - } - if (height & 1) { - HalfRow(src_v, 0, dst_v, halfwidth); - } - return 0; + const int src_uv_width = SUBSAMPLE(width, 1, 1); + return I4xxToI420(src_y, src_stride_y, + src_u, src_stride_u, + src_v, src_stride_v, + dst_y, dst_stride_y, + dst_u, dst_stride_u, + dst_v, dst_stride_v, + width, height, + src_uv_width, height); } -// Blends 32x2 pixels to 16x1 -// source in scale.cc -#if !defined(LIBYUV_DISABLE_NEON) && \ - (defined(__ARM_NEON__) || defined(LIBYUV_NEON)) -#define HAS_SCALEROWDOWN2_NEON -void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); -#elif !defined(LIBYUV_DISABLE_X86) && \ - (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) - -void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -#endif -void ScaleRowDown2Box_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); - +// 444 chroma is 1x width, 1x height +// 420 chroma is 1/2 width, 1/2 height LIBYUV_API int I444ToI420(const uint8* src_y, int src_stride_y, const uint8* src_u, int src_stride_u, @@ -156,69 +126,16 @@ int I444ToI420(const uint8* src_y, int src_stride_y, uint8* dst_u, int dst_stride_u, uint8* dst_v, int dst_stride_v, int width, int height) { - if (!src_y || !src_u || !src_v || - !dst_y || !dst_u || !dst_v || - width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_y = src_y + (height - 1) * src_stride_y; - src_u = src_u + (height - 1) * src_stride_u; - src_v = src_v + (height - 1) * src_stride_v; - src_stride_y = -src_stride_y; - src_stride_u = -src_stride_u; - src_stride_v = -src_stride_v; - } - int halfwidth = (width + 1) >> 1; - void (*ScaleRowDown2)(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) = ScaleRowDown2Box_C; -#if defined(HAS_SCALEROWDOWN2_NEON) - if (TestCpuFlag(kCpuHasNEON) && - IS_ALIGNED(halfwidth, 16)) { - ScaleRowDown2 = ScaleRowDown2Box_NEON; - } -#elif defined(HAS_SCALEROWDOWN2_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && - IS_ALIGNED(halfwidth, 16) && - IS_ALIGNED(src_u, 16) && IS_ALIGNED(src_stride_u, 16) && - IS_ALIGNED(src_v, 16) && IS_ALIGNED(src_stride_v, 16) && - IS_ALIGNED(dst_u, 16) && IS_ALIGNED(dst_stride_u, 16) && - IS_ALIGNED(dst_v, 16) && IS_ALIGNED(dst_stride_v, 16)) { - ScaleRowDown2 = ScaleRowDown2Box_SSE2; - } -#endif - - // Copy Y plane - if (dst_y) { - CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); - } - - // SubSample U plane. - int y; - for (y = 0; y < height - 1; y += 2) { - ScaleRowDown2(src_u, src_stride_u, dst_u, halfwidth); - src_u += src_stride_u * 2; - dst_u += dst_stride_u; - } - if (height & 1) { - ScaleRowDown2(src_u, 0, dst_u, halfwidth); - } - - // SubSample V plane. - for (y = 0; y < height - 1; y += 2) { - ScaleRowDown2(src_v, src_stride_v, dst_v, halfwidth); - src_v += src_stride_v * 2; - dst_v += dst_stride_v; - } - if (height & 1) { - ScaleRowDown2(src_v, 0, dst_v, halfwidth); - } - return 0; + return I4xxToI420(src_y, src_stride_y, + src_u, src_stride_u, + src_v, src_stride_v, + dst_y, dst_stride_y, + dst_u, dst_stride_u, + dst_v, dst_stride_v, + width, height, + width, height); } -// TODO(fbarchard): Enable bilinear when fast enough or specialized upsampler. 
// 411 chroma is 1/4 width, 1x height // 420 chroma is 1/2 width, 1/2 height LIBYUV_API @@ -229,41 +146,15 @@ int I411ToI420(const uint8* src_y, int src_stride_y, uint8* dst_u, int dst_stride_u, uint8* dst_v, int dst_stride_v, int width, int height) { - if (!src_y || !src_u || !src_v || - !dst_y || !dst_u || !dst_v || - width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - src_y = src_y + (height - 1) * src_stride_y; - src_u = src_u + (height - 1) * src_stride_u; - src_v = src_v + (height - 1) * src_stride_v; - src_stride_y = -src_stride_y; - src_stride_u = -src_stride_u; - src_stride_v = -src_stride_v; - } - - // Copy Y plane - if (dst_y) { - CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); - } - - int halfwidth = (width + 1) >> 1; - int halfheight = (height + 1) >> 1; - int quarterwidth = (width + 3) >> 2; - - // Resample U plane from 1/4 width, 1x height to 1/2 width, 1/2 height. - ScalePlane(src_u, src_stride_u, quarterwidth, height, - dst_u, dst_stride_u, halfwidth, halfheight, - kFilterNone); - - // Resample V plane. - ScalePlane(src_v, src_stride_v, quarterwidth, height, - dst_v, dst_stride_v, halfwidth, halfheight, - kFilterNone); - return 0; + const int src_uv_width = SUBSAMPLE(width, 3, 2); + return I4xxToI420(src_y, src_stride_y, + src_u, src_stride_u, + src_v, src_stride_v, + dst_y, dst_stride_y, + dst_u, dst_stride_u, + dst_v, dst_stride_v, + width, height, + src_uv_width, height); } // I400 is greyscale typically used in MJPG @@ -309,7 +200,6 @@ static void CopyPlane2(const uint8* src, int src_stride_0, int src_stride_1, } #endif #if defined(HAS_COPYROW_ERMS) - // TODO(fbarchard): Detect Fast String support. if (TestCpuFlag(kCpuHasERMS)) { CopyRow = CopyRow_ERMS; } @@ -369,20 +259,23 @@ static int X420ToI420(const uint8* src_y, dst_stride_u = -dst_stride_u; dst_stride_v = -dst_stride_v; } - // Coalesce contiguous rows. + // Coalesce rows. int halfwidth = (width + 1) >> 1; int halfheight = (height + 1) >> 1; if (src_stride_y0 == width && src_stride_y1 == width && dst_stride_y == width) { - width = width * height; + width *= height; height = 1; + src_stride_y0 = src_stride_y1 = dst_stride_y = 0; } - if (src_stride_uv == width && - dst_stride_u * 2 == width && - dst_stride_v * 2 == width) { - halfwidth = halfwidth * halfheight; + // Coalesce rows. + if (src_stride_uv == halfwidth * 2 && + dst_stride_u == halfwidth && + dst_stride_v == halfwidth) { + halfwidth *= halfheight; halfheight = 1; + src_stride_uv = dst_stride_u = dst_stride_v = 0; } void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) = SplitUVRow_C; @@ -782,7 +675,7 @@ int ARGBToI420(const uint8* src_argb, int src_stride_argb, uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = ARGBToYRow_C; -#if defined(HAS_ARGBTOYROW_SSSE3) +#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { ARGBToUVRow = ARGBToUVRow_Any_SSSE3; ARGBToYRow = ARGBToYRow_Any_SSSE3; @@ -798,7 +691,7 @@ int ARGBToI420(const uint8* src_argb, int src_stride_argb, } } #endif -#if defined(HAS_ARGBTOYROW_AVX2) +#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2) if (TestCpuFlag(kCpuHasAVX2) && width >= 32) { ARGBToUVRow = ARGBToUVRow_Any_AVX2; ARGBToYRow = ARGBToYRow_Any_AVX2; @@ -1044,7 +937,7 @@ int RGBAToI420(const uint8* src_rgba, int src_stride_rgba, } // Convert RGB24 to I420. 
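The I4xxToI420 helper above (and its I420ToI4xx counterpart later in this commit) reduces these format conversions to three ScalePlane calls, with the SUBSAMPLE macro deriving chroma plane sizes: it rounds the magnitude up ((v + add) >> shift) while preserving a negative sign, which libyuv uses to request a vertical flip. A standalone sketch, not libyuv code, of the resulting plane geometry; the sample width and height are arbitrary.

// Standalone sketch (not libyuv code): the SUBSAMPLE rounding used for chroma
// plane dimensions, with a negative height carried through to signal a flip.
#include <cstdio>

static int Subsample(int v, int add, int shift) {
  return (v < 0) ? -((-v + add) >> shift) : (v + add) >> shift;
}

int main() {
  const int width = 175, height = -99;       // negative height means "flip".
  // I420: chroma is 1/2 width, 1/2 height.
  std::printf("I420 chroma: %d x %d\n", Subsample(width, 1, 1), Subsample(height, 1, 1));
  // I422: chroma is 1/2 width, full height.
  std::printf("I422 chroma: %d x %d\n", Subsample(width, 1, 1), height);
  // I411: chroma is 1/4 width, full height.
  std::printf("I411 chroma: %d x %d\n", Subsample(width, 3, 2), height);
  return 0;
}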
-LIBYUV_API +LIBYUV_API SAFEBUFFERS int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24, uint8* dst_y, int dst_stride_y, uint8* dst_u, int dst_stride_u, @@ -1147,7 +1040,7 @@ int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24, } // Convert RAW to I420. -LIBYUV_API +LIBYUV_API SAFEBUFFERS int RAWToI420(const uint8* src_raw, int src_stride_raw, uint8* dst_y, int dst_stride_y, uint8* dst_u, int dst_stride_u, @@ -1250,7 +1143,7 @@ int RAWToI420(const uint8* src_raw, int src_stride_raw, } // Convert RGB565 to I420. -LIBYUV_API +LIBYUV_API SAFEBUFFERS int RGB565ToI420(const uint8* src_rgb565, int src_stride_rgb565, uint8* dst_y, int dst_stride_y, uint8* dst_u, int dst_stride_u, @@ -1353,7 +1246,7 @@ int RGB565ToI420(const uint8* src_rgb565, int src_stride_rgb565, } // Convert ARGB1555 to I420. -LIBYUV_API +LIBYUV_API SAFEBUFFERS int ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555, uint8* dst_y, int dst_stride_y, uint8* dst_u, int dst_stride_u, @@ -1458,7 +1351,7 @@ int ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555, } // Convert ARGB4444 to I420. -LIBYUV_API +LIBYUV_API SAFEBUFFERS int ARGB4444ToI420(const uint8* src_argb4444, int src_stride_argb4444, uint8* dst_y, int dst_stride_y, uint8* dst_u, int dst_stride_u, diff --git a/chromium/third_party/libyuv/source/convert_argb.cc b/chromium/third_party/libyuv/source/convert_argb.cc index 55d4d6904ce..0a503361d8b 100644 --- a/chromium/third_party/libyuv/source/convert_argb.cc +++ b/chromium/third_party/libyuv/source/convert_argb.cc @@ -63,16 +63,14 @@ int I444ToARGB(const uint8* src_y, int src_stride_y, dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } - // Coalesce contiguous rows. + // Coalesce rows. if (src_stride_y == width && src_stride_u == width && src_stride_v == width && dst_stride_argb == width * 4) { - return I444ToARGB(src_y, 0, - src_u, 0, - src_v, 0, - dst_argb, 0, - width * height, 1); + width *= height; + height = 1; + src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0; } void (*I444ToARGBRow)(const uint8* y_buf, const uint8* u_buf, @@ -126,16 +124,14 @@ int I422ToARGB(const uint8* src_y, int src_stride_y, dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } - // Coalesce contiguous rows. + // Coalesce rows. if (src_stride_y == width && src_stride_u * 2 == width && src_stride_v * 2 == width && dst_stride_argb == width * 4) { - return I422ToARGB(src_y, 0, - src_u, 0, - src_v, 0, - dst_argb, 0, - width * height, 1); + width *= height; + height = 1; + src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0; } void (*I422ToARGBRow)(const uint8* y_buf, const uint8* u_buf, @@ -207,16 +203,14 @@ int I411ToARGB(const uint8* src_y, int src_stride_y, dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } - // Coalesce contiguous rows. + // Coalesce rows. if (src_stride_y == width && src_stride_u * 4 == width && src_stride_v * 4 == width && dst_stride_argb == width * 4) { - return I411ToARGB(src_y, 0, - src_u, 0, - src_v, 0, - dst_argb, 0, - width * height, 1); + width *= height; + height = 1; + src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0; } void (*I411ToARGBRow)(const uint8* y_buf, const uint8* u_buf, @@ -267,12 +261,12 @@ int I400ToARGB_Reference(const uint8* src_y, int src_stride_y, dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } - // Coalesce contiguous rows. + // Coalesce rows. 
if (src_stride_y == width && dst_stride_argb == width * 4) { - return I400ToARGB_Reference(src_y, 0, - dst_argb, 0, - width * height, 1); + width *= height; + height = 1; + src_stride_y = dst_stride_argb = 0; } void (*YToARGBRow)(const uint8* y_buf, uint8* rgb_buf, @@ -317,12 +311,12 @@ int I400ToARGB(const uint8* src_y, int src_stride_y, src_y = src_y + (height - 1) * src_stride_y; src_stride_y = -src_stride_y; } - // Coalesce contiguous rows. + // Coalesce rows. if (src_stride_y == width && dst_stride_argb == width * 4) { - return I400ToARGB(src_y, 0, - dst_argb, 0, - width * height, 1); + width *= height; + height = 1; + src_stride_y = dst_stride_argb = 0; } void (*I400ToARGBRow)(const uint8* src_y, uint8* dst_argb, int pix) = I400ToARGBRow_C; @@ -353,17 +347,17 @@ int I400ToARGB(const uint8* src_y, int src_stride_y, } // Shuffle table for converting BGRA to ARGB. -static const uvec8 kShuffleMaskBGRAToARGB = { +static uvec8 kShuffleMaskBGRAToARGB = { 3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u }; // Shuffle table for converting ABGR to ARGB. -static const uvec8 kShuffleMaskABGRToARGB = { +static uvec8 kShuffleMaskABGRToARGB = { 2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u }; // Shuffle table for converting RGBA to ARGB. -static const uvec8 kShuffleMaskRGBAToARGB = { +static uvec8 kShuffleMaskRGBAToARGB = { 1u, 2u, 3u, 0u, 5u, 6u, 7u, 4u, 9u, 10u, 11u, 8u, 13u, 14u, 15u, 12u }; @@ -415,12 +409,12 @@ int RGB24ToARGB(const uint8* src_rgb24, int src_stride_rgb24, src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24; src_stride_rgb24 = -src_stride_rgb24; } - // Coalesce contiguous rows. + // Coalesce rows. if (src_stride_rgb24 == width * 3 && dst_stride_argb == width * 4) { - return RGB24ToARGB(src_rgb24, 0, - dst_argb, 0, - width * height, 1); + width *= height; + height = 1; + src_stride_rgb24 = dst_stride_argb = 0; } void (*RGB24ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) = RGB24ToARGBRow_C; @@ -464,12 +458,12 @@ int RAWToARGB(const uint8* src_raw, int src_stride_raw, src_raw = src_raw + (height - 1) * src_stride_raw; src_stride_raw = -src_stride_raw; } - // Coalesce contiguous rows. + // Coalesce rows. if (src_stride_raw == width * 3 && dst_stride_argb == width * 4) { - return RAWToARGB(src_raw, 0, - dst_argb, 0, - width * height, 1); + width *= height; + height = 1; + src_stride_raw = dst_stride_argb = 0; } void (*RAWToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) = RAWToARGBRow_C; @@ -513,12 +507,12 @@ int RGB565ToARGB(const uint8* src_rgb565, int src_stride_rgb565, src_rgb565 = src_rgb565 + (height - 1) * src_stride_rgb565; src_stride_rgb565 = -src_stride_rgb565; } - // Coalesce contiguous rows. + // Coalesce rows. if (src_stride_rgb565 == width * 2 && dst_stride_argb == width * 4) { - return RGB565ToARGB(src_rgb565, 0, - dst_argb, 0, - width * height, 1); + width *= height; + height = 1; + src_stride_rgb565 = dst_stride_argb = 0; } void (*RGB565ToARGBRow)(const uint8* src_rgb565, uint8* dst_argb, int pix) = RGB565ToARGBRow_C; @@ -562,12 +556,12 @@ int ARGB1555ToARGB(const uint8* src_argb1555, int src_stride_argb1555, src_argb1555 = src_argb1555 + (height - 1) * src_stride_argb1555; src_stride_argb1555 = -src_stride_argb1555; } - // Coalesce contiguous rows. + // Coalesce rows. 
if (src_stride_argb1555 == width * 2 && dst_stride_argb == width * 4) { - return ARGB1555ToARGB(src_argb1555, 0, - dst_argb, 0, - width * height, 1); + width *= height; + height = 1; + src_stride_argb1555 = dst_stride_argb = 0; } void (*ARGB1555ToARGBRow)(const uint8* src_argb1555, uint8* dst_argb, int pix) = ARGB1555ToARGBRow_C; @@ -611,12 +605,12 @@ int ARGB4444ToARGB(const uint8* src_argb4444, int src_stride_argb4444, src_argb4444 = src_argb4444 + (height - 1) * src_stride_argb4444; src_stride_argb4444 = -src_stride_argb4444; } - // Coalesce contiguous rows. + // Coalesce rows. if (src_stride_argb4444 == width * 2 && dst_stride_argb == width * 4) { - return ARGB4444ToARGB(src_argb4444, 0, - dst_argb, 0, - width * height, 1); + width *= height; + height = 1; + src_stride_argb4444 = dst_stride_argb = 0; } void (*ARGB4444ToARGBRow)(const uint8* src_argb4444, uint8* dst_argb, int pix) = ARGB4444ToARGBRow_C; @@ -812,13 +806,13 @@ int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2, src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2; src_stride_yuy2 = -src_stride_yuy2; } - // Coalesce contiguous rows. + // Coalesce rows. if (width * height <= kMaxStride && src_stride_yuy2 == width * 2 && dst_stride_argb == width * 4) { - return YUY2ToARGB(src_yuy2, 0, - dst_argb, 0, - width * height, 1); + width *= height; + height = 1; + src_stride_yuy2 = dst_stride_argb = 0; } void (*YUY2ToARGBRow)(const uint8* src_yuy2, uint8* dst_argb, int pix) = YUY2ToARGBRow_C; @@ -865,13 +859,13 @@ int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy, src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy; src_stride_uyvy = -src_stride_uyvy; } - // Coalesce contiguous rows. + // Coalesce rows. if (width * height <= kMaxStride && src_stride_uyvy == width * 2 && dst_stride_argb == width * 4) { - return UYVYToARGB(src_uyvy, 0, - dst_argb, 0, - width * height, 1); + width *= height; + height = 1; + src_stride_uyvy = dst_stride_argb = 0; } void (*UYVYToARGBRow)(const uint8* src_uyvy, uint8* dst_argb, int pix) = UYVYToARGBRow_C; diff --git a/chromium/third_party/libyuv/source/convert_from.cc b/chromium/third_party/libyuv/source/convert_from.cc index 87f9b5cb726..dc708de5e0b 100644 --- a/chromium/third_party/libyuv/source/convert_from.cc +++ b/chromium/third_party/libyuv/source/convert_from.cc @@ -25,6 +25,42 @@ namespace libyuv { extern "C" { #endif +#define SUBSAMPLE(v, a, s) (v < 0) ? (-((-v + a) >> s)) : ((v + a) >> s) +static __inline int Abs(int v) { + return v >= 0 ? v : -v; +} + +// I420 To any I4xx YUV format with mirroring. 
+static int I420ToI4xx(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int src_y_width, int src_y_height, + int dst_uv_width, int dst_uv_height) { + if (src_y_width == 0 || src_y_height == 0 || + dst_uv_width <= 0 || dst_uv_height <= 0) { + return -1; + } + const int dst_y_width = Abs(src_y_width); + const int dst_y_height = Abs(src_y_height); + const int src_uv_width = SUBSAMPLE(src_y_width, 1, 1); + const int src_uv_height = SUBSAMPLE(src_y_height, 1, 1); + ScalePlane(src_y, src_stride_y, src_y_width, src_y_height, + dst_y, dst_stride_y, dst_y_width, dst_y_height, + kFilterBilinear); + ScalePlane(src_u, src_stride_u, src_uv_width, src_uv_height, + dst_u, dst_stride_u, dst_uv_width, dst_uv_height, + kFilterBilinear); + ScalePlane(src_v, src_stride_v, src_uv_width, src_uv_height, + dst_v, dst_stride_v, dst_uv_width, dst_uv_height, + kFilterBilinear); + return 0; +} + +// 420 chroma is 1/2 width, 1/2 height +// 422 chroma is 1/2 width, 1x height LIBYUV_API int I420ToI422(const uint8* src_y, int src_stride_y, const uint8* src_u, int src_stride_u, @@ -33,84 +69,20 @@ int I420ToI422(const uint8* src_y, int src_stride_y, uint8* dst_u, int dst_stride_u, uint8* dst_v, int dst_stride_v, int width, int height) { - if (!src_y || !src_u || !src_v || - !dst_y || !dst_u || !dst_v || - width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_y = dst_y + (height - 1) * dst_stride_y; - dst_u = dst_u + (height - 1) * dst_stride_u; - dst_v = dst_v + (height - 1) * dst_stride_v; - dst_stride_y = -dst_stride_y; - dst_stride_u = -dst_stride_u; - dst_stride_v = -dst_stride_v; - } - int halfwidth = (width + 1) >> 1; - void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C; -#if defined(HAS_COPYROW_X86) - if (IS_ALIGNED(halfwidth, 4)) { - CopyRow = CopyRow_X86; - } -#endif -#if defined(HAS_COPYROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(halfwidth, 32) && - IS_ALIGNED(src_u, 16) && IS_ALIGNED(src_stride_u, 16) && - IS_ALIGNED(src_v, 16) && IS_ALIGNED(src_stride_v, 16) && - IS_ALIGNED(dst_u, 16) && IS_ALIGNED(dst_stride_u, 16) && - IS_ALIGNED(dst_v, 16) && IS_ALIGNED(dst_stride_v, 16)) { - CopyRow = CopyRow_SSE2; - } -#endif -#if defined(HAS_COPYROW_ERMS) - if (TestCpuFlag(kCpuHasERMS)) { - CopyRow = CopyRow_ERMS; - } -#endif -#if defined(HAS_COPYROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(halfwidth, 32)) { - CopyRow = CopyRow_NEON; - } -#endif -#if defined(HAS_COPYROW_MIPS) - if (TestCpuFlag(kCpuHasMIPS)) { - CopyRow = CopyRow_MIPS; - } -#endif - - // Copy Y plane - if (dst_y) { - CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); - } - - // UpSample U plane. - int y; - for (y = 0; y < height - 1; y += 2) { - CopyRow(src_u, dst_u, halfwidth); - CopyRow(src_u, dst_u + dst_stride_u, halfwidth); - src_u += src_stride_u; - dst_u += dst_stride_u * 2; - } - if (height & 1) { - CopyRow(src_u, dst_u, halfwidth); - } - - // UpSample V plane. 
- for (y = 0; y < height - 1; y += 2) { - CopyRow(src_v, dst_v, halfwidth); - CopyRow(src_v, dst_v + dst_stride_v, halfwidth); - src_v += src_stride_v; - dst_v += dst_stride_v * 2; - } - if (height & 1) { - CopyRow(src_v, dst_v, halfwidth); - } - return 0; + const int dst_uv_width = (Abs(width) + 1) >> 1; + const int dst_uv_height = Abs(height); + return I420ToI4xx(src_y, src_stride_y, + src_u, src_stride_u, + src_v, src_stride_v, + dst_y, dst_stride_y, + dst_u, dst_stride_u, + dst_v, dst_stride_v, + width, height, + dst_uv_width, dst_uv_height); } -// TODO(fbarchard): Enable bilinear when fast enough or specialized upsampler. +// 420 chroma is 1/2 width, 1/2 height +// 444 chroma is 1x width, 1x height LIBYUV_API int I420ToI444(const uint8* src_y, int src_stride_y, const uint8* src_u, int src_stride_u, @@ -119,40 +91,16 @@ int I420ToI444(const uint8* src_y, int src_stride_y, uint8* dst_u, int dst_stride_u, uint8* dst_v, int dst_stride_v, int width, int height) { - if (!src_y || !src_u|| !src_v || - !dst_y || !dst_u || !dst_v || - width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_y = dst_y + (height - 1) * dst_stride_y; - dst_u = dst_u + (height - 1) * dst_stride_u; - dst_v = dst_v + (height - 1) * dst_stride_v; - dst_stride_y = -dst_stride_y; - dst_stride_u = -dst_stride_u; - dst_stride_v = -dst_stride_v; - } - - // Copy Y plane - if (dst_y) { - CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); - } - - int halfwidth = (width + 1) >> 1; - int halfheight = (height + 1) >> 1; - - // Upsample U plane from from 1/2 width, 1/2 height to 1x width, 1x height. - ScalePlane(src_u, src_stride_u, halfwidth, halfheight, - dst_u, dst_stride_u, width, height, - kFilterNone); - - // Upsample V plane. - ScalePlane(src_v, src_stride_v, halfwidth, halfheight, - dst_v, dst_stride_v, width, height, - kFilterNone); - return 0; + const int dst_uv_width = Abs(width); + const int dst_uv_height = Abs(height); + return I420ToI4xx(src_y, src_stride_y, + src_u, src_stride_u, + src_v, src_stride_v, + dst_y, dst_stride_y, + dst_u, dst_stride_u, + dst_v, dst_stride_v, + width, height, + dst_uv_width, dst_uv_height); } // 420 chroma is 1/2 width, 1/2 height @@ -165,41 +113,16 @@ int I420ToI411(const uint8* src_y, int src_stride_y, uint8* dst_u, int dst_stride_u, uint8* dst_v, int dst_stride_v, int width, int height) { - if (!src_y || !src_u || !src_v || - !dst_y || !dst_u || !dst_v || - width <= 0 || height == 0) { - return -1; - } - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_y = dst_y + (height - 1) * dst_stride_y; - dst_u = dst_u + (height - 1) * dst_stride_u; - dst_v = dst_v + (height - 1) * dst_stride_v; - dst_stride_y = -dst_stride_y; - dst_stride_u = -dst_stride_u; - dst_stride_v = -dst_stride_v; - } - - // Copy Y plane - if (dst_y) { - CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); - } - - int halfwidth = (width + 1) >> 1; - int halfheight = (height + 1) >> 1; - int quarterwidth = (width + 3) >> 2; - - // Resample U plane from 1/2 width, 1/2 height to 1/4 width, 1x height - ScalePlane(src_u, src_stride_u, halfwidth, halfheight, - dst_u, dst_stride_u, quarterwidth, height, - kFilterNone); - - // Resample V plane. 
- ScalePlane(src_v, src_stride_v, halfwidth, halfheight, - dst_v, dst_stride_v, quarterwidth, height, - kFilterNone); - return 0; + const int dst_uv_width = (Abs(width) + 3) >> 2; + const int dst_uv_height = Abs(height); + return I420ToI4xx(src_y, src_stride_y, + src_u, src_stride_u, + src_v, src_stride_v, + dst_y, dst_stride_y, + dst_u, dst_stride_u, + dst_v, dst_stride_v, + width, height, + dst_uv_width, dst_uv_height); } // Copy to I400. Source can be I420,422,444,400,NV12,NV21 @@ -237,16 +160,14 @@ int I422ToYUY2(const uint8* src_y, int src_stride_y, dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2; dst_stride_yuy2 = -dst_stride_yuy2; } - // Coalesce contiguous rows. + // Coalesce rows. if (src_stride_y == width && src_stride_u * 2 == width && src_stride_v * 2 == width && dst_stride_yuy2 == width * 2) { - return I422ToYUY2(src_y, 0, - src_u, 0, - src_v, 0, - dst_yuy2, 0, - width * height, 1); + width *= height; + height = 1; + src_stride_y = src_stride_u = src_stride_v = dst_stride_yuy2 = 0; } void (*I422ToYUY2Row)(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_yuy2, int width) = @@ -343,16 +264,14 @@ int I422ToUYVY(const uint8* src_y, int src_stride_y, dst_uyvy = dst_uyvy + (height - 1) * dst_stride_uyvy; dst_stride_uyvy = -dst_stride_uyvy; } - // Coalesce contiguous rows. + // Coalesce rows. if (src_stride_y == width && src_stride_u * 2 == width && src_stride_v * 2 == width && dst_stride_uyvy == width * 2) { - return I422ToUYVY(src_y, 0, - src_u, 0, - src_v, 0, - dst_uyvy, 0, - width * height, 1); + width *= height; + height = 1; + src_stride_y = src_stride_u = src_stride_v = dst_stride_uyvy = 0; } void (*I422ToUYVYRow)(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_uyvy, int width) = @@ -453,19 +372,22 @@ int I420ToNV12(const uint8* src_y, int src_stride_y, dst_stride_y = -dst_stride_y; dst_stride_uv = -dst_stride_uv; } - // Coalesce contiguous rows. + // Coalesce rows. int halfwidth = (width + 1) >> 1; int halfheight = (height + 1) >> 1; if (src_stride_y == width && dst_stride_y == width) { - width = width * height; + width *= height; height = 1; + src_stride_y = dst_stride_y = 0; } - if (src_stride_u * 2 == width && - src_stride_v * 2 == width && - dst_stride_uv == width) { - halfwidth = halfwidth * halfheight; + // Coalesce rows. + if (src_stride_u == halfwidth && + src_stride_v == halfwidth && + dst_stride_uv == halfwidth * 2) { + halfwidth *= halfheight; halfheight = 1; + src_stride_u = src_stride_v = dst_stride_uv = 0; } void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv, int width) = MergeUVRow_C; diff --git a/chromium/third_party/libyuv/source/convert_from_argb.cc b/chromium/third_party/libyuv/source/convert_from_argb.cc index 418f44d0cf5..9d5752cbb09 100644 --- a/chromium/third_party/libyuv/source/convert_from_argb.cc +++ b/chromium/third_party/libyuv/source/convert_from_argb.cc @@ -36,32 +36,30 @@ int ARGBToI444(const uint8* src_argb, int src_stride_argb, src_argb = src_argb + (height - 1) * src_stride_argb; src_stride_argb = -src_stride_argb; } - // Coalesce contiguous rows. + // Coalesce rows. 
if (src_stride_argb == width * 4 && dst_stride_y == width && dst_stride_u == width && dst_stride_v == width) { - return ARGBToI444(src_argb, 0, - dst_y, 0, - dst_u, 0, - dst_v, 0, - width * height, 1); + width *= height; + height = 1; + src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0; } void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = ARGBToYRow_C; void (*ARGBToUV444Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v, int pix) = ARGBToUV444Row_C; -#if defined(HAS_ARGBTOUV444ROW_SSSE3)
- if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
- ARGBToUV444Row = ARGBToUV444Row_Any_SSSE3;
- if (IS_ALIGNED(width, 16)) {
- ARGBToUV444Row = ARGBToUV444Row_Unaligned_SSSE3;
- if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
- ARGBToUV444Row = ARGBToUV444Row_SSSE3;
- }
- }
- }
-#endif
+#if defined(HAS_ARGBTOUV444ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { + ARGBToUV444Row = ARGBToUV444Row_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUV444Row = ARGBToUV444Row_Unaligned_SSSE3; + if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) { + ARGBToUV444Row = ARGBToUV444Row_SSSE3; + } + } + } +#endif #if defined(HAS_ARGBTOYROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { ARGBToYRow = ARGBToYRow_Any_SSSE3; @@ -111,16 +109,14 @@ int ARGBToI422(const uint8* src_argb, int src_stride_argb, src_argb = src_argb + (height - 1) * src_stride_argb; src_stride_argb = -src_stride_argb; } - // Coalesce contiguous rows. + // Coalesce rows. if (src_stride_argb == width * 4 && dst_stride_y == width && dst_stride_u * 2 == width && dst_stride_v * 2 == width) { - return ARGBToI422(src_argb, 0, - dst_y, 0, - dst_u, 0, - dst_v, 0, - width * height, 1); + width *= height; + height = 1; + src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0; } void (*ARGBToUV422Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v, int pix) = ARGBToUV422Row_C; @@ -190,16 +186,14 @@ int ARGBToI411(const uint8* src_argb, int src_stride_argb, src_argb = src_argb + (height - 1) * src_stride_argb; src_stride_argb = -src_stride_argb; } - // Coalesce contiguous rows. + // Coalesce rows. if (src_stride_argb == width * 4 && dst_stride_y == width && dst_stride_u * 4 == width && dst_stride_v * 4 == width) { - return ARGBToI411(src_argb, 0, - dst_y, 0, - dst_u, 0, - dst_v, 0, - width * height, 1); + width *= height; + height = 1; + src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0; } void (*ARGBToUV411Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v, int pix) = ARGBToUV411Row_C; @@ -251,7 +245,7 @@ int ARGBToI411(const uint8* src_argb, int src_stride_argb, return 0; } -LIBYUV_API +LIBYUV_API SAFEBUFFERS int ARGBToNV12(const uint8* src_argb, int src_stride_argb, uint8* dst_y, int dst_stride_y, uint8* dst_uv, int dst_stride_uv, @@ -272,7 +266,7 @@ int ARGBToNV12(const uint8* src_argb, int src_stride_argb, uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = ARGBToYRow_C; -#if defined(HAS_ARGBTOYROW_SSSE3) +#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { ARGBToUVRow = ARGBToUVRow_Any_SSSE3; ARGBToYRow = ARGBToYRow_Any_SSSE3; @@ -353,7 +347,7 @@ int ARGBToNV12(const uint8* src_argb, int src_stride_argb, } // Same as NV12 but U and V swapped. -LIBYUV_API +LIBYUV_API SAFEBUFFERS int ARGBToNV21(const uint8* src_argb, int src_stride_argb, uint8* dst_y, int dst_stride_y, uint8* dst_uv, int dst_stride_uv, @@ -374,7 +368,7 @@ int ARGBToNV21(const uint8* src_argb, int src_stride_argb, uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = ARGBToYRow_C; -#if defined(HAS_ARGBTOYROW_SSSE3) +#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { ARGBToUVRow = ARGBToUVRow_Any_SSSE3; ARGBToYRow = ARGBToYRow_Any_SSSE3; @@ -455,7 +449,7 @@ int ARGBToNV21(const uint8* src_argb, int src_stride_argb, } // Convert ARGB to YUY2. 
-LIBYUV_API +LIBYUV_API SAFEBUFFERS int ARGBToYUY2(const uint8* src_argb, int src_stride_argb, uint8* dst_yuy2, int dst_stride_yuy2, int width, int height) { @@ -470,13 +464,13 @@ int ARGBToYUY2(const uint8* src_argb, int src_stride_argb, dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2; dst_stride_yuy2 = -dst_stride_yuy2; } - // Coalesce contiguous rows. + // Coalesce rows. if (width * height <= kMaxStride && src_stride_argb == width * 4 && dst_stride_yuy2 == width * 2) { - return ARGBToYUY2(src_argb, 0, - dst_yuy2, 0, - width * height, 1); + width *= height; + height = 1; + src_stride_argb = dst_stride_yuy2 = 0; } void (*ARGBToUV422Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v, int pix) = ARGBToUV422Row_C; @@ -551,7 +545,7 @@ int ARGBToYUY2(const uint8* src_argb, int src_stride_argb, } // Convert ARGB to UYVY. -LIBYUV_API +LIBYUV_API SAFEBUFFERS int ARGBToUYVY(const uint8* src_argb, int src_stride_argb, uint8* dst_uyvy, int dst_stride_uyvy, int width, int height) { @@ -566,13 +560,13 @@ int ARGBToUYVY(const uint8* src_argb, int src_stride_argb, dst_uyvy = dst_uyvy + (height - 1) * dst_stride_uyvy; dst_stride_uyvy = -dst_stride_uyvy; } - // Coalesce contiguous rows. + // Coalesce rows. if (width * height <= kMaxStride && src_stride_argb == width * 4 && dst_stride_uyvy == width * 2) { - return ARGBToUYVY(src_argb, 0, - dst_uyvy, 0, - width * height, 1); + width *= height; + height = 1; + src_stride_argb = dst_stride_uyvy = 0; } void (*ARGBToUV422Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v, int pix) = ARGBToUV422Row_C; @@ -659,12 +653,12 @@ int ARGBToI400(const uint8* src_argb, int src_stride_argb, src_argb = src_argb + (height - 1) * src_stride_argb; src_stride_argb = -src_stride_argb; } - // Coalesce contiguous rows. + // Coalesce rows. if (src_stride_argb == width * 4 && dst_stride_y == width) { - return ARGBToI400(src_argb, 0, - dst_y, 0, - width * height, 1); + width *= height; + height = 1; + src_stride_argb = dst_stride_y = 0; } void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = ARGBToYRow_C; @@ -706,7 +700,7 @@ int ARGBToI400(const uint8* src_argb, int src_stride_argb, } // Shuffle table for converting ARGB to RGBA. -static const uvec8 kShuffleMaskARGBToRGBA = { +static uvec8 kShuffleMaskARGBToRGBA = { 3u, 0u, 1u, 2u, 7u, 4u, 5u, 6u, 11u, 8u, 9u, 10u, 15u, 12u, 13u, 14u }; @@ -734,19 +728,17 @@ int ARGBToRGB24(const uint8* src_argb, int src_stride_argb, src_argb = src_argb + (height - 1) * src_stride_argb; src_stride_argb = -src_stride_argb; } - // Coalesce contiguous rows. + // Coalesce rows. if (src_stride_argb == width * 4 && dst_stride_rgb24 == width * 3) { - return ARGBToRGB24(src_argb, 0, - dst_rgb24, 0, - width * height, 1); + width *= height; + height = 1; + src_stride_argb = dst_stride_rgb24 = 0; } void (*ARGBToRGB24Row)(const uint8* src_argb, uint8* dst_rgb, int pix) = ARGBToRGB24Row_C; #if defined(HAS_ARGBTORGB24ROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && width >= 16 && - IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) && - IS_ALIGNED(dst_rgb24, 16) && IS_ALIGNED(dst_stride_rgb24, 16)) { + if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { ARGBToRGB24Row = ARGBToRGB24Row_Any_SSSE3; if (IS_ALIGNED(width, 16)) { ARGBToRGB24Row = ARGBToRGB24Row_SSSE3; @@ -782,19 +774,17 @@ int ARGBToRAW(const uint8* src_argb, int src_stride_argb, src_argb = src_argb + (height - 1) * src_stride_argb; src_stride_argb = -src_stride_argb; } - // Coalesce contiguous rows. + // Coalesce rows. 
if (src_stride_argb == width * 4 && dst_stride_raw == width * 3) { - return ARGBToRAW(src_argb, 0, - dst_raw, 0, - width * height, 1); + width *= height; + height = 1; + src_stride_argb = dst_stride_raw = 0; } void (*ARGBToRAWRow)(const uint8* src_argb, uint8* dst_rgb, int pix) = ARGBToRAWRow_C; #if defined(HAS_ARGBTORAWROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && width >= 16 && - IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) && - IS_ALIGNED(dst_raw, 16) && IS_ALIGNED(dst_stride_raw, 16)) { + if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { ARGBToRAWRow = ARGBToRAWRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { ARGBToRAWRow = ARGBToRAWRow_SSSE3; @@ -830,12 +820,12 @@ int ARGBToRGB565(const uint8* src_argb, int src_stride_argb, src_argb = src_argb + (height - 1) * src_stride_argb; src_stride_argb = -src_stride_argb; } - // Coalesce contiguous rows. + // Coalesce rows. if (src_stride_argb == width * 4 && dst_stride_rgb565 == width * 2) { - return ARGBToRGB565(src_argb, 0, - dst_rgb565, 0, - width * height, 1); + width *= height; + height = 1; + src_stride_argb = dst_stride_rgb565 = 0; } void (*ARGBToRGB565Row)(const uint8* src_argb, uint8* dst_rgb, int pix) = ARGBToRGB565Row_C; @@ -877,12 +867,12 @@ int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb, src_argb = src_argb + (height - 1) * src_stride_argb; src_stride_argb = -src_stride_argb; } - // Coalesce contiguous rows. + // Coalesce rows. if (src_stride_argb == width * 4 && dst_stride_argb1555 == width * 2) { - return ARGBToARGB1555(src_argb, 0, - dst_argb1555, 0, - width * height, 1); + width *= height; + height = 1; + src_stride_argb = dst_stride_argb1555 = 0; } void (*ARGBToARGB1555Row)(const uint8* src_argb, uint8* dst_rgb, int pix) = ARGBToARGB1555Row_C; @@ -924,12 +914,12 @@ int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb, src_argb = src_argb + (height - 1) * src_stride_argb; src_stride_argb = -src_stride_argb; } - // Coalesce contiguous rows. + // Coalesce rows. if (src_stride_argb == width * 4 && dst_stride_argb4444 == width * 2) { - return ARGBToARGB4444(src_argb, 0, - dst_argb4444, 0, - width * height, 1); + width *= height; + height = 1; + src_stride_argb = dst_stride_argb4444 = 0; } void (*ARGBToARGB4444Row)(const uint8* src_argb, uint8* dst_rgb, int pix) = ARGBToARGB4444Row_C; @@ -980,7 +970,7 @@ int ARGBToJ420(const uint8* src_argb, int src_stride_argb, uint8* dst_u, uint8* dst_v, int width) = ARGBToUVJRow_C; void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int pix) = ARGBToYJRow_C; -#if defined(HAS_ARGBTOYJROW_SSSE3) +#if defined(HAS_ARGBTOYJROW_SSSE3) && defined(HAS_ARGBTOUVJROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3; ARGBToYJRow = ARGBToYJRow_Any_SSSE3; @@ -996,7 +986,7 @@ int ARGBToJ420(const uint8* src_argb, int src_stride_argb, } } #endif -#if defined(HAS_ARGBTOYJROW_AVX2) +#if defined(HAS_ARGBTOYJROW_AVX2) && defined(HAS_ARGBTOUVJROW_AVX2) if (TestCpuFlag(kCpuHasAVX2) && width >= 32) { ARGBToYJRow = ARGBToYJRow_Any_AVX2; if (IS_ALIGNED(width, 32)) { @@ -1048,12 +1038,12 @@ int ARGBToJ400(const uint8* src_argb, int src_stride_argb, src_argb = src_argb + (height - 1) * src_stride_argb; src_stride_argb = -src_stride_argb; } - // Coalesce contiguous rows. + // Coalesce rows. 
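Several hunks above (ARGBToRGB24, ARGBToRAW and others) drop the 16-byte pointer and stride alignment preconditions from the SSSE3 dispatch, keeping only the width checks and relying on an _Any_ variant for ragged widths. A hedged sketch of that dispatch shape; the row functions and the CPU check below are stand-ins (the "SIMD" variants simply forward to the C path), not the real libyuv kernels:

#include <cstdint>

// BT.601-style integer luma, for illustration only.
static void ToYRow_C(const uint8_t* argb, uint8_t* y, int pix) {
  for (int i = 0; i < pix; ++i) {
    y[i] = static_cast<uint8_t>((66 * argb[i * 4 + 2] + 129 * argb[i * 4 + 1] +
                                 25 * argb[i * 4 + 0] + 128) / 256 + 16);
  }
}
// Stand-ins for the generated kernels; real code would be SSSE3 intrinsics/asm.
static void ToYRow_Any_SSSE3(const uint8_t* a, uint8_t* y, int p) { ToYRow_C(a, y, p); }
static void ToYRow_SSSE3(const uint8_t* a, uint8_t* y, int p) { ToYRow_C(a, y, p); }
static bool CpuHasSSSE3() { return true; }  // stand-in for TestCpuFlag(kCpuHasSSSE3)

using RowFn = void (*)(const uint8_t*, uint8_t*, int);

// Tiered selection: portable C fallback, an _Any_ variant that copes with
// ragged widths, and the full-width kernel when width is a multiple of 16.
RowFn PickToYRow(int width) {
  RowFn fn = ToYRow_C;
  if (CpuHasSSSE3() && width >= 16) {
    fn = ToYRow_Any_SSSE3;
    if (width % 16 == 0) fn = ToYRow_SSSE3;
  }
  return fn;
}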
if (src_stride_argb == width * 4 && dst_stride_yj == width) { - return ARGBToJ400(src_argb, 0, - dst_yj, 0, - width * height, 1); + width *= height; + height = 1; + src_stride_argb = dst_stride_yj = 0; } void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int pix) = ARGBToYJRow_C; diff --git a/chromium/third_party/libyuv/source/convert_to_argb.cc b/chromium/third_party/libyuv/source/convert_to_argb.cc index 95b6386d719..aa6185661cd 100644 --- a/chromium/third_party/libyuv/source/convert_to_argb.cc +++ b/chromium/third_party/libyuv/source/convert_to_argb.cc @@ -61,15 +61,15 @@ int ConvertToARGB(const uint8* sample, size_t sample_size, bool need_buf = (rotation && format != FOURCC_ARGB) || dst_argb == sample; uint8* tmp_argb = dst_argb; int tmp_argb_stride = argb_stride; - uint8* buf = NULL; + uint8* rotate_buffer = NULL; int abs_dst_height = (dst_height < 0) ? -dst_height : dst_height; if (need_buf) { int argb_size = dst_width * abs_dst_height * 4; - buf = new uint8[argb_size]; - if (!buf) { + rotate_buffer = new uint8[argb_size]; + if (!rotate_buffer) { return 1; // Out of memory runtime error. } - dst_argb = buf; + dst_argb = rotate_buffer; argb_stride = dst_width; } @@ -312,7 +312,7 @@ int ConvertToARGB(const uint8* sample, size_t sample_size, tmp_argb, tmp_argb_stride, dst_width, abs_dst_height, rotation); } - delete buf; + delete [] rotate_buffer; } return r; diff --git a/chromium/third_party/libyuv/source/convert_to_i420.cc b/chromium/third_party/libyuv/source/convert_to_i420.cc index 763eb50920e..5683ffe43ab 100644 --- a/chromium/third_party/libyuv/source/convert_to_i420.cc +++ b/chromium/third_party/libyuv/source/convert_to_i420.cc @@ -68,16 +68,16 @@ int ConvertToI420(const uint8* sample, int tmp_y_stride = y_stride; int tmp_u_stride = u_stride; int tmp_v_stride = v_stride; - uint8* buf = NULL; + uint8* rotate_buffer = NULL; int abs_dst_height = (dst_height < 0) ? -dst_height : dst_height; if (need_buf) { int y_size = dst_width * abs_dst_height; int uv_size = ((dst_width + 1) / 2) * ((abs_dst_height + 1) / 2); - buf = new uint8[y_size + uv_size * 2]; - if (!buf) { + rotate_buffer = new uint8[y_size + uv_size * 2]; + if (!rotate_buffer) { return 1; // Out of memory runtime error. } - y = buf; + y = rotate_buffer; u = y + y_size; v = u + uv_size; y_stride = dst_width; @@ -372,7 +372,7 @@ int ConvertToI420(const uint8* sample, tmp_v, tmp_v_stride, dst_width, abs_dst_height, rotation); } - delete buf; + delete [] rotate_buffer; } return r; diff --git a/chromium/third_party/libyuv/source/cpu_id.cc b/chromium/third_party/libyuv/source/cpu_id.cc index b4c993a2740..c4f840abb14 100644 --- a/chromium/third_party/libyuv/source/cpu_id.cc +++ b/chromium/third_party/libyuv/source/cpu_id.cc @@ -11,14 +11,16 @@ #include "libyuv/cpu_id.h" #ifdef _MSC_VER -#include <intrin.h> // For __cpuid() +#include <intrin.h> // For __cpuidex() #endif -#if !defined(__CLR_VER) && defined(_M_X64) && \ +#if !defined(__CLR_VER) && !defined(__native_client__) && defined(_M_X64) && \ defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219) #include <immintrin.h> // For _xgetbv() #endif +#if !defined(__native_client__) #include <stdlib.h> // For getenv() +#endif // For ArmCpuCaps() but unittested on all platforms #include <stdio.h> @@ -26,92 +28,102 @@ #include "libyuv/basic_types.h" // For CPU_X86 -// TODO(fbarchard): Consider cpu functionality for breakpoints, timer and cache. -// arm - bkpt vs intel int 3 - -// TODO(fbarchard): Use cpuid.h when gcc 4.4 is used on OSX and Linux. 
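In the convert_to_argb.cc and convert_to_i420.cc hunks above, the temporary rotation buffer is renamed and, more importantly, released with delete [] instead of delete, matching its new uint8[...] allocation. A minimal sketch of the corrected ownership pattern; the function name is ours:

#include <cstdint>

void ConvertWithRotateScratch(int dst_width, int dst_height, bool need_buf) {
  uint8_t* rotate_buffer = nullptr;
  if (need_buf) {
    rotate_buffer = new uint8_t[dst_width * dst_height * 4];
    // ... convert into rotate_buffer, then rotate into the caller's plane ...
  }
  delete [] rotate_buffer;  // array new must be paired with array delete.
}

A std::unique_ptr<uint8_t[]> would make the pairing automatic; the patch keeps the manual form.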
-#if (defined(__pic__) || defined(__APPLE__)) && defined(__i386__) -static __inline void __cpuid(int cpu_info[4], int info_type) { - asm volatile ( // NOLINT - "mov %%ebx, %%edi \n" - "cpuid \n" - "xchg %%edi, %%ebx \n" - : "=a"(cpu_info[0]), "=D"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3]) - : "a"(info_type)); -} -#elif defined(__i386__) || defined(__x86_64__) -static __inline void __cpuid(int cpu_info[4], int info_type) { - asm volatile ( // NOLINT - "cpuid \n" - : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3]) - : "a"(info_type)); -} -#endif - #ifdef __cplusplus namespace libyuv { extern "C" { #endif +// For functions that use rowbuffer and have runtime checks for overflow, +// use SAFEBUFFERS to avoid additional check. +#if defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219) +#define SAFEBUFFERS __declspec(safebuffers) +#else +#define SAFEBUFFERS +#endif + // Low level cpuid for X86. Returns zeros on other CPUs. #if !defined(__CLR_VER) && (defined(_M_IX86) || defined(_M_X64) || \ defined(__i386__) || defined(__x86_64__)) LIBYUV_API -void CpuId(int cpu_info[4], int info_type) { - __cpuid(cpu_info, info_type); -} +void CpuId(uint32 info_eax, uint32 info_ecx, uint32* cpu_info) { +#if defined(_MSC_VER) +#if (_MSC_FULL_VER >= 160040219) + __cpuidex(reinterpret_cast<int*>(cpu_info), info_eax, info_ecx); +#elif defined(_M_IX86) + __asm { + mov eax, info_eax + mov ecx, info_ecx + mov edi, cpu_info + cpuid + mov [edi], eax + mov [edi + 4], ebx + mov [edi + 8], ecx + mov [edi + 12], edx + } #else -LIBYUV_API -void CpuId(int cpu_info[4], int) { - cpu_info[0] = cpu_info[1] = cpu_info[2] = cpu_info[3] = 0; -} + if (info_ecx == 0) { + __cpuid(reinterpret_cast<int*>(cpu_info), info_eax); + } else { + cpu_info[3] = cpu_info[2] = cpu_info[1] = cpu_info[0] = 0; + } #endif - -// X86 CPUs have xgetbv to detect OS saves high parts of ymm registers. -#if !defined(__CLR_VER) && defined(_M_X64) && \ - defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219) -#define HAS_XGETBV -static uint32 XGetBV(unsigned int xcr) { - return static_cast<uint32>(_xgetbv(xcr)); +#else // defined(_MSC_VER) + uint32 info_ebx, info_edx; + asm volatile ( // NOLINT +#if defined( __i386__) && defined(__PIC__) + // Preserve ebx for fpic 32 bit. + "mov %%ebx, %%edi \n" + "cpuid \n" + "xchg %%edi, %%ebx \n" + : "=D" (info_ebx), +#else + "cpuid \n" + : "=b" (info_ebx), +#endif // defined( __i386__) && defined(__PIC__) + "+a" (info_eax), "+c" (info_ecx), "=d" (info_edx)); + cpu_info[0] = info_eax; + cpu_info[1] = info_ebx; + cpu_info[2] = info_ecx; + cpu_info[3] = info_edx; +#endif // defined(_MSC_VER) } -#elif !defined(__CLR_VER) && defined(_M_IX86) && defined(_MSC_VER) + +#if !defined(__native_client__) #define HAS_XGETBV -__declspec(naked) __declspec(align(16)) -static uint32 XGetBV(unsigned int xcr) { +// X86 CPUs have xgetbv to detect OS saves high parts of ymm registers. +int TestOsSaveYmm() { + uint32 xcr0 = 0u; +#if defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219) + xcr0 = static_cast<uint32>(_xgetbv(0)); // VS2010 SP1 required. +#elif defined(_M_IX86) __asm { - mov ecx, [esp + 4] // xcr - push edx - _asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0 // xgetbv for vs2005. - pop edx - ret + xor ecx, ecx // xcr 0 + _asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0 // For VS2010 and earlier. 
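The rewritten CpuId above takes an explicit leaf (info_eax) and sub-leaf (info_ecx), so leaf 7 (structured extended features, which reports AVX2 and ERMS) can be queried correctly; the old __cpuid wrapper could not set ECX. A hedged sketch using GCC/Clang's <cpuid.h> helper instead of raw inline assembly; this is an illustration, not the patched libyuv CpuId():

#include <cstdint>
#include <cpuid.h>  // GCC/Clang helper; MSVC would use __cpuidex from <intrin.h>.

// Query one cpuid leaf/sub-leaf into {eax, ebx, ecx, edx}. Returns false if
// the leaf is not supported by the CPU.
static bool QueryCpuid(unsigned leaf, unsigned subleaf, uint32_t cpu_info[4]) {
  unsigned a, b, c, d;
  if (!__get_cpuid_count(leaf, subleaf, &a, &b, &c, &d)) return false;
  cpu_info[0] = a; cpu_info[1] = b; cpu_info[2] = c; cpu_info[3] = d;
  return true;
}

For example, AVX2 is reported in leaf 7, sub-leaf 0, EBX bit 5 (0x20), which matches the (cpu_info7[1] & 0x00000020) test added in InitCpuFlags below.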
+ mov xcr0, eax } -} #elif defined(__i386__) || defined(__x86_64__) -#define HAS_XGETBV -static uint32 XGetBV(unsigned int xcr) { - uint32 xcr_feature_mask; - asm volatile ( // NOLINT - ".byte 0x0f, 0x01, 0xd0\n" - : "=a"(xcr_feature_mask) - : "c"(xcr) - : "memory", "cc", "edx"); // edx unused. - return xcr_feature_mask; + asm(".byte 0x0f, 0x01, 0xd0" : "=a" (xcr0) : "c" (0) : "%edx"); +#endif // defined(_MSC_VER) + return((xcr0 & 6) == 6); // Is ymm saved? +} +#endif // !defined(__native_client__) +#else +LIBYUV_API +void CpuId(uint32 eax, uint32 ecx, uint32* cpu_info) { + cpu_info[0] = cpu_info[1] = cpu_info[2] = cpu_info[3] = 0; } -#endif -#ifdef HAS_XGETBV -static const int kXCR_XFEATURE_ENABLED_MASK = 0; #endif // based on libvpx arm_cpudetect.c // For Arm, but public to allow testing on any CPU -LIBYUV_API +LIBYUV_API SAFEBUFFERS int ArmCpuCaps(const char* cpuinfo_name) { FILE* f = fopen(cpuinfo_name, "r"); if (f) { - char buf[512]; - while (fgets(buf, 511, f)) { - if (memcmp(buf, "Features", 8) == 0) { - char* p = strstr(buf, " neon"); + char cpuinfo_line[512]; + while (fgets(cpuinfo_line, sizeof(cpuinfo_line) - 1, f)) { + if (memcmp(cpuinfo_line, "Features", 8) == 0) { + char* p = strstr(cpuinfo_line, " neon"); if (p && (p[5] == ' ' || p[5] == '\n')) { fclose(f); return kCpuHasNEON; @@ -129,7 +141,7 @@ static int MipsCpuCaps(const char* search_string) { char cpuinfo_line[256]; FILE* f = NULL; if ((f = fopen(file_name, "r")) != NULL) { - while (fgets(cpuinfo_line, sizeof(cpuinfo_line), f) != NULL) { + while (fgets(cpuinfo_line, sizeof(cpuinfo_line) - 1, f) != NULL) { if (strstr(cpuinfo_line, search_string) != NULL) { fclose(f); return kCpuHasMIPS_DSP; @@ -148,6 +160,8 @@ int cpu_info_ = kCpuInit; // cpu_info is not initialized yet. // Test environment variable for disabling CPU features. Any non-zero value // to disable. Zero ignored to make it easy to set the variable on/off. +#if !defined(__native_client__) && !defined(_M_ARM) + static bool TestEnv(const char* name) { const char* var = getenv(name); if (var) { @@ -157,23 +171,29 @@ static bool TestEnv(const char* name) { } return false; } +#else // nacl does not support getenv(). +static bool TestEnv(const char*) { + return false; +} +#endif -LIBYUV_API +LIBYUV_API SAFEBUFFERS int InitCpuFlags(void) { #if !defined(__CLR_VER) && defined(CPU_X86) - int cpu_info1[4] = { 0, 0, 0, 0 }; - int cpu_info7[4] = { 0, 0, 0, 0 }; - __cpuid(cpu_info1, 1); - __cpuid(cpu_info7, 7); + uint32 cpu_info1[4] = { 0, 0, 0, 0 }; + uint32 cpu_info7[4] = { 0, 0, 0, 0 }; + CpuId(1, 0, cpu_info1); + CpuId(7, 0, cpu_info7); cpu_info_ = ((cpu_info1[3] & 0x04000000) ? kCpuHasSSE2 : 0) | ((cpu_info1[2] & 0x00000200) ? kCpuHasSSSE3 : 0) | ((cpu_info1[2] & 0x00080000) ? kCpuHasSSE41 : 0) | ((cpu_info1[2] & 0x00100000) ? kCpuHasSSE42 : 0) | ((cpu_info7[1] & 0x00000200) ? kCpuHasERMS : 0) | + ((cpu_info1[2] & 0x00001000) ? kCpuHasFMA3 : 0) | kCpuHasX86; #ifdef HAS_XGETBV if ((cpu_info1[2] & 0x18000000) == 0x18000000 && // AVX and OSSave - (XGetBV(kXCR_XFEATURE_ENABLED_MASK) & 0x06) == 0x06) { // Saves YMM. + TestOsSaveYmm()) { // Saves YMM. cpu_info_ |= ((cpu_info7[1] & 0x00000020) ? kCpuHasAVX2 : 0) | kCpuHasAVX; } @@ -203,6 +223,9 @@ int InitCpuFlags(void) { if (TestEnv("LIBYUV_DISABLE_ERMS")) { cpu_info_ &= ~kCpuHasERMS; } + if (TestEnv("LIBYUV_DISABLE_FMA3")) { + cpu_info_ &= ~kCpuHasFMA3; + } #elif defined(__mips__) && defined(__linux__) // Linux mips parse text file for dsp detect. cpu_info_ = MipsCpuCaps("dsp"); // set kCpuHasMIPS_DSP. 
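TestOsSaveYmm above replaces the old XGetBV helpers: AVX/AVX2 paths are enabled only when CPUID leaf 1 reports both OSXSAVE and AVX (ECX bits 27 and 28) and XGETBV(0) shows the OS saving XMM and YMM state (XCR0 bits 1 and 2). A small x86-only sketch of that decision, mirroring the byte-encoded xgetbv the patch uses for GCC/Clang:

#include <cstdint>

// Read XCR0 via the raw xgetbv encoding (x86 GCC/Clang inline asm only).
static uint32_t ReadXcr0() {
  uint32_t xcr0;
  asm(".byte 0x0f, 0x01, 0xd0" : "=a"(xcr0) : "c"(0) : "edx");
  return xcr0;
}

// leaf1_ecx is ECX from CPUID leaf 1.
static bool OsAndCpuSupportAvx(uint32_t leaf1_ecx) {
  const uint32_t kAvxAndOsxsave = 0x18000000u;  // bits 27 (OSXSAVE) + 28 (AVX)
  return (leaf1_ecx & kAvxAndOsxsave) == kAvxAndOsxsave &&
         (ReadXcr0() & 6u) == 6u;               // XMM + YMM state saved by OS
}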
@@ -221,10 +244,11 @@ int InitCpuFlags(void) { cpu_info_ &= ~kCpuHasMIPS_DSPR2; } #elif defined(__arm__) -#if defined(__linux__) && (defined(__ARM_NEON__) || defined(LIBYUV_NEON)) +#if defined(__linux__) && (defined(__ARM_NEON__) || defined(LIBYUV_NEON)) && \ + !defined(__native_client__) // Linux arm parse text file for neon detect. cpu_info_ = ArmCpuCaps("/proc/cpuinfo"); -#elif defined(__ARM_NEON__) +#elif defined(__ARM_NEON__) || defined(__native_client__) // gcc -mfpu=neon defines __ARM_NEON__ // Enable Neon if you want support for Neon and Arm, and use MaskCpuFlags // to disable Neon on devices that do not have it. diff --git a/chromium/third_party/libyuv/source/format_conversion.cc b/chromium/third_party/libyuv/source/format_conversion.cc index 5b931b58773..cf7d6ea3af9 100644 --- a/chromium/third_party/libyuv/source/format_conversion.cc +++ b/chromium/third_party/libyuv/source/format_conversion.cc @@ -32,7 +32,7 @@ static int MakeSelectors(const int blue_index, const int green_index, const int red_index, uint32 dst_fourcc_bayer, - uint32 *index_map) { + uint32* index_map) { // Now build a lookup table containing the indices for the four pixels in each // 2x2 Bayer grid. switch (dst_fourcc_bayer) { @@ -280,7 +280,7 @@ int BayerToARGB(const uint8* src_bayer, int src_stride_bayer, } // Converts any Bayer RGB format to ARGB. -LIBYUV_API +LIBYUV_API SAFEBUFFERS int BayerToI420(const uint8* src_bayer, int src_stride_bayer, uint8* dst_y, int dst_stride_y, uint8* dst_u, int dst_stride_u, @@ -310,7 +310,7 @@ int BayerToI420(const uint8* src_bayer, int src_stride_bayer, uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = ARGBToYRow_C; -#if defined(HAS_ARGBTOYROW_SSSE3) +#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { ARGBToUVRow = ARGBToUVRow_Any_SSSE3; ARGBToYRow = ARGBToYRow_Any_SSSE3; @@ -380,7 +380,7 @@ int BayerToI420(const uint8* src_bayer, int src_stride_bayer, } // Convert I420 to Bayer. -LIBYUV_API +LIBYUV_API SAFEBUFFERS int I420ToBayer(const uint8* src_y, int src_stride_y, const uint8* src_u, int src_stride_u, const uint8* src_v, int src_stride_v, diff --git a/chromium/third_party/libyuv/source/mjpeg_decoder.cc b/chromium/third_party/libyuv/source/mjpeg_decoder.cc index 5d7296d7e73..bd423200531 100644 --- a/chromium/third_party/libyuv/source/mjpeg_decoder.cc +++ b/chromium/third_party/libyuv/source/mjpeg_decoder.cc @@ -420,9 +420,12 @@ void MJpegDecoder::ErrorHandler(j_common_ptr cinfo) { // recover from errors we use setjmp() as shown in their example. setjmp() is // C's implementation for the "call with current continuation" functionality // seen in some functional programming languages. + // A formatted message can be output, but is unsafe for release. 
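The NEON path above still detects the feature by parsing /proc/cpuinfo on Linux (now skipped under Native Client). A minimal sketch of that kind of scan, using the bounded fgets the patch also adopts in ArmCpuCaps and MipsCpuCaps; the function name is ours:

#include <cstdio>
#include <cstring>

// Return true if a "Features" line in cpuinfo_name mentions the word "neon".
static bool CpuinfoHasNeon(const char* cpuinfo_name) {
  FILE* f = fopen(cpuinfo_name, "r");
  if (!f) return false;
  char line[512];
  bool found = false;
  // sizeof(line) - 1 mirrors the patch's defensive bound; fgets itself
  // already NUL-terminates within the given size.
  while (fgets(line, sizeof(line) - 1, f)) {
    if (memcmp(line, "Features", 8) == 0) {
      const char* p = strstr(line, " neon");
      if (p && (p[5] == ' ' || p[5] == '\n')) { found = true; break; }
    }
  }
  fclose(f);
  return found;
}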
+#ifdef DEBUG char buf[JMSG_LENGTH_MAX]; (*cinfo->err->format_message)(cinfo, buf); // ERROR: Error in jpeglib: buf +#endif SetJmpErrorMgr* mgr = reinterpret_cast<SetJmpErrorMgr*>(cinfo->err); // This rewinds the call stack to the point of the corresponding setjmp() diff --git a/chromium/third_party/libyuv/source/planar_functions.cc b/chromium/third_party/libyuv/source/planar_functions.cc index 2f70331327c..114faaef627 100644 --- a/chromium/third_party/libyuv/source/planar_functions.cc +++ b/chromium/third_party/libyuv/source/planar_functions.cc @@ -28,13 +28,12 @@ LIBYUV_API void CopyPlane(const uint8* src_y, int src_stride_y, uint8* dst_y, int dst_stride_y, int width, int height) { - // Coalesce contiguous rows. + // Coalesce rows. if (src_stride_y == width && dst_stride_y == width) { - CopyPlane(src_y, 0, - dst_y, 0, - width * height, 1); - return; + width *= height; + height = 1; + src_stride_y = dst_stride_y = 0; } void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C; #if defined(HAS_COPYROW_X86) @@ -173,10 +172,16 @@ int I420ToI400(const uint8* src_y, int src_stride_y, return 0; } -// Mirror a plane of data +// Mirror a plane of data. void MirrorPlane(const uint8* src_y, int src_stride_y, uint8* dst_y, int dst_stride_y, int width, int height) { + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_stride_y = -src_stride_y; + } void (*MirrorRow)(const uint8* src, uint8* dst, int width) = MirrorRow_C; #if defined(HAS_MIRRORROW_NEON) if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) { @@ -222,16 +227,14 @@ int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2, src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2; src_stride_yuy2 = -src_stride_yuy2; } - // Coalesce contiguous rows. + // Coalesce rows. if (src_stride_yuy2 == width * 2 && dst_stride_y == width && dst_stride_u * 2 == width && dst_stride_v * 2 == width) { - return YUY2ToI422(src_yuy2, 0, - dst_y, 0, - dst_u, 0, - dst_v, 0, - width * height, 1); + width *= height; + height = 1; + src_stride_yuy2 = dst_stride_y = dst_stride_u = dst_stride_v = 0; } void (*YUY2ToUV422Row)(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v, int pix); @@ -302,16 +305,14 @@ int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy, src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy; src_stride_uyvy = -src_stride_uyvy; } - // Coalesce contiguous rows. + // Coalesce rows. if (src_stride_uyvy == width * 2 && dst_stride_y == width && dst_stride_u * 2 == width && dst_stride_v * 2 == width) { - return UYVYToI422(src_uyvy, 0, - dst_y, 0, - dst_u, 0, - dst_v, 0, - width * height, 1); + width *= height; + height = 1; + src_stride_uyvy = dst_stride_y = dst_stride_u = dst_stride_v = 0; } void (*UYVYToUV422Row)(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v, int pix); @@ -509,14 +510,13 @@ int ARGBBlend(const uint8* src_argb0, int src_stride_argb0, dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } - // Coalesce contiguous rows. + // Coalesce rows. 
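MirrorPlane above gains the negative-height convention the other planar routines already use: a negative height means the image is processed bottom-up, implemented by pointing at the last row and negating the stride. A small sketch of that normalization step, with standalone names:

#include <cstdint>

// Normalize a (pointer, stride, height) triple so the processing loop can
// always walk top-to-bottom. A negative height means "invert vertically".
static void NormalizeVerticalFlip(const uint8_t*& src, int& stride, int& height) {
  if (height < 0) {
    height = -height;
    src = src + (height - 1) * stride;  // start at the last row...
    stride = -stride;                   // ...and step backwards.
  }
}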
if (src_stride_argb0 == width * 4 && src_stride_argb1 == width * 4 && dst_stride_argb == width * 4) { - return ARGBBlend(src_argb0, 0, - src_argb1, 0, - dst_argb, 0, - width * height, 1); + width *= height; + height = 1; + src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0; } void (*ARGBBlendRow)(const uint8* src_argb, const uint8* src_argb1, uint8* dst_argb, int width) = GetARGBBlend(); @@ -545,16 +545,14 @@ int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0, dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } - // Coalesce contiguous rows. + // Coalesce rows. if (src_stride_argb0 == width * 4 && src_stride_argb1 == width * 4 && dst_stride_argb == width * 4) { - return ARGBMultiply(src_argb0, 0, - src_argb1, 0, - dst_argb, 0, - width * height, 1); + width *= height; + height = 1; + src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0; } - void (*ARGBMultiplyRow)(const uint8* src0, const uint8* src1, uint8* dst, int width) = ARGBMultiplyRow_C; #if defined(HAS_ARGBMULTIPLYROW_SSE2) @@ -607,16 +605,14 @@ int ARGBAdd(const uint8* src_argb0, int src_stride_argb0, dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } - // Coalesce contiguous rows. + // Coalesce rows. if (src_stride_argb0 == width * 4 && src_stride_argb1 == width * 4 && dst_stride_argb == width * 4) { - return ARGBAdd(src_argb0, 0, - src_argb1, 0, - dst_argb, 0, - width * height, 1); + width *= height; + height = 1; + src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0; } - void (*ARGBAddRow)(const uint8* src0, const uint8* src1, uint8* dst, int width) = ARGBAddRow_C; #if defined(HAS_ARGBADDROW_SSE2) && defined(_MSC_VER) @@ -674,16 +670,14 @@ int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0, dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } - // Coalesce contiguous rows. + // Coalesce rows. if (src_stride_argb0 == width * 4 && src_stride_argb1 == width * 4 && dst_stride_argb == width * 4) { - return ARGBSubtract(src_argb0, 0, - src_argb1, 0, - dst_argb, 0, - width * height, 1); + width *= height; + height = 1; + src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0; } - void (*ARGBSubtractRow)(const uint8* src0, const uint8* src1, uint8* dst, int width) = ARGBSubtractRow_C; #if defined(HAS_ARGBSUBTRACTROW_SSE2) @@ -739,16 +733,14 @@ int I422ToBGRA(const uint8* src_y, int src_stride_y, dst_bgra = dst_bgra + (height - 1) * dst_stride_bgra; dst_stride_bgra = -dst_stride_bgra; } - // Coalesce contiguous rows. + // Coalesce rows. if (src_stride_y == width && src_stride_u * 2 == width && src_stride_v * 2 == width && dst_stride_bgra == width * 4) { - return I422ToBGRA(src_y, 0, - src_u, 0, - src_v, 0, - dst_bgra, 0, - width * height, 1); + width *= height; + height = 1; + src_stride_y = src_stride_u = src_stride_v = dst_stride_bgra = 0; } void (*I422ToBGRARow)(const uint8* y_buf, const uint8* u_buf, @@ -810,16 +802,14 @@ int I422ToABGR(const uint8* src_y, int src_stride_y, dst_abgr = dst_abgr + (height - 1) * dst_stride_abgr; dst_stride_abgr = -dst_stride_abgr; } - // Coalesce contiguous rows. + // Coalesce rows. 
if (src_stride_y == width && src_stride_u * 2 == width && src_stride_v * 2 == width && dst_stride_abgr == width * 4) { - return I422ToABGR(src_y, 0, - src_u, 0, - src_v, 0, - dst_abgr, 0, - width * height, 1); + width *= height; + height = 1; + src_stride_y = src_stride_u = src_stride_v = dst_stride_abgr = 0; } void (*I422ToABGRRow)(const uint8* y_buf, const uint8* u_buf, @@ -873,16 +863,14 @@ int I422ToRGBA(const uint8* src_y, int src_stride_y, dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba; dst_stride_rgba = -dst_stride_rgba; } - // Coalesce contiguous rows. + // Coalesce rows. if (src_stride_y == width && src_stride_u * 2 == width && src_stride_v * 2 == width && dst_stride_rgba == width * 4) { - return I422ToRGBA(src_y, 0, - src_u, 0, - src_v, 0, - dst_rgba, 0, - width * height, 1); + width *= height; + height = 1; + src_stride_y = src_stride_u = src_stride_v = dst_stride_rgba = 0; } void (*I422ToRGBARow)(const uint8* y_buf, const uint8* u_buf, @@ -1016,12 +1004,11 @@ LIBYUV_API void SetPlane(uint8* dst_y, int dst_stride_y, int width, int height, uint32 value) { - // Coalesce contiguous rows. + // Coalesce rows. if (dst_stride_y == width) { - SetPlane(dst_y, 0, - width * height, 1, - value); - return; + width *= height; + height = 1; + dst_stride_y = 0; } void (*SetRow)(uint8* dst, uint32 value, int pix) = SetRow_C; #if defined(HAS_SETROW_NEON) @@ -1084,27 +1071,27 @@ int ARGBRect(uint8* dst_argb, int dst_stride_argb, dst_x < 0 || dst_y < 0) { return -1; } - // Coalesce contiguous rows. + dst_argb += dst_y * dst_stride_argb + dst_x * 4; + // Coalesce rows. if (dst_stride_argb == width * 4) { - return ARGBRect(dst_argb, dst_stride_argb, - dst_x, dst_y, - width * height, 1, value); + width *= height; + height = 1; + dst_stride_argb = 0; } - uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; #if defined(HAS_SETROW_NEON) if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16) && - IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride_argb, 16)) { - ARGBSetRows_NEON(dst, value, width, dst_stride_argb, height); + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + ARGBSetRows_NEON(dst_argb, value, width, dst_stride_argb, height); return 0; } #endif #if defined(HAS_SETROW_X86) if (TestCpuFlag(kCpuHasX86)) { - ARGBSetRows_X86(dst, value, width, dst_stride_argb, height); + ARGBSetRows_X86(dst_argb, value, width, dst_stride_argb, height); return 0; } #endif - ARGBSetRows_C(dst, value, width, dst_stride_argb, height); + ARGBSetRows_C(dst_argb, value, width, dst_stride_argb, height); return 0; } @@ -1133,12 +1120,12 @@ int ARGBAttenuate(const uint8* src_argb, int src_stride_argb, src_argb = src_argb + (height - 1) * src_stride_argb; src_stride_argb = -src_stride_argb; } - // Coalesce contiguous rows. + // Coalesce rows. 
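In the ARGBRect hunk above, the destination pointer is now offset by dst_x/dst_y before the coalescing test, so the "treat as one row" shortcut fills the requested sub-rectangle rather than starting at the plane origin. A byte-plane sketch of that ordering (the real function fills 4-byte ARGB pixels):

#include <cstdint>
#include <cstring>

// Fill a width x height sub-rectangle of a byte plane with `value`.
static void FillRect8(uint8_t* dst, int dst_stride,
                      int dst_x, int dst_y, int width, int height,
                      uint8_t value) {
  dst += dst_y * dst_stride + dst_x;  // move to the rectangle first,
  if (dst_stride == width) {          // only then consider coalescing rows.
    width *= height;
    height = 1;
    dst_stride = 0;
  }
  for (int y = 0; y < height; ++y) {
    memset(dst, value, width);
    dst += dst_stride;
  }
}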
if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) { - return ARGBAttenuate(src_argb, 0, - dst_argb, 0, - width * height, 1); + width *= height; + height = 1; + src_stride_argb = dst_stride_argb = 0; } void (*ARGBAttenuateRow)(const uint8* src_argb, uint8* dst_argb, int width) = ARGBAttenuateRow_C; @@ -1153,9 +1140,7 @@ int ARGBAttenuate(const uint8* src_argb, int src_stride_argb, } #endif #if defined(HAS_ARGBATTENUATEROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && width >= 4 && - IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + if (TestCpuFlag(kCpuHasSSSE3) && width >= 4) { ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3; if (IS_ALIGNED(width, 4)) { ARGBAttenuateRow = ARGBAttenuateRow_SSSE3; @@ -1200,19 +1185,17 @@ int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb, src_argb = src_argb + (height - 1) * src_stride_argb; src_stride_argb = -src_stride_argb; } - // Coalesce contiguous rows. + // Coalesce rows. if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) { - return ARGBUnattenuate(src_argb, 0, - dst_argb, 0, - width * height, 1); + width *= height; + height = 1; + src_stride_argb = dst_stride_argb = 0; } void (*ARGBUnattenuateRow)(const uint8* src_argb, uint8* dst_argb, int width) = ARGBUnattenuateRow_C; #if defined(HAS_ARGBUNATTENUATEROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && width >= 4 && - IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + if (TestCpuFlag(kCpuHasSSE2) && width >= 4) { ARGBUnattenuateRow = ARGBUnattenuateRow_Any_SSE2; if (IS_ALIGNED(width, 4)) { ARGBUnattenuateRow = ARGBUnattenuateRow_SSE2; @@ -1250,12 +1233,12 @@ int ARGBGrayTo(const uint8* src_argb, int src_stride_argb, src_argb = src_argb + (height - 1) * src_stride_argb; src_stride_argb = -src_stride_argb; } - // Coalesce contiguous rows. + // Coalesce rows. if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) { - return ARGBGrayTo(src_argb, 0, - dst_argb, 0, - width * height, 1); + width *= height; + height = 1; + src_stride_argb = dst_stride_argb = 0; } void (*ARGBGrayRow)(const uint8* src_argb, uint8* dst_argb, int width) = ARGBGrayRow_C; @@ -1287,11 +1270,11 @@ int ARGBGray(uint8* dst_argb, int dst_stride_argb, if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0) { return -1; } - // Coalesce contiguous rows. + // Coalesce rows. if (dst_stride_argb == width * 4) { - return ARGBGray(dst_argb, dst_stride_argb, - dst_x, dst_y, - width * height, 1); + width *= height; + height = 1; + dst_stride_argb = 0; } void (*ARGBGrayRow)(const uint8* src_argb, uint8* dst_argb, int width) = ARGBGrayRow_C; @@ -1320,11 +1303,11 @@ int ARGBSepia(uint8* dst_argb, int dst_stride_argb, if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0) { return -1; } - // Coalesce contiguous rows. + // Coalesce rows. if (dst_stride_argb == width * 4) { - return ARGBSepia(dst_argb, dst_stride_argb, - dst_x, dst_y, - width * height, 1); + width *= height; + height = 1; + dst_stride_argb = 0; } void (*ARGBSepiaRow)(uint8* dst_argb, int width) = ARGBSepiaRow_C; #if defined(HAS_ARGBSEPIAROW_SSSE3) @@ -1345,24 +1328,30 @@ int ARGBSepia(uint8* dst_argb, int dst_stride_argb, return 0; } -// Apply a 4x3 matrix rotation to each ARGB pixel. +// Apply a 4x4 matrix to each ARGB pixel. +// Note: Normally for shading, but can be used to swizzle or invert. 
LIBYUV_API -int ARGBColorMatrix(uint8* dst_argb, int dst_stride_argb, +int ARGBColorMatrix(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, const int8* matrix_argb, - int dst_x, int dst_y, int width, int height) { - if (!dst_argb || !matrix_argb || width <= 0 || height <= 0 || - dst_x < 0 || dst_y < 0) { + int width, int height) { + if (!src_argb || !dst_argb || !matrix_argb || width <= 0 || height == 0) { return -1; } - // Coalesce contiguous rows. - if (dst_stride_argb == width * 4) { - return ARGBColorMatrix(dst_argb, dst_stride_argb, - matrix_argb, - dst_x, dst_y, - width * height, 1); + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_argb = dst_stride_argb = 0; } - void (*ARGBColorMatrixRow)(uint8* dst_argb, const int8* matrix_argb, - int width) = ARGBColorMatrixRow_C; + void (*ARGBColorMatrixRow)(const uint8* src_argb, uint8* dst_argb, + const int8* matrix_argb, int width) = ARGBColorMatrixRow_C; #if defined(HAS_ARGBCOLORMATRIXROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8) && IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { @@ -1373,14 +1362,48 @@ int ARGBColorMatrix(uint8* dst_argb, int dst_stride_argb, ARGBColorMatrixRow = ARGBColorMatrixRow_NEON; } #endif - uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; for (int y = 0; y < height; ++y) { - ARGBColorMatrixRow(dst, matrix_argb, width); - dst += dst_stride_argb; + ARGBColorMatrixRow(src_argb, dst_argb, matrix_argb, width); + src_argb += src_stride_argb; + dst_argb += dst_stride_argb; } return 0; } +// Apply a 4x3 matrix to each ARGB pixel. +// Deprecated. +LIBYUV_API SAFEBUFFERS +int RGBColorMatrix(uint8* dst_argb, int dst_stride_argb, + const int8* matrix_rgb, + int dst_x, int dst_y, int width, int height) { + if (!dst_argb || !matrix_rgb || width <= 0 || height <= 0 || + dst_x < 0 || dst_y < 0) { + return -1; + } + + // Convert 4x3 7 bit matrix to 4x4 6 bit matrix. + SIMD_ALIGNED(int8 matrix_argb[16]); + matrix_argb[0] = matrix_rgb[0] / 2; + matrix_argb[1] = matrix_rgb[1] / 2; + matrix_argb[2] = matrix_rgb[2] / 2; + matrix_argb[3] = matrix_rgb[3] / 2; + matrix_argb[4] = matrix_rgb[4] / 2; + matrix_argb[5] = matrix_rgb[5] / 2; + matrix_argb[6] = matrix_rgb[6] / 2; + matrix_argb[7] = matrix_rgb[7] / 2; + matrix_argb[8] = matrix_rgb[8] / 2; + matrix_argb[9] = matrix_rgb[9] / 2; + matrix_argb[10] = matrix_rgb[10] / 2; + matrix_argb[11] = matrix_rgb[11] / 2; + matrix_argb[14] = matrix_argb[13] = matrix_argb[12] = 0; + matrix_argb[15] = 64; // 1.0 + + uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; + return ARGBColorMatrix(const_cast<const uint8*>(dst), dst_stride_argb, + dst, dst_stride_argb, + &matrix_argb[0], width, height); +} + // Apply a color table each ARGB pixel. // Table contains 256 ARGB values. LIBYUV_API @@ -1391,12 +1414,11 @@ int ARGBColorTable(uint8* dst_argb, int dst_stride_argb, dst_x < 0 || dst_y < 0) { return -1; } - // Coalesce contiguous rows. + // Coalesce rows. 
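ARGBColorMatrix above now takes separate source and destination pointers and a 4x4 matrix of signed 6-bit fixed-point coefficients; the deprecated RGBColorMatrix wrapper converts the old 4x3, 7-bit matrix by halving each coefficient and appending an identity alpha row (64 representing 1.0). A one-pixel reference of how such a matrix would be applied under that convention; the channel ordering and rounding are our reading, not the exact libyuv kernel:

#include <cstdint>
#include <algorithm>

// Apply a 4x4 fixed-point color matrix (64 == 1.0, so results scale back
// with >> 6) to one 4-byte pixel, clamping each channel to 0..255.
static void ColorMatrixPixel(const uint8_t src[4], uint8_t dst[4],
                             const int8_t matrix[16]) {
  for (int out = 0; out < 4; ++out) {
    int sum = 0;
    for (int in = 0; in < 4; ++in) {
      sum += matrix[out * 4 + in] * src[in];
    }
    dst[out] = static_cast<uint8_t>(std::min(255, std::max(0, sum >> 6)));
  }
}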
if (dst_stride_argb == width * 4) { - return ARGBColorTable(dst_argb, dst_stride_argb, - table_argb, - dst_x, dst_y, - width * height, 1); + width *= height; + height = 1; + dst_stride_argb = 0; } void (*ARGBColorTableRow)(uint8* dst_argb, const uint8* table_argb, int width) = ARGBColorTableRow_C; @@ -1413,6 +1435,37 @@ int ARGBColorTable(uint8* dst_argb, int dst_stride_argb, return 0; } +// Apply a color table each ARGB pixel but preserve destination alpha. +// Table contains 256 ARGB values. +LIBYUV_API +int RGBColorTable(uint8* dst_argb, int dst_stride_argb, + const uint8* table_argb, + int dst_x, int dst_y, int width, int height) { + if (!dst_argb || !table_argb || width <= 0 || height <= 0 || + dst_x < 0 || dst_y < 0) { + return -1; + } + // Coalesce rows. + if (dst_stride_argb == width * 4) { + width *= height; + height = 1; + dst_stride_argb = 0; + } + void (*RGBColorTableRow)(uint8* dst_argb, const uint8* table_argb, + int width) = RGBColorTableRow_C; +#if defined(HAS_RGBCOLORTABLEROW_X86) + if (TestCpuFlag(kCpuHasX86)) { + RGBColorTableRow = RGBColorTableRow_X86; + } +#endif + uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; + for (int y = 0; y < height; ++y) { + RGBColorTableRow(dst, table_argb, width); + dst += dst_stride_argb; + } + return 0; +} + // ARGBQuantize is used to posterize art. // e.g. rgb / qvalue * qvalue + qvalue / 2 // But the low levels implement efficiently with 3 parameters, and could be @@ -1430,12 +1483,11 @@ int ARGBQuantize(uint8* dst_argb, int dst_stride_argb, interval_size < 1 || interval_size > 255) { return -1; } - // Coalesce contiguous rows. + // Coalesce rows. if (dst_stride_argb == width * 4) { - return ARGBQuantize(dst_argb, dst_stride_argb, - scale, interval_size, interval_offset, - dst_x, dst_y, - width * height, 1); + width *= height; + height = 1; + dst_stride_argb = 0; } void (*ARGBQuantizeRow)(uint8* dst_argb, int scale, int interval_size, int interval_offset, int width) = ARGBQuantizeRow_C; @@ -1496,14 +1548,28 @@ int ARGBBlur(const uint8* src_argb, int src_stride_argb, if (!src_argb || !dst_argb || width <= 0 || height == 0) { return -1; } - void (*ComputeCumulativeSumRow)(const uint8* row, int32* cumsum, + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + if (radius > height) { + radius = height; + } + if (radius > (width / 2 - 1)) { + radius = width / 2 - 1; + } + if (radius <= 0) { + return -1; + } + void (*ComputeCumulativeSumRow)(const uint8 *row, int32 *cumsum, const int32* previous_cumsum, int width) = ComputeCumulativeSumRow_C; - void (*CUMULATIVESUMTOAVERAGEROW)(const int32* topleft, const int32* botleft, + void (*CumulativeSumToAverageRow)(const int32* topleft, const int32* botleft, int width, int area, uint8* dst, int count) = CumulativeSumToAverageRow_C; #if defined(HAS_CUMULATIVESUMTOAVERAGEROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { ComputeCumulativeSumRow = ComputeCumulativeSumRow_SSE2; - CUMULATIVESUMTOAVERAGEROW = CumulativeSumToAverageRow_SSE2; + CumulativeSumToAverageRow = CumulativeSumToAverageRow_SSE2; } #endif // Compute enough CumulativeSum for first row to be blurred. 
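The ARGBQuantize hunk above keeps the existing posterize math, rgb / q * q + q / 2, expressed through the scale, interval_size and interval_offset parameters. A tiny worked form of the simple version (the parameterized row kernel is more general than this):

#include <cstdint>

// Posterize one channel value to q-sized levels: v / q * q + q / 2.
// e.g. q = 64 maps 0..255 onto {32, 96, 160, 224}.
static uint8_t Posterize(uint8_t v, int q) {
  return static_cast<uint8_t>(v / q * q + q / 2);
}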
After this @@ -1548,24 +1614,24 @@ int ARGBBlur(const uint8* src_argb, int src_stride_argb, int boxwidth = radius * 4; int x; for (x = 0; x < radius + 1; ++x) { - CUMULATIVESUMTOAVERAGEROW(cumsum_top_row, cumsum_bot_row, - boxwidth, area, &dst_argb[x * 4], 1); + CumulativeSumToAverageRow(cumsum_top_row, cumsum_bot_row, + boxwidth, area, &dst_argb[x * 4], 1); area += (bot_y - top_y); boxwidth += 4; } // Middle unclipped. int n = (width - 1) - radius - x + 1; - CUMULATIVESUMTOAVERAGEROW(cumsum_top_row, cumsum_bot_row, - boxwidth, area, &dst_argb[x * 4], n); + CumulativeSumToAverageRow(cumsum_top_row, cumsum_bot_row, + boxwidth, area, &dst_argb[x * 4], n); // Right clipped. for (x += n; x <= width - 1; ++x) { area -= (bot_y - top_y); boxwidth -= 4; - CUMULATIVESUMTOAVERAGEROW(cumsum_top_row + (x - radius - 1) * 4, - cumsum_bot_row + (x - radius - 1) * 4, - boxwidth, area, &dst_argb[x * 4], 1); + CumulativeSumToAverageRow(cumsum_top_row + (x - radius - 1) * 4, + cumsum_bot_row + (x - radius - 1) * 4, + boxwidth, area, &dst_argb[x * 4], 1); } dst_argb += dst_stride_argb; } @@ -1585,13 +1651,12 @@ int ARGBShade(const uint8* src_argb, int src_stride_argb, src_argb = src_argb + (height - 1) * src_stride_argb; src_stride_argb = -src_stride_argb; } - // Coalesce contiguous rows. + // Coalesce rows. if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) { - return ARGBShade(src_argb, 0, - dst_argb, 0, - width * height, 1, - value); + width *= height; + height = 1; + src_stride_argb = dst_stride_argb = 0; } void (*ARGBShadeRow)(const uint8* src_argb, uint8* dst_argb, int width, uint32 value) = ARGBShadeRow_C; @@ -1616,8 +1681,6 @@ int ARGBShade(const uint8* src_argb, int src_stride_argb, } // Interpolate 2 ARGB images by specified amount (0 to 255). -// TODO(fbarchard): Consider selecting a specialization for interpolation so -// row function doesn't need to check interpolation on each row. LIBYUV_API int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0, const uint8* src_argb1, int src_stride_argb1, @@ -1632,15 +1695,13 @@ int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0, dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } - // Coalesce contiguous rows. + // Coalesce rows. if (src_stride_argb0 == width * 4 && src_stride_argb1 == width * 4 && dst_stride_argb == width * 4) { - return ARGBInterpolate(src_argb0, 0, - src_argb1, 0, - dst_argb, 0, - width * height, 1, - interpolation); + width *= height; + height = 1; + src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0; } void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr, ptrdiff_t src_stride, int dst_width, @@ -1671,6 +1732,14 @@ int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0, } } #endif +#if defined(HAS_INTERPOLATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && width >= 8) { + InterpolateRow = InterpolateRow_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + InterpolateRow = InterpolateRow_AVX2; + } + } +#endif #if defined(HAS_INTERPOLATEROW_NEON) if (TestCpuFlag(kCpuHasNEON) && width >= 4) { InterpolateRow = InterpolateRow_Any_NEON; @@ -1713,16 +1782,23 @@ int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra, src_bgra = src_bgra + (height - 1) * src_stride_bgra; src_stride_bgra = -src_stride_bgra; } - // Coalesce contiguous rows. + // Coalesce rows. 
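ARGBInterpolate above gains an AVX2 row path; the operation itself is a per-byte blend of two images by a fraction in 0..255. A scalar reference of that blend; the rounding is illustrative and the SIMD kernels may differ in the low bit:

#include <cstdint>

// Blend src0 and src1 into dst: fraction 0 returns src0, 255 is (nearly) src1.
static void InterpolateRowScalar(uint8_t* dst, const uint8_t* src0,
                                 const uint8_t* src1, int count,
                                 int fraction /* 0..255 */) {
  for (int i = 0; i < count; ++i) {
    dst[i] = static_cast<uint8_t>(
        (src0[i] * (256 - fraction) + src1[i] * fraction) >> 8);
  }
}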
if (src_stride_bgra == width * 4 && dst_stride_argb == width * 4) { - return ARGBShuffle(src_bgra, 0, - dst_argb, 0, - shuffler, - width * height, 1); + width *= height; + height = 1; + src_stride_bgra = dst_stride_argb = 0; } void (*ARGBShuffleRow)(const uint8* src_bgra, uint8* dst_argb, const uint8* shuffler, int pix) = ARGBShuffleRow_C; +#if defined(HAS_ARGBSHUFFLEROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && width >= 4) { + ARGBShuffleRow = ARGBShuffleRow_Any_SSE2; + if (IS_ALIGNED(width, 4)) { + ARGBShuffleRow = ARGBShuffleRow_SSE2; + } + } +#endif #if defined(HAS_ARGBSHUFFLEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { ARGBShuffleRow = ARGBShuffleRow_Any_SSSE3; @@ -1761,12 +1837,17 @@ int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra, } // Sobel ARGB effect. -LIBYUV_API -int ARGBSobel(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { +static SAFEBUFFERS +int ARGBSobelize(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int width, int height, + void (*SobelRow)(const uint8* src_sobelx, + const uint8* src_sobely, + uint8* dst, int width)) { + const int kMaxRow = kMaxStride / 4; + const int kEdge = 16; // Extra pixels at start of row for extrude/align. if (!src_argb || !dst_argb || - width <= 0 || height == 0 || width > (kMaxStride / 4)) { + width <= 0 || height == 0 || width > (kMaxRow - kEdge)) { return -1; } // Negative height means invert the image. @@ -1777,7 +1858,16 @@ int ARGBSobel(const uint8* src_argb, int src_stride_argb, } // ARGBToBayer used to select G channel from ARGB. void (*ARGBToBayerRow)(const uint8* src_argb, uint8* dst_bayer, - uint32 selector, int pix) = ARGBToBayerRow_C; + uint32 selector, int pix) = ARGBToBayerGGRow_C; +#if defined(HAS_ARGBTOBAYERGGROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && width >= 8 && + IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) { + ARGBToBayerRow = ARGBToBayerGGRow_Any_SSE2; + if (IS_ALIGNED(width, 8)) { + ARGBToBayerRow = ARGBToBayerGGRow_SSE2; + } + } +#endif #if defined(HAS_ARGBTOBAYERROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && width >= 8 && IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) { @@ -1786,19 +1876,20 @@ int ARGBSobel(const uint8* src_argb, int src_stride_argb, ARGBToBayerRow = ARGBToBayerRow_SSSE3; } } -#elif defined(HAS_ARGBTOBAYERROW_NEON) +#endif +#if defined(HAS_ARGBTOBAYERGGROW_NEON) if (TestCpuFlag(kCpuHasNEON) && width >= 8) { - ARGBToBayerRow = ARGBToBayerRow_Any_NEON; + ARGBToBayerRow = ARGBToBayerGGRow_Any_NEON; if (IS_ALIGNED(width, 8)) { - ARGBToBayerRow = ARGBToBayerRow_NEON; + ARGBToBayerRow = ARGBToBayerGGRow_NEON; } } #endif void (*SobelYRow)(const uint8* src_y0, const uint8* src_y1, uint8* dst_sobely, int width) = SobelYRow_C; -#if defined(HAS_SOBELYROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - SobelYRow = SobelYRow_SSSE3; +#if defined(HAS_SOBELYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + SobelYRow = SobelYRow_SSE2; } #endif #if defined(HAS_SOBELYROW_NEON) @@ -1809,9 +1900,9 @@ int ARGBSobel(const uint8* src_argb, int src_stride_argb, void (*SobelXRow)(const uint8* src_y0, const uint8* src_y1, const uint8* src_y2, uint8* dst_sobely, int width) = SobelXRow_C; -#if defined(HAS_SOBELXROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - SobelXRow = SobelXRow_SSSE3; +#if defined(HAS_SOBELXROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + SobelXRow = SobelXRow_SSE2; } #endif #if defined(HAS_SOBELXROW_NEON) @@ -1819,35 +1910,22 @@ int ARGBSobel(const uint8* src_argb, int 
src_stride_argb, SobelXRow = SobelXRow_NEON; } #endif - void (*SobelRow)(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_argb, int width) = SobelRow_C; -#if defined(HAS_SOBELROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { - SobelRow = SobelRow_SSE2; - } -#endif -#if defined(HAS_SOBELROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { - SobelRow = SobelRow_NEON; - } -#endif - - const int kEdge = 16; // Extra pixels at start of row for extrude/align. - SIMD_ALIGNED(uint8 row_y[(kMaxStride / 4 + kEdge) * 3 + kEdge]); - SIMD_ALIGNED(uint8 row_sobelx[kMaxStride / 4]); - SIMD_ALIGNED(uint8 row_sobely[kMaxStride / 4]); + // 3 rows with edges before/after. + SIMD_ALIGNED(uint8 row_y[kEdge + kMaxRow * 3]); + SIMD_ALIGNED(uint8 row_sobelx[kMaxRow]); + SIMD_ALIGNED(uint8 row_sobely[kMaxRow]); // Convert first row. uint8* row_y0 = row_y + kEdge; - uint8* row_y1 = row_y0 + kMaxStride / 4; - uint8* row_y2 = row_y1 + kMaxStride / 4; + uint8* row_y1 = row_y0 + kMaxRow; + uint8* row_y2 = row_y1 + kMaxRow; ARGBToBayerRow(src_argb, row_y0, 0x0d090501, width); row_y0[-1] = row_y0[0]; - row_y0[width] = row_y0[width - 1]; + memset(row_y0 + width, row_y0[width - 1], 16); // extrude 16 pixels. ARGBToBayerRow(src_argb, row_y1, 0x0d090501, width); row_y1[-1] = row_y1[0]; - row_y1[width] = row_y1[width - 1]; + memset(row_y1 + width, row_y1[width - 1], 16); + memset(row_y2 + width, 0, 16); for (int y = 0; y < height; ++y) { // Convert next row of ARGB to Y. @@ -1873,14 +1951,80 @@ int ARGBSobel(const uint8* src_argb, int src_stride_argb, return 0; } +// Sobel ARGB effect. +LIBYUV_API +int ARGBSobel(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + void (*SobelRow)(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_argb, int width) = SobelRow_C; +#if defined(HAS_SOBELROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + SobelRow = SobelRow_SSE2; + } +#endif +#if defined(HAS_SOBELROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { + SobelRow = SobelRow_NEON; + } +#endif + return ARGBSobelize(src_argb, src_stride_argb, dst_argb, dst_stride_argb, + width, height, SobelRow); +} + +// Sobel ARGB effect with planar output. +LIBYUV_API +int ARGBSobelToPlane(const uint8* src_argb, int src_stride_argb, + uint8* dst_y, int dst_stride_y, + int width, int height) { + void (*SobelToPlaneRow)(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_, int width) = SobelToPlaneRow_C; +#if defined(HAS_SOBELTOPLANEROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) && + IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + SobelToPlaneRow = SobelToPlaneRow_SSE2; + } +#endif +#if defined(HAS_SOBELTOPLANEROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) { + SobelToPlaneRow = SobelToPlaneRow_NEON; + } +#endif + return ARGBSobelize(src_argb, src_stride_argb, dst_y, dst_stride_y, + width, height, SobelToPlaneRow); +} + // SobelXY ARGB effect. // Similar to Sobel, but also stores Sobel X in R and Sobel Y in B. G = Sobel. 
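The Sobel routines above are refactored so ARGBSobel, the new ARGBSobelToPlane and ARGBSobelXY share one ARGBSobelize driver and differ only in the row function that combines the X and Y gradients. A scalar sketch of the combining step for the new planar output, as we read it (sum of the gradient magnitudes, clamped to 255); the real SobelToPlaneRow kernels may differ in detail:

#include <cstdint>
#include <algorithm>

// Combine per-pixel X and Y gradient magnitudes into one Sobel value.
static void SobelToPlaneRowScalar(const uint8_t* sobelx, const uint8_t* sobely,
                                  uint8_t* dst, int width) {
  for (int i = 0; i < width; ++i) {
    dst[i] = static_cast<uint8_t>(std::min(255, sobelx[i] + sobely[i]));
  }
}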
LIBYUV_API int ARGBSobelXY(const uint8* src_argb, int src_stride_argb, uint8* dst_argb, int dst_stride_argb, int width, int height) { - if (!src_argb || !dst_argb || - width <= 0 || height == 0 || width > kMaxStride / 4) { + void (*SobelXYRow)(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_argb, int width) = SobelXYRow_C; +#if defined(HAS_SOBELXYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + SobelXYRow = SobelXYRow_SSE2; + } +#endif +#if defined(HAS_SOBELXYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { + SobelXYRow = SobelXYRow_NEON; + } +#endif + return ARGBSobelize(src_argb, src_stride_argb, dst_argb, dst_stride_argb, + width, height, SobelXYRow); +} + +// Apply a 4x4 polynomial to each ARGB pixel. +LIBYUV_API +int ARGBPolynomial(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + const float* poly, + int width, int height) { + if (!src_argb || !dst_argb || !poly || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -1889,99 +2033,156 @@ int ARGBSobelXY(const uint8* src_argb, int src_stride_argb, src_argb = src_argb + (height - 1) * src_stride_argb; src_stride_argb = -src_stride_argb; } - // ARGBToBayer used to select G channel from ARGB. - void (*ARGBToBayerRow)(const uint8* src_argb, uint8* dst_bayer, - uint32 selector, int pix) = ARGBToBayerRow_C; -#if defined(HAS_ARGBTOBAYERROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && width >= 8 && - IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) { - ARGBToBayerRow = ARGBToBayerRow_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - ARGBToBayerRow = ARGBToBayerRow_SSSE3; - } + // Coalesce rows. + if (src_stride_argb == width * 4 && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_argb = dst_stride_argb = 0; } -#elif defined(HAS_ARGBTOBAYERROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && width >= 8) { - ARGBToBayerRow = ARGBToBayerRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGBToBayerRow = ARGBToBayerRow_NEON; - } + void (*ARGBPolynomialRow)(const uint8* src_argb, + uint8* dst_argb, const float* poly, + int width) = ARGBPolynomialRow_C; +#if defined(HAS_ARGBPOLYNOMIALROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 2)) { + ARGBPolynomialRow = ARGBPolynomialRow_SSE2; } #endif - void (*SobelYRow)(const uint8* src_y0, const uint8* src_y1, - uint8* dst_sobely, int width) = SobelYRow_C; -#if defined(HAS_SOBELYROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - SobelYRow = SobelYRow_SSSE3; +#if defined(HAS_ARGBPOLYNOMIALROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && TestCpuFlag(kCpuHasFMA3) && + IS_ALIGNED(width, 2)) { + ARGBPolynomialRow = ARGBPolynomialRow_AVX2; } #endif -#if defined(HAS_SOBELYROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - SobelYRow = SobelYRow_NEON; + for (int y = 0; y < height; ++y) { + ARGBPolynomialRow(src_argb, dst_argb, poly, width); + src_argb += src_stride_argb; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Apply a lumacolortable to each ARGB pixel. +LIBYUV_API +int ARGBLumaColorTable(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + const uint8* luma, + int width, int height) { + if (!src_argb || !dst_argb || !luma || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
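The new ARGBPolynomial above applies a per-channel cubic polynomial, with an SSE2 path and an AVX2 path gated on the new FMA3 flag. A scalar reference of the per-channel evaluation; the coefficient layout (four floats per channel) is an assumption drawn from the row function's signature, not confirmed by the diff:

#include <cstdint>
#include <algorithm>

// Per-channel cubic: v' = c0 + c1*v + c2*v^2 + c3*v^3, clamped to 0..255.
static uint8_t PolyChannel(uint8_t v, const float c[4]) {
  float x = static_cast<float>(v);
  float r = c[0] + c[1] * x + c[2] * x * x + c[3] * x * x * x;
  return static_cast<uint8_t>(std::min(255.0f, std::max(0.0f, r)));
}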
+ if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_argb = dst_stride_argb = 0; + } + void (*ARGBLumaColorTableRow)(const uint8* src_argb, uint8* dst_argb, + int width, const uint8* luma, const uint32 lumacoeff) = + ARGBLumaColorTableRow_C; +#if defined(HAS_ARGBLUMACOLORTABLEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4)) { + ARGBLumaColorTableRow = ARGBLumaColorTableRow_SSSE3; } #endif - void (*SobelXRow)(const uint8* src_y0, const uint8* src_y1, - const uint8* src_y2, uint8* dst_sobely, int width) = - SobelXRow_C; -#if defined(HAS_SOBELXROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - SobelXRow = SobelXRow_SSSE3; + for (int y = 0; y < height; ++y) { + ARGBLumaColorTableRow(src_argb, dst_argb, width, luma, 0x00264b0f); + src_argb += src_stride_argb; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Copy Alpha from one ARGB image to another. +LIBYUV_API +int ARGBCopyAlpha(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + if (!src_argb || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_argb = dst_stride_argb = 0; + } + void (*ARGBCopyAlphaRow)(const uint8* src_argb, uint8* dst_argb, int width) = + ARGBCopyAlphaRow_C; +#if defined(HAS_ARGBCOPYALPHAROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && + IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16) && + IS_ALIGNED(width, 8)) { + ARGBCopyAlphaRow = ARGBCopyAlphaRow_SSE2; } #endif -#if defined(HAS_SOBELXROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - SobelXRow = SobelXRow_NEON; +#if defined(HAS_ARGBCOPYALPHAROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 16)) { + ARGBCopyAlphaRow = ARGBCopyAlphaRow_AVX2; } #endif - void (*SobelXYRow)(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_argb, int width) = SobelXYRow_C; -#if defined(HAS_SOBELXYROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { - SobelXYRow = SobelXYRow_SSE2; + for (int y = 0; y < height; ++y) { + ARGBCopyAlphaRow(src_argb, dst_argb, width); + src_argb += src_stride_argb; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Copy a planar Y channel to the alpha channel of a destination ARGB image. +LIBYUV_API +int ARGBCopyYToAlpha(const uint8* src_y, int src_stride_y, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + if (!src_y || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_stride_y = -src_stride_y; + } + // Coalesce rows. 
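The new ARGBCopyAlpha and ARGBCopyYToAlpha above copy only the alpha bytes between images (or from a Y plane into the alpha channel), with SSE2/AVX2 row paths. The scalar form is a strided byte copy over every fourth byte; this sketch assumes libyuv's in-memory ARGB layout with alpha at byte offset 3 of each pixel:

#include <cstdint>

// Copy the alpha byte of each ARGB pixel from src to dst.
static void CopyAlphaRowScalar(const uint8_t* src_argb, uint8_t* dst_argb,
                               int width) {
  for (int i = 0; i < width; ++i) {
    dst_argb[i * 4 + 3] = src_argb[i * 4 + 3];
  }
}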
+ if (src_stride_y == width && + dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_y = dst_stride_argb = 0; + } + void (*ARGBCopyYToAlphaRow)(const uint8* src_y, uint8* dst_argb, int width) = + ARGBCopyYToAlphaRow_C; +#if defined(HAS_ARGBCOPYYTOALPHAROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && + IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16) && + IS_ALIGNED(width, 8)) { + ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_SSE2; } #endif -#if defined(HAS_SOBELXYROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { - SobelXYRow = SobelXYRow_NEON; +#if defined(HAS_ARGBCOPYYTOALPHAROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 16)) { + ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_AVX2; } #endif - - const int kEdge = 16; // Extra pixels at start of row for extrude/align. - SIMD_ALIGNED(uint8 row_y[(kMaxStride / 4 + kEdge) * 3 + kEdge]); - SIMD_ALIGNED(uint8 row_sobelx[kMaxStride / 4]); - SIMD_ALIGNED(uint8 row_sobely[kMaxStride / 4]); - - // Convert first row. - uint8* row_y0 = row_y + kEdge; - uint8* row_y1 = row_y0 + kMaxStride / 4; - uint8* row_y2 = row_y1 + kMaxStride / 4; - ARGBToBayerRow(src_argb, row_y0, 0x0d090501, width); - row_y0[-1] = row_y0[0]; - row_y0[width] = row_y0[width - 1]; - ARGBToBayerRow(src_argb, row_y1, 0x0d090501, width); - row_y1[-1] = row_y1[0]; - row_y1[width] = row_y1[width - 1]; - for (int y = 0; y < height; ++y) { - // Convert next row of ARGB to Y. - if (y < (height - 1)) { - src_argb += src_stride_argb; - } - ARGBToBayerRow(src_argb, row_y2, 0x0d090501, width); - row_y2[-1] = row_y2[0]; - row_y2[width] = row_y2[width - 1]; - - SobelXRow(row_y0 - 1, row_y1 - 1, row_y2 - 1, row_sobelx, width); - SobelYRow(row_y0 - 1, row_y2 - 1, row_sobely, width); - SobelXYRow(row_sobelx, row_sobely, dst_argb, width); - - // Cycle thru circular queue of 3 row_y buffers. 
- uint8* row_yt = row_y0; - row_y0 = row_y1; - row_y1 = row_y2; - row_y2 = row_yt; - + ARGBCopyYToAlphaRow(src_y, dst_argb, width); + src_y += src_stride_y; dst_argb += dst_stride_argb; } return 0; diff --git a/chromium/third_party/libyuv/source/rotate.cc b/chromium/third_party/libyuv/source/rotate.cc index c46650b4458..b99cde10891 100644 --- a/chromium/third_party/libyuv/source/rotate.cc +++ b/chromium/third_party/libyuv/source/rotate.cc @@ -41,7 +41,7 @@ extern "C" { #endif #endif -#if !defined(LIBYUV_DISABLE_NEON) && \ +#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \ (defined(__ARM_NEON__) || defined(LIBYUV_NEON)) #define HAS_MIRRORROW_NEON void MirrorRow_NEON(const uint8* src, uint8* dst, int width); @@ -57,7 +57,8 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride, int width); #endif // defined(__ARM_NEON__) -#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips__) && \ +#if !defined(LIBYUV_DISABLE_MIPS) && !defined(__native_client__) && \ + defined(__mips__) && \ defined(__mips_dsp) && (__mips_dsp_rev >= 2) #define HAS_TRANSPOSE_WX8_MIPS_DSPR2 void TransposeWx8_MIPS_DSPR2(const uint8* src, int src_stride, @@ -72,7 +73,8 @@ void TransposeUVWx8_MIPS_DSPR2(const uint8* src, int src_stride, int width); #endif // defined(__mips__) -#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) +#if !defined(LIBYUV_DISABLE_X86) && \ + defined(_M_IX86) && defined(_MSC_VER) #define HAS_TRANSPOSE_WX8_SSSE3 __declspec(naked) __declspec(align(16)) static void TransposeWx8_SSSE3(const uint8* src, int src_stride, @@ -89,7 +91,7 @@ static void TransposeWx8_SSSE3(const uint8* src, int src_stride, // Read in the data from the source pointer. // First round of bit swap. - align 16 + align 4 convertloop: movq xmm0, qword ptr [eax] lea ebp, [eax + 8] @@ -188,7 +190,7 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride, mov [esp + 16], ecx mov ecx, [ecx + 16 + 28] // w - align 16 + align 4 convertloop: // Read in the data from the source pointer. // First round of bit swap. @@ -294,14 +296,15 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride, ret } } -#elif !defined(LIBYUV_DISABLE_X86) && (defined(__i386__) || defined(__x86_64__)) +#elif !defined(LIBYUV_DISABLE_X86) && \ + (defined(__i386__) || (defined(__x86_64__) && !defined(__native_client__))) #define HAS_TRANSPOSE_WX8_SSSE3 static void TransposeWx8_SSSE3(const uint8* src, int src_stride, uint8* dst, int dst_stride, int width) { asm volatile ( // Read in the data from the source pointer. // First round of bit swap. - ".p2align 4 \n" + ".p2align 2 \n" "1: \n" "movq (%0),%%xmm0 \n" "movq (%0,%3),%%xmm1 \n" @@ -383,7 +386,7 @@ static void TransposeWx8_SSSE3(const uint8* src, int src_stride, ); } -#if !defined(LIBYUV_DISABLE_X86) && defined (__i386__) +#if !defined(LIBYUV_DISABLE_X86) && defined(__i386__) #define HAS_TRANSPOSE_UVWX8_SSE2 extern "C" void TransposeUVWx8_SSE2(const uint8* src, int src_stride, uint8* dst_a, int dst_stride_a, @@ -503,9 +506,16 @@ extern "C" void TransposeUVWx8_SSE2(const uint8* src, int src_stride, "pop %edi \n" "pop %esi \n" "pop %ebx \n" +#if defined(__native_client__) + "pop %ecx \n" + "and $0xffffffe0,%ecx \n" + "jmp *%ecx \n" +#else "ret \n" +#endif ); -#elif !defined(LIBYUV_DISABLE_X86) && defined(__x86_64__) +#elif !defined(LIBYUV_DISABLE_X86) && !defined(__native_client__) && \ + defined(__x86_64__) // 64 bit version has enough registers to do 16x8 to 8x16 at a time. 
#define HAS_TRANSPOSE_WX8_FAST_SSSE3 static void TransposeWx8_FAST_SSSE3(const uint8* src, int src_stride, @@ -513,7 +523,7 @@ static void TransposeWx8_FAST_SSSE3(const uint8* src, int src_stride, asm volatile ( // Read in the data from the source pointer. // First round of bit swap. - ".p2align 4 \n" + ".p2align 2 \n" "1: \n" "movdqa (%0),%%xmm0 \n" "movdqa (%0,%3),%%xmm1 \n" @@ -654,7 +664,7 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride, asm volatile ( // Read in the data from the source pointer. // First round of bit swap. - ".p2align 4 \n" + ".p2align 2 \n" "1: \n" "movdqa (%0),%%xmm0 \n" "movdqa (%0,%4),%%xmm1 \n" @@ -857,7 +867,7 @@ void RotatePlane270(const uint8* src, int src_stride, TransposePlane(src, src_stride, dst, dst_stride, width, height); } -LIBYUV_API +LIBYUV_API SAFEBUFFERS void RotatePlane180(const uint8* src, int src_stride, uint8* dst, int dst_stride, int width, int height) { diff --git a/chromium/third_party/libyuv/source/rotate_argb.cc b/chromium/third_party/libyuv/source/rotate_argb.cc index 5fa0d7ea798..b95512783a0 100644 --- a/chromium/third_party/libyuv/source/rotate_argb.cc +++ b/chromium/third_party/libyuv/source/rotate_argb.cc @@ -22,14 +22,15 @@ extern "C" { // ARGBScale has a function to copy pixels to a row, striding each source // pixel by a constant. -#if !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || \ - defined(__x86_64__) || defined(__i386__)) +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(_M_IX86) || \ + (defined(__x86_64__) && !defined(__native_client__)) || defined(__i386__)) #define HAS_SCALEARGBROWDOWNEVEN_SSE2 void ScaleARGBRowDownEven_SSE2(const uint8* src_ptr, int src_stride, int src_stepx, uint8* dst_ptr, int dst_width); #endif -#if !defined(LIBYUV_DISABLE_NEON) && \ +#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \ (defined(__ARM_NEON__) || defined(LIBYUV_NEON)) #define HAS_SCALEARGBROWDOWNEVEN_NEON void ScaleARGBRowDownEven_NEON(const uint8* src_ptr, int src_stride, @@ -88,6 +89,7 @@ void ARGBRotate270(const uint8* src, int src_stride, ARGBTranspose(src, src_stride, dst, dst_stride, width, height); } +SAFEBUFFERS void ARGBRotate180(const uint8* src, int src_stride, uint8* dst, int dst_stride, int width, int height) { diff --git a/chromium/third_party/libyuv/source/rotate_neon.cc b/chromium/third_party/libyuv/source/rotate_neon.cc index ab07c169703..a59c4d5fde6 100644 --- a/chromium/third_party/libyuv/source/rotate_neon.cc +++ b/chromium/third_party/libyuv/source/rotate_neon.cc @@ -18,7 +18,7 @@ extern "C" { #endif #if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) -static const uvec8 kVTbl4x4Transpose = +static uvec8 kVTbl4x4Transpose = { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 }; void TransposeWx8_NEON(const uint8* src, int src_stride, @@ -31,7 +31,7 @@ void TransposeWx8_NEON(const uint8* src, int src_stride, "sub %4, #8 \n" // handle 8x8 blocks. this should be the majority of the plane - ".p2align 4 \n" + ".p2align 2 \n" "1: \n" "mov r9, %0 \n" @@ -184,7 +184,7 @@ void TransposeWx8_NEON(const uint8* src, int src_stride, ); } -static const uvec8 kVTbl4x4TransposeDi = +static uvec8 kVTbl4x4TransposeDi = { 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 }; void TransposeUVWx8_NEON(const uint8* src, int src_stride, @@ -198,7 +198,7 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride, "sub %6, #8 \n" // handle 8x8 blocks. 
this should be the majority of the plane - ".p2align 4 \n" + ".p2align 2 \n" "1: \n" "mov r9, %0 \n" diff --git a/chromium/third_party/libyuv/source/row_any.cc b/chromium/third_party/libyuv/source/row_any.cc index 72100d90e9d..90c6a3ff5f8 100644 --- a/chromium/third_party/libyuv/source/row_any.cc +++ b/chromium/third_party/libyuv/source/row_any.cc @@ -137,8 +137,12 @@ RGBANY(ARGBToARGB1555Row_Any_SSE2, ARGBToARGB1555Row_SSE2, ARGBToARGB1555Row_C, 3, 4, 2) RGBANY(ARGBToARGB4444Row_Any_SSE2, ARGBToARGB4444Row_SSE2, ARGBToARGB4444Row_C, 3, 4, 2) +#endif +#if defined(HAS_I400TOARGBROW_SSE2) RGBANY(I400ToARGBRow_Any_SSE2, I400ToARGBRow_Unaligned_SSE2, I400ToARGBRow_C, 7, 1, 4) +#endif +#if defined(HAS_YTOARGBROW_SSE2) RGBANY(YToARGBRow_Any_SSE2, YToARGBRow_SSE2, YToARGBRow_C, 7, 1, 4) RGBANY(YUY2ToARGBRow_Any_SSSE3, YUY2ToARGBRow_Unaligned_SSSE3, YUY2ToARGBRow_C, @@ -195,6 +199,15 @@ BAYERANY(ARGBToBayerRow_Any_SSSE3, ARGBToBayerRow_SSSE3, ARGBToBayerRow_C, BAYERANY(ARGBToBayerRow_Any_NEON, ARGBToBayerRow_NEON, ARGBToBayerRow_C, 7, 4, 1) #endif +#if defined(HAS_ARGBTOBAYERGGROW_SSE2) +BAYERANY(ARGBToBayerGGRow_Any_SSE2, ARGBToBayerGGRow_SSE2, ARGBToBayerGGRow_C, + 7, 4, 1) +#endif +#if defined(HAS_ARGBTOBAYERGGROW_NEON) +BAYERANY(ARGBToBayerGGRow_Any_NEON, ARGBToBayerGGRow_NEON, ARGBToBayerGGRow_C, + 7, 4, 1) +#endif + #undef BAYERANY // RGB/YUV to Y does multiple of 16 with SIMD and last 16 with SIMD. @@ -213,6 +226,8 @@ YANY(UYVYToYRow_Any_AVX2, UYVYToYRow_AVX2, 2, 1, 32) #endif #ifdef HAS_ARGBTOYROW_SSSE3 YANY(ARGBToYRow_Any_SSSE3, ARGBToYRow_Unaligned_SSSE3, 4, 1, 16) +#endif +#ifdef HAS_BGRATOYROW_SSSE3 YANY(BGRAToYRow_Any_SSSE3, BGRAToYRow_Unaligned_SSSE3, 4, 1, 16) YANY(ABGRToYRow_Any_SSSE3, ABGRToYRow_Unaligned_SSSE3, 4, 1, 16) YANY(RGBAToYRow_Any_SSSE3, RGBAToYRow_Unaligned_SSSE3, 4, 1, 16) @@ -290,7 +305,7 @@ YANY(ARGBAttenuateRow_Any_NEON, ARGBAttenuateRow_NEON, ARGBAttenuateRow_C, width & MASK); \ } -#ifdef HAS_ARGBTOYROW_AVX2 +#ifdef HAS_ARGBTOUVROW_AVX2 UVANY(ARGBToUVRow_Any_AVX2, ARGBToUVRow_AVX2, ARGBToUVRow_C, 4, 31) UVANY(YUY2ToUVRow_Any_AVX2, YUY2ToUVRow_AVX2, YUY2ToUVRow_C, 2, 31) UVANY(UYVYToUVRow_Any_AVX2, UYVYToUVRow_AVX2, UYVYToUVRow_C, 2, 31) @@ -468,6 +483,10 @@ MATHROW_ANY(ARGBSubtractRow_Any_NEON, ARGBSubtractRow_NEON, ARGBSubtractRow_C, dst_argb + n * BPP, shuffler, width & MASK); \ } +#ifdef HAS_ARGBSHUFFLEROW_SSE2 +YANY(ARGBShuffleRow_Any_SSE2, ARGBShuffleRow_SSE2, + ARGBShuffleRow_C, 4, 4, 3) +#endif #ifdef HAS_ARGBSHUFFLEROW_SSSE3 YANY(ARGBShuffleRow_Any_SSSE3, ARGBShuffleRow_Unaligned_SSSE3, ARGBShuffleRow_C, 4, 4, 7) @@ -495,6 +514,10 @@ YANY(ARGBShuffleRow_Any_NEON, ARGBShuffleRow_NEON, width & MASK, source_y_fraction); \ } +#ifdef HAS_INTERPOLATEROW_AVX2 +NANY(InterpolateRow_Any_AVX2, InterpolateRow_AVX2, + InterpolateRow_C, 1, 1, 32) +#endif #ifdef HAS_INTERPOLATEROW_SSSE3 NANY(InterpolateRow_Any_SSSE3, InterpolateRow_Unaligned_SSSE3, InterpolateRow_C, 1, 1, 15) diff --git a/chromium/third_party/libyuv/source/row_common.cc b/chromium/third_party/libyuv/source/row_common.cc index badea440582..f961696f008 100644 --- a/chromium/third_party/libyuv/source/row_common.cc +++ b/chromium/third_party/libyuv/source/row_common.cc @@ -59,6 +59,11 @@ static __inline uint32 Abs(int32 v) { } #endif // USE_BRANCHLESS +// Divide num by div and return as 16.16 fixed point result. 
+int FixedDiv_C(int num, int div) { + return static_cast<int>((static_cast<int64>(num) << 16) / div); +} + #ifdef LIBYUV_LITTLE_ENDIAN #define WRITEWORD(p, v) *reinterpret_cast<uint32*>(p) = v #else @@ -649,21 +654,27 @@ void ARGBSepiaRow_C(uint8* dst_argb, int width) { } // Apply color matrix to a row of image. Matrix is signed. -void ARGBColorMatrixRow_C(uint8* dst_argb, const int8* matrix_argb, int width) { +// TODO(fbarchard): Consider adding rounding (+32). +void ARGBColorMatrixRow_C(const uint8* src_argb, uint8* dst_argb, + const int8* matrix_argb, int width) { for (int x = 0; x < width; ++x) { - int b = dst_argb[0]; - int g = dst_argb[1]; - int r = dst_argb[2]; - int a = dst_argb[3]; + int b = src_argb[0]; + int g = src_argb[1]; + int r = src_argb[2]; + int a = src_argb[3]; int sb = (b * matrix_argb[0] + g * matrix_argb[1] + - r * matrix_argb[2] + a * matrix_argb[3]) >> 7; + r * matrix_argb[2] + a * matrix_argb[3]) >> 6; int sg = (b * matrix_argb[4] + g * matrix_argb[5] + - r * matrix_argb[6] + a * matrix_argb[7]) >> 7; + r * matrix_argb[6] + a * matrix_argb[7]) >> 6; int sr = (b * matrix_argb[8] + g * matrix_argb[9] + - r * matrix_argb[10] + a * matrix_argb[11]) >> 7; + r * matrix_argb[10] + a * matrix_argb[11]) >> 6; + int sa = (b * matrix_argb[12] + g * matrix_argb[13] + + r * matrix_argb[14] + a * matrix_argb[15]) >> 6; dst_argb[0] = Clamp(sb); dst_argb[1] = Clamp(sg); dst_argb[2] = Clamp(sr); + dst_argb[3] = Clamp(sa); + src_argb += 4; dst_argb += 4; } } @@ -683,6 +694,19 @@ void ARGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width) { } } +// Apply color table to a row of image. +void RGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width) { + for (int x = 0; x < width; ++x) { + int b = dst_argb[0]; + int g = dst_argb[1]; + int r = dst_argb[2]; + dst_argb[0] = table_argb[b * 4 + 0]; + dst_argb[1] = table_argb[g * 4 + 1]; + dst_argb[2] = table_argb[r * 4 + 2]; + dst_argb += 4; + } +} + void ARGBQuantizeRow_C(uint8* dst_argb, int scale, int interval_size, int interval_offset, int width) { for (int x = 0; x < width; ++x) { @@ -845,6 +869,16 @@ void SobelRow_C(const uint8* src_sobelx, const uint8* src_sobely, } } +void SobelToPlaneRow_C(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_y, int width) { + for (int i = 0; i < width; ++i) { + int r = src_sobelx[i]; + int b = src_sobely[i]; + int s = clamp255(r + b); + dst_y[i] = static_cast<uint8>(s); + } +} + void SobelXYRow_C(const uint8* src_sobelx, const uint8* src_sobely, uint8* dst_argb, int width) { for (int i = 0; i < width; ++i) { @@ -1670,7 +1704,7 @@ void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) { // Reciprocal method is off by 1 on some values. ie 125 // 8.8 fixed point inverse table with 1.0 in upper short and 1 / a in lower. #define T(a) 0x01000000 + (0x10000 / a) -uint32 fixed_invtbl8[256] = { +const uint32 fixed_invtbl8[256] = { 0x01000000, 0x0100ffff, T(0x02), T(0x03), T(0x04), T(0x05), T(0x06), T(0x07), T(0x08), T(0x09), T(0x0a), T(0x0b), T(0x0c), T(0x0d), T(0x0e), T(0x0f), T(0x10), T(0x11), T(0x12), T(0x13), T(0x14), T(0x15), T(0x16), T(0x17), @@ -1774,10 +1808,26 @@ void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride, } } +// Blend 2 rows into 1 for conversions such as I422ToI420. +void HalfRow_C(const uint8* src_uv, int src_uv_stride, + uint8* dst_uv, int pix) { + for (int x = 0; x < pix; ++x) { + dst_uv[x] = (src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1; + } +} + // C version 2x2 -> 2x1. 
void InterpolateRow_C(uint8* dst_ptr, const uint8* src_ptr, ptrdiff_t src_stride, int width, int source_y_fraction) { + if (source_y_fraction == 0) { + memcpy(dst_ptr, src_ptr, width); + return; + } + if (source_y_fraction == 128) { + HalfRow_C(src_ptr, static_cast<int>(src_stride), dst_ptr, width); + return; + } int y1_fraction = source_y_fraction; int y0_fraction = 256 - y1_fraction; const uint8* src_ptr1 = src_ptr + src_stride; @@ -1794,14 +1844,6 @@ void InterpolateRow_C(uint8* dst_ptr, const uint8* src_ptr, } } -// Blend 2 rows into 1 for conversions such as I422ToI420. -void HalfRow_C(const uint8* src_uv, int src_uv_stride, - uint8* dst_uv, int pix) { - for (int x = 0; x < pix; ++x) { - dst_uv[x] = (src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1; - } -} - // Select 2 channels from ARGB on alternating pixels. e.g. BGBGBGBG void ARGBToBayerRow_C(const uint8* src_argb, uint8* dst_bayer, uint32 selector, int pix) { @@ -1819,6 +1861,21 @@ void ARGBToBayerRow_C(const uint8* src_argb, } } +// Select G channel from ARGB. e.g. GGGGGGGG +void ARGBToBayerGGRow_C(const uint8* src_argb, + uint8* dst_bayer, uint32 /*selector*/, int pix) { + // Copy a row of G. + for (int x = 0; x < pix - 1; x += 2) { + dst_bayer[0] = src_argb[1]; + dst_bayer[1] = src_argb[5]; + src_argb += 8; + dst_bayer += 2; + } + if (pix & 1) { + dst_bayer[0] = src_argb[1]; + } +} + // Use first 4 shuffler values to reorder ARGB channels. void ARGBShuffleRow_C(const uint8* src_argb, uint8* dst_argb, const uint8* shuffler, int pix) { @@ -1886,10 +1943,19 @@ void I422ToUYVYRow_C(const uint8* src_y, } } -#if !defined(LIBYUV_DISABLE_X86) +// TODO(fbarchard): Ensure these are stack safe. +#ifdef DEBUG +#define MAYBE_SAFEBUFFERS +#else +#define MAYBE_SAFEBUFFERS SAFEBUFFERS +#endif + + +#if !defined(LIBYUV_DISABLE_X86) && defined(HAS_I422TOARGBROW_SSSE3) // row_win.cc has asm version, but GCC uses 2 step wrapper. 5% slower. // TODO(fbarchard): Handle width > kMaxStride here instead of calling code. 
#if defined(__x86_64__) || defined(__i386__) +MAYBE_SAFEBUFFERS void I422ToRGB565Row_SSSE3(const uint8* src_y, const uint8* src_u, const uint8* src_v, @@ -1902,6 +1968,7 @@ void I422ToRGB565Row_SSSE3(const uint8* src_y, #endif // defined(__x86_64__) || defined(__i386__) #if defined(_M_IX86) || defined(__x86_64__) || defined(__i386__) +MAYBE_SAFEBUFFERS void I422ToARGB1555Row_SSSE3(const uint8* src_y, const uint8* src_u, const uint8* src_v, @@ -1912,6 +1979,7 @@ void I422ToARGB1555Row_SSSE3(const uint8* src_y, ARGBToARGB1555Row_SSE2(row, rgb_buf, width); } +MAYBE_SAFEBUFFERS void I422ToARGB4444Row_SSSE3(const uint8* src_y, const uint8* src_u, const uint8* src_v, @@ -1922,6 +1990,7 @@ void I422ToARGB4444Row_SSSE3(const uint8* src_y, ARGBToARGB4444Row_SSE2(row, rgb_buf, width); } +MAYBE_SAFEBUFFERS void NV12ToRGB565Row_SSSE3(const uint8* src_y, const uint8* src_uv, uint8* dst_rgb565, @@ -1931,6 +2000,7 @@ void NV12ToRGB565Row_SSSE3(const uint8* src_y, ARGBToRGB565Row_SSE2(row, dst_rgb565, width); } +MAYBE_SAFEBUFFERS void NV21ToRGB565Row_SSSE3(const uint8* src_y, const uint8* src_vu, uint8* dst_rgb565, @@ -1940,6 +2010,7 @@ void NV21ToRGB565Row_SSSE3(const uint8* src_y, ARGBToRGB565Row_SSE2(row, dst_rgb565, width); } +MAYBE_SAFEBUFFERS void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2, uint8* dst_argb, int width) { @@ -1951,6 +2022,7 @@ void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2, I422ToARGBRow_SSSE3(row_y, row_u, row_v, dst_argb, width); } +MAYBE_SAFEBUFFERS void YUY2ToARGBRow_Unaligned_SSSE3(const uint8* src_yuy2, uint8* dst_argb, int width) { @@ -1962,6 +2034,7 @@ void YUY2ToARGBRow_Unaligned_SSSE3(const uint8* src_yuy2, I422ToARGBRow_Unaligned_SSSE3(row_y, row_u, row_v, dst_argb, width); } +MAYBE_SAFEBUFFERS void UYVYToARGBRow_SSSE3(const uint8* src_uyvy, uint8* dst_argb, int width) { @@ -1973,6 +2046,7 @@ void UYVYToARGBRow_SSSE3(const uint8* src_uyvy, I422ToARGBRow_SSSE3(row_y, row_u, row_v, dst_argb, width); } +MAYBE_SAFEBUFFERS void UYVYToARGBRow_Unaligned_SSSE3(const uint8* src_uyvy, uint8* dst_argb, int width) { @@ -1986,8 +2060,102 @@ void UYVYToARGBRow_Unaligned_SSSE3(const uint8* src_uyvy, #endif // defined(_M_IX86) || defined(__x86_64__) || defined(__i386__) #endif // !defined(LIBYUV_DISABLE_X86) -#undef clamp0 -#undef clamp255 + +void ARGBPolynomialRow_C(const uint8* src_argb, + uint8* dst_argb, const float* poly, + int width) { + for (int i = 0; i < width; ++i) { + float b = static_cast<float>(src_argb[0]); + float g = static_cast<float>(src_argb[1]); + float r = static_cast<float>(src_argb[2]); + float a = static_cast<float>(src_argb[3]); + float b2 = b * b; + float g2 = g * g; + float r2 = r * r; + float a2 = a * a; + float db = poly[0] + poly[4] * b; + float dg = poly[1] + poly[5] * g; + float dr = poly[2] + poly[6] * r; + float da = poly[3] + poly[7] * a; + db += poly[8] * b2; + dg += poly[9] * g2; + dr += poly[10] * r2; + da += poly[11] * a2; + float b3 = b2 * b; + float g3 = g2 * g; + float r3 = r2 * r; + float a3 = a2 * a; + db += poly[12] * b3; + dg += poly[13] * g3; + dr += poly[14] * r3; + da += poly[15] * a3; + + dst_argb[0] = Clamp(static_cast<int32>(db)); + dst_argb[1] = Clamp(static_cast<int32>(dg)); + dst_argb[2] = Clamp(static_cast<int32>(dr)); + dst_argb[3] = Clamp(static_cast<int32>(da)); + src_argb += 4; + dst_argb += 4; + } +} + +void ARGBLumaColorTableRow_C(const uint8* src_argb, uint8* dst_argb, int width, + const uint8* luma, const uint32 lumacoeff) { + uint32 bc = lumacoeff & 0xff; + uint32 gc = (lumacoeff >> 8) & 0xff; + uint32 rc = (lumacoeff >> 16) & 
0xff; + + for (int i = 0; i < width - 1; i += 2) { + // Luminance in rows, color values in columns. + const uint8* luma0 = ((src_argb[0] * bc + src_argb[1] * gc + + src_argb[2] * rc) & 0x7F00u) + luma; + dst_argb[0] = luma0[src_argb[0]]; + dst_argb[1] = luma0[src_argb[1]]; + dst_argb[2] = luma0[src_argb[2]]; + dst_argb[3] = src_argb[3]; + const uint8* luma1 = ((src_argb[4] * bc + src_argb[5] * gc + + src_argb[6] * rc) & 0x7F00u) + luma; + dst_argb[4] = luma1[src_argb[4]]; + dst_argb[5] = luma1[src_argb[5]]; + dst_argb[6] = luma1[src_argb[6]]; + dst_argb[7] = src_argb[7]; + src_argb += 8; + dst_argb += 8; + } + if (width & 1) { + // Luminance in rows, color values in columns. + const uint8* luma0 = ((src_argb[0] * bc + src_argb[1] * gc + + src_argb[2] * rc) & 0x7F00u) + luma; + dst_argb[0] = luma0[src_argb[0]]; + dst_argb[1] = luma0[src_argb[1]]; + dst_argb[2] = luma0[src_argb[2]]; + dst_argb[3] = src_argb[3]; + } +} + +void ARGBCopyAlphaRow_C(const uint8* src, uint8* dst, int width) { + for (int i = 0; i < width - 1; i += 2) { + dst[3] = src[3]; + dst[7] = src[7]; + dst += 8; + src += 8; + } + if (width & 1) { + dst[3] = src[3]; + } +} + +void ARGBCopyYToAlphaRow_C(const uint8* src, uint8* dst, int width) { + for (int i = 0; i < width - 1; i += 2) { + dst[3] = src[0]; + dst[7] = src[1]; + dst += 8; + src += 2; + } + if (width & 1) { + dst[3] = src[0]; + } +} #ifdef __cplusplus } // extern "C" diff --git a/chromium/third_party/libyuv/source/row_mips.cc b/chromium/third_party/libyuv/source/row_mips.cc index 69677aa2d5b..4435c55c5ce 100644 --- a/chromium/third_party/libyuv/source/row_mips.cc +++ b/chromium/third_party/libyuv/source/row_mips.cc @@ -15,6 +15,9 @@ namespace libyuv { extern "C" { #endif +// The following are available on Mips platforms: +#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips__) + #ifdef HAS_COPYROW_MIPS void CopyRow_MIPS(const uint8* src, uint8* dst, int count) { __asm__ __volatile__ ( @@ -383,6 +386,7 @@ void SplitUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, "blez $t4, 2f \n" " andi %[width], %[width], 0xf \n" // residual + ".p2align 2 \n" "1: \n" "addiu $t4, $t4, -1 \n" "lw $t0, 0(%[src_uv]) \n" // V1 | U1 | V0 | U0 @@ -449,6 +453,7 @@ void SplitUVRow_Unaligned_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, "blez $t4, 2f \n" " andi %[width], %[width], 0xf \n" // residual + ".p2align 2 \n" "1: \n" "addiu $t4, $t4, -1 \n" "lwr $t0, 0(%[src_uv]) \n" @@ -532,7 +537,8 @@ void MirrorRow_MIPS_DSPR2(const uint8* src, uint8* dst, int width) { "blez $t4, 2f \n" " addu %[src], %[src], %[width] \n" // src += width - "1: \n" + ".p2align 2 \n" + "1: \n" "lw $t0, -16(%[src]) \n" // |3|2|1|0| "lw $t1, -12(%[src]) \n" // |7|6|5|4| "lw $t2, -8(%[src]) \n" // |11|10|9|8| @@ -556,7 +562,7 @@ void MirrorRow_MIPS_DSPR2(const uint8* src, uint8* dst, int width) { "beqz $t5, 3f \n" " nop \n" - "2: \n" + "2: \n" "lbu $t0, -1(%[src]) \n" "addiu $t5, $t5, -1 \n" "addiu %[src], %[src], -1 \n" @@ -564,7 +570,7 @@ void MirrorRow_MIPS_DSPR2(const uint8* src, uint8* dst, int width) { "bgez $t5, 2b \n" " addiu %[dst], %[dst], 1 \n" - "3: \n" + "3: \n" ".set pop \n" : [src] "+r" (src), [dst] "+r" (dst) : [width] "r" (width) @@ -586,7 +592,8 @@ void MirrorUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, "blez %[x], 2f \n" " addu %[src_uv], %[src_uv], $t4 \n" - "1: \n" + ".p2align 2 \n" + "1: \n" "lw $t0, -32(%[src_uv]) \n" // |3|2|1|0| "lw $t1, -28(%[src_uv]) \n" // |7|6|5|4| "lw $t2, -24(%[src_uv]) \n" // |11|10|9|8| @@ -638,7 +645,7 @@ void 
MirrorUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, "b 2f \n" " nop \n" - "2: \n" + "2: \n" "lbu $t0, -2(%[src_uv]) \n" "lbu $t1, -1(%[src_uv]) \n" "addiu %[src_uv], %[src_uv], -2 \n" @@ -649,7 +656,7 @@ void MirrorUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, "bgtz %[y], 2b \n" " addiu %[dst_v], %[dst_v], 1 \n" - "3: \n" + "3: \n" ".set pop \n" : [src_uv] "+r" (src_uv), [dst_u] "+r" (dst_u), @@ -670,62 +677,62 @@ void MirrorUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, // t2 = | 0 | R0 | 0 | r0 | // t1 = | 0 | R1 | 0 | r1 | #define I422ToTransientMipsRGB \ - "lw $t0, 0(%[y_buf]) \n" \ - "lhu $t1, 0(%[u_buf]) \n" \ - "lhu $t2, 0(%[v_buf]) \n" \ - "preceu.ph.qbr $t1, $t1 \n" \ - "preceu.ph.qbr $t2, $t2 \n" \ - "preceu.ph.qbra $t3, $t0 \n" \ - "preceu.ph.qbla $t0, $t0 \n" \ - "subu.ph $t1, $t1, $s5 \n" \ - "subu.ph $t2, $t2, $s5 \n" \ - "subu.ph $t3, $t3, $s4 \n" \ - "subu.ph $t0, $t0, $s4 \n" \ - "mul.ph $t3, $t3, $s0 \n" \ - "mul.ph $t0, $t0, $s0 \n" \ - "shll.ph $t4, $t1, 0x7 \n" \ - "subu.ph $t4, $t4, $t1 \n" \ - "mul.ph $t6, $t1, $s1 \n" \ - "mul.ph $t1, $t2, $s2 \n" \ - "addq_s.ph $t5, $t4, $t3 \n" \ - "addq_s.ph $t4, $t4, $t0 \n" \ - "shra.ph $t5, $t5, 6 \n" \ - "shra.ph $t4, $t4, 6 \n" \ - "addiu %[u_buf], 2 \n" \ - "addiu %[v_buf], 2 \n" \ - "addu.ph $t6, $t6, $t1 \n" \ - "mul.ph $t1, $t2, $s3 \n" \ - "addu.ph $t9, $t6, $t3 \n" \ - "addu.ph $t8, $t6, $t0 \n" \ - "shra.ph $t9, $t9, 6 \n" \ - "shra.ph $t8, $t8, 6 \n" \ - "addu.ph $t2, $t1, $t3 \n" \ - "addu.ph $t1, $t1, $t0 \n" \ - "shra.ph $t2, $t2, 6 \n" \ - "shra.ph $t1, $t1, 6 \n" \ - "subu.ph $t5, $t5, $s5 \n" \ - "subu.ph $t4, $t4, $s5 \n" \ - "subu.ph $t9, $t9, $s5 \n" \ - "subu.ph $t8, $t8, $s5 \n" \ - "subu.ph $t2, $t2, $s5 \n" \ - "subu.ph $t1, $t1, $s5 \n" \ - "shll_s.ph $t5, $t5, 8 \n" \ - "shll_s.ph $t4, $t4, 8 \n" \ - "shll_s.ph $t9, $t9, 8 \n" \ - "shll_s.ph $t8, $t8, 8 \n" \ - "shll_s.ph $t2, $t2, 8 \n" \ - "shll_s.ph $t1, $t1, 8 \n" \ - "shra.ph $t5, $t5, 8 \n" \ - "shra.ph $t4, $t4, 8 \n" \ - "shra.ph $t9, $t9, 8 \n" \ - "shra.ph $t8, $t8, 8 \n" \ - "shra.ph $t2, $t2, 8 \n" \ - "shra.ph $t1, $t1, 8 \n" \ - "addu.ph $t5, $t5, $s5 \n" \ - "addu.ph $t4, $t4, $s5 \n" \ - "addu.ph $t9, $t9, $s5 \n" \ - "addu.ph $t8, $t8, $s5 \n" \ - "addu.ph $t2, $t2, $s5 \n" \ + "lw $t0, 0(%[y_buf]) \n" \ + "lhu $t1, 0(%[u_buf]) \n" \ + "lhu $t2, 0(%[v_buf]) \n" \ + "preceu.ph.qbr $t1, $t1 \n" \ + "preceu.ph.qbr $t2, $t2 \n" \ + "preceu.ph.qbra $t3, $t0 \n" \ + "preceu.ph.qbla $t0, $t0 \n" \ + "subu.ph $t1, $t1, $s5 \n" \ + "subu.ph $t2, $t2, $s5 \n" \ + "subu.ph $t3, $t3, $s4 \n" \ + "subu.ph $t0, $t0, $s4 \n" \ + "mul.ph $t3, $t3, $s0 \n" \ + "mul.ph $t0, $t0, $s0 \n" \ + "shll.ph $t4, $t1, 0x7 \n" \ + "subu.ph $t4, $t4, $t1 \n" \ + "mul.ph $t6, $t1, $s1 \n" \ + "mul.ph $t1, $t2, $s2 \n" \ + "addq_s.ph $t5, $t4, $t3 \n" \ + "addq_s.ph $t4, $t4, $t0 \n" \ + "shra.ph $t5, $t5, 6 \n" \ + "shra.ph $t4, $t4, 6 \n" \ + "addiu %[u_buf], 2 \n" \ + "addiu %[v_buf], 2 \n" \ + "addu.ph $t6, $t6, $t1 \n" \ + "mul.ph $t1, $t2, $s3 \n" \ + "addu.ph $t9, $t6, $t3 \n" \ + "addu.ph $t8, $t6, $t0 \n" \ + "shra.ph $t9, $t9, 6 \n" \ + "shra.ph $t8, $t8, 6 \n" \ + "addu.ph $t2, $t1, $t3 \n" \ + "addu.ph $t1, $t1, $t0 \n" \ + "shra.ph $t2, $t2, 6 \n" \ + "shra.ph $t1, $t1, 6 \n" \ + "subu.ph $t5, $t5, $s5 \n" \ + "subu.ph $t4, $t4, $s5 \n" \ + "subu.ph $t9, $t9, $s5 \n" \ + "subu.ph $t8, $t8, $s5 \n" \ + "subu.ph $t2, $t2, $s5 \n" \ + "subu.ph $t1, $t1, $s5 \n" \ + "shll_s.ph $t5, $t5, 8 \n" \ + 
"shll_s.ph $t4, $t4, 8 \n" \ + "shll_s.ph $t9, $t9, 8 \n" \ + "shll_s.ph $t8, $t8, 8 \n" \ + "shll_s.ph $t2, $t2, 8 \n" \ + "shll_s.ph $t1, $t1, 8 \n" \ + "shra.ph $t5, $t5, 8 \n" \ + "shra.ph $t4, $t4, 8 \n" \ + "shra.ph $t9, $t9, 8 \n" \ + "shra.ph $t8, $t8, 8 \n" \ + "shra.ph $t2, $t2, 8 \n" \ + "shra.ph $t1, $t1, 8 \n" \ + "addu.ph $t5, $t5, $s5 \n" \ + "addu.ph $t4, $t4, $s5 \n" \ + "addu.ph $t9, $t9, $s5 \n" \ + "addu.ph $t8, $t8, $s5 \n" \ + "addu.ph $t2, $t2, $s5 \n" \ "addu.ph $t1, $t1, $s5 \n" void I422ToARGBRow_MIPS_DSPR2(const uint8* y_buf, @@ -745,7 +752,9 @@ void I422ToARGBRow_MIPS_DSPR2(const uint8* y_buf, "repl.ph $s5, 128 \n" // |128|128| // clipping "lui $s6, 0xff00 \n" "ori $s6, 0xff00 \n" // |ff|00|ff|00|ff| - "1: \n" + + ".p2align 2 \n" + "1: \n" I422ToTransientMipsRGB // Arranging into argb format "precr.qb.ph $t4, $t8, $t4 \n" // |G1|g1|B1|b1| @@ -773,7 +782,7 @@ void I422ToARGBRow_MIPS_DSPR2(const uint8* y_buf, "sw $t3, 12(%[rgb_buf]) \n" "bnez %[width], 1b \n" " addiu %[rgb_buf], 16 \n" - "2: \n" + "2: \n" ".set pop \n" :[y_buf] "+r" (y_buf), [u_buf] "+r" (u_buf), @@ -794,47 +803,49 @@ void I422ToABGRRow_MIPS_DSPR2(const uint8* y_buf, uint8* rgb_buf, int width) { __asm__ __volatile__ ( - ".set push \n\t" - ".set noreorder \n\t" - "beqz %[width], 2f \n\t" - " repl.ph $s0, 74 \n\t" // |YG|YG| = |74|74| - "repl.ph $s1, -25 \n\t" // |UG|UG| = |-25|-25| - "repl.ph $s2, -52 \n\t" // |VG|VG| = |-52|-52| - "repl.ph $s3, 102 \n\t" // |VR|VR| = |102|102| - "repl.ph $s4, 16 \n\t" // |0|16|0|16| - "repl.ph $s5, 128 \n\t" // |128|128| - "lui $s6, 0xff00 \n\t" - "ori $s6, 0xff00 \n\t" // |ff|00|ff|00| - "1: \n" + ".set push \n" + ".set noreorder \n" + "beqz %[width], 2f \n" + " repl.ph $s0, 74 \n" // |YG|YG| = |74|74| + "repl.ph $s1, -25 \n" // |UG|UG| = |-25|-25| + "repl.ph $s2, -52 \n" // |VG|VG| = |-52|-52| + "repl.ph $s3, 102 \n" // |VR|VR| = |102|102| + "repl.ph $s4, 16 \n" // |0|16|0|16| + "repl.ph $s5, 128 \n" // |128|128| + "lui $s6, 0xff00 \n" + "ori $s6, 0xff00 \n" // |ff|00|ff|00| + + ".p2align 2 \n" + "1: \n" I422ToTransientMipsRGB // Arranging into abgr format - "precr.qb.ph $t0, $t8, $t1 \n\t" // |G1|g1|R1|r1| - "precr.qb.ph $t3, $t9, $t2 \n\t" // |G0|g0|R0|r0| - "precrq.qb.ph $t8, $t0, $t3 \n\t" // |G1|R1|G0|R0| - "precr.qb.ph $t9, $t0, $t3 \n\t" // |g1|r1|g0|r0| - - "precr.qb.ph $t2, $t4, $t5 \n\t" // |B1|b1|B0|b0| - "addiu %[width], -4 \n\t" - "addiu %[y_buf], 4 \n\t" - "preceu.ph.qbla $t1, $t2 \n\t" // |0 |B1|0 |B0| - "preceu.ph.qbra $t2, $t2 \n\t" // |0 |b1|0 |b0| - "or $t1, $t1, $s6 \n\t" // |ff|B1|ff|B0| - "or $t2, $t2, $s6 \n\t" // |ff|b1|ff|b0| - "precrq.ph.w $t0, $t2, $t9 \n\t" // |ff|b1|g1|r1| - "precrq.ph.w $t3, $t1, $t8 \n\t" // |ff|B1|G1|R1| - "sll $t9, $t9, 16 \n\t" - "sll $t8, $t8, 16 \n\t" - "packrl.ph $t2, $t2, $t9 \n\t" // |ff|b0|g0|r0| - "packrl.ph $t1, $t1, $t8 \n\t" // |ff|B0|G0|R0| + "precr.qb.ph $t0, $t8, $t1 \n" // |G1|g1|R1|r1| + "precr.qb.ph $t3, $t9, $t2 \n" // |G0|g0|R0|r0| + "precrq.qb.ph $t8, $t0, $t3 \n" // |G1|R1|G0|R0| + "precr.qb.ph $t9, $t0, $t3 \n" // |g1|r1|g0|r0| + + "precr.qb.ph $t2, $t4, $t5 \n" // |B1|b1|B0|b0| + "addiu %[width], -4 \n" + "addiu %[y_buf], 4 \n" + "preceu.ph.qbla $t1, $t2 \n" // |0 |B1|0 |B0| + "preceu.ph.qbra $t2, $t2 \n" // |0 |b1|0 |b0| + "or $t1, $t1, $s6 \n" // |ff|B1|ff|B0| + "or $t2, $t2, $s6 \n" // |ff|b1|ff|b0| + "precrq.ph.w $t0, $t2, $t9 \n" // |ff|b1|g1|r1| + "precrq.ph.w $t3, $t1, $t8 \n" // |ff|B1|G1|R1| + "sll $t9, $t9, 16 \n" + "sll $t8, $t8, 16 \n" + "packrl.ph $t2, $t2, $t9 \n" // 
|ff|b0|g0|r0| + "packrl.ph $t1, $t1, $t8 \n" // |ff|B0|G0|R0| // Store results. - "sw $t2, 0(%[rgb_buf]) \n\t" - "sw $t0, 4(%[rgb_buf]) \n\t" - "sw $t1, 8(%[rgb_buf]) \n\t" - "sw $t3, 12(%[rgb_buf]) \n\t" - "bnez %[width], 1b \n\t" - " addiu %[rgb_buf], 16 \n\t" - "2: \n\t" - ".set pop \n\t" + "sw $t2, 0(%[rgb_buf]) \n" + "sw $t0, 4(%[rgb_buf]) \n" + "sw $t1, 8(%[rgb_buf]) \n" + "sw $t3, 12(%[rgb_buf]) \n" + "bnez %[width], 1b \n" + " addiu %[rgb_buf], 16 \n" + "2: \n" + ".set pop \n" :[y_buf] "+r" (y_buf), [u_buf] "+r" (u_buf), [v_buf] "+r" (v_buf), @@ -865,13 +876,15 @@ void I422ToBGRARow_MIPS_DSPR2(const uint8* y_buf, "repl.ph $s5, 128 \n" // |128|128| "lui $s6, 0xff \n" "ori $s6, 0xff \n" // |00|ff|00|ff| - "1: \n" + + ".p2align 2 \n" + "1: \n" I422ToTransientMipsRGB // Arranging into bgra format - "precr.qb.ph $t4, $t4, $t8 \n" // |B1|b1|G1|g1| - "precr.qb.ph $t5, $t5, $t9 \n" // |B0|b0|G0|g0| - "precrq.qb.ph $t8, $t4, $t5 \n" // |B1|G1|B0|G0| - "precr.qb.ph $t9, $t4, $t5 \n" // |b1|g1|b0|g0| + "precr.qb.ph $t4, $t4, $t8 \n" // |B1|b1|G1|g1| + "precr.qb.ph $t5, $t5, $t9 \n" // |B0|b0|G0|g0| + "precrq.qb.ph $t8, $t4, $t5 \n" // |B1|G1|B0|G0| + "precr.qb.ph $t9, $t4, $t5 \n" // |b1|g1|b0|g0| "precr.qb.ph $t2, $t1, $t2 \n" // |R1|r1|R0|r0| "addiu %[width], -4 \n" @@ -895,7 +908,7 @@ void I422ToBGRARow_MIPS_DSPR2(const uint8* y_buf, "sw $t3, 12(%[rgb_buf]) \n" "bnez %[width], 1b \n" " addiu %[rgb_buf], 16 \n" - "2: \n" + "2: \n" ".set pop \n" :[y_buf] "+r" (y_buf), [u_buf] "+r" (u_buf), @@ -923,6 +936,8 @@ void InterpolateRows_MIPS_DSPR2(uint8* dst_ptr, const uint8* src_ptr, "replv.ph $t0, %[y0_fraction] \n" "replv.ph $t1, %[source_y_fraction] \n" + + ".p2align 2 \n" "1: \n" "lw $t2, 0(%[src_ptr]) \n" "lw $t3, 0(%[src_ptr1]) \n" @@ -968,6 +983,8 @@ void InterpolateRows_MIPS_DSPR2(uint8* dst_ptr, const uint8* src_ptr, } #endif // __mips_dsp_rev >= 2 +#endif // defined(__mips__) + #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/chromium/third_party/libyuv/source/row_neon.cc b/chromium/third_party/libyuv/source/row_neon.cc index 0bb55e717be..5e802194b2b 100644 --- a/chromium/third_party/libyuv/source/row_neon.cc +++ b/chromium/third_party/libyuv/source/row_neon.cc @@ -102,10 +102,10 @@ extern "C" { "vtrn.u8 d16, d17 \n" \ "vmov.u8 d21, d16 \n" -static const vec8 kUVToRB = { 127, 127, 127, 127, 102, 102, 102, 102, - 0, 0, 0, 0, 0, 0, 0, 0 }; -static const vec8 kUVToG = { -25, -25, -25, -25, -52, -52, -52, -52, - 0, 0, 0, 0, 0, 0, 0, 0 }; +static vec8 kUVToRB = { 127, 127, 127, 127, 102, 102, 102, 102, + 0, 0, 0, 0, 0, 0, 0, 0 }; +static vec8 kUVToG = { -25, -25, -25, -25, -52, -52, -52, -52, + 0, 0, 0, 0, 0, 0, 0, 0 }; void I444ToARGBRow_NEON(const uint8* src_y, const uint8* src_u, @@ -118,7 +118,7 @@ void I444ToARGBRow_NEON(const uint8* src_y, "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" "vmov.u16 q15, #16 \n" - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" READYUV444 YUV422TORGB @@ -149,7 +149,7 @@ void I422ToARGBRow_NEON(const uint8* src_y, "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" "vmov.u16 q15, #16 \n" - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" READYUV422 YUV422TORGB @@ -180,7 +180,7 @@ void I411ToARGBRow_NEON(const uint8* src_y, "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" "vmov.u16 q15, #16 \n" - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" READYUV411 YUV422TORGB @@ -211,7 +211,7 @@ void I422ToBGRARow_NEON(const uint8* src_y, "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" "vmov.u16 q15, #16 \n" - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" READYUV422 YUV422TORGB @@ 
-243,7 +243,7 @@ void I422ToABGRRow_NEON(const uint8* src_y, "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" "vmov.u16 q15, #16 \n" - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" READYUV422 YUV422TORGB @@ -275,7 +275,7 @@ void I422ToRGBARow_NEON(const uint8* src_y, "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" "vmov.u16 q15, #16 \n" - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" READYUV422 YUV422TORGB @@ -306,7 +306,7 @@ void I422ToRGB24Row_NEON(const uint8* src_y, "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" "vmov.u16 q15, #16 \n" - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" READYUV422 YUV422TORGB @@ -336,7 +336,7 @@ void I422ToRAWRow_NEON(const uint8* src_y, "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" "vmov.u16 q15, #16 \n" - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" READYUV422 YUV422TORGB @@ -379,7 +379,7 @@ void I422ToRGB565Row_NEON(const uint8* src_y, "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" "vmov.u16 q15, #16 \n" - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" READYUV422 YUV422TORGB @@ -425,7 +425,7 @@ void I422ToARGB1555Row_NEON(const uint8* src_y, "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" "vmov.u16 q15, #16 \n" - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" READYUV422 YUV422TORGB @@ -467,7 +467,7 @@ void I422ToARGB4444Row_NEON(const uint8* src_y, "vmov.u16 q14, #74 \n" "vmov.u16 q15, #16 \n" "vmov.u8 d4, #0x0f \n" // bits to clear with vbic. - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" READYUV422 YUV422TORGB @@ -497,7 +497,7 @@ void YToARGBRow_NEON(const uint8* src_y, "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" "vmov.u16 q15, #16 \n" - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" READYUV400 YUV422TORGB @@ -519,7 +519,7 @@ void I400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) { asm volatile ( - ".p2align 2 \n" + ".p2align 2 \n" "vmov.u8 d23, #255 \n" "1: \n" "vld1.8 {d20}, [%0]! \n" @@ -546,7 +546,7 @@ void NV12ToARGBRow_NEON(const uint8* src_y, "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" "vmov.u16 q15, #16 \n" - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" READNV12 YUV422TORGB @@ -575,7 +575,7 @@ void NV21ToARGBRow_NEON(const uint8* src_y, "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" "vmov.u16 q15, #16 \n" - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" READNV21 YUV422TORGB @@ -604,7 +604,7 @@ void NV12ToRGB565Row_NEON(const uint8* src_y, "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" "vmov.u16 q15, #16 \n" - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" READNV12 YUV422TORGB @@ -633,7 +633,7 @@ void NV21ToRGB565Row_NEON(const uint8* src_y, "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" "vmov.u16 q15, #16 \n" - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" READNV21 YUV422TORGB @@ -661,7 +661,7 @@ void YUY2ToARGBRow_NEON(const uint8* src_yuy2, "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" "vmov.u16 q15, #16 \n" - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" READYUY2 YUV422TORGB @@ -688,7 +688,7 @@ void UYVYToARGBRow_NEON(const uint8* src_uyvy, "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" "vmov.u16 q15, #16 \n" - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" READUYVY YUV422TORGB @@ -710,7 +710,7 @@ void UYVYToARGBRow_NEON(const uint8* src_uyvy, void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) { asm volatile ( - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld2.8 {q0, q1}, [%0]! 
\n" // load 16 pairs of UV "subs %3, %3, #16 \n" // 16 processed per loop @@ -730,7 +730,7 @@ void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv, int width) { asm volatile ( - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld1.8 {q0}, [%0]! \n" // load U "vld1.8 {q1}, [%1]! \n" // load V @@ -750,7 +750,7 @@ void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv, // Copy multiple of 32. vld4.8 allow unaligned and is fastest on a15. void CopyRow_NEON(const uint8* src, uint8* dst, int count) { asm volatile ( - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld1.8 {d0, d1, d2, d3}, [%0]! \n" // load 32 "subs %2, %2, #32 \n" // 32 processed per loop @@ -796,7 +796,7 @@ void MirrorRow_NEON(const uint8* src, uint8* dst, int width) { "add %0, %0, %2 \n" "sub %0, #16 \n" - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld1.8 {q0}, [%0], r3 \n" // src -= 16 "subs %2, #16 \n" // 16 pixels per loop. @@ -820,7 +820,7 @@ void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, "add %0, %0, %3, lsl #1 \n" "sub %0, #16 \n" - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16 "subs %3, #8 \n" // 8 pixels per loop. @@ -844,7 +844,7 @@ void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) { "add %0, %0, %2, lsl #2 \n" "sub %0, #16 \n" - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld1.8 {q0}, [%0], r3 \n" // src -= 16 "subs %2, #4 \n" // 4 pixels per loop. @@ -863,7 +863,7 @@ void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) { void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) { asm volatile ( "vmov.u8 d4, #255 \n" // Alpha - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RGB24. "subs %2, %2, #8 \n" // 8 processed per loop. @@ -880,7 +880,7 @@ void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) { void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) { asm volatile ( "vmov.u8 d4, #255 \n" // Alpha - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW. "subs %2, %2, #8 \n" // 8 processed per loop. @@ -910,7 +910,7 @@ void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) { void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int pix) { asm volatile ( "vmov.u8 d3, #255 \n" // Alpha - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. "subs %2, %2, #8 \n" // 8 processed per loop. @@ -956,7 +956,7 @@ void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb, int pix) { asm volatile ( "vmov.u8 d3, #255 \n" // Alpha - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. "subs %2, %2, #8 \n" // 8 processed per loop. @@ -985,7 +985,7 @@ void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb, int pix) { asm volatile ( "vmov.u8 d3, #255 \n" // Alpha - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. "subs %2, %2, #8 \n" // 8 processed per loop. @@ -1002,7 +1002,7 @@ void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb, void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) { asm volatile ( - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB. "subs %2, %2, #8 \n" // 8 processed per loop. 
@@ -1018,7 +1018,7 @@ void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) { void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) { asm volatile ( - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB. "subs %2, %2, #8 \n" // 8 processed per loop. @@ -1035,7 +1035,7 @@ void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) { void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) { asm volatile ( - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2. "subs %2, %2, #16 \n" // 16 processed per loop. @@ -1051,7 +1051,7 @@ void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) { void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) { asm volatile ( - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY. "subs %2, %2, #16 \n" // 16 processed per loop. @@ -1068,7 +1068,7 @@ void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) { void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v, int pix) { asm volatile ( - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2. "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. @@ -1087,7 +1087,7 @@ void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v, void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v, int pix) { asm volatile ( - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. @@ -1107,7 +1107,7 @@ void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2, uint8* dst_u, uint8* dst_v, int pix) { asm volatile ( "add %1, %0, %1 \n" // stride + src_yuy2 - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2. "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. @@ -1131,7 +1131,7 @@ void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy, uint8* dst_u, uint8* dst_v, int pix) { asm volatile ( "add %1, %0, %1 \n" // stride + src_uyvy - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. @@ -1193,6 +1193,23 @@ void ARGBToBayerRow_NEON(const uint8* src_argb, uint8* dst_bayer, ); } +// Select G channels from ARGB. e.g. GGGGGGGG +void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer, + uint32 /*selector*/, int pix) { + asm volatile ( + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load row 8 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop + "vst1.8 {d1}, [%1]! \n" // store 8 G's. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_bayer), // %1 + "+r"(pix) // %2 + : + : "cc", "memory", "q0", "q1" // Clobber List + ); +} + // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb, const uint8* shuffler, int pix) { @@ -1218,7 +1235,7 @@ void I422ToYUY2Row_NEON(const uint8* src_y, const uint8* src_v, uint8* dst_yuy2, int width) { asm volatile ( - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld2.8 {d0, d2}, [%0]! \n" // load 16 Ys "vld1.8 {d1}, [%1]! \n" // load 8 Us @@ -1241,7 +1258,7 @@ void I422ToUYVYRow_NEON(const uint8* src_y, const uint8* src_v, uint8* dst_uyvy, int width) { asm volatile ( - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld2.8 {d1, d3}, [%0]! \n" // load 16 Ys "vld1.8 {d0}, [%1]! 
\n" // load 8 Us @@ -1261,7 +1278,7 @@ void I422ToUYVYRow_NEON(const uint8* src_y, void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) { asm volatile ( - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. "subs %2, %2, #8 \n" // 8 processed per loop. @@ -1279,7 +1296,7 @@ void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) { void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555, int pix) { asm volatile ( - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. "subs %2, %2, #8 \n" // 8 processed per loop. @@ -1298,7 +1315,7 @@ void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444, int pix) { asm volatile ( "vmov.u8 d4, #0x0f \n" // bits to clear with vbic. - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. "subs %2, %2, #8 \n" // 8 processed per loop. @@ -1319,7 +1336,7 @@ void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient "vmov.u8 d27, #16 \n" // Add 16 constant - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. "subs %2, %2, #8 \n" // 8 processed per loop. @@ -1343,7 +1360,7 @@ void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { "vmov.u8 d24, #15 \n" // B * 0.11400 coefficient "vmov.u8 d25, #75 \n" // G * 0.58700 coefficient "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. "subs %2, %2, #8 \n" // 8 processed per loop. @@ -1371,7 +1388,7 @@ void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, "vmov.u8 d27, #18 \n" // VB -0.1406 coefficient "vmov.u8 d28, #94 \n" // VG -0.7344 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. "subs %3, %3, #8 \n" // 8 processed per loop. @@ -1410,7 +1427,7 @@ void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. @@ -1456,7 +1473,7 @@ void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, "vmov.s16 q13, #18 / 4 \n" // VB -0.1406 coefficient "vmov.s16 q14, #94 / 4 \n" // VG -0.7344 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. @@ -1521,7 +1538,7 @@ void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb, "vmov.s16 q13, #18 / 4 \n" // VB -0.1406 coefficient "vmov.s16 q14, #94 / 4 \n" // VG -0.7344 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. 
@@ -1560,7 +1577,7 @@ void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb, "vmov.s16 q13, #20 / 4 \n" // VB -0.08131 coefficient "vmov.s16 q14, #107 / 4 \n" // VG -0.41869 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. @@ -1598,7 +1615,7 @@ void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra, "vmov.s16 q13, #18 / 4 \n" // VB -0.1406 coefficient "vmov.s16 q14, #94 / 4 \n" // VG -0.7344 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 BGRA pixels. "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 BGRA pixels. @@ -1636,7 +1653,7 @@ void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr, "vmov.s16 q13, #18 / 4 \n" // VB -0.1406 coefficient "vmov.s16 q14, #94 / 4 \n" // VG -0.7344 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ABGR pixels. "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ABGR pixels. @@ -1674,7 +1691,7 @@ void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba, "vmov.s16 q13, #18 / 4 \n" // VB -0.1406 coefficient "vmov.s16 q14, #94 / 4 \n" // VG -0.7344 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 RGBA pixels. "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 RGBA pixels. @@ -1712,7 +1729,7 @@ void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24, "vmov.s16 q13, #18 / 4 \n" // VB -0.1406 coefficient "vmov.s16 q14, #94 / 4 \n" // VG -0.7344 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB24 pixels. "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RGB24 pixels. @@ -1750,7 +1767,7 @@ void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw, "vmov.s16 q13, #18 / 4 \n" // VB -0.1406 coefficient "vmov.s16 q14, #94 / 4 \n" // VG -0.7344 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RAW pixels. "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RAW pixels. @@ -1789,7 +1806,7 @@ void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565, "vmov.s16 q13, #18 / 4 \n" // VB -0.1406 coefficient "vmov.s16 q14, #94 / 4 \n" // VG -0.7344 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. RGB565TOARGB @@ -1849,7 +1866,7 @@ void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555, "vmov.s16 q13, #18 / 4 \n" // VB -0.1406 coefficient "vmov.s16 q14, #94 / 4 \n" // VG -0.7344 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. RGB555TOARGB @@ -1909,7 +1926,7 @@ void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444, "vmov.s16 q13, #18 / 4 \n" // VB -0.1406 coefficient "vmov.s16 q14, #94 / 4 \n" // VG -0.7344 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. 
ARGB4444TOARGB @@ -1964,7 +1981,7 @@ void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix) { "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient "vmov.u8 d27, #16 \n" // Add 16 constant - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. "subs %2, %2, #8 \n" // 8 processed per loop. @@ -1990,7 +2007,7 @@ void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix) { "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient "vmov.u8 d27, #16 \n" // Add 16 constant - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. "subs %2, %2, #8 \n" // 8 processed per loop. @@ -2016,7 +2033,7 @@ void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix) { "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient "vmov.u8 d27, #16 \n" // Add 16 constant - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. "subs %2, %2, #8 \n" // 8 processed per loop. @@ -2042,7 +2059,7 @@ void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix) { "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient "vmov.u8 d7, #16 \n" // Add 16 constant - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of BGRA. "subs %2, %2, #8 \n" // 8 processed per loop. @@ -2067,7 +2084,7 @@ void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix) { "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient "vmov.u8 d7, #16 \n" // Add 16 constant - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ABGR. "subs %2, %2, #8 \n" // 8 processed per loop. @@ -2092,7 +2109,7 @@ void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix) { "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient "vmov.u8 d6, #33 \n" // R * 0.2578 coefficient "vmov.u8 d7, #16 \n" // Add 16 constant - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of RGBA. "subs %2, %2, #8 \n" // 8 processed per loop. @@ -2117,7 +2134,7 @@ void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix) { "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient "vmov.u8 d6, #33 \n" // R * 0.2578 coefficient "vmov.u8 d7, #16 \n" // Add 16 constant - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RGB24. "subs %2, %2, #8 \n" // 8 processed per loop. @@ -2142,7 +2159,7 @@ void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) { "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient "vmov.u8 d7, #16 \n" // Add 16 constant - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RAW. "subs %2, %2, #8 \n" // 8 processed per loop. @@ -2412,7 +2429,7 @@ void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { "vmov.u8 d24, #15 \n" // B * 0.11400 coefficient "vmov.u8 d25, #75 \n" // G * 0.58700 coefficient "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. "subs %2, %2, #8 \n" // 8 processed per loop. 
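The *ToYRow_NEON hunks above only touch whitespace, but their coefficient comments spell out the 7-bit fixed-point luma weights: 13/128 ~= 0.1016 for B, 65/128 ~= 0.5078 for G, 33/128 ~= 0.2578 for R, plus the +16 video-range offset. The exact SIMD rounding is not visible in these hunks, so the sketch below (hypothetical ApproxARGBLuma, truncating shift) only shows the rough scalar arithmetic behind those comments:

#include <stdint.h>

static uint8_t ApproxARGBLuma(uint8_t b, uint8_t g, uint8_t r) {
  // (13 + 65 + 33) * 255 = 28305, so the weighted sum fits in an int and the
  // result stays below 255 even before any clamping.
  return (uint8_t)(((13 * b + 65 * g + 33 * r) >> 7) + 16);
}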
@@ -2447,7 +2464,7 @@ void ARGBSepiaRow_NEON(uint8* dst_argb, int width) { "vmov.u8 d28, #24 \n" // BB coefficient "vmov.u8 d29, #98 \n" // BG coefficient "vmov.u8 d30, #50 \n" // BR coefficient - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0] \n" // load 8 ARGB pixels. "subs %1, %1, #8 \n" // 8 processed per loop. @@ -2474,18 +2491,19 @@ void ARGBSepiaRow_NEON(uint8* dst_argb, int width) { } // Tranform 8 ARGB pixels (32 bytes) with color matrix. -// Same as Sepia except matrix is provided. -void ARGBColorMatrixRow_NEON(uint8* dst_argb, const int8* matrix_argb, - int width) { +// TODO(fbarchard): Was same as Sepia except matrix is provided. This function +// needs to saturate. Consider doing a non-saturating version. +void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb, + const int8* matrix_argb, int width) { asm volatile ( - "vld1.8 {q2}, [%2] \n" // load 3 ARGB vectors. + "vld1.8 {q2}, [%3] \n" // load 3 ARGB vectors. "vmovl.s8 q0, d4 \n" // B,G coefficients s16. - "vmovl.s8 q1, d5 \n" // R coefficients s16. + "vmovl.s8 q1, d5 \n" // R,A coefficients s16. - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" - "vld4.8 {d16, d18, d20, d22}, [%0] \n" // load 8 ARGB pixels. - "subs %1, %1, #8 \n" // 8 processed per loop. + "vld4.8 {d16, d18, d20, d22}, [%0]! \n" // load 8 ARGB pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. "vmovl.u8 q8, d16 \n" // b (0 .. 255) 16 bit "vmovl.u8 q9, d18 \n" // g "vmovl.u8 q10, d20 \n" // r @@ -2493,33 +2511,42 @@ void ARGBColorMatrixRow_NEON(uint8* dst_argb, const int8* matrix_argb, "vmul.s16 q12, q8, d0[0] \n" // B = B * Matrix B "vmul.s16 q13, q8, d1[0] \n" // G = B * Matrix G "vmul.s16 q14, q8, d2[0] \n" // R = B * Matrix R + "vmul.s16 q15, q8, d3[0] \n" // A = B * Matrix A "vmul.s16 q4, q9, d0[1] \n" // B += G * Matrix B "vmul.s16 q5, q9, d1[1] \n" // G += G * Matrix G "vmul.s16 q6, q9, d2[1] \n" // R += G * Matrix R + "vmul.s16 q7, q9, d3[1] \n" // A += G * Matrix A "vqadd.s16 q12, q12, q4 \n" // Accumulate B "vqadd.s16 q13, q13, q5 \n" // Accumulate G "vqadd.s16 q14, q14, q6 \n" // Accumulate R + "vqadd.s16 q15, q15, q7 \n" // Accumulate A "vmul.s16 q4, q10, d0[2] \n" // B += R * Matrix B "vmul.s16 q5, q10, d1[2] \n" // G += R * Matrix G "vmul.s16 q6, q10, d2[2] \n" // R += R * Matrix R + "vmul.s16 q7, q10, d3[2] \n" // A += R * Matrix A "vqadd.s16 q12, q12, q4 \n" // Accumulate B "vqadd.s16 q13, q13, q5 \n" // Accumulate G "vqadd.s16 q14, q14, q6 \n" // Accumulate R + "vqadd.s16 q15, q15, q7 \n" // Accumulate A "vmul.s16 q4, q15, d0[3] \n" // B += A * Matrix B "vmul.s16 q5, q15, d1[3] \n" // G += A * Matrix G "vmul.s16 q6, q15, d2[3] \n" // R += A * Matrix R + "vmul.s16 q7, q15, d3[3] \n" // A += A * Matrix A "vqadd.s16 q12, q12, q4 \n" // Accumulate B "vqadd.s16 q13, q13, q5 \n" // Accumulate G "vqadd.s16 q14, q14, q6 \n" // Accumulate R - "vqshrun.s16 d16, q12, #7 \n" // 16 bit to 8 bit B - "vqshrun.s16 d18, q13, #7 \n" // 16 bit to 8 bit G - "vqshrun.s16 d20, q14, #7 \n" // 16 bit to 8 bit R - "vst4.8 {d16, d18, d20, d22}, [%0]! \n" // store 8 ARGB pixels. - "bgt 1b \n" - : "+r"(dst_argb), // %0 - "+r"(width) // %1 - : "r"(matrix_argb) // %2 - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q8", "q9", + "vqadd.s16 q15, q15, q7 \n" // Accumulate A + "vqshrun.s16 d16, q12, #6 \n" // 16 bit to 8 bit B + "vqshrun.s16 d18, q13, #6 \n" // 16 bit to 8 bit G + "vqshrun.s16 d20, q14, #6 \n" // 16 bit to 8 bit R + "vqshrun.s16 d22, q15, #6 \n" // 16 bit to 8 bit A + "vst4.8 {d16, d18, d20, d22}, [%1]! 
\n" // store 8 ARGB pixels. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(matrix_argb) // %3 + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" ); } @@ -2531,7 +2558,7 @@ void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1, uint8* dst_argb, int width) { asm volatile ( // 8 pixel loop. - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. "vld4.8 {d1, d3, d5, d7}, [%1]! \n" // load 8 more ARGB pixels. @@ -2562,7 +2589,7 @@ void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1, uint8* dst_argb, int width) { asm volatile ( // 8 pixel loop. - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB pixels. @@ -2586,7 +2613,7 @@ void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1, uint8* dst_argb, int width) { asm volatile ( // 8 pixel loop. - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB pixels. @@ -2615,7 +2642,7 @@ void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, asm volatile ( "vmov.u8 d3, #255 \n" // alpha // 8 pixel loop. - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld1.8 {d0}, [%0]! \n" // load 8 sobelx. "vld1.8 {d1}, [%1]! \n" // load 8 sobely. @@ -2627,8 +2654,30 @@ void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, "bgt 1b \n" : "+r"(src_sobelx), // %0 "+r"(src_sobely), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1" + ); +} + +// Adds Sobel X and Sobel Y and stores Sobel into plane. +void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_y, int width) { + asm volatile ( + // 16 pixel loop. + ".p2align 2 \n" + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load 16 sobelx. + "vld1.8 {q1}, [%1]! \n" // load 16 sobely. + "subs %3, %3, #16 \n" // 16 processed per loop. + "vqadd.u8 q0, q0, q1 \n" // add + "vst1.8 {q0}, [%2]! \n" // store 16 pixels. + "bgt 1b \n" + : "+r"(src_sobelx), // %0 + "+r"(src_sobely), // %1 + "+r"(dst_y), // %2 + "+r"(width) // %3 : : "cc", "memory", "q0", "q1" ); @@ -2644,7 +2693,7 @@ void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, asm volatile ( "vmov.u8 d3, #255 \n" // alpha // 8 pixel loop. - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld1.8 {d2}, [%0]! \n" // load 8 sobelx. "vld1.8 {d0}, [%1]! \n" // load 8 sobely. 
@@ -2668,7 +2717,7 @@ void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1, const uint8* src_y2, uint8* dst_sobelx, int width) { asm volatile ( - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld1.8 {d0}, [%0],%5 \n" // top "vld1.8 {d1}, [%0],%6 \n" @@ -2705,7 +2754,7 @@ void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1, void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1, uint8* dst_sobely, int width) { asm volatile ( - ".p2align 2 \n" + ".p2align 2 \n" "1: \n" "vld1.8 {d0}, [%0],%4 \n" // left "vld1.8 {d1}, [%1],%4 \n" diff --git a/chromium/third_party/libyuv/source/row_posix.cc b/chromium/third_party/libyuv/source/row_posix.cc index b92a9f5c13b..539d871535b 100644 --- a/chromium/third_party/libyuv/source/row_posix.cc +++ b/chromium/third_party/libyuv/source/row_posix.cc @@ -10,155 +10,292 @@ #include "libyuv/row.h" -#include "libyuv/basic_types.h" - #ifdef __cplusplus namespace libyuv { extern "C" { #endif -// This module is for GCC x86 and x64 +// This module is for GCC x86 and x64. #if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) -// GCC 4.2 on OSX has link error when passing static or const to inline. -// TODO(fbarchard): Use static const when gcc 4.2 support is dropped. -#ifdef __APPLE__ -#define CONST +// TODO(nfullagar): For Native Client: When new toolchain becomes available, +// take advantage of bundle lock / unlock feature. This will reduce the amount +// of manual bundle alignment done below, and bundle alignment could even be +// moved into each macro that doesn't use %%nacl: such as MEMOPREG. Consider +// unmunging functions to reduce complex addressing modes. + +#if defined(__native_client__) && defined(__x86_64__) +#define MEMACCESS(base) "%%nacl:(%%r15,%q" #base ")" +#define MEMACCESS2(offset, base) "%%nacl:" #offset "(%%r15,%q" #base ")" +#define MEMLEA(offset, base) #offset "(%q" #base ")" +#define MEMLEA4(offset, base, index, scale) \ + #offset "(%q" #base ",%q" #index "," #scale ")" +#define MEMMOVESTRING(s, d) "%%nacl:(%q" #s "),%%nacl:(%q" #d "), %%r15" +#define MEMSTORESTRING(reg, d) "%%" #reg ",%%nacl:(%q" #d "), %%r15" +#define MEMOPREG(opcode, offset, base, index, scale, reg) \ + "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \ + #opcode " (%%r15,%%r14),%%" #reg "\n" +#define MEMOPMEM(opcode, reg, offset, base, index, scale) \ + "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \ + #opcode " %%" #reg ",(%%r15,%%r14)\n" +#define MEMOP(opcode, offset, base, index, scale) \ + "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \ + #opcode " (%%r15,%%r14)" +#define BUNDLEALIGN ".p2align 5\n" #else -#define CONST static const +#define MEMACCESS(base) "(%" #base ")" +#define MEMACCESS2(offset, base) #offset "(%" #base ")" +#define MEMLEA(offset, base) #offset "(%" #base ")" +#define MEMLEA4(offset, base, index, scale) \ + #offset "(%" #base ",%" #index "," #scale ")" +#define MEMMOVESTRING(s, d) +#define MEMSTORESTRING(reg, d) +#define MEMOPREG(opcode, offset, base, index, scale, reg) \ + #opcode " " #offset "(%" #base ",%" #index "," #scale "),%%" #reg "\n" +#define MEMOPMEM(opcode, reg, offset, base, index, scale) \ + #opcode " %%" #reg ","#offset "(%" #base ",%" #index "," #scale ")\n" +#define MEMOP(opcode, offset, base, index, scale) \ + #opcode " " #offset "(%" #base ",%" #index "," #scale ")" +#define BUNDLEALIGN #endif -#ifdef HAS_ARGBTOYROW_SSSE3 +#if defined(HAS_ARGBTOYROW_SSSE3) || 
defined(HAS_ARGBGRAYROW_SSSE3) // Constants for ARGB -CONST vec8 kARGBToY = { +static vec8 kARGBToY = { 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0 }; // JPeg full range. -CONST vec8 kARGBToYJ = { +static vec8 kARGBToYJ = { 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0 }; +#endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3) + +#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3) -CONST vec8 kARGBToU = { +static vec8 kARGBToU = { 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0 }; -CONST vec8 kARGBToUJ = { +static vec8 kARGBToUJ = { 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0 }; -CONST vec8 kARGBToV = { +static vec8 kARGBToV = { -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, }; -CONST vec8 kARGBToVJ = { +static vec8 kARGBToVJ = { -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0 }; // Constants for BGRA -CONST vec8 kBGRAToY = { +static vec8 kBGRAToY = { 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13 }; -CONST vec8 kBGRAToU = { +static vec8 kBGRAToU = { 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112 }; -CONST vec8 kBGRAToV = { +static vec8 kBGRAToV = { 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18 }; // Constants for ABGR -CONST vec8 kABGRToY = { +static vec8 kABGRToY = { 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0 }; -CONST vec8 kABGRToU = { +static vec8 kABGRToU = { -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0 }; -CONST vec8 kABGRToV = { +static vec8 kABGRToV = { 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0 }; // Constants for RGBA. -CONST vec8 kRGBAToY = { +static vec8 kRGBAToY = { 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33 }; -CONST vec8 kRGBAToU = { +static vec8 kRGBAToU = { 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38 }; -CONST vec8 kRGBAToV = { +static vec8 kRGBAToV = { 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112 }; -CONST uvec8 kAddY16 = { +static uvec8 kAddY16 = { 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u }; -CONST vec16 kAddYJ64 = { +static vec16 kAddYJ64 = { 64, 64, 64, 64, 64, 64, 64, 64 }; -CONST uvec8 kAddUV128 = { +static uvec8 kAddUV128 = { 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u }; -CONST uvec16 kAddUVJ128 = { +static uvec16 kAddUVJ128 = { 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u }; +#endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3) + +#ifdef HAS_RGB24TOARGBROW_SSSE3 // Shuffle table for converting RGB24 to ARGB. -CONST uvec8 kShuffleMaskRGB24ToARGB = { +static uvec8 kShuffleMaskRGB24ToARGB = { 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u }; // Shuffle table for converting RAW to ARGB. -CONST uvec8 kShuffleMaskRAWToARGB = { +static uvec8 kShuffleMaskRAWToARGB = { 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u }; // Shuffle table for converting ARGB to RGB24. -CONST uvec8 kShuffleMaskARGBToRGB24 = { +static uvec8 kShuffleMaskARGBToRGB24 = { 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u }; // Shuffle table for converting ARGB to RAW. 
-CONST uvec8 kShuffleMaskARGBToRAW = { +static uvec8 kShuffleMaskARGBToRAW = { 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u }; // Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4 -CONST uvec8 kShuffleMaskARGBToRGB24_0 = { +static uvec8 kShuffleMaskARGBToRGB24_0 = { 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u }; // Shuffle table for converting ARGB to RAW. -CONST uvec8 kShuffleMaskARGBToRAW_0 = { +static uvec8 kShuffleMaskARGBToRAW_0 = { 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u }; +#endif // HAS_RGB24TOARGBROW_SSSE3 + +#if defined(TESTING) && defined(__x86_64__) +void TestRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { + asm volatile ( + ".p2align 5 \n" + "mov %%eax,%%eax \n" + "mov %%ebx,%%ebx \n" + "mov %%ecx,%%ecx \n" + "mov %%edx,%%edx \n" + "mov %%esi,%%esi \n" + "mov %%edi,%%edi \n" + "mov %%ebp,%%ebp \n" + "mov %%esp,%%esp \n" + ".p2align 5 \n" + "mov %%r8d,%%r8d \n" + "mov %%r9d,%%r9d \n" + "mov %%r10d,%%r10d \n" + "mov %%r11d,%%r11d \n" + "mov %%r12d,%%r12d \n" + "mov %%r13d,%%r13d \n" + "mov %%r14d,%%r14d \n" + "mov %%r15d,%%r15d \n" + ".p2align 5 \n" + "lea (%%rax),%%eax \n" + "lea (%%rbx),%%ebx \n" + "lea (%%rcx),%%ecx \n" + "lea (%%rdx),%%edx \n" + "lea (%%rsi),%%esi \n" + "lea (%%rdi),%%edi \n" + "lea (%%rbp),%%ebp \n" + "lea (%%rsp),%%esp \n" + ".p2align 5 \n" + "lea (%%r8),%%r8d \n" + "lea (%%r9),%%r9d \n" + "lea (%%r10),%%r10d \n" + "lea (%%r11),%%r11d \n" + "lea (%%r12),%%r12d \n" + "lea (%%r13),%%r13d \n" + "lea (%%r14),%%r14d \n" + "lea (%%r15),%%r15d \n" + + ".p2align 5 \n" + "lea 0x10(%%rax),%%eax \n" + "lea 0x10(%%rbx),%%ebx \n" + "lea 0x10(%%rcx),%%ecx \n" + "lea 0x10(%%rdx),%%edx \n" + "lea 0x10(%%rsi),%%esi \n" + "lea 0x10(%%rdi),%%edi \n" + "lea 0x10(%%rbp),%%ebp \n" + "lea 0x10(%%rsp),%%esp \n" + ".p2align 5 \n" + "lea 0x10(%%r8),%%r8d \n" + "lea 0x10(%%r9),%%r9d \n" + "lea 0x10(%%r10),%%r10d \n" + "lea 0x10(%%r11),%%r11d \n" + "lea 0x10(%%r12),%%r12d \n" + "lea 0x10(%%r13),%%r13d \n" + "lea 0x10(%%r14),%%r14d \n" + "lea 0x10(%%r15),%%r15d \n" + + ".p2align 5 \n" + "add 0x10,%%eax \n" + "add 0x10,%%ebx \n" + "add 0x10,%%ecx \n" + "add 0x10,%%edx \n" + "add 0x10,%%esi \n" + "add 0x10,%%edi \n" + "add 0x10,%%ebp \n" + "add 0x10,%%esp \n" + ".p2align 5 \n" + "add 0x10,%%r8d \n" + "add 0x10,%%r9d \n" + "add 0x10,%%r10d \n" + "add 0x10,%%r11d \n" + "add 0x10,%%r12d \n" + "add 0x10,%%r13d \n" + "add 0x10,%%r14d \n" + "add 0x10,%%r15d \n" + + ".p2align 2 \n" + "1: \n" + "movq " MEMACCESS(0) ",%%xmm0 \n" + "lea " MEMLEA(0x8,0) ",%0 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x20,1) ",%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_y), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm5" +#endif + ); +} +#endif // TESTING +#ifdef HAS_I400TOARGBROW_SSE2 void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { asm volatile ( "pcmpeqb %%xmm5,%%xmm5 \n" "pslld $0x18,%%xmm5 \n" - ".p2align 4 \n" + ".p2align 2 \n" "1: \n" - "movq (%0),%%xmm0 \n" - "lea 0x8(%0),%0 \n" + "movq " MEMACCESS(0) ",%%xmm0 \n" + "lea " MEMLEA(0x8,0) ",%0 \n" "punpcklbw %%xmm0,%%xmm0 \n" "movdqa %%xmm0,%%xmm1 \n" "punpcklwd %%xmm0,%%xmm0 \n" "punpckhwd %%xmm1,%%xmm1 \n" "por %%xmm5,%%xmm0 \n" "por %%xmm5,%%xmm1 \n" - "movdqa %%xmm0,(%1) \n" - "movdqa %%xmm1,0x10(%1) \n" - "lea 0x20(%1),%1 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n" + 
"lea " MEMLEA(0x20,1) ",%1 \n" "sub $0x8,%2 \n" "jg 1b \n" : "+r"(src_y), // %0 @@ -177,19 +314,19 @@ void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb, asm volatile ( "pcmpeqb %%xmm5,%%xmm5 \n" "pslld $0x18,%%xmm5 \n" - ".p2align 4 \n" + ".p2align 2 \n" "1: \n" - "movq (%0),%%xmm0 \n" - "lea 0x8(%0),%0 \n" + "movq " MEMACCESS(0) ",%%xmm0 \n" + "lea " MEMLEA(0x8,0) ",%0 \n" "punpcklbw %%xmm0,%%xmm0 \n" "movdqa %%xmm0,%%xmm1 \n" "punpcklwd %%xmm0,%%xmm0 \n" "punpckhwd %%xmm1,%%xmm1 \n" "por %%xmm5,%%xmm0 \n" "por %%xmm5,%%xmm1 \n" - "movdqu %%xmm0,(%1) \n" - "movdqu %%xmm1,0x10(%1) \n" - "lea 0x20(%1),%1 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" + "lea " MEMLEA(0x20,1) ",%1 \n" "sub $0x8,%2 \n" "jg 1b \n" : "+r"(src_y), // %0 @@ -202,36 +339,39 @@ void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb, #endif ); } +#endif // HAS_I400TOARGBROW_SSE2 +#ifdef HAS_RGB24TOARGBROW_SSSE3 void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) { asm volatile ( "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000 "pslld $0x18,%%xmm5 \n" "movdqa %3,%%xmm4 \n" - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm3 \n" - "lea 0x30(%0),%0 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm3 \n" + "lea " MEMLEA(0x30,0) ",%0 \n" "movdqa %%xmm3,%%xmm2 \n" "palignr $0x8,%%xmm1,%%xmm2 \n" "pshufb %%xmm4,%%xmm2 \n" "por %%xmm5,%%xmm2 \n" "palignr $0xc,%%xmm0,%%xmm1 \n" "pshufb %%xmm4,%%xmm0 \n" - "movdqa %%xmm2,0x20(%1) \n" + "movdqa %%xmm2," MEMACCESS2(0x20,1) " \n" "por %%xmm5,%%xmm0 \n" "pshufb %%xmm4,%%xmm1 \n" - "movdqa %%xmm0,(%1) \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" "por %%xmm5,%%xmm1 \n" "palignr $0x4,%%xmm3,%%xmm3 \n" "pshufb %%xmm4,%%xmm3 \n" - "movdqa %%xmm1,0x10(%1) \n" + "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n" "por %%xmm5,%%xmm3 \n" "sub $0x10,%2 \n" - "movdqa %%xmm3,0x30(%1) \n" - "lea 0x40(%1),%1 \n" + "movdqa %%xmm3," MEMACCESS2(0x30,1) " \n" + "lea " MEMLEA(0x40,1) ",%1 \n" "jg 1b \n" : "+r"(src_rgb24), // %0 "+r"(dst_argb), // %1 @@ -249,30 +389,31 @@ void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) { "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000 "pslld $0x18,%%xmm5 \n" "movdqa %3,%%xmm4 \n" - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm3 \n" - "lea 0x30(%0),%0 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm3 \n" + "lea " MEMLEA(0x30,0) ",%0 \n" "movdqa %%xmm3,%%xmm2 \n" "palignr $0x8,%%xmm1,%%xmm2 \n" "pshufb %%xmm4,%%xmm2 \n" "por %%xmm5,%%xmm2 \n" "palignr $0xc,%%xmm0,%%xmm1 \n" "pshufb %%xmm4,%%xmm0 \n" - "movdqa %%xmm2,0x20(%1) \n" + "movdqa %%xmm2," MEMACCESS2(0x20,1) " \n" "por %%xmm5,%%xmm0 \n" "pshufb %%xmm4,%%xmm1 \n" - "movdqa %%xmm0,(%1) \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" "por %%xmm5,%%xmm1 \n" "palignr $0x4,%%xmm3,%%xmm3 \n" "pshufb %%xmm4,%%xmm3 \n" - "movdqa %%xmm1,0x10(%1) \n" + "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n" "por %%xmm5,%%xmm3 \n" "sub $0x10,%2 \n" - "movdqa %%xmm3,0x30(%1) \n" - "lea 0x40(%1),%1 \n" + "movdqa %%xmm3," MEMACCESS2(0x30,1) " \n" + "lea " MEMLEA(0x40,1) ",%1 \n" "jg 1b \n" : "+r"(src_raw), // %0 "+r"(dst_argb), // %1 @@ -302,9 +443,10 @@ void RGB565ToARGBRow_SSE2(const uint8* src, 
uint8* dst, int pix) { "psllw $0x8,%%xmm7 \n" "sub %0,%1 \n" "sub %0,%1 \n" - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" "movdqa %%xmm0,%%xmm1 \n" "movdqa %%xmm0,%%xmm2 \n" "pand %%xmm3,%%xmm1 \n" @@ -319,9 +461,10 @@ void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) { "movdqa %%xmm1,%%xmm2 \n" "punpcklbw %%xmm0,%%xmm1 \n" "punpckhbw %%xmm0,%%xmm2 \n" - "movdqa %%xmm1,(%1,%0,2) \n" - "movdqa %%xmm2,0x10(%1,%0,2) \n" - "lea 0x10(%0),%0 \n" + BUNDLEALIGN + MEMOPMEM(movdqa,xmm1,0x00,1,0,2) // movdqa %%xmm1,(%1,%0,2) + MEMOPMEM(movdqa,xmm2,0x10,1,0,2) // movdqa %%xmm2,0x10(%1,%0,2) + "lea " MEMLEA(0x10,0) ",%0 \n" "sub $0x8,%2 \n" "jg 1b \n" : "+r"(src), // %0 @@ -329,6 +472,9 @@ void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) { "+r"(pix) // %2 : : "memory", "cc", "eax" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" #endif @@ -351,9 +497,10 @@ void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) { "psllw $0x8,%%xmm7 \n" "sub %0,%1 \n" "sub %0,%1 \n" - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" "movdqa %%xmm0,%%xmm1 \n" "movdqa %%xmm0,%%xmm2 \n" "psllw $0x1,%%xmm1 \n" @@ -372,9 +519,10 @@ void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) { "movdqa %%xmm1,%%xmm2 \n" "punpcklbw %%xmm0,%%xmm1 \n" "punpckhbw %%xmm0,%%xmm2 \n" - "movdqa %%xmm1,(%1,%0,2) \n" - "movdqa %%xmm2,0x10(%1,%0,2) \n" - "lea 0x10(%0),%0 \n" + BUNDLEALIGN + MEMOPMEM(movdqa,xmm1,0x00,1,0,2) // movdqa %%xmm1,(%1,%0,2) + MEMOPMEM(movdqa,xmm2,0x10,1,0,2) // movdqa %%xmm2,0x10(%1,%0,2) + "lea " MEMLEA(0x10,0) ",%0 \n" "sub $0x8,%2 \n" "jg 1b \n" : "+r"(src), // %0 @@ -382,6 +530,9 @@ void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) { "+r"(pix) // %2 : : "memory", "cc", "eax" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" #endif @@ -397,9 +548,10 @@ void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) { "pslld $0x4,%%xmm5 \n" "sub %0,%1 \n" "sub %0,%1 \n" - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" "movdqa %%xmm0,%%xmm2 \n" "pand %%xmm4,%%xmm0 \n" "pand %%xmm5,%%xmm2 \n" @@ -412,9 +564,10 @@ void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) { "movdqa %%xmm0,%%xmm1 \n" "punpcklbw %%xmm2,%%xmm0 \n" "punpckhbw %%xmm2,%%xmm1 \n" - "movdqa %%xmm0,(%1,%0,2) \n" - "movdqa %%xmm1,0x10(%1,%0,2) \n" - "lea 0x10(%0),%0 \n" + BUNDLEALIGN + MEMOPMEM(movdqa,xmm0,0x00,1,0,2) // movdqa %%xmm0,(%1,%0,2) + MEMOPMEM(movdqa,xmm1,0x10,1,0,2) // movdqa %%xmm1,0x10(%1,%0,2) + "lea " MEMLEA(0x10,0) ",%0 \n" "sub $0x8,%2 \n" "jg 1b \n" : "+r"(src), // %0 @@ -422,6 +575,9 @@ void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) { "+r"(pix) // %2 : : "memory", "cc", "eax" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" #endif @@ -431,13 +587,14 @@ void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) { void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) { asm volatile ( "movdqa %3,%%xmm6 \n" - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movdqa 
(%0),%%xmm0 \n" - "movdqa 0x10(%0),%%xmm1 \n" - "movdqa 0x20(%0),%%xmm2 \n" - "movdqa 0x30(%0),%%xmm3 \n" - "lea 0x40(%0),%0 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" "pshufb %%xmm6,%%xmm0 \n" "pshufb %%xmm6,%%xmm1 \n" "pshufb %%xmm6,%%xmm2 \n" @@ -448,14 +605,14 @@ void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) { "movdqa %%xmm2,%%xmm5 \n" "por %%xmm4,%%xmm0 \n" "pslldq $0x8,%%xmm5 \n" - "movdqa %%xmm0,(%1) \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" "por %%xmm5,%%xmm1 \n" "psrldq $0x8,%%xmm2 \n" "pslldq $0x4,%%xmm3 \n" "por %%xmm3,%%xmm2 \n" - "movdqa %%xmm1,0x10(%1) \n" - "movdqa %%xmm2,0x20(%1) \n" - "lea 0x30(%1),%1 \n" + "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" + "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n" + "lea " MEMLEA(0x30,1) ",%1 \n" "sub $0x10,%2 \n" "jg 1b \n" : "+r"(src), // %0 @@ -472,13 +629,14 @@ void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) { void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int pix) { asm volatile ( "movdqa %3,%%xmm6 \n" - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movdqa (%0),%%xmm0 \n" - "movdqa 0x10(%0),%%xmm1 \n" - "movdqa 0x20(%0),%%xmm2 \n" - "movdqa 0x30(%0),%%xmm3 \n" - "lea 0x40(%0),%0 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" "pshufb %%xmm6,%%xmm0 \n" "pshufb %%xmm6,%%xmm1 \n" "pshufb %%xmm6,%%xmm2 \n" @@ -489,14 +647,14 @@ void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int pix) { "movdqa %%xmm2,%%xmm5 \n" "por %%xmm4,%%xmm0 \n" "pslldq $0x8,%%xmm5 \n" - "movdqa %%xmm0,(%1) \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" "por %%xmm5,%%xmm1 \n" "psrldq $0x8,%%xmm2 \n" "pslldq $0x4,%%xmm3 \n" "por %%xmm3,%%xmm2 \n" - "movdqa %%xmm1,0x10(%1) \n" - "movdqa %%xmm2,0x20(%1) \n" - "lea 0x30(%1),%1 \n" + "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" + "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n" + "lea " MEMLEA(0x30,1) ",%1 \n" "sub $0x10,%2 \n" "jg 1b \n" : "+r"(src), // %0 @@ -519,9 +677,10 @@ void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int pix) { "pslld $0x5,%%xmm4 \n" "pcmpeqb %%xmm5,%%xmm5 \n" "pslld $0xb,%%xmm5 \n" - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movdqa (%0),%%xmm0 \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" "movdqa %%xmm0,%%xmm1 \n" "movdqa %%xmm0,%%xmm2 \n" "pslld $0x8,%%xmm0 \n" @@ -534,9 +693,9 @@ void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int pix) { "por %%xmm2,%%xmm1 \n" "por %%xmm1,%%xmm0 \n" "packssdw %%xmm0,%%xmm0 \n" - "lea 0x10(%0),%0 \n" - "movq %%xmm0,(%1) \n" - "lea 0x8(%1),%1 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" + "movq %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x8,1) ",%1 \n" "sub $0x4,%2 \n" "jg 1b \n" : "+r"(src), // %0 @@ -560,9 +719,10 @@ void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int pix) { "pslld $0xa,%%xmm6 \n" "pcmpeqb %%xmm7,%%xmm7 \n" "pslld $0xf,%%xmm7 \n" - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movdqa (%0),%%xmm0 \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" "movdqa %%xmm0,%%xmm1 \n" "movdqa %%xmm0,%%xmm2 \n" "movdqa %%xmm0,%%xmm3 \n" @@ -578,9 +738,9 @@ void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int pix) { "por %%xmm3,%%xmm2 \n" "por %%xmm2,%%xmm0 \n" "packssdw %%xmm0,%%xmm0 \n" - "lea 0x10(%0),%0 \n" - "movq %%xmm0,(%1) \n" - "lea 0x8(%1),%1 \n" + 
"lea " MEMLEA(0x10,0) ",%0 \n" + "movq %%xmm0," MEMACCESS(1) " \n" + "lea " MEMACCESS2(0x8,1) ",%1 \n" "sub $0x4,%2 \n" "jg 1b \n" : "+r"(src), // %0 @@ -600,9 +760,10 @@ void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) { "psllw $0xc,%%xmm4 \n" "movdqa %%xmm4,%%xmm3 \n" "psrlw $0x8,%%xmm3 \n" - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movdqa (%0),%%xmm0 \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" "movdqa %%xmm0,%%xmm1 \n" "pand %%xmm3,%%xmm0 \n" "pand %%xmm4,%%xmm1 \n" @@ -610,9 +771,9 @@ void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) { "psrlq $0x8,%%xmm1 \n" "por %%xmm1,%%xmm0 \n" "packuswb %%xmm0,%%xmm0 \n" - "lea 0x10(%0),%0 \n" - "movq %%xmm0,(%1) \n" - "lea 0x8(%1),%1 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" + "movq %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x8,1) ",%1 \n" "sub $0x4,%2 \n" "jg 1b \n" : "+r"(src), // %0 @@ -625,22 +786,24 @@ void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) { #endif ); } +#endif // HAS_RGB24TOARGBROW_SSSE3 +#ifdef HAS_ARGBTOYROW_SSSE3 void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { asm volatile ( "movdqa %4,%%xmm5 \n" "movdqa %3,%%xmm4 \n" - ".p2align 4 \n" + ".p2align 2 \n" "1: \n" - "movdqa (%0),%%xmm0 \n" - "movdqa 0x10(%0),%%xmm1 \n" - "movdqa 0x20(%0),%%xmm2 \n" - "movdqa 0x30(%0),%%xmm3 \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n" "pmaddubsw %%xmm4,%%xmm0 \n" "pmaddubsw %%xmm4,%%xmm1 \n" "pmaddubsw %%xmm4,%%xmm2 \n" "pmaddubsw %%xmm4,%%xmm3 \n" - "lea 0x40(%0),%0 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" "phaddw %%xmm1,%%xmm0 \n" "phaddw %%xmm3,%%xmm2 \n" "psrlw $0x7,%%xmm0 \n" @@ -648,8 +811,8 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { "packuswb %%xmm2,%%xmm0 \n" "paddb %%xmm5,%%xmm0 \n" "sub $0x10,%2 \n" - "movdqa %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_y), // %1 @@ -663,74 +826,76 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { ); } -void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { +void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { asm volatile ( - "movdqa %3,%%xmm4 \n" "movdqa %4,%%xmm5 \n" - ".p2align 4 \n" + "movdqa %3,%%xmm4 \n" + ".p2align 2 \n" "1: \n" - "movdqa (%0),%%xmm0 \n" - "movdqa 0x10(%0),%%xmm1 \n" - "movdqa 0x20(%0),%%xmm2 \n" - "movdqa 0x30(%0),%%xmm3 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" "pmaddubsw %%xmm4,%%xmm0 \n" "pmaddubsw %%xmm4,%%xmm1 \n" "pmaddubsw %%xmm4,%%xmm2 \n" "pmaddubsw %%xmm4,%%xmm3 \n" - "lea 0x40(%0),%0 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" "phaddw %%xmm1,%%xmm0 \n" "phaddw %%xmm3,%%xmm2 \n" - "paddw %%xmm5,%%xmm0 \n" - "paddw %%xmm5,%%xmm2 \n" "psrlw $0x7,%%xmm0 \n" "psrlw $0x7,%%xmm2 \n" "packuswb %%xmm2,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" "sub $0x10,%2 \n" - "movdqa %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_y), // %1 "+r"(pix) // %2 - : "m"(kARGBToYJ), // %3 - "m"(kAddYJ64) // %4 + : "m"(kARGBToY), // %3 + "m"(kAddY16) // %4 : "memory", "cc" #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" #endif ); } +#endif // 
HAS_ARGBTOYROW_SSSE3 -void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { +#ifdef HAS_ARGBTOYJROW_SSSE3 +void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { asm volatile ( - "movdqa %4,%%xmm5 \n" "movdqa %3,%%xmm4 \n" - ".p2align 4 \n" + "movdqa %4,%%xmm5 \n" + ".p2align 2 \n" "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x30(%0),%%xmm3 \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n" "pmaddubsw %%xmm4,%%xmm0 \n" "pmaddubsw %%xmm4,%%xmm1 \n" "pmaddubsw %%xmm4,%%xmm2 \n" "pmaddubsw %%xmm4,%%xmm3 \n" - "lea 0x40(%0),%0 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" "phaddw %%xmm1,%%xmm0 \n" "phaddw %%xmm3,%%xmm2 \n" + "paddw %%xmm5,%%xmm0 \n" + "paddw %%xmm5,%%xmm2 \n" "psrlw $0x7,%%xmm0 \n" "psrlw $0x7,%%xmm2 \n" "packuswb %%xmm2,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" "sub $0x10,%2 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_y), // %1 "+r"(pix) // %2 - : "m"(kARGBToY), // %3 - "m"(kAddY16) // %4 + : "m"(kARGBToYJ), // %3 + "m"(kAddYJ64) // %4 : "memory", "cc" #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" @@ -742,17 +907,17 @@ void ARGBToYJRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { asm volatile ( "movdqa %3,%%xmm4 \n" "movdqa %4,%%xmm5 \n" - ".p2align 4 \n" + ".p2align 2 \n" "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x30(%0),%%xmm3 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" "pmaddubsw %%xmm4,%%xmm0 \n" "pmaddubsw %%xmm4,%%xmm1 \n" "pmaddubsw %%xmm4,%%xmm2 \n" "pmaddubsw %%xmm4,%%xmm3 \n" - "lea 0x40(%0),%0 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" "phaddw %%xmm1,%%xmm0 \n" "phaddw %%xmm3,%%xmm2 \n" "paddw %%xmm5,%%xmm0 \n" @@ -761,8 +926,8 @@ void ARGBToYJRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { "psrlw $0x7,%%xmm2 \n" "packuswb %%xmm2,%%xmm0 \n" "sub $0x10,%2 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_y), // %1 @@ -775,7 +940,9 @@ void ARGBToYJRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { #endif ); } +#endif // HAS_ARGBTOYJROW_SSSE3 +#ifdef HAS_ARGBTOUVROW_SSSE3 // TODO(fbarchard): pass xmm constants to single block of assembly. // fpic on GCC 4.2 for OSX runs out of GPR registers. "m" effectively takes // 3 registers - ebx, ebp and eax. 
"m" can be passed with 3 normal registers, @@ -794,17 +961,19 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, ); asm volatile ( "sub %1,%2 \n" - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movdqa (%0),%%xmm0 \n" - "movdqa 0x10(%0),%%xmm1 \n" - "movdqa 0x20(%0),%%xmm2 \n" - "movdqa 0x30(%0),%%xmm6 \n" - "pavgb (%0,%4,1),%%xmm0 \n" - "pavgb 0x10(%0,%4,1),%%xmm1 \n" - "pavgb 0x20(%0,%4,1),%%xmm2 \n" - "pavgb 0x30(%0,%4,1),%%xmm6 \n" - "lea 0x40(%0),%0 \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n" + BUNDLEALIGN + MEMOPREG(pavgb,0x00,0,4,1,xmm0) // pavgb (%0,%4,1),%%xmm0 + MEMOPREG(pavgb,0x10,0,4,1,xmm1) // pavgb 0x10(%0,%4,1),%%xmm1 + MEMOPREG(pavgb,0x20,0,4,1,xmm2) // pavgb 0x20(%0,%4,1),%%xmm2 + MEMOPREG(pavgb,0x30,0,4,1,xmm6) // pavgb 0x30(%0,%4,1),%%xmm6 + "lea " MEMLEA(0x40,0) ",%0 \n" "movdqa %%xmm0,%%xmm7 \n" "shufps $0x88,%%xmm1,%%xmm0 \n" "shufps $0xdd,%%xmm1,%%xmm7 \n" @@ -826,16 +995,20 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, "packsswb %%xmm1,%%xmm0 \n" "paddb %%xmm5,%%xmm0 \n" "sub $0x10,%3 \n" - "movlps %%xmm0,(%1) \n" - "movhps %%xmm0,(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" + "movlps %%xmm0," MEMACCESS(1) " \n" + BUNDLEALIGN + MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) + "lea " MEMLEA(0x8,1) ",%1 \n" "jg 1b \n" : "+r"(src_argb0), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 "+rm"(width) // %3 - : "r"(static_cast<intptr_t>(src_stride_argb)) + : "r"(static_cast<intptr_t>(src_stride_argb)) // %4 : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" #endif @@ -856,17 +1029,19 @@ void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb, ); asm volatile ( "sub %1,%2 \n" - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movdqa (%0),%%xmm0 \n" - "movdqa 0x10(%0),%%xmm1 \n" - "movdqa 0x20(%0),%%xmm2 \n" - "movdqa 0x30(%0),%%xmm6 \n" - "pavgb (%0,%4,1),%%xmm0 \n" - "pavgb 0x10(%0,%4,1),%%xmm1 \n" - "pavgb 0x20(%0,%4,1),%%xmm2 \n" - "pavgb 0x30(%0,%4,1),%%xmm6 \n" - "lea 0x40(%0),%0 \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n" + BUNDLEALIGN + MEMOPREG(pavgb,0x00,0,4,1,xmm0) // pavgb (%0,%4,1),%%xmm0 + MEMOPREG(pavgb,0x10,0,4,1,xmm1) // pavgb 0x10(%0,%4,1),%%xmm1 + MEMOPREG(pavgb,0x20,0,4,1,xmm2) // pavgb 0x20(%0,%4,1),%%xmm2 + MEMOPREG(pavgb,0x30,0,4,1,xmm6) // pavgb 0x30(%0,%4,1),%%xmm6 + "lea " MEMLEA(0x40,0) ",%0 \n" "movdqa %%xmm0,%%xmm7 \n" "shufps $0x88,%%xmm1,%%xmm0 \n" "shufps $0xdd,%%xmm1,%%xmm7 \n" @@ -889,16 +1064,20 @@ void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb, "psraw $0x8,%%xmm1 \n" "packsswb %%xmm1,%%xmm0 \n" "sub $0x10,%3 \n" - "movlps %%xmm0,(%1) \n" - "movhps %%xmm0,(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" + "movlps %%xmm0," MEMACCESS(1) " \n" + BUNDLEALIGN + MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) + "lea " MEMLEA(0x8,1) ",%1 \n" "jg 1b \n" : "+r"(src_argb0), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 "+rm"(width) // %3 - : "r"(static_cast<intptr_t>(src_stride_argb)) + : "r"(static_cast<intptr_t>(src_stride_argb)) // %4 : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" #endif @@ 
-918,21 +1097,23 @@ void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb, ); asm volatile ( "sub %1,%2 \n" - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x30(%0),%%xmm6 \n" - "movdqu (%0,%4,1),%%xmm7 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" + BUNDLEALIGN + MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 "pavgb %%xmm7,%%xmm0 \n" - "movdqu 0x10(%0,%4,1),%%xmm7 \n" + MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7 "pavgb %%xmm7,%%xmm1 \n" - "movdqu 0x20(%0,%4,1),%%xmm7 \n" + MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7 "pavgb %%xmm7,%%xmm2 \n" - "movdqu 0x30(%0,%4,1),%%xmm7 \n" + MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7 "pavgb %%xmm7,%%xmm6 \n" - "lea 0x40(%0),%0 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" "movdqa %%xmm0,%%xmm7 \n" "shufps $0x88,%%xmm1,%%xmm0 \n" "shufps $0xdd,%%xmm1,%%xmm7 \n" @@ -954,16 +1135,20 @@ void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb, "packsswb %%xmm1,%%xmm0 \n" "paddb %%xmm5,%%xmm0 \n" "sub $0x10,%3 \n" - "movlps %%xmm0,(%1) \n" - "movhps %%xmm0,(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" + "movlps %%xmm0," MEMACCESS(1) " \n" + BUNDLEALIGN + MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) + "lea " MEMLEA(0x8,1) ",%1 \n" "jg 1b \n" : "+r"(src_argb0), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 "+rm"(width) // %3 - : "r"(static_cast<intptr_t>(src_stride_argb)) + : "r"(static_cast<intptr_t>(src_stride_argb)) // %4 : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" #endif @@ -983,21 +1168,23 @@ void ARGBToUVJRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb, ); asm volatile ( "sub %1,%2 \n" - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x30(%0),%%xmm6 \n" - "movdqu (%0,%4,1),%%xmm7 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" + BUNDLEALIGN + MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 "pavgb %%xmm7,%%xmm0 \n" - "movdqu 0x10(%0,%4,1),%%xmm7 \n" + MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7 "pavgb %%xmm7,%%xmm1 \n" - "movdqu 0x20(%0,%4,1),%%xmm7 \n" + MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7 "pavgb %%xmm7,%%xmm2 \n" - "movdqu 0x30(%0,%4,1),%%xmm7 \n" + MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7 "pavgb %%xmm7,%%xmm6 \n" - "lea 0x40(%0),%0 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" "movdqa %%xmm0,%%xmm7 \n" "shufps $0x88,%%xmm1,%%xmm0 \n" "shufps $0xdd,%%xmm1,%%xmm7 \n" @@ -1020,9 +1207,10 @@ void ARGBToUVJRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb, "psraw $0x8,%%xmm1 \n" "packsswb %%xmm1,%%xmm0 \n" "sub $0x10,%3 \n" - "movlps %%xmm0,(%1) \n" - "movhps %%xmm0,(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" + "movlps %%xmm0," MEMACCESS(1) " \n" + BUNDLEALIGN + MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) + "lea " MEMLEA(0x8,1) ",%1 \n" "jg 1b \n" : "+r"(src_argb0), // %0 "+r"(dst_u), // %1 @@ -1030,6 +1218,9 @@ void ARGBToUVJRow_Unaligned_SSSE3(const uint8* src_argb0, int 
src_stride_argb, "+rm"(width) // %3 : "r"(static_cast<intptr_t>(src_stride_argb)) : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" #endif @@ -1049,12 +1240,13 @@ void ARGBToUV444Row_SSSE3(const uint8* src_argb, uint8* dst_u, uint8* dst_v, ); asm volatile ( "sub %1,%2 \n" - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movdqa (%0),%%xmm0 \n" - "movdqa 0x10(%0),%%xmm1 \n" - "movdqa 0x20(%0),%%xmm2 \n" - "movdqa 0x30(%0),%%xmm6 \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n" "pmaddubsw %%xmm4,%%xmm0 \n" "pmaddubsw %%xmm4,%%xmm1 \n" "pmaddubsw %%xmm4,%%xmm2 \n" @@ -1066,11 +1258,11 @@ void ARGBToUV444Row_SSSE3(const uint8* src_argb, uint8* dst_u, uint8* dst_v, "packsswb %%xmm2,%%xmm0 \n" "paddb %%xmm5,%%xmm0 \n" "sub $0x10,%3 \n" - "movdqa %%xmm0,(%1) \n" - "movdqa (%0),%%xmm0 \n" - "movdqa 0x10(%0),%%xmm1 \n" - "movdqa 0x20(%0),%%xmm2 \n" - "movdqa 0x30(%0),%%xmm6 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n" "pmaddubsw %%xmm3,%%xmm0 \n" "pmaddubsw %%xmm3,%%xmm1 \n" "pmaddubsw %%xmm3,%%xmm2 \n" @@ -1081,9 +1273,10 @@ void ARGBToUV444Row_SSSE3(const uint8* src_argb, uint8* dst_u, uint8* dst_v, "psraw $0x8,%%xmm2 \n" "packsswb %%xmm2,%%xmm0 \n" "paddb %%xmm5,%%xmm0 \n" - "lea 0x40(%0),%0 \n" - "movdqa %%xmm0,(%1,%2,1) \n" - "lea 0x10(%1),%1 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" + BUNDLEALIGN + MEMOPMEM(movdqa,xmm0,0x00,1,2,1) // movdqa %%xmm0,(%1,%2,1) + "lea " MEMLEA(0x10,1) ",%1 \n" "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_u), // %1 @@ -1091,6 +1284,9 @@ void ARGBToUV444Row_SSSE3(const uint8* src_argb, uint8* dst_u, uint8* dst_v, "+rm"(width) // %3 : : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm6" #endif @@ -1110,12 +1306,13 @@ void ARGBToUV444Row_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_u, ); asm volatile ( "sub %1,%2 \n" - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x30(%0),%%xmm6 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" "pmaddubsw %%xmm4,%%xmm0 \n" "pmaddubsw %%xmm4,%%xmm1 \n" "pmaddubsw %%xmm4,%%xmm2 \n" @@ -1127,11 +1324,11 @@ void ARGBToUV444Row_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_u, "packsswb %%xmm2,%%xmm0 \n" "paddb %%xmm5,%%xmm0 \n" "sub $0x10,%3 \n" - "movdqu %%xmm0,(%1) \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x30(%0),%%xmm6 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" "pmaddubsw %%xmm3,%%xmm0 \n" "pmaddubsw %%xmm3,%%xmm1 \n" "pmaddubsw %%xmm3,%%xmm2 \n" @@ -1142,9 +1339,10 @@ void ARGBToUV444Row_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_u, "psraw $0x8,%%xmm2 \n" "packsswb %%xmm2,%%xmm0 \n" "paddb %%xmm5,%%xmm0 \n" - "lea 0x40(%0),%0 \n" - "movdqu %%xmm0,(%1,%2,1) \n" - "lea 0x10(%1),%1 \n" + 
"lea " MEMLEA(0x40,0) ",%0 \n" + BUNDLEALIGN + MEMOPMEM(movdqu,xmm0,0x00,1,2,1) // movdqu %%xmm0,(%1,%2,1) + "lea " MEMLEA(0x10,1) ",%1 \n" "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_u), // %1 @@ -1152,6 +1350,9 @@ void ARGBToUV444Row_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_u, "+rm"(width) // %3 : : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm6" #endif @@ -1171,13 +1372,14 @@ void ARGBToUV422Row_SSSE3(const uint8* src_argb0, ); asm volatile ( "sub %1,%2 \n" - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movdqa (%0),%%xmm0 \n" - "movdqa 0x10(%0),%%xmm1 \n" - "movdqa 0x20(%0),%%xmm2 \n" - "movdqa 0x30(%0),%%xmm6 \n" - "lea 0x40(%0),%0 \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" "movdqa %%xmm0,%%xmm7 \n" "shufps $0x88,%%xmm1,%%xmm0 \n" "shufps $0xdd,%%xmm1,%%xmm7 \n" @@ -1199,9 +1401,10 @@ void ARGBToUV422Row_SSSE3(const uint8* src_argb0, "packsswb %%xmm1,%%xmm0 \n" "paddb %%xmm5,%%xmm0 \n" "sub $0x10,%3 \n" - "movlps %%xmm0,(%1) \n" - "movhps %%xmm0,(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" + "movlps %%xmm0," MEMACCESS(1) " \n" + BUNDLEALIGN + MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) + "lea " MEMLEA(0x8,1) ",%1 \n" "jg 1b \n" : "+r"(src_argb0), // %0 "+r"(dst_u), // %1 @@ -1209,6 +1412,9 @@ void ARGBToUV422Row_SSSE3(const uint8* src_argb0, "+rm"(width) // %3 : : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" #endif @@ -1228,13 +1434,14 @@ void ARGBToUV422Row_Unaligned_SSSE3(const uint8* src_argb0, ); asm volatile ( "sub %1,%2 \n" - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x30(%0),%%xmm6 \n" - "lea 0x40(%0),%0 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" "movdqa %%xmm0,%%xmm7 \n" "shufps $0x88,%%xmm1,%%xmm0 \n" "shufps $0xdd,%%xmm1,%%xmm7 \n" @@ -1256,9 +1463,10 @@ void ARGBToUV422Row_Unaligned_SSSE3(const uint8* src_argb0, "packsswb %%xmm1,%%xmm0 \n" "paddb %%xmm5,%%xmm0 \n" "sub $0x10,%3 \n" - "movlps %%xmm0,(%1) \n" - "movhps %%xmm0,(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" + "movlps %%xmm0," MEMACCESS(1) " \n" + BUNDLEALIGN + MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) + "lea " MEMLEA(0x8,1) ",%1 \n" "jg 1b \n" : "+r"(src_argb0), // %0 "+r"(dst_u), // %1 @@ -1266,6 +1474,9 @@ void ARGBToUV422Row_Unaligned_SSSE3(const uint8* src_argb0, "+rm"(width) // %3 : : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" #endif @@ -1276,17 +1487,18 @@ void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) { asm volatile ( "movdqa %4,%%xmm5 \n" "movdqa %3,%%xmm4 \n" - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movdqa (%0),%%xmm0 \n" - "movdqa 0x10(%0),%%xmm1 \n" - "movdqa 0x20(%0),%%xmm2 \n" - "movdqa 0x30(%0),%%xmm3 \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n" "pmaddubsw 
%%xmm4,%%xmm0 \n" "pmaddubsw %%xmm4,%%xmm1 \n" "pmaddubsw %%xmm4,%%xmm2 \n" "pmaddubsw %%xmm4,%%xmm3 \n" - "lea 0x40(%0),%0 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" "phaddw %%xmm1,%%xmm0 \n" "phaddw %%xmm3,%%xmm2 \n" "psrlw $0x7,%%xmm0 \n" @@ -1294,8 +1506,8 @@ void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) { "packuswb %%xmm2,%%xmm0 \n" "paddb %%xmm5,%%xmm0 \n" "sub $0x10,%2 \n" - "movdqa %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" "jg 1b \n" : "+r"(src_bgra), // %0 "+r"(dst_y), // %1 @@ -1313,17 +1525,18 @@ void BGRAToYRow_Unaligned_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) { asm volatile ( "movdqa %4,%%xmm5 \n" "movdqa %3,%%xmm4 \n" - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x30(%0),%%xmm3 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" "pmaddubsw %%xmm4,%%xmm0 \n" "pmaddubsw %%xmm4,%%xmm1 \n" "pmaddubsw %%xmm4,%%xmm2 \n" "pmaddubsw %%xmm4,%%xmm3 \n" - "lea 0x40(%0),%0 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" "phaddw %%xmm1,%%xmm0 \n" "phaddw %%xmm3,%%xmm2 \n" "psrlw $0x7,%%xmm0 \n" @@ -1331,8 +1544,8 @@ void BGRAToYRow_Unaligned_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) { "packuswb %%xmm2,%%xmm0 \n" "paddb %%xmm5,%%xmm0 \n" "sub $0x10,%2 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" "jg 1b \n" : "+r"(src_bgra), // %0 "+r"(dst_y), // %1 @@ -1359,17 +1572,19 @@ void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra, ); asm volatile ( "sub %1,%2 \n" - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movdqa (%0),%%xmm0 \n" - "movdqa 0x10(%0),%%xmm1 \n" - "movdqa 0x20(%0),%%xmm2 \n" - "movdqa 0x30(%0),%%xmm6 \n" - "pavgb (%0,%4,1),%%xmm0 \n" - "pavgb 0x10(%0,%4,1),%%xmm1 \n" - "pavgb 0x20(%0,%4,1),%%xmm2 \n" - "pavgb 0x30(%0,%4,1),%%xmm6 \n" - "lea 0x40(%0),%0 \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n" + BUNDLEALIGN + MEMOPREG(pavgb,0x00,0,4,1,xmm0) // pavgb (%0,%4,1),%%xmm0 + MEMOPREG(pavgb,0x10,0,4,1,xmm1) // pavgb 0x10(%0,%4,1),%%xmm1 + MEMOPREG(pavgb,0x20,0,4,1,xmm2) // pavgb 0x20(%0,%4,1),%%xmm2 + MEMOPREG(pavgb,0x30,0,4,1,xmm6) // pavgb 0x30(%0,%4,1),%%xmm6 + "lea " MEMLEA(0x40,0) ",%0 \n" "movdqa %%xmm0,%%xmm7 \n" "shufps $0x88,%%xmm1,%%xmm0 \n" "shufps $0xdd,%%xmm1,%%xmm7 \n" @@ -1391,16 +1606,20 @@ void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra, "packsswb %%xmm1,%%xmm0 \n" "paddb %%xmm5,%%xmm0 \n" "sub $0x10,%3 \n" - "movlps %%xmm0,(%1) \n" - "movhps %%xmm0,(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" + "movlps %%xmm0," MEMACCESS(1) " \n" + BUNDLEALIGN + MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) + "lea " MEMLEA(0x8,1) ",%1 \n" "jg 1b \n" : "+r"(src_bgra0), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 "+rm"(width) // %3 - : "r"(static_cast<intptr_t>(src_stride_bgra)) + : "r"(static_cast<intptr_t>(src_stride_bgra)) // %4 : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" #endif @@ -1420,21 +1639,23 @@ void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_bgra0, int src_stride_bgra, ); asm volatile ( "sub 
%1,%2 \n" - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x30(%0),%%xmm6 \n" - "movdqu (%0,%4,1),%%xmm7 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" + BUNDLEALIGN + MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 "pavgb %%xmm7,%%xmm0 \n" - "movdqu 0x10(%0,%4,1),%%xmm7 \n" + MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7 "pavgb %%xmm7,%%xmm1 \n" - "movdqu 0x20(%0,%4,1),%%xmm7 \n" + MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7 "pavgb %%xmm7,%%xmm2 \n" - "movdqu 0x30(%0,%4,1),%%xmm7 \n" + MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7 "pavgb %%xmm7,%%xmm6 \n" - "lea 0x40(%0),%0 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" "movdqa %%xmm0,%%xmm7 \n" "shufps $0x88,%%xmm1,%%xmm0 \n" "shufps $0xdd,%%xmm1,%%xmm7 \n" @@ -1456,16 +1677,20 @@ void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_bgra0, int src_stride_bgra, "packsswb %%xmm1,%%xmm0 \n" "paddb %%xmm5,%%xmm0 \n" "sub $0x10,%3 \n" - "movlps %%xmm0,(%1) \n" - "movhps %%xmm0,(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" + "movlps %%xmm0," MEMACCESS(1) " \n" + BUNDLEALIGN + MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) + "lea " MEMLEA(0x8,1) ",%1 \n" "jg 1b \n" : "+r"(src_bgra0), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 "+rm"(width) // %3 - : "r"(static_cast<intptr_t>(src_stride_bgra)) + : "r"(static_cast<intptr_t>(src_stride_bgra)) // %4 : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" #endif @@ -1476,17 +1701,18 @@ void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) { asm volatile ( "movdqa %4,%%xmm5 \n" "movdqa %3,%%xmm4 \n" - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movdqa (%0),%%xmm0 \n" - "movdqa 0x10(%0),%%xmm1 \n" - "movdqa 0x20(%0),%%xmm2 \n" - "movdqa 0x30(%0),%%xmm3 \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n" "pmaddubsw %%xmm4,%%xmm0 \n" "pmaddubsw %%xmm4,%%xmm1 \n" "pmaddubsw %%xmm4,%%xmm2 \n" "pmaddubsw %%xmm4,%%xmm3 \n" - "lea 0x40(%0),%0 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" "phaddw %%xmm1,%%xmm0 \n" "phaddw %%xmm3,%%xmm2 \n" "psrlw $0x7,%%xmm0 \n" @@ -1494,8 +1720,8 @@ void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) { "packuswb %%xmm2,%%xmm0 \n" "paddb %%xmm5,%%xmm0 \n" "sub $0x10,%2 \n" - "movdqa %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" "jg 1b \n" : "+r"(src_abgr), // %0 "+r"(dst_y), // %1 @@ -1513,17 +1739,18 @@ void ABGRToYRow_Unaligned_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) { asm volatile ( "movdqa %4,%%xmm5 \n" "movdqa %3,%%xmm4 \n" - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x30(%0),%%xmm3 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" "pmaddubsw %%xmm4,%%xmm0 \n" "pmaddubsw %%xmm4,%%xmm1 \n" "pmaddubsw %%xmm4,%%xmm2 \n" "pmaddubsw %%xmm4,%%xmm3 \n" - "lea 0x40(%0),%0 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" "phaddw %%xmm1,%%xmm0 \n" 
"phaddw %%xmm3,%%xmm2 \n" "psrlw $0x7,%%xmm0 \n" @@ -1531,8 +1758,8 @@ void ABGRToYRow_Unaligned_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) { "packuswb %%xmm2,%%xmm0 \n" "paddb %%xmm5,%%xmm0 \n" "sub $0x10,%2 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" "jg 1b \n" : "+r"(src_abgr), // %0 "+r"(dst_y), // %1 @@ -1550,17 +1777,18 @@ void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) { asm volatile ( "movdqa %4,%%xmm5 \n" "movdqa %3,%%xmm4 \n" - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movdqa (%0),%%xmm0 \n" - "movdqa 0x10(%0),%%xmm1 \n" - "movdqa 0x20(%0),%%xmm2 \n" - "movdqa 0x30(%0),%%xmm3 \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n" "pmaddubsw %%xmm4,%%xmm0 \n" "pmaddubsw %%xmm4,%%xmm1 \n" "pmaddubsw %%xmm4,%%xmm2 \n" "pmaddubsw %%xmm4,%%xmm3 \n" - "lea 0x40(%0),%0 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" "phaddw %%xmm1,%%xmm0 \n" "phaddw %%xmm3,%%xmm2 \n" "psrlw $0x7,%%xmm0 \n" @@ -1568,8 +1796,8 @@ void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) { "packuswb %%xmm2,%%xmm0 \n" "paddb %%xmm5,%%xmm0 \n" "sub $0x10,%2 \n" - "movdqa %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" "jg 1b \n" : "+r"(src_rgba), // %0 "+r"(dst_y), // %1 @@ -1587,17 +1815,18 @@ void RGBAToYRow_Unaligned_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) { asm volatile ( "movdqa %4,%%xmm5 \n" "movdqa %3,%%xmm4 \n" - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x30(%0),%%xmm3 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" "pmaddubsw %%xmm4,%%xmm0 \n" "pmaddubsw %%xmm4,%%xmm1 \n" "pmaddubsw %%xmm4,%%xmm2 \n" "pmaddubsw %%xmm4,%%xmm3 \n" - "lea 0x40(%0),%0 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" "phaddw %%xmm1,%%xmm0 \n" "phaddw %%xmm3,%%xmm2 \n" "psrlw $0x7,%%xmm0 \n" @@ -1605,8 +1834,8 @@ void RGBAToYRow_Unaligned_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) { "packuswb %%xmm2,%%xmm0 \n" "paddb %%xmm5,%%xmm0 \n" "sub $0x10,%2 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" "jg 1b \n" : "+r"(src_rgba), // %0 "+r"(dst_y), // %1 @@ -1633,17 +1862,19 @@ void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr, ); asm volatile ( "sub %1,%2 \n" - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movdqa (%0),%%xmm0 \n" - "movdqa 0x10(%0),%%xmm1 \n" - "movdqa 0x20(%0),%%xmm2 \n" - "movdqa 0x30(%0),%%xmm6 \n" - "pavgb (%0,%4,1),%%xmm0 \n" - "pavgb 0x10(%0,%4,1),%%xmm1 \n" - "pavgb 0x20(%0,%4,1),%%xmm2 \n" - "pavgb 0x30(%0,%4,1),%%xmm6 \n" - "lea 0x40(%0),%0 \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n" + BUNDLEALIGN + MEMOPREG(pavgb,0x00,0,4,1,xmm0) // pavgb (%0,%4,1),%%xmm0 + MEMOPREG(pavgb,0x10,0,4,1,xmm1) // pavgb 0x10(%0,%4,1),%%xmm1 + MEMOPREG(pavgb,0x20,0,4,1,xmm2) // pavgb 0x20(%0,%4,1),%%xmm2 + MEMOPREG(pavgb,0x30,0,4,1,xmm6) // pavgb 0x30(%0,%4,1),%%xmm6 + "lea " MEMLEA(0x40,0) ",%0 \n" "movdqa %%xmm0,%%xmm7 \n" "shufps $0x88,%%xmm1,%%xmm0 
\n" "shufps $0xdd,%%xmm1,%%xmm7 \n" @@ -1665,16 +1896,20 @@ void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr, "packsswb %%xmm1,%%xmm0 \n" "paddb %%xmm5,%%xmm0 \n" "sub $0x10,%3 \n" - "movlps %%xmm0,(%1) \n" - "movhps %%xmm0,(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" + "movlps %%xmm0," MEMACCESS(1) " \n" + BUNDLEALIGN + MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) + "lea " MEMLEA(0x8,1) ",%1 \n" "jg 1b \n" : "+r"(src_abgr0), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 "+rm"(width) // %3 - : "r"(static_cast<intptr_t>(src_stride_abgr)) + : "r"(static_cast<intptr_t>(src_stride_abgr)) // %4 : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" #endif @@ -1694,21 +1929,23 @@ void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_abgr0, int src_stride_abgr, ); asm volatile ( "sub %1,%2 \n" - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x30(%0),%%xmm6 \n" - "movdqu (%0,%4,1),%%xmm7 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" + BUNDLEALIGN + MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 "pavgb %%xmm7,%%xmm0 \n" - "movdqu 0x10(%0,%4,1),%%xmm7 \n" + MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7 "pavgb %%xmm7,%%xmm1 \n" - "movdqu 0x20(%0,%4,1),%%xmm7 \n" + MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7 "pavgb %%xmm7,%%xmm2 \n" - "movdqu 0x30(%0,%4,1),%%xmm7 \n" + MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7 "pavgb %%xmm7,%%xmm6 \n" - "lea 0x40(%0),%0 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" "movdqa %%xmm0,%%xmm7 \n" "shufps $0x88,%%xmm1,%%xmm0 \n" "shufps $0xdd,%%xmm1,%%xmm7 \n" @@ -1730,16 +1967,20 @@ void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_abgr0, int src_stride_abgr, "packsswb %%xmm1,%%xmm0 \n" "paddb %%xmm5,%%xmm0 \n" "sub $0x10,%3 \n" - "movlps %%xmm0,(%1) \n" - "movhps %%xmm0,(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" + "movlps %%xmm0," MEMACCESS(1) " \n" + BUNDLEALIGN + MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) + "lea " MEMLEA(0x8,1) ",%1 \n" "jg 1b \n" : "+r"(src_abgr0), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 "+rm"(width) // %3 - : "r"(static_cast<intptr_t>(src_stride_abgr)) + : "r"(static_cast<intptr_t>(src_stride_abgr)) // %4 : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" #endif @@ -1759,17 +2000,19 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba, ); asm volatile ( "sub %1,%2 \n" - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movdqa (%0),%%xmm0 \n" - "movdqa 0x10(%0),%%xmm1 \n" - "movdqa 0x20(%0),%%xmm2 \n" - "movdqa 0x30(%0),%%xmm6 \n" - "pavgb (%0,%4,1),%%xmm0 \n" - "pavgb 0x10(%0,%4,1),%%xmm1 \n" - "pavgb 0x20(%0,%4,1),%%xmm2 \n" - "pavgb 0x30(%0,%4,1),%%xmm6 \n" - "lea 0x40(%0),%0 \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n" + BUNDLEALIGN + MEMOPREG(pavgb,0x00,0,4,1,xmm0) // pavgb (%0,%4,1),%%xmm0 + MEMOPREG(pavgb,0x10,0,4,1,xmm1) // pavgb 0x10(%0,%4,1),%%xmm1 + MEMOPREG(pavgb,0x20,0,4,1,xmm2) // pavgb 0x20(%0,%4,1),%%xmm2 + MEMOPREG(pavgb,0x30,0,4,1,xmm6) // pavgb 
0x30(%0,%4,1),%%xmm6 + "lea " MEMLEA(0x40,0) ",%0 \n" "movdqa %%xmm0,%%xmm7 \n" "shufps $0x88,%%xmm1,%%xmm0 \n" "shufps $0xdd,%%xmm1,%%xmm7 \n" @@ -1791,9 +2034,10 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba, "packsswb %%xmm1,%%xmm0 \n" "paddb %%xmm5,%%xmm0 \n" "sub $0x10,%3 \n" - "movlps %%xmm0,(%1) \n" - "movhps %%xmm0,(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" + "movlps %%xmm0," MEMACCESS(1) " \n" + BUNDLEALIGN + MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) + "lea " MEMLEA(0x8,1) ",%1 \n" "jg 1b \n" : "+r"(src_rgba0), // %0 "+r"(dst_u), // %1 @@ -1801,6 +2045,9 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba, "+rm"(width) // %3 : "r"(static_cast<intptr_t>(src_stride_rgba)) : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" #endif @@ -1820,21 +2067,23 @@ void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_rgba0, int src_stride_rgba, ); asm volatile ( "sub %1,%2 \n" - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x30(%0),%%xmm6 \n" - "movdqu (%0,%4,1),%%xmm7 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" + BUNDLEALIGN + MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 "pavgb %%xmm7,%%xmm0 \n" - "movdqu 0x10(%0,%4,1),%%xmm7 \n" + MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7 "pavgb %%xmm7,%%xmm1 \n" - "movdqu 0x20(%0,%4,1),%%xmm7 \n" + MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7 "pavgb %%xmm7,%%xmm2 \n" - "movdqu 0x30(%0,%4,1),%%xmm7 \n" + MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7 "pavgb %%xmm7,%%xmm6 \n" - "lea 0x40(%0),%0 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" "movdqa %%xmm0,%%xmm7 \n" "shufps $0x88,%%xmm1,%%xmm0 \n" "shufps $0xdd,%%xmm1,%%xmm7 \n" @@ -1856,22 +2105,26 @@ void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_rgba0, int src_stride_rgba, "packsswb %%xmm1,%%xmm0 \n" "paddb %%xmm5,%%xmm0 \n" "sub $0x10,%3 \n" - "movlps %%xmm0,(%1) \n" - "movhps %%xmm0,(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" + "movlps %%xmm0," MEMACCESS(1) " \n" + BUNDLEALIGN + MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) + "lea " MEMLEA(0x8,1) ",%1 \n" "jg 1b \n" : "+r"(src_rgba0), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 "+rm"(width) // %3 - : "r"(static_cast<intptr_t>(src_stride_rgba)) + : "r"(static_cast<intptr_t>(src_stride_rgba)) // %4 : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" #endif ); } -#endif // HAS_ARGBTOYROW_SSSE3 +#endif // HAS_ARGBTOUVROW_SSSE3 #ifdef HAS_I422TOARGBROW_SSSE3 #define UB 127 /* min(63,static_cast<int8>(2.018 * 64)) */ @@ -1901,7 +2154,7 @@ struct { vec8 kVUToB; // 128 vec8 kVUToG; // 144 vec8 kVUToR; // 160 -} CONST SIMD_ALIGNED(kYuvConstants) = { +} static SIMD_ALIGNED(kYuvConstants) = { { UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB }, { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG }, { UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR }, @@ -1918,83 +2171,86 @@ struct { // Read 8 UV from 411 #define READYUV444 \ - "movq (%[u_buf]),%%xmm0 \n" \ - "movq (%[u_buf],%[v_buf],1),%%xmm1 \n" \ - "lea 0x8(%[u_buf]),%[u_buf] \n" \ - "punpcklbw %%xmm1,%%xmm0 \n" \ + "movq 
" MEMACCESS([u_buf]) ",%%xmm0 \n" \ + BUNDLEALIGN \ + MEMOPREG(movq, 0x00, [u_buf], [v_buf], 1, xmm1) \ + "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \ + "punpcklbw %%xmm1,%%xmm0 \n" // Read 4 UV from 422, upsample to 8 UV #define READYUV422 \ - "movd (%[u_buf]),%%xmm0 \n" \ - "movd (%[u_buf],%[v_buf],1),%%xmm1 \n" \ - "lea 0x4(%[u_buf]),%[u_buf] \n" \ - "punpcklbw %%xmm1,%%xmm0 \n" \ - "punpcklwd %%xmm0,%%xmm0 \n" \ + "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \ + BUNDLEALIGN \ + MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \ + "lea " MEMLEA(0x4, [u_buf]) ",%[u_buf] \n" \ + "punpcklbw %%xmm1,%%xmm0 \n" \ + "punpcklwd %%xmm0,%%xmm0 \n" // Read 2 UV from 411, upsample to 8 UV #define READYUV411 \ - "movd (%[u_buf]),%%xmm0 \n" \ - "movd (%[u_buf],%[v_buf],1),%%xmm1 \n" \ - "lea 0x2(%[u_buf]),%[u_buf] \n" \ - "punpcklbw %%xmm1,%%xmm0 \n" \ - "punpcklwd %%xmm0,%%xmm0 \n" \ - "punpckldq %%xmm0,%%xmm0 \n" \ + "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \ + BUNDLEALIGN \ + MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \ + "lea " MEMLEA(0x2, [u_buf]) ",%[u_buf] \n" \ + "punpcklbw %%xmm1,%%xmm0 \n" \ + "punpcklwd %%xmm0,%%xmm0 \n" \ + "punpckldq %%xmm0,%%xmm0 \n" // Read 4 UV from NV12, upsample to 8 UV #define READNV12 \ - "movq (%[uv_buf]),%%xmm0 \n" \ - "lea 0x8(%[uv_buf]),%[uv_buf] \n" \ - "punpcklwd %%xmm0,%%xmm0 \n" \ + "movq " MEMACCESS([uv_buf]) ",%%xmm0 \n" \ + "lea " MEMLEA(0x8, [uv_buf]) ",%[uv_buf] \n" \ + "punpcklwd %%xmm0,%%xmm0 \n" // Convert 8 pixels: 8 UV and 8 Y #define YUVTORGB \ - "movdqa %%xmm0,%%xmm1 \n" \ - "movdqa %%xmm0,%%xmm2 \n" \ - "pmaddubsw (%[kYuvConstants]),%%xmm0 \n" \ - "pmaddubsw 16(%[kYuvConstants]),%%xmm1 \n" \ - "pmaddubsw 32(%[kYuvConstants]),%%xmm2 \n" \ - "psubw 48(%[kYuvConstants]),%%xmm0 \n" \ - "psubw 64(%[kYuvConstants]),%%xmm1 \n" \ - "psubw 80(%[kYuvConstants]),%%xmm2 \n" \ - "movq (%[y_buf]),%%xmm3 \n" \ - "lea 0x8(%[y_buf]),%[y_buf] \n" \ - "punpcklbw %%xmm4,%%xmm3 \n" \ - "psubsw 96(%[kYuvConstants]),%%xmm3 \n" \ - "pmullw 112(%[kYuvConstants]),%%xmm3 \n" \ - "paddsw %%xmm3,%%xmm0 \n" \ - "paddsw %%xmm3,%%xmm1 \n" \ - "paddsw %%xmm3,%%xmm2 \n" \ - "psraw $0x6,%%xmm0 \n" \ - "psraw $0x6,%%xmm1 \n" \ - "psraw $0x6,%%xmm2 \n" \ - "packuswb %%xmm0,%%xmm0 \n" \ - "packuswb %%xmm1,%%xmm1 \n" \ - "packuswb %%xmm2,%%xmm2 \n" \ + "movdqa %%xmm0,%%xmm1 \n" \ + "movdqa %%xmm0,%%xmm2 \n" \ + "pmaddubsw " MEMACCESS([kYuvConstants]) ",%%xmm0 \n" \ + "pmaddubsw " MEMACCESS2(16, [kYuvConstants]) ",%%xmm1 \n" \ + "pmaddubsw " MEMACCESS2(32, [kYuvConstants]) ",%%xmm2 \n" \ + "psubw " MEMACCESS2(48, [kYuvConstants]) ",%%xmm0 \n" \ + "psubw " MEMACCESS2(64, [kYuvConstants]) ",%%xmm1 \n" \ + "psubw " MEMACCESS2(80, [kYuvConstants]) ",%%xmm2 \n" \ + "movq " MEMACCESS([y_buf]) ",%%xmm3 \n" \ + "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" \ + "punpcklbw %%xmm4,%%xmm3 \n" \ + "psubsw " MEMACCESS2(96, [kYuvConstants]) ",%%xmm3 \n" \ + "pmullw " MEMACCESS2(112, [kYuvConstants]) ",%%xmm3 \n" \ + "paddsw %%xmm3,%%xmm0 \n" \ + "paddsw %%xmm3,%%xmm1 \n" \ + "paddsw %%xmm3,%%xmm2 \n" \ + "psraw $0x6,%%xmm0 \n" \ + "psraw $0x6,%%xmm1 \n" \ + "psraw $0x6,%%xmm2 \n" \ + "packuswb %%xmm0,%%xmm0 \n" \ + "packuswb %%xmm1,%%xmm1 \n" \ + "packuswb %%xmm2,%%xmm2 \n" // Convert 8 pixels: 8 VU and 8 Y #define YVUTORGB \ - "movdqa %%xmm0,%%xmm1 \n" \ - "movdqa %%xmm0,%%xmm2 \n" \ - "pmaddubsw 128(%[kYuvConstants]),%%xmm0 \n" \ - "pmaddubsw 144(%[kYuvConstants]),%%xmm1 \n" \ - "pmaddubsw 160(%[kYuvConstants]),%%xmm2 \n" \ - "psubw 48(%[kYuvConstants]),%%xmm0 \n" \ - "psubw 
64(%[kYuvConstants]),%%xmm1 \n" \ - "psubw 80(%[kYuvConstants]),%%xmm2 \n" \ - "movq (%[y_buf]),%%xmm3 \n" \ - "lea 0x8(%[y_buf]),%[y_buf] \n" \ - "punpcklbw %%xmm4,%%xmm3 \n" \ - "psubsw 96(%[kYuvConstants]),%%xmm3 \n" \ - "pmullw 112(%[kYuvConstants]),%%xmm3 \n" \ - "paddsw %%xmm3,%%xmm0 \n" \ - "paddsw %%xmm3,%%xmm1 \n" \ - "paddsw %%xmm3,%%xmm2 \n" \ - "psraw $0x6,%%xmm0 \n" \ - "psraw $0x6,%%xmm1 \n" \ - "psraw $0x6,%%xmm2 \n" \ - "packuswb %%xmm0,%%xmm0 \n" \ - "packuswb %%xmm1,%%xmm1 \n" \ - "packuswb %%xmm2,%%xmm2 \n" \ + "movdqa %%xmm0,%%xmm1 \n" \ + "movdqa %%xmm0,%%xmm2 \n" \ + "pmaddubsw " MEMACCESS2(128, [kYuvConstants]) ",%%xmm0 \n" \ + "pmaddubsw " MEMACCESS2(144, [kYuvConstants]) ",%%xmm1 \n" \ + "pmaddubsw " MEMACCESS2(160, [kYuvConstants]) ",%%xmm2 \n" \ + "psubw " MEMACCESS2(48, [kYuvConstants]) ",%%xmm0 \n" \ + "psubw " MEMACCESS2(64, [kYuvConstants]) ",%%xmm1 \n" \ + "psubw " MEMACCESS2(80, [kYuvConstants]) ",%%xmm2 \n" \ + "movq " MEMACCESS([y_buf]) ",%%xmm3 \n" \ + "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" \ + "punpcklbw %%xmm4,%%xmm3 \n" \ + "psubsw " MEMACCESS2(96, [kYuvConstants]) ",%%xmm3 \n" \ + "pmullw " MEMACCESS2(112, [kYuvConstants]) ",%%xmm3 \n" \ + "paddsw %%xmm3,%%xmm0 \n" \ + "paddsw %%xmm3,%%xmm1 \n" \ + "paddsw %%xmm3,%%xmm2 \n" \ + "psraw $0x6,%%xmm0 \n" \ + "psraw $0x6,%%xmm1 \n" \ + "psraw $0x6,%%xmm2 \n" \ + "packuswb %%xmm0,%%xmm0 \n" \ + "packuswb %%xmm1,%%xmm1 \n" \ + "packuswb %%xmm2,%%xmm2 \n" void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf, const uint8* u_buf, @@ -2005,7 +2261,7 @@ void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf, "sub %[u_buf],%[v_buf] \n" "pcmpeqb %%xmm5,%%xmm5 \n" "pxor %%xmm4,%%xmm4 \n" - ".p2align 4 \n" + ".p2align 2 \n" "1: \n" READYUV444 YUVTORGB @@ -2014,9 +2270,9 @@ void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf, "movdqa %%xmm0,%%xmm1 \n" "punpcklwd %%xmm2,%%xmm0 \n" "punpckhwd %%xmm2,%%xmm1 \n" - "movdqa %%xmm0,(%[dst_argb]) \n" - "movdqa %%xmm1,0x10(%[dst_argb]) \n" - "lea 0x20(%[dst_argb]),%[dst_argb] \n" + "movdqa %%xmm0," MEMACCESS([dst_argb]) " \n" + "movdqa %%xmm1," MEMACCESS2(0x10,[dst_argb]) " \n" + "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" "sub $0x8,%[width] \n" "jg 1b \n" : [y_buf]"+r"(y_buf), // %[y_buf] @@ -2026,6 +2282,9 @@ void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf, [width]"+rm"(width) // %[width] : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" #endif @@ -2053,7 +2312,7 @@ void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf, #endif "sub %[u_buf],%[v_buf] \n" "pxor %%xmm4,%%xmm4 \n" - ".p2align 4 \n" + ".p2align 2 \n" "1: \n" READYUV422 YUVTORGB @@ -2065,9 +2324,9 @@ void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf, "pshufb %%xmm5,%%xmm0 \n" "pshufb %%xmm6,%%xmm1 \n" "palignr $0xc,%%xmm0,%%xmm1 \n" - "movq %%xmm0,(%[dst_rgb24]) \n" - "movdqu %%xmm1,0x8(%[dst_rgb24]) \n" - "lea 0x18(%[dst_rgb24]),%[dst_rgb24] \n" + "movq %%xmm0," MEMACCESS([dst_rgb24]) "\n" + "movdqu %%xmm1," MEMACCESS2(0x8,[dst_rgb24]) "\n" + "lea " MEMLEA(0x18,[dst_rgb24]) ",%[dst_rgb24] \n" "sub $0x8,%[width] \n" "jg 1b \n" : [y_buf]"+r"(y_buf), // %[y_buf] @@ -2081,6 +2340,9 @@ void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf, [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24) #endif : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm3", 
"xmm4", "xmm5", "xmm6" #endif @@ -2108,7 +2370,7 @@ void OMITFP I422ToRAWRow_SSSE3(const uint8* y_buf, #endif "sub %[u_buf],%[v_buf] \n" "pxor %%xmm4,%%xmm4 \n" - ".p2align 4 \n" + ".p2align 2 \n" "1: \n" READYUV422 YUVTORGB @@ -2120,9 +2382,9 @@ void OMITFP I422ToRAWRow_SSSE3(const uint8* y_buf, "pshufb %%xmm5,%%xmm0 \n" "pshufb %%xmm6,%%xmm1 \n" "palignr $0xc,%%xmm0,%%xmm1 \n" - "movq %%xmm0,(%[dst_raw]) \n" - "movdqu %%xmm1,0x8(%[dst_raw]) \n" - "lea 0x18(%[dst_raw]),%[dst_raw] \n" + "movq %%xmm0," MEMACCESS([dst_raw]) " \n" + "movdqu %%xmm1," MEMACCESS2(0x8,[dst_raw]) "\n" + "lea " MEMLEA(0x18,[dst_raw]) ",%[dst_raw] \n" "sub $0x8,%[width] \n" "jg 1b \n" : [y_buf]"+r"(y_buf), // %[y_buf] @@ -2136,6 +2398,9 @@ void OMITFP I422ToRAWRow_SSSE3(const uint8* y_buf, [kShuffleMaskARGBToRAW]"m"(kShuffleMaskARGBToRAW) #endif : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" #endif @@ -2151,7 +2416,7 @@ void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf, "sub %[u_buf],%[v_buf] \n" "pcmpeqb %%xmm5,%%xmm5 \n" "pxor %%xmm4,%%xmm4 \n" - ".p2align 4 \n" + ".p2align 2 \n" "1: \n" READYUV422 YUVTORGB @@ -2160,9 +2425,9 @@ void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf, "movdqa %%xmm0,%%xmm1 \n" "punpcklwd %%xmm2,%%xmm0 \n" "punpckhwd %%xmm2,%%xmm1 \n" - "movdqa %%xmm0,(%[dst_argb]) \n" - "movdqa %%xmm1,0x10(%[dst_argb]) \n" - "lea 0x20(%[dst_argb]),%[dst_argb] \n" + "movdqa %%xmm0," MEMACCESS([dst_argb]) "\n" + "movdqa %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n" + "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" "sub $0x8,%[width] \n" "jg 1b \n" : [y_buf]"+r"(y_buf), // %[y_buf] @@ -2172,6 +2437,9 @@ void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf, [width]"+rm"(width) // %[width] : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" #endif @@ -2187,7 +2455,7 @@ void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf, "sub %[u_buf],%[v_buf] \n" "pcmpeqb %%xmm5,%%xmm5 \n" "pxor %%xmm4,%%xmm4 \n" - ".p2align 4 \n" + ".p2align 2 \n" "1: \n" READYUV411 YUVTORGB @@ -2196,9 +2464,9 @@ void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf, "movdqa %%xmm0,%%xmm1 \n" "punpcklwd %%xmm2,%%xmm0 \n" "punpckhwd %%xmm2,%%xmm1 \n" - "movdqa %%xmm0,(%[dst_argb]) \n" - "movdqa %%xmm1,0x10(%[dst_argb]) \n" - "lea 0x20(%[dst_argb]),%[dst_argb] \n" + "movdqa %%xmm0," MEMACCESS([dst_argb]) "\n" + "movdqa %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n" + "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" "sub $0x8,%[width] \n" "jg 1b \n" : [y_buf]"+r"(y_buf), // %[y_buf] @@ -2208,6 +2476,9 @@ void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf, [width]"+rm"(width) // %[width] : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" #endif @@ -2221,7 +2492,7 @@ void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf, asm volatile ( "pcmpeqb %%xmm5,%%xmm5 \n" "pxor %%xmm4,%%xmm4 \n" - ".p2align 4 \n" + ".p2align 2 \n" "1: \n" READNV12 YUVTORGB @@ -2230,9 +2501,9 @@ void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf, "movdqa %%xmm0,%%xmm1 \n" "punpcklwd %%xmm2,%%xmm0 \n" "punpckhwd %%xmm2,%%xmm1 \n" - "movdqa %%xmm0,(%[dst_argb]) \n" - "movdqa %%xmm1,0x10(%[dst_argb]) \n" - "lea 
0x20(%[dst_argb]),%[dst_argb] \n" + "movdqa %%xmm0," MEMACCESS([dst_argb]) "\n" + "movdqa %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n" + "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" "sub $0x8,%[width] \n" "jg 1b \n" : [y_buf]"+r"(y_buf), // %[y_buf] @@ -2241,6 +2512,7 @@ void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf, [width]"+rm"(width) // %[width] : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] : "memory", "cc" + // Does not use r14. #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" #endif @@ -2254,7 +2526,7 @@ void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf, asm volatile ( "pcmpeqb %%xmm5,%%xmm5 \n" "pxor %%xmm4,%%xmm4 \n" - ".p2align 4 \n" + ".p2align 2 \n" "1: \n" READNV12 YVUTORGB @@ -2263,9 +2535,9 @@ void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf, "movdqa %%xmm0,%%xmm1 \n" "punpcklwd %%xmm2,%%xmm0 \n" "punpckhwd %%xmm2,%%xmm1 \n" - "movdqa %%xmm0,(%[dst_argb]) \n" - "movdqa %%xmm1,0x10(%[dst_argb]) \n" - "lea 0x20(%[dst_argb]),%[dst_argb] \n" + "movdqa %%xmm0," MEMACCESS([dst_argb]) "\n" + "movdqa %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n" + "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" "sub $0x8,%[width] \n" "jg 1b \n" : [y_buf]"+r"(y_buf), // %[y_buf] @@ -2274,6 +2546,7 @@ void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf, [width]"+rm"(width) // %[width] : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] : "memory", "cc" + // Does not use r14. #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" #endif @@ -2289,7 +2562,7 @@ void OMITFP I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, "sub %[u_buf],%[v_buf] \n" "pcmpeqb %%xmm5,%%xmm5 \n" "pxor %%xmm4,%%xmm4 \n" - ".p2align 4 \n" + ".p2align 2 \n" "1: \n" READYUV444 YUVTORGB @@ -2298,9 +2571,9 @@ void OMITFP I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, "movdqa %%xmm0,%%xmm1 \n" "punpcklwd %%xmm2,%%xmm0 \n" "punpckhwd %%xmm2,%%xmm1 \n" - "movdqu %%xmm0,(%[dst_argb]) \n" - "movdqu %%xmm1,0x10(%[dst_argb]) \n" - "lea 0x20(%[dst_argb]),%[dst_argb] \n" + "movdqu %%xmm0," MEMACCESS([dst_argb]) "\n" + "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n" + "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" "sub $0x8,%[width] \n" "jg 1b \n" : [y_buf]"+r"(y_buf), // %[y_buf] @@ -2310,6 +2583,9 @@ void OMITFP I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, [width]"+rm"(width) // %[width] : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" #endif @@ -2325,7 +2601,7 @@ void OMITFP I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, "sub %[u_buf],%[v_buf] \n" "pcmpeqb %%xmm5,%%xmm5 \n" "pxor %%xmm4,%%xmm4 \n" - ".p2align 4 \n" + ".p2align 2 \n" "1: \n" READYUV422 YUVTORGB @@ -2334,9 +2610,9 @@ void OMITFP I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, "movdqa %%xmm0,%%xmm1 \n" "punpcklwd %%xmm2,%%xmm0 \n" "punpckhwd %%xmm2,%%xmm1 \n" - "movdqu %%xmm0,(%[dst_argb]) \n" - "movdqu %%xmm1,0x10(%[dst_argb]) \n" - "lea 0x20(%[dst_argb]),%[dst_argb] \n" + "movdqu %%xmm0," MEMACCESS([dst_argb]) "\n" + "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n" + "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" "sub $0x8,%[width] \n" "jg 1b \n" : [y_buf]"+r"(y_buf), // %[y_buf] @@ -2346,6 +2622,9 @@ void OMITFP I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, [width]"+rm"(width) // %[width] : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] : "memory", "cc" +#if defined(__native_client__) && 
defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" #endif @@ -2361,7 +2640,7 @@ void OMITFP I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, "sub %[u_buf],%[v_buf] \n" "pcmpeqb %%xmm5,%%xmm5 \n" "pxor %%xmm4,%%xmm4 \n" - ".p2align 4 \n" + ".p2align 2 \n" "1: \n" READYUV411 YUVTORGB @@ -2370,9 +2649,9 @@ void OMITFP I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, "movdqa %%xmm0,%%xmm1 \n" "punpcklwd %%xmm2,%%xmm0 \n" "punpckhwd %%xmm2,%%xmm1 \n" - "movdqu %%xmm0,(%[dst_argb]) \n" - "movdqu %%xmm1,0x10(%[dst_argb]) \n" - "lea 0x20(%[dst_argb]),%[dst_argb] \n" + "movdqu %%xmm0," MEMACCESS([dst_argb]) "\n" + "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n" + "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" "sub $0x8,%[width] \n" "jg 1b \n" : [y_buf]"+r"(y_buf), // %[y_buf] @@ -2382,6 +2661,9 @@ void OMITFP I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, [width]"+rm"(width) // %[width] : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" #endif @@ -2395,7 +2677,7 @@ void OMITFP NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, asm volatile ( "pcmpeqb %%xmm5,%%xmm5 \n" "pxor %%xmm4,%%xmm4 \n" - ".p2align 4 \n" + ".p2align 2 \n" "1: \n" READNV12 YUVTORGB @@ -2404,9 +2686,9 @@ void OMITFP NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, "movdqa %%xmm0,%%xmm1 \n" "punpcklwd %%xmm2,%%xmm0 \n" "punpckhwd %%xmm2,%%xmm1 \n" - "movdqu %%xmm0,(%[dst_argb]) \n" - "movdqu %%xmm1,0x10(%[dst_argb]) \n" - "lea 0x20(%[dst_argb]),%[dst_argb] \n" + "movdqu %%xmm0," MEMACCESS([dst_argb]) "\n" + "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n" + "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" "sub $0x8,%[width] \n" "jg 1b \n" : [y_buf]"+r"(y_buf), // %[y_buf] @@ -2415,6 +2697,7 @@ void OMITFP NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, [width]"+rm"(width) // %[width] : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] : "memory", "cc" + // Does not use r14. #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" #endif @@ -2428,7 +2711,7 @@ void OMITFP NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, asm volatile ( "pcmpeqb %%xmm5,%%xmm5 \n" "pxor %%xmm4,%%xmm4 \n" - ".p2align 4 \n" + ".p2align 2 \n" "1: \n" READNV12 YVUTORGB @@ -2437,9 +2720,9 @@ void OMITFP NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, "movdqa %%xmm0,%%xmm1 \n" "punpcklwd %%xmm2,%%xmm0 \n" "punpckhwd %%xmm2,%%xmm1 \n" - "movdqu %%xmm0,(%[dst_argb]) \n" - "movdqu %%xmm1,0x10(%[dst_argb]) \n" - "lea 0x20(%[dst_argb]),%[dst_argb] \n" + "movdqu %%xmm0," MEMACCESS([dst_argb]) "\n" + "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n" + "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" "sub $0x8,%[width] \n" "jg 1b \n" : [y_buf]"+r"(y_buf), // %[y_buf] @@ -2448,6 +2731,7 @@ void OMITFP NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, [width]"+rm"(width) // %[width] : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] : "memory", "cc" + // Does not use r14. 
#if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" #endif @@ -2463,7 +2747,7 @@ void OMITFP I422ToBGRARow_SSSE3(const uint8* y_buf, "sub %[u_buf],%[v_buf] \n" "pcmpeqb %%xmm5,%%xmm5 \n" "pxor %%xmm4,%%xmm4 \n" - ".p2align 4 \n" + ".p2align 2 \n" "1: \n" READYUV422 YUVTORGB @@ -2473,9 +2757,9 @@ void OMITFP I422ToBGRARow_SSSE3(const uint8* y_buf, "movdqa %%xmm5,%%xmm0 \n" "punpcklwd %%xmm1,%%xmm5 \n" "punpckhwd %%xmm1,%%xmm0 \n" - "movdqa %%xmm5,(%[dst_bgra]) \n" - "movdqa %%xmm0,0x10(%[dst_bgra]) \n" - "lea 0x20(%[dst_bgra]),%[dst_bgra] \n" + "movdqa %%xmm5," MEMACCESS([dst_bgra]) "\n" + "movdqa %%xmm0," MEMACCESS2(0x10,[dst_bgra]) "\n" + "lea " MEMLEA(0x20,[dst_bgra]) ",%[dst_bgra] \n" "sub $0x8,%[width] \n" "jg 1b \n" : [y_buf]"+r"(y_buf), // %[y_buf] @@ -2485,6 +2769,9 @@ void OMITFP I422ToBGRARow_SSSE3(const uint8* y_buf, [width]"+rm"(width) // %[width] : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" #endif @@ -2500,7 +2787,7 @@ void OMITFP I422ToABGRRow_SSSE3(const uint8* y_buf, "sub %[u_buf],%[v_buf] \n" "pcmpeqb %%xmm5,%%xmm5 \n" "pxor %%xmm4,%%xmm4 \n" - ".p2align 4 \n" + ".p2align 2 \n" "1: \n" READYUV422 YUVTORGB @@ -2509,9 +2796,9 @@ void OMITFP I422ToABGRRow_SSSE3(const uint8* y_buf, "movdqa %%xmm2,%%xmm1 \n" "punpcklwd %%xmm0,%%xmm2 \n" "punpckhwd %%xmm0,%%xmm1 \n" - "movdqa %%xmm2,(%[dst_abgr]) \n" - "movdqa %%xmm1,0x10(%[dst_abgr]) \n" - "lea 0x20(%[dst_abgr]),%[dst_abgr] \n" + "movdqa %%xmm2," MEMACCESS([dst_abgr]) "\n" + "movdqa %%xmm1," MEMACCESS2(0x10,[dst_abgr]) "\n" + "lea " MEMLEA(0x20,[dst_abgr]) ",%[dst_abgr] \n" "sub $0x8,%[width] \n" "jg 1b \n" : [y_buf]"+r"(y_buf), // %[y_buf] @@ -2521,6 +2808,9 @@ void OMITFP I422ToABGRRow_SSSE3(const uint8* y_buf, [width]"+rm"(width) // %[width] : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" #endif @@ -2536,7 +2826,7 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf, "sub %[u_buf],%[v_buf] \n" "pcmpeqb %%xmm5,%%xmm5 \n" "pxor %%xmm4,%%xmm4 \n" - ".p2align 4 \n" + ".p2align 2 \n" "1: \n" READYUV422 YUVTORGB @@ -2546,9 +2836,9 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf, "movdqa %%xmm5,%%xmm0 \n" "punpcklwd %%xmm1,%%xmm5 \n" "punpckhwd %%xmm1,%%xmm0 \n" - "movdqa %%xmm5,(%[dst_rgba]) \n" - "movdqa %%xmm0,0x10(%[dst_rgba]) \n" - "lea 0x20(%[dst_rgba]),%[dst_rgba] \n" + "movdqa %%xmm5," MEMACCESS([dst_rgba]) "\n" + "movdqa %%xmm0," MEMACCESS2(0x10,[dst_rgba]) "\n" + "lea " MEMLEA(0x20,[dst_rgba]) ",%[dst_rgba] \n" "sub $0x8,%[width] \n" "jg 1b \n" : [y_buf]"+r"(y_buf), // %[y_buf] @@ -2558,6 +2848,9 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf, [width]"+rm"(width) // %[width] : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" #endif @@ -2573,7 +2866,7 @@ void OMITFP I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf, "sub %[u_buf],%[v_buf] \n" "pcmpeqb %%xmm5,%%xmm5 \n" "pxor %%xmm4,%%xmm4 \n" - ".p2align 4 \n" + ".p2align 2 \n" "1: \n" READYUV422 YUVTORGB @@ -2583,9 +2876,9 @@ void OMITFP I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf, "movdqa %%xmm5,%%xmm0 \n" 
"punpcklwd %%xmm1,%%xmm5 \n" "punpckhwd %%xmm1,%%xmm0 \n" - "movdqu %%xmm5,(%[dst_bgra]) \n" - "movdqu %%xmm0,0x10(%[dst_bgra]) \n" - "lea 0x20(%[dst_bgra]),%[dst_bgra] \n" + "movdqu %%xmm5," MEMACCESS([dst_bgra]) "\n" + "movdqu %%xmm0," MEMACCESS2(0x10,[dst_bgra]) "\n" + "lea " MEMLEA(0x20,[dst_bgra]) ",%[dst_bgra] \n" "sub $0x8,%[width] \n" "jg 1b \n" : [y_buf]"+r"(y_buf), // %[y_buf] @@ -2595,6 +2888,9 @@ void OMITFP I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf, [width]"+rm"(width) // %[width] : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" #endif @@ -2610,7 +2906,7 @@ void OMITFP I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf, "sub %[u_buf],%[v_buf] \n" "pcmpeqb %%xmm5,%%xmm5 \n" "pxor %%xmm4,%%xmm4 \n" - ".p2align 4 \n" + ".p2align 2 \n" "1: \n" READYUV422 YUVTORGB @@ -2619,9 +2915,9 @@ void OMITFP I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf, "movdqa %%xmm2,%%xmm1 \n" "punpcklwd %%xmm0,%%xmm2 \n" "punpckhwd %%xmm0,%%xmm1 \n" - "movdqu %%xmm2,(%[dst_abgr]) \n" - "movdqu %%xmm1,0x10(%[dst_abgr]) \n" - "lea 0x20(%[dst_abgr]),%[dst_abgr] \n" + "movdqu %%xmm2," MEMACCESS([dst_abgr]) "\n" + "movdqu %%xmm1," MEMACCESS2(0x10,[dst_abgr]) "\n" + "lea " MEMLEA(0x20,[dst_abgr]) ",%[dst_abgr] \n" "sub $0x8,%[width] \n" "jg 1b \n" : [y_buf]"+r"(y_buf), // %[y_buf] @@ -2631,6 +2927,9 @@ void OMITFP I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf, [width]"+rm"(width) // %[width] : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" #endif @@ -2646,7 +2945,7 @@ void OMITFP I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf, "sub %[u_buf],%[v_buf] \n" "pcmpeqb %%xmm5,%%xmm5 \n" "pxor %%xmm4,%%xmm4 \n" - ".p2align 4 \n" + ".p2align 2 \n" "1: \n" READYUV422 YUVTORGB @@ -2656,9 +2955,9 @@ void OMITFP I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf, "movdqa %%xmm5,%%xmm0 \n" "punpcklwd %%xmm1,%%xmm5 \n" "punpckhwd %%xmm1,%%xmm0 \n" - "movdqa %%xmm5,(%[dst_rgba]) \n" - "movdqa %%xmm0,0x10(%[dst_rgba]) \n" - "lea 0x20(%[dst_rgba]),%[dst_rgba] \n" + "movdqu %%xmm5," MEMACCESS([dst_rgba]) "\n" + "movdqu %%xmm0," MEMACCESS2(0x10,[dst_rgba]) "\n" + "lea " MEMLEA(0x20,[dst_rgba]) ",%[dst_rgba] \n" "sub $0x8,%[width] \n" "jg 1b \n" : [y_buf]"+r"(y_buf), // %[y_buf] @@ -2668,6 +2967,9 @@ void OMITFP I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf, [width]"+rm"(width) // %[width] : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" #endif @@ -2690,11 +2992,12 @@ void YToARGBRow_SSE2(const uint8* y_buf, "mov $0x004a004a,%%eax \n" "movd %%eax,%%xmm2 \n" "pshufd $0x0,%%xmm2,%%xmm2 \n" - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" // Step 1: Scale Y contribution to 8 G values. 
G = (y - 16) * 1.164 - "movq (%0),%%xmm0 \n" - "lea 0x8(%0),%0 \n" + "movq " MEMACCESS(0) ",%%xmm0 \n" + "lea " MEMLEA(0x8,0) ",%0 \n" "punpcklbw %%xmm5,%%xmm0 \n" "psubusw %%xmm3,%%xmm0 \n" "pmullw %%xmm2,%%xmm0 \n" @@ -2708,9 +3011,9 @@ void YToARGBRow_SSE2(const uint8* y_buf, "punpckhwd %%xmm1,%%xmm1 \n" "por %%xmm4,%%xmm0 \n" "por %%xmm4,%%xmm1 \n" - "movdqa %%xmm0,(%1) \n" - "movdqa %%xmm1,16(%1) \n" - "lea 32(%1),%1 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n" + "lea " MEMLEA(0x20,1) ",%1 \n" "sub $0x8,%2 \n" "jg 1b \n" @@ -2728,7 +3031,7 @@ void YToARGBRow_SSE2(const uint8* y_buf, #ifdef HAS_MIRRORROW_SSSE3 // Shuffle table for reversing the bytes. -CONST uvec8 kShuffleMirror = { +static uvec8 kShuffleMirror = { 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u }; @@ -2736,20 +3039,24 @@ void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { intptr_t temp_width = static_cast<intptr_t>(width); asm volatile ( "movdqa %3,%%xmm5 \n" - "lea -0x10(%0),%0 \n" - ".p2align 4 \n" + "lea " MEMLEA(-0x10,0) ",%0 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movdqa (%0,%2),%%xmm0 \n" + MEMOPREG(movdqa,0x00,0,2,1,xmm0) // movdqa (%0,%2),%%xmm0 "pshufb %%xmm5,%%xmm0 \n" "sub $0x10,%2 \n" - "movdqa %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" "jg 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(temp_width) // %2 : "m"(kShuffleMirror) // %3 : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm5" #endif @@ -2761,10 +3068,11 @@ void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) { intptr_t temp_width = static_cast<intptr_t>(width); asm volatile ( - "lea -0x10(%0),%0 \n" - ".p2align 4 \n" + "lea " MEMLEA(-0x10,0) ",%0 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movdqu (%0,%2),%%xmm0 \n" + MEMOPREG(movdqu,0x00,0,2,1,xmm0) // movdqu (%0,%2),%%xmm0 "movdqa %%xmm0,%%xmm1 \n" "psllw $0x8,%%xmm0 \n" "psrlw $0x8,%%xmm1 \n" @@ -2773,14 +3081,17 @@ void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) { "pshufhw $0x1b,%%xmm0,%%xmm0 \n" "pshufd $0x4e,%%xmm0,%%xmm0 \n" "sub $0x10,%2 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1)",%1 \n" "jg 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(temp_width) // %2 : : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1" #endif @@ -2790,7 +3101,7 @@ void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) { #ifdef HAS_MIRRORROW_UV_SSSE3 // Shuffle table for reversing the bytes of UV channels. 
-CONST uvec8 kShuffleMirrorUV = { +static uvec8 kShuffleMirrorUV = { 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u }; void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, @@ -2798,17 +3109,19 @@ void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, intptr_t temp_width = static_cast<intptr_t>(width); asm volatile ( "movdqa %4,%%xmm1 \n" - "lea -16(%0,%3,2),%0 \n" + "lea " MEMLEA4(-0x10,0,3,2) ",%0 \n" "sub %1,%2 \n" - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movdqa (%0),%%xmm0 \n" - "lea -16(%0),%0 \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "lea " MEMLEA(-0x10,0) ",%0 \n" "pshufb %%xmm1,%%xmm0 \n" "sub $8,%3 \n" - "movlpd %%xmm0,(%1) \n" - "movhpd %%xmm0,(%1,%2) \n" - "lea 8(%1),%1 \n" + "movlpd %%xmm0," MEMACCESS(1) " \n" + BUNDLEALIGN + MEMOPMEM(movhpd,xmm0,0x00,1,2,1) // movhpd %%xmm0,(%1,%2) + "lea " MEMLEA(0x8,1) ",%1 \n" "jg 1b \n" : "+r"(src), // %0 "+r"(dst_u), // %1 @@ -2816,6 +3129,9 @@ void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, "+r"(temp_width) // %3 : "m"(kShuffleMirrorUV) // %4 : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1" #endif @@ -2825,22 +3141,23 @@ void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, #ifdef HAS_ARGBMIRRORROW_SSSE3 // Shuffle table for reversing the bytes. -CONST uvec8 kARGBShuffleMirror = { +static uvec8 kARGBShuffleMirror = { 12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u, 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u }; void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { intptr_t temp_width = static_cast<intptr_t>(width); asm volatile ( + "lea " MEMLEA4(-0x10,0,2,4) ",%0 \n" "movdqa %3,%%xmm5 \n" - "lea -0x10(%0),%0 \n" - ".p2align 4 \n" + ".p2align 2 \n" "1: \n" - "movdqa (%0,%2,4),%%xmm0 \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" "pshufb %%xmm5,%%xmm0 \n" + "lea " MEMLEA(-0x10,0) ",%0 \n" "sub $0x4,%2 \n" - "movdqa %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" "jg 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 @@ -2860,11 +3177,12 @@ void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { "pcmpeqb %%xmm5,%%xmm5 \n" "psrlw $0x8,%%xmm5 \n" "sub %1,%2 \n" - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movdqa (%0),%%xmm0 \n" - "movdqa 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" "movdqa %%xmm0,%%xmm2 \n" "movdqa %%xmm1,%%xmm3 \n" "pand %%xmm5,%%xmm0 \n" @@ -2873,9 +3191,9 @@ void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { "psrlw $0x8,%%xmm2 \n" "psrlw $0x8,%%xmm3 \n" "packuswb %%xmm3,%%xmm2 \n" - "movdqa %%xmm0,(%1) \n" - "movdqa %%xmm2,(%1,%2) \n" - "lea 0x10(%1),%1 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + MEMOPMEM(movdqa,xmm2,0x00,1,2,1) // movdqa %%xmm2,(%1,%2) + "lea " MEMLEA(0x10,1) ",%1 \n" "sub $0x10,%3 \n" "jg 1b \n" : "+r"(src_uv), // %0 @@ -2884,6 +3202,9 @@ void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { "+r"(pix) // %3 : : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" #endif @@ -2896,11 +3217,12 @@ void SplitUVRow_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, "pcmpeqb %%xmm5,%%xmm5 \n" "psrlw $0x8,%%xmm5 \n" "sub %1,%2 \n" - ".p2align 4 \n" + ".p2align 2 \n" + 
BUNDLEALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" "movdqa %%xmm0,%%xmm2 \n" "movdqa %%xmm1,%%xmm3 \n" "pand %%xmm5,%%xmm0 \n" @@ -2909,9 +3231,9 @@ void SplitUVRow_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, "psrlw $0x8,%%xmm2 \n" "psrlw $0x8,%%xmm3 \n" "packuswb %%xmm3,%%xmm2 \n" - "movdqu %%xmm0,(%1) \n" - "movdqu %%xmm2,(%1,%2) \n" - "lea 0x10(%1),%1 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + MEMOPMEM(movdqu,xmm2,0x00,1,2,1) // movdqu %%xmm2,(%1,%2) + "lea " MEMLEA(0x10,1) ",%1 \n" "sub $0x10,%3 \n" "jg 1b \n" : "+r"(src_uv), // %0 @@ -2920,6 +3242,9 @@ void SplitUVRow_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, "+r"(pix) // %3 : : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" #endif @@ -2932,17 +3257,18 @@ void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, int width) { asm volatile ( "sub %0,%1 \n" - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movdqa (%0),%%xmm0 \n" - "movdqa (%0,%1,1),%%xmm1 \n" - "lea 0x10(%0),%0 \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + MEMOPREG(movdqa,0x00,0,1,1,xmm1) // movdqa (%0,%1,1),%%xmm1 + "lea " MEMLEA(0x10,0) ",%0 \n" "movdqa %%xmm0,%%xmm2 \n" "punpcklbw %%xmm1,%%xmm0 \n" "punpckhbw %%xmm1,%%xmm2 \n" - "movdqa %%xmm0,(%2) \n" - "movdqa %%xmm2,0x10(%2) \n" - "lea 0x20(%2),%2 \n" + "movdqa %%xmm0," MEMACCESS(2) " \n" + "movdqa %%xmm2," MEMACCESS2(0x10,2) " \n" + "lea " MEMLEA(0x20,2) ",%2 \n" "sub $0x10,%3 \n" "jg 1b \n" : "+r"(src_u), // %0 @@ -2951,6 +3277,9 @@ void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, "+r"(width) // %3 : : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2" #endif @@ -2961,17 +3290,18 @@ void MergeUVRow_Unaligned_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, int width) { asm volatile ( "sub %0,%1 \n" - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu (%0,%1,1),%%xmm1 \n" - "lea 0x10(%0),%0 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1 + "lea " MEMLEA(0x10,0) ",%0 \n" "movdqa %%xmm0,%%xmm2 \n" "punpcklbw %%xmm1,%%xmm0 \n" "punpckhbw %%xmm1,%%xmm2 \n" - "movdqu %%xmm0,(%2) \n" - "movdqu %%xmm2,0x10(%2) \n" - "lea 0x20(%2),%2 \n" + "movdqu %%xmm0," MEMACCESS(2) " \n" + "movdqu %%xmm2," MEMACCESS2(0x10,2) " \n" + "lea " MEMLEA(0x20,2) ",%2 \n" "sub $0x10,%3 \n" "jg 1b \n" : "+r"(src_u), // %0 @@ -2980,6 +3310,9 @@ void MergeUVRow_Unaligned_SSE2(const uint8* src_u, const uint8* src_v, "+r"(width) // %3 : : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2" #endif @@ -2990,14 +3323,14 @@ void MergeUVRow_Unaligned_SSE2(const uint8* src_u, const uint8* src_v, #ifdef HAS_COPYROW_SSE2 void CopyRow_SSE2(const uint8* src, uint8* dst, int count) { asm volatile ( - "sub %0,%1 \n" - ".p2align 4 \n" + ".p2align 2 \n" "1: \n" - "movdqa (%0),%%xmm0 \n" - "movdqa 0x10(%0),%%xmm1 \n" - "movdqa %%xmm0,(%0,%1) \n" - "movdqa %%xmm1,0x10(%0,%1) \n" - "lea 0x20(%0),%0 \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "movdqa %%xmm0," 
MEMACCESS(1) " \n" + "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n" + "lea " MEMLEA(0x20,1) ",%1 \n" "sub $0x20,%2 \n" "jg 1b \n" : "+r"(src), // %0 @@ -3017,7 +3350,7 @@ void CopyRow_X86(const uint8* src, uint8* dst, int width) { size_t width_tmp = static_cast<size_t>(width); asm volatile ( "shr $0x2,%2 \n" - "rep movsl \n" + "rep movsl " MEMMOVESTRING(0,1) " \n" : "+S"(src), // %0 "+D"(dst), // %1 "+c"(width_tmp) // %2 @@ -3027,11 +3360,12 @@ void CopyRow_X86(const uint8* src, uint8* dst, int width) { } #endif // HAS_COPYROW_X86 +#ifdef HAS_COPYROW_ERMS // Unaligned Multiple of 1. void CopyRow_ERMS(const uint8* src, uint8* dst, int width) { size_t width_tmp = static_cast<size_t>(width); asm volatile ( - "rep movsb \n" + "rep movsb " MEMMOVESTRING(0,1) " \n" : "+S"(src), // %0 "+D"(dst), // %1 "+c"(width_tmp) // %2 @@ -3039,13 +3373,156 @@ void CopyRow_ERMS(const uint8* src, uint8* dst, int width) { : "memory", "cc" ); } +#endif // HAS_COPYROW_ERMS + +#ifdef HAS_ARGBCOPYALPHAROW_SSE2 +// width in pixels +void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { + asm volatile ( + "pcmpeqb %%xmm0,%%xmm0 \n" + "pslld $0x18,%%xmm0 \n" + "pcmpeqb %%xmm1,%%xmm1 \n" + "psrld $0x8,%%xmm1 \n" + ".p2align 2 \n" + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm2 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm3 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "movdqa " MEMACCESS(1) ",%%xmm4 \n" + "movdqa " MEMACCESS2(0x10,1) ",%%xmm5 \n" + "pand %%xmm0,%%xmm2 \n" + "pand %%xmm0,%%xmm3 \n" + "pand %%xmm1,%%xmm4 \n" + "pand %%xmm1,%%xmm5 \n" + "por %%xmm4,%%xmm2 \n" + "por %%xmm5,%%xmm3 \n" + "movdqa %%xmm2," MEMACCESS(1) " \n" + "movdqa %%xmm3," MEMACCESS2(0x10,1) " \n" + "lea " MEMLEA(0x20,1) ",%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} +#endif // HAS_ARGBCOPYALPHAROW_SSE2 + +#ifdef HAS_ARGBCOPYALPHAROW_AVX2 +// width in pixels +void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { + asm volatile ( + "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n" + "vpsrld $0x8,%%ymm0,%%ymm0 \n" + ".p2align 2 \n" + "1: \n" + "vmovdqu " MEMACCESS(0) ",%%ymm1 \n" + "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm2 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" + "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1 \n" + "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2 \n" + "vmovdqu %%ymm1," MEMACCESS(1) " \n" + "vmovdqu %%ymm2," MEMACCESS2(0x20,1) " \n" + "lea " MEMLEA(0x40,1) ",%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2" +#endif + ); +} +#endif // HAS_ARGBCOPYALPHAROW_AVX2 + +#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2 +// width in pixels +void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { + asm volatile ( + "pcmpeqb %%xmm0,%%xmm0 \n" + "pslld $0x18,%%xmm0 \n" + "pcmpeqb %%xmm1,%%xmm1 \n" + "psrld $0x8,%%xmm1 \n" + ".p2align 2 \n" + "1: \n" + "movq " MEMACCESS(0) ",%%xmm2 \n" + "lea " MEMLEA(0x8,0) ",%0 \n" + "punpcklbw %%xmm2,%%xmm2 \n" + "punpckhwd %%xmm2,%%xmm3 \n" + "punpcklwd %%xmm2,%%xmm2 \n" + "movdqa " MEMACCESS(1) ",%%xmm4 \n" + "movdqa " MEMACCESS2(0x10,1) ",%%xmm5 \n" + "pand %%xmm0,%%xmm2 \n" + "pand %%xmm0,%%xmm3 \n" + "pand %%xmm1,%%xmm4 \n" + "pand %%xmm1,%%xmm5 \n" + "por %%xmm4,%%xmm2 \n" + "por %%xmm5,%%xmm3 \n" + "movdqa %%xmm2," MEMACCESS(1) " \n" + "movdqa %%xmm3," MEMACCESS2(0x10,1) " \n" + "lea " 
MEMLEA(0x20,1) ",%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} +#endif // HAS_ARGBCOPYYTOALPHAROW_SSE2 + +#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2 +// width in pixels +void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { + asm volatile ( + "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n" + "vpsrld $0x8,%%ymm0,%%ymm0 \n" + ".p2align 2 \n" + "1: \n" + "vpmovzxbd " MEMACCESS(0) ",%%ymm1 \n" + "vpmovzxbd " MEMACCESS2(0x8,0) ",%%ymm2 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" + "vpslld $0x18,%%ymm1,%%ymm1 \n" + "vpslld $0x18,%%ymm2,%%ymm2 \n" + "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1 \n" + "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2 \n" + "vmovdqu %%ymm1," MEMACCESS(1) " \n" + "vmovdqu %%ymm2," MEMACCESS2(0x20,1) " \n" + "lea " MEMLEA(0x40,1) ",%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2" +#endif + ); +} +#endif // HAS_ARGBCOPYYTOALPHAROW_AVX2 #ifdef HAS_SETROW_X86 void SetRow_X86(uint8* dst, uint32 v32, int width) { size_t width_tmp = static_cast<size_t>(width); asm volatile ( "shr $0x2,%1 \n" - "rep stosl \n" + "rep stosl " MEMSTORESTRING(eax,0) " \n" : "+D"(dst), // %0 "+c"(width_tmp) // %1 : "a"(v32) // %2 @@ -3058,7 +3535,7 @@ void ARGBSetRows_X86(uint8* dst, uint32 v32, int width, size_t width_tmp = static_cast<size_t>(width); uint32* d = reinterpret_cast<uint32*>(dst); asm volatile ( - "rep stosl \n" + "rep stosl " MEMSTORESTRING(eax,0) " \n" : "+D"(d), // %0 "+c"(width_tmp) // %1 : "a"(v32) // %2 @@ -3073,16 +3550,17 @@ void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) { asm volatile ( "pcmpeqb %%xmm5,%%xmm5 \n" "psrlw $0x8,%%xmm5 \n" - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movdqa (%0),%%xmm0 \n" - "movdqa 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" "pand %%xmm5,%%xmm0 \n" "pand %%xmm5,%%xmm1 \n" "packuswb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" "sub $0x10,%2 \n" "jg 1b \n" : "+r"(src_yuy2), // %0 @@ -3102,13 +3580,15 @@ void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, "pcmpeqb %%xmm5,%%xmm5 \n" "psrlw $0x8,%%xmm5 \n" "sub %1,%2 \n" - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movdqa (%0),%%xmm0 \n" - "movdqa 0x10(%0),%%xmm1 \n" - "movdqa (%0,%4,1),%%xmm2 \n" - "movdqa 0x10(%0,%4,1),%%xmm3 \n" - "lea 0x20(%0),%0 \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + BUNDLEALIGN + MEMOPREG(movdqa,0x00,0,4,1,xmm2) // movdqa (%0,%4,1),%%xmm2 + MEMOPREG(movdqa,0x10,0,4,1,xmm3) // movdqa 0x10(%0,%4,1),%%xmm3 + "lea " MEMLEA(0x20,0) ",%0 \n" "pavgb %%xmm2,%%xmm0 \n" "pavgb %%xmm3,%%xmm1 \n" "psrlw $0x8,%%xmm0 \n" @@ -3119,9 +3599,10 @@ void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, "packuswb %%xmm0,%%xmm0 \n" "psrlw $0x8,%%xmm1 \n" "packuswb %%xmm1,%%xmm1 \n" - "movq %%xmm0,(%1) \n" - "movq %%xmm1,(%1,%2) \n" - "lea 0x8(%1),%1 \n" + "movq %%xmm0," MEMACCESS(1) " \n" + BUNDLEALIGN + MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2) + "lea " MEMLEA(0x8,1) ",%1 \n" "sub $0x10,%3 \n" "jg 1b \n" : "+r"(src_yuy2), // %0 @@ -3130,6 +3611,9 @@ void 
YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, "+r"(pix) // %3 : "r"(static_cast<intptr_t>(stride_yuy2)) // %4 : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" #endif @@ -3142,11 +3626,12 @@ void YUY2ToUV422Row_SSE2(const uint8* src_yuy2, "pcmpeqb %%xmm5,%%xmm5 \n" "psrlw $0x8,%%xmm5 \n" "sub %1,%2 \n" - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movdqa (%0),%%xmm0 \n" - "movdqa 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" "psrlw $0x8,%%xmm0 \n" "psrlw $0x8,%%xmm1 \n" "packuswb %%xmm1,%%xmm0 \n" @@ -3155,9 +3640,10 @@ void YUY2ToUV422Row_SSE2(const uint8* src_yuy2, "packuswb %%xmm0,%%xmm0 \n" "psrlw $0x8,%%xmm1 \n" "packuswb %%xmm1,%%xmm1 \n" - "movq %%xmm0,(%1) \n" - "movq %%xmm1,(%1,%2) \n" - "lea 0x8(%1),%1 \n" + "movq %%xmm0," MEMACCESS(1) " \n" + BUNDLEALIGN + MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2) + "lea " MEMLEA(0x8,1) ",%1 \n" "sub $0x10,%3 \n" "jg 1b \n" : "+r"(src_yuy2), // %0 @@ -3166,6 +3652,9 @@ void YUY2ToUV422Row_SSE2(const uint8* src_yuy2, "+r"(pix) // %3 : : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1", "xmm5" #endif @@ -3177,17 +3666,18 @@ void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2, asm volatile ( "pcmpeqb %%xmm5,%%xmm5 \n" "psrlw $0x8,%%xmm5 \n" - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" "pand %%xmm5,%%xmm0 \n" "pand %%xmm5,%%xmm1 \n" "packuswb %%xmm1,%%xmm0 \n" "sub $0x10,%2 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" "jg 1b \n" : "+r"(src_yuy2), // %0 "+r"(dst_y), // %1 @@ -3207,13 +3697,15 @@ void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, "pcmpeqb %%xmm5,%%xmm5 \n" "psrlw $0x8,%%xmm5 \n" "sub %1,%2 \n" - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu (%0,%4,1),%%xmm2 \n" - "movdqu 0x10(%0,%4,1),%%xmm3 \n" - "lea 0x20(%0),%0 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + BUNDLEALIGN + MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2 + MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3 + "lea " MEMLEA(0x20,0) ",%0 \n" "pavgb %%xmm2,%%xmm0 \n" "pavgb %%xmm3,%%xmm1 \n" "psrlw $0x8,%%xmm0 \n" @@ -3224,9 +3716,10 @@ void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, "packuswb %%xmm0,%%xmm0 \n" "psrlw $0x8,%%xmm1 \n" "packuswb %%xmm1,%%xmm1 \n" - "movq %%xmm0,(%1) \n" - "movq %%xmm1,(%1,%2) \n" - "lea 0x8(%1),%1 \n" + "movq %%xmm0," MEMACCESS(1) " \n" + BUNDLEALIGN + MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2) + "lea " MEMLEA(0x8,1) ",%1 \n" "sub $0x10,%3 \n" "jg 1b \n" : "+r"(src_yuy2), // %0 @@ -3235,6 +3728,9 @@ void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, "+r"(pix) // %3 : "r"(static_cast<intptr_t>(stride_yuy2)) // %4 : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" #endif @@ -3247,11 +3743,12 @@ void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2, "pcmpeqb 
%%xmm5,%%xmm5 \n" "psrlw $0x8,%%xmm5 \n" "sub %1,%2 \n" - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" "psrlw $0x8,%%xmm0 \n" "psrlw $0x8,%%xmm1 \n" "packuswb %%xmm1,%%xmm0 \n" @@ -3260,9 +3757,10 @@ void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2, "packuswb %%xmm0,%%xmm0 \n" "psrlw $0x8,%%xmm1 \n" "packuswb %%xmm1,%%xmm1 \n" - "movq %%xmm0,(%1) \n" - "movq %%xmm1,(%1,%2) \n" - "lea 0x8(%1),%1 \n" + "movq %%xmm0," MEMACCESS(1) " \n" + BUNDLEALIGN + MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2) + "lea " MEMLEA(0x8,1) ",%1 \n" "sub $0x10,%3 \n" "jg 1b \n" : "+r"(src_yuy2), // %0 @@ -3271,6 +3769,9 @@ void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2, "+r"(pix) // %3 : : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1", "xmm5" #endif @@ -3279,17 +3780,18 @@ void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2, void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) { asm volatile ( - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movdqa (%0),%%xmm0 \n" - "movdqa 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" "psrlw $0x8,%%xmm0 \n" "psrlw $0x8,%%xmm1 \n" "packuswb %%xmm1,%%xmm0 \n" "sub $0x10,%2 \n" - "movdqa %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" "jg 1b \n" : "+r"(src_uyvy), // %0 "+r"(dst_y), // %1 @@ -3308,13 +3810,15 @@ void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, "pcmpeqb %%xmm5,%%xmm5 \n" "psrlw $0x8,%%xmm5 \n" "sub %1,%2 \n" - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movdqa (%0),%%xmm0 \n" - "movdqa 0x10(%0),%%xmm1 \n" - "movdqa (%0,%4,1),%%xmm2 \n" - "movdqa 0x10(%0,%4,1),%%xmm3 \n" - "lea 0x20(%0),%0 \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + BUNDLEALIGN + MEMOPREG(movdqa,0x00,0,4,1,xmm2) // movdqa (%0,%4,1),%%xmm2 + MEMOPREG(movdqa,0x10,0,4,1,xmm3) // movdqa 0x10(%0,%4,1),%%xmm3 + "lea " MEMLEA(0x20,0) ",%0 \n" "pavgb %%xmm2,%%xmm0 \n" "pavgb %%xmm3,%%xmm1 \n" "pand %%xmm5,%%xmm0 \n" @@ -3325,9 +3829,10 @@ void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, "packuswb %%xmm0,%%xmm0 \n" "psrlw $0x8,%%xmm1 \n" "packuswb %%xmm1,%%xmm1 \n" - "movq %%xmm0,(%1) \n" - "movq %%xmm1,(%1,%2) \n" - "lea 0x8(%1),%1 \n" + "movq %%xmm0," MEMACCESS(1) " \n" + BUNDLEALIGN + MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2) + "lea " MEMLEA(0x8,1) ",%1 \n" "sub $0x10,%3 \n" "jg 1b \n" : "+r"(src_uyvy), // %0 @@ -3336,6 +3841,9 @@ void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, "+r"(pix) // %3 : "r"(static_cast<intptr_t>(stride_uyvy)) // %4 : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" #endif @@ -3348,11 +3856,12 @@ void UYVYToUV422Row_SSE2(const uint8* src_uyvy, "pcmpeqb %%xmm5,%%xmm5 \n" "psrlw $0x8,%%xmm5 \n" "sub %1,%2 \n" - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movdqa (%0),%%xmm0 \n" - "movdqa 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" "pand 
%%xmm5,%%xmm0 \n" "pand %%xmm5,%%xmm1 \n" "packuswb %%xmm1,%%xmm0 \n" @@ -3361,9 +3870,10 @@ void UYVYToUV422Row_SSE2(const uint8* src_uyvy, "packuswb %%xmm0,%%xmm0 \n" "psrlw $0x8,%%xmm1 \n" "packuswb %%xmm1,%%xmm1 \n" - "movq %%xmm0,(%1) \n" - "movq %%xmm1,(%1,%2) \n" - "lea 0x8(%1),%1 \n" + "movq %%xmm0," MEMACCESS(1) " \n" + BUNDLEALIGN + MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2) + "lea " MEMLEA(0x8,1) ",%1 \n" "sub $0x10,%3 \n" "jg 1b \n" : "+r"(src_uyvy), // %0 @@ -3372,6 +3882,9 @@ void UYVYToUV422Row_SSE2(const uint8* src_uyvy, "+r"(pix) // %3 : : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1", "xmm5" #endif @@ -3381,17 +3894,18 @@ void UYVYToUV422Row_SSE2(const uint8* src_uyvy, void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) { asm volatile ( - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" "psrlw $0x8,%%xmm0 \n" "psrlw $0x8,%%xmm1 \n" "packuswb %%xmm1,%%xmm0 \n" "sub $0x10,%2 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" "jg 1b \n" : "+r"(src_uyvy), // %0 "+r"(dst_y), // %1 @@ -3410,13 +3924,15 @@ void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy, "pcmpeqb %%xmm5,%%xmm5 \n" "psrlw $0x8,%%xmm5 \n" "sub %1,%2 \n" - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu (%0,%4,1),%%xmm2 \n" - "movdqu 0x10(%0,%4,1),%%xmm3 \n" - "lea 0x20(%0),%0 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + BUNDLEALIGN + MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2 + MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3 + "lea " MEMLEA(0x20,0) ",%0 \n" "pavgb %%xmm2,%%xmm0 \n" "pavgb %%xmm3,%%xmm1 \n" "pand %%xmm5,%%xmm0 \n" @@ -3427,9 +3943,10 @@ void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy, "packuswb %%xmm0,%%xmm0 \n" "psrlw $0x8,%%xmm1 \n" "packuswb %%xmm1,%%xmm1 \n" - "movq %%xmm0,(%1) \n" - "movq %%xmm1,(%1,%2) \n" - "lea 0x8(%1),%1 \n" + "movq %%xmm0," MEMACCESS(1) " \n" + BUNDLEALIGN + MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2) + "lea " MEMLEA(0x8,1) ",%1 \n" "sub $0x10,%3 \n" "jg 1b \n" : "+r"(src_uyvy), // %0 @@ -3438,6 +3955,9 @@ void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy, "+r"(pix) // %3 : "r"(static_cast<intptr_t>(stride_uyvy)) // %4 : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" #endif @@ -3450,11 +3970,12 @@ void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy, "pcmpeqb %%xmm5,%%xmm5 \n" "psrlw $0x8,%%xmm5 \n" "sub %1,%2 \n" - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" "pand %%xmm5,%%xmm0 \n" "pand %%xmm5,%%xmm1 \n" "packuswb %%xmm1,%%xmm0 \n" @@ -3463,9 +3984,10 @@ void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy, "packuswb %%xmm0,%%xmm0 \n" "psrlw $0x8,%%xmm1 \n" "packuswb %%xmm1,%%xmm1 \n" - "movq %%xmm0,(%1) \n" - "movq %%xmm1,(%1,%2) \n" - "lea 0x8(%1),%1 
\n" + "movq %%xmm0," MEMACCESS(1) " \n" + BUNDLEALIGN + MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2) + "lea " MEMLEA(0x8,1) ",%1 \n" "sub $0x10,%3 \n" "jg 1b \n" : "+r"(src_uyvy), // %0 @@ -3474,6 +3996,9 @@ void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy, "+r"(pix) // %3 : : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1", "xmm5" #endif @@ -3502,19 +4027,19 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, "10: \n" "test $0xf,%2 \n" "je 19f \n" - "movd (%0),%%xmm3 \n" - "lea 0x4(%0),%0 \n" + "movd " MEMACCESS(0) ",%%xmm3 \n" + "lea " MEMLEA(0x4,0) ",%0 \n" "movdqa %%xmm3,%%xmm0 \n" "pxor %%xmm4,%%xmm3 \n" - "movd (%1),%%xmm2 \n" + "movd " MEMACCESS(1) ",%%xmm2 \n" "psrlw $0x8,%%xmm3 \n" "pshufhw $0xf5,%%xmm3,%%xmm3 \n" "pshuflw $0xf5,%%xmm3,%%xmm3 \n" "pand %%xmm6,%%xmm2 \n" "paddw %%xmm7,%%xmm3 \n" "pmullw %%xmm3,%%xmm2 \n" - "movd (%1),%%xmm1 \n" - "lea 0x4(%1),%1 \n" + "movd " MEMACCESS(1) ",%%xmm1 \n" + "lea " MEMLEA(0x4,1) ",%1 \n" "psrlw $0x8,%%xmm1 \n" "por %%xmm4,%%xmm0 \n" "pmullw %%xmm3,%%xmm1 \n" @@ -3523,8 +4048,8 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, "pand %%xmm5,%%xmm1 \n" "paddusb %%xmm1,%%xmm0 \n" "sub $0x1,%3 \n" - "movd %%xmm0,(%2) \n" - "lea 0x4(%2),%2 \n" + "movd %%xmm0," MEMACCESS(2) " \n" + "lea " MEMLEA(0x4,2) ",%2 \n" "jge 10b \n" "19: \n" @@ -3534,19 +4059,19 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, // 4 pixel loop. ".p2align 2 \n" "41: \n" - "movdqu (%0),%%xmm3 \n" - "lea 0x10(%0),%0 \n" + "movdqu " MEMACCESS(0) ",%%xmm3 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" "movdqa %%xmm3,%%xmm0 \n" "pxor %%xmm4,%%xmm3 \n" - "movdqu (%1),%%xmm2 \n" + "movdqu " MEMACCESS(1) ",%%xmm2 \n" "psrlw $0x8,%%xmm3 \n" "pshufhw $0xf5,%%xmm3,%%xmm3 \n" "pshuflw $0xf5,%%xmm3,%%xmm3 \n" "pand %%xmm6,%%xmm2 \n" "paddw %%xmm7,%%xmm3 \n" "pmullw %%xmm3,%%xmm2 \n" - "movdqu (%1),%%xmm1 \n" - "lea 0x10(%1),%1 \n" + "movdqu " MEMACCESS(1) ",%%xmm1 \n" + "lea " MEMLEA(0x10,1) ",%1 \n" "psrlw $0x8,%%xmm1 \n" "por %%xmm4,%%xmm0 \n" "pmullw %%xmm3,%%xmm1 \n" @@ -3555,8 +4080,8 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, "pand %%xmm5,%%xmm1 \n" "paddusb %%xmm1,%%xmm0 \n" "sub $0x4,%3 \n" - "movdqa %%xmm0,(%2) \n" - "lea 0x10(%2),%2 \n" + "movdqa %%xmm0," MEMACCESS(2) " \n" + "lea " MEMLEA(0x10,2) ",%2 \n" "jge 41b \n" "49: \n" @@ -3565,19 +4090,19 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, // 1 pixel loop. 
"91: \n" - "movd (%0),%%xmm3 \n" - "lea 0x4(%0),%0 \n" + "movd " MEMACCESS(0) ",%%xmm3 \n" + "lea " MEMLEA(0x4,0) ",%0 \n" "movdqa %%xmm3,%%xmm0 \n" "pxor %%xmm4,%%xmm3 \n" - "movd (%1),%%xmm2 \n" + "movd " MEMACCESS(1) ",%%xmm2 \n" "psrlw $0x8,%%xmm3 \n" "pshufhw $0xf5,%%xmm3,%%xmm3 \n" "pshuflw $0xf5,%%xmm3,%%xmm3 \n" "pand %%xmm6,%%xmm2 \n" "paddw %%xmm7,%%xmm3 \n" "pmullw %%xmm3,%%xmm2 \n" - "movd (%1),%%xmm1 \n" - "lea 0x4(%1),%1 \n" + "movd " MEMACCESS(1) ",%%xmm1 \n" + "lea " MEMLEA(0x4,1) ",%1 \n" "psrlw $0x8,%%xmm1 \n" "por %%xmm4,%%xmm0 \n" "pmullw %%xmm3,%%xmm1 \n" @@ -3586,8 +4111,8 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, "pand %%xmm5,%%xmm1 \n" "paddusb %%xmm1,%%xmm0 \n" "sub $0x1,%3 \n" - "movd %%xmm0,(%2) \n" - "lea 0x4(%2),%2 \n" + "movd %%xmm0," MEMACCESS(2) " \n" + "lea " MEMLEA(0x4,2) ",%2 \n" "jge 91b \n" "99: \n" : "+r"(src_argb0), // %0 @@ -3605,7 +4130,7 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, #ifdef HAS_ARGBBLENDROW_SSSE3 // Shuffle table for isolating alpha. -CONST uvec8 kShuffleAlpha = { +static uvec8 kShuffleAlpha = { 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80, 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80 }; @@ -3639,17 +4164,17 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, "10: \n" "test $0xf,%2 \n" "je 19f \n" - "movd (%0),%%xmm3 \n" - "lea 0x4(%0),%0 \n" + "movd " MEMACCESS(0) ",%%xmm3 \n" + "lea " MEMLEA(0x4,0) ",%0 \n" "movdqa %%xmm3,%%xmm0 \n" "pxor %%xmm4,%%xmm3 \n" - "movd (%1),%%xmm2 \n" + "movd " MEMACCESS(1) ",%%xmm2 \n" "pshufb %4,%%xmm3 \n" "pand %%xmm6,%%xmm2 \n" "paddw %%xmm7,%%xmm3 \n" "pmullw %%xmm3,%%xmm2 \n" - "movd (%1),%%xmm1 \n" - "lea 0x4(%1),%1 \n" + "movd " MEMACCESS(1) ",%%xmm1 \n" + "lea " MEMLEA(0x4,1) ",%1 \n" "psrlw $0x8,%%xmm1 \n" "por %%xmm4,%%xmm0 \n" "pmullw %%xmm3,%%xmm1 \n" @@ -3658,8 +4183,8 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, "pand %%xmm5,%%xmm1 \n" "paddusb %%xmm1,%%xmm0 \n" "sub $0x1,%3 \n" - "movd %%xmm0,(%2) \n" - "lea 0x4(%2),%2 \n" + "movd %%xmm0," MEMACCESS(2) " \n" + "lea " MEMLEA(0x4,2) ",%2 \n" "jge 10b \n" "19: \n" @@ -3673,17 +4198,17 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, // 4 pixel loop. ".p2align 2 \n" "40: \n" - "movdqa (%0),%%xmm3 \n" - "lea 0x10(%0),%0 \n" + "movdqa " MEMACCESS(0) ",%%xmm3 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" "movdqa %%xmm3,%%xmm0 \n" "pxor %%xmm4,%%xmm3 \n" - "movdqa (%1),%%xmm2 \n" + "movdqa " MEMACCESS(1) ",%%xmm2 \n" "pshufb %4,%%xmm3 \n" "pand %%xmm6,%%xmm2 \n" "paddw %%xmm7,%%xmm3 \n" "pmullw %%xmm3,%%xmm2 \n" - "movdqa (%1),%%xmm1 \n" - "lea 0x10(%1),%1 \n" + "movdqa " MEMACCESS(1) ",%%xmm1 \n" + "lea " MEMLEA(0x10,1) ",%1 \n" "psrlw $0x8,%%xmm1 \n" "por %%xmm4,%%xmm0 \n" "pmullw %%xmm3,%%xmm1 \n" @@ -3692,25 +4217,25 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, "pand %%xmm5,%%xmm1 \n" "paddusb %%xmm1,%%xmm0 \n" "sub $0x4,%3 \n" - "movdqa %%xmm0,(%2) \n" - "lea 0x10(%2),%2 \n" + "movdqa %%xmm0," MEMACCESS(2) " \n" + "lea " MEMLEA(0x10,2) ",%2 \n" "jge 40b \n" "jmp 49f \n" // 4 pixel unaligned loop. 
".p2align 2 \n" "41: \n" - "movdqu (%0),%%xmm3 \n" - "lea 0x10(%0),%0 \n" + "movdqu " MEMACCESS(0) ",%%xmm3 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" "movdqa %%xmm3,%%xmm0 \n" "pxor %%xmm4,%%xmm3 \n" - "movdqu (%1),%%xmm2 \n" + "movdqu " MEMACCESS(1) ",%%xmm2 \n" "pshufb %4,%%xmm3 \n" "pand %%xmm6,%%xmm2 \n" "paddw %%xmm7,%%xmm3 \n" "pmullw %%xmm3,%%xmm2 \n" - "movdqu (%1),%%xmm1 \n" - "lea 0x10(%1),%1 \n" + "movdqu " MEMACCESS(1) ",%%xmm1 \n" + "lea " MEMLEA(0x10,1) ",%1 \n" "psrlw $0x8,%%xmm1 \n" "por %%xmm4,%%xmm0 \n" "pmullw %%xmm3,%%xmm1 \n" @@ -3719,8 +4244,8 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, "pand %%xmm5,%%xmm1 \n" "paddusb %%xmm1,%%xmm0 \n" "sub $0x4,%3 \n" - "movdqa %%xmm0,(%2) \n" - "lea 0x10(%2),%2 \n" + "movdqa %%xmm0," MEMACCESS(2) " \n" + "lea " MEMLEA(0x10,2) ",%2 \n" "jge 41b \n" "49: \n" @@ -3729,17 +4254,17 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, // 1 pixel loop. "91: \n" - "movd (%0),%%xmm3 \n" - "lea 0x4(%0),%0 \n" + "movd " MEMACCESS(0) ",%%xmm3 \n" + "lea " MEMLEA(0x4,0) ",%0 \n" "movdqa %%xmm3,%%xmm0 \n" "pxor %%xmm4,%%xmm3 \n" - "movd (%1),%%xmm2 \n" + "movd " MEMACCESS(1) ",%%xmm2 \n" "pshufb %4,%%xmm3 \n" "pand %%xmm6,%%xmm2 \n" "paddw %%xmm7,%%xmm3 \n" "pmullw %%xmm3,%%xmm2 \n" - "movd (%1),%%xmm1 \n" - "lea 0x4(%1),%1 \n" + "movd " MEMACCESS(1) ",%%xmm1 \n" + "lea " MEMLEA(0x4,1) ",%1 \n" "psrlw $0x8,%%xmm1 \n" "por %%xmm4,%%xmm0 \n" "pmullw %%xmm3,%%xmm1 \n" @@ -3748,8 +4273,8 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, "pand %%xmm5,%%xmm1 \n" "paddusb %%xmm1,%%xmm0 \n" "sub $0x1,%3 \n" - "movd %%xmm0,(%2) \n" - "lea 0x4(%2),%2 \n" + "movd %%xmm0," MEMACCESS(2) " \n" + "lea " MEMLEA(0x4,2) ",%2 \n" "jge 91b \n" "99: \n" : "+r"(src_argb0), // %0 @@ -3770,26 +4295,26 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, // aligned to 16 bytes void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { asm volatile ( - "sub %0,%1 \n" "pcmpeqb %%xmm4,%%xmm4 \n" "pslld $0x18,%%xmm4 \n" "pcmpeqb %%xmm5,%%xmm5 \n" "psrld $0x8,%%xmm5 \n" // 4 pixel loop. 
- ".p2align 4 \n" + ".p2align 2 \n" "1: \n" - "movdqa (%0),%%xmm0 \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" "punpcklbw %%xmm0,%%xmm0 \n" "pshufhw $0xff,%%xmm0,%%xmm2 \n" "pshuflw $0xff,%%xmm2,%%xmm2 \n" "pmulhuw %%xmm2,%%xmm0 \n" - "movdqa (%0),%%xmm1 \n" + "movdqa " MEMACCESS(0) ",%%xmm1 \n" "punpckhbw %%xmm1,%%xmm1 \n" "pshufhw $0xff,%%xmm1,%%xmm2 \n" "pshuflw $0xff,%%xmm2,%%xmm2 \n" "pmulhuw %%xmm2,%%xmm1 \n" - "movdqa (%0),%%xmm2 \n" + "movdqa " MEMACCESS(0) ",%%xmm2 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" "psrlw $0x8,%%xmm0 \n" "pand %%xmm4,%%xmm2 \n" "psrlw $0x8,%%xmm1 \n" @@ -3797,8 +4322,8 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { "pand %%xmm5,%%xmm0 \n" "por %%xmm2,%%xmm0 \n" "sub $0x4,%2 \n" - "movdqa %%xmm0,(%0,%1,1) \n" - "lea 0x10(%0),%0 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 @@ -3814,10 +4339,10 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { #ifdef HAS_ARGBATTENUATEROW_SSSE3 // Shuffle table duplicating alpha -CONST uvec8 kShuffleAlpha0 = { +static uvec8 kShuffleAlpha0 = { 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u, }; -CONST uvec8 kShuffleAlpha1 = { +static uvec8 kShuffleAlpha1 = { 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u, 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u, }; @@ -3825,34 +4350,34 @@ CONST uvec8 kShuffleAlpha1 = { // aligned to 16 bytes void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { asm volatile ( - "sub %0,%1 \n" "pcmpeqb %%xmm3,%%xmm3 \n" "pslld $0x18,%%xmm3 \n" "movdqa %3,%%xmm4 \n" "movdqa %4,%%xmm5 \n" // 4 pixel loop. - ".p2align 4 \n" + ".p2align 2 \n" "1: \n" - "movdqa (%0),%%xmm0 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" "pshufb %%xmm4,%%xmm0 \n" - "movdqa (%0),%%xmm1 \n" + "movdqu " MEMACCESS(0) ",%%xmm1 \n" "punpcklbw %%xmm1,%%xmm1 \n" "pmulhuw %%xmm1,%%xmm0 \n" - "movdqa (%0),%%xmm1 \n" + "movdqu " MEMACCESS(0) ",%%xmm1 \n" "pshufb %%xmm5,%%xmm1 \n" - "movdqa (%0),%%xmm2 \n" + "movdqu " MEMACCESS(0) ",%%xmm2 \n" "punpckhbw %%xmm2,%%xmm2 \n" "pmulhuw %%xmm2,%%xmm1 \n" - "movdqa (%0),%%xmm2 \n" + "movdqu " MEMACCESS(0) ",%%xmm2 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" "pand %%xmm3,%%xmm2 \n" "psrlw $0x8,%%xmm0 \n" "psrlw $0x8,%%xmm1 \n" "packuswb %%xmm1,%%xmm0 \n" "por %%xmm2,%%xmm0 \n" "sub $0x4,%2 \n" - "movdqa %%xmm0,(%0,%1,1) \n" - "lea 0x10(%0),%0 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 @@ -3874,35 +4399,34 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { uintptr_t alpha = 0; asm volatile ( - "sub %0,%1 \n" - // 4 pixel loop. 
- ".p2align 4 \n" + ".p2align 2 \n" "1: \n" - "movdqa (%0),%%xmm0 \n" - "movzb 0x3(%0),%3 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movzb " MEMACCESS2(0x03,0) ",%3 \n" "punpcklbw %%xmm0,%%xmm0 \n" - "movd 0x0(%4,%3,4),%%xmm2 \n" - "movzb 0x7(%0),%3 \n" - "movd 0x0(%4,%3,4),%%xmm3 \n" + MEMOPREG(movd,0x00,4,3,4,xmm2) // movd 0x0(%4,%3,4),%%xmm2 + "movzb " MEMACCESS2(0x07,0) ",%3 \n" + MEMOPREG(movd,0x00,4,3,4,xmm3) // movd 0x0(%4,%3,4),%%xmm3 "pshuflw $0x40,%%xmm2,%%xmm2 \n" "pshuflw $0x40,%%xmm3,%%xmm3 \n" "movlhps %%xmm3,%%xmm2 \n" "pmulhuw %%xmm2,%%xmm0 \n" - "movdqa (%0),%%xmm1 \n" - "movzb 0xb(%0),%3 \n" + "movdqu " MEMACCESS(0) ",%%xmm1 \n" + "movzb " MEMACCESS2(0x0b,0) ",%3 \n" "punpckhbw %%xmm1,%%xmm1 \n" - "movd 0x0(%4,%3,4),%%xmm2 \n" - "movzb 0xf(%0),%3 \n" - "movd 0x0(%4,%3,4),%%xmm3 \n" + MEMOPREG(movd,0x00,4,3,4,xmm2) // movd 0x0(%4,%3,4),%%xmm2 + "movzb " MEMACCESS2(0x0f,0) ",%3 \n" + MEMOPREG(movd,0x00,4,3,4,xmm3) // movd 0x0(%4,%3,4),%%xmm3 "pshuflw $0x40,%%xmm2,%%xmm2 \n" "pshuflw $0x40,%%xmm3,%%xmm3 \n" "movlhps %%xmm3,%%xmm2 \n" "pmulhuw %%xmm2,%%xmm1 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" "packuswb %%xmm1,%%xmm0 \n" "sub $0x4,%2 \n" - "movdqa %%xmm0,(%0,%1,1) \n" - "lea 0x10(%0),%0 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 @@ -3910,6 +4434,9 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, "+r"(alpha) // %3 : "r"(fixed_invtbl8) // %4 : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" #endif @@ -3923,21 +4450,21 @@ void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { asm volatile ( "movdqa %3,%%xmm4 \n" "movdqa %4,%%xmm5 \n" - "sub %0,%1 \n" // 8 pixel loop. 
- ".p2align 4 \n" + ".p2align 2 \n" "1: \n" - "movdqa (%0),%%xmm0 \n" - "movdqa 0x10(%0),%%xmm1 \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" "pmaddubsw %%xmm4,%%xmm0 \n" "pmaddubsw %%xmm4,%%xmm1 \n" "phaddw %%xmm1,%%xmm0 \n" "paddw %%xmm5,%%xmm0 \n" "psrlw $0x7,%%xmm0 \n" "packuswb %%xmm0,%%xmm0 \n" - "movdqa (%0),%%xmm2 \n" - "movdqa 0x10(%0),%%xmm3 \n" + "movdqa " MEMACCESS(0) ",%%xmm2 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm3 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" "psrld $0x18,%%xmm2 \n" "psrld $0x18,%%xmm3 \n" "packuswb %%xmm3,%%xmm2 \n" @@ -3949,9 +4476,9 @@ void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { "punpcklwd %%xmm3,%%xmm0 \n" "punpckhwd %%xmm3,%%xmm1 \n" "sub $0x8,%2 \n" - "movdqa %%xmm0,(%0,%1,1) \n" - "movdqa %%xmm1,0x10(%0,%1,1) \n" - "lea 0x20(%0),%0 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n" + "lea " MEMLEA(0x20,1) ",%1 \n" "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 @@ -3971,15 +4498,15 @@ void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { // g = (r * 45 + g * 88 + b * 22) >> 7 // r = (r * 50 + g * 98 + b * 24) >> 7 // Constant for ARGB color to sepia tone -CONST vec8 kARGBToSepiaB = { +static vec8 kARGBToSepiaB = { 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0 }; -CONST vec8 kARGBToSepiaG = { +static vec8 kARGBToSepiaG = { 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0 }; -CONST vec8 kARGBToSepiaR = { +static vec8 kARGBToSepiaR = { 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0 }; @@ -3991,32 +4518,32 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) { "movdqa %4,%%xmm4 \n" // 8 pixel loop. - ".p2align 4 \n" + ".p2align 2 \n" "1: \n" - "movdqa (%0),%%xmm0 \n" - "movdqa 0x10(%0),%%xmm6 \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm6 \n" "pmaddubsw %%xmm2,%%xmm0 \n" "pmaddubsw %%xmm2,%%xmm6 \n" "phaddw %%xmm6,%%xmm0 \n" "psrlw $0x7,%%xmm0 \n" "packuswb %%xmm0,%%xmm0 \n" - "movdqa (%0),%%xmm5 \n" - "movdqa 0x10(%0),%%xmm1 \n" + "movdqa " MEMACCESS(0) ",%%xmm5 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" "pmaddubsw %%xmm3,%%xmm5 \n" "pmaddubsw %%xmm3,%%xmm1 \n" "phaddw %%xmm1,%%xmm5 \n" "psrlw $0x7,%%xmm5 \n" "packuswb %%xmm5,%%xmm5 \n" "punpcklbw %%xmm5,%%xmm0 \n" - "movdqa (%0),%%xmm5 \n" - "movdqa 0x10(%0),%%xmm1 \n" + "movdqa " MEMACCESS(0) ",%%xmm5 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" "pmaddubsw %%xmm4,%%xmm5 \n" "pmaddubsw %%xmm4,%%xmm1 \n" "phaddw %%xmm1,%%xmm5 \n" "psrlw $0x7,%%xmm5 \n" "packuswb %%xmm5,%%xmm5 \n" - "movdqa (%0),%%xmm6 \n" - "movdqa 0x10(%0),%%xmm1 \n" + "movdqa " MEMACCESS(0) ",%%xmm6 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" "psrld $0x18,%%xmm6 \n" "psrld $0x18,%%xmm1 \n" "packuswb %%xmm1,%%xmm6 \n" @@ -4026,9 +4553,9 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) { "punpcklwd %%xmm5,%%xmm0 \n" "punpckhwd %%xmm5,%%xmm1 \n" "sub $0x8,%1 \n" - "movdqa %%xmm0,(%0) \n" - "movdqa %%xmm1,0x10(%0) \n" - "lea 0x20(%0),%0 \n" + "movdqa %%xmm0," MEMACCESS(0) " \n" + "movdqa %%xmm1," MEMACCESS2(0x10,0) " \n" + "lea " MEMLEA(0x20,0) ",%0 \n" "jg 1b \n" : "+r"(dst_argb), // %0 "+r"(width) // %1 @@ -4046,62 +4573,64 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) { #ifdef HAS_ARGBCOLORMATRIXROW_SSSE3 // Tranform 8 ARGB pixels (32 bytes) with color matrix. // Same as Sepia except matrix is provided. 
-void ARGBColorMatrixRow_SSSE3(uint8* dst_argb, const int8* matrix_argb, - int width) { +void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb, + const int8* matrix_argb, int width) { asm volatile ( - "movd (%2),%%xmm2 \n" - "movd 0x4(%2),%%xmm3 \n" - "movd 0x8(%2),%%xmm4 \n" - "pshufd $0x0,%%xmm2,%%xmm2 \n" - "pshufd $0x0,%%xmm3,%%xmm3 \n" - "pshufd $0x0,%%xmm4,%%xmm4 \n" + "movdqu " MEMACCESS(3) ",%%xmm5 \n" + "pshufd $0x00,%%xmm5,%%xmm2 \n" + "pshufd $0x55,%%xmm5,%%xmm3 \n" + "pshufd $0xaa,%%xmm5,%%xmm4 \n" + "pshufd $0xff,%%xmm5,%%xmm5 \n" // 8 pixel loop. - ".p2align 4 \n" + ".p2align 2 \n" "1: \n" - "movdqa (%0),%%xmm0 \n" - "movdqa 0x10(%0),%%xmm6 \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm7 \n" "pmaddubsw %%xmm2,%%xmm0 \n" - "pmaddubsw %%xmm2,%%xmm6 \n" - "movdqa (%0),%%xmm5 \n" - "movdqa 0x10(%0),%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm5 \n" + "pmaddubsw %%xmm2,%%xmm7 \n" + "movdqa " MEMACCESS(0) ",%%xmm6 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" "pmaddubsw %%xmm3,%%xmm1 \n" - "phaddsw %%xmm6,%%xmm0 \n" - "phaddsw %%xmm1,%%xmm5 \n" - "psraw $0x7,%%xmm0 \n" - "psraw $0x7,%%xmm5 \n" + "phaddsw %%xmm7,%%xmm0 \n" + "phaddsw %%xmm1,%%xmm6 \n" + "psraw $0x6,%%xmm0 \n" + "psraw $0x6,%%xmm6 \n" "packuswb %%xmm0,%%xmm0 \n" - "packuswb %%xmm5,%%xmm5 \n" - "punpcklbw %%xmm5,%%xmm0 \n" - "movdqa (%0),%%xmm5 \n" - "movdqa 0x10(%0),%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm5 \n" + "packuswb %%xmm6,%%xmm6 \n" + "punpcklbw %%xmm6,%%xmm0 \n" + "movdqa " MEMACCESS(0) ",%%xmm1 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm7 \n" "pmaddubsw %%xmm4,%%xmm1 \n" - "phaddsw %%xmm1,%%xmm5 \n" - "psraw $0x7,%%xmm5 \n" - "packuswb %%xmm5,%%xmm5 \n" - "movdqa (%0),%%xmm6 \n" - "movdqa 0x10(%0),%%xmm1 \n" - "psrld $0x18,%%xmm6 \n" - "psrld $0x18,%%xmm1 \n" - "packuswb %%xmm1,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm7 \n" + "phaddsw %%xmm7,%%xmm1 \n" + "movdqa " MEMACCESS(0) ",%%xmm6 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm7 \n" + "pmaddubsw %%xmm5,%%xmm6 \n" + "pmaddubsw %%xmm5,%%xmm7 \n" + "phaddsw %%xmm7,%%xmm6 \n" + "psraw $0x6,%%xmm1 \n" + "psraw $0x6,%%xmm6 \n" + "packuswb %%xmm1,%%xmm1 \n" "packuswb %%xmm6,%%xmm6 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm6,%%xmm5 \n" - "punpcklwd %%xmm5,%%xmm0 \n" - "punpckhwd %%xmm5,%%xmm1 \n" - "sub $0x8,%1 \n" - "movdqa %%xmm0,(%0) \n" - "movdqa %%xmm1,0x10(%0) \n" - "lea 0x20(%0),%0 \n" + "punpcklbw %%xmm6,%%xmm1 \n" + "movdqa %%xmm0,%%xmm6 \n" + "punpcklwd %%xmm1,%%xmm0 \n" + "punpckhwd %%xmm1,%%xmm6 \n" + "sub $0x8,%2 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "movdqa %%xmm6," MEMACCESS2(0x10,1) " \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "lea " MEMLEA(0x20,1) ",%1 \n" "jg 1b \n" - : "+r"(dst_argb), // %0 - "+r"(width) // %1 - : "r"(matrix_argb) // %2 + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(matrix_argb) // %3 : "memory", "cc" #if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" #endif ); } @@ -4129,14 +4658,14 @@ void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size, // 4 pixel loop. 
".p2align 2 \n" "1: \n" - "movdqa (%0),%%xmm0 \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" "punpcklbw %%xmm5,%%xmm0 \n" "pmulhuw %%xmm2,%%xmm0 \n" - "movdqa (%0),%%xmm1 \n" + "movdqa " MEMACCESS(0) ",%%xmm1 \n" "punpckhbw %%xmm5,%%xmm1 \n" "pmulhuw %%xmm2,%%xmm1 \n" "pmullw %%xmm3,%%xmm0 \n" - "movdqa (%0),%%xmm7 \n" + "movdqa " MEMACCESS(0) ",%%xmm7 \n" "pmullw %%xmm3,%%xmm1 \n" "pand %%xmm6,%%xmm7 \n" "paddw %%xmm4,%%xmm0 \n" @@ -4144,8 +4673,8 @@ void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size, "packuswb %%xmm1,%%xmm0 \n" "por %%xmm7,%%xmm0 \n" "sub $0x4,%1 \n" - "movdqa %%xmm0,(%0) \n" - "lea 0x10(%0),%0 \n" + "movdqa %%xmm0," MEMACCESS(0) " \n" + "lea " MEMLEA(0x10,0) ",%0 \n" "jg 1b \n" : "+r"(dst_argb), // %0 "+r"(width) // %1 @@ -4167,14 +4696,14 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width, uint32 value) { asm volatile ( "movd %3,%%xmm2 \n" - "sub %0,%1 \n" "punpcklbw %%xmm2,%%xmm2 \n" "punpcklqdq %%xmm2,%%xmm2 \n" // 4 pixel loop. ".p2align 2 \n" "1: \n" - "movdqa (%0),%%xmm0 \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" "movdqa %%xmm0,%%xmm1 \n" "punpcklbw %%xmm0,%%xmm0 \n" "punpckhbw %%xmm1,%%xmm1 \n" @@ -4184,8 +4713,8 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width, "psrlw $0x8,%%xmm1 \n" "packuswb %%xmm1,%%xmm0 \n" "sub $0x4,%2 \n" - "movdqa %%xmm0,(%0,%1,1) \n" - "lea 0x10(%0),%0 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 @@ -4205,14 +4734,14 @@ void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, uint8* dst_argb, int width) { asm volatile ( "pxor %%xmm5,%%xmm5 \n" - "sub %0,%1 \n" - "sub %0,%2 \n" // 4 pixel loop. - ".p2align 4 \n" + ".p2align 2 \n" "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu (%0,%1),%%xmm2 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" + "movdqu " MEMACCESS(1) ",%%xmm2 \n" + "lea " MEMLEA(0x10,1) ",%1 \n" "movdqu %%xmm0,%%xmm1 \n" "movdqu %%xmm2,%%xmm3 \n" "punpcklbw %%xmm0,%%xmm0 \n" @@ -4223,8 +4752,8 @@ void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, "pmulhuw %%xmm3,%%xmm1 \n" "packuswb %%xmm1,%%xmm0 \n" "sub $0x4,%3 \n" - "movdqu %%xmm0,(%0,%2,1) \n" - "lea 0x10(%0),%0 \n" + "movdqu %%xmm0," MEMACCESS(2) " \n" + "lea " MEMLEA(0x10,2) ",%2 \n" "jg 1b \n" : "+r"(src_argb0), // %0 "+r"(src_argb1), // %1 @@ -4244,18 +4773,17 @@ void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, uint8* dst_argb, int width) { asm volatile ( - "sub %0,%1 \n" - "sub %0,%2 \n" - // 4 pixel loop. - ".p2align 4 \n" + ".p2align 2 \n" "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu (%0,%1),%%xmm1 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" + "movdqu " MEMACCESS(1) ",%%xmm1 \n" + "lea " MEMLEA(0x10,1) ",%1 \n" "paddusb %%xmm1,%%xmm0 \n" "sub $0x4,%3 \n" - "movdqu %%xmm0,(%0,%2,1) \n" - "lea 0x10(%0),%0 \n" + "movdqu %%xmm0," MEMACCESS(2) " \n" + "lea " MEMLEA(0x10,2) ",%2 \n" "jg 1b \n" : "+r"(src_argb0), // %0 "+r"(src_argb1), // %1 @@ -4275,18 +4803,17 @@ void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, uint8* dst_argb, int width) { asm volatile ( - "sub %0,%1 \n" - "sub %0,%2 \n" - // 4 pixel loop. 
- ".p2align 4 \n" + ".p2align 2 \n" "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu (%0,%1),%%xmm1 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" + "movdqu " MEMACCESS(1) ",%%xmm1 \n" + "lea " MEMLEA(0x10,1) ",%1 \n" "psubusb %%xmm1,%%xmm0 \n" "sub $0x4,%3 \n" - "movdqu %%xmm0,(%0,%2,1) \n" - "lea 0x10(%0),%0 \n" + "movdqu %%xmm0," MEMACCESS(2) " \n" + "lea " MEMLEA(0x10,2) ",%2 \n" "jg 1b \n" : "+r"(src_argb0), // %0 "+r"(src_argb1), // %1 @@ -4301,13 +4828,13 @@ void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, } #endif // HAS_ARGBSUBTRACTROW_SSE2 -#ifdef HAS_SOBELXROW_SSSE3 +#ifdef HAS_SOBELXROW_SSE2 // SobelX as a matrix is // -1 0 1 // -2 0 2 // -1 0 1 -void SobelXRow_SSSE3(const uint8* src_y0, const uint8* src_y1, - const uint8* src_y2, uint8* dst_sobelx, int width) { +void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1, + const uint8* src_y2, uint8* dst_sobelx, int width) { asm volatile ( "sub %0,%1 \n" "sub %0,%2 \n" @@ -4315,31 +4842,37 @@ void SobelXRow_SSSE3(const uint8* src_y0, const uint8* src_y1, "pxor %%xmm5,%%xmm5 \n" // 8 pixel loop. - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movq (%0),%%xmm0 \n" - "movq 0x2(%0),%%xmm1 \n" + "movq " MEMACCESS(0) ",%%xmm0 \n" + "movq " MEMACCESS2(0x2,0) ",%%xmm1 \n" "punpcklbw %%xmm5,%%xmm0 \n" "punpcklbw %%xmm5,%%xmm1 \n" "psubw %%xmm1,%%xmm0 \n" - "movq (%0,%1,1),%%xmm1 \n" - "movq 0x2(%0,%1,1),%%xmm2 \n" + BUNDLEALIGN + MEMOPREG(movq,0x00,0,1,1,xmm1) // movq (%0,%1,1),%%xmm1 + MEMOPREG(movq,0x02,0,1,1,xmm2) // movq 0x2(%0,%1,1),%%xmm2 "punpcklbw %%xmm5,%%xmm1 \n" "punpcklbw %%xmm5,%%xmm2 \n" "psubw %%xmm2,%%xmm1 \n" - "movq (%0,%2,1),%%xmm2 \n" - "movq 0x2(%0,%2,1),%%xmm3 \n" + BUNDLEALIGN + MEMOPREG(movq,0x00,0,2,1,xmm2) // movq (%0,%2,1),%%xmm2 + MEMOPREG(movq,0x02,0,2,1,xmm3) // movq 0x2(%0,%2,1),%%xmm3 "punpcklbw %%xmm5,%%xmm2 \n" "punpcklbw %%xmm5,%%xmm3 \n" "psubw %%xmm3,%%xmm2 \n" "paddw %%xmm2,%%xmm0 \n" "paddw %%xmm1,%%xmm0 \n" "paddw %%xmm1,%%xmm0 \n" - "pabsw %%xmm0,%%xmm0 \n" + "pxor %%xmm1,%%xmm1 \n" + "psubw %%xmm0,%%xmm1 \n" + "pmaxsw %%xmm1,%%xmm0 \n" "packuswb %%xmm0,%%xmm0 \n" "sub $0x8,%4 \n" - "movq %%xmm0,(%0,%3,1) \n" - "lea 0x8(%0),%0 \n" + BUNDLEALIGN + MEMOPMEM(movq,xmm0,0x00,0,3,1) // movq %%xmm0,(%0,%3,1) + "lea " MEMLEA(0x8,0) ",%0 \n" "jg 1b \n" : "+r"(src_y0), // %0 "+r"(src_y1), // %1 @@ -4348,51 +4881,60 @@ void SobelXRow_SSSE3(const uint8* src_y0, const uint8* src_y1, "+r"(width) // %4 : : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" #endif ); } -#endif // HAS_SOBELXROW_SSSE3 +#endif // HAS_SOBELXROW_SSE2 -#ifdef HAS_SOBELYROW_SSSE3 +#ifdef HAS_SOBELYROW_SSE2 // SobelY as a matrix is // -1 -2 -1 // 0 0 0 // 1 2 1 -void SobelYRow_SSSE3(const uint8* src_y0, const uint8* src_y1, - uint8* dst_sobely, int width) { +void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1, + uint8* dst_sobely, int width) { asm volatile ( "sub %0,%1 \n" "sub %0,%2 \n" "pxor %%xmm5,%%xmm5 \n" // 8 pixel loop. 
- ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movq (%0),%%xmm0 \n" - "movq (%0,%1,1),%%xmm1 \n" + "movq " MEMACCESS(0) ",%%xmm0 \n" + MEMOPREG(movq,0x00,0,1,1,xmm1) // movq (%0,%1,1),%%xmm1 "punpcklbw %%xmm5,%%xmm0 \n" "punpcklbw %%xmm5,%%xmm1 \n" "psubw %%xmm1,%%xmm0 \n" - "movq 0x1(%0),%%xmm1 \n" - "movq 0x1(%0,%1,1),%%xmm2 \n" + BUNDLEALIGN + "movq " MEMACCESS2(0x1,0) ",%%xmm1 \n" + MEMOPREG(movq,0x01,0,1,1,xmm2) // movq 0x1(%0,%1,1),%%xmm2 "punpcklbw %%xmm5,%%xmm1 \n" "punpcklbw %%xmm5,%%xmm2 \n" "psubw %%xmm2,%%xmm1 \n" - "movq 0x2(%0),%%xmm2 \n" - "movq 0x2(%0,%1,1),%%xmm3 \n" + BUNDLEALIGN + "movq " MEMACCESS2(0x2,0) ",%%xmm2 \n" + MEMOPREG(movq,0x02,0,1,1,xmm3) // movq 0x2(%0,%1,1),%%xmm3 "punpcklbw %%xmm5,%%xmm2 \n" "punpcklbw %%xmm5,%%xmm3 \n" "psubw %%xmm3,%%xmm2 \n" "paddw %%xmm2,%%xmm0 \n" "paddw %%xmm1,%%xmm0 \n" "paddw %%xmm1,%%xmm0 \n" - "pabsw %%xmm0,%%xmm0 \n" + "pxor %%xmm1,%%xmm1 \n" + "psubw %%xmm0,%%xmm1 \n" + "pmaxsw %%xmm1,%%xmm0 \n" "packuswb %%xmm0,%%xmm0 \n" "sub $0x8,%3 \n" - "movq %%xmm0,(%0,%2,1) \n" - "lea 0x8(%0),%0 \n" + BUNDLEALIGN + MEMOPMEM(movq,xmm0,0x00,0,2,1) // movq %%xmm0,(%0,%2,1) + "lea " MEMLEA(0x8,0) ",%0 \n" "jg 1b \n" : "+r"(src_y0), // %0 "+r"(src_y1), // %1 @@ -4400,12 +4942,15 @@ void SobelYRow_SSSE3(const uint8* src_y0, const uint8* src_y1, "+r"(width) // %3 : : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" #endif ); } -#endif // HAS_SOBELYROW_SSSE3 +#endif // HAS_SOBELYROW_SSE2 #ifdef HAS_SOBELROW_SSE2 // Adds Sobel X and Sobel Y and stores Sobel into ARGB. @@ -4414,18 +4959,19 @@ void SobelYRow_SSSE3(const uint8* src_y0, const uint8* src_y1, // G = Sobel // B = Sobel void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_argb, int width) { + uint8* dst_argb, int width) { asm volatile ( "sub %0,%1 \n" "pcmpeqb %%xmm5,%%xmm5 \n" "pslld $0x18,%%xmm5 \n" // 8 pixel loop. - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movdqa (%0),%%xmm0 \n" - "movdqa (%0,%1,1),%%xmm1 \n" - "lea 0x10(%0),%0 \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + MEMOPREG(movdqa,0x00,0,1,1,xmm1) // movdqa (%0,%1,1),%%xmm1 + "lea " MEMLEA(0x10,0) ",%0 \n" "paddusb %%xmm1,%%xmm0 \n" "movdqa %%xmm0,%%xmm2 \n" "punpcklbw %%xmm0,%%xmm2 \n" @@ -4441,11 +4987,11 @@ void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, "por %%xmm5,%%xmm3 \n" "por %%xmm5,%%xmm0 \n" "sub $0x10,%3 \n" - "movdqa %%xmm1,(%2) \n" - "movdqa %%xmm2,0x10(%2) \n" - "movdqa %%xmm3,0x20(%2) \n" - "movdqa %%xmm0,0x30(%2) \n" - "lea 0x40(%2),%2 \n" + "movdqa %%xmm1," MEMACCESS(2) " \n" + "movdqa %%xmm2," MEMACCESS2(0x10,2) " \n" + "movdqa %%xmm3," MEMACCESS2(0x20,2) " \n" + "movdqa %%xmm0," MEMACCESS2(0x30,2) " \n" + "lea " MEMLEA(0x40,2) ",%2 \n" "jg 1b \n" : "+r"(src_sobelx), // %0 "+r"(src_sobely), // %1 @@ -4453,6 +4999,9 @@ void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, "+r"(width) // %3 : : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" #endif @@ -4460,6 +5009,43 @@ void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, } #endif // HAS_SOBELROW_SSE2 +#ifdef HAS_SOBELTOPLANEROW_SSE2 +// Adds Sobel X and Sobel Y and stores Sobel into a plane. 
+void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_y, int width) { + asm volatile ( + "sub %0,%1 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "pslld $0x18,%%xmm5 \n" + + // 8 pixel loop. + ".p2align 2 \n" + BUNDLEALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + MEMOPREG(movdqa,0x00,0,1,1,xmm1) // movdqa (%0,%1,1),%%xmm1 + "lea " MEMLEA(0x10,0) ",%0 \n" + "paddusb %%xmm1,%%xmm0 \n" + "sub $0x10,%3 \n" + "movdqa %%xmm0," MEMACCESS(2) " \n" + "lea " MEMLEA(0x10,2) ",%2 \n" + "jg 1b \n" + : "+r"(src_sobelx), // %0 + "+r"(src_sobely), // %1 + "+r"(dst_y), // %2 + "+r"(width) // %3 + : + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1" +#endif + ); +} +#endif // HAS_SOBELTOPLANEROW_SSE2 + #ifdef HAS_SOBELXYROW_SSE2 // Mixes Sobel X, Sobel Y and Sobel into ARGB. // A = 255 @@ -4473,11 +5059,12 @@ void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, "pcmpeqb %%xmm5,%%xmm5 \n" // 8 pixel loop. - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movdqa (%0),%%xmm0 \n" - "movdqa (%0,%1,1),%%xmm1 \n" - "lea 0x10(%0),%0 \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + MEMOPREG(movdqa,0x00,0,1,1,xmm1) // movdqa (%0,%1,1),%%xmm1 + "lea " MEMLEA(0x10,0) ",%0 \n" "movdqa %%xmm0,%%xmm2 \n" "paddusb %%xmm1,%%xmm2 \n" "movdqa %%xmm0,%%xmm3 \n" @@ -4493,11 +5080,11 @@ void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, "punpcklwd %%xmm0,%%xmm7 \n" "punpckhwd %%xmm0,%%xmm1 \n" "sub $0x10,%3 \n" - "movdqa %%xmm6,(%2) \n" - "movdqa %%xmm4,0x10(%2) \n" - "movdqa %%xmm7,0x20(%2) \n" - "movdqa %%xmm1,0x30(%2) \n" - "lea 0x40(%2),%2 \n" + "movdqa %%xmm6," MEMACCESS(2) " \n" + "movdqa %%xmm4," MEMACCESS2(0x10,2) " \n" + "movdqa %%xmm7," MEMACCESS2(0x20,2) " \n" + "movdqa %%xmm1," MEMACCESS2(0x30,2) " \n" + "lea " MEMLEA(0x40,2) ",%2 \n" "jg 1b \n" : "+r"(src_sobelx), // %0 "+r"(src_sobely), // %1 @@ -4505,6 +5092,9 @@ void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, "+r"(width) // %3 : : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" #endif @@ -4518,7 +5108,6 @@ void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum, const int32* previous_cumsum, int width) { asm volatile ( - "sub %1,%2 \n" "pxor %%xmm0,%%xmm0 \n" "pxor %%xmm1,%%xmm1 \n" "sub $0x4,%3 \n" @@ -4529,8 +5118,8 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum, // 4 pixel loop \n" ".p2align 2 \n" "40: \n" - "movdqu (%0),%%xmm2 \n" - "lea 0x10(%0),%0 \n" + "movdqu " MEMACCESS(0) ",%%xmm2 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" "movdqa %%xmm2,%%xmm4 \n" "punpcklbw %%xmm1,%%xmm2 \n" "movdqa %%xmm2,%%xmm3 \n" @@ -4541,22 +5130,23 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum, "punpcklwd %%xmm1,%%xmm4 \n" "punpckhwd %%xmm1,%%xmm5 \n" "paddd %%xmm2,%%xmm0 \n" - "movdqa (%1,%2,1),%%xmm2 \n" + "movdqa " MEMACCESS(2) ",%%xmm2 \n" "paddd %%xmm0,%%xmm2 \n" "paddd %%xmm3,%%xmm0 \n" - "movdqa 0x10(%1,%2,1),%%xmm3 \n" + "movdqa " MEMACCESS2(0x10,2) ",%%xmm3 \n" "paddd %%xmm0,%%xmm3 \n" "paddd %%xmm4,%%xmm0 \n" - "movdqa 0x20(%1,%2,1),%%xmm4 \n" + "movdqa " MEMACCESS2(0x20,2) ",%%xmm4 \n" "paddd %%xmm0,%%xmm4 \n" "paddd %%xmm5,%%xmm0 \n" - "movdqa 0x30(%1,%2,1),%%xmm5 \n" + "movdqa " MEMACCESS2(0x30,2) ",%%xmm5 \n" + "lea " MEMLEA(0x40,2) ",%2 \n" 
"paddd %%xmm0,%%xmm5 \n" - "movdqa %%xmm2,(%1) \n" - "movdqa %%xmm3,0x10(%1) \n" - "movdqa %%xmm4,0x20(%1) \n" - "movdqa %%xmm5,0x30(%1) \n" - "lea 0x40(%1),%1 \n" + "movdqa %%xmm2," MEMACCESS(1) " \n" + "movdqa %%xmm3," MEMACCESS2(0x10,1) " \n" + "movdqa %%xmm4," MEMACCESS2(0x20,1) " \n" + "movdqa %%xmm5," MEMACCESS2(0x30,1) " \n" + "lea " MEMLEA(0x40,1) ",%1 \n" "sub $0x4,%3 \n" "jge 40b \n" @@ -4567,15 +5157,16 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum, // 1 pixel loop \n" ".p2align 2 \n" "10: \n" - "movd (%0),%%xmm2 \n" - "lea 0x4(%0),%0 \n" + "movd " MEMACCESS(0) ",%%xmm2 \n" + "lea " MEMLEA(0x4,0) ",%0 \n" "punpcklbw %%xmm1,%%xmm2 \n" "punpcklwd %%xmm1,%%xmm2 \n" "paddd %%xmm2,%%xmm0 \n" - "movdqu (%1,%2,1),%%xmm2 \n" + "movdqu " MEMACCESS(2) ",%%xmm2 \n" + "lea " MEMLEA(0x10,2) ",%2 \n" "paddd %%xmm0,%%xmm2 \n" - "movdqu %%xmm2,(%1) \n" - "lea 0x10(%1),%1 \n" + "movdqu %%xmm2," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" "sub $0x1,%3 \n" "jge 10b \n" @@ -4598,34 +5189,83 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft, int width, int area, uint8* dst, int count) { asm volatile ( - "movd %5,%%xmm4 \n" - "cvtdq2ps %%xmm4,%%xmm4 \n" - "rcpss %%xmm4,%%xmm4 \n" + "movd %5,%%xmm5 \n" + "cvtdq2ps %%xmm5,%%xmm5 \n" + "rcpss %%xmm5,%%xmm4 \n" "pshufd $0x0,%%xmm4,%%xmm4 \n" "sub $0x4,%3 \n" "jl 49f \n" + "cmpl $0x80,%5 \n" + "ja 40f \n" + + "pshufd $0x0,%%xmm5,%%xmm5 \n" + "pcmpeqb %%xmm6,%%xmm6 \n" + "psrld $0x10,%%xmm6 \n" + "cvtdq2ps %%xmm6,%%xmm6 \n" + "addps %%xmm6,%%xmm5 \n" + "mulps %%xmm4,%%xmm5 \n" + "cvtps2dq %%xmm5,%%xmm5 \n" + "packssdw %%xmm5,%%xmm5 \n" + + // 4 pixel small loop \n" + ".p2align 2 \n" + BUNDLEALIGN + "4: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n" + BUNDLEALIGN + MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0 + MEMOPREG(psubd,0x10,0,4,4,xmm1) // psubd 0x10(%0,%4,4),%%xmm1 + MEMOPREG(psubd,0x20,0,4,4,xmm2) // psubd 0x20(%0,%4,4),%%xmm2 + MEMOPREG(psubd,0x30,0,4,4,xmm3) // psubd 0x30(%0,%4,4),%%xmm3 + "lea " MEMLEA(0x40,0) ",%0 \n" + "psubd " MEMACCESS(1) ",%%xmm0 \n" + "psubd " MEMACCESS2(0x10,1) ",%%xmm1 \n" + "psubd " MEMACCESS2(0x20,1) ",%%xmm2 \n" + "psubd " MEMACCESS2(0x30,1) ",%%xmm3 \n" + BUNDLEALIGN + MEMOPREG(paddd,0x00,1,4,4,xmm0) // paddd 0x00(%1,%4,4),%%xmm0 + MEMOPREG(paddd,0x10,1,4,4,xmm1) // paddd 0x10(%1,%4,4),%%xmm1 + MEMOPREG(paddd,0x20,1,4,4,xmm2) // paddd 0x20(%1,%4,4),%%xmm2 + MEMOPREG(paddd,0x30,1,4,4,xmm3) // paddd 0x30(%1,%4,4),%%xmm3 + "lea " MEMLEA(0x40,1) ",%1 \n" + "packssdw %%xmm1,%%xmm0 \n" + "packssdw %%xmm3,%%xmm2 \n" + "pmulhuw %%xmm5,%%xmm0 \n" + "pmulhuw %%xmm5,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "movdqu %%xmm0," MEMACCESS(2) " \n" + "lea " MEMLEA(0x10,2) ",%2 \n" + "sub $0x4,%3 \n" + "jge 4b \n" + "jmp 49f \n" // 4 pixel loop \n" ".p2align 2 \n" + BUNDLEALIGN "40: \n" - "movdqa (%0),%%xmm0 \n" - "movdqa 0x10(%0),%%xmm1 \n" - "movdqa 0x20(%0),%%xmm2 \n" - "movdqa 0x30(%0),%%xmm3 \n" - "psubd (%0,%4,4),%%xmm0 \n" - "psubd 0x10(%0,%4,4),%%xmm1 \n" - "psubd 0x20(%0,%4,4),%%xmm2 \n" - "psubd 0x30(%0,%4,4),%%xmm3 \n" - "lea 0x40(%0),%0 \n" - "psubd (%1),%%xmm0 \n" - "psubd 0x10(%1),%%xmm1 \n" - "psubd 0x20(%1),%%xmm2 \n" - "psubd 0x30(%1),%%xmm3 \n" - "paddd (%1,%4,4),%%xmm0 \n" - "paddd 0x10(%1,%4,4),%%xmm1 \n" - "paddd 0x20(%1,%4,4),%%xmm2 \n" - "paddd 0x30(%1,%4,4),%%xmm3 \n" - "lea 0x40(%1),%1 \n" + "movdqa " 
MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" + "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n" + BUNDLEALIGN + MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0 + MEMOPREG(psubd,0x10,0,4,4,xmm1) // psubd 0x10(%0,%4,4),%%xmm1 + MEMOPREG(psubd,0x20,0,4,4,xmm2) // psubd 0x20(%0,%4,4),%%xmm2 + MEMOPREG(psubd,0x30,0,4,4,xmm3) // psubd 0x30(%0,%4,4),%%xmm3 + "lea " MEMLEA(0x40,0) ",%0 \n" + "psubd " MEMACCESS(1) ",%%xmm0 \n" + "psubd " MEMACCESS2(0x10,1) ",%%xmm1 \n" + "psubd " MEMACCESS2(0x20,1) ",%%xmm2 \n" + "psubd " MEMACCESS2(0x30,1) ",%%xmm3 \n" + BUNDLEALIGN + MEMOPREG(paddd,0x00,1,4,4,xmm0) // paddd 0x00(%1,%4,4),%%xmm0 + MEMOPREG(paddd,0x10,1,4,4,xmm1) // paddd 0x10(%1,%4,4),%%xmm1 + MEMOPREG(paddd,0x20,1,4,4,xmm2) // paddd 0x20(%1,%4,4),%%xmm2 + MEMOPREG(paddd,0x30,1,4,4,xmm3) // paddd 0x30(%1,%4,4),%%xmm3 + "lea " MEMLEA(0x40,1) ",%1 \n" "cvtdq2ps %%xmm0,%%xmm0 \n" "cvtdq2ps %%xmm1,%%xmm1 \n" "mulps %%xmm4,%%xmm0 \n" @@ -4641,8 +5281,8 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft, "packssdw %%xmm1,%%xmm0 \n" "packssdw %%xmm3,%%xmm2 \n" "packuswb %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,(%2) \n" - "lea 0x10(%2),%2 \n" + "movdqu %%xmm0," MEMACCESS(2) " \n" + "lea " MEMLEA(0x10,2) ",%2 \n" "sub $0x4,%3 \n" "jge 40b \n" @@ -4652,20 +5292,22 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft, // 1 pixel loop \n" ".p2align 2 \n" + BUNDLEALIGN "10: \n" - "movdqa (%0),%%xmm0 \n" - "psubd (%0,%4,4),%%xmm0 \n" - "lea 0x10(%0),%0 \n" - "psubd (%1),%%xmm0 \n" - "paddd (%1,%4,4),%%xmm0 \n" - "lea 0x10(%1),%1 \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0 + "lea " MEMLEA(0x10,0) ",%0 \n" + "psubd " MEMACCESS(1) ",%%xmm0 \n" + BUNDLEALIGN + MEMOPREG(paddd,0x00,1,4,4,xmm0) // paddd 0x00(%1,%4,4),%%xmm0 + "lea " MEMLEA(0x10,1) ",%1 \n" "cvtdq2ps %%xmm0,%%xmm0 \n" "mulps %%xmm4,%%xmm0 \n" "cvtps2dq %%xmm0,%%xmm0 \n" "packssdw %%xmm0,%%xmm0 \n" "packuswb %%xmm0,%%xmm0 \n" - "movd %%xmm0,(%2) \n" - "lea 0x4(%2),%2 \n" + "movd %%xmm0," MEMACCESS(2) " \n" + "lea " MEMLEA(0x4,2) ",%2 \n" "sub $0x1,%3 \n" "jge 10b \n" "19: \n" @@ -4676,27 +5318,26 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft, : "r"(static_cast<intptr_t>(width)), // %4 "rm"(area) // %5 : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" #endif ); } #endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 #ifdef HAS_ARGBAFFINEROW_SSE2 -// TODO(fbarchard): Find 64 bit way to avoid masking. // Copy ARGB pixels from source image with slope to a row of destination. -// Caveat - in 64 bit, movd is used with 64 bit gpr due to Mac gcc producing -// an error if movq is used. 
movd %%xmm0,%1 - LIBYUV_API void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, uint8* dst_argb, const float* src_dudv, int width) { intptr_t src_argb_stride_temp = src_argb_stride; intptr_t temp = 0; asm volatile ( - "movq (%3),%%xmm2 \n" - "movq 0x8(%3),%%xmm7 \n" + "movq " MEMACCESS(3) ",%%xmm2 \n" + "movq " MEMACCESS2(0x08,3) ",%%xmm7 \n" "shl $0x10,%1 \n" "add $0x4,%1 \n" "movd %1,%%xmm5 \n" @@ -4715,46 +5356,31 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, "addps %%xmm4,%%xmm4 \n" // 4 pixel loop \n" - ".p2align 4 \n" + ".p2align 2 \n" "40: \n" - "cvttps2dq %%xmm2,%%xmm0 \n" - "cvttps2dq %%xmm3,%%xmm1 \n" - "packssdw %%xmm1,%%xmm0 \n" - "pmaddwd %%xmm5,%%xmm0 \n" -#if defined(__x86_64__) - "movd %%xmm0,%1 \n" - "mov %1,%5 \n" - "and $0x0fffffff,%1 \n" - "shr $32,%5 \n" - "pshufd $0xEE,%%xmm0,%%xmm0 \n" -#else - "movd %%xmm0,%1 \n" + "cvttps2dq %%xmm2,%%xmm0 \n" // x, y float to int first 2 + "cvttps2dq %%xmm3,%%xmm1 \n" // x, y float to int next 2 + "packssdw %%xmm1,%%xmm0 \n" // x, y as 8 shorts + "pmaddwd %%xmm5,%%xmm0 \n" // off = x * 4 + y * stride + "movd %%xmm0,%k1 \n" "pshufd $0x39,%%xmm0,%%xmm0 \n" - "movd %%xmm0,%5 \n" + "movd %%xmm0,%k5 \n" "pshufd $0x39,%%xmm0,%%xmm0 \n" -#endif - "movd (%0,%1,1),%%xmm1 \n" - "movd (%0,%5,1),%%xmm6 \n" + MEMOPREG(movd,0x00,0,1,1,xmm1) // movd (%0,%1,1),%%xmm1 + MEMOPREG(movd,0x00,0,5,1,xmm6) // movd (%0,%5,1),%%xmm6 "punpckldq %%xmm6,%%xmm1 \n" "addps %%xmm4,%%xmm2 \n" - "movq %%xmm1,(%2) \n" -#if defined(__x86_64__) - "movd %%xmm0,%1 \n" - "mov %1,%5 \n" - "and $0x0fffffff,%1 \n" - "shr $32,%5 \n" -#else - "movd %%xmm0,%1 \n" + "movq %%xmm1," MEMACCESS(2) " \n" + "movd %%xmm0,%k1 \n" "pshufd $0x39,%%xmm0,%%xmm0 \n" - "movd %%xmm0,%5 \n" -#endif - "movd (%0,%1,1),%%xmm0 \n" - "movd (%0,%5,1),%%xmm6 \n" + "movd %%xmm0,%k5 \n" + MEMOPREG(movd,0x00,0,1,1,xmm0) // movd (%0,%1,1),%%xmm0 + MEMOPREG(movd,0x00,0,5,1,xmm6) // movd (%0,%5,1),%%xmm6 "punpckldq %%xmm6,%%xmm0 \n" "addps %%xmm4,%%xmm3 \n" "sub $0x4,%4 \n" - "movq %%xmm0,0x08(%2) \n" - "lea 0x10(%2),%2 \n" + "movq %%xmm0," MEMACCESS2(0x08,2) " \n" + "lea " MEMLEA(0x10,2) ",%2 \n" "jge 40b \n" "49: \n" @@ -4762,20 +5388,18 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, "jl 19f \n" // 1 pixel loop \n" - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "10: \n" "cvttps2dq %%xmm2,%%xmm0 \n" "packssdw %%xmm0,%%xmm0 \n" "pmaddwd %%xmm5,%%xmm0 \n" "addps %%xmm7,%%xmm2 \n" - "movd %%xmm0,%1 \n" -#if defined(__x86_64__) - "and $0x0fffffff,%1 \n" -#endif - "movd (%0,%1,1),%%xmm0 \n" + "movd %%xmm0,%k1 \n" + MEMOPREG(movd,0x00,0,1,1,xmm0) // movd (%0,%1,1),%%xmm0 "sub $0x1,%4 \n" - "movd %%xmm0,(%2) \n" - "lea 0x4(%2),%2 \n" + "movd %%xmm0," MEMACCESS(2) " \n" + "lea " MEMLEA(0x04,2) ",%2 \n" "jge 10b \n" "19: \n" : "+r"(src_argb), // %0 @@ -4786,6 +5410,9 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, "+r"(temp) // %5 : : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" #endif @@ -4793,6 +5420,7 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, } #endif // HAS_ARGBAFFINEROW_SSE2 +#ifdef HAS_INTERPOLATEROW_SSSE3 // Bilinear filter 16x2 -> 16x1 void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, ptrdiff_t src_stride, int dst_width, @@ -4818,10 +5446,11 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, "pshufd $0x0,%%xmm5,%%xmm5 \n" // 
General purpose row blend. - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movdqa (%1),%%xmm0 \n" - "movdqa (%1,%4,1),%%xmm2 \n" + "movdqa " MEMACCESS(1) ",%%xmm0 \n" + MEMOPREG(movdqa,0x00,1,4,1,xmm2) "movdqa %%xmm0,%%xmm1 \n" "punpcklbw %%xmm2,%%xmm0 \n" "punpckhbw %%xmm2,%%xmm1 \n" @@ -4831,56 +5460,64 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, "psrlw $0x7,%%xmm1 \n" "packuswb %%xmm1,%%xmm0 \n" "sub $0x10,%2 \n" - "movdqa %%xmm0,(%1,%0,1) \n" - "lea 0x10(%1),%1 \n" + BUNDLEALIGN + MEMOPMEM(movdqa,xmm0,0x00,1,0,1) + "lea " MEMLEA(0x10,1) ",%1 \n" "jg 1b \n" "jmp 99f \n" // Blend 25 / 75. - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "25: \n" - "movdqa (%1),%%xmm0 \n" - "movdqa (%1,%4,1),%%xmm1 \n" + "movdqa " MEMACCESS(1) ",%%xmm0 \n" + MEMOPREG(movdqa,0x00,1,4,1,xmm1) "pavgb %%xmm1,%%xmm0 \n" "pavgb %%xmm1,%%xmm0 \n" "sub $0x10,%2 \n" - "movdqa %%xmm0,(%1,%0,1) \n" - "lea 0x10(%1),%1 \n" + BUNDLEALIGN + MEMOPMEM(movdqa,xmm0,0x00,1,0,1) + "lea " MEMLEA(0x10,1) ",%1 \n" "jg 25b \n" "jmp 99f \n" // Blend 50 / 50. - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "50: \n" - "movdqa (%1),%%xmm0 \n" - "movdqa (%1,%4,1),%%xmm1 \n" + "movdqa " MEMACCESS(1) ",%%xmm0 \n" + MEMOPREG(movdqa,0x00,1,4,1,xmm1) "pavgb %%xmm1,%%xmm0 \n" "sub $0x10,%2 \n" - "movdqa %%xmm0,(%1,%0,1) \n" - "lea 0x10(%1),%1 \n" + BUNDLEALIGN + MEMOPMEM(movdqa,xmm0,0x00,1,0,1) + "lea " MEMLEA(0x10,1) ",%1 \n" "jg 50b \n" "jmp 99f \n" // Blend 75 / 25. - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "75: \n" - "movdqa (%1),%%xmm1 \n" - "movdqa (%1,%4,1),%%xmm0 \n" + "movdqa " MEMACCESS(1) ",%%xmm1 \n" + MEMOPREG(movdqa,0x00,1,4,1,xmm0) "pavgb %%xmm1,%%xmm0 \n" "pavgb %%xmm1,%%xmm0 \n" "sub $0x10,%2 \n" - "movdqa %%xmm0,(%1,%0,1) \n" - "lea 0x10(%1),%1 \n" + BUNDLEALIGN + MEMOPMEM(movdqa,xmm0,0x00,1,0,1) + "lea " MEMLEA(0x10,1) ",%1 \n" "jg 75b \n" "jmp 99f \n" // Blend 100 / 0 - Copy row unchanged. - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "100: \n" - "movdqa (%1),%%xmm0 \n" + "movdqa " MEMACCESS(1) ",%%xmm0 \n" "sub $0x10,%2 \n" - "movdqa %%xmm0,(%1,%0,1) \n" - "lea 0x10(%1),%1 \n" + MEMOPMEM(movdqa,xmm0,0x00,1,0,1) + "lea " MEMLEA(0x10,1) ",%1 \n" "jg 100b \n" "99: \n" @@ -4890,11 +5527,15 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, "+r"(source_y_fraction) // %3 : "r"(static_cast<intptr_t>(src_stride)) // %4 : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm5" #endif ); } +#endif // HAS_INTERPOLATEROW_SSSE3 #ifdef HAS_INTERPOLATEROW_SSE2 // Bilinear filter 16x2 -> 16x1 @@ -4923,10 +5564,11 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr, "pxor %%xmm4,%%xmm4 \n" // General purpose row blend. - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movdqa (%1),%%xmm0 \n" - "movdqa (%1,%4,1),%%xmm2 \n" + "movdqa " MEMACCESS(1) ",%%xmm0 \n" + MEMOPREG(movdqa,0x00,1,4,1,xmm2) // movdqa (%1,%4,1),%%xmm2 "movdqa %%xmm0,%%xmm1 \n" "movdqa %%xmm2,%%xmm3 \n" "punpcklbw %%xmm4,%%xmm2 \n" @@ -4943,56 +5585,64 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr, "paddw %%xmm3,%%xmm1 \n" "packuswb %%xmm1,%%xmm0 \n" "sub $0x10,%2 \n" - "movdqa %%xmm0,(%1,%0,1) \n" - "lea 0x10(%1),%1 \n" + BUNDLEALIGN + MEMOPMEM(movdqa,xmm0,0x00,1,0,1) // movdqa %%xmm0,(%1,%0,1) + "lea " MEMLEA(0x10,1) ",%1 \n" "jg 1b \n" "jmp 99f \n" // Blend 25 / 75. 
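// Editor's sketch (not libyuv code): scalar form of the SSSE3 general row
// blend above (the pmaddubsw / psrlw $0x7 path).  Assumption: the two
// fractions packed into xmm5 sum to 128, which is what the final >> 7
// implies.
#include <stdint.h>
static uint8_t BlendRows(uint8_t row0, uint8_t row1, int y1_fraction) {
  int f1 = y1_fraction;        // weight of the second row, 0..128 (assumed)
  int f0 = 128 - f1;           // weight of the first row
  return (uint8_t)((row0 * f0 + row1 * f1) >> 7);
}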
- ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "25: \n" - "movdqa (%1),%%xmm0 \n" - "movdqa (%1,%4,1),%%xmm1 \n" + "movdqa " MEMACCESS(1) ",%%xmm0 \n" + MEMOPREG(movdqa,0x00,1,4,1,xmm1) // movdqa (%1,%4,1),%%xmm1 "pavgb %%xmm1,%%xmm0 \n" "pavgb %%xmm1,%%xmm0 \n" "sub $0x10,%2 \n" - "movdqa %%xmm0,(%1,%0,1) \n" - "lea 0x10(%1),%1 \n" + BUNDLEALIGN + MEMOPMEM(movdqa,xmm0,0x00,1,0,1) // movdqa %%xmm0,(%1,%0,1) + "lea " MEMLEA(0x10,1) ",%1 \n" "jg 25b \n" "jmp 99f \n" // Blend 50 / 50. - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "50: \n" - "movdqa (%1),%%xmm0 \n" - "movdqa (%1,%4,1),%%xmm1 \n" + "movdqa " MEMACCESS(1) ",%%xmm0 \n" + MEMOPREG(movdqa,0x00,1,4,1,xmm1) // movdqa (%1,%4,1),%%xmm1 "pavgb %%xmm1,%%xmm0 \n" "sub $0x10,%2 \n" - "movdqa %%xmm0,(%1,%0,1) \n" - "lea 0x10(%1),%1 \n" + BUNDLEALIGN + MEMOPMEM(movdqa,xmm0,0x00,1,0,1) // movdqa %%xmm0,(%1,%0,1) + "lea " MEMLEA(0x10,1) ",%1 \n" "jg 50b \n" "jmp 99f \n" // Blend 75 / 25. - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "75: \n" - "movdqa (%1),%%xmm1 \n" - "movdqa (%1,%4,1),%%xmm0 \n" + "movdqa " MEMACCESS(1) ",%%xmm1 \n" + MEMOPREG(movdqa,0x00,1,4,1,xmm0) // movdqa (%1,%4,1),%%xmm0 "pavgb %%xmm1,%%xmm0 \n" "pavgb %%xmm1,%%xmm0 \n" "sub $0x10,%2 \n" - "movdqa %%xmm0,(%1,%0,1) \n" - "lea 0x10(%1),%1 \n" + BUNDLEALIGN + MEMOPMEM(movdqa,xmm0,0x00,1,0,1) // movdqa %%xmm0,(%1,%0,1) + "lea " MEMLEA(0x10,1) ",%1 \n" "jg 75b \n" "jmp 99f \n" // Blend 100 / 0 - Copy row unchanged. - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "100: \n" - "movdqa (%1),%%xmm0 \n" + "movdqa " MEMACCESS(1) ",%%xmm0 \n" "sub $0x10,%2 \n" - "movdqa %%xmm0,(%1,%0,1) \n" - "lea 0x10(%1),%1 \n" + MEMOPMEM(movdqa,xmm0,0x00,1,0,1) // movdqa %%xmm0,(%1,%0,1) + "lea " MEMLEA(0x10,1) ",%1 \n" "jg 100b \n" "99: \n" @@ -5002,6 +5652,9 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr, "+r"(source_y_fraction) // %3 : "r"(static_cast<intptr_t>(src_stride)) // %4 : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" #endif @@ -5009,6 +5662,7 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr, } #endif // HAS_INTERPOLATEROW_SSE2 +#ifdef HAS_INTERPOLATEROW_SSSE3 // Bilinear filter 16x2 -> 16x1 void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr, ptrdiff_t src_stride, int dst_width, @@ -5034,10 +5688,11 @@ void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr, "pshufd $0x0,%%xmm5,%%xmm5 \n" // General purpose row blend. - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movdqu (%1),%%xmm0 \n" - "movdqu (%1,%4,1),%%xmm2 \n" + "movdqu " MEMACCESS(1) ",%%xmm0 \n" + MEMOPREG(movdqu,0x00,1,4,1,xmm2) "movdqu %%xmm0,%%xmm1 \n" "punpcklbw %%xmm2,%%xmm0 \n" "punpckhbw %%xmm2,%%xmm1 \n" @@ -5047,56 +5702,64 @@ void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr, "psrlw $0x7,%%xmm1 \n" "packuswb %%xmm1,%%xmm0 \n" "sub $0x10,%2 \n" - "movdqu %%xmm0,(%1,%0,1) \n" - "lea 0x10(%1),%1 \n" + BUNDLEALIGN + MEMOPMEM(movdqu,xmm0,0x00,1,0,1) + "lea " MEMLEA(0x10,1) ",%1 \n" "jg 1b \n" "jmp 99f \n" // Blend 25 / 75. 
- ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "25: \n" - "movdqu (%1),%%xmm0 \n" - "movdqu (%1,%4,1),%%xmm1 \n" + "movdqu " MEMACCESS(1) ",%%xmm0 \n" + MEMOPREG(movdqu,0x00,1,4,1,xmm1) "pavgb %%xmm1,%%xmm0 \n" "pavgb %%xmm1,%%xmm0 \n" "sub $0x10,%2 \n" - "movdqu %%xmm0,(%1,%0,1) \n" - "lea 0x10(%1),%1 \n" + BUNDLEALIGN + MEMOPMEM(movdqu,xmm0,0x00,1,0,1) + "lea " MEMLEA(0x10,1) ",%1 \n" "jg 25b \n" "jmp 99f \n" // Blend 50 / 50. - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "50: \n" - "movdqu (%1),%%xmm0 \n" - "movdqu (%1,%4,1),%%xmm1 \n" + "movdqu " MEMACCESS(1) ",%%xmm0 \n" + MEMOPREG(movdqu,0x00,1,4,1,xmm1) "pavgb %%xmm1,%%xmm0 \n" "sub $0x10,%2 \n" - "movdqu %%xmm0,(%1,%0,1) \n" - "lea 0x10(%1),%1 \n" + BUNDLEALIGN + MEMOPMEM(movdqu,xmm0,0x00,1,0,1) + "lea " MEMLEA(0x10,1) ",%1 \n" "jg 50b \n" "jmp 99f \n" // Blend 75 / 25. - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "75: \n" - "movdqu (%1),%%xmm1 \n" - "movdqu (%1,%4,1),%%xmm0 \n" + "movdqu " MEMACCESS(1) ",%%xmm1 \n" + MEMOPREG(movdqu,0x00,1,4,1,xmm0) "pavgb %%xmm1,%%xmm0 \n" "pavgb %%xmm1,%%xmm0 \n" "sub $0x10,%2 \n" - "movdqu %%xmm0,(%1,%0,1) \n" - "lea 0x10(%1),%1 \n" + BUNDLEALIGN + MEMOPMEM(movdqu,xmm0,0x00,1,0,1) + "lea " MEMLEA(0x10,1) ",%1 \n" "jg 75b \n" "jmp 99f \n" // Blend 100 / 0 - Copy row unchanged. - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "100: \n" - "movdqu (%1),%%xmm0 \n" + "movdqu " MEMACCESS(1) ",%%xmm0 \n" "sub $0x10,%2 \n" - "movdqu %%xmm0,(%1,%0,1) \n" - "lea 0x10(%1),%1 \n" + MEMOPMEM(movdqu,xmm0,0x00,1,0,1) + "lea " MEMLEA(0x10,1) ",%1 \n" "jg 100b \n" "99: \n" @@ -5106,11 +5769,15 @@ void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr, "+r"(source_y_fraction) // %3 : "r"(static_cast<intptr_t>(src_stride)) // %4 : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm5" #endif ); } +#endif // HAS_INTERPOLATEROW_SSSE3 #ifdef HAS_INTERPOLATEROW_SSE2 // Bilinear filter 16x2 -> 16x1 @@ -5139,10 +5806,11 @@ void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr, "pxor %%xmm4,%%xmm4 \n" // General purpose row blend. - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movdqu (%1),%%xmm0 \n" - "movdqu (%1,%4,1),%%xmm2 \n" + "movdqu " MEMACCESS(1) ",%%xmm0 \n" + MEMOPREG(movdqu,0x00,1,4,1,xmm2) // movdqu (%1,%4,1),%%xmm2 "movdqu %%xmm0,%%xmm1 \n" "movdqu %%xmm2,%%xmm3 \n" "punpcklbw %%xmm4,%%xmm2 \n" @@ -5159,56 +5827,64 @@ void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr, "paddw %%xmm3,%%xmm1 \n" "packuswb %%xmm1,%%xmm0 \n" "sub $0x10,%2 \n" - "movdqu %%xmm0,(%1,%0,1) \n" - "lea 0x10(%1),%1 \n" + BUNDLEALIGN + MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1) + "lea " MEMLEA(0x10,1) ",%1 \n" "jg 1b \n" "jmp 99f \n" // Blend 25 / 75. - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "25: \n" - "movdqu (%1),%%xmm0 \n" - "movdqu (%1,%4,1),%%xmm1 \n" + "movdqu " MEMACCESS(1) ",%%xmm0 \n" + MEMOPREG(movdqu,0x00,1,4,1,xmm1) // movdqu (%1,%4,1),%%xmm1 "pavgb %%xmm1,%%xmm0 \n" "pavgb %%xmm1,%%xmm0 \n" "sub $0x10,%2 \n" - "movdqu %%xmm0,(%1,%0,1) \n" - "lea 0x10(%1),%1 \n" + BUNDLEALIGN + MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1) + "lea " MEMLEA(0x10,1) ",%1 \n" "jg 25b \n" "jmp 99f \n" // Blend 50 / 50. 
- ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "50: \n" - "movdqu (%1),%%xmm0 \n" - "movdqu (%1,%4,1),%%xmm1 \n" + "movdqu " MEMACCESS(1) ",%%xmm0 \n" + MEMOPREG(movdqu,0x00,1,4,1,xmm1) // movdqu (%1,%4,1),%%xmm1 "pavgb %%xmm1,%%xmm0 \n" "sub $0x10,%2 \n" - "movdqu %%xmm0,(%1,%0,1) \n" - "lea 0x10(%1),%1 \n" + BUNDLEALIGN + MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1) + "lea " MEMLEA(0x10,1) ",%1 \n" "jg 50b \n" "jmp 99f \n" // Blend 75 / 25. - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "75: \n" - "movdqu (%1),%%xmm1 \n" - "movdqu (%1,%4,1),%%xmm0 \n" + "movdqu " MEMACCESS(1) ",%%xmm1 \n" + MEMOPREG(movdqu,0x00,1,4,1,xmm0) // movdqu (%1,%4,1),%%xmm0 "pavgb %%xmm1,%%xmm0 \n" "pavgb %%xmm1,%%xmm0 \n" "sub $0x10,%2 \n" - "movdqu %%xmm0,(%1,%0,1) \n" - "lea 0x10(%1),%1 \n" + BUNDLEALIGN + MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1) + "lea " MEMLEA(0x10,1) ",%1 \n" "jg 75b \n" "jmp 99f \n" // Blend 100 / 0 - Copy row unchanged. - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "100: \n" - "movdqu (%1),%%xmm0 \n" + "movdqu " MEMACCESS(1) ",%%xmm0 \n" "sub $0x10,%2 \n" - "movdqu %%xmm0,(%1,%0,1) \n" - "lea 0x10(%1),%1 \n" + MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1) + "lea " MEMLEA(0x10,1) ",%1 \n" "jg 100b \n" "99: \n" @@ -5218,6 +5894,9 @@ void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr, "+r"(source_y_fraction) // %3 : "r"(static_cast<intptr_t>(src_stride)) // %4 : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" #endif @@ -5225,17 +5904,19 @@ void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr, } #endif // HAS_INTERPOLATEROW_SSE2 +#ifdef HAS_HALFROW_SSE2 void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride, uint8* dst_uv, int pix) { asm volatile ( "sub %0,%1 \n" - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movdqa (%0),%%xmm0 \n" - "pavgb (%0,%3),%%xmm0 \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + MEMOPREG(pavgb,0x00,0,3,1,xmm0) // pavgb (%0,%3),%%xmm0 "sub $0x10,%2 \n" - "movdqa %%xmm0,(%0,%1) \n" - "lea 0x10(%0),%0 \n" + MEMOPMEM(movdqa,xmm0,0x00,0,1,1) // movdqa %%xmm0,(%0,%1) + "lea " MEMLEA(0x10,0) ",%0 \n" "jg 1b \n" : "+r"(src_uv), // %0 "+r"(dst_uv), // %1 @@ -5247,23 +5928,26 @@ void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride, #endif ); } +#endif // HAS_HALFROW_SSE2 +#ifdef HAS_ARGBTOBAYERROW_SSSE3 void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer, uint32 selector, int pix) { asm volatile ( + // NaCL caveat - assumes movd is from GPR "movd %3,%%xmm5 \n" "pshufd $0x0,%%xmm5,%%xmm5 \n" - ".p2align 4 \n" + ".p2align 2 \n" "1: \n" - "movdqa (%0),%%xmm0 \n" - "movdqa 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" "pshufb %%xmm5,%%xmm0 \n" "pshufb %%xmm5,%%xmm1 \n" "punpckldq %%xmm1,%%xmm0 \n" "sub $0x8,%2 \n" - "movq %%xmm0,(%1) \n" - "lea 0x8(%1),%1 \n" + "movq %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x8,1) ",%1 \n" "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_bayer), // %1 @@ -5275,23 +5959,58 @@ void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer, #endif ); } +#endif // HAS_ARGBTOBAYERROW_SSSE3 + +#ifdef HAS_ARGBTOBAYERGGROW_SSE2 +void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer, + uint32 selector, int pix) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrld $0x18,%%xmm5 \n" + ".p2align 2 
\n" + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "psrld $0x8,%%xmm0 \n" + "psrld $0x8,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packssdw %%xmm1,%%xmm0 \n" + "packuswb %%xmm1,%%xmm0 \n" + "sub $0x8,%2 \n" + "movq %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x8,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_bayer), // %1 + "+r"(pix) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm5" +#endif + ); +} +#endif // HAS_ARGBTOBAYERGGROW_SSE2 +#ifdef HAS_ARGBSHUFFLEROW_SSSE3 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb, const uint8* shuffler, int pix) { asm volatile ( - "movdqa (%3),%%xmm5 \n" - ".p2align 4 \n" + "movdqa " MEMACCESS(3) ",%%xmm5 \n" + ".p2align 2 \n" "1: \n" - "movdqa (%0),%%xmm0 \n" - "movdqa 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" "pshufb %%xmm5,%%xmm0 \n" "pshufb %%xmm5,%%xmm1 \n" "sub $0x8,%2 \n" - "movdqa %%xmm0,(%1) \n" - "movdqa %%xmm1,0x10(%1) \n" - "lea 0x20(%1),%1 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n" + "lea " MEMLEA(0x20,1) ",%1 \n" "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 @@ -5307,18 +6026,48 @@ void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb, void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb, const uint8* shuffler, int pix) { asm volatile ( - "movdqa (%3),%%xmm5 \n" - ".p2align 4 \n" + "movdqa " MEMACCESS(3) ",%%xmm5 \n" + ".p2align 2 \n" "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" "pshufb %%xmm5,%%xmm0 \n" "pshufb %%xmm5,%%xmm1 \n" "sub $0x8,%2 \n" - "movdqu %%xmm0,(%1) \n" - "movdqu %%xmm1,0x10(%1) \n" - "lea 0x20(%1),%1 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" + "lea " MEMLEA(0x20,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : "r"(shuffler) // %3 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm5" +#endif + ); +} +#endif // HAS_ARGBSHUFFLEROW_SSSE3 + +#ifdef HAS_ARGBSHUFFLEROW_AVX2 +// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. +void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb, + const uint8* shuffler, int pix) { + asm volatile ( + "vbroadcastf128 " MEMACCESS(3) ",%%ymm5 \n" + ".p2align 2 \n" + "1: \n" + "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" + "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" + "lea " MEMLEA(0x40,0) ",%0 \n" + "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" + "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" + "sub $0x10,%2 \n" + "vmovdqu %%ymm0," MEMACCESS(1) " \n" + "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n" + "lea " MEMLEA(0x40,1) ",%1 \n" "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 @@ -5330,27 +6079,156 @@ void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb, #endif ); } +#endif // HAS_ARGBSHUFFLEROW_AVX2 +#ifdef HAS_ARGBSHUFFLEROW_SSE2 +// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. 
+void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb, + const uint8* shuffler, int pix) { + uintptr_t pixel_temp = 0u; + asm volatile ( + "pxor %%xmm5,%%xmm5 \n" + "mov " MEMACCESS(4) ",%k2 \n" + "cmp $0x3000102,%k2 \n" + "je 3012f \n" + "cmp $0x10203,%k2 \n" + "je 123f \n" + "cmp $0x30201,%k2 \n" + "je 321f \n" + "cmp $0x2010003,%k2 \n" + "je 2103f \n" + + BUNDLEALIGN + "1: \n" + "movzb " MEMACCESS(4) ",%2 \n" + MEMOP(movzb,0x00,0,2,1) ",%2 \n" // movzb (%0,%2,1),%2 + "mov %b2," MEMACCESS(1) " \n" + "movzb " MEMACCESS2(0x1,4) ",%2 \n" + MEMOP(movzb,0x00,0,2,1) ",%2 \n" // movzb (%0,%2,1),%2 + "mov %b2," MEMACCESS2(0x1,1) " \n" + BUNDLEALIGN + "movzb " MEMACCESS2(0x2,4) ",%2 \n" + MEMOP(movzb,0x00,0,2,1) ",%2 \n" // movzb (%0,%2,1),%2 + "mov %b2," MEMACCESS2(0x2,1) " \n" + "movzb " MEMACCESS2(0x3,4) ",%2 \n" + MEMOP(movzb,0x00,0,2,1) ",%2 \n" // movzb (%0,%2,1),%2 + "mov %b2," MEMACCESS2(0x3,1) " \n" + "lea " MEMLEA(0x4,0) ",%0 \n" + "lea " MEMLEA(0x4,1) ",%1 \n" + "sub $0x1,%3 \n" + "jg 1b \n" + "jmp 99f \n" + + ".p2align 2 \n" + "123: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "punpckhbw %%xmm5,%%xmm1 \n" + "pshufhw $0x1b,%%xmm0,%%xmm0 \n" + "pshuflw $0x1b,%%xmm0,%%xmm0 \n" + "pshufhw $0x1b,%%xmm1,%%xmm1 \n" + "pshuflw $0x1b,%%xmm1,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "sub $0x4,%3 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 123b \n" + "jmp 99f \n" + + ".p2align 2 \n" + "321: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "punpckhbw %%xmm5,%%xmm1 \n" + "pshufhw $0x39,%%xmm0,%%xmm0 \n" + "pshuflw $0x39,%%xmm0,%%xmm0 \n" + "pshufhw $0x39,%%xmm1,%%xmm1 \n" + "pshuflw $0x39,%%xmm1,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "sub $0x4,%3 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 321b \n" + "jmp 99f \n" + + ".p2align 2 \n" + "2103: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "punpckhbw %%xmm5,%%xmm1 \n" + "pshufhw $0x93,%%xmm0,%%xmm0 \n" + "pshuflw $0x93,%%xmm0,%%xmm0 \n" + "pshufhw $0x93,%%xmm1,%%xmm1 \n" + "pshuflw $0x93,%%xmm1,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "sub $0x4,%3 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 2103b \n" + "jmp 99f \n" + + ".p2align 2 \n" + "3012: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "punpckhbw %%xmm5,%%xmm1 \n" + "pshufhw $0xc6,%%xmm0,%%xmm0 \n" + "pshuflw $0xc6,%%xmm0,%%xmm0 \n" + "pshufhw $0xc6,%%xmm1,%%xmm1 \n" + "pshuflw $0xc6,%%xmm1,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "sub $0x4,%3 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 3012b \n" + + "99: \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+d"(pixel_temp), // %2 + "+r"(pix) // %3 + : "r"(shuffler) // %4 + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm5" +#endif + ); +} +#endif // HAS_ARGBSHUFFLEROW_SSE2 + +#ifdef HAS_I422TOYUY2ROW_SSE2 void I422ToYUY2Row_SSE2(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_frame, int width) { asm volatile ( "sub %1,%2 \n" - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movq (%1),%%xmm2 \n" - "movq 
(%1,%2,1),%%xmm3 \n" - "lea 0x8(%1),%1 \n" + "movq " MEMACCESS(1) ",%%xmm2 \n" + MEMOPREG(movq,0x00,1,2,1,xmm3) // movq (%1,%2,1),%%xmm3 + "lea " MEMLEA(0x8,1) ",%1 \n" "punpcklbw %%xmm3,%%xmm2 \n" - "movdqu (%0),%%xmm0 \n" - "lea 0x10(%0),%0 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" "movdqa %%xmm0,%%xmm1 \n" "punpcklbw %%xmm2,%%xmm0 \n" "punpckhbw %%xmm2,%%xmm1 \n" - "movdqu %%xmm0,(%3) \n" - "movdqu %%xmm1,0x10(%3) \n" - "lea 0x20(%3),%3 \n" + "movdqu %%xmm0," MEMACCESS(3) " \n" + "movdqu %%xmm1," MEMACCESS2(0x10,3) " \n" + "lea " MEMLEA(0x20,3) ",%3 \n" "sub $0x10,%4 \n" "jg 1b \n" : "+r"(src_y), // %0 @@ -5360,32 +6238,38 @@ void I422ToYUY2Row_SSE2(const uint8* src_y, "+rm"(width) // %4 : : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm3" #endif ); } +#endif // HAS_I422TOYUY2ROW_SSE2 +#ifdef HAS_I422TOUYVYROW_SSE2 void I422ToUYVYRow_SSE2(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_frame, int width) { asm volatile ( "sub %1,%2 \n" - ".p2align 4 \n" + ".p2align 2 \n" + BUNDLEALIGN "1: \n" - "movq (%1),%%xmm2 \n" - "movq (%1,%2,1),%%xmm3 \n" - "lea 0x8(%1),%1 \n" + "movq " MEMACCESS(1) ",%%xmm2 \n" + MEMOPREG(movq,0x00,1,2,1,xmm3) // movq (%1,%2,1),%%xmm3 + "lea " MEMLEA(0x8,1) ",%1 \n" "punpcklbw %%xmm3,%%xmm2 \n" - "movdqu (%0),%%xmm0 \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" "movdqa %%xmm2,%%xmm1 \n" - "lea 0x10(%0),%0 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" "punpcklbw %%xmm0,%%xmm1 \n" "punpckhbw %%xmm0,%%xmm2 \n" - "movdqu %%xmm1,(%3) \n" - "movdqu %%xmm2,0x10(%3) \n" - "lea 0x20(%3),%3 \n" + "movdqu %%xmm1," MEMACCESS(3) " \n" + "movdqu %%xmm2," MEMACCESS2(0x10,3) " \n" + "lea " MEMLEA(0x20,3) ",%3 \n" "sub $0x10,%4 \n" "jg 1b \n" : "+r"(src_y), // %0 @@ -5395,11 +6279,300 @@ void I422ToUYVYRow_SSE2(const uint8* src_y, "+rm"(width) // %4 : : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm3" #endif ); } +#endif // HAS_I422TOUYVYROW_SSE2 + +#ifdef HAS_FIXEDDIV_X86 +// Divide num by div and return as 16.16 fixed point result. +int FixedDiv_X86(int num, int div) { + asm volatile ( + "cdq \n" + "shld $0x10,%%eax,%%edx \n" + "shl $0x10,%%eax \n" + "idiv %1 \n" + "mov %0, %%eax \n" + : "+a"(num) // %0 + : "c"(div) // %1 + : "memory", "cc", "edx" + ); + return num; +} +#endif // HAS_FIXEDDIV_X86 + +#ifdef HAS_ARGBPOLYNOMIALROW_SSE2 +void ARGBPolynomialRow_SSE2(const uint8* src_argb, + uint8* dst_argb, const float* poly, + int width) { + asm volatile ( + "pxor %%xmm3,%%xmm3 \n" + + // 2 pixel loop. 
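The FixedDiv_X86 helper above is the scalar identity (num << 16) / div evaluated with a 64-bit dividend. The 2 pixel loop that follows evaluates a cubic polynomial per channel from four groups of four float coefficients; a scalar sketch of that mapping (assuming the C0..C3-per-channel layout implied by the 0x00/0x10/0x20/0x30 offsets; not part of the patch) is:

#include <stdint.h>

// result = C0 + C1*x + C2*x^2 + C3*x^3 per channel, then clamp to [0, 255].
// poly holds 16 floats: C0 for B,G,R,A, then C1, C2 and C3 in the same order.
static uint8_t PolyChannel(uint8_t v, const float* poly, int ch) {
  float x = (float)v;
  float r = poly[ch] + poly[ch + 4] * x + poly[ch + 8] * x * x +
            poly[ch + 12] * x * x * x;
  if (r < 0.0f) r = 0.0f;    // packuswb in the SIMD path clamps the same way
  if (r > 255.0f) r = 255.0f;
  return (uint8_t)r;         // cvttps2dq truncates; so does this cast
}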
+ ".p2align 2 \n" + "1: \n" + "movq " MEMACCESS(0) ",%%xmm0 \n" + "lea " MEMLEA(0x8,0) ",%0 \n" + "punpcklbw %%xmm3,%%xmm0 \n" + "movdqa %%xmm0,%%xmm4 \n" + "punpcklwd %%xmm3,%%xmm0 \n" + "punpckhwd %%xmm3,%%xmm4 \n" + "cvtdq2ps %%xmm0,%%xmm0 \n" + "cvtdq2ps %%xmm4,%%xmm4 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm4,%%xmm5 \n" + "mulps " MEMACCESS2(0x10,3) ",%%xmm0 \n" + "mulps " MEMACCESS2(0x10,3) ",%%xmm4 \n" + "addps " MEMACCESS(3) ",%%xmm0 \n" + "addps " MEMACCESS(3) ",%%xmm4 \n" + "movdqa %%xmm1,%%xmm2 \n" + "movdqa %%xmm5,%%xmm6 \n" + "mulps %%xmm1,%%xmm2 \n" + "mulps %%xmm5,%%xmm6 \n" + "mulps %%xmm2,%%xmm1 \n" + "mulps %%xmm6,%%xmm5 \n" + "mulps " MEMACCESS2(0x20,3) ",%%xmm2 \n" + "mulps " MEMACCESS2(0x20,3) ",%%xmm6 \n" + "mulps " MEMACCESS2(0x30,3) ",%%xmm1 \n" + "mulps " MEMACCESS2(0x30,3) ",%%xmm5 \n" + "addps %%xmm2,%%xmm0 \n" + "addps %%xmm6,%%xmm4 \n" + "addps %%xmm1,%%xmm0 \n" + "addps %%xmm5,%%xmm4 \n" + "cvttps2dq %%xmm0,%%xmm0 \n" + "cvttps2dq %%xmm4,%%xmm4 \n" + "packuswb %%xmm4,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "sub $0x2,%2 \n" + "movq %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x8,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(poly) // %3 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" +#endif + ); +} +#endif // HAS_ARGBPOLYNOMIALROW_SSE2 + +#ifdef HAS_ARGBPOLYNOMIALROW_AVX2 +void ARGBPolynomialRow_AVX2(const uint8* src_argb, + uint8* dst_argb, const float* poly, + int width) { + asm volatile ( + "vbroadcastf128 " MEMACCESS(3) ",%%ymm4 \n" + "vbroadcastf128 " MEMACCESS2(0x10,3) ",%%ymm5 \n" + "vbroadcastf128 " MEMACCESS2(0x20,3) ",%%ymm6 \n" + "vbroadcastf128 " MEMACCESS2(0x30,3) ",%%ymm7 \n" + + // 2 pixel loop. + ".p2align 2 \n" + "1: \n" + "vpmovzxbd " MEMACCESS(0) ",%%ymm0 \n" // 2 ARGB pixels + "lea " MEMLEA(0x8,0) ",%0 \n" + "vcvtdq2ps %%ymm0,%%ymm0 \n" // X 8 floats + "vmulps %%ymm0,%%ymm0,%%ymm2 \n" // X * X + "vmulps %%ymm7,%%ymm0,%%ymm3 \n" // C3 * X + "vfmadd132ps %%ymm5,%%ymm4,%%ymm0 \n" // result = C0 + C1 * X + "vfmadd231ps %%ymm6,%%ymm2,%%ymm0 \n" // result += C2 * X * X + "vfmadd231ps %%ymm3,%%ymm2,%%ymm0 \n" // result += C3 * X * X * X + "vcvttps2dq %%ymm0,%%ymm0 \n" + "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpackuswb %%xmm0,%%xmm0,%%xmm0 \n" + "sub $0x2,%2 \n" + "vmovq %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x8,1) ",%1 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(poly) // %3 + : "memory", "cc" +#if defined(__SSE2__) +// TODO(fbarchard): declare ymm usage when applicable. + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" +#endif + ); +} +#endif // HAS_ARGBPOLYNOMIALROW_AVX2 + +#ifdef HAS_ARGBCOLORTABLEROW_X86 +// Tranform ARGB pixels with color table. +void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, + int width) { + uintptr_t pixel_temp = 0u; + asm volatile ( + // 1 pixel loop. 
+ ".p2align 2 \n" + BUNDLEALIGN + "1: \n" + "movzb " MEMACCESS(0) ",%1 \n" + "lea " MEMLEA(0x4,0) ",%0 \n" + MEMOP(movzb,0x00,3,1,4) ",%1 \n" // movzb (%3,%1,4),%1 + "mov %b1," MEMACCESS2(-0x4,0) " \n" + "movzb " MEMACCESS2(-0x3,0) ",%1 \n" + MEMOP(movzb,0x01,3,1,4) ",%1 \n" // movzb 0x1(%3,%1,4),%1 + "mov %b1," MEMACCESS2(-0x3,0) " \n" + "movzb " MEMACCESS2(-0x2,0) ",%1 \n" + MEMOP(movzb,0x02,3,1,4) ",%1 \n" // movzb 0x2(%3,%1,4),%1 + "mov %b1," MEMACCESS2(-0x2,0) " \n" + "movzb " MEMACCESS2(-0x1,0) ",%1 \n" + MEMOP(movzb,0x03,3,1,4) ",%1 \n" // movzb 0x3(%3,%1,4),%1 + "mov %b1," MEMACCESS2(-0x1,0) " \n" + "dec %2 \n" + "jg 1b \n" + : "+r"(dst_argb), // %0 + "+d"(pixel_temp), // %1 + "+r"(width) // %2 + : "r"(table_argb) // %3 + : "memory", "cc"); +} +#endif // HAS_ARGBCOLORTABLEROW_X86 + +#ifdef HAS_RGBCOLORTABLEROW_X86 +// Tranform RGB pixels with color table. +void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) { + uintptr_t pixel_temp = 0u; + asm volatile ( + // 1 pixel loop. + ".p2align 2 \n" + BUNDLEALIGN + "1: \n" + "movzb " MEMACCESS(0) ",%1 \n" + "lea " MEMLEA(0x4,0) ",%0 \n" + MEMOP(movzb,0x00,3,1,4) ",%1 \n" // movzb (%3,%1,4),%1 + "mov %b1," MEMACCESS2(-0x4,0) " \n" + "movzb " MEMACCESS2(-0x3,0) ",%1 \n" + MEMOP(movzb,0x01,3,1,4) ",%1 \n" // movzb 0x1(%3,%1,4),%1 + "mov %b1," MEMACCESS2(-0x3,0) " \n" + "movzb " MEMACCESS2(-0x2,0) ",%1 \n" + MEMOP(movzb,0x02,3,1,4) ",%1 \n" // movzb 0x2(%3,%1,4),%1 + "mov %b1," MEMACCESS2(-0x2,0) " \n" + "dec %2 \n" + "jg 1b \n" + : "+r"(dst_argb), // %0 + "+d"(pixel_temp), // %1 + "+r"(width) // %2 + : "r"(table_argb) // %3 + : "memory", "cc"); +} +#endif // HAS_RGBCOLORTABLEROW_X86 + +#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3 +// Tranform RGB pixels with luma table. +void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb, + int width, + const uint8* luma, uint32 lumacoeff) { + uintptr_t pixel_temp = 0u; + uintptr_t table_temp = 0u; + asm volatile ( + "movd %6,%%xmm3 \n" + "pshufd $0x0,%%xmm3,%%xmm3 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "psllw $0x8,%%xmm4 \n" + "pxor %%xmm5,%%xmm5 \n" + + // 4 pixel loop. 
+ ".p2align 2 \n" + BUNDLEALIGN + "1: \n" + "movdqu " MEMACCESS(2) ",%%xmm0 \n" + "pmaddubsw %%xmm3,%%xmm0 \n" + "phaddw %%xmm0,%%xmm0 \n" + "pand %%xmm4,%%xmm0 \n" + "punpcklwd %%xmm5,%%xmm0 \n" + "movd %%xmm0,%k1 \n" // 32 bit offset + "add %5,%1 \n" + "pshufd $0x39,%%xmm0,%%xmm0 \n" + + "movzb " MEMACCESS(2) ",%0 \n" + MEMOP(movzb,0x00,1,0,1) ",%0 \n" // movzb (%1,%0,1),%0 + "mov %b0," MEMACCESS(3) " \n" + "movzb " MEMACCESS2(0x1,2) ",%0 \n" + MEMOP(movzb,0x00,1,0,1) ",%0 \n" // movzb (%1,%0,1),%0 + "mov %b0," MEMACCESS2(0x1,3) " \n" + "movzb " MEMACCESS2(0x2,2) ",%0 \n" + MEMOP(movzb,0x00,1,0,1) ",%0 \n" // movzb (%1,%0,1),%0 + "mov %b0," MEMACCESS2(0x2,3) " \n" + "movzb " MEMACCESS2(0x3,2) ",%0 \n" + "mov %b0," MEMACCESS2(0x3,3) " \n" + + "movd %%xmm0,%k1 \n" // 32 bit offset + "add %5,%1 \n" + "pshufd $0x39,%%xmm0,%%xmm0 \n" + + "movzb " MEMACCESS2(0x4,2) ",%0 \n" + MEMOP(movzb,0x00,1,0,1) ",%0 \n" // movzb (%1,%0,1),%0 + "mov %b0," MEMACCESS2(0x4,3) " \n" + BUNDLEALIGN + "movzb " MEMACCESS2(0x5,2) ",%0 \n" + MEMOP(movzb,0x00,1,0,1) ",%0 \n" // movzb (%1,%0,1),%0 + "mov %b0," MEMACCESS2(0x5,3) " \n" + "movzb " MEMACCESS2(0x6,2) ",%0 \n" + MEMOP(movzb,0x00,1,0,1) ",%0 \n" // movzb (%1,%0,1),%0 + "mov %b0," MEMACCESS2(0x6,3) " \n" + "movzb " MEMACCESS2(0x7,2) ",%0 \n" + "mov %b0," MEMACCESS2(0x7,3) " \n" + + "movd %%xmm0,%k1 \n" // 32 bit offset + "add %5,%1 \n" + "pshufd $0x39,%%xmm0,%%xmm0 \n" + + "movzb " MEMACCESS2(0x8,2) ",%0 \n" + MEMOP(movzb,0x00,1,0,1) ",%0 \n" // movzb (%1,%0,1),%0 + "mov %b0," MEMACCESS2(0x8,3) " \n" + "movzb " MEMACCESS2(0x9,2) ",%0 \n" + MEMOP(movzb,0x00,1,0,1) ",%0 \n" // movzb (%1,%0,1),%0 + "mov %b0," MEMACCESS2(0x9,3) " \n" + "movzb " MEMACCESS2(0xa,2) ",%0 \n" + MEMOP(movzb,0x00,1,0,1) ",%0 \n" // movzb (%1,%0,1),%0 + "mov %b0," MEMACCESS2(0xa,3) " \n" + "movzb " MEMACCESS2(0xb,2) ",%0 \n" + "mov %b0," MEMACCESS2(0xb,3) " \n" + + "movd %%xmm0,%k1 \n" // 32 bit offset + "add %5,%1 \n" + + "movzb " MEMACCESS2(0xc,2) ",%0 \n" + MEMOP(movzb,0x00,1,0,1) ",%0 \n" // movzb (%1,%0,1),%0 + "mov %b0," MEMACCESS2(0xc,3) " \n" + "movzb " MEMACCESS2(0xd,2) ",%0 \n" + MEMOP(movzb,0x00,1,0,1) ",%0 \n" // movzb (%1,%0,1),%0 + "mov %b0," MEMACCESS2(0xd,3) " \n" + "movzb " MEMACCESS2(0xe,2) ",%0 \n" + MEMOP(movzb,0x00,1,0,1) ",%0 \n" // movzb (%1,%0,1),%0 + "mov %b0," MEMACCESS2(0xe,3) " \n" + "movzb " MEMACCESS2(0xf,2) ",%0 \n" + "mov %b0," MEMACCESS2(0xf,3) " \n" + "sub $0x4,%4 \n" + "lea " MEMLEA(0x10,2) ",%2 \n" + "lea " MEMLEA(0x10,3) ",%3 \n" + "jg 1b \n" + : "+d"(pixel_temp), // %0 + "+a"(table_temp), // %1 + "+r"(src_argb), // %2 + "+r"(dst_argb), // %3 + "+rm"(width) // %4 + : "r"(luma), // %5 + "rm"(lumacoeff) // %6 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm3", "xmm4", "xmm5" +#endif + ); +} +#endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 #endif // defined(__x86_64__) || defined(__i386__) diff --git a/chromium/third_party/libyuv/source/row_win.cc b/chromium/third_party/libyuv/source/row_win.cc index 4ea06923def..502d25cea4f 100644 --- a/chromium/third_party/libyuv/source/row_win.cc +++ b/chromium/third_party/libyuv/source/row_win.cc @@ -30,16 +30,6 @@ static const vec8 kARGBToYJ = { 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0 }; -static const lvec8 kARGBToY_AVX = { - 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, - 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0 -}; - -static const lvec8 kARGBToYJ_AVX = { - 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, - 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 
0, 15, 75, 38, 0 -}; - static const vec8 kARGBToU = { 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0 }; @@ -48,12 +38,6 @@ static const vec8 kARGBToUJ = { 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0 }; -// TODO(fbarchard): Rename kARGBToU_AVX to kARGBToU and use for SSSE3 version. -static const lvec8 kARGBToU_AVX = { - 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, - 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0 -}; - static const vec8 kARGBToV = { -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, }; @@ -62,13 +46,8 @@ static const vec8 kARGBToVJ = { -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0 }; -static const lvec8 kARGBToV_AVX = { - -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, - -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0 -}; - // vpermd for vphaddw + vpackuswb vpermd. -static const lvec32 kShufARGBToY_AVX = { +static const lvec32 kPermdARGBToY_AVX = { 0, 4, 1, 5, 2, 6, 3, 7 }; @@ -124,16 +103,6 @@ static const uvec8 kAddY16 = { static const vec16 kAddYJ64 = { 64, 64, 64, 64, 64, 64, 64, 64 }; -static const lvec16 kAddYJ64_AVX = { - 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 -}; - -static const ulvec8 kAddY16_AVX = { - 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, - 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, - 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, - 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u -}; static const uvec8 kAddUV128 = { 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, @@ -144,13 +113,6 @@ static const uvec16 kAddUVJ128 = { 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u }; -static const ulvec8 kAddUV128_AVX = { - 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, - 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, - 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, - 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u -}; - // Shuffle table for converting RGB24 to ARGB. 
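For orientation in the Windows file, the kShuffleMaskRGB24ToARGB table defined just below drives the pshufb-based RGB24-to-ARGB expansion; in scalar terms the conversion is simply (sketch, not part of the patch):

#include <stdint.h>

// Expand packed 3-byte B,G,R into 4-byte B,G,R,A with opaque alpha.
static void RGB24ToARGBRow_Sketch(const uint8_t* src_rgb24, uint8_t* dst_argb,
                                  int pix) {
  for (int i = 0; i < pix; ++i) {
    dst_argb[0] = src_rgb24[0];  // B
    dst_argb[1] = src_rgb24[1];  // G
    dst_argb[2] = src_rgb24[2];  // R
    dst_argb[3] = 255u;          // A
    src_rgb24 += 3;
    dst_argb += 4;
  }
}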
static const uvec8 kShuffleMaskRGB24ToARGB = { 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u @@ -191,7 +153,7 @@ void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { pcmpeqb xmm5, xmm5 // generate mask 0xff000000 pslld xmm5, 24 - align 16 + align 4 convertloop: movq xmm0, qword ptr [eax] lea eax, [eax + 8] @@ -220,7 +182,7 @@ void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb, pcmpeqb xmm5, xmm5 // generate mask 0xff000000 pslld xmm5, 24 - align 16 + align 4 convertloop: movq xmm0, qword ptr [eax] lea eax, [eax + 8] @@ -249,7 +211,7 @@ void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) { pslld xmm5, 24 movdqa xmm4, kShuffleMaskRGB24ToARGB - align 16 + align 4 convertloop: movdqu xmm0, [eax] movdqu xmm1, [eax + 16] @@ -289,7 +251,7 @@ void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, pslld xmm5, 24 movdqa xmm4, kShuffleMaskRAWToARGB - align 16 + align 4 convertloop: movdqu xmm0, [eax] movdqu xmm1, [eax + 16] @@ -349,7 +311,7 @@ void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb, sub edx, eax sub edx, eax - align 16 + align 4 convertloop: movdqu xmm0, [eax] // fetch 8 pixels of bgr565 movdqa xmm1, xmm0 @@ -399,7 +361,7 @@ void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb, sub edx, eax sub edx, eax - align 16 + align 4 convertloop: movdqu xmm0, [eax] // fetch 8 pixels of 1555 movdqa xmm1, xmm0 @@ -445,7 +407,7 @@ void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb, sub edx, eax sub edx, eax - align 16 + align 4 convertloop: movdqu xmm0, [eax] // fetch 8 pixels of bgra4444 movdqa xmm2, xmm0 @@ -477,12 +439,12 @@ void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) { mov ecx, [esp + 12] // pix movdqa xmm6, kShuffleMaskARGBToRGB24 - align 16 + align 4 convertloop: - movdqa xmm0, [eax] // fetch 16 pixels of argb - movdqa xmm1, [eax + 16] - movdqa xmm2, [eax + 32] - movdqa xmm3, [eax + 48] + movdqu xmm0, [eax] // fetch 16 pixels of argb + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] lea eax, [eax + 64] pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB pshufb xmm1, xmm6 @@ -494,13 +456,13 @@ void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) { movdqa xmm5, xmm2 // 8 bytes from 2 for 1 por xmm0, xmm4 // 4 bytes from 1 for 0 pslldq xmm5, 8 // 8 bytes from 2 for 1 - movdqa [edx], xmm0 // store 0 + movdqu [edx], xmm0 // store 0 por xmm1, xmm5 // 8 bytes from 2 for 1 psrldq xmm2, 8 // 4 bytes from 2 pslldq xmm3, 4 // 12 bytes from 3 for 2 por xmm2, xmm3 // 12 bytes from 3 for 2 - movdqa [edx + 16], xmm1 // store 1 - movdqa [edx + 32], xmm2 // store 2 + movdqu [edx + 16], xmm1 // store 1 + movdqu [edx + 32], xmm2 // store 2 lea edx, [edx + 48] sub ecx, 16 jg convertloop @@ -516,12 +478,12 @@ void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) { mov ecx, [esp + 12] // pix movdqa xmm6, kShuffleMaskARGBToRAW - align 16 + align 4 convertloop: - movdqa xmm0, [eax] // fetch 16 pixels of argb - movdqa xmm1, [eax + 16] - movdqa xmm2, [eax + 32] - movdqa xmm3, [eax + 48] + movdqu xmm0, [eax] // fetch 16 pixels of argb + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] lea eax, [eax + 64] pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB pshufb xmm1, xmm6 @@ -533,13 +495,13 @@ void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) { movdqa xmm5, xmm2 // 8 bytes from 2 for 1 por xmm0, xmm4 // 4 bytes 
from 1 for 0 pslldq xmm5, 8 // 8 bytes from 2 for 1 - movdqa [edx], xmm0 // store 0 + movdqu [edx], xmm0 // store 0 por xmm1, xmm5 // 8 bytes from 2 for 1 psrldq xmm2, 8 // 4 bytes from 2 pslldq xmm3, 4 // 12 bytes from 3 for 2 por xmm2, xmm3 // 12 bytes from 3 for 2 - movdqa [edx + 16], xmm1 // store 1 - movdqa [edx + 32], xmm2 // store 2 + movdqu [edx + 16], xmm1 // store 1 + movdqu [edx + 32], xmm2 // store 2 lea edx, [edx + 48] sub ecx, 16 jg convertloop @@ -561,7 +523,7 @@ void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { pcmpeqb xmm5, xmm5 // generate mask 0xfffff800 pslld xmm5, 11 - align 16 + align 4 convertloop: movdqa xmm0, [eax] // fetch 4 pixels of argb movdqa xmm1, xmm0 // B @@ -601,7 +563,7 @@ void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { pcmpeqb xmm7, xmm7 // generate mask 0xffff8000 pslld xmm7, 15 - align 16 + align 4 convertloop: movdqa xmm0, [eax] // fetch 4 pixels of argb movdqa xmm1, xmm0 // B @@ -639,7 +601,7 @@ void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { movdqa xmm3, xmm4 // generate mask 0x00f000f0 psrlw xmm3, 8 - align 16 + align 4 convertloop: movdqa xmm0, [eax] // fetch 4 pixels of argb movdqa xmm1, xmm0 @@ -668,7 +630,7 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { movdqa xmm5, kAddY16 movdqa xmm4, kARGBToY - align 16 + align 4 convertloop: movdqa xmm0, [eax] movdqa xmm1, [eax + 16] @@ -703,7 +665,7 @@ void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { movdqa xmm4, kARGBToYJ movdqa xmm5, kAddYJ64 - align 16 + align 4 convertloop: movdqa xmm0, [eax] movdqa xmm1, [eax + 16] @@ -737,11 +699,11 @@ void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) { mov eax, [esp + 4] /* src_argb */ mov edx, [esp + 8] /* dst_y */ mov ecx, [esp + 12] /* pix */ - vmovdqa ymm6, kShufARGBToY_AVX - vmovdqa ymm5, kAddY16_AVX - vmovdqa ymm4, kARGBToY_AVX + vbroadcastf128 ymm4, kARGBToY + vbroadcastf128 ymm5, kAddY16 + vmovdqa ymm6, kPermdARGBToY_AVX - align 16 + align 4 convertloop: vmovdqu ymm0, [eax] vmovdqu ymm1, [eax + 32] @@ -777,11 +739,11 @@ void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) { mov eax, [esp + 4] /* src_argb */ mov edx, [esp + 8] /* dst_y */ mov ecx, [esp + 12] /* pix */ - vmovdqa ymm4, kARGBToYJ_AVX - vmovdqa ymm5, kAddYJ64_AVX - vmovdqa ymm6, kShufARGBToY_AVX + vbroadcastf128 ymm4, kARGBToYJ + vbroadcastf128 ymm5, kAddYJ64 + vmovdqa ymm6, kPermdARGBToY_AVX - align 16 + align 4 convertloop: vmovdqu ymm0, [eax] vmovdqu ymm1, [eax + 32] @@ -820,7 +782,7 @@ void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { movdqa xmm5, kAddY16 movdqa xmm4, kARGBToY - align 16 + align 4 convertloop: movdqu xmm0, [eax] movdqu xmm1, [eax + 16] @@ -854,7 +816,7 @@ void ARGBToYJRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { movdqa xmm4, kARGBToYJ movdqa xmm5, kAddYJ64 - align 16 + align 4 convertloop: movdqu xmm0, [eax] movdqu xmm1, [eax + 16] @@ -889,7 +851,7 @@ void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { movdqa xmm5, kAddY16 movdqa xmm4, kBGRAToY - align 16 + align 4 convertloop: movdqa xmm0, [eax] movdqa xmm1, [eax + 16] @@ -923,7 +885,7 @@ void BGRAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { movdqa xmm5, kAddY16 movdqa xmm4, kBGRAToY - align 16 + align 4 convertloop: movdqu xmm0, [eax] movdqu xmm1, [eax + 16] @@ -957,7 +919,7 @@ void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { movdqa xmm5, 
kAddY16 movdqa xmm4, kABGRToY - align 16 + align 4 convertloop: movdqa xmm0, [eax] movdqa xmm1, [eax + 16] @@ -991,7 +953,7 @@ void ABGRToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { movdqa xmm5, kAddY16 movdqa xmm4, kABGRToY - align 16 + align 4 convertloop: movdqu xmm0, [eax] movdqu xmm1, [eax + 16] @@ -1025,7 +987,7 @@ void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { movdqa xmm5, kAddY16 movdqa xmm4, kRGBAToY - align 16 + align 4 convertloop: movdqa xmm0, [eax] movdqa xmm1, [eax + 16] @@ -1059,7 +1021,7 @@ void RGBAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { movdqa xmm5, kAddY16 movdqa xmm4, kRGBAToY - align 16 + align 4 convertloop: movdqu xmm0, [eax] movdqu xmm1, [eax + 16] @@ -1100,7 +1062,7 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, movdqa xmm5, kAddUV128 sub edi, edx // stride from u to v - align 16 + align 4 convertloop: /* step 1 - subsample 16x2 argb pixels to 8x1 */ movdqa xmm0, [eax] @@ -1166,7 +1128,7 @@ void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb, movdqa xmm5, kAddUVJ128 sub edi, edx // stride from u to v - align 16 + align 4 convertloop: /* step 1 - subsample 16x2 argb pixels to 8x1 */ movdqa xmm0, [eax] @@ -1229,12 +1191,12 @@ void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb, mov edx, [esp + 8 + 12] // dst_u mov edi, [esp + 8 + 16] // dst_v mov ecx, [esp + 8 + 20] // pix - vmovdqa ymm7, kARGBToU_AVX - vmovdqa ymm6, kARGBToV_AVX - vmovdqa ymm5, kAddUV128_AVX + vbroadcastf128 ymm5, kAddUV128 + vbroadcastf128 ymm6, kARGBToV + vbroadcastf128 ymm7, kARGBToU sub edi, edx // stride from u to v - align 16 + align 4 convertloop: /* step 1 - subsample 32x2 argb pixels to 16x1 */ vmovdqu ymm0, [eax] @@ -1300,7 +1262,7 @@ void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb, movdqa xmm5, kAddUV128 sub edi, edx // stride from u to v - align 16 + align 4 convertloop: /* step 1 - subsample 16x2 argb pixels to 8x1 */ movdqu xmm0, [eax] @@ -1370,7 +1332,7 @@ void ARGBToUVJRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb, movdqa xmm5, kAddUVJ128 sub edi, edx // stride from u to v - align 16 + align 4 convertloop: /* step 1 - subsample 16x2 argb pixels to 8x1 */ movdqu xmm0, [eax] @@ -1439,7 +1401,7 @@ void ARGBToUV444Row_SSSE3(const uint8* src_argb0, movdqa xmm5, kAddUV128 sub edi, edx // stride from u to v - align 16 + align 4 convertloop: /* convert to U and V */ movdqa xmm0, [eax] // U @@ -1497,7 +1459,7 @@ void ARGBToUV444Row_Unaligned_SSSE3(const uint8* src_argb0, movdqa xmm5, kAddUV128 sub edi, edx // stride from u to v - align 16 + align 4 convertloop: /* convert to U and V */ movdqu xmm0, [eax] // U @@ -1555,7 +1517,7 @@ void ARGBToUV422Row_SSSE3(const uint8* src_argb0, movdqa xmm5, kAddUV128 sub edi, edx // stride from u to v - align 16 + align 4 convertloop: /* step 1 - subsample 16x2 argb pixels to 8x1 */ movdqa xmm0, [eax] @@ -1614,7 +1576,7 @@ void ARGBToUV422Row_Unaligned_SSSE3(const uint8* src_argb0, movdqa xmm5, kAddUV128 sub edi, edx // stride from u to v - align 16 + align 4 convertloop: /* step 1 - subsample 16x2 argb pixels to 8x1 */ movdqu xmm0, [eax] @@ -1675,7 +1637,7 @@ void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, movdqa xmm5, kAddUV128 sub edi, edx // stride from u to v - align 16 + align 4 convertloop: /* step 1 - subsample 16x2 argb pixels to 8x1 */ movdqa xmm0, [eax] @@ -1741,7 +1703,7 @@ void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int 
src_stride_argb, movdqa xmm5, kAddUV128 sub edi, edx // stride from u to v - align 16 + align 4 convertloop: /* step 1 - subsample 16x2 argb pixels to 8x1 */ movdqu xmm0, [eax] @@ -1811,7 +1773,7 @@ void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, movdqa xmm5, kAddUV128 sub edi, edx // stride from u to v - align 16 + align 4 convertloop: /* step 1 - subsample 16x2 argb pixels to 8x1 */ movdqa xmm0, [eax] @@ -1877,7 +1839,7 @@ void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb, movdqa xmm5, kAddUV128 sub edi, edx // stride from u to v - align 16 + align 4 convertloop: /* step 1 - subsample 16x2 argb pixels to 8x1 */ movdqu xmm0, [eax] @@ -1947,7 +1909,7 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, movdqa xmm5, kAddUV128 sub edi, edx // stride from u to v - align 16 + align 4 convertloop: /* step 1 - subsample 16x2 argb pixels to 8x1 */ movdqa xmm0, [eax] @@ -2013,7 +1975,7 @@ void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb, movdqa xmm5, kAddUV128 sub edi, edx // stride from u to v - align 16 + align 4 convertloop: /* step 1 - subsample 16x2 argb pixels to 8x1 */ movdqu xmm0, [eax] @@ -2133,7 +2095,7 @@ void I422ToARGBRow_AVX2(const uint8* y_buf, vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha vpxor ymm4, ymm4, ymm4 - align 16 + align 4 convertloop: vmovq xmm0, qword ptr [esi] // U vmovq xmm1, qword ptr [esi + edi] // V @@ -2220,7 +2182,7 @@ static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR }; // TODO(fbarchard): Read that does half size on Y and treats 420 as 444. -// Read 8 UV from 411. +// Read 8 UV from 444. #define READYUV444 __asm { \ __asm movq xmm0, qword ptr [esi] /* U */ /* NOLINT */ \ __asm movq xmm1, qword ptr [esi + edi] /* V */ /* NOLINT */ \ @@ -2239,8 +2201,10 @@ static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR }; // Read 2 UV from 411, upsample to 8 UV. 
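The READYUV411 change below switches from a 4-byte movd load to two zero-extended word loads (hence the extra ebx push and the "modifies EBX" notes in the callers), so only the two U bytes and two V bytes that are needed get read. Conceptually the macro upsamples those samples as follows (scalar sketch, hypothetical helper name):

#include <stdint.h>

// In 4:1:1, one U/V pair covers 4 horizontal pixels, so 2 U bytes and
// 2 V bytes expand to the 8 interleaved UV pairs consumed by YUVTORGB.
static void ExpandUV411(const uint8_t u[2], const uint8_t v[2],
                        uint8_t uv_out[16]) {
  for (int i = 0; i < 8; ++i) {
    uv_out[2 * i + 0] = u[i / 4];
    uv_out[2 * i + 1] = v[i / 4];
  }
}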
#define READYUV411 __asm { \ - __asm movd xmm0, [esi] /* U */ \ - __asm movd xmm1, [esi + edi] /* V */ \ + __asm movzx ebx, word ptr [esi] /* U */ /* NOLINT */ \ + __asm movd xmm0, ebx \ + __asm movzx ebx, word ptr [esi + edi] /* V */ /* NOLINT */ \ + __asm movd xmm1, ebx \ __asm lea esi, [esi + 2] \ __asm punpcklbw xmm0, xmm1 /* UV */ \ __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ @@ -2330,7 +2294,7 @@ void I444ToARGBRow_SSSE3(const uint8* y_buf, pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha pxor xmm4, xmm4 - align 16 + align 4 convertloop: READYUV444 YUVTORGB @@ -2374,7 +2338,7 @@ void I422ToRGB24Row_SSSE3(const uint8* y_buf, movdqa xmm5, kShuffleMaskARGBToRGB24_0 movdqa xmm6, kShuffleMaskARGBToRGB24 - align 16 + align 4 convertloop: READYUV422 YUVTORGB @@ -2421,7 +2385,7 @@ void I422ToRAWRow_SSSE3(const uint8* y_buf, movdqa xmm5, kShuffleMaskARGBToRAW_0 movdqa xmm6, kShuffleMaskARGBToRAW - align 16 + align 4 convertloop: READYUV422 YUVTORGB @@ -2473,7 +2437,7 @@ void I422ToRGB565Row_SSSE3(const uint8* y_buf, pcmpeqb xmm7, xmm7 // generate mask 0xfffff800 pslld xmm7, 11 - align 16 + align 4 convertloop: READYUV422 YUVTORGB @@ -2540,7 +2504,7 @@ void I422ToARGBRow_SSSE3(const uint8* y_buf, pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha pxor xmm4, xmm4 - align 16 + align 4 convertloop: READYUV422 YUVTORGB @@ -2573,20 +2537,21 @@ void I411ToARGBRow_SSSE3(const uint8* y_buf, uint8* dst_argb, int width) { __asm { + push ebx push esi push edi - mov eax, [esp + 8 + 4] // Y - mov esi, [esp + 8 + 8] // U - mov edi, [esp + 8 + 12] // V - mov edx, [esp + 8 + 16] // argb - mov ecx, [esp + 8 + 20] // width + mov eax, [esp + 12 + 4] // Y + mov esi, [esp + 12 + 8] // U + mov edi, [esp + 12 + 12] // V + mov edx, [esp + 12 + 16] // argb + mov ecx, [esp + 12 + 20] // width sub edi, esi pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha pxor xmm4, xmm4 - align 16 + align 4 convertloop: - READYUV411 + READYUV411 // modifies EBX YUVTORGB // Step 3: Weave into ARGB @@ -2603,6 +2568,7 @@ void I411ToARGBRow_SSSE3(const uint8* y_buf, pop edi pop esi + pop ebx ret } } @@ -2623,7 +2589,7 @@ void NV12ToARGBRow_SSSE3(const uint8* y_buf, pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha pxor xmm4, xmm4 - align 16 + align 4 convertloop: READNV12 YUVTORGB @@ -2661,7 +2627,7 @@ void NV21ToARGBRow_SSSE3(const uint8* y_buf, pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha pxor xmm4, xmm4 - align 16 + align 4 convertloop: READNV12 YVUTORGB @@ -2703,7 +2669,7 @@ void I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha pxor xmm4, xmm4 - align 16 + align 4 convertloop: READYUV444 YUVTORGB @@ -2746,7 +2712,7 @@ void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha pxor xmm4, xmm4 - align 16 + align 4 convertloop: READYUV422 YUVTORGB @@ -2779,20 +2745,21 @@ void I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, uint8* dst_argb, int width) { __asm { + push ebx push esi push edi - mov eax, [esp + 8 + 4] // Y - mov esi, [esp + 8 + 8] // U - mov edi, [esp + 8 + 12] // V - mov edx, [esp + 8 + 16] // argb - mov ecx, [esp + 8 + 20] // width + mov eax, [esp + 12 + 4] // Y + mov esi, [esp + 12 + 8] // U + mov edi, [esp + 12 + 12] // V + mov edx, [esp + 12 + 16] // argb + mov ecx, [esp + 12 + 20] // width sub edi, esi pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha pxor xmm4, xmm4 - align 16 + align 4 convertloop: - READYUV411 + READYUV411 // modifies EBX YUVTORGB // Step 3: Weave into ARGB @@ -2809,6 +2776,7 
@@ void I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, pop edi pop esi + pop ebx ret } } @@ -2829,7 +2797,7 @@ void NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha pxor xmm4, xmm4 - align 16 + align 4 convertloop: READNV12 YUVTORGB @@ -2867,7 +2835,7 @@ void NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha pxor xmm4, xmm4 - align 16 + align 4 convertloop: READNV12 YVUTORGB @@ -2906,7 +2874,7 @@ void I422ToBGRARow_SSSE3(const uint8* y_buf, sub edi, esi pxor xmm4, xmm4 - align 16 + align 4 convertloop: READYUV422 YUVTORGB @@ -2947,7 +2915,7 @@ void I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf, sub edi, esi pxor xmm4, xmm4 - align 16 + align 4 convertloop: READYUV422 YUVTORGB @@ -2989,7 +2957,7 @@ void I422ToABGRRow_SSSE3(const uint8* y_buf, pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha pxor xmm4, xmm4 - align 16 + align 4 convertloop: READYUV422 YUVTORGB @@ -3030,7 +2998,7 @@ void I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf, pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha pxor xmm4, xmm4 - align 16 + align 4 convertloop: READYUV422 YUVTORGB @@ -3070,7 +3038,7 @@ void I422ToRGBARow_SSSE3(const uint8* y_buf, sub edi, esi pxor xmm4, xmm4 - align 16 + align 4 convertloop: READYUV422 YUVTORGB @@ -3111,7 +3079,7 @@ void I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf, sub edi, esi pxor xmm4, xmm4 - align 16 + align 4 convertloop: READYUV422 YUVTORGB @@ -3156,7 +3124,7 @@ void YToARGBRow_SSE2(const uint8* y_buf, mov edx, [esp + 8] // rgb mov ecx, [esp + 12] // width - align 16 + align 4 convertloop: // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164 movq xmm0, qword ptr [eax] @@ -3200,7 +3168,7 @@ void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { movdqa xmm5, kShuffleMirror lea eax, [eax - 16] - align 16 + align 4 convertloop: movdqa xmm0, [eax + ecx] pshufb xmm0, xmm5 @@ -3229,7 +3197,7 @@ void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) { vmovdqa ymm5, kShuffleMirror_AVX2 lea eax, [eax - 32] - align 16 + align 4 convertloop: vmovdqu ymm0, [eax + ecx] vpshufb ymm0, ymm0, ymm5 @@ -3255,7 +3223,7 @@ void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) { mov ecx, [esp + 12] // width lea eax, [eax - 16] - align 16 + align 4 convertloop: movdqu xmm0, [eax + ecx] movdqa xmm1, xmm0 // swap bytes @@ -3293,7 +3261,7 @@ void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, lea eax, [eax + ecx * 2 - 16] sub edi, edx - align 16 + align 4 convertloop: movdqa xmm0, [eax] lea eax, [eax - 16] @@ -3322,12 +3290,13 @@ void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { mov eax, [esp + 4] // src mov edx, [esp + 8] // dst mov ecx, [esp + 12] // width + lea eax, [eax - 16 + ecx * 4] // last 4 pixels. 
movdqa xmm5, kARGBShuffleMirror - lea eax, [eax - 16] - align 16 + align 4 convertloop: - movdqa xmm0, [eax + ecx * 4] + movdqa xmm0, [eax] + lea eax, [eax - 16] pshufb xmm0, xmm5 sub ecx, 4 movdqa [edx], xmm0 @@ -3353,7 +3322,7 @@ void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) { lea eax, [eax - 32] vmovdqa ymm5, kARGBShuffleMirror_AVX2 - align 16 + align 4 convertloop: vpermd ymm0, ymm5, [eax + ecx * 4] // permute dword order sub ecx, 8 @@ -3379,7 +3348,7 @@ void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { psrlw xmm5, 8 sub edi, edx - align 16 + align 4 convertloop: movdqa xmm0, [eax] movdqa xmm1, [eax + 16] @@ -3416,7 +3385,7 @@ void SplitUVRow_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, psrlw xmm5, 8 sub edi, edx - align 16 + align 4 convertloop: movdqu xmm0, [eax] movdqu xmm1, [eax + 16] @@ -3454,7 +3423,7 @@ void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { vpsrlw ymm5, ymm5, 8 sub edi, edx - align 16 + align 4 convertloop: vmovdqu ymm0, [eax] vmovdqu ymm1, [eax + 32] @@ -3492,7 +3461,7 @@ void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, mov ecx, [esp + 4 + 16] // width sub edx, eax - align 16 + align 4 convertloop: movdqa xmm0, [eax] // read 16 U's movdqa xmm1, [eax + edx] // and 16 V's @@ -3522,7 +3491,7 @@ void MergeUVRow_Unaligned_SSE2(const uint8* src_u, const uint8* src_v, mov ecx, [esp + 4 + 16] // width sub edx, eax - align 16 + align 4 convertloop: movdqu xmm0, [eax] // read 16 U's movdqu xmm1, [eax + edx] // and 16 V's @@ -3554,7 +3523,7 @@ void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, mov ecx, [esp + 4 + 16] // width sub edx, eax - align 16 + align 4 convertloop: vmovdqu ymm0, [eax] // read 32 U's vmovdqu ymm1, [eax + edx] // and 32 V's @@ -3584,15 +3553,15 @@ void CopyRow_SSE2(const uint8* src, uint8* dst, int count) { mov eax, [esp + 4] // src mov edx, [esp + 8] // dst mov ecx, [esp + 12] // count - sub edx, eax - align 16 + align 4 convertloop: movdqa xmm0, [eax] movdqa xmm1, [eax + 16] - movdqa [eax + edx], xmm0 - movdqa [eax + edx + 16], xmm1 lea eax, [eax + 32] + movdqa [edx], xmm0 + movdqa [edx + 16], xmm1 + lea edx, [edx + 32] sub ecx, 32 jg convertloop ret @@ -3634,6 +3603,144 @@ void CopyRow_X86(const uint8* src, uint8* dst, int count) { } #endif // HAS_COPYROW_X86 +#ifdef HAS_ARGBCOPYALPHAROW_SSE2 +// width in pixels +__declspec(naked) __declspec(align(16)) +void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { + __asm { + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst + mov ecx, [esp + 12] // count + pcmpeqb xmm0, xmm0 // generate mask 0xff000000 + pslld xmm0, 24 + pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff + psrld xmm1, 8 + + align 4 + convertloop: + movdqa xmm2, [eax] + movdqa xmm3, [eax + 16] + lea eax, [eax + 32] + movdqa xmm4, [edx] + movdqa xmm5, [edx + 16] + pand xmm2, xmm0 + pand xmm3, xmm0 + pand xmm4, xmm1 + pand xmm5, xmm1 + por xmm2, xmm4 + por xmm3, xmm5 + movdqa [edx], xmm2 + movdqa [edx + 16], xmm3 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + + ret + } +} +#endif // HAS_ARGBCOPYALPHAROW_SSE2 + +#ifdef HAS_ARGBCOPYALPHAROW_AVX2 +// width in pixels +__declspec(naked) __declspec(align(16)) +void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { + __asm { + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst + mov ecx, [esp + 12] // count + vpcmpeqb ymm0, ymm0, ymm0 + vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff + + align 4 + convertloop: + 
vmovdqu ymm1, [eax] + vmovdqu ymm2, [eax + 32] + lea eax, [eax + 64] + vpblendvb ymm1, ymm1, [edx], ymm0 + vpblendvb ymm2, ymm2, [edx + 32], ymm0 + vmovdqu [edx], ymm1 + vmovdqu [edx + 32], ymm2 + lea edx, [edx + 64] + sub ecx, 16 + jg convertloop + + vzeroupper + ret + } +} +#endif // HAS_ARGBCOPYALPHAROW_AVX2 + +#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2 +// width in pixels +__declspec(naked) __declspec(align(16)) +void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { + __asm { + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst + mov ecx, [esp + 12] // count + pcmpeqb xmm0, xmm0 // generate mask 0xff000000 + pslld xmm0, 24 + pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff + psrld xmm1, 8 + + align 4 + convertloop: + movq xmm2, qword ptr [eax] // 8 Y's + lea eax, [eax + 8] + punpcklbw xmm2, xmm2 + punpckhwd xmm3, xmm2 + punpcklwd xmm2, xmm2 + movdqa xmm4, [edx] + movdqa xmm5, [edx + 16] + pand xmm2, xmm0 + pand xmm3, xmm0 + pand xmm4, xmm1 + pand xmm5, xmm1 + por xmm2, xmm4 + por xmm3, xmm5 + movdqa [edx], xmm2 + movdqa [edx + 16], xmm3 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + + ret + } +} +#endif // HAS_ARGBCOPYYTOALPHAROW_SSE2 + +#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2 +// width in pixels +__declspec(naked) __declspec(align(16)) +void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { + __asm { + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst + mov ecx, [esp + 12] // count + vpcmpeqb ymm0, ymm0, ymm0 + vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff + + align 4 + convertloop: + vpmovzxbd ymm1, qword ptr [eax] + vpmovzxbd ymm2, qword ptr [eax + 8] + lea eax, [eax + 16] + vpslld ymm1, ymm1, 24 + vpslld ymm2, ymm2, 24 + vpblendvb ymm1, ymm1, [edx], ymm0 + vpblendvb ymm2, ymm2, [edx + 32], ymm0 + vmovdqu [edx], ymm1 + vmovdqu [edx + 32], ymm2 + lea edx, [edx + 64] + sub ecx, 16 + jg convertloop + + vzeroupper + ret + } +} +#endif // HAS_ARGBCOPYYTOALPHAROW_AVX2 + #ifdef HAS_SETROW_X86 // SetRow8 writes 'count' bytes using a 32 bit value repeated. 
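Before the SetRow8 code, a scalar view of the ARGBCopyAlphaRow / ARGBCopyYToAlphaRow kernels that end just above: both overwrite only the alpha byte of the destination ARGB pixels, sourcing it from the source pixel's alpha or from a Y plane respectively (sketch, not part of the patch):

#include <stdint.h>

// dst keeps its B, G, R bytes; only the A byte is replaced.
static void ARGBCopyAlphaRow_Sketch(const uint8_t* src_argb, uint8_t* dst_argb,
                                    int width) {
  for (int i = 0; i < width; ++i) {
    dst_argb[4 * i + 3] = src_argb[4 * i + 3];
  }
}

static void ARGBCopyYToAlphaRow_Sketch(const uint8_t* src_y, uint8_t* dst_argb,
                                       int width) {
  for (int i = 0; i < width; ++i) {
    dst_argb[4 * i + 3] = src_y[i];
  }
}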
__declspec(naked) __declspec(align(16)) @@ -3666,7 +3773,7 @@ void ARGBSetRows_X86(uint8* dst, uint32 v32, int width, lea ecx, [ebp * 4] sub edx, ecx // stride - width * 4 - align 16 + align 4 convertloop: mov ecx, ebp rep stosd @@ -3693,7 +3800,7 @@ void YUY2ToYRow_AVX2(const uint8* src_yuy2, vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff vpsrlw ymm5, ymm5, 8 - align 16 + align 4 convertloop: vmovdqu ymm0, [eax] vmovdqu ymm1, [eax + 32] @@ -3726,7 +3833,7 @@ void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2, vpsrlw ymm5, ymm5, 8 sub edi, edx - align 16 + align 4 convertloop: vmovdqu ymm0, [eax] vmovdqu ymm1, [eax + 32] @@ -3769,7 +3876,7 @@ void YUY2ToUV422Row_AVX2(const uint8* src_yuy2, vpsrlw ymm5, ymm5, 8 sub edi, edx - align 16 + align 4 convertloop: vmovdqu ymm0, [eax] vmovdqu ymm1, [eax + 32] @@ -3804,7 +3911,7 @@ void UYVYToYRow_AVX2(const uint8* src_uyvy, mov edx, [esp + 8] // dst_y mov ecx, [esp + 12] // pix - align 16 + align 4 convertloop: vmovdqu ymm0, [eax] vmovdqu ymm1, [eax + 32] @@ -3837,7 +3944,7 @@ void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy, vpsrlw ymm5, ymm5, 8 sub edi, edx - align 16 + align 4 convertloop: vmovdqu ymm0, [eax] vmovdqu ymm1, [eax + 32] @@ -3880,7 +3987,7 @@ void UYVYToUV422Row_AVX2(const uint8* src_uyvy, vpsrlw ymm5, ymm5, 8 sub edi, edx - align 16 + align 4 convertloop: vmovdqu ymm0, [eax] vmovdqu ymm1, [eax + 32] @@ -3919,7 +4026,7 @@ void YUY2ToYRow_SSE2(const uint8* src_yuy2, pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff psrlw xmm5, 8 - align 16 + align 4 convertloop: movdqa xmm0, [eax] movdqa xmm1, [eax + 16] @@ -3950,7 +4057,7 @@ void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, psrlw xmm5, 8 sub edi, edx - align 16 + align 4 convertloop: movdqa xmm0, [eax] movdqa xmm1, [eax + 16] @@ -3992,7 +4099,7 @@ void YUY2ToUV422Row_SSE2(const uint8* src_yuy2, psrlw xmm5, 8 sub edi, edx - align 16 + align 4 convertloop: movdqa xmm0, [eax] movdqa xmm1, [eax + 16] @@ -4026,7 +4133,7 @@ void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2, pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff psrlw xmm5, 8 - align 16 + align 4 convertloop: movdqu xmm0, [eax] movdqu xmm1, [eax + 16] @@ -4057,7 +4164,7 @@ void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, int stride_yuy2, psrlw xmm5, 8 sub edi, edx - align 16 + align 4 convertloop: movdqu xmm0, [eax] movdqu xmm1, [eax + 16] @@ -4099,7 +4206,7 @@ void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2, psrlw xmm5, 8 sub edi, edx - align 16 + align 4 convertloop: movdqu xmm0, [eax] movdqu xmm1, [eax + 16] @@ -4131,7 +4238,7 @@ void UYVYToYRow_SSE2(const uint8* src_uyvy, mov edx, [esp + 8] // dst_y mov ecx, [esp + 12] // pix - align 16 + align 4 convertloop: movdqa xmm0, [eax] movdqa xmm1, [eax + 16] @@ -4162,7 +4269,7 @@ void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, psrlw xmm5, 8 sub edi, edx - align 16 + align 4 convertloop: movdqa xmm0, [eax] movdqa xmm1, [eax + 16] @@ -4204,7 +4311,7 @@ void UYVYToUV422Row_SSE2(const uint8* src_uyvy, psrlw xmm5, 8 sub edi, edx - align 16 + align 4 convertloop: movdqa xmm0, [eax] movdqa xmm1, [eax + 16] @@ -4236,7 +4343,7 @@ void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy, mov edx, [esp + 8] // dst_y mov ecx, [esp + 12] // pix - align 16 + align 4 convertloop: movdqu xmm0, [eax] movdqu xmm1, [eax + 16] @@ -4267,7 +4374,7 @@ void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy, psrlw xmm5, 8 sub edi, edx - align 16 + align 4 convertloop: movdqu xmm0, [eax] movdqu xmm1, [eax + 16] @@ -4309,7 
+4416,7 @@ void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy, psrlw xmm5, 8 sub edi, edx - align 16 + align 4 convertloop: movdqu xmm0, [eax] movdqu xmm1, [eax + 16] @@ -4479,7 +4586,7 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, mov esi, [esp + 4 + 8] // src_argb1 mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // width - pcmpeqb xmm7, xmm7 // generate constant 1 + pcmpeqb xmm7, xmm7 // generate constant 0x0001 psrlw xmm7, 15 pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff psrlw xmm6, 8 @@ -4624,13 +4731,12 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { mov eax, [esp + 4] // src_argb0 mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // width - sub edx, eax pcmpeqb xmm4, xmm4 // generate mask 0xff000000 pslld xmm4, 24 pcmpeqb xmm5, xmm5 // generate mask 0x00ffffff psrld xmm5, 8 - align 16 + align 4 convertloop: movdqa xmm0, [eax] // read 4 pixels punpcklbw xmm0, xmm0 // first 2 @@ -4643,6 +4749,7 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { pshuflw xmm2, xmm2, 0FFh pmulhuw xmm1, xmm2 // rgb * a movdqa xmm2, [eax] // alphas + lea eax, [eax + 16] psrlw xmm0, 8 pand xmm2, xmm4 psrlw xmm1, 8 @@ -4650,8 +4757,8 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { pand xmm0, xmm5 // keep original alphas por xmm0, xmm2 sub ecx, 4 - movdqa [eax + edx], xmm0 - lea eax, [eax + 16] + movdqa [edx], xmm0 + lea edx, [edx + 16] jg convertloop ret @@ -4674,33 +4781,33 @@ void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { mov eax, [esp + 4] // src_argb0 mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // width - sub edx, eax pcmpeqb xmm3, xmm3 // generate mask 0xff000000 pslld xmm3, 24 movdqa xmm4, kShuffleAlpha0 movdqa xmm5, kShuffleAlpha1 - align 16 + align 4 convertloop: - movdqa xmm0, [eax] // read 4 pixels + movdqu xmm0, [eax] // read 4 pixels pshufb xmm0, xmm4 // isolate first 2 alphas - movdqa xmm1, [eax] // read 4 pixels + movdqu xmm1, [eax] // read 4 pixels punpcklbw xmm1, xmm1 // first 2 pixel rgbs pmulhuw xmm0, xmm1 // rgb * a - movdqa xmm1, [eax] // read 4 pixels + movdqu xmm1, [eax] // read 4 pixels pshufb xmm1, xmm5 // isolate next 2 alphas - movdqa xmm2, [eax] // read 4 pixels + movdqu xmm2, [eax] // read 4 pixels punpckhbw xmm2, xmm2 // next 2 pixel rgbs pmulhuw xmm1, xmm2 // rgb * a - movdqa xmm2, [eax] // mask original alpha + movdqu xmm2, [eax] // mask original alpha + lea eax, [eax + 16] pand xmm2, xmm3 psrlw xmm0, 8 psrlw xmm1, 8 packuswb xmm0, xmm1 por xmm0, xmm2 // copy original alpha sub ecx, 4 - movdqa [eax + edx], xmm0 - lea eax, [eax + 16] + movdqu [edx], xmm0 + lea edx, [edx + 16] jg convertloop ret @@ -4727,7 +4834,7 @@ void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) { vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000 vpslld ymm5, ymm5, 24 - align 16 + align 4 convertloop: vmovdqu ymm6, [eax] // read 8 pixels. vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated. 
@@ -4764,11 +4871,10 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, mov eax, [esp + 8 + 4] // src_argb0 mov edx, [esp + 8 + 8] // dst_argb mov ecx, [esp + 8 + 12] // width - sub edx, eax - align 16 + align 4 convertloop: - movdqa xmm0, [eax] // read 4 pixels + movdqu xmm0, [eax] // read 4 pixels movzx esi, byte ptr [eax + 3] // first alpha movzx edi, byte ptr [eax + 7] // second alpha punpcklbw xmm0, xmm0 // first 2 @@ -4779,7 +4885,7 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, movlhps xmm2, xmm3 pmulhuw xmm0, xmm2 // rgb * a - movdqa xmm1, [eax] // read 4 pixels + movdqu xmm1, [eax] // read 4 pixels movzx esi, byte ptr [eax + 11] // third alpha movzx edi, byte ptr [eax + 15] // forth alpha punpckhbw xmm1, xmm1 // next 2 @@ -4789,11 +4895,12 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words movlhps xmm2, xmm3 pmulhuw xmm1, xmm2 // rgb * a + lea eax, [eax + 16] packuswb xmm0, xmm1 sub ecx, 4 - movdqa [eax + edx], xmm0 - lea eax, [eax + 16] + movdqu [edx], xmm0 + lea edx, [edx + 16] jg convertloop pop edi pop esi @@ -4821,7 +4928,7 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, sub edx, eax vmovdqa ymm4, kUnattenShuffleAlpha_AVX2 - align 16 + align 4 convertloop: vmovdqu ymm6, [eax] // read 8 pixels. vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xffffffff for gather. @@ -4860,7 +4967,7 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, push esi push edi - align 16 + align 4 convertloop: // replace VPGATHER movzx esi, byte ptr [eax + 3] // alpha0 @@ -4922,9 +5029,8 @@ void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { mov ecx, [esp + 12] /* width */ movdqa xmm4, kARGBToYJ movdqa xmm5, kAddYJ64 - sub edx, eax - align 16 + align 4 convertloop: movdqa xmm0, [eax] // G movdqa xmm1, [eax + 16] @@ -4936,6 +5042,7 @@ void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { packuswb xmm0, xmm0 // 8 G bytes movdqa xmm2, [eax] // A movdqa xmm3, [eax + 16] + lea eax, [eax + 32] psrld xmm2, 24 psrld xmm3, 24 packuswb xmm2, xmm3 @@ -4947,9 +5054,9 @@ void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { punpcklwd xmm0, xmm3 // GGGA first 4 punpckhwd xmm1, xmm3 // GGGA next 4 sub ecx, 8 - movdqa [eax + edx], xmm0 - movdqa [eax + edx + 16], xmm1 - lea eax, [eax + 32] + movdqa [edx], xmm0 + movdqa [edx + 16], xmm1 + lea edx, [edx + 32] jg convertloop ret } @@ -4983,7 +5090,7 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) { movdqa xmm3, kARGBToSepiaG movdqa xmm4, kARGBToSepiaR - align 16 + align 4 convertloop: movdqa xmm0, [eax] // B movdqa xmm6, [eax + 16] @@ -5033,111 +5140,65 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) { // TODO(fbarchard): packuswbs only use half of the reg. To make RGBA, combine R // and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd. 
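The rewrite of ARGBColorMatrixRow_SSSE3 below adds a separate source pointer, feeds alpha through the matrix as well, and scales with psraw 6 instead of 7. A scalar sketch of the new behaviour (illustrative; the coefficient layout is the row-per-output-channel order implied by the pshufd broadcasts):

#include <stdint.h>

static uint8_t Clamp255(int v) {
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

// Each output channel is a signed 8-bit weighted sum of the four input
// channels, scaled by 1/64 (psraw 6) and saturated to a byte.
static void ARGBColorMatrixRow_Sketch(const uint8_t* src_argb,
                                      uint8_t* dst_argb,
                                      const int8_t* matrix_argb, int width) {
  for (int i = 0; i < width; ++i) {
    for (int c = 0; c < 4; ++c) {
      int sum = src_argb[0] * matrix_argb[c * 4 + 0] +
                src_argb[1] * matrix_argb[c * 4 + 1] +
                src_argb[2] * matrix_argb[c * 4 + 2] +
                src_argb[3] * matrix_argb[c * 4 + 3];
      dst_argb[c] = Clamp255(sum >> 6);
    }
    src_argb += 4;
    dst_argb += 4;
  }
}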
__declspec(naked) __declspec(align(16)) -void ARGBColorMatrixRow_SSSE3(uint8* dst_argb, const int8* matrix_argb, - int width) { +void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb, + const int8* matrix_argb, int width) { __asm { - mov eax, [esp + 4] /* dst_argb */ - mov edx, [esp + 8] /* matrix_argb */ - mov ecx, [esp + 12] /* width */ - movd xmm2, [edx] - movd xmm3, [edx + 4] - movd xmm4, [edx + 8] - pshufd xmm2, xmm2, 0 - pshufd xmm3, xmm3, 0 - pshufd xmm4, xmm4, 0 + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_argb */ + mov ecx, [esp + 12] /* matrix_argb */ + movdqu xmm5, [ecx] + pshufd xmm2, xmm5, 0x00 + pshufd xmm3, xmm5, 0x55 + pshufd xmm4, xmm5, 0xaa + pshufd xmm5, xmm5, 0xff + mov ecx, [esp + 16] /* width */ - align 16 + align 4 convertloop: movdqa xmm0, [eax] // B - movdqa xmm6, [eax + 16] + movdqa xmm7, [eax + 16] pmaddubsw xmm0, xmm2 - pmaddubsw xmm6, xmm2 - movdqa xmm5, [eax] // G + pmaddubsw xmm7, xmm2 + movdqa xmm6, [eax] // G movdqa xmm1, [eax + 16] - pmaddubsw xmm5, xmm3 + pmaddubsw xmm6, xmm3 pmaddubsw xmm1, xmm3 - phaddsw xmm0, xmm6 // B - phaddsw xmm5, xmm1 // G - psraw xmm0, 7 // B - psraw xmm5, 7 // G + phaddsw xmm0, xmm7 // B + phaddsw xmm6, xmm1 // G + psraw xmm0, 6 // B + psraw xmm6, 6 // G packuswb xmm0, xmm0 // 8 B values - packuswb xmm5, xmm5 // 8 G values - punpcklbw xmm0, xmm5 // 8 BG values - movdqa xmm5, [eax] // R - movdqa xmm1, [eax + 16] - pmaddubsw xmm5, xmm4 + packuswb xmm6, xmm6 // 8 G values + punpcklbw xmm0, xmm6 // 8 BG values + movdqa xmm1, [eax] // R + movdqa xmm7, [eax + 16] pmaddubsw xmm1, xmm4 - phaddsw xmm5, xmm1 - psraw xmm5, 7 - packuswb xmm5, xmm5 // 8 R values + pmaddubsw xmm7, xmm4 + phaddsw xmm1, xmm7 // R movdqa xmm6, [eax] // A - movdqa xmm1, [eax + 16] - psrld xmm6, 24 - psrld xmm1, 24 - packuswb xmm6, xmm1 + movdqa xmm7, [eax + 16] + pmaddubsw xmm6, xmm5 + pmaddubsw xmm7, xmm5 + phaddsw xmm6, xmm7 // A + psraw xmm1, 6 // R + psraw xmm6, 6 // A + packuswb xmm1, xmm1 // 8 R values packuswb xmm6, xmm6 // 8 A values - movdqa xmm1, xmm0 // Weave BG, RA together - punpcklbw xmm5, xmm6 // 8 RA values - punpcklwd xmm0, xmm5 // BGRA first 4 - punpckhwd xmm1, xmm5 // BGRA next 4 + punpcklbw xmm1, xmm6 // 8 RA values + movdqa xmm6, xmm0 // Weave BG, RA together + punpcklwd xmm0, xmm1 // BGRA first 4 + punpckhwd xmm6, xmm1 // BGRA next 4 sub ecx, 8 - movdqa [eax], xmm0 - movdqa [eax + 16], xmm1 + movdqa [edx], xmm0 + movdqa [edx + 16], xmm6 lea eax, [eax + 32] + lea edx, [edx + 32] jg convertloop ret } } #endif // HAS_ARGBCOLORMATRIXROW_SSSE3 -#ifdef HAS_ARGBCOLORTABLEROW_X86 -// Tranform ARGB pixels with color table. 
-__declspec(naked) __declspec(align(16)) -void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, - int width) { - __asm { - push ebx - push esi - push edi - push ebp - mov eax, [esp + 16 + 4] /* dst_argb */ - mov edi, [esp + 16 + 8] /* table_argb */ - mov ecx, [esp + 16 + 12] /* width */ - xor ebx, ebx - xor edx, edx - - align 16 - convertloop: - mov ebp, dword ptr [eax] // BGRA - mov esi, ebp - and ebp, 255 - shr esi, 8 - and esi, 255 - mov bl, [edi + ebp * 4 + 0] // B - mov dl, [edi + esi * 4 + 1] // G - mov ebp, dword ptr [eax] // BGRA - mov esi, ebp - shr ebp, 16 - shr esi, 24 - and ebp, 255 - mov [eax], bl - mov [eax + 1], dl - mov bl, [edi + ebp * 4 + 2] // R - mov dl, [edi + esi * 4 + 3] // A - mov [eax + 2], bl - mov [eax + 3], dl - lea eax, [eax + 4] - sub ecx, 1 - jg convertloop - pop ebp - pop edi - pop esi - pop ebx - ret - } -} -#endif // HAS_ARGBCOLORTABLEROW_X86 - #ifdef HAS_ARGBQUANTIZEROW_SSE2 // Quantize 4 ARGB pixels (16 bytes). // Aligned to 16 bytes. @@ -5160,7 +5221,7 @@ void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size, pcmpeqb xmm6, xmm6 // generate mask 0xff000000 pslld xmm6, 24 - align 16 + align 4 convertloop: movdqa xmm0, [eax] // read 4 pixels punpcklbw xmm0, xmm5 // first 2 pixels @@ -5196,13 +5257,13 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width, mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // width movd xmm2, [esp + 16] // value - sub edx, eax punpcklbw xmm2, xmm2 punpcklqdq xmm2, xmm2 - align 16 + align 4 convertloop: movdqa xmm0, [eax] // read 4 pixels + lea eax, [eax + 16] movdqa xmm1, xmm0 punpcklbw xmm0, xmm0 // first 2 punpckhbw xmm1, xmm1 // next 2 @@ -5212,8 +5273,8 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width, psrlw xmm1, 8 packuswb xmm0, xmm1 sub ecx, 4 - movdqa [eax + edx], xmm0 - lea eax, [eax + 16] + movdqa [edx], xmm0 + lea edx, [edx + 16] jg convertloop ret @@ -5233,25 +5294,25 @@ void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // width pxor xmm5, xmm5 // constant 0 - sub esi, eax - sub edx, eax - align 16 + align 4 convertloop: movdqu xmm0, [eax] // read 4 pixels from src_argb0 - movdqu xmm2, [eax + esi] // read 4 pixels from src_argb1 + movdqu xmm2, [esi] // read 4 pixels from src_argb1 movdqu xmm1, xmm0 movdqu xmm3, xmm2 - punpcklbw xmm0, xmm0 // first 2 - punpckhbw xmm1, xmm1 // next 2 - punpcklbw xmm2, xmm5 // first 2 - punpckhbw xmm3, xmm5 // next 2 - pmulhuw xmm0, xmm2 // src_argb0 * src_argb1 first 2 - pmulhuw xmm1, xmm3 // src_argb0 * src_argb1 next 2 + punpcklbw xmm0, xmm0 // first 2 + punpckhbw xmm1, xmm1 // next 2 + punpcklbw xmm2, xmm5 // first 2 + punpckhbw xmm3, xmm5 // next 2 + pmulhuw xmm0, xmm2 // src_argb0 * src_argb1 first 2 + pmulhuw xmm1, xmm3 // src_argb0 * src_argb1 next 2 + lea eax, [eax + 16] + lea esi, [esi + 16] packuswb xmm0, xmm1 sub ecx, 4 - movdqu [eax + edx], xmm0 - lea eax, [eax + 16] + movdqu [edx], xmm0 + lea edx, [edx + 16] jg convertloop pop esi @@ -5272,20 +5333,20 @@ void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, mov esi, [esp + 4 + 8] // src_argb1 mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // width - sub esi, eax - sub edx, eax sub ecx, 4 jl convertloop49 - align 16 + align 4 convertloop4: movdqu xmm0, [eax] // read 4 pixels from src_argb0 - movdqu xmm1, [eax + esi] // read 4 pixels from src_argb1 + lea eax, [eax + 16] + movdqu xmm1, [esi] // read 4 pixels from src_argb1 + lea 
esi, [esi + 16] paddusb xmm0, xmm1 // src_argb0 + src_argb1 sub ecx, 4 - movdqu [eax + edx], xmm0 - lea eax, [eax + 16] + movdqu [edx], xmm0 + lea edx, [edx + 16] jge convertloop4 convertloop49: @@ -5294,11 +5355,13 @@ void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, convertloop1: movd xmm0, [eax] // read 1 pixels from src_argb0 - movd xmm1, [eax + esi] // read 1 pixels from src_argb1 + lea eax, [eax + 4] + movd xmm1, [esi] // read 1 pixels from src_argb1 + lea esi, [esi + 4] paddusb xmm0, xmm1 // src_argb0 + src_argb1 sub ecx, 1 - movd [eax + edx], xmm0 - lea eax, [eax + 4] + movd [edx], xmm0 + lea edx, [edx + 4] jge convertloop1 convertloop19: @@ -5319,17 +5382,17 @@ void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, mov esi, [esp + 4 + 8] // src_argb1 mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // width - sub esi, eax - sub edx, eax - align 16 + align 4 convertloop: movdqu xmm0, [eax] // read 4 pixels from src_argb0 - movdqu xmm1, [eax + esi] // read 4 pixels from src_argb1 + lea eax, [eax + 16] + movdqu xmm1, [esi] // read 4 pixels from src_argb1 + lea esi, [esi + 16] psubusb xmm0, xmm1 // src_argb0 - src_argb1 sub ecx, 4 - movdqu [eax + edx], xmm0 - lea eax, [eax + 16] + movdqu [edx], xmm0 + lea edx, [edx + 16] jg convertloop pop esi @@ -5349,14 +5412,14 @@ void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, mov esi, [esp + 4 + 8] // src_argb1 mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // width - vpxor ymm5, ymm5, ymm5 // constant 0 - sub esi, eax - sub edx, eax + vpxor ymm5, ymm5, ymm5 // constant 0 - align 16 + align 4 convertloop: vmovdqu ymm1, [eax] // read 8 pixels from src_argb0 - vmovdqu ymm3, [eax + esi] // read 8 pixels from src_argb1 + lea eax, [eax + 32] + vmovdqu ymm3, [esi] // read 8 pixels from src_argb1 + lea esi, [esi + 32] vpunpcklbw ymm0, ymm1, ymm1 // low 4 vpunpckhbw ymm1, ymm1, ymm1 // high 4 vpunpcklbw ymm2, ymm3, ymm5 // low 4 @@ -5364,8 +5427,8 @@ void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, vpmulhuw ymm0, ymm0, ymm2 // src_argb0 * src_argb1 low 4 vpmulhuw ymm1, ymm1, ymm3 // src_argb0 * src_argb1 high 4 vpackuswb ymm0, ymm0, ymm1 - vmovdqu [eax + edx], ymm0 - lea eax, [eax + 32] + vmovdqu [edx], ymm0 + lea edx, [edx + 32] sub ecx, 8 jg convertloop @@ -5387,15 +5450,15 @@ void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, mov esi, [esp + 4 + 8] // src_argb1 mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // width - sub esi, eax - sub edx, eax - align 16 + align 4 convertloop: vmovdqu ymm0, [eax] // read 8 pixels from src_argb0 - vpaddusb ymm0, ymm0, [eax + esi] // add 8 pixels from src_argb1 - vmovdqu [eax + edx], ymm0 lea eax, [eax + 32] + vpaddusb ymm0, ymm0, [esi] // add 8 pixels from src_argb1 + lea esi, [esi + 32] + vmovdqu [edx], ymm0 + lea edx, [edx + 32] sub ecx, 8 jg convertloop @@ -5417,15 +5480,15 @@ void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, mov esi, [esp + 4 + 8] // src_argb1 mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // width - sub esi, eax - sub edx, eax - align 16 + align 4 convertloop: vmovdqu ymm0, [eax] // read 8 pixels from src_argb0 - vpsubusb ymm0, ymm0, [eax + esi] // src_argb0 - src_argb1 - vmovdqu [eax + edx], ymm0 lea eax, [eax + 32] + vpsubusb ymm0, ymm0, [esi] // src_argb0 - src_argb1 + lea esi, [esi + 32] + vmovdqu [edx], ymm0 + lea edx, [edx + 32] sub ecx, 8 jg convertloop @@ -5436,14 +5499,14 @@ void ARGBSubtractRow_AVX2(const 
uint8* src_argb0, const uint8* src_argb1, } #endif // HAS_ARGBSUBTRACTROW_AVX2 -#ifdef HAS_SOBELXROW_SSSE3 +#ifdef HAS_SOBELXROW_SSE2 // SobelX as a matrix is // -1 0 1 // -2 0 2 // -1 0 1 __declspec(naked) __declspec(align(16)) -void SobelXRow_SSSE3(const uint8* src_y0, const uint8* src_y1, - const uint8* src_y2, uint8* dst_sobelx, int width) { +void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1, + const uint8* src_y2, uint8* dst_sobelx, int width) { __asm { push esi push edi @@ -5457,7 +5520,7 @@ void SobelXRow_SSSE3(const uint8* src_y0, const uint8* src_y1, sub edx, eax pxor xmm5, xmm5 // constant 0 - align 16 + align 4 convertloop: movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0] movq xmm1, qword ptr [eax + 2] // read 8 pixels from src_y0[2] @@ -5477,7 +5540,9 @@ void SobelXRow_SSSE3(const uint8* src_y0, const uint8* src_y1, paddw xmm0, xmm2 paddw xmm0, xmm1 paddw xmm0, xmm1 - pabsw xmm0, xmm0 // SSSE3. Could use SSE2 psubusw twice instead. + pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). SSSE3 could use pabsw + psubw xmm1, xmm0 + pmaxsw xmm0, xmm1 packuswb xmm0, xmm0 sub ecx, 8 movq qword ptr [eax + edx], xmm0 @@ -5489,16 +5554,16 @@ void SobelXRow_SSSE3(const uint8* src_y0, const uint8* src_y1, ret } } -#endif // HAS_SOBELXROW_SSSE3 +#endif // HAS_SOBELXROW_SSE2 -#ifdef HAS_SOBELYROW_SSSE3 +#ifdef HAS_SOBELYROW_SSE2 // SobelY as a matrix is // -1 -2 -1 // 0 0 0 // 1 2 1 __declspec(naked) __declspec(align(16)) -void SobelYRow_SSSE3(const uint8* src_y0, const uint8* src_y1, - uint8* dst_sobely, int width) { +void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1, + uint8* dst_sobely, int width) { __asm { push esi mov eax, [esp + 4 + 4] // src_y0 @@ -5509,7 +5574,7 @@ void SobelYRow_SSSE3(const uint8* src_y0, const uint8* src_y1, sub edx, eax pxor xmm5, xmm5 // constant 0 - align 16 + align 4 convertloop: movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0] movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0] @@ -5529,7 +5594,9 @@ void SobelYRow_SSSE3(const uint8* src_y0, const uint8* src_y1, paddw xmm0, xmm2 paddw xmm0, xmm1 paddw xmm0, xmm1 - pabsw xmm0, xmm0 // SSSE3. Could use SSE2 psubusw twice instead. + pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). SSSE3 could use pabsw + psubw xmm1, xmm0 + pmaxsw xmm0, xmm1 packuswb xmm0, xmm0 sub ecx, 8 movq qword ptr [eax + edx], xmm0 @@ -5540,7 +5607,7 @@ void SobelYRow_SSSE3(const uint8* src_y0, const uint8* src_y1, ret } } -#endif // HAS_SOBELYROW_SSSE3 +#endif // HAS_SOBELYROW_SSE2 #ifdef HAS_SOBELROW_SSE2 // Adds Sobel X and Sobel Y and stores Sobel into ARGB. @@ -5550,7 +5617,7 @@ void SobelYRow_SSSE3(const uint8* src_y0, const uint8* src_y1, // B = Sobel __declspec(naked) __declspec(align(16)) void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_argb, int width) { + uint8* dst_argb, int width) { __asm { push esi mov eax, [esp + 4 + 4] // src_sobelx @@ -5561,7 +5628,7 @@ void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, pcmpeqb xmm5, xmm5 // alpha 255 pslld xmm5, 24 // 0xff000000 - align 16 + align 4 convertloop: movdqa xmm0, [eax] // read 16 pixels src_sobelx movdqa xmm1, [eax + esi] // read 16 pixels src_sobely @@ -5594,6 +5661,36 @@ void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, } #endif // HAS_SOBELROW_SSE2 +#ifdef HAS_SOBELTOPLANEROW_SSE2 +// Adds Sobel X and Sobel Y and stores Sobel into a plane. 
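The SobelXRow/SobelYRow conversions above trade the SSSE3 pabsw for an SSE2 absolute value (abs(x) = max(x, -x) built from psubw and pmaxsw), and the comment just above introduces SobelToPlaneRow_SSE2, which saturate-adds the two gradients into a single plane. A scalar sketch of the row math the assembly implies; the names are illustrative and edge handling is left to the callers, as in the real code.

#include <stdint.h>
#include <stdlib.h>

static inline uint8_t Clamp255u(int v) { return (uint8_t)(v > 255 ? 255 : v); }

void SobelXRow_Sketch(const uint8_t* y0, const uint8_t* y1, const uint8_t* y2,
                      uint8_t* dst_sobelx, int width) {
  for (int i = 0; i < width; ++i) {
    int a = y0[i] - y0[i + 2];
    int b = y1[i] - y1[i + 2];   // middle row weighted by 2
    int c = y2[i] - y2[i + 2];
    dst_sobelx[i] = Clamp255u(abs(a + b + b + c));
  }
}

void SobelYRow_Sketch(const uint8_t* y0, const uint8_t* y1,
                      uint8_t* dst_sobely, int width) {
  for (int i = 0; i < width; ++i) {
    int a = y0[i]     - y1[i];
    int b = y0[i + 1] - y1[i + 1];  // middle column weighted by 2
    int c = y0[i + 2] - y1[i + 2];
    dst_sobely[i] = Clamp255u(abs(a + b + b + c));
  }
}

// SobelToPlaneRow: saturating add of the two gradients, one byte per pixel.
void SobelToPlaneRow_Sketch(const uint8_t* sobelx, const uint8_t* sobely,
                            uint8_t* dst_y, int width) {
  for (int i = 0; i < width; ++i)
    dst_y[i] = Clamp255u(sobelx[i] + sobely[i]);
}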
+__declspec(naked) __declspec(align(16)) +void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_y, int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_sobelx + mov esi, [esp + 4 + 8] // src_sobely + mov edx, [esp + 4 + 12] // dst_argb + mov ecx, [esp + 4 + 16] // width + sub esi, eax + + align 4 + convertloop: + movdqa xmm0, [eax] // read 16 pixels src_sobelx + movdqa xmm1, [eax + esi] // read 16 pixels src_sobely + lea eax, [eax + 16] + paddusb xmm0, xmm1 // sobel = sobelx + sobely + sub ecx, 16 + movdqa [edx], xmm0 + lea edx, [edx + 16] + jg convertloop + + pop esi + ret + } +} +#endif // HAS_SOBELTOPLANEROW_SSE2 + #ifdef HAS_SOBELXYROW_SSE2 // Mixes Sobel X, Sobel Y and Sobel into ARGB. // A = 255 @@ -5610,9 +5707,9 @@ void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // width sub esi, eax - pcmpeqb xmm5, xmm5 // alpha 255 + pcmpeqb xmm5, xmm5 // alpha 255 - align 16 + align 4 convertloop: movdqa xmm0, [eax] // read 16 pixels src_sobelx movdqa xmm1, [eax + esi] // read 16 pixels src_sobely @@ -5666,15 +5763,70 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft, mov eax, topleft // eax topleft mov esi, botleft // esi botleft mov edx, width - movd xmm4, area + movd xmm5, area mov edi, dst mov ecx, count - cvtdq2ps xmm4, xmm4 - rcpss xmm4, xmm4 // 1.0f / area + cvtdq2ps xmm5, xmm5 + rcpss xmm4, xmm5 // 1.0f / area pshufd xmm4, xmm4, 0 sub ecx, 4 jl l4b + cmp area, 128 // 128 pixels will not overflow 15 bits. + ja l4 + + pshufd xmm5, xmm5, 0 // area + pcmpeqb xmm6, xmm6 // constant of 65536.0 - 1 = 65535.0 + psrld xmm6, 16 + cvtdq2ps xmm6, xmm6 + addps xmm5, xmm6 // (65536.0 + area - 1) + mulps xmm5, xmm4 // (65536.0 + area - 1) * 1 / area + cvtps2dq xmm5, xmm5 // 0.16 fixed point + packssdw xmm5, xmm5 // 16 bit shorts + + // 4 pixel loop small blocks. + align 4 + s4: + // top left + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + movdqa xmm2, [eax + 32] + movdqa xmm3, [eax + 48] + + // - top right + psubd xmm0, [eax + edx * 4] + psubd xmm1, [eax + edx * 4 + 16] + psubd xmm2, [eax + edx * 4 + 32] + psubd xmm3, [eax + edx * 4 + 48] + lea eax, [eax + 64] + + // - bottom left + psubd xmm0, [esi] + psubd xmm1, [esi + 16] + psubd xmm2, [esi + 32] + psubd xmm3, [esi + 48] + + // + bottom right + paddd xmm0, [esi + edx * 4] + paddd xmm1, [esi + edx * 4 + 16] + paddd xmm2, [esi + edx * 4 + 32] + paddd xmm3, [esi + edx * 4 + 48] + lea esi, [esi + 64] + + packssdw xmm0, xmm1 // pack 4 pixels into 2 registers + packssdw xmm2, xmm3 + + pmulhuw xmm0, xmm5 + pmulhuw xmm2, xmm5 + + packuswb xmm0, xmm2 + movdqu [edi], xmm0 + lea edi, [edi + 16] + sub ecx, 4 + jge s4 + + jmp l4b + // 4 pixel loop align 4 l4: @@ -5761,7 +5913,6 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum, mov edx, cumsum mov esi, previous_cumsum mov ecx, width - sub esi, edx pxor xmm0, xmm0 pxor xmm1, xmm1 @@ -5788,19 +5939,20 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum, punpckhwd xmm5, xmm1 paddd xmm0, xmm2 - movdqa xmm2, [edx + esi] // previous row above. + movdqa xmm2, [esi] // previous row above. 
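The CumulativeSumToAverageRow_SSE2 change earlier in this hunk adds a fast path for boxes of up to 128 pixels: the per-channel box sum taken from the integral image (top-left - top-right - bottom-left + bottom-right) still fits in 15 bits, so the divide by area becomes a 0.16 fixed-point multiply by (65536 + area - 1) / area followed by pmulhuw. A scalar sketch, with the four box corners passed explicitly to sidestep the pointer arithmetic; the name and signature are illustrative only.

#include <stdint.h>

void BoxAverageRow_Sketch(const int32_t* tl, const int32_t* tr,
                          const int32_t* bl, const int32_t* br,
                          int area, uint8_t* dst, int count) {
  // Rounded-up reciprocal in 0.16 fixed point, like the rcpss-based setup.
  const uint32_t scale = (65536u + (uint32_t)area - 1) / (uint32_t)area;
  for (int x = 0; x < count * 4; ++x) {   // 4 channels per pixel
    int32_t sum = tl[x] - tr[x] - bl[x] + br[x];
    dst[x] = (uint8_t)((uint32_t)sum * scale >> 16);  // pmulhuw equivalent
  }
}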
paddd xmm2, xmm0 paddd xmm0, xmm3 - movdqa xmm3, [edx + esi + 16] + movdqa xmm3, [esi + 16] paddd xmm3, xmm0 paddd xmm0, xmm4 - movdqa xmm4, [edx + esi + 32] + movdqa xmm4, [esi + 32] paddd xmm4, xmm0 paddd xmm0, xmm5 - movdqa xmm5, [edx + esi + 48] + movdqa xmm5, [esi + 48] + lea esi, [esi + 64] paddd xmm5, xmm0 movdqa [edx], xmm2 @@ -5824,7 +5976,8 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum, punpcklbw xmm2, xmm1 punpcklwd xmm2, xmm1 paddd xmm0, xmm2 - movdqu xmm2, [edx + esi] + movdqu xmm2, [esi] + lea esi, [esi + 16] paddd xmm2, xmm0 movdqu [edx], xmm2 lea edx, [edx + 16] @@ -5845,7 +5998,7 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, __asm { push esi push edi - mov eax, [esp + 12] // src_argb + mov eax, [esp + 12] // src_argb mov esi, [esp + 16] // stride mov edx, [esp + 20] // dst_argb mov ecx, [esp + 24] // pointer to uv_dudv @@ -5923,6 +6076,108 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, } #endif // HAS_ARGBAFFINEROW_SSE2 +#ifdef HAS_INTERPOLATEROW_AVX2 +// Bilinear filter 16x2 -> 16x1 +__declspec(naked) __declspec(align(16)) +void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) { + __asm { + push esi + push edi + mov edi, [esp + 8 + 4] // dst_ptr + mov esi, [esp + 8 + 8] // src_ptr + mov edx, [esp + 8 + 12] // src_stride + mov ecx, [esp + 8 + 16] // dst_width + mov eax, [esp + 8 + 20] // source_y_fraction (0..255) + shr eax, 1 + // Dispatch to specialized filters if applicable. + cmp eax, 0 + je xloop100 // 0 / 128. Blend 100 / 0. + sub edi, esi + cmp eax, 32 + je xloop75 // 32 / 128 is 0.25. Blend 75 / 25. + cmp eax, 64 + je xloop50 // 64 / 128 is 0.50. Blend 50 / 50. + cmp eax, 96 + je xloop25 // 96 / 128 is 0.75. Blend 25 / 75. + + vmovd xmm0, eax // high fraction 0..127 + neg eax + add eax, 128 + vmovd xmm5, eax // low fraction 128..1 + vpunpcklbw xmm5, xmm5, xmm0 + vpunpcklwd xmm5, xmm5, xmm5 + vpxor ymm0, ymm0, ymm0 + vpermd ymm5, ymm0, ymm5 + + align 4 + xloop: + vmovdqu ymm0, [esi] + vmovdqu ymm2, [esi + edx] + vpunpckhbw ymm1, ymm0, ymm2 // mutates + vpunpcklbw ymm0, ymm0, ymm2 // mutates + vpmaddubsw ymm0, ymm0, ymm5 + vpmaddubsw ymm1, ymm1, ymm5 + vpsrlw ymm0, ymm0, 7 + vpsrlw ymm1, ymm1, 7 + vpackuswb ymm0, ymm0, ymm1 // unmutates + sub ecx, 32 + vmovdqu [esi + edi], ymm0 + lea esi, [esi + 32] + jg xloop + jmp xloop99 + + // Blend 25 / 75. + align 4 + xloop25: + vmovdqu ymm0, [esi] + vpavgb ymm0, ymm0, [esi + edx] + vpavgb ymm0, ymm0, [esi + edx] + sub ecx, 32 + vmovdqu [esi + edi], ymm0 + lea esi, [esi + 32] + jg xloop25 + jmp xloop99 + + // Blend 50 / 50. + align 4 + xloop50: + vmovdqu ymm0, [esi] + vpavgb ymm0, ymm0, [esi + edx] + sub ecx, 32 + vmovdqu [esi + edi], ymm0 + lea esi, [esi + 32] + jg xloop50 + jmp xloop99 + + // Blend 75 / 25. + align 4 + xloop75: + vmovdqu ymm0, [esi + edx] + vpavgb ymm0, ymm0, [esi] + vpavgb ymm0, ymm0, [esi] + sub ecx, 32 + vmovdqu [esi + edi], ymm0 + lea esi, [esi + 32] + jg xloop75 + jmp xloop99 + + // Blend 100 / 0 - Copy row unchanged. 
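The new InterpolateRow_AVX2 above (and the SSSE3/SSE2 variants that follow) blends two source rows with an 8-bit source_y_fraction, halving it to a 7-bit weight and dispatching to cheap pavgb paths for the 25/50/75 percent blends; a fraction of 0 simply copies the first row. A scalar sketch of the general path, with an illustrative name and approximate rounding.

#include <stdint.h>
#include <stddef.h>
#include <string.h>

void InterpolateRow_Sketch(uint8_t* dst, const uint8_t* src,
                           ptrdiff_t src_stride, int width,
                           int source_y_fraction) {
  const int f1 = source_y_fraction >> 1;   // 0..127, like "shr eax, 1" above
  const int f0 = 128 - f1;
  if (f1 == 0) {                           // Blend 100 / 0 - copy unchanged.
    memcpy(dst, src, (size_t)width);
    return;
  }
  const uint8_t* src1 = src + src_stride;
  // The 25/50/75 special cases collapse to one or two rounding averages in
  // the SIMD code; this general formula covers them as well.
  for (int x = 0; x < width; ++x)
    dst[x] = (uint8_t)((src[x] * f0 + src1[x] * f1) >> 7);
}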
+ align 4 + xloop100: + rep movsb + + xloop99: + pop edi + pop esi + vzeroupper + ret + } +} +#endif // HAS_INTERPOLATEROW_AVX2 + +#ifdef HAS_INTERPOLATEROW_SSSE3 // Bilinear filter 16x2 -> 16x1 __declspec(naked) __declspec(align(16)) void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, @@ -5956,7 +6211,7 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, punpcklwd xmm5, xmm5 pshufd xmm5, xmm5, 0 - align 16 + align 4 xloop: movdqa xmm0, [esi] movdqa xmm2, [esi + edx] @@ -5975,7 +6230,7 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, jmp xloop99 // Blend 25 / 75. - align 16 + align 4 xloop25: movdqa xmm0, [esi] movdqa xmm1, [esi + edx] @@ -5988,7 +6243,7 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, jmp xloop99 // Blend 50 / 50. - align 16 + align 4 xloop50: movdqa xmm0, [esi] movdqa xmm1, [esi + edx] @@ -6000,7 +6255,7 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, jmp xloop99 // Blend 75 / 25. - align 16 + align 4 xloop75: movdqa xmm1, [esi] movdqa xmm0, [esi + edx] @@ -6013,7 +6268,7 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, jmp xloop99 // Blend 100 / 0 - Copy row unchanged. - align 16 + align 4 xloop100: movdqa xmm0, [esi] sub ecx, 16 @@ -6027,6 +6282,7 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, ret } } +#endif // HAS_INTERPOLATEROW_SSSE3 #ifdef HAS_INTERPOLATEROW_SSE2 // Bilinear filter 16x2 -> 16x1 @@ -6061,7 +6317,7 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr, punpcklqdq xmm5, xmm5 pxor xmm4, xmm4 - align 16 + align 4 xloop: movdqa xmm0, [esi] // row0 movdqa xmm2, [esi + edx] // row1 @@ -6087,7 +6343,7 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr, jmp xloop99 // Blend 25 / 75. - align 16 + align 4 xloop25: movdqa xmm0, [esi] movdqa xmm1, [esi + edx] @@ -6100,7 +6356,7 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr, jmp xloop99 // Blend 50 / 50. - align 16 + align 4 xloop50: movdqa xmm0, [esi] movdqa xmm1, [esi + edx] @@ -6112,7 +6368,7 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr, jmp xloop99 // Blend 75 / 25. - align 16 + align 4 xloop75: movdqa xmm1, [esi] movdqa xmm0, [esi + edx] @@ -6125,7 +6381,7 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr, jmp xloop99 // Blend 100 / 0 - Copy row unchanged. - align 16 + align 4 xloop100: movdqa xmm0, [esi] sub ecx, 16 @@ -6174,7 +6430,7 @@ void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr, punpcklwd xmm5, xmm5 pshufd xmm5, xmm5, 0 - align 16 + align 4 xloop: movdqu xmm0, [esi] movdqu xmm2, [esi + edx] @@ -6193,7 +6449,7 @@ void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr, jmp xloop99 // Blend 25 / 75. - align 16 + align 4 xloop25: movdqu xmm0, [esi] movdqu xmm1, [esi + edx] @@ -6206,7 +6462,7 @@ void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr, jmp xloop99 // Blend 50 / 50. - align 16 + align 4 xloop50: movdqu xmm0, [esi] movdqu xmm1, [esi + edx] @@ -6218,7 +6474,7 @@ void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr, jmp xloop99 // Blend 75 / 25. - align 16 + align 4 xloop75: movdqu xmm1, [esi] movdqu xmm0, [esi + edx] @@ -6231,7 +6487,7 @@ void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr, jmp xloop99 // Blend 100 / 0 - Copy row unchanged. 
- align 16 + align 4 xloop100: movdqu xmm0, [esi] sub ecx, 16 @@ -6279,7 +6535,7 @@ void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr, punpcklqdq xmm5, xmm5 pxor xmm4, xmm4 - align 16 + align 4 xloop: movdqu xmm0, [esi] // row0 movdqu xmm2, [esi + edx] // row1 @@ -6305,7 +6561,7 @@ void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr, jmp xloop99 // Blend 25 / 75. - align 16 + align 4 xloop25: movdqu xmm0, [esi] movdqu xmm1, [esi + edx] @@ -6318,7 +6574,7 @@ void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr, jmp xloop99 // Blend 50 / 50. - align 16 + align 4 xloop50: movdqu xmm0, [esi] movdqu xmm1, [esi + edx] @@ -6330,7 +6586,7 @@ void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr, jmp xloop99 // Blend 75 / 25. - align 16 + align 4 xloop75: movdqu xmm1, [esi] movdqu xmm0, [esi + edx] @@ -6343,7 +6599,7 @@ void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr, jmp xloop99 // Blend 100 / 0 - Copy row unchanged. - align 16 + align 4 xloop100: movdqu xmm0, [esi] sub ecx, 16 @@ -6370,7 +6626,7 @@ void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride, mov ecx, [esp + 4 + 16] // pix sub edi, eax - align 16 + align 4 convertloop: movdqa xmm0, [eax] pavgb xmm0, [eax + edx] @@ -6395,7 +6651,7 @@ void HalfRow_AVX2(const uint8* src_uv, int src_uv_stride, mov ecx, [esp + 4 + 16] // pix sub edi, eax - align 16 + align 4 convertloop: vmovdqu ymm0, [eax] vpavgb ymm0, ymm0, [eax + edx] @@ -6421,7 +6677,7 @@ void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer, mov ecx, [esp + 16] // pix pshufd xmm5, xmm5, 0 - align 16 + align 4 wloop: movdqa xmm0, [eax] movdqa xmm1, [eax + 16] @@ -6437,18 +6693,49 @@ void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer, } } +// Specialized ARGB to Bayer that just isolates G channel. +__declspec(naked) __declspec(align(16)) +void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer, + uint32 selector, int pix) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_bayer + // selector + mov ecx, [esp + 16] // pix + pcmpeqb xmm5, xmm5 // generate mask 0x000000ff + psrld xmm5, 24 + + align 4 + wloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + lea eax, [eax + 32] + psrld xmm0, 8 // Move green to bottom. + psrld xmm1, 8 + pand xmm0, xmm5 + pand xmm1, xmm5 + packssdw xmm0, xmm1 + packuswb xmm0, xmm1 + sub ecx, 8 + movq qword ptr [edx], xmm0 + lea edx, [edx + 8] + jg wloop + ret + } +} + // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. 
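The comment above introduces ARGBShuffleRow, which reorders the four bytes of each pixel according to a shuffler table; the new SSE2 version added further below special-cases the common BGRA/RGBA/ABGR orders and otherwise falls back to a byte-at-a-time loop. A scalar sketch of that fallback, with an illustrative name; shuffler entries are expected to be 0..3 within a pixel.

#include <stdint.h>

void ARGBShuffleRow_Sketch(const uint8_t* src_argb, uint8_t* dst_argb,
                           const uint8_t* shuffler, int width) {
  for (int x = 0; x < width; ++x) {
    // Each destination byte comes from the source byte the shuffler selects
    // within the same 4-byte pixel.
    dst_argb[0] = src_argb[shuffler[0]];
    dst_argb[1] = src_argb[shuffler[1]];
    dst_argb[2] = src_argb[shuffler[2]];
    dst_argb[3] = src_argb[shuffler[3]];
    src_argb += 4;
    dst_argb += 4;
  }
}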
__declspec(naked) __declspec(align(16)) void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb, const uint8* shuffler, int pix) { __asm { mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_bayer + mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // shuffler movdqa xmm5, [ecx] mov ecx, [esp + 16] // pix - align 16 + align 4 wloop: movdqa xmm0, [eax] movdqa xmm1, [eax + 16] @@ -6469,12 +6756,12 @@ void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb, const uint8* shuffler, int pix) { __asm { mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_bayer + mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // shuffler movdqa xmm5, [ecx] mov ecx, [esp + 16] // pix - align 16 + align 4 wloop: movdqu xmm0, [eax] movdqu xmm1, [eax + 16] @@ -6496,13 +6783,12 @@ void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb, const uint8* shuffler, int pix) { __asm { mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_bayer + mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // shuffler - vmovdqa xmm5, [ecx] - vpermq ymm5, ymm5, 0x44 // same shuffle in high as low. + vbroadcastf128 ymm5, [ecx] // same shuffle in high as low. mov ecx, [esp + 16] // pix - align 16 + align 4 wloop: vmovdqu ymm0, [eax] vmovdqu ymm1, [eax + 32] @@ -6519,7 +6805,127 @@ void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb, ret } } -#endif +#endif // HAS_ARGBSHUFFLEROW_AVX2 + +__declspec(naked) __declspec(align(16)) +void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb, + const uint8* shuffler, int pix) { + __asm { + push ebx + push esi + mov eax, [esp + 8 + 4] // src_argb + mov edx, [esp + 8 + 8] // dst_argb + mov esi, [esp + 8 + 12] // shuffler + mov ecx, [esp + 8 + 16] // pix + pxor xmm5, xmm5 + + mov ebx, [esi] // shuffler + cmp ebx, 0x03000102 + je shuf_3012 + cmp ebx, 0x00010203 + je shuf_0123 + cmp ebx, 0x00030201 + je shuf_0321 + cmp ebx, 0x02010003 + je shuf_2103 + + // TODO(fbarchard): Use one source pointer and 3 offsets. 
+ shuf_any1: + movzx ebx, byte ptr [esi] + movzx ebx, byte ptr [eax + ebx] + mov [edx], bl + movzx ebx, byte ptr [esi + 1] + movzx ebx, byte ptr [eax + ebx] + mov [edx + 1], bl + movzx ebx, byte ptr [esi + 2] + movzx ebx, byte ptr [eax + ebx] + mov [edx + 2], bl + movzx ebx, byte ptr [esi + 3] + movzx ebx, byte ptr [eax + ebx] + mov [edx + 3], bl + lea eax, [eax + 4] + lea edx, [edx + 4] + sub ecx, 1 + jg shuf_any1 + jmp shuf99 + + align 4 + shuf_0123: + movdqu xmm0, [eax] + lea eax, [eax + 16] + movdqa xmm1, xmm0 + punpcklbw xmm0, xmm5 + punpckhbw xmm1, xmm5 + pshufhw xmm0, xmm0, 01Bh // 1B = 00011011 = 0x0123 = BGRAToARGB + pshuflw xmm0, xmm0, 01Bh + pshufhw xmm1, xmm1, 01Bh + pshuflw xmm1, xmm1, 01Bh + packuswb xmm0, xmm1 + sub ecx, 4 + movdqu [edx], xmm0 + lea edx, [edx + 16] + jg shuf_0123 + jmp shuf99 + + align 4 + shuf_0321: + movdqu xmm0, [eax] + lea eax, [eax + 16] + movdqa xmm1, xmm0 + punpcklbw xmm0, xmm5 + punpckhbw xmm1, xmm5 + pshufhw xmm0, xmm0, 039h // 39 = 00111001 = 0x0321 = RGBAToARGB + pshuflw xmm0, xmm0, 039h + pshufhw xmm1, xmm1, 039h + pshuflw xmm1, xmm1, 039h + packuswb xmm0, xmm1 + sub ecx, 4 + movdqu [edx], xmm0 + lea edx, [edx + 16] + jg shuf_0321 + jmp shuf99 + + align 4 + shuf_2103: + movdqu xmm0, [eax] + lea eax, [eax + 16] + movdqa xmm1, xmm0 + punpcklbw xmm0, xmm5 + punpckhbw xmm1, xmm5 + pshufhw xmm0, xmm0, 093h // 93 = 10010011 = 0x2103 = ARGBToRGBA + pshuflw xmm0, xmm0, 093h + pshufhw xmm1, xmm1, 093h + pshuflw xmm1, xmm1, 093h + packuswb xmm0, xmm1 + sub ecx, 4 + movdqu [edx], xmm0 + lea edx, [edx + 16] + jg shuf_2103 + jmp shuf99 + + align 4 + shuf_3012: + movdqu xmm0, [eax] + lea eax, [eax + 16] + movdqa xmm1, xmm0 + punpcklbw xmm0, xmm5 + punpckhbw xmm1, xmm5 + pshufhw xmm0, xmm0, 0C6h // C6 = 11000110 = 0x3012 = ABGRToARGB + pshuflw xmm0, xmm0, 0C6h + pshufhw xmm1, xmm1, 0C6h + pshuflw xmm1, xmm1, 0C6h + packuswb xmm0, xmm1 + sub ecx, 4 + movdqu [edx], xmm0 + lea edx, [edx + 16] + jg shuf_3012 + + shuf99: + pop esi + pop ebx + ret + } +} // YUY2 - Macro-pixel = 2 image pixels // Y0U0Y1V0....Y2U2Y3V2...Y4U4Y5V4.... @@ -6542,7 +6948,7 @@ void I422ToYUY2Row_SSE2(const uint8* src_y, mov ecx, [esp + 8 + 20] // width sub edx, esi - align 16 + align 4 convertloop: movq xmm2, qword ptr [esi] // U movq xmm3, qword ptr [esi + edx] // V @@ -6580,7 +6986,7 @@ void I422ToUYVYRow_SSE2(const uint8* src_y, mov ecx, [esp + 8 + 20] // width sub edx, esi - align 16 + align 4 convertloop: movq xmm2, qword ptr [esi] // U movq xmm3, qword ptr [esi + edx] // V @@ -6602,6 +7008,289 @@ void I422ToUYVYRow_SSE2(const uint8* src_y, ret } } + +#ifdef HAS_FIXEDDIV_X86 +// Divide num by div and return as 16.16 fixed point result. +__declspec(naked) __declspec(align(16)) +int FixedDiv_X86(int num, int div) { + __asm { + mov eax, [esp + 4] // num + cdq // extend num to 64 bits + shld edx, eax, 16 // 32.16 + shl eax, 16 + idiv dword ptr [esp + 8] + ret + } +} +#endif // HAS_FIXEDDIV_X86 + +#ifdef HAS_ARGBPOLYNOMIALROW_SSE2 +__declspec(naked) __declspec(align(16)) +void ARGBPolynomialRow_SSE2(const uint8* src_argb, + uint8* dst_argb, const float* poly, + int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] /* src_argb */ + mov edx, [esp + 4 + 8] /* dst_argb */ + mov esi, [esp + 4 + 12] /* poly */ + mov ecx, [esp + 4 + 16] /* width */ + pxor xmm3, xmm3 // 0 constant for zero extending bytes to ints. + + // 2 pixel loop. 
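FixedDiv_X86 above returns num / div as a 16.16 fixed-point value, and the ARGBPolynomialRow routines being added here evaluate a per-channel cubic, result = C0 + C1*X + C2*X^2 + C3*X^3, with the coefficients laid out as four groups of four floats (one value per channel), matching the [esi], [esi + 16], ... loads. Minimal scalar sketches follow; the names are illustrative and a plain clamp stands in for the packing instructions.

#include <stdint.h>

static inline int FixedDiv_Sketch(int num, int div) {
  return (int)(((int64_t)num << 16) / div);   // 16.16 fixed-point quotient
}

void ARGBPolynomialRow_Sketch(const uint8_t* src_argb, uint8_t* dst_argb,
                              const float* poly, int width) {
  for (int x = 0; x < width; ++x) {
    for (int c = 0; c < 4; ++c) {
      float X = (float)src_argb[c];
      float v = poly[c] + poly[c + 4] * X + poly[c + 8] * X * X +
                poly[c + 12] * X * X * X;
      int i = (int)v;                          // cvttps2dq truncates
      dst_argb[c] = (uint8_t)(i < 0 ? 0 : (i > 255 ? 255 : i));
    }
    src_argb += 4;
    dst_argb += 4;
  }
}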
+ align 4 + convertloop: +// pmovzxbd xmm0, dword ptr [eax] // BGRA pixel +// pmovzxbd xmm4, dword ptr [eax + 4] // BGRA pixel + movq xmm0, qword ptr [eax] // BGRABGRA + lea eax, [eax + 8] + punpcklbw xmm0, xmm3 + movdqa xmm4, xmm0 + punpcklwd xmm0, xmm3 // pixel 0 + punpckhwd xmm4, xmm3 // pixel 1 + cvtdq2ps xmm0, xmm0 // 4 floats + cvtdq2ps xmm4, xmm4 + movdqa xmm1, xmm0 // X + movdqa xmm5, xmm4 + mulps xmm0, [esi + 16] // C1 * X + mulps xmm4, [esi + 16] + addps xmm0, [esi] // result = C0 + C1 * X + addps xmm4, [esi] + movdqa xmm2, xmm1 + movdqa xmm6, xmm5 + mulps xmm2, xmm1 // X * X + mulps xmm6, xmm5 + mulps xmm1, xmm2 // X * X * X + mulps xmm5, xmm6 + mulps xmm2, [esi + 32] // C2 * X * X + mulps xmm6, [esi + 32] + mulps xmm1, [esi + 48] // C3 * X * X * X + mulps xmm5, [esi + 48] + addps xmm0, xmm2 // result += C2 * X * X + addps xmm4, xmm6 + addps xmm0, xmm1 // result += C3 * X * X * X + addps xmm4, xmm5 + cvttps2dq xmm0, xmm0 + cvttps2dq xmm4, xmm4 + packuswb xmm0, xmm4 + packuswb xmm0, xmm0 + sub ecx, 2 + movq qword ptr [edx], xmm0 + lea edx, [edx + 8] + jg convertloop + pop esi + ret + } +} +#endif // HAS_ARGBPOLYNOMIALROW_SSE2 + +#ifdef HAS_ARGBPOLYNOMIALROW_AVX2 +__declspec(naked) __declspec(align(16)) +void ARGBPolynomialRow_AVX2(const uint8* src_argb, + uint8* dst_argb, const float* poly, + int width) { + __asm { + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_argb */ + mov ecx, [esp + 12] /* poly */ + vbroadcastf128 ymm4, [ecx] // C0 + vbroadcastf128 ymm5, [ecx + 16] // C1 + vbroadcastf128 ymm6, [ecx + 32] // C2 + vbroadcastf128 ymm7, [ecx + 48] // C3 + mov ecx, [esp + 16] /* width */ + + // 2 pixel loop. + align 4 + convertloop: + vpmovzxbd ymm0, qword ptr [eax] // 2 BGRA pixels + lea eax, [eax + 8] + vcvtdq2ps ymm0, ymm0 // X 8 floats + vmulps ymm2, ymm0, ymm0 // X * X + vmulps ymm3, ymm0, ymm7 // C3 * X + vfmadd132ps ymm0, ymm4, ymm5 // result = C0 + C1 * X + vfmadd231ps ymm0, ymm2, ymm6 // result += C2 * X * X + vfmadd231ps ymm0, ymm2, ymm3 // result += C3 * X * X * X + vcvttps2dq ymm0, ymm0 + vpackusdw ymm0, ymm0, ymm0 // b0g0r0a0_00000000_b0g0r0a0_00000000 + vpermq ymm0, ymm0, 0xd8 // b0g0r0a0_b0g0r0a0_00000000_00000000 + vpackuswb xmm0, xmm0, xmm0 // bgrabgra_00000000_00000000_00000000 + sub ecx, 2 + vmovq qword ptr [edx], xmm0 + lea edx, [edx + 8] + jg convertloop + vzeroupper + ret + } +} +#endif // HAS_ARGBPOLYNOMIALROW_AVX2 + +#ifdef HAS_ARGBCOLORTABLEROW_X86 +// Tranform ARGB pixels with color table. +__declspec(naked) __declspec(align(16)) +void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, + int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] /* dst_argb */ + mov esi, [esp + 4 + 8] /* table_argb */ + mov ecx, [esp + 4 + 12] /* width */ + + // 1 pixel loop. + align 4 + convertloop: + movzx edx, byte ptr [eax] + lea eax, [eax + 4] + movzx edx, byte ptr [esi + edx * 4] + mov byte ptr [eax - 4], dl + movzx edx, byte ptr [eax - 4 + 1] + movzx edx, byte ptr [esi + edx * 4 + 1] + mov byte ptr [eax - 4 + 1], dl + movzx edx, byte ptr [eax - 4 + 2] + movzx edx, byte ptr [esi + edx * 4 + 2] + mov byte ptr [eax - 4 + 2], dl + movzx edx, byte ptr [eax - 4 + 3] + movzx edx, byte ptr [esi + edx * 4 + 3] + mov byte ptr [eax - 4 + 3], dl + dec ecx + jg convertloop + pop esi + ret + } +} +#endif // HAS_ARGBCOLORTABLEROW_X86 + +#ifdef HAS_RGBCOLORTABLEROW_X86 +// Tranform RGB pixels with color table. 
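The comment above introduces RGBColorTableRow_X86, which, like the rewritten ARGBColorTableRow_X86 just before it, remaps each channel in place through a 256-entry-per-channel table (value * 4 + channel indexing); the RGB variant leaves alpha untouched. A scalar sketch with an illustrative name.

#include <stdint.h>

void ColorTableRow_Sketch(uint8_t* dst_argb, const uint8_t* table_argb,
                          int width, bool include_alpha) {
  const int channels = include_alpha ? 4 : 3;   // RGB variant skips alpha
  for (int x = 0; x < width; ++x) {
    for (int c = 0; c < channels; ++c)
      dst_argb[c] = table_argb[dst_argb[c] * 4 + c];
    dst_argb += 4;
  }
}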
+__declspec(naked) __declspec(align(16)) +void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] /* dst_argb */ + mov esi, [esp + 4 + 8] /* table_argb */ + mov ecx, [esp + 4 + 12] /* width */ + + // 1 pixel loop. + align 4 + convertloop: + movzx edx, byte ptr [eax] + lea eax, [eax + 4] + movzx edx, byte ptr [esi + edx * 4] + mov byte ptr [eax - 4], dl + movzx edx, byte ptr [eax - 4 + 1] + movzx edx, byte ptr [esi + edx * 4 + 1] + mov byte ptr [eax - 4 + 1], dl + movzx edx, byte ptr [eax - 4 + 2] + movzx edx, byte ptr [esi + edx * 4 + 2] + mov byte ptr [eax - 4 + 2], dl + dec ecx + jg convertloop + + pop esi + ret + } +} +#endif // HAS_RGBCOLORTABLEROW_X86 + +#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3 +// Tranform RGB pixels with luma table. +__declspec(naked) __declspec(align(16)) +void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb, + int width, + const uint8* luma, uint32 lumacoeff) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] /* src_argb */ + mov edi, [esp + 8 + 8] /* dst_argb */ + mov ecx, [esp + 8 + 12] /* width */ + movd xmm2, dword ptr [esp + 8 + 16] // luma table + movd xmm3, dword ptr [esp + 8 + 20] // lumacoeff + pshufd xmm2, xmm2, 0 + pshufd xmm3, xmm3, 0 + pcmpeqb xmm4, xmm4 // generate mask 0xff00ff00 + psllw xmm4, 8 + pxor xmm5, xmm5 + + // 4 pixel loop. + align 4 + convertloop: + movdqu xmm0, qword ptr [eax] // generate luma ptr + pmaddubsw xmm0, xmm3 + phaddw xmm0, xmm0 + pand xmm0, xmm4 // mask out low bits + punpcklwd xmm0, xmm5 + paddd xmm0, xmm2 // add table base + movd esi, xmm0 + pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32 + + movzx edx, byte ptr [eax] + movzx edx, byte ptr [esi + edx] + mov byte ptr [edi], dl + movzx edx, byte ptr [eax + 1] + movzx edx, byte ptr [esi + edx] + mov byte ptr [edi + 1], dl + movzx edx, byte ptr [eax + 2] + movzx edx, byte ptr [esi + edx] + mov byte ptr [edi + 2], dl + movzx edx, byte ptr [eax + 3] // copy alpha. + mov byte ptr [edi + 3], dl + + movd esi, xmm0 + pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32 + + movzx edx, byte ptr [eax + 4] + movzx edx, byte ptr [esi + edx] + mov byte ptr [edi + 4], dl + movzx edx, byte ptr [eax + 5] + movzx edx, byte ptr [esi + edx] + mov byte ptr [edi + 5], dl + movzx edx, byte ptr [eax + 6] + movzx edx, byte ptr [esi + edx] + mov byte ptr [edi + 6], dl + movzx edx, byte ptr [eax + 7] // copy alpha. + mov byte ptr [edi + 7], dl + + movd esi, xmm0 + pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32 + + movzx edx, byte ptr [eax + 8] + movzx edx, byte ptr [esi + edx] + mov byte ptr [edi + 8], dl + movzx edx, byte ptr [eax + 9] + movzx edx, byte ptr [esi + edx] + mov byte ptr [edi + 9], dl + movzx edx, byte ptr [eax + 10] + movzx edx, byte ptr [esi + edx] + mov byte ptr [edi + 10], dl + movzx edx, byte ptr [eax + 11] // copy alpha. + mov byte ptr [edi + 11], dl + + movd esi, xmm0 + + movzx edx, byte ptr [eax + 12] + movzx edx, byte ptr [esi + edx] + mov byte ptr [edi + 12], dl + movzx edx, byte ptr [eax + 13] + movzx edx, byte ptr [esi + edx] + mov byte ptr [edi + 13], dl + movzx edx, byte ptr [eax + 14] + movzx edx, byte ptr [esi + edx] + mov byte ptr [edi + 14], dl + movzx edx, byte ptr [eax + 15] // copy alpha. 
+ mov byte ptr [edi + 15], dl + + sub ecx, 4 + lea eax, [eax + 16] + lea edi, [edi + 16] + jg convertloop + + pop edi + pop esi + ret + } +} +#endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 + #endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) #ifdef __cplusplus diff --git a/chromium/third_party/libyuv/source/row_x86.asm b/chromium/third_party/libyuv/source/row_x86.asm index 80a9716bae2..0cb326f8e58 100644 --- a/chromium/third_party/libyuv/source/row_x86.asm +++ b/chromium/third_party/libyuv/source/row_x86.asm @@ -28,7 +28,7 @@ cglobal %1ToYRow%3, 3, 3, 3, src_yuy2, dst_y, pix psrlw m2, m2, 8 %endif - ALIGN 16 + ALIGN 4 .convertloop: mov%2 m0, [src_yuy2q] mov%2 m1, [src_yuy2q + mmsize] @@ -74,7 +74,7 @@ cglobal SplitUVRow%2, 4, 4, 5, src_uv, dst_u, dst_v, pix psrlw m4, m4, 8 sub dst_vq, dst_uq - ALIGN 16 + ALIGN 4 .convertloop: mov%1 m0, [src_uvq] mov%1 m1, [src_uvq + mmsize] @@ -113,7 +113,7 @@ SplitUVRow a, cglobal MergeUVRow_%2, 4, 4, 3, src_u, src_v, dst_uv, pix sub src_vq, src_uq - ALIGN 16 + ALIGN 4 .convertloop: mov%1 m0, [src_uq] mov%1 m1, [src_vq] diff --git a/chromium/third_party/libyuv/source/scale.cc b/chromium/third_party/libyuv/source/scale.cc index 77af420b3f3..4f19742a205 100644 --- a/chromium/third_party/libyuv/source/scale.cc +++ b/chromium/third_party/libyuv/source/scale.cc @@ -16,1657 +16,21 @@ #include "libyuv/cpu_id.h" #include "libyuv/planar_functions.h" // For CopyPlane #include "libyuv/row.h" +#include "libyuv/scale_row.h" #ifdef __cplusplus namespace libyuv { extern "C" { #endif +// Remove this macro if OVERREAD is safe. +#define AVOID_OVERREAD 1 + static __inline int Abs(int v) { return v >= 0 ? v : -v; } -static __inline int Half(int v) { - return v >= 0 ? ((v + 1) >> 1) : -((-v + 1) >> 1); -} - -// Note: Some SSE2 reference manuals -// cpuvol1.pdf agner_instruction_tables.pdf 253666.pdf 253667.pdf - -// Set the following flag to true to revert to only -// using the reference implementation ScalePlaneBox(), and -// NOT the optimized versions. Useful for debugging and -// when comparing the quality of the resulting YUV planes -// as produced by the optimized and non-optimized versions. -static bool use_reference_impl_ = false; - -LIBYUV_API -void SetUseReferenceImpl(bool use) { - use_reference_impl_ = use; -} - -// ScaleRowDown2Int also used by planar functions -// NEON downscalers with interpolation. - -#if !defined(LIBYUV_DISABLE_NEON) && \ - (defined(__ARM_NEON__) || defined(LIBYUV_NEON)) -#define HAS_SCALEROWDOWN2_NEON -// Note - not static due to reuse in convert for 444 to 420. -void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */, - uint8* dst, int dst_width); - -void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); - -#define HAS_SCALEROWDOWN4_NEON -void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */, - uint8* dst_ptr, int dst_width); -void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); - -#define HAS_SCALEROWDOWN34_NEON -// Down scale from 4 to 3 pixels. Use the neon multilane read/write -// to load up the every 4th pixel into a 4 different registers. -// Point samples 32 pixels to 24 pixels. 
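The block of declarations being removed here (relocated out of scale.cc by this change) ends with the 4-to-3 point sampler; judging by the kShuf0/kShuf1/kShuf2 constants that follow, it keeps source pixels 0, 1 and 3 of every group of 4. A scalar sketch with an illustrative name, assuming dst_width is a multiple of 3.

#include <stdint.h>

void ScaleRowDown34_Sketch(const uint8_t* src_ptr, uint8_t* dst_ptr,
                           int dst_width) {
  for (int x = 0; x < dst_width; x += 3) {
    dst_ptr[0] = src_ptr[0];
    dst_ptr[1] = src_ptr[1];
    dst_ptr[2] = src_ptr[3];   // pixel 2 of each group of 4 is dropped
    dst_ptr += 3;
    src_ptr += 4;
  }
}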
-void ScaleRowDown34_NEON(const uint8* src_ptr, - ptrdiff_t /* src_stride */, - uint8* dst_ptr, int dst_width); -void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr, - ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr, - ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); - -#define HAS_SCALEROWDOWN38_NEON -// 32 -> 12 -void ScaleRowDown38_NEON(const uint8* src_ptr, - ptrdiff_t /* src_stride */, - uint8* dst_ptr, int dst_width); -// 32x3 -> 12x1 -void ScaleRowDown38_3_Box_NEON(const uint8* src_ptr, - ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -// 32x2 -> 12x1 -void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr, - ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); - -// SSE2 downscalers with interpolation. -// Constants for SSSE3 code -#elif !defined(LIBYUV_DISABLE_X86) && \ - (defined(_M_IX86) || defined(__i386__) || defined(__x86_64__)) -// GCC 4.2 on OSX has link error when passing static or const to inline. -// TODO(fbarchard): Use static const when gcc 4.2 support is dropped. -#ifdef __APPLE__ -#define CONST -#else -#define CONST static const -#endif - -// Offsets for source bytes 0 to 9 -CONST uvec8 kShuf0 = - { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 }; - -// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12. -CONST uvec8 kShuf1 = - { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 }; - -// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. -CONST uvec8 kShuf2 = - { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 }; - -// Offsets for source bytes 0 to 10 -CONST uvec8 kShuf01 = - { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 }; - -// Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13. -CONST uvec8 kShuf11 = - { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 }; - -// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. 
-CONST uvec8 kShuf21 = - { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 }; - -// Coefficients for source bytes 0 to 10 -CONST uvec8 kMadd01 = - { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 }; - -// Coefficients for source bytes 10 to 21 -CONST uvec8 kMadd11 = - { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 }; - -// Coefficients for source bytes 21 to 31 -CONST uvec8 kMadd21 = - { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 }; - -// Coefficients for source bytes 21 to 31 -CONST vec16 kRound34 = - { 2, 2, 2, 2, 2, 2, 2, 2 }; - -CONST uvec8 kShuf38a = - { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }; - -CONST uvec8 kShuf38b = - { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 }; - -// Arrange words 0,3,6 into 0,1,2 -CONST uvec8 kShufAc = - { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }; - -// Arrange words 0,3,6 into 3,4,5 -CONST uvec8 kShufAc3 = - { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 }; - -// Scaling values for boxes of 3x3 and 2x3 -CONST uvec16 kScaleAc33 = - { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 }; - -// Arrange first value for pixels 0,1,2,3,4,5 -CONST uvec8 kShufAb0 = - { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 }; - -// Arrange second value for pixels 0,1,2,3,4,5 -CONST uvec8 kShufAb1 = - { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 }; - -// Arrange third value for pixels 0,1,2,3,4,5 -CONST uvec8 kShufAb2 = - { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 }; - -// Scaling values for boxes of 3x2 and 2x2 -CONST uvec16 kScaleAb2 = - { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 }; -#endif - -#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) -#define HAS_SCALEROWDOWN2_SSE2 -// Reads 32 pixels, throws half away and writes 16 pixels. -// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. -__declspec(naked) __declspec(align(16)) -static void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - __asm { - mov eax, [esp + 4] // src_ptr - // src_stride ignored - mov edx, [esp + 12] // dst_ptr - mov ecx, [esp + 16] // dst_width - - align 16 - wloop: - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - lea eax, [eax + 32] - psrlw xmm0, 8 // isolate odd pixels. - psrlw xmm1, 8 - packuswb xmm0, xmm1 - sub ecx, 16 - movdqa [edx], xmm0 - lea edx, [edx + 16] - jg wloop - - ret - } -} - -// Blends 32x2 rectangle to 16x1. -// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. 
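The comment above describes the 2x2 box downscale ("blends 32x2 rectangle to 16x1"): each output pixel averages a 2x2 block, which the SSE2 code approximates with cascaded pavgb/pavgw rounding averages (the 4x1 variant further down cascades the same idea over four rows). A scalar sketch of the exact rounded average, which may differ from the SIMD result by one LSB; the name is illustrative.

#include <stdint.h>
#include <stddef.h>

void ScaleRowDown2Box_Sketch(const uint8_t* src_ptr, ptrdiff_t src_stride,
                             uint8_t* dst_ptr, int dst_width) {
  const uint8_t* s = src_ptr;               // top row
  const uint8_t* t = src_ptr + src_stride;  // bottom row
  for (int x = 0; x < dst_width; ++x) {
    dst_ptr[x] = (uint8_t)((s[0] + s[1] + t[0] + t[1] + 2) >> 2);
    s += 2;
    t += 2;
  }
}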
-__declspec(naked) __declspec(align(16)) -void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // src_ptr - mov esi, [esp + 4 + 8] // src_stride - mov edx, [esp + 4 + 12] // dst_ptr - mov ecx, [esp + 4 + 16] // dst_width - pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff - psrlw xmm5, 8 - - align 16 - wloop: - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - movdqa xmm2, [eax + esi] - movdqa xmm3, [eax + esi + 16] - lea eax, [eax + 32] - pavgb xmm0, xmm2 // average rows - pavgb xmm1, xmm3 - - movdqa xmm2, xmm0 // average columns (32 to 16 pixels) - psrlw xmm0, 8 - movdqa xmm3, xmm1 - psrlw xmm1, 8 - pand xmm2, xmm5 - pand xmm3, xmm5 - pavgw xmm0, xmm2 - pavgw xmm1, xmm3 - packuswb xmm0, xmm1 - - sub ecx, 16 - movdqa [edx], xmm0 - lea edx, [edx + 16] - jg wloop - - pop esi - ret - } -} - -// Reads 32 pixels, throws half away and writes 16 pixels. -// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. -__declspec(naked) __declspec(align(16)) -static void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr, - ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - __asm { - mov eax, [esp + 4] // src_ptr - // src_stride ignored - mov edx, [esp + 12] // dst_ptr - mov ecx, [esp + 16] // dst_width - - align 16 - wloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - lea eax, [eax + 32] - psrlw xmm0, 8 // isolate odd pixels. - psrlw xmm1, 8 - packuswb xmm0, xmm1 - sub ecx, 16 - movdqu [edx], xmm0 - lea edx, [edx + 16] - jg wloop - - ret - } -} - -// Blends 32x2 rectangle to 16x1. -// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. -__declspec(naked) __declspec(align(16)) -static void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr, - ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // src_ptr - mov esi, [esp + 4 + 8] // src_stride - mov edx, [esp + 4 + 12] // dst_ptr - mov ecx, [esp + 4 + 16] // dst_width - pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff - psrlw xmm5, 8 - - align 16 - wloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - movdqu xmm2, [eax + esi] - movdqu xmm3, [eax + esi + 16] - lea eax, [eax + 32] - pavgb xmm0, xmm2 // average rows - pavgb xmm1, xmm3 - - movdqa xmm2, xmm0 // average columns (32 to 16 pixels) - psrlw xmm0, 8 - movdqa xmm3, xmm1 - psrlw xmm1, 8 - pand xmm2, xmm5 - pand xmm3, xmm5 - pavgw xmm0, xmm2 - pavgw xmm1, xmm3 - packuswb xmm0, xmm1 - - sub ecx, 16 - movdqu [edx], xmm0 - lea edx, [edx + 16] - jg wloop - - pop esi - ret - } -} - -#define HAS_SCALEROWDOWN4_SSE2 -// Point samples 32 pixels to 8 pixels. -// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. -__declspec(naked) __declspec(align(16)) -static void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - __asm { - mov eax, [esp + 4] // src_ptr - // src_stride ignored - mov edx, [esp + 12] // dst_ptr - mov ecx, [esp + 16] // dst_width - pcmpeqb xmm5, xmm5 // generate mask 0x00ff0000 - psrld xmm5, 24 - pslld xmm5, 16 - - align 16 - wloop: - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - lea eax, [eax + 32] - pand xmm0, xmm5 - pand xmm1, xmm5 - packuswb xmm0, xmm1 - psrlw xmm0, 8 - packuswb xmm0, xmm0 - sub ecx, 8 - movq qword ptr [edx], xmm0 - lea edx, [edx + 8] - jg wloop - - ret - } -} - -// Blends 32x4 rectangle to 8x1. -// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. 
-__declspec(naked) __declspec(align(16)) -static void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - __asm { - push esi - push edi - mov eax, [esp + 8 + 4] // src_ptr - mov esi, [esp + 8 + 8] // src_stride - mov edx, [esp + 8 + 12] // dst_ptr - mov ecx, [esp + 8 + 16] // dst_width - lea edi, [esi + esi * 2] // src_stride * 3 - pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff - psrlw xmm7, 8 - - align 16 - wloop: - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - movdqa xmm2, [eax + esi] - movdqa xmm3, [eax + esi + 16] - pavgb xmm0, xmm2 // average rows - pavgb xmm1, xmm3 - movdqa xmm2, [eax + esi * 2] - movdqa xmm3, [eax + esi * 2 + 16] - movdqa xmm4, [eax + edi] - movdqa xmm5, [eax + edi + 16] - lea eax, [eax + 32] - pavgb xmm2, xmm4 - pavgb xmm3, xmm5 - pavgb xmm0, xmm2 - pavgb xmm1, xmm3 - - movdqa xmm2, xmm0 // average columns (32 to 16 pixels) - psrlw xmm0, 8 - movdqa xmm3, xmm1 - psrlw xmm1, 8 - pand xmm2, xmm7 - pand xmm3, xmm7 - pavgw xmm0, xmm2 - pavgw xmm1, xmm3 - packuswb xmm0, xmm1 - - movdqa xmm2, xmm0 // average columns (16 to 8 pixels) - psrlw xmm0, 8 - pand xmm2, xmm7 - pavgw xmm0, xmm2 - packuswb xmm0, xmm0 - - sub ecx, 8 - movq qword ptr [edx], xmm0 - lea edx, [edx + 8] - jg wloop - - pop edi - pop esi - ret - } -} - -#define HAS_SCALEROWDOWN34_SSSE3 -// Point samples 32 pixels to 24 pixels. -// Produces three 8 byte values. For each 8 bytes, 16 bytes are read. -// Then shuffled to do the scaling. - -// Note that movdqa+palign may be better than movdqu. -// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. -__declspec(naked) __declspec(align(16)) -static void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - __asm { - mov eax, [esp + 4] // src_ptr - // src_stride ignored - mov edx, [esp + 12] // dst_ptr - mov ecx, [esp + 16] // dst_width - movdqa xmm3, kShuf0 - movdqa xmm4, kShuf1 - movdqa xmm5, kShuf2 - - align 16 - wloop: - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - lea eax, [eax + 32] - movdqa xmm2, xmm1 - palignr xmm1, xmm0, 8 - pshufb xmm0, xmm3 - pshufb xmm1, xmm4 - pshufb xmm2, xmm5 - movq qword ptr [edx], xmm0 - movq qword ptr [edx + 8], xmm1 - movq qword ptr [edx + 16], xmm2 - lea edx, [edx + 24] - sub ecx, 24 - jg wloop - - ret - } -} - -// Blends 32x2 rectangle to 24x1 -// Produces three 8 byte values. For each 8 bytes, 16 bytes are read. -// Then shuffled to do the scaling. - -// Register usage: -// xmm0 src_row 0 -// xmm1 src_row 1 -// xmm2 shuf 0 -// xmm3 shuf 1 -// xmm4 shuf 2 -// xmm5 madd 0 -// xmm6 madd 1 -// xmm7 kRound34 - -// Note that movdqa+palign may be better than movdqu. -// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. 
-__declspec(naked) __declspec(align(16)) -static void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr, - ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // src_ptr - mov esi, [esp + 4 + 8] // src_stride - mov edx, [esp + 4 + 12] // dst_ptr - mov ecx, [esp + 4 + 16] // dst_width - movdqa xmm2, kShuf01 - movdqa xmm3, kShuf11 - movdqa xmm4, kShuf21 - movdqa xmm5, kMadd01 - movdqa xmm6, kMadd11 - movdqa xmm7, kRound34 - - align 16 - wloop: - movdqa xmm0, [eax] // pixels 0..7 - movdqa xmm1, [eax + esi] - pavgb xmm0, xmm1 - pshufb xmm0, xmm2 - pmaddubsw xmm0, xmm5 - paddsw xmm0, xmm7 - psrlw xmm0, 2 - packuswb xmm0, xmm0 - movq qword ptr [edx], xmm0 - movdqu xmm0, [eax + 8] // pixels 8..15 - movdqu xmm1, [eax + esi + 8] - pavgb xmm0, xmm1 - pshufb xmm0, xmm3 - pmaddubsw xmm0, xmm6 - paddsw xmm0, xmm7 - psrlw xmm0, 2 - packuswb xmm0, xmm0 - movq qword ptr [edx + 8], xmm0 - movdqa xmm0, [eax + 16] // pixels 16..23 - movdqa xmm1, [eax + esi + 16] - lea eax, [eax + 32] - pavgb xmm0, xmm1 - pshufb xmm0, xmm4 - movdqa xmm1, kMadd21 - pmaddubsw xmm0, xmm1 - paddsw xmm0, xmm7 - psrlw xmm0, 2 - packuswb xmm0, xmm0 - sub ecx, 24 - movq qword ptr [edx + 16], xmm0 - lea edx, [edx + 24] - jg wloop - - pop esi - ret - } -} - -// Note that movdqa+palign may be better than movdqu. -// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. -__declspec(naked) __declspec(align(16)) -static void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr, - ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // src_ptr - mov esi, [esp + 4 + 8] // src_stride - mov edx, [esp + 4 + 12] // dst_ptr - mov ecx, [esp + 4 + 16] // dst_width - movdqa xmm2, kShuf01 - movdqa xmm3, kShuf11 - movdqa xmm4, kShuf21 - movdqa xmm5, kMadd01 - movdqa xmm6, kMadd11 - movdqa xmm7, kRound34 - - align 16 - wloop: - movdqa xmm0, [eax] // pixels 0..7 - movdqa xmm1, [eax + esi] - pavgb xmm1, xmm0 - pavgb xmm0, xmm1 - pshufb xmm0, xmm2 - pmaddubsw xmm0, xmm5 - paddsw xmm0, xmm7 - psrlw xmm0, 2 - packuswb xmm0, xmm0 - movq qword ptr [edx], xmm0 - movdqu xmm0, [eax + 8] // pixels 8..15 - movdqu xmm1, [eax + esi + 8] - pavgb xmm1, xmm0 - pavgb xmm0, xmm1 - pshufb xmm0, xmm3 - pmaddubsw xmm0, xmm6 - paddsw xmm0, xmm7 - psrlw xmm0, 2 - packuswb xmm0, xmm0 - movq qword ptr [edx + 8], xmm0 - movdqa xmm0, [eax + 16] // pixels 16..23 - movdqa xmm1, [eax + esi + 16] - lea eax, [eax + 32] - pavgb xmm1, xmm0 - pavgb xmm0, xmm1 - pshufb xmm0, xmm4 - movdqa xmm1, kMadd21 - pmaddubsw xmm0, xmm1 - paddsw xmm0, xmm7 - psrlw xmm0, 2 - packuswb xmm0, xmm0 - sub ecx, 24 - movq qword ptr [edx + 16], xmm0 - lea edx, [edx+24] - jg wloop - - pop esi - ret - } -} - -#define HAS_SCALEROWDOWN38_SSSE3 -// 3/8 point sampler - -// Scale 32 pixels to 12 -__declspec(naked) __declspec(align(16)) -static void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - __asm { - mov eax, [esp + 4] // src_ptr - // src_stride ignored - mov edx, [esp + 12] // dst_ptr - mov ecx, [esp + 16] // dst_width - movdqa xmm4, kShuf38a - movdqa xmm5, kShuf38b - - align 16 - xloop: - movdqa xmm0, [eax] // 16 pixels -> 0,1,2,3,4,5 - movdqa xmm1, [eax + 16] // 16 pixels -> 6,7,8,9,10,11 - lea eax, [eax + 32] - pshufb xmm0, xmm4 - pshufb xmm1, xmm5 - paddusb xmm0, xmm1 - - sub ecx, 12 - movq qword ptr [edx], xmm0 // write 12 pixels - movhlps xmm1, xmm0 - movd [edx + 8], xmm1 - lea edx, [edx + 12] - jg xloop - - ret - } -} - -// Scale 16x3 
pixels to 6x1 with interpolation -__declspec(naked) __declspec(align(16)) -static void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr, - ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // src_ptr - mov esi, [esp + 4 + 8] // src_stride - mov edx, [esp + 4 + 12] // dst_ptr - mov ecx, [esp + 4 + 16] // dst_width - movdqa xmm2, kShufAc - movdqa xmm3, kShufAc3 - movdqa xmm4, kScaleAc33 - pxor xmm5, xmm5 - - align 16 - xloop: - movdqa xmm0, [eax] // sum up 3 rows into xmm0/1 - movdqa xmm6, [eax + esi] - movhlps xmm1, xmm0 - movhlps xmm7, xmm6 - punpcklbw xmm0, xmm5 - punpcklbw xmm1, xmm5 - punpcklbw xmm6, xmm5 - punpcklbw xmm7, xmm5 - paddusw xmm0, xmm6 - paddusw xmm1, xmm7 - movdqa xmm6, [eax + esi * 2] - lea eax, [eax + 16] - movhlps xmm7, xmm6 - punpcklbw xmm6, xmm5 - punpcklbw xmm7, xmm5 - paddusw xmm0, xmm6 - paddusw xmm1, xmm7 - - movdqa xmm6, xmm0 // 8 pixels -> 0,1,2 of xmm6 - psrldq xmm0, 2 - paddusw xmm6, xmm0 - psrldq xmm0, 2 - paddusw xmm6, xmm0 - pshufb xmm6, xmm2 - - movdqa xmm7, xmm1 // 8 pixels -> 3,4,5 of xmm6 - psrldq xmm1, 2 - paddusw xmm7, xmm1 - psrldq xmm1, 2 - paddusw xmm7, xmm1 - pshufb xmm7, xmm3 - paddusw xmm6, xmm7 - - pmulhuw xmm6, xmm4 // divide by 9,9,6, 9,9,6 - packuswb xmm6, xmm6 - - sub ecx, 6 - movd [edx], xmm6 // write 6 pixels - psrlq xmm6, 16 - movd [edx + 2], xmm6 - lea edx, [edx + 6] - jg xloop - - pop esi - ret - } -} - -// Scale 16x2 pixels to 6x1 with interpolation -__declspec(naked) __declspec(align(16)) -static void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr, - ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // src_ptr - mov esi, [esp + 4 + 8] // src_stride - mov edx, [esp + 4 + 12] // dst_ptr - mov ecx, [esp + 4 + 16] // dst_width - movdqa xmm2, kShufAb0 - movdqa xmm3, kShufAb1 - movdqa xmm4, kShufAb2 - movdqa xmm5, kScaleAb2 - - align 16 - xloop: - movdqa xmm0, [eax] // average 2 rows into xmm0 - pavgb xmm0, [eax + esi] - lea eax, [eax + 16] - - movdqa xmm1, xmm0 // 16 pixels -> 0,1,2,3,4,5 of xmm1 - pshufb xmm1, xmm2 - movdqa xmm6, xmm0 - pshufb xmm6, xmm3 - paddusw xmm1, xmm6 - pshufb xmm0, xmm4 - paddusw xmm1, xmm0 - - pmulhuw xmm1, xmm5 // divide by 3,3,2, 3,3,2 - packuswb xmm1, xmm1 - - sub ecx, 6 - movd [edx], xmm1 // write 6 pixels - psrlq xmm1, 16 - movd [edx + 2], xmm1 - lea edx, [edx + 6] - jg xloop - - pop esi - ret - } -} - -#define HAS_SCALEADDROWS_SSE2 - -// Reads 16xN bytes and produces 16 shorts at a time. 
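The trailing comment describes ScaleAddRows_SSE2, which the general box filter uses to sum src_height rows column-wise into 16-bit accumulators (paddusw, so the sums saturate). A scalar sketch with an illustrative name.

#include <stdint.h>
#include <stddef.h>

void ScaleAddRows_Sketch(const uint8_t* src_ptr, ptrdiff_t src_stride,
                         uint16_t* dst_ptr, int src_width, int src_height) {
  for (int x = 0; x < src_width; ++x) {
    uint32_t sum = 0;
    const uint8_t* s = src_ptr + x;
    for (int y = 0; y < src_height; ++y) {
      sum += s[0];
      s += src_stride;
    }
    dst_ptr[x] = (uint16_t)(sum > 65535 ? 65535 : sum);  // paddusw saturates
  }
}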
-__declspec(naked) __declspec(align(16)) -static void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, - uint16* dst_ptr, int src_width, - int src_height) { - __asm { - push esi - push edi - push ebx - push ebp - mov esi, [esp + 16 + 4] // src_ptr - mov edx, [esp + 16 + 8] // src_stride - mov edi, [esp + 16 + 12] // dst_ptr - mov ecx, [esp + 16 + 16] // dst_width - mov ebx, [esp + 16 + 20] // height - pxor xmm4, xmm4 - dec ebx - - align 16 - xloop: - // first row - movdqa xmm0, [esi] - lea eax, [esi + edx] - movdqa xmm1, xmm0 - punpcklbw xmm0, xmm4 - punpckhbw xmm1, xmm4 - lea esi, [esi + 16] - mov ebp, ebx - test ebp, ebp - je ydone - - // sum remaining rows - align 16 - yloop: - movdqa xmm2, [eax] // read 16 pixels - lea eax, [eax + edx] // advance to next row - movdqa xmm3, xmm2 - punpcklbw xmm2, xmm4 - punpckhbw xmm3, xmm4 - paddusw xmm0, xmm2 // sum 16 words - paddusw xmm1, xmm3 - sub ebp, 1 - jg yloop - ydone: - movdqa [edi], xmm0 - movdqa [edi + 16], xmm1 - lea edi, [edi + 32] - - sub ecx, 16 - jg xloop - - pop ebp - pop ebx - pop edi - pop esi - ret - } -} - -#elif !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) -// GCC versions of row functions are verbatim conversions from Visual C. -// Generated using gcc disassembly on Visual C object file: -// objdump -D yuvscaler.obj >yuvscaler.txt -#define HAS_SCALEROWDOWN2_SSE2 -static void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - ".p2align 4 \n" - "1: \n" - "movdqa (%0),%%xmm0 \n" - "movdqa 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "psrlw $0x8,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1" -#endif - ); -} - -void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" - ".p2align 4 \n" - "1: \n" - "movdqa (%0),%%xmm0 \n" - "movdqa 0x10(%0),%%xmm1 \n" - "movdqa (%0,%3,1),%%xmm2 \n" - "movdqa 0x10(%0,%3,1),%%xmm3 \n" - "lea 0x20(%0),%0 \n" - "pavgb %%xmm2,%%xmm0 \n" - "pavgb %%xmm3,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "psrlw $0x8,%%xmm0 \n" - "movdqa %%xmm1,%%xmm3 \n" - "psrlw $0x8,%%xmm1 \n" - "pand %%xmm5,%%xmm2 \n" - "pand %%xmm5,%%xmm3 \n" - "pavgw %%xmm2,%%xmm0 \n" - "pavgw %%xmm3,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(static_cast<intptr_t>(src_stride)) // %3 - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" -#endif - ); -} - -static void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr, - ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - ".p2align 4 \n" - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "psrlw $0x8,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1" -#endif - ); -} - -static void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr, - ptrdiff_t src_stride, - 
uint8* dst_ptr, int dst_width) { - asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" - ".p2align 4 \n" - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu (%0,%3,1),%%xmm2 \n" - "movdqu 0x10(%0,%3,1),%%xmm3 \n" - "lea 0x20(%0),%0 \n" - "pavgb %%xmm2,%%xmm0 \n" - "pavgb %%xmm3,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "psrlw $0x8,%%xmm0 \n" - "movdqa %%xmm1,%%xmm3 \n" - "psrlw $0x8,%%xmm1 \n" - "pand %%xmm5,%%xmm2 \n" - "pand %%xmm5,%%xmm3 \n" - "pavgw %%xmm2,%%xmm0 \n" - "pavgw %%xmm3,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(static_cast<intptr_t>(src_stride)) // %3 - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" -#endif - ); -} - -#define HAS_SCALEROWDOWN4_SSE2 -static void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrld $0x18,%%xmm5 \n" - "pslld $0x10,%%xmm5 \n" - ".p2align 4 \n" - "1: \n" - "movdqa (%0),%%xmm0 \n" - "movdqa 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "pand %%xmm5,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "psrlw $0x8,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movq %%xmm0,(%1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm5" -#endif - ); -} - -static void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - intptr_t stridex3 = 0; - asm volatile ( - "pcmpeqb %%xmm7,%%xmm7 \n" - "psrlw $0x8,%%xmm7 \n" - "lea (%4,%4,2),%3 \n" - ".p2align 4 \n" - "1: \n" - "movdqa (%0),%%xmm0 \n" - "movdqa 0x10(%0),%%xmm1 \n" - "movdqa (%0,%4,1),%%xmm2 \n" - "movdqa 0x10(%0,%4,1),%%xmm3 \n" - "pavgb %%xmm2,%%xmm0 \n" - "pavgb %%xmm3,%%xmm1 \n" - "movdqa (%0,%4,2),%%xmm2 \n" - "movdqa 0x10(%0,%4,2),%%xmm3 \n" - "movdqa (%0,%3,1),%%xmm4 \n" - "movdqa 0x10(%0,%3,1),%%xmm5 \n" - "lea 0x20(%0),%0 \n" - "pavgb %%xmm4,%%xmm2 \n" - "pavgb %%xmm2,%%xmm0 \n" - "pavgb %%xmm5,%%xmm3 \n" - "pavgb %%xmm3,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "psrlw $0x8,%%xmm0 \n" - "movdqa %%xmm1,%%xmm3 \n" - "psrlw $0x8,%%xmm1 \n" - "pand %%xmm7,%%xmm2 \n" - "pand %%xmm7,%%xmm3 \n" - "pavgw %%xmm2,%%xmm0 \n" - "pavgw %%xmm3,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0,%%xmm2 \n" - "psrlw $0x8,%%xmm0 \n" - "pand %%xmm7,%%xmm2 \n" - "pavgw %%xmm2,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movq %%xmm0,(%1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width), // %2 - "+r"(stridex3) // %3 - : "r"(static_cast<intptr_t>(src_stride)) // %4 - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm7" -#endif - ); -} - -#define HAS_SCALEROWDOWN34_SSSE3 -static void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "movdqa %0,%%xmm3 \n" - "movdqa %1,%%xmm4 \n" - "movdqa %2,%%xmm5 \n" - : - : "m"(kShuf0), // %0 - "m"(kShuf1), // %1 - "m"(kShuf2) // %2 - ); - asm volatile ( - ".p2align 4 \n" - "1: \n" - "movdqa (%0),%%xmm0 \n" - "movdqa 0x10(%0),%%xmm2 \n" - "lea 0x20(%0),%0 \n" - "movdqa %%xmm2,%%xmm1 \n" - "palignr $0x8,%%xmm0,%%xmm1 \n" - "pshufb %%xmm3,%%xmm0 \n" - 
"pshufb %%xmm4,%%xmm1 \n" - "pshufb %%xmm5,%%xmm2 \n" - "movq %%xmm0,(%1) \n" - "movq %%xmm1,0x8(%1) \n" - "movq %%xmm2,0x10(%1) \n" - "lea 0x18(%1),%1 \n" - "sub $0x18,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" -#endif - ); -} - -static void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr, - ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "movdqa %0,%%xmm2 \n" // kShuf01 - "movdqa %1,%%xmm3 \n" // kShuf11 - "movdqa %2,%%xmm4 \n" // kShuf21 - : - : "m"(kShuf01), // %0 - "m"(kShuf11), // %1 - "m"(kShuf21) // %2 - ); - asm volatile ( - "movdqa %0,%%xmm5 \n" // kMadd01 - "movdqa %1,%%xmm0 \n" // kMadd11 - "movdqa %2,%%xmm1 \n" // kRound34 - : - : "m"(kMadd01), // %0 - "m"(kMadd11), // %1 - "m"(kRound34) // %2 - ); - asm volatile ( - ".p2align 4 \n" - "1: \n" - "movdqa (%0),%%xmm6 \n" - "movdqa (%0,%3),%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - "pshufb %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm5,%%xmm6 \n" - "paddsw %%xmm1,%%xmm6 \n" - "psrlw $0x2,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movq %%xmm6,(%1) \n" - "movdqu 0x8(%0),%%xmm6 \n" - "movdqu 0x8(%0,%3),%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - "pshufb %%xmm3,%%xmm6 \n" - "pmaddubsw %%xmm0,%%xmm6 \n" - "paddsw %%xmm1,%%xmm6 \n" - "psrlw $0x2,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movq %%xmm6,0x8(%1) \n" - "movdqa 0x10(%0),%%xmm6 \n" - "movdqa 0x10(%0,%3),%%xmm7 \n" - "lea 0x20(%0),%0 \n" - "pavgb %%xmm7,%%xmm6 \n" - "pshufb %%xmm4,%%xmm6 \n" - "pmaddubsw %4,%%xmm6 \n" - "paddsw %%xmm1,%%xmm6 \n" - "psrlw $0x2,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movq %%xmm6,0x10(%1) \n" - "lea 0x18(%1),%1 \n" - "sub $0x18,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(static_cast<intptr_t>(src_stride)), // %3 - "m"(kMadd21) // %4 - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" -#endif - ); -} - -static void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr, - ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "movdqa %0,%%xmm2 \n" // kShuf01 - "movdqa %1,%%xmm3 \n" // kShuf11 - "movdqa %2,%%xmm4 \n" // kShuf21 - : - : "m"(kShuf01), // %0 - "m"(kShuf11), // %1 - "m"(kShuf21) // %2 - ); - asm volatile ( - "movdqa %0,%%xmm5 \n" // kMadd01 - "movdqa %1,%%xmm0 \n" // kMadd11 - "movdqa %2,%%xmm1 \n" // kRound34 - : - : "m"(kMadd01), // %0 - "m"(kMadd11), // %1 - "m"(kRound34) // %2 - ); - - asm volatile ( - ".p2align 4 \n" - "1: \n" - "movdqa (%0),%%xmm6 \n" - "movdqa (%0,%3,1),%%xmm7 \n" - "pavgb %%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - "pshufb %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm5,%%xmm6 \n" - "paddsw %%xmm1,%%xmm6 \n" - "psrlw $0x2,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movq %%xmm6,(%1) \n" - "movdqu 0x8(%0),%%xmm6 \n" - "movdqu 0x8(%0,%3,1),%%xmm7 \n" - "pavgb %%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - "pshufb %%xmm3,%%xmm6 \n" - "pmaddubsw %%xmm0,%%xmm6 \n" - "paddsw %%xmm1,%%xmm6 \n" - "psrlw $0x2,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movq %%xmm6,0x8(%1) \n" - "movdqa 0x10(%0),%%xmm6 \n" - "movdqa 0x10(%0,%3,1),%%xmm7 \n" - "lea 0x20(%0),%0 \n" - "pavgb %%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - "pshufb %%xmm4,%%xmm6 \n" - "pmaddubsw %4,%%xmm6 \n" - "paddsw %%xmm1,%%xmm6 \n" - "psrlw $0x2,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movq %%xmm6,0x10(%1) \n" - "lea 0x18(%1),%1 \n" - "sub $0x18,%2 \n" - "jg 1b \n" - : 
"+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(static_cast<intptr_t>(src_stride)), // %3 - "m"(kMadd21) // %4 - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" -#endif - ); -} - -#define HAS_SCALEROWDOWN38_SSSE3 -static void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "movdqa %3,%%xmm4 \n" - "movdqa %4,%%xmm5 \n" - ".p2align 4 \n" - "1: \n" - "movdqa (%0),%%xmm0 \n" - "movdqa 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "pshufb %%xmm4,%%xmm0 \n" - "pshufb %%xmm5,%%xmm1 \n" - "paddusb %%xmm1,%%xmm0 \n" - "movq %%xmm0,(%1) \n" - "movhlps %%xmm0,%%xmm1 \n" - "movd %%xmm1,0x8(%1) \n" - "lea 0xc(%1),%1 \n" - "sub $0xc,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "m"(kShuf38a), // %3 - "m"(kShuf38b) // %4 - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm4", "xmm5" -#endif - ); -} - -static void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr, - ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "movdqa %0,%%xmm2 \n" - "movdqa %1,%%xmm3 \n" - "movdqa %2,%%xmm4 \n" - "movdqa %3,%%xmm5 \n" - : - : "m"(kShufAb0), // %0 - "m"(kShufAb1), // %1 - "m"(kShufAb2), // %2 - "m"(kScaleAb2) // %3 - ); - asm volatile ( - ".p2align 4 \n" - "1: \n" - "movdqa (%0),%%xmm0 \n" - "pavgb (%0,%3,1),%%xmm0 \n" - "lea 0x10(%0),%0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "pshufb %%xmm2,%%xmm1 \n" - "movdqa %%xmm0,%%xmm6 \n" - "pshufb %%xmm3,%%xmm6 \n" - "paddusw %%xmm6,%%xmm1 \n" - "pshufb %%xmm4,%%xmm0 \n" - "paddusw %%xmm0,%%xmm1 \n" - "pmulhuw %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm1 \n" - "sub $0x6,%2 \n" - "movd %%xmm1,(%1) \n" - "psrlq $0x10,%%xmm1 \n" - "movd %%xmm1,0x2(%1) \n" - "lea 0x6(%1),%1 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(static_cast<intptr_t>(src_stride)) // %3 - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" -#endif - ); -} - -static void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr, - ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "movdqa %0,%%xmm2 \n" - "movdqa %1,%%xmm3 \n" - "movdqa %2,%%xmm4 \n" - "pxor %%xmm5,%%xmm5 \n" - : - : "m"(kShufAc), // %0 - "m"(kShufAc3), // %1 - "m"(kScaleAc33) // %2 - ); - asm volatile ( - ".p2align 4 \n" - "1: \n" - "movdqa (%0),%%xmm0 \n" - "movdqa (%0,%3,1),%%xmm6 \n" - "movhlps %%xmm0,%%xmm1 \n" - "movhlps %%xmm6,%%xmm7 \n" - "punpcklbw %%xmm5,%%xmm0 \n" - "punpcklbw %%xmm5,%%xmm1 \n" - "punpcklbw %%xmm5,%%xmm6 \n" - "punpcklbw %%xmm5,%%xmm7 \n" - "paddusw %%xmm6,%%xmm0 \n" - "paddusw %%xmm7,%%xmm1 \n" - "movdqa (%0,%3,2),%%xmm6 \n" - "lea 0x10(%0),%0 \n" - "movhlps %%xmm6,%%xmm7 \n" - "punpcklbw %%xmm5,%%xmm6 \n" - "punpcklbw %%xmm5,%%xmm7 \n" - "paddusw %%xmm6,%%xmm0 \n" - "paddusw %%xmm7,%%xmm1 \n" - "movdqa %%xmm0,%%xmm6 \n" - "psrldq $0x2,%%xmm0 \n" - "paddusw %%xmm0,%%xmm6 \n" - "psrldq $0x2,%%xmm0 \n" - "paddusw %%xmm0,%%xmm6 \n" - "pshufb %%xmm2,%%xmm6 \n" - "movdqa %%xmm1,%%xmm7 \n" - "psrldq $0x2,%%xmm1 \n" - "paddusw %%xmm1,%%xmm7 \n" - "psrldq $0x2,%%xmm1 \n" - "paddusw %%xmm1,%%xmm7 \n" - "pshufb %%xmm3,%%xmm7 \n" - "paddusw %%xmm7,%%xmm6 \n" - "pmulhuw %%xmm4,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "sub $0x6,%2 \n" - "movd %%xmm6,(%1) \n" - "psrlq $0x10,%%xmm6 \n" - "movd %%xmm6,0x2(%1) \n" - "lea 0x6(%1),%1 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - 
"+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(static_cast<intptr_t>(src_stride)) // %3 - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" -#endif - ); -} - -#define HAS_SCALEADDROWS_SSE2 -static void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, - uint16* dst_ptr, int src_width, int src_height) { - int tmp_height = 0; - intptr_t tmp_src = 0; - asm volatile ( - "pxor %%xmm4,%%xmm4 \n" - "sub $0x1,%5 \n" - ".p2align 4 \n" - "1: \n" - "movdqa (%0),%%xmm0 \n" - "mov %0,%3 \n" - "add %6,%0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm4,%%xmm0 \n" - "punpckhbw %%xmm4,%%xmm1 \n" - "mov %5,%2 \n" - "test %2,%2 \n" - "je 3f \n" - "2: \n" - "movdqa (%0),%%xmm2 \n" - "add %6,%0 \n" - "movdqa %%xmm2,%%xmm3 \n" - "punpcklbw %%xmm4,%%xmm2 \n" - "punpckhbw %%xmm4,%%xmm3 \n" - "paddusw %%xmm2,%%xmm0 \n" - "paddusw %%xmm3,%%xmm1 \n" - "sub $0x1,%2 \n" - "jg 2b \n" - "3: \n" - "movdqa %%xmm0,(%1) \n" - "movdqa %%xmm1,0x10(%1) \n" - "lea 0x10(%3),%0 \n" - "lea 0x20(%1),%1 \n" - "sub $0x10,%4 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(tmp_height), // %2 - "+r"(tmp_src), // %3 - "+r"(src_width), // %4 - "+rm"(src_height) // %5 - : "rm"(static_cast<intptr_t>(src_stride)) // %6 - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" -#endif - ); -} - -#endif // defined(__x86_64__) || defined(__i386__) - -#if !defined(LIBYUV_DISABLE_MIPS) && \ - defined(__mips_dsp) && (__mips_dsp_rev >= 2) -#define HAS_SCALEROWDOWN2_MIPS_DSPR2 -void ScaleRowDown2_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */, - uint8* dst, int dst_width); -void ScaleRowDown2Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); -#define HAS_SCALEROWDOWN4_MIPS_DSPR2 -void ScaleRowDown4_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */, - uint8* dst, int dst_width); -void ScaleRowDown4Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); -#define HAS_SCALEROWDOWN34_MIPS_DSPR2 -void ScaleRowDown34_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */, - uint8* dst, int dst_width); -void ScaleRowDown34_0_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* d, int dst_width); -void ScaleRowDown34_1_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* d, int dst_width); -#define HAS_SCALEROWDOWN38_MIPS_DSPR2 -void ScaleRowDown38_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */, - uint8* dst, int dst_width); -void ScaleRowDown38_2_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown38_3_Box_MIPS_DSPR2(const uint8* src_ptr, - ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -#endif // defined(__mips_dsp) && (__mips_dsp_rev >= 2) - -// CPU agnostic row functions -static void ScaleRowDown2_C(const uint8* src_ptr, ptrdiff_t /* src_stride */, - uint8* dst, int dst_width) { - uint8* dend = dst + dst_width - 1; - do { - dst[0] = src_ptr[1]; - dst[1] = src_ptr[3]; - dst += 2; - src_ptr += 4; - } while (dst < dend); - if (dst_width & 1) { - dst[0] = src_ptr[1]; - } -} - -void ScaleRowDown2Box_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { - const uint8* s = src_ptr; - const uint8* t = src_ptr + src_stride; - uint8* dend = dst + dst_width - 1; - do { - dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2; - dst[1] = (s[2] + s[3] + t[2] + t[3] + 2) >> 2; - dst += 2; - s += 4; - t += 4; - } while 
(dst < dend); - if (dst_width & 1) { - dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2; - } -} - -static void ScaleRowDown4_C(const uint8* src_ptr, ptrdiff_t /* src_stride */, - uint8* dst, int dst_width) { - uint8* dend = dst + dst_width - 1; - do { - dst[0] = src_ptr[2]; - dst[1] = src_ptr[6]; - dst += 2; - src_ptr += 8; - } while (dst < dend); - if (dst_width & 1) { - dst[0] = src_ptr[2]; - } -} - -static void ScaleRowDown4Box_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { - intptr_t stride = src_stride; - uint8* dend = dst + dst_width - 1; - do { - dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] + - src_ptr[stride + 0] + src_ptr[stride + 1] + - src_ptr[stride + 2] + src_ptr[stride + 3] + - src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] + - src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] + - src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] + - src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] + - 8) >> 4; - dst[1] = (src_ptr[4] + src_ptr[5] + src_ptr[6] + src_ptr[7] + - src_ptr[stride + 4] + src_ptr[stride + 5] + - src_ptr[stride + 6] + src_ptr[stride + 7] + - src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5] + - src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7] + - src_ptr[stride * 3 + 4] + src_ptr[stride * 3 + 5] + - src_ptr[stride * 3 + 6] + src_ptr[stride * 3 + 7] + - 8) >> 4; - dst += 2; - src_ptr += 8; - } while (dst < dend); - if (dst_width & 1) { - dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] + - src_ptr[stride + 0] + src_ptr[stride + 1] + - src_ptr[stride + 2] + src_ptr[stride + 3] + - src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] + - src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] + - src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] + - src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] + - 8) >> 4; - } -} - -static void ScaleRowDown34_C(const uint8* src_ptr, ptrdiff_t /* src_stride */, - uint8* dst, int dst_width) { - assert((dst_width % 3 == 0) && (dst_width > 0)); - uint8* dend = dst + dst_width; - do { - dst[0] = src_ptr[0]; - dst[1] = src_ptr[1]; - dst[2] = src_ptr[3]; - dst += 3; - src_ptr += 4; - } while (dst < dend); -} - -// Filter rows 0 and 1 together, 3 : 1 -static void ScaleRowDown34_0_Box_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* d, int dst_width) { - assert((dst_width % 3 == 0) && (dst_width > 0)); - const uint8* s = src_ptr; - const uint8* t = src_ptr + src_stride; - uint8* dend = d + dst_width; - do { - uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; - uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; - uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; - uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; - uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; - uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; - d[0] = (a0 * 3 + b0 + 2) >> 2; - d[1] = (a1 * 3 + b1 + 2) >> 2; - d[2] = (a2 * 3 + b2 + 2) >> 2; - d += 3; - s += 4; - t += 4; - } while (d < dend); -} - -// Filter rows 1 and 2 together, 1 : 1 -static void ScaleRowDown34_1_Box_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* d, int dst_width) { - assert((dst_width % 3 == 0) && (dst_width > 0)); - const uint8* s = src_ptr; - const uint8* t = src_ptr + src_stride; - uint8* dend = d + dst_width; - do { - uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; - uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; - uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; - uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; - uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; - uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; - d[0] = (a0 + b0 + 1) >> 1; - d[1] = (a1 + b1 + 1) >> 1; - d[2] = (a2 + b2 + 1) >> 1; - d += 3; - 
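// Editorial note (not part of the patch): in the 3/4 kernels above every
// weighted sum uses weights that total the divisor (3+1 over >> 2, 1+1 over
// >> 1) plus a half-divisor rounding bias, so flat areas pass through the
// filter unchanged. A tiny self-check of that property over all 8-bit values:

#include <assert.h>
#include <stdint.h>

static void CheckRowDown34FlatField(void) {
  for (int v = 0; v <= 255; ++v) {
    uint8_t a0 = (uint8_t)((v * 3 + v * 1 + 2) >> 2);  // horizontal 3:1 tap
    uint8_t a1 = (uint8_t)((v * 1 + v * 1 + 1) >> 1);  // horizontal 1:1 tap
    uint8_t d0 = (uint8_t)((a0 * 3 + a0 + 2) >> 2);    // vertical 3:1 blend
    uint8_t d1 = (uint8_t)((a1 + a1 + 1) >> 1);        // vertical 1:1 blend
    assert(a0 == v && a1 == v && d0 == v && d1 == v);
  }
}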
s += 4; - t += 4; - } while (d < dend); -} - -// (1-f)a + fb can be replaced with a + f(b-a) -#define BLENDER(a, b, f) (static_cast<int>(a) + \ - ((f) * (static_cast<int>(b) - static_cast<int>(a)) >> 16)) - -static void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr, - int dst_width, int x, int dx) { - for (int j = 0; j < dst_width - 1; j += 2) { - int xi = x >> 16; - int a = src_ptr[xi]; - int b = src_ptr[xi + 1]; - dst_ptr[0] = BLENDER(a, b, x & 0xffff); - x += dx; - xi = x >> 16; - a = src_ptr[xi]; - b = src_ptr[xi + 1]; - dst_ptr[1] = BLENDER(a, b, x & 0xffff); - x += dx; - dst_ptr += 2; - } - if (dst_width & 1) { - int xi = x >> 16; - int a = src_ptr[xi]; - int b = src_ptr[xi + 1]; - dst_ptr[0] = BLENDER(a, b, x & 0xffff); - } -} - -static void ScaleRowDown38_C(const uint8* src_ptr, ptrdiff_t /* src_stride */, - uint8* dst, int dst_width) { - assert(dst_width % 3 == 0); - for (int x = 0; x < dst_width; x += 3) { - dst[0] = src_ptr[0]; - dst[1] = src_ptr[3]; - dst[2] = src_ptr[6]; - dst += 3; - src_ptr += 8; - } -} - -// 8x3 -> 3x1 -static void ScaleRowDown38_3_Box_C(const uint8* src_ptr, - ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - assert((dst_width % 3 == 0) && (dst_width > 0)); - intptr_t stride = src_stride; - for (int i = 0; i < dst_width; i += 3) { - dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + - src_ptr[stride + 0] + src_ptr[stride + 1] + - src_ptr[stride + 2] + src_ptr[stride * 2 + 0] + - src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) * - (65536 / 9) >> 16; - dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + - src_ptr[stride + 3] + src_ptr[stride + 4] + - src_ptr[stride + 5] + src_ptr[stride * 2 + 3] + - src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) * - (65536 / 9) >> 16; - dst_ptr[2] = (src_ptr[6] + src_ptr[7] + - src_ptr[stride + 6] + src_ptr[stride + 7] + - src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) * - (65536 / 6) >> 16; - src_ptr += 8; - dst_ptr += 3; - } -} - -// 8x2 -> 3x1 -static void ScaleRowDown38_2_Box_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - assert((dst_width % 3 == 0) && (dst_width > 0)); - intptr_t stride = src_stride; - for (int i = 0; i < dst_width; i += 3) { - dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + - src_ptr[stride + 0] + src_ptr[stride + 1] + - src_ptr[stride + 2]) * (65536 / 6) >> 16; - dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + - src_ptr[stride + 3] + src_ptr[stride + 4] + - src_ptr[stride + 5]) * (65536 / 6) >> 16; - dst_ptr[2] = (src_ptr[6] + src_ptr[7] + - src_ptr[stride + 6] + src_ptr[stride + 7]) * - (65536 / 4) >> 16; - src_ptr += 8; - dst_ptr += 3; - } -} - -void ScaleAddRows_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint16* dst_ptr, int src_width, int src_height) { - assert(src_width > 0); - assert(src_height > 0); - for (int x = 0; x < src_width; ++x) { - const uint8* s = src_ptr + x; - int sum = 0; - for (int y = 0; y < src_height; ++y) { - sum += s[0]; - s += src_stride; - } - dst_ptr[x] = sum; - } -} +#define SUBSAMPLE(v, a, s) (v < 0) ? (-((-v + a) >> s)) : ((v + a) >> s) // Scale plane, 1/2 // This is an optimized version for scaling down a plane to 1/2 of @@ -1679,7 +43,9 @@ static void ScalePlaneDown2(int /* src_width */, int /* src_height */, FilterMode filtering) { void (*ScaleRowDown2)(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) = - filtering ? ScaleRowDown2Box_C : ScaleRowDown2_C; + filtering == kFilterNone ? ScaleRowDown2_C : + (filtering == kFilterLinear ? 
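// Editorial note (not part of the patch): the new SUBSAMPLE macro above
// computes a subsampled dimension as a rounded-up shift, and mirrors that
// rounding for negative (inverted) sizes instead of letting >> round toward
// negative infinity. A small illustration of the values it produces for the
// chroma half-size case used by I420Scale further below (re-stated under a
// different name, with extra parentheses, so it does not clash):

#include <assert.h>

#define SUBSAMPLE_DEMO(v, a, s) \
  ((v) < 0 ? -((-(v) + (a)) >> (s)) : ((v) + (a)) >> (s))

static void CheckSubsample(void) {
  assert(SUBSAMPLE_DEMO(4, 1, 1) == 2);    // even width: exactly half
  assert(SUBSAMPLE_DEMO(5, 1, 1) == 3);    // odd width: rounds up
  assert(SUBSAMPLE_DEMO(-5, 1, 1) == -3);  // inverted image: same magnitude
}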
ScaleRowDown2Linear_C : + ScaleRowDown2Box_C); int row_stride = src_stride << 1; if (!filtering) { src_ptr += src_stride; // Point to odd rows. @@ -1692,12 +58,15 @@ static void ScalePlaneDown2(int /* src_width */, int /* src_height */, } #elif defined(HAS_SCALEROWDOWN2_SSE2) if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 16)) { - ScaleRowDown2 = filtering ? ScaleRowDown2Box_Unaligned_SSE2 : - ScaleRowDown2_Unaligned_SSE2; + ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Unaligned_SSE2 : + (filtering == kFilterLinear ? ScaleRowDown2Linear_Unaligned_SSE2 : + ScaleRowDown2Box_Unaligned_SSE2); if (IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) && IS_ALIGNED(row_stride, 16) && IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) { - ScaleRowDown2 = filtering ? ScaleRowDown2Box_SSE2 : ScaleRowDown2_SSE2; + ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_SSE2 : + (filtering == kFilterLinear ? ScaleRowDown2Linear_SSE2 : + ScaleRowDown2Box_SSE2); } } #elif defined(HAS_SCALEROWDOWN2_MIPS_DSPR2) @@ -1709,6 +78,9 @@ static void ScalePlaneDown2(int /* src_width */, int /* src_height */, } #endif + if (filtering == kFilterLinear) { + src_stride = 0; + } // TODO(fbarchard): Loop through source height to allow odd height. for (int y = 0; y < dst_height; ++y) { ScaleRowDown2(src_ptr, src_stride, dst_ptr, dst_width); @@ -1753,6 +125,9 @@ static void ScalePlaneDown4(int /* src_width */, int /* src_height */, } #endif + if (filtering == kFilterLinear) { + src_stride = 0; + } for (int y = 0; y < dst_height; ++y) { ScaleRowDown4(src_ptr, src_stride, dst_ptr, dst_width); src_ptr += row_stride; @@ -1816,14 +191,15 @@ static void ScalePlaneDown34(int /* src_width */, int /* src_height */, } #endif + const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride; for (int y = 0; y < dst_height - 2; y += 3) { - ScaleRowDown34_0(src_ptr, src_stride, dst_ptr, dst_width); + ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width); src_ptr += src_stride; dst_ptr += dst_stride; - ScaleRowDown34_1(src_ptr, src_stride, dst_ptr, dst_width); + ScaleRowDown34_1(src_ptr, filter_stride, dst_ptr, dst_width); src_ptr += src_stride; dst_ptr += dst_stride; - ScaleRowDown34_0(src_ptr + src_stride, -src_stride, + ScaleRowDown34_0(src_ptr + src_stride, -filter_stride, dst_ptr, dst_width); src_ptr += src_stride * 2; dst_ptr += dst_stride; @@ -1831,7 +207,7 @@ static void ScalePlaneDown34(int /* src_width */, int /* src_height */, // Remainder 1 or 2 rows with last row vertically unfiltered if ((dst_height % 3) == 2) { - ScaleRowDown34_0(src_ptr, src_stride, dst_ptr, dst_width); + ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width); src_ptr += src_stride; dst_ptr += dst_stride; ScaleRowDown34_1(src_ptr, 0, dst_ptr, dst_width); @@ -1908,21 +284,22 @@ static void ScalePlaneDown38(int /* src_width */, int /* src_height */, } #endif + const int filter_stride = (filtering == kFilterLinear) ? 
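// Editorial note (not part of the patch): the new kFilterLinear handling
// above zeroes the stride handed to the row kernels. For the two-row "Box"
// kernels that makes them blend a row with itself, which degrades them to a
// purely horizontal filter: (s[0] + s[1] + t[0] + t[1] + 2) >> 2 with t == s
// is just (s[0] + s[1] + 1) >> 1. A tiny scalar check of that identity:

#include <assert.h>
#include <stdint.h>

static void CheckLinearViaZeroStride(const uint8_t* s) {
  const uint8_t* t = s + 0;  // stride 0: the "second row" aliases the first
  int box = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;  // two-row box filter
  int linear = (s[0] + s[1] + 1) >> 1;             // horizontal-only average
  assert(box == linear);
}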
0 : src_stride; for (int y = 0; y < dst_height - 2; y += 3) { - ScaleRowDown38_3(src_ptr, src_stride, dst_ptr, dst_width); + ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width); src_ptr += src_stride * 3; dst_ptr += dst_stride; - ScaleRowDown38_3(src_ptr, src_stride, dst_ptr, dst_width); + ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width); src_ptr += src_stride * 3; dst_ptr += dst_stride; - ScaleRowDown38_2(src_ptr, src_stride, dst_ptr, dst_width); + ScaleRowDown38_2(src_ptr, filter_stride, dst_ptr, dst_width); src_ptr += src_stride * 2; dst_ptr += dst_stride; } // Remainder 1 or 2 rows with last row vertically unfiltered if ((dst_height % 3) == 2) { - ScaleRowDown38_3(src_ptr, src_stride, dst_ptr, dst_width); + ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width); src_ptr += src_stride * 3; dst_ptr += dst_stride; ScaleRowDown38_3(src_ptr, 0, dst_ptr, dst_width); @@ -1998,24 +375,22 @@ static void ScaleAddCols1_C(int dst_width, int boxheight, int x, int dx, // one pixel of destination using fixed point (16.16) to step // through source, sampling a box of pixel with simple // averaging. - +SAFEBUFFERS static void ScalePlaneBox(int src_width, int src_height, int dst_width, int dst_height, int src_stride, int dst_stride, const uint8* src_ptr, uint8* dst_ptr) { assert(dst_width > 0); assert(dst_height > 0); - int dx = (Abs(src_width) << 16) / dst_width; - int dy = (src_height << 16) / dst_height; + + // Initial source x/y coordinate and step values as 16.16 fixed point. int x = 0; int y = 0; - // Negative src_width means horizontally mirror. - if (src_width < 0) { - x += (dst_width - 1) * dx; - dx = -dx; - src_width = -src_width; - } - int maxy = (src_height << 16); + int dx = 0; + int dy = 0; + ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox, + &x, &y, &dx, &dy); + const int max_y = (src_height << 16); if (!IS_ALIGNED(src_width, 16) || (src_width > kMaxStride) || dst_height * 2 > src_height) { uint8* dst = dst_ptr; @@ -2023,8 +398,8 @@ static void ScalePlaneBox(int src_width, int src_height, int iy = y >> 16; const uint8* src = src_ptr + iy * src_stride; y += dy; - if (y > maxy) { - y = maxy; + if (y > max_y) { + y = max_y; } int boxheight = (y >> 16) - iy; ScalePlaneBoxRow_C(dst_width, boxheight, @@ -2046,6 +421,9 @@ static void ScalePlaneBox(int src_width, int src_height, } #if defined(HAS_SCALEADDROWS_SSE2) if (TestCpuFlag(kCpuHasSSE2) && +#ifdef AVOID_OVERREAD + IS_ALIGNED(src_width, 16) && +#endif IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) { ScaleAddRows = ScaleAddRows_SSE2; } @@ -2066,16 +444,25 @@ static void ScalePlaneBox(int src_width, int src_height, } } -// Scale plane to/from any dimensions, with bilinear interpolation. - -void ScalePlaneBilinear(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint8* src_ptr, uint8* dst_ptr) { +// Scale plane down with bilinear interpolation. +SAFEBUFFERS +void ScalePlaneBilinearDown(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_ptr, uint8* dst_ptr, + FilterMode filtering) { assert(dst_width > 0); assert(dst_height > 0); assert(Abs(src_width) <= kMaxStride); + // Initial source x/y coordinate and step values as 16.16 fixed point. 
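// Editorial note (not part of the patch): these planar scalers step through
// the source with 16.16 fixed point: the upper 16 bits of x are the integer
// source column, the lower 16 bits the fraction, and dx is the per-output
// step (roughly (src_width << 16) / dst_width; the exact starting offsets now
// come from ScaleSlope). A minimal nearest-neighbor column pass in that
// style, with names of our own choosing:

#include <stdint.h>

static void NearestCols16Dot16(uint8_t* dst, const uint8_t* src,
                               int dst_width, int x, int dx) {
  for (int i = 0; i < dst_width; ++i) {
    dst[i] = src[x >> 16];  // integer part selects the source pixel
    x += dx;                // fraction accumulates until it carries
  }
}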
+ int x = 0; + int y = 0; + int dx = 0; + int dy = 0; + ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, + &x, &y, &dx, &dy); + SIMD_ALIGNED(uint8 row[kMaxStride + 16]); void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr, @@ -2103,6 +490,14 @@ void ScalePlaneBilinear(int src_width, int src_height, } } #endif +#if defined(HAS_INTERPOLATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && src_width >= 32) { + InterpolateRow = InterpolateRow_Any_AVX2; + if (IS_ALIGNED(src_width, 32)) { + InterpolateRow = InterpolateRow_AVX2; + } + } +#endif #if defined(HAS_INTERPOLATEROW_NEON) if (TestCpuFlag(kCpuHasNEON) && src_width >= 16) { InterpolateRow = InterpolateRow_Any_NEON; @@ -2119,44 +514,170 @@ void ScalePlaneBilinear(int src_width, int src_height, } } #endif - int dx = 0; - int dy = 0; + + void (*ScaleFilterCols)(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int x, int dx) = ScaleFilterCols_C; +#if defined(HAS_SCALEFILTERCOLS_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ScaleFilterCols = ScaleFilterCols_SSSE3; + } +#endif + + const int max_y = (src_height - 1) << 16; + for (int j = 0; j < dst_height; ++j) { + if (y > max_y) { + y = max_y; + } + int yi = y >> 16; + const uint8* src = src_ptr + yi * src_stride; + if (filtering == kFilterLinear) { + ScaleFilterCols(dst_ptr, src, dst_width, x, dx); + } else { + int yf = (y >> 8) & 255; + InterpolateRow(row, src, src_stride, src_width, yf); + ScaleFilterCols(dst_ptr, row, dst_width, x, dx); + } + dst_ptr += dst_stride; + y += dy; + } +} + +// Scale up down with bilinear interpolation. +SAFEBUFFERS +void ScalePlaneBilinearUp(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_ptr, uint8* dst_ptr, + FilterMode filtering) { + assert(src_width != 0); + assert(src_height != 0); + assert(dst_width > 0); + assert(dst_height > 0); + assert(Abs(dst_width) <= kMaxStride); + + // Initial source x/y coordinate and step values as 16.16 fixed point. int x = 0; int y = 0; - if (dst_width <= Abs(src_width)) { - dx = (Abs(src_width) << 16) / dst_width; - x = (dx >> 1) - 32768; - } else if (dst_width > 1) { - dx = ((Abs(src_width) - 1) << 16) / (dst_width - 1); + int dx = 0; + int dy = 0; + ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, + &x, &y, &dx, &dy); + + void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride, int dst_width, int source_y_fraction) = + InterpolateRow_C; +#if defined(HAS_INTERPOLATEROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && dst_width >= 16) { + InterpolateRow = InterpolateRow_Any_SSE2; + if (IS_ALIGNED(dst_width, 16)) { + InterpolateRow = InterpolateRow_Unaligned_SSE2; + if (IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) { + InterpolateRow = InterpolateRow_SSE2; + } + } } - // Negative src_width means horizontally mirror. 
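// Editorial note (not part of the patch): the bilinear paths reduce the 16.16
// vertical fraction to 8 bits (yf = (y >> 8) & 255) and hand it to
// InterpolateRow, which blends a row with the row one stride further down.
// A scalar model of that blend, assuming the usual (256 - f) / f weighting;
// the in-tree InterpolateRow_C may differ in rounding details. The up-scaler
// below additionally keeps its two column-scaled rows in a small ping-pong
// buffer (rowptr / rowstride) so each source row is column-scaled only once.

#include <stddef.h>
#include <stdint.h>

static void BlendRows(uint8_t* dst, const uint8_t* row0, ptrdiff_t src_stride,
                      int width, int yf /* 0..255 */) {
  const uint8_t* row1 = row0 + src_stride;
  for (int i = 0; i < width; ++i) {
    dst[i] = (uint8_t)((row0[i] * (256 - yf) + row1[i] * yf) >> 8);
  }
}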
- if (src_width < 0) { - x += (dst_width - 1) * dx; - dx = -dx; - src_width = -src_width; +#endif +#if defined(HAS_INTERPOLATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && dst_width >= 16) { + InterpolateRow = InterpolateRow_Any_SSSE3; + if (IS_ALIGNED(dst_width, 16)) { + InterpolateRow = InterpolateRow_Unaligned_SSSE3; + if (IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) { + InterpolateRow = InterpolateRow_SSSE3; + } + } } - if (dst_height <= src_height) { - dy = (src_height << 16) / dst_height; - y = (dy >> 1) - 32768; - } else if (dst_height > 1) { - dy = ((src_height - 1) << 16) / (dst_height - 1); +#endif +#if defined(HAS_INTERPOLATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && dst_width >= 32) { + InterpolateRow = InterpolateRow_Any_AVX2; + if (IS_ALIGNED(dst_width, 32)) { + InterpolateRow = InterpolateRow_AVX2; + } } - int maxy = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0; +#endif +#if defined(HAS_INTERPOLATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && dst_width >= 16) { + InterpolateRow = InterpolateRow_Any_NEON; + if (IS_ALIGNED(dst_width, 16)) { + InterpolateRow = InterpolateRow_NEON; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && dst_width >= 4) { + InterpolateRow = InterpolateRow_Any_MIPS_DSPR2; + if (IS_ALIGNED(dst_width, 4)) { + InterpolateRow = InterpolateRow_MIPS_DSPR2; + } + } +#endif + + void (*ScaleFilterCols)(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int x, int dx) = + filtering ? ScaleFilterCols_C : ScaleCols_C; +#if defined(HAS_SCALEFILTERCOLS_SSSE3) + if (filtering && TestCpuFlag(kCpuHasSSSE3)) { + ScaleFilterCols = ScaleFilterCols_SSSE3; + } +#endif + if (!filtering && src_width * 2 == dst_width && x < 0x8000) { + ScaleFilterCols = ScaleColsUp2_C; +#if defined(HAS_SCALECOLS_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) && + IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) && + IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) { + ScaleFilterCols = ScaleColsUp2_SSE2; + } +#endif + } + + const int max_y = (src_height - 1) << 16; + if (y > max_y) { + y = max_y; + } + int yi = y >> 16; + const uint8* src = src_ptr + yi * src_stride; + SIMD_ALIGNED(uint8 row[2 * kMaxStride]); + uint8* rowptr = row; + int rowstride = kMaxStride; + int lasty = yi; + + ScaleFilterCols(rowptr, src, dst_width, x, dx); + if (src_height > 1) { + src += src_stride; + } + ScaleFilterCols(rowptr + rowstride, src, dst_width, x, dx); + src += src_stride; + for (int j = 0; j < dst_height; ++j) { - if (y > maxy) { - y = maxy; + yi = y >> 16; + if (yi != lasty) { + if (y > max_y) { + y = max_y; + yi = y >> 16; + src = src_ptr + yi * src_stride; + } + if (yi != lasty) { + ScaleFilterCols(rowptr, src, dst_width, x, dx); + rowptr += rowstride; + rowstride = -rowstride; + lasty = yi; + src += src_stride; + } + } + if (filtering == kFilterLinear) { + InterpolateRow(dst_ptr, rowptr, 0, dst_width, 0); + } else { + int yf = (y >> 8) & 255; + InterpolateRow(dst_ptr, rowptr, rowstride, dst_width, yf); } - int yi = y >> 16; - int yf = (y >> 8) & 255; - const uint8* src = src_ptr + yi * src_stride; - InterpolateRow(row, src, src_stride, src_width, yf); - ScaleFilterCols_C(dst_ptr, row, dst_width, x, dx); dst_ptr += dst_stride; y += dy; } } -// Scale plane to/from any dimensions, without interpolation. +// Scale Plane to/from any dimensions, without interpolation. 
// Fixed point math is used for performance: The upper 16 bits // of x and dx is the integer part of the source position and // the lower 16 bits are the fixed decimal part. @@ -2165,74 +686,37 @@ static void ScalePlaneSimple(int src_width, int src_height, int dst_width, int dst_height, int src_stride, int dst_stride, const uint8* src_ptr, uint8* dst_ptr) { - int dx = (Abs(src_width) << 16) / dst_width; - int dy = (src_height << 16) / dst_height; - int x = dx >> 1; - int y = dy >> 1; - // Negative src_width means horizontally mirror. - if (src_width < 0) { - x += (dst_width - 1) * dx; - dx = -dx; - src_width = -src_width; + // Initial source x/y coordinate and step values as 16.16 fixed point. + int x = 0; + int y = 0; + int dx = 0; + int dy = 0; + ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterNone, + &x, &y, &dx, &dy); + + void (*ScaleCols)(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int x, int dx) = ScaleCols_C; + if (src_width * 2 == dst_width && x < 0x8000) { + ScaleCols = ScaleColsUp2_C; +#if defined(HAS_SCALECOLS_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) && + IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) && + IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) { + ScaleCols = ScaleColsUp2_SSE2; + } +#endif } - for (int j = 0; j < dst_height; ++j) { - int xs = x; - int yi = y >> 16; - const uint8* src = src_ptr + yi * src_stride; - uint8* dst = dst_ptr; - for (int i = 0; i < dst_width; ++i) { - *dst++ = src[xs >> 16]; - xs += dx; - } + for (int i = 0; i < dst_height; ++i) { + ScaleCols(dst_ptr, src_ptr + (y >> 16) * src_stride, + dst_width, x, dx); dst_ptr += dst_stride; y += dy; } } -// Scale plane to/from any dimensions. - -static void ScalePlaneAnySize(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint8* src_ptr, uint8* dst_ptr, - FilterMode filtering) { - if (!filtering || src_width > kMaxStride) { - ScalePlaneSimple(src_width, src_height, dst_width, dst_height, - src_stride, dst_stride, src_ptr, dst_ptr); - } else { - ScalePlaneBilinear(src_width, src_height, dst_width, dst_height, - src_stride, dst_stride, src_ptr, dst_ptr); - } -} - -// Scale plane down, any size -// -// This is an optimized version for scaling down a plane to any size. -// The current implementation is ~10 times faster compared to the -// reference implementation for e.g. XGA->LowResPAL - -static void ScalePlaneDown(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint8* src_ptr, uint8* dst_ptr, - FilterMode filtering) { - if (!filtering || src_width > kMaxStride) { - ScalePlaneSimple(src_width, src_height, dst_width, dst_height, - src_stride, dst_stride, src_ptr, dst_ptr); - } else if (filtering == kFilterBilinear || dst_height * 2 > src_height) { - // between 1/2x and 1x use bilinear - ScalePlaneBilinear(src_width, src_height, dst_width, dst_height, - src_stride, dst_stride, src_ptr, dst_ptr); - } else { - ScalePlaneBox(src_width, src_height, dst_width, dst_height, - src_stride, dst_stride, src_ptr, dst_ptr); - } -} - // Scale a plane. -// This function in turn calls a scaling function suitable for handling -// the desired resolutions. +// This function dispatches to a specialized scaler based on scale factor. 
LIBYUV_API void ScalePlane(const uint8* src, int src_stride, @@ -2240,53 +724,87 @@ void ScalePlane(const uint8* src, int src_stride, uint8* dst, int dst_stride, int dst_width, int dst_height, FilterMode filtering) { + // Simplify filtering when possible. + filtering = ScaleFilterReduce(src_width, src_height, + dst_width, dst_height, + filtering); + + // Negative height means invert the image. + if (src_height < 0) { + src_height = -src_height; + src = src + (src_height - 1) * src_stride; + src_stride = -src_stride; + } + // Use specialized scales to improve performance for common resolutions. // For example, all the 1/2 scalings will use ScalePlaneDown2() if (dst_width == src_width && dst_height == src_height) { // Straight copy. CopyPlane(src, src_stride, dst, dst_stride, dst_width, dst_height); - } else if (dst_width <= Abs(src_width) && dst_height <= src_height) { + return; + } + if (dst_width == src_width) { + int dy = FixedDiv(src_height, dst_height); + // Arbitrary scale vertically, but unscaled vertically. + ScalePlaneVertical(src_height, + dst_width, dst_height, + src_stride, dst_stride, src, dst, + 0, 0, dy, 1, filtering); + return; + } + if (dst_width <= Abs(src_width) && dst_height <= src_height) { // Scale down. - if (use_reference_impl_) { - // For testing, allow the optimized versions to be disabled. - ScalePlaneDown(src_width, src_height, dst_width, dst_height, - src_stride, dst_stride, src, dst, filtering); - } else if (4 * dst_width == 3 * src_width && - 4 * dst_height == 3 * src_height) { + if (4 * dst_width == 3 * src_width && + 4 * dst_height == 3 * src_height) { // optimized, 3/4 ScalePlaneDown34(src_width, src_height, dst_width, dst_height, src_stride, dst_stride, src, dst, filtering); - } else if (2 * dst_width == src_width && 2 * dst_height == src_height) { + return; + } + if (2 * dst_width == src_width && 2 * dst_height == src_height) { // optimized, 1/2 ScalePlaneDown2(src_width, src_height, dst_width, dst_height, src_stride, dst_stride, src, dst, filtering); + return; + } // 3/8 rounded up for odd sized chroma height. - } else if (8 * dst_width == 3 * src_width && - dst_height == ((src_height * 3 + 7) / 8)) { + if (8 * dst_width == 3 * src_width && + dst_height == ((src_height * 3 + 7) / 8)) { // optimized, 3/8 ScalePlaneDown38(src_width, src_height, dst_width, dst_height, src_stride, dst_stride, src, dst, filtering); - } else if (4 * dst_width == src_width && 4 * dst_height == src_height && + return; + } + if (4 * dst_width == src_width && 4 * dst_height == src_height && filtering != kFilterBilinear) { // optimized, 1/4 ScalePlaneDown4(src_width, src_height, dst_width, dst_height, src_stride, dst_stride, src, dst, filtering); - } else { - // Arbitrary downsample - ScalePlaneDown(src_width, src_height, dst_width, dst_height, - src_stride, dst_stride, src, dst, filtering); + return; } - } else { - // Arbitrary scale up and/or down. 
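// Editorial note (not part of the patch): with the rewritten dispatch here, a
// single ScalePlane call selects the specialized path from the requested
// sizes and filter (straight copy, vertical-only, 3/4, 1/2, 3/8 and 1/4
// above, box/bilinear/simple fallbacks just below). An illustrative caller
// that halves a Y plane, assuming the declarations in include/libyuv/scale.h
// and caller-managed buffers:

#include <stdint.h>
#include "libyuv/scale.h"

void HalveYPlane(const uint8_t* src_y, int src_stride_y,
                 int src_width, int src_height,
                 uint8_t* dst_y, int dst_stride_y) {
  libyuv::ScalePlane(src_y, src_stride_y, src_width, src_height,
                     dst_y, dst_stride_y, src_width / 2, src_height / 2,
                     libyuv::kFilterBox);
}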
- ScalePlaneAnySize(src_width, src_height, dst_width, dst_height, - src_stride, dst_stride, src, dst, filtering); } + if (filtering == kFilterBox && src_width <= kMaxStride && + dst_height * 2 < src_height ) { + ScalePlaneBox(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst); + return; + } + if (filtering && dst_height > src_height && dst_width <= kMaxStride) { + ScalePlaneBilinearUp(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); + return; + } + if (filtering && src_width <= kMaxStride) { + ScalePlaneBilinearDown(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst, filtering); + return; + } + ScalePlaneSimple(src_width, src_height, dst_width, dst_height, + src_stride, dst_stride, src, dst); } // Scale an I420 image. // This function in turn calls a scaling function for each plane. -// TODO(fbarchard): Disable UNDER_ALLOCATED_HACK -#define UNDER_ALLOCATED_HACK 1 LIBYUV_API int I420Scale(const uint8* src_y, int src_stride_y, @@ -2299,47 +817,13 @@ int I420Scale(const uint8* src_y, int src_stride_y, int dst_width, int dst_height, FilterMode filtering) { if (!src_y || !src_u || !src_v || src_width == 0 || src_height == 0 || - !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0 || - src_width > 32767 || src_height > 32767) { + !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) { return -1; } - // Negative height means invert the image. - if (src_height < 0) { - src_height = -src_height; - int halfheight = Half(src_height); - src_y = src_y + (src_height - 1) * src_stride_y; - src_u = src_u + (halfheight - 1) * src_stride_u; - src_v = src_v + (halfheight - 1) * src_stride_v; - src_stride_y = -src_stride_y; - src_stride_u = -src_stride_u; - src_stride_v = -src_stride_v; - } - int src_halfwidth = Half(src_width); - int src_halfheight = Half(src_height); - int dst_halfwidth = Half(dst_width); - int dst_halfheight = Half(dst_height); - -#ifdef UNDER_ALLOCATED_HACK - // If caller passed width / 2 for stride, adjust halfwidth to match. - if ((src_width & 1) && src_stride_u && src_halfwidth > Abs(src_stride_u)) { - src_halfwidth = src_width >> 1; - } - if ((dst_width & 1) && dst_stride_u && dst_halfwidth > Abs(dst_stride_u)) { - dst_halfwidth = dst_width >> 1; - } - // If caller used height / 2 when computing src_v, it will point into what - // should be the src_u plane. Detect this and reduce halfheight to match. 
- int uv_src_plane_size = src_halfwidth * src_halfheight; - if ((src_height & 1) && - (src_v > src_u) && (src_v < (src_u + uv_src_plane_size))) { - src_halfheight = src_height >> 1; - } - int uv_dst_plane_size = dst_halfwidth * dst_halfheight; - if ((dst_height & 1) && - (dst_v > dst_u) && (dst_v < (dst_u + uv_dst_plane_size))) { - dst_halfheight = dst_height >> 1; - } -#endif + int src_halfwidth = SUBSAMPLE(src_width, 1, 1); + int src_halfheight = SUBSAMPLE(src_height, 1, 1); + int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1); + int dst_halfheight = SUBSAMPLE(dst_height, 1, 1); ScalePlane(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y, dst_width, dst_height, @@ -2362,60 +846,15 @@ int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v, int dst_stride_y, int dst_stride_u, int dst_stride_v, int dst_width, int dst_height, bool interpolate) { - if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 || - !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0 || - src_width > 32767 || src_height > 32767) { - return -1; - } - // Negative height means invert the image. - if (src_height < 0) { - src_height = -src_height; - int halfheight = Half(src_height); - src_y = src_y + (src_height - 1) * src_stride_y; - src_u = src_u + (halfheight - 1) * src_stride_u; - src_v = src_v + (halfheight - 1) * src_stride_v; - src_stride_y = -src_stride_y; - src_stride_u = -src_stride_u; - src_stride_v = -src_stride_v; - } - int src_halfwidth = Half(src_width); - int src_halfheight = Half(src_height); - int dst_halfwidth = Half(dst_width); - int dst_halfheight = Half(dst_height); - FilterMode filtering = interpolate ? kFilterBox : kFilterNone; - -#ifdef UNDER_ALLOCATED_HACK - // If caller passed width / 2 for stride, adjust halfwidth to match. - if ((src_width & 1) && src_stride_u && src_halfwidth > Abs(src_stride_u)) { - src_halfwidth = src_width >> 1; - } - if ((dst_width & 1) && dst_stride_u && dst_halfwidth > Abs(dst_stride_u)) { - dst_halfwidth = dst_width >> 1; - } - // If caller used height / 2 when computing src_v, it will point into what - // should be the src_u plane. Detect this and reduce halfheight to match. - int uv_src_plane_size = src_halfwidth * src_halfheight; - if ((src_height & 1) && - (src_v > src_u) && (src_v < (src_u + uv_src_plane_size))) { - src_halfheight = src_height >> 1; - } - int uv_dst_plane_size = dst_halfwidth * dst_halfheight; - if ((dst_height & 1) && - (dst_v > dst_u) && (dst_v < (dst_u + uv_dst_plane_size))) { - dst_halfheight = dst_height >> 1; - } -#endif - - ScalePlane(src_y, src_stride_y, src_width, src_height, - dst_y, dst_stride_y, dst_width, dst_height, - filtering); - ScalePlane(src_u, src_stride_u, src_halfwidth, src_halfheight, - dst_u, dst_stride_u, dst_halfwidth, dst_halfheight, - filtering); - ScalePlane(src_v, src_stride_v, src_halfwidth, src_halfheight, - dst_v, dst_stride_v, dst_halfwidth, dst_halfheight, - filtering); - return 0; + return I420Scale(src_y, src_stride_y, + src_u, src_stride_u, + src_v, src_stride_v, + src_width, src_height, + dst_y, dst_stride_y, + dst_u, dst_stride_u, + dst_v, dst_stride_v, + dst_width, dst_height, + interpolate ? 
kFilterBox : kFilterNone); } // Deprecated api @@ -2425,15 +864,14 @@ int ScaleOffset(const uint8* src, int src_width, int src_height, bool interpolate) { if (!src || src_width <= 0 || src_height <= 0 || !dst || dst_width <= 0 || dst_height <= 0 || dst_yoffset < 0 || - src_width > 32767 || src_height > 32767 || dst_yoffset >= dst_height) { return -1; } dst_yoffset = dst_yoffset & ~1; // chroma requires offset to multiple of 2. - int src_halfwidth = Half(src_width); - int src_halfheight = Half(src_height); - int dst_halfwidth = Half(dst_width); - int dst_halfheight = Half(dst_height); + int src_halfwidth = SUBSAMPLE(src_width, 1, 1); + int src_halfheight = SUBSAMPLE(src_height, 1, 1); + int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1); + int dst_halfheight = SUBSAMPLE(dst_height, 1, 1); int aheight = dst_height - dst_yoffset * 2; // actual output height const uint8* src_y = src; const uint8* src_u = src + src_width * src_height; @@ -2444,9 +882,15 @@ int ScaleOffset(const uint8* src, int src_width, int src_height, (dst_yoffset >> 1) * dst_halfwidth; uint8* dst_v = dst + dst_width * dst_height + dst_halfwidth * dst_halfheight + (dst_yoffset >> 1) * dst_halfwidth; - return Scale(src_y, src_u, src_v, src_width, src_halfwidth, src_halfwidth, - src_width, src_height, dst_y, dst_u, dst_v, dst_width, - dst_halfwidth, dst_halfwidth, dst_width, aheight, interpolate); + return I420Scale(src_y, src_width, + src_u, src_halfwidth, + src_v, src_halfwidth, + src_width, src_height, + dst_y, dst_width, + dst_u, dst_halfwidth, + dst_v, dst_halfwidth, + dst_width, aheight, + interpolate ? kFilterBox : kFilterNone); } #ifdef __cplusplus diff --git a/chromium/third_party/libyuv/source/scale_argb.cc b/chromium/third_party/libyuv/source/scale_argb.cc index 5cf14d949ef..fb2222e2a23 100644 --- a/chromium/third_party/libyuv/source/scale_argb.cc +++ b/chromium/third_party/libyuv/source/scale_argb.cc @@ -16,6 +16,7 @@ #include "libyuv/cpu_id.h" #include "libyuv/planar_functions.h" // For CopyARGB #include "libyuv/row.h" +#include "libyuv/scale_row.h" #ifdef __cplusplus namespace libyuv { @@ -26,715 +27,9 @@ static __inline int Abs(int v) { return v >= 0 ? v : -v; } -// ARGB scaling uses bilinear or point, but not box filter. -#if !defined(LIBYUV_DISABLE_NEON) && \ - (defined(__ARM_NEON__) || defined(LIBYUV_NEON)) -#define HAS_SCALEARGBROWDOWNEVEN_NEON -#define HAS_SCALEARGBROWDOWN2_NEON -void ScaleARGBRowDownEven_NEON(const uint8* src_argb, int src_stride, - int src_stepx, - uint8* dst_argb, int dst_width); -void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, int src_stride, - int src_stepx, - uint8* dst_argb, int dst_width); -void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */, - uint8* dst, int dst_width); -void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); -#endif - -#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) -#define HAS_SCALEARGBROWDOWN2_SSE2 -// Reads 8 pixels, throws half away and writes 4 even pixels (0, 2, 4, 6) -// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. 
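// Editorial note (not part of the patch): for ARGB every pixel is 4 bytes, so
// the point-sampling row functions can move whole 32-bit words instead of
// individual channels; the C reference versions further down reinterpret the
// byte pointers as uint32 for exactly that reason. A minimal scalar sketch of
// 1/2 ARGB point sampling in that style (the function name is ours):

#include <stdint.h>

static void HalveARGBRowPointSample(const uint8_t* src_argb,
                                    uint8_t* dst_argb, int dst_width) {
  const uint32_t* src = (const uint32_t*)src_argb;
  uint32_t* dst = (uint32_t*)dst_argb;
  for (int x = 0; x < dst_width; ++x) {
    dst[x] = src[2 * x + 1];  // keep one pixel of each pair, copied as a word
  }
}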
-__declspec(naked) __declspec(align(16)) -static void ScaleARGBRowDown2_SSE2(const uint8* src_argb, - ptrdiff_t /* src_stride */, - uint8* dst_argb, int dst_width) { - __asm { - mov eax, [esp + 4] // src_argb - // src_stride ignored - mov edx, [esp + 12] // dst_argb - mov ecx, [esp + 16] // dst_width - - align 16 - wloop: - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - lea eax, [eax + 32] - shufps xmm0, xmm1, 0xdd - sub ecx, 4 - movdqa [edx], xmm0 - lea edx, [edx + 16] - jg wloop - - ret - } -} - -// Blends 8x2 rectangle to 4x1. -// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. -__declspec(naked) __declspec(align(16)) -static void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb, - ptrdiff_t src_stride, - uint8* dst_argb, int dst_width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // src_argb - mov esi, [esp + 4 + 8] // src_stride - mov edx, [esp + 4 + 12] // dst_argb - mov ecx, [esp + 4 + 16] // dst_width - - align 16 - wloop: - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - movdqa xmm2, [eax + esi] - movdqa xmm3, [eax + esi + 16] - lea eax, [eax + 32] - pavgb xmm0, xmm2 // average rows - pavgb xmm1, xmm3 - movdqa xmm2, xmm0 // average columns (8 to 4 pixels) - shufps xmm0, xmm1, 0x88 // even pixels - shufps xmm2, xmm1, 0xdd // odd pixels - pavgb xmm0, xmm2 - sub ecx, 4 - movdqa [edx], xmm0 - lea edx, [edx + 16] - jg wloop - - pop esi - ret - } -} - -#define HAS_SCALEARGBROWDOWNEVEN_SSE2 -// Reads 4 pixels at a time. -// Alignment requirement: dst_argb 16 byte aligned. -__declspec(naked) __declspec(align(16)) -void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride, - int src_stepx, - uint8* dst_argb, int dst_width) { - __asm { - push ebx - push edi - mov eax, [esp + 8 + 4] // src_argb - // src_stride ignored - mov ebx, [esp + 8 + 12] // src_stepx - mov edx, [esp + 8 + 16] // dst_argb - mov ecx, [esp + 8 + 20] // dst_width - lea ebx, [ebx * 4] - lea edi, [ebx + ebx * 2] - - align 16 - wloop: - movd xmm0, [eax] - movd xmm1, [eax + ebx] - punpckldq xmm0, xmm1 - movd xmm2, [eax + ebx * 2] - movd xmm3, [eax + edi] - lea eax, [eax + ebx * 4] - punpckldq xmm2, xmm3 - punpcklqdq xmm0, xmm2 - sub ecx, 4 - movdqa [edx], xmm0 - lea edx, [edx + 16] - jg wloop - - pop edi - pop ebx - ret - } -} - -// Blends four 2x2 to 4x1. -// Alignment requirement: dst_argb 16 byte aligned. 
-__declspec(naked) __declspec(align(16)) -static void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb, - ptrdiff_t src_stride, - int src_stepx, - uint8* dst_argb, int dst_width) { - __asm { - push ebx - push esi - push edi - mov eax, [esp + 12 + 4] // src_argb - mov esi, [esp + 12 + 8] // src_stride - mov ebx, [esp + 12 + 12] // src_stepx - mov edx, [esp + 12 + 16] // dst_argb - mov ecx, [esp + 12 + 20] // dst_width - lea esi, [eax + esi] // row1 pointer - lea ebx, [ebx * 4] - lea edi, [ebx + ebx * 2] - - align 16 - wloop: - movq xmm0, qword ptr [eax] // row0 4 pairs - movhps xmm0, qword ptr [eax + ebx] - movq xmm1, qword ptr [eax + ebx * 2] - movhps xmm1, qword ptr [eax + edi] - lea eax, [eax + ebx * 4] - movq xmm2, qword ptr [esi] // row1 4 pairs - movhps xmm2, qword ptr [esi + ebx] - movq xmm3, qword ptr [esi + ebx * 2] - movhps xmm3, qword ptr [esi + edi] - lea esi, [esi + ebx * 4] - pavgb xmm0, xmm2 // average rows - pavgb xmm1, xmm3 - movdqa xmm2, xmm0 // average columns (8 to 4 pixels) - shufps xmm0, xmm1, 0x88 // even pixels - shufps xmm2, xmm1, 0xdd // odd pixels - pavgb xmm0, xmm2 - sub ecx, 4 - movdqa [edx], xmm0 - lea edx, [edx + 16] - jg wloop - - pop edi - pop esi - pop ebx - ret - } -} - -// Column scaling unfiltered. SSSE3 version. -// TODO(fbarchard): Port to Neon - -#define HAS_SCALEARGBCOLS_SSE2 -__declspec(naked) __declspec(align(16)) -static void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx) { - __asm { - push esi - push edi - mov edi, [esp + 8 + 4] // dst_argb - mov esi, [esp + 8 + 8] // src_argb - mov ecx, [esp + 8 + 12] // dst_width - movd xmm2, [esp + 8 + 16] // x - movd xmm3, [esp + 8 + 20] // dx - pextrw eax, xmm2, 1 // get x0 integer. preroll - sub ecx, 2 - jl xloop29 - - movdqa xmm0, xmm2 // x1 = x0 + dx - paddd xmm0, xmm3 - punpckldq xmm2, xmm0 // x0 x1 - punpckldq xmm3, xmm3 // dx dx - paddd xmm3, xmm3 // dx * 2, dx * 2 - pextrw edx, xmm2, 3 // get x1 integer. preroll - - // 2 Pixel loop. - align 16 - xloop2: - paddd xmm2, xmm3 // x += dx - movd xmm0, qword ptr [esi + eax * 4] // 1 source x0 pixels - movd xmm1, qword ptr [esi + edx * 4] // 1 source x1 pixels - punpckldq xmm0, xmm1 // x0 x1 - pextrw eax, xmm2, 1 // get x0 integer. next iteration. - pextrw edx, xmm2, 3 // get x1 integer. next iteration. - movq qword ptr [edi], xmm0 - lea edi, [edi + 8] - sub ecx, 2 // 2 pixels - jge xloop2 - xloop29: - - add ecx, 2 - 1 - jl xloop99 - - // 1 pixel remainder - movd xmm0, qword ptr [esi + eax * 4] // 1 source x0 pixels - movd [edi], xmm0 - xloop99: - - pop edi - pop esi - ret - } -} - -// Bilinear row filtering combines 2x1 -> 1x1. SSSE3 version. 
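// Editorial note (not part of the patch): ScaleARGBCols_SSE2 above (and the
// filtered SSSE3 variant that follows) processes two destination pixels per
// iteration by packing the fixed-point positions x0 and x0 + dx into one
// register, stepping both by 2 * dx, and pulling the integer parts out with
// pextrw. A scalar model of that pairing (the >> 16 recovers the integer
// source column):

#include <stdint.h>

static void NearestARGBColsPairwise(uint32_t* dst, const uint32_t* src,
                                    int dst_width, int x, int dx) {
  int x0 = x;
  int x1 = x + dx;
  int i = 0;
  for (; i < dst_width - 1; i += 2) {
    dst[i] = src[x0 >> 16];
    dst[i + 1] = src[x1 >> 16];
    x0 += 2 * dx;  // both lanes advance by two output pixels
    x1 += 2 * dx;
  }
  if (dst_width & 1) {
    dst[i] = src[x0 >> 16];
  }
}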
-// TODO(fbarchard): Port to Neon - -// Shuffle table for arranging 2 pixels into pairs for pmaddubsw -static const uvec8 kShuffleColARGB = { - 0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel - 8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel -}; - -// Shuffle table for duplicating 2 fractions into 8 bytes each -static const uvec8 kShuffleFractions = { - 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, -}; - -#define HAS_SCALEARGBFILTERCOLS_SSSE3 -__declspec(naked) __declspec(align(16)) -static void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx) { - __asm { - push esi - push edi - mov edi, [esp + 8 + 4] // dst_argb - mov esi, [esp + 8 + 8] // src_argb - mov ecx, [esp + 8 + 12] // dst_width - movd xmm2, [esp + 8 + 16] // x - movd xmm3, [esp + 8 + 20] // dx - movdqa xmm4, kShuffleColARGB - movdqa xmm5, kShuffleFractions - pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction. - psrlw xmm6, 9 - pextrw eax, xmm2, 1 // get x0 integer. preroll - sub ecx, 2 - jl xloop29 - - movdqa xmm0, xmm2 // x1 = x0 + dx - paddd xmm0, xmm3 - punpckldq xmm2, xmm0 // x0 x1 - punpckldq xmm3, xmm3 // dx dx - paddd xmm3, xmm3 // dx * 2, dx * 2 - pextrw edx, xmm2, 3 // get x1 integer. preroll - - // 2 Pixel loop. - align 16 - xloop2: - movdqa xmm1, xmm2 // x0, x1 fractions. - paddd xmm2, xmm3 // x += dx - movq xmm0, qword ptr [esi + eax * 4] // 2 source x0 pixels - psrlw xmm1, 9 // 7 bit fractions. - movhps xmm0, qword ptr [esi + edx * 4] // 2 source x1 pixels - pshufb xmm1, xmm5 // 0000000011111111 - pshufb xmm0, xmm4 // arrange pixels into pairs - pxor xmm1, xmm6 // 0..7f and 7f..0 - pmaddubsw xmm0, xmm1 // argb_argb 16 bit, 2 pixels. - psrlw xmm0, 7 // argb 8.7 fixed point to low 8 bits. - pextrw eax, xmm2, 1 // get x0 integer. next iteration. - pextrw edx, xmm2, 3 // get x1 integer. next iteration. - packuswb xmm0, xmm0 // argb_argb 8 bits, 2 pixels. - movq qword ptr [edi], xmm0 - lea edi, [edi + 8] - sub ecx, 2 // 2 pixels - jge xloop2 - xloop29: - - add ecx, 2 - 1 - jl xloop99 - - // 1 pixel remainder - psrlw xmm2, 9 // 7 bit fractions. - movq xmm0, qword ptr [esi + eax * 4] // 2 source x0 pixels - pshufb xmm2, xmm5 // 00000000 - pshufb xmm0, xmm4 // arrange pixels into pairs - pxor xmm2, xmm6 // 0..7f and 7f..0 - pmaddubsw xmm0, xmm2 // argb 16 bit, 1 pixel. - psrlw xmm0, 7 - packuswb xmm0, xmm0 // argb 8 bits, 1 pixel. - movd [edi], xmm0 - xloop99: - - pop edi - pop esi - ret - } -} - -#elif !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) -// GCC versions of row functions are verbatim conversions from Visual C. 
-// Generated using gcc disassembly on Visual C object file: -// objdump -D yuvscaler.obj >yuvscaler.txt -#define HAS_SCALEARGBROWDOWN2_SSE2 -static void ScaleARGBRowDown2_SSE2(const uint8* src_argb, - ptrdiff_t /* src_stride */, - uint8* dst_argb, int dst_width) { - asm volatile ( - ".p2align 4 \n" - "1: \n" - "movdqa (%0),%%xmm0 \n" - "movdqa 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "shufps $0xdd,%%xmm1,%%xmm0 \n" - "sub $0x4,%2 \n" - "movdqa %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(dst_width) // %2 - : - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1" -#endif - ); -} - -static void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb, - ptrdiff_t src_stride, - uint8* dst_argb, int dst_width) { - asm volatile ( - ".p2align 4 \n" - "1: \n" - "movdqa (%0),%%xmm0 \n" - "movdqa 0x10(%0),%%xmm1 \n" - "movdqa (%0,%3,1),%%xmm2 \n" - "movdqa 0x10(%0,%3,1),%%xmm3 \n" - "lea 0x20(%0),%0 \n" - "pavgb %%xmm2,%%xmm0 \n" - "pavgb %%xmm3,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm2 \n" - "pavgb %%xmm2,%%xmm0 \n" - "sub $0x4,%2 \n" - "movdqa %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(dst_width) // %2 - : "r"(static_cast<intptr_t>(src_stride)) // %3 - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3" -#endif - ); -} - -#define HAS_SCALEARGBROWDOWNEVEN_SSE2 -// Reads 4 pixels at a time. -// Alignment requirement: dst_argb 16 byte aligned. -void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride, - int src_stepx, - uint8* dst_argb, int dst_width) { - intptr_t src_stepx_x4 = static_cast<intptr_t>(src_stepx); - intptr_t src_stepx_x12 = 0; - asm volatile ( - "lea 0x0(,%1,4),%1 \n" - "lea (%1,%1,2),%4 \n" - ".p2align 4 \n" - "1: \n" - "movd (%0),%%xmm0 \n" - "movd (%0,%1,1),%%xmm1 \n" - "punpckldq %%xmm1,%%xmm0 \n" - "movd (%0,%1,2),%%xmm2 \n" - "movd (%0,%4,1),%%xmm3 \n" - "lea (%0,%1,4),%0 \n" - "punpckldq %%xmm3,%%xmm2 \n" - "punpcklqdq %%xmm2,%%xmm0 \n" - "sub $0x4,%3 \n" - "movdqa %%xmm0,(%2) \n" - "lea 0x10(%2),%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(src_stepx_x4), // %1 - "+r"(dst_argb), // %2 - "+r"(dst_width), // %3 - "+r"(src_stepx_x12) // %4 - : - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3" -#endif - ); -} - -// Blends four 2x2 to 4x1. -// Alignment requirement: dst_argb 16 byte aligned. 
-static void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb, - ptrdiff_t src_stride, int src_stepx, - uint8* dst_argb, int dst_width) { - intptr_t src_stepx_x4 = static_cast<intptr_t>(src_stepx); - intptr_t src_stepx_x12 = 0; - intptr_t row1 = static_cast<intptr_t>(src_stride); - asm volatile ( - "lea 0x0(,%1,4),%1 \n" - "lea (%1,%1,2),%4 \n" - "lea (%0,%5,1),%5 \n" - ".p2align 4 \n" - "1: \n" - "movq (%0),%%xmm0 \n" - "movhps (%0,%1,1),%%xmm0 \n" - "movq (%0,%1,2),%%xmm1 \n" - "movhps (%0,%4,1),%%xmm1 \n" - "lea (%0,%1,4),%0 \n" - "movq (%5),%%xmm2 \n" - "movhps (%5,%1,1),%%xmm2 \n" - "movq (%5,%1,2),%%xmm3 \n" - "movhps (%5,%4,1),%%xmm3 \n" - "lea (%5,%1,4),%5 \n" - "pavgb %%xmm2,%%xmm0 \n" - "pavgb %%xmm3,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm2 \n" - "pavgb %%xmm2,%%xmm0 \n" - "sub $0x4,%3 \n" - "movdqa %%xmm0,(%2) \n" - "lea 0x10(%2),%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(src_stepx_x4), // %1 - "+r"(dst_argb), // %2 - "+rm"(dst_width), // %3 - "+r"(src_stepx_x12), // %4 - "+r"(row1) // %5 - : - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3" -#endif - ); -} - -#define HAS_SCALEARGBCOLS_SSE2 -static void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx) { - intptr_t x0 = 0, x1 = 0; - asm volatile ( - "movd %5,%%xmm2 \n" - "movd %6,%%xmm3 \n" - "pextrw $0x1,%%xmm2,%k3 \n" - "sub $0x2,%2 \n" - "jl 29f \n" - "movdqa %%xmm2,%%xmm0 \n" - "paddd %%xmm3,%%xmm0 \n" - "punpckldq %%xmm0,%%xmm2 \n" - "punpckldq %%xmm3,%%xmm3 \n" - "paddd %%xmm3,%%xmm3 \n" - "pextrw $0x3,%%xmm2,%k4 \n" - - ".p2align 4 \n" - "2: \n" - "paddd %%xmm3,%%xmm2 \n" - "movd (%1,%3,4),%%xmm0 \n" - "movd (%1,%4,4),%%xmm1 \n" - "punpckldq %%xmm1,%%xmm0 \n" - "pextrw $0x1,%%xmm2,%k3 \n" - "pextrw $0x3,%%xmm2,%k4 \n" - "movq %%xmm0,(%0) \n" - "lea 0x8(%0),%0 \n" - "sub $0x2,%2 \n" - "jge 2b \n" - - "29: \n" - "add $0x1,%2 \n" - "jl 99f \n" - "movd (%1,%3,4),%%xmm0 \n" - "movd %%xmm0,(%0) \n" - "99: \n" - : "+r"(dst_argb), // %0 - "+r"(src_argb), // %1 - "+rm"(dst_width), // %2 - "+r"(x0), // %3 - "+r"(x1) // %4 - : "rm"(x), // %5 - "rm"(dx) // %6 - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3" -#endif - ); -} - -#ifdef __APPLE__ -#define CONST -#else -#define CONST static const -#endif - -// Shuffle table for arranging 2 pixels into pairs for pmaddubsw -CONST uvec8 kShuffleColARGB = { - 0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel - 8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel -}; - -// Shuffle table for duplicating 2 fractions into 8 bytes each -CONST uvec8 kShuffleFractions = { - 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, -}; - -// Bilinear row filtering combines 4x2 -> 4x1. 
SSSE3 version -#define HAS_SCALEARGBFILTERCOLS_SSSE3 -static void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx) { - intptr_t x0 = 0, x1 = 0; - asm volatile ( - "movdqa %0,%%xmm4 \n" - "movdqa %1,%%xmm5 \n" - : - : "m"(kShuffleColARGB), // %0 - "m"(kShuffleFractions) // %1 - ); - - asm volatile ( - "movd %5,%%xmm2 \n" - "movd %6,%%xmm3 \n" - "pcmpeqb %%xmm6,%%xmm6 \n" - "psrlw $0x9,%%xmm6 \n" - "pextrw $0x1,%%xmm2,%k3 \n" - "sub $0x2,%2 \n" - "jl 29f \n" - "movdqa %%xmm2,%%xmm0 \n" - "paddd %%xmm3,%%xmm0 \n" - "punpckldq %%xmm0,%%xmm2 \n" - "punpckldq %%xmm3,%%xmm3 \n" - "paddd %%xmm3,%%xmm3 \n" - "pextrw $0x3,%%xmm2,%k4 \n" - - ".p2align 4 \n" - "2: \n" - "movdqa %%xmm2,%%xmm1 \n" - "paddd %%xmm3,%%xmm2 \n" - "movq (%1,%3,4),%%xmm0 \n" - "psrlw $0x9,%%xmm1 \n" - "movhps (%1,%4,4),%%xmm0 \n" - "pshufb %%xmm5,%%xmm1 \n" - "pshufb %%xmm4,%%xmm0 \n" - "pxor %%xmm6,%%xmm1 \n" - "pmaddubsw %%xmm1,%%xmm0 \n" - "psrlw $0x7,%%xmm0 \n" - "pextrw $0x1,%%xmm2,%k3 \n" - "pextrw $0x3,%%xmm2,%k4 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movq %%xmm0,(%0) \n" - "lea 0x8(%0),%0 \n" - "sub $0x2,%2 \n" - "jge 2b \n" - - "29: \n" - "add $0x1,%2 \n" - "jl 99f \n" - "psrlw $0x9,%%xmm2 \n" - "movq (%1,%3,4),%%xmm0 \n" - "pshufb %%xmm5,%%xmm2 \n" - "pshufb %%xmm4,%%xmm0 \n" - "pxor %%xmm6,%%xmm2 \n" - "pmaddubsw %%xmm2,%%xmm0 \n" - "psrlw $0x7,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movd %%xmm0,(%0) \n" - "99: \n" - : "+r"(dst_argb), // %0 - "+r"(src_argb), // %1 - "+rm"(dst_width), // %2 - "+r"(x0), // %3 - "+r"(x1) // %4 - : "rm"(x), // %5 - "rm"(dx) // %6 - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" -#endif - ); -} -#endif // defined(__x86_64__) || defined(__i386__) - -static void ScaleARGBRowDown2_C(const uint8* src_argb, - ptrdiff_t /* src_stride */, - uint8* dst_argb, int dst_width) { - const uint32* src = reinterpret_cast<const uint32*>(src_argb); - uint32* dst = reinterpret_cast<uint32*>(dst_argb); - - for (int x = 0; x < dst_width - 1; x += 2) { - dst[0] = src[1]; - dst[1] = src[3]; - src += 4; - dst += 2; - } - if (dst_width & 1) { - dst[0] = src[1]; - } -} - -static void ScaleARGBRowDown2Box_C(const uint8* src_argb, ptrdiff_t src_stride, - uint8* dst_argb, int dst_width) { - for (int x = 0; x < dst_width; ++x) { - dst_argb[0] = (src_argb[0] + src_argb[4] + - src_argb[src_stride] + src_argb[src_stride + 4] + 2) >> 2; - dst_argb[1] = (src_argb[1] + src_argb[5] + - src_argb[src_stride + 1] + src_argb[src_stride + 5] + 2) >> 2; - dst_argb[2] = (src_argb[2] + src_argb[6] + - src_argb[src_stride + 2] + src_argb[src_stride + 6] + 2) >> 2; - dst_argb[3] = (src_argb[3] + src_argb[7] + - src_argb[src_stride + 3] + src_argb[src_stride + 7] + 2) >> 2; - src_argb += 8; - dst_argb += 4; - } -} - -void ScaleARGBRowDownEven_C(const uint8* src_argb, ptrdiff_t /* src_stride */, - int src_stepx, - uint8* dst_argb, int dst_width) { - const uint32* src = reinterpret_cast<const uint32*>(src_argb); - uint32* dst = reinterpret_cast<uint32*>(dst_argb); - - for (int x = 0; x < dst_width - 1; x += 2) { - dst[0] = src[0]; - dst[1] = src[src_stepx]; - src += src_stepx * 2; - dst += 2; - } - if (dst_width & 1) { - dst[0] = src[0]; - } -} - -static void ScaleARGBRowDownEvenBox_C(const uint8* src_argb, - ptrdiff_t src_stride, - int src_stepx, - uint8* dst_argb, int dst_width) { - for (int x = 0; x < dst_width; ++x) { - dst_argb[0] = (src_argb[0] + src_argb[4] + - src_argb[src_stride] + src_argb[src_stride + 4] + 2) >> 2; - 
dst_argb[1] = (src_argb[1] + src_argb[5] + - src_argb[src_stride + 1] + src_argb[src_stride + 5] + 2) >> 2; - dst_argb[2] = (src_argb[2] + src_argb[6] + - src_argb[src_stride + 2] + src_argb[src_stride + 6] + 2) >> 2; - dst_argb[3] = (src_argb[3] + src_argb[7] + - src_argb[src_stride + 3] + src_argb[src_stride + 7] + 2) >> 2; - src_argb += src_stepx * 4; - dst_argb += 4; - } -} - -// Mimics SSSE3 blender -#define BLENDER1(a, b, f) ((a) * (0x7f ^ f) + (b) * f) >> 7 -#define BLENDERC(a, b, f, s) static_cast<uint32>( \ - BLENDER1(((a) >> s) & 255, ((b) >> s) & 255, f) << s) -#define BLENDER(a, b, f) \ - BLENDERC(a, b, f, 24) | BLENDERC(a, b, f, 16) | \ - BLENDERC(a, b, f, 8) | BLENDERC(a, b, f, 0) - -static void ScaleARGBFilterCols_C(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx) { - const uint32* src = reinterpret_cast<const uint32*>(src_argb); - uint32* dst = reinterpret_cast<uint32*>(dst_argb); - for (int j = 0; j < dst_width - 1; j += 2) { - int xi = x >> 16; - int xf = (x >> 9) & 0x7f; - uint32 a = src[xi]; - uint32 b = src[xi + 1]; - dst[0] = BLENDER(a, b, xf); - x += dx; - xi = x >> 16; - xf = (x >> 9) & 0x7f; - a = src[xi]; - b = src[xi + 1]; - dst[1] = BLENDER(a, b, xf); - x += dx; - dst += 2; - } - if (dst_width & 1) { - int xi = x >> 16; - int xf = (x >> 9) & 0x7f; - uint32 a = src[xi]; - uint32 b = src[xi + 1]; - dst[0] = BLENDER(a, b, xf); - } -} - // ScaleARGB ARGB, 1/2 // This is an optimized version for scaling down a ARGB to 1/2 of // its original size. - static void ScaleARGBDown2(int /* src_width */, int /* src_height */, int dst_width, int dst_height, int src_stride, int dst_stride, @@ -743,8 +38,8 @@ static void ScaleARGBDown2(int /* src_width */, int /* src_height */, FilterMode filtering) { assert(dx == 65536 * 2); // Test scale factor of 2. assert((dy & 0x1ffff) == 0); // Test vertical scale is multiple of 2. - // Advance to odd row / even column. - if (filtering) { + // Advance to odd row, even column. + if (filtering == kFilterBilinear) { src_argb += (y >> 16) * src_stride + (x >> 16) * 4; } else { src_argb += (y >> 16) * src_stride + ((x >> 16) - 1) * 4; @@ -752,13 +47,16 @@ static void ScaleARGBDown2(int /* src_width */, int /* src_height */, int row_stride = src_stride * (dy >> 16); void (*ScaleARGBRowDown2)(const uint8* src_argb, ptrdiff_t src_stride, uint8* dst_argb, int dst_width) = - filtering ? ScaleARGBRowDown2Box_C : ScaleARGBRowDown2_C; + filtering == kFilterNone ? ScaleARGBRowDown2_C : + (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_C : + ScaleARGBRowDown2Box_C); #if defined(HAS_SCALEARGBROWDOWN2_SSE2) if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4) && IS_ALIGNED(src_argb, 16) && IS_ALIGNED(row_stride, 16) && IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) { - ScaleARGBRowDown2 = filtering ? ScaleARGBRowDown2Box_SSE2 : - ScaleARGBRowDown2_SSE2; + ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_SSE2 : + (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_SSE2 : + ScaleARGBRowDown2Box_SSE2); } #elif defined(HAS_SCALEARGBROWDOWN2_NEON) if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8) && @@ -768,7 +66,9 @@ static void ScaleARGBDown2(int /* src_width */, int /* src_height */, } #endif - // TODO(fbarchard): Loop through source height to allow odd height. 
+ if (filtering == kFilterLinear) { + src_stride = 0; + } for (int y = 0; y < dst_height; ++y) { ScaleARGBRowDown2(src_argb, src_stride, dst_argb, dst_width); src_argb += row_stride; @@ -776,6 +76,49 @@ static void ScaleARGBDown2(int /* src_width */, int /* src_height */, } } +// ScaleARGB ARGB, 1/4 +// This is an optimized version for scaling down a ARGB to 1/4 of +// its original size. +static void ScaleARGBDown4Box(int /* src_width */, int /* src_height */, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_argb, uint8* dst_argb, + int x, int dx, int y, int dy) { + assert(dx == 65536 * 4); // Test scale factor of 4. + assert((dy & 0x3ffff) == 0); // Test vertical scale is multiple of 4. + + assert(dst_width * 2 <= kMaxStride); + // TODO(fbarchard): Remove clip_src_width alignment checks. + SIMD_ALIGNED(uint8 row[kMaxStride * 2 + 16]); + + // Advance to odd row, even column. + src_argb += (y >> 16) * src_stride + (x >> 16) * 4; + int row_stride = src_stride * (dy >> 16); + void (*ScaleARGBRowDown2)(const uint8* src_argb, ptrdiff_t src_stride, + uint8* dst_argb, int dst_width) = ScaleARGBRowDown2Box_C; +#if defined(HAS_SCALEARGBROWDOWN2_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4) && + IS_ALIGNED(src_argb, 16) && IS_ALIGNED(row_stride, 16) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) { + ScaleARGBRowDown2 = ScaleARGBRowDown2Box_SSE2; + } +#elif defined(HAS_SCALEARGBROWDOWN2_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8) && + IS_ALIGNED(src_argb, 4) && IS_ALIGNED(row_stride, 4)) { + ScaleARGBRowDown2 = ScaleARGBRowDown2Box_NEON; + } +#endif + + for (int y = 0; y < dst_height; ++y) { + ScaleARGBRowDown2(src_argb, src_stride, row, dst_width * 2); + ScaleARGBRowDown2(src_argb + src_stride * 2, src_stride, + row + kMaxStride, dst_width * 2); + ScaleARGBRowDown2(row, kMaxStride, dst_argb, dst_width); + src_argb += row_stride; + dst_argb += dst_stride; + } +} + // ScaleARGB ARGB Even // This is an optimized version for scaling down a ARGB to even // multiple of its original size. @@ -807,6 +150,9 @@ static void ScaleARGBDownEven(int src_width, int src_height, } #endif + if (filtering == kFilterLinear) { + src_stride = 0; + } for (int y = 0; y < dst_height; ++y) { ScaleARGBRowDownEven(src_argb, src_stride, col_step, dst_argb, dst_width); src_argb += row_stride; @@ -815,11 +161,13 @@ static void ScaleARGBDownEven(int src_width, int src_height, } // Scale ARGB down with bilinear interpolation. +SAFEBUFFERS static void ScaleARGBBilinearDown(int src_height, int dst_width, int dst_height, int src_stride, int dst_stride, const uint8* src_argb, uint8* dst_argb, - int x, int dx, int y, int dy) { + int x, int dx, int y, int dy, + FilterMode filtering) { assert(src_height > 0); assert(dst_width > 0); assert(dst_height > 0); @@ -859,6 +207,14 @@ static void ScaleARGBBilinearDown(int src_height, } } #endif +#if defined(HAS_INTERPOLATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && clip_src_width >= 32) { + InterpolateRow = InterpolateRow_Any_AVX2; + if (IS_ALIGNED(clip_src_width, 32)) { + InterpolateRow = InterpolateRow_AVX2; + } + } +#endif #if defined(HAS_INTERPOLATEROW_NEON) if (TestCpuFlag(kCpuHasNEON) && clip_src_width >= 16) { InterpolateRow = InterpolateRow_Any_NEON; @@ -883,27 +239,33 @@ static void ScaleARGBBilinearDown(int src_height, ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3; } #endif - int maxy = (src_height > 1) ? 
((src_height - 1) << 16) - 1 : 0; + const int max_y = (src_height - 1) << 16; for (int j = 0; j < dst_height; ++j) { - if (y > maxy) { - y = maxy; + if (y > max_y) { + y = max_y; } int yi = y >> 16; - int yf = (y >> 8) & 255; const uint8* src = src_argb + yi * src_stride; - InterpolateRow(row, src, src_stride, clip_src_width, yf); - ScaleARGBFilterCols(dst_argb, row, dst_width, x, dx); + if (filtering == kFilterLinear) { + ScaleARGBFilterCols(dst_argb, src, dst_width, x, dx); + } else { + int yf = (y >> 8) & 255; + InterpolateRow(row, src, src_stride, clip_src_width, yf); + ScaleARGBFilterCols(dst_argb, row, dst_width, x, dx); + } dst_argb += dst_stride; y += dy; } } // Scale ARGB up with bilinear interpolation. +SAFEBUFFERS static void ScaleARGBBilinearUp(int src_width, int src_height, int dst_width, int dst_height, int src_stride, int dst_stride, const uint8* src_argb, uint8* dst_argb, - int x, int dx, int y, int dy) { + int x, int dx, int y, int dy, + FilterMode filtering) { assert(src_width > 0); assert(src_height > 0); assert(dst_width > 0); @@ -934,6 +296,14 @@ static void ScaleARGBBilinearUp(int src_width, int src_height, } } #endif +#if defined(HAS_INTERPOLATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && dst_width >= 8) { + InterpolateRow = InterpolateRow_Any_AVX2; + if (IS_ALIGNED(dst_width, 8)) { + InterpolateRow = InterpolateRow_AVX2; + } + } +#endif #if defined(HAS_INTERPOLATEROW_NEON) if (TestCpuFlag(kCpuHasNEON) && dst_width >= 4) { InterpolateRow = InterpolateRow_Any_NEON; @@ -949,15 +319,32 @@ static void ScaleARGBBilinearUp(int src_width, int src_height, } #endif void (*ScaleARGBFilterCols)(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx) = ScaleARGBFilterCols_C; + int dst_width, int x, int dx) = + filtering ? ScaleARGBFilterCols_C : ScaleARGBCols_C; #if defined(HAS_SCALEARGBFILTERCOLS_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { + if (filtering && TestCpuFlag(kCpuHasSSSE3)) { ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3; } #endif - int maxy = (src_height > 1) ? 
((src_height - 1) << 16) - 1 : 0; - if (y > maxy) { - y = maxy; +#if defined(HAS_SCALEARGBCOLS_SSE2) + if (!filtering && TestCpuFlag(kCpuHasSSE2)) { + ScaleARGBFilterCols = ScaleARGBCols_SSE2; + } +#endif + if (!filtering && src_width * 2 == dst_width && x < 0x8000) { + ScaleARGBFilterCols = ScaleARGBColsUp2_C; +#if defined(HAS_SCALEARGBCOLSUP2_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) && + IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) { + ScaleARGBFilterCols = ScaleARGBColsUp2_SSE2; + } +#endif + } + + const int max_y = (src_height - 1) << 16; + if (y > max_y) { + y = max_y; } int yi = y >> 16; const uint8* src = src_argb + yi * src_stride; @@ -976,7 +363,12 @@ static void ScaleARGBBilinearUp(int src_width, int src_height, for (int j = 0; j < dst_height; ++j) { yi = y >> 16; if (yi != lasty) { - if (y <= maxy) { + if (y > max_y) { + y = max_y; + yi = y >> 16; + src = src_argb + yi * src_stride; + } + if (yi != lasty) { ScaleARGBFilterCols(rowptr, src, dst_width, x, dx); rowptr += rowstride; rowstride = -rowstride; @@ -984,33 +376,205 @@ static void ScaleARGBBilinearUp(int src_width, int src_height, src += src_stride; } } - int yf = (y >> 8) & 255; - InterpolateRow(dst_argb, rowptr, rowstride, dst_width * 4, yf); + if (filtering == kFilterLinear) { + InterpolateRow(dst_argb, rowptr, 0, dst_width * 4, 0); + } else { + int yf = (y >> 8) & 255; + InterpolateRow(dst_argb, rowptr, rowstride, dst_width * 4, yf); + } dst_argb += dst_stride; y += dy; } } -// Scales a single row of pixels using point sampling. -// Code is adapted from libyuv bilinear yuv scaling, but with bilinear -// interpolation off, and argb pixels instead of yuv. -static void ScaleARGBCols_C(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx) { - const uint32* src = reinterpret_cast<const uint32*>(src_argb); - uint32* dst = reinterpret_cast<uint32*>(dst_argb); - for (int j = 0; j < dst_width - 1; j += 2) { - dst[0] = src[x >> 16]; - x += dx; - dst[1] = src[x >> 16]; - x += dx; - dst += 2; +#ifdef YUVSCALEUP +// Scale YUV to ARGB up with bilinear interpolation. 
+SAFEBUFFERS +static void ScaleYUVToARGBBilinearUp(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride_y, + int src_stride_u, + int src_stride_v, + int dst_stride_argb, + const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int x, int dx, int y, int dy, + FilterMode filtering) { + assert(src_width > 0); + assert(src_height > 0); + assert(dst_width > 0); + assert(dst_height > 0); + assert(dst_width * 4 <= kMaxStride); + + void (*I422ToARGBRow)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) = I422ToARGBRow_C; +#if defined(HAS_I422TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && src_width >= 8) { + I422ToARGBRow = I422ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(src_width, 8)) { + I422ToARGBRow = I422ToARGBRow_Unaligned_SSSE3; + if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + I422ToARGBRow = I422ToARGBRow_SSSE3; + } + } + } +#endif +#if defined(HAS_I422TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && src_width >= 16) { + I422ToARGBRow = I422ToARGBRow_Any_AVX2; + if (IS_ALIGNED(src_width, 16)) { + I422ToARGBRow = I422ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_I422TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && src_width >= 8) { + I422ToARGBRow = I422ToARGBRow_Any_NEON; + if (IS_ALIGNED(src_width, 8)) { + I422ToARGBRow = I422ToARGBRow_NEON; + } + } +#endif +#if defined(HAS_I422TOARGBROW_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(src_width, 4) && + IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) && + IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) && + IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) && + IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) { + I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2; + } +#endif + + void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb, + ptrdiff_t src_stride, int dst_width, int source_y_fraction) = + InterpolateRow_C; +#if defined(HAS_INTERPOLATEROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && dst_width >= 4) { + InterpolateRow = InterpolateRow_Any_SSE2; + if (IS_ALIGNED(dst_width, 4)) { + InterpolateRow = InterpolateRow_Unaligned_SSE2; + if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + InterpolateRow = InterpolateRow_SSE2; + } + } + } +#endif +#if defined(HAS_INTERPOLATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && dst_width >= 4) { + InterpolateRow = InterpolateRow_Any_SSSE3; + if (IS_ALIGNED(dst_width, 4)) { + InterpolateRow = InterpolateRow_Unaligned_SSSE3; + if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + InterpolateRow = InterpolateRow_SSSE3; + } + } + } +#endif +#if defined(HAS_INTERPOLATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && dst_width >= 8) { + InterpolateRow = InterpolateRow_Any_AVX2; + if (IS_ALIGNED(dst_width, 8)) { + InterpolateRow = InterpolateRow_AVX2; + } } - if (dst_width & 1) { - dst[0] = src[x >> 16]; +#endif +#if defined(HAS_INTERPOLATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && dst_width >= 4) { + InterpolateRow = InterpolateRow_Any_NEON; + if (IS_ALIGNED(dst_width, 4)) { + InterpolateRow = InterpolateRow_NEON; + } + } +#endif +#if defined(HAS_INTERPOLATEROWS_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && dst_width >= 1 && + IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) { + InterpolateRow = InterpolateRow_MIPS_DSPR2; + } +#endif + void (*ScaleARGBFilterCols)(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx) = ScaleARGBFilterCols_C; +#if defined(HAS_SCALEARGBFILTERCOLS_SSSE3) + if 
(TestCpuFlag(kCpuHasSSSE3)) { + ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3; + } +#endif + const int max_y = (src_height - 1) << 16; + if (y > max_y) { + y = max_y; + } + const int kYShift = 1; // Shift Y by 1 to convert Y plane to UV coordinate. + int yi = y >> 16; + int uv_yi = yi >> kYShift; + const uint8* src_row_y = src_y + yi * src_stride_y; + const uint8* src_row_u = src_u + uv_yi * src_stride_u; + const uint8* src_row_v = src_v + uv_yi * src_stride_v; + SIMD_ALIGNED(uint8 row[2 * kMaxStride]); + SIMD_ALIGNED(uint8 argb_row[kMaxStride * 4]); + uint8* rowptr = row; + int rowstride = kMaxStride; + int lasty = yi; + + ScaleARGBFilterCols(rowptr, src_row_y, dst_width, x, dx); + if (src_height > 1) { + src_row_y += src_stride_y; + if (yi & 1) { + src_row_u += src_stride_u; + src_row_v += src_stride_v; + } + } + ScaleARGBFilterCols(rowptr + rowstride, src_row_y, dst_width, x, dx); + if (src_height > 2) { + src_row_y += src_stride_y; + if (!(yi & 1)) { + src_row_u += src_stride_u; + src_row_v += src_stride_v; + } + } + + for (int j = 0; j < dst_height; ++j) { + yi = y >> 16; + if (yi != lasty) { + if (y > max_y) { + y = max_y; + yi = y >> 16; + uv_yi = yi >> kYShift; + src_row_y = src_y + yi * src_stride_y; + src_row_u = src_u + uv_yi * src_stride_u; + src_row_v = src_v + uv_yi * src_stride_v; + } + if (yi != lasty) { + // TODO(fbarchard): Convert the clipped region of row. + I422ToARGBRow(src_row_y, src_row_u, src_row_v, argb_row, src_width); + ScaleARGBFilterCols(rowptr, argb_row, dst_width, x, dx); + rowptr += rowstride; + rowstride = -rowstride; + lasty = yi; + src_row_y += src_stride_y; + if (yi & 1) { + src_row_u += src_stride_u; + src_row_v += src_stride_v; + } + } + } + if (filtering == kFilterLinear) { + InterpolateRow(dst_argb, rowptr, 0, dst_width * 4, 0); + } else { + int yf = (y >> 8) & 255; + InterpolateRow(dst_argb, rowptr, rowstride, dst_width * 4, yf); + } + dst_argb += dst_stride_argb; + y += dy; } } +#endif -// ScaleARGB ARGB to/from any dimensions, without interpolation. +// Scale ARGB to/from any dimensions, without interpolation. // Fixed point math is used for performance: The upper 16 bits // of x and dx is the integer part of the source position and // the lower 16 bits are the fixed decimal part. @@ -1027,6 +591,16 @@ static void ScaleARGBSimple(int src_width, int src_height, ScaleARGBCols = ScaleARGBCols_SSE2; } #endif + if (src_width * 2 == dst_width && x < 0x8000) { + ScaleARGBCols = ScaleARGBColsUp2_C; +#if defined(HAS_SCALEARGBCOLSUP2_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) && + IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) { + ScaleARGBCols = ScaleARGBColsUp2_SSE2; + } +#endif + } for (int i = 0; i < dst_height; ++i) { ScaleARGBCols(dst_argb, src_argb + (y >> 16) * src_stride, @@ -1036,33 +610,6 @@ static void ScaleARGBSimple(int src_width, int src_height, } } -// ScaleARGB ARGB to/from any dimensions. 
-static void ScaleARGBAnySize(int src_width, int src_height, - int dst_width, int dst_height, - int clip_width, int clip_height, - int src_stride, int dst_stride, - const uint8* src_argb, uint8* dst_argb, - int x, int dx, int y, int dy, - FilterMode filtering) { - if (filtering && dy < 65536 && dst_width * 4 <= kMaxStride) { - ScaleARGBBilinearUp(src_width, src_height, - clip_width, clip_height, - src_stride, dst_stride, src_argb, dst_argb, - x, dx, y, dy); - return; - } - if (filtering && src_width * 4 < kMaxStride) { - ScaleARGBBilinearDown(src_height, - clip_width, clip_height, - src_stride, dst_stride, src_argb, dst_argb, - x, dx, y, dy); - return; - } - ScaleARGBSimple(src_width, src_height, clip_width, clip_height, - src_stride, dst_stride, src_argb, dst_argb, - x, dx, y, dy); -} - // ScaleARGB a ARGB. // This function in turn calls a scaling function // suitable for handling the desired resolutions. @@ -1072,6 +619,12 @@ static void ScaleARGB(const uint8* src, int src_stride, int dst_width, int dst_height, int clip_x, int clip_y, int clip_width, int clip_height, FilterMode filtering) { + // ARGB does not support box filter yet, but allow the user to pass it. + // Simplify filtering when possible. + filtering = ScaleFilterReduce(src_width, src_height, + dst_width, dst_height, + filtering); + // Negative src_height means invert the image. if (src_height < 0) { src_height = -src_height; @@ -1079,37 +632,12 @@ static void ScaleARGB(const uint8* src, int src_stride, src_stride = -src_stride; } // Initial source x/y coordinate and step values as 16.16 fixed point. - int dx = 0; - int dy = 0; int x = 0; int y = 0; - if (filtering) { - // Scale step for bilinear sampling renders last pixel once for upsample. - if (dst_width <= Abs(src_width)) { - dx = (Abs(src_width) << 16) / dst_width; - x = (dx >> 1) - 32768; - } else if (dst_width > 1) { - dx = ((Abs(src_width) - 1) << 16) / (dst_width - 1); - } - if (dst_height <= src_height) { - dy = (src_height << 16) / dst_height; - y = (dy >> 1) - 32768; - } else if (dst_height > 1) { - dy = ((src_height - 1) << 16) / (dst_height - 1); - } - } else { - // Scale step for point sampling duplicates all pixels equally. - dx = (Abs(src_width) << 16) / dst_width; - dy = (src_height << 16) / dst_height; - x = dx >> 1; - y = dy >> 1; - } - // Negative src_width means horizontally mirror. - if (src_width < 0) { - x += (dst_width - 1) * dx; - dx = -dx; - src_width = -src_width; - } + int dx = 0; + int dy = 0; + ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, + &x, &y, &dx, &dy); if (clip_x) { x += clip_x * dx; dst += clip_x * 4; @@ -1121,19 +649,29 @@ static void ScaleARGB(const uint8* src, int src_stride, // Special case for integer step values. if (((dx | dy) & 0xffff) == 0) { - if (!dx || !dy) { + if (!dx || !dy) { // 1 pixel wide and/or tall. filtering = kFilterNone; } else { // Optimized even scale down. ie 2, 4, 6, 8, 10x. if (!(dx & 0x10000) && !(dy & 0x10000)) { - if ((dx >> 16) == 2) { - // Optimized 1/2 horizontal. - ScaleARGBDown2(src_width, src_height, clip_width, clip_height, + if (dx == 0x20000) { + // Optimized 1/2 downsample. + ScaleARGBDown2(src_width, src_height, + clip_width, clip_height, src_stride, dst_stride, src, dst, x, dx, y, dy, filtering); return; } - ScaleARGBDownEven(src_width, src_height, clip_width, clip_height, + if (dx == 0x40000 && filtering == kFilterBox) { + // Optimized 1/4 box downsample. 
+ ScaleARGBDown4Box(src_width, src_height, + clip_width, clip_height, + src_stride, dst_stride, src, dst, + x, dx, y, dy); + return; + } + ScaleARGBDownEven(src_width, src_height, + clip_width, clip_height, src_stride, dst_stride, src, dst, x, dx, y, dy, filtering); return; @@ -1141,7 +679,7 @@ static void ScaleARGB(const uint8* src, int src_stride, // Optimized odd scale down. ie 3, 5, 7, 9x. if ((dx & 0x10000) && (dy & 0x10000)) { filtering = kFilterNone; - if (dst_width == src_width && dst_height == src_height) { + if (dx == 0x10000 && dy == 0x10000) { // Straight copy. ARGBCopy(src + (y >> 16) * src_stride + (x >> 16) * 4, src_stride, dst, dst_stride, clip_width, clip_height); @@ -1150,11 +688,31 @@ static void ScaleARGB(const uint8* src, int src_stride, } } } - // Arbitrary scale up and/or down. - ScaleARGBAnySize(src_width, src_height, - dst_width, dst_height, - clip_width, clip_height, - src_stride, dst_stride, src, dst, x, dx, y, dy, filtering); + if (dx == 0x10000 && (x & 0xffff) == 0) { + // Arbitrary scale vertically, but unscaled vertically. + ScalePlaneVertical(src_height, + clip_width, clip_height, + src_stride, dst_stride, src, dst, + x, y, dy, 4, filtering); + return; + } + if (filtering && dy < 65536 && dst_width * 4 <= kMaxStride) { + ScaleARGBBilinearUp(src_width, src_height, + clip_width, clip_height, + src_stride, dst_stride, src, dst, + x, dx, y, dy, filtering); + return; + } + if (filtering && src_width * 4 < kMaxStride) { + ScaleARGBBilinearDown(src_height, + clip_width, clip_height, + src_stride, dst_stride, src, dst, + x, dx, y, dy, filtering); + return; + } + ScaleARGBSimple(src_width, src_height, clip_width, clip_height, + src_stride, dst_stride, src, dst, + x, dx, y, dy); } LIBYUV_API @@ -1167,7 +725,6 @@ int ARGBScaleClip(const uint8* src_argb, int src_stride_argb, if (!src_argb || src_width == 0 || src_height == 0 || !dst_argb || dst_width <= 0 || dst_height <= 0 || clip_x < 0 || clip_y < 0 || - src_width > 32767 || src_height > 32767 || (clip_x + clip_width) > dst_width || (clip_y + clip_height) > dst_height) { return -1; @@ -1186,8 +743,7 @@ int ARGBScale(const uint8* src_argb, int src_stride_argb, int dst_width, int dst_height, FilterMode filtering) { if (!src_argb || src_width == 0 || src_height == 0 || - !dst_argb || dst_width <= 0 || dst_height <= 0 || - src_width > 32767 || src_height > 32767) { + !dst_argb || dst_width <= 0 || dst_height <= 0) { return -1; } ScaleARGB(src_argb, src_stride_argb, src_width, src_height, diff --git a/chromium/third_party/libyuv/source/scale_argb_neon.cc b/chromium/third_party/libyuv/source/scale_argb_neon.cc deleted file mode 100644 index 51b00872441..00000000000 --- a/chromium/third_party/libyuv/source/scale_argb_neon.cc +++ /dev/null @@ -1,142 +0,0 @@ -/* - * Copyright 2012 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#include "libyuv/basic_types.h" -#include "libyuv/row.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// This module is for GCC Neon -#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) - -void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */, - uint8* dst, int dst_width) { - asm volatile ( - "1: \n" - // load even pixels into q0, odd into q1 - "vld2.32 {q0, q1}, [%0]! \n" - "vld2.32 {q2, q3}, [%0]! \n" - "subs %2, %2, #8 \n" // 8 processed per loop - "vst1.8 {q1}, [%1]! \n" // store odd pixels - "vst1.8 {q3}, [%1]! \n" - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst), // %1 - "+r"(dst_width) // %2 - : - : "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List - ); -} - -void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { - asm volatile ( - // change the stride to row 2 pointer - "add %1, %1, %0 \n" - "1: \n" - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. - "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. - "subs %3, %3, #8 \n" // 8 processed per loop. - "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. - "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. - "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. - "vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts. - "vld4.8 {d16, d18, d20, d22}, [%1]! \n" // load 8 more ARGB pixels. - "vld4.8 {d17, d19, d21, d23}, [%1]! \n" // load last 8 ARGB pixels. - "vpadal.u8 q0, q8 \n" // B 16 bytes -> 8 shorts. - "vpadal.u8 q1, q9 \n" // G 16 bytes -> 8 shorts. - "vpadal.u8 q2, q10 \n" // R 16 bytes -> 8 shorts. - "vpadal.u8 q3, q11 \n" // A 16 bytes -> 8 shorts. - "vrshrn.u16 d0, q0, #2 \n" // downshift, round and pack - "vrshrn.u16 d1, q1, #2 \n" - "vrshrn.u16 d2, q2, #2 \n" - "vrshrn.u16 d3, q3, #2 \n" - "vst4.8 {d0, d1, d2, d3}, [%2]! \n" - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(src_stride), // %1 - "+r"(dst), // %2 - "+r"(dst_width) // %3 - : - : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11" - ); -} - -// Reads 4 pixels at a time. -// Alignment requirement: src_argb 4 byte aligned. -void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t, int src_stepx, - uint8* dst_argb, int dst_width) { - asm volatile ( - "mov r12, %3, lsl #2 \n" - ".p2align 2 \n" - "1: \n" - "vld1.32 {d0[0]}, [%0], r12 \n" - "vld1.32 {d0[1]}, [%0], r12 \n" - "vld1.32 {d1[0]}, [%0], r12 \n" - "vld1.32 {d1[1]}, [%0], r12 \n" - "subs %2, %2, #4 \n" // 4 pixels per loop. - "vst1.8 {q0}, [%1]! \n" - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(dst_width) // %2 - : "r"(src_stepx) // %3 - : "memory", "cc", "r12", "q0" - ); -} - -// Reads 4 pixels at a time. -// Alignment requirement: src_argb 4 byte aligned. 
-void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride, - int src_stepx, - uint8* dst_argb, int dst_width) { - asm volatile ( - "mov r12, %4, lsl #2 \n" - "add %1, %1, %0 \n" - ".p2align 2 \n" - "1: \n" - "vld1.8 {d0}, [%0], r12 \n" // Read 4 2x2 blocks -> 2x1 - "vld1.8 {d1}, [%1], r12 \n" - "vld1.8 {d2}, [%0], r12 \n" - "vld1.8 {d3}, [%1], r12 \n" - "vld1.8 {d4}, [%0], r12 \n" - "vld1.8 {d5}, [%1], r12 \n" - "vld1.8 {d6}, [%0], r12 \n" - "vld1.8 {d7}, [%1], r12 \n" - "vaddl.u8 q0, d0, d1 \n" - "vaddl.u8 q1, d2, d3 \n" - "vaddl.u8 q2, d4, d5 \n" - "vaddl.u8 q3, d6, d7 \n" - "vswp.8 d1, d2 \n" // ab_cd -> ac_bd - "vswp.8 d5, d6 \n" // ef_gh -> eg_fh - "vadd.u16 q0, q0, q1 \n" // (a+b)_(c+d) - "vadd.u16 q2, q2, q3 \n" // (e+f)_(g+h) - "vrshrn.u16 d0, q0, #2 \n" // first 2 pixels. - "vrshrn.u16 d1, q2, #2 \n" // next 2 pixels. - "subs %3, %3, #4 \n" // 4 pixels per loop. - "vst1.8 {q0}, [%2]! \n" - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(src_stride), // %1 - "+r"(dst_argb), // %2 - "+r"(dst_width) // %3 - : "r"(src_stepx) // %4 - : "memory", "cc", "r12", "q0", "q1", "q2", "q3" - ); -} -#endif // __ARM_NEON__ - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/chromium/third_party/libyuv/source/scale_common.cc b/chromium/third_party/libyuv/source/scale_common.cc new file mode 100644 index 00000000000..ee6a336292c --- /dev/null +++ b/chromium/third_party/libyuv/source/scale_common.cc @@ -0,0 +1,657 @@ +/* + * Copyright 2013 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/scale.h" + +#include <assert.h> +#include <string.h> + +#include "libyuv/cpu_id.h" +#include "libyuv/planar_functions.h" // For CopyARGB +#include "libyuv/row.h" +#include "libyuv/scale_row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +static __inline int Abs(int v) { + return v >= 0 ? 
v : -v; +} + +// CPU agnostic row functions +void ScaleRowDown2_C(const uint8* src_ptr, ptrdiff_t /* src_stride */, + uint8* dst, int dst_width) { + for (int x = 0; x < dst_width - 1; x += 2) { + dst[0] = src_ptr[1]; + dst[1] = src_ptr[3]; + dst += 2; + src_ptr += 4; + } + if (dst_width & 1) { + dst[0] = src_ptr[1]; + } +} + +void ScaleRowDown2Linear_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + const uint8* s = src_ptr; + for (int x = 0; x < dst_width - 1; x += 2) { + dst[0] = (s[0] + s[1] + 1) >> 1; + dst[1] = (s[2] + s[3] + 1) >> 1; + dst += 2; + s += 4; + } + if (dst_width & 1) { + dst[0] = (s[0] + s[1] + 1) >> 1; + } +} + +void ScaleRowDown2Box_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + const uint8* s = src_ptr; + const uint8* t = src_ptr + src_stride; + for (int x = 0; x < dst_width - 1; x += 2) { + dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2; + dst[1] = (s[2] + s[3] + t[2] + t[3] + 2) >> 2; + dst += 2; + s += 4; + t += 4; + } + if (dst_width & 1) { + dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2; + } +} + +void ScaleRowDown4_C(const uint8* src_ptr, ptrdiff_t /* src_stride */, + uint8* dst, int dst_width) { + for (int x = 0; x < dst_width - 1; x += 2) { + dst[0] = src_ptr[2]; + dst[1] = src_ptr[6]; + dst += 2; + src_ptr += 8; + } + if (dst_width & 1) { + dst[0] = src_ptr[2]; + } +} + +void ScaleRowDown4Box_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + intptr_t stride = src_stride; + for (int x = 0; x < dst_width - 1; x += 2) { + dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] + + src_ptr[stride + 0] + src_ptr[stride + 1] + + src_ptr[stride + 2] + src_ptr[stride + 3] + + src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] + + src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] + + src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] + + src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] + + 8) >> 4; + dst[1] = (src_ptr[4] + src_ptr[5] + src_ptr[6] + src_ptr[7] + + src_ptr[stride + 4] + src_ptr[stride + 5] + + src_ptr[stride + 6] + src_ptr[stride + 7] + + src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5] + + src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7] + + src_ptr[stride * 3 + 4] + src_ptr[stride * 3 + 5] + + src_ptr[stride * 3 + 6] + src_ptr[stride * 3 + 7] + + 8) >> 4; + dst += 2; + src_ptr += 8; + } + if (dst_width & 1) { + dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] + + src_ptr[stride + 0] + src_ptr[stride + 1] + + src_ptr[stride + 2] + src_ptr[stride + 3] + + src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] + + src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] + + src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] + + src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] + + 8) >> 4; + } +} + +void ScaleRowDown34_C(const uint8* src_ptr, ptrdiff_t /* src_stride */, + uint8* dst, int dst_width) { + assert((dst_width % 3 == 0) && (dst_width > 0)); + for (int x = 0; x < dst_width; x += 3) { + dst[0] = src_ptr[0]; + dst[1] = src_ptr[1]; + dst[2] = src_ptr[3]; + dst += 3; + src_ptr += 4; + } +} + +// Filter rows 0 and 1 together, 3 : 1 +void ScaleRowDown34_0_Box_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* d, int dst_width) { + assert((dst_width % 3 == 0) && (dst_width > 0)); + const uint8* s = src_ptr; + const uint8* t = src_ptr + src_stride; + for (int x = 0; x < dst_width; x += 3) { + uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; + uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; + uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; + uint8 b0 = (t[0] * 3 + t[1] * 1 + 
2) >> 2; + uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; + uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; + d[0] = (a0 * 3 + b0 + 2) >> 2; + d[1] = (a1 * 3 + b1 + 2) >> 2; + d[2] = (a2 * 3 + b2 + 2) >> 2; + d += 3; + s += 4; + t += 4; + } +} + +// Filter rows 1 and 2 together, 1 : 1 +void ScaleRowDown34_1_Box_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* d, int dst_width) { + assert((dst_width % 3 == 0) && (dst_width > 0)); + const uint8* s = src_ptr; + const uint8* t = src_ptr + src_stride; + for (int x = 0; x < dst_width; x += 3) { + uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; + uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; + uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; + uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; + uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; + uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; + d[0] = (a0 + b0 + 1) >> 1; + d[1] = (a1 + b1 + 1) >> 1; + d[2] = (a2 + b2 + 1) >> 1; + d += 3; + s += 4; + t += 4; + } +} + +// Scales a single row of pixels using point sampling. +void ScaleCols_C(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int x, int dx) { + for (int j = 0; j < dst_width - 1; j += 2) { + dst_ptr[0] = src_ptr[x >> 16]; + x += dx; + dst_ptr[1] = src_ptr[x >> 16]; + x += dx; + dst_ptr += 2; + } + if (dst_width & 1) { + dst_ptr[0] = src_ptr[x >> 16]; + } +} + +// Scales a single row of pixels up by 2x using point sampling. +void ScaleColsUp2_C(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int, int) { + for (int j = 0; j < dst_width - 1; j += 2) { + dst_ptr[1] = dst_ptr[0] = src_ptr[0]; + src_ptr += 1; + dst_ptr += 2; + } + if (dst_width & 1) { + dst_ptr[0] = src_ptr[0]; + } +} + +// (1-f)a + fb can be replaced with a + f(b-a) +#define BLENDER(a, b, f) (static_cast<int>(a) + \ + ((f) * (static_cast<int>(b) - static_cast<int>(a)) >> 16)) + +void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int x, int dx) { + for (int j = 0; j < dst_width - 1; j += 2) { + int xi = x >> 16; + int a = src_ptr[xi]; + int b = src_ptr[xi + 1]; + dst_ptr[0] = BLENDER(a, b, x & 0xffff); + x += dx; + xi = x >> 16; + a = src_ptr[xi]; + b = src_ptr[xi + 1]; + dst_ptr[1] = BLENDER(a, b, x & 0xffff); + x += dx; + dst_ptr += 2; + } + if (dst_width & 1) { + int xi = x >> 16; + int a = src_ptr[xi]; + int b = src_ptr[xi + 1]; + dst_ptr[0] = BLENDER(a, b, x & 0xffff); + } +} +#undef BLENDER + +void ScaleRowDown38_C(const uint8* src_ptr, ptrdiff_t /* src_stride */, + uint8* dst, int dst_width) { + assert(dst_width % 3 == 0); + for (int x = 0; x < dst_width; x += 3) { + dst[0] = src_ptr[0]; + dst[1] = src_ptr[3]; + dst[2] = src_ptr[6]; + dst += 3; + src_ptr += 8; + } +} + +// 8x3 -> 3x1 +void ScaleRowDown38_3_Box_C(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + assert((dst_width % 3 == 0) && (dst_width > 0)); + intptr_t stride = src_stride; + for (int i = 0; i < dst_width; i += 3) { + dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + + src_ptr[stride + 0] + src_ptr[stride + 1] + + src_ptr[stride + 2] + src_ptr[stride * 2 + 0] + + src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) * + (65536 / 9) >> 16; + dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + + src_ptr[stride + 3] + src_ptr[stride + 4] + + src_ptr[stride + 5] + src_ptr[stride * 2 + 3] + + src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) * + (65536 / 9) >> 16; + dst_ptr[2] = (src_ptr[6] + src_ptr[7] + + src_ptr[stride + 6] + src_ptr[stride + 7] + + src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) * + (65536 / 6) >> 16; + src_ptr += 8; + dst_ptr += 3; + } +} + +// 
8x2 -> 3x1 +void ScaleRowDown38_2_Box_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + assert((dst_width % 3 == 0) && (dst_width > 0)); + intptr_t stride = src_stride; + for (int i = 0; i < dst_width; i += 3) { + dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + + src_ptr[stride + 0] + src_ptr[stride + 1] + + src_ptr[stride + 2]) * (65536 / 6) >> 16; + dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + + src_ptr[stride + 3] + src_ptr[stride + 4] + + src_ptr[stride + 5]) * (65536 / 6) >> 16; + dst_ptr[2] = (src_ptr[6] + src_ptr[7] + + src_ptr[stride + 6] + src_ptr[stride + 7]) * + (65536 / 4) >> 16; + src_ptr += 8; + dst_ptr += 3; + } +} + +void ScaleAddRows_C(const uint8* src_ptr, ptrdiff_t src_stride, + uint16* dst_ptr, int src_width, int src_height) { + assert(src_width > 0); + assert(src_height > 0); + for (int x = 0; x < src_width; ++x) { + const uint8* s = src_ptr + x; + unsigned int sum = 0u; + for (int y = 0; y < src_height; ++y) { + sum += s[0]; + s += src_stride; + } + // TODO(fbarchard): Consider limitting height to 256 to avoid overflow. + dst_ptr[x] = sum < 65535u ? sum : 65535u; + } +} + +void ScaleARGBRowDown2_C(const uint8* src_argb, + ptrdiff_t /* src_stride */, + uint8* dst_argb, int dst_width) { + const uint32* src = reinterpret_cast<const uint32*>(src_argb); + uint32* dst = reinterpret_cast<uint32*>(dst_argb); + + for (int x = 0; x < dst_width - 1; x += 2) { + dst[0] = src[1]; + dst[1] = src[3]; + src += 4; + dst += 2; + } + if (dst_width & 1) { + dst[0] = src[1]; + } +} + +void ScaleARGBRowDown2Linear_C(const uint8* src_argb, + ptrdiff_t /* src_stride */, + uint8* dst_argb, int dst_width) { + for (int x = 0; x < dst_width; ++x) { + dst_argb[0] = (src_argb[0] + src_argb[4] + 1) >> 1; + dst_argb[1] = (src_argb[1] + src_argb[5] + 1) >> 1; + dst_argb[2] = (src_argb[2] + src_argb[6] + 1) >> 1; + dst_argb[3] = (src_argb[3] + src_argb[7] + 1) >> 1; + src_argb += 8; + dst_argb += 4; + } +} + +void ScaleARGBRowDown2Box_C(const uint8* src_argb, ptrdiff_t src_stride, + uint8* dst_argb, int dst_width) { + for (int x = 0; x < dst_width; ++x) { + dst_argb[0] = (src_argb[0] + src_argb[4] + + src_argb[src_stride] + src_argb[src_stride + 4] + 2) >> 2; + dst_argb[1] = (src_argb[1] + src_argb[5] + + src_argb[src_stride + 1] + src_argb[src_stride + 5] + 2) >> 2; + dst_argb[2] = (src_argb[2] + src_argb[6] + + src_argb[src_stride + 2] + src_argb[src_stride + 6] + 2) >> 2; + dst_argb[3] = (src_argb[3] + src_argb[7] + + src_argb[src_stride + 3] + src_argb[src_stride + 7] + 2) >> 2; + src_argb += 8; + dst_argb += 4; + } +} + +void ScaleARGBRowDownEven_C(const uint8* src_argb, ptrdiff_t /* src_stride */, + int src_stepx, + uint8* dst_argb, int dst_width) { + const uint32* src = reinterpret_cast<const uint32*>(src_argb); + uint32* dst = reinterpret_cast<uint32*>(dst_argb); + + for (int x = 0; x < dst_width - 1; x += 2) { + dst[0] = src[0]; + dst[1] = src[src_stepx]; + src += src_stepx * 2; + dst += 2; + } + if (dst_width & 1) { + dst[0] = src[0]; + } +} + +void ScaleARGBRowDownEvenBox_C(const uint8* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8* dst_argb, int dst_width) { + for (int x = 0; x < dst_width; ++x) { + dst_argb[0] = (src_argb[0] + src_argb[4] + + src_argb[src_stride] + src_argb[src_stride + 4] + 2) >> 2; + dst_argb[1] = (src_argb[1] + src_argb[5] + + src_argb[src_stride + 1] + src_argb[src_stride + 5] + 2) >> 2; + dst_argb[2] = (src_argb[2] + src_argb[6] + + src_argb[src_stride + 2] + src_argb[src_stride + 6] + 2) >> 2; + dst_argb[3] 
= (src_argb[3] + src_argb[7] + + src_argb[src_stride + 3] + src_argb[src_stride + 7] + 2) >> 2; + src_argb += src_stepx * 4; + dst_argb += 4; + } +} + +// Scales a single row of pixels using point sampling. +void ScaleARGBCols_C(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx) { + const uint32* src = reinterpret_cast<const uint32*>(src_argb); + uint32* dst = reinterpret_cast<uint32*>(dst_argb); + for (int j = 0; j < dst_width - 1; j += 2) { + dst[0] = src[x >> 16]; + x += dx; + dst[1] = src[x >> 16]; + x += dx; + dst += 2; + } + if (dst_width & 1) { + dst[0] = src[x >> 16]; + } +} + +// Scales a single row of pixels up by 2x using point sampling. +void ScaleARGBColsUp2_C(uint8* dst_argb, const uint8* src_argb, + int dst_width, int, int) { + const uint32* src = reinterpret_cast<const uint32*>(src_argb); + uint32* dst = reinterpret_cast<uint32*>(dst_argb); + for (int j = 0; j < dst_width - 1; j += 2) { + dst[1] = dst[0] = src[0]; + src += 1; + dst += 2; + } + if (dst_width & 1) { + dst[0] = src[0]; + } +} + +// Mimics SSSE3 blender +#define BLENDER1(a, b, f) ((a) * (0x7f ^ f) + (b) * f) >> 7 +#define BLENDERC(a, b, f, s) static_cast<uint32>( \ + BLENDER1(((a) >> s) & 255, ((b) >> s) & 255, f) << s) +#define BLENDER(a, b, f) \ + BLENDERC(a, b, f, 24) | BLENDERC(a, b, f, 16) | \ + BLENDERC(a, b, f, 8) | BLENDERC(a, b, f, 0) + +void ScaleARGBFilterCols_C(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx) { + const uint32* src = reinterpret_cast<const uint32*>(src_argb); + uint32* dst = reinterpret_cast<uint32*>(dst_argb); + for (int j = 0; j < dst_width - 1; j += 2) { + int xi = x >> 16; + int xf = (x >> 9) & 0x7f; + uint32 a = src[xi]; + uint32 b = src[xi + 1]; + dst[0] = BLENDER(a, b, xf); + x += dx; + xi = x >> 16; + xf = (x >> 9) & 0x7f; + a = src[xi]; + b = src[xi + 1]; + dst[1] = BLENDER(a, b, xf); + x += dx; + dst += 2; + } + if (dst_width & 1) { + int xi = x >> 16; + int xf = (x >> 9) & 0x7f; + uint32 a = src[xi]; + uint32 b = src[xi + 1]; + dst[0] = BLENDER(a, b, xf); + } +} +#undef BLENDER1 +#undef BLENDERC +#undef BLENDER + +// Scale plane vertically with bilinear interpolation. +void ScalePlaneVertical(int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_argb, uint8* dst_argb, + int x, int y, int dy, + int bpp, FilterMode filtering) { + // TODO(fbarchard): Allow higher bpp. 
+ assert(bpp >= 1 && bpp <= 4); + assert(src_height != 0); + assert(dst_width > 0); + assert(dst_height > 0); + int dst_width_bytes = dst_width * bpp; + src_argb += (x >> 16) * bpp; + void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb, + ptrdiff_t src_stride, int dst_width, int source_y_fraction) = + InterpolateRow_C; +#if defined(HAS_INTERPOLATEROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && dst_width_bytes >= 16) { + InterpolateRow = InterpolateRow_Any_SSE2; + if (IS_ALIGNED(dst_width_bytes, 16)) { + InterpolateRow = InterpolateRow_Unaligned_SSE2; + if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) { + InterpolateRow = InterpolateRow_SSE2; + } + } + } +#endif +#if defined(HAS_INTERPOLATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && dst_width_bytes >= 16) { + InterpolateRow = InterpolateRow_Any_SSSE3; + if (IS_ALIGNED(dst_width_bytes, 16)) { + InterpolateRow = InterpolateRow_Unaligned_SSSE3; + if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) { + InterpolateRow = InterpolateRow_SSSE3; + } + } + } +#endif +#if defined(HAS_INTERPOLATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && dst_width_bytes >= 32) { + InterpolateRow = InterpolateRow_Any_AVX2; + if (IS_ALIGNED(dst_width_bytes, 32)) { + InterpolateRow = InterpolateRow_AVX2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && dst_width_bytes >= 16) { + InterpolateRow = InterpolateRow_Any_NEON; + if (IS_ALIGNED(dst_width_bytes, 16)) { + InterpolateRow = InterpolateRow_NEON; + } + } +#endif +#if defined(HAS_INTERPOLATEROWS_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && dst_width_bytes >= 4 && + IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4) && + IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) { + InterpolateRow = InterpolateRow_Any_MIPS_DSPR2; + if (IS_ALIGNED(dst_width_bytes, 4)) { + InterpolateRow = InterpolateRow_MIPS_DSPR2; + } + } +#endif + const int max_y = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0; + for (int j = 0; j < dst_height; ++j) { + if (y > max_y) { + y = max_y; + } + int yi = y >> 16; + int yf = filtering ? ((y >> 8) & 255) : 0; + const uint8* src = src_argb + yi * src_stride; + InterpolateRow(dst_argb, src, src_stride, dst_width_bytes, yf); + dst_argb += dst_stride; + y += dy; + } +} + +// Simplify the filtering based on scale factors. +FilterMode ScaleFilterReduce(int src_width, int src_height, + int dst_width, int dst_height, + FilterMode filtering) { + if (src_width < 0) { + src_width = -src_width; + } + if (src_height < 0) { + src_height = -src_height; + } + if (filtering == kFilterBox) { + // If scaling both axis to 0.5 or larger, switch from Box to Bilinear. + if (dst_width * 2 >= src_width && dst_height * 2 >= src_height) { + filtering = kFilterBilinear; + } + // If scaling to larger, switch from Box to Bilinear. + if (dst_width >= src_width || dst_height >= src_height) { + filtering = kFilterBilinear; + } + } + if (filtering == kFilterBilinear) { + if (src_height == 1) { + filtering = kFilterLinear; + } + // TODO(fbarchard): Detect any odd scale factor and reduce to Linear. + if (dst_height == src_height || dst_height * 3 == src_height) { + filtering = kFilterLinear; + } + // TODO(fbarchard): Remove 1 pixel wide filter restriction, which is to + // avoid reading 2 pixels horizontally that causes memory exception. 
+ if (src_width == 1) { + filtering = kFilterNone; + } + } + if (filtering == kFilterLinear) { + if (src_width == 1) { + filtering = kFilterNone; + } + // TODO(fbarchard): Detect any odd scale factor and reduce to None. + if (dst_width == src_width || dst_width * 3 == src_width) { + filtering = kFilterNone; + } + } + return filtering; +} + +#define CENTERSTART(dx, s) (dx < 0) ? -((-dx >> 1) + s) : ((dx >> 1) + s) +#define FIXEDDIV1(src, dst) FixedDiv((src << 16) - 0x00010001, \ + (dst << 16) - 0x00010000); + +// Compute slope values for stepping. +void ScaleSlope(int src_width, int src_height, + int dst_width, int dst_height, + FilterMode filtering, + int* x, int* y, int* dx, int* dy) { + assert(x != NULL); + assert(y != NULL); + assert(dx != NULL); + assert(dy != NULL); + assert(src_width != 0); + assert(src_height != 0); + assert(dst_width > 0); + assert(dst_height > 0); + if (filtering == kFilterBox) { + // Scale step for point sampling duplicates all pixels equally. + *dx = FixedDiv(Abs(src_width), dst_width); + *dy = FixedDiv(src_height, dst_height); + *x = 0; + *y = 0; + } else if (filtering == kFilterBilinear) { + // Scale step for bilinear sampling renders last pixel once for upsample. + if (dst_width <= Abs(src_width)) { + *dx = FixedDiv(Abs(src_width), dst_width); + *x = CENTERSTART(*dx, -32768); // Subtract 0.5 (32768) to center filter. + } else if (dst_width > 1) { + *dx = FIXEDDIV1(Abs(src_width), dst_width); + *x = 0; + } + if (dst_height <= src_height) { + *dy = FixedDiv(src_height, dst_height); + *y = CENTERSTART(*dy, -32768); // Subtract 0.5 (32768) to center filter. + } else if (dst_height > 1) { + *dy = FIXEDDIV1(src_height, dst_height); + *y = 0; + } + } else if (filtering == kFilterLinear) { + // Scale step for bilinear sampling renders last pixel once for upsample. + if (dst_width <= Abs(src_width)) { + *dx = FixedDiv(Abs(src_width), dst_width); + *x = CENTERSTART(*dx, -32768); // Subtract 0.5 (32768) to center filter. + } else if (dst_width > 1) { + *dx = FIXEDDIV1(Abs(src_width), dst_width); + *x = 0; + } + *dy = FixedDiv(src_height, dst_height); + *y = *dy >> 1; + } else { + // Scale step for point sampling duplicates all pixels equally. + *dx = FixedDiv(Abs(src_width), dst_width); + *dy = FixedDiv(src_height, dst_height); + *x = CENTERSTART(*dx, 0); + *y = CENTERSTART(*dy, 0); + } + // Negative src_width means horizontally mirror. 
+ if (src_width < 0) { + *x += (dst_width - 1) * *dx; + *dx = -*dx; + src_width = -src_width; + } +} +#undef CENTERSTART +#undef FIXEDDIV1 + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/chromium/third_party/libyuv/source/scale_mips.cc b/chromium/third_party/libyuv/source/scale_mips.cc index cfd48b5b053..de94560959e 100644 --- a/chromium/third_party/libyuv/source/scale_mips.cc +++ b/chromium/third_party/libyuv/source/scale_mips.cc @@ -30,6 +30,7 @@ void ScaleRowDown2_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */, "beqz $t9, 2f \n" " nop \n" + ".p2align 2 \n" "1: \n" "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0| "lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4| @@ -88,6 +89,7 @@ void ScaleRowDown2Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, "bltz $t9, 2f \n" " nop \n" + ".p2align 2 \n" "1: \n" "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0| "lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4| @@ -176,7 +178,7 @@ void ScaleRowDown2Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, } void ScaleRowDown4_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */, - uint8* dst, int dst_width) { + uint8* dst, int dst_width) { __asm__ __volatile__ ( ".set push \n" ".set noreorder \n" @@ -185,6 +187,7 @@ void ScaleRowDown4_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */, "beqz $t9, 2f \n" " nop \n" + ".p2align 2 \n" "1: \n" "lw $t1, 0(%[src_ptr]) \n" // |3|2|1|0| "lw $t2, 4(%[src_ptr]) \n" // |7|6|5|4| @@ -231,7 +234,7 @@ void ScaleRowDown4_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */, } void ScaleRowDown4Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { + uint8* dst, int dst_width) { intptr_t stride = src_stride; const uint8* s1 = src_ptr + stride; const uint8* s2 = s1 + stride; @@ -244,6 +247,7 @@ void ScaleRowDown4Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, "srl $t9, %[dst_width], 1 \n" "andi $t8, %[dst_width], 1 \n" + ".p2align 2 \n" "1: \n" "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0| "lw $t1, 0(%[s1]) \n" // |7|6|5|4| @@ -314,6 +318,7 @@ void ScaleRowDown34_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */, __asm__ __volatile__ ( ".set push \n" ".set noreorder \n" + ".p2align 2 \n" "1: \n" "lw $t1, 0(%[src_ptr]) \n" // |3|2|1|0| "lw $t2, 4(%[src_ptr]) \n" // |7|6|5|4| @@ -360,7 +365,9 @@ void ScaleRowDown34_0_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, __asm__ __volatile__ ( ".set push \n" ".set noreorder \n" - "repl.ph $t3, 3 \n" // 0x00030003 + "repl.ph $t3, 3 \n" // 0x00030003 + + ".p2align 2 \n" "1: \n" "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0| "lwx $t1, %[src_stride](%[src_ptr]) \n" // |T3|T2|T1|T0| @@ -416,6 +423,8 @@ void ScaleRowDown34_1_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, ".set push \n" ".set noreorder \n" "repl.ph $t2, 3 \n" // 0x00030003 + + ".p2align 2 \n" "1: \n" "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0| "lwx $t1, %[src_stride](%[src_ptr]) \n" // |T3|T2|T1|T0| @@ -466,6 +475,8 @@ void ScaleRowDown38_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */, __asm__ __volatile__ ( ".set push \n" ".set noreorder \n" + + ".p2align 2 \n" "1: \n" "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0| "lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4| @@ -515,6 +526,8 @@ void ScaleRowDown38_2_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, __asm__ __volatile__ ( ".set push \n" ".set noreorder \n" + + ".p2align 2 \n" "1: \n" "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0| "lw $t1, 4(%[src_ptr]) \n" // |S7|S6|S5|S4| @@ -571,6 +584,8 @@ void 
ScaleRowDown38_3_Box_MIPS_DSPR2(const uint8* src_ptr, __asm__ __volatile__ ( ".set push \n" ".set noreorder \n" + + ".p2align 2 \n" "1: \n" "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0| "lw $t1, 4(%[src_ptr]) \n" // |S7|S6|S5|S4| diff --git a/chromium/third_party/libyuv/source/scale_neon.cc b/chromium/third_party/libyuv/source/scale_neon.cc index a370349a72f..c9c6b2cdf88 100644 --- a/chromium/third_party/libyuv/source/scale_neon.cc +++ b/chromium/third_party/libyuv/source/scale_neon.cc @@ -8,7 +8,6 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include "libyuv/basic_types.h" #include "libyuv/row.h" #ifdef __cplusplus @@ -16,7 +15,7 @@ namespace libyuv { extern "C" { #endif -// This module is for GCC Neon +// This module is for GCC Neon. #if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) // NEON downscalers with interpolation. @@ -25,6 +24,7 @@ extern "C" { void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */, uint8* dst, int dst_width) { asm volatile ( + ".p2align 2 \n" "1: \n" // load even pixels into q0, odd into q1 "vld2.8 {q0, q1}, [%0]! \n" @@ -44,6 +44,7 @@ void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, asm volatile ( // change the stride to row 2 pointer "add %1, %0 \n" + ".p2align 2 \n" "1: \n" "vld1.8 {q0, q1}, [%0]! \n" // load row 1 and post inc "vld1.8 {q2, q3}, [%1]! \n" // load row 2 and post inc @@ -68,11 +69,12 @@ void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */, uint8* dst_ptr, int dst_width) { asm volatile ( + ".p2align 2 \n" "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 - "subs %2, %2, #8 \n" // 8 processed per loop - "vst1.8 {d2}, [%1]! \n" - "bgt 1b \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 + "subs %2, %2, #8 \n" // 8 processed per loop + "vst1.8 {d2}, [%1]! \n" + "bgt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 @@ -87,6 +89,7 @@ void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, "add r4, %0, %3 \n" "add r5, r4, %3 \n" "add %3, r5, %3 \n" + ".p2align 2 \n" "1: \n" "vld1.8 {q0}, [%0]! \n" // load up 16x4 "vld1.8 {q1}, [r4]! \n" @@ -117,12 +120,13 @@ void ScaleRowDown34_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */, uint8* dst_ptr, int dst_width) { asm volatile ( + ".p2align 2 \n" "1: \n" - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 - "subs %2, %2, #24 \n" - "vmov d2, d3 \n" // order d0, d1, d2 - "vst3.8 {d0, d1, d2}, [%1]! \n" - "bgt 1b \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 + "subs %2, %2, #24 \n" + "vmov d2, d3 \n" // order d0, d1, d2 + "vst3.8 {d0, d1, d2}, [%1]! \n" + "bgt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 @@ -135,8 +139,9 @@ void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { asm volatile ( - "vmov.u8 d24, #3 \n" - "add %3, %0 \n" + "vmov.u8 d24, #3 \n" + "add %3, %0 \n" + ".p2align 2 \n" "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1 @@ -191,8 +196,9 @@ void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { asm volatile ( - "vmov.u8 d24, #3 \n" - "add %3, %0 \n" + "vmov.u8 d24, #3 \n" + "add %3, %0 \n" + ".p2align 2 \n" "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 "vld4.8 {d4, d5, d6, d7}, [%3]! 
\n" // src line 1 @@ -226,14 +232,14 @@ void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr, } #define HAS_SCALEROWDOWN38_NEON -const uvec8 kShuf38 = +static uvec8 kShuf38 = { 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 }; -const uvec8 kShuf38_2 = +static uvec8 kShuf38_2 = { 0, 8, 16, 2, 10, 17, 4, 12, 18, 6, 14, 19, 0, 0, 0, 0 }; -const vec16 kMult38_Div6 = +static vec16 kMult38_Div6 = { 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 }; -const vec16 kMult38_Div9 = +static vec16 kMult38_Div9 = { 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 }; @@ -242,15 +248,16 @@ void ScaleRowDown38_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */, uint8* dst_ptr, int dst_width) { asm volatile ( - "vld1.8 {q3}, [%3] \n" + "vld1.8 {q3}, [%3] \n" + ".p2align 2 \n" "1: \n" - "vld1.8 {d0, d1, d2, d3}, [%0]! \n" - "subs %2, %2, #12 \n" - "vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n" - "vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n" - "vst1.8 {d4}, [%1]! \n" - "vst1.32 {d5[0]}, [%1]! \n" - "bgt 1b \n" + "vld1.8 {d0, d1, d2, d3}, [%0]! \n" + "subs %2, %2, #12 \n" + "vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n" + "vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n" + "vst1.8 {d4}, [%1]! \n" + "vst1.32 {d5[0]}, [%1]! \n" + "bgt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 @@ -264,11 +271,12 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { asm volatile ( - "vld1.16 {q13}, [%4] \n" - "vld1.8 {q14}, [%5] \n" - "vld1.8 {q15}, [%6] \n" - "add r4, %0, %3, lsl #1 \n" - "add %3, %0 \n" + "vld1.16 {q13}, [%4] \n" + "vld1.8 {q14}, [%5] \n" + "vld1.8 {q15}, [%6] \n" + "add r4, %0, %3, lsl #1 \n" + "add %3, %0 \n" + ".p2align 2 \n" "1: \n" // d0 = 00 40 01 41 02 42 03 43 @@ -374,9 +382,10 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { asm volatile ( - "vld1.16 {q13}, [%4] \n" - "vld1.8 {q14}, [%5] \n" - "add %3, %0 \n" + "vld1.16 {q13}, [%4] \n" + "vld1.8 {q14}, [%5] \n" + "add %3, %0 \n" + ".p2align 2 \n" "1: \n" // d0 = 00 40 01 41 02 42 03 43 @@ -546,6 +555,125 @@ void ScaleFilterRows_NEON(uint8* dst_ptr, : "q0", "q1", "d4", "d5", "q13", "q14", "memory", "cc" ); } + +void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */, + uint8* dst, int dst_width) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + // load even pixels into q0, odd into q1 + "vld2.32 {q0, q1}, [%0]! \n" + "vld2.32 {q2, q3}, [%0]! \n" + "subs %2, %2, #8 \n" // 8 processed per loop + "vst1.8 {q1}, [%1]! \n" // store odd pixels + "vst1.8 {q3}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List + ); +} + +void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + asm volatile ( + // change the stride to row 2 pointer + "add %1, %1, %0 \n" + ".p2align 2 \n" + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. + "subs %3, %3, #8 \n" // 8 processed per loop. + "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. + "vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts. + "vld4.8 {d16, d18, d20, d22}, [%1]! \n" // load 8 more ARGB pixels. + "vld4.8 {d17, d19, d21, d23}, [%1]! 
\n" // load last 8 ARGB pixels. + "vpadal.u8 q0, q8 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q9 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q2, q10 \n" // R 16 bytes -> 8 shorts. + "vpadal.u8 q3, q11 \n" // A 16 bytes -> 8 shorts. + "vrshrn.u16 d0, q0, #2 \n" // downshift, round and pack + "vrshrn.u16 d1, q1, #2 \n" + "vrshrn.u16 d2, q2, #2 \n" + "vrshrn.u16 d3, q3, #2 \n" + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_stride), // %1 + "+r"(dst), // %2 + "+r"(dst_width) // %3 + : + : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11" + ); +} + +// Reads 4 pixels at a time. +// Alignment requirement: src_argb 4 byte aligned. +void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t, int src_stepx, + uint8* dst_argb, int dst_width) { + asm volatile ( + "mov r12, %3, lsl #2 \n" + ".p2align 2 \n" + "1: \n" + "vld1.32 {d0[0]}, [%0], r12 \n" + "vld1.32 {d0[1]}, [%0], r12 \n" + "vld1.32 {d1[0]}, [%0], r12 \n" + "vld1.32 {d1[1]}, [%0], r12 \n" + "subs %2, %2, #4 \n" // 4 pixels per loop. + "vst1.8 {q0}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + : "r"(src_stepx) // %3 + : "memory", "cc", "r12", "q0" + ); +} + +// Reads 4 pixels at a time. +// Alignment requirement: src_argb 4 byte aligned. +void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride, + int src_stepx, + uint8* dst_argb, int dst_width) { + asm volatile ( + "mov r12, %4, lsl #2 \n" + "add %1, %1, %0 \n" + ".p2align 2 \n" + "1: \n" + "vld1.8 {d0}, [%0], r12 \n" // Read 4 2x2 blocks -> 2x1 + "vld1.8 {d1}, [%1], r12 \n" + "vld1.8 {d2}, [%0], r12 \n" + "vld1.8 {d3}, [%1], r12 \n" + "vld1.8 {d4}, [%0], r12 \n" + "vld1.8 {d5}, [%1], r12 \n" + "vld1.8 {d6}, [%0], r12 \n" + "vld1.8 {d7}, [%1], r12 \n" + "vaddl.u8 q0, d0, d1 \n" + "vaddl.u8 q1, d2, d3 \n" + "vaddl.u8 q2, d4, d5 \n" + "vaddl.u8 q3, d6, d7 \n" + "vswp.8 d1, d2 \n" // ab_cd -> ac_bd + "vswp.8 d5, d6 \n" // ef_gh -> eg_fh + "vadd.u16 q0, q0, q1 \n" // (a+b)_(c+d) + "vadd.u16 q2, q2, q3 \n" // (e+f)_(g+h) + "vrshrn.u16 d0, q0, #2 \n" // first 2 pixels. + "vrshrn.u16 d1, q2, #2 \n" // next 2 pixels. + "subs %3, %3, #4 \n" // 4 pixels per loop. + "vst1.8 {q0}, [%2]! \n" + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(src_stride), // %1 + "+r"(dst_argb), // %2 + "+r"(dst_width) // %3 + : "r"(src_stepx) // %4 + : "memory", "cc", "r12", "q0", "q1", "q2", "q3" + ); +} + #endif // __ARM_NEON__ #ifdef __cplusplus diff --git a/chromium/third_party/libyuv/source/scale_posix.cc b/chromium/third_party/libyuv/source/scale_posix.cc new file mode 100644 index 00000000000..a777bfde1cc --- /dev/null +++ b/chromium/third_party/libyuv/source/scale_posix.cc @@ -0,0 +1,1337 @@ +/* + * Copyright 2013 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// This module is for GCC x86 and x64. 
+#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) + +// Offsets for source bytes 0 to 9 +static uvec8 kShuf0 = + { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 }; + +// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12. +static uvec8 kShuf1 = + { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 }; + +// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. +static uvec8 kShuf2 = + { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 }; + +// Offsets for source bytes 0 to 10 +static uvec8 kShuf01 = + { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 }; + +// Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13. +static uvec8 kShuf11 = + { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 }; + +// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. +static uvec8 kShuf21 = + { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 }; + +// Coefficients for source bytes 0 to 10 +static uvec8 kMadd01 = + { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 }; + +// Coefficients for source bytes 10 to 21 +static uvec8 kMadd11 = + { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 }; + +// Coefficients for source bytes 21 to 31 +static uvec8 kMadd21 = + { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 }; + +// Coefficients for source bytes 21 to 31 +static vec16 kRound34 = + { 2, 2, 2, 2, 2, 2, 2, 2 }; + +static uvec8 kShuf38a = + { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }; + +static uvec8 kShuf38b = + { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 }; + +// Arrange words 0,3,6 into 0,1,2 +static uvec8 kShufAc = + { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }; + +// Arrange words 0,3,6 into 3,4,5 +static uvec8 kShufAc3 = + { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 }; + +// Scaling values for boxes of 3x3 and 2x3 +static uvec16 kScaleAc33 = + { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 }; + +// Arrange first value for pixels 0,1,2,3,4,5 +static uvec8 kShufAb0 = + { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 }; + +// Arrange second value for pixels 0,1,2,3,4,5 +static uvec8 kShufAb1 = + { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 }; + +// Arrange third value for pixels 0,1,2,3,4,5 +static uvec8 kShufAb2 = + { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 }; + +// Scaling values for boxes of 3x2 and 2x2 +static uvec16 kScaleAb2 = + { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 }; + +// TODO(nfullagar): For Native Client: When new toolchain becomes available, +// take advantage of bundle lock / unlock feature. This will reduce the amount +// of manual bundle alignment done below, and bundle alignment could even be +// moved into each macro that doesn't use %%nacl: such as MEMOPREG. 
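The TODO above concerns the Native Client sandbox: indexed memory operands have to be routed through %r15 and the loop bodies kept on 32-byte bundle boundaries, which appears to be what the MEMACCESS / MEMOPREG / BUNDLEALIGN macros defined immediately below encode. As a rough stand-alone sketch (an editor's illustration, not part of the patch), this shows the strings the plain, non-NaCl variants of those macros produce for asm operand %0; under NaCl x86-64 the same operand is instead emitted as "%%nacl:(%%r15,%q0)" and BUNDLEALIGN becomes ".p2align 5" (32-byte alignment).

  // Hypothetical, stand-alone illustration only; mirrors the non-NaCl
  // macro definitions that follow in scale_posix.cc.
  #include <stdio.h>
  #define MEMACCESS(base) "(%" #base ")"
  #define MEMLEA(offset, base) #offset "(%" #base ")"
  int main() {
    puts(MEMACCESS(0));     // prints: (%0)
    puts(MEMLEA(0x20, 0));  // prints: 0x20(%0)
    return 0;
  }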
+ +#if defined(__native_client__) && defined(__x86_64__) +#define MEMACCESS(base) "%%nacl:(%%r15,%q" #base ")" +#define MEMACCESS2(offset, base) "%%nacl:" #offset "(%%r15,%q" #base ")" +#define MEMLEA(offset, base) #offset "(%q" #base ")" +#define MEMLEA3(offset, index, scale) \ + #offset "(,%q" #index "," #scale ")" +#define MEMLEA4(offset, base, index, scale) \ + #offset "(%q" #base ",%q" #index "," #scale ")" +#define MEMOPREG(opcode, offset, base, index, scale, reg) \ + "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \ + #opcode " (%%r15,%%r14),%%" #reg "\n" +#define MEMOPMEM(opcode, reg, offset, base, index, scale) \ + "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \ + #opcode " %%" #reg ",(%%r15,%%r14)\n" +#define MEMOP(opcode, offset, base, index, scale) \ + "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \ + #opcode " (%%r15,%%r14)" +#define BUNDLEALIGN ".p2align 5\n" +#else +#define MEMACCESS(base) "(%" #base ")" +#define MEMACCESS2(offset, base) #offset "(%" #base ")" +#define MEMLEA(offset, base) #offset "(%" #base ")" +#define MEMLEA3(offset, index, scale) \ + #offset "(,%" #index "," #scale ")" +#define MEMLEA4(offset, base, index, scale) \ + #offset "(%" #base ",%" #index "," #scale ")" +#define MEMOPREG(opcode, offset, base, index, scale, reg) \ + #opcode " " #offset "(%" #base ",%" #index "," #scale "),%%" #reg "\n" +#define MEMOPMEM(opcode, reg, offset, base, index, scale) \ + #opcode " %%" #reg ","#offset "(%" #base ",%" #index "," #scale ")\n" +#define MEMOP(opcode, offset, base, index, scale) \ + #opcode " " #offset "(%" #base ",%" #index "," #scale ")" +#define BUNDLEALIGN +#endif + +// GCC versions of row functions are verbatim conversions from Visual C. +// Generated using gcc disassembly on Visual C object file: +// objdump -D yuvscaler.obj >yuvscaler.txt + +void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + ".p2align 2 \n" + BUNDLEALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1" +#endif + ); +} + +void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + ".p2align 2 \n" + BUNDLEALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10, 0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "psrlw $0x8,%%xmm0 \n" + "movdqa %%xmm1,%%xmm3 \n" + "psrlw $0x8,%%xmm1 \n" + "pand %%xmm5,%%xmm2 \n" + "pand %%xmm5,%%xmm3 \n" + "pavgw %%xmm2,%%xmm0 \n" + "pavgw %%xmm3,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm5" +#endif + ); +} + +void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + ".p2align 2 \n" + BUNDLEALIGN + "1: \n" + "movdqa " 
MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + MEMOPREG(movdqa,0x00,0,3,1,xmm2) // movdqa (%0,%3,1),%%xmm2 + BUNDLEALIGN + MEMOPREG(movdqa,0x10,0,3,1,xmm3) // movdqa 0x10(%0,%3,1),%%xmm3 + "lea " MEMLEA(0x20,0) ",%0 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "psrlw $0x8,%%xmm0 \n" + "movdqa %%xmm1,%%xmm3 \n" + "psrlw $0x8,%%xmm1 \n" + "pand %%xmm5,%%xmm2 \n" + "pand %%xmm5,%%xmm3 \n" + "pavgw %%xmm2,%%xmm0 \n" + "pavgw %%xmm3,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"(static_cast<intptr_t>(src_stride)) // %3 + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" +#endif + ); +} + +void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + ".p2align 2 \n" + BUNDLEALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1" +#endif + ); +} + +void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr, ptrdiff_t, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + ".p2align 2 \n" + BUNDLEALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "psrlw $0x8,%%xmm0 \n" + "movdqa %%xmm1,%%xmm3 \n" + "psrlw $0x8,%%xmm1 \n" + "pand %%xmm5,%%xmm2 \n" + "pand %%xmm5,%%xmm3 \n" + "pavgw %%xmm2,%%xmm0 \n" + "pavgw %%xmm3,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm5" +#endif + ); +} + +void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + ".p2align 2 \n" + BUNDLEALIGN + "1: \n" + "movdqu " MEMACCESS(0) ",%%xmm0 \n" + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" + MEMOPREG(movdqu,0x00,0,3,1,xmm2) // movdqu (%0,%3,1),%%xmm2 + BUNDLEALIGN + MEMOPREG(movdqu,0x10,0,3,1,xmm3) // movdqu 0x10(%0,%3,1),%%xmm3 + "lea " MEMLEA(0x20,0) ",%0 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "psrlw $0x8,%%xmm0 \n" + "movdqa %%xmm1,%%xmm3 \n" + "psrlw $0x8,%%xmm1 \n" + "pand %%xmm5,%%xmm2 \n" + "pand %%xmm5,%%xmm3 \n" + "pavgw %%xmm2,%%xmm0 \n" + "pavgw %%xmm3,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"(static_cast<intptr_t>(src_stride)) // %3 + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", 
"xmm1", "xmm2", "xmm3", "xmm5" +#endif + ); +} + +void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrld $0x18,%%xmm5 \n" + "pslld $0x10,%%xmm5 \n" + ".p2align 2 \n" + BUNDLEALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "psrlw $0x8,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x8,1) ",%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm5" +#endif + ); +} + +void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + intptr_t stridex3 = 0; + asm volatile ( + "pcmpeqb %%xmm7,%%xmm7 \n" + "psrlw $0x8,%%xmm7 \n" + "lea " MEMLEA4(0x00,4,4,2) ",%3 \n" + ".p2align 2 \n" + BUNDLEALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + MEMOPREG(movdqa,0x00,0,4,1,xmm2) // movdqa (%0,%4,1),%%xmm2 + BUNDLEALIGN + MEMOPREG(movdqa,0x10,0,4,1,xmm3) // movdqa 0x10(%0,%4,1),%%xmm3 + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + MEMOPREG(movdqa,0x00,0,4,2,xmm2) // movdqa (%0,%4,2),%%xmm2 + BUNDLEALIGN + MEMOPREG(movdqa,0x10,0,4,2,xmm3) // movdqa 0x10(%0,%4,2),%%xmm3 + MEMOPREG(movdqa,0x00,0,3,1,xmm4) // movdqa (%0,%3,1),%%xmm4 + MEMOPREG(movdqa,0x10,0,3,1,xmm5) // movdqa 0x10(%0,%3,1),%%xmm5 + "lea " MEMLEA(0x20,0) ",%0 \n" + "pavgb %%xmm4,%%xmm2 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm5,%%xmm3 \n" + "pavgb %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "psrlw $0x8,%%xmm0 \n" + "movdqa %%xmm1,%%xmm3 \n" + "psrlw $0x8,%%xmm1 \n" + "pand %%xmm7,%%xmm2 \n" + "pand %%xmm7,%%xmm3 \n" + "pavgw %%xmm2,%%xmm0 \n" + "pavgw %%xmm3,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "psrlw $0x8,%%xmm0 \n" + "pand %%xmm7,%%xmm2 \n" + "pavgw %%xmm2,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x8,1) ",%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(stridex3) // %3 + : "r"(static_cast<intptr_t>(src_stride)) // %4 + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm7" +#endif + ); +} + +void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "movdqa %0,%%xmm3 \n" + "movdqa %1,%%xmm4 \n" + "movdqa %2,%%xmm5 \n" + : + : "m"(kShuf0), // %0 + "m"(kShuf1), // %1 + "m"(kShuf2) // %2 + ); + asm volatile ( + ".p2align 2 \n" + BUNDLEALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm2 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "movdqa %%xmm2,%%xmm1 \n" + "palignr $0x8,%%xmm0,%%xmm1 \n" + "pshufb %%xmm3,%%xmm0 \n" + "pshufb %%xmm4,%%xmm1 \n" + "pshufb %%xmm5,%%xmm2 \n" + "movq %%xmm0," MEMACCESS(1) " \n" + "movq %%xmm1," MEMACCESS2(0x8,1) " \n" + "movq %%xmm2," MEMACCESS2(0x10,1) " \n" + "lea " MEMLEA(0x18,1) ",%1 \n" + "sub $0x18,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} + 
+void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "movdqa %0,%%xmm2 \n" // kShuf01 + "movdqa %1,%%xmm3 \n" // kShuf11 + "movdqa %2,%%xmm4 \n" // kShuf21 + : + : "m"(kShuf01), // %0 + "m"(kShuf11), // %1 + "m"(kShuf21) // %2 + ); + asm volatile ( + "movdqa %0,%%xmm5 \n" // kMadd01 + "movdqa %1,%%xmm0 \n" // kMadd11 + "movdqa %2,%%xmm1 \n" // kRound34 + : + : "m"(kMadd01), // %0 + "m"(kMadd11), // %1 + "m"(kRound34) // %2 + ); + asm volatile ( + ".p2align 2 \n" + BUNDLEALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm6 \n" + MEMOPREG(movdqa,0x00,0,3,1,xmm7) // movdqa (%0,%3),%%xmm7 + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm5,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6," MEMACCESS(1) " \n" + "movdqu " MEMACCESS2(0x8,0) ",%%xmm6 \n" + MEMOPREG(movdqu,0x8,0,3,1,xmm7) // movdqu 0x8(%0,%3),%%xmm7 + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm3,%%xmm6 \n" + "pmaddubsw %%xmm0,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6," MEMACCESS2(0x8,1) " \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm6 \n" + BUNDLEALIGN + MEMOPREG(movdqa,0x10,0,3,1,xmm7) // movdqa 0x10(%0,%3),%%xmm7 + "lea " MEMLEA(0x20,0) ",%0 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm4,%%xmm6 \n" + "pmaddubsw %4,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6," MEMACCESS2(0x10,1) " \n" + "lea " MEMLEA(0x18,1) ",%1 \n" + "sub $0x18,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"(static_cast<intptr_t>(src_stride)), // %3 + "m"(kMadd21) // %4 + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" +#endif + ); +} + +void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "movdqa %0,%%xmm2 \n" // kShuf01 + "movdqa %1,%%xmm3 \n" // kShuf11 + "movdqa %2,%%xmm4 \n" // kShuf21 + : + : "m"(kShuf01), // %0 + "m"(kShuf11), // %1 + "m"(kShuf21) // %2 + ); + asm volatile ( + "movdqa %0,%%xmm5 \n" // kMadd01 + "movdqa %1,%%xmm0 \n" // kMadd11 + "movdqa %2,%%xmm1 \n" // kRound34 + : + : "m"(kMadd01), // %0 + "m"(kMadd11), // %1 + "m"(kRound34) // %2 + ); + + asm volatile ( + ".p2align 2 \n" + BUNDLEALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm6 \n" + MEMOPREG(movdqa,0x00,0,3,1,xmm7) // movdqa (%0,%3,1),%%xmm7 + "pavgb %%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm5,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6," MEMACCESS(1) " \n" + "movdqu " MEMACCESS2(0x8,0) ",%%xmm6 \n" + MEMOPREG(movdqu,0x8,0,3,1,xmm7) // movdqu 0x8(%0,%3,1),%%xmm7 + "pavgb %%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm3,%%xmm6 \n" + "pmaddubsw %%xmm0,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6," MEMACCESS2(0x8,1) " \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm6 \n" + MEMOPREG(movdqa,0x10,0,3,1,xmm7) // movdqa 0x10(%0,%3,1),%%xmm7 + "lea " MEMLEA(0x20,0) ",%0 \n" + "pavgb %%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm4,%%xmm6 \n" + "pmaddubsw %4,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq 
%%xmm6," MEMACCESS2(0x10,1) " \n" + "lea " MEMLEA(0x18,1) ",%1 \n" + "sub $0x18,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"(static_cast<intptr_t>(src_stride)), // %3 + "m"(kMadd21) // %4 + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" +#endif + ); +} + +void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" + ".p2align 2 \n" + BUNDLEALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "pshufb %%xmm4,%%xmm0 \n" + "pshufb %%xmm5,%%xmm1 \n" + "paddusb %%xmm1,%%xmm0 \n" + "movq %%xmm0," MEMACCESS(1) " \n" + "movhlps %%xmm0,%%xmm1 \n" + "movd %%xmm1," MEMACCESS2(0x8,1) " \n" + "lea " MEMLEA(0xc,1) ",%1 \n" + "sub $0xc,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "m"(kShuf38a), // %3 + "m"(kShuf38b) // %4 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm4", "xmm5" +#endif + ); +} + +void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "movdqa %0,%%xmm2 \n" + "movdqa %1,%%xmm3 \n" + "movdqa %2,%%xmm4 \n" + "movdqa %3,%%xmm5 \n" + : + : "m"(kShufAb0), // %0 + "m"(kShufAb1), // %1 + "m"(kShufAb2), // %2 + "m"(kScaleAb2) // %3 + ); + asm volatile ( + ".p2align 2 \n" + BUNDLEALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + MEMOPREG(pavgb,0x00,0,3,1,xmm0) // pavgb (%0,%3,1),%%xmm0 + "lea " MEMLEA(0x10,0) ",%0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pshufb %%xmm2,%%xmm1 \n" + "movdqa %%xmm0,%%xmm6 \n" + "pshufb %%xmm3,%%xmm6 \n" + "paddusw %%xmm6,%%xmm1 \n" + "pshufb %%xmm4,%%xmm0 \n" + "paddusw %%xmm0,%%xmm1 \n" + "pmulhuw %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "sub $0x6,%2 \n" + "movd %%xmm1," MEMACCESS(1) " \n" + "psrlq $0x10,%%xmm1 \n" + "movd %%xmm1," MEMACCESS2(0x2,1) " \n" + "lea " MEMLEA(0x6,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"(static_cast<intptr_t>(src_stride)) // %3 + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" +#endif + ); +} + +void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + asm volatile ( + "movdqa %0,%%xmm2 \n" + "movdqa %1,%%xmm3 \n" + "movdqa %2,%%xmm4 \n" + "pxor %%xmm5,%%xmm5 \n" + : + : "m"(kShufAc), // %0 + "m"(kShufAc3), // %1 + "m"(kScaleAc33) // %2 + ); + asm volatile ( + ".p2align 2 \n" + BUNDLEALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + MEMOPREG(movdqa,0x00,0,3,1,xmm6) // movdqa (%0,%3,1),%%xmm6 + "movhlps %%xmm0,%%xmm1 \n" + "movhlps %%xmm6,%%xmm7 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "punpcklbw %%xmm5,%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm6 \n" + "punpcklbw %%xmm5,%%xmm7 \n" + "paddusw %%xmm6,%%xmm0 \n" + "paddusw %%xmm7,%%xmm1 \n" + MEMOPREG(movdqa,0x00,0,3,2,xmm6) // movdqa (%0,%3,2),%%xmm6 + "lea " MEMLEA(0x10,0) ",%0 \n" + "movhlps %%xmm6,%%xmm7 \n" + "punpcklbw %%xmm5,%%xmm6 \n" + "punpcklbw %%xmm5,%%xmm7 \n" + "paddusw %%xmm6,%%xmm0 \n" + "paddusw %%xmm7,%%xmm1 \n" + "movdqa %%xmm0,%%xmm6 \n" + "psrldq $0x2,%%xmm0 \n" + "paddusw %%xmm0,%%xmm6 \n" + "psrldq $0x2,%%xmm0 
\n" + "paddusw %%xmm0,%%xmm6 \n" + "pshufb %%xmm2,%%xmm6 \n" + "movdqa %%xmm1,%%xmm7 \n" + "psrldq $0x2,%%xmm1 \n" + "paddusw %%xmm1,%%xmm7 \n" + "psrldq $0x2,%%xmm1 \n" + "paddusw %%xmm1,%%xmm7 \n" + "pshufb %%xmm3,%%xmm7 \n" + "paddusw %%xmm7,%%xmm6 \n" + "pmulhuw %%xmm4,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "sub $0x6,%2 \n" + "movd %%xmm6," MEMACCESS(1) " \n" + "psrlq $0x10,%%xmm6 \n" + "movd %%xmm6," MEMACCESS2(0x2,1) " \n" + "lea " MEMLEA(0x6,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"(static_cast<intptr_t>(src_stride)) // %3 + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" +#endif + ); +} + +void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint16* dst_ptr, int src_width, int src_height) { + int tmp_height = 0; + intptr_t tmp_src = 0; + asm volatile ( + "pxor %%xmm4,%%xmm4 \n" + "sub $0x1,%5 \n" + ".p2align 2 \n" + BUNDLEALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "mov %0,%3 \n" + "add %6,%0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm4,%%xmm0 \n" + "punpckhbw %%xmm4,%%xmm1 \n" + "mov %5,%2 \n" + "test %2,%2 \n" + "je 3f \n" + ".p2align 2 \n" + BUNDLEALIGN + "2: \n" + "movdqa " MEMACCESS(0) ",%%xmm2 \n" + "add %6,%0 \n" + "movdqa %%xmm2,%%xmm3 \n" + "punpcklbw %%xmm4,%%xmm2 \n" + "punpckhbw %%xmm4,%%xmm3 \n" + "paddusw %%xmm2,%%xmm0 \n" + "paddusw %%xmm3,%%xmm1 \n" + "sub $0x1,%2 \n" + "jg 2b \n" + ".p2align 2 \n" + "3: \n" + BUNDLEALIGN + "movdqa %%xmm0," MEMACCESS(1) " \n" + "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n" + "lea " MEMLEA(0x10,3) ",%0 \n" + "lea " MEMLEA(0x20,1) ",%1 \n" + "sub $0x10,%4 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(tmp_height), // %2 + "+r"(tmp_src), // %3 + "+r"(src_width), // %4 + "+rm"(src_height) // %5 + : "rm"(static_cast<intptr_t>(src_stride)) // %6 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" +#endif + ); +} + +// Bilinear column filtering. SSSE3 version. 
+void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int x, int dx) { + intptr_t x0 = 0, x1 = 0, temp_pixel = 0; + asm volatile ( + "movd %6,%%xmm2 \n" + "movd %7,%%xmm3 \n" + "movl $0x04040000,%k2 \n" + "movd %k2,%%xmm5 \n" + "pcmpeqb %%xmm6,%%xmm6 \n" + "psrlw $0x9,%%xmm6 \n" + "pextrw $0x1,%%xmm2,%k3 \n" + "subl $0x2,%5 \n" + "jl 29f \n" + "movdqa %%xmm2,%%xmm0 \n" + "paddd %%xmm3,%%xmm0 \n" + "punpckldq %%xmm0,%%xmm2 \n" + "punpckldq %%xmm3,%%xmm3 \n" + "paddd %%xmm3,%%xmm3 \n" + "pextrw $0x3,%%xmm2,%k4 \n" + ".p2align 2 \n" + BUNDLEALIGN + "2: \n" + "movdqa %%xmm2,%%xmm1 \n" + "paddd %%xmm3,%%xmm2 \n" + MEMOP(movzwl,0x00,1,3,1) ",%k2 \n" // movzwl (%1,%3,1),%k2 + "movd %k2,%%xmm0 \n" + "psrlw $0x9,%%xmm1 \n" + BUNDLEALIGN + MEMOP(movzwl,0x00,1,4,1) ",%k2 \n" // movzwl (%1,%4,1),%k2 + "movd %k2,%%xmm4 \n" + "pshufb %%xmm5,%%xmm1 \n" + "punpcklwd %%xmm4,%%xmm0 \n" + "pxor %%xmm6,%%xmm1 \n" + "pmaddubsw %%xmm1,%%xmm0 \n" + "pextrw $0x1,%%xmm2,%k3 \n" + "pextrw $0x3,%%xmm2,%k4 \n" + "psrlw $0x7,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movd %%xmm0,%k2 \n" + "mov %w2," MEMACCESS(0) " \n" + "lea " MEMLEA(0x2,0) ",%0 \n" + "sub $0x2,%5 \n" + "jge 2b \n" + ".p2align 2 \n" + BUNDLEALIGN + "29: \n" + "addl $0x1,%5 \n" + "jl 99f \n" + MEMOP(movzwl,0x00,1,3,1) ",%k2 \n" // movzwl (%1,%3,1),%k2 + "movd %k2,%%xmm0 \n" + "psrlw $0x9,%%xmm2 \n" + "pshufb %%xmm5,%%xmm2 \n" + "pxor %%xmm6,%%xmm2 \n" + "pmaddubsw %%xmm2,%%xmm0 \n" + "psrlw $0x7,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movd %%xmm0,%k2 \n" + "mov %b2," MEMACCESS(0) " \n" + "99: \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+a"(temp_pixel), // %2 + "+r"(x0), // %3 + "+r"(x1), // %4 + "+rm"(dst_width) // %5 + : "rm"(x), // %6 + "rm"(dx) // %7 + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" +#endif + ); +} + +// Reads 4 pixels, duplicates them and writes 8 pixels. +// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. 
+void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int /* x */, int /* dx */) { + asm volatile ( + ".p2align 2 \n" + BUNDLEALIGN + "1: \n" + "movdqa " MEMACCESS(1) ",%%xmm0 \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm1 \n" + "sub $0x20,%2 \n" + "movdqa %%xmm0," MEMACCESS(0) " \n" + "movdqa %%xmm1," MEMACCESS2(0x10,0) " \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "jg 1b \n" + + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1" +#endif + ); +} + +void ScaleARGBRowDown2_SSE2(const uint8* src_argb, + ptrdiff_t /* src_stride */, + uint8* dst_argb, int dst_width) { + asm volatile ( + ".p2align 2 \n" + BUNDLEALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "shufps $0xdd,%%xmm1,%%xmm0 \n" + "sub $0x4,%2 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1" +#endif + ); +} + +void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb, + ptrdiff_t /* src_stride */, + uint8* dst_argb, int dst_width) { + asm volatile ( + ".p2align 2 \n" + BUNDLEALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm2 \n" + "pavgb %%xmm2,%%xmm0 \n" + "sub $0x4,%2 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1" +#endif + ); +} + +void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb, + ptrdiff_t src_stride, + uint8* dst_argb, int dst_width) { + asm volatile ( + ".p2align 2 \n" + BUNDLEALIGN + "1: \n" + "movdqa " MEMACCESS(0) ",%%xmm0 \n" + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" + BUNDLEALIGN + MEMOPREG(movdqa,0x00,0,3,1,xmm2) // movdqa (%0,%3,1),%%xmm2 + MEMOPREG(movdqa,0x10,0,3,1,xmm3) // movdqa 0x10(%0,%3,1),%%xmm3 + "lea " MEMLEA(0x20,0) ",%0 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm2 \n" + "pavgb %%xmm2,%%xmm0 \n" + "sub $0x4,%2 \n" + "movdqa %%xmm0," MEMACCESS(1) " \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + : "r"(static_cast<intptr_t>(src_stride)) // %3 + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3" +#endif + ); +} + +// Reads 4 pixels at a time. +// Alignment requirement: dst_argb 16 byte aligned. 
+void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride, + int src_stepx, + uint8* dst_argb, int dst_width) { + intptr_t src_stepx_x4 = static_cast<intptr_t>(src_stepx); + intptr_t src_stepx_x12 = 0; + asm volatile ( + "lea " MEMLEA3(0x00,1,4) ",%1 \n" + "lea " MEMLEA4(0x00,1,1,2) ",%4 \n" + ".p2align 2 \n" + BUNDLEALIGN + "1: \n" + "movd " MEMACCESS(0) ",%%xmm0 \n" + MEMOPREG(movd,0x00,0,1,1,xmm1) // movd (%0,%1,1),%%xmm1 + "punpckldq %%xmm1,%%xmm0 \n" + BUNDLEALIGN + MEMOPREG(movd,0x00,0,1,2,xmm2) // movd (%0,%1,2),%%xmm2 + MEMOPREG(movd,0x00,0,4,1,xmm3) // movd (%0,%4,1),%%xmm3 + "lea " MEMLEA4(0x00,0,1,4) ",%0 \n" + "punpckldq %%xmm3,%%xmm2 \n" + "punpcklqdq %%xmm2,%%xmm0 \n" + "sub $0x4,%3 \n" + "movdqa %%xmm0," MEMACCESS(2) " \n" + "lea " MEMLEA(0x10,2) ",%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(src_stepx_x4), // %1 + "+r"(dst_argb), // %2 + "+r"(dst_width), // %3 + "+r"(src_stepx_x12) // %4 + : + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3" +#endif + ); +} + +// Blends four 2x2 to 4x1. +// Alignment requirement: dst_argb 16 byte aligned. +void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb, + ptrdiff_t src_stride, int src_stepx, + uint8* dst_argb, int dst_width) { + intptr_t src_stepx_x4 = static_cast<intptr_t>(src_stepx); + intptr_t src_stepx_x12 = 0; + intptr_t row1 = static_cast<intptr_t>(src_stride); + asm volatile ( + "lea " MEMLEA3(0x00,1,4) ",%1 \n" + "lea " MEMLEA4(0x00,1,1,2) ",%4 \n" + "lea " MEMLEA4(0x00,0,5,1) ",%5 \n" + ".p2align 2 \n" + BUNDLEALIGN + "1: \n" + "movq " MEMACCESS(0) ",%%xmm0 \n" + MEMOPREG(movhps,0x00,0,1,1,xmm0) // movhps (%0,%1,1),%%xmm0 + MEMOPREG(movq,0x00,0,1,2,xmm1) // movq (%0,%1,2),%%xmm1 + BUNDLEALIGN + MEMOPREG(movhps,0x00,0,4,1,xmm1) // movhps (%0,%4,1),%%xmm1 + "lea " MEMLEA4(0x00,0,1,4) ",%0 \n" + "movq " MEMACCESS(5) ",%%xmm2 \n" + BUNDLEALIGN + MEMOPREG(movhps,0x00,5,1,1,xmm2) // movhps (%5,%1,1),%%xmm2 + MEMOPREG(movq,0x00,5,1,2,xmm3) // movq (%5,%1,2),%%xmm3 + MEMOPREG(movhps,0x00,5,4,1,xmm3) // movhps (%5,%4,1),%%xmm3 + "lea " MEMLEA4(0x00,5,1,4) ",%5 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm2 \n" + "pavgb %%xmm2,%%xmm0 \n" + "sub $0x4,%3 \n" + "movdqa %%xmm0," MEMACCESS(2) " \n" + "lea " MEMLEA(0x10,2) ",%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(src_stepx_x4), // %1 + "+r"(dst_argb), // %2 + "+rm"(dst_width), // %3 + "+r"(src_stepx_x12), // %4 + "+r"(row1) // %5 + : + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3" +#endif + ); +} + +void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx) { + intptr_t x0 = 0, x1 = 0; + asm volatile ( + "movd %5,%%xmm2 \n" + "movd %6,%%xmm3 \n" + "pshufd $0x0,%%xmm2,%%xmm2 \n" + "pshufd $0x11,%%xmm3,%%xmm0 \n" + "paddd %%xmm0,%%xmm2 \n" + "paddd %%xmm3,%%xmm3 \n" + "pshufd $0x5,%%xmm3,%%xmm0 \n" + "paddd %%xmm0,%%xmm2 \n" + "paddd %%xmm3,%%xmm3 \n" + "pshufd $0x0,%%xmm3,%%xmm3 \n" + "pextrw $0x1,%%xmm2,%k0 \n" + "pextrw $0x3,%%xmm2,%k1 \n" + "cmp $0x0,%4 \n" + "jl 99f \n" + "sub $0x4,%4 \n" + "jl 49f \n" + ".p2align 2 \n" + BUNDLEALIGN + "40: \n" + MEMOPREG(movd,0x00,3,0,4,xmm0) // movd (%3,%0,4),%%xmm0 + MEMOPREG(movd,0x00,3,1,4,xmm1) // movd (%3,%1,4),%%xmm1 + "pextrw $0x5,%%xmm2,%k0 \n" + "pextrw 
$0x7,%%xmm2,%k1 \n" + "paddd %%xmm3,%%xmm2 \n" + "punpckldq %%xmm1,%%xmm0 \n" + MEMOPREG(movd,0x00,3,0,4,xmm1) // movd (%3,%0,4),%%xmm1 + MEMOPREG(movd,0x00,3,1,4,xmm4) // movd (%3,%1,4),%%xmm4 + "pextrw $0x1,%%xmm2,%k0 \n" + "pextrw $0x3,%%xmm2,%k1 \n" + "punpckldq %%xmm4,%%xmm1 \n" + "punpcklqdq %%xmm1,%%xmm0 \n" + "sub $0x4,%4 \n" + "movdqu %%xmm0," MEMACCESS(2) " \n" + "lea " MEMLEA(0x10,2) ",%2 \n" + "jge 40b \n" + + "49: \n" + "test $0x2,%4 \n" + "je 29f \n" + BUNDLEALIGN + MEMOPREG(movd,0x00,3,0,4,xmm0) // movd (%3,%0,4),%%xmm0 + MEMOPREG(movd,0x00,3,1,4,xmm1) // movd (%3,%1,4),%%xmm1 + "pextrw $0x5,%%xmm2,%k0 \n" + "punpckldq %%xmm1,%%xmm0 \n" + "movq %%xmm0," MEMACCESS(2) " \n" + "lea " MEMLEA(0x8,2) ",%2 \n" + "29: \n" + "test $0x1,%4 \n" + "je 99f \n" + MEMOPREG(movd,0x00,3,0,4,xmm0) // movd (%3,%0,4),%%xmm0 + "movd %%xmm0," MEMACCESS(2) " \n" + "99: \n" + : "+a"(x0), // %0 + "+d"(x1), // %1 + "+r"(dst_argb), // %2 + "+r"(src_argb), // %3 + "+r"(dst_width) // %4 + : "rm"(x), // %5 + "rm"(dx) // %6 + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" +#endif + ); +} + +// Reads 4 pixels, duplicates them and writes 8 pixels. +// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. +void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb, + int dst_width, int /* x */, int /* dx */) { + asm volatile ( + ".p2align 2 \n" + BUNDLEALIGN + "1: \n" + "movdqa " MEMACCESS(1) ",%%xmm0 \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpckldq %%xmm0,%%xmm0 \n" + "punpckhdq %%xmm1,%%xmm1 \n" + "sub $0x8,%2 \n" + "movdqa %%xmm0," MEMACCESS(0) " \n" + "movdqa %%xmm1," MEMACCESS2(0x10,0) " \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "jg 1b \n" + + : "+r"(dst_argb), // %0 + "+r"(src_argb), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1" +#endif + ); +} + +// Shuffle table for arranging 2 pixels into pairs for pmaddubsw +static uvec8 kShuffleColARGB = { + 0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel + 8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel +}; + +// Shuffle table for duplicating 2 fractions into 8 bytes each +static uvec8 kShuffleFractions = { + 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, +}; + +// Bilinear row filtering combines 4x2 -> 4x1. 
SSSE3 version +void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx) { + intptr_t x0 = 0, x1 = 0; + asm volatile ( + "movdqa %0,%%xmm4 \n" + "movdqa %1,%%xmm5 \n" + : + : "m"(kShuffleColARGB), // %0 + "m"(kShuffleFractions) // %1 + ); + + asm volatile ( + "movd %5,%%xmm2 \n" + "movd %6,%%xmm3 \n" + "pcmpeqb %%xmm6,%%xmm6 \n" + "psrlw $0x9,%%xmm6 \n" + "pextrw $0x1,%%xmm2,%k3 \n" + "sub $0x2,%2 \n" + "jl 29f \n" + "movdqa %%xmm2,%%xmm0 \n" + "paddd %%xmm3,%%xmm0 \n" + "punpckldq %%xmm0,%%xmm2 \n" + "punpckldq %%xmm3,%%xmm3 \n" + "paddd %%xmm3,%%xmm3 \n" + "pextrw $0x3,%%xmm2,%k4 \n" + + ".p2align 2 \n" + BUNDLEALIGN + "2: \n" + "movdqa %%xmm2,%%xmm1 \n" + "paddd %%xmm3,%%xmm2 \n" + MEMOPREG(movq,0x00,1,3,4,xmm0) // movq (%1,%3,4),%%xmm0 + "psrlw $0x9,%%xmm1 \n" + BUNDLEALIGN + MEMOPREG(movhps,0x00,1,4,4,xmm0) // movhps (%1,%4,4),%%xmm0 + "pshufb %%xmm5,%%xmm1 \n" + "pshufb %%xmm4,%%xmm0 \n" + "pxor %%xmm6,%%xmm1 \n" + "pmaddubsw %%xmm1,%%xmm0 \n" + "psrlw $0x7,%%xmm0 \n" + "pextrw $0x1,%%xmm2,%k3 \n" + "pextrw $0x3,%%xmm2,%k4 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0," MEMACCESS(0) " \n" + "lea " MEMLEA(0x8,0) ",%0 \n" + "sub $0x2,%2 \n" + "jge 2b \n" + + ".p2align 2 \n" + BUNDLEALIGN + "29: \n" + "add $0x1,%2 \n" + "jl 99f \n" + "psrlw $0x9,%%xmm2 \n" + BUNDLEALIGN + MEMOPREG(movq,0x00,1,3,4,xmm0) // movq (%1,%3,4),%%xmm0 + "pshufb %%xmm5,%%xmm2 \n" + "pshufb %%xmm4,%%xmm0 \n" + "pxor %%xmm6,%%xmm2 \n" + "pmaddubsw %%xmm2,%%xmm0 \n" + "psrlw $0x7,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movd %%xmm0," MEMACCESS(0) " \n" + + ".p2align 2 \n" + "99: \n" + : "+r"(dst_argb), // %0 + "+r"(src_argb), // %1 + "+rm"(dst_width), // %2 + "+r"(x0), // %3 + "+r"(x1) // %4 + : "rm"(x), // %5 + "rm"(dx) // %6 + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" +#endif + ); +} + +#endif // defined(__x86_64__) || defined(__i386__) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/chromium/third_party/libyuv/source/scale_win.cc b/chromium/third_party/libyuv/source/scale_win.cc new file mode 100644 index 00000000000..76f5f4b4b4f --- /dev/null +++ b/chromium/third_party/libyuv/source/scale_win.cc @@ -0,0 +1,1289 @@ +/* + * Copyright 2013 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// This module is for Visual C x86. +#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) + +// Offsets for source bytes 0 to 9 +static uvec8 kShuf0 = + { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 }; + +// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12. +static uvec8 kShuf1 = + { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 }; + +// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. 
+static uvec8 kShuf2 = + { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 }; + +// Offsets for source bytes 0 to 10 +static uvec8 kShuf01 = + { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 }; + +// Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13. +static uvec8 kShuf11 = + { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 }; + +// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. +static uvec8 kShuf21 = + { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 }; + +// Coefficients for source bytes 0 to 10 +static uvec8 kMadd01 = + { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 }; + +// Coefficients for source bytes 10 to 21 +static uvec8 kMadd11 = + { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 }; + +// Coefficients for source bytes 21 to 31 +static uvec8 kMadd21 = + { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 }; + +// Coefficients for source bytes 21 to 31 +static vec16 kRound34 = + { 2, 2, 2, 2, 2, 2, 2, 2 }; + +static uvec8 kShuf38a = + { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }; + +static uvec8 kShuf38b = + { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 }; + +// Arrange words 0,3,6 into 0,1,2 +static uvec8 kShufAc = + { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }; + +// Arrange words 0,3,6 into 3,4,5 +static uvec8 kShufAc3 = + { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 }; + +// Scaling values for boxes of 3x3 and 2x3 +static uvec16 kScaleAc33 = + { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 }; + +// Arrange first value for pixels 0,1,2,3,4,5 +static uvec8 kShufAb0 = + { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 }; + +// Arrange second value for pixels 0,1,2,3,4,5 +static uvec8 kShufAb1 = + { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 }; + +// Arrange third value for pixels 0,1,2,3,4,5 +static uvec8 kShufAb2 = + { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 }; + +// Scaling values for boxes of 3x2 and 2x2 +static uvec16 kScaleAb2 = + { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 }; + +// Reads 32 pixels, throws half away and writes 16 pixels. +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. +__declspec(naked) __declspec(align(16)) +void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + mov eax, [esp + 4] // src_ptr + // src_stride ignored + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width + + align 4 + wloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + lea eax, [eax + 32] + psrlw xmm0, 8 // isolate odd pixels. + psrlw xmm1, 8 + packuswb xmm0, xmm1 + sub ecx, 16 + movdqa [edx], xmm0 + lea edx, [edx + 16] + jg wloop + + ret + } +} + +// Blends 32x1 rectangle to 16x1. +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. 
+__declspec(naked) __declspec(align(16)) +void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + mov eax, [esp + 4] // src_ptr + // src_stride + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 + + align 4 + wloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + lea eax, [eax + 32] + + movdqa xmm2, xmm0 // average columns (32 to 16 pixels) + psrlw xmm0, 8 + movdqa xmm3, xmm1 + psrlw xmm1, 8 + pand xmm2, xmm5 + pand xmm3, xmm5 + pavgw xmm0, xmm2 + pavgw xmm1, xmm3 + packuswb xmm0, xmm1 + + sub ecx, 16 + movdqa [edx], xmm0 + lea edx, [edx + 16] + jg wloop + + ret + } +} + +// Blends 32x2 rectangle to 16x1. +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. +__declspec(naked) __declspec(align(16)) +void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_ptr + mov esi, [esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_ptr + mov ecx, [esp + 4 + 16] // dst_width + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 + + align 4 + wloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + movdqa xmm2, [eax + esi] + movdqa xmm3, [eax + esi + 16] + lea eax, [eax + 32] + pavgb xmm0, xmm2 // average rows + pavgb xmm1, xmm3 + + movdqa xmm2, xmm0 // average columns (32 to 16 pixels) + psrlw xmm0, 8 + movdqa xmm3, xmm1 + psrlw xmm1, 8 + pand xmm2, xmm5 + pand xmm3, xmm5 + pavgw xmm0, xmm2 + pavgw xmm1, xmm3 + packuswb xmm0, xmm1 + + sub ecx, 16 + movdqa [edx], xmm0 + lea edx, [edx + 16] + jg wloop + + pop esi + ret + } +} + +// Reads 32 pixels, throws half away and writes 16 pixels. +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. +__declspec(naked) __declspec(align(16)) +void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + mov eax, [esp + 4] // src_ptr + // src_stride ignored + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width + + align 4 + wloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + lea eax, [eax + 32] + psrlw xmm0, 8 // isolate odd pixels. + psrlw xmm1, 8 + packuswb xmm0, xmm1 + sub ecx, 16 + movdqu [edx], xmm0 + lea edx, [edx + 16] + jg wloop + + ret + } +} + +// Blends 32x1 rectangle to 16x1. +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. +__declspec(naked) __declspec(align(16)) +void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr, ptrdiff_t, + uint8* dst_ptr, int dst_width) { + __asm { + mov eax, [esp + 4] // src_ptr + // src_stride + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 + + align 4 + wloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + lea eax, [eax + 32] + + movdqa xmm2, xmm0 // average columns (32 to 16 pixels) + psrlw xmm0, 8 + movdqa xmm3, xmm1 + psrlw xmm1, 8 + pand xmm2, xmm5 + pand xmm3, xmm5 + pavgw xmm0, xmm2 + pavgw xmm1, xmm3 + packuswb xmm0, xmm1 + + sub ecx, 16 + movdqu [edx], xmm0 + lea edx, [edx + 16] + jg wloop + + ret + } +} + +// Blends 32x2 rectangle to 16x1. +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. 
+__declspec(naked) __declspec(align(16)) +void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_ptr + mov esi, [esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_ptr + mov ecx, [esp + 4 + 16] // dst_width + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 + + align 4 + wloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + esi] + movdqu xmm3, [eax + esi + 16] + lea eax, [eax + 32] + pavgb xmm0, xmm2 // average rows + pavgb xmm1, xmm3 + + movdqa xmm2, xmm0 // average columns (32 to 16 pixels) + psrlw xmm0, 8 + movdqa xmm3, xmm1 + psrlw xmm1, 8 + pand xmm2, xmm5 + pand xmm3, xmm5 + pavgw xmm0, xmm2 + pavgw xmm1, xmm3 + packuswb xmm0, xmm1 + + sub ecx, 16 + movdqu [edx], xmm0 + lea edx, [edx + 16] + jg wloop + + pop esi + ret + } +} + +// Point samples 32 pixels to 8 pixels. +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. +__declspec(naked) __declspec(align(16)) +void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + mov eax, [esp + 4] // src_ptr + // src_stride ignored + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width + pcmpeqb xmm5, xmm5 // generate mask 0x00ff0000 + psrld xmm5, 24 + pslld xmm5, 16 + + align 4 + wloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + lea eax, [eax + 32] + pand xmm0, xmm5 + pand xmm1, xmm5 + packuswb xmm0, xmm1 + psrlw xmm0, 8 + packuswb xmm0, xmm0 + sub ecx, 8 + movq qword ptr [edx], xmm0 + lea edx, [edx + 8] + jg wloop + + ret + } +} + +// Blends 32x4 rectangle to 8x1. +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. +__declspec(naked) __declspec(align(16)) +void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_ptr + mov esi, [esp + 8 + 8] // src_stride + mov edx, [esp + 8 + 12] // dst_ptr + mov ecx, [esp + 8 + 16] // dst_width + lea edi, [esi + esi * 2] // src_stride * 3 + pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff + psrlw xmm7, 8 + + align 4 + wloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + movdqa xmm2, [eax + esi] + movdqa xmm3, [eax + esi + 16] + pavgb xmm0, xmm2 // average rows + pavgb xmm1, xmm3 + movdqa xmm2, [eax + esi * 2] + movdqa xmm3, [eax + esi * 2 + 16] + movdqa xmm4, [eax + edi] + movdqa xmm5, [eax + edi + 16] + lea eax, [eax + 32] + pavgb xmm2, xmm4 + pavgb xmm3, xmm5 + pavgb xmm0, xmm2 + pavgb xmm1, xmm3 + + movdqa xmm2, xmm0 // average columns (32 to 16 pixels) + psrlw xmm0, 8 + movdqa xmm3, xmm1 + psrlw xmm1, 8 + pand xmm2, xmm7 + pand xmm3, xmm7 + pavgw xmm0, xmm2 + pavgw xmm1, xmm3 + packuswb xmm0, xmm1 + + movdqa xmm2, xmm0 // average columns (16 to 8 pixels) + psrlw xmm0, 8 + pand xmm2, xmm7 + pavgw xmm0, xmm2 + packuswb xmm0, xmm0 + + sub ecx, 8 + movq qword ptr [edx], xmm0 + lea edx, [edx + 8] + jg wloop + + pop edi + pop esi + ret + } +} + +// Point samples 32 pixels to 24 pixels. +// Produces three 8 byte values. For each 8 bytes, 16 bytes are read. +// Then shuffled to do the scaling. + +// Note that movdqa+palign may be better than movdqu. +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. 
+__declspec(naked) __declspec(align(16)) +void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + mov eax, [esp + 4] // src_ptr + // src_stride ignored + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width + movdqa xmm3, kShuf0 + movdqa xmm4, kShuf1 + movdqa xmm5, kShuf2 + + align 4 + wloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + lea eax, [eax + 32] + movdqa xmm2, xmm1 + palignr xmm1, xmm0, 8 + pshufb xmm0, xmm3 + pshufb xmm1, xmm4 + pshufb xmm2, xmm5 + movq qword ptr [edx], xmm0 + movq qword ptr [edx + 8], xmm1 + movq qword ptr [edx + 16], xmm2 + lea edx, [edx + 24] + sub ecx, 24 + jg wloop + + ret + } +} + +// Blends 32x2 rectangle to 24x1 +// Produces three 8 byte values. For each 8 bytes, 16 bytes are read. +// Then shuffled to do the scaling. + +// Register usage: +// xmm0 src_row 0 +// xmm1 src_row 1 +// xmm2 shuf 0 +// xmm3 shuf 1 +// xmm4 shuf 2 +// xmm5 madd 0 +// xmm6 madd 1 +// xmm7 kRound34 + +// Note that movdqa+palign may be better than movdqu. +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. +__declspec(naked) __declspec(align(16)) +void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_ptr + mov esi, [esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_ptr + mov ecx, [esp + 4 + 16] // dst_width + movdqa xmm2, kShuf01 + movdqa xmm3, kShuf11 + movdqa xmm4, kShuf21 + movdqa xmm5, kMadd01 + movdqa xmm6, kMadd11 + movdqa xmm7, kRound34 + + align 4 + wloop: + movdqa xmm0, [eax] // pixels 0..7 + movdqa xmm1, [eax + esi] + pavgb xmm0, xmm1 + pshufb xmm0, xmm2 + pmaddubsw xmm0, xmm5 + paddsw xmm0, xmm7 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + movq qword ptr [edx], xmm0 + movdqu xmm0, [eax + 8] // pixels 8..15 + movdqu xmm1, [eax + esi + 8] + pavgb xmm0, xmm1 + pshufb xmm0, xmm3 + pmaddubsw xmm0, xmm6 + paddsw xmm0, xmm7 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + movq qword ptr [edx + 8], xmm0 + movdqa xmm0, [eax + 16] // pixels 16..23 + movdqa xmm1, [eax + esi + 16] + lea eax, [eax + 32] + pavgb xmm0, xmm1 + pshufb xmm0, xmm4 + movdqa xmm1, kMadd21 + pmaddubsw xmm0, xmm1 + paddsw xmm0, xmm7 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + sub ecx, 24 + movq qword ptr [edx + 16], xmm0 + lea edx, [edx + 24] + jg wloop + + pop esi + ret + } +} + +// Note that movdqa+palign may be better than movdqu. +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. 
+__declspec(naked) __declspec(align(16)) +void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_ptr + mov esi, [esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_ptr + mov ecx, [esp + 4 + 16] // dst_width + movdqa xmm2, kShuf01 + movdqa xmm3, kShuf11 + movdqa xmm4, kShuf21 + movdqa xmm5, kMadd01 + movdqa xmm6, kMadd11 + movdqa xmm7, kRound34 + + align 4 + wloop: + movdqa xmm0, [eax] // pixels 0..7 + movdqa xmm1, [eax + esi] + pavgb xmm1, xmm0 + pavgb xmm0, xmm1 + pshufb xmm0, xmm2 + pmaddubsw xmm0, xmm5 + paddsw xmm0, xmm7 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + movq qword ptr [edx], xmm0 + movdqu xmm0, [eax + 8] // pixels 8..15 + movdqu xmm1, [eax + esi + 8] + pavgb xmm1, xmm0 + pavgb xmm0, xmm1 + pshufb xmm0, xmm3 + pmaddubsw xmm0, xmm6 + paddsw xmm0, xmm7 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + movq qword ptr [edx + 8], xmm0 + movdqa xmm0, [eax + 16] // pixels 16..23 + movdqa xmm1, [eax + esi + 16] + lea eax, [eax + 32] + pavgb xmm1, xmm0 + pavgb xmm0, xmm1 + pshufb xmm0, xmm4 + movdqa xmm1, kMadd21 + pmaddubsw xmm0, xmm1 + paddsw xmm0, xmm7 + psrlw xmm0, 2 + packuswb xmm0, xmm0 + sub ecx, 24 + movq qword ptr [edx + 16], xmm0 + lea edx, [edx+24] + jg wloop + + pop esi + ret + } +} + +// 3/8 point sampler + +// Scale 32 pixels to 12 +__declspec(naked) __declspec(align(16)) +void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + mov eax, [esp + 4] // src_ptr + // src_stride ignored + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width + movdqa xmm4, kShuf38a + movdqa xmm5, kShuf38b + + align 4 + xloop: + movdqa xmm0, [eax] // 16 pixels -> 0,1,2,3,4,5 + movdqa xmm1, [eax + 16] // 16 pixels -> 6,7,8,9,10,11 + lea eax, [eax + 32] + pshufb xmm0, xmm4 + pshufb xmm1, xmm5 + paddusb xmm0, xmm1 + + sub ecx, 12 + movq qword ptr [edx], xmm0 // write 12 pixels + movhlps xmm1, xmm0 + movd [edx + 8], xmm1 + lea edx, [edx + 12] + jg xloop + + ret + } +} + +// Scale 16x3 pixels to 6x1 with interpolation +__declspec(naked) __declspec(align(16)) +void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_ptr + mov esi, [esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_ptr + mov ecx, [esp + 4 + 16] // dst_width + movdqa xmm2, kShufAc + movdqa xmm3, kShufAc3 + movdqa xmm4, kScaleAc33 + pxor xmm5, xmm5 + + align 4 + xloop: + movdqa xmm0, [eax] // sum up 3 rows into xmm0/1 + movdqa xmm6, [eax + esi] + movhlps xmm1, xmm0 + movhlps xmm7, xmm6 + punpcklbw xmm0, xmm5 + punpcklbw xmm1, xmm5 + punpcklbw xmm6, xmm5 + punpcklbw xmm7, xmm5 + paddusw xmm0, xmm6 + paddusw xmm1, xmm7 + movdqa xmm6, [eax + esi * 2] + lea eax, [eax + 16] + movhlps xmm7, xmm6 + punpcklbw xmm6, xmm5 + punpcklbw xmm7, xmm5 + paddusw xmm0, xmm6 + paddusw xmm1, xmm7 + + movdqa xmm6, xmm0 // 8 pixels -> 0,1,2 of xmm6 + psrldq xmm0, 2 + paddusw xmm6, xmm0 + psrldq xmm0, 2 + paddusw xmm6, xmm0 + pshufb xmm6, xmm2 + + movdqa xmm7, xmm1 // 8 pixels -> 3,4,5 of xmm6 + psrldq xmm1, 2 + paddusw xmm7, xmm1 + psrldq xmm1, 2 + paddusw xmm7, xmm1 + pshufb xmm7, xmm3 + paddusw xmm6, xmm7 + + pmulhuw xmm6, xmm4 // divide by 9,9,6, 9,9,6 + packuswb xmm6, xmm6 + + sub ecx, 6 + movd [edx], xmm6 // write 6 pixels + psrlq xmm6, 16 + movd [edx + 2], xmm6 + lea edx, [edx + 6] + jg xloop + + pop esi + ret + } +} + +// Scale 16x2 pixels to 6x1 with 
interpolation +__declspec(naked) __declspec(align(16)) +void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_ptr + mov esi, [esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_ptr + mov ecx, [esp + 4 + 16] // dst_width + movdqa xmm2, kShufAb0 + movdqa xmm3, kShufAb1 + movdqa xmm4, kShufAb2 + movdqa xmm5, kScaleAb2 + + align 4 + xloop: + movdqa xmm0, [eax] // average 2 rows into xmm0 + pavgb xmm0, [eax + esi] + lea eax, [eax + 16] + + movdqa xmm1, xmm0 // 16 pixels -> 0,1,2,3,4,5 of xmm1 + pshufb xmm1, xmm2 + movdqa xmm6, xmm0 + pshufb xmm6, xmm3 + paddusw xmm1, xmm6 + pshufb xmm0, xmm4 + paddusw xmm1, xmm0 + + pmulhuw xmm1, xmm5 // divide by 3,3,2, 3,3,2 + packuswb xmm1, xmm1 + + sub ecx, 6 + movd [edx], xmm1 // write 6 pixels + psrlq xmm1, 16 + movd [edx + 2], xmm1 + lea edx, [edx + 6] + jg xloop + + pop esi + ret + } +} + +// Reads 16xN bytes and produces 16 shorts at a time. +// TODO(fbarchard): Make this handle 4xN bytes for any width ARGB. +__declspec(naked) __declspec(align(16)) +void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint16* dst_ptr, int src_width, + int src_height) { + __asm { + push esi + push edi + push ebx + push ebp + mov esi, [esp + 16 + 4] // src_ptr + mov edx, [esp + 16 + 8] // src_stride + mov edi, [esp + 16 + 12] // dst_ptr + mov ecx, [esp + 16 + 16] // dst_width + mov ebx, [esp + 16 + 20] // height + pxor xmm4, xmm4 + dec ebx + + align 4 + xloop: + // first row + movdqa xmm0, [esi] + lea eax, [esi + edx] + movdqa xmm1, xmm0 + punpcklbw xmm0, xmm4 + punpckhbw xmm1, xmm4 + lea esi, [esi + 16] + mov ebp, ebx + test ebp, ebp + je ydone + + // sum remaining rows + align 4 + yloop: + movdqa xmm2, [eax] // read 16 pixels + lea eax, [eax + edx] // advance to next row + movdqa xmm3, xmm2 + punpcklbw xmm2, xmm4 + punpckhbw xmm3, xmm4 + paddusw xmm0, xmm2 // sum 16 words + paddusw xmm1, xmm3 + sub ebp, 1 + jg yloop + + align 4 + ydone: + movdqa [edi], xmm0 + movdqa [edi + 16], xmm1 + lea edi, [edi + 32] + + sub ecx, 16 + jg xloop + + pop ebp + pop ebx + pop edi + pop esi + ret + } +} + +// Bilinear column filtering. SSSE3 version. +// TODO(fbarchard): Port to Neon +// TODO(fbarchard): Switch the following: +// xor ebx, ebx +// mov bx, word ptr [esi + eax] // 2 source x0 pixels +// To +// movzx ebx, word ptr [esi + eax] // 2 source x0 pixels +// when drmemory bug fixed. +// https://code.google.com/p/drmemory/issues/detail?id=1396 + +__declspec(naked) __declspec(align(16)) +void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int x, int dx) { + __asm { + push ebx + push esi + push edi + mov edi, [esp + 12 + 4] // dst_ptr + mov esi, [esp + 12 + 8] // src_ptr + mov ecx, [esp + 12 + 12] // dst_width + movd xmm2, [esp + 12 + 16] // x + movd xmm3, [esp + 12 + 20] // dx + mov eax, 0x04040000 // shuffle to line up fractions with pixel. + movd xmm5, eax + pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction. + psrlw xmm6, 9 + pextrw eax, xmm2, 1 // get x0 integer. preroll + sub ecx, 2 + jl xloop29 + + movdqa xmm0, xmm2 // x1 = x0 + dx + paddd xmm0, xmm3 + punpckldq xmm2, xmm0 // x0 x1 + punpckldq xmm3, xmm3 // dx dx + paddd xmm3, xmm3 // dx * 2, dx * 2 + pextrw edx, xmm2, 3 // get x1 integer. preroll + + // 2 Pixel loop. + align 4 + xloop2: + movdqa xmm1, xmm2 // x0, x1 fractions. 
+ paddd xmm2, xmm3 // x += dx + movzx ebx, word ptr [esi + eax] // 2 source x0 pixels + movd xmm0, ebx + psrlw xmm1, 9 // 7 bit fractions. + movzx ebx, word ptr [esi + edx] // 2 source x1 pixels + movd xmm4, ebx + pshufb xmm1, xmm5 // 0011 + punpcklwd xmm0, xmm4 + pxor xmm1, xmm6 // 0..7f and 7f..0 + pmaddubsw xmm0, xmm1 // 16 bit, 2 pixels. + pextrw eax, xmm2, 1 // get x0 integer. next iteration. + pextrw edx, xmm2, 3 // get x1 integer. next iteration. + psrlw xmm0, 7 // 8.7 fixed point to low 8 bits. + packuswb xmm0, xmm0 // 8 bits, 2 pixels. + movd ebx, xmm0 + mov [edi], bx + lea edi, [edi + 2] + sub ecx, 2 // 2 pixels + jge xloop2 + + align 4 + xloop29: + + add ecx, 2 - 1 + jl xloop99 + + // 1 pixel remainder + movzx ebx, word ptr [esi + eax] // 2 source x0 pixels + movd xmm0, ebx + psrlw xmm2, 9 // 7 bit fractions. + pshufb xmm2, xmm5 // 0011 + pxor xmm2, xmm6 // 0..7f and 7f..0 + pmaddubsw xmm0, xmm2 // 16 bit + psrlw xmm0, 7 // 8.7 fixed point to low 8 bits. + packuswb xmm0, xmm0 // 8 bits + movd ebx, xmm0 + mov [edi], bl + + align 4 + xloop99: + + pop edi + pop esi + pop ebx + ret + } +} + +// Reads 16 pixels, duplicates them and writes 32 pixels. +// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. +__declspec(naked) __declspec(align(16)) +void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int /* x */, int /* dx */) { + __asm { + mov edx, [esp + 4] // dst_ptr + mov eax, [esp + 8] // src_ptr + mov ecx, [esp + 12] // dst_width + + align 4 + wloop: + movdqa xmm0, [eax] + lea eax, [eax + 16] + movdqa xmm1, xmm0 + punpcklbw xmm0, xmm0 + punpckhbw xmm1, xmm1 + sub ecx, 32 + movdqa [edx], xmm0 + movdqa [edx + 16], xmm1 + lea edx, [edx + 32] + jg wloop + + ret + } +} + +// Reads 8 pixels, throws half away and writes 4 even pixels (0, 2, 4, 6) +// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. +__declspec(naked) __declspec(align(16)) +void ScaleARGBRowDown2_SSE2(const uint8* src_argb, + ptrdiff_t /* src_stride */, + uint8* dst_argb, int dst_width) { + __asm { + mov eax, [esp + 4] // src_argb + // src_stride ignored + mov edx, [esp + 12] // dst_argb + mov ecx, [esp + 16] // dst_width + + align 4 + wloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + lea eax, [eax + 32] + shufps xmm0, xmm1, 0xdd + sub ecx, 4 + movdqa [edx], xmm0 + lea edx, [edx + 16] + jg wloop + + ret + } +} + +// Blends 8x1 rectangle to 4x1. +// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. +__declspec(naked) __declspec(align(16)) +void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb, + ptrdiff_t /* src_stride */, + uint8* dst_argb, int dst_width) { + __asm { + mov eax, [esp + 4] // src_argb + // src_stride ignored + mov edx, [esp + 12] // dst_argb + mov ecx, [esp + 16] // dst_width + + align 4 + wloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + lea eax, [eax + 32] + movdqa xmm2, xmm0 + shufps xmm0, xmm1, 0x88 // even pixels + shufps xmm2, xmm1, 0xdd // odd pixels + pavgb xmm0, xmm2 + sub ecx, 4 + movdqa [edx], xmm0 + lea edx, [edx + 16] + jg wloop + + ret + } +} + +// Blends 8x2 rectangle to 4x1. +// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. 
+__declspec(naked) __declspec(align(16)) +void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb, + ptrdiff_t src_stride, + uint8* dst_argb, int dst_width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_argb + mov esi, [esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_argb + mov ecx, [esp + 4 + 16] // dst_width + + align 4 + wloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + movdqa xmm2, [eax + esi] + movdqa xmm3, [eax + esi + 16] + lea eax, [eax + 32] + pavgb xmm0, xmm2 // average rows + pavgb xmm1, xmm3 + movdqa xmm2, xmm0 // average columns (8 to 4 pixels) + shufps xmm0, xmm1, 0x88 // even pixels + shufps xmm2, xmm1, 0xdd // odd pixels + pavgb xmm0, xmm2 + sub ecx, 4 + movdqa [edx], xmm0 + lea edx, [edx + 16] + jg wloop + + pop esi + ret + } +} + +// Reads 4 pixels at a time. +// Alignment requirement: dst_argb 16 byte aligned. +__declspec(naked) __declspec(align(16)) +void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride, + int src_stepx, + uint8* dst_argb, int dst_width) { + __asm { + push ebx + push edi + mov eax, [esp + 8 + 4] // src_argb + // src_stride ignored + mov ebx, [esp + 8 + 12] // src_stepx + mov edx, [esp + 8 + 16] // dst_argb + mov ecx, [esp + 8 + 20] // dst_width + lea ebx, [ebx * 4] + lea edi, [ebx + ebx * 2] + + align 4 + wloop: + movd xmm0, [eax] + movd xmm1, [eax + ebx] + punpckldq xmm0, xmm1 + movd xmm2, [eax + ebx * 2] + movd xmm3, [eax + edi] + lea eax, [eax + ebx * 4] + punpckldq xmm2, xmm3 + punpcklqdq xmm0, xmm2 + sub ecx, 4 + movdqa [edx], xmm0 + lea edx, [edx + 16] + jg wloop + + pop edi + pop ebx + ret + } +} + +// Blends four 2x2 to 4x1. +// Alignment requirement: dst_argb 16 byte aligned. +__declspec(naked) __declspec(align(16)) +void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8* dst_argb, int dst_width) { + __asm { + push ebx + push esi + push edi + mov eax, [esp + 12 + 4] // src_argb + mov esi, [esp + 12 + 8] // src_stride + mov ebx, [esp + 12 + 12] // src_stepx + mov edx, [esp + 12 + 16] // dst_argb + mov ecx, [esp + 12 + 20] // dst_width + lea esi, [eax + esi] // row1 pointer + lea ebx, [ebx * 4] + lea edi, [ebx + ebx * 2] + + align 4 + wloop: + movq xmm0, qword ptr [eax] // row0 4 pairs + movhps xmm0, qword ptr [eax + ebx] + movq xmm1, qword ptr [eax + ebx * 2] + movhps xmm1, qword ptr [eax + edi] + lea eax, [eax + ebx * 4] + movq xmm2, qword ptr [esi] // row1 4 pairs + movhps xmm2, qword ptr [esi + ebx] + movq xmm3, qword ptr [esi + ebx * 2] + movhps xmm3, qword ptr [esi + edi] + lea esi, [esi + ebx * 4] + pavgb xmm0, xmm2 // average rows + pavgb xmm1, xmm3 + movdqa xmm2, xmm0 // average columns (8 to 4 pixels) + shufps xmm0, xmm1, 0x88 // even pixels + shufps xmm2, xmm1, 0xdd // odd pixels + pavgb xmm0, xmm2 + sub ecx, 4 + movdqa [edx], xmm0 + lea edx, [edx + 16] + jg wloop + + pop edi + pop esi + pop ebx + ret + } +} + +// Column scaling unfiltered. SSE2 version. 
+__declspec(naked) __declspec(align(16)) +void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx) { + __asm { + push edi + push esi + mov edi, [esp + 8 + 4] // dst_argb + mov esi, [esp + 8 + 8] // src_argb + mov ecx, [esp + 8 + 12] // dst_width + movd xmm2, [esp + 8 + 16] // x + movd xmm3, [esp + 8 + 20] // dx + + pshufd xmm2, xmm2, 0 // x0 x0 x0 x0 + pshufd xmm0, xmm3, 0x11 // dx 0 dx 0 + paddd xmm2, xmm0 + paddd xmm3, xmm3 // 0, 0, 0, dx * 2 + pshufd xmm0, xmm3, 0x05 // dx * 2, dx * 2, 0, 0 + paddd xmm2, xmm0 // x3 x2 x1 x0 + paddd xmm3, xmm3 // 0, 0, 0, dx * 4 + pshufd xmm3, xmm3, 0 // dx * 4, dx * 4, dx * 4, dx * 4 + + pextrw eax, xmm2, 1 // get x0 integer. + pextrw edx, xmm2, 3 // get x1 integer. + + cmp ecx, 0 + jle xloop99 + sub ecx, 4 + jl xloop49 + + // 4 Pixel loop. + align 4 + xloop4: + movd xmm0, [esi + eax * 4] // 1 source x0 pixels + movd xmm1, [esi + edx * 4] // 1 source x1 pixels + pextrw eax, xmm2, 5 // get x2 integer. + pextrw edx, xmm2, 7 // get x3 integer. + paddd xmm2, xmm3 // x += dx + punpckldq xmm0, xmm1 // x0 x1 + + movd xmm1, [esi + eax * 4] // 1 source x2 pixels + movd xmm4, [esi + edx * 4] // 1 source x3 pixels + pextrw eax, xmm2, 1 // get x0 integer. next iteration. + pextrw edx, xmm2, 3 // get x1 integer. next iteration. + punpckldq xmm1, xmm4 // x2 x3 + punpcklqdq xmm0, xmm1 // x0 x1 x2 x3 + sub ecx, 4 // 4 pixels + movdqu [edi], xmm0 + lea edi, [edi + 16] + jge xloop4 + + align 4 + xloop49: + test ecx, 2 + je xloop29 + + // 2 Pixels. + movd xmm0, [esi + eax * 4] // 1 source x0 pixels + movd xmm1, [esi + edx * 4] // 1 source x1 pixels + pextrw eax, xmm2, 5 // get x2 integer. + punpckldq xmm0, xmm1 // x0 x1 + + movq qword ptr [edi], xmm0 + lea edi, [edi + 8] + + xloop29: + test ecx, 1 + je xloop99 + + // 1 Pixels. + movd xmm0, [esi + eax * 4] // 1 source x2 pixels + movd dword ptr [edi], xmm0 + align 4 + xloop99: + + pop esi + pop edi + ret + } +} + +// Bilinear row filtering combines 2x1 -> 1x1. SSSE3 version. +// TODO(fbarchard): Port to Neon + +// Shuffle table for arranging 2 pixels into pairs for pmaddubsw +static uvec8 kShuffleColARGB = { + 0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel + 8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel +}; + +// Shuffle table for duplicating 2 fractions into 8 bytes each +static uvec8 kShuffleFractions = { + 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, +}; + +__declspec(naked) __declspec(align(16)) +void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx) { + __asm { + push esi + push edi + mov edi, [esp + 8 + 4] // dst_argb + mov esi, [esp + 8 + 8] // src_argb + mov ecx, [esp + 8 + 12] // dst_width + movd xmm2, [esp + 8 + 16] // x + movd xmm3, [esp + 8 + 20] // dx + movdqa xmm4, kShuffleColARGB + movdqa xmm5, kShuffleFractions + pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction. + psrlw xmm6, 9 + pextrw eax, xmm2, 1 // get x0 integer. preroll + sub ecx, 2 + jl xloop29 + + movdqa xmm0, xmm2 // x1 = x0 + dx + paddd xmm0, xmm3 + punpckldq xmm2, xmm0 // x0 x1 + punpckldq xmm3, xmm3 // dx dx + paddd xmm3, xmm3 // dx * 2, dx * 2 + pextrw edx, xmm2, 3 // get x1 integer. preroll + + // 2 Pixel loop. + align 4 + xloop2: + movdqa xmm1, xmm2 // x0, x1 fractions. + paddd xmm2, xmm3 // x += dx + movq xmm0, qword ptr [esi + eax * 4] // 2 source x0 pixels + psrlw xmm1, 9 // 7 bit fractions. 
+ movhps xmm0, qword ptr [esi + edx * 4] // 2 source x1 pixels + pshufb xmm1, xmm5 // 0000000011111111 + pshufb xmm0, xmm4 // arrange pixels into pairs + pxor xmm1, xmm6 // 0..7f and 7f..0 + pmaddubsw xmm0, xmm1 // argb_argb 16 bit, 2 pixels. + pextrw eax, xmm2, 1 // get x0 integer. next iteration. + pextrw edx, xmm2, 3 // get x1 integer. next iteration. + psrlw xmm0, 7 // argb 8.7 fixed point to low 8 bits. + packuswb xmm0, xmm0 // argb_argb 8 bits, 2 pixels. + movq qword ptr [edi], xmm0 + lea edi, [edi + 8] + sub ecx, 2 // 2 pixels + jge xloop2 + + align 4 + xloop29: + + add ecx, 2 - 1 + jl xloop99 + + // 1 pixel remainder + psrlw xmm2, 9 // 7 bit fractions. + movq xmm0, qword ptr [esi + eax * 4] // 2 source x0 pixels + pshufb xmm2, xmm5 // 00000000 + pshufb xmm0, xmm4 // arrange pixels into pairs + pxor xmm2, xmm6 // 0..7f and 7f..0 + pmaddubsw xmm0, xmm2 // argb 16 bit, 1 pixel. + psrlw xmm0, 7 + packuswb xmm0, xmm0 // argb 8 bits, 1 pixel. + movd [edi], xmm0 + + align 4 + xloop99: + + pop edi + pop esi + ret + } +} + +// Reads 4 pixels, duplicates them and writes 8 pixels. +// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. +__declspec(naked) __declspec(align(16)) +void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb, + int dst_width, int /* x */, int /* dx */) { + __asm { + mov edx, [esp + 4] // dst_argb + mov eax, [esp + 8] // src_argb + mov ecx, [esp + 12] // dst_width + + align 4 + wloop: + movdqa xmm0, [eax] + lea eax, [eax + 16] + movdqa xmm1, xmm0 + punpckldq xmm0, xmm0 + punpckhdq xmm1, xmm1 + sub ecx, 8 + movdqa [edx], xmm0 + movdqa [edx + 16], xmm1 + lea edx, [edx + 32] + jg wloop + + ret + } +} + +#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/chromium/third_party/libyuv/tools/valgrind-libyuv/libyuv_tests.py b/chromium/third_party/libyuv/tools/valgrind-libyuv/libyuv_tests.py index f93e97bb71f..1b912b8ba82 100755 --- a/chromium/third_party/libyuv/tools/valgrind-libyuv/libyuv_tests.py +++ b/chromium/third_party/libyuv/tools/valgrind-libyuv/libyuv_tests.py @@ -73,9 +73,10 @@ class LibyuvTest(chrome_tests.ChromeTests): def main(_): parser = optparse.OptionParser('usage: %prog -b <dir> -t <test> <test args>') parser.disable_interspersed_args() - parser.add_option('-b', '--build_dir', + parser.add_option('-b', '--build-dir', help=('Location of the compiler output. Can only be used ' 'when the test argument does not contain this path.')) + parser.add_option("--target", help="Debug or Release") parser.add_option('-t', '--test', help='Test to run.') parser.add_option('', '--baseline', action='store_true', default=False, help='Generate baseline data instead of validating') @@ -104,6 +105,11 @@ def main(_): if not options.test: parser.error('--test not specified') + # Support build dir both with and without the target. + if (options.target and options.build_dir and + not options.build_dir.endswith(options.target)): + options.build_dir = os.path.join(options.build_dir, options.target) + # If --build_dir is provided, prepend it to the test executable if needed. 
test_executable = options.test if options.build_dir and not test_executable.startswith(options.build_dir): diff --git a/chromium/third_party/libyuv/unit_test/compare_test.cc b/chromium/third_party/libyuv/unit_test/compare_test.cc index 7fe6c3b0b19..efc2e39e68f 100644 --- a/chromium/third_party/libyuv/unit_test/compare_test.cc +++ b/chromium/third_party/libyuv/unit_test/compare_test.cc @@ -39,7 +39,7 @@ TEST_F(libyuvTest, Djb2_Test) { " and feels as if he were in the seventh heaven of typography" " together with Hermann Zapf"; uint32 foxhash = HashDjb2(reinterpret_cast<const uint8*>(fox), 131, 5381); - const uint32 kExpectedFoxHash = 2611006483; + const uint32 kExpectedFoxHash = 2611006483u; EXPECT_EQ(kExpectedFoxHash, foxhash); for (int i = 0; i < kMaxTest; ++i) { @@ -286,9 +286,9 @@ TEST_F(libyuvTest, Psnr) { src_b + kSrcStride * b + b, kSrcStride, kSrcWidth, kSrcHeight); - EXPECT_GT(err, 4.0); + EXPECT_GT(err, 2.0); if (kSrcWidth * kSrcHeight >= 256) { - EXPECT_LT(err, 5.0); + EXPECT_LT(err, 6.0); } srandom(time(NULL)); @@ -322,7 +322,7 @@ TEST_F(libyuvTest, Psnr) { free_aligned_buffer_64(src_b) } -TEST_F(libyuvTest, BenchmarkSsim_Opt) { +TEST_F(libyuvTest, DISABLED_BenchmarkSsim_Opt) { align_buffer_64(src_a, benchmark_width_ * benchmark_height_) align_buffer_64(src_b, benchmark_width_ * benchmark_height_) for (int i = 0; i < benchmark_width_ * benchmark_height_; ++i) { diff --git a/chromium/third_party/libyuv/unit_test/convert_test.cc b/chromium/third_party/libyuv/unit_test/convert_test.cc index 7e96c63a4d5..d5eaca0569b 100644 --- a/chromium/third_party/libyuv/unit_test/convert_test.cc +++ b/chromium/third_party/libyuv/unit_test/convert_test.cc @@ -1,990 +1,997 @@ -/*
- * Copyright 2011 The LibYuv Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <stdlib.h>
-#include <time.h>
-
-#include "libyuv/compare.h"
-#include "libyuv/convert.h"
-#include "libyuv/convert_argb.h"
-#include "libyuv/convert_from.h"
-#include "libyuv/convert_from_argb.h"
-#include "libyuv/cpu_id.h"
-#include "libyuv/format_conversion.h"
-#ifdef HAVE_JPEG
-#include "libyuv/mjpeg_decoder.h"
-#endif
-#include "libyuv/planar_functions.h"
-#include "libyuv/rotate.h"
-#include "../unit_test/unit_test.h"
-
-#if defined(_MSC_VER)
-#define SIMD_ALIGNED(var) __declspec(align(16)) var
-#else // __GNUC__
-#define SIMD_ALIGNED(var) var __attribute__((aligned(16)))
-#endif
-
-namespace libyuv {
-
-#define SUBSAMPLE(v, a) ((((v) + (a) - 1)) / (a))
-
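The SUBSAMPLE macro above is a ceiling division used throughout these tests to size the chroma planes, so odd widths and heights still get a final (partial) chroma sample instead of being truncated. A minimal standalone sketch of the same arithmetic (the helper name is illustrative, not from libyuv):

#include <assert.h>

// Ceiling division, matching SUBSAMPLE(v, a) = (v + a - 1) / a.
static int Subsample(int v, int a) {
  return (v + a - 1) / a;
}

int main() {
  // I420 chroma planes are subsampled by 2 in both directions.
  assert(Subsample(1280, 2) == 640);  // even width: exact half
  assert(Subsample(1279, 2) == 640);  // odd width: rounded up, not truncated
  // I411 subsamples horizontally by 4.
  assert(Subsample(1279, 4) == 320);
  return 0;
}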
-#define TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
- FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG, OFF) \
-TEST_F(libyuvTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \
- const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
- const int kHeight = benchmark_height_; \
- align_buffer_64(src_y, kWidth * kHeight + OFF); \
- align_buffer_64(src_u, \
- SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + OFF); \
- align_buffer_64(src_v, \
- SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + OFF); \
- align_buffer_64(dst_y_c, kWidth * kHeight); \
- align_buffer_64(dst_u_c, \
- SUBSAMPLE(kWidth, SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- align_buffer_64(dst_v_c, \
- SUBSAMPLE(kWidth, SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- align_buffer_64(dst_y_opt, kWidth * kHeight); \
- align_buffer_64(dst_u_opt, \
- SUBSAMPLE(kWidth, SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- align_buffer_64(dst_v_opt, \
- SUBSAMPLE(kWidth, SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- srandom(time(NULL)); \
- for (int i = 0; i < kHeight; ++i) \
- for (int j = 0; j < kWidth; ++j) \
- src_y[(i * kWidth) + j + OFF] = (random() & 0xff); \
- for (int i = 0; i < SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); ++i) { \
- for (int j = 0; j < SUBSAMPLE(kWidth, SRC_SUBSAMP_X); ++j) { \
- src_u[(i * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) + j + OFF] = \
- (random() & 0xff); \
- src_v[(i * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) + j + OFF] = \
- (random() & 0xff); \
- } \
- } \
- MaskCpuFlags(0); \
- SRC_FMT_PLANAR##To##FMT_PLANAR(src_y + OFF, kWidth, \
- src_u + OFF, \
- SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
- src_v + OFF, \
- SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
- dst_y_c, kWidth, \
- dst_u_c, SUBSAMPLE(kWidth, SUBSAMP_X), \
- dst_v_c, SUBSAMPLE(kWidth, SUBSAMP_X), \
- kWidth, NEG kHeight); \
- MaskCpuFlags(-1); \
- for (int i = 0; i < benchmark_iterations_; ++i) { \
- SRC_FMT_PLANAR##To##FMT_PLANAR(src_y + OFF, kWidth, \
- src_u + OFF, \
- SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
- src_v + OFF, \
- SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
- dst_y_opt, kWidth, \
- dst_u_opt, SUBSAMPLE(kWidth, SUBSAMP_X), \
- dst_v_opt, SUBSAMPLE(kWidth, SUBSAMP_X), \
- kWidth, NEG kHeight); \
- } \
- int max_diff = 0; \
- for (int i = 0; i < kHeight; ++i) { \
- for (int j = 0; j < kWidth; ++j) { \
- int abs_diff = \
- abs(static_cast<int>(dst_y_c[i * kWidth + j]) - \
- static_cast<int>(dst_y_opt[i * kWidth + j])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
- } \
- } \
- EXPECT_LE(max_diff, 1); \
- for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
- for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \
- int abs_diff = \
- abs(static_cast<int>(dst_u_c[i * \
- SUBSAMPLE(kWidth, SUBSAMP_X) + j]) - \
- static_cast<int>(dst_u_opt[i * \
- SUBSAMPLE(kWidth, SUBSAMP_X) + j])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
- } \
- } \
- EXPECT_LE(max_diff, 1); \
- for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
- for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \
- int abs_diff = \
- abs(static_cast<int>(dst_v_c[i * \
- SUBSAMPLE(kWidth, SUBSAMP_X) + j]) - \
- static_cast<int>(dst_v_opt[i * \
- SUBSAMPLE(kWidth, SUBSAMP_X) + j])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
- } \
- } \
- EXPECT_LE(max_diff, 1); \
- free_aligned_buffer_64(dst_y_c) \
- free_aligned_buffer_64(dst_u_c) \
- free_aligned_buffer_64(dst_v_c) \
- free_aligned_buffer_64(dst_y_opt) \
- free_aligned_buffer_64(dst_u_opt) \
- free_aligned_buffer_64(dst_v_opt) \
- free_aligned_buffer_64(src_y) \
- free_aligned_buffer_64(src_u) \
- free_aligned_buffer_64(src_v) \
-}
-
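The TESTPLANARTOPI macro above follows one fixed pattern: run the conversion once with SIMD masked off (MaskCpuFlags(0)) to produce a C reference, restore all CPU paths (MaskCpuFlags(-1)), run the optimized conversion in the benchmark loop, then compare the two outputs plane by plane using the maximum absolute byte difference. A minimal sketch of that comparison step in isolation; the Convert() call and buffer names in the usage comment are placeholders, not libyuv APIs:

#include <cstdint>
#include <cstdlib>

// Maximum absolute per-byte difference between the C-path and the
// optimized-path output, as the macro computes it inline.
static int MaxAbsDiff(const uint8_t* c_out, const uint8_t* opt_out, int len) {
  int max_diff = 0;
  for (int i = 0; i < len; ++i) {
    int abs_diff = abs(static_cast<int>(c_out[i]) - static_cast<int>(opt_out[i]));
    if (abs_diff > max_diff) {
      max_diff = abs_diff;
    }
  }
  return max_diff;
}

// Usage inside a test, roughly:
//   MaskCpuFlags(0);               // force the C path
//   Convert(..., dst_c, ...);      // placeholder for the generated call
//   MaskCpuFlags(-1);              // re-enable all SIMD paths
//   Convert(..., dst_opt, ...);
//   EXPECT_LE(MaxAbsDiff(dst_c, dst_opt, size), tolerance);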
-#define TESTPLANARTOP(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
- FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \
- TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
- FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- benchmark_width_ - 4, _Any, +, 0) \
- TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
- FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- benchmark_width_, _Unaligned, +, 1) \
- TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
- FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- benchmark_width_, _Invert, -, 0) \
- TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
- FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- benchmark_width_, _Opt, +, 0)
-
-TESTPLANARTOP(I420, 2, 2, I420, 2, 2)
-TESTPLANARTOP(I422, 2, 1, I420, 2, 2)
-TESTPLANARTOP(I444, 1, 1, I420, 2, 2)
-TESTPLANARTOP(I411, 4, 1, I420, 2, 2)
-TESTPLANARTOP(I420, 2, 2, I422, 2, 1)
-TESTPLANARTOP(I420, 2, 2, I444, 1, 1)
-TESTPLANARTOP(I420, 2, 2, I411, 4, 1)
-TESTPLANARTOP(I420, 2, 2, I420Mirror, 2, 2)
-TESTPLANARTOP(I422, 2, 1, I422, 2, 1)
-TESTPLANARTOP(I444, 1, 1, I444, 1, 1)
-
-#define TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
- FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG, OFF) \
-TEST_F(libyuvTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \
- const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
- const int kHeight = benchmark_height_; \
- align_buffer_64(src_y, kWidth * kHeight + OFF); \
- align_buffer_64(src_u, \
- SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + OFF); \
- align_buffer_64(src_v, \
- SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + OFF); \
- align_buffer_64(dst_y_c, kWidth * kHeight); \
- align_buffer_64(dst_uv_c, SUBSAMPLE(kWidth * 2, SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- align_buffer_64(dst_y_opt, kWidth * kHeight); \
- align_buffer_64(dst_uv_opt, SUBSAMPLE(kWidth * 2, SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- srandom(time(NULL)); \
- for (int i = 0; i < kHeight; ++i) \
- for (int j = 0; j < kWidth; ++j) \
- src_y[(i * kWidth) + j + OFF] = (random() & 0xff); \
- for (int i = 0; i < SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); ++i) { \
- for (int j = 0; j < SUBSAMPLE(kWidth, SRC_SUBSAMP_X); ++j) { \
- src_u[(i * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) + j + OFF] = \
- (random() & 0xff); \
- src_v[(i * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) + j + OFF] = \
- (random() & 0xff); \
- } \
- } \
- MaskCpuFlags(0); \
- SRC_FMT_PLANAR##To##FMT_PLANAR(src_y + OFF, kWidth, \
- src_u + OFF, \
- SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
- src_v + OFF, \
- SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
- dst_y_c, kWidth, \
- dst_uv_c, SUBSAMPLE(kWidth * 2, SUBSAMP_X), \
- kWidth, NEG kHeight); \
- MaskCpuFlags(-1); \
- for (int i = 0; i < benchmark_iterations_; ++i) { \
- SRC_FMT_PLANAR##To##FMT_PLANAR(src_y + OFF, kWidth, \
- src_u + OFF, \
- SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
- src_v + OFF, \
- SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
- dst_y_opt, kWidth, \
- dst_uv_opt, \
- SUBSAMPLE(kWidth * 2, SUBSAMP_X), \
- kWidth, NEG kHeight); \
- } \
- int max_diff = 0; \
- for (int i = 0; i < kHeight; ++i) { \
- for (int j = 0; j < kWidth; ++j) { \
- int abs_diff = \
- abs(static_cast<int>(dst_y_c[i * kWidth + j]) - \
- static_cast<int>(dst_y_opt[i * kWidth + j])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
- } \
- } \
- EXPECT_LE(max_diff, 1); \
- for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
- for (int j = 0; j < SUBSAMPLE(kWidth * 2, SUBSAMP_X); ++j) { \
- int abs_diff = \
- abs(static_cast<int>(dst_uv_c[i * \
- SUBSAMPLE(kWidth * 2, SUBSAMP_X) + j]) - \
- static_cast<int>(dst_uv_opt[i * \
- SUBSAMPLE(kWidth * 2, SUBSAMP_X) + j])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
- } \
- } \
- EXPECT_LE(max_diff, 1); \
- free_aligned_buffer_64(dst_y_c) \
- free_aligned_buffer_64(dst_uv_c) \
- free_aligned_buffer_64(dst_y_opt) \
- free_aligned_buffer_64(dst_uv_opt) \
- free_aligned_buffer_64(src_y) \
- free_aligned_buffer_64(src_u) \
- free_aligned_buffer_64(src_v) \
-}
-
-#define TESTPLANARTOBP(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
- FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \
- TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
- FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- benchmark_width_ - 4, _Any, +, 0) \
- TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
- FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- benchmark_width_, _Unaligned, +, 1) \
- TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
- FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- benchmark_width_, _Invert, -, 0) \
- TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
- FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- benchmark_width_, _Opt, +, 0)
-
-TESTPLANARTOBP(I420, 2, 2, NV12, 2, 2)
-TESTPLANARTOBP(I420, 2, 2, NV21, 2, 2)
-
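NV12 and NV21 are the "biplanar" targets here: a full-resolution Y plane plus a single half-resolution plane of interleaved chroma (UV order for NV12, VU order for NV21). A small sketch of that interleaving step, which is the chroma-plane work behind I420ToNV12/I420ToNV21; the helper name is illustrative and strides are ignored for brevity:

#include <cstdint>

// Interleave separate U and V planes into one UV (NV12) or VU (NV21) plane.
static void MergeChromaPlanes(const uint8_t* src_u, const uint8_t* src_v,
                              uint8_t* dst_uv, int chroma_width,
                              int chroma_height, bool v_first /* NV21 */) {
  for (int y = 0; y < chroma_height; ++y) {
    for (int x = 0; x < chroma_width; ++x) {
      const int i = y * chroma_width + x;
      dst_uv[2 * i + 0] = v_first ? src_v[i] : src_u[i];
      dst_uv[2 * i + 1] = v_first ? src_u[i] : src_v[i];
    }
  }
}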
-#define TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
- FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG, OFF) \
-TEST_F(libyuvTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \
- const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
- const int kHeight = benchmark_height_; \
- align_buffer_64(src_y, kWidth * kHeight + OFF); \
- align_buffer_64(src_uv, 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + OFF); \
- align_buffer_64(dst_y_c, kWidth * kHeight); \
- align_buffer_64(dst_u_c, \
- SUBSAMPLE(kWidth, SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- align_buffer_64(dst_v_c, \
- SUBSAMPLE(kWidth, SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- align_buffer_64(dst_y_opt, kWidth * kHeight); \
- align_buffer_64(dst_u_opt, \
- SUBSAMPLE(kWidth, SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- align_buffer_64(dst_v_opt, \
- SUBSAMPLE(kWidth, SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- srandom(time(NULL)); \
- for (int i = 0; i < kHeight; ++i) \
- for (int j = 0; j < kWidth; ++j) \
- src_y[(i * kWidth) + j + OFF] = (random() & 0xff); \
- for (int i = 0; i < SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); ++i) { \
- for (int j = 0; j < 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X); ++j) { \
- src_uv[(i * 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) + j + OFF] = \
- (random() & 0xff); \
- } \
- } \
- MaskCpuFlags(0); \
- SRC_FMT_PLANAR##To##FMT_PLANAR(src_y + OFF, kWidth, \
- src_uv + OFF, \
- 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
- dst_y_c, kWidth, \
- dst_u_c, SUBSAMPLE(kWidth, SUBSAMP_X), \
- dst_v_c, SUBSAMPLE(kWidth, SUBSAMP_X), \
- kWidth, NEG kHeight); \
- MaskCpuFlags(-1); \
- for (int i = 0; i < benchmark_iterations_; ++i) { \
- SRC_FMT_PLANAR##To##FMT_PLANAR(src_y + OFF, kWidth, \
- src_uv + OFF, \
- 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
- dst_y_opt, kWidth, \
- dst_u_opt, SUBSAMPLE(kWidth, SUBSAMP_X), \
- dst_v_opt, SUBSAMPLE(kWidth, SUBSAMP_X), \
- kWidth, NEG kHeight); \
- } \
- int max_diff = 0; \
- for (int i = 0; i < kHeight; ++i) { \
- for (int j = 0; j < kWidth; ++j) { \
- int abs_diff = \
- abs(static_cast<int>(dst_y_c[i * kWidth + j]) - \
- static_cast<int>(dst_y_opt[i * kWidth + j])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
- } \
- } \
- EXPECT_LE(max_diff, 1); \
- for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
- for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \
- int abs_diff = \
- abs(static_cast<int>(dst_u_c[i * \
- SUBSAMPLE(kWidth, SUBSAMP_X) + j]) - \
- static_cast<int>(dst_u_opt[i * \
- SUBSAMPLE(kWidth, SUBSAMP_X) + j])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
- } \
- } \
- EXPECT_LE(max_diff, 1); \
- for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
- for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \
- int abs_diff = \
- abs(static_cast<int>(dst_v_c[i * \
- SUBSAMPLE(kWidth, SUBSAMP_X) + j]) - \
- static_cast<int>(dst_v_opt[i * \
- SUBSAMPLE(kWidth, SUBSAMP_X) + j])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
- } \
- } \
- EXPECT_LE(max_diff, 1); \
- free_aligned_buffer_64(dst_y_c) \
- free_aligned_buffer_64(dst_u_c) \
- free_aligned_buffer_64(dst_v_c) \
- free_aligned_buffer_64(dst_y_opt) \
- free_aligned_buffer_64(dst_u_opt) \
- free_aligned_buffer_64(dst_v_opt) \
- free_aligned_buffer_64(src_y) \
- free_aligned_buffer_64(src_uv) \
-}
-
-#define TESTBIPLANARTOP(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
- FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \
- TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
- FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- benchmark_width_ - 4, _Any, +, 0) \
- TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
- FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- benchmark_width_, _Unaligned, +, 1) \
- TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
- FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- benchmark_width_, _Invert, -, 0) \
- TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
- FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- benchmark_width_, _Opt, +, 0)
-
-TESTBIPLANARTOP(NV12, 2, 2, I420, 2, 2)
-TESTBIPLANARTOP(NV21, 2, 2, I420, 2, 2)
-
-#define TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- W1280, DIFF, N, NEG, OFF, FMT_C, BPP_C) \
-TEST_F(libyuvTest, FMT_PLANAR##To##FMT_B##N) { \
- const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
- const int kHeight = benchmark_height_; \
- const int kStrideB = ((kWidth * BPP_B + ALIGN - 1) / ALIGN) * ALIGN; \
- const int kSizeUV = \
- SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y); \
- align_buffer_64(src_y, kWidth * kHeight + OFF); \
- align_buffer_64(src_u, kSizeUV + OFF); \
- align_buffer_64(src_v, kSizeUV + OFF); \
- align_buffer_64(dst_argb_c, kStrideB * kHeight); \
- align_buffer_64(dst_argb_opt, kStrideB * kHeight); \
- memset(dst_argb_c, 0, kStrideB * kHeight); \
- memset(dst_argb_opt, 0, kStrideB * kHeight); \
- srandom(time(NULL)); \
- for (int i = 0; i < kWidth * kHeight; ++i) { \
- src_y[i + OFF] = (random() & 0xff); \
- } \
- for (int i = 0; i < kSizeUV; ++i) { \
- src_u[i + OFF] = (random() & 0xff); \
- src_v[i + OFF] = (random() & 0xff); \
- } \
- MaskCpuFlags(0); \
- FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, \
- src_u + OFF, SUBSAMPLE(kWidth, SUBSAMP_X), \
- src_v + OFF, SUBSAMPLE(kWidth, SUBSAMP_X), \
- dst_argb_c, kStrideB, \
- kWidth, NEG kHeight); \
- MaskCpuFlags(-1); \
- for (int i = 0; i < benchmark_iterations_; ++i) { \
- FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, \
- src_u + OFF, SUBSAMPLE(kWidth, SUBSAMP_X), \
- src_v + OFF, SUBSAMPLE(kWidth, SUBSAMP_X), \
- dst_argb_opt, kStrideB, \
- kWidth, NEG kHeight); \
- } \
- int max_diff = 0; \
- /* Convert to ARGB so 565 is expanded to bytes that can be compared. */ \
- align_buffer_64(dst_argb32_c, kWidth * BPP_C * kHeight); \
- align_buffer_64(dst_argb32_opt, kWidth * BPP_C * kHeight); \
- memset(dst_argb32_c, 0, kWidth * BPP_C * kHeight); \
- memset(dst_argb32_opt, 0, kWidth * BPP_C * kHeight); \
- FMT_B##To##FMT_C(dst_argb_c, kStrideB, \
- dst_argb32_c, kWidth * BPP_C , \
- kWidth, kHeight); \
- FMT_B##To##FMT_C(dst_argb_opt, kStrideB, \
- dst_argb32_opt, kWidth * BPP_C , \
- kWidth, kHeight); \
- for (int i = 0; i < kWidth * BPP_C * kHeight; ++i) { \
- int abs_diff = \
- abs(static_cast<int>(dst_argb32_c[i]) - \
- static_cast<int>(dst_argb32_opt[i])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
- } \
- EXPECT_LE(max_diff, DIFF); \
- free_aligned_buffer_64(src_y) \
- free_aligned_buffer_64(src_u) \
- free_aligned_buffer_64(src_v) \
- free_aligned_buffer_64(dst_argb_c) \
- free_aligned_buffer_64(dst_argb_opt) \
- free_aligned_buffer_64(dst_argb32_c) \
- free_aligned_buffer_64(dst_argb32_opt) \
-}
-
-#define TESTPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- DIFF, FMT_C, BPP_C) \
- TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- benchmark_width_ - 4, DIFF, _Any, +, 0, FMT_C, BPP_C) \
- TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- benchmark_width_, DIFF, _Unaligned, +, 1, FMT_C, BPP_C) \
- TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- benchmark_width_, DIFF, _Invert, -, 0, FMT_C, BPP_C) \
- TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
- benchmark_width_, DIFF, _Opt, +, 0, FMT_C, BPP_C)
-
-TESTPLANARTOB(I420, 2, 2, ARGB, 4, 4, 2, ARGB, 4)
-TESTPLANARTOB(I420, 2, 2, BGRA, 4, 4, 2, ARGB, 4)
-TESTPLANARTOB(I420, 2, 2, ABGR, 4, 4, 2, ARGB, 4)
-TESTPLANARTOB(I420, 2, 2, RGBA, 4, 4, 2, ARGB, 4)
-TESTPLANARTOB(I420, 2, 2, RAW, 3, 3, 2, ARGB, 4)
-TESTPLANARTOB(I420, 2, 2, RGB24, 3, 3, 2, ARGB, 4)
-TESTPLANARTOB(I420, 2, 2, RGB565, 2, 2, 9, ARGB, 4)
-TESTPLANARTOB(I420, 2, 2, ARGB1555, 2, 2, 9, ARGB, 4)
-TESTPLANARTOB(I420, 2, 2, ARGB4444, 2, 2, 17, ARGB, 4)
-TESTPLANARTOB(I422, 2, 1, ARGB, 4, 4, 2, ARGB, 4)
-TESTPLANARTOB(I422, 2, 1, BGRA, 4, 4, 2, ARGB, 4)
-TESTPLANARTOB(I422, 2, 1, ABGR, 4, 4, 2, ARGB, 4)
-TESTPLANARTOB(I422, 2, 1, RGBA, 4, 4, 2, ARGB, 4)
-TESTPLANARTOB(I411, 4, 1, ARGB, 4, 4, 2, ARGB, 4)
-TESTPLANARTOB(I444, 1, 1, ARGB, 4, 4, 2, ARGB, 4)
-TESTPLANARTOB(I420, 2, 2, YUY2, 2, 4, 1, ARGB, 4)
-TESTPLANARTOB(I420, 2, 2, UYVY, 2, 4, 1, ARGB, 4)
-TESTPLANARTOB(I422, 2, 1, YUY2, 2, 4, 0, ARGB, 4)
-TESTPLANARTOB(I422, 2, 1, UYVY, 2, 4, 0, ARGB, 4)
-TESTPLANARTOB(I420, 2, 2, I400, 1, 1, 0, ARGB, 4)
-TESTPLANARTOB(I420, 2, 2, BayerBGGR, 1, 1, 2, ARGB, 4)
-TESTPLANARTOB(I420, 2, 2, BayerRGGB, 1, 1, 2, ARGB, 4)
-TESTPLANARTOB(I420, 2, 2, BayerGBRG, 1, 1, 2, ARGB, 4)
-TESTPLANARTOB(I420, 2, 2, BayerGRBG, 1, 1, 2, ARGB, 4)
-
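Note the DIFF argument in the list above: the 16-bit targets (RGB565, ARGB1555, ARGB4444) are compared only after converting both results back to ARGB, and with a wider tolerance, because packing 8-bit channels into 5/6/4 bits and expanding them again is lossy. A minimal sketch of that round trip for RGB565; bit-replication on expansion is an assumption about the expected behaviour, not code copied from libyuv:

#include <cstdint>

// Pack 8-bit RGB into RGB565, then expand back by bit replication.
static void Rgb565RoundTrip(uint8_t r, uint8_t g, uint8_t b,
                            uint8_t* r_out, uint8_t* g_out, uint8_t* b_out) {
  const uint16_t packed = static_cast<uint16_t>(
      ((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3));
  const uint8_t r5 = (packed >> 11) & 0x1f;
  const uint8_t g6 = (packed >> 5) & 0x3f;
  const uint8_t b5 = packed & 0x1f;
  *r_out = (r5 << 3) | (r5 >> 2);  // may differ from r by up to 7
  *g_out = (g6 << 2) | (g6 >> 4);  // may differ from g by up to 3
  *b_out = (b5 << 3) | (b5 >> 2);  // may differ from b by up to 7
}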
-#define TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \
- W1280, DIFF, N, NEG, OFF) \
-TEST_F(libyuvTest, FMT_PLANAR##To##FMT_B##N) { \
- const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
- const int kHeight = benchmark_height_; \
- const int kStrideB = kWidth * BPP_B; \
- align_buffer_64(src_y, kWidth * kHeight + OFF); \
- align_buffer_64(src_uv, \
- SUBSAMPLE(kWidth, SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SUBSAMP_Y) * 2 + OFF); \
- align_buffer_64(dst_argb_c, kStrideB * kHeight); \
- align_buffer_64(dst_argb_opt, kStrideB * kHeight); \
- srandom(time(NULL)); \
- for (int i = 0; i < kHeight; ++i) \
- for (int j = 0; j < kWidth; ++j) \
- src_y[(i * kWidth) + j + OFF] = (random() & 0xff); \
- for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) \
- for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X) * 2; ++j) { \
- src_uv[(i * SUBSAMPLE(kWidth, SUBSAMP_X)) * 2 + j + OFF] = \
- (random() & 0xff); \
- } \
- MaskCpuFlags(0); \
- FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, \
- src_uv + OFF, SUBSAMPLE(kWidth, SUBSAMP_X) * 2, \
- dst_argb_c, kWidth * BPP_B, \
- kWidth, NEG kHeight); \
- MaskCpuFlags(-1); \
- for (int i = 0; i < benchmark_iterations_; ++i) { \
- FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, \
- src_uv + OFF, SUBSAMPLE(kWidth, SUBSAMP_X) * 2, \
- dst_argb_opt, kWidth * BPP_B, \
- kWidth, NEG kHeight); \
- } \
- /* Convert to ARGB so 565 is expanded to bytes that can be compared. */ \
- align_buffer_64(dst_argb32_c, kWidth * 4 * kHeight); \
- align_buffer_64(dst_argb32_opt, kWidth * 4 * kHeight); \
- memset(dst_argb32_c, 1, kWidth * 4 * kHeight); \
- memset(dst_argb32_opt, 2, kWidth * 4 * kHeight); \
- FMT_B##ToARGB(dst_argb_c, kStrideB, \
- dst_argb32_c, kWidth * 4, \
- kWidth, kHeight); \
- FMT_B##ToARGB(dst_argb_opt, kStrideB, \
- dst_argb32_opt, kWidth * 4, \
- kWidth, kHeight); \
- int max_diff = 0; \
- for (int i = 0; i < kHeight; ++i) { \
- for (int j = 0; j < kWidth * 4; ++j) { \
- int abs_diff = \
- abs(static_cast<int>(dst_argb32_c[i * kWidth * 4 + j]) - \
- static_cast<int>(dst_argb32_opt[i * kWidth * 4 + j])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
- } \
- } \
- EXPECT_LE(max_diff, DIFF); \
- free_aligned_buffer_64(src_y) \
- free_aligned_buffer_64(src_uv) \
- free_aligned_buffer_64(dst_argb_c) \
- free_aligned_buffer_64(dst_argb_opt) \
- free_aligned_buffer_64(dst_argb32_c) \
- free_aligned_buffer_64(dst_argb32_opt) \
-}
-
-#define TESTBIPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, DIFF) \
- TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \
- benchmark_width_ - 4, DIFF, _Any, +, 0) \
- TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \
- benchmark_width_, DIFF, _Unaligned, +, 1) \
- TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \
- benchmark_width_, DIFF, _Invert, -, 0) \
- TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \
- benchmark_width_, DIFF, _Opt, +, 0)
-
-TESTBIPLANARTOB(NV12, 2, 2, ARGB, 4, 2)
-TESTBIPLANARTOB(NV21, 2, 2, ARGB, 4, 2)
-TESTBIPLANARTOB(NV12, 2, 2, RGB565, 2, 9)
-TESTBIPLANARTOB(NV21, 2, 2, RGB565, 2, 9)
-
-#define TESTATOPLANARI(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- W1280, DIFF, N, NEG, OFF) \
-TEST_F(libyuvTest, FMT_A##To##FMT_PLANAR##N) { \
- const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
- const int kHeight = benchmark_height_; \
- const int kStride = (kWidth * 8 * BPP_A + 7) / 8; \
- align_buffer_64(src_argb, kStride * kHeight + OFF); \
- align_buffer_64(dst_y_c, kWidth * kHeight); \
- align_buffer_64(dst_u_c, \
- SUBSAMPLE(kWidth, SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- align_buffer_64(dst_v_c, \
- SUBSAMPLE(kWidth, SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- align_buffer_64(dst_y_opt, kWidth * kHeight); \
- align_buffer_64(dst_u_opt, \
- SUBSAMPLE(kWidth, SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- align_buffer_64(dst_v_opt, \
- SUBSAMPLE(kWidth, SUBSAMP_X) * \
- SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- memset(dst_y_c, 1, kWidth * kHeight); \
- memset(dst_u_c, 0, \
- SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- memset(dst_v_c, 0, \
- SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- memset(dst_y_opt, 2, kWidth * kHeight); \
- memset(dst_u_opt, 0, \
- SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- memset(dst_v_opt, 0, \
- SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- srandom(time(NULL)); \
- for (int i = 0; i < kHeight; ++i) \
- for (int j = 0; j < kStride; ++j) \
- src_argb[(i * kStride) + j + OFF] = (random() & 0xff); \
- MaskCpuFlags(0); \
- FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, \
- dst_y_c, kWidth, \
- dst_u_c, SUBSAMPLE(kWidth, SUBSAMP_X), \
- dst_v_c, SUBSAMPLE(kWidth, SUBSAMP_X), \
- kWidth, NEG kHeight); \
- MaskCpuFlags(-1); \
- for (int i = 0; i < benchmark_iterations_; ++i) { \
- FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, \
- dst_y_opt, kWidth, \
- dst_u_opt, SUBSAMPLE(kWidth, SUBSAMP_X), \
- dst_v_opt, SUBSAMPLE(kWidth, SUBSAMP_X), \
- kWidth, NEG kHeight); \
- } \
- int max_diff = 0; \
- for (int i = 0; i < kHeight; ++i) { \
- for (int j = 0; j < kWidth; ++j) { \
- int abs_diff = \
- abs(static_cast<int>(dst_y_c[i * kWidth + j]) - \
- static_cast<int>(dst_y_opt[i * kWidth + j])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
- } \
- } \
- EXPECT_LE(max_diff, DIFF); \
- for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
- for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \
- int abs_diff = \
- abs(static_cast<int>(dst_u_c[i * \
- SUBSAMPLE(kWidth, SUBSAMP_X) + j]) - \
- static_cast<int>(dst_u_opt[i * \
- SUBSAMPLE(kWidth, SUBSAMP_X) + j])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
- } \
- } \
- EXPECT_LE(max_diff, DIFF); \
- for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
- for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \
- int abs_diff = \
- abs(static_cast<int>(dst_v_c[i * \
- SUBSAMPLE(kWidth, SUBSAMP_X) + j]) - \
- static_cast<int>(dst_v_opt[i * \
- SUBSAMPLE(kWidth, SUBSAMP_X) + j])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
- } \
- } \
- EXPECT_LE(max_diff, DIFF); \
- free_aligned_buffer_64(dst_y_c) \
- free_aligned_buffer_64(dst_u_c) \
- free_aligned_buffer_64(dst_v_c) \
- free_aligned_buffer_64(dst_y_opt) \
- free_aligned_buffer_64(dst_u_opt) \
- free_aligned_buffer_64(dst_v_opt) \
- free_aligned_buffer_64(src_argb) \
-}
-
-#define TESTATOPLANAR(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, DIFF) \
- TESTATOPLANARI(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- benchmark_width_ - 4, DIFF, _Any, +, 0) \
- TESTATOPLANARI(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- benchmark_width_, DIFF, _Unaligned, +, 1) \
- TESTATOPLANARI(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- benchmark_width_, DIFF, _Invert, -, 0) \
- TESTATOPLANARI(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- benchmark_width_, DIFF, _Opt, +, 0)
-
-TESTATOPLANAR(ARGB, 4, I420, 2, 2, 4)
-#ifdef __arm__
-TESTATOPLANAR(ARGB, 4, J420, 2, 2, 4)
-#else
-TESTATOPLANAR(ARGB, 4, J420, 2, 2, 0)
-#endif
-TESTATOPLANAR(BGRA, 4, I420, 2, 2, 4)
-TESTATOPLANAR(ABGR, 4, I420, 2, 2, 4)
-TESTATOPLANAR(RGBA, 4, I420, 2, 2, 4)
-TESTATOPLANAR(RAW, 3, I420, 2, 2, 4)
-TESTATOPLANAR(RGB24, 3, I420, 2, 2, 4)
-TESTATOPLANAR(RGB565, 2, I420, 2, 2, 5)
-// TODO(fbarchard): Make 1555 neon work same as C code, reduce to diff 9.
-TESTATOPLANAR(ARGB1555, 2, I420, 2, 2, 15)
-TESTATOPLANAR(ARGB4444, 2, I420, 2, 2, 17)
-TESTATOPLANAR(ARGB, 4, I411, 4, 1, 4)
-TESTATOPLANAR(ARGB, 4, I422, 2, 1, 2)
-TESTATOPLANAR(ARGB, 4, I444, 1, 1, 2)
-TESTATOPLANAR(YUY2, 2, I420, 2, 2, 2)
-TESTATOPLANAR(UYVY, 2, I420, 2, 2, 2)
-TESTATOPLANAR(YUY2, 2, I422, 2, 1, 2)
-TESTATOPLANAR(UYVY, 2, I422, 2, 1, 2)
-TESTATOPLANAR(I400, 1, I420, 2, 2, 2)
-TESTATOPLANAR(BayerBGGR, 1, I420, 2, 2, 4)
-TESTATOPLANAR(BayerRGGB, 1, I420, 2, 2, 4)
-TESTATOPLANAR(BayerGBRG, 1, I420, 2, 2, 4)
-TESTATOPLANAR(BayerGRBG, 1, I420, 2, 2, 4)
-
-#define TESTATOBIPLANARI(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- W1280, N, NEG, OFF) \
-TEST_F(libyuvTest, FMT_A##To##FMT_PLANAR##N) { \
- const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
- const int kHeight = benchmark_height_; \
- const int kStride = (kWidth * 8 * BPP_A + 7) / 8; \
- align_buffer_64(src_argb, kStride * kHeight + OFF); \
- align_buffer_64(dst_y_c, kWidth * kHeight); \
- align_buffer_64(dst_uv_c, \
- SUBSAMPLE(kWidth, SUBSAMP_X) * 2 * \
- SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- align_buffer_64(dst_y_opt, kWidth * kHeight); \
- align_buffer_64(dst_uv_opt, \
- SUBSAMPLE(kWidth, SUBSAMP_X) * 2 * \
- SUBSAMPLE(kHeight, SUBSAMP_Y)); \
- srandom(time(NULL)); \
- for (int i = 0; i < kHeight; ++i) \
- for (int j = 0; j < kStride; ++j) \
- src_argb[(i * kStride) + j + OFF] = (random() & 0xff); \
- MaskCpuFlags(0); \
- FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, \
- dst_y_c, kWidth, \
- dst_uv_c, SUBSAMPLE(kWidth, SUBSAMP_X) * 2, \
- kWidth, NEG kHeight); \
- MaskCpuFlags(-1); \
- for (int i = 0; i < benchmark_iterations_; ++i) { \
- FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, \
- dst_y_opt, kWidth, \
- dst_uv_opt, SUBSAMPLE(kWidth, SUBSAMP_X) * 2, \
- kWidth, NEG kHeight); \
- } \
- int max_diff = 0; \
- for (int i = 0; i < kHeight; ++i) { \
- for (int j = 0; j < kWidth; ++j) { \
- int abs_diff = \
- abs(static_cast<int>(dst_y_c[i * kWidth + j]) - \
- static_cast<int>(dst_y_opt[i * kWidth + j])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
- } \
- } \
- EXPECT_LE(max_diff, 4); \
- for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
- for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X) * 2; ++j) { \
- int abs_diff = \
- abs(static_cast<int>(dst_uv_c[i * \
- SUBSAMPLE(kWidth, SUBSAMP_X) * 2 + j]) - \
- static_cast<int>(dst_uv_opt[i * \
- SUBSAMPLE(kWidth, SUBSAMP_X) * 2 + j])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
- } \
- } \
- EXPECT_LE(max_diff, 4); \
- free_aligned_buffer_64(dst_y_c) \
- free_aligned_buffer_64(dst_uv_c) \
- free_aligned_buffer_64(dst_y_opt) \
- free_aligned_buffer_64(dst_uv_opt) \
- free_aligned_buffer_64(src_argb) \
-}
-
-#define TESTATOBIPLANAR(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \
- TESTATOBIPLANARI(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- benchmark_width_ - 4, _Any, +, 0) \
- TESTATOBIPLANARI(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- benchmark_width_, _Unaligned, +, 1) \
- TESTATOBIPLANARI(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- benchmark_width_, _Invert, -, 0) \
- TESTATOBIPLANARI(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
- benchmark_width_, _Opt, +, 0)
-
-TESTATOBIPLANAR(ARGB, 4, NV12, 2, 2)
-TESTATOBIPLANAR(ARGB, 4, NV21, 2, 2)
-
-#define TESTATOBI(FMT_A, BPP_A, STRIDE_A, \
- FMT_B, BPP_B, STRIDE_B, \
- W1280, DIFF, N, NEG, OFF) \
-TEST_F(libyuvTest, FMT_A##To##FMT_B##N) { \
- const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
- const int kHeight = benchmark_height_; \
- const int kStrideA = (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A; \
- const int kStrideB = (kWidth * BPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B; \
- align_buffer_64(src_argb, kStrideA * kHeight + OFF); \
- align_buffer_64(dst_argb_c, kStrideB * kHeight); \
- align_buffer_64(dst_argb_opt, kStrideB * kHeight); \
- memset(dst_argb_c, 0, kStrideB * kHeight); \
- memset(dst_argb_opt, 0, kStrideB * kHeight); \
- srandom(time(NULL)); \
- for (int i = 0; i < kStrideA * kHeight; ++i) { \
- src_argb[i + OFF] = (random() & 0xff); \
- } \
- MaskCpuFlags(0); \
- FMT_A##To##FMT_B(src_argb + OFF, kStrideA, \
- dst_argb_c, kStrideB, \
- kWidth, NEG kHeight); \
- MaskCpuFlags(-1); \
- for (int i = 0; i < benchmark_iterations_; ++i) { \
- FMT_A##To##FMT_B(src_argb + OFF, kStrideA, \
- dst_argb_opt, kStrideB, \
- kWidth, NEG kHeight); \
- } \
- int max_diff = 0; \
- for (int i = 0; i < kStrideB * kHeight; ++i) { \
- int abs_diff = \
- abs(static_cast<int>(dst_argb_c[i]) - \
- static_cast<int>(dst_argb_opt[i])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
- } \
- EXPECT_LE(max_diff, DIFF); \
- free_aligned_buffer_64(src_argb) \
- free_aligned_buffer_64(dst_argb_c) \
- free_aligned_buffer_64(dst_argb_opt) \
-}
-
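kStrideA and kStrideB in the macro above round the row size up to a multiple of STRIDE_A/STRIDE_B, so formats such as RGB24 (3 bytes per pixel) and the 1-byte Bayer formats (2-byte stride alignment) get legal strides for any test width. The same round-up as a standalone helper with a few worked values (the helper name is hypothetical):

#include <assert.h>

// Round width * bpp up to the next multiple of align, matching
// kStrideA = (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A.
static int RoundedStride(int width, int bpp, int align) {
  return (width * bpp + align - 1) / align * align;
}

int main() {
  assert(RoundedStride(1276, 4, 4) == 5104);  // ARGB: already a multiple of 4
  assert(RoundedStride(1277, 3, 3) == 3831);  // RGB24: 1277 * 3 exactly
  assert(RoundedStride(1277, 1, 2) == 1278);  // Bayer: 1 bpp, stride padded to even
  return 0;
}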
-#define TESTATOBRANDOM(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, \
- FMT_B, BPP_B, STRIDE_B, HEIGHT_B, DIFF) \
-TEST_F(libyuvTest, FMT_A##To##FMT_B##_Random) { \
- srandom(time(NULL)); \
- for (int times = 0; times < benchmark_iterations_; ++times) { \
- const int kWidth = (random() & 63) + 1; \
- const int kHeight = (random() & 31) + 1; \
- const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A; \
- const int kHeightB = (kHeight + HEIGHT_B - 1) / HEIGHT_B * HEIGHT_B; \
- const int kStrideA = (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A;\
- const int kStrideB = (kWidth * BPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B;\
- align_buffer_page_end(src_argb, kStrideA * kHeightA); \
- align_buffer_page_end(dst_argb_c, kStrideB * kHeightB); \
- align_buffer_page_end(dst_argb_opt, kStrideB * kHeightB); \
- memset(dst_argb_c, 0, kStrideB * kHeightB); \
- memset(dst_argb_opt, 0, kStrideB * kHeightB); \
- for (int i = 0; i < kStrideA * kHeightA; ++i) { \
- src_argb[i] = (random() & 0xff); \
- } \
- MaskCpuFlags(0); \
- FMT_A##To##FMT_B(src_argb, kStrideA, \
- dst_argb_c, kStrideB, \
- kWidth, kHeight); \
- MaskCpuFlags(-1); \
- FMT_A##To##FMT_B(src_argb, kStrideA, \
- dst_argb_opt, kStrideB, \
- kWidth, kHeight); \
- int max_diff = 0; \
- for (int i = 0; i < kStrideB * kHeightB; ++i) { \
- int abs_diff = \
- abs(static_cast<int>(dst_argb_c[i]) - \
- static_cast<int>(dst_argb_opt[i])); \
- if (abs_diff > max_diff) { \
- max_diff = abs_diff; \
- } \
- } \
- EXPECT_LE(max_diff, DIFF); \
- free_aligned_buffer_page_end(src_argb) \
- free_aligned_buffer_page_end(dst_argb_c) \
- free_aligned_buffer_page_end(dst_argb_opt) \
- } \
-}
-
-#define TESTATOB(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, \
- FMT_B, BPP_B, STRIDE_B, HEIGHT_B, DIFF) \
- TESTATOBI(FMT_A, BPP_A, STRIDE_A, \
- FMT_B, BPP_B, STRIDE_B, \
- benchmark_width_ - 4, DIFF, _Any, +, 0) \
- TESTATOBI(FMT_A, BPP_A, STRIDE_A, \
- FMT_B, BPP_B, STRIDE_B, \
- benchmark_width_, DIFF, _Unaligned, +, 1) \
- TESTATOBI(FMT_A, BPP_A, STRIDE_A, \
- FMT_B, BPP_B, STRIDE_B, \
- benchmark_width_, DIFF, _Invert, -, 0) \
- TESTATOBI(FMT_A, BPP_A, STRIDE_A, \
- FMT_B, BPP_B, STRIDE_B, \
- benchmark_width_, DIFF, _Opt, +, 0) \
- TESTATOBRANDOM(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, \
- FMT_B, BPP_B, STRIDE_B, HEIGHT_B, DIFF)
-
-TESTATOB(ARGB, 4, 4, 1, ARGB, 4, 4, 1, 0)
-TESTATOB(ARGB, 4, 4, 1, BGRA, 4, 4, 1, 0)
-TESTATOB(ARGB, 4, 4, 1, ABGR, 4, 4, 1, 0)
-TESTATOB(ARGB, 4, 4, 1, RGBA, 4, 4, 1, 0)
-TESTATOB(ARGB, 4, 4, 1, RAW, 3, 3, 1, 0)
-TESTATOB(ARGB, 4, 4, 1, RGB24, 3, 3, 1, 0)
-TESTATOB(ARGB, 4, 4, 1, RGB565, 2, 2, 1, 0)
-TESTATOB(ARGB, 4, 4, 1, ARGB1555, 2, 2, 1, 0)
-TESTATOB(ARGB, 4, 4, 1, ARGB4444, 2, 2, 1, 0)
-TESTATOB(ARGB, 4, 4, 1, BayerBGGR, 1, 2, 2, 0)
-TESTATOB(ARGB, 4, 4, 1, BayerRGGB, 1, 2, 2, 0)
-TESTATOB(ARGB, 4, 4, 1, BayerGBRG, 1, 2, 2, 0)
-TESTATOB(ARGB, 4, 4, 1, BayerGRBG, 1, 2, 2, 0)
-TESTATOB(ARGB, 4, 4, 1, YUY2, 2, 4, 1, 4)
-TESTATOB(ARGB, 4, 4, 1, UYVY, 2, 4, 1, 4)
-TESTATOB(ARGB, 4, 4, 1, I400, 1, 1, 1, 2)
-TESTATOB(ARGB, 4, 4, 1, J400, 1, 1, 1, 2)
-TESTATOB(BGRA, 4, 4, 1, ARGB, 4, 4, 1, 0)
-TESTATOB(ABGR, 4, 4, 1, ARGB, 4, 4, 1, 0)
-TESTATOB(RGBA, 4, 4, 1, ARGB, 4, 4, 1, 0)
-TESTATOB(RAW, 3, 3, 1, ARGB, 4, 4, 1, 0)
-TESTATOB(RGB24, 3, 3, 1, ARGB, 4, 4, 1, 0)
-TESTATOB(RGB565, 2, 2, 1, ARGB, 4, 4, 1, 0)
-TESTATOB(ARGB1555, 2, 2, 1, ARGB, 4, 4, 1, 0)
-TESTATOB(ARGB4444, 2, 2, 1, ARGB, 4, 4, 1, 0)
-TESTATOB(YUY2, 2, 4, 1, ARGB, 4, 4, 1, 4)
-TESTATOB(UYVY, 2, 4, 1, ARGB, 4, 4, 1, 4)
-TESTATOB(BayerBGGR, 1, 2, 2, ARGB, 4, 4, 1, 0)
-TESTATOB(BayerRGGB, 1, 2, 2, ARGB, 4, 4, 1, 0)
-TESTATOB(BayerGBRG, 1, 2, 2, ARGB, 4, 4, 1, 0)
-TESTATOB(BayerGRBG, 1, 2, 2, ARGB, 4, 4, 1, 0)
-TESTATOB(I400, 1, 1, 1, ARGB, 4, 4, 1, 0)
-TESTATOB(I400, 1, 1, 1, I400, 1, 1, 1, 0)
-TESTATOB(I400, 1, 1, 1, I400Mirror, 1, 1, 1, 0)
-TESTATOB(Y, 1, 1, 1, ARGB, 4, 4, 1, 0)
-TESTATOB(ARGB, 4, 4, 1, ARGBMirror, 4, 4, 1, 0)
-
-TEST_F(libyuvTest, Test565) {
- SIMD_ALIGNED(uint8 orig_pixels[256][4]);
- SIMD_ALIGNED(uint8 pixels565[256][2]);
-
- for (int i = 0; i < 256; ++i) {
- for (int j = 0; j < 4; ++j) {
- orig_pixels[i][j] = i;
- }
- }
- ARGBToRGB565(&orig_pixels[0][0], 0, &pixels565[0][0], 0, 256, 1);
- uint32 checksum = HashDjb2(&pixels565[0][0], sizeof(pixels565), 5381);
- EXPECT_EQ(610919429u, checksum);
-}
-
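Test565 above validates the packed output with HashDjb2 rather than a per-pixel comparison. HashDjb2 is the classic Bernstein hash (hash = hash * 33 + byte), conventionally seeded with 5381, which is why that constant appears in the call; a minimal standalone sketch of the hash family the test relies on (whether it matches libyuv's SIMD implementation byte-for-byte is an assumption):

#include <cstddef>
#include <cstdint>

// DJB2: hash = hash * 33 + byte, seeded with 5381 by convention.
static uint32_t HashDjb2Sketch(const uint8_t* data, size_t len, uint32_t seed) {
  uint32_t hash = seed;
  for (size_t i = 0; i < len; ++i) {
    hash = hash * 33u + data[i];  // equivalently ((hash << 5) + hash) + data[i]
  }
  return hash;
}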
-#ifdef HAVE_JPEG
-TEST_F(libyuvTest, ValidateJpeg) {
- const int kOff = 10;
- const int kMinJpeg = 64;
- const int kImageSize = benchmark_width_ * benchmark_height_ >= kMinJpeg ?
- benchmark_width_ * benchmark_height_ : kMinJpeg;
- const int kSize = kImageSize + kOff;
- align_buffer_64(orig_pixels, kSize);
-
- // No SOI or EOI. Expect fail.
- memset(orig_pixels, 0, kSize);
-
- // EOI, SOI. Expect pass.
- orig_pixels[0] = 0xff;
- orig_pixels[1] = 0xd8; // SOI.
- orig_pixels[kSize - kOff + 0] = 0xff;
- orig_pixels[kSize - kOff + 1] = 0xd9; // EOI.
- for (int times = 0; times < benchmark_iterations_; ++times) {
- EXPECT_TRUE(ValidateJpeg(orig_pixels, kSize));
- }
- free_aligned_buffer_page_end(orig_pixels);
-}
-
-TEST_F(libyuvTest, InvalidateJpeg) {
- const int kOff = 10;
- const int kMinJpeg = 64;
- const int kImageSize = benchmark_width_ * benchmark_height_ >= kMinJpeg ?
- benchmark_width_ * benchmark_height_ : kMinJpeg;
- const int kSize = kImageSize + kOff;
- align_buffer_64(orig_pixels, kSize);
-
- // No SOI or EOI. Expect fail.
- memset(orig_pixels, 0, kSize);
- EXPECT_FALSE(ValidateJpeg(orig_pixels, kSize));
-
- // SOI but no EOI. Expect fail.
- orig_pixels[0] = 0xff;
- orig_pixels[1] = 0xd8; // SOI.
- for (int times = 0; times < benchmark_iterations_; ++times) {
- EXPECT_FALSE(ValidateJpeg(orig_pixels, kSize));
- }
- // EOI but no SOI. Expect fail.
- orig_pixels[0] = 0;
- orig_pixels[1] = 0;
- orig_pixels[kSize - kOff + 0] = 0xff;
- orig_pixels[kSize - kOff + 1] = 0xd9; // EOI.
- EXPECT_FALSE(ValidateJpeg(orig_pixels, kSize));
-
- free_aligned_buffer_page_end(orig_pixels);
-}
-
-#endif
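The two JPEG tests above exercise ValidateJpeg, which, as the test setup itself encodes, expects the buffer to start with the SOI marker 0xFF 0xD8 and to contain the EOI marker 0xFF 0xD9 near the end. A minimal sketch of that marker check, assuming only what the tests demonstrate; libyuv's real ValidateJpeg does more (it scans a trailing window of its own size and reports errors):

#include <cstddef>
#include <cstdint>

// Rough SOI/EOI check in the spirit of the tests above: the buffer must
// begin with SOI (0xFFD8) and contain EOI (0xFFD9) somewhere in the tail.
static bool LooksLikeJpeg(const uint8_t* buf, size_t len) {
  if (len < 4 || buf[0] != 0xff || buf[1] != 0xd8) {
    return false;  // missing SOI
  }
  const size_t kTail = len < 32 ? len : 32;  // tail window size is a guess
  for (size_t i = len - kTail; i + 1 < len; ++i) {
    if (buf[i] == 0xff && buf[i + 1] == 0xd9) {
      return true;  // found EOI
    }
  }
  return false;
}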
-
-} // namespace libyuv
+/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <stdlib.h> +#include <time.h> + +#include "libyuv/compare.h" +#include "libyuv/convert.h" +#include "libyuv/convert_argb.h" +#include "libyuv/convert_from.h" +#include "libyuv/convert_from_argb.h" +#include "libyuv/cpu_id.h" +#include "libyuv/format_conversion.h" +#ifdef HAVE_JPEG +#include "libyuv/mjpeg_decoder.h" +#endif +#include "libyuv/planar_functions.h" +#include "libyuv/rotate.h" +#include "../unit_test/unit_test.h" + +#if defined(_MSC_VER) +#define SIMD_ALIGNED(var) __declspec(align(16)) var +#else // __GNUC__ +#define SIMD_ALIGNED(var) var __attribute__((aligned(16))) +#endif + +namespace libyuv { + +#define SUBSAMPLE(v, a) ((((v) + (a) - 1)) / (a)) + +#define TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ + FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG, OFF) \ +TEST_F(libyuvTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \ + const int kWidth = ((W1280) > 0) ? (W1280) : 1; \ + const int kHeight = benchmark_height_; \ + align_buffer_64(src_y, kWidth * kHeight + OFF); \ + align_buffer_64(src_u, \ + SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * \ + SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + OFF); \ + align_buffer_64(src_v, \ + SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * \ + SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + OFF); \ + align_buffer_64(dst_y_c, kWidth * kHeight); \ + align_buffer_64(dst_u_c, \ + SUBSAMPLE(kWidth, SUBSAMP_X) * \ + SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + align_buffer_64(dst_v_c, \ + SUBSAMPLE(kWidth, SUBSAMP_X) * \ + SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + align_buffer_64(dst_y_opt, kWidth * kHeight); \ + align_buffer_64(dst_u_opt, \ + SUBSAMPLE(kWidth, SUBSAMP_X) * \ + SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + align_buffer_64(dst_v_opt, \ + SUBSAMPLE(kWidth, SUBSAMP_X) * \ + SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + srandom(time(NULL)); \ + for (int i = 0; i < kHeight; ++i) \ + for (int j = 0; j < kWidth; ++j) \ + src_y[(i * kWidth) + j + OFF] = (random() & 0xff); \ + for (int i = 0; i < SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); ++i) { \ + for (int j = 0; j < SUBSAMPLE(kWidth, SRC_SUBSAMP_X); ++j) { \ + src_u[(i * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) + j + OFF] = \ + (random() & 0xff); \ + src_v[(i * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) + j + OFF] = \ + (random() & 0xff); \ + } \ + } \ + MaskCpuFlags(0); \ + SRC_FMT_PLANAR##To##FMT_PLANAR(src_y + OFF, kWidth, \ + src_u + OFF, \ + SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \ + src_v + OFF, \ + SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \ + dst_y_c, kWidth, \ + dst_u_c, SUBSAMPLE(kWidth, SUBSAMP_X), \ + dst_v_c, SUBSAMPLE(kWidth, SUBSAMP_X), \ + kWidth, NEG kHeight); \ + MaskCpuFlags(-1); \ + for (int i = 0; i < benchmark_iterations_; ++i) { \ + SRC_FMT_PLANAR##To##FMT_PLANAR(src_y + OFF, kWidth, \ + src_u + OFF, \ + SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \ + src_v + OFF, \ + SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \ + dst_y_opt, kWidth, \ + dst_u_opt, SUBSAMPLE(kWidth, SUBSAMP_X), \ + dst_v_opt, SUBSAMPLE(kWidth, SUBSAMP_X), \ + kWidth, NEG kHeight); \ + } \ + int max_diff = 0; \ + for (int i = 0; i < kHeight; ++i) { \ + for (int j = 0; j < kWidth; ++j) { \ + int abs_diff = \ + abs(static_cast<int>(dst_y_c[i * kWidth + j]) - \ + static_cast<int>(dst_y_opt[i * kWidth + j])); \ + 
if (abs_diff > max_diff) { \ + max_diff = abs_diff; \ + } \ + } \ + } \ + EXPECT_LE(max_diff, 0); \ + for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \ + for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \ + int abs_diff = \ + abs(static_cast<int>(dst_u_c[i * \ + SUBSAMPLE(kWidth, SUBSAMP_X) + j]) - \ + static_cast<int>(dst_u_opt[i * \ + SUBSAMPLE(kWidth, SUBSAMP_X) + j])); \ + if (abs_diff > max_diff) { \ + max_diff = abs_diff; \ + } \ + } \ + } \ + EXPECT_LE(max_diff, 3); \ + for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \ + for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \ + int abs_diff = \ + abs(static_cast<int>(dst_v_c[i * \ + SUBSAMPLE(kWidth, SUBSAMP_X) + j]) - \ + static_cast<int>(dst_v_opt[i * \ + SUBSAMPLE(kWidth, SUBSAMP_X) + j])); \ + if (abs_diff > max_diff) { \ + max_diff = abs_diff; \ + } \ + } \ + } \ + EXPECT_LE(max_diff, 3); \ + free_aligned_buffer_64(dst_y_c) \ + free_aligned_buffer_64(dst_u_c) \ + free_aligned_buffer_64(dst_v_c) \ + free_aligned_buffer_64(dst_y_opt) \ + free_aligned_buffer_64(dst_u_opt) \ + free_aligned_buffer_64(dst_v_opt) \ + free_aligned_buffer_64(src_y) \ + free_aligned_buffer_64(src_u) \ + free_aligned_buffer_64(src_v) \ +} + +#define TESTPLANARTOP(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ + FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \ + TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ + FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ + benchmark_width_ - 4, _Any, +, 0) \ + TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ + FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ + benchmark_width_, _Unaligned, +, 1) \ + TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ + FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ + benchmark_width_, _Invert, -, 0) \ + TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ + FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ + benchmark_width_, _Opt, +, 0) + +TESTPLANARTOP(I420, 2, 2, I420, 2, 2) +TESTPLANARTOP(I422, 2, 1, I420, 2, 2) +TESTPLANARTOP(I444, 1, 1, I420, 2, 2) +TESTPLANARTOP(I411, 4, 1, I420, 2, 2) +TESTPLANARTOP(I420, 2, 2, I422, 2, 1) +TESTPLANARTOP(I420, 2, 2, I444, 1, 1) +TESTPLANARTOP(I420, 2, 2, I411, 4, 1) +TESTPLANARTOP(I420, 2, 2, I420Mirror, 2, 2) +TESTPLANARTOP(I422, 2, 1, I422, 2, 1) +TESTPLANARTOP(I444, 1, 1, I444, 1, 1) + +#define TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ + FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG, OFF) \ +TEST_F(libyuvTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \ + const int kWidth = ((W1280) > 0) ? 
(W1280) : 1; \ + const int kHeight = benchmark_height_; \ + align_buffer_64(src_y, kWidth * kHeight + OFF); \ + align_buffer_64(src_u, \ + SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * \ + SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + OFF); \ + align_buffer_64(src_v, \ + SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * \ + SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + OFF); \ + align_buffer_64(dst_y_c, kWidth * kHeight); \ + align_buffer_64(dst_uv_c, SUBSAMPLE(kWidth * 2, SUBSAMP_X) * \ + SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + align_buffer_64(dst_y_opt, kWidth * kHeight); \ + align_buffer_64(dst_uv_opt, SUBSAMPLE(kWidth * 2, SUBSAMP_X) * \ + SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + srandom(time(NULL)); \ + for (int i = 0; i < kHeight; ++i) \ + for (int j = 0; j < kWidth; ++j) \ + src_y[(i * kWidth) + j + OFF] = (random() & 0xff); \ + for (int i = 0; i < SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); ++i) { \ + for (int j = 0; j < SUBSAMPLE(kWidth, SRC_SUBSAMP_X); ++j) { \ + src_u[(i * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) + j + OFF] = \ + (random() & 0xff); \ + src_v[(i * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) + j + OFF] = \ + (random() & 0xff); \ + } \ + } \ + MaskCpuFlags(0); \ + SRC_FMT_PLANAR##To##FMT_PLANAR(src_y + OFF, kWidth, \ + src_u + OFF, \ + SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \ + src_v + OFF, \ + SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \ + dst_y_c, kWidth, \ + dst_uv_c, SUBSAMPLE(kWidth * 2, SUBSAMP_X), \ + kWidth, NEG kHeight); \ + MaskCpuFlags(-1); \ + for (int i = 0; i < benchmark_iterations_; ++i) { \ + SRC_FMT_PLANAR##To##FMT_PLANAR(src_y + OFF, kWidth, \ + src_u + OFF, \ + SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \ + src_v + OFF, \ + SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \ + dst_y_opt, kWidth, \ + dst_uv_opt, \ + SUBSAMPLE(kWidth * 2, SUBSAMP_X), \ + kWidth, NEG kHeight); \ + } \ + int max_diff = 0; \ + for (int i = 0; i < kHeight; ++i) { \ + for (int j = 0; j < kWidth; ++j) { \ + int abs_diff = \ + abs(static_cast<int>(dst_y_c[i * kWidth + j]) - \ + static_cast<int>(dst_y_opt[i * kWidth + j])); \ + if (abs_diff > max_diff) { \ + max_diff = abs_diff; \ + } \ + } \ + } \ + EXPECT_LE(max_diff, 1); \ + for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \ + for (int j = 0; j < SUBSAMPLE(kWidth * 2, SUBSAMP_X); ++j) { \ + int abs_diff = \ + abs(static_cast<int>(dst_uv_c[i * \ + SUBSAMPLE(kWidth * 2, SUBSAMP_X) + j]) - \ + static_cast<int>(dst_uv_opt[i * \ + SUBSAMPLE(kWidth * 2, SUBSAMP_X) + j])); \ + if (abs_diff > max_diff) { \ + max_diff = abs_diff; \ + } \ + } \ + } \ + EXPECT_LE(max_diff, 1); \ + free_aligned_buffer_64(dst_y_c) \ + free_aligned_buffer_64(dst_uv_c) \ + free_aligned_buffer_64(dst_y_opt) \ + free_aligned_buffer_64(dst_uv_opt) \ + free_aligned_buffer_64(src_y) \ + free_aligned_buffer_64(src_u) \ + free_aligned_buffer_64(src_v) \ +} + +#define TESTPLANARTOBP(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ + FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \ + TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ + FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ + benchmark_width_ - 4, _Any, +, 0) \ + TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ + FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ + benchmark_width_, _Unaligned, +, 1) \ + TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ + FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ + benchmark_width_, _Invert, -, 0) \ + TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ + FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ + benchmark_width_, _Opt, +, 0) + +TESTPLANARTOBP(I420, 2, 2, NV12, 2, 2) +TESTPLANARTOBP(I420, 2, 2, NV21, 2, 2) + +#define TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, 
SRC_SUBSAMP_Y, \ + FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG, OFF) \ +TEST_F(libyuvTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \ + const int kWidth = ((W1280) > 0) ? (W1280) : 1; \ + const int kHeight = benchmark_height_; \ + align_buffer_64(src_y, kWidth * kHeight + OFF); \ + align_buffer_64(src_uv, 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * \ + SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + OFF); \ + align_buffer_64(dst_y_c, kWidth * kHeight); \ + align_buffer_64(dst_u_c, \ + SUBSAMPLE(kWidth, SUBSAMP_X) * \ + SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + align_buffer_64(dst_v_c, \ + SUBSAMPLE(kWidth, SUBSAMP_X) * \ + SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + align_buffer_64(dst_y_opt, kWidth * kHeight); \ + align_buffer_64(dst_u_opt, \ + SUBSAMPLE(kWidth, SUBSAMP_X) * \ + SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + align_buffer_64(dst_v_opt, \ + SUBSAMPLE(kWidth, SUBSAMP_X) * \ + SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + srandom(time(NULL)); \ + for (int i = 0; i < kHeight; ++i) \ + for (int j = 0; j < kWidth; ++j) \ + src_y[(i * kWidth) + j + OFF] = (random() & 0xff); \ + for (int i = 0; i < SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); ++i) { \ + for (int j = 0; j < 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X); ++j) { \ + src_uv[(i * 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) + j + OFF] = \ + (random() & 0xff); \ + } \ + } \ + MaskCpuFlags(0); \ + SRC_FMT_PLANAR##To##FMT_PLANAR(src_y + OFF, kWidth, \ + src_uv + OFF, \ + 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \ + dst_y_c, kWidth, \ + dst_u_c, SUBSAMPLE(kWidth, SUBSAMP_X), \ + dst_v_c, SUBSAMPLE(kWidth, SUBSAMP_X), \ + kWidth, NEG kHeight); \ + MaskCpuFlags(-1); \ + for (int i = 0; i < benchmark_iterations_; ++i) { \ + SRC_FMT_PLANAR##To##FMT_PLANAR(src_y + OFF, kWidth, \ + src_uv + OFF, \ + 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \ + dst_y_opt, kWidth, \ + dst_u_opt, SUBSAMPLE(kWidth, SUBSAMP_X), \ + dst_v_opt, SUBSAMPLE(kWidth, SUBSAMP_X), \ + kWidth, NEG kHeight); \ + } \ + int max_diff = 0; \ + for (int i = 0; i < kHeight; ++i) { \ + for (int j = 0; j < kWidth; ++j) { \ + int abs_diff = \ + abs(static_cast<int>(dst_y_c[i * kWidth + j]) - \ + static_cast<int>(dst_y_opt[i * kWidth + j])); \ + if (abs_diff > max_diff) { \ + max_diff = abs_diff; \ + } \ + } \ + } \ + EXPECT_LE(max_diff, 1); \ + for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \ + for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \ + int abs_diff = \ + abs(static_cast<int>(dst_u_c[i * \ + SUBSAMPLE(kWidth, SUBSAMP_X) + j]) - \ + static_cast<int>(dst_u_opt[i * \ + SUBSAMPLE(kWidth, SUBSAMP_X) + j])); \ + if (abs_diff > max_diff) { \ + max_diff = abs_diff; \ + } \ + } \ + } \ + EXPECT_LE(max_diff, 1); \ + for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \ + for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \ + int abs_diff = \ + abs(static_cast<int>(dst_v_c[i * \ + SUBSAMPLE(kWidth, SUBSAMP_X) + j]) - \ + static_cast<int>(dst_v_opt[i * \ + SUBSAMPLE(kWidth, SUBSAMP_X) + j])); \ + if (abs_diff > max_diff) { \ + max_diff = abs_diff; \ + } \ + } \ + } \ + EXPECT_LE(max_diff, 1); \ + free_aligned_buffer_64(dst_y_c) \ + free_aligned_buffer_64(dst_u_c) \ + free_aligned_buffer_64(dst_v_c) \ + free_aligned_buffer_64(dst_y_opt) \ + free_aligned_buffer_64(dst_u_opt) \ + free_aligned_buffer_64(dst_v_opt) \ + free_aligned_buffer_64(src_y) \ + free_aligned_buffer_64(src_uv) \ +} + +#define TESTBIPLANARTOP(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ + FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \ + TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ + FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ + benchmark_width_ - 4, 
_Any, +, 0) \ + TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ + FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ + benchmark_width_, _Unaligned, +, 1) \ + TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ + FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ + benchmark_width_, _Invert, -, 0) \ + TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ + FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ + benchmark_width_, _Opt, +, 0) + +TESTBIPLANARTOP(NV12, 2, 2, I420, 2, 2) +TESTBIPLANARTOP(NV21, 2, 2, I420, 2, 2) + +#define ALIGNINT(V, ALIGN) (((V) + (ALIGN) - 1) / (ALIGN) * (ALIGN)) + +#define TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ + YALIGN, W1280, DIFF, N, NEG, OFF, FMT_C, BPP_C) \ +TEST_F(libyuvTest, FMT_PLANAR##To##FMT_B##N) { \ + const int kWidth = ((W1280) > 0) ? (W1280) : 1; \ + const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \ + const int kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN); \ + const int kSizeUV = \ + SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y); \ + align_buffer_64(src_y, kWidth * kHeight + OFF); \ + align_buffer_64(src_u, kSizeUV + OFF); \ + align_buffer_64(src_v, kSizeUV + OFF); \ + align_buffer_64(dst_argb_c, kStrideB * kHeight); \ + align_buffer_64(dst_argb_opt, kStrideB * kHeight); \ + memset(dst_argb_c, 0, kStrideB * kHeight); \ + memset(dst_argb_opt, 0, kStrideB * kHeight); \ + srandom(time(NULL)); \ + for (int i = 0; i < kWidth * kHeight; ++i) { \ + src_y[i + OFF] = (random() & 0xff); \ + } \ + for (int i = 0; i < kSizeUV; ++i) { \ + src_u[i + OFF] = (random() & 0xff); \ + src_v[i + OFF] = (random() & 0xff); \ + } \ + MaskCpuFlags(0); \ + FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, \ + src_u + OFF, SUBSAMPLE(kWidth, SUBSAMP_X), \ + src_v + OFF, SUBSAMPLE(kWidth, SUBSAMP_X), \ + dst_argb_c, kStrideB, \ + kWidth, NEG kHeight); \ + MaskCpuFlags(-1); \ + for (int i = 0; i < benchmark_iterations_; ++i) { \ + FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, \ + src_u + OFF, SUBSAMPLE(kWidth, SUBSAMP_X), \ + src_v + OFF, SUBSAMPLE(kWidth, SUBSAMP_X), \ + dst_argb_opt, kStrideB, \ + kWidth, NEG kHeight); \ + } \ + int max_diff = 0; \ + /* Convert to ARGB so 565 is expanded to bytes that can be compared. 
*/ \ + align_buffer_64(dst_argb32_c, kWidth * BPP_C * kHeight); \ + align_buffer_64(dst_argb32_opt, kWidth * BPP_C * kHeight); \ + memset(dst_argb32_c, 0, kWidth * BPP_C * kHeight); \ + memset(dst_argb32_opt, 0, kWidth * BPP_C * kHeight); \ + FMT_B##To##FMT_C(dst_argb_c, kStrideB, \ + dst_argb32_c, kWidth * BPP_C , \ + kWidth, kHeight); \ + FMT_B##To##FMT_C(dst_argb_opt, kStrideB, \ + dst_argb32_opt, kWidth * BPP_C , \ + kWidth, kHeight); \ + for (int i = 0; i < kWidth * BPP_C * kHeight; ++i) { \ + int abs_diff = \ + abs(static_cast<int>(dst_argb32_c[i]) - \ + static_cast<int>(dst_argb32_opt[i])); \ + if (abs_diff > max_diff) { \ + max_diff = abs_diff; \ + } \ + } \ + EXPECT_LE(max_diff, DIFF); \ + free_aligned_buffer_64(src_y) \ + free_aligned_buffer_64(src_u) \ + free_aligned_buffer_64(src_v) \ + free_aligned_buffer_64(dst_argb_c) \ + free_aligned_buffer_64(dst_argb_opt) \ + free_aligned_buffer_64(dst_argb32_c) \ + free_aligned_buffer_64(dst_argb32_opt) \ +} + +#define TESTPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ + YALIGN, DIFF, FMT_C, BPP_C) \ + TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ + YALIGN, benchmark_width_ - 4, DIFF, _Any, +, 0, FMT_C, BPP_C) \ + TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ + YALIGN, benchmark_width_, DIFF, _Unaligned, +, 1, FMT_C, BPP_C) \ + TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ + YALIGN, benchmark_width_, DIFF, _Invert, -, 0, FMT_C, BPP_C) \ + TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ + YALIGN, benchmark_width_, DIFF, _Opt, +, 0, FMT_C, BPP_C) + +// TODO(fbarchard): Make vertical alignment unnecessary on bayer. +TESTPLANARTOB(I420, 2, 2, ARGB, 4, 4, 1, 2, ARGB, 4) +TESTPLANARTOB(I420, 2, 2, BGRA, 4, 4, 1, 2, ARGB, 4) +TESTPLANARTOB(I420, 2, 2, ABGR, 4, 4, 1, 2, ARGB, 4) +TESTPLANARTOB(I420, 2, 2, RGBA, 4, 4, 1, 2, ARGB, 4) +TESTPLANARTOB(I420, 2, 2, RAW, 3, 3, 1, 2, ARGB, 4) +TESTPLANARTOB(I420, 2, 2, RGB24, 3, 3, 1, 2, ARGB, 4) +TESTPLANARTOB(I420, 2, 2, RGB565, 2, 2, 1, 9, ARGB, 4) +TESTPLANARTOB(I420, 2, 2, ARGB1555, 2, 2, 1, 9, ARGB, 4) +TESTPLANARTOB(I420, 2, 2, ARGB4444, 2, 2, 1, 17, ARGB, 4) +TESTPLANARTOB(I422, 2, 1, ARGB, 4, 4, 1, 2, ARGB, 4) +TESTPLANARTOB(I422, 2, 1, BGRA, 4, 4, 1, 2, ARGB, 4) +TESTPLANARTOB(I422, 2, 1, ABGR, 4, 4, 1, 2, ARGB, 4) +TESTPLANARTOB(I422, 2, 1, RGBA, 4, 4, 1, 2, ARGB, 4) +TESTPLANARTOB(I411, 4, 1, ARGB, 4, 4, 1, 2, ARGB, 4) +TESTPLANARTOB(I444, 1, 1, ARGB, 4, 4, 1, 2, ARGB, 4) +TESTPLANARTOB(I420, 2, 2, YUY2, 2, 4, 1, 1, ARGB, 4) +TESTPLANARTOB(I420, 2, 2, UYVY, 2, 4, 1, 1, ARGB, 4) +TESTPLANARTOB(I422, 2, 1, YUY2, 2, 4, 1, 0, ARGB, 4) +TESTPLANARTOB(I422, 2, 1, UYVY, 2, 4, 1, 0, ARGB, 4) +TESTPLANARTOB(I420, 2, 2, I400, 1, 1, 1, 0, ARGB, 4) +TESTPLANARTOB(I420, 2, 2, BayerBGGR, 1, 2, 2, 2, ARGB, 4) +TESTPLANARTOB(I420, 2, 2, BayerRGGB, 1, 2, 2, 2, ARGB, 4) +TESTPLANARTOB(I420, 2, 2, BayerGBRG, 1, 2, 2, 2, ARGB, 4) +TESTPLANARTOB(I420, 2, 2, BayerGRBG, 1, 2, 2, 2, ARGB, 4) + +#define TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \ + W1280, DIFF, N, NEG, OFF) \ +TEST_F(libyuvTest, FMT_PLANAR##To##FMT_B##N) { \ + const int kWidth = ((W1280) > 0) ? 
(W1280) : 1; \ + const int kHeight = benchmark_height_; \ + const int kStrideB = kWidth * BPP_B; \ + align_buffer_64(src_y, kWidth * kHeight + OFF); \ + align_buffer_64(src_uv, \ + SUBSAMPLE(kWidth, SUBSAMP_X) * \ + SUBSAMPLE(kHeight, SUBSAMP_Y) * 2 + OFF); \ + align_buffer_64(dst_argb_c, kStrideB * kHeight); \ + align_buffer_64(dst_argb_opt, kStrideB * kHeight); \ + srandom(time(NULL)); \ + for (int i = 0; i < kHeight; ++i) \ + for (int j = 0; j < kWidth; ++j) \ + src_y[(i * kWidth) + j + OFF] = (random() & 0xff); \ + for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) \ + for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X) * 2; ++j) { \ + src_uv[(i * SUBSAMPLE(kWidth, SUBSAMP_X)) * 2 + j + OFF] = \ + (random() & 0xff); \ + } \ + MaskCpuFlags(0); \ + FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, \ + src_uv + OFF, SUBSAMPLE(kWidth, SUBSAMP_X) * 2, \ + dst_argb_c, kWidth * BPP_B, \ + kWidth, NEG kHeight); \ + MaskCpuFlags(-1); \ + for (int i = 0; i < benchmark_iterations_; ++i) { \ + FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, \ + src_uv + OFF, SUBSAMPLE(kWidth, SUBSAMP_X) * 2, \ + dst_argb_opt, kWidth * BPP_B, \ + kWidth, NEG kHeight); \ + } \ + /* Convert to ARGB so 565 is expanded to bytes that can be compared. */ \ + align_buffer_64(dst_argb32_c, kWidth * 4 * kHeight); \ + align_buffer_64(dst_argb32_opt, kWidth * 4 * kHeight); \ + memset(dst_argb32_c, 1, kWidth * 4 * kHeight); \ + memset(dst_argb32_opt, 2, kWidth * 4 * kHeight); \ + FMT_B##ToARGB(dst_argb_c, kStrideB, \ + dst_argb32_c, kWidth * 4, \ + kWidth, kHeight); \ + FMT_B##ToARGB(dst_argb_opt, kStrideB, \ + dst_argb32_opt, kWidth * 4, \ + kWidth, kHeight); \ + int max_diff = 0; \ + for (int i = 0; i < kHeight; ++i) { \ + for (int j = 0; j < kWidth * 4; ++j) { \ + int abs_diff = \ + abs(static_cast<int>(dst_argb32_c[i * kWidth * 4 + j]) - \ + static_cast<int>(dst_argb32_opt[i * kWidth * 4 + j])); \ + if (abs_diff > max_diff) { \ + max_diff = abs_diff; \ + } \ + } \ + } \ + EXPECT_LE(max_diff, DIFF); \ + free_aligned_buffer_64(src_y) \ + free_aligned_buffer_64(src_uv) \ + free_aligned_buffer_64(dst_argb_c) \ + free_aligned_buffer_64(dst_argb_opt) \ + free_aligned_buffer_64(dst_argb32_c) \ + free_aligned_buffer_64(dst_argb32_opt) \ +} + +#define TESTBIPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, DIFF) \ + TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \ + benchmark_width_ - 4, DIFF, _Any, +, 0) \ + TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \ + benchmark_width_, DIFF, _Unaligned, +, 1) \ + TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \ + benchmark_width_, DIFF, _Invert, -, 0) \ + TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \ + benchmark_width_, DIFF, _Opt, +, 0) + +TESTBIPLANARTOB(NV12, 2, 2, ARGB, 4, 2) +TESTBIPLANARTOB(NV21, 2, 2, ARGB, 4, 2) +TESTBIPLANARTOB(NV12, 2, 2, RGB565, 2, 9) +TESTBIPLANARTOB(NV21, 2, 2, RGB565, 2, 9) + +#define TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ + W1280, DIFF, N, NEG, OFF) \ +TEST_F(libyuvTest, FMT_A##To##FMT_PLANAR##N) { \ + const int kWidth = ((W1280) > 0) ? 
(W1280) : 1; \ + const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \ + const int kStride = \ + (SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMP_X * 8 * BPP_A + 7) / 8; \ + align_buffer_64(src_argb, kStride * kHeight + OFF); \ + align_buffer_64(dst_y_c, kWidth * kHeight); \ + align_buffer_64(dst_u_c, \ + SUBSAMPLE(kWidth, SUBSAMP_X) * \ + SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + align_buffer_64(dst_v_c, \ + SUBSAMPLE(kWidth, SUBSAMP_X) * \ + SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + align_buffer_64(dst_y_opt, kWidth * kHeight); \ + align_buffer_64(dst_u_opt, \ + SUBSAMPLE(kWidth, SUBSAMP_X) * \ + SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + align_buffer_64(dst_v_opt, \ + SUBSAMPLE(kWidth, SUBSAMP_X) * \ + SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + memset(dst_y_c, 1, kWidth * kHeight); \ + memset(dst_u_c, 0, \ + SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + memset(dst_v_c, 0, \ + SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + memset(dst_y_opt, 2, kWidth * kHeight); \ + memset(dst_u_opt, 0, \ + SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + memset(dst_v_opt, 0, \ + SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + srandom(time(NULL)); \ + for (int i = 0; i < kHeight; ++i) \ + for (int j = 0; j < kStride; ++j) \ + src_argb[(i * kStride) + j + OFF] = (random() & 0xff); \ + MaskCpuFlags(0); \ + FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, \ + dst_y_c, kWidth, \ + dst_u_c, SUBSAMPLE(kWidth, SUBSAMP_X), \ + dst_v_c, SUBSAMPLE(kWidth, SUBSAMP_X), \ + kWidth, NEG kHeight); \ + MaskCpuFlags(-1); \ + for (int i = 0; i < benchmark_iterations_; ++i) { \ + FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, \ + dst_y_opt, kWidth, \ + dst_u_opt, SUBSAMPLE(kWidth, SUBSAMP_X), \ + dst_v_opt, SUBSAMPLE(kWidth, SUBSAMP_X), \ + kWidth, NEG kHeight); \ + } \ + int max_diff = 0; \ + for (int i = 0; i < kHeight; ++i) { \ + for (int j = 0; j < kWidth; ++j) { \ + int abs_diff = \ + abs(static_cast<int>(dst_y_c[i * kWidth + j]) - \ + static_cast<int>(dst_y_opt[i * kWidth + j])); \ + if (abs_diff > max_diff) { \ + max_diff = abs_diff; \ + } \ + } \ + } \ + EXPECT_LE(max_diff, DIFF); \ + for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \ + for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \ + int abs_diff = \ + abs(static_cast<int>(dst_u_c[i * \ + SUBSAMPLE(kWidth, SUBSAMP_X) + j]) - \ + static_cast<int>(dst_u_opt[i * \ + SUBSAMPLE(kWidth, SUBSAMP_X) + j])); \ + if (abs_diff > max_diff) { \ + max_diff = abs_diff; \ + } \ + } \ + } \ + EXPECT_LE(max_diff, DIFF); \ + for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \ + for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \ + int abs_diff = \ + abs(static_cast<int>(dst_v_c[i * \ + SUBSAMPLE(kWidth, SUBSAMP_X) + j]) - \ + static_cast<int>(dst_v_opt[i * \ + SUBSAMPLE(kWidth, SUBSAMP_X) + j])); \ + if (abs_diff > max_diff) { \ + max_diff = abs_diff; \ + } \ + } \ + } \ + EXPECT_LE(max_diff, DIFF); \ + free_aligned_buffer_64(dst_y_c) \ + free_aligned_buffer_64(dst_u_c) \ + free_aligned_buffer_64(dst_v_c) \ + free_aligned_buffer_64(dst_y_opt) \ + free_aligned_buffer_64(dst_u_opt) \ + free_aligned_buffer_64(dst_v_opt) \ + free_aligned_buffer_64(src_argb) \ +} + +#define TESTATOPLANAR(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ + DIFF) \ + TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ + benchmark_width_ - 4, DIFF, _Any, +, 0) \ + TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ + benchmark_width_, DIFF, _Unaligned, +, 1) \ + 
TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ + benchmark_width_, DIFF, _Invert, -, 0) \ + TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ + benchmark_width_, DIFF, _Opt, +, 0) + +TESTATOPLANAR(ARGB, 4, 1, I420, 2, 2, 4) +#ifdef __arm__ +TESTATOPLANAR(ARGB, 4, 1, J420, 2, 2, 4) +#else +TESTATOPLANAR(ARGB, 4, 1, J420, 2, 2, 0) +#endif +TESTATOPLANAR(BGRA, 4, 1, I420, 2, 2, 4) +TESTATOPLANAR(ABGR, 4, 1, I420, 2, 2, 4) +TESTATOPLANAR(RGBA, 4, 1, I420, 2, 2, 4) +TESTATOPLANAR(RAW, 3, 1, I420, 2, 2, 4) +TESTATOPLANAR(RGB24, 3, 1, I420, 2, 2, 4) +TESTATOPLANAR(RGB565, 2, 1, I420, 2, 2, 5) +// TODO(fbarchard): Make 1555 neon work same as C code, reduce to diff 9. +TESTATOPLANAR(ARGB1555, 2, 1, I420, 2, 2, 15) +TESTATOPLANAR(ARGB4444, 2, 1, I420, 2, 2, 17) +TESTATOPLANAR(ARGB, 4, 1, I411, 4, 1, 4) +TESTATOPLANAR(ARGB, 4, 1, I422, 2, 1, 2) +TESTATOPLANAR(ARGB, 4, 1, I444, 1, 1, 2) +TESTATOPLANAR(YUY2, 2, 1, I420, 2, 2, 2) +TESTATOPLANAR(UYVY, 2, 1, I420, 2, 2, 2) +TESTATOPLANAR(YUY2, 2, 1, I422, 2, 1, 2) +TESTATOPLANAR(UYVY, 2, 1, I422, 2, 1, 2) +TESTATOPLANAR(I400, 1, 1, I420, 2, 2, 2) +TESTATOPLANAR(BayerBGGR, 1, 2, I420, 2, 2, 4) +TESTATOPLANAR(BayerRGGB, 1, 2, I420, 2, 2, 4) +TESTATOPLANAR(BayerGBRG, 1, 2, I420, 2, 2, 4) +TESTATOPLANAR(BayerGRBG, 1, 2, I420, 2, 2, 4) + +#define TESTATOBIPLANARI(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ + W1280, N, NEG, OFF) \ +TEST_F(libyuvTest, FMT_A##To##FMT_PLANAR##N) { \ + const int kWidth = ((W1280) > 0) ? (W1280) : 1; \ + const int kHeight = benchmark_height_; \ + const int kStride = (kWidth * 8 * BPP_A + 7) / 8; \ + align_buffer_64(src_argb, kStride * kHeight + OFF); \ + align_buffer_64(dst_y_c, kWidth * kHeight); \ + align_buffer_64(dst_uv_c, \ + SUBSAMPLE(kWidth, SUBSAMP_X) * 2 * \ + SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + align_buffer_64(dst_y_opt, kWidth * kHeight); \ + align_buffer_64(dst_uv_opt, \ + SUBSAMPLE(kWidth, SUBSAMP_X) * 2 * \ + SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + srandom(time(NULL)); \ + for (int i = 0; i < kHeight; ++i) \ + for (int j = 0; j < kStride; ++j) \ + src_argb[(i * kStride) + j + OFF] = (random() & 0xff); \ + MaskCpuFlags(0); \ + FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, \ + dst_y_c, kWidth, \ + dst_uv_c, SUBSAMPLE(kWidth, SUBSAMP_X) * 2, \ + kWidth, NEG kHeight); \ + MaskCpuFlags(-1); \ + for (int i = 0; i < benchmark_iterations_; ++i) { \ + FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, \ + dst_y_opt, kWidth, \ + dst_uv_opt, SUBSAMPLE(kWidth, SUBSAMP_X) * 2, \ + kWidth, NEG kHeight); \ + } \ + int max_diff = 0; \ + for (int i = 0; i < kHeight; ++i) { \ + for (int j = 0; j < kWidth; ++j) { \ + int abs_diff = \ + abs(static_cast<int>(dst_y_c[i * kWidth + j]) - \ + static_cast<int>(dst_y_opt[i * kWidth + j])); \ + if (abs_diff > max_diff) { \ + max_diff = abs_diff; \ + } \ + } \ + } \ + EXPECT_LE(max_diff, 4); \ + for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \ + for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X) * 2; ++j) { \ + int abs_diff = \ + abs(static_cast<int>(dst_uv_c[i * \ + SUBSAMPLE(kWidth, SUBSAMP_X) * 2 + j]) - \ + static_cast<int>(dst_uv_opt[i * \ + SUBSAMPLE(kWidth, SUBSAMP_X) * 2 + j])); \ + if (abs_diff > max_diff) { \ + max_diff = abs_diff; \ + } \ + } \ + } \ + EXPECT_LE(max_diff, 4); \ + free_aligned_buffer_64(dst_y_c) \ + free_aligned_buffer_64(dst_uv_c) \ + free_aligned_buffer_64(dst_y_opt) \ + free_aligned_buffer_64(dst_uv_opt) \ + free_aligned_buffer_64(src_argb) \ +} + +#define TESTATOBIPLANAR(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, 
SUBSAMP_Y) \ + TESTATOBIPLANARI(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ + benchmark_width_ - 4, _Any, +, 0) \ + TESTATOBIPLANARI(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ + benchmark_width_, _Unaligned, +, 1) \ + TESTATOBIPLANARI(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ + benchmark_width_, _Invert, -, 0) \ + TESTATOBIPLANARI(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ + benchmark_width_, _Opt, +, 0) + +TESTATOBIPLANAR(ARGB, 4, NV12, 2, 2) +TESTATOBIPLANAR(ARGB, 4, NV21, 2, 2) + +#define TESTATOBI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, \ + FMT_B, BPP_B, STRIDE_B, HEIGHT_B, \ + W1280, DIFF, N, NEG, OFF) \ +TEST_F(libyuvTest, FMT_A##To##FMT_B##N) { \ + const int kWidth = ((W1280) > 0) ? (W1280) : 1; \ + const int kHeight = benchmark_height_; \ + const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A; \ + const int kHeightB = (kHeight + HEIGHT_B - 1) / HEIGHT_B * HEIGHT_B; \ + const int kStrideA = (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A; \ + const int kStrideB = (kWidth * BPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B; \ + align_buffer_64(src_argb, kStrideA * kHeightA + OFF); \ + align_buffer_64(dst_argb_c, kStrideB * kHeightB); \ + align_buffer_64(dst_argb_opt, kStrideB * kHeightB); \ + memset(dst_argb_c, 0, kStrideB * kHeightB); \ + memset(dst_argb_opt, 0, kStrideB * kHeightB); \ + srandom(time(NULL)); \ + for (int i = 0; i < kStrideA * kHeightA; ++i) { \ + src_argb[i + OFF] = (random() & 0xff); \ + } \ + MaskCpuFlags(0); \ + FMT_A##To##FMT_B(src_argb + OFF, kStrideA, \ + dst_argb_c, kStrideB, \ + kWidth, NEG kHeight); \ + MaskCpuFlags(-1); \ + for (int i = 0; i < benchmark_iterations_; ++i) { \ + FMT_A##To##FMT_B(src_argb + OFF, kStrideA, \ + dst_argb_opt, kStrideB, \ + kWidth, NEG kHeight); \ + } \ + int max_diff = 0; \ + for (int i = 0; i < kStrideB * kHeightB; ++i) { \ + int abs_diff = \ + abs(static_cast<int>(dst_argb_c[i]) - \ + static_cast<int>(dst_argb_opt[i])); \ + if (abs_diff > max_diff) { \ + max_diff = abs_diff; \ + } \ + } \ + EXPECT_LE(max_diff, DIFF); \ + free_aligned_buffer_64(src_argb) \ + free_aligned_buffer_64(dst_argb_c) \ + free_aligned_buffer_64(dst_argb_opt) \ +} + +#define TESTATOBRANDOM(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, \ + FMT_B, BPP_B, STRIDE_B, HEIGHT_B, DIFF) \ +TEST_F(libyuvTest, FMT_A##To##FMT_B##_Random) { \ + srandom(time(NULL)); \ + for (int times = 0; times < benchmark_iterations_; ++times) { \ + const int kWidth = (random() & 63) + 1; \ + const int kHeight = (random() & 31) + 1; \ + const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A; \ + const int kHeightB = (kHeight + HEIGHT_B - 1) / HEIGHT_B * HEIGHT_B; \ + const int kStrideA = (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A;\ + const int kStrideB = (kWidth * BPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B;\ + align_buffer_page_end(src_argb, kStrideA * kHeightA); \ + align_buffer_page_end(dst_argb_c, kStrideB * kHeightB); \ + align_buffer_page_end(dst_argb_opt, kStrideB * kHeightB); \ + memset(dst_argb_c, 0, kStrideB * kHeightB); \ + memset(dst_argb_opt, 0, kStrideB * kHeightB); \ + for (int i = 0; i < kStrideA * kHeightA; ++i) { \ + src_argb[i] = (random() & 0xff); \ + } \ + MaskCpuFlags(0); \ + FMT_A##To##FMT_B(src_argb, kStrideA, \ + dst_argb_c, kStrideB, \ + kWidth, kHeight); \ + MaskCpuFlags(-1); \ + FMT_A##To##FMT_B(src_argb, kStrideA, \ + dst_argb_opt, kStrideB, \ + kWidth, kHeight); \ + int max_diff = 0; \ + for (int i = 0; i < kStrideB * kHeightB; ++i) { \ + int abs_diff = \ + abs(static_cast<int>(dst_argb_c[i]) - \ + 
static_cast<int>(dst_argb_opt[i])); \ + if (abs_diff > max_diff) { \ + max_diff = abs_diff; \ + } \ + } \ + EXPECT_LE(max_diff, DIFF); \ + free_aligned_buffer_page_end(src_argb) \ + free_aligned_buffer_page_end(dst_argb_c) \ + free_aligned_buffer_page_end(dst_argb_opt) \ + } \ +} + +#define TESTATOB(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, \ + FMT_B, BPP_B, STRIDE_B, HEIGHT_B, DIFF) \ + TESTATOBI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, \ + FMT_B, BPP_B, STRIDE_B, HEIGHT_B, \ + benchmark_width_ - 4, DIFF, _Any, +, 0) \ + TESTATOBI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, \ + FMT_B, BPP_B, STRIDE_B, HEIGHT_B, \ + benchmark_width_, DIFF, _Unaligned, +, 1) \ + TESTATOBI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, \ + FMT_B, BPP_B, STRIDE_B, HEIGHT_B, \ + benchmark_width_, DIFF, _Invert, -, 0) \ + TESTATOBI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, \ + FMT_B, BPP_B, STRIDE_B, HEIGHT_B, \ + benchmark_width_, DIFF, _Opt, +, 0) \ + TESTATOBRANDOM(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, \ + FMT_B, BPP_B, STRIDE_B, HEIGHT_B, DIFF) + +TESTATOB(ARGB, 4, 4, 1, ARGB, 4, 4, 1, 0) +TESTATOB(ARGB, 4, 4, 1, BGRA, 4, 4, 1, 0) +TESTATOB(ARGB, 4, 4, 1, ABGR, 4, 4, 1, 0) +TESTATOB(ARGB, 4, 4, 1, RGBA, 4, 4, 1, 0) +TESTATOB(ARGB, 4, 4, 1, RAW, 3, 3, 1, 0) +TESTATOB(ARGB, 4, 4, 1, RGB24, 3, 3, 1, 0) +TESTATOB(ARGB, 4, 4, 1, RGB565, 2, 2, 1, 0) +TESTATOB(ARGB, 4, 4, 1, ARGB1555, 2, 2, 1, 0) +TESTATOB(ARGB, 4, 4, 1, ARGB4444, 2, 2, 1, 0) +TESTATOB(ARGB, 4, 4, 1, BayerBGGR, 1, 2, 2, 0) +TESTATOB(ARGB, 4, 4, 1, BayerRGGB, 1, 2, 2, 0) +TESTATOB(ARGB, 4, 4, 1, BayerGBRG, 1, 2, 2, 0) +TESTATOB(ARGB, 4, 4, 1, BayerGRBG, 1, 2, 2, 0) +TESTATOB(ARGB, 4, 4, 1, YUY2, 2, 4, 1, 4) +TESTATOB(ARGB, 4, 4, 1, UYVY, 2, 4, 1, 4) +TESTATOB(ARGB, 4, 4, 1, I400, 1, 1, 1, 2) +TESTATOB(ARGB, 4, 4, 1, J400, 1, 1, 1, 2) +TESTATOB(BGRA, 4, 4, 1, ARGB, 4, 4, 1, 0) +TESTATOB(ABGR, 4, 4, 1, ARGB, 4, 4, 1, 0) +TESTATOB(RGBA, 4, 4, 1, ARGB, 4, 4, 1, 0) +TESTATOB(RAW, 3, 3, 1, ARGB, 4, 4, 1, 0) +TESTATOB(RGB24, 3, 3, 1, ARGB, 4, 4, 1, 0) +TESTATOB(RGB565, 2, 2, 1, ARGB, 4, 4, 1, 0) +TESTATOB(ARGB1555, 2, 2, 1, ARGB, 4, 4, 1, 0) +TESTATOB(ARGB4444, 2, 2, 1, ARGB, 4, 4, 1, 0) +TESTATOB(YUY2, 2, 4, 1, ARGB, 4, 4, 1, 4) +TESTATOB(UYVY, 2, 4, 1, ARGB, 4, 4, 1, 4) +TESTATOB(BayerBGGR, 1, 2, 2, ARGB, 4, 4, 1, 0) +TESTATOB(BayerRGGB, 1, 2, 2, ARGB, 4, 4, 1, 0) +TESTATOB(BayerGBRG, 1, 2, 2, ARGB, 4, 4, 1, 0) +TESTATOB(BayerGRBG, 1, 2, 2, ARGB, 4, 4, 1, 0) +TESTATOB(I400, 1, 1, 1, ARGB, 4, 4, 1, 0) +TESTATOB(I400, 1, 1, 1, I400, 1, 1, 1, 0) +TESTATOB(I400, 1, 1, 1, I400Mirror, 1, 1, 1, 0) +TESTATOB(Y, 1, 1, 1, ARGB, 4, 4, 1, 0) +TESTATOB(ARGB, 4, 4, 1, ARGBMirror, 4, 4, 1, 0) + +TEST_F(libyuvTest, Test565) { + SIMD_ALIGNED(uint8 orig_pixels[256][4]); + SIMD_ALIGNED(uint8 pixels565[256][2]); + + for (int i = 0; i < 256; ++i) { + for (int j = 0; j < 4; ++j) { + orig_pixels[i][j] = i; + } + } + ARGBToRGB565(&orig_pixels[0][0], 0, &pixels565[0][0], 0, 256, 1); + uint32 checksum = HashDjb2(&pixels565[0][0], sizeof(pixels565), 5381); + EXPECT_EQ(610919429u, checksum); +} + +#ifdef HAVE_JPEG +TEST_F(libyuvTest, ValidateJpeg) { + const int kOff = 10; + const int kMinJpeg = 64; + const int kImageSize = benchmark_width_ * benchmark_height_ >= kMinJpeg ? + benchmark_width_ * benchmark_height_ : kMinJpeg; + const int kSize = kImageSize + kOff; + align_buffer_64(orig_pixels, kSize); + + // No SOI or EOI. Expect fail. + memset(orig_pixels, 0, kSize); + + // EOI, SOI. Expect pass. + orig_pixels[0] = 0xff; + orig_pixels[1] = 0xd8; // SOI. 
+ orig_pixels[kSize - kOff + 0] = 0xff; + orig_pixels[kSize - kOff + 1] = 0xd9; // EOI. + for (int times = 0; times < benchmark_iterations_; ++times) { + EXPECT_TRUE(ValidateJpeg(orig_pixels, kSize)); + } + free_aligned_buffer_page_end(orig_pixels); +} + +TEST_F(libyuvTest, InvalidateJpeg) { + const int kOff = 10; + const int kMinJpeg = 64; + const int kImageSize = benchmark_width_ * benchmark_height_ >= kMinJpeg ? + benchmark_width_ * benchmark_height_ : kMinJpeg; + const int kSize = kImageSize + kOff; + align_buffer_64(orig_pixels, kSize); + + // No SOI or EOI. Expect fail. + memset(orig_pixels, 0, kSize); + EXPECT_FALSE(ValidateJpeg(orig_pixels, kSize)); + + // SOI but no EOI. Expect fail. + orig_pixels[0] = 0xff; + orig_pixels[1] = 0xd8; // SOI. + for (int times = 0; times < benchmark_iterations_; ++times) { + EXPECT_FALSE(ValidateJpeg(orig_pixels, kSize)); + } + // EOI but no SOI. Expect fail. + orig_pixels[0] = 0; + orig_pixels[1] = 0; + orig_pixels[kSize - kOff + 0] = 0xff; + orig_pixels[kSize - kOff + 1] = 0xd9; // EOI. + EXPECT_FALSE(ValidateJpeg(orig_pixels, kSize)); + + free_aligned_buffer_page_end(orig_pixels); +} + +#endif + +} // namespace libyuv diff --git a/chromium/third_party/libyuv/unit_test/cpu_test.cc b/chromium/third_party/libyuv/unit_test/cpu_test.cc index 67c489cfc93..45579b8913e 100644 --- a/chromium/third_party/libyuv/unit_test/cpu_test.cc +++ b/chromium/third_party/libyuv/unit_test/cpu_test.cc @@ -41,6 +41,8 @@ TEST_F(libyuvTest, TestCpuHas) { printf("Has AVX2 %x\n", has_avx2); int has_erms = TestCpuFlag(kCpuHasERMS); printf("Has ERMS %x\n", has_erms); + int has_fma3 = TestCpuFlag(kCpuHasFMA3); + printf("Has FMA3 %x\n", has_fma3); int has_mips = TestCpuFlag(kCpuHasMIPS); printf("Has MIPS %x\n", has_mips); int has_mips_dsp = TestCpuFlag(kCpuHasMIPS_DSP); @@ -54,7 +56,7 @@ TEST_F(libyuvTest, TestCpuHas) { TEST_F(libyuvTest, TestCpuId) { int has_x86 = TestCpuFlag(kCpuHasX86); if (has_x86) { - int cpu_info[4]; + uint32 cpu_info[4]; // Vendor ID: // AuthenticAMD AMD processor // CentaurHauls Centaur processor @@ -66,7 +68,7 @@ TEST_F(libyuvTest, TestCpuId) { // RiseRiseRise Rise Technology processor // SiS SiS SiS SiS processor // UMC UMC UMC UMC processor - CpuId(cpu_info, 0); + CpuId(0, 0, cpu_info); cpu_info[0] = cpu_info[1]; // Reorder output cpu_info[1] = cpu_info[3]; cpu_info[3] = 0; @@ -81,7 +83,7 @@ TEST_F(libyuvTest, TestCpuId) { // 13:12 - Processor Type // 19:16 - Extended Model // 27:20 - Extended Family - CpuId(cpu_info, 1); + CpuId(1, 0, cpu_info); int family = ((cpu_info[0] >> 8) & 0x0f) | ((cpu_info[0] >> 16) & 0xff0); int model = ((cpu_info[0] >> 4) & 0x0f) | ((cpu_info[0] >> 12) & 0xf0); printf("Cpu Family %d (0x%x), Model %d (0x%x)\n", family, family, @@ -93,10 +95,8 @@ TEST_F(libyuvTest, TestCpuId) { TEST_F(libyuvTest, TestLinuxNeon) { int testdata = ArmCpuCaps("unit_test/testdata/arm_v7.txt"); if (testdata) { - EXPECT_EQ(0, - ArmCpuCaps("unit_test/testdata/arm_v7.txt")); - EXPECT_EQ(kCpuHasNEON, - ArmCpuCaps("unit_test/testdata/tegra3.txt")); + EXPECT_EQ(0, ArmCpuCaps("unit_test/testdata/arm_v7.txt")); + EXPECT_EQ(kCpuHasNEON, ArmCpuCaps("unit_test/testdata/tegra3.txt")); } else { printf("WARNING: unable to load \"unit_test/testdata/arm_v7.txt\"\n"); } diff --git a/chromium/third_party/libyuv/unit_test/math_test.cc b/chromium/third_party/libyuv/unit_test/math_test.cc new file mode 100644 index 00000000000..4095c122eb6 --- /dev/null +++ b/chromium/third_party/libyuv/unit_test/math_test.cc @@ -0,0 +1,114 @@ +/* + * Copyright 2013 The LibYuv Project 
Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <stdlib.h> +#include <string.h> + +#include "libyuv/basic_types.h" +#include "libyuv/cpu_id.h" +#include "libyuv/row.h" +#include "../unit_test/unit_test.h" + +namespace libyuv { + +TEST_F(libyuvTest, TestFixedDiv) { + int num[256]; + int div[256]; + int result_opt[256]; + int result_c[256]; + + EXPECT_EQ(0x20000, libyuv::FixedDiv(640 * 2, 640)); + EXPECT_EQ(0x30000, libyuv::FixedDiv(640 * 3, 640)); + EXPECT_EQ(0x40000, libyuv::FixedDiv(640 * 4, 640)); + EXPECT_EQ(0x50000, libyuv::FixedDiv(640 * 5, 640)); + EXPECT_EQ(0x60000, libyuv::FixedDiv(640 * 6, 640)); + EXPECT_EQ(0x70000, libyuv::FixedDiv(640 * 7, 640)); + EXPECT_EQ(0x80000, libyuv::FixedDiv(640 * 8, 640)); + EXPECT_EQ(0xa0000, libyuv::FixedDiv(640 * 10, 640)); + EXPECT_EQ(0x20000, libyuv::FixedDiv(960 * 2, 960)); + EXPECT_EQ(0x08000, libyuv::FixedDiv(640 / 2, 640)); + EXPECT_EQ(0x04000, libyuv::FixedDiv(640 / 4, 640)); + EXPECT_EQ(0x20000, libyuv::FixedDiv(1080 * 2, 1080)); + EXPECT_EQ(0x20000, libyuv::FixedDiv(200000, 100000)); + EXPECT_EQ(0x18000, libyuv::FixedDiv(150000, 100000)); + EXPECT_EQ(0x20000, libyuv::FixedDiv(40000, 20000)); + EXPECT_EQ(0x20000, libyuv::FixedDiv(-40000, -20000)); + EXPECT_EQ(-0x20000, libyuv::FixedDiv(40000, -20000)); + EXPECT_EQ(-0x20000, libyuv::FixedDiv(-40000, 20000)); + EXPECT_EQ(0x10000, libyuv::FixedDiv(4095, 4095)); + EXPECT_EQ(0x10000, libyuv::FixedDiv(4096, 4096)); + EXPECT_EQ(0x10000, libyuv::FixedDiv(4097, 4097)); + EXPECT_EQ(123 * 65536, libyuv::FixedDiv(123, 1)); + + for (int i = 1; i < 4100; ++i) { + EXPECT_EQ(0x10000, libyuv::FixedDiv(i, i)); + EXPECT_EQ(0x20000, libyuv::FixedDiv(i * 2, i)); + EXPECT_EQ(0x30000, libyuv::FixedDiv(i * 3, i)); + EXPECT_EQ(0x40000, libyuv::FixedDiv(i * 4, i)); + EXPECT_EQ(0x08000, libyuv::FixedDiv(i, i * 2)); + EXPECT_NEAR(16384 * 65536 / i, libyuv::FixedDiv(16384, i), 1); + } + EXPECT_EQ(123 * 65536, libyuv::FixedDiv(123, 1)); + + srandom(time(NULL)); + MemRandomize(reinterpret_cast<uint8*>(&num[0]), sizeof(num)); + MemRandomize(reinterpret_cast<uint8*>(&div[0]), sizeof(div)); + for (int j = 0; j < 256; ++j) { + if (div[j] == 0) { + div[j] = 1280; + } + } + for (int i = 0; i < benchmark_pixels_div256_; ++i) { + for (int j = 0; j < 256; ++j) { + result_opt[j] = libyuv::FixedDiv(num[j], div[j]); + } + } + for (int j = 0; j < 256; ++j) { + result_c[j] = libyuv::FixedDiv_C(num[j], div[j]); + EXPECT_NEAR(result_c[j], result_opt[j], 1); + } +} + +TEST_F(libyuvTest, TestFixedDiv_Opt) { + int num[256]; + int div[256]; + int result_opt[256]; + int result_c[256]; + + srandom(time(NULL)); + MemRandomize(reinterpret_cast<uint8*>(&num[0]), sizeof(num)); + MemRandomize(reinterpret_cast<uint8*>(&div[0]), sizeof(div)); + for (int j = 0; j < 256; ++j) { + num[j] &= 4095; // Make numerator smaller. + div[j] &= 4095; // Make divisor smaller. 
+ if (div[j] == 0) { + div[j] = 1280; + } + } + + int has_x86 = TestCpuFlag(kCpuHasX86); + for (int i = 0; i < benchmark_pixels_div256_; ++i) { + if (has_x86) { + for (int j = 0; j < 256; ++j) { + result_opt[j] = libyuv::FixedDiv(num[j], div[j]); + } + } else { + for (int j = 0; j < 256; ++j) { + result_opt[j] = libyuv::FixedDiv_C(num[j], div[j]); + } + } + } + for (int j = 0; j < 256; ++j) { + result_c[j] = libyuv::FixedDiv_C(num[j], div[j]); + EXPECT_NEAR(result_c[j], result_opt[j], 1); + } +} + +} // namespace libyuv diff --git a/chromium/third_party/libyuv/unit_test/planar_test.cc b/chromium/third_party/libyuv/unit_test/planar_test.cc index 2c9958baae1..7759db406ff 100644 --- a/chromium/third_party/libyuv/unit_test/planar_test.cc +++ b/chromium/third_party/libyuv/unit_test/planar_test.cc @@ -32,77 +32,83 @@ namespace libyuv { TEST_F(libyuvTest, TestAttenuate) { - SIMD_ALIGNED(uint8 orig_pixels[256][4]); - SIMD_ALIGNED(uint8 atten_pixels[256][4]); - SIMD_ALIGNED(uint8 unatten_pixels[256][4]); - SIMD_ALIGNED(uint8 atten2_pixels[256][4]); + const int kSize = 1280 * 4; + align_buffer_64(orig_pixels, kSize); + align_buffer_64(atten_pixels, kSize); + align_buffer_64(unatten_pixels, kSize); + align_buffer_64(atten2_pixels, kSize); // Test unattenuation clamps - orig_pixels[0][0] = 200u; - orig_pixels[0][1] = 129u; - orig_pixels[0][2] = 127u; - orig_pixels[0][3] = 128u; + orig_pixels[0 * 4 + 0] = 200u; + orig_pixels[0 * 4 + 1] = 129u; + orig_pixels[0 * 4 + 2] = 127u; + orig_pixels[0 * 4 + 3] = 128u; // Test unattenuation transparent and opaque are unaffected - orig_pixels[1][0] = 16u; - orig_pixels[1][1] = 64u; - orig_pixels[1][2] = 192u; - orig_pixels[1][3] = 0u; - orig_pixels[2][0] = 16u; - orig_pixels[2][1] = 64u; - orig_pixels[2][2] = 192u; - orig_pixels[2][3] = 255u; - orig_pixels[3][0] = 16u; - orig_pixels[3][1] = 64u; - orig_pixels[3][2] = 192u; - orig_pixels[3][3] = 128u; - ARGBUnattenuate(&orig_pixels[0][0], 0, &unatten_pixels[0][0], 0, 4, 1); - EXPECT_EQ(255u, unatten_pixels[0][0]); - EXPECT_EQ(255u, unatten_pixels[0][1]); - EXPECT_EQ(254u, unatten_pixels[0][2]); - EXPECT_EQ(128u, unatten_pixels[0][3]); - EXPECT_EQ(0u, unatten_pixels[1][0]); - EXPECT_EQ(0u, unatten_pixels[1][1]); - EXPECT_EQ(0u, unatten_pixels[1][2]); - EXPECT_EQ(0u, unatten_pixels[1][3]); - EXPECT_EQ(16u, unatten_pixels[2][0]); - EXPECT_EQ(64u, unatten_pixels[2][1]); - EXPECT_EQ(192u, unatten_pixels[2][2]); - EXPECT_EQ(255u, unatten_pixels[2][3]); - EXPECT_EQ(32u, unatten_pixels[3][0]); - EXPECT_EQ(128u, unatten_pixels[3][1]); - EXPECT_EQ(255u, unatten_pixels[3][2]); - EXPECT_EQ(128u, unatten_pixels[3][3]); - - for (int i = 0; i < 256; ++i) { - orig_pixels[i][0] = i; - orig_pixels[i][1] = i / 2; - orig_pixels[i][2] = i / 3; - orig_pixels[i][3] = i; - } - ARGBAttenuate(&orig_pixels[0][0], 0, &atten_pixels[0][0], 0, 256, 1); - ARGBUnattenuate(&atten_pixels[0][0], 0, &unatten_pixels[0][0], 0, 256, 1); - for (int i = 0; i < benchmark_pixels_div256_; ++i) { - ARGBAttenuate(&unatten_pixels[0][0], 0, &atten2_pixels[0][0], 0, 256, 1); - } - for (int i = 0; i < 256; ++i) { - EXPECT_NEAR(atten_pixels[i][0], atten2_pixels[i][0], 2); - EXPECT_NEAR(atten_pixels[i][1], atten2_pixels[i][1], 2); - EXPECT_NEAR(atten_pixels[i][2], atten2_pixels[i][2], 2); - EXPECT_NEAR(atten_pixels[i][3], atten2_pixels[i][3], 2); + orig_pixels[1 * 4 + 0] = 16u; + orig_pixels[1 * 4 + 1] = 64u; + orig_pixels[1 * 4 + 2] = 192u; + orig_pixels[1 * 4 + 3] = 0u; + orig_pixels[2 * 4 + 0] = 16u; + orig_pixels[2 * 4 + 1] = 64u; + orig_pixels[2 * 4 + 2] = 192u; 
+ orig_pixels[2 * 4 + 3] = 255u; + orig_pixels[3 * 4 + 0] = 16u; + orig_pixels[3 * 4 + 1] = 64u; + orig_pixels[3 * 4 + 2] = 192u; + orig_pixels[3 * 4 + 3] = 128u; + ARGBUnattenuate(orig_pixels, 0, unatten_pixels, 0, 4, 1); + EXPECT_EQ(255u, unatten_pixels[0 * 4 + 0]); + EXPECT_EQ(255u, unatten_pixels[0 * 4 + 1]); + EXPECT_EQ(254u, unatten_pixels[0 * 4 + 2]); + EXPECT_EQ(128u, unatten_pixels[0 * 4 + 3]); + EXPECT_EQ(0u, unatten_pixels[1 * 4 + 0]); + EXPECT_EQ(0u, unatten_pixels[1 * 4 + 1]); + EXPECT_EQ(0u, unatten_pixels[1 * 4 + 2]); + EXPECT_EQ(0u, unatten_pixels[1 * 4 + 3]); + EXPECT_EQ(16u, unatten_pixels[2 * 4 + 0]); + EXPECT_EQ(64u, unatten_pixels[2 * 4 + 1]); + EXPECT_EQ(192u, unatten_pixels[2 * 4 + 2]); + EXPECT_EQ(255u, unatten_pixels[2 * 4 + 3]); + EXPECT_EQ(32u, unatten_pixels[3 * 4 + 0]); + EXPECT_EQ(128u, unatten_pixels[3 * 4 + 1]); + EXPECT_EQ(255u, unatten_pixels[3 * 4 + 2]); + EXPECT_EQ(128u, unatten_pixels[3 * 4 + 3]); + + for (int i = 0; i < 1280; ++i) { + orig_pixels[i * 4 + 0] = i; + orig_pixels[i * 4 + 1] = i / 2; + orig_pixels[i * 4 + 2] = i / 3; + orig_pixels[i * 4 + 3] = i; + } + ARGBAttenuate(orig_pixels, 0, atten_pixels, 0, 1280, 1); + ARGBUnattenuate(atten_pixels, 0, unatten_pixels, 0, 1280, 1); + for (int i = 0; i < benchmark_pixels_div1280_; ++i) { + ARGBAttenuate(unatten_pixels, 0, atten2_pixels, 0, 1280, 1); + } + for (int i = 0; i < 1280; ++i) { + EXPECT_NEAR(atten_pixels[i * 4 + 0], atten2_pixels[i * 4 + 0], 2); + EXPECT_NEAR(atten_pixels[i * 4 + 1], atten2_pixels[i * 4 + 1], 2); + EXPECT_NEAR(atten_pixels[i * 4 + 2], atten2_pixels[i * 4 + 2], 2); + EXPECT_NEAR(atten_pixels[i * 4 + 3], atten2_pixels[i * 4 + 3], 2); } // Make sure transparent, 50% and opaque are fully accurate. - EXPECT_EQ(0, atten_pixels[0][0]); - EXPECT_EQ(0, atten_pixels[0][1]); - EXPECT_EQ(0, atten_pixels[0][2]); - EXPECT_EQ(0, atten_pixels[0][3]); - EXPECT_EQ(64, atten_pixels[128][0]); - EXPECT_EQ(32, atten_pixels[128][1]); - EXPECT_EQ(21, atten_pixels[128][2]); - EXPECT_EQ(128, atten_pixels[128][3]); - EXPECT_NEAR(255, atten_pixels[255][0], 1); - EXPECT_NEAR(127, atten_pixels[255][1], 1); - EXPECT_NEAR(85, atten_pixels[255][2], 1); - EXPECT_EQ(255, atten_pixels[255][3]); + EXPECT_EQ(0, atten_pixels[0 * 4 + 0]); + EXPECT_EQ(0, atten_pixels[0 * 4 + 1]); + EXPECT_EQ(0, atten_pixels[0 * 4 + 2]); + EXPECT_EQ(0, atten_pixels[0 * 4 + 3]); + EXPECT_EQ(64, atten_pixels[128 * 4 + 0]); + EXPECT_EQ(32, atten_pixels[128 * 4 + 1]); + EXPECT_EQ(21, atten_pixels[128 * 4 + 2]); + EXPECT_EQ(128, atten_pixels[128 * 4 + 3]); + EXPECT_NEAR(255, atten_pixels[255 * 4 + 0], 1); + EXPECT_NEAR(127, atten_pixels[255 * 4 + 1], 1); + EXPECT_NEAR(85, atten_pixels[255 * 4 + 2], 1); + EXPECT_EQ(255, atten_pixels[255 * 4 + 3]); + + free_aligned_buffer_64(atten2_pixels) + free_aligned_buffer_64(unatten_pixels) + free_aligned_buffer_64(atten_pixels) + free_aligned_buffer_64(orig_pixels) } static int TestAttenuateI(int width, int height, int benchmark_iterations, @@ -268,7 +274,9 @@ TEST_F(libyuvTest, TestARGBComputeCumulativeSum) { } TEST_F(libyuvTest, TestARGBGray) { - SIMD_ALIGNED(uint8 orig_pixels[256][4]); + SIMD_ALIGNED(uint8 orig_pixels[1280][4]); + memset(orig_pixels, 0, sizeof(orig_pixels)); + // Test blue orig_pixels[0][0] = 255u; orig_pixels[0][1] = 0u; @@ -325,20 +333,22 @@ TEST_F(libyuvTest, TestARGBGray) { EXPECT_EQ(96u, orig_pixels[5][1]); EXPECT_EQ(96u, orig_pixels[5][2]); EXPECT_EQ(224u, orig_pixels[5][3]); - for (int i = 0; i < 256; ++i) { + for (int i = 0; i < 1280; ++i) { orig_pixels[i][0] = i; 
orig_pixels[i][1] = i / 2; orig_pixels[i][2] = i / 3; orig_pixels[i][3] = i; } - for (int i = 0; i < benchmark_pixels_div256_; ++i) { - ARGBGray(&orig_pixels[0][0], 0, 0, 0, 256, 1); + for (int i = 0; i < benchmark_pixels_div1280_; ++i) { + ARGBGray(&orig_pixels[0][0], 0, 0, 0, 1280, 1); } } TEST_F(libyuvTest, TestARGBGrayTo) { - SIMD_ALIGNED(uint8 orig_pixels[256][4]); - SIMD_ALIGNED(uint8 gray_pixels[256][4]); + SIMD_ALIGNED(uint8 orig_pixels[1280][4]); + SIMD_ALIGNED(uint8 gray_pixels[1280][4]); + memset(orig_pixels, 0, sizeof(orig_pixels)); + // Test blue orig_pixels[0][0] = 255u; orig_pixels[0][1] = 0u; @@ -395,19 +405,20 @@ TEST_F(libyuvTest, TestARGBGrayTo) { EXPECT_EQ(96u, gray_pixels[5][1]); EXPECT_EQ(96u, gray_pixels[5][2]); EXPECT_EQ(224u, gray_pixels[5][3]); - for (int i = 0; i < 256; ++i) { + for (int i = 0; i < 1280; ++i) { orig_pixels[i][0] = i; orig_pixels[i][1] = i / 2; orig_pixels[i][2] = i / 3; orig_pixels[i][3] = i; } - for (int i = 0; i < benchmark_pixels_div256_; ++i) { - ARGBGrayTo(&orig_pixels[0][0], 0, &gray_pixels[0][0], 0, 256, 1); + for (int i = 0; i < benchmark_pixels_div1280_; ++i) { + ARGBGrayTo(&orig_pixels[0][0], 0, &gray_pixels[0][0], 0, 1280, 1); } } TEST_F(libyuvTest, TestARGBSepia) { - SIMD_ALIGNED(uint8 orig_pixels[256][4]); + SIMD_ALIGNED(uint8 orig_pixels[1280][4]); + memset(orig_pixels, 0, sizeof(orig_pixels)); // Test blue orig_pixels[0][0] = 255u; @@ -466,27 +477,106 @@ TEST_F(libyuvTest, TestARGBSepia) { EXPECT_EQ(127u, orig_pixels[5][2]); EXPECT_EQ(224u, orig_pixels[5][3]); - for (int i = 0; i < 256; ++i) { + for (int i = 0; i < 1280; ++i) { orig_pixels[i][0] = i; orig_pixels[i][1] = i / 2; orig_pixels[i][2] = i / 3; orig_pixels[i][3] = i; } - for (int i = 0; i < benchmark_pixels_div256_; ++i) { - ARGBSepia(&orig_pixels[0][0], 0, 0, 0, 256, 1); + for (int i = 0; i < benchmark_pixels_div1280_; ++i) { + ARGBSepia(&orig_pixels[0][0], 0, 0, 0, 1280, 1); } } TEST_F(libyuvTest, TestARGBColorMatrix) { - SIMD_ALIGNED(uint8 orig_pixels[256][4]); + SIMD_ALIGNED(uint8 orig_pixels[1280][4]); + SIMD_ALIGNED(uint8 dst_pixels_opt[1280][4]); + SIMD_ALIGNED(uint8 dst_pixels_c[1280][4]); // Matrix for Sepia. - static const int8 kARGBToSepia[] = { + SIMD_ALIGNED(static const int8 kRGBToSepia[]) = { + 17 / 2, 68 / 2, 35 / 2, 0, + 22 / 2, 88 / 2, 45 / 2, 0, + 24 / 2, 98 / 2, 50 / 2, 0, + 0, 0, 0, 64, // Copy alpha. + }; + memset(orig_pixels, 0, sizeof(orig_pixels)); + + // Test blue + orig_pixels[0][0] = 255u; + orig_pixels[0][1] = 0u; + orig_pixels[0][2] = 0u; + orig_pixels[0][3] = 128u; + // Test green + orig_pixels[1][0] = 0u; + orig_pixels[1][1] = 255u; + orig_pixels[1][2] = 0u; + orig_pixels[1][3] = 0u; + // Test red + orig_pixels[2][0] = 0u; + orig_pixels[2][1] = 0u; + orig_pixels[2][2] = 255u; + orig_pixels[2][3] = 255u; + // Test color + orig_pixels[3][0] = 16u; + orig_pixels[3][1] = 64u; + orig_pixels[3][2] = 192u; + orig_pixels[3][3] = 224u; + // Do 16 to test asm version. 
+ ARGBColorMatrix(&orig_pixels[0][0], 0, &dst_pixels_opt[0][0], 0, + &kRGBToSepia[0], 16, 1); + EXPECT_EQ(31u, dst_pixels_opt[0][0]); + EXPECT_EQ(43u, dst_pixels_opt[0][1]); + EXPECT_EQ(47u, dst_pixels_opt[0][2]); + EXPECT_EQ(128u, dst_pixels_opt[0][3]); + EXPECT_EQ(135u, dst_pixels_opt[1][0]); + EXPECT_EQ(175u, dst_pixels_opt[1][1]); + EXPECT_EQ(195u, dst_pixels_opt[1][2]); + EXPECT_EQ(0u, dst_pixels_opt[1][3]); + EXPECT_EQ(67u, dst_pixels_opt[2][0]); + EXPECT_EQ(87u, dst_pixels_opt[2][1]); + EXPECT_EQ(99u, dst_pixels_opt[2][2]); + EXPECT_EQ(255u, dst_pixels_opt[2][3]); + EXPECT_EQ(87u, dst_pixels_opt[3][0]); + EXPECT_EQ(112u, dst_pixels_opt[3][1]); + EXPECT_EQ(127u, dst_pixels_opt[3][2]); + EXPECT_EQ(224u, dst_pixels_opt[3][3]); + + for (int i = 0; i < 1280; ++i) { + orig_pixels[i][0] = i; + orig_pixels[i][1] = i / 2; + orig_pixels[i][2] = i / 3; + orig_pixels[i][3] = i; + } + MaskCpuFlags(0); + ARGBColorMatrix(&orig_pixels[0][0], 0, &dst_pixels_c[0][0], 0, + &kRGBToSepia[0], 1280, 1); + MaskCpuFlags(-1); + + for (int i = 0; i < benchmark_pixels_div1280_; ++i) { + ARGBColorMatrix(&orig_pixels[0][0], 0, &dst_pixels_opt[0][0], 0, + &kRGBToSepia[0], 1280, 1); + } + + for (int i = 0; i < 1280; ++i) { + EXPECT_EQ(dst_pixels_c[i][0], dst_pixels_opt[i][0]); + EXPECT_EQ(dst_pixels_c[i][1], dst_pixels_opt[i][1]); + EXPECT_EQ(dst_pixels_c[i][2], dst_pixels_opt[i][2]); + EXPECT_EQ(dst_pixels_c[i][3], dst_pixels_opt[i][3]); + } +} + +TEST_F(libyuvTest, TestRGBColorMatrix) { + SIMD_ALIGNED(uint8 orig_pixels[1280][4]); + + // Matrix for Sepia. + SIMD_ALIGNED(static const int8 kRGBToSepia[]) = { 17, 68, 35, 0, 22, 88, 45, 0, 24, 98, 50, 0, 0, 0, 0, 0, // Unused but makes matrix 16 bytes. }; + memset(orig_pixels, 0, sizeof(orig_pixels)); // Test blue orig_pixels[0][0] = 255u; @@ -509,8 +599,8 @@ TEST_F(libyuvTest, TestARGBColorMatrix) { orig_pixels[3][2] = 192u; orig_pixels[3][3] = 224u; // Do 16 to test asm version. - ARGBColorMatrix(&orig_pixels[0][0], 0, &kARGBToSepia[0], 0, 0, 16, 1); - EXPECT_EQ(33u, orig_pixels[0][0]); + RGBColorMatrix(&orig_pixels[0][0], 0, &kRGBToSepia[0], 0, 0, 16, 1); + EXPECT_EQ(31u, orig_pixels[0][0]); EXPECT_EQ(43u, orig_pixels[0][1]); EXPECT_EQ(47u, orig_pixels[0][2]); EXPECT_EQ(128u, orig_pixels[0][3]); @@ -518,28 +608,28 @@ TEST_F(libyuvTest, TestARGBColorMatrix) { EXPECT_EQ(175u, orig_pixels[1][1]); EXPECT_EQ(195u, orig_pixels[1][2]); EXPECT_EQ(0u, orig_pixels[1][3]); - EXPECT_EQ(69u, orig_pixels[2][0]); - EXPECT_EQ(89u, orig_pixels[2][1]); + EXPECT_EQ(67u, orig_pixels[2][0]); + EXPECT_EQ(87u, orig_pixels[2][1]); EXPECT_EQ(99u, orig_pixels[2][2]); EXPECT_EQ(255u, orig_pixels[2][3]); - EXPECT_EQ(88u, orig_pixels[3][0]); - EXPECT_EQ(114u, orig_pixels[3][1]); + EXPECT_EQ(87u, orig_pixels[3][0]); + EXPECT_EQ(112u, orig_pixels[3][1]); EXPECT_EQ(127u, orig_pixels[3][2]); EXPECT_EQ(224u, orig_pixels[3][3]); - for (int i = 0; i < 256; ++i) { + for (int i = 0; i < 1280; ++i) { orig_pixels[i][0] = i; orig_pixels[i][1] = i / 2; orig_pixels[i][2] = i / 3; orig_pixels[i][3] = i; } - for (int i = 0; i < benchmark_pixels_div256_; ++i) { - ARGBColorMatrix(&orig_pixels[0][0], 0, &kARGBToSepia[0], 0, 0, 256, 1); + for (int i = 0; i < benchmark_pixels_div1280_; ++i) { + RGBColorMatrix(&orig_pixels[0][0], 0, &kRGBToSepia[0], 0, 0, 1280, 1); } } TEST_F(libyuvTest, TestARGBColorTable) { - SIMD_ALIGNED(uint8 orig_pixels[256][4]); + SIMD_ALIGNED(uint8 orig_pixels[1280][4]); memset(orig_pixels, 0, sizeof(orig_pixels)); // Matrix for Sepia. 
@@ -585,67 +675,127 @@ TEST_F(libyuvTest, TestARGBColorTable) { EXPECT_EQ(11u, orig_pixels[3][2]); EXPECT_EQ(16u, orig_pixels[3][3]); - for (int i = 0; i < 256; ++i) { + for (int i = 0; i < 1280; ++i) { + orig_pixels[i][0] = i; + orig_pixels[i][1] = i / 2; + orig_pixels[i][2] = i / 3; + orig_pixels[i][3] = i; + } + for (int i = 0; i < benchmark_pixels_div1280_; ++i) { + ARGBColorTable(&orig_pixels[0][0], 0, &kARGBTable[0], 0, 0, 1280, 1); + } +} + +// Same as TestARGBColorTable except alpha does not change. +TEST_F(libyuvTest, TestRGBColorTable) { + SIMD_ALIGNED(uint8 orig_pixels[1280][4]); + memset(orig_pixels, 0, sizeof(orig_pixels)); + + // Matrix for Sepia. + static const uint8 kARGBTable[256 * 4] = { + 1u, 2u, 3u, 4u, + 5u, 6u, 7u, 8u, + 9u, 10u, 11u, 12u, + 13u, 14u, 15u, 16u, + }; + + orig_pixels[0][0] = 0u; + orig_pixels[0][1] = 0u; + orig_pixels[0][2] = 0u; + orig_pixels[0][3] = 0u; + orig_pixels[1][0] = 1u; + orig_pixels[1][1] = 1u; + orig_pixels[1][2] = 1u; + orig_pixels[1][3] = 1u; + orig_pixels[2][0] = 2u; + orig_pixels[2][1] = 2u; + orig_pixels[2][2] = 2u; + orig_pixels[2][3] = 2u; + orig_pixels[3][0] = 0u; + orig_pixels[3][1] = 1u; + orig_pixels[3][2] = 2u; + orig_pixels[3][3] = 3u; + // Do 16 to test asm version. + RGBColorTable(&orig_pixels[0][0], 0, &kARGBTable[0], 0, 0, 16, 1); + EXPECT_EQ(1u, orig_pixels[0][0]); + EXPECT_EQ(2u, orig_pixels[0][1]); + EXPECT_EQ(3u, orig_pixels[0][2]); + EXPECT_EQ(0u, orig_pixels[0][3]); // Alpha unchanged. + EXPECT_EQ(5u, orig_pixels[1][0]); + EXPECT_EQ(6u, orig_pixels[1][1]); + EXPECT_EQ(7u, orig_pixels[1][2]); + EXPECT_EQ(1u, orig_pixels[1][3]); // Alpha unchanged. + EXPECT_EQ(9u, orig_pixels[2][0]); + EXPECT_EQ(10u, orig_pixels[2][1]); + EXPECT_EQ(11u, orig_pixels[2][2]); + EXPECT_EQ(2u, orig_pixels[2][3]); // Alpha unchanged. + EXPECT_EQ(1u, orig_pixels[3][0]); + EXPECT_EQ(6u, orig_pixels[3][1]); + EXPECT_EQ(11u, orig_pixels[3][2]); + EXPECT_EQ(3u, orig_pixels[3][3]); // Alpha unchanged. 
+ + for (int i = 0; i < 1280; ++i) { orig_pixels[i][0] = i; orig_pixels[i][1] = i / 2; orig_pixels[i][2] = i / 3; orig_pixels[i][3] = i; } - for (int i = 0; i < benchmark_pixels_div256_; ++i) { - ARGBColorTable(&orig_pixels[0][0], 0, &kARGBTable[0], 0, 0, 256, 1); + for (int i = 0; i < benchmark_pixels_div1280_; ++i) { + RGBColorTable(&orig_pixels[0][0], 0, &kARGBTable[0], 0, 0, 1280, 1); } } TEST_F(libyuvTest, TestARGBQuantize) { - SIMD_ALIGNED(uint8 orig_pixels[256][4]); + SIMD_ALIGNED(uint8 orig_pixels[1280][4]); - for (int i = 0; i < 256; ++i) { + for (int i = 0; i < 1280; ++i) { orig_pixels[i][0] = i; orig_pixels[i][1] = i / 2; orig_pixels[i][2] = i / 3; orig_pixels[i][3] = i; } ARGBQuantize(&orig_pixels[0][0], 0, - (65536 + (8 / 2)) / 8, 8, 8 / 2, 0, 0, 256, 1); + (65536 + (8 / 2)) / 8, 8, 8 / 2, 0, 0, 1280, 1); - for (int i = 0; i < 256; ++i) { - EXPECT_EQ(i / 8 * 8 + 8 / 2, orig_pixels[i][0]); - EXPECT_EQ(i / 2 / 8 * 8 + 8 / 2, orig_pixels[i][1]); - EXPECT_EQ(i / 3 / 8 * 8 + 8 / 2, orig_pixels[i][2]); - EXPECT_EQ(i, orig_pixels[i][3]); + for (int i = 0; i < 1280; ++i) { + EXPECT_EQ((i / 8 * 8 + 8 / 2) & 255, orig_pixels[i][0]); + EXPECT_EQ((i / 2 / 8 * 8 + 8 / 2) & 255, orig_pixels[i][1]); + EXPECT_EQ((i / 3 / 8 * 8 + 8 / 2) & 255, orig_pixels[i][2]); + EXPECT_EQ(i & 255, orig_pixels[i][3]); } - for (int i = 0; i < benchmark_pixels_div256_; ++i) { + for (int i = 0; i < benchmark_pixels_div1280_; ++i) { ARGBQuantize(&orig_pixels[0][0], 0, - (65536 + (8 / 2)) / 8, 8, 8 / 2, 0, 0, 256, 1); + (65536 + (8 / 2)) / 8, 8, 8 / 2, 0, 0, 1280, 1); } } TEST_F(libyuvTest, TestARGBMirror) { - SIMD_ALIGNED(uint8 orig_pixels[256][4]); - SIMD_ALIGNED(uint8 dst_pixels[256][4]); + SIMD_ALIGNED(uint8 orig_pixels[1280][4]); + SIMD_ALIGNED(uint8 dst_pixels[1280][4]); - for (int i = 0; i < 256; ++i) { + for (int i = 0; i < 1280; ++i) { orig_pixels[i][0] = i; orig_pixels[i][1] = i / 2; orig_pixels[i][2] = i / 3; orig_pixels[i][3] = i / 4; } - ARGBMirror(&orig_pixels[0][0], 0, &dst_pixels[0][0], 0, 256, 1); + ARGBMirror(&orig_pixels[0][0], 0, &dst_pixels[0][0], 0, 1280, 1); - for (int i = 0; i < 256; ++i) { - EXPECT_EQ(i, dst_pixels[255 - i][0]); - EXPECT_EQ(i / 2, dst_pixels[255 - i][1]); - EXPECT_EQ(i / 3, dst_pixels[255 - i][2]); - EXPECT_EQ(i / 4, dst_pixels[255 - i][3]); + for (int i = 0; i < 1280; ++i) { + EXPECT_EQ(i & 255, dst_pixels[1280 - 1 - i][0]); + EXPECT_EQ((i / 2) & 255, dst_pixels[1280 - 1 - i][1]); + EXPECT_EQ((i / 3) & 255, dst_pixels[1280 - 1 - i][2]); + EXPECT_EQ((i / 4) & 255, dst_pixels[1280 - 1 - i][3]); } - for (int i = 0; i < benchmark_pixels_div256_; ++i) { - ARGBMirror(&orig_pixels[0][0], 0, &dst_pixels[0][0], 0, 256, 1); + for (int i = 0; i < benchmark_pixels_div1280_; ++i) { + ARGBMirror(&orig_pixels[0][0], 0, &dst_pixels[0][0], 0, 1280, 1); } } TEST_F(libyuvTest, TestShade) { - SIMD_ALIGNED(uint8 orig_pixels[256][4]); - SIMD_ALIGNED(uint8 shade_pixels[256][4]); + SIMD_ALIGNED(uint8 orig_pixels[1280][4]); + SIMD_ALIGNED(uint8 shade_pixels[1280][4]); + memset(orig_pixels, 0, sizeof(orig_pixels)); orig_pixels[0][0] = 10u; orig_pixels[0][1] = 20u; @@ -694,16 +844,18 @@ TEST_F(libyuvTest, TestShade) { EXPECT_EQ(5u, shade_pixels[0][2]); EXPECT_EQ(5u, shade_pixels[0][3]); - for (int i = 0; i < benchmark_pixels_div256_; ++i) { - ARGBShade(&orig_pixels[0][0], 0, &shade_pixels[0][0], 0, 256, 1, + for (int i = 0; i < benchmark_pixels_div1280_; ++i) { + ARGBShade(&orig_pixels[0][0], 0, &shade_pixels[0][0], 0, 1280, 1, 0x80808080); } } TEST_F(libyuvTest, TestInterpolate) { - 
SIMD_ALIGNED(uint8 orig_pixels_0[256][4]); - SIMD_ALIGNED(uint8 orig_pixels_1[256][4]); - SIMD_ALIGNED(uint8 interpolate_pixels[256][4]); + SIMD_ALIGNED(uint8 orig_pixels_0[1280][4]); + SIMD_ALIGNED(uint8 orig_pixels_1[1280][4]); + SIMD_ALIGNED(uint8 interpolate_pixels[1280][4]); + memset(orig_pixels_0, 0, sizeof(orig_pixels_0)); + memset(orig_pixels_1, 0, sizeof(orig_pixels_1)); orig_pixels_0[0][0] = 16u; orig_pixels_0[0][1] = 32u; @@ -773,9 +925,9 @@ TEST_F(libyuvTest, TestInterpolate) { EXPECT_EQ(16u, interpolate_pixels[0][2]); EXPECT_EQ(32u, interpolate_pixels[0][3]); - for (int i = 0; i < benchmark_pixels_div256_; ++i) { + for (int i = 0; i < benchmark_pixels_div1280_; ++i) { ARGBInterpolate(&orig_pixels_0[0][0], 0, &orig_pixels_1[0][0], 0, - &interpolate_pixels[0][0], 0, 256, 1, 128); + &interpolate_pixels[0][0], 0, 1280, 1, 128); } } @@ -841,7 +993,6 @@ TESTINTERPOLATE(64) TESTINTERPOLATE(128) TESTINTERPOLATE(192) TESTINTERPOLATE(255) -TESTINTERPOLATE(85) static int TestBlend(int width, int height, int benchmark_iterations, int invert, int off) { @@ -919,10 +1070,10 @@ TEST_F(libyuvTest, ARGBBlend_Opt) { } TEST_F(libyuvTest, TestAffine) { - SIMD_ALIGNED(uint8 orig_pixels_0[256][4]); - SIMD_ALIGNED(uint8 interpolate_pixels_C[256][4]); + SIMD_ALIGNED(uint8 orig_pixels_0[1280][4]); + SIMD_ALIGNED(uint8 interpolate_pixels_C[1280][4]); - for (int i = 0; i < 256; ++i) { + for (int i = 0; i < 1280; ++i) { for (int j = 0; j < 4; ++j) { orig_pixels_0[i][j] = i; } @@ -931,42 +1082,42 @@ TEST_F(libyuvTest, TestAffine) { float uv_step[4] = { 0.f, 0.f, 0.75f, 0.f }; ARGBAffineRow_C(&orig_pixels_0[0][0], 0, &interpolate_pixels_C[0][0], - uv_step, 256); + uv_step, 1280); EXPECT_EQ(0u, interpolate_pixels_C[0][0]); EXPECT_EQ(96u, interpolate_pixels_C[128][0]); EXPECT_EQ(191u, interpolate_pixels_C[255][3]); #if defined(HAS_ARGBAFFINEROW_SSE2) - SIMD_ALIGNED(uint8 interpolate_pixels_Opt[256][4]); + SIMD_ALIGNED(uint8 interpolate_pixels_Opt[1280][4]); ARGBAffineRow_SSE2(&orig_pixels_0[0][0], 0, &interpolate_pixels_Opt[0][0], - uv_step, 256); - EXPECT_EQ(0, memcmp(interpolate_pixels_Opt, interpolate_pixels_C, 256 * 4)); + uv_step, 1280); + EXPECT_EQ(0, memcmp(interpolate_pixels_Opt, interpolate_pixels_C, 1280 * 4)); int has_sse2 = TestCpuFlag(kCpuHasSSE2); if (has_sse2) { - for (int i = 0; i < benchmark_pixels_div256_; ++i) { + for (int i = 0; i < benchmark_pixels_div1280_; ++i) { ARGBAffineRow_SSE2(&orig_pixels_0[0][0], 0, &interpolate_pixels_Opt[0][0], - uv_step, 256); + uv_step, 1280); } } #endif } TEST_F(libyuvTest, TestSobelX) { - SIMD_ALIGNED(uint8 orig_pixels_0[256 + 2]); - SIMD_ALIGNED(uint8 orig_pixels_1[256 + 2]); - SIMD_ALIGNED(uint8 orig_pixels_2[256 + 2]); - SIMD_ALIGNED(uint8 sobel_pixels_c[256]); - SIMD_ALIGNED(uint8 sobel_pixels_opt[256]); + SIMD_ALIGNED(uint8 orig_pixels_0[1280 + 2]); + SIMD_ALIGNED(uint8 orig_pixels_1[1280 + 2]); + SIMD_ALIGNED(uint8 orig_pixels_2[1280 + 2]); + SIMD_ALIGNED(uint8 sobel_pixels_c[1280]); + SIMD_ALIGNED(uint8 sobel_pixels_opt[1280]); - for (int i = 0; i < 256 + 2; ++i) { + for (int i = 0; i < 1280 + 2; ++i) { orig_pixels_0[i] = i; orig_pixels_1[i] = i * 2; orig_pixels_2[i] = i * 3; } SobelXRow_C(orig_pixels_0, orig_pixels_1, orig_pixels_2, - sobel_pixels_c, 256); + sobel_pixels_c, 1280); EXPECT_EQ(16u, sobel_pixels_c[0]); EXPECT_EQ(16u, sobel_pixels_c[100]); @@ -975,9 +1126,9 @@ TEST_F(libyuvTest, TestSobelX) { void (*SobelXRow)(const uint8* src_y0, const uint8* src_y1, const uint8* src_y2, uint8* dst_sobely, int width) = SobelXRow_C; -#if 
defined(HAS_SOBELXROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - SobelXRow = SobelXRow_SSSE3; +#if defined(HAS_SOBELXROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + SobelXRow = SobelXRow_SSE2; } #endif #if defined(HAS_SOBELXROW_NEON) @@ -985,36 +1136,36 @@ TEST_F(libyuvTest, TestSobelX) { SobelXRow = SobelXRow_NEON; } #endif - for (int i = 0; i < benchmark_pixels_div256_; ++i) { + for (int i = 0; i < benchmark_pixels_div1280_; ++i) { SobelXRow(orig_pixels_0, orig_pixels_1, orig_pixels_2, - sobel_pixels_opt, 256); + sobel_pixels_opt, 1280); } - for (int i = 0; i < 256; ++i) { - EXPECT_EQ(sobel_pixels_opt[i], sobel_pixels_c[i]); + for (int i = 0; i < 1280; ++i) { + EXPECT_EQ(sobel_pixels_c[i], sobel_pixels_opt[i]); } } TEST_F(libyuvTest, TestSobelY) { - SIMD_ALIGNED(uint8 orig_pixels_0[256 + 2]); - SIMD_ALIGNED(uint8 orig_pixels_1[256 + 2]); - SIMD_ALIGNED(uint8 sobel_pixels_c[256]); - SIMD_ALIGNED(uint8 sobel_pixels_opt[256]); + SIMD_ALIGNED(uint8 orig_pixels_0[1280 + 2]); + SIMD_ALIGNED(uint8 orig_pixels_1[1280 + 2]); + SIMD_ALIGNED(uint8 sobel_pixels_c[1280]); + SIMD_ALIGNED(uint8 sobel_pixels_opt[1280]); - for (int i = 0; i < 256 + 2; ++i) { + for (int i = 0; i < 1280 + 2; ++i) { orig_pixels_0[i] = i; orig_pixels_1[i] = i * 2; } - SobelYRow_C(orig_pixels_0, orig_pixels_1, sobel_pixels_c, 256); + SobelYRow_C(orig_pixels_0, orig_pixels_1, sobel_pixels_c, 1280); EXPECT_EQ(4u, sobel_pixels_c[0]); EXPECT_EQ(255u, sobel_pixels_c[100]); EXPECT_EQ(0u, sobel_pixels_c[255]); void (*SobelYRow)(const uint8* src_y0, const uint8* src_y1, uint8* dst_sobely, int width) = SobelYRow_C; -#if defined(HAS_SOBELYROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - SobelYRow = SobelYRow_SSSE3; +#if defined(HAS_SOBELYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + SobelYRow = SobelYRow_SSE2; } #endif #if defined(HAS_SOBELYROW_NEON) @@ -1022,26 +1173,26 @@ TEST_F(libyuvTest, TestSobelY) { SobelYRow = SobelYRow_NEON; } #endif - for (int i = 0; i < benchmark_pixels_div256_; ++i) { - SobelYRow(orig_pixels_0, orig_pixels_1, sobel_pixels_opt, 256); + for (int i = 0; i < benchmark_pixels_div1280_; ++i) { + SobelYRow(orig_pixels_0, orig_pixels_1, sobel_pixels_opt, 1280); } - for (int i = 0; i < 256; ++i) { - EXPECT_EQ(sobel_pixels_opt[i], sobel_pixels_c[i]); + for (int i = 0; i < 1280; ++i) { + EXPECT_EQ(sobel_pixels_c[i], sobel_pixels_opt[i]); } } TEST_F(libyuvTest, TestSobel) { - SIMD_ALIGNED(uint8 orig_sobelx[256]); - SIMD_ALIGNED(uint8 orig_sobely[256]); - SIMD_ALIGNED(uint8 sobel_pixels_c[256 * 4]); - SIMD_ALIGNED(uint8 sobel_pixels_opt[256 * 4]); + SIMD_ALIGNED(uint8 orig_sobelx[1280]); + SIMD_ALIGNED(uint8 orig_sobely[1280]); + SIMD_ALIGNED(uint8 sobel_pixels_c[1280 * 4]); + SIMD_ALIGNED(uint8 sobel_pixels_opt[1280 * 4]); - for (int i = 0; i < 256; ++i) { + for (int i = 0; i < 1280; ++i) { orig_sobelx[i] = i; orig_sobely[i] = i * 2; } - SobelRow_C(orig_sobelx, orig_sobely, sobel_pixels_c, 256); + SobelRow_C(orig_sobelx, orig_sobely, sobel_pixels_c, 1280); EXPECT_EQ(0u, sobel_pixels_c[0]); EXPECT_EQ(3u, sobel_pixels_c[4]); @@ -1066,26 +1217,64 @@ TEST_F(libyuvTest, TestSobel) { SobelRow = SobelRow_NEON; } #endif - for (int i = 0; i < benchmark_pixels_div256_; ++i) { - SobelRow(orig_sobelx, orig_sobely, sobel_pixels_opt, 256); + for (int i = 0; i < benchmark_pixels_div1280_; ++i) { + SobelRow(orig_sobelx, orig_sobely, sobel_pixels_opt, 1280); } - for (int i = 0; i < 16; ++i) { - EXPECT_EQ(sobel_pixels_opt[i], sobel_pixels_c[i]); + for (int i = 0; i < 1280 * 4; ++i) { + EXPECT_EQ(sobel_pixels_c[i], sobel_pixels_opt[i]); + 
} +} + +TEST_F(libyuvTest, TestSobelToPlane) { + SIMD_ALIGNED(uint8 orig_sobelx[1280]); + SIMD_ALIGNED(uint8 orig_sobely[1280]); + SIMD_ALIGNED(uint8 sobel_pixels_c[1280]); + SIMD_ALIGNED(uint8 sobel_pixels_opt[1280]); + + for (int i = 0; i < 1280; ++i) { + orig_sobelx[i] = i; + orig_sobely[i] = i * 2; + } + + SobelToPlaneRow_C(orig_sobelx, orig_sobely, sobel_pixels_c, 1280); + + EXPECT_EQ(0u, sobel_pixels_c[0]); + EXPECT_EQ(3u, sobel_pixels_c[1]); + EXPECT_EQ(6u, sobel_pixels_c[2]); + EXPECT_EQ(99u, sobel_pixels_c[33]); + EXPECT_EQ(255u, sobel_pixels_c[100]); + void (*SobelToPlaneRow)(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_y, int width) = SobelToPlaneRow_C; +#if defined(HAS_SOBELTOPLANEROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + SobelToPlaneRow = SobelToPlaneRow_SSE2; + } +#endif +#if defined(HAS_SOBELTOPLANEROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + SobelToPlaneRow = SobelToPlaneRow_NEON; + } +#endif + for (int i = 0; i < benchmark_pixels_div1280_; ++i) { + SobelToPlaneRow(orig_sobelx, orig_sobely, sobel_pixels_opt, 1280); + } + for (int i = 0; i < 1280; ++i) { + EXPECT_EQ(sobel_pixels_c[i], sobel_pixels_opt[i]); } } TEST_F(libyuvTest, TestSobelXY) { - SIMD_ALIGNED(uint8 orig_sobelx[256]); - SIMD_ALIGNED(uint8 orig_sobely[256]); - SIMD_ALIGNED(uint8 sobel_pixels_c[256 * 4]); - SIMD_ALIGNED(uint8 sobel_pixels_opt[256 * 4]); + SIMD_ALIGNED(uint8 orig_sobelx[1280]); + SIMD_ALIGNED(uint8 orig_sobely[1280]); + SIMD_ALIGNED(uint8 sobel_pixels_c[1280 * 4]); + SIMD_ALIGNED(uint8 sobel_pixels_opt[1280 * 4]); - for (int i = 0; i < 256; ++i) { + for (int i = 0; i < 1280; ++i) { orig_sobelx[i] = i; orig_sobely[i] = i * 2; } - SobelXYRow_C(orig_sobelx, orig_sobely, sobel_pixels_c, 256); + SobelXYRow_C(orig_sobelx, orig_sobely, sobel_pixels_c, 1280); EXPECT_EQ(0u, sobel_pixels_c[0]); EXPECT_EQ(2u, sobel_pixels_c[4]); @@ -1106,11 +1295,11 @@ TEST_F(libyuvTest, TestSobelXY) { SobelXYRow = SobelXYRow_NEON; } #endif - for (int i = 0; i < benchmark_pixels_div256_; ++i) { - SobelXYRow(orig_sobelx, orig_sobely, sobel_pixels_opt, 256); + for (int i = 0; i < benchmark_pixels_div1280_; ++i) { + SobelXYRow(orig_sobelx, orig_sobely, sobel_pixels_opt, 1280); } - for (int i = 0; i < 16; ++i) { - EXPECT_EQ(sobel_pixels_opt[i], sobel_pixels_c[i]); + for (int i = 0; i < 1280 * 4; ++i) { + EXPECT_EQ(sobel_pixels_c[i], sobel_pixels_opt[i]); } } @@ -1165,8 +1354,6 @@ TEST_F(libyuvTest, TestCopyPlane) { CopyPlane(orig_y + y_off, y_st, dst_opt + y_off, stride, yw, yh); } opt_time = (get_time() - opt_time) / benchmark_iterations_; - printf(" %8d us C - %8d us OPT\n", - static_cast<int>(c_time * 1e6), static_cast<int>(opt_time * 1e6)); for (i = 0; i < y_plane_size; ++i) { if (dst_c[i] != dst_opt[i]) @@ -1403,6 +1590,7 @@ static int TestSobel(int width, int height, int benchmark_iterations, align_buffer_64(src_argb_a, kStride * height + off); align_buffer_64(dst_argb_c, kStride * height); align_buffer_64(dst_argb_opt, kStride * height); + memset(src_argb_a, 0, kStride * height + off); srandom(time(NULL)); for (int i = 0; i < kStride * height; ++i) { src_argb_a[i + off] = (random() & 0xff); @@ -1459,6 +1647,75 @@ TEST_F(libyuvTest, ARGBSobel_Opt) { EXPECT_EQ(0, max_diff); } +static int TestSobelToPlane(int width, int height, int benchmark_iterations, + int invert, int off) { + if (width < 1) { + width = 1; + } + const int kSrcBpp = 4; + const int kDstBpp = 1; + const int kSrcStride = (width * kSrcBpp + 15) & ~15; + const int kDstStride = (width * kDstBpp + 15) & ~15; + align_buffer_64(src_argb_a, 
kSrcStride * height + off); + align_buffer_64(dst_argb_c, kDstStride * height); + align_buffer_64(dst_argb_opt, kDstStride * height); + memset(src_argb_a, 0, kSrcStride * height + off); + srandom(time(NULL)); + for (int i = 0; i < kSrcStride * height; ++i) { + src_argb_a[i + off] = (random() & 0xff); + } + memset(dst_argb_c, 0, kDstStride * height); + memset(dst_argb_opt, 0, kDstStride * height); + + MaskCpuFlags(0); + ARGBSobelToPlane(src_argb_a + off, kSrcStride, + dst_argb_c, kDstStride, + width, invert * height); + MaskCpuFlags(-1); + for (int i = 0; i < benchmark_iterations; ++i) { + ARGBSobelToPlane(src_argb_a + off, kSrcStride, + dst_argb_opt, kDstStride, + width, invert * height); + } + int max_diff = 0; + for (int i = 0; i < kDstStride * height; ++i) { + int abs_diff = + abs(static_cast<int>(dst_argb_c[i]) - + static_cast<int>(dst_argb_opt[i])); + if (abs_diff > max_diff) { + max_diff = abs_diff; + } + } + free_aligned_buffer_64(src_argb_a) + free_aligned_buffer_64(dst_argb_c) + free_aligned_buffer_64(dst_argb_opt) + return max_diff; +} + +TEST_F(libyuvTest, ARGBSobelToPlane_Any) { + int max_diff = TestSobelToPlane(benchmark_width_ - 1, benchmark_height_, + benchmark_iterations_, +1, 0); + EXPECT_EQ(0, max_diff); +} + +TEST_F(libyuvTest, ARGBSobelToPlane_Unaligned) { + int max_diff = TestSobelToPlane(benchmark_width_, benchmark_height_, + benchmark_iterations_, +1, 1); + EXPECT_EQ(0, max_diff); +} + +TEST_F(libyuvTest, ARGBSobelToPlane_Invert) { + int max_diff = TestSobelToPlane(benchmark_width_, benchmark_height_, + benchmark_iterations_, -1, 0); + EXPECT_EQ(0, max_diff); +} + +TEST_F(libyuvTest, ARGBSobelToPlane_Opt) { + int max_diff = TestSobelToPlane(benchmark_width_, benchmark_height_, + benchmark_iterations_, +1, 0); + EXPECT_EQ(0, max_diff); +} + static int TestSobelXY(int width, int height, int benchmark_iterations, int invert, int off) { if (width < 1) { @@ -1469,6 +1726,7 @@ static int TestSobelXY(int width, int height, int benchmark_iterations, align_buffer_64(src_argb_a, kStride * height + off); align_buffer_64(dst_argb_c, kStride * height); align_buffer_64(dst_argb_opt, kStride * height); + memset(src_argb_a, 0, kStride * height + off); srandom(time(NULL)); for (int i = 0; i < kStride * height; ++i) { src_argb_a[i + off] = (random() & 0xff); @@ -1525,4 +1783,326 @@ TEST_F(libyuvTest, ARGBSobelXY_Opt) { EXPECT_EQ(0, max_diff); } +static int TestBlur(int width, int height, int benchmark_iterations, + int invert, int off, int radius) { + if (width < 1) { + width = 1; + } + const int kBpp = 4; + const int kStride = (width * kBpp + 15) & ~15; + align_buffer_64(src_argb_a, kStride * height + off); + align_buffer_64(dst_cumsum, width * height * 16); + align_buffer_64(dst_argb_c, kStride * height); + align_buffer_64(dst_argb_opt, kStride * height); + srandom(time(NULL)); + for (int i = 0; i < kStride * height; ++i) { + src_argb_a[i + off] = (random() & 0xff); + } + memset(dst_cumsum, 0, width * height * 16); + memset(dst_argb_c, 0, kStride * height); + memset(dst_argb_opt, 0, kStride * height); + + MaskCpuFlags(0); + ARGBBlur(src_argb_a + off, kStride, + dst_argb_c, kStride, + reinterpret_cast<int32*>(dst_cumsum), width * 4, + width, invert * height, radius); + MaskCpuFlags(-1); + for (int i = 0; i < benchmark_iterations; ++i) { + ARGBBlur(src_argb_a + off, kStride, + dst_argb_opt, kStride, + reinterpret_cast<int32*>(dst_cumsum), width * 4, + width, invert * height, radius); + } + int max_diff = 0; + for (int i = 0; i < kStride * height; ++i) { + int abs_diff = + 
abs(static_cast<int>(dst_argb_c[i]) - + static_cast<int>(dst_argb_opt[i])); + if (abs_diff > max_diff) { + max_diff = abs_diff; + } + } + free_aligned_buffer_64(src_argb_a) + free_aligned_buffer_64(dst_cumsum) + free_aligned_buffer_64(dst_argb_c) + free_aligned_buffer_64(dst_argb_opt) + return max_diff; +} + +static const int kBlurSize = 55; +TEST_F(libyuvTest, ARGBBlur_Any) { + int max_diff = TestBlur(benchmark_width_ - 1, benchmark_height_, + benchmark_iterations_, +1, 0, kBlurSize); + EXPECT_LE(max_diff, 1); +} + +TEST_F(libyuvTest, ARGBBlur_Unaligned) { + int max_diff = TestBlur(benchmark_width_, benchmark_height_, + benchmark_iterations_, +1, 1, kBlurSize); + EXPECT_LE(max_diff, 1); +} + +TEST_F(libyuvTest, ARGBBlur_Invert) { + int max_diff = TestBlur(benchmark_width_, benchmark_height_, + benchmark_iterations_, -1, 0, kBlurSize); + EXPECT_LE(max_diff, 1); +} + +TEST_F(libyuvTest, ARGBBlur_Opt) { + int max_diff = TestBlur(benchmark_width_, benchmark_height_, + benchmark_iterations_, +1, 0, kBlurSize); + EXPECT_LE(max_diff, 1); +} + +static const int kBlurSmallSize = 5; +TEST_F(libyuvTest, ARGBBlurSmall_Any) { + int max_diff = TestBlur(benchmark_width_ - 1, benchmark_height_, + benchmark_iterations_, +1, 0, kBlurSmallSize); + EXPECT_LE(max_diff, 1); +} + +TEST_F(libyuvTest, ARGBBlurSmall_Unaligned) { + int max_diff = TestBlur(benchmark_width_, benchmark_height_, + benchmark_iterations_, +1, 1, kBlurSmallSize); + EXPECT_LE(max_diff, 1); +} + +TEST_F(libyuvTest, ARGBBlurSmall_Invert) { + int max_diff = TestBlur(benchmark_width_, benchmark_height_, + benchmark_iterations_, -1, 0, kBlurSmallSize); + EXPECT_LE(max_diff, 1); +} + +TEST_F(libyuvTest, ARGBBlurSmall_Opt) { + int max_diff = TestBlur(benchmark_width_, benchmark_height_, + benchmark_iterations_, +1, 0, kBlurSmallSize); + EXPECT_LE(max_diff, 1); +} + +TEST_F(libyuvTest, TestARGBPolynomial) { + SIMD_ALIGNED(uint8 orig_pixels[1280][4]); + SIMD_ALIGNED(uint8 dst_pixels_opt[1280][4]); + SIMD_ALIGNED(uint8 dst_pixels_c[1280][4]); + memset(orig_pixels, 0, sizeof(orig_pixels)); + + SIMD_ALIGNED(static const float kWarmifyPolynomial[16]) = { + 0.94230f, -3.03300f, -2.92500f, 0.f, // C0 + 0.584500f, 1.112000f, 1.535000f, 1.f, // C1 x + 0.001313f, -0.002503f, -0.004496f, 0.f, // C2 x * x + 0.0f, 0.000006965f, 0.000008781f, 0.f, // C3 x * x * x + }; + + // Test blue + orig_pixels[0][0] = 255u; + orig_pixels[0][1] = 0u; + orig_pixels[0][2] = 0u; + orig_pixels[0][3] = 128u; + // Test green + orig_pixels[1][0] = 0u; + orig_pixels[1][1] = 255u; + orig_pixels[1][2] = 0u; + orig_pixels[1][3] = 0u; + // Test red + orig_pixels[2][0] = 0u; + orig_pixels[2][1] = 0u; + orig_pixels[2][2] = 255u; + orig_pixels[2][3] = 255u; + // Test white + orig_pixels[3][0] = 255u; + orig_pixels[3][1] = 255u; + orig_pixels[3][2] = 255u; + orig_pixels[3][3] = 255u; + // Test color + orig_pixels[4][0] = 16u; + orig_pixels[4][1] = 64u; + orig_pixels[4][2] = 192u; + orig_pixels[4][3] = 224u; + // Do 16 to test asm version. 
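The expectations that follow can be reproduced by hand from kWarmifyPolynomial if its four rows are read as C0..C3 for the B, G, R and A columns and the result is clamped to [0, 255] and truncated. A small sketch under that assumption; WarmifyChannel is a hypothetical helper, not a libyuv function:

#include <cstdio>

// Hypothetical helper mirroring the per-channel cubic:
//   v' = C0 + C1*v + C2*v*v + C3*v*v*v, clamped to [0, 255], truncated.
static unsigned WarmifyChannel(double v, double c0, double c1,
                               double c2, double c3) {
  double r = c0 + c1 * v + c2 * v * v + c3 * v * v * v;
  if (r < 0.0) r = 0.0;
  if (r > 255.0) r = 255.0;
  return static_cast<unsigned>(r);  // truncation, not rounding
}

int main() {
  // Blue column of kWarmifyPolynomial applied to the pure-blue test
  // pixel (v = 255): prints 235, the value the test expects.
  printf("%u\n", WarmifyChannel(255.0, 0.94230, 0.584500, 0.001313, 0.0));
  return 0;
}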
+ ARGBPolynomial(&orig_pixels[0][0], 0, &dst_pixels_opt[0][0], 0, + &kWarmifyPolynomial[0], 16, 1); + EXPECT_EQ(235u, dst_pixels_opt[0][0]); + EXPECT_EQ(0u, dst_pixels_opt[0][1]); + EXPECT_EQ(0u, dst_pixels_opt[0][2]); + EXPECT_EQ(128u, dst_pixels_opt[0][3]); + EXPECT_EQ(0u, dst_pixels_opt[1][0]); + EXPECT_EQ(233u, dst_pixels_opt[1][1]); + EXPECT_EQ(0u, dst_pixels_opt[1][2]); + EXPECT_EQ(0u, dst_pixels_opt[1][3]); + EXPECT_EQ(0u, dst_pixels_opt[2][0]); + EXPECT_EQ(0u, dst_pixels_opt[2][1]); + EXPECT_EQ(241u, dst_pixels_opt[2][2]); + EXPECT_EQ(255u, dst_pixels_opt[2][3]); + EXPECT_EQ(235u, dst_pixels_opt[3][0]); + EXPECT_EQ(233u, dst_pixels_opt[3][1]); + EXPECT_EQ(241u, dst_pixels_opt[3][2]); + EXPECT_EQ(255u, dst_pixels_opt[3][3]); + EXPECT_EQ(10u, dst_pixels_opt[4][0]); + EXPECT_EQ(59u, dst_pixels_opt[4][1]); + EXPECT_EQ(188u, dst_pixels_opt[4][2]); + EXPECT_EQ(224u, dst_pixels_opt[4][3]); + + for (int i = 0; i < 1280; ++i) { + orig_pixels[i][0] = i; + orig_pixels[i][1] = i / 2; + orig_pixels[i][2] = i / 3; + orig_pixels[i][3] = i; + } + + MaskCpuFlags(0); + ARGBPolynomial(&orig_pixels[0][0], 0, &dst_pixels_c[0][0], 0, + &kWarmifyPolynomial[0], 1280, 1); + MaskCpuFlags(-1); + + for (int i = 0; i < benchmark_pixels_div1280_; ++i) { + ARGBPolynomial(&orig_pixels[0][0], 0, &dst_pixels_opt[0][0], 0, + &kWarmifyPolynomial[0], 1280, 1); + } + + for (int i = 0; i < 1280; ++i) { + EXPECT_EQ(dst_pixels_c[i][0], dst_pixels_opt[i][0]); + EXPECT_EQ(dst_pixels_c[i][1], dst_pixels_opt[i][1]); + EXPECT_EQ(dst_pixels_c[i][2], dst_pixels_opt[i][2]); + EXPECT_EQ(dst_pixels_c[i][3], dst_pixels_opt[i][3]); + } +} + +TEST_F(libyuvTest, TestARGBLumaColorTable) { + SIMD_ALIGNED(uint8 orig_pixels[1280][4]); + SIMD_ALIGNED(uint8 dst_pixels_opt[1280][4]); + SIMD_ALIGNED(uint8 dst_pixels_c[1280][4]); + memset(orig_pixels, 0, sizeof(orig_pixels)); + + align_buffer_64(lumacolortable, 32768); + int v = 0; + for (int i = 0; i < 32768; ++i) { + lumacolortable[i] = v; + v += 3; + } + // Test blue + orig_pixels[0][0] = 255u; + orig_pixels[0][1] = 0u; + orig_pixels[0][2] = 0u; + orig_pixels[0][3] = 128u; + // Test green + orig_pixels[1][0] = 0u; + orig_pixels[1][1] = 255u; + orig_pixels[1][2] = 0u; + orig_pixels[1][3] = 0u; + // Test red + orig_pixels[2][0] = 0u; + orig_pixels[2][1] = 0u; + orig_pixels[2][2] = 255u; + orig_pixels[2][3] = 255u; + // Test color + orig_pixels[3][0] = 16u; + orig_pixels[3][1] = 64u; + orig_pixels[3][2] = 192u; + orig_pixels[3][3] = 224u; + // Do 16 to test asm version. 
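The expected values below (253, 48, 192, 64) follow from the ramp table itself: it stores the low byte of 3 * index, so for any 256-aligned block, entry base + c reduces to (3 * c) & 255 no matter which block the luma selects. The block-based addressing is an inference from these expectations rather than a statement about the libyuv implementation; a standalone check of the table arithmetic:

#include <cassert>
#include <cstdint>

int main() {
  // The test fills a 32768-entry table with the low byte of 3 * index.
  uint8_t table[32768];
  int v = 0;
  for (int i = 0; i < 32768; ++i) {
    table[i] = static_cast<uint8_t>(v);
    v += 3;
  }
  // For any 256-aligned block base, 3 * base is a multiple of 256, so the
  // looked-up byte depends only on the component value c: (3 * c) & 255.
  // That yields 253 for 255, 48 for 16, 192 for 64 and 64 for 192.
  for (int base = 0; base < 32768; base += 256) {
    assert(table[base + 255] == 253);
    assert(table[base + 16] == 48);
    assert(table[base + 64] == 192);
    assert(table[base + 192] == 64);
  }
  return 0;
}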
+ ARGBLumaColorTable(&orig_pixels[0][0], 0, &dst_pixels_opt[0][0], 0, + &lumacolortable[0], 16, 1); + EXPECT_EQ(253u, dst_pixels_opt[0][0]); + EXPECT_EQ(0u, dst_pixels_opt[0][1]); + EXPECT_EQ(0u, dst_pixels_opt[0][2]); + EXPECT_EQ(128u, dst_pixels_opt[0][3]); + EXPECT_EQ(0u, dst_pixels_opt[1][0]); + EXPECT_EQ(253u, dst_pixels_opt[1][1]); + EXPECT_EQ(0u, dst_pixels_opt[1][2]); + EXPECT_EQ(0u, dst_pixels_opt[1][3]); + EXPECT_EQ(0u, dst_pixels_opt[2][0]); + EXPECT_EQ(0u, dst_pixels_opt[2][1]); + EXPECT_EQ(253u, dst_pixels_opt[2][2]); + EXPECT_EQ(255u, dst_pixels_opt[2][3]); + EXPECT_EQ(48u, dst_pixels_opt[3][0]); + EXPECT_EQ(192u, dst_pixels_opt[3][1]); + EXPECT_EQ(64u, dst_pixels_opt[3][2]); + EXPECT_EQ(224u, dst_pixels_opt[3][3]); + + for (int i = 0; i < 1280; ++i) { + orig_pixels[i][0] = i; + orig_pixels[i][1] = i / 2; + orig_pixels[i][2] = i / 3; + orig_pixels[i][3] = i; + } + + MaskCpuFlags(0); + ARGBLumaColorTable(&orig_pixels[0][0], 0, &dst_pixels_c[0][0], 0, + lumacolortable, 1280, 1); + MaskCpuFlags(-1); + + for (int i = 0; i < benchmark_pixels_div1280_; ++i) { + ARGBLumaColorTable(&orig_pixels[0][0], 0, &dst_pixels_opt[0][0], 0, + lumacolortable, 1280, 1); + } + for (int i = 0; i < 1280; ++i) { + EXPECT_EQ(dst_pixels_c[i][0], dst_pixels_opt[i][0]); + EXPECT_EQ(dst_pixels_c[i][1], dst_pixels_opt[i][1]); + EXPECT_EQ(dst_pixels_c[i][2], dst_pixels_opt[i][2]); + EXPECT_EQ(dst_pixels_c[i][3], dst_pixels_opt[i][3]); + } + + free_aligned_buffer_64(lumacolortable); +} + +TEST_F(libyuvTest, TestARGBCopyAlpha) { + const int kSize = benchmark_width_ * benchmark_height_ * 4; + align_buffer_64(orig_pixels, kSize); + align_buffer_64(dst_pixels_opt, kSize); + align_buffer_64(dst_pixels_c, kSize); + + MemRandomize(orig_pixels, kSize); + MemRandomize(dst_pixels_opt, kSize); + memcpy(dst_pixels_c, dst_pixels_opt, kSize); + + MaskCpuFlags(0); + ARGBCopyAlpha(orig_pixels, benchmark_width_ * 4, + dst_pixels_c, benchmark_width_ * 4, + benchmark_width_, benchmark_height_); + MaskCpuFlags(-1); + + for (int i = 0; i < benchmark_iterations_; ++i) { + ARGBCopyAlpha(orig_pixels, benchmark_width_ * 4, + dst_pixels_opt, benchmark_width_ * 4, + benchmark_width_, benchmark_height_); + } + for (int i = 0; i < kSize; ++i) { + EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]); + } + + free_aligned_buffer_64(dst_pixels_c) + free_aligned_buffer_64(dst_pixels_opt) + free_aligned_buffer_64(orig_pixels) +} + +TEST_F(libyuvTest, TestARGBCopyYToAlpha) { + const int kPixels = benchmark_width_ * benchmark_height_; + align_buffer_64(orig_pixels, kPixels); + align_buffer_64(dst_pixels_opt, kPixels * 4); + align_buffer_64(dst_pixels_c, kPixels * 4); + + MemRandomize(orig_pixels, kPixels); + MemRandomize(dst_pixels_opt, kPixels * 4); + memcpy(dst_pixels_c, dst_pixels_opt, kPixels * 4); + + MaskCpuFlags(0); + ARGBCopyYToAlpha(orig_pixels, benchmark_width_, + dst_pixels_c, benchmark_width_ * 4, + benchmark_width_, benchmark_height_); + MaskCpuFlags(-1); + + for (int i = 0; i < benchmark_iterations_; ++i) { + ARGBCopyYToAlpha(orig_pixels, benchmark_width_, + dst_pixels_opt, benchmark_width_ * 4, + benchmark_width_, benchmark_height_); + } + for (int i = 0; i < kPixels * 4; ++i) { + EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]); + } + + free_aligned_buffer_64(dst_pixels_c) + free_aligned_buffer_64(dst_pixels_opt) + free_aligned_buffer_64(orig_pixels) +} + } // namespace libyuv diff --git a/chromium/third_party/libyuv/unit_test/scale_argb_test.cc b/chromium/third_party/libyuv/unit_test/scale_argb_test.cc index 7a4758594a0..ea4d4d14580 
100644 --- a/chromium/third_party/libyuv/unit_test/scale_argb_test.cc +++ b/chromium/third_party/libyuv/unit_test/scale_argb_test.cc @@ -17,10 +17,6 @@ namespace libyuv { -static __inline int Abs(int v) { - return v >= 0 ? v : -v; -} - // Test scaling with C vs Opt and return maximum pixel difference. 0 = exact. static int ARGBTestFilter(int src_width, int src_height, int dst_width, int dst_height, @@ -85,7 +81,7 @@ static int ARGBTestFilter(int src_width, int src_height, int max_diff = 0; for (i = b; i < (dst_height + b); ++i) { for (j = b * 4; j < (dst_width + b) * 4; ++j) { - int abs_diff = abs(dst_argb_c[(i * dst_stride_argb) + j] - + int abs_diff = Abs(dst_argb_c[(i * dst_stride_argb) + j] - dst_argb_opt[(i * dst_stride_argb) + j]); if (abs_diff > max_diff) { max_diff = abs_diff; @@ -99,8 +95,8 @@ static int ARGBTestFilter(int src_width, int src_height, return max_diff; } -static const int kTileX = 16; -static const int kTileY = 16; +static const int kTileX = 8; +static const int kTileY = 8; static int TileARGBScale(const uint8* src_argb, int src_stride_argb, int src_width, int src_height, @@ -184,7 +180,7 @@ static int ARGBClipTestFilter(int src_width, int src_height, int max_diff = 0; for (i = b; i < (dst_height + b); ++i) { for (j = b * 4; j < (dst_width + b) * 4; ++j) { - int abs_diff = abs(dst_argb_c[(i * dst_stride_argb) + j] - + int abs_diff = Abs(dst_argb_c[(i * dst_stride_argb) + j] - dst_argb_opt[(i * dst_stride_argb) + j]); if (abs_diff > max_diff) { max_diff = abs_diff; @@ -198,78 +194,83 @@ static int ARGBClipTestFilter(int src_width, int src_height, return max_diff; } -#define TEST_FACTOR1(name, filter, factor, max_diff) \ +#define TEST_FACTOR1(name, filter, hfactor, vfactor, max_diff) \ TEST_F(libyuvTest, ARGBScaleDownBy##name##_##filter) { \ int diff = ARGBTestFilter(benchmark_width_, benchmark_height_, \ - Abs(benchmark_width_) / factor, \ - Abs(benchmark_height_) / factor, \ + Abs(benchmark_width_) * hfactor, \ + Abs(benchmark_height_) * vfactor, \ kFilter##filter, benchmark_iterations_); \ EXPECT_LE(diff, max_diff); \ } \ TEST_F(libyuvTest, ARGBScaleDownClipBy##name##_##filter) { \ int diff = ARGBClipTestFilter(benchmark_width_, benchmark_height_, \ - Abs(benchmark_width_) / factor, \ - Abs(benchmark_height_) / factor, \ + Abs(benchmark_width_) * hfactor, \ + Abs(benchmark_height_) * vfactor, \ kFilter##filter, benchmark_iterations_); \ EXPECT_LE(diff, max_diff); \ } -// Test a scale factor with all 2 filters. Expect unfiltered to be exact, but +// Test a scale factor with 2 filters. Expect unfiltered to be exact, but // filtering is different fixed point implementations for SSSE3, Neon and C. -#define TEST_FACTOR(name, factor) \ - TEST_FACTOR1(name, None, factor, 0) \ - TEST_FACTOR1(name, Bilinear, factor, 2) +#define TEST_FACTOR(name, hfactor, vfactor) \ + TEST_FACTOR1(name, None, hfactor, vfactor, 2) \ + TEST_FACTOR1(name, Linear, hfactor, vfactor, 2) \ + TEST_FACTOR1(name, Bilinear, hfactor, vfactor, 2) \ + TEST_FACTOR1(name, Box, hfactor, vfactor, 2) // TODO(fbarchard): ScaleDownBy1 should be lossless, but Box has error of 2. 
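The reworked TEST_FACTOR macros multiply by textual factors such as 2 / 3 instead of dividing by an integer factor. This only yields the intended (width * 2) / 3 because the factor tokens are pasted by the preprocessor before evaluation; passed as an ordinary integer argument, 2 / 3 would collapse to 0. A small illustration with hypothetical names (SCALED_WIDTH and ScaledByValue are not libyuv identifiers):

#include <cassert>

// hfactor is written as "2 / 3" and substituted textually, so
// "width * hfactor" becomes "width * 2 / 3", grouping as (width * 2) / 3.
// Evaluating 2 / 3 first, as a function argument would, gives 0.
#define SCALED_WIDTH(width, hfactor) ((width) * hfactor)

static int ScaledByValue(int width, int hfactor) { return width * hfactor; }

int main() {
  assert(SCALED_WIDTH(1280, 2 / 3) == 853);   // (1280 * 2) / 3
  assert(ScaledByValue(1280, 2 / 3) == 0);    // 2 / 3 truncates first
  return 0;
}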
-TEST_FACTOR(1, 1) -TEST_FACTOR(2, 2) -TEST_FACTOR(4, 4) -TEST_FACTOR(5, 5) -TEST_FACTOR(8, 8) -TEST_FACTOR(16, 16) -TEST_FACTOR(2by3, 2 / 3) -TEST_FACTOR(3by4, 3 / 4) -TEST_FACTOR(3by8, 3 / 8) +TEST_FACTOR(1, 1 / 1, 1 / 1) +TEST_FACTOR(2, 1 / 2, 1 / 2) +TEST_FACTOR(4, 1 / 4, 1 / 4) +TEST_FACTOR(8, 1 / 8, 1 / 8) +TEST_FACTOR(16, 1 / 16, 1 / 16) +TEST_FACTOR(2by3, 2 / 3, 2 / 3) +TEST_FACTOR(3by4, 3 / 4, 3 / 4) +TEST_FACTOR(3by8, 3 / 8, 3 / 8) +TEST_FACTOR(Vertical2by3, 1, 2 / 3) #undef TEST_FACTOR1 #undef TEST_FACTOR -#define TEST_SCALETO1(width, height, filter, max_diff) \ - TEST_F(libyuvTest, ARGBScaleTo##width##x##height##_##filter) { \ +#define TEST_SCALETO1(name, width, height, filter, max_diff) \ + TEST_F(libyuvTest, name##To##width##x##height##_##filter) { \ int diff = ARGBTestFilter(benchmark_width_, benchmark_height_, \ width, height, \ kFilter##filter, benchmark_iterations_); \ EXPECT_LE(diff, max_diff); \ } \ - TEST_F(libyuvTest, ARGBScaleFrom##width##x##height##_##filter) { \ + TEST_F(libyuvTest, name##From##width##x##height##_##filter) { \ int diff = ARGBTestFilter(width, height, \ Abs(benchmark_width_), Abs(benchmark_height_), \ kFilter##filter, benchmark_iterations_); \ EXPECT_LE(diff, max_diff); \ } \ - TEST_F(libyuvTest, ARGBScaleClipTo##width##x##height##_##filter) { \ + TEST_F(libyuvTest, name##ClipTo##width##x##height##_##filter) { \ int diff = ARGBClipTestFilter(benchmark_width_, benchmark_height_, \ width, height, \ kFilter##filter, benchmark_iterations_); \ EXPECT_LE(diff, max_diff); \ } \ - TEST_F(libyuvTest, ARGBScaleClipFrom##width##x##height##_##filter) { \ + TEST_F(libyuvTest, name##ClipFrom##width##x##height##_##filter) { \ int diff = ARGBClipTestFilter(width, height, \ Abs(benchmark_width_), Abs(benchmark_height_), \ kFilter##filter, benchmark_iterations_); \ EXPECT_LE(diff, max_diff); \ } -// Test scale to a specified size with all 3 filters. -#define TEST_SCALETO(width, height) \ - TEST_SCALETO1(width, height, None, 0) \ - TEST_SCALETO1(width, height, Bilinear, 2) - -TEST_SCALETO(640, 360) -TEST_SCALETO(853, 480) -TEST_SCALETO(1280, 720) -TEST_SCALETO(1280, 800) -TEST_SCALETO(1366, 768) -TEST_SCALETO(1920, 1080) +/// Test scale to a specified size with all 4 filters. +#define TEST_SCALETO(name, width, height) \ + TEST_SCALETO1(name, width, height, None, 0) \ + TEST_SCALETO1(name, width, height, Linear, 3) \ + TEST_SCALETO1(name, width, height, Bilinear, 3) \ + TEST_SCALETO1(name, width, height, Box, 3) + +TEST_SCALETO(ARGBScale, 1, 1) +TEST_SCALETO(ARGBScale, 320, 240) +TEST_SCALETO(ARGBScale, 352, 288) +TEST_SCALETO(ARGBScale, 640, 360) +TEST_SCALETO(ARGBScale, 853, 480) +TEST_SCALETO(ARGBScale, 1280, 720) +TEST_SCALETO(ARGBScale, 1920, 1080) #undef TEST_SCALETO1 #undef TEST_SCALETO diff --git a/chromium/third_party/libyuv/unit_test/scale_test.cc b/chromium/third_party/libyuv/unit_test/scale_test.cc index 769151aa232..c6f25604608 100644 --- a/chromium/third_party/libyuv/unit_test/scale_test.cc +++ b/chromium/third_party/libyuv/unit_test/scale_test.cc @@ -17,10 +17,6 @@ namespace libyuv { -static __inline int Abs(int v) { - return v >= 0 ? v : -v; -} - // Test scaling with C vs Opt and return maximum pixel difference. 0 = exact. 
static int TestFilter(int src_width, int src_height, int dst_width, int dst_height, @@ -99,7 +95,7 @@ static int TestFilter(int src_width, int src_height, int max_diff = 0; for (i = b; i < (dst_height + b); ++i) { for (j = b; j < (dst_width + b); ++j) { - int abs_diff = abs(dst_y_c[(i * dst_stride_y) + j] - + int abs_diff = Abs(dst_y_c[(i * dst_stride_y) + j] - dst_y_opt[(i * dst_stride_y) + j]); if (abs_diff > max_diff) { max_diff = abs_diff; @@ -109,12 +105,12 @@ static int TestFilter(int src_width, int src_height, for (i = b; i < (dst_height_uv + b); ++i) { for (j = b; j < (dst_width_uv + b); ++j) { - int abs_diff = abs(dst_u_c[(i * dst_stride_uv) + j] - + int abs_diff = Abs(dst_u_c[(i * dst_stride_uv) + j] - dst_u_opt[(i * dst_stride_uv) + j]); if (abs_diff > max_diff) { max_diff = abs_diff; } - abs_diff = abs(dst_v_c[(i * dst_stride_uv) + j] - + abs_diff = Abs(dst_v_c[(i * dst_stride_uv) + j] - dst_v_opt[(i * dst_stride_uv) + j]); if (abs_diff > max_diff) { max_diff = abs_diff; @@ -136,61 +132,64 @@ static int TestFilter(int src_width, int src_height, return max_diff; } -#define TEST_FACTOR1(name, filter, factor, max_diff) \ +#define TEST_FACTOR1(name, filter, hfactor, vfactor, max_diff) \ TEST_F(libyuvTest, ScaleDownBy##name##_##filter) { \ int diff = TestFilter(benchmark_width_, benchmark_height_, \ - Abs(benchmark_width_) / factor, \ - Abs(benchmark_height_) / factor, \ + Abs(benchmark_width_) * hfactor, \ + Abs(benchmark_height_) * vfactor, \ kFilter##filter, benchmark_iterations_); \ EXPECT_LE(diff, max_diff); \ } -// Test a scale factor with all 3 filters. Expect unfiltered to be exact, but +// Test a scale factor with all 4 filters. Expect unfiltered to be exact, but // filtering is different fixed point implementations for SSSE3, Neon and C. -#define TEST_FACTOR(name, factor) \ - TEST_FACTOR1(name, None, factor, 0) \ - TEST_FACTOR1(name, Bilinear, factor, 2) \ - TEST_FACTOR1(name, Box, factor, 2) \ +#define TEST_FACTOR(name, hfactor, vfactor) \ + TEST_FACTOR1(name, None, hfactor, vfactor, 0) \ + TEST_FACTOR1(name, Linear, hfactor, vfactor, 3) \ + TEST_FACTOR1(name, Bilinear, hfactor, vfactor, 3) \ + TEST_FACTOR1(name, Box, hfactor, vfactor, 3) \ // TODO(fbarchard): ScaleDownBy1 should be lossless, but Box has error of 2. -TEST_FACTOR(1, 1) -TEST_FACTOR(2, 2) -TEST_FACTOR(4, 4) -TEST_FACTOR(5, 5) -TEST_FACTOR(8, 8) -TEST_FACTOR(16, 16) -TEST_FACTOR(2by3, 2 / 3) -TEST_FACTOR(3by4, 3 / 4) -TEST_FACTOR(3by8, 3 / 8) +TEST_FACTOR(1, 1 / 1, 1 / 1) +TEST_FACTOR(2, 1 / 2, 1 / 2) +TEST_FACTOR(4, 1 / 4, 1 / 4) +TEST_FACTOR(8, 1 / 8, 1 / 8) +TEST_FACTOR(16, 1 / 16, 1 / 16) +TEST_FACTOR(2by3, 2 / 3, 2 / 3) +TEST_FACTOR(3by4, 3 / 4, 3 / 4) +TEST_FACTOR(3by8, 3 / 8, 3 / 8) +TEST_FACTOR(Vertical2by3, 1, 2 / 3) #undef TEST_FACTOR1 #undef TEST_FACTOR -#define TEST_SCALETO1(width, height, filter, max_diff) \ - TEST_F(libyuvTest, ScaleTo##width##x##height##_##filter) { \ +#define TEST_SCALETO1(name, width, height, filter, max_diff) \ + TEST_F(libyuvTest, name##To##width##x##height##_##filter) { \ int diff = TestFilter(benchmark_width_, benchmark_height_, \ width, height, \ kFilter##filter, benchmark_iterations_); \ EXPECT_LE(diff, max_diff); \ } \ - TEST_F(libyuvTest, ScaleFrom##width##x##height##_##filter) { \ + TEST_F(libyuvTest, name##From##width##x##height##_##filter) { \ int diff = TestFilter(width, height, \ Abs(benchmark_width_), Abs(benchmark_height_), \ kFilter##filter, benchmark_iterations_); \ EXPECT_LE(diff, max_diff); \ } -// Test scale to a specified size with all 3 filters. 
-#define TEST_SCALETO(width, height) \ - TEST_SCALETO1(width, height, None, 0) \ - TEST_SCALETO1(width, height, Bilinear, 2) \ - TEST_SCALETO1(width, height, Box, 2) \ - -TEST_SCALETO(640, 360) -TEST_SCALETO(853, 480) -TEST_SCALETO(1280, 720) -TEST_SCALETO(1280, 800) -TEST_SCALETO(1366, 768) -TEST_SCALETO(1920, 1080) +// Test scale to a specified size with all 4 filters. +#define TEST_SCALETO(name, width, height) \ + TEST_SCALETO1(name, width, height, None, 0) \ + TEST_SCALETO1(name, width, height, Linear, 3) \ + TEST_SCALETO1(name, width, height, Bilinear, 3) \ + TEST_SCALETO1(name, width, height, Box, 3) + +TEST_SCALETO(Scale, 1, 1) +TEST_SCALETO(Scale, 320, 240) +TEST_SCALETO(Scale, 352, 288) +TEST_SCALETO(Scale, 640, 360) +TEST_SCALETO(Scale, 853, 480) +TEST_SCALETO(Scale, 1280, 720) +TEST_SCALETO(Scale, 1920, 1080) #undef TEST_SCALETO1 #undef TEST_SCALETO diff --git a/chromium/third_party/libyuv/unit_test/unit_test.cc b/chromium/third_party/libyuv/unit_test/unit_test.cc index fac70262133..b11bd246313 100644 --- a/chromium/third_party/libyuv/unit_test/unit_test.cc +++ b/chromium/third_party/libyuv/unit_test/unit_test.cc @@ -19,8 +19,8 @@ #define BENCHMARK_ITERATIONS 1 libyuvTest::libyuvTest() : rotate_max_w_(128), rotate_max_h_(128), - benchmark_iterations_(BENCHMARK_ITERATIONS), benchmark_width_(128), - benchmark_height_(72) { + benchmark_iterations_(BENCHMARK_ITERATIONS), benchmark_width_(22), + benchmark_height_(14) { const char* repeat = getenv("LIBYUV_REPEAT"); if (repeat) { benchmark_iterations_ = atoi(repeat); // NOLINT @@ -39,9 +39,14 @@ libyuvTest::libyuvTest() : rotate_max_w_(128), rotate_max_h_(128), if (height) { benchmark_height_ = atoi(height); // NOLINT } - benchmark_pixels_div256_ = static_cast<int>( - (static_cast<double>(benchmark_width_ * - benchmark_height_) * benchmark_iterations_ + 255.0) / 256.0); + benchmark_pixels_div256_ = static_cast<int>(( + static_cast<double>(Abs(benchmark_width_)) * + static_cast<double>(Abs(benchmark_height_)) * + static_cast<double>(benchmark_iterations_) + 255.0) / 256.0); + benchmark_pixels_div1280_ = static_cast<int>(( + static_cast<double>(Abs(benchmark_width_)) * + static_cast<double>(Abs(benchmark_height_)) * + static_cast<double>(benchmark_iterations_) + 1279.0) / 1280.0); } int main(int argc, char** argv) { diff --git a/chromium/third_party/libyuv/unit_test/unit_test.h b/chromium/third_party/libyuv/unit_test/unit_test.h index e81aea30780..89b333bdd59 100644 --- a/chromium/third_party/libyuv/unit_test/unit_test.h +++ b/chromium/third_party/libyuv/unit_test/unit_test.h @@ -11,10 +11,21 @@ #ifndef UNIT_TEST_UNIT_TEST_H_ // NOLINT #define UNIT_TEST_UNIT_TEST_H_ +#ifdef WIN32 +#include <windows.h> +#else +#include <sys/time.h> +#include <sys/resource.h> +#endif + #include <gtest/gtest.h> #include "libyuv/basic_types.h" +static __inline int Abs(int v) { + return v >= 0 ? 
v : -v; +} + #define align_buffer_64(var, size) \ uint8* var; \ uint8* var##_mem; \ @@ -38,7 +49,6 @@ var = 0; #ifdef WIN32 -#include <windows.h> static inline double get_time() { LARGE_INTEGER t, f; QueryPerformanceCounter(&t); @@ -49,10 +59,6 @@ static inline double get_time() { #define random rand #define srandom srand #else - -#include <sys/time.h> -#include <sys/resource.h> - static inline double get_time() { struct timeval t; struct timezone tzp; @@ -63,9 +69,9 @@ static inline double get_time() { static inline void MemRandomize(uint8* dst, int len) { int i; - for (i = 0; i < len - 3; i += 4) { - *reinterpret_cast<uint32*>(dst) = random(); - dst += 4; + for (i = 0; i < len - 1; i += 2) { + *reinterpret_cast<uint16*>(dst) = random(); + dst += 2; } for (; i < len; ++i) { *dst++ = random(); @@ -83,6 +89,7 @@ class libyuvTest : public ::testing::Test { int benchmark_width_; // Default 1280. Use 640 for benchmarking VGA. int benchmark_height_; // Default 720. Use 360 for benchmarking VGA. int benchmark_pixels_div256_; // Total pixels to benchmark / 256. + int benchmark_pixels_div1280_; // Total pixels to benchmark / 1280. }; #endif // UNIT_TEST_UNIT_TEST_H_ NOLINT diff --git a/chromium/third_party/libyuv/util/convert.cc b/chromium/third_party/libyuv/util/convert.cc index 18316ef8efb..5f071416da4 100644 --- a/chromium/third_party/libyuv/util/convert.cc +++ b/chromium/third_party/libyuv/util/convert.cc @@ -155,8 +155,8 @@ void ParseOptions(int argc, const char* argv[]) { } } -static const int kTileX = 12; -static const int kTileY = 8; +static const int kTileX = 32; +static const int kTileY = 32; static int TileARGBScale(const uint8* src_argb, int src_stride_argb, int src_width, int src_height, diff --git a/chromium/third_party/libyuv/util/cpuid.c b/chromium/third_party/libyuv/util/cpuid.c index 8d8529ba7c6..db22871ea50 100644 --- a/chromium/third_party/libyuv/util/cpuid.c +++ b/chromium/third_party/libyuv/util/cpuid.c @@ -25,7 +25,7 @@ int main(int argc, const char* argv[]) { #if defined(__i386__) || defined(__x86_64__) || \ defined(_M_IX86) || defined(_M_X64) if (has_x86) { - int family, model, cpu_info[4]; + uint32 family, model, cpu_info[4]; // Vendor ID: // AuthenticAMD AMD processor // CentaurHauls Centaur processor @@ -37,7 +37,7 @@ int main(int argc, const char* argv[]) { // RiseRiseRise Rise Technology processor // SiS SiS SiS SiS processor // UMC UMC UMC UMC processor - CpuId(cpu_info, 0); + CpuId(0, 0, &cpu_info[0]); cpu_info[0] = cpu_info[1]; // Reorder output cpu_info[1] = cpu_info[3]; cpu_info[3] = 0; @@ -50,7 +50,7 @@ int main(int argc, const char* argv[]) { // 13:12 - Processor Type // 19:16 - Extended Model // 27:20 - Extended Family - CpuId(cpu_info, 1); + CpuId(1, 0, &cpu_info[0]); family = ((cpu_info[0] >> 8) & 0x0f) | ((cpu_info[0] >> 16) & 0xff0); model = ((cpu_info[0] >> 4) & 0x0f) | ((cpu_info[0] >> 12) & 0xf0); printf("Cpu Family %d (0x%x), Model %d (0x%x)\n", family, family, @@ -79,6 +79,7 @@ int main(int argc, const char* argv[]) { int has_avx = TestCpuFlag(kCpuHasAVX); int has_avx2 = TestCpuFlag(kCpuHasAVX2); int has_erms = TestCpuFlag(kCpuHasERMS); + int has_fma3 = TestCpuFlag(kCpuHasFMA3); printf("Has SSE2 %x\n", has_sse2); printf("Has SSSE3 %x\n", has_ssse3); printf("Has SSE4.1 %x\n", has_sse41); @@ -86,6 +87,7 @@ int main(int argc, const char* argv[]) { printf("Has AVX %x\n", has_avx); printf("Has AVX2 %x\n", has_avx2); printf("Has ERMS %x\n", has_erms); + printf("Has FMA3 %x\n", has_fma3); } return 0; } diff --git a/chromium/third_party/libyuv/util/psnr.h 
b/chromium/third_party/libyuv/util/psnr.h index 2cd0b1457ce..370337a75f2 100644 --- a/chromium/third_party/libyuv/util/psnr.h +++ b/chromium/third_party/libyuv/util/psnr.h @@ -10,7 +10,7 @@ // Get PSNR for video sequence. Assuming RAW 4:2:0 Y:Cb:Cr format -#ifndef UTIL_PSNR_H_ +#ifndef UTIL_PSNR_H_ // NOLINT #define UTIL_PSNR_H_ #ifdef __cplusplus @@ -36,4 +36,4 @@ double ComputeSumSquareError(const uint8* org, const uint8* rec, int size); } // extern "C" #endif -#endif // UTIL_PSNR_H_ +#endif // UTIL_PSNR_H_ // NOLINT diff --git a/chromium/third_party/libyuv/util/ssim.cc b/chromium/third_party/libyuv/util/ssim.cc index 277561dd00d..d07889a8ac8 100644 --- a/chromium/third_party/libyuv/util/ssim.cc +++ b/chromium/third_party/libyuv/util/ssim.cc @@ -8,7 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include "./ssim.h" +#include "../util/ssim.h" // NOLINT #include <math.h> #include <string.h> diff --git a/chromium/third_party/libyuv/util/ssim.h b/chromium/third_party/libyuv/util/ssim.h index 0689276addc..40120b4f4e7 100644 --- a/chromium/third_party/libyuv/util/ssim.h +++ b/chromium/third_party/libyuv/util/ssim.h @@ -10,7 +10,7 @@ // Get SSIM for video sequence. Assuming RAW 4:2:0 Y:Cb:Cr format -#ifndef UTIL_SSIM_H_ +#ifndef UTIL_SSIM_H_ // NOLINT #define UTIL_SSIM_H_ #ifdef __cplusplus @@ -32,4 +32,4 @@ double CalcLSSIM(double ssim); } // extern "C" #endif -#endif // UTIL_SSIM_H_ +#endif // UTIL_SSIM_H_ // NOLINT diff --git a/chromium/third_party/libyuv/winarm.mk b/chromium/third_party/libyuv/winarm.mk new file mode 100644 index 00000000000..2638608ebc9 --- /dev/null +++ b/chromium/third_party/libyuv/winarm.mk @@ -0,0 +1,43 @@ +# This is a generic makefile for libyuv for Windows Arm. +# nmake /f winarm.mk +# make -f winarm.mk +# nmake /f winarm.mk clean +# consider /arch:ARMv7VE +CC=cl +CCFLAGS=/Ox /nologo /Iinclude /DWINAPI_FAMILY=WINAPI_FAMILY_PHONE_APP +AR=lib +ARFLAGS=/MACHINE:ARM /NOLOGO /SUBSYSTEM:NATIVE +RM=cmd /c del + +LOCAL_OBJ_FILES = \ + source/compare.o\ + source/compare_common.o\ + source/convert.o\ + source/convert_argb.o\ + source/convert_from.o\ + source/convert_from_argb.o\ + source/convert_to_argb.o\ + source/convert_to_i420.o\ + source/cpu_id.o\ + source/format_conversion.o\ + source/planar_functions.o\ + source/rotate.o\ + source/rotate_argb.o\ + source/row_any.o\ + source/row_common.o\ + source/scale.o\ + source/scale_argb.o\ + source/scale_common.o\ + source/video_common.o + +.cc.o: + $(CC) /c $(CCFLAGS) $*.cc /Fo$@ + +all: libyuv_arm.lib winarm.mk + +libyuv_arm.lib: $(LOCAL_OBJ_FILES) winarm.mk + $(AR) $(ARFLAGS) /OUT:$@ $(LOCAL_OBJ_FILES) + +clean: + $(RM) "source\*.o" libyuv_arm.lib + |
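A closing sketch of the benchmark_pixels_div1280_ field added in unit_test.cc above: it is a ceiling division of width * height * iterations by 1280, so the row-level tests that loop over 1280-pixel rows still cover at least the requested pixel budget. PixelsDiv1280 below is an illustrative stand-in, not a libyuv function:

#include <cassert>
#include <cstdlib>

// Ceiling division of the total benchmark pixel budget by the 1280-pixel
// row width the row-level tests iterate over.
static int PixelsDiv1280(int width, int height, int iterations) {
  return static_cast<int>(
      (static_cast<double>(abs(width)) * static_cast<double>(abs(height)) *
           static_cast<double>(iterations) +
       1279.0) /
      1280.0);
}

int main() {
  assert(PixelsDiv1280(1280, 720, 1) == 720);  // exact multiple of 1280
  assert(PixelsDiv1280(22, 14, 1) == 1);       // tiny default still runs once
  return 0;
}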